def _post_datafile(dest_url, content):
    """
    POST to mytardis to create a new datafile, plus any experiment
    and dataset, if needed
    """
    (source_scheme, tardis_host_url, source_path, source_location,
     query_settings) = storage.parse_bdpurl(dest_url)
    query_settings['mytardis_host'] = tardis_host_url
    logger.debug("query_settings=%s" % query_settings)

    exp_name = _get_value('exp_name', query_settings)
    dataset_name = _get_value('dataset_name', query_settings)
    root_path = _get_value('root_path', query_settings)
    fname = _get_value('fname', query_settings)
    tardis_user = _get_value('mytardis_username', query_settings)
    tardis_pass = _get_value('mytardis_password', query_settings)
    tardis_port = _get_value('mytardis_port', query_settings)

    exp_id, _ = _get_or_create_experiment(query_settings, exp_name)
    dataset_id, _ = _get_or_create_dataset(query_settings,
                                           dataset_name, exp_id)
    url = "https://%s:%s/api/v1/dataset_file/" % (tardis_host_url, tardis_port)
    headers = {'Accept': 'application/json'}
    new_dataset_uri = "/api/v1/dataset/%s/" % dataset_id

    logger.debug("fname=%s" % fname)
    file_path = os.path.join(root_path, fname)
    logger.debug("file_path=%s" % file_path)

    data = json.dumps({
        'dataset': str(new_dataset_uri),
        'filename': os.path.basename(fname),
        'size': len(content),
        'mimetype': 'text/plain',
        'md5sum': hashlib.md5(content).hexdigest()})
    logger.debug("data=%s" % data)

    temp = StringIO.StringIO(content)
    r = requests.post(url,
                      data={'json_data': data},
                      headers=headers,
                      files={'attached_file': temp},
                      auth=HTTPBasicAuth(tardis_user, tardis_pass),
                      verify=False)
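# _post_datafile() above ignores the HTTP response (the FIXME pattern
# elsewhere in this module).  A minimal sketch of the missing status
# handling; the helper name is illustrative, not part of the module,
# and 201 Created is assumed to be the success code for this API:
def _check_tardis_response(r):
    # treat anything other than 201 Created as failure (an assumption)
    if r.status_code != 201:
        logger.error("datafile post failed: %s %s" % (r.status_code, r.text))
        return False
    return True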
def process(self, run_settings):
    self.experiment_id = 0
    local_settings = setup_settings(run_settings)
    self.experiment_id = local_settings['experiment_id']
    messages.info(run_settings, "1: waiting for completion")
    logger.debug("settings=%s" % local_settings)

    try:
        self.runs_left = ast.literal_eval(
            getval(run_settings, '%s/stages/make/runs_left' % RMIT_SCHEMA))
    except (ValueError, SettingNotFoundException):
        # NB: runs_left is normally an int count; the empty list just
        # reads as "no runs recorded yet".
        self.runs_left = []

    def _get_dest_bdp_url(local_settings):
        return "%s@%s" % (
            "nci",
            os.path.join(local_settings['payload_destination'],
                         str(local_settings['contextid'])))

    dest_url = _get_dest_bdp_url(local_settings)
    computation_platform_url = local_settings['comp_platform_url']
    bdp_username = local_settings['bdp_username']
    comp_pltf_settings = manage.get_platform_settings(
        computation_platform_url, bdp_username)
    local_settings.update(comp_pltf_settings)

    encoded_d_url = storage.get_url_with_credentials(
        local_settings, dest_url,
        is_relative_path=True, ip_address=local_settings['host'])
    (scheme, host, mypath, location, query_settings) = \
        storage.parse_bdpurl(encoded_d_url)

    if self.runs_left:
        job_finished = self._job_finished(
            settings=local_settings, remote_path=dest_url)
        if not job_finished:
            return
        self._get_output(local_settings, dest_url)
        self.runs_left -= 1

    if self.runs_left <= 0:
        messages.success(run_settings, "1: finished")
        logger.debug("processing finished")
def create_dataset_for_final_output(self, run_settings, experiment_id,
                                    base_dir, output_url, all_settings):
    logger.debug("curate_dataset")
    iter_output_dir = os.path.join(base_dir, "output")
    logger.debug("iter_output_dir=%s" % iter_output_dir)
    output_prefix = '%s://%s@' % (all_settings['scheme'],
                                  all_settings['type'])
    iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)
    logger.debug("iter_output_dir=%s" % iter_output_dir)
    logger.debug("output_url=%s" % output_url)

    (scheme, host, mypath, location, query_settings) = \
        storage.parse_bdpurl(output_url)
    fsys = storage.get_filesystem(output_url)
    node_output_dirnames, _ = fsys.listdir(mypath)
    logger.debug("node_output_dirnames=%s" % node_output_dirnames)

    curate_data = getval(run_settings,
                         '%s/input/mytardis/curate_data' % self.SCHEMA_PREFIX)
    if curate_data:
        if all_settings['mytardis_host']:
            output_dirs = []
            for m, dir_name in enumerate(node_output_dirnames):
                output_dirs.append(os.path.join(iter_output_dir, dir_name))

            for m, output_dir in enumerate(output_dirs):
                logger.debug("output_dir=%s" % output_dir)
                # ensure paramsets are defined even when no metadata
                # builder is loaded
                experiment_paramset = []
                dataset_paramset = []
                datafile_paramset = []
                dfile_extract_func = {}
                self.load_metadata_builder(run_settings)
                if self.METADATA_BUILDER:
                    (experiment_paramset, dataset_paramset,
                     datafile_paramset, dfile_extract_func) = \
                        self.METADATA_BUILDER.build_metadata_for_final_output(
                            m, output_dir,
                            run_settings=run_settings,
                            storage_settings=all_settings,
                            output_dirs=output_dirs)

                source_url = get_url_with_credentials(
                    all_settings, output_dir, is_relative_path=False)
                logger.debug("source_url=%s" % source_url)

                experiment_id = mytardis.create_dataset(
                    settings=all_settings,
                    source_url=source_url,
                    exp_name=mytardis.get_exp_name_for_output,
                    dataset_name=mytardis.get_dataset_name_for_output,
                    exp_id=experiment_id,
                    experiment_paramset=experiment_paramset,
                    dataset_paramset=dataset_paramset,
                    datafile_paramset=datafile_paramset,
                    dfile_extract_func=dfile_extract_func)
        else:
            logger.warn("no mytardis host specified")
    else:
        logger.warn('Data curation is off')
    return experiment_id
def create_dataset_for_intermediate_output(self, run_settings, experiment_id,
                                           base_dir, output_url, all_settings,
                                           outputs=[]):
    logger.debug('self_outputs_curate=%s' % outputs)
    iteration = int(getval(run_settings, '%s/system/id' % self.SCHEMA_PREFIX))
    iter_output_dir = os.path.join(base_dir, "output_%s" % iteration)
    output_prefix = '%s://%s@' % (all_settings['scheme'],
                                  all_settings['type'])
    iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)

    (scheme, host, mypath, location, query_settings) = \
        storage.parse_bdpurl(output_url)
    fsys = storage.get_filesystem(output_url)
    node_output_dirnames, _ = fsys.listdir(mypath)
    logger.debug("node_output_dirnames=%s" % node_output_dirnames)

    if all_settings['mytardis_host']:
        output_dirs = []
        for m, dir_name in enumerate(node_output_dirnames):
            output_dirs.append(os.path.join(iter_output_dir, dir_name))

        for i, output_dir in enumerate(output_dirs):
            dataset_paramset = []
            datafile_paramset = []
            dfile_extract_func = {}
            self.load_metadata_builder(run_settings)
            if self.METADATA_BUILDER:
                (continue_loop, dataset_paramset, datafile_paramset,
                 dfile_extract_func) = \
                    self.METADATA_BUILDER.build_metadata_for_intermediate_output(
                        output_dir, outputs,
                        run_settings=run_settings,
                        storage_settings=all_settings,
                        output_dirs=output_dirs)
                if continue_loop:
                    continue

            source_dir_url = get_url_with_credentials(
                all_settings, output_dir, is_relative_path=False)
            logger.debug("source_dir_url=%s" % source_dir_url)
            logger.debug('all_settings_here=%s' % all_settings)

            experiment_id = mytardis.create_dataset(
                settings=all_settings,
                source_url=source_dir_url,
                exp_id=experiment_id,
                exp_name=mytardis.get_exp_name_for_intermediate_output,
                dataset_name=mytardis.get_dataset_name_for_output,
                dataset_paramset=dataset_paramset,
                datafile_paramset=datafile_paramset,
                dfile_extract_func=dfile_extract_func)
    else:
        logger.warn("no mytardis host specified")
        return 0
    return experiment_id
def retrieve_datafile(url):
    """
    Retrieve contents from a mytardis datafile based on url

    NB: Has this function been tested?
    """
    (source_scheme, tardis_host_url, source_path, source_location,
     query_settings) = storage.parse_bdpurl(url)
    query_settings['mytardis_host'] = tardis_host_url
    logger.debug("query_settings=%s" % query_settings)

    exp_name = _get_value('exp_name', query_settings)
    dataset_name = _get_value('dataset_name', query_settings)
    root_path = _get_value('root_path', query_settings)
    fname = _get_value('fname', query_settings)
    tardis_user = _get_value('mytardis_username', query_settings)
    tardis_pass = _get_value('mytardis_password', query_settings)

    exp_id, _ = _get_or_create_experiment(query_settings, exp_name)
    dataset_id, _ = _get_or_create_dataset(query_settings,
                                           dataset_name, exp_id)
    url = "http://%s/api/v1/dataset_file/%s/" % (tardis_host_url, dataset_id)
    headers = {'Accept': 'application/json'}
    logger.debug("fname=%s" % fname)

    r = requests.get(url, headers=headers,
                     auth=HTTPBasicAuth(tardis_user, tardis_pass))
    # FIXME: need to check for status_code and handle failures.
    return r.text
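# retrieve_datafile() returns the raw response body; with the
# 'Accept: application/json' header above, a caller would typically
# decode it.  Illustrative only (the URL and its query parameters are
# hypothetical):
#
#   meta = json.loads(retrieve_datafile(
#       "mytardis://tardis.example.edu/data?exp_name=exp1"
#       "&dataset_name=ds1&fname=out.dat"
#       "&mytardis_username=user&mytardis_password=pass"))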
def _get_output(self, local_settings, source_url):
    """
    Retrieve the output from the task on the node
    """
    logger.debug("get_output from %s" % source_url)
    computation_platform_url = local_settings['comp_platform_url']
    bdp_username = local_settings['bdp_username']
    comp_pltf_settings = manage.get_platform_settings(
        computation_platform_url, bdp_username)
    local_settings.update(comp_pltf_settings)

    encoded_s_url = storage.get_url_with_credentials(
        local_settings, source_url,
        is_relative_path=True, ip_address=local_settings['host'])
    (scheme, host, mypath, location, query_settings) = \
        storage.parse_bdpurl(encoded_s_url)
    make_path = os.path.join(query_settings['root_path'], mypath)
    logger.debug("make_path=%s" % make_path)

    output_storage_url = local_settings['storeout_platform_url']
    logger.debug("output_storage_url=%s" % output_storage_url)
    output_storage_settings = manage.get_platform_settings(
        output_storage_url, bdp_username)
    local_settings.update(output_storage_settings)
    logger.debug("output_storage_settings=%s" % output_storage_settings)

    dest_url = '%s://%s@%s/%s/make%s' % (
        output_storage_settings['scheme'],
        output_storage_settings['type'],
        output_storage_settings['host'],
        local_settings['storeout_platform_offset'],
        str(local_settings['contextid']))
    logger.debug("Transferring output from %s to %s" % (source_url, dest_url))

    encoded_d_url = storage.get_url_with_credentials(local_settings, dest_url)
    logger.debug("encoded_d_url=%s" % encoded_d_url)

    # FIXME: might want to turn on paramiko compress function
    # to speed up this transfer
    try:
        storage.copy_directories(encoded_s_url, encoded_d_url)
    except SSHException as e:
        logger.error(e)
        # FIXME: Could just exit, but need to flag that this data has
        # not been transferred.
        raise
def curate_dataset(self, run_settings, experiment_id, base_dir, output_url,
                   all_settings):
    (scheme, host, mypath, location, query_settings) = \
        storage.parse_bdpurl(output_url)
    logger.debug("output_url=%s" % output_url)
    output_settings = self.get_platform_settings(
        run_settings, RMIT_SCHEMA + '/platform/storage/output')
    current_output_url = "%s://%s@%s/%s" % (
        scheme, output_settings['type'], host, os.path.join(mypath, '1/'))
    logger.debug('current-dest=%s' % current_output_url)

    outcar_url = storage.get_url_with_credentials(
        output_settings, current_output_url + self.OUTCAR_FILE,
        is_relative_path=False)
    logger.debug("outcar_url=%s" % outcar_url)
    try:
        outcar_content = storage.get_file(outcar_url)
    except IOError as e:
        logger.error(e)
        toten = None
def _job_finished(self, settings, remote_path):
    encoded_d_url = storage.get_url_with_credentials(
        settings=settings,
        url_or_relative_path=remote_path,
        is_relative_path=True,
        ip_address=settings['host'])
    (scheme, host, mypath, location, query_settings) = \
        storage.parse_bdpurl(encoded_d_url)
    stdout = ''
    stderr = ''
    try:
        ssh = open_connection(ip_address=host, settings=settings)
        (stdout, stderr) = compute.run_make(
            ssh, os.path.join(query_settings['root_path'], mypath),
            'running')
    except Exception as e:
        logger.error(e)
        raise
def process(self, run_settings):
    settings = setup_settings(run_settings)
    messages.info(run_settings, "1: execute starting")

    def _get_dest_bdp_url(settings):
        return "%s@%s" % (
            "nci",
            os.path.join(settings['payload_destination'],
                         str(settings['contextid'])))

    dest_url = _get_dest_bdp_url(settings)
    computation_platform_url = settings['comp_platform_url']
    bdp_username = settings['bdp_username']
    comp_pltf_settings = manage.get_platform_settings(
        computation_platform_url, bdp_username)
    logger.debug("comp_pltf_settings=%s" % pformat(comp_pltf_settings))
    settings.update(comp_pltf_settings)

    encoded_d_url = storage.get_url_with_credentials(
        settings, dest_url,
        is_relative_path=True, ip_address=settings['host'])
    (scheme, host, mypath, location, query_settings) = \
        storage.parse_bdpurl(encoded_d_url)

    stderr = ''
    try:
        ssh = open_connection(ip_address=settings['host'], settings=settings)
        (command_out, stderr) = compute.run_make(
            ssh, os.path.join(query_settings['root_path'], mypath),
            'startrun')
    except Exception as e:
        logger.error(e)
        raise
def curate_dataset(self, run_settings, experiment_id, base_dir, output_url,
                   all_settings):
    iteration = int(getval(run_settings, '%s/system/id' % RMIT_SCHEMA))
    iter_output_dir = os.path.join(base_dir, "output_%s" % iteration)
    output_prefix = '%s://%s@' % (all_settings['scheme'],
                                  all_settings['type'])
    iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)

    (scheme, host, mypath, location, query_settings) = \
        storage.parse_bdpurl(output_url)
    fsys = storage.get_filesystem(output_url)
    node_output_dirnames, _ = fsys.listdir(mypath)
    logger.debug("node_output_dirnames=%s" % node_output_dirnames)

    if all_settings['mytardis_host']:
        for i, node_output_dirname in enumerate(node_output_dirnames):
            node_path = os.path.join(iter_output_dir, node_output_dirname)

            # find criterion: is there an infinity criterion?
            crit = None
            for ni in self.outputs:
                if ni.dirname == node_output_dirname:
                    crit = ni.criterion
                    break
            else:
                logger.debug("criterion not found")
                continue
            logger.debug("crit=%s" % crit)

            def extract_psd_func(fp):
                xs = []
                ys = []
                for i, line in enumerate(fp):
                    columns = line.split()
                    xs.append(float(columns[0]))
                    ys.append(float(columns[1]))
                return {"hrmcdfile/r1": xs, "hrmcdfile/g1": ys}

            def extract_psdexp_func(fp):
                xs = []
                ys = []
                for i, line in enumerate(fp):
                    columns = line.split()
                    xs.append(float(columns[0]))
                    ys.append(float(columns[1]))
                return {"hrmcdfile/r2": xs, "hrmcdfile/g2": ys}

            def extract_grfinal_func(fp):
                xs = []
                ys = []
                for i, line in enumerate(fp):
                    columns = line.split()
                    xs.append(float(columns[0]))
                    ys.append(float(columns[1]))
                # FIXME: len(xs) == len(ys) for this to work.
                # TODO: hack to handle when xs and ys are too large to
                # fit in Parameter with db_index; solved by function
                # call at destination
                cut_xs = [xs[i] for i, x in enumerate(xs)
                          if (i % (len(xs) / 20) == 0)]
                cut_ys = [ys[i] for i, x in enumerate(ys)
                          if (i % (len(ys) / 20) == 0)]
                return {"hrmcdfile/r3": cut_xs, "hrmcdfile/g3": cut_ys}

            def extract_inputgr_func(fp):
                xs = []
                ys = []
                for i, line in enumerate(fp):
                    columns = line.split()
                    xs.append(float(columns[0]))
                    ys.append(float(columns[1]))
                # FIXME: len(xs) == len(ys) for this to work.
                # TODO: hack to handle when xs and ys are too large to
                # fit in Parameter with db_index; solved by function
                # call at destination
                cut_xs = [xs[i] for i, x in enumerate(xs)
                          if (i % (len(xs) / 20) == 0)]
                cut_ys = [ys[i] for i, x in enumerate(ys)
                          if (i % (len(ys) / 20) == 0)]
                return {"hrmcdfile/r4": cut_xs, "hrmcdfile/g4": cut_ys}

            # TODO: hrmcexp graph should be tagged to input directories
            # (not output directories) because we want the result after
            # pruning.
            # todo: replace self.boto_settings with mytardis_settings
            EXP_DATASET_NAME_SPLIT = 2

            def get_exp_name_for_output(settings, url, path):
                # return str(os.sep.join(path.split(os.sep)[:-EXP_DATASET_NAME_SPLIT]))
                return str(os.sep.join(path.split(os.sep)[-4:-2]))

            def get_dataset_name_for_output(settings, url, path):
                logger.debug("path=%s" % path)
                host = settings['host']
                prefix = 'ssh://%s@%s' % (settings['type'], host)
                source_url = get_url_with_credentials(
                    settings, os.path.join(prefix, path, "HRMC.inp_values"),
                    is_relative_path=False)
                logger.debug("source_url=%s" % source_url)
                try:
                    content = storage.get_file(source_url)
                except IOError as e:
                    logger.warn("cannot read file %s" % e)
                    return str(os.sep.join(
                        path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))
                logger.debug("content=%s" % content)
                try:
                    values_map = dict(json.loads(str(content)))
                except Exception as e:
                    logger.warn("cannot load %s: %s" % (content, e))
                    return str(os.sep.join(
                        path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))
                try:
                    iteration = str(path.split(os.sep)[-2:-1][0])
                except Exception as e:
                    logger.error(e)
                    iteration = ""
                if "_" in iteration:
                    iteration = iteration.split("_")[1]
                else:
                    iteration = "final"
                dataset_name = "%s_%s_%s" % (
                    iteration,
                    values_map['generator_counter'],
                    values_map['run_counter'])
                logger.debug("dataset_name=%s" % dataset_name)
                return dataset_name
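            # Worked example of the naming scheme above (values are
            # illustrative): for path ".../hrmc278/output_2/node0",
            # path.split(os.sep)[-2:-1][0] is "output_2", so iteration
            # becomes "2"; with HRMC.inp_values containing
            # {"generator_counter": 4, "run_counter": 7} the dataset is
            # named "2_4_7".  A parent directory named "output" (no
            # underscore) yields "final_4_7".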
def curate_dataset(self, run_settings, experiment_id, base_url, output_url,
                   all_settings):
    '''
        Curates dataset
    '''
    # Retrieve process directories below the current output location
    iteration = int(getval(run_settings, '%s/system/id' % SCHEMA_PREFIX))
    output_prefix = '%s://%s@' % (all_settings['scheme'],
                                  all_settings['type'])
    current_output_url = "%s%s" % (
        output_prefix, os.path.join(base_url, "output_%s" % iteration))
    (scheme, host, current_output_path, location, query_settings) = \
        storage.parse_bdpurl(output_url)
    output_fsys = storage.get_filesystem(output_url)
    process_output_dirs, _ = output_fsys.listdir(current_output_path)

    # Curate a dataset with metadata per process
    for i, process_output_dir in enumerate(process_output_dirs):
        # Expand the process output directory and add credentials for access
        process_output_url = '/'.join([current_output_url,
                                       process_output_dir])
        process_output_url_with_cred = get_url_with_credentials(
            all_settings, process_output_url, is_relative_path=False)

        # Expand the process output file and add credentials for access
        output_file_url_with_cred = storage.get_url_with_credentials(
            all_settings, '/'.join([process_output_url, OUTPUT_FILE]),
            is_relative_path=False)
        try:
            output_content = storage.get_file(output_file_url_with_cred)
            # NB: the unpacking raises ValueError when the file does not
            # hold exactly two fields
            val1, val2 = output_content.split()
        except (ValueError, IndexError, IOError) as e:
            logger.warn(e)
            continue
        try:
            x = float(val1)
            y = float(val2)
        except (ValueError, IndexError) as e:
            logger.warn(e)
            continue

        # Use the process id as the MyTardis dataset name
        all_settings['graph_point_id'] = str(i)

        def _get_dataset_name(settings, url, path):
            return all_settings['graph_point_id']

        # Create a new dataset and add it to the experiment.
        # If experiment_id == 0, a new experiment is created.
        experiment_id = mytardis.create_dataset(
            settings=all_settings,  # MyTardis credentials
            source_url=process_output_url_with_cred,
            exp_id=experiment_id,
            dataset_name=_get_dataset_name,  # function that defines dataset name
            dataset_paramset=[
                # a new blank parameter set conforming to schema
                # 'remotemake/output'
                mytardis.create_paramset("remotemake/output", []),
                mytardis.create_graph_paramset(
                    "dsetgraph",  # name of schema
                    name="randdset",  # a unique dataset name
                    graph_info={},
                    # values to be used in experiment graphs
                    value_dict={"randdset/x": x, "randdset/y": y},
                    value_keys=[]),
            ])
    return experiment_id
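# Example of the OUTPUT_FILE payload expected above: a body of
# "0.25 1.75" splits into val1="0.25", val2="1.75", giving the graph
# point (x=0.25, y=1.75) stored under randdset/x and randdset/y, with
# the process index i serving as the dataset name.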
def process_outputs(self, run_settings, base_dir, output_url, all_settings,
                    offset):
    # e.g. output_dir = 118.138.241.232/outptuersdfsd/sweep277/hrmc278/output_1
    #      output_prefix = ssh://unix@
    #      node_output_dir = 2
    output_prefix = '%s://%s@' % (all_settings['scheme'],
                                  all_settings['type'])
    id = int(getval(run_settings, '%s/system/id' % RMIT_SCHEMA))
    iter_output_dir = os.path.join(base_dir, "output_%s" % id)
    logger.debug('iter_output_dir=%s' % iter_output_dir)
    logger.debug('output_prefix=%s' % output_prefix)
    logger.debug('output_url=%s' % output_url)

    (scheme, host, iter_output_path, location, query_settings) = \
        storage.parse_bdpurl(output_url)
    logger.debug("iter_output_path=%s" % iter_output_path)
    iter_out_fsys = storage.get_filesystem(output_url)
    logger.debug("iter_out_fsys=%s" % iter_out_fsys)
    node_output_dirnames, _ = iter_out_fsys.listdir(iter_output_path)
    logger.debug('node_output_dirnames=%s' % node_output_dirnames)
    self.audit = ""

    Node_info = namedtuple('Node_info', ['dirname', 'number', 'criterion'])
    BASE_FNAME = "HRMC.inp"

    # generate criteria
    self.outputs = []
    for node_output_dirname in node_output_dirnames:
        node_path = output_prefix + os.path.join(iter_output_dir,
                                                 node_output_dirname)
        criterion = self.compute_psd_criterion(all_settings, node_path)
        logger.debug("criterion=%s" % criterion)
        try:
            values_url = get_url_with_credentials(
                all_settings,
                os.path.join(node_path, '%s_values' % BASE_FNAME),
                is_relative_path=False)
            values_content = storage.get_file(values_url)
            logger.debug("values_file=%s" % values_url)
        except IOError:
            logger.warn("no values file found")
            values_map = {}
        else:
            values_map = dict(json.loads(values_content))
        # FIXME: values_map may be empty here, in which case
        # 'run_counter' raises KeyError
        self.outputs.append(Node_info(dirname=node_output_dirname,
                                      number=values_map['run_counter'],
                                      criterion=criterion))

    if not self.outputs:
        logger.error("no output found for this iteration")
        return

    self.outputs.sort(key=lambda x: int(x.criterion))
    logger.debug("self.outputs=%s" % self.outputs)

    try:
        # FIXME: need to validate this output to make sure list of int
        threshold = ast.literal_eval(
            getval(run_settings, '%s/input/hrmc/threshold' % RMIT_SCHEMA))
    except (SettingNotFoundException, ValueError):
        logger.warn("no threshold found when expected")
        return False
    logger.debug("threshold = %s" % threshold)

    # total_picks is the product of the threshold entries
    total_picks = 1
    if len(threshold) > 1:
        for i in threshold:
            total_picks *= i
    else:
        total_picks = threshold[0]

    def copy_files_with_pattern(iter_out_fsys, source_path, dest_path,
                                pattern, all_settings):
        """
        Copy files matching pattern from source_path to dest_path
        """
        output_prefix = '%s://%s@' % (all_settings['scheme'],
                                      all_settings['type'])
        logger.debug('source_path=%s, dest_path=%s' % (source_path,
                                                       dest_path))
        _, node_output_fnames = iter_out_fsys.listdir(source_path)
        ip_address = all_settings['ip_address']
        for f in node_output_fnames:
            if fnmatch.fnmatch(f, pattern):
                source_url = get_url_with_credentials(
                    all_settings,
                    output_prefix + os.path.join(ip_address, source_path, f),
                    is_relative_path=False)
                dest_url = get_url_with_credentials(
                    all_settings,
                    output_prefix + os.path.join(ip_address, dest_path, f),
                    is_relative_path=False)
                logger.debug('source_url=%s, dest_url=%s' % (source_url,
                                                             dest_url))
                content = storage.get_file(source_url)
                storage.put_file(dest_url, content)

    # Make new input dirs
    new_input_dir = os.path.join(base_dir, "input_%d" % (id + 1))
    for index in range(0, total_picks):
        node_info = self.outputs[index]
        logger.debug("node_info.dirname=%s" % node_info.dirname)
        logger.debug("node_info=%s" % str(node_info))
        new_input_path = os.path.join(new_input_dir, node_info.dirname)
        logger.debug("New input node dir %s" % new_input_path)
        old_output_path = os.path.join(iter_output_dir, node_info.dirname)

        # Move all existing domain input files unchanged to next input
        # directory
        for f in DOMAIN_INPUT_FILES:
            source_url = get_url_with_credentials(
                all_settings,
                output_prefix + os.path.join(old_output_path, f),
                is_relative_path=False)
            dest_url = get_url_with_credentials(
                all_settings,
                output_prefix + os.path.join(new_input_path, f),
                is_relative_path=False)
            logger.debug('source_url=%s, dest_url=%s' % (source_url,
                                                         dest_url))
            content = storage.get_file(source_url)
            storage.put_file(dest_url, content)
            logger.debug('put file successfully')

        pattern = "*_values"
        output_offset = os.path.join(offset, "output_%s" % id,
                                     node_info.dirname)
        input_offset = os.path.join(offset, "input_%s" % (id + 1),
                                    node_info.dirname)
        copy_files_with_pattern(iter_out_fsys, output_offset, input_offset,
                                pattern, all_settings)
        pattern = "*_template"
        copy_files_with_pattern(iter_out_fsys, output_offset, input_offset,
                                pattern, all_settings)

        # NB: Converge stage triggers based on criterion value from audit.
        logger.debug('starting audit')
        info = "Run %s preserved (error %s)\n" % (node_info.number,
                                                  node_info.criterion)
        audit_url = get_url_with_credentials(
            all_settings,
            output_prefix + os.path.join(new_input_path, 'audit.txt'),
            is_relative_path=False)
        storage.put_file(audit_url, info)
        logger.debug("audit=%s" % info)
        logger.debug('1:audit_url=%s' % audit_url)
        self.audit += info

        # move xyz_final.xyz to input_initial.xyz
        source_url = get_url_with_credentials(
            all_settings,
            output_prefix + os.path.join(old_output_path, "xyz_final.xyz"),
            is_relative_path=False)
        logger.debug('source_url=%s' % source_url)
        dest_url = get_url_with_credentials(
            all_settings,
            output_prefix + os.path.join(new_input_path,
                                         'input_initial.xyz'),
            is_relative_path=False)
        logger.debug('dest_url=%s' % dest_url)
        content = storage.get_file(source_url)
        logger.debug('content=%s' % content)
        storage.put_file(dest_url, content)

    self.audit += "spawning diamond runs\n"
    logger.debug("input_dir=%s" %
                 (output_prefix + os.path.join(new_input_dir, 'audit.txt')))
    audit_url = get_url_with_credentials(
        all_settings,
        output_prefix + os.path.join(new_input_dir, 'audit.txt'),
        is_relative_path=False)
    logger.debug('audit_url=%s' % audit_url)
    storage.put_file(audit_url, self.audit)
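# Example of the threshold arithmetic above (illustrative): with
# threshold=[3], total_picks is 3, so the three runs with the lowest
# criterion are promoted to the next iteration's input directories;
# with a multi-element list such as threshold=[2, 2], total_picks is
# the product, 4.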
def create_dataset(settings, source_url, exp_id, exp_name=_get_exp_name,
                   dataset_name=_get_dataset_name,
                   experiment_paramset=[],
                   dataset_paramset=[],
                   datafile_paramset=[],
                   dfile_extract_func=None):
    """
    POST to mytardis_host REST API with mytardis_user and
    mytardis_password credentials to create or update an experiment for
    a new dataset containing datafiles from the source_url BDP
    directory.

    Args:
        settings: connection and credential settings
        source_url: url containing data to be ingested
        exp_id: experiment id, or 0 to create a new experiment
        exp_name, dataset_name: functions that return new experiment
            and dataset names respectively, based on url and path
        experiment_paramset: parameter sets attached to the experiment
        dataset_paramset: parameter sets attached to the dataset
        datafile_paramset: parameter sets attached to each datafile
        dfile_extract_func: map of filename to function that extracts
            graph values from that file

    FIXME,TODO: What if tardis is unavailable? Connection to mytardis
    probably better handled as separate celery subtask, which can retry
    until working and be async.

    FIXME: missing all error checking and retrying of connection to
    mytardis. Reliability framework should be able to supply this?
    """
    # TODO: method should take BDP url source_url, not the expanded one.
    logger.debug("post_dataset")
    tardis_user = settings["mytardis_user"]
    tardis_pass = settings["mytardis_password"]
    tardis_host_url = "http://%s" % settings["mytardis_host"]
    logger.debug("posting dataset from %s to mytardis at %s" %
                 (source_url, tardis_host_url))

    (source_scheme, source_location, source_path, source_location,
     query_settings) = storage.parse_bdpurl(source_url)
    logger.debug("source_path=%s" % source_path)

    if source_scheme == "file":
        root_path = _get_value('root_path', query_settings)
    else:
        logger.debug('scheme=%s' % source_scheme)
        #raise InvalidInputError("only file source_scheme supported for source of mytardis transfer")

    expname = exp_name(settings, source_url, source_path)
    new_exp_id = create_experiment(settings, exp_id, expname,
                                   experiment_paramset)
    new_experiment_uri = "/api/v1/experiment/%s/" % new_exp_id

    # TODO: check that we do not already have a dataset with the same
    # name and overwrite or don't move.

    # save dataset
    logger.debug("saving dataset in experiment at %s" % new_exp_id)
    url = "%s/api/v1/dataset/?format=json" % tardis_host_url
    headers = {'content-type': 'application/json'}
    schemas = dataset_paramset
    logger.debug("schemas=%s" % schemas)
    data = json.dumps({
        'experiments': [new_experiment_uri],
        'description': dataset_name(settings, source_url, source_path),
        "parameter_sets": schemas})
    logger.debug("data=%s" % data)
    logger.debug("post to %s" % url)
    r = requests.post(url, data=data, headers=headers,
                      auth=HTTPBasicAuth(tardis_user, tardis_pass))
    # FIXME: need to check for status_code and handle failures.
    logger.debug("r.json=%s" % r.json)
    logger.debug("r.text=%s" % r.text)
    logger.debug("r.headers=%s" % r.headers)
    header_location = r.headers['location']
    new_dataset_uri = header_location[len(tardis_host_url):]

    # move files across
    source_files = storage.list_all_files(source_url)
    logger.debug("source_files=%s" % source_files)
    url = "%s/api/v1/dataset_file/" % tardis_host_url
    headers = {'Accept': 'application/json'}
    args = source_url.split('?')[1]
    logger.debug('args=%s' % args)

    for file_location in source_files:
        logger.debug('file_location=%s' %
                     os.path.join(source_location, file_location))
        source_file_url = "%s://%s?%s" % (
            source_scheme,
            os.path.join(source_location, file_location), args)
        logger.debug('source_file_url=%s' % source_file_url)
        source_file, source_file_ref = storage.get_filep(
            source_file_url, sftp_reference=True)
        logger.debug('source_file=%s' % source_file._name)

        # rebuild the datafile parameter sets, filling in any
        # value_dict entries via the matching extract function
        new_datafile_paramset = []
        logger.debug("datafile_paramset=%s" % datafile_paramset)
        for paramset in datafile_paramset:
            new_paramset = {}
            logger.debug("paramset=%s" % paramset)
            new_paramset['schema'] = paramset['schema']
            has_value = False
            has_keys = False
            new_param_vals = []
            for param in paramset['parameters']:
                new_param = {}
                for param_key, v in param.items():
                    if param_key == 'name' and v == "value_dict":
                        new_param['name'] = 'value_dict'
                        new_value = {}
                        found_func_match = False
                        for fname, func in dfile_extract_func.items():
                            logger.debug("fname=%s,func=%s" % (fname, func))
                            if fname == os.path.basename(file_location):
                                source_file.seek(0)
                                new_value.update(func(source_file))
                                found_func_match = True
                                # FIXME: can multiple funcs match?
                        logger.debug("new_value=%s" % new_value)
                        if found_func_match:
                            new_param['string_value'] = json.dumps(new_value)
                        else:
                            new_param['string_value'] = param['string_value']
                        break
                    else:
                        # in case string_value is processed first
                        new_param[param_key] = v
                if new_param['name'] == "value_dict" and len(
                        json.loads(new_param['string_value'])):
                    has_value = True
                if new_param['name'] == "value_keys" and len(
                        json.loads(new_param['string_value'])):
                    has_keys = True
                new_param_vals.append(new_param)
            new_paramset['parameters'] = new_param_vals
            logger.debug("has_value=%s" % has_value)
            logger.debug("has_keys=%s" % has_keys)
            if has_value or has_keys:
                new_datafile_paramset.append(new_paramset)
            else:
                logger.debug("not adding %s" % new_paramset)
        logger.debug("new_datafile_paramset=%s" % new_datafile_paramset)
        logger.debug("file_name=%s" % source_file._name)

        file_size = source_file_ref.size(source_file._name)
        logger.debug("file_size=%s" % file_size)
        if file_size > 0:
            source_file.seek(0)
            data = json.dumps({
                'dataset': str(new_dataset_uri),
                "parameter_sets": new_datafile_paramset,
                'filename': os.path.basename(file_location),
                'size': file_size,
                'mimetype': 'text/plain',
                'md5sum': hashlib.md5(source_file.read()).hexdigest()})
            logger.debug("data=%s" % data)
            source_file.seek(0)
            r = requests.post(url, data={'json_data': data},
                              headers=headers,
                              files={'attached_file': source_file},
                              auth=HTTPBasicAuth(tardis_user, tardis_pass))
            # FIXME: need to check for status_code and handle failures.
            logger.debug("r.js=%s" % r.json)
            logger.debug("r.te=%s" % r.text)
            logger.debug("r.he=%s" % r.headers)
        else:
            logger.warn("not transferring empty file %s" % file_location)
            # TODO: check whether mytardis api can accept zero-length
            # files

    return new_exp_id
def build_metadata_for_final_output(self, m, output_dir, **kwargs):
    # FIXME: this calculation should be done as in extract_psd_func,
    # pulling directly from data_errors rather than passing in through
    # nested function.
    experiment_paramset = []
    dataset_paramset = []
    datafile_paramset = []
    dfile_extract_func = {}

    exp_value_keys = []
    legends = []
    # NB: this loop shadows the m parameter
    for m, current_dir in enumerate(kwargs['output_dirs']):
        exp_value_keys.append(["hrmcdset%s/step" % m,
                               "hrmcdset%s/err" % m])
        source_url = storage.get_url_with_credentials(
            kwargs['storage_settings'], current_dir, is_relative_path=False)
        (source_scheme, source_location, source_path, source_location,
         query_settings) = storage.parse_bdpurl(source_url)
        logger.debug("source_url=%s" % source_url)
        legends.append(
            mytardis.get_dataset_name_for_output(
                kwargs['storage_settings'], "", source_path))
    logger.debug("exp_value_keys=%s" % exp_value_keys)
    logger.debug("legends=%s" % legends)

    node_path = output_dir
    logger.debug("node_path=%s" % node_path)

    dataerrors_url = storage.get_url_with_credentials(
        kwargs['storage_settings'],
        os.path.join(node_path, self.DATA_ERRORS_FILE),
        is_relative_path=False)
    logger.debug("dataerrors_url=%s" % dataerrors_url)
    dataerrors_content = storage.get_file(dataerrors_url)

    xs = []
    ys = []
    re_dbl_fort = re.compile(r'(\d*\.\d+)[dD]([-+]?\d+)')
    for i, line in enumerate(dataerrors_content.splitlines()):
        if i == 0:
            continue
        columns = line.split()
        try:
            hrmc_step = int(columns[self.STEP_COLUMN_NUM])
        except ValueError:
            logger.warn("could not parse hrmc_step value on line %s" % i)
            continue
        # handle Fortran double-precision float format
        val = columns[self.ERRGR_COLUMN_NUM]
        val = re_dbl_fort.sub(r'\1E\2', val)
        logger.debug("val=%s" % val)
        try:
            hrmc_errgr = float(val)
        except ValueError:
            logger.warn("could not parse hrmc_errgr value on line %s" % i)
            continue
        xs.append(hrmc_step)
        ys.append(hrmc_errgr)
    logger.debug("xs=%s" % xs)
    logger.debug("ys=%s" % ys)

    crit_url = storage.get_url_with_credentials(
        kwargs['storage_settings'],
        os.path.join(node_path, "criterion.txt"),
        is_relative_path=False)
    try:
        crit = storage.get_file(crit_url)
    except (ValueError, IOError):
        crit = None
    # FIXME: can crit be zero?
    logger.debug("crit=%s" % crit)
    if crit:
        system_id = int(getval(kwargs['run_settings'],
                               '%s/system/id' %
                               django_settings.SCHEMA_PREFIX))
        hrmcdset_val = {"hrmcdset/it": system_id, "hrmcdset/crit": crit}
    else:
        hrmcdset_val = {}

    # TODO: move into utility function for reuse
    def extract_psd_func(fp):
        xs = []
        ys = []
        for i, line in enumerate(dataerrors_content.splitlines()):
            if i == 0:
                continue
            columns = line.split()
            val = columns[self.STEP_COLUMN_NUM]
            val = re_dbl_fort.sub(r'\1E\2', val)
            try:
                x = float(val)
            except ValueError:
                logger.warn("could not parse value on line %s" % i)
                continue
            val = columns[self.ERRGR_COLUMN_NUM]
            val = re_dbl_fort.sub(r'\1E\2', val)
            try:
                y = float(val)
            except ValueError:
                logger.warn("could not parse value on line %s" % i)
                continue
            xs.append(x)
            ys.append(y)
        return {"hrmcdfile/r1": xs, "hrmcdfile/g1": ys}

    def extract_psdexp_func(fp):
        xs = []
        ys = []
        for i, line in enumerate(fp):
            columns = line.split()
            xs.append(float(columns[0]))
            ys.append(float(columns[1]))
        return {"hrmcdfile/r2": xs, "hrmcdfile/g2": ys}

    def extract_grfinal_func(fp):
        xs = []
        ys = []
        for i, line in enumerate(fp):
            columns = line.split()
            xs.append(float(columns[0]))
            ys.append(float(columns[1]))
        # FIXME: len(xs) == len(ys) for this to work.
        # TODO: hack to handle when xs and ys are too large to fit in
        # Parameter with db_index; solved by function call at
        # destination
        cut_xs = [xs[i] for i, x in enumerate(xs)
                  if (i % (len(xs) / 50) == 0)]
        cut_ys = [ys[i] for i, x in enumerate(ys)
                  if (i % (len(ys) / 50) == 0)]
        return {"hrmcdfile/r3": cut_xs, "hrmcdfile/g3": cut_ys}

    def extract_inputgr_func(fp):
        xs = []
        ys = []
        for i, line in enumerate(fp):
            columns = line.split()
            xs.append(float(columns[0]))
            ys.append(float(columns[1]))
        # FIXME: len(xs) == len(ys) for this to work.
        # TODO: hack to handle when xs and ys are too large to fit in
        # Parameter with db_index; solved by function call at
        # destination
        cut_xs = [xs[i] for i, x in enumerate(xs)
                  if (i % (len(xs) / 50) == 0)]
        cut_ys = [ys[i] for i, x in enumerate(ys)
                  if (i % (len(ys) / 50) == 0)]
        return {"hrmcdfile/r4": cut_xs, "hrmcdfile/g4": cut_ys}

    # todo: replace self.boto_settings with mytardis_settings

    # Only save graph paramset for experiment once per experiment.
    if not self.final_graph_paramset:
        self.final_graph_paramset = [mytardis.create_graph_paramset(
            "expgraph",
            name="hrmcexp2",
            graph_info={"axes": ["step", "ERRGr*wf"],
                        "precision": [0, 2],
                        "legends": legends},
            value_dict={},
            value_keys=exp_value_keys)]
        experiment_paramset = self.final_graph_paramset
    else:
        experiment_paramset = []

    dataset_paramset = [
        mytardis.create_paramset('hrmcdataset/output', []),
        mytardis.create_graph_paramset(
            'dsetgraph',
            name="hrmcdset",
            graph_info={"axes": ["r (Angstroms)", "PSD"],
                        "legends": ["psd", "PSD_exp"],
                        "type": "line"},
            value_dict=hrmcdset_val,
            value_keys=[["hrmcdfile/r1", "hrmcdfile/g1"],
                        ["hrmcdfile/r2", "hrmcdfile/g2"]]),
        mytardis.create_graph_paramset(
            'dsetgraph',
            name='hrmcdset2',
            graph_info={"axes": ["r (Angstroms)", "g(r)"],
                        "legends": ["data_grfinal", "input_gr"],
                        "type": "line"},
            value_dict={},
            value_keys=[["hrmcdfile/r3", "hrmcdfile/g3"],
                        ["hrmcdfile/r4", "hrmcdfile/g4"]]),
        mytardis.create_graph_paramset(
            'dsetgraph',
            name='hrmcdset%s' % m,
            graph_info={},
            value_dict={"hrmcdset%s/step" % m: xs,
                        "hrmcdset%s/err" % m: ys},
            value_keys=[]),
    ]
    datafile_paramset = [
        mytardis.create_graph_paramset(
            'dfilegraph',
            name="hrmcdfile",
            graph_info={},
            value_dict={},
            value_keys=[])
    ]
    dfile_extract_func = {
        'psd.dat': extract_psd_func,
        'PSD_exp.dat': extract_psdexp_func,
        'data_grfinal.dat': extract_grfinal_func,
        'input_gr.dat': extract_inputgr_func,
    }
    logger.debug("experiment_paramset=%s" % experiment_paramset)
    logger.debug("dataset_paramset=%s" % dataset_paramset)
    logger.debug("datafile_paramset=%s" % datafile_paramset)
    logger.debug("dfile_extract_func=%s" % dfile_extract_func)

    return (experiment_paramset, dataset_paramset, datafile_paramset,
            dfile_extract_func)
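# The re_dbl_fort rewrite above is purely textual: it turns Fortran
# double-precision literals into a form that float() accepts.  For
# example:
#
#   >>> re_dbl_fort = re.compile(r'(\d*\.\d+)[dD]([-+]?\d+)')
#   >>> re_dbl_fort.sub(r'\1E\2', '0.41232D-03')
#   '0.41232E-03'
#   >>> float('0.41232E-03')
#   0.00041232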
def curate_dataset(self, run_settings, experiment_id, base_dir, output_url,
                   all_settings):
    logger.debug("curate_dataset")
    iter_output_dir = os.path.join(base_dir, "output")
    logger.debug("iter_output_dir=%s" % iter_output_dir)
    output_prefix = '%s://%s@' % (all_settings['scheme'],
                                  all_settings['type'])
    iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)
    logger.debug("iter_output_dir=%s" % iter_output_dir)
    logger.debug("output_url=%s" % output_url)

    (scheme, host, mypath, location, query_settings) = \
        storage.parse_bdpurl(output_url)
    fsys = storage.get_filesystem(output_url)
    node_output_dirnames, _ = fsys.listdir(mypath)
    logger.debug("node_output_dirnames=%s" % node_output_dirnames)

    curate_data = getval(run_settings,
                         '%s/input/mytardis/curate_data' % RMIT_SCHEMA)
    if curate_data:
        if all_settings['mytardis_host']:

            EXP_DATASET_NAME_SPLIT = 2

            def get_exp_name_for_output(settings, url, path):
                return str(os.sep.join(
                    path.split(os.sep)[:-EXP_DATASET_NAME_SPLIT]))

            def get_dataset_name_for_output(settings, url, path):
                logger.debug("path=%s" % path)
                host = settings['host']
                prefix = 'ssh://%s@%s' % (settings['type'], host)
                source_url = get_url_with_credentials(
                    settings, os.path.join(prefix, path, "HRMC.inp_values"),
                    is_relative_path=False)
                logger.debug("source_url=%s" % source_url)
                try:
                    content = storage.get_file(source_url)
                except IOError as e:
                    logger.warn("cannot read file %s" % e)
                    return str(os.sep.join(
                        path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))
                logger.debug("content=%s" % content)
                try:
                    values_map = dict(json.loads(str(content)))
                except Exception as e:
                    logger.error("cannot load values_map %s: from %s. "
                                 "Error=%s" % (content, source_url, e))
                    return str(os.sep.join(
                        path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))
                try:
                    iteration = str(path.split(os.sep)[-2:-1][0])
                except Exception as e:
                    logger.error(e)
                    iteration = ""
                if "_" in iteration:
                    iteration = iteration.split("_")[1]
                else:
                    iteration = "final"
                dataset_name = "%s_%s_%s" % (
                    iteration,
                    values_map['generator_counter'],
                    values_map['run_counter'])
                logger.debug("dataset_name=%s" % dataset_name)
                return dataset_name
def curate_dataset(self, run_settings, experiment_id, base_dir, output_url, all_settings):
    iteration = int(getval(run_settings, '%s/system/id' % RMIT_SCHEMA))
    iter_output_dir = os.path.join(base_dir, "output_%s" % iteration)
    output_prefix = '%s://%s@' % (all_settings['scheme'], all_settings['type'])
    iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)
    (scheme, host, mypath, location, query_settings) = storage.parse_bdpurl(output_url)
    fsys = storage.get_filesystem(output_url)
    node_output_dirnames, _ = fsys.listdir(mypath)
    logger.debug("node_output_dirnames=%s" % node_output_dirnames)

    if all_settings['mytardis_host']:
        for i, node_output_dirname in enumerate(node_output_dirnames):
            node_path = os.path.join(iter_output_dir, node_output_dirname)
            # find criterion for this node, if any
            crit = None
            for ni in self.outputs:
                if ni.dirname == node_output_dirname:
                    crit = ni.criterion
                    break
            else:
                logger.debug("criterion not found")
                continue
            logger.debug("crit=%s" % crit)

            # graph_params = []

            def extract_psd_func(fp):
                xs = []
                ys = []
                for i, line in enumerate(fp):
                    columns = line.split()
                    xs.append(float(columns[0]))
                    ys.append(float(columns[1]))
                res = {"hrmcdfile/r1": xs, "hrmcdfile/g1": ys}
                return res

            def extract_psdexp_func(fp):
                xs = []
                ys = []
                for i, line in enumerate(fp):
                    columns = line.split()
                    xs.append(float(columns[0]))
                    ys.append(float(columns[1]))
                res = {"hrmcdfile/r2": xs, "hrmcdfile/g2": ys}
                return res

            def extract_grfinal_func(fp):
                xs = []
                ys = []
                for i, line in enumerate(fp):
                    columns = line.split()
                    xs.append(float(columns[0]))
                    ys.append(float(columns[1]))
                #FIXME: len(xs) == len(ys) for this to work.
                #TODO: hack to handle when xs and ys are too
                # large to fit in Parameter with db_index.
                # solved by function call at destination
                cut_xs = [xs[i] for i, x in enumerate(xs) if (i % (len(xs) / 20) == 0)]
                cut_ys = [ys[i] for i, x in enumerate(ys) if (i % (len(ys) / 20) == 0)]
                res = {"hrmcdfile/r3": cut_xs, "hrmcdfile/g3": cut_ys}
                return res

            def extract_inputgr_func(fp):
                xs = []
                ys = []
                for i, line in enumerate(fp):
                    columns = line.split()
                    xs.append(float(columns[0]))
                    ys.append(float(columns[1]))
                #FIXME: len(xs) == len(ys) for this to work.
                #TODO: hack to handle when xs and ys are too
                # large to fit in Parameter with db_index.
                # solved by function call at destination
                cut_xs = [xs[i] for i, x in enumerate(xs) if (i % (len(xs) / 20) == 0)]
                cut_ys = [ys[i] for i, x in enumerate(ys) if (i % (len(ys) / 20) == 0)]
                res = {"hrmcdfile/r4": cut_xs, "hrmcdfile/g4": cut_ys}
                return res

            #TODO: hrmcexp graph should be tagged to input directories (not output directories)
            #because we want the result after pruning.
            #TODO: replace self.boto_settings with mytardis_settings

            EXP_DATASET_NAME_SPLIT = 2

            def get_exp_name_for_output(settings, url, path):
                # return str(os.sep.join(path.split(os.sep)[:-EXP_DATASET_NAME_SPLIT]))
                return str(os.sep.join(path.split(os.sep)[-4:-2]))

            def get_dataset_name_for_output(settings, url, path):
                logger.debug("path=%s" % path)
                host = settings['host']
                prefix = 'ssh://%s@%s' % (settings['type'], host)
                source_url = get_url_with_credentials(
                    settings, os.path.join(prefix, path, "HRMC.inp_values"),
                    is_relative_path=False)
                logger.debug("source_url=%s" % source_url)
                try:
                    content = storage.get_file(source_url)
                except IOError as e:
                    logger.warn("cannot read file %s" % e)
                    return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))
                logger.debug("content=%s" % content)
                try:
                    values_map = dict(json.loads(str(content)))
                except Exception as e:
                    logger.warn("cannot load %s: %s" % (content, e))
                    return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))
                try:
                    iteration = str(path.split(os.sep)[-2:-1][0])
                except Exception as e:
                    logger.error(e)
                    iteration = ""
                if "_" in iteration:
                    iteration = iteration.split("_")[1]
                else:
                    iteration = "final"
                dataset_name = "%s_%s_%s" % (iteration,
                                             values_map['generator_counter'],
                                             values_map['run_counter'])
                logger.debug("dataset_name=%s" % dataset_name)
                return dataset_name
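# A small illustration (paths below are made up) of the fallback naming in
# get_dataset_name_for_output: when HRMC.inp_values cannot be read or parsed,
# the last EXP_DATASET_NAME_SPLIT path components become the dataset name;
# otherwise the name is "<iteration>_<generator_counter>_<run_counter>".
import os

EXP_DATASET_NAME_SPLIT = 2
path = os.sep.join(["home", "sweep277", "hrmc278", "output_3", "node_1"])
fallback = str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))
assert fallback == os.path.join("output_3", "node_1")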
def create_dataset(settings, source_url, exp_id, exp_name=_get_exp_name,
        dataset_name=_get_dataset_name,
        experiment_paramset=[],
        dataset_paramset=[],
        datafile_paramset=[],
        dfile_extract_func=None):
    """
    POST to mytardis_host REST API with mytardis_user and mytardis_password
    credentials to create or update experiment for a new dataset containing
    datafiles from source_url BDP directory.

    :param dict settings.keys(): ['mytardis_user', 'mytardis_password', 'mytardis_host']
    :param str source_url: chiminey URL for the source of dataset
    :param int exp_id: unique experiment id for existing experiment or 0 for new
    :param func exp_name: function that returns experiment name based on url and path
    :param func dataset_name: function that returns dataset name based on url and path
    :param paramset dataset_paramset: metadata package for dataset
    :param paramset datafile_paramset: metadata package for datafiles
    :param func dfile_extract_func: function that extracts datafile information
    :return: new mytardis experiment id
    :rtype: int
    :raises: IndexError if settings does not contain required configuration
        fields or is otherwise invalid.

    If exp_id is non-zero, adds to existing experiment with exp_id, else the
    newly created identifier is returned. experiment_paramset is appended to
    any existing metadata and does not overwrite.
    """
    #FIXME,TODO: What if tardis is unavailable? Connection to mytardis probably
    #better handled as separate celery subtask, which can retry until working
    #and be async.
    #FIXME: missing all error checking and retrying of connection to mytardis.
    #Reliability framework should be able to supply this?
    #TODO: method should take BDP url source_url, not an expanded one.
    logger.debug("post_dataset")
    tardis_user = settings["mytardis_user"]
    tardis_pass = settings["mytardis_password"]
    tardis_ssl = int(settings["mytardis_ssl"])
    tardis_protocol = "http://%s"
    if tardis_ssl > 0:
        tardis_protocol = "https://%s"
    tardis_host_url = tardis_protocol % settings["mytardis_host"]
    tardis_port = settings["mytardis_port"]
    logger.debug("posting dataset from %s to mytardis at %s as user %s"
                 % (source_url, tardis_host_url, tardis_user))
    (source_scheme, source_location, source_path, source_location,
        query_settings) = storage.parse_bdpurl(source_url)
    logger.debug("source_path=%s" % source_path)
    if source_scheme == "file":
        root_path = _get_value('root_path', query_settings)
    else:
        logger.debug('schema=%s' % source_scheme)
        #raise InvalidInputError("only file source_schema supported for source of mytardis transfer")

    expname = exp_name(settings, source_url, source_path)
    new_exp_id = create_experiment(settings, exp_id, expname, experiment_paramset)

    new_experiment_uri = "/api/v1/experiment/%s/" % new_exp_id
    # TODO: check that we do not already have a dataset with
    # the same name and overwrite or don't move.
    # save dataset
    logger.debug("saving dataset in experiment at %s" % new_exp_id)
    url = "%s:%s/api/v1/dataset/?format=json" % (tardis_host_url, tardis_port)
    headers = {'content-type': 'application/json'}
    schemas = dataset_paramset
    logger.debug("schemas=%s" % schemas)
    data = json.dumps({
        'experiments': [new_experiment_uri],
        'description': dataset_name(settings, source_url, source_path),
        "parameter_sets": schemas
    })
    logger.debug("data=%s" % data)
    logger.debug("post to %s" % url)
    r = requests.post(url, data=data, headers=headers,
                      auth=HTTPBasicAuth(tardis_user, tardis_pass),
                      verify=False)
    # FIXME: need to check for status_code and handle failures.
    logger.debug("r.json=%s" % r.json)
    logger.debug("r.text=%s" % r.text)
    logger.debug("r.headers=%s" % r.headers)
    header_location = r.headers['location']
    new_dataset_uri = header_location[len(tardis_host_url):]

    # move files across
    source_files = storage.list_all_files(source_url)
    logger.debug("source_files=%s" % source_files)
    url = "%s:%s/api/v1/dataset_file/" % (tardis_host_url, tardis_port)
    args = source_url.split('?')[1]
    logger.debug('args=%s' % args)
    staging_dir = tempfile.mkdtemp(suffix="", prefix="chiminey")
    try:
        for fname in source_files:
            logger.debug('fname=%s' % os.path.join(source_location, fname))
            source_file_url = "%s://%s?%s" % (
                source_scheme, os.path.join(source_location, fname), args)
            logger.debug('source_file_url=%s' % source_file_url)
            # TODO: add retrying to this operation.
            source_file = storage.get_filep(source_file_url, sftp_reference=False)
            #logger.debug('source_file=%s' % source_file._name)
            # we have to load the contents locally at least once.
            f_contents = source_file.read()
            # Make a temporary copy, as the mytardis datafile POST requires a filename.
            tempfname = os.path.basename(fname)
            with open(os.path.join(staging_dir, tempfname), 'wb') as fp:
                fp.write(f_contents)

            new_datafile_paramset = []
            logger.debug("datafile_paramset=%s" % datafile_paramset)
            for paramset in datafile_paramset:
                new_paramset = {}
                logger.debug("paramset=%s" % paramset)
                new_paramset['schema'] = paramset['schema']
                has_value = False
                has_keys = False
                new_param_vals = []
                for param in paramset['parameters']:
                    new_param = {}
                    for param_key, v in param.iteritems():
                        logger.debug("param_key=%s v=%s" % (param_key, v))
                        if param_key == 'name' and v == "value_dict":
                            new_param['name'] = 'value_dict'
                            new_value = {}
                            found_func_match = False
                            for fn, func in dfile_extract_func.iteritems():
                                logger.debug("fn=%s,func=%s" % (fn, func))
                                if fn == os.path.basename(fname):
                                    # if fn file is very long, this is inefficient
                                    logger.debug("fname=%s" % os.path.join(staging_dir, fn))
                                    with open(os.path.join(staging_dir, fn), 'r') as fp:
                                        new_value.update(func(fp))
                                    found_func_match = True
                                    # FIXME: can multiple funcs match?
                                    logger.debug("matched %s %s" % (fn, func))
                            logger.debug("new_value=%s" % new_value)
                            new_param['string_value'] = json.dumps(new_value) \
                                if found_func_match else param['string_value']
                            break
                        else:
                            # in case string_value is processed first
                            new_param[param_key] = v
                    logger.debug("string_value len=%s" % len(new_param['string_value']))
                    if new_param['name'] == "value_dict" and len(json.loads(new_param['string_value'])):
                        has_value = True
                    logger.debug("has_value=%s" % has_value)
                    if new_param['name'] == "value_keys" and len(json.loads(new_param['string_value'])):
                        has_keys = True
                    logger.debug("has_keys=%s" % has_keys)
                    new_param_vals.append(new_param)
                new_paramset['parameters'] = new_param_vals
                logger.debug("has_value=%s" % has_value)
                logger.debug("has_keys=%s" % has_keys)
                if has_value or has_keys:
                    new_datafile_paramset.append(new_paramset)
                else:
                    logger.debug("not adding %s" % new_paramset)
            logger.debug("new_datafile_paramset=%s" % new_datafile_paramset)
            file_size = len(f_contents)
            logger.debug("file_size=%s" % file_size)
            if file_size:
                data = json.dumps({
                    u'dataset': str(new_dataset_uri),
                    u'parameter_sets': new_datafile_paramset,
                    u'filename': os.path.basename(fname),
                    u'size': file_size,
                    u'mimetype': 'text/plain',
                    u'md5sum': hashlib.md5(f_contents).hexdigest()
                })
                logger.debug("data=%s" % data)
                with open(os.path.join(staging_dir, tempfname), 'rb') as fp:
                    r = requests.post(url, data={"json_data": data},
                                      headers={'Accept': 'application/json'},
                                      files={'attached_file': fp},
                                      auth=HTTPBasicAuth(tardis_user, tardis_pass),
                                      verify=False)
                # FIXME: need to check for status_code and handle failures.
                logger.debug("r.js=%s" % r.json)
                logger.debug("r.te=%s" % r.text)
                logger.debug("r.he=%s" % r.headers)
            else:
                logger.warn("not transferring empty file %s" % fname)
                #TODO: check whether mytardis api can accept zero length files
    finally:
        shutil.rmtree(staging_dir)
    return new_exp_id
class HRMCConverge(Converge):

    def input_valid(self, settings_to_test):
        """ Return a tuple, where the first element is True if settings_to_test
        is syntactically and semantically valid for this stage. Otherwise,
        return False with the second element in the tuple describing the
        problem. """
        error = []
        try:
            int(getval(settings_to_test, '%s/input/hrmc/max_iteration' % RMIT_SCHEMA))
        except (ValueError, SettingNotFoundException):
            error.append("Cannot load max_iteration")
        try:
            float(getval(settings_to_test, '%s/input/hrmc/error_threshold' % RMIT_SCHEMA))
        except (SettingNotFoundException, ValueError):
            error.append("Cannot load error threshold")
        if error:
            return (False, '. '.join(error))
        return (True, "ok")

    def curate_dataset(self, run_settings, experiment_id, base_dir, output_url, all_settings):
        logger.debug("curate_dataset")
        iter_output_dir = os.path.join(base_dir, "output")
        logger.debug("iter_output_dir=%s" % iter_output_dir)
        output_prefix = '%s://%s@' % (all_settings['scheme'], all_settings['type'])
        iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)
        logger.debug("iter_output_dir=%s" % iter_output_dir)
        logger.debug("output_url=%s" % output_url)
        (scheme, host, mypath, location, query_settings) = storage.parse_bdpurl(output_url)
        fsys = storage.get_filesystem(output_url)
        node_output_dirnames, _ = fsys.listdir(mypath)
        logger.debug("node_output_dirnames=%s" % node_output_dirnames)

        curate_data = getval(run_settings, '%s/input/mytardis/curate_data' % RMIT_SCHEMA)
        if curate_data:
            if all_settings['mytardis_host']:

                EXP_DATASET_NAME_SPLIT = 2

                def get_exp_name_for_output(settings, url, path):
                    return str(os.sep.join(path.split(os.sep)[:-EXP_DATASET_NAME_SPLIT]))

                def get_dataset_name_for_output(settings, url, path):
                    logger.debug("path=%s" % path)
                    host = settings['host']
                    prefix = 'ssh://%s@%s' % (settings['type'], host)
                    source_url = get_url_with_credentials(
                        settings, os.path.join(prefix, path, "HRMC.inp_values"),
                        is_relative_path=False)
                    logger.debug("source_url=%s" % source_url)
                    try:
                        content = storage.get_file(source_url)
                    except IOError as e:
                        logger.warn("cannot read file %s" % e)
                        return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))
                    logger.debug("content=%s" % content)
                    try:
                        values_map = dict(json.loads(str(content)))
                    except Exception as e:
                        logger.error("cannot load values_map %s: from %s. Error=%s" % (content, source_url, e))
                        return str(os.sep.join(path.split(os.sep)[-EXP_DATASET_NAME_SPLIT:]))
                    try:
                        iteration = str(path.split(os.sep)[-2:-1][0])
                    except Exception as e:
                        logger.error(e)
                        iteration = ""
                    if "_" in iteration:
                        iteration = iteration.split("_")[1]
                    else:
                        iteration = "final"
                    dataset_name = "%s_%s_%s" % (iteration,
                                                 values_map['generator_counter'],
                                                 values_map['run_counter'])
                    logger.debug("dataset_name=%s" % dataset_name)
                    return dataset_name

                re_dbl_fort = re.compile(r'(\d*\.\d+)[dD]([-+]?\d+)')

                exp_value_keys = []
                legends = []
                for m, node_dir in enumerate(node_output_dirnames):
                    node_path = os.path.join(iter_output_dir, node_dir)
                    exp_value_keys.append(["hrmcdset%s/step" % m, "hrmcdset%s/err" % m])
                    source_url = get_url_with_credentials(all_settings, node_path, is_relative_path=False)
                    (source_scheme, source_location, source_path, source_location,
                        query_settings) = storage.parse_bdpurl(source_url)
                    logger.debug("source_url=%s" % source_url)
                    legends.append(
                        get_dataset_name_for_output(all_settings, "", source_path))
                logger.debug("exp_value_keys=%s" % exp_value_keys)
                logger.debug("legends=%s" % legends)

                graph_paramset = [mytardis.create_graph_paramset("expgraph",
                    name="hrmcexp2",
                    graph_info={"axes": ["step", "ERRGr*wf"], "precision": [0, 2], "legends": legends},
                    value_dict={},
                    value_keys=exp_value_keys)]

                for m, node_dir in enumerate(node_output_dirnames):
                    node_path = os.path.join(iter_output_dir, node_dir)
                    logger.debug("node_path=%s" % node_path)
                    #FIXME: this calculation should be done as in extract_psd_func
                    # pulling directly from data_errors rather than passing in
                    # through nested function.
                    dataerrors_url = get_url_with_credentials(all_settings,
                        os.path.join(node_path, DATA_ERRORS_FILE), is_relative_path=False)
                    logger.debug("dataerrors_url=%s" % dataerrors_url)
                    dataerrors_content = storage.get_file(dataerrors_url)
                    xs = []
                    ys = []
                    for i, line in enumerate(dataerrors_content.splitlines()):
                        if i == 0:
                            continue
                        columns = line.split()
                        try:
                            hrmc_step = int(columns[STEP_COLUMN_NUM])
                        except ValueError:
                            logger.warn("could not parse hrmc_step value on line %s" % i)
                            continue
                        # handle Fortran double precision float format
                        val = columns[ERRGR_COLUMN_NUM]
                        val = re_dbl_fort.sub(r'\1E\2', val)
                        logger.debug("val=%s" % val)
                        try:
                            hrmc_errgr = float(val)
                        except ValueError:
                            logger.warn("could not parse hrmc_errgr value on line %s" % i)
                            continue
                        xs.append(hrmc_step)
                        ys.append(hrmc_errgr)
                    logger.debug("xs=%s" % xs)
                    logger.debug("ys=%s" % ys)

                    crit_url = get_url_with_credentials(all_settings,
                        os.path.join(node_path, "criterion.txt"), is_relative_path=False)
                    try:
                        crit = storage.get_file(crit_url)
                    except (ValueError, IOError):
                        crit = None
                    # FIXME: can crit be zero?
                    if crit:
                        hrmcdset_val = {"hrmcdset/it": self.id, "hrmcdset/crit": crit}
                    else:
                        hrmcdset_val = {}

                    source_url = get_url_with_credentials(
                        all_settings, node_path, is_relative_path=False)
                    logger.debug("source_url=%s" % source_url)

                    # TODO: move into utility function for reuse
                    def extract_psd_func(fp):
                        xs = []
                        ys = []
                        for i, line in enumerate(dataerrors_content.splitlines()):
                            if i == 0:
                                continue
                            columns = line.split()
                            val = columns[STEP_COLUMN_NUM]
                            val = re_dbl_fort.sub(r'\1E\2', val)
                            logger.debug("val=%s" % val)
                            try:
                                x = float(val)
                            except ValueError:
                                logger.warn("could not parse value on line %s" % i)
                                continue
                            val = columns[ERRGR_COLUMN_NUM]
                            val = re_dbl_fort.sub(r'\1E\2', val)
                            logger.debug("val=%s" % val)
                            try:
                                y = float(val)
                            except ValueError:
                                logger.warn("could not parse value on line %s" % i)
                                continue
                            xs.append(x)
                            ys.append(y)
                        res = {"hrmcdfile/r1": xs, "hrmcdfile/g1": ys}
                        return res

                    def extract_psdexp_func(fp):
                        xs = []
                        ys = []
                        for i, line in enumerate(fp):
                            columns = line.split()
                            xs.append(float(columns[0]))
                            ys.append(float(columns[1]))
                        res = {"hrmcdfile/r2": xs, "hrmcdfile/g2": ys}
                        return res

                    def extract_grfinal_func(fp):
                        xs = []
                        ys = []
                        for i, line in enumerate(fp):
                            columns = line.split()
                            xs.append(float(columns[0]))
                            ys.append(float(columns[1]))
                        #FIXME: len(xs) == len(ys) for this to work.
                        #TODO: hack to handle when xs and ys are too
                        # large to fit in Parameter with db_index.
                        # solved by function call at destination
                        cut_xs = [xs[i] for i, x in enumerate(xs) if (i % (len(xs) / 20) == 0)]
                        cut_ys = [ys[i] for i, x in enumerate(ys) if (i % (len(ys) / 20) == 0)]
                        res = {"hrmcdfile/r3": cut_xs, "hrmcdfile/g3": cut_ys}
                        return res

                    def extract_inputgr_func(fp):
                        xs = []
                        ys = []
                        for i, line in enumerate(fp):
                            columns = line.split()
                            xs.append(float(columns[0]))
                            ys.append(float(columns[1]))
                        #FIXME: len(xs) == len(ys) for this to work.
                        #TODO: hack to handle when xs and ys are too
                        # large to fit in Parameter with db_index.
                        # solved by function call at destination
                        cut_xs = [xs[i] for i, x in enumerate(xs) if (i % (len(xs) / 20) == 0)]
                        cut_ys = [ys[i] for i, x in enumerate(ys) if (i % (len(ys) / 20) == 0)]
                        res = {"hrmcdfile/r4": cut_xs, "hrmcdfile/g4": cut_ys}
                        return res

                    #TODO: replace self.boto_settings with mytardis_settings
                    experiment_id = mytardis.create_dataset(
                        settings=all_settings,
                        source_url=source_url,
                        exp_name=get_exp_name_for_output,
                        dataset_name=get_dataset_name_for_output,
                        exp_id=experiment_id,
                        experiment_paramset=graph_paramset,
                        dataset_paramset=[
                            mytardis.create_paramset('hrmcdataset/output', []),
                            mytardis.create_graph_paramset('dsetgraph',
                                name="hrmcdset",
                                graph_info={"axes": ["r (Angstroms)", "PSD"],
                                            "legends": ["psd", "PSD_exp"], "type": "line"},
                                value_dict=hrmcdset_val,
                                value_keys=[["hrmcdfile/r1", "hrmcdfile/g1"],
                                            ["hrmcdfile/r2", "hrmcdfile/g2"]]),
                            mytardis.create_graph_paramset('dsetgraph',
                                name='hrmcdset2',
                                graph_info={"axes": ["r (Angstroms)", "g(r)"],
                                            "legends": ["data_grfinal", "input_gr"], "type": "line"},
                                value_dict={},
                                value_keys=[["hrmcdfile/r3", "hrmcdfile/g3"],
                                            ["hrmcdfile/r4", "hrmcdfile/g4"]]),
                            mytardis.create_graph_paramset('dsetgraph',
                                name='hrmcdset%s' % m,
                                graph_info={},
                                value_dict={"hrmcdset%s/step" % m: xs,
                                            "hrmcdset%s/err" % m: ys},
                                value_keys=[]),
                        ],
                        datafile_paramset=[
                            mytardis.create_graph_paramset('dfilegraph',
                                name="hrmcdfile",
                                graph_info={},
                                value_dict={},
                                value_keys=[])
                        ],
                        dfile_extract_func={
                            'psd.dat': extract_psd_func,
                            'PSD_exp.dat': extract_psdexp_func,
                            'data_grfinal.dat': extract_grfinal_func,
                            'input_gr.dat': extract_inputgr_func})
                    graph_paramset = []
        return experiment_id
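# A quick illustration of the Fortran double-precision repair used above:
# values such as "0.123D+02" are rewritten to "0.123E+02" so float() accepts them.
import re

re_dbl_fort = re.compile(r'(\d*\.\d+)[dD]([-+]?\d+)')
assert float(re_dbl_fort.sub(r'\1E\2', "0.123D+02")) == 12.3
assert float(re_dbl_fort.sub(r'\1E\2', "5.5d-01")) == 0.55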
def curate_dataset(self, run_settings, experiment_id, base_url, output_url, all_settings):
    ''' Curates dataset '''
    # Retrieve process directories below the current output location
    iteration = int(getval(run_settings, '%s/system/id' % SCHEMA_PREFIX))
    output_prefix = '%s://%s@' % (all_settings['scheme'], all_settings['type'])
    current_output_url = "%s%s" % (output_prefix,
        os.path.join(base_url, "output_%s" % iteration))
    (scheme, host, current_output_path, location, query_settings) = storage.parse_bdpurl(output_url)
    output_fsys = storage.get_filesystem(output_url)
    process_output_dirs, _ = output_fsys.listdir(current_output_path)

    # Curate a dataset with metadata per process
    for i, process_output_dir in enumerate(process_output_dirs):
        # Expand the process output directory and add credentials for access
        process_output_url = '/'.join([current_output_url, process_output_dir])
        process_output_url_with_cred = get_url_with_credentials(
            all_settings, process_output_url, is_relative_path=False)
        # Expand the process output file and add credentials for access
        output_file_url_with_cred = storage.get_url_with_credentials(
            all_settings, '/'.join([process_output_url, OUTPUT_FILE]),
            is_relative_path=False)
        try:
            output_content = storage.get_file(output_file_url_with_cred)
            val1, val2 = output_content.split()
        except (ValueError, IndexError, IOError) as e:
            logger.warn(e)
            continue
        try:
            x = float(val1)
            y = float(val2)
        except (ValueError, IndexError) as e:
            logger.warn(e)
            continue

        # Use the process id as the MyTardis dataset name
        all_settings['graph_point_id'] = str(i)

        def _get_dataset_name(settings, url, path):
            return all_settings['graph_point_id']

        # Create a new dataset and add it to the experiment.
        # If experiment_id == 0, a new experiment is created.
        experiment_id = mytardis.create_dataset(
            settings=all_settings,  # MyTardis credentials
            source_url=process_output_url_with_cred,
            exp_id=experiment_id,
            dataset_name=_get_dataset_name,  # the function that defines dataset name
            dataset_paramset=[
                # a new blank parameter set conforming to schema 'remotemake/output'
                mytardis.create_paramset("remotemake/output", []),
                mytardis.create_graph_paramset(
                    "dsetgraph",  # name of schema
                    name="randdset",  # a unique dataset name
                    graph_info={},
                    value_dict={"randdset/x": x, "randdset/y": y},  # values used in experiment graphs
                    value_keys=[]),
            ])
    return experiment_id
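# A sketch of the per-process output contract assumed above (OUTPUT_FILE is
# whatever name the surrounding module defines): a single line holding two
# floats, which become one (x, y) point in the randdset experiment graph.
content = "1.5 2.25\n"
val1, val2 = content.split()
x, y = float(val1), float(val2)
assert (x, y) == (1.5, 2.25)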
def create_dataset_for_intermediate_output(self, run_settings, experiment_id, base_dir,
        output_url, all_settings, outputs=[]):
    logger.debug('self_outputs_curate=%s' % outputs)
    iteration = int(getval(run_settings, '%s/system/id' % self.SCHEMA_PREFIX))
    iter_output_dir = os.path.join(base_dir, "output_%s" % iteration)
    output_prefix = '%s://%s@' % (all_settings['scheme'], all_settings['type'])
    iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)
    (scheme, host, mypath, location, query_settings) = storage.parse_bdpurl(output_url)
    fsys = storage.get_filesystem(output_url)
    node_output_dirnames, _ = fsys.listdir(mypath)
    logger.debug("node_output_dirnames=%s" % node_output_dirnames)

    if all_settings['mytardis_host']:
        output_dirs = []
        for m, dir_name in enumerate(node_output_dirnames):
            output_dirs.append(os.path.join(iter_output_dir, dir_name))
        for i, output_dir in enumerate(output_dirs):
            dataset_paramset = []
            datafile_paramset = []
            dfile_extract_func = {}
            self.load_metadata_builder(run_settings)
            if self.METADATA_BUILDER:
                (continue_loop, dataset_paramset, datafile_paramset, dfile_extract_func) = \
                    self.METADATA_BUILDER.build_metadata_for_intermediate_output(
                        output_dir, outputs, run_settings=run_settings,
                        storage_settings=all_settings, output_dirs=output_dirs)
                if continue_loop:
                    continue
            source_dir_url = get_url_with_credentials(
                all_settings, output_dir, is_relative_path=False)
            logger.debug("source_dir_url=%s" % source_dir_url)
            logger.debug('all_settings_here=%s' % all_settings)
            system_id = int(getval(run_settings, '%s/system/id' % self.SCHEMA_PREFIX))
            #TODO Mytardis
            experiment_id = mytardis.create_dataset(
                settings=all_settings,
                source_url=source_dir_url,
                exp_id=experiment_id,
                exp_name=mytardis.get_exp_name_for_intermediate_output,
                dataset_name=mytardis.get_dataset_name_for_output,
                dataset_paramset=dataset_paramset,
                datafile_paramset=datafile_paramset,
                dfile_extract_func=dfile_extract_func)
    else:
        logger.warn("no mytardis host specified")
        return 0
    return experiment_id
def build_metadata_for_final_output(self, m, output_dir, **kwargs):
    #FIXME: this calculation should be done as in extract_psd_func
    # pulling directly from data_errors rather than passing in
    # through nested function.
    experiment_paramset = []
    dataset_paramset = []
    datafile_paramset = []
    dfile_extract_func = {}

    exp_value_keys = []
    legends = []
    for m, current_dir in enumerate(kwargs['output_dirs']):
        #node_path = os.path.join(iter_output_dir, node_dir)
        exp_value_keys.append(["hrmcdset%s/step" % m, "hrmcdset%s/err" % m])
        source_url = storage.get_url_with_credentials(
            kwargs['storage_settings'], current_dir, is_relative_path=False)
        (source_scheme, source_location, source_path, source_location,
            query_settings) = storage.parse_bdpurl(source_url)
        logger.debug("source_url=%s" % source_url)
        legends.append(
            mytardis.get_dataset_name_for_output(
                kwargs['storage_settings'], "", source_path))
    logger.debug("exp_value_keys=%s" % exp_value_keys)
    logger.debug("legends=%s" % legends)

    # for m, output_dir in enumerate(kwargs['output_dirs']):
    #node_path = os.path.join(iter_output_dir, output_dir)
    node_path = output_dir
    logger.debug("node_path=%s" % node_path)

    dataerrors_url = storage.get_url_with_credentials(
        kwargs['storage_settings'],
        os.path.join(node_path, self.DATA_ERRORS_FILE), is_relative_path=False)
    logger.debug("dataerrors_url=%s" % dataerrors_url)
    dataerrors_content = storage.get_file(dataerrors_url)
    xs = []
    ys = []
    re_dbl_fort = re.compile(r'(\d*\.\d+)[dD]([-+]?\d+)')
    for i, line in enumerate(dataerrors_content.splitlines()):
        if i == 0:
            continue
        columns = line.split()
        try:
            hrmc_step = int(columns[self.STEP_COLUMN_NUM])
        except ValueError:
            logger.warn("could not parse hrmc_step value on line %s" % i)
            continue
        # handle Fortran double precision float format
        val = columns[self.ERRGR_COLUMN_NUM]
        val = re_dbl_fort.sub(r'\1E\2', val)
        logger.debug("val=%s" % val)
        try:
            hrmc_errgr = float(val)
        except ValueError:
            logger.warn("could not parse hrmc_errgr value on line %s" % i)
            continue
        xs.append(hrmc_step)
        ys.append(hrmc_errgr)
    logger.debug("xs=%s" % xs)
    logger.debug("ys=%s" % ys)

    crit_url = storage.get_url_with_credentials(
        kwargs['storage_settings'],
        os.path.join(node_path, "criterion.txt"), is_relative_path=False)
    try:
        crit = storage.get_file(crit_url)
    except (ValueError, IOError):
        crit = None
    # FIXME: can crit be zero?
    logger.debug("crit=%s" % crit)
    if crit:
        system_id = int(getval(kwargs['run_settings'],
                               '%s/system/id' % django_settings.SCHEMA_PREFIX))
        hrmcdset_val = {"hrmcdset/it": system_id, "hrmcdset/crit": crit}
    else:
        hrmcdset_val = {}

    # TODO: move into utility function for reuse
    def extract_psd_func(fp):
        xs = []
        ys = []
        for i, line in enumerate(dataerrors_content.splitlines()):
            if i == 0:
                continue
            columns = line.split()
            val = columns[self.STEP_COLUMN_NUM]
            val = re_dbl_fort.sub(r'\1E\2', val)
            logger.debug("val=%s" % val)
            try:
                x = float(val)
            except ValueError:
                logger.warn("could not parse value on line %s" % i)
                continue
            val = columns[self.ERRGR_COLUMN_NUM]
            val = re_dbl_fort.sub(r'\1E\2', val)
            logger.debug("val=%s" % val)
            try:
                y = float(val)
            except ValueError:
                logger.warn("could not parse value on line %s" % i)
                continue
            xs.append(x)
            ys.append(y)
        res = {"hrmcdfile/r1": xs, "hrmcdfile/g1": ys}
        return res

    def extract_psdexp_func(fp):
        xs = []
        ys = []
        for i, line in enumerate(fp):
            columns = line.split()
            xs.append(float(columns[0]))
            ys.append(float(columns[1]))
        res = {"hrmcdfile/r2": xs, "hrmcdfile/g2": ys}
        return res

    def extract_grfinal_func(fp):
        xs = []
        ys = []
        for i, line in enumerate(fp):
            columns = line.split()
            xs.append(float(columns[0]))
            ys.append(float(columns[1]))
        #FIXME: len(xs) == len(ys) for this to work.
        #TODO: hack to handle when xs and ys are too
        # large to fit in Parameter with db_index.
        # solved by function call at destination
        cut_xs = [xs[i] for i, x in enumerate(xs) if (i % (len(xs) / 50) == 0)]
        cut_ys = [ys[i] for i, x in enumerate(ys) if (i % (len(ys) / 50) == 0)]
        res = {"hrmcdfile/r3": cut_xs, "hrmcdfile/g3": cut_ys}
        return res

    def extract_inputgr_func(fp):
        xs = []
        ys = []
        for i, line in enumerate(fp):
            columns = line.split()
            xs.append(float(columns[0]))
            ys.append(float(columns[1]))
        #FIXME: len(xs) == len(ys) for this to work.
        #TODO: hack to handle when xs and ys are too
        # large to fit in Parameter with db_index.
        # solved by function call at destination
        cut_xs = [xs[i] for i, x in enumerate(xs) if (i % (len(xs) / 50) == 0)]
        cut_ys = [ys[i] for i, x in enumerate(ys) if (i % (len(ys) / 50) == 0)]
        res = {"hrmcdfile/r4": cut_xs, "hrmcdfile/g4": cut_ys}
        return res

    #TODO: replace self.boto_settings with mytardis_settings
    # Only save graph paramset for experiment once per experiment.
    if not self.final_graph_paramset:
        self.final_graph_paramset = [
            mytardis.create_graph_paramset("expgraph",
                name="hrmcexp2",
                graph_info={"axes": ["step", "ERRGr*wf"],
                            "precision": [0, 2],
                            "legends": legends},
                value_dict={},
                value_keys=exp_value_keys)
        ]
        experiment_paramset = self.final_graph_paramset
    else:
        experiment_paramset = []

    dataset_paramset = [
        mytardis.create_paramset('hrmcdataset/output', []),
        mytardis.create_graph_paramset('dsetgraph',
            name="hrmcdset",
            graph_info={"axes": ["r (Angstroms)", "PSD"],
                        "legends": ["psd", "PSD_exp"],
                        "type": "line"},
            value_dict=hrmcdset_val,
            value_keys=[["hrmcdfile/r1", "hrmcdfile/g1"],
                        ["hrmcdfile/r2", "hrmcdfile/g2"]]),
        mytardis.create_graph_paramset('dsetgraph',
            name='hrmcdset2',
            graph_info={"axes": ["r (Angstroms)", "g(r)"],
                        "legends": ["data_grfinal", "input_gr"],
                        "type": "line"},
            value_dict={},
            value_keys=[["hrmcdfile/r3", "hrmcdfile/g3"],
                        ["hrmcdfile/r4", "hrmcdfile/g4"]]),
        mytardis.create_graph_paramset('dsetgraph',
            name='hrmcdset%s' % m,
            graph_info={},
            value_dict={"hrmcdset%s/step" % m: xs,
                        "hrmcdset%s/err" % m: ys},
            value_keys=[]),
    ]
    datafile_paramset = [
        mytardis.create_graph_paramset('dfilegraph',
            name="hrmcdfile",
            graph_info={},
            value_dict={},
            value_keys=[])
    ]
    dfile_extract_func = {
        'psd.dat': extract_psd_func,
        'PSD_exp.dat': extract_psdexp_func,
        'data_grfinal.dat': extract_grfinal_func,
        'input_gr.dat': extract_inputgr_func
    }
    logger.debug("experiment_paramset=%s" % experiment_paramset)
    logger.debug("dataset_paramset=%s" % dataset_paramset)
    logger.debug("datafile_paramset=%s" % datafile_paramset)
    logger.debug("dfile_extract_func=%s" % dfile_extract_func)
    return (experiment_paramset, dataset_paramset, datafile_paramset, dfile_extract_func)
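# Note on the 1-in-50 decimation above: i % (len(xs) / 50) raises
# ZeroDivisionError under Python 2 when fewer than 50 points are present.
# A guarded variant (a sketch, not the shipped code) keeps short series intact:
def _decimate(values, buckets=50):
    step = len(values) // buckets
    if step <= 1:
        return list(values)
    return [v for i, v in enumerate(values) if i % step == 0]

assert _decimate(range(10)) == list(range(10))
assert len(_decimate(range(1000))) == 50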
def create_dataset_for_final_output(self, run_settings, experiment_id, base_dir, output_url, all_settings):
    logger.debug("curate_dataset")
    iter_output_dir = os.path.join(base_dir, "output")
    logger.debug("iter_output_dir=%s" % iter_output_dir)
    output_prefix = '%s://%s@' % (all_settings['scheme'], all_settings['type'])
    iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)
    logger.debug("iter_output_dir=%s" % iter_output_dir)
    logger.debug("output_url=%s" % output_url)
    (scheme, host, mypath, location, query_settings) = storage.parse_bdpurl(output_url)
    fsys = storage.get_filesystem(output_url)
    node_output_dirnames, _ = fsys.listdir(mypath)
    logger.debug("node_output_dirnames=%s" % node_output_dirnames)

    curate_data = getval(run_settings, '%s/input/mytardis/curate_data' % self.SCHEMA_PREFIX)
    if curate_data:
        if all_settings['mytardis_host']:
            output_dirs = []
            for m, dir_name in enumerate(node_output_dirnames):
                output_dirs.append(os.path.join(iter_output_dir, dir_name))
            for m, output_dir in enumerate(output_dirs):
                #node_path = os.path.join(iter_output_dir, node_dir)
                logger.debug("output_dir=%s" % output_dir)
                experiment_paramset = []
                dataset_paramset = []
                datafile_paramset = []
                dfile_extract_func = {}
                self.load_metadata_builder(run_settings)
                if self.METADATA_BUILDER:
                    (experiment_paramset, dataset_paramset, datafile_paramset, dfile_extract_func) = \
                        self.METADATA_BUILDER.build_metadata_for_final_output(
                            m, output_dir, run_settings=run_settings,
                            storage_settings=all_settings, output_dirs=output_dirs)
                source_url = get_url_with_credentials(
                    all_settings, output_dir, is_relative_path=False)
                logger.debug("source_url=%s" % source_url)
                experiment_id = mytardis.create_dataset(
                    settings=all_settings,
                    source_url=source_url,
                    exp_name=mytardis.get_exp_name_for_output,
                    dataset_name=mytardis.get_dataset_name_for_output,
                    exp_id=experiment_id,
                    experiment_paramset=experiment_paramset,
                    dataset_paramset=dataset_paramset,
                    datafile_paramset=datafile_paramset,
                    dfile_extract_func=dfile_extract_func)
        else:
            logger.warn("no mytardis host specified")
    else:
        logger.warn('Data curation is off')
    return experiment_id
def process_outputs(self, run_settings, base_dir, input_url, all_settings):
    id = int(getval(run_settings, '%s/system/id' % RMIT_SCHEMA))
    iter_output_dir = os.path.join(base_dir, "input_%s" % (id + 1))
    output_prefix = '%s://%s@' % (all_settings['scheme'], all_settings['type'])
    iter_output_dir = "%s%s" % (output_prefix, iter_output_dir)
    (scheme, host, iter_output_path, location, query_settings) = storage.parse_bdpurl(input_url)
    iter_out_fsys = storage.get_filesystem(input_url)
    input_dirs, _ = iter_out_fsys.listdir(iter_output_path)

    # TODO: store all audit info in single file in input_X directory in transform,
    # so we do not have to load individual files within node directories here.
    min_crit = sys.float_info.max - 1.0
    min_crit_index = sys.maxint
    logger.debug("input_dirs=%s" % input_dirs)
    for input_dir in input_dirs:
        node_path = os.path.join(iter_output_dir, input_dir)
        logger.debug('node_path=%s' % node_path)
        # Retrieve audit file
        # audit_url = get_url_with_credentials(output_storage_settings,
        #     output_prefix + os.path.join(self.iter_inputdir, input_dir, 'audit.txt'), is_relative_path=False)
        audit_url = get_url_with_credentials(all_settings,
            os.path.join(node_path, "audit.txt"), is_relative_path=False)
        audit_content = storage.get_file(audit_url)
        logger.debug('audit_url=%s' % audit_url)
        # extract the best criterion error
        # FIXME: audit.txt is potentially a debug file, so its format may not be fixed.
        p = re.compile(r"Run (\d+) preserved \(error[ \t]*([0-9\.]+)\)", re.MULTILINE)
        m = p.search(audit_content)
        criterion = None
        if m:
            criterion = float(m.group(2))
            best_numb = int(m.group(1))
            # NB: assumes that subdirs in new input_X will have the same names
            # as the output dirs that created them.
            best_node = input_dir
        else:
            message = "Cannot extract criterion from audit file for iteration %s" % (self.id + 1)
            logger.warn(message)
            raise IOError(message)
        if criterion < min_crit:
            min_crit = criterion
            min_crit_index = best_numb
            min_crit_node = best_node
    logger.debug("min_crit = %s at %s" % (min_crit, min_crit_index))
    if min_crit_index >= sys.maxint:
        raise BadInputException("Unable to find minimum criterion of input files")

    # get previous best criterion
    try:
        self.prev_criterion = float(getval(run_settings, '%s/converge/criterion' % RMIT_SCHEMA))
    except (SettingNotFoundException, ValueError):
        self.prev_criterion = sys.float_info.max - 1.0
        logger.warn("no previous criterion found")

    # check whether we are under the error threshold
    logger.debug("best_num=%s" % best_numb)
    logger.debug("prev_criterion = %f" % self.prev_criterion)
    logger.debug("min_crit = %f" % min_crit)
    logger.debug('Current min criterion: %f, Prev criterion: %f' % (min_crit, self.prev_criterion))
    difference = self.prev_criterion - min_crit
    logger.debug("Difference %f" % difference)

    try:
        max_iteration = int(getval(run_settings, '%s/input/hrmc/max_iteration' % RMIT_SCHEMA))
    except (ValueError, SettingNotFoundException):
        raise BadInputException("unknown max_iteration")
    logger.debug("max_iteration=%s" % max_iteration)

    try:
        self.error_threshold = float(getval(run_settings, '%s/input/hrmc/error_threshold' % RMIT_SCHEMA))
    except (SettingNotFoundException, ValueError):
        raise BadInputException("unknown error threshold")
    logger.debug("error_threshold=%s" % self.error_threshold)

    if self.id >= (max_iteration - 1):
        logger.debug("Max Iteration Reached %d" % self.id)
        return (True, min_crit)
    elif min_crit <= self.prev_criterion and difference <= self.error_threshold:
        logger.debug("Convergence reached %f" % difference)
        return (True, min_crit)
    else:
        if difference < 0:
            logger.debug("iteration diverged")
        logger.debug("iteration continues: %d iterations so far" % self.id)
    return (False, min_crit)
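# How the audit line written by the transform stage round-trips through the
# regex above:
import re

p = re.compile(r"Run (\d+) preserved \(error[ \t]*([0-9\.]+)\)", re.MULTILINE)
m = p.search("Run 7 preserved (error 0.345)\n")
assert m and int(m.group(1)) == 7 and float(m.group(2)) == 0.345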
def create_dataset(settings, source_url, exp_id, exp_name=_get_exp_name,
        dataset_name=_get_dataset_name,
        experiment_paramset=[],
        dataset_paramset=[],
        datafile_paramset=[],
        dfile_extract_func=None):
    """
    Notes:
        POST to mytardis_host REST API with mytardis_user and mytardis_password
        credentials to create or update experiment for a new dataset containing
        datafiles from source_url BDP directory.

    Args:
        settings:
        source_url: url containing data to be ingested
        exp_id:
        [exp_name, dataset_name]: functions that return new experiment and
            dataset names respectively, based on url and path
        experiment_paramset: ...
        dataset_paramset: ...
        datafile_paramset:
        dfile_extract_func:

    FIXME,TODO: What if tardis is unavailable? Connection to mytardis probably
    better handled as separate celery subtask, which can retry until working
    and be async.
    FIXME: missing all error checking and retrying of connection to mytardis.
    Reliability framework should be able to supply this?
    """
    #TODO: method should take BDP url source_url, not an expanded one.
    logger.debug("post_dataset")
    tardis_user = settings["mytardis_user"]
    tardis_pass = settings["mytardis_password"]
    tardis_host_url = "http://%s" % settings["mytardis_host"]
    logger.debug("posting dataset from %s to mytardis at %s as user %s"
                 % (source_url, tardis_host_url, tardis_user))
    (source_scheme, source_location, source_path, source_location,
        query_settings) = storage.parse_bdpurl(source_url)
    logger.debug("source_path=%s" % source_path)
    if source_scheme == "file":
        root_path = _get_value('root_path', query_settings)
    else:
        logger.debug('schema=%s' % source_scheme)
        #raise InvalidInputError("only file source_schema supported for source of mytardis transfer")

    expname = exp_name(settings, source_url, source_path)
    new_exp_id = create_experiment(settings, exp_id, expname, experiment_paramset)

    new_experiment_uri = "/api/v1/experiment/%s/" % new_exp_id
    # TODO: check that we do not already have a dataset with
    # the same name and overwrite or don't move.
    # save dataset
    logger.debug("saving dataset in experiment at %s" % new_exp_id)
    url = "%s/api/v1/dataset/?format=json" % tardis_host_url
    headers = {'content-type': 'application/json'}
    # # FIXME: schema should be a parameter
    # schemas = [{
    #     "schema": "http://rmit.edu.au/schemas/hrmcdataset",
    #     "parameters": []
    # }]
    # if dataset_schema:
    #     schemas.append({
    #         "schema": dataset_schema,
    #         "parameters": []
    #     })
    schemas = dataset_paramset
    logger.debug("schemas=%s" % schemas)
    data = json.dumps({
        'experiments': [new_experiment_uri],
        'description': dataset_name(settings, source_url, source_path),
        "parameter_sets": schemas
    })
    logger.debug("data=%s" % data)
    logger.debug("post to %s" % url)
    r = requests.post(url, data=data, headers=headers,
                      auth=HTTPBasicAuth(tardis_user, tardis_pass))
    # FIXME: need to check for status_code and handle failures.
    logger.debug("r.json=%s" % r.json)
    logger.debug("r.text=%s" % r.text)
    logger.debug("r.headers=%s" % r.headers)
    header_location = r.headers['location']
    new_dataset_uri = header_location[len(tardis_host_url):]

    # move files across
    source_files = storage.list_all_files(source_url)
    logger.debug("source_files=%s" % source_files)
    url = "%s/api/v1/dataset_file/" % tardis_host_url
    headers = {'Accept': 'application/json'}
    args = source_url.split('?')[1]
    logger.debug('args=%s' % args)
    '''
    psd_url = smartconnectorscheduler.get_url_with_credentials(output_storage_credentials,
        'ssh://unix@' + os.path.join(self.output_dir, node_output_dir, "PSD_output", "psd.dat"),
        is_relative_path=False)
    logger.debug('psd_url=%s' % psd_url)
    psd = hrmcstages.storage.get_filep(psd_url)
    '''
    for file_location in source_files:
        logger.debug('file_location=%s' % os.path.join(source_location, file_location))
        source_file_url = "%s://%s?%s" % (source_scheme,
            os.path.join(source_location, file_location), args)
        logger.debug('source_file_url=%s' % source_file_url)
        source_file, source_file_ref = storage.get_filep(source_file_url, sftp_reference=True)
        logger.debug('source_file=%s' % source_file._name)
        #file_path = os.path.join(root_path, file_location)
        #logger.debug("content=%s" % open(file_path, 'rb').read())
        new_datafile_paramset = []
        logger.debug("datafile_paramset=%s" % datafile_paramset)
        for paramset in datafile_paramset:
            new_paramset = {}
            logger.debug("paramset=%s" % paramset)
            new_paramset['schema'] = paramset['schema']
            has_value = False
            has_keys = False
            new_param_vals = []
            for param in paramset['parameters']:
                new_param = {}
                for param_key, v in param.items():
                    if param_key == 'name' and v == "value_dict":
                        new_param['name'] = 'value_dict'
                        new_value = {}
                        found_func_match = False
                        for fname, func in dfile_extract_func.items():
                            logger.debug("fname=%s,func=%s" % (fname, func))
                            if fname == os.path.basename(file_location):
                                source_file.seek(0)
                                new_value.update(func(source_file))
                                found_func_match = True
                                # FIXME: can multiple funcs match?
                                logger.debug("new_value=%s" % new_value)
                        if found_func_match:
                            new_param['string_value'] = json.dumps(new_value)
                        else:
                            new_param['string_value'] = param['string_value']
                        break
                    else:
                        # in case string_value is processed first
                        new_param[param_key] = v
                if new_param['name'] == "value_dict" and len(json.loads(new_param['string_value'])):
                    has_value = True
                if new_param['name'] == "value_keys" and len(json.loads(new_param['string_value'])):
                    has_keys = True
                new_param_vals.append(new_param)
            new_paramset['parameters'] = new_param_vals
            logger.debug("has_value=%s" % has_value)
            logger.debug("has_keys=%s" % has_keys)
            if has_value or has_keys:
                new_datafile_paramset.append(new_paramset)
            else:
                logger.debug("not adding %s" % new_paramset)
        logger.debug("new_datafile_paramset=%s" % new_datafile_paramset)
        logger.debug("file_name=%s" % source_file._name)
        file_size = source_file_ref.size(source_file._name)
        logger.debug("file_size=%s" % file_size)
        if file_size > 0:
            source_file.seek(0)
            data = json.dumps({
                'dataset': str(new_dataset_uri),
                "parameter_sets": new_datafile_paramset,
                'filename': os.path.basename(file_location),
                'size': file_size,
                'mimetype': 'text/plain',
                'md5sum': hashlib.md5(source_file.read()).hexdigest()
            })
            logger.debug("data=%s" % data)
            source_file.seek(0)
            r = requests.post(url, data={'json_data': data}, headers=headers,
                              files={'attached_file': source_file},
                              auth=HTTPBasicAuth(tardis_user, tardis_pass))
            # FIXME: need to check for status_code and handle failures.
            logger.debug("r.js=%s" % r.json)
            logger.debug("r.te=%s" % r.text)
            logger.debug("r.he=%s" % r.headers)
        else:
            logger.warn("not transferring empty file %s" % file_location)
            #TODO: check whether mytardis api can accept zero length files
    return new_exp_id