def get_urls(file_functional_id): """returns a prioritized list of [url,protocol] where each url can supply the specified file""" try: result = sdquicksearch.run(parameter=[ 'limit=4', 'fields=%s' % url_fields, 'type=File', 'instance_id=%s' % file_functional_id ], post_pipeline_mode=None) except Exception as e: sdlog.debug("SDNEXTUR-015", "exception %s. instance_id=%s" % (e, file_functional_id)) raise e li = result.get_files() sdlog.info( "SDNEXTUR-016", "sdquicksearch returned %s sets of file urls: %s" % (len(li), li)) if li == []: # No urls found. Try again, but wildcard the file id. (That leads to a string search on all # fields for the wildcarded file id, rather than a match of the instance_id field only.) result = sdquicksearch.run(parameter=[ 'limit=4', 'fields=%s' % url_fields, 'type=File', 'instance_id=%s' % file_functional_id + '*' ], post_pipeline_mode=None) li = result.get_files() sdlog.info( "SDNEXTUR-017", "sdquicksearch 2nd call %s sets of file urls: %s" % (len(li), li)) # result looks like # [ {protocol11:url11, protocol12:url12, attached_parameters:dict, score:number, type:'File', # size:number} }, {[another dict of the same format}, {another dict},... ] # with no more than limit=4 items in the list, and no more than three protocols. # We'll return something like urlps = [ [url1,protocol1], [url2,protocol2],... ] # The return value could be an empty list. # Note: These nested lists are ugly; it's just a quick way to code something up. urlps = [] for dic in li: urlps += [[dic[key], key] for key in dic.keys() if key.find('url_') >= 0 and dic[key].find('//None') < 0] # ... protocol keys are one of 'url_opendap', 'url_http', 'url_gridftp' # The search for //None bypasses an issue with the SOLR lookup where there is no # url_gridftp possibility. return prioritize_urlps(urlps)
def use_file_timestamp_if_dataset_timestamp_is_missing(d):
    if 'timestamp' not in d:
        # The timestamp doesn't exist in ESGF for this dataset.
        #
        # Hack: as the dataset's timestamp is missing in ESGF, use the timestamp of one of the
        # dataset's files instead. The choice is effectively random (files within one dataset
        # do not always share the same timestamp, and we take the first one returned).
        #
        # Note: we do not filter replicas in the query below, in case the master host is not up.
        result = sdquicksearch.run(parameter=['limit=1',
                                              'fields=instance_id,timestamp,type',
                                              'type=File',
                                              'dataset_id=%s' % d['instance_id']],
                                   post_pipeline_mode=None)
        if len(result.files) > 0:
            file_ = result.files[0]
            if 'timestamp' in file_:
                d['timestamp'] = file_['timestamp']
                sdlog.info("SDTIMEST-001",
                           "Dataset timestamp set from one of the dataset's file timestamps "
                           "(dataset_functional_id=%s,file_functional_id=%s)" % (d['instance_id'], file_['instance_id']))
            else:
                raise SDException("SDTIMEST-008",
                                  "Timestamp missing in both dataset and dataset's file(s) (%s)" % d['instance_id'])
        else:
            raise SDException("SDTIMEST-011",
                              "Dataset exists in ESGF, but is empty (%s)" % d['instance_id'])
def print_remote_sample(project):
    """Print one random file of the given project."""
    result = sdquicksearch.run(parameter=['project=%s' % project, "limit=1"])
    for f in result.files:
        print "%s" % (f['local_path'])
        print "%s|%s" % (f['file_functional_id'], f['data_node'])
        print
def get_datasets(stream=None, parameter=None, post_pipeline_mode='dataset', dry_run=False):
    # TODO: maybe remove the 'parameter' argument everywhere, as get_selection_file_buffer is
    # messy because of default/forced parameters (i.e. len(parameter) is non-zero even if no
    # parameter args are set on the CLI!)

    if parameter is None:
        parameter = []

    assert (stream is None) or (len(parameter) < 1)  # prevent using 'stream' and 'parameter' together
    assert post_pipeline_mode != 'file'

    if len(parameter) > 0:
        sddeferredbefore.add_forced_parameter(parameter, 'type', 'Dataset')
    elif stream is not None:
        sddeferredbefore.add_forced_parameter(stream, 'type', 'Dataset')

    result = sdquicksearch.run(stream=stream,
                               parameter=parameter,
                               post_pipeline_mode=post_pipeline_mode,
                               dry_run=dry_run)

    return result.get_files()
def print_remote_sample(project):
    """Print one random file of the given project."""
    result = sdquicksearch.run(parameter=['project=%s' % project, "limit=1"])
    for f in result.get_files():
        print "%s" % (f['local_path'])
        print "%s|%s" % (f['file_functional_id'], f['data_node'])
        print
def test_index_hosts():
    print "ESGF indexes benchmark"
    print "======================"
    print ""

    ProgressThread.start(running_message='Building test query.. ', spinner_type=0, sleep=0.2, end_message=None)

    #parameter=get_test_query()
    parameter = get_random_test_query()
    parameter.append("limit=0")
    test_queries = sdpipeline.build_queries(parameter=parameter, index_host='<index_host>', load_default=False)

    ProgressThread.stop()

    test_query = test_queries[0]

    print "Test query"
    print "----------"
    print "%s" % test_query['url']
    print ""

    ProgressThread.start(running_message='Test running.. ', spinner_type=0, sleep=0.2, end_message=None)

    li = []
    for index_host in sdindex.index_host_list:
        result = sdquicksearch.run(index_host=index_host, parameter=parameter)
        li.append([index_host,
                   result.num_found,
                   result.call_duration if result.call_duration >= 1 else 0.1])

    ProgressThread.stop()

    print "Result"
    print "------"
    li = sorted(li, key=lambda record: record[2])
    print tabulate(li, headers=['Index host', 'File count', 'Call duration (seconds)'], tablefmt="plain")
def get_urls(file_functional_id):
    result = sdquicksearch.run(parameter=['limit=1',
                                          'fields=%s' % url_fields,
                                          'type=File',
                                          'instance_id=%s' % file_functional_id],
                               post_pipeline_mode=None)
    li = result.get_files()

    if len(li) > 0:
        file_ = li[0]

        # remove non-url attributes
        try:
            del file_['attached_parameters']
        except KeyError:
            pass

        urls = file_
    else:
        sdlog.info("SDNEXTUR-090", "File not found (file_functional_id=%s)" % (file_functional_id,))
        raise sdexception.FileNotFoundException()

    return urls
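# Hypothetical usage sketch for the single-result get_urls() variant above. The caller name is
# made up, and iterating over all remaining keys as urls assumes that, once
# 'attached_parameters' is removed, the dict only holds the url_* fields requested via
# url_fields (an assumption, not guaranteed by the snippet above).
def example_print_urls(file_functional_id):
    try:
        urls = get_urls(file_functional_id)
    except sdexception.FileNotFoundException:
        print "No such file: %s" % file_functional_id
        return
    for protocol, url in urls.items():
        print "%s => %s" % (protocol, url)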
def get_sample_files(project, limit):
    """Retrieve a list of sample files for the given project."""
    result = sdquicksearch.run(parameter=['type=File', 'project=%s' % project, "limit=%s" % limit],
                               post_pipeline_mode='file')
    return result
def get_files(stream=None, parameter=None, post_pipeline_mode='file', dry_run=False):
    # TODO: maybe remove the 'parameter' argument everywhere, as get_selection_file_buffer is
    # messy because of default/forced parameters (i.e. len(parameter) is non-zero even if no
    # parameter args are set on the CLI!)

    if parameter is None:  # avoid a mutable default argument
        parameter = []

    assert (stream is None) or (len(parameter) < 1)  # prevent using 'stream' and 'parameter' together

    if len(parameter) > 0:
        sddeferredbefore.add_forced_parameter(parameter, 'type', 'File')
    elif stream is not None:
        sddeferredbefore.add_forced_parameter(stream, 'type', 'File')

    result = sdquicksearch.run(stream=stream,
                               parameter=parameter,
                               post_pipeline_mode=post_pipeline_mode,
                               dry_run=dry_run)

    return result.files
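# Hedged usage sketch for get_files(): the facet values below are illustrative only, and the
# presence of a 'file_functional_id' key in each file dict is inferred from the other snippets
# in this file rather than guaranteed by the API.
def example_list_cmip5_files():
    files = get_files(parameter=['project=CMIP5', 'limit=10'])  # 'type=File' is forced internally
    for f in files:
        print f['file_functional_id']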
def do_add(self, file):
    self.parameter = ["limit=1", "instance_id=%s" % file]
    self.complete_parameter()

    result = sdquicksearch.run(parameter=self.parameter)

    if len(result.files) == 1:
        f = result.files[0]
        if f['status'] == sdconst.TRANSFER_STATUS_NEW:
            sdenqueue.run(result.files)
            print "File successfully enqueued"
        else:
            print "File already enqueued"
    elif len(result.files) == 0:
        print "File not found"
def fill_missing_dataset_timestamp(dataset_without_timestamp):
    """Set the dataset timestamp.

    Notes
        - This func does NOT commit.
        - In ESGF, the timestamp differs from replica to replica, and as there is no dataset
          replica concept in 'sdt', this is really a hack: the timestamp is set somewhat
          randomly (the dataset's timestamp in user A's Synda installation may differ from the
          one in user B's installation, because it may have been retrieved from replica X for
          user A and from replica Y for user B, and the two replicas' timestamps may differ).
          In the end, we hope that this randomness is on a much smaller scale than the
          version-to-version time interval, so that we can still detect which version is the
          latest. And yes: all this mess is because the version exists in different formats
          ('v1', 'v20140318'..).
    """
    # Retrieve timestamps from ESGF.
    # Note: we do not filter replicas in the query below, in case the master host is not up.
    result = sdquicksearch.run(parameter=['limit=1',
                                          'fields=%s' % timestamp_fields,
                                          'type=Dataset',
                                          'instance_id=%s' % dataset_without_timestamp.dataset_functional_id],
                               post_pipeline_mode=None)
    li = result.get_files()

    # check if the dataset has been found in ESGF
    if len(li) > 0:
        d = li[0]
    else:
        raise SDException("SDTIMEST-800",
                          "%s dataset does not exist in ESGF (or the index used does not list it)" % dataset_without_timestamp.dataset_functional_id)

    # Use a file's timestamp if the dataset's timestamp is not set in ESGF
    # (this is needed because some datasets in ESGF have NO timestamp).
    use_file_timestamp_if_dataset_timestamp_is_missing(d)

    # update the timestamp in the DB
    dataset = sddatasetdao.get_dataset(dataset_functional_id=d['instance_id'])
    dataset.timestamp = d['timestamp']
    sddatasetdao.update_dataset(dataset, commit=False, keys=['timestamp'])
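# Since fill_missing_dataset_timestamp() does not commit (see its docstring), a caller is
# expected to commit once after processing a batch. A minimal sketch, assuming the database
# connection is exposed as sddb.conn (an assumption about the surrounding codebase, not
# something shown in this file):
def example_fix_timestamps(datasets_without_timestamp):
    for dataset in datasets_without_timestamp:
        fill_missing_dataset_timestamp(dataset)
    sddb.conn.commit()  # commit once for the whole batch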
def get_data_nodes(instance_id, replica_scalar):
    """Return one or more data nodes depending on the 'replica' flag."""
    parameter = ['limit=50',
                 'type=Dataset',
                 'instance_id=%s' % instance_id,
                 'replica=%s' % replica_scalar]

    # debug
    #print parameter

    result = sdquicksearch.run(parameter=parameter, post_pipeline_mode=None, dry_run=False)

    if result.num_result > 0:
        datanodes = []
        for d in result.files:
            datanodes.append(d['data_node'])
        return datanodes
    else:
        return []
def get_urls(file_functional_id): """returns a prioritized list of [url,protocol] where each url can supply the specified file""" result=sdquicksearch.run(parameter=['limit=4','fields=%s'%url_fields,'type=File','instance_id=%s'%file_functional_id],post_pipeline_mode=None) li=result.get_files() sdlog.info("JFPNEXTUR-05","sdquicksearch returned %s sets of file urls: %s"%(len(li),li)) # result looks like # [ {protocol11:url11, protocol12:url12, attached_parameters:dict, score:number, type:'File', # size:number} }, {[another dict of the same format}, {another dict},... ] # with no more than limit=4 items in the list, and no more than three protocols. # We'll return something like urlps = [ [url1,protocol1], [url2,protocol2],... ] # The return value could be an empty list. # Note: These nested lists are ugly; it's just a quick way to code something up. urlps = [] for dic in li: urlps += [ [dic[key],key] for key in dic.keys() if key.find('url_')>=0 ] # ... protocol keys are one of 'url_opendap', 'url_http', 'url_gridftp' return prioritize_urlps( urlps )
def get_data_nodes(instance_id, replica):
    """Return one or more data nodes depending on the 'replica' flag."""
    parameter = ['limit=50', 'type=Dataset', 'instance_id=%s' % instance_id]
    if replica is not None:
        parameter.append('replica=%s' % replica)

    # debug
    #print parameter

    result = sdquicksearch.run(parameter=parameter, post_pipeline_mode=None, dry_run=False)

    if result.count() > 0:
        datanodes = []
        for d in result.get_files():
            datanodes.append(d['data_node'])
        return datanodes
    else:
        return []
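# Hedged usage sketch for the get_data_nodes() variant above: the dataset instance_id below is
# a made-up CMIP5-style example. With replica=None no replica facet is sent, so the result may
# mix master and replica copies, i.e. several data nodes for the same dataset version.
def example_show_data_nodes():
    for dn in get_data_nodes('cmip5.output1.MOHC.HadGEM2-ES.rcp85.day.atmos.day.r1i1p1.v20111128', None):
        print dn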
def use_file_timestamp_if_dataset_timestamp_is_missing(d):
    if 'timestamp' not in d:
        # The timestamp doesn't exist in ESGF for this dataset.
        #
        # Hack: as the dataset's timestamp is missing in ESGF, use the timestamp of one of the
        # dataset's files instead. The choice is effectively random (files within one dataset
        # do not always share the same timestamp, and we take the first one returned).
        #
        # Note: we do not filter replicas in the query below, in case the master host is not up.
        result = sdquicksearch.run(parameter=['limit=1',
                                              'fields=%s' % timestamp_fields,
                                              'type=File',
                                              'dataset_id=%s' % d['instance_id']],
                                   post_pipeline_mode=None)
        li = result.get_files()
        if len(li) > 0:
            file_ = li[0]
            if 'timestamp' in file_:
                d['timestamp'] = file_['timestamp']
                sdlog.info("SDTIMEST-001",
                           "Dataset timestamp set from one of the dataset's file timestamps "
                           "(dataset_functional_id=%s,file_functional_id=%s)" % (d['instance_id'], file_['instance_id']))
            else:
                raise SDException("SDTIMEST-008",
                                  "Timestamp missing in both dataset and dataset's file(s) (%s)" % d['instance_id'])
        else:
            raise SDException("SDTIMEST-011",
                              "Dataset exists in ESGF, but is empty (%s)" % d['instance_id'])
def do_select(self, arg):
    # -- SQL mode (undocumented, as it's not clear whether this feature is here to stay) -- #
    import sdsessionparam, sdquicksearch

    # 'arg' is expected to look like: <column> from <table> where <facet>=<value> ...
    tokens = arg.split()
    column = tokens[0]
    table = tokens[2]
    where_clause = tokens[4:]

    parameter = where_clause
    dry_run = sdsessionparam.get_value('dry_run')
    parameter.append('type=%s' % table.title())

    result = sdquicksearch.run(parameter=parameter, dry_run=dry_run)

    for f in result.files:
        if column in f:
            print f[column]
def get_sample_datasets(project, limit):
    """Retrieve a list of sample datasets for the given project."""
    result = sdquicksearch.run(parameter=['type=Dataset', 'project=%s' % project, "limit=%s" % limit],
                               post_pipeline_mode='dataset')
    return result
def get_sample_files(project, limit):
    """Retrieve a list of sample files for the given project."""
    result = sdquicksearch.run(parameter=['type=File', 'project=%s' % project, "limit=%s" % limit],
                               post_pipeline_mode=None)
    return result