def search(args):
    import sdearlystreamutils
    import sdstream

    if args.replica:
        import sddeferredafter
        sdstream.set_scalar(args.stream, 'keep_replica', 'true')
        sddeferredafter.add_forced_parameter(args.stream, 'nearest', 'false')

    # timestamp filters
    if args.timestamp_left_boundary is not None:
        sdstream.set_scalar(args.stream, 'from', args.timestamp_left_boundary)
    if args.timestamp_right_boundary is not None:
        sdstream.set_scalar(args.stream, 'to', args.timestamp_right_boundary)

    if args.type_ == sdconst.SA_TYPE_FILE:
        file_search(args)
    elif args.type_ == sdconst.SA_TYPE_AGGREGATION:
        move_to_dataset_printing_routine = sdearlystreamutils.is_one_variable_per_dataset_project(args.stream)  # HACK

        if move_to_dataset_printing_routine:
            # one variable exists per dataset for this project
            dataset_search(args)
        else:
            # many variables exist per dataset for this project
            variable_search(args)
    elif args.type_ == sdconst.SA_TYPE_DATASET:
        dataset_search(args)
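# Hedged usage sketch (not part of the original module): search() reads an
# argparse-style namespace. The attribute names below are exactly the ones the
# function above (and the search routines it dispatches to) consume; every
# value shown is a hypothetical example.
def _example_search_invocation(stream):
    import argparse
    args = argparse.Namespace(
        stream=stream,                    # query stream built elsewhere (e.g. by the selection parser)
        type_=sdconst.SA_TYPE_DATASET,    # or SA_TYPE_FILE / SA_TYPE_AGGREGATION
        replica=False,
        timestamp_left_boundary=None,     # e.g. '2015-01-01T00:00:00Z'
        timestamp_right_boundary=None,
        limit=20,                         # read by dataset_search()/variable_search()
        dry_run=False,
    )
    search(args)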
def build_queries(stream=None, selection=None, path=None, parameter=None,
                  index_host=None, load_default=None, query_type='remote',
                  dry_run=False, parallel=True, count=False):
    """This pipeline adds the 'path', 'parameter' and 'selection' input types
    to the standalone query pipeline.

    Returns:
        squeries (serialized queries)

    TODO: maybe rename 'stream' to 'dqueries'
    """
    if parameter is None:
        parameter = []

    if stream is None:
        if selection is None:
            buffer = sdbuffer.get_selection_file_buffer(path=path, parameter=parameter)
            selection = sdparse.build(buffer, load_default=load_default)

        stream = selection.merge_facets()

    # at this point, stream contains all possible parameter sources (file, stdin, cli..)

    if count:
        # in this mode, we don't want to return any files, so we force 'limit'
        # to 0 just in case this option has been set by the user
        sddeferredafter.add_forced_parameter(stream, 'limit', '0')

    queries = sdquerypipeline.run(stream,
                                  index_host=index_host,
                                  query_type=query_type,
                                  dry_run=dry_run,
                                  parallel=parallel)
    return queries
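# Hedged usage sketch (assumption: 'parameter' accepts 'facet=value' strings,
# as suggested by the selection-file buffer construction above; the facet
# values themselves are hypothetical examples).
def _example_build_queries():
    queries = build_queries(parameter=['project=CMIP5', 'variable=tas'],
                            query_type='remote',
                            dry_run=True)
    return queries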
def variable_search(args):
    import sddeferredafter, sdrdataset, sdrvariable

    # note: in variable mode, the total number of rows is given by
    # "total += number of variables for each dataset"
    sddeferredafter.add_default_parameter(args.stream, 'limit', 15)
    sddeferredafter.add_forced_parameter(args.stream, 'fields', variable_light_fields)

    datasets = sdrdataset.get_datasets(stream=args.stream, dry_run=args.dry_run)
    if len(datasets) == 0:
        print "Variable not found"
    else:
        sdrvariable.print_list(datasets)
def variable_search(args):
    import sddeferredafter, sdrdataset, sdrvariable, sdfields, sdearlystreamutils

    sddeferredafter.add_default_parameter(args.stream, 'limit', args.limit)  # TAGJ43JK3J43

    # lpcme means 'Local Path Custom Mode Enabled'
    lpcme = sdearlystreamutils.test_facet_value_early(args.stream, 'local_path_format', 'custom')

    fields_ = sdfields.get_all_variable_fields() if lpcme else sdfields.get_variable_light_fields()
    sddeferredafter.add_forced_parameter(args.stream, 'fields', fields_)

    datasets = sdrdataset.get_datasets(stream=args.stream, dry_run=args.dry_run)
    if len(datasets) == 0:
        print "Variable not found"
    else:
        sdrvariable.print_list(datasets, args.limit)  # TAGJ43JK3J43
def dataset_search(args):
    import sddeferredafter, sdrdataset, sdstream

    sddeferredafter.add_default_parameter(args.stream, 'limit', 100)  # add default limit
    sddeferredafter.add_forced_parameter(args.stream, 'fields', dataset_light_fields)

    datasets = sdrdataset.get_datasets(stream=args.stream, dry_run=args.dry_run)

    if not args.dry_run:
        if len(datasets) == 0:
            print_stderr('Dataset not found')
        else:
            if args.replica:
                sdrdataset.print_replica_list(datasets)
            else:
                sdrdataset.print_list(datasets)
def dataset_search(args):
    import sddeferredafter, sdrdataset, sdstream, sdfields, sdearlystreamutils

    sddeferredafter.add_default_parameter(args.stream, 'limit', args.limit)

    # lpcme means 'Local Path Custom Mode Enabled'
    lpcme = sdearlystreamutils.test_facet_value_early(args.stream, 'local_path_format', 'custom')

    fields_ = sdfields.get_all_dataset_fields() if lpcme else sdfields.get_dataset_light_fields()
    sddeferredafter.add_forced_parameter(args.stream, 'fields', fields_)

    datasets = sdrdataset.get_datasets(stream=args.stream, dry_run=args.dry_run)

    if not args.dry_run:
        if len(datasets) == 0:
            print_stderr('Dataset not found')
        else:
            if args.replica:
                sdrdataset.print_replica_list(datasets)
            else:
                sdrdataset.print_list(datasets)
def dump_ESGF(parameter=None, selection_file=None, fields=None, dry_run=False,
              playback=None, record=None, no_default=True, type_='Dataset'):
    """Dump fields for all matching ESGF files/datasets.

    Initially designed to batch-update attributes in the Synda database
    (e.g. when a new attribute is to be stored in Synda, the metadata of
    all already-downloaded files must be updated).
    """
    stream = sdstreamutils.get_stream(parameter=parameter,
                                      selection_file=selection_file,
                                      no_default=no_default)

    sddeferredafter.add_forced_parameter(stream, 'replica', 'false')
    sddeferredafter.add_forced_parameter(stream, 'type', type_)

    assert fields is not None
    sddeferredafter.add_forced_parameter(stream, 'fields', fields)

    metadata = sdsearch.run(stream=stream,
                            post_pipeline_mode=None,
                            dry_run=dry_run,
                            playback=playback,
                            record=record)

    return metadata.get_files()
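# Hedged usage sketch: the facet strings and the comma-separated 'fields'
# value below are hypothetical examples; only the keyword names come from the
# signature above ('fields' is mandatory, as the assert shows).
def _example_dump_ESGF():
    files = dump_ESGF(parameter=['project=CMIP5', 'model=CNRM-CM5'],
                      fields='instance_id,size',
                      type_='Dataset',
                      dry_run=False)
    return files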
def search(args):
    if args.replica:
        import sdstream, sddeferredafter
        sdstream.set_scalar(args.stream, 'keep_replica', 'true')
        sddeferredafter.add_forced_parameter(args.stream, 'nearest', 'false')

    if args.type_ == sdconst.SA_TYPE_FILE:
        file_search(args)
    elif args.type_ == sdconst.SA_TYPE_AGGREGATION:
        move_to_dataset_printing_routine = syndautils.is_one_variable_per_dataset_project(args)  # HACK

        if move_to_dataset_printing_routine:
            # one variable exists per dataset for this project
            dataset_search(args)
        else:
            # many variables exist per dataset for this project
            variable_search(args)
    elif args.type_ == sdconst.SA_TYPE_DATASET:
        dataset_search(args)
def get(args):
    import sdlogon, sdrfile, sddeferredafter, sddirectdownload, syndautils, humanize, sdconfig, os, sdconst, sdearlystreamutils

    # hack - see TAG43534FSFS
    if args.quiet:
        args.verbosity = 0

    if args.verify_checksum and args.network_bandwidth_test:
        print_stderr("'verify_checksum' option cannot be set when 'network_bandwidth_test' option is set.")
        return 1

    stream = syndautils.get_stream(subcommand=args.subcommand,
                                   parameter=args.parameter,
                                   selection_file=args.selection_file)

    if args.openid and args.password:
        # use credentials from CLI
        oid = args.openid
        pwd = args.password
    else:
        # use credentials from file
        if sdconfig.is_openid_set():
            oid = sdconfig.openid
            pwd = sdconfig.password
        else:
            print_stderr('Error: OpenID not set in configuration file (%s).' % sdconfig.credential_file)
            return 1

    # retrieve certificate
    sdlogon.renew_certificate(oid, pwd, force_renew_certificate=False)

    http_client = sdconst.HTTP_CLIENT_URLLIB if args.urllib2 else sdconst.HTTP_CLIENT_WGET

    # local_path
    #
    # The 'synda get' subcommand currently forces local_path to the construct
    # '<dest_folder>/<filename>' (i.e. you can't use a DRS tree in-between).
    # This may change in the future.
    #
    if args.dest_folder is None:
        local_path_prefix = os.getcwd()  # current working directory
    else:
        local_path_prefix = args.dest_folder

    # BEWARE
    #
    # When set as a CLI parameter, url is usually an ESGF facet, and as such
    # should be sent to the search-api like other facets.
    # BUT we want a special behaviour here (i.e. with the 'synda get' command)
    # for url: if url is set by the user, we DON'T call the search-api
    # operator. Instead, we download the url directly.

    urls = sdearlystreamutils.get_facet_values_early(stream, 'url')
    if len(urls) == 0:
        # no url in stream: switch to search-api operator mode

        sddeferredafter.add_default_parameter(stream, 'limit', 5)
        sddeferredafter.add_forced_parameter(stream, 'local_path_format', 'notree')

        # yes: this is the second time we run the sdinference filter, but it
        # doesn't hurt, as sdinference is idempotent
        files = sdrfile.get_files(stream=stream, post_pipeline_mode='file', dry_run=args.dry_run)

        if not args.dry_run:
            if len(files) > 0:
                # compute metric
                total_size = sum(int(f['size']) for f in files)
                total_size = humanize.naturalsize(total_size, gnu=False)

                print_stderr('%i file(s) will be downloaded for a total size of %s.' % (len(files), total_size))

                status = sddirectdownload.run(files,
                                              args.timeout,
                                              args.force,
                                              http_client,
                                              local_path_prefix,
                                              verify_checksum=args.verify_checksum,
                                              network_bandwidth_test=args.network_bandwidth_test,
                                              debug=True,
                                              verbosity=args.verbosity,
                                              buffered=False,
                                              hpss=args.hpss)

                if status != 0:
                    return 1
            else:
                print_stderr("File not found")
                return 1
        else:
            for f in files:
                size = humanize.naturalsize(f['size'], gnu=False)
                print '%-12s %s' % (size, f['filename'])
    elif len(urls) > 0:
        # url(s) found in stream: the search-api operator is not needed
        # (download the url directly)

        # TAGDSFDF432F
        if args.verify_checksum:
            print_stderr("To perform checksum verification, an ESGF file identifier (e.g. title, id, tracking id..) must be used instead of the file url.")
            return 1

        # TODO: to improve genericity, maybe merge this block into the previous
        # one (i.e. url CAN be used as a search key in the search-api (but not
        # an irods url))

        files = []
        for url in urls:
            filename = os.path.basename(url)
            local_path = filename
            f = dict(local_path=local_path, url=url)
            files.append(f)

        status = sddirectdownload.run(files,
                                      args.timeout,
                                      args.force,
                                      http_client,
                                      local_path_prefix,
                                      verify_checksum=args.verify_checksum,  # see above at TAGDSFDF432F
                                      network_bandwidth_test=args.network_bandwidth_test,
                                      debug=True,
                                      verbosity=args.verbosity,
                                      buffered=False,
                                      hpss=args.hpss)

        if status != 0:
            return 1
    else:
        assert False

    return 0
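# Hedged usage sketch (not part of the original module): the argparse-style
# attributes below are exactly the ones get() reads; every value shown is a
# hypothetical example, not a real default.
def _example_get_invocation():
    import argparse
    args = argparse.Namespace(
        subcommand='get',
        parameter=['project=CMIP5', 'variable=tas'],  # hypothetical facets
        selection_file=None,
        quiet=False,
        verbosity=1,
        verify_checksum=False,
        network_bandwidth_test=False,
        openid=None,            # fall back to credentials from the config file
        password=None,
        urllib2=False,          # i.e. use the wget HTTP client
        dest_folder=None,       # i.e. download into the current working directory
        dry_run=True,
        timeout=30,
        force=False,
        hpss=False,
    )
    return get(args)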
        raise SDException("SDQSEARC-002", "Number of returned files reached the maximum limit")

    result.add_attached_parameters(query.get('attached_parameters', {}))

    return result

if __name__ == '__main__':
    prog = os.path.basename(__file__)
    parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
                                     epilog="""examples of use\n%s""" % sdi18n.m0002(prog))
    parser.add_argument('parameter', nargs='+', help=sdi18n.m0001)
    parser.add_argument('-c', '--count', action='store_true', help='Count how many files are found')
    parser.add_argument('-f', '--format', choices=['raw', 'line', 'indent'], default='indent')
    parser.add_argument('-i', '--index_host')
    parser.add_argument('-m', '--post_pipeline_mode', default='file')
    parser.add_argument('-y', '--dry_run', action='store_true')
    parser.add_argument('-1', '--print_only_one_item', action='store_true')
    args = parser.parse_args()

    if args.count:
        # in this mode, we don't want to return any files, so we force 'limit'
        # to 0 just in case this option has been set by the user
        sddeferredafter.add_forced_parameter(args.parameter, 'limit', '0')

    result = run(parameter=args.parameter,
                 index_host=args.index_host,
                 post_pipeline_mode=args.post_pipeline_mode,
                 dry_run=args.dry_run)

    if args.count:
        print "%i" % result.num_found
    else:
        sdprint.print_format(result.files, args.format, args.print_only_one_item)
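# Hedged CLI sketch for the standalone script above. The module filename is
# whatever file contains this __main__ block; 'sdquicksearch.py' is an
# assumption based on the 'SDQSEARC' error code, and the facet values are
# hypothetical examples.
#
#   python sdquicksearch.py project=CMIP5 variable=tas -f indent
#   python sdquicksearch.py project=CMIP5 --count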