def call_web_service(self, request):
    """Run a single search-API HTTP call and return its response.

    The url comes from ``request.get_url()`` and the call is delegated to
    ``sdnetutils.call_web_service()`` with ``sdconst.SEARCH_API_HTTP_TIMEOUT``
    as timeout. If that call raises, ``request.offset`` is set back to 0 and
    the exception is re-raised unchanged.
    """
    sdlog.debug("SYDPROXY-100", "Search-API call started (%s)." % request.get_url())

    try:
        # returns Response object
        response = sdnetutils.call_web_service(
            request.get_url(),
            timeout=sdconst.SEARCH_API_HTTP_TIMEOUT)
    except:
        # Deliberately bare: whatever interrupts the call, a failure here
        # also cancels all the previous calls made inside the current
        # paginated call. The offset is rewound so the paginated call can be
        # restarted from the beginning next time (maybe overkill, as the
        # offset is reinitialized when entering
        # 'call_web_service__PAGINATION()').
        request.offset = 0
        raise
    else:
        completed_msg = "Search-API call completed (returned-files-count=%i,match-count=%i,url=%s)." % (
            response.count(), response.num_found, request.get_url())
        sdlog.info("SYDPROXY-100", completed_msg)
        return response
def _reload(self, force=False):
    """Reload the endpoints dictionary from the XML file when it is stale.

    Nothing is done when ``self.filepath`` is falsy. Otherwise the file is
    parsed again when ``force`` is true or when its modification time is
    greater than ``self.modtime``. ``self.endpoints`` is only replaced once
    the whole file has been read.
    """
    if not self.filepath:
        return

    modtime = file_modification_datetime(self.filepath)
    if not (force or modtime > self.modtime):
        return

    sdlog.debug("SDDMGLOB-014", "Loading endpoints from: %s, last modified: %s" % (self.filepath, modtime))
    self.modtime = modtime

    # read the XML file as one string, newlines removed
    with open(self.filepath, "r") as xml_file:
        content = xml_file.read().replace("\n", "")

    # root element looks like: <endpoints xmlns="http://www.esgf.org/whitelist">
    root = fromstring(content)

    # children look like:
    # <endpoint name="esg#jpl" gridftp="esg-datanode.jpl.nasa.gov:2811" />
    loaded = {}
    for node in root.findall("{%s}endpoint" % "http://www.esgf.org/whitelist"):
        attributes = node.attrib
        gridftp = attributes["gridftp"]
        name = attributes["name"]
        path_out = attributes.get("path_out", None)
        path_in = attributes.get("path_in", None)
        loaded[gridftp] = Endpoint(name, path_out=path_out, path_in=path_in)
        sdlog.debug("SDDMGLOB-018", "Using Globus endpoint %s : %s (%s --> %s)" % (gridftp, name, path_out, path_in))

    # switch the dictionary of endpoints after reading
    self.endpoints = loaded
def run(o, attached_parameters):
    """Add `attached_parameters` to the files of a query result.

    Note
        The idea is to keep some parameters around by making them jump over
        the search call (e.g. Search-api call, SQL call..), from 'query
        pipeline' to 'file pipeline'.

    `attached_parameters` must be a dict. `o` is expected to be either a
    ``sdtypes.Metadata`` or a ``sdtypes.Response``; the (updated) object is
    returned.
    """
    assert isinstance(attached_parameters, dict)

    sdlog.debug("SYDADDAP-620", "Add attached_parameters..")

    if isinstance(o, sdtypes.Metadata):
        # Metadata goes through the pipeline machinery
        processing = sdpipelineprocessing.ProcessingObject(add_attached_parameters, attached_parameters)
        o = sdpipelineprocessing.run_pipeline(o, processing)
    elif isinstance(o, sdtypes.Response):
        # no need to process chunk by chunk here as Response size only
        # contains a small amount of data (< sdconst.SEARCH_API_CHUNKSIZE)
        o.set_files(add_attached_parameters(o.get_files(), attached_parameters))
    else:
        assert False

    sdlog.debug("SYDADDAP-628", "attached_parameters added")

    return o
def main_loop():
    """Run the task scheduler event loop and log any SDException it raises.

    The exception is logged, not re-raised. How much is logged depends on
    the 'verbosity_level' option of the 'log' configuration section.
    """
    # these imports must be here because of double-fork
    import sdapp
    import sdlog
    import sdtaskscheduler
    import sddb  # this is to create database objects if not done already

    sdlog.info('SDDAEMON-001', "Daemon starting ...")

    try:
        sdtaskscheduler.event_loop()
    except SDException as e:
        debug_mode = sdconfig.config.get('log', 'verbosity_level') == 'debug'

        if debug_mode:
            # We log everything in debug mode no matter the exception type
            sdlog.debug('SDDAEMON-008', "Exception occured (%s)" % str(e))
        elif isinstance(e, SDException):
            # Only the exception code is printed here, as the errmsg is
            # likely to be there already (i.e. low-level func should have
            # logged information about this exception).
            # The primary reason for this is to have a clear log entry when
            # authentication failed (e.g. ESGF is down or openid is incorrect)
            sdlog.info('SDDAEMON-010', "Exception occured (%s)" % str(e.code))
        else:
            # This case should not occur, so we log everything to help debugging
            sdlog.info('SDDAEMON-012', "Exception occured (%s)" % str(e))
def main_loop():
    """Run the task scheduler event loop and log any SDException it raises.

    The exception is logged, not re-raised. How much is logged depends on
    the 'verbosity_level' option of the 'log' configuration section.
    """
    # these imports must be here because of double-fork
    import sdapp
    import sdlog
    import sdtaskscheduler
    import sddb  # this is to create database objects if not done already

    sdlog.info('SDDAEMON-001', "Daemon starting ...")

    try:
        sdtaskscheduler.event_loop()
    except SDException as e:
        debug_mode = sdconfig.config.get('log', 'verbosity_level') == 'debug'

        if debug_mode:
            # We log everything in debug mode no matter the exception type
            sdlog.debug('SDDAEMON-008', "Exception occured (%s)" % str(e))
        elif isinstance(e, SDException):
            # Only the exception code is printed here, as the errmsg is
            # likely to be there already (i.e. low-level func should have
            # logged information about this exception).
            # The primary reason for this is to have a clear log entry when
            # authentication failed (e.g. ESGF is down or openid is incorrect)
            sdlog.info('SDDAEMON-010', "Exception occured (%s)" % str(e.code))
        else:
            # This case should not occur, so we log everything to help debugging
            sdlog.info('SDDAEMON-012', "Exception occured (%s)" % str(e))
def map_to_globus(url):
    """Map a download url to a Globus endpoint name and path.

    Returns a ``(src_endpoint, src_path, path)`` tuple.

    - For a 'globus' url, the url path is split at its first "/": what comes
      before is the endpoint, the rest (slash included) is returned as both
      ``src_path`` and ``path``. Without any "/", the whole url path is the
      endpoint and both paths are empty.
    - For any other url, ``path`` is the url path with runs of "/" collapsed.
      When the url host is a key of ``globus_endpoints``, ``src_endpoint`` is
      that endpoint's name and ``src_path`` is ``path`` with the first
      occurrence of the endpoint's ``path_out`` removed and its ``path_in``
      prepended (each step only when the attribute is set). Otherwise
      ``src_endpoint`` is None and ``src_path`` equals ``path``.
    """
    parsed_url = urlparse(url)

    # 'globus' scheme
    if parsed_url.scheme == "globus":
        # partition() keeps the whole endpoint name when there is no "/"
        # (slicing on find() == -1 would chop its last character off)
        src_endpoint, slash, remainder = parsed_url.path.partition("/")
        src_path = slash + remainder
        return src_endpoint, src_path, src_path

    # 'gridftp' scheme
    hostname = parsed_url.netloc
    src_endpoint = None
    src_path = re.sub("/+", "/", parsed_url.path)
    path = src_path
    if hostname in globus_endpoints:
        endpoint = globus_endpoints[hostname]
        src_endpoint = endpoint.name
        path_out = endpoint.path_out
        path_in = endpoint.path_in
        if path_out:
            # str.replace() returns a new string: the result must be kept
            src_path = src_path.replace(path_out, "", 1)
        if path_in:
            # the prefix to add is path_in (path_out may be None here)
            src_path = path_in + src_path
        sdlog.debug("SDDMGLOB-024", "Mapped url %s to %s%s" % (url, src_endpoint, src_path))
    return src_endpoint, src_path, path
def run(o, attached_parameters):
    """Add `attached_parameters` to the files of a query result.

    Note
        The idea is to keep some parameters around by making them jump over
        the search call (e.g. Search-api call, SQL call..), from 'query
        pipeline' to 'file pipeline'.

    `attached_parameters` must be a dict. `o` is expected to be either a
    ``sdtypes.Metadata`` or a ``sdtypes.Response``; the (updated) object is
    returned.
    """
    assert isinstance(attached_parameters, dict)

    sdlog.debug("SYDADDAP-620", "Add attached_parameters..")

    if isinstance(o, sdtypes.Metadata):
        # Metadata goes through the pipeline machinery
        processing = sdpipelineprocessing.ProcessingObject(add_attached_parameters, attached_parameters)
        o = sdpipelineprocessing.run_pipeline(o, processing)
    elif isinstance(o, sdtypes.Response):
        # no need to process chunk by chunk here as Response size only
        # contains a small amount of data (< sdconst.SEARCH_API_CHUNKSIZE)
        o.set_files(add_attached_parameters(o.get_files(), attached_parameters))
    else:
        assert False

    sdlog.debug("SYDADDAP-628", "attached_parameters added")

    return o
def _reload(self, force=False):
    """Reload the endpoints dictionary from the XML file when it is stale.

    Nothing is done when ``self.filepath`` is falsy (i.e. only an existing
    endpoints file is considered). Otherwise the file is parsed again when
    ``force`` is true or when its modification time is greater than
    ``self.modtime``. ``self.endpoints`` is only replaced once the whole
    file has been read.
    """
    if not self.filepath:
        return

    modtime = file_modification_datetime(self.filepath)
    if not (force or modtime > self.modtime):
        return

    sdlog.debug("SDDMGLOB-014", "Loading endpoints from: %s, last modified: %s" % (self.filepath, modtime))
    self.modtime = modtime

    # read the XML file as one string, newlines removed
    with open(self.filepath, "r") as xml_file:
        content = xml_file.read().replace('\n', '')

    # root element looks like: <endpoints xmlns="http://www.esgf.org/whitelist">
    root = fromstring(content)

    # children look like:
    # <endpoint name="esg#jpl" gridftp="esg-datanode.jpl.nasa.gov:2811" />
    loaded = {}
    for node in root.findall("{%s}endpoint" % NS):
        attributes = node.attrib
        gridftp = attributes['gridftp']
        name = attributes['name']                         # mandatory attribute
        path_out = attributes.get('path_out', None)       # optional attribute
        path_in = attributes.get('path_in', None)         # optional attribute
        loaded[gridftp] = Endpoint(name, path_out=path_out, path_in=path_in)
        sdlog.debug("SDDMGLOB-018", "Using Globus endpoint %s : %s (%s --> %s)" % (gridftp, name, path_out, path_in))

    # switch the dictionary of endpoints after reading
    self.endpoints = loaded
def part_cleanup(paths):
    """Remove empty files and folders found under each path of `paths`.

    The 'data' folder (``sdconfig.data_folder``) is re-created at the end if
    the cleanup removed it.
    """
    sdlog.info("SYNCLEAN-018", "Cleanup begin")

    # Reverse order is maybe overkill: the idea is that it may allow the
    # suppression of empty siblings, but as every path goes through an
    # os.removedirs call it should work anyway.
    for path in sorted(paths, reverse=True):
        sdlog.info("SYNCLEAN-060", "Check for empty file and directory in %s" % path)

        # remove empty files
        sdlog.debug("SYNCLEAN-120", "Remove empty files (%s)" % (path, ))
        remove_empty_files(path)

        # remove empty directories starting from leaves
        sdlog.debug("SYNCLEAN-140", "Remove empty dirs (%s)" % (path, ))
        try:
            os.removedirs(path)
        except OSError:
            # Neutralize exception (needed as removedirs raises at the first
            # non empty dir).
            pass

    # as the previous command may also remove 'data' folder (when all data
    # have been removed), we re-create 'data' if missing
    data_folder_missing = not os.path.isdir(sdconfig.data_folder)
    if data_folder_missing:
        os.makedirs(sdconfig.data_folder)

    sdlog.info("SYNCLEAN-020", "Cleanup done.")
def part_cleanup(paths):
    """Remove empty files and folders found under each path of `paths`.

    The 'data' folder (``sdconfig.data_folder``) is re-created at the end if
    the cleanup removed it.
    """
    sdlog.info("SYNCLEAN-018", "Cleanup begin")

    # Reverse order is maybe overkill: the idea is that it may allow the
    # suppression of empty siblings, but as every path goes through an
    # os.removedirs call it should work anyway.
    for path in sorted(paths, reverse=True):
        sdlog.info("SYNCLEAN-060", "Check for empty file and directory in %s" % path)

        # remove empty files
        sdlog.debug("SYNCLEAN-120", "Remove empty files (%s)" % (path,))
        remove_empty_files(path)

        # remove empty directories starting from leaves
        sdlog.debug("SYNCLEAN-140", "Remove empty dirs (%s)" % (path,))
        try:
            os.removedirs(path)
        except OSError:
            # Neutralize exception (needed as removedirs raises at the first
            # non empty dir).
            pass

    # as the previous command may also remove 'data' folder (when all data
    # have been removed), we re-create 'data' if missing
    data_folder_missing = not os.path.isdir(sdconfig.data_folder)
    if data_folder_missing:
        os.makedirs(sdconfig.data_folder)

    sdlog.info("SYNCLEAN-020", "Cleanup done.")
def run(**kw):
    """Run the file pipeline on ``kw['files']`` and return the resulting files.

    The files are checked (type, fields), then handed from one pipeline
    module to the next; each module's ``run()`` result feeds the following
    one. File urls are logged before and after the 'sdshrink' step.
    """
    def log_urls(code, items):
        # one debug entry per file url (also sent to stdout)
        for item in items:
            sdlog.debug(code, "%s" % item['url'], stdout=True)

    files = kw.get('files')

    check_type(files)
    check_fields(files)

    files = sdreducerow.run(files)
    files = sdremoveaggregation.run(files)
    files = sdprotocol.run(files)
    files = sdtimefilter.run(files)
    files = sdprepare_dataset_attr.run(files)
    # NOTE: the 'sdcheck_dataset_template' step is disabled here
    files = sdreducecol.run(files)
    files = sdprepare_file_attr.run(files)
    files = sdlocalpath.run(files)

    log_urls("SDFIPIPE-004", files)
    files = sdshrink.run(files)
    log_urls("SDFIPIPE-005", files)

    # NOTE: the 'sdonemgf_post' step is disabled here.
    # BEWARE: that module does not respect the 'KISS' principle (it updates
    # global value by altering the syndac console session context). It is
    # only there for tuning purpose, so it can stay disabled to keep things
    # simple.

    files = sdcomplete.run(files)
    files = sdstatusfilter.run(files)

    return files
def run_helper(queries):
    """Process `queries` with search-API threads and collect their output.

    Loops until `queries` is empty and all threads completed, then drains
    the result and error queues. Returns a ``(metadata, errors)`` tuple
    where `metadata` is a ``sdtypes.Metadata`` fed with every successful
    result and `errors` is the list of items found in the error queue.

    notes
      - "queries" is non-threadsafe (i.e. not a Queue), but doesn't matter
        as threads do not use it
    """
    total_query_to_process = len(queries)

    sdlog.debug(
        "SDPROXMT-003",
        "%d search-API queries to process (max_thread_per_host=%d,timeout=%d)" % (
            total_query_to_process, max_thread_per_host, sdconst.SEARCH_API_HTTP_TIMEOUT))

    while True:

        if sdconfig.proxymt_progress_stat:
            per_host = ['%s=%s' % (host, len(searchAPIServices[host]['threadlist'])) for host in searchAPIServices.keys()]
            sdlog.info("SDPROXMT-033", "threads per host: %s" % ",".join(per_host))

        if len(queries) > 0:
            distribute_queries(queries)
        elif all_threads_completed():
            # leave the loop only if all threads completed
            break

        # remove completed threads from list
        for host in searchAPIServices.keys():
            service = searchAPIServices[host]
            service['threadlist'] = [thread for thread in service['threadlist'] if thread.is_alive()]

        # log
        # (progress is displayed only when there are queries at all, and
        # only while some are still waiting to be processed)
        total_query_already_processed = total_query_to_process - len(queries)
        if total_query_to_process > 0 and len(queries) > 0:
            sdlog.info(
                "SDPROXMT-004",
                "total_queries=%d, running_or_done_queries=%d, waiting_queries=%d" % (
                    total_query_to_process, total_query_already_processed, len(queries)))

        # if all services are busy, we sleep to limit loop speed
        # (note that all the code around the "sleep" call is to detect system overload)
        sleep_time = 10
        warning_threshold = 5  # threshold not to emit warning for every small load exceedance
        started = time.time()
        time.sleep(sleep_time)
        elapsed = time.time() - started
        if elapsed > sleep_time + warning_threshold:
            sdlog.warning("SDPROXMT-005", "WARNING: system overload detected (sleep takes %d second to complete)." % elapsed)

    # retrieve result from output queue
    metadata = sdtypes.Metadata()
    while not __result_queue.empty():
        success = __result_queue.get(False)  # retrieve result from ONE successful search-API call
        success.connect()  # TAGKLK434L3K34K
        metadata.slurp(success)  # warning: success is modified here

    # retrieve error from output queue and insert them into a list
    errors = []
    while not __error_queue.empty():
        errors.append(__error_queue.get(False))

    return (metadata, errors)
def run(stream=None, path=None, parameter=None, index_host=None, post_pipeline_mode='file', dry_run=False):
    """Build the search queries, run them and return the resulting metadata.

    Raises SDException("SDQSEARC-001") when no query could be built.

    With `dry_run`, nothing is searched: the url of each query is printed
    and an empty ``sdtypes.Response`` is returned. Otherwise the queries are
    processed, the result goes through
    ``sdpipeline.post_pipeline(metadata, post_pipeline_mode)`` and is
    returned.
    """
    if parameter is None:
        parameter = []

    queries = sdpipeline.build_queries(
        stream=stream, path=path, parameter=parameter, index_host=index_host,
        parallel=False, load_default=False)

    if len(queries) < 1:
        raise SDException("SDQSEARC-001", "No query to process")

    # we cast here as progress can be str (set from parameter) or bool (set programmaticaly)
    progress = sdsqueries.get_scalar(queries, 'progress', False, type_=bool)
    searchapi_host = sdsqueries.get_scalar(queries, 'searchapi_host')

    if dry_run:
        for query in queries:
            request = sdtypes.Request(url=query['url'], pagination=False)
            print('%s' % request.get_url())
        return sdtypes.Response()

    try:
        if progress:
            # waiting message => TODO: move into ProgressThread class
            sdtools.print_stderr(sdi18n.m0003(searchapi_host))
            # spinner start
            ProgressThread.start(sleep=0.1, running_message='', end_message='Search completed.')

        metadata = process_queries(queries).to_metadata()
        sdlog.debug("SDQSEARC-002", "files-count=%d" % metadata.count())

        metadata = sdpipeline.post_pipeline(metadata, post_pipeline_mode)
        sdlog.debug("SDQSEARC-004", "files-count=%d" % metadata.count())

        return metadata
    finally:
        if progress:
            ProgressThread.stop()  # spinner stop
def start_new_thread(host, url):
    """Create, start and return a MetadataThread for `host` and `url`.

    The thread is given the "iSearchAPIProxy" entry registered for `host`
    in ``searchAPIServices`` and the module result and error queues. It is
    flagged as daemon (``setDaemon(True)``) before being started.
    """
    sdlog.debug("SDPROXMT-002", "Starting new search-API thread (%s)" % host)

    proxy = searchAPIServices[host]["iSearchAPIProxy"]

    worker = MetadataThread(host, proxy, url, __result_queue, __error_queue)
    worker.setDaemon(True)
    worker.start()

    return worker
def is_nearestpost_enabled(metadata):
    """Return True when 'nearestpost' is enabled for `metadata`, else False.

    It is enabled when ``sdconfig.nearest_schedule`` is 'post' and
    ``nearest_flag_set_on_all_files(metadata)`` is true (the latter is only
    evaluated when the former holds).
    """
    sdlog.debug("SSHRINKT-001", "Check if nearestpost is enabled..")

    enabled = False
    if sdconfig.nearest_schedule == 'post':
        if nearest_flag_set_on_all_files(metadata):
            enabled = True

    sdlog.debug("SSHRINKT-002", "nearestpost is %s" % enabled)

    return enabled
def run(**kw):
    """Run the file pipeline on ``kw['files']`` and return the resulting files.

    The files are checked (type, fields), then handed from one pipeline
    module to the next; each module's ``run()`` result feeds the following
    one. File urls are logged before and after the 'sdshrink' step.
    """
    def log_urls(code, items):
        # one debug entry per file url (also sent to stdout)
        for item in items:
            sdlog.debug(code, "%s" % item['url'], stdout=True)

    files = kw.get('files')

    check_type(files)
    check_fields(files)

    files = sdreducerow.run(files)
    files = sdremoveaggregation.run(files)
    files = sdprotocol.run(files)
    files = sdtimefilter.run(files)
    files = sdprepare_dataset_attr.run(files)
    # NOTE: the 'sdcheck_dataset_template' step is disabled here

    # The number of columns is not reduced here anymore (the 'sdreducecol'
    # step is disabled).
    #
    # Notes
    #   - not reducing the number of columns here may slightly diminish
    #     performance (memory, cpu). But as those informations are needed
    #     (e.g. description, variable_long_name, facets..), there is no
    #     choice.
    #   - those informations are kept even if they are not essential, as
    #     they will soon be needed to provide more descriptive informations
    #     to the user (e.g. description, variable_long_name..)
    #   - all facets are kept so the user can build custom local path
    #     (see local_path_custom_transform() func for more info)
    #   - those columns are now removed downstream (but only for 'dump'
    #     action)

    files = sdprepare_file_attr.run(files)
    files = sdlocalpath.run(files)

    log_urls("SDFIPIPE-004", files)
    files = sdshrink.run(files)
    log_urls("SDFIPIPE-005", files)

    # EXT_FILE_POST
    #
    # load extensions here
    #
    # TODO

    files = sdcomplete.run(files)
    files = sdstatusfilter.run(files)

    return files
def is_nearestpost_enabled(metadata):
    """Return True when 'nearestpost' is enabled for `metadata`, else False.

    It is enabled when ``sdconfig.nearest_schedule`` is 'post' and
    ``nearest_flag_set_on_all_files(metadata)`` is true (the latter is only
    evaluated when the former holds).
    """
    sdlog.debug("SSHRINKT-001", "Check if nearestpost is enabled..")

    enabled = False
    if sdconfig.nearest_schedule == 'post':
        if nearest_flag_set_on_all_files(metadata):
            enabled = True

    sdlog.debug("SSHRINKT-002", "nearestpost is %s" % enabled)

    return enabled
def get_urls(file_functional_id):
    """Return a prioritized list of [url, protocol] pairs that can supply the file.

    The file is looked up through sdquicksearch by ``instance_id``. If that
    returns nothing, a second search is made with the file id wildcarded.
    The return value is whatever ``prioritize_urlps()`` returns for the
    collected pairs; the list of pairs handed to it may be empty.

    Any exception raised by the first search is logged and re-raised as is.
    """
    def search(instance_id):
        # same query for both attempts; only the instance_id value changes
        return sdquicksearch.run(
            parameter=['limit=4', 'fields=%s' % url_fields, 'type=File', 'instance_id=%s' % instance_id],
            post_pipeline_mode=None)

    try:
        result = search(file_functional_id)
    except Exception as e:
        sdlog.debug("SDNEXTUR-015", "exception %s.  instance_id=%s" % (e, file_functional_id))
        # bare raise keeps the original traceback ('raise e' would not
        # under Python 2)
        raise

    li = result.get_files()
    sdlog.info("SDNEXTUR-016", "sdquicksearch returned %s sets of file urls: %s" % (len(li), li))

    if li == []:
        # No urls found. Try again, but wildcard the file id. (That leads to
        # a string search on all fields for the wildcarded file id, rather
        # than a match of the instance_id field only.)
        result = search('%s*' % file_functional_id)
        li = result.get_files()
        sdlog.info("SDNEXTUR-017", "sdquicksearch 2nd call %s sets of file urls: %s" % (len(li), li))

    # result looks like
    # [ {protocol11:url11, protocol12:url12, attached_parameters:dict, score:number, type:'File',
    #    size:number}, {another dict of the same format}, {another dict},... ]
    # with no more than limit=4 items in the list, and no more than three protocols.
    # We'll return something like urlps = [ [url1,protocol1], [url2,protocol2],... ]
    # Note: These nested lists are ugly; it's just a quick way to code something up.
    urlps = []
    for dic in li:
        # Protocol keys are one of 'url_opendap', 'url_http', 'url_gridftp'.
        # The search for //None bypasses an issue with the SOLR lookup where
        # there is no url_gridftp possibility.
        urlps += [[dic[key], key] for key in dic
                  if key.find('url_') >= 0 and dic[key].find('//None') < 0]

    return prioritize_urlps(urlps)
def map_to_globus(url):
    """Map a gridftp url to a Globus endpoint name and path.

    Returns a ``(src_endpoint, src_path, path)`` tuple where ``path`` is the
    url path with runs of "/" collapsed.

    When the url host is a key of ``globus_endpoints``, ``src_endpoint`` is
    that endpoint's name and ``src_path`` is ``path`` with the first
    occurrence of the endpoint's ``path_out`` removed and its ``path_in``
    prepended (each step only when the attribute is set). Otherwise
    ``src_endpoint`` is None and ``src_path`` equals ``path``.
    """
    parsed_url = urlparse.urlparse(url)
    hostname = parsed_url.netloc
    src_endpoint = None
    src_path = re.sub('/+', '/', parsed_url.path)
    path = src_path
    if hostname in globus_endpoints:
        endpoint = globus_endpoints[hostname]
        src_endpoint = endpoint.name
        path_out = endpoint.path_out
        path_in = endpoint.path_in
        if path_out:
            # str.replace() returns a new string: the result must be kept
            src_path = src_path.replace(path_out, '', 1)
        if path_in:
            # the prefix to add is path_in (path_out may be None here)
            src_path = path_in + src_path
        sdlog.debug("SDDMGLOB-024", "Mapped url %s to %s%s" % (url, src_endpoint, src_path))
    return src_endpoint, src_path, path
def run(self, url=None, attached_parameters=None):
    """Execute one search query (as pagination is used, it can result in many HTTP queries).

    Any exception raised by the paginated call is logged (url, then
    exception text) and re-raised unchanged.
    """
    attached_parameters = {} if attached_parameters is None else attached_parameters

    request = sdtypes.Request(url=url, pagination=True)
    final_url = request.get_url()

    sdlog.debug("SYDPROXY-490", "paginated call started (url=%s)" % final_url)

    try:
        paginated_response = self.call_web_service__PAGINATION(request)
    except Exception as e:
        sdlog.error("SYDPROXY-400", "Error occurs during search-API paginated call (url=%s)" % (final_url,))
        sdlog.error("SYDPROXY-410", "%s" % (str(e),))
        raise
def run(files):
    """Set the 'url' of each file according to its 'protocol' attached parameter.

    The protocol defaults to ``sdconst.TRANSFER_PROTOCOL_HTTP``. For each
    protocol the candidate url keys are tried in order (first present wins):

    - globus:  'url_globus', 'url_gridftp', 'url_http'
    - gridftp: 'url_gridftp', 'url_http'
    - http:    'url_http', 'url_gridftp'

    When none of the candidates is present, 'url' is left untouched. The
    'url_*' keys are removed from the file afterwards.

    Raises SDException for a protocol outside ``sdconst.TRANSFER_PROTOCOLS``
    ("SYNPROTO-004") or not handled below ("SYNPROTO-003").
    """
    for f in files:
        protocol = sdpostpipelineutils.get_attached_parameter(f, 'protocol', sdconst.TRANSFER_PROTOCOL_HTTP)

        if protocol not in sdconst.TRANSFER_PROTOCOLS:
            raise SDException("SYNPROTO-004", "Incorrect protocol (%s)" % protocol)

        if protocol == sdconst.TRANSFER_PROTOCOL_GLOBUS:
            if 'url_globus' in f:
                f['url'] = f['url_globus']
            elif 'url_gridftp' in f:
                f['url'] = f['url_gridftp']
            elif 'url_http' in f:
                sdlog.warning('SYNPROTO-005', 'Fallback to http as globus url is missing')
                f['url'] = f['url_http']

        elif protocol == sdconst.TRANSFER_PROTOCOL_GRIDFTP:
            if 'url_gridftp' in f:
                f['url'] = f['url_gridftp']
            elif 'url_http' in f:
                sdlog.debug('SYNPROTO-002', 'Fallback to http as gridftp url is missing (%s)' % f["title"])
                f['url'] = f['url_http']

        elif protocol == sdconst.TRANSFER_PROTOCOL_HTTP:
            if 'url_http' in f:
                f['url'] = f['url_http']
            elif 'url_gridftp' in f:
                sdlog.warning('SYNPROTO-001', 'Fallback to gridftp as http url is missing')
                f['url'] = f['url_gridftp']

        else:
            raise SDException("SYNPROTO-003", "Incorrect protocol (%s)" % protocol)

        # the per-protocol urls are not needed anymore once 'url' is chosen
        sdtools.remove_dict_items(f, ['url_globus', 'url_gridftp', 'url_http', 'url_opendap'])

    return files
def run(stream=None, path=None, parameter=None, index_host=None, post_pipeline_mode='file', dry_run=False):
    """Build the search queries, run them and return the resulting metadata.

    Raises SDException("SDQSEARC-001") when no query could be built.

    With `dry_run`, nothing is searched: the url of each query is printed
    and an empty ``sdtypes.Response`` is returned. Otherwise the queries are
    processed, the result goes through
    ``sdpipeline.post_pipeline(metadata, post_pipeline_mode)`` and is
    returned.
    """
    if parameter is None:
        parameter = []

    queries = sdpipeline.build_queries(
        stream=stream, path=path, parameter=parameter, index_host=index_host,
        parallel=False, load_default=False)

    if len(queries) < 1:
        raise SDException("SDQSEARC-001", "No query to process")

    # we cast here as progress can be str (set from parameter) or bool (set programmaticaly)
    progress = sdsqueries.get_scalar(queries, 'progress', False, type_=bool)
    searchapi_host = sdsqueries.get_scalar(queries, 'searchapi_host')

    if dry_run:
        for query in queries:
            request = sdtypes.Request(url=query['url'], pagination=False)
            print('%s' % request.get_url())
        return sdtypes.Response()

    try:
        if progress:
            # waiting message => TODO: move into ProgressThread class
            sdtools.print_stderr(sdi18n.m0003(searchapi_host))
            # spinner start
            ProgressThread.start(sleep=0.1, running_message='', end_message='Search completed.')

        metadata = process_queries(queries).to_metadata()
        sdlog.debug("SDQSEARC-002", "files-count=%d" % metadata.count())

        metadata = sdpipeline.post_pipeline(metadata, post_pipeline_mode)
        sdlog.debug("SDQSEARC-004", "files-count=%d" % metadata.count())

        return metadata
    finally:
        if progress:
            ProgressThread.stop()  # spinner stop
def run(files):
    """Set the 'url' of each file according to its 'protocol' attached parameter.

    The protocol defaults to ``sdconst.TRANSFER_PROTOCOL_HTTP``. When both
    the gridftp and http urls are available, the one matching the protocol
    is used. When only one of them is available, it is used whatever the
    protocol is (a fallback message is logged if it does not match). The
    'url_*' keys are removed from the file afterwards.

    Raises SDException for a protocol outside ``sdconst.TRANSFER_PROTOCOLS``
    ("SYNPROTO-004"), or, when both urls are available, for a protocol that
    is neither gridftp nor http ("SYNPROTO-003").
    """
    for f in files:
        protocol = sdpostpipelineutils.get_attached_parameter(f, 'protocol', sdconst.TRANSFER_PROTOCOL_HTTP)

        if protocol not in sdconst.TRANSFER_PROTOCOLS:
            raise SDException("SYNPROTO-004", "Incorrect protocol (%s)" % protocol)

        has_gridftp = 'url_gridftp' in f
        has_http = 'url_http' in f

        if has_gridftp and has_http:
            if protocol == sdconst.TRANSFER_PROTOCOL_GRIDFTP:
                f['url'] = f['url_gridftp']
            elif protocol == sdconst.TRANSFER_PROTOCOL_HTTP:
                f['url'] = f['url_http']
            else:
                raise SDException("SYNPROTO-003", "Incorrect protocol (%s)" % protocol)

        elif has_gridftp:
            # only gridftp
            if protocol == sdconst.TRANSFER_PROTOCOL_HTTP:
                sdlog.warning('SYNPROTO-001', 'Fallback to gridftp as http url is missing')
            f['url'] = f['url_gridftp']

        elif has_http:
            # only http
            if protocol == sdconst.TRANSFER_PROTOCOL_GRIDFTP:
                sdlog.debug('SYNPROTO-002', 'Fallback to http as gridftp url is missing (%s)' % f["title"])
            f['url'] = f['url_http']

        else:
            # no url available to download the file
            # (should not be here as sdremoverow takes care of those cases)
            assert False

        # the per-protocol urls are not needed anymore once 'url' is chosen
        sdtools.remove_dict_items(f, ['url_gridftp', 'url_http', 'url_opendap'])

    return files
def run(self, url=None, attached_parameters=None):
    """Execute one search query (as pagination is used, it can result in many HTTP queries).

    Any exception raised by the paginated call is logged (url, then
    exception text) and re-raised unchanged.
    """
    attached_parameters = {} if attached_parameters is None else attached_parameters

    request = sdtypes.Request(url=url, pagination=True)
    final_url = request.get_url()

    sdlog.debug("SYDPROXY-490", "paginated call started (url=%s)" % final_url)

    try:
        paginated_response = self.call_web_service__PAGINATION(request)
    except Exception as e:
        sdlog.error("SYDPROXY-400", "Error occurs during search-API paginated call (url=%s)" % (final_url, ))
        sdlog.error("SYDPROXY-410", "%s" % (str(e), ))
        raise
def call_web_service(self, request):
    """Run a single search-API HTTP call and return its response.

    The url comes from ``request.get_url()`` and the call is delegated to
    ``sdnetutils.call_web_service()`` with ``sdconst.SEARCH_API_HTTP_TIMEOUT``
    as timeout. If that call raises, ``request.offset`` is set back to 0 and
    the exception is re-raised unchanged.
    """
    sdlog.debug("SYDPROXY-100", "Search-API call started (%s)." % request.get_url())

    try:
        # returns Response object
        response = sdnetutils.call_web_service(
            request.get_url(),
            timeout=sdconst.SEARCH_API_HTTP_TIMEOUT)
    except:
        # Deliberately bare: whatever interrupts the call, a failure here
        # also cancels all the previous calls made inside the current
        # paginated call. The offset is rewound so the paginated call can be
        # restarted from the beginning next time (maybe overkill, as the
        # offset is reinitialized when entering
        # 'call_web_service__PAGINATION()').
        request.offset = 0
        raise
    else:
        completed_msg = "Search-API call completed (returned-files-count=%i,match-count=%i,url=%s)." % (
            response.count(), response.num_found, request.get_url())
        sdlog.info("SYDPROXY-100", completed_msg)
        return response
def run_pipeline(metadata, po, io_mode=sdconst.PROCESSING_FETCH_MODE_GENERATOR):
    """Apply the processing object `po` to the files of `metadata`.

    ``po.f`` is called with the files (all at once or chunk by chunk,
    depending on `io_mode`) followed by ``po.args`` / ``po.kwargs``, and its
    return value replaces the files it was given. The processed metadata is
    returned.

    Note
        Beware: the metadata input argument is modified in this func !
        (make a copy before calling this func to keep the original data)
    """
    func, func_args, func_kwargs = po.f, po.args, po.kwargs

    sdlog.debug("SYNDPIPR-001", "Start chunk loop (files-count=%d)" % metadata.count())

    if io_mode == 'no_chunk':
        # way 0: load-all-in-memory (no chunk)
        metadata.set_files(func(metadata.get_files(), *func_args, **func_kwargs))

    elif io_mode == 'generator':
        # way 1: chunk-by-chunk (using a second store)
        processed = sdtypes.Metadata()
        for chunk in metadata.get_chunks(io_mode):
            sdlog.debug("SYNDPIPR-002", "Process chunk")
            processed.add_files(func(chunk, *func_args, **func_kwargs))

        # The old metadata value gets dropped here. This is to enforce that
        # this function IS destructive with its input argument (see the
        # docstring note).
        metadata = processed

    elif io_mode == 'pagination':
        # way 2: chunk-by-chunk (updating store on-the-fly)
        for chunk in metadata.get_chunks(io_mode):
            metadata.update(func(chunk, *func_args, **func_kwargs))  # TODO: check if 'size' is handled here

    elif io_mode == 'experimental':
        # use 'ALTER TABLE foo RENAME TO bar' here
        pass

    else:
        assert False

    sdlog.debug("SYNDPIPR-003", "Chunk loop completed (files-count=%d)" % metadata.count())

    return metadata
class SearchAPIProxy():
    """Proxy executing paginated search-API queries."""

    def __init__(self, **kw):
        # keyword arguments are accepted but not used
        pass

    def run(self, url=None, attached_parameters=None):
        """Execute one search query (as pagination is used, it can result in many HTTP queries).

        Returns the paginated response converted to metadata, with
        `attached_parameters` added through ``sdaddap.run()``. When the
        'verbose' attached parameter equals True, the url and the call
        duration are also printed on stderr.

        Any exception raised by the paginated call is logged (url, then
        exception text) and re-raised unchanged.
        """
        attached_parameters = {} if attached_parameters is None else attached_parameters

        request = sdtypes.Request(url=url, pagination=True)
        final_url = request.get_url()

        sdlog.debug("SYDPROXY-490", "paginated call started (url=%s)" % final_url)

        try:
            paginated_response = self.call_web_service__PAGINATION(request)
        except Exception as e:
            sdlog.error("SYDPROXY-400", "Error occurs during search-API paginated call (url=%s)" % (final_url, ))
            sdlog.error("SYDPROXY-410", "%s" % (str(e), ))
            raise

        sdlog.debug(
            "SYDPROXY-001",
            "paginated call completed (call-duration=%i, files-count=%i, url=%s)" % (
                paginated_response.call_duration, paginated_response.count(), final_url))

        verbose = attached_parameters.get('verbose', False)
        if verbose == True:
            sdtools.print_stderr("Url: %s" % final_url)
            sdtools.print_stderr("Duration: %s" % paginated_response.call_duration)
            sdtools.print_stderr("")

        # we cast to remove pagination related code and have a lighter object
        return sdaddap.run(paginated_response.to_metadata(), attached_parameters)
'|' )[0] # keep only first field (i.e. keep only the file url) protocol = item.split('|')[-1] if protocol.upper() == "HTTPSERVER": l__dict['url_http'] = url elif protocol.upper() == "GRIDFTP": l__dict['url_gridftp'] = url elif protocol.upper() == "OPENDAP": l__dict['url_opendap'] = url else: l__dict[attr_name] = attr_value l__files.append(l__dict) sdlog.debug("SYNDJSON-014", "files-count=%d" % len(l__files)) return { 'files': l__files, 'num_found': l__num_found, 'num_result': len(l__files) } if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-f', '--file', required=True) args = parser.parse_args() # read search-api output sample with open(args.file, 'r') as fh:
def transfers_end():
    """Poll the status of every Globus task in ``globus_tasks`` and update its transfers.

    For each task, the status is requested from the Globus transfer API and
    applied to every transfer object (``item['tr']``) of the task:

    - "SUCCEEDED": the local file is verified (size, then checksum) and the
      transfer status is set to DONE or ERROR accordingly;
    - "FAILED": the transfer status is set to ERROR and the local file, if
      any, is removed;
    - any other status: the transfer object is left untouched.

    Raises FatalException("SDDMGLOB-507") when a checksum mismatch is found
    and ``incorrect_checksum_action`` is neither "remove" nor "keep".

    NOTE(review): ``globus_username``, ``globus_password``, ``globus_tasks``
    and ``incorrect_checksum_action`` are not defined in this block;
    presumably module-level settings -- confirm.
    """
    _, _, access_token = api_client.goauth.get_access_token(username=globus_username, password=globus_password)
    api = api_client.TransferAPIClient(username=globus_username, goauth=access_token)

    for task_id in globus_tasks:

        # only the "status" field of the task is requested
        code, reason, data = api.task(task_id, fields="status")
        status = data['status']

        sdlog.debug("SDDMGLOB-016", "Checking the status of Globus transfer tasks, id: %s, status: %s" % (task_id, status))

        # the task status is applied to every transfer attached to the task
        for item in globus_tasks[task_id]['items']:
            tr = item['tr']

            if status == "SUCCEEDED":

                assert tr.size is not None

                # a size mismatch is only logged: the checksum verification
                # below is what decides the transfer status
                if int(tr.size) != os.path.getsize(tr.get_full_local_path()):
                    sdlog.error("SDDMGLOB-002","size don't match (remote_size=%i,local_size=%i,local_path=%s)"%(int(tr.size),os.path.getsize(tr.get_full_local_path()),tr.get_full_local_path()))

                # retrieve local and remote checksum
                # (the local checksum is computed with 'md5' when the transfer has no checksum type)
                checksum_type=tr.checksum_type if tr.checksum_type is not None else 'md5'
                local_checksum=sdutils.compute_checksum(tr.get_full_local_path(),checksum_type)
                remote_checksum=tr.checksum # retrieve remote checksum

                if remote_checksum!=None:
                    # remote checksum exists

                    # compare local and remote checksum
                    if remote_checksum==local_checksum:
                        # checksum is ok
                        tr.status = sdconst.TRANSFER_STATUS_DONE
                    else:
                        # checksum is not ok

                        if incorrect_checksum_action=="remove":
                            tr.status=sdconst.TRANSFER_STATUS_ERROR
                            tr.error_msg="File corruption detected: local checksum doesn't match remote checksum"

                            # remove file from local repository
                            sdlog.error("SDDMGLOB-155","checksum don't match: remove local file (local_checksum=%s,remote_checksum=%s,local_path=%s)"%(local_checksum,remote_checksum,tr.get_full_local_path()))
                            try:
                                os.remove(tr.get_full_local_path())
                            except Exception,e:
                                # best effort: the failure is logged (without the exception text) and processing goes on
                                sdlog.error("SDDMGLOB-158","error occurs while removing local file (%s)"%tr.get_full_local_path())

                        elif incorrect_checksum_action=="keep":
                            # the mismatch is logged but the file is kept and the transfer is considered done
                            sdlog.info("SDDMGLOB-157","local checksum doesn't match remote checksum (%s)"%tr.get_full_local_path())
                            tr.status=sdconst.TRANSFER_STATUS_DONE
                        else:
                            raise FatalException("SDDMGLOB-507","incorrect value (%s)"%incorrect_checksum_action)
                else:
                    # remote checksum is missing
                    # NOTE: we DON'T store the local checksum ('file' table contains only the REMOTE checksum)
                    tr.status = sdconst.TRANSFER_STATUS_DONE

                if tr.status == sdconst.TRANSFER_STATUS_DONE:
                    tr.end_date=sdtime.now() # WARNING: this is not the real end of transfer date but the date when we ask the globus scheduler if the transfer is done.
                    tr.error_msg=""
                    sdlog.info("SDDMGLOB-101", "Transfer done (%s)" % str(tr))

            elif status == "FAILED":
                tr.status = sdconst.TRANSFER_STATUS_ERROR
                tr.error_msg = "Error occurs during download."

                # NOTE(review): same log code as the "Transfer done" entry above
                sdlog.info("SDDMGLOB-101", "Transfer failed (%s)" % str(tr))

                # Remove local file if exists
                if os.path.isfile(tr.get_full_local_path()):
                    try:
                        os.remove(tr.get_full_local_path())
                    except Exception,e:
                        # best effort: the failure is logged and processing goes on
                        sdlog.error("SDDMGLOB-528","Error occurs during file suppression (%s,%s)"%(tr.get_full_local_path(),str(e)))
# # TODO: maybe always enable this # sdtrace.log_exception() # debug # # (if the error is not due to a network error (e.g. internet connection # problem), raise the original exception below and set the debug mode # to see the stacktrace. # #raise raise SDException('SDNETUTI-008','Network error (see log for details)') # we raise a new exception 'network error' here, because most of the time, 'xml parsing error' is due to an 'network error'. sdlog.debug("SDNETUTI-044","files-count=%d"%len(di.get('files'))) return sdtypes.Response(call_duration=elapsed_time,lowmem=lowmem,**di) # RAM storage is ok here as one response is limited by SEARCH_API_CHUNKSIZE def call_param_web_service(url,timeout): buf=HTTP_GET(url,timeout) buf=fix_encoding(buf) try: params=search_api_parser.parse_parameters(buf) except Exception as e: # If we are here, it's likely that they is a problem with the internet connection # (e.g. we are behind an HTTP proxy and have no authorization to use it)
l__dict[l__name].append(l__value) elif arr_n.tag=="float": # type not used for now """ sample: <arr name="score"><float name="score">1.9600565</float></arr> """ pass l__files.append(l__dict) sdlog.debug("SYNDAXML-014","files-count=%d"%len(l__files)) return {'files':l__files,'num_found':l__num_found,'num_result':len(l__files)} if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('-f','--file',required=True) args = parser.parse_args() # read search-api output sample with open(args.file, 'r') as fh: buffer=fh.read() #result=parse_parameters(buffer) result=parse_metadata(buffer)
# sdtrace.log_exception() # debug # # (if the error is not due to a network error (e.g. internet connection # problem), raise the original exception below and set the debug mode # to see the stacktrace. # #raise raise SDException( 'SDNETUTI-008', 'Network error (see log for details)' ) # we raise a new exception 'network error' here, because most of the time, 'xml parsing error' is due to an 'network error'. sdlog.debug("SDNETUTI-044", "files-count=%d" % len(di.get('files'))) return sdtypes.Response( call_duration=elapsed_time, lowmem=lowmem, **di ) # RAM storage is ok here as one response is limited by SEARCH_API_CHUNKSIZE def call_param_web_service(url, timeout): buf = HTTP_GET(url, timeout) buf = fix_encoding(buf) try: params = search_api_parser.parse_parameters(buf) except Exception as e:
def _finalize_succeeded_transfer(tr):
    """Set the final state of a transfer whose Globus task succeeded.

    The local file size is compared with ``tr.size`` (a mismatch is only
    logged), then the local checksum is compared with the remote checksum
    ``tr.checksum`` when one is available. ``tr`` is updated in place.

    Raises:
        FatalException: if the checksums differ and the module-level
            ``incorrect_checksum_action`` is neither "remove" nor "keep".
    """
    assert tr.size is not None

    local_path = tr.get_full_local_path()

    # A size mismatch is only reported here; the status is decided by the
    # checksum comparison below.
    remote_size = int(tr.size)
    local_size = os.path.getsize(local_path)
    if remote_size != local_size:
        sdlog.error(
            "SDDMGLOB-002",
            "size don't match (remote_size=%i,local_size=%i,local_path=%s)"
            % (remote_size, local_size, local_path))

    # retrieve local and remote checksum
    checksum_type = tr.checksum_type if tr.checksum_type is not None else sdconst.CHECKSUM_TYPE_MD5
    local_checksum = sdutils.compute_checksum(local_path, checksum_type)
    remote_checksum = tr.checksum

    if remote_checksum is None:
        # remote checksum is missing
        # NOTE: we DON'T store the local checksum ('file' table contains only the REMOTE checksum)
        tr.status = sdconst.TRANSFER_STATUS_DONE
    elif remote_checksum == local_checksum:
        # checksum is ok
        tr.status = sdconst.TRANSFER_STATUS_DONE
    elif incorrect_checksum_action == "remove":
        # checksum is not ok: flag the transfer as failed and drop the file
        tr.status = sdconst.TRANSFER_STATUS_ERROR
        tr.priority -= 1
        tr.error_msg = "File corruption detected: local checksum doesn't match remote checksum"

        sdlog.error(
            "SDDMGLOB-155",
            "checksum don't match: remove local file (local_checksum=%s,remote_checksum=%s,local_path=%s)"
            % (local_checksum, remote_checksum, local_path))

        # Best effort removal: a failure is logged (with its cause) but
        # does not abort the processing of the remaining transfers.
        try:
            os.remove(local_path)
        except Exception as e:
            sdlog.error(
                "SDDMGLOB-158",
                "error occurs while removing local file (%s,%s)"
                % (local_path, str(e)))
    elif incorrect_checksum_action == "keep":
        # checksum is not ok, but the configuration asks to keep the file
        sdlog.info(
            "SDDMGLOB-157",
            "local checksum doesn't match remote checksum (%s)" % local_path)
        tr.status = sdconst.TRANSFER_STATUS_DONE
    else:
        raise FatalException(
            "SDDMGLOB-507",
            "incorrect value (%s)" % incorrect_checksum_action)

    if tr.status == sdconst.TRANSFER_STATUS_DONE:
        # WARNING: this is not the real end of transfer date but the date
        # when we ask the globus scheduler if the transfer is done.
        tr.end_date = sdtime.now()
        tr.error_msg = ""
        sdlog.info("SDDMGLOB-101", "Transfer done (%s)" % str(tr))


def _finalize_failed_transfer(tr):
    """Flag ``tr`` as failed and remove its local file if there is one."""
    tr.status = sdconst.TRANSFER_STATUS_ERROR
    tr.priority -= 1
    tr.error_msg = "Error occurs during download."

    sdlog.info("SDDMGLOB-101", "Transfer failed (%s)" % str(tr))

    # Remove local file if exists
    local_path = tr.get_full_local_path()
    if os.path.isfile(local_path):
        try:
            os.remove(local_path)
        except Exception as e:
            sdlog.error(
                "SDDMGLOB-528",
                "Error occurs during file suppression (%s,%s)"
                % (local_path, str(e)))


def transfers_end():
    """Query the status of each Globus task and update its transfers.

    An access token is requested with the module-level Globus credentials,
    then, for every task id in the module-level ``globus_tasks`` mapping,
    the task status is fetched and applied to each transfer object stored
    under ``globus_tasks[task_id]['items'][...]['tr']``:

    * "SUCCEEDED": size and checksum of the local file are checked and the
      transfer is flagged as done or in error accordingly.
    * "FAILED": the transfer is flagged in error and its local file, if
      any, is removed.

    Transfers of tasks in any other status are left as they are.

    Raises:
        FatalException: if a checksum mismatch is found and
            ``incorrect_checksum_action`` holds an unsupported value.
    """
    _, _, access_token = api_client.goauth.get_access_token(
        username=globus_username, password=globus_password)
    api = api_client.TransferAPIClient(
        username=globus_username, goauth=access_token)

    for task_id in globus_tasks:
        # Only the 'data' part of the response is used here.
        _, _, data = api.task(task_id, fields="status")
        status = data['status']

        sdlog.debug(
            "SDDMGLOB-016",
            "Checking the status of Globus transfer tasks, id: %s, status: %s"
            % (task_id, status))

        for item in globus_tasks[task_id]['items']:
            tr = item['tr']
            if status == "SUCCEEDED":
                _finalize_succeeded_transfer(tr)
            elif status == "FAILED":
                _finalize_failed_transfer(tr)
break if quit == 1: if can_leave( ): # wait until all threads finish and until everything has been processed on the database I/O queue sdlog.info("SDTSCHED-001", "eot_queue orders processing completed", stderr=False) sdlog.info("SDTSCHED-003", "Running transfer processing completed", stderr=False) break time.sleep(main_loop_sleep) sdlog.debug("SDTSCHED-400", "end of event loop") print sdlog.info("SDTSCHED-901", "Scheduler successfully stopped", stderr=True) # module init. quit = 0 # 0 => start, 1 => stop scheduler_state = 0 # 0 => stopped, 1 => running, 2 => starting main_loop_sleep = 9 sdlog.set_default_logger(sdconst.LOGGER_CONSUMER) if sdconfig.prevent_daemon_and_ihm: if os.path.isfile(sdconfig.ihm_pid_file): sdlog.info("SDTSCHED-014",