def run_helper(queries):
    """
    Notes
        'queries' is not threadsafe (i.e. not a Queue), but that doesn't matter
        as the threads do not use it.
    """
    total_query_to_process=len(queries)

    sdlog.debug("SDPROXMT-003","%d search-API queries to process (max_thread_per_host=%d,timeout=%d)"%(total_query_to_process,max_thread_per_host,sdconst.SEARCH_API_HTTP_TIMEOUT))

    while True:
        if sdconfig.proxymt_progress_stat:
            sdlog.info("SDPROXMT-033","threads per host: %s"%",".join(['%s=%s'%(host,len(searchAPIServices[host]['threadlist'])) for host in searchAPIServices.keys()]))

        if len(queries)>0:
            distribute_queries(queries)
        else:
            # leave the loop only once all threads have completed
            if all_threads_completed():
                break

        # remove completed threads from the list
        for host in searchAPIServices.keys():
            li=[]
            for t in searchAPIServices[host]['threadlist']:
                if t.is_alive():
                    li.append(t)
            searchAPIServices[host]['threadlist']=li

        # log
        total_query_already_processed = total_query_to_process - len(queries)
        if total_query_to_process > 0: # display progress only when there are queries to process
            if len(queries) > 0: # display progress only while queries are still waiting
                sdlog.info("SDPROXMT-004","total_queries=%d, running_or_done_queries=%d, waiting_queries=%d"%(total_query_to_process,total_query_already_processed,len(queries)))

        # if all services are busy, we sleep to limit loop speed
        # (note that the code around the 'sleep' call is there to detect system overload)
        sleep_time=10
        warning_threshold=5 # threshold so as not to emit a warning for every small load exceedance
        befo=time.time()
        time.sleep(sleep_time)
        afte=time.time()
        diff=afte-befo
        if diff>sleep_time+warning_threshold:
            sdlog.warning("SDPROXMT-005","WARNING: system overload detected (sleep took %d seconds to complete)."%diff)

    # retrieve results from the output queue
    metadata=sdtypes.Metadata()
    while not __result_queue.empty():
        success=__result_queue.get(False) # retrieve the result of ONE successful search-API call
        success.connect() # TAGKLK434L3K34K
        metadata.slurp(success) # warning: success is modified here

    # retrieve errors from the error queue and insert them into a list
    errors=[]
    while not __error_queue.empty():
        query=__error_queue.get(False)
        errors.append(query)

    return (metadata,errors)
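# all_threads_completed() is referenced above but not shown in this excerpt.
# A minimal sketch of what such a helper could look like, assuming the same
# searchAPIServices structure (host -> {'threadlist': [...]}) used in
# run_helper(); this is an illustration, not the project's actual implementation.
def all_threads_completed():
    # done only when no host has a live worker thread left
    for host in searchAPIServices.keys():
        if len(searchAPIServices[host]['threadlist']) > 0:
            return False
    return True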
def sequential_exec(queries):
    search = sdproxy.SearchAPIProxy()
    metadata = sdtypes.Metadata()
    for i, q in enumerate(queries):
        sdlog.info("SYNDARUN-001", "Process query %d" % i)
        result = search.run(url=q['url'], attached_parameters=q.get('attached_parameters'))
        metadata.slurp(result)
    return metadata
def run_pipeline(metadata,po,io_mode=sdconst.PROCESSING_FETCH_MODE_GENERATOR):
    """
    Note
        Beware: the 'metadata' input argument is modified in this function!
        (make a copy before calling this function if you want to keep the original data)
    """

    # alias
    f=po.f
    args=po.args
    kwargs=po.kwargs

    sdlog.debug("SYNDPIPR-001","Start chunk loop (files-count=%d)"%metadata.count())

    if io_mode=='no_chunk':
        # way 0: load-all-in-memory (no chunk)

        files=f(metadata.get_files(),*args,**kwargs)
        metadata.set_files(files)
    elif io_mode=='generator':
        # way 1: chunk-by-chunk (using a second store)

        new_metadata=sdtypes.Metadata()
        for chunk in metadata.get_chunks(io_mode):
            sdlog.debug("SYNDPIPR-002","Process chunk")
            chunk=f(chunk,*args,**kwargs)
            new_metadata.add_files(chunk)

        metadata=new_metadata # note: the old metadata value gets dropped here (destructor is called). This enforces that this function IS destructive with its input argument (see the function comment for more info).
    elif io_mode=='pagination':
        # way 2: chunk-by-chunk (updating the store on-the-fly)

        for chunk in metadata.get_chunks(io_mode):
            chunk=f(chunk,*args,**kwargs)
            metadata.update(chunk) # TODO: check if 'size' is handled here
    elif io_mode=='experimental':
        # use 'ALTER TABLE foo RENAME TO bar' here
        pass
    else:
        assert False

    sdlog.debug("SYNDPIPR-003","Chunk loop completed (files-count=%d)"%metadata.count())

    return metadata
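# Usage sketch for run_pipeline(), assuming a simple processing object that
# exposes the 'f', 'args' and 'kwargs' attributes dereferenced in the alias
# block above. 'PO' and 'keep_netcdf' are hypothetical names used only for
# illustration; they are not part of the project.
class PO(object):
    def __init__(self, f, *args, **kwargs):
        self.f = f
        self.args = args
        self.kwargs = kwargs

def keep_netcdf(files):
    # per-chunk filter (or full-list filter in 'no_chunk' mode): keep NetCDF files only
    return [fd for fd in files if fd.get('url', '').endswith('.nc')]

# metadata = run_pipeline(metadata, PO(keep_netcdf), io_mode='generator')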
def _get_files(squeries,parallel,post_pipeline_mode,action,playback,record):
    """
    TODO
        Maybe move this code into the sdmts module (e.g. metadata.dump(path)).
    """
    if playback is not None:
        with open(playback, 'r') as fh:
            metadata=sdtypes.Metadata(files=json.load(fh)) # warning: loads the full list in memory
    else:
        metadata=execute_queries(squeries,parallel,post_pipeline_mode,action)

    if record is not None:
        with open(record, 'w') as fh:
            json.dump(metadata.get_files(),fh,indent=4) # warning: loads the full list in memory

    return metadata
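# Record/playback round trip, a minimal sketch with hypothetical paths and file
# dicts, showing the symmetry used by _get_files(): 'record' dumps
# metadata.get_files() as JSON, 'playback' rebuilds a Metadata object from that
# same JSON without hitting the search API.
import json
import sdtypes

files = [{'url': 'http://example.invalid/foo.nc', 'size': 1024}] # hypothetical file dicts
with open('/tmp/record.json', 'w') as fh:
    json.dump(files, fh, indent=4)                               # what 'record' writes

with open('/tmp/record.json', 'r') as fh:
    metadata = sdtypes.Metadata(files=json.load(fh))             # what 'playback' reads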
def run(queries, parallel=True):
    if parallel:
        metadata = sdtypes.Metadata()

        # we need this because a query with a specific index host can't be parallelized
        (queries_with_index_host, queries_without_index_host) = split_queries(queries)

        if len(queries_with_index_host) > 0:
            metadata.slurp(sequential_exec(queries_with_index_host))

        if len(queries_without_index_host) > 0:
            metadata.slurp(parallel_exec(queries_without_index_host))
    else:
        metadata = sequential_exec(queries)

    return metadata
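# split_queries() is not shown in this excerpt. A minimal sketch of the split
# described in the comment above, assuming each serialized query dict may carry
# an 'index_host' entry among its attached parameters; this is an assumption
# made for illustration, not the project's actual implementation.
def split_queries(queries):
    with_host=[]
    without_host=[]
    for q in queries:
        attached=q.get('attached_parameters') or {}
        if attached.get('index_host') is not None:
            with_host.append(q)
        else:
            without_host.append(q)
    return (with_host, without_host)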
def run(stream=None, selection=None, path=None, parameter=None, post_pipeline_mode='file', parallel=sdconfig.metadata_parallel_download, index_host=None, dry_run=False, load_default=None, playback=None, record=None):
    """
    Note
        'squeries' means 'serialized queries'.
    """
    if parameter is None:
        parameter=[]

    squeries=sdpipeline.build_queries(stream=stream,path=path,parameter=parameter,selection=selection,parallel=parallel,index_host=index_host,dry_run=dry_run,load_default=load_default)

    action=sdsqueries.get_scalar(squeries,'action',None)
    progress=sdsqueries.get_scalar(squeries,'progress',False,type_=bool) # cast here as progress can be a str (set from a parameter) or a bool (set programmatically)

    # Prevent use of the 'limit' keyword (it can't be used in this module because it interferes with the pagination system)
    for q in squeries:
        if sdtools.url_contains_limit_keyword(q['url']):
            raise SDException('SDSEARCH-001',"'limit' facet is not supported in this mode. Use the 'sdquicksearch' module instead.")

    if dry_run:
        sdsqueries.print_(squeries)
        return sdtypes.Metadata()
    else:
        if progress:
            #sdtools.print_stderr(sdi18n.m0003(ap.get('searchapi_host'))) # waiting message
            ProgressThread.start(sleep=0.1,running_message='',end_message='Search completed.') # start spinner

        metadata=_get_files(squeries,parallel,post_pipeline_mode,action,playback,record)

        if progress:
            ProgressThread.stop() # stop spinner

        return metadata
def run(i__queries):
    """This method contains the retry mechanism."""

    # check
    for q in i__queries:
        if sdconst.IDXHOSTMARK not in q['url']:
            raise sdexception.SDException('SDPROXMT-044','Incorrect query: host must not be set at this step')

    # retry loop
    max_retry=6
    i=0
    metadata=sdtypes.Metadata()
    l__queries=i__queries
    while i < max_retry:
        (success,errors)=run_helper(l__queries)

        metadata.slurp(success) # warning: success is modified here

        if len(errors)>0:
            sdlog.info("SDPROXMT-082","%d search-API queries failed"%(len(errors),))
            sdlog.info("SDPROXMT-083","retrying failed search-API queries")
            l__queries=errors
            i+=1
            continue
        else:
            if i>0:
                sdlog.info("SDPROXMT-089","retry succeeded")
            break

    if len(errors)>0:
        sdlog.error("SDPROXMT-084","max retry iterations reached: %d queries did not succeed"%(len(errors),))

    return metadata
def run_local(args, stream):
    import sdlfile

    syndautils.check_daemon()

    try:
        files = sdlfile.get_files(stream=stream, dry_run=args.dry_run)

        if len(files) == 0:
            raise sdexception.EmptySelectionException()

        if args.verbose:
            for f in files:
                buf = "file_id=%d, status=%s, local_path=%s, url=%s" % (f.file_id, f.status, f.get_full_local_path(), f.url)
                print_stdout(buf)

        # transform objects into dicts (needed as remove_helper() expects a list of dicts, not a list of File objects)
        files = [f.__dict__ for f in files]

        metadata = sdtypes.Metadata(files=files)

    except sdexception.EmptySelectionException, e:
        print_stderr('No packages will be installed, upgraded, or removed.')
        return 0