Example #1
def run_helper(queries):
    """
    notes
      - "queries" is non-threadsafe (i.e. not a Queue), but doesn't matter as threads do not use it
    """
    total_query_to_process=len(queries)

    sdlog.debug("SDPROXMT-003","%d search-API queries to process (max_thread_per_host=%d,timeout=%d)"%(total_query_to_process,max_thread_per_host,sdconst.SEARCH_API_HTTP_TIMEOUT))

    while True:
        if sdconfig.proxymt_progress_stat:
            sdlog.info("SDPROXMT-033","threads per host: %s"%",".join(['%s=%s'%(host,len(searchAPIServices[host]['threadlist'])) for host in searchAPIServices.keys()]))

        if len(queries)>0:
            distribute_queries(queries)
        else:
            # leave the loop only once all threads have completed
            if all_threads_completed():
                break

        # remove completed threads from list
        for host in searchAPIServices.keys():
            li=[]
            for t in searchAPIServices[host]['threadlist']:
                if t.is_alive():
                    li.append(t)
            searchAPIServices[host]['threadlist']=li

        # log
        total_query_already_processed = total_query_to_process - len(queries)
        if total_query_to_process > 0: # display progress only when there are queries to process
            if len(queries) > 0: # display progress only while queries are still waiting
                sdlog.info("SDPROXMT-004","total_queries=%d, running_or_done_queries=%d, waiting_queries=%d"%(total_query_to_process,total_query_already_processed,len(queries)))

        # if all services are busy, we sleep to limit loop speed
        # (note that all the code around the "sleep" call is to detect system overload)
        sleep_time=10
        warning_threshold=5 # margin so that a warning is not emitted for every small overshoot
        before=time.time()
        time.sleep(sleep_time)
        after=time.time()
        elapsed=after-before
        if elapsed>sleep_time+warning_threshold:
            sdlog.warning("SDPROXMT-005","WARNING: system overload detected (sleep took %d seconds to complete)."%elapsed)

    # retrieve results from the output queue
    metadata=sdtypes.Metadata()
    while not __result_queue.empty():
        success=__result_queue.get(False) # retrieve result from ONE successful search-API call
        success.connect() # TAGKLK434L3K34K
        metadata.slurp(success) # warning: success is modified here

    # retrieve errors from the output queue and collect them into a list
    errors=[]
    while not __error_queue.empty():
        query=__error_queue.get(False)
        errors.append(query)

    return (metadata,errors)
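
The drain step at the end relies on non-blocking get(False) calls against the two output queues. A minimal, self-contained sketch of that pattern, using only the standard library (the names below are illustrative, not part of the original module):

import queue  # 'Queue' on Python 2

def drain(q):
    """Collect everything currently sitting in q without blocking."""
    items = []
    while not q.empty():
        try:
            items.append(q.get(False)) # non-blocking get, as in run_helper()
        except queue.Empty:
            break # another consumer emptied q between empty() and get()
    return items
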
Example #2
def sequential_exec(queries):
    search = sdproxy.SearchAPIProxy()
    metadata = sdtypes.Metadata()
    for i, q in enumerate(queries):
        sdlog.info("SYNDARUN-001", "Process query %d" % i)
        result = search.run(url=q['url'],
                            attached_parameters=q.get('attached_parameters'))
        metadata.slurp(result)
    return metadata
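
As the code suggests, each query is a plain dict carrying at least a 'url' key and optionally 'attached_parameters'. A hypothetical call (the host and facets below are made up):

queries = [
    {'url': 'http://esgf-node.example.org/esg-search/search?project=CMIP5'},
    {'url': 'http://esgf-node.example.org/esg-search/search?project=CMIP6',
     'attached_parameters': {'priority': 'high'}},
]
metadata = sequential_exec(queries)
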
Example #3
def run_pipeline(metadata,po,io_mode=sdconst.PROCESSING_FETCH_MODE_GENERATOR):
    """
    Note
        Beware: the metadata input argument is modified by this function!
        (make a copy before calling if you want to keep the original data)
    """

    # alias
    f=po.f
    args=po.args
    kwargs=po.kwargs

    sdlog.debug("SYNDPIPR-001","Start chunk loop (files-count=%d)"%metadata.count())

    if io_mode=='no_chunk':

        # way 0: load-all-in-memory (no chunk).
        files=f(metadata.get_files(),*args,**kwargs)
        metadata.set_files(files)

    elif io_mode=='generator':

        # way 1: chunk-by-chunk (using a second store)
        new_metadata=sdtypes.Metadata()
        for chunk in metadata.get_chunks(io_mode):

            sdlog.debug("SYNDPIPR-002","Process chunk")

            chunk=f(chunk,*args,**kwargs)
            new_metadata.add_files(chunk)

        metadata=new_metadata # note: the old metadata value gets dropped here (its destructor is called). This enforces that this function IS destructive with its input argument (see the docstring for more info).

    elif io_mode=='pagination':

        # way 2: chunk-by-chunk (updating store on-the-fly)
        for chunk in metadata.get_chunks(io_mode):
            chunk=f(chunk,*args,**kwargs)
            metadata.update(chunk) # TODO: check if 'size' is handled here

    elif io_mode=='experimental':

        # use 'ALTER TABLE foo RENAME TO bar' here

        pass

    else:
        assert False, "unsupported io_mode '%s'"%io_mode

    sdlog.debug("SYNDPIPR-003","Chunk loop completed (files-count=%d)"%metadata.count())

    return metadata
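
Stripped of the project-specific types, the 'generator' branch above is a chunk-by-chunk rewrite into a fresh store. A self-contained sketch with plain lists (all names are illustrative; get_chunks() is approximated by a simple slicing generator):

def chunks(files, size=100):
    for i in range(0, len(files), size):
        yield files[i:i + size]

def run_pipeline_sketch(files, f, *args, **kwargs):
    new_files = []
    for chunk in chunks(files):
        new_files.extend(f(chunk, *args, **kwargs)) # process one chunk at a time
    return new_files # as in run_pipeline(), the input is to be treated as consumed
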
Example #4
def _get_files(squeries,parallel,post_pipeline_mode,action,playback,record):
    """
    TODO: maybe move this code inside sdmts module (e.g. metadata.dump(path))
    """

    if playback is not None:
        with open(playback, 'r') as fh:
            metadata=sdtypes.Metadata(files=json.load(fh)) # warning: loads the full list into memory

    else:

        metadata=execute_queries(squeries,parallel,post_pipeline_mode,action)

        if record is not None:
            with open(record, 'w') as fh:
                json.dump(metadata.get_files(),fh,indent=4) # warning: loads the full list into memory

    return metadata
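
The playback/record pair is a simple JSON cache around an expensive call. A self-contained sketch of the same pattern (compute() is a stand-in for execute_queries()):

import json

def cached_call(compute, playback=None, record=None):
    if playback is not None:
        with open(playback) as fh:
            return json.load(fh) # warning: loads the full list into memory
    result = compute()
    if record is not None:
        with open(record, 'w') as fh:
            json.dump(result, fh, indent=4)
    return result
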
Example #5
def run(queries, parallel=True):

    if parallel:
        metadata = sdtypes.Metadata()

        (queries_with_index_host, queries_without_index_host) = split_queries(
            queries
        )  # we need this because queries with a specific index host can't be parallelized

        if len(queries_with_index_host) > 0:
            metadata.slurp(sequential_exec(queries_with_index_host))

        if len(queries_without_index_host) > 0:
            metadata.slurp(parallel_exec(queries_without_index_host))
    else:
        metadata = sequential_exec(queries)

    return metadata
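
split_queries() is not shown here. A plausible sketch, assuming the convention from Example #7 that a query without a specific index host carries the sdconst.IDXHOSTMARK placeholder in its URL:

IDXHOSTMARK = '<index_host>' # assumption: mirrors sdconst.IDXHOSTMARK

def split_queries(queries):
    """Separate queries pinned to a specific index host from generic ones."""
    with_host = [q for q in queries if IDXHOSTMARK not in q['url']]
    without_host = [q for q in queries if IDXHOSTMARK in q['url']]
    return (with_host, without_host)
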
Example #6
def run(stream=None,
        selection=None,
        path=None,
        parameter=None,
        post_pipeline_mode='file',
        parallel=sdconfig.metadata_parallel_download,
        index_host=None,
        dry_run=False,
        load_default=None,
        playback=None,
        record=None):
    """
    Note
        squeries means 'Serialized queries'
    """

    if parameter is None:
        parameter=[]

    squeries=sdpipeline.build_queries(stream=stream,path=path,parameter=parameter,selection=selection,parallel=parallel,index_host=index_host,dry_run=dry_run,load_default=load_default)

    action=sdsqueries.get_scalar(squeries,'action',None)
    progress=sdsqueries.get_scalar(squeries,'progress',False,type_=bool) # we cast here as progress can be a str (set from a parameter) or a bool (set programmatically)

    # Prevent use of the 'limit' keyword (it can't be used in this module because it interferes with the pagination system)
    for q in squeries:
        if sdtools.url_contains_limit_keyword(q['url']):
            raise SDException('SDSEARCH-001',"'limit' facet is not supported in this mode. Use 'sdquicksearch' module instead.")

    if dry_run:
        sdsqueries.print_(squeries)
        return sdtypes.Metadata()
    else:
        if progress:
            #sdtools.print_stderr(sdi18n.m0003(ap.get('searchapi_host'))) # waiting message
            ProgressThread.start(sleep=0.1,running_message='',end_message='Search completed.') # spinner start

        metadata=_get_files(squeries,parallel,post_pipeline_mode,action,playback,record)

        if progress:
            ProgressThread.stop() # spinner stop

        return metadata
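
sdtools.url_contains_limit_keyword() is not shown either; a minimal sketch, assuming the real helper simply looks for a 'limit' facet in the query string:

from urllib.parse import urlparse, parse_qs # Python 3

def url_contains_limit_keyword(url):
    return 'limit' in parse_qs(urlparse(url).query)
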
Example #7
def run(i__queries):
    """This method contains the retry mecanism."""

    # check
    for q in i__queries:
        if sdconst.IDXHOSTMARK not in q['url']:
            raise sdexception.SDException('SDPROXMT-044','Incorrect query: host must not be set at this step')

    # retry loop
    max_retry=6
    i=0
    metadata=sdtypes.Metadata()
    l__queries=i__queries
    while i < max_retry:

        (success,errors)=run_helper(l__queries)

        metadata.slurp(success) # warning: success is modified here

        if len(errors)>0:
            sdlog.info("SDPROXMT-082","%d search-API queries failed"%(len(errors),))
            sdlog.info("SDPROXMT-083","retry 'failed search-API queries'")
            l__queries=errors

            i+=1

            continue
        else:
            if i>0:
                sdlog.info("SDPROXMT-089","retry succeeded")

            break

    if len(errors)>0:
        sdlog.error("SDPROXMT-084","max retry iteration reached. %d queries did not succeed"%(len(errors),))

    return metadata
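
The retry loop above generalizes to any runner that returns (successes, failures). A self-contained sketch of the same control flow:

def run_with_retry(run_once, tasks, max_retry=6):
    """Re-run only the failed tasks, up to max_retry rounds; return (done, still_failing)."""
    done = []
    pending = list(tasks)
    for _ in range(max_retry):
        if not pending:
            break
        (successes, failures) = run_once(pending)
        done.extend(successes)
        pending = failures # the next round processes only what failed
    return (done, pending)
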
Example #8
def run_local(args, stream):
    import sdlfile

    syndautils.check_daemon()

    try:
        files = sdlfile.get_files(stream=stream, dry_run=args.dry_run)

        if len(files) == 0:
            raise sdexception.EmptySelectionException()

        if args.verbose:
            for f in files:
                buf = "file_id=%d, status=%s, local_path=%s, url=%s" % (
                    f.file_id, f.status, f.get_full_local_path(), f.url)
                print_stdout(buf)

        # transform objects to dicts (needed because remove_helper() expects a list of dicts, not a list of File objects)
        files = [f.__dict__ for f in files]

        metadata = sdtypes.Metadata(files=files)
    except sdexception.EmptySelectionException:
        print_stderr('No packages will be installed, upgraded, or removed.')
        return 0
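
The f.__dict__ conversion above works for any plain Python object. A tiny illustration (this File class is a stand-in for the project's own):

class File(object):
    def __init__(self, file_id, status, url):
        self.file_id = file_id
        self.status = status
        self.url = url

files = [File(1, 'waiting', 'http://example.org/a.nc')]
dicts = [f.__dict__ for f in files] # [{'file_id': 1, 'status': 'waiting', 'url': 'http://example.org/a.nc'}]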