Example #1
def _get_bg_conc(corp: manatee.Corpus, user_id: int, q: Tuple[str, ...], subchash: Optional[str], samplesize: int,
                 calc_from: int, minsize: int) -> Union[PyConc, InitialConc]:
    """
    arguments:
    calc_from -- the operation index (inclusive) from which the respective results must be calculated
    """
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    # let's create cache records of the operations we'll have to perform
    if calc_from < len(q):
        for i in range(calc_from, len(q)):
            status = cache_map.add_to_map(subchash, q[:i + 1], CalcStatus(), overwrite=True)
            if os.path.isfile(status.cachefile):  # the file cannot be valid, otherwise calc_from would be higher
                del_silent(status.cachefile)
                logging.getLogger(__name__).warning(f'Removed unbound conc. cache file {status.cachefile}')
        app = bgcalc.calc_backend_client(settings)
        app.send_task('conc_sync_calculate',
                      (user_id, corp.corpname, getattr(corp, 'subcname', None), subchash, q, samplesize),
                      time_limit=TASK_TIME_LIMIT)
    # for smaller concordances/corpora there is a chance the data
    # is ready in a few seconds - let's try this:
    conc_avail = wait_for_conc(cache_map=cache_map, subchash=subchash, q=q, minsize=minsize)
    if conc_avail:
        return PyConc(corp, 'l', cache_map.cache_file_path(subchash, q))
    else:
        # return empty yet unfinished concordance to make the client watch the calculation
        return InitialConc(corp, cache_map.cache_file_path(subchash, q))
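
A minimal, self-contained sketch of the pattern used above: dispatch the missing work to a background worker, wait briefly for the result, and fall back to an empty placeholder if the data is not ready in time. Every name below (send_task, wait_for_result, Placeholder, get_bg_result) is hypothetical and only stands in for the bgcalc client, wait_for_conc and InitialConc; it is not KonText's actual API.

import threading
import time
from typing import Callable, Optional


class Placeholder:
    """Stands in for InitialConc: an empty result the client keeps polling for."""


def send_task(target: Callable[[], object], result_box: dict) -> None:
    # hypothetical stand-in for bgcalc.calc_backend_client(...).send_task(...)
    threading.Thread(target=lambda: result_box.update(result=target()), daemon=True).start()


def wait_for_result(result_box: dict, timeout: float = 2.0, step: float = 0.1) -> Optional[object]:
    # hypothetical stand-in for wait_for_conc: poll until the data appears or we give up
    deadline = time.time() + timeout
    while time.time() < deadline:
        if 'result' in result_box:
            return result_box['result']
        time.sleep(step)
    return None


def get_bg_result(calc: Callable[[], object]) -> object:
    box: dict = {}
    send_task(calc, box)          # schedule the calculation in the background
    ready = wait_for_result(box)  # a small concordance may be ready within seconds
    return ready if ready is not None else Placeholder()


if __name__ == '__main__':
    print(get_bg_result(lambda: sum(range(1000))))      # fast job -> real result (499500)
    print(get_bg_result(lambda: time.sleep(5) or 'x'))  # slow job -> Placeholder instance
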
Example #2
def _get_async_conc(corp, user_id, q, subchash, samplesize, minsize):
    """
    """
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    status = cache_map.get_calc_status(subchash, q)
    if not status or status.error:
        app = bgcalc.calc_backend_client(settings)
        ans = app.send_task('conc_register', (user_id, corp.corpname, getattr(corp, 'subcname', None),
                                              subchash, q, samplesize, TASK_TIME_LIMIT),
                            time_limit=CONC_REGISTER_TASK_LIMIT)
        ans.get(timeout=CONC_REGISTER_WAIT_LIMIT)
    conc_avail = wait_for_conc(cache_map=cache_map, subchash=subchash, q=q, minsize=minsize)
    if conc_avail:
        return PyConc(corp, 'l', cache_map.cache_file_path(subchash, q))
    else:
        return InitialConc(corp, cache_map.cache_file_path(subchash, q))
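
The register-then-wait step above can be sketched with standard-library futures. The names below (_statuses, register_calculation) are hypothetical illustrations; the real code relies on the conc-cache plug-in's status records and the bgcalc task queue rather than a thread pool.

import time
from concurrent.futures import ThreadPoolExecutor, TimeoutError as FutureTimeout

# hypothetical in-memory replacement for the conc-cache status records
_statuses: dict = {}


def register_calculation(key, fn, executor: ThreadPoolExecutor, register_wait: float = 1.0):
    """Submit a background calculation only if there is no healthy record yet,
    then wait (with a timeout) for it to be picked up -- cf. ans.get(timeout=...)."""
    status = _statuses.get(key)
    if status is None or status.get('error'):
        future = executor.submit(fn)                 # roughly: app.send_task('conc_register', ...)
        _statuses[key] = {'error': None, 'future': future}
        try:
            future.result(timeout=register_wait)     # roughly: ans.get(timeout=CONC_REGISTER_WAIT_LIMIT)
        except FutureTimeout:
            pass  # still running; the caller keeps polling the cache (wait_for_conc)
    return _statuses[key]


if __name__ == '__main__':
    with ThreadPoolExecutor(max_workers=2) as pool:
        status = register_calculation(('query', 'sample'), lambda: time.sleep(0.2) or 'done', pool)
        print(status['future'].result())   # -> 'done'
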
Example #3
def find_cached_conc_base(
        corp: manatee.Corpus, subchash: Optional[str], q: Tuple[str, ...],
        minsize: int) -> Tuple[Optional[int], manatee.Concordance]:
    """
    Load a concordance from cache, starting from the complete operation chain q[:],
    then trying q[:-1], q[:-2], ..., q[:-i] etc. Any concordance found this way can be
    used to skip calculation of the already available operations q[:-i].

    arguments:
    minsize -- a minimum concordance size to return immediately (synchronously); please
               note that, unlike wait_for_conc, a value of 0 is also accepted here

    returns:
    a 2-tuple (an index within 'q' from which non-cached results must be calculated, a concordance instance)
    """
    start_time = time.time()
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    cache_map.refresh_map()
    calc_status = cache_map.get_calc_status(subchash, q)
    if calc_status:
        if calc_status.error is None:
            corp_mtime = corplib_corp_mtime(corp)
            if calc_status.created - corp_mtime < 0:
                logging.getLogger(__name__).warning(
                    'Removed outdated cache file (older than corpus indices)')
                cache_map.del_full_entry(subchash, q)
        else:
            logging.getLogger(__name__).warning(
                'Removed failed calculation cache record (error: {0})'.format(
                    calc_status.error))
            cache_map.del_full_entry(subchash, q)
            raise ConcCalculationStatusException(calc_status.error)

    if _contains_shuffle_seq(q):
        srch_from = 1
    else:
        srch_from = len(q)

    conc = InitialConc(corp=corp)
    ans = (0, conc)
    # try to find the most complete cached operation
    # (e.g. query + filter + sample)
    for i in range(srch_from, 0, -1):
        cache_path = cache_map.cache_file_path(subchash, q[:i])
        if cache_path:
            # someone has already calculated the conc (but it might not be finished yet)
            try:
                ready = wait_for_conc(cache_map=cache_map,
                                      subchash=subchash,
                                      q=q[:i],
                                      minsize=minsize)
                if not ready:
                    if minsize != 0:
                        cancel_async_task(cache_map, subchash, q[:i])
                        logging.getLogger(__name__).warning(
                            'Removed unfinished concordance cache record due to exceeded time limit'
                        )
                    continue
                _, finished = _check_result(cache_map=cache_map,
                                            subchash=subchash,
                                            q=q[:i],
                                            minsize=minsize)
                if finished:
                    mcorp = corp
                    # find the right main corpus, in case aligned corpora are involved
                    for qq in reversed(q[:i]):
                        if qq.startswith('x-'):
                            mcorp = manatee.Corpus(qq[2:])
                            break
                    conc = PyConc(mcorp, 'l', cache_path, orig_corp=corp)
            except (ConcCalculationStatusException,
                    manatee.FileAccessError) as ex:
                logging.getLogger(__name__).error(
                    f'Failed to use cached concordance for {q[:i]}: {ex}')
                cancel_async_task(cache_map, subchash, q[:i])
                continue
            ans = (i, conc)
            break
    logging.getLogger(__name__).debug(
        f'find_cached_conc_base({corp.get_conffile()}, [{", ".join(q)}]), '
        f'conc: {conc.__class__.__name__}, '
        f'must calc ops from {i} to {len(q)}, '
        f'time: {(time.time() - start_time):.4f}')
    return ans
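
The core of the cache lookup above is a longest-prefix search over the operation chain. Below is a stripped-down sketch with an in-memory dict; the names _prefix_cache and find_cached_base are hypothetical, while the real code goes through the conc-cache plug-in and wait_for_conc.

from typing import Dict, Optional, Tuple

# hypothetical prefix cache: maps a tuple of query operations to an already computed result
_prefix_cache: Dict[Tuple[str, ...], str] = {
    ('aword,[lemma="dog"]',): 'conc-for-query',
    ('aword,[lemma="dog"]', 'r250'): 'conc-for-query+sample',
}


def find_cached_base(q: Tuple[str, ...]) -> Tuple[int, Optional[str]]:
    """Return (index of the first non-cached operation, cached result or None),
    trying the longest prefix of q first -- the same idea as find_cached_conc_base."""
    for i in range(len(q), 0, -1):
        cached = _prefix_cache.get(q[:i])
        if cached is not None:
            return i, cached   # operations q[:i] are already available
    return 0, None             # nothing cached: everything must be calculated


if __name__ == '__main__':
    q = ('aword,[lemma="dog"]', 'r250', 'f')
    print(find_cached_base(q))  # -> (2, 'conc-for-query+sample'); only 'f' remains to be computed
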
Example #4
def get_conc(corp: AbstractKCorpus,
             user_id,
             q: Optional[Tuple[str, ...]] = None,
             fromp=0,
             pagesize=0,
             asnc=0,
             samplesize=0) -> Union[PyConc, InitialConc]:
    """
    Get/calculate a concordance. The function always tries to fetch as complete
    a result as possible (related to the 'q' tuple) from cache. The rest is calculated
    in different ways depending on the contents of 'q' and also on the 'asnc' argument:
    if 0, the conc is always calculated synchronously and within the same process;
    if 1, the calculation can involve either a) background calculation based on Manatee's
    asynchronous/continuous concordance fetching or b) background calculation with
    no continuous data fetching (i.e. the user waits and then the whole result is available).

    corp -- the respective KCorpus
    user_id -- database user ID
    q -- a tuple/list containing an extended query representation
         (e.g. ['aword,[] within <doc id="foo" />', 'p0 ...'])
    fromp -- a page offset
    pagesize -- a page size (in lines, related to 'fromp')
    asnc -- if 1 then KonText spawns an asynchronous process to calculate the concordance
            and will provide results as they are ready
    samplesize -- ?
    """
    if not q:
        return InitialConc(corp=corp, finished=True)
    # complete bg calc. without continuous data fetching => must accept 0
    if _should_be_bg_query(corp, q, asnc):
        minsize = 0
    elif len(q) > 1 or asnc == 0:  # conc with additional ops. needs the whole concordance
        minsize = -1
    else:
        minsize = fromp * pagesize  # happy case for a user
    subchash = getattr(corp, 'subchash', None)
    # try to locate concordance in cache
    calc_from, conc = find_cached_conc_base(corp, subchash, q, minsize)
    if not conc and q[0][0] == 'R':  # online sample
        q_copy = list(q)
        q_copy[0] = q[0][1:]
        q_copy = tuple(q_copy)
        find_cached_conc_base(corp, subchash, q_copy, -1)
        # TODO this branch has no use (unless we want to revive online sample func)

    # move mid-sized aligned corpora or large non-aligned corpora to background
    if _should_be_bg_query(corp, q, asnc):
        minsize = fromp * pagesize
        conc = _get_bg_conc(corp=corp,
                            user_id=user_id,
                            q=q,
                            subchash=subchash,
                            samplesize=samplesize,
                            calc_from=calc_from,
                            minsize=minsize)
    else:
        worker = GeneralWorker()
        if isinstance(conc, InitialConc):
            calc_from = 1
            # use Manatee asynchronous conc. calculation (= show 1st page once it's avail.)
            if asnc and len(q) == 1:
                conc = _get_async_conc(corp=corp,
                                       user_id=user_id,
                                       q=q,
                                       subchash=subchash,
                                       samplesize=samplesize,
                                       minsize=minsize)

            # do the calc here and return (OK for small to mid sized corpora without alignments)
            else:
                conc = _get_sync_conc(worker=worker,
                                      corp=corp,
                                      q=q,
                                      subchash=subchash,
                                      samplesize=samplesize)
        # save additional concordance actions to cache (e.g. sample)
        for act in range(calc_from, len(q)):
            command, args = q[act][0], q[act][1:]
            conc.exec_command(command, args)
            cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
            curr_status = cache_map.get_calc_status(subchash, q[:act + 1])
            if curr_status and not curr_status.finished:
                ready = wait_for_conc(cache_map=cache_map,
                                      subchash=subchash,
                                      q=q[:act + 1],
                                      minsize=-1)
                if not ready:
                    raise ConcCalculationStatusException(
                        'Wait for concordance operation failed')
            elif not curr_status:
                calc_status = worker.create_new_calc_status()
                calc_status.concsize = conc.size()
                calc_status = cache_map.add_to_map(subchash, q[:act + 1],
                                                   calc_status)
                conc.save(calc_status.cachefile)
                _normalize_permissions(calc_status.cachefile)
                # TODO can we be sure here that conc is finished even if it's not the first query op.?
                cache_map.update_calc_status(subchash,
                                             q[:act + 1],
                                             finished=True,
                                             readable=True,
                                             concsize=conc.size())
    return conc
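
The final loop in get_conc() (applying the operations from calc_from onwards and caching every intermediate result) can be illustrated with plain lists. _OPS and apply_remaining_ops are hypothetical; the real code calls conc.exec_command() on manatee concordance objects and stores the files via the conc-cache plug-in.

from typing import Dict, List, Tuple

# hypothetical operation table keyed by the command letter, mimicking conc.exec_command(command, args)
_OPS = {
    'r': lambda lines, args: lines[:int(args)],   # sample: keep the first N lines
    's': lambda lines, args: sorted(lines),       # sort
}


def apply_remaining_ops(lines: List[str], q: Tuple[str, ...], calc_from: int,
                        cache: Dict[Tuple[str, ...], List[str]]) -> List[str]:
    """Apply the operations q[calc_from:] one by one and cache each intermediate
    result under the corresponding prefix of q, mirroring the loop in get_conc()."""
    for act in range(calc_from, len(q)):
        command, args = q[act][0], q[act][1:]
        lines = _OPS[command](lines, args)
        cache[q[:act + 1]] = list(lines)   # analogous to conc.save(...) + cache_map.add_to_map(...)
    return lines


if __name__ == '__main__':
    cache: Dict[Tuple[str, ...], List[str]] = {}
    base = ['zeta', 'alpha', 'mu', 'beta']    # pretend this is the cached base concordance
    q = ('aword,[...]', 'r3', 's')            # base query + sample + sort
    print(apply_remaining_ops(base, q, calc_from=1, cache=cache))   # -> ['alpha', 'mu', 'zeta']
    print(len(cache))                                               # -> 2 intermediate results cached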