Ejemplo n.º 1
0
    def _get_freq_dispersion(self, conc: PyConc,
                             resolution: int) -> List[FreqDispersionBin]:
        """
        Count concordance lines per bin for `resolution` requested bins.

        arguments:
        conc -- a concordance providing `xdistribution()` and `size()`
        resolution -- number of bins requested from `conc.xdistribution()`

        returns:
        a list with one FreqDispersionBin per bin start returned by
        `xdistribution()`; each item is created from the bin's start, centre
        and end (all scaled to 0-100) and the number of lines counted for it
        """
        # only the bin beginnings are used; the second returned item is ignored
        # NOTE(review): the meaning of the constant 101 is not visible here - confirm
        bin_starts, _ = conc.xdistribution([0] * resolution, 101)
        num_bins = len(bin_starts)

        # counts are collected from the last bin to the first one because the size
        # of a bin is derived from the beginning of the nearest following non-empty bin
        counts_backwards = []
        following_start = None
        for start in reversed(bin_starts):
            if not start > 0:
                # a zero beginning means there are no concordances in the bin
                counts_backwards.append(0)
                continue
            if following_start is None:
                # the last non-empty bin spans up to the end of the concordance
                upper_bound = int(conc.size())
            else:
                upper_bound = following_start
            counts_backwards.append(upper_bound - start)
            following_start = start

        bins = []
        for idx, count in enumerate(reversed(counts_backwards)):
            bins.append(
                FreqDispersionBin(
                    100 * idx / num_bins,
                    100 * (idx + 0.5) / num_bins,
                    100 * (idx + 1) / num_bins,
                    count,
                ))
        return bins
Ejemplo n.º 2
0
def require_existing_conc(corp: AbstractKCorpus,
                          q: Union[Tuple[str, ...], List[str]]) -> PyConc:
    """
    Return an already calculated (cached) concordance for a corpus and a query.

    arguments:
    corp -- a corpus the concordance belongs to; its optional `subchash`
            attribute is used to look up the cache record
    q -- a sequence of query operations

    returns:
    a PyConc instance loaded from the cache file

    raises:
    ConcNotFoundException -- if there is no cache record for the query or if
                             the cache file cannot be accessed by manatee
    BrokenConcordanceException -- if the record exists but it is not both
                                  finished and readable
    """
    corpus_manager = CorpusManager(subcpath=[])
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    subchash = getattr(corp, 'subchash', None)
    status = cache_map.get_calc_status(subchash, q)

    if status is None:
        raise ConcNotFoundException(
            'Concordance not found: {}'.format(', '.join(q)))
    if not (status.finished and status.readable):
        raise BrokenConcordanceException(
            'Concordance broken. File: {}, error: {}'.format(
                status.cachefile, status.error))

    # the last 'x-' prefixed operation (if any) determines the main corpus
    # of an aligned concordance
    aligned_op = next((op for op in reversed(q) if op.startswith('x-')), None)
    if aligned_op is None:
        main_corp = corp
    else:
        main_corp = corpus_manager.get_corpus(aligned_op[2:])
    try:
        return PyConc(main_corp, 'l', status.cachefile, orig_corp=corp)
    except manatee.FileAccessError as ex:
        raise ConcNotFoundException(ex)
Ejemplo n.º 3
0
def _get_bg_conc(corp: manatee.Corpus, user_id: int, q: Tuple[str, ...],
                 subchash: Optional[str], samplesize: int, calc_from: int,
                 minsize: int) -> Union[PyConc, EmptyConc]:
    """
    Send the concordance calculation to the calculation backend and wait
    a while for the result.

    arguments:
    corp -- a corpus the concordance is calculated on
    user_id -- an ID passed to the backend task
    q -- a sequence of query operations
    subchash -- a subcorpus hash (or None) used as a part of the cache key
    samplesize -- a value passed to the backend task
    calc_from - from which operation idx (inclusive) we have to calculate respective results
    minsize -- a value passed to wait_for_conc

    returns:
    PyConc if wait_for_conc reports the data as available, otherwise
    an EmptyConc bound to the same cache file path
    """
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    num_ops = len(q)
    if calc_from < num_ops:
        # register a fresh cache record for each operation prefix we are going to calculate
        for prefix_len in range(calc_from + 1, num_ops + 1):
            cachefile, _ = cache_map.add_to_map(subchash,
                                                q[:prefix_len],
                                                0,
                                                calc_status=CalcStatus())
            if not os.path.isfile(cachefile):
                continue
            del_silent(cachefile)
            logging.getLogger(__name__).warning(
                f'Removed unbound conc. cache file {cachefile}')
        backend = bgcalc.calc_backend_client(settings)
        task_args = (user_id, corp.corpname, getattr(corp, 'subcname', None),
                     subchash, q, samplesize)
        backend.send_task('conc_sync_calculate',
                          task_args,
                          time_limit=TASK_TIME_LIMIT)
    # small concordances/corpora may be ready within a few seconds
    # so it is worth waiting here before giving up
    is_ready = wait_for_conc(cache_map=cache_map,
                             subchash=subchash,
                             q=q,
                             minsize=minsize)
    cache_path = cache_map.cache_file_path(subchash, q)
    if not is_ready:
        # an empty, not yet finished concordance makes the client watch the calculation
        return EmptyConc(corp, cache_path)
    return PyConc(corp, 'l', cache_path)
Ejemplo n.º 4
0
 def compute_conc(self, corp: manatee.Corpus, q: Tuple[str, ...], samplesize: int) -> PyConc:
     """
     Create a concordance from the first operation of a query.

     Only `q[0]` is used: its first character is passed to PyConc as the
     action and the rest of the string as the action's parameters.

     arguments:
     corp -- a corpus to search in
     q -- a sequence of query operations
     samplesize -- a value passed to PyConc

     returns:
     a new PyConc instance

     raises:
     NotImplementedError -- if the first operation starts with 'R'
     """
     started_at = time.time()
     q = tuple(q)
     first_op = q[0]
     action = first_op[0]
     if action == 'R':
         raise NotImplementedError('Function "online sample" is not supported')
     result = PyConc(corp, action, first_op[1:], samplesize)
     # log the query along with the time it took to create the concordance
     logging.getLogger(__name__).debug(f'compute_conc({corp.corpname}, [{", ".join(q)}]) '
                                       f'-> {(time.time() - started_at):.4f}')
     return result
Ejemplo n.º 5
0
def _get_async_conc(corp, user_id, q, subchash, samplesize, minsize):
    """
    Register a concordance calculation via the 'worker.conc_register'
    backend task and wait a while for the data.

    arguments:
    corp -- a corpus the concordance is calculated on
    user_id -- an ID passed to the backend task
    q -- a sequence of query operations
    subchash -- a subcorpus hash (or None) used as a part of the cache key
    samplesize -- a value passed to the backend task
    minsize -- a value passed to wait_for_conc

    returns:
    PyConc if wait_for_conc reports the data as available, otherwise
    an EmptyConc bound to the same cache file path
    """
    backend = bgcalc.calc_backend_client(settings)
    register_args = (user_id, corp.corpname, getattr(corp, 'subcname', None),
                     subchash, q, samplesize, TASK_TIME_LIMIT)
    pending = backend.send_task('worker.conc_register',
                                register_args,
                                time_limit=CONC_REGISTER_TASK_LIMIT)
    # the registration result is requested before we start looking into the cache
    pending.get(timeout=CONC_REGISTER_WAIT_LIMIT)

    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    is_ready = wait_for_conc(cache_map=cache_map, subchash=subchash, q=q, minsize=minsize)
    cache_path = cache_map.cache_file_path(subchash, q)
    if not is_ready:
        return EmptyConc(corp, cache_path)
    return PyConc(corp, 'l', cache_path)
Ejemplo n.º 6
0
def get_existing_conc(corp: manatee.Corpus,
                      q: Tuple[str, ...]) -> manatee.Concordance:
    """
    Return an already calculated (cached) concordance for a corpus and a query.

    arguments:
    corp -- a corpus the concordance belongs to; its optional `subchash`
            attribute is used to look up the cache record
    q -- a sequence of query operations

    returns:
    a PyConc instance loaded from the cache file

    raises:
    ConcNotFoundException -- if there is no cache record for the query
    BrokenConcordanceException -- if the record exists but it is not both
                                  finished and readable
    """
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    subchash = getattr(corp, 'subchash', None)
    status = cache_map.get_calc_status(subchash, q)

    if status is None:
        raise ConcNotFoundException('Concordance not found.')
    if not (status.finished and status.readable):
        raise BrokenConcordanceException(
            'Concordance broken. File: {}, error: {}'.format(
                status.cachefile, status.error))

    # the last 'x-' prefixed operation (if any) determines the main corpus
    # of an aligned concordance
    aligned_op = next((op for op in reversed(q) if op.startswith('x-')), None)
    if aligned_op is None:
        main_corp = corp
    else:
        main_corp = manatee.Corpus(aligned_op[2:])
    return PyConc(main_corp, 'l', status.cachefile, orig_corp=corp)
Ejemplo n.º 7
0
def _get_bg_conc(corp: AbstractKCorpus, user_id: int, q: Tuple[str, ...],
                 subchash: Optional[str], samplesize: int, calc_from: int,
                 minsize: int) -> Union[PyConc, InitialConc]:
    """
    Send the concordance calculation to the calculation backend (unless
    a calculation of the same query is already in progress) and wait
    a while for the result.

    arguments:
    corp -- a corpus the concordance is calculated on
    user_id -- an ID passed to the backend task
    q -- a sequence of query operations
    subchash -- a subcorpus hash (or None) used as a part of the cache key
    samplesize -- a value passed to the backend task
    calc_from - from which operation idx (inclusive) we have to calculate respective results
    minsize -- a value passed to wait_for_conc

    returns:
    PyConc if wait_for_conc reports the data as available, otherwise
    an InitialConc the client can use to watch the calculation
    """
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)

    current = cache_map.get_calc_status(subchash, q)
    if current and not current.finished:
        # the calc is already running, the client has to wait and check regularly
        return InitialConc(corp, current.cachefile)

    num_ops = len(q)
    if calc_from < num_ops:
        # register a fresh cache record for each operation prefix we are going to calculate
        for prefix_len in range(calc_from + 1, num_ops + 1):
            record = cache_map.add_to_map(subchash,
                                          q[:prefix_len],
                                          ConcCacheStatus(),
                                          overwrite=True)
            # the file cannot be valid as otherwise, calc_from would be higher
            if not os.path.isfile(record.cachefile):
                continue
            del_silent(record.cachefile)
            logging.getLogger(__name__).warning(
                f'Removed unbound conc. cache file {record.cachefile}')
        worker = bgcalc.calc_backend_client(settings)
        task_args = (user_id, corp.corpname, getattr(corp, 'subcname', None),
                     subchash, q, samplesize)
        # NOTE(review): `object.__class__` (i.e. `type`) is what the original call
        # passes as the second argument - confirm what the backend client expects
        worker.send_task('conc_sync_calculate',
                         object.__class__,
                         task_args,
                         time_limit=TASK_TIME_LIMIT)
    # small concordances/corpora may be ready within a few seconds
    # so it is worth waiting here before giving up
    is_ready = wait_for_conc(cache_map=cache_map,
                             subchash=subchash,
                             q=q,
                             minsize=minsize)
    cache_path = cache_map.readable_cache_path(subchash, q)
    if not is_ready:
        # an empty, not yet finished concordance makes the client watch the calculation
        return InitialConc(corp, cache_path)
    return PyConc(corp, 'l', cache_path)
Ejemplo n.º 8
0
 def _freq_dist(self, corp: KCorpus, conc: PyConc, fcrit: str, user_id: int):
     """
     Calculate a frequency distribution of a concordance for a single criterion.

     arguments:
     corp -- a corpus providing `corpname` and `subcname` for the calc. arguments
     conc -- a concordance the distribution is calculated from
     fcrit -- a frequency criterion
     user_id -- an ID stored in the calc. arguments

     returns:
     the 'Items' value of the result obtained for the first criterion
     (an empty list if the key is missing)
     """
     calc_args = freq_calc.FreqCalcArgs(
         corpname=corp.corpname,
         subcname=corp.subcname,
         subcpath=[],
         user_id=user_id,
         pagesize=100,
         samplesize=0,
         flimit=1,
         fcrit=[fcrit],
         ftt_include_empty=0,
         rel_mode=1,
         freq_sort='freq',
         collator_locale='en_US',  # TODO use data provided by corparch plg
         fmaxitems=1,
         fpage=1,
         force_cache=False)
     # one result per criterion stored in the arguments object
     results = []
     for criterion in calc_args.fcrit:
         results.append(
             conc.xfreq_dist(criterion, calc_args.flimit, calc_args.freq_sort,
                             calc_args.ftt_include_empty, calc_args.rel_mode,
                             calc_args.collator_locale))
     first_result = results[0]
     return first_result.get('Items', [])
Ejemplo n.º 9
0
def _get_async_conc(corp, user_id, q, subchash, samplesize, minsize):
    """
    Register a concordance calculation via the 'conc_register' backend task
    (unless a record without an error already exists in the cache map)
    and wait a while for the data.

    arguments:
    corp -- a corpus the concordance is calculated on
    user_id -- an ID passed to the backend task
    q -- a sequence of query operations
    subchash -- a subcorpus hash (or None) used as a part of the cache key
    samplesize -- a value passed to the backend task
    minsize -- a value passed to wait_for_conc

    returns:
    PyConc if wait_for_conc reports the data as available, otherwise
    an InitialConc bound to the same cache file path
    """
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    status = cache_map.get_calc_status(subchash, q)
    # a missing record or a record with an error means we have to (re)register the calculation
    must_register = not status or status.error
    if must_register:
        worker = bgcalc.calc_backend_client(settings)
        register_args = (user_id, corp.corpname, getattr(corp, 'subcname', None),
                         subchash, q, samplesize, TASK_TIME_LIMIT)
        # NOTE(review): `object.__class__` (i.e. `type`) is what the original call
        # passes as the second argument - confirm what the backend client expects
        pending = worker.send_task('conc_register',
                                   object.__class__,
                                   register_args,
                                   time_limit=CONC_REGISTER_TASK_LIMIT)
        pending.get(timeout=CONC_REGISTER_WAIT_LIMIT)
    is_ready = wait_for_conc(cache_map=cache_map,
                             subchash=subchash,
                             q=q,
                             minsize=minsize)
    cache_path = cache_map.readable_cache_path(subchash, q)
    if not is_ready:
        return InitialConc(corp, cache_path)
    return PyConc(corp, 'l', cache_path)
Ejemplo n.º 10
0
def find_cached_conc_base(
        corp: manatee.Corpus, subchash: Optional[str], q: Tuple[str, ...],
        minsize: int) -> Tuple[Optional[int], manatee.Concordance]:
    """
    Load a concordance from cache starting from a complete operation q[:],
    then trying q[:-1], q[:-2], q[:-i] etc. A possible found concordance can be
    used to skip calculation of already available operations q[:-i].

    Before the search starts, the cache record of the complete query is removed
    if it was created before the corpus modification time or if it contains
    an error.

    arguments:
    corp -- a corpus the concordance belongs to
    subchash -- a subcorpus hash (or None) used as a part of the cache key
    q -- a sequence of query operations
    minsize -- a minimum concordance size to return immediately (synchronously); please
                note that unlike wait_for_conc here we accept also 0

    returns:
    a 2-tuple [an index within 'q' where to start with non-cached results], [a concordance instance];
    if no usable cache record is found, the index is 0 and the concordance is an EmptyConc
    """
    start_time = time.time()
    log = logging.getLogger(__name__)
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    cache_map.refresh_map()
    calc_status = cache_map.get_calc_status(subchash, q)
    if calc_status:
        if calc_status.error is None:
            corp_mtime = corplib_corp_mtime(corp)
            if calc_status.created - corp_mtime < 0:
                log.warning(
                    'Removed outdated cache file (older than corpus indices)')
                cache_map.del_full_entry(subchash, q)
        else:
            log.warning(
                'Removed failed calculation cache record (error: {0})'.format(
                    calc_status.error))
            cache_map.del_full_entry(subchash, q)

    # for queries containing a shuffle sequence only the first operation
    # is looked up in the cache
    if _contains_shuffle_seq(q):
        srch_from = 1
    else:
        srch_from = len(q)

    conc = EmptyConc(corp=corp)
    ans = (0, conc)
    # try to find the most complete cached operation
    # (e.g. query + filter + sample)
    for i in range(srch_from, 0, -1):
        cache_path = cache_map.cache_file_path(subchash, q[:i])
        # now we know that someone already calculated the conc (but it might not be finished yet)
        if cache_path:
            try:
                ready = wait_for_conc(cache_map=cache_map,
                                      subchash=subchash,
                                      q=q[:i],
                                      minsize=minsize)
                if not ready:
                    if minsize != 0:
                        cancel_async_task(cache_map, subchash, q[:i])
                        log.warning(
                            'Removed unfinished concordance cache record due to exceeded time limit'
                        )
                    continue
                _, finished = _check_result(cache_map=cache_map,
                                            subchash=subchash,
                                            q=q[:i],
                                            minsize=minsize)
                if finished:
                    mcorp = corp
                    for qq in reversed(
                            q[:i]):  # find the right main corp, if aligned
                        if qq.startswith('x-'):
                            mcorp = manatee.Corpus(qq[2:])
                            break
                    conc = PyConc(mcorp, 'l', cache_path, orig_corp=corp)
            except (ConcCalculationStatusException,
                    manatee.FileAccessError) as ex:
                log.error(
                    f'Failed to use cached concordance for {q[:i]}: {ex}')
                cancel_async_task(cache_map, subchash, q[:i])
                continue
            # note: if the record is ready but not finished, `conc` is still
            # the initial EmptyConc here
            ans = (i, conc)
            break
    # the index is taken from the answer, not from the loop variable: the loop
    # variable is unbound for an empty `q` and when nothing is found it holds
    # the last tried index instead of the returned one
    missing_from = ans[0]
    log.debug(
        f'get_cached_conc({corp.get_conffile()}, [{", ".join(q)}]), '
        f'conc: {conc.__class__.__name__}, '
        f'missing ops start idx: {missing_from if missing_from < len(q) else "none"}, '
        f'time: {(time.time() - start_time):.4f}')
    return ans