Example #1
import os

def corp_mtime(corpus: Corpus) -> float:
    """Return the latest modification time of the corpus registry file and its data directory."""
    reg_mtime = os.path.getmtime(corpus.get_confpath())
    data_path = corpus.get_conf('PATH')
    # normalize a trailing slash away so getmtime() is applied to the directory itself
    data_dir = os.path.dirname(data_path) if data_path.endswith('/') else data_path
    data_mtime = os.path.getmtime(data_dir)
    return max(reg_mtime, data_mtime)
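
A minimal usage sketch building directly on corp_mtime above (cache_is_stale and the cache path are hypothetical, not part of the original module):

def cache_is_stale(corpus: Corpus, cache_path: str) -> bool:
    # a derived cache file is stale if it is missing or older than the corpus itself
    return (not os.path.isfile(cache_path)
            or os.path.getmtime(cache_path) < corp_mtime(corpus))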
Example #2
def get_wordlist_length(corp: Corpus, wlattr: str, wlpat: str, wlnums: str,
                        wlminfreq: int, words: Set[str], blacklist: Set[str],
                        include_nonwords: bool) -> int:
    """Count wordlist items matching the pattern, minimum frequency and (black)list filters."""
    enc_pattern = wlpat.strip()
    attr = corp.get_attr(wlattr)
    attrfreq = _get_attrfreq(corp=corp, attr=attr, wlattr=wlattr, wlnums=wlnums)
    # NONWORDRE excludes tokens such as punctuation unless non-words are explicitly requested
    nwre = '' if include_nonwords else corp.get_conf('NONWORDRE')
    try:
        gen = attr.regexp2ids(enc_pattern, 0, nwre)
    except TypeError:  # older manatee builds lack the exclusion-pattern argument
        gen = attr.regexp2ids(enc_pattern, 0)
    i = 0
    while not gen.end():
        wid = gen.next()
        frq = attrfreq[wid]
        if not frq:
            continue
        id_value = attr.id2str(wid)
        if frq >= wlminfreq and (not words or id_value in words) and (
                not blacklist or id_value not in blacklist):
            i += 1
    return i
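
A hedged usage sketch (corp is assumed to be an open manatee Corpus with a 'word' attribute; the pattern and limits are illustrative only):

n = get_wordlist_length(corp, wlattr='word', wlpat='colo.*r', wlnums='frq',
                        wlminfreq=5, words=set(), blacklist=set(),
                        include_nonwords=False)
print(f'{n} matching wordlist items')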
Example #3
    def get_cached_conc_sizes(self, corp: manatee.Corpus, q: Optional[Tuple[str, ...]] = None,
                              cachefile: Optional[str] = None) -> Dict[str, Any]:
        """
        arguments:
        corp -- a manatee.Corpus instance
        q -- a tuple containing a preprocessed query
        cachefile -- if not provided, the path is determined automatically
        using CACHE_ROOT_DIR, the corpus name and the query

        returns:
        a dictionary {
            finished : bool,
            concsize : int,
            fullsize : int,
            relconcsize : float (concordance size per one million tokens),
            arf : ARF of the result (calculated only for a finished result, i.e. no intermediate values)
        }
        """
        import struct

        if q is None:
            q = ()
        ans = dict(finished=False, concsize=0, fullsize=0, relconcsize=0)
        if not cachefile:  # AJAX call
            subchash = getattr(corp, 'subchash', None)
            cache_map = self._cache_factory.get_mapping(corp)
            cachefile = cache_map.cache_file_path(subchash, q)
            status = cache_map.get_calc_status(subchash, q)
            if not status:
                raise ConcCalculationStatusException('Concordance calculation not found', None)
            # check the status only after we know it exists, otherwise this would crash
            status.test_error(TASK_TIME_LIMIT)
            if status.error is not None:
                raise ConcCalculationStatusException('Concordance calculation failed', status.error)

        if cachefile and os.path.isfile(cachefile):
            # read fixed byte offsets of manatee's concordance cache header;
            # the with-block ensures the file handle is closed
            with open(cachefile, 'rb') as cache:
                cache.seek(15)
                finished = bool(ord(cache.read(1)))
                (fullsize,) = struct.unpack('q', cache.read(8))
                cache.seek(32)
                (concsize,) = struct.unpack('i', cache.read(4))

            if fullsize > 0:
                relconcsize = 1000000.0 * fullsize / corp.search_size()
            else:
                relconcsize = 1000000.0 * concsize / corp.search_size()

            if finished and not is_subcorpus(corp):
                conc = manatee.Concordance(corp, cachefile)
                result_arf = round(conc.compute_ARF(), 2)
            else:
                result_arf = None

            ans['finished'] = finished
            ans['concsize'] = concsize
            ans['fullsize'] = fullsize
            ans['relconcsize'] = relconcsize
            ans['arf'] = result_arf
        return ans
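
A usage sketch for polling the cached sizes (calc stands for an instance of the surrounding class; the query tuple format follows KonText's internal convention and is only indicated by a placeholder here):

sizes = calc.get_cached_conc_sizes(corp, q=('<preprocessed query>',))
if sizes['finished']:
    # ARF is only available once the calculation has finished
    print(sizes['concsize'], sizes['relconcsize'], sizes['arf'])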
Example #4
def wordlist(corp: Corpus,
             words: Optional[Set[str]] = None,
             wlattr: str = '',
             wlpat: str = '',
             wlminfreq: int = 5,
             wlmaxitems: int = 100,
             wlsort: str = '',
             blacklist: Optional[Set[str]] = None,
             wlnums: Optional[str] = 'frq',
             include_nonwords: int = 0) -> List[Dict[str, Any]]:
    """
    Note: 'words' and 'blacklist' are expected to contain utf-8-encoded strings.
    """
    blacklist = set(blacklist) if blacklist else set()
    words = set(words) if words else set()
    attr = corp.get_attr(wlattr)
    attrfreq = _get_attrfreq(corp=corp,
                             attr=attr,
                             wlattr=wlattr,
                             wlnums=wlnums)
    if words and wlpat == '.*':  # word list just for given words
        items = _wordlist_from_list(attr=attr,
                                    attrfreq=attrfreq,
                                    words=words,
                                    blacklist=blacklist,
                                    wlsort=wlsort,
                                    wlminfreq=wlminfreq,
                                    wlmaxitems=wlmaxitems,
                                    wlnums=wlnums)
    else:  # word list according to pattern
        if not include_nonwords:
            nwre = corp.get_conf('NONWORDRE')
        else:
            nwre = ''
        items = _wordlist_by_pattern(attr=attr,
                                     enc_pattern=wlpat.strip(),
                                     excl_pattern=nwre,
                                     wlminfreq=wlminfreq,
                                     words=words,
                                     blacklist=blacklist,
                                     wlnums=wlnums,
                                     wlsort=wlsort,
                                     wlmaxitems=wlmaxitems,
                                     attrfreq=attrfreq)

    if not words or wlpat != '.*':
        # items are (freq, word_id) pairs here - translate the ids to their string values
        items = [(f, attr.id2str(i)) for (f, i) in items]
    if wlsort == 'f':
        items = sorted(items, key=lambda x: x[0], reverse=True)
    else:
        items = sorted(items, key=lambda x: x[1])
    del items[wlmaxitems:]
    return add_block_items([{'str': w, 'freq': f} for f, w in items])
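
A usage sketch (the attribute name and limits are illustrative; corp is assumed to be an open Corpus):

items = wordlist(corp, wlattr='word', wlpat='wor.*', wlminfreq=3,
                 wlmaxitems=10, wlsort='f')
for item in items:
    print(item['str'], item['freq'])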
Example #5
def doc_sizes(corp: Corpus, struct: Structure, attrname: str, i: int,
              normvals: Dict[int, int]) -> int:
    """Sum normalization values over all documents where the attribute has its i-th value."""
    # attrname comes as 'struct.attr'; attr_val() expects just the attribute part
    r = corp.filter_query(struct.attr_val(attrname.split('.')[1], i))
    cnt = 0
    while not r.end():
        cnt += normvals[r.peek_beg()]  # map each matching position to its normalization value
        r.next()
    return cnt
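
A sketch of how this might be called (get_struct is a standard manatee call; normvals is assumed to be precomputed elsewhere, e.g. from a normalization attribute):

struct = corp.get_struct('doc')
size = doc_sizes(corp, struct, 'doc.id', 0, normvals)  # total for the 0th 'doc.id' value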
Example #6
import array

def frq_db(corp: Corpus,
           attrname: str,
           nums: str = 'frq',
           id_range: int = 0) -> array.array:
    filename = (subcorp_base_file(corp, attrname) + '.' + nums)
    if not id_range:
        id_range = corp.get_attr(attrname).id_range()
    if nums == 'arf':
        frq = array.array('f')
        try:
            with open(filename, 'rb') as f:
                frq.fromfile(f, id_range)  # type: ignore
        except IOError as ex:
            raise MissingSubCorpFreqFile(corp, ex)
        except EOFError as ex:
            # truncated data file - remove the derived file and report the problem
            os.remove(filename.rsplit('.', 1)[0] + '.docf')
            raise MissingSubCorpFreqFile(corp, ex)
    else:
        try:
            # virtual corpora without their own data path have no precomputed .frq file
            if corp.get_conf('VIRTUAL') and not hasattr(corp, 'spath') and nums == 'frq':
                raise IOError
            frq = array.array('i')
            with open(filename, 'rb') as f:
                frq.fromfile(f, id_range)  # type: ignore
        except EOFError as ex:
            # truncated data - drop all derived frequency files so they can be rebuilt
            os.remove(filename.rsplit('.', 1)[0] + '.docf')
            os.remove(filename.rsplit('.', 1)[0] + '.arf')
            os.remove(filename.rsplit('.', 1)[0] + '.frq')
            raise MissingSubCorpFreqFile(corp, ex)
        except IOError:
            try:
                # fall back to the 64-bit variant of the frequency file
                frq = array.array('l')
                with open(filename + '64', 'rb') as f:
                    frq.fromfile(f, id_range)  # type: ignore
            except IOError as ex:
                if not hasattr(corp, 'spath') and nums == 'frq':
                    # no precomputed file at all - compute the frequencies directly
                    a = corp.get_attr(attrname)
                    frq.fromlist([a.freq(i) for i in range(a.id_range())])
                else:
                    raise MissingSubCorpFreqFile(corp, ex)
    return frq
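
A usage sketch (assumes the frequency files for the given attribute have already been generated for this corpus or subcorpus):

try:
    freqs = frq_db(corp, 'word', nums='frq')
    print('frequency of word id 0:', freqs[0])
except MissingSubCorpFreqFile:
    pass  # a (re)build of the frequency data would be triggered here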
Example #7
    def _normalize_multivalues(self, corp: manatee.Corpus, attr1: str,
                               attr2: str) -> Tuple[str, str]:
        """
        If both attributes use a multivalue separator and their variant counts
        match, keep only the first variant of each; otherwise return the
        original values unchanged.
        """
        multisep1 = corp.get_conf(self._conf["attr1"] + '.MULTISEP')
        multisep2 = corp.get_conf(self._conf["attr2"] + '.MULTISEP')
        if multisep1 and multisep2:
            attr1_split = attr1.split(multisep1)
            attr2_split = attr2.split(multisep2)
            if len(attr1_split) == len(attr2_split):
                return attr1_split[0], attr2_split[0]

            logging.warning(
                f'PosAttrPairRelManateeBackend multivalue normalization mismatch - {attr1}...{attr2}')

        return attr1, attr2
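
An illustrative call (backend stands for a configured instance of the surrounding class; the values assume MULTISEP '|' on both attributes):

a1, a2 = backend._normalize_multivalues(corp, 'run|running', 'VBP|VBG')
# -> ('run', 'VBP'); on a variant-count mismatch the original values come back unchanged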
Example #8
    def corpconf_pairs(self, corp: Union[Corpus, str],
                       label: str) -> List[Tuple[str, str]]:
        """
        Decodes a specific corpus registry file configuration value
        where a list of pairs is flattened into a single string
        (k1, v1, k2, v2, ..., kN, vN). This applies e.g. to WPOSLIST
        and LPOSLIST.
        Returns:
            a list of pairs
        """
        if type(corp) is str:
            corp = self.get_Corpus(corp)
        val = corp.get_conf(label)
        if len(val) > 2:
            # the first character of the value declares the separator for the rest
            val = val[1:].split(val[0])
        else:
            val = []
        return [(val[i], val[i + 1]) for i in range(0, len(val), 2)]
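
For illustration, a registry value such as ',noun,N.*,verb,V.*' (the leading comma declares the separator) decodes as follows (mgr stands for the surrounding manager instance):

pairs = mgr.corpconf_pairs(corp, 'WPOSLIST')
# -> [('noun', 'N.*'), ('verb', 'V.*')]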
Example #9
def texttype_values(
        corp: Corpus,
        subcorpattrs: str,
        maxlistsize: int,
        shrink_list: Union[Tuple[str, ...], List[str]] = (),
        collator_locale: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    arguments:
    corp -- manatee.Corpus
    subcorpattrs -- structures and attributes to be processed (see Manatee's SUBCORPATTRS)
    maxlistsize -- in case there is more that this number of items, empty list will be returned
    shrink_list -- list/tuple of attributes we want to return empty lists for (False can be used
                   to specify an empty value)
    collator_locale -- a collator used to sort attribute values (en_US is the default)

    returns:
    a list containing following dictionaries
    { 'Line' : [
        { 'attr_doc_label' : '', 'Values' : [ {'v', 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        { 'attr_doc_label' : '', 'Values' : [ {'v', 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        ...
    ]}

    !!!!!!
    NOTE: avoid calling this method repeatedly for the same corpus as the
    attr = corp.get_attr(n) line is leaking opened files of corpora indexes which
    leads to exhausted limit for opened files for Gunicorn/Celery after some time.
    KonText caches the value returned by this function to prevent this.

    !!! TODO !!!

    """
    if subcorpattrs == '#':
        return []
    attrlines = []

    if not shrink_list:
        shrink_list = ()

    for subcorpline in subcorpattrs.split(','):
        attrvals = []
        for n in subcorpline.split('|'):
            if n in ('', '#'):
                continue
            attr = corp.get_attr(n)
            attrval = {
                'name': n,
                'label': corp.get_conf(n + '.LABEL') or n,
                'attr_doc': corp.get_conf(n + '.ATTRDOC'),
                'attr_doc_label': corp.get_conf(n + '.ATTRDOCLABEL'),
                'numeric': conf_bool(corp.get_conf(n + '.NUMERIC'))
            }
            hsep = corp.get_conf(n + '.HIERARCHICAL')
            multisep = corp.get_conf(n + '.MULTISEP')
            is_multival = corp.get_conf(n + '.MULTIVAL') in ('y', 'yes')
            if (not hsep
                    and (corp.get_conf(n + '.TEXTBOXLENGTH')
                         or attr.id_range() > maxlistsize or n in shrink_list)):
                # too many values, or explicitly configured: offer a text input instead of a list
                attrval['textboxlength'] = corp.get_conf(n + '.TEXTBOXLENGTH') or 24
            else:  # list of values
                if conf_bool(corp.get_conf(n + '.NUMERIC')):
                    vals = []
                    for i in range(attr.id_range()):
                        try:
                            vals.append({'v': int(attr.id2str(i))})
                        except ValueError:  # a non-numeric value in a NUMERIC attribute
                            vals.append({'v': attr.id2str(i)})
                elif hsep:  # hierarchical
                    vals = [{'v': attr.id2str(i)}
                            for i in range(attr.id_range())
                            if multisep not in attr.id2str(i)]
                else:
                    if is_multival:
                        # split multivalues and deduplicate the individual variants
                        raw_vals = [attr.id2str(i).split(multisep)
                                    for i in range(attr.id_range())]
                        vals = [{'v': x}
                                for x in sorted(set(s for subl in raw_vals for s in subl))]
                    else:
                        vals = [{'v': attr.id2str(i)}
                                for i in range(attr.id_range())]

                if hsep:  # hierarchical
                    attrval['hierarchical'] = hsep
                    attrval['Values'] = _get_attr_hierarchy(vals, hsep)
                elif conf_bool(corp.get_conf(n + '.NUMERIC')):
                    attrval['Values'] = sorted(vals,
                                               key=lambda item: item['v'])
                elif collator_locale:
                    attrval['Values'] = l10n.sort(vals,
                                                  collator_locale,
                                                  key=lambda item: item['v'])
                else:
                    attrval['Values'] = sorted(
                        vals,
                        key=cmp_to_key(lambda x1, x2: cmp(
                            x1['v'].lower(), x2['v'].lower())))
            attrvals.append(attrval)
        attrlines.append({'Line': attrvals})
    return attrlines
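
A usage sketch (the structure/attribute names are illustrative; the result has the shape described in the docstring above):

lines = texttype_values(corp, subcorpattrs='doc.genre|doc.year', maxlistsize=100)
for line in lines:
    for attrval in line['Line']:
        print(attrval['name'], attrval.get('Values', attrval.get('textboxlength')))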
Example #10
def _should_be_bg_query(corp: manatee.Corpus, query: Tuple[str, ...],
                        asnc: int) -> bool:
    # Calculate in the background only if the query has additional operations and
    # async mode is on; a lower corpus-size threshold applies when the first
    # operation is an alignment ('X') with another corpus.
    return (len(query) > 1 and asnc == 1
            and ((query[1][0] == 'X'
                  and corp.size() > CONC_BG_SYNC_ALIGNED_CORP_THRESHOLD)
                 or corp.size() > CONC_BG_SYNC_SINGLE_CORP_THRESHOLD))
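
A sketch of how the decision might be consumed (both helpers are hypothetical placeholders, not part of the original module):

if _should_be_bg_query(corp, q, asnc=1):
    task = schedule_background_calc(corp, q)  # hypothetical: enqueue e.g. a Celery task
else:
    conc = calculate_conc_sync(corp, q)  # hypothetical synchronous computation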