Esempio n. 1
0
def get_wordlist_length(corp: Corpus, wlattr: str, wlpat: str, wlnums: str,
                        wlminfreq: int, words: str, blacklist: str,
                        include_nonwords: bool) -> int:
    """
    Count word-list items of attribute 'wlattr' whose values match the
    regex pattern 'wlpat' and pass the frequency/whitelist/blacklist filters.

    arguments:
    corp -- a corpus to search in
    wlattr -- name of the positional/structural attribute
    wlpat -- regular expression the attribute values must match
    wlnums -- frequency measure passed to _get_attrfreq (e.g. 'frq')
    wlminfreq -- minimum frequency an item must reach to be counted
    words -- if non-empty, only values contained here are counted
    blacklist -- if non-empty, values contained here are skipped
    include_nonwords -- if False, corpus NONWORDRE is used as exclusion pattern

    returns:
    number of matching word-list items
    """
    pattern = wlpat.strip()
    attr = corp.get_attr(wlattr)
    attrfreq = _get_attrfreq(corp=corp, attr=attr, wlattr=wlattr, wlnums=wlnums)
    excl_pattern = '' if include_nonwords else corp.get_conf('NONWORDRE')
    try:
        id_gen = attr.regexp2ids(pattern, 0, excl_pattern)
    except TypeError:
        # older manatee API variant without the exclusion-pattern argument
        id_gen = attr.regexp2ids(pattern, 0)
    count = 0
    while not id_gen.end():
        wid = id_gen.next()
        freq = attrfreq[wid]
        if not freq:
            # zero-frequency item: skip before paying for id2str
            continue
        value = attr.id2str(wid)
        if (freq >= wlminfreq
                and (not words or value in words)
                and (not blacklist or value not in blacklist)):
            count += 1
    return count
Esempio n. 2
0
def frq_db(corp: Corpus,
           attrname: str,
           nums: str = 'frq',
           id_range: int = 0) -> array:
    """
    Load a precomputed per-id frequency array for attribute 'attrname'
    from the (sub)corpus frequency data files.

    arguments:
    corp -- a corpus the frequencies belong to
    attrname -- name of the attribute the frequency file was built for
    nums -- frequency measure / file suffix ('frq', 'arf', 'docf', ...)
    id_range -- number of ids to read; 0 means the attribute's full id range

    returns:
    an array.array of frequencies indexed by attribute value id

    raises:
    MissingSubCorpFreqFile -- when the required frequency file is missing
    or truncated (truncated auxiliary files are removed so they get rebuilt)
    """
    import array
    filename = (subcorp_base_file(corp, attrname) + '.' + nums)
    if not id_range:
        id_range = corp.get_attr(attrname).id_range()
    if nums == 'arf':
        frq = array.array('f')
        try:
            # 'with' closes the handle even on error (original leaked it)
            with open(filename, 'rb') as fin:
                frq.fromfile(fin, id_range)  # type: ignore
        except IOError as ex:
            raise MissingSubCorpFreqFile(corp, ex)
        except EOFError as ex:
            # truncated data: drop the derived file so it will be regenerated
            os.remove(filename.rsplit('.', 1)[0] + '.docf')
            raise MissingSubCorpFreqFile(corp, ex)
    else:
        try:
            if corp.get_conf('VIRTUAL') and not hasattr(
                    corp, 'spath') and nums == 'frq':
                # virtual corpora have no direct .frq file; force the fallback
                raise IOError
            frq = array.array('i')
            with open(filename, 'rb') as fin:
                frq.fromfile(fin, id_range)  # type: ignore
        except EOFError as ex:
            # truncated data: remove all derived files so they get rebuilt
            os.remove(filename.rsplit('.', 1)[0] + '.docf')
            os.remove(filename.rsplit('.', 1)[0] + '.arf')
            os.remove(filename.rsplit('.', 1)[0] + '.frq')
            raise MissingSubCorpFreqFile(corp, ex)
        except IOError:
            try:
                # 32-bit file missing/unreadable: try the 64-bit variant
                frq = array.array('l')
                with open(filename + '64', 'rb') as fin:
                    frq.fromfile(fin, id_range)  # type: ignore
            except IOError as ex:
                if not hasattr(corp, 'spath') and nums == 'frq':
                    # whole corpus: frequencies can be computed on the fly
                    a = corp.get_attr(attrname)
                    frq.fromlist([a.freq(i) for i in range(a.id_range())])
                else:
                    raise MissingSubCorpFreqFile(corp, ex)
    return frq
Esempio n. 3
0
def wordlist(corp: Corpus,
             words: Optional[Set[str]] = None,
             wlattr: str = '',
             wlpat: str = '',
             wlminfreq: int = 5,
             wlmaxitems: int = 100,
             wlsort: str = '',
             blacklist: Optional[Set[str]] = None,
             wlnums: Optional[str] = 'frq',
             include_nonwords: int = 0) -> List[Dict[str, Any]]:
    """
    Note: 'words' and 'blacklist' are expected to contain utf-8-encoded strings.
    """
    words = set(words) if words else set()
    blacklist = set(blacklist) if blacklist else set()
    attr = corp.get_attr(wlattr)
    attrfreq = _get_attrfreq(corp=corp, attr=attr, wlattr=wlattr, wlnums=wlnums)
    # with an explicit word set and a catch-all pattern, only the listed
    # words are looked up; otherwise a pattern-driven scan is performed
    listed_words_only = bool(words) and wlpat == '.*'
    if listed_words_only:
        items = _wordlist_from_list(attr=attr,
                                    attrfreq=attrfreq,
                                    words=words,
                                    blacklist=blacklist,
                                    wlsort=wlsort,
                                    wlminfreq=wlminfreq,
                                    wlmaxitems=wlmaxitems,
                                    wlnums=wlnums)
    else:
        excl_pattern = '' if include_nonwords else corp.get_conf('NONWORDRE')
        items = _wordlist_by_pattern(attr=attr,
                                     enc_pattern=wlpat.strip(),
                                     excl_pattern=excl_pattern,
                                     wlminfreq=wlminfreq,
                                     words=words,
                                     blacklist=blacklist,
                                     wlnums=wlnums,
                                     wlsort=wlsort,
                                     wlmaxitems=wlmaxitems,
                                     attrfreq=attrfreq)

    if not listed_words_only:
        # pattern-based search yields (freq, id) pairs; resolve ids to strings
        items = [(f, attr.id2str(i)) for (f, i) in items]
    if wlsort == 'f':
        items = sorted(items, key=lambda entry: entry[0], reverse=True)
    else:
        items = sorted(items, key=lambda entry: entry[1])
    items = items[:wlmaxitems]
    return add_block_items([{'str': value, 'freq': freq} for freq, value in items])
Esempio n. 4
0
def texttype_values(
        corp: Corpus,
        subcorpattrs: str,
        maxlistsize: int,
        shrink_list: Union[Tuple[str, ...], List[str]] = (),
        collator_locale: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    arguments:
    corp -- manatee.Corpus
    subcorpattrs -- structures and attributes to be processed (see Manatee's SUBCORPATTRS)
    maxlistsize -- in case there is more that this number of items, empty list will be returned
    shrink_list -- list/tuple of attributes we want to return empty lists for (False can be used
                   to specify an empty value)
    collator_locale -- a collator used to sort attribute values (en_US is the default)

    returns:
    a list containing following dictionaries
    { 'Line' : [
        { 'attr_doc_label' : '', 'Values' : [ {'v', 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        { 'attr_doc_label' : '', 'Values' : [ {'v', 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        ...
    ]}

    !!!!!!
    NOTE: avoid calling this method repeatedly for the same corpus as the
    attr = corp.get_attr(n) line is leaking opened files of corpora indexes which
    leads to exhausted limit for opened files for Gunicorn/Celery after some time.
    KonText caches the value returned by this function to prevent this.
    """
    if subcorpattrs == '#':
        return []
    attrlines = []

    if not shrink_list:
        shrink_list = ()

    for subcorpline in subcorpattrs.split(','):
        attrvals = []
        for n in subcorpline.split('|'):
            if n in ('', '#'):
                continue
            attr = corp.get_attr(n)
            # evaluate the NUMERIC flag once instead of three times
            is_numeric = conf_bool(corp.get_conf(n + '.NUMERIC'))
            attrval = {
                'name': n,
                'label': corp.get_conf(n + '.LABEL') or n,
                'attr_doc': corp.get_conf(n + '.ATTRDOC'),
                'attr_doc_label': corp.get_conf(n + '.ATTRDOCLABEL'),
                'numeric': is_numeric
            }
            hsep = corp.get_conf(n + '.HIERARCHICAL')
            multisep = corp.get_conf(n + '.MULTISEP')
            is_multival = corp.get_conf(n + '.MULTIVAL') in ('y', 'yes')
            if (not hsep and
                (corp.get_conf(n + '.TEXTBOXLENGTH')
                 or attr.id_range() > maxlistsize or n in shrink_list)):
                # too many values (or explicitly shrunk): offer a text box
                # instead of a value list
                attrval['textboxlength'] = (corp.get_conf(n + '.TEXTBOXLENGTH')
                                            or 24)
            else:  # list of values
                if is_numeric:
                    vals = []
                    for i in range(attr.id_range()):
                        try:
                            vals.append({'v': int(attr.id2str(i))})
                        # only conversion failures; the original bare 'except:'
                        # swallowed everything incl. KeyboardInterrupt
                        except (ValueError, TypeError):
                            vals.append({'v': attr.id2str(i)})
                elif hsep:  # hierarchical: keep only top-level (single) values
                    vals = [{'v': attr.id2str(i)}
                            for i in range(attr.id_range())
                            if multisep not in attr.id2str(i)]
                elif is_multival:
                    # split multi-values and deduplicate the individual parts
                    raw_vals = [attr.id2str(i).split(multisep)
                                for i in range(attr.id_range())]
                    vals = [{'v': x}
                            for x in sorted(set(s for subl in raw_vals for s in subl))]
                else:
                    vals = [{'v': attr.id2str(i)}
                            for i in range(attr.id_range())]

                if hsep:  # hierarchical
                    attrval['hierarchical'] = hsep
                    attrval['Values'] = _get_attr_hierarchy(vals, hsep)
                elif is_numeric:
                    attrval['Values'] = sorted(vals,
                                               key=lambda item: item['v'])
                elif collator_locale:
                    attrval['Values'] = l10n.sort(vals,
                                                  collator_locale,
                                                  key=lambda item: item['v'])
                else:
                    # case-insensitive ascending sort; replaces the py2-style
                    # cmp_to_key(cmp(...)) construction with an equivalent key
                    attrval['Values'] = sorted(
                        vals, key=lambda item: item['v'].lower())
            attrvals.append(attrval)
        attrlines.append({'Line': attrvals})
    return attrlines