def get_wordlist_length(corp: Corpus, wlattr: str, wlpat: str, wlnums: str, wlminfreq: int,
                        words: Set[str], blacklist: Set[str], include_nonwords: bool) -> int:
    enc_pattern = wlpat.strip()
    attr = corp.get_attr(wlattr)
    attrfreq = _get_attrfreq(corp=corp, attr=attr, wlattr=wlattr, wlnums=wlnums)
    if not include_nonwords:
        nwre = corp.get_conf('NONWORDRE')  # regexp matching items to be excluded
    else:
        nwre = ''
    try:
        gen = attr.regexp2ids(enc_pattern, 0, nwre)
    except TypeError:
        # older manatee versions do not accept the exclude-pattern argument
        gen = attr.regexp2ids(enc_pattern, 0)
    i = 0
    while not gen.end():
        wid = gen.next()
        frq = attrfreq[wid]
        if not frq:
            continue
        id_value = attr.id2str(wid)
        if frq >= wlminfreq and (not words or id_value in words) and (
                not blacklist or id_value not in blacklist):
            i += 1
    return i
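
# A minimal usage sketch for get_wordlist_length(). The helper name '_demo_wordlist_length'
# and the attribute 'word' are illustrative assumptions, not part of this module; any
# positional attribute defined in the corpus registry works.
def _demo_wordlist_length(corp: Corpus) -> None:
    # count 'word' values of length >= 3 occurring at least 10 times
    n = get_wordlist_length(
        corp, wlattr='word', wlpat='.{3,}', wlnums='frq', wlminfreq=10,
        words=set(), blacklist=set(), include_nonwords=False)
    print(f'matching word-list items: {n}')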
def frq_db(corp: Corpus, attrname: str, nums: str = 'frq', id_range: int = 0) -> 'array.array':
    import array
    filename = subcorp_base_file(corp, attrname) + '.' + nums
    if not id_range:
        id_range = corp.get_attr(attrname).id_range()
    if nums == 'arf':
        frq = array.array('f')  # ARF values are floats
        try:
            frq.fromfile(open(filename, 'rb'), id_range)  # type: ignore
        except IOError as ex:
            raise MissingSubCorpFreqFile(corp, ex)
        except EOFError as ex:
            # truncated data file - remove the derived file and report the data as missing
            os.remove(filename.rsplit('.', 1)[0] + '.docf')
            raise MissingSubCorpFreqFile(corp, ex)
    else:
        try:
            if corp.get_conf('VIRTUAL') and not hasattr(corp, 'spath') and nums == 'frq':
                raise IOError  # virtual corpora provide no precomputed .frq file
            frq = array.array('i')
            frq.fromfile(open(filename, 'rb'), id_range)  # type: ignore
        except EOFError as ex:
            # truncated data - drop all derived frequency files
            os.remove(filename.rsplit('.', 1)[0] + '.docf')
            os.remove(filename.rsplit('.', 1)[0] + '.arf')
            os.remove(filename.rsplit('.', 1)[0] + '.frq')
            raise MissingSubCorpFreqFile(corp, ex)
        except IOError:
            try:
                # fall back to the 64-bit variant of the frequency file
                frq = array.array('l')
                frq.fromfile(open(filename + '64', 'rb'), id_range)  # type: ignore
            except IOError as ex:
                if not hasattr(corp, 'spath') and nums == 'frq':
                    # a whole corpus (not a subcorpus) - compute frequencies directly
                    a = corp.get_attr(attrname)
                    frq.fromlist([a.freq(i) for i in range(a.id_range())])
                else:
                    raise MissingSubCorpFreqFile(corp, ex)
    return frq
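
# Hedged sketch of reading per-id frequencies via frq_db(). It assumes precomputed
# .frq/.arf files exist next to the (sub)corpus data; otherwise MissingSubCorpFreqFile
# is raised and the caller is expected to trigger their computation. The helper name
# '_demo_frq_db' is hypothetical.
def _demo_frq_db(corp: Corpus) -> None:
    try:
        freqs = frq_db(corp, 'word', nums='frq')
        print('ids stored:', len(freqs))
    except MissingSubCorpFreqFile:
        print('frequency files not calculated yet')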
def wordlist(corp: Corpus, words: Optional[Set[str]] = None, wlattr: str = '', wlpat: str = '',
             wlminfreq: int = 5, wlmaxitems: int = 100, wlsort: str = '',
             blacklist: Optional[Set[str]] = None, wlnums: Optional[str] = 'frq',
             include_nonwords: int = 0) -> List[Dict[str, Any]]:
    """
    Note: 'words' and 'blacklist' are expected to contain utf-8-encoded strings.
    """
    blacklist = set(w for w in blacklist) if blacklist else set()
    words = set(w for w in words) if words else set()
    attr = corp.get_attr(wlattr)
    attrfreq = _get_attrfreq(corp=corp, attr=attr, wlattr=wlattr, wlnums=wlnums)
    if words and wlpat == '.*':  # word list just for given words
        items = _wordlist_from_list(attr=attr, attrfreq=attrfreq, words=words,
                                    blacklist=blacklist, wlsort=wlsort, wlminfreq=wlminfreq,
                                    wlmaxitems=wlmaxitems, wlnums=wlnums)
    else:  # word list according to pattern
        if not include_nonwords:
            nwre = corp.get_conf('NONWORDRE')
        else:
            nwre = ''
        items = _wordlist_by_pattern(attr=attr, enc_pattern=wlpat.strip(), excl_pattern=nwre,
                                     wlminfreq=wlminfreq, words=words, blacklist=blacklist,
                                     wlnums=wlnums, wlsort=wlsort, wlmaxitems=wlmaxitems,
                                     attrfreq=attrfreq)
    if not words or wlpat != '.*':
        items = [(f, attr.id2str(i)) for (f, i) in items]
    if wlsort == 'f':
        items = sorted(items, key=lambda x: x[0], reverse=True)
    else:
        items = sorted(items, key=lambda x: x[1])
    del items[wlmaxitems:]
    return add_block_items([{'str': w, 'freq': f} for f, w in items])
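
# Illustrative call of wordlist(): the top 20 items of the 'lemma' attribute matching
# a pattern, sorted by frequency. The attribute name, the pattern and the helper name
# '_demo_wordlist' are assumptions; the attribute must exist in the corpus registry.
def _demo_wordlist(corp: Corpus) -> None:
    items = wordlist(corp, wlattr='lemma', wlpat='work.*', wlminfreq=5,
                     wlmaxitems=20, wlsort='f')
    for item in items:
        print(item['str'], item['freq'])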
def texttype_values(
        corp: Corpus,
        subcorpattrs: str,
        maxlistsize: int,
        shrink_list: Union[Tuple[str, ...], List[str]] = (),
        collator_locale: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    arguments:
    corp -- manatee.Corpus
    subcorpattrs -- structures and attributes to be processed (see Manatee's SUBCORPATTRS)
    maxlistsize -- in case there is more than this number of items, an empty list will be returned
    shrink_list -- list/tuple of attributes we want to return empty lists for
                   (False can be used to specify an empty value)
    collator_locale -- a collator used to sort attribute values (en_US is the default)

    returns:
    a list containing the following dictionaries:

    { 'Line':
      [
        { 'attr_doc_label': '', 'Values': [ {'v': 'item name'}, ... ],
          'name': '', 'attr_doc': '', 'label': '' },
        ...
      ]
    }

    !!!!!!
    NOTE: avoid calling this method repeatedly for the same corpus as the
    'attr = corp.get_attr(n)' line leaks opened files of corpora indexes, which
    leads to an exhausted open-files limit for Gunicorn/Celery after some time.
    KonText caches the value returned by this function to prevent this.

    !!! TODO !!!
    """
    if subcorpattrs == '#':
        return []
    attrlines = []
    if not shrink_list:
        shrink_list = ()
    for subcorpline in subcorpattrs.split(','):
        attrvals = []
        for n in subcorpline.split('|'):
            if n in ('', '#'):
                continue
            attr = corp.get_attr(n)
            attrval = {
                'name': n,
                'label': corp.get_conf(n + '.LABEL') or n,
                'attr_doc': corp.get_conf(n + '.ATTRDOC'),
                'attr_doc_label': corp.get_conf(n + '.ATTRDOCLABEL'),
                'numeric': conf_bool(corp.get_conf(n + '.NUMERIC'))
            }
            hsep = corp.get_conf(n + '.HIERARCHICAL')
            multisep = corp.get_conf(n + '.MULTISEP')
            is_multival = corp.get_conf(n + '.MULTIVAL') in ('y', 'yes')
            if (not hsep and (corp.get_conf(n + '.TEXTBOXLENGTH')
                              or attr.id_range() > maxlistsize or n in shrink_list)):
                attrval['textboxlength'] = corp.get_conf(n + '.TEXTBOXLENGTH') or 24
            else:  # list of values
                if conf_bool(corp.get_conf(n + '.NUMERIC')):
                    vals = []
                    for i in range(attr.id_range()):
                        try:
                            vals.append({'v': int(attr.id2str(i))})
                        except ValueError:
                            vals.append({'v': attr.id2str(i)})
                elif hsep:  # hierarchical values are kept unexpanded
                    vals = [{'v': attr.id2str(i)}
                            for i in range(attr.id_range()) if multisep not in attr.id2str(i)]
                else:
                    if is_multival:
                        # split multi-value items and deduplicate the parts
                        raw_vals = [attr.id2str(i).split(multisep)
                                    for i in range(attr.id_range())]
                        vals = [{'v': x}
                                for x in sorted(set(s for subl in raw_vals for s in subl))]
                    else:
                        vals = [{'v': attr.id2str(i)} for i in range(attr.id_range())]

                if hsep:  # hierarchical
                    attrval['hierarchical'] = hsep
                    attrval['Values'] = _get_attr_hierarchy(vals, hsep)
                elif conf_bool(corp.get_conf(n + '.NUMERIC')):
                    attrval['Values'] = sorted(vals, key=lambda item: item['v'])
                elif collator_locale:
                    attrval['Values'] = l10n.sort(vals, collator_locale,
                                                  key=lambda item: item['v'])
                else:
                    # case-insensitive sort (replaces a Py2-style cmp comparator)
                    attrval['Values'] = sorted(vals, key=lambda item: item['v'].lower())
            attrvals.append(attrval)
        attrlines.append({'Line': attrvals})
    return attrlines
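
# Sketch of texttype_values() usage, assuming SUBCORPATTRS is configured for the
# corpus (e.g. 'doc.genre|doc.year'). Per the docstring note above, cache the result
# rather than calling this repeatedly. The helper name '_demo_texttype_values' is
# hypothetical.
def _demo_texttype_values(corp: Corpus) -> None:
    ttvals = texttype_values(corp, corp.get_conf('SUBCORPATTRS'),
                             maxlistsize=1000, collator_locale='en_US')
    for line in ttvals:
        for attr in line['Line']:
            # attributes rendered as text boxes carry no 'Values' key
            print(attr['name'], len(attr.get('Values', [])))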