def corp_mtime(corpus: Corpus) -> float:
    """
    Returns the last modification time of the corpus, i.e. the newer of the
    registry file mtime and the data directory mtime.
    """
    reg_mtime = os.path.getmtime(corpus.get_confpath())
    data_path = corpus.get_conf('PATH')
    data_dir = os.path.dirname(data_path) if data_path.endswith('/') else data_path
    data_mtime = os.path.getmtime(data_dir)
    return max(reg_mtime, data_mtime)
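# Usage sketch (illustrative only; the cache helpers are hypothetical, not part of
# this module): since corp_mtime() reflects the newer of the registry file and the
# data directory, it works as a cache-invalidation key:
#
#     if load_cache_timestamp(corp) < corp_mtime(corp):   # hypothetical helper
#         rebuild_cached_data(corp)                       # hypothetical helper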
def get_wordlist_length(corp: Corpus, wlattr: str, wlpat: str, wlnums: str, wlminfreq: int,
                        words: str, blacklist: str, include_nonwords: bool) -> int:
    enc_pattern = wlpat.strip()
    attr = corp.get_attr(wlattr)
    attrfreq = _get_attrfreq(corp=corp, attr=attr, wlattr=wlattr, wlnums=wlnums)
    if not include_nonwords:
        nwre = corp.get_conf('NONWORDRE')
    else:
        nwre = ''
    try:
        gen = attr.regexp2ids(enc_pattern, 0, nwre)
    except TypeError:
        # some manatee versions do not accept the exclusion regexp argument
        gen = attr.regexp2ids(enc_pattern, 0)
    i = 0
    while not gen.end():
        wid = gen.next()
        frq = attrfreq[wid]
        if not frq:
            continue
        id_value = attr.id2str(wid)
        if frq >= wlminfreq and (not words or id_value in words) and (
                not blacklist or id_value not in blacklist):
            i += 1
    return i
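# Usage sketch (illustrative; `some_corp` stands for an opened Corpus instance):
# counting distinct 'word' values matching a regexp without building the full list:
#
#     total = get_wordlist_length(some_corp, wlattr='word', wlpat='work.*',
#                                 wlnums='frq', wlminfreq=5, words='', blacklist='',
#                                 include_nonwords=False)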
def get_cached_conc_sizes(self, corp: manatee.Corpus, q: Optional[Tuple[str, ...]] = None,
                          cachefile: Optional[str] = None) -> Dict[str, Any]:
    """
    arguments:
    corp -- manatee.Corpus instance
    q -- a tuple containing the preprocessed query
    cachefile -- if not provided, the path is determined automatically from
                 CACHE_ROOT_DIR, the corpus name and the query

    returns:
    a dictionary {
        finished: bool,
        concsize: int,
        fullsize: int,
        relconcsize: float (concordance size recalculated per million corpus positions),
        arf: ARF of the result (calculated only for a finished result, i.e. no
             intermediate values)
    }
    """
    import struct

    if q is None:
        q = ()
    ans: Dict[str, Any] = dict(finished=False, concsize=0, fullsize=0, relconcsize=0)
    if not cachefile:  # AJAX call
        subchash = getattr(corp, 'subchash', None)
        cache_map = self._cache_factory.get_mapping(corp)
        cachefile = cache_map.cache_file_path(subchash, q)
        status = cache_map.get_calc_status(subchash, q)
        if not status:
            raise ConcCalculationStatusException('Concordance calculation not found', None)
        status.test_error(TASK_TIME_LIMIT)
        if status.error is not None:
            raise ConcCalculationStatusException('Concordance calculation failed', status.error)
    if cachefile and os.path.isfile(cachefile):
        with open(cachefile, 'rb') as cache:
            cache.seek(15)
            finished = bool(ord(cache.read(1)))
            (fullsize,) = struct.unpack('q', cache.read(8))
            cache.seek(32)
            (concsize,) = struct.unpack('i', cache.read(4))
        if fullsize > 0:
            relconcsize = 1000000.0 * fullsize / corp.search_size()
        else:
            relconcsize = 1000000.0 * concsize / corp.search_size()
        if finished and not is_subcorpus(corp):
            conc = manatee.Concordance(corp, cachefile)
            result_arf = round(conc.compute_ARF(), 2)
        else:
            result_arf = None
        ans['finished'] = finished
        ans['concsize'] = concsize
        ans['fullsize'] = fullsize
        ans['relconcsize'] = relconcsize
        ans['arf'] = result_arf
    return ans
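# Usage sketch (illustrative; assumes a calling context with the owning instance and
# an active query tuple `q`): a status-polling endpoint can return these values
# repeatedly until `finished` is True:
#
#     sizes = self.get_cached_conc_sizes(corp, q)
#     if sizes['finished']:
#         render_result(sizes['concsize'], sizes['relconcsize'], sizes['arf'])  # hypothetical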
def wordlist(corp: Corpus, words: Optional[Set[str]] = None, wlattr: str = '', wlpat: str = '',
             wlminfreq: int = 5, wlmaxitems: int = 100, wlsort: str = '',
             blacklist: Optional[Set[str]] = None, wlnums: Optional[str] = 'frq',
             include_nonwords: int = 0) -> List[Dict[str, Any]]:
    """
    Note: 'words' and 'blacklist' are expected to contain utf-8-encoded strings.
    """
    blacklist = set(blacklist) if blacklist else set()
    words = set(words) if words else set()
    attr = corp.get_attr(wlattr)
    attrfreq = _get_attrfreq(corp=corp, attr=attr, wlattr=wlattr, wlnums=wlnums)
    if words and wlpat == '.*':
        # word list just for the given words
        items = _wordlist_from_list(attr=attr, attrfreq=attrfreq, words=words,
                                    blacklist=blacklist, wlsort=wlsort, wlminfreq=wlminfreq,
                                    wlmaxitems=wlmaxitems, wlnums=wlnums)
    else:
        # word list according to the pattern
        if not include_nonwords:
            nwre = corp.get_conf('NONWORDRE')
        else:
            nwre = ''
        items = _wordlist_by_pattern(attr=attr, enc_pattern=wlpat.strip(), excl_pattern=nwre,
                                     wlminfreq=wlminfreq, words=words, blacklist=blacklist,
                                     wlnums=wlnums, wlsort=wlsort, wlmaxitems=wlmaxitems,
                                     attrfreq=attrfreq)
    if not words or wlpat != '.*':
        items = [(f, attr.id2str(i)) for (f, i) in items]
    if wlsort == 'f':
        items = sorted(items, key=lambda x: x[0], reverse=True)
    else:
        items = sorted(items, key=lambda x: x[1])
    del items[wlmaxitems:]
    return add_block_items([{'str': w, 'freq': f} for f, w in items])
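# Usage sketch (illustrative; `some_corp` is an assumed opened Corpus): top 100
# lemmas with frequency >= 10, sorted by frequency:
#
#     items = wordlist(some_corp, wlattr='lemma', wlpat='.*', wlminfreq=10,
#                      wlmaxitems=100, wlsort='f')
#     # -> [{'str': 'be', 'freq': 123456}, ...]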
def doc_sizes(corp: Corpus, struct: Structure, attrname: str, i: int,
              normvals: Dict[int, int]) -> int:
    r = corp.filter_query(struct.attr_val(attrname.split('.')[1], i))
    cnt = 0
    while not r.end():
        cnt += normvals[r.peek_beg()]
        r.next()
    return cnt
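# Usage sketch (illustrative; `normvals` is assumed to map structure region start
# positions to their normalization values, e.g. token counts): total size of all
# 'doc' regions whose 'doc.id' attribute has value id `i`:
#
#     total = doc_sizes(corp, corp.get_struct('doc'), 'doc.id', i, normvals)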
def frq_db(corp: Corpus, attrname: str, nums: str = 'frq', id_range: int = 0) -> array:
    import array
    filename = subcorp_base_file(corp, attrname) + '.' + nums
    if not id_range:
        id_range = corp.get_attr(attrname).id_range()
    if nums == 'arf':
        frq = array.array('f')
        try:
            frq.fromfile(open(filename, 'rb'), id_range)  # type: ignore
        except IOError as ex:
            raise MissingSubCorpFreqFile(corp, ex)
        except EOFError as ex:
            # a truncated file is useless - remove it and report the data as missing
            os.remove(filename.rsplit('.', 1)[0] + '.docf')
            raise MissingSubCorpFreqFile(corp, ex)
    else:
        try:
            if corp.get_conf('VIRTUAL') and not hasattr(corp, 'spath') and nums == 'frq':
                # for virtual corpora without their own data path, skip the 32-bit
                # file and use the fallback below
                raise IOError
            frq = array.array('i')
            frq.fromfile(open(filename, 'rb'), id_range)  # type: ignore
        except EOFError as ex:
            # truncated data - drop all derived freq files and report them as missing
            os.remove(filename.rsplit('.', 1)[0] + '.docf')
            os.remove(filename.rsplit('.', 1)[0] + '.arf')
            os.remove(filename.rsplit('.', 1)[0] + '.frq')
            raise MissingSubCorpFreqFile(corp, ex)
        except IOError:
            try:
                # fall back to the 64-bit variant of the freq file
                frq = array.array('l')
                frq.fromfile(open(filename + '64', 'rb'), id_range)  # type: ignore
            except IOError as ex:
                if not hasattr(corp, 'spath') and nums == 'frq':
                    # a whole corpus without precomputed files - compute frequencies
                    # directly from the attribute
                    a = corp.get_attr(attrname)
                    frq.fromlist([a.freq(i) for i in range(a.id_range())])
                else:
                    raise MissingSubCorpFreqFile(corp, ex)
    return frq
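# Usage sketch (illustrative): loading precomputed ARF values for 'lemma';
# MissingSubCorpFreqFile signals that the frequency files must be (re)built first:
#
#     try:
#         arf = frq_db(corp, 'lemma', nums='arf')
#     except MissingSubCorpFreqFile:
#         schedule_freq_calc(corp, 'lemma')   # hypothetical recovery step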
def _normalize_multivalues(self, corp: manatee.Corpus, attr1: str, attr2: str) -> Tuple[str, str]:
    multisep1 = corp.get_conf(self._conf["attr1"] + '.MULTISEP')
    multisep2 = corp.get_conf(self._conf["attr2"] + '.MULTISEP')
    if multisep1 and multisep2:
        attr1_split = attr1.split(multisep1)
        attr2_split = attr2.split(multisep2)
        if len(attr1_split) == len(attr2_split):
            return attr1_split[0], attr2_split[0]
        logging.warning(
            f'PosAttrPairRelManateeBackend multivalue normalization mismatch - {attr1}...{attr2}')
    return attr1, attr2
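# Usage sketch (illustrative; assumes self._conf['attr1'/'attr2'] name attributes
# with MULTISEP set to '|'): multivalues of equal arity collapse to their first
# variants, e.g. ('be|been', 'VB|VBN') -> ('be', 'VB'); mismatched arities are
# returned unchanged after logging a warning:
#
#     attr1, attr2 = self._normalize_multivalues(corp, 'be|been', 'VB|VBN')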
def corpconf_pairs(self, corp: Corpus, label: str) -> List[Tuple[str, str]]:
    """
    Decodes some specific corpus registry file configuration values where
    a list of pairs is stored flattened (k1, v1, k2, v2, ..., kN, vN).
    This applies e.g. to WPOSLIST and LPOSLIST.

    Returns:
    a list of pairs
    """
    if type(corp) is str:
        corp = self.get_Corpus(corp)
    val = corp.get_conf(label)
    if len(val) > 2:
        val = val[1:].split(val[0])
    else:
        val = ''
    return [(val[i], val[i + 1]) for i in range(0, len(val), 2)]
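# Usage sketch (illustrative): a registry value such as ',noun,N.*,verb,V.*' (its
# first character serves as the separator) decodes to
# [('noun', 'N.*'), ('verb', 'V.*')]:
#
#     pos_pairs = self.corpconf_pairs(corp, 'WPOSLIST')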
def texttype_values(
        corp: Corpus, subcorpattrs: str, maxlistsize: int,
        shrink_list: Union[Tuple[str, ...], List[str]] = (),
        collator_locale: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    arguments:
    corp -- manatee.Corpus
    subcorpattrs -- structures and attributes to be processed (see Manatee's SUBCORPATTRS)
    maxlistsize -- in case there are more than this number of items, an empty list
                   will be returned
    shrink_list -- list/tuple of attributes we want to return empty lists for
                   (False can be used to specify an empty value)
    collator_locale -- a collator used to sort attribute values (en_US is the default)

    returns:
    a list containing the following dictionaries:
    { 'Line': [
        {'attr_doc_label': '', 'Values': [{'v': 'item name'}, ...], 'name': '',
         'attr_doc': '', 'label': ''},
        ...
    ]}

    !!! NOTE: avoid calling this method repeatedly for the same corpus, as the
    attr = corp.get_attr(n) line leaks open files of corpora indexes, which
    eventually exhausts the open-file limit for Gunicorn/Celery. KonText caches
    the value returned by this function to prevent this.
    !!! TODO !!!
    """
    if subcorpattrs == '#':
        return []
    attrlines = []
    if not shrink_list:
        shrink_list = ()
    for subcorpline in subcorpattrs.split(','):
        attrvals = []
        for n in subcorpline.split('|'):
            if n in ('', '#'):
                continue
            attr = corp.get_attr(n)
            attrval = {
                'name': n,
                'label': corp.get_conf(n + '.LABEL') or n,
                'attr_doc': corp.get_conf(n + '.ATTRDOC'),
                'attr_doc_label': corp.get_conf(n + '.ATTRDOCLABEL'),
                'numeric': conf_bool(corp.get_conf(n + '.NUMERIC'))
            }
            hsep = corp.get_conf(n + '.HIERARCHICAL')
            multisep = corp.get_conf(n + '.MULTISEP')
            is_multival = corp.get_conf(n + '.MULTIVAL') in ('y', 'yes')
            if (not hsep and (corp.get_conf(n + '.TEXTBOXLENGTH')
                              or attr.id_range() > maxlistsize or n in shrink_list)):
                attrval['textboxlength'] = corp.get_conf(n + '.TEXTBOXLENGTH') or 24
            else:  # list of values
                if conf_bool(corp.get_conf(n + '.NUMERIC')):
                    vals = []
                    for i in range(attr.id_range()):
                        try:
                            vals.append({'v': int(attr.id2str(i))})
                        except ValueError:
                            vals.append({'v': attr.id2str(i)})
                elif hsep:  # hierarchical
                    vals = [{'v': attr.id2str(i)} for i in range(attr.id_range())
                            if multisep not in attr.id2str(i)]
                else:
                    if is_multival:
                        raw_vals = [attr.id2str(i).split(multisep)
                                    for i in range(attr.id_range())]
                        vals = [{'v': x}
                                for x in sorted(set(s for subl in raw_vals for s in subl))]
                    else:
                        vals = [{'v': attr.id2str(i)} for i in range(attr.id_range())]

                if hsep:  # hierarchical
                    attrval['hierarchical'] = hsep
                    attrval['Values'] = _get_attr_hierarchy(vals, hsep)
                elif conf_bool(corp.get_conf(n + '.NUMERIC')):
                    attrval['Values'] = sorted(vals, key=lambda item: item['v'])
                elif collator_locale:
                    attrval['Values'] = l10n.sort(vals, collator_locale,
                                                  key=lambda item: item['v'])
                else:
                    attrval['Values'] = sorted(
                        vals,
                        key=cmp_to_key(lambda x1, x2: cmp(x1['v'].lower(), x2['v'].lower())))
            attrvals.append(attrval)
        attrlines.append({'Line': attrvals})
    return attrlines
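# Usage sketch (illustrative; the SUBCORPATTRS-style string and limits are assumed):
# building the text-type selection data for two document-level attributes:
#
#     tt = texttype_values(corp, 'doc.genre|doc.year', maxlistsize=250,
#                          collator_locale='en_US')
#     for line in tt:
#         for attr in line['Line']:
#             print(attr['name'], len(attr.get('Values', ())))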
def _should_be_bg_query(corp: manatee.Corpus, query: Tuple[str, ...], asnc: int) -> bool:
    return (len(query) > 1 and asnc == 1
            and (query[1][0] == 'X' and corp.size() > CONC_BG_SYNC_ALIGNED_CORP_THRESHOLD
                 or corp.size() > CONC_BG_SYNC_SINGLE_CORP_THRESHOLD))
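# Usage sketch (illustrative; the 'X...' aligned-corpus operation and the dispatch
# helper are assumptions): deciding whether a concordance should be calculated as a
# background task rather than synchronously:
#
#     if _should_be_bg_query(corp, ('q[word="the"]', 'X_align'), asnc=1):
#         submit_background_calc(corp, query)   # hypothetical dispatch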