def get_user_items(self, plugin_api):
    ans = []
    if self._auth.anonymous_user()['id'] != plugin_api.user_id:
        for item_id, item in list(self._db.hash_get_all(self._mk_key(plugin_api.user_id)).items()):
            ans.append(import_record(item))
        ans = l10n.sort(ans, plugin_api.user_lang, key=lambda itm: itm.sort_key, reverse=False)
    return ans
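Every example in this listing delegates the actual ordering to `l10n.sort`. Its implementation is not part of the listing; the following is only a minimal sketch of what such a helper presumably wraps, assuming a PyICU (`icu`) backend and the signature `sort(iterable, loc, key=None, reverse=False)` implied by the calls above:

from functools import cmp_to_key

import icu  # PyICU -- an assumption; the real l10n module may use a different collation backend


def sort(iterable, loc, key=None, reverse=False):
    """Return a new list ordered by the collation rules of locale 'loc' (e.g. 'en_US')."""
    collator = icu.Collator.createInstance(icu.Locale(loc))
    if key is None:
        sort_key = cmp_to_key(collator.compare)
    else:
        sort_key = cmp_to_key(lambda x1, x2: collator.compare(key(x1), key(x2)))
    return sorted(iterable, key=sort_key, reverse=reverse)

Unlike a plain `sorted(..., key=str.lower)`, a collator orders accented and non-Latin characters according to the selected locale, which is why the examples pass the user's UI language or a per-corpus collator locale.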
def sort(self, plugin_api, data, field='name', *fields):
    if field == 'size':
        return sorted(data, key=lambda c: c.get(field, 0), reverse=True)
    else:
        def corp_cmp_key(c, field):
            return c.get(field) if c.get(field) is not None else ''
        return l10n.sort(data, loc=plugin_api.user_lang, key=lambda c: corp_cmp_key(c, field))
def get_user_items(self, user_id):
    ans = []
    if self._auth.anonymous_user()['id'] != user_id:
        for item_id, item in self._db.hash_get_all(self._mk_key(user_id)).items():
            ans.append(self.decoder.decode(item))
        ans = l10n.sort(ans, self.getlocal('lang'), key=lambda itm: itm.name, reverse=False)
    return ans
def get_user_items(self, plugin_api):
    ans = []
    if self._auth.anonymous_user()['id'] != plugin_api.user_id:
        for item_id, item in self._db.hash_get_all(self._mk_key(plugin_api.user_id)).items():
            ans.append(import_record(item))
        ans = l10n.sort(ans, plugin_api.user_lang, key=lambda itm: itm.sort_key, reverse=False)
    return ans
def require_existing_pquery(
        pquery: PqueryFormArgs, offset: int, limit: int, collator_locale: str,
        sort: str, reverse: bool) -> Tuple[int, List[Tuple[str, int]]]:
    path = _create_cache_path(pquery)
    if not os.path.exists(path):
        raise PqueryResultNotFound('The result does not exist')
    else:
        if sort == 'freq':
            if reverse is True:
                return load_cached_partial(path, offset, limit)
            else:
                total, rows = load_cached_full(path)
                return total, list(reversed(rows))[offset:offset + limit]
        elif sort == 'value':
            total, rows = load_cached_full(path)
            return (total,
                    l10n.sort(rows, key=lambda x: x[0], loc=collator_locale,
                              reverse=reverse)[offset:offset + limit])
        elif sort.startswith('freq-'):
            conc_idx = pquery.conc_ids.index(sort[len('freq-'):])
            total, rows = load_cached_full(path)
            return (total,
                    sorted(rows, key=lambda x: x[conc_idx + 1],
                           reverse=reverse)[offset:offset + limit])
        else:
            raise PqueryArgumentError(f'Invalid sort argument: {sort}')
def find_suggestion(
        self, user_id, ui_lang, maincorp, corpora, subcorpus, value, value_type,
        value_subformat, query_type, p_attr, struct, s_attr):
    used_corp = self._preset_corp if self._preset_corp is not None else maincorp
    value_norm = value if value_subformat in ('regexp', 'advanced') else re_escape(value)
    icase = '(?i)' if value_subformat in ('simple_ic',) else ''
    rels = defaultdict(lambda: set())
    try:
        conc = get_conc(
            used_corp, user_id,
            (f'aword,[{self._conf["attr1"]}="{icase}{value_norm}" | '
             f'{self._conf["attr2"]}="{icase}{value_norm}"]',))
        conc.sync()
        mlargs = dict(ml1attr=self._conf["attr1"], ml2attr=self._conf["attr2"])
        fcrit = multi_level_crit(2, **mlargs)
        data = self._freq_dist(corp=used_corp, conc=conc, fcrit=fcrit, user_id=user_id)
        for item in data:
            attr1, attr2 = self._normalize_multivalues(
                used_corp, value_norm, *(tuple([w['n'] for w in item['Word']])[:2]))
            rels[attr1].add(attr2)
    except RuntimeError as ex:
        msg = str(ex).lower()
        if 'syntax error' not in msg:
            raise ex
    return dict(
        attrs=(self._conf['attr1'], self._conf['attr2']),
        data=dict((k, l10n.sort(v, ui_lang, key=lambda itm: itm, reverse=False))
                  for k, v in rels.items()))
def _export_attr_values(
        self, data: Dict[StructAttr, Set[AttrValue]], total_poscount: int,
        aligned_corpora: List[str], expand_attrs: List[StructAttr],
        collator_locale: str, max_attr_list_size: Optional[int]) -> AttrValuesResponse:
    exported = AttrValuesResponse(attr_values={}, aligned=aligned_corpora, poscount=total_poscount)
    for struct_attr, attr_values in data.items():
        if max_attr_list_size is None or len(attr_values) <= max_attr_list_size or struct_attr in expand_attrs:
            out_data = l10n.sort(attr_values, collator_locale, key=lambda t: t[0])
            exported.attr_values[struct_attr.key()] = out_data
        else:
            exported.attr_values[struct_attr.key()] = {'length': len(attr_values)}
    return exported
def _export_attr_values(self, data, aligned_corpora, expand_attrs, collator_locale, max_attr_list_size):
    values = {}
    for k, v in data.items():
        if isinstance(v, Iterable):
            if max_attr_list_size is None or len(v) <= max_attr_list_size or k in expand_attrs:
                out_data = l10n.sort(v, collator_locale, key=lambda t: t[0])
                values[self.export_key(k)] = [AttrValue(*av) for av in out_data]
            else:
                values[self.export_key(k)] = {'length': len(v)}
        else:
            values[self.export_key(k)] = v
    return AttrValuesResponse(attr_values=values, aligned=aligned_corpora, poscount=values['poscount'])
def _export_attr_values(self, data, aligned_corpora, expand_attrs, collator_locale, max_attr_list_size):
    values = {}
    exported = dict(attr_values=values, aligned=aligned_corpora)
    for k in data.keys():
        if isinstance(data[k], Iterable):
            # the None test must come first, otherwise the size comparison
            # fails with a TypeError when the list size is unlimited
            if max_attr_list_size is None or len(data[k]) <= max_attr_list_size or k in expand_attrs:
                out_data = l10n.sort(data[k], collator_locale, key=lambda t: t[0])
                values[self.export_key(k)] = out_data
            else:
                values[self.export_key(k)] = {'length': len(data[k])}
        else:
            values[self.export_key(k)] = data[k]
    exported['poscount'] = values['poscount']
    return exported
def require_existing_wordlist(
        form: WordlistFormArgs, wlsort: str, reverse: bool, offset: int, limit: int,
        collator_locale: str) -> Tuple[int, List[Tuple[str, int]]]:
    path = _create_cache_path(form)
    if not os.path.exists(path):
        raise WordlistResultNotFound('The result does not exist')
    else:
        if wlsort == 'f':
            total, rows = load_cached_full(path)
            return (total,
                    sorted(rows, key=lambda x: x[1], reverse=reverse)[offset:offset + limit])
        else:
            total, rows = load_cached_full(path)
            rows = l10n.sort(rows, key=lambda x: x[0], loc=collator_locale, reverse=reverse)
            return total, rows[offset:offset + limit]
def _export_attr_values(self, data, aligned_corpora, expand_attrs, collator_locale, max_attr_list_size):
    values = {}
    exported = dict(attr_values=values, aligned=aligned_corpora)
    for k in list(data.keys()):
        if isinstance(data[k], Iterable):
            if max_attr_list_size is None or len(data[k]) <= max_attr_list_size or k in expand_attrs:
                out_data = l10n.sort(data[k], collator_locale, key=lambda t: t[0])
                values[self.export_key(k)] = out_data
            else:
                values[self.export_key(k)] = {'length': len(data[k])}
        else:
            values[self.export_key(k)] = data[k]
    exported['poscount'] = values['poscount']
    return exported
def _export_subcorpora_list(self, corpname, out):
    """
    Updates passed dictionary by information about available sub-corpora.
    Listed values depend on current user and corpus.
    If there is a list already present in 'out' then it is extended by the new values.

    arguments:
    corpname -- corpus id
    out -- a dictionary used by templating system
    """
    basecorpname = corpname.split(':')[0]
    subcorp_list = l10n.sort(self.cm.subcorp_names(basecorpname),
                             loc=self.ui_lang, key=lambda x: x['n'])
    if len(subcorp_list) > 0:
        subcorp_list = [{'n': '--%s--' % _('whole corpus'), 'v': ''}] + subcorp_list
    if out.get('SubcorpList', None) is None:
        out['SubcorpList'] = []
    out['SubcorpList'].extend(subcorp_list)
def texttype_values(corp, subcorpattrs, maxlistsize, shrink_list=False, collator_locale=None):
    """
    arguments:
    corp -- manatee.Corpus
    subcorpattrs -- structures and attributes to be processed (see Manatee's SUBCORPATTRS)
    maxlistsize -- in case there is more than this number of items, an empty list will be returned
    shrink_list -- list/tuple of attributes we want to return empty lists for (False can be
                   used to specify an empty value)
    collator_locale -- a collator used to sort attribute values (en_US is the default)

    returns:
    a list containing following dictionaries
    { 'Line' : [
        { 'attr_doc_label' : '', 'Values' : [ {'v', 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        { 'attr_doc_label' : '', 'Values' : [ {'v', 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        ...
    ]}
    """
    if subcorpattrs == '#':
        return []
    attrlines = []
    if shrink_list is False:
        shrink_list = ()
    for subcorpline in subcorpattrs.split(','):
        attrvals = []
        for n in subcorpline.split('|'):
            if n in ('', '#'):
                continue
            attr = corp.get_attr(n)
            attrval = {
                'name': n,
                'label': corp.get_conf(n + '.LABEL') or n,
                'attr_doc': corp.get_conf(n + '.ATTRDOC'),
                'attr_doc_label': corp.get_conf(n + '.ATTRDOCLABEL'),
                'numeric': conf_bool(corp.get_conf(n + '.NUMERIC'))
            }
            hsep = corp.get_conf(n + '.HIERARCHICAL')
            multisep = corp.get_conf(n + '.MULTISEP')
            is_multival = corp.get_conf(n + '.MULTIVAL') in ('y', 'yes')
            if (not hsep and (corp.get_conf(n + '.TEXTBOXLENGTH')
                              or attr.id_range() > maxlistsize or n in shrink_list)):
                attrval['textboxlength'] = (corp.get_conf(n + '.TEXTBOXLENGTH') or 24)
            else:  # list of values
                if conf_bool(corp.get_conf(n + '.NUMERIC')):
                    vals = []
                    for i in range(attr.id_range()):
                        try:
                            vals.append({'v': int(attr.id2str(i))})
                        except ValueError:  # a non-numeric value
                            vals.append({'v': attr.id2str(i)})
                elif hsep:  # hierarchical
                    vals = [{'v': attr.id2str(i)} for i in range(attr.id_range())
                            if multisep not in attr.id2str(i)]
                else:
                    if is_multival:
                        raw_vals = [import_string(attr.id2str(i), from_encoding=corp.get_conf('ENCODING'))
                                    .split(multisep) for i in range(attr.id_range())]
                        vals = [{'v': x} for x in sorted(set([s for subl in raw_vals for s in subl]))]
                    else:
                        vals = [{'v': import_string(attr.id2str(i), from_encoding=corp.get_conf('ENCODING'))}
                                for i in range(attr.id_range())]

                if hsep:  # hierarchical
                    attrval['hierarchical'] = hsep
                    attrval['Values'] = _get_attr_hierarchy(vals, hsep)
                elif conf_bool(corp.get_conf(n + '.NUMERIC')):
                    attrval['Values'] = sorted(vals, key=lambda item: item['v'])
                elif collator_locale:
                    attrval['Values'] = l10n.sort(vals, collator_locale, key=lambda item: item['v'])
                else:
                    attrval['Values'] = sorted(vals, cmp=lambda x1, x2: cmp(x1['v'].lower(), x2['v'].lower()))
            attrvals.append(attrval)
        attrlines.append({'Line': attrvals})
    return attrlines
def get_attr_values(self, corpus, attr_map, aligned_corpora=None):
    """
    Finds all the available values of remaining attributes according to the
    provided attr_map and aligned_corpora

    arguments:
    corpus -- manatee.corpus object
    attr_map -- a dictionary of attributes and values as selected by a user
    aligned_corpora -- a list/tuple of corpora names aligned to the base one (the 'corpus' argument)

    returns:
    a dictionary containing matching attributes and values
    """
    corpname = vanilla_corpname(corpus.corpname)
    corpus_info = self.corparch.get_corpus_info(corpname)
    bib_label = LiveAttributes.import_key(corpus_info.metadata.label_attr)
    bib_id = LiveAttributes.import_key(corpus_info.metadata.id_attr)
    attrs = self._get_subcorp_attrs(corpus)

    if bib_label and bib_label not in attrs:
        attrs.append(bib_label)

    srch_attrs = set(attrs) - set(attr_map.keys())
    srch_attrs.add('poscount')
    hidden_attrs = set()
    if bib_id is not None and bib_id not in srch_attrs:
        hidden_attrs.add(bib_id)
    if not bib_id:
        hidden_attrs.add('id')
    selected_attrs = tuple(srch_attrs.union(hidden_attrs))
    srch_attr_map = dict([(x[1], x[0]) for x in enumerate(selected_attrs)])

    attr_items = AttrArgs(attr_map, self.empty_val_placeholder)
    where_sql, where_values = attr_items.export_sql('t1', corpname)

    join_sql = []
    i = 2
    for item in aligned_corpora:
        join_sql.append('JOIN item AS t%d ON t1.item_id = t%d.item_id' % (i, i))
        where_sql += ' AND t%d.corpus_id = ?' % i
        where_values.append(item)
        i += 1

    if len(where_sql) > 0:
        sql_template = "SELECT DISTINCT %s FROM item AS t1 %s WHERE %s" \
                       % (', '.join(self.apply_prefix(selected_attrs, 't1')), ' '.join(join_sql), where_sql)
    else:
        sql_template = "SELECT DISTINCT %s FROM item AS t1 %s " \
                       % (', '.join(self.apply_prefix(selected_attrs, 't1')), ' '.join(join_sql))

    ans = {}
    ans.update(attr_map)
    for attr in srch_attrs:
        if attr in ('poscount',):
            ans[attr] = 0
        else:
            ans[attr] = set()

    for item in self.db(corpname).execute(sql_template, *where_values).fetchall():
        for attr in selected_attrs:
            v = item[srch_attr_map[attr]]
            if v is not None and attr not in hidden_attrs:
                if attr == bib_label:
                    ans[attr].add((self.shorten_value(unicode(v)),
                                   item[srch_attr_map[bib_id]], unicode(v)))
                elif type(ans[attr]) is set:
                    ans[attr].add((self.shorten_value(v), v, v))
                elif type(ans[attr]) is int:
                    ans[attr] += int(v)

    exported = {}
    collator_locale = corpus_info.collator_locale
    for k in ans.keys():
        if type(ans[k]) is set:
            if len(ans[k]) <= self.max_attr_list_size:
                if k == bib_label:
                    out_data = l10n.sort(ans[k], collator_locale, key=lambda t: t[0])
                else:
                    out_data = tuple(l10n.sort(ans[k], collator_locale, key=lambda t: t[0]))
                exported[self.export_key(k)] = out_data
            else:
                exported[self.export_key(k)] = {'length': len(ans[k])}
        else:
            exported[self.export_key(k)] = ans[k]
    exported['aligned'] = aligned_corpora
    return exported
def xfreq_dist(self, crit, limit=1, sortkey='f', ml='', ftt_include_empty='', rel_mode=0,
               collator_locale='en_US'):
    """
    Calculates data (including data for visual output) of a frequency distribution
    specified by the 'crit' parameter

    arguments:
    crit -- specified criteria (CQL)
    limit -- str type!, minimal frequency accepted, this value is exclusive! (i.e. accepted
             values must be greater than the limit)
    sortkey -- a key according to which the distribution will be sorted
    ml -- str, if non-empty then multi-level freq. distribution is generated
    ftt_include_empty -- str, TODO
    rel_mode -- {0, 1}, TODO
    """
    # ml = determines how the bar appears (multilevel x text type)
    # import math
    normwidth_freq = 100
    normwidth_rel = 100

    def calc_scale(freqs, norms):
        """
        Create proper scaling coefficients for freqs and norms
        to match a 100 units length bar.
        """
        from operator import add
        sumn = float(reduce(add, norms))
        if sumn == 0:
            return float(normwidth_rel) / max(freqs), 0
        else:
            sumf = float(reduce(add, freqs))
            corr = min(sumf / max(freqs), sumn / max(norms))
            return normwidth_rel / sumf * corr, normwidth_rel / sumn * corr

    def label(attr):
        if '/' in attr:
            attr = attr[:attr.index('/')]
        lab = self.pycorp.get_conf(attr + '.LABEL')
        return self.import_string(lab if lab else attr)

    words = manatee.StrVector()
    freqs = manatee.NumVector()
    norms = manatee.NumVector()
    self.pycorp.freq_dist(self.RS(), crit, limit, words, freqs, norms)
    words = [self.import_string(w) for w in words]
    if not len(freqs):
        return {}
    # now we intentionally rewrite norms as filled in by freq_dist()
    # because of "hard to explain" metrics they lead to
    if rel_mode == 0:
        norms2_dict = self.get_attr_values_sizes(crit)
        norms = [norms2_dict.get(x, 0) for x in words]
    sumf = float(sum([x for x in freqs]))
    attrs = crit.split()
    head = [dict(n=label(attrs[x]), s=x / 2) for x in range(0, len(attrs), 2)]
    head.append(dict(n=translate('Freq'), s='freq', title=translate('Frequency')))

    tofbar, tonbar = calc_scale(freqs, norms)
    if tonbar and not ml:
        maxf = max(freqs)  # because of bar height
        minf = min(freqs)
        maxrel = 0  # because of bar width
        for index, (f, nf) in enumerate(zip(freqs, norms)):
            if nf == 0:
                nf = 100000
                norms[index] = 100000
            newrel = (f * tofbar / (nf * tonbar))
            if maxrel < newrel:
                maxrel = newrel
        if rel_mode == 0:
            head.append(dict(
                n='i.p.m.',
                title=translate('instances per million positions (refers to the respective category)'),
                s='rel'
            ))
        else:
            head.append(dict(n='Freq [%]', title='', s='rel'))

        lines = []
        for w, f, nf in zip(words, freqs, norms):
            w = self.import_string(w)
            rel_norm_freq = {
                0: round(f * 1e6 / nf, 2),
                1: round(f / sumf * 100, 2)
            }[rel_mode]
            rel_bar = {
                0: 1 + int(f * tofbar * normwidth_rel / (nf * tonbar * maxrel)),
                1: 1 + int(float(f) / maxf * normwidth_rel)
            }[rel_mode]
            freq_bar = {
                0: int(normwidth_freq * float(f) / (maxf - minf + 1) + 1),
                1: 10
            }[rel_mode]
            lines.append(dict(
                Word=[{'n': ' '.join(n.split('\v'))} for n in w.split('\t')],
                freq=f,
                fbar=int(f * tofbar) + 1,
                norm=nf,
                nbar=int(nf * tonbar),
                relbar=rel_bar,
                norel=ml,
                freqbar=freq_bar,
                rel=rel_norm_freq
            ))
    else:
        lines = []
        for w, f, nf in zip(words, freqs, norms):
            w = self.import_string(w)
            lines.append(dict(
                Word=[{'n': ' '.join(n.split('\v'))} for n in w.split('\t')],
                freq=f,
                fbar=int(f * tofbar) + 1,
                norel=1,
                relbar=None
            ))

    if ftt_include_empty and limit == 0 and '.' in attrs[0]:
        attr = self.pycorp.get_attr(attrs[0])
        all_vals = [attr.id2str(i) for i in range(attr.id_range())]
        used_vals = [line['Word'][0]['n'] for line in lines]
        for v in all_vals:
            if v in used_vals:
                continue
            lines.append(dict(
                Word=[{'n': self.import_string(v)}],
                freq=0, rel=0, norm=0, nbar=0, relbar=0, norel=ml, freqbar=0, fbar=0
            ))

    if (sortkey in ('0', '1', '2')) and (int(sortkey) < len(lines[0]['Word'])):
        sortkey = int(sortkey)
        lines = l10n.sort(lines, loc=collator_locale, key=lambda v: v['Word'][sortkey]['n'])
    else:
        if sortkey not in ('freq', 'rel'):
            sortkey = 'freq'
        lines = sorted(lines, key=lambda v: v[sortkey], reverse=True)
    return dict(Head=head, Items=lines)
def sort(self, plugin_api, data, *fields):
    def corp_cmp_key(c):
        return c.get('name') if c.get('name') is not None else ''
    return l10n.sort(data, loc=plugin_api.user_lang, key=corp_cmp_key)
def search(self, plugin_api, query, offset=0, limit=None, filter_dict=None):
    external_keywords = filter_dict.getlist('keyword')
    external_keywords = self._corparch.map_external_keywords(external_keywords, plugin_api.user_lang)
    if len(external_keywords) != 0:
        query_substrs = []
        query_keywords = external_keywords + [self.default_label]
    else:
        if self.SESSION_KEYWORDS_KEY not in plugin_api.session:
            plugin_api.session[self.SESSION_KEYWORDS_KEY] = [self.default_label]
        initial_query = query
        if query is False:
            query = ''
        query_substrs, query_keywords = parse_query(self._tag_prefix, query)
        if len(query_keywords) == 0 and initial_query is False:
            query_keywords = plugin_api.session[self.SESSION_KEYWORDS_KEY]
        else:
            plugin_api.session[self.SESSION_KEYWORDS_KEY] = query_keywords

    query = ' '.join(query_substrs) \
            + ' ' + ' '.join('%s%s' % (self._tag_prefix, s) for s in query_keywords)

    ans = {'rows': []}
    permitted_corpora = self._auth.permitted_corpora(plugin_api.user_dict)

    if filter_dict.get('minSize'):
        min_size = l10n.desimplify_num(filter_dict.get('minSize'), strict=False)
    else:
        min_size = 0
    if filter_dict.get('maxSize'):
        max_size = l10n.desimplify_num(filter_dict.get('maxSize'), strict=False)
    else:
        max_size = None
    sorting_field = filter_dict.get('sortBySize', 'name')

    if offset is None:
        offset = 0
    else:
        offset = int(offset)
    if limit is None:
        limit = int(self._corparch.max_page_size)
    else:
        limit = int(limit)

    user_items = self._corparch.user_items.get_user_items(plugin_api)

    def fav_id(corpus_id):
        for item in user_items:
            if item.is_single_corpus and item.main_corpus_id == corpus_id:
                return item.ident
        return None

    query_substrs, query_keywords = parse_query(self._tag_prefix, query)
    all_keywords_map = dict(self._corparch.all_keywords(plugin_api.user_lang))
    normalized_query_substrs = [s.lower() for s in query_substrs]
    used_keywords = set()

    for corp in self._corparch.get_list(plugin_api):
        full_data = self._corparch.get_corpus_info(plugin_api.user_lang, corp['id'])
        if not isinstance(full_data, BrokenCorpusInfo):
            keywords = [k for k, _ in full_data.metadata.keywords]
            tests = []
            found_in = []
            tests.extend([k in keywords for k in query_keywords])
            for s in normalized_query_substrs:
                # the name must be tested first to prevent the list 'found_in'
                # from being filled in case the item matches both name and description
                if s in corp['name'].lower():
                    tests.append(True)
                elif s in (corp['desc'].lower() if corp['desc'] else ''):
                    tests.append(True)
                    found_in.append('defaultCorparch__found_in_desc')
                else:
                    tests.append(False)
            tests.append(self.matches_size(corp, min_size, max_size))
            tests.append(self._corparch.custom_filter(self._plugin_api, full_data, permitted_corpora))

            if all(test for test in tests):
                corp['size_info'] = l10n.simplify_num(corp['size']) if corp['size'] else None
                corp['keywords'] = [(k, all_keywords_map[k]) for k in keywords]
                corp['found_in'] = found_in
                corp['fav_id'] = fav_id(corp['id'])
                # because of client-side fav/feat/search items compatibility
                corp['corpus_id'] = corp['id']
                corp['pmltq'] = full_data['pmltq']
                corp['repo'] = full_data['web']
                corp['access'] = full_data['access']
                corp['tokenConnect'] = full_data['token_connect']['providers']
                ans['rows'].append(corp)
                used_keywords.update(keywords)
                if not self.should_fetch_next(ans, offset, limit):
                    break
    ans['rows'], ans['nextOffset'] = self.cut_result(
        self.sort(plugin_api, ans['rows'], field=sorting_field), offset, limit)
    ans['keywords'] = l10n.sort(used_keywords, loc=plugin_api.user_lang)
    ans['query'] = query
    ans['current_keywords'] = query_keywords
    ans['filters'] = dict(filter_dict)
    return ans
def get_attr_values(self, corpus, attr_map, aligned_corpora=None):
    """
    Finds all the available values of remaining attributes according to the
    provided attr_map and aligned_corpora

    arguments:
    corpus -- manatee.corpus object
    attr_map -- a dictionary of attributes and values as selected by a user
    aligned_corpora -- a list/tuple of corpora names aligned to the base one (the 'corpus' argument)

    returns:
    a dictionary containing matching attributes and values
    """
    corpname = vanilla_corpname(corpus.corpname)
    corpus_info = self.corparch.get_corpus_info(corpname)
    bib_label = LiveAttributes.import_key(corpus_info.metadata.label_attr)
    bib_id = LiveAttributes.import_key(corpus_info.metadata.id_attr)
    attrs = self._get_subcorp_attrs(corpus)

    if bib_label and bib_label not in attrs:
        attrs.append(bib_label)

    srch_attrs = set(attrs) - set(self.import_key(k) for k in attr_map.keys()
                                  if type(attr_map[k]) is not dict)
    srch_attrs.add('poscount')
    hidden_attrs = set()
    if bib_id is not None and bib_id not in srch_attrs:
        hidden_attrs.add(bib_id)
    if not bib_id:
        hidden_attrs.add('id')
    selected_attrs = tuple(srch_attrs.union(hidden_attrs))
    # a map [db_col_name]=>[db_col_idx]
    srch_attr_map = dict([(x[1], x[0]) for x in enumerate(selected_attrs)])

    attr_items = AttrArgs(attr_map, self.empty_val_placeholder)
    where_sql, where_values = attr_items.export_sql('t1', corpname)

    join_sql = []
    i = 2
    for item in aligned_corpora:
        join_sql.append('JOIN item AS t%d ON t1.item_id = t%d.item_id' % (i, i))
        where_sql += ' AND t%d.corpus_id = ?' % i
        where_values.append(item)
        i += 1

    if len(where_sql) > 0:
        sql_template = "SELECT DISTINCT %s FROM item AS t1 %s WHERE %s" \
                       % (', '.join(self.apply_prefix(selected_attrs, 't1')), ' '.join(join_sql), where_sql)
    else:
        sql_template = "SELECT DISTINCT %s FROM item AS t1 %s " \
                       % (', '.join(self.apply_prefix(selected_attrs, 't1')), ' '.join(join_sql))

    ans = {}
    # already selected items are part of the answer; no need to fetch them from db
    ans.update(dict([(self.import_key(k), v) for k, v in attr_map.items()]))
    range_attrs = set()
    for attr in ans.keys():
        if type(ans[attr]) is dict:
            ans[attr] = set()  # currently we throw away the range and load all the stuff
            range_attrs.add(attr)
    for attr in srch_attrs:
        if attr in ('poscount',):
            ans[attr] = 0
        else:
            ans[attr] = set()

    poscounts = defaultdict(lambda: defaultdict(lambda: 0))
    max_visible_chars = self.calc_max_attr_val_visible_chars(corpus_info)
    for item in self.db(corpname).execute(sql_template, *where_values).fetchall():
        for attr in selected_attrs:
            v = item[srch_attr_map[attr]]
            if v is not None and attr not in hidden_attrs:
                attr_val = None
                if attr == bib_label:
                    attr_val = (self.shorten_value(unicode(v), length=max_visible_chars),
                                item[srch_attr_map[bib_id]], unicode(v))
                elif type(ans[attr]) is set:
                    attr_val = (self.shorten_value(unicode(v), length=max_visible_chars), v, v)
                elif type(ans[attr]) is int:
                    ans[attr] += int(v)
                if attr_val is not None:
                    poscounts[attr][attr_val] += item['poscount']

    # here we append position count information to the respective items
    for attr, v in poscounts.items():
        for k, c in v.items():
            ans[attr].add(k + (l10n.format_number(c),))
        del poscounts[attr]

    exported = {}
    collator_locale = corpus_info.collator_locale
    for k in ans.keys():
        if type(ans[k]) is set:
            if len(ans[k]) <= self.max_attr_list_size or k in range_attrs:
                if k == bib_label:
                    out_data = l10n.sort(ans[k], collator_locale, key=lambda t: t[0])
                else:
                    out_data = tuple(l10n.sort(ans[k], collator_locale, key=lambda t: t[0]))
                exported[self.export_key(k)] = out_data
            else:
                exported[self.export_key(k)] = {'length': len(ans[k])}
        else:
            exported[self.export_key(k)] = ans[k]
    exported['poscount'] = l10n.format_number(exported['poscount'])
    exported['aligned'] = aligned_corpora
    return exported
def subcorp_list(self, request):
    """
    Displays a list of user subcorpora. In case there is a 'subc_restore' plug-in
    installed then the list is enriched by additional re-use/undelete information.
    """
    self.disabled_menu_items = (MainMenu.VIEW, MainMenu.FILTER, MainMenu.FREQUENCY,
                                MainMenu.COLLOCATIONS, MainMenu.SAVE, MainMenu.CONCORDANCE)

    sort = 'n'  # TODO
    show_deleted = int(request.args.get('show_deleted', 0))
    current_corp = self.args.corpname
    if self.get_http_method() == 'POST':
        selected_subc = request.form.getlist('selected_subc')
        self._delete_subcorpora(selected_subc)

    data = []
    for corp in plugins.get('auth').permitted_corpora(self._session_get('user', 'id')).values():
        try:
            self.cm.get_Corpus(corp)
            basecorpname = corp.split(':')[0]
            for item in self.cm.subcorp_names(basecorpname):
                sc = self.cm.get_Corpus(corp, item['n'])
                subc_id = '%s:%s' % (corp, item['n'])
                data.append({
                    'n': subc_id,
                    'v': item['n'],
                    'size': sc.search_size(),
                    'created': sc.created,
                    'corpname': corp,
                    'usesubcorp': item['n'],
                    'deleted': False
                })
        except Exception as e:
            for d in data:
                # permitted_corpora does this
                d['usesubcorp'] = werkzeug.urls.url_quote(d['usesubcorp'], unsafe='+')
            logging.getLogger(__name__).warn(
                'Failed to fetch information about subcorpus of [%s]: %s' % (corp, e))

    if plugins.has_plugin('subc_restore'):
        try:
            full_list = plugins.get('subc_restore').extend_subc_list(
                data, self._session_get('user', 'id'), bool(show_deleted), 0)
        except Exception as e:
            logging.getLogger(__name__).error('subc_restore plug-in failed to list queries: %s' % e)
            full_list = []
    else:
        full_list = data

    # TODO sorting does not work
    sort_key, rev = Kontext._parse_sorting_param(sort)
    if sort_key in ('size', 'created'):
        data = sorted(data, key=lambda x: x[sort_key], reverse=rev)
    else:
        data = l10n.sort(data, loc=self.ui_lang, key=lambda x: x[sort_key], reverse=rev)

    sort_keys = dict([(x, (x, '')) for x in ('n', 'size', 'created')])
    if not rev:
        sort_keys[sort_key] = ('-%s' % sort_key, '↑')
    else:
        sort_keys[sort_key] = (sort_key, '↓')

    # this is necessary to reset manatee module back to its original state
    self.cm.get_Corpus(current_corp)

    ans = {
        'subcorp_list': full_list,
        'sort_keys': sort_keys,
        'show_deleted': show_deleted,
        'rev': rev
    }
    self._export_subcorpora_list(ans)
    return ans
def subcorp_list(self, request):
    """
    Displays a list of user subcorpora. In case there is a 'subc_restore' plug-in
    installed then the list is enriched by additional re-use/undelete information.
    """
    self.disabled_menu_items = (MainMenu.VIEW, MainMenu.FILTER, MainMenu.FREQUENCY,
                                MainMenu.COLLOCATIONS, MainMenu.SAVE, MainMenu.CONCORDANCE)

    sort = 'n'  # TODO
    show_deleted = int(request.args.get('show_deleted', 0))
    if self.get_http_method() == 'POST':
        selected_subc = request.form.getlist('selected_subc')
        self._delete_subcorpora(selected_subc)

    data = []
    user_corpora = plugins.get('auth').permitted_corpora(self._session_get('user', 'id')).values()
    for corp in user_corpora:
        try:
            for item in self.cm.subcorp_names(corp):
                sc = self.cm.get_Corpus(corp, item['n'])
                data.append({
                    'n': '%s:%s' % (self._canonical_corpname(corp), item['n']),
                    'internal_n': '%s:%s' % (corp, item['n']),
                    'v': item['n'],
                    'size': sc.search_size(),
                    'created': sc.created,
                    'corpname': corp,
                    'human_corpname': sc.get_conf('NAME'),
                    'usesubcorp': item['n'],
                    'deleted': False
                })
        except Exception as e:
            for d in data:
                # permitted_corpora does this
                d['usesubcorp'] = werkzeug.urls.url_quote(d['usesubcorp'], unsafe='+')
            logging.getLogger(__name__).warn(
                'Failed to fetch information about subcorpus of [%s]: %s' % (corp, e))

    if plugins.has_plugin('subc_restore'):
        try:
            full_list = plugins.get('subc_restore').extend_subc_list(
                data, self._session_get('user', 'id'), self._canonical_corpname,
                bool(show_deleted), 0)
        except Exception as e:
            logging.getLogger(__name__).error('subc_restore plug-in failed to list queries: %s' % e)
            full_list = []
    else:
        full_list = data

    # TODO sorting does not work
    sort_key, rev = Kontext._parse_sorting_param(sort)
    if sort_key in ('size', 'created'):
        data = sorted(data, key=lambda x: x[sort_key], reverse=rev)
    else:
        data = l10n.sort(data, loc=self.ui_lang, key=lambda x: x[sort_key], reverse=rev)

    sort_keys = dict([(x, (x, '')) for x in ('n', 'size', 'created')])
    if not rev:
        sort_keys[sort_key] = ('-%s' % sort_key, '↑')
    else:
        sort_keys[sort_key] = (sort_key, '↓')

    ans = {
        'SubcorpList': [],  # this is used by subcorpus SELECT element; no need for that here
        'subcorp_list': full_list,
        'sort_keys': sort_keys,
        'show_deleted': show_deleted,
        'rev': rev
    }
    return ans
def list(self, request: Request) -> Dict[str, Any]:
    """
    Displays a list of user subcorpora. In case there is a 'subc_restore' plug-in
    installed then the list is enriched by additional re-use/undelete information.
    """
    self.disabled_menu_items = (MainMenu.VIEW, MainMenu.FILTER, MainMenu.FREQUENCY,
                                MainMenu.COLLOCATIONS, MainMenu.SAVE, MainMenu.CONCORDANCE)

    filter_args = dict(show_deleted=bool(int(request.args.get('show_deleted', 0))),
                       corpname=request.args.get('corpname'))
    data = []
    user_corpora = list(plugins.runtime.AUTH.instance.permitted_corpora(self.session_get('user')))
    related_corpora = set()
    for corp in user_corpora:
        for item in self.user_subc_names(corp):
            try:
                sc = self.cm.get_corpus(corp, subcname=item['n'], decode_desc=False)
                data.append({
                    'name': '%s / %s' % (corp, item['n']),
                    'size': sc.search_size,
                    'created': time.mktime(sc.created.timetuple()),
                    'corpname': corp,
                    'human_corpname': sc.get_conf('NAME'),
                    'usesubcorp': sc.subcname,
                    'orig_subcname': sc.orig_subcname,
                    'deleted': False,
                    'description': sc.description,
                    'published': sc.is_published
                })
                related_corpora.add(corp)
            except RuntimeError as e:
                logging.getLogger(__name__).warning(
                    'Failed to fetch information about subcorpus {0}:{1}: {2}'.format(corp, item['n'], e))

    if filter_args['corpname']:
        data = [item for item in data
                if not filter_args['corpname'] or item['corpname'] == filter_args['corpname']]
    elif filter_args['corpname'] is None:
        filter_args['corpname'] = ''  # JS code requires non-null value

    if plugins.runtime.SUBC_RESTORE.exists:
        with plugins.runtime.SUBC_RESTORE as sr:
            try:
                full_list = sr.extend_subc_list(self._plugin_ctx, data, filter_args, 0)
            except Exception as e:
                logging.getLogger(__name__).error(
                    'subc_restore plug-in failed to list queries: %s' % e)
                full_list = data
    else:
        full_list = data

    sort = request.args.get('sort', '-created')
    sort_key, rev = self._parse_sorting_param(sort)
    if sort_key in ('size', 'created'):
        full_list = sorted(full_list, key=lambda x: x[sort_key], reverse=rev)
    else:
        full_list = l10n.sort(full_list, loc=self.ui_lang, key=lambda x: x[sort_key], reverse=rev)

    ans = dict(
        SubcorpList=[],  # this is used by subcorpus SELECT element; no need for that here
        subcorp_list=full_list,
        sort_key=dict(name=sort_key, reverse=rev),
        filter=filter_args,
        processed_subc=[v.to_dict() for v in self.get_async_tasks(
            category=AsyncTaskStatus.CATEGORY_SUBCORPUS)],
        related_corpora=sorted(related_corpora),
        uses_subc_restore=plugins.runtime.SUBC_RESTORE.exists)
    return ans
def search(self, user_id, query, offset=0, limit=None, filter_dict=None):
    ans = {"rows": []}
    permitted_corpora = self._auth.permitted_corpora(user_id)
    user_items = self._user_items.get_user_items(user_id)
    used_keywords = set()
    all_keywords_map = dict(self.all_keywords)

    if filter_dict.get("minSize"):
        min_size = l10n.desimplify_num(filter_dict.get("minSize"), strict=False)
    else:
        min_size = 0
    if filter_dict.get("maxSize"):
        max_size = l10n.desimplify_num(filter_dict.get("maxSize"), strict=False)
    else:
        max_size = None
    corplist = self.get_list(permitted_corpora)

    if offset is None:
        offset = 0
    else:
        offset = int(offset)
    if limit is None:
        limit = self._max_page_size

    def cut_result(res):
        if limit is not None:
            right_lim = offset + int(limit)
            new_res = res[offset:right_lim]
            if right_lim >= len(res):
                right_lim = None
        else:
            right_lim = None
            new_res = res
        return new_res, right_lim

    def is_fav(corpus_id):
        for item in user_items:
            if isinstance(item, CorpusItem) and item.corpus_id == corpus_id:
                return True
        return False

    query_substrs, query_keywords = self._parse_query(query)

    matches_all = lambda d: reduce(lambda t1, t2: t1 and t2, d, True)

    def matches_size(d):
        item_size = d.get("size", None)
        return (item_size is not None
                and (not min_size or int(item_size) >= int(min_size))
                and (not max_size or int(item_size) <= int(max_size)))

    normalized_query_substrs = [s.lower() for s in query_substrs]

    for corp in corplist:
        full_data = self.get_corpus_info(corp["id"], self.getlocal("lang"))
        if not isinstance(full_data, BrokenCorpusInfo):
            keywords = [k for k in full_data["metadata"]["keywords"].keys()]
            hits = []
            found_in = []
            hits.extend([k in keywords for k in query_keywords])
            for s in normalized_query_substrs:
                # the name must be tested first to prevent the list 'found_in'
                # from being filled in case the item matches both name and description
                if s in corp["name"].lower():
                    hits.append(True)
                elif s in (corp["desc"].lower() if corp["desc"] else ""):
                    hits.append(True)
                    found_in.append(_("description"))
                else:
                    hits.append(False)
            hits.append(matches_size(corp))
            hits.append(self.custom_filter(full_data, permitted_corpora))

            if matches_all(hits):
                corp["raw_size"] = l10n.simplify_num(corp["size"]) if corp["size"] else None
                corp["keywords"] = [(k, all_keywords_map[k]) for k in keywords]
                corp["found_in"] = found_in
                corp["user_item"] = is_fav(corp["id"])
                self.customize_search_result_item(corp, full_data)
                ans["rows"].append(corp)
                used_keywords.update(keywords)

    corp_cmp_key = lambda c: c.get("name") if c.get("name") is not None else ""
    ans["rows"], ans["nextOffset"] = cut_result(
        l10n.sort(ans["rows"], loc=self._lang(), key=corp_cmp_key))
    ans["keywords"] = l10n.sort(used_keywords, loc=self._lang())
    ans["query"] = query
    ans["filters"] = dict(filter_dict)
    return ans
def search(self, plugin_api, query, offset=0, limit=None, filter_dict=None):
    if query is False:  # False means 'use default values'
        query = ''
    ans = {'rows': []}
    permitted_corpora = self._auth.permitted_corpora(plugin_api.user_dict)
    used_keywords = set()
    all_keywords_map = dict(self._corparch.all_keywords(plugin_api.user_lang))
    if filter_dict.get('minSize'):
        min_size = l10n.desimplify_num(filter_dict.get('minSize'), strict=False)
    else:
        min_size = 0
    if filter_dict.get('maxSize'):
        max_size = l10n.desimplify_num(filter_dict.get('maxSize'), strict=False)
    else:
        max_size = None
    if filter_dict.get('favOnly'):
        favourite_only = bool(int(filter_dict.get('favOnly')))
    else:
        favourite_only = False
    if offset is None:
        offset = 0
    else:
        offset = int(offset)
    if limit is None:
        limit = int(self._corparch.max_page_size)
    else:
        limit = int(limit)
    user_items = self._corparch.user_items.get_user_items(plugin_api)

    def fav_id(corpus_id):
        for item in user_items:
            if item.is_single_corpus and item.main_corpus_id == corpus_id:
                return item.ident
        return None

    query_substrs, query_keywords = parse_query(self._tag_prefix, query)
    normalized_query_substrs = [s.lower() for s in query_substrs]

    for corp in self._corparch.get_list(plugin_api, permitted_corpora):
        full_data = self._corparch.get_corpus_info(plugin_api.user_lang, corp['id'])
        if not isinstance(full_data, BrokenCorpusInfo):
            if favourite_only and fav_id(corp['id']) is None:
                continue
            keywords = [k for k, _ in full_data.metadata.keywords]
            tests = []
            found_in = []
            tests.extend([k in keywords for k in query_keywords])
            for s in normalized_query_substrs:
                # the name must be tested first to prevent the list 'found_in'
                # from being filled in case the item matches both name and description
                if s in corp['name'].lower():
                    tests.append(True)
                elif s in (corp['desc'].lower() if corp['desc'] else ''):
                    tests.append(True)
                    found_in.append('defaultCorparch__found_in_desc')
                else:
                    tests.append(False)
            tests.append(self.matches_size(corp, min_size, max_size))
            tests.append(self._corparch.custom_filter(self._plugin_api, full_data, permitted_corpora))

            if self.matches_all(tests):
                corp['size_info'] = l10n.simplify_num(corp['size']) if corp['size'] else None
                corp['keywords'] = [(k, all_keywords_map[k]) for k in keywords]
                corp['found_in'] = found_in
                corp['fav_id'] = fav_id(corp['id'])
                # because of client-side fav/feat/search items compatibility
                corp['corpus_id'] = corp['id']
                ans['rows'].append(corp)
                used_keywords.update(keywords)
                if not self.should_fetch_next(ans, offset, limit):
                    break
    ans['rows'], ans['nextOffset'] = self.cut_result(self.sort(plugin_api, ans['rows']), offset, limit)
    ans['keywords'] = l10n.sort(used_keywords, loc=plugin_api.user_lang)
    ans['query'] = query
    ans['current_keywords'] = query_keywords
    ans['filters'] = dict(filter_dict)
    return ans
def subcorp_list(self, request):
    """
    Displays a list of user subcorpora. In case there is a 'subc_restore' plug-in
    installed then the list is enriched by additional re-use/undelete information.
    """
    self.disabled_menu_items = (MainMenu.VIEW, MainMenu.FILTER, MainMenu.FREQUENCY,
                                MainMenu.COLLOCATIONS, MainMenu.SAVE, MainMenu.CONCORDANCE)

    sort = request.args.get('sort', 'name')
    filter_args = dict(show_deleted=bool(int(request.args.get('show_deleted', 0))),
                       corpname=request.args.get('corpname'))
    data = []
    user_corpora = plugins.runtime.AUTH.instance.permitted_corpora(self.session_get('user')).values()
    related_corpora = set()
    for corp in user_corpora:
        try:
            for item in self.cm.subcorp_names(corp):
                sc = self.cm.get_Corpus(corp, item['n'])
                data.append({
                    'name': '%s:%s' % (self._canonical_corpname(corp), item['n']),
                    'size': sc.search_size(),
                    'created': time.mktime(sc.created.timetuple()),
                    'corpname': corp,
                    'human_corpname': sc.get_conf('NAME'),
                    'usesubcorp': item['n'],
                    'deleted': False
                })
                related_corpora.add(self._canonical_corpname(corp))
        except Exception as e:
            for d in data:
                # permitted_corpora does this
                d['usesubcorp'] = werkzeug.urls.url_quote(d['usesubcorp'], unsafe='+')
            logging.getLogger(__name__).warn(
                'Failed to fetch information about subcorpus of [%s]: %s' % (corp, e))

    if filter_args['corpname']:
        data = filter(lambda item: not filter_args['corpname']
                      or item['corpname'] == filter_args['corpname'], data)
    elif filter_args['corpname'] is None:
        filter_args['corpname'] = ''  # JS code requires non-null value

    if plugins.runtime.SUBC_RESTORE.exists:
        try:
            full_list = plugins.runtime.SUBC_RESTORE.instance.extend_subc_list(
                self._plugin_api, data, filter_args, 0)
        except Exception as e:
            logging.getLogger(__name__).error('subc_restore plug-in failed to list queries: %s' % e)
            full_list = data
    else:
        full_list = data

    sort_key, rev = self._parse_sorting_param(sort)
    if sort_key in ('size', 'created'):
        full_list = sorted(full_list, key=lambda x: x[sort_key], reverse=rev)
    else:
        full_list = l10n.sort(full_list, loc=self.ui_lang, key=lambda x: x[sort_key], reverse=rev)

    unfinished_corpora = filter(lambda at: not at.is_finished(),
                                self.get_async_tasks(category=AsyncTaskStatus.CATEGORY_SUBCORPUS))
    ans = dict(
        SubcorpList=[],  # this is used by subcorpus SELECT element; no need for that here
        subcorp_list=full_list,
        sort_key=dict(name=sort_key, reverse=rev),
        filter=filter_args,
        unfinished_subc=[uc.to_dict() for uc in unfinished_corpora],
        related_corpora=sorted(related_corpora))
    return ans
def texttype_values(
        corp: Corpus, subcorpattrs: str, maxlistsize: int,
        shrink_list: Union[Tuple[str, ...], List[str]] = (),
        collator_locale: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    arguments:
    corp -- manatee.Corpus
    subcorpattrs -- structures and attributes to be processed (see Manatee's SUBCORPATTRS)
    maxlistsize -- in case there is more than this number of items, an empty list will be returned
    shrink_list -- list/tuple of attributes we want to return empty lists for (False can be
                   used to specify an empty value)
    collator_locale -- a collator used to sort attribute values (en_US is the default)

    returns:
    a list containing following dictionaries
    { 'Line' : [
        { 'attr_doc_label' : '', 'Values' : [ {'v', 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        { 'attr_doc_label' : '', 'Values' : [ {'v', 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        ...
    ]}

    !!!!!!
    NOTE: avoid calling this method repeatedly for the same corpus as the
    attr = corp.get_attr(n) line is leaking opened files of corpora indexes which
    leads to an exhausted limit for opened files for Gunicorn/Celery after some time.
    KonText caches the value returned by this function to prevent this.

    !!! TODO !!!
    """
    if subcorpattrs == '#':
        return []
    attrlines = []
    if not shrink_list:
        shrink_list = ()
    for subcorpline in subcorpattrs.split(','):
        attrvals = []
        for n in subcorpline.split('|'):
            if n in ('', '#'):
                continue
            attr = corp.get_attr(n)
            attrval = {
                'name': n,
                'label': corp.get_conf(n + '.LABEL') or n,
                'attr_doc': corp.get_conf(n + '.ATTRDOC'),
                'attr_doc_label': corp.get_conf(n + '.ATTRDOCLABEL'),
                'numeric': conf_bool(corp.get_conf(n + '.NUMERIC'))
            }
            hsep = corp.get_conf(n + '.HIERARCHICAL')
            multisep = corp.get_conf(n + '.MULTISEP')
            is_multival = corp.get_conf(n + '.MULTIVAL') in ('y', 'yes')
            if (not hsep and (corp.get_conf(n + '.TEXTBOXLENGTH')
                              or attr.id_range() > maxlistsize or n in shrink_list)):
                attrval['textboxlength'] = (corp.get_conf(n + '.TEXTBOXLENGTH') or 24)
            else:  # list of values
                if conf_bool(corp.get_conf(n + '.NUMERIC')):
                    vals = []
                    for i in range(attr.id_range()):
                        try:
                            vals.append({'v': int(attr.id2str(i))})
                        except ValueError:  # a non-numeric value
                            vals.append({'v': attr.id2str(i)})
                elif hsep:  # hierarchical
                    vals = [{'v': attr.id2str(i)} for i in range(attr.id_range())
                            if multisep not in attr.id2str(i)]
                else:
                    if is_multival:
                        raw_vals = [attr.id2str(i).split(multisep) for i in range(attr.id_range())]
                        vals = [{'v': x} for x in sorted(set([s for subl in raw_vals for s in subl]))]
                    else:
                        vals = [{'v': attr.id2str(i)} for i in range(attr.id_range())]

                if hsep:  # hierarchical
                    attrval['hierarchical'] = hsep
                    attrval['Values'] = _get_attr_hierarchy(vals, hsep)
                elif conf_bool(corp.get_conf(n + '.NUMERIC')):
                    attrval['Values'] = sorted(vals, key=lambda item: item['v'])
                elif collator_locale:
                    attrval['Values'] = l10n.sort(vals, collator_locale, key=lambda item: item['v'])
                else:
                    attrval['Values'] = sorted(
                        vals, key=cmp_to_key(lambda x1, x2: cmp(x1['v'].lower(), x2['v'].lower())))
            attrvals.append(attrval)
        attrlines.append({'Line': attrvals})
    return attrlines
def search(self, plugin_api, query, offset=0, limit=None, filter_dict=None):
    if query is False:  # False means 'use default values'
        query = ''
    if filter_dict.get('minSize'):
        min_size = l10n.desimplify_num(filter_dict.get('minSize'), strict=False)
    else:
        min_size = 0
    if filter_dict.get('maxSize'):
        max_size = l10n.desimplify_num(filter_dict.get('maxSize'), strict=False)
    else:
        max_size = None
    if filter_dict.get('requestable'):
        requestable = bool(int(filter_dict.get('requestable')))
    else:
        requestable = False
    if filter_dict.get('favOnly'):
        favourites_only = bool(int(filter_dict.get('favOnly')))
    else:
        favourites_only = False
    if offset is None:
        offset = 0
    else:
        offset = int(offset)
    if limit is None:
        limit = int(self._corparch.max_page_size)
    else:
        limit = int(limit)
    user_items = self._corparch.user_items.get_user_items(plugin_api)
    favourite_corpora = {item.main_corpus_id: item.ident
                         for item in user_items if item.is_single_corpus}

    def get_found_in(corp, phrases):
        ans = []
        for phrase in phrases:
            phrase = phrase.lower()
            name = corp.name.lower() if corp.name is not None else ''
            desc = corp.description.lower() if corp.description is not None else ''
            if phrase not in name and phrase in desc:
                ans.append('defaultCorparch__found_in_desc')
                break
        return ans

    query_substrs, query_keywords = parse_query(self._tag_prefix, query)
    normalized_query_substrs = [s.lower() for s in query_substrs]
    used_keywords = set()
    rows = list(self._corparch.list_corpora(
        plugin_api, substrs=normalized_query_substrs, min_size=min_size, max_size=max_size,
        requestable=requestable, offset=offset, limit=limit + 1, keywords=query_keywords,
        favourites=tuple(favourite_corpora.keys()) if favourites_only else ()).values())
    ans = []
    for i, corp in enumerate(rows):
        used_keywords.update(corp.keywords)
        corp.keywords = self._corparch.get_l10n_keywords(corp.keywords, plugin_api.user_lang)
        corp.fav_id = favourite_corpora.get(corp.id, None)
        corp.found_in = get_found_in(corp, normalized_query_substrs)
        ans.append(corp.to_dict())
        if i == limit - 1:
            break
    return dict(rows=ans,
                nextOffset=offset + limit if len(rows) > limit else None,
                keywords=l10n.sort(used_keywords, loc=plugin_api.user_lang),
                query=query,
                current_keywords=query_keywords,
                filters=dict(filter_dict))
def subcorp_list(self, request):
    """
    Displays a list of user subcorpora. In case there is a 'subc_restore' plug-in
    installed then the list is enriched by additional re-use/undelete information.
    """
    self.disabled_menu_items = (MainMenu.VIEW, MainMenu.FILTER, MainMenu.FREQUENCY,
                                MainMenu.COLLOCATIONS, MainMenu.SAVE, MainMenu.CONCORDANCE)

    filter_args = dict(show_deleted=bool(int(request.args.get('show_deleted', 0))),
                       corpname=request.args.get('corpname'))
    data = []
    user_corpora = plugins.runtime.AUTH.instance.permitted_corpora(self.session_get('user')).keys()
    related_corpora = set()
    for corp in user_corpora:
        for item in self.user_subc_names(corp):
            try:
                sc = self.cm.get_Corpus(corp, subcname=item['n'], decode_desc=False)
                data.append({
                    'name': '%s / %s' % (corp, item['n']),
                    'size': sc.search_size(),
                    'created': time.mktime(sc.created.timetuple()),
                    'corpname': corp,
                    'human_corpname': sc.get_conf('NAME'),
                    'usesubcorp': sc.subcname,
                    'orig_subcname': sc.orig_subcname,
                    'deleted': False,
                    'description': sc.description,
                    'published': corplib.subcorpus_is_published(sc.spath)
                })
                related_corpora.add(corp)
            except RuntimeError as e:
                logging.getLogger(__name__).warn(
                    u'Failed to fetch information about subcorpus {0}:{1}: {2}'.format(corp, item['n'], e))

    if filter_args['corpname']:
        data = filter(lambda item: not filter_args['corpname']
                      or item['corpname'] == filter_args['corpname'], data)
    elif filter_args['corpname'] is None:
        filter_args['corpname'] = ''  # JS code requires non-null value

    if plugins.runtime.SUBC_RESTORE.exists:
        try:
            full_list = plugins.runtime.SUBC_RESTORE.instance.extend_subc_list(
                self._plugin_api, data, filter_args, 0)
        except Exception as e:
            logging.getLogger(__name__).error(
                'subc_restore plug-in failed to list queries: %s' % e)
            full_list = data
    else:
        full_list = data

    sort = request.args.get('sort', '-created')
    sort_key, rev = self._parse_sorting_param(sort)
    if sort_key in ('size', 'created'):
        full_list = sorted(full_list, key=lambda x: x[sort_key], reverse=rev)
    else:
        full_list = l10n.sort(full_list, loc=self.ui_lang, key=lambda x: x[sort_key], reverse=rev)

    ans = dict(
        SubcorpList=[],  # this is used by subcorpus SELECT element; no need for that here
        subcorp_list=full_list,
        sort_key=dict(name=sort_key, reverse=rev),
        filter=filter_args,
        processed_subc=[v.to_dict() for v in self.get_async_tasks(
            category=AsyncTaskStatus.CATEGORY_SUBCORPUS)],
        related_corpora=sorted(related_corpora),
        uses_subc_restore=plugins.runtime.SUBC_RESTORE.exists
    )
    return ans
def search(self, plugin_api, query, offset=0, limit=None, filter_dict=None):
    if query is False:  # False means 'use default values'
        query = ''
    if filter_dict.get('minSize'):
        min_size = l10n.desimplify_num(filter_dict.get('minSize'), strict=False)
    else:
        min_size = 0
    if filter_dict.get('maxSize'):
        max_size = l10n.desimplify_num(filter_dict.get('maxSize'), strict=False)
    else:
        max_size = None
    if offset is None:
        offset = 0
    else:
        offset = int(offset)
    if limit is None:
        limit = int(self._corparch.max_page_size)
    else:
        limit = int(limit)
    user_items = self._corparch.user_items.get_user_items(plugin_api)

    def fav_id(corpus_id):
        for item in user_items:
            if item.is_single_corpus and item.main_corpus_id == corpus_id:
                return item.ident
        return None

    def get_found_in(corp, phrases):
        ans = []
        for phrase in phrases:
            if phrase in corp.description.lower():
                ans.append(_('description'))
                break
        return ans

    query_substrs, query_keywords = parse_query(self._tag_prefix, query)
    normalized_query_substrs = [s.lower() for s in query_substrs]
    used_keywords = set()
    rows = self._corparch.list_corpora(
        plugin_api, substrs=normalized_query_substrs, min_size=min_size, max_size=max_size,
        offset=offset, limit=limit + 1, keywords=query_keywords).values()
    ans = []
    for i, corp in enumerate(rows):
        used_keywords.update(corp.keywords)
        corp.keywords = self._corparch.get_l10n_keywords(corp.keywords, plugin_api.user_lang)
        corp.fav_id = fav_id(corp.id)
        corp.found_in = get_found_in(corp, normalized_query_substrs)
        ans.append(corp.to_dict())
        if i == limit - 1:
            break
    return dict(rows=ans,
                nextOffset=(limit + 1) if len(rows) > limit else None,
                keywords=l10n.sort(used_keywords, loc=plugin_api.user_lang),
                query=query,
                current_keywords=query_keywords,
                filters=dict(filter_dict))
def search(self, plugin_api, query, offset=0, limit=None, filter_dict=None):
    if query is False:  # False means 'use default values'
        query = ''
    if filter_dict.get('minSize'):
        min_size = l10n.desimplify_num(filter_dict.get('minSize'), strict=False)
    else:
        min_size = 0
    if filter_dict.get('maxSize'):
        max_size = l10n.desimplify_num(filter_dict.get('maxSize'), strict=False)
    else:
        max_size = None
    if filter_dict.get('requestable'):
        requestable = bool(int(filter_dict.get('requestable')))
    else:
        requestable = False
    if offset is None:
        offset = 0
    else:
        offset = int(offset)
    if limit is None:
        limit = int(self._corparch.max_page_size)
    else:
        limit = int(limit)
    user_items = self._corparch.user_items.get_user_items(plugin_api)

    def fav_id(corpus_id):
        for item in user_items:
            if item.is_single_corpus and item.main_corpus_id == corpus_id:
                return item.ident
        return None

    def get_found_in(corp, phrases):
        ans = []
        for phrase in phrases:
            phrase = phrase.lower()
            if phrase not in corp.name.lower() and phrase in corp.description.lower():
                ans.append('defaultCorparch__found_in_desc')
                break
        return ans

    query_substrs, query_keywords = parse_query(self._tag_prefix, query)
    normalized_query_substrs = [s.lower() for s in query_substrs]
    used_keywords = set()
    rows = self._corparch.list_corpora(
        plugin_api, substrs=normalized_query_substrs, min_size=min_size, max_size=max_size,
        requestable=requestable, offset=offset, limit=limit + 1, keywords=query_keywords).values()
    ans = []
    for i, corp in enumerate(rows):
        used_keywords.update(corp.keywords)
        corp.keywords = self._corparch.get_l10n_keywords(corp.keywords, plugin_api.user_lang)
        corp.fav_id = fav_id(corp.id)
        corp.found_in = get_found_in(corp, normalized_query_substrs)
        ans.append(corp.to_dict())
        if i == limit - 1:
            break
    return dict(rows=ans,
                nextOffset=offset + limit if len(rows) > limit else None,
                keywords=l10n.sort(used_keywords, loc=plugin_api.user_lang),
                query=query,
                current_keywords=query_keywords,
                filters=dict(filter_dict))
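The newer `search` variants above share a pagination idiom: they ask the backend for `limit + 1` rows and use the extra row only to decide whether a `nextOffset` should be reported. A stripped-down sketch of the pattern, with `fetch_rows` as a hypothetical stand-in for `list_corpora`:

def paginate(fetch_rows, offset, limit):
    # request one row beyond the page so we can tell whether more data exists
    rows = fetch_rows(offset=offset, limit=limit + 1)
    next_offset = offset + limit if len(rows) > limit else None
    return rows[:limit], next_offset

Note that one of the older variants returns `nextOffset=(limit + 1)` instead of `offset + limit`, which looks like the bug the later revision fixes.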
def xfreq_dist(self, crit, limit=1, sortkey='f', ftt_include_empty: int = 0, rel_mode=0,
               collator_locale='en_US'):
    """
    Calculates data (including data for visual output) of a frequency distribution
    specified by the 'crit' parameter

    arguments:
    crit -- specified criteria (CQL)
    limit -- str type!, minimal frequency accepted, this value is exclusive! (i.e. accepted
             values must be greater than the limit)
    sortkey -- a key according to which the distribution will be sorted
    ftt_include_empty -- str, TODO
    rel_mode -- {0, 1} (0 for structural attrs., 1 for positional ones ??)
    """
    def label(attr):
        if '/' in attr:
            attr = attr[:attr.index('/')]
        lab = self.pycorp.get_conf(attr + '.LABEL')
        return lab if lab else attr

    def export_word(wrd):
        return [{'n': ' '.join(n.split('\v'))} for n in wrd.split('\t')]

    def test_word_empty(wrd):
        return len(wrd) == 1 and (wrd[0]['n'] == '' or wrd[0]['n'] == '===NONE===')

    words = manatee.StrVector()
    freqs = manatee.NumVector()
    norms = manatee.NumVector()
    self.pycorp.freq_dist(self.RS(), crit, limit, words, freqs, norms)
    if len(freqs) == 0:
        return dict(Head=[], Items=[], SkippedEmpty=False, NoRelSorting=True)

    # for structural attrs, we intentionally rewrite norms as filled in by Corpus.freq_dist()
    # because of "hard to explain" metrics they lead to
    if rel_mode == 0:
        norms2_dict = self.get_attr_values_sizes(crit)
        norms = [norms2_dict.get(x, 0) for x in words]
    # For positional attrs, the norm is the size of the actual corpus/subcorpus. Please note that
    # for an "ad hoc" (or unnamed) subcorpus, this may be misleading as we still calculate
    # against the orig. corpus
    else:
        norms = [self.pycorp.search_size for _ in words]

    attrs = crit.split()
    head: List[Dict[str, Any]] = [dict(n=label(attrs[x]), s=x / 2) for x in range(0, len(attrs), 2)]
    head.append(dict(n=translate('Freq'), s='freq', title=translate('Frequency')))
    has_empty_item = False
    head.append(dict(
        n='i.p.m.',
        title=translate('instances per million positions (refers to the respective category)'),
        s='rel'))

    lines = []
    for w, f, nf in zip(words, freqs, norms):
        word = export_word(w)
        if test_word_empty(word):
            has_empty_item = True
            continue
        lines.append(dict(Word=word, freq=f, norm=nf, rel=round(f / nf * 1e6, 2)))

    if ftt_include_empty and limit == 0 and '.' in attrs[0]:
        attr = self.pycorp.get_attr(attrs[0])
        all_vals = [attr.id2str(i) for i in range(attr.id_range())]
        used_vals = [line['Word'][0]['n'] for line in lines]
        for v in all_vals:
            if v in used_vals:
                continue
            lines.append(dict(Word=[{'n': v}], freq=0, rel=0, norm=0))

    if (sortkey in ('0', '1', '2')) and (int(sortkey) < len(lines[0]['Word'])):
        sortkey = int(sortkey)
        lines = l10n.sort(lines, loc=collator_locale, key=lambda v: v['Word'][sortkey]['n'])
    else:
        if sortkey not in ('freq', 'rel'):
            sortkey = 'freq'
        lines = sorted(lines, key=lambda v: v[sortkey], reverse=True)

    return dict(Head=head, Items=lines, SkippedEmpty=has_empty_item, NoRelSorting=bool(rel_mode))
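The `rel` column computed by both `xfreq_dist` versions is i.p.m. (instances per million positions): the raw frequency scaled by the size (`norm`) of the category it occurs in, i.e. `rel = round(f / nf * 1e6, 2)`. A quick worked example:

f, norm = 250, 5_000_000        # 250 hits in a category spanning 5M positions
ipm = round(f / norm * 1e6, 2)  # -> 50.0 i.p.m.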
def search(self, plugin_api, query, offset=0, limit=None, filter_dict=None):
    if query is False:  # False means 'use default values'
        query = ''
    ans = {'rows': []}
    permitted_corpora = self._auth.permitted_corpora(plugin_api.user_dict)
    used_keywords = set()
    all_keywords_map = dict(self._corparch.all_keywords(plugin_api.user_lang))
    if filter_dict.get('minSize'):
        min_size = l10n.desimplify_num(filter_dict.get('minSize'), strict=False)
    else:
        min_size = 0
    if filter_dict.get('maxSize'):
        max_size = l10n.desimplify_num(filter_dict.get('maxSize'), strict=False)
    else:
        max_size = None
    if offset is None:
        offset = 0
    else:
        offset = int(offset)
    if limit is None:
        limit = int(self._corparch.max_page_size)
    else:
        limit = int(limit)
    user_items = self._corparch.user_items.get_user_items(plugin_api)

    def fav_id(corpus_id):
        for item in user_items:
            if item.is_single_corpus and item.main_corpus_id == corpus_id:
                return item.ident
        return None

    query_substrs, query_keywords = parse_query(self._tag_prefix, query)
    normalized_query_substrs = [s.lower() for s in query_substrs]

    for corp in self._corparch.get_list(plugin_api, permitted_corpora):
        full_data = self._corparch.get_corpus_info(plugin_api.user_lang, corp['id'])
        if not isinstance(full_data, BrokenCorpusInfo):
            keywords = [k for k in full_data['metadata']['keywords'].keys()]
            tests = []
            found_in = []
            tests.extend([k in keywords for k in query_keywords])
            for s in normalized_query_substrs:
                # the name must be tested first to prevent the list 'found_in'
                # from being filled in case the item matches both name and description
                if s in corp['name'].lower():
                    tests.append(True)
                elif s in (corp['desc'].lower() if corp['desc'] else ''):
                    tests.append(True)
                    found_in.append('defaultCorparch__found_in_desc')
                else:
                    tests.append(False)
            tests.append(self.matches_size(corp, min_size, max_size))
            tests.append(self._corparch.custom_filter(self._plugin_api, full_data, permitted_corpora))

            if self.matches_all(tests):
                corp['size_info'] = l10n.simplify_num(corp['size']) if corp['size'] else None
                corp['keywords'] = [(k, all_keywords_map[k]) for k in keywords]
                corp['found_in'] = found_in
                corp['fav_id'] = fav_id(corp['id'])
                # because of client-side fav/feat/search items compatibility
                corp['corpus_id'] = corp['id']
                ans['rows'].append(corp)
                used_keywords.update(keywords)
                if not self.should_fetch_next(ans, offset, limit):
                    break
    ans['rows'], ans['nextOffset'] = self.cut_result(self.sort(plugin_api, ans['rows']), offset, limit)
    ans['keywords'] = l10n.sort(used_keywords, loc=plugin_api.user_lang)
    ans['query'] = query
    ans['current_keywords'] = query_keywords
    ans['filters'] = dict(filter_dict)
    return ans