Example #1
 def get_user_items(self, plugin_api):
     ans = []
     if self._auth.anonymous_user()['id'] != plugin_api.user_id:
         for item_id, item in list(self._db.hash_get_all(self._mk_key(plugin_api.user_id)).items()):
             ans.append(import_record(item))
         ans = l10n.sort(ans, plugin_api.user_lang, key=lambda itm: itm.sort_key, reverse=False)
     return ans
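This snippet, like most of the examples below, relies on l10n.sort, which judging from these call sites behaves like a locale-aware sorted(): it takes an iterable, a locale code (positionally or as loc=), and optional key and reverse arguments. A minimal sketch of such a helper, assuming PyICU supplies the collator (an assumption for illustration, not the application's actual implementation):

    from typing import Any, Callable, Iterable, List, Optional

    from icu import Collator, Locale  # PyICU

    def sort(data: Iterable[Any], loc: str,
             key: Optional[Callable[[Any], Any]] = None,
             reverse: bool = False) -> List[Any]:
        # build a collator for the requested locale, e.g. 'en_US' or 'cs_CZ'
        collator = Collator.createInstance(Locale(loc))
        # key is expected to map each item to a string
        key_fn = key if key is not None else (lambda x: x)
        # getSortKey() returns bytes ordered by the locale's collation rules,
        # so the built-in sorted() can do the rest
        return sorted(data, key=lambda item: collator.getSortKey(key_fn(item)),
                      reverse=reverse)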
Example #2
 def sort(self, plugin_api, data, field='name', *fields):
     if field == 'size':
         return sorted(data, key=lambda c: c.get(field, 0), reverse=True)
     else:
         def corp_cmp_key(c, field):
             return c.get(field) if c.get(field) is not None else ''
         return l10n.sort(data, loc=plugin_api.user_lang, key=lambda c: corp_cmp_key(c, field))
Example #3
 def sort(self, plugin_api, data, field='name', *fields):
     if field == 'size':
         return sorted(data, key=lambda c: c.get(field, 0), reverse=True)
     else:
         def corp_cmp_key(c, field):
             return c.get(field) if c.get(field) is not None else ''
         return l10n.sort(data, loc=plugin_api.user_lang, key=lambda c: corp_cmp_key(c, field))
Example #4
 def get_user_items(self, user_id):
     ans = []
     if self._auth.anonymous_user()['id'] != user_id:
         for item_id, item in self._db.hash_get_all(self._mk_key(user_id)).items():
             ans.append(self.decoder.decode(item))
         ans = l10n.sort(ans, self.getlocal('lang'), key=lambda itm: itm.name, reverse=False)
     return ans
Example #5
 def get_user_items(self, plugin_api):
     ans = []
     if self._auth.anonymous_user()['id'] != plugin_api.user_id:
         for item_id, item in self._db.hash_get_all(self._mk_key(plugin_api.user_id)).items():
             ans.append(import_record(item))
         ans = l10n.sort(ans, plugin_api.user_lang, key=lambda itm: itm.sort_key, reverse=False)
     return ans
Example #6
def require_existing_pquery(
        pquery: PqueryFormArgs, offset: int, limit: int, collator_locale: str,
        sort: str, reverse: bool) -> Tuple[int, List[Tuple[str, int]]]:
    path = _create_cache_path(pquery)
    if not os.path.exists(path):
        raise PqueryResultNotFound('The result does not exist')
    else:
        if sort == 'freq':
            if reverse is True:
                return load_cached_partial(path, offset, limit)
            else:
                total, rows = load_cached_full(path)
                return total, list(reversed(rows))[offset:offset + limit]
        elif sort == 'value':
            total, rows = load_cached_full(path)
            return (total,
                    l10n.sort(rows,
                              key=lambda x: x[0],
                              loc=collator_locale,
                              reverse=reverse)[offset:offset + limit])
        elif sort.startswith('freq-'):
            conc_idx = pquery.conc_ids.index(sort[len('freq-'):])
            total, rows = load_cached_full(path)
            return (total,
                    sorted(rows,
                           key=lambda x: x[conc_idx + 1],
                           reverse=reverse)[offset:offset + limit])
        else:
            raise PqueryArgumentError(f'Invalid sort argument: {sort}')
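Note the asymmetry in the sort == 'freq' branch above: serving reverse=True directly via load_cached_partial suggests the cache file stores rows pre-sorted by descending frequency, so only the ascending order forces a full load and an in-memory reversal. A hypothetical call (all argument values are illustrative):

    # first page of a persistent query result, most frequent items first
    total, page = require_existing_pquery(
        pquery, offset=0, limit=50, collator_locale='en_US',
        sort='freq', reverse=True)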
Example #7
 def find_suggestion(
         self, user_id, ui_lang, maincorp, corpora, subcorpus, value, value_type, value_subformat,
         query_type, p_attr, struct, s_attr):
     used_corp = self._preset_corp if self._preset_corp is not None else maincorp
     value_norm = value if value_subformat in ('regexp', 'advanced') else re_escape(value)
     icase = '(?i)' if value_subformat in ('simple_ic',) else ''
     rels = defaultdict(lambda: set())
     try:
         conc = get_conc(
             used_corp,
             user_id,
             (f'aword,[{self._conf["attr1"]}="{icase}{value_norm}" | {self._conf["attr2"]}="{icase}{value_norm}"]',))
         conc.sync()
         mlargs = dict(ml1attr=self._conf["attr1"], ml2attr=self._conf["attr2"])
         fcrit = multi_level_crit(2, **mlargs)
         data = self._freq_dist(corp=used_corp, conc=conc, fcrit=fcrit, user_id=user_id)
         for item in data:
             attr1, attr2 = self._normalize_multivalues(
                 used_corp, value_norm, *(tuple([w['n'] for w in item['Word']])[:2]))
             rels[attr1].add(attr2)
     except RuntimeError as ex:
         msg = str(ex).lower()
         if 'syntax error' not in msg:
             raise ex
     return dict(attrs=(self._conf['attr1'], self._conf['attr2']),
                 data=dict((k, l10n.sort(v, ui_lang, key=lambda itm: itm, reverse=False))
                           for k, v in rels.items()))
Example #8
 def _export_attr_values(self, data: Dict[StructAttr, Set[AttrValue]], total_poscount: int, aligned_corpora: List[str], expand_attrs: List[StructAttr], collator_locale: str, max_attr_list_size: Optional[int]) -> AttrValuesResponse:
     exported = AttrValuesResponse(
         attr_values={}, aligned=aligned_corpora, poscount=total_poscount)
     for struct_attr, attr_values in data.items():
         if max_attr_list_size is None or len(attr_values) <= max_attr_list_size or struct_attr in expand_attrs:
             out_data = l10n.sort(attr_values, collator_locale, key=lambda t: t[0])
             exported.attr_values[struct_attr.key()] = out_data
         else:
             exported.attr_values[struct_attr.key()] = {'length': len(attr_values)}
     return exported
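The else branch above presumably exists to keep oversized value lists out of the response: once attr_values exceeds max_attr_list_size (and the attribute is not in expand_attrs), only a {'length': N} summary is exported instead of the collated list. The same guard recurs, with minor variations, in Examples #10, #11 and #13 below.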
Example #9
 def get_user_items(self, user_id):
     ans = []
     if self._auth.anonymous_user()['id'] != user_id:
         for item_id, item in self._db.hash_get_all(
                 self._mk_key(user_id)).items():
             ans.append(self.decoder.decode(item))
         ans = l10n.sort(ans,
                         self.getlocal('lang'),
                         key=lambda itm: itm.name,
                         reverse=False)
     return ans
Example #10
 def _export_attr_values(self, data, aligned_corpora, expand_attrs, collator_locale, max_attr_list_size):
     values = {}
     for k, v in data.items():
         if isinstance(v, Iterable):
             if max_attr_list_size is None or len(v) <= max_attr_list_size or k in expand_attrs:
                 out_data = l10n.sort(v, collator_locale, key=lambda t: t[0])
                 values[self.export_key(k)] = [AttrValue(*av) for av in out_data]
             else:
                 values[self.export_key(k)] = {'length': len(v)}
         else:
             values[self.export_key(k)] = v
     return AttrValuesResponse(attr_values=values, aligned=aligned_corpora, poscount=values['poscount'])
Example #11
 def _export_attr_values(self, data, aligned_corpora, expand_attrs, collator_locale, max_attr_list_size):
     values = {}
     exported = dict(attr_values=values, aligned=aligned_corpora)
     for k in data.keys():
         if isinstance(data[k], Iterable):
             # the None check must come first: len(...) <= None raises a TypeError
             if max_attr_list_size is None or len(data[k]) <= max_attr_list_size or k in expand_attrs:
                 out_data = l10n.sort(data[k], collator_locale, key=lambda t: t[0])
                 values[self.export_key(k)] = out_data
             else:
                 values[self.export_key(k)] = {'length': len(data[k])}
         else:
             values[self.export_key(k)] = data[k]
     exported['poscount'] = values['poscount']
     return exported
Example #12
def require_existing_wordlist(
        form: WordlistFormArgs, wlsort: str, reverse: bool, offset: int,
        limit: int, collator_locale: str) -> Tuple[int, List[Tuple[str, int]]]:
    path = _create_cache_path(form)
    if not os.path.exists(path):
        raise WordlistResultNotFound('The result does not exist')
    else:
        if wlsort == 'f':
            total, rows = load_cached_full(path)
            return (total, sorted(rows, key=lambda x: x[1],
                                  reverse=reverse)[offset:offset + limit])
        else:
            total, rows = load_cached_full(path)
            rows = l10n.sort(rows,
                             key=lambda x: x[0],
                             loc=collator_locale,
                             reverse=reverse)
            return total, rows[offset:offset + limit]
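Here wlsort == 'f' selects plain numeric sorting by the frequency column, while any other value routes through the collator on the word column. A hypothetical call, assuming the wordlist cache for form already exists (names and values are illustrative):

    # first 20 rows of an alphabetically collated wordlist (Czech collation)
    total, rows = require_existing_wordlist(
        form, wlsort='value', reverse=False, offset=0, limit=20,
        collator_locale='cs_CZ')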
Example #13
 def _export_attr_values(self, data, aligned_corpora, expand_attrs,
                         collator_locale, max_attr_list_size):
     values = {}
     exported = dict(attr_values=values, aligned=aligned_corpora)
     for k in list(data.keys()):
         if isinstance(data[k], Iterable):
             if max_attr_list_size is None or len(
                     data[k]) <= max_attr_list_size or k in expand_attrs:
                 out_data = l10n.sort(data[k],
                                      collator_locale,
                                      key=lambda t: t[0])
                 values[self.export_key(k)] = out_data
             else:
                 values[self.export_key(k)] = {'length': len(data[k])}
         else:
             values[self.export_key(k)] = data[k]
     exported['poscount'] = values['poscount']
     return exported
Example #14
    def _export_subcorpora_list(self, corpname, out):
        """
        Updates the passed dictionary with information about available sub-corpora.
        Listed values depend on the current user and corpus.
        If there is a list already present in 'out' then it is extended
        by the new values.

        arguments:
        corpname -- corpus id
        out -- a dictionary used by templating system
        """
        basecorpname = corpname.split(':')[0]
        subcorp_list = l10n.sort(self.cm.subcorp_names(basecorpname),
                                 loc=self.ui_lang, key=lambda x: x['n'])
        if len(subcorp_list) > 0:
            subcorp_list = [{'n': '--%s--' % _('whole corpus'), 'v': ''}] + subcorp_list
        if out.get('SubcorpList', None) is None:
            out['SubcorpList'] = []
        out['SubcorpList'].extend(subcorp_list)
Example #15
    def _export_subcorpora_list(self, corpname, out):
        """
        Updates the passed dictionary with information about available sub-corpora.
        Listed values depend on the current user and corpus.
        If there is a list already present in 'out' then it is extended
        by the new values.

        arguments:
        corpname -- corpus id
        out -- a dictionary used by templating system
        """
        basecorpname = corpname.split(':')[0]
        subcorp_list = l10n.sort(self.cm.subcorp_names(basecorpname),
                                 loc=self.ui_lang,
                                 key=lambda x: x['n'])
        if len(subcorp_list) > 0:
            subcorp_list = [{
                'n': '--%s--' % _('whole corpus'),
                'v': ''
            }] + subcorp_list
        if out.get('SubcorpList', None) is None:
            out['SubcorpList'] = []
        out['SubcorpList'].extend(subcorp_list)
Example #16
def texttype_values(corp, subcorpattrs, maxlistsize, shrink_list=False, collator_locale=None):
    """
    arguments:
    corp -- manatee.Corpus
    subcorpattrs -- structures and attributes to be processed (see Manatee's SUBCORPATTRS)
    maxlistsize -- in case there are more than this number of items, an empty list will be returned
    shrink_list -- list/tuple of attributes we want to return empty lists for (False can be used
                   to specify an empty value)
    collator_locale -- a collator locale used to sort attribute values (en_US is the default)

    returns:
    a list containing the following dictionaries
    { 'Line' : [
        { 'attr_doc_label' : '', 'Values' : [ {'v': 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        { 'attr_doc_label' : '', 'Values' : [ {'v': 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        ...
    ]}
    """
    if subcorpattrs == '#':
        return []
    attrlines = []

    if shrink_list is False:
        shrink_list = ()

    for subcorpline in subcorpattrs.split(','):
        attrvals = []
        for n in subcorpline.split('|'):
            if n in ('', '#'):
                continue
            attr = corp.get_attr(n)
            attrval = {
                'name': n,
                'label': corp.get_conf(n + '.LABEL') or n,
                'attr_doc': corp.get_conf(n + '.ATTRDOC'),
                'attr_doc_label': corp.get_conf(n + '.ATTRDOCLABEL'),
                'numeric': conf_bool(corp.get_conf(n + '.NUMERIC'))
            }
            hsep = corp.get_conf(n + '.HIERARCHICAL')
            multisep = corp.get_conf(n + '.MULTISEP')
            is_multival = corp.get_conf(n + '.MULTIVAL') in ('y', 'yes')
            if (not hsep and (corp.get_conf(n + '.TEXTBOXLENGTH')
                              or attr.id_range() > maxlistsize or n in shrink_list)):
                attrval['textboxlength'] = (corp.get_conf(n + '.TEXTBOXLENGTH') or 24)
            else:  # list of values
                if conf_bool(corp.get_conf(n + '.NUMERIC')):
                    vals = []
                    for i in range(attr.id_range()):
                        try:
                            vals.append({'v': int(attr.id2str(i))})
                        except ValueError:  # non-numeric value, keep as string
                            vals.append({'v': attr.id2str(i)})
                elif hsep:  # hierarchical
                    vals = [{'v': attr.id2str(i)}
                            for i in range(attr.id_range())
                            if multisep not in attr.id2str(i)]
                else:
                    if is_multival:
                        raw_vals = [import_string(attr.id2str(i), from_encoding=corp.get_conf('ENCODING'))
                                    .split(multisep) for i in range(attr.id_range())]
                        vals = [{'v': x}
                                for x in sorted(set([s for subl in raw_vals for s in subl]))]
                    else:

                        vals = [{'v': import_string(attr.id2str(i), from_encoding=corp.get_conf('ENCODING'))}
                                for i in range(attr.id_range())]

                if hsep:  # hierarchical
                    attrval['hierarchical'] = hsep
                    attrval['Values'] = _get_attr_hierarchy(vals, hsep)
                elif conf_bool(corp.get_conf(n + '.NUMERIC')):
                    attrval['Values'] = sorted(vals, key=lambda item: item['v'])
                elif collator_locale:
                    attrval['Values'] = l10n.sort(vals, collator_locale, key=lambda item: item['v'])
                else:
                    attrval['Values'] = sorted(vals, cmp=lambda x1, x2: cmp(
                        x1['v'].lower(), x2['v'].lower()))
            attrvals.append(attrval)
        attrlines.append({'Line': attrvals})
    return attrlines
Example #17
    def get_attr_values(self, corpus, attr_map, aligned_corpora=None):
        """
        Finds all the available values of remaining attributes according to the
        provided attr_map and aligned_corpora

        arguments:
        corpus -- manatee.corpus object
        attr_map -- a dictionary of attributes and values as selected by a user
        aligned_corpora -- a list/tuple of corpora names aligned to the base one (the 'corpus' argument)

        returns:
        a dictionary containing matching attributes and values
        """
        corpname = vanilla_corpname(corpus.corpname)
        corpus_info = self.corparch.get_corpus_info(corpname)
        bib_label = LiveAttributes.import_key(corpus_info.metadata.label_attr)
        bib_id = LiveAttributes.import_key(corpus_info.metadata.id_attr)
        attrs = self._get_subcorp_attrs(corpus)

        if bib_label and bib_label not in attrs:
            attrs.append(bib_label)

        srch_attrs = set(attrs) - set(attr_map.keys())
        srch_attrs.add('poscount')

        hidden_attrs = set()
        if bib_id is not None and bib_id not in srch_attrs:
            hidden_attrs.add(bib_id)
        if not bib_id:
            hidden_attrs.add('id')

        selected_attrs = tuple(srch_attrs.union(hidden_attrs))
        srch_attr_map = dict([(x[1], x[0]) for x in enumerate(selected_attrs)])
        attr_items = AttrArgs(attr_map, self.empty_val_placeholder)
        where_sql, where_values = attr_items.export_sql('t1', corpname)

        join_sql = []
        i = 2
        for item in aligned_corpora:
            join_sql.append('JOIN item AS t%d ON t1.item_id = t%d.item_id' % (i, i))
            where_sql += ' AND t%d.corpus_id = ?' % i
            where_values.append(item)
            i += 1

        if len(where_sql) > 0:
            sql_template = "SELECT DISTINCT %s FROM item AS t1 %s WHERE %s" \
                           % (', '.join(self.apply_prefix(selected_attrs, 't1')), ' '.join(join_sql), where_sql)
        else:
            sql_template = "SELECT DISTINCT %s FROM item AS t1 %s " \
                           % (', '.join(self.apply_prefix(selected_attrs, 't1')), ' '.join(join_sql))

        ans = {}
        ans.update(attr_map)

        for attr in srch_attrs:
            if attr in ('poscount',):
                ans[attr] = 0
            else:
                ans[attr] = set()

        for item in self.db(corpname).execute(sql_template, *where_values).fetchall():
            for attr in selected_attrs:
                v = item[srch_attr_map[attr]]
                if v is not None and attr not in hidden_attrs:
                    if attr == bib_label:
                        ans[attr].add((self.shorten_value(unicode(v)),
                                       item[srch_attr_map[bib_id]], unicode(v)))
                    elif type(ans[attr]) is set:
                        ans[attr].add((self.shorten_value(v), v, v))
                    elif type(ans[attr]) is int:
                        ans[attr] += int(v)

        exported = {}
        collator_locale = corpus_info.collator_locale
        for k in ans.keys():
            if type(ans[k]) is set:
                if len(ans[k]) <= self.max_attr_list_size:
                    if k == bib_label:
                        out_data = l10n.sort(ans[k], collator_locale, key=lambda t: t[0])
                    else:
                        out_data = tuple(l10n.sort(ans[k], collator_locale, key=lambda t: t[0]))
                    exported[self.export_key(k)] = out_data
                else:
                    exported[self.export_key(k)] = {'length': len(ans[k])}

            else:
                exported[self.export_key(k)] = ans[k]
        exported['aligned'] = aligned_corpora
        return exported
Example #18
    def xfreq_dist(self, crit, limit=1, sortkey='f', ml='', ftt_include_empty='', rel_mode=0,
                   collator_locale='en_US'):
        """
        Calculates data (including data for visual output) of a frequency distribution
        specified by the 'crit' parameter

        arguments:
        crit -- specified criteria (CQL)
        limit -- str type!, minimal frequency accepted, this value is exclusive! (i.e. accepted
                 values must be greater than the limit)
        sortkey -- a key according to which the distribution will be sorted
        ml -- str, if non-empty then multi-level freq. distribution is generated
        ftt_include_empty -- str, TODO
        rel_mode -- {0, 1}, TODO
        """

        # ml = determines how the bar appears (multilevel x text type)
        # import math
        normwidth_freq = 100
        normwidth_rel = 100

        def calc_scale(freqs, norms):
            """
            Create proper scaling coefficients for freqs and norms
            to match a 100 units length bar.
            """
            from operator import add
            sumn = float(reduce(add, norms))
            if sumn == 0:
                return float(normwidth_rel) / max(freqs), 0
            else:
                sumf = float(reduce(add, freqs))
                corr = min(sumf / max(freqs), sumn / max(norms))
                return normwidth_rel / sumf * corr, normwidth_rel / sumn * corr

        def label(attr):
            if '/' in attr:
                attr = attr[:attr.index('/')]
            lab = self.pycorp.get_conf(attr + '.LABEL')
            return self.import_string(lab if lab else attr)

        words = manatee.StrVector()
        freqs = manatee.NumVector()
        norms = manatee.NumVector()
        self.pycorp.freq_dist(self.RS(), crit, limit, words, freqs, norms)
        words = [self.import_string(w) for w in words]
        if not len(freqs):
            return {}
        # now we intentionally rewrite norms as filled in by freq_dist()
        # because of "hard to explain" metrics they lead to
        if rel_mode == 0:
            norms2_dict = self.get_attr_values_sizes(crit)
            norms = [norms2_dict.get(x, 0) for x in words]
        sumf = float(sum([x for x in freqs]))
        attrs = crit.split()
        head = [dict(n=label(attrs[x]), s=x / 2)
                for x in range(0, len(attrs), 2)]
        head.append(dict(n=translate('Freq'), s='freq', title=translate('Frequency')))

        tofbar, tonbar = calc_scale(freqs, norms)
        if tonbar and not ml:
            maxf = max(freqs)  # because of bar height
            minf = min(freqs)
            maxrel = 0
            # because of bar width
            for index, (f, nf) in enumerate(zip(freqs, norms)):
                if nf == 0:
                    nf = 100000
                    norms[index] = 100000
                newrel = (f * tofbar / (nf * tonbar))
                if maxrel < newrel:
                    maxrel = newrel
            if rel_mode == 0:
                head.append(dict(
                    n='i.p.m.',
                    title=translate(
                        'instances per million positions (refers to the respective category)'),
                    s='rel'
                ))
            else:
                head.append(dict(n='Freq [%]', title='', s='rel'))

            lines = []
            for w, f, nf in zip(words, freqs, norms):
                w = self.import_string(w)
                rel_norm_freq = {
                    0: round(f * 1e6 / nf, 2),
                    1: round(f / sumf * 100, 2)
                }[rel_mode]

                rel_bar = {
                    0: 1 + int(f * tofbar * normwidth_rel / (nf * tonbar * maxrel)),
                    1: 1 + int(float(f) / maxf * normwidth_rel)
                }[rel_mode]

                freq_bar = {
                    0: int(normwidth_freq * float(f) / (maxf - minf + 1) + 1),
                    1: 10
                }[rel_mode]
                lines.append(dict(
                    Word=[{'n': '  '.join(n.split('\v'))} for n in w.split('\t')],
                    freq=f,
                    fbar=int(f * tofbar) + 1,
                    norm=nf,
                    nbar=int(nf * tonbar),
                    relbar=rel_bar,
                    norel=ml,
                    freqbar=freq_bar,
                    rel=rel_norm_freq
                ))
        else:
            lines = []
            for w, f, nf in zip(words, freqs, norms):
                w = self.import_string(w)
                lines.append(dict(
                    Word=[{'n': '  '.join(n.split('\v'))} for n in w.split('\t')],
                    freq=f,
                    fbar=int(f * tofbar) + 1,
                    norel=1,
                    relbar=None
                ))

        if ftt_include_empty and limit == 0 and '.' in attrs[0]:
            attr = self.pycorp.get_attr(attrs[0])
            all_vals = [attr.id2str(i) for i in range(attr.id_range())]
            used_vals = [line['Word'][0]['n'] for line in lines]
            for v in all_vals:
                if v in used_vals:
                    continue
                lines.append(dict(
                    Word=[{'n': self.import_string(v)}],
                    freq=0,
                    rel=0,
                    norm=0,
                    nbar=0,
                    relbar=0,
                    norel=ml,
                    freqbar=0,
                    fbar=0
                ))
        if (sortkey in ('0', '1', '2')) and (int(sortkey) < len(lines[0]['Word'])):
            sortkey = int(sortkey)
            lines = l10n.sort(lines, loc=collator_locale, key=lambda v: v['Word'][sortkey]['n'])
        else:
            if sortkey not in ('freq', 'rel'):
                sortkey = 'freq'
            lines = sorted(lines, key=lambda v: v[sortkey], reverse=True)
        return dict(Head=head, Items=lines)
Example #19
def texttype_values(corp,
                    subcorpattrs,
                    maxlistsize,
                    shrink_list=False,
                    collator_locale=None):
    """
    arguments:
    corp -- manatee.Corpus
    subcorpattrs -- structures and attributes to be processed (see Manatee's SUBCORPATTRS)
    maxlistsize -- in case there are more than this number of items, an empty list will be returned
    shrink_list -- list/tuple of attributes we want to return empty lists for (False can be used
                   to specify an empty value)
    collator_locale -- a collator locale used to sort attribute values (en_US is the default)

    returns:
    a list containing the following dictionaries
    { 'Line' : [
        { 'attr_doc_label' : '', 'Values' : [ {'v': 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        { 'attr_doc_label' : '', 'Values' : [ {'v': 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        ...
    ]}
    """
    if subcorpattrs == '#':
        return []
    attrlines = []

    if shrink_list is False:
        shrink_list = ()

    for subcorpline in subcorpattrs.split(','):
        attrvals = []
        for n in subcorpline.split('|'):
            if n in ('', '#'):
                continue
            attr = corp.get_attr(n)
            attrval = {
                'name': n,
                'label': corp.get_conf(n + '.LABEL') or n,
                'attr_doc': corp.get_conf(n + '.ATTRDOC'),
                'attr_doc_label': corp.get_conf(n + '.ATTRDOCLABEL'),
                'numeric': conf_bool(corp.get_conf(n + '.NUMERIC'))
            }
            hsep = corp.get_conf(n + '.HIERARCHICAL')
            multisep = corp.get_conf(n + '.MULTISEP')
            is_multival = corp.get_conf(n + '.MULTIVAL') in ('y', 'yes')
            if (not hsep and
                (corp.get_conf(n + '.TEXTBOXLENGTH')
                 or attr.id_range() > maxlistsize or n in shrink_list)):
                attrval['textboxlength'] = (corp.get_conf(n + '.TEXTBOXLENGTH')
                                            or 24)
            else:  # list of values
                if conf_bool(corp.get_conf(n + '.NUMERIC')):
                    vals = []
                    for i in range(attr.id_range()):
                        try:
                            vals.append({'v': int(attr.id2str(i))})
                        except ValueError:  # non-numeric value, keep as string
                            vals.append({'v': attr.id2str(i)})
                elif hsep:  # hierarchical
                    vals = [{
                        'v': attr.id2str(i)
                    } for i in range(attr.id_range())
                            if multisep not in attr.id2str(i)]
                else:
                    if is_multival:
                        raw_vals = [
                            import_string(attr.id2str(i),
                                          from_encoding=corp.get_conf(
                                              'ENCODING')).split(multisep)
                            for i in range(attr.id_range())
                        ]
                        vals = [{
                            'v': x
                        } for x in sorted(
                            set([s for subl in raw_vals for s in subl]))]
                    else:

                        vals = [{
                            'v':
                            import_string(
                                attr.id2str(i),
                                from_encoding=corp.get_conf('ENCODING'))
                        } for i in range(attr.id_range())]

                if hsep:  # hierarchical
                    attrval['hierarchical'] = hsep
                    attrval['Values'] = _get_attr_hierarchy(vals, hsep)
                elif conf_bool(corp.get_conf(n + '.NUMERIC')):
                    attrval['Values'] = sorted(vals,
                                               key=lambda item: item['v'])
                elif collator_locale:
                    attrval['Values'] = l10n.sort(vals,
                                                  collator_locale,
                                                  key=lambda item: item['v'])
                else:
                    attrval['Values'] = sorted(
                        vals,
                        cmp=lambda x1, x2: cmp(x1['v'].lower(), x2['v'].lower(
                        )))
            attrvals.append(attrval)
        attrlines.append({'Line': attrvals})
    return attrlines
Example #20
    def sort(self, plugin_api, data, *fields):
        def corp_cmp_key(c):
            return c.get('name') if c.get('name') is not None else ''

        return l10n.sort(data, loc=plugin_api.user_lang, key=corp_cmp_key)
Example #21
    def search(self,
               plugin_api,
               query,
               offset=0,
               limit=None,
               filter_dict=None):
        external_keywords = filter_dict.getlist('keyword')
        external_keywords = self._corparch.map_external_keywords(
            external_keywords, plugin_api.user_lang)
        if len(external_keywords) != 0:
            query_substrs = []
            query_keywords = external_keywords + [self.default_label]
        else:

            if self.SESSION_KEYWORDS_KEY not in plugin_api.session:
                plugin_api.session[self.SESSION_KEYWORDS_KEY] = [
                    self.default_label
                ]
            initial_query = query
            if query is False:
                query = ''
            query_substrs, query_keywords = parse_query(
                self._tag_prefix, query)
            if len(query_keywords) == 0 and initial_query is False:
                query_keywords = plugin_api.session[self.SESSION_KEYWORDS_KEY]
            else:
                plugin_api.session[self.SESSION_KEYWORDS_KEY] = query_keywords
        query = ' '.join(query_substrs) \
                + ' ' + ' '.join('%s%s' % (self._tag_prefix, s) for s in query_keywords)

        ans = {'rows': []}
        permitted_corpora = self._auth.permitted_corpora(plugin_api.user_dict)

        if filter_dict.get('minSize'):
            min_size = l10n.desimplify_num(filter_dict.get('minSize'),
                                           strict=False)
        else:
            min_size = 0
        if filter_dict.get('maxSize'):
            max_size = l10n.desimplify_num(filter_dict.get('maxSize'),
                                           strict=False)
        else:
            max_size = None

        sorting_field = filter_dict.get('sortBySize', 'name')

        if offset is None:
            offset = 0
        else:
            offset = int(offset)

        if limit is None:
            limit = int(self._corparch.max_page_size)
        else:
            limit = int(limit)

        user_items = self._corparch.user_items.get_user_items(plugin_api)

        def fav_id(corpus_id):
            for item in user_items:
                if item.is_single_corpus and item.main_corpus_id == corpus_id:
                    return item.ident
            return None

        query_substrs, query_keywords = parse_query(self._tag_prefix, query)
        all_keywords_map = dict(
            self._corparch.all_keywords(plugin_api.user_lang))
        normalized_query_substrs = [s.lower() for s in query_substrs]
        used_keywords = set()

        for corp in self._corparch.get_list(plugin_api):
            full_data = self._corparch.get_corpus_info(plugin_api.user_lang,
                                                       corp['id'])
            if not isinstance(full_data, BrokenCorpusInfo):
                keywords = [k for k, _ in full_data.metadata.keywords]
                tests = []
                found_in = []
                tests.extend([k in keywords for k in query_keywords])
                for s in normalized_query_substrs:
                    # the name must be tested first to prevent the list 'found_in'
                    # from being filled when the item matches both the name and the description
                    if s in corp['name'].lower():
                        tests.append(True)
                    elif s in (corp['desc'].lower() if corp['desc'] else ''):
                        tests.append(True)
                        found_in.append('defaultCorparch__found_in_desc')
                    else:
                        tests.append(False)
                tests.append(self.matches_size(corp, min_size, max_size))
                tests.append(
                    self._corparch.custom_filter(self._plugin_api, full_data,
                                                 permitted_corpora))

                if all(test for test in tests):
                    corp['size'] = corp['size']
                    corp['size_info'] = l10n.simplify_num(
                        corp['size']) if corp['size'] else None
                    corp['keywords'] = [(k, all_keywords_map[k])
                                        for k in keywords]
                    corp['found_in'] = found_in
                    corp['fav_id'] = fav_id(corp['id'])
                    # because of client-side fav/feat/search items compatibility
                    corp['corpus_id'] = corp['id']
                    corp['pmltq'] = full_data['pmltq']
                    corp['repo'] = full_data['web']
                    corp['access'] = full_data['access']
                    corp['tokenConnect'] = full_data['token_connect'][
                        'providers']
                    ans['rows'].append(corp)
                    used_keywords.update(keywords)
                    if not self.should_fetch_next(ans, offset, limit):
                        break

        ans['rows'], ans['nextOffset'] = self.cut_result(
            self.sort(plugin_api, ans['rows'], field=sorting_field), offset,
            limit)
        ans['keywords'] = l10n.sort(used_keywords, loc=plugin_api.user_lang)
        ans['query'] = query
        ans['current_keywords'] = query_keywords
        ans['filters'] = dict(filter_dict)
        return ans
Example #22
    def get_attr_values(self, corpus, attr_map, aligned_corpora=None):
        """
        Finds all the available values of remaining attributes according to the
        provided attr_map and aligned_corpora

        arguments:
        corpus -- manatee.corpus object
        attr_map -- a dictionary of attributes and values as selected by a user
        aligned_corpora -- a list/tuple of corpora names aligned to the base one (the 'corpus' argument)

        returns:
        a dictionary containing matching attributes and values
        """
        corpname = vanilla_corpname(corpus.corpname)
        corpus_info = self.corparch.get_corpus_info(corpname)
        bib_label = LiveAttributes.import_key(corpus_info.metadata.label_attr)
        bib_id = LiveAttributes.import_key(corpus_info.metadata.id_attr)
        attrs = self._get_subcorp_attrs(corpus)

        if bib_label and bib_label not in attrs:
            attrs.append(bib_label)

        srch_attrs = set(attrs) - set(self.import_key(k)
                                      for k in attr_map.keys() if type(attr_map[k]) is not dict)
        srch_attrs.add('poscount')

        hidden_attrs = set()
        if bib_id is not None and bib_id not in srch_attrs:
            hidden_attrs.add(bib_id)
        if not bib_id:
            hidden_attrs.add('id')

        selected_attrs = tuple(srch_attrs.union(hidden_attrs))

        # a map [db_col_name]=>[db_col_idx]
        srch_attr_map = dict([(x[1], x[0]) for x in enumerate(selected_attrs)])

        attr_items = AttrArgs(attr_map, self.empty_val_placeholder)
        where_sql, where_values = attr_items.export_sql('t1', corpname)

        join_sql = []
        i = 2
        for item in aligned_corpora:
            join_sql.append('JOIN item AS t%d ON t1.item_id = t%d.item_id' % (i, i))
            where_sql += ' AND t%d.corpus_id = ?' % i
            where_values.append(item)
            i += 1

        if len(where_sql) > 0:
            sql_template = "SELECT DISTINCT %s FROM item AS t1 %s WHERE %s" \
                           % (', '.join(self.apply_prefix(selected_attrs, 't1')), ' '.join(join_sql), where_sql)
        else:
            sql_template = "SELECT DISTINCT %s FROM item AS t1 %s " \
                           % (', '.join(self.apply_prefix(selected_attrs, 't1')), ' '.join(join_sql))

        ans = {}
        # already selected items are part of the answer; no need to fetch them from db
        ans.update(dict([(self.import_key(k), v) for k, v in attr_map.items()]))
        range_attrs = set()

        for attr in ans.keys():
            if type(ans[attr]) is dict:
                ans[attr] = set()   # currently we throw away the range and load all the stuff
                range_attrs.add(attr)

        for attr in srch_attrs:
            if attr in ('poscount',):
                ans[attr] = 0
            else:
                ans[attr] = set()

        poscounts = defaultdict(lambda: defaultdict(lambda: 0))
        max_visible_chars = self.calc_max_attr_val_visible_chars(corpus_info)

        for item in self.db(corpname).execute(sql_template, *where_values).fetchall():
            for attr in selected_attrs:
                v = item[srch_attr_map[attr]]
                if v is not None and attr not in hidden_attrs:
                    attr_val = None
                    if attr == bib_label:
                        attr_val = (self.shorten_value(unicode(v), length=max_visible_chars),
                                    item[srch_attr_map[bib_id]], unicode(v))
                    elif type(ans[attr]) is set:
                        attr_val = (self.shorten_value(unicode(v), length=max_visible_chars), v, v)
                    elif type(ans[attr]) is int:
                        ans[attr] += int(v)

                    if attr_val is not None:
                        poscounts[attr][attr_val] += item['poscount']

        # here we append position count information to the respective items
        for attr, v in poscounts.items():
            for k, c in v.items():
                ans[attr].add(k + (l10n.format_number(c),))
            del poscounts[attr]

        exported = {}
        collator_locale = corpus_info.collator_locale

        for k in ans.keys():
            if type(ans[k]) is set:
                if len(ans[k]) <= self.max_attr_list_size or k in range_attrs:
                    if k == bib_label:
                        out_data = l10n.sort(ans[k], collator_locale, key=lambda t: t[0])
                    else:
                        out_data = tuple(l10n.sort(ans[k], collator_locale, key=lambda t: t[0]))
                    exported[self.export_key(k)] = out_data
                else:
                    exported[self.export_key(k)] = {'length': len(ans[k])}

            else:
                exported[self.export_key(k)] = ans[k]
        exported['poscount'] = l10n.format_number(exported['poscount'])
        exported['aligned'] = aligned_corpora
        return exported
Example #23
    def subcorp_list(self, request):
        """
        Displays a list of user subcorpora. In case there is a 'subc_restore' plug-in
        installed, the list is enriched with additional re-use/undelete information.
        """
        self.disabled_menu_items = (MainMenu.VIEW, MainMenu.FILTER, MainMenu.FREQUENCY,
                                    MainMenu.COLLOCATIONS, MainMenu.SAVE, MainMenu.CONCORDANCE)

        sort = 'n'  # TODO
        show_deleted = int(request.args.get('show_deleted', 0))
        current_corp = self.args.corpname
        if self.get_http_method() == 'POST':
            selected_subc = request.form.getlist('selected_subc')
            self._delete_subcorpora(selected_subc)

        data = []
        for corp in plugins.get('auth').permitted_corpora(self._session_get('user', 'id')).values():
            try:
                self.cm.get_Corpus(corp)
                basecorpname = corp.split(':')[0]
                for item in self.cm.subcorp_names(basecorpname):
                    sc = self.cm.get_Corpus(corp, item['n'])
                    subc_id = '%s:%s' % (corp, item['n'])
                    data.append({
                        'n': subc_id,
                        'v': item['n'],
                        'size': sc.search_size(),
                        'created': sc.created,
                        'corpname': corp,
                        'usesubcorp': item['n'],
                        'deleted': False
                    })
            except Exception as e:
                for d in data:
                    # permitted_corpora does this
                    d['usesubcorp'] = werkzeug.urls.url_quote(d['usesubcorp'], unsafe='+')
                logging.getLogger(__name__).warn(
                    'Failed to fetch information about subcorpus of [%s]: %s' % (corp, e))

        if plugins.has_plugin('subc_restore'):
            try:
                full_list = plugins.get('subc_restore').extend_subc_list(
                    data, self._session_get('user', 'id'),
                    bool(show_deleted), 0)
            except Exception as e:
                logging.getLogger(__name__).error('subc_restore plug-in failed to list queries: %s' % e)
                full_list = []
        else:
            full_list = data

        # TODO sorting does not work
        sort_key, rev = Kontext._parse_sorting_param(sort)
        if sort_key in ('size', 'created'):
            data = sorted(data, key=lambda x: x[sort_key], reverse=rev)
        else:
            data = l10n.sort(data, loc=self.ui_lang, key=lambda x: x[sort_key], reverse=rev)

        sort_keys = dict([(x, (x, '')) for x in ('n', 'size', 'created')])
        if not rev:
            sort_keys[sort_key] = ('-%s' % sort_key, '&#8593;')
        else:
            sort_keys[sort_key] = (sort_key, '&#8595;')

        # this is necessary to reset manatee module back to its original state
        self.cm.get_Corpus(current_corp)

        ans = {
            'subcorp_list': full_list,
            'sort_keys': sort_keys,
            'show_deleted': show_deleted,
            'rev': rev
        }
        self._export_subcorpora_list(ans)
        return ans
Example #24
    def subcorp_list(self, request):
        """
        Displays a list of user subcorpora. In case there is a 'subc_restore' plug-in
        installed, the list is enriched with additional re-use/undelete information.
        """
        self.disabled_menu_items = (MainMenu.VIEW, MainMenu.FILTER, MainMenu.FREQUENCY,
                                    MainMenu.COLLOCATIONS, MainMenu.SAVE, MainMenu.CONCORDANCE)

        sort = 'n'  # TODO
        show_deleted = int(request.args.get('show_deleted', 0))
        if self.get_http_method() == 'POST':
            selected_subc = request.form.getlist('selected_subc')
            self._delete_subcorpora(selected_subc)

        data = []
        user_corpora = plugins.get('auth').permitted_corpora(self._session_get('user', 'id')).values()
        for corp in user_corpora:
            try:
                for item in self.cm.subcorp_names(corp):
                    sc = self.cm.get_Corpus(corp, item['n'])
                    data.append({
                        'n': '%s:%s' % (self._canonical_corpname(corp), item['n']),
                        'internal_n': '%s:%s' % (corp, item['n']),
                        'v': item['n'],
                        'size': sc.search_size(),
                        'created': sc.created,
                        'corpname': corp,
                        'human_corpname': sc.get_conf('NAME'),
                        'usesubcorp': item['n'],
                        'deleted': False
                    })
            except Exception as e:
                for d in data:
                    # permitted_corpora does this
                    d['usesubcorp'] = werkzeug.urls.url_quote(d['usesubcorp'], unsafe='+')
                logging.getLogger(__name__).warn(
                    'Failed to fetch information about subcorpus of [%s]: %s' % (corp, e))

        if plugins.has_plugin('subc_restore'):
            try:
                full_list = plugins.get('subc_restore').extend_subc_list(
                    data, self._session_get('user', 'id'),
                    self._canonical_corpname,
                    bool(show_deleted), 0)
            except Exception as e:
                logging.getLogger(__name__).error('subc_restore plug-in failed to list queries: %s' % e)
                full_list = []
        else:
            full_list = data

        # TODO sorting does not work
        sort_key, rev = Kontext._parse_sorting_param(sort)
        if sort_key in ('size', 'created'):
            data = sorted(data, key=lambda x: x[sort_key], reverse=rev)
        else:
            data = l10n.sort(data, loc=self.ui_lang, key=lambda x: x[sort_key], reverse=rev)

        sort_keys = dict([(x, (x, '')) for x in ('n', 'size', 'created')])
        if not rev:
            sort_keys[sort_key] = ('-%s' % sort_key, '&#8593;')
        else:
            sort_keys[sort_key] = (sort_key, '&#8595;')

        ans = {
            'SubcorpList': [],   # this is used by subcorpus SELECT element; no need for that here
            'subcorp_list': full_list,
            'sort_keys': sort_keys,
            'show_deleted': show_deleted,
            'rev': rev
        }
        return ans
Example #25
    def list(self, request: Request) -> Dict[str, Any]:
        """
        Displays a list of user subcorpora. In case there is a 'subc_restore' plug-in
        installed, the list is enriched with additional re-use/undelete information.
        """
        self.disabled_menu_items = (MainMenu.VIEW, MainMenu.FILTER,
                                    MainMenu.FREQUENCY, MainMenu.COLLOCATIONS,
                                    MainMenu.SAVE, MainMenu.CONCORDANCE)

        filter_args = dict(show_deleted=bool(
            int(request.args.get('show_deleted', 0))),
                           corpname=request.args.get('corpname'))
        data = []
        user_corpora = list(
            plugins.runtime.AUTH.instance.permitted_corpora(
                self.session_get('user')))
        related_corpora = set()
        for corp in user_corpora:
            for item in self.user_subc_names(corp):
                try:
                    sc = self.cm.get_corpus(corp,
                                            subcname=item['n'],
                                            decode_desc=False)
                    data.append({
                        'name': '%s / %s' % (corp, item['n']),
                        'size': sc.search_size,
                        'created': time.mktime(sc.created.timetuple()),
                        'corpname': corp,
                        'human_corpname': sc.get_conf('NAME'),
                        'usesubcorp': sc.subcname,
                        'orig_subcname': sc.orig_subcname,
                        'deleted': False,
                        'description': sc.description,
                        'published': sc.is_published
                    })
                    related_corpora.add(corp)
                except RuntimeError as e:
                    logging.getLogger(__name__).warning(
                        'Failed to fetch information about subcorpus {0}:{1}: {2}'
                        .format(corp, item['n'], e))

        if filter_args['corpname']:
            data = [
                item for item in data if not filter_args['corpname']
                or item['corpname'] == filter_args['corpname']
            ]
        elif filter_args['corpname'] is None:
            filter_args['corpname'] = ''  # JS code requires non-null value

        if plugins.runtime.SUBC_RESTORE.exists:
            with plugins.runtime.SUBC_RESTORE as sr:
                try:
                    full_list = sr.extend_subc_list(self._plugin_ctx, data,
                                                    filter_args, 0)
                except Exception as e:
                    logging.getLogger(__name__).error(
                        'subc_restore plug-in failed to list queries: %s' % e)
                    full_list = data
        else:
            full_list = data

        sort = request.args.get('sort', '-created')
        sort_key, rev = self._parse_sorting_param(sort)
        if sort_key in ('size', 'created'):
            full_list = sorted(full_list,
                               key=lambda x: x[sort_key],
                               reverse=rev)
        else:
            full_list = l10n.sort(full_list,
                                  loc=self.ui_lang,
                                  key=lambda x: x[sort_key],
                                  reverse=rev)

        ans = dict(
            SubcorpList=
            [],  # this is used by subcorpus SELECT element; no need for that here
            subcorp_list=full_list,
            sort_key=dict(name=sort_key, reverse=rev),
            filter=filter_args,
            processed_subc=[
                v.to_dict() for v in self.get_async_tasks(
                    category=AsyncTaskStatus.CATEGORY_SUBCORPUS)
            ],
            related_corpora=sorted(related_corpora),
            uses_subc_restore=plugins.runtime.SUBC_RESTORE.exists)
        return ans
Example #26
    def search(self, user_id, query, offset=0, limit=None, filter_dict=None):
        ans = {"rows": []}
        permitted_corpora = self._auth.permitted_corpora(user_id)
        user_items = self._user_items.get_user_items(user_id)
        used_keywords = set()
        all_keywords_map = dict(self.all_keywords)
        if filter_dict.get("minSize"):
            min_size = l10n.desimplify_num(filter_dict.get("minSize"), strict=False)
        else:
            min_size = 0
        if filter_dict.get("maxSize"):
            max_size = l10n.desimplify_num(filter_dict.get("maxSize"), strict=False)
        else:
            max_size = None
        corplist = self.get_list(permitted_corpora)

        if offset is None:
            offset = 0
        else:
            offset = int(offset)

        if limit is None:
            limit = self._max_page_size

        def cut_result(res):
            if limit is not None:
                right_lim = offset + int(limit)
                new_res = res[offset:right_lim]
                if right_lim >= len(res):
                    right_lim = None
            else:
                right_lim = None
                new_res = res
            return new_res, right_lim

        def is_fav(corpus_id):
            for item in user_items:
                if isinstance(item, CorpusItem) and item.corpus_id == corpus_id:
                    return True
            return False

        query_substrs, query_keywords = self._parse_query(query)
        matches_all = lambda d: reduce(lambda t1, t2: t1 and t2, d, True)

        def matches_size(d):
            item_size = d.get("size", None)
            return (
                item_size is not None
                and (not min_size or int(item_size) >= int(min_size))
                and (not max_size or int(item_size) <= int(max_size))
            )

        normalized_query_substrs = [s.lower() for s in query_substrs]

        for corp in corplist:
            full_data = self.get_corpus_info(corp["id"], self.getlocal("lang"))
            if not isinstance(full_data, BrokenCorpusInfo):
                keywords = [k for k in full_data["metadata"]["keywords"].keys()]
                hits = []
                found_in = []

                hits.extend([k in keywords for k in query_keywords])
                for s in normalized_query_substrs:
                    # the name must be tested first to prevent the list 'found_in'
                    # from being filled when the item matches both the name and the description
                    if s in corp["name"].lower():
                        hits.append(True)
                    elif s in (corp["desc"].lower() if corp["desc"] else ""):
                        hits.append(True)
                        found_in.append(_("description"))
                    else:
                        hits.append(False)
                hits.append(matches_size(corp))
                hits.append(self.custom_filter(full_data, permitted_corpora))

                if matches_all(hits):
                    corp["raw_size"] = l10n.simplify_num(corp["size"]) if corp["size"] else None
                    corp["keywords"] = [(k, all_keywords_map[k]) for k in keywords]
                    corp["found_in"] = found_in
                    corp["user_item"] = is_fav(corp["id"])
                    self.customize_search_result_item(corp, full_data)
                    ans["rows"].append(corp)
                    used_keywords.update(keywords)

        corp_cmp_key = lambda c: c.get("name") if c.get("name") is not None else ""
        ans["rows"], ans["nextOffset"] = cut_result(l10n.sort(ans["rows"], loc=self._lang(), key=corp_cmp_key))
        ans["keywords"] = l10n.sort(used_keywords, loc=self._lang())
        ans["query"] = query
        ans["filters"] = dict(filter_dict)
        return ans
Example #27
 def sort(self, plugin_api, data, *fields):
     def corp_cmp_key(c):
         return c.get('name') if c.get('name') is not None else ''
     return l10n.sort(data, loc=plugin_api.user_lang, key=corp_cmp_key)
Example #28
    def search(self,
               plugin_api,
               query,
               offset=0,
               limit=None,
               filter_dict=None):
        if query is False:  # False means 'use default values'
            query = ''
        ans = {'rows': []}
        permitted_corpora = self._auth.permitted_corpora(plugin_api.user_dict)
        used_keywords = set()
        all_keywords_map = dict(
            self._corparch.all_keywords(plugin_api.user_lang))
        if filter_dict.get('minSize'):
            min_size = l10n.desimplify_num(filter_dict.get('minSize'),
                                           strict=False)
        else:
            min_size = 0
        if filter_dict.get('maxSize'):
            max_size = l10n.desimplify_num(filter_dict.get('maxSize'),
                                           strict=False)
        else:
            max_size = None
        if filter_dict.get('favOnly'):
            favourite_only = bool(int(filter_dict.get('favOnly')))
        else:
            favourite_only = False

        if offset is None:
            offset = 0
        else:
            offset = int(offset)

        if limit is None:
            limit = int(self._corparch.max_page_size)
        else:
            limit = int(limit)

        user_items = self._corparch.user_items.get_user_items(plugin_api)

        def fav_id(corpus_id):
            for item in user_items:
                if item.is_single_corpus and item.main_corpus_id == corpus_id:
                    return item.ident
            return None

        query_substrs, query_keywords = parse_query(self._tag_prefix, query)

        normalized_query_substrs = [s.lower() for s in query_substrs]
        for corp in self._corparch.get_list(plugin_api, permitted_corpora):
            full_data = self._corparch.get_corpus_info(plugin_api.user_lang,
                                                       corp['id'])
            if not isinstance(full_data, BrokenCorpusInfo):
                if favourite_only and fav_id(corp['id']) is None:
                    continue

                keywords = [k for k, _ in full_data.metadata.keywords]
                tests = []
                found_in = []

                tests.extend([k in keywords for k in query_keywords])
                for s in normalized_query_substrs:
                    # the name must be tested first to prevent the list 'found_in'
                    # from being filled when the item matches both the name and the description
                    if s in corp['name'].lower():
                        tests.append(True)
                    elif s in (corp['desc'].lower() if corp['desc'] else ''):
                        tests.append(True)
                        found_in.append('defaultCorparch__found_in_desc')
                    else:
                        tests.append(False)
                tests.append(self.matches_size(corp, min_size, max_size))
                tests.append(
                    self._corparch.custom_filter(self._plugin_api, full_data,
                                                 permitted_corpora))

                if self.matches_all(tests):
                    corp['size'] = corp['size']
                    corp['size_info'] = l10n.simplify_num(
                        corp['size']) if corp['size'] else None
                    corp['keywords'] = [(k, all_keywords_map[k])
                                        for k in keywords]
                    corp['found_in'] = found_in
                    corp['fav_id'] = fav_id(corp['id'])
                    # because of client-side fav/feat/search items compatibility
                    corp['corpus_id'] = corp['id']
                    ans['rows'].append(corp)
                    used_keywords.update(keywords)
                    if not self.should_fetch_next(ans, offset, limit):
                        break
        ans['rows'], ans['nextOffset'] = self.cut_result(
            self.sort(plugin_api, ans['rows']), offset, limit)
        ans['keywords'] = l10n.sort(used_keywords, loc=plugin_api.user_lang)
        ans['query'] = query
        ans['current_keywords'] = query_keywords
        ans['filters'] = dict(filter_dict)
        return ans
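Example #28 relies on two small paging helpers whose bodies are not shown: should_fetch_next and cut_result. A plausible sketch of both, assuming a nextOffset of None marks the last page (the actual implementations may differ):

def should_fetch_next(ans, offset, limit):
    # keep collecting rows until one extra item beyond the current page is
    # available -- that single surplus row is what proves a next page exists
    return len(ans['rows']) < offset + limit + 1


def cut_result(rows, offset, limit):
    # slice out the requested page and report the next offset only when
    # at least one row exists beyond the page boundary
    next_offset = offset + limit if len(rows) > offset + limit else None
    return rows[offset:offset + limit], next_offset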
Example #29
    def subcorp_list(self, request):
        """
        Displays a list of user subcorpora. In case there is a 'subc_restore' plug-in
        installed then the list is enriched by additional re-use/undelete information.
        """
        self.disabled_menu_items = (MainMenu.VIEW, MainMenu.FILTER,
                                    MainMenu.FREQUENCY, MainMenu.COLLOCATIONS,
                                    MainMenu.SAVE, MainMenu.CONCORDANCE)

        sort = request.args.get('sort', 'name')
        filter_args = dict(show_deleted=bool(
            int(request.args.get('show_deleted', 0))),
                           corpname=request.args.get('corpname'))
        data = []
        user_corpora = plugins.runtime.AUTH.instance.permitted_corpora(
            self.session_get('user')).values()
        related_corpora = set()
        for corp in user_corpora:
            try:
                for item in self.cm.subcorp_names(corp):
                    sc = self.cm.get_Corpus(corp, item['n'])
                    data.append({
                        'name': '%s:%s' % (self._canonical_corpname(corp), item['n']),
                        'size': sc.search_size(),
                        'created': time.mktime(sc.created.timetuple()),
                        'corpname': corp,
                        'human_corpname': sc.get_conf('NAME'),
                        'usesubcorp': item['n'],
                        'deleted': False
                    })
                    related_corpora.add(self._canonical_corpname(corp))
            except Exception as e:
                for d in data:
                    # permitted_corpora does this
                    d['usesubcorp'] = werkzeug.urls.url_quote(d['usesubcorp'],
                                                              unsafe='+')
                logging.getLogger(__name__).warn(
                    'Failed to fetch information about subcorpus of [%s]: %s' %
                    (corp, e))

        if filter_args['corpname']:
            data = filter(
                lambda item: not filter_args['corpname'] or item['corpname'] == filter_args['corpname'],
                data)
        elif filter_args['corpname'] is None:
            filter_args['corpname'] = ''  # JS code requires non-null value

        if plugins.runtime.SUBC_RESTORE.exists:
            try:
                full_list = plugins.runtime.SUBC_RESTORE.instance.extend_subc_list(
                    self._plugin_api, data, filter_args, 0)
            except Exception as e:
                logging.getLogger(__name__).error(
                    'subc_restore plug-in failed to list queries: %s' % e)
                full_list = data
        else:
            full_list = data

        sort_key, rev = self._parse_sorting_param(sort)
        if sort_key in ('size', 'created'):
            full_list = sorted(full_list,
                               key=lambda x: x[sort_key],
                               reverse=rev)
        else:
            full_list = l10n.sort(full_list,
                                  loc=self.ui_lang,
                                  key=lambda x: x[sort_key],
                                  reverse=rev)
        unfinished_corpora = filter(
            lambda at: not at.is_finished(),
            self.get_async_tasks(category=AsyncTaskStatus.CATEGORY_SUBCORPUS))
        ans = dict(
            SubcorpList=[],  # this is used by the subcorpus SELECT element; no need for that here
            subcorp_list=full_list,
            sort_key=dict(name=sort_key, reverse=rev),
            filter=filter_args,
            unfinished_subc=[uc.to_dict() for uc in unfinished_corpora],
            related_corpora=sorted(related_corpora))
        return ans
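Both subcorp_list variants decode the sort parameter via _parse_sorting_param. A minimal standalone sketch under the usual "leading minus means descending" convention (an assumption, as the helper is not shown here):

def parse_sorting_param(sort):
    # '-created' -> ('created', True); 'name' -> ('name', False)
    if sort.startswith('-'):
        return sort[1:], True
    return sort, False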
Example #30
def texttype_values(
        corp: Corpus,
        subcorpattrs: str,
        maxlistsize: int,
        shrink_list: Union[Tuple[str, ...], List[str]] = (),
        collator_locale: Optional[str] = None) -> List[Dict[str, Any]]:
    """
    arguments:
    corp -- manatee.Corpus
    subcorpattrs -- structures and attributes to be processed (see Manatee's SUBCORPATTRS)
    maxlistsize -- in case there is more that this number of items, empty list will be returned
    shrink_list -- list/tuple of attributes we want to return empty lists for (False can be used
                   to specify an empty value)
    collator_locale -- a collator used to sort attribute values (en_US is the default)

    returns:
    a list containing following dictionaries
    { 'Line' : [
        { 'attr_doc_label' : '', 'Values' : [ {'v', 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        { 'attr_doc_label' : '', 'Values' : [ {'v', 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        ...
    ]}

    !!!!!!
    NOTE: avoid calling this method repeatedly for the same corpus as the
    attr = corp.get_attr(n) line is leaking opened files of corpora indexes which
    leads to exhausted limit for opened files for Gunicorn/Celery after some time.
    KonText caches the value returned by this function to prevent this.

    !!! TODO !!!

    """
    if subcorpattrs == '#':
        return []
    attrlines = []

    if not shrink_list:
        shrink_list = ()

    for subcorpline in subcorpattrs.split(','):
        attrvals = []
        for n in subcorpline.split('|'):
            if n in ('', '#'):
                continue
            attr = corp.get_attr(n)
            attrval = {
                'name': n,
                'label': corp.get_conf(n + '.LABEL') or n,
                'attr_doc': corp.get_conf(n + '.ATTRDOC'),
                'attr_doc_label': corp.get_conf(n + '.ATTRDOCLABEL'),
                'numeric': conf_bool(corp.get_conf(n + '.NUMERIC'))
            }
            hsep = corp.get_conf(n + '.HIERARCHICAL')
            multisep = corp.get_conf(n + '.MULTISEP')
            is_multival = corp.get_conf(n + '.MULTIVAL') in ('y', 'yes')
            if (not hsep and
                (corp.get_conf(n + '.TEXTBOXLENGTH')
                 or attr.id_range() > maxlistsize or n in shrink_list)):
                attrval['textboxlength'] = (corp.get_conf(n + '.TEXTBOXLENGTH')
                                            or 24)
            else:  # list of values
                if conf_bool(corp.get_conf(n + '.NUMERIC')):
                    vals = []
                    for i in range(attr.id_range()):
                        try:
                            vals.append({'v': int(attr.id2str(i))})
                        except ValueError:
                            vals.append({'v': attr.id2str(i)})
                elif hsep:  # hierarchical
                    vals = [{'v': attr.id2str(i)} for i in range(attr.id_range())
                            if multisep not in attr.id2str(i)]
                else:
                    if is_multival:
                        raw_vals = [attr.id2str(i).split(multisep) for i in range(attr.id_range())]
                        vals = [{'v': x} for x in sorted(set(s for subl in raw_vals for s in subl))]
                    else:
                        vals = [{'v': attr.id2str(i)} for i in range(attr.id_range())]

                if hsep:  # hierarchical
                    attrval['hierarchical'] = hsep
                    attrval['Values'] = _get_attr_hierarchy(vals, hsep)
                elif conf_bool(corp.get_conf(n + '.NUMERIC')):
                    attrval['Values'] = sorted(vals,
                                               key=lambda item: item['v'])
                elif collator_locale:
                    attrval['Values'] = l10n.sort(vals,
                                                  collator_locale,
                                                  key=lambda item: item['v'])
                else:
                    attrval['Values'] = sorted(vals, key=lambda item: item['v'].lower())
            attrvals.append(attrval)
        attrlines.append({'Line': attrvals})
    return attrlines
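texttype_values interprets Manatee's string-valued registry flags through conf_bool. A likely reading, assuming the usual affirmative spellings (the exact set accepted by KonText is an assumption):

def conf_bool(v):
    # Manatee registry values are plain strings; map the common
    # affirmative spellings to True, everything else to False
    return v in ('y', 'yes', 'true', 't', '1')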
Example #31
    def search(self, plugin_api, query, offset=0, limit=None, filter_dict=None):
        if query is False:  # False means 'use default values'
            query = ''
        if filter_dict.get('minSize'):
            min_size = l10n.desimplify_num(filter_dict.get('minSize'), strict=False)
        else:
            min_size = 0
        if filter_dict.get('maxSize'):
            max_size = l10n.desimplify_num(filter_dict.get('maxSize'), strict=False)
        else:
            max_size = None
        if filter_dict.get('requestable'):
            requestable = bool(int(filter_dict.get('requestable')))
        else:
            requestable = False
        if filter_dict.get('favOnly'):
            favourites_only = bool(int(filter_dict.get('favOnly')))
        else:
            favourites_only = False

        if offset is None:
            offset = 0
        else:
            offset = int(offset)

        if limit is None:
            limit = int(self._corparch.max_page_size)
        else:
            limit = int(limit)

        user_items = self._corparch.user_items.get_user_items(plugin_api)
        favourite_corpora = {
            item.main_corpus_id: item.ident for item in user_items if item.is_single_corpus}

        def get_found_in(corp, phrases):
            ans = []
            for phrase in phrases:
                phrase = phrase.lower()
                name = corp.name.lower() if corp.name is not None else ''
                desc = corp.description.lower() if corp.description is not None else ''
                if phrase not in name and phrase in desc:
                    ans.append('defaultCorparch__found_in_desc')
                    break
            return ans

        query_substrs, query_keywords = parse_query(self._tag_prefix, query)
        normalized_query_substrs = [s.lower() for s in query_substrs]
        used_keywords = set()
        rows = list(self._corparch.list_corpora(plugin_api, substrs=normalized_query_substrs,
                                                min_size=min_size, max_size=max_size, requestable=requestable,
                                                offset=offset, limit=limit + 1, keywords=query_keywords,
                                                favourites=tuple(favourite_corpora.keys()) if favourites_only else ()).values())
        ans = []
        for i, corp in enumerate(rows):
            used_keywords.update(corp.keywords)
            corp.keywords = self._corparch.get_l10n_keywords(corp.keywords, plugin_api.user_lang)
            corp.fav_id = favourite_corpora.get(corp.id, None)
            corp.found_in = get_found_in(corp, normalized_query_substrs)
            ans.append(corp.to_dict())
            if i == limit - 1:
                break
        return dict(rows=ans,
                    nextOffset=offset + limit if len(rows) > limit else None,
                    keywords=l10n.sort(used_keywords, loc=plugin_api.user_lang),
                    query=query,
                    current_keywords=query_keywords,
                    filters=dict(filter_dict))
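All of the search variants call parse_query to split the raw query into plain substrings and keyword tokens marked by the configured tag prefix. A simplified sketch, assuming whitespace-separated tokens and a prefix such as '+' (the real parser may also handle quoting):

def parse_query(tag_prefix, query):
    substrs, keywords = [], []
    for tok in (query or '').split():
        if tok.startswith(tag_prefix):
            keywords.append(tok[len(tag_prefix):])
        else:
            substrs.append(tok)
    return substrs, keywords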
Example #32
    def subcorp_list(self, request):
        """
        Displays a list of user subcorpora. In case there is a 'subc_restore' plug-in
        installed then the list is enriched by additional re-use/undelete information.
        """
        self.disabled_menu_items = (MainMenu.VIEW, MainMenu.FILTER, MainMenu.FREQUENCY,
                                    MainMenu.COLLOCATIONS, MainMenu.SAVE, MainMenu.CONCORDANCE)

        filter_args = dict(show_deleted=bool(int(request.args.get('show_deleted', 0))),
                           corpname=request.args.get('corpname'))
        data = []
        user_corpora = plugins.runtime.AUTH.instance.permitted_corpora(
            self.session_get('user')).keys()
        related_corpora = set()
        for corp in user_corpora:
            for item in self.user_subc_names(corp):
                try:
                    sc = self.cm.get_Corpus(corp, subcname=item['n'], decode_desc=False)
                    data.append({
                        'name': '%s / %s' % (corp, item['n']),
                        'size': sc.search_size(),
                        'created': time.mktime(sc.created.timetuple()),
                        'corpname': corp,
                        'human_corpname': sc.get_conf('NAME'),
                        'usesubcorp': sc.subcname,
                        'orig_subcname': sc.orig_subcname,
                        'deleted': False,
                        'description': sc.description,
                        'published': corplib.subcorpus_is_published(sc.spath)
                    })
                    related_corpora.add(corp)
                except RuntimeError as e:
                    logging.getLogger(__name__).warn(
                        u'Failed to fetch information about subcorpus {0}:{1}: {2}'.format(corp, item['n'], e))

        if filter_args['corpname']:
            data = filter(lambda item: not filter_args['corpname'] or item['corpname'] == filter_args['corpname'],
                          data)
        elif filter_args['corpname'] is None:
            filter_args['corpname'] = ''  # JS code requires non-null value

        if plugins.runtime.SUBC_RESTORE.exists:
            try:
                full_list = plugins.runtime.SUBC_RESTORE.instance.extend_subc_list(self._plugin_api, data,
                                                                                   filter_args, 0)
            except Exception as e:
                logging.getLogger(__name__).error(
                    'subc_restore plug-in failed to list queries: %s' % e)
                full_list = data
        else:
            full_list = data

        sort = request.args.get('sort', '-created')
        sort_key, rev = self._parse_sorting_param(sort)
        if sort_key in ('size', 'created'):
            full_list = sorted(full_list, key=lambda x: x[sort_key], reverse=rev)
        else:
            full_list = l10n.sort(full_list, loc=self.ui_lang,
                                  key=lambda x: x[sort_key], reverse=rev)

        ans = dict(
            SubcorpList=[],   # this is used by subcorpus SELECT element; no need for that here
            subcorp_list=full_list,
            sort_key=dict(name=sort_key, reverse=rev),
            filter=filter_args,
            processed_subc=[v.to_dict() for v in self.get_async_tasks(
                category=AsyncTaskStatus.CATEGORY_SUBCORPUS)],
            related_corpora=sorted(related_corpora),
            uses_subc_restore=plugins.runtime.SUBC_RESTORE.exists
        )
        return ans
Example #33
    def search(self,
               plugin_api,
               query,
               offset=0,
               limit=None,
               filter_dict=None):
        if query is False:  # False means 'use default values'
            query = ''
        if filter_dict.get('minSize'):
            min_size = l10n.desimplify_num(filter_dict.get('minSize'),
                                           strict=False)
        else:
            min_size = 0
        if filter_dict.get('maxSize'):
            max_size = l10n.desimplify_num(filter_dict.get('maxSize'),
                                           strict=False)
        else:
            max_size = None

        if offset is None:
            offset = 0
        else:
            offset = int(offset)

        if limit is None:
            limit = int(self._corparch.max_page_size)
        else:
            limit = int(limit)

        user_items = self._corparch.user_items.get_user_items(plugin_api)

        def fav_id(corpus_id):
            for item in user_items:
                if item.is_single_corpus and item.main_corpus_id == corpus_id:
                    return item.ident
            return None

        def get_found_in(corp, phrases):
            ans = []
            for phrase in phrases:
                if phrase in corp.description.lower():
                    ans.append(_('description'))
                    break
            return ans

        query_substrs, query_keywords = parse_query(self._tag_prefix, query)
        normalized_query_substrs = [s.lower() for s in query_substrs]
        used_keywords = set()
        rows = self._corparch.list_corpora(plugin_api,
                                           substrs=normalized_query_substrs,
                                           min_size=min_size,
                                           max_size=max_size,
                                           offset=offset,
                                           limit=limit + 1,
                                           keywords=query_keywords).values()
        ans = []
        for i, corp in enumerate(rows):
            used_keywords.update(corp.keywords)
            corp.keywords = self._corparch.get_l10n_keywords(
                corp.keywords, plugin_api.user_lang)
            corp.fav_id = fav_id(corp.id)
            corp.found_in = get_found_in(corp, normalized_query_substrs)
            ans.append(corp.to_dict())
            if i == limit - 1:
                break
        return dict(rows=ans,
                    nextOffset=(limit + 1) if len(rows) > limit else None,
                    keywords=l10n.sort(used_keywords,
                                       loc=plugin_api.user_lang),
                    query=query,
                    current_keywords=query_keywords,
                    filters=dict(filter_dict))
Example #34
    def search(self, plugin_api, query, offset=0, limit=None, filter_dict=None):
        if query is False:  # False means 'use default values'
            query = ''
        if filter_dict.get('minSize'):
            min_size = l10n.desimplify_num(filter_dict.get('minSize'), strict=False)
        else:
            min_size = 0
        if filter_dict.get('maxSize'):
            max_size = l10n.desimplify_num(filter_dict.get('maxSize'), strict=False)
        else:
            max_size = None
        if filter_dict.get('requestable'):
            requestable = bool(int(filter_dict.get('requestable')))
        else:
            requestable = False

        if offset is None:
            offset = 0
        else:
            offset = int(offset)

        if limit is None:
            limit = int(self._corparch.max_page_size)
        else:
            limit = int(limit)

        user_items = self._corparch.user_items.get_user_items(plugin_api)

        def fav_id(corpus_id):
            for item in user_items:
                if item.is_single_corpus and item.main_corpus_id == corpus_id:
                    return item.ident
            return None

        def get_found_in(corp, phrases):
            ans = []
            for phrase in phrases:
                phrase = phrase.lower()
                if phrase not in corp.name.lower() and phrase in corp.description.lower():
                    ans.append('defaultCorparch__found_in_desc')
                    break
            return ans

        query_substrs, query_keywords = parse_query(self._tag_prefix, query)
        normalized_query_substrs = [s.lower() for s in query_substrs]
        used_keywords = set()
        rows = self._corparch.list_corpora(plugin_api, substrs=normalized_query_substrs,
                                           min_size=min_size, max_size=max_size, requestable=requestable,
                                           offset=offset, limit=limit + 1, keywords=query_keywords).values()
        ans = []
        for i, corp in enumerate(rows):
            used_keywords.update(corp.keywords)
            corp.keywords = self._corparch.get_l10n_keywords(corp.keywords, plugin_api.user_lang)
            corp.fav_id = fav_id(corp.id)
            corp.found_in = get_found_in(corp, normalized_query_substrs)
            ans.append(corp.to_dict())
            if i == limit - 1:
                break
        return dict(rows=ans,
                    nextOffset=offset + limit if len(rows) > limit else None,
                    keywords=l10n.sort(used_keywords, loc=plugin_api.user_lang),
                    query=query,
                    current_keywords=query_keywords,
                    filters=dict(filter_dict))
Example #35
    def xfreq_dist(self,
                   crit,
                   limit=1,
                   sortkey='f',
                   ftt_include_empty: int = 0,
                   rel_mode=0,
                   collator_locale='en_US'):
        """
        Calculates data (including data for visual output) of a frequency distribution
        specified by the 'crit' parameter

        arguments:
        crit -- specified criteria (CQL)
        limit -- minimal accepted frequency; this value is exclusive (i.e. accepted
                 values must be greater than the limit)
        sortkey -- a key according to which the distribution will be sorted
        ftt_include_empty -- str, TODO
        rel_mode -- {0, 1} (0 for structural attributes, 1 for positional ones)
        """
        def label(attr):
            if '/' in attr:
                attr = attr[:attr.index('/')]
            lab = self.pycorp.get_conf(attr + '.LABEL')
            return lab if lab else attr

        def export_word(wrd):
            return [{'n': '  '.join(n.split('\v'))} for n in wrd.split('\t')]

        def test_word_empty(wrd):
            return len(wrd) == 1 and (wrd[0]['n'] == ''
                                      or wrd[0]['n'] == '===NONE===')

        words = manatee.StrVector()
        freqs = manatee.NumVector()
        norms = manatee.NumVector()
        self.pycorp.freq_dist(self.RS(), crit, limit, words, freqs, norms)
        if len(freqs) == 0:
            return dict(Head=[],
                        Items=[],
                        SkippedEmpty=False,
                        NoRelSorting=True)

        # for structural attrs, we intentionally overwrite the norms filled in by Corpus.freq_dist(),
        # because the metrics they lead to are hard to explain
        if rel_mode == 0:
            norms2_dict = self.get_attr_values_sizes(crit)
            norms = [norms2_dict.get(x, 0) for x in words]
        # For positional attrs, the norm is the size of the actual corpus/subcorpus. Note that for
        # an "ad hoc" (unnamed) subcorpus this may be misleading, as we still calculate against the original corpus.
        else:
            norms = [self.pycorp.search_size for _ in words]

        attrs = crit.split()
        head: List[Dict[str, Any]] = [
            dict(n=label(attrs[x]), s=x / 2) for x in range(0, len(attrs), 2)
        ]
        head.append(
            dict(n=translate('Freq'), s='freq', title=translate('Frequency')))
        has_empty_item = False
        head.append(
            dict(
                n='i.p.m.',
                title=translate(
                    'instances per million positions (refers to the respective category)'
                ),
                s='rel'))

        lines = []
        for w, f, nf in zip(words, freqs, norms):
            word = export_word(w)
            if test_word_empty(word):
                has_empty_item = True
                continue
            lines.append(
                dict(Word=word, freq=f, norm=nf, rel=round(f / nf * 1e6, 2)))
        if ftt_include_empty and limit == 0 and '.' in attrs[0]:
            attr = self.pycorp.get_attr(attrs[0])
            all_vals = [attr.id2str(i) for i in range(attr.id_range())]
            used_vals = [line['Word'][0]['n'] for line in lines]
            for v in all_vals:
                if v in used_vals:
                    continue
                lines.append(dict(Word=[{'n': v}], freq=0, rel=0, norm=0))
        if (sortkey
                in ('0', '1', '2')) and (int(sortkey) < len(lines[0]['Word'])):
            sortkey = int(sortkey)
            lines = l10n.sort(lines,
                              loc=collator_locale,
                              key=lambda v: v['Word'][sortkey]['n'])
        else:
            if sortkey not in ('freq', 'rel'):
                sortkey = 'freq'
            lines = sorted(lines, key=lambda v: v[sortkey], reverse=True)
        return dict(Head=head,
                    Items=lines,
                    SkippedEmpty=has_empty_item,
                    NoRelSorting=bool(rel_mode))
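The rel column computed in xfreq_dist is a plain instances-per-million figure, rel = f / nf * 1e6, rounded to two decimals. A quick worked example:

f, nf = 150, 3000000           # absolute frequency vs. norm (positions)
ipm = round(f / nf * 1e6, 2)   # -> 50.0 instances per million positions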
Example #36
    def search(self, plugin_api, query, offset=0, limit=None, filter_dict=None):
        if query is False:  # False means 'use default values'
            query = ''
        ans = {'rows': []}
        permitted_corpora = self._auth.permitted_corpora(plugin_api.user_dict)
        used_keywords = set()
        all_keywords_map = dict(self._corparch.all_keywords(plugin_api.user_lang))
        if filter_dict.get('minSize'):
            min_size = l10n.desimplify_num(filter_dict.get('minSize'), strict=False)
        else:
            min_size = 0
        if filter_dict.get('maxSize'):
            max_size = l10n.desimplify_num(filter_dict.get('maxSize'), strict=False)
        else:
            max_size = None

        if offset is None:
            offset = 0
        else:
            offset = int(offset)

        if limit is None:
            limit = int(self._corparch.max_page_size)
        else:
            limit = int(limit)

        user_items = self._corparch.user_items.get_user_items(plugin_api)

        def fav_id(corpus_id):
            for item in user_items:
                if item.is_single_corpus and item.main_corpus_id == corpus_id:
                    return item.ident
            return None

        query_substrs, query_keywords = parse_query(self._tag_prefix, query)

        normalized_query_substrs = [s.lower() for s in query_substrs]
        for corp in self._corparch.get_list(plugin_api, permitted_corpora):
            full_data = self._corparch.get_corpus_info(plugin_api.user_lang, corp['id'])
            if not isinstance(full_data, BrokenCorpusInfo):
                keywords = [k for k in full_data['metadata']['keywords'].keys()]
                tests = []
                found_in = []

                tests.extend([k in keywords for k in query_keywords])
                for s in normalized_query_substrs:
                    # the name must be tested first to prevent the 'found_in' list
                    # from being filled when an item matches both name and description
                    if s in corp['name'].lower():
                        tests.append(True)
                    elif s in (corp['desc'].lower() if corp['desc'] else ''):
                        tests.append(True)
                        found_in.append('defaultCorparch__found_in_desc')
                    else:
                        tests.append(False)
                tests.append(self.matches_size(corp, min_size, max_size))
                tests.append(self._corparch.custom_filter(
                    self._plugin_api, full_data, permitted_corpora))

                if self.matches_all(tests):
                    corp['size'] = corp['size']
                    corp['size_info'] = l10n.simplify_num(corp['size']) if corp['size'] else None
                    corp['keywords'] = [(k, all_keywords_map[k]) for k in keywords]
                    corp['found_in'] = found_in
                    corp['fav_id'] = fav_id(corp['id'])
                    # because of client-side fav/feat/search items compatibility
                    corp['corpus_id'] = corp['id']
                    ans['rows'].append(corp)
                    used_keywords.update(keywords)
                    if not self.should_fetch_next(ans, offset, limit):
                        break
        ans['rows'], ans['nextOffset'] = self.cut_result(
            self.sort(plugin_api, ans['rows']), offset, limit)
        ans['keywords'] = l10n.sort(used_keywords, loc=plugin_api.user_lang)
        ans['query'] = query
        ans['current_keywords'] = query_keywords
        ans['filters'] = dict(filter_dict)
        return ans
Example #37
    def get_attr_values(self, corpus, attr_map, aligned_corpora=None):
        """
        Finds all the available values of remaining attributes according to the
        provided attr_map and aligned_corpora

        arguments:
        corpus -- manatee.corpus object
        attr_map -- a dictionary of attributes and values as selected by a user
        aligned_corpora -- a list/tuple of corpora names aligned to the base one (the 'corpus' argument)

        returns:
        a dictionary containing matching attributes and values
        """
        corpname = vanilla_corpname(corpus.corpname)
        corpus_info = self.corparch.get_corpus_info(corpname)
        bib_label = LiveAttributes.import_key(corpus_info.metadata.label_attr)
        bib_id = LiveAttributes.import_key(corpus_info.metadata.id_attr)
        attrs = self._get_subcorp_attrs(corpus)

        if bib_label and bib_label not in attrs:
            attrs.append(bib_label)

        srch_attrs = set(attrs) - set(
            self.import_key(k)
            for k in attr_map.keys() if type(attr_map[k]) is not dict)
        srch_attrs.add('poscount')

        hidden_attrs = set()
        if bib_id is not None and bib_id not in srch_attrs:
            hidden_attrs.add(bib_id)
        if not bib_id:
            hidden_attrs.add('id')

        selected_attrs = tuple(srch_attrs.union(hidden_attrs))

        # a map [db_col_name]=>[db_col_idx]
        srch_attr_map = dict([(x[1], x[0]) for x in enumerate(selected_attrs)])

        attr_items = AttrArgs(attr_map, self.empty_val_placeholder)
        where_sql, where_values = attr_items.export_sql('t1', corpname)

        join_sql = []
        i = 2
        for item in aligned_corpora:
            join_sql.append('JOIN item AS t%d ON t1.item_id = t%d.item_id' %
                            (i, i))
            where_sql += ' AND t%d.corpus_id = ?' % i
            where_values.append(item)
            i += 1

        if len(where_sql) > 0:
            sql_template = "SELECT DISTINCT %s FROM item AS t1 %s WHERE %s" \
                           % (', '.join(self.apply_prefix(selected_attrs, 't1')), ' '.join(join_sql), where_sql)
        else:
            sql_template = "SELECT DISTINCT %s FROM item AS t1 %s " \
                           % (', '.join(self.apply_prefix(selected_attrs, 't1')), ' '.join(join_sql))

        ans = {}
        # already selected items are part of the answer; no need to fetch them from db
        ans.update(dict([(self.import_key(k), v)
                         for k, v in attr_map.items()]))
        range_attrs = set()

        for attr in ans.keys():
            if type(ans[attr]) is dict:
                # currently we throw away the range and load all the stuff
                ans[attr] = set()
                range_attrs.add(attr)

        for attr in srch_attrs:
            if attr in ('poscount', ):
                ans[attr] = 0
            else:
                ans[attr] = set()

        poscounts = defaultdict(lambda: defaultdict(lambda: 0))
        max_visible_chars = self.calc_max_attr_val_visible_chars(corpus_info)

        for item in self.db(corpname).execute(sql_template,
                                              *where_values).fetchall():
            for attr in selected_attrs:
                v = item[srch_attr_map[attr]]
                if v is not None and attr not in hidden_attrs:
                    attr_val = None
                    if attr == bib_label:
                        attr_val = (self.shorten_value(unicode(v), length=max_visible_chars),
                                    item[srch_attr_map[bib_id]], unicode(v))
                    elif type(ans[attr]) is set:
                        attr_val = (self.shorten_value(unicode(v), length=max_visible_chars), v, v)
                    elif type(ans[attr]) is int:
                        ans[attr] += int(v)

                    if attr_val is not None:
                        poscounts[attr][attr_val] += item['poscount']

        # here we append position count information to the respective items
        for attr, v in poscounts.items():
            for k, c in v.items():
                ans[attr].add(k + (l10n.format_number(c), ))
            del poscounts[attr]

        exported = {}
        collator_locale = corpus_info.collator_locale

        for k in ans.keys():
            if type(ans[k]) is set:
                if len(ans[k]) <= self.max_attr_list_size or k in range_attrs:
                    if k == bib_label:
                        out_data = l10n.sort(ans[k], collator_locale, key=lambda t: t[0])
                    else:
                        out_data = tuple(l10n.sort(ans[k], collator_locale, key=lambda t: t[0]))
                    exported[self.export_key(k)] = out_data
                else:
                    exported[self.export_key(k)] = {'length': len(ans[k])}

            else:
                exported[self.export_key(k)] = ans[k]
        exported['poscount'] = l10n.format_number(exported['poscount'])
        exported['aligned'] = aligned_corpora
        return exported
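To make the string-built SQL above concrete, here is what one pass of the join/where assembly could produce. Attribute and corpus names are purely illustrative, and the shape of the AttrArgs.export_sql output is an assumption:

# hypothetical rendering for selected_attrs=('doc_author', 'poscount')
# and aligned_corpora=['corp_b']:
sql_template = (
    "SELECT DISTINCT t1.doc_author, t1.poscount FROM item AS t1 "
    "JOIN item AS t2 ON t1.item_id = t2.item_id "
    "WHERE t1.doc_author = ? AND t2.corpus_id = ?")
where_values = ['Jane Austen', 'corp_b']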