Exemple #1
0
 def get_list(self, plugin_api, user_allowed_corpora):
     """
     Build a list of corpora the current user has access to.

     Items provided by self._raw_list() (called with plugin_api.user_lang)
     are filtered by user_allowed_corpora. Details of each matching corpus
     are fetched via self.manatee_corpora.get_info(). If that fails for any
     reason, the problem is logged and a reduced record (corpus ID used as
     a name, empty description, size None) is added instead.

     arguments:
     plugin_api -- an object providing the 'user_lang' attribute
     user_allowed_corpora -- a dict (corpus_id, corpus_variant) containing corpora ids
                             accessible by the current user

     returns:
     a list of dicts with keys 'id', 'name', 'desc', 'size' and 'path'
     """
     import logging

     cl = []
     for item in self._raw_list(plugin_api.user_lang).values():
         corp_id, path = item['id'], item['path']
         if corp_id not in user_allowed_corpora:
             continue
         try:
             corp_info = self.manatee_corpora.get_info(corp_id)
             cl.append({'id': corp_id,
                        'name': l10n.import_string(corp_info.name,
                                                   from_encoding=corp_info.encoding),
                        'desc': l10n.import_string(corp_info.description,
                                                   from_encoding=corp_info.encoding),
                        'size': corp_info.size,
                        'path': path
                        })
         except Exception as e:
             # corp_info must not be used here: if get_info() itself failed,
             # the variable is unbound (or still refers to the previous corpus).
             logging.getLogger(__name__).warning(
                 u'Failed to fetch info about %s with error %s (%r)',
                 corp_id, type(e).__name__, e)
             cl.append({
                 'id': corp_id, 'name': corp_id,
                 'path': path, 'desc': '', 'size': None})
     return cl
Exemple #2
0
 def get_list(self, user_allowed_corpora):
     """
     Build a list of all the corpora provided by self._raw_list().

     For each corpus, user_allowed_corpora is used to translate the canonical
     ID to the ID the current user should work with (the canonical ID itself
     is used if there is no such mapping). Corpus details are fetched via
     self._manatee_corpora.get_info(); if that fails, the problem is logged
     and a reduced record (corpus ID used as a name, empty description,
     size None, no 'requestable' key) is added instead.

     arguments:
     user_allowed_corpora -- a dict (corpus_canonical_id, corpus_id) containing corpora ids
                             accessible by the current user

     returns:
     a list of dicts describing individual corpora
     """
     import logging

     cl = []
     for item in self._raw_list().values():
         canonical_id, path = item['id'], item['path']
         corp_id = user_allowed_corpora.get(canonical_id, canonical_id)
         try:
             corp_info = self._manatee_corpora.get_info(corp_id)
             cl.append({'id': corp_id,
                        'canonical_id': canonical_id,
                        'name': l10n.import_string(corp_info.name,
                                                   from_encoding=corp_info.encoding),
                        'desc': l10n.import_string(corp_info.description,
                                                   from_encoding=corp_info.encoding),
                        'size': corp_info.size,
                        'path': path,
                        'requestable': item.get('requestable', False)
                        })
         except Exception as e:
             logging.getLogger(__name__).warning(
                 u'Failed to fetch info about %s with error %s (%r)',
                 corp_id, type(e).__name__, e)
             cl.append({
                 'id': corp_id, 'canonical_id': canonical_id, 'name': corp_id,
                 'path': path, 'desc': '', 'size': None})
     # the original version built the list but never handed it over to the caller
     return cl
Exemple #3
0
    def ajax_get_corp_details(self, request):
        """
        Collect details of the corpus specified by the 'corpname' request
        argument: its name, description, size, a list of positional
        attributes and a list of structures (both including sizes) and
        a web URL taken from the 'corparch' plug-in (an empty string if
        the plug-in provides no information about the corpus).

        If loading of the attribute list fails with RuntimeError, the error
        is logged and 'attrlist' contains a dict with an 'error' message
        instead of a list.
        """
        corparch = plugins.get('corparch')
        conf_info = corparch.get_corpus_info(request.args['corpname'])
        corpus = self.cm.get_Corpus(request.args['corpname'])
        enc = corpus.get_conf('ENCODING')

        def fmt_size(value):
            return l10n.format_number(int(value))

        ans = {}
        ans['corpname'] = l10n.import_string(
            self._canonical_corpname(corpus.get_conf('NAME')), from_encoding=enc)
        ans['description'] = l10n.import_string(corpus.get_info(), from_encoding=enc)
        ans['size'] = fmt_size(corpus.size())
        ans['attrlist'] = []
        ans['structlist'] = []
        ans['web_url'] = '' if conf_info is None else conf_info['web']

        try:
            attr_items = []
            for attr_name in corpus.get_conf('ATTRLIST').split(','):
                attr_items.append({'name': attr_name,
                                   'size': fmt_size(corpus.get_attr(attr_name).id_range())})
            ans['attrlist'] = attr_items
        except RuntimeError as e:
            logging.getLogger(__name__).warn('%s' % e)
            ans['attrlist'] = {'error': _('Failed to load')}

        struct_items = []
        for struct_name in corpus.get_conf('STRUCTLIST').split(','):
            struct_items.append({'name': struct_name,
                                 'size': fmt_size(corpus.get_struct(struct_name).size())})
        ans['structlist'] = struct_items
        return ans
Exemple #4
0
    def _create_subcorpus(self, request):
        """
        Create a new subcorpus of the current corpus and, if the
        'subc_restore' plug-in is available, store the query the subcorpus
        has been created from.

        req. arguments:
        subcname -- name of new subcorpus
        create -- bool, sets whether to create new subcorpus
        within_condition -- custom within condition; if non-empty then clickable form is omitted
        within_struct -- a structure the within_condition will be applied to

        returns:
        an empty dict

        raises:
        ConcError -- if no subcorpus name or no query is specified or
                     if the created subcorpus is empty
        """
        subcname = request.form['subcname']
        within_condition = request.form['within_condition']
        within_struct = request.form['within_struct']
        corp_encoding = self._corp().get_conf('ENCODING')

        # user entered a subcorpus query manually
        manual_query = bool(within_condition and within_struct)
        if manual_query:
            tt_query = [(export_string(within_struct, to_encoding=corp_encoding),
                         export_string(within_condition, to_encoding=corp_encoding))]
        else:
            tt_query = self._texttype_query(request)

        # Validate the input before anything is indexed or written to disk.
        # (Formerly, an empty tt_query ended up with IndexError as it had been
        # accessed before the 'Nothing specified!' test.)
        if not subcname:
            raise ConcError(_('No subcorpus name specified!'))
        if not tt_query:
            raise ConcError(_('Nothing specified!'))

        # Even if _texttype_query() parsed multiple structures into tt_query,
        # Manatee can accept directly only one (but with arbitrarily complex attribute
        # condition).
        # For this reason, we choose only the first struct+condition pair.
        # It is up to the user interface to deal with it.
        structname, subquery = tt_query[0]
        if not manual_query:
            within_struct = import_string(structname, from_encoding=corp_encoding)
            within_condition = import_string(subquery, from_encoding=corp_encoding)

        basecorpname = self.args.corpname.split(':')[0]
        path = os.path.join(self.subcpath[-1], basecorpname)
        if not os.path.isdir(path):
            os.makedirs(path)
        path = os.path.join(path, subcname) + '.subc'
        if isinstance(path, unicode):
            path = path.encode("utf-8")

        if not corplib.create_subcorpus(path, self._corp(), structname, subquery):
            raise ConcError(_('Empty subcorpus!'))

        if plugins.has_plugin('subc_restore'):
            try:
                plugins.get('subc_restore').store_query(user_id=self._session_get('user', 'id'),
                                                        corpname=self.args.corpname,
                                                        subcname=subcname,
                                                        structname=within_struct,
                                                        condition=within_condition)
            except Exception as e:
                # the backup is a best-effort operation; the subcorpus itself already exists
                logging.getLogger(__name__).warning('Failed to store subcorpus query: %s' % e)
                self.add_system_message('warning',
                                        _('Subcorpus created but there was a problem saving a backup copy.'))
        return {}
Exemple #5
0
def get_full_ref(corp, pos):
    """
    Return full reference information for a corpus position.

    The references are given by the corpus' FULLREF configuration item
    (a comma-separated list); the item configured as
    corpora/speech_segment_struct_attr in settings is skipped. The special
    name '#' stands for the token number itself.

    The returned dict contains the key 'Refs' (a list of dicts with 'name'
    and 'val') plus one key per reference (dots in the reference name are
    replaced by underscores) with the decoded value.
    """
    corpus_encoding = corp.get_conf('ENCODING')

    refs = []
    for n in corp.get_conf('FULLREF').split(','):
        if n == settings.get('corpora', 'speech_segment_struct_attr'):
            continue
        if n == '#':
            refs.append(('#', str(pos)))
        else:
            refs.append((n, corp.get_attr(n).pos2str(pos)))

    ref_items = []
    for n, v in refs:
        label = _('Token number') if n == '#' else None
        if not label:
            # fall back to a configured label or to the raw reference name
            label = corp.get_conf(n + '.LABEL') or n
        ref_items.append({'name': label, 'val': import_string(v, corpus_encoding)})

    data = {'Refs': ref_items}
    for n, v in refs:
        data[n.replace('.', '_')] = import_string(v, corpus_encoding)
    return data
Exemple #6
0
def get_full_ref(corp, pos):
    """
    Collect all the references (as configured by the FULLREF corpus
    setting) for the token at position 'pos'.

    A reference equal to the corpora/speech_segment_struct_attr setting is
    omitted. The returned dict contains a 'Refs' list (items with 'name'
    and 'val' keys) and also each reference value stored under its own name
    with dots replaced by underscores.
    """
    encoding = corp.get_conf('ENCODING')

    def ref_value(name):
        # '#' is a pseudo-reference representing the token number
        return str(pos) if name == '#' else corp.get_attr(name).pos2str(pos)

    def ref_label(name):
        return (name == '#' and _('Token number')) or corp.get_conf(name + '.LABEL') or name

    refs = [(name, ref_value(name))
            for name in corp.get_conf('FULLREF').split(',')
            if name != settings.get('corpora', 'speech_segment_struct_attr')]

    data = {}
    data['Refs'] = [dict(name=ref_label(name), val=import_string(value, encoding))
                    for name, value in refs]
    data.update((name.replace('.', '_'), import_string(value, encoding))
                for name, value in refs)
    return data
Exemple #7
0
    def _load_raw_sent(self, corpus, corpus_id, token_id, kwic_len, tree_attrs):
        """
        Retrieve a sentence via Manatee
        Args:
            corpus (manatee.Corpus): a corpus instance
            corpus_id (str): corpus ID
            token_id (int): token number/id
            kwic_len (int): number of tokens in KWIC
            tree_attrs (list of str): a list of positional attributes required by tree nodes/edges

        Returns (dict):
            data: a list of strings (Manatee raw format)
            kwic_pos: a tuple (first_kwic_idx, kwic_length)

            None is returned if Manatee provides no KWIC line.
        """
        encoding = corpus.get_conf('ENCODING')
        sentence_struct = self._conf.get_sentence_struct(corpus_id)
        # a query matching exactly the tokens token_id ... token_id + kwic_len - 1
        query = ' '.join('[#%d]' % k for k in range(token_id, token_id + kwic_len))
        conc = manatee.Concordance(corpus, query, 1, -1)
        conc.sync()
        kl = manatee.KWICLines(corpus, conc.RS(True, 0, 1),
                               '-1:%s' % sentence_struct,
                               '1:%s' % sentence_struct,
                               ','.join(tree_attrs),
                               ','.join(tree_attrs), '', '')
        if not kl.nextline():
            return None
        left_tk = kl.get_left()
        kwic_tk = kl.get_kwic()
        # Presumably a token occupies 4 items of the raw output - TODO confirm.
        # Floor division keeps the positions integral even with true division
        # enabled (Python 3, 'from __future__ import division').
        return dict(data=[import_string(s, from_encoding=encoding)
                          for s in left_tk + kwic_tk + kl.get_right()],
                    kwic_pos=(len(left_tk) // 4, len(kwic_tk) // 4))
Exemple #8
0
def calculate_colls_bg(coll_args):
    """
    Background collocations calculation.
    This function is expected to be run either
    from Celery or from other process (via multiprocessing).
    """
    cm = corplib.CorpusManager(subcpath=coll_args.subcpath)
    corp = cm.get_Corpus(coll_args.corpname, subcname=coll_args.subcname)
    try:
        # try to fetch precalculated data; if none then MissingSubCorpFreqFile
        corplib.frq_db(corp, coll_args.cattr)
        conc = conclib.get_conc(corp=corp, user_id=coll_args.user_id, minsize=coll_args.minsize, q=coll_args.q,
                                fromp=0, pagesize=0, async=0, save=coll_args.save, samplesize=coll_args.samplesize)
        if not conc.finished():
            raise UnfinishedConcordanceError(
                _('Cannot calculate yet - source concordance not finished. Please try again later.'))
        collocs = conc.collocs(cattr=coll_args.cattr, csortfn=coll_args.csortfn, cbgrfns=coll_args.cbgrfns,
                               cfromw=coll_args.cfromw, ctow=coll_args.ctow, cminfreq=coll_args.cminfreq,
                               cminbgr=coll_args.cminbgr, max_lines=coll_args.num_fetch_items)
        for item in collocs['Items']:
            item['pfilter'] = [('q2', item['pfilter'])]
            item['nfilter'] = [('q2', item['nfilter'])]
            item['str'] = import_string(item['str'], from_encoding=coll_args.corpus_encoding)
        return dict(data=collocs, processing=0, tasks=[])
    except corplib.MissingSubCorpFreqFile as e:
        ans = {'attrname': coll_args.cattr, 'tasks': []}
        out = freq_calc.build_arf_db(e.args[0], coll_args.cattr)
        if type(out) is list:
            processing = 1
            ans['tasks'].extend(out)
        else:
            processing = 0
        ans['processing'] = processing
        ans['data'] = dict(Items=[], Head=[])
        return ans
Exemple #9
0
def set_favorite_item(ctrl, request):
    """
    Create a FavoriteItem out of the submitted form ('corpora',
    'subcorpus_id', 'subcorpus_orig_id'), store it via the USER_ITEMS
    plug-in and return the item converted to a dict.

    The first corpus of the 'corpora' list is opened along with the
    submitted subcorpus and its search size becomes the size of the item.
    """
    corpora = []
    main_size = None
    for idx, corpus_id in enumerate(request.form.getlist('corpora')):
        if idx == 0:
            corp = ctrl.cm.get_Corpus(corpus_id, subcname=request.form['subcorpus_id'])
            main_size = corp.search_size()
        else:
            corp = ctrl.cm.get_Corpus(corpus_id, subcname=None)
        corp_name = l10n.import_string(corp.get_conf('NAME'), corp.get_conf('ENCODING'))
        corpora.append({'id': corpus_id, 'name': corp_name})

    subcorpus_id = request.form['subcorpus_id']
    subcorpus_orig_id = request.form['subcorpus_orig_id']

    title = u' || '.join(c['name'] for c in corpora)
    if subcorpus_orig_id:
        title = title + (u' / ' + subcorpus_orig_id)

    item = FavoriteItem({
        'name': title,
        'corpora': corpora,
        'subcorpus_id': subcorpus_id,
        'subcorpus_orig_id': subcorpus_orig_id,
        'size': main_size,
        'size_info': l10n.simplify_num(main_size)
    })
    with plugins.runtime.USER_ITEMS as uit:
        uit.add_user_item(ctrl._plugin_api, item)
        return item.to_dict()
Exemple #10
0
def get_detail_context(corp,
                       pos,
                       hitlen=1,
                       detail_left_ctx=40,
                       detail_right_ctx=40,
                       attrs=None,
                       structs='',
                       detail_ctx_incr=60):
    """
    Fetch a text surrounding a concordance hit.

    arguments:
    corp -- a corpus object (providing get_conf())
    pos -- position of the first token of the hit
    hitlen -- number of tokens of the hit
    detail_left_ctx -- number of tokens to the left of the hit
    detail_right_ctx -- number of tokens to the right of the hit
    attrs -- a list of positional attributes to be fetched ('word' if None)
    structs -- a comma-separated list of structures to be fetched
    detail_ctx_incr -- how many tokens are added when the context is expanded

    returns:
    a dict with keys 'wrapdetail', 'deletewrap' (optional), 'content',
    'expand_left_args', 'expand_right_args', 'righttoleft', 'pos', 'maxdetail'
    """
    data = {}
    corpus_encoding = corp.get_conf('ENCODING')
    wrapdetail = corp.get_conf('WRAPDETAIL')
    if wrapdetail:
        data['wrapdetail'] = '<%s>' % wrapdetail
        if wrapdetail not in structs.split(','):
            data['deletewrap'] = True
        structs = wrapdetail + ',' + structs
    else:
        data['wrapdetail'] = ''
    try:
        maxdetail = int(corp.get_conf('MAXDETAIL'))
        if maxdetail == 0:
            maxdetail = int(corp.get_conf('MAXCONTEXT'))
            if maxdetail == 0:
                maxdetail = sys.maxint
    except Exception:
        # Missing/invalid configuration means "no limit". Unlike the former
        # bare 'except', KeyboardInterrupt and SystemExit are not swallowed.
        maxdetail = 0
    if maxdetail:
        detail_left_ctx = min(detail_left_ctx, maxdetail)
        detail_right_ctx = min(detail_right_ctx, maxdetail)
    # the left context must not reach before the beginning of the corpus
    if detail_left_ctx > pos:
        detail_left_ctx = pos
    query_attrs = 'word' if attrs is None else ','.join(attrs)
    cr = manatee.CorpRegion(corp, query_attrs, structs)
    region_left = tokens2strclass(cr.region(pos - detail_left_ctx, pos))
    region_kwic = tokens2strclass(cr.region(pos, pos + hitlen))
    region_right = tokens2strclass(
        cr.region(pos + hitlen, pos + hitlen + detail_right_ctx))
    for seg in region_left + region_kwic + region_right:
        seg['str'] = import_string(seg['str'].replace('===NONE===', ''),
                                   from_encoding=corpus_encoding)
    for seg in region_kwic:
        if not seg['class']:
            seg['class'] = 'coll'
    data['content'] = region_left + region_kwic + region_right
    refbase = [('pos', pos)]
    if hitlen != 1:
        refbase.append(('hitlen', hitlen))
    data['expand_left_args'] = dict(
        refbase + [('detail_left_ctx', detail_left_ctx +
                    detail_ctx_incr), ('detail_right_ctx', detail_right_ctx)])
    data['expand_right_args'] = dict(refbase +
                                     [('detail_left_ctx', detail_left_ctx),
                                      ('detail_right_ctx',
                                       detail_right_ctx + detail_ctx_incr)])
    data['righttoleft'] = corp.get_conf('RIGHTTOLEFT')
    data['pos'] = pos
    data['maxdetail'] = maxdetail
    return data
Exemple #11
0
    def _create_subcorpus(self, request):
        """
        Create a new subcorpus of the current corpus. The subcorpus is
        specified either by a raw CQL ('cql'), or by a serialized custom
        within condition ('within_json'), or by text types selected in
        the submitted form (in this order of precedence).

        If the 'subc_restore' plug-in is available, the query is stored too.

        req. arguments:
        subcname -- name of new subcorpus
        create -- bool, sets whether to create new subcorpus
        cql -- custom within condition

        returns:
        an empty dict

        raises:
        UserActionException -- if no subcorpus name or no query is specified
        ConcError -- if the created subcorpus is empty
        """
        subcname = request.form['subcname']
        within_json = request.form.get('within_json')
        raw_cql = request.form.get('cql')
        corp_encoding = self._corp().get_conf('ENCODING')

        # Both values are defined in advance so the tests below work for any
        # branch (formerly, 'within_cql' was unbound in the text types branch
        # which produced UnboundLocalError in case of an empty text type query).
        tt_query = ()
        within_cql = None
        if raw_cql:
            within_cql = raw_cql
            full_cql = 'aword,[] %s' % raw_cql
        elif within_json:  # user entered a subcorpus query manually
            within_cql = self._deserialize_custom_within(json.loads(within_json))
            full_cql = 'aword,[] %s' % within_cql
        else:
            tt_query = TextTypeCollector(self._corp(), request).get_query()
            full_cql = ' within '.join(['<%s %s />' % item for item in tt_query])
            full_cql = 'aword,[] within %s' % full_cql
            full_cql = import_string(full_cql, from_encoding=corp_encoding)
        imp_cql = (full_cql,)

        basecorpname = self.args.corpname.split(':')[0]
        if not subcname:
            raise UserActionException(_('No subcorpus name specified!'))
        path = self.prepare_subc_path(basecorpname, subcname)

        if isinstance(path, unicode):
            path = path.encode('utf-8')

        if len(tt_query) == 1:
            result = corplib.create_subcorpus(path, self._corp(), tt_query[0][0], tt_query[0][1])
        elif len(tt_query) > 1 or within_cql:
            # NOTE(review): the 'user' key is used here while store_query() below
            # is given the 'id' key - confirm this is intentional
            conc = conclib.get_conc(self._corp(), self._session_get('user', 'user'), q=imp_cql)
            conc.sync()
            # this branch never handles a single-structure query, so there is
            # no structure to pass along (the value has always been None here)
            result = corplib.subcorpus_from_conc(path, conc, None)
        else:
            raise UserActionException(_('Nothing specified!'))

        if not result:
            raise ConcError(_('Empty subcorpus!'))

        if plugins.has_plugin('subc_restore'):
            try:
                plugins.get('subc_restore').store_query(user_id=self._session_get('user', 'id'),
                                                        corpname=self.args.corpname,
                                                        subcname=subcname,
                                                        cql=full_cql.split('[]')[-1])
            except Exception as e:
                # the backup is a best-effort operation; the subcorpus itself already exists
                logging.getLogger(__name__).warning('Failed to store subcorpus query: %s' % e)
                self.add_system_message('warning',
                                        _('Subcorpus created but there was a problem saving a backup copy.'))
        return {}
Exemple #12
0
 def fetch_attr(corp, attr, token_id, num_tokens):
     """
     Return values of the positional attribute 'attr' for 'num_tokens'
     consecutive tokens starting at the position 'token_id'. The values are
     decoded from the corpus encoding and joined by a single space.
     """
     mattr = corp.get_attr(attr)
     return ' '.join(
         import_string(mattr.pos2str(int(token_id) + offset),
                       corp.get_conf('ENCODING'))
         for offset in range(num_tokens))
Exemple #13
0
 def get_list(self, plugin_api, user_allowed_corpora):
     """
     Build a list of corpora the current user has access to.

     Items provided by self._raw_list() (called with plugin_api.user_lang)
     are filtered by keys of user_allowed_corpora. Details of each matching
     corpus are fetched via self.manatee_corpora.get_info(). If that fails
     for any reason, the problem is logged and a reduced record (corpus ID
     used as a name, empty description, size None) is added instead.

     arguments:
     plugin_api -- an object providing the 'user_lang' attribute
     user_allowed_corpora -- a dict (corpus_canonical_id, corpus_id) containing corpora ids
                             accessible by the current user

     returns:
     a list of dicts with keys 'id', 'canonical_id', 'name', 'desc',
     'size' and 'path'
     """
     import logging

     cl = []
     for item in self._raw_list(plugin_api.user_lang).values():
         canonical_id, path = item['id'], item['path']
         if canonical_id not in user_allowed_corpora:
             continue
         corp_id = user_allowed_corpora[canonical_id]
         try:
             corp_info = self.manatee_corpora.get_info(corp_id)
             cl.append({
                 'id': corp_id,
                 'canonical_id': canonical_id,
                 'name': l10n.import_string(corp_info.name,
                                            from_encoding=corp_info.encoding),
                 'desc': l10n.import_string(corp_info.description,
                                            from_encoding=corp_info.encoding),
                 'size': corp_info.size,
                 'path': path
             })
         except Exception as e:
             # corp_info must not be used here: if get_info() itself failed,
             # the variable is unbound (or still refers to the previous corpus).
             logging.getLogger(__name__).warning(
                 u'Failed to fetch info about %s with error %s (%r)',
                 corp_id, type(e).__name__, e)
             cl.append({
                 'id': corp_id,
                 'canonical_id': canonical_id,
                 'name': corp_id,
                 'path': path,
                 'desc': '',
                 'size': None
             })
     return cl
Exemple #14
0
def get_full_ref(corp, pos):
    """
    Return references of the token at position 'pos' as configured by
    the FULLREF corpus setting (the item matching the
    corpora/speech_segment_struct_attr setting is left out).

    The result is a dict containing the key "Refs" (a list of dicts with
    "name" and "val") and a decoded value of each reference stored under
    the reference name with dots replaced by underscores.
    """
    corpus_encoding = corp.get_conf("ENCODING")
    data = {}

    refs = []
    for n in corp.get_conf("FULLREF").split(","):
        if n != settings.get("corpora", "speech_segment_struct_attr"):
            # "#" is a pseudo-reference representing the token number
            value = str(pos) if n == "#" else corp.get_attr(n).pos2str(pos)
            refs.append((n, value))

    data["Refs"] = []
    for n, v in refs:
        name = (n == "#" and _("Token number")) or corp.get_conf(n + ".LABEL") or n
        data["Refs"].append({"name": name, "val": import_string(v, corpus_encoding)})

    for n, v in refs:
        key = n.replace(".", "_")
        data[key] = import_string(v, corpus_encoding)
    return data
Exemple #15
0
 def corpconf_pairs(self, corp, label):
     """
     Read the corpus configuration item 'label' and split it to a list of
     chunks of (at most) two values. The first character of the configured
     value is used as a separator of the rest. An empty list is returned
     if the value is not longer than two characters.

     arguments:
     corp -- a corpus object or a corpus name (a unicode string)
     label -- a name of the configuration item
     """
     if type(corp) is UnicodeType:
         corp = self.get_Corpus(corp)
     raw = import_string(corp.get_conf(label), from_encoding=corp.get_conf("ENCODING"))
     if len(raw) <= 2:
         return []
     separator, body = raw[0], raw[1:]
     items = body.split(separator)
     pairs = []
     for start in range(0, len(items), 2):
         pairs.append(items[start : start + 2])
     return pairs
Exemple #16
0
 def get_list(self, user_allowed_corpora):
     """
     Build a list of all the corpora provided by self._raw_list().

     For each corpus, user_allowed_corpora is used to translate the canonical
     ID to the ID the current user should work with (the canonical ID itself
     is used if there is no such mapping). Corpus details are fetched via
     self._manatee_corpora.get_info(); if that fails, the problem is logged
     and a reduced record (corpus ID used as a name, empty description,
     size None, no 'requestable' key) is added instead.

     arguments:
     user_allowed_corpora -- a dict (corpus_canonical_id, corpus_id) containing corpora ids
                             accessible by the current user

     returns:
     a list of dicts describing individual corpora
     """
     import logging

     cl = []
     for item in self._raw_list().values():
         canonical_id, path = item['id'], item['path']
         corp_id = user_allowed_corpora.get(canonical_id, canonical_id)
         try:
             corp_info = self._manatee_corpora.get_info(corp_id)
             cl.append({
                 'id': corp_id,
                 'canonical_id': canonical_id,
                 'name': l10n.import_string(corp_info.name,
                                            from_encoding=corp_info.encoding),
                 'desc': l10n.import_string(corp_info.description,
                                            from_encoding=corp_info.encoding),
                 'size': corp_info.size,
                 'path': path,
                 'requestable': item.get('requestable', False)
             })
         except Exception as e:
             logging.getLogger(__name__).warning(
                 u'Failed to fetch info about %s with error %s (%r)',
                 corp_id, type(e).__name__, e)
             cl.append({
                 'id': corp_id,
                 'canonical_id': canonical_id,
                 'name': corp_id,
                 'path': path,
                 'desc': '',
                 'size': None
             })
     # the original version built the list but never handed it over to the caller
     return cl
Exemple #17
0
 def corpconf_pairs(self, corp, label):
     """
     Return the value of the corpus configuration item 'label' split to
     chunks of (at most) two items. The first character of the value
     serves as a separator of the remaining part. Values not longer than
     two characters produce an empty list.

     arguments:
     corp -- a corpus object or a corpus name (a unicode string)
     label -- a name of the configuration item
     """
     if type(corp) is UnicodeType:
         corp = self.get_Corpus(corp)
     conf_value = import_string(corp.get_conf(label),
                                from_encoding=corp.get_conf('ENCODING'))
     # an empty string produces no chunks at all
     parts = conf_value[1:].split(conf_value[0]) if len(conf_value) > 2 else ''
     offsets = range(0, len(parts), 2)
     return [parts[offset:offset + 2] for offset in offsets]
Exemple #18
0
def get_detail_context(
    corp, pos, hitlen=1, detail_left_ctx=40, detail_right_ctx=40, addattrs=None, structs="", detail_ctx_incr=60
):
    """
    Fetch a text surrounding a concordance hit.

    arguments:
    corp -- a corpus object (providing get_conf())
    pos -- position of the first token of the hit
    hitlen -- number of tokens of the hit
    detail_left_ctx -- number of tokens to the left of the hit
    detail_right_ctx -- number of tokens to the right of the hit
    addattrs -- a list of positional attributes fetched in addition to 'word'
    structs -- a comma-separated list of structures to be fetched
    detail_ctx_incr -- how many tokens are added when the context is expanded

    returns:
    a dict with keys 'wrapdetail', 'deletewrap' (optional), 'content',
    'leftlink', 'rightlink', 'righttoleft', 'pos', 'maxdetail'
    """
    data = {}
    if addattrs is None:
        addattrs = []
    corpus_encoding = corp.get_conf("ENCODING")
    wrapdetail = corp.get_conf("WRAPDETAIL")
    if wrapdetail:
        data["wrapdetail"] = "<%s>" % wrapdetail
        if wrapdetail not in structs.split(","):
            data["deletewrap"] = True
        structs = wrapdetail + "," + structs
    else:
        data["wrapdetail"] = ""
    try:
        maxdetail = int(corp.get_conf("MAXDETAIL"))
        if maxdetail == 0:
            maxdetail = int(corp.get_conf("MAXCONTEXT"))
            if maxdetail == 0:
                maxdetail = sys.maxint
    except Exception:
        # Missing/invalid configuration means "no limit". Unlike the former
        # bare 'except', KeyboardInterrupt and SystemExit are not swallowed.
        maxdetail = 0
    if maxdetail:
        detail_left_ctx = min(detail_left_ctx, maxdetail)
        detail_right_ctx = min(detail_right_ctx, maxdetail)
    # the left context must not reach before the beginning of the corpus
    if detail_left_ctx > pos:
        detail_left_ctx = pos
    attrs = ",".join(["word"] + addattrs)
    cr = manatee.CorpRegion(corp, attrs, structs)
    region_left = tokens2strclass(cr.region(pos - detail_left_ctx, pos))
    region_kwic = tokens2strclass(cr.region(pos, pos + hitlen))
    region_right = tokens2strclass(cr.region(pos + hitlen, pos + hitlen + detail_right_ctx))
    for seg in region_left + region_kwic + region_right:
        seg["str"] = import_string(seg["str"].replace("===NONE===", ""), from_encoding=corpus_encoding)
    for seg in region_kwic:
        if not seg["class"]:
            seg["class"] = "coll"
    data["content"] = region_left + region_kwic + region_right
    # query strings of links expanding the context to the left/right
    refbase = "pos=%i&" % pos
    if hitlen != 1:
        refbase += "hitlen=%i&" % hitlen
    data["leftlink"] = refbase + (
        "detail_left_ctx=%i&detail_right_ctx=%i" % (detail_left_ctx + detail_ctx_incr, detail_right_ctx)
    )
    data["rightlink"] = refbase + (
        "detail_left_ctx=%i&detail_right_ctx=%i" % (detail_left_ctx, detail_right_ctx + detail_ctx_incr)
    )
    data["righttoleft"] = corp.get_conf("RIGHTTOLEFT")
    data["pos"] = pos
    data["maxdetail"] = maxdetail
    return data
Exemple #19
0
def get_detail_context(corp, pos, hitlen=1, detail_left_ctx=40, detail_right_ctx=40,
                       addattrs=None, structs='', detail_ctx_incr=60):
    """
    Fetch a text surrounding a concordance hit.

    arguments:
    corp -- a corpus object (providing get_conf())
    pos -- position of the first token of the hit
    hitlen -- number of tokens of the hit
    detail_left_ctx -- number of tokens to the left of the hit
    detail_right_ctx -- number of tokens to the right of the hit
    addattrs -- a list of positional attributes fetched in addition to 'word'
    structs -- a comma-separated list of structures to be fetched
    detail_ctx_incr -- how many tokens are added when the context is expanded

    returns:
    a dict with keys 'wrapdetail', 'deletewrap' (optional), 'content',
    'leftlink', 'rightlink', 'righttoleft', 'pos', 'maxdetail'
    """
    data = {}
    if addattrs is None:
        addattrs = []
    corpus_encoding = corp.get_conf('ENCODING')
    wrapdetail = corp.get_conf('WRAPDETAIL')
    if wrapdetail:
        data['wrapdetail'] = '<%s>' % wrapdetail
        if wrapdetail not in structs.split(','):
            data['deletewrap'] = True
        structs = wrapdetail + ',' + structs
    else:
        data['wrapdetail'] = ''
    try:
        maxdetail = int(corp.get_conf('MAXDETAIL'))
        if maxdetail == 0:
            maxdetail = int(corp.get_conf('MAXCONTEXT'))
            if maxdetail == 0:
                maxdetail = sys.maxint
    except Exception:
        # Missing/invalid configuration means "no limit". Unlike the former
        # bare 'except', KeyboardInterrupt and SystemExit are not swallowed.
        maxdetail = 0
    if maxdetail:
        if detail_left_ctx > maxdetail:
            detail_left_ctx = maxdetail
        if detail_right_ctx > maxdetail:
            detail_right_ctx = maxdetail
    # the left context must not reach before the beginning of the corpus
    if detail_left_ctx > pos:
        detail_left_ctx = pos
    attrs = ','.join(['word'] + addattrs)
    cr = manatee.CorpRegion(corp, attrs, structs)
    region_left = tokens2strclass(cr.region(pos - detail_left_ctx, pos))
    region_kwic = tokens2strclass(cr.region(pos, pos + hitlen))
    region_right = tokens2strclass(cr.region(pos + hitlen,
                                             pos + hitlen + detail_right_ctx))
    for seg in region_left + region_kwic + region_right:
        seg['str'] = import_string(seg['str'].replace('===NONE===', ''), from_encoding=corpus_encoding)
    for seg in region_kwic:
        if not seg['class']:
            seg['class'] = 'coll'
    data['content'] = region_left + region_kwic + region_right
    # query strings of links expanding the context to the left/right
    refbase = 'pos=%i&' % pos
    if hitlen != 1:
        refbase += 'hitlen=%i&' % hitlen
    data['leftlink'] = refbase + ('detail_left_ctx=%i&detail_right_ctx=%i'
                                  % (detail_left_ctx + detail_ctx_incr,
                                     detail_right_ctx))
    data['rightlink'] = refbase + ('detail_left_ctx=%i&detail_right_ctx=%i'
                                   % (detail_left_ctx,
                                      detail_right_ctx + detail_ctx_incr))
    data['righttoleft'] = corp.get_conf('RIGHTTOLEFT')
    data['pos'] = pos
    data['maxdetail'] = maxdetail
    return data
Exemple #20
0
    def get_list(self, user_allowed_corpora):
        """
        Build a list of corpora the current user has access to.

        Items provided by self._raw_list() are filtered by keys of
        user_allowed_corpora. Details of each matching corpus are fetched
        via self._manatee_corpora.get_info(). If that fails for any reason,
        the problem is logged and a reduced record (corpus ID used as
        a name, empty description, size None) is added instead.

        arguments:
        user_allowed_corpora -- a dict (corpus_canonical_id, corpus_id) containing corpora ids
                                accessible by the current user

        returns:
        a list of dicts with keys "id", "canonical_id", "name", "desc",
        "size" and "path"
        """
        import logging

        cl = []
        for item in self._raw_list().values():
            canonical_id, path = item["id"], item["path"]
            if canonical_id not in user_allowed_corpora:
                continue
            corp_id = user_allowed_corpora[canonical_id]
            try:
                corp_info = self._manatee_corpora.get_info(corp_id)
                cl.append(
                    {
                        "id": corp_id,
                        "canonical_id": canonical_id,
                        "name": l10n.import_string(corp_info.name, from_encoding=corp_info.encoding),
                        "desc": l10n.import_string(corp_info.description, from_encoding=corp_info.encoding),
                        "size": corp_info.size,
                        "path": path,
                    }
                )
            except Exception as e:
                # corp_info must not be used here: if get_info() itself failed,
                # the variable is unbound (or still refers to the previous corpus).
                logging.getLogger(__name__).warning(
                    u"Failed to fetch info about %s with error %s (%r)", corp_id, type(e).__name__, e
                )
                cl.append(
                    {
                        "id": corp_id,
                        "canonical_id": canonical_id,
                        "name": corp_id,
                        "path": path,
                        "desc": "",
                        "size": None,
                    }
                )
        # the original version built the list but never handed it over to the caller
        return cl
Exemple #21
0
 def _load_raw_sent(self, corpus, canonical_corpus_id, token_id, tree_attrs):
     """
     Retrieve (via Manatee) raw data of the sentence containing the token
     'token_id'. The sentence structure is obtained from self._conf using
     the canonical corpus ID.

     Returns a list of decoded strings (left context + KWIC + right context)
     or None if Manatee provides no KWIC line.
     """
     encoding = corpus.get_conf('ENCODING')
     sentence_struct = self._conf.get_sentence_struct(canonical_corpus_id)
     conc = manatee.Concordance(corpus, '[#%d]' % token_id, 1, -1)
     conc.sync()
     left_ctx = '-1:%s' % sentence_struct
     right_ctx = '1:%s' % sentence_struct
     kl = manatee.KWICLines(corpus, conc.RS(True, 0, 1), left_ctx, right_ctx,
                            ','.join(tree_attrs), ','.join(tree_attrs), '', '')
     if not kl.nextline():
         return None
     raw_items = kl.get_left() + kl.get_kwic() + kl.get_right()
     return [import_string(raw, from_encoding=encoding) for raw in raw_items]
Exemple #22
0
def calculate_colls_bg(coll_args):
    """
    Background collocations calculation.
    This function is expected to be run either
    from Celery or from other process (via multiprocessing).
    """
    cm = corplib.CorpusManager(subcpath=coll_args.subcpath)
    corp = cm.get_Corpus(coll_args.corpname, coll_args.subcname)
    try:
        corplib.frq_db(
            corp, coll_args.cattr
        )  # try to fetch precalculated data; if none then MissingSubCorpFreqFile
        conc = conclib.get_conc(corp=corp,
                                user_id=coll_args.user_id,
                                minsize=coll_args.minsize,
                                q=coll_args.q,
                                fromp=0,
                                pagesize=0,
                                async=0,
                                save=coll_args.save,
                                samplesize=coll_args.samplesize)
        if not conc.finished():
            raise UnfinishedConcordanceError(
                _('Cannot calculate yet - source concordance not finished. Please try again later.'
                  ))
        collocs = conc.collocs(cattr=coll_args.cattr,
                               csortfn=coll_args.csortfn,
                               cbgrfns=coll_args.cbgrfns,
                               cfromw=coll_args.cfromw,
                               ctow=coll_args.ctow,
                               cminfreq=coll_args.cminfreq,
                               cminbgr=coll_args.cminbgr,
                               max_lines=coll_args.num_fetch_items)
        for item in collocs['Items']:
            item['pfilter'] = [('q2', item['pfilter'])]
            item['nfilter'] = [('q2', item['nfilter'])]
            item['str'] = import_string(
                item['str'], from_encoding=coll_args.corpus_encoding)
        return dict(data=collocs, processing=0, tasks=[])
    except corplib.MissingSubCorpFreqFile as e:
        ans = {'attrname': coll_args.cattr, 'tasks': []}
        out = freq_calc.build_arf_db(e.args[0], coll_args.cattr)
        if type(out) is list:
            processing = 1
            ans['tasks'].extend(out)
        else:
            processing = 0
        ans['processing'] = processing
        ans['data'] = dict(Items=[], Head=[])
        return ans
Exemple #23
0
def get_detail_context(corp, pos, hitlen=1, detail_left_ctx=40, detail_right_ctx=40,
                       attrs=None, structs='', detail_ctx_incr=60):
    """
    Collect the text surrounding a hit together with arguments
    needed to request a wider context.

    arguments:
    corp -- a corpus object providing get_conf(); it is also passed to manatee.CorpRegion
    pos -- position where the hit starts
    hitlen -- length of the hit (in positions)
    detail_left_ctx -- requested size of the left context
    detail_right_ctx -- requested size of the right context
    attrs -- a list of attribute names to be fetched (None means just 'word')
    structs -- comma-separated names of structures to be included
    detail_ctx_incr -- a value added to the respective context size
                       in 'expand_left_args' and 'expand_right_args'

    returns:
    a dict with keys 'wrapdetail', 'content', 'expand_left_args', 'expand_right_args',
    'righttoleft', 'pos', 'maxdetail' and optionally 'deletewrap'
    """
    data = {}
    corpus_encoding = corp.get_conf('ENCODING')
    wrapdetail = corp.get_conf('WRAPDETAIL')
    if wrapdetail:
        data['wrapdetail'] = '<%s>' % wrapdetail
        if wrapdetail not in structs.split(','):
            # the wrapping structure was not requested explicitly
            data['deletewrap'] = True
        structs = wrapdetail + ',' + structs
    else:
        data['wrapdetail'] = ''
    try:
        maxdetail = int(corp.get_conf('MAXDETAIL'))
        if maxdetail == 0:
            maxdetail = int(corp.get_conf('MAXCONTEXT'))
            if maxdetail == 0:
                maxdetail = sys.maxint
    except Exception:
        # best-effort: an unreadable/invalid limit means "no limit applied";
        # unlike a bare 'except' this lets KeyboardInterrupt/SystemExit through
        maxdetail = 0
    if maxdetail:
        if detail_left_ctx > maxdetail:
            detail_left_ctx = maxdetail
        if detail_right_ctx > maxdetail:
            detail_right_ctx = maxdetail
    if detail_left_ctx > pos:
        # do not reach before position 0
        detail_left_ctx = pos
    query_attrs = 'word' if attrs is None else ','.join(attrs)
    cr = manatee.CorpRegion(corp, query_attrs, structs)
    region_left = tokens2strclass(cr.region(pos - detail_left_ctx, pos))
    region_kwic = tokens2strclass(cr.region(pos, pos + hitlen))
    region_right = tokens2strclass(cr.region(pos + hitlen,
                                             pos + hitlen + detail_right_ctx))
    for seg in region_left + region_kwic + region_right:
        seg['str'] = import_string(seg['str'].replace('===NONE===', ''), from_encoding=corpus_encoding)
    for seg in region_kwic:
        # hit segments without any class are marked as 'coll'
        if not seg['class']:
            seg['class'] = 'coll'
    data['content'] = region_left + region_kwic + region_right
    refbase = [('pos', pos)]
    if hitlen != 1:
        refbase.append(('hitlen', hitlen))
    data['expand_left_args'] = dict(refbase + [('detail_left_ctx', detail_left_ctx + detail_ctx_incr),
                                    ('detail_right_ctx', detail_right_ctx)])
    data['expand_right_args'] = dict(refbase + [('detail_left_ctx', detail_left_ctx),
                                     ('detail_right_ctx', detail_right_ctx + detail_ctx_incr)])
    data['righttoleft'] = corp.get_conf('RIGHTTOLEFT')
    data['pos'] = pos
    data['maxdetail'] = maxdetail
    return data
Exemple #24
0
 def _load_raw_sent(self, corpus, canonical_corpus_id, token_id,
                    tree_attrs):
     """
     Load items of the sentence surrounding a single token.

     arguments:
     corpus -- a corpus object passed to manatee.Concordance and manatee.KWICLines
     canonical_corpus_id -- an identifier used to look up the sentence structure
                            in the plug-in configuration
     token_id -- a numeric token identifier used in the '[#N]' query
     tree_attrs -- names of attributes to be fetched (joined by a comma)

     returns:
     a list of strings (left context + kwic + right context), each one
     processed by import_string, or None if there is no line available
     """
     src_encoding = corpus.get_conf('ENCODING')
     struct_name = self._conf.get_sentence_struct(canonical_corpus_id)
     single_hit = manatee.Concordance(corpus, '[#%d]' % token_id, 1, -1)
     single_hit.sync()
     ranges = single_hit.RS(True, 0, 1)
     left_ctx = '-1:%s' % struct_name
     right_ctx = '1:%s' % struct_name
     kwic_attrs = ','.join(tree_attrs)
     ctx_attrs = ','.join(tree_attrs)
     lines = manatee.KWICLines(corpus, ranges, left_ctx, right_ctx,
                               kwic_attrs, ctx_attrs, '', '')
     if not lines.nextline():
         return None
     chunks = lines.get_left() + lines.get_kwic() + lines.get_right()
     return [import_string(chunk, from_encoding=src_encoding) for chunk in chunks]
Exemple #25
0
 def corpconf_pairs(self, corp, label):
     """
     Parses some specific corpus registry file configuration values
     where a list of pairs is actually flattened (k1, v1, k2, v2,..., kN, vN).
     This applies e.g. for WPOSLIST and LPOSLIST.
     The first character of the value is used as the item separator.

     arguments:
     corp -- a corpus object or a corpus name (a name is resolved
             via self.get_Corpus)
     label -- a name of the configuration item to be read

     Returns:
          a list of pairs (an empty list if the value is not longer
          than two characters)
     """
     # 'type(corp) is basestring' could never be true (basestring is an abstract
     # base of str and unicode) so a corpus passed by name was never resolved
     if isinstance(corp, basestring):
         corp = self.get_Corpus(corp)
     val = import_string(corp.get_conf(label), from_encoding=corp.get_conf('ENCODING'))
     if len(val) > 2:
         val = val[1:].split(val[0])
     else:
         val = ''
     return [(val[i], val[i + 1]) for i in range(0, len(val), 2)]
Exemple #26
0
 def corpconf_pairs(self, corp, label):
     """
     Parses some specific corpus registry file configuration values
     where a list of pairs is actually flattened (k1, v1, k2, v2,..., kN, vN).
     This applies e.g. for WPOSLIST and LPOSLIST.
     The first character of the value is used as the item separator.

     arguments:
     corp -- a corpus object or a corpus name (a name is resolved
             via self.get_Corpus)
     label -- a name of the configuration item to be read

     Returns:
          a list of pairs (an empty list if the value is not longer
          than two characters)
     """
     # 'type(corp) is basestring' could never be true (basestring is an abstract
     # base of str and unicode) so a corpus passed by name was never resolved
     if isinstance(corp, basestring):
         corp = self.get_Corpus(corp)
     val = import_string(corp.get_conf(label), from_encoding=corp.get_conf('ENCODING'))
     if len(val) > 2:
         val = val[1:].split(val[0])
     else:
         val = ''
     return [(val[i], val[i + 1]) for i in range(0, len(val), 2)]
Exemple #27
0
 def import_str(s):
     """
     Pass `s` to import_string along with the ENCODING
     configuration value of self.corp.
     """
     source_encoding = self.corp.get_conf('ENCODING')
     return import_string(s, source_encoding)
Exemple #28
0
    def _create_subcorpus(self, request):
        """
        Create a new subcorpus based on submitted form data.

        req. arguments (read from request.form):
        subcname -- name of new subcorpus
        cql -- custom within condition (a raw query; takes precedence)
        within_json -- JSON-encoded custom within condition
        aligned_corpora -- list of aligned corpora (used only if the live
                           attributes plug-in is available)
        attrs -- JSON-encoded attribute selection (used along with aligned_corpora)
        publish -- integer flag ('0'/'1'); if set then description is required
        description -- description of a published subcorpus

        If none of cql, within_json, aligned_corpora applies, the query is
        collected from the request by TextTypeCollector.

        returns:
        a dict with 'processed_subc' key containing serialized unfinished
        subcorpus-related async tasks

        raises:
        UserActionException -- missing name/description or nothing specified
        FunctionNotSupported -- aligned corpora used with a corpus without
                                label_attr/id_attr metadata
        ValueError -- unknown calculation backend configured
        SubcorpusError -- the created subcorpus is empty
        """
        subcname = request.form['subcname']
        within_json = request.form.get('within_json')
        raw_cql = request.form.get('cql')
        aligned_corpora = request.form.getlist('aligned_corpora')
        publish = bool(int(request.form.get('publish')))
        corpus_info = self.get_corpus_info(self.args.corpname)
        description = request.form.get('description')

        if not subcname:
            raise UserActionException(translate('No subcorpus name specified!'))

        if publish and not description:
            raise UserActionException(translate('No description specified'))

        if raw_cql:
            aligned_corpora = []
            tt_query = ()
            within_cql = raw_cql
            full_cql = 'aword,[] %s' % raw_cql
            imp_cql = (full_cql,)
        elif within_json:  # user entered a subcorpus query manually
            aligned_corpora = []
            tt_query = ()
            within_cql = self._deserialize_custom_within(json.loads(within_json))
            full_cql = 'aword,[] %s' % within_cql
            imp_cql = (full_cql,)
        elif len(aligned_corpora) > 0 and plugins.runtime.LIVE_ATTRIBUTES.exists:
            if corpus_info.metadata.label_attr and corpus_info.metadata.id_attr:
                within_cql = None
                attrs = json.loads(request.form.get('attrs', '{}'))
                sel_match = plugins.runtime.LIVE_ATTRIBUTES.instance.get_attr_values(
                    self._plugin_api, corpus=self.corp,
                    attr_map=attrs,
                    aligned_corpora=aligned_corpora,
                    limit_lists=False)
                values = sel_match['attr_values'][corpus_info.metadata.label_attr]
                args = argmapping.Args()
                # NOTE(review): presumably v[1] is an identifier matching id_attr - confirm
                setattr(args, 'sca_{0}'.format(
                    corpus_info.metadata.id_attr), [v[1] for v in values])
                tt_query = TextTypeCollector(self.corp, args).get_query()
                tmp = ['<%s %s />' % item for item in tt_query]
                full_cql = ' within '.join(tmp)
                full_cql = 'aword,[] within %s' % full_cql
                full_cql = import_string(full_cql, from_encoding=self.corp_encoding)
                imp_cql = (full_cql,)
            else:
                raise FunctionNotSupported(
                    'Corpus must have a bibliography item defined to support this function')
        else:
            within_cql = None
            tt_query = TextTypeCollector(self.corp, request).get_query()
            tmp = ['<%s %s />' % item for item in tt_query]
            full_cql = ' within '.join(tmp)
            full_cql = 'aword,[] within %s' % full_cql
            full_cql = import_string(full_cql, from_encoding=self.corp_encoding)
            imp_cql = (full_cql,)

        basecorpname = self.args.corpname.split(':')[0]
        path = self.prepare_subc_path(basecorpname, subcname, publish=False)
        publish_path = self.prepare_subc_path(
            basecorpname, subcname, publish=True) if publish else None

        if isinstance(path, unicode):
            path = path.encode('utf-8')

        if len(tt_query) == 1 and len(aligned_corpora) == 0:
            # a single structural attribute condition is processed directly
            result = corplib.create_subcorpus(path, self.corp, tt_query[0][0], tt_query[0][1])
            if result and publish_path:
                corplib.mk_publish_links(path, publish_path, self.session_get(
                    'user', 'fullname'), description)
        elif len(tt_query) > 1 or within_cql or len(aligned_corpora) > 0:
            # anything more complex is handed over to a background calculation
            backend = settings.get('calc_backend', 'type')
            if backend in ('celery', 'konserver'):
                import bgcalc
                app = bgcalc.calc_backend_client(settings)
                res = app.send_task('worker.create_subcorpus',
                                    (self.session_get('user', 'id'), self.args.corpname, path, publish_path,
                                     tt_query, imp_cql, self.session_get('user', 'fullname'), description),
                                    time_limit=TASK_TIME_LIMIT)
                self._store_async_task(AsyncTaskStatus(status=res.status, ident=res.id,
                                                       category=AsyncTaskStatus.CATEGORY_SUBCORPUS,
                                                       label=u'%s:%s' % (basecorpname, subcname),
                                                       args=dict(subcname=subcname, corpname=basecorpname)))
                result = {}
            elif backend == 'multiprocessing':
                from bgcalc import subc_calc
                import functools
                import multiprocessing
                worker = subc_calc.CreateSubcorpusTask(user_id=self.session_get('user', 'id'),
                                                       corpus_id=self.args.corpname)
                multiprocessing.Process(target=functools.partial(
                    worker.run, tt_query, imp_cql, path, publish_path, description)).start()
                result = {}
            else:
                # without this branch 'result' stays unbound and the method
                # fails later with a confusing UnboundLocalError
                raise ValueError('Unknown calc_backend type: %s' % backend)
        else:
            raise UserActionException(translate('Nothing specified!'))
        if result is not False:
            with plugins.runtime.SUBC_RESTORE as sr:
                try:
                    # only the part following the initial '[]' is stored
                    sr.store_query(user_id=self.session_get('user', 'id'),
                                   corpname=self.args.corpname,
                                   subcname=subcname,
                                   cql=full_cql.strip().split('[]', 1)[-1])
                except Exception as e:
                    # a failed backup must not break the already created subcorpus
                    logging.getLogger(__name__).warning('Failed to store subcorpus query: %s' % e)
                    self.add_system_message('warning',
                                            translate('Subcorpus created but there was a problem saving a backup copy.'))
            unfinished_corpora = filter(lambda at: not at.is_finished(),
                                        self.get_async_tasks(category=AsyncTaskStatus.CATEGORY_SUBCORPUS))
            return dict(processed_subc=[uc.to_dict() for uc in unfinished_corpora])
        else:
            raise SubcorpusError(translate('Empty subcorpus!'))
Exemple #29
0
def texttype_values(corp,
                    subcorpattrs,
                    maxlistsize,
                    shrink_list=False,
                    collator_locale=None):
    """
    arguments:
    corp -- manatee.Corpus
    subcorpattrs -- structures and attributes to be processed (see Manatee's SUBCORPATTRS);
                    lines are separated by ',', items within a line by '|'
    maxlistsize -- in case an attribute has more than this number of values, no list
                   of values is returned ('textboxlength' is set instead)
    shrink_list -- list/tuple of attributes we do not want to list values for (False can be used
                   to specify an empty value)
    collator_locale -- a locale passed to l10n.sort to sort attribute values; if not
                       specified then values are sorted by their lower-cased form

    returns:
    a list containing following dictionaries
    { 'Line' : [
        { 'attr_doc_label' : '', 'Values' : [ {'v': 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        { 'attr_doc_label' : '', 'Values' : [ {'v': 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        ...
    ]}
    (an item contains 'textboxlength' instead of 'Values' if its values are not listed)
    """
    if subcorpattrs == '#':
        return []
    attrlines = []

    if shrink_list is False:
        shrink_list = ()

    for subcorpline in subcorpattrs.split(','):
        attrvals = []
        for n in subcorpline.split('|'):
            if n in ('', '#'):
                continue
            attr = corp.get_attr(n)
            attrval = {
                'name': n,
                'label': corp.get_conf(n + '.LABEL') or n,
                'attr_doc': corp.get_conf(n + '.ATTRDOC'),
                'attr_doc_label': corp.get_conf(n + '.ATTRDOCLABEL'),
                'numeric': conf_bool(corp.get_conf(n + '.NUMERIC'))
            }
            is_numeric = attrval['numeric']
            hsep = corp.get_conf(n + '.HIERARCHICAL')
            multisep = corp.get_conf(n + '.MULTISEP')
            is_multival = corp.get_conf(n + '.MULTIVAL') in ('y', 'yes')
            textboxlength = None if hsep else corp.get_conf(n + '.TEXTBOXLENGTH')
            if (not hsep and
                (textboxlength
                 or attr.id_range() > maxlistsize or n in shrink_list)):
                attrval['textboxlength'] = textboxlength or 24
            else:  # list of values
                if is_numeric:
                    vals = []
                    for i in range(attr.id_range()):
                        raw_value = attr.id2str(i)
                        try:
                            vals.append({'v': int(raw_value)})
                        except (ValueError, TypeError):
                            # keep values which cannot be parsed as they are
                            vals.append({'v': raw_value})
                elif hsep:  # hierarchical
                    vals = [{
                        'v': attr.id2str(i)
                    } for i in range(attr.id_range())
                            if multisep not in attr.id2str(i)]
                else:
                    # the encoding is the same for all the values
                    encoding = corp.get_conf('ENCODING')
                    if is_multival:
                        raw_vals = [
                            import_string(attr.id2str(i),
                                          from_encoding=encoding).split(multisep)
                            for i in range(attr.id_range())
                        ]
                        vals = [{
                            'v': x
                        } for x in sorted(
                            set(s for subl in raw_vals for s in subl))]
                    else:
                        vals = [{
                            'v':
                            import_string(attr.id2str(i),
                                          from_encoding=encoding)
                        } for i in range(attr.id_range())]

                if hsep:  # hierarchical
                    attrval['hierarchical'] = hsep
                    attrval['Values'] = _get_attr_hierarchy(vals, hsep)
                elif is_numeric:
                    attrval['Values'] = sorted(vals,
                                               key=lambda item: item['v'])
                elif collator_locale:
                    attrval['Values'] = l10n.sort(vals,
                                                  collator_locale,
                                                  key=lambda item: item['v'])
                else:
                    # case-insensitive order; 'key' gives the same result as
                    # the former 'cmp' function and is not Python 2 specific
                    attrval['Values'] = sorted(
                        vals, key=lambda item: item['v'].lower())
            attrvals.append(attrval)
        attrlines.append({'Line': attrvals})
    return attrlines
Exemple #30
0
def texttype_values(corp, subcorpattrs, maxlistsize, shrink_list=False):
    """
    arguments:
    corp -- manatee.Corpus
    subcorpattrs -- attributes to be processed; lines are separated by ',',
                    items within a line by '|'
    maxlistsize -- in case an attribute has more than this number of values, no list
                   of values is returned ('textboxlength' is set instead)
    shrink_list -- list/tuple of attributes we do not want to list values for
                   (False can be used to specify an empty value)

    returns:
    a list containing following dictionaries
    { 'Line' : [
        { 'attr_doc_label' : '', 'Values' : [ {'v': 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        { 'attr_doc_label' : '', 'Values' : [ {'v': 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        ...
    ]}
    (an item contains 'textboxlength' instead of 'Values' if its values are not listed)
    """
    if subcorpattrs == "#":
        return []
    attrlines = []

    # the default (False) cannot be used with the 'in' operator
    if not shrink_list:
        shrink_list = ()

    def sort_key(item):
        # numeric attributes may contain values parsed to int (no lower() there)
        value = item["v"]
        return value.lower() if hasattr(value, "lower") else value

    for subcorpline in subcorpattrs.split(","):
        attrvals = []
        for n in subcorpline.split("|"):
            if n in ("", "#"):
                continue
            attr = corp.get_attr(n)
            attrval = {
                "name": n,
                "label": corp.get_conf(n + ".LABEL") or n,
                "attr_doc": corp.get_conf(n + ".ATTRDOC"),
                "attr_doc_label": corp.get_conf(n + ".ATTRDOCLABEL"),
            }
            hsep = corp.get_conf(n + ".HIERARCHICAL")
            multisep = corp.get_conf(n + ".MULTISEP")
            is_multival = corp.get_conf(n + ".MULTIVAL") in ("y", "yes")

            textboxlength = None if hsep else corp.get_conf(n + ".TEXTBOXLENGTH")
            if not hsep and (textboxlength or attr.id_range() > maxlistsize or n in shrink_list):
                attrval["textboxlength"] = textboxlength or 24
            else:  # list of values
                if corp.get_conf(n + ".NUMERIC"):
                    vals = []
                    for i in range(attr.id_range()):
                        raw_value = attr.id2str(i)
                        try:
                            vals.append({"v": int(raw_value)})
                        except (ValueError, TypeError):
                            # keep values which cannot be parsed as they are
                            vals.append({"v": raw_value})
                elif hsep:  # hierarchical
                    vals = [{"v": attr.id2str(i)} for i in range(attr.id_range()) if multisep not in attr.id2str(i)]
                else:
                    # the encoding is the same for all the values
                    encoding = corp.get_conf("ENCODING")
                    if is_multival:
                        raw_vals = [
                            import_string(attr.id2str(i), from_encoding=encoding).split(multisep)
                            for i in range(attr.id_range())
                        ]
                        vals = [{"v": x} for x in sorted(set(s for subl in raw_vals for s in subl))]
                    else:
                        vals = [
                            {"v": import_string(attr.id2str(i), from_encoding=encoding)}
                            for i in range(attr.id_range())
                        ]

                if hsep:  # hierarchical
                    attrval["hierarchical"] = hsep
                    attrval["Values"] = get_attr_hierarchy(vals, hsep, multisep)
                else:
                    attrval["Values"] = sorted(vals, key=sort_key)
            attrvals.append(attrval)
        attrlines.append({"Line": attrvals})
    return attrlines
Exemple #31
0
def get_detail_context(corp,
                       pos,
                       hitlen=1,
                       detail_left_ctx=40,
                       detail_right_ctx=40,
                       addattrs=None,
                       structs='',
                       detail_ctx_incr=60):
    """
    Collect the text surrounding a hit together with query strings
    needed to request a wider context.

    arguments:
    corp -- a corpus object providing get_conf(); it is also passed to manatee.CorpRegion
    pos -- position where the hit starts
    hitlen -- length of the hit (in positions)
    detail_left_ctx -- requested size of the left context
    detail_right_ctx -- requested size of the right context
    addattrs -- a list of attribute names fetched in addition to 'word'
    structs -- comma-separated names of structures to be included
    detail_ctx_incr -- a value added to the respective context size
                       in 'leftlink' and 'rightlink'

    returns:
    a dict with keys 'wrapdetail', 'content', 'leftlink', 'rightlink',
    'righttoleft', 'pos', 'maxdetail' and optionally 'deletewrap'
    """
    data = {}
    if addattrs is None:
        addattrs = []
    corpus_encoding = corp.get_conf('ENCODING')
    wrapdetail = corp.get_conf('WRAPDETAIL')
    if wrapdetail:
        data['wrapdetail'] = '<%s>' % wrapdetail
        if wrapdetail not in structs.split(','):
            # the wrapping structure was not requested explicitly
            data['deletewrap'] = True
        structs = wrapdetail + ',' + structs
    else:
        data['wrapdetail'] = ''
    try:
        maxdetail = int(corp.get_conf('MAXDETAIL'))
        if maxdetail == 0:
            maxdetail = int(corp.get_conf('MAXCONTEXT'))
            if maxdetail == 0:
                maxdetail = sys.maxint
    except Exception:
        # best-effort: an unreadable/invalid limit means "no limit applied";
        # unlike a bare 'except' this lets KeyboardInterrupt/SystemExit through
        maxdetail = 0
    if maxdetail:
        if detail_left_ctx > maxdetail:
            detail_left_ctx = maxdetail
        if detail_right_ctx > maxdetail:
            detail_right_ctx = maxdetail
    if detail_left_ctx > pos:
        # do not reach before position 0
        detail_left_ctx = pos
    attrs = ','.join(['word'] + addattrs)
    cr = manatee.CorpRegion(corp, attrs, structs)
    region_left = tokens2strclass(cr.region(pos - detail_left_ctx, pos))
    region_kwic = tokens2strclass(cr.region(pos, pos + hitlen))
    region_right = tokens2strclass(
        cr.region(pos + hitlen, pos + hitlen + detail_right_ctx))
    for seg in region_left + region_kwic + region_right:
        seg['str'] = import_string(seg['str'].replace('===NONE===', ''),
                                   from_encoding=corpus_encoding)
    for seg in region_kwic:
        # hit segments without any class are marked as 'coll'
        if not seg['class']:
            seg['class'] = 'coll'
    data['content'] = region_left + region_kwic + region_right
    refbase = 'pos=%i&' % pos
    if hitlen != 1:
        refbase += 'hitlen=%i&' % hitlen
    data['leftlink'] = refbase + (
        'detail_left_ctx=%i&detail_right_ctx=%i' %
        (detail_left_ctx + detail_ctx_incr, detail_right_ctx))
    data['rightlink'] = refbase + (
        'detail_left_ctx=%i&detail_right_ctx=%i' %
        (detail_left_ctx, detail_right_ctx + detail_ctx_incr))
    data['righttoleft'] = corp.get_conf('RIGHTTOLEFT')
    data['pos'] = pos
    data['maxdetail'] = maxdetail
    return data
Exemple #32
0
def texttype_values(corp, subcorpattrs, maxlistsize, shrink_list=False, collator_locale=None):
    """
    arguments:
    corp -- manatee.Corpus
    subcorpattrs -- structures and attributes to be processed (see Manatee's SUBCORPATTRS);
                    lines are separated by ',', items within a line by '|'
    maxlistsize -- in case an attribute has more than this number of values, no list
                   of values is returned ('textboxlength' is set instead)
    shrink_list -- list/tuple of attributes we do not want to list values for (False can be used
                   to specify an empty value)
    collator_locale -- a locale passed to l10n.sort to sort attribute values; if not
                       specified then values are sorted by their lower-cased form

    returns:
    a list containing following dictionaries
    { 'Line' : [
        { 'attr_doc_label' : '', 'Values' : [ {'v': 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        { 'attr_doc_label' : '', 'Values' : [ {'v': 'item name'}, ... ], 'name' : '', 'attr_doc' : '', 'label' : '' },
        ...
    ]}
    (an item contains 'textboxlength' instead of 'Values' if its values are not listed)
    """
    if subcorpattrs == '#':
        return []
    attrlines = []

    if shrink_list is False:
        shrink_list = ()

    for subcorpline in subcorpattrs.split(','):
        attrvals = []
        for n in subcorpline.split('|'):
            if n in ('', '#'):
                continue
            attr = corp.get_attr(n)
            attrval = {
                'name': n,
                'label': corp.get_conf(n + '.LABEL') or n,
                'attr_doc': corp.get_conf(n + '.ATTRDOC'),
                'attr_doc_label': corp.get_conf(n + '.ATTRDOCLABEL'),
                'numeric': conf_bool(corp.get_conf(n + '.NUMERIC'))
            }
            is_numeric = attrval['numeric']
            hsep = corp.get_conf(n + '.HIERARCHICAL')
            multisep = corp.get_conf(n + '.MULTISEP')
            is_multival = corp.get_conf(n + '.MULTIVAL') in ('y', 'yes')
            textboxlength = None if hsep else corp.get_conf(n + '.TEXTBOXLENGTH')
            if (not hsep and (textboxlength
                              or attr.id_range() > maxlistsize or n in shrink_list)):
                attrval['textboxlength'] = textboxlength or 24
            else:  # list of values
                if is_numeric:
                    vals = []
                    for i in range(attr.id_range()):
                        raw_value = attr.id2str(i)
                        try:
                            vals.append({'v': int(raw_value)})
                        except (ValueError, TypeError):
                            # keep values which cannot be parsed as they are
                            vals.append({'v': raw_value})
                elif hsep:  # hierarchical
                    vals = [{'v': attr.id2str(i)}
                            for i in range(attr.id_range())
                            if multisep not in attr.id2str(i)]
                else:
                    # the encoding is the same for all the values
                    encoding = corp.get_conf('ENCODING')
                    if is_multival:
                        raw_vals = [import_string(attr.id2str(i), from_encoding=encoding)
                                    .split(multisep) for i in range(attr.id_range())]
                        vals = [{'v': x}
                                for x in sorted(set(s for subl in raw_vals for s in subl))]
                    else:
                        vals = [{'v': import_string(attr.id2str(i), from_encoding=encoding)}
                                for i in range(attr.id_range())]

                if hsep:  # hierarchical
                    attrval['hierarchical'] = hsep
                    attrval['Values'] = _get_attr_hierarchy(vals, hsep)
                elif is_numeric:
                    attrval['Values'] = sorted(vals, key=lambda item: item['v'])
                elif collator_locale:
                    attrval['Values'] = l10n.sort(vals, collator_locale, key=lambda item: item['v'])
                else:
                    # case-insensitive order; 'key' gives the same result as
                    # the former 'cmp' function and is not Python 2 specific
                    attrval['Values'] = sorted(vals, key=lambda item: item['v'].lower())
            attrvals.append(attrval)
        attrlines.append({'Line': attrvals})
    return attrlines
Exemple #33
0
    def _create_subcorpus(self, request):
        """
        Create a new subcorpus based on submitted form data.

        req. arguments (read from request.form):
        subcname -- name of new subcorpus
        cql -- custom within condition (a raw query; takes precedence)
        within_json -- JSON-encoded custom within condition
        aligned_corpora -- list of aligned corpora (used only if the live
                           attributes plug-in is available)
        attrs -- JSON-encoded attribute selection (used along with aligned_corpora)
        publish -- integer flag ('0'/'1')
        description -- description of the subcorpus

        If none of cql, within_json, aligned_corpora applies, the query is
        collected from the request by TextTypeCollector.

        returns:
        a dict with 'unfinished_subc' key containing serialized unfinished
        subcorpus-related async tasks

        raises:
        UserActionException -- missing subcorpus name or nothing specified
        FunctionNotSupported -- aligned corpora used with a corpus without
                                label_attr/id_attr metadata
        ValueError -- unknown calculation backend configured
        SubcorpusError -- the created subcorpus is empty
        """
        subcname = request.form['subcname']
        within_json = request.form.get('within_json')
        raw_cql = request.form.get('cql')
        aligned_corpora = request.form.getlist('aligned_corpora')
        publish = bool(int(request.form.get('publish')))
        corpus_info = self.get_corpus_info(self.args.corpname)
        description = request.form.get('description')

        if raw_cql:
            aligned_corpora = []
            tt_query = ()
            within_cql = raw_cql
            full_cql = 'aword,[] %s' % raw_cql
            imp_cql = (full_cql, )
        elif within_json:  # user entered a subcorpus query manually
            aligned_corpora = []
            tt_query = ()
            within_cql = self._deserialize_custom_within(
                json.loads(within_json))
            full_cql = 'aword,[] %s' % within_cql
            imp_cql = (full_cql, )
        elif len(aligned_corpora
                 ) > 0 and plugins.runtime.LIVE_ATTRIBUTES.exists:
            if corpus_info.metadata.label_attr and corpus_info.metadata.id_attr:
                within_cql = None
                attrs = json.loads(request.form.get('attrs', '{}'))
                sel_match = plugins.runtime.LIVE_ATTRIBUTES.instance.get_attr_values(
                    self._plugin_api,
                    corpus=self.corp,
                    attr_map=attrs,
                    aligned_corpora=aligned_corpora,
                    limit_lists=False)
                values = sel_match['attr_values'][
                    corpus_info.metadata.label_attr]
                args = argmapping.Args()
                # NOTE(review): presumably v[1] is an identifier matching id_attr - confirm
                setattr(args, 'sca_{0}'.format(corpus_info.metadata.id_attr),
                        [v[1] for v in values])
                tt_query = TextTypeCollector(self.corp, args).get_query()
                tmp = ['<%s %s />' % item for item in tt_query]
                full_cql = ' within '.join(tmp)
                full_cql = 'aword,[] within %s' % full_cql
                full_cql = import_string(full_cql,
                                         from_encoding=self.corp_encoding)
                imp_cql = (full_cql, )
            else:
                raise FunctionNotSupported(
                    'Corpus must have a bibliography item defined to support this function'
                )
        else:
            within_cql = None
            tt_query = TextTypeCollector(self.corp, request).get_query()
            tmp = ['<%s %s />' % item for item in tt_query]
            full_cql = ' within '.join(tmp)
            full_cql = 'aword,[] within %s' % full_cql
            full_cql = import_string(full_cql,
                                     from_encoding=self.corp_encoding)
            imp_cql = (full_cql, )

        basecorpname = self.args.corpname.split(':')[0]
        if not subcname:
            raise UserActionException(_('No subcorpus name specified!'))
        path = self.prepare_subc_path(basecorpname, subcname, publish=False)
        publish_path = self.prepare_subc_path(
            basecorpname, subcname, publish=True) if publish else None

        if isinstance(path, unicode):
            path = path.encode('utf-8')

        if len(tt_query) == 1 and len(aligned_corpora) == 0:
            # a single structural attribute condition is processed directly
            result = corplib.create_subcorpus(path, self.corp, tt_query[0][0],
                                              tt_query[0][1])
            if result and publish_path:
                corplib.mk_publish_links(path, publish_path, description)
        elif len(tt_query) > 1 or within_cql or len(aligned_corpora) > 0:
            # anything more complex is handed over to a background calculation
            backend, conf = settings.get_full('global', 'calc_backend')
            if backend == 'celery':
                import task
                app = task.get_celery_app(conf['conf'])
                res = app.send_task(
                    'worker.create_subcorpus',
                    (self.session_get('user', 'id'), self.args.corpname, path,
                     publish_path, tt_query, imp_cql, description),
                    time_limit=TASK_TIME_LIMIT)
                self._store_async_task(
                    AsyncTaskStatus(
                        status=res.status,
                        ident=res.id,
                        category=AsyncTaskStatus.CATEGORY_SUBCORPUS,
                        label=u'%s:%s' % (basecorpname, subcname),
                        args=dict(subcname=subcname, corpname=basecorpname)))
                result = {}
            elif backend == 'multiprocessing':
                from bgcalc import subc_calc
                import functools
                import multiprocessing
                worker = subc_calc.CreateSubcorpusTask(
                    user_id=self.session_get('user', 'id'),
                    corpus_id=self.args.corpname)
                multiprocessing.Process(target=functools.partial(
                    worker.run, tt_query, imp_cql, path, publish_path,
                    description)).start()
                result = {}
            else:
                # without this branch 'result' stays unbound and the method
                # fails later with a confusing UnboundLocalError
                raise ValueError('Unknown calc_backend: %s' % backend)
        else:
            raise UserActionException(_('Nothing specified!'))
        if result is not False:
            with plugins.runtime.SUBC_RESTORE as sr:
                try:
                    # only the part following the initial '[]' is stored
                    sr.store_query(user_id=self.session_get('user', 'id'),
                                   corpname=self.args.corpname,
                                   subcname=subcname,
                                   cql=full_cql.strip().split('[]', 1)[-1])
                except Exception as e:
                    # a failed backup must not break the already created subcorpus
                    logging.getLogger(__name__).warning(
                        'Failed to store subcorpus query: %s' % e)
                    self.add_system_message(
                        'warning',
                        _('Subcorpus created but there was a problem saving a backup copy.'
                          ))
            unfinished_corpora = filter(
                lambda at: not at.is_finished(),
                self.get_async_tasks(
                    category=AsyncTaskStatus.CATEGORY_SUBCORPUS))
            return dict(
                unfinished_subc=[uc.to_dict() for uc in unfinished_corpora])
        else:
            raise SubcorpusError(_('Empty subcorpus!'))