def get_list(self, plugin_api, user_allowed_corpora):
    """
    Return a list of corpus-list items (dicts) for all the corpora the
    current user may access.

    arguments:
    plugin_api -- object providing the current user's language (user_lang)
    user_allowed_corpora -- a dict (corpus_id, corpus_variant) containing corpora ids
                            accessible by the current user

    returns:
    a list of dicts with keys 'id', 'name', 'desc', 'size', 'path'
    """
    cl = []
    for item in self._raw_list(plugin_api.user_lang).values():
        corp_id, path, web = item['id'], item['path'], item['sentence_struct']
        if corp_id in user_allowed_corpora:
            try:
                corp_info = self.manatee_corpora.get_info(corp_id)
                cl.append({'id': corp_id,
                           'name': l10n.import_string(corp_info.name,
                                                      from_encoding=corp_info.encoding),
                           'desc': l10n.import_string(corp_info.description,
                                                      from_encoding=corp_info.encoding),
                           'size': corp_info.size,
                           'path': path
                           })
            except Exception as e:
                import logging
                # BUGFIX: log corp_id (always bound) rather than corp_info.name --
                # corp_info is unbound here whenever get_info() itself raised,
                # which would turn the log call into a secondary NameError.
                logging.getLogger(__name__).warn(
                    u'Failed to fetch info about %s with error %s (%r)' % (
                        corp_id, type(e).__name__, e))
                # fall back to a minimal, un-localized entry so the corpus
                # still shows up in the list
                cl.append({
                    'id': corp_id, 'name': corp_id,
                    'path': path, 'desc': '', 'size': None})
    return cl
def get_list(self, user_allowed_corpora):
    """
    Return a list of corpus-list items (dicts); a canonical id with no
    mapping in user_allowed_corpora falls back to itself.

    arguments:
    user_allowed_corpora -- a dict (corpus_canonical_id, corpus_id) containing
                            corpora ids accessible by the current user

    returns:
    a list of dicts with keys 'id', 'canonical_id', 'name', 'desc', 'size',
    'path', 'requestable'
    """
    cl = []
    for item in self._raw_list().values():
        canonical_id, path, web = item['id'], item['path'], item['sentence_struct']
        corp_id = user_allowed_corpora.get(canonical_id, canonical_id)
        try:
            corp_info = self._manatee_corpora.get_info(corp_id)
            cl.append({'id': corp_id,
                       'canonical_id': canonical_id,
                       'name': l10n.import_string(corp_info.name,
                                                  from_encoding=corp_info.encoding),
                       'desc': l10n.import_string(corp_info.description,
                                                  from_encoding=corp_info.encoding),
                       'size': corp_info.size,
                       'path': path,
                       'requestable': item.get('requestable', False)
                       })
        # FIX: use "except ... as e" (the py2-only "except Exception, e" form
        # is inconsistent with the rest of the file and invalid in py3)
        except Exception as e:
            import logging
            logging.getLogger(__name__).warn(
                u'Failed to fetch info about %s with error %s (%r)' % (
                    corp_id, type(e).__name__, e))
            cl.append({
                'id': corp_id, 'canonical_id': canonical_id, 'name': corp_id,
                'path': path, 'desc': '', 'size': None})
    # FIX: the original built cl but never returned it (callers received None)
    return cl
def ajax_get_corp_details(self, request):
    """
    Return a JSON-serializable overview of a corpus: localized name and
    description, formatted size, positional attributes, structures and
    an optional web URL taken from the corparch plug-in.

    arguments:
    request -- a request object; reads args['corpname']
    """
    corp_conf_info = plugins.get('corparch').get_corpus_info(request.args['corpname'])
    corpus = self.cm.get_Corpus(request.args['corpname'])
    encoding = corpus.get_conf('ENCODING')
    ans = {
        'corpname': l10n.import_string(self._canonical_corpname(corpus.get_conf('NAME')),
                                       from_encoding=encoding),
        'description': l10n.import_string(corpus.get_info(), from_encoding=encoding),
        'size': l10n.format_number(int(corpus.size())),
        'attrlist': [],
        'structlist': [],
        'web_url': corp_conf_info['web'] if corp_conf_info is not None else ''
    }
    try:
        ans['attrlist'] = [{'name': item,
                            'size': l10n.format_number(int(corpus.get_attr(item).id_range()))}
                           for item in corpus.get_conf('ATTRLIST').split(',')]
    except RuntimeError as e:
        # Manatee signals a missing/broken attribute via RuntimeError; degrade
        # to an error marker instead of failing the whole request
        logging.getLogger(__name__).warn('%s' % e)
        ans['attrlist'] = {'error': _('Failed to load')}
    ans['structlist'] = [{'name': item,
                          'size': l10n.format_number(int(corpus.get_struct(item).size()))}
                         for item in corpus.get_conf('STRUCTLIST').split(',')]
    return ans
def _create_subcorpus(self, request):
    """
    Create a new subcorpus, either from a manually entered within-condition
    or from the text-types form, and store it as a .subc file.

    req. arguments:
    subcname -- name of new subcorpus
    create -- bool, sets whether to create new subcorpus
    within_condition -- custom within condition; if non-empty then clickable form is omitted
    within_struct -- a structure the within_condition will be applied to

    raises ConcError on missing name, empty query or empty result
    """
    subcname = request.form['subcname']
    within_condition = request.form['within_condition']
    within_struct = request.form['within_struct']
    corp_encoding = self._corp().get_conf('ENCODING')
    if within_condition and within_struct:  # user entered a subcorpus query manually
        # encode to the corpus' own encoding before passing to Manatee
        tt_query = [(export_string(within_struct, to_encoding=corp_encoding),
                     export_string(within_condition, to_encoding=corp_encoding))]
    else:
        tt_query = self._texttype_query(request)
        # keep a decoded copy of the first pair for the subc_restore backup below
        within_struct = import_string(tt_query[0][0], from_encoding=corp_encoding)
        within_condition = import_string(tt_query[0][1], from_encoding=corp_encoding)
    # strip an aligned-corpus suffix (corp:aligned) to get the fs directory name
    basecorpname = self.args.corpname.split(':')[0]
    if not subcname:
        raise ConcError(_('No subcorpus name specified!'))
    path = os.path.join(self.subcpath[-1], basecorpname)
    if not os.path.isdir(path):
        os.makedirs(path)
    path = os.path.join(path, subcname) + '.subc'
    if not tt_query:
        raise ConcError(_('Nothing specified!'))
    # Even if _texttype_query() parsed multiple structures into tt_query,
    # Manatee can accept directly only one (but with arbitrarily complex attribute
    # condition).
    # For this reason, we choose only the first struct+condition pair.
    # It is up to the user interface to deal with it.
    structname, subquery = tt_query[0]
    if type(path) == unicode:  # py2: Manatee requires a byte-string path
        path = path.encode("utf-8")
    if corplib.create_subcorpus(path, self._corp(), structname, subquery):
        if plugins.has_plugin('subc_restore'):
            try:
                # best-effort backup of the query; failure only warns the user
                plugins.get('subc_restore').store_query(user_id=self._session_get('user', 'id'),
                                                        corpname=self.args.corpname,
                                                        subcname=subcname,
                                                        structname=within_struct,
                                                        condition=within_condition)
            except Exception as e:
                logging.getLogger(__name__).warning('Failed to store subcorpus query: %s' % e)
                self.add_system_message(
                    'warning',
                    _('Subcorpus created but there was a problem saving a backup copy.'))
        return {}
    else:
        raise ConcError(_('Empty subcorpus!'))
def get_full_ref(corp, pos):
    """
    Collect full reference (FULLREF structural attribute) values for a token
    position, skipping the configured speech-segment attribute.

    arguments:
    corp -- a Manatee corpus
    pos -- token position (int); the pseudo-attribute '#' maps to it directly

    returns:
    a dict with key 'Refs' (list of {'name', 'val'} dicts) plus one flattened
    key per attribute ('.' replaced by '_')
    """
    corpus_encoding = corp.get_conf('ENCODING')
    data = {}
    refs = [('#', str(pos)) if n == '#' else (n, corp.get_attr(n).pos2str(pos))
            for n in corp.get_conf('FULLREF').split(',')
            if n != settings.get('corpora', 'speech_segment_struct_attr')]
    # FIX: replaced the legacy "cond and X or Y" idiom with a real conditional
    # expression -- the old form fell through to the LABEL branch whenever the
    # translation of 'Token number' happened to be an empty (falsy) string
    data['Refs'] = [{'name': _('Token number') if n == '#' else (corp.get_conf(n + '.LABEL') or n),
                     'val': import_string(v, corpus_encoding)}
                    for n, v in refs]
    for n, v in refs:
        data[n.replace('.', '_')] = import_string(v, corpus_encoding)
    return data
def _load_raw_sent(self, corpus, corpus_id, token_id, kwic_len, tree_attrs):
    """
    Retrieve a sentence via Manatee

    Args:
        corpus (manatee.Corpus): a corpus instance
        corpus_id (str): corpus ID
        token_id (int): token number/id
        kwic_len (int): number of tokens in KWIC
        tree_attrs (list of str): a list of positional attributes required by tree nodes/edges

    Returns (dict):
        data: a list of strings (Manatee raw format)
        kwic_pos: a tuple (first_kwic_idx, kwic_length)
    """
    encoding = corpus.get_conf('ENCODING')
    sentence_struct = self._conf.get_sentence_struct(corpus_id)
    conc = manatee.Concordance(corpus, ' '.join(
        '[#%d]' % k for k in range(token_id, token_id + kwic_len)), 1, -1)
    conc.sync()
    kl = manatee.KWICLines(corpus, conc.RS(True, 0, 1), '-1:%s' % sentence_struct,
                           '1:%s' % sentence_struct, ','.join(tree_attrs),
                           ','.join(tree_attrs), '', '')
    if kl.nextline():
        left_tk = kl.get_left()
        kwic_tk = kl.get_kwic()
        # Each token occupies 4 items in Manatee's raw KWICLines output, hence
        # the division by 4.
        # FIX: use floor division (//) -- '/' only works here under py2 integer
        # division and would silently yield floats under py3.
        return dict(data=[import_string(s, from_encoding=encoding)
                          for s in left_tk + kwic_tk + kl.get_right()],
                    kwic_pos=(len(left_tk) // 4, len(kwic_tk) // 4))
def calculate_colls_bg(coll_args):
    """
    Background collocations calculation.
    This function is expected to be run either from Celery or from other process (via multiprocessing).

    arguments:
    coll_args -- an object bundling corpus/concordance/collocation parameters
                 (corpname, subcname, subcpath, user_id, q, cattr, csortfn, ...)

    returns:
    on success: dict(data=<collocations>, processing=0, tasks=[])
    when precalculated frequency data is missing: a dict describing the
    scheduled background calculation (keys 'attrname', 'tasks', 'processing', 'data')
    """
    cm = corplib.CorpusManager(subcpath=coll_args.subcpath)
    corp = cm.get_Corpus(coll_args.corpname, subcname=coll_args.subcname)
    try:
        # try to fetch precalculated data; if none then MissingSubCorpFreqFile
        corplib.frq_db(corp, coll_args.cattr)
        # NOTE(review): 'async' became a reserved word in py3.7 -- this keyword
        # argument would need renaming for a py3 migration
        conc = conclib.get_conc(corp=corp, user_id=coll_args.user_id, minsize=coll_args.minsize,
                                q=coll_args.q, fromp=0, pagesize=0, async=0,
                                save=coll_args.save, samplesize=coll_args.samplesize)
        if not conc.finished():
            raise UnfinishedConcordanceError(
                _('Cannot calculate yet - source concordance not finished. Please try again later.'))
        collocs = conc.collocs(cattr=coll_args.cattr, csortfn=coll_args.csortfn,
                               cbgrfns=coll_args.cbgrfns, cfromw=coll_args.cfromw,
                               ctow=coll_args.ctow, cminfreq=coll_args.cminfreq,
                               cminbgr=coll_args.cminbgr, max_lines=coll_args.num_fetch_items)
        for item in collocs['Items']:
            # wrap positive/negative filter queries into (form field, value) pairs
            item['pfilter'] = [('q2', item['pfilter'])]
            item['nfilter'] = [('q2', item['nfilter'])]
            item['str'] = import_string(item['str'], from_encoding=coll_args.corpus_encoding)
        return dict(data=collocs, processing=0, tasks=[])
    except corplib.MissingSubCorpFreqFile as e:
        # frequency database not yet built -> schedule its calculation and
        # report an empty result with processing status
        ans = {'attrname': coll_args.cattr, 'tasks': []}
        out = freq_calc.build_arf_db(e.args[0], coll_args.cattr)
        if type(out) is list:
            processing = 1
            ans['tasks'].extend(out)
        else:
            processing = 0
        ans['processing'] = processing
        ans['data'] = dict(Items=[], Head=[])
        return ans
def set_favorite_item(ctrl, request):
    """
    Store a new favorite item (a combination of corpora plus an optional
    subcorpus) for the current user via the USER_ITEMS plug-in.

    arguments:
    ctrl -- a controller instance providing cm (corpus manager) and _plugin_api
    request -- a request object; reads form fields 'corpora' (list),
               'subcorpus_id', 'subcorpus_orig_id'

    returns:
    the stored item as a dict
    """
    corpora = []
    main_size = None
    for i, c_id in enumerate(request.form.getlist('corpora')):
        # the subcorpus applies only to the first (main) corpus of the list
        corp = ctrl.cm.get_Corpus(c_id, subcname=request.form['subcorpus_id'] if i == 0 else None)
        if i == 0:
            main_size = corp.search_size()
        corpora.append(dict(id=c_id, name=l10n.import_string(
            corp.get_conf('NAME'), corp.get_conf('ENCODING'))))
    subcorpus_id = request.form['subcorpus_id']
    subcorpus_orig_id = request.form['subcorpus_orig_id']
    item = FavoriteItem(dict(
        # display name: "corp1 || corp2 / subcorpus"
        name=u' || '.join(c['name'] for c in corpora) +
             (u' / ' + subcorpus_orig_id if subcorpus_orig_id else u''),
        corpora=corpora,
        subcorpus_id=subcorpus_id,
        subcorpus_orig_id=subcorpus_orig_id,
        size=main_size,
        size_info=l10n.simplify_num(main_size)
    ))
    with plugins.runtime.USER_ITEMS as uit:
        uit.add_user_item(ctrl._plugin_api, item)
    return item.to_dict()
def get_detail_context(corp, pos, hitlen=1, detail_left_ctx=40, detail_right_ctx=40,
                       attrs=None, structs='', detail_ctx_incr=60):
    """
    Build an extended KWIC detail view (left context + kwic + right context)
    around token position `pos`.

    arguments:
    corp -- a Manatee corpus
    pos -- token position of the KWIC start
    hitlen -- number of tokens in the KWIC
    detail_left_ctx / detail_right_ctx -- requested context sizes (tokens),
        capped by corpus MAXDETAIL/MAXCONTEXT
    attrs -- positional attributes to fetch ('word' if None)
    structs -- comma-separated structures to include
    detail_ctx_incr -- how much the expand links grow the context

    returns:
    a dict with 'content' (list of {'str', 'class'} segments), expand link
    arguments, 'righttoleft', 'pos' and 'maxdetail'
    """
    data = {}
    corpus_encoding = corp.get_conf('ENCODING')
    wrapdetail = corp.get_conf('WRAPDETAIL')
    if wrapdetail:
        data['wrapdetail'] = '<%s>' % wrapdetail
        if not wrapdetail in structs.split(','):
            data['deletewrap'] = True
        structs = wrapdetail + ',' + structs
    else:
        data['wrapdetail'] = ''
    try:
        # MAXDETAIL limits the detail size; 0 falls back to MAXCONTEXT,
        # and 0 there again means "unlimited" (sys.maxint, py2)
        maxdetail = int(corp.get_conf('MAXDETAIL'))
        if maxdetail == 0:
            maxdetail = int(corp.get_conf('MAXCONTEXT'))
            if maxdetail == 0:
                maxdetail = sys.maxint
    except:
        maxdetail = 0
    if maxdetail:
        if detail_left_ctx > maxdetail:
            detail_left_ctx = maxdetail
        if detail_right_ctx > maxdetail:
            detail_right_ctx = maxdetail
    if detail_left_ctx > pos:  # cannot reach before the corpus start
        detail_left_ctx = pos
    query_attrs = 'word' if attrs is None else ','.join(attrs)
    cr = manatee.CorpRegion(corp, query_attrs, structs)
    region_left = tokens2strclass(cr.region(pos - detail_left_ctx, pos))
    region_kwic = tokens2strclass(cr.region(pos, pos + hitlen))
    region_right = tokens2strclass(
        cr.region(pos + hitlen, pos + hitlen + detail_right_ctx))
    for seg in region_left + region_kwic + region_right:
        # ===NONE=== is Manatee's placeholder for an empty value
        seg['str'] = import_string(seg['str'].replace('===NONE===', ''),
                                   from_encoding=corpus_encoding)
    for seg in region_kwic:
        if not seg['class']:
            seg['class'] = 'coll'
    data['content'] = region_left + region_kwic + region_right
    refbase = [('pos', pos)]
    if hitlen != 1:
        refbase.append(('hitlen', hitlen))
    # argument dicts for the "expand context" links in the UI
    data['expand_left_args'] = dict(
        refbase + [('detail_left_ctx', detail_left_ctx + detail_ctx_incr),
                   ('detail_right_ctx', detail_right_ctx)])
    data['expand_right_args'] = dict(refbase + [('detail_left_ctx', detail_left_ctx),
                                                ('detail_right_ctx', detail_right_ctx + detail_ctx_incr)])
    data['righttoleft'] = corp.get_conf('RIGHTTOLEFT')
    data['pos'] = pos
    data['maxdetail'] = maxdetail
    return data
def _create_subcorpus(self, request):
    """
    Create a new subcorpus from a raw CQL query, a custom within-JSON
    expression or the text-types form.

    req. arguments:
    subcname -- name of new subcorpus
    create -- bool, sets whether to create new subcorpus
    cql -- custom within condition

    raises UserActionException / ConcError on invalid input or empty result
    """
    subcname = request.form['subcname']
    within_json = request.form.get('within_json')
    raw_cql = request.form.get('cql')
    corp_encoding = self._corp().get_conf('ENCODING')
    if raw_cql:
        tt_query = ()
        within_cql = raw_cql
        full_cql = 'aword,[] %s' % raw_cql
        imp_cql = (full_cql,)
    elif within_json:  # user entered a subcorpus query manually
        tt_query = ()
        within_cql = self._deserialize_custom_within(json.loads(within_json))
        full_cql = 'aword,[] %s' % within_cql
        imp_cql = (full_cql,)
    else:
        # build the within expression from the text-types form selection
        tt_query = TextTypeCollector(self._corp(), request).get_query()
        full_cql = ' within '.join(['<%s %s />' % item for item in tt_query])
        full_cql = 'aword,[] within %s' % full_cql
        full_cql = import_string(full_cql, from_encoding=corp_encoding)
        imp_cql = (full_cql,)
    basecorpname = self.args.corpname.split(':')[0]
    if not subcname:
        raise UserActionException(_('No subcorpus name specified!'))
    path = self.prepare_subc_path(basecorpname, subcname)
    if type(path) == unicode:  # py2: Manatee requires a byte-string path
        path = path.encode('utf-8')
    if len(tt_query) == 1:
        # a single structure -> Manatee can create the subcorpus directly
        result = corplib.create_subcorpus(path, self._corp(), tt_query[0][0], tt_query[0][1])
    elif len(tt_query) > 1 or within_cql:
        # multiple structures or a custom CQL -> go through a concordance
        # NOTE(review): _session_get('user', 'user') looks suspicious -- other
        # call sites in this file use ('user', 'id'); verify the intended key
        conc = conclib.get_conc(self._corp(), self._session_get('user', 'user'), q=imp_cql)
        conc.sync()
        # NOTE(review): len(tt_query) == 1 is impossible in this branch, so
        # struct is always None here
        struct = self._corp().get_struct(tt_query[0][0]) if len(tt_query) == 1 else None
        result = corplib.subcorpus_from_conc(path, conc, struct)
    else:
        raise UserActionException(_('Nothing specified!'))
    if result:
        if plugins.has_plugin('subc_restore'):
            try:
                # best-effort backup of the query; failure only warns the user
                plugins.get('subc_restore').store_query(user_id=self._session_get('user', 'id'),
                                                        corpname=self.args.corpname,
                                                        subcname=subcname,
                                                        cql=full_cql.split('[]')[-1])
            except Exception as e:
                logging.getLogger(__name__).warning('Failed to store subcorpus query: %s' % e)
                self.add_system_message(
                    'warning',
                    _('Subcorpus created but there was a problem saving a backup copy.'))
        return {}
    else:
        raise ConcError(_('Empty subcorpus!'))
def fetch_attr(corp, attr, token_id, num_tokens):
    """
    Return a space-joined string of `num_tokens` consecutive values of a
    positional attribute, starting at `token_id`.

    arguments:
    corp -- a Manatee corpus
    attr -- positional attribute name (e.g. 'word', 'lemma')
    token_id -- starting token position (int or numeric string)
    num_tokens -- how many consecutive tokens to read

    returns:
    a single decoded string with values separated by spaces
    """
    mattr = corp.get_attr(attr)
    # hoisted loop invariants: the original re-read ENCODING and re-parsed
    # token_id on every iteration
    encoding = corp.get_conf('ENCODING')
    start = int(token_id)
    return ' '.join(import_string(mattr.pos2str(start + i), encoding)
                    for i in range(num_tokens))
def get_list(self, plugin_api, user_allowed_corpora):
    """
    Return a list of corpus-list items (dicts) for corpora whose canonical id
    the current user may access.

    arguments:
    plugin_api -- object providing the current user's language (user_lang)
    user_allowed_corpora -- a dict (corpus_canonical_id, corpus_id) containing corpora ids
                            accessible by the current user
    """
    simple_names = set(user_allowed_corpora.keys())
    cl = []
    for item in self._raw_list(plugin_api.user_lang).values():
        canonical_id, path, web = item['id'], item['path'], item['sentence_struct']
        if canonical_id in simple_names:
            # moved out of the try block: the key is guaranteed present
            # (canonical_id is in simple_names) and corp_id must be bound
            # for the except branch below
            corp_id = user_allowed_corpora[canonical_id]
            try:
                corp_info = self.manatee_corpora.get_info(corp_id)
                cl.append({
                    'id': corp_id,
                    'canonical_id': canonical_id,
                    'name': l10n.import_string(corp_info.name,
                                               from_encoding=corp_info.encoding),
                    'desc': l10n.import_string(corp_info.description,
                                               from_encoding=corp_info.encoding),
                    'size': corp_info.size,
                    'path': path
                })
            except Exception as e:
                import logging
                # BUGFIX: log corp_id (always bound) rather than corp_info.name --
                # corp_info is unbound whenever get_info() itself raised, turning
                # the log call into a secondary NameError
                logging.getLogger(__name__).warn(
                    u'Failed to fetch info about %s with error %s (%r)' % (
                        corp_id, type(e).__name__, e))
                cl.append({
                    'id': corp_id,
                    'canonical_id': canonical_id,
                    'name': corp_id,
                    'path': path,
                    'desc': '',
                    'size': None
                })
    return cl
def get_full_ref(corp, pos):
    """
    Build a dict of reference (FULLREF) attribute values for token position
    *pos*; the configured speech-segment attribute is skipped and the
    pseudo-attribute '#' maps to the position itself.
    """
    enc = corp.get_conf("ENCODING")
    skipped = settings.get("corpora", "speech_segment_struct_attr")
    refs = []
    for name in corp.get_conf("FULLREF").split(","):
        if name == skipped:
            continue
        if name == "#":
            refs.append(("#", str(pos)))
        else:
            refs.append((name, corp.get_attr(name).pos2str(pos)))
    listing = []
    for name, value in refs:
        label = (name == "#" and _("Token number")) or corp.get_conf(name + ".LABEL") or name
        listing.append({"name": label, "val": import_string(value, enc)})
    out = {"Refs": listing}
    for name, value in refs:
        out[name.replace(".", "_")] = import_string(value, enc)
    return out
def corpconf_pairs(self, corp, label):
    # Decode a registry value stored flattened as "<sep><k1><sep><v1>..."
    # into a list of 2-item slices; corpus names are resolved to instances.
    if type(corp) is UnicodeType:
        corp = self.get_Corpus(corp)
    raw = import_string(corp.get_conf(label), from_encoding=corp.get_conf("ENCODING"))
    items = raw[1:].split(raw[0]) if len(raw) > 2 else ""
    pairs = []
    for idx in range(0, len(items), 2):
        pairs.append(items[idx:idx + 2])
    return pairs
def get_list(self, user_allowed_corpora):
    """
    Return a list of corpus-list items (dicts); a canonical id with no mapping
    in user_allowed_corpora falls back to itself.

    arguments:
    user_allowed_corpora -- a dict (corpus_canonical_id, corpus_id) containing
                            corpora ids accessible by the current user
    """
    cl = []
    for item in self._raw_list().values():
        canonical_id, path, web = item['id'], item['path'], item['sentence_struct']
        corp_id = user_allowed_corpora.get(canonical_id, canonical_id)
        try:
            corp_info = self._manatee_corpora.get_info(corp_id)
            cl.append({
                'id': corp_id,
                'canonical_id': canonical_id,
                'name': l10n.import_string(corp_info.name,
                                           from_encoding=corp_info.encoding),
                'desc': l10n.import_string(corp_info.description,
                                           from_encoding=corp_info.encoding),
                'size': corp_info.size,
                'path': path,
                'requestable': item.get('requestable', False)
            })
        # FIX: use "except ... as e" (the py2-only "except Exception, e" form
        # is inconsistent with the rest of the file and invalid in py3)
        except Exception as e:
            import logging
            logging.getLogger(__name__).warn(
                u'Failed to fetch info about %s with error %s (%r)' % (
                    corp_id, type(e).__name__, e))
            cl.append({
                'id': corp_id,
                'canonical_id': canonical_id,
                'name': corp_id,
                'path': path,
                'desc': '',
                'size': None
            })
    # FIX: the original built cl but never returned it (callers received None)
    return cl
def corpconf_pairs(self, corp, label):
    """Split a flattened registry list value (separator-prefixed) into 2-item slices."""
    if type(corp) is UnicodeType:
        corp = self.get_Corpus(corp)
    decoded = import_string(corp.get_conf(label),
                            from_encoding=corp.get_conf('ENCODING'))
    if len(decoded) > 2:
        parts = decoded[1:].split(decoded[0])
    else:
        parts = ''
    return [parts[pos:pos + 2] for pos in range(0, len(parts), 2)]
def get_detail_context(
    corp, pos, hitlen=1, detail_left_ctx=40, detail_right_ctx=40,
    addattrs=None, structs="", detail_ctx_incr=60
):
    """
    Build an extended KWIC detail view (left context + kwic + right context)
    around token position `pos`; context sizes are capped by the corpus
    MAXDETAIL/MAXCONTEXT settings. Returns a dict with 'content' segments,
    'leftlink'/'rightlink' URL query fragments for expanding the context,
    'righttoleft', 'pos' and 'maxdetail'.
    """
    data = {}
    if addattrs is None:
        addattrs = []
    corpus_encoding = corp.get_conf("ENCODING")
    wrapdetail = corp.get_conf("WRAPDETAIL")
    if wrapdetail:
        data["wrapdetail"] = "<%s>" % wrapdetail
        if not wrapdetail in structs.split(","):
            data["deletewrap"] = True
        structs = wrapdetail + "," + structs
    else:
        data["wrapdetail"] = ""
    try:
        # MAXDETAIL limits the detail size; 0 falls back to MAXCONTEXT,
        # and 0 there again means "unlimited" (sys.maxint, py2)
        maxdetail = int(corp.get_conf("MAXDETAIL"))
        if maxdetail == 0:
            maxdetail = int(corp.get_conf("MAXCONTEXT"))
            if maxdetail == 0:
                maxdetail = sys.maxint
    except:
        maxdetail = 0
    if maxdetail:
        if detail_left_ctx > maxdetail:
            detail_left_ctx = maxdetail
        if detail_right_ctx > maxdetail:
            detail_right_ctx = maxdetail
    if detail_left_ctx > pos:  # cannot reach before the corpus start
        detail_left_ctx = pos
    attrs = ",".join(["word"] + addattrs)
    cr = manatee.CorpRegion(corp, attrs, structs)
    region_left = tokens2strclass(cr.region(pos - detail_left_ctx, pos))
    region_kwic = tokens2strclass(cr.region(pos, pos + hitlen))
    region_right = tokens2strclass(cr.region(pos + hitlen, pos + hitlen + detail_right_ctx))
    for seg in region_left + region_kwic + region_right:
        # ===NONE=== is Manatee's placeholder for an empty value
        seg["str"] = import_string(seg["str"].replace("===NONE===", ""),
                                   from_encoding=corpus_encoding)
    for seg in region_kwic:
        if not seg["class"]:
            seg["class"] = "coll"
    data["content"] = region_left + region_kwic + region_right
    # URL query fragments used by the "expand context" links
    refbase = "pos=%i&" % pos
    if hitlen != 1:
        refbase += "hitlen=%i&" % hitlen
    data["leftlink"] = refbase + (
        "detail_left_ctx=%i&detail_right_ctx=%i" % (detail_left_ctx + detail_ctx_incr, detail_right_ctx)
    )
    data["rightlink"] = refbase + (
        "detail_left_ctx=%i&detail_right_ctx=%i" % (detail_left_ctx, detail_right_ctx + detail_ctx_incr)
    )
    data["righttoleft"] = corp.get_conf("RIGHTTOLEFT")
    data["pos"] = pos
    data["maxdetail"] = maxdetail
    return data
def get_detail_context(corp, pos, hitlen=1, detail_left_ctx=40, detail_right_ctx=40,
                       addattrs=None, structs='', detail_ctx_incr=60):
    """
    Build an extended KWIC detail view (left context + kwic + right context)
    around token position `pos`.

    arguments:
    corp -- a Manatee corpus
    pos -- token position of the KWIC start
    hitlen -- number of tokens in the KWIC
    detail_left_ctx / detail_right_ctx -- requested context sizes (tokens),
        capped by corpus MAXDETAIL/MAXCONTEXT
    addattrs -- extra positional attributes to fetch besides 'word'
    structs -- comma-separated structures to include
    detail_ctx_incr -- how much the expand links grow the context

    returns:
    a dict with 'content' segments, 'leftlink'/'rightlink' URL query fragments,
    'righttoleft', 'pos' and 'maxdetail'
    """
    data = {}
    if addattrs is None:
        addattrs = []
    corpus_encoding = corp.get_conf('ENCODING')
    wrapdetail = corp.get_conf('WRAPDETAIL')
    if wrapdetail:
        data['wrapdetail'] = '<%s>' % wrapdetail
        if wrapdetail not in structs.split(','):  # idiomatic (was "not x in y")
            data['deletewrap'] = True
        structs = wrapdetail + ',' + structs
    else:
        data['wrapdetail'] = ''
    try:
        # MAXDETAIL limits the detail size; 0 falls back to MAXCONTEXT,
        # and 0 there again means "unlimited" (sys.maxint, py2)
        maxdetail = int(corp.get_conf('MAXDETAIL'))
        if maxdetail == 0:
            maxdetail = int(corp.get_conf('MAXCONTEXT'))
            if maxdetail == 0:
                maxdetail = sys.maxint
    except Exception:
        # FIX: was a bare "except:", which also swallowed SystemExit and
        # KeyboardInterrupt; a missing/non-numeric conf value still yields 0
        maxdetail = 0
    if maxdetail:
        if detail_left_ctx > maxdetail:
            detail_left_ctx = maxdetail
        if detail_right_ctx > maxdetail:
            detail_right_ctx = maxdetail
    if detail_left_ctx > pos:  # cannot reach before the corpus start
        detail_left_ctx = pos
    attrs = ','.join(['word'] + addattrs)
    cr = manatee.CorpRegion(corp, attrs, structs)
    region_left = tokens2strclass(cr.region(pos - detail_left_ctx, pos))
    region_kwic = tokens2strclass(cr.region(pos, pos + hitlen))
    region_right = tokens2strclass(cr.region(pos + hitlen, pos + hitlen + detail_right_ctx))
    for seg in region_left + region_kwic + region_right:
        # ===NONE=== is Manatee's placeholder for an empty value
        seg['str'] = import_string(seg['str'].replace('===NONE===', ''),
                                   from_encoding=corpus_encoding)
    for seg in region_kwic:
        if not seg['class']:
            seg['class'] = 'coll'
    data['content'] = region_left + region_kwic + region_right
    # URL query fragments used by the "expand context" links
    refbase = 'pos=%i&' % pos
    if hitlen != 1:
        refbase += 'hitlen=%i&' % hitlen
    data['leftlink'] = refbase + ('detail_left_ctx=%i&detail_right_ctx=%i'
                                  % (detail_left_ctx + detail_ctx_incr, detail_right_ctx))
    data['rightlink'] = refbase + ('detail_left_ctx=%i&detail_right_ctx=%i'
                                   % (detail_left_ctx, detail_right_ctx + detail_ctx_incr))
    data['righttoleft'] = corp.get_conf('RIGHTTOLEFT')
    data['pos'] = pos
    data['maxdetail'] = maxdetail
    return data
def get_list(self, user_allowed_corpora):
    """
    Return a list of corpus-list items (dicts) for corpora whose canonical id
    the current user may access.

    arguments:
    user_allowed_corpora -- a dict (corpus_canonical_id, corpus_id) containing
                            corpora ids accessible by the current user
    """
    simple_names = set(user_allowed_corpora.keys())
    cl = []
    for item in self._raw_list().values():
        canonical_id, path, web = item["id"], item["path"], item["sentence_struct"]
        if canonical_id in simple_names:
            # moved out of the try block: the key is guaranteed present
            # (canonical_id is in simple_names) and corp_id must be bound
            # for the except branch below
            corp_id = user_allowed_corpora[canonical_id]
            try:
                corp_info = self._manatee_corpora.get_info(corp_id)
                cl.append(
                    {
                        "id": corp_id,
                        "canonical_id": canonical_id,
                        "name": l10n.import_string(corp_info.name, from_encoding=corp_info.encoding),
                        "desc": l10n.import_string(corp_info.description, from_encoding=corp_info.encoding),
                        "size": corp_info.size,
                        "path": path,
                    }
                )
            # FIX: "except ... as e" (the py2-only "except Exception, e" form is
            # invalid in py3); also log corp_id instead of corp_info.name, which
            # is unbound whenever get_info() itself raised
            except Exception as e:
                import logging
                logging.getLogger(__name__).warn(
                    u"Failed to fetch info about %s with error %s (%r)" % (corp_id, type(e).__name__, e)
                )
                cl.append(
                    {
                        "id": corp_id,
                        "canonical_id": canonical_id,
                        "name": corp_id,
                        "path": path,
                        "desc": "",
                        "size": None,
                    }
                )
    # FIX: the original built cl but never returned it (callers received None)
    return cl
def _load_raw_sent(self, corpus, canonical_corpus_id, token_id, tree_attrs):
    """
    Fetch the sentence containing token_id as a list of decoded strings in
    Manatee's raw KWICLines format, or None if the token yields no line.
    """
    enc = corpus.get_conf('ENCODING')
    sent_struct = self._conf.get_sentence_struct(canonical_corpus_id)
    concordance = manatee.Concordance(corpus, '[#%d]' % token_id, 1, -1)
    concordance.sync()
    attr_spec = ','.join(tree_attrs)
    lines = manatee.KWICLines(corpus, concordance.RS(True, 0, 1),
                              '-1:%s' % sent_struct, '1:%s' % sent_struct,
                              attr_spec, attr_spec, '', '')
    if lines.nextline():
        tokens = lines.get_left() + lines.get_kwic() + lines.get_right()
        return [import_string(tok, from_encoding=enc) for tok in tokens]
def calculate_colls_bg(coll_args):
    """
    Background collocations calculation.
    This function is expected to be run either from Celery or from other process (via multiprocessing).

    arguments:
    coll_args -- an object bundling corpus/concordance/collocation parameters

    returns:
    on success: dict(data=<collocations>, processing=0, tasks=[]);
    when precalculated frequency data is missing: a dict describing the
    scheduled background calculation
    """
    cm = corplib.CorpusManager(subcpath=coll_args.subcpath)
    corp = cm.get_Corpus(coll_args.corpname, coll_args.subcname)
    try:
        corplib.frq_db(
            corp, coll_args.cattr
        )  # try to fetch precalculated data; if none then MissingSubCorpFreqFile
        # NOTE(review): 'async' became a reserved word in py3.7 -- this keyword
        # argument would need renaming for a py3 migration
        conc = conclib.get_conc(corp=corp, user_id=coll_args.user_id, minsize=coll_args.minsize,
                                q=coll_args.q, fromp=0, pagesize=0, async=0,
                                save=coll_args.save, samplesize=coll_args.samplesize)
        if not conc.finished():
            raise UnfinishedConcordanceError(
                _('Cannot calculate yet - source concordance not finished. Please try again later.'))
        collocs = conc.collocs(cattr=coll_args.cattr, csortfn=coll_args.csortfn,
                               cbgrfns=coll_args.cbgrfns, cfromw=coll_args.cfromw,
                               ctow=coll_args.ctow, cminfreq=coll_args.cminfreq,
                               cminbgr=coll_args.cminbgr, max_lines=coll_args.num_fetch_items)
        for item in collocs['Items']:
            # wrap positive/negative filter queries into (form field, value) pairs
            item['pfilter'] = [('q2', item['pfilter'])]
            item['nfilter'] = [('q2', item['nfilter'])]
            item['str'] = import_string(item['str'], from_encoding=coll_args.corpus_encoding)
        return dict(data=collocs, processing=0, tasks=[])
    except corplib.MissingSubCorpFreqFile as e:
        # frequency database not yet built -> schedule its calculation and
        # report an empty result with processing status
        ans = {'attrname': coll_args.cattr, 'tasks': []}
        out = freq_calc.build_arf_db(e.args[0], coll_args.cattr)
        if type(out) is list:
            processing = 1
            ans['tasks'].extend(out)
        else:
            processing = 0
        ans['processing'] = processing
        ans['data'] = dict(Items=[], Head=[])
        return ans
def get_detail_context(corp, pos, hitlen=1, detail_left_ctx=40, detail_right_ctx=40,
                       attrs=None, structs='', detail_ctx_incr=60):
    """
    Build an extended KWIC detail view (left context + kwic + right context)
    around token position `pos`; context sizes are capped by corpus
    MAXDETAIL/MAXCONTEXT settings.

    arguments:
    corp -- a Manatee corpus
    pos -- token position of the KWIC start
    hitlen -- number of tokens in the KWIC
    attrs -- positional attributes to fetch ('word' if None)
    structs -- comma-separated structures to include

    returns:
    a dict with 'content' segments, expand link argument dicts,
    'righttoleft', 'pos' and 'maxdetail'
    """
    data = {}
    corpus_encoding = corp.get_conf('ENCODING')
    wrapdetail = corp.get_conf('WRAPDETAIL')
    if wrapdetail:
        data['wrapdetail'] = '<%s>' % wrapdetail
        if not wrapdetail in structs.split(','):
            data['deletewrap'] = True
        structs = wrapdetail + ',' + structs
    else:
        data['wrapdetail'] = ''
    try:
        # MAXDETAIL limits the detail size; 0 falls back to MAXCONTEXT,
        # and 0 there again means "unlimited" (sys.maxint, py2)
        maxdetail = int(corp.get_conf('MAXDETAIL'))
        if maxdetail == 0:
            maxdetail = int(corp.get_conf('MAXCONTEXT'))
            if maxdetail == 0:
                maxdetail = sys.maxint
    except:
        maxdetail = 0
    if maxdetail:
        if detail_left_ctx > maxdetail:
            detail_left_ctx = maxdetail
        if detail_right_ctx > maxdetail:
            detail_right_ctx = maxdetail
    if detail_left_ctx > pos:  # cannot reach before the corpus start
        detail_left_ctx = pos
    query_attrs = 'word' if attrs is None else ','.join(attrs)
    cr = manatee.CorpRegion(corp, query_attrs, structs)
    region_left = tokens2strclass(cr.region(pos - detail_left_ctx, pos))
    region_kwic = tokens2strclass(cr.region(pos, pos + hitlen))
    region_right = tokens2strclass(cr.region(pos + hitlen, pos + hitlen + detail_right_ctx))
    for seg in region_left + region_kwic + region_right:
        # ===NONE=== is Manatee's placeholder for an empty value
        seg['str'] = import_string(seg['str'].replace('===NONE===', ''),
                                   from_encoding=corpus_encoding)
    for seg in region_kwic:
        if not seg['class']:
            seg['class'] = 'coll'
    data['content'] = region_left + region_kwic + region_right
    refbase = [('pos', pos)]
    if hitlen != 1:
        refbase.append(('hitlen', hitlen))
    # argument dicts for the "expand context" links in the UI
    data['expand_left_args'] = dict(refbase + [('detail_left_ctx', detail_left_ctx + detail_ctx_incr),
                                               ('detail_right_ctx', detail_right_ctx)])
    data['expand_right_args'] = dict(refbase + [('detail_left_ctx', detail_left_ctx),
                                                ('detail_right_ctx', detail_right_ctx + detail_ctx_incr)])
    data['righttoleft'] = corp.get_conf('RIGHTTOLEFT')
    data['pos'] = pos
    data['maxdetail'] = maxdetail
    return data
def _load_raw_sent(self, corpus, canonical_corpus_id, token_id, tree_attrs):
    """
    Retrieve the sentence containing a token via Manatee.

    arguments:
    corpus -- a manatee.Corpus instance
    canonical_corpus_id -- corpus ID used to look up the sentence structure
    token_id -- token number/id (int)
    tree_attrs -- list of positional attribute names required by tree nodes/edges

    returns:
    a list of decoded strings in Manatee's raw KWICLines format, or None
    if no line is available for the token
    """
    encoding = corpus.get_conf('ENCODING')
    sentence_struct = self._conf.get_sentence_struct(canonical_corpus_id)
    conc = manatee.Concordance(corpus, '[#%d]' % token_id, 1, -1)
    conc.sync()
    # context -1:<struct> .. 1:<struct> expands the KWIC to the whole sentence
    kl = manatee.KWICLines(corpus, conc.RS(True, 0, 1), '-1:%s' % sentence_struct,
                           '1:%s' % sentence_struct, ','.join(tree_attrs),
                           ','.join(tree_attrs), '', '')
    if kl.nextline():
        return [
            import_string(s, from_encoding=encoding)
            for s in kl.get_left() + kl.get_kwic() + kl.get_right()
        ]
def corpconf_pairs(self, corp, label):
    """
    Encodes some specific corpus registry file configuration values where
    a list of pairs is actually flattened (k1, v1, k2, v2,..., kN, vN).
    This applies e.g. for WPOSLIST and LPOSLIST.

    arguments:
    corp -- a corpus instance or a corpus name (string)
    label -- the registry configuration key to decode

    Returns:
    a list of pairs
    """
    # BUGFIX: "type(corp) is basestring" was always False (basestring is an
    # abstract base; concrete instances are str/unicode), so a corpus passed
    # as a name was never resolved to an instance -- isinstance() performs
    # the intended check
    if isinstance(corp, basestring):
        corp = self.get_Corpus(corp)
    # first character of the value is the separator for the rest
    val = import_string(corp.get_conf(label), from_encoding=corp.get_conf('ENCODING'))
    if len(val) > 2:
        val = val[1:].split(val[0])
    else:
        val = ''
    return [(val[i], val[i + 1]) for i in range(0, len(val), 2)]
def import_str(s):
    # Nested helper: decode a raw corpus string using the corpus' own encoding.
    # NOTE: relies on `self` being a free variable from the enclosing scope
    # (this is a closure defined inside a method, not a standalone function).
    return import_string(s, self.corp.get_conf('ENCODING'))
def _create_subcorpus(self, request):
    """
    Create a new (optionally published) subcorpus from one of four sources:
    a raw CQL query, a custom within-JSON expression, a live-attributes
    selection over aligned corpora, or the text-types form. Large jobs are
    dispatched to a background worker (celery/konserver/multiprocessing).

    req. arguments:
    subcname -- name of new subcorpus
    create -- bool, sets whether to create new subcorpus
    cql -- custom within condition
    """
    subcname = request.form['subcname']
    within_json = request.form.get('within_json')
    raw_cql = request.form.get('cql')
    aligned_corpora = request.form.getlist('aligned_corpora')
    publish = bool(int(request.form.get('publish')))
    corpus_info = self.get_corpus_info(self.args.corpname)
    description = request.form.get('description')
    if not subcname:
        raise UserActionException(translate('No subcorpus name specified!'))
    if publish and not description:
        raise UserActionException(translate('No description specified'))
    if raw_cql:
        # user-supplied CQL overrides any aligned-corpora selection
        aligned_corpora = []
        tt_query = ()
        within_cql = raw_cql
        full_cql = 'aword,[] %s' % raw_cql
        imp_cql = (full_cql,)
    elif within_json:  # user entered a subcorpus query manually
        aligned_corpora = []
        tt_query = ()
        within_cql = self._deserialize_custom_within(json.loads(within_json))
        full_cql = 'aword,[] %s' % within_cql
        imp_cql = (full_cql,)
    elif len(aligned_corpora) > 0 and plugins.runtime.LIVE_ATTRIBUTES.exists:
        # restrict by documents matching the live-attributes selection across
        # the aligned corpora; requires a bibliography (label/id) configuration
        if corpus_info.metadata.label_attr and corpus_info.metadata.id_attr:
            within_cql = None
            attrs = json.loads(request.form.get('attrs', '{}'))
            sel_match = plugins.runtime.LIVE_ATTRIBUTES.instance.get_attr_values(
                self._plugin_api, corpus=self.corp, attr_map=attrs,
                aligned_corpora=aligned_corpora, limit_lists=False)
            values = sel_match['attr_values'][corpus_info.metadata.label_attr]
            args = argmapping.Args()
            setattr(args, 'sca_{0}'.format(
                corpus_info.metadata.id_attr), [v[1] for v in values])
            tt_query = TextTypeCollector(self.corp, args).get_query()
            tmp = ['<%s %s />' % item for item in tt_query]
            full_cql = ' within '.join(tmp)
            full_cql = 'aword,[] within %s' % full_cql
            full_cql = import_string(full_cql, from_encoding=self.corp_encoding)
            imp_cql = (full_cql,)
        else:
            raise FunctionNotSupported(
                'Corpus must have a bibliography item defined to support this function')
    else:
        # plain text-types form selection
        within_cql = None
        tt_query = TextTypeCollector(self.corp, request).get_query()
        tmp = ['<%s %s />' % item for item in tt_query]
        full_cql = ' within '.join(tmp)
        full_cql = 'aword,[] within %s' % full_cql
        full_cql = import_string(full_cql, from_encoding=self.corp_encoding)
        imp_cql = (full_cql,)
    basecorpname = self.args.corpname.split(':')[0]
    path = self.prepare_subc_path(basecorpname, subcname, publish=False)
    publish_path = self.prepare_subc_path(
        basecorpname, subcname, publish=True) if publish else None
    if type(path) == unicode:  # py2: Manatee requires a byte-string path
        path = path.encode('utf-8')
    if len(tt_query) == 1 and len(aligned_corpora) == 0:
        # single structure, no alignment -> Manatee can create it synchronously
        result = corplib.create_subcorpus(path, self.corp, tt_query[0][0], tt_query[0][1])
        if result and publish_path:
            corplib.mk_publish_links(path, publish_path, self.session_get(
                'user', 'fullname'), description)
    elif len(tt_query) > 1 or within_cql or len(aligned_corpora) > 0:
        # complex query -> delegate to a background calculation backend
        backend = settings.get('calc_backend', 'type')
        if backend in ('celery', 'konserver'):
            import bgcalc
            app = bgcalc.calc_backend_client(settings)
            res = app.send_task('worker.create_subcorpus',
                                (self.session_get('user', 'id'), self.args.corpname, path,
                                 publish_path, tt_query, imp_cql,
                                 self.session_get('user', 'fullname'), description),
                                time_limit=TASK_TIME_LIMIT)
            self._store_async_task(AsyncTaskStatus(status=res.status, ident=res.id,
                                                   category=AsyncTaskStatus.CATEGORY_SUBCORPUS,
                                                   label=u'%s:%s' % (basecorpname, subcname),
                                                   args=dict(subcname=subcname,
                                                             corpname=basecorpname)))
            result = {}
        elif backend == 'multiprocessing':
            from bgcalc import subc_calc
            import functools
            import multiprocessing
            worker = subc_calc.CreateSubcorpusTask(user_id=self.session_get('user', 'id'),
                                                   corpus_id=self.args.corpname)
            multiprocessing.Process(target=functools.partial(
                worker.run, tt_query, imp_cql, path, publish_path, description)).start()
            result = {}
        # NOTE(review): an unrecognized backend value leaves `result` unbound
        # here -> UnboundLocalError below; confirm backend values are validated
        # upstream
    else:
        raise UserActionException(translate('Nothing specified!'))
    if result is not False:
        with plugins.runtime.SUBC_RESTORE as sr:
            try:
                # best-effort backup of the query; failure only warns the user
                sr.store_query(user_id=self.session_get('user', 'id'),
                               corpname=self.args.corpname, subcname=subcname,
                               cql=full_cql.strip().split('[]', 1)[-1])
            except Exception as e:
                logging.getLogger(__name__).warning('Failed to store subcorpus query: %s' % e)
                self.add_system_message(
                    'warning',
                    translate('Subcorpus created but there was a problem saving a backup copy.'))
        unfinished_corpora = filter(lambda at: not at.is_finished(),
                                    self.get_async_tasks(category=AsyncTaskStatus.CATEGORY_SUBCORPUS))
        return dict(processed_subc=[uc.to_dict() for uc in unfinished_corpora])
    else:
        raise SubcorpusError(translate('Empty subcorpus!'))
def texttype_values(corp, subcorpattrs, maxlistsize, shrink_list=False, collator_locale=None):
    """
    Load text-type (structural attribute) values used by subcorpus/text-type forms.

    arguments:
    corp -- manatee.Corpus
    subcorpattrs -- structures and attributes to be processed (see Manatee's SUBCORPATTRS)
    maxlistsize -- in case there is more that this number of items, empty list will be returned
    shrink_list -- list/tuple of attributes we want to return empty lists for
                   (False can be used to specify an empty value)
    collator_locale -- a collator used to sort attribute values (en_US is the default)

    returns:
    a list containing following dictionaries
    { 'Line' : [
        { 'attr_doc_label' : '', 'Values' : [ {'v', 'item name'}, ... ],
          'name' : '', 'attr_doc' : '', 'label' : '' },
        ... ]}
    """
    if subcorpattrs == '#':
        return []
    attrlines = []
    if shrink_list is False:
        # normalize the sentinel so the 'n in shrink_list' test below is always valid
        shrink_list = ()
    for subcorpline in subcorpattrs.split(','):
        attrvals = []
        for n in subcorpline.split('|'):
            if n in ('', '#'):
                continue
            attr = corp.get_attr(n)
            attrval = {
                'name': n,
                'label': corp.get_conf(n + '.LABEL') or n,
                'attr_doc': corp.get_conf(n + '.ATTRDOC'),
                'attr_doc_label': corp.get_conf(n + '.ATTRDOCLABEL'),
                'numeric': conf_bool(corp.get_conf(n + '.NUMERIC'))
            }
            hsep = corp.get_conf(n + '.HIERARCHICAL')
            multisep = corp.get_conf(n + '.MULTISEP')
            is_multival = corp.get_conf(n + '.MULTIVAL') in ('y', 'yes')
            # too many values, or an explicitly shrunk attribute -> offer a text input
            # (with a configured or default width) instead of a value list
            if (not hsep and (corp.get_conf(n + '.TEXTBOXLENGTH') or
                              attr.id_range() > maxlistsize or n in shrink_list)):
                attrval['textboxlength'] = (corp.get_conf(n + '.TEXTBOXLENGTH') or 24)
            else:  # list of values
                if conf_bool(corp.get_conf(n + '.NUMERIC')):
                    vals = []
                    for i in range(attr.id_range()):
                        try:
                            vals.append({'v': int(attr.id2str(i))})
                        except ValueError:
                            # a "numeric" attribute may still contain non-numeric
                            # values - keep those as strings
                            vals.append({'v': attr.id2str(i)})
                elif hsep:  # hierarchical
                    vals = [{'v': attr.id2str(i)}
                            for i in range(attr.id_range())
                            if multisep not in attr.id2str(i)]
                else:
                    if is_multival:
                        raw_vals = [import_string(attr.id2str(i),
                                                  from_encoding=corp.get_conf('ENCODING')).split(multisep)
                                    for i in range(attr.id_range())]
                        vals = [{'v': x}
                                for x in sorted(set([s for subl in raw_vals for s in subl]))]
                    else:
                        vals = [{'v': import_string(attr.id2str(i),
                                                    from_encoding=corp.get_conf('ENCODING'))}
                                for i in range(attr.id_range())]
                if hsep:  # hierarchical
                    attrval['hierarchical'] = hsep
                    attrval['Values'] = _get_attr_hierarchy(vals, hsep)
                elif conf_bool(corp.get_conf(n + '.NUMERIC')):
                    attrval['Values'] = sorted(vals, key=lambda item: item['v'])
                elif collator_locale:
                    attrval['Values'] = l10n.sort(vals, collator_locale, key=lambda item: item['v'])
                else:
                    # case-insensitive ordering; key= replaces the Py2-only cmp= form
                    # with an equivalent (and Py3-compatible) comparison
                    attrval['Values'] = sorted(vals, key=lambda item: item['v'].lower())
            attrvals.append(attrval)
        attrlines.append({'Line': attrvals})
    return attrlines
def texttype_values(corp, subcorpattrs, maxlistsize, shrink_list=False):
    """
    Load text-type (structural attribute) values for subcorpus-related forms.

    arguments:
    corp -- manatee.Corpus
    subcorpattrs -- structures and attributes to be processed (Manatee's SUBCORPATTRS)
    maxlistsize -- in case there is more that this number of items, empty list will be returned
    shrink_list -- list/tuple of attributes we want to return empty lists for
                   (False can be used to specify an empty value)

    returns:
    a list containing following dictionaries
    { 'Line' : [
        { 'attr_doc_label' : '', 'Values' : [ {'v', 'item name'}, ... ],
          'name' : '', 'attr_doc' : '', 'label' : '' },
        ... ]}
    """
    if subcorpattrs == "#":
        return []
    attrlines = []
    if shrink_list is False:
        # BUGFIX: without this normalization the 'n in shrink_list' test below
        # raises TypeError ('bool' is not iterable) whenever it is reached with
        # the default argument value
        shrink_list = ()
    for subcorpline in subcorpattrs.split(","):
        attrvals = []
        for n in subcorpline.split("|"):
            if n in ("", "#"):
                continue
            attr = corp.get_attr(n)
            attrval = {
                "name": n,
                "label": corp.get_conf(n + ".LABEL") or n,
                "attr_doc": corp.get_conf(n + ".ATTRDOC"),
                "attr_doc_label": corp.get_conf(n + ".ATTRDOCLABEL"),
            }
            hsep = corp.get_conf(n + ".HIERARCHICAL")
            multisep = corp.get_conf(n + ".MULTISEP")
            is_multival = corp.get_conf(n + ".MULTIVAL") in ("y", "yes")
            # too many values, or an explicitly shrunk attribute -> text input
            # (with a configured or default width) instead of a value list
            if not hsep and (corp.get_conf(n + ".TEXTBOXLENGTH")
                             or attr.id_range() > maxlistsize or n in shrink_list):
                attrval["textboxlength"] = corp.get_conf(n + ".TEXTBOXLENGTH") or 24
            else:  # list of values
                if corp.get_conf(n + ".NUMERIC"):
                    vals = []
                    for i in range(attr.id_range()):
                        try:
                            vals.append({"v": int(attr.id2str(i))})
                        except ValueError:
                            # a "numeric" attribute may still contain non-numeric
                            # values - keep those as strings
                            vals.append({"v": attr.id2str(i)})
                elif hsep:  # hierarchical
                    vals = [{"v": attr.id2str(i)}
                            for i in range(attr.id_range())
                            if multisep not in attr.id2str(i)]
                else:
                    if is_multival:
                        raw_vals = [
                            import_string(attr.id2str(i),
                                          from_encoding=corp.get_conf("ENCODING")).split(multisep)
                            for i in range(attr.id_range())
                        ]
                        vals = [{"v": x}
                                for x in sorted(set([s for subl in raw_vals for s in subl]))]
                    else:
                        vals = [
                            {"v": import_string(attr.id2str(i),
                                                from_encoding=corp.get_conf("ENCODING"))}
                            for i in range(attr.id_range())
                        ]
                if hsep:  # hierarchical
                    attrval["hierarchical"] = hsep
                    attrval["Values"] = get_attr_hierarchy(vals, hsep, multisep)
                else:
                    # case-insensitive ordering; key= replaces the Py2-only cmp= form
                    attrval["Values"] = sorted(vals, key=lambda item: item["v"].lower())
            attrvals.append(attrval)
        attrlines.append({"Line": attrvals})
    return attrlines
def get_detail_context(corp, pos, hitlen=1, detail_left_ctx=40, detail_right_ctx=40,
                       addattrs=None, structs='', detail_ctx_incr=60):
    """
    Build the 'sentence detail' view data for a single KWIC position.

    arguments:
    corp -- manatee.Corpus
    pos -- token position of the KWIC hit
    hitlen -- number of tokens the hit spans
    detail_left_ctx / detail_right_ctx -- requested context sizes (in tokens);
        clamped to the corpus MAXDETAIL/MAXCONTEXT limit and to the corpus start
    addattrs -- additional positional attributes to fetch along with 'word'
    structs -- comma-separated structures to include in the region
    detail_ctx_incr -- how much the left/right expansion links grow the context

    returns:
    a dict with rendered 'content' segments, expansion links and related metadata
    """
    data = {}
    if addattrs is None:
        addattrs = []
    corpus_encoding = corp.get_conf('ENCODING')
    wrapdetail = corp.get_conf('WRAPDETAIL')
    if wrapdetail:
        data['wrapdetail'] = '<%s>' % wrapdetail
        if wrapdetail not in structs.split(','):
            data['deletewrap'] = True
        structs = wrapdetail + ',' + structs
    else:
        data['wrapdetail'] = ''
    try:
        maxdetail = int(corp.get_conf('MAXDETAIL'))
        if maxdetail == 0:
            maxdetail = int(corp.get_conf('MAXCONTEXT'))
            if maxdetail == 0:
                maxdetail = sys.maxint
    except Exception:
        # missing or non-numeric MAXDETAIL/MAXCONTEXT -> treat as "no limit set";
        # narrowed from a bare except so SystemExit/KeyboardInterrupt pass through
        maxdetail = 0
    if maxdetail:
        if detail_left_ctx > maxdetail:
            detail_left_ctx = maxdetail
        if detail_right_ctx > maxdetail:
            detail_right_ctx = maxdetail
    if detail_left_ctx > pos:
        # do not reach before the beginning of the corpus
        detail_left_ctx = pos
    attrs = ','.join(['word'] + addattrs)
    cr = manatee.CorpRegion(corp, attrs, structs)
    region_left = tokens2strclass(cr.region(pos - detail_left_ctx, pos))
    region_kwic = tokens2strclass(cr.region(pos, pos + hitlen))
    region_right = tokens2strclass(
        cr.region(pos + hitlen, pos + hitlen + detail_right_ctx))
    for seg in region_left + region_kwic + region_right:
        seg['str'] = import_string(seg['str'].replace('===NONE===', ''),
                                   from_encoding=corpus_encoding)
    for seg in region_kwic:
        if not seg['class']:
            seg['class'] = 'coll'
    data['content'] = region_left + region_kwic + region_right
    refbase = 'pos=%i&' % pos
    if hitlen != 1:
        refbase += 'hitlen=%i&' % hitlen
    data['leftlink'] = refbase + ('detail_left_ctx=%i&detail_right_ctx=%i'
                                  % (detail_left_ctx + detail_ctx_incr, detail_right_ctx))
    data['rightlink'] = refbase + ('detail_left_ctx=%i&detail_right_ctx=%i'
                                   % (detail_left_ctx, detail_right_ctx + detail_ctx_incr))
    data['righttoleft'] = corp.get_conf('RIGHTTOLEFT')
    data['pos'] = pos
    data['maxdetail'] = maxdetail
    return data
def texttype_values(corp, subcorpattrs, maxlistsize, shrink_list=False, collator_locale=None):
    """
    Fetch structural attribute values for the text-type selection UI.

    arguments:
    corp -- manatee.Corpus
    subcorpattrs -- structures and attributes to be processed (see Manatee's SUBCORPATTRS)
    maxlistsize -- in case there is more that this number of items, empty list will be returned
    shrink_list -- list/tuple of attributes we want to return empty lists for
                   (False can be used to specify an empty value)
    collator_locale -- a collator used to sort attribute values (en_US is the default)

    returns:
    a list containing following dictionaries
    { 'Line' : [
        { 'attr_doc_label' : '', 'Values' : [ {'v', 'item name'}, ... ],
          'name' : '', 'attr_doc' : '', 'label' : '' },
        ... ]}
    """
    if subcorpattrs == '#':
        return []
    attrlines = []
    if shrink_list is False:
        # normalize the sentinel so 'n in shrink_list' below always gets a container
        shrink_list = ()
    for subcorpline in subcorpattrs.split(','):
        attrvals = []
        for n in subcorpline.split('|'):
            if n in ('', '#'):
                continue
            attr = corp.get_attr(n)
            attrval = {
                'name': n,
                'label': corp.get_conf(n + '.LABEL') or n,
                'attr_doc': corp.get_conf(n + '.ATTRDOC'),
                'attr_doc_label': corp.get_conf(n + '.ATTRDOCLABEL'),
                'numeric': conf_bool(corp.get_conf(n + '.NUMERIC'))
            }
            hsep = corp.get_conf(n + '.HIERARCHICAL')
            multisep = corp.get_conf(n + '.MULTISEP')
            is_multival = corp.get_conf(n + '.MULTIVAL') in ('y', 'yes')
            # a value list that is too long (or an explicitly shrunk attribute)
            # is replaced by a free-text input of a configured/default width
            if (not hsep and (corp.get_conf(n + '.TEXTBOXLENGTH') or
                              attr.id_range() > maxlistsize or n in shrink_list)):
                attrval['textboxlength'] = (corp.get_conf(n + '.TEXTBOXLENGTH') or 24)
            else:  # list of values
                if conf_bool(corp.get_conf(n + '.NUMERIC')):
                    vals = []
                    for i in range(attr.id_range()):
                        try:
                            vals.append({'v': int(attr.id2str(i))})
                        except ValueError:
                            # keep non-parseable values of a "numeric" attribute as strings
                            vals.append({'v': attr.id2str(i)})
                elif hsep:  # hierarchical
                    vals = [{'v': attr.id2str(i)}
                            for i in range(attr.id_range())
                            if multisep not in attr.id2str(i)]
                else:
                    if is_multival:
                        raw_vals = [import_string(attr.id2str(i),
                                                  from_encoding=corp.get_conf('ENCODING')).split(multisep)
                                    for i in range(attr.id_range())]
                        vals = [{'v': x}
                                for x in sorted(set([s for subl in raw_vals for s in subl]))]
                    else:
                        vals = [{'v': import_string(attr.id2str(i),
                                                    from_encoding=corp.get_conf('ENCODING'))}
                                for i in range(attr.id_range())]
                if hsep:  # hierarchical
                    attrval['hierarchical'] = hsep
                    attrval['Values'] = _get_attr_hierarchy(vals, hsep)
                elif conf_bool(corp.get_conf(n + '.NUMERIC')):
                    attrval['Values'] = sorted(vals, key=lambda item: item['v'])
                elif collator_locale:
                    attrval['Values'] = l10n.sort(vals, collator_locale, key=lambda item: item['v'])
                else:
                    # case-insensitive sort; equivalent key= form replaces Py2-only cmp=
                    attrval['Values'] = sorted(vals, key=lambda item: item['v'].lower())
            attrvals.append(attrval)
        attrlines.append({'Line': attrvals})
    return attrlines
def _create_subcorpus(self, request):
    """
    Create a new (possibly published) subcorpus based on the submitted form.

    req. arguments:
    subcname -- name of new subcorpus
    create -- bool, sets whether to create new subcorpus
    cql -- custom within condition

    Depending on the input, the subcorpus is either created synchronously
    (a single text-type condition, no aligned corpora) or handed over to a
    background worker (celery task or a multiprocessing process).

    returns:
    a dict with 'unfinished_subc' listing still-running subcorpus tasks

    raises:
    UserActionException -- missing subcorpus name / empty specification
    FunctionNotSupported -- aligned-corpora mode without bibliography metadata
    SubcorpusError -- the resulting subcorpus would be empty
    """
    subcname = request.form['subcname']
    within_json = request.form.get('within_json')
    raw_cql = request.form.get('cql')
    aligned_corpora = request.form.getlist('aligned_corpora')
    # BUGFIX: default to '0' - int(None) would raise TypeError when the
    # 'publish' field is not submitted at all
    publish = bool(int(request.form.get('publish', '0')))
    corpus_info = self.get_corpus_info(self.args.corpname)
    description = request.form.get('description')

    if raw_cql:
        # user-supplied raw CQL 'within' condition has the highest priority
        aligned_corpora = []
        tt_query = ()
        within_cql = raw_cql
        full_cql = 'aword,[] %s' % raw_cql
        imp_cql = (full_cql,)
    elif within_json:  # user entered a subcorpus query manually
        aligned_corpora = []
        tt_query = ()
        within_cql = self._deserialize_custom_within(json.loads(within_json))
        full_cql = 'aword,[] %s' % within_cql
        imp_cql = (full_cql,)
    elif len(aligned_corpora) > 0 and plugins.runtime.LIVE_ATTRIBUTES.exists:
        # aligned corpora involved - restrict the selection via live attributes;
        # this requires the corpus to define bibliography (label/id) metadata
        if corpus_info.metadata.label_attr and corpus_info.metadata.id_attr:
            within_cql = None
            attrs = json.loads(request.form.get('attrs', '{}'))
            sel_match = plugins.runtime.LIVE_ATTRIBUTES.instance.get_attr_values(
                self._plugin_api, corpus=self.corp, attr_map=attrs,
                aligned_corpora=aligned_corpora, limit_lists=False)
            values = sel_match['attr_values'][corpus_info.metadata.label_attr]
            args = argmapping.Args()
            setattr(args, 'sca_{0}'.format(corpus_info.metadata.id_attr),
                    [v[1] for v in values])
            tt_query = TextTypeCollector(self.corp, args).get_query()
            tmp = ['<%s %s />' % item for item in tt_query]
            full_cql = ' within '.join(tmp)
            full_cql = 'aword,[] within %s' % full_cql
            full_cql = import_string(full_cql, from_encoding=self.corp_encoding)
            imp_cql = (full_cql,)
        else:
            raise FunctionNotSupported(
                'Corpus must have a bibliography item defined to support this function')
    else:
        # plain text-type checkbox selection
        within_cql = None
        tt_query = TextTypeCollector(self.corp, request).get_query()
        tmp = ['<%s %s />' % item for item in tt_query]
        full_cql = ' within '.join(tmp)
        full_cql = 'aword,[] within %s' % full_cql
        full_cql = import_string(full_cql, from_encoding=self.corp_encoding)
        imp_cql = (full_cql,)

    basecorpname = self.args.corpname.split(':')[0]
    if not subcname:
        raise UserActionException(_('No subcorpus name specified!'))
    path = self.prepare_subc_path(basecorpname, subcname, publish=False)
    publish_path = self.prepare_subc_path(
        basecorpname, subcname, publish=True) if publish else None

    if type(path) == unicode:
        path = path.encode('utf-8')

    if len(tt_query) == 1 and len(aligned_corpora) == 0:
        # simple case - create the subcorpus synchronously
        result = corplib.create_subcorpus(path, self.corp, tt_query[0][0], tt_query[0][1])
        if result and publish_path:
            corplib.mk_publish_links(path, publish_path, description)
    elif len(tt_query) > 1 or within_cql or len(aligned_corpora) > 0:
        # complex case - delegate the work to a background worker
        backend, conf = settings.get_full('global', 'calc_backend')
        if backend == 'celery':
            import task
            app = task.get_celery_app(conf['conf'])
            res = app.send_task(
                'worker.create_subcorpus',
                (self.session_get('user', 'id'), self.args.corpname, path, publish_path,
                 tt_query, imp_cql, description),
                time_limit=TASK_TIME_LIMIT)
            self._store_async_task(AsyncTaskStatus(
                status=res.status, ident=res.id,
                category=AsyncTaskStatus.CATEGORY_SUBCORPUS,
                label=u'%s:%s' % (basecorpname, subcname),
                args=dict(subcname=subcname, corpname=basecorpname)))
            result = {}
        elif backend == 'multiprocessing':
            from bgcalc import subc_calc
            import functools
            import multiprocessing
            worker = subc_calc.CreateSubcorpusTask(
                user_id=self.session_get('user', 'id'),
                corpus_id=self.args.corpname)
            multiprocessing.Process(target=functools.partial(
                worker.run, tt_query, imp_cql, path, publish_path, description)).start()
            result = {}
        # NOTE(review): an unknown calc_backend value leaves 'result' unbound and
        # fails below with NameError - consider raising a configuration error here
    else:
        raise UserActionException(_('Nothing specified!'))

    if result is not False:
        # store a backup of the query; a failure here is non-fatal (best effort)
        with plugins.runtime.SUBC_RESTORE as sr:
            try:
                sr.store_query(user_id=self.session_get('user', 'id'),
                               corpname=self.args.corpname,
                               subcname=subcname,
                               cql=full_cql.strip().split('[]', 1)[-1])
            except Exception as e:
                logging.getLogger(__name__).warning(
                    'Failed to store subcorpus query: %s' % e)
                self.add_system_message(
                    'warning',
                    _('Subcorpus created but there was a problem saving a backup copy.'))
        unfinished_corpora = filter(
            lambda at: not at.is_finished(),
            self.get_async_tasks(category=AsyncTaskStatus.CATEGORY_SUBCORPUS))
        return dict(unfinished_subc=[uc.to_dict() for uc in unfinished_corpora])
    else:
        raise SubcorpusError(_('Empty subcorpus!'))