def calculate_colls_bg(coll_args): """ Background collocations calculation. This function is expected to be run either from Celery or from other process (via multiprocessing). """ cm = corplib.CorpusManager(subcpath=coll_args.subcpath) corp = cm.get_Corpus(coll_args.corpname, subcname=coll_args.subcname) try: # try to fetch precalculated data; if none then MissingSubCorpFreqFile corplib.frq_db(corp, coll_args.cattr) conc = conclib.get_conc(corp=corp, user_id=coll_args.user_id, minsize=coll_args.minsize, q=coll_args.q, fromp=0, pagesize=0, async=0, save=coll_args.save, samplesize=coll_args.samplesize) if not conc.finished(): raise UnfinishedConcordanceError( _('Cannot calculate yet - source concordance not finished. Please try again later.')) collocs = conc.collocs(cattr=coll_args.cattr, csortfn=coll_args.csortfn, cbgrfns=coll_args.cbgrfns, cfromw=coll_args.cfromw, ctow=coll_args.ctow, cminfreq=coll_args.cminfreq, cminbgr=coll_args.cminbgr, max_lines=coll_args.num_fetch_items) for item in collocs['Items']: item['pfilter'] = [('q2', item['pfilter'])] item['nfilter'] = [('q2', item['nfilter'])] item['str'] = import_string(item['str'], from_encoding=coll_args.corpus_encoding) return dict(data=collocs, processing=0, tasks=[]) except corplib.MissingSubCorpFreqFile as e: ans = {'attrname': coll_args.cattr, 'tasks': []} out = freq_calc.build_arf_db(e.args[0], coll_args.cattr) if type(out) is list: processing = 1 ans['tasks'].extend(out) else: processing = 0 ans['processing'] = processing ans['data'] = dict(Items=[], Head=[]) return ans
def run(self, tt_query, cql, path, publish_path):
    """
    Create a subcorpus file from a concordance defined by 'cql'.

    arguments:
    tt_query -- text-type query spec; not referenced in this body - kept
                for interface compatibility (TODO confirm against callers)
    cql -- a CQL query defining the source concordance
    path -- filesystem path where the subcorpus file is created
    publish_path -- if non-empty, the subcorpus is also published
                    (hardlinked) under this path

    returns:
    True in case of success

    raises:
    EmptySubcorpusException -- in case of an empty subcorpus or a failed
    subcorpus file creation
    """
    conc = conclib.get_conc(self._corp, self._user_id, q=cql, asnc=0)
    # fail early instead of writing a file out of an empty concordance
    if conc.size() == 0:
        raise EmptySubcorpusException('Empty subcorpus')
    ans = corplib.subcorpus_from_conc(path, conc)
    if ans is False:
        raise EmptySubcorpusException(
            'Failed to create the subcorpus from a concordance')
    if not os.path.isfile(
            path):  # this should not happen but it looks like it did
        logging.getLogger(__name__).warning(
            'Sync. called conc. file not created (path: {})'.format(path))
        # give the filesystem a chance to catch up before chmod below
        time.sleep(5)
    # we must set write perms for group as this is created by Celery and we won't be
    # able to create hardlinks otherwise
    os.chmod(path, 0o664)
    if publish_path:
        corplib.mk_publish_links(path, publish_path, self._author, self._description)
    return ans
def calc_freqs_bg(args): """ Calculate actual frequency data. arguments: args -- a FreqCalsArgs instance returns: a dict(freqs=..., conc_size=...) """ cm = corplib.CorpusManager(subcpath=args.subcpath) corp = cm.get_Corpus(args.corpname, args.subcname) conc = conclib.get_conc(corp=corp, user_id=args.user_id, minsize=args.minsize, q=args.q, fromp=args.fromp, pagesize=args.pagesize, async=0, save=args.save, samplesize=args.samplesize) if not conc.finished(): raise UnfinishedConcordanceError( _('Cannot calculate yet - source concordance not finished. Please try again later.' )) freqs = [ conc.xfreq_dist(cr, args.flimit, args.freq_sort, args.ml, args.ftt_include_empty, args.rel_mode, args.collator_locale) for cr in args.fcrit ] return dict(freqs=freqs, conc_size=conc.size())
def _create_subcorpus(self, request):
    """
    Create a new subcorpus based on submitted form data.

    req. arguments:
    subcname -- name of new subcorpus
    create -- bool, sets whether to create new subcorpus
    cql -- custom within condition
    """
    subcname = request.form['subcname']
    within_json = request.form.get('within_json')
    raw_cql = request.form.get('cql')
    corp_encoding = self._corp().get_conf('ENCODING')
    if raw_cql:
        # a raw CQL 'within' condition was submitted directly
        tt_query = ()
        within_cql = raw_cql
        full_cql = 'aword,[] %s' % raw_cql
        imp_cql = (full_cql,)
    elif within_json:  # user entered a subcorpus query manually
        tt_query = ()
        within_cql = self._deserialize_custom_within(json.loads(within_json))
        full_cql = 'aword,[] %s' % within_cql
        imp_cql = (full_cql,)
    else:
        # subcorpus defined via text-type selection
        # NOTE(review): within_cql is never bound on this path; the
        # 'len(tt_query) > 1 or within_cql' test below would raise NameError
        # if tt_query came back empty here - TODO confirm
        tt_query = TextTypeCollector(self._corp(), request).get_query()
        full_cql = ' within '.join(['<%s %s />' % item for item in tt_query])
        full_cql = 'aword,[] within %s' % full_cql
        full_cql = import_string(full_cql, from_encoding=corp_encoding)
        imp_cql = (full_cql,)
    basecorpname = self.args.corpname.split(':')[0]
    if not subcname:
        raise UserActionException(_('No subcorpus name specified!'))
    path = self.prepare_subc_path(basecorpname, subcname)
    if type(path) == unicode:  # Python 2: downstream expects a byte-string path
        path = path.encode('utf-8')
    if len(tt_query) == 1:
        # a single text-type condition => direct subcorpus creation
        result = corplib.create_subcorpus(path, self._corp(), tt_query[0][0], tt_query[0][1])
    elif len(tt_query) > 1 or within_cql:
        # multiple conditions or a custom CQL => build via a concordance
        conc = conclib.get_conc(self._corp(), self._session_get('user', 'user'), q=imp_cql)
        conc.sync()  # wait until the concordance is fully calculated
        struct = self._corp().get_struct(tt_query[0][0]) if len(tt_query) == 1 else None
        result = corplib.subcorpus_from_conc(path, conc, struct)
    else:
        raise UserActionException(_('Nothing specified!'))
    if result:
        if plugins.has_plugin('subc_restore'):
            # store the query for later subcorpus restoration (best-effort;
            # a failure here must not cancel the already created subcorpus)
            try:
                plugins.get('subc_restore').store_query(user_id=self._session_get('user', 'id'),
                                                        corpname=self.args.corpname,
                                                        subcname=subcname,
                                                        cql=full_cql.split('[]')[-1])
            except Exception as e:
                logging.getLogger(__name__).warning('Failed to store subcorpus query: %s' % e)
                self.add_system_message('warning',
                                        _('Subcorpus created but there was a problem saving a backup copy.'))
        return {}
    else:
        raise ConcError(_('Empty subcorpus!'))
def run(self):
    """
    Run a 2-dimensional frequency (ct) distribution calculation.

    note: this is called by Celery worker

    returns:
    a dict(data=..., full_size=...) where 'data' holds the rows produced by
    self.ct_dist and 'full_size' its reported total size
    """
    cm = corplib.CorpusManager(subcpath=self._args.subcpath)
    self._corp = cm.get_Corpus(self._args.corpname, subcname=self._args.subcname)
    self._conc = conclib.get_conc(corp=self._corp, user_id=self._args.user_id,
                                  minsize=self._args.minsize, q=self._args.q,
                                  fromp=0, pagesize=0, async=0, save=0, samplesize=0)
    result, full_size = self.ct_dist(self._args.fcrit, limit=self._args.ctminfreq,
                                     limit_type=self._args.ctminfreq_type)
    # x[0] + x[1:] presumably merges a nested first element into a flat row;
    # depends on ct_dist's exact row format - TODO confirm
    return dict(data=[x[0] + x[1:] for x in result], full_size=full_size)
def run(self, tt_query, cql, path): """ returns: True in case of success In case of an empty subcorus, EmptySubcorpusException is thrown """ conc = conclib.get_conc(self._corp, self._user_id, q=cql, async=0) ans = corplib.subcorpus_from_conc(path, conc) if ans is False: raise EmptySubcorpusException('Empty subcorpus') return ans
def calculate_colls_bg(coll_args): """ Background collocations calculation. This function is expected to be run either from Celery or from other process (via multiprocessing). """ cm = corplib.CorpusManager(subcpath=coll_args.subcpath) corp = cm.get_Corpus(coll_args.corpname, coll_args.subcname) try: corplib.frq_db( corp, coll_args.cattr ) # try to fetch precalculated data; if none then MissingSubCorpFreqFile conc = conclib.get_conc(corp=corp, user_id=coll_args.user_id, minsize=coll_args.minsize, q=coll_args.q, fromp=0, pagesize=0, async=0, save=coll_args.save, samplesize=coll_args.samplesize) if not conc.finished(): raise UnfinishedConcordanceError( _('Cannot calculate yet - source concordance not finished. Please try again later.' )) collocs = conc.collocs(cattr=coll_args.cattr, csortfn=coll_args.csortfn, cbgrfns=coll_args.cbgrfns, cfromw=coll_args.cfromw, ctow=coll_args.ctow, cminfreq=coll_args.cminfreq, cminbgr=coll_args.cminbgr, max_lines=coll_args.num_fetch_items) for item in collocs['Items']: item['pfilter'] = [('q2', item['pfilter'])] item['nfilter'] = [('q2', item['nfilter'])] item['str'] = import_string( item['str'], from_encoding=coll_args.corpus_encoding) return dict(data=collocs, processing=0, tasks=[]) except corplib.MissingSubCorpFreqFile as e: ans = {'attrname': coll_args.cattr, 'tasks': []} out = freq_calc.build_arf_db(e.args[0], coll_args.cattr) if type(out) is list: processing = 1 ans['tasks'].extend(out) else: processing = 0 ans['processing'] = processing ans['data'] = dict(Items=[], Head=[]) return ans
def calc_freqs(self, flimit, freq_sort, ml, rel_mode, fcrit, ftt_include_empty, collator_locale,
               fmaxitems, fpage, line_offset):
    """
    Calculate actual frequency data.

    Returns:
    a 2-tuple (freq_data, caching_data) where:
    freq_data = dict(lastpage=..., data=..., fstart=..., fmaxitems=..., conc_size=...)
    caching_data = dict(data=..., cache_path=...); can be also None which means 'do not cache'
    """
    cache_path = self._cache_file_path(fcrit, flimit, freq_sort, ml, ftt_include_empty,
                                       rel_mode, collator_locale)
    cache_ans = None
    if os.path.isfile(cache_path):
        # cache hit => load a previously calculated distribution
        with open(cache_path, 'rb') as f:
            data, conc_size = cPickle.load(f)
    else:
        cm = corplib.CorpusManager(subcpath=self._subcpath)
        corp = cm.get_Corpus(self._corpname, self._subcname)
        conc = conclib.get_conc(corp=corp, user_id=self._user_id, minsize=self._minsize,
                                q=self._q, fromp=self._fromp, pagesize=self._pagesize, async=0,
                                save=self._save, samplesize=self._samplesize)
        conc_size = conc.size()
        data = [conc.xfreq_dist(cr, flimit, freq_sort, ml, ftt_include_empty, rel_mode,
                                collator_locale) for cr in fcrit]
    lastpage = None
    if len(data) == 1:  # a single block => pagination
        total_length = len(data[0]['Items'])
        # only distributions above a size threshold are worth caching
        if total_length >= self.min_cached_data_size:
            cache_ans = dict(data=(data, conc_size), cache_path=cache_path)
        items_per_page = fmaxitems
        # translate the requested page into an item-slice window;
        # fmaxitems is rebound to the (exclusive+1) window end and one extra
        # item is included so a following page can be detected
        fstart = (fpage - 1) * fmaxitems + line_offset
        fmaxitems = fmaxitems * fpage + 1 + line_offset
        if total_length < fmaxitems:
            lastpage = 1
        else:
            lastpage = 0
        ans = [dict(Total=total_length,
                    TotalPages=int(math.ceil(total_length / float(items_per_page))),
                    Items=data[0]['Items'][fstart:fmaxitems - 1],
                    Head=data[0]['Head'])]
    else:
        # multiple blocks (e.g. multi-criteria frequency) => no pagination
        ans = data
        fstart = None
    return dict(lastpage=lastpage, data=ans, fstart=fstart, fmaxitems=fmaxitems,
                conc_size=conc_size), cache_ans
def calc_freqs_bg(args): """ Calculate actual frequency data. arguments: args -- a FreqCalsArgs instance returns: a dict(freqs=..., conc_size=...) """ cm = corplib.CorpusManager(subcpath=args.subcpath) corp = cm.get_Corpus(args.corpname, subcname=args.subcname) conc = conclib.get_conc(corp=corp, user_id=args.user_id, minsize=args.minsize, q=args.q, fromp=args.fromp, pagesize=args.pagesize, async=0, save=args.save, samplesize=args.samplesize) if not conc.finished(): raise UnfinishedConcordanceError( _('Cannot calculate yet - source concordance not finished. Please try again later.')) freqs = [conc.xfreq_dist(cr, args.flimit, args.freq_sort, args.ml, args.ftt_include_empty, args.rel_mode, args.collator_locale) for cr in args.fcrit] return dict(freqs=freqs, conc_size=conc.size())
def fcs_search(self, corp, corpname, fcs_query, max_rec, start):
    """
    aux function for federated content search: operation=searchRetrieve

    arguments:
    corp -- a corpus object the query is evaluated against
    corpname -- corpus identifier
    fcs_query -- a (CQL-subset) query string as received from the FCS client
    max_rec -- maximum number of records to return
    start -- 1-based position of the first record to return

    returns:
    a list of (left, kwic, right, ref) 4-tuples

    raises:
    Exception(code, details, message) -- FCS diagnostic-style errors
    (10 = query syntax error, 16 = unsupported index,
    61 = first record position out of range)
    """
    query = fcs_query.replace('+', ' ')  # convert URL spaces
    exact_match = True  # attr=".*value.*"
    if 'exact' in query.lower() and '=' not in query:  # lemma EXACT "dog"
        pos = query.lower().index('exact')  # first occurrence of EXACT
        query = query[:pos] + '=' + query[pos + 5:]  # 1st exact > =
        exact_match = True
    attrs = corp.get_conf('ATTRLIST').split(',')  # list of available attrs
    rq = ''  # query for manatee
    try:  # parse query
        if '=' in query:  # lemma=word | lemma="word" | lemma="w1 w2" | word=""
            attr, term = query.split('=')
            attr = attr.strip()
            term = term.strip()
        else:  # "w1 w2" | "word" | word
            attr = 'word'
            # use one of search attributes if in corpora attributes
            # otherwise use `word` - fails below if not valid
            for sa in self.search_attrs:
                if sa in attrs:
                    attr = sa
                    break
            term = query.strip()
        if '"' in attr:
            raise Exception
        if '"' in term:  # "word" | "word1 word2" | "" | "it is \"good\""
            if term[0] != '"' or term[-1] != '"':  # check q. marks
                raise Exception
            term = term[1:-1].strip()  # remove quotation marks
            if ' ' in term:  # multi-word term
                if exact_match:
                    rq = ' '.join(
                        ['[%s="%s"]' % (attr, t) for t in term.split()])
                else:
                    rq = ' '.join([
                        '[%s=".*%s.*"]' % (attr, t) for t in term.split()
                    ])
            elif term.strip() == '':  # ""
                raise Exception  # empty term
            else:  # one-word term
                if exact_match:
                    rq = '[%s="%s"]' % (attr, term)
                else:
                    rq = '[%s=".*%s.*"]' % (attr, term)
        else:  # must be single-word term
            if ' ' in term:
                raise Exception
            if exact_match:  # build query
                rq = '[%s="%s"]' % (attr, term)
            else:
                rq = '[%s=".*%s.*"]' % (attr, term)
    except:  # there was a problem when parsing
        raise Exception(10, query, 'Query syntax error')
    if attr not in attrs:
        raise Exception(16, attr, 'Unsupported index')
    # try to get concordance
    try:
        anon_id = plugins.runtime.AUTH.instance.anonymous_user()['id']
        q = ['q' + rq]  # q = ['aword,[lc="havel"]']
        conc = conclib.get_conc(corp, anon_id, q=q)
    except Exception as e:
        raise Exception(10, repr(e), 'Query syntax error')
    kwic = kwiclib.Kwic(corp, corpname, conc)
    kwic_args = kwiclib.KwicPageArgs(Args(), base_attr=Kontext.BASE_ATTR)
    # NOTE(review): only the default (first) kwic page is fetched, yet the
    # result is sliced from 'start' - records past the first page appear
    # unreachable; TODO confirm against kwicpage's default pagesize
    page = kwic.kwicpage(kwic_args)  # convert concordance
    # start starts with 1
    start -= 1
    if len(page['Lines']) < start:
        raise Exception(61, 'startRecord', 'First record position out of range')
    return [(kwicline['Left'][0]['str'], kwicline['Kwic'][0]['str'],
             kwicline['Right'][0]['str'], kwicline['ref'])
            for kwicline in page['Lines']][start:][:max_rec]
def fcs_search(self, corp, corpname, fcs_query, max_rec, start):
    """
    aux function for federated content search: operation=searchRetrieve

    arguments:
    corp -- a corpus object the query is evaluated against
    corpname -- corpus identifier
    fcs_query -- a (CQL-subset) query string as received from the FCS client
    max_rec -- maximum number of records to return
    start -- 1-based position of the first record to return

    returns:
    a 2-tuple (rows, total) where rows is a list of
    (left, kwic, right, ref) 4-tuples and total is the concordance size

    raises:
    Exception(code, details, message) -- FCS diagnostic-style errors
    (10 = query syntax error, 16 = unsupported index,
    61 = first record position out of range)
    """
    query = fcs_query.replace('+', ' ')  # convert URL spaces
    exact_match = True  # attr=".*value.*"
    if 'exact' in query.lower() and '=' not in query:  # lemma EXACT "dog"
        pos = query.lower().index('exact')  # first occurrence of EXACT
        query = query[:pos] + '=' + query[pos + 5:]  # 1st exact > =
        exact_match = True
    attrs = corp.get_conf('ATTRLIST').split(',')  # list of available attrs
    rq = ''  # query for manatee
    try:  # parse query
        if '=' in query:  # lemma=word | lemma="word" | lemma="w1 w2" | word=""
            attr, term = query.split('=')
            attr = attr.strip()
            term = term.strip()
        else:  # "w1 w2" | "word" | word
            attr = 'word'
            # use one of search attributes if in corpora attributes
            # otherwise use `word` - fails below if not valid
            for sa in self.search_attrs:
                if sa in attrs:
                    attr = sa
                    break
            term = query.strip()
        if '"' in attr:
            raise Exception
        if '"' in term:  # "word" | "word1 word2" | "" | "it is \"good\""
            if term[0] != '"' or term[-1] != '"':  # check q. marks
                raise Exception
            term = term[1:-1].strip()  # remove quotation marks
            if ' ' in term:  # multi-word term
                if exact_match:
                    rq = ' '.join(['[%s="%s"]' % (attr, t) for t in term.split()])
                else:
                    rq = ' '.join(['[%s=".*%s.*"]' % (attr, t) for t in term.split()])
            elif term.strip() == '':  # ""
                raise Exception  # empty term
            else:  # one-word term
                if exact_match:
                    rq = '[%s="%s"]' % (attr, term)
                else:
                    rq = '[%s=".*%s.*"]' % (attr, term)
        else:  # must be single-word term
            if ' ' in term:
                raise Exception
            if exact_match:  # build query
                rq = '[%s="%s"]' % (attr, term)
            else:
                rq = '[%s=".*%s.*"]' % (attr, term)
    except:  # there was a problem when parsing
        raise Exception(10, query, 'Query syntax error')
    if attr not in attrs:
        raise Exception(16, attr, 'Unsupported index')
    # map the 1-based record offset onto a concordance page number
    fromp = int(math.floor((start - 1) / max_rec)) + 1
    # try to get concordance
    try:
        anon_id = plugins.runtime.AUTH.instance.anonymous_user()['id']
        q = ['q' + rq]
        conc = conclib.get_conc(corp, anon_id, q=q, fromp=fromp,
                                pagesize=max_rec * 2, async=0)
    except Exception as e:
        raise Exception(10, repr(e), 'Query syntax error')
    kwic = kwiclib.Kwic(corp, corpname, conc)
    kwic_args = kwiclib.KwicPageArgs(Args(), base_attr=Kontext.BASE_ATTR)
    kwic_args.fromp = fromp
    # double page size so a window starting mid-page still fits max_rec rows
    kwic_args.pagesize = max_rec * 2
    kwic_args.leftctx = '-{0}'.format(settings.get_int('fcs', 'kwic_context', 5))
    kwic_args.rightctx = '{0}'.format(settings.get_int('fcs', 'kwic_context', 5))
    page = kwic.kwicpage(kwic_args)  # convert concordance
    # offset of the requested record within the fetched page
    local_offset = (start - 1) % max_rec
    if start > conc.size():
        raise Exception(61, 'startRecord', 'First record position out of range')
    rows = [
        (
            kwicline['Left'][0]['str'],
            kwicline['Kwic'][0]['str'],
            kwicline['Right'][0]['str'],
            kwicline['ref']
        )
        for kwicline in page['Lines']
    ][local_offset:local_offset + max_rec]
    return rows, conc.size()
def calc_freqs(self, flimit, freq_sort, ml, rel_mode, fcrit, ftt_include_empty,
               collator_locale, fmaxitems, fpage, line_offset):
    """
    Calculate actual frequency data.

    Returns:
    a 2-tuple (freq_data, caching_data) where:
    freq_data = dict(lastpage=..., data=..., fstart=..., fmaxitems=..., conc_size=...)
    caching_data = dict(data=..., cache_path=...); can be also None which means 'do not cache'
    """
    cache_path = self._cache_file_path(fcrit, flimit, freq_sort, ml,
                                       ftt_include_empty, rel_mode,
                                       collator_locale)
    cache_ans = None
    if os.path.isfile(cache_path):
        # a previously calculated distribution is available => reuse it
        with open(cache_path, 'rb') as f:
            data, conc_size = cPickle.load(f)
    else:
        cm = corplib.CorpusManager(subcpath=self._subcpath)
        corp = cm.get_Corpus(self._corpname, self._subcname)
        conc = conclib.get_conc(corp=corp, user_id=self._user_id,
                                minsize=self._minsize, q=self._q,
                                fromp=self._fromp, pagesize=self._pagesize,
                                async=0, save=self._save,
                                samplesize=self._samplesize)
        conc_size = conc.size()
        data = [
            conc.xfreq_dist(cr, flimit, freq_sort, ml, ftt_include_empty,
                            rel_mode, collator_locale) for cr in fcrit
        ]
    lastpage = None
    if len(data) == 1:  # a single block => pagination
        total_length = len(data[0]['Items'])
        # cache only when the distribution is large enough to be worth it
        if total_length >= self.min_cached_data_size:
            cache_ans = dict(data=(data, conc_size), cache_path=cache_path)
        items_per_page = fmaxitems
        # derive the item-slice window for the requested page; fmaxitems is
        # rebound to the window end (+1 extra item to detect a next page)
        fstart = (fpage - 1) * fmaxitems + line_offset
        fmaxitems = fmaxitems * fpage + 1 + line_offset
        if total_length < fmaxitems:
            lastpage = 1
        else:
            lastpage = 0
        ans = [
            dict(Total=total_length,
                 TotalPages=int(
                     math.ceil(total_length / float(items_per_page))),
                 Items=data[0]['Items'][fstart:fmaxitems - 1],
                 Head=data[0]['Head'])
        ]
    else:
        # multiple freq blocks => returned as-is, no pagination
        ans = data
        fstart = None
    return dict(lastpage=lastpage, data=ans, fstart=fstart,
                fmaxitems=fmaxitems, conc_size=conc_size), cache_ans