def _get_bg_conc(corp: manatee.Corpus, user_id: int, q: Tuple[str, ...], subchash: Optional[str],
                 samplesize: int, calc_from: int, minsize: int) -> Union[PyConc, EmptyConc]:
    """
    arguments:
    calc_from - from which operation idx (inclusive) we have to calculate respective results
    """
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    # let's create cache records of the operations we'll have to perform
    if calc_from < len(q):
        for i in range(calc_from, len(q)):
            cachefile, _ = cache_map.add_to_map(subchash, q[:i + 1], 0, calc_status=CalcStatus())
            if os.path.isfile(cachefile):
                del_silent(cachefile)
                logging.getLogger(__name__).warning(
                    f'Removed unbound conc. cache file {cachefile}')
        app = bgcalc.calc_backend_client(settings)
        app.send_task(
            'conc_sync_calculate',
            (user_id, corp.corpname, getattr(corp, 'subcname', None), subchash, q, samplesize),
            time_limit=TASK_TIME_LIMIT)
    # for smaller concordances/corpora there is a chance the data
    # is ready in a few seconds - let's try this:
    conc_avail = wait_for_conc(cache_map=cache_map, subchash=subchash, q=q, minsize=minsize)
    if conc_avail:
        return PyConc(corp, 'l', cache_map.cache_file_path(subchash, q))
    else:
        # return empty yet unfinished concordance to make the client watch the calculation
        return EmptyConc(corp, cache_map.cache_file_path(subchash, q))
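# The function above relies on del_silent() to drop stale cache files without
# propagating filesystem errors. A minimal sketch of what such a helper might
# look like - an assumption about its behavior, not the actual KonText code:
import os


def del_silent(path):
    """Remove a file, swallowing the error if it is already gone or inaccessible."""
    try:
        os.remove(path)
    except OSError:
        pass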
def _get_async_conc(corp, user_id, q, save, subchash, samplesize, fullsize, minsize):
    """
    Note: the 'save' argument is present because of bonito-open-3.45.11
    compatibility but it is currently not used

    TODO: remove it
    """
    backend = settings.get('calc_backend', 'type')
    if backend == 'multiprocessing':
        from concworker import mp
        mp.create_task(user_id, corp, subchash, q, samplesize).start()
    elif backend in ('celery', 'konserver'):
        import bgcalc
        app = bgcalc.calc_backend_client(settings)
        ans = app.send_task(
            'worker.conc_register',
            (user_id, corp.corpname, getattr(corp, 'subcname', None), subchash, q, samplesize,
             TASK_TIME_LIMIT),
            time_limit=10)  # register should be fast
        ans.get()  # = wait for task registration
    else:
        raise ValueError('Unknown concordance calculation backend: %s' % (backend,))
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    try:
        _wait_for_conc(cache_map=cache_map, subchash=subchash, q=q, minsize=minsize)
    except Exception as e:
        _cancel_async_task(cache_map, subchash, q)
        raise e
    return PyConc(corp, 'l', cache_map.cache_file_path(subchash, q))
def submit(self, request):
    form_args = WordlistFormArgs()
    form_args.update_by_user_query(request.json)
    worker = calc_backend_client(settings)
    ans = dict(corpname=self.args.corpname, usesubcorp=self.args.usesubcorp,
               freq_files_avail=True, subtasks=[])
    async_res = worker.send_task(
        'get_wordlist', object.__class__,
        args=(form_args.to_dict(), self.corp.size, self.session_get('user', 'id')))
    bg_result = async_res.get()
    if isinstance(bg_result, MissingSubCorpFreqFile):
        data_calc = freq_calc.build_arf_db(
            self.session_get('user', 'id'), self.corp, form_args.wlattr)
        if type(data_calc) is list:
            for subtask in data_calc:
                self._store_async_task(subtask)
                ans['subtasks'].append(subtask.to_dict())
            ans['freq_files_avail'] = False
        else:
            # TODO we should join the current calculation here instead of throwing an error
            raise WordlistError('The data calculation is already running')
    elif isinstance(bg_result, Exception):
        raise bg_result
    self._curr_wlform_args = form_args

    def on_conc_store(query_ids, history_ts, result):
        result['wl_query_id'] = query_ids[0]
        if history_ts:
            self._store_last_search('wlist', query_ids[0])

    self.on_conc_store = on_conc_store
    return ans
def get_conc_cache_status(corp: KCorpus, conc_id: str):
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    q = []
    try:
        with plugins.runtime.QUERY_PERSISTENCE as qp:
            data = qp.open(conc_id)
            q = data.get('q', [])
            cache_status = cache_map.get_calc_status(corp.subchash, data.get('q', []))
            if cache_status is None:  # conc is not cached nor calculated
                raise Exception('Concordance calculation is lost')
            elif not cache_status.finished and cache_status.task_id:
                # we must also test the respective task directly as it might have been killed
                # and thus failed to store info to the cache metadata
                worker = calc_backend_client(settings)
                err = worker.get_task_error(cache_status.task_id)
                if err is not None:
                    raise err
            return {
                'finished': cache_status.finished,
                'concsize': cache_status.concsize,
                'fullsize': cache_status.fullsize,
                'relconcsize': cache_status.relconcsize,
                'arf': cache_status.arf
            }
    except CalcTaskNotFoundError as ex:
        cancel_conc_task(cache_map, corp.subchash, q)
        raise Exception(f'Concordance calculation is lost: {ex}')
    except Exception as ex:
        cancel_conc_task(cache_map, corp.subchash, q)
        raise ex
def calculate_colls(coll_args):
    """
    Calculates required collocations based on passed arguments.
    Function is able to reuse cached values and utilize configured
    backend (either Celery or multiprocessing).

    returns:
    a dictionary ready to be used in a respective template (collx.tmpl)
    (keys: Head, Items, cmaxitems, attrname, processing, collstart, lastpage)
    """
    if coll_args.num_lines > 0:
        collstart = 0
        collend = coll_args.num_lines
    else:
        collstart = (int(coll_args.collpage) - 1) * \
            int(coll_args.citemsperpage) + int(coll_args.line_offset)
        collend = collstart + int(coll_args.citemsperpage) + 1
    cache = CollCalcCache(corpname=coll_args.corpname, subcname=coll_args.subcname,
                          subcpath=coll_args.subcpath, user_id=coll_args.user_id,
                          q=coll_args.q, minsize=coll_args.minsize, save=coll_args.save,
                          samplesize=coll_args.samplesize)
    collocs, cache_path = cache.get(cattr=coll_args.cattr, csortfn=coll_args.csortfn,
                                    cbgrfns=coll_args.cbgrfns, cfromw=coll_args.cfromw,
                                    ctow=coll_args.ctow, cminbgr=coll_args.cminbgr,
                                    cminfreq=coll_args.cminfreq)
    if collocs is None:
        num_fetch_items = CollCalcCache.MANATEE_DEFAULT_NUM_FETCH_LINES
    else:
        num_fetch_items = len(collocs['Items'])
    if collocs is None or collend > num_fetch_items:
        if os.path.isfile(cache_path):  # cache avail. but not enough items
            os.unlink(cache_path)
        if collend >= num_fetch_items:
            num_fetch_items += (collend - num_fetch_items) + 10 * \
                int(coll_args.citemsperpage)  # TODO heuristics :)
        coll_args.cache_path = cache_path
        coll_args.num_fetch_items = num_fetch_items
        backend = settings.get('calc_backend', 'type')
        if backend in ('celery', 'konserver'):
            import bgcalc
            app = bgcalc.calc_backend_client(settings)
            res = app.send_task('worker.calculate_colls', args=(coll_args.to_dict(),),
                                time_limit=TASK_TIME_LIMIT)
            # worker task caches the value AFTER the result is returned (see worker.py)
            ans = res.get()
        elif backend == 'multiprocessing':
            ans = calculate_colls_mp(coll_args)
    else:
        ans = dict(data=collocs, processing=0)
    result = dict(
        Head=ans['data']['Head'],
        attrname=coll_args.cattr,
        processing=ans['processing'],
        collstart=collstart,
        lastpage=0 if collstart + coll_args.citemsperpage < len(ans['data']['Items']) else 1,
        Items=ans['data']['Items'][collstart:collend - 1]
    )
    return result
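# The collstart/collend arithmetic in calculate_colls() is easy to get wrong
# by one; the following self-contained sketch reproduces it for illustration
# (assuming a 1-based collpage and a zero line_offset by default):
def coll_page_bounds(collpage, citemsperpage, line_offset=0, num_lines=0):
    """Return (collstart, collend) slice bounds as computed in calculate_colls()."""
    if num_lines > 0:
        return 0, num_lines
    collstart = (int(collpage) - 1) * int(citemsperpage) + int(line_offset)
    # one extra item is fetched so the caller can detect whether a next page exists
    collend = collstart + int(citemsperpage) + 1
    return collstart, collend


# e.g. page 2 with 20 items per page -> items 20..40 (the 41st signals a next page)
assert coll_page_bounds(2, 20) == (20, 41)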
def freq_intersection(self, request):
    """
    Run a paradigmatic query out of existing concordances.

    submitted JSON structure - see models.pquery.common.FreqIntersectionArgs
    """
    worker = bgcalc.calc_backend_client(settings)
    corp_info = self.get_corpus_info(self.args.corpname)
    self._curr_pquery_args = PqueryFormArgs(
        corpname=self.corp.corpname, attr=self._get_default_attr(), position='0<0~0>0')
    self._curr_pquery_args.update_by_user_query(request.json)
    conc_forms, raw_queries = _load_conc_queries(
        self._plugin_ctx, self._curr_pquery_args.conc_ids, self.args.corpname, 'query')
    if self._curr_pquery_args.conc_subset_complements:
        conc_forms2, raw_queries2 = _load_conc_queries(
            self._plugin_ctx, self._curr_pquery_args.conc_subset_complements.conc_ids,
            self.args.corpname, 'query')
        raw_queries.update(raw_queries2)
    if self._curr_pquery_args.conc_superset:
        conc_forms3, raw_queries3 = _load_conc_queries(
            self._plugin_ctx, [self._curr_pquery_args.conc_superset.conc_id],
            self.args.corpname, 'query')
        raw_queries.update(raw_queries3)
    calc_args = (
        self._curr_pquery_args,
        raw_queries,
        self.subcpath,
        self.session_get('user', 'id'),
        corp_info.collator_locale if corp_info.collator_locale else 'en_US')
    task_status = worker.send_task(
        'calc_merged_freqs', object.__class__, args=calc_args, time_limit=TASK_TIME_LIMIT)
    sq_items = []
    for conc_id in self._curr_pquery_args.conc_ids:
        sq_items.append(conc_forms[conc_id]['curr_queries'][self.args.corpname])
    shortened_q = ' && '.join(f'{{{q}}}' for q in sq_items)
    shortened_q = f'{shortened_q} -> {self._curr_pquery_args.attr}'

    def on_conc_store(query_ids, history_ts, result):
        async_task = AsyncTaskStatus(
            status=task_status.status, ident=task_status.id,
            category=AsyncTaskStatus.CATEGORY_PQUERY,
            label=shortened_q,
            args=dict(query_id=query_ids[0], last_update=time.time()),
            url=self.create_url('pquery/result', dict(q=f'~{query_ids[0]}')))
        self._store_async_task(async_task)
        result['task'] = async_task.to_dict()
        if history_ts:
            self._store_last_search('pquery', query_ids[0])

    self.on_conc_store = on_conc_store
    return {}
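# The pquery task label above is built by wrapping each source query in braces,
# joining with '&&' and appending the target attribute. A tiny standalone sketch
# of that formatting step (the sample queries below are made up):
def format_pquery_label(queries, attr):
    """Build the human-readable paradigmatic query label, e.g. '{q1} && {q2} -> lemma'."""
    joined = ' && '.join(f'{{{q}}}' for q in queries)
    return f'{joined} -> {attr}'


assert format_pquery_label(['[word="run"]', '[word="ran"]'], 'lemma') == \
    '{[word="run"]} && {[word="ran"]} -> lemma'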
def process(self, attrname='', worker_tasks=None):
    backend = settings.get('calc_backend', 'type')
    if worker_tasks and backend in ('celery', 'konserver'):
        import bgcalc
        app = bgcalc.calc_backend_client(settings)
        for t in worker_tasks:
            tr = app.AsyncResult(t)
            if tr.status == 'FAILURE':
                raise bgcalc.ExternalTaskError('Task %s failed' % (t,))
    return {'status': freq_calc.build_arf_db_status(self.corp, attrname)}
def process(self, attrname='', worker_tasks=None):
    backend = settings.get('calc_backend', 'type')
    if worker_tasks and backend in ('celery', 'rq'):
        import bgcalc
        worker = bgcalc.calc_backend_client(settings)
        for t in worker_tasks:
            tr = worker.AsyncResult(t)
            if tr.status == 'FAILURE':
                raise BgCalcError(f'Task {t} failed')
    return {'status': freq_calc.build_arf_db_status(self.corp, attrname)}
def calculate_colls(coll_args):
    """
    Calculates required collocations based on passed arguments.
    Function is able to reuse cached values and utilize the configured backend.

    returns:
    a dictionary ready to be used in a respective template (collx.tmpl)
    (keys: Head, Items, cmaxitems, attrname, processing, collstart, lastpage)
    """
    if coll_args.num_lines > 0:
        collstart = 0
        collend = coll_args.num_lines
    else:
        collstart = (int(coll_args.collpage) - 1) * \
            int(coll_args.citemsperpage) + int(coll_args.line_offset)
        collend = collstart + int(coll_args.citemsperpage) + 1
    cache = CollCalcCache(corpname=coll_args.corpname, subcname=coll_args.subcname,
                          subcpath=coll_args.subcpath, user_id=coll_args.user_id,
                          q=coll_args.q, save=coll_args.save, samplesize=coll_args.samplesize)
    collocs, cache_path = cache.get(cattr=coll_args.cattr, csortfn=coll_args.csortfn,
                                    cbgrfns=coll_args.cbgrfns, cfromw=coll_args.cfromw,
                                    ctow=coll_args.ctow, cminbgr=coll_args.cminbgr,
                                    cminfreq=coll_args.cminfreq)
    if collocs is None:
        num_fetch_items = CollCalcCache.MANATEE_DEFAULT_NUM_FETCH_LINES
    else:
        num_fetch_items = len(collocs['Items'])
    if collocs is None or collend > num_fetch_items:
        if os.path.isfile(cache_path):  # cache avail. but not enough items
            os.unlink(cache_path)
        if collend >= num_fetch_items:
            num_fetch_items += (collend - num_fetch_items) + 10 * \
                int(coll_args.citemsperpage)  # TODO heuristics :)
        coll_args.cache_path = cache_path
        coll_args.num_fetch_items = num_fetch_items
        app = bgcalc.calc_backend_client(settings)
        res = app.send_task('worker.calculate_colls', args=(coll_args.to_dict(),),
                            time_limit=TASK_TIME_LIMIT)
        # worker task caches the value AFTER the result is returned (see worker.py)
        ans = res.get()
    else:
        ans = dict(data=collocs, processing=0)
    result = dict(
        Head=ans['data']['Head'],
        attrname=coll_args.cattr,
        processing=ans['processing'],
        collstart=collstart,
        lastpage=0 if collstart + coll_args.citemsperpage < len(ans['data']['Items']) else 1,
        Items=ans['data']['Items'][collstart:collend - 1]
    )
    return result
def calculate_freqs(args):
    """
    Calculates a frequency distribution based on a defined concordance and
    frequency-related arguments. The function is able to cache the data using
    a background process/task. This prevents KonText from calculating (via
    Manatee) the full frequency list again and again (e.g. when a user moves
    from page to page).
    """
    cache = FreqCalcCache(corpname=args.corpname, subcname=args.subcname, user_id=args.user_id,
                          subcpath=args.subcpath, minsize=args.minsize, q=args.q,
                          fromp=args.fromp, pagesize=args.pagesize, save=args.save,
                          samplesize=args.samplesize)
    calc_result, cache_path = cache.get(fcrit=args.fcrit, flimit=args.flimit,
                                        freq_sort=args.freq_sort, ml=args.ml,
                                        ftt_include_empty=args.ftt_include_empty,
                                        rel_mode=args.rel_mode,
                                        collator_locale=args.collator_locale)
    if calc_result is None:
        backend = settings.get('calc_backend', 'type')
        if backend in ('celery', 'konserver'):
            import bgcalc
            args.cache_path = cache_path
            app = bgcalc.calc_backend_client(settings)
            res = app.send_task('worker.calculate_freqs', args=(args.to_dict(),),
                                time_limit=TASK_TIME_LIMIT)
            # worker task caches the value AFTER the result is returned (see worker.py)
            calc_result = res.get()
        if backend == 'multiprocessing':
            calc_result = calculate_freqs_mp(args)
    data = calc_result['freqs']
    conc_size = calc_result['conc_size']
    lastpage = None
    if len(data) == 1:  # a single block => pagination
        total_length = len(data[0]['Items']) if 'Items' in data[0] else 0
        items_per_page = args.fmaxitems
        fstart = (args.fpage - 1) * args.fmaxitems + args.line_offset
        fmaxitems = args.fmaxitems * args.fpage + 1 + args.line_offset
        if total_length < fmaxitems:
            lastpage = 1
        else:
            lastpage = 0
        ans = [dict(Total=total_length,
                    TotalPages=int(math.ceil(total_length / float(items_per_page))),
                    Items=data[0]['Items'][fstart:fmaxitems - 1] if 'Items' in data[0] else [],
                    Head=data[0].get('Head', []))]
    else:
        for item in data:
            if 'Items' not in item:
                item['Items'] = []
            item['Total'] = len(item['Items'])
            item['TotalPages'] = None
        ans = data
        fstart = None
    return dict(lastpage=lastpage, data=ans, fstart=fstart, fmaxitems=args.fmaxitems,
                conc_size=conc_size)
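# The single-block branch above derives its paging window as follows; this
# standalone sketch mirrors that arithmetic (1-based fpage, with the result
# usable as a Python slice over the item list):
import math


def freq_page_window(fpage, fmaxitems, line_offset, total_length):
    """Return (fstart, fend, lastpage, total_pages) as used in calculate_freqs()."""
    fstart = (fpage - 1) * fmaxitems + line_offset
    fend = fpage * fmaxitems + 1 + line_offset  # one extra item for next-page detection
    lastpage = 1 if total_length < fend else 0
    total_pages = int(math.ceil(total_length / float(fmaxitems)))
    return fstart, fend, lastpage, total_pages


# e.g. 45 items, 20 per page: page 3 starts at item 40 and is the last page
assert freq_page_window(3, 20, 0, 45) == (40, 61, 1, 3)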
def _get_async_conc(corp, user_id, q, subchash, samplesize, minsize):
    app = bgcalc.calc_backend_client(settings)
    ans = app.send_task(
        'worker.conc_register',
        (user_id, corp.corpname, getattr(corp, 'subcname', None), subchash, q, samplesize,
         TASK_TIME_LIMIT),
        time_limit=CONC_REGISTER_TASK_LIMIT)
    ans.get(timeout=CONC_REGISTER_WAIT_LIMIT)
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    conc_avail = wait_for_conc(cache_map=cache_map, subchash=subchash, q=q, minsize=minsize)
    if conc_avail:
        return PyConc(corp, 'l', cache_map.cache_file_path(subchash, q))
    else:
        return EmptyConc(corp, cache_map.cache_file_path(subchash, q))
def cancel_async_task(cache_map: AbstractConcCache, subchash: Optional[str], q: Tuple[str, ...]):
    cachefile = cache_map.cache_file_path(subchash, q)
    status = cache_map.get_calc_status(subchash, q)
    if status:
        try:
            if status.task_id:
                app = bgcalc.calc_backend_client(settings)
                app.control.revoke(status.task_id, terminate=True, signal='SIGKILL')
        except IOError:
            pass
    cache_map.del_entry(subchash, q)
    del_silent(cachefile)
def _cancel_async_task(cache_map, subchash, q):
    cachefile = cache_map.cache_file_path(subchash, q)
    status = cache_map.get_calc_status(subchash, q)
    backend = settings.get('calc_backend', 'type')
    if backend == 'multiprocessing':
        logging.getLogger(__name__).warning(
            'Unable to cancel async task in multiprocessing mode')
    elif backend in ('celery', 'konserver') and status:
        import bgcalc
        try:
            if status.task_id:
                app = bgcalc.calc_backend_client(settings)
                app.control.revoke(status.task_id, terminate=True, signal='SIGKILL')
        except IOError:
            pass
    cache_map.del_entry(subchash, q)
    _del_silent(cachefile)
def calculate_freqs_ct(args):
    """
    note: this is called by webserver
    """
    try:
        app = bgcalc.calc_backend_client(settings)
        res = app.send_task('calculate_freqs_ct', args=(args.to_dict(),),
                            time_limit=TASK_TIME_LIMIT)
        calc_result = res.get()
    except Exception as ex:
        if is_celery_user_error(ex):
            raise UserActionException(str(ex)) from ex
        else:
            raise ex
    return calc_result
def cancel_conc_task(cache_map: AbstractConcCache, subchash: Optional[str], q: Tuple[str, ...]):
    """
    Removes conc. cache entry and also a respective calculation task (silently).
    """
    cachefile = cache_map.readable_cache_path(subchash, q)
    status = cache_map.get_calc_status(subchash, q)
    if status:
        try:
            if status.task_id:
                worker = bgcalc.calc_backend_client(settings)
                worker.control.revoke(status.task_id, terminate=True, signal='SIGKILL')
        except (IOError, CalcTaskNotFoundError):
            pass
    cache_map.del_entry(subchash, q)
    del_silent(cachefile)
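# All the cancel_* variants above terminate a worker task via Celery's revoke
# API. A minimal standalone sketch of the same call against a bare Celery app;
# the broker URL is a placeholder assumption:
from celery import Celery

app = Celery('tasks', broker='redis://localhost:6379/0')  # hypothetical broker


def kill_task(task_id):
    """Revoke a running task and terminate its worker process with SIGKILL."""
    app.control.revoke(task_id, terminate=True, signal='SIGKILL')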
def _get_bg_conc(corp: AbstractKCorpus, user_id: int, q: Tuple[str, ...], subchash: Optional[str],
                 samplesize: int, calc_from: int, minsize: int) -> Union[PyConc, InitialConc]:
    """
    arguments:
    calc_from - from which operation idx (inclusive) we have to calculate respective results
    """
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    status = cache_map.get_calc_status(subchash, q)
    if status and not status.finished:
        # the calc is already running, the client has to wait and check regularly
        return InitialConc(corp, status.cachefile)
    # let's create cache records of the operations we'll have to perform
    if calc_from < len(q):
        for i in range(calc_from, len(q)):
            status = cache_map.add_to_map(subchash, q[:i + 1], ConcCacheStatus(), overwrite=True)
            # the file cannot be valid as otherwise, calc_from would be higher
            if os.path.isfile(status.cachefile):
                del_silent(status.cachefile)
                logging.getLogger(__name__).warning(
                    f'Removed unbound conc. cache file {status.cachefile}')
        worker = bgcalc.calc_backend_client(settings)
        worker.send_task(
            'conc_sync_calculate', object.__class__,
            (user_id, corp.corpname, getattr(corp, 'subcname', None), subchash, q, samplesize),
            time_limit=TASK_TIME_LIMIT)
    # for smaller concordances/corpora there is a chance the data
    # is ready in a few seconds - let's try this:
    conc_avail = wait_for_conc(cache_map=cache_map, subchash=subchash, q=q, minsize=minsize)
    if conc_avail:
        return PyConc(corp, 'l', cache_map.readable_cache_path(subchash, q))
    else:
        # return empty yet unfinished concordance to make the client watch the calculation
        return InitialConc(corp, cache_map.readable_cache_path(subchash, q))
def calculate_colls(coll_args: CollCalcArgs) -> CalculateCollsResult:
    """
    Calculates required collocations based on passed arguments.
    Result values are cached.

    returns:
    a CalculateCollsResult instance ready to be used in a respective template
    (collx.tmpl) (fields: Head, Items, attrname, processing, lastpage)
    """
    collstart = (coll_args.collpage - 1) * coll_args.citemsperpage
    collend = collstart + coll_args.citemsperpage
    cache = CollCalcCache(corpname=coll_args.corpname, subcname=coll_args.subcname,
                          subcpath=coll_args.subcpath, user_id=coll_args.user_id,
                          q=coll_args.q, samplesize=coll_args.samplesize)
    collocs, cache_path = cache.get(cattr=coll_args.cattr, csortfn=coll_args.csortfn,
                                    cbgrfns=coll_args.cbgrfns, cfromw=coll_args.cfromw,
                                    ctow=coll_args.ctow, cminbgr=coll_args.cminbgr,
                                    cminfreq=coll_args.cminfreq)
    if collocs is None:
        coll_args.cache_path = cache_path
        worker = bgcalc.calc_backend_client(settings)
        res = worker.send_task('calculate_colls', object.__class__,
                               args=(coll_args,), time_limit=TASK_TIME_LIMIT)
        # worker task caches the value AFTER the result is returned (see worker.py)
        ans = res.get()
    else:
        ans = dict(data=collocs, processing=0)
    return CalculateCollsResult(
        Head=ans['data']['Head'],
        attrname=coll_args.cattr,
        processing=ans['processing'],
        lastpage=not collstart + coll_args.citemsperpage < len(ans['data']['Items']),
        Items=ans['data']['Items'][collstart:collend])
def build_arf_db(corp, attrname):
    """
    Provides a higher level wrapper to create_arf_db(). Function creates
    a background process where create_arf_db() is run.
    """
    base_path = corp_freqs_cache_path(corp, attrname)
    if calc_is_running(base_path):
        curr_status = _get_total_calc_status(base_path)
        if curr_status < 100:
            return curr_status
    subc_path = prepare_arf_calc_paths(corp, attrname)
    backend = settings.get('calc_backend', 'type')
    if backend in ('celery', 'konserver'):
        import bgcalc
        app = bgcalc.calc_backend_client(settings)
        task_ids = []
        for m in ('frq', 'arf', 'docf'):
            logfilename_m = create_log_path(base_path, m)
            write_log_header(corp, logfilename_m)
            res = app.send_task('worker.compile_{0}'.format(m),
                                (corp.corpname, subc_path, attrname, logfilename_m),
                                time_limit=TASK_TIME_LIMIT)
            task_ids.append(res.id)
        return task_ids
    elif backend == 'multiprocessing':
        import subprocess
        for m in ('frq', 'arf', 'docf'):
            logfilename_m = create_log_path(base_path, m)
            open(logfilename_m, 'w').write('%d\n%s\n0 %%' % (os.getpid(), corp.search_size()))
            log = " 2>> '%s'" % logfilename_m
            if subc_path:
                cmd = u"mkstats '%s' '%s' %%s '%s' %s" % (corp.get_confpath(), attrname,
                                                          subc_path.decode('utf-8'),
                                                          log.decode('utf-8'))
                cmd = cmd.encode('utf-8')
            else:
                cmd = "mkstats '%s' '%s' %%s %s" % (corp.get_confpath(), attrname, log)
            subprocess.call(cmd % m, shell=True)
        return []
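# The multiprocessing branch above seeds each progress log with a small header:
# the PID on the first line, the corpus search size on the second and an
# initial '0 %' progress line. A hedged sketch of a parser for that format
# (the format itself is only inferred from the write call above):
def parse_arf_log_header(path):
    """Return (pid, search_size, progress_line) from an mkstats progress log."""
    with open(path) as fr:
        pid = int(fr.readline().strip())
        search_size = fr.readline().strip()
        progress = fr.readline().strip()  # e.g. '0 %'
    return pid, search_size, progress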
def _get_async_conc(corp, user_id, q, subchash, samplesize, minsize):
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    status = cache_map.get_calc_status(subchash, q)
    if not status or status.error:
        worker = bgcalc.calc_backend_client(settings)
        ans = worker.send_task(
            'conc_register', object.__class__,
            (user_id, corp.corpname, getattr(corp, 'subcname', None), subchash, q, samplesize,
             TASK_TIME_LIMIT),
            time_limit=CONC_REGISTER_TASK_LIMIT)
        ans.get(timeout=CONC_REGISTER_WAIT_LIMIT)
    conc_avail = wait_for_conc(cache_map=cache_map, subchash=subchash, q=q, minsize=minsize)
    if conc_avail:
        return PyConc(corp, 'l', cache_map.readable_cache_path(subchash, q))
    else:
        return InitialConc(corp, cache_map.readable_cache_path(subchash, q))
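# wait_for_conc() (used by all _get_async_conc/_get_bg_conc variants) boils
# down to polling the cache status until at least `minsize` lines exist or the
# calculation finishes. A simplified, self-contained sketch of that loop; the
# sleep step and time limit are assumptions, not KonText's actual values:
import time


def wait_for_conc_sketch(get_status, minsize, max_wait=20.0, step=0.3):
    """Poll get_status() until enough concordance lines exist; True on success."""
    waited = 0.0
    while waited < max_wait:
        status = get_status()  # hypothetical callable returning a calc status object
        if status is not None and (status.finished or 0 <= minsize <= status.concsize):
            return True
        time.sleep(step)
        waited += step
    return False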
def calculate_freqs_ct(args):
    """
    note: this is called by webserver
    """
    backend = settings.get('calc_backend', 'type')
    if backend in ('celery', 'konserver'):
        import bgcalc
        try:
            app = bgcalc.calc_backend_client(settings)
            res = app.send_task('worker.calculate_freqs_ct', args=(args.to_dict(),),
                                time_limit=TASK_TIME_LIMIT)
            calc_result = res.get()
        except Exception as ex:
            if is_celery_user_error(ex):
                raise UserActionException(ex.message)
            else:
                raise ex
    elif backend == 'multiprocessing':
        raise NotImplementedError(
            'Multi-processing backend is not yet supported for freq_ct calculation')
    else:
        raise ValueError('Invalid backend')
    return calc_result
def build_arf_db(corp, attrname):
    """
    Provides a higher level wrapper to create_arf_db(). Function creates
    a background process where create_arf_db() is run.
    """
    base_path = corp_freqs_cache_path(corp, attrname)
    if calc_is_running(base_path):
        curr_status = _get_total_calc_status(base_path)
        if curr_status < 100:
            return curr_status
    subc_path = prepare_arf_calc_paths(corp, attrname)
    app = bgcalc.calc_backend_client(settings)
    task_ids = []
    for m in ('frq', 'arf', 'docf'):
        logfilename_m = create_log_path(base_path, m)
        write_log_header(corp, logfilename_m)
        res = app.send_task(
            'compile_{0}'.format(m),
            (corp.corpname, subc_path, attrname, logfilename_m),
            time_limit=TASK_TIME_LIMIT)
        task_ids.append(res.id)
    return task_ids
def _create_subcorpus(self, request):
    """
    req. arguments:
    subcname -- name of new subcorpus
    create -- bool, sets whether to create new subcorpus
    cql -- custom within condition
    """
    subcname = request.form['subcname']
    within_json = request.form.get('within_json')
    raw_cql = request.form.get('cql')
    aligned_corpora = request.form.getlist('aligned_corpora')
    publish = bool(int(request.form.get('publish')))
    corpus_info = self.get_corpus_info(self.args.corpname)
    description = request.form.get('description')

    if not subcname:
        raise UserActionException(translate('No subcorpus name specified!'))
    if publish and not description:
        raise UserActionException(translate('No description specified'))

    if raw_cql:
        aligned_corpora = []
        tt_query = ()
        within_cql = raw_cql
        full_cql = 'aword,[] %s' % raw_cql
        imp_cql = (full_cql,)
    elif within_json:  # user entered a subcorpus query manually
        aligned_corpora = []
        tt_query = ()
        within_cql = self._deserialize_custom_within(json.loads(within_json))
        full_cql = 'aword,[] %s' % within_cql
        imp_cql = (full_cql,)
    elif len(aligned_corpora) > 0 and plugins.runtime.LIVE_ATTRIBUTES.exists:
        if corpus_info.metadata.label_attr and corpus_info.metadata.id_attr:
            within_cql = None
            attrs = json.loads(request.form.get('attrs', '{}'))
            sel_match = plugins.runtime.LIVE_ATTRIBUTES.instance.get_attr_values(
                self._plugin_api, corpus=self.corp, attr_map=attrs,
                aligned_corpora=aligned_corpora, limit_lists=False)
            values = sel_match['attr_values'][corpus_info.metadata.label_attr]
            args = argmapping.Args()
            setattr(args, 'sca_{0}'.format(corpus_info.metadata.id_attr),
                    [v[1] for v in values])
            tt_query = TextTypeCollector(self.corp, args).get_query()
            tmp = ['<%s %s />' % item for item in tt_query]
            full_cql = ' within '.join(tmp)
            full_cql = 'aword,[] within %s' % full_cql
            full_cql = import_string(full_cql, from_encoding=self.corp_encoding)
            imp_cql = (full_cql,)
        else:
            raise FunctionNotSupported(
                'Corpus must have a bibliography item defined to support this function')
    else:
        within_cql = None
        tt_query = TextTypeCollector(self.corp, request).get_query()
        tmp = ['<%s %s />' % item for item in tt_query]
        full_cql = ' within '.join(tmp)
        full_cql = 'aword,[] within %s' % full_cql
        full_cql = import_string(full_cql, from_encoding=self.corp_encoding)
        imp_cql = (full_cql,)

    basecorpname = self.args.corpname.split(':')[0]
    path = self.prepare_subc_path(basecorpname, subcname, publish=False)
    publish_path = self.prepare_subc_path(
        basecorpname, subcname, publish=True) if publish else None
    if type(path) == unicode:
        path = path.encode('utf-8')

    if len(tt_query) == 1 and len(aligned_corpora) == 0:
        result = corplib.create_subcorpus(path, self.corp, tt_query[0][0], tt_query[0][1])
        if result and publish_path:
            corplib.mk_publish_links(path, publish_path,
                                     self.session_get('user', 'fullname'), description)
    elif len(tt_query) > 1 or within_cql or len(aligned_corpora) > 0:
        backend = settings.get('calc_backend', 'type')
        if backend in ('celery', 'konserver'):
            import bgcalc
            app = bgcalc.calc_backend_client(settings)
            res = app.send_task(
                'worker.create_subcorpus',
                (self.session_get('user', 'id'), self.args.corpname, path, publish_path,
                 tt_query, imp_cql, self.session_get('user', 'fullname'), description),
                time_limit=TASK_TIME_LIMIT)
            self._store_async_task(AsyncTaskStatus(
                status=res.status, ident=res.id,
                category=AsyncTaskStatus.CATEGORY_SUBCORPUS,
                label=u'%s:%s' % (basecorpname, subcname),
                args=dict(subcname=subcname, corpname=basecorpname)))
            result = {}
        elif backend == 'multiprocessing':
            from bgcalc import subc_calc
            import functools
            import multiprocessing
            worker = subc_calc.CreateSubcorpusTask(user_id=self.session_get('user', 'id'),
                                                   corpus_id=self.args.corpname)
            multiprocessing.Process(target=functools.partial(
                worker.run, tt_query, imp_cql, path, publish_path, description)).start()
            result = {}
    else:
        raise UserActionException(translate('Nothing specified!'))

    if result is not False:
        with plugins.runtime.SUBC_RESTORE as sr:
            try:
                sr.store_query(user_id=self.session_get('user', 'id'),
                               corpname=self.args.corpname, subcname=subcname,
                               cql=full_cql.strip().split('[]', 1)[-1])
            except Exception as e:
                logging.getLogger(__name__).warning(
                    'Failed to store subcorpus query: %s' % e)
                self.add_system_message(
                    'warning',
                    translate('Subcorpus created but there was a problem saving a backup copy.'))
        unfinished_corpora = filter(
            lambda at: not at.is_finished(),
            self.get_async_tasks(category=AsyncTaskStatus.CATEGORY_SUBCORPUS))
        return dict(processed_subc=[uc.to_dict() for uc in unfinished_corpora])
    else:
        raise SubcorpusError(translate('Empty subcorpus!'))
def _create_subcorpus(self, request: Request) -> Dict[str, Any]:
    """
    req. arguments:
    subcname -- name of new subcorpus
    create -- bool, sets whether to create new subcorpus
    cql -- custom within condition
    """
    within_cql = None
    form_type = request.json['form_type']

    if form_type == 'tt-sel':
        data = CreateSubcorpusArgs(**request.json)
        corpus_info = self.get_corpus_info(data.corpname)
        if (plugins.runtime.LIVE_ATTRIBUTES.exists
                and plugins.runtime.LIVE_ATTRIBUTES.instance.is_enabled_for(
                    self._plugin_ctx, [data.corpname])
                # TODO here we skip aligned corpora which is debatable
                and len(data.aligned_corpora) > 0):
            if corpus_info.metadata.label_attr and corpus_info.metadata.id_attr:
                within_cql = None
                sel_match = plugins.runtime.LIVE_ATTRIBUTES.instance.get_attr_values(
                    self._plugin_ctx, corpus=self.corp, attr_map=data.text_types,
                    aligned_corpora=data.aligned_corpora, limit_lists=False)
                sel_attrs = {}
                for k, vals in sel_match.attr_values.items():
                    if k == corpus_info.metadata.label_attr:
                        k = corpus_info.metadata.id_attr
                    if '.' in k:
                        sel_attrs[k] = [v[1] for v in vals]
                tt_query = TextTypeCollector(self.corp, sel_attrs).get_query()
                tmp = ['<%s %s />' % item for item in tt_query]
                full_cql = ' within '.join(tmp)
                full_cql = 'aword,[] within %s' % full_cql
                imp_cql = (full_cql,)
            else:
                raise FunctionNotSupported(
                    'Corpus must have a bibliography item defined to support this function')
        else:
            tt_query = TextTypeCollector(self.corp, data.text_types).get_query()
            tmp = ['<%s %s />' % item for item in tt_query]
            full_cql = ' within '.join(tmp)
            full_cql = 'aword,[] within %s' % full_cql
            imp_cql = (full_cql,)
    elif form_type == 'within':
        data = CreateSubcorpusWithinArgs(**request.json)
        tt_query = ()
        within_cql = self._deserialize_custom_within(data.within)
        full_cql = 'aword,[] %s' % within_cql
        imp_cql = (full_cql,)
    elif form_type == 'cql':
        data = CreateSubcorpusRawCQLArgs(**request.json)
        tt_query = ()
        within_cql = data.cql
        full_cql = f'aword,[] {data.cql}'
        imp_cql = (full_cql,)
    else:
        raise UserActionException(f'Invalid form type provided - "{form_type}"')

    if not data.subcname:
        raise UserActionException(translate('No subcorpus name specified!'))
    if data.publish and not data.description:
        raise UserActionException(translate('No description specified'))

    path = self.prepare_subc_path(self.args.corpname, data.subcname, publish=False)
    publish_path = self.prepare_subc_path(
        self.args.corpname, data.subcname, publish=True) if data.publish else None

    if len(tt_query) == 1 and not data.has_aligned_corpora():
        result = corplib.create_subcorpus(path, self.corp, tt_query[0][0], tt_query[0][1])
        if result and publish_path:
            corplib.mk_publish_links(path, publish_path,
                                     self.session_get('user', 'fullname'), data.description)
    elif len(tt_query) > 1 or within_cql or data.has_aligned_corpora():
        worker = bgcalc.calc_backend_client(settings)
        res = worker.send_task(
            'create_subcorpus', object.__class__,
            (self.session_get('user', 'id'), self.args.corpname, path, publish_path,
             tt_query, imp_cql, self.session_get('user', 'fullname'), data.description),
            time_limit=TASK_TIME_LIMIT)
        self._store_async_task(AsyncTaskStatus(
            status=res.status, ident=res.id,
            category=AsyncTaskStatus.CATEGORY_SUBCORPUS,
            label=f'{self.args.corpname}/{data.subcname}',
            args=dict(subcname=data.subcname, corpname=self.args.corpname)))
        result = {}
    else:
        raise UserActionException(translate('Nothing specified!'))

    if result is not False:
        with plugins.runtime.SUBC_RESTORE as sr:
            try:
                sr.store_query(user_id=self.session_get('user', 'id'),
                               corpname=self.args.corpname, subcname=data.subcname,
                               cql=full_cql.strip().split('[]', 1)[-1])
            except Exception as e:
                logging.getLogger(__name__).warning(
                    'Failed to store subcorpus query: %s' % e)
                self.add_system_message(
                    'warning',
                    translate('Subcorpus created but there was a problem saving a backup copy.'))
        unfinished_corpora = [
            at for at in self.get_async_tasks(category=AsyncTaskStatus.CATEGORY_SUBCORPUS)
            if not at.is_finished()]
        return dict(processed_subc=[uc.to_dict() for uc in unfinished_corpora])
    else:
        raise SubcorpusError(translate('Empty subcorpus!'))
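# All _create_subcorpus variants assemble the final CQL the same way: each
# (structure, condition) pair from TextTypeCollector becomes a '<struct cond />'
# element and the elements are chained with 'within'. A standalone sketch of
# that assembly step (the sample tt_query below is made up):
def tt_query_to_cql(tt_query):
    """Build the 'aword,[] within ...' expression from (struct, condition) pairs."""
    tmp = ['<%s %s />' % item for item in tt_query]
    return 'aword,[] within %s' % ' within '.join(tmp)


assert tt_query_to_cql([('doc', 'genre="fiction"'), ('text', 'year="2020"')]) == \
    'aword,[] within <doc genre="fiction" /> within <text year="2020" />'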
def _create_subcorpus(self, request):
    """
    req. arguments:
    subcname -- name of new subcorpus
    create -- bool, sets whether to create new subcorpus
    cql -- custom within condition
    """
    subcname = request.form['subcname']
    within_json = request.form.get('within_json')
    raw_cql = request.form.get('cql')
    aligned_corpora = request.form.getlist('aligned_corpora')
    publish = bool(int(request.form.get('publish')))
    corpus_info = self.get_corpus_info(self.args.corpname)
    description = request.form.get('description')

    if raw_cql:
        aligned_corpora = []
        tt_query = ()
        within_cql = raw_cql
        full_cql = 'aword,[] %s' % raw_cql
        imp_cql = (full_cql,)
    elif within_json:  # user entered a subcorpus query manually
        aligned_corpora = []
        tt_query = ()
        within_cql = self._deserialize_custom_within(json.loads(within_json))
        full_cql = 'aword,[] %s' % within_cql
        imp_cql = (full_cql,)
    elif len(aligned_corpora) > 0 and plugins.runtime.LIVE_ATTRIBUTES.exists:
        if corpus_info.metadata.label_attr and corpus_info.metadata.id_attr:
            within_cql = None
            attrs = json.loads(request.form.get('attrs', '{}'))
            sel_match = plugins.runtime.LIVE_ATTRIBUTES.instance.get_attr_values(
                self._plugin_api, corpus=self.corp, attr_map=attrs,
                aligned_corpora=aligned_corpora, limit_lists=False)
            values = sel_match['attr_values'][corpus_info.metadata.label_attr]
            args = argmapping.Args()
            setattr(args, 'sca_{0}'.format(corpus_info.metadata.id_attr),
                    [v[1] for v in values])
            tt_query = TextTypeCollector(self.corp, args).get_query()
            tmp = ['<%s %s />' % item for item in tt_query]
            full_cql = ' within '.join(tmp)
            full_cql = 'aword,[] within %s' % full_cql
            full_cql = import_string(full_cql, from_encoding=self.corp_encoding)
            imp_cql = (full_cql,)
        else:
            raise FunctionNotSupported(
                'Corpus must have a bibliography item defined to support this function')
    else:
        within_cql = None
        tt_query = TextTypeCollector(self.corp, request).get_query()
        tmp = ['<%s %s />' % item for item in tt_query]
        full_cql = ' within '.join(tmp)
        full_cql = 'aword,[] within %s' % full_cql
        full_cql = import_string(full_cql, from_encoding=self.corp_encoding)
        imp_cql = (full_cql,)

    basecorpname = self.args.corpname.split(':')[0]
    if not subcname:
        raise UserActionException(translate('No subcorpus name specified!'))
    path = self.prepare_subc_path(basecorpname, subcname, publish=False)
    publish_path = self.prepare_subc_path(
        basecorpname, subcname, publish=True) if publish else None
    if type(path) == unicode:
        path = path.encode('utf-8')

    if len(tt_query) == 1 and len(aligned_corpora) == 0:
        result = corplib.create_subcorpus(path, self.corp, tt_query[0][0], tt_query[0][1])
        if result and publish_path:
            corplib.mk_publish_links(path, publish_path,
                                     self.session_get('user', 'fullname'), description)
    elif len(tt_query) > 1 or within_cql or len(aligned_corpora) > 0:
        backend = settings.get('calc_backend', 'type')
        if backend in ('celery', 'konserver'):
            import bgcalc
            app = bgcalc.calc_backend_client(settings)
            res = app.send_task(
                'worker.create_subcorpus',
                (self.session_get('user', 'id'), self.args.corpname, path, publish_path,
                 tt_query, imp_cql, description),
                time_limit=TASK_TIME_LIMIT)
            self._store_async_task(AsyncTaskStatus(
                status=res.status, ident=res.id,
                category=AsyncTaskStatus.CATEGORY_SUBCORPUS,
                label=u'%s:%s' % (basecorpname, subcname),
                args=dict(subcname=subcname, corpname=basecorpname)))
            result = {}
        elif backend == 'multiprocessing':
            from bgcalc import subc_calc
            import functools
            import multiprocessing
            worker = subc_calc.CreateSubcorpusTask(user_id=self.session_get('user', 'id'),
                                                   corpus_id=self.args.corpname)
            multiprocessing.Process(target=functools.partial(
                worker.run, tt_query, imp_cql, path, publish_path, description)).start()
            result = {}
    else:
        raise UserActionException(translate('Nothing specified!'))

    if result is not False:
        with plugins.runtime.SUBC_RESTORE as sr:
            try:
                sr.store_query(user_id=self.session_get('user', 'id'),
                               corpname=self.args.corpname, subcname=subcname,
                               cql=full_cql.strip().split('[]', 1)[-1])
            except Exception as e:
                logging.getLogger(__name__).warning(
                    'Failed to store subcorpus query: %s' % e)
                self.add_system_message(
                    'warning',
                    translate('Subcorpus created but there was a problem saving a backup copy.'))
        unfinished_corpora = filter(
            lambda at: not at.is_finished(),
            self.get_async_tasks(category=AsyncTaskStatus.CATEGORY_SUBCORPUS))
        return dict(processed_subc=[uc.to_dict() for uc in unfinished_corpora])
    else:
        raise SubcorpusError(translate('Empty subcorpus!'))