def _get_async_conc(corp, user_id, q, save, subchash, samplesize, fullsize, minsize):
    """
    Note: the 'save' argument is present only for bonito-open-3.45.11 compatibility;
    it is currently unused. TODO: remove it
    """
    backend, conf = settings.get_full('global', 'calc_backend')
    if backend == 'multiprocessing':
        from concworker import mp
        mp.create_task(user_id, corp, subchash, q, samplesize).start()
    elif backend == 'celery':
        import task
        app = task.get_celery_app(conf['conf'])
        ans = app.send_task('worker.conc_register',
                            (user_id, corp.corpname, getattr(corp, 'subcname', None),
                             subchash, q, samplesize))
        ans.get()  # = wait for task registration
    else:
        raise ValueError('Unknown concordance calculation backend: %s' % (backend,))
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    try:
        _wait_for_conc(cache_map=cache_map, subchash=subchash, q=q, minsize=minsize)
    except Exception as e:
        _cancel_async_task(cache_map, subchash, q)
        raise e
    return PyConc(corp, 'l', cache_map.cache_file_path(subchash, q))
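
# A minimal sketch of the configuration contract the dispatch above relies on;
# the backend value and path below are hypothetical, not taken from a real deployment:
#
#   backend, conf = settings.get_full('global', 'calc_backend')
#   # e.g. backend == 'celery', conf == {'conf': '/opt/kontext/conf/celeryconfig.py'}
#   # conf['conf'] is the value handed over to task.get_celery_app()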

def _get_async_conc(corp, user_id, q, save, subchash, samplesize, fullsize, minsize):
    """
    Note: the 'save' argument is present only for bonito-open-3.45.11 compatibility;
    it is currently unused. TODO: remove it
    """
    backend, conf = settings.get_full('corpora', 'conc_calc_backend')
    if backend == 'multiprocessing':
        from concworker.default import BackgroundCalc, NotifierFactory
        receiver, sender = NotifierFactory()()
        calc = BackgroundCalc(notification_sender=sender)
        proc = Process(target=calc, args=(corp, subchash, q, samplesize))
        proc.start()
    elif backend == 'celery':
        from concworker.wcelery import NotifierFactory
        import task
        app = task.get_celery_app(conf['conf'])
        res = app.send_task('worker.register',
                            (user_id, corp.corpname, getattr(corp, 'subcname', None),
                             subchash, q, samplesize))
        receiver, sender = NotifierFactory(res)()
    else:
        raise ValueError('Unknown concordance calculation backend: %s' % (backend,))
    cachefile, pidfile = receiver.receive()
    try:
        _wait_for_conc(corp=corp, q=q, subchash=subchash, cachefile=cachefile,
                       cache_map=plugins.get('conc_cache').get_mapping(corp),
                       pidfile=pidfile, minsize=minsize)
        if not os.path.exists(cachefile):
            raise RuntimeError('Concordance cache file [%s] not created. PID file: %s' % (
                cachefile, pidfile))
    except Exception as e:
        if os.path.exists(pidfile):
            os.remove(pidfile)
        raise e
    return PyConc(corp, 'l', cachefile)
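
# The notifier pair used above appears to follow a small contract: 'sender' is handed
# to the background calculation (BackgroundCalc or the Celery task), while
# receiver.receive() blocks until the calculation announces the (cachefile, pidfile)
# pair it is about to write; the two factories seem to differ only in the transport
# they wrap.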

def calculate_freqs_ct(args):
    """
    Note: this is called by the webserver.
    """
    backend, conf = settings.get_full('global', 'calc_backend')
    if backend == 'celery':
        import task
        try:
            app = task.get_celery_app(conf['conf'])
            res = app.send_task('worker.calculate_freqs_ct', args=(args.to_dict(),),
                                time_limit=TASK_TIME_LIMIT)
            calc_result = res.get()
        except Exception as ex:
            if is_celery_user_error(ex):
                raise UserActionException(ex.message)
            else:
                raise ex
    elif backend == 'multiprocessing':
        raise NotImplementedError(
            'Multi-processing backend is not yet supported for freq_ct calculation')
    else:
        raise ValueError('Invalid backend')
    return calc_result

def calculate_colls(coll_args):
    """
    Calculates the required collocations based on the passed arguments. The function
    is able to reuse cached values and to utilize the configured backend (either
    Celery or multiprocessing).

    returns:
    a dictionary ready to be used in a respective template (collx.tmpl)
    (keys: Head, Items, cmaxitems, attrname, processing, collstart, lastpage)
    """
    if coll_args.num_lines > 0:
        collstart = 0
        collend = coll_args.num_lines
    else:
        collstart = (int(coll_args.collpage) - 1) * int(coll_args.citemsperpage) + \
            int(coll_args.line_offset)
        collend = collstart + int(coll_args.citemsperpage) + 1
    cache = CollCalcCache(corpname=coll_args.corpname, subcname=coll_args.subcname,
                          subcpath=coll_args.subcpath, user_id=coll_args.user_id,
                          q=coll_args.q, minsize=coll_args.minsize, save=coll_args.save,
                          samplesize=coll_args.samplesize)
    collocs, cache_path = cache.get(cattr=coll_args.cattr, csortfn=coll_args.csortfn,
                                    cbgrfns=coll_args.cbgrfns, cfromw=coll_args.cfromw,
                                    ctow=coll_args.ctow, cminbgr=coll_args.cminbgr,
                                    cminfreq=coll_args.cminfreq)
    if collocs is None:
        num_fetch_items = CollCalcCache.MANATEE_DEFAULT_NUM_FETCH_LINES
    else:
        num_fetch_items = len(collocs['Items'])
    if collocs is None or collend > num_fetch_items:
        if os.path.isfile(cache_path):  # cache available but without enough items
            os.unlink(cache_path)
        if collend >= num_fetch_items:
            num_fetch_items += (collend - num_fetch_items) + \
                10 * int(coll_args.citemsperpage)  # TODO heuristics :)
        coll_args.cache_path = cache_path
        coll_args.num_fetch_items = num_fetch_items
        backend, conf = settings.get_full('global', 'calc_backend')
        if backend == 'celery':
            import task
            app = task.get_celery_app(conf['conf'])
            res = app.send_task('worker.calculate_colls', args=(coll_args.to_dict(),),
                                time_limit=TASK_TIME_LIMIT)
            # the worker task caches the value AFTER the result is returned (see worker.py)
            ans = res.get()
        elif backend == 'multiprocessing':
            ans = calculate_colls_mp(coll_args)
    else:
        ans = dict(data=collocs, processing=0)
    result = dict(
        Head=ans['data']['Head'],
        attrname=coll_args.cattr,
        processing=ans['processing'],
        collstart=collstart,
        lastpage=0 if collstart + coll_args.citemsperpage < len(ans['data']['Items']) else 1,
        Items=ans['data']['Items'][collstart:collend - 1]
    )
    return result
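
# A worked example of the paging arithmetic above (the numbers are hypothetical):
#
#   collpage = 2, citemsperpage = 20, line_offset = 0
#   collstart = (2 - 1) * 20 + 0 = 20
#   collend   = 20 + 20 + 1     = 41
#
# Items[collstart:collend - 1] then yields rows 20..39 (the second page); the one
# extra fetched row only tells us whether a following page exists (cf. 'lastpage').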

def calculate_freqs(args):
    """
    Calculates a frequency distribution based on a defined concordance and
    frequency-related arguments. The function is able to cache the data via a
    background process/task. This prevents KonText from calculating (via Manatee)
    the full frequency list again and again (e.g. when a user moves from page
    to page).
    """
    cache = FreqCalcCache(corpname=args.corpname, subcname=args.subcname, user_id=args.user_id,
                          subcpath=args.subcpath, minsize=args.minsize, q=args.q,
                          fromp=args.fromp, pagesize=args.pagesize, save=args.save,
                          samplesize=args.samplesize)
    calc_result, cache_path = cache.get(fcrit=args.fcrit, flimit=args.flimit,
                                        freq_sort=args.freq_sort, ml=args.ml,
                                        ftt_include_empty=args.ftt_include_empty,
                                        rel_mode=args.rel_mode,
                                        collator_locale=args.collator_locale)
    if calc_result is None:
        backend, conf = settings.get_full('global', 'calc_backend')
        if backend == 'celery':
            import task
            args.cache_path = cache_path
            app = task.get_celery_app(conf['conf'])
            res = app.send_task('worker.calculate_freqs', args=(args.to_dict(),))
            # the worker task caches the value AFTER the result is returned (see worker.py)
            calc_result = res.get()
        elif backend == 'multiprocessing':
            calc_result = calculate_freqs_mp(args)
    data = calc_result['freqs']
    conc_size = calc_result['conc_size']
    lastpage = None
    if len(data) == 1:  # a single block => pagination
        total_length = len(data[0]['Items']) if 'Items' in data[0] else 0
        items_per_page = args.fmaxitems
        fstart = (args.fpage - 1) * args.fmaxitems + args.line_offset
        fmaxitems = args.fmaxitems * args.fpage + 1 + args.line_offset
        if total_length < fmaxitems:
            lastpage = 1
        else:
            lastpage = 0
        ans = [dict(Total=total_length,
                    TotalPages=int(math.ceil(total_length / float(items_per_page))),
                    Items=data[0]['Items'][fstart:fmaxitems - 1] if 'Items' in data[0] else [],
                    Head=data[0].get('Head', []))]
    else:
        for item in data:
            if 'Items' not in item:
                item['Items'] = []
            item['Total'] = len(item['Items'])
            item['TotalPages'] = None
        ans = data
        fstart = None
    return dict(lastpage=lastpage, data=ans, fstart=fstart, fmaxitems=args.fmaxitems,
                conc_size=conc_size)
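
# The same kind of arithmetic drives the single-block pagination above
# (hypothetical numbers):
#
#   fpage = 3, fmaxitems = 50, line_offset = 0
#   fstart    = (3 - 1) * 50 + 0 = 100
#   fmaxitems = 50 * 3 + 1 + 0   = 151   (rebound locally, one row of look-ahead)
#
# Items[fstart:fmaxitems - 1] covers rows 100..149 and lastpage flips to 1 only
# when fewer than 151 rows exist in total.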

def _cancel_async_task(cache_map, subchash, q):
    cachefile = cache_map.cache_file_path(subchash, q)
    status = cache_map.get_calc_status(subchash, q)
    backend, conf = settings.get_full('global', 'calc_backend')
    if backend == 'multiprocessing':
        logging.getLogger(__name__).warning(
            'Unable to cancel async task in multiprocessing mode')
    elif backend == 'celery' and status:
        import task
        try:
            if status.task_id:
                app = task.get_celery_app(conf['conf'])
                app.control.revoke(status.task_id, terminate=True, signal='SIGKILL')
        except IOError:
            pass
    cache_map.del_entry(subchash, q)
    _del_silent(cachefile)
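
# Note on the Celery branch above: revoke(..., terminate=True, signal='SIGKILL')
# terminates the task only if it is already executing on a worker; a still-queued
# task is merely marked as revoked. Either way, the cache entry and the cache file
# are dropped unconditionally afterwards.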

def build_arf_db(corp, attrname):
    """
    Provides a higher-level wrapper around create_arf_db(): it runs the
    calculation in a background process/task.
    """
    base_path = corp_freqs_cache_path(corp, attrname)
    if calc_is_running(base_path):
        curr_status = _get_total_calc_status(base_path)
        if curr_status < 100:
            return curr_status
    subc_path = prepare_arf_calc_paths(corp, attrname)
    backend, conf = settings.get_full('global', 'calc_backend')
    if backend == 'celery':
        import task
        app = task.get_celery_app(conf['conf'])
        task_ids = []
        for m in ('frq', 'arf', 'docf'):
            logfilename_m = create_log_path(base_path, m)
            write_log_header(corp, logfilename_m)
            res = app.send_task('worker.compile_{0}'.format(m),
                                (corp.corpname, subc_path, attrname, logfilename_m),
                                time_limit=TASK_TIME_LIMIT)
            task_ids.append(res.id)
        return task_ids
    elif backend == 'multiprocessing':
        import subprocess
        for m in ('frq', 'arf', 'docf'):
            logfilename_m = create_log_path(base_path, m)
            open(logfilename_m, 'w').write('%d\n%s\n0 %%' % (os.getpid(), corp.search_size()))
        log = " 2>> '%s'" % logfilename_m
        if subc_path:
            cmd = u"mkstats '%s' '%s' %%s '%s' %s" % (corp.get_confpath(), attrname,
                                                      subc_path.decode('utf-8'),
                                                      log.decode('utf-8'))
            cmd = cmd.encode('utf-8')
        else:
            cmd = "mkstats '%s' '%s' %%s %s" % (corp.get_confpath(), attrname, log)
        subprocess.call(cmd % 'frq', shell=True)
        return []
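
# For illustration, with hypothetical inputs (registry path '/opt/registry/mycorp',
# attrname 'word', no subcorpus) the command template above expands in two steps:
#
#   cmd         == "mkstats '/opt/registry/mycorp' 'word' %s  2>> '<logfile>'"
#   cmd % 'frq' == "mkstats '/opt/registry/mycorp' 'word' frq  2>> '<logfile>'"
#
# i.e. the doubled %% survives the first interpolation so that the statistics type
# can be substituted afterwards.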

def _create_subcorpus(self, request):
    """
    req. arguments:
    subcname -- name of the new subcorpus
    create -- bool, sets whether to create the new subcorpus
    cql -- custom within condition
    """
    subcname = request.form['subcname']
    within_json = request.form.get('within_json')
    raw_cql = request.form.get('cql')
    aligned_corpora = request.form.getlist('aligned_corpora')
    publish = bool(int(request.form.get('publish')))
    corpus_info = self.get_corpus_info(self.args.corpname)
    description = request.form.get('description')

    if raw_cql:
        aligned_corpora = []
        tt_query = ()
        within_cql = raw_cql
        full_cql = 'aword,[] %s' % raw_cql
        imp_cql = (full_cql,)
    elif within_json:  # user entered a subcorpus query manually
        aligned_corpora = []
        tt_query = ()
        within_cql = self._deserialize_custom_within(json.loads(within_json))
        full_cql = 'aword,[] %s' % within_cql
        imp_cql = (full_cql,)
    elif len(aligned_corpora) > 0 and plugins.runtime.LIVE_ATTRIBUTES.exists:
        if corpus_info.metadata.label_attr and corpus_info.metadata.id_attr:
            within_cql = None
            attrs = json.loads(request.form.get('attrs', '{}'))
            sel_match = plugins.runtime.LIVE_ATTRIBUTES.instance.get_attr_values(
                self._plugin_api, corpus=self.corp, attr_map=attrs,
                aligned_corpora=aligned_corpora, limit_lists=False)
            values = sel_match['attr_values'][corpus_info.metadata.label_attr]
            args = argmapping.Args()
            setattr(args, 'sca_{0}'.format(corpus_info.metadata.id_attr),
                    [v[1] for v in values])
            tt_query = TextTypeCollector(self.corp, args).get_query()
            tmp = ['<%s %s />' % item for item in tt_query]
            full_cql = ' within '.join(tmp)
            full_cql = 'aword,[] within %s' % full_cql
            full_cql = import_string(full_cql, from_encoding=self.corp_encoding)
            imp_cql = (full_cql,)
        else:
            raise FunctionNotSupported(
                'Corpus must have a bibliography item defined to support this function')
    else:
        within_cql = None
        tt_query = TextTypeCollector(self.corp, request).get_query()
        tmp = ['<%s %s />' % item for item in tt_query]
        full_cql = ' within '.join(tmp)
        full_cql = 'aword,[] within %s' % full_cql
        full_cql = import_string(full_cql, from_encoding=self.corp_encoding)
        imp_cql = (full_cql,)

    basecorpname = self.args.corpname.split(':')[0]
    if not subcname:
        raise UserActionException(_('No subcorpus name specified!'))
    path = self.prepare_subc_path(basecorpname, subcname, publish=False)
    publish_path = self.prepare_subc_path(
        basecorpname, subcname, publish=True) if publish else None
    if type(path) == unicode:
        path = path.encode('utf-8')

    if len(tt_query) == 1 and len(aligned_corpora) == 0:
        result = corplib.create_subcorpus(path, self.corp, tt_query[0][0], tt_query[0][1])
        if result and publish_path:
            corplib.mk_publish_links(path, publish_path, description)
    elif len(tt_query) > 1 or within_cql or len(aligned_corpora) > 0:
        backend, conf = settings.get_full('global', 'calc_backend')
        if backend == 'celery':
            import task
            app = task.get_celery_app(conf['conf'])
            res = app.send_task('worker.create_subcorpus',
                                (self.session_get('user', 'id'), self.args.corpname, path,
                                 publish_path, tt_query, imp_cql, description),
                                time_limit=TASK_TIME_LIMIT)
            self._store_async_task(AsyncTaskStatus(
                status=res.status, ident=res.id,
                category=AsyncTaskStatus.CATEGORY_SUBCORPUS,
                label=u'%s:%s' % (basecorpname, subcname),
                args=dict(subcname=subcname, corpname=basecorpname)))
            result = {}
        elif backend == 'multiprocessing':
            from bgcalc import subc_calc
            import functools
            import multiprocessing
            worker = subc_calc.CreateSubcorpusTask(user_id=self.session_get('user', 'id'),
                                                   corpus_id=self.args.corpname)
            multiprocessing.Process(target=functools.partial(
                worker.run, tt_query, imp_cql, path, publish_path, description)).start()
            result = {}
    else:
        raise UserActionException(_('Nothing specified!'))

    if result is not False:
        with plugins.runtime.SUBC_RESTORE as sr:
            try:
                sr.store_query(user_id=self.session_get('user', 'id'),
                               corpname=self.args.corpname, subcname=subcname,
                               cql=full_cql.strip().split('[]', 1)[-1])
            except Exception as e:
                logging.getLogger(__name__).warning(
                    'Failed to store subcorpus query: %s' % e)
                self.add_system_message(
                    'warning',
                    _('Subcorpus created but there was a problem saving a backup copy.'))
        unfinished_corpora = filter(lambda at: not at.is_finished(),
                                    self.get_async_tasks(
                                        category=AsyncTaskStatus.CATEGORY_SUBCORPUS))
        return dict(unfinished_subc=[uc.to_dict() for uc in unfinished_corpora])
    else:
        raise SubcorpusError(_('Empty subcorpus!'))
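
# An example of the CQL assembly performed above (the text-type selection is
# hypothetical):
#
#   tt_query = [('doc', 'genre="fiction"'), ('text', 'year="2010"')]
#   tmp      = ['<doc genre="fiction" />', '<text year="2010" />']
#   full_cql = 'aword,[] within <doc genre="fiction" /> within <text year="2010" />'
#
# subc_restore then stores everything after the '[]' marker,
# cf. full_cql.strip().split('[]', 1)[-1].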

initializer.init_plugin('user_items')
initializer.init_plugin('corparch')
initializer.init_plugin('live_attributes', optional=True)

translation.load_translations(settings.get('global', 'translations'))
translation.activate('en_US')  # background jobs do not need localization

import concworker
import task
from bgcalc import freq_calc
from bgcalc import subc_calc
from bgcalc import coll_calc

_, conf = settings.get_full('global', 'calc_backend')
app = task.get_celery_app(conf['conf'])


def load_script_module(name, path):
    return imp.load_source(name, path)


class WorkerTaskException(Exception):
    pass


def is_compiled(corp, attr, method):
    """
    Tests whether the pre-calculated data for a particular combination
    corpus+attribute+method (arf, docf, frq) already exist.