def _get_cached_conc(corp, subchash, q, pid_dir, minsize):
    """
    Loads a concordance from cache.

    The function searches for the longest cached prefix of the operation
    chain 'q' so a partially matching record can still be reused.

    arguments:
    corp -- a manatee.Corpus instance
    subchash -- a subcorpus hash (presumably generated by PyConc; None for no subcorpus)
    q -- a query representation list (sequence of operation strings)
    pid_dir -- a directory where calculation PID files are stored
    minsize -- a minimum concordance size to wait for before returning

    returns:
    a 2-tuple (index within 'q' where non-cached operations begin,
    a PyConc instance or None on cache miss)
    """
    start_time = time.time()
    q = tuple(q)
    if not os.path.isdir(pid_dir):
        os.makedirs(pid_dir, mode=0o775)
    cache_map = plugins.get('conc_cache').get_mapping(corp)
    cache_map.refresh_map()
    # a shuffled concordance cannot be derived from a cached prefix,
    # so only the complete operation chain may be matched
    if _contains_shuffle_seq(q):
        srch_from = 1
    else:
        srch_from = len(q)
    ans = (0, None)
    # try to find the most complete cached operation (e.g. query + filter + sample)
    for i in range(srch_from, 0, -1):
        cachefile = cache_map.cache_file_path(subchash, q[:i])
        if cachefile:
            pidfile = cache_map.get_stored_pidfile(subchash, q[:i])
            _wait_for_conc(corp=corp, q=q, subchash=subchash, cachefile=cachefile,
                           cache_map=cache_map, pidfile=pidfile, minsize=minsize)
            if not os.path.exists(cachefile):  # broken cache
                # FIX: the matched (broken) record is keyed by q[:i]; the original
                # code deleted (subchash, q) which left the broken entry in place
                cache_map.del_entry(subchash, q[:i])
                try:
                    os.remove(pidfile)
                except OSError:
                    pass
                continue
            conccorp = corp
            for qq in reversed(q[:i]):  # find the right main corp, if aligned
                if qq.startswith('x-'):
                    conccorp = manatee.Corpus(qq[2:])
                    break
            conc = PyConc(conccorp, 'l', cachefile, orig_corp=corp)
            if not _is_conc_alive(pidfile, minsize) and not conc.finished():
                # unfinished and dead concordance; drop the q[:i]-keyed record
                cache_map.del_entry(subchash, q[:i])
                try:
                    os.remove(cachefile)
                except OSError:
                    pass
                try:
                    os.remove(pidfile)
                except OSError:
                    pass
                continue
            ans = (i, conc)
            break
    logging.getLogger(__name__).debug(
        'get_cached_conc(%s, [%s]) -> %s, %01.4f' % (corp.corpname, ','.join(q),
                                                     'hit' if ans[1] else 'miss',
                                                     time.time() - start_time))
    return ans
def _get_cached_conc(corp, subchash, q, pid_dir, minsize):
    """
    Loads a concordance from cache.

    Searches the cache for the longest prefix of the operation chain 'q'
    (a partially matching record can still be reused).

    arguments:
    corp -- a manatee.Corpus instance
    subchash -- a subcorpus hash (None for no subcorpus)
    q -- a query representation list (sequence of operation strings)
    pid_dir -- a directory where calculation PID files are stored
    minsize -- a minimum concordance size to wait for before returning

    returns:
    a 2-tuple (index within 'q' where non-cached operations begin,
    a PyConc instance or None on cache miss)
    """
    start_time = time.time()
    q = tuple(q)
    if not os.path.isdir(pid_dir):
        os.makedirs(pid_dir, mode=0o775)
    cache_map = plugins.get('conc_cache').get_mapping(corp)
    cache_map.refresh_map()
    # shuffled results cannot be derived from a cached prefix =>
    # only the full operation chain may be matched
    if _contains_shuffle_seq(q):
        srch_from = 1
    else:
        srch_from = len(q)
    ans = (0, None)
    for i in range(srch_from, 0, -1):
        cachefile = cache_map.cache_file_path(subchash, q[:i])
        if cachefile:
            pidfile = cache_map.get_stored_pidfile(subchash, q[:i])
            _wait_for_conc(corp=corp, q=q, subchash=subchash, cachefile=cachefile,
                           cache_map=cache_map, pidfile=pidfile, minsize=minsize)
            if not os.path.exists(cachefile):  # broken cache
                # FIX: delete the q[:i]-keyed record that was actually matched;
                # the original deleted (subchash, q) and left the broken entry behind
                cache_map.del_entry(subchash, q[:i])
                try:
                    os.remove(pidfile)
                except OSError:
                    pass
                continue
            conccorp = corp
            for qq in reversed(q[:i]):  # find the right main corp, if aligned
                if qq.startswith('x-'):
                    conccorp = manatee.Corpus(qq[2:])
                    break
            conc = PyConc(conccorp, 'l', cachefile, orig_corp=corp)
            if not _is_conc_alive(pidfile, minsize) and not conc.finished():
                # unfinished and dead concordance; drop the q[:i]-keyed record
                cache_map.del_entry(subchash, q[:i])
                try:
                    os.remove(cachefile)
                except OSError:
                    pass
                try:
                    os.remove(pidfile)
                except OSError:
                    pass
                continue
            ans = (i, conc)
            break
    logging.getLogger(__name__).debug('get_cached_conc(%s, [%s]) -> %s, %01.4f' % (
        corp.corpname, ','.join(q), 'hit' if ans[1] else 'miss', time.time() - start_time))
    return ans
def _get_async_conc(corp, user_id, q, save, subchash, samplesize, fullsize, minsize):
    """
    Starts an asynchronous concordance calculation using the configured
    backend and waits until at least 'minsize' lines are available (or the
    calculation finishes), then returns the (possibly still growing) result.

    Note: 'save' argument is present because of bonito-open-3.45.11
    compatibility but it is currently not used ----- TODO remove it

    arguments:
    corp -- a manatee.Corpus instance
    user_id -- an ID of the user who triggered the calculation
    q -- a query representation list
    subchash -- a subcorpus hash (None for no subcorpus)
    samplesize -- a sample size limit passed to the worker
    fullsize -- unused here (kept for interface compatibility)
    minsize -- a minimum concordance size to wait for before returning

    returns:
    a PyConc instance backed by the cache file of the running calculation

    raises:
    ValueError -- when the configured backend is unknown
    """
    backend, conf = settings.get_full('global', 'calc_backend')
    if backend == 'multiprocessing':
        from concworker import mp
        mp.create_task(user_id, corp, subchash, q, samplesize).start()
    elif backend == 'celery':
        import task
        app = task.get_celery_app(conf['conf'])
        ans = app.send_task(
            'worker.conc_register',
            (user_id, corp.corpname, getattr(corp, 'subcname', None), subchash, q, samplesize))
        ans.get()  # = wait for task registration
    else:
        raise ValueError('Unknown concordance calculation backend: %s' % (backend, ))
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    try:
        _wait_for_conc(cache_map=cache_map, subchash=subchash, q=q, minsize=minsize)
    except Exception:
        # make sure a failed/cancelled wait does not leave a stale cache record
        _cancel_async_task(cache_map, subchash, q)
        # bare 'raise' re-raises with the original traceback ('raise e' did not on py2)
        raise
    return PyConc(corp, 'l', cache_map.cache_file_path(subchash, q))
def _get_async_conc(corp, user_id, q, save, subchash, samplesize, fullsize, minsize):
    """
    Starts an asynchronous concordance calculation using the configured
    backend and waits until at least 'minsize' lines are available (or the
    calculation finishes), then returns the (possibly still growing) result.

    Note: 'save' argument is present because of bonito-open-3.45.11
    compatibility but it is currently not used ----- TODO remove it

    arguments:
    corp -- a manatee.Corpus instance
    user_id -- an ID of the user who triggered the calculation
    q -- a query representation list
    subchash -- a subcorpus hash (None for no subcorpus)
    samplesize -- a sample size limit passed to the worker
    fullsize -- unused here (kept for interface compatibility)
    minsize -- a minimum concordance size to wait for before returning

    returns:
    a PyConc instance backed by the cache file of the running calculation

    raises:
    ValueError -- when the configured backend is unknown
    RuntimeError -- when the worker failed to create the cache file
    """
    backend, conf = settings.get_full('corpora', 'conc_calc_backend')
    if backend == 'multiprocessing':
        from concworker.default import BackgroundCalc, NotifierFactory
        receiver, sender = NotifierFactory()()
        calc = BackgroundCalc(notification_sender=sender)
        proc = Process(target=calc, args=(corp, subchash, q, samplesize))
        proc.start()
    elif backend == 'celery':
        from concworker.wcelery import NotifierFactory
        import task
        app = task.get_celery_app(conf['conf'])
        res = app.send_task(
            'worker.register',
            (user_id, corp.corpname, getattr(corp, 'subcname', None), subchash, q, samplesize))
        receiver, sender = NotifierFactory(res)()
    else:
        raise ValueError('Unknown concordance calculation backend: %s' % (backend, ))
    # the worker announces the cache/PID file paths it decided to use
    cachefile, pidfile = receiver.receive()
    try:
        _wait_for_conc(corp=corp, q=q, subchash=subchash, cachefile=cachefile,
                       cache_map=plugins.get('conc_cache').get_mapping(corp),
                       pidfile=pidfile, minsize=minsize)
        if not os.path.exists(cachefile):
            raise RuntimeError(
                'Concordance cache file [%s] not created. PID file: %s' % (cachefile, pidfile))
    except Exception:
        if os.path.exists(pidfile):
            os.remove(pidfile)
        # bare 'raise' re-raises with the original traceback ('raise e' did not on py2)
        raise
    return PyConc(corp, 'l', cachefile)
def _get_cached_conc(corp, subchash, q, minsize):
    """
    Fetches a concordance from the cache if possible.

    The cache is searched for the longest prefix of the operation chain 'q'
    (starting from the full chain and shrinking towards a single operation)
    so even a partially matching record can be reused.

    arguments:
    corp -- a respective manatee.Corpus object
    subchash -- a subcorpus hash (generated by PyConc)
    q -- a query representation list
    minsize -- a minimum concordance size to return immediately (synchronously)

    returns:
    a 2-tuple [an index within 'q' where to start with non-cached results],
    [a concordance instance]
    """
    t0 = time.time()
    q = tuple(q)
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    cache_map.refresh_map()
    # a shuffled result cannot be derived from a cached prefix,
    # so only the complete chain may be matched in that case
    srch_from = 1 if _contains_shuffle_seq(q) else len(q)
    result = (0, None)
    for idx in range(srch_from, 0, -1):
        subq = q[:idx]
        cachefile = cache_map.cache_file_path(subchash, subq)
        if not cachefile:
            continue
        try:
            _wait_for_conc(cache_map=cache_map, subchash=subchash, q=subq, minsize=minsize)
        except ConcCalculationControlException as ex:
            _cancel_async_task(cache_map, subchash, subq)
            logging.getLogger(__name__).warning(
                'Removed broken concordance cache record. Original error: %s' % (ex,))
            continue
        # in case of aligned corpora, the last 'x-...' operation selects the main corpus
        conccorp = corp
        for item in reversed(subq):
            if item.startswith('x-'):
                conccorp = manatee.Corpus(item[2:])
                break
        conc = None
        try:
            if not _min_conc_unfinished(cache_map=cache_map, subchash=subchash,
                                        q=subq, minsize=minsize):
                conc = PyConc(conccorp, 'l', cachefile, orig_corp=corp)
        except (ConcCalculationControlException, manatee.FileAccessError) as ex:
            logging.getLogger(__name__).error(
                'Failed to join unfinished calculation: {0}'.format(ex))
            _cancel_async_task(cache_map, subchash, subq)
            continue
        result = (idx, conc)
        break
    logging.getLogger(__name__).debug(
        'get_cached_conc(%s, [%s]) -> %s, %01.4f' % (corp.corpname, ','.join(q),
                                                     'hit' if result[1] else 'miss',
                                                     time.time() - t0))
    return result
def _get_cached_conc(corp, subchash, q, pid_dir, minsize):
    """
    Loads a concordance from cache.

    Searches the cache for the longest prefix of the operation chain 'q'
    (a partially matching record can still be reused).

    arguments:
    corp -- a manatee.Corpus instance
    subchash -- a subcorpus hash (None for no subcorpus)
    q -- a query representation list (sequence of operation strings)
    pid_dir -- a directory where calculation PID files are stored
    minsize -- a minimum concordance size to wait for before returning

    returns:
    a 2-tuple (index within 'q' where non-cached operations begin,
    a PyConc instance or None on cache miss)
    """
    start_time = time.time()
    q = tuple(q)
    if not os.path.isdir(pid_dir):
        os.makedirs(pid_dir)
    cache_map = cache_factory.get_mapping(corp)
    cache_map.refresh_map()
    # shuffled results cannot be derived from a cached prefix =>
    # only the full operation chain may be matched
    if _contains_shuffle_seq(q):
        srch_from = 1
    else:
        srch_from = len(q)
    ans = (0, None)
    for i in range(srch_from, 0, -1):
        cachefile = cache_map.cache_file_path(subchash, q[:i])
        if cachefile:
            # map record layout assumed: index 2 holds the PID file path
            # -- TODO confirm against the cache_factory mapping implementation
            pidfile = cache_map[(subchash, q[:i])][2]
            _wait_for_conc(
                corp=corp, q=q, subchash=subchash, cachefile=cachefile,
                cache_map=cache_map, pidfile=pidfile, minsize=minsize,
            )
            if not os.path.exists(cachefile):  # broken cache
                # FIX: the matched (broken) record is keyed by q[:i]; deleting
                # (subchash, q) as before left the broken entry in place
                del cache_map[(subchash, q[:i])]
                try:
                    os.remove(pidfile)
                except OSError:
                    pass
                continue
            conccorp = corp
            for qq in reversed(q[:i]):  # find the right main corp, if aligned
                if qq.startswith("x-"):
                    conccorp = manatee.Corpus(qq[2:])
                    break
            conc = PyConc(conccorp, "l", cachefile, orig_corp=corp)
            if not _is_conc_alive(pidfile, minsize) and not conc.finished():
                # unfinished and dead concordance; drop the q[:i]-keyed record
                del cache_map[(subchash, q[:i])]
                try:
                    os.remove(cachefile)
                except OSError:
                    pass
                try:
                    os.remove(pidfile)
                except OSError:
                    pass
                continue
            ans = (i, conc)
            break
    logging.getLogger(__name__).debug(
        "get_cached_conc(%s, [%s]) -> %s, %01.4f"
        % (corp.corpname, ",".join(q), "hit" if ans[1] else "miss", time.time() - start_time)
    )
    return ans