Ejemplo n.º 1
0
 def __call__(self, corpus_name, subc_name, subchash, query, samplesize):
     """
     Register a new concordance calculation in the cache mapping.

     A new pidfile is created and stored to the mapping together with
     the (subchash, query) key and an initial size of 0. The arguments
     subc_name and samplesize are accepted but not used here.

     returns:
     a dict with keys 'cachefile', 'pidfile' (the one created here)
     and 'stored_pidfile' (the second value returned by the mapping)
     """
     corp = CorpusManager().get_Corpus(corpus_name)
     mapping = self._cache_factory.get_mapping(corp)
     new_pidfile = self._create_pid_file()
     cachefile, stored_pidfile = mapping.add_to_map(
         subchash, query, 0, new_pidfile)
     return {
         'cachefile': cachefile,
         'pidfile': new_pidfile,
         'stored_pidfile': stored_pidfile,
     }
Ejemplo n.º 2
0
    def __call__(self, initial_args, subc_dir, corpus_name, subc_name,
                 subchash, query, samplesize):
        """
        Calculate a concordance and write it to the cache file given by
        initial_args, refreshing the pidfile while the calculation runs.

        Nothing is calculated if initial_args contains a truthy
        'stored_pidfile'. Exceptions are not propagated; they are logged
        and written to the pidfile as an error.

        arguments:
        initial_args -- a dict(cachefile=..., pidfile=..., stored_pidfile=...)
        subc_dir -- a directory where user's subcorpora are stored
        corpus_name -- a corpus identifier
        subc_name -- subcorpus name (should be None if not present)
        subchash -- an identifier of current subcorpus (None if no subcorpus is in use)
        query -- a tuple/list containing current query
        samplesize -- row limit
        """
        sleeptime = None
        try:
            corp = CorpusManager(subcpath=(subc_dir, )).get_Corpus(
                corpus_name, subc_name)
            mapping = self._cache_factory.get_mapping(corp)
            if initial_args.get('stored_pidfile'):
                return

            # The concordance is asynchronous: the object is available at once
            # but the data may still be incomplete (see its finished() method).
            conc = self.compute_conc(corp, query, samplesize)
            sleeptime = 0.1
            time.sleep(sleeptime)
            cachefile = initial_args['cachefile']
            tmp_path = cachefile + '.tmp'
            conc.save(cachefile, False, True, False)  # first partial snapshot
            while not conc.finished():
                # Each partial snapshot is written aside and renamed over the
                # cache file (the original code notes that append=True does not
                # seem to work properly with Manatee 2.121.1).
                conc.save(tmp_path, False, True, False)
                os.rename(tmp_path, cachefile)
                time.sleep(sleeptime)
                sleeptime += 0.1
                sizes = self.get_cached_conc_sizes(corp, query, cachefile)
                self._update_pidfile(
                    initial_args['pidfile'],
                    last_check=int(time.time()),
                    curr_wait=sleeptime,
                    finished=sizes['finished'],
                    concsize=sizes['concsize'],
                    fullsize=sizes['fullsize'],
                    relconcsize=sizes['relconcsize'])
            # the complete concordance
            conc.save(tmp_path)
            os.rename(tmp_path, cachefile)
            # store the final size to the map file
            mapping.add_to_map(subchash, query, conc.size())
            os.remove(initial_args['pidfile'])
        except Exception as e:
            # No cleanup (pidfile of the failed calculation, unfinished cache
            # file etc.) is done here; the original author notes this is
            # handled by _get_cached_conc() once it detects a problem.
            import traceback
            log = logging.getLogger(__name__)
            log.error('Background calculation error: %s' % e)
            log.error(traceback.format_exc())
            self._update_pidfile(
                initial_args['pidfile'],
                last_check=int(time.time()),
                curr_wait=sleeptime,
                error=str(e))
Ejemplo n.º 3
0
def require_existing_conc(corp: AbstractKCorpus,
                          q: Union[Tuple[str, ...], List[str]]) -> PyConc:
    """
    Return an already cached concordance for the given corpus and query.

    raises:
    ConcNotFoundException -- if there is no calculation status for the query
                             or the cache file cannot be accessed
    BrokenConcordanceException -- if the status exists but it is not both
                                  finished and readable
    """
    corpus_manager = CorpusManager(subcpath=[])
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    status = cache_map.get_calc_status(getattr(corp, 'subchash', None), q)
    if status is None:
        raise ConcNotFoundException(
            'Concordance not found: {}'.format(', '.join(q)))
    if not (status.finished and status.readable):
        raise BrokenConcordanceException(
            'Concordance broken. File: {}, error: {}'.format(
                status.cachefile, status.error))

    # In case of aligned corpora, the last 'x-...' operation determines
    # the main corpus the concordance is opened with.
    switch_op = next((op for op in reversed(q) if op.startswith('x-')), None)
    if switch_op is None:
        mcorp = corp
    else:
        mcorp = corpus_manager.get_corpus(switch_op[2:])
    try:
        return PyConc(mcorp, 'l', status.cachefile, orig_corp=corp)
    except manatee.FileAccessError as ex:
        raise ConcNotFoundException(ex)
Ejemplo n.º 4
0
 def __init__(self, task_id, cache_factory, subc_dirs, corpus_name,
              subc_name: str, conc_dir: str):
     """
     Open the requested (sub)corpus and obtain its cache mapping.

     arguments:
     task_id -- passed to the parent constructor
     cache_factory -- passed to the parent constructor
     subc_dirs -- value used as 'subcpath' of the corpus manager
     corpus_name -- a corpus identifier
     subc_name -- a subcorpus name
     conc_dir -- a value stored to the corpus object as '_conc_dir'
     """
     super().__init__(task_id, cache_factory)
     manager = CorpusManager(subcpath=subc_dirs)
     self.corpus_manager = manager
     corp = manager.get_corpus(corpus_name, subcname=subc_name)
     self.corpus_obj = corp
     corp._conc_dir = conc_dir
     self.cache_map = self._cache_factory.get_mapping(corp)
Ejemplo n.º 5
0
 def __call__(self, corpus_name, subc_name, subchash, subcpaths, query, samplesize):
     """
     Register a new calculation status for (subchash, query) in the
     cache mapping of the given (sub)corpus.

     The samplesize argument is accepted but not used here.

     returns:
     a dict with keys 'cachefile' and 'already_running'; the latter is True
     if the mapping returned a previous status (i.e. a value other than None)
     """
     corp = CorpusManager(subcpath=subcpaths).get_Corpus(
         corpus_name, subcname=subc_name)
     mapping = self._cache_factory.get_mapping(corp)
     fresh_status = self.create_new_calc_status()
     cachefile, prev_status = mapping.add_to_map(
         subchash, query, 0, fresh_status)
     return {
         'cachefile': cachefile,
         'already_running': prev_status is not None,
     }
Ejemplo n.º 6
0
 def __call__(self, corpus_name: str, subc_name: str, subchash: Optional[str], subcpaths: Tuple[str, ...], query: Tuple[str, ...], samplesize: int) -> Dict[str, Any]:
     """
     Store a newly created calculation status under (subchash, query)
     in the cache mapping of the given (sub)corpus.

     The samplesize argument is accepted but not used here.

     returns:
     dict(cachefile=..., already_running=...) where already_running tells
     whether the mapping returned some previous status
     """
     manager = CorpusManager(subcpath=subcpaths)
     mapping = self._cache_factory.get_mapping(
         manager.get_Corpus(corpus_name, subcname=subc_name))
     status = self.create_new_calc_status()
     cachefile, prev_status = mapping.add_to_map(subchash, query, 0, status)
     already_running = prev_status is not None
     return dict(cachefile=cachefile, already_running=already_running)
Ejemplo n.º 7
0
 def __call__(self, corpus_name, subc_name, subchash, subcpath, query,
              samplesize):
     """
     Put a new calculation status for (subchash, query) into the cache
     mapping of the given (sub)corpus.

     The single 'subcpath' value is wrapped into a 1-tuple for the corpus
     manager. The samplesize argument is accepted but not used here.

     returns:
     a dict with keys 'cachefile' and 'already_running' (True if the
     mapping returned a previous status)
     """
     search_paths = (subcpath, )
     corp = CorpusManager(subcpath=search_paths).get_Corpus(
         corpus_name, subc_name)
     mapping = self._cache_factory.get_mapping(corp)
     status = self.create_new_calc_status()
     map_result = mapping.add_to_map(subchash, query, 0, status)
     cachefile, prev_status = map_result
     return {
         'cachefile': cachefile,
         'already_running': prev_status is not None,
     }
Ejemplo n.º 8
0
 def __call__(self, corpus_name, subc_name, subchash, query, samplesize):
     """
     Create a pidfile and register it in the cache mapping under
     the (subchash, query) key with an initial size of 0.

     The arguments subc_name and samplesize are accepted but not used here.

     returns:
     dict(cachefile=..., pidfile=..., stored_pidfile=...) where 'pidfile'
     is the file created by this call and 'stored_pidfile' is the second
     value returned by the mapping
     """
     manager = CorpusManager()
     mapping = self._cache_factory.get_mapping(
         manager.get_Corpus(corpus_name))
     pidfile = self._create_pid_file()
     map_result = mapping.add_to_map(subchash, query, 0, pidfile)
     cachefile, stored_pidfile = map_result
     return dict(cachefile=cachefile,
                 pidfile=pidfile,
                 stored_pidfile=stored_pidfile)
Ejemplo n.º 9
0
    def __call__(self, initial_args, subc_dir, corpus_name, subc_name, subchash, query, samplesize):
        """
        Run a concordance calculation, storing partial and final results to
        the cache file and reporting the progress via the pidfile.

        The calculation is skipped if initial_args contains a truthy
        'stored_pidfile'. Any exception is logged and written to the
        pidfile as an error; it is not re-raised.

        arguments:
        initial_args -- a dict(cachefile=..., pidfile=..., stored_pidfile=...)
        subc_dir -- a directory where user's subcorpora are stored
        corpus_name -- a corpus identifier
        subc_name -- subcorpus name (should be None if not present)
        subchash -- an identifier of current subcorpus (None if no subcorpus is in use)
        query -- a tuple/list containing current query
        samplesize -- row limit
        """
        sleeptime = None
        try:
            manager = CorpusManager(subcpath=(subc_dir,))
            corp = manager.get_Corpus(corpus_name, subc_name)
            cache_map = self._cache_factory.get_mapping(corp)

            if not initial_args.get('stored_pidfile'):
                # The conc object is asynchronous - it is returned immediately
                # but it may not be complete yet (see the 'finished()' method).
                conc = self.compute_conc(corp, query, samplesize)

                def replace_cachefile(*save_flags):
                    # Save to a temporary file first and then rename it over
                    # the actual cache file (the original code notes that
                    # append=True does not seem to work with Manatee 2.121.1).
                    tmp_cachefile = initial_args['cachefile'] + '.tmp'
                    conc.save(tmp_cachefile, *save_flags)
                    os.rename(tmp_cachefile, initial_args['cachefile'])

                sleeptime = 0.1
                time.sleep(sleeptime)
                conc.save(initial_args['cachefile'], False, True, False)  # partial
                while not conc.finished():
                    replace_cachefile(False, True, False)
                    time.sleep(sleeptime)
                    sleeptime += 0.1
                    sizes = self.get_cached_conc_sizes(
                        corp, query, initial_args['cachefile'])
                    self._update_pidfile(
                        initial_args['pidfile'],
                        last_check=int(time.time()),
                        curr_wait=sleeptime,
                        finished=sizes['finished'],
                        concsize=sizes['concsize'],
                        fullsize=sizes['fullsize'],
                        relconcsize=sizes['relconcsize'])
                replace_cachefile()  # whole
                # write the final size to the map file
                cache_map.add_to_map(subchash, query, conc.size())
                os.remove(initial_args['pidfile'])
        except Exception as e:
            # Nothing is cleaned up here (pidfile, unfinished cache file);
            # the original author notes that _get_cached_conc() takes care
            # of it once it detects a problem.
            import traceback
            logger = logging.getLogger(__name__)
            logger.error('Background calculation error: %s' % e)
            logger.error(''.join(traceback.format_exception(*sys.exc_info())))
            self._update_pidfile(
                initial_args['pidfile'],
                last_check=int(time.time()),
                curr_wait=sleeptime,
                error=str(e))
Ejemplo n.º 10
0
 def __call__(self, corpus_name: str, subc_name: str,
              subchash: Optional[str], subcpaths: Tuple[str, ...],
              query: Tuple[str, ...], samplesize: int) -> Dict[str, Any]:
     """
     Look up the calculation status for (subchash, query) and create
     a new one if there is none or if the existing one carries an error.

     A new status is written with overwrite=True. The samplesize argument
     is accepted but not used here.

     returns:
     dict(cachefile=..., already_running=...) where already_running is True
     only if an existing status without an error has been found
     """
     corp = CorpusManager(subcpath=subcpaths).get_Corpus(
         corpus_name, subcname=subc_name)
     mapping = self._cache_factory.get_mapping(corp)
     found = mapping.get_calc_status(subchash, query)
     already_running = found is not None and not found.error
     if already_running:
         status = found
     else:
         fresh_status = self.create_new_calc_status()
         status = mapping.add_to_map(
             subchash, query, fresh_status, overwrite=True)
     return {
         'cachefile': status.cachefile,
         'already_running': already_running,
     }
Ejemplo n.º 11
0
def _load_corp(corp_id, subc: str, user_id):
    """
    Open a corpus (or a subcorpus) via CorpusManager.

    Subcorpora are searched for in the user's own directory first (only
    if user_id is not None) and then in the 'published' directory; both
    are located under the configured 'corpora/users_subcpath'.

    arguments:
    corp_id -- a corpus identifier
    subc -- a subcorpus identifier (None if not defined)
    user_id -- a user identifier or None
    """
    def subc_dir(name):
        return os.path.join(settings.get('corpora', 'users_subcpath'), name)

    search_paths = [subc_dir('published')]
    if user_id is not None:
        search_paths = [subc_dir(str(user_id))] + search_paths
    return CorpusManager(search_paths).get_corpus(corp_id, '', subc)
Ejemplo n.º 12
0
async def conc_cache_status_ws_handler(
        request: web.Request) -> web.WebSocketResponse:
    """
    Report concordance cache status to a client via a websocket.

    The first message received from the client is parsed as JSON and is
    expected to provide 'user_id', 'corp_id', 'conc_id' and optionally
    'subc_path'. The status is then sent repeatedly, with a pause of
    CONC_CACHE_STATUS_REFRESH_PERIOD between messages, until it is
    reported as finished or the socket is closed. A failure to obtain
    the status is sent as an 'error' message flagged as finished.
    """
    ws = web.WebSocketResponse()
    await ws.prepare(request)
    logging.debug('Client connected to conc cache status')

    # the first message carries the concordance parameters
    first_msg = await ws.receive()
    params = json.loads(first_msg.data)
    logging.debug('Received conc parameters: %s', params)

    subc_dirs = [
        os.path.join(settings.get('corpora', 'users_subcpath'), 'published')
    ]
    with plugins.runtime.AUTH as auth:
        if not auth.is_anonymous(params['user_id']):
            # user's own subcorpora take precedence over the published ones
            user_dir = os.path.join(
                settings.get('corpora', 'users_subcpath'),
                str(params['user_id']))
            subc_dirs = [user_dir] + subc_dirs
    corp = CorpusManager(subc_dirs).get_corpus(
        corpname=params['corp_id'],
        subcname=params.get('subc_path', None))

    # keep reporting until the calculation is finished
    while not ws.closed:
        try:
            response = get_conc_cache_status(corp, params['conc_id'])
        except Exception as e:
            response = dict(error=str(e), finished=True)
        await ws.send_json(response)

        if not response['finished']:
            await asyncio.sleep(CONC_CACHE_STATUS_REFRESH_PERIOD)
            continue
        await ws.close()

    logging.debug('Client disconnected from conc cache status')
    return ws
Ejemplo n.º 13
0
class ConcSyncCalculation(GeneralWorker):
    """
    A worker for calculating a concordance synchronously (from Manatee API point of view)
    but still in background.

    Please note that the worker expects you to create required concordance cache
    mapping records.
    """
    def __init__(self, task_id, cache_factory, subc_dirs, corpus_name,
                 subc_name: str, conc_dir: str):
        """
        Open the (sub)corpus and obtain its cache mapping.

        arguments:
        task_id -- passed to the parent constructor
        cache_factory -- passed to the parent constructor
        subc_dirs -- value used as 'subcpath' of the corpus manager
        corpus_name -- a corpus identifier
        subc_name -- a subcorpus name
        conc_dir -- a value stored to the corpus object as '_conc_dir'
        """
        super().__init__(task_id, cache_factory)
        self.corpus_manager = CorpusManager(subcpath=subc_dirs)
        self.corpus_obj = self.corpus_manager.get_corpus(corpus_name,
                                                         subcname=subc_name)
        setattr(self.corpus_obj, '_conc_dir', conc_dir)
        self.cache_map = self._cache_factory.get_mapping(self.corpus_obj)

    def _mark_calc_states_err(self, subchash: Optional[str], query: Tuple[str,
                                                                          ...],
                              from_idx: int, err: BaseException):
        """
        Mark cache records of the query prefixes query[:from_idx + 1],
        query[:from_idx + 2], ..., query[:] as finished with the error 'err'.
        """
        for i in range(from_idx, len(query)):
            self.cache_map.update_calc_status(subchash,
                                              query[:i + 1],
                                              error=err,
                                              finished=True)

    def __call__(self, subchash, query: Tuple[str, ...], samplesize: int):
        """
        Calculate a concordance for 'query' operation by operation, storing
        the result of each step to the cache.

        The calculation starts from the most complete prefix returned by
        find_cached_conc_base(). Errors are not propagated: they are logged
        and written to the affected cache records and the method returns.

        arguments:
        subchash -- a subcorpus identifier used as a part of the cache key
        query -- a sequence of query operations
        samplesize -- passed to compute_conc() when the first operation is calculated
        """
        try:
            calc_from, conc = find_cached_conc_base(self.corpus_obj,
                                                    subchash,
                                                    query,
                                                    minsize=0)
            if isinstance(
                    conc, InitialConc
            ):  # we have nothing, let's start with the 1st operation only
                # (re)register a status for each prefix of the query
                for i in range(0, len(query)):
                    self.cache_map.add_to_map(
                        subchash,
                        query[:i + 1],
                        ConcCacheStatus(task_id=self._task_id),
                        overwrite=True)
                calc_status = self.cache_map.get_calc_status(
                    subchash, query[:1])
                conc = self.compute_conc(self.corpus_obj, query[:1],
                                         samplesize)
                # presumably blocks until the concordance is complete - TODO confirm
                conc.sync()
                conc.save(calc_status.cachefile)
                os.chmod(calc_status.cachefile, 0o664)
                self.cache_map.update_calc_status(subchash,
                                                  query[:1],
                                                  readable=True,
                                                  finished=True,
                                                  concsize=conc.size())
                # the first operation is done, continue with the second one
                calc_from = 1
            else:
                # some prefix is available; (re)register only the remaining ones
                for i in range(calc_from, len(query)):
                    self.cache_map.add_to_map(
                        subchash,
                        query[:i + 1],
                        ConcCacheStatus(task_id=self._task_id),
                        overwrite=True)
        except Exception as ex:
            logging.getLogger(__name__).error(ex)
            # prefer the error extracted by extract_manatee_error() if there is any
            manatee_err = extract_manatee_error(ex)
            norm_err = manatee_err if manatee_err else ex
            # all the prefixes (from index 0) are marked as failed
            self._mark_calc_states_err(subchash, query, 0, norm_err)
            return
        # save additional concordance actions to cache (e.g. sample, aligned corpus without a query,...)
        for act in range(calc_from, len(query)):
            try:
                # the first character of an operation is the command, the rest are its arguments
                command, args = query[act][0], query[act][1:]
                conc.exec_command(command, args)
                # NOTE(review): this check runs after the command has been executed
                if command in 'gae':  # user specific/volatile actions, cannot save
                    raise NotImplementedError(
                        f'Cannot run command {command} in background')  # TODO
                calc_status = self.cache_map.get_calc_status(
                    subchash, query[:act + 1])
                conc.save(calc_status.cachefile)
                os.chmod(calc_status.cachefile, 0o664)
                self.cache_map.update_calc_status(subchash,
                                                  query[:act + 1],
                                                  readable=True,
                                                  finished=True,
                                                  concsize=conc.size())
            except Exception as ex:
                # the failed operation and all the following ones are marked as failed
                self._mark_calc_states_err(subchash, query, act, ex)
                logging.getLogger(__name__).error(ex)
                return
Ejemplo n.º 14
0
    def __call__(self, initial_args, subc_dirs, corpus_name, subc_name,
                 subchash, query, samplesize):
        """
        Calculate a concordance in background, storing partial and final
        results to the cache file and the progress to the cache mapping.

        Nothing is calculated if initial_args['already_running'] is truthy.
        Exceptions are not propagated; the error is written to the cache
        mapping (if the mapping is already available).

        arguments:
        initial_args -- a dict(cachefile=..., already_running=...)
        subc_dirs -- a list of directories where to look for subcorpora
        corpus_name -- a corpus identifier
        subc_name -- subcorpus name (should be None if not present)
        subchash -- an identifier of current subcorpus (None if no subcorpus is in use)
        query -- a tuple/list containing current query
        samplesize -- row limit
        """
        cache_map = None
        try:
            corp = CorpusManager(subcpath=subc_dirs).get_corpus(
                corpus_name, subcname=subc_name)
            cache_map = self._cache_factory.get_mapping(corp)
            if initial_args['already_running']:
                return

            # The concordance is asynchronous: the object is available at once
            # but the data may still be incomplete (see its finished() method).
            conc = self.compute_conc(corp, query, samplesize)
            sleeptime = 0.1
            time.sleep(sleeptime)
            cachefile = initial_args['cachefile']
            tmp_cachefile = cachefile + '.tmp'
            conc.save(cachefile, False, True)  # first partial snapshot
            os.chmod(cachefile, 0o664)
            cache_map.update_calc_status(
                subchash, query, readable=True, task_id=self._task_id)

            while not conc.finished():
                # write aside and rename over the actual cache file
                conc.save(tmp_cachefile, False, True)
                os.rename(tmp_cachefile, cachefile)
                sizes = self.get_cached_conc_sizes(corp, query)
                cache_map.update_calc_status(
                    subchash, query,
                    finished=sizes['finished'],
                    concsize=sizes['concsize'],
                    fullsize=sizes['fullsize'],
                    relconcsize=sizes['relconcsize'],
                    arf=None,
                    task_id=self._task_id)
                time.sleep(sleeptime)
                sleeptime += 0.1

            # the complete concordance
            conc.save(tmp_cachefile)
            os.rename(tmp_cachefile, cachefile)
            os.chmod(cachefile, 0o664)
            sizes = self.get_cached_conc_sizes(corp, query)
            # ARF is stored only if the corpus is not a subcorpus
            cache_map.update_calc_status(
                subchash, query,
                finished=sizes['finished'],
                concsize=conc.size(),
                fullsize=sizes['fullsize'],
                relconcsize=sizes['relconcsize'],
                arf=None if corp.is_subcorpus else round(conc.compute_ARF(), 2),
                task_id=self._task_id)
        except Exception as e:
            # Nothing is cleaned up here (unfinished cache file etc.); the
            # original author notes that _get_cached_conc() takes care of it
            # once it detects a problem.
            norm_err = extract_manatee_error(e) or e
            if cache_map is None:
                return
            cache_map.update_calc_status(
                subchash, query, finished=True, error=norm_err)
Ejemplo n.º 15
0
    def __call__(self, initial_args, subc_dirs, corpus_name, subc_name, subchash, query, samplesize):
        """
        Calculate a concordance in background, storing partial and final
        results to the cache file and the progress to the cache mapping.

        Nothing is calculated if initial_args['already_running'] is truthy.
        Exceptions are not propagated; they are logged and (if the cache
        mapping is already available) stored to the calculation status.

        arguments:
        initial_args -- a dict(cachefile=..., already_running=...)
        subc_dirs -- a list of directories where to look for subcorpora
        corpus_name -- a corpus identifier
        subc_name -- subcorpus name (should be None if not present)
        subchash -- an identifier of current subcorpus (None if no subcorpus is in use)
        query -- a tuple/list containing current query
        samplesize -- row limit
        """
        sleeptime = None
        cache_map = None
        try:
            corpus_manager = CorpusManager(subcpath=subc_dirs)
            corpus_obj = corpus_manager.get_Corpus(corpus_name, subcname=subc_name)
            cache_map = self._cache_factory.get_mapping(corpus_obj)

            if not initial_args['already_running']:
                # The conc object bellow is asynchronous; i.e. you obtain it immediately but it may
                # not be ready yet (this is checked by the 'finished()' method).
                conc = self.compute_conc(corpus_obj, query, samplesize)
                sleeptime = 0.1
                time.sleep(sleeptime)
                conc.save(initial_args['cachefile'], False, True, False)  # partial
                while not conc.finished():
                    # TODO it looks like append=True does not work with Manatee 2.121.1 properly
                    # (that is why the data are saved aside and the file is renamed)
                    tmp_cachefile = initial_args['cachefile'] + '.tmp'
                    conc.save(tmp_cachefile, False, True, False)
                    os.rename(tmp_cachefile, initial_args['cachefile'])
                    time.sleep(sleeptime)
                    sleeptime += 0.1
                    sizes = self.get_cached_conc_sizes(corpus_obj, query, initial_args['cachefile'])
                    cache_map.update_calc_status(subchash, query, dict(
                        curr_wait=sleeptime,
                        finished=sizes['finished'],
                        concsize=sizes['concsize'],
                        fullsize=sizes['fullsize'],
                        relconcsize=sizes['relconcsize'],
                        arf=None,
                        task_id=self._task_id))
                tmp_cachefile = initial_args['cachefile'] + '.tmp'
                conc.save(tmp_cachefile)  # whole
                os.rename(tmp_cachefile, initial_args['cachefile'])
                sizes = self.get_cached_conc_sizes(corpus_obj, query, initial_args['cachefile'])
                # ARF is stored only if the corpus is not a subcorpus
                cache_map.update_calc_status(subchash, query, dict(
                    curr_wait=sleeptime,
                    finished=sizes['finished'],
                    concsize=sizes['concsize'],
                    fullsize=sizes['fullsize'],
                    relconcsize=sizes['relconcsize'],
                    arf=round(conc.compute_ARF(), 2) if not is_subcorpus(corpus_obj) else None,
                    task_id=self._task_id))
                # update size in map file
                cache_map.add_to_map(subchash, query, conc.size())
        except Exception as e:
            # Please note that there is no need to clean any mess (unfinished cached concordance etc.)
            # here as this is performed by _get_cached_conc()
            # function in case it detects a problem.
            import traceback
            logging.getLogger(__name__).error('Background calculation error: %s' % e)
            logging.getLogger(__name__).error(''.join(traceback.format_exception(*sys.exc_info())))
            if cache_map is not None:
                # Built-in exceptions have no 'message' attribute in Python 3,
                # so str(e) is tried before the bare class name is used;
                # otherwise the actual error text would be lost.
                err_msg = getattr(e, 'message', None) or str(e) or e.__class__.__name__
                cache_map.update_calc_status(
                    subchash, query, dict(
                        finished=True,
                        curr_wait=sleeptime,
                        error=err_msg))
Ejemplo n.º 16
0
 def __init__(self, conf, ident):
     """
     arguments:
     conf -- a configuration providing get(); if its 'corpus' value is
             truthy, the respective corpus is opened and kept as a preset
             corpus (otherwise the preset corpus is None)
     ident -- passed to the parent constructor
     """
     super().__init__(ident)
     self._conf = conf
     preset_corp = None
     fixed_corp = conf.get('corpus')
     if fixed_corp:
         preset_corp = CorpusManager().get_corpus(fixed_corp)
     self._preset_corp = preset_corp
Ejemplo n.º 17
0
    def __call__(self, initial_args, subc_dirs, corpus_name, subc_name,
                 subchash, query, samplesize):
        """
        Calculate a concordance in background, storing partial and final
        results to the cache file and the progress to the cache mapping.

        Nothing is calculated if initial_args['already_running'] is truthy.
        Exceptions are not propagated; they are logged and (if the cache
        mapping is already available) stored to the calculation status.

        arguments:
        initial_args -- a dict(cachefile=..., already_running=...)
        subc_dirs -- a list of directories where to look for subcorpora
        corpus_name -- a corpus identifier
        subc_name -- subcorpus name (should be None if not present)
        subchash -- an identifier of current subcorpus (None if no subcorpus is in use)
        query -- a tuple/list containing current query
        samplesize -- row limit
        """
        size_keys = ('finished', 'concsize', 'fullsize', 'relconcsize')
        cache_map = None
        try:
            corp = CorpusManager(subcpath=subc_dirs).get_Corpus(
                corpus_name, subcname=subc_name)
            cache_map = self._cache_factory.get_mapping(corp)
            if initial_args['already_running']:
                return

            # The concordance is asynchronous: the object is available at once
            # but the data may still be incomplete (see its finished() method).
            conc = self.compute_conc(corp, query, samplesize)
            sleeptime = 0.1
            time.sleep(sleeptime)
            cachefile = initial_args['cachefile']
            tmp_cachefile = cachefile + '.tmp'
            conc.save(cachefile, False, True, False)  # first partial snapshot
            while not conc.finished():
                # Each snapshot is written aside and renamed over the cache
                # file (the original code notes that append=True does not
                # seem to work properly with Manatee 2.121.1).
                conc.save(tmp_cachefile, False, True, False)
                os.rename(tmp_cachefile, cachefile)
                time.sleep(sleeptime)
                sleeptime += 0.1
                sizes = self.get_cached_conc_sizes(corp, query, cachefile)
                cache_map.update_calc_status(
                    subchash, query,
                    **{key: sizes[key] for key in size_keys},
                    arf=None,
                    task_id=self._task_id)

            # the complete concordance
            conc.save(tmp_cachefile)
            os.rename(tmp_cachefile, cachefile)
            sizes = self.get_cached_conc_sizes(corp, query, cachefile)
            # ARF is stored only if the corpus is not a subcorpus
            cache_map.update_calc_status(
                subchash, query,
                **{key: sizes[key] for key in size_keys},
                arf=None if is_subcorpus(corp) else round(conc.compute_ARF(), 2),
                task_id=self._task_id)
            # store the final size to the map file
            cache_map.add_to_map(subchash, query, conc.size())
        except Exception as e:
            # Nothing is cleaned up here (unfinished cache file etc.); the
            # original author notes that _get_cached_conc() takes care of it
            # once it detects a problem.
            import traceback
            logger = logging.getLogger(__name__)
            logger.error('Background calculation error: %s' % e)
            logger.error(''.join(traceback.format_exception(*sys.exc_info())))
            if cache_map is None:
                return
            cache_map.update_calc_status(
                subchash, query, finished=True, error=e)
Ejemplo n.º 18
0
def find_cached_conc_base(
        corp: AbstractKCorpus, subchash: Optional[str], q: Tuple[str, ...],
        minsize: int) -> Tuple[Optional[int], Union[PyConc, InitialConc]]:
    """
    Load a concordance from cache starting from a complete operation q[:],
    then trying q[:-1], q[:-2], q:[:-i] etc. A possible found concordance can be
    used to skip calculation of already available operations q[:-i].

    If the query contains a shuffle operation (as detected by
    _contains_shuffle_seq), only the first operation q[:1] is looked up.

    arguments:
    corp -- a corpus the concordance belongs to
    subchash -- a subcorpus identifier used as a part of the cache key
    q -- a sequence of query operations
    minsize -- a minimum concordance size to return immediately (synchronously); please
                note that unlike wait_for_conc here we accept also 0

    returns:
    a 2-tuple [an index within 'q' where to start with non-cached results], [a concordance instance]
    (i.e. (0, InitialConc) if nothing usable has been found)

    raises:
    the 'normalized_error' of the cached calculation status in case the
    status of the whole query 'q' carries an error (the record is removed
    before the error is raised)
    """
    corpus_manager = CorpusManager(subcpath=[])
    start_time = time.time()
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    cache_map.refresh_map()
    calc_status = cache_map.get_calc_status(subchash, q)
    if calc_status:
        if calc_status.error is None:
            # a record created before the corpus was last modified is stale
            if calc_status.created - corp.corp_mtime < 0:
                logging.getLogger(__name__).warning(
                    'Removed outdated cache file (older than corpus indices)')
                cache_map.del_full_entry(subchash, q)
        else:
            logging.getLogger(__name__).warning(
                'Removed failed calculation cache record (error: {0}'.format(
                    calc_status.error))
            cache_map.del_full_entry(subchash, q)
            raise calc_status.normalized_error

    if _contains_shuffle_seq(q):
        srch_from = 1
    else:
        srch_from = len(q)

    conc = InitialConc(corp=corp)
    ans = (0, conc)
    # try to find the most complete cached operation
    # (e.g. query + filter + sample)
    for i in range(srch_from, 0, -1):
        cache_path = cache_map.readable_cache_path(subchash, q[:i])
        # now we know that someone already calculated the conc (but it might not be finished yet)
        if cache_path:
            try:
                ready = wait_for_conc(cache_map=cache_map,
                                      subchash=subchash,
                                      q=q[:i],
                                      minsize=minsize)
                if not ready:
                    if minsize != 0:
                        cancel_conc_task(cache_map, subchash, q[:i])
                        logging.getLogger(__name__).warning(
                            'Removed unfinished concordance cache record due to exceeded time limit'
                        )
                    continue
                _, finished = _check_result(cache_map=cache_map,
                                            subchash=subchash,
                                            q=q[:i],
                                            minsize=minsize)
                if finished:
                    mcorp = corp
                    for qq in reversed(
                            q[:i]):  # find the right main corp, if aligned
                        if qq.startswith('x-'):
                            mcorp = corpus_manager.get_corpus(qq[2:])
                            break
                    conc = PyConc(mcorp, 'l', cache_path, orig_corp=corp)
            except (ConcCalculationStatusException,
                    manatee.FileAccessError) as ex:
                logging.getLogger(__name__).error(
                    f'Failed to use cached concordance for {q[:i]}: {ex}')
                cancel_conc_task(cache_map, subchash, q[:i])
                continue
            ans = (i, conc)
            break
    # The returned start index (ans[0]) is logged rather than the loop
    # variable: the latter is undefined for an empty query and it holds 1
    # (not 0) when the loop ends without finding anything.
    logging.getLogger(__name__).debug(
        f'find_cached_conc_base({corp.get_conffile()}, [{", ".join(q)}]), '
        f'conc: {conc.__class__.__name__}, '
        f'must calc ops from {ans[0]} to {len(q)}, '
        f'time: {(time.time() - start_time):.4f}')
    return ans