def get_cached_conc_sizes(self, corp, q=None, cachefile=None):
    """
    Read current (possibly intermediate) concordance sizes from a cache file.

    arguments:
    corp -- manatee.Corpus instance
    q -- a list containing preprocessed query
    cachefile -- if not provided then the path is determined automatically
    using CACHE_ROOT_DIR and corpus name, corpus name and the query

    returns:
    a dictionary {
        finished : 0/1,
        concsize : int,
        fullsize : int,
        relconcsize : float (concordance size recalculated to a million corpus),
        arf : ARF of the result (this is calculated only for the finished
              result, i.e. no intermediate values)
    }

    raises:
    ConcCalculationStatusException -- if the stored calculation status
    carries an error
    """
    import struct
    if q is None:
        q = []
    # 'arf' is pre-set too so the returned dict always has the same keys,
    # even when the cache file does not exist yet
    ans = dict(finished=False, concsize=None, fullsize=None, relconcsize=None, arf=None)
    if not cachefile:  # AJAX call
        q = tuple(q)
        subchash = getattr(corp, 'subchash', None)
        cache_map = self._cache_factory.get_mapping(corp)
        cachefile = cache_map.cache_file_path(subchash, q)
        status = cache_map.get_calc_status(subchash, q)
        if status.error is not None:
            raise ConcCalculationStatusException(
                'Concordance calculation failed', status.error)
    if cachefile and os.path.isfile(cachefile):
        # context manager ensures the file handle is closed
        # (the original code leaked it)
        with open(cachefile, 'rb') as cache:
            # fixed offsets within Manatee's concordance cache header:
            # byte 15 = finished flag, bytes 16-23 = full size (int64),
            # bytes 32-35 = concordance size (int32)
            # NOTE(review): offsets assumed stable across Manatee versions - verify
            cache.seek(15)
            finished = bool(ord(cache.read(1)))
            (fullsize, ) = struct.unpack('q', cache.read(8))
            cache.seek(32)
            (concsize, ) = struct.unpack('i', cache.read(4))
        # relative size = hits per million positions of the searched corpus
        if fullsize > 0:
            relconcsize = 1000000.0 * fullsize / corp.search_size()
        else:
            relconcsize = 1000000.0 * concsize / corp.search_size()
        # ARF is only meaningful for a finished, non-subcorpus result
        if finished and not is_subcorpus(corp):
            conc = manatee.Concordance(corp, cachefile)
            result_arf = round(conc.compute_ARF(), 2)
        else:
            result_arf = None
        ans['finished'] = finished
        ans['concsize'] = concsize
        ans['fullsize'] = fullsize
        ans['relconcsize'] = relconcsize
        ans['arf'] = result_arf
    return ans
def get_cached_conc_sizes(self, corp, q=None, cachefile=None):
    """
    Read current (possibly intermediate) concordance sizes from a cache file.

    arguments:
    corp -- manatee.Corpus instance
    q -- a list containing preprocessed query
    cachefile -- if not provided then the path is determined automatically
    using CACHE_ROOT_DIR and corpus name, corpus name and the query

    returns:
    a dictionary {
        finished : 0/1,
        concsize : int,
        fullsize : int,
        relconcsize : float (concordance size recalculated to a million corpus),
        arf : ARF of the result (this is calculated only for the finished
              result, i.e. no intermediate values)
    }

    raises:
    ConcCalculationStatusException -- if the stored calculation status
    carries an error
    """
    import struct
    if q is None:
        q = []
    # include 'arf' in the initial dict so callers always see the same keys,
    # even when the cache file does not exist yet
    ans = dict(finished=False, concsize=None, fullsize=None, relconcsize=None, arf=None)
    if not cachefile:  # AJAX call
        q = tuple(q)
        subchash = getattr(corp, 'subchash', None)
        cache_map = self._cache_factory.get_mapping(corp)
        cachefile = cache_map.cache_file_path(subchash, q)
        status = cache_map.get_calc_status(subchash, q)
        if status.error is not None:
            raise ConcCalculationStatusException('Concordance calculation failed', status.error)
    if cachefile and os.path.isfile(cachefile):
        # 'with' guarantees the handle is closed (the original leaked it)
        with open(cachefile, 'rb') as cache:
            # fixed offsets in Manatee's concordance cache header:
            # byte 15 = finished flag, bytes 16-23 = full size (int64),
            # bytes 32-35 = concordance size (int32)
            # NOTE(review): offsets assumed stable across Manatee versions - verify
            cache.seek(15)
            finished = bool(ord(cache.read(1)))
            (fullsize,) = struct.unpack('q', cache.read(8))
            cache.seek(32)
            (concsize,) = struct.unpack('i', cache.read(4))
        # relative size = hits per million positions of the searched corpus
        if fullsize > 0:
            relconcsize = 1000000.0 * fullsize / corp.search_size()
        else:
            relconcsize = 1000000.0 * concsize / corp.search_size()
        # ARF only makes sense for a finished, non-subcorpus result
        if finished and not is_subcorpus(corp):
            conc = manatee.Concordance(corp, cachefile)
            result_arf = round(conc.compute_ARF(), 2)
        else:
            result_arf = None
        ans['finished'] = finished
        ans['concsize'] = concsize
        ans['fullsize'] = fullsize
        ans['relconcsize'] = relconcsize
        ans['arf'] = result_arf
    return ans
def __call__(self, initial_args, subc_dirs, corpus_name, subc_name, subchash, query, samplesize):
    """
    Run the background concordance calculation, periodically persisting
    partial results to the cache file and updating the calc-status record.

    initial_args -- a dict(cachefile=..., already_running=...)
    subc_dirs -- a list of directories where to look for subcorpora
    corpus -- a corpus identifier
    subc_name -- subcorpus name (should be None if not present)
    subchash -- an identifier of current subcorpus (None if no subcorpus is in use)
    query -- a tuple/list containing current query
    samplesize -- row limit
    """
    cache_map = None
    try:
        corpus_manager = CorpusManager(subcpath=subc_dirs)
        corpus_obj = corpus_manager.get_Corpus(corpus_name, subcname=subc_name)
        cache_map = self._cache_factory.get_mapping(corpus_obj)
        if not initial_args['already_running']:
            # The conc object bellow is asynchronous; i.e. you obtain it immediately but it may
            # not be ready yet (this is checked by the 'finished()' method).
            conc = self.compute_conc(corpus_obj, query, samplesize)
            sleeptime = 0.1
            time.sleep(sleeptime)
            conc.save(initial_args['cachefile'], False, True, False)  # partial
            while not conc.finished():
                # TODO it looks like append=True does not work with Manatee 2.121.1 properly
                # save to a temp file and rename so readers never see a
                # half-written cache file
                tmp_cachefile = initial_args['cachefile'] + '.tmp'
                conc.save(tmp_cachefile, False, True, False)
                os.rename(tmp_cachefile, initial_args['cachefile'])
                # back off progressively while waiting for Manatee to finish
                time.sleep(sleeptime)
                sleeptime += 0.1
                sizes = self.get_cached_conc_sizes(
                    corpus_obj, query, initial_args['cachefile'])
                # intermediate status update: ARF is not available yet
                cache_map.update_calc_status(
                    subchash, query, finished=sizes['finished'], concsize=sizes['concsize'],
                    fullsize=sizes['fullsize'], relconcsize=sizes['relconcsize'], arf=None,
                    task_id=self._task_id)
            # final save: the whole concordance (again via tmp + rename)
            tmp_cachefile = initial_args['cachefile'] + '.tmp'
            conc.save(tmp_cachefile)  # whole
            os.rename(tmp_cachefile, initial_args['cachefile'])
            sizes = self.get_cached_conc_sizes(corpus_obj, query, initial_args['cachefile'])
            # final status update; ARF is only computed for non-subcorpus results
            cache_map.update_calc_status(
                subchash, query, finished=sizes['finished'], concsize=sizes['concsize'],
                fullsize=sizes['fullsize'], relconcsize=sizes['relconcsize'],
                arf=round(conc.compute_ARF(), 2) if not is_subcorpus(corpus_obj) else None,
                task_id=self._task_id)
            # update size in map file
            cache_map.add_to_map(subchash, query, conc.size())
    except Exception as e:
        # Please note that there is no need to clean any mess (unfinished cached concordance etc.)
        # here as this is performed by _get_cached_conc()
        # function in case it detects a problem.
        import traceback
        logging.getLogger(__name__).error(
            'Background calculation error: %s' % e)
        logging.getLogger(__name__).error(''.join(
            traceback.format_exception(*sys.exc_info())))
        # record the failure so waiting clients stop polling
        if cache_map is not None:
            cache_map.update_calc_status(subchash, query, finished=True, error=e)
def kwicpage(self, args):
    """
    Generates template data for page displaying provided concordance

    arguments:
    args -- a KwicArgs instance

    returns:
    KwicPageData converted into a dict
    """
    args.refs = getattr(args, 'refs', '').replace('.MAP_OUP', '')  # to be removed ...
    try:
        fromp = int(args.fromp)
        if fromp < 1:
            fromp = 1
    except (AttributeError, TypeError, ValueError):
        # missing or non-numeric page number defaults to the first page
        # (narrowed from a bare 'except:' which also hid unrelated errors)
        fromp = 1
    out = KwicPageData()
    pagination = Pagination()
    pagination.first_page = 1
    out.Lines = self.kwiclines(args.create_kwicline_args())
    self.add_aligns(out, args.create_kwicline_args(speech_segment=None))
    if len(out.CorporaColumns) == 0:
        out.CorporaColumns = [
            dict(n=self.corpus.corpname, label=self.corpus.get_conf('NAME'))
        ]
        out.KWICCorps = [self.corpus.corpname]
    if args.labelmap:
        out.GroupNumbers = format_labelmap(args.labelmap)
    if fromp > 1:
        pagination.prev_page = fromp - 1
    if self.conc.size() > args.pagesize:
        out.fromp = fromp
        # floor division keeps numofpages an int; plain '/' produces a float
        # under Python 3 and breaks the range() call below
        numofpages = (self.conc.size() - 1) // args.pagesize + 1
        if numofpages < 30:
            out.Page = [{'page': x} for x in range(1, numofpages + 1)]
        if fromp < numofpages:
            pagination.next_page = fromp + 1
        pagination.last_page = numofpages
    else:
        pagination.last_page = 1
    out.concsize = self.conc.size()
    # ARF is not defined for subcorpora
    if is_subcorpus(self.corpus):
        out.result_arf = ''
    else:
        out.result_arf = round(self.conc.compute_ARF(), 2)
    if is_subcorpus(self.corpus):
        corpsize = self.corpus.search_size(
        )  # TODO this is unverified solution trying to bypass possible manatee bug
    else:
        corpsize = self.corpus.size()
    # instances per million positions
    out.result_relative_freq = round(
        self.conc.size() / (float(corpsize) / 1e6), 2)
    if args.hidenone:
        for line, part in itertools.product(out.Lines, ('Kwic', 'Left', 'Right')):
            for item in line[part]:
                item['str'] = item['str'].replace('===NONE===', '')
    out.pagination = pagination.export()
    return dict(out)
def kwicpage(self, args):
    """
    Generates template data for page displaying provided concordance

    arguments:
    args -- a KwicArgs instance

    returns:
    KwicPageData converted into a dict
    """
    args.refs = getattr(args, 'refs', '').replace('.MAP_OUP', '')  # to be removed ...
    try:
        fromp = int(args.fromp)
        if fromp < 1:
            fromp = 1
    except (AttributeError, TypeError, ValueError):
        # missing or non-numeric page number defaults to the first page
        # (narrowed from a bare 'except:' which also hid unrelated errors)
        fromp = 1
    out = KwicPageData()
    pagination = Pagination()
    pagination.first_page = 1
    out.Lines = self.kwiclines(args.create_kwicline_args())
    self.add_aligns(out, args.create_kwicline_args(speech_segment=None))
    if len(out.CorporaColumns) == 0:
        out.CorporaColumns = [dict(n=self.corpus.corpname, label=self.corpus.get_conf('NAME'))]
        out.KWICCorps = [self.corpus.corpname]
    if args.labelmap:
        out.GroupNumbers = format_labelmap(args.labelmap)
    if fromp > 1:
        pagination.prev_page = fromp - 1
    if self.conc.size() > args.pagesize:
        out.fromp = fromp
        # floor division keeps numofpages an int; plain '/' produces a float
        # under Python 3 and breaks the range() call below
        numofpages = (self.conc.size() - 1) // args.pagesize + 1
        if numofpages < 30:
            out.Page = [{'page': x} for x in range(1, numofpages + 1)]
        if fromp < numofpages:
            pagination.next_page = fromp + 1
        pagination.last_page = numofpages
    else:
        pagination.last_page = 1
    out.concsize = self.conc.size()
    # ARF is not defined for subcorpora
    if is_subcorpus(self.corpus):
        out.result_arf = ''
    else:
        out.result_arf = round(self.conc.compute_ARF(), 2)
    if is_subcorpus(self.corpus):
        corpsize = self.corpus.search_size(
        )  # TODO this is unverified solution trying to bypass possible manatee bug
    else:
        corpsize = self.corpus.size()
    # instances per million positions
    out.result_relative_freq = round(
        self.conc.size() / (float(corpsize) / 1e6), 2)
    if args.hidenone:
        for line, part in itertools.product(out.Lines, ('Kwic', 'Left', 'Right')):
            for item in line[part]:
                item['str'] = item['str'].replace('===NONE===', '')
    out.pagination = pagination.export()
    return dict(out)
def __call__(self, initial_args, subc_dirs, corpus_name, subc_name, subchash, query, samplesize):
    """
    Run the background concordance calculation, periodically persisting
    partial results to the cache file and updating the calc-status record
    (including the current polling wait so clients can adapt).

    initial_args -- a dict(cachefile=..., already_running=...)
    subc_dirs -- a list of directories where to look for subcorpora
    corpus -- a corpus identifier
    subc_name -- subcorpus name (should be None if not present)
    subchash -- an identifier of current subcorpus (None if no subcorpus is in use)
    query -- a tuple/list containing current query
    samplesize -- row limit
    """
    # sleeptime must exist for the error handler even if we fail before
    # the calculation loop starts
    sleeptime = None
    cache_map = None
    try:
        corpus_manager = CorpusManager(subcpath=subc_dirs)
        corpus_obj = corpus_manager.get_Corpus(corpus_name, subcname=subc_name)
        cache_map = self._cache_factory.get_mapping(corpus_obj)
        if not initial_args['already_running']:
            # The conc object bellow is asynchronous; i.e. you obtain it immediately but it may
            # not be ready yet (this is checked by the 'finished()' method).
            conc = self.compute_conc(corpus_obj, query, samplesize)
            sleeptime = 0.1
            time.sleep(sleeptime)
            conc.save(initial_args['cachefile'], False, True, False)  # partial
            while not conc.finished():
                # TODO it looks like append=True does not work with Manatee 2.121.1 properly
                # save to a temp file and rename so readers never see a
                # half-written cache file
                tmp_cachefile = initial_args['cachefile'] + '.tmp'
                conc.save(tmp_cachefile, False, True, False)
                os.rename(tmp_cachefile, initial_args['cachefile'])
                # back off progressively while waiting for Manatee to finish
                time.sleep(sleeptime)
                sleeptime += 0.1
                sizes = self.get_cached_conc_sizes(corpus_obj, query, initial_args['cachefile'])
                # intermediate status update: ARF not available yet
                cache_map.update_calc_status(subchash, query, dict(
                    curr_wait=sleeptime,
                    finished=sizes['finished'],
                    concsize=sizes['concsize'],
                    fullsize=sizes['fullsize'],
                    relconcsize=sizes['relconcsize'],
                    arf=None,
                    task_id=self._task_id))
            # final save: the whole concordance (again via tmp + rename)
            tmp_cachefile = initial_args['cachefile'] + '.tmp'
            conc.save(tmp_cachefile)  # whole
            os.rename(tmp_cachefile, initial_args['cachefile'])
            sizes = self.get_cached_conc_sizes(corpus_obj, query, initial_args['cachefile'])
            # final status update; ARF is only computed for non-subcorpus results
            cache_map.update_calc_status(subchash, query, dict(
                curr_wait=sleeptime,
                finished=sizes['finished'],
                concsize=sizes['concsize'],
                fullsize=sizes['fullsize'],
                relconcsize=sizes['relconcsize'],
                arf=round(conc.compute_ARF(), 2) if not is_subcorpus(corpus_obj) else None,
                task_id=self._task_id))
            # update size in map file
            cache_map.add_to_map(subchash, query, conc.size())
    except Exception as e:
        # Please note that there is no need to clean any mess (unfinished cached concordance etc.)
        # here as this is performed by _get_cached_conc()
        # function in case it detects a problem.
        import traceback
        logging.getLogger(__name__).error('Background calculation error: %s' % e)
        logging.getLogger(__name__).error(''.join(traceback.format_exception(*sys.exc_info())))
        # record the failure so waiting clients stop polling; 'message'
        # fallback supports exceptions without that (Python 2 era) attribute
        if cache_map is not None:
            cache_map.update_calc_status(
                subchash, query,
                dict(
                    finished=True, curr_wait=sleeptime,
                    error=e.message if getattr(e, 'message', None) else e.__class__.__name__))