Example 1
    def get_cached_conc_sizes(self, corp, q=None, cachefile=None):
        """
        arguments:
        corp -- a manatee.Corpus instance
        q -- a list containing the preprocessed query
        cachefile -- if not provided, the path is determined automatically
        using CACHE_ROOT_DIR, the corpus name and the query

        returns:
        a dictionary {
            finished : bool,
            concsize : int,
            fullsize : int,
            relconcsize : float (concordance size normalized per million corpus positions),
            arf : ARF of the result (calculated only for a finished result, i.e. there are no intermediate values)
        }
        """
        import struct

        if q is None:
            q = []
        ans = dict(finished=False,
                   concsize=None,
                   fullsize=None,
                   relconcsize=None)
        if not cachefile:  # AJAX call
            q = tuple(q)
            subchash = getattr(corp, 'subchash', None)
            cache_map = self._cache_factory.get_mapping(corp)
            cachefile = cache_map.cache_file_path(subchash, q)
            status = cache_map.get_calc_status(subchash, q)
            if status.error is not None:
                raise ConcCalculationStatusException(
                    'Concordance calculation failed', status.error)

        if cachefile and os.path.isfile(cachefile):
            with open(cachefile, 'rb') as cache:  # close the handle once the header is read
                cache.seek(15)
                finished = bool(ord(cache.read(1)))
                (fullsize,) = struct.unpack('q', cache.read(8))
                cache.seek(32)
                (concsize,) = struct.unpack('i', cache.read(4))

            if fullsize > 0:
                relconcsize = 1000000.0 * fullsize / corp.search_size()
            else:
                relconcsize = 1000000.0 * concsize / corp.search_size()

            if finished and not is_subcorpus(corp):
                conc = manatee.Concordance(corp, cachefile)
                result_arf = round(conc.compute_ARF(), 2)
            else:
                result_arf = None

            ans['finished'] = finished
            ans['concsize'] = concsize
            ans['fullsize'] = fullsize
            ans['relconcsize'] = relconcsize
            ans['arf'] = result_arf
        return ans
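
The seek/unpack calls above imply a fixed binary header layout for the concordance cache file: byte 15 holds the 'finished' flag, bytes 16-23 an int64 full size, and bytes 32-35 an int32 concordance size. The following is a minimal standalone sketch of that layout; the remaining header bytes are unknown here and simply zero-padded, and the file path and sizes are made up.

import struct

def write_fake_header(path, finished, fullsize, concsize):
    buf = bytearray(36)                      # unknown header fields left zeroed
    buf[15] = 1 if finished else 0           # 'finished' flag byte
    buf[16:24] = struct.pack('q', fullsize)  # int64 full size
    buf[32:36] = struct.pack('i', concsize)  # int32 concordance size
    with open(path, 'wb') as f:
        f.write(bytes(buf))

def read_header(path):
    # the same reads get_cached_conc_sizes performs
    with open(path, 'rb') as cache:
        cache.seek(15)
        finished = bool(ord(cache.read(1)))
        (fullsize,) = struct.unpack('q', cache.read(8))
        cache.seek(32)
        (concsize,) = struct.unpack('i', cache.read(4))
    return finished, fullsize, concsize

write_fake_header('/tmp/conc.cache', True, 125000, 125000)
print(read_header('/tmp/conc.cache'))        # (True, 125000, 125000)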
Example 2
    def __call__(self, initial_args, subc_dirs, corpus_name, subc_name,
                 subchash, query, samplesize):
        """
        initial_args -- a dict(cachefile=..., already_running=...)
        subc_dirs -- a list of directories where to look for subcorpora
        corpus_name -- a corpus identifier
        subc_name -- subcorpus name (should be None if not present)
        subchash -- an identifier of current subcorpus (None if no subcorpus is in use)
        query -- a tuple/list containing current query
        samplesize -- row limit
        """
        cache_map = None
        try:
            corpus_manager = CorpusManager(subcpath=subc_dirs)
            corpus_obj = corpus_manager.get_Corpus(corpus_name,
                                                   subcname=subc_name)
            cache_map = self._cache_factory.get_mapping(corpus_obj)

            if not initial_args['already_running']:
                # The conc object below is asynchronous, i.e. it is returned immediately but it may
                # not be ready yet (this is checked by the 'finished()' method).
                conc = self.compute_conc(corpus_obj, query, samplesize)
                sleeptime = 0.1
                time.sleep(sleeptime)
                conc.save(initial_args['cachefile'], False, True, False)  # partial
                while not conc.finished():
                    # TODO it seems that append=True does not work properly with Manatee 2.121.1
                    tmp_cachefile = initial_args['cachefile'] + '.tmp'
                    conc.save(tmp_cachefile, False, True, False)
                    os.rename(tmp_cachefile, initial_args['cachefile'])
                    time.sleep(sleeptime)
                    sleeptime += 0.1
                    sizes = self.get_cached_conc_sizes(
                        corpus_obj, query, initial_args['cachefile'])
                    cache_map.update_calc_status(
                        subchash,
                        query,
                        finished=sizes['finished'],
                        concsize=sizes['concsize'],
                        fullsize=sizes['fullsize'],
                        relconcsize=sizes['relconcsize'],
                        arf=None,
                        task_id=self._task_id)
                tmp_cachefile = initial_args['cachefile'] + '.tmp'
                conc.save(tmp_cachefile)  # whole
                os.rename(tmp_cachefile, initial_args['cachefile'])
                sizes = self.get_cached_conc_sizes(corpus_obj, query,
                                                   initial_args['cachefile'])
                cache_map.update_calc_status(
                    subchash,
                    query,
                    finished=sizes['finished'],
                    concsize=sizes['concsize'],
                    fullsize=sizes['fullsize'],
                    relconcsize=sizes['relconcsize'],
                    arf=round(conc.compute_ARF(), 2) if not is_subcorpus(corpus_obj) else None,
                    task_id=self._task_id)
                # update size in map file
                cache_map.add_to_map(subchash, query, conc.size())
        except Exception as e:
            # Note that there is no need to clean up any mess (an unfinished cached
            # concordance etc.) here, as this is handled by the _get_cached_conc()
            # function when it detects a problem.
            import traceback
            logging.getLogger(__name__).error(
                'Background calculation error: %s' % e)
            logging.getLogger(__name__).error(''.join(
                traceback.format_exception(*sys.exc_info())))
            if cache_map is not None:
                cache_map.update_calc_status(subchash,
                                             query,
                                             finished=True,
                                             error=e)
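
The save loop above relies on a classic write-then-rename update: each partial snapshot is written to a .tmp file and then moved over the real cache file, so concurrent readers always see either the previous complete snapshot or the new one, never a half-written file. A minimal standalone sketch of the pattern (the path and payloads are illustrative only):

import os

def atomic_write(path, data):
    tmp_path = path + '.tmp'
    with open(tmp_path, 'wb') as f:
        f.write(data)
    # atomic on POSIX when both paths are on the same filesystem;
    # on Python 3, os.replace() is the portable equivalent
    os.rename(tmp_path, path)

atomic_write('/tmp/conc.cache', b'partial snapshot')
atomic_write('/tmp/conc.cache', b'complete result')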
Example 3
    def kwicpage(self, args):
        """
        Generates template data for page displaying provided concordance

        arguments:
            args -- a KwicArgs instance

        returns:
        KwicPageData converted into a dict
        """
        args.refs = getattr(args, 'refs', '').replace('.MAP_OUP', '')  # to be removed ...
        try:
            fromp = int(args.fromp)
            if fromp < 1:
                fromp = 1
        except (ValueError, TypeError):
            fromp = 1

        out = KwicPageData()
        pagination = Pagination()
        pagination.first_page = 1
        out.Lines = self.kwiclines(args.create_kwicline_args())
        self.add_aligns(out, args.create_kwicline_args(speech_segment=None))

        if len(out.CorporaColumns) == 0:
            out.CorporaColumns = [
                dict(n=self.corpus.corpname,
                     label=self.corpus.get_conf('NAME'))
            ]
            out.KWICCorps = [self.corpus.corpname]

        if args.labelmap:
            out.GroupNumbers = format_labelmap(args.labelmap)
        if fromp > 1:
            pagination.prev_page = fromp - 1
        if self.conc.size() > args.pagesize:
            out.fromp = fromp
            numofpages = (self.conc.size() - 1) // args.pagesize + 1  # integer ceiling division
            if numofpages < 30:
                out.Page = [{'page': x} for x in range(1, numofpages + 1)]
            if fromp < numofpages:
                pagination.next_page = fromp + 1
            pagination.last_page = numofpages
        else:
            pagination.last_page = 1
        out.concsize = self.conc.size()

        if is_subcorpus(self.corpus):
            out.result_arf = ''
        else:
            out.result_arf = round(self.conc.compute_ARF(), 2)

        if is_subcorpus(self.corpus):
            # TODO: an unverified workaround for a possible manatee bug
            corpsize = self.corpus.search_size()
        else:
            corpsize = self.corpus.size()
        out.result_relative_freq = round(
            self.conc.size() / (float(corpsize) / 1e6), 2)
        if args.hidenone:
            for line, part in itertools.product(out.Lines,
                                                ('Kwic', 'Left', 'Right')):
                for item in line[part]:
                    item['str'] = item['str'].replace('===NONE===', '')
        out.pagination = pagination.export()
        return dict(out)
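
The page count above is ceiling division in disguise: (size - 1) // pagesize + 1 equals ceil(size / pagesize) for positive sizes, so the last, possibly partial, page is still counted. A quick check with made-up values:

def numofpages(size, pagesize):
    # ceiling division: the last, possibly partial, page still counts
    return (size - 1) // pagesize + 1

assert numofpages(40, 40) == 1    # exactly one full page
assert numofpages(41, 40) == 2    # one spilled line adds a page
assert numofpages(1000, 40) == 25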
Example 4
    def __call__(self, initial_args, subc_dirs, corpus_name, subc_name, subchash, query, samplesize):
        """
        initial_args -- a dict(cachefile=..., already_running=...)
        subc_dirs -- a list of directories where to look for subcorpora
        corpus_name -- a corpus identifier
        subc_name -- subcorpus name (should be None if not present)
        subchash -- an identifier of current subcorpus (None if no subcorpus is in use)
        query -- a tuple/list containing current query
        samplesize -- row limit
        """
        sleeptime = None
        cache_map = None
        try:
            corpus_manager = CorpusManager(subcpath=subc_dirs)
            corpus_obj = corpus_manager.get_Corpus(corpus_name, subcname=subc_name)
            cache_map = self._cache_factory.get_mapping(corpus_obj)

            if not initial_args['already_running']:
                # The conc object below is asynchronous, i.e. it is returned immediately but it may
                # not be ready yet (this is checked by the 'finished()' method).
                conc = self.compute_conc(corpus_obj, query, samplesize)
                sleeptime = 0.1
                time.sleep(sleeptime)
                conc.save(initial_args['cachefile'], False, True, False)  # partial
                while not conc.finished():
                    # TODO it seems that append=True does not work properly with Manatee 2.121.1
                    tmp_cachefile = initial_args['cachefile'] + '.tmp'
                    conc.save(tmp_cachefile, False, True, False)
                    os.rename(tmp_cachefile, initial_args['cachefile'])
                    time.sleep(sleeptime)
                    sleeptime += 0.1
                    sizes = self.get_cached_conc_sizes(corpus_obj, query, initial_args['cachefile'])
                    cache_map.update_calc_status(subchash, query, dict(
                        curr_wait=sleeptime,
                        finished=sizes['finished'],
                        concsize=sizes['concsize'],
                        fullsize=sizes['fullsize'],
                        relconcsize=sizes['relconcsize'],
                        arf=None,
                        task_id=self._task_id))
                tmp_cachefile = initial_args['cachefile'] + '.tmp'
                conc.save(tmp_cachefile)  # whole
                os.rename(tmp_cachefile, initial_args['cachefile'])
                sizes = self.get_cached_conc_sizes(corpus_obj, query, initial_args['cachefile'])
                cache_map.update_calc_status(subchash, query, dict(
                    curr_wait=sleeptime,
                    finished=sizes['finished'],
                    concsize=sizes['concsize'],
                    fullsize=sizes['fullsize'],
                    relconcsize=sizes['relconcsize'],
                    arf=round(conc.compute_ARF(), 2) if not is_subcorpus(corpus_obj) else None,
                    task_id=self._task_id))
                # update size in map file
                cache_map.add_to_map(subchash, query, conc.size())
        except Exception as e:
            # Note that there is no need to clean up any mess (an unfinished cached
            # concordance etc.) here, as this is handled by the _get_cached_conc()
            # function when it detects a problem.
            import traceback
            logging.getLogger(__name__).error('Background calculation error: %s' % e)
            logging.getLogger(__name__).error(''.join(traceback.format_exception(*sys.exc_info())))
            if cache_map is not None:
                cache_map.update_calc_status(
                    subchash, query, dict(
                        finished=True,
                        curr_wait=sleeptime,
                        error=str(e) or e.__class__.__name__))
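
Both get_cached_conc_sizes and kwicpage normalize the absolute concordance size to hits per million corpus positions. A worked example with made-up numbers:

corpsize = 120000000      # corpus (or subcorpus search) size in positions; made up
concsize = 5400           # number of concordance hits; made up
relconcsize = 1000000.0 * concsize / corpsize
print(round(relconcsize, 2))   # 45.0 hits per million positions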