Beispiel #1
0
def calculate_colls_bg(coll_args):
    """
    Background collocations calculation.
    This function is expected to be run either
    from Celery or from other process (via multiprocessing).
    """
    cm = corplib.CorpusManager(subcpath=coll_args.subcpath)
    corp = cm.get_Corpus(coll_args.corpname, subcname=coll_args.subcname)
    try:
        # try to fetch precalculated data; if none then MissingSubCorpFreqFile
        corplib.frq_db(corp, coll_args.cattr)
        conc = conclib.get_conc(corp=corp, user_id=coll_args.user_id, minsize=coll_args.minsize, q=coll_args.q,
                                fromp=0, pagesize=0, async=0, save=coll_args.save, samplesize=coll_args.samplesize)
        if not conc.finished():
            raise UnfinishedConcordanceError(
                _('Cannot calculate yet - source concordance not finished. Please try again later.'))
        collocs = conc.collocs(cattr=coll_args.cattr, csortfn=coll_args.csortfn, cbgrfns=coll_args.cbgrfns,
                               cfromw=coll_args.cfromw, ctow=coll_args.ctow, cminfreq=coll_args.cminfreq,
                               cminbgr=coll_args.cminbgr, max_lines=coll_args.num_fetch_items)
        for item in collocs['Items']:
            item['pfilter'] = [('q2', item['pfilter'])]
            item['nfilter'] = [('q2', item['nfilter'])]
            item['str'] = import_string(item['str'], from_encoding=coll_args.corpus_encoding)
        return dict(data=collocs, processing=0, tasks=[])
    except corplib.MissingSubCorpFreqFile as e:
        ans = {'attrname': coll_args.cattr, 'tasks': []}
        out = freq_calc.build_arf_db(e.args[0], coll_args.cattr)
        if type(out) is list:
            processing = 1
            ans['tasks'].extend(out)
        else:
            processing = 0
        ans['processing'] = processing
        ans['data'] = dict(Items=[], Head=[])
        return ans
Beispiel #2
0
 def run(self, tt_query, cql, path, publish_path):
     """
     Create (and optionally publish) a subcorpus from a concordance.

     returns:
     True in case of success

     raises:
     EmptySubcorpusException in case of an empty subcorpus or a failed
     subcorpus creation from the concordance
     """
     # asnc=0 => the concordance is calculated synchronously within this worker
     conc = conclib.get_conc(self._corp, self._user_id, q=cql, asnc=0)
     if conc.size() == 0:
         raise EmptySubcorpusException('Empty subcorpus')
     ans = corplib.subcorpus_from_conc(path, conc)
     if ans is False:
         raise EmptySubcorpusException(
             'Failed to create the subcorpus from a concordance')
     if not os.path.isfile(
             path):  # this should not happen but it looks like it did
         logging.getLogger(__name__).warning(
             'Sync. called conc. file not created (path: {})'.format(path))
         # best-effort wait for the file to appear; see the warning above
         time.sleep(5)
     # we must set write perms for group as this is created by Celery and we won't be
     # able to create hardlinks otherwise
     os.chmod(path, 0o664)
     if publish_path:
         corplib.mk_publish_links(path, publish_path, self._author,
                                  self._description)
     return ans
Beispiel #3
0
def calc_freqs_bg(args):
    """
    Calculate actual frequency data.

    arguments:
    args -- a FreqCalsArgs instance

    returns:
    a dict(freqs=..., conc_size=...)
    """

    cm = corplib.CorpusManager(subcpath=args.subcpath)
    corp = cm.get_Corpus(args.corpname, args.subcname)
    conc = conclib.get_conc(corp=corp,
                            user_id=args.user_id,
                            minsize=args.minsize,
                            q=args.q,
                            fromp=args.fromp,
                            pagesize=args.pagesize,
                            async=0,
                            save=args.save,
                            samplesize=args.samplesize)
    if not conc.finished():
        raise UnfinishedConcordanceError(
            _('Cannot calculate yet - source concordance not finished. Please try again later.'
              ))
    freqs = [
        conc.xfreq_dist(cr, args.flimit, args.freq_sort, args.ml,
                        args.ftt_include_empty, args.rel_mode,
                        args.collator_locale) for cr in args.fcrit
    ]
    return dict(freqs=freqs, conc_size=conc.size())
Beispiel #4
0
    def _create_subcorpus(self, request):
        """
        Create a new subcorpus based on a submitted form.

        req. arguments:
        subcname -- name of new subcorpus
        create -- bool, sets whether to create new subcorpus
        cql -- custom within condition

        returns:
        an empty dict on success

        raises:
        UserActionException -- no subcorpus name or no specification provided
        ConcError -- the resulting subcorpus would be empty
        """
        subcname = request.form['subcname']
        within_json = request.form.get('within_json')
        raw_cql = request.form.get('cql')
        corp_encoding = self._corp().get_conf('ENCODING')
        # fix: 'within_cql' must always be bound -- previously, an empty
        # text-type query in the 'else' branch caused a NameError when the
        # 'elif len(tt_query) > 1 or within_cql' test below evaluated it
        within_cql = None

        if raw_cql:
            # user supplied a raw CQL 'within' condition
            tt_query = ()
            within_cql = raw_cql
            full_cql = 'aword,[] %s' % raw_cql
            imp_cql = (full_cql,)
        elif within_json:  # user entered a subcorpus query manually
            tt_query = ()
            within_cql = self._deserialize_custom_within(json.loads(within_json))
            full_cql = 'aword,[] %s' % within_cql
            imp_cql = (full_cql,)
        else:
            # build the query from selected text-type values
            tt_query = TextTypeCollector(self._corp(), request).get_query()
            full_cql = ' within '.join(['<%s %s />' % item for item in tt_query])
            full_cql = 'aword,[] within %s' % full_cql
            full_cql = import_string(full_cql, from_encoding=corp_encoding)
            imp_cql = (full_cql,)
        # a corpus id may carry a variant suffix after ':' -- strip it
        basecorpname = self.args.corpname.split(':')[0]
        if not subcname:
            raise UserActionException(_('No subcorpus name specified!'))
        path = self.prepare_subc_path(basecorpname, subcname)

        if isinstance(path, unicode):
            path = path.encode('utf-8')

        if len(tt_query) == 1:
            # a single structure constraint can be applied directly
            result = corplib.create_subcorpus(path, self._corp(), tt_query[0][0], tt_query[0][1])
        elif len(tt_query) > 1 or within_cql:
            # multiple constraints (or a custom CQL) require a concordance first
            conc = conclib.get_conc(self._corp(), self._session_get('user', 'user'), q=imp_cql)
            conc.sync()
            # NOTE(review): len(tt_query) == 1 cannot be true in this branch,
            # so 'struct' is effectively always None here -- confirm intent
            struct = self._corp().get_struct(tt_query[0][0]) if len(tt_query) == 1 else None
            result = corplib.subcorpus_from_conc(path, conc, struct)
        else:
            raise UserActionException(_('Nothing specified!'))

        if result:
            if plugins.has_plugin('subc_restore'):
                # best-effort backup of the defining query; failure only warns
                try:
                    plugins.get('subc_restore').store_query(user_id=self._session_get('user', 'id'),
                                                            corpname=self.args.corpname,
                                                            subcname=subcname,
                                                            cql=full_cql.split('[]')[-1])
                except Exception as e:
                    logging.getLogger(__name__).warning('Failed to store subcorpus query: %s' % e)
                    self.add_system_message('warning',
                                            _('Subcorpus created but there was a problem saving a backup copy.'))
            return {}
        else:
            raise ConcError(_('Empty subcorpus!'))
Beispiel #5
0
 def run(self):
     """
     note: this is called by Celery worker
     """
     cm = corplib.CorpusManager(subcpath=self._args.subcpath)
     self._corp = cm.get_Corpus(self._args.corpname, subcname=self._args.subcname)
     self._conc = conclib.get_conc(corp=self._corp, user_id=self._args.user_id, minsize=self._args.minsize,
                                   q=self._args.q, fromp=0, pagesize=0, async=0, save=0, samplesize=0)
     result, full_size = self.ct_dist(self._args.fcrit, limit=self._args.ctminfreq,
                                      limit_type=self._args.ctminfreq_type)
     return dict(data=[x[0] + x[1:] for x in result], full_size=full_size)
Beispiel #6
0
 def run(self):
     """
     Calculate a 2-attribute frequency distribution for a concordance.

     note: this is called by Celery worker

     returns:
     a dict(data=..., full_size=...)
     """
     cm = corplib.CorpusManager(subcpath=self._args.subcpath)
     self._corp = cm.get_Corpus(self._args.corpname, subcname=self._args.subcname)
     # async=0 => the concordance is computed synchronously within this task
     self._conc = conclib.get_conc(corp=self._corp, user_id=self._args.user_id, minsize=self._args.minsize,
                                   q=self._args.q, fromp=0, pagesize=0, async=0, save=0, samplesize=0)
     result, full_size = self.ct_dist(self._args.fcrit, limit=self._args.ctminfreq,
                                      limit_type=self._args.ctminfreq_type)
     # x[0] + x[1:] rebuilds each row from its head and tail -- presumably a
     # row-normalization step; TODO confirm against ct_dist's return format
     return dict(data=[x[0] + x[1:] for x in result], full_size=full_size)
Beispiel #7
0
 def run(self, tt_query, cql, path):
     """
     returns:
     True in case of success
     In case of an empty subcorus, EmptySubcorpusException is thrown
     """
     conc = conclib.get_conc(self._corp, self._user_id, q=cql, async=0)
     ans = corplib.subcorpus_from_conc(path, conc)
     if ans is False:
         raise EmptySubcorpusException('Empty subcorpus')
     return ans
Beispiel #8
0
def calculate_colls_bg(coll_args):
    """
    Background collocations calculation.
    This function is expected to be run either
    from Celery or from other process (via multiprocessing).

    arguments:
    coll_args -- an object carrying corpus, concordance and collocation parameters

    returns:
    a dict with keys 'data', 'processing', 'tasks'
    """
    cm = corplib.CorpusManager(subcpath=coll_args.subcpath)
    corp = cm.get_Corpus(coll_args.corpname, coll_args.subcname)
    try:
        corplib.frq_db(
            corp, coll_args.cattr
        )  # try to fetch precalculated data; if none then MissingSubCorpFreqFile
        # async=0 => the concordance must be fully computed before we continue
        conc = conclib.get_conc(corp=corp,
                                user_id=coll_args.user_id,
                                minsize=coll_args.minsize,
                                q=coll_args.q,
                                fromp=0,
                                pagesize=0,
                                async=0,
                                save=coll_args.save,
                                samplesize=coll_args.samplesize)
        if not conc.finished():
            raise UnfinishedConcordanceError(
                _('Cannot calculate yet - source concordance not finished. Please try again later.'
                  ))
        collocs = conc.collocs(cattr=coll_args.cattr,
                               csortfn=coll_args.csortfn,
                               cbgrfns=coll_args.cbgrfns,
                               cfromw=coll_args.cfromw,
                               ctow=coll_args.ctow,
                               cminfreq=coll_args.cminfreq,
                               cminbgr=coll_args.cminbgr,
                               max_lines=coll_args.num_fetch_items)
        for item in collocs['Items']:
            # wrap filter expressions as ('q2', ...) query tuples
            item['pfilter'] = [('q2', item['pfilter'])]
            item['nfilter'] = [('q2', item['nfilter'])]
            item['str'] = import_string(
                item['str'], from_encoding=coll_args.corpus_encoding)
        return dict(data=collocs, processing=0, tasks=[])
    except corplib.MissingSubCorpFreqFile as e:
        # frequency database missing -> trigger its calculation and report 'processing'
        ans = {'attrname': coll_args.cattr, 'tasks': []}
        out = freq_calc.build_arf_db(e.args[0], coll_args.cattr)
        if type(out) is list:
            # a list of task descriptors means the calculation runs asynchronously
            processing = 1
            ans['tasks'].extend(out)
        else:
            processing = 0
        ans['processing'] = processing
        ans['data'] = dict(Items=[], Head=[])
        return ans
Beispiel #9
0
    def calc_freqs(self, flimit, freq_sort, ml, rel_mode, fcrit, ftt_include_empty, collator_locale, fmaxitems, fpage,
                   line_offset):
        """
        Calculate actual frequency data.

        Returns:
        a 2-tuple (freq_data, caching_data) where:
            freq_data = dict(lastpage=..., data=..., fstart=..., fmaxitems=..., conc_size=...)
            caching_data = dict(data=..., cache_path=...); can be also None which means 'do not cache'
        """
        cache_path = self._cache_file_path(fcrit, flimit, freq_sort, ml, ftt_include_empty, rel_mode, collator_locale)
        cache_ans = None

        if os.path.isfile(cache_path):
            # cache hit: reuse the previously pickled (data, conc_size) pair
            with open(cache_path, 'rb') as f:
                data, conc_size = cPickle.load(f)
        else:
            cm = corplib.CorpusManager(subcpath=self._subcpath)
            corp = cm.get_Corpus(self._corpname, self._subcname)
            # async=0 => the concordance is calculated synchronously here
            conc = conclib.get_conc(corp=corp, user_id=self._user_id, minsize=self._minsize, q=self._q,
                                    fromp=self._fromp, pagesize=self._pagesize, async=0, save=self._save,
                                    samplesize=self._samplesize)
            conc_size = conc.size()
            data = [conc.xfreq_dist(cr, flimit, freq_sort, ml, ftt_include_empty, rel_mode, collator_locale)
                    for cr in fcrit]

        lastpage = None
        if len(data) == 1:  # a single block => pagination
            total_length = len(data[0]['Items'])
            # NOTE(review): cache_ans is also produced on a cache hit, which
            # makes the caller re-write an already existing cache file -- confirm
            if total_length >= self.min_cached_data_size:
                cache_ans = dict(data=(data, conc_size), cache_path=cache_path)
            items_per_page = fmaxitems
            # translate page number + offset into an item window; fmaxitems is
            # re-purposed below as the (exclusive-ish) end bound of the window
            fstart = (fpage - 1) * fmaxitems + line_offset
            fmaxitems = fmaxitems * fpage + 1 + line_offset
            if total_length < fmaxitems:
                lastpage = 1
            else:
                lastpage = 0
            ans = [dict(Total=total_length,
                        TotalPages=int(math.ceil(total_length / float(items_per_page))),
                        Items=data[0]['Items'][fstart:fmaxitems - 1],
                        Head=data[0]['Head'])]
        else:
            # multiple blocks => no pagination, return everything
            ans = data
            fstart = None
        return dict(lastpage=lastpage, data=ans, fstart=fstart, fmaxitems=fmaxitems,
                    conc_size=conc_size), cache_ans
Beispiel #10
0
def calc_freqs_bg(args):
    """
    Calculate actual frequency data.

    arguments:
    args -- a FreqCalsArgs instance

    returns:
    a dict(freqs=..., conc_size=...)
    """

    cm = corplib.CorpusManager(subcpath=args.subcpath)
    corp = cm.get_Corpus(args.corpname, subcname=args.subcname)
    # async=0 => the concordance must be fully computed before we continue
    conc = conclib.get_conc(corp=corp, user_id=args.user_id, minsize=args.minsize, q=args.q,
                            fromp=args.fromp, pagesize=args.pagesize, async=0, save=args.save,
                            samplesize=args.samplesize)
    if not conc.finished():
        raise UnfinishedConcordanceError(
            _('Cannot calculate yet - source concordance not finished. Please try again later.'))
    # one frequency distribution per criterion in fcrit
    freqs = [conc.xfreq_dist(cr, args.flimit, args.freq_sort, args.ml, args.ftt_include_empty, args.rel_mode,
                             args.collator_locale)
             for cr in args.fcrit]
    return dict(freqs=freqs, conc_size=conc.size())
Beispiel #11
0
    def fcs_search(self, corp, corpname, fcs_query, max_rec, start):
        """
            aux function for federated content search: operation=searchRetrieve

            arguments:
            corp -- a corpus object
            corpname -- corpus identifier
            fcs_query -- the raw FCS query string
            max_rec -- maximum number of records to return
            start -- 1-based index of the first record to return

            returns:
            a list of (left_context, kwic, right_context, ref) tuples
        """
        query = fcs_query.replace('+', ' ')  # convert URL spaces
        # NOTE(review): exact_match is initialized True and never set False,
        # so the non-exact (".*value.*") branches below are unreachable -- confirm
        exact_match = True  # attr=".*value.*"
        if 'exact' in query.lower() and '=' not in query:  # lemma EXACT "dog"
            pos = query.lower().index('exact')  # first occurrence of EXACT
            query = query[:pos] + '=' + query[pos + 5:]  # 1st exact > =
            exact_match = True

        attrs = corp.get_conf('ATTRLIST').split(',')  # list of available attrs
        rq = ''  # query for manatee
        try:  # parse query
            if '=' in query:  # lemma=word | lemma="word" | lemma="w1 w2" | word=""
                attr, term = query.split('=')
                attr = attr.strip()
                term = term.strip()
            else:  # "w1 w2" | "word" | word
                attr = 'word'
                # use one of search attributes if in corpora attributes
                # otherwise use `word` - fails below if not valid
                for sa in self.search_attrs:
                    if sa in attrs:
                        attr = sa
                        break
                term = query.strip()
            if '"' in attr:
                raise Exception
            if '"' in term:  # "word" | "word1 word2" | "" | "it is \"good\""
                if term[0] != '"' or term[-1] != '"':  # check q. marks
                    raise Exception
                term = term[1:-1].strip()  # remove quotation marks
                if ' ' in term:  # multi-word term
                    if exact_match:
                        rq = ' '.join(
                            ['[%s="%s"]' % (attr, t) for t in term.split()])
                    else:
                        rq = ' '.join([
                            '[%s=".*%s.*"]' % (attr, t) for t in term.split()
                        ])
                elif term.strip() == '':  # ""
                    raise Exception  # empty term
                else:  # one-word term
                    if exact_match:
                        rq = '[%s="%s"]' % (attr, term)
                    else:
                        rq = '[%s=".*%s.*"]' % (attr, term)
            else:  # must be single-word term
                if ' ' in term:
                    raise Exception
                if exact_match:  # build query
                    rq = '[%s="%s"]' % (attr, term)
                else:
                    rq = '[%s=".*%s.*"]' % (attr, term)
        except:  # there was a problem when parsing
            # (code, detail, message) follows the module's FCS diagnostics convention
            raise Exception(10, query, 'Query syntax error')
        if attr not in attrs:
            raise Exception(16, attr, 'Unsupported index')

        # try to get concordance
        try:
            anon_id = plugins.runtime.AUTH.instance.anonymous_user()['id']
            q = ['q' + rq]
            # q = ['aword,[lc="havel"]']
            conc = conclib.get_conc(corp, anon_id, q=q)
        except Exception as e:
            raise Exception(10, repr(e), 'Query syntax error')

        kwic = kwiclib.Kwic(corp, corpname, conc)
        kwic_args = kwiclib.KwicPageArgs(Args(), base_attr=Kontext.BASE_ATTR)
        page = kwic.kwicpage(kwic_args)  # convert concordance

        # start starts with 1
        start -= 1

        if len(page['Lines']) < start:
            raise Exception(61, 'startRecord',
                            'First record position out of range')
        # slice out the requested window from the current KWIC page
        return [(kwicline['Left'][0]['str'], kwicline['Kwic'][0]['str'],
                 kwicline['Right'][0]['str'], kwicline['ref'])
                for kwicline in page['Lines']][start:][:max_rec]
Beispiel #12
0
    def fcs_search(self, corp, corpname, fcs_query, max_rec, start):
        """
            aux function for federated content search: operation=searchRetrieve

            arguments:
            corp -- a corpus object
            corpname -- corpus identifier
            fcs_query -- the raw FCS query string
            max_rec -- maximum number of records per page
            start -- 1-based index of the first record to return

            returns:
            a 2-tuple (rows, total_size) where rows is a list of
            (left_context, kwic, right_context, ref) tuples
        """
        query = fcs_query.replace('+', ' ')  # convert URL spaces
        # NOTE(review): exact_match is initialized True and never set False,
        # so the non-exact (".*value.*") branches below are unreachable -- confirm
        exact_match = True  # attr=".*value.*"
        if 'exact' in query.lower() and '=' not in query:  # lemma EXACT "dog"
            pos = query.lower().index('exact')  # first occurrence of EXACT
            query = query[:pos] + '=' + query[pos + 5:]  # 1st exact > =
            exact_match = True

        attrs = corp.get_conf('ATTRLIST').split(',')  # list of available attrs
        rq = ''  # query for manatee
        try:  # parse query
            if '=' in query:  # lemma=word | lemma="word" | lemma="w1 w2" | word=""
                attr, term = query.split('=')
                attr = attr.strip()
                term = term.strip()
            else:  # "w1 w2" | "word" | word
                attr = 'word'
                # use one of search attributes if in corpora attributes
                # otherwise use `word` - fails below if not valid
                for sa in self.search_attrs:
                    if sa in attrs:
                        attr = sa
                        break
                term = query.strip()
            if '"' in attr:
                raise Exception
            if '"' in term:  # "word" | "word1 word2" | "" | "it is \"good\""
                if term[0] != '"' or term[-1] != '"':  # check q. marks
                    raise Exception
                term = term[1:-1].strip()  # remove quotation marks
                if ' ' in term:  # multi-word term
                    if exact_match:
                        rq = ' '.join(['[%s="%s"]' % (attr, t)
                                       for t in term.split()])
                    else:
                        rq = ' '.join(['[%s=".*%s.*"]' % (attr, t)
                                       for t in term.split()])
                elif term.strip() == '':  # ""
                    raise Exception  # empty term
                else:  # one-word term
                    if exact_match:
                        rq = '[%s="%s"]' % (attr, term)
                    else:
                        rq = '[%s=".*%s.*"]' % (attr, term)
            else:  # must be single-word term
                if ' ' in term:
                    raise Exception
                if exact_match:  # build query
                    rq = '[%s="%s"]' % (attr, term)
                else:
                    rq = '[%s=".*%s.*"]' % (attr, term)
        except:  # there was a problem when parsing
            # (code, detail, message) follows the module's FCS diagnostics convention
            raise Exception(10, query, 'Query syntax error')
        if attr not in attrs:
            raise Exception(16, attr, 'Unsupported index')

        # map the 1-based record offset onto a concordance page number
        fromp = int(math.floor((start - 1) / max_rec)) + 1
        # try to get concordance
        try:
            anon_id = plugins.runtime.AUTH.instance.anonymous_user()['id']
            q = ['q' + rq]
            # async=0 + pagesize twice max_rec so the requested window is covered
            conc = conclib.get_conc(corp, anon_id, q=q, fromp=fromp, pagesize=max_rec * 2, async=0)
        except Exception as e:
            raise Exception(10, repr(e), 'Query syntax error')

        kwic = kwiclib.Kwic(corp, corpname, conc)
        kwic_args = kwiclib.KwicPageArgs(Args(), base_attr=Kontext.BASE_ATTR)
        kwic_args.fromp = fromp
        kwic_args.pagesize = max_rec * 2
        kwic_args.leftctx = '-{0}'.format(settings.get_int('fcs', 'kwic_context', 5))
        kwic_args.rightctx = '{0}'.format(settings.get_int('fcs', 'kwic_context', 5))
        page = kwic.kwicpage(kwic_args)  # convert concordance

        # offset of the first requested record within the fetched page
        local_offset = (start - 1) % max_rec
        if start > conc.size():
            raise Exception(61, 'startRecord', 'First record position out of range')
        rows = [
            (
                kwicline['Left'][0]['str'],
                kwicline['Kwic'][0]['str'],
                kwicline['Right'][0]['str'],
                kwicline['ref']
            )
            for kwicline in page['Lines']
        ][local_offset:local_offset + max_rec]
        return rows, conc.size()
Beispiel #13
0
    def calc_freqs(self, flimit, freq_sort, ml, rel_mode, fcrit,
                   ftt_include_empty, collator_locale, fmaxitems, fpage,
                   line_offset):
        """
        Calculate actual frequency data.

        Returns:
        a 2-tuple (freq_data, caching_data) where:
            freq_data = dict(lastpage=..., data=..., fstart=..., fmaxitems=..., conc_size=...)
            caching_data = dict(data=..., cache_path=...); can be also None which means 'do not cache'
        """
        cache_path = self._cache_file_path(fcrit, flimit, freq_sort, ml,
                                           ftt_include_empty, rel_mode,
                                           collator_locale)
        cache_ans = None

        if os.path.isfile(cache_path):
            with open(cache_path, 'rb') as f:
                data, conc_size = cPickle.load(f)
        else:
            cm = corplib.CorpusManager(subcpath=self._subcpath)
            corp = cm.get_Corpus(self._corpname, self._subcname)
            conc = conclib.get_conc(corp=corp,
                                    user_id=self._user_id,
                                    minsize=self._minsize,
                                    q=self._q,
                                    fromp=self._fromp,
                                    pagesize=self._pagesize,
                                    async=0,
                                    save=self._save,
                                    samplesize=self._samplesize)
            conc_size = conc.size()
            data = [
                conc.xfreq_dist(cr, flimit, freq_sort, ml, ftt_include_empty,
                                rel_mode, collator_locale) for cr in fcrit
            ]

        lastpage = None
        if len(data) == 1:  # a single block => pagination
            total_length = len(data[0]['Items'])
            if total_length >= self.min_cached_data_size:
                cache_ans = dict(data=(data, conc_size), cache_path=cache_path)
            items_per_page = fmaxitems
            fstart = (fpage - 1) * fmaxitems + line_offset
            fmaxitems = fmaxitems * fpage + 1 + line_offset
            if total_length < fmaxitems:
                lastpage = 1
            else:
                lastpage = 0
            ans = [
                dict(Total=total_length,
                     TotalPages=int(
                         math.ceil(total_length / float(items_per_page))),
                     Items=data[0]['Items'][fstart:fmaxitems - 1],
                     Head=data[0]['Head'])
            ]
        else:
            ans = data
            fstart = None
        return dict(lastpage=lastpage,
                    data=ans,
                    fstart=fstart,
                    fmaxitems=fmaxitems,
                    conc_size=conc_size), cache_ans