Code Example #1
File: freq_calc.py Project: Vokabular/kontext
    def ct_dist(self, crit, limit_type, limit=1):
        """
        Calculate join distribution (contingency table).
        """
        words = manatee.StrVector()
        freqs = manatee.NumVector()
        norms = manatee.NumVector()

        abs_limit = 1  # we always fetch all the values to be able to filter by percentiles and provide misc. info
        self._corp.freq_dist(self._conc.RS(), crit, abs_limit, words, freqs,
                             norms)

        crit_lx = re.split(r'\s+', crit)
        attrs = []
        for i in range(0, len(crit_lx), 2):
            attrs.append(crit_lx[i])

        if len(attrs) > 2:
            raise CTCalculationError(
                'Exactly two attributes (either positional or structural) can be used'
            )

        words = [tuple(w.split('\t')) for w in words]

        num_structattrs = self._get_num_structattrs(attrs)
        if num_structattrs == 2:
            norms = [1e6] * len(words)  # this is not really needed
        elif num_structattrs == 1:
            sattr_idx = 0 if '.' in attrs[0] else 1
            norms = self._calc_1sattr_norms(words,
                                            sattr=attrs[sattr_idx],
                                            sattr_idx=sattr_idx)
        else:
            norms = [self._corp.size()] * len(words)
        mans = list(zip(words, freqs, norms))
        if limit_type == 'abs':
            ans = [v for v in mans if v[1] >= limit]
        elif limit_type == 'ipm':
            ans = [v for v in mans if v[1] / float(v[2]) * 1e6 >= limit]
        elif limit_type == 'pabs':
            values = sorted(mans, key=lambda v: v[1])
            plimit = int(math.floor(limit / 100. * len(values)))
            ans = values[plimit:]
        elif limit_type == 'pipm':
            values = sorted(mans, key=lambda v: v[1] / float(v[2]) * 1e6)
            # math.floor(x) == math.ceil(x) - 1 (indexing from 0)
            plimit = int(math.floor(limit / 100. * len(values)))
            ans = values[plimit:]
        if len(ans) > 1000:
            raise UserActionException(
                'The result size is too high. Please try to increase the minimum frequency.'
            )
        return ans, len(mans)
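
The 'pabs' and 'pipm' branches above keep only the items above a percentile derived from the limit. The standalone sketch below restates that filtering on a few made-up (word, freq, norm) tuples; the helper name filter_by_percentile and the sample data are invented for illustration and are not part of kontext.

import math

def filter_by_percentile(rows, limit, key):
    # Sort by the chosen metric and drop everything below the `limit`-th
    # percentile, mirroring the 'pabs'/'pipm' branches of ct_dist() above.
    values = sorted(rows, key=key)
    plimit = int(math.floor(limit / 100. * len(values)))
    return values[plimit:]

rows = [(('dog',), 10, 1000), (('cat',), 40, 1000), (('fox',), 2, 500)]
# percentile of absolute frequency ('pabs')
print(filter_by_percentile(rows, 50, key=lambda v: v[1]))
# percentile of instances per million ('pipm'): freq / norm * 1e6
print(filter_by_percentile(rows, 50, key=lambda v: v[1] / float(v[2]) * 1e6))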
Code Example #2
File: pyconc.py Project: gkkulik/kontext
    def xfreq_dist(self, crit, limit=1, sortkey='f', ml='', ftt_include_empty='', rel_mode=0,
                   collator_locale='en_US'):
        """
        Calculates data (including data for visual output) of a frequency distribution
        specified by the 'crit' parameter

        arguments:
        crit -- specified criteria (CQL)
        limit -- str type!, minimal frequency accepted, this value is exclusive! (i.e. accepted
                 values must be greater than the limit)
        sortkey -- a key according to which the distribution will be sorted
        ml -- str, if non-empty then multi-level freq. distribution is generated
        ftt_include_empty -- str, TODO
        rel_mode -- {0, 1}, TODO
        """

        # ml = determines how the bar appears (multilevel x text type)
        # import math
        normwidth_freq = 100
        normwidth_rel = 100

        def calc_scale(freqs, norms):
            """
            Create proper scaling coefficients for freqs and norms
            to match a 100 units length bar.
            """
            from operator import add
            sumn = float(reduce(add, norms))
            if sumn == 0:
                return float(normwidth_rel) / max(freqs), 0
            else:
                sumf = float(reduce(add, freqs))
                corr = min(sumf / max(freqs), sumn / max(norms))
                return normwidth_rel / sumf * corr, normwidth_rel / sumn * corr

        def label(attr):
            if '/' in attr:
                attr = attr[:attr.index('/')]
            lab = self.pycorp.get_conf(attr + '.LABEL')
            return self.import_string(lab if lab else attr)

        words = manatee.StrVector()
        freqs = manatee.NumVector()
        norms = manatee.NumVector()
        self.pycorp.freq_dist(self.RS(), crit, limit, words, freqs, norms)
        words = [self.import_string(w) for w in words]
        if not len(freqs):
            return {}
        # now we intentionally rewrite norms as filled in by freq_dist()
        # because of "hard to explain" metrics they lead to
        if rel_mode == 0:
            norms2_dict = self.get_attr_values_sizes(crit)
            norms = [norms2_dict.get(x, 0) for x in words]
        sumf = float(sum([x for x in freqs]))
        attrs = crit.split()
        head = [dict(n=label(attrs[x]), s=x / 2)
                for x in range(0, len(attrs), 2)]
        head.append(dict(n=translate('Freq'), s='freq', title=translate('Frequency')))

        tofbar, tonbar = calc_scale(freqs, norms)
        if tonbar and not ml:
            maxf = max(freqs)  # because of bar height
            minf = min(freqs)
            maxrel = 0
            # because of bar width
            for index, (f, nf) in enumerate(zip(freqs, norms)):
                if nf == 0:
                    nf = 100000
                    norms[index] = 100000
                newrel = (f * tofbar / (nf * tonbar))
                if maxrel < newrel:
                    maxrel = newrel
            if rel_mode == 0:
                head.append(dict(
                    n='i.p.m.',
                    title=translate(
                        'instances per million positions (refers to the respective category)'),
                    s='rel'
                ))
            else:
                head.append(dict(n='Freq [%]', title='', s='rel'))

            lines = []
            for w, f, nf in zip(words, freqs, norms):
                w = self.import_string(w)
                rel_norm_freq = {
                    0: round(f * 1e6 / nf, 2),
                    1: round(f / sumf * 100, 2)
                }[rel_mode]

                rel_bar = {
                    0: 1 + int(f * tofbar * normwidth_rel / (nf * tonbar * maxrel)),
                    1: 1 + int(float(f) / maxf * normwidth_rel)
                }[rel_mode]

                freq_bar = {
                    0: int(normwidth_freq * float(f) / (maxf - minf + 1) + 1),
                    1: 10
                }[rel_mode]
                lines.append(dict(
                    Word=[{'n': '  '.join(n.split('\v'))} for n in w.split('\t')],
                    freq=f,
                    fbar=int(f * tofbar) + 1,
                    norm=nf,
                    nbar=int(nf * tonbar),
                    relbar=rel_bar,
                    norel=ml,
                    freqbar=freq_bar,
                    rel=rel_norm_freq
                ))
        else:
            lines = []
            for w, f, nf in zip(words, freqs, norms):
                w = self.import_string(w)
                lines.append(dict(
                    Word=[{'n': '  '.join(n.split('\v'))} for n in w.split('\t')],
                    freq=f,
                    fbar=int(f * tofbar) + 1,
                    norel=1,
                    relbar=None
                ))

        if ftt_include_empty and limit == 0 and '.' in attrs[0]:
            attr = self.pycorp.get_attr(attrs[0])
            all_vals = [attr.id2str(i) for i in range(attr.id_range())]
            used_vals = [line['Word'][0]['n'] for line in lines]
            for v in all_vals:
                if v in used_vals:
                    continue
                lines.append(dict(
                    Word=[{'n': self.import_string(v)}],
                    freq=0,
                    rel=0,
                    norm=0,
                    nbar=0,
                    relbar=0,
                    norel=ml,
                    freqbar=0,
                    fbar=0
                ))
        if (sortkey in ('0', '1', '2')) and (int(sortkey) < len(lines[0]['Word'])):
            sortkey = int(sortkey)
            lines = l10n.sort(lines, loc=collator_locale, key=lambda v: v['Word'][sortkey]['n'])
        else:
            if sortkey not in ('freq', 'rel'):
                sortkey = 'freq'
            lines = sorted(lines, key=lambda v: v[sortkey], reverse=True)
        return dict(Head=head, Items=lines)
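
The inner calc_scale() above picks coefficients so that both the frequency bars and the norm bars fit into a 100-unit wide chart. Below is a minimal standalone restatement of that calculation, assuming normwidth_rel == 100; the sample freqs/norms are made up for illustration.

def calc_scale(freqs, norms, normwidth_rel=100):
    # Coefficients (tofbar, tonbar) chosen so that the largest freq bar and
    # the largest norm bar both stay within `normwidth_rel` units.
    sumn = float(sum(norms))
    if sumn == 0:
        # no norms available -> only scale the frequency bars
        return float(normwidth_rel) / max(freqs), 0
    sumf = float(sum(freqs))
    corr = min(sumf / max(freqs), sumn / max(norms))
    return normwidth_rel / sumf * corr, normwidth_rel / sumn * corr

tofbar, tonbar = calc_scale([10, 40, 50], [1000, 2000, 1000])
print(tofbar, tonbar)  # 2.0 0.05 -> max(freqs) * tofbar == 100, max(norms) * tonbar == 100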
Code Example #3
    def xfreq_dist(self,
                   crit,
                   limit=1,
                   sortkey='f',
                   ftt_include_empty: int = 0,
                   rel_mode=0,
                   collator_locale='en_US'):
        """
        Calculates data (including data for visual output) of a frequency distribution
        specified by the 'crit' parameter

        arguments:
        crit -- specified criteria (CQL)
        limit -- str type!, minimal frequency accepted, this value is exclusive! (i.e. accepted
                 values must be greater than the limit)
        sortkey -- a key according to which the distribution will be sorted
        ftt_include_empty -- int, TODO
        rel_mode -- {0, 1} (0 for structural attrs. , 1 for positional ones ??)
        """
        def label(attr):
            if '/' in attr:
                attr = attr[:attr.index('/')]
            lab = self.pycorp.get_conf(attr + '.LABEL')
            return lab if lab else attr

        def export_word(wrd):
            return [{'n': '  '.join(n.split('\v'))} for n in wrd.split('\t')]

        def test_word_empty(wrd):
            return len(wrd) == 1 and (wrd[0]['n'] == ''
                                      or wrd[0]['n'] == '===NONE===')

        words = manatee.StrVector()
        freqs = manatee.NumVector()
        norms = manatee.NumVector()
        self.pycorp.freq_dist(self.RS(), crit, limit, words, freqs, norms)
        if len(freqs) == 0:
            return dict(Head=[],
                        Items=[],
                        SkippedEmpty=False,
                        NoRelSorting=True)

        # for structural attrs, we intentionally rewrite norms as filled in by Corpus.freq_dist()
        # because of "hard to explain" metrics they lead to
        if rel_mode == 0:
            norms2_dict = self.get_attr_values_sizes(crit)
            norms = [norms2_dict.get(x, 0) for x in words]
        # For positional attrs, the norm is the size of the actual corpus/subcorpus. Please note that
        # for an "ad hoc" (or unnamed) subcorpus, this may be misleading as we still calculate against orig. corpus
        else:
            norms = [self.pycorp.search_size for _ in words]

        attrs = crit.split()
        head: List[Dict[str, Any]] = [
            dict(n=label(attrs[x]), s=x / 2) for x in range(0, len(attrs), 2)
        ]
        head.append(
            dict(n=translate('Freq'), s='freq', title=translate('Frequency')))
        has_empty_item = False
        head.append(
            dict(
                n='i.p.m.',
                title=translate(
                    'instances per million positions (refers to the respective category)'
                ),
                s='rel'))

        lines = []
        for w, f, nf in zip(words, freqs, norms):
            word = export_word(w)
            if test_word_empty(word):
                has_empty_item = True
                continue
            lines.append(
                dict(Word=word, freq=f, norm=nf, rel=round(f / nf * 1e6, 2)))
        if ftt_include_empty and limit == 0 and '.' in attrs[0]:
            attr = self.pycorp.get_attr(attrs[0])
            all_vals = [attr.id2str(i) for i in range(attr.id_range())]
            used_vals = [line['Word'][0]['n'] for line in lines]
            for v in all_vals:
                if v in used_vals:
                    continue
                lines.append(dict(Word=[{'n': v}], freq=0, rel=0, norm=0))
        if (sortkey
                in ('0', '1', '2')) and (int(sortkey) < len(lines[0]['Word'])):
            sortkey = int(sortkey)
            lines = l10n.sort(lines,
                              loc=collator_locale,
                              key=lambda v: v['Word'][sortkey]['n'])
        else:
            if sortkey not in ('freq', 'rel'):
                sortkey = 'freq'
            lines = sorted(lines, key=lambda v: v[sortkey], reverse=True)
        return dict(Head=head,
                    Items=lines,
                    SkippedEmpty=has_empty_item,
                    NoRelSorting=bool(rel_mode))
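
The 'rel' value filled in for each line above is instances per million positions (i.p.m.), i.e. freq / norm * 1e6 rounded to two decimals. A minimal sketch with invented numbers:

def ipm(freq, norm):
    # instances per million positions, as in the 'rel' field above
    return round(freq / float(norm) * 1e6, 2)

print(ipm(150, 2000000))  # 75.0  -> 150 hits in a category of 2M positions
print(ipm(150, 500000))   # 300.0 -> the same 150 hits in a smaller category yield a higher i.p.m.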