Beispiel #1
0
def sort_line_groups(conc, group_ids):
    """
    Sort lines of a concordance by their line groups.

    Every group ID is paired with a zero-padded, five-character string
    built from the same number; both vectors are then passed to
    conc.linegroup_sort().

    arguments:
    conc -- an object providing linegroup_sort(ids, strs)
    group_ids -- an iterable of integer group IDs
    """
    id_vector = manatee.IntVector()
    key_vector = manatee.StrVector()
    for group_id in group_ids:
        id_vector.append(group_id)
        # zero padding makes the textual keys order like the numbers do
        key_vector.append('%05d' % group_id)
    conc.linegroup_sort(id_vector, key_vector)
Beispiel #2
0
 def command_g(self, options):
     """
     Sort the concordance according to line groups.

     A stored concordance is fetched via get_stored_conc(), handed to
     set_linegroup_from_conc() and its label map is then used to build
     the ID and sort-key vectors for linegroup_sort().

     arguments:
     options -- passed through to get_stored_conc()
     """
     stored = get_stored_conc(self.pycorp, options, self.pycorp._conc_dir)
     self.set_linegroup_from_conc(stored)
     labels = stored.labelmap
     # key 0 is always registered (with a None label) before the vectors are built
     labels[0] = None
     group_ids = manatee.IntVector(map(int, labels.keys()))
     sort_keys = manatee.StrVector(map(lngrp_sortstr, labels.values()))
     self.linegroup_sort(group_ids, sort_keys)
Beispiel #3
0
    def ct_dist(self, crit, limit_type, limit=1):
        """
        Calculate a joint distribution (contingency table) of two attributes.

        arguments:
        crit -- whitespace-separated frequency criteria; every token at an
                even position (0, 2, ...) is taken as an attribute name
        limit_type -- how 'limit' is interpreted:
                      'abs'  -- minimum absolute frequency
                      'ipm'  -- minimum frequency per million (freq / norm * 1e6)
                      'pabs' -- percentile (0-100) of items ordered by absolute freq.
                      'pipm' -- percentile (0-100) of items ordered by i.p.m.
        limit -- the threshold value (see limit_type)

        returns:
        a 2-tuple (items, total) where items is a list of
        (words_tuple, freq, norm) passing the limit and total is the number
        of items before filtering

        raises:
        CTCalculationError -- if crit does not contain exactly two attributes
                              or limit_type is not a supported value
        UserActionException -- if more than 1000 items pass the limit
        """
        # Validate the arguments before running the (potentially expensive) query.
        attrs = re.split(r'\s+', crit)[::2]
        if len(attrs) != 2:
            raise CTCalculationError(
                'Exactly two attributes (either positional or structural) can be used'
            )
        if limit_type not in ('abs', 'ipm', 'pabs', 'pipm'):
            raise CTCalculationError(
                'Unknown limit type: {0}'.format(limit_type))

        words = manatee.StrVector()
        freqs = manatee.NumVector()
        norms = manatee.NumVector()

        abs_limit = 1  # we always fetch all the values to be able to filter by percentiles and provide misc. info
        self._corp.freq_dist(self._conc.RS(), crit, abs_limit, words, freqs,
                             norms)

        words = [tuple(w.split('\t')) for w in words]

        # norms returned by freq_dist() are replaced by values
        # depending on how many of the attributes are structural
        num_structattrs = self._get_num_structattrs(attrs)
        if num_structattrs == 2:
            norms = [1e6] * len(words)  # this is not really needed
        elif num_structattrs == 1:
            # the attribute containing a dot is treated as the structural one
            sattr_idx = 0 if '.' in attrs[0] else 1
            norms = self._calc_1sattr_norms(words,
                                            sattr=attrs[sattr_idx],
                                            sattr_idx=sattr_idx)
        else:
            norms = [self._corp.size()] * len(words)
        mans = list(zip(words, freqs, norms))

        def ipm(item):
            return item[1] / float(item[2]) * 1e6

        if limit_type == 'abs':
            ans = [v for v in mans if v[1] >= limit]
        elif limit_type == 'ipm':
            ans = [v for v in mans if ipm(v) >= limit]
        else:
            # percentile variants: order the items and cut off the lower part
            sort_key = (lambda v: v[1]) if limit_type == 'pabs' else ipm
            values = sorted(mans, key=sort_key)
            # math.floor(x) == math.ceil(x) - 1 (indexing from 0);
            # int() keeps the slice index integral on any Python version
            plimit = int(math.floor(limit / 100. * len(values)))
            ans = values[plimit:]
        if len(ans) > 1000:
            raise UserActionException(
                'The result size is too high. Please try to increase the minimum frequency.'
            )
        return ans, len(mans)
Beispiel #4
0
    def get_sort_idx(self, q=(), pagesize=20):
        """
        In case sorting is active this method generates shortcuts to pages where new
        first letter of sorted keys (it can be 'left', 'kwic', 'right') starts.

        The sorting criterion is taken from the last item of 'q' which starts
        with 's' but not with 's*'. If the first part of the criterion
        (the text before '/') contains a dot, whole values are used as labels;
        otherwise only the first occurrence of each initial character is kept.

        arguments:
        q -- a query (as a list)
        pagesize -- number of items per page

        returns:
        a list of dicts with following structure (example):
            [{'page': 1, 'label': 'a'}, {'page': 1, 'label': 'A'}, {'page': 2, 'label': 'b'},...]
        An empty list is returned if no sorting criterion is found in 'q'.
        """
        crit = ''
        for qq in q:
            if qq.startswith('s') and not qq.startswith('s*'):
                crit = qq[1:]
        if not crit:
            return []
        vals = manatee.StrVector()
        idx = manatee.IntVector()
        just_letters = '.' not in crit.split('/')[0]
        self.conc.sort_idx(crit, vals, idx, just_letters)
        # floor division keeps page numbers integral (true division would
        # produce values like 1.35 under Python 3)
        out = [(v, pos // pagesize + 1) for v, pos in zip(vals, idx)]
        if just_letters:
            seen = set()
            firsts = []
            for v, p in out:
                if not v:
                    # an empty key has no initial letter to offer as a shortcut
                    continue
                letter = v[0]
                if letter not in seen:
                    seen.add(letter)
                    firsts.append((letter, p))
            out = firsts
        return [{'page': p, 'label': v} for v, p in out]
Beispiel #5
0
    def xfreq_dist(self, crit, limit=1, sortkey='f', ml='', ftt_include_empty='', rel_mode=0,
                   collator_locale='en_US'):
        """
        Calculates data (including data for visual output) of a frequency distribution
        specified by the 'crit' parameter

        arguments:
        crit -- specified criteria (CQL); whitespace-separated, tokens at even
                positions are treated as attribute names
        limit -- minimal frequency, passed to freq_dist() as is (the original note says
                 the value is exclusive, i.e. accepted values must be greater than the limit)
        sortkey -- a key according to which the distribution will be sorted;
                   '0', '1', '2' select a word column, 'freq' and 'rel' a numeric
                   column, anything else falls back to 'freq'
        ml -- str, if non-empty then multi-level freq. distribution is generated
              (no relative frequencies/bars are calculated then)
        ftt_include_empty -- if true (and limit == 0 and the first attribute is
                             a structural one) values with zero frequency are appended
        rel_mode -- {0, 1}; 0: norms come from get_attr_values_sizes() and 'rel' is i.p.m.,
                    1: norms from freq_dist() are kept and 'rel' is a percentage of all hits
        collator_locale -- locale passed to l10n.sort() when sorting by a word column

        returns:
        an empty dict if there are no frequencies, otherwise dict(Head=[...], Items=[...])
        """

        # ml = determines how the bar appears (multilevel x text type)
        normwidth_freq = 100
        normwidth_rel = 100

        def calc_scale(freqs, norms):
            """
            Create proper scaling coefficients for freqs and norms
            to match a 100 units length bar.
            """
            # sum() is used instead of reduce() which is not a built-in in Python 3
            sumn = float(sum(norms))
            if sumn == 0:
                return float(normwidth_rel) / max(freqs), 0
            sumf = float(sum(freqs))
            corr = min(sumf / max(freqs), sumn / max(norms))
            return normwidth_rel / sumf * corr, normwidth_rel / sumn * corr

        def label(attr):
            if '/' in attr:
                attr = attr[:attr.index('/')]
            lab = self.pycorp.get_conf(attr + '.LABEL')
            return self.import_string(lab if lab else attr)

        words = manatee.StrVector()
        freqs = manatee.NumVector()
        norms = manatee.NumVector()
        self.pycorp.freq_dist(self.RS(), crit, limit, words, freqs, norms)
        words = [self.import_string(w) for w in words]
        if not len(freqs):
            return {}
        # now we intentionally rewrite norms as filled in by freq_dist()
        # because of "hard to explain" metrics they lead to
        if rel_mode == 0:
            norms2_dict = self.get_attr_values_sizes(crit)
            norms = [norms2_dict.get(x, 0) for x in words]
        sumf = float(sum(freqs))
        attrs = crit.split()
        # column identifiers must be integers (0, 1, 2) - hence the floor division
        head = [dict(n=label(attrs[x]), s=x // 2)
                for x in range(0, len(attrs), 2)]
        head.append(dict(n=translate('Freq'), s='freq', title=translate('Frequency')))

        tofbar, tonbar = calc_scale(freqs, norms)
        if tonbar and not ml:
            maxf = max(freqs)  # because of bar height
            minf = min(freqs)
            maxrel = 0
            # because of bar width
            for index, (f, nf) in enumerate(zip(freqs, norms)):
                if nf == 0:
                    # a zero norm is replaced by a fixed value to avoid division by zero
                    nf = 100000
                    norms[index] = 100000
                newrel = (f * tofbar / (nf * tonbar))
                if maxrel < newrel:
                    maxrel = newrel
            if rel_mode == 0:
                head.append(dict(
                    n='i.p.m.',
                    title=translate(
                        'instances per million positions (refers to the respective category)'),
                    s='rel'
                ))
            else:
                head.append(dict(n='Freq [%]', title='', s='rel'))

            lines = []
            for w, f, nf in zip(words, freqs, norms):
                w = self.import_string(w)
                rel_norm_freq = {
                    0: round(f * 1e6 / nf, 2),
                    1: round(f / sumf * 100, 2)
                }[rel_mode]

                rel_bar = {
                    0: 1 + int(f * tofbar * normwidth_rel / (nf * tonbar * maxrel)),
                    1: 1 + int(float(f) / maxf * normwidth_rel)
                }[rel_mode]

                freq_bar = {
                    0: int(normwidth_freq * float(f) / (maxf - minf + 1) + 1),
                    1: 10
                }[rel_mode]
                lines.append(dict(
                    Word=[{'n': '  '.join(n.split('\v'))} for n in w.split('\t')],
                    freq=f,
                    fbar=int(f * tofbar) + 1,
                    norm=nf,
                    nbar=int(nf * tonbar),
                    relbar=rel_bar,
                    norel=ml,
                    freqbar=freq_bar,
                    rel=rel_norm_freq
                ))
        else:
            lines = []
            for w, f, nf in zip(words, freqs, norms):
                w = self.import_string(w)
                lines.append(dict(
                    Word=[{'n': '  '.join(n.split('\v'))} for n in w.split('\t')],
                    freq=f,
                    fbar=int(f * tofbar) + 1,
                    norel=1,
                    relbar=None
                ))

        if ftt_include_empty and limit == 0 and '.' in attrs[0]:
            attr = self.pycorp.get_attr(attrs[0])
            all_vals = [attr.id2str(i) for i in range(attr.id_range())]
            used_vals = {line['Word'][0]['n'] for line in lines}
            for v in all_vals:
                if v in used_vals:
                    continue
                lines.append(dict(
                    Word=[{'n': self.import_string(v)}],
                    freq=0,
                    rel=0,
                    norm=0,
                    nbar=0,
                    relbar=0,
                    norel=ml,
                    freqbar=0,
                    fbar=0
                ))
        if (sortkey in ('0', '1', '2')) and (int(sortkey) < len(lines[0]['Word'])):
            sortkey = int(sortkey)
            lines = l10n.sort(lines, loc=collator_locale, key=lambda v: v['Word'][sortkey]['n'])
        else:
            if sortkey not in ('freq', 'rel'):
                sortkey = 'freq'
            # items produced without relative data have no 'rel' key; treat it as 0
            lines = sorted(lines, key=lambda v: v.get(sortkey, 0), reverse=True)
        return dict(Head=head, Items=lines)
Beispiel #6
0
def create_str_vector():
    """
    Factory function: builds and returns a newly constructed
    manatee.StrVector object.
    """
    vector = manatee.StrVector()
    return vector
Beispiel #7
0
    def add_aligns(self, result, args):
        """
        Attach lines from aligned corpora to an existing page of KWIC lines.

        The passed 'result' object is modified in place: its KWICCorps and
        CorporaColumns attributes are set and each item of result.Lines
        obtains an 'Align' entry (a tuple with one item per aligned corpus).
        Nothing happens if args.alignlist is empty.

        arguments:
        result -- KwicPageData type is required
        args -- an object providing 'alignlist' (a list of corpora) and copy();
                it is also passed to self.kwiclines()
        """
        if not args.alignlist:
            return

        def blank_cell():
            # a placeholder line used where an aligned corpus provides no data
            return dict(
                rightsize=0,
                hitlen=';hitlen=9',
                Right=[],
                Kwic=[],
                linegroup='_',
                leftsize=0,
                ref='',
                rightspace='',
                leftspace='',
                kwiclen=0,
                toknum=None,
                Left=[],
            )

        def pad(rows, target_len):
            missing = target_len - len(rows)
            return rows + [blank_cell() for _ in range(missing)]

        aligned_names = manatee.StrVector()
        self.conc.get_aligned(aligned_names)
        result.KWICCorps = list(aligned_names)
        if self.corpus.corpname not in result.KWICCorps:
            result.KWICCorps = [self.corpus.corpname] + result.KWICCorps
        result.CorporaColumns = [
            {'n': corp.get_conffile(),
             'label': corp.get_conf('NAME') or corp.get_conffile()}
            for corp in [self.conc.orig_corp] + args.alignlist
        ]

        per_corpus_lines = []
        for aligned in args.alignlist:
            must_add = aligned.get_conffile() not in aligned_names
            if must_add:
                # the corpus is not attached to the concordance yet
                self.conc.switch_aligned(self.conc.orig_corp.get_conffile())
                self.conc.add_aligned(aligned.get_conffile())
            self.conc.switch_aligned(aligned.get_conffile())
            if must_add:
                line_args = args.copy(leftctx='0',
                                      rightctx='0',
                                      attrs='word',
                                      ctxattrs='')
            else:
                line_args = args
            per_corpus_lines.append(self.kwiclines(line_args))

        # Manatee seems to return lists of different lengths when some
        # translations are missing at the end of a concordance; shorter
        # lists are padded with blank cells here.
        padded = [pad(rows, len(result.Lines)) for rows in per_corpus_lines]
        for line, cells in zip(result.Lines, zip(*padded)):
            line['Align'] = cells
Beispiel #8
0
    def xfreq_dist(self,
                   crit,
                   limit=1,
                   sortkey='f',
                   ftt_include_empty: int = 0,
                   rel_mode=0,
                   collator_locale='en_US'):
        """
        Calculates data (including data for visual output) of a frequency distribution
        specified by the 'crit' parameter

        arguments:
        crit -- specified criteria (CQL); whitespace-separated, tokens at even
                positions are treated as attribute names
        limit -- minimal frequency, passed to freq_dist() as is (the original note says
                 the value is exclusive, i.e. accepted values must be greater than the limit)
        sortkey -- a key according to which the distribution will be sorted;
                   '0', '1', '2' select a word column, 'freq' and 'rel' a numeric
                   column, anything else falls back to 'freq'
        ftt_include_empty -- if true (and limit == 0 and the first attribute
                             contains a dot) values with zero frequency are appended
        rel_mode -- {0, 1}; 0: norms are taken from get_attr_values_sizes(),
                    otherwise pycorp.search_size is used as the norm for each item
        collator_locale -- locale passed to l10n.sort() when sorting by a word column

        returns:
        dict(Head=[...], Items=[...], SkippedEmpty=bool, NoRelSorting=bool)
        """
        def label(attr):
            if '/' in attr:
                attr = attr[:attr.index('/')]
            lab = self.pycorp.get_conf(attr + '.LABEL')
            return lab if lab else attr

        def export_word(wrd):
            return [{'n': '  '.join(n.split('\v'))} for n in wrd.split('\t')]

        def test_word_empty(wrd):
            return len(wrd) == 1 and (wrd[0]['n'] == ''
                                      or wrd[0]['n'] == '===NONE===')

        words = manatee.StrVector()
        freqs = manatee.NumVector()
        norms = manatee.NumVector()
        self.pycorp.freq_dist(self.RS(), crit, limit, words, freqs, norms)
        if len(freqs) == 0:
            return dict(Head=[],
                        Items=[],
                        SkippedEmpty=False,
                        NoRelSorting=True)

        # for structural attrs, we intentionally rewrite norms as filled in by Corpus.freq_dist()
        # because of "hard to explain" metrics they lead to
        if rel_mode == 0:
            norms2_dict = self.get_attr_values_sizes(crit)
            norms = [norms2_dict.get(x, 0) for x in words]
        # For positional attrs, the norm is the size of the actual corpus/subcorpus. Please note that
        # for an "ad hoc" (or unnamed) subcorpus, this may be misleading as we still calculate against orig. corpus
        else:
            norms = [self.pycorp.search_size for _ in words]

        attrs = crit.split()
        # column identifiers must be integers (0, 1, 2) - hence the floor division
        head: List[Dict[str, Any]] = [
            dict(n=label(attrs[x]), s=x // 2) for x in range(0, len(attrs), 2)
        ]
        head.append(
            dict(n=translate('Freq'), s='freq', title=translate('Frequency')))
        has_empty_item = False
        head.append(
            dict(
                n='i.p.m.',
                title=translate(
                    'instances per million positions (refers to the respective category)'
                ),
                s='rel'))

        lines = []
        for w, f, nf in zip(words, freqs, norms):
            word = export_word(w)
            if test_word_empty(word):
                has_empty_item = True
                continue
            # a value with an unknown size has norm 0; report rel as 0
            # (same as the zero-frequency items below) instead of failing
            rel = round(f / nf * 1e6, 2) if nf else 0
            lines.append(dict(Word=word, freq=f, norm=nf, rel=rel))
        if ftt_include_empty and limit == 0 and '.' in attrs[0]:
            attr = self.pycorp.get_attr(attrs[0])
            all_vals = [attr.id2str(i) for i in range(attr.id_range())]
            used_vals = {line['Word'][0]['n'] for line in lines}
            for v in all_vals:
                if v in used_vals:
                    continue
                lines.append(dict(Word=[{'n': v}], freq=0, rel=0, norm=0))
        # 'lines' may be empty here if all the items have been skipped as empty
        if (sortkey in ('0', '1', '2') and lines
                and int(sortkey) < len(lines[0]['Word'])):
            sortkey = int(sortkey)
            lines = l10n.sort(lines,
                              loc=collator_locale,
                              key=lambda v: v['Word'][sortkey]['n'])
        else:
            if sortkey not in ('freq', 'rel'):
                sortkey = 'freq'
            lines = sorted(lines, key=lambda v: v[sortkey], reverse=True)
        return dict(Head=head,
                    Items=lines,
                    SkippedEmpty=has_empty_item,
                    NoRelSorting=bool(rel_mode))