# NOTE: this excerpt assumes the enclosing module imports `math`, `re`,
# `manatee` and `l10n`, plus the `translate` helper and the exception classes
# `CTCalculationError` / `UserActionException` (import paths not shown here).

def ct_dist(self, crit, limit_type, limit=1):
    """
    Calculate a joint distribution (contingency table) for the two attributes
    specified in 'crit'.
    """
    words = manatee.StrVector()
    freqs = manatee.NumVector()
    norms = manatee.NumVector()
    # we always fetch all the values to be able to filter by percentiles
    # and provide misc. info
    abs_limit = 1
    self._corp.freq_dist(self._conc.RS(), crit, abs_limit, words, freqs, norms)

    crit_lx = re.split(r'\s+', crit)
    attrs = [crit_lx[i] for i in range(0, len(crit_lx), 2)]
    if len(attrs) > 2:
        raise CTCalculationError(
            'Exactly two attributes (either positional or structural) can be used')

    words = [tuple(w.split('\t')) for w in words]

    num_structattrs = self._get_num_structattrs(attrs)
    if num_structattrs == 2:
        norms = [1e6] * len(words)  # this is not really needed
    elif num_structattrs == 1:
        sattr_idx = 0 if '.' in attrs[0] else 1
        norms = self._calc_1sattr_norms(words, sattr=attrs[sattr_idx], sattr_idx=sattr_idx)
    else:
        norms = [self._corp.size()] * len(words)

    mans = list(zip(words, freqs, norms))
    if limit_type == 'abs':
        ans = [v for v in mans if v[1] >= limit]
    elif limit_type == 'ipm':
        ans = [v for v in mans if v[1] / float(v[2]) * 1e6 >= limit]
    elif limit_type == 'pabs':
        values = sorted(mans, key=lambda v: v[1])
        plimit = int(math.floor(limit / 100. * len(values)))
        ans = values[plimit:]
    elif limit_type == 'pipm':
        values = sorted(mans, key=lambda v: v[1] / float(v[2]) * 1e6)
        # math.floor(x) == math.ceil(x) - 1 (indexing from 0)
        plimit = int(math.floor(limit / 100. * len(values)))
        ans = values[plimit:]
    else:
        # an unknown limit_type would otherwise leave `ans` undefined
        raise ValueError('Unknown limit_type: {0}'.format(limit_type))
    if len(ans) > 1000:
        raise UserActionException(
            'The result size is too high. Please try to increase the minimum frequency.')
    return ans, len(mans)
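# The 'pabs'/'pipm' branches above implement a percentile cut: sort the rows by
# the chosen metric, then drop the first floor(limit / 100 * N) of them, keeping
# the top (100 - limit) percent. A standalone sketch of that step (pure Python;
# `percentile_cut` and its parameter names are illustrative, not part of the
# original code):

def percentile_cut(rows, metric, limit):
    """Keep the rows whose metric falls within the top (100 - limit) percent.

    rows -- a list of (words, freq, norm) tuples
    metric -- a function mapping a row to a sortable value
              (e.g. `lambda v: v[1]` for absolute frequency,
              `lambda v: v[1] / float(v[2]) * 1e6` for i.p.m.)
    limit -- percentile threshold, 0 <= limit <= 100
    """
    values = sorted(rows, key=metric)
    # int() truncates, which equals floor for non-negative values
    plimit = int(limit / 100. * len(values))
    return values[plimit:]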
def xfreq_dist(self, crit, limit=1, sortkey='f', ml='', ftt_include_empty='', rel_mode=0,
               collator_locale='en_US'):
    """
    Calculates data (including data for visual output) of a frequency distribution
    specified by the 'crit' parameter.

    arguments:
    crit -- specified criteria (CQL)
    limit -- minimal frequency accepted; this value is exclusive (i.e. accepted
             values must be greater than the limit)
    sortkey -- a key according to which the distribution will be sorted
    ml -- str; if non-empty then a multi-level freq. distribution is generated
    ftt_include_empty -- str; if non-empty (and limit == 0 and the first attribute
                         is structural) then attribute values with zero frequency
                         are included in the output
    rel_mode -- {0, 1}; 0 = relative frequency as i.p.m. against per-category norms,
                1 = relative frequency as a percentage of the total number of hits
    """
    # ml determines how the bar appears (multilevel x text type)
    normwidth_freq = 100
    normwidth_rel = 100

    def calc_scale(freqs, norms):
        """
        Create proper scaling coefficients for freqs and norms
        to match a 100 units length bar.
        """
        sumn = float(sum(norms))
        if sumn == 0:
            return float(normwidth_rel) / max(freqs), 0
        else:
            sumf = float(sum(freqs))
            corr = min(sumf / max(freqs), sumn / max(norms))
            return normwidth_rel / sumf * corr, normwidth_rel / sumn * corr

    def label(attr):
        if '/' in attr:
            attr = attr[:attr.index('/')]
        lab = self.pycorp.get_conf(attr + '.LABEL')
        return self.import_string(lab if lab else attr)

    words = manatee.StrVector()
    freqs = manatee.NumVector()
    norms = manatee.NumVector()
    self.pycorp.freq_dist(self.RS(), crit, limit, words, freqs, norms)
    words = [self.import_string(w) for w in words]
    if not len(freqs):
        return {}
    # now we intentionally rewrite norms as filled in by freq_dist()
    # because of the "hard to explain" metrics they lead to
    if rel_mode == 0:
        norms2_dict = self.get_attr_values_sizes(crit)
        norms = [norms2_dict.get(x, 0) for x in words]
    sumf = float(sum(freqs))
    attrs = crit.split()
    head = [dict(n=label(attrs[x]), s=x / 2) for x in range(0, len(attrs), 2)]
    head.append(dict(n=translate('Freq'), s='freq', title=translate('Frequency')))

    tofbar, tonbar = calc_scale(freqs, norms)
    if tonbar and not ml:
        maxf = max(freqs)  # because of bar height
        minf = min(freqs)
        maxrel = 0  # because of bar width
        for index, (f, nf) in enumerate(zip(freqs, norms)):
            if nf == 0:
                # avoid division by zero by substituting an arbitrary large norm
                nf = 100000
                norms[index] = 100000
            newrel = f * tofbar / (nf * tonbar)
            if maxrel < newrel:
                maxrel = newrel
        if rel_mode == 0:
            head.append(dict(
                n='i.p.m.',
                title=translate('instances per million positions (refers to the respective category)'),
                s='rel'))
        else:
            head.append(dict(n='Freq [%]', title='', s='rel'))
        lines = []
        for w, f, nf in zip(words, freqs, norms):
            w = self.import_string(w)
            rel_norm_freq = {
                0: round(f * 1e6 / nf, 2),
                1: round(f / sumf * 100, 2)
            }[rel_mode]
            rel_bar = {
                0: 1 + int(f * tofbar * normwidth_rel / (nf * tonbar * maxrel)),
                1: 1 + int(float(f) / maxf * normwidth_rel)
            }[rel_mode]
            freq_bar = {
                0: int(normwidth_freq * float(f) / (maxf - minf + 1) + 1),
                1: 10
            }[rel_mode]
            lines.append(dict(
                Word=[{'n': ' '.join(n.split('\v'))} for n in w.split('\t')],
                freq=f,
                fbar=int(f * tofbar) + 1,
                norm=nf,
                nbar=int(nf * tonbar),
                relbar=rel_bar,
                norel=ml,
                freqbar=freq_bar,
                rel=rel_norm_freq))
    else:
        lines = []
        for w, f, nf in zip(words, freqs, norms):
            w = self.import_string(w)
            lines.append(dict(
                Word=[{'n': ' '.join(n.split('\v'))} for n in w.split('\t')],
                freq=f,
                fbar=int(f * tofbar) + 1,
                norel=1,
                relbar=None))
    if ftt_include_empty and limit == 0 and '.' in attrs[0]:
        attr = self.pycorp.get_attr(attrs[0])
        all_vals = [attr.id2str(i) for i in range(attr.id_range())]
        used_vals = [line['Word'][0]['n'] for line in lines]
        for v in all_vals:
            if v in used_vals:
                continue
            lines.append(dict(
                Word=[{'n': self.import_string(v)}],
                freq=0, rel=0, norm=0, nbar=0, relbar=0, norel=ml, freqbar=0, fbar=0))
    if (sortkey in ('0', '1', '2')) and (int(sortkey) < len(lines[0]['Word'])):
        sortkey = int(sortkey)
        lines = l10n.sort(lines, loc=collator_locale, key=lambda v: v['Word'][sortkey]['n'])
    else:
        if sortkey not in ('freq', 'rel'):
            sortkey = 'freq'
        lines = sorted(lines, key=lambda v: v[sortkey], reverse=True)
    return dict(Head=head, Items=lines)
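# Both 'rel' metrics above reduce to simple arithmetic: rel_mode == 0 yields
# instances per million (frequency relative to the per-category norm), while
# rel_mode == 1 yields a percentage of the total frequency. A tiny self-contained
# check with illustrative numbers (not taken from any real corpus):

def _rel_metrics_example():
    f, nf, sumf = 250, 5000000, 4000         # freq, category norm, total freq
    ipm = round(f * 1e6 / nf, 2)             # rel_mode == 0 -> 50.0 i.p.m.
    pct = round(f / float(sumf) * 100, 2)    # rel_mode == 1 -> 6.25 %
    return ipm, pct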
# A refactored variant of the same method: the bar-chart scaling is gone, empty
# items are skipped and reported via 'SkippedEmpty', and positional-attribute
# norms are taken from the searched (sub)corpus size. The annotations below
# assume `from typing import Any, Dict, List` at the module level.

def xfreq_dist(self, crit, limit=1, sortkey='f', ftt_include_empty: int = 0, rel_mode=0,
               collator_locale='en_US'):
    """
    Calculates data (including data for visual output) of a frequency distribution
    specified by the 'crit' parameter.

    arguments:
    crit -- specified criteria (CQL)
    limit -- minimal frequency accepted; this value is exclusive (i.e. accepted
             values must be greater than the limit)
    sortkey -- a key according to which the distribution will be sorted
    ftt_include_empty -- if non-zero (and limit == 0 and the first attribute is
                         structural) then attribute values with zero frequency
                         are included in the output
    rel_mode -- {0, 1}; 0 relativizes frequencies against per-category norms
                (suited to structural attrs), 1 against the size of the searched
                (sub)corpus (suited to positional attrs)
    """

    def label(attr):
        if '/' in attr:
            attr = attr[:attr.index('/')]
        lab = self.pycorp.get_conf(attr + '.LABEL')
        return lab if lab else attr

    def export_word(wrd):
        return [{'n': ' '.join(n.split('\v'))} for n in wrd.split('\t')]

    def test_word_empty(wrd):
        return len(wrd) == 1 and (wrd[0]['n'] == '' or wrd[0]['n'] == '===NONE===')

    words = manatee.StrVector()
    freqs = manatee.NumVector()
    norms = manatee.NumVector()
    self.pycorp.freq_dist(self.RS(), crit, limit, words, freqs, norms)
    if len(freqs) == 0:
        return dict(Head=[], Items=[], SkippedEmpty=False, NoRelSorting=True)

    if rel_mode == 0:
        # for structural attrs, we intentionally rewrite norms as filled in by
        # Corpus.freq_dist() because of the "hard to explain" metrics they lead to
        norms2_dict = self.get_attr_values_sizes(crit)
        norms = [norms2_dict.get(x, 0) for x in words]
    else:
        # for positional attrs, the norm is the size of the actual corpus/subcorpus.
        # Please note that for an "ad hoc" (or unnamed) subcorpus this may be
        # misleading, as we still calculate against the original corpus.
        norms = [self.pycorp.search_size for _ in words]

    attrs = crit.split()
    head: List[Dict[str, Any]] = [
        dict(n=label(attrs[x]), s=x / 2) for x in range(0, len(attrs), 2)]
    head.append(dict(n=translate('Freq'), s='freq', title=translate('Frequency')))
    head.append(dict(
        n='i.p.m.',
        title=translate('instances per million positions (refers to the respective category)'),
        s='rel'))

    has_empty_item = False
    lines = []
    for w, f, nf in zip(words, freqs, norms):
        word = export_word(w)
        if test_word_empty(word):
            has_empty_item = True
            continue
        # guard against a zero norm (a value missing from norms2_dict)
        lines.append(dict(Word=word, freq=f, norm=nf,
                          rel=round(f / nf * 1e6, 2) if nf else 0))

    if ftt_include_empty and limit == 0 and '.' in attrs[0]:
        attr = self.pycorp.get_attr(attrs[0])
        all_vals = [attr.id2str(i) for i in range(attr.id_range())]
        used_vals = [line['Word'][0]['n'] for line in lines]
        for v in all_vals:
            if v in used_vals:
                continue
            lines.append(dict(Word=[{'n': v}], freq=0, rel=0, norm=0))

    if (sortkey in ('0', '1', '2')) and (int(sortkey) < len(lines[0]['Word'])):
        sortkey = int(sortkey)
        lines = l10n.sort(lines, loc=collator_locale, key=lambda v: v['Word'][sortkey]['n'])
    else:
        if sortkey not in ('freq', 'rel'):
            sortkey = 'freq'
        lines = sorted(lines, key=lambda v: v[sortkey], reverse=True)
    return dict(Head=head, Items=lines, SkippedEmpty=has_empty_item,
                NoRelSorting=bool(rel_mode))
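# A usage sketch for the refactored variant above. `corp` is assumed to be an
# instance of the class these methods belong to (its construction is not shown
# in this excerpt) and 'doc.txtype 0' is just an illustrative freq. criterion:

def print_freq_summary(corp, crit='doc.txtype 0'):
    """Print the attribute value, absolute frequency and i.p.m. of each item."""
    data = corp.xfreq_dist(crit, limit=0, sortkey='freq', rel_mode=0)
    for item in data['Items']:
        print(item['Word'][0]['n'], item['freq'], item['rel'])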