def xdistribution(self, xrange, yrange):
    """
    TODO: no direct call found for this

    Fill two manatee.IntVector buffers through self.distribution()
    and pair the results up.

    arguments:
    xrange -- value used to initialise both IntVector buffers
              (presumably the number of buckets -- confirm against
              the manatee API)
    yrange -- passed unchanged as the last argument of
              self.distribution()

    returns:
    the zip of both buffers as (value, beg) pairs
    """
    beg_buffer = manatee.IntVector(xrange)
    val_buffer = manatee.IntVector(xrange)
    # distribution() writes its output into the two vectors in place
    self.distribution(val_buffer, beg_buffer, yrange)
    pairs = zip(val_buffer, beg_buffer)
    return pairs
def linegroup_info_select(self, selected_count=5):
    """
    TODO: no direct call found for this

    Pick the most frequent line groups and remember them in
    self.selected_grps.

    Group statistics are read via self.get_linegroup_stat(); groups
    with id 0 are left out of the ranking. The remaining groups are
    ranked by (frequency, id) and the `selected_count` highest ones
    are kept.

    arguments:
    selected_count -- maximum number of non-zero groups to select;
                      a value <= 0 selects none

    returns:
    a list starting with 0 followed by the selected group ids in
    ascending order (the same list is stored in self.selected_grps)
    """
    ids = manatee.IntVector()
    freqs = manatee.IntVector()
    self.get_linegroup_stat(ids, freqs)
    ranked = sorted((f, i) for f, i in zip(freqs, ids) if i)
    # ranked[-0:] would yield the whole list, hence the explicit guard
    top = ranked[-selected_count:] if selected_count > 0 else []
    self.selected_grps = [0] + sorted(i for _, i in top)
    return self.selected_grps
def sort_line_groups(conc, group_ids):
    """
    Call conc.linegroup_sort() with the given group ids, each paired
    with its zero-padded, 5-digit decimal representation.

    arguments:
    conc -- an object providing linegroup_sort(ids, strs)
            (presumably a manatee concordance -- confirm with caller)
    group_ids -- iterable of integer group ids
    """
    id_vector = manatee.IntVector()
    label_vector = manatee.StrVector()
    group_ids = list(group_ids)
    for gid in group_ids:
        id_vector.append(gid)
    for gid in group_ids:
        # fixed-width labels so that string order equals numeric order
        label_vector.append('%05d' % gid)
    conc.linegroup_sort(id_vector, label_vector)
def linegroup_info_subset(self, conc):
    """
    TODO: no direct call found for this

    Copy this object's line groups onto `conc` and summarise how the
    lines of `conc` are spread over self.selected_grps.

    arguments:
    conc -- object providing sync(), set_linegroup_from_conc(),
            size() and get_linegroup_stat(); an earlier note here
            read "conc = manatee.Concordance (fstream)"

    returns:
    a tuple (maxid, ratio, hist) where
      maxid -- id of the group with the highest frequency
      ratio -- that frequency divided by conc.size()
      hist  -- frequency of each group in self.selected_grps plus a
               final item holding the remainder of conc.size()
    If `conc` is empty or has no group statistics, (0, 0, zeros) is
    returned with len(self.selected_grps) + 1 zeros.
    """
    def blank_result():
        return 0, 0, [0] * (len(self.selected_grps) + 1)

    conc.sync()
    conc.set_linegroup_from_conc(self)
    if not conc.size():
        return blank_result()

    group_ids = manatee.IntVector()
    group_freqs = manatee.IntVector()
    conc.get_linegroup_stat(group_ids, group_freqs)
    freq_by_group = dict(zip(group_ids, group_freqs))
    if not freq_by_group:
        # no annotation
        return blank_result()

    hist = [freq_by_group.get(grp, 0) for grp in self.selected_grps]
    # whatever is not covered by the selected groups goes last
    remainder = conc.size() - sum(hist)
    hist.append(remainder)

    top_freq, top_id = max(zip(group_freqs, group_ids))
    ratio = top_freq / float(conc.size())
    return top_id, ratio, hist
def command_g(self, options):
    """
    sort according to linegroups

    Loads a stored concordance via get_stored_conc(), copies its line
    groups onto this object and calls self.linegroup_sort() with the
    ids and sort strings derived from the stored label map.

    arguments:
    options -- passed unchanged to get_stored_conc()

    Note: the label map of the stored concordance is modified in
    place (key 0 is set to None).
    """
    stored = get_stored_conc(self.pycorp, options, self.pycorp._conc_dir)
    self.set_linegroup_from_conc(stored)

    labels = stored.labelmap
    labels[0] = None

    numeric_ids = map(int, labels.keys())
    id_vector = manatee.IntVector(numeric_ids)
    sort_strings = map(lngrp_sortstr, labels.values())
    str_vector = manatee.StrVector(sort_strings)

    self.linegroup_sort(id_vector, str_vector)
def get_sort_idx(self, q=(), pagesize=20):
    """
    In case sorting is active this method generates shortcuts to pages where
    new first letter of sorted keys (it can be 'left', 'kwic', 'right') starts.

    arguments:
    q -- a query (as a list)
    pagesize -- number of items per page

    returns:
    a list of dicts with following structure (example):
    [{'page': 1, 'label': u'a'}, {'page': 1, 'label': u'A'}, {'page': 2, 'label': u'b'},...]
    An empty list is returned when `q` contains no sorting operation
    (an item starting with 's' but not with 's*').
    """
    # the last matching sort operation in the query wins
    crit = ''
    for qq in q:
        if qq.startswith('s') and not qq.startswith('s*'):
            crit = qq[1:]
    if not crit:
        return []

    vals = manatee.StrVector()
    idx = manatee.IntVector()
    just_letters = '.' not in crit.split('/')[0]
    self.conc.sort_idx(crit, vals, idx, just_letters)

    # floor division keeps page numbers integral on Python 2 and 3 alike
    out = [(v, pos // pagesize + 1) for v, pos in zip(vals, idx)]

    if just_letters:
        # keep only the first page on which each initial character occurs
        seen = set()
        first_pages = []
        for v, p in out:
            initial = v[0]
            if initial not in seen:
                seen.add(initial)
                first_pages.append((initial, p))
        out = first_pages

    # NOTE(review): this step used to be wrapped in
    # `except UnicodeDecodeError`, but building a dict and appending it
    # cannot raise that error, so the handler never fired. If manatee
    # yields undecodable items, handle them where `vals` is iterated.
    return [{'page': p, 'label': v} for v, p in out]
def iter_label2pos(corpus, query):
    """
    Low-level function to evaluate a labeled CQL query on a given corpus
    (called from `match_cql` function).

    # Arguments
        corpus: `manatee.Corpus` object
        query: labeled CQL query

    # Yields
        {label: position} dictionaries, one per result; positions are
        the values reported by `collocs` shifted by the position of
        the result's first token
    """
    stream = corpus.eval_query(query)
    while not stream.end():
        first_pos = stream.peek_beg()  # query's first token's position
        flat = manatee.IntVector()
        stream.collocs(flat)
        # `flat` holds consecutive pairs: a label followed by a value
        # that is added to `first_pos`
        label2pos = {}
        for slot in range(0, len(flat), 2):
            label = flat[slot]
            label2pos[label] = flat[slot + 1] + first_pos
        yield label2pos
        stream.next()
def xdistribution(self, xrange: List[int], amplitude: int) -> Tuple[List[int], List[int]]:
    """
    Fill two manatee.IntVector buffers through self.distribution().

    arguments:
    xrange -- value used to initialise both IntVector buffers
    amplitude -- passed unchanged as the last argument of
                 self.distribution()

    returns:
    a tuple (begs, values) holding the two filled buffers
    """
    beg_buffer = manatee.IntVector(xrange)
    value_buffer = manatee.IntVector(xrange)
    # distribution() writes its output into the two vectors in place
    self.distribution(value_buffer, beg_buffer, amplitude)
    return (beg_buffer, value_buffer)