def _load_raw_sent(self, corpus, corpus_id, token_id, kwic_len, tree_attrs):
    """
    Retrieve a sentence via Manatee

    Args:
        corpus (manatee.Corpus): a corpus instance
        corpus_id (str): corpus ID
        token_id (int): token number/id
        kwic_len (int): number of tokens in KWIC
        tree_attrs (list of str): a list of positional attributes required by tree nodes/edges

    Returns (dict):
        data: a list of strings (Manatee raw format)
        kwic_pos: a tuple (first_kwic_idx, kwic_length), both integers

        None is returned if Manatee provides no line for the query.
    """
    encoding = corpus.get_conf('ENCODING')
    sentence_struct = self._conf.get_sentence_struct(corpus_id)
    # the KWIC is addressed as a sequence of consecutive token positions
    query = ' '.join('[#%d]' % k for k in range(token_id, token_id + kwic_len))
    conc = manatee.Concordance(corpus, query, 1, -1)
    conc.sync()
    attrs = ','.join(tree_attrs)
    # both contexts are bounded by the sentence structure
    kl = manatee.KWICLines(corpus, conc.RS(True, 0, 1),
                           '-1:%s' % sentence_struct, '1:%s' % sentence_struct,
                           attrs, attrs, '', '')
    if not kl.nextline():
        return None
    left_tk = kl.get_left()
    kwic_tk = kl.get_kwic()
    raw_items = left_tk + kwic_tk + kl.get_right()
    # The raw lists are divided by 4 to obtain token counts (presumably four
    # raw items per token - verify against the Manatee output format).
    # Floor division keeps the values integral on Python 3 as well.
    return dict(data=[import_string(s, from_encoding=encoding) for s in raw_items],
                kwic_pos=(len(left_tk) // 4, len(kwic_tk) // 4))
def matching_structattr(corp: manatee.Corpus, struct: str, attr: str, val: str, search_attr: str) -> Tuple[List[str], int, int]:
    """
    Find values of [struct].[search_attr] within structures
    where [struct].[attr] = [val].

    returns:
    a 3-tuple (sorted unique values, concordance size, concordance size
    capped at the internal limit of 1,000,000 lines); in case Manatee
    reports 'AttrNotFound', ([], 0, 0) is returned. Any other RuntimeError
    is re-raised.
    """
    max_lines = 1000000
    try:
        cql = '<{struct} {attr}="{attr_val}">[]'.format(struct=struct, attr=attr, attr_val=val)
        conc = manatee.Concordance(corp, cql, 0, -1)
        conc.sync()
        conc_size = conc.size()
        # the searched attribute is requested as the only reference of each line
        ref_spec = '={}.{}'.format(struct, search_attr)
        lines = manatee.KWICLines(corp, conc.RS(True, 0, max_lines), '-1', '1', 'word', '', '',
                                  ref_spec)
        found = set()
        while lines.nextline():
            ref_list = lines.get_ref_list()
            if len(ref_list) > 0:
                found.add(ref_list[0])
        return sorted(found), conc_size, min(conc_size, max_lines)
    except RuntimeError as ex:
        if 'AttrNotFound' not in str(ex):
            raise ex
        return [], 0, 0
def _load_raw_sent(self, corpus, canonical_corpus_id, token_id, tree_attrs):
    """
    Load the sentence surrounding a single token via Manatee.

    Args:
        corpus: a corpus instance passed to manatee.Concordance
        canonical_corpus_id: an ID used to look up the sentence structure
            in the plug-in configuration
        token_id (int): a position of the token
        tree_attrs (list of str): positional attributes to be fetched
            for both the KWIC and its context

    Returns:
        a list of decoded strings (left context + KWIC + right context
        as provided by Manatee) or None if no line is available
    """
    encoding = corpus.get_conf('ENCODING')
    sentence_struct = self._conf.get_sentence_struct(canonical_corpus_id)
    conc = manatee.Concordance(corpus, '[#%d]' % token_id, 1, -1)
    conc.sync()
    attr_list = ','.join(tree_attrs)
    left_ctx = '-1:%s' % sentence_struct
    right_ctx = '1:%s' % sentence_struct
    kl = manatee.KWICLines(corpus, conc.RS(True, 0, 1), left_ctx, right_ctx,
                           attr_list, attr_list, '', '')
    if not kl.nextline():
        return None
    raw_items = kl.get_left() + kl.get_kwic() + kl.get_right()
    return [import_string(item, from_encoding=encoding) for item in raw_items]
def get_groups_first_line(self):
    """
    Find the first concordance line with a line group assigned.

    returns:
    a 1-based order of the first line for which Manatee reports
    a non-empty line group; None if there is no such line or if
    the current concordance is an InitialConc instance
    """
    if isinstance(self.conc, InitialConc):
        return None
    # no context, attributes, structures nor references are needed here
    kl = manatee.KWICLines(
        self.conc.corp(),
        r=self.conc.RS(True, 0, int(self.conc.size())),
        left='0',
        right='0',
        kwica='',
        ctxa='',
        struca='',
        refa='',
    )
    line_num = 0
    while kl.nextline():
        line_num += 1
        if kl.get_linegroup():
            return line_num
    return None
def _find_refs(conc, attr, alignment, idx):
    """
    Print alignment information for the first line of a concordance.

    For the line, the 's.id,#' references are parsed via _parse_refs, the
    obtained structure ID is converted by attr.str2id and the result is
    searched via alignment.find_left_val. All the values are printed
    to the standard output prefixed by idx.
    """
    kw = manatee.KWICLines(
        conc.corp(),
        conc.RS(True, 0, 1),  # only the first line is examined
        '-1',  # left context
        '1',  # right context
        '',  # attrs
        '',  # attrs of all positions
        '',  # structs
        's.id,#',  # refs
        10,  # max. context
    )
    report = '#{0} -- {1} -- 1st sentence in corp: {2} -- aligndef line: {3}'
    while kw.nextline():
        struct_id, _ = _parse_refs(kw.get_refs())
        sent_order = attr.str2id(struct_id)
        found = alignment.find_left_val(sent_order)
        print(report.format(idx, struct_id, sent_order, found))
def generate_kwiclines(self, query, corpus):
    """
    Collect all unique 'tag' values of KWICs matching a query.

    Parameters
    ----------
    query : str
        a query to be used to extract all tag values
    corpus
        a corpus; the value is passed directly to manatee.Concordance
        (NOTE(review): formerly documented as "str, a corpus name" -
        confirm the type Manatee actually expects here)

    Returns
    -------
    list
        a sorted list of all unique tag values (with surrounding
        whitespace stripped) as found in the corpus
    """
    conc = manatee.Concordance(corpus, query, 0)
    kw = manatee.KWICLines(conc, '-1#', '1#', 'tag', 'tag', '', '#', 0)
    ans = set()
    for i in range(conc.size()):
        kw.nextline(i)
        # only the first item of the KWIC is taken as the tag value
        ans.add(kw.get_kwic()[0].strip())
    return sorted(ans)
def add_structattr_support(corp: KCorpus, attrs, token_id):
    """
    A decorator function which turns 'fetch_posattr' into
    a more general function which is able to load
    structural attributes too. The load is performed only
    once for all possible structural attributes.

    arguments:
    corp -- a corpus used to load structural attribute values
    attrs -- a list of attribute names; the ones containing '.' are
             handled as structural attributes
    token_id -- a position of the token the values are loaded for

    returns:
    a decorator; the decorated function accepts (corp, attr, token_id, num_tokens).
    For an attr containing '.', the preloaded value is returned (KeyError is
    raised if no such value has been loaded), otherwise the call is passed
    to the original function.
    """
    from functools import wraps  # local import - the file-level imports are left untouched

    data = {}
    refs = [x for x in attrs if '.' in x]
    # Manatee may report a reference under its configured LABEL instead of
    # the attribute name, so a mapping back to the name is needed
    refs_mapping = {}
    for n in refs:
        lab = corp.get_conf(f'{n}.LABEL')
        refs_mapping[lab if lab else n] = n

    if refs:
        conc = manatee.Concordance(corp.unwrap(), '[#{}]'.format(int(token_id)), 1, -1)
        conc.sync()
        rs = conc.RS(True, 0, 0)
        kl = manatee.KWICLines(corp.unwrap(), rs, '-1', '1', 'word', '', '', ','.join(refs))
        if kl.nextline():
            for kv in kl.get_refs().split(','):
                # split on the first '=' only so a value containing '='
                # does not raise ValueError
                k, sep, v = kv.partition('=')
                if sep:
                    data[refs_mapping.get(k)] = v

    def decorator(fn):
        @wraps(fn)
        def wrapper(corp, attr, token_id, num_tokens):
            if '.' in attr:
                return data[attr]
            return fn(corp, attr, token_id, num_tokens)
        return wrapper
    return decorator
def kwiclines(self, args):
    """
    Generates list of 'kwic' (= keyword in context) lines according
    to the provided Concordance object and additional parameters (like
    page number, width of the left and right context etc.).

    arguments:
    args -- a KwicLinesArgs instance; attributes read here are structs,
            speech_segment, righttoleft, fromline, toline, leftctx, rightctx,
            attrs, ctxattrs and refs. Please note that in the right-to-left
            mode ',ltr' is appended to args.structs (i.e. the passed
            object is modified).

    returns:
    a list of dictionaries (one per concordance line) with keys
    toknum, hitlen, kwiclen, ref, Kwic, linegroup, leftsize, rightsize,
    linenum, Left, Right, leftspace, rightspace
    """

    def text_width(words):
        # structure tags ('strc') do not count; +1 stands for a separator
        return sum(len(w['str']) + 1 for w in words if w['class'] != 'strc')

    # add structures needed to render speech playback information
    all_structs = args.structs
    # evaluated once so the per-line branch below always agrees with
    # the initialization of speech_struct_attr
    has_audio = self.speech_segment_has_audio(args.speech_segment)
    if has_audio:
        speech_struct_attr_name = '.'.join(args.speech_segment)
        speech_struct_attr = self.corpus.get_attr(speech_struct_attr_name)
        if speech_struct_attr_name not in args.structs:
            all_structs += ',' + speech_struct_attr_name
    else:
        speech_struct_attr_name = ''
        speech_struct_attr = None

    lines = []

    if args.righttoleft:
        rightlabel, leftlabel = 'Left', 'Right'
        # NOTE(review): at this point all_structs is already derived so
        # ',ltr' does not reach manatee.KWICLines - confirm this is intended
        args.structs += ',ltr'
    else:
        leftlabel, rightlabel = 'Left', 'Right'

    # self.conc.corp() must be used here instead of self.corpus
    # because in case of parallel corpora these two are different and only the latter one is correct
    if isinstance(self.conc, InitialConc):
        kl = EmptyKWiclines()
    else:
        kl = manatee.KWICLines(
            self.conc.corp(), self.conc.RS(True, args.fromline, args.toline),
            args.leftctx, args.rightctx, args.attrs, args.ctxattrs, all_structs, args.refs)

    maxleftsize = 0
    maxrightsize = 0
    # the speech structure is filtered out only if it has been added internally
    filter_out_speech_tag = args.speech_segment and args.speech_segment[0] not in args.structs \
        and speech_struct_attr_name in all_structs

    i = args.fromline
    while kl.nextline():
        # manatee returns 0 in case of no group; client-side uses -1 as "no group"
        linegroup = kl.get_linegroup() or -1
        if has_audio:
            leftmost_speech_id = speech_struct_attr.pos2str(kl.get_ctxbeg())
        else:
            leftmost_speech_id = None
        # the last speech ID is passed from left to right through the chunks
        leftwords, last_left_speech_id = self.update_speech_boundaries(
            args.speech_segment, tokens2strclass(kl.get_left()), 'left',
            filter_out_speech_tag, leftmost_speech_id)
        kwicwords, last_left_speech_id = self.update_speech_boundaries(
            args.speech_segment, tokens2strclass(kl.get_kwic()), 'kwic',
            filter_out_speech_tag, last_left_speech_id)
        rightwords = self.update_speech_boundaries(
            args.speech_segment, tokens2strclass(kl.get_right()), 'right',
            filter_out_speech_tag, last_left_speech_id)[0]

        leftwords = self.postproc_text_chunk(leftwords)
        kwicwords = self.postproc_text_chunk(kwicwords)
        rightwords = self.postproc_text_chunk(rightwords)

        if args.righttoleft and Kwic.isengword(kwicwords[0]):
            leftwords, rightwords = Kwic.update_right_to_left(leftwords, rightwords)

        leftsize = text_width(leftwords)
        maxleftsize = max(maxleftsize, leftsize)
        rightsize = text_width(rightwords)
        maxrightsize = max(maxrightsize, rightsize)

        line_data = dict(toknum=kl.get_pos(),
                         hitlen=Kwic.non1hitlen(kl.get_kwiclen()),
                         kwiclen=kl.get_kwiclen(),
                         ref=list(kl.get_ref_list()),
                         Kwic=kwicwords,
                         linegroup=linegroup,
                         leftsize=leftsize,
                         rightsize=rightsize,
                         linenum=i)
        line_data[leftlabel] = leftwords
        line_data[rightlabel] = rightwords
        lines.append(line_data)
        i += 1

    # padding aligns all the lines to the widest left/right context
    for line in lines:
        line['leftspace'] = ' ' * (maxleftsize - line['leftsize'])
        line['rightspace'] = ' ' * (maxrightsize - line['rightsize'])
    return lines
def kwiclines(self, speech_segment, fromline, toline, leftctx='-5', rightctx='5',
              attrs='word', ctxattrs='word', refs='#', user_structs='p',
              labelmap=None, righttoleft=False, alignlist=None,
              align_attrname='align', aattrs='word', astructs=''):
    """
    Generates list of 'kwic' (= keyword in context) lines according
    to the provided Concordance object and additional parameters (like
    page number, width of the left and right context etc.).

    arguments:
    speech_segment -- 2-tuple; its items joined by '.' give the name of
                      the speech structural attribute (a false value
                      disables speech processing)
    fromline, toline -- a range of concordance lines to be loaded
    leftctx, rightctx -- left and right context specification
    attrs, ctxattrs -- attributes of the KWIC and of the context
    refs -- references to be loaded
    user_structs -- structures requested by user
    labelmap -- a mapping from line group (as a string) to its label;
                groups not present are labeled as '#' + group
                (None means an empty mapping)
    righttoleft -- if True then left and right contexts are swapped
    alignlist, align_attrname, aattrs, astructs -- not used by this method

    returns:
    a list of dictionaries (one per concordance line) with keys
    toknum, hitlen, kwiclen, ref, Kwic, linegroup, leftsize, rightsize,
    linenum, Left, Right, leftspace, rightspace
    """

    def text_width(words):
        # structure tags ('strc') do not count; +1 stands for a separator
        return sum(len(w['str']) + 1 for w in words if w['class'] != 'strc')

    # structs represent which structures are requested by user
    # all_structs contain also internal structures needed to render
    # additional information (like the speech links)
    all_structs = user_structs
    if speech_segment:
        speech_struct_attr_name = '.'.join(speech_segment)
        speech_struct_attr = self.corpus.get_attr(speech_struct_attr_name)
        if speech_struct_attr_name not in user_structs:
            all_structs += ',' + speech_struct_attr_name
    else:
        speech_struct_attr_name = None
        speech_struct_attr = None

    lines = []

    if righttoleft:
        rightlabel, leftlabel = 'Left', 'Right'
        # NOTE(review): at this point all_structs is already derived so
        # ',ltr' does not reach manatee.KWICLines - confirm this is intended
        user_structs += ',ltr'
    else:
        leftlabel, rightlabel = 'Left', 'Right'

    # self.conc.corp() must be used here instead of self.corpus
    # because in case of parallel corpora these two are different and only the latter one is correct
    kl = manatee.KWICLines(self.conc.corp(), self.conc.RS(True, fromline, toline),
                           leftctx, rightctx, attrs, ctxattrs, all_structs, refs)

    # a copy is used so the caller's mapping is never modified
    labelmap = {} if labelmap is None else labelmap.copy()
    labelmap['_'] = '_'
    maxleftsize = 0
    maxrightsize = 0
    # the speech structure is filtered out only if it has been added internally
    filter_out_speech_tag = speech_segment and speech_segment[0] not in user_structs \
        and speech_struct_attr_name in all_structs

    i = fromline
    while kl.nextline():
        # '_' represents "no group"
        linegroup = str(kl.get_linegroup() or '_')
        linegroup = labelmap.get(linegroup, '#' + linegroup)
        if speech_segment:
            leftmost_speech_id = speech_struct_attr.pos2str(kl.get_ctxbeg())
        else:
            leftmost_speech_id = None
        # the last speech ID is passed from left to right through the chunks
        leftwords, last_left_speech_id = self.postproc_kwicline_part(
            speech_segment, tokens2strclass(kl.get_left()), 'left',
            filter_out_speech_tag, leftmost_speech_id)
        kwicwords, last_left_speech_id = self.postproc_kwicline_part(
            speech_segment, tokens2strclass(kl.get_kwic()), 'kwic',
            filter_out_speech_tag, last_left_speech_id)
        rightwords = self.postproc_kwicline_part(
            speech_segment, tokens2strclass(kl.get_right()), 'right',
            filter_out_speech_tag, last_left_speech_id)[0]

        if righttoleft and Kwic.isengword(kwicwords[0]):
            leftwords, rightwords = Kwic.update_right_to_left(leftwords, rightwords)

        leftsize = text_width(leftwords)
        maxleftsize = max(maxleftsize, leftsize)
        rightsize = text_width(rightwords)
        maxrightsize = max(maxrightsize, rightsize)

        line_data = dict(toknum=kl.get_pos(),
                         hitlen=Kwic.non1hitlen(kl.get_kwiclen()),
                         kwiclen=kl.get_kwiclen(),
                         ref=self.import_string(kl.get_refs()),
                         Kwic=kwicwords,
                         linegroup=linegroup,
                         leftsize=leftsize,
                         rightsize=rightsize,
                         linenum=i)
        line_data[leftlabel] = leftwords
        line_data[rightlabel] = rightwords
        lines.append(line_data)
        i += 1

    # padding aligns all the lines to the widest left/right context
    for line in lines:
        line['leftspace'] = ' ' * (maxleftsize - line['leftsize'])
        line['rightspace'] = ' ' * (maxrightsize - line['rightsize'])
    return lines