def to_pretty(self, tok='tok', lem='lem', pos='pos', dep='dep', sdp='sdp', ner='ner', srl='srl', con='con',
              show_header=True, html=False) -> Union[str, List[str]]:
    """
    Convert to a pretty text representation which can be printed to visualize linguistic structures.

    Args:
        tok: Token key.
        lem: Lemma key.
        pos: Part-of-speech key.
        dep: Dependency parse tree key.
        sdp: Semantic dependency tree/graph key. SDP visualization has not been implemented yet.
        ner: Named entity key.
        srl: Semantic role labeling key.
        con: Constituency parsing key.
        show_header: ``True`` to include a header which indicates each field with its name.
        html: ``True`` to output HTML format so that non-ASCII characters can align correctly.

    Returns:
        A pretty string when ``to_conll`` yields a single ``CoNLLSentence``, otherwise a list with one
        pretty string per sentence. With ``html=True`` each string is an HTML fragment instead.

    """
    results = []
    tok = prefix_match(tok, self)
    pos = prefix_match(pos, self)
    ner = prefix_match(ner, self)
    conlls = self.to_conll(tok, lem, pos, dep, sdp)
    # A single sentence is wrapped in a list so that one code path serves both shapes.
    flat = isinstance(conlls, CoNLLSentence)
    if flat:
        conlls: List[CoNLLSentence] = [conlls]

    def condense(block_, extras_=None):
        # Render the block as a table, then squash every column after the first into one cell so
        # that each row becomes ``[first_column, rest]``. The rows are also appended to ``extras_``.
        text_ = make_table(block_, insert_header=False)
        text_ = [x.split('\t', 1) for x in text_.split('\n')]
        text_ = [[x[0], x[1].replace('\t', '')] for x in text_]
        if extras_:
            for r, s in zip(extras_, text_):
                r.extend(s)
        return text_

    for i, conll in enumerate(conlls):
        conll: CoNLLSentence = conll
        tokens = [x.form for x in conll]
        length = len(conll)
        # Row 0 collects headers, rows 1..length collect the extra columns of each token.
        extras = [[] for _ in range(length + 1)]
        if ner in self:
            ner_samples = self[ner]
            if flat:
                ner_samples = [ner_samples]
            ner_per_sample = ner_samples[i]
            # For nested NER, use the longest span
            start_offsets = [None] * length
            for ent, label, b, e in ner_per_sample:
                if not start_offsets[b] or e > start_offsets[b][-1]:
                    start_offsets[b] = (ent, label, b, e)
            ner_per_sample = [y for y in start_offsets if y]
            header = ['Token', 'NER', 'Type']
            block = [[] for _ in range(length + 1)]
            _ner = []
            _type = []
            offset = 0
            for _, label, b, e in ner_per_sample:
                render_labeled_span(b, e, _ner, _type, label, offset)
                offset = e
            if offset != length:
                # Pad the columns for the tokens after the last entity.
                _ner.extend([''] * (length - offset))
                _type.extend([''] * (length - offset))
            if any(_type):
                block[0].extend(header)
                for j, (_s, _t) in enumerate(zip(_ner, _type)):
                    block[j + 1].extend((tokens[j], _s, _t))
                condense(block, extras)

        if srl in self:
            srl_samples = self[srl]
            if flat:
                srl_samples = [srl_samples]
            srl_per_sample = srl_samples[i]
            # One column group per predicate-argument structure.
            for k, pas in enumerate(srl_per_sample):
                if not pas:
                    continue
                block = [[] for _ in range(length + 1)]
                header = ['Token', 'SRL', f'PA{k + 1}']
                _srl = []
                _type = []
                offset = 0
                p_index = None
                for _, label, b, e in pas:
                    render_labeled_span(b, e, _srl, _type, label, offset)
                    offset = e
                    if label == PRED:
                        p_index = b
                if len(_srl) != length:
                    _srl.extend([''] * (length - offset))
                    _type.extend([''] * (length - offset))
                if p_index is not None:
                    _srl[p_index] = '╟──►'
                    # _type[j] = 'V'
                    if len(block) != len(_srl) + 1:
                        # warnings.warn(f'Unable to visualize overlapped spans: {pas}')
                        continue
                block[0].extend(header)
                # Make sure both columns cover every token before zipping them with the rows.
                while len(_srl) < length:
                    _srl.append('')
                while len(_type) < length:
                    _type.append('')
                for j, (_s, _t) in enumerate(zip(_srl, _type)):
                    block[j + 1].extend((tokens[j], _s, _t))
                condense(block, extras)

        if con in self:
            con_samples: Tree = self[con]
            if flat:
                con_samples: List[Tree] = [con_samples]
            tree = con_samples[i]
            block = [[] for _ in range(length + 1)]
            block[0].extend(('Token', 'PoS'))
            for j, t in enumerate(tree.pos()):
                block[j + 1].extend(t)

            for height in range(2, tree.height() + (0 if len(tree) == 1 else 1)):
                offset = 0
                spans = []
                labels = []
                for subtree in tree.subtrees(lambda x: x.height() == height):
                    subtree: Tree = subtree
                    b, e = offset, offset + len(subtree.leaves())
                    if height >= 3:
                        # Span from the center of the first child to the center of the last child;
                        # the children's ``center`` was stored while processing a lower height.
                        b, e = subtree[0].center, subtree[-1].center + 1
                    subtree.center = b + (e - b) // 2
                    render_labeled_span(b, e, spans, labels, subtree.label(), offset, unidirectional=True)
                    offset = e
                if len(spans) != length:
                    spans.extend([''] * (length - len(spans)))
                if len(labels) != length:
                    labels.extend([''] * (length - len(labels)))
                if height < 3:
                    # Height 2 only records ``center``; its labels are not added as a column.
                    continue
                block[0].extend(['', f'{height}'])
                for j, (_s, _t) in enumerate(zip(spans, labels)):
                    block[j + 1].extend((_s, _t))
                # check short arrows and increase their length
                for j, arrow in enumerate(spans):
                    if not arrow:
                        # -1 current tag ; -2 arrow to current tag ; -3 = prev tag ; -4 = arrow to prev tag
                        if block[j + 1][-3] or block[j + 1][-4] == '───►':
                            if height > 3:
                                if block[j + 1][-3]:
                                    block[j + 1][-1] = block[j + 1][-3]
                                    block[j + 1][-2] = '───►'
                                else:
                                    block[j + 1][-1] = '────'
                                    block[j + 1][-2] = '────'
                                block[j + 1][-3] = '────'
                                if block[j + 1][-4] == '───►':
                                    block[j + 1][-4] = '────'
                            else:
                                block[j + 1][-1] = '────'
                            if block[j + 1][-1] == '────':
                                block[j + 1][-2] = '────'
                                if not block[j + 1][-4]:
                                    block[j + 1][-4] = '────'
            # If the root label is shorter than the level number, extend it to the same length
            level_len = len(block[0][-1])
            for row in block[1:]:
                if row[-1] and len(row[-1]) < level_len:
                    row[-1] = row[-1] + ' ' * (level_len - len(row[-1]))

            text = condense(block)
            # Cosmetic issues
            for row in text[1:]:
                # The pattern is TWO spaces followed by a line. With a single space the replacement
                # would contain the pattern again and the loop would never terminate.
                while '  ─' in row[1]:
                    row[1] = row[1].replace('  ─', ' ──')
                row[1] = row[1].replace('─ ─', '───')
                # Fill the gap between a label and the next vertical bar, keeping the width unchanged.
                row[1] = re.sub(
                    r'([►─])([\w-]*)(\s+)([│├])',
                    lambda m: f'{m.group(1)}{m.group(2)}{"─" * len(m.group(3))}'
                              f'{"┤" if m.group(4) == "│" else "┼"}',
                    row[1])
                row[1] = re.sub(r'►(─+)►', r'─\1►', row[1])
            for r, s in zip(extras, text):
                r.extend(s)
        # warnings.warn('Unable to visualize non-projective trees.')
        if dep in self and conll.projective:
            text = conll.to_tree(extras)
            if not show_header:
                # Drop the first two lines of the rendered tree.
                text = text.split('\n')
                text = '\n'.join(text[2:])
            results.append(text)
        elif any(extras):
            results.append(make_table(extras, insert_header=True))
        else:
            # Nothing to draw: fall back to ``field/field/...`` for each token.
            results.append(' '.join(['/'.join(str(f) for f in x.nonempty_fields) for x in conll]))

    if html:
        # Imported here under an alias because the ``html`` parameter shadows the module name.
        from html import escape as escape_html

        def to_html(pretty_text: str) -> str:
            # Transpose the tab-separated text into columns and emit one table-cell <pre> per column.
            cells = [line.split('\t') for line in pretty_text.split('\n') if line]
            parts = ['<div style="display: table; padding-bottom: 1rem;">']
            if cells:  # an empty rendering yields an empty container instead of an IndexError
                num_cols = len(cells[0])
                cols = [[row[c] for row in cells] for c in range(num_cols)]
                for c, each in enumerate(cols):
                    parts.append(
                        '<pre style="display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,'
                        'Liberation Mono,Courier New,monospace; white-space: nowrap; line-height: 128%; '
                        'padding: 0;">')
                    if c != len(cols) - 1:
                        # Separate this column from the next one.
                        each = [x + ' ' for x in each]
                    # Escape markup characters first, then turn spaces into non-breaking spaces since
                    # ``white-space: nowrap`` would otherwise collapse them and break the alignment.
                    parts.append('<br>'.join(
                        escape_html(x, quote=False).replace(' ', '&nbsp;') for x in each))
                    parts.append('</pre>')
            parts.append('</div>')
            return ''.join(parts)

        results = [to_html(x) for x in results]
    if flat:
        return results[0]
    return results
def to_pretty(self, tok='tok', lem='lem', pos='pos', dep='dep', sdp='sdp', ner='ner', srl='srl', con='con',
              show_header=True) -> Union[str, List[str]]:
    """
    Convert to a pretty text representation which can be printed to visualize linguistic structures.

    Args:
        tok: Token key.
        lem: Lemma key.
        pos: Part-of-speech key.
        dep: Dependency parse tree key.
        sdp: Semantic dependency tree/graph key. SDP visualization has not been implemented yet.
        ner: Named entity key.
        srl: Semantic role labeling key.
        con: Constituency parsing key.
        show_header: ``True`` to print a header which indicates each field with its name.

    Returns:
        A pretty string when ``to_conll`` yields a single ``CoNLLSentence``, otherwise a list with one
        pretty string per sentence.

    """
    results = []
    tok = prefix_match(tok, self)
    pos = prefix_match(pos, self)
    ner = prefix_match(ner, self)
    conlls = self.to_conll(tok, lem, pos, dep, sdp)
    # A single sentence is wrapped in a list so that one code path serves both shapes.
    flat = isinstance(conlls, CoNLLSentence)
    if flat:
        conlls: List[CoNLLSentence] = [conlls]

    def condense(block_, extras_=None):
        # Render the block as a table, then squash every column after the first into one cell so
        # that each row becomes ``[first_column, rest]``. The rows are also appended to ``extras_``.
        text_ = make_table(block_, insert_header=False)
        text_ = [x.split('\t', 1) for x in text_.split('\n')]
        text_ = [[x[0], x[1].replace('\t', '')] for x in text_]
        if extras_:
            for r, s in zip(extras_, text_):
                r.extend(s)
        return text_

    for i, conll in enumerate(conlls):
        conll: CoNLLSentence = conll
        tokens = [x.form for x in conll]
        length = len(conll)
        # Row 0 collects headers, rows 1..length collect the extra columns of each token.
        extras = [[] for _ in range(length + 1)]
        if ner in self:
            ner_samples = self[ner]
            if flat:
                ner_samples = [ner_samples]
            ner_per_sample = ner_samples[i]
            # For nested NER, use the longest span
            start_offsets = [None] * length
            for ent, label, b, e in ner_per_sample:
                if not start_offsets[b] or e > start_offsets[b][-1]:
                    start_offsets[b] = (ent, label, b, e)
            ner_per_sample = [y for y in start_offsets if y]
            header = ['Tok', 'NER', 'Type']
            block = [[] for _ in range(length + 1)]
            _ner = []
            _type = []
            offset = 0
            for _, label, b, e in ner_per_sample:
                render_labeled_span(b, e, _ner, _type, label, offset)
                offset = e
            if offset != length:
                # Pad the columns for the tokens after the last entity.
                _ner.extend([''] * (length - offset))
                _type.extend([''] * (length - offset))
            if any(_type):
                block[0].extend(header)
                for j, (_s, _t) in enumerate(zip(_ner, _type)):
                    block[j + 1].extend((tokens[j], _s, _t))
                condense(block, extras)

        if srl in self:
            srl_samples = self[srl]
            if flat:
                srl_samples = [srl_samples]
            srl_per_sample = srl_samples[i]
            # One column group per predicate-argument structure.
            for k, pas in enumerate(srl_per_sample):
                if not pas:
                    continue
                block = [[] for _ in range(length + 1)]
                header = ['Tok', 'SRL', f'PA{k + 1}']
                _srl = []
                _type = []
                offset = 0
                p_index = None
                for _, label, b, e in pas:
                    render_labeled_span(b, e, _srl, _type, label, offset)
                    offset = e
                    if label == PRED:
                        p_index = b
                if len(_srl) != length:
                    _srl.extend([''] * (length - offset))
                    _type.extend([''] * (length - offset))
                if p_index is not None:
                    _srl[p_index] = '╟──►'
                    # _type[j] = 'V'
                    if len(block) != len(_srl) + 1:
                        warnings.warn(f'Unable to visualize overlapped spans: {pas}')
                        continue
                block[0].extend(header)
                for j, (_s, _t) in enumerate(zip(_srl, _type)):
                    block[j + 1].extend((tokens[j], _s, _t))
                condense(block, extras)

        if con in self:
            con_samples: Tree = self[con]
            if flat:
                con_samples: List[Tree] = [con_samples]
            tree = con_samples[i]
            block = [[] for _ in range(length + 1)]
            block[0].extend(('Tok', 'PoS'))
            for j, t in enumerate(tree.pos()):
                block[j + 1].extend(t)

            for height in range(2, tree.height()):
                offset = 0
                spans = []
                labels = []
                for subtree in tree.subtrees(lambda x: x.height() == height):
                    subtree: Tree = subtree
                    b, e = offset, offset + len(subtree.leaves())
                    if height >= 3:
                        # Span from the center of the first child to the center of the last child;
                        # the children's ``center`` was stored while processing a lower height.
                        b, e = subtree[0].center, subtree[-1].center + 1
                    subtree.center = b + (e - b) // 2
                    render_labeled_span(b, e, spans, labels, subtree.label(), offset, unidirectional=True)
                    offset = e
                if len(spans) != length:
                    spans.extend([''] * (length - len(spans)))
                if len(labels) != length:
                    labels.extend([''] * (length - len(labels)))
                if height < 3:
                    # Height 2 only records ``center``; its labels are not added as a column.
                    continue
                block[0].extend(['', f'{height}'])
                for j, (_s, _t) in enumerate(zip(spans, labels)):
                    block[j + 1].extend((_s, _t))
                # check short arrows and increase their length
                for j, arrow in enumerate(spans):
                    if not arrow:
                        # -1 current tag ; -2 arrow to current tag ; -3 = prev tag ; -4 = arrow to prev tag
                        if block[j + 1][-3] or block[j + 1][-4] == '───►':
                            if height > 3:
                                if block[j + 1][-3]:
                                    block[j + 1][-1] = block[j + 1][-3]
                                    block[j + 1][-2] = '───►'
                                else:
                                    block[j + 1][-1] = '────'
                                    block[j + 1][-2] = '────'
                                block[j + 1][-3] = '────'
                                if block[j + 1][-4] == '───►':
                                    block[j + 1][-4] = '────'
                            else:
                                block[j + 1][-1] = '────'
                            if block[j + 1][-1] == '────':
                                block[j + 1][-2] = '────'
                                if not block[j + 1][-4]:
                                    block[j + 1][-4] = '────'

            text = condense(block)
            # Cosmetic issues
            for row in text:
                # The pattern is TWO spaces followed by a line. With a single space the replacement
                # would contain the pattern again and the loop would never terminate.
                while '  ─' in row[1]:
                    row[1] = row[1].replace('  ─', ' ──')
                # Two spaces in the search patterns so that each replacement keeps the column width
                # (four characters in, four characters out), like the regex below does.
                row[1] = row[1].replace('─  │', '───┤')
                row[1] = row[1].replace('─  ├', '───┼')
                # Fill the gap between a label and the next vertical bar, keeping the width unchanged.
                row[1] = re.sub(
                    r'►(\w+)(\s+)([│├])',
                    lambda m: f'►{m.group(1)}{"─" * len(m.group(2))}{"┤" if m.group(3) == "│" else "┼"}',
                    row[1])
                row[1] = re.sub(r'►(─+)►', r'─\1►', row[1])
            for r, s in zip(extras, text):
                r.extend(s)
        # warnings.warn('Unable to visualize non-projective trees.')
        if dep in self and conll.projective:
            text = conll.to_tree(extras)
            if not show_header:
                # Drop the first two lines of the rendered tree.
                text = text.split('\n')
                text = '\n'.join(text[2:])
            results.append(text)
        elif any(extras):
            results.append(make_table(extras, insert_header=True))
        else:
            # Nothing to draw: fall back to ``field/field/...`` for each token.
            results.append(' '.join(['/'.join(str(f) for f in x.nonempty_fields) for x in conll]))
    if flat:
        return results[0]
    return results