Beispiel #1
0
    def to_conll(self,
                 tok='tok',
                 lem='lem',
                 pos='pos',
                 dep='dep',
                 sdp='sdp') -> Union[CoNLLSentence, List[CoNLLSentence]]:
        """
        Convert to :class:`~hanlp_common.conll.CoNLLSentence`.

        Every field name is first resolved against the keys of this document with ``prefix_match``.

        Args:
            tok (str): Field name for tok.
            lem (str): Field name for lem.
            pos (str): Field name for upos.
            dep (str): Field name for dependency parsing.
            sdp (str): Field name for semantic dependency parsing.

        Returns:
            A single :class:`~hanlp_common.conll.CoNLLSentence` when the token field holds one flat
            sentence (its first element is a ``str``), otherwise a ``CoNLLSentenceList`` with one
            sentence per sample. An empty ``CoNLLSentenceList`` is returned when no token field is
            matched or the matched field is empty.

        """
        tok = prefix_match(tok, self)
        lem = prefix_match(lem, self)
        pos = prefix_match(pos, self)
        dep = prefix_match(dep, self)
        sdp = prefix_match(sdp, self)
        results = CoNLLSentenceList()
        # ``prefix_match`` may resolve to no key at all; treat that like an empty token field
        # instead of indexing the document with a missing key.
        if not tok or not self[tok]:
            return results
        flat = isinstance(self[tok][0], str)
        if flat:
            # Wrap every field in a one-element list so a single sentence looks like a batch of one.
            d = Document((k, [v]) for k, v in self.items())
        else:
            d = self

        def get(_sample, _k, _i):
            """Return the ``_i``-th annotation of field ``_k`` in ``_sample``, or None if absent/empty."""
            _v = _sample.get(_k, None)
            if not _v:
                return None
            return _v[_i]

        # Transpose the column-wise document into one dict per sample (sentence).
        for sample in [dict(zip(d, t)) for t in zip(*d.values())]:
            sent = CoNLLSentence()

            for i, _tok in enumerate(sample[tok]):
                # A missing dependency annotation becomes an empty (head, deprel) pair.
                _dep = get(sample, dep, i) or (None, None)
                _sdp = get(sample, sdp, i)
                sent.append(
                    CoNLLUWord(i + 1,
                               form=_tok,
                               lemma=get(sample, lem, i),
                               upos=get(sample, pos, i),
                               head=_dep[0],
                               deprel=_dep[1],
                               deps=None if not _sdp else '|'.join(
                                   f'{x[0]}:{x[1]}' for x in _sdp)))
            results.append(sent)
        if flat:
            return results[0]
        return results
Beispiel #2
0
 def finalize_document(self, doc: Document, task_name: str):
     """Replace ``'_'`` labels on height-2 subtrees with the part-of-speech tags stored in ``doc``.

     Args:
         doc: The document holding the trees under ``task_name`` and, optionally, a field
             whose key is matched by the prefix ``'pos'``.
         task_name: Key of the field in ``doc`` that holds one tree per sentence.

     Nothing is changed when no non-empty part-of-speech field is found. Labels are updated
     in place; the method returns ``None``.
     """
     pos_field = prefix_match('pos', doc)
     tags_per_sent = doc.get(pos_field, None)
     if not tags_per_sent:
         return
     for parse, sent_tags in zip(doc[task_name], tags_per_sent):
         # The running index over height-2 subtrees selects the tag at the same position.
         height_two_nodes = parse.subtrees(lambda node: node.height() == 2)
         for position, node in enumerate(height_two_nodes):
             if node.label() == '_':
                 node.set_label(sent_tags[position])
Beispiel #3
0
    def get_by_prefix(self, prefix: str):
        """
        Look up a value through a prefix of its key.

        Args:
            prefix: The leading part of a key. It is resolved to a full key with ``prefix_match``;
                if multiple keys are matched, only the first one will be used.

        Returns:
            The value stored under the resolved key, or ``None`` when no key is resolved.
        """
        matched_key = prefix_match(prefix, self)
        return self[matched_key] if matched_key else None
Beispiel #4
0
 def _resolve_task_name(self, dependencies):
     """Resolve a dependency specification into a set of task names.

     Args:
         dependencies: Either a single string or an iterable of specifications. A string is
             resolved, in order, as an exact key of ``self.tasks``, as a wildcard ending in
             ``'*'`` matched against the keys of ``self.tasks``, or as a prefix looked up in
             ``self.config.task_names`` with ``prefix_match``.

     Returns:
         The set of resolved names. It is empty for input that is neither a string nor an iterable.

     Raises:
         AssertionError: If a string matches nothing by prefix.
     """
     if isinstance(dependencies, str):
         if dependencies in self.tasks:
             return {dependencies}
         if dependencies.endswith('*'):
             stem = dependencies[:-1]
             return {name for name in self.tasks if name.startswith(stem)}
         prefix_matched = prefix_match(dependencies, self.config.task_names)
         assert prefix_matched, f'No prefix matching for {dependencies}. ' \
                                f'Check your dependencies definition: {list(self.tasks.values())}'
         return {prefix_matched}
     if isinstance(dependencies, Iterable):
         # Resolve each member recursively and merge the results.
         merged = set()
         for member in dependencies:
             merged.update(self._resolve_task_name(member))
         return merged
     return set()
Beispiel #5
0
    def to_pretty(self,
                  tok='tok',
                  lem='lem',
                  pos='pos',
                  dep='dep',
                  sdp='sdp',
                  ner='ner',
                  srl='srl',
                  con='con',
                  show_header=True,
                  html=False) -> Union[str, List[str]]:
        """
        Convert to a pretty text representation which can be printed to visualize linguistic structures.

        Args:
            tok: Token key.
            lem: Lemma key.
            pos: Part-of-speech key.
            dep: Dependency parse tree key.
            sdp: Semantic dependency tree/graph key. SDP visualization has not been implemented yet.
            ner: Named entity key.
            srl: Semantic role labeling key.
            con: Constituency parsing key.
            show_header: ``True`` to include a header which indicates each field with its name.
            html: ``True`` to output HTML format so that non-ASCII characters can align correctly.

        Returns:
            A pretty string when the document holds a single flat sentence, otherwise a list with one
            pretty string per sentence.

        """
        results = []
        tok = prefix_match(tok, self)
        pos = prefix_match(pos, self)
        ner = prefix_match(ner, self)
        conlls = self.to_conll(tok, lem, pos, dep, sdp)
        # ``to_conll`` returns a bare sentence for a flat document; normalise to a list here
        # and unwrap again before returning.
        flat = isinstance(conlls, CoNLLSentence)
        if flat:
            conlls: List[CoNLLSentence] = [conlls]

        def condense(block_, extras_=None):
            """Render ``block_`` as a table and merge all columns after the first into one cell per row.

            When ``extras_`` is given, the resulting two cells of each row are appended to the
            corresponding row of ``extras_`` in place.
            """
            text_ = make_table(block_, insert_header=False)
            text_ = [x.split('\t', 1) for x in text_.split('\n')]
            text_ = [[x[0], x[1].replace('\t', '')] for x in text_]
            if extras_:
                for r, s in zip(extras_, text_):
                    r.extend(s)
            return text_

        for i, conll in enumerate(conlls):
            conll: CoNLLSentence = conll
            tokens = [x.form for x in conll]
            length = len(conll)
            # One row per token plus a header row; each visualised layer appends columns to it.
            extras = [[] for j in range(length + 1)]
            if ner in self:
                ner_samples = self[ner]
                if flat:
                    ner_samples = [ner_samples]
                ner_per_sample = ner_samples[i]
                # For nested NER, use the longest span
                start_offsets = [None for i in range(length)]
                for ent, label, b, e in ner_per_sample:
                    if not start_offsets[b] or e > start_offsets[b][-1]:
                        start_offsets[b] = (ent, label, b, e)
                ner_per_sample = [y for y in start_offsets if y]
                header = ['Token', 'NER', 'Type']
                block = [[] for _ in range(length + 1)]
                _ner = []
                _type = []
                offset = 0
                for ent, label, b, e in ner_per_sample:
                    render_labeled_span(b, e, _ner, _type, label, offset)
                    offset = e
                # Pad the columns for the tokens after the last entity.
                if offset != length:
                    _ner.extend([''] * (length - offset))
                    _type.extend([''] * (length - offset))
                if any(_type):
                    block[0].extend(header)
                    for j, (_s, _t) in enumerate(zip(_ner, _type)):
                        block[j + 1].extend((tokens[j], _s, _t))
                    # ``condense`` appends the rendered columns to ``extras`` in place.
                    condense(block, extras)

            if srl in self:
                srl_samples = self[srl]
                if flat:
                    srl_samples = [srl_samples]
                srl_per_sample = srl_samples[i]
                for k, pas in enumerate(srl_per_sample):
                    if not pas:
                        continue
                    block = [[] for _ in range(length + 1)]
                    header = ['Token', 'SRL', f'PA{k + 1}']
                    _srl = []
                    _type = []
                    offset = 0
                    p_index = None
                    for _, label, b, e in pas:
                        render_labeled_span(b, e, _srl, _type, label, offset)
                        offset = e
                        if label == PRED:
                            p_index = b
                    if len(_srl) != length:
                        _srl.extend([''] * (length - offset))
                        _type.extend([''] * (length - offset))
                    if p_index is not None:
                        _srl[p_index] = '╟──►'
                        if len(block) != len(_srl) + 1:
                            # The rendered column does not line up with the tokens (the original
                            # note attributes this to overlapped spans); skip this structure.
                            continue
                        block[0].extend(header)
                        while len(_srl) < length:
                            _srl.append('')
                        while len(_type) < length:
                            _type.append('')
                        for j, (_s, _t) in enumerate(zip(_srl, _type)):
                            block[j + 1].extend((tokens[j], _s, _t))
                        # Only condense a populated block: without a predicate every row is empty
                        # and ``condense`` presumably has no tab-separated column to split.
                        condense(block, extras)
            if con in self:
                con_samples: Tree = self[con]
                if flat:
                    con_samples: List[Tree] = [con_samples]
                tree = con_samples[i]
                block = [[] for _ in range(length + 1)]
                block[0].extend(('Token', 'PoS'))
                for j, t in enumerate(tree.pos()):
                    block[j + 1].extend(t)

                # Add one pair of (arrow, label) columns per tree level, from the bottom up.
                for height in range(
                        2,
                        tree.height() + (0 if len(tree) == 1 else 1)):
                    offset = 0
                    spans = []
                    labels = []
                    for k, subtree in enumerate(
                            tree.subtrees(lambda x: x.height() == height)):
                        subtree: Tree = subtree
                        b, e = offset, offset + len(subtree.leaves())
                        if height >= 3:
                            # Span from the centre of the first child to the centre of the last child.
                            b, e = subtree[0].center, subtree[-1].center + 1
                        # Remember where this subtree is anchored so its parent can attach to it.
                        subtree.center = b + (e - b) // 2
                        render_labeled_span(b,
                                            e,
                                            spans,
                                            labels,
                                            subtree.label(),
                                            offset,
                                            unidirectional=True)
                        offset = e
                    if len(spans) != length:
                        spans.extend([''] * (length - len(spans)))
                    if len(labels) != length:
                        labels.extend([''] * (length - len(labels)))
                    # Height 2 only records the ``center`` anchors; it adds no columns of its own.
                    if height < 3:
                        continue
                    block[0].extend(['', f'{height}'])
                    for j, (_s, _t) in enumerate(zip(spans, labels)):
                        block[j + 1].extend((_s, _t))
                    # check short arrows and increase their length
                    for j, arrow in enumerate(spans):
                        if not arrow:
                            # -1 current tag ; -2 arrow to current tag ; -3 = prev tag ; -4 = arrow to prev tag
                            if block[j + 1][-3] or block[j + 1][-4] == '───►':
                                if height > 3:
                                    if block[j + 1][-3]:
                                        block[j + 1][-1] = block[j + 1][-3]
                                        block[j + 1][-2] = '───►'
                                    else:
                                        block[j + 1][-1] = '────'
                                        block[j + 1][-2] = '────'
                                    block[j + 1][-3] = '────'
                                    if block[j + 1][-4] == '───►':
                                        block[j + 1][-4] = '────'
                                else:
                                    block[j + 1][-1] = '────'
                                if block[j + 1][-1] == '────':
                                    block[j + 1][-2] = '────'
                                if not block[j + 1][-4]:
                                    block[j + 1][-4] = '────'
                # If the root label is shorter than the level number, extend it to the same length
                level_len = len(block[0][-1])
                for row in block[1:]:
                    if row[-1] and len(row[-1]) < level_len:
                        row[-1] = row[-1] + ' ' * (level_len - len(row[-1]))

                text = condense(block)
                # Cosmetic issues
                for row in text[1:]:
                    while '  ─' in row[1]:
                        row[1] = row[1].replace('  ─', ' ──')
                    row[1] = row[1].replace('─ ─', '───')
                    row[1] = re.sub(
                        r'([►─])([\w-]*)(\s+)([│├])', lambda m:
                        f'{m.group(1)}{m.group(2)}{"─" * len(m.group(3))}{"┤" if m.group(4) == "│" else "┼"}',
                        row[1])
                    row[1] = re.sub(r'►(─+)►', r'─\1►', row[1])
                for r, s in zip(extras, text):
                    r.extend(s)
            # Non-projective trees are not visualised as a tree; they fall through to the branches below.
            if dep in self and conll.projective:
                text = conll.to_tree(extras)
                if not show_header:
                    # Drop the first two lines of the rendered tree.
                    text = text.split('\n')
                    text = '\n'.join(text[2:])
                results.append(text)
            elif any(extras):
                results.append(make_table(extras, insert_header=True))
            else:
                results.append(' '.join([
                    '/'.join(str(f) for f in x.nonempty_fields) for x in conll
                ]))
        if html:
            # Imported locally and aliased because the ``html`` parameter shadows the module name.
            from html import escape as escape_html

            def to_html(pretty_text: str) -> str:
                """Lay out tab-separated ``pretty_text`` as side-by-side ``<pre>`` columns."""
                lines = [x for x in pretty_text.split('\n') if x]
                cells = [line.split('\t') for line in lines]

                # The first row decides the number of columns; empty text yields an empty container.
                num_cols = len(cells[0]) if cells else 0
                # Transpose rows into columns, padding rows that are shorter than the first one.
                cols = [[row[i] if i < len(row) else '' for row in cells]
                        for i in range(num_cols)]

                html = '<div style="display: table; padding-bottom: 1rem;">'
                for i, each in enumerate(cols):
                    html += '<pre style="display: table-cell; font-family: SFMono-Regular,Menlo,Monaco,Consolas,' \
                            'Liberation Mono,Courier New,monospace; white-space: nowrap; line-height: 128%; padding: 0;">'
                    if i != len(cols) - 1:
                        each = [x + ' ' for x in each]
                    # Escape markup characters in tokens/labels before making spaces non-breaking.
                    html += '<br>'.join(
                        [escape_html(x, quote=False).replace(' ', '&nbsp;') for x in each])
                    html += '</pre>'
                html += '</div>'
                return html

            results = [to_html(x) for x in results]
        if flat:
            return results[0]
        return results
Beispiel #6
0
    def to_pretty(self, tok='tok', lem='lem', pos='pos', dep='dep', sdp='sdp', ner='ner', srl='srl', con='con',
                  show_header=True) -> str:
        """
        Convert to a pretty text representation which can be printed to visualize linguistic structures.

        Args:
            tok: Token key.
            lem: Lemma key.
            pos: Part-of-speech key.
            dep: Dependency parse tree key.
            sdp: Semantic dependency tree/graph key. SDP visualization has not been implemented yet.
            ner: Named entity key.
            srl: Semantic role labeling key.
            con: Constituency parsing key.
            show_header: ``True`` to print a header which indicates each field with its name.

        Returns:
            A pretty string when the document holds a single flat sentence. Note that for a
            multi-sentence document a list with one pretty string per sentence is returned instead.

        """
        results = []
        tok = prefix_match(tok, self)
        pos = prefix_match(pos, self)
        ner = prefix_match(ner, self)
        conlls = self.to_conll(tok, lem, pos, dep, sdp)
        # ``to_conll`` returns a bare sentence for a flat document; normalise to a list here
        # and unwrap again before returning.
        flat = isinstance(conlls, CoNLLSentence)
        if flat:
            conlls: List[CoNLLSentence] = [conlls]

        def condense(block_, extras_=None):
            """Render ``block_`` as a table and merge all columns after the first into one cell per row.

            When ``extras_`` is given, the resulting two cells of each row are appended to the
            corresponding row of ``extras_`` in place.
            """
            text_ = make_table(block_, insert_header=False)
            text_ = [x.split('\t', 1) for x in text_.split('\n')]
            text_ = [[x[0], x[1].replace('\t', '')] for x in text_]
            if extras_:
                for r, s in zip(extras_, text_):
                    r.extend(s)
            return text_

        for i, conll in enumerate(conlls):
            conll: CoNLLSentence = conll
            tokens = [x.form for x in conll]
            length = len(conll)
            # One row per token plus a header row; each visualised layer appends columns to it.
            extras = [[] for j in range(length + 1)]
            if ner in self:
                ner_samples = self[ner]
                if flat:
                    ner_samples = [ner_samples]
                ner_per_sample = ner_samples[i]
                # For nested NER, use the longest span
                start_offsets = [None for i in range(length)]
                for ent, label, b, e in ner_per_sample:
                    if not start_offsets[b] or e > start_offsets[b][-1]:
                        start_offsets[b] = (ent, label, b, e)
                ner_per_sample = [y for y in start_offsets if y]
                header = ['Tok', 'NER', 'Type']
                block = [[] for _ in range(length + 1)]
                _ner = []
                _type = []
                offset = 0
                for ent, label, b, e in ner_per_sample:
                    render_labeled_span(b, e, _ner, _type, label, offset)
                    offset = e
                # Pad the columns for the tokens after the last entity.
                if offset != length:
                    _ner.extend([''] * (length - offset))
                    _type.extend([''] * (length - offset))
                if any(_type):
                    block[0].extend(header)
                    for j, (_s, _t) in enumerate(zip(_ner, _type)):
                        block[j + 1].extend((tokens[j], _s, _t))
                    # ``condense`` appends the rendered columns to ``extras`` in place.
                    condense(block, extras)

            if srl in self:
                srl_samples = self[srl]
                if flat:
                    srl_samples = [srl_samples]
                srl_per_sample = srl_samples[i]
                for k, pas in enumerate(srl_per_sample):
                    if not pas:
                        continue
                    block = [[] for _ in range(length + 1)]
                    header = ['Tok', 'SRL', f'PA{k + 1}']
                    _srl = []
                    _type = []
                    offset = 0
                    p_index = None
                    for _, label, b, e in pas:
                        render_labeled_span(b, e, _srl, _type, label, offset)
                        offset = e
                        if label == PRED:
                            p_index = b
                    if len(_srl) != length:
                        _srl.extend([''] * (length - offset))
                        _type.extend([''] * (length - offset))
                    if p_index is not None:
                        _srl[p_index] = '╟──►'
                        if len(block) != len(_srl) + 1:
                            # The rendered column does not line up with the tokens; warn and skip.
                            warnings.warn(f'Unable to visualize overlapped spans: {pas}')
                            continue
                        block[0].extend(header)
                        for j, (_s, _t) in enumerate(zip(_srl, _type)):
                            block[j + 1].extend((tokens[j], _s, _t))
                        # Only condense a populated block: without a predicate every row is empty
                        # and ``condense`` presumably has no tab-separated column to split.
                        condense(block, extras)
            if con in self:
                con_samples: Tree = self[con]
                if flat:
                    con_samples: List[Tree] = [con_samples]
                tree = con_samples[i]
                block = [[] for _ in range(length + 1)]
                block[0].extend(('Tok', 'PoS'))
                for j, t in enumerate(tree.pos()):
                    block[j + 1].extend(t)

                # Add one pair of (arrow, label) columns per tree level, from the bottom up.
                for height in range(2, tree.height()):
                    offset = 0
                    spans = []
                    labels = []
                    for k, subtree in enumerate(tree.subtrees(lambda x: x.height() == height)):
                        subtree: Tree = subtree
                        b, e = offset, offset + len(subtree.leaves())
                        if height >= 3:
                            # Span from the centre of the first child to the centre of the last child.
                            b, e = subtree[0].center, subtree[-1].center + 1
                        # Remember where this subtree is anchored so its parent can attach to it.
                        subtree.center = b + (e - b) // 2
                        render_labeled_span(b, e, spans, labels, subtree.label(), offset, unidirectional=True)
                        offset = e
                    if len(spans) != length:
                        spans.extend([''] * (length - len(spans)))
                    if len(labels) != length:
                        labels.extend([''] * (length - len(labels)))
                    # Height 2 only records the ``center`` anchors; it adds no columns of its own.
                    if height < 3:
                        continue
                    block[0].extend(['', f'{height}'])
                    for j, (_s, _t) in enumerate(zip(spans, labels)):
                        block[j + 1].extend((_s, _t))
                    # check short arrows and increase their length
                    for j, arrow in enumerate(spans):
                        if not arrow:
                            # -1 current tag ; -2 arrow to current tag ; -3 = prev tag ; -4 = arrow to prev tag
                            if block[j + 1][-3] or block[j + 1][-4] == '───►':
                                if height > 3:
                                    if block[j + 1][-3]:
                                        block[j + 1][-1] = block[j + 1][-3]
                                        block[j + 1][-2] = '───►'
                                    else:
                                        block[j + 1][-1] = '────'
                                        block[j + 1][-2] = '────'
                                    block[j + 1][-3] = '────'
                                    if block[j + 1][-4] == '───►':
                                        block[j + 1][-4] = '────'
                                else:
                                    block[j + 1][-1] = '────'
                                if block[j + 1][-1] == '────':
                                    block[j + 1][-2] = '────'
                                if not block[j + 1][-4]:
                                    block[j + 1][-4] = '────'

                text = condense(block)
                # Cosmetic issues
                for row in text:
                    while '  ─' in row[1]:
                        row[1] = row[1].replace('  ─', ' ──')
                    row[1] = row[1].replace('─  │', '───┤')
                    row[1] = row[1].replace('─  ├', '───┼')
                    row[1] = re.sub(r'►(\w+)(\s+)([│├])', lambda
                        m: f'►{m.group(1)}{"─" * len(m.group(2))}{"┤" if m.group(3) == "│" else "┼"}', row[1])
                    row[1] = re.sub(r'►(─+)►', r'─\1►', row[1])
                for r, s in zip(extras, text):
                    r.extend(s)
            # Non-projective trees are not visualised as a tree; they fall through to the branches below.
            if dep in self and conll.projective:
                text = conll.to_tree(extras)
                if not show_header:
                    # Drop the first two lines of the rendered tree.
                    text = text.split('\n')
                    text = '\n'.join(text[2:])
                results.append(text)
            elif any(extras):
                results.append(make_table(extras, insert_header=True))
            else:
                results.append(' '.join(['/'.join(str(f) for f in x.nonempty_fields) for x in conll]))
        if flat:
            return results[0]
        return results