コード例 #1
0
ファイル: libtreebank.py プロジェクト: gguibon/ginger
def _conllu_tree(tree_lines_lst: ty.Iterable[str]) -> libginger.Tree:
    """Return a tree (that is a list of `Nodes`) from its
       [CoNLL-U string representation](http://universaldependencies.org/format.html).

       Comment lines of the form `# key = value` are collected as metadata; only
       the `sent_id` and `text` keys are forwarded to the `Tree` constructor.
       Multi-word token lines (`a-b` identifiers) become `MultiTokenNode`s and
       empty nodes (`a.b` identifiers) are skipped.

       Raises `ParsingError` if a token line does not have exactly 10
       tab-separated columns or if a HEAD, DEPS or multi-word span refers to a
       node that is not in the tree, and `FieldParsingError` if a column cannot
       be parsed."""
    root = libginger.Node(identifier=0, form='ROOT')
    res = [root]
    # Referenceable nodes (the root and the plain word nodes) keyed by
    # identifier, so that HEAD, DEPS and multi-word spans are all resolved the
    # same way and in constant time
    nodes_by_id = {0: root}
    # We will fill these out if they are given
    metadata = {}  # type: ty.Dict[str, str]

    # First get the self-contained values, deal with references later
    for i, line in enumerate(l.strip() for l in tree_lines_lst):
        if line.startswith('#'):
            # Extract metadata
            metadata_match = re.match(r'#\s*(?P<key>.+?)\s*=\s*(?P<value>.*)',
                                      line)
            if metadata_match:
                metadata[metadata_match.group('key')] = metadata_match.group(
                    'value')
            continue

        columns = line.split('\t')
        if len(columns) != 10:
            raise ParsingError(
                'At line {i} : 10 columns expected, got {n} ({line!r})'.format(
                    i=i, n=len(columns), line=line))
        (identifier, form, lemma, upostag, xpostag, feats, head, deprel,
         deps, misc) = columns

        # Deal with multi-word tokens. The whole identifier must look like
        # `a-b`, otherwise it falls through to the regular ID validation below
        if re.fullmatch(r'\d+-\d+', identifier):
            a, b = identifier.split('-')
            form, misc = (e if e != '_' else None for e in (form, misc))
            new_node = libginger.MultiTokenNode(
                (PlaceholderNode(k) for k in range(int(a), int(b) + 1)),
                form, misc)
        else:
            try:
                identifier = _parse_conll_identifier(identifier,
                                                     i,
                                                     'ID',
                                                     non_zero=True)
            except ValueError as err:
                # TODO: Issue a warning here
                # Skip empty nodes (`a.b`); the dot has to be escaped,
                # otherwise any malformed identifier such as `1a2` would be
                # silently dropped too
                if re.fullmatch(r'\d+\.\d+', identifier):
                    continue
                raise FieldParsingError(i, 'ID', 'CoNLL-U',
                                        identifier) from err

            try:
                feats = conll_map_to_dict(feats)
            except ParsingError as err:
                raise FieldParsingError(i, 'FEATS', 'CoNLL-U', feats) from err

            try:
                head = PlaceholderNode(_parse_conll_identifier(
                    head, i, 'HEAD'))
            except ValueError as err:
                raise FieldParsingError(i, 'HEAD', 'CoNLL-U', head) from err

            try:
                if deps == '_':
                    deps = []
                else:
                    # Split on the first colon only: the relation itself may
                    # contain colons (e.g. `4:nmod:poss`)
                    deps = [
                        (PlaceholderNode(_parse_conll_identifier(n, i,
                                                                 'HEAD')), d)
                        for n, d in (e.split(':', 1) for e in deps.split('|'))
                    ]
            except ValueError as err:
                raise FieldParsingError(i, 'DEPS', 'CoNLL-U', deps) from err

            (form, lemma, upostag, xpostag, deprel,
             misc) = (e if e != '_' else None
                      for e in (form, lemma, upostag, xpostag, deprel, misc))

            new_node = libginger.Node(identifier, form, lemma, upostag,
                                      xpostag, feats, head, deprel, deps, misc)
            nodes_by_id[identifier] = new_node
        res.append(new_node)

    def _resolve(placeholder):
        """Return the parsed node that `placeholder` stands for."""
        try:
            return nodes_by_id[placeholder.identifier]
        except KeyError:
            raise ParsingError(
                'Reference to unknown node {identifier!r}'.format(
                    identifier=placeholder.identifier)) from None

    # Now deal with references
    for node in res[1:]:
        if isinstance(node, libginger.MultiTokenNode):
            node.span = [_resolve(placeholder) for placeholder in node.span]
        else:
            node.head = _resolve(node.head)
            node.deps = [(_resolve(dep_head), dep)
                         for dep_head, dep in node.deps]

    return libginger.Tree(
        res,
        **{
            key: value
            for key, value in metadata.items() if key in ('sent_id', 'text')
        },
    )
コード例 #2
0
ファイル: libtreebank.py プロジェクト: gguibon/ginger
def _conllx_tree(tree_lst: ty.Iterable[str]) -> libginger.Tree:
    """Create a Universal Dependencies tree from a CoNLL-X tree.

       Lines starting with `#` are skipped. A FORM that contains several
       tokens (runs of word characters or single non-space characters) is split
       into several nodes: the first one gets the attributes of the line, the
       others are attached to it with the `fixed` relation. Nodes are
       renumbered accordingly. PHEAD/PDEPREL, when present, end up in `deps`.

       Raises `ParsingError` if a line does not have exactly 10 tab-separated
       columns or if a HEAD/PHEAD refers to an identifier absent from the tree,
       and `FieldParsingError` if a column cannot be parsed."""
    root = libginger.Node(identifier=0, form='ROOT')
    res = [root]
    # Maps the identifiers found in the file to the position of the
    # corresponding (first) node in `res`
    conllx_to_conllu_identifiers = {0: 0}

    for i, line in enumerate(l.strip() for l in tree_lst):
        # Skip comment lines
        if line.startswith('#'):
            continue

        columns = line.split('\t')
        if len(columns) != 10:
            raise ParsingError(
                'At line {i} : 10 columns expected, got {n} ({line!r})'.format(
                    i=i, n=len(columns), line=line))
        (identifier, form, lemma, upostag, xpostag, feats, head, deprel,
         phead, pdeprel) = columns

        try:
            identifier = _parse_conll_identifier(identifier,
                                                 i,
                                                 'ID',
                                                 non_zero=True)
        except ValueError as err:
            raise FieldParsingError(i, 'ID', 'CoNLL-X', identifier) from err

        lemma = re.sub(r'\s', '_', lemma)

        try:
            feats = conll_map_to_dict(feats)
        except ParsingError as err:
            # Be nice : if empty, it should be an underscore, but let's be nice with spaces and
            # empty strings, too
            if feats.isspace() or not feats:
                # TODO: Issue a warning here
                feats = dict()
            else:
                raise FieldParsingError(i, 'FEATS', 'CoNLL-X', feats) from err

        try:
            head = _parse_conll_identifier(head, i, 'HEAD')
        except ValueError as err:
            raise FieldParsingError(i, 'HEAD', 'CoNLL-X', head) from err

        try:
            phead = _parse_conll_identifier(phead, i, 'PHEAD')
        except ValueError as err:
            if phead == '_':
                phead, pdeprel = None, None
            else:
                raise FieldParsingError(i, 'PHEAD', 'CoNLL-X', phead) from err

        # Deal with multi-token words
        tokens = re.findall(r'\w+|\S', form)
        if not tokens:
            # An empty or blank FORM yields no token at all; report it instead
            # of failing with an `IndexError` below
            raise FieldParsingError(i, 'FORM', 'CoNLL-X', form)
        # Deal with the first token
        real_identifier = len(res)
        conllx_to_conllu_identifiers[identifier] = real_identifier

        (lemma, upostag, xpostag, deprel,
         pdeprel) = (e if e != '_' else None
                     for e in (lemma, upostag, xpostag, deprel, pdeprel))

        res.append(
            libginger.Node(
                identifier=real_identifier,
                form=tokens[0],
                lemma=lemma,
                upostag=upostag,
                xpostag=xpostag,
                feats=feats,
                head=head,
                deprel=deprel,
                deps=[] if phead is None else [(phead, pdeprel)],
            ), )

        # Now deal with the other tokens, their head will simply be the first token,
        # with the relation 'fixed'. The head is given as the original
        # identifier since it goes through the same mapping as the others below
        for t in tokens[1:]:
            res.append(
                libginger.Node(identifier=len(res),
                               form=t,
                               head=identifier,
                               deprel='fixed'))

    # Now that we have a `Node` for every node, let's do the linking
    for n in res[1:]:
        try:
            n.head = res[conllx_to_conllu_identifiers[n.head]]
            n.deps = [(res[conllx_to_conllu_identifiers[dep_head]], dep)
                      for dep_head, dep in n.deps]
        except KeyError as err:
            raise ParsingError(
                'Reference to unknown node {key}'.format(key=err)) from err

    return libginger.Tree(res)
コード例 #3
0
ファイル: libtreebank.py プロジェクト: gguibon/ginger
def _conll2009_sys_tree(tree_lst: ty.Iterable[str]) -> libginger.Tree:
    """Create a Universal Dependencies tree from a CoNLL-2009 tree.
       This takes only the predicted columns into account

       The gold POS, HEAD and DEPREL as well as FILLPRED, PRED and the APREDs
       are stored in the `misc` attribute, the APREDs as a single
       comma-separated value where `_` marks an empty column.

       Lines starting with `#` are skipped. A FORM that contains several
       tokens is split into several nodes, the extra ones being attached to
       the first with the `fixed` relation.

       Raises `ParsingError` if a line has fewer than 14 tab-separated columns
       or if a PHEAD refers to an identifier absent from the tree, and
       `FieldParsingError` if a column cannot be parsed."""
    root = libginger.Node(identifier=0, form='ROOT')
    res = [root]
    # Maps the identifiers found in the file to the position of the
    # corresponding (first) node in `res`
    conllx_to_conllu_identifiers = {0: 0}

    for i, line in enumerate(l.strip() for l in tree_lst):
        # Skip comment lines
        if line.startswith('#'):
            continue

        columns = line.split('\t')
        if len(columns) < 14:
            raise ParsingError(
                'At line {i} : at least 14 columns expected, got {n} ({line!r})'
                .format(i=i, n=len(columns), line=line))
        (identifier, form, lemma, plemma, pos, ppos, feat, pfeat, head,
         phead, deprel, pdeprel, fillpred, pred, *apreds) = columns

        try:
            identifier = _parse_conll_identifier(identifier,
                                                 i,
                                                 'ID',
                                                 non_zero=True)
        except ValueError as err:
            raise FieldParsingError(i, 'ID', 'CoNLL-2009',
                                    identifier) from err

        plemma = re.sub(r'\s', '_', plemma)

        try:
            pfeat = conll_map_to_dict(pfeat)
        except (ParsingError, ValueError) as err:
            # `ParsingError` is what the CoNLL-U and CoNLL-X readers expect
            # from `conll_map_to_dict`; `ValueError` is kept for compatibility.
            # Be nice : if empty, it should be an underscore, but let's be nice with spaces and
            # empty strings, too
            if pfeat.isspace() or not pfeat:
                # TODO: Issue a warning here
                pfeat = dict()
            else:
                raise FieldParsingError(i, 'PFEAT', 'CoNLL-2009',
                                        pfeat) from err

        try:
            phead = _parse_conll_identifier(phead, i, 'PHEAD')
        except ValueError as err:
            raise FieldParsingError(i, 'PHEAD', 'CoNLL-2009', phead) from err

        try:
            head = _parse_conll_identifier(head, i, 'HEAD')
        except ValueError as err:
            if head == '_':
                head, deprel = None, None
            else:
                raise FieldParsingError(i, 'HEAD', 'CoNLL-2009',
                                        head) from err

        # Deal with multi-token words
        tokens = re.findall(r'\w+|\S', form)
        if not tokens:
            # An empty or blank FORM yields no token at all; report it instead
            # of failing with an `IndexError` below
            raise FieldParsingError(i, 'FORM', 'CoNLL-2009', form)
        # Deal with the first token
        real_identifier = len(res)
        conllx_to_conllu_identifiers[identifier] = real_identifier

        (lemma, plemma, pos, ppos, deprel, pdeprel,
         fillpred) = (e if e != '_' else None
                      for e in (lemma, plemma, pos, ppos, deprel, pdeprel,
                                fillpred))

        # Keep the raw APRED columns (including the `_` ones, so that
        # positions are preserved) and only store them if at least one is set.
        # Joining after mapping `_` to `None` would raise a `TypeError`.
        if any(a != '_' for a in apreds):
            joined_apreds = ','.join(apreds)
        else:
            joined_apreds = None

        res.append(
            libginger.Node(
                identifier=real_identifier,
                form=tokens[0],
                lemma=plemma,
                upostag=ppos,
                feats=pfeat,
                head=phead,
                deprel=pdeprel,
                deps=[],
                misc=dict_to_conll_map(
                    {
                        k: v
                        for k, v in (
                            ('pos', pos),
                            ('head', head),
                            ('deprel', deprel),
                            ('fillpred', fillpred),
                            ('pred', pred),
                            ('apreds', joined_apreds),
                        )
                        # Explicit tests rather than truthiness: a gold head
                        # of 0 (attachment to the root) must be kept
                        if v is not None and v != '' and v != '_'
                    }, ),
            ), )

        # Now deal with the other tokens, their head will simply be the first token,
        # with the relation 'fixed'. The head is given as the original
        # identifier since it goes through the same mapping as the others below
        for t in tokens[1:]:
            res.append(
                libginger.Node(identifier=len(res),
                               form=t,
                               head=identifier,
                               deprel='fixed'))

    # Now that we have a `Node` for every node, let's do the linking
    for n in res[1:]:
        try:
            n.head = res[conllx_to_conllu_identifiers[n.head]]
            n.deps = [(res[conllx_to_conllu_identifiers[dep_head]], dep)
                      for dep_head, dep in n.deps]
        except KeyError as err:
            raise ParsingError(
                'Reference to unknown node {key}'.format(key=err)) from err

    return libginger.Tree(res)
コード例 #4
0
ファイル: libtreebank.py プロジェクト: LoicGrobol/ginger
def _conll2009_gold_tree(tree_lst: ty.Iterable[str]) -> libginger.Tree:
    """Create a Universal Dependencies tree from a CoNLL-2009 tree.
       This takes only the gold columns into account

       The predicted PPOS, PHEAD and PDEPREL as well as FILLPRED, PRED and the
       APREDs are stored in the `misc` attribute, the APREDs as a single
       comma-separated value where `_` marks an empty column.

       Lines starting with `#` are skipped. A FORM that contains several
       tokens is split into several nodes, the extra ones being attached to
       the first with the `fixed` relation.

       Raises `ParsingError` if a line has fewer than 14 tab-separated columns
       or if a HEAD refers to an identifier absent from the tree, and
       `FieldParsingError` if a column cannot be parsed."""
    root = libginger.Node(identifier=0, form="ROOT")
    res = [root]
    # Maps the identifiers found in the file to the position of the
    # corresponding (first) node in `res`
    conllx_to_conllu_identifiers = {0: 0}

    for i, line in enumerate(l.strip() for l in tree_lst):
        # Skip comment lines
        if line.startswith("#"):
            continue

        columns = line.split("\t")
        if len(columns) < 14:
            # TODO: Issue a warning here
            raise ParsingError(
                "At line {i} : at least 14 columns expected, got {n} ({line!r})"
                .format(i=i, n=len(columns), line=line))
        (
            identifier,
            form,
            lemma,
            plemma,
            pos,
            ppos,
            feat,
            pfeat,
            head,
            phead,
            deprel,
            pdeprel,
            fillpred,
            pred,
            *apreds,
        ) = columns

        try:
            identifier = _parse_conll_identifier(identifier,
                                                 i,
                                                 "ID",
                                                 non_zero=True)
        except ValueError as err:
            raise FieldParsingError(i, "ID", "CoNLL-2009",
                                    identifier) from err

        lemma = re.sub(r"\s", "_", lemma)

        try:
            feat = conll_map_to_dict(feat)
        except (ParsingError, ValueError) as err:
            # `ParsingError` is what the CoNLL-U and CoNLL-X readers expect
            # from `conll_map_to_dict`; `ValueError` is kept for compatibility.
            # Be nice : if empty, it should be an underscore, but let's be nice with spaces and
            # empty strings, too
            if feat.isspace() or not feat:
                # TODO: Issue a warning here
                feat = dict()
            else:
                raise FieldParsingError(i, "FEAT", "CoNLL-2009",
                                        feat) from err

        try:
            head = _parse_conll_identifier(head, i, "HEAD")
        except ValueError as err:
            raise FieldParsingError(i, "HEAD", "CoNLL-2009", head) from err

        try:
            phead = _parse_conll_identifier(phead, i, "PHEAD")
        except ValueError as err:
            if phead == "_":
                phead, pdeprel = None, None
            else:
                raise FieldParsingError(i, "PHEAD", "CoNLL-2009",
                                        phead) from err

        # Deal with multi-token words
        tokens = re.findall(r"\w+|\S", form)
        if not tokens:
            # An empty or blank FORM yields no token at all; report it instead
            # of failing with an `IndexError` below
            raise FieldParsingError(i, "FORM", "CoNLL-2009", form)
        # Deal with the first token
        real_identifier = len(res)
        conllx_to_conllu_identifiers[identifier] = real_identifier

        (lemma, plemma, pos, ppos, deprel, pdeprel,
         fillpred) = (e if e != "_" else None
                      for e in (lemma, plemma, pos, ppos, deprel, pdeprel,
                                fillpred))

        # Keep the raw APRED columns (including the `_` ones, so that
        # positions are preserved) and only store them if at least one is set.
        # Joining after mapping `_` to `None` would raise a `TypeError`.
        if any(a != "_" for a in apreds):
            joined_apreds = ",".join(apreds)
        else:
            joined_apreds = None

        # TODO: update this to use `PlaceholderNode`s
        res.append(
            libginger.Node(
                identifier=real_identifier,
                form=tokens[0],
                lemma=lemma,
                upostag=pos,
                feats=feat,
                head=head,
                deprel=deprel,
                deps=[],
                misc=dict_to_conll_map({
                    k: v
                    for k, v in (
                        ("ppos", ppos),
                        ("phead", phead),
                        ("pdeprel", pdeprel),
                        ("fillpred", fillpred),
                        ("pred", pred),
                        ("apreds", joined_apreds),
                    )
                    # Explicit tests rather than truthiness: a predicted head
                    # of 0 (attachment to the root) must be kept
                    if v is not None and v != "" and v != "_"
                }),
            ))

        # Now deal with the other tokens, their head will simply be the first token,
        # with the relation 'fixed'. The head is given as the original
        # identifier since it goes through the same mapping as the others below
        for t in tokens[1:]:
            res.append(
                libginger.Node(identifier=len(res),
                               form=t,
                               head=identifier,
                               deprel="fixed"))

    # Now that we have a `Node` for every node, let's do the linking
    for n in res[1:]:
        try:
            n.head = res[conllx_to_conllu_identifiers[n.head]]
            n.deps = [(res[conllx_to_conllu_identifiers[dep_head]], dep)
                      for dep_head, dep in n.deps]
        except KeyError as err:
            raise ParsingError(
                "Reference to unknown node {key}".format(key=err)) from err

    return libginger.Tree(res)