def _conllu_tree(tree_lines_lst: ty.Iterable[str]) -> libginger.Tree:
    """Return a tree (that is a list of `Nodes`) from its
    [CoNLL-U string representation](http://universaldependencies.org/format.html).

    Lines starting with `#` are comments; those shaped like `# key = value`
    are collected as metadata, of which only `sent_id` and `text` are
    forwarded to the returned `Tree`. Multi-word token lines (`a-b`
    identifiers) become `MultiTokenNode`s and empty nodes (`a.b` identifiers)
    are skipped.

    Raises `ParsingError` when a line does not have exactly 10 tab-separated
    columns and `FieldParsingError` when the ID, FEATS, HEAD or DEPS column
    can not be parsed.
    """
    root = libginger.Node(identifier=0, form='ROOT')
    res = [root]
    # Efficient storage of referenceable nodes for faster retrieval.
    # NOTE(review): it is indexed by identifier below, which assumes that word
    # lines come in order with consecutive identifiers starting at 1 — confirm.
    full_nodes = [root]
    # We will fill these out if they are given
    metadata = {}  # type: ty.Dict[str, str]

    # First get the self-contained values, deal with references later
    for i, line in enumerate(l.strip() for l in tree_lines_lst):
        if line.startswith('#'):
            # Extract metadata
            metadata_match = re.match(r'#\s*(?P<key>.+?)\s*=\s*(?P<value>.*)', line)
            if metadata_match:
                metadata[metadata_match.group('key')] = metadata_match.group('value')
            continue

        try:
            (identifier, form, lemma, upostag, xpostag, feats, head, deprel,
             deps, misc) = line.split('\t')
        except ValueError as err:
            raise ParsingError(
                'At line {i} : 10 columns expected, got {n} ({line!r})'.format(
                    i=i, n=len(line.split('\t')), line=line)) from err

        # Deal with multi-word tokens
        if not identifier.isnumeric() and re.match(r'\d+-\d+', identifier):
            a, b = identifier.split('-')
            form, misc = (e if e != '_' else None for e in (form, misc))
            # The span is made of placeholders for now, they get replaced by
            # the actual nodes once every line has been read
            new_node = libginger.MultiTokenNode(
                (PlaceholderNode(k) for k in range(int(a), int(b) + 1)),
                form, misc)
        else:
            try:
                identifier = _parse_conll_identifier(identifier, i, 'ID', non_zero=True)
            except ValueError as err:
                # TODO: Issue a warning here
                # The dot is escaped so that only genuine empty node
                # identifiers (`1.1`) are skipped, not any `1?1`-shaped garbage
                if re.match(r'\d+\.\d+', identifier):
                    # Skip empty nodes
                    continue
                raise FieldParsingError(i, 'ID', 'CoNLL-U', identifier) from err

            try:
                feats = conll_map_to_dict(feats)
            except (ParsingError, ValueError) as err:
                raise FieldParsingError(i, 'FEATS', 'CoNLL-U', feats) from err

            try:
                head = PlaceholderNode(_parse_conll_identifier(head, i, 'HEAD'))
            except ValueError as err:
                raise FieldParsingError(i, 'HEAD', 'CoNLL-U', head) from err

            try:
                if deps == '_':
                    deps = []
                else:
                    # Split on the first colon only: the relation itself may
                    # hold colons (subtypes such as `nmod:tmod`)
                    deps = [
                        (PlaceholderNode(_parse_conll_identifier(n, i, 'HEAD')), d)
                        for n, d in (e.split(':', 1) for e in deps.split('|'))
                    ]
            except ValueError as err:
                raise FieldParsingError(i, 'DEPS', 'CoNLL-U', deps) from err

            (form, lemma, upostag, xpostag, deprel, misc) = (
                e if e != '_' else None
                for e in (form, lemma, upostag, xpostag, deprel, misc))
            new_node = libginger.Node(identifier, form, lemma, upostag, xpostag,
                                      feats, head, deprel, deps, misc)
            # Only actual words can be referenced by HEAD, DEPS and spans
            full_nodes.append(new_node)
        res.append(new_node)

    # Now deal with references
    for node in res[1:]:
        if isinstance(node, libginger.MultiTokenNode):
            node.span = [
                full_nodes[placeholder.identifier] for placeholder in node.span
            ]
        else:
            node.head = full_nodes[node.head.identifier]
            node.deps = [(next(n for n in res if n.identifier == head.identifier), dep)
                         for head, dep in node.deps]

    return libginger.Tree(
        res,
        **{
            key: value
            for key, value in metadata.items() if key in ('sent_id', 'text')
        },
    )
def _conllx_tree(tree_lst: ty.Iterable[str]) -> libginger.Tree:
    """Create an Universal Dependencies tree from a CoNLL-X tree.

    Lines starting with `#` are skipped. A FORM that holds several tokens
    (as matched by `\\w+|\\S`) is split: the first token carries the
    annotations of the line and every other token is attached to it with the
    relation 'fixed'. Nodes are renumbered accordingly.

    Raises `ParsingError` when a line does not have exactly 10 tab-separated
    columns and `FieldParsingError` when the ID, FORM, FEATS, HEAD or PHEAD
    column can not be parsed.
    """
    root = libginger.Node(identifier=0, form='ROOT')
    res = [root]
    # Maps the identifiers found in the file to positions in `res`, which may
    # differ once multi-token forms have been split
    conllx_to_conllu_identifiers = {0: 0}
    for i, line in enumerate(l.strip() for l in tree_lst):
        # Skip comment lines
        if line.startswith('#'):
            continue
        try:
            (identifier, form, lemma, upostag, xpostag, feats, head, deprel,
             phead, pdeprel) = line.split('\t')
        except ValueError as err:
            # TODO: Issue a warning here
            raise ParsingError(
                'At line {i} : 10 columns expected, got {n} ({line!r})'.format(
                    i=i, n=len(line.split('\t')), line=line)) from err

        try:
            identifier = _parse_conll_identifier(identifier, i, 'ID', non_zero=True)
        except ValueError as err:
            raise FieldParsingError(i, 'ID', 'CoNLL-X', identifier) from err

        lemma = re.sub(r'\s', '_', lemma)

        try:
            feats = conll_map_to_dict(feats)
        except (ParsingError, ValueError) as err:
            # Be nice : if empty, it should be an underscore, but let's be nice with spaces and
            # empty strings, too
            if feats.isspace() or not feats:
                # TODO: Issue a warning here
                feats = dict()
            else:
                raise FieldParsingError(i, 'FEATS', 'CoNLL-X', feats) from err

        try:
            head = _parse_conll_identifier(head, i, 'HEAD')
        except ValueError as err:
            raise FieldParsingError(i, 'HEAD', 'CoNLL-X', head) from err

        try:
            phead = _parse_conll_identifier(phead, i, 'PHEAD')
        except ValueError as err:
            if phead == '_':
                phead, pdeprel = None, None
            else:
                raise FieldParsingError(i, 'PHEAD', 'CoNLL-X', phead) from err

        # Deal with multi-token words
        tokens = re.findall(r'\w+|\S', form)
        if not tokens:
            # An empty or whitespace-only FORM does not yield any token
            raise FieldParsingError(i, 'FORM', 'CoNLL-X', form)

        # Deal with the first token
        real_identifier = len(res)
        conllx_to_conllu_identifiers[identifier] = real_identifier
        (lemma, upostag, xpostag, deprel, pdeprel) = (
            e if e != '_' else None
            for e in (lemma, upostag, xpostag, deprel, pdeprel))
        res.append(
            libginger.Node(
                identifier=real_identifier,
                form=tokens[0],
                lemma=lemma,
                upostag=upostag,
                xpostag=xpostag,
                feats=feats,
                head=head,
                deprel=deprel,
                deps=[] if phead is None else [(phead, pdeprel)],
            ),
        )

        # Now deal with the other tokens, their head will simply be the first token,
        # with the relation 'fixed'. The head is stored as the original
        # identifier since the linking pass below maps it to the new one.
        for t in tokens[1:]:
            res.append(
                libginger.Node(identifier=len(res), form=t, head=identifier,
                               deprel='fixed'))

    # Now that we have a `Node` for every node, let's do the linking
    for n in res[1:]:
        n.head = res[conllx_to_conllu_identifiers[n.head]]
        n.deps = [(res[conllx_to_conllu_identifiers[head]], deprel)
                  for head, deprel in n.deps]
    return libginger.Tree(res)
def _conll2009_sys_tree(tree_lst: ty.Iterable[str]) -> libginger.Tree:
    """Create an Universal Dependencies tree from a CoNLL-2009 tree.

    This takes only the predicted columns into account (PLEMMA, PPOS, PFEAT,
    PHEAD and PDEPREL). The gold POS, HEAD and DEPREL as well as FILLPRED,
    PRED and the APRED columns are stored in the `misc` attribute.

    Lines starting with `#` are skipped. A FORM that holds several tokens
    (as matched by `\\w+|\\S`) is split: the first token carries the
    annotations of the line and every other token is attached to it with the
    relation 'fixed'.

    Raises `ParsingError` when a line has less than 14 tab-separated columns
    and `FieldParsingError` when the ID, FORM, PFEAT, PHEAD or HEAD column
    can not be parsed.
    """
    root = libginger.Node(identifier=0, form='ROOT')
    res = [root]
    # Maps the identifiers found in the file to positions in `res`, which may
    # differ once multi-token forms have been split
    conllx_to_conllu_identifiers = {0: 0}
    for i, line in enumerate(l.strip() for l in tree_lst):
        # Skip comment lines
        if line.startswith('#'):
            continue
        try:
            (identifier, form, lemma, plemma, pos, ppos, feat, pfeat, head,
             phead, deprel, pdeprel, fillpred, pred, *apreds) = line.split('\t')
        except ValueError as err:
            raise ParsingError(
                'At line {i} : at least 14 columns expected, got {n} ({line!r})'
                .format(i=i, n=len(line.split('\t')), line=line)) from err

        try:
            identifier = _parse_conll_identifier(identifier, i, 'ID', non_zero=True)
        except ValueError as err:
            raise FieldParsingError(i, 'ID', 'CoNLL-2009', identifier) from err

        plemma = re.sub(r'\s', '_', plemma)

        try:
            pfeat = conll_map_to_dict(pfeat)
        except (ParsingError, ValueError) as err:
            # Be nice : if empty, it should be an underscore, but let's be nice with spaces and
            # empty strings, too
            if pfeat.isspace() or not pfeat:
                # TODO: Issue a warning here
                pfeat = dict()
            else:
                raise FieldParsingError(i, 'PFEAT', 'CoNLL-2009', pfeat) from err

        try:
            phead = _parse_conll_identifier(phead, i, 'PHEAD')
        except ValueError as err:
            raise FieldParsingError(i, 'PHEAD', 'CoNLL-2009', phead) from err

        try:
            head = _parse_conll_identifier(head, i, 'HEAD')
        except ValueError as err:
            if head == '_':
                head, deprel = None, None
            else:
                raise FieldParsingError(i, 'HEAD', 'CoNLL-2009', head) from err

        # Deal with multi-token words
        tokens = re.findall(r'\w+|\S', form)
        if not tokens:
            # An empty or whitespace-only FORM does not yield any token
            raise FieldParsingError(i, 'FORM', 'CoNLL-2009', form)

        # Deal with the first token
        real_identifier = len(res)
        conllx_to_conllu_identifiers[identifier] = real_identifier
        (plemma, pos, ppos, deprel, pdeprel, fillpred) = (
            e if e != '_' else None
            for e in (plemma, pos, ppos, deprel, pdeprel, fillpred))

        # The APRED columns are kept verbatim, `_` placeholders included, so
        # that positions are preserved and `join` only ever sees strings.
        # They are left out altogether when none of them is filled.
        if any(a != '_' for a in apreds):
            apreds_str = ','.join(apreds)
        else:
            apreds_str = None

        gold_attributes = (
            ('pos', pos),
            ('head', head),
            ('deprel', deprel),
            ('fillpred', fillpred),
            ('pred', pred),
            ('apreds', apreds_str),
        )
        res.append(
            libginger.Node(
                identifier=real_identifier,
                form=tokens[0],
                lemma=plemma,
                upostag=ppos,
                feats=pfeat,
                head=phead,
                deprel=pdeprel,
                deps=[],
                misc=dict_to_conll_map(
                    {
                        # Explicit comparison rather than truthiness: a gold
                        # head of 0 (attachment to the root) must be kept
                        k: v
                        for k, v in gold_attributes
                        if v not in (None, '', '_')
                    },
                ),
            ),
        )

        # Now deal with the other tokens, their head will simply be the first token,
        # with the relation 'fixed'. The head is stored as the original
        # identifier since the linking pass below maps it to the new one.
        for t in tokens[1:]:
            res.append(
                libginger.Node(identifier=len(res), form=t, head=identifier,
                               deprel='fixed'))

    # Now that we have a `Node` for every node, let's do the linking
    for n in res[1:]:
        n.head = res[conllx_to_conllu_identifiers[n.head]]
        n.deps = [(res[conllx_to_conllu_identifiers[head]], deprel)
                  for head, deprel in n.deps]
    return libginger.Tree(res)
def _conll2009_gold_tree(tree_lst: ty.Iterable[str]) -> libginger.Tree:
    """Create an Universal Dependencies tree from a CoNLL-2009 tree.

    This takes only the gold columns into account (LEMMA, POS, FEAT, HEAD
    and DEPREL). The predicted PPOS, PHEAD and PDEPREL as well as FILLPRED,
    PRED and the APRED columns are stored in the `misc` attribute.

    Lines starting with `#` are skipped. A FORM that holds several tokens
    (as matched by `\\w+|\\S`) is split: the first token carries the
    annotations of the line and every other token is attached to it with the
    relation 'fixed'.

    Raises `ParsingError` when a line has less than 14 tab-separated columns
    and `FieldParsingError` when the ID, FORM, FEAT, HEAD or PHEAD column
    can not be parsed.
    """
    root = libginger.Node(identifier=0, form="ROOT")
    res = [root]
    # Maps the identifiers found in the file to positions in `res`, which may
    # differ once multi-token forms have been split
    conllx_to_conllu_identifiers = {0: 0}
    for i, line in enumerate(l.strip() for l in tree_lst):
        # Skip comment lines
        if line.startswith("#"):
            continue
        try:
            (
                identifier,
                form,
                lemma,
                plemma,
                pos,
                ppos,
                feat,
                pfeat,
                head,
                phead,
                deprel,
                pdeprel,
                fillpred,
                pred,
                *apreds,
            ) = line.split("\t")
        except ValueError as err:
            # TODO: Issue a warning here
            raise ParsingError(
                "At line {i} : at least 14 columns expected, got {n} ({line!r})"
                .format(i=i, n=len(line.split("\t")), line=line)) from err

        try:
            identifier = _parse_conll_identifier(identifier, i, "ID", non_zero=True)
        except ValueError as err:
            raise FieldParsingError(i, "ID", "CoNLL-2009", identifier) from err

        lemma = re.sub(r"\s", "_", lemma)

        try:
            feat = conll_map_to_dict(feat)
        except (ParsingError, ValueError) as err:
            # Be nice : if empty, it should be an underscore, but let's be nice with spaces and
            # empty strings, too
            if feat.isspace() or not feat:
                # TODO: Issue a warning here
                feat = dict()
            else:
                raise FieldParsingError(i, "FEAT", "CoNLL-2009", feat) from err

        try:
            head = _parse_conll_identifier(head, i, "HEAD")
        except ValueError as err:
            raise FieldParsingError(i, "HEAD", "CoNLL-2009", head) from err

        try:
            phead = _parse_conll_identifier(phead, i, "PHEAD")
        except ValueError as err:
            if phead == "_":
                phead, pdeprel = None, None
            else:
                raise FieldParsingError(i, "PHEAD", "CoNLL-2009", phead) from err

        # Deal with multi-token words
        tokens = re.findall(r"\w+|\S", form)
        if not tokens:
            # An empty or whitespace-only FORM does not yield any token
            raise FieldParsingError(i, "FORM", "CoNLL-2009", form)

        # Deal with the first token
        real_identifier = len(res)
        conllx_to_conllu_identifiers[identifier] = real_identifier
        (lemma, pos, ppos, deprel, pdeprel, fillpred) = (
            e if e != "_" else None
            for e in (lemma, pos, ppos, deprel, pdeprel, fillpred))

        # The APRED columns are kept verbatim, `_` placeholders included, so
        # that positions are preserved and `join` only ever sees strings.
        # They are left out altogether when none of them is filled.
        if any(a != "_" for a in apreds):
            apreds_str = ",".join(apreds)
        else:
            apreds_str = None

        predicted_attributes = (
            ("ppos", ppos),
            ("phead", phead),
            ("pdeprel", pdeprel),
            ("fillpred", fillpred),
            ("pred", pred),
            ("apreds", apreds_str),
        )
        # TODO: update this to use `PlaceholderNode`s
        res.append(
            libginger.Node(
                identifier=real_identifier,
                form=tokens[0],
                lemma=lemma,
                upostag=pos,
                feats=feat,
                head=head,
                deprel=deprel,
                deps=[],
                misc=dict_to_conll_map({
                    # Explicit comparison rather than truthiness: a predicted
                    # head of 0 (attachment to the root) must be kept
                    k: v
                    for k, v in predicted_attributes
                    if v not in (None, "", "_")
                }),
            ))

        # Now deal with the other tokens, their head will simply be the first token,
        # with the relation 'fixed'. The head is stored as the original
        # identifier since the linking pass below maps it to the new one.
        for t in tokens[1:]:
            res.append(
                libginger.Node(identifier=len(res), form=t, head=identifier,
                               deprel="fixed"))

    # Now that we have a `Node` for every node, let's do the linking
    for n in res[1:]:
        n.head = res[conllx_to_conllu_identifiers[n.head]]
        n.deps = [(res[conllx_to_conllu_identifiers[head]], deprel)
                  for head, deprel in n.deps]
    return libginger.Tree(res)