def parse_spacy(passages, lang, verbose=False):
    """
    Lazily build, for each given passage, a new passage from its dependency annotation.

    Every passage is first run through ``annotate_all``. The head and dependency-relation attributes stored
    in the ``tok`` array of each layer-0 terminal are then assembled into a list of dependency nodes, which
    ``ConlluConverter().build_passage`` turns into a passage carrying the same ID.

    :param passages: iterable of passages to parse
    :param lang: language passed to ``annotate_all``; also the fallback when a passage has no "lang" attribute
    :param verbose: passed on to ``annotate_all``
    :return: generator of (annotated passage, passage built from its dependency nodes) pairs
    """
    # zip() wraps each passage in a 1-tuple to go with as_tuples=True, hence the 1-tuple unpacking in the loop
    annotated = annotate_all(zip(passages), as_array=True, as_tuples=True, lang=lang, verbose=verbose)
    for (passage,) in annotated:
        terminals = list(passage.layer(layer0.LAYER_ID).all)
        terminals.sort(key=operator.attrgetter("position"))

        # Index 0 holds a node without terminal or token (presumably the dependency root - TODO confirm);
        # it is followed by one node per terminal, in position order
        nodes = [ConlluConverter.Node()]
        nodes.extend(
            ConlluConverter.Node(t.position, terminal=t, token=ConlluConverter.Token(t.text, t.tag))
            for t in terminals
        )

        for node in nodes[1:]:
            terminal = node.terminal
            node.token.paragraph = terminal.paragraph
            head = Attr.HEAD(terminal.tok[Attr.HEAD.value])
            if head:
                # NOTE(review): the stored head looks like an offset relative to the token itself, with a
                # falsy value (0) left untouched so that it points at index 0 - confirm against annotate_all
                head += node.position
            rel = Attr.DEP(terminal.tok[Attr.DEP.value], lang=passage.attrib.get("lang", lang))
            assert not (head is None or rel is None), \
                "head=%r, rel=%r for token %d in:\n%s" % (head, rel, node.position, " ".join(map(str, terminals)))
            edge = ConlluConverter.Edge(head, rel, remote=False)
            # The reference to the source terminal is cleared before the edge is linked and attached
            # (presumably so that build_passage creates its own terminals - TODO confirm)
            node.terminal = None
            edge.link_head(nodes)
            node.add_edges([edge])

        yield passage, ConlluConverter().build_passage(nodes, passage.ID)
def from_conllu(lines, passage_id=None, return_original=False, annotate=False, terminals_only=False, dep=False,
                **kwargs):
    """
    Build UCCA passages from parsed text in Universal Dependencies (conllu) format.

    Thin wrapper that forwards all of its arguments, by keyword, to ``ConlluConverter().from_format``.

    :param lines: iterable of lines in Universal Dependencies format, describing a single passage
    :param passage_id: ID to set for the passage
    :param return_original: return triple of (UCCA passage, Universal Dependencies string, sentence ID)
    :param annotate: whether to save dependency annotations in the "extra" dict of layer 0
    :param terminals_only: create only terminals (with any annotation if specified), no non-terminals
    :param dep: return the dependency graph rather than the converted UCCA passage
    :param kwargs: only the "format" entry is read (None when absent) and forwarded as ``format``;
                   any other entry is ignored
    :return: generator of Passage objects
    """
    # Imported here rather than at module level (presumably to avoid a circular import - TODO confirm)
    from semstr.conversion.conllu import ConlluConverter

    converter = ConlluConverter()
    options = {
        "passage_id": passage_id,
        "return_original": return_original,
        "annotate": annotate,
        "terminals_only": terminals_only,
        "dep": dep,
        "format": kwargs.get("format"),
    }
    return converter.from_format(lines, **options)
def to_conllu(passage, test=False, *args, **kwargs):
    """
    Convert a Passage object to Universal Dependencies (conllu) format.

    Thin wrapper around ``ConlluConverter().to_format``, which is always called with ``tree=True``.

    :param passage: the Passage object to convert
    :param test: whether to omit the head and deprel columns. Defaults to False
    :param args: accepted but ignored
    :param kwargs: accepted but ignored
    :return: list of lines representing the semantic dependencies in the passage
    """
    # Imported here rather than at module level (presumably to avoid a circular import - TODO confirm)
    from semstr.conversion.conllu import ConlluConverter

    # Extra arguments are tolerated in the signature but deliberately discarded
    del args, kwargs
    converter = ConlluConverter()
    return converter.to_format(passage, test, tree=True)
def to_conllu(passage, test=False, enhanced=True, preprocess=True, **kwargs):
    """
    Convert a Passage object to Universal Dependencies (conllu) format.

    Thin wrapper around ``ConlluConverter(enhanced=enhanced).to_format``.

    :param passage: the Passage object to convert
    :param test: whether to omit the head and deprel columns. Defaults to False
    :param enhanced: whether to include enhanced edges; given to the ``ConlluConverter`` constructor
    :param preprocess: whether to preprocess the converted dependency graph before returning it
    :param kwargs: only the "format" entry is read (None when absent) and forwarded as ``format``;
                   any other entry is ignored
    :return: list of lines representing the semantic dependencies in the passage
    """
    # Imported here rather than at module level (presumably to avoid a circular import - TODO confirm)
    from semstr.conversion.conllu import ConlluConverter

    converter = ConlluConverter(enhanced=enhanced)
    options = {
        "test": test,
        "preprocess": preprocess,
        "format": kwargs.get("format"),
    }
    return converter.to_format(passage, **options)
def from_conllu(lines, passage_id=None, split=True, return_original=False, annotate=False, *args, **kwargs):
    """
    Build UCCA passages from parsed text in Universal Dependencies (conllu) format.

    Thin wrapper around ``ConlluConverter().from_format``; ``passage_id`` and ``split`` are forwarded
    positionally, the remaining options by keyword.

    :param lines: iterable of lines in Universal Dependencies format, describing a single passage
    :param passage_id: ID to set for the passage
    :param split: whether to split each sentence to its own passage
    :param return_original: return triple of (UCCA passage, Universal Dependencies string, sentence ID)
    :param annotate: whether to save dependency annotations in the "extra" dict of layer 0
    :param args: accepted but ignored
    :param kwargs: accepted but ignored
    :return: generator of Passage objects
    """
    # Imported here rather than at module level (presumably to avoid a circular import - TODO confirm)
    from semstr.conversion.conllu import ConlluConverter

    # Extra arguments are tolerated in the signature but deliberately discarded
    del args, kwargs
    converter = ConlluConverter()
    return converter.from_format(
        lines,
        passage_id,
        split,
        return_original=return_original,
        annotate=annotate,
    )