def get_rst_relation_root_nodes(docgraph, data=True, rst_namespace='rst'):
    """
    yield all nodes that dominate one or more RST relations in the given
    document graph (in no particular order).

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph which contains RST annotations
    data : bool
        If True (default), yields (node ID, relation name, list of tokens)
        tuples. If False, yields just node IDs.
    rst_namespace : str
        The namespace that the RST annotations use (default: rst)

    Yields
    ------
    relations : str or (str, str, list of str) tuples
        If data=False, this will just yield node IDs of the nodes that
        directly dominate an RST relation. If data=True, this yields
        tuples of the form: (node ID, relation name, list of tokens that
        this relation spans).
    """
    relname_key = '{0}:rel_name'.format(rst_namespace)
    for node_id, node_attrs in docgraph.nodes_iter(data=True):
        # nodes without a relation name attribute don't dominate a relation;
        # 'span' edges are structural and don't count as relations either
        if relname_key not in node_attrs:
            continue
        rel_name = node_attrs[relname_key]
        if rel_name == 'span':
            continue
        if data:
            yield (node_id, rel_name, get_span(docgraph, node_id))
        else:
            yield node_id
def get_conano_units(docgraph, data=True, conano_namespace='conano'):
    """
    yield all Conano units that occur in the given document graph,
    sorted by their unit ID. int(ernal) and ext(ernal) count as distinct
    units.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph which contains Conano annotations
    data : bool
        If True (default), yields (node ID, list of tokens) tuples.
        If False, yields just unit IDs.
    conano_namespace : str
        The namespace that the Conano annotations use (default: conano)

    Yields
    ------
    units : str or (str, list of str) tuples
        If data=False, this will just yield the node IDs of the Conano
        units. If data=True, this yields (node ID, list of token node IDs
        that the unit spans) tuples.
    """
    unit_layer = conano_namespace + ':unit'
    sorted_unit_ids = sorted(select_nodes_by_layer(docgraph, unit_layer),
                             key=natural_sort_key)
    for unit_id in sorted_unit_ids:
        if data:
            yield (unit_id, get_span(docgraph, unit_id))
        else:
            yield unit_id
def __add_annotation_tier(self, docgraph, body, annotation_layer):
    """
    Add a span-based annotation layer as a <tier> to the Exmaralda <body>.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        the document graph from which the annotation spans will be extracted
    body : etree._Element
        an etree representation of the <basic_body> element (and all its
        descendants) of the Exmaralda file
    annotation_layer : str
        the name of a layer, e.g. 'tiger', 'tiger:token' or 'mmax:sentence'
    """
    # the tier category is the last segment of the layer name,
    # e.g. 'mmax:sentence' -> 'sentence'
    category = annotation_layer.split(':')[-1]
    tier = self.E('tier',
                  {'id': "TIE{}".format(self.tier_count),
                   'category': category,
                   'type': "t",
                   'display-name': "[{}]".format(annotation_layer)})
    self.tier_count += 1

    for anno_node_id in select_nodes_by_layer(docgraph, annotation_layer):
        spanned_tokens = get_span(docgraph, anno_node_id)
        if not spanned_tokens:
            continue  # nodes without a token span produce no <event>
        first_id, last_id = self.__span2event(spanned_tokens)
        label = docgraph.node[anno_node_id].get('label', '')
        tier.append(self.E('event',
                           {'start': "T{}".format(first_id),
                            'end': "T{}".format(last_id)},
                           label))
    body.append(tier)
def get_conano_units(docgraph, data=True, conano_namespace='conano'):
    """
    yield all Conano units that occur in the given document graph,
    sorted by their unit ID. int(ernal) and ext(ernal) count as distinct
    units.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph which contains Conano annotations
    data : bool
        If True (default), yields (node ID, list of tokens) tuples.
        If False, yields just unit IDs.
    conano_namespace : str
        The namespace that the Conano annotations use (default: conano)

    Yields
    ------
    units : str or (str, list of str) tuples
        If data=False, this will just yield the node IDs of the Conano
        units. If data=True, this yields (node ID, list of token node IDs
        that the unit spans) tuples.
    """
    unit_nodes = select_nodes_by_layer(docgraph, conano_namespace + ':unit')
    if data:
        for unit_id in sorted(unit_nodes, key=natural_sort_key):
            yield (unit_id, get_span(docgraph, unit_id))
    else:
        for unit_id in sorted(unit_nodes, key=natural_sort_key):
            yield unit_id
def __add_annotation_tier(self, docgraph, body, annotation_layer):
    """
    Add a span-based annotation layer as a <tier> to the Exmaralda <body>.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        the document graph from which the annotation spans will be extracted
    body : etree._Element
        an etree representation of the <basic_body> element (and all its
        descendants) of the Exmaralda file
    annotation_layer : str
        the name of a layer, e.g. 'tiger', 'tiger:token' or 'mmax:sentence'
    """
    tier_attribs = {
        'id': "TIE{}".format(self.tier_count),
        # last segment of the layer name, e.g. 'mmax:sentence' -> 'sentence'
        'category': annotation_layer.split(':')[-1],
        'type': "t",
        'display-name': "[{}]".format(annotation_layer),
    }
    tier = self.E('tier', tier_attribs)
    self.tier_count += 1

    for node_id in select_nodes_by_layer(docgraph, annotation_layer):
        span = get_span(docgraph, node_id)
        if span:  # only nodes with a token span become <event> elements
            start, end = self.__span2event(span)
            event_attribs = {'start': "T{}".format(start),
                             'end': "T{}".format(end)}
            label = docgraph.node[node_id].get('label', '')
            tier.append(self.E('event', event_attribs, label))
    body.append(tier)
def get_connective(docgraph, unit_id):
    """
    Return the lowercased string of the connective used in the given
    Conano unit.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph which contains Conano annotations
    unit_id : str
        a Conano unit ID of the form '<index>:<type>', e.g. '4:int'

    Returns
    -------
    connective : str
        the space-joined, lowercased tokens of the unit's connective
    """
    # keep the 2-element unpacking: a malformed unit_id raises ValueError
    unit_index, _unit_type = unit_id.split(':')
    connective_id = '{0}:connective'.format(unit_index)
    lowered_tokens = [docgraph.get_token(tok_id).lower()
                      for tok_id in get_span(docgraph, connective_id)]
    return ' '.join(lowered_tokens)
def get_connective(docgraph, unit_id):
    """
    Return the lowercased string of the connective used in the given
    Conano unit.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph which contains Conano annotations
    unit_id : str
        a Conano unit ID of the form '<index>:<type>', e.g. '4:int'

    Returns
    -------
    connective : str
        the space-joined, lowercased tokens of the unit's connective
    """
    # keep the 2-element unpacking: a malformed unit_id raises ValueError
    unit_index, _unit_type = unit_id.split(':')
    token_ids = get_span(docgraph, unit_index + ':connective')
    words = (docgraph.get_token(tok_id).lower() for tok_id in token_ids)
    return ' '.join(words)
def test_tiger_sentence_spans():
    """
    convert a TigerXML sentence (without a secondary edge) into a
    ``TigerSentenceGraph`` and check, if the syntax nodes cover the right
    tokens / string spans.

    Token node IDs follow the pattern 's389_1' .. 's389_11'; syntax node
    IDs use the 's389_5xx' range.
    """
    maz_13125_s389 = etree.fromstring(SENTENCE_WITHOUT_SECEDGE)
    tsg = TigerSentenceGraph(maz_13125_s389)

    # the root element should cover the complete sentence
    assert get_span(tsg, 's389_503') == [
        's389_1', 's389_2', 's389_3', 's389_4', 's389_5', 's389_6',
        's389_7', 's389_8', 's389_9', 's389_10', 's389_11']
    assert get_text(tsg, 's389_503') == \
        u"Was man nicht durch Augenschein nachprüfen kann , ist manipulierbar ."
    assert dg.is_continuous(tsg, 's389_503')

    # a subordinated ('SB') clause
    assert get_span(tsg, 's389_502') == [
        's389_1', 's389_2', 's389_3', 's389_4', 's389_5', 's389_6',
        's389_7']
    assert get_text(tsg, 's389_502') == \
        u"Was man nicht durch Augenschein nachprüfen kann"
    assert dg.is_continuous(tsg, 's389_502')

    # a discontinuously annotated VP ('OC', i.e. a clausal object):
    # it skips tokens 2-3, so is_continuous must be False
    assert get_span(tsg, 's389_501') == [
        's389_1', 's389_4', 's389_5', 's389_6']
    assert get_text(tsg, 's389_501') == \
        u"Was durch Augenschein nachprüfen"
    assert not dg.is_continuous(tsg, 's389_501')

    # a PP modifier ('MO')
    assert get_span(tsg, 's389_500') == ['s389_4', 's389_5']
    assert get_text(tsg, 's389_500') == \
        u"durch Augenschein"
    assert dg.is_continuous(tsg, 's389_500')
def __add_coreference_chain_tiers(self, docgraph, body, min_chain_length=3):
    """
    Add one Exmaralda <tier> per coreference chain to the <body> element.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        the document graph from which the chains will be extracted
    body : etree._Element
        an etree representation of the <basic_body> element (and all its
        descendants) of the Exmaralda file
    min_chain_length : int
        don't add tiers for chains with less than N elements (default: 3)

    TODO: this method assumes that each pointing relation chains signifies
    a coreference chain.
    """
    E = self.E
    for i, chain in enumerate(get_pointing_chains(docgraph)):
        chain_length = len(chain)
        # Check the chain length *before* building the tier: the previous
        # version created the <tier> element and incremented
        # self.tier_count even for chains it then skipped, leaking unused
        # TIE ids (gaps in the numbering) and building dead elements.
        if chain_length < min_chain_length:
            continue  # ignore short chains
        chain_tier = E('tier',
                       {'id': "TIE{}".format(self.tier_count),
                        'category': "chain", 'type': "t",
                        'display-name': "[coref-chain-{}]".format(i)})
        self.tier_count += 1
        for j, node_id in enumerate(chain):
            span_node_ids = get_span(docgraph, node_id)
            if span_node_ids:  # markables without a token span get no event
                start_id, end_id = self.__span2event(span_node_ids)
                # event label encodes the markable's position in its
                # chain, e.g. "chain_0: 3/5"
                element_str = "chain_{0}: {1}/{2}".format(
                    i, chain_length - j, chain_length)
                chain_tier.append(
                    E('event', {'start': "T{}".format(start_id),
                                'end': "T{}".format(end_id)}, element_str))
        body.append(chain_tier)
def __build_markable_token_mapper(self, coreference_layer=None,
                                  markable_layer=None):
    """
    Creates mappings from tokens to the markable spans they belong to
    and the coreference chains these markables are part of.

    Returns
    -------
    tok2markables : dict (str -> set of str)
        Maps from a token (node ID) to all the markables (node IDs)
        it is part of.
    markable2toks : dict (str -> list of str)
        Maps from a markable (node ID) to all the tokens (node IDs)
        that belong to it.
    markable2chains : dict (str -> list of int)
        Maps from a markable (node ID) to all the chains (chain ID) it
        belongs to.
    """
    tok2markables = defaultdict(set)
    markable2toks = defaultdict(list)
    markable2chains = defaultdict(list)

    chains = get_pointing_chains(self.docgraph, layer=coreference_layer)
    for chain_id, chain in enumerate(chains):
        for markable in chain:
            markable2chains[markable].append(chain_id)

    # singleton chain IDs start right after the last real chain ID
    next_singleton_id = len(chains)

    # markable2toks/tok2markables shall contain all markables, not only
    # those which are part of a coreference chain
    for markable in select_nodes_by_layer(self.docgraph, markable_layer):
        token_ids = get_span(self.docgraph, markable)
        markable2toks[markable] = token_ids
        for token_id in token_ids:
            tok2markables[token_id].add(markable)
        # singletons each represent their own chain (with only one element)
        if markable not in markable2chains:
            markable2chains[markable] = [next_singleton_id]
            next_singleton_id += 1

    return tok2markables, markable2toks, markable2chains
def __add_coreference_chain_tiers(self, docgraph, body, min_chain_length=3):
    """
    Add one Exmaralda <tier> per coreference chain to the <body> element.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        the document graph from which the chains will be extracted
    body : etree._Element
        an etree representation of the <basic_body> element (and all its
        descendants) of the Exmaralda file
    min_chain_length : int
        don't add tiers for chains with less than N elements (default: 3)

    TODO: this method assumes that each pointing relation chains signifies
    a coreference chain.
    """
    E = self.E
    for i, chain in enumerate(get_pointing_chains(docgraph)):
        chain_length = len(chain)
        # Fix: filter short chains *before* creating the tier. Previously
        # the <tier> was built and self.tier_count incremented even for
        # skipped chains, which left gaps in the TIE id numbering and
        # created elements that were never appended.
        if chain_length < min_chain_length:
            continue  # ignore short chains
        chain_tier = E('tier',
                       {'id': "TIE{}".format(self.tier_count),
                        'category': "chain", 'type': "t",
                        'display-name': "[coref-chain-{}]".format(i)})
        self.tier_count += 1
        for j, node_id in enumerate(chain):
            span_node_ids = get_span(docgraph, node_id)
            if span_node_ids:  # markables without a token span get no event
                start_id, end_id = self.__span2event(span_node_ids)
                # event label encodes the markable's position in its
                # chain, e.g. "chain_0: 3/5"
                element_str = "chain_{0}: {1}/{2}".format(
                    i, chain_length - j, chain_length)
                chain_tier.append(
                    E('event', {'start': "T{}".format(start_id),
                                'end': "T{}".format(end_id)}, element_str))
        body.append(chain_tier)
def test_get_span():
    """get spans from an sentence graph with dominance, spanning and
    pointing relations, but without self-loops"""
    sg1 = make_sentencegraph1()
    # the graph contains a cycle (via a pointing relation), so it is not a DAG
    assert is_directed_acyclic_graph(sg1) is False
    assert len(sg1) == 15
    # token nodes only "span" themselves
    for i in range(8):
        assert dg.get_span(sg1, i) == [i]
    # the sentence covers all tokens, except for the markers ',' and '.'
    assert dg.get_span(sg1, 'S') == [0, 1, 3, 4, 5, 6]
    assert dg.get_span(sg1, 'NP1') == [0]
    assert dg.get_span(sg1, 'VP1') == [1]
    assert dg.get_span(sg1, 'SBAR') == [3, 4, 5, 6]
    assert dg.get_span(sg1, 'NP2') == [3]
    assert dg.get_span(sg1, 'VP2') == [4, 5, 6]
    # the debug parameter should 'raise' a warning (since the graph is
    # cyclic), but the result must be the same)
    assert dg.get_span(sg1, 'S', debug=True) == [0, 1, 3, 4, 5, 6]
    # get_span() must be robust against self-loops
    sg1.add_edge('SBAR', 'SBAR', layers={sg1.ns+':selfloop'},
                 edge_type=dg.EdgeTypes.dominance_relation)
    assert dg.get_span(sg1, 'S') == [0, 1, 3, 4, 5, 6]
    assert dg.get_span(sg1, 'SBAR') == [3, 4, 5, 6]
    assert dg.get_span(sg1, 'SBAR', debug=True) == [3, 4, 5, 6]
    # get_span() won't be able to recover from a dominance relation
    # (non self)-loop
    sg1.add_edge('NP1', 'S', layers={sg1.ns+':loop'},
                 edge_type=dg.EdgeTypes.dominance_relation)
    with pytest.raises(RuntimeError) as excinfo:
        assert dg.get_span(sg1, 'S')
def test_tiger_sentence_with_secedge_spans():
    """
    convert a TigerXML sentence (with a secondary edge) into a
    ``TigerSentenceGraph`` and check, if the syntax nodes cover the right
    tokens / string spans.

    Token node IDs follow the pattern 's367_1' .. 's367_20'; syntax node
    IDs use the 's367_5xx' range.
    """
    maz_12976_s367 = etree.fromstring(SENTENCE_WITH_SECEDGE)
    tsg_secedge = TigerSentenceGraph(maz_12976_s367)

    # sentence root
    assert get_span(tsg_secedge, 's367_508') == [
        's367_1', 's367_2', 's367_3', 's367_4', 's367_5', 's367_6',
        's367_7', 's367_8', 's367_9', 's367_10', 's367_11', 's367_12',
        's367_13', 's367_14', 's367_15', 's367_16', 's367_17', 's367_18',
        's367_19', 's367_20']
    assert get_text(tsg_secedge, 's367_508') == \
        u"Es kann nicht sein , dass die Basis gewissermaßen die Moral pachtet und ihn die realpolitische Schmutzarbeit machen lässt ."
    assert dg.is_continuous(tsg_secedge, 's367_508')

    # discontinuous NP (skips tokens 2-5, so is_continuous must be False)
    assert get_span(tsg_secedge, 's367_507') == [
        's367_1', 's367_6', 's367_7', 's367_8', 's367_9', 's367_10',
        's367_11', 's367_12', 's367_13', 's367_14', 's367_15', 's367_16',
        's367_17', 's367_18', 's367_19']
    assert get_text(tsg_secedge, 's367_507') == \
        u"Es dass die Basis gewissermaßen die Moral pachtet und ihn die realpolitische Schmutzarbeit machen lässt"
    assert not dg.is_continuous(tsg_secedge, 's367_507')

    # a coordinated sentence ('CS')
    assert get_span(tsg_secedge, 's367_506') == [
        's367_6', 's367_7', 's367_8', 's367_9', 's367_10', 's367_11',
        's367_12', 's367_13', 's367_14', 's367_15', 's367_16', 's367_17',
        's367_18', 's367_19']
    assert get_text(tsg_secedge, 's367_506') == \
        u"dass die Basis gewissermaßen die Moral pachtet und ihn die realpolitische Schmutzarbeit machen lässt"
    assert dg.is_continuous(tsg_secedge, 's367_506')

    # a conjunct sentence ('CJ') with an ingoing secondary edge
    assert get_span(tsg_secedge, 's367_503') == [
        's367_6', 's367_7', 's367_8', 's367_9', 's367_10', 's367_11',
        's367_12']
    assert get_text(tsg_secedge, 's367_503') == \
        u"dass die Basis gewissermaßen die Moral pachtet"
    assert dg.is_continuous(tsg_secedge, 's367_503')

    assert get_span(tsg_secedge, 's367_500') == ['s367_7', 's367_8']
    assert get_text(tsg_secedge, 's367_500') == u"die Basis"
    assert dg.is_continuous(tsg_secedge, 's367_500')

    assert get_span(tsg_secedge, 's367_501') == ['s367_10', 's367_11']
    assert get_text(tsg_secedge, 's367_501') == u"die Moral"
    assert dg.is_continuous(tsg_secedge, 's367_501')

    # a conjunct sentence ('CJ') with an ingoing secondary edge
    assert get_span(tsg_secedge, 's367_505') == [
        's367_14', 's367_15', 's367_16', 's367_17', 's367_18', 's367_19']
    assert get_text(tsg_secedge, 's367_505') == \
        u"ihn die realpolitische Schmutzarbeit machen lässt"
    assert dg.is_continuous(tsg_secedge, 's367_505')

    # a clausal object ('OC') VP
    assert get_span(tsg_secedge, 's367_504') == [
        's367_15', 's367_16', 's367_17', 's367_18']
    assert get_text(tsg_secedge, 's367_504') == \
        u"die realpolitische Schmutzarbeit machen"
    assert dg.is_continuous(tsg_secedge, 's367_504')

    assert get_span(tsg_secedge, 's367_502') == [
        's367_15', 's367_16', 's367_17']
    assert get_text(tsg_secedge, 's367_502') == \
        u"die realpolitische Schmutzarbeit"
    assert dg.is_continuous(tsg_secedge, 's367_502')
def get_rst_relations(docgraph):
    """
    returns a dictionary with RST relation root node IDs (str, e.g.
    'rst:23') as keys and dictionaries describing these RST relations as
    values.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph which contains RST annotations

    Returns
    -------
    rst_relations : defaultdict(str)
        possible keys: 'tokens', 'nucleus', 'satellites', 'multinuc'
        maps from an RST relation root node ID (str, e.g. 'rst:23') to a
        dictionary describing this RST relation.
        The key 'tokens' maps to a list of token (node IDs) which the
        relation spans.
        If the dictionary contains the key 'multinuc', the relation is
        multinuclear and the keys 'nucleus' and 'satellites' contain
        nothing.
        The key 'multinuc' maps to a list of (node ID (str), RST reltype
        (str), list of token node IDs) triples; each one describes a
        nucleus.
        The key 'nucleus' maps to a list of token (node IDs) which the
        relation spans.
        The key 'satellites' maps to a list of (node ID (str), RST
        reltype (str), list of token node IDs) triples; each one
        describes a satellite.
    """
    rst_relations = defaultdict(lambda: defaultdict(str))

    for dom_node, relname, toks in get_rst_relation_root_nodes(docgraph):
        neighbors = list(select_neighbors_by_layer(
            docgraph, dom_node, layer={'rst:segment', 'rst:group'}))
        # tokens dominated by the relation root itself (i.e. not via a
        # segment/group node)
        directly_dominated_tokens = sorted(
            [node for node in docgraph.neighbors(dom_node)
             if istoken(docgraph, node)],
            key=natural_sort_key)
        if directly_dominated_tokens:
            rst_relations[dom_node]['tokens'] = directly_dominated_tokens

        for neighbor in neighbors:
            for edge in docgraph[dom_node][neighbor]:  # multidigraph
                edge_attrs = docgraph[dom_node][neighbor][edge]
                if edge_attrs['edge_type'] == EdgeTypes.spanning_relation:
                    # a span always signifies the nucleus of a relation;
                    # there can be only one
                    rst_relations[dom_node]['nucleus'] = \
                        (neighbor, get_span(docgraph, neighbor))
                elif edge_attrs['rst:rel_type'] == 'rst':
                    # a segment/group nucleus can dominate multiple
                    # satellites (in different RST relations)
                    satellite = (neighbor, edge_attrs['rst:rel_name'],
                                 get_span(docgraph, neighbor))
                    rst_relations[dom_node].setdefault(
                        'satellites', []).append(satellite)
                elif edge_attrs['rst:rel_type'] == 'multinuc':
                    # NOTE: the previous version also kept a
                    # multinuc_nuc_count counter here, but never read it;
                    # that dead code has been removed.
                    nucleus = (neighbor, edge_attrs['rst:rel_name'],
                               get_span(docgraph, neighbor))
                    rst_relations[dom_node].setdefault(
                        'multinuc', []).append(nucleus)
                else:
                    raise NotImplementedError("unknown type of RST segment domination")
    return rst_relations