def test_select_nodes_by_layer(): """Are RST nodes correctly filtered based on their layer?""" rst_filepath = os.path.join(pcc.path, 'rst/maz-10374.rs3') rdg = dg.read_rst(rst_filepath) rst_node_ids = list(dg.select_nodes_by_layer(rdg, 'rst')) rst_nodes = list(dg.select_nodes_by_layer(rdg, 'rst', data=True)) assert len(rdg) == len(rst_node_ids) == len(rst_nodes) == 195
def test_select_nodes_by_layer(): """Are MMAX2 nodes correctly filtered based on their layer?""" coref_fpath = os.path.join(pcc.path, 'coreference/maz-10374.mmax') cdg = dg.read_mmax2(coref_fpath) coref_node_ids = list(dg.select_nodes_by_layer(cdg, 'mmax')) coref_nodes = list(dg.select_nodes_by_layer(cdg, 'mmax', data=True)) assert len(coref_node_ids) == len(cdg) == 231
def test_select_nodes_by_layer(): """Are nodes correctly filtered based on their layer?""" conano_fpath = os.path.join(pcc.path, 'connectors/maz-10374.xml') codg = dg.read_conano(conano_fpath) conano_node_ids = list(dg.select_nodes_by_layer(codg, 'conano')) conano_nodes = list(dg.select_nodes_by_layer(codg, 'conano', data=True)) assert len(codg) == len(conano_node_ids) == len(conano_nodes) == 188
def test_select_nodes_by_layer(): """Are Tiger syntax nodes correctly filtered based on their layer?""" tiger_fpath = os.path.join(pcc.path, 'syntax/maz-10374.xml') tdg = dg.read_tiger(tiger_fpath) tiger_node_ids = list(dg.select_nodes_by_layer(tdg, 'tiger')) tiger_nodes = list(dg.select_nodes_by_layer(tdg, 'tiger', data=True)) assert len(tdg) == len(tiger_node_ids) == 253
def get_potential_markables(docgraph):
    """
    Returns a list of all NPs and PPs in the given docgraph.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph that (at least) contains syntax trees
        (imported from Tiger XML files)

    Returns
    -------
    potential_markables : list of str or int
        Node IDs of all nodes that represent an NP/PP syntactic
        category/phrase in the input document. If an NP is embedded
        in a PP, only the node ID of the PP is returned.
    """
    potential_markables = []
    for node_id, nattr in dg.select_nodes_by_layer(docgraph, 'tiger:syntax',
                                                   data=True):
        if nattr['tiger:cat'] == 'NP':
            # if an NP is embedded in a PP, only add the PP
            pp_parent = False
            for source, target in docgraph.in_edges(node_id):
                parent_node = docgraph.node[source]
                if parent_node.get('tiger:cat') == 'PP':
                    potential_markables.append(source)  # add parent PP phrase
                    pp_parent = True
            if not pp_parent:
                potential_markables.append(node_id)  # add NP phrase
        elif nattr['tiger:cat'] == 'PP':
            potential_markables.append(node_id)  # add PP phrase
    return potential_markables
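# A hedged usage sketch for get_potential_markables(); the file path is
# illustrative, and dg.read_tiger() is used as in the tests above:
import discoursegraphs as dg

tdg = dg.read_tiger('syntax/maz-10374.xml')
for markable_id in get_potential_markables(tdg):
    # each ID names an NP or PP phrase node; NPs embedded in a PP are
    # represented by their dominating PP
    print('{0}\t{1}'.format(markable_id,
                            tdg.node[markable_id].get('tiger:cat')))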
def get_conano_units(docgraph, data=True, conano_namespace='conano'):
    """
    Yield all Conano units that occur in the given document graph,
    sorted by their unit ID. int(ernal) and ext(ernal) count as
    distinct units.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph which contains Conano annotations
    data : bool
        If True (default), yields (unit ID, list of token node IDs)
        tuples. If False, yields just unit IDs.
    conano_namespace : str
        The namespace that the Conano annotations use (default: conano)

    Yields
    ------
    unit : str or (str, list of str) tuple
        If data=False, this will just yield the node ID of each unit.
        If data=True, this yields (unit ID, list of token node IDs
        that the unit spans) tuples.
    """
    for unit_id in sorted(select_nodes_by_layer(docgraph,
                                                conano_namespace + ':unit'),
                          key=natural_sort_key):
        yield (unit_id, get_span(docgraph, unit_id)) if data else unit_id
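# A hedged usage sketch for get_conano_units(), assuming a Conano file
# imported with dg.read_conano() as in the tests above; the path is
# illustrative:
import discoursegraphs as dg

codg = dg.read_conano('connectors/maz-10374.xml')
for unit_id, token_ids in get_conano_units(codg):
    # print each int/ext unit with its tokenized text
    print('{0}: {1}'.format(
        unit_id, ' '.join(codg.get_token(tok) for tok in token_ids)))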
def __gen_struct_anno_files(self, top_level_layer):
    """
    A struct annotation file contains node (struct) attributes (of
    non-token nodes). It is used, e.g., to annotate the type of a
    syntactic category (NP, VP etc.).

    See also: __gen_hierarchy_file()
    """
    paula_id = '{0}.{1}.{2}_{3}_struct'.format(top_level_layer,
                                               self.corpus_name, self.name,
                                               top_level_layer)
    E, tree = gen_paula_etree(paula_id)

    base_paula_id = self.paulamap['hierarchy'][top_level_layer]
    mflist = E('multiFeatList', {XMLBASE: base_paula_id + '.xml'})

    for node_id in select_nodes_by_layer(self.dg, top_level_layer):
        if not istoken(self.dg, node_id):
            mfeat = E('multiFeat', {XLINKHREF: '#{0}'.format(node_id)})
            node_dict = self.dg.node[node_id]
            for attr in node_dict:
                if attr not in IGNORED_NODE_ATTRIBS:
                    mfeat.append(E('feat', {'name': attr,
                                            'value': node_dict[attr]}))
            if self.human_readable:  # adds the node label as a <!--comment-->
                mfeat.append(Comment(node_dict.get('label')))
            mflist.append(mfeat)

    tree.append(mflist)
    self.files[paula_id] = tree
    self.file2dtd[paula_id] = PaulaDTDs.multifeat
    return paula_id
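# __gen_struct_anno_files() is a private helper of the PAULA exporter.
# A hedged sketch of how the exporter might be invoked end-to-end,
# assuming the public entry point is exposed as dg.write_paula() (the
# name and signature may differ between versions):
import discoursegraphs as dg

tdg = dg.read_tiger('syntax/maz-10374.xml')  # path is illustrative
dg.write_paula(tdg, '/tmp/paula_out')  # writes one XML file per layer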
def __add_annotation_tier(self, docgraph, body, annotation_layer):
    """
    Adds a span-based annotation layer as a <tier> to the Exmaralda
    <body>.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        the document graph from which the annotation spans will be
        extracted
    body : etree._Element
        an etree representation of the <basic_body> element (and all
        its descendants) of the Exmaralda file
    annotation_layer : str
        the name of a layer, e.g. 'tiger', 'tiger:token' or
        'mmax:sentence'
    """
    layer_cat = annotation_layer.split(':')[-1]
    temp_tier = self.E('tier',
                       {'id': "TIE{}".format(self.tier_count),
                        'category': layer_cat, 'type': "t",
                        'display-name': "[{}]".format(annotation_layer)})
    self.tier_count += 1

    for node_id in select_nodes_by_layer(docgraph, annotation_layer):
        span_node_ids = get_span(docgraph, node_id)
        if span_node_ids:
            start_id, end_id = self.__span2event(span_node_ids)
            event_label = docgraph.node[node_id].get('label', '')
            event = self.E('event',
                           {'start': "T{}".format(start_id),
                            'end': "T{}".format(end_id)},
                           event_label)
            temp_tier.append(event)
    body.append(temp_tier)
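# A self-contained sketch of the <tier>/<event> structure that
# __add_annotation_tier() emits, built with lxml.builder directly; the
# IDs, category and label are invented for illustration:
from lxml import etree
from lxml.builder import ElementMaker

E = ElementMaker()
tier = E('tier', {'id': 'TIE1', 'category': 'sentence', 'type': 't',
                  'display-name': '[mmax:sentence]'})
# one <event> per annotated span, delimited by start/end timeline IDs
tier.append(E('event', {'start': 'T0', 'end': 'T12'}, 'sentence label'))
print(etree.tostring(tier, pretty_print=True).decode('utf-8'))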
def get_sentences_and_token_nodes(self):
    """
    Returns a list of sentence root node IDs and a list of sentences,
    where each list contains the token node IDs of that sentence.
    Both lists will be empty if sentences were not annotated in the
    original MMAX2 data.

    TODO: Refactor this! There's code overlap with
    self.add_annotation_layer(). Ideally, we would always import
    sentence annotations and filter them out in the exporters (e.g.
    Exmaralda, CoNLL), probably by modifying get_pointing_chains().

    Returns
    -------
    sentence_root_nodes : list of str
        a list of all sentence root node IDs, in the order they occur
        in the text
    token_nodes : list of list of str
        a list of lists. each list represents a sentence and contains
        token node IDs (in the order they occur in the text)
    """
    token_nodes = []
    # if sentence annotations were ignored during MMAXDocumentGraph
    # construction, we need to extract sentence/token node IDs manually
    if self.ignore_sentence_annotations:
        mp = self.mmax_project
        layer_dict = mp.annotations['sentence']
        file_id = self.get_file_id(self.name)
        sentence_anno_file = os.path.join(
            mp.project_path, mp.paths['markable'],
            file_id + layer_dict['file_extension'])
        tree = etree.parse(sentence_anno_file)
        root = tree.getroot()

        sentence_root_nodes = []
        for markable in root.iterchildren():
            sentence_root_nodes.append(markable.attrib['id'])

            sentence_token_nodes = []
            for token_id in spanstring2tokens(self, markable.attrib['span']):
                # ignore token IDs that aren't used in the *_words.xml file
                # NOTE: we only need this filter for broken files in the
                # PCC corpus
                if token_id in self.tokens:
                    sentence_token_nodes.append(token_id)
            self.add_node(markable.attrib['id'],
                          layers={self.ns, self.ns + ':sentence'})
            token_nodes.append(sentence_token_nodes)
    else:
        sentence_root_nodes = list(
            select_nodes_by_layer(self, self.ns + ':sentence'))
        for sent_node in sentence_root_nodes:
            sentence_token_nodes = []
            for token_id in self.get_token_nodes_from_sentence(sent_node):
                # ignore token IDs that aren't used in the *_words.xml file
                # NOTE: we only need this filter for broken files in the
                # PCC corpus
                if token_id in self.tokens:
                    sentence_token_nodes.append(token_id)
            token_nodes.append(sentence_token_nodes)
    return sentence_root_nodes, token_nodes
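# A hedged usage sketch, assuming this method lives on the MMAX2
# document graph returned by dg.read_mmax2() (the path is illustrative):
import discoursegraphs as dg

cdg = dg.read_mmax2('coreference/maz-10374.mmax')
sentence_roots, token_nodes = cdg.get_sentences_and_token_nodes()
for root_id, sent_token_ids in zip(sentence_roots, token_nodes):
    print('{0}: {1}'.format(
        root_id, ' '.join(cdg.get_token(tok) for tok in sent_token_ids)))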
def test_select_nodes_by_layer(): """Are nodes correctly filtered based on their layer?""" ddg = dg.DiscourseDocumentGraph(namespace='test') assert len(ddg) == 1 add_tokens(ddg, ['The', 'dog', 'barks', '.']) assert len(ddg) == 5 # don't filter any nodes all_node_ids = list(dg.select_nodes_by_layer(ddg)) all_nodes = list(dg.select_nodes_by_layer(ddg, data=True)) assert len(all_node_ids) == len(all_nodes) == 5 test_node_ids = list(dg.select_nodes_by_layer(ddg, 'test')) test_nodes = list(dg.select_nodes_by_layer(ddg, 'test', data=True)) assert len(ddg) == len(test_node_ids) == len(test_nodes) == 5 ddg.add_node(10, layers={'foo'}) ddg.add_node(11, layers={'bar'}) # filter several layers test_foo_ids = list(dg.select_nodes_by_layer(ddg, layer={'test', 'foo'})) test_foo_nodes = list(dg.select_nodes_by_layer( ddg, layer={'test', 'foo'}, data=True)) assert len(test_foo_ids) == len(test_foo_nodes) == 6 test_foobar_ids = list(dg.select_nodes_by_layer( ddg, layer={'test', 'foo', 'bar'})) assert len(test_foobar_ids) == 7 # test if data=True works as expected for nodelist in (all_nodes, test_nodes, test_foo_nodes): for node_id, attr_dict in nodelist: assert isinstance(node_id, (str, int)) assert isinstance(attr_dict, dict)
def test_fix_148():
    """Are all Tiger sentence root nodes part of the 'tiger:syntax' layer?"""
    # maz-00002: sentence with a VROOT; maz-10374: normal sentence root
    for tiger_doc in ('maz-00002.xml', 'maz-10374.xml'):
        tiger_fpath = os.path.join(pcc.path, 'syntax', tiger_doc)
        tdg = dg.read_tiger(tiger_fpath)
        assert all('tiger:syntax' in tdg.node[node_id]['layers']
                   for node_id in dg.select_nodes_by_layer(
                       tdg, 'tiger:sentence:root'))
def gen_anaphoricity_str(docgraph, anaphora='es'):
    """Return the document text as a string in which every annotated
    occurrence of the given anaphor ('es' or 'das') is suffixed with
    its annotation category (and a '?' marker if the annotation is
    uncertain)."""
    assert anaphora in ('das', 'es')
    ret_str = u''
    annotated_token_ids = [
        tok_id
        for tok_id in dg.select_nodes_by_layer(docgraph,
                                               docgraph.ns + ':annotated')
        if docgraph.get_token(tok_id).lower() == anaphora]
    for token_id in docgraph.tokens:
        if token_id in annotated_token_ids:
            node = docgraph.node[token_id]
            # append '?' unless the annotator was fully certain
            certainty_str = ('' if node[docgraph.ns + ':certainty'] == '1.0'
                             else '?')
            ret_str += u'{0}/{1}{2} '.format(
                docgraph.get_token(token_id),
                ANNOTATIONS[node[docgraph.ns + ':annotation']],
                certainty_str)
        else:
            ret_str += u'{} '.format(docgraph.get_token(token_id))
    return ret_str
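# A hedged usage sketch for gen_anaphoricity_str(); the reader name
# read_anaphoricity() and the file path are assumptions:
import discoursegraphs as dg

adg = dg.read_anaphoricity('maz-10374.es')  # hypothetical reader call
# prints the text with every annotated 'es' marked with its category
# (and '?' for uncertain annotations)
print(gen_anaphoricity_str(adg, anaphora='es'))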
def __build_markable_token_mapper(self, coreference_layer=None,
                                  markable_layer=None):
    """
    Creates mappings from tokens to the markable spans they belong to
    and the coreference chains these markables are part of.

    Returns
    -------
    tok2markables : dict (str -> set of str)
        Maps from a token (node ID) to all the markables (node IDs)
        it is part of.
    markable2toks : dict (str -> list of str)
        Maps from a markable (node ID) to all the tokens (node IDs)
        that belong to it.
    markable2chains : dict (str -> list of int)
        Maps from a markable (node ID) to all the chains (chain IDs)
        it belongs to.
    """
    tok2markables = defaultdict(set)
    markable2toks = defaultdict(list)
    markable2chains = defaultdict(list)

    coreference_chains = get_pointing_chains(self.docgraph,
                                             layer=coreference_layer)
    for chain_id, chain in enumerate(coreference_chains):
        for markable_node_id in chain:
            markable2chains[markable_node_id].append(chain_id)

    # ID of the first singleton (if there are any)
    singleton_id = len(coreference_chains)

    # tok2markables/markable2toks must contain all markables, not only
    # those which are part of a coreference chain
    for markable_node_id in select_nodes_by_layer(self.docgraph,
                                                  markable_layer):
        span = get_span(self.docgraph, markable_node_id)
        markable2toks[markable_node_id] = span
        for token_node_id in span:
            tok2markables[token_node_id].add(markable_node_id)

        # singletons each represent their own chain (with only one element)
        if markable_node_id not in markable2chains:
            markable2chains[markable_node_id] = [singleton_id]
            singleton_id += 1

    return tok2markables, markable2toks, markable2chains
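# A hedged illustration of the mapping shapes this method returns; all
# IDs are invented for the example:
tok2markables = {'word_1': {'markable_3'}, 'word_2': {'markable_3'}}
markable2toks = {'markable_3': ['word_1', 'word_2']}
markable2chains = {'markable_3': [0]}  # markable_3 belongs to chain 0
# a markable outside every coreference chain would get its own singleton
# chain ID, counted upwards from len(coreference_chains)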