Example #1
# assumed imports for these test snippets (discoursegraphs test suite):
import os

import discoursegraphs as dg
from discoursegraphs.corpora import pcc


def test_select_nodes_by_layer():
    """Are RST nodes correctly filtered based on their layer?"""
    rst_filepath = os.path.join(pcc.path, 'rst/maz-10374.rs3')
    rdg = dg.read_rst(rst_filepath)
    rst_node_ids = list(dg.select_nodes_by_layer(rdg, 'rst'))
    rst_nodes = list(dg.select_nodes_by_layer(rdg, 'rst', data=True))
    assert len(rdg) == len(rst_node_ids) == len(rst_nodes) == 195
Example #2
def test_select_nodes_by_layer():
    """Are MMAX2 nodes correctly filtered based on their layer?"""
    coref_fpath = os.path.join(pcc.path, 'coreference/maz-10374.mmax')
    cdg = dg.read_mmax2(coref_fpath)
    coref_node_ids = list(dg.select_nodes_by_layer(cdg, 'mmax'))
    coref_nodes = list(dg.select_nodes_by_layer(cdg, 'mmax', data=True))
    assert len(cdg) == len(coref_node_ids) == len(coref_nodes) == 231
Example #3
def test_select_nodes_by_layer():
    """Are nodes correctly filtered based on their layer?"""
    conano_fpath = os.path.join(pcc.path, 'connectors/maz-10374.xml')
    codg = dg.read_conano(conano_fpath)
    conano_node_ids = list(dg.select_nodes_by_layer(codg, 'conano'))
    conano_nodes = list(dg.select_nodes_by_layer(codg, 'conano', data=True))
    assert len(codg) == len(conano_node_ids) == len(conano_nodes) == 188
Example #4
def test_select_nodes_by_layer():
    """Are Tiger syntax nodes correctly filtered based on their layer?"""
    tiger_fpath = os.path.join(pcc.path, 'syntax/maz-10374.xml')
    tdg = dg.read_tiger(tiger_fpath)
    tiger_node_ids = list(dg.select_nodes_by_layer(tdg, 'tiger'))
    tiger_nodes = list(dg.select_nodes_by_layer(tdg, 'tiger', data=True))
    assert len(tdg) == len(tiger_node_ids) == len(tiger_nodes) == 253
Example #5
def get_potential_markables(docgraph):
    """
    returns a list of all NPs and PPs in the given docgraph.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph that (at least) contains syntax trees
        (imported from Tiger XML files)

    Returns
    -------
    potential_markables : list of str or int
        Node IDs of all nodes that represent an NP or PP phrase (syntactic
        category) in the input document. If an NP is embedded in a PP, only
        the node ID of the PP is returned.
    """
    potential_markables = []

    for node_id, nattr in dg.select_nodes_by_layer(docgraph, 'tiger:syntax', data=True):
        if nattr['tiger:cat'] == 'NP':
            # if an NP is embedded in a PP, only add the parent PP
            pp_parent = False
            for source, target in docgraph.in_edges(node_id):
                parent_node = docgraph.node[source]
                if 'tiger:cat' in parent_node and parent_node['tiger:cat'] == 'PP':
                    potential_markables.append(source) # add parent PP phrase
                    pp_parent = True
            if not pp_parent:
                potential_markables.append(node_id) # add NP phrase

        elif nattr['tiger:cat'] == 'PP':
            potential_markables.append(node_id) # add PP phrase
    return potential_markables
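To see what the helper extracts, it can be run on the Tiger syntax graph from Example #4. A minimal sketch (file path and readers follow the test snippets above; nothing here is a fixed API beyond what those snippets already use):

import os

import discoursegraphs as dg
from discoursegraphs.corpora import pcc

tdg = dg.read_tiger(os.path.join(pcc.path, 'syntax/maz-10374.xml'))
# node IDs of all NP/PP phrases; NPs embedded in a PP are subsumed by it
for phrase_id in get_potential_markables(tdg):
    print(phrase_id, tdg.node[phrase_id]['tiger:cat'])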
Example #6
def get_conano_units(docgraph, data=True, conano_namespace='conano'):
    """
    yield all Conano units that occur in the given document graph,
    sorted by their unit ID. int(ernal) and ext(ernal) count as distinct units.

    Parameters
    ----------
    docgraph : DiscourseDocumentGraph
        a document graph which contains Conano annotations
    data : bool
        If True (default), yields (node ID, list of tokens)
        tuples. If False, yields just unit IDs.
    conano_namespace : str
        The namespace that the Conano annotations use (default: conano)

    Yields
    ------
    units : str or (str, list of str) tuple
        If data=False, this will just yield the node IDs of the Conano
        units. If data=True, this yields tuples of the form:
        (node ID, list of token node IDs that the unit spans).
    """
    for unit_id in sorted(select_nodes_by_layer(docgraph, conano_namespace+':unit'),
                          key=natural_sort_key):
        yield (unit_id, get_span(docgraph, unit_id)) if data else unit_id
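Combined with the Conano reader from Example #3, the units can be inspected directly; a hedged sketch (it assumes get_conano_units and get_span are importable alongside the other discoursegraphs helpers shown here):

codg = dg.read_conano(os.path.join(pcc.path, 'connectors/maz-10374.xml'))
for unit_id, token_ids in get_conano_units(codg):
    # unit ID plus the surface string of the tokens the unit spans
    print(unit_id, ' '.join(codg.get_token(tok_id) for tok_id in token_ids))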
Example #7
    def __gen_struct_anno_files(self, top_level_layer):
        """
        A struct annotation file contains node (struct) attributes (of
        non-token nodes). It is e.g. used to annotate the type of a syntactic
        category (NP, VP etc.).

        See also: __gen_hierarchy_file()
        """
        paula_id = '{0}.{1}.{2}_{3}_struct'.format(top_level_layer,
                                                   self.corpus_name, self.name,
                                                   top_level_layer)
        E, tree = gen_paula_etree(paula_id)

        base_paula_id = self.paulamap['hierarchy'][top_level_layer]
        mflist = E('multiFeatList',
                   {XMLBASE: base_paula_id+'.xml'})

        for node_id in select_nodes_by_layer(self.dg, top_level_layer):
            if not istoken(self.dg, node_id):
                mfeat = E('multiFeat',
                          {XLINKHREF: '#{0}'.format(node_id)})
                node_dict = self.dg.node[node_id]
                for attr in node_dict:
                    if attr not in IGNORED_NODE_ATTRIBS:
                        mfeat.append(
                            E('feat',
                              {'name': attr, 'value': node_dict[attr]}))
                if self.human_readable:  # adds node label as a <!--comment-->
                    mfeat.append(Comment(node_dict.get('label')))
                mflist.append(mfeat)
        tree.append(mflist)
        self.files[paula_id] = tree
        self.file2dtd[paula_id] = PaulaDTDs.multifeat
        return paula_id
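For orientation, this is roughly the multiFeat markup the method assembles, rebuilt here with a bare ElementMaker and made-up values (the real code adds XML namespaces and an xml:base attribute, which are omitted for brevity):

from lxml import etree
from lxml.builder import ElementMaker

E = ElementMaker()
mflist = E('multiFeatList')
mfeat = E('multiFeat', {'href': '#s10_505'})  # one non-token node
mfeat.append(E('feat', {'name': 'tiger:cat', 'value': 'NP'}))
mflist.append(mfeat)
print(etree.tostring(mflist, pretty_print=True).decode())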
Example #8
    def __add_annotation_tier(self, docgraph, body, annotation_layer):
        """
        adds a span-based annotation layer as a <tier> to the Exmaralda <body>.

        Parameters
        ----------
        docgraph : DiscourseDocumentGraph
            the document graph from which the annotation layer will be extracted
        body : etree._Element
            an etree representation of the <basic_body> element (and all its
            descendants) of the Exmaralda file
        annotation_layer : str
            the name of a layer, e.g. 'tiger', 'tiger:token' or 'mmax:sentence'
        """
        layer_cat = annotation_layer.split(':')[-1]
        temp_tier = self.E('tier',
                           {'id': "TIE{}".format(self.tier_count),
                            'category': layer_cat, 'type': "t",
                            'display-name': "[{}]".format(annotation_layer)})
        self.tier_count += 1

        for node_id in select_nodes_by_layer(docgraph, annotation_layer):
            span_node_ids = get_span(docgraph, node_id)
            if span_node_ids:
                start_id, end_id = self.__span2event(span_node_ids)
                event_label = docgraph.node[node_id].get('label', '')
                event = self.E('event',
                               {'start': "T{}".format(start_id),
                                'end': "T{}".format(end_id)},
                               event_label)
                temp_tier.append(event)
        body.append(temp_tier)
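The resulting Exmaralda tier contains one <event> per annotated span. Rebuilt in isolation with made-up IDs (T0/T12 stand in for the timeline points that __span2event produces):

from lxml import etree
from lxml.builder import ElementMaker

E = ElementMaker()
tier = E('tier', {'id': 'TIE0', 'category': 'sentence', 'type': 't',
                  'display-name': '[mmax:sentence]'})
tier.append(E('event', {'start': 'T0', 'end': 'T12'}, 'markable_23'))
print(etree.tostring(tier, pretty_print=True).decode())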
Example #9
    def get_sentences_and_token_nodes(self):
        """
        Returns a list of sentence root node IDs and a list of sentences,
        where each sentence is represented by the list of its token node IDs.
        Both lists will be empty if sentences were not annotated in the original
        MMAX2 data.

        TODO: Refactor this! There's code overlap with
        self.add_annotation_layer(). Ideally, we would always import sentence
        annotations and filter them out in the exporters (e.g. Exmaralda,
        CoNLL), probably by modifying get_pointing_chains().

        Returns
        -------
        sentence_root_nodes : list of str
            a list of all sentence root node IDs, in the order they occur in the
            text
        token_nodes : list of list of str
            a list of lists. each list represents a sentence and contains
            token node IDs (in the order they occur in the text)
        """
        token_nodes = []
        # if sentence annotations were ignored during MMAXDocumentGraph
        # construction, we need to extract sentence/token node IDs manually
        if self.ignore_sentence_annotations:
            mp = self.mmax_project
            layer_dict = mp.annotations['sentence']
            file_id = self.get_file_id(self.name)
            sentence_anno_file = os.path.join(
                mp.project_path, mp.paths['markable'],
                file_id + layer_dict['file_extension'])
            tree = etree.parse(sentence_anno_file)
            root = tree.getroot()
            sentence_root_nodes = []
            for markable in root.iterchildren():
                sentence_root_nodes.append(markable.attrib['id'])

                sentence_token_nodes = []
                for token_id in spanstring2tokens(self,
                                                  markable.attrib['span']):
                    # ignore token IDs that aren't used in the *_words.xml file
                    # NOTE: we only need this filter for broken files in the PCC corpus
                    if token_id in self.tokens:
                        sentence_token_nodes.append(token_id)
                        self.add_node(markable.attrib['id'],
                                      layers={self.ns, self.ns + ':sentence'})
                token_nodes.append(sentence_token_nodes)
        else:
            sentence_root_nodes = list(
                select_nodes_by_layer(self, self.ns + ':sentence'))
            for sent_node in sentence_root_nodes:
                sentence_token_nodes = []
                for token_id in self.get_token_nodes_from_sentence(sent_node):
                    # ignore token IDs that aren't used in the *_words.xml file
                    # NOTE: we only need this filter for broken files in the PCC corpus
                    if token_id in self.tokens:
                        sentence_token_nodes.append(token_id)
                token_nodes.append(sentence_token_nodes)
        return sentence_root_nodes, token_nodes
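Called on an MMAX2 document graph (cf. Example #2), the method pairs each sentence root with the tokens it spans. A minimal sketch, assuming read_mmax2 returns a graph that exposes this method:

mdg = dg.read_mmax2(os.path.join(pcc.path, 'coreference/maz-10374.mmax'))
sentence_roots, sentence_tokens = mdg.get_sentences_and_token_nodes()
for root_id, token_ids in zip(sentence_roots, sentence_tokens):
    print(root_id, len(token_ids))  # sentence root ID and its token count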
Example #10
def test_select_nodes_by_layer():
    """Are nodes correctly filtered based on their layer?"""
    ddg = dg.DiscourseDocumentGraph(namespace='test')
    assert len(ddg) == 1
    add_tokens(ddg, ['The', 'dog', 'barks', '.'])
    assert len(ddg) == 5

    # don't filter any nodes
    all_node_ids = list(dg.select_nodes_by_layer(ddg))
    all_nodes = list(dg.select_nodes_by_layer(ddg, data=True))
    assert len(all_node_ids) == len(all_nodes) == 5

    test_node_ids = list(dg.select_nodes_by_layer(ddg, 'test'))
    test_nodes = list(dg.select_nodes_by_layer(ddg, 'test', data=True))
    assert len(ddg) == len(test_node_ids) == len(test_nodes) == 5

    ddg.add_node(10, layers={'foo'})
    ddg.add_node(11, layers={'bar'})

    # filter several layers
    test_foo_ids = list(dg.select_nodes_by_layer(ddg, layer={'test', 'foo'}))
    test_foo_nodes = list(dg.select_nodes_by_layer(
        ddg, layer={'test', 'foo'}, data=True))
    assert len(test_foo_ids) == len(test_foo_nodes) == 6
    test_foobar_ids = list(dg.select_nodes_by_layer(
        ddg, layer={'test', 'foo', 'bar'}))
    assert len(test_foobar_ids) == 7

    # test if data=True works as expected
    for nodelist in (all_nodes, test_nodes, test_foo_nodes):
        for node_id, attr_dict in nodelist:
            assert isinstance(node_id, (str, int))
            assert isinstance(attr_dict, dict)
Example #11
def test_fix_148():
    """Are all Tiger sentence root nodes part of the 'tiger:syntax' layer?"""
    # 00002: with VROOT, 10374: normal sentence root
    for tiger_doc in ('maz-00002.xml', 'maz-10374.xml'):
        tiger_fpath = os.path.join(pcc.path, 'syntax', tiger_doc)
        tdg = dg.read_tiger(tiger_fpath)
        assert all('tiger:syntax' in tdg.node[node_id]['layers']
                   for node_id in dg.select_nodes_by_layer(
                       tdg, 'tiger:sentence:root'))
Example #12
def gen_anaphoricity_str(docgraph, anaphora='es'):
    """
    Returns the document text as a string in which each annotated
    'es'/'das' token is marked with its anaphoricity annotation
    (with a trailing '?' if the annotation is not fully certain).
    """
    assert anaphora in ('das', 'es')
    ret_str = u''
    annotated_token_ids = [
        tok_id for tok_id
        in dg.select_nodes_by_layer(docgraph, docgraph.ns+':annotated')
        if docgraph.get_token(tok_id).lower() == anaphora]
    for token_id in docgraph.tokens:
        if token_id in annotated_token_ids:
            # only annotations made with full certainty go unmarked
            certainty = docgraph.node[token_id].get(docgraph.ns+':certainty')
            certainty_str = '' if certainty == '1.0' else '?'
            ret_str += u'{0}/{1}{2} '.format(
                docgraph.get_token(token_id),
                ANNOTATIONS[docgraph.node[token_id][docgraph.ns+':annotation']],
                certainty_str)
        else:
            ret_str += u'{} '.format(docgraph.get_token(token_id))
    return ret_str
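The output format is easiest to see with toy data; everything below (the ANNOTATIONS mapping, annotation labels, certainty values) is hypothetical and only mirrors the string-building logic above:

ANNOTATIONS = {'abstract_entity': 'a', 'pleonastic': 'p'}  # stand-in mapping
token_data = [('Es', 'pleonastic', '1.0'), ('regnet', None, None),
              ('.', None, None)]
out = u''
for word, anno, certainty in token_data:
    if anno is not None:
        out += u'{0}/{1}{2} '.format(word, ANNOTATIONS[anno],
                                     '' if certainty == '1.0' else '?')
    else:
        out += u'{} '.format(word)
print(out)  # -> Es/p regnet .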
Example #13
    def __build_markable_token_mapper(self, coreference_layer=None,
                                      markable_layer=None):
        """
        Creates mappings from tokens to the markable spans they belong to
        and the coreference chains these markables are part of.

        Returns
        -------
        tok2markables : dict (str -> set of str)
            Maps from a token (node ID) to all the markables (node IDs)
            it is part of.
        markable2toks : dict (str -> list of str)
            Maps from a markable (node ID) to all the tokens (node IDs)
            that belong to it.
        markable2chains : dict (str -> list of int)
            Maps from a markable (node ID) to all the chains (chain ID) it
            belongs to.
        """
        tok2markables = defaultdict(set)
        markable2toks = defaultdict(list)
        markable2chains = defaultdict(list)

        coreference_chains = get_pointing_chains(self.docgraph,
                                                 layer=coreference_layer)
        for chain_id, chain in enumerate(coreference_chains):
            for markable_node_id in chain:
                markable2chains[markable_node_id].append(chain_id)

        # ID of the first singleton (if there are any)
        singleton_id = len(coreference_chains)

        # markable2toks/tok2markables should contain all markables, not only
        # those that are part of a coreference chain
        for markable_node_id in select_nodes_by_layer(self.docgraph,
                                                      markable_layer):
            span = get_span(self.docgraph, markable_node_id)
            markable2toks[markable_node_id] = span
            for token_node_id in span:
                tok2markables[token_node_id].add(markable_node_id)

            # singletons each represent their own chain (with only one element)
            if markable_node_id not in markable2chains:
                markable2chains[markable_node_id] = [singleton_id]
                singleton_id += 1

        return tok2markables, markable2toks, markable2chains
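The singleton handling is easiest to see in isolation: IDs 0..n-1 are taken by the real coreference chains, and every markable outside them gets the next free chain ID. A self-contained sketch of just that numbering scheme:

from collections import defaultdict

chains = [['m1', 'm2'], ['m4', 'm5']]  # two coreference chains -> IDs 0 and 1
markables = ['m1', 'm2', 'm3', 'm4', 'm5', 'm6']

markable2chains = defaultdict(list)
for chain_id, chain in enumerate(chains):
    for markable in chain:
        markable2chains[markable].append(chain_id)

singleton_id = len(chains)  # first unused chain ID
for markable in markables:
    if markable not in markable2chains:
        markable2chains[markable] = [singleton_id]
        singleton_id += 1

print(dict(markable2chains))
# {'m1': [0], 'm2': [0], 'm3': [2], 'm4': [1], 'm5': [1], 'm6': [3]}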