Exemple #1
0
def load_training_as_dataframe():
    """Load training section of the RST-WSJ corpus as a pandas.DataFrame.

    Every tree of the corpus is first passed through `REL_CONV`.  Each
    internal node (an `RSTTree` with more than one kid) whose kids bear
    exactly one relation other than 'span' then contributes one row.
    Nodes whose kids bear several distinct non-'span' relations are
    skipped, and so are nodes where no kid bears a non-'span' relation
    (there is no relation to record for them).

    Returns
    -------
    df: pandas.DataFrame
        DataFrame of all instances of relations in the training section.
        Columns are 'rel', 'arity', 'nuc_sig', 'height', plus the
        derived 'unuc_sig' ("undirected" nuclearity, where 'NS' and
        'SN' are both mapped to 'NS').
    """
    rst_phrases = []  # list of rows, each represented as a dict

    def is_internal_node(tree):
        """True for RSTTree nodes that have more than one kid."""
        return isinstance(tree, RSTTree) and len(tree) > 1

    rst_corpus = RstReader(CD_TRAIN).slurp()
    # iterate in sorted order so that row order is reproducible
    for _, rtree_ref in sorted(rst_corpus.items()):
        # convert labels to coarse
        coarse_rtree_ref = REL_CONV(rtree_ref)
        for subtree in coarse_rtree_ref.subtrees(filter=is_internal_node):
            kid_nodes = [treenode(kid) for kid in subtree]
            # distinct relations borne by the kids, 'span' excluded
            rels = set(kid_node.rel for kid_node in kid_nodes)
            rels.discard('span')
            if len(rels) != 1:
                # heterogeneous node (several relations), or no
                # relation at all: nothing unambiguous to record
                continue

            # nuclearity signature: one letter per kid, then anything
            # other than 'SN' or 'NS' is folded into 'NN'
            nuc_sig = ''.join(
                'S' if kid_node.nuclearity == 'Satellite' else 'N'
                for kid_node in kid_nodes)
            if nuc_sig not in ('SN', 'NS'):
                nuc_sig = 'NN'
            # TODO len(kid_nodes) - 1 is the nb of bin rels
            # TODO disc relations of the grandchildren

            rst_phrases.append({
                'rel': rels.pop(),
                'arity': len(subtree),  # number of kids
                'nuc_sig': nuc_sig,
                'height': subtree.height(),
            })

    # turn into a DataFrame
    df = pd.DataFrame(rst_phrases)
    # add calculated columns
    # * "undirected" nuclearity, e.g. NS == SN
    # A concrete list is assigned (not the lazy iterator that `map`
    # returns on Python 3); building it from the rows also works when
    # the corpus yields no row at all.
    df['unuc_sig'] = ['NS' if row['nuc_sig'] in ('NS', 'SN') else 'NN'
                      for row in rst_phrases]
    return df
Exemple #2
0
def load_training_as_dataframe():
    """Load training section of the RST-WSJ corpus as a pandas.DataFrame.

    Every tree of the corpus is first passed through `REL_CONV`.  Each
    internal node (an `RSTTree` with more than one kid) whose kids bear
    exactly one relation other than 'span' then contributes one row.
    Nodes whose kids bear several distinct non-'span' relations are
    skipped, and so are nodes where no kid bears a non-'span' relation
    (there is no relation to record for them).

    Returns
    -------
    df: pandas.DataFrame
        DataFrame of all instances of relations in the training section.
        Columns are 'rel', 'arity', 'nuc_sig', 'height', plus the
        derived 'unuc_sig' ("undirected" nuclearity, where 'NS' and
        'SN' are both mapped to 'NS').
    """
    rst_phrases = []  # list of rows, each represented as a dict

    def is_internal_node(tree):
        """True for RSTTree nodes that have more than one kid."""
        return isinstance(tree, RSTTree) and len(tree) > 1

    rst_corpus = RstReader(CD_TRAIN).slurp()
    # iterate in sorted order so that row order is reproducible
    for _, rtree_ref in sorted(rst_corpus.items()):
        # convert labels to coarse
        coarse_rtree_ref = REL_CONV(rtree_ref)
        for subtree in coarse_rtree_ref.subtrees(filter=is_internal_node):
            kid_nodes = [treenode(kid) for kid in subtree]
            # distinct relations borne by the kids, 'span' excluded
            rels = set(kid_node.rel for kid_node in kid_nodes)
            rels.discard('span')
            if len(rels) != 1:
                # heterogeneous node (several relations), or no
                # relation at all: nothing unambiguous to record
                continue

            # nuclearity signature: one letter per kid, then anything
            # other than 'SN' or 'NS' is folded into 'NN'
            nuc_sig = ''.join(
                'S' if kid_node.nuclearity == 'Satellite' else 'N'
                for kid_node in kid_nodes)
            if nuc_sig not in ('SN', 'NS'):
                nuc_sig = 'NN'
            # TODO len(kid_nodes) - 1 is the nb of bin rels
            # TODO disc relations of the grandchildren

            rst_phrases.append({
                'rel': rels.pop(),
                'arity': len(subtree),  # number of kids
                'nuc_sig': nuc_sig,
                'height': subtree.height(),
            })

    # turn into a DataFrame
    df = pd.DataFrame(rst_phrases)
    # add calculated columns
    # * "undirected" nuclearity, e.g. NS == SN
    # A concrete list is assigned (not the lazy iterator that `map`
    # returns on Python 3); building it from the rows also works when
    # the corpus yields no row at all.
    df['unuc_sig'] = ['NS' if row['nuc_sig'] in ('NS', 'SN') else 'NN'
                      for row in rst_phrases]
    return df
Exemple #3
0
def simplify_deptree(dtree):
    """
    Flatten a dependency tree into a dict mapping (edu, edu) to rel.

    For every element of `dtree`, each of its children yields one entry
    keyed by (EDU of the element, EDU of the child) and valued by the
    relation found on the child's node.  A later entry with the same
    key overwrites an earlier one.
    """
    edge_labels = {}
    for parent in dtree:
        head_edu = treenode(parent).edu
        # lazily fetch the node of each child, in order
        kid_nodes = (treenode(kid) for kid in parent)
        edge_labels.update(
            ((head_edu, kid_node.edu), kid_node.rel)
            for kid_node in kid_nodes)
    return edge_labels
Exemple #4
0
    def decode(self, doc_key):
        """Build the DocumentPlus of a (gold) document from the RST-DT.

        Parameters
        ----------
        doc_key: string ?
            Identifier of the document to decode; used as key into
            `self.corpus`.

        Returns
        -------
        doc: DocumentPlus
            Document carrying the structures derived from its RST tree:
            `orig_rsttree` (RST tree, possibly rewritten), `rsttree`
            (SimpleRSTTree), `deptree` (RstDepTree) and `edus` (the
            EDUs of the dependency tree).
        """
        # the document name serves as grouping
        doc_name = os.path.basename(id_to_path(doc_key))
        # everything is derived from the RST tree of the corpus,
        # starting with the context held by its root node
        gold_tree = self.corpus[doc_key]
        context = treenode(gold_tree).context
        doc = DocumentPlus(doc_key, doc_name, context)

        # TODO get EDUs here (from the leaves of the RST tree) rather
        # than from the dep tree below

        # optional rewritings of the RST tree, in this order:
        # pseudo-relations first, then relation labels
        if self.fix_pseudo_rels:
            gold_tree = rewrite_pseudo_rels(doc_key, gold_tree)
        if self.rel_conv is not None:
            gold_tree = self.rel_conv(gold_tree)
        doc.orig_rsttree = gold_tree

        # TO DEPRECATE - SimpleRSTTree (binary tree) layer
        simple_tree = SimpleRSTTree.from_rst_tree(gold_tree)
        if self.nuc_in_label:
            # WIP nuclearity becomes part of the label
            simple_tree = SimpleRSTTree.incorporate_nuclearity_into_label(
                simple_tree)
        doc.rsttree = simple_tree
        # end TO DEPRECATE

        # dependency tree, built directly from the RST tree
        doc.deptree = RstDepTree.from_rst_tree(
            gold_tree, nary_enc=self.nary_enc)

        # EDUs are taken from the dep tree (bad)
        # TODO: get EDUs from the leaves of the RST tree and let
        # document_plus do the left padding
        doc.edus = doc.deptree.edus

        return doc
Exemple #5
0
def get_syntactic_labels(edu_info):
    """Syntactic labels for this EDU.

    Returns None when `edu_info` has no 'ptree' entry.  Otherwise, for
    each leaf of the syntactic tree that overlaps the EDU, collect the
    label of each of its ancestors, from its parent upwards, stopping
    at the lowest common parent of these leaves, whose label is
    prefixed with 'top_'.
    """
    try:
        syn_tree = edu_info['ptree']
    except KeyError:
        return None

    this_edu = edu_info['edu']

    # tree positions of the syntactic leaves that are in the EDU
    leaf_positions = []
    for leaf_pos in syn_tree.treepositions('leaves'):
        if syn_tree[leaf_pos].overlaps(this_edu):
            leaf_positions.append(leaf_pos)

    top_pos = lowest_common_parent(leaf_positions)

    labels = []
    for leaf_pos in leaf_positions:
        # climb from the parent of the leaf towards the root
        depth = len(leaf_pos) - 1
        while depth >= 0:
            anc_pos = leaf_pos[:depth]
            anc_lbl = treenode(syn_tree[anc_pos])
            if anc_pos == top_pos:
                # reached the lowest common parent: mark it and stop
                labels.append('top_' + anc_lbl)
                break
            labels.append(anc_lbl)
            depth -= 1
    return labels
Exemple #6
0
def get_syntactic_labels(edu_info):
    """Syntactic labels for this EDU.

    Returns None when `edu_info` has no 'ptree' entry.  Otherwise
    returns the list of labels gathered by walking, for each syntactic
    leaf that overlaps the EDU, from the parent of the leaf up to the
    lowest common parent of all these leaves; the label of the latter
    is prefixed with 'top_'.
    """
    try:
        syn_tree = edu_info['ptree']
    except KeyError:
        return None

    this_edu = edu_info['edu']

    # positions of the leaves of the syntactic tree inside the EDU
    covered = [pos for pos in syn_tree.treepositions('leaves')
               if syn_tree[pos].overlaps(this_edu)]
    common = lowest_common_parent(covered)

    labels = []
    for leaf_pos in covered:
        # prefixes of the leaf position, longest (parent) first
        for cut in range(len(leaf_pos) - 1, -1, -1):
            anc_pos = leaf_pos[:cut]
            anc_lbl = treenode(syn_tree[anc_pos])
            at_top = (anc_pos == common)
            labels.append('top_' + anc_lbl if at_top else anc_lbl)
            if at_top:
                # do not go above the lowest common parent
                break
    return labels
Exemple #7
0
def dump(corpus, odir):
    """
    Print the text of every paragraph of every RST tree in the corpus.

    Each paragraph is printed on its own, preceded by "PARA".
    NOTE(review): `odir` is currently unused, nothing is written to
    disk.
    """
    for doc_key in corpus:
        # paragraphs and text are held by the context of the root node
        rst_ctx = treenode(corpus[doc_key]).context
        for paragraph in rst_ctx.paragraphs:
            para_span = paragraph.text_span()
            print("PARA", rst_ctx.text(para_span))
Exemple #8
0
def dump(corpus, odir):
    """
    Print, for each RST tree in the corpus, the text of its paragraphs.

    Every paragraph gives one `print` call whose first argument is
    "PARA".  NOTE(review): `odir` is not used by this function, no file
    is created.
    """
    for key in corpus:
        root_node = treenode(corpus[key])
        context = root_node.context
        for para in context.paragraphs:
            para_text = context.text(para.text_span())
            print("PARA", para_text)
Exemple #9
0
 def convert_tree(self, rst_tree):
     """Relabel the relations of rst_tree, in place, using the mapping.

     Every position of the tree that holds a Tree has the `rel` of its
     node replaced by `self.convert_label(rel)`.  The same (modified)
     rst_tree object is returned.
     """
     relabel = self.convert_label
     for tpos in rst_tree.treepositions():
         subtree = rst_tree[tpos]
         if not isinstance(subtree, Tree):
             # not a Tree: nothing to relabel at this position
             continue
         label = treenode(subtree)
         label.rel = relabel(label.rel)
     return rst_tree
Exemple #10
0
 def convert_tree(self, rst_tree):
     """Apply the label mapping to each relation of rst_tree, in place.

     The node of every Tree found at a position of rst_tree gets its
     `rel` replaced by the result of `self.convert_label` on the old
     value.  Returns rst_tree itself.
     """
     to_new_rel = self.convert_label
     # lazily walk through whatever sits at each tree position
     items = (rst_tree[tpos] for tpos in rst_tree.treepositions())
     for item in items:
         if isinstance(item, Tree):
             item_node = treenode(item)
             old_rel = item_node.rel
             item_node.rel = to_new_rel(old_rel)
     return rst_tree
Exemple #11
0
    def decode(self, doc_key):
        """Decode a document from the RST-DT (gold)

        Parameters
        ----------
        doc_key : string ?
            Identifier of the document to decode; used as key into
            `self.corpus`.

        Returns
        -------
        doc : DocumentPlus
            Document on which `orig_rsttree`, `rsttree` (SimpleRSTTree),
            `deptree` (RstDepTree) and `edus` have been set.

        Notes
        -----
        Nuclearity is incorporated into the labels of the SimpleRSTTree
        only if this object has a truthy `nuc_in_label` attribute.  The
        attribute is optional: when it is absent, the labels are left
        untouched.
        """
        # create a DocumentPlus
        # grouping is the document name
        grouping = os.path.basename(id_to_path(doc_key))
        # the RST tree is currently pivotal to get all the layers of info,
        # including the context attached to its root node
        orig_rsttree = self.corpus[doc_key]
        rst_context = treenode(orig_rsttree).context
        doc = DocumentPlus(doc_key, grouping, rst_context)

        # TODO get EDUs here rather than below (see dep tree)

        # attach original RST tree
        # convert relation labels if needed
        if self.rel_conv is not None:
            orig_rsttree = self.rel_conv(orig_rsttree)
        doc.orig_rsttree = orig_rsttree

        # convert to binary tree
        rsttree = SimpleRSTTree.from_rst_tree(orig_rsttree)
        # incorporate nuclearity into label, on demand; read with a
        # default so that objects without the attribute keep working
        if getattr(self, 'nuc_in_label', False):
            rsttree = SimpleRSTTree.incorporate_nuclearity_into_label(rsttree)
        doc.rsttree = rsttree

        # convert to dep tree
        deptree = RstDepTree.from_simple_rst_tree(rsttree)
        doc.deptree = deptree

        # get EDUs (bad)
        # TODO: get EDUs from orig_rsttree.leaves() and let
        # document_plus do the left padding
        doc.edus = doc.deptree.edus

        return doc
Exemple #12
0
    def decode(self, doc_key):
        """Decode a document from the RST-DT (gold)

        Parameters
        ----------
        doc_key : string ?
            Identifier of the document to decode; used as key into
            `self.corpus`.

        Returns
        -------
        doc : DocumentPlus
            Document on which `orig_rsttree`, `rsttree` (SimpleRSTTree),
            `deptree` (RstDepTree) and `edus` have been set.

        Notes
        -----
        Nuclearity is incorporated into the labels of the SimpleRSTTree
        only if this object has a truthy `nuc_in_label` attribute.  The
        attribute is optional: when it is absent, the labels are left
        untouched.
        """
        # create a DocumentPlus
        # grouping is the document name
        grouping = os.path.basename(id_to_path(doc_key))
        # the RST tree is currently pivotal to get all the layers of info,
        # including the context attached to its root node
        orig_rsttree = self.corpus[doc_key]
        rst_context = treenode(orig_rsttree).context
        doc = DocumentPlus(doc_key, grouping, rst_context)

        # TODO get EDUs here rather than below (see dep tree)

        # attach original RST tree
        # convert relation labels if needed
        if self.rel_conv is not None:
            orig_rsttree = self.rel_conv(orig_rsttree)
        doc.orig_rsttree = orig_rsttree

        # convert to binary tree
        rsttree = SimpleRSTTree.from_rst_tree(orig_rsttree)
        # incorporate nuclearity into label, on demand; read with a
        # default so that objects without the attribute keep working
        if getattr(self, 'nuc_in_label', False):
            rsttree = SimpleRSTTree.incorporate_nuclearity_into_label(rsttree)
        doc.rsttree = rsttree

        # convert to dep tree
        deptree = RstDepTree.from_simple_rst_tree(rsttree)
        doc.deptree = deptree

        # get EDUs (bad)
        # TODO: get EDUs from orig_rsttree.leaves() and let
        # document_plus do the left padding
        doc.edus = doc.deptree.edus

        return doc
Exemple #13
0
    def walk(ancestor, subtree):
        """
        Recursive descent/ascent step of the conversion algorithm.

        Three layers of the dependency tree are in play at once:


                     r0       r1
            ancestor --> src +--> tgt1
                             |
                             |r2
                             +--> tgt2
                             |
                             ..
                             |
                             |rN
                             +--> tgtN

        A leaf tree is first created for src (the head of `subtree`).
        Its dependents are then folded in, one after the other: each
        recursive call returns a tree covering src and that dependent,
        which becomes the new src.  Only then is the resulting tree
        connected to `ancestor`.

        Parameters
        ----------
        ancestor : SimpleRSTTree
            SimpleRSTTree of the ancestor; falsy (None) on the first
            call.

        subtree : int
            Index of the head of the subtree

        Returns
        -------
        res : SimpleRSTTree
            SimpleRSTTree covering ancestor and subtree, or the tree of
            the subtree alone if there is no ancestor.

        Raises
        ------
        RstDtException
            If the text spans of ancestor and subtree overlap.
        """
        # tree leaf for the head EDU
        head_edu = dtree.edus[subtree]
        leaf_node = Node("leaf", (head_edu.num, head_edu.num),
                         head_edu.text_span(), "leaf")
        src = SimpleRSTTree(leaf_node, [head_edu])

        # fold (rather than map) over the dependents: the tree built so
        # far is threaded along from sibling to sibling
        for dep in dtree.deps(subtree):
            src = walk(src, dep)

        if not ancestor:
            # first call: no ancestor, subtree is the index of the
            # (presumably unique) real root
            return src

        # connect ancestor with src
        anc_node = treenode(ancestor)
        src_node = treenode(src)
        rel = dtree.labels[subtree]
        nuc = dtree.nucs[subtree]
        if anc_node.span.overlaps(src_node.span):
            raise RstDtException("Span %s overlaps with %s " %
                                 (anc_node.span, src_node.span))

        # order the two kids by span; the ancestor side always gets
        # NUC_N, the src side gets the nuclearity stored for subtree
        if anc_node.span <= src_node.span:
            left, right = ancestor, src
            kid_nucs = [NUC_N, nuc]
        else:
            left, right = src, ancestor
            kid_nucs = [nuc, NUC_N]
        # nuc in SimpleRSTTree is the concatenation of the initial
        # letter of each kid's nuclearity, eg. {NS, SN, NN}
        nuc_sig = ''.join(kid_nuc[0] for kid_nuc in kid_nucs)

        # the parent node spans the EDUs and the text of both kids
        left_edus = treenode(left).edu_span
        right_edus = treenode(right).edu_span
        first_edu = min(left_edus[0], right_edus[0])
        last_edu = max(left_edus[1], right_edus[1])
        merged_span = anc_node.span.merge(src_node.span)
        parent_node = Node(nuc_sig, (first_edu, last_edu), merged_span, rel)
        return SimpleRSTTree(parent_node, [left, right])
Exemple #14
0
    def decode(self, doc_key):
        """Build the DocumentPlus of a (gold) document from the RST-DT.

        Parameters
        ----------
        doc_key : string ?
            Identifier of the document to decode; used as key into
            `self.corpus`.

        Returns
        -------
        doc : DocumentPlus
            Document carrying the structures derived from its RST tree:
            `orig_rsttree` (RST tree, possibly relabelled), `rsttree`
            (SimpleRSTTree), `deptree` (RstDepTree) and `edus` (the
            EDUs of the dependency tree).
        """
        # the document name serves as grouping
        doc_name = os.path.basename(id_to_path(doc_key))
        # everything is derived from the RST tree of the corpus,
        # starting with the context held by its root node
        gold_tree = self.corpus[doc_key]
        context = treenode(gold_tree).context
        doc = DocumentPlus(doc_key, doc_name, context)

        # TODO get EDUs here (from the leaves of the RST tree) rather
        # than from the dep tree below

        # relation labels are converted only if a converter is set
        if self.rel_conv is not None:
            gold_tree = self.rel_conv(gold_tree)
        doc.orig_rsttree = gold_tree

        # TO DEPRECATE - SimpleRSTTree (binary tree) layer
        simple_tree = SimpleRSTTree.from_rst_tree(gold_tree)
        if self.nuc_in_label:
            # WIP nuclearity becomes part of the label
            simple_tree = SimpleRSTTree.incorporate_nuclearity_into_label(
                simple_tree)
        doc.rsttree = simple_tree
        # end TO DEPRECATE

        # WIP dependency tree: with the 'chain' conversion the RST tree
        # is binarized beforehand, otherwise it is used as is
        # (the legacy route went through the SimpleRSTTree instead)
        if self.nary_conv == 'chain':
            dep_source = _binarize(gold_tree)
        else:
            dep_source = gold_tree
        doc.deptree = RstDepTree.from_rst_tree(dep_source)
        # end WIP

        # EDUs are taken from the dep tree (bad)
        # TODO: get EDUs from the leaves of the RST tree and let
        # document_plus do the left padding
        doc.edus = doc.deptree.edus

        return doc