def load_training_as_dataframe():
    """Load training section of the RST-WSJ corpus as a pandas.DataFrame.

    Every internal node (more than one kid) of every coarse-grained
    training tree whose kids bear at most one relation other than
    'span' yields one row.  Nodes whose kids bear several distinct
    non-'span' relations are skipped.

    Returns
    -------
    df: pandas.DataFrame
        DataFrame of all instances of relations in the training section.
        Columns are 'rel', 'arity', 'nuc_sig', 'height' and the derived
        'unuc_sig' ("undirected" nuclearity: 'NS' stands for both 'NS'
        and 'SN').
    """
    # built once rather than at every node
    directed_sigs = frozenset(['SN', 'NS'])

    def is_internal(tree):
        """True for RSTTree nodes that have more than one kid."""
        return isinstance(tree, RSTTree) and len(tree) > 1

    rst_phrases = []  # list of rows, each represented as a dict
    rst_corpus = RstReader(CD_TRAIN).slurp()

    for _, rtree_ref in sorted(rst_corpus.items()):
        # convert labels to coarse
        coarse_rtree_ref = REL_CONV(rtree_ref)
        for su_subtree in coarse_rtree_ref.subtrees(filter=is_internal):
            kid_nodes = [treenode(kid) for kid in su_subtree]
            # distinct relations borne by the kids, 'span' excluded
            rels = [r for r in set(kid.rel for kid in kid_nodes)
                    if r != 'span']
            if len(rels) > 1:
                # heterogeneous node: kids have different relations
                continue
            # nuclearity signature: one letter per kid; anything other
            # than 'SN' or 'NS' is stored as 'NN'
            nuc_sig = ''.join('S' if kid.nuclearity == 'Satellite' else 'N'
                              for kid in kid_nodes)
            rst_phrases.append({
                # NOTE: rels[0] raises IndexError if every kid is 'span'
                'rel': rels[0],
                'arity': len(su_subtree),  # number of kids
                'nuc_sig': nuc_sig if nuc_sig in directed_sigs else 'NN',
                'height': su_subtree.height(),
            })

    # turn into a DataFrame
    df = pd.DataFrame(rst_phrases)
    # add calculated columns
    # * "undirected" nuclearity, e.g. NS == SN
    # A list is built explicitly: under Python 3 `map` returns a lazy
    # iterator, not the list of values the column assignment needs.
    df['unuc_sig'] = ['NS' if nuc_sig in ('NS', 'SN') else 'NN'
                      for nuc_sig in df.nuc_sig]
    return df
def load_training_as_dataframe():
    """Load training section of the RST-WSJ corpus as a pandas.DataFrame.

    Every internal node (more than one kid) of every coarse-grained
    training tree whose kids bear at most one relation other than
    'span' yields one row.  Nodes whose kids bear several distinct
    non-'span' relations are skipped.

    Returns
    -------
    df: pandas.DataFrame
        DataFrame of all instances of relations in the training section.
        Columns are 'rel', 'arity', 'nuc_sig', 'height' and the derived
        'unuc_sig' ("undirected" nuclearity: 'NS' stands for both 'NS'
        and 'SN').
    """
    # built once rather than at every node
    directed_sigs = frozenset(['SN', 'NS'])

    def is_internal(tree):
        """True for RSTTree nodes that have more than one kid."""
        return isinstance(tree, RSTTree) and len(tree) > 1

    rst_phrases = []  # list of rows, each represented as a dict
    rst_corpus = RstReader(CD_TRAIN).slurp()

    for _, rtree_ref in sorted(rst_corpus.items()):
        # convert labels to coarse
        coarse_rtree_ref = REL_CONV(rtree_ref)
        for su_subtree in coarse_rtree_ref.subtrees(filter=is_internal):
            kid_nodes = [treenode(kid) for kid in su_subtree]
            # distinct relations borne by the kids, 'span' excluded
            rels = [r for r in set(kid.rel for kid in kid_nodes)
                    if r != 'span']
            if len(rels) > 1:
                # heterogeneous node: kids have different relations
                continue
            # nuclearity signature: one letter per kid; anything other
            # than 'SN' or 'NS' is stored as 'NN'
            nuc_sig = ''.join('S' if kid.nuclearity == 'Satellite' else 'N'
                              for kid in kid_nodes)
            rst_phrases.append({
                # NOTE: rels[0] raises IndexError if every kid is 'span'
                'rel': rels[0],
                'arity': len(su_subtree),  # number of kids
                'nuc_sig': nuc_sig if nuc_sig in directed_sigs else 'NN',
                'height': su_subtree.height(),
            })

    # turn into a DataFrame
    df = pd.DataFrame(rst_phrases)
    # add calculated columns
    # * "undirected" nuclearity, e.g. NS == SN
    # A list is built explicitly: under Python 3 `map` returns a lazy
    # iterator, not the list of values the column assignment needs.
    df['unuc_sig'] = ['NS' if nuc_sig in ('NS', 'SN') else 'NN'
                      for nuc_sig in df.nuc_sig]
    return df
def simplify_deptree(dtree):
    """
    Flatten a dependency tree into a dict mapping each
    (head edu, dependent edu) pair to the dependent's relation label.
    """
    edge_labels = {}
    for head in dtree:
        # the head's EDU is looked up even when it has no dependents
        head_edu = treenode(head).edu
        dep_nodes = (treenode(dep) for dep in head)
        edge_labels.update(
            ((head_edu, dep_node.edu), dep_node.rel)
            for dep_node in dep_nodes
        )
    return edge_labels
def decode(self, doc_key):
    """Decode a document from the RST-DT (gold)

    Parameters
    ----------
    doc_key: string ?
        Identifier (in corpus) of the document we want to decode.

    Returns
    -------
    doc: DocumentPlus
        The document with its RST context, plus the attributes set
        here: `orig_rsttree`, `rsttree` (SimpleRSTTree), `deptree`
        (RstDepTree) and `edus`.
    """
    # the document name serves as grouping
    grouping = os.path.basename(id_to_path(doc_key))
    # everything else is derived from the gold RST tree; its label
    # carries the RSTContext (document text and structure)
    gold_tree = self.corpus[doc_key]
    doc = DocumentPlus(doc_key, grouping, treenode(gold_tree).context)
    # TODO get EDUs from gold_tree.leaves() here rather than from the
    # dependency tree at the end

    # optional rewrites of the gold tree, applied in this order:
    # pseudo-relations first, relation labels second
    if self.fix_pseudo_rels:
        gold_tree = rewrite_pseudo_rels(doc_key, gold_tree)
    label_converter = self.rel_conv
    if label_converter is not None:
        gold_tree = label_converter(gold_tree)
    doc.orig_rsttree = gold_tree

    # TO DEPRECATE - binary SimpleRSTTree layer
    simple_tree = SimpleRSTTree.from_rst_tree(gold_tree)
    if self.nuc_in_label:
        # WIP fold nuclearity into the relation label
        simple_tree = SimpleRSTTree.incorporate_nuclearity_into_label(
            simple_tree)
    doc.rsttree = simple_tree
    # end TO DEPRECATE

    # dependency tree, built directly from the (rewritten) gold tree
    doc.deptree = RstDepTree.from_rst_tree(gold_tree,
                                           nary_enc=self.nary_enc)
    # EDUs are read back from the dependency tree (bad)
    # TODO: let document_plus do the left padding
    doc.edus = doc.deptree.edus
    return doc
def get_syntactic_labels(edu_info):
    """Syntactic labels for this EDU.

    Returns None when `edu_info` has no 'ptree' entry.  Otherwise
    returns, for each syntactic leaf overlapping the EDU, the labels of
    its ancestors from the closest one upwards; the climb stops at the
    lowest common parent of those leaves, whose label is prefixed with
    'top_'.
    """
    try:
        ptree = edu_info['ptree']
    except KeyError:
        return None
    edu = edu_info['edu']

    # tree positions of the syntactic leaves that overlap the EDU
    leaf_positions = []
    for leaf_pos in ptree.treepositions('leaves'):
        if ptree[leaf_pos].overlaps(edu):
            leaf_positions.append(leaf_pos)

    top_pos = lowest_common_parent(leaf_positions)

    labels = []
    for leaf_pos in leaf_positions:
        # walk up from the leaf's parent towards the root
        depth = len(leaf_pos) - 1
        while depth >= 0:
            anc_pos = leaf_pos[:depth]
            anc_label = treenode(ptree[anc_pos])
            if anc_pos == top_pos:
                labels.append('top_' + anc_label)
                break
            labels.append(anc_label)
            depth -= 1
    return labels
def get_syntactic_labels(edu_info):
    """Syntactic labels for this EDU.

    Returns None when `edu_info` has no 'ptree' entry.  Otherwise
    returns, for each syntactic leaf overlapping the EDU, the labels of
    its ancestors from the closest one upwards; the climb stops at the
    lowest common parent of those leaves, whose label is prefixed with
    'top_'.
    """
    try:
        ptree = edu_info['ptree']
    except KeyError:
        return None
    edu = edu_info['edu']

    # keep only the leaf positions whose token overlaps the EDU
    edu_leaf_positions = [
        pos for pos in ptree.treepositions('leaves')
        if ptree[pos].overlaps(edu)
    ]
    common_parent_pos = lowest_common_parent(edu_leaf_positions)

    collected = []
    for pos in edu_leaf_positions:
        # prefixes of the leaf position, longest (closest ancestor) first
        for prefix_len in range(len(pos) - 1, -1, -1):
            ancestor_pos = pos[:prefix_len]
            label = treenode(ptree[ancestor_pos])
            reached_top = ancestor_pos == common_parent_pos
            collected.append('top_' + label if reached_top else label)
            if reached_top:
                break
    return collected
def dump(corpus, odir):
    """
    Print the text of every paragraph of every RST tree in the corpus.

    Each paragraph is printed preceded by the marker "PARA".
    `odir` is accepted but not used by the current implementation.
    """
    for key in corpus:
        context = treenode(corpus[key]).context
        for paragraph in context.paragraphs:
            para_span = paragraph.text_span()
            print("PARA", context.text(para_span))
def convert_tree(self, rst_tree):
    """Relabel, in place, the relation of every Tree node of rst_tree.

    Each relation is passed through `self.convert_label`; positions
    that do not hold a Tree are left untouched.  The same (mutated)
    tree object is returned.
    """
    relabel = self.convert_label
    for position in rst_tree.treepositions():
        subtree = rst_tree[position]
        if not isinstance(subtree, Tree):
            continue
        label_node = treenode(subtree)
        # overwrite the old relation with its converted counterpart
        label_node.rel = relabel(label_node.rel)
    return rst_tree
def decode(self, doc_key):
    """Decode a document from the RST-DT (gold)

    Parameters
    ----------
    doc_key: string ?
        Identifier (in corpus) of the document we want to decode.

    Returns
    -------
    doc: DocumentPlus
        The document with its RST context, plus the attributes set
        here: `orig_rsttree`, `rsttree` (SimpleRSTTree), `deptree`
        (RstDepTree) and `edus`.
    """
    # create a DocumentPlus
    # grouping is the document name
    grouping = os.path.basename(id_to_path(doc_key))
    # the RST tree is currently pivotal to get all the layers of info,
    # including the RSTContext that contains the document text and
    # structure (paragraphs + poorly segmented sentences)
    orig_rsttree = self.corpus[doc_key]
    rst_context = treenode(orig_rsttree).context
    # finally...
    doc = DocumentPlus(doc_key, grouping, rst_context)
    # TODO get EDUs here rather than below (see dep tree)

    # attach original RST tree
    # convert relation labels if needed
    if self.rel_conv is not None:
        orig_rsttree = self.rel_conv(orig_rsttree)
    doc.orig_rsttree = orig_rsttree

    # convert to binary tree
    rsttree = SimpleRSTTree.from_rst_tree(orig_rsttree)
    # incorporate nuclearity into label, on request only: the switch is
    # the optional instance attribute `nuc_in_label`; when the attribute
    # is absent the step is skipped, as it always was
    if getattr(self, 'nuc_in_label', False):
        rsttree = SimpleRSTTree.incorporate_nuclearity_into_label(rsttree)
    doc.rsttree = rsttree

    # convert to dep tree
    deptree = RstDepTree.from_simple_rst_tree(rsttree)
    doc.deptree = deptree

    # get EDUs (bad)
    # TODO: get EDUs from orig_rsttree.leaves() and let
    # document_plus do the left padding
    doc.edus = doc.deptree.edus
    return doc
def walk(ancestor, subtree):
    """
    Recursive descent/ascent step of the dependency-to-RST conversion.

    Three layers of the dependency tree are in view at once::

                   r0       r1
        ancestor --> src +--> tgt1
                         |
                         |r2
                         +--> tgt2
                         |
                         ..
                         |
                         |rN
                         +--> tgtN

    A leaf tree is first created for src.  Each dependent of src is
    then folded into it in turn (the tree built so far is threaded from
    sibling to sibling), and only afterwards is the result connected to
    the ancestor.  `dtree` is not a parameter: it comes from the
    enclosing scope.

    Parameters
    ----------
    ancestor : SimpleRSTTree
        SimpleRSTTree of the ancestor; falsy (None) on the first call.

    subtree : int
        Index of the head of the subtree

    Returns
    -------
    res : SimpleRSTTree
        SimpleRSTTree covering ancestor and subtree.

    Raises
    ------
    RstDtException
        If the text spans of ancestor and subtree overlap.
    """
    # leaf tree for the head EDU of this subtree
    head_edu = dtree.edus[subtree]
    leaf_label = Node("leaf",
                      (head_edu.num, head_edu.num),
                      head_edu.text_span(),
                      "leaf")
    src = SimpleRSTTree(leaf_label, [head_edu])

    # fold (not map) over the dependents, in the order given by dtree
    for dependent in dtree.deps(subtree):
        src = walk(src, dependent)

    if not ancestor:
        # first call: nothing to attach to, subtree is the index of
        # the (presumably unique) real root
        return src

    # attach src to its ancestor
    anc_label = treenode(ancestor)
    src_label = treenode(src)
    rel = dtree.labels[subtree]
    nuc = dtree.nucs[subtree]

    if anc_label.span.overlaps(src_label.span):
        raise RstDtException("Span %s overlaps with %s " %
                             (anc_label.span, src_label.span))

    # order the two kids by text span; the ancestor side is a nucleus
    if anc_label.span <= src_label.span:
        left, right = ancestor, src
        nuc_kids = [NUC_N, nuc]
    else:
        left, right = src, ancestor
        nuc_kids = [nuc, NUC_N]
    # nuclearity of the parent: initial letter of each kid's
    # nuclearity, concatenated, eg. {NS, SN, NN}
    nuc = ''.join(kid_nuc[0] for kid_nuc in nuc_kids)

    # the parent's EDU span is the hull of the kids' EDU spans
    left_edus = treenode(left).edu_span
    right_edus = treenode(right).edu_span
    edu_span = (min(left_edus[0], right_edus[0]),
                max(left_edus[1], right_edus[1]))
    txt_span = anc_label.span.merge(src_label.span)
    return SimpleRSTTree(Node(nuc, edu_span, txt_span, rel),
                         [left, right])
def decode(self, doc_key):
    """Decode a document from the RST-DT (gold)

    Parameters
    ----------
    doc_key : string ?
        Identifier in the corpus of the document we want to decode.

    Returns
    -------
    doc : DocumentPlus
        The document with its RST context, plus the attributes set
        here: `orig_rsttree`, `rsttree` (SimpleRSTTree), `deptree`
        (RstDepTree) and `edus`.
    """
    # the document name serves as grouping
    grouping = os.path.basename(id_to_path(doc_key))
    # everything else is derived from the gold RST tree; its label
    # carries the RSTContext (document text and structure)
    gold_tree = self.corpus[doc_key]
    doc = DocumentPlus(doc_key, grouping, treenode(gold_tree).context)
    # TODO get EDUs from gold_tree.leaves() here rather than from the
    # dependency tree at the end

    # optional conversion of the relation labels
    label_converter = self.rel_conv
    if label_converter is not None:
        gold_tree = label_converter(gold_tree)
    doc.orig_rsttree = gold_tree

    # TO DEPRECATE - binary SimpleRSTTree layer
    simple_tree = SimpleRSTTree.from_rst_tree(gold_tree)
    if self.nuc_in_label:
        # WIP fold nuclearity into the relation label
        simple_tree = SimpleRSTTree.incorporate_nuclearity_into_label(
            simple_tree)
    doc.rsttree = simple_tree
    # end TO DEPRECATE

    # WIP dependency tree: in 'chain' mode it is built from a binarized
    # copy of the gold tree, in any other mode from the gold tree itself
    if self.nary_conv == 'chain':
        dep_source = _binarize(gold_tree)
    else:
        dep_source = gold_tree
    doc.deptree = RstDepTree.from_rst_tree(dep_source)
    # end WIP

    # EDUs are read back from the dependency tree (bad)
    # TODO: let document_plus do the left padding
    doc.edus = doc.deptree.edus
    return doc