コード例 #1
0
ファイル: process_xml.py プロジェクト: ffancellu/ucca
def split_sents(passage):
    """
    Split the passage text into sentences and print a tree for each head node.

    The words of layer '0' are joined with single spaces into one paragraph
    string, which is split by ``sent_tokenize``. The outgoing edges of the
    first node of layer '1' are wrapped as ``node.Internal`` objects, and
    ``correct_split`` pairs the sentences with those nodes. For every head
    node of every pair a ``Tree`` is built, filled with ``fill_tree()`` and
    its ``print_tree_penn()`` result is printed.

    Args:
        passage: object exposing ``layer('0')`` and ``layer('1')``, each with
            an ``all`` sequence (presumably a UCCA passage -- confirm against
            the caller).

    Returns:
        None in the code shown here; the trees are written to stdout.
    """
    # Node 1.1 is always a root FN node
    outgoing_edges = passage.layer('1').all[0].outgoing
    # root_nodes: H, U, L nodes at the top.
    # Built as a real list (not map()) so it is a list on Python 3 as well,
    # where map() would hand correct_split a one-shot iterator.
    root_nodes = [node.Internal(edge.child, edge.tag, 0)
                  for edge in outgoing_edges]
    words = passage.layer('0').all
    par = ' '.join(word.text for word in words)
    tok_par_nodes = correct_split(sent_tokenize(par), root_nodes)
    # current_index starts at 1 like the nodes
    # NOTE(review): not read anywhere in the code shown here -- confirm
    # whether it is still needed before removing it.
    current_index = 1
    for sent, head_nodes in tok_par_nodes:
        for head_node in head_nodes:
            tree = Tree(head_node)
            tree.fill_tree()
            # Single-argument call form prints the same on Python 2 and 3.
            print(tree.print_tree_penn())