def main():
    """Validate and preprocess one parse tree per line of standard input.

    Every input line is parsed with ``parsetree.read_tree``.  The tree must
    already be regularized, i.e. ``parsetree.regularize`` may not change its
    string form.  When the global ``duplicate_top_item`` flag is set the tree
    is first wrapped in an additional ``TOP`` node; it is then run through
    ``parsetree.preprocess``, checked against ``vocab`` and printed to
    standard output.  Progress is reported through ``debug``.

    The script takes no command-line arguments.

    Raises:
        AssertionError: if command-line arguments were given, or if a tree
            fails any of the sanity checks below.
    """
    assert len(sys.argv) == 1, "this script takes no command-line arguments"

    sentence = 0
    for line in sys.stdin:
        sentence += 1
        # Iterating over a file yields each line with its terminator, so an
        # empty string is never expected here.  Note that blank lines are NOT
        # skipped: they are handed to read_tree like any other line.
        assert line, "Sentence #%d: empty input line" % sentence

        tree = parsetree.read_tree(line)
        assert tree is not None, \
            "Sentence #%d: read_tree returned None" % sentence

        # Sanity check that the tree's already been regularized.
        treestr = tree.to_string()
        tree = parsetree.regularize(tree)
        assert tree.to_string() == treestr, \
            "Sentence #%d: regularize changed the tree" % sentence

        if duplicate_top_item:
            # Add a second TOP label, s.t. we can raise punctuation
            # above the first TOP label
            node = parsetree.Node()
            node.isleaf = 0
            node.label = "TOP"
            node.children = [tree]
            tree = parsetree.refresh(node)
        # The wrapped and the unwrapped tree are preprocessed the same way.
        tree = parsetree.preprocess(tree)

        for n in tree.leaves():
            # Make sure that the headtag is a terminal label (POS tag)
            assert vocab.label_to_idx[n.headtag][1] == 1, \
                "Sentence #%d: headtag %r is not a terminal label" \
                % (sentence, n.headtag)
            # Make sure that the headword is in the vocabulary
            assert vocab.vocab_to_idx[n.headword] > 0, \
                "Sentence #%d: headword %r is not in the vocabulary" \
                % (sentence, n.headword)
        for n in tree.internal_nodes():
            # Make sure that the label is a constituent label
            assert vocab.label_to_idx[n.label][1] == 0, \
                "Sentence #%d: label %r is not a constituent label" \
                % (sentence, n.label)

        # Parenthesized single-argument form: same output under Python 2,
        # and also valid syntax under Python 3.
        print(tree.to_string())

        # Every 100th sentence is reported with debug's first argument set to
        # 1, all others with 2 (presumably a verbosity level -- confirm
        # against debug()).
        if sentence % 100 == 0:
            level = 1
        else:
            level = 2
        debug(level, "Sentence #%d done" % sentence)
def main():
    """Regularize one parse tree per line of standard input and print it.

    Every input line is parsed with ``parsetree.read_tree`` and passed
    through ``parsetree.regularize``.  The string form of the result is
    checked for stability (re-reading it must reproduce the same string) and
    then printed to standard output.  Progress is reported through ``debug``.

    The script takes no command-line arguments.

    Raises:
        AssertionError: if command-line arguments were given, if a tree could
            not be read or regularized, or if the output is not stable.
    """
    assert len(sys.argv) == 1, "this script takes no command-line arguments"

    sentence = 0
    for line in sys.stdin:
        sentence += 1
        # Iterating over a file yields each line with its terminator, so an
        # empty string is never expected here.
        assert line, "Sentence #%d: empty input line" % sentence

        tree = parsetree.read_tree(line)
        # Check the read result before handing it on, so that a None tree is
        # reported here instead of somewhere inside regularize().
        assert tree is not None, \
            "Sentence #%d: read_tree returned None" % sentence
        tree = parsetree.regularize(tree)
        assert tree is not None, \
            "Sentence #%d: regularize returned None" % sentence

        # SANITY CHECK:
        # Ensure that the cleaned output is "stable", i.e. that
        # this script will produce identical output if we pipe
        # cleaned output from this script back into it.
        # NOTE(review): the round trip below only re-reads the output; it
        # does not re-run regularize() on it -- confirm that this is the
        # intended strength of the check.
        output = tree.to_string()
        assert output == parsetree.read_tree(output).to_string(), \
            "Sentence #%d: cleaned output is not stable" % sentence

        # Parenthesized single-argument form: same output under Python 2,
        # and also valid syntax under Python 3.
        print(output)

        # Every 100th sentence is reported with debug's first argument set to
        # 1, all others with 2 (presumably a verbosity level -- confirm
        # against debug()).
        if sentence % 100 == 0:
            level = 1
        else:
            level = 2
        debug(level, "Sentence #%d done" % sentence)