def main():
    """Build the vocabulary from the trees read on standard input.

    Every non-blank line of stdin is parsed with parsetree.read_tree.
    When the module-level switch remove_quotation_marks is set, the
    labels "``" and "''" are pruned from the tree; when
    lowercase_vocabulary is set, each leaf headword is lowercased.  The
    tree is then refreshed and passed to vocab.add.  After the input is
    exhausted, vocab.write() is called.
    """
    vocab.init()

    line_no = 0
    for line in sys.stdin:
        # Blank lines are counted but otherwise ignored.
        line_no += 1
        if not line.strip():
            continue

        tree = parsetree.read_tree(line)
        assert tree != None

        if remove_quotation_marks:
            tree.prune_labels(["``", "''"])

        for leaf in tree.leaves():
            if lowercase_vocabulary:
                leaf.headword = leaf.headword.lower()

        tree = parsetree.refresh(tree)
        vocab.add(tree)
        del tree

        # Progress report: level 1 every 1000 sentences, level 2 on the
        # remaining multiples of 100, nothing otherwise.
        if line_no % 100 == 0:
            if line_no % 1000 == 0:
                verbosity = 1
            else:
                verbosity = 2
            debug(verbosity, "Sentence #%d done" % line_no)

    vocab.write()
def postprocess_closed_class_compounds(tree):
    """Expand flattened compound leaves back into their bracketed structure.

    For every leaf whose lowercased headword equals the "+"-joined words
    of an entry in the module-level `compounds` table, the entry's
    structure template is instantiated and parsed with
    parsetree.read_tree, and the children of the leaf's parent are
    replaced by the children of the parsed node.  The template
    placeholders are filled as follows:

      *P*  label of the leaf's parent
      *L*  space-joined to_string() of the leaf's left siblings
      *R*  space-joined to_string() of the leaf's right siblings

    The tree is refreshed after each replacement.

    Requires the module-level switch flatten_closed_class_compounds to
    be set.  Returns the (refreshed) tree.
    """
    assert flatten_closed_class_compounds

    for n in tree.leaves():
        for (words, pos, structure) in compounds:
            if n.headword.lower() != "+".join(words):
                continue

            # Fetch the parent once and check it *before* it is
            # dereferenced below.
            p = n.parent()
            assert p is not None

            # Exception for "sort of", in the case that p is an NP
            # (this finds the adjectival usage of "sort of", as opposed to
            # the default adverbial usage).
            if words == ["sort", "of"] and p.label == "NP":
                structure = "(*P* (NP *L* (NN sort)) (PP (IN of) *R*))"

            left = " ".join([s.to_string() for s in n.left_siblings()])
            right = " ".join([s.to_string() for s in n.right_siblings()])
            structure = structure.replace("*P*", p.label)
            structure = structure.replace("*L*", left)
            structure = structure.replace("*R*", right)

            newnode = parsetree.read_tree(structure)
            p.children = newnode.children
            tree = parsetree.refresh(tree)

    return tree
def transform(tree):
    """Apply transform_node to every node of `tree`, children first.

    Requires the module-level switch add_basal_nps to be set.  For a
    non-leaf node the children are transformed recursively and the node
    is refreshed before transform_node is applied to it.  Returns the
    result of transform_node for the root.
    """
    assert add_basal_nps

    # Leaves have no children to rewrite; transform them directly.
    if tree.isleaf:
        return transform_node(tree)

    # Bottom-up: rewrite all children before touching this node.
    rewritten = []
    for child in tree.children:
        rewritten.append(transform(child))
    tree.children = rewritten

    return transform_node(parsetree.refresh(tree))
def main():
    """Preprocess already-regularized trees from stdin and print them.

    Takes no command-line arguments.  For each input line the tree is
    read, checked to be unchanged by parsetree.regularize, optionally
    wrapped in an extra TOP node (module-level switch
    duplicate_top_item), run through parsetree.preprocess, validated
    against the vocabulary tables, and printed to stdout.
    """
    assert len(sys.argv) == 1

    count = 0
    for line in sys.stdin:
        count += 1
        assert line

        tree = parsetree.read_tree(line)
        assert tree != None

        # Sanity check that the tree's already been regularized:
        # regularizing again must not change its string form.
        before = tree.to_string()
        tree = parsetree.regularize(tree)
        assert tree.to_string() == before

        if duplicate_top_item:
            # Add a second TOP label, s.t. we can raise punctuation
            # above the first TOP label
            wrapper = parsetree.Node()
            wrapper.isleaf = 0
            wrapper.label = "TOP"
            wrapper.children = [tree]
            tree = parsetree.refresh(wrapper)
        tree = parsetree.preprocess(tree)

        for leaf in tree.leaves():
            # Make sure that the headtag is a terminal label (POS tag)
            assert vocab.label_to_idx[leaf.headtag][1] == 1
            # Make sure that the headword is in the vocabulary
            assert vocab.vocab_to_idx[leaf.headword] > 0
        for inner in tree.internal_nodes():
            # Make sure that the label is a constituent label
            assert vocab.label_to_idx[inner.label][1] == 0

        print(tree.to_string())

        # Level 1 on every 100th sentence, level 2 for all the others.
        if count % 100 == 0:
            verbosity = 1
        else:
            verbosity = 2
        debug(verbosity, "Sentence #%d done" % count)
def main():
    """Clean trees from stdin: drop unknown labels and all-punctuation trees.

    Takes no command-line arguments.  Blank input lines are skipped.
    Internal nodes whose label is not in the module-level `constits`
    are spliced out (their children take their place under the parent),
    and each such label is reported once on stderr and recorded in the
    module-level `unknown_constits`.  A tree is printed to stdout only
    if at least one leaf headword is not one of the punctuation tokens
    listed below; otherwise a note is written to stderr.
    """
    assert len(sys.argv) == 1

    punctuation = [":", ",", ".", "``", "''", "?", "!"]

    count = 0
    for line in sys.stdin:
        count += 1

        # Skip blank lines
        if not line.strip():
            continue

        tree = parsetree.read_tree(line)
        assert tree != None

        # SANITY CHECK:
        # Ensure that the cleaned output is "stable", i.e. that
        # this script will produce identical output if we pipe
        # cleaned output from this script back into it.
        assert (tree.to_string() ==
                parsetree.read_tree(tree.to_string()).to_string())

        # Remove all internal nodes with labels that are unknown
        # (not in the constituent list).
        for node in tree.internal_nodes():
            if node.label in constits:
                continue
            parent = node.parent()
            assert parent != None
            parent.children = (node.left_siblings()
                               + node.children
                               + node.right_siblings())
            tree = parsetree.refresh(tree)

            # Report each unknown label only the first time it is seen.
            if node.label in unknown_constits:
                continue
            unknown_constits[node.label] = 1
            sys.stderr.write("Stripping unknown label: %s\n" % node.label)

        # Print the tree as soon as one non-punctuation leaf is found;
        # the else branch runs only if no such leaf exists.
        for leaf in tree.leaves():
            if leaf.headword not in punctuation:
                print(tree.to_string())
                break
        else:
            sys.stderr.write("Skipping all punctuation tree\n")

        # Level 1 on every 100th sentence, level 2 for all the others.
        if count % 100 == 0:
            verbosity = 1
        else:
            verbosity = 2
        debug(verbosity, "Sentence #%d done" % count)
def main():
    """Post-process trees from stdin against companion gold and JMX files.

    Takes exactly one command-line argument.  postprocess_gold and
    postprocess_jmx map it to the paths of two companion files, which
    are consumed in lockstep with stdin (one line each per stdin line).
    Both files are closed even if processing aborts with an exception.
    """
    assert len(sys.argv) == 2

    # Compute each path once and reuse it for the log message and open().
    gold_path = postprocess_gold(sys.argv[1])
    jmx_path = postprocess_jmx(sys.argv[1])
    debug(1, "Opening files:\n\t%s\n\t%s\n" % (gold_path, jmx_path))

    treebank_file = open(gold_path)
    try:
        jmx_file = open(jmx_path)
        try:
            _postprocess_stream(treebank_file, jmx_file)
        finally:
            jmx_file.close()
    finally:
        treebank_file.close()


def _reference_leaves(line):
    """Return (headword, headtag) pairs for the leaves of the tree in `line`.

    The tree is passed through parsetree.reverse_regularize before its
    leaves are read.
    """
    ref_tree = parsetree.read_tree(line)
    assert ref_tree is not None
    ref_tree = parsetree.reverse_regularize(ref_tree)
    return [(n.headword, n.headtag) for n in ref_tree.leaves()]


def _postprocess_stream(treebank_file, jmx_file):
    """Post-process every stdin line, printing one output line per input.

    A stdin line consisting of just a newline produces an empty output
    line and skips one line in each companion file.  Any other line is
    read as a tree, reverse-regularized, and handed to
    parsetree.postprocess together with the leaves of the corresponding
    JMX tree (as origleaves) and gold tree (as treebank_leaves); its
    to_string() is then printed.  After stdin is exhausted, both
    companion files are asserted to be exhausted as well.
    """
    sentence = 0
    for l in sys.stdin:
        sentence += 1
        assert l

        if l == "\n":
            # Keep the companion files aligned with stdin.
            treebank_file.readline()
            jmx_file.readline()
            print("")
            continue

        tree = parsetree.read_tree(l)
        assert tree is not None

        if duplicate_top_item:
            # This script asserts that duplicate_top_item is off; the
            # wrapping below only runs if assertions are disabled.
            assert(0)
            # Add a second TOP label, s.t. we can raise punctuation
            # above the first TOP label
            node = parsetree.Node()
            node.isleaf = 0
            node.label = "TOP"
            node.children = [tree]
            tree = parsetree.refresh(node)

        treebank_leaves = _reference_leaves(treebank_file.readline())
        jmx_leaves = _reference_leaves(jmx_file.readline())

        # Sanity check: reverse-regularizing a second time must not
        # change the tree's string form.
        tree = parsetree.reverse_regularize(tree)
        treestr = tree.to_string()
        tree = parsetree.reverse_regularize(tree)
        assert treestr == tree.to_string()

        parsetree.postprocess(tree, origleaves=jmx_leaves,
                              treebank_leaves=treebank_leaves)
        print(tree.to_string())

        # Level 1 on every 100th sentence, level 2 for all the others.
        if sentence % 100 == 0:
            debug(1, "Sentence #%d done" % sentence)
        else:
            debug(2, "Sentence #%d done" % sentence)

    # The companion files must not contain more lines than stdin did.
    assert(not treebank_file.readline())
    assert(not jmx_file.readline())
def preprocess_closed_class_compounds(tree):
    """Collapse multi-word compounds into single "+"-joined leaves.

    Scans the leaves of `tree` for a run of consecutive leaves whose
    lowercased headwords equal the `words` of an entry in the
    module-level `compounds` table.  On a match, the lowest common
    ancestor p of the run is replaced, in its parent's child list, by a
    new node with p's label whose children are: the collected left
    siblings, one new leaf for the compound (headword = words joined
    with "+", tag = the entry's `pos`), and the collected right
    siblings.  The tree is then refreshed and the scan restarts from
    scratch; this repeats until a full scan finds no match.

    Requires the module-level switch flatten_closed_class_compounds to
    be set.  Returns the (refreshed) tree.

    The values returned by lowest_common_ancestor() and ancestors() are
    called (p(), a()) to reach the node, so they are presumably weak
    references -- confirm against parsetree.
    """
    assert flatten_closed_class_compounds
    found = True
    while found:
        found = False
        leaves = tree.leaves()
        for i in range(len(leaves)):
            for (words, pos, structure) in compounds:
                l = len(words)
                # NOTE(review): with `>=` a compound whose last word is
                # the final leaf (i + l == len(leaves)) is skipped too;
                # confirm whether that is intentional or an off-by-one.
                if i + l >= len(leaves): continue
                # Compare the l leaves starting at i against the
                # compound's words, case-insensitively on the leaf side.
                match = True
                for j in range(l):
                    if string.lower(leaves[i+j].headword) != words[j]:
                        match = False
                        break
                if not match: continue
                # print tree.to_string()
                # print words
                p = parsetree.lowest_common_ancestor(leaves[i+0:i+l])
                left_siblings = []
                right_siblings = []
                # Collect, for left siblings for the new node,
                # the left siblings of all ancestors of the leftmost word,
                # from the child of p down to the leftmost word.
                leftmost_ancestors = leaves[i+0].ancestors()
                leftmost_ancestors.reverse()
                # `found` is reused here as "p has been seen in the
                # ancestor chain"; the assert below guarantees it is True
                # again before the outer loops test it.
                found = False
                for a in leftmost_ancestors:
                    if a == p: found = True
                    elif found == True: left_siblings += a().left_siblings()
                assert found
                left_siblings += leaves[i+0].left_siblings()
                # Collect, for right siblings for the new node,
                # the right siblings of all ancestors of the rightmost word,
                # from rightmost word up through the child of p.
                right_siblings += leaves[i+l-1].right_siblings()
                for a in leaves[i+l-1].ancestors():
                    if a == p: break
                    right_siblings += a().right_siblings()
                # The loop above must have stopped at p, not run off the
                # end of the ancestor list.
                assert a == p
                # Sanity check: the compound's words plus all leaves under
                # the collected siblings must account for every leaf under p.
                leafcnt = len(words)
                for n in left_siblings + right_siblings:
                    leafcnt += len(n.leaves())
                assert(len(p().leaves()) == leafcnt)
                # Exception for "sort of", in the case that p is an NP
                # (this finds the adjectival usage of "sort of", as opposed to
                # the default adverbial usage).
                if words == ["sort", "of"] and p().label == "NP":
                    compound = parsetree.Node(tag="JJ", word=string.join(words, "+"))
                else:
                    compound = parsetree.Node(tag=pos, word=string.join(words, "+"))
                # Build the flat replacement for p: same label, with the
                # compound leaf placed between the collected siblings.
                node = parsetree.Node()
                node.isleaf = 0
                node.label = p().label
                node.children = left_siblings + [compound] + right_siblings
                # print [l.to_string() for l in p().leaves()]
                # print [n.to_string() for n in left_siblings], [n.to_string() for n in right_siblings]
                # print len(p().leaves()), leafcnt
                # if len(left_siblings) > len(leaves[i+0].left_siblings()):
                #     print words, p().to_string(), node.to_string()
                #     print p().to_string()
                # Swap p for the new node in the child list of p's parent.
                newchildren = p().left_siblings() + [node] + p().right_siblings()
                assert p().parent != None
                p().parent().children = newchildren
                # sys.stdout.flush()
                # The leaf list is stale after the rewrite: leave both
                # loops so the while loop refreshes the tree and rescans.
                found = True
                break
            if found: break
        if found: tree = parsetree.refresh(tree)
    return tree