def main(): assert(len(sys.argv) == 2) debug(1, "Opening files:\n\t%s\n" % (sys.argv[1])) gold_file = open(sys.argv[1]) sentence = 0 for lt in sys.stdin: sentence += 1 lg = gold_file.readline() gold_tree = parsetree.read_tree(lg) if lt == "\n" or lt == "(null)\n": goldstr = leaves_string(gold_tree) sys.stderr.write("FOUND NULL! Sentence #%d\n" % sentence) sys.stderr.write("Output: %s\n" % goldstr) print goldstr else: test_tree = parsetree.read_tree(lt) teststr = "(%s)" % ([l.headword for l in test_tree.leaves()]) goldstr = "(%s)" % ([l.headword for l in test_tree.leaves()]) assert(goldstr == teststr) print lt, if sentence % 100 == 0: debug(1, "Sentence #%d done" % sentence) else: debug(2, "Sentence #%d done" % sentence) assert(not gold_file.readline()) gold_file.close()
def main(): assert len(sys.argv) == 1 sentence = 0 for l in sys.stdin: sentence += 1 # if not l: assert(0) # Skip blank lines if not string.strip(l): # print continue tree = parsetree.read_tree(l) assert tree != None # SANITY CHECK: # Ensure that the cleaned output is "stable", i.e. that # this script will produce identical output if we pipe # cleaned output from this script back into it. assert tree.to_string() == parsetree.read_tree(tree.to_string()).to_string() print tree.to_string() if sentence % 100 == 0: debug(1, "Sentence #%d done" % sentence) else: debug(2, "Sentence #%d done" % sentence)
def main(): assert len(sys.argv) == 1 sentence = 0 skip_sentence = 0 for l in sys.stdin: sentence += 1 if not l: assert(0) tree = parsetree.read_tree(l) assert tree != None # print string.strip(tree.to_string()) # print string.strip(l) # assert string.strip(tree.to_string()) == string.strip(l) origleaves = [(n.headword, n.headtag) for n in tree.leaves()] if len(origleaves) > max_words_per_sentence: skip_sentence += 1 # print else: print string.strip(l) if sentence % 100 == 0: debug(1, "Sentence #%d done" % sentence) else: debug(2, "Sentence #%d done" % sentence) keep_sentence = sentence - skip_sentence debug(1, "Kept %.2f%% (%d of %d) sentences with at most %d words" % (100.*keep_sentence/sentence, keep_sentence, sentence, max_words_per_sentence)) debug(1, "i.e. skipped %.2f%% (%d of %d) sentences with more that %d words" % (100.*skip_sentence/sentence, skip_sentence, sentence, max_words_per_sentence))
def main():
    """Feed every tree on stdin to vocab.add() and then call vocab.write().

    Blank lines are ignored.  Depending on the module-level switches,
    nodes labelled with quotation-mark tags are pruned
    (``remove_quotation_marks``) and leaf head words are lowercased
    (``lowercase_vocabulary``) before the refreshed tree is handed to the
    vocabulary.
    """
    vocab.init()

    count = 0
    for line in sys.stdin:
        count += 1
        if not line.strip():
            continue

        tree = parsetree.read_tree(line)
        assert tree != None

        if remove_quotation_marks:
            tree.prune_labels(["``", "''"])
        if lowercase_vocabulary:
            for leaf in tree.leaves():
                leaf.headword = leaf.headword.lower()

        # Refresh after the in-place edits above, then register the tree.
        tree = parsetree.refresh(tree)
        vocab.add(tree)
        del tree

        # Coarse progress at level 1, finer progress at level 2.
        message = "Sentence #%d done" % count
        if count % 1000 == 0:
            debug(1, message)
        elif count % 100 == 0:
            debug(2, message)

    vocab.write()
def postprocess_closed_class_compounds(tree):
    """Expand flattened closed-class compound leaves back into subtrees.

    For every leaf whose lowercased head word equals the "+"-joined words
    of an entry in the module-level ``compounds`` table, the entry's
    bracketed template is instantiated and the children of the leaf's
    parent are replaced by the children of the parsed template.  Template
    placeholders:

        *P*  label of the leaf's parent
        *L*  printed left siblings of the leaf, space separated
        *R*  printed right siblings of the leaf, space separated

    Returns the tree as returned by parsetree.refresh() after the last
    replacement (the input tree if nothing matched).

    Raises AssertionError if ``flatten_closed_class_compounds`` is false
    or a matching leaf has no parent.
    """
    assert flatten_closed_class_compounds
    leaves = tree.leaves()
    for n in leaves:
        # The key does not depend on the compound entry; compute it once.
        key = n.headword.lower()
        for (words, pos, structure) in compounds:
            if key != "+".join(words):
                continue

            # Check the parent before touching its label.  The original
            # guard came after several n.parent().label dereferences and
            # could therefore never fire.
            p = n.parent()
            assert p is not None

            # Exception for "sort of", in the case that p is an NP
            # (this finds the adjectival usage of "sort of", as opposed to
            # the default adverbial usage).
            if words == ["sort", "of"] and p.label == "NP":
                structure = "(*P* (NP *L* (NN sort)) (PP (IN of) *R*))"

            structure = structure.replace("*P*", p.label)
            structure = structure.replace("*L*", " ".join([l.to_string() for l in n.left_siblings()]))
            structure = structure.replace("*R*", " ".join([r.to_string() for r in n.right_siblings()]))

            newnode = parsetree.read_tree(structure)
            p.children = newnode.children
            tree = parsetree.refresh(tree)
    return tree
def main(): assert len(sys.argv) == 1 sentence = 0 for l in sys.stdin: sentence += 1 # if not l: assert(0) # Skip blank lines if not string.strip(l): # print continue tree = parsetree.read_tree(l) assert tree != None # SANITY CHECK: # Ensure that the cleaned output is "stable", i.e. that # this script will produce identical output if we pipe # cleaned output from this script back into it. assert tree.to_string() == parsetree.read_tree(tree.to_string()).to_string() # Remove all internal nodes with labels that are unknown # (not in the constituent list). for n in tree.internal_nodes(): if n.label not in constits: p = n.parent() assert p != None p.children = n.left_siblings() + n.children + n.right_siblings() tree = parsetree.refresh(tree) if n.label not in unknown_constits: unknown_constits[n.label] = 1 sys.stderr.write("Stripping unknown label: %s\n" % n.label) to_print = False for n in tree.leaves(): if n.headword not in [":", ",", ".", "``", "''", "?", "!"]: to_print = True break if to_print: print tree.to_string() else: sys.stderr.write("Skipping all punctuation tree\n") if sentence % 100 == 0: debug(1, "Sentence #%d done" % sentence) else: debug(2, "Sentence #%d done" % sentence)
def main(): assert len(sys.argv) == 3 sentence = 0 f = open(sys.argv[1], "rt") total_actions = 0 for l in f: sentence += 1 if not l: assert(0) tree = parsetree.read_tree(l) assert tree != None assert string.strip(tree.to_string()) == string.strip(l) total_actions += len(tree.internal_nodes()) if sentence % 100 == 0: debug(1, "Sentence #%d done (first pass)" % sentence) else: debug(2, "Sentence #%d done (first pass)" % sentence) f.close() keep_actions = float(sys.argv[2]) * total_actions debug(1, "Wish to keep %.2f actions out of %d" % (keep_actions, total_actions)) split_sentence = 0 f = open(sys.argv[1], "rt") split_actions = 0 for l in f: split_sentence += 1 if not l: assert(0) tree = parsetree.read_tree(l) assert tree != None assert string.strip(tree.to_string()) == string.strip(l) split_actions += len(tree.internal_nodes()) print string.strip(l) if split_actions >= keep_actions: break f.close() debug(1, "Kept %.2f%% (%d of %d) sentences, %.2f%% (%d) actions versus %.2f%% (%.2f) desired)" % (100.*split_sentence/sentence, split_sentence, sentence, 100.*split_actions/total_actions, split_actions, 100.*float(sys.argv[2]), keep_actions))
def main(): assert len(sys.argv) == 1 sentence = 0 for l in sys.stdin: sentence += 1 if not l: assert(0) # # Skip blank lines # if not string.strip(l): # print # continue tree = parsetree.read_tree(l) assert tree != None # Sanity check that the tree's already been regularized. treestr = tree.to_string() tree = parsetree.regularize(tree) assert tree.to_string() == treestr if duplicate_top_item: # Add a second TOP label, s.t. we can raise punctuation # above the first TOP label node = parsetree.Node() node.isleaf = 0 node.label = "TOP" node.children = [tree] tree = parsetree.refresh(node) tree = parsetree.preprocess(tree) else: tree = parsetree.preprocess(tree) for n in tree.leaves(): # Make sure that the headtag is a terminal label (POS tag) assert vocab.label_to_idx[n.headtag][1] == 1 # Make sure that the headword is in the vocabulary assert vocab.vocab_to_idx[n.headword] > 0 for n in tree.internal_nodes(): # Make sure that the label is a constituent label assert vocab.label_to_idx[n.label][1] == 0 print tree.to_string() if sentence % 100 == 0: debug(1, "Sentence #%d done" % sentence) else: debug(2, "Sentence #%d done" % sentence)
def main():
    """Re-tag the leaves of every stdin tree with the Toutanova POS tagger.

    The tagger is started once from the module-level ``toutanova_cmd``
    and its pipes are stored in the globals ``jmxin`` / ``jmxout``.  For
    each tree, nodes labelled -NONE- are pruned, the space-joined head
    words are written to the tagger as one line, and one line of
    ``word/tag`` tokens is read back.  Each leaf's head tag is replaced
    by the tagger's tag and the tree is written to stdout.  Lines for
    which read_tree() returns None are skipped.

    Raises AssertionError if the tagger returns a different number of
    tokens than the tree has leaves, a token is not of the form
    word/tag, or a returned word differs from the leaf's head word.
    """
    # Open the Toutanova pos tagger
    global jmxin, jmxout
    (jmxin, jmxout) = os.popen2(toutanova_cmd)
    # Greedy first group: the tag is whatever follows the LAST slash.
    slash_re = re.compile(r"^(.+)\/([^\/]+)$")

    # Close both pipes even when an assertion aborts the run.
    try:
        for l in sys.stdin:
            tree = parsetree.read_tree(l)
            if tree is None:
                debug(1, "Skipping empty tree")
                continue
            tree.prune_labels(["-NONE-"])
            leaves = tree.leaves()

            # Run the tagger on this tree's words.
            jmxin.write(" ".join([n.headword for n in leaves]) + "\n")
            jmxin.flush()
            jmxtoks = jmxout.readline().split()
            assert len(jmxtoks) == len(leaves)

            # Change the POS tags at the leaves to match the tagger
            # output.  Walking both lists in parallel avoids re-slicing
            # the token list once per leaf.
            for (n, tok) in zip(leaves, jmxtoks):
                m = slash_re.match(tok)
                assert m
                (word, jmxtag) = (m.group(1), m.group(2))
                assert n.headword == word
                n.headtag = jmxtag

            sys.stdout.write("%s\n" % tree.to_string())
    finally:
        try:
            jmxin.close()
        finally:
            jmxout.close()
def main():
    """Re-tag the leaves of every stdin tree with the JMX POS tagger.

    Usage: <script> PROJECTDIR < trees

    The tagger command is the module-level ``jmxcmd`` with the text
    "PROJECTDIR" replaced by the command-line argument; its pipes are
    stored in the globals ``jmxin`` / ``jmxout``.  For each tree, nodes
    labelled -NONE- are pruned, the space-joined head words are written
    to the tagger as one line, and one line of ``word_tag`` tokens is
    read back.  Each leaf's head tag is replaced by the tagger's tag and
    the tree is written to stdout.  Lines for which read_tree() returns
    None are skipped.

    Raises AssertionError on a wrong argument count, if the tagger
    returns a different number of tokens than the tree has leaves, if a
    token contains no underscore, or if a returned word differs from the
    leaf's head word.
    """
    assert len(sys.argv) == 2

    # Open the JMX pos tagger
    global jmxin, jmxout
    (jmxin, jmxout) = os.popen2(jmxcmd.replace("PROJECTDIR", sys.argv[1]))

    # Close both pipes even when an assertion aborts the run.
    try:
        for l in sys.stdin:
            tree = parsetree.read_tree(l)
            if tree is None:
                debug(1, "Skipping empty tree")
                continue
            tree.prune_labels(["-NONE-"])
            leaves = tree.leaves()

            # Run the JMX pos tagger on this tree's words.
            jmxin.write(" ".join([n.headword for n in leaves]) + "\n")
            jmxin.flush()
            jmxtoks = jmxout.readline().split()
            assert len(jmxtoks) == len(leaves)

            # Change the POS tags at the leaves to match the output
            # of the JMX pos tagger
            for (n, tok) in zip(leaves, jmxtoks):
                # Split on the LAST underscore: the word itself may
                # contain underscores, which made a plain split("_")
                # fail to unpack into (word, tag).
                cut = tok.rfind("_")
                assert cut != -1
                (word, jmxtag) = (tok[:cut], tok[cut + 1:])
                assert n.headword == word
                n.headtag = jmxtag

            sys.stdout.write("%s\n" % tree.to_string())
    finally:
        try:
            jmxin.close()
        finally:
            jmxout.close()
def main(): assert len(sys.argv) == 1 sentence = 0 for l in sys.stdin: sentence += 1 # if not l: assert(0) # Skip blank lines if not string.strip(l): # print continue tree = parsetree.read_tree(l) assert tree != None print "<s> %s </s>" % string.join([n.headword for n in tree.leaves()]) if sentence % 100 == 0: debug(1, "Sentence #%d done" % sentence) else: debug(2, "Sentence #%d done" % sentence)
def main():
    """Print every stdin tree as a flat list of its leaves.

    Output per sentence is a single line of the form
    "( (word (label)) (word (label)) ... )", built from each leaf's
    head word and label.  Blank input lines are skipped.
    """
    assert len(sys.argv) == 1

    emit = sys.stdout.write

    count = 0
    for line in sys.stdin:
        count += 1
        if not line.strip():
            continue

        tree = parsetree.read_tree(line)

        # Pieces are separated by single spaces, exactly as a chain of
        # comma-terminated print statements would separate them.
        emit("(")
        for leaf in tree.leaves():
            emit(" " + "(%s (%s))" % (leaf.headword, leaf.label))
        emit(" )\n")

        message = "Sentence #%d done" % count
        if count % 100:
            debug(2, message)
        else:
            debug(1, message)
def main(): assert len(sys.argv) == 2 debug(1, "Opening files:\n\t%s\n\t%s\n" % (postprocess_gold(sys.argv[1]), postprocess_jmx(sys.argv[1]))) treebank_file = open(postprocess_gold(sys.argv[1])) jmx_file = open(postprocess_jmx(sys.argv[1])) sentence = 0 for l in sys.stdin: sentence += 1 if not l: assert(0) if l == "\n": lt = treebank_file.readline() lj = jmx_file.readline() print continue tree = parsetree.read_tree(l) assert tree != None if duplicate_top_item: assert(0) # Add a second TOP label, s.t. we can raise punctuation # above the first TOP label node = parsetree.Node() node.isleaf = 0 node.label = "TOP" node.children = [tree] tree = parsetree.refresh(node) # else: # tree = parsetree.refresh(tree) lt = treebank_file.readline() treebank_tree = parsetree.read_tree(lt) assert treebank_tree != None treebank_tree = parsetree.reverse_regularize(treebank_tree) treebank_leaves = [(n.headword, n.headtag) for n in treebank_tree.leaves()] del treebank_tree lj = jmx_file.readline() jmx_tree = parsetree.read_tree(lj) assert jmx_tree != None jmx_tree = parsetree.reverse_regularize(jmx_tree) jmx_leaves = [(n.headword, n.headtag) for n in jmx_tree.leaves()] del jmx_tree tree = parsetree.reverse_regularize(tree) treestr = tree.to_string() tree = parsetree.reverse_regularize(tree) assert treestr == tree.to_string() parsetree.postprocess(tree, origleaves=jmx_leaves, treebank_leaves=treebank_leaves) print tree.to_string() if sentence % 100 == 0: debug(1, "Sentence #%d done" % sentence) else: debug(2, "Sentence #%d done" % sentence) assert(not treebank_file.readline()) assert(not jmx_file.readline()) treebank_file.close() jmx_file.close()
def main(): assert len(sys.argv) == 2 assert sys.argv[1] == "devel" or sys.argv[1] == "train" check_parsefiles(sys.argv[1]) debug(1, "Opening files:\n\t%s\n\t%s\n" % (postprocess_gold[sys.argv[1]], postprocess_jmx[sys.argv[1]])) gold_file = open(postprocess_gold[sys.argv[1]]) # jmx_file = open(postprocess_jmx[sys.argv[1]]) sentence = 0 all_types = {} test_constits = {} gold_constits = {} test_constits_totals = {} gold_constits_totals = {} for l in sys.stdin: sentence += 1 if not l: assert 0 test_tree = parsetree.read_tree(l) assert test_tree != None test_tree = parsetree.normalize(test_tree) test_leaves = [(n.headword, n.headtag) for n in test_tree.leaves()] lt = gold_file.readline() gold_tree = parsetree.read_tree(lt) assert gold_tree != None gold_tree = parsetree.normalize(gold_tree) gold_leaves = [(n.headword, n.headtag) for n in gold_tree.leaves()] # Make sure we're comparing the same sentences assert test_leaves == gold_leaves for n in test_tree.internal_nodes(): if n.label == "TOP": continue s = n.span() c = "Sentence #%d: %s @ [%d, %d]" % (sentence, n.label, s[0], s[1]) # Types of constituents we are analyzing: # * All constituents # * Constituents broken down by label # * Constituents broken down by number of children # * Constituents broken down by label, number of children # types = ["all", "label %s" % n.label, "%d children" % len(n.children), "label %s with %d children" % (n.label, len(n.children))] types = ["all", "label %s" % n.label, "%d children" % len(n.children)] for t in types: all_types[t] = 1 if t not in test_constits: test_constits[t] = {} test_constits_totals[t] = 0 if c not in test_constits[t]: test_constits[t][c] = 0 test_constits_totals[t] += 1 test_constits[t][c] += 1 for n in gold_tree.internal_nodes(): if n.label == "TOP": continue s = n.span() c = "Sentence #%d: %s @ [%d, %d]" % (sentence, n.label, s[0], s[1]) # Types of constituents we are analyzing: # * All constituents # * Constituents broken down by label # * Constituents broken 
down by number of children # * Constituents broken down by label, number of children # types = ["all", "label %s" % n.label, "%d children" % len(n.children), "label %s with %d children" % (n.label, len(n.children))] types = ["all", "label %s" % n.label, "%d children" % len(n.children)] for t in types: all_types[t] = 1 if t not in gold_constits: gold_constits[t] = {} gold_constits_totals[t] = 0 if c not in gold_constits[t]: gold_constits[t][c] = 0 gold_constits_totals[t] += 1 gold_constits[t][c] += 1 if sentence % 100 == 0: debug(1, "Sentence #%d done" % sentence) else: debug(2, "Sentence #%d done" % sentence) gsum = 0 tsum = 0 msum = 0 # FIXME: Don't hardcode this for i in range(128): t = "%d children" % i if t in gold_constits_totals: gsum += gold_constits_totals[t] if t in test_constits_totals: tsum += test_constits_totals[t] assert gsum == gold_constits_totals["all"] assert tsum == test_constits_totals["all"] alltot = test_constits_totals["all"] + gold_constits_totals["all"] print "Total constituents in test + gold: %d" % alltot all_error_fms = 0.0 nonall_error_fms = 0.0 sorted_types = [] for t in all_types: if t not in test_constits: test_constits[t] = {} test_constits_totals[t] = 0 if t not in gold_constits: gold_constits[t] = {} gold_constits_totals[t] = 0 tot = test_constits_totals[t] + gold_constits_totals[t] str = "" str += "\n" str += "Breakdown type: %s\n" % t str += "comprising %.2f%% (%d/%d) of all constituents\n" % (100.0 * tot / alltot, tot, alltot) testmatch = 0 goldmatch = 0 testtot = 0 goldtot = 0 allkeys = {} for k in test_constits[t].keys() + gold_constits[t].keys(): allkeys[k] = True for c in allkeys: testmatch += min(test_constits[t].get(c, 0), gold_constits["all"].get(c, 0)) goldmatch += min(test_constits["all"].get(c, 0), gold_constits[t].get(c, 0)) goldtot += gold_constits[t].get(c, 0) testtot += test_constits[t].get(c, 0) assert goldtot == gold_constits_totals[t] assert testtot == test_constits_totals[t] # BUG: These error FMS are all 
skewed! # To see this, observe the "VP with 2 children" has more attributed error than just "VP" # error_fms = 1. * (testtot + goldtot - 2 * match) / alltot error_fms = 1.0 * (testtot + goldtot - goldmatch - testmatch) / alltot str += "overall error incurred by this constituent type = %.3f%% (%d/%d)\n" % ( 100.0 * error_fms, testtot + goldtot - goldmatch - testmatch, alltot, ) # errprc = 1. * (testtot - match) / test_constits_totals["all"] # errrcl = 1. * (goldtot - match) / gold_constits_totals["all"] ##errprc = 1. * (test_constits_totals["all"] - testtot + match) / test_constits_totals["all"] ##errrcl = 1. * (gold_constits_totals["all"] - goldtot + match) / gold_constits_totals["all"] # if errprc == 0 or errrcl == 0: error_fms = 0 # else: error_fms = 1-2*errrcl*errprc/(errrcl+errprc) # str += "overall error incurred by this constituent type = %.3f%%\n" % (100.*error_fms) # str += "overall PRC error incurred by this constituent type = %.3f%% (%d/%d)\n" % (100.*errprc,testtot - match, test_constits_totals["all"]) # str += "overall RCL error incurred by this constituent type = %.3f%% (%d/%d)\n" % (100.*errrcl,goldtot - match, gold_constits_totals["all"]) if t == "all": all_error_fms += error_fms else: nonall_error_fms += error_fms if testtot == 0: lprc = 0 else: lprc = 1.0 * testmatch / testtot if goldtot == 0: lrc = 0 else: lrcl = 1.0 * goldmatch / goldtot if lprc == 0 or lrcl == 0: lfms = 0 else: lfms = 2 * lrcl * lprc / (lrcl + lprc) str += "LFMS = %.3f%%\n" % (100.0 * lfms) str += "LPRC = %.3f%% (%d/%d)\n" % (100.0 * lprc, testmatch, testtot) str += "LRCL = %.3f%% (%d/%d)\n" % (100.0 * lrcl, goldmatch, goldtot) sorted_types.append((error_fms, str)) sorted_types.sort() sorted_types.reverse() for (error_fms, str) in sorted_types: print str assert not gold_file.readline() # assert(not jmx_file.readline()) gold_file.close()
def main(): difftxt = "***" if len(sys.argv) == 4: difftxt = sys.argv[3] sys.argv = sys.argv[:3] assert(len(sys.argv) == 3) debug(1, "Using '%s' as difftxt." % difftxt) debug(1, "Opening files:\n\t%s\n\t%s\n" % (sys.argv[1], sys.argv[2])) in_file = open(sys.argv[1]) out_file = open(sys.argv[2]) sentence = 0 for lin in in_file: sentence += 1 lout = out_file.readline() assert lin assert lout in_tree = parsetree.read_tree(lin) out_tree = parsetree.read_tree(lout) assert in_tree != None assert out_tree != None # Find all constituents in the in_tree and in the out_tree in_nodes = {} for n in in_tree.internal_nodes(): nt = node_txt(n) # Check that there are no unaries to self assert nt not in in_nodes in_nodes[nt] = True out_nodes = {} for n in out_tree.internal_nodes(): nt = node_txt(n) # Check that there are no unaries to self assert nt not in out_nodes out_nodes[nt] = True assert(len(in_nodes) <= len(out_nodes)) # Sanity check: # Make sure that every constituent in the in_tree # is also in the out_tree for n in in_nodes: assert n in out_nodes found_diff = False # Find all constituents in out_tree that are not # in in_tree for n in out_tree.internal_nodes(): nt = node_txt(n) # If the constituent is not present in in_tree, # then add difftxt to this node's label if nt not in in_nodes: found_diff = True n.label += difftxt # Otherwise, remove this node from in_tree's # list, since we don't want to use it twice # [This should have no effect if there are no # unary projections to self] else: del in_nodes[nt] if not found_diff: assert lin == lout assert in_tree.to_string() == out_tree.to_string() debug(1, "WARNING: No diff found for sentence #%d: %s" % (sentence, lin)) print out_tree.to_string() # if sentence % 100 == 0: # debug(1, "Sentence #%d done" % sentence) # else: # debug(2, "Sentence #%d done" % sentence) assert(not in_file.readline()) assert(not out_file.readline()) in_file.close() out_file.close()