Esempi in Python per read_tree, esempi in Python per parsetree.read_tree

Esempio n. 1

0

Mostra file

File: penalize-nulls.py Progetto: arne-cl/turian-parser

def main():
	assert(len(sys.argv) == 2)
	debug(1, "Opening files:\n\t%s\n" % (sys.argv[1]))
	gold_file = open(sys.argv[1])

	sentence = 0
	for lt in sys.stdin:
		sentence += 1

		lg = gold_file.readline()
		gold_tree = parsetree.read_tree(lg)

		if lt == "\n" or lt == "(null)\n":
			goldstr = leaves_string(gold_tree)
			sys.stderr.write("FOUND NULL! Sentence #%d\n" % sentence)
			sys.stderr.write("Output: %s\n" % goldstr)
			print goldstr
		else:
			test_tree = parsetree.read_tree(lt)
			teststr = "(%s)" % ([l.headword for l in test_tree.leaves()])
			goldstr = "(%s)" % ([l.headword for l in test_tree.leaves()])
			assert(goldstr == teststr)
			print lt,

		if sentence % 100 == 0:
			debug(1, "Sentence #%d done" % sentence)
		else:
			debug(2, "Sentence #%d done" % sentence)

	assert(not gold_file.readline())
	gold_file.close()

Esempio n. 2

0

Mostra file

File: clean-treebank.py Progetto: arne-cl/turian-parser

def main():
	assert len(sys.argv) == 1

	sentence = 0
	for l in sys.stdin:
		sentence += 1

#		if not l: assert(0)
		# Skip blank lines
		if not string.strip(l):
#			print
			continue

		tree = parsetree.read_tree(l)
		assert tree != None

		# SANITY CHECK:
		# Ensure that the cleaned output is "stable", i.e. that
		# this script will produce identical output if we pipe
		# cleaned output from this script back into it.
		assert tree.to_string() == parsetree.read_tree(tree.to_string()).to_string()
		print tree.to_string()

		if sentence % 100 == 0:
			debug(1, "Sentence #%d done" % sentence)
		else:
			debug(2, "Sentence #%d done" % sentence)

Esempio n. 3

0

Mostra file

File: skip-long-sentences.py Progetto: arne-cl/turian-parser

def main():
	assert len(sys.argv) == 1

	sentence = 0
	skip_sentence = 0
	for l in sys.stdin:
		sentence += 1
		if not l: assert(0)

		tree = parsetree.read_tree(l)
		assert tree != None
#		print string.strip(tree.to_string())
#		print string.strip(l)
#		assert string.strip(tree.to_string()) == string.strip(l)

		origleaves = [(n.headword, n.headtag) for n in tree.leaves()]
		if len(origleaves) > max_words_per_sentence:
			skip_sentence += 1
#			print
		else:
			print string.strip(l)

		if sentence % 100 == 0:
			debug(1, "Sentence #%d done" % sentence)
		else:
			debug(2, "Sentence #%d done" % sentence)

	keep_sentence = sentence - skip_sentence
	debug(1, "Kept %.2f%% (%d of %d) sentences with at most %d words" % (100.*keep_sentence/sentence, keep_sentence, sentence, max_words_per_sentence))
	debug(1, "i.e. skipped %.2f%% (%d of %d) sentences with more that %d words" % (100.*skip_sentence/sentence, skip_sentence, sentence, max_words_per_sentence))

Esempio n. 4

0

Mostra file

File: create-vocabulary.py Progetto: arne-cl/turian-parser

def main():
	"""Accumulate a vocabulary from the trees on stdin and write it out.

	Each non-blank line is parsed, optionally stripped of quotation-mark
	nodes (remove_quotation_marks) and lowercased (lowercase_vocabulary),
	refreshed, and handed to vocab.add(). vocab.write() is called once all
	input has been consumed.
	"""
	vocab.init()

	count = 0
	for line in sys.stdin:
		count += 1

		# Blank lines are counted but contribute nothing
		if not line.strip():
			continue

		tree = parsetree.read_tree(line)
		assert tree != None
		if remove_quotation_marks:
			tree.prune_labels(["``", "''"])

		for leaf in tree.leaves():
			if lowercase_vocabulary:
				leaf.headword = leaf.headword.lower()
		# Refresh after the pruning and headword edits above
		tree = parsetree.refresh(tree)

		vocab.add(tree)
		del tree

		# Progress: multiples of 1000 via debug(1, ...), other multiples
		# of 100 via debug(2, ...), everything else is silent.
		if count % 100 == 0:
			if count % 1000 == 0:
				level = 1
			else:
				level = 2
			debug(level, "Sentence #%d done" % count)

	vocab.write()

Esempio n. 5

0

Mostra file

File: compounds.py Progetto: arne-cl/turian-parser

def postprocess_closed_class_compounds(tree):
	"""Expand flattened closed-class compound leaves back into subtrees.

	For every leaf whose lowercased headword equals the "+"-joined words of
	an entry in `compounds`, the entry's structure template is instantiated
	(*P* = parent label, *L* / *R* = the leaf's left / right siblings as
	strings), parsed with parsetree.read_tree, and its children replace the
	children of the leaf's parent.

	Requires flatten_closed_class_compounds to be set. Returns the tree as
	returned by parsetree.refresh after the last replacement (or the input
	tree if nothing matched).
	"""
	assert flatten_closed_class_compounds
	leaves = tree.leaves()
	for n in leaves:
		# The headword is not modified below, so lowercase it once per leaf
		headword = string.lower(n.headword)
		for (words, pos, structure) in compounds:
			if headword != string.join(words, "+"):
				continue

			# Check the parent BEFORE dereferencing it, so that a missing
			# parent is reported by the assertion.
			p = n.parent()
			assert p != None

			# Exception for "sort of", in the case that p is an NP
			# (this finds the adjectival usage of "sort of", as opposed to
			# the default adverbial usage).
			if words == ["sort", "of"] and p.label == "NP":
				structure = "(*P* (NP *L* (NN sort)) (PP (IN of) *R*))"

			structure = string.replace(structure, "*P*", p.label)
			structure = string.replace(structure, "*L*", string.join([l.to_string() for l in n.left_siblings()]))
			structure = string.replace(structure, "*R*", string.join([r.to_string() for r in n.right_siblings()]))

			# Splice the instantiated template in place of p's old children
			newnode = parsetree.read_tree(structure)
			p.children = newnode.children
			tree = parsetree.refresh(tree)

	return tree

Esempio n. 6

0

Mostra file

File: clean-brown-treebank.py Progetto: arne-cl/turian-parser

def main():
	assert len(sys.argv) == 1

	sentence = 0
	for l in sys.stdin:
		sentence += 1

#		if not l: assert(0)
		# Skip blank lines
		if not string.strip(l):
#			print
			continue

		tree = parsetree.read_tree(l)
		assert tree != None

		# SANITY CHECK:
		# Ensure that the cleaned output is "stable", i.e. that
		# this script will produce identical output if we pipe
		# cleaned output from this script back into it.
		assert tree.to_string() == parsetree.read_tree(tree.to_string()).to_string()

		# Remove all internal nodes with labels that are unknown
		# (not in the constituent list).
		for n in tree.internal_nodes():
			if n.label not in constits:
		                p = n.parent()
				assert p != None
				p.children = n.left_siblings() + n.children + n.right_siblings()
				tree = parsetree.refresh(tree)
				if n.label not in unknown_constits:
					unknown_constits[n.label] = 1
					sys.stderr.write("Stripping unknown label: %s\n" % n.label)

		to_print = False
		for n in tree.leaves():
			if n.headword not in [":", ",", ".", "``", "''", "?", "!"]:
				to_print = True
				break

		if to_print: print tree.to_string()
		else: sys.stderr.write("Skipping all punctuation tree\n")

		if sentence % 100 == 0:
			debug(1, "Sentence #%d done" % sentence)
		else:
			debug(2, "Sentence #%d done" % sentence)

Esempio n. 7

0

Mostra file

File: find-split-line.py Progetto: arne-cl/turian-parser

def main():
	assert len(sys.argv) == 3

	sentence = 0
	f = open(sys.argv[1], "rt")
	total_actions = 0
	for l in f:
		sentence += 1
		if not l: assert(0)

		tree = parsetree.read_tree(l)
		assert tree != None
		assert string.strip(tree.to_string()) == string.strip(l)

		total_actions += len(tree.internal_nodes())
		if sentence % 100 == 0:
			debug(1, "Sentence #%d done (first pass)" % sentence)
		else:
			debug(2, "Sentence #%d done (first pass)" % sentence)
	f.close()

	keep_actions = float(sys.argv[2]) * total_actions
	debug(1, "Wish to keep %.2f actions out of %d" % (keep_actions, total_actions))

	split_sentence = 0
	f = open(sys.argv[1], "rt")
	split_actions = 0
	for l in f:
		split_sentence += 1
		if not l: assert(0)

		tree = parsetree.read_tree(l)
		assert tree != None
		assert string.strip(tree.to_string()) == string.strip(l)

		split_actions += len(tree.internal_nodes())
		print string.strip(l)
		if split_actions >= keep_actions:
			break
	f.close()

	debug(1, "Kept %.2f%% (%d of %d) sentences, %.2f%% (%d) actions versus %.2f%% (%.2f) desired)" % (100.*split_sentence/sentence, split_sentence, sentence, 100.*split_actions/total_actions, split_actions, 100.*float(sys.argv[2]), keep_actions))

Esempio n. 8

0

Mostra file

File: preprocess.py Progetto: arne-cl/turian-parser

def main():
	assert len(sys.argv) == 1

	sentence = 0
	for l in sys.stdin:
		sentence += 1

		if not l: assert(0)
#		# Skip blank lines
#		if not string.strip(l):
#			print
#			continue

		tree = parsetree.read_tree(l)
		assert tree != None

		# Sanity check that the tree's already been regularized.
		treestr = tree.to_string()
		tree = parsetree.regularize(tree)
		assert tree.to_string() == treestr

		if duplicate_top_item:
			# Add a second TOP label, s.t. we can raise punctuation
			# above the first TOP label
			node = parsetree.Node()
			node.isleaf = 0
			node.label = "TOP"
			node.children = [tree]
			tree = parsetree.refresh(node)
			tree = parsetree.preprocess(tree)
		else:
			tree = parsetree.preprocess(tree)

		for n in tree.leaves():
			# Make sure that the headtag is a terminal label (POS tag)
			assert vocab.label_to_idx[n.headtag][1] == 1
			# Make sure that the headword is in the vocabulary
			assert vocab.vocab_to_idx[n.headword] > 0

		for n in tree.internal_nodes():
			# Make sure that the label is a constituent label
			assert vocab.label_to_idx[n.label][1] == 0

		print tree.to_string()

		if sentence % 100 == 0:
			debug(1, "Sentence #%d done" % sentence)
		else:
			debug(2, "Sentence #%d done" % sentence)

Esempio n. 9

0

Mostra file

File: toutanova-tag.py Progetto: arne-cl/turian-parser

def main():
	"""Re-tag the trees on stdin with the output of the Toutanova POS tagger.

	The tagger is started via os.popen2(toutanova_cmd) and kept in the
	module globals jmxin / jmxout. For each tree, "-NONE-" nodes are pruned,
	the words are sent to the tagger as one line, and the "word/tag" tokens
	read back overwrite each leaf's headtag. The re-tagged tree is written
	to stdout.
	"""
	# Open the Toutanova pos tagger
	global jmxin, jmxout
	(jmxin, jmxout) = os.popen2(toutanova_cmd)

	# Greedy first group: the token is split at its LAST slash
	slash_re = re.compile("^(.+)\/([^\/]+)$")

	for line in sys.stdin:
		assert line

		tree = parsetree.read_tree(line)
		if tree == None:
			debug(1, "Skipping empty tree")
			continue

		tree.prune_labels(["-NONE-"])

		# Run the tagger on this sentence: one line out, one line back
		words = [leaf.headword for leaf in tree.leaves()]
		jmxin.write(" ".join(words) + "\n")
		jmxin.flush()
		tagged = jmxout.readline().split()
		assert len(tagged) == len(tree.leaves())

		# Change the POS tags at the leaves to match the tagger's output
		for (i, leaf) in enumerate(tree.leaves()):
			m = slash_re.match(tagged[i])
			assert m
			(word, tag) = m.groups()
			assert leaf.headword == word
			leaf.headtag = tag

		sys.stdout.write("%s\n" % tree.to_string())

	jmxin.close()
	jmxout.close()

Esempio n. 10

0

Mostra file

File: jmxtag.py Progetto: arne-cl/turian-parser

def main():
	"""Re-tag the trees on stdin with the output of the JMX POS tagger.

	sys.argv[1] replaces "PROJECTDIR" in jmxcmd, and the resulting command
	is started via os.popen2 and kept in the module globals jmxin / jmxout.
	For each tree, "-NONE-" nodes are pruned, the words are sent to the
	tagger as one line, and the "word_tag" tokens read back overwrite each
	leaf's headtag. The re-tagged tree is written to stdout.
	"""
	assert(len(sys.argv) == 2)

	# Open the JMX pos tagger
	global jmxin, jmxout
	(jmxin, jmxout) = os.popen2(string.replace(jmxcmd, "PROJECTDIR", sys.argv[1]))

	for l in sys.stdin:
		if not l: assert(0)

		tree = parsetree.read_tree(l)
		if tree == None:
			debug(1, "Skipping empty tree")
			continue

		tree.prune_labels(["-NONE-"])

		# Run the JMX pos tagger on this tree.
		jmxstr = string.join([n.headword for n in tree.leaves()]) + "\n"
		jmxin.write(jmxstr)
		jmxin.flush()
		jmxtoks = string.split(jmxout.readline())
		assert len(jmxtoks) == len(tree.leaves())

		# Change the POS tags at the leaves to match the output
		# of the JMX pos tagger
		for (i, n) in enumerate(tree.leaves()):
			# Split at the LAST underscore only: the word itself may contain
			# underscores, which a plain split would turn into too many fields.
			(word, jmxtag) = jmxtoks[i].rsplit("_", 1)
			assert n.headword == word
			n.headtag = jmxtag

		sys.stdout.write("%s\n" % tree.to_string())

	jmxin.close()
	jmxout.close()

Esempio n. 11

0

Mostra file

File: charniak-sentences.py Progetto: arne-cl/turian-parser

def main():
	assert len(sys.argv) == 1

	sentence = 0
	for l in sys.stdin:
		sentence += 1

#		if not l: assert(0)
		# Skip blank lines
		if not string.strip(l):
#			print
			continue

		tree = parsetree.read_tree(l)
		assert tree != None

		print "<s> %s </s>" % string.join([n.headword for n in tree.leaves()])

		if sentence % 100 == 0:
			debug(1, "Sentence #%d done" % sentence)
		else:
			debug(2, "Sentence #%d done" % sentence)

Esempio n. 12

0

Mostra file

File: bikel-sentences.py Progetto: arne-cl/turian-parser

def main():
	assert len(sys.argv) == 1

	sentence = 0
	for l in sys.stdin:
		sentence += 1

#		if not l: assert(0)
		# Skip blank lines
		if not string.strip(l):
#			print
			continue

		tree = parsetree.read_tree(l)
		print "(",
		for n in tree.leaves():
			print "(%s (%s))" % (n.headword, n.label),
		print ")"

		if sentence % 100 == 0:
			debug(1, "Sentence #%d done" % sentence)
		else:
			debug(2, "Sentence #%d done" % sentence)

Esempio n. 13

0

Mostra file

File: postprocess.py Progetto: arne-cl/turian-parser

def main():
	assert len(sys.argv) == 2
	debug(1, "Opening files:\n\t%s\n\t%s\n" % (postprocess_gold(sys.argv[1]), postprocess_jmx(sys.argv[1])))
	treebank_file = open(postprocess_gold(sys.argv[1]))
	jmx_file = open(postprocess_jmx(sys.argv[1]))

	sentence = 0
	for l in sys.stdin:
		sentence += 1
		if not l: assert(0)
		if l == "\n":
			lt = treebank_file.readline()
			lj = jmx_file.readline()
			print
			continue

		tree = parsetree.read_tree(l)
		assert tree != None

		if duplicate_top_item:
			assert(0)
			# Add a second TOP label, s.t. we can raise punctuation
			# above the first TOP label
			node = parsetree.Node()
			node.isleaf = 0
			node.label = "TOP"
			node.children = [tree]
			tree = parsetree.refresh(node)
#		else:
#			tree = parsetree.refresh(tree)

		lt = treebank_file.readline()
		treebank_tree = parsetree.read_tree(lt)
		assert treebank_tree != None
		treebank_tree = parsetree.reverse_regularize(treebank_tree)
		treebank_leaves = [(n.headword, n.headtag) for n in treebank_tree.leaves()]
		del treebank_tree

		lj = jmx_file.readline()
		jmx_tree = parsetree.read_tree(lj)
		assert jmx_tree != None
		jmx_tree = parsetree.reverse_regularize(jmx_tree)
		jmx_leaves = [(n.headword, n.headtag) for n in jmx_tree.leaves()]
		del jmx_tree

		tree = parsetree.reverse_regularize(tree)
		treestr = tree.to_string()
		tree = parsetree.reverse_regularize(tree)
		assert treestr == tree.to_string()

		parsetree.postprocess(tree, origleaves=jmx_leaves, treebank_leaves=treebank_leaves)
		print tree.to_string()

		if sentence % 100 == 0:
			debug(1, "Sentence #%d done" % sentence)
		else:
			debug(2, "Sentence #%d done" % sentence)

	assert(not treebank_file.readline())
	assert(not jmx_file.readline())
	treebank_file.close()
	jmx_file.close()

Esempio n. 14

0

Mostra file

File: analyze-errors.py Progetto: arne-cl/turian-parser

def main():
    assert len(sys.argv) == 2
    assert sys.argv[1] == "devel" or sys.argv[1] == "train"
    check_parsefiles(sys.argv[1])
    debug(1, "Opening files:\n\t%s\n\t%s\n" % (postprocess_gold[sys.argv[1]], postprocess_jmx[sys.argv[1]]))
    gold_file = open(postprocess_gold[sys.argv[1]])
    # 	jmx_file = open(postprocess_jmx[sys.argv[1]])

    sentence = 0

    all_types = {}

    test_constits = {}
    gold_constits = {}

    test_constits_totals = {}
    gold_constits_totals = {}

    for l in sys.stdin:
        sentence += 1
        if not l:
            assert 0

        test_tree = parsetree.read_tree(l)
        assert test_tree != None
        test_tree = parsetree.normalize(test_tree)
        test_leaves = [(n.headword, n.headtag) for n in test_tree.leaves()]

        lt = gold_file.readline()
        gold_tree = parsetree.read_tree(lt)
        assert gold_tree != None
        gold_tree = parsetree.normalize(gold_tree)
        gold_leaves = [(n.headword, n.headtag) for n in gold_tree.leaves()]

        # Make sure we're comparing the same sentences
        assert test_leaves == gold_leaves

        for n in test_tree.internal_nodes():
            if n.label == "TOP":
                continue

            s = n.span()
            c = "Sentence #%d: %s @ [%d, %d]" % (sentence, n.label, s[0], s[1])

            # Types of constituents we are analyzing:
            # 	* All constituents
            # 	* Constituents broken down by label
            # 	* Constituents broken down by number of children
            # 	* Constituents broken down by label, number of children
            # 			types = ["all", "label %s" % n.label, "%d children" % len(n.children), "label %s with %d children" % (n.label, len(n.children))]
            types = ["all", "label %s" % n.label, "%d children" % len(n.children)]

            for t in types:
                all_types[t] = 1
                if t not in test_constits:
                    test_constits[t] = {}
                    test_constits_totals[t] = 0
                if c not in test_constits[t]:
                    test_constits[t][c] = 0
                test_constits_totals[t] += 1
                test_constits[t][c] += 1

        for n in gold_tree.internal_nodes():
            if n.label == "TOP":
                continue

            s = n.span()
            c = "Sentence #%d: %s @ [%d, %d]" % (sentence, n.label, s[0], s[1])

            # Types of constituents we are analyzing:
            # 	* All constituents
            # 	* Constituents broken down by label
            # 	* Constituents broken down by number of children
            # 	* Constituents broken down by label, number of children
            # 			types = ["all", "label %s" % n.label, "%d children" % len(n.children), "label %s with %d children" % (n.label, len(n.children))]
            types = ["all", "label %s" % n.label, "%d children" % len(n.children)]

            for t in types:
                all_types[t] = 1
                if t not in gold_constits:
                    gold_constits[t] = {}
                    gold_constits_totals[t] = 0
                if c not in gold_constits[t]:
                    gold_constits[t][c] = 0
                gold_constits_totals[t] += 1
                gold_constits[t][c] += 1

        if sentence % 100 == 0:
            debug(1, "Sentence #%d done" % sentence)
        else:
            debug(2, "Sentence #%d done" % sentence)

    gsum = 0
    tsum = 0
    msum = 0
    # FIXME: Don't hardcode this
    for i in range(128):
        t = "%d children" % i
        if t in gold_constits_totals:
            gsum += gold_constits_totals[t]
        if t in test_constits_totals:
            tsum += test_constits_totals[t]
    assert gsum == gold_constits_totals["all"]
    assert tsum == test_constits_totals["all"]

    alltot = test_constits_totals["all"] + gold_constits_totals["all"]
    print "Total constituents in test + gold: %d" % alltot

    all_error_fms = 0.0
    nonall_error_fms = 0.0

    sorted_types = []
    for t in all_types:
        if t not in test_constits:
            test_constits[t] = {}
            test_constits_totals[t] = 0
        if t not in gold_constits:
            gold_constits[t] = {}
            gold_constits_totals[t] = 0

        tot = test_constits_totals[t] + gold_constits_totals[t]

        str = ""
        str += "\n"
        str += "Breakdown type: %s\n" % t
        str += "comprising %.2f%% (%d/%d) of all constituents\n" % (100.0 * tot / alltot, tot, alltot)

        testmatch = 0
        goldmatch = 0
        testtot = 0
        goldtot = 0

        allkeys = {}
        for k in test_constits[t].keys() + gold_constits[t].keys():
            allkeys[k] = True
        for c in allkeys:
            testmatch += min(test_constits[t].get(c, 0), gold_constits["all"].get(c, 0))
            goldmatch += min(test_constits["all"].get(c, 0), gold_constits[t].get(c, 0))
            goldtot += gold_constits[t].get(c, 0)
            testtot += test_constits[t].get(c, 0)

        assert goldtot == gold_constits_totals[t]
        assert testtot == test_constits_totals[t]

        # BUG: These error FMS are all skewed!
        # To see this, observe the "VP with 2 children" has more attributed error than just "VP"

        # error_fms = 1. * (testtot + goldtot - 2 * match) / alltot
        error_fms = 1.0 * (testtot + goldtot - goldmatch - testmatch) / alltot
        str += "overall error incurred by this constituent type = %.3f%% (%d/%d)\n" % (
            100.0 * error_fms,
            testtot + goldtot - goldmatch - testmatch,
            alltot,
        )
        # errprc = 1. * (testtot - match) / test_constits_totals["all"]
        # errrcl = 1. * (goldtot - match) / gold_constits_totals["all"]
        ##errprc = 1. * (test_constits_totals["all"] - testtot + match) / test_constits_totals["all"]
        ##errrcl = 1. * (gold_constits_totals["all"] - goldtot + match) / gold_constits_totals["all"]
        # if errprc == 0 or errrcl == 0: error_fms = 0
        # else: error_fms = 1-2*errrcl*errprc/(errrcl+errprc)
        # str += "overall error incurred by this constituent type = %.3f%%\n" % (100.*error_fms)
        # str += "overall PRC error incurred by this constituent type = %.3f%% (%d/%d)\n" % (100.*errprc,testtot - match, test_constits_totals["all"])
        # str += "overall RCL error incurred by this constituent type = %.3f%% (%d/%d)\n" % (100.*errrcl,goldtot - match, gold_constits_totals["all"])

        if t == "all":
            all_error_fms += error_fms
        else:
            nonall_error_fms += error_fms

        if testtot == 0:
            lprc = 0
        else:
            lprc = 1.0 * testmatch / testtot

        if goldtot == 0:
            lrc = 0
        else:
            lrcl = 1.0 * goldmatch / goldtot

        if lprc == 0 or lrcl == 0:
            lfms = 0
        else:
            lfms = 2 * lrcl * lprc / (lrcl + lprc)
        str += "LFMS = %.3f%%\n" % (100.0 * lfms)
        str += "LPRC = %.3f%% (%d/%d)\n" % (100.0 * lprc, testmatch, testtot)
        str += "LRCL = %.3f%% (%d/%d)\n" % (100.0 * lrcl, goldmatch, goldtot)

        sorted_types.append((error_fms, str))

    sorted_types.sort()
    sorted_types.reverse()
    for (error_fms, str) in sorted_types:
        print str

    assert not gold_file.readline()
    # 	assert(not jmx_file.readline())
    gold_file.close()

Esempio n. 15

0

Mostra file

File: tdiff.py Progetto: arne-cl/turian-parser

def main():
	difftxt = "***"
	if len(sys.argv) == 4:
		difftxt = sys.argv[3]
		sys.argv = sys.argv[:3]

	assert(len(sys.argv) == 3)
	debug(1, "Using '%s' as difftxt." % difftxt)
	debug(1, "Opening files:\n\t%s\n\t%s\n" % (sys.argv[1], sys.argv[2]))

	in_file = open(sys.argv[1])
	out_file = open(sys.argv[2])

	sentence = 0
	for lin in in_file:
		sentence += 1

		lout = out_file.readline()

		assert lin
		assert lout

		in_tree = parsetree.read_tree(lin)
		out_tree = parsetree.read_tree(lout)

		assert in_tree != None
		assert out_tree != None

		# Find all constituents in the in_tree and in the out_tree
		in_nodes = {}
                for n in in_tree.internal_nodes():
			nt = node_txt(n)
			# Check that there are no unaries to self
			assert nt not in in_nodes
			in_nodes[nt] = True
		out_nodes = {}
                for n in out_tree.internal_nodes():
			nt = node_txt(n)
			# Check that there are no unaries to self
			assert nt not in out_nodes
			out_nodes[nt] = True

		assert(len(in_nodes) <= len(out_nodes))
		# Sanity check:
		# Make sure that every constituent in the in_tree
		# is also in the out_tree
		for n in in_nodes: assert n in out_nodes

		found_diff = False

		# Find all constituents in out_tree that are not
		# in in_tree
                for n in out_tree.internal_nodes():
			nt = node_txt(n)

			# If the constituent is not present in in_tree,
			# then add difftxt to this node's label
			if nt not in in_nodes:
				found_diff = True
				n.label += difftxt
			# Otherwise, remove this node from in_tree's
			# list, since we don't want to use it twice
			# [This should have no effect if there are no
			# unary projections to self]
			else: del in_nodes[nt]

		if not found_diff:
			assert lin == lout
			assert in_tree.to_string() == out_tree.to_string()
			debug(1, "WARNING: No diff found for sentence #%d: %s" % (sentence, lin))

		print out_tree.to_string()

#		if sentence % 100 == 0:
#			debug(1, "Sentence #%d done" % sentence)
#		else:
#			debug(2, "Sentence #%d done" % sentence)

	assert(not in_file.readline())
	assert(not out_file.readline())
	in_file.close()
	out_file.close()