Ejemplo n.º 1
0
def main():
	"""Build the vocabulary from the parse trees read on standard input.

	Every non-blank line of stdin is parsed with parsetree.read_tree.  If
	remove_quotation_marks is set, the labels "``" and "''" are pruned from
	the tree; if lowercase_vocabulary is set, the headword of every leaf is
	lowercased.  The tree is then refreshed and handed to vocab.add.  Once
	all input has been consumed, vocab.write() is called.

	Raises AssertionError if a non-blank line does not yield a tree.
	"""
	vocab.init()

	sentence = 0
	for line in sys.stdin:
		# Blank lines are skipped but still count towards the sentence number.
		sentence += 1
		if not line.strip():
			continue

		tree = parsetree.read_tree(line)
		assert tree is not None
		if remove_quotation_marks:
			tree.prune_labels(["``", "''"])

		# The flag does not change per leaf, so test it once per tree.
		if lowercase_vocabulary:
			for leaf in tree.leaves():
				leaf.headword = leaf.headword.lower()
		tree = parsetree.refresh(tree)

		vocab.add(tree)
		# Drop our reference to the tree before reading the next line.
		del tree

		# Progress report: debug level 1 every 1000 sentences, otherwise
		# debug level 2 every 100 sentences.
		if sentence % 1000 == 0:
			debug(1, "Sentence #%d done" % sentence)
		elif sentence % 100 == 0:
			debug(2, "Sentence #%d done" % sentence)

	vocab.write()
Ejemplo n.º 2
0
def postprocess_closed_class_compounds(tree):
	"""Expand "+"-joined compound leaves back into bracketed structure.

	For every leaf of `tree` whose lowercased headword equals the words of an
	entry of `compounds` joined with "+", the entry's structure template is
	instantiated: "*P*" is replaced by the label of the leaf's parent, "*L*"
	by the space-joined to_string() of the leaf's left siblings and "*R*" by
	that of its right siblings.  The resulting string is parsed with
	parsetree.read_tree and its children replace the children of the leaf's
	parent; the tree is then refreshed.

	Requires the flatten_closed_class_compounds flag to be set.

	Returns the (refreshed) tree.
	Raises AssertionError if the flag is unset or a matching leaf has no
	parent.
	"""
	assert flatten_closed_class_compounds
	for n in tree.leaves():
		# The headword does not depend on the compound being tried.
		headword = n.headword.lower()
		for (words, pos, structure) in compounds:
			if headword != "+".join(words):
				continue

			# Fetch and validate the parent before anything dereferences it.
			p = n.parent()
			assert p is not None

			# Exception for "sort of", in the case that p is an NP
			# (this finds the adjectival usage of "sort of", as opposed to
			# the default adverbial usage).
			if words == ["sort", "of"] and p.label == "NP":
				structure = "(*P* (NP *L* (NN sort)) (PP (IN of) *R*))"

			left = " ".join([sib.to_string() for sib in n.left_siblings()])
			right = " ".join([sib.to_string() for sib in n.right_siblings()])
			# Substitution order (*P*, *L*, *R*) is kept as in the template
			# expansion, since later replacements also see earlier output.
			structure = structure.replace("*P*", p.label)
			structure = structure.replace("*L*", left)
			structure = structure.replace("*R*", right)

			newnode = parsetree.read_tree(structure)
			p.children = newnode.children
			tree = parsetree.refresh(tree)

	return tree
Ejemplo n.º 3
0
def transform(tree):
	"""Apply transform_node to every node of `tree`, children first.

	Requires the add_basal_nps flag to be set.  For a non-leaf node the
	children are transformed recursively and the node is refreshed before
	transform_node is applied to it.  Returns the result of transform_node.
	"""
	assert add_basal_nps

	# Leaves have no children to process first.
	if tree.isleaf:
		return transform_node(tree)

	# Bottom-up: rewrite the subtrees, then the node itself.
	tree.children = [transform(child) for child in tree.children]
	refreshed = parsetree.refresh(tree)
	return transform_node(refreshed)
Ejemplo n.º 4
0
def main():
	"""Preprocess the parse trees read on stdin and print them to stdout.

	Takes no command-line arguments.  Each input line is parsed, checked to
	be unchanged by parsetree.regularize, optionally wrapped in an extra TOP
	node (when duplicate_top_item is set), run through parsetree.preprocess,
	validated against the vocabulary tables and printed.
	"""
	assert len(sys.argv) == 1

	sentence = 0
	for line in sys.stdin:
		sentence += 1

		# Blank lines are not skipped here; only an empty string is rejected.
		assert line

		tree = parsetree.read_tree(line)
		assert tree != None

		# Sanity check that the tree's already been regularized:
		# regularizing it again must leave its string form unchanged.
		before = tree.to_string()
		tree = parsetree.regularize(tree)
		assert tree.to_string() == before

		if duplicate_top_item:
			# Add a second TOP label, s.t. we can raise punctuation
			# above the first TOP label
			top = parsetree.Node()
			top.isleaf = 0
			top.label = "TOP"
			top.children = [tree]
			tree = parsetree.refresh(top)
		tree = parsetree.preprocess(tree)

		for leaf in tree.leaves():
			# Make sure that the headtag is a terminal label (POS tag)
			assert vocab.label_to_idx[leaf.headtag][1] == 1
			# Make sure that the headword is in the vocabulary
			assert vocab.vocab_to_idx[leaf.headword] > 0

		for inner in tree.internal_nodes():
			# Make sure that the label is a constituent label
			assert vocab.label_to_idx[inner.label][1] == 0

		print(tree.to_string())

		# Every 100th sentence is reported at debug level 1, the rest at 2.
		level = 2
		if sentence % 100 == 0:
			level = 1
		debug(level, "Sentence #%d done" % sentence)
Ejemplo n.º 5
0
def main():
	"""Clean the parse trees read on stdin and print them to stdout.

	Takes no command-line arguments.  Blank lines are skipped.  For every
	other line the tree is parsed, each internal node whose label is not in
	`constits` is spliced out (its children take its place under its
	parent), and the tree is printed unless all of its leaves are
	punctuation, in which case a notice is written to stderr instead.

	Each unknown label is reported on stderr the first time it is seen and
	recorded in `unknown_constits`.
	"""
	assert len(sys.argv) == 1

	punctuation = (":", ",", ".", "``", "''", "?", "!")

	sentence = 0
	for line in sys.stdin:
		# Blank lines are skipped but still count towards the sentence number.
		sentence += 1
		if not line.strip():
			continue

		tree = parsetree.read_tree(line)
		assert tree is not None

		# SANITY CHECK:
		# Ensure that the cleaned output is "stable", i.e. that
		# this script will produce identical output if we pipe
		# cleaned output from this script back into it.
		assert tree.to_string() == parsetree.read_tree(tree.to_string()).to_string()

		# Remove all internal nodes with labels that are unknown
		# (not in the constituent list).
		for n in tree.internal_nodes():
			if n.label in constits:
				continue
			p = n.parent()
			assert p is not None
			# Splice n out: its children take its position under p.
			p.children = n.left_siblings() + n.children + n.right_siblings()
			tree = parsetree.refresh(tree)
			if n.label not in unknown_constits:
				unknown_constits[n.label] = 1
				sys.stderr.write("Stripping unknown label: %s\n" % n.label)

		# Print the tree only if at least one leaf is not punctuation.
		if any(leaf.headword not in punctuation for leaf in tree.leaves()):
			print(tree.to_string())
		else:
			sys.stderr.write("Skipping all punctuation tree\n")

		# Every 100th sentence is reported at debug level 1, the rest at 2.
		if sentence % 100 == 0:
			debug(1, "Sentence #%d done" % sentence)
		else:
			debug(2, "Sentence #%d done" % sentence)
Ejemplo n.º 6
0
def main():
	"""Postprocess parser output on stdin against the gold and jmx files.

	Takes exactly one command-line argument, which is mapped to two file
	names by postprocess_gold and postprocess_jmx.  Both files are opened,
	read in step with stdin (one line per input line) and closed again even
	if processing fails part-way through.
	"""
	assert len(sys.argv) == 2
	gold_path = postprocess_gold(sys.argv[1])
	jmx_path = postprocess_jmx(sys.argv[1])
	debug(1, "Opening files:\n\t%s\n\t%s\n" % (gold_path, jmx_path))

	# try/finally so that a failed assertion or parse error does not leak
	# the open file handles.
	treebank_file = open(gold_path)
	try:
		jmx_file = open(jmx_path)
		try:
			_postprocess_sentences(treebank_file, jmx_file)
		finally:
			jmx_file.close()
	finally:
		treebank_file.close()


def _postprocess_sentences(treebank_file, jmx_file):
	"""Postprocess each stdin tree using the matching treebank/jmx lines.

	For every line on stdin one line is consumed from each companion file.
	A bare newline on stdin produces an empty output line.  Otherwise the
	(word, tag) pairs of the leaves of the reverse-regularized treebank and
	jmx trees are passed to parsetree.postprocess together with the
	reverse-regularized input tree, which is then printed.

	Raises AssertionError if any tree fails to parse, if reverse
	regularization is not idempotent on the input tree, or if either
	companion file has lines left once stdin is exhausted.
	"""
	sentence = 0
	for line in sys.stdin:
		sentence += 1
		assert line
		if line == "\n":
			# Consume the corresponding lines so the files stay in step.
			treebank_file.readline()
			jmx_file.readline()
			print("")
			continue

		tree = parsetree.read_tree(line)
		assert tree is not None

		if duplicate_top_item:
			# This assertion always fails, so the wrapping below only runs
			# when assertions are disabled (python -O).
			assert(0)
			# Add a second TOP label, s.t. we can raise punctuation
			# above the first TOP label
			node = parsetree.Node()
			node.isleaf = 0
			node.label = "TOP"
			node.children = [tree]
			tree = parsetree.refresh(node)

		treebank_tree = parsetree.read_tree(treebank_file.readline())
		assert treebank_tree is not None
		treebank_tree = parsetree.reverse_regularize(treebank_tree)
		treebank_leaves = [(n.headword, n.headtag) for n in treebank_tree.leaves()]
		del treebank_tree

		jmx_tree = parsetree.read_tree(jmx_file.readline())
		assert jmx_tree is not None
		jmx_tree = parsetree.reverse_regularize(jmx_tree)
		jmx_leaves = [(n.headword, n.headtag) for n in jmx_tree.leaves()]
		del jmx_tree

		# Sanity check: reverse-regularizing a second time must not change
		# the tree's string form.
		tree = parsetree.reverse_regularize(tree)
		treestr = tree.to_string()
		tree = parsetree.reverse_regularize(tree)
		assert treestr == tree.to_string()

		parsetree.postprocess(tree, origleaves=jmx_leaves, treebank_leaves=treebank_leaves)
		print(tree.to_string())

		# Every 100th sentence is reported at debug level 1, the rest at 2.
		if sentence % 100 == 0:
			debug(1, "Sentence #%d done" % sentence)
		else:
			debug(2, "Sentence #%d done" % sentence)

	# Both companion files must be exhausted together with stdin.
	assert(not treebank_file.readline())
	assert(not jmx_file.readline())
Ejemplo n.º 7
0
def preprocess_closed_class_compounds(tree):
	"""Collapse runs of leaves that spell a known compound into one leaf.

	Requires the flatten_closed_class_compounds flag to be set.

	The leaves of `tree` are scanned for consecutive leaves whose lowercased
	headwords equal the `words` of an entry of `compounds`.  On a match, the
	lowest common ancestor p of those leaves is replaced, among the children
	of p's parent, by a new internal node with p's label whose children are
	the collected left siblings, a single new leaf (tag `pos`, word = the
	words joined with "+"), and the collected right siblings.  The tree is
	then refreshed and the scan restarts; the loop ends when a complete scan
	finds no match.  The `structure` field of each entry is not used here.

	Node references returned by lowest_common_ancestor() and ancestors() are
	called (p(), a()) to obtain the node -- presumably weak references;
	confirm against parsetree.

	Returns the (refreshed) tree.
	"""
	assert flatten_closed_class_compounds

	# `found` drives the outer loop: rescan from scratch after every
	# replacement, stop after a scan that replaced nothing.
	found = True
	while found:
		found = False
		leaves = tree.leaves()
		for i in range(len(leaves)):
			for (words, pos, structure) in compounds:
				l = len(words)
				# NOTE(review): with ">=" a compound whose last word is the
				# final leaf (i + l == len(leaves)) is never matched, although
				# leaves[i:i+l] would be in range.  Confirm whether this is
				# intentional or an off-by-one.
				if i + l >= len(leaves):
					continue
				match = True
				for j in range(l):
					if string.lower(leaves[i+j].headword) != words[j]:
						match = False
						break
				if not match: continue
	
#				print tree.to_string()
#				print words
				p = parsetree.lowest_common_ancestor(leaves[i+0:i+l])
	
				left_siblings = []
				right_siblings = []
	
				# Collect, for left siblings for the new node,
				# the left siblings of all ancestors of the leftmost word,
				# from the child of p down to the leftmost word.
				leftmost_ancestors = leaves[i+0].ancestors()
				leftmost_ancestors.reverse()
				# `found` is temporarily reused here to mean "p has been
				# reached in the reversed ancestor list"; the assert below
				# guarantees it is True again afterwards.
				found = False
				for a in leftmost_ancestors:
					if a == p: found = True
					elif found == True: left_siblings += a().left_siblings()
				assert found
				left_siblings += leaves[i+0].left_siblings()
	
				# Collect, for right siblings for the new node,
				# the right siblings of all ancestors of the rightmost word,
				# from rightmost word up through the child of p.
				right_siblings += leaves[i+l-1].right_siblings()
				for a in leaves[i+l-1].ancestors():
					if a == p: break
					right_siblings += a().right_siblings()
				# Relies on the loop variable `a` surviving the loop: it must
				# have stopped at p via the break above.
				assert a == p
	
				# Sanity check: the compound words plus all leaves under the
				# collected siblings must account for every leaf under p.
				leafcnt = len(words)
				for n in left_siblings + right_siblings:
					leafcnt += len(n.leaves())
				assert(len(p().leaves()) == leafcnt)
	
				# Exception for "sort of", in the case that p is an NP
				# (this finds the adjectival usage of "sort of", as opposed to
				# the default adverbial usage).
				if words == ["sort", "of"] and p().label == "NP":
					compound = parsetree.Node(tag="JJ", word=string.join(words, "+"))
				else:
					compound = parsetree.Node(tag=pos, word=string.join(words, "+"))
				# Flat replacement for p: same label, siblings around the
				# single compound leaf.
				node = parsetree.Node()
				node.isleaf = 0
				node.label = p().label
				node.children = left_siblings + [compound] + right_siblings
	
#				print [l.to_string() for l in p().leaves()]
#				print [n.to_string() for n in left_siblings], [n.to_string() for n in right_siblings]
#				print len(p().leaves()), leafcnt
#				if len(left_siblings) > len(leaves[i+0].left_siblings()):
#				print words, p().to_string(), node.to_string()
	
#				print p().to_string()
				newchildren = p().left_siblings() + [node] + p().right_siblings()
				# NOTE(review): this tests the `parent` attribute itself, not
				# the node obtained by calling it on the next line -- confirm
				# that this is the intended check.
				assert p().parent != None
				p().parent().children = newchildren
#				sys.stdout.flush()

				# A replacement was made: leave both for-loops so the tree can
				# be refreshed and the leaves recomputed.
				found = True
				break
			if found: break
		if found: tree = parsetree.refresh(tree)
	return tree