def check_add_prob(prob, a, b, c, begin, end, split):
	"""Record the production a => (b c) for [begin, end] and keep the best score.

	Names not defined here (chart, covering, pi, viterbi_back, verbose,
	start_symbol, util) are resolved from the surrounding scope.

	When prob is positive, a is added to chart[begin, end]; if a is also the
	start symbol, the pair (b, c) is added to covering[begin, end, a].

	Independently of that, when prob is strictly greater than
	pi[begin, end, a], the stored score is replaced by prob and
	viterbi_back[begin, end, a] is set to [b, c, split].

	Returns True when pi was updated, False otherwise.
	"""
	span = (begin, end)
	key = (begin, end, a)
	details = (a, b, c, begin, end, split)

	# add production to this chart location
	if prob > 0:
		if verbose > 1:
			util.log_p("add C %s => (%s %s) to [%d, %d] split: %d." % details)
		chart[span].add(a)
		# covering productions are only stored for the start symbol
		if a == start_symbol:
			covering[key].add((b, c))

	# strict comparison: on a tie the existing entry is left unchanged
	if not prob > pi[key]:
		return False

	if verbose > 1:
		util.log_p("add pi %s => (%s %s) to [%d, %d] split: %d." % details)
	pi[key] = prob
	viterbi_back[key] = [b, c, split]
	return True
Beispiel #2
0
def _load_grammar(args, verbose, lower_case):
	"""Return the grammar, unpickled from args.pickle or built from args.treebank.

	When args.pickle is set, the grammar is unpickled from that path.
	Otherwise a grammar.Grammar is built from the file at args.treebank
	(with args.grammar_limit) and, if args.save is set, pickled to
	args.save + '.pkl'.
	"""
	if args.pickle:
		if verbose:
			util.log_g("Loading grammar from pickle file %s" % (args.pickle))
		# NOTE: unpickling can execute arbitrary code; only load trusted files.
		with open(args.pickle, 'rb') as pkl_file:
			G = pickle.load(pkl_file)
	else:
		if verbose:
			util.log_g("Loading grammar from treebank %s" % (args.treebank))
		with open(args.treebank, 'r') as treebank_file:
			G = grammar.Grammar(treebank_file, args.grammar_limit, verbose, lower_case)
		if args.save:
			with open(args.save + '.pkl', 'wb') as output:
				pickle.dump(G, output)
	if verbose:
		util.log_g("Grammar loaded.")
	return G


def _run_checks(G, check, ambiguous, mlps, non_terms_for_ml):
	"""Log the requested consistency checks and statistics for grammar G.

	check    -- truthy to run every report below
	ambiguous -- truthy to run the ambiguity report; when it is a string,
	             each whitespace-separated word in it is looked up
	mlps     -- truthy to print the most likely productions
	non_terms_for_ml -- non-terminals passed to G.most_likely_productions
	"""
	if check:
		util.log_g("Testing probability consistencies.")
		divergence = max(abs(1 - total) for total in G.check_pcfg_sums())
		util.log_g("Greatest divergence from unity: %0.20f." % divergence)
	if check or ambiguous:
		util.log_g("Ambiguous word tests.")
		ambig = G.ambiguous()
		# first element of every entry returned by G.ambiguous()
		ambig_words = [entry[0] for entry in ambig] if ambig else []
		if ambiguous and not isinstance(ambiguous, bool):
			for word in ambiguous.split():
				if word in ambig_words:
					util.log_g("'%s' is ambiguous." % (word))
					pprint.pprint(ambig[ambig_words.index(word)])
				else:
					util.log_g("'%s' is not ambiguous." % (word))
		else:
			util.log_g("4 randomly chosen syntactically ambiguous terminals:")
			pprint.pprint(ambig[0:4])
	if check or mlps:
		util.log_g("Most likely production for non-terminals %s:" % non_terms_for_ml)
		pprint.pprint(G.most_likely_productions(non_terms_for_ml))


def _parse_interactive(G, verbose, lower_case):
	"""Prompt for sentences and log their Viterbi parse until an empty line."""
	util.log_p("Enter new line to exit.")
	while True:
		s = raw_input('Enter a sentence to parse: ')
		if not s:
			break
		if verbose:
			util.log_p("Start CYK")
		parse = cyk.CYK(G, s, verbose, lower_case)
		if verbose > 1:
			util.log_p("Covering productions:")
			pprint.pprint(parse.covering_productions())
			util.log_p("Covering productions string: %s" % parse.covering_productions_str())
		util.log_p("Viterbi Parse: %s" % parse.viterbi_parse())


def _parse_file(G, path, verbose, test, limit, start, max_word_length):
	"""Parse the sentences in the file at path, writing one result line each.

	Results go to 'viterbi_sentences.txt' when test is truthy, otherwise to
	'covering_productions.txt'. Lines are counted from 1: lines numbered
	below start are skipped without output, and processing stops once limit
	lines have been consumed. A sentence with more than max_word_length
	whitespace-separated tokens produces an empty output line instead of
	being parsed.
	"""
	out_path = 'viterbi_sentences.txt' if test else 'covering_productions.txt'
	# Context managers ensure both files are closed (and the output flushed)
	# even if parsing raises part-way through.
	with open(path) as sentences:
		with open(out_path, 'w') as results:
			for i, line in enumerate(sentences, 1):
				if limit and i > limit:
					break
				if start and i < start:
					continue
				if max_word_length and len(line.split()) > max_word_length:
					# keep output aligned with input: one line per sentence
					results.write("\n")
					continue
				util.log_p("Sentence %d, parsing sentence: << %s >>" % (i, line.strip()))
				# NOTE(review): interactive mode also passes lower_case to
				# cyk.CYK; confirm whether omitting it here is intentional.
				parse = cyk.CYK(G, line, verbose)
				# write parse results to output file
				if test:
					out = parse.viterbi_parse()
					if out == util.NOT_IN_GRAMMAR_ERROR:
						out = "\n"
					else:
						out += "\n"
					results.write(out)
				else:
					out = parse.covering_productions_str()
					results.write(out + "\n")
				if verbose:
					util.log_p("Wrote line: %s" % out)
				gc.collect() # collect cyk object


def main():
	"""Load a grammar, optionally report statistics on it, and run CYK parsing.

	All options are read from the module-level ``args`` object:
	the grammar comes from a pickle or a treebank; ``check``, ``ambiguous``
	and ``most_likely_productions`` select the reports; ``cyk`` selects
	parsing, interactive when it is a bool and from the named file otherwise.
	"""
	# extract args
	verbose = args.very_verbose or args.verbose
	check = args.check
	ambiguous = args.ambiguous
	mlps = args.most_likely_productions
	lower_case = args.lower_case
	test = args.test
	# a string value names the non-terminals explicitly; a bare flag uses defaults
	if mlps and not isinstance(mlps, bool):
		non_terms_for_ml = mlps.split()
	else:
		non_terms_for_ml = ['VP', 'S', 'NP', 'SBAR', 'PP']
	max_word_length = 15

	# loading grammar
	G = _load_grammar(args, verbose, lower_case)

	# running checks and statistics
	_run_checks(G, check, ambiguous, mlps, non_terms_for_ml)

	# running CYK
	if args.cyk:
		if isinstance(args.cyk, bool):
			_parse_interactive(G, verbose, lower_case)
		else:
			_parse_file(G, args.cyk, verbose, test,
				args.parser_test_limit, args.parser_test_start, max_word_length)