def check_add_prob(prob, a, b, c, begin, end, split):
    """Record the production a -> b c for chart cell [begin, end].

    Only acts when prob is positive: marks a as derivable in the cell,
    remembers (b, c) as a covering production when a is the start symbol,
    and updates the Viterbi max-probability table and backpointers.
    Operates on enclosing-scope state (chart, covering, pi, viterbi_back,
    verbose, start_symbol). Returns True iff the production was recorded.
    """
    if prob <= 0:
        return False
    if verbose > 1:
        util.log_p("add C %s => (%s %s) to [%d, %d] split: %d." % (a, b, c, begin, end, split))
    chart[begin, end].add(a)
    # remember covering productions for the start symbol
    if a == start_symbol:
        covering[begin, end, a].add((b, c))
    # strict > means ties keep the earlier backpointer (no change on equal prob)
    if prob > pi[begin, end, a]:
        if verbose > 1:
            util.log_p("add pi %s => (%s %s) to [%d, %d] split: %d." % (a, b, c, begin, end, split))
        pi[begin, end, a] = prob
        viterbi_back[begin, end, a] = [b, c, split]
    return True
def main():
    """Entry point for the PCFG/CYK driver.

    Loads a grammar (from a pickle or by inducing it from a treebank),
    optionally runs consistency checks and statistics, then parses
    sentences with CYK either interactively (stdin) or in batch from a
    file, writing results to viterbi_sentences.txt or
    covering_productions.txt.
    """
    # extract args (module-level argparse namespace `args`)
    p = args.pickle
    verbose = args.very_verbose or args.verbose
    check = args.check
    ambiguous = args.ambiguous
    mlps = args.most_likely_productions
    lower_case = args.lower_case
    test = args.test
    # argparse flags here may be bool (flag given bare) or a string value
    non_terms_for_ml = mlps.split() if mlps and mlps.__class__ != bool else ['VP', 'S', 'NP', 'SBAR', 'PP']
    # skip over-long sentences in batch mode (CYK is cubic in sentence length)
    max_word_length = 15

    # --- load grammar ---
    if p:
        if verbose:
            util.log_g("Loading grammar from pickle file %s" % (p))
        # `with` ensures the handle is closed even if unpickling raises
        with open(p, 'rb') as pkl_file:
            G = pickle.load(pkl_file)
    else:
        if verbose:
            util.log_g("Loading grammar from treebank %s" % (args.treebank))
        with open(args.treebank, 'r') as f:
            G = grammar.Grammar(f, args.grammar_limit, verbose, lower_case)
        if args.save:
            with open(args.save + '.pkl', 'wb') as output:
                pickle.dump(G, output)
    if verbose:
        util.log_g("Grammar loaded.")

    # --- checks and statistics ---
    if check:
        util.log_g("Testing probability consistencies.")
        util.log_g("Greatest divergence from unity: %0.20f." % max([abs(1 - i) for i in G.check_pcfg_sums()]))
    if check or ambiguous:
        util.log_g("Ambiguous word tests.")
        ambig = G.ambiguous()
        ambig_words = zip(*ambig)[0] if ambig else []  # Python 2: zip returns a list
        if ambiguous and not ambiguous.__class__ == bool:
            # specific words were requested on the command line
            for word in ambiguous.split():
                if word in ambig_words:
                    util.log_g("'%s' is ambiguous." % (word))
                    pprint.pprint(ambig[ambig_words.index(word)])
                else:
                    util.log_g("'%s' is not ambiguous." % (word))
        else:
            util.log_g("4 randomly chosen syntactically ambiguous terminals:")
            pprint.pprint(ambig[0:4])
    if check or mlps:
        util.log_g("Most likely production for non-terminals %s:" % non_terms_for_ml)
        mlps = G.most_likely_productions(non_terms_for_ml)
        pprint.pprint(mlps)

    # --- CYK parsing ---
    if args.cyk:
        if args.cyk.__class__ == bool:
            # interactive mode: read sentences from stdin until a blank line
            util.log_p("Enter new line to exit.")
            while True:
                s = raw_input('Enter a sentence to parse: ')
                if len(s):
                    if verbose:
                        util.log_p("Start CYK")
                    parse = cyk.CYK(G, s, verbose, lower_case)
                    if verbose > 1:
                        util.log_p("Covering productions:")
                        pprint.pprint(parse.covering_productions())
                    util.log_p("Covering productions string: %s" % parse.covering_productions_str())
                    util.log_p("Viterbi Parse: %s" % parse.viterbi_parse())
                else:
                    break
        else:
            # batch mode: args.cyk is a filename of sentences, one per line
            f = open(args.cyk)
            limit = args.parser_test_limit
            start = args.parser_test_start
            i = 0
            if test:
                f_vit = open('viterbi_sentences.txt', 'w')
            else:
                f_cov = open('covering_productions.txt', 'w')
            for line in f:
                if limit and i >= limit:
                    break
                i += 1
                if start and i < start:
                    continue
                if max_word_length and len(line.split()) > max_word_length:
                    # too long to parse: emit a blank line to keep output aligned
                    out = "\n"
                    if test:
                        f_vit.write(out)
                    else:
                        f_cov.write(out)
                else:
                    util.log_p("Sentence %d, parsing sentence: << %s >>" % (i, line.strip()))
                    # BUG FIX: pass lower_case, consistent with the interactive
                    # branch — the grammar may have been built lower-cased.
                    parse = cyk.CYK(G, line, verbose, lower_case)
                    # write parse results to output file
                    if test:
                        out = parse.viterbi_parse()
                        if out == util.NOT_IN_GRAMMAR_ERROR:
                            out = "\n"
                        else:
                            out += "\n"
                        f_vit.write(out)
                    else:
                        out = parse.covering_productions_str()
                        f_cov.write(out + "\n")
                    if verbose:
                        util.log_p("Wrote line: %s" % out)
                    gc.collect()  # reclaim the (large) cyk chart object
            f.close()
            if test:
                f_vit.close()
            else:
                f_cov.close()