def test_empty(self):
    """Parse a short WH-question with CKY and check the serialized tree.

    Splits the sentence on single spaces, runs cky.cky with start symbol
    'SBARQ' and the two prepared grammar tables from setUp, and compares
    the JSON-dumped parse tree against the expected bracketing.  The two
    trailing nouns are expected to be mapped to the rare-word token
    '_RARE_' (presumably because 'witch'/'hazel?' fall below the rare-word
    count threshold -- confirm against the preprocessing step).
    """
    original = 'What is witch hazel?'
    # self.prepared[0]/[1] are the grammar tables built in setUp -- their
    # exact contents are defined outside this block.
    result = json.dumps(cky.cky(original.split(' '), 'SBARQ', self.prepared[0], self.prepared[1]))
    print result  # Yeah, a side effect however I'd like to see this in input.
    # Note: the expected value is two adjacent string literals joined by
    # implicit concatenation.
    self.assertEqual(result, '["SBARQ", ["WHNP+PRON", "What"], ["SQ+VP", ["VERB", "is"], ["NP", ["NOUN", "_RARE_"], ' '["NOUN", "_RARE_"]]]]')
def insideOutside(sentence, grammar, count):
    """
    insideOutside() finds the expected number of counts for rules in our
    grammar, given a sentence (the E-step of inside-outside / EM).

    @params: sentence (list of strings), grammar and count dictionary.
    @return: n/a (updates count dictionary in place).
    """
    n = len(sentence)
    trees = cky.cky(grammar, sentence)
    # Keep only complete parses rooted at TOP.
    trees_top = [tree for tree in trees if tree.root == 'TOP']
    inside = getAlpha(sentence, grammar, trees_top)
    outside = getBeta(sentence, grammar, trees_top, inside)
    # Z: inside probability of the full span under the start symbol,
    # used below to normalize all expected counts.
    Z = inside[grammar.start_symbol][0][n-1]
    # mu[lhs][i][j]: posterior weight of lhs spanning words i..j.
    # BUG FIX: the original used [[0]*n]*n, which aliases ONE row n times,
    # so each assignment clobbered every row.  Build rows independently.
    mu = {lhs: [[0]*n for _ in range(n)] for lhs in inside}
    for lhs in mu:
        for i in range(n):
            for j in range(n):
                mu[lhs][i][j] = inside[lhs][i][j]*outside[lhs][i][j]
    # gamma[rule][i][k][j]: expected weight of applying the binary rule to
    # split span (i, j) at position k.
    gamma = {}
    for lhs in grammar.NR:
        for rule in grammar.NR[lhs].values():
            # BUG FIX: same aliasing problem one level deeper
            # ([[[0]*n]*n]*n shared both planes and rows).
            gamma[rule] = [[[0]*n for _ in range(n)] for _ in range(n)]
            for i in range(n-1):
                for j in range(i+1, n):
                    for k in range(i, j):
                        # Assumes CNF: rule.rhs has exactly two symbols,
                        # both present in the inside table -- TODO confirm.
                        gamma[rule][i][k][j] = outside[rule.lhs][i][j]*rule.prob*inside[rule.rhs[0]][i][k]*inside[rule.rhs[1]][k+1][j]
    # Accumulate normalized expected counts for binary (non-terminal) rules.
    for lhs in grammar.NR:
        if lhs not in count:
            count[lhs] = {}
        for rule in grammar.NR[lhs].values():
            if tuple(rule.rhs) not in count[lhs]:
                count[lhs][tuple(rule.rhs)] = 0
            for i in range(n-1):
                for j in range(i+1, n):
                    for k in range(i, j):
                        count[lhs][tuple(rule.rhs)] += gamma[rule][i][k][j]/Z
    # Make sure every lhs reachable from any terminal has a count bucket.
    for term in grammar.TR:
        for lhs in grammar.TR[term]:
            if lhs not in count:
                count[lhs] = {}
    # Accumulate normalized expected counts for terminal rules; note the
    # loop variable lhs intentionally shadows the one above.
    for i in range(n):
        for lhs in grammar.TR[sentence[i]]:
            key = tuple([sentence[i]])
            count[lhs][key] = count[lhs].get(key, 0) + mu[lhs][i][i]/Z
from PCFG import PCFG from cky import load_sents_to_parse, cky if __name__ == '__main__': pcfg = PCFG.from_file_assert_cnf('grammar3-CNF.txt') good_sents = load_sents_to_parse('sents_3.txt') bad_sents = load_sents_to_parse('sents_bad.txt') print '========' print 'Checking good sentences!' print '========' failures = 0 for sent in good_sents: if "FAILED" in cky(pcfg, sent): print 'FAILURE!!! Failed to parse %s' % sent failures += 1 print 'Succeeded %d out of %d' % (len(good_sents) - failures, len(good_sents)) print '========' print 'Checking bad sentences!' print '========' failures = 0 for sent in bad_sents: if "FAILED" not in cky(pcfg, sent): print 'FAILURE!!! Parsed when it should have failed: %s' % sent print 'Succeeded %d out of %d' % (len(bad_sents) - failures, len(bad_sents))
def main(): if len(sys.argv) == 3: sentences = sys.argv[2].split(' ') # Get the grammar. file_path = sys.argv[1] trees = [] print 'Parsing trees in file...' f = open(file_path, 'rb') trees.extend(count_cfg.read_trees(f)) print 'Converting trees to grammar...' g = grammar.Grammar(nodes = trees) print 'Converting to CNF...' g.convertToCNF() # Parse and get nodes back. print 'Running CKY...' nodes_back = cky.cky(g, sentences) # Only get the nodes back that have a TOP. nodes_back_top = [] for tree in nodes_back: if tree.root == 'TOP': nodes_back_top.append(tree) print 'Getting best and worst tree...' if nodes_back_top == []: print('No tree could be constructed for the sentence.') sys.exit() elif len(nodes_back_top) == 1: print('Only one valid tree found for the sentence.') print(cky.getParseTree(nodes_back_top[0], 5)) max_pot = float('-inf') min_pot = float('inf') max_tree = nodes_back_top[0] min_tree = nodes_back_top[0] for tree in nodes_back_top: pot_tree = potential(tree, g) if pot_tree > max_pot: max_pot = pot_tree max_tree = tree elif pot_tree < min_pot: min_pot = pot_tree min_tree = tree print('Max tree:') print(cky.getParseTree(max_tree, 5)) print('Min tree:') print(cky.getParseTree(min_tree, 5)) elif len(sys.argv) == 2: if os.path.isfile('grammar.p'): g = pickle.load(open('grammar.p', 'rb')) else: trees = [] print 'Parsing trees' for path in os.listdir(sys.argv[1]): if path.split('.')[1] != 'prd': continue file_path = sys.argv[1]+'/'+path f = open(file_path, 'rb') trees.extend(count_cfg.read_trees(f)) print 'Converting trees to grammar' g = grammar.Grammar(nodes = trees) print 'Converting to CNF' g.convertToCNF() pickle.dump(g, open('grammar.p', 'wb')) print 'Parsing Sentence' sentences = [['His', 'tall', 'frame'], ['the', 'dog', 'saved'], ['discover', 'the', 'first', 'snail'], ['it', 'is', 'juxtaposed', 'well'], ['Her', 'handling', 'of', 'paint'], ['He', 'glowered', 'down', 'at', 'her']] for t in range(5): # num_t = [len(g.TR[lhs]) for lhs in g.TR] # num_n = 
[len(g.NR[lhs]) for lhs in g.NR] # print sum(num_t), sum(num_n) to_del = [] count = {} for sent in sentences: insideOutside(sent, g, count) for lhs in count: lhs_sum = sum(count[lhs].values()) if lhs_sum == 0: to_del.append(lhs) else: for key,val in count[lhs].items(): count[lhs][key] = val/lhs_sum for lhs in to_del: del count[lhs] for lhs in count: for key,val in count[lhs].items(): rule_dat = [lhs,] rule_dat.extend(list(key)) rule_dat.append(val) g.add_rule(grammar.Rule(vals = rule_dat)) def isTop(node): return node.root == 'TOP' for s in sentences: nodes_back = cky.cky(g, s) node_back = filter(isTop, nodes_back) node_back = [(node, potential(node, g)) for node in node_back] node_back.sort(key=lambda node: -1*node[1]) cky.printParseTrees([node_back[0][0]]) else: print('Error. Invalid number of arguments.') print('Two options for running:') print('Usage: $ inside_out.py <directory>') print 'Note: only files ending in .prd in the directory provided', \ 'will be read into a grammar.' print 'Note: .prd files need to be in s-expression form.' print 'OR:' print('Usage: $ inside_out.py <grammar file> <string to be parsed>') print 'Note: grammar file needs to be in s-expression form.' sys.exit()