def production_count(path, oldtable, tables, conf):
    """Tally how often each grammar production is used in the configured trees.

    The grammar is read from ``tables["infer_grammar"]``: the first element of
    every row is a nonterminal and the remaining elements are its productions,
    each written as the child labels joined with ``:``.  Productions of a
    nonterminal are numbered from 1 in row order.

    :param path: destination handed to ``save`` together with the table.
    :param oldtable: ``None``, or rows of ``(nonterminal, production, count)``
        whose counts seed the tally before the trees are walked.
    :param tables: mapping that provides the ``"infer_grammar"`` rows.
    :param conf: mapping whose ``"trees"`` entry is passed to ``walktrees``.
    :returns: list of ``(nonterminal, production, count)`` tuples sorted by
        nonterminal and then by count.
    """
    grammar = {}
    for row in tables["infer_grammar"]:
        grammar[row[0]] = tuple(row[1:])

    usage = {}      # (nonterminal, production number) -> times used
    number_of = {}  # (nonterminal, production string) -> production number
    for nonterm, prods in grammar.items():
        for number, prod in enumerate(prods, 1):
            number_of[(nonterm, prod)] = number
            usage[(nonterm, number)] = 0

    # Carry over the counts recorded by a previous run, if any.
    if oldtable is not None:
        for nonterm, prod, count in oldtable:
            usage[(nonterm, number_of[(nonterm, prod)])] = int(count)

    def tally(grammar, usage, node, depth):
        # A node without children expands nothing, so there is no production
        # to record for it.
        if not node.children:
            return
        candidates = grammar[node.label]
        expansion = ":".join(child.label for child in node.children)
        number = candidates.index(expansion) + 1
        usage[(node.label, number)] += 1

    walktrees(conf["trees"], functools.partial(tally, grammar, usage))

    table = []
    for (nonterm, number), count in usage.items():
        table.append((nonterm, grammar[nonterm][number - 1], count))
    table.sort(key=lambda entry: (entry[0], entry[2]))
    save(path, table)
    return table
def symbol_counter(path, oldtable, trees, callback):
    """Accumulate per-symbol counts over ``trees`` and save them.

    :param path: destination handed to ``save``.
    :param oldtable: ``None``, or ``(name, count)`` pairs used as the starting
        contents of the counter.
    :param trees: passed straight through to ``walktrees``.
    :param callback: callable that receives the counter dict as its first
        argument, followed by whatever ``walktrees`` supplies; it is expected
        to update the dict in place.
    :returns: whatever ``save`` returns for the ``(name, count)`` rows.
    """
    if oldtable is None:
        tally = {}
    else:
        tally = dict(oldtable)

    walktrees(trees, functools.partial(callback, tally))

    rows = tuple((name, count) for name, count in tally.items())
    return save(path, rows)
def infer_grammar(path, oldtable, tables, conf):
    """Collect every production that appears in the configured trees.

    For each node with children, the tuple of its children's labels is
    recorded as a production of the node's label.  The result is written to
    ``path`` as one ``nonterminal : child child ...`` line per production and
    returned as a table.

    :param path: file that receives the textual grammar.
    :param oldtable: ``None``, or rows from a previous run; they are joined
        back into text and re-parsed with ``lib.parse_grammar`` to seed the
        production map.
    :param tables: unused here.
    :param conf: mapping whose ``"trees"`` entry is passed to ``walktrees``.
    :returns: tuple of rows, each ``(nonterminal, production, ...)`` with every
        production written as its child labels joined by ``:``.
    """
    productions = {}
    if oldtable is not None:
        # TODO: clean this up -- we should not have to re-join oldtable into
        # text just so that it can be parsed again.
        previous_text = '\n'.join(''.join(row) for row in oldtable)
        productions.update(lib.parse_grammar(previous_text))

    def collect(productions, node, depth):
        # Leaves contribute no production.
        if not node.children:
            return
        rhs = tuple(kid.label for kid in node.children)
        productions.setdefault(node.label, set()).add(rhs)

    walktrees(conf['trees'], functools.partial(collect, productions))

    rows = []
    lines = []
    for nonterm, prods in productions.items():
        rows.append((nonterm,) + tuple(':'.join(prod) for prod in prods))
        for prod in prods:
            lines.append(nonterm + ' : ' + ' '.join(prod))

    with open(path, 'w') as f:
        f.write('\n'.join(lines) + '\n')
    return tuple(rows)
def production_coverage_global(path, oldtable, tables, conf):
    """Print the summed production tallies divided by the production count.

    The grammar comes from ``conf['grammar']``; the productions of each
    nonterminal are joined with ``:`` and sorted, then numbered from 1.

    :param path: unused here.
    :param oldtable: unused here.
    :param tables: unused here.
    :param conf: mapping providing ``'grammar'`` (nonterminal -> productions)
        and ``'trees'`` (passed to ``walktrees``).
    :returns: ``None``; the ratio is only printed, prefixed with ``global``.
    :raises ZeroDivisionError: if the grammar contains no productions.
    """
    grammar = {}
    for nonterm, prods in conf['grammar'].items():
        grammar[nonterm] = tuple(sorted(':'.join(p) for p in prods))

    # One zeroed slot per (nonterminal, production number).
    global_cov = {}
    for nonterm, prods in grammar.items():
        for number in range(1, len(prods) + 1):
            global_cov[(nonterm, number)] = 0
    total = len(global_cov)

    # NOTE(review): `callback` is not defined inside this function, so it has
    # to resolve from the enclosing module scope -- confirm that it exists and
    # accepts (grammar, stats, node, depth).
    walktrees(conf['trees'], functools.partial(callback, grammar, global_cov))

    ratio = float(sum(global_cov.values())) / total
    print('global %s' % ratio)
def conditional_counts(path, oldtable, tables, conf):
    """Count production choices conditioned on recently expanded nonterminals.

    While the trees are walked, a stack tracks a "context": a tuple of the
    ``lookBack`` (2) most recently recorded nonterminal labels, padded with
    ``None`` at the start of each tree (a node labelled ``"Start"`` resets the
    stack).  For every node with children two tallies are updated:

    * how often the rule ``"<label> => <production>"`` was chosen under the
      current context, and
    * how often the node's label was reached under the current context.

    :param path: destination handed to ``save`` for the rule table.
    :param oldtable: unused here.
    :param tables: mapping providing the ``"infer_grammar"`` rows (nonterminal
        followed by its productions, each written as ``:``-joined labels).
    :param conf: mapping whose ``"trees"`` entry is passed to ``walktrees``.
    :returns: dict with two tables.  ``[0]`` holds rows
        ``(lookBack, rule, *context, count)`` and is the one that is saved;
        ``[1]`` holds rows ``(lookBack, *context, nonterminal, count)`` and is
        only returned.
    :raises IndexError: if a node is visited before any ``"Start"`` node.
    :raises ValueError: if a node's children do not match any production of
        its label in the grammar.
    """
    grammar = dict((row[0], tuple(row[1:])) for row in tables["infer_grammar"])
    # counts[rule][context] -> times `rule` was chosen under `context`
    counts = {}
    # nonterminalCounts[context][label] -> times `label` was reached under
    # `context`
    nonterminalCounts = {}
    # Entries are (context, pop_on_next_visit).
    stack = []
    lookBack = 2  # number of labels kept in a context

    def bump(table, outer, inner):
        """Add one to table[outer][inner], creating the entries on demand."""
        bucket = table.setdefault(outer, {})
        bucket[inner] = bucket.get(inner, 0) + 1

    def callback(grammar, node, depth):
        # A new tree starts: forget everything recorded for the previous one.
        if node.label == "Start":
            del stack[:]
            stack.append(((None,) * lookBack, False))

        prev, requirePop = stack[-1]
        context = tuple(prev[:lookBack])
        # Entries flagged for popping are consumed by the first node that
        # sees them, whether or not that node has children.
        if requirePop:
            stack.pop()
        if not node.children:
            return

        productions = grammar[node.label]
        production = ":".join(kid.label for kid in node.children)
        if production not in productions:
            raise ValueError(
                "%r is not a known production of %r" % (production, node.label)
            )

        bump(counts, node.label + " => " + production, context)
        bump(nonterminalCounts, context, node.label)

        # Slide the context window forward by one label.  Entries pushed for
        # productions with at most one ':' are flagged to be popped again by
        # the next visited node, so later siblings keep the older context.
        # NOTE(review): the original comment described this test as "more
        # than 1 nonterminal", but count(":") > 1 only holds for three or more
        # children; behaviour is kept unchanged -- confirm the intent.
        popNext = production.count(":") <= 1
        stack.append((prev[1:lookBack] + (node.label,), popNext))

    walktrees(conf["trees"], functools.partial(callback, grammar))

    ruleRows = tuple(
        (lookBack, rule) + tuple(context) + (count,)
        for rule, byContext in counts.items()
        for context, count in byContext.items()
    )
    # Not saved as a csv, but returned so that later stages can use it.
    nontermRows = tuple(
        (lookBack,) + tuple(context) + (nonterm, count)
        for context, byLabel in nonterminalCounts.items()
        for nonterm, count in byLabel.items()
    )

    retTables = {0: ruleRows, 1: nontermRows}
    save(path, retTables[0])
    return retTables
def conditional_probabilities(path, oldtable, tables, conf):
    """Estimate P(rule | context) from the configured trees.

    While the trees are walked, a stack tracks a "context": a tuple of the
    ``lookBack`` (2) most recently recorded nonterminal labels, padded with
    ``None`` at the start of each tree (a node labelled ``"Start"`` resets the
    stack).  For every node with children the chosen production and the
    node's label are tallied per context; the probability of a rule is its
    tally divided by the tally of its nonterminal under the same context.

    :param path: destination handed to ``save`` together with the table.
    :param oldtable: unused here.
    :param tables: mapping providing the ``'infer_grammar'`` rows (nonterminal
        followed by its productions, each written as ``:``-joined labels).
    :param conf: mapping whose ``'trees'`` entry is passed to ``walktrees``.
    :returns: tuple of rows ``(lookBack, rule, *context, probability)`` where
        ``rule`` is ``"<label> => <production>"``.
    :raises IndexError: if a node is visited before any ``"Start"`` node.
    :raises ValueError: if a node's children do not match any production of
        its label in the grammar.
    """
    grammar = dict((row[0], tuple(row[1:])) for row in tables['infer_grammar'])
    # ruleCounts[(label, production)][context] -> times chosen under context.
    # Keyed by the pair so the label never has to be parsed back out of the
    # formatted rule string.
    ruleCounts = {}
    # labelCounts[context][label] -> times `label` was reached under context
    labelCounts = {}
    # Entries are (context, pop_on_next_visit).
    stack = []
    lookBack = 2  # number of labels kept in a context

    def bump(table, outer, inner):
        """Add one to table[outer][inner], creating the entries on demand."""
        bucket = table.setdefault(outer, {})
        bucket[inner] = bucket.get(inner, 0) + 1

    def callback(grammar, node, depth):
        # A new tree starts: forget everything recorded for the previous one.
        if node.label == "Start":
            del stack[:]
            stack.append(((None,) * lookBack, False))

        prev, requirePop = stack[-1]
        context = tuple(prev[:lookBack])
        # Entries flagged for popping are consumed by the first node that
        # sees them, whether or not that node has children.
        if requirePop:
            stack.pop()
        if not node.children:
            return

        productions = grammar[node.label]
        production = ':'.join(kid.label for kid in node.children)
        if production not in productions:
            raise ValueError(
                "%r is not a known production of %r" % (production, node.label)
            )

        bump(ruleCounts, (node.label, production), context)
        bump(labelCounts, context, node.label)

        # Slide the context window forward by one label.  Entries pushed for
        # productions with at most one ':' are flagged to be popped again by
        # the next visited node, so later siblings keep the older context.
        # NOTE(review): the original comment described this test as "only one
        # nonterminal", but count(":") > 1 only holds for three or more
        # children; behaviour is kept unchanged -- confirm the intent.
        popNext = production.count(":") <= 1
        stack.append((prev[1:lookBack] + (node.label,), popNext))

    walktrees(conf['trees'], functools.partial(callback, grammar))

    rows = []
    for (label, production), byContext in ruleCounts.items():
        rule = label + " => " + production
        for context, num in byContext.items():
            # P[rule | context]; the denominator counts every expansion of
            # `label` under this context, so it is never smaller than `num`.
            probability = float(num) / float(labelCounts[context][label])
            rows.append((lookBack, rule) + tuple(context) + (probability,))

    table = tuple(rows)
    save(path, table)
    return table