def production_coverage_local(path, oldtable, tables, conf): grammar = dict( ( nonterm, tuple(sorted(':'.join(p) for p in prods)) ) for nonterm, prods in conf['grammar'].iteritems() ) def initstats(): stats = dict() total = 0 for nonterm, P in grammar.iteritems(): for i, p in enumerate(P): stats[(nonterm, i+1)] = 0 total += 1 return stats, total for tree in conf['trees']: local_cov, total = initstats() walktree(tree, functools.partial(callback, grammar, local_cov)) print float(sum(val for val in local_cov.itervalues()))/total
def conditional_counts(path, oldtable, tables, conf): grammar = dict((row[0], tuple(row[1:])) for row in tables['infer_grammar']) counts = dict() # counts how many times a RULE is reached by a specific # prevTuple (e.g. counts[NT => A:B:C][(NT1,NT2)] == 5) nonterminalCounts = dict() # counts how many times a prevTuple reaches a # specific NONTERMINAL (e.g. # nonterminalCounts[(NT1,NT2)][NT] == 78) stack = list() lookBack = 2 # how many items in prevTuple? def count_nonterms(nonterminalCounts, prevAsTuple, node): if not nonterminalCounts.has_key(prevAsTuple): nonterminalCounts[prevAsTuple] = {node.label : 1} else: if not nonterminalCounts.get(prevAsTuple).has_key(node.label): nonterminalCounts[prevAsTuple][node.label] = 1 else: nonterminalCounts[prevAsTuple][node.label] += 1 def increase_counts(counts, prevAsTuple, chosenRule): if not counts.has_key(chosenRule): counts[chosenRule] = {prevAsTuple : 1} else: if not counts.get(chosenRule).has_key(prevAsTuple): counts[chosenRule][prevAsTuple] = 1 else: counts[chosenRule][prevAsTuple] += 1 def callback(grammar, node, depth): #if this is a new ast then we want to clear our stack if not stack: initStack = (tuple(None for x in range(lookBack)), False) stack.append(initStack) #print stack prev = stack[len(stack)-1][0] requirePop = stack[len(stack)-1][1] prevAsTuple = tuple(prev[x] for x in range(lookBack)) if not node.children: if requirePop: stack.pop() return if requirePop: stack.pop() productions = grammar[node.label] p = productions.index(':'.join(kid.label for kid in node.children)) + 1 chosenRule = node.label + " => " + grammar[node.label][p-1] increase_counts(counts, prevAsTuple, chosenRule) count_nonterms(nonterminalCounts, prevAsTuple, node) #append this new rule to the stack as our new "most previous" if grammar[node.label][p-1].count(":") > 1: #do we have more than 1 nonterminal in this rule? stack.append( ( tuple(prev[x+1] for x in range(lookBack-1)) + (node.label,), False ) ) else: #if there is only one nonterminal in this rule then we want to log it as a previous but then pop it from the stack #this way, rules that have >1 nonterminals will keep their "prev" relative to what it was originally. #e.g. with NT:NT2:NT3, when we get to NT2, we dont want previous to include the previous from when we went down NT's productions stack.append( ( tuple(prev[x+1] for x in range(lookBack-1)) + (node.label,), True ) ) for tree in conf['trees']: stack = list() walktree(tree, functools.partial(callback, grammar)) retTables = dict() retTables[0] = tuple( (lookBack, rule) + tuple(nt for nt in prev) + (count,) for rule, myCounts in counts.iteritems() for prev, count in myCounts.iteritems() ) #we don't save this guy as a csv but we need to log it so that conditional_probabilities() can work right retTables[1] = tuple( (lookBack,) + tuple(prev for prev in prevAsTuple) + (nonterm, count) for prevAsTuple, myCounts in nonterminalCounts.iteritems() for nonterm, count in myCounts.iteritems() ) save(path, retTables[0]) return retTables