Beispiel #1
0
def production_count(path, oldtable, tables, conf):
    """Count how often each grammar production is used in the trees.

    The grammar comes from ``tables["infer_grammar"]``: the first cell of a
    row is the nonterminal and the remaining cells are its productions, each
    written as child labels joined by ":".  Productions of a nonterminal are
    numbered from 1 in row order.

    Args:
        path: destination handed to ``save`` together with the result.
        oldtable: ``None`` or an iterable of ``(nonterm, production, count)``
            rows; each count is converted with ``int`` and seeds the tally.
        tables: mapping that must contain an ``"infer_grammar"`` entry.
        conf: mapping whose ``"trees"`` entry is passed to ``walktrees``.

    Returns:
        A list of ``(nonterm, production, count)`` tuples sorted by
        nonterminal and then by count.
    """
    grammar = {}
    for row in tables["infer_grammar"]:
        grammar[row[0]] = tuple(row[1:])

    # number_of maps (nonterm, production text) -> 1-based production number,
    # tally maps (nonterm, production number) -> occurrences seen so far.
    number_of = {}
    tally = {}
    for lhs, alternatives in grammar.iteritems():
        for number, rhs in enumerate(alternatives, 1):
            number_of[(lhs, rhs)] = number
            tally[(lhs, number)] = 0

    # Continue from a previous run's counts when they are supplied.
    if oldtable is not None:
        for lhs, rhs, previous in oldtable:
            tally[(lhs, number_of[(lhs, rhs)])] = int(previous)

    def visit(node, depth):
        """Credit the production that expands ``node``; leaves are skipped."""
        if node.children:
            alternatives = grammar[node.label]
            rhs = ":".join(kid.label for kid in node.children)
            tally[(node.label, alternatives.index(rhs) + 1)] += 1

    # presumably walktrees calls visit(node, depth) for each node -- confirm
    walktrees(conf["trees"], visit)

    table = []
    for (lhs, number), count in tally.iteritems():
        table.append((lhs, grammar[lhs][number - 1], count))
    table.sort(key=lambda entry: (entry[0], entry[2]))

    save(path, table)
    return table
Beispiel #2
0
def symbol_counter(path, oldtable, trees, callback):
    """Run a counting callback over the trees and save the resulting tallies.

    Args:
        path: destination handed to ``save``.
        oldtable: ``None`` or previous ``(name, count)`` pairs used to seed
            the tallies.
        trees: passed to ``walktrees`` unchanged.
        callback: callable that receives the tallies dict as its first
            argument, followed by whatever ``walktrees`` supplies.

    Returns:
        Whatever ``save`` returns for the tuple of ``(name, count)`` pairs.
    """
    # NOTE(review): seeded counts are used as-is (production_count converts
    # its old counts with int()); confirm oldtable already holds numbers.
    tallies = dict(oldtable) if oldtable is not None else {}

    walktrees(trees, functools.partial(callback, tallies))

    rows = tuple(tallies.iteritems())
    return save(path, rows)
Beispiel #3
0
def infer_grammar(path, oldtable, tables, conf):
    """Collect every production seen in the trees and write a grammar file.

    Args:
        path: file that receives one ``"nonterm : kid kid ..."`` line per
            production.
        oldtable: ``None`` or rows from a previous run; they are re-joined
            into text and parsed with ``lib.parse_grammar`` to seed the
            grammar.
        tables: not used by this function.
        conf: mapping whose ``'trees'`` entry is passed to ``walktrees``.

    Returns:
        A tuple of rows, each ``(nonterm, production, production, ...)`` with
        every production written as its child labels joined by ':'.
    """
    productions = {}
    if oldtable is not None:
        ## TODO: clean this up -- we should not have to re-join oldtable
        ## just so it can be parsed again.
        previous_text = '\n'.join(''.join(row) for row in oldtable)
        productions.update(lib.parse_grammar(previous_text))

    def record(node, depth):
        """Remember the child-label sequence of every non-leaf node."""
        if not node.children:
            return
        rhs = tuple(kid.label for kid in node.children)
        productions.setdefault(node.label, set()).add(rhs)

    # presumably walktrees calls record(node, depth) for each node -- confirm
    walktrees(conf['trees'], record)

    # One pass builds both the returned rows and the lines of the file.
    rows = []
    lines = []
    for nonterm, alternatives in productions.iteritems():
        rows.append(tuple([nonterm] + [':'.join(rhs) for rhs in alternatives]))
        for rhs in alternatives:
            lines.append(' : '.join((nonterm, ' '.join(rhs))))

    gramfile = '\n'.join(lines) + '\n'
    with open(path, 'w') as out:
        out.write(gramfile)
    return tuple(rows)
Beispiel #4
0
def production_coverage_global(path, oldtable, tables, conf):
    grammar = dict(
      (
        nonterm,
        tuple(sorted(':'.join(p) for p in prods))
      )
      for nonterm, prods in conf['grammar'].iteritems()
    )
    def initstats():
        stats = dict()
        total = 0
        for nonterm, P in grammar.iteritems():
            for i, p in enumerate(P):
                stats[(nonterm, i+1)] = 0
                total += 1
        return stats, total


    global_cov, total = initstats()
    walktrees(conf['trees'], functools.partial(callback, grammar, global_cov))
    print 'global', float(sum(val for val in global_cov.itervalues()))/total
Beispiel #5
0
def conditional_counts(path, oldtable, tables, conf):
    """Tally productions and nonterminals by the labels that preceded them.

    While the trees are walked, a stack of context tuples (the last
    ``look_back`` nonterminal labels) is maintained.  For every non-leaf
    node, the production that expands it and the node's own label are both
    counted under the context found on top of the stack.

    Args:
        path: destination handed to ``save`` for the per-rule table.
        oldtable: not used by this function.
        tables: mapping that must contain an ``"infer_grammar"`` entry whose
            rows are ``(nonterm, production, production, ...)``.
        conf: mapping whose ``"trees"`` entry is passed to ``walktrees``.

    Returns:
        A dict with two tuples of rows:
        ``[0]`` holds ``(look_back, rule, <context labels>, count)`` and is
        also saved to ``path``; ``[1]`` holds
        ``(look_back, <context labels>, nonterm, count)`` and is only
        returned.
    """
    grammar = {}
    for row in tables["infer_grammar"]:
        grammar[row[0]] = tuple(row[1:])

    look_back = 2  # number of labels kept in a context tuple
    rule_counts = {}  # rule_counts["NT => A:B:C"][(NT1, NT2)] -> occurrences
    label_counts = {}  # label_counts[(NT1, NT2)]["NT"] -> occurrences
    stack = []  # entries are (context tuple, pop-on-next-visit flag)

    def bump(outer, first, second):
        """Add one to outer[first][second], creating entries as needed."""
        inner = outer.setdefault(first, {})
        inner[second] = inner.get(second, 0) + 1

    def visit(node, depth):
        # A "Start" node begins a new tree, so begin with an empty context.
        if node.label == "Start":
            del stack[:]
            stack.append(((None,) * look_back, False))

        context, pop_now = stack[-1]
        # An entry flagged for popping is consumed by the next node visited,
        # whether or not that node is a leaf.
        if pop_now:
            stack.pop()
        if not node.children:
            return

        alternatives = grammar[node.label]
        joined = ":".join(kid.label for kid in node.children)
        rhs = alternatives[alternatives.index(joined)]
        rule = node.label + " => " + rhs

        bump(rule_counts, rule, context)
        bump(label_counts, context, node.label)

        # Push this node's label as the newest element of the context.
        # NOTE(review): the original comments describe the test below as
        # "more than one nonterminal in this rule", yet count(":") > 1 is
        # only true for three or more ':'-separated symbols -- confirm the
        # intended threshold.
        keep_on_stack = rhs.count(":") > 1
        stack.append((context[1:look_back] + (node.label,), not keep_on_stack))

    # presumably walktrees calls visit(node, depth) for each node -- confirm
    walktrees(conf["trees"], visit)

    rule_rows = []
    for rule, per_context in rule_counts.iteritems():
        for context, count in per_context.iteritems():
            rule_rows.append((look_back, rule) + tuple(context) + (count,))

    # Not saved to disk, but returned so later stages can read it.
    label_rows = []
    for context, per_label in label_counts.iteritems():
        for nonterm, count in per_label.iteritems():
            label_rows.append((look_back,) + tuple(context) + (nonterm, count))

    retTables = {0: tuple(rule_rows), 1: tuple(label_rows)}
    save(path, retTables[0])
    return retTables
Beispiel #6
0
def conditional_probabilities(path, oldtable, tables, conf):
    """Estimate P[rule | context] for every production seen in the trees.

    The trees are walked with a stack of context tuples (the last
    ``look_back`` nonterminal labels).  For every non-leaf node, the
    production that expands it and the node's own label are counted under
    the current context.  Each rule count is then divided by the count of
    the rule's left-hand nonterminal under the same context.

    Args:
        path: destination handed to ``save`` together with the result.
        oldtable: not used by this function.
        tables: mapping that must contain an ``'infer_grammar'`` entry whose
            rows are ``(nonterm, production, production, ...)``.
        conf: mapping whose ``'trees'`` entry is passed to ``walktrees``.

    Returns:
        A tuple of ``(look_back, rule, <context labels>, probability)`` rows.
    """
    grammar = {}
    for row in tables['infer_grammar']:
        grammar[row[0]] = tuple(row[1:])

    look_back = 2  # number of labels kept in a context tuple
    rule_counts = {}  # rule_counts["NT => A:B:C"][(NT1, NT2)] -> occurrences
    label_counts = {}  # label_counts[(NT1, NT2)]["NT"] -> occurrences
    stack = []  # entries are (context tuple, pop-on-next-visit flag)

    def bump(outer, first, second):
        """Add one to outer[first][second], creating entries as needed."""
        inner = outer.setdefault(first, {})
        inner[second] = inner.get(second, 0) + 1

    def visit(node, depth):
        # A "Start" node begins a new tree, so begin with an empty context.
        if node.label == "Start":
            del stack[:]
            stack.append(((None,) * look_back, False))

        context, pop_now = stack[-1]
        # An entry flagged for popping is consumed by the next node visited,
        # whether or not that node is a leaf.
        if pop_now:
            stack.pop()
        if not node.children:
            return

        alternatives = grammar[node.label]
        joined = ':'.join(kid.label for kid in node.children)
        rhs = alternatives[alternatives.index(joined)]
        rule = node.label + " => " + rhs

        bump(rule_counts, rule, context)
        bump(label_counts, context, node.label)

        # Push this node's label as the newest element of the context.
        # NOTE(review): the original comments describe the test below as
        # "only one nonterminal in this rule" for the popping case, yet
        # count(":") > 1 is only true for three or more ':'-separated
        # symbols -- confirm the intended threshold.
        keep_on_stack = rhs.count(":") > 1
        stack.append((context[1:look_back] + (node.label,), not keep_on_stack))

    # presumably walktrees calls visit(node, depth) for each node -- confirm
    walktrees(conf['trees'], visit)

    # probabilities[rule][context] = count(rule, context) / count(lhs, context)
    probabilities = {}
    for rule, per_context in rule_counts.iteritems():
        lhs = rule.split("=>")[0].strip()
        ratios = {}
        for context, num in per_context.iteritems():
            ratios[context] = float(num) / float(label_counts[context][lhs])
        probabilities[rule] = ratios

    rows = []
    for rule, ratios in probabilities.iteritems():
        for context, probability in ratios.iteritems():
            rows.append((look_back, rule) + tuple(context) + (probability,))
    table = tuple(rows)

    save(path, table)
    return table