Example #1
def decode_grammar(self, P):
    """Decode a sub-grammar from the probe sequence P, visiting every
    reachable non-terminal exactly once starting from 'G'."""
    g = SubGrammar(self.G)
    vd = VaultDistribution()
    iterp = iter(P)
    stack = ['G']
    done = []
    while stack:
        head = stack.pop()
        assert head not in done
        done.append(head)
        p = next(iterp)
        n = vd.decode_vault_size(head, p)
        # print("RuleSizeDecoding:", head, n)
        t_set = []
        for _ in range(n):
            rhs = self.decode(head, next(iterp))
            # print("Decoding:", stack, head, '==>', rhs)
            if rhs != '__totoal__':
                # Queue only non-terminals not already decoded or waiting
                # on the stack, preserving their first-seen order.
                r = [x for x in self.G.get_actual_NonTlist(head, rhs)
                     if x not in done + stack]
                for x in r:
                    if x not in t_set:
                        t_set.append(x)
            g.add_rule(head, rhs)
        t_set.reverse()
        stack.extend(t_set)
    g.finalize()  # fixes the frequencies and other bookkeeping
    return g
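The decoder above walks the grammar iteratively: for each non-terminal it pulls one value from the probe stream to recover the rule count, then one value per rule, queuing any newly reached non-terminals so that each is decoded exactly once. The stand-alone sketch below mimics that traversal over a made-up toy grammar; TOY_GRAMMAR, decode_rule and the probe values are invented for illustration and stand in for the real SubGrammar, VaultDistribution and self.decode machinery.

TOY_GRAMMAR = {
    'G': [['W', 'D'], ['W']],
    'W': [['password'], ['letmein']],
    'D': [['123'], ['1']],
}

def decode_rule(head, probe):
    # Stand-in for self.decode(): pick a right-hand side by index.
    return TOY_GRAMMAR[head][probe % len(TOY_GRAMMAR[head])]

def decode_toy_subgrammar(probes):
    iterp = iter(probes)
    stack, done, sub = ['G'], [], {}
    while stack:
        head = stack.pop()
        done.append(head)
        n = next(iterp) % len(TOY_GRAMMAR[head]) + 1  # toy "rule count"
        for _ in range(n):
            rhs = decode_rule(head, next(iterp))
            sub.setdefault(head, []).append(rhs)
            for sym in rhs:
                if sym in TOY_GRAMMAR and sym not in done and sym not in stack:
                    stack.append(sym)
    return sub

# Prints {'G': [['W'], ['W', 'D']], 'D': [['1']], 'W': [['letmein']]}
print(decode_toy_subgrammar([3, 1, 0, 2, 5, 4, 7]))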
Example #2
import json
import sys
# SubGrammar and NT are provided by the surrounding PCFG module.

def cal_size_subG(base_pcfg, vault_set_file):
    """Build a sub-grammar for every vault with more than one non-empty,
    ASCII-only password and record the rule count added per non-terminal."""
    with open(vault_set_file) as f:
        tdata = [(k, [x for x in v if x])
                 for k, v in json.load(f).items()
                 if len([x for x in v if x]) > 1]
    rm = []  # keys of vaults that contain a non-ASCII password
    for k, v in tdata:
        for p in v:
            try:
                p.encode('ascii')
            except UnicodeEncodeError:
                rm.append(k)
                break
    sys.stderr.write(' '.join(str(x) for x in rm) + '\n')
    data = {k: v for k, v in tdata if k not in rm}
    D = {}
    for k, v in data.items():
        g = SubGrammar(base_pcfg)
        g.update_grammar(*v)
        res = [(nt, len(g[nt]) - 1) for nt in NT]
        D[k] = {'vault': v, 'length': len(v)}
        D[k].update(dict(res))
    return D
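Most of cal_size_subG is spent cleaning the vault file: it keeps only vaults with more than one non-empty password and drops any vault containing a non-ASCII entry before measuring sub-grammar sizes. Below is a self-contained sketch of just that filtering step over an in-memory dict with invented vault contents; it uses str.isascii, which requires Python 3.7+.

def clean_vaults(vaults):
    """Keep vaults with more than one non-empty, ASCII-only password."""
    cleaned = {}
    for key, pws in vaults.items():
        pws = [p for p in pws if p]            # drop empty entries
        if len(pws) <= 1:
            continue
        if any(not p.isascii() for p in pws):  # whole vault is discarded
            continue
        cleaned[key] = pws
    return cleaned

example = {
    'u1': ['hunter2', 'hunter2', ''],
    'u2': ['только-кириллица', 'pass1'],
    'u3': ['solo'],
}
print(clean_vaults(example))  # {'u1': ['hunter2', 'hunter2']}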
Example #3
def __init__(self, grammar=None):
    # Fall back to a fresh SubGrammar when no grammar is supplied.
    self.G = grammar
    if not self.G:
        self.G = SubGrammar()
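The constructor defaults the grammar argument to None and only then builds a fresh SubGrammar, rather than writing grammar=SubGrammar() in the signature, which Python would evaluate once and share across instances. A tiny stand-alone illustration of the same idiom with an invented Registry class (not part of this project):

class Registry:
    def __init__(self, store=None):
        # A default of store={} would be created once at definition time
        # and shared by every instance; defaulting to None avoids that.
        self.store = store if store is not None else {}

a, b = Registry(), Registry()
a.store['k'] = 1
print(b.store)  # {} because each instance gets its own dict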