Example #1
0
def viterbiCKY(w):
    """Run Viterbi CKY over the token sequence ``w`` and return the best tree.

    Uses names defined outside this block: ``N`` (iterable of nonterminals),
    ``R`` (``R[X]`` maps a right-hand-side string to a probability), ``S``
    (the root symbol passed to ``createTree``), ``isGenTwoNonTerminal``,
    ``createTree``, ``Tree``, ``bigfloat``, ``defaultdict`` and ``math``.

    Args:
        w: sequence of tokens (indexed and measured with ``len``).

    Returns:
        ``str(Tree(createTree(S, 0, n, back)))`` for the filled back-pointer
        chart.

    Side effects:
        Prints ``w`` and the base-10 log of the best score stored for ``S``
        over the whole span ``(0, n)``.
    """
    n = len(w)
    # best[i][j][X]: highest score found for X over span (i, j); missing
    # entries default to bigfloat(0).
    best = defaultdict(
        lambda: defaultdict(lambda: defaultdict(lambda: bigfloat(0))))
    # back[i][j][X]: tuple describing how the best score was obtained.
    back = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: None)))

    # Width-1 spans: rules whose right-hand side is the token itself.
    for i in range(1, n + 1):
        word = w[i - 1]
        for X in N:
            # NOTE(review): membership is tested on the raw token but the
            # lookup uses the stripped token, exactly as before -- confirm
            # tokens never carry surrounding whitespace.
            if word in R[X]:
                p = bigfloat(R[X][word.strip()])
                if p > best[i - 1][i][X]:
                    best[i - 1][i][X] = p
                    back[i - 1][i][X] = (X, word, i - 1, i)

    # The set of binary rules does not depend on the span, so split each
    # right-hand side once instead of once per (span, split point).
    binary_rules = []
    for X in N:
        for right in R[X].keys():
            if isGenTwoNonTerminal(right):
                YZ = right.strip().split(" ")
                binary_rules.append(
                    (X, YZ[0].strip(), YZ[1].strip(), bigfloat(R[X][right])))

    # Wider spans, shortest first, trying every split point k.
    for l in range(2, n + 1):
        for i in range(0, n - l + 1):
            j = i + l
            for k in range(i + 1, j):
                for X, Y, Z, p in binary_rules:
                    p_ = p * best[i][k][Y] * best[k][j][Z]
                    if p_ > best[i][j][X]:
                        best[i][j][X] = p_
                        back[i][j][X] = (X, Y, Z, i, j, k)

    G_ = str(Tree(createTree(S, 0, n, back)))
    print(w)
    # Read the full-sentence cell explicitly.  The previous code used the loop
    # variables i and j left over from the loops above, which only happen to
    # equal (0, n) when n >= 2 and are undefined for shorter inputs.
    print("Prob = " + str(math.log(best[0][n][S], 10)))
    return G_
Example #2
0
def _scan_pcfg_rules(rules, matcher):
    """Return (lhs, rule_text, log10_prob) for each rule line ``matcher`` matches."""
    entries = []
    for rule in rules:
        if matcher.match(rule):
            lhs = re.match(r'(.*)\s->.*', rule).group(1)
            prob = float(re.match(r'.*\s#\s(.*)\b', rule).group(1))
            logprob = bigfloat.log10(bigfloat.bigfloat(prob))
            rule_text = re.match(r'(.*)\s#.*', rule).group(1)
            entries.append((lhs, rule_text, logprob))
    return entries


def parser(line):
    """Parse a tokenised sentence with the grammar stored in the file 'pcfg'.

    For every token, each grammar line of the form ``... -> token # prob``
    seeds a chart state ``(lhs, i, i + 1)``.  If no line matches the token,
    every line containing ``<unk>`` is used instead.  The seeded chart is then
    handed to ``parse`` and the result is read out with ``trees`` (both
    defined outside this block).

    Args:
        line: sequence of tokens.

    Returns:
        Whatever ``trees(('TOP', 0, len(line)), bps)`` returns, or ``''`` if
        that call raises.
    """
    states = []
    bps = {}
    best = {}

    # Read the grammar once per sentence rather than once (or twice) per
    # token.  Skipped for an empty sentence, which never touched the file.
    rules = []
    if len(line) > 0:
        with open('pcfg') as grammar_file:
            rules = grammar_file.readlines()

    unk_matcher = re.compile(r'.*<unk>.*')
    for i, word in enumerate(line):
        # re.escape covers every regex metacharacter; previously only '.' and
        # '?' were escaped, so tokens such as '(' or '*' broke the pattern.
        word_matcher = re.compile(r'.*->\s' + re.escape(word) + r'\s#\s.*')
        entries = _scan_pcfg_rules(rules, word_matcher)
        if not entries:
            entries = _scan_pcfg_rules(rules, unk_matcher)
        for lhs, rule_text, logprob in entries:
            state = (lhs, i, i + 1)
            states.append(state)
            best[state] = logprob
            bps[state] = (rule_text,)

    states, bps, best = parse(states, bps, best, line)
    root = ('TOP', 0, len(line))
    try:
        return trees(root, bps)
    except Exception:
        # Best effort: no tree could be read out for this sentence.  Unlike a
        # bare ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        return ''
def main():
    """Estimate rule probabilities from a treebank and write them out.

    Each line read from ``--infile`` is parsed with ``Tree.fromstring`` and
    its productions are counted.  For every distinct production the relative
    frequency ``count(production) / count(lhs)`` is written to ``--outfile``
    as ``"<production> # <prob> \\n"``, and its base-10 logarithm is written
    in the same format to the file ``pcfg_log`` in the current directory.

    ``addonoffarg``, ``prepfile``, ``Tree``, ``Counter`` and ``bigfloat`` come
    from outside this block.
    """
    # NOTE(review): the description and the --infile help text say the input
    # is ignored, but the input is read below -- the text looks like a
    # template leftover; confirm before changing the user-facing strings.
    parser = argparse.ArgumentParser(
        description=
        "ignore input; make a demo grammar that is compliant in form",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    addonoffarg(parser, 'debug', help="debug mode", default=False)
    parser.add_argument("--infile",
                        "-i",
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="input file (ignored)")
    parser.add_argument("--outfile",
                        "-o",
                        nargs='?',
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help="output file (grammar)")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    workdir = tempfile.mkdtemp(prefix=os.path.basename(__file__),
                               dir=os.getenv('TMPDIR', '/tmp'))

    def cleanwork():
        shutil.rmtree(workdir, ignore_errors=True)

    # In debug mode the scratch directory is kept and its path is printed;
    # otherwise it is removed at interpreter exit.
    if args.debug:
        print(workdir)
    else:
        atexit.register(cleanwork)

    infile = prepfile(args.infile, 'r')
    outfile = prepfile(args.outfile, 'w')

    lhs_counts = Counter()
    production_counts = Counter()
    for tree in infile:
        productions = Tree.fromstring(tree).productions()
        production_counts.update(productions)
        lhs_counts.update(rule.lhs() for rule in productions)

    # The context manager closes pcfg_log even if a write fails.
    with open('pcfg_log', 'w') as fh:
        for production, count in production_counts.items():
            prob = bigfloat.bigfloat(count) / bigfloat.bigfloat(
                lhs_counts.get(production.lhs()))
            outfile.write('{0} # {1} \n'.format(production, prob))
            fh.write('{0} # {1} \n'.format(production, bigfloat.log10(prob)))
def q1_parse_input_trees(tree_unk_file):
    """Count grammar rules in a treebank file and attach log10 probabilities.

    Every non-empty line of ``tree_unk_file`` is parsed with
    ``Tree.from_str``.  For each node that has children, the rule
    ``str(node) -> rhs`` is counted, where ``rhs`` is ``str(child)`` when the
    first child has no children of its own, and otherwise a tuple of the
    ``str`` of every child.

    Args:
        tree_unk_file: path of the treebank file, one tree per line.

    Returns:
        dict mapping ``str(node)`` to a dict mapping each right-hand side to
        ``{'count': int, 'probability': log10(count / total for that lhs)}``.

    Side effects:
        Prints the most frequent rule and its count.
    """
    # The context manager closes the file; it was previously left open.
    with open(tree_unk_file, 'r') as f:
        data = f.read().split('\n')

    inputTrees = [Tree.from_str(line) for line in data if line != '']

    rules_dict = {}
    for tree in inputTrees:
        for node in tree.bottomup():
            children = node.children
            if children == []:
                continue

            # A childless first child is stored as a plain string, anything
            # else as a tuple.
            # TODO: decide whether leaf strings should be lower-cased.
            if len(children[0].children) == 0:
                right_rule = str(children[0])
            else:
                right_rule = tuple(str(child) for child in children)

            entry = rules_dict.setdefault(str(node), {}).setdefault(
                right_rule, {'count': 0, 'probability': 0})
            entry['count'] += 1

    q1_answer = [[None, [None]], 0]
    for left_rule, right_rule in rules_dict.items():
        denominator = 0
        for r_rule, count_prob_dict in right_rule.items():
            denominator += count_prob_dict['count']

            # Strict '>' keeps the first rule seen among equally frequent ones.
            if count_prob_dict['count'] > q1_answer[1]:
                q1_answer[1] = count_prob_dict['count']
                q1_answer[0][0] = left_rule
                q1_answer[0][1] = r_rule

        for count_prob_dict in right_rule.values():
            count_prob_dict['probability'] = log10(
                bigfloat(float(count_prob_dict['count']) / denominator))

    # str() on the right-hand side: it may be a tuple, and '->' + tuple raised
    # TypeError.  A single pre-joined string prints the same text whether
    # ``print`` is a statement or a function.
    print(' '.join([
        'QUESTION 1 - Most Frequent Rule: ',
        str(q1_answer[0][0]),
        '->' + str(q1_answer[0][1]),
        '    Occcourence =',
        str(q1_answer[1]),
    ]))

    return rules_dict
Example #5
0
File: graph.py Project: yqu1/PSHRG
        def visit(node):
            """Return the memoised sum, over ``node``'s out-edges, of each
            edge weight multiplied by the values of that edge's tails."""
            if node not in memo:
                # Seed the memo with zero first so that a descendant pointing
                # back at this node sees 0 instead of recursing forever.
                memo[node] = 0.
                total = 0.
                for edge in node.outedges:
                    contribution = bigfloat.bigfloat(edge.weight)
                    for tail in edge.tails:
                        contribution *= visit(tail)
                    total += contribution
                memo[node] = total
            return memo[node]
Example #6
0
 def visit(u, clo):
     """Return the largest edge weight product for item ``u``.

     Items absent from ``chart`` score ``bigfloat(0)``.  Otherwise each edge
     headed by ``u`` scores the product of ``visit`` over its remaining items;
     items already in ``clo`` are skipped, others are added to ``clo`` and
     visited with a copy of it.  The best score is cached in ``weight[u]`` and
     the edge that produced it in ``ant[u]`` (both ``None`` if no edge
     qualified).
     """
     if u not in chart:
         return bigfloat.bigfloat(0.)
     if u in weight:
         return weight[u]

     best_score, best_edge = None, None
     for edge in hypergraphs.edges(chart, u):
         if e_head_matches(edge, u) if False else edge.h[0] == u:
             score = bigfloat.bigfloat(1.)
             for item in edge.h[1:]:
                 if item not in clo:
                     clo.add(item)
                     score *= visit(item, clo.copy())
             if best_score is None or score > best_score:
                 best_score, best_edge = score, edge

     weight[u] = best_score
     ant[u] = best_edge
     return best_score
Example #7
0
File: graph.py Project: yqu1/PSHRG
 def __init__(self, node, viterbi):
     """Seed the candidate heap from ``node``'s out-edges and pop the first.

     Each out-edge scores its weight times the first element of
     ``viterbi[tail]`` for every tail, and is pushed with the score negated
     and an all-zero rank tuple.  The smallest heap entry is moved into
     ``self.nbest`` with its score still negated.
     """
     self.nbest = []  # of (viterbi, edge, tailranks)
     self.cands = []  # priority queue of (viterbi, edge, tailranks)
     self.index = set()  # of (edge, tailranks)

     for edge in node.outedges:
         score = bigfloat.bigfloat(edge.weight)
         for tail in edge.tails:
             tail_score, _ = viterbi[tail]
             score *= tail_score
         first_ranks = (0, ) * len(edge.tails)
         self.cands.append((-score, edge, first_ranks))
         self.index.add((edge, first_ranks))

     heapq.heapify(self.cands)
     top = heapq.heappop(self.cands)
     (neg_score, top_edge, top_ranks) = top
     self.nbest.append((neg_score, top_edge, top_ranks))
Example #8
0
def FileRead():
    """Build the module-level ``grammar`` table from './train.trees.pre.unk'.

    For every node with children in every tree of the file, the rule
    ``node.label -> tuple(str(child) ...)`` is counted in
    ``grammar[label][rhs]``.  Each label that has no ``('<unk>',)`` rule then
    receives one with count 1.  The number of rules and the most frequent rule
    are printed, and finally every count is replaced in place by
    ``log10(count / total count for that label)``.

    Returns:
        None; ``grammar`` (defined outside this block) is mutated.
    """
    with open('./train.trees.pre.unk', 'r') as f:
        for line in f:
            for node in Tree.from_str(line).bottomup():
                if node.children == []:
                    continue
                rhs = tuple(str(child) for child in node.children)
                rules = grammar.setdefault(node.label, {})
                rules[rhs] = rules.get(rhs, 0) + 1

    # Smoothing by adding additional rules.  Test for the exact key: the
    # previous substring search over str(dict) also matched any other rule
    # whose text merely contained '<unk>'.
    unk_rhs = ('<unk>',)
    for rules in grammar.values():
        if unk_rhs not in rules:
            rules[unk_rhs] = 1

    count = sum(len(rules) for rules in grammar.values())
    # A single pre-joined string prints the same text whether ``print`` is a
    # statement or a function.
    print(' '.join(
        ["QUESTION 1 - \n Number of rules in grammar = ", str(count)]))

    answer = [[None, [None]], 0]
    for label, rules in grammar.items():
        for rhs, rule_count in rules.items():
            # Strict '>' keeps the first rule seen among equally frequent ones.
            if rule_count > answer[1]:
                answer[1] = rule_count
                answer[0][0] = label
                answer[0][1] = rhs

    print(' '.join([
        "Most Frequent Rule: \n ",
        str(answer[0][0]),
        "->" + str(answer[0][1]),
        "Count =",
        str(answer[1]),
    ]))

    # Replace raw counts with log10 relative frequencies per label.
    for rules in grammar.values():
        total = sum(rules.values())
        for rhs, rule_count in list(rules.items()):
            rules[rhs] = log10(bigfloat(float(rule_count) / float(total)))
Example #9
0
File: graph.py Project: yqu1/PSHRG
        def visit(node):
            """Return the memoised best score of ``node``: the maximum, over
            its out-edges, of the edge weight times the best scores of the
            edge's tails.  ``memo[node]`` holds ``(score, edge)``."""
            if node not in memo:
                # Pre-seed with a zero score so that a descendant that points
                # back to this node reads 0 and therefore never selects it.
                best_score, best_edge = 0., None
                memo[node] = (best_score, best_edge)
                for edge in node.outedges:
                    score = bigfloat.bigfloat(edge.weight)
                    for tail in edge.tails:
                        score *= visit(tail)
                    if best_edge is None or score > best_score:
                        best_score, best_edge = score, edge
                memo[node] = (best_score, best_edge)
                return best_score

            cached_score, _ = memo[node]
            return cached_score
Example #10
0
File: graph.py Project: yqu1/PSHRG
 def sample(self, inside):
     """Pick one out-edge per node at random and build the result.

     For each node yielded by ``self.dfs()``, every out-edge is given the
     share ``weight * prod(inside[tail]) / inside[node]``; one uniform random
     number is drawn and the first edge whose cumulative share exceeds it is
     chosen.  An assertion fails if no edge is chosen.  Returns
     ``self.construct`` applied to the node -> edge mapping.
     """
     chosen = {}
     for node in self.dfs():
         # Edge inside weights are not stored anywhere, so rebuild them here.
         shares = []
         for edge in node.outedges:
             mass = bigfloat.bigfloat(edge.weight)
             for tail in edge.tails:
                 mass *= inside[tail]
             shares.append((mass / inside[node], edge))

         threshold = random.random()
         cumulative = 0.
         picked = False
         for share, edge in shares:
             cumulative += share
             if cumulative > threshold:
                 chosen[node] = edge
                 picked = True
                 break
         assert picked
     return self.construct(chosen)
Example #11
0
def parse(states, bps, best, line, length=1):
    """Fill the chart bottom-up with binary rules from the file 'pcfg'.

    For every span length from ``length`` up to ``len(line)``, each pair of
    adjacent states ``s1 = (Y, i, k)`` and ``s2 = (Z, k, j)`` covering that
    length is matched against grammar lines of the form ``X -> Y Z # prob``.
    A matching line proposes the state ``(X, i, j)`` with score
    ``best[s1] + best[s2] + log10(prob)``; the proposal is stored if the state
    is new or the score is strictly higher than the stored one.

    Args:
        states: list of ``(label, start, end)`` tuples; extended in place.
        bps: dict state -> back-pointer tuple; updated in place.
        best: dict state -> log10 score; updated in place.
        line: the tokenised sentence (only its length is used).
        length: first span length to process.

    Returns:
        The same ``(states, bps, best)`` objects.
    """
    rules = None  # grammar lines, read on first use only
    # Iterating replaces the former one-recursive-call-per-length scheme.
    for span in range(length, len(line) + 1):
        # States created at this length are exactly ``span`` wide, so they
        # cannot take part in another pair of total width ``span``; iterating
        # over a snapshot is therefore equivalent and avoids walking a list
        # that is being appended to.
        candidates = list(states)
        for s1 in candidates:
            for s2 in candidates:
                if s1[2] != s2[1] or s2[2] - s1[1] != span:
                    continue
                if rules is None:
                    with open('pcfg') as grammar_file:
                        rules = grammar_file.readlines()
                # Build the pattern once per pair.  re.escape guards every
                # metacharacter in a label (for example '$' or '.'), not only
                # '*' as before.
                matcher = re.compile(r'.*->\s' + re.escape(s1[0]) + ' ' +
                                     re.escape(s2[0]) + r'\s#\s.*')
                for rule in rules:
                    if not matcher.match(rule):
                        continue
                    prob = float(re.match(r'.*\s#\s(.*)\b', rule).group(1))
                    logprob = (best[s1] + best[s2] +
                               bigfloat.log10(bigfloat.bigfloat(prob)))
                    lhs = re.match(r'(.*)\s->.*', rule).group(1)
                    new_state = (lhs, s1[1], s2[2])
                    is_new = new_state not in states
                    if is_new:
                        states.append(new_state)
                    if is_new or logprob > best[new_state]:
                        best[new_state] = logprob
                        # Back-pointer: rule text before ' #' and the split
                        # position.
                        bps[new_state] = (
                            re.match(r'(.*)\s#.*', rule).group(1), s1[2])
    return states, bps, best
def q1_parse_input_trees(tree_unk_file):
    """Count grammar rules in a treebank file, smooth with ``<unk>`` and
    attach log10 probabilities.

    Every non-empty line of ``tree_unk_file`` is parsed with
    ``Tree.from_str``.  For each node that has children, the rule
    ``str(node) -> rhs`` is counted, where ``rhs`` is ``str(child)`` when the
    first child has no children of its own, and otherwise a tuple of the
    ``str`` of every child.  Every left-hand side lacking a ``'<unk>'``
    right-hand side then receives one with count 1.

    Args:
        tree_unk_file: path of the treebank file, one tree per line.

    Returns:
        dict mapping ``str(node)`` to a dict mapping each right-hand side to
        ``{'count': int, 'probability': log10(count / total for that lhs)}``.

    Side effects:
        Prints the most frequent rule and its count.
    """
    # The context manager closes the file; it was previously left open.
    with open(tree_unk_file, 'r') as f:
        data = f.read().split('\n')

    inputTrees = [Tree.from_str(line) for line in data if line != '']

    rules_dict = {}
    for tree in inputTrees:
        for node in tree.bottomup():
            children = node.children
            if children == []:
                continue

            # A childless first child is stored as a plain string, anything
            # else as a tuple.
            # TODO: decide whether leaf strings should be lower-cased.
            if len(children[0].children) == 0:
                right_rule = str(children[0])
            else:
                right_rule = tuple(str(child) for child in children)

            entry = rules_dict.setdefault(str(node), {}).setdefault(
                right_rule, {'count': 0, 'probability': 0})
            entry['count'] += 1

    # Smooth: give every left-hand side an '<unk>' rule seen once.
    for rhs_table in rules_dict.values():
        if '<unk>' not in rhs_table:
            rhs_table['<unk>'] = {'count': 1, 'probability': 0}

    q1_answer = [[None, [None]], 0]
    for left_rule, right_rule in rules_dict.items():
        denominator = 0
        for r_rule, count_prob_dict in right_rule.items():
            denominator += count_prob_dict['count']

            # Strict '>' keeps the first rule seen among equally frequent ones.
            if count_prob_dict['count'] > q1_answer[1]:
                q1_answer[1] = count_prob_dict['count']
                q1_answer[0][0] = left_rule
                q1_answer[0][1] = r_rule

        for count_prob_dict in right_rule.values():
            count_prob_dict['probability'] = log10(
                bigfloat(float(count_prob_dict['count']) / denominator))

    # A single pre-joined string prints the same text whether ``print`` is a
    # statement or a function.
    print(' '.join([
        'QUESTION 1 - Most Frequent Rule: ',
        str(q1_answer[0][0]),
        '->',
        str(q1_answer[0][1]),
        '    Occcourence =',
        str(q1_answer[1]),
    ]))

    return rules_dict
    def parse(self, sentence):
        """Fill ``self.two_array`` with a CKY-style chart for ``sentence``.

        ``sentence`` is split on whitespace.  ``self.two_array`` becomes a
        ``word_num`` x ``word_num`` table initialised with ``0``; a cell
        ``[i][j]`` with ``i <= j`` is replaced by a dict holding:

        * ``'key_probability'``: dict of key -> value for that cell,
        * ``'_max_probability'`` and ``'_backpointer'``: the best value found
          and the rule string that produced it,
        * ``'_left_position_x/y'`` and ``'_right_position_x/y'``: indices of
          the two child cells (the right ones are ``None`` on the diagonal).

        Relies on ``self.get_key_from_value`` and ``self.lg.count_dict``,
        which are defined outside this block.

        Returns ``None``; an empty ``sentence`` returns immediately without
        touching any attribute.
        """
        if len(sentence) == 0:
            return

        # NOTE(review): `lengh` is never read in this method.
        lengh = 1
        input_str = sentence.split()
        self.word_num = len(input_str)
        # 0 marks a cell that has not been filled (tested with `!= 0` below).
        self.two_array = [[0 for x in range(self.word_num)]
                          for y in range(self.word_num)]
        # nonterminal to terminal
        max_probability = -1
        backpointer = ''
        for index in range(len(input_str)):
            # Reset the running best for this word.
            max_probability = -1
            backpointer = ''
            self.two_array[index][index] = {}
            # Keys returned for this word; each is looked up in count_dict as
            # "<key> <word>".
            rl = self.get_key_from_value(input_str[index])
            self.two_array[index][index]['key_probability'] = {}
            for each_key in rl:
                self.two_array[index][index]['key_probability'][
                    each_key] = self.lg.count_dict[each_key + ' ' +
                                                   input_str[index]]
                if float(self.lg.count_dict[each_key + ' ' + input_str[index]]
                         ) > float(max_probability):
                    max_probability = self.lg.count_dict[each_key + ' ' +
                                                         input_str[index]]
                    backpointer = each_key + ' ' + input_str[index]
            #print type(max_probability)
            #print max_probability
            self.two_array[index][index]['_max_probability'] = bigfloat(
                float(max_probability))
            self.two_array[index][index]['_backpointer'] = backpointer
            self.two_array[index][index]['_left_position_x'] = index
            self.two_array[index][index]['_left_position_y'] = index
            self.two_array[index][index]['_right_position_x'] = None
            self.two_array[index][index]['_right_position_y'] = None

        # nonterminal to nonterminal
        # `num` is the distance between the row and column index of the cell
        # being filled, so cells are filled one diagonal at a time.
        for num in range(1, self.word_num):
            for i in range(0, self.word_num - num):
                self.two_array[i][i + num] = {}
                # x = i y = i + num
                # compare with --- |||
                self.two_array[i][i + num]['key_probability'] = {}
                #vertical - x
                v_dict = {}
                h_dict = {}
                max_probability = -1
                backpointer = ''
                left_position_x = None
                left_position_y = None
                right_position_x = None
                right_position_y = None

                # Try every split: left child is cell [i][n], right child is
                # cell [n + 1][i + num], with n running from i + num - 1 down
                # to i.
                for n in range(i + num - 1, i - 1, -1):
                    if self.two_array[i][n] != 0:
                        h_dict = self.two_array[i][n]['key_probability']
                        #for m in range(i + 1, i + num + 1):
                        # margin = i + num - (i + num - n) =
                        m = n + 1

                        # Debug trace of the current cell and split indices
                        # (Python 2 print statements).
                        print "################################"
                        print i
                        print num
                        print n
                        print m
                        print "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$"

                        if self.two_array[m][i + num] != 0:
                            v_dict = self.two_array[m][i +
                                                       num]['key_probability']

                            for hk in h_dict:
                                for vk in v_dict:
                                    # first we got the rule_root of vk+hk, and pick the one with bigger value
                                    krl = self.get_key_from_value(hk + ' ' +
                                                                  vk)
                                    # presumably 'unk' is the helper's marker
                                    # for "nothing found" -- TODO confirm
                                    # against get_key_from_value.
                                    if krl[0] == 'unk':
                                        continue

                                    child_max_probability = -1
                                    child_rule = ''
                                    child_key = ''

                                    child_left_position_x = None
                                    child_left_position_y = None
                                    child_right_position_x = None
                                    child_right_position_y = None
                                    # Keep the candidate k whose
                                    # "<k> <hk> <vk>" entry in count_dict is
                                    # largest.
                                    for k in krl:
                                        if (k + ' ' + hk + ' ' +
                                                vk) in self.lg.count_dict:
                                            if float(self.lg.count_dict[
                                                    k + ' ' + hk + ' ' + vk]
                                                     ) > child_max_probability:
                                                child_max_probability = self.lg.count_dict[
                                                    k + ' ' + hk + ' ' + vk]
                                                child_rule = k + ' ' + hk + ' ' + vk
                                                child_key = k
                                                child_left_position_x = i
                                                child_left_position_y = n
                                                child_right_position_x = m
                                                child_right_position_y = i + num

                                    # NOTE(review): if no k was found in
                                    # count_dict, child_max_probability is
                                    # still -1 here and is multiplied in as is
                                    # -- confirm this is intended.
                                    pro = float(
                                        (child_max_probability)) * float(
                                            (h_dict[hk])) * float((v_dict[vk]))
                                    # NOTE(review): `k` is whatever the loop
                                    # above left behind (the last element of
                                    # krl), not `child_key`; `child_key` is
                                    # assigned but never read.  Verify which
                                    # key was meant.
                                    self.two_array[i][
                                        i + num]['key_probability'][k] = pro
                                    if pro > max_probability:
                                        max_probability = pro
                                        backpointer = child_rule
                                        left_position_x = child_left_position_x
                                        left_position_y = child_left_position_y
                                        right_position_x = child_right_position_x
                                        right_position_y = child_right_position_y

                # Record the best combination found for this cell; the
                # defaults (-1, '', None) remain when nothing combined.
                self.two_array[i][i +
                                  num]['_max_probability'] = max_probability
                self.two_array[i][i + num]['_backpointer'] = backpointer
                self.two_array[i][i +
                                  num]['_left_position_x'] = left_position_x
                self.two_array[i][i +
                                  num]['_left_position_y'] = left_position_y
                self.two_array[i][i +
                                  num]['_right_position_x'] = right_position_x
                self.two_array[i][i +
                                  num]['_right_position_y'] = right_position_y
Example #14
0
    print "Most Frequent Rule is:-", items, "and the count is=", grammar_dict[
        items]

# All rule strings of grammar_dict.  The code below passes each one, together
# with this collection, to getDenominator.
keys = grammar_dict.keys()


def getDenominator(each, keys):
    """Count the entries of ``keys`` whose first space-separated token equals
    the first space-separated token of ``each``.

    Args:
        each: a string; only the text before its first space is used.
        keys: iterable of strings compared in the same way.

    Returns:
        The number of matching entries (0 when ``keys`` is empty).
    """
    head = each.split(' ', 1)[0]
    return sum(1 for key in keys if key.split(' ', 1)[0] == head)


# Map every rule string to log10(count / denominator).
# NOTE(review): getDenominator returns the number of distinct rules that share
# the rule's first token, not the sum of their counts -- confirm that this is
# the intended normaliser.
prob_dict = {}
for each in keys:
    num = grammar_dict[each]
    den = getDenominator(each, keys)
    # Convert before dividing: float(num / den) floors to 0 under Python 2
    # integer division whenever num < den, which made log10 receive 0.
    prob_dict[each] = log10(bigfloat(float(num) / den))
# One pre-built string prints the same text whether ``print`` is a statement
# or a function.
print('################################################')