def viterbiCKY(w):
    """Run probabilistic CKY over the token sequence ``w`` and return the best tree.

    The grammar is read from names defined elsewhere in this module: ``N``
    (iterable of nonterminals), ``R`` (``R[X]`` maps a right-hand-side string
    to the probability of ``X -> rhs``) and ``S`` (start symbol).  The helpers
    ``isGenTwoNonTerminal``, ``createTree`` and ``Tree`` are also external.

    Args:
        w: sequence of word tokens.

    Returns:
        ``str(Tree(createTree(S, 0, n, back)))`` for the filled back-pointer
        chart.  The sentence and the base-10 log of the best probability of
        ``S`` over the whole sentence are printed as a side effect.

    Raises:
        ValueError: from ``math.log`` when ``S`` has probability 0 over the
            whole sentence (i.e. no parse was found).
    """
    n = len(w)
    # best[i][j][X]: highest probability found so far for X spanning w[i:j].
    best = defaultdict(
        lambda: defaultdict(lambda: defaultdict(lambda: bigfloat(0))))
    # back[i][j][X]: record describing how best[i][j][X] was built.
    back = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: None)))

    # Width-1 spans: rules whose right-hand side is the word itself.
    for i in range(1, n + 1):
        word = w[i - 1]
        for X in N:
            # NOTE(review): membership is tested on the raw token but the
            # probability is looked up under the stripped token, exactly as
            # before; confirm tokens never carry surrounding whitespace.
            if word in R[X]:
                p = bigfloat(R[X][word.strip()])
                if p > best[i - 1][i][X]:
                    best[i - 1][i][X] = p
                    back[i - 1][i][X] = (X, word, i - 1, i)

    # The set of binary rules does not depend on the span, so parse it once
    # instead of once per (i, j, k).  Only needed when some span has width >= 2.
    binary_rules = []
    if n >= 2:
        for X in N:
            for right in R[X].keys():
                if isGenTwoNonTerminal(right):
                    YZ = right.strip().split(" ")
                    binary_rules.append(
                        (X, YZ[0].strip(), YZ[1].strip(),
                         bigfloat(R[X][right])))

    # Wider spans, shortest first, trying every split point k.
    for l in range(2, n + 1):
        for i in range(0, n - l + 1):
            j = i + l
            for k in range(i + 1, j):
                for X, Y, Z, p in binary_rules:
                    p_ = p * best[i][k][Y] * best[k][j][Z]
                    if p_ > best[i][j][X]:
                        best[i][j][X] = p_
                        back[i][j][X] = (X, Y, Z, i, j, k)

    G_ = str(Tree(createTree(S, 0, n, back)))
    print(w)
    # Index the full-sentence cell explicitly.  The previous code reused the
    # loop variables i and j, which are undefined for n == 0 and only partly
    # defined for n == 1.
    print("Prob = " + str(math.log(best[0][n][S], 10)))
    return G_
def parser(line):
    """Seed the chart with one-word states for ``line`` and return its parse.

    For every token, each rule line in the file ``pcfg`` whose right-hand side
    is exactly that token contributes a state ``(lhs, i, i + 1)``.  If no rule
    matches the token, every rule line containing ``<unk>`` is used instead.
    The seeded chart is then handed to ``parse`` and the result for the root
    ``('TOP', 0, len(line))`` is rendered with ``trees`` (both defined
    elsewhere in this file).

    Args:
        line: sequence of word tokens.

    Returns:
        Whatever ``trees(root, bps)`` returns, or ``''`` if it raises.
    """
    lhs_re = re.compile(r'(.*)\s->.*')
    prob_re = re.compile(r'.*\s#\s(.*)\b')
    rule_text_re = re.compile(r'(.*)\s#.*')
    unk_pattern = re.compile(r'.*<unk>.*')

    states = []
    bps = {}
    best = {}

    # Read the grammar once instead of re-opening it for every token.  As
    # before, the file is not touched at all for an empty sentence.
    rule_lines = []
    if len(line) > 0:
        with open('pcfg') as rules:
            rule_lines = rules.readlines()

    def _add_lexical_states(matches, i):
        """Add a state for every rule line accepted by ``matches``; report if any was added."""
        added = False
        for rule in rule_lines:
            if matches(rule):
                state = (lhs_re.match(rule).group(1), i, i + 1)
                logprob = bigfloat.log10(bigfloat.bigfloat(
                    float(prob_re.match(rule).group(1))))
                states.append(state)
                best[state] = logprob
                bps[state] = (rule_text_re.match(rule).group(1),)
                added = True
        return added

    for i in range(len(line)):
        # re.escape neutralises every regex metacharacter in the token; the
        # previous hand-rolled escaping only covered '.' and '?'.
        word_pattern = re.compile(
            r'.*->\s' + re.escape(line[i]) + r'\s#\s.*')
        if not _add_lexical_states(word_pattern.match, i):
            _add_lexical_states(unk_pattern.match, i)

    states, bps, best = parse(states, bps, best, line)
    root = ('TOP', 0, len(line))
    try:
        return trees(root, bps)
    except Exception:
        # Best effort: a sentence that cannot be rendered yields an empty
        # result.  Unlike a bare ``except:``, this no longer swallows
        # KeyboardInterrupt / SystemExit.
        return ''
def main():
    """Estimate rule probabilities from a treebank and write them out.

    Each line of the input file is parsed with ``Tree.fromstring``.  For every
    distinct production the relative frequency
    ``count(production) / count(production.lhs())`` is written to the output
    file as ``'<production> # <prob> '`` and its base-10 log is written to the
    file ``pcfg_log`` in the same format.

    Command-line options are ``--infile/-i``, ``--outfile/-o`` and the on/off
    ``debug`` flag registered through ``addonoffarg``.  A temporary work
    directory is created; it is removed at exit unless debug mode is on, in
    which case its path is printed.
    """
    # NOTE(review): the description and the --infile help text say the input
    # is ignored, but the input file is read below.  The strings are left
    # untouched because they are user-facing; confirm and reword.
    parser = argparse.ArgumentParser(
        description=
        "ignore input; make a demo grammar that is compliant in form",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    addonoffarg(parser, 'debug', help="debug mode", default=False)
    parser.add_argument("--infile",
                        "-i",
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="input file (ignored)")
    parser.add_argument("--outfile",
                        "-o",
                        nargs='?',
                        type=argparse.FileType('w'),
                        default=sys.stdout,
                        help="output file (grammar)")
    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    workdir = tempfile.mkdtemp(prefix=os.path.basename(__file__),
                               dir=os.getenv('TMPDIR', '/tmp'))

    # The context manager guarantees pcfg_log is closed even when reading the
    # treebank or writing the grammar raises.
    with open('pcfg_log', 'w') as fh:

        def cleanwork():
            shutil.rmtree(workdir, ignore_errors=True)

        if args.debug:
            print(workdir)
        else:
            atexit.register(cleanwork)

        treebank_rules = []
        rule_lhs = []
        infile = prepfile(args.infile, 'r')
        outfile = prepfile(args.outfile, 'w')

        for tree in infile:
            for rule in Tree.fromstring(tree).productions():
                rule_lhs.append(rule.lhs())
                treebank_rules.append(rule)

        freq_dict = Counter(rule_lhs)
        treebank_dict = Counter(treebank_rules)

        for production, count in treebank_dict.items():
            prob = bigfloat.bigfloat(count) / bigfloat.bigfloat(
                freq_dict[production.lhs()])
            outfile.write('{0} # {1} \n'.format(production, prob))
            fh.write('{0} # {1} \n'.format(production, bigfloat.log10(prob)))
def q1_parse_input_trees(tree_unk_file):
    """Count the rules of a treebank file and attach log10 probabilities.

    Every non-empty line of ``tree_unk_file`` is parsed with ``Tree.from_str``.
    For each node that has children a rule is recorded under ``str(node)``:
    the right-hand side is ``str(child)`` when the first child has no children
    of its own, otherwise a tuple of ``str(child)`` for all children.

    Args:
        tree_unk_file: path of the file holding one tree per line.

    Returns:
        ``{lhs: {rhs: {'count': int, 'probability': log10 value}}}`` where the
        probability is ``log10(count / total count of that lhs)``.  The most
        frequent rule is printed as a side effect.
    """
    # Close the file deterministically (the handle used to be leaked).
    with open(tree_unk_file, 'r') as f:
        data = f.read().split('\n')
    inputTrees = [Tree.from_str(line) for line in data if line != '']

    rules_dict = {}
    for tree in inputTrees:
        if tree == '':  # kept from the original; skips an empty-string "tree"
            continue
        for node in tree.bottomup():
            children = node.children
            if children == []:
                continue
            expansions = rules_dict.setdefault(str(node), {})
            # A first child without children marks a terminal rule, keyed by a
            # plain string; anything else is keyed by a tuple of child names.
            if len(children[0].children) == 0:
                right_rule = str(children[0])
            else:
                right_rule = tuple(str(child) for child in children)
            stats = expansions.setdefault(right_rule, {
                'count': 0,
                'probability': 0
            })
            stats['count'] += 1

    q1_answer = [[None, [None]], 0]  # [[lhs, rhs], count] of the top rule
    for left_rule, expansions in rules_dict.items():
        denominator = 0
        for r_rule, stats in expansions.items():
            denominator += stats['count']
            if stats['count'] > q1_answer[1]:
                q1_answer[1] = stats['count']
                q1_answer[0][0] = left_rule
                q1_answer[0][1] = r_rule
        for stats in expansions.values():
            stats['probability'] = log10(
                bigfloat(float(stats['count']) / denominator))

    # str() around the right-hand side: it may be a tuple (or the [None]
    # placeholder when there were no rules), and '->' + <non-string> raised
    # TypeError.  String right-hand sides print exactly as before.
    print(' '.join(str(part) for part in (
        'QUESTION 1 - Most Frequent Rule: ', q1_answer[0][0],
        '->' + str(q1_answer[0][1]), ' Occcourence =', q1_answer[1])))
    return rules_dict
def visit(node):
    """Return the summed weight of everything reachable below ``node``.

    The weight of an edge is ``edge.weight`` multiplied by the visit value of
    each of its tails; a node's value is the sum over its ``outedges``.
    Results are cached in the enclosing ``memo`` mapping.
    """
    if node in memo:
        return memo[node]

    # Store a zero first so that a descendant pointing back to this node
    # reads 0 instead of recursing forever.
    total = 0.
    memo[node] = total

    for out_edge in node.outedges:
        product = bigfloat.bigfloat(out_edge.weight)
        for tail in out_edge.tails:
            product = product * visit(tail)
        total = total + product

    memo[node] = total
    return total
def visit(u, clo):
    """Return the largest edge weight for item ``u`` and record the winning edge.

    ``chart``, ``weight``, ``ant`` and ``hypergraphs`` come from the enclosing
    scope.  An item absent from ``chart`` has weight 0; an item already in
    ``weight`` returns its cached value.  Otherwise every edge from
    ``hypergraphs.edges(chart, u)`` whose head ``e.h[0]`` is ``u`` is scored as
    the product of the visit values of its tails ``e.h[1:]``.

    ``clo`` is a set of items already seen; tails found in it are skipped, and
    each recursive call receives a copy taken after the tail was added.

    The best weight is stored in ``weight[u]`` and the best edge in ``ant[u]``
    (both ``None`` when no edge is headed by ``u``).
    """
    if u not in chart:
        return bigfloat.bigfloat(0.)
    if u in weight:
        return weight[u]

    best_weight, best_edge = None, None
    for edge in hypergraphs.edges(chart, u):
        if edge.h[0] == u:
            product = bigfloat.bigfloat(1.)
            for tail in edge.h[1:]:
                if tail not in clo:
                    clo.add(tail)
                    product = product * visit(tail, clo.copy())
            if best_weight is None or product > best_weight:
                best_weight, best_edge = product, edge

    weight[u] = best_weight
    ant[u] = best_edge
    return best_weight
def __init__(self, node, viterbi):
    """Set up the n-best bookkeeping for ``node`` and extract its 1-best entry.

    Args:
        node: object exposing ``outedges``; each edge has ``weight`` and
            ``tails``.
        viterbi: mapping from a tail node to a pair whose first element is
            that tail's best weight.

    Every out-edge is scored as its weight times the best weight of each tail
    and queued with all tail ranks at zero.  Scores are stored negated so that
    the min-heap pops the largest one first; the popped entry becomes the
    first element of ``self.nbest``.
    """
    self.nbest = []  # of (viterbi, edge, tailranks)
    self.cands = []  # priority queue of (viterbi, edge, tailranks)
    self.index = set()  # of (edge, tailranks)

    for out_edge in node.outedges:
        first_ranks = (0, ) * len(out_edge.tails)
        score = bigfloat.bigfloat(out_edge.weight)
        for tail_node in out_edge.tails:
            tail_best, _unused = viterbi[tail_node]
            score = score * tail_best
        self.cands.append((-score, out_edge, first_ranks))
        self.index.add((out_edge, first_ranks))

    heapq.heapify(self.cands)
    top_entry = heapq.heappop(self.cands)
    self.nbest.append(top_entry)
def FileRead():
    """Build the global ``grammar`` from ``./train.trees.pre.unk``.

    Steps:
      1. Count every rule ``label -> tuple(str(child) ...)`` over all nodes
         with children, for every tree in the file.
      2. Smooth: give each left-hand side lacking a ``('<unk>',)`` rule that
         rule with a count of 1.
      3. Print the number of rules and the most frequent rule.
      4. Replace each count by ``log10(count / total count of its lhs)``.

    The function mutates the module-level ``grammar`` mapping in place and
    returns ``None``.
    """
    with open('./train.trees.pre.unk', 'r') as f:
        for line in f:
            tr1 = Tree.from_str(line)
            for node in tr1.bottomup():
                if node.children == []:
                    continue
                # Materialise the right-hand side once.  The old code called
                # tuple() twice on the result of map(), which is an exhausted
                # iterator the second time under Python 3.
                rhs = tuple(str(child) for child in node.children)
                expansions = grammar.setdefault(node.label, {})
                expansions[rhs] = expansions.get(rhs, 0) + 1

    # Smoothing by adding additional rules.  Test for the exact rule key
    # instead of searching for '<unk>' inside the dict's string form.
    unk_rhs = ('<unk>', )
    for expansions in grammar.values():
        if unk_rhs not in expansions:
            expansions[unk_rhs] = 1

    count = sum(len(expansions) for expansions in grammar.values())
    print(' '.join(
        ["QUESTION 1 - \n Number of rules in grammar = ", str(count)]))

    answer = [[None, [None]], 0]  # [[lhs, rhs], count] of the top rule
    for lhs, expansions in grammar.items():
        for rhs, rule_count in expansions.items():
            if rule_count > answer[1]:
                answer[1] = rule_count
                answer[0][0] = lhs
                answer[0][1] = rhs
    print(' '.join([
        "Most Frequent Rule: \n ", str(answer[0][0]),
        "->" + str(answer[0][1]), "Count =", str(answer[1])
    ]))

    # Turn counts into log10 relative frequencies per left-hand side.
    for expansions in grammar.values():
        total = float(sum(expansions.values()))
        for rhs, rule_count in list(expansions.items()):
            expansions[rhs] = log10(bigfloat(float(rule_count) / total))
def visit(node):
    """Return the highest derivation weight below ``node``.

    An edge scores ``edge.weight`` times the visit value of each tail; the
    node keeps the best-scoring edge.  The pair ``(best weight, best edge)``
    is cached in the enclosing ``memo`` mapping.
    """
    if node in memo:
        cached_weight, _cached_edge = memo[node]
        return cached_weight

    # Seed the memo with a zero probability before descending: if a
    # descendant points back to this node it sees 0 and will not choose it.
    best_weight, best_edge = 0., None
    memo[node] = (best_weight, best_edge)

    for out_edge in node.outedges:
        score = bigfloat.bigfloat(out_edge.weight)
        for tail in out_edge.tails:
            score = score * visit(tail)
        if best_edge is None or score > best_weight:
            best_weight = score
            best_edge = out_edge

    memo[node] = (best_weight, best_edge)
    return best_weight
def sample(self, inside):
    """Draw one random derivation, choosing an out-edge for every node.

    Args:
        inside: mapping from node to its inside weight.

    For each node yielded by ``self.dfs()``, every out-edge gets the share
    ``edge.weight * product(inside[tail]) / inside[node]``.  A uniform random
    number picks the first edge at which the running sum of shares exceeds
    it.  An ``AssertionError`` is raised if no edge is picked.

    Returns:
        ``self.construct(edges)`` for the chosen ``{node: edge}`` mapping.
    """
    chosen = {}
    for node in self.dfs():
        # The per-edge inside weights were not kept, so rebuild them here.
        shares = []
        for out_edge in node.outedges:
            mass = bigfloat.bigfloat(out_edge.weight)
            for tail in out_edge.tails:
                mass = mass * inside[tail]
            shares.append((mass / inside[node], out_edge))

        threshold = random.random()
        running = 0.
        picked = False
        for share, out_edge in shares:
            running = running + share
            if running > threshold:
                chosen[node] = out_edge
                picked = True
                break
        assert picked

    return self.construct(chosen)
def parse(states, bps, best, line, length=1):
    """Extend the chart bottom-up with binary rules from the file ``pcfg``.

    For each span width from ``length`` up to ``len(line)``, every pair of
    adjacent states ``s1 = (Y, i, k)`` and ``s2 = (Z, k, j)`` with
    ``j - i`` equal to the current width is combined through every rule line
    whose right-hand side is ``Y Z``.  A new state ``(lhs, i, j)`` is added,
    or an existing one updated, whenever the combined log probability
    improves.

    Args:
        states: list of ``(label, start, end)`` states; extended in place.
        bps: back-pointers, ``state -> (rule text, split point)``; updated in
            place.
        best: ``state -> log probability``; updated in place.
        line: the token sequence being parsed (only its length is used).
        length: first span width to process.

    Returns:
        The tuple ``(states, bps, best)``.
    """
    lhs_re = re.compile(r'(.*)\s->.*')
    prob_re = re.compile(r'.*\s#\s(.*)\b')
    rule_text_re = re.compile(r'(.*)\s#.*')

    rule_lines = None  # grammar lines, read on first use only
    pair_patterns = {}  # (left label, right label) -> compiled rule matcher

    # Iterating instead of recursing keeps the stack flat for long sentences.
    while length <= len(line):
        # New states appended below have exactly the current width and can
        # never satisfy the width test at this level, so iterating the list
        # while it grows is harmless.
        for s1 in states:
            for s2 in states:
                if s1[2] != s2[1] or s2[2] - s1[1] != length:
                    continue

                if rule_lines is None:
                    with open('pcfg') as rules:
                        rule_lines = rules.readlines()

                labels = (s1[0], s2[0])
                pattern = pair_patterns.get(labels)
                if pattern is None:
                    # re.escape makes labels such as 'PRP$' or '.' match
                    # literally; previously only '*' was escaped.
                    pattern = re.compile(r'.*->\s' + re.escape(s1[0]) + ' ' +
                                         re.escape(s2[0]) + r'\s#\s.*')
                    pair_patterns[labels] = pattern

                for rule in rule_lines:
                    if not pattern.match(rule):
                        continue
                    logprob = best[s1] + best[s2] + bigfloat.log10(
                        bigfloat.bigfloat(
                            float(prob_re.match(rule).group(1))))
                    new_state = (lhs_re.match(rule).group(1), s1[1], s2[2])
                    if new_state not in states:
                        states.append(new_state)
                        best[new_state] = logprob
                        bps[new_state] = (rule_text_re.match(rule).group(1),
                                          s1[2])
                    elif logprob > best[new_state]:
                        best[new_state] = logprob
                        bps[new_state] = (rule_text_re.match(rule).group(1),
                                          s1[2])
        length += 1

    return states, bps, best
def q1_parse_input_trees(tree_unk_file):
    """Count the rules of a treebank file, smooth with ``<unk>``, add log10 probabilities.

    Every non-empty line of ``tree_unk_file`` is parsed with ``Tree.from_str``.
    For each node that has children a rule is recorded under ``str(node)``:
    the right-hand side is ``str(child)`` when the first child has no children
    of its own, otherwise a tuple of ``str(child)`` for all children.  Each
    left-hand side lacking a ``'<unk>'`` rule then receives one with count 1.

    Args:
        tree_unk_file: path of the file holding one tree per line.

    Returns:
        ``{lhs: {rhs: {'count': int, 'probability': log10 value}}}`` where the
        probability is ``log10(count / total count of that lhs)``.  The most
        frequent rule is printed as a side effect.
    """
    # Close the file deterministically (the handle used to be leaked).
    with open(tree_unk_file, 'r') as f:
        data = f.read().split('\n')
    inputTrees = [Tree.from_str(line) for line in data if line != '']

    rules_dict = {}
    for tree in inputTrees:
        if tree == '':  # kept from the original; skips an empty-string "tree"
            continue
        for node in tree.bottomup():
            children = node.children
            if children == []:
                continue
            expansions = rules_dict.setdefault(str(node), {})
            # A first child without children marks a terminal rule, keyed by a
            # plain string; anything else is keyed by a tuple of child names.
            if len(children[0].children) == 0:
                right_rule = str(children[0])
            else:
                right_rule = tuple(str(child) for child in children)
            stats = expansions.setdefault(right_rule, {
                'count': 0,
                'probability': 0
            })
            stats['count'] += 1

    # Smooth: every left-hand side gets an '<unk>' rule seen once.
    for expansions in rules_dict.values():
        if '<unk>' not in expansions:
            expansions['<unk>'] = {'count': 1, 'probability': 0}

    q1_answer = [[None, [None]], 0]  # [[lhs, rhs], count] of the top rule
    for left_rule, expansions in rules_dict.items():
        denominator = 0
        for r_rule, stats in expansions.items():
            denominator += stats['count']
            if stats['count'] > q1_answer[1]:
                q1_answer[1] = stats['count']
                q1_answer[0][0] = left_rule
                q1_answer[0][1] = r_rule
        for stats in expansions.values():
            stats['probability'] = log10(
                bigfloat(float(stats['count']) / denominator))

    print(' '.join(str(part) for part in (
        'QUESTION 1 - Most Frequent Rule: ', q1_answer[0][0], '->',
        q1_answer[0][1], ' Occcourence =', q1_answer[1])))
    return rules_dict
# parse(self, sentence): fill the chart ``self.two_array`` for a sentence.
#
# NOTE(review): this method is stored collapsed onto two physical lines, so
# the original indentation -- and with it the exact nesting of the inner
# loops -- cannot be read off this text.  Restore the formatting from
# version control before changing any logic.  The code below is untouched.
#
# What the visible statements establish:
#   * ``sentence`` is a string; it is split on whitespace and the number of
#     tokens is stored in ``self.word_num``.  An empty sentence returns early.
#   * ``self.two_array`` is a word_num x word_num grid initialised with 0;
#     cells that get filled become dicts.
#   * Diagonal cell [index][index]: 'key_probability' maps every key from
#     ``self.get_key_from_value(word)`` to
#     ``self.lg.count_dict[key + ' ' + word]``.  The cell also stores
#     '_max_probability' (wrapped in ``bigfloat``), '_backpointer' and four
#     '_left/_right_position_x/y' entries.
#   * Cell [i][i + num] for num = 1 .. word_num - 1: pairs keys from cell
#     [i][n] ("h_dict") with keys from cell [n + 1][i + num] ("v_dict"),
#     looks up candidate parents with ``self.get_key_from_value(hk + ' ' +
#     vk)`` and scores them with ``self.lg.count_dict``.  It then stores the
#     same '_max_probability' / '_backpointer' / position entries as above
#     (here '_max_probability' is a plain number, not a bigfloat).
#   * ``i``, ``num``, ``n`` and ``m`` are printed between marker lines on
#     every split point (Python 2 ``print`` statements; they look like
#     leftover debugging output -- confirm before removing).
#   * No value is returned explicitly; results live in ``self.two_array``.
#   * ``lengh``, ``child_key`` are assigned but never read in this text.
# Presumably the values in ``self.lg.count_dict`` are rule probabilities
# keyed by "LHS RHS..." strings -- TODO confirm against the grammar loader.
def parse(self, sentence): if len(sentence) == 0: return lengh = 1 input_str = sentence.split() self.word_num = len(input_str) self.two_array = [[0 for x in range(self.word_num)] for y in range(self.word_num)] # nonterminal to terminal max_probability = -1 backpointer = '' for index in range(len(input_str)): max_probability = -1 backpointer = '' self.two_array[index][index] = {} rl = self.get_key_from_value(input_str[index]) self.two_array[index][index]['key_probability'] = {} for each_key in rl: self.two_array[index][index]['key_probability'][ each_key] = self.lg.count_dict[each_key + ' ' + input_str[index]] if float(self.lg.count_dict[each_key + ' ' + input_str[index]] ) > float(max_probability): max_probability = self.lg.count_dict[each_key + ' ' + input_str[index]] backpointer = each_key + ' ' + input_str[index] #print type(max_probability) #print max_probability self.two_array[index][index]['_max_probability'] = bigfloat( float(max_probability)) self.two_array[index][index]['_backpointer'] = backpointer self.two_array[index][index]['_left_position_x'] = index self.two_array[index][index]['_left_position_y'] = index self.two_array[index][index]['_right_position_x'] = None self.two_array[index][index]['_right_position_y'] = None # nonterminal to nonterminal for num in range(1, self.word_num): for i in range(0, self.word_num - num): self.two_array[i][i + num] = {} # x = i y = i + num # compare with --- ||| self.two_array[i][i + num]['key_probability'] = {} #vertical - x v_dict = {} h_dict = {} max_probability = -1 backpointer = '' left_position_x = None left_position_y = None right_position_x = None right_position_y = None for n in range(i + num - 1, i - 1, -1): if self.two_array[i][n] != 0: h_dict = self.two_array[i][n]['key_probability'] #for m in range(i + 1, i + num + 1): # margin = i + num - (i + num - n) = m = n + 1 print "################################" print i print num print n print m print "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$" if self.two_array[m][i + 
num] != 0: v_dict = self.two_array[m][i + num]['key_probability'] for hk in h_dict: for vk in v_dict: # first we got the rule_root of vk+hk, and pick the one with bigger value krl = self.get_key_from_value(hk + ' ' + vk) if krl[0] == 'unk': continue child_max_probability = -1 child_rule = '' child_key = '' child_left_position_x = None child_left_position_y = None child_right_position_x = None child_right_position_y = None for k in krl: if (k + ' ' + hk + ' ' + vk) in self.lg.count_dict: if float(self.lg.count_dict[ k + ' ' + hk + ' ' + vk] ) > child_max_probability: child_max_probability = self.lg.count_dict[ k + ' ' + hk + ' ' + vk] child_rule = k + ' ' + hk + ' ' + vk child_key = k child_left_position_x = i child_left_position_y = n child_right_position_x = m child_right_position_y = i + num pro = float( (child_max_probability)) * float( (h_dict[hk])) * float((v_dict[vk])) self.two_array[i][ i + num]['key_probability'][k] = pro if pro > max_probability: max_probability = pro backpointer = child_rule left_position_x = child_left_position_x left_position_y = child_left_position_y right_position_x = child_right_position_x right_position_y = child_right_position_y self.two_array[i][i + num]['_max_probability'] = max_probability self.two_array[i][i + num]['_backpointer'] = backpointer self.two_array[i][i + num]['_left_position_x'] = left_position_x self.two_array[i][i + num]['_left_position_y'] = left_position_y self.two_array[i][i + num]['_right_position_x'] = right_position_x self.two_array[i][i + num]['_right_position_y'] = right_position_y
# Report the most frequent rule, then turn the rule counts in ``grammar_dict``
# (keys are "LHS RHS..." strings) into log10 relative frequencies.
print(' '.join(str(part) for part in (
    "Most Frequent Rule is:-", items, "and the count is=",
    grammar_dict[items])))
keys = grammar_dict.keys()


def getDenominator(each, keys):
    """Return the total count of all rules in ``keys`` that share ``each``'s left-hand side.

    The left-hand side is the text before the first space of a rule key.
    Counts are read from ``grammar_dict``.  (This used to return the number
    of such rules rather than the sum of their counts, which is not a valid
    normaliser for a rule probability.)
    """
    lhs = each.split(' ', 1)[0]
    return sum(grammar_dict[key] for key in keys
               if key.split(' ', 1)[0] == lhs)


# Accumulate the per-left-hand-side totals in a single pass; calling
# getDenominator for every rule would rescan all keys each time.
lhs_totals = {}
for each in keys:
    lhs = each.split(' ', 1)[0]
    lhs_totals[lhs] = lhs_totals.get(lhs, 0) + grammar_dict[each]

prob_dict = {}
for each in keys:
    num = grammar_dict[each]
    den = lhs_totals[each.split(' ', 1)[0]]
    # Convert before dividing: ``num / den`` on two ints truncates under
    # Python 2, which collapsed most probabilities to 0.
    prob_dict[each] = log10(bigfloat(float(num) / den))
print('################################################')