f_in.close() return tbl if __name__ == "__main__": prior_tbl = gen_prior(sys.argv[1]) test = open(sys.argv[2], 'r') output = open(sys.argv[3], 'w') for line in test: line=line.strip('\n') if len(line) == 0: output.write('\n') continue alias = line if alias not in prior_tbl: #using get_types to assign a new value for rare words(in this case it is '_RARE_') alias = get_type(alias, '') val = float('-inf') tag = '' cand_set = prior_tbl[alias] #find the tag with maximal emission probability for cand in cand_set: tmp_max = cand_set[cand] if tmp_max > val: val = tmp_max tag = cand output.write(line + ' ' + tag + ' ' + str(val) + '\n') test.close() output.close()
def viterbi(x, S, q, e, opt_type):
    """Tag the sentence ``x`` with a trigram Viterbi search.

    Scores are combined by addition, so ``q`` and ``e`` are presumably
    log-probabilities (TODO confirm against the code that builds them).

    Args:
        x: Sequence of word strings (indexed by position).
        S: Re-iterable collection of tag strings.  It must contain ``'*'``
            for any path to exist, because the search starts from the
            state ``('*', '*')``.
        q: Mapping from ``(w, u, v)`` tag triples to transition scores,
            including ``(u, v, 'STOP')`` entries.  Missing triples count
            as ``-inf``.
        e: Mapping ``word -> {tag: emission score}``.  Missing tags count
            as ``-inf``.
        opt_type: Forwarded to ``get_type`` when a word is not a key of
            ``e``; the returned class name is looked up in ``e`` instead.

    Returns:
        One ``"word tag score\\n"`` line per word, concatenated.  For every
        word but the last, ``score`` is the best path score up to and
        including that word; for the last word it also includes the
        transition to ``'STOP'``.  An empty sentence yields ``''``.

    Raises:
        KeyError: If no tag sequence has a score above ``-inf``, or if a
            (possibly replaced) word is not a key of ``e``.
    """
    n = len(x)
    if n == 0:
        # Nothing to tag; the original code crashed on this input.
        return ''

    neg_inf = float('-inf')

    # pi[k, u, v]: best score of any tag sequence for x[0..k] that ends with
    # tags (u, v).  Absent entries are treated as -inf.
    pi = {(-1, '*', '*'): 0}
    # back[k, u, v]: the tag w (two positions before v) that achieved
    # pi[k, u, v].  Storing one pointer per cell replaces copying the whole
    # path into every cell.
    back = {}

    for k in range(n):
        x_k = x[k]
        if x_k not in e:
            # Replace words without emission entries by their class.
            x_k = get_type(x_k, opt_type)
        emissions = e[x_k]

        for u in S:
            for v in S:
                e_xv = emissions[v] if v in emissions else neg_inf

                # argmax over w; strict '>' keeps the first w on ties.
                best_w = None
                best_prob = neg_inf
                for w in S:
                    tmp_prob = (pi.get((k - 1, w, u), neg_inf)
                                + q.get((w, u, v), neg_inf)
                                + e_xv)
                    if tmp_prob > best_prob:
                        best_prob = tmp_prob
                        best_w = w

                if best_w is not None:
                    pi[k, u, v] = best_prob
                    back[k, u, v] = best_w

    # Termination: argmax over (u, v) of pi(n-1, u, v) + q(STOP | u, v).
    cand_u = None
    cand_v = None
    max_prob = neg_inf
    for u in S:
        for v in S:
            tmp_prob = (pi.get((n - 1, u, v), neg_inf)
                        + q.get((u, v, 'STOP'), neg_inf))
            if tmp_prob > max_prob:
                max_prob = tmp_prob
                cand_u = u
                cand_v = v

    if cand_v is None:
        raise KeyError('no tag sequence with a score above -inf')

    # path[j] is the tag at sentence position j - 2, so path[0] and path[1]
    # are the two start symbols and path[k + 2] is the tag of x[k].
    path = [None] * (n + 2)
    path[n + 1] = cand_v
    path[n] = cand_u
    for j in range(n - 1, -1, -1):
        path[j] = back[j, path[j + 1], path[j + 2]]

    lines = []
    for k in range(n - 1):
        score = pi[k, path[k + 1], path[k + 2]]
        lines.append(x[k] + ' ' + path[k + 2] + ' ' + str(score) + '\n')
    lines.append(x[n - 1] + ' ' + cand_v + ' ' + str(max_prob) + '\n')
    return ''.join(lines)
# argument3: switch for the advanced replacement for the rare words
if __name__ == "__main__":
    # Rewrites a "word tag" file so that infrequent words are replaced by the
    # class name returned by get_type():
    #   sys.argv[1]: input file, read twice (count pass, then rewrite pass)
    #   sys.argv[2]: output file
    #   sys.argv[3]: option string forwarded to get_type()
    # Read the option before opening any file so a missing argument does not
    # leave a truncated output file behind.
    opt_type = sys.argv[3]

    # Words seen fewer than this many times are replaced.
    rare_threshold = 5

    # Raw string: '\S' and '\s' are regex escapes, not string escapes.
    pattern = re.compile(r'(\S+)\s(\S+)')

    # Context managers close both files even if get_type() raises.
    with open(sys.argv[1], 'r') as f_in, open(sys.argv[2], 'w') as f_out:
        # Pass 1: count how often each word (first field) occurs.
        tbl = {}
        for line in f_in:
            match = pattern.match(line)
            if match:
                word = match.group(1)
                tbl[word] = tbl.get(word, 0) + 1

        # Pass 2: rewind and emit the file with rare words replaced.
        f_in.seek(0, 0)
        for line in f_in:
            match = pattern.match(line)
            # Check the word frequency and process the replacement.
            if match and tbl[match.group(1)] < rare_threshold:
                f_out.write(get_type(match.group(1), opt_type)
                            + ' ' + match.group(2) + '\n')
            else:
                # Frequent words and non-matching lines are copied verbatim.
                f_out.write(line)