import random

def test_viterbi():
    site = [0, 1, 2, 3, 0, 1, 2, 3]
    background = lambda n: [random.choice(range(4)) for i in range(n)]
    obs = (site + background(1000) + site + background(1000) + site)
    states = [0, 1, 2, 3, 4, 5, 6, 7, 8]  # 0 is off, 1-8 are bs positions
    start_p = [0.99, 0.01, 0, 0, 0, 0, 0, 0, 0]
    trans_p = [[0.99, 0.01, 0, 0, 0, 0, 0, 0, 0],
               [0, 0, 1, 0, 0, 0, 0, 0, 0],
               [0, 0, 0, 1, 0, 0, 0, 0, 0],
               [0, 0, 0, 0, 1, 0, 0, 0, 0],
               [0, 0, 0, 0, 0, 1, 0, 0, 0],
               [0, 0, 0, 0, 0, 0, 1, 0, 0],
               [0, 0, 0, 0, 0, 0, 0, 1, 0],
               [0, 0, 0, 0, 0, 0, 0, 0, 1],
               [0.99, 0.01, 0, 0, 0, 0, 0, 0, 0],
               ]
    emit_p = [[0.25, 0.25, 0.25, 0.25],
              [1, 0, 0, 0],
              [0, 1, 0, 0],
              [0, 0, 1, 0],
              [0, 0, 0, 1],
              [1, 0, 0, 0],
              [0, 1, 0, 0],
              [0, 0, 1, 0],
              [0, 0, 0, 1],
              ]
    return viterbi(obs, states, start_p, trans_p, emit_p)
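# None of the snippets in this section define viterbi() themselves; each
# project ships its own. For orientation, here is a minimal sketch compatible
# with the (obs, states, start_p, trans_p, emit_p) -> (prob, path) convention
# used above, assuming states are the integers 0..N-1 as in test_viterbi.
# Real implementations should work in log space: raw probabilities underflow
# to 0.0 on sequences as long as the 1000-symbol backgrounds generated above.
def viterbi_sketch(obs, states, start_p, trans_p, emit_p):
    # V[t][s]: probability of the best state sequence that emits obs[:t+1]
    # and ends in state s
    V = [[start_p[s] * emit_p[s][obs[0]] for s in states]]
    back = []
    for ob in obs[1:]:
        prev, row, ptrs = V[-1], [], []
        for s in states:
            best = max(states, key=lambda r: prev[r] * trans_p[r][s])
            row.append(prev[best] * trans_p[best][s] * emit_p[s][ob])
            ptrs.append(best)
        V.append(row)
        back.append(ptrs)
    last = max(states, key=lambda s: V[-1][s])
    path = [last]
    for ptrs in reversed(back):
        path.append(ptrs[path[-1]])  # follow backpointers to the front
    path.reverse()
    return V[-1][last], path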
def tagger(testFile, taggedFile, n):
    dicOfWordTag, dicOfTag, dicOfTags = read_corpus("corpus/CoNLL2009-ST-English-train-pos.txt")
    f = open(testFile, 'r')
    fw = open(taggedFile, 'w')
    totalNumber = 0
    noSolution = 0
    words = []
    lines = []
    for line in f:
        line = line.strip()
        if line == '':
            # sentences longer than n tokens are echoed without tags
            if len(words) > n:
                for row in lines:
                    for item in row:
                        print >> fw, item,
                words = []
                lines = []
                continue
            prob, path = viterbi(words, dicOfWordTag, dicOfTag, dicOfTags)
            if prob == 0:
                #print >> fw, "no solution"
                noSolution += 1
                for row in lines:
                    for item in row:
                        print >> fw, item,
                    print >> fw
                print >> fw
                words = []
                lines = []
                continue
            i = 0
            while i < len(path):
                for item in lines[i]:
                    print >> fw, item,
                # the predicted tag fills both POS columns of the output
                print >> fw, path[i], path[i]
                i += 1
            print >> fw
            words = []
            lines = []
        else:
            line = line.split()
            words.append(line[1])
            lines.append(line)
            totalNumber += 1
    f.close()
    fw.close()
    # NB: noSolution counts sentences while totalNumber counts tokens,
    # so this is only a rough solution rate
    print 1 - float(noSolution) / totalNumber
def main(): """main function """ n = 2 # Bigram HMM args = parse_arguments() treebank = TaggedCorpusReader( os.path.split(args.train_f)[0], os.path.split(args.train_f)[1]) observation_space = [item[0] for item in treebank.sents()] # all words state_space = [item[1] for item in treebank.sents()] # all pos tags words = dict.fromkeys(observation_space) tags = dict.fromkeys(state_space) # HMM parameter estimation- initial, transition and emission probablity start = time.time() init_p = [item[1] for item in comp_initial(tags, treebank)] trans_p = comp_transition(n, tags, state_space) emission_p = comp_emission(words, tags, state_space, treebank, smoothing=args.smoothing) end = time.time() print("Runtime (training): %.3f s" % (end - start)) # Test your HMM-trained model treebank = TaggedCorpusReader( os.path.split(args.eval_f)[0], os.path.split(args.eval_f)[1]) viterbi_tags = [] start = time.time() for sentence in treebank.paras(): test_words = [item[0] for item in sentence] O, S, Y, pi, A, B = pre_process(words, tags, test_words, init_p, trans_p, emission_p) # Computes Viterbi's most likely tags if args.log_prob: X = viterbi_log(O, S, Y, pi, A, B) else: X = viterbi(O, S, Y, pi, A, B) viterbi_tags.append(X) end = time.time() print("Runtime (viterbi): %.3f s" % (end - start)) output_path = "./" + "de-tagger.tt" post_processing(viterbi_tags, args.test_f, output_path)
def evaluate(testFile, n):
    dicOfWordTag, dicOfTag, dicOfTags = read_corpus("corpus/CoNLL2009-ST-English-train-pos.txt")
    f = open(testFile, 'r')
    rightNumber = 0
    denominator = 0
    totalNumber = 0
    noSolution = 0
    words = []
    right = []
    for line in f:
        line = line.strip()
        if line == '':
            if len(words) > n:
                #print "length> ", n
                words = []
                right = []
                continue
            prob, path = viterbi(words, dicOfWordTag, dicOfTag, dicOfTags)
            if prob == 0:
                words = []
                right = []
                noSolution += 1
                continue
            i = 0
            while i < len(path):
                if path[i] == right[i]:
                    rightNumber += 1
                denominator += 1
                i += 1
            words = []
            right = []
        else:
            line = line.split()
            words.append(line[1])
            right.append(line[4])
            totalNumber += 1
    f.close()
    precision = float(rightNumber) / denominator
    # NB: denominator / totalNumber is the fraction of tokens actually decoded,
    # so this "recall" is really the fraction of tokens that were skipped
    recall = 1 - float(denominator) / totalNumber
    return precision, recall
def __cut(sen):
    prob, pos_list = viterbi(sen, status, start_P, trans_P, emit_P, end_status='ES')
    flag = 0
    for num, pos in enumerate(pos_list):
        # 'E' closes a multi-character word, 'S' is a single-character word;
        # either way the span sen[flag:num+1] is a complete word
        if pos in ('E', 'S'):
            word = sen[flag:num + 1]
            flag = num + 1
            yield word
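# __cut appears to assume the standard B/M/E/S character-tagging scheme for
# Chinese word segmentation (Begin / Middle / End / Single): a word is emitted
# whenever an 'E' or 'S' tag closes a span. An illustrative trace with a
# hand-written tag list (not a real viterbi output):
sen = '我爱北京'
pos_list = ['S', 'S', 'B', 'E']
# num=0, 'S' -> yields sen[0:1] == '我'
# num=1, 'S' -> yields sen[1:2] == '爱'
# num=2, 'B' -> no word yet, the span stays open
# num=3, 'E' -> yields sen[2:4] == '北京'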
import sys

def main():
    outFileName = sys.argv[1]
    n = int(sys.argv[2])
    outFile = open(outFileName, 'w')
    data = genDieNumber(n)
    prediction = viterbi(data[0])
    # write the rolls, the true die and the Viterbi decoding in blocks of 50
    for i in range(0, len(prediction) // 50):
        outFile.write("Rolls ")
        for j in range(0, 50):
            outFile.write(str(data[0][i * 50 + j]))  # index into the i-th block
        outFile.write("\n")
        outFile.write("Die ")
        for j in range(0, 50):
            outFile.write(str(data[1][i * 50 + j]))
        outFile.write("\n")
        outFile.write("Viterbi ")
        for j in range(0, 50):
            outFile.write(str(prediction[i * 50 + j]))
        outFile.write("\n")
    outFile.close()
def test(trainfile, testfile):
    (word_dict, pos_dict, tag_list, emission_probabilities,
     transition_probabilities, prefixp, suffixp) = learnFromTraining(trainfile)
    f = open(testfile, "r")
    unknown = count_unknown(f)
    f = open(testfile, "r")  # reopen: count_unknown consumed the file
    words = []
    for line in f:
        words.append(line.rstrip())
    pos = []
    new_sentence = []
    for i in range(0, len(words)):
        if words[i] == '' or words[i] == '\n':
            v, p = viterbi(new_sentence, emission_probabilities,
                           transition_probabilities, pos_dict,
                           prefixp, suffixp, unknown)
            tags = bestPath(new_sentence, v, p, transition_probabilities.keys())
            # drop the start tag; the trailing '' mirrors the blank
            # separator line between sentences
            tags.append('')
            tags = tags[1:]
            pos.extend(tags)
            new_sentence = []
        else:
            new_sentence.append(words[i])
    return pos, words
import math

def evaluate():
    seqLen = []
    avg_accuracy = []
    avg_MCC = []
    for n in range(1000, 10100, 100):
        accuracy = []
        MCC = []
        # average the metrics over 10 random sequences of length n
        for trial in range(0, 10):
            data = genDieNumber(n)
            true_result = data[1]
            prediction = viterbi(data[0])
            TP = 0
            TN = 0
            FP = 0
            FN = 0
            for i in range(0, len(data[0])):
                if true_result[i] == "F":
                    if prediction[i] == "F":
                        TP += 1
                    else:
                        FN += 1
                else:
                    if prediction[i] == "F":
                        FP += 1
                    else:
                        TN += 1
            accuracy.append((TP + TN) / (TP + TN + FP + FN))
            # standard MCC denominator: (TP+FP)(TP+FN)(TN+FP)(TN+FN)
            denom = (TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)
            if denom != 0:
                MCC.append((TP * TN - FP * FN) / math.sqrt(denom))
            else:
                MCC.append((TP * TN - FP * FN) / 0.000001)  # avoid division by zero
        seqLen.append(n)
        avg_accuracy.append(sum(accuracy) / len(accuracy))
        avg_MCC.append(sum(MCC) / len(MCC))
    return [seqLen, avg_accuracy, avg_MCC]
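# Worked example of the Matthews correlation coefficient computed above,
# MCC = (TP*TN - FP*FN) / sqrt((TP+FP)(TP+FN)(TN+FP)(TN+FN)), on a toy
# confusion matrix (the numbers are illustrative, not from this script):
import math

TP, TN, FP, FN = 8, 7, 2, 3
num = TP * TN - FP * FN                  # 8*7 - 2*3 = 50
den = math.sqrt((TP + FP) * (TP + FN) *
                (TN + FP) * (TN + FN))   # sqrt(10*11*9*10) ~ 99.5
print(num / den)                         # ~ 0.5025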
import random

def baum_welch(obs, L):
    """Given sequence and bs length L, approximate MLE parameters for
    emission probabilities, transition rate a01 (background->site).
    NB: decoding with Viterbi makes this hard-EM (Viterbi training)
    rather than true Baum-Welch, which uses forward-backward expectations.
    TODO: non-uniform background frequencies"""
    states = range(L + 1)
    a01 = random.random()
    start_p = make_start_p(a01)
    trans_p = make_trans_p(a01)
    emit_p = [simplex_sample(4) for state in states]
    hidden_states = [random.choice(states) for ob in obs]
    iterations = 0
    while True:
        # compute hidden states, given probs
        prob, hidden_states_new = viterbi(obs, states, start_p, trans_p, emit_p)
        # compute probs, given hidden states; first compute a01
        a01_new = estimate_a01(hidden_states_new)
        start_p_new = make_start_p(a01_new)
        trans_p_new = make_trans_p(a01_new)
        emit_p_new = estimate_emit_p(obs, hidden_states_new, states)
        # stop when the parameters and the decoding are both fixed points
        if (start_p_new == start_p and trans_p_new == trans_p
                and emit_p_new == emit_p and hidden_states_new == hidden_states):
            break
        else:
            print iterations, a01, l2(start_p, start_p_new),
            print l2(concat(trans_p), concat(trans_p_new)),
            print l2(hidden_states, hidden_states_new)
            a01 = a01_new
            start_p = start_p_new
            trans_p = trans_p_new
            emit_p = emit_p_new
            hidden_states = hidden_states_new
            iterations += 1
    return start_p, trans_p, emit_p, hidden_states
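# The helpers called by baum_welch (make_start_p, make_trans_p, estimate_a01)
# are defined elsewhere in that project and not shown here. Plausible sketches,
# inferred from the HMM layout in test_viterbi (state 0 = background, states
# 1..L march through the binding site); the originals presumably take a01 only
# and pick up L from enclosing scope, so these may differ:
def make_start_p(a01, L=8):
    return [1 - a01, a01] + [0] * (L - 1)

def make_trans_p(a01, L=8):
    trans_p = [[0] * (L + 1) for _ in range(L + 1)]
    trans_p[0][0], trans_p[0][1] = 1 - a01, a01  # stay in background or enter site
    for k in range(1, L):
        trans_p[k][k + 1] = 1                    # advance through the site
    trans_p[L][0], trans_p[L][1] = 1 - a01, a01  # leave the site like background
    return trans_p

def estimate_a01(hidden_states):
    # fraction of background positions followed by a site start
    from_bg = [(s, t) for s, t in zip(hidden_states, hidden_states[1:]) if s == 0]
    if not from_bg:
        return 0.0
    return sum(1 for s, t in from_bg if t == 1) / float(len(from_bg))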
from collections import defaultdict

def memm(classifier, test_file, all_pos, start, transition, w):
    test = open(test_file, "r")
    tokens = []
    index = []
    pos = []
    counter = 0  # used for validation only
    correct = 0
    seen = set()
    greedy_predictions = []
    memm_predictions = []
    memm_lex = defaultdict(lambda: defaultdict(float))
    # the input alternates token / POS / index lines, three per sentence
    for line in test:
        predictions = []
        if counter % 3 == 0:
            tokens = line.split()
            for t in tokens:
                seen.add(t)
            #print ("tokens: " + "".join(tokens))
        elif counter % 3 == 1:
            pos = line.split()
        else:
            index = line.split()
            for i in range(len(tokens)):
                feats = {}  # feature dict for this token (avoids shadowing dict)
                # feature 1 = is this token capitalized
                if tokens[i][0].isupper():
                    feats["caps"] = 1
                else:
                    feats["caps"] = 0
                # feature 2 = this token's POS
                feats["pos"] = pos[i]
                # feature 3 = previous BIO prediction
                if len(predictions) == 0:
                    feats["prevBIO"] = "<s>"
                else:
                    feats["prevBIO"] = predictions[-1]
                # feature 4 = does prev word start in caps
                if i == 0:
                    feats["prevCaps"] = 0
                else:
                    if tokens[i - 1][0].isupper():
                        feats["prevCaps"] = 1
                    else:
                        feats["prevCaps"] = 0
                # feature 5 = previous pos
                # if pos[i - 1] == "NNP":
                #     feats["prevNNP"] = 1
                # else:
                #     feats["prevNNP"] = 0
                if i == 0:
                    feats["prevPOS"] = "<s>"
                else:
                    feats["prevPOS"] = pos[i - 1]
                # feature 6 = previous word
                if i == 0:
                    feats["prevWord"] = "<s>"
                else:
                    feats["prevWord"] = tokens[i - 1]
                # feature 7 = next word
                if i == len(tokens) - 1:
                    feats["nextWord"] = "</s>"
                else:
                    feats["nextWord"] = tokens[i + 1]
                # feature 8 = next word pos
                if i == len(tokens) - 1:
                    feats["nextPOS"] = "</s>"
                else:
                    feats["nextPOS"] = pos[i + 1]
                # feature 9 = does this word appear in training; w[word] = tag
                feats["training"] = "nan"
                if tokens[i] in w:
                    feats["training"] = w[tokens[i]]
                probs = classifier.prob_classify(feats)
                # greedy predictions
                maxscore = 0
                best_tag = ""
                for tag in TAGS:
                    score = probs.prob(tag)
                    memm_lex[tag][tokens[i]] = score
                    if score > maxscore:
                        maxscore = score
                        best_tag = tag
                predictions.append(best_tag)
            memm_pre = viterbi(tokens, start, transition, memm_lex, seen)
            memm_predictions.append(memm_pre)
            greedy_predictions.append(predictions)
        counter += 1
    test.close()
    return greedy_predictions, memm_predictions
def execute():
    # If there is no sentence input, asks for a sentence input
    if len(sentence_input.get()) == 0:
        canvas.delete("all")
        canvas.create_text(375, 50, text="Please Enter a Sentence")
    # If there is an illegal word in the input, tells the user
    elif len(sentence_in_corpus(word_tokenize(sentence_input.get()))) > 0:
        canvas.delete("all")
        canvas.create_text(375, 50, text="Found an Illegal Word")
    # If the user has selected "Forward Algorithm", it shows the results of running the Forward
    # Algorithm on the entire sentence with only the most likely part of speech shown
    elif v.get() == "1":
        canvas.delete("all")
        new_sentence = word_tokenize(sentence_input.get())
        non_normalized_result = forward(set(pos_tags), new_sentence,
                                        transition_matrix, emission_matrix)
        result = normalize(non_normalized_result)
        top_results = {}
        for i in range(len(new_sentence)):
            highest = 0.0
            best_pos = None
            for pos in set(pos_tags):
                if result[i][pos] >= highest:
                    best_pos = pos
                    highest = result[i][pos]
            top_results[new_sentence[i]] = best_pos, highest
        new_len = 100 * len(result[0])
        new_height = 60 * len(result) + 100
        if new_len > 750 or new_height > 750:
            canvas.config(height=new_height, width=new_len)
        for i in range(len(result)):
            canvas.create_rectangle(10 + 100 * i, 25, 100 + 100 * i, 75, fill="white")
            canvas.create_text(55 + 100 * i, 50, text=new_sentence[i])
            canvas.create_rectangle(10 + 100 * i, 90, 100 + 100 * i, 135, fill="lightblue")
            val = top_results[new_sentence[i]][0] + ": " + "{0:.4f}".format(
                top_results[new_sentence[i]][1])
            canvas.create_text(55 + 100 * i, 112, text=val)
    # If the user has selected "Viterbi Algorithm", it shows the results of running the Viterbi
    # Algorithm on the entire sentence
    elif v.get() == "2":
        canvas.delete("all")
        new_sentence = word_tokenize(sentence_input.get())
        result = viterbi(set(pos_tags), new_sentence, transition_matrix, emission_matrix)
        new_len = 100 * len(result["predicted_tags"])
        new_height = 750
        if new_len > 750 or new_height > 750:
            canvas.config(height=new_height, width=new_len)
        for i in range(len(new_sentence)):
            canvas.create_rectangle(10 + 100 * i, 25, 100 + 100 * i, 75, fill="white")
            canvas.create_text(55 + 100 * i, 50, text=new_sentence[i])
            canvas.create_rectangle(10 + 100 * i, 90, 100 + 100 * i, 135, fill="lightblue")
            canvas.create_text(55 + 100 * i, 112, text=result["predicted_tags"][i + 1])
    # If the user has selected "Forward with All Parts of Speech", then it shows a trellis diagram for
    # all the parts of speech
    elif v.get() == "3":
        canvas.delete("all")
        new_sentence = word_tokenize(sentence_input.get())
        non_normalized_result = forward(set(pos_tags), new_sentence,
                                        transition_matrix, emission_matrix)
        result = normalize(non_normalized_result)
        print(result)
        new_len = 100 * len(result)
        new_height = 60 * len(result[0]) + 100
        if new_len > 750 or new_height > 750:
            canvas.config(height=new_height, width=new_len)
        for i in range(len(result)):
            canvas.create_rectangle(10 + 100 * i, 25, 100 + 100 * i, 75, fill="white")
            canvas.create_text(55 + 100 * i, 50, text=new_sentence[i])
            j = 0
            for pos in result[i]:
                val = str(pos) + ": " + "{0:.3f}".format(result[i][pos])
                canvas.create_rectangle(10 + 100 * i, 90 + 50 * j,
                                        100 + 100 * i, 135 + 50 * j, fill="lightblue")
                canvas.create_text(55 + 100 * i, 112 + 50 * j, text=val)
                j += 1
    # If the user has selected "Show All Progressions for Forward", it shows the step progression
    # after each word is added
    elif v.get() == "4":
        canvas.delete("all")
        new_sentence = word_tokenize(sentence_input.get())
        for i in range(1, len(new_sentence) + 1):
            non_normalized_result = forward(set(pos_tags), new_sentence[:i],
                                            transition_matrix, emission_matrix)
            result = normalize(non_normalized_result)
            top_results = {}
            for j in range(len(new_sentence[:i])):
                highest = 0.0
                best_pos = None
                for pos in set(pos_tags):
                    if result[j][pos] >= highest:
                        best_pos = pos
                        highest = result[j][pos]
                top_results[new_sentence[j]] = best_pos, highest
            new_len = 100 * len(result[0])
            new_height = 60 * len(result) + 100
            if new_len > 750 or new_height > 750:
                canvas.config(height=new_height, width=new_len)
            for k in range(len(result)):
                canvas.create_rectangle(10 + 100 * k, 25, 100 + 100 * k, 75, fill="white")
                canvas.create_text(55 + 100 * k, 50, text=new_sentence[k])
                canvas.create_rectangle(10 + 100 * k, 30 + 60 * i,
                                        100 + 100 * k, 75 + 60 * i, fill="lightblue")
                val = top_results[new_sentence[k]][0] + ": " + "{0:.4f}".format(
                    top_results[new_sentence[k]][1])
                canvas.create_text(55 + 100 * k, 52 + 60 * i, text=val)
    # If the user has selected "Show All Progressions for Viterbi", it shows the step progression
    # after each word is added to the algorithm
    elif v.get() == "5":
        canvas.delete("all")
        new_sentence = word_tokenize(sentence_input.get())
        for i in range(1, len(new_sentence) + 1):
            result = viterbi(set(pos_tags), new_sentence[:i],
                             transition_matrix, emission_matrix)
            new_len = 100 * len(result["predicted_tags"])
            new_height = 750
            if new_len > 750 or new_height > 750:
                canvas.config(height=new_len, width=new_len)
            for j in range(len(new_sentence[:i])):
                canvas.create_rectangle(10 + 100 * j, 25, 100 + 100 * j, 75, fill="white")
                canvas.create_text(55 + 100 * j, 50, text=new_sentence[j])
                canvas.create_rectangle(10 + 100 * j, 30 + 60 * i,
                                        100 + 100 * j, 75 + 60 * i, fill="lightblue")
                canvas.create_text(55 + 100 * j, 52 + 60 * i,
                                   text=result["predicted_tags"][j + 1])
print(sum(B[1, :]))

# ## Question 4

# In[12]:

tags_true = []
tags_pred = []
scores = []
for sent in tqdm(testing):
    word_list = to_ids(V, [word for word, _ in sent])
    tag_list = to_ids(Q, [tag for _, tag in sent])
    tags_true.append(tag_list)
    predicted, score = viterbi((Pi, A, B), word_list)
    tags_pred.append(predicted)
    scores.append(score)

# In[13]:

predicted_set = set(flatten(tags_pred))
reference_set = set(flatten(tags_true))
print('Precision :', precision(predicted_set, reference_set))
print('Recall :', recall(predicted_set, reference_set))
print('F1-score :', f_measure(predicted_set, reference_set))

# ## Question 5

# We only keep the tag pairs that appear at least once in the whole set.
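# In[ ]:

# One possible reading of that filtering step (the notebook's actual cell is
# not shown): count (reference, predicted) tag pairs over the test set and
# keep only the pairs observed at least once, e.g. to restrict a confusion
# matrix. `flatten`, `tags_true` and `tags_pred` are reused from above.
from collections import Counter

pair_counts = Counter(zip(flatten(tags_true), flatten(tags_pred)))
kept_pairs = [pair for pair, count in pair_counts.items() if count >= 1]
print(len(kept_pairs), 'tag pairs kept')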
f.close()

if function == 'viterbi':
    f = open(outfile, 'w')
    print("generating tables..")
    EMISSION = emissionTable(train_X, train_Y, test_X)
    print("emission done")
    TRANSITION = transitionTable(train_Y)
    print("transition done")
    unique_tags = getUniqueY(train_Y)
    print("unique tags gotten from text")
    print("All pre-requisites done, now running viterbi")
    for i in range(0, len(test_X)):
        # print(test_X[i])
        print("Writing one sentence, " + str(len(test_X) - i) + " to go.")
        viterbi_sentence = viterbi(test_X[i], len(test_X[i]), TRANSITION,
                                   EMISSION, unique_tags)
        for j in range(0, len(test_X[i])):
            towrite = str(test_X[i][j]) + " " + str(viterbi_sentence[j])
            f.write(towrite + '\n')
        f.write('\n')
    f.close()

if function == 'viterbi_topk':
    f = open(outfile, 'w')
    print("generating tables..")
    EMISSION = emissionTable(train_X, train_Y, test_X)
    print("emission done")
    TRANSITION = transitionTable(train_Y)
    print("transition done")
    unique_tags = getUniqueY(train_Y)
    print("unique tags gotten from text")
import os
import numpy as np

def main(
        param=0.2,
        PATH_LOAD_FILE='/home/keums/Melody/dataset/adc2004_full_set/file/pop4.wav',
        PATH_SAVE_FILE='./SAVE_RESULTS/pop4.txt'):
    # PATH_LOAD_FILE = sys.argv[1]
    # PATH_SAVE_FILE = sys.argv[2]

    #==================================
    # Feature Extraction
    # .wav --> spectrogram
    #==================================
    x_test_log = myFeatureExtraction(PATH_LOAD_FILE)

    #==================================
    # making multi-frame spectrogram
    # for training
    #==================================
    x_test_SF = making_multi_frame(x_test_log, num_frames=1)
    x_test_MF = making_multi_frame(x_test_log, num_frames=11)

    select_res_1st = 1
    select_res_2nd = 2
    select_res_3rd = 4
    pitch_range = np.arange(min_pitch, max_pitch + 1.0 / select_res_3rd,
                            1.0 / select_res_3rd)

    #==================================
    # Melody extraction using DNN
    #==================================
    y_predict_1st = MelodyExtraction_SCDNN(x_test_MF, select_res_1st)
    y_predict_2nd = MelodyExtraction_SCDNN(x_test_MF, select_res_2nd)
    y_predict_3rd = MelodyExtraction_SCDNN(x_test_MF, select_res_3rd)

    #==================================
    # merge SCDNN outputs
    #==================================
    # print 'Merging....'
    ratio_res_1_3 = select_res_3rd / select_res_1st
    ratio_res_2_3 = select_res_3rd / select_res_2nd

    # upsample the coarser resolutions onto the finest pitch grid
    y_predict_tmp_1_3 = np.zeros(y_predict_3rd.shape)
    y_predict_tmp_2_3 = np.zeros(y_predict_3rd.shape)
    for i in range(y_predict_3rd.shape[0]):
        for j in range(y_predict_1st.shape[1] - 1):
            y_predict_tmp_1_3[i, j * ratio_res_1_3:
                              j * ratio_res_1_3 + ratio_res_1_3] = y_predict_1st[i, j]
        y_predict_tmp_1_3[i, -1] = y_predict_1st[i, -1]
    for i in range(y_predict_3rd.shape[0]):
        for j in range(y_predict_2nd.shape[1] - 1):
            y_predict_tmp_2_3[i, j * ratio_res_2_3:
                              j * ratio_res_2_3 + ratio_res_2_3] = y_predict_2nd[i, j]
        y_predict_tmp_2_3[i, -1] = y_predict_2nd[i, -1]

    # combine the three networks by multiplying their posteriors in log space
    # y_predict = (y_predict_tmp_1_3+0.0000001) * (y_predict_tmp_2_3+0.0000001) * (y_predict_3rd+0.0000001)
    y_predict = 10**(np.log10(y_predict_tmp_1_3) + np.log10(y_predict_tmp_2_3) +
                     np.log10(y_predict_3rd))
    del y_predict_tmp_1_3
    del y_predict_tmp_2_3

    #==================================
    # singing voice detection
    #==================================
    # NB: hardcodes 0.2 rather than passing the `param` argument through
    voice_frame_vad = VAD_DNN(x_test_SF, y_predict_1st, param=0.2)

    #==================================
    # viterbi algorithm
    #==================================
    path_viterbi = './viterbi/'
    path_prior_matrix_file = path_viterbi + 'prior_' + str(select_res_3rd) + '.npy'
    path_transition_matrix_file = path_viterbi + 'transition_matrix_' + str(select_res_3rd) + '.npy'
    prior = np.load(path_prior_matrix_file)
    transition_matrix = np.load(path_transition_matrix_file)

    viterbi_path = viterbi(y_predict,
                           transition_matrix=transition_matrix,
                           prior=prior,
                           penalty=0,
                           scaled=True)

    pitch_MIDI = np.zeros([y_predict.shape[0], 1])
    pitch_freq = np.zeros([y_predict.shape[0], 1])
    for i in range(y_predict.shape[0]):
        # original (no Viterbi smoothing):
        # index_predict[i] = np.argmax(y_predict[i, :])
        # pitch_MIDI[i] = pitch_range[index_predict[i]]
        pitch_MIDI[i] = pitch_range[viterbi_path[i]]
        pitch_freq[i] = 2**((pitch_MIDI[i] - 69) / 12.) * 440
    est_pitch = np.multiply(pitch_freq, voice_frame_vad)

    #==================================
    # adjust frame offset
    #==================================
    idx_shift = 2
    shift_array = np.zeros(idx_shift)
    est_pitch = np.append(shift_array, est_pitch[:-idx_shift])

    #==================================
    # save result
    #==================================
    PATH_est_pitch = PATH_SAVE_FILE
    if not os.path.exists(os.path.dirname(PATH_est_pitch)):
        os.makedirs(os.path.dirname(PATH_est_pitch))
    f = open(PATH_est_pitch, 'w')
    for j in range(len(est_pitch)):
        est = "%f\t%f\n" % (0.01 * j, est_pitch[j])
        f.write(est)
    f.close()
    print PATH_est_pitch
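# Sanity check of the MIDI-to-frequency mapping used above: equal temperament
# with A4 = MIDI note 69 = 440 Hz, i.e. f = 440 * 2**((m - 69) / 12).
for midi in (57, 69, 81):
    print midi, 2**((midi - 69) / 12.) * 440  # 220.0, 440.0, 880.0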
    # (fragment: the tail of a generate_rolls() definition; choose the
    # starting die, then roll 60 times, possibly switching dice between rolls)
    if random.random() < P[1]:
        state = 1
    for i in range(60):
        switched_state = (state + 1) % 2
        if i > 0 and random.random() < Tm[state][switched_state]:
            state = switched_state
        lines[0] += str(random.choices(list(range(1, 7)), Em[state])[0])
        lines[1] += 'F' if state == 0 else 'L'
    return lines


probabilities_file, rolls_file = parse_args()
P, Tm, Em = parse_probabilities(probabilities_file)
if rolls_file is None:
    lines = generate_rolls()
else:
    with open(rolls_file) as fin:
        lines = fin.read().splitlines()
print('Rolls: ' + lines[0])
print('Die: ' + lines[1])

observations = [int(i) for i in lines[0]]
viter = viterbi(S, P, observations, Tm, Em)
forw_back = forward_backward(observations, S, P, Tm, Em)
# percentage of positions where a decoding agrees with the true die sequence
percent = lambda obs: str(
    round(
        sum([obs[i] == lines[1][i] for i in range(len(observations))])
        / len(observations) * 100, 2))
print('Viterbi: ' + ''.join(viter) + ' (' + percent(viter) + '%)')
print('Posterior: ' + ''.join(forw_back) + ' (' + percent(forw_back) + '%)')