def predict_tag(tags, vocab, A, B):
    """Tag the test corpus with Viterbi decoding and write the result to disk.

    The file named by ``Config.TEST`` is read and preprocessed, decoded with
    ``Viterbi.Viterbi``, and one ``word<TAB>tag`` line per token is written to
    ``Config.TEST_OUT``. A falsy word is written as a blank line.

    Args:
        tags: Tag set handed to the decoder.
        vocab: Vocabulary used for preprocessing and decoding.
        A: Forwarded to the decoder (presumably the transition matrix --
            TODO confirm against Viterbi.Viterbi).
        B: Forwarded to the decoder (presumably the emission matrix --
            TODO confirm against Viterbi.Viterbi).
    """
    # `prep` is the word list with empty lines marked by <n>.
    test_file = Config.TEST
    original, prep = read_preprocess_test_data(vocab, test_file)

    # Decode with the Viterbi algorithm to get the predicted tag sequence.
    decoder = Viterbi.Viterbi(vocab, tags, prep, A, B)
    predicted_tags = decoder.decode()

    # zip() stops at the shorter of the two sequences.
    tagged = list(zip(original, predicted_tags))

    # The context manager closes the file; no explicit close() is needed.
    out_file = Config.TEST_OUT
    with open(out_file, 'w', encoding='utf-8') as out:
        for word, tag in tagged:
            if not word:
                out.write("\n")
            else:
                out.write("{0}\t{1}\n".format(word, tag))
def Decode(self):
    """Run Viterbi decoding over the stored graph.

    Returns:
        The value returned by ``Viterbi.Viterbi`` (also stored on
        ``self.cls``), or an empty list, after printing a message, when
        there are no nodes or no edges.
    """
    # Identity test on purpose: `== None` is evaluated element-wise by
    # array-like objects and the result cannot be used as a truth value.
    if self.Nodes == [] or self.Edges is None:
        print("No graph for decoding")
        return []

    # Cleared first, so a decode that raises does not leave an old result.
    self.cls = []
    # The same Edges object is repeated once per pair of consecutive nodes.
    edge_chain = [self.Edges] * (len(self.Nodes) - 1)
    self.cls = Viterbi.Viterbi(edge_chain, copy.copy(self.Nodes))
    return self.cls
def estMaxSequence(self, filename):
    """Load the test data from ``filename`` and decode it with Viterbi.

    The loaded ``DataSet`` is kept on ``self.dataset`` and the decoded
    sequence is stored on ``self.maxSequence``.
    """
    print("Reading testing data from %s" % (filename))

    # Read in the testing data from the file
    self.dataset = DataSet(filename)
    self.dataset.readFile(200, "test")

    # Run Viterbi on the test output to estimate the most likely sequence
    decoder = Viterbi(self.hmm)
    self.maxSequence = decoder.mostLikelySequence(self.dataset.testOutput)
def Decode(self):
    """Decode the stored graph.

    Returns:
        ``None`` (after printing a message) when no edges are set, ``[]``
        when there are no nodes, the index of the largest entry of the only
        node when there is exactly one node, and otherwise the value
        returned by ``Viterbi.Viterbi`` (also stored on ``self.cls``).
    """
    # Identity test on purpose: `== None` is evaluated element-wise by
    # array-like objects and the result cannot be used as a truth value.
    if self.Edges is None:
        print("No graph for decoding")
        return None
    if self.Nodes == []:
        return []
    if len(self.Nodes) == 1:
        # A single node needs no edges: pick its largest entry directly.
        only_node = self.Nodes[0]
        return only_node.index(max(only_node))

    # Cleared first, so a decode that raises does not leave an old result.
    self.cls = []
    # The same Edges object is repeated once per pair of consecutive nodes.
    edge_chain = [self.Edges] * (len(self.Nodes) - 1)
    self.cls = Viterbi.Viterbi(edge_chain, copy.copy(self.Nodes))
    return self.cls
def estMaxSequence(self, filename):
    """Decode every observation sequence read from ``filename``.

    For each observation sequence returned by ``DataSet.read_file`` the most
    likely sequence found by Viterbi is appended to ``self.maxSequence`` and
    the matching entry of the returned states to ``self.realStates``.
    """
    # Single-argument call form: prints the same text under the Python 2
    # print statement and the Python 3 print function.
    print("Reading testing data from %s" % (filename))

    # Read in the testing data from the file
    dataset = DataSet(filename)
    states, obs = dataset.read_file()

    # Run Viterbi to estimate the most likely sequence for each observation
    viterbi = Viterbi(self.hmm)
    for idx, observation in enumerate(obs):
        self.maxSequence.append(viterbi.mostLikelySequence(observation))
        self.realStates.append(states[idx])
def runNaive(ObsMat, kmer_size, num_state, event_data_test, write_fasta):
    """Decode every test event with ``decodeNaive`` and print accuracies.

    Prints the accuracy of each event and, at the end, the accuracy over all
    positions of all events.

    Args:
        ObsMat: Forwarded to ``Viterbi.Viterbi``.
        kmer_size: Used to build the k-mer map; also forwarded to
            ``Viterbi.Viterbi`` and ``write_to_file``.
        num_state: Forwarded to ``Viterbi.Viterbi``.
        event_data_test: Iterable of events to decode.
        write_fasta: When equal to 1, decoded sequences are passed to
            ``write_to_file``.
    """
    kmer_map, _ = Util.getKmerMap(kmer_size)
    total_acc = 0.0
    T = 0.0
    for event in event_data_test:
        currentSeq, state_label = DataInput.getData_event(event, kmer_map)
        t = len(currentSeq)
        Vit = Viterbi.Viterbi([], ObsMat, num_state, t, kmer_size)
        Y_hat, seq_est = Vit.decodeNaive(currentSeq)
        Y_test = np.array(state_label).reshape(-1, 1)

        # Count the matching positions once and reuse the count.
        correct = float(np.sum(Y_hat == Y_test))
        # An empty sequence has no positions to score; report 0 rather than
        # dividing by zero.
        acc = correct / t if t else 0.0
        total_acc += correct
        T += t
        print("Accuracy = %f" % acc)

        # NOTE(review): the original nesting was lost in extraction; the write
        # is assumed to happen once per event -- confirm against the source.
        if write_fasta == 1:
            write_to_file(seq_est, T, kmer_size)

    # With no scored positions the total stays 0.0 rather than raising.
    if T:
        total_acc /= T
    print("Total Accuracy = %f" % total_acc)
for seq in ["aactgcacatgcggcgcgcccgcgctaat", "gggcgcgggcgccccgcg"]:
    # NB. The book and Lio's notes use an integrated transition and initial
    # distribution matrix (the initial step is a transition from a dummy
    # state 0). That is confusing, so the two are kept separate here.
    # Wikipedia has a non-integrated implementation of the Viterbi algorithm.
    #
    # Every print below is a single-argument call, so it emits the same text
    # under the Python 2 print statement and the Python 3 print function.

    # 1.1. Implement Forward algorithm
    fwd = Forward(TransitionP, EmissionP, InitialP)
    p = fwd.prob(seq)
    print("**************************************")
    print("Probability of %s : %s" % (seq, p))
    # NOTE(review): the value printed is -log(p), i.e. the negative
    # log-probability, although the label says "Log probability" -- confirm
    # which of the two is intended.
    print("Log probability: %s" % (-log(p),))
    print("**************************************")

    # 1.2. Implement Viterbi algorithm
    vtb = Viterbi(TransitionP, EmissionP, InitialP)
    (prob, path) = vtb.maxSeq(seq)
    print("**************************************")
    print("Viterbi path:")
    print("P = %s" % (prob,))
    print(seq)
    print(''.join(str(i) for i in path))
    print("**************************************")

    # 1.3. Length distribution
    # Suppose we have a string of only G-C (with equal emission
    # probabilities for each state).
    # Once the HMM enters state 1 (detect G-C islands), modify the probability
    # of going out of this state to 1/200, and of staying to 199/200. Then on
    # average the HMM will stay in that state for 200 characters.
# Calculate state and path probabilities.
# NOTE(review): this loop reads `self`, so it is the tail of a method whose
# header is outside this view; it is laid out relative to the first visible
# statement -- restore the enclosing indentation when merging.
for i in range(self.len_a):
    # Smoothing to prevent divide by zero error: when a row holds a zero
    # count, every count in that row is incremented by one.
    if 0 in self.paths[i]:
        self.paths[i] = [a + 1 for a in self.paths[i]]
    sum_paths = self.paths[i].sum()

    if 0 in self.states[i]:
        self.states[i] = [a + 1 for a in self.states[i]]
    sum_states = self.states[i].sum()

    # Normalise both rows by their totals.
    for j in range(self.len_a):
        self.prob_paths[i][j] = float(self.paths[i][j]) / sum_paths
        self.prob_states[i][j] = float(self.states[i][j]) / sum_states

# Percentage corruption levels to evaluate
percentages = [0.1, 0.2]

# Create the word corrector object
wc = WordCorrector()

# Build and evaluate the model for each corruption level
for percent in percentages:
    # Single-argument call form (works on Python 2 and 3). The two spaces
    # reproduce the output of the former `print "...: ", percent` statement.
    print("Results for corruption percentage:  %s" % (percent,))
    wc.construct_HMM(percent)

    # Create the Viterbi object
    viterbi = Viterbi.Viterbi(wc.prob_states, wc.prob_paths, wc.c_test_data)

    # Invoke the execution process of Viterbi
    viterbi.parse_data(wc.test_data)
    viterbi.calc_precision_recall()
# NOTE(review): the statements down to the first `def` read `self` and `i`,
# so they are the tail of a method (inside a loop over `i`) whose header is
# outside this view. The `def`s below take `self` and are called on a
# SpellingCorrection instance, so they presumably belong to that class.
# Everything is laid out relative to the first visible statement -- restore
# the enclosing indentation when merging.
if 0 in self.Eis[i]:
    # A row holding a zero count has every count incremented by one.
    self.Eis[i] = [x + 1 for x in self.Eis[i]]
# Named `row_total` so the builtin `sum` is not shadowed.
row_total = self.Eis[i].sum()
for j in range(len(self.alphabets)):
    self.probEis[i][j] = float(self.Eis[i][j]) / row_total


def getEmissionProbabilities(self):
    """Return the emission probability table (``self.probEis``)."""
    return self.probEis


def getTransitionProbabilities(self):
    """Return the transition probability table (``self.probAij``)."""
    return self.probAij


def trainHMModel(self):
    """Split the document, corrupt both splits and estimate the HMM tables."""
    self.splitDocument()
    # Corrupt the text that was split into the training set and the test set
    self.corruptedTrainingSet = self.corruptText(self.trainingSet, True)
    self.corruptedTestSet = self.corruptText(self.testSet, False)
    # Calculate the probability of a transition from state i to state j
    self.probabilityAij()
    self.probabilityEmission()


objSC = SpellingCorrection()
objSC.trainHMModel()
objViterbi = Viterbi.Viterbi(
    objSC.getEmissionProbabilities(),
    objSC.getTransitionProbabilities(),
    objSC.corruptedTestSet,
)
objViterbi.process(objSC.testSet)
# Script setup: loads predicted and actual activity labels into a Viterbi
# object and generates its start probabilities.
import numpy as np
import math
import copy
from Viterbi import *

# Selects which pair of prediction / ground-truth CSV files is read below.
network_type = 'original'
predictions = './predictions/prediction_' + network_type + '_prob.csv'
actual = './predictions/actual_' + network_type + '_prob.csv'

# Activity labels, handed to Viterbi as its states.
states = [
    'WALKING',
    'RUNNING',
    'STAIRS (UP)',
    'STAIRS (DOWN)',
    'STANDING',
    'SITTING',
    'LYING',
    'BENDING',
    'CYCLING (SITTING)',
    'CYCLING (STANDING)'
]
numOfAct = len(states)

v = Viterbi(states)
v.load_observations(predictions)
actual_labels = v.load_actual_labels(actual)
v.generate_start_probability(numOfAct)

# Disabled hand-written transition matrix, kept for reference. Keys are the
# state names above; every row sums to 100.
#transMatrix={'STANDING': {'STANDING': 82.0, 'BENDING': 3.0, 'WALKING': 7.0, 'CYCLING (SITTING)': 1.0, 'SITTING': 1.0, 'CYCLING (STANDING)': 1.0, 'RUNNING': 2.0, 'STAIRS (UP)': 1.0, 'STAIRS (DOWN)': 1.0, 'LYING': 1.0},
#             'BENDING': {'STANDING': 23.0, 'BENDING': 69.0, 'WALKING': 1.0, 'CYCLING (SITTING)': 1.0, 'SITTING': 1.0, 'CYCLING (STANDING)': 1.0, 'RUNNING': 1.0, 'STAIRS (UP)': 1.0, 'STAIRS (DOWN)': 1.0, 'LYING': 1.0},
#             'WALKING': {'STANDING': 14.0, 'BENDING': 1.0, 'WALKING': 78.0, 'CYCLING (SITTING)': 1.0, 'SITTING': 1.0, 'CYCLING (STANDING)': 1.0, 'RUNNING': 1.0, 'STAIRS (UP)': 1.0, 'STAIRS (DOWN)': 1.0, 'LYING': 1.0},
#             'CYCLING (SITTING)': {'STANDING': 1.0, 'BENDING': 1.0, 'WALKING':1.0, 'CYCLING (SITTING)': 89.0, 'SITTING': 3.0, 'CYCLING (STANDING)': 1.0, 'RUNNING': 1.0, 'STAIRS (UP)': 1.0, 'STAIRS (DOWN)': 1.0, 'LYING': 1.0},
#             'SITTING': {'STANDING': 1.0, 'BENDING': 1.0, 'WALKING': 1.0, 'CYCLING (SITTING)': 1.0, 'SITTING': 91.0, 'CYCLING (STANDING)': 1.0, 'RUNNING': 1.0, 'STAIRS (UP)': 1.0, 'STAIRS (DOWN)': 1.0, 'LYING': 1.0},
#             'CYCLING (STANDING)': {'STANDING': 1.0, 'BENDING': 1.0, 'WALKING': 1.0, 'CYCLING (SITTING)': 1.0, 'SITTING': 1.0, 'CYCLING (STANDING)': 91.0, 'RUNNING': 1.0, 'STAIRS (UP)': 1.0, 'STAIRS (DOWN)': 1.0, 'LYING': 1.0},
#             'RUNNING': {'STANDING': 2.0, 'BENDING': 1.0, 'WALKING': 6.0, 'CYCLING (SITTING)': 1.0, 'SITTING': 1.0, 'CYCLING (STANDING)': 1.0, 'RUNNING': 85.0, 'STAIRS (UP)': 1.0, 'STAIRS (DOWN)': 1.0, 'LYING': 1.0},
#             'STAIRS (UP)': {'STANDING': 1.0, 'BENDING': 1.0, 'WALKING': 1.0, 'CYCLING (SITTING)': 1.0, 'SITTING': 1.0, 'CYCLING (STANDING)': 1.0, 'RUNNING': 1.0, 'STAIRS (UP)': 91.0, 'STAIRS (DOWN)': 1.0, 'LYING': 1.0},
#             'STAIRS (DOWN)': {'STANDING': 1.0, 'BENDING': 1.0, 'WALKING': 1.0, 'CYCLING (SITTING)': 1.0, 'SITTING': 1.0, 'CYCLING (STANDING)': 1.0, 'RUNNING': 1.0, 'STAIRS (UP)': 1.0, 'STAIRS (DOWN)': 91.0, 'LYING': 1.0},
#             'LYING': {'STANDING': 1.0, 'BENDING': 1.0, 'WALKING': 1.0, 'CYCLING (SITTING)': 1.0, 'SITTING': 1.0, 'CYCLING (STANDING)': 1.0, 'RUNNING': 1.0, 'STAIRS (UP)': 1.0, 'STAIRS (DOWN)': 1.0, 'LYING': 91.0}}
# NOTE(review): the statements down to HMM.Prob_Array() are the tail of a
# training loop whose headers (the loops that define `j`, `line_state` and
# `word_list`) are outside this view. They are laid out relative to the first
# visible statement -- restore the enclosing indentation when merging.
HMM.array_A[line_state[j]][line_state[j+1]] += 1  # array_A is used to compute the state-transition probabilities
for p, state_tag in enumerate(line_state):
    HMM.count_dic[state_tag] += 1  # record how many times each state occurs
    char = word_list[p]
    for state in HMM.STATES:
        if char not in HMM.array_B[state]:
            HMM.array_B[state][char] = 0.0  # make sure every character is in each state's dict
    HMM.array_B[state_tag][char] += 1  # array_B is used to compute the emission probabilities
HMM.Prob_Array()  # take the log of the probabilities to preserve precision

# Segment every test line, collecting the pieces and joining once at the end
# instead of growing a string inside the loop.
segmented_lines = []
for line in testSet:
    line = line.strip()
    tag = Viterbi.Viterbi(line, HMM.array_Pi, HMM.array_A, HMM.array_B)
    seg = wordSplit.tag_seg(line, tag)
    # Every segment is followed by a single space, including the last one.
    segmented_lines.append(''.join(piece + ' ' for piece in seg) + '\n')
output = ''.join(segmented_lines)
print(output)

# The context manager guarantees the file is flushed and closed.
with open('output.txt', mode='w', encoding='utf-8') as outputfile:
    outputfile.write(output)