Example #1
def predict_tag(tags, vocab, A, B):

    # Reading the test data and preprocessing it (prep is the word list with empty line marked by <n>)
    test_file = Config.TEST
    original, prep = read_preprocess_test_data(vocab, test_file)

    # Decode the sequences with the Viterbi algorithm; returns the optimal predicted tag sequence for each sentence
    decoder = Viterbi.Viterbi(vocab, tags, prep, A, B)
    predicted_tags = decoder.decode()

    tagged = []

    for word, tag in zip(original, predicted_tags):
        tagged.append((word, tag))

    # writing the output into a file (location output/test_out.tt)
    out_file = Config.TEST_OUT

    with open(out_file, 'w', encoding='utf-8') as out:
        for word, tag in tagged:
            if not word:
                out.write("\n")
            else:
                out.write("{0}\t{1}\n".format(word, tag))

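All of the examples on this page hand the actual decoding off to a project-specific Viterbi class whose internals are not shown here. As a point of reference, below is a minimal, self-contained log-space sketch of the recurrence such a decoder typically implements; the function name viterbi_decode and the NumPy-array parameters A (transitions), B (emissions) and pi (initial distribution) are illustrative assumptions, not the API of any of the quoted projects.

import numpy as np

def viterbi_decode(obs, A, B, pi):
    """Sketch of log-space Viterbi decoding.
    obs: list of observation indices; A[i, j] = P(state j | state i);
    B[i, o] = P(obs o | state i); pi[i] = P(state i at t=0).
    Returns the most likely sequence of state indices."""
    logA, logB, logpi = np.log(A), np.log(B), np.log(pi)
    n_states, T = A.shape[0], len(obs)
    delta = np.zeros((T, n_states))            # best log score ending in each state
    back = np.zeros((T, n_states), dtype=int)  # backpointers
    delta[0] = logpi + logB[:, obs[0]]
    for t in range(1, T):
        for j in range(n_states):
            scores = delta[t - 1] + logA[:, j]
            back[t, j] = np.argmax(scores)
            delta[t, j] = scores[back[t, j]] + logB[j, obs[t]]
    # Trace backpointers from the best final state
    path = [int(np.argmax(delta[-1]))]
    for t in range(T - 1, 0, -1):
        path.append(int(back[t, path[-1]]))
    return path[::-1]

# Toy usage: two states, two observation symbols
# viterbi_decode([0, 1, 0],
#                A=np.array([[0.7, 0.3], [0.4, 0.6]]),
#                B=np.array([[0.9, 0.1], [0.2, 0.8]]),
#                pi=np.array([0.6, 0.4]))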
Example #2
def Decode(self):
    if self.Nodes == [] or self.Edges is None:
        print("No graph for decoding")
        return []
    else:
        self.cls = []
        self.cls = Viterbi.Viterbi([self.Edges] * (len(self.Nodes) - 1), copy.copy(self.Nodes))
        return self.cls
Example #3
    def estMaxSequence(self, filename):

        print("Reading testing data from %s" % (filename))

        # Read in the testing data from the file
        self.dataset = DataSet(filename)
        self.dataset.readFile(200, "test")

        # Run Viterbi to estimate most likely sequence
        viterbi = Viterbi(self.hmm)
        self.maxSequence = viterbi.mostLikelySequence(self.dataset.testOutput)
Example #4
def Decode(self):
    if self.Edges is None:
        print("No graph for decoding")
    elif self.Nodes == []:
        return []
    elif len(self.Nodes) == 1:
        return self.Nodes[0].index(max(self.Nodes[0]))
    else:
        self.cls = []
        self.cls = Viterbi.Viterbi([self.Edges] * (len(self.Nodes) - 1),
                                   copy.copy(self.Nodes))
        return self.cls
Example #5
    def estMaxSequence(self, filename):

        print "Reading testing data from %s" % (filename)

        # Read in the testing data from the file
        dataset = DataSet(filename)
        states, obs = dataset.read_file()

        # Run Viterbi to estimate most likely sequence
        viterbi = Viterbi(self.hmm)
        for idx in range(len(obs)):
            self.maxSequence.append(viterbi.mostLikelySequence(obs[idx]))
            self.realStates.append(states[idx])
Example #6
def runNaive(ObsMat, kmer_size, num_state, event_data_test, write_fasta):
    kmer_map, inv_kmer_map = Util.getKmerMap(kmer_size)
    total_acc = 0.0
    T = 0.0
    for event in event_data_test:
        currentSeq, state_label = DataInput.getData_event(event, kmer_map)
        t = len(currentSeq)
        Vit = Viterbi.Viterbi([], ObsMat, num_state, t, kmer_size)
        Y_hat, seq_est = Vit.decodeNaive(currentSeq)
        Y_test = np.array(state_label).reshape(-1, 1)
        acc = float(np.sum(Y_hat == Y_test)) / t
        total_acc += float(np.sum(Y_hat == Y_test))
        T += t
        print("Accuracy = %f" % acc)
        if write_fasta == 1:
            write_to_file(seq_est, T, kmer_size)
        # print(seq_est)

    total_acc /= T
    print("Total Accuracy = %f" % total_acc)
Example #7
    for seq in ["aactgcacatgcggcgcgcccgcgctaat", "gggcgcgggcgccccgcg"]:
        # NB. Book and Lio's notes use integrated transition and initial
        # distribution matrix (initial step is transition from dummy state 0)
        # This is confusing, so I will separate them out.
        # Wiki has non-integrated Viterbi algorithm implementation

        # 1.1. Implement Forward algorithm
        fwd = Forward(TransitionP, EmissionP, InitialP)
        p = fwd.prob(seq)
        print "**************************************"
        print "Probability of", seq, ":", p
        print "Log probability:", -log(p)
        print "**************************************"

        # 1.2. Implement Viterbi algorithm
        vtb = Viterbi(TransitionP, EmissionP, InitialP)
        (prob, path) = vtb.maxSeq(seq)
        print "**************************************"
        print "Viterbi path:"
        print "P =", prob
        print seq
        print ''.join(str(i) for i in path)
        print "**************************************"

        # 1.3. Length distribution
        # Suppose we have a string of only G-C (with equal emission
        # probabilities for each state).
        # Once the HMM enters state 1 (detecting G-C islands), modify the
        # probability of leaving this state to 1/200, and of staying to
        # 199/200. Then on average the HMM will stay in that state for
        # 200 characters.
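The 1/200 and 199/200 figures follow from the geometric length distribution implied by a state's self-transition: if the probability of leaving the state at each step is p, the expected number of steps spent in it is 1/p. A quick check of the numbers in the comment above:

p_stay = 199 / 200
p_leave = 1 / 200
expected_dwell = 1 / p_leave   # mean of the geometric dwell-time distribution
print(expected_dwell)          # 200.0 characters on average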
Example #8
        # Calculate state and path probabilities
        for i in range(0, self.len_a):
            # Add-one smoothing to prevent a divide-by-zero error
            if 0 in self.paths[i]:
                self.paths[i] = self.paths[i] + 1
            sum_paths = self.paths[i].sum()

            if 0 in self.states[i]:
                self.states[i] = self.states[i] + 1
            sum_states = self.states[i].sum()

            for j in range(0, self.len_a):
                self.prob_paths[i][j] = float(self.paths[i][j]) / sum_paths
                self.prob_states[i][j] = float(self.states[i][j]) / sum_states


#percentage corruption:
percentages = [0.1, 0.2]
#create wordcorrector class object
wc = WordCorrector()
#call model
for percent in percentages:
    print "Results for corruption percentage: ", percent
    wc.construct_HMM(percent)

    #create viterbi object
    viterbi = Viterbi.Viterbi(wc.prob_states, wc.prob_paths, wc.c_test_data)
    #invoke the execution process of viterbi
    viterbi.parse_data(wc.test_data)
    viterbi.calc_precision_recall()
Example #9
            if 0 in self.Eis[i]:
                self.Eis[i] = self.Eis[i] + 1
            total = self.Eis[i].sum()
            # print(self.alphabets[i], total)
            for j in range(0, len(self.alphabets)):
                self.probEis[i][j] = float(self.Eis[i][j]) / total

    def getEmissionProbabilities(self):
        return self.probEis

    def getTransitionProbabilities(self):
        return self.probAij

    def trainHMModel(self):
        self.splitDocument()
        # Corrupt the split text for the training set and the test set
        self.corruptedTrainingSet = self.corruptText(self.trainingSet, True)
        self.corruptedTestSet = self.corruptText(self.testSet, False)

        # Calculate the transition probabilities (state i to state j) and the emission probabilities
        self.probabilityAij()
        self.probabilityEmission()


objSC = SpellingCorrection()
objSC.trainHMModel()
objViterbi = Viterbi.Viterbi(objSC.getEmissionProbabilities(),
                             objSC.getTransitionProbabilities(),
                             objSC.corruptedTestSet)
objViterbi.process(objSC.testSet)
Example #10
import numpy as np
import math
import copy
from Viterbi import *

network_type = 'original'
predictions = './predictions/prediction_' + network_type + '_prob.csv'
actual = './predictions/actual_' + network_type + '_prob.csv'

states = [
    'WALKING', 'RUNNING', 'STAIRS (UP)', 'STAIRS (DOWN)', 'STANDING',
    'SITTING', 'LYING', 'BENDING', 'CYCLING (SITTING)', 'CYCLING (STANDING)'
]
numOfAct = len(states)

v = Viterbi(states)
v.load_observations(predictions)
actual_labels = v.load_actual_labels(actual)
v.generate_start_probability(numOfAct)

#transMatrix={'STANDING': {'STANDING': 82.0, 'BENDING': 3.0, 'WALKING': 7.0, 'CYCLING (SITTING)': 1.0, 'SITTING': 1.0, 'CYCLING (STANDING)': 1.0, 'RUNNING': 2.0, 'STAIRS (UP)': 1.0, 'STAIRS (DOWN)': 1.0, 'LYING': 1.0},
#	'BENDING': {'STANDING': 23.0, 'BENDING': 69.0, 'WALKING': 1.0, 'CYCLING (SITTING)': 1.0, 'SITTING': 1.0, 'CYCLING (STANDING)': 1.0, 'RUNNING': 1.0, 'STAIRS (UP)': 1.0, 'STAIRS (DOWN)': 1.0, 'LYING': 1.0},
#	'WALKING': {'STANDING': 14.0, 'BENDING': 1.0, 'WALKING': 78.0, 'CYCLING (SITTING)': 1.0, 'SITTING': 1.0, 'CYCLING (STANDING)': 1.0, 'RUNNING': 1.0, 'STAIRS (UP)': 1.0, 'STAIRS (DOWN)': 1.0, 'LYING': 1.0},
#	'CYCLING (SITTING)': {'STANDING': 1.0, 'BENDING': 1.0, 'WALKING':1.0, 'CYCLING (SITTING)': 89.0, 'SITTING': 3.0, 'CYCLING (STANDING)': 1.0, 'RUNNING': 1.0, 'STAIRS (UP)': 1.0, 'STAIRS (DOWN)': 1.0, 'LYING': 1.0},
#	'SITTING': {'STANDING': 1.0, 'BENDING': 1.0, 'WALKING': 1.0, 'CYCLING (SITTING)': 1.0, 'SITTING': 91.0, 'CYCLING (STANDING)': 1.0, 'RUNNING': 1.0, 'STAIRS (UP)': 1.0, 'STAIRS (DOWN)': 1.0, 'LYING': 1.0},
#	'CYCLING (STANDING)': {'STANDING': 1.0, 'BENDING': 1.0, 'WALKING': 1.0, 'CYCLING (SITTING)': 1.0, 'SITTING': 1.0, 'CYCLING (STANDING)': 91.0, 'RUNNING': 1.0, 'STAIRS (UP)': 1.0, 'STAIRS (DOWN)': 1.0, 'LYING': 1.0},
#	'RUNNING': {'STANDING': 2.0, 'BENDING': 1.0, 'WALKING': 6.0, 'CYCLING (SITTING)': 1.0, 'SITTING': 1.0, 'CYCLING (STANDING)': 1.0, 'RUNNING': 85.0, 'STAIRS (UP)': 1.0, 'STAIRS (DOWN)': 1.0, 'LYING': 1.0},
#	'STAIRS (UP)': {'STANDING': 1.0, 'BENDING': 1.0, 'WALKING': 1.0, 'CYCLING (SITTING)': 1.0, 'SITTING': 1.0, 'CYCLING (STANDING)': 1.0, 'RUNNING': 1.0, 'STAIRS (UP)': 91.0, 'STAIRS (DOWN)': 1.0, 'LYING': 1.0},
#	'STAIRS (DOWN)': {'STANDING': 1.0, 'BENDING': 1.0, 'WALKING': 1.0, 'CYCLING (SITTING)': 1.0, 'SITTING': 1.0, 'CYCLING (STANDING)': 1.0, 'RUNNING': 1.0, 'STAIRS (UP)': 1.0, 'STAIRS (DOWN)': 91.0, 'LYING': 1.0},
#	'LYING': {'STANDING': 1.0, 'BENDING': 1.0, 'WALKING': 1.0, 'CYCLING (SITTING)': 1.0, 'SITTING': 1.0, 'CYCLING (STANDING)': 1.0, 'RUNNING': 1.0, 'STAIRS (UP)': 1.0, 'STAIRS (DOWN)': 1.0, 'LYING': 91.0}}
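The commented-out transMatrix above stores raw per-row counts (each inner dict sums to 100). Before such a table can serve as a transition model it has to be row-normalized so each row sums to 1. A minimal sketch of that normalization, assuming the dict-of-dicts layout shown in the comment (how this project's Viterbi class actually ingests its transition model is not shown on this page):

def normalize_transitions(count_matrix):
    # Turn a dict-of-dicts of transition counts into row-normalized probabilities.
    probs = {}
    for src, counts in count_matrix.items():
        total = sum(counts.values())
        probs[src] = {dst: c / total for dst, c in counts.items()}
    return probs

# Toy example with two states:
# normalize_transitions({'A': {'A': 9.0, 'B': 1.0}, 'B': {'A': 2.0, 'B': 8.0}})
# -> {'A': {'A': 0.9, 'B': 0.1}, 'B': {'A': 0.2, 'B': 0.8}}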
Example #11
            HMM.array_A[line_state[j]][line_state[j+1]] += 1  # array_A accumulates state-transition counts

        for p in range(len(line_state)):
            HMM.count_dic[line_state[p]] += 1  # record how many times each state occurs
            for state in HMM.STATES:
                if word_list[p] not in HMM.array_B[state]:
                    HMM.array_B[state][word_list[p]] = 0.0  # ensure every character has an entry in each state's dictionary
            # if word_list[p] not in array_B[line_state[p]]:
            #     # print(word_list[p])
            #     array_B[line_state[p]][word_list[p]] = 0
            # else:
            HMM.array_B[line_state[p]][word_list[p]] += 1  # array_B accumulates emission counts

    HMM.Prob_Array()    # take the log of the probabilities to preserve numerical precision

    output = ''

    for line in testSet:
        line = line.strip()
        tag = Viterbi.Viterbi(line, HMM.array_Pi, HMM.array_A, HMM.array_B)
        # print(tag)
        seg = wordSplit.tag_seg(line, tag)
        # print(seg)
        seg_line = ''
        for i in range(len(seg)):
            seg_line = seg_line + seg[i] + ' '
        # print(seg_line)
        output = output + seg_line + '\n'
    print(output)
    with open('output.txt', mode='w', encoding='utf-8') as outputfile:
        outputfile.write(output)
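Prob_Array above stores log probabilities, and the reason is worth spelling out: a Viterbi path score is a product of many probabilities well below 1, which underflows to 0.0 in double precision for long sentences, whereas the equivalent sum of log probabilities stays representable. A quick illustration (the 0.01 per-step probability is just a made-up number):

import math

p_step = 0.01
product = 1.0
log_sum = 0.0
for _ in range(200):            # imagine a 200-character sentence
    product *= p_step           # underflows to 0.0 long before the end
    log_sum += math.log(p_step)

print(product)                  # 0.0
print(log_sum)                  # about -921.0, still fine for comparing paths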