def GetUserpro():
    employees = CSVFile.loadCSVfile1("./data/allusers/validusers.csv")
    userpro = []
    for item in employees:
        proresult = []
        user = item[0]
        state = item[1]
        testsequence = UserSequences.GettraintestdataV2(user)
        if testsequence == 0:
            continue
        proresult.append(user)
        proresult.append(state)
        preProcess.GetTransiMatrixV2(user)
        H = HMM(user)
        for sequence in testsequence:
            result = H.hmmV2(sequence)
            proresult.append(result)
        userpro.append(proresult)
    # Daily probabilities over each user's 60 days from the improved MM method; multiplied
    # filename = "./data/allusers/userpro.csv"
    # Daily probabilities over each user's 60 days from the MM method; summed
    filename = "./data/allusers/MM_userpro.csv"
    CSVFile.Writecsvtofile(filename, userpro)
    print(userpro)
class RunViterbi(object):
    def __init__(self):
        self.maxSequence = []

    def trainHMM(self, filename):
        print("Reading training data from %s" % (filename))
        # Read in the training data from the file
        self.dataset = DataSet(filename)
        self.dataset.readFile(200, "train")
        # Instantiate and train the HMM
        self.hmm = HMM(self.dataset.numStates, self.dataset.numOutputs,
                       self.dataset.trainState, self.dataset.trainOutput)
        self.hmm.train()
        return

    def estMaxSequence(self, filename):
        print("Reading testing data from %s" % (filename))
        # Read in the testing data from the file
        self.dataset = DataSet(filename)
        self.dataset.readFile(200, "test")
        # Run Viterbi to estimate the most likely sequence
        viterbi = Viterbi(self.hmm)
        self.maxSequence = viterbi.mostLikelySequence(self.dataset.testOutput)
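# Hedged usage sketch for RunViterbi: the file names are placeholders, and
# DataSet, HMM, and Viterbi are assumed to be importable in this module.
if __name__ == '__main__':
    runner = RunViterbi()
    runner.trainHMM('train.txt')       # hypothetical training file
    runner.estMaxSequence('test.txt')  # hypothetical testing file
    print(runner.maxSequence)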
def test_inequality(self):
    """
    Compare two posterior distributions.
    The hidden state for the unobserved roll is less likely
    to be fair in the first case.
    """
    # define the dishonest casino model
    fair_state = HMM.HiddenDieState(1 / 6.0)
    loaded_state = HMM.HiddenDieState(0.5)
    M = np.array([[0.95, 0.05], [0.1, 0.9]])
    T = TransitionMatrix.MatrixTransitionObject(M)
    hidden_states = [fair_state, loaded_state]
    # create the hidden Markov model object
    hmm_new = InternalModel(T, hidden_states)
    # define sequences of observations
    observations_a = [1, 6, 6, None, 6, 2, 3, 4, 5, 1]
    observations_b = [1, 6, 6, 6, 6, 2, 3, 4, None, 1]
    # get posterior distributions
    distributions_a = hmm_new.posterior(hmm_new.get_dp_info(observations_a))
    distributions_b = hmm_new.posterior(hmm_new.get_dp_info(observations_b))
    # Compare the posterior probability that the die was fair
    # at each interesting position.
    p_fair_a = distributions_a[3][0]
    p_fair_b = distributions_b[-2][0]
    self.assertTrue(p_fair_a < p_fair_b)
    self.assertNotAlmostEqual(p_fair_a, p_fair_b)
def test_external_file_model_compatibility(self):
    """
    Test StringIO streams for dynamic programming.
    """
    # define the dishonest casino model
    fair_state = HMM.HiddenDieState(1 / 6.0)
    loaded_state = HMM.HiddenDieState(0.5)
    M = np.array([[0.95, 0.05], [0.1, 0.9]])
    T = TransitionMatrix.MatrixTransitionObject(M)
    hidden_states = [fair_state, loaded_state]
    # define a sequence of observations
    observations = [1, 2, 6, 6, 1, 2, 3, 4, 5, 6]
    # define the observation stream
    o_converter = lineario.IntConverter()
    o_stream = lineario.SequentialStringIO(o_converter)
    o_stream.open_write()
    for x in observations:
        o_stream.write(x)
    o_stream.close()
    # create the reference hidden Markov model object
    hmm_old = HMM.TrainedModel(M, hidden_states)
    # create the testing hidden Markov model object
    names = ('tmp_f.tmp', 'tmp_s.tmp', 'tmp_b.tmp')
    hmm_new = ExternalModel(T, hidden_states, names)
    # get posterior distributions
    distributions_old = hmm_old.scaled_posterior_durbin(observations)
    hmm_new.init_dp(o_stream)
    distributions_new = list(hmm_new.posterior())
    # assert that the distributions are the same
    self.assertTrue(np.allclose(distributions_old, distributions_new))
def test_scaled_ntransitions_expected_inequality(self):
    """
    In the dishonest casino, a run of sixes means not much expected switching.
    Maybe this test should be moved to the HMM module.
    """
    fair_state = HMM.HiddenDieState(1 / 6.0)
    loaded_state = HMM.HiddenDieState(0.5)
    states = [fair_state, loaded_state]
    prandom = 0.1
    # define the new hmm
    cache_size = 100
    transition_object = TransitionMatrix.UniformTransitionObject(
        prandom, len(states))
    hmm = Model(transition_object, [fair_state, loaded_state], cache_size)
    # define sequences of observations
    observations_a = [6, 6, 6, 6, 1, 2, 2, 4, 5, 4]
    observations_b = [6, 6, 6, 6, 1, 6, 6, 6, 5, 6]
    # define the (degenerate) distances between observations
    distances = [1] * (len(observations_a) - 1)
    # get the expected number of transitions for each observation sequence
    e_a = hmm.scaled_ntransitions_expected(
        hmm.get_dp_info(observations_a, distances))
    e_b = hmm.scaled_ntransitions_expected(
        hmm.get_dp_info(observations_b, distances))
    # assert that we see what we expect
    self.assertTrue(e_a > e_b)
    self.assertNotAlmostEqual(e_a, e_b)
def main():
    word_list = []
    dictionary = set()
    syllable_dict = {}
    with open('data/Syllable_dictionary.txt') as f:
        for line in f:
            fields = line.split()
            word_list.append(fields[0])
            dictionary.add(fields[0])
            syllable_dict[fields[0]] = fields[1:]
    data = load_data('data/shakespeare.txt', dictionary)
    data_rhyme = load_data_rhyme('data/shakespeare.txt', dictionary)
    data_HMM = encode_data_HMM(data, word_list)
    data_HMM_rhyme = encode_data_HMM(data_rhyme, word_list)
    rhyme_dict = generate_rhyme_pairs(
        load_data_rhyme('data/shakespeare_no_dumb_poems.txt', dictionary))
    '''
    model = HMM.unsupervised_HMM(data_HMM_rhyme, 20, 100)
    model.save("reversedHMM")
    '''
    modelRhyme = HMM.load_from_file("reversedHMM.npz")
    modelRegular = HMM.load_from_file("HMM20.npz")
    poems_from_various_models(word_list, syllable_dict)
    generate_rhyming_sonnet(modelRhyme, word_list, syllable_dict, rhyme_dict)
    generate_haiku(modelRegular, word_list, syllable_dict)
def hmm(spike_datas, m, dt, seed, spikestart, spikeend, plotFlag=False):
    """
    Take parallel time-series data and estimate the hidden-state transitions.
    Here the hidden state lives in the state space of the event rate of the
    time series (event generation is assumed to follow a Poisson model).
    Example: estimate hidden states from the spike-time data of several neurons.

    Args:
        spike_datas: a list of lists, one list of event times per parallel
            series. Slightly tedious to build, but each series has a different
            number of events, so each one is stored in its own list.
            Example: data = np.loadtxt(...) -> data.tolist() -> spike_datas.append(data)
        m: number of states
        dt: time-bin width
        seed: random seed used to choose the initial values
        spikestart: time at which spiking starts
        spikeend: time at which spiking ends
        plotFlag: whether to plot the hidden-state transitions; True means plot.

    Returns:
        spike_hmm: the estimated hidden-state transitions
        means: the rate of each state for each time series
    """
    # t1 = time.time()
    total_step, spike_obs = HMM.get_obs(spikestart, spikeend, spike_datas, dt)
    # initial values
    pi, a, means = HMM.firstmodel(spike_datas, seed, dt, m, total_step)
    # Baum-Welch estimation
    pi, a, means = HMM.baumwelch(pi, a, means, spike_obs)
    # Viterbi decoding
    spike_hmm = HMM.viterbirate(dt, pi, a, means, spike_obs)
    # set plotFlag to True to draw the figure
    if plotFlag:
        plot_rate(m, dt, spike_hmm)
    # t2 = time.time()
    return spike_hmm, means
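# Minimal usage sketch for hmm(), following the docstring's recipe for
# building spike_datas; the file names and parameter values are assumptions.
import numpy as np

spike_datas = []
for path in ['neuron1_times.txt', 'neuron2_times.txt']:  # hypothetical files
    spike_datas.append(np.loadtxt(path).tolist())
spike_hmm, means = hmm(spike_datas, m=3, dt=0.01, seed=0,
                       spikestart=0.0, spikeend=60.0, plotFlag=False)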
def test_scaled_ntransitions_expected_compatibility(self):
    fair_state = HMM.HiddenDieState(1 / 6.0)
    loaded_state = HMM.HiddenDieState(0.5)
    states = [fair_state, loaded_state]
    prandom = 0.1
    # define the old hmm
    transition_matrix = TransitionMatrix.get_uniform_transition_matrix(
        prandom, len(states))
    old_hmm = HMM.TrainedModel(transition_matrix, states)
    # define the new hmm
    cache_size = 100
    transition_object = TransitionMatrix.UniformTransitionObject(
        prandom, len(states))
    new_hmm = Model(transition_object, [fair_state, loaded_state], cache_size)
    # define a sequence of observations
    observations = [1, 2, 6, 6, 1, 2, 3, 4, 5, 6]
    # define the (degenerate) distances between observations
    distances = [1] * (len(observations) - 1)
    # use the old algorithm to get the expected number of transitions
    e_initial, A = old_hmm.scaled_transition_expectations_durbin(observations)
    ntransitions_expected_old = np.sum(A) - np.sum(np.diag(A))
    # use the new algorithm to get the expected number of transitions
    dp_info = new_hmm.get_dp_info(observations, distances)
    ntransitions_expected_new = new_hmm.scaled_ntransitions_expected(dp_info)
    # assert that the expected numbers of transitions are almost the same
    self.assertAlmostEqual(ntransitions_expected_old,
                           ntransitions_expected_new)
def __init__(self):
    fair_state = HMM.HiddenDieState(1 / 6.0)
    loaded_state = HMM.HiddenDieState(0.5)
    transition_matrix = np.array([[0.95, 0.05], [0.1, 0.9]])
    transition_object = TransitionMatrix.MatrixTransitionObject(
        transition_matrix)
    cache_size = 100
    Model.__init__(self, transition_object, [fair_state, loaded_state],
                   cache_size)
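# Hedged usage sketch, mirroring the compatibility tests above: this assumes
# the __init__ belongs to a DishonestCasino(Model) subclass and that Model
# exposes get_dp_info(observations, distances) and
# scaled_ntransitions_expected() as used in those tests.
casino = DishonestCasino()
rolls = [1, 2, 6, 6, 1, 2, 3, 4, 5, 6]
distances = [1] * (len(rolls) - 1)
print(casino.scaled_ntransitions_expected(casino.get_dp_info(rolls, distances)))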
def hmm_shakespeare_sonnet_goal1():
    # Load in everything
    sonnets, obs_map = sp.get_sonnets("data/shakespeare.txt", 3000)
    obs_map_r = {}
    for key in obs_map:
        obs_map_r[obs_map[key]] = key
    syl_map = sp.get_syllable_map("data/Syllable_dictionary.txt")
    # Train HMM
    model = HMM.unsupervised_HMM(sonnets, 5, 25)
    num_states = model.L
    # Print one blank line to make it pretty
    print("")
    # Generate sonnet
    last_state = np.random.choice(num_states, p=model.A_start)
    for n_lines in [4, 4, 2]:
        for l_no in range(n_lines):
            line = ""
            while not line:
                curr_state = last_state
                l = ""
                no_syls = 0
                while no_syls < 10:
                    w, curr_state = add_word(curr_state, model.A, model.O)
                    new_word = obs_map_r[w]
                    l += new_word
                    l += " "
                    no_syls += syl_map[new_word]
                if count_syllables(l, syl_map) == 10:
                    line = l
                    last_state = curr_state
            print(line.capitalize())
        print("")
def main():
    # Raw strings keep the Windows backslashes from being read as escapes
    folder = r'C:\Users\Chandramohan\Desktop\@IIT\POS Tagger\Data'
    data_file = folder + r'\Data.txt'
    inp_file = folder + r'\Train.txt'
    out_file = folder + r'\Test.txt'
    # Creating vocabulary
    data, tags = Utilities.Get_data(data_file)
    d_vocab, t_vocab = Utilities.Get_vocabulary(data, tags)
    # Get train data
    data, tags = Utilities.Get_data(inp_file)
    train_d, train_t = Utilities.Convert_data(data, tags, d_vocab, t_vocab)
    # Get test data
    data, tags = Utilities.Get_data(out_file)
    test_d, test_t = Utilities.Convert_data(data, tags, d_vocab, t_vocab)
    # Training
    hmm = HMM.HMM(train_d, train_t, d_vocab, t_vocab)
    p_t = []
    for i in range(len(test_d)):
        seq = hmm.Viterbi(test_d[i])
        p_t.append(seq)
        Utilities.Accuracy_item(seq, test_t[i])
    Utilities.Accuracy_model(p_t, test_t)
def main(argv):
    # Construct the emission probabilities of hitting a key
    # given you tried to hit a (potentially) different key.
    b = ce.constructEmissions(pr_correct, adj)
    # Construct transition probabilities and a prior distribution
    # from the King James Bible.
    (p, prior) = ct.constructTransitions('bible.txt')
    # Run the Viterbi algorithm on each word of the messages to determine
    # the most likely sequence of characters.
    for t in range(0, len(input)):
        s_in = input[t].split()
        output = ""
        for i, word in enumerate(s_in):
            y = np.zeros(shape=(1, len(word)))
            for j in range(0, len(word)):
                y[0][j] = ord(word[j]) - ord('a')
            # perform the Viterbi algorithm
            x = hmm.HMM(p, prior, b, y[0])
            for j in range(0, len(x)):
                output += chr(int(x[j]) + ord('a'))
            if i != len(s_in) - 1:
                output += ' '
        print(input[t])
        print(output)
        print()
def main():
    args = parse_args()
    if args.fasta_files:
        # load matrices
        tp = np.loadtxt(TRANSITION_MATRIX, delimiter=',')
        ep = pd.read_csv(EMISSION_MATRIX, index_col=0)
        with open(INITIAL_MATRIX, 'r') as json_file:
            ip = json.load(json_file)
        # initialize HMM
        cpgi_finder = HMM.HMM(transitions=tp, emissions=ep,
                              initials=ip, states=STATES)
        # predict islands
        results = []
        for fasta_file in args.fasta_files:
            try:
                seq_id, observation = SeqHandler.convert_seq(fasta_file)
            except ValueError as err:
                print('Incorrect file format:', err)
            else:
                result = cpgi_finder.viterbi(observation)
                results.append((seq_id, result))
        if results:
            SeqHandler.writeout(results)
def poems_from_various_models(word_list, syllable_dict):
    models = [1, 3, 6, 10, 15, 20, 25]
    for i in models:
        print("### Model " + str(i) + " ###")
        model = HMM.load_from_file('HMM' + str(i) + '.npz')
        generate_sonnet(model, word_list, syllable_dict)
        print()
def hmm_shakespeare_sonnet_naive():
    # Load in everything
    sonnets, obs_map = sp.get_sonnets("data/shakespeare.txt", 900)
    obs_map_r = {}
    for key in obs_map:
        obs_map_r[obs_map[key]] = key
    # syl_map was not defined in this function; loaded here the same way
    # as in the sibling generators, since the couplet below needs it
    syl_map = sp.get_syllable_map("data/Syllable_dictionary.txt")
    # Train HMM
    model = HMM.unsupervised_HMM(sonnets, 10, 10)

    def make_line():
        # Emit seven words and capitalize the first one
        l = ""
        emission, states = model.generate_emission(7)
        for i in range(len(emission)):
            w = obs_map_r[emission[i]]
            if i == 0:
                w = w.capitalize()
            l += w + " "
        return l

    # Print one blank line to make it pretty
    print("")
    # Generate quatrains 1 and 2
    for quatrain in range(2):
        for _ in range(4):
            line = ""
            while not line:
                line = make_line()
            print(line)
        if quatrain == 0:
            print("")
    # Generate couplet
    for _ in range(2):
        line = ""
        while not line:
            line = get_n_syls(10, make_line(), syl_map)
        print(line)
def test_model_compatibility(self):
    # define the dishonest casino model
    fair_state = HMM.HiddenDieState(1 / 6.0)
    loaded_state = HMM.HiddenDieState(0.5)
    M = np.array([[0.95, 0.05], [0.1, 0.9]])
    T = TransitionMatrix.MatrixTransitionObject(M)
    hidden_states = [fair_state, loaded_state]
    # define a sequence of observations
    observations = [1, 2, 6, 6, 1, 2, 3, 4, 5, 6]
    # create the reference hidden Markov model object
    hmm_old = HMM.TrainedModel(M, hidden_states)
    # create the testing hidden Markov model object
    hmm_new = InternalModel(T, hidden_states)
    # get posterior distributions
    distributions_old = hmm_old.scaled_posterior_durbin(observations)
    distributions_new = hmm_new.posterior(hmm_new.get_dp_info(observations))
    # assert that the distributions are the same
    self.assertTrue(np.allclose(distributions_old, distributions_new))
def _init_classifiers(self):
    # Initialize classifier objects
    self.fenc = FreemanEncoder()
    self.knn = KNN.KNN()
    self.HMM = HMM.HMM()
    self.NaiveBayes = NaiveBayes.NaiveBayes()
    self.RandomForest = RandomForest.RandomForests()
    self.SVM = svm.SVM_SVC()
    self.LogisticReg = LogisticReg.LogisticReg()
    self.AdaBoost = adaboost.AdaBoost()
    self.GBRT = gbrt.GBRT()

    # Train initially on the default data set, if no model is saved already
    # Initialize KNN; no saved model for KNN
    self.knn.knn_train(CharRecognitionGUI_support.training_dataset, 1.0)
    # Initialize HMM
    self.HMM.training(CharRecognitionGUI_support.training_dataset)
    # Initialize Naive Bayes
    try:
        pickle.load(open("./Models/naivebayes_model.p", "rb"))
    except IOError:
        self.NaiveBayes.training(CharRecognitionGUI_support.training_dataset)
    # Initialize Random Forest
    try:
        pickle.load(open("./Models/random_forest.p", "rb"))
    except IOError:
        self.RandomForest.training(CharRecognitionGUI_support.training_dataset)
    # Initialize SVM
    try:
        pickle.load(open("./Models/svm.p", "rb"))
    except IOError:
        self.SVM.training(CharRecognitionGUI_support.training_dataset)
    # Initialize Logistic Regression
    try:
        pickle.load(open("./Models/logistic_model.p", "rb"))
    except IOError:
        self.LogisticReg.training(CharRecognitionGUI_support.training_dataset)
    # Initialize AdaBoost
    try:
        pickle.load(open("./Models/AdaBoostClassifier.p", "rb"))
    except IOError:
        self.AdaBoost.training(CharRecognitionGUI_support.training_dataset)
    # Initialize GBRT
    try:
        pickle.load(open("./Models/GradientBoostingClassifier.p", "rb"))
    except IOError:
        self.GBRT.training(CharRecognitionGUI_support.training_dataset)
def segmentWord(oriSentence, useRules=True):
    sentence = preprocess(oriSentence)
    # This check is necessary: if the dictionary has already been
    # initialized, we don't have to do it again.
    if not isInitialized:
        initialize()
    DAG = getDAG(sentence)
    route = {}
    calculateRoute(DAG, route, sentence)
    words = []
    cur = 0
    buf = ""
    N = len(sentence)
    # cur is the current cursor position in the sentence
    while cur < N:
        next_cur = route[cur][1]
        word = sentence[cur:next_cur + 1]
        # If the word is just a single char, it might be part of an
        # out-of-vocabulary word; buf accumulates such chars temporarily.
        if len(word) == 1:
            buf += word
        else:
            # We reached the next multi-char word, so flush the buffer.
            if len(buf) != 0:
                # If the buffer is in the dictionary (usually not),
                # or it is a single char, append it directly ...
                if buf in Possibility or len(buf) == 1:
                    words.append(buf)
                else:
                    # ... otherwise it is an out-of-vocabulary word,
                    # and we must use the HMM to segment it.
                    words += HMM.cutUnrecognized(buf)
                # clear the buffer
                buf = ""
            words.append(word)
        cur = next_cur + 1
    if buf:
        words += HMM.cutUnrecognized(buf)
    return words
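# Usage sketch (assumes the dictionary files loaded by initialize() are in
# place); the sample sentence is an arbitrary example.
print("/".join(segmentWord("今天天气真好")))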
def HMMClassify(train, trainNE, test):
    """
    Return the tagging prediction produced by the HMM system as a dictionary.
    """
    model = HMM.HMM(train, trainNE)
    lines = preprocess.readFile(test)
    prediction = {'PER': [], 'LOC': [], 'ORG': [], 'MISC': []}
    lineNum = 1
    for line in lines:
        if (lineNum % 3) == 1:
            # Line with tokens
            tags = model.assignTags(line)
        elif (lineNum % 3) == 0:
            # Line with indexes
            indexes = line.strip().split()
            preClass = None
            firstIdx = None
            lastIdx = None
            NEcontinues = False
            for i in range(len(tags)):
                tag = tags[i]
                if tag == 'O':
                    if NEcontinues:
                        # Previous tag ends
                        prediction[preClass].append(firstIdx + '-' + lastIdx)
                        preClass = None
                        firstIdx = None
                        lastIdx = None
                        NEcontinues = False
                else:
                    if NEcontinues:
                        if tag != preClass:
                            # Previous tag ends, new tag begins
                            prediction[preClass].append(firstIdx + '-' + lastIdx)
                            preClass = tag
                            firstIdx = indexes[i]
                            lastIdx = indexes[i]
                        else:
                            # Previous tag continues
                            lastIdx = indexes[i]
                    else:
                        # New tag begins
                        preClass = tag
                        firstIdx = indexes[i]
                        lastIdx = indexes[i]
                        NEcontinues = True
        lineNum += 1
    return prediction
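# Hedged usage sketch: the arguments are passed straight through to
# HMM.HMM(train, trainNE) and preprocess.readFile(test), so the placeholder
# file names here are assumptions about the expected inputs.
prediction = HMMClassify('train.tokens', 'train.ne', 'test.txt')
for ne_class in ('PER', 'LOC', 'ORG', 'MISC'):
    print(ne_class, prediction[ne_class][:5])  # first few predicted spans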
def auto_reply(msg):
    if msg.is_at:
        start = msg.text
        print(start)
        song = HMM.generate(start)
        print('Writing a song...')
        print(type(song))
        print(song)
        # Tuling chatbot reply (disabled): answer with the chat response instead
        # answer = tuling.chat(msg.text)
        # return answer
        return song
def get_score(test_file, output_file, mode='quick'):
    with open(test_file, "r") as f:
        with open(output_file, "w") as f2:
            for line in f:
                if mode == 'quick':
                    for word in cut_sentense(line):
                        f2.write(word + " ")
                elif mode == 'HMM':
                    for word in HMM.cut_HMM(line):
                        f2.write(word + " ")
                elif mode == 'CRF':
                    for word in CRF.cut_CRF(line):
                        f2.write(word + " ")
def Create_HMM_Predictions(yearly_df):
    # Create hidden Markov model predictions: compute the transitions of the
    # ratio and predict based on the next direction.
    predictions = []
    num_years = yearly_df.shape[1] - 1
    for i in range(len(yearly_df)):
        years = yearly_df.values[i]
        # `np.nan in years` is unreliable for NaN detection; use np.isnan
        if np.isnan(years).any():
            predictions.append(np.nan)
        else:
            try:
                direction_years = []
                for j in range(1, num_years):  # j avoids shadowing the row index i
                    if years[j] > years[j - 1]:
                        direction_years.append(1)
                    else:
                        direction_years.append(0)
                params = HMM.Prep_Forward(direction_years)
                probs = HMM.forward(params, np.array(direction_years))
                recent_year = probs[0][-1]
                direction = np.argmax(recent_year)
                diff = abs(years[-3] - years[-2])
                if direction == 0:
                    prediction = years[-2] - diff
                else:
                    prediction = years[-2] + diff
                predictions.append(prediction)
            except Exception:
                predictions.append(np.nan)
    real = yearly_df.values[:, -1]
    pred = np.array(predictions)
    error = ((real - pred) ** 2) / len(real)
    # error = mean_squared_error(real, pred)
    yearly_df["HMM_error"] = error
    return yearly_df
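# Minimal sketch with a tiny synthetic DataFrame; the column names and values
# are assumptions chosen only to exercise the function.
import numpy as np
import pandas as pd

yearly_df = pd.DataFrame({'2016': [1.0, 2.0], '2017': [1.2, 1.8],
                          '2018': [1.1, 2.1], '2019': [1.3, 1.9]})
result = Create_HMM_Predictions(yearly_df)
print(result['HMM_error'])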
def GetResult():
    k = 4
    UserSequences.Gettraintestdata(k)
    # employees = Employees.queryEmployees()
    employees = CSVFile.loadCSVfile1("./data/allusers/validusers.csv")
    # employees = CSVFile.loadCSVfile1("./data/allusers/Allusers_state.csv")
    resultlist = []
    avgresult = []
    for item in employees:
        user = item[0]
        state = item[1]
        preProcess.GetTransiMatrix(user)
        H = HMM(user)
        result = H.hmm(user)
        print(user, result)
        # resultlist.append([user, result])
        avgresultpro = average(result)
        avgresult.append([user, avgresultpro, state])
        print("average:", average(result), state)
        result.insert(0, user)
        resultlist.append(result)
    # str(k) is needed here because k is an int
    resultfile = './data/allusers/' + str(k) + 'ResultPro974.csv'
    avgresultfile = './data/allusers/' + str(k) + 'AvgResultPro974.csv'
    CSVFile.Writecsvtofile(resultfile, resultlist)
    CSVFile.Writecsvtofile(avgresultfile, avgresult)
def do_hmm(num_states=15, verbose=False, give_hmm=False):
    """
    verbose: output the debugging stress pattern matrix
    give_hmm: if True, return (hmm, sonnet) instead of just the sonnet
    """
    reversed_hmm = HMM.unsupervised_HMM(reversed_lines, n_states=num_states,
                                        N_iters=20, verbose=verbose)
    rhyming_words = preprocessor.get_rhyme_pairs(preprocessor.load_sonnets())
    rhyming_lines = []
    wanted_stress = [True, False, True, False, True,
                     False, True, False, True, False]
    for i in range(7):
        start1, start2 = "", ""
        while start1 not in token2index or start2 not in token2index:
            start1, start2 = random.choice(rhyming_words)
        start1 = token2index[start1]
        start2 = token2index[start2]
        line1 = reversed_hmm.generate_emission_syllables(
            10, syllable_dict, start1, stresses=stresses,
            desired_stresses=[x for x in wanted_stress])[0]
        line2 = reversed_hmm.generate_emission_syllables(
            10, syllable_dict, start2, stresses=stresses,
            desired_stresses=[x for x in wanted_stress])[0]
        if verbose:
            print(wanted_stress)
        rhyming_lines.append((" ".join([index2token[x] for x in line1[::-1]]),
                              " ".join([index2token[x] for x in line2[::-1]])))
    sonnet = "\n".join([
        upper_first(rhyming_lines[0][0]) + ",", rhyming_lines[1][0] + ".",
        upper_first(rhyming_lines[0][1]) + ",", rhyming_lines[1][1] + ".",
        upper_first(rhyming_lines[2][0]) + ",", rhyming_lines[3][0] + ".",
        upper_first(rhyming_lines[2][1]) + ",", rhyming_lines[3][1] + ".",
        upper_first(rhyming_lines[4][0]) + ",", rhyming_lines[5][0] + ".",
        upper_first(rhyming_lines[4][1]) + ",", rhyming_lines[5][1] + ".",
        upper_first(rhyming_lines[6][0]) + ",", rhyming_lines[6][1] + "."
    ])
    if give_hmm:
        return (reversed_hmm, sonnet)
    else:
        return sonnet
def train(num_states, is_reversed=False):
    text = open('data/shakespeare.txt').read()
    obs, vocab, inv_vocab = preprocess.get_observations(text)
    if is_reversed:
        for ob in obs:
            ob.reverse()
    filename = f"models/hmm{num_states}" + ('_rev' if is_reversed else '') + ".txt"
    hmm = HMM.unsupervised_HMM(obs, num_states, 100)
    hmm.save(filename)
    return hmm
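# Usage sketch: train a forward and a reversed model; the state count is an
# arbitrary choice, and data/shakespeare.txt is assumed to exist.
hmm_fwd = train(10)                    # saved as models/hmm10.txt
hmm_rev = train(10, is_reversed=True)  # saved as models/hmm10_rev.txt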
def extract(domain):
    # alex = alexa(domain)            # Alexa rank
    # seonum = seo(domain)            # number of SEO-indexed pages
    suff = suffix(domain)             # whether the suffix is a mainstream TLD
    # num = number(domain)            # count of digits in the domain
    length = len(domain)
    numratio = numberratio(domain)    # ratio of digits in the domain
    consnumber = consecutivenumber(domain)    # max run of consecutive digits
    conschar = consecutivechar(domain)        # max run of consecutive characters
    consamenum = consecutivesamechar(domain)  # max run of the same letter
    mvdlen = mvd(domain)
    entr = entropy(domain)
    hmm = HMM.HMM(domain)
    return (suff, length, numratio, consnumber, conschar, consamenum,
            mvdlen, entr, hmm)
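# Usage sketch: extract the feature tuple for a couple of example domains
# (assumes the helper functions such as suffix() and entropy() are defined
# in this module; the domains are arbitrary).
for d in ('example.com', 'x9k2q7.biz'):
    print(d, extract(d))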
def partition(self):
    '''
    Segment the sentence shown in the input window.
    '''
    sen = self.InputText.toPlainText()
    sen = sen.encode("utf8")
    res = HMM.partition(sen, self.InitiateProb, self.TransProbMatrix,
                        self.EmitProbMatrix)
    stri = ""
    for thing in res:
        stri += thing
        stri += " "
    # stri = stri.decode("utf8")
    stri = stri[0:len(stri) - 1]  # drop the trailing space
    self.OutputText.setText(stri)
    return None
def hmm_shakespeare_sonnet_goal2():
    # Load in everything
    sonnets, obs_map = sp.get_sonnets("data/shakespeare.txt", 2000)
    obs_map_r = {}
    for key in obs_map:
        obs_map_r[obs_map[key]] = key
    syl_map = sp.get_syllable_map("data/Syllable_dictionary.txt")
    rhymes = sp.get_rhymes("data/shakespeare.txt", True)
    # Train HMM
    model = HMM.unsupervised_HMM(sonnets, 5, 25)
    num_states = model.L
    # Print one blank line to make it pretty
    print("")
    # Generate quatrains
    for _ in range(2):
        while True:
            r = random.randint(0, len(rhymes) - 1)
            rhyme_pair_1 = rhymes[r]
            if rhyme_pair_1[0] in obs_map and rhyme_pair_1[1] in obs_map:
                break
        while True:
            r = random.randint(0, len(rhymes) - 1)
            rhyme_pair_2 = rhymes[r]
            if rhyme_pair_2[0] in obs_map and rhyme_pair_2[1] in obs_map:
                break
        print(make_rhyme_line(model, obs_map[rhyme_pair_1[0]], syl_map, obs_map_r))
        print(make_rhyme_line(model, obs_map[rhyme_pair_2[0]], syl_map, obs_map_r))
        print(make_rhyme_line(model, obs_map[rhyme_pair_1[1]], syl_map, obs_map_r))
        print(make_rhyme_line(model, obs_map[rhyme_pair_2[1]], syl_map, obs_map_r))
        print("")
    # Generate couplet
    while True:
        r = random.randint(0, len(rhymes) - 1)
        rhyme_pair = rhymes[r]
        if rhyme_pair[0] in obs_map and rhyme_pair[1] in obs_map:
            break
    print(make_rhyme_line(model, obs_map[rhyme_pair[0]], syl_map, obs_map_r))
    print(make_rhyme_line(model, obs_map[rhyme_pair[1]], syl_map, obs_map_r))
    print("")
def test_scaled_posterior_durbin_compatibility(self):
    """
    Test the missing observation model when no observation is missing.
    """
    # define the models
    standard_hmm = HMM.DishonestCasino()
    missing_hmm = DishonestCasino()
    # define a sequence of observations
    observations = [1, 2, 6, 6, 1, 2, 3, 4, 5, 6]
    # define the (degenerate) distances between observations
    distances = [1] * (len(observations) - 1)
    # get posterior distributions with the standard algorithm
    standard_distributions = standard_hmm.scaled_posterior_durbin(observations)
    # get posterior distributions using the degenerate distances
    missing_distributions = missing_hmm.scaled_posterior_durbin(
        observations, distances)
    # assert that the distributions are the same
    self.assertTrue(np.allclose(standard_distributions, missing_distributions))
from HMM import *

status = ['a', 'b']
observation = ['m', 'n']
trans_matrix = [[0.5, 0.5],
                [0.5, 0.5]]
initial_status = [0, 1]
observation_probability_distribution = [[0.3, 0.7], [0.3, 0.7]]
a = HMM(status, observation, trans_matrix, initial_status,
        observation_probability_distribution)
print(a.forward_algorithm(['m', 'n']))
print(a.backward_algorithm(['m', 'n']))
print(a.viterbi_method1(['m', 'n'], 1))
print(a.viterbi_method2(['m', 'n']))
print('\nTESTING todict()')
# todict()
fromtoken, totoken, dat = todict(tokens, tokenlist)
test(set(fromtoken.keys()) == set(['0', '1', '\n', '']),
     "todict() fromtoken has correct keys",
     "todict() fromtoken has incorrect keys", numTests)
test(all([totoken[fromtoken[i]] == i for i in fromtoken.keys()]) and
     all([fromtoken[totoken[i]] == i for i in totoken.keys()]),
     "todict() dicts match", "todict() dicts don't match", numTests)

# Unsupervised prediction using Viterbi
print("Deterministic viterbi prediction")
testHMM = HMM(4, fromtoken, totoken, k=k)
testHMM.learn(dat, tol=0.001)
predseq = testHMM.predict(max_iters=30)
print(testHMM.toktostr(predseq))

# random viterbi
print("Random Viterbi Test, should oscillate 0-1 (half the time this works)")
for i in range(5):
    teststr = testHMM.predict(rand=True, max_iters=30)
    print(testHMM.toktostr(teststr))

print('\nTESTING DUMMY2')
# Unsupervised prediction on new dummy file, should favor 1212 oscillations
s, l = parseFile('data/dummy2.txt')
fromt, tot, ll = todict(s, l)
dummy1 = HMM(4, fromt, tot, k=2)
dummy1.learn(ll)
#!/usr/bin/env python
# Generate transition/emission matrices for the various datasets:
# single Shakespeare, few Shakespeare, truncated Shakespeare, and full.
# The full dataset runs too slowly.
from HMM import *
import pickle

f = open('sspeare10.pkl', 'wb')
print("\nSingle Shakespeare")
s, l = parseFile('data/singlespeare.txt')
fromt, tot, ll = todict(s, l)
print("Num tokens: " + str(len(fromt)))
sspeare = HMM(len(fromt), fromt, tot, k=10)
sspeare.learn(ll, tol=0.1, prt=True)
lstwords = sspeare.gettops(numWords=10)
for l in lstwords:
    print(l)
pickle.dump(sspeare, f, -1)
f.close()

f = open('small10.pkl', 'wb')
print("\nSmall Shakespeare")
s, l = parseFile('data/smallspeare.txt')
fromt, tot, ll = todict(s, l)
print("Num tokens: " + str(len(fromt)))
smallspeare = HMM(len(fromt), fromt, tot, k=10)
smallspeare.learn(ll, tol=0.03, prt=True)
lstwords = smallspeare.gettops(numWords=10)
for l in lstwords:
    print(l)
pickle.dump(smallspeare, f, -1)
f.close()
from HMM import *

status = ['a', 'b']
observation = ['m', 'n']
trans_matrix = [[0.5, 0.5],
                [0.5, 0.5]]
initial_status = [0.5, 0.5]
observation_probability_distribution = [[1, 0], [0, 1]]
a = HMM(status, observation, trans_matrix, initial_status,
        observation_probability_distribution)
print(a.forward_algorithm(['m', 'm']))
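# Worked check (assumption: forward_algorithm returns the scalar likelihood of
# the observation sequence). With deterministic emissions, state 'a' always
# emits 'm', so the only path for ['m', 'm'] is a -> a:
# 0.5 (initial) * 1 (emit) * 0.5 (transition) * 1 (emit) = 0.25.
print(abs(a.forward_algorithm(['m', 'm']) - 0.25) < 1e-9)  # expect True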
import sys, HMM, math

DEBUG = False

#===============================================
# Script
#===============================================
DEV_FILE_NAME = sys.argv[1]
TRANS_FILE_NAME = sys.argv[2]
EMIT_FILE_NAME = sys.argv[3]
PRIOR_FILE_NAME = sys.argv[4]

HMM = HMM.HiddenMarkovModel()
HMM.initHMM(TRANS_FILE_NAME, EMIT_FILE_NAME, PRIOR_FILE_NAME)

# for debug
if DEBUG:
    print('prior', HMM.hmmPrior)
    print('trans', HMM.hmmTrans)
    print('emit', HMM.hmmEmit)
    print('states', HMM.getStates())
    print('observables', HMM.getObservables())

# actual output
delim = ' '
with open(DEV_FILE_NAME) as FID:
    for line in FID:
        vObserved = line.strip().split(delim)
        print(math.log(HMM.backwardAlg(vObserved)))
print "complete!" #-----------------------------------------------------------------------END - Load the training set #-----------------------------------------------------------------------START - Load the testing set sys.stdout.write("Loading testing set...") testingSentences = getWSJDirectories(6, wsjLocation, 7) print "complete!" #-----------------------------------------------------------------------END - Load the testing set #-----------------------------------------------------------------------START - Extract the lexicon and tags sys.stdout.write("Extracting lexicon and tags...") lexicon, tags = extractLexicon_and_Tags(trainingSentences) print "complete!" #-----------------------------------------------------------------------END - Extract the lexicon and tags model = HMM(tags) #-----------------------------------------------------------------------START - Train the model sys.stdout.write("Training model...") model.train(trainingSentences) print "complete!" #-----------------------------------------------------------------------END - Train the model #-----------------------------------------------------------------------START - Evaluate the model on the training set sys.stdout.write("Evaluating on training set...") print "{0:.2f}% accuracy".format(model.evaluate(trainingSentences[:100])*100.0) #-----------------------------------------------------------------------END - Evaluate the model on the training set #-----------------------------------------------------------------------START - Evaluate the model on the testing set sys.stdout.write("Evaluating on testing set...") print "{0:.2f}% accuracy".format(model.evaluate(testingSentences[:100])*100.0)
# place training data in a list of sentence lists
trainingData = list()  # list of lists
vocabulary = set()     # vocabulary contains unique words
with open(TRAIN_FILE_NAME) as FID:
    for line in FID:
        data = line.strip().split(' ')
        trainingData.append(data)
        vocabulary.update(set(data))

# random prob assignment, or use provided files
if len(sys.argv) == 5:
    # initialization files
    TRANS_FILE_NAME = sys.argv[2]
    EMIT_FILE_NAME = sys.argv[3]
    PRIOR_FILE_NAME = sys.argv[4]
    HMM.initHMM(TRANS_FILE_NAME, EMIT_FILE_NAME, PRIOR_FILE_NAME)
else:
    # init topology with standard files
    HMM.initHMMRand(STATES, vocabulary)

# for debug
if DEBUG:
    print('Before training')
    print('prior', HMM.hmmPrior)
    print('trans', HMM.hmmTrans)
    print('emit', HMM.hmmEmit)
    print('states', HMM.getStates())
    print('observables', HMM.getObservables())

avgLL = HMM.baumWelchAlg(trainingData, True)
import HMM
import randomHmm as r
import avgll as a
import checker

hmm = HMM.getHMM()
data = [line.strip() for line in open("../data/trainnew.txt")][0]


def getXi(data, hmm):
    alpha = a.forward(data, hmm)
    beta = a.backward(data, hmm)
    likelihood = beta[0][0]
    # xi[t][i][j]: posterior probability of a transition from state i to
    # state j at step t
    xi = [[[[0, 0], [0, 0]], [[0, 0], [0, 0]]]]
    for t in range(len(data) - 1):
        xi.append([[[0, 0], [0, 0]], [[0, 0], [0, 0]]])
    data = "#" + data  # prepend the start sentinel
    # the original looped over an unused variable h and incremented t by
    # hand; a single loop over t is equivalent
    for t in range(len(data) - 1):
        for i in range(0, 2):
            for j in range(0, 2):
                ptrans = a.getTransitionProbability(i, j, hmm)
                pemit = a.getEmissionProbability(j, data[t + 1], hmm)
                temp = a.multiplyProbability(alpha[t][i], ptrans)
                temp = a.multiplyProbability(temp, pemit)
                temp = a.multiplyProbability(temp, beta[t + 1][j])
                xi[t][i][j] = a.divideProbability(temp, likelihood)
    return xi
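# Usage sketch: xi[t][i][j] estimates the posterior probability of moving
# from state i to state j between positions t and t+1 of the training string,
# using the module-level data and hmm loaded above.
xi = getXi(data, hmm)
print(xi[0])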