def run_hmm(labels, observations, axis_divs, angle_divs, smooth_iterations,
            smooth_radius, validation_runs):
    train_runs = 6000 - validation_runs
    pd = PointDiscretizer(axis_divs)
    ad = AngleDiscretizer(angle_divs)

    print("Loading HMM Data...", end='')
    start_time = time.time()
    start_probabilities, transition_probabilities, emission_probabilities = load_hmm_data(
        pd, ad, labels, observations, smooth_iterations, smooth_radius, train_runs)
    print("Done ({:.3f} secs)".format(time.time() - start_time))

    model = Hmm(start_probabilities, transition_probabilities, emission_probabilities)

    # Grade the model on the validation runs.
    # Grading scheme: mean squared error (lower is better).
    print("Grading Model...", end='')
    start_time = time.time()
    hidden_states = pd.states()
    errors = []
    for validation_run in range(train_runs, 6000):
        run_labels = filter(lambda l: l[0] == validation_run, labels)
        observation_list = [
            ad.discretize(angle) for angle in observations[validation_run]
        ]
        hidden_state_list = model.viterbi(observation_list, hidden_states)
        for label in run_labels:
            label_timestep = label[1]
            label_state = (label[2], label[3])
            guessed_state = pd.un_discretize(
                hidden_state_list[label_timestep][0],
                hidden_state_list[label_timestep][1])
            # Distance between the guessed and labeled positions; euclidean()
            # is assumed to accept the concatenated (gx, gy, lx, ly) tuple.
            errors.append(euclidean(guessed_state + label_state))

    # Mean squared error over all labeled validation timesteps.
    MSE = statistics.mean(map(lambda x: x * x, errors))
    print("Done! ({:.3f}s)".format(time.time() - start_time))
    print("MSE of {:.3f} from AxDivs: {} AngDivs: {} SmIts: {} SmR: {} ValSet: {}"
          .format(MSE, axis_divs, angle_divs, smooth_iterations, smooth_radius,
                  validation_runs))
    return MSE
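# Hypothetical usage sketch for run_hmm: a small grid search over its
# discretization and smoothing hyperparameters, keeping the lowest-MSE
# configuration. The grid values and the grid_search name are illustrative
# assumptions, not part of the original module; `labels` and `observations`
# are assumed to be loaded elsewhere.
def grid_search(labels, observations):
    best_mse, best_params = float('inf'), None
    for axis_divs in (10, 20):
        for angle_divs in (8, 16):
            for smooth_iterations, smooth_radius in ((1, 1), (2, 2)):
                mse = run_hmm(labels, observations, axis_divs, angle_divs,
                              smooth_iterations, smooth_radius,
                              validation_runs=600)
                if mse < best_mse:
                    best_mse = mse
                    best_params = (axis_divs, angle_divs,
                                   smooth_iterations, smooth_radius)
    return best_mse, best_params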
def compare_recursions():
    # Compare the two versions in terms of efficiency.
    start_time = time.time()
    hmm = Hmm('rescaled')
    hmm.compute_proba()
    hmm.EM()
    print("rescaled version ran in %.3f seconds" % (time.time() - start_time))

    start_time = time.time()
    hmm2 = Hmm('log-scale')
    hmm2.compute_proba()
    hmm2.EM()
    print("log-scale version ran in %.3f seconds" % (time.time() - start_time))

    print('difference cond_proba:',
          np.min(hmm.cond_proba - hmm2.cond_proba),
          np.max(hmm.cond_proba - hmm2.cond_proba))
    print('difference joined_cond_proba:',
          np.min(hmm.joined_cond_proba - hmm2.joined_cond_proba),
          np.max(hmm.joined_cond_proba - hmm2.joined_cond_proba))
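# For reference, a minimal sketch of the two recursions being timed above.
# This is NOT the Hmm class's actual implementation: it assumes a discrete
# HMM with start probabilities pi (K,), transition matrix A (K, K), and
# emission matrix B (K, N), all strictly positive. The "rescaled" version
# normalizes alpha at every step to avoid underflow; the "log-scale"
# version works in log space via log-sum-exp.
import numpy as np
from scipy.special import logsumexp

def forward_rescaled(pi, A, B, obs):
    alpha = pi * B[:, obs[0]]
    c = alpha.sum()
    log_lik, alpha = np.log(c), alpha / c
    for o in obs[1:]:
        alpha = (alpha @ A) * B[:, o]   # alpha_t(j) = sum_i alpha_{t-1}(i) A[i,j] B[j,o]
        c = alpha.sum()
        log_lik, alpha = log_lik + np.log(c), alpha / c
    return log_lik                      # log P(o_1 .. o_T)

def forward_log(pi, A, B, obs):
    log_alpha = np.log(pi) + np.log(B[:, obs[0]])
    for o in obs[1:]:
        log_alpha = logsumexp(log_alpha[:, None] + np.log(A), axis=0) + np.log(B[:, o])
    return logsumexp(log_alpha)         # log P(o_1 .. o_T)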
def cross_validate(args):
    # Run cross validation with the config from args.
    if args.m.lower() in ['lstmhmm', 'lstm']:
        # Use modified HMM
        truthHmm = LstmHmm()
        bluffHmm = LstmHmm()
    else:
        # Use normal HMM
        truthHmm = Hmm()
        bluffHmm = Hmm()
    truthHmm.read_sequences(args.i + '/truthers')
    bluffHmm.read_sequences(args.i + '/bluffers')
    if len(truthHmm.X_mat_train) == 0 or len(bluffHmm.X_mat_train) == 0:
        raise IOError('No data found, make sure {} contains truthers/bluffers folders'
                      .format(args.i))

    # Split the sequences into args.n folds, first for truth and then for bluff.
    np.random.seed(args.seed)
    kf = KFold(n_splits=args.n)

    X = truthHmm.X_mat_train
    truthSets = []
    for train, test in kf.split(X):
        trainSet = [X[i] for i in train]
        testSet = [X[i] for i in test]
        truthSets.append([trainSet, testSet])

    X = bluffHmm.X_mat_train
    bluffSets = []
    for train, test in kf.split(X):
        trainSet = [X[i] for i in train]
        testSet = [X[i] for i in test]
        bluffSets.append([trainSet, testSet])

    # Folder to put the weight files in for later analysis.
    result_folder = str(time.time()).replace('.', '')
    try:
        os.mkdir('results')
    except OSError:
        # Already exists
        pass
    os.mkdir('results/' + result_folder)

    # Set up the arguments: one work item per fold.
    func_args = []
    for i in range(len(truthSets)):
        func_args.append([args, truthSets[i], bluffSets[i], i + 1, result_folder])

    # Run all folds in parallel.
    p = Pool(args.n)
    try:
        results = p.map(train_test, func_args)
    except (KeyboardInterrupt, Exception):
        logging.error('An error occurred in Pool.')
        p.terminate()
        p.join()
        sys.exit(1)
    finally:
        p.close()

    # Write results to a CSV for later graphing/analysis.
    with open('results.csv', 'a+') as f:
        writer = csv.writer(f)
        # Calculate averages across the folds.
        avg_truth_score = 0.0
        avg_bluff_score = 0.0
        avg_accuracy = 0.0
        total_correct = 0
        total_tested = 0
        avg_t_correct = 0.0
        avg_b_correct = 0.0
        for result in results:
            correct, test_size, truth_score, bluff_score, t_correct, b_correct = result
            avg_truth_score += truth_score
            avg_bluff_score += bluff_score
            avg_accuracy += float(correct) / test_size
            total_correct += correct
            total_tested += test_size
            avg_t_correct += t_correct
            avg_b_correct += b_correct
        avg_accuracy /= args.n
        avg_truth_score /= args.n
        avg_bluff_score /= args.n
        avg_t_correct /= args.n
        avg_b_correct /= args.n
        # Writes the result to CSV as:
        # [Time, k, d, n_init, n_iter, seed, n_folds, total_correct, out_of, percent_correct,
        #  train_score_T, train_score_B, avg_correct_T, avg_correct_B, model, infolder, result_folder]
        writer.writerow([time.ctime(), args.k, '5', args.n_init, args.n_iter, args.seed,
                         args.n, total_correct, total_tested, avg_accuracy * 100,
                         avg_truth_score, avg_bluff_score, avg_t_correct * 100,
                         avg_b_correct * 100, args.m, args.i, result_folder])
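# The command-line interface implied by the attribute accesses above
# (args.m, args.i, args.n, args.k, args.d, args.n_init, args.n_iter,
# args.seed). The help strings and defaults here are assumptions for
# illustration; only the flag names come from the code.
import argparse

def build_parser():
    parser = argparse.ArgumentParser(description='Cross-validate truth/bluff HMMs')
    parser.add_argument('-i', help='input folder containing truthers/ and bluffers/')
    parser.add_argument('-m', default='hmm', help='model type: hmm or lstmhmm')
    parser.add_argument('-n', type=int, default=5, help='number of CV folds')
    parser.add_argument('-k', type=int, default=3, help='number of hidden states')
    parser.add_argument('-d', help='cluster definition (see the TODO in train_test)')
    parser.add_argument('--n_init', type=int, default=5, help='random restarts per fold')
    parser.add_argument('--n_iter', type=int, default=100, help='EM iterations per restart')
    parser.add_argument('--seed', type=int, default=0, help='RNG seed')
    return parser

# Example wiring: args = build_parser().parse_args(); cross_validate(args)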
def train_test(args):
    # Parameters:
    #   args[0] is normal args
    #   args[1] is [truthTrainSequences, truthTestSequences]
    #   args[2] is [bluffTrainSequences, bluffTestSequences]
    #   args[3] is the fold number
    #   args[4] is the folder name to dump the weights into
    try:
        n_init = args[0].n_init  # Random initializations to try
        n_iter = args[0].n_iter  # Iterations in each initialization
        k = args[0].k            # Hidden states
        # TODO: Don't hardcode 5, parse from cluster-def (args.d) or add an arg
        d = 5                    # Outputs (number of clusters used)
        if args[0].m.lower() in ['lstmhmm', 'lstm']:
            truthHmm = LstmHmm()
            bluffHmm = LstmHmm()
        else:
            truthHmm = Hmm()
            bluffHmm = Hmm()

        # Assign the train/test sequences for this fold.
        # See hmm.Hmm.load_test_sequences for an explanation of the
        # 'wrap_interviews' parameter.
        truthHmm.load_train_sequences(args[1][0])
        truthHmm.load_test_sequences(args[1][1], wrap_interviews=True)
        bluffHmm.load_train_sequences(args[2][0])
        bluffHmm.load_test_sequences(args[2][1], wrap_interviews=True)
        testSize = len(truthHmm.X_mat_test) + len(bluffHmm.X_mat_test)

        logging.info('# Truth Training Sequences: {0}\n# Bluff Training Sequences: {1}'.format(
            len(truthHmm.X_mat_train), len(bluffHmm.X_mat_train)))
        logging.info('k = {0}, d = {1}, n_init = {2}, n_iter = {3}, testSize = {4}'.format(
            k, d, n_init, n_iter, testSize))

        logging.info('Beginning training on Truth-Tellers....')
        bestScore = -np.inf
        # Run em_train for Truth-Tellers multiple times, keeping the best-scoring model.
        for i in range(n_init):
            truthHmm.initialize_weights(k, d)
            truthHmm.em_train_v(n_iter)
            score = truthHmm.p_X_mat(truthHmm.X_mat_train)
            if score > bestScore:
                bestScore = score
                bestWeights = truthHmm.P_k, truthHmm.T_kk, truthHmm.E_kd
            truthHmm.print_percents()
            logging.info('Trained truthHmm #%d Score = %f', i + 1, score)
        # Rebuild the best truthHmm.
        truthHmm.P_k, truthHmm.T_kk, truthHmm.E_kd = bestWeights
        logging.info('Best Trained Truth-Tellers HMM:')
        truthHmm.print_percents()

        logging.info('Beginning training on Bluffers....')
        bestScore = -np.inf  # Reset for bluffers
        # Run em_train for Bluffers multiple times, keeping the best-scoring model.
        for i in range(n_init):
            bluffHmm.initialize_weights(k, d)
            bluffHmm.em_train_v(n_iter)
            score = bluffHmm.p_X_mat(bluffHmm.X_mat_train)
            if score > bestScore:
                bestScore = score
                bestWeights = bluffHmm.P_k, bluffHmm.T_kk, bluffHmm.E_kd
            bluffHmm.print_percents()
            logging.info('Trained bluffHmm #%d Score = %f', i + 1, score)
        # Rebuild the best bluffHmm.
        bluffHmm.P_k, bluffHmm.T_kk, bluffHmm.E_kd = bestWeights

        print('\nBest Trained Truth-Tellers HMM:')
        truthHmm.print_percents()
        print('\nBest Trained Liars HMM:')
        bluffHmm.print_percents()

        # Evaluate on testing sequences.
        correct = 0                  # total classified correctly
        t_correct, b_correct = 0, 0
        # Each X in hmm.X_mat_test is a list with one sequence per segment of the
        # interview (segments are split at low-confidence periods), so the segments
        # are evaluated together and each interview is weighted equally.
        for X_interview in truthHmm.X_mat_test:
            if truthHmm.p_X_mat(X_interview) > bluffHmm.p_X_mat(X_interview):
                correct += 1
                t_correct += 1
        for X_interview in bluffHmm.X_mat_test:
            if bluffHmm.p_X_mat(X_interview) > truthHmm.p_X_mat(X_interview):
                correct += 1
                b_correct += 1
        print('Out of {0} test cases, {1} were correctly classified.'.format(
            testSize, correct))

        # Training scores.
        truthScore = truthHmm.p_X_mat(truthHmm.X_mat_train)
        bluffScore = bluffHmm.p_X_mat(bluffHmm.X_mat_train)

        # Write weight files for later usage.
        truthHmm.write_weight_file('results/{}/truthers_fold_{}.weights'.format(args[4], args[3]))
        bluffHmm.write_weight_file('results/{}/bluffers_fold_{}.weights'.format(args[4], args[3]))

        # Write results of this fold and human-readable percents.
        with open('results/{}/results_fold_{}.txt'.format(args[4], args[3]), 'w+') as f:
            out = 'Out of {0} test cases, {1} were correctly classified'.format(
                testSize, correct)
            out += '\nt_correct = {}\nb_correct = {}\ntrain_score_T = {}\ntrain_score_B = {}\n\n'.format(
                t_correct, b_correct, truthScore, bluffScore)
            out += '\n\nTruth HMM:\n'
            out += truthHmm.get_percents()
            out += '\n\nBluff HMM:\n'
            out += bluffHmm.get_percents()
            f.write(out)

        # Convert to percents for later averaging.
        t_correct /= float(len(truthHmm.X_mat_test))
        b_correct /= float(len(bluffHmm.X_mat_test))

        # Return the counts and scores to be averaged and written to the CSV.
        return correct, testSize, truthScore, bluffScore, t_correct, b_correct
    except KeyboardInterrupt:
        return 'KeyboardInterrupt'
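# The decision rule used in the evaluation loops above, factored out as a
# sketch: an interview is classified by whichever HMM assigns it the higher
# likelihood. classify_interview is a hypothetical helper, not part of the
# original module.
def classify_interview(truthHmm, bluffHmm, X_interview):
    truth_ll = truthHmm.p_X_mat(X_interview)
    bluff_ll = bluffHmm.p_X_mat(X_interview)
    return 'truth' if truth_ll > bluff_ll else 'bluff'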
from hmm import Hmm
import time

label_map_file = '../dataset/phones/48_39.map'
chr_map_file = '../dataset/48_idx_chr.map_b'
label_file = '../dataset/label/train.lab'
post_file = '../dataset/posteriorgram/test6.post'  # '../dataset/posteriorgram/train2.post'

Hmm.loadgetprob(label_map_file, chr_map_file, label_file, post_file)  # 'test_y_.txt'

# Decode and save results with the setting parameter swept over 0.57, 0.62, 0.67.
setting = 0.57
for _ in range(3):
    n, p, path = Hmm.viterbi(setting)
    # count, counttotal = Hmm.check(path)
    # print(str(count) + '/' + str(counttotal))
    Hmm.save_result(path, setting)
    setting += 0.05
csv.get_all_tweets("UniofOxford")
csv.get_all_tweets("Cambridge_Uni")
"""

##############################################################################################
print("CLEANING TWEETS")
csv.cleanCsv()
csv.perturbate_tweets()
esteem.transiction()
clean_tweets = open('csv/lp_tweets.csv')
perturbed_tweets = open('csv/perturbation_tweets.csv')
esteem.observations_p(clean_tweets, perturbed_tweets)

##############################################################################################
print("GENERATING THE HMM MODEL")
hmm = Hmm(esteem.transition_p, esteem.obs_matrix, esteem.pigreco, esteem.final_p)
hmm.create_hmm(csv.error_list)
ui.setupUi(Form, hmm)

print("################################################################")
print("DIFFERENCE BETWEEN ORIGINALS")
clean_tweets = open('csv/lp_tweets.csv')
perturbed_tweets = open('csv/perturbation_tweets.csv')
prediction_capabilities.calculate_capabilities(clean_tweets, perturbed_tweets, ui)

print("################################################################")
print("FINAL DIFFERENCE")
clean_tweets = open('csv/lp_tweets.csv')
output_tweets = open('csv/output_tweets.csv')
seq2 = ('B', 'C', 'C', 'B', 'D', 'D', 'C', 'A', 'C', 'S')
seq3 = ('A', 'C', 'D', 'S')
seq4 = ('A', 'D', 'A', 'C', 'S')
seq5 = ('D', 'B', 'B', 'S')
seq6 = ('A', 'B', 'S')
seq7 = ('D', 'D', 'B', 'D', 'D', 'B', 'A', 'C', 'C', 'D', 'A', 'B', 'B', 'C',
        'D', 'B', 'B', 'B', 'S')
seq8 = ('D', 'B', 'D', 'S')
seq9 = ('A', 'A', 'A', 'A', 'D', 'C', 'B', 'S')
observations = [seq0, seq1, seq2, seq3, seq4, seq5, seq6, seq7, seq8, seq9]
# observations = [('A','C','D','D','C','C','S'), ('A','D','S')]

if __name__ == '__main__':
    model_file1 = "model1.json"
    hmm1 = Hmm(os.path.join(model_file1))
    model_file2 = "model2.json"
    hmm2 = Hmm(os.path.join(model_file2))

    print("Machine 1 generated samples:")
    hmm1.generator()

    print("\nFORWARD ALGORITHM:")
    for obs in observations:
        p1 = hmm1.forward(obs)
        p2 = hmm2.forward(obs)
        # print("Observations = ", obs, " Fwd Prob (Machine 1) = ", p1,
        #       ", Fwd Prob (Machine 2) = ", p2,
        #       ", Fwd Prob log (Machine 1) = ", (math.log(p1) if p1 != 0 else "NA"),
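# For comparison, a textbook forward pass over a discrete-symbol HMM like
# the ones loaded from model1.json / model2.json above. This is a sketch of
# the standard algorithm, not necessarily Hmm.forward's implementation;
# `states`, `start`, `trans`, and `emit` are hypothetical dict-based
# parameters.
def forward_reference(states, start, trans, emit, obs):
    # alpha[s] = P(o_1 .. o_t, state_t = s)
    alpha = {s: start[s] * emit[s].get(obs[0], 0.0) for s in states}
    for o in obs[1:]:
        alpha = {j: sum(alpha[i] * trans[i][j] for i in states) * emit[j].get(o, 0.0)
                 for j in states}
    return sum(alpha.values())  # P(o_1 .. o_T)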
# Q1
print('\n****************** Q1 ******************')
compare_recursions()

# Q2
print('\n****************** Q2 ******************')
hmm = Hmm('rescaled')
hmm2 = Hmm('log-scale')
hmm.compute_proba(hmm.test_data)
hmm.plot_proba(100, 'conditional proba with initial parameters on test', 'q02', '1')

# Q3 & Q4
print('\n****************** Q4 ******************')
hmm.EM(True)
hmm.print_parameters()

# Q5
print('\n****************** Q5 ******************')
hmm.plot_likelihood('q05', '1')

# Q6
from hmm import Hmm
import pandas as pd

# Unlabeled entity data ("协同_未标记实体" ≈ "collaboration, unlabeled entities"):
# column 0 holds the label (标签), column 1 the entities (实体), and
# column 2 the body text (正文).
data = pd.read_csv("data/协同_未标记实体.csv")

# Train the HMM tagger on the annotated corpus.
model = Hmm(char2idx_path="dicts/char2idx.json",
            tag2idx_path="dicts/tag2idx.json")
model.fit("corpus/train_data.txt")

# Tag each row's body text and collect the results.
rows = []
for i in range(data.shape[0]):
    raw = data.iloc[i, 2].strip()
    tmp = model.usage(raw)
    label = data.iloc[i, 0]
    ents = data.iloc[i, 1]
    rows.append({"标签": label, "实体": ents, "正文": tmp})

# DataFrame.append was removed in pandas 2.0, so build the frame in one go.
res = pd.DataFrame(rows, columns=["标签", "实体", "正文"])
res.to_csv("data/协同_已标记实体初版.csv", encoding="utf_8_sig", index=False)
from hmm import Hmm

# Train the HMM on the epidemic "other" corpus
# ("hmm_疫情其他数据集_20000" ≈ "epidemic other dataset, 20000 lines").
model = Hmm(char2idx_path="dicts/otherchar2idx.json",
            tag2idx_path="dicts/othertag2idx.json")
model.fit("corpus/hmm_疫情其他数据集_20000.txt")

# Extract entities from the sentence-split epidemic data
# ("疫情分句后的数据" ≈ "epidemic data after sentence splitting"),
# de-duplicating as we go, and append them to the results file
# ("hmm其他抽取结果" ≈ "HMM other extraction results").
ents = []
with open("corpus/疫情分句后的数据.txt", encoding='utf-8') as f, \
     open("hmm其他抽取结果.txt", "a", encoding="utf-8") as w:
    for line in f:
        line = line.strip()
        if len(line) <= 1:
            continue
        tmp = model.yq_usage(line)
        for t in tmp:
            if t in ents:
                continue
            ents.append(t)
            w.write(t + '\n')
            print(t)