Example No. 1
def run_hmm(labels, observations, axis_divs, angle_divs, smooth_iterations,
            smooth_radius, validation_runs):
    train_runs = 6000 - validation_runs  # the dataset contains 6000 runs in total

    pd = PointDiscretizer(axis_divs)
    ad = AngleDiscretizer(angle_divs)

    print("Loading HMM Data...", end='')
    start_time = time.time()
    start_probabilities, transition_probabilities, emission_probabilities = load_hmm_data(
        pd, ad, labels, observations, smooth_iterations, smooth_radius,
        train_runs)
    print("Done ({:.3f} secs)".format(time.time() - start_time))

    model = Hmm(start_probabilities, transition_probabilities,
                emission_probabilities)

    # grade the model on the validation runs
    # grading scheme: mean squared error (lower is better)
    print("Grading Model...", end='')
    start_time = time.time()
    hidden_states = pd.states()
    errors = []
    for validation_run in range(train_runs, 6000):
        run_labels = filter(lambda l: l[0] == validation_run, labels)
        observation_list = [
            ad.discretize(angle) for angle in observations[validation_run]
        ]
        hidden_state_list = model.viterbi(observation_list, hidden_states)
        for label in run_labels:
            label_timestep = label[1]
            label_state = (
                label[2],
                label[3],
            )
            guessed_state = pd.un_discretize(
                hidden_state_list[label_timestep][0],
                hidden_state_list[label_timestep][1])
            # guessed_state + label_state concatenates the two (x, y) tuples
            # into the single 4-tuple argument euclidean() takes here
            errors.append(euclidean(guessed_state + label_state))
    # get Mean Squared Error
    MSE = statistics.mean(map(lambda x: x * x, errors))
    print("Done! ({}s)".format(time.time() - start_time))
    print(
        "MSE of {:.3f} from AxDivs: {} AngDivs: {} SmIts: {} SmR: {} ValSet: {}"
        .format(MSE, axis_divs, angle_divs, smooth_iterations, smooth_radius,
                validation_runs))
    return MSE
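
Since run_hmm returns the validation MSE, it drops straight into a simple grid
search. A minimal sketch (not part of the original source; it assumes labels
and observations are already loaded in the format run_hmm expects, and the
hyperparameter values are purely illustrative):

best = None
for axis_divs in (10, 20, 40):
    for angle_divs in (8, 16):
        mse = run_hmm(labels, observations, axis_divs, angle_divs,
                      smooth_iterations=2, smooth_radius=1,
                      validation_runs=1000)
        if best is None or mse < best[0]:
            best = (mse, axis_divs, angle_divs)

print("Best MSE {:.3f} at axis_divs={}, angle_divs={}".format(*best))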
Example No. 2
def compare_recursions():
    # Compare the two versions in terms of efficiency
    start_time = time.time()
    hmm = Hmm('rescaled')
    hmm.compute_proba()
    hmm.EM()
    print("rescaled version run in %.6s seconds " % (time.time() - start_time))

    start_time = time.time()
    hmm2 = Hmm('log-scale')
    hmm2.compute_proba()
    hmm2.EM()
    print("log-scale version run in %.6s seconds " %
          (time.time() - start_time))

    print('difference cond_proba:',
          np.min(hmm.cond_proba - hmm2.cond_proba),
          np.max(hmm.cond_proba - hmm2.cond_proba))
    print('difference joined_cond_proba:',
          np.min(hmm.joined_cond_proba - hmm2.joined_cond_proba),
          np.max(hmm.joined_cond_proba - hmm2.joined_cond_proba))
    return
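
The two variants guard against numerical underflow differently: the rescaled
version normalizes the forward/backward variables at every time step and
accumulates the logs of the scaling constants, while the log-scale version
keeps everything in log space and replaces sums with log-sum-exp. A standalone
numpy sketch of the two forward recursions (illustrative only, not the Hmm
class API; pi, A, B stand for the start, transition, and emission matrices):

import numpy as np
from scipy.special import logsumexp

def forward_rescaled(pi, A, B, obs):
    # Normalize alpha at each step; the log-likelihood is the sum of the
    # log scaling constants.
    alpha = pi * B[:, obs[0]]
    c = alpha.sum()
    log_lik = np.log(c)
    alpha = alpha / c
    for o in obs[1:]:
        alpha = (alpha @ A) * B[:, o]
        c = alpha.sum()
        log_lik += np.log(c)
        alpha = alpha / c
    return log_lik

def forward_log(pi, A, B, obs):
    # Stay in log space throughout; log-sum-exp replaces each sum over states.
    log_alpha = np.log(pi) + np.log(B[:, obs[0]])
    for o in obs[1:]:
        log_alpha = logsumexp(log_alpha[:, None] + np.log(A), axis=0) + np.log(B[:, o])
    return logsumexp(log_alpha)

pi = np.array([0.6, 0.4])
A = np.array([[0.7, 0.3], [0.4, 0.6]])
B = np.array([[0.5, 0.5], [0.1, 0.9]])
obs = [0, 1, 1, 0]
# Both recursions should agree to floating-point precision, which is what the
# min/max difference printouts above are probing.
print(forward_rescaled(pi, A, B, obs), forward_log(pi, A, B, obs))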
Example No. 3
def cross_validate(args):
    # Run cross validation with config from args
    
    if args.m.lower() in ['lstmhmm', 'lstm']:  # Use modified HMM
        truthHmm = LstmHmm()
        bluffHmm = LstmHmm()
    else:  # Use normal HMM
        truthHmm = Hmm()
        bluffHmm = Hmm()

    truthHmm.read_sequences(args.i + '/truthers')
    bluffHmm.read_sequences(args.i + '/bluffers')

    if len(truthHmm.X_mat_train) == 0 or len(bluffHmm.X_mat_train) == 0:
        raise IOError('No data found, make sure {} contains truthers/bluffers folders'
                      .format(args.i))

    # Split the sequences into args.n folds for truth and then bluff
    np.random.seed(args.seed)
    kf = KFold(n_splits=args.n)
    
    X = truthHmm.X_mat_train
    truthSets = []
    for train, test in kf.split(X):
        trainSet = [X[i] for i in train]
        testSet = [X[i] for i in test]
        truthSets.append([trainSet, testSet])

    X = bluffHmm.X_mat_train
    bluffSets = []
    for train, test in kf.split(X):
        trainSet = [X[i] for i in train]
        testSet = [X[i] for i in test]
        bluffSets.append([trainSet, testSet])
    
    # Folder to put the weight files in for later analysis
    result_folder = str(time.time()).replace('.', '')
    try:
        os.mkdir('results')
    except OSError:  # Already exists
        pass
    os.mkdir('results/' + result_folder)

    # Set up the arguments
    func_args = []
    for i in range(len(truthSets)):
        func_args.append([args, truthSets[i], bluffSets[i], i + 1, result_folder])

    # Run them all in parallel
    p = Pool(args.n)
    try:
        results = p.map(train_test, func_args)  # Run folds in parallel
    except (KeyboardInterrupt, Exception):
        logging.error('An error occurred in Pool.')
        p.terminate()
        p.join()
        sys.exit(0)
    finally:
        p.close()
    
    # Write results to a csv for later graphing/analysis
    with open('results.csv', 'a+') as f:
        writer = csv.writer(f)

        # Calculate averages across the folds
        avg_truth_score = 0.0
        avg_bluff_score = 0.0
        avg_accuracy = 0.0
        total_correct = 0
        total_tested = 0
        avg_t_correct = 0.0
        avg_b_correct = 0.0

        for result in results:
            correct, test_size, truth_score, bluff_score, t_correct, b_correct = result
            avg_truth_score += truth_score
            avg_bluff_score += bluff_score
            avg_accuracy += float(correct) / test_size
            total_correct += correct
            total_tested += test_size
            avg_t_correct += t_correct
            avg_b_correct += b_correct

        avg_accuracy /= args.n
        avg_truth_score /= args.n
        avg_bluff_score /= args.n
        avg_t_correct /= args.n
        avg_b_correct /= args.n

        # Writes result to CSV as:
        # [Time, k, d, n_init, n_iter, seed, n_folds, total_correct, out_of, percent_correct,
        #  train_score_T, train_score_B, avg_correct_T, avg_correct_B, model, infolder, result_folder]
        writer.writerow([time.ctime(), args.k, '5', args.n_init, args.n_iter, args.seed,
                         args.n, total_correct, total_tested, avg_accuracy * 100,
                         avg_truth_score, avg_bluff_score, avg_t_correct * 100, avg_b_correct * 100,
                         args.m, args.i, result_folder])
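
cross_validate reads the fields m, i, n, seed, k, n_init, and n_iter off the
args namespace. A hedged sketch of a matching argparse front end follows; the
flag names, defaults, and help strings are inferred from the attribute
accesses above, not taken from the original CLI:

import argparse

# Hypothetical CLI; the real project's flags and defaults may differ.
parser = argparse.ArgumentParser(
    description='K-fold cross-validation of truth/bluff HMMs')
parser.add_argument('-i', help='input folder containing truthers/ and bluffers/')
parser.add_argument('-m', default='hmm', help="model type: 'hmm' or 'lstmhmm'")
parser.add_argument('-n', type=int, default=5, help='number of folds')
parser.add_argument('-k', type=int, default=4, help='number of hidden states')
parser.add_argument('--seed', type=int, default=0, help='numpy random seed')
parser.add_argument('--n_init', type=int, default=3, help='random restarts per fold')
parser.add_argument('--n_iter', type=int, default=50, help='EM iterations per restart')

args = parser.parse_args()
cross_validate(args)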
Example No. 4
def train_test(args):
    #  Parameters  #
    # args[0] is normal args
    # args[1] is [truthTrainSequences, truthTestSequences]
    # args[2] is [bluffTrainSequences, bluffTestSequences]
    # args[3] is the fold number
    # args[4] is the folder name to dump the weights into
    try:
        n_init = args[0].n_init  # Random initializations to try
        n_iter = args[0].n_iter  # Iterations in each initialization
        k = args[0].k  # Hidden States
        # TODO: Don't hardcode 5, parse from cluster-def (args.d) or add an arg
        d = 5  # Outputs (number of clusters used)

        if args[0].m.lower() in ['lstmhmm', 'lstm']:
            truthHmm = LstmHmm()
            bluffHmm = LstmHmm()
        else:
            truthHmm = Hmm()
            bluffHmm = Hmm()

        # Assign the train/test sequences for this fold
        # See hmm.Hmm.load_test_sequences for an explanation of the 'wrap_interviews' param
        truthHmm.load_train_sequences(args[1][0])
        truthHmm.load_test_sequences(args[1][1], wrap_interviews=True)
        bluffHmm.load_train_sequences(args[2][0])
        bluffHmm.load_test_sequences(args[2][1], wrap_interviews=True)

        testSize = len(truthHmm.X_mat_test) + len(bluffHmm.X_mat_test)

        logging.info('# Truth Training Sequences: {0}\n# Bluff Training Sequences: {1}'.format(
            len(truthHmm.X_mat_train), len(bluffHmm.X_mat_train)))
        logging.info('k = {0}, d = {1}, n_init = {2}, n_iter = {3}, testSize = {4}'.format(
            k, d, n_init, n_iter, testSize))
	
        logging.info('Beginning training on Truth-Tellers....')
        bestScore = -np.inf
        # Run em_train for Truth-Tellers multiple times, finding the best-scoring one
        for i in range(n_init):
            truthHmm.initialize_weights(k, d)
            truthHmm.em_train_v(n_iter)
            score = truthHmm.p_X_mat(truthHmm.X_mat_train)
            if score > bestScore:
                bestScore = score
                bestWeights = truthHmm.P_k, truthHmm.T_kk, truthHmm.E_kd
            truthHmm.print_percents()
            logging.info('Trained truthHmm #%d Score = %s', i + 1, score)
        # Rebuild the best truthHmm
        truthHmm.P_k, truthHmm.T_kk, truthHmm.E_kd = bestWeights

        logging.info('Best Trained Truth-Tellers HMM:')
        truthHmm.print_percents()
	
        logging.info('Beginning training on Bluffers....')
        bestScore = -np.inf  # Reset for bluffers
        # Run em_train for Bluffers multiple times, finding the best-scoring one
        for i in range(n_init):
            bluffHmm.initialize_weights(k, d)
            bluffHmm.em_train_v(n_iter)
            score = bluffHmm.p_X_mat(bluffHmm.X_mat_train)
            if score > bestScore:
                bestScore = score
                bestWeights = bluffHmm.P_k, bluffHmm.T_kk, bluffHmm.E_kd
            bluffHmm.print_percents()
            logging.info('Trained bluffHmm #%d Score = %s', i + 1, score)
        # Rebuild the best bluffHmm
        bluffHmm.P_k, bluffHmm.T_kk, bluffHmm.E_kd = bestWeights

        print('\nBest Trained Truth-Tellers HMM:')
        truthHmm.print_percents()
        print('\nBest Trained Liars HMM:')
        bluffHmm.print_percents()
	
        # Evaluate on Testing sequences
        correct = 0  # total classified correctly
        t_correct, b_correct = 0, 0
        # Each X in hmm.X_mat_test is a list, one sequence per segment of the
        # interview (due to low-confidence periods), so the segments are
        # evaluated together and each interview is weighted equally.
        for X_interview in truthHmm.X_mat_test:
            if truthHmm.p_X_mat(X_interview) > bluffHmm.p_X_mat(X_interview):
                correct += 1
                t_correct += 1
        for X_interview in bluffHmm.X_mat_test:
            if bluffHmm.p_X_mat(X_interview) > truthHmm.p_X_mat(X_interview):
                correct += 1
                b_correct += 1

        print('Out of {0} test cases, {1} were correctly classified.'.format(
            testSize, correct))

        # Train Score
        truthScore = truthHmm.p_X_mat(truthHmm.X_mat_train)
        bluffScore = bluffHmm.p_X_mat(bluffHmm.X_mat_train)

        # Write weight files for later usage
        truthHmm.write_weight_file('results/{}/truthers_fold_{}.weights'.format(args[4], args[3]))
        bluffHmm.write_weight_file('results/{}/bluffers_fold_{}.weights'.format(args[4], args[3]))
        # Write results of this fold and human-readable percents
        with open('results/{}/results_fold_{}.txt'.format(args[4], args[3]), 'w+') as f:
            out = 'Out of {0} test cases, {1} were correctly classified'.format(
                testSize, correct)
            out += '\nt_correct = {}\nb_correct = {}\ntrain_score_T = {}\ntrain_score_B = {}\n\n'.format(
                t_correct, b_correct, truthScore, bluffScore)
            out += '\n\nTruth HMM:\n'
            out += truthHmm.get_percents()
            out += '\n\nBluff HMM:\n'
            out += bluffHmm.get_percents()
            f.write(out)

        # Convert to percents for later averaging
        t_correct /= float(len(truthHmm.X_mat_test))
        b_correct /= float(len(bluffHmm.X_mat_test))

        # Return the number correct, testSize to be averaged and written to CSV
        return correct, testSize, truthScore, bluffScore, t_correct, b_correct

    except KeyboardInterrupt:
        return 'KeyboardInterrupt'
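
Because train_test takes one packed argument list (to suit Pool.map), a single
fold can also be run directly when debugging, without the process pool. A
minimal sketch, assuming args, truthSets, and bluffSets are built exactly as
in cross_validate above; the fold folder name is hypothetical:

# Hypothetical single-fold invocation; note that results/debug_run/ must
# already exist, since train_test writes its weight files there.
fold_args = [args, truthSets[0], bluffSets[0], 1, 'debug_run']
correct, test_size, truth_score, bluff_score, t_pct, b_pct = train_test(fold_args)
print('fold 1: {}/{} correct'.format(correct, test_size))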
Example No. 5
from hmm import Hmm
import time

label_map_file = '../dataset/phones/48_39.map'
chr_map_file = '../dataset/48_idx_chr.map_b'
label_file = '../dataset/label/train.lab'
post_file = '../dataset/posteriorgram/test6.post'  # alternative: '../dataset/posteriorgram/train2.post'
Hmm.loadgetprob(label_map_file, chr_map_file, label_file, post_file)  # 'test_y_.txt'


# sweep the decoding setting from 0.57 to 0.67 in steps of 0.05
setting = 0.57
for i in range(3):
    n, p, path = Hmm.viterbi(setting)
    # count, counttotal = Hmm.check(path)
    # print(str(count) + '/' + str(counttotal))
    Hmm.save_result(path, setting)
    setting += 0.05
Example No. 6
    csv.get_all_tweets("UniofOxford")
    csv.get_all_tweets("Cambridge_Uni")
    """
    ##############################################################################################
    print "PULIZIA TWEETS"
    csv.cleanCsv()
    csv.perturbate_tweets()

    esteem.transiction()
    clean_tweets = open('csv\lp_tweets.csv')
    perturbed_tweets = open('csv\perturbation_tweets.csv')
    esteem.observations_p(clean_tweets, perturbed_tweets)

    ##############################################################################################
    print "GENERAZIONE MODELLO HMM"
    hmm = Hmm(esteem.transition_p, esteem.obs_matrix, esteem.pigreco,
              esteem.final_p)
    hmm.create_hmm(csv.error_list)

    ui.setupUi(Form, hmm)

    print "################################################################"
    print "DIFFERENZA TRA ORIGINALI"
    clean_tweets = open('csv\lp_tweets.csv')
    perturbed_tweets = open('csv\perturbation_tweets.csv')
    prediction_capabilities.calculate_capabilities(clean_tweets,
                                                   perturbed_tweets, ui)

    print "################################################################"
    print "DIFFERENZA FINALE"
    clean_tweets = open('csv\lp_tweets.csv')
    output_tweets = open('csv\output_tweets.csv')
Example No. 7
seq2 = ('B', 'C', 'C', 'B', 'D', 'D', 'C', 'A', 'C', 'S')
seq3 = ('A', 'C', 'D', 'S')
seq4 = ('A', 'D', 'A', 'C', 'S')
seq5 = ('D', 'B', 'B', 'S')
seq6 = ('A', 'B', 'S')
seq7 = ('D', 'D', 'B', 'D', 'D', 'B', 'A', 'C', 'C', 'D', 'A', 'B', 'B', 'C',
        'D', 'B', 'B', 'B', 'S')
seq8 = ('D', 'B', 'D', 'S')
seq9 = ('A', 'A', 'A', 'A', 'D', 'C', 'B', 'S')

observations = [seq0, seq1, seq2, seq3, seq4, seq5, seq6, seq7, seq8, seq9]
# observations = [('A','C','D','D','C','C','S'), ('A','D','S')]

if __name__ == '__main__':
    model_file1 = "model1.json"
    hmm1 = Hmm(os.path.join(model_file1))
    model_file2 = "model2.json"
    hmm2 = Hmm(os.path.join(model_file2))

    print("Machine 1 generated samples:")
    hmm1.generator()

    print("\nFORWARD ALGORITHM:")
    for obs in observations:
        p1 = hmm1.forward(obs)
        p2 = hmm2.forward(obs)

        print("Observations = ", obs, " Fwd Prob (Machine 1) = ", p1,
              ", Fwd Prob (Machine 2) = ", p2, ", Fwd Prob log (Machine 1) = ",
              (math.log(p1) if p1 != 0 else "NA"),
              ", Fwd Prob log (Machine 2) = ",
              (math.log(p2) if p2 != 0 else "NA"))
Example No. 8
    print('difference cond_proba:',
          np.min(hmm.cond_proba - hmm2.cond_proba),
          np.max(hmm.cond_proba - hmm2.cond_proba))
    print('difference joined_cond_proba:',
          np.min(hmm.joined_cond_proba - hmm2.joined_cond_proba),
          np.max(hmm.joined_cond_proba - hmm2.joined_cond_proba))
    return


# Q1
print('\n****************** Q1 ******************')
compare_recursions()

# Q2
print('\n****************** Q2 ******************')
hmm = Hmm('rescaled')
hmm2 = Hmm('log-scale')
hmm.compute_proba(hmm.test_data)
hmm.plot_proba(100, 'conditional proba with initial parameters on test', 'q02',
               '1')

#Q3 & Q4
print('\n****************** Q4 ******************')
hmm.EM(True)
hmm.print_parameters()

# Q5
print('\n****************** Q5 ******************')
hmm.plot_likelihood('q05', '1')

# Q6
Example No. 9
from hmm import Hmm
import pandas as pd

data = pd.read_csv("data/协同_未标记实体.csv")

model = Hmm(char2idx_path="dicts/char2idx.json",
            tag2idx_path="dicts/tag2idx.json")
model.fit("corpus/train_data.txt")

# columns: 标签 = label, 实体 = entity, 正文 = body text
rows = []
for i in range(data.shape[0]):
    raw = data.iloc[i, 2].strip()
    tmp = model.usage(raw)
    label = data.iloc[i, 0]
    ents = data.iloc[i, 1]
    rows.append({"标签": label, "实体": ents, "正文": tmp})

# build the frame once at the end; DataFrame.append was removed in pandas 2.0
res = pd.DataFrame(rows, columns=["标签", "实体", "正文"])

res.to_csv("data/协同_已标记实体初版.csv", encoding="utf_8_sig", index=False)
Example No. 10
from hmm import Hmm

model = Hmm(char2idx_path="dicts/otherchar2idx.json",
            tag2idx_path="dicts/othertag2idx.json")
model.fit("corpus/hmm_疫情其他数据集_20000.txt")

f = open("corpus/疫情分句后的数据.txt", encoding='utf-8')
ents = set()  # entities already written, tracked for O(1) duplicate checks
with open("hmm其他抽取结果.txt", "a", encoding="utf-8") as w:
    for line in f:
        line = line.strip()
        if len(line) <= 1:
            continue
        tmp = model.yq_usage(line)
        for t in tmp:
            if t in ents:
                continue
            ents.add(t)
            w.write(t + '\n')
            print(t)

f.close()