def seqHMM():
    """Fit a MultinomialHMM on the module-level data and print accuracy.

    NOTE(review): relies on module globals `input_data` (features in every
    column but the last, labels in the last column) and `lengths`
    (per-sequence lengths) -- confirm both are defined before calling.
    """
    clf = MultinomialHMM()
    # Features: all columns except the last; labels: the last column.
    clf.fit(input_data.iloc[:, :-1], input_data.iloc[:, -1], lengths)
    pred = clf.predict(input_data.iloc[:, :-1])
    actual = input_data.iloc[:, -1]
    # Fraction of correct labels, evaluated on the training data itself.
    accuracy = sum(pred == actual) / float(len(actual))
    # Fix: the original `print accuracy` is Python-2-only syntax; calling
    # print with a single argument behaves identically under Py2 and Py3.
    print(accuracy)
def fit_validate_hmm(df: DataFrame, y_col: str, seq_id_col: str, feature_cols: List[str], k: int = 10, alpha: float = 0.01) -> dict:
    """Cross-validate and then fit a MultinomialHMM on token sequences.

    Parameters
    ----------
    df : DataFrame
        One row per token; `seq_id_col` identifies which sequence each
        token belongs to.
    y_col : str
        Name of the label column.
    seq_id_col : str
        Name of the sequence-identifier column.
    feature_cols : List[str]
        Columns to one-hot encode as HMM observation features.
    k : int
        Number of cross-validation folds.
    alpha : float
        Smoothing parameter passed to MultinomialHMM.
        (Fix: the annotation said `int`, but the default 0.01 is a float.)

    Returns
    -------
    dict
        {'model': fitted MultinomialHMM, 'k_scores': per-fold scores}.
    """
    df_tokens = df.copy()
    # Add one-hot vectors encoded from the feature columns (in place).
    add_one_hot_vectors(df_tokens, feature_cols)
    # Per-sequence lengths, needed by seqlearn to delimit sequences.
    df_lengths = get_sequence_lengths(df_tokens, seq_id_col)
    model = MultinomialHMM(alpha=alpha)
    # k-fold cross-validated scores for the chosen alpha.
    k_scores = get_validated_scores(model, df_tokens, y_col, seq_id_col, df_lengths, k)
    # Fit the final model on the entire dataset.
    X = np.vstack(df_tokens.one_hot_vector.values)
    y = df_tokens[y_col]
    seq_lengths = df_lengths.length  # renamed from `l`: single-letter, easily misread
    model.fit(X, y, seq_lengths)
    return {'model': model, 'k_scores': k_scores}
def trainHMM(data):
    """Train a supervised HMM tagger from a CoNLL-formatted source.

    `data` is whatever `load_conll` accepts (path or file handle); the
    module-level `features` callable extracts per-token features.
    Returns the fitted classifier.
    """
    # Extract flat feature matrix, labels, and per-sequence lengths.
    X_train, y_train, lengths_train = load_conll(data, features)
    # Model the tag sequences as a multinomial HMM (default smoothing).
    model = MultinomialHMM()
    model.fit(X_train, y_train, lengths_train)
    return model
def hmm_pred(a, X_train, X_test, y_train, y_test):
    """Hidden Markov model: fit with smoothing `a` and score on the test set.

    Returns [accuracy, precision, recall, F1], the last three weighted
    by class support.
    """
    model = MultinomialHMM(alpha=a)
    # Every training sample is treated as its own length-1 sequence.
    model.fit(X_train, y_train, lengths=np.ones(len(y_train), dtype=int))
    predictions = model.predict(X_test)
    return [
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions, average='weighted'),
        recall_score(y_test, predictions, average='weighted'),
        f1_score(y_test, predictions, average='weighted'),
    ]
def trainHMM(X_train, y_train): # # # Extracts features from the datasets # # Models it as an HMM clf = MultinomialHMM() print "y shape", y_train.shape[0] lengths_train = [] for x in X_train: lengths_train.append(0) print lengths_train clf.fit(X_train, y_train, [len(y_train)]) return clf
def train_and_test_markov(decode, alpha, X_train, y_train, sequence_length_train, X_test, y_test, sequence_length_test, *args, **kwargs):
    """Fit a MultinomialHMM, predict on the test split, and time both steps.

    Returns a (fit_time_ms, predict_time_ms, accuracy_percent) tuple.
    Extra *args/**kwargs are accepted for interface compatibility and
    ignored.
    """
    model = MultinomialHMM(decode=decode, alpha=alpha)
    t_start = time.time()
    model.fit(X_train, y_train, sequence_length_train)
    t_fit_done = time.time()
    y_pred = model.predict(X_test, sequence_length_test)
    t_pred_done = time.time()
    # Convert to percent / milliseconds for reporting.
    accuracy = 100 * accuracy_score(y_pred, y_test)
    fit_time = 1000 * (t_fit_done - t_start)
    pred_time = 1000 * (t_pred_done - t_fit_done)
    return (fit_time, pred_time, accuracy)
def test_hmm():
    # Behaviour test for MultinomialHMM on the module-level toy
    # POS-tagging fixtures X, y, lengths.
    n_features = X.shape[1]
    clf = MultinomialHMM()
    clf.fit(X, y, lengths)
    # Fitted class labels are the sorted unique tags of the fixture.
    assert_array_equal(clf.classes_, ["Adj", "DT", "IN", "N", "V"])
    # The toy data is small enough that training labels are reproduced exactly.
    assert_array_equal(clf.predict(X), y)
    n_classes = len(clf.classes_)
    # exp() of each parameter block must form proper distributions:
    # summing exp(coef_) over classes (axis 0) gives 1 for every feature ...
    assert_array_almost_equal(np.ones(n_features), np.exp(clf.coef_).sum(axis=0))
    # ... summing exp(coef_trans_) over axis 0 gives 1 per class ...
    assert_array_almost_equal(np.ones(n_classes), np.exp(clf.coef_trans_).sum(axis=0))
    # ... and the final/initial state parameters each sum to 1 overall.
    assert_array_almost_equal(1., np.exp(clf.coef_final_).sum())
    assert_array_almost_equal(1., np.exp(clf.coef_init_).sum())
def train_HMM(X_train, y_train, house, f, i): #X_train = [s2features(s) for s in X_train] clf = MultinomialHMM(decode='viterbi') trainLens = np.array(map(lambda x: len(x), X_train)) X_train = np.array(np.concatenate(X_train)) y_train = np.array(np.concatenate(y_train)) print(X_train) print(len(X_train)) print(X_train[0].shape) print(len(y_train)) print(y_train[0]) print(trainLens, sum(trainLens)) clf.fit(X_train, y_train, trainLens) #model_name = 'crf_models/house_' + house + '_'+ f + str(i) + '.crfsuite' #trainer.train(model_name) print str(i), '. House:', house, '. Feature: ', f, ' training complete.' return clf
def _hmm(self, ind: Individual, train: Dataset, dev: Dataset):
    """Fit a MultinomialHMM configured by `ind` and predict on `dev`.

    The decode strategy and smoothing alpha are drawn from the
    individual.  ValueErrors whose message indicates an invalid
    configuration (rather than a programming error) are re-raised as
    InvalidPipeline; anything else propagates unchanged.
    """
    train_lengths = [len(sentence) for sentence in train.sentences]
    xtrain, ytrain = train.by_word()
    xdev, _ = dev.by_word()
    dev_lengths = [len(sentence) for sentence in dev.sentences]
    try:
        model = MultinomialHMM(
            decode=ind.choose('viterbi', 'bestfirst'),
            alpha=ind.nextfloat(),
        )
        model.fit(xtrain, ytrain, train_lengths)
        return model.predict(xdev, dev_lengths)
    except ValueError as e:
        message = str(e)
        # Data/parameter problems are pipeline problems, not bugs.
        if 'non-negative integers' in message or 'unknown categories' in message:
            raise InvalidPipeline(message)
        raise
def train_HMM(X_train, y_train, house, f,i ): #X_train = [s2features(s) for s in X_train] clf = MultinomialHMM(decode='viterbi') trainLens = np.array(map(lambda x: len(x), X_train)) X_train = np.array(np.concatenate(X_train)) y_train = np.array(np.concatenate(y_train)) print(X_train) print(len(X_train)) print(X_train[0].shape) print(len(y_train)) print(y_train[0]) print(trainLens, sum(trainLens)) clf.fit(X_train, y_train, trainLens) #model_name = 'crf_models/house_' + house + '_'+ f + str(i) + '.crfsuite' #trainer.train(model_name) print str(i),'. House:', house, '. Feature: ', f, ' training complete.' return clf
def test_hmm():
    # Behaviour test for MultinomialHMM on the module-level toy fixtures
    # X, y, lengths.
    n_features = X.shape[1]
    clf = MultinomialHMM()
    clf.fit(X, y, lengths)
    # Fitted class labels are the sorted unique tags of the fixture.
    assert_array_equal(clf.classes_, ["Adj", "DT", "IN", "N", "V"])
    # Default (Viterbi) decoding reproduces the training labels ...
    assert_array_equal(clf.predict(X), y)
    # ... and so does greedy best-first decoding on this toy data.
    clf.set_params(decode="bestfirst")
    assert_array_equal(clf.predict(X), y)
    n_classes = len(clf.classes_)
    # exp() of each parameter block must form proper distributions:
    # summing exp(coef_) over classes (axis 0) gives 1 for every feature ...
    assert_array_almost_equal(np.ones(n_features), np.exp(clf.coef_).sum(axis=0))
    # ... summing exp(intercept_trans_) over axis 0 gives 1 per class ...
    assert_array_almost_equal(np.ones(n_classes), np.exp(clf.intercept_trans_).sum(axis=0))
    # ... and the final/initial intercepts each sum to 1 overall.
    assert_array_almost_equal(1., np.exp(clf.intercept_final_).sum())
    assert_array_almost_equal(1., np.exp(clf.intercept_init_).sum())
def gridSearch(seqs, lens, decodes=(None,), alphas=(None,), init_eq_anys=(None,)):
    """Exhaustive grid search over MultinomialHMM hyper-parameters.

    Cross-validates every (decode, alpha, init_eq_any) combination on
    (seqs, lens) and returns (best_classifier, its_per_fold_accuracies).
    Returns (None, None) when no combination scores above 0.

    Fixes: immutable tuple defaults replace mutable list defaults;
    `itertools.product(a, b, c)` is called directly instead of unpacking
    a throwaway list; the commented-out duplicate nested-loop
    implementation is removed.
    """
    maxAcc = 0.0
    maxAccs = None
    bestClf = None
    for d, a, i in itertools.product(decodes, alphas, init_eq_anys):
        clf = MultinomialHMM(decode=d, alpha=a, init_eq_any=i)
        accs = crossValidate(clf, seqs, lens)
        meanAcc = accs.mean()
        if meanAcc > maxAcc:
            maxAcc = meanAcc
            maxAccs = accs
            bestClf = clf
    return bestClf, maxAccs
import pandas as pd import numpy as np from seqlearn.hmm import MultinomialHMM model = MultinomialHMM(decode='viterbi', alpha=0.01) # -- training -- training_data = [] training_labels = [] training_data_length = [] dataFile = pd.read_csv("../data/training-leftright-avkfxrmpauHdDpeaAAAa-3.csv", header=0) data = [dataFile['accX'][:5], dataFile['accY'][:5], dataFile['accZ'][:5]] #data = [dataFile['alpha'], dataFile['beta'], dataFile['gamma'], dataFile['accX'], dataFile['accY'], dataFile['accZ']] length = len(dataFile['accX'][:5]) training_data_length.append([length, length, length]) # 3 items because X, Y, Z data training_data.append(data) training_labels.append('leftright') dataFile = pd.read_csv("../data/training-updown-avkfxrmpauHdDpeaAAAa-1.csv", header=0) data = [dataFile['accX'][:5], dataFile['accY'][:5], dataFile['accZ'][:5]] #data = [dataFile['alpha'], dataFile['beta'], dataFile['gamma'], dataFile['accX'], dataFile['accY'], dataFile['accZ']] length = len(dataFile['accX'][:5]) training_data_length.append([length, length,
# flatten X_train for HMM function X_train_flatten = [item for sublist in X_train for item in sublist] X_test_flatten = [item for sublist in X_test for item in sublist] # flatten X_train for HMM function y_ground_truth_flatten = [ item for sublist in y_ground_truth_seqlearn for item in sublist ] # change type to array seqlearn_X_train = np.array(X_train_flatten) seqlearn_y_ground_truth = np.array(y_ground_truth_flatten) # HMM seqlearn MultimodalHMM model_seqlearn = MultinomialHMM() # training model_seqlearn.fit(seqlearn_X_train, seqlearn_y_ground_truth, len_train) # state prediction y_pred_seqlearn = model_seqlearn.predict(X_test_flatten) # print output time remarks outputSteps(y_pred_seqlearn) #state prediction for random sequence y_pred_seqlearn_random = model_seqlearn.predict(X_random) # state prediction for heuristic sequence y_pred_seqlearn_random = model_seqlearn.predict(X_heuristic)
X_train = (((X_tr[:, None] & (1 << np.arange(8)))) > 0).astype( int) # vector-> binary matrix Y_train = np.array(Y_train) # X_test = np.array(X_test).reshape(-1,1) X_te = np.array(X_test) X_test = (((X_te[:, None] & (1 << np.arange(8)))) > 0).astype(int) Y_test = np.array(Y_test) return [X_train, X_test, Y_train, Y_test] data = load_dataset() kf = SequenceKFold(seq_lengths(data[1]), 2) for tuple in kf: train_len = tuple[1] test_len = tuple[3] split = dataset_split(tuple[0], tuple[2]) #train the model clf = MultinomialHMM() clf.fit(split[0], split[2], train_len) #evaluate the model Y_pred = clf.predict(split[1], test_len) print('Accuracy:') print(clf.score(split[1], split[3], test_len)) print('Confusion matrix:') labels = list(data[2].values()) print(confusion_matrix(split[3], Y_pred, labels)) print('Report:') target_names = list(data[2].keys()) print(classification_report(split[3], Y_pred, target_names=target_names))
def trainMultinomialHMM(data, classes, seq_lengths, dump_file):
    """Train a Viterbi MultinomialHMM (alpha=0.01) through the shared
    sequence-classifier training harness, passing `dump_file` through."""
    hmm_classifier = MultinomialHMM(decode='viterbi', alpha=0.01)
    baseSeqClassifierTrain(hmm_classifier, "Multinomial Hidden Markov Model",
                           data, classes, seq_lengths, dump_file)
def testMultinomialHMM(data, classes, seq_lengths, n_folds, metric=''):
    """Evaluate a best-first MultinomialHMM (alpha=1.0) through the shared
    sequence-classifier testing harness with `n_folds` folds."""
    hmm_classifier = MultinomialHMM(decode='bestfirst', alpha=1.0)
    baseSeqClassifierTest(hmm_classifier, "Multinomial Hidden Markov Model",
                          data, classes, seq_lengths, n_folds, metric)
mat2 = scipy.io.loadmat('train_subject1_psd03.mat') X2 = mat1['X'] Y2 = mat1['Y'] mat_test = scipy.io.loadmat('test_subject1_psd04.mat') test_X = mat_test['X'] true_label = np.loadtxt('test_subject1_true_label.csv', delimiter=",") X = mat['X'] Y = mat['Y'] new_X = np.concatenate((X, X1, X2), axis=0) new_Y = np.concatenate((Y, Y1, Y2), axis=0) clf = MultinomialHMM() clf.fit(new_X, new_Y, len(new_X)) clf.set_params(decode="bestfirst") ans = clf.predict(test_X) print 'sub-1, custom', accuracy_score(ans, true_label) print confusion_matrix(true_label, ans) #1440/3504: subject 1 accuracy #start subject-2 sub2_1 = scipy.io.loadmat('train_subject2_psd01.mat') sub2_X1 = sub2_1['X'] sub2_Y1 = sub2_1['Y'] sub2_2 = scipy.io.loadmat('train_subject2_psd02.mat') sub2_X2 = sub2_2['X'] sub2_Y2 = sub2_2['Y']
if (prediction[int(index)] == 0): print(n_word) elif (prediction[int(index)] == 1): print(n_word +"s") elif (prediction[int(index)] == 3): print(n_word[:-1] + "ies") else: print(n_word +"es") print("ACCURACY" + str(sum(1 for i,j in zip(prediction,target) if i == j)*1.0/len(prediction))) #just return accuracy for tuning on dev set def evaluate(prediction, target, input_x): return sum(1 for i,j in zip(prediction,target) if i == j)*1.0/len(prediction) #import data from csv file model = MultinomialHMM() data = pd.read_csv("weighted_data.csv") alphabet_dict = dict(zip(string.ascii_lowercase, range(1,27))) reverse_dict = dict(zip(range(1,27), string.ascii_lowercase)) X = [] Y = [] for index, row in data.iterrows(): w_class = 4 singular = row[0] plural = row[1] singular_without_end = singular[:len(singular)-1] #append -es cases if (plural == singular + 'es'):
def hmm_pred(X, y):
    """Hidden Markov model: fit with alpha=0.1, treating every sample as
    its own length-1 sequence, and return the fitted model."""
    model = MultinomialHMM(alpha=0.1)
    model.fit(X, y, lengths=np.ones(len(y), dtype=int))
    return model
def test_hmm_validation():
    # alpha must be strictly positive: zero and negative values raise.
    for bad_alpha in (0, -1):
        assert_raises(ValueError, MultinomialHMM(alpha=bad_alpha).fit,
                      X, y, lengths)
# Helper functions: # get all the data files from the directory def getDataFileNames(dataType, movement="", dataFolder=DATA_FOLDER): files = os.listdir(dataFolder) output = [] for file in files: if dataType in file and movement in file: output.append(file) return output # ------------------- MAIN ------------------------------------ model = MultinomialHMM(decode='viterbi', alpha=0.01) # -- training -- training_data = [] training_labels = [] training_data_length = [] files = getDataFileNames("training") for trainingFile in files: dataFile = pd.read_csv(DATA_FOLDER + trainingFile, header=0) data = [ dataFile['accX'][:199], dataFile['accY'][:199], dataFile['accZ'][:199] ] #data = [dataFile['alpha'], dataFile['beta'], dataFile['gamma'], dataFile['accX'], dataFile['accY'], dataFile['accZ']]
from seqlearn.hmm import MultinomialHMM
from hmmlearn.hmm import GaussianHMM

# PCA-reduced, scaled feature matrix; last column holds the label.
input_data = pd.read_csv('../data/scaled_data/scaled_pca.csv')
lengths = [len(input_data)]
d1 = pd.read_csv('../data/train_subject1_psd01.csv',header=None)
d2 = pd.read_csv('../data/train_subject1_psd02.csv',header=None)
d3 = pd.read_csv('../data/train_subject1_psd03.csv',header=None)
#input_data = pd.concat([d1, d2, d3], axis=0)
# NOTE(review): `lengths` is rebound to the three session lengths, but
# `input_data` is still the scaled_pca frame (the concat above is
# commented out) -- these lengths may not sum to len(input_data); confirm.
lengths = [len(d1), len(d2), len(d3)]
clf = MultinomialHMM()
clf.fit(input_data.iloc[:,:-1], input_data.iloc[:,-1], lengths)
pred = clf.predict(input_data.iloc[:,:-1])
# NOTE(review): predictions cover all of input_data but are compared
# against d3's labels only -- the element-wise comparison below assumes
# matching lengths and alignment; verify.
actual = d3.iloc[:,-1]
accuracy = sum(pred == actual)/float(len(actual))
print accuracy

# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import svm
import pandas as pd
# Saves the image into a TXT file for line in line_mapping: for word in line: if word["matrix"].shape[1] == 0: print "Zero matrix... Skipping..." else: f_handle = file('test.txt', 'a') np.savetxt(f_handle, word['matrix'], delimiter=" ", fmt="%i", newline=" ", header='', footer="" + word["word"] + "\n\n", comments='') f_handle.close() # # Extracts features from the datasets X_train, y_train, lengths_train = load_conll("test.txt", features) # # Models it as an HMM clf = MultinomialHMM() clf.fit(X_train, y_train, lengths_train) print X_train, y_train # Validation after training X_test, y_test, lengths_test = load_conll("test.txt", features) y_pred = clf.predict(X_test, lengths_test) print y_pred # # Final score # print(bio_f_score(y_test, y_pred))
model = hmm.GaussianHMM(n_components=5) model.fit(X, lengths) test = "apple" test_int = [] for let in test: test_int.append(alphabet_dict[let]) print(model.predict(np.array([test_int]), lengths=[5])) print(alphabet_dict) ''' Documentation of seqlearn: http://larsmans.github.io/seqlearn/reference.html Some examples of HMMs using seqlearn seqlearn is supervised leanring vs hmmlearn, which is unsupervised. ''' model = MultinomialHMM() # encode X = pd.DataFrame() y = pd.DataFrame() # Here 0 represents pluarl = singuar + '', 1 represents plural = singular +s, and 2 represnets plural = singular + 'es' # data preparation # index the words such that a = 1, z = 26. w_class = 0 import pdb pdb.set_trace() for index, row in all_data.iterrows(): singular = row[0] plural = row[1] if (plural[-2] == 'es'): w_class = 2