def fit_validate_hmm(df: DataFrame, y_col: str, seq_id_col: str, feature_cols: List[str], k: int = 10, alpha: float = 0.01) -> dict:
    """Fit a MultinomialHMM on one-hot token features and cross-validate it.

    Parameters
    ----------
    df : DataFrame
        Token-level data, one row per token.
    y_col : str
        Name of the label column.
    seq_id_col : str
        Column identifying which sequence each token belongs to.
    feature_cols : List[str]
        Feature columns to one-hot encode.
    k : int, default 10
        Number of cross-validation folds.
    alpha : float, default 0.01
        Smoothing parameter for MultinomialHMM.  (Annotation fixed: the
        default 0.01 is a float, the parameter was annotated ``int``.)

    Returns
    -------
    dict
        ``{'model': fitted MultinomialHMM, 'k_scores': cross-validated scores}``.
    """
    df_tokens = df.copy()
    # Add one-hot vectors encoded from the feature columns (mutates df_tokens).
    add_one_hot_vectors(df_tokens, feature_cols)
    # Per-sequence token counts, required by seqlearn's fit() API.
    df_lengths = get_sequence_lengths(df_tokens, seq_id_col)
    model = MultinomialHMM(alpha=alpha)
    # k-fold cross-validated scores computed before the final refit.
    k_scores = get_validated_scores(model, df_tokens, y_col, seq_id_col, df_lengths, k)
    # Refit the model on the entire dataset.
    X = np.vstack(df_tokens.one_hot_vector.values)
    y = df_tokens[y_col]
    lengths = df_lengths.length  # renamed from `l` (ambiguous single-letter name)
    model.fit(X, y, lengths)
    return {'model': model, 'k_scores': k_scores}
def seqHMM():
    """Fit a MultinomialHMM on the module-level `input_data` (last column =
    labels) using the module-level `lengths`, then print its accuracy on the
    same data it was trained on."""
    model = MultinomialHMM()
    features = input_data.iloc[:, :-1]
    labels = input_data.iloc[:, -1]
    model.fit(features, labels, lengths)
    predictions = model.predict(features)
    # Training-set accuracy (float division kept explicit for Python 2).
    accuracy = sum(predictions == labels) / float(len(labels))
    print(accuracy)
def trainHMM(data):
    """Train a MultinomialHMM tagger on a CoNLL-formatted dataset.

    Relies on the module-level `features` extractor passed to load_conll.
    Returns the fitted classifier.
    """
    # Extract features, tags and per-sentence lengths from the dataset.
    observations, tags, seq_lengths = load_conll(data, features)
    # Model the tag sequence as an HMM.
    tagger = MultinomialHMM()
    tagger.fit(observations, tags, seq_lengths)
    return tagger
def hmm_pred(a, X_train, X_test, y_train, y_test):
    """Hidden Markov model: fit on the train split, score on the test split.

    Every training sample is treated as its own length-1 sequence.
    Returns [accuracy, precision, recall, F1] with weighted averaging.
    """
    model = MultinomialHMM(alpha=a)
    # One sequence per sample: a lengths vector of all ones.
    model.fit(X_train, y_train, lengths=np.ones(len(y_train), dtype=int))
    predictions = model.predict(X_test)
    return [
        accuracy_score(y_test, predictions),
        precision_score(y_test, predictions, average='weighted'),
        recall_score(y_test, predictions, average='weighted'),
        f1_score(y_test, predictions, average='weighted'),
    ]
def trainHMM(X_train, y_train): # # # Extracts features from the datasets # # Models it as an HMM clf = MultinomialHMM() print "y shape", y_train.shape[0] lengths_train = [] for x in X_train: lengths_train.append(0) print lengths_train clf.fit(X_train, y_train, [len(y_train)]) return clf
def train_and_test_markov(decode, alpha, X_train, y_train, sequence_length_train, X_test, y_test, sequence_length_test, *args, **kwargs):
    """Fit a MultinomialHMM and time both training and prediction.

    Returns a tuple (fit_time_ms, pred_time_ms, accuracy_percent).
    Extra *args/**kwargs are accepted (and ignored) for interface
    compatibility with sibling train-and-test helpers.
    """
    model = MultinomialHMM(decode=decode, alpha=alpha)
    t0 = time.time()
    model.fit(X_train, y_train, sequence_length_train)
    t1 = time.time()
    predictions = model.predict(X_test, sequence_length_test)
    t2 = time.time()
    return (
        1000 * (t1 - t0),                           # fit time, ms
        1000 * (t2 - t1),                           # predict time, ms
        100 * accuracy_score(predictions, y_test),  # accuracy, %
    )
def test_hmm():
    """Smoke-test MultinomialHMM on the module-level X / y / lengths fixture:
    the fitted model must recover the training tags exactly and all
    log-probability parameters must exponentiate to valid distributions.

    NOTE(review): this variant reads ``coef_trans_`` / ``coef_final_`` /
    ``coef_init_``; a sibling copy of this test uses ``intercept_*`` names —
    presumably written against different seqlearn versions; verify against
    the installed release.
    """
    n_features = X.shape[1]
    clf = MultinomialHMM()
    clf.fit(X, y, lengths)
    # Fitted classes are exactly the five POS tags, in sorted order.
    assert_array_equal(clf.classes_, ["Adj", "DT", "IN", "N", "V"])
    # The model must reproduce the training labels on the training data.
    assert_array_equal(clf.predict(X), y)
    n_classes = len(clf.classes_)
    # Emission log-probs: each feature column sums to 1 after exp().
    assert_array_almost_equal(np.ones(n_features), np.exp(clf.coef_).sum(axis=0))
    # Transition log-probs: each column is a distribution over classes.
    assert_array_almost_equal(np.ones(n_classes), np.exp(clf.coef_trans_).sum(axis=0))
    # Final and initial state log-probs each sum to 1.
    assert_array_almost_equal(1., np.exp(clf.coef_final_).sum())
    assert_array_almost_equal(1., np.exp(clf.coef_init_).sum())
def test_hmm():
    """Smoke-test MultinomialHMM on the module-level X / y / lengths fixture:
    exact recovery of the training tags under both decoders, and properly
    normalised log-probability parameters."""
    model = MultinomialHMM()
    model.fit(X, y, lengths)
    # The fitted model must know exactly the five POS classes ...
    assert_array_equal(model.classes_, ["Adj", "DT", "IN", "N", "V"])
    # ... and reproduce the training labels under Viterbi decoding,
    assert_array_equal(model.predict(X), y)
    # as well as under best-first decoding.
    model.set_params(decode="bestfirst")
    assert_array_equal(model.predict(X), y)
    n_features = X.shape[1]
    n_classes = len(model.classes_)
    # Emission, transition, final and initial log-probabilities must each
    # exponentiate to distributions summing to one.
    assert_array_almost_equal(np.ones(n_features), np.exp(model.coef_).sum(axis=0))
    assert_array_almost_equal(np.ones(n_classes), np.exp(model.intercept_trans_).sum(axis=0))
    assert_array_almost_equal(1., np.exp(model.intercept_final_).sum())
    assert_array_almost_equal(1., np.exp(model.intercept_init_).sum())
def train_HMM(X_train, y_train, house, f, i): #X_train = [s2features(s) for s in X_train] clf = MultinomialHMM(decode='viterbi') trainLens = np.array(map(lambda x: len(x), X_train)) X_train = np.array(np.concatenate(X_train)) y_train = np.array(np.concatenate(y_train)) print(X_train) print(len(X_train)) print(X_train[0].shape) print(len(y_train)) print(y_train[0]) print(trainLens, sum(trainLens)) clf.fit(X_train, y_train, trainLens) #model_name = 'crf_models/house_' + house + '_'+ f + str(i) + '.crfsuite' #trainer.train(model_name) print str(i), '. House:', house, '. Feature: ', f, ' training complete.' return clf
def _hmm(self, ind: Individual, train: Dataset, dev: Dataset):
    """Train a MultinomialHMM configured from `ind` and return its
    predictions on the dev set.

    ValueErrors known to stem from invalid data / hyper-parameters are
    translated into InvalidPipeline; anything else propagates unchanged.
    """
    train_lengths = [len(s) for s in train.sentences]
    xtrain, ytrain = train.by_word()
    xdev, _ = dev.by_word()
    dev_lengths = [len(s) for s in dev.sentences]
    try:
        model = MultinomialHMM(
            decode=ind.choose('viterbi', 'bestfirst'),
            alpha=ind.nextfloat(),
        )
        model.fit(xtrain, ytrain, train_lengths)
        return model.predict(xdev, dev_lengths)
    except ValueError as e:
        message = str(e)
        # Known data-dependent failures become InvalidPipeline ...
        if 'non-negative integers' in message or 'unknown categories' in message:
            raise InvalidPipeline(message)
        # ... everything else is a genuine bug and is re-raised as-is.
        raise
def train_HMM(X_train, y_train, house, f,i ): #X_train = [s2features(s) for s in X_train] clf = MultinomialHMM(decode='viterbi') trainLens = np.array(map(lambda x: len(x), X_train)) X_train = np.array(np.concatenate(X_train)) y_train = np.array(np.concatenate(y_train)) print(X_train) print(len(X_train)) print(X_train[0].shape) print(len(y_train)) print(y_train[0]) print(trainLens, sum(trainLens)) clf.fit(X_train, y_train, trainLens) #model_name = 'crf_models/house_' + house + '_'+ f + str(i) + '.crfsuite' #trainer.train(model_name) print str(i),'. House:', house, '. Feature: ', f, ' training complete.' return clf
# Flatten the nested test sequences for the seqlearn HMM.
X_test_flatten = [item for sublist in X_test for item in sublist]
# Flatten the ground-truth labels the same way.
y_ground_truth_flatten = [
    item for sublist in y_ground_truth_seqlearn for item in sublist
]
# Change type to array (X_train_flatten is built earlier in this script).
seqlearn_X_train = np.array(X_train_flatten)
seqlearn_y_ground_truth = np.array(y_ground_truth_flatten)
# HMM: seqlearn MultinomialHMM
model_seqlearn = MultinomialHMM()
# Training: len_train carries the per-sequence lengths.
model_seqlearn.fit(seqlearn_X_train, seqlearn_y_ground_truth, len_train)
# State prediction on the test data.
y_pred_seqlearn = model_seqlearn.predict(X_test_flatten)
# Print output time remarks.
outputSteps(y_pred_seqlearn)
# State prediction for the random sequence.
y_pred_seqlearn_random = model_seqlearn.predict(X_random)
# State prediction for the heuristic sequence.
# Fix: this result previously overwrote y_pred_seqlearn_random, losing the
# random-sequence prediction entirely.
y_pred_seqlearn_heuristic = model_seqlearn.predict(X_heuristic)
# Target names for evaluation.
target_names = list(step_set)
X_train = (((X_tr[:, None] & (1 << np.arange(8)))) > 0).astype( int) # vector-> binary matrix Y_train = np.array(Y_train) # X_test = np.array(X_test).reshape(-1,1) X_te = np.array(X_test) X_test = (((X_te[:, None] & (1 << np.arange(8)))) > 0).astype(int) Y_test = np.array(Y_test) return [X_train, X_test, Y_train, Y_test] data = load_dataset() kf = SequenceKFold(seq_lengths(data[1]), 2) for tuple in kf: train_len = tuple[1] test_len = tuple[3] split = dataset_split(tuple[0], tuple[2]) #train the model clf = MultinomialHMM() clf.fit(split[0], split[2], train_len) #evaluate the model Y_pred = clf.predict(split[1], test_len) print('Accuracy:') print(clf.score(split[1], split[3], test_len)) print('Confusion matrix:') labels = list(data[2].values()) print(confusion_matrix(split[3], Y_pred, labels)) print('Report:') target_names = list(data[2].keys()) print(classification_report(split[3], Y_pred, target_names=target_names))
mat2 = scipy.io.loadmat('train_subject1_psd03.mat') X2 = mat1['X'] Y2 = mat1['Y'] mat_test = scipy.io.loadmat('test_subject1_psd04.mat') test_X = mat_test['X'] true_label = np.loadtxt('test_subject1_true_label.csv', delimiter=",") X = mat['X'] Y = mat['Y'] new_X = np.concatenate((X, X1, X2), axis=0) new_Y = np.concatenate((Y, Y1, Y2), axis=0) clf = MultinomialHMM() clf.fit(new_X, new_Y, len(new_X)) clf.set_params(decode="bestfirst") ans = clf.predict(test_X) print 'sub-1, custom', accuracy_score(ans, true_label) print confusion_matrix(true_label, ans) #1440/3504: subject 1 accuracy #start subject-2 sub2_1 = scipy.io.loadmat('train_subject2_psd01.mat') sub2_X1 = sub2_1['X'] sub2_Y1 = sub2_1['Y'] sub2_2 = scipy.io.loadmat('train_subject2_psd02.mat') sub2_X2 = sub2_2['X'] sub2_Y2 = sub2_2['Y']
# NOTE(review): the first line is the tail of a padding loop whose `for`
# header lies outside this chunk; it copies sequence j into padded slot i.
    padded_X[i][0:len(j)] = j
# HACK: leftover debugger breakpoint — halts every run; presumably should be removed.
import pdb; pdb.set_trace()
print("TOTAL NUMBER OF SAMPLES: ", len(padded_X))
# Separate into train and validation sets with a fixed 10000-sample split.
train_X = padded_X[:10000]
train_Y = Y[:10000]
val_X = padded_X[10000:]
val_Y = Y[10000:]
# Fit the model to training data.
# print evaluate(model.predict(val_X), val_Y, val_X)
# Tune on the dev set for the optimal number of hidden states.
best_accuracy = 0
best_number = 1
for n in range(50):
    # NOTE(review): `n` is passed as fit()'s third argument; for a seqlearn
    # HMM that slot is the sequence lengths, not a hidden-state count —
    # verify against the definition of `model` (outside this chunk).
    model.fit(train_X, train_Y, n)
    accuracy = evaluate(model.predict(val_X), val_Y, val_X)
    print("n = ", n)
    print(accuracy)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_number = n
# Try to figure out how the HMM is learning.
# Final evaluation with the best number of hidden states (best seen: 0.48).
model.fit(train_X, train_Y, best_number)
print "Accuray: " , evaluate(model.predict(val_X), val_Y, val_X)
print evaluate_with_output(model.predict(val_X), val_Y, val_X)
def hmm_pred(X, y):
    """Fit a hidden Markov model in which every sample is its own
    length-1 sequence, and return the fitted model."""
    model = MultinomialHMM(alpha=0.1)
    # One sequence per sample: a lengths vector of all ones.
    model.fit(X, y, lengths=np.ones(len(y), dtype=int))
    return model
length = len(dataFile['accX']) training_data_length.append([length, length, length]) # 3 items because X, Y, Z data training_data.append(data) if "updown" in trainingFile: training_labels.append("updown") elif "leftright" in trainingFile: training_labels.append("leftright") elif "rotateclock" in trainingFile: training_labels.append("rotateclockwise") print("label size:", len(training_data)) print("data size:", len(training_labels)) model.fit(training_data, training_labels, training_data_length) #----- testing ------- test_data = [] test_labels = [] test_data_length = [] files = getDataFileNames("test") for trainingFile in files: dataFile = pd.read_csv(DATA_FOLDER + trainingFile, header=0) data = [dataFile['accX'], dataFile['accY'], dataFile['accZ']] #data = [dataFile['alpha'], dataFile['beta'], dataFile['gamma'], dataFile['accX'], dataFile['accY'], dataFile['accZ']] length = len(dataFile['accX']) test_data_length.append([length, length, length])
from seqlearn.hmm import MultinomialHMM
from hmmlearn.hmm import GaussianHMM

# NOTE(review): `pd` is used here but `import pandas as pd` only appears at
# the bottom of this chunk — confirm an earlier import exists in the file.
input_data = pd.read_csv('../data/scaled_data/scaled_pca.csv')
# This lengths value is immediately overwritten four lines below.
lengths = [len(input_data)]
d1 = pd.read_csv('../data/train_subject1_psd01.csv',header=None)
d2 = pd.read_csv('../data/train_subject1_psd02.csv',header=None)
d3 = pd.read_csv('../data/train_subject1_psd03.csv',header=None)
#input_data = pd.concat([d1, d2, d3], axis=0)
# NOTE(review): with the concat above commented out, these lengths describe
# d1..d3 but are applied to the unrelated `input_data` — the sum of lengths
# presumably does not match len(input_data); verify which input was intended.
lengths = [len(d1), len(d2), len(d3)]
clf = MultinomialHMM()
clf.fit(input_data.iloc[:,:-1], input_data.iloc[:,-1], lengths)
pred = clf.predict(input_data.iloc[:,:-1])
# NOTE(review): ground truth taken from d3 only, while `pred` covers all of
# input_data — the element-wise comparison below presumably misaligns.
actual = d3.iloc[:,-1]
accuracy = sum(pred == actual)/float(len(actual))
print accuracy

# Random Forest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import svm
import pandas as pd
import numpy as np
# Saves the image into a TXT file for line in line_mapping: for word in line: if word["matrix"].shape[1] == 0: print "Zero matrix... Skipping..." else: f_handle = file('test.txt', 'a') np.savetxt(f_handle, word['matrix'], delimiter=" ", fmt="%i", newline=" ", header='', footer="" + word["word"] + "\n\n", comments='') f_handle.close() # # Extracts features from the datasets X_train, y_train, lengths_train = load_conll("test.txt", features) # # Models it as an HMM clf = MultinomialHMM() clf.fit(X_train, y_train, lengths_train) print X_train, y_train # Validation after training X_test, y_test, lengths_test = load_conll("test.txt", features) y_pred = clf.predict(X_test, lengths_test) print y_pred # # Final score # print(bio_f_score(y_test, y_pred))
model = MultinomialHMM()
# Encode the words and their pluralization classes.
X = pd.DataFrame()
y = pd.DataFrame()
# Class encoding: 0 -> plural = singular, 1 -> plural = singular + 's',
# 2 -> plural = singular + 'es'.
# Data preparation: index the letters such that a = 1, ..., z = 26.
w_class = 0
# (Removed two leftover `import pdb; pdb.set_trace()` breakpoints that
# halted every run.)
for index, row in all_data.iterrows():
    singular = row[0]
    plural = row[1]
    # Fix: was `plural[-2] == 'es'` — a single character can never equal a
    # two-character string, so the 'es' class was unreachable and every
    # 'es'-plural was mislabelled as class 1.  The 'es' check must come
    # first, since any word ending in 'es' also ends in 's'.
    if plural[-2:] == 'es':
        w_class = 2
    elif plural[-1] == 's':
        w_class = 1
    else:
        w_class = 0
    word_int = [alphabet_dict[let] for let in singular]
    # NOTE(review): DataFrame.append is removed in pandas >= 2.0 — kept for
    # behavioral parity; migrate to pd.concat if pandas is upgraded.
    X = X.append(word_int)
    y = y.append([w_class])
# NOTE(review): fit() expects per-sequence lengths here; confirm that a
# constant 10 matches the encoded word layout.
length = 10
# Fit the HMM.
model.fit(X, y, length)