def run():
    """Run the basket pipeline on a 20% sample and return KNN-DTW results.

    Loads the 'prior' baskets, keeps the second part returned by
    ``train_test_split(test_size=0.20, random_state=0)``, pushes the baskets
    through the project's cleaning helpers, splits them, and calls
    ``KnnDtw.predict`` with the embedding wrapper's two basket distances.
    Progress and intermediate values are printed along the way.

    Returns:
        tuple: ``(preds_all, distances)`` exactly as returned by
        ``KnnDtw.predict``.
    """
    embedder = EmbeddingWrapper('product')
    constructor = BasketConstructor('./data/', './data/')
    prior = constructor.get_baskets('prior', reconstruct=False)

    # Only the second element of the split is used from here on.
    _discarded, sample = train_test_split(prior,
                                          test_size=0.20,
                                          random_state=0)
    print(sample)

    baskets = sample.basket.values
    print(baskets)

    # Convert every item to a string.
    print("nested change")
    baskets = nested_change(list(baskets), str)

    print("embedding_wrapper.remove_products_wo_embeddings(all_baskets)")
    baskets = embedder.remove_products_wo_embeddings(baskets)

    print("uncommon products")
    baskets = remove_products_which_are_uncommon(baskets)

    print("short baskets")
    medium, baskets = remove_short_baskets(baskets)
    print(medium, baskets)

    # NOTE(review): lookup_ind_f presumably maps a product to its embedding
    # index -- confirm against EmbeddingWrapper.
    print("nested change")
    baskets = nested_change(baskets, embedder.lookup_ind_f)

    print("split_data")
    splits = split_data(baskets)
    (train_ub, val_ub_input, val_ub_target, test_ub_input,
     test_ub_target) = splits

    print('knndtw')
    model = KnnDtw(n_neighbors=[5])
    outcome = model.predict(train_ub, val_ub_input,
                            embedder.basket_dist_EMD,
                            embedder.basket_dist_REMD)
    preds_all, distances = outcome
    print(preds_all)
    print(distances)
    return preds_all, distances
def k_fold_cross_val(k_list, train, label, folds):
    """Recommend the k with the highest mean K-fold accuracy.

    The samples and labels are shuffled with one shared random permutation,
    then every candidate k is scored with ``KFold(n_splits=folds)`` using a
    ``KnnDtw(n_neighbors=k, max_warping_window=100)`` classifier. The mean
    accuracy per k is drawn as a bar chart on the current matplotlib figure.

    Args:
        train: Samples; indexed with integer arrays and read via ``.shape[0]``
            (assumes a NumPy array -- TODO confirm with callers).
        label: Labels aligned with ``train``, indexed the same way.
        k_list: Candidate neighbour counts; also used as bar positions/ticks.
        folds: Number of cross-validation folds.

    Returns:
        The element of ``k_list`` with the highest average accuracy (the
        first one on ties, per ``np.argmax``).
    """
    # One permutation applied to both arrays keeps sample/label pairs aligned.
    seed = np.arange(train.shape[0])
    np.random.shuffle(seed)
    train = train[seed]
    label = label[seed]

    kf = KFold(n_splits=folds)

    k_scores = []  # mean accuracy per k, same order as k_list
    for K in k_list:
        fold_scores = []  # one accuracy per fold for this k
        for train_index, test_index in kf.split(train):
            print("TRAIN:", train_index, "TEST:", test_index)
            X_train, X_test = train[train_index], train[test_index]
            y_train, y_test = label[train_index], label[test_index]

            # A fresh estimator per fold guarantees no state carries over,
            # which is what the previous deepcopy of an unfit model achieved.
            clf = KnnDtw(n_neighbors=K, max_warping_window=100)
            clf.fit(X_train, y_train)
            labels, proba = clf.predict(X_test)

            acc = accuracy_score(y_test, labels)
            print('Accuracy for this fold is:', acc)
            fold_scores.append(acc)

        k_scores.append(np.average(np.array(fold_scores)))

    # Plot the average accuracy for each k and recommend the best one.
    k_best = k_list[np.argmax(k_scores)]
    plt.bar(k_list, k_scores, width=0.2)
    plt.xlabel('k (nearest neighbors)')
    plt.ylabel('Accuracy (average)')
    plt.xticks(k_list)
    print('Best k value from list is:', k_best)
    return k_best
def machine_learning(x_test_final, x_train_final, y_train_final, labels,
                     param_weights, para_weights, param_list, k_value):
    """Classify with one KNN-DTW model per parameter and combine by voting.

    For every index ``i`` a ``KnnDtw(n_neighbors=k_value,
    max_warping_window=100)`` model is fitted on ``x_train_final[i]`` /
    ``y_train_final[i]`` and used to predict ``x_test_final[i]``. The
    per-parameter predictions are then merged per test sample: a plain mode
    when ``param_weights`` is ``None``, otherwise a weighted vote.

    Args:
        x_test_final: Sequence of test sets, one per parameter.
        x_train_final: Sequence of training sets, one per parameter.
        y_train_final: Sequence of training label sets, one per parameter.
        labels: Collection of classes; only ``len(labels)`` is used, to size
            the vote tally. Predicted labels are treated as 1-based indices.
        param_weights: ``None`` selects the unweighted mode; any other value
            selects weighted voting.
        para_weights: Mapping from parameter name to weight (read only in
            weighted mode).
        param_list: Parameter names, aligned with the ``*_final`` sequences.
        k_value: Number of neighbours for every model.

    Returns:
        tuple: ``(param_labels, para_mode)``. ``param_labels`` is a NumPy
        array of the per-parameter predictions (``(label, weight)`` pairs in
        weighted mode). ``para_mode`` holds the winning label per test
        sample: a 1-D array in unweighted mode, a list in weighted mode.
    """
    weighted = param_weights is not None

    param_labels = []
    for i, x_train in enumerate(x_train_final):
        m = KnnDtw(n_neighbors=k_value, max_warping_window=100)
        m.fit(np.array(x_train), np.array(y_train_final[i]))
        label, proba = m.predict(np.array(x_test_final[i]))

        if weighted:
            # Pair every predicted label with this parameter's weight.
            weight = para_weights[param_list[i]]
            param_labels.append([(lab, weight) for lab in label])
        else:
            param_labels.append(label)

    param_labels = np.array(param_labels)

    if not weighted:
        # stats.mode returns a (1, n) array on older SciPy and an (n,) array
        # on newer releases; flattening handles both.
        para_mode = np.reshape(stats.mode(param_labels, axis=0)[0], (-1, ))
    else:
        para_mode = []
        for i in range(param_labels.shape[1]):
            # Accumulated weight per class for test sample i.
            mode_count = [0] * len(labels)
            for lab, weight in param_labels[:, i]:
                # Labels become floats once stored next to the weights, so
                # cast back to int before using them as a list index.
                mode_count[int(lab) - 1] += weight
            # The class with the largest accumulated weight wins (1-based).
            para_mode.append(mode_count.index(max(mode_count)) + 1)

    return param_labels, para_mode
def run():
    """Run the basket pipeline on all 'prior' baskets and return predictions.

    Loads the baskets, passes them through the project's cleaning helpers in
    sequence, splits them, and calls ``KnnDtw.predict`` with the embedding
    wrapper's two basket distance functions.

    Returns:
        tuple: ``(preds_all, distances)`` exactly as returned by
        ``KnnDtw.predict``.
    """
    embedder = EmbeddingWrapper('product')
    constructor = BasketConstructor('./data/', './data/')
    prior = constructor.get_baskets('prior', reconstruct=False)

    # Cleaning chain: each step consumes the previous step's result.
    baskets = nested_change(list(prior.basket.values), str)
    baskets = embedder.remove_products_wo_embeddings(baskets)
    baskets = remove_products_which_are_uncommon(baskets)
    baskets = remove_short_baskets(baskets)
    # NOTE(review): lookup_ind_f presumably maps a product to its embedding
    # index -- confirm against EmbeddingWrapper.
    baskets = nested_change(baskets, embedder.lookup_ind_f)

    splits = split_data(baskets)
    (train_ub, val_ub_input, val_ub_target, test_ub_input,
     test_ub_target) = splits

    model = KnnDtw(n_neighbors=[5])
    outcome = model.predict(train_ub, val_ub_input,
                            embedder.basket_dist_EMD,
                            embedder.basket_dist_REMD)
    preds_all, distances = outcome
    return preds_all, distances
def param_ranking(param_list, k_val, warp_val, datapath, avg_type):
    """Print the parameters ranked by KNN-DTW precision, best first.

    For every name in ``param_list`` the files ``train_<name>.txt`` and
    ``test_<name>.txt`` plus the shared ``train_labels.txt`` and
    ``test_labels.txt`` are read, a ``KnnDtw`` model is fitted and scored
    with ``score`` (module-level name), and the precision is recorded.

    Args:
        param_list: Parameter names used to build the data file names.
        k_val: ``n_neighbors`` passed to ``KnnDtw``.
        warp_val: ``max_warping_window`` passed to ``KnnDtw``.
        datapath: Prefix concatenated directly in front of each file name,
            so it must already end with a path separator.
        avg_type: Forwarded to ``score`` as ``average``.

    Returns:
        None. The ranking and the elapsed time are printed.
    """
    start_time = time.time()

    def load_series(path):
        # One whitespace-separated series of floats per line.
        with open(path, 'r') as handle:
            return np.array([[float(ts) for ts in line.split()]
                             for line in handle])

    def load_labels(path):
        # One integer class label per line.
        with open(path, 'r') as handle:
            return np.array([int(line.rstrip('\n')) for line in handle])

    precisions = []
    for dataparam in param_list:
        x_train = load_series(datapath + 'train_' + dataparam + '.txt')
        y_train = load_labels(datapath + 'train_labels.txt')
        x_test = load_series(datapath + 'test_' + dataparam + '.txt')
        y_test = load_labels(datapath + 'test_labels.txt')

        m = KnnDtw(n_neighbors=k_val, max_warping_window=warp_val)
        m.fit(x_train, y_train)
        label, proba = m.predict(x_test)

        # score also yields recall and f-score; only precision is ranked.
        precision, _recall, _f_score, _ = score(y_test,
                                                label,
                                                average=avg_type)
        precisions.append(precision)

    # Sort ascending and walk it backwards so ties keep the original
    # (reverse-of-input) order.
    precision_rank = sorted(zip(param_list, precisions), key=lambda x: x[1])

    print('Ranking for k = %s, max warping window = %s' % (k_val, warp_val))
    for rank in reversed(precision_rank):
        print(rank[0], ": ", rank[1])
    print("--- %s seconds ---" % (time.time() - start_time))
#Plot Test data if(plotdata): plt.figure(figsize=(11,7)) colors = ['#D62728','#2C9F2C','#FD7F23','#1F77B4','#9467BD', '#8C564A','#7F7F7F','#1FBECF','#E377C2','#BCBD27'] for i, r in enumerate([0,1,2,3,4,5,6,7,8,9]): plt.subplot(5,2,i+1) plt.plot(x_test[r], label=labels[y_test[r]], color=colors[i], linewidth=2) plt.xlabel('Samples @50Hz') plt.legend(loc='upper left') plt.tight_layout() #Analyze dataset m = KnnDtw(n_neighbors, max_warping_window) m.fit(x_train,y_train) label, proba = m.predict(x_test) #Classification report from sklearn.metrics import classification_report, confusion_matrix print(classification_report(label, y_test, target_names=[l for l in labels.values()])) #Confusion Matrix conf_mat = confusion_matrix(label, y_test) fig = plt.figure(figsize=(8,8)) width = np.shape(conf_mat)[1] height = np.shape(conf_mat)[0]
import matplotlib

# Select the backend before pyplot is imported. The original called
# matplotlib.use() without ever binding the name `matplotlib`.
matplotlib.use('TkAgg')

import matplotlib.pyplot as plt
import numpy as np  # needed for np.arange below

from data import getData
from knndtw import KnnDtw

if __name__ == "__main__":
    k_range = 50
    accuracy, k_list = [], []
    range_list = [50, 100, 200, 300]
    duration_list = np.arange(0.5, 2.5, 0.5).tolist()

    sample_dict = getData(random_range=range_list,
                          DURATION_TO_EXAMINE=duration_list)
    classifier = KnnDtw(k_range, sample_dict, random_range=range_list)
    # dicts = classifier.get_pos_dict(50)
    # dicts2 = classifier.get_label_dict(50)
    # dict3 = classifier.get_n_neighbors_dict(50)
    acc = classifier.merge_view()

    # Drawing: one line per key of `acc`, x axis counting from 1.
    fig = plt.figure(figsize=(12, 4))
    for key in acc:
        plt.plot(list(range(1, len(acc[key]) + 1)),
                 list(acc[key]),
                 lw=1,
                 marker='o',
                 label=str(key))
    plt.legend()
## trainer.py ##
## "Builds model.p using data found in .\data\training" ##
import os
import pickle
from numpy import array
from knndtw import KnnDtw
from imudata import IMUData

FALL = 1
NOT_FALL = 2

# Built with os.path.join so the directory resolves on POSIX as well as on
# Windows (where it yields the original ".\data\training" spelling).
TRAINING_DIR = os.path.join(".", "data", "training")


def _collect_files(directory):
    """Return the path of every file below ``directory`` (bottom-up walk)."""
    return [
        os.path.join(root, name)
        for root, _dirs, files in os.walk(directory, topdown=False)
        for name in files
    ]


# Change parameters here! Only subsample_step is overridden.
model = KnnDtw(subsample_step=5)

fall_files = _collect_files(os.path.join(TRAINING_DIR, "fall"))
not_fall_files = _collect_files(os.path.join(TRAINING_DIR, "not fall"))

# Fall recordings come first, so the labels line up index-for-index.
train_files = fall_files + not_fall_files
labels = [FALL] * len(fall_files) + [NOT_FALL] * len(not_fall_files)

train_data = list()
for file in train_files:
    print("Reading", file)
    data = IMUData()
# Load the stored neighbour series and their labels.
freqs = np.load('processed/series_freqs.npy')
masses = np.load('processed/series_masses.npy')
labels = np.load('processed/series_labels.npy')

print(" ")
print("=============================")
print(" INSTANTIATING KNN-DTW MODEL ")
print("=============================")
print(" ")
print("Loading List of Neighbours...")
print("freqs:", freqs.shape)
print("masses:", masses.shape)
print("labels:", labels.shape)
print(" ")

# One classifier per feature series, both trained on the same labels.
freq_model = KnnDtw(n_neighbors=10, max_warping_window=10)
freq_model.fit(freqs, labels)
mass_model = KnnDtw(n_neighbors=10, max_warping_window=10)
mass_model.fit(masses, labels)

# Setup client socket
s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect(('localhost', 8888))
# s2 = socket.socket(socket.AF_INET, socket.SOCK_STREAM)  # Second connection
# s2.connect(('localhost', 8890))

# Identify this client to the server and read its reply.
s.send("series".encode())
parse = s.recv(1024).decode()

# The original test `parse == "yes" or "no"` was always true because the
# non-empty string "no" is truthy; compare against both replies explicitly.
# NOTE(review): confirm that a "no" reply should also count as success.
if parse.strip() in ("yes", "no"):
    print("Authorisation Success.")
    print("Series Classifier Ready.")
def multi_param_learn(param_list, param_weights, k_value, datapath):
    """Classify with one KNN-DTW model per parameter and report the vote.

    For every name in ``param_list`` the files ``train_<name>.txt`` and
    ``test_<name>.txt`` plus the shared ``train_labels.txt`` and
    ``test_labels.txt`` are read and a ``KnnDtw`` model (with
    ``max_warping_window=100``) predicts the test set. The per-parameter
    predictions are merged per test sample, by plain mode or by weighted
    vote, and compared with the test labels in a printed classification
    report and a confusion-matrix figure.

    Args:
        param_list: Parameter names used to build the data file names.
        param_weights: ``None`` for an unweighted mode, otherwise one weight
            per entry of ``param_list``.
        k_value: Sequence of ``n_neighbors`` values, paired with
            ``param_list`` by position.
        datapath: Prefix concatenated directly in front of each file name,
            so it must already end with a path separator.

    Returns:
        None. Results are printed and drawn on a new matplotlib figure.

    Raises:
        Exception: If weights are given but their count differs from the
            number of parameters.
        ValueError: If ``param_list`` is empty.
    """
    from sklearn.metrics import classification_report, confusion_matrix

    start_time = time.time()

    weighted = param_weights is not None
    if weighted and len(param_list) != len(param_weights):
        raise Exception(
            'When using weights, there must one weight for each parameter!')
    if len(param_list) == 0:
        raise ValueError('param_list must contain at least one parameter')

    para_weights = dict(zip(param_list, param_weights)) if weighted else {}
    para_k = dict(zip(param_list, k_value))

    # Mapping table for classes
    labels = {
        1: 'Hover',
        2: 'Impact (Front Left)',
        3: 'Impact (Front Right)',
        4: 'Impact (Back Left)',
        5: 'Impact (Back Right)',
        6: 'Gust (from Left)',
        7: 'Gust (from Right)',
        8: 'Gust (from front)'
    }
    class_ids = list(labels)
    class_names = list(labels.values())

    def load_series(path):
        # One whitespace-separated series of floats per line.
        with open(path, 'r') as handle:
            return np.array([[float(ts) for ts in line.split()]
                             for line in handle])

    def load_labels(path):
        # One integer class label per line.
        with open(path, 'r') as handle:
            return np.array([int(line.rstrip('\n')) for line in handle])

    predictions = []  # one array of predicted labels per parameter
    weights = []  # weight of each parameter, aligned with `predictions`
    y_test = None
    for dataparam in param_list:
        x_train = load_series(datapath + 'train_' + dataparam + '.txt')
        y_train = load_labels(datapath + 'train_labels.txt')
        x_test = load_series(datapath + 'test_' + dataparam + '.txt')
        y_test = load_labels(datapath + 'test_labels.txt')

        print('Algorithm running for param %s with k value %i' %
              (dataparam, para_k[dataparam]))
        m = KnnDtw(n_neighbors=para_k[dataparam], max_warping_window=100)
        m.fit(x_train, y_train)
        label, proba = m.predict(x_test)

        predictions.append(label)
        if weighted:
            weights.append(para_weights[dataparam])

    predictions = np.array(predictions)

    if not weighted:
        # stats.mode returns a (1, n) array on older SciPy and an (n,) array
        # on newer releases; flattening handles both.
        para_mode = np.reshape(stats.mode(predictions, axis=0)[0], (-1, ))
    else:
        para_mode = []
        for sample_votes in predictions.T:
            # Accumulated weight per class for this test sample.
            mode_count = [0] * len(labels)
            for lab, weight in zip(sample_votes, weights):
                mode_count[int(lab) - 1] += weight
            # The class with the largest accumulated weight wins (1-based).
            para_mode.append(mode_count.index(max(mode_count)) + 1)

    # ASSUMPTION: the vote is compared with the y_test read for the last
    # parameter only. This relies on test_labels.txt being valid for every
    # parameter, i.e. train/test data of all parameters covering the same
    # time period.
    # Pinning the class ids keeps the report rows and the matrix axes
    # aligned with the 8 names even when a class is absent from the data.
    print(
        classification_report(y_test,
                              para_mode,
                              labels=class_ids,
                              target_names=class_names))

    # Confusion matrix: rows are the voted labels, columns the test labels.
    conf_mat = confusion_matrix(para_mode, y_test, labels=class_ids)
    plt.figure(figsize=(8, 8))
    plt.imshow(np.array(conf_mat),
               cmap=plt.cm.summer,
               interpolation='nearest')
    for i, row in enumerate(conf_mat):
        for j, c in enumerate(row):
            if c > 0:
                plt.text(j - .2, i + .1, c, fontsize=16)

    plt.title('Confusion Matrix for ' + ', '.join([name for name in param_list]))
    plt.xlabel('Data')
    plt.ylabel('ML Identification')
    # One tick per class; the original used range(9) for 8 names.
    tick_positions = range(len(labels))
    plt.xticks(tick_positions, class_names, rotation=90)
    plt.yticks(tick_positions, class_names)

    # Print how long this function ran.
    print("Runtime was %s seconds" % (time.time() - start_time))
y_test.append(int(y.rstrip('\n'))) #close data files x_train_file.close() y_train_file.close() x_test_file.close() y_test_file.close() # Convert to numpy for efficiency x_train = np.array(x_train) y_train = np.array(y_train) x_test = np.array(x_test) y_test = np.array(y_test) #Analyze dataset m = KnnDtw(n_neighbors=3, max_warping_window=500) m.fit(x_train[::trainsample], y_train) label, proba = m.predict(x_test) #Classification report from sklearn.metrics import classification_report, confusion_matrix print( classification_report(label, y_test, target_names=[l for l in labels.values()])) #Confusion Matrix conf_mat = confusion_matrix(label, y_test) fig = plt.figure(figsize=(7, 7)) width = np.shape(conf_mat)[1]
num += 1 acc_num += 1 recall += 1 precision += 1 if result == "falsepos": precision += 1 if result == "trueneg": acc_num += 1 if result == "falseneg": recall += 1 return (acc_num / len(results), num / precision, num / recall) FALL = 1 NOT_FALL = 2 model = KnnDtw() # change parameters here! using defaults currently train_files = list() scrapeNames(train_files, r".\data\training\fall") labels = [FALL] * len(train_files) scrapeNames(train_files, r".\data\training\not fall") labels.extend([NOT_FALL] * (len(train_files) - len(labels))) train_data = list() for file in train_files: print("Reading", file) data = IMUData() with open(file, 'r') as f: for i, line in enumerate(f): data.append(line) if i == 112: