def run():
    """Run the basket KNN-DTW pipeline on a 20% sample, printing each stage.

    The 'prior' baskets are loaded, reduced to the second part of a
    fixed-seed ``train_test_split`` (``test_size=0.20``), filtered,
    converted with ``embedding_wrapper.lookup_ind_f``, split, and finally
    passed to ``KnnDtw.predict`` together with the two basket distance
    functions of the embedding wrapper.

    Returns:
        The ``(preds_all, distances)`` pair produced by ``KnnDtw.predict``.
    """
    # An alternative embedding name used previously: 'tafeng_products'.
    wrapper = EmbeddingWrapper('product')
    constructor = BasketConstructor('./data/', './data/')
    user_baskets = constructor.get_baskets('prior', reconstruct=False)

    # Only the second element of the split is kept; the first is discarded.
    _, user_baskets = train_test_split(
        user_baskets, test_size=0.20, random_state=0)
    print(user_baskets)

    baskets = user_baskets.basket.values
    print(baskets)

    # Turn every item into a string.
    print("nested change")
    baskets = nested_change(list(baskets), str)

    print("embedding_wrapper.remove_products_wo_embeddings(all_baskets)")
    baskets = wrapper.remove_products_wo_embeddings(baskets)

    print("uncommon products")
    baskets = remove_products_which_are_uncommon(baskets)

    print("short baskets")
    medium_baskets, baskets = remove_short_baskets(baskets)
    print(medium_baskets, baskets)

    print("nested change")
    baskets = nested_change(baskets, wrapper.lookup_ind_f)

    print("split_data")
    (train_ub, val_ub_input, val_ub_target,
     test_ub_input, test_ub_target) = split_data(baskets)

    print('knndtw')
    classifier = KnnDtw(n_neighbors=[5])
    predictions, dists = classifier.predict(
        train_ub,
        val_ub_input,
        wrapper.basket_dist_EMD,
        wrapper.basket_dist_REMD,
    )
    print(predictions)
    print(dists)
    return predictions, dists
def machine_learning(x_test_final, x_train_final, y_train_final, labels,
                     param_weights, para_weights, param_list, k_value):
    """Classify test series per parameter with KNN-DTW and combine the votes.

    One ``KnnDtw`` model (``max_warping_window=100``) is fitted for every
    index of ``x_train_final`` and used to label the matching entry of
    ``x_test_final``. The per-parameter labels are then combined into one
    label per test series, either by plain mode or by weighted voting.

    Args:
        x_test_final: Indexable collection of test datasets, one per
            parameter.
        x_train_final: Indexable collection of training datasets, one per
            parameter; its length drives the loop.
        y_train_final: Indexable collection of training labels, one per
            parameter.
        labels: Collection of the possible classes; only ``len(labels)`` is
            used, to size the weighted vote counter.
        param_weights: ``None`` selects unweighted (mode) voting; any other
            value selects weighted voting.
        para_weights: Mapping from parameter name to weight; only read when
            ``param_weights`` is not ``None``.
        param_list: Parameter names; ``param_list[i]`` is the key looked up
            in ``para_weights`` for dataset ``i``.
        k_value: Passed to ``KnnDtw`` as ``n_neighbors``.

    Returns:
        A ``(param_labels, para_mode)`` tuple. ``param_labels`` is the NumPy
        array of per-parameter predictions (``(label, weight)`` pairs in the
        weighted case). ``para_mode`` holds the combined label for each test
        series: a NumPy array for mode voting, a list for weighted voting.
    """
    # Identity test: `== None` is evaluated element-wise on array-likes.
    use_weights = param_weights is not None

    param_labels = []
    for i in range(len(x_train_final)):
        x_train = np.array(x_train_final[i])
        y_train = np.array(y_train_final[i])
        x_test = np.array(x_test_final[i])

        m = KnnDtw(n_neighbors=k_value, max_warping_window=100)
        m.fit(x_train, y_train)
        label, _ = m.predict(x_test)

        if use_weights:
            # Pair every predicted label with this parameter's weight.
            weight = para_weights[param_list[i]]
            param_labels.append([(lbl, weight) for lbl in label])
        else:
            param_labels.append(label)

    param_labels = np.array(param_labels)

    if not use_weights:
        para_mode, _ = stats.mode(param_labels)
        # Flatten regardless of whether stats.mode kept the reduced axis.
        para_mode = np.reshape(para_mode, (-1,))
    else:
        n_series = param_labels.shape[1]
        para_mode = [0] * n_series
        for i in range(n_series):
            # Accumulated weight per class; class k is stored at index k - 1.
            mode_count = [0] * len(labels)
            for p in param_labels[:, i]:
                # int(): with float weights the array holds float labels,
                # which cannot index a list directly.
                mode_count[int(p[0] - 1)] += p[1]
            # The class with the largest accumulated weight wins.
            para_mode[i] = mode_count.index(max(mode_count)) + 1

    return param_labels, para_mode
def run():
    """Run the basket KNN-DTW pipeline on all 'prior' baskets.

    The baskets are stringified, filtered (products without embeddings,
    uncommon products, short baskets), converted with
    ``embedding_wrapper.lookup_ind_f``, split, and the validation inputs are
    predicted with ``KnnDtw`` using the wrapper's two basket distances.

    Returns:
        The ``(preds_all, distances)`` pair produced by ``KnnDtw.predict``.
    """
    wrapper = EmbeddingWrapper('product')
    constructor = BasketConstructor('./data/', './data/')
    raw_baskets = constructor.get_baskets('prior', reconstruct=False)

    # Filtering pipeline: each step consumes the previous step's result.
    baskets = nested_change(list(raw_baskets.basket.values), str)
    baskets = wrapper.remove_products_wo_embeddings(baskets)
    baskets = remove_products_which_are_uncommon(baskets)
    baskets = remove_short_baskets(baskets)
    baskets = nested_change(baskets, wrapper.lookup_ind_f)

    (train_ub, val_ub_input, val_ub_target,
     test_ub_input, test_ub_target) = split_data(baskets)

    classifier = KnnDtw(n_neighbors=[5])
    predictions, dists = classifier.predict(
        train_ub,
        val_ub_input,
        wrapper.basket_dist_EMD,
        wrapper.basket_dist_REMD,
    )
    return predictions, dists
def param_ranking(param_list, k_val, warp_val, datapath, avg_type):
    """Print the data parameters ranked by KNN-DTW classification precision.

    For every name in ``param_list`` the files ``train_<name>.txt`` and
    ``test_<name>.txt`` are read together with ``train_labels.txt`` and
    ``test_labels.txt``. All paths are built by plain string concatenation
    with ``datapath``, so it must already end with a path separator. A
    ``KnnDtw`` model is fitted per parameter and its predictions on the test
    set are scored with ``score``.

    Args:
        param_list: Names of the data parameters to evaluate.
        k_val: Passed to ``KnnDtw`` as ``n_neighbors``.
        warp_val: Passed to ``KnnDtw`` as ``max_warping_window``.
        datapath: Prefix prepended to every data file name.
        avg_type: Passed to ``score`` as ``average``.

    Returns:
        None. The ranking (highest precision first) and the elapsed time are
        printed.
    """
    # NOTE(review): `score` is presumably sklearn's
    # precision_recall_fscore_support; confirm against the file's imports.
    start_time = time.time()
    precisions = []

    for dataparam in param_list:
        # Context managers close all four files even if parsing raises.
        with open(datapath + 'train_' + dataparam + '.txt', 'r') as x_train_file, \
                open(datapath + 'train_labels.txt', 'r') as y_train_file, \
                open(datapath + 'test_' + dataparam + '.txt', 'r') as x_test_file, \
                open(datapath + 'test_labels.txt', 'r') as y_test_file:
            # One whitespace-separated series per line; one integer label
            # per line (int() ignores the trailing newline).
            x_train = np.array(
                [[float(ts) for ts in line.split()] for line in x_train_file])
            y_train = np.array([int(line) for line in y_train_file])
            x_test = np.array(
                [[float(ts) for ts in line.split()] for line in x_test_file])
            y_test = np.array([int(line) for line in y_test_file])

        m = KnnDtw(n_neighbors=k_val, max_warping_window=warp_val)
        m.fit(x_train, y_train)
        label, _ = m.predict(x_test)

        # Only precision is ranked; recall and f-score are not reported.
        precision, _, _, _ = score(y_test, label, average=avg_type)
        precisions.append(precision)

    # Ascending stable sort, walked backwards, so the best comes first and
    # ties keep the same order as the original implementation.
    precision_rank = sorted(zip(param_list, precisions),
                            key=lambda pair: pair[1])

    print('Ranking for k = %s, max warping window = %s' % (k_val, warp_val))
    for name, value in reversed(precision_rank):
        print(name, ": ", value)

    print("--- %s seconds ---" % (time.time() - start_time))
# Optionally plot the first ten test series, one subplot per series.
# NOTE(review): the source had lost its indentation; the extent of this
# `if` body is inferred - confirm against the original layout.
if plotdata:
    plt.figure(figsize=(11, 7))
    colors = ['#D62728', '#2C9F2C', '#FD7F23', '#1F77B4', '#9467BD',
              '#8C564A', '#7F7F7F', '#1FBECF', '#E377C2', '#BCBD27']
    for i, r in enumerate(range(10)):
        plt.subplot(5, 2, i + 1)
        plt.plot(x_test[r], label=labels[y_test[r]], color=colors[i],
                 linewidth=2)
        plt.xlabel('Samples @50Hz')
        plt.legend(loc='upper left')
        plt.tight_layout()

# Analyze dataset: fit on the training data and label the test data.
m = KnnDtw(n_neighbors, max_warping_window)
m.fit(x_train, y_train)
label, proba = m.predict(x_test)

# Classification report
from sklearn.metrics import classification_report, confusion_matrix

# classification_report expects (y_true, y_pred); passing the predictions
# first swaps the reported precision and recall.
print(classification_report(y_test, label,
                            target_names=list(labels.values())))

# Confusion matrix: rows follow the predictions, columns the true labels.
# NOTE(review): argument order left as found; confirm it matches the axis
# labels applied after this fragment.
conf_mat = confusion_matrix(label, y_test)

fig = plt.figure(figsize=(8, 8))
width = np.shape(conf_mat)[1]
height = np.shape(conf_mat)[0]

res = plt.imshow(np.array(conf_mat), cmap=plt.cm.summer,
                 interpolation='nearest')
print("Searching for KNN-DTW...")
print(" ")

# Read the CSV once, converting the 'Freq' and 'Mass' columns of each row.
with open(path, "r") as f:
    reader = csv.DictReader(f)
    samples = [(float(row['Freq']), float(row['Mass'])) for row in reader]

# Each model receives a single series: a 2-D array with one row.
freq_list = np.array([[freq_value for freq_value, _ in samples]])
mass_list = np.array([[mass_value for _, mass_value in samples]])

# Only the third value returned by predict() is used here.
_, _, freq_knn = freq_model.predict(freq_list)
_, _, mass_knn = mass_model.predict(mass_list)
freq_knn = freq_knn[0]
mass_knn = mass_knn[0]

print("Freq:", freq_knn)
print("Mass:", mass_knn)
print(" ")

# Share of entries carrying each class code, over a fixed denominator of 10.
# NOTE(review): 10 is presumably the number of neighbours returned - confirm.
# A fourth class ('paper', code 3) was previously disabled for freq.
material_codes = (('plastic', 0), ('metal', 1), ('glass', 2))

freq_probability = {
    material: int((freq_knn == code).sum()) / 10
    for material, code in material_codes
}

mass_probability = {
    material: int((mass_knn == code).sum()) / 10
    for material, code in material_codes
}
def multi_param_learn(param_list, param_weights, k_value, datapath):
    """Classify the test set by voting across several data parameters.

    For every name in ``param_list`` a ``KnnDtw`` model
    (``max_warping_window=100``) is fitted on ``train_<name>.txt`` /
    ``train_labels.txt`` and used to label ``test_<name>.txt``. The
    per-parameter labels are combined per test series, by plain mode when
    ``param_weights`` is ``None`` and by weighted vote otherwise. A
    classification report is printed and a confusion matrix is drawn with
    matplotlib.

    All paths are built by plain string concatenation with ``datapath``, so
    it must already end with a path separator. The label files do not depend
    on the parameter name, so every parameter is scored against the same
    ``test_labels.txt``.
    NOTE(review): this presumes the rows of all per-parameter data files
    describe the same samples in the same order - confirm.

    Args:
        param_list: Names of the data parameters to combine.
        param_weights: ``None`` for unweighted voting, otherwise one weight
            per entry of ``param_list``.
        k_value: One ``n_neighbors`` value per entry of ``param_list``.
        datapath: Prefix prepended to every data file name.

    Returns:
        None. Results are printed and plotted.

    Raises:
        ValueError: If ``param_list`` is empty, or weights are given but
            their count differs from the number of parameters.
    """
    start_time = time.time()

    # Identity test: `!= None` is evaluated element-wise on array-likes.
    use_weights = param_weights is not None
    if use_weights:
        if len(param_list) != len(param_weights):
            raise ValueError(
                'When using weights, there must one weight for each parameter!'
            )
        para_weights = dict(zip(param_list, param_weights))
    if len(param_list) == 0:
        # Without any parameter there are no labels to aggregate or score.
        raise ValueError('param_list must contain at least one parameter')

    para_k = dict(zip(param_list, k_value))

    # Mapping table for classes
    labels = {
        1: 'Hover',
        2: 'Impact (Front Left)',
        3: 'Impact (Front Right)',
        4: 'Impact (Back Left)',
        5: 'Impact (Back Right)',
        6: 'Gust (from Left)',
        7: 'Gust (from Right)',
        8: 'Gust (from front)'
    }

    # Labels returned for each parameter, collected before aggregating.
    param_labels = []
    for dataparam in param_list:
        # Context managers close all four files even if parsing raises.
        with open(datapath + 'train_' + dataparam + '.txt', 'r') as x_train_file, \
                open(datapath + 'train_labels.txt', 'r') as y_train_file, \
                open(datapath + 'test_' + dataparam + '.txt', 'r') as x_test_file, \
                open(datapath + 'test_labels.txt', 'r') as y_test_file:
            # One whitespace-separated series per line; one integer label
            # per line (int() ignores the trailing newline).
            x_train = np.array(
                [[float(ts) for ts in line.split()] for line in x_train_file])
            y_train = np.array([int(line) for line in y_train_file])
            x_test = np.array(
                [[float(ts) for ts in line.split()] for line in x_test_file])
            y_test = np.array([int(line) for line in y_test_file])

        print('Algorithm running for param %s with k value %i' %
              (dataparam, para_k[dataparam]))
        m = KnnDtw(n_neighbors=para_k[dataparam], max_warping_window=100)
        m.fit(x_train, y_train)
        label, _ = m.predict(x_test)

        if use_weights:
            # Pair every predicted label with this parameter's weight.
            weight = para_weights[dataparam]
            param_labels.append([(lbl, weight) for lbl in label])
        else:
            param_labels.append(label)

    param_labels = np.array(param_labels)

    if not use_weights:
        para_mode, _ = stats.mode(param_labels)
        # Flatten regardless of whether stats.mode kept the reduced axis.
        para_mode = np.reshape(para_mode, (-1,))
    else:
        n_series = param_labels.shape[1]
        para_mode = [0] * n_series
        for i in range(n_series):
            # Accumulated weight per class; class k is stored at index k - 1.
            mode_count = [0] * len(labels)
            for p in param_labels[:, i]:
                mode_count[int(p[0] - 1)] += p[1]
            # The class with the largest accumulated weight wins.
            para_mode[i] = mode_count.index(max(mode_count)) + 1

    # Classification report
    from sklearn.metrics import classification_report, confusion_matrix

    class_names = list(labels.values())
    print(classification_report(y_test, para_mode, target_names=class_names))

    # Confusion matrix: rows follow the voted labels ('ML Identification'),
    # columns follow the true labels ('Data').
    conf_mat = confusion_matrix(para_mode, y_test)

    plt.figure(figsize=(8, 8))
    plt.imshow(np.array(conf_mat), cmap=plt.cm.summer,
               interpolation='nearest')

    # Write each non-zero count into its cell.
    for i, row in enumerate(conf_mat):
        for j, c in enumerate(row):
            if c > 0:
                plt.text(j - .2, i + .1, c, fontsize=16)

    plt.title('Confusion Matrix for ' + ', '.join(param_list))
    plt.xlabel('Data')
    plt.ylabel('ML Identification')
    # One tick per class: the tick count has to match the label count.
    tick_positions = range(len(labels))
    plt.xticks(tick_positions, class_names, rotation=90)
    plt.yticks(tick_positions, class_names)

    # Print how long this function ran
    print("Runtime was %s seconds" % (time.time() - start_time))
labels.extend([NOT_FALL] * (len(test_files) - len(labels))) results = [list(), list(), list(), list(), list()] certainty = [list(), list(), list(), list(), list()] times = [list(), list(), list(), list(), list()] for i, file in enumerate(test_files): print("Predicting for", file) data = IMUData() with open(file, 'r') as f: for line in f: data.append(line) a = [array(data.kalmanX), array(data.kalmanY), array(data.x), array(data.y), array(data.z)] for j in range(1,6): # testing subsample_step 1 through 5 model.subsample_step = j t = time.time() result = model.predict(a) times[j-1].append(time.time() - t) results[j-1].append(labelResult(labels[i], result.mode[0])) certainty[j-1].append(result.count[0][0] / 5) csv_name = ".\\test results\\test" + str(round(time.time())) + ".csv" headers = ["train","test","step","accuracy","precision","recall","certainty","time"] with open(csv_name, "w", newline='') as f: writer = csv.writer(f) writer.writerow(headers) for i in range(0,5): acc, pre, rec = accuracy(results[i]) writer.writerow([ str(len(train_files)), str(len(test_files)), str(i + 1),