def run():
    """Train and evaluate an RBN on each data set with three center-selection
    strategies (edited k-NN, k-means, k-medoids), writing results to a file.

    In order to process the data, run the data processing file first: this
    reads the "<name>_processed" files from ./processed and appends MSE /
    F-score results to ./results/results_rbn.txt.
    """
    # Data sets and the number of classes in each; a class count of 1 marks
    # a regression problem, more than 1 marks classification.
    df_list = ["abalone", "car", "segmentation", "machine", "forestfires", "wine"]
    df_class_num = [3, 4, 7, 1, 1, 1]

    # "with" guarantees the results file is closed even if training raises.
    with open("./results/results_rbn.txt", "a+") as results_file:
        for name, num_classes in zip(df_list, df_class_num):
            data_array = Data_Processing_Lists("./processed", name + "_processed")

            # Integer labels 0..num_classes-1, one entry per class.
            class_list = list(range(num_classes))

            # Hold out 1/5 of the data for testing; train on the rest.
            data_array.slicer(5)
            test_data = data_array.file_array.pop(0)
            data_array.join_array()
            training_data = data_array.file_array

            # Deep copy so the k-medoids seed split does not disturb the
            # main training data.
            toy = copy.deepcopy(data_array)
            toy.slicer(4)
            medoids = toy.file_array.pop(0)  # 1/4 of the data seeds the medoids
            toy.join_array()
            training_data_toy = toy.file_array

            # Candidate RBF-center sets, one per clustering strategy.
            knn = [
                edited_nn(13, training_data)[:1000],
                k_means(100, training_data),
                k_medoids(medoids, training_data_toy),
            ]
            algo_name = ['edited knn', 'kmeans', "kmedoids"]

            for algo_idx, centers in enumerate(knn):
                # NOTE(review): single-iteration placeholder loop kept from
                # the original; k only appears in the report strings.
                for k in range(1):
                    print("class list", class_list)
                    print("size rbf\n", len(centers))
                    print(name)

                    rbn = RBN(training_data, class_list, 1, centers)
                    rbn.train()
                    guesses = rbn.classify(test_data)
                    losses = Loss_Functions(guesses)

                    if len(class_list) == 1:
                        # Regression data set: report mean squared error.
                        print_str = ("MSE for " + str(name) + " fold: "
                                     + str(k) + " \n" + algo_name[algo_idx])
                        mse = losses.mse()  # compute once; print and record
                        print(print_str)
                        print(mse)
                        results_file.write("\n" + print_str)
                        results_file.write("\nMSE: " + str(mse) + "\n")
                    else:
                        # Classification data set: report the F-score.
                        losses.confusion_matrix_generator()
                        print_str = ("Fscore for " + str(name) + " fold: "
                                     + str(k) + "\n" + algo_name[algo_idx])
                        f_score = losses.f_score()  # compute once
                        print(print_str)
                        print(f_score)
                        results_file.write("\n" + print_str)
                        results_file.write("\nF-score: " + str(f_score) + "\n")
    # NOTE(review): this is the tail of a `dataset_10 = [...]` list literal
    # that opens above this chunk — confirm against the preceding lines.
    [1, 1], [2, 1], [3, 1], [1, 2], [4, 2], [5, 2], [3, 3], [2, 4], [3, 4], [5, 4]
]
# print(dataset)
# Cluster the 10 points into 3 clusters, running at most 20 iterations.
C, labels = k_means(dataset_10, 3, 20)
print(C)
print(labels)
# Plotting code
############################################################
colValue = ['r', 'y', 'g', 'b', 'c', 'k', 'm']
for i in range(len(C)):
    coo_X = []  # x-coordinate list
    coo_Y = []  # y-coordinate list
    for j in range(len(C[i])):
        coo_X.append(C[i][j][0])
        coo_Y.append(C[i][j][1])
    # One scatter per cluster, cycling through the color table.
    plt.scatter(coo_X, coo_Y, marker='o', color=colValue[i % len(colValue)], label=i)
############################################################
def cross_validation(folds, k, dataframes, algorithm_name, evaluation_metric):
    """Run k-fold cross-validation for one algorithm over one data set.

    dataframes = [db_name, [section1, ..., sectionN]]: the data set name and
    its pre-split folds.  Each fold is held out once as test data while the
    remaining folds are concatenated into training data; after testing, the
    fold is pushed back so dataframes[1] ends in its original contents.

    Returns:
        ('fscore')     -> (average_cm, metrics) where metrics holds macro
                          F1 / Precision / Recall and overall Accuracy.
        ('regression') -> mean squared error over all guesses.
        otherwise      -> None.
    """
    known_algorithms = ('k-nn', 'k-nn-regression', 'edited', 'condensed',
                        'k-means', 'edited-k-means', 'k-medoids',
                        'edited-k-medoids')

    guessed_classes = []  # (actual, guessed) pairs accumulated over all folds
    if algorithm_name in known_algorithms:
        print("New Data Set")
        for _ in range(folds):
            # Rotate through the folds: hold out the FIRST section, train on
            # the rest, then push the held-out section to the back.  (The
            # original pop(i)-then-append indexing re-tested some folds and
            # skipped others once the list had been rotated.)
            test_data = dataframes[1].pop(0)
            training_data = concat_df(dataframes[1])
            guessed_classes += _classify_fold(algorithm_name, k,
                                              training_data, test_data)
            dataframes[1].append(test_data)

    # ----------------------------------------------------------------
    # Evaluation metrics for the algorithm's guessed_classes
    # ----------------------------------------------------------------
    if evaluation_metric == 'fscore':  # classification data sets only
        return _fscore_metrics(guessed_classes, dataframes)
    if evaluation_metric == 'regression':  # machine, forestfires, wine
        return _mean_squared_error(guessed_classes)


def _classify_fold(algorithm_name, k, training_data, test_data):
    """Classify one held-out fold with the named algorithm.

    Returns the list of (actual, guessed) result pairs for test_data.
    """
    if algorithm_name == 'k-nn':
        return k_nearest_neighbor(k, training_data, test_data)
    if algorithm_name == 'k-nn-regression':
        return k_nearest_neighbor_regression(k, training_data, test_data)
    if algorithm_name in ('edited', 'edited-k-means'):
        # Shrink the training set with edited k-NN before classifying.
        reduced = edited_k_nearest(k, training_data)
        return k_nearest_neighbor(k, reduced, test_data)
    if algorithm_name == 'condensed':
        reduced = condensed_k_nearest(k, training_data)
        return k_nearest_neighbor(k, reduced, test_data)
    if algorithm_name == 'k-means':
        # Use len/4 cluster centers as the reduced training set.
        centers = k_means(int(len(training_data) / 4), training_data)
        return k_nearest_neighbor_regression(k, centers, test_data)
    if algorithm_name == 'k-medoids':
        sections = slicer(4, training_data)  # 1/4 of data for this algorithm
        medoids = sections.pop(0)            # 1/4 of the data seeds the medoids
        remainder = concat_df(sections)      # leftover 3/4 trains PAM-NN
        returned_medoids = k_medoids(medoids, remainder)
        return k_nearest_neighbor_regression(k, returned_medoids, test_data)
    if algorithm_name == 'edited-k-medoids':
        # Seed the medoids with the edited-kNN data set, then remove those
        # rows from the training data so they are not counted twice.
        medoids = edited_k_nearest(k, training_data)
        for index, row in medoids.iterrows():
            training_data = training_data.drop(index)
        print("size of training data", len(training_data))
        print("size of medoids data", len(medoids))
        returned_medoids = k_medoids(medoids, training_data)
        return k_nearest_neighbor(k, returned_medoids, test_data)
    return []


def _fscore_metrics(guessed_classes, dataframes):
    """Per-class confusion matrices plus macro-averaged P/R/F1 and accuracy.

    result[0] is the actual class and result[1] is the guessed class.
    """
    # Initialize a {TP, FP, TN, FN} confusion matrix for every class.
    unique_classes = concat_df(dataframes[1])['0'].unique().tolist()
    confusion = {class_name: {'TP': 0, 'FP': 0, 'TN': 0, 'FN': 0}
                 for class_name in unique_classes}

    for class_name in unique_classes:
        for result in guessed_classes:
            if class_name == result[1] and class_name == result[0]:
                value = 'TP'  # guessed this class, and it was this class
            elif class_name == result[1]:
                value = 'FP'  # guessed this class, but it was not
            elif class_name == result[0]:
                value = 'FN'  # did not guess this class, but it was
            else:
                value = 'TN'  # correctly not assigned to this class
            confusion[class_name][value] += 1

    # Class-independent accuracy over every guess.
    correct = sum(1 for result in guessed_classes if result[0] == result[1])
    accuracy = correct / len(guessed_classes)

    # NOTE(review): average_cm is returned but never populated (all zeros);
    # kept for the callers that expect the two-value return shape.
    average_cm = {'TP': 0, 'FP': 0, 'TN': 0, 'FN': 0}
    print(confusion)

    # Macro averages: per-class precision/recall/F1, averaged over classes.
    count = 0
    precision = 0
    recall = 0
    f1 = 0
    for matrix in confusion.values():
        TP, FP, FN = matrix['TP'], matrix['FP'], matrix['FN']
        ptemp = TP / (TP + FP) if (TP + FP) != 0 else 0
        rtemp = TP / (TP + FN) if (TP + FN) != 0 else 0
        precision += ptemp
        recall += rtemp
        if (ptemp + rtemp) != 0:
            f1 += 2 * ptemp * rtemp / (ptemp + rtemp)
        count += 1
    precision = precision / count
    recall = recall / count
    f1 = f1 / count

    metrics = {'F1': f1, 'Precision': precision, 'Recall': recall,
               'Accuracy': accuracy}
    return average_cm, metrics


def _mean_squared_error(guessed_classes):
    """Mean squared error between actual (result[0]) and guess (result[1])."""
    print("regression")
    sum_of_error = 0.0
    for result in guessed_classes:
        print(result)
        sum_of_error += (result[0] - result[1]) ** 2
    return sum_of_error / len(guessed_classes)
print("Number of Centroids : ", int(num_centroids)) # get input data data = np.genfromtxt(input_path, delimiter='\t', skip_header=1, dtype=float) # shuffle data X = [] np.random.shuffle(data) for row in data: X.append(row[1:]) # Run k_means and return the final set of centroids and the Cluster Assingments centroids, cluster_assignment = k_means(np.array(X), int(num_centroids)) # Initialize array with 0's equal to the number of centroids. Used arrays for SSE Calculations cluster_assignment_array = [] sse_cluster_assignment_array = [] for i in range(int(num_centroids)): cluster_assignment_array.append([]) sse_cluster_assignment_array.append([]) # Get data ready for SSE for clusterID, row in zip(cluster_assignment, data): cluster_assignment_array[clusterID].append(row[0]) sse_cluster_assignment_array[clusterID].append(row[1:]) # Calculate the Sum of Squared Error sse = calc_sse(centroids, sse_cluster_assignment_array)