def test_knn_condensed(self):
    """Assert that condensing the training split leaves strictly fewer rows."""
    raw_frame = pd.read_csv(r'data/abalone.data', header=None)
    data = Data('abalone', raw_frame, 8)
    # Work on a 350-row sample and split it into train/test frames.
    sample = data.df.sample(n=350)
    data.split_data(data_frame=sample)
    condenser = KNN(5, data)
    condensed = condenser.condense_data(data.train_df)
    rows_after = condensed.shape[0]
    rows_before = data.train_df.shape[0]
    self.assertGreater(rows_before, rows_after)
def calcHiddenOutputs(self, input, center, std, data):
    """Return the Gaussian activation ``exp(-d**2 / (2 * std**2))``.

    ``d`` is the distance between ``input`` and ``center`` as reported by
    ``KNN.get_euclidean_distance``; ``data`` is only used to build the
    ``KNN`` helper.
    """
    distance = KNN(2, data).get_euclidean_distance(input, center)
    scale = -1 / (2 * std**2)
    return np.exp(scale * distance**2)
def test_euclidean(self):
    """Smoke-test the Euclidean distance helper on two sampled rows.

    The distance is printed; nothing is asserted.
    """
    data = Data('abalone', pd.read_csv(r'data/abalone.data', header=None), 8)
    sample = data.df.sample(n=10)  # small sample keeps the run short
    data.split_data(data_frame=sample)
    knn = KNN(5, data)
    row_a = sample.iloc[1]
    row_b = sample.iloc[2]
    print(knn.get_euclidean_distance(row_a, row_b))
def getMaxDistMeans(self, mean_list, data):
    """Return the largest pairwise distance between the entries of ``mean_list``.

    :param mean_list: re-iterable collection of cluster means; each entry is
        passed directly to ``KNN.get_euclidean_distance`` (assumes entries are
        row-like points the helper accepts -- TODO confirm against callers).
    :param data: object handed to the ``KNN`` constructor.
    :return: the maximum distance found, or 0 when ``mean_list`` is empty or
        no distance exceeds 0.
    """
    # Nothing to compare: skip building the KNN helper altogether.
    if len(mean_list) == 0:
        return 0
    knn = KNN(2, data)
    maxDist = 0
    for clust in mean_list:
        for clus2 in mean_list:  # compare against all other means
            # The pair must be passed explicitly; the distance helper is a
            # two-argument call everywhere else it is used.
            curDist = knn.get_euclidean_distance(clust, clus2)
            if curDist > maxDist:
                maxDist = curDist
    return maxDist
def test_KNN(self):
    """Smoke-test ``perform_KNN`` on one sampled row.

    The result is printed; nothing is asserted.
    """
    k_val = 5
    data = Data('abalone', pd.read_csv(r'data/abalone.data', header=None), 8)
    sample = data.df.sample(n=10)  # small sample keeps the run short
    data.split_data(data_frame=sample)
    knn = KNN(k_val, data)
    query_row = sample.iloc[1]
    print(knn.perform_KNN(k_val, query_row, data.train_df))
def getMaxDist(self, medoids_list, data):
    """Return the largest pairwise distance between the medoids' ``row`` values.

    Every ordered pair (including a medoid with itself) is measured with
    ``KNN.get_euclidean_distance``; the result is never below 0.
    """
    knn = KNN(2, data)
    farthest = 0
    for first in medoids_list:
        for second in medoids_list:
            gap = knn.get_euclidean_distance(first.row, second.row)
            # max(a, b) keeps ``a`` unless ``b > a``, i.e. the same update rule.
            farthest = max(farthest, gap)
    return farthest
def test_k_means(self):
    """Assert every k-means cluster row passes ``in dt.values``.

    ``dt`` is the training frame after ``convert_data_to_original``.
    """
    k_val = 5
    data = Data('abalone', pd.read_csv(r'data/abalone.data', header=None), 8)
    sample = data.df.sample(n=200)
    data.split_data(data_frame=sample)
    knn = KNN(k_val, data)  # built as before; not used further in this test
    kmeans = Kmeans(k_val, data)
    clusters = kmeans.k_means(data.train_df, k_val)
    converter = DataConverter()
    dt = converter.convert_data_to_original(data.train_df.copy())
    # Evaluate the membership check for every cluster row (no short-circuit).
    absent_flags = [row not in dt.values for row in clusters.values]
    self.assertFalse(any(absent_flags))
def predict_centroids(self, centroids, data_set):
    """Return, for each row of ``data_set``, the number of its closest centroid.

    Centroids are numbered from 1 in the order they appear in ``centroids``.
    Distances come from ``KNN.get_euclidean_distance(centroid, row)``.

    :param centroids: iterable of centroid points.
    :param data_set: frame-like object exposing ``iterrows()``.
    :return: list with one entry per row -- the 1-based centroid number, or
        ``None`` for every row when ``centroids`` is empty.
    """
    assignments = []
    # Iterate the rows directly; indexing the frame with itself fails for
    # non-boolean data and is not needed to walk the rows.
    for _, row in data_set.iterrows():
        best_distance = None  # smallest distance seen so far for this row
        best_cluster = None  # number of the centroid at that distance
        for cluster_val, centroid in enumerate(centroids, start=1):
            euclid_distance = KNN.get_euclidean_distance(centroid, row)
            # Strict '<' keeps the earliest centroid on ties.
            if best_distance is None or euclid_distance < best_distance:
                best_distance = euclid_distance
                best_cluster = cluster_val
        assignments.append(best_cluster)
    return assignments
def cluster_data(self, clusters, data_set):
    """Iterate nearest-centre assignment until two passes agree.

    Each pass assigns every row of ``data_set`` to the closest value in
    ``clusters`` (distance from ``KNN.get_euclidean_distance``), then
    recomputes ``clusters`` through ``self.mean_clusters``. When a pass
    produces the same grouping as the previous one, the values of the
    freshly recomputed ``clusters`` are returned as a list.
    """
    last_assignment = []  # grouping from the previous pass
    while True:
        # One empty bucket per cluster.
        assignment = [[] for _ in range(len(clusters))]
        for _, series in data_set.iterrows():
            point = list(series)  # plain list is required downstream
            best_index, best_distance = None, float('inf')
            for index, center in enumerate(clusters.values()):
                gap = KNN.get_euclidean_distance(center, point)
                if gap < best_distance:
                    best_index, best_distance = index, gap
            assignment[best_index].append(point)
        # Centres are refreshed before the convergence check.
        clusters = self.mean_clusters(assignment, data_set)
        if last_assignment == assignment:
            print(
                '-------------------------- K-Means has converged ------------------'
            )
            return list(clusters.values())
        last_assignment = assignment
def test_edit_vs_condese(self):
    """Print the edited and condensed training sizes and name the smaller one.

    Each reduction runs on its own freshly loaded 350-row sample; nothing is
    asserted.
    """
    # Edited nearest neighbour on the first sample.
    edit_data = Data('abalone', pd.read_csv(r'data/abalone.data', header=None), 8)
    edit_data.split_data(data_frame=edit_data.df.sample(n=350))
    editor = KNN(5, edit_data)
    edit = editor.edit_data(edit_data.train_df, 5, edit_data.test_df,
                            edit_data.label_col)

    # Condensed nearest neighbour on a second, independent sample.
    data = Data('abalone', pd.read_csv(r'data/abalone.data', header=None), 8)
    data.split_data(data_frame=data.df.sample(n=350))
    condenser = KNN(5, data)
    size_after = condenser.condense_data(data.train_df).shape[0]

    print("----------")
    print(edit.shape[0])
    print(size_after)
    verdict = "Run condensed" if size_after < edit.shape[0] else "Run edited"
    print(verdict)
def RBFREG_exp(data_config, data):
    """Train and evaluate four RBF regressors (4, 6, 8 and 12 clusters).

    The frame in ``data.df`` is cast to float column by column (in place),
    split into train/test via ``data.split_data``, and the training frame is
    optionally reduced according to ``data_config``. Each model is then
    trained, used to predict the test frame, and scored with ``LF``'s
    mean-squared-error and zero-one-loss methods on its own predictions.

    :param data_config: one of ``'condensed'``, ``'edited'``, ``'k-means'``
        or ``'medoids'``.
    :param data: project data holder exposing ``df``, ``split_data``,
        ``train_df``, ``test_df`` and ``label_col``.
    :raises ValueError: if ``data_config`` is not one of the known values.
    """
    # Fail fast instead of reaching an undefined model further down.
    if data_config not in ('condensed', 'edited', 'k-means', 'medoids'):
        raise ValueError('Unknown data_config: ' + repr(data_config))

    df = data.df
    print("Checking DF set")
    print(df[df.columns[-1]])

    # double check data is numerical
    for col in df.columns:
        df[col] = df[col].astype(float)

    # split into test/train
    data.split_data(data_frame=df)

    model_cls = RBFReg
    if data_config == 'condensed':
        # Run RBF on condensed data set
        cluster_obj = KNN(5, data)
        data.train_df = cluster_obj.condense_data(data.train_df)
        print(
            "\n---------------- Running Condensed Nearest Neighbor RBF -----------------"
        )
        print('Size of data: ', data.train_df.shape)
    elif data_config == 'edited':
        # Run RBF on edited dataset
        knn = KNN(5, data)
        data.train_df = knn.edit_data(data.train_df, 5, data.test_df,
                                      data.label_col)
        print(
            "\n---------------- Running Edited Nearest Neighbor RBF -----------------\n"
        )
        print('Size of data: ', data.train_df.shape)
    elif data_config == 'k-means':
        # Run RBF on K-means
        print("\n---------------- Running K-Means RBF -----------------\n")
        model_cls = RBFRegK
    else:
        # Run RBF on Medoids
        print("\n---------------- Running Medoids RBF -----------------\n")

    models = [model_cls(clusters=k, maxruns=1000) for k in (4, 6, 8, 12)]

    # Labels are read after any reduction so they line up with train_df.
    expected = data.train_df[data.train_df.columns[-1]]
    actual = data.test_df[data.test_df.columns[-1]]
    expc_list = actual.values.tolist()

    lf = LF()
    for number, model in enumerate(models, start=1):
        model.trainReg(data.train_df, expected, data)
        # Predict and score with the model that was just trained.
        predicts = model.predictReg(data.test_df, data)
        print("predicts RBF " + str(number))
        print(predicts)
        print("expected")
        print(expc_list)
        lf.mean_squared_error(predicts, expc_list)
        lf.zero_one_loss(predicts, expc_list)
def RBFREG_vid(data_config, data):
    """Train one 8-cluster RBF regressor on a 100-row sample and plot it.

    A 100-row sample of ``data.df`` is cast to float, split via
    ``data.split_data``, and the training frame is optionally reduced
    according to ``data_config``. The model's test predictions, the expected
    values, and the two ``LF`` loss values are plotted and saved to a file
    named ``<data.name>_<data_config>``; the figure is then cleared.

    :param data_config: one of ``'condensed'``, ``'edited'``, ``'k-means'``
        or ``'medoids'``.
    :param data: project data holder exposing ``df``, ``name``,
        ``split_data``, ``train_df``, ``test_df`` and ``label_col``.
    :raises ValueError: if ``data_config`` is not one of the known values.
    """
    # Fail fast instead of reaching an undefined model further down.
    if data_config not in ('condensed', 'edited', 'k-means', 'medoids'):
        raise ValueError('Unknown data_config: ' + repr(data_config))

    df = data.df.sample(100)  # take small subsection of the frame
    data_name = data.name
    print("\nChecking DF set")
    print(df[df.columns[-1]])

    # double check data is numerical
    for col in df.columns:
        df[col] = df[col].astype(float)

    # split into test/train
    data.split_data(data_frame=df)

    if data_config == 'condensed':
        # Run RBF on condensed data set
        cluster_obj = KNN(5, data)
        data.train_df = cluster_obj.condense_data(data.train_df)
        print(
            "\n---------------- Running Condensed Nearest Neighbor RBF Data: "
            + data_name + "-----------------")
        print('Size of data: ', data.train_df.shape)
        rbf = RBFReg(clusters=8, maxruns=600)
    elif data_config == 'edited':
        # Run RBF on edited dataset
        knn = KNN(5, data)
        data.train_df = knn.edit_data(data.train_df, 5, data.test_df,
                                      data.label_col)
        print("\n---------------- Running Edited Nearest Neighbor RBF Data: " +
              data_name + "-----------------")
        print('Size of data: ', data.train_df.shape)
        rbf = RBFReg(clusters=8, maxruns=600)
    elif data_config == 'k-means':
        # Run RBF on K-means
        print("\n---------------- Running K-Means RBF Data: " + data_name +
              "-----------------")
        rbf = RBFRegK(clusters=8, maxruns=600)
    else:
        # Run RBF on Medoids
        print("\n---------------- Running Mediods RBF Data: " + data_name +
              "-----------------")
        rbf = RBFReg(clusters=8, maxruns=600)

    # Read the labels only now: the 'condensed' and 'edited' branches replace
    # data.train_df, and the labels must match the rows actually trained on.
    expected = data.train_df[data.train_df.columns[-1]]
    actual = data.test_df[data.test_df.columns[-1]]

    rbf.trainReg(data.train_df, expected, data)
    print('Calculate predictions for the RBF')
    predicts = rbf.predictReg(data.test_df, data)
    expc_list = actual.values.tolist()
    print("predicts RBF")
    print(predicts)
    print("expected")
    print(expc_list)

    lf = LF()
    mse = lf.mean_squared_error(predicts, expc_list)
    zeroone = lf.zero_one_loss(predicts, expc_list)

    plt.plot(predicts, label=data_name + ' ' + data_config + ' prediction')
    plt.plot(expc_list, label=data_name + ' ' + data_config + ' expected')
    # The loss values are plotted so that they show up in the legend.
    plt.plot(mse, label='MSE: ' + str(mse))
    plt.plot(zeroone, label='0-1 Loss: ' + str(zeroone))
    plt.legend()
    plt.title('Data: ' + data_name)
    plt.ylabel('Expected value/ Predicted Value')
    plt.xlabel('# Predictions')
    # Code for saving a plot to image sourced from:
    # https://pythonspot.com/matplotlib-save-figure-to-image-file/
    plt.savefig(data_name + '_' + data_config)
    plt.clf()
def test_edit(self):
    """Smoke-test ``edit_data`` on a 50-row sample; nothing is asserted."""
    raw_frame = pd.read_csv(r'data/abalone.data', header=None)
    data = Data('abalone', raw_frame, 8, False)
    sample = data.df.sample(n=50)
    data.split_data(data_frame=sample)
    editor = KNN(5, data)
    editor.edit_data(data.train_df, 5, data.test_df, data.label_col)