def test_k_fold(self):
    """Verify split_k_fold partitions the abalone data into the requested folds."""
    data = Data()
    data_temp = pd.read_csv(r'data/abalone.data', header=None)
    data_split = data.split_k_fold(5, data_temp)  # split into 5 different parts
    # Bug fix: assertEqual, not assertIs — identity comparison of ints only
    # passes via CPython's small-int cache and would break for values >= 257.
    self.assertEqual(len(data_split), 5)  # check split into 5 folds (comment said 10; k is 5)
    self.assertEqual(len(data_split[0]), 2)  # check that each fold split into test and train
def run_k_means(indata):
    """Run k-means on the abalone data set and predict clusters for the test data.

    :param indata: data frame of abalone samples to cluster
    :return: None
    """
    # Fixed garbled comment: this operates on 'abalone', not 'wine'
    knn = KNN()
    data = Data()  # loads the data and checks if complete
    data.split_data()
    in_data = {'abalone': indata}
    knn.data = data
    knn.current_data_set = 'abalone'  # set the data set to be used to abalone
    centroids = knn.centroids(in_data, 5)  # get the k-means clusters
    knn.predict_centroids(centroids, data.test_dict)  # predict the closest cluster
def test_centroids(self):
    """Smoke-test centroid construction and prediction on the wine data set."""
    print("Testing Centroid")
    classifier = KNN()
    loaded = Data()
    loaded.split_data()
    classifier.data = loaded
    classifier.current_data_set = 'wine'  # used in KNN, needed here
    cluster_centers = classifier.centroids(loaded.train_dict, 4)
    classifier.predict_centroids(cluster_centers, loaded.test_dict)
    print("End Centroid Test")
def test_zero_one_loss(self):
    """zero_one_loss on a slice of the abalone data should return a value."""
    knn = KNN()  # kept for the commented-out condense_data experiment below
    lf = LF()
    data = Data()
    data_temp = pd.read_csv(r'data/abalone.data', header=None)
    # Fixed comment: .loc slicing is label-inclusive, so this is rows 0-1000
    # (1001 rows), not "first 100 rows" as previously claimed.
    data_set = data_temp.loc[:1000][:]
    k_val = 5
    name = 'abalone'  # used in KNN, needed here
    # cond_data = knn.condense_data(data_set, k_val, name, data)
    self.assertIsNotNone(lf.zero_one_loss(data_set, k_val, name, data))
def test_condense_data(self):
    """Condensing (CNN) must shrink the data.

    Compares that the output frame has fewer rows than the input, i.e. that
    condense_data actually reduced the data. Part of the abalone data is
    imported because the 2D structure is needed.
    """
    knn = KNN()
    data = Data()
    data_temp = pd.read_csv(r'data/abalone.data', header=None)
    # Fixed comment: .loc slicing is label-inclusive, so this is rows 0-400
    # (401 rows), not "first 100 rows" as previously claimed.
    data_set = data_temp.loc[:400][:]
    k_val = 5
    name = 'abalone'  # used in KNN, needed here
    cond_data = knn.condense_data(data_set, k_val, name, data)
    self.assertGreater(len(data_set.index), len(cond_data.index))
def main():
    """
    Calls function in other files until program is finished.
    :return: None
    """
    knn = KNN()
    data = Data()  # loads the data and checks if complete
    while True:
        data.split_data()  # split into both test and train
        predicted_class = {}  # holds data_set_name and a list of predicted classes
        # Bug fix: iterating a dict directly yields only its keys; .items() is
        # required to unpack (key, data_set) pairs as intended here.
        for key, train_data_set in data.train_dict.items():
            print("Current Data Set: ", key)
            predicted_class[key] = []  # predicted values for this data set
            test_data_set = data.test_dict[key]  # TODO: Use same keys for all dictionaries; Access testing data by key.
            for _, query_point in train_data_set.iterrows():
                # give query example and its corresponding train_data_set,
                # along with # of desired neighbors to consider
                predicted_class[key].append(knn.perform_knn(query_point, train_data_set, 5))
[[ 0. 0.06810811 0.16 0.00432432 0. ] [ 0. 0.02702703 0.07027027 0.00216216 0. ] [ 0. 0.07351351 0.22054054 0.00756757 0. ] [ 0. 0.02810811 0.20864865 0.00756757 0. ] [ 0. 0.00756757 0.10918919 0.00540541 0. ]] """ SHOW_CONFUSIONS = True SHOWPLOTS = True all_train_x, all_train_y, all_val_x, all_val_y, all_test_x, all_test_y = [], [], [], [], [], [] for filename in FILENAMES: data_loader = Data(VAL_RATIO, TEST_RATIO, "ALL", filename, normalize=NORMALIZE) title_x, title_y = data_loader.get_title() train_x, train_y = data_loader.get_train_data() val_x, val_y = data_loader.get_val_data() test_x, test_y = data_loader.get_test_data() if title_x[-1] == "latitude": lat_train = train_x[:, -1] lon_train = train_x[:, -2] lat_val = val_x[:, -1] lon_val = val_x[:, -2] lat_test = test_x[:, -1] lon_test = test_x[:, -2]
from process_data import Data from sklearn.metrics import f1_score from sklearn.gaussian_process import GaussianProcessRegressor from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C TEST_RATIO = 0.0 #float from 0.0 to 1.0 VAL_RATIO = 0.3 #float from 0.0 to 1.0 NORMALIZE = True #normalize data in "total_passenger_count", "total_female_count", "empty_seats", "haversine" BATCH_SIZE = 5 INCLUDE = "ALL" #one of "trip_var", "instant_var", "perception_var", "contextual_var", "sociodemographic_var", "ALL" FILENAME = 'final_data_4.csv' data_loader = Data(VAL_RATIO, TEST_RATIO, INCLUDE, FILENAME, normalize=NORMALIZE) title_x, title_y = data_loader.get_title() train_x, train_y = data_loader.get_train_data() val_x, val_y = data_loader.get_val_data() test_x, test_y = data_loader.get_test_data() if title_x[-1] == "latitude": lat_train = train_x[:, -1] lon_train = train_x[:, -2] lat_val = val_x[:, -1] lon_val = val_x[:, -2] elif title_x[-1] == "longitude": lat_train = train_x[:, -2]
# # while True: # data.load_data() # data.split_data() # split into both test and train # predicted_class = {} # holds data_set_name and a list of predicted classes # # for name, train_data_set in data.train_dict.items(): # iterate through data and get key(Data name) and data_set # print("Current Data Set: ", name) # predicted_class[name] = [] # create a list of for a data set of predicted values # test_data_set = data.test_dict[name] # TODO: Use same keys for all dictionaries; Access testing data by key. # for _, query_point in train_data_set.iterrows(): # # give query example and its corresponding train_data_set, along with # of desired neighbors to consider # predicted_class[name].append(knn.perform_knn(query_point, train_data_set, 5, name, data)) knn = KNN() data = Data() # loads the data and checks if complete lf = LF() data.load_data() def run_zero_loss(): """ Calls function in other files until program is finished. :return: None """ data.split_data() # split into both test and train lf.zero_one_loss(data.test_dict['abalone'].sample(n=400), 5, 'abalone', data) def run_k_means(indata): # Run k-means on wine data set'knn = KNN() knn = KNN()