def edited_nn(k, training_data):
    """Drop every row that k-NN misclassifies while it is held out of the data.

    Rows are visited in order. Each one is removed from ``training_data``,
    classified from its ``k`` nearest remaining rows, and put back only when
    the guessed class equals the row's own label (stored at index 0). A row
    that was dropped earlier is therefore no longer available as a neighbour
    for the rows that follow it.

    Args:
        k: Number of neighbours requested from ``k_Nearest_Points``.
        training_data: List of rows; it is edited in place.

    Returns:
        The same ``training_data`` list object, holding only the rows that
        were classified correctly.
    """
    cursor = 0
    while cursor != len(training_data):
        # Hold the row out so it cannot vote for itself.
        held_out = training_data.pop(cursor)
        neighbours = k_Nearest_Points(k, training_data, held_out)
        predicted = classification_guess(neighbours)
        if predicted == held_out[0]:
            # Correctly classified: restore it and move on to the next slot.
            training_data.insert(cursor, held_out)
            cursor += 1
        # Otherwise the row stays out and the next row has already slid into
        # position ``cursor``, so the cursor must not advance.
    return training_data
def condensed_k_nearest(k, training_data):
    """Collect the training rows whose label disagrees with their k-NN guess.

    ``training_data`` is shuffled in place first. Every row is then classified
    from its ``k`` nearest rows in the full ``training_data`` (the row itself
    is not held out); a row whose label (index 0) differs from that guess is
    added to the condensed set unless ``df_row_in_list`` reports it is already
    there. Passes over the data repeat until one of them adds nothing.

    Args:
        k: Number of neighbours requested from ``k_Nearest_Points``.
        training_data: List of rows; its order is randomised in place.

    Returns:
        A new list containing the selected rows (the row objects themselves,
        not copies).
    """
    random.shuffle(training_data)

    condensed = []
    added_point = True  # forces the first pass
    while added_point:
        added_point = False
        for row in training_data:
            neighbours = k_Nearest_Points(k, training_data, row)
            guess = classification_guess(neighbours)
            if row[0] != guess and not df_row_in_list(row, condensed):
                condensed.append(row)
                # Something changed, so run one more confirming pass.
                added_point = True
    return condensed
def _labelled_centroid(members):
    """Return the mean row of *members* with a majority-vote label at index 0."""
    centroid = members[0].copy()
    # Column 0 is the class label, so only columns 1.. are averaged.
    for member in members[1:]:
        for col in range(1, len(members[0])):
            centroid[col] += member[col]
    for col in range(1, len(centroid)):
        centroid[col] /= len(members)

    # Label the centroid with the most common class among its (up to) 3
    # nearest cluster members. How k_Nearest_Points behaves when asked for
    # more neighbours than there are candidates is not visible from here, so
    # never request more than the cluster holds.
    neighbours = k_Nearest_Points(min(3, len(members)), members, centroid)
    labels = [neighbour[0] for neighbour in neighbours]
    centroid[0] = max(set(labels), key=labels.count)
    return centroid


def k_means(k, training_data, max_iterations=100):
    """Reduce *training_data* to ``k`` labelled centroids.

    The first ``k`` rows seed the centroids (the caller is presumably expected
    to have shuffled the data beforehand). Each iteration assigns every row to
    its nearest centroid and then replaces each centroid by the mean of its
    assigned rows, labelled by majority vote. Iteration stops when an update
    leaves the centroid list unchanged or after ``max_iterations`` rounds.

    Rows must support ``.copy()`` and item assignment, carry their class label
    at index 0 and numeric features in the remaining positions.

    Args:
        k: Number of centroids to produce.
        training_data: List of rows. It is not modified.
        max_iterations: Upper bound on the number of update rounds.

    Returns:
        ``training_data`` itself when ``k`` exceeds its length, otherwise a
        new list of ``k`` centroid rows.

    Side effects:
        Prints a separator line on entry and the number of rounds performed
        before returning the centroids.
    """
    print('----------')
    # With fewer rows than requested centroids the rows already serve as the
    # centroids.
    if k > len(training_data):
        return training_data

    centroids = training_data[0:k]
    previous = None  # guarantees the first comparison reports a change
    iterations = 0
    while previous != centroids and iterations < max_iterations:
        previous = centroids.copy()

        # Group every row under the index of its nearest centroid. Doing this
        # at the top of the round means no pass is spent regrouping rows after
        # the final update.
        clusters = {idx: [] for idx in range(len(centroids))}
        for point in training_data:
            nearest = k_Nearest_Points(1, centroids, point)
            clusters[nearest[0][2]].append(point)

        for idx, members in clusters.items():
            if not members:
                # A centroid that attracted no rows keeps its position.
                continue
            centroids[idx] = _labelled_centroid(members)

        iterations += 1

    print(iterations)
    return centroids
def k_medoids(medoids, training_data, max_iterations=100):
    """Refine *medoids* by swapping each with a cheaper member of its cluster.

    Every round assigns each training row to its nearest medoid, then, for
    each cluster, finds the point (current medoid or member row) whose summed
    distance to all points of the cluster is smallest. When that point is not
    the current medoid the two are swapped: the row becomes a medoid and the
    old medoid becomes an ordinary training row. Rounds repeat until one makes
    no swap or ``max_iterations`` rounds have run.

    Args:
        medoids: List of medoid rows; updated in place. Swapped-in medoids are
            appended at the end, so the order of the list can change.
        training_data: List of non-medoid rows; updated in place (promoted
            rows are removed, demoted medoids are appended).
        max_iterations: Upper bound on the number of rounds.

    Returns:
        The same ``medoids`` list object.

    Side effects:
        Prints the round counter and the indices swapped in each round.
    """
    iterations = 0
    swapped = True  # forces the first round
    while swapped and iterations < max_iterations:
        print("count ", iterations)

        # Map medoid index -> indices of the training rows nearest to it.
        members_by_medoid = {}
        for row_index, row in enumerate(training_data):
            nearest = k_Nearest_Points(1, medoids, row)
            members_by_medoid.setdefault(nearest[0][2], []).append(row_index)

        # Stays False unless some cluster finds a cheaper medoid below.
        swapped = False
        iterations += 1
        medoids_to_remove = []
        training_data_to_medoid = []

        for medoid_index, member_indices in members_by_medoid.items():
            # Position 0 is the current medoid; positions 1.. are its members.
            # ``source_indices`` maps a position back to where the point came
            # from (an index into ``medoids`` for position 0, otherwise an
            # index into ``training_data``).
            source_indices = [medoid_index] + member_indices
            cluster_points = [medoids[medoid_index]]
            cluster_points.extend(training_data[i] for i in member_indices)

            best_position = 0
            best_cost = None
            for position, candidate in enumerate(cluster_points):
                distances = k_Nearest_Points(
                    len(cluster_points), cluster_points, candidate)
                cost = sum(entry[1] for entry in distances)
                # Strict '<' keeps the current medoid (and then the earliest
                # member) on ties.
                if best_cost is None or cost < best_cost:
                    best_cost = cost
                    best_position = position

            if best_position != 0:
                swapped = True
                medoids_to_remove.append(medoid_index)
                training_data_to_medoid.append(source_indices[best_position])

        # Promote and demote by appending first, so every recorded index is
        # still valid; the stale entries are deleted afterwards.
        medoids.extend([training_data[i] for i in training_data_to_medoid])
        training_data.extend([medoids[i] for i in medoids_to_remove])

        medoids_to_remove.sort()
        print("MEDOIDS TO REMOVE ", medoids_to_remove)
        training_data_to_medoid.sort()
        print("TRAINING DATA TO MEDOIDS ", training_data_to_medoid)

        # Delete from the highest index down so earlier indices do not shift.
        for i in reversed(medoids_to_remove):
            del medoids[i]
        for i in reversed(training_data_to_medoid):
            del training_data[i]

    return medoids