def classify(self, data_to_classify):
    """Classify each row of data_to_classify by majority vote of its k nearest neighbors.

    Args:
        data_to_classify: iterable of feature rows (no class column).

    Returns:
        list of data_point objects with class_type set to the majority class
        among the k nearest neighbors (ties broken toward the smallest label).
    """
    # Turn the input dataset into a list of unlabeled data points.
    points_to_classify = [data_point(line[:], '') for line in data_to_classify]
    for point in points_to_classify:
        nearest = self.get_k_nearest(point)
        # BUG FIX: the original voted into a fixed np.full(2000, 0) array,
        # which raises IndexError for any class label >= 2000.  np.bincount
        # sizes itself to the labels seen, and argmax keeps the same
        # tie-break (lowest class index wins).
        # Labels are cast to int since no classes in these sets are floats.
        votes = np.bincount([int(nearest[i][-1]) for i in range(self.k)])
        point.class_type = np.argmax(votes)
    return points_to_classify
def find_average(self, points):
    """Compute the mean position (and mean class) of a cluster's points.

    Returns a new data_point whose coordinates are the per-dimension means
    rounded to int, used as the cluster's new centroid.

    BUG FIX: the original accumulated sums into an int-dtype np.full array,
    so every float addition was truncated element-by-element, biasing the
    centroid downward.  Sums are now taken in Python arithmetic (exact for
    ints, float-preserving otherwise) and rounded exactly once.
    """
    n = len(points)
    dim = len(points[0].data)
    # Per-dimension mean, rounded to int as before (kept as an int ndarray
    # to match the original return shape/dtype).
    new_data = np.array(
        [int(round(sum(p.data[i] for p in points) / n)) for i in range(dim)])
    # Average the class label the same way.
    new_class = int(round(sum(p.class_type for p in points) / n))
    return data_point(new_data, new_class)  # the new centroid
def recompute(self): max_passes = int( 0.1 * len(self.d_set) ) # max passes (if the medoids don't stop changing) to kick out equal to 10% the dataset (tunable) distortion = 9223372036854775807 # this is the maximum integer value for the system, and is also used below distortion_prime = 9223372036854775807 old_distortion = 9223372036854775807 while (max_passes > 0): data_distort_med = [] # assign points to medoids for x in self.d_map.points: medoid_assigned_to = data_point('', '') shortest_distance = 9223372036854775807 for m in self.pam_map: # find which medoid the datapoint is closest to dist = self.euclidian(x, m) if (dist < shortest_distance): medoid_assigned_to = m shortest_distance = dist data_distort_med.append( [x, shortest_distance, medoid_assigned_to]) distortion += shortest_distance # calculate the distortion while finding closest (distances are unsquared) # swap-a-roo for m in range(len(self.pam_map)): for x in range(len(data_distort_med)): #swap points temp_point = self.pam_map[m] self.pam_map[m] = data_distort_med[x][0] data_distort_med[x][0] = temp_point #calc distortion for i in data_distort_med: if ( i[-1] != temp_point ): # if the medoid wasn't changed then the distance hasn't changed distortion_prime += i[1] else: distortion_prime += self.euclidian( self.pam_map[m], i[0]) #if distortion is not decreased then swap back if distortion <= distortion_prime: temp_point = self.pam_map[m] self.pam_map[m] = data_distort_med[x][0] data_distort_med[x][0] = temp_point else: distortion = distortion_prime if ((old_distortion - distortion) / distortion) < 0.01: break max_passes -= 1 old_distortion = distortion self.d_map.points = self.pam_map
def regression(self, data_to_regress):
    """Predict a value for each row of data_to_regress as the (rounded)
    mean target of its k nearest neighbors.

    Args:
        data_to_regress: iterable of feature rows (no target column).

    Returns:
        list of data_point objects with class_type set to the rounded
        k-neighbor average.
    """
    # Wrap every input row in an unlabeled data_point.
    points_to_regress = [data_point(row[:], '') for row in data_to_regress]
    for point in points_to_regress:
        # Neighbors ordered by distance, farthest first (entry[1] is the
        # distance, entry[-1] the neighbor's target value).
        neighbors = sorted(self.get_k_nearest(point),
                           key=lambda entry: entry[1],
                           reverse=True)
        total = sum(neighbors[i][-1] for i in range(self.k))
        point.class_type = int(round(total / self.k))
    return points_to_regress
def mini_gen(self, data_in):
    """Build a list of data_point objects from the supplied rows.

    Each row's last element is taken as the class label, the rest as the
    feature vector.

    BUG FIX: the original iterated self.d_set and ignored its data_in
    parameter entirely, making it an exact duplicate of generate()'s loop;
    it now converts the rows it was given.

    Args:
        data_in: iterable of rows, each ending with the class label.

    Returns:
        list of data_point objects.
    """
    return [data_point(line[:-1], line[-1]) for line in data_in]
def generate(self):
    """Convert every row of self.d_set into a data_point (last column is
    the class label) and store them all in self.d_map as a point_map."""
    points = [data_point(row[:-1], row[-1]) for row in self.d_set]
    self.d_map = point_map(points)