Example #1
def edited_nn(k, training_data):
    # go through every row
    i = 0
    while i < len(training_data):
        # take the row out of the training data
        example = training_data.pop(i)
        # find the row's k closest points among the remaining rows
        closest_points = k_Nearest_Points(k, training_data, example)
        # find what KNN classified it as
        guess = classification_guess(closest_points)
        actual = example[0]
        # if the row was classified correctly, add it back into the list
        if guess == actual:
            training_data.insert(i, example)
            i += 1
    return training_data
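This example and the ones below rely on three helper functions that are not defined in this section: k_Nearest_Points, classification_guess, and df_row_in_list. The block below is not the original implementation; it is a minimal sketch inferred from the call sites, assuming each row looks like [class_label, feature_1, ..., feature_n] and each neighbor comes back as a [class_label, distance, row_index] triple.

import math


def k_Nearest_Points(k, data, query):
    # Euclidean distance from the query row to every row in data; column 0 is
    # the class label, so only the feature columns contribute to the distance.
    neighbors = []
    for index, row in enumerate(data):
        dist = math.sqrt(sum((row[c] - query[c]) ** 2 for c in range(1, len(query))))
        neighbors.append([row[0], dist, index])
    # Return the k closest rows as [class_label, distance, row_index] triples.
    neighbors.sort(key=lambda entry: entry[1])
    return neighbors[:k]


def classification_guess(neighbors):
    # Majority vote over the class labels of the returned neighbors.
    labels = [entry[0] for entry in neighbors]
    return max(set(labels), key=labels.count)


def df_row_in_list(row, rows):
    # Membership test used by the condensed nearest neighbor example below.
    return any(row == other for other in rows)
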
Example #2
import random


def condensed_k_nearest(k, training_data):
    random.shuffle(training_data)
    new_data_point = True  # Keeps track of whether or not a new point was added to condensed
    condensed = []  # Will contain the condensed set of training_data

    while new_data_point:  # Stops looping once nothing else gets added to condensed.
        new_data_point = False

        # Take each item in the training data and find its k nearest neighbors
        # in the training data.
        for i in range(len(training_data)):
            nearest = k_Nearest_Points(k, training_data, training_data[i])
            guess = classification_guess(nearest)
            # Is the point's actual class different from the guessed class,
            # and is the point not already contained in condensed?
            if training_data[i][0] != guess and not df_row_in_list(training_data[i], condensed):
                condensed.append(training_data[i])
                new_data_point = True  # A new data point was added to condensed, so keep looping.
    return condensed
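A minimal usage sketch for the two editing routines above, assuming the helper sketch shown earlier; the toy rows, labels, and choice of k are made up purely for illustration. Both functions mutate the list they are given, so fresh copies are passed in.

import random

random.seed(0)

data = [['a', 1.0, 1.1], ['a', 0.9, 1.0], ['a', 1.2, 0.8],
        ['b', 5.0, 5.2], ['b', 4.8, 5.1], ['b', 5.3, 4.9]]

# Edited nearest neighbor keeps only the rows that their own neighbors classify correctly.
edited = edited_nn(3, [row.copy() for row in data])
print("edited set size:", len(edited))

# Condensed nearest neighbor collects the rows the k-NN rule gets wrong.
condensed = condensed_k_nearest(3, [row.copy() for row in data])
print("condensed set size:", len(condensed))
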
def k_means(k, training_data):

    print('----------')

    # if our k value is greater than the number of training rows, we already
    # have as many centroids as we need
    if k > len(training_data):
        return training_data

    # place the k centroids on the first k points (the data is assumed to be shuffled)
    k_clusters = training_data[0:k]

    # clusters = {centroid_index: [points assigned to that centroid], ...}
    clusters = {}
    for i in range(len(k_clusters)):
        clusters[i] = []

    # assign each training data point to its nearest centroid
    for point in training_data:
        this_centroid = k_Nearest_Points(1, k_clusters, point)
        cent_index = this_centroid[0][2]
        clusters[cent_index].append(point)

    # we want to run this until it converges, so we compare k_clusters against
    # a copy taken at the start of each iteration
    old_clusters = 0

    iterations = 0

    # recompute the data point assignments until the centroids no longer move
    # (if no centroid is re-adjusted, then old_clusters == k_clusters)
    while old_clusters != k_clusters and iterations < 100:
        old_clusters = k_clusters.copy()

        # for each cluster
        for centroid_idx, list1 in clusters.items():

            # sometimes a cluster has no points assigned to it; skip it instead
            # of letting the empty list raise an IndexError
            try:
                # average every column (except the class column in position 0)
                # over the points assigned to this centroid
                new_centroid = list1[0].copy()
                num_points = len(list1)
                for point in list1[1:]:
                    for col in range(1, len(new_centroid)):
                        new_centroid[col] += point[col]
                for col in range(1, len(new_centroid)):
                    new_centroid[col] /= num_points

                # find the 3 nearest points to the center of the cluster
                points = k_Nearest_Points(3, list1, new_centroid)
                # set the class label of the centroid to the most common label
                # among those 3 nearest neighbors
                labels = []
                for neighbor in points:
                    labels.append(neighbor[0])
                new_centroid[0] = max(set(labels), key=labels.count)
                k_clusters[centroid_idx] = new_centroid.copy()

            except IndexError:
                # no points were assigned to this cluster
                pass

        # count iterations so we don't run this algorithm forever
        iterations += 1

        # clear the dictionary that keeps track of which points are closest to
        # which centroids
        for index in clusters:
            clusters[index] = []

        # reassign each training data point to its nearest (updated) centroid
        for point in training_data:
            this_centroid = k_Nearest_Points(1, k_clusters, point)
            cent_index = this_centroid[0][2]
            clusters[cent_index].append(point)

    # return the centroids we generated in place of the training set
    training_data = k_clusters

    print(iterations)

    return training_data
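A hypothetical usage sketch for k_means under the same helper assumptions; the rows are shuffled first because the function seeds its centroids from the first k rows, and the returned list holds the k labeled centroids.

import random

random.seed(1)

data = [['a', 1.0, 1.1], ['a', 0.9, 1.0], ['a', 1.2, 0.8],
        ['b', 5.0, 5.2], ['b', 4.8, 5.1], ['b', 5.3, 4.9]]
random.shuffle(data)

# Reduce the shuffled training set to 2 labeled centroids.
centroids = k_means(2, [row.copy() for row in data])
for centroid in centroids:
    print(centroid)
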
def k_medoids(medoids, training_data):

    # initialize a counter so the medoid loop doesn't run forever
    count = 0
    # flag that says whether we should run through the medoid algorithm again
    runFull = True
    while runFull and count < 100:
        # associate each remaining data point with its closest medoid by distance;
        # the dictionary maps each medoid index to the training rows assigned to it
        medoid_dictionary = {}
        print("count ", count)
        # iterate through the training data
        for row in range(len(training_data)):
            # find the closest medoid
            closest_medoid = k_Nearest_Points(1, medoids, training_data[row])
            # store the index of the closest medoid
            index = closest_medoid[0][2]
            # record which training rows are assigned to that medoid
            try:
                medoid_dictionary[index].append(row)
            except KeyError:
                medoid_dictionary[index] = [row]

        # Swap to False so the loop will not rerun unless a medoid is swapped below
        runFull = False
        count = count + 1

        medoids_to_remove = []
        training_data_to_medoid = []
        for key in medoid_dictionary:
            # initialize the minimum cost
            minimum_cost = 0
            # include the medoid itself in the cluster
            cluster_points = [medoids[key]]

            # add the points mapped to this medoid to the cluster
            indices = [key]
            for training_index in medoid_dictionary[key]:
                indices.append(training_index)
                cluster_points.append(training_data[training_index])

            # counter used so the cost comparison starts from the medoid's own cost
            k = 0
            minimum_index = 0
            # print("NEW CLUSTER")
            for index in range(len(cluster_points)):
                #resets cost for each point in cluster
                cost = 0
                all_point_distance_array = k_Nearest_Points(
                    len(cluster_points), cluster_points, cluster_points[index])

                #add up costs to get total
                for point in range(len(all_point_distance_array)):
                    cost = cost + all_point_distance_array[point][1]
                # print("Cost ", cost)

                #set cost to medoid cost first
                if k == 0:
                    minimum_cost = cost
                    k = k + 1
                    # print("Medoid Cost ", minimum_cost)

                #if new cost is less than medoid cost or previous, update

                if cost < minimum_cost:
                    minimum_cost = cost
                    minimum_index = index
                    # print("NEW LOWEST COST ---- index = ", indices[index])
                    # print("New lowest cost ", minimum_cost)
                    #will need to rerun full medoid if a point is swapped

                    #swap out medoied with data point that has lower cost
            if minimum_index != 0:
                runFull = True
                # print("ADDING KEY AND INDEX TO LIST")

                medoids_to_remove.append(key)
                # print("MEDOID KEY LIST", medoids_to_remove)
                training_data_to_medoid.append(indices[minimum_index])
                # print("TRIANING DATA KEY LIST", training_data_to_medoid)
        # promote each chosen training row to a medoid
        for i in training_data_to_medoid:
            medoids.append(training_data[i])
        # demote each replaced medoid back into the training data
        for i in medoids_to_remove:
            training_data.append(medoids[i])

        # remove the swapped items from their original lists, popping from the
        # highest index down so the earlier indices stay valid
        medoids_to_remove.sort()
        print("MEDOIDS TO REMOVE ", medoids_to_remove)
        training_data_to_medoid.sort()
        print("TRAINING DATA TO MEDOIDS ", training_data_to_medoid)
        for i in reversed(medoids_to_remove):
            medoids.pop(i)
        for i in reversed(training_data_to_medoid):
            training_data.pop(i)

    return medoids
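
A hypothetical usage sketch for k_medoids under the same helper assumptions. The function mutates both lists it is given and swaps rows between them, so the initial medoids are popped out of the shuffled rows first to keep the two lists disjoint.

import random

random.seed(2)

rows = [['a', 1.0, 1.1], ['a', 0.9, 1.0], ['a', 1.2, 0.8],
        ['b', 5.0, 5.2], ['b', 4.8, 5.1], ['b', 5.3, 4.9]]
random.shuffle(rows)

# Use the first 2 rows as the starting medoids and the rest as the training data.
initial_medoids = [rows.pop(0) for _ in range(2)]
final_medoids = k_medoids(initial_medoids, rows)
for medoid in final_medoids:
    print(medoid)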