예제 #1
0
 def test_knn_condensed(self):
     """Check that ``condense_data`` returns fewer rows than the training frame holds."""
     abalone = Data('abalone', pd.read_csv(r'data/abalone.data', header=None), 8)
     sample = abalone.df.sample(n=350)  # random sample of rows to work on
     abalone.split_data(data_frame=sample)  # sets the test and train frames
     condenser = KNN(5, abalone)
     condensed = condenser.condense_data(abalone.train_df)
     # Both sizes are read after condensing, in the same order as before.
     rows_after = condensed.shape[0]
     rows_before = abalone.train_df.shape[0]
     self.assertGreater(rows_before, rows_after)
예제 #2
0
 def calcHiddenOutputs(self, input, center, std, data):
     """Return ``exp(-1 / (2 * std**2) * d**2)`` where ``d`` is the distance from input to center.

     :param input: point passed as the first argument to ``get_euclidean_distance``
     :param center: point passed as the second argument to ``get_euclidean_distance``
     :param std: spread used in the exponent's denominator
     :param data: data object handed to the ``KNN`` constructor
     :return: the value produced by ``np.exp``
     """
     distance = KNN(2, data).get_euclidean_distance(input, center)
     # Same operation order as before: (-1 / (2 * std**2)) * distance**2.
     scale = -1 / (2 * std**2)
     return np.exp(scale * distance**2)
예제 #3
0
 def test_euclidean(self):
     """
     Print the euclidean distance between two sampled rows (no assertion is made)
     :return:
     """
     abalone = Data('abalone', pd.read_csv(r'data/abalone.data', header=None), 8)
     sample = abalone.df.sample(n=10)  # random sample of rows to work on
     abalone.split_data(data_frame=sample)  # sets the test and train frames
     knn = KNN(5, abalone)
     first_row = sample.iloc[1]
     second_row = sample.iloc[2]
     print(knn.get_euclidean_distance(first_row, second_row))
예제 #4
0
 def getMaxDistMeans(self, mean_list, data):
     """Return the largest pairwise euclidean distance among the entries of ``mean_list``.

     :param mean_list: iterable of cluster means; each entry is passed straight to
         ``get_euclidean_distance`` (assumes the means are plain points — TODO confirm)
     :param data: data object handed to the ``KNN`` constructor
     :return: the maximum distance found, or 0 when no distance exceeds 0
     """
     maxDist = 0
     knn = KNN(2, data)
     for clust in mean_list:
         for clus2 in mean_list:
             # Compare every mean against every other mean; the two loop
             # variables are the points being measured.
             curDist = knn.get_euclidean_distance(clust, clus2)
             if curDist > maxDist:
                 maxDist = curDist
     return maxDist
예제 #5
0
 def test_KNN(self):
     """
     Print what ``perform_KNN`` returns for one sampled row (no assertion is made)
     :return:
     """
     neighbours = 5
     abalone = Data('abalone', pd.read_csv(r'data/abalone.data', header=None), 8)
     sample = abalone.df.sample(n=10)  # random sample of rows to work on
     abalone.split_data(data_frame=sample)  # sets the test and train frames
     classifier = KNN(neighbours, abalone)
     query_row = sample.iloc[1]
     print(classifier.perform_KNN(neighbours, query_row, abalone.train_df))
예제 #6
0
    def getMaxDist(self, medoids_list, data):
        """Return the largest euclidean distance between the ``row`` of any two medoids.

        :param medoids_list: iterable of medoid objects exposing a ``row`` attribute
        :param data: data object handed to the ``KNN`` constructor
        :return: the maximum distance found, or 0 when no distance exceeds 0
        """
        farthest = 0
        measurer = KNN(2, data)
        # Every ordered pair is measured, including a medoid against itself.
        pair_distances = (
            measurer.get_euclidean_distance(first.row, second.row)
            for first in medoids_list
            for second in medoids_list
        )
        for gap in pair_distances:
            if gap > farthest:
                farthest = gap
        return farthest
예제 #7
0
 def test_k_means(self):
     """Check that every cluster row from ``k_means`` is found in the converted training data."""
     cluster_count = 5
     abalone = Data('abalone', pd.read_csv(r'data/abalone.data', header=None), 8)
     sample = abalone.df.sample(n=200)  # random sample of rows to work on
     abalone.split_data(data_frame=sample)  # sets the test and train frames
     knn = KNN(cluster_count, abalone)  # constructed as before; not used afterwards
     clusters = Kmeans(cluster_count, abalone).k_means(abalone.train_df, cluster_count)
     restored = DataConverter().convert_data_to_original(abalone.train_df.copy())
     # Collect every cluster row that has no counterpart in the restored frame.
     missing = [row for row in clusters.values if row not in restored.values]
     self.assertFalse(bool(missing))
예제 #8
0
    def predict_centroids(
            self, centroids,
            data_set):  # Method to return closest cluster to test data
        """Find, for each row of ``data_set``, the 1-based position of its nearest centroid.

        :param centroids: iterable of centroid points
        :param data_set: object exposing ``iterrows()`` (presumably a pandas
            DataFrame — confirm against callers)
        :return: None. NOTE(review): the nearest centroid is computed per row but
            never stored or returned by this code — confirm whether a
            collection/return step is missing.
        """
        # Iterate the rows of the data set directly; indexing the data set by
        # itself before iterating was not a meaningful selection.
        for _, data in data_set.iterrows():
            distance = None  # Smallest distance seen so far for this row
            closest_centroid = None  # 1-based position of the closest centroid so far
            closest_centroid_euclidian_distance = None  # Distance to that centroid
            cluster_val = 1
            for centroid in centroids:  # Loops through the k centroid points
                euclid_distance = KNN.get_euclidean_distance(
                    centroid, data
                )  # Gets the distance between the centroid and the data point

                # Keep the first centroid seen, then any strictly closer one.
                if distance is None or euclid_distance < distance:
                    distance = euclid_distance
                    closest_centroid = cluster_val
                    closest_centroid_euclidian_distance = distance
                cluster_val += 1
예제 #9
0
    def cluster_data(self, clusters,
                     data_set):  # Loop until clusters have converged
        """Repeatedly assign rows to their nearest cluster point until assignments stop changing.

        :param clusters: mapping whose values are the current cluster points
        :param data_set: object exposing ``iterrows()``; each row is converted to a list
        :return: list of the values of the mapping returned by the final
            ``mean_clusters`` call
        """
        last_assignment = []  # Assignment from the previous pass, for the convergence check
        while True:
            # One empty bucket per cluster.
            assignment = [[] for _ in range(len(clusters))]

            for _, record in data_set.iterrows():
                point = list(record)  # Kept as a plain list; the original notes it fails otherwise
                nearest_index = None
                nearest_distance = float('inf')
                for index, centre in enumerate(clusters.values()):
                    gap = KNN.get_euclidean_distance(centre, point)
                    if gap < nearest_distance:  # Strictly closer wins, so ties keep the earlier cluster
                        nearest_index = index
                        nearest_distance = gap
                assignment[nearest_index].append(point)

            # Recompute the cluster points from this pass's assignment.
            clusters = self.mean_clusters(assignment, data_set)
            if last_assignment == assignment:
                print(
                    '-------------------------- K-Means has converged ------------------'
                )
                return list(clusters.values())
            last_assignment = assignment
예제 #10
0
 def test_edit_vs_condese(self):
     """Print the row counts of an edited and a condensed training set and name the smaller one."""
     # Edited set, built from its own sample.
     edit_source = Data('abalone', pd.read_csv(r'data/abalone.data', header=None), 8)
     edit_sample = edit_source.df.sample(n=350)
     edit_source.split_data(data_frame=edit_sample)
     edit = KNN(5, edit_source).edit_data(
         edit_source.train_df, 5, edit_source.test_df, edit_source.label_col)

     # Condensed set, built from a separately loaded and sampled frame.
     condense_source = Data('abalone', pd.read_csv(r'data/abalone.data', header=None), 8)
     condense_sample = condense_source.df.sample(n=350)
     condense_source.split_data(data_frame=condense_sample)
     condensed = KNN(5, condense_source).condense_data(condense_source.train_df)
     size_after = condensed.shape[0]

     print("----------")
     print(edit.shape[0])
     print(size_after)
     verdict = "Run condensed" if size_after < edit.shape[0] else "Run edited"
     print(verdict)
예제 #11
0
def RBFREG_exp(data_config, data):
    """Train four RBF regressors (4, 6, 8 and 12 clusters) and report each one's predictions and losses.

    The data frame is cast to float and split into train/test. For the
    'condensed' and 'edited' configurations the training frame is replaced by
    the output of ``KNN.condense_data`` / ``KNN.edit_data`` before training.

    :param data_config: one of 'condensed', 'edited', 'k-means', 'medoids'
    :param data: data object exposing ``df``, ``split_data``, ``train_df``,
        ``test_df`` and ``label_col``
    :raises ValueError: if ``data_config`` is not one of the supported values
    :return: None; results are printed
    """
    if data_config not in ('condensed', 'edited', 'k-means', 'medoids'):
        # Fail with a clear message instead of hitting an unbound model later.
        raise ValueError("Unknown data_config: " + repr(data_config))

    df = data.df  # get the dataframe from df

    print("Checking DF set")
    print(df[df.columns[-1]])
    # double check data is numerical
    for col in df.columns:
        df[col] = df[col].astype(float)
    # split into test/train
    data.split_data(data_frame=df)

    rbf_class = RBFReg  # only the k-means configuration uses RBFRegK
    if data_config == 'condensed':  # Run RBF on condensed data set
        cluster_obj = KNN(5, data)
        data.train_df = cluster_obj.condense_data(data.train_df)
        print(
            "\n---------------- Running Condensed Nearest Neighbor RBF -----------------"
        )
        print('Size of data: ', data.train_df.shape)
    elif data_config == 'edited':  # Run RBF on edited dataset
        knn = KNN(5, data)
        data.train_df = knn.edit_data(data.train_df, 5, data.test_df,
                                      data.label_col)
        print(
            "\n---------------- Running Edited Nearest Neighbor RBF -----------------\n"
        )
        print('Size of data: ', data.train_df.shape)
    elif data_config == 'k-means':  # Run RBF on K-means
        print("\n---------------- Running K-Means RBF -----------------\n")
        rbf_class = RBFRegK
    else:  # Run RBF on Medoids
        print("\n---------------- Running Medoids RBF -----------------\n")

    # Build all models up front, in the same order as before.
    models = [rbf_class(clusters=count, maxruns=1000)
              for count in (4, 6, 8, 12)]

    # Taken after any condensing/editing so the labels match the rows trained on.
    expected = data.train_df[data.train_df.columns[-1]]
    actual = data.test_df[data.test_df.columns[-1]]
    expc_list = actual.values.tolist()

    lf = LF()
    for run_number, model in enumerate(models, start=1):
        model.trainReg(data.train_df, expected, data)
        # Predict with the model that was just trained, and score that
        # model's own predictions.
        predicts = model.predictReg(data.test_df, data)

        print("predicts RBF " + str(run_number))
        print(predicts)
        print("expected")
        print(expc_list)

        lf.mean_squared_error(predicts, expc_list)
        lf.zero_one_loss(predicts, expc_list)
예제 #12
0
def RBFREG_vid(data_config, data):
    """Train one 8-cluster RBF regressor on a 100-row sample, print its results and save a plot.

    The sampled frame is cast to float and split into train/test. For the
    'condensed' and 'edited' configurations the training frame is replaced by
    the output of ``KNN.condense_data`` / ``KNN.edit_data`` before training.
    The figure is saved under ``<data.name>_<data_config>`` and then cleared.

    :param data_config: one of 'condensed', 'edited', 'k-means', 'medoids'
    :param data: data object exposing ``df``, ``name``, ``split_data``,
        ``train_df``, ``test_df`` and ``label_col``
    :raises ValueError: if ``data_config`` is not one of the supported values
    :return: None; results are printed and plotted
    """
    if data_config not in ('condensed', 'edited', 'k-means', 'medoids'):
        # Fail with a clear message instead of hitting an unbound model later.
        raise ValueError("Unknown data_config: " + repr(data_config))

    df = data.df.sample(
        100)  # get the dataframe from df, take small subsection
    data_name = data.name
    print("\nChecking DF set")
    print(df[df.columns[-1]])
    # double check data is numerical
    for col in df.columns:
        df[col] = df[col].astype(float)
    # split into test/train
    data.split_data(data_frame=df)

    if data_config == 'condensed':  # Run RBF on condensed data set
        cluster_obj = KNN(5, data)
        data.train_df = cluster_obj.condense_data(data.train_df)

        print(
            "\n---------------- Running Condensed Nearest Neighbor RBF Data: "
            + data_name + "-----------------")
        print('Size of data: ', data.train_df.shape)
        rbf = RBFReg(clusters=8, maxruns=600)

    elif data_config == 'edited':  # Run RBF on edited dataset
        knn = KNN(5, data)
        data.train_df = knn.edit_data(data.train_df, 5, data.test_df,
                                      data.label_col)
        print("\n---------------- Running Edited Nearest Neighbor RBF Data: " +
              data_name + "-----------------")
        print('Size of data: ', data.train_df.shape)

        rbf = RBFReg(clusters=8, maxruns=600)

    elif data_config == 'k-means':  # Run RBF on K-means
        print("\n---------------- Running K-Means RBF Data: " + data_name +
              "-----------------")
        rbf = RBFRegK(clusters=8, maxruns=600)

    else:  # Run RBF on Medoids
        print("\n---------------- Running Mediods RBF Data: " + data_name +
              "-----------------")
        rbf = RBFReg(clusters=8, maxruns=600)

    # Read the labels only after train_df may have been condensed/edited, so
    # they line up with the rows actually used for training.
    expected = data.train_df[data.train_df.columns[-1]]
    actual = data.test_df[data.test_df.columns[-1]]

    rbf.trainReg(data.train_df, expected, data)

    print('Calculate predictions for the RBF')
    predicts = rbf.predictReg(data.test_df, data)

    expc_list = actual.values.tolist()
    print("predicts RBF")

    print(predicts)
    print("expected")
    print(expc_list)
    lf = LF()
    mse = lf.mean_squared_error(predicts, expc_list)
    zeroone = lf.zero_one_loss(predicts, expc_list)
    plt.plot(predicts, label=data_name + ' ' + data_config + ' prediction')
    plt.plot(expc_list, label=data_name + ' ' + data_config + ' expected')
    # The two loss values are plotted so that they appear as legend entries.
    plt.plot(mse, label='MSE: ' + str(mse))
    plt.plot(zeroone, label='0-1 Loss: ' + str(zeroone))

    plt.legend()
    plt.title('Data: ' + data_name)
    plt.ylabel('Expected value/ Predicted Value')
    plt.xlabel('# Predictions')
    plt.savefig(
        data_name + '_' + data_config
    )  # Code for saving a plot to image sourced from: https://pythonspot.com/matplotlib-save-figure-to-image-file/
    plt.clf()  # reset the figure so the next call starts clean
예제 #13
0
 def test_edit(self):
     """Run ``edit_data`` on a 50-row sample; passes as long as nothing raises."""
     abalone = Data('abalone', pd.read_csv(r'data/abalone.data', header=None), 8, False)
     sample = abalone.df.sample(n=50)
     abalone.split_data(data_frame=sample)
     editor = KNN(5, abalone)
     editor.edit_data(abalone.train_df, 5, abalone.test_df, abalone.label_col)