def test_k_fold(self):
    data = Data()
    data_temp = pd.read_csv(r'data/abalone.data', header=None)
    data_split = data.split_k_fold(5, data_temp)  # split into 5 folds
    self.assertEqual(len(data_split), 5)  # one entry per fold
    self.assertEqual(len(data_split[0]), 2)  # each fold holds a (train, test) pair
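split_k_fold itself is not shown in this listing; a minimal sketch of such a splitter, assuming a pandas DataFrame in and a list of k (train, test) pairs out (a hypothetical stand-in, not the project's implementation):

import numpy as np
import pandas as pd


def split_k_fold(k, df):
    # Shuffle the row labels, then cut them into k roughly equal folds.
    shuffled = np.random.permutation(df.index)
    folds = np.array_split(shuffled, k)
    # Pair each fold (the test set) with the remaining rows (the training set).
    return [(df.drop(fold), df.loc[fold]) for fold in folds]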
Example #2
def run_k_means(indata):  # Run k-means on the abalone data set
    knn = KNN()
    data = Data()  # loads the data and checks if complete
    data.split_data()
    in_data = {'abalone': indata}
    knn.data = data
    knn.current_data_set = 'abalone'  # set the data set to be clustered
    centroids = knn.centroids(in_data, 5)  # get the k-means cluster centroids
    knn.predict_centroids(centroids, data.test_dict)  # assign test points to their closest cluster
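KNN.centroids is not shown here either; a generic k-means loop it could plausibly wrap, assuming purely numeric features in a NumPy array (a sketch, not the project's implementation):

import numpy as np


def k_means(points, k, iterations=10):
    points = np.asarray(points, dtype=float)
    # Start from k distinct points chosen at random.
    centroids = points[np.random.choice(len(points), k, replace=False)]
    for _ in range(iterations):
        # Assign every point to its nearest centroid (Euclidean distance).
        dists = np.linalg.norm(points[:, None, :] - centroids[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # Move each centroid to the mean of its assigned points.
        for j in range(k):
            if (labels == j).any():
                centroids[j] = points[labels == j].mean(axis=0)
    return centroids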
def test_centroids(self):
    print("Testing Centroid")
    knn = KNN()
    data = Data()
    data.split_data()
    knn.data = data
    knn.current_data_set = 'wine'  # used in KNN, needed here
    centroids = knn.centroids(data.train_dict, 4)
    knn.predict_centroids(centroids, data.test_dict)
    print("End Centroid Test")
def test_zero_one_loss(self):
    knn = KNN()
    lf = LF()
    data = Data()
    data_temp = pd.read_csv(r'data/abalone.data', header=None)
    data_set = data_temp.loc[:1000]  # keep the first 1001 rows (loc slices are inclusive)
    k_val = 5
    name = 'abalone'  # used in KNN, needed here
    # cond_data = knn.condense_data(data_set, k_val, name, data)
    self.assertIsNotNone(lf.zero_one_loss(data_set, k_val, name, data))
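LF.zero_one_loss above takes the raw data set and runs KNN internally; the loss itself is just the misclassification rate. A minimal, hypothetical version over already-computed label sequences:

def zero_one_loss(true_labels, predicted_labels):
    # Count 1 for every wrong prediction, 0 for every correct one,
    # then average over the sample.
    pairs = list(zip(true_labels, predicted_labels))
    return sum(t != p for t, p in pairs) / len(pairs)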
def test_condense_data(self):
    # check that the output data frame is smaller than the input (i.e. that CNN reduced the data)
    # import part of the abalone data, since the test needs the 2D structure
    knn = KNN()
    data = Data()
    data_temp = pd.read_csv(r'data/abalone.data', header=None)
    data_set = data_temp.loc[:400]  # keep the first 401 rows (loc slices are inclusive)
    k_val = 5
    name = 'abalone'  # used in KNN, needed here
    cond_data = knn.condense_data(data_set, k_val, name, data)

    self.assertGreater(len(data_set.index), len(cond_data.index))
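condense_data is the condensed nearest neighbor (CNN) reduction the test checks; a sketch of Hart's algorithm on a numeric feature matrix (hypothetical names and signature, not the project's API):

import numpy as np


def condense(features, labels):
    # Hart's CNN: keep a point only if the points kept so far would
    # misclassify it under a 1-NN rule; repeat until nothing is added.
    kept = [0]  # seed the store with the first sample
    changed = True
    while changed:
        changed = False
        for i in range(len(features)):
            store = features[kept]
            nearest = np.linalg.norm(store - features[i], axis=1).argmin()
            if i not in kept and labels[kept[nearest]] != labels[i]:
                kept.append(i)
                changed = True
    return kept  # row positions of the condensed set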
Example #6
def main():
    """
    Calls functions in other files until the program is finished.
    :return: None
    """
    knn = KNN()
    data = Data()  # loads the data and checks if complete

    while True:
        data.split_data()  # split into both test and train
        predicted_class = {}  # maps each data set name to its list of predicted classes

        for key, train_data_set in data.train_dict.items():  # key is the data set name
            print("Current Data Set: ", key)
            predicted_class[key] = []  # predicted values for this data set
            test_data_set = data.test_dict[key]  # TODO: Use same keys for all dictionaries; Access testing data by key.
            for _, query_point in train_data_set.iterrows():
                # classify each query point against its training set, using 5 neighbors
                predicted_class[key].append(knn.perform_knn(query_point, train_data_set, 5))
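perform_knn is called above with a query point and its training frame; the usual core is a distance sort plus a majority vote. A sketch assuming numeric features and labels split out separately (hypothetical signature, not the project's):

import numpy as np
from collections import Counter


def perform_knn(query, train_features, train_labels, k):
    # Rank training points by Euclidean distance to the query,
    # then majority-vote over the labels of the k nearest ones.
    dists = np.linalg.norm(train_features - query, axis=1)
    nearest = np.argsort(dists)[:k]
    votes = Counter(train_labels[i] for i in nearest)
    return votes.most_common(1)[0][0]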
Example #7
[[ 0.          0.06810811  0.16        0.00432432  0.        ]
 [ 0.          0.02702703  0.07027027  0.00216216  0.        ]
 [ 0.          0.07351351  0.22054054  0.00756757  0.        ]
 [ 0.          0.02810811  0.20864865  0.00756757  0.        ]
 [ 0.          0.00756757  0.10918919  0.00540541  0.        ]]
"""

SHOW_CONFUSIONS = True
SHOWPLOTS = True

all_train_x, all_train_y, all_val_x, all_val_y, all_test_x, all_test_y = [], [], [], [], [], []

for filename in FILENAMES:
    data_loader = Data(VAL_RATIO,
                       TEST_RATIO,
                       "ALL",
                       filename,
                       normalize=NORMALIZE)

    title_x, title_y = data_loader.get_title()
    train_x, train_y = data_loader.get_train_data()
    val_x, val_y = data_loader.get_val_data()
    test_x, test_y = data_loader.get_test_data()

    if title_x[-1] == "latitude":
        lat_train = train_x[:, -1]
        lon_train = train_x[:, -2]
        lat_val = val_x[:, -1]
        lon_val = val_x[:, -2]
        lat_test = test_x[:, -1]
        lon_test = test_x[:, -2]
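The snippet cuts off before the all_* accumulators declared above are filled; a plausible continuation, assuming each per-file split is simply collected for later concatenation, would be:

    all_train_x.append(train_x)
    all_train_y.append(train_y)
    all_val_x.append(val_x)
    all_val_y.append(val_y)
    all_test_x.append(test_x)
    all_test_y.append(test_y)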
Example #8
from process_data import Data
from sklearn.metrics import f1_score

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C

TEST_RATIO = 0.0  # float from 0.0 to 1.0
VAL_RATIO = 0.3  # float from 0.0 to 1.0
NORMALIZE = True  # normalize data in "total_passenger_count", "total_female_count", "empty_seats", "haversine"
BATCH_SIZE = 5
INCLUDE = "ALL"  # one of "trip_var", "instant_var", "perception_var", "contextual_var", "sociodemographic_var", "ALL"
FILENAME = 'final_data_4.csv'

data_loader = Data(VAL_RATIO,
                   TEST_RATIO,
                   INCLUDE,
                   FILENAME,
                   normalize=NORMALIZE)

title_x, title_y = data_loader.get_title()
train_x, train_y = data_loader.get_train_data()
val_x, val_y = data_loader.get_val_data()
test_x, test_y = data_loader.get_test_data()

if title_x[-1] == "latitude":
    lat_train = train_x[:, -1]
    lon_train = train_x[:, -2]
    lat_val = val_x[:, -1]
    lon_val = val_x[:, -2]
elif title_x[-1] == "longitude":
    lat_train = train_x[:, -2]
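The example truncates before the model itself is built; a minimal fit/predict pass with the imported kernel classes (illustrative hyperparameters, not values from this project) would be:

kernel = C(1.0, (1e-3, 1e3)) * RBF(10, (1e-2, 1e2))
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=9)
gp.fit(train_x, train_y)
pred_y, sigma = gp.predict(val_x, return_std=True)  # posterior mean and std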
Example #9
#
#     while True:
#         data.load_data()
#         data.split_data()  # split into both test and train
#         predicted_class = {}  # holds data_set_name and a list of predicted classes
#
#         for name, train_data_set in data.train_dict.items():  # iterate through data and get key(Data name) and data_set
#             print("Current Data Set: ", name)
#             predicted_class[name] = []  # predicted values for this data set
#             test_data_set = data.test_dict[name]  # TODO: Use same keys for all dictionaries; Access testing data by key.
#             for _, query_point in train_data_set.iterrows():
#                 # give query example and its corresponding train_data_set, along with # of desired neighbors to consider
#                 predicted_class[name].append(knn.perform_knn(query_point, train_data_set, 5, name, data))

knn = KNN()
data = Data()  # loads the data and checks if complete
lf = LF()
data.load_data()


def run_zero_loss():
    """
    Calls function in other files until program is finished.
    :return: None
    """
    data.split_data()  # split into both test and train
    lf.zero_one_loss(data.test_dict['abalone'].sample(n=400), 5, 'abalone', data)


def run_k_means(indata):  # Run k-means on the abalone data set
    knn = KNN()