Example #1
    def test_train_test_sizes(self):
        #Initialization
        dataRetriever = DataRetriever("../Datasets/metadata.json")
        dataRetriever.retrieveData("breastCancer")
        breastCancer = dataRetriever.getDataSet()
        continousAttributes = [
            "clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
            "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
            "blandChromatin", "normalNucleoli", "mitoses", "class"
        ]

        for test, train in KFolds(breastCancer, 10):
            self.assertEqual(len(test) + len(train), len(breastCancer))
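
(The KFolds class itself is not shown in these examples. As a rough, hedged sketch only — assuming a pandas DataFrame input and the `KFolds(data, k)` call signature used above — a generator with the same behaviour could look like the following; the real implementation in this repository may differ.)

import pandas as pd

def kfolds_sketch(df, k):
    # Illustrative only: yield (test, train) DataFrame pairs, one per fold,
    # so that every row appears in exactly one test fold.
    fold_size = len(df) // k
    for i in range(k):
        start = i * fold_size
        stop = len(df) if i == k - 1 else (i + 1) * fold_size
        test = df.iloc[start:stop]
        train = df.drop(test.index)
        yield test, train

With a generator of this shape, the size check in Example #1 holds: every (test, train) pair partitions the rows of the original DataFrame.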
Example #2
    def test_train_test_independence(self):
        #Initialization
        dataRetriever = DataRetriever("../Datasets/metadata.json")
        dataRetriever.retrieveData("breastCancer")
        breastCancer = dataRetriever.getDataSet()
        continousAttributes = [
            "clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
            "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
            "blandChromatin", "normalNucleoli", "mitoses", "class"
        ]

        for test, train in KFolds(breastCancer, 10):
            #https://stackoverflow.com/questions/3170055/test-if-lists-share-any-items-in-python
            self.assertFalse(bool(set(test.index) & set(train.index)))
Example #3
    def test_proper_number_of_folds(self):
        #Initialization
        dataRetriever = DataRetriever("../Datasets/metadata.json")
        dataRetriever.retrieveData("breastCancer")
        breastCancer = dataRetriever.getDataSet()
        continousAttributes = [
            "clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
            "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
            "blandChromatin", "normalNucleoli", "mitoses", "class"
        ]

        iterations = 0
        for test, train in KFolds(breastCancer, 10):
            iterations += 1

        self.assertEqual(iterations, 10)
Example #4
    def test_stratisfied(self):
        #Initialization
        dataRetriever = DataRetriever("../Datasets/metadata.json")
        dataRetriever.retrieveData("breastCancer")
        breastCancer = dataRetriever.getDataSet()
        continousAttributes = [
            "clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
            "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
            "blandChromatin", "normalNucleoli", "mitoses", "class"
        ]

        iterations = 0
        for test, train in KFolds(breastCancer, 10, stratisfied=True):
            print("TestLen 2", len(test[test['class'] == 2]), "TestLen 4",
                  len(test[test['class'] == 4]))
            print("TrainLen 2", len(train[train['class'] == 2]), "TrainLen 4",
                  len(train[train['class'] == 4]))
            iterations += 1
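
(Example #4 only prints per-class counts instead of asserting anything. A hedged, assertion-based variant — assuming the same `KFolds(..., stratisfied=True)` interface and the breast-cancer class labels 2 and 4 — could compare each test fold's class balance to the full dataset's:)

    def test_stratisfied_ratios(self):
        dataRetriever = DataRetriever("../Datasets/metadata.json")
        dataRetriever.retrieveData("breastCancer")
        breastCancer = dataRetriever.getDataSet()

        # Fraction of class 2 in the full dataset
        overall_ratio = (breastCancer['class'] == 2).mean()

        for test, train in KFolds(breastCancer, 10, stratisfied=True):
            fold_ratio = (test['class'] == 2).mean()
            # A stratified test fold should roughly preserve the class balance
            self.assertAlmostEqual(fold_ratio, overall_ratio, delta=0.05)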
Example #5
    def test_test_set_coverage(self):
        #Initialization
        dataRetriever = DataRetriever("../Datasets/metadata.json")
        dataRetriever.retrieveData("breastCancer")
        breastCancer = dataRetriever.getDataSet()
        continousAttributes = [
            "clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
            "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
            "blandChromatin", "normalNucleoli", "mitoses", "class"
        ]

        tested_vals = []

        # Add a dummy index to the dataset so we can see which rows are selected in each test fold
        breastCancer["dummyIndex"] = np.arange(len(breastCancer)) + 1

        for test, train in KFolds(breastCancer, 10):
            tested_vals.extend(test["dummyIndex"])

        self.assertTrue(set(tested_vals) == set(breastCancer["dummyIndex"]))
Example #6
def run_driver(current_data_set,
               mutation_rate=0.5,
               maxIter=1000,
               batch_size=0.6,
               population_size=110,
               network_architecture=[15],
               pb_actor=None):
    cost_func = {
        "breastCancer": "bin_cross",
        "glass": "log_cosh",
        "soybeanSmall": "log_cosh",
        "abalone": "log_cosh",
        "forestFires": "log_cosh",
        "computerHardware": "log_cosh"
    }

    title_text = r""" 
       ______                    __   _          ___     __                     _  __   __                    
      / ____/___   ____   ___   / /_ (_)_____   /   /   / /____ _ ____   _____ (_)/ /_ / /_   ____ ___   _____
     / / __ / _ \ / __ \ / _ \ / __// // ___/  / /| /  / // __ `// __ \ / ___// // __// __ \ / __ `__ \ / ___/
    / /_/ //  __// / / //  __// /_ / // /__   / ___ / / // /_/ // /_/ // /   / // /_ / / / // / / / / /(__  ) 
    \____/ \___//_/ /_/ \___/ \__//_/ \___/  /_/  |_//_/ \__, / \____//_/   /_/ \__//_/ /_//_/ /_/ /_//____/  
                                                        /____/                                                
    """

    output_json = {}

    # ====================== Adjustable Variables ==============================
    # current_data_set = "abalone"
    # mutation_rate = .5
    # maxIter = 10
    # batch_size = .6
    # population_size = 110

    # network_architecture = []
    # ===========================================================================

    output_json["parameters"] = {
        "mutation_rate": mutation_rate,
        "population_size": population_size,
        "network_architecture": network_architecture,
        "cost_func": cost_func[current_data_set],
        "maxIter": maxIter,
        "batch_size": batch_size
    }

    # ================ Data pre-processing =================================================
    dataRetriever = DataRetriever("../../Datasets/metadata.json")
    dataRetriever.retrieveData(current_data_set)
    dataset = dataRetriever.getDataSet().dropna()

    discrete_attr = dataRetriever.getDescreteAttributes()
    cont_attributes = dataRetriever.getContinuousAttributes()
    # Forest Fires only: cyclically encode month/day and log-transform the target
    if current_data_set == "forestFires":
        discrete_attr.remove('month')
        discrete_attr.remove('day')
        dataset['month'] = (pd.to_datetime(dataset.month,
                                           format='%b').dt.month) - 1
        dataset["day"] = dataset['day'].apply(
            lambda x: list(calendar.day_abbr).index(x.capitalize()))
        dataset["month_sin"] = np.sin(dataset['month'])
        dataset["month_cos"] = np.sin(dataset['month'])

        dataset["day_sin"] = np.sin(dataset['day'])
        dataset["day_cos"] = np.sin(dataset['day'])
        dataset = dataset.drop('day', axis=1)
        dataset = dataset.drop('month', axis=1)
        cont_attributes.append('month_sin')
        cont_attributes.append('month_cos')
        cont_attributes.append('day_sin')
        cont_attributes.append('day_cos')

        dataset[dataRetriever.getDataClass()] = np.log(
            dataset[dataRetriever.getDataClass()] + 0.000001)
    elif current_data_set == "computerHardware":
        discrete_attr.remove('venderName')
        discrete_attr.remove('modelName')
        dataset = dataset.drop('venderName', axis=1)
        dataset = dataset.drop('modelName', axis=1)

    dataset = dataset.reset_index(drop=True)

    if dataRetriever.getDataClass() in discrete_attr:
        discrete_attr.remove(dataRetriever.getDataClass())

    # ======================= Train Neural Network ================
    print(title_text)
    fold = 0
    metrics = []

    for test_set, train_set in KFolds(dataset, 10):
        fold += 1
        fitness_file = f"../DataDump/GA/{current_data_set}_layer{len(network_architecture)}_fold{fold}_fitness.csv"
        output_file = f"../DataDump/GA/{current_data_set}_layer{len(network_architecture)}_fold{fold}_output.csv"

        metrics.append(
            multiprocess_func.remote(test_set,
                                     train_set,
                                     fold,
                                     fitness_file,
                                     output_file,
                                     dataRetriever,
                                     cost_func[current_data_set],
                                     current_data_set,
                                     mutation_rate,
                                     maxIter,
                                     batch_size,
                                     population_size,
                                     network_architecture,
                                     pb_actor=None))

    metrics = ray.get(metrics)
    print(metrics)
    print("Average Performance: ", np.asarray(metrics).mean())
    output_json["Metrics"] = metrics
    output_json["Average"] = np.asarray(metrics, dtype=np.float64).mean()
    output_json["Std"] = np.asarray(metrics, dtype=np.float64).std()

    with open(
            f"../DataDump/GA_{current_data_set}_layer{len(network_architecture)}.json",
            'w') as f:
        json.dump(output_json, f, indent=4)
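
(Example #6 calls `multiprocess_func.remote(...)` and collects the results with `ray.get`, so `multiprocess_func` must be defined elsewhere as a Ray remote task. Its definition is not part of this example; as a sketch only, with the parameter list inferred from the call site and a placeholder body:)

import ray

@ray.remote
def multiprocess_func(test_set, train_set, fold, fitness_file, output_file,
                      dataRetriever, cost_func, current_data_set,
                      mutation_rate, maxIter, batch_size, population_size,
                      network_architecture, pb_actor=None):
    # Placeholder body: train a network on train_set, evaluate it on test_set,
    # write the fitness/output CSVs, and return a single performance metric
    # that run_driver averages with np.asarray(metrics).mean().
    return 0.0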
Example #7
data = data.dropna()
data = data.sample(frac=1.0, random_state=93)
data = data.reset_index(drop=True)
# data = data.drop('idNumber', axis=1)

class_col = dataRetriever.getDataClass()
# data[class_col] = np.log(data[class_col] + 0.001)

contAttr = dataRetriever.getContinuousAttributes()
discAttr = dataRetriever.getDescreteAttributes()
predictionType = dataRetriever.getPredictionType()

output_json = {}
iter_num = 0

for test, train in KFolds(data, 5, stratisfied=True, class_col=class_col):

    # KFolds cannot return a separate validation set, so k is set to half the
    # desired value and each test fold is split in half, with one half used as
    # the validation set

    sn = StandardNormalizer(train[contAttr])
    train[contAttr] = sn.train_fit()

    test1 = test.sample(frac=0.5, random_state=13)
    test2 = test.drop(test1.index)

    train = train.reset_index(drop=True)
    test1 = test1.reset_index(drop=True)
    test2 = test2.reset_index(drop=True)

    test1[contAttr] = sn.fit(test1[contAttr])
Example #8
dataRetriever = DataRetriever("../Datasets/metadata.json")
dataRetriever.retrieveData("breastCancer")
dataset = dataRetriever.getDataSet().dropna()
dataset = dataset.reset_index(drop=True)

# This line is used to normalize the data for Forest Fires
# dataset[dataRetriever.getDataClass()] = np.log(dataset[dataRetriever.getDataClass()]+0.1)
maxIter = 1
learning_rate = 1e-3
batch_size = 0.01

metrics = []
fold = 0

# Ten-Fold Cross Validation
for test_set, train_set in KFolds(dataset, 10):
    fold += 1
    print("Fold Num: ", fold)
    # Encode Data
    test_set = test_set.reset_index(drop=True)
    train_set = train_set.reset_index(drop=True)
    ohe = OneHotEncoder()
    discrete_attr = dataRetriever.getDescreteAttributes()
    if dataRetriever.getDataClass() in discrete_attr:
        discrete_attr.remove(dataRetriever.getDataClass())

    train_set = ohe.train_fit(train_set, discrete_attr)
    test_set = ohe.fit(test_set)

    #  Normalize Data
    sn = StandardNormalizer(train_set[dataRetriever.getContinuousAttributes()])
Example #9
# This first for loop runs the NaiveBayes algorithm on the un-shuffled data
jsonResults1 = {}
for dataSet in dataRetriever.getDataMenu():
    dataRetriever.retrieveData(dataSet)
    dataClass = dataRetriever.getDataClass()
    retrievedData = dataRetriever.getDataSet()

    numOfClassValues = len(
        retrievedData[dataRetriever.getDataClass()].unique())
    method = "macro"
    foldNum = 1

    jsonResults1[dataSet] = {}

    print(f"PRINTING RESULTS FOR THE CONTROL DATASET {dataSet}")
    for train, test in KFolds(retrievedData, 10):

        trainBin = BinDiscretizer(
            train[dataRetriever.getContinuousAttributes()], multi=True)

        trainBin.train_multi()
        train[dataRetriever.getContinuousAttributes()] = trainBin.fit_multi(
            train[dataRetriever.getContinuousAttributes()])
        test[dataRetriever.getContinuousAttributes()] = trainBin.fit_multi(
            test[dataRetriever.getContinuousAttributes()])

        naiveBayes = NaiveBayes(train, dataClass)

        answers = test[dataClass].to_numpy()[:]
        test = test.drop(columns=dataClass)
        predictions = naiveBayes.test(test)
Example #10
# data = data.drop('idNumber', axis=1)

class_col = dataRetriever.getDataClass()
# data[class_col] = np.log(data[class_col] + 0.001)

centroidsTrain = pd.read_csv("CSVOutput/normalizedabaloneKMeansClustered.csv")
medoidsTrain = pd.read_csv("CSVOutput/normalizedabaloneMedoidsClustered.csv")

contAttr = dataRetriever.getContinuousAttributes()
discAttr = dataRetriever.getDescreteAttributes()
predictionType = dataRetriever.getPredictionType()

output_json = {}
iter_num = 0

for test, train in tqdm(KFolds(data, 10), total=10):
    k_vals = [1, 3, 5, 7, int(np.floor(np.sqrt(len(train))))]

    #Normalize data
    sn = StandardNormalizer(train[contAttr + [class_col]])
    train[contAttr + [class_col]] = sn.train_fit()
    test[contAttr + [class_col]] = sn.fit(test[contAttr + [class_col]])

    # print("KNN")
    KNN = KNearestNeighbor(test.drop(class_col, axis=1),
                           train,
                           k_vals,
                           contAttr,
                           discAttr,
                           unknown_col=class_col,
                           predictionType=predictionType)