def test_train_test_sizes(self):
    """Each fold's test and train partitions should together cover the full dataset."""
    # Initialization
    dataRetriever = DataRetriever("../Datasets/metadata.json")
    dataRetriever.retrieveData("breastCancer")
    breastCancer = dataRetriever.getDataSet()
    continousAttributes = ["clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
                           "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
                           "blandChromatin", "normalNucleoli", "mitoses", "class"]

    for test, train in KFolds(breastCancer, 10):
        self.assertEqual(len(test) + len(train), len(breastCancer))
def test_train_test_independence(self):
    """No row may appear in both the test and train partitions of a fold."""
    # Initialization
    dataRetriever = DataRetriever("../Datasets/metadata.json")
    dataRetriever.retrieveData("breastCancer")
    breastCancer = dataRetriever.getDataSet()
    continousAttributes = ["clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
                           "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
                           "blandChromatin", "normalNucleoli", "mitoses", "class"]

    for test, train in KFolds(breastCancer, 10):
        # https://stackoverflow.com/questions/3170055/test-if-lists-share-any-items-in-python
        self.assertFalse(bool(set(test.index) & set(train.index)))
def test_proper_number_of_folds(self):
    """KFolds(data, 10) should yield exactly ten (test, train) pairs."""
    # Initialization
    dataRetriever = DataRetriever("../Datasets/metadata.json")
    dataRetriever.retrieveData("breastCancer")
    breastCancer = dataRetriever.getDataSet()
    continousAttributes = ["clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
                           "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
                           "blandChromatin", "normalNucleoli", "mitoses", "class"]

    iterations = 0
    for test, train in KFolds(breastCancer, 10):
        iterations += 1

    self.assertEqual(iterations, 10)
def test_stratisfied(self):
    """Print per-class counts for each stratified fold.

    This is a manual inspection aid rather than a true unit test: it makes no
    assertions. ('stratisfied' matches the spelling of the KFolds keyword argument.)
    """
    # Initialization
    dataRetriever = DataRetriever("../Datasets/metadata.json")
    dataRetriever.retrieveData("breastCancer")
    breastCancer = dataRetriever.getDataSet()
    continousAttributes = ["clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
                           "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
                           "blandChromatin", "normalNucleoli", "mitoses", "class"]

    iterations = 0
    for test, train in KFolds(breastCancer, 10, stratisfied=True):
        print("TestLen 2", len(test[test['class'] == 2]),
              "TestLen 4", len(test[test['class'] == 4]))
        print("TrainLen 2", len(train[train['class'] == 2]),
              "TrainLen 4", len(train[train['class'] == 4]))
        iterations += 1
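# test_stratisfied above only prints counts; a hedged sketch of an assertion that
# could replace the prints inside its loop (the 0.05 tolerance is an arbitrary
# assumption, not a project requirement):
#
#     overall = (breastCancer['class'] == 2).mean()
#     self.assertAlmostEqual((test['class'] == 2).mean(), overall, delta=0.05)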
def test_test_set_coverage(self):
    """Across all ten folds, the test sets should visit every row exactly once."""
    # Initialization
    dataRetriever = DataRetriever("../Datasets/metadata.json")
    dataRetriever.retrieveData("breastCancer")
    breastCancer = dataRetriever.getDataSet()
    continousAttributes = ["clumpThickness", "uniformityOfCellSize", "uniformityOfCellShape",
                           "marginalAdhesion", "singleEpithelialCellSize", "bareNuclei",
                           "blandChromatin", "normalNucleoli", "mitoses", "class"]

    tested_vals = []

    # Add a dummy index to the dataset so we can see which rows are selected each fold
    breastCancer["dummyIndex"] = np.arange(len(breastCancer)) + 1

    for test, train in KFolds(breastCancer, 10):
        tested_vals.extend(test["dummyIndex"])

    self.assertEqual(set(tested_vals), set(breastCancer["dummyIndex"]))
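# For reference, a minimal KFolds generator consistent with the contract the tests
# above exercise (k folds, disjoint test/train, full coverage). This is a sketch
# under assumed behavior, not the project's implementation; the stratisfied and
# class_col options are omitted.
import numpy as np

def kfolds_sketch(df, k):
    """Yield (test, train) DataFrame pairs; each row lands in exactly one test fold."""
    folds = np.array_split(np.random.permutation(df.index), k)
    for fold in folds:
        yield df.loc[fold], df.drop(fold)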
def run_driver(current_data_set, mutation_rate=0.5, maxIter=1000, batch_size=0.6,
               population_size=110, network_architecture=[15], pb_actor=None):
    cost_func = {"breastCancer": "bin_cross",
                 "glass": "log_cosh",
                 "soybeanSmall": "log_cosh",
                 "abalone": "log_cosh",
                 "forestFires": "log_cosh",
                 "computerHardware": "log_cosh"}

    # Banner reads "Genetic Algorithms"
    title_text = r"""
   ______                         __     _               ___     __                            _    __     __
  / ____/  ___     ____   ___    / /_   (_)  _____      /   |   / /   ____ _  ____    _____   (_)  / /_   / /_    ____ ___    _____
 / / __   / _ \   / __ \ / _ \  / __/  / /  / ___/     / /| |  / /   / __ `/ / __ \  / ___/  / /  / __/  / __ \  / __ `__ \  / ___/
/ /_/ /  /  __/  / / / //  __/ / /_   / /  / /__      / ___ | / /   / /_/ / / /_/ / / /     / /  / /_   / / / / / / / / / / (__  )
\____/   \___/  /_/ /_/ \___/  \__/  /_/   \___/     /_/  |_|/_/    \__, /  \____/ /_/     /_/   \__/  /_/ /_/ /_/ /_/ /_/ /____/
                                                                   /____/
"""

    output_json = {}

    # ====================== Adjustable Variables ==============================
    # current_data_set = "abalone"
    # mutation_rate = .5
    # maxIter = 10
    # batch_size = .6
    # population_size = 110
    # network_architecture = []
    # ===========================================================================

    output_json["parameters"] = {
        "mutation_rate": mutation_rate,
        "population_size": population_size,
        "network_architecture": network_architecture,
        "cost_func": cost_func[current_data_set],
        "maxIter": maxIter,
        "batch_size": batch_size
    }

    # ================ Data pre-processing =================================================
    dataRetriever = DataRetriever("../../Datasets/metadata.json")
    dataRetriever.retrieveData(current_data_set)
    dataset = dataRetriever.getDataSet().dropna()

    discrete_attr = dataRetriever.getDescreteAttributes()
    cont_attributes = dataRetriever.getContinuousAttributes()

    # Forest Fires: encode month/day cyclically and log-transform the target
    if current_data_set == "forestFires":
        discrete_attr.remove('month')
        discrete_attr.remove('day')
        dataset['month'] = (pd.to_datetime(dataset.month, format='%b').dt.month) - 1
        dataset["day"] = dataset['day'].apply(
            lambda x: list(calendar.day_abbr).index(x.capitalize()))

        # Cyclical encoding of month/day (note: raw indices, not scaled to 2*pi)
        dataset["month_sin"] = np.sin(dataset['month'])
        dataset["month_cos"] = np.cos(dataset['month'])
        dataset["day_sin"] = np.sin(dataset['day'])
        dataset["day_cos"] = np.cos(dataset['day'])

        dataset = dataset.drop('day', axis=1)
        dataset = dataset.drop('month', axis=1)

        cont_attributes.append('month_sin')
        cont_attributes.append('month_cos')
        cont_attributes.append('day_sin')
        cont_attributes.append('day_cos')

        dataset[dataRetriever.getDataClass()] = np.log(
            dataset[dataRetriever.getDataClass()] + 0.000001)
    elif current_data_set == "computerHardware":
        discrete_attr.remove('venderName')
        discrete_attr.remove('modelName')
        dataset = dataset.drop('venderName', axis=1)
        dataset = dataset.drop('modelName', axis=1)

    dataset = dataset.reset_index(drop=True)

    if dataRetriever.getDataClass() in discrete_attr:
        discrete_attr.remove(dataRetriever.getDataClass())

    # ======================= Train Neural Network ================
    print(title_text)

    fold = 0
    metrics = []
    for test_set, train_set in KFolds(dataset, 10):
        fold += 1
        fitness_file = f"../DataDump/GA/{current_data_set}_layer{len(network_architecture)}_fold{fold}_fitness.csv"
        output_file = f"../DataDump/GA/{current_data_set}_layer{len(network_architecture)}_fold{fold}_output.csv"
        metrics.append(
            multiprocess_func.remote(test_set, train_set, fold, fitness_file,
                                     output_file, dataRetriever,
                                     cost_func[current_data_set], current_data_set,
                                     mutation_rate, maxIter, batch_size,
                                     population_size, network_architecture,
                                     pb_actor=None))

    metrics = ray.get(metrics)
    print(metrics)
    print("Average Performance: ", np.asarray(metrics).mean())

    output_json["Metrics"] = metrics
    output_json["Average"] = np.asarray(metrics, dtype=np.float64).mean()
    output_json["Std"] = np.asarray(metrics, dtype=np.float64).std()

    with open(f"../DataDump/GA_{current_data_set}_layer{len(network_architecture)}.json", 'w') as f:
        json.dump(output_json, f, indent=4)
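# A minimal invocation sketch for run_driver. It assumes multiprocess_func is a
# @ray.remote function defined in this module and that the caller initializes ray;
# the dataset name must be a key of cost_func above. Parameter values here simply
# restate the defaults.
import ray

if __name__ == "__main__":
    ray.init()
    run_driver("breastCancer", mutation_rate=0.5, maxIter=1000,
               batch_size=0.6, population_size=110, network_architecture=[15])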
data = data.dropna()
data = data.sample(frac=1.0, random_state=93)
data = data.reset_index(drop=True)
# data = data.drop('idNumber', axis=1)

class_col = dataRetriever.getDataClass()
# data[class_col] = np.log(data[class_col] + 0.001)

contAttr = dataRetriever.getContinuousAttributes()
discAttr = dataRetriever.getDescreteAttributes()
predictionType = dataRetriever.getPredictionType()

output_json = {}
iter_num = 0
for test, train in KFolds(data, 5, stratisfied=True, class_col=class_col):
    # KFolds cannot return a separate validation set, so K is set to half the
    # desired k (5 instead of 10) and each test fold is split in half: with
    # K = 5 each fold is 20% of the data, so the halves are the 10% test and
    # 10% validation sets that K = 10 would have produced.
    sn = StandardNormalizer(train[contAttr])
    train[contAttr] = sn.train_fit()

    test1 = test.sample(frac=0.5, random_state=13)
    test2 = test.drop(test1.index)

    train = train.reset_index(drop=True)
    test1 = test1.reset_index(drop=True)
    test2 = test2.reset_index(drop=True)

    test1[contAttr] = sn.fit(test1[contAttr])
dataRetriever = DataRetriever("../Datasets/metadata.json")
dataRetriever.retrieveData("breastCancer")
dataset = dataRetriever.getDataSet().dropna()
dataset = dataset.reset_index(drop=True)

# This line is used to normalize the data for Forest Fires
# dataset[dataRetriever.getDataClass()] = np.log(dataset[dataRetriever.getDataClass()]+0.1)

maxIter = 1
learning_rate = 1e-3
batch_size = 0.01

metrics = []
fold = 0

# Ten-Fold Cross Validation
for test_set, train_set in KFolds(dataset, 10):
    fold += 1
    print("Fold Num: ", fold)

    # Encode Data
    test_set = test_set.reset_index(drop=True)
    train_set = train_set.reset_index(drop=True)
    ohe = OneHotEncoder()
    discrete_attr = dataRetriever.getDescreteAttributes()
    if dataRetriever.getDataClass() in discrete_attr:
        discrete_attr.remove(dataRetriever.getDataClass())
    train_set = ohe.train_fit(train_set, discrete_attr)
    test_set = ohe.fit(test_set)

    # Normalize Data
    sn = StandardNormalizer(train_set[dataRetriever.getContinuousAttributes()])
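# The preprocessors above share a train_fit/fit contract: parameters are learned
# on the training fold only and then applied unchanged to the test fold. A
# minimal sketch of that contract for the normalizer (assumed behavior, not the
# project's actual StandardNormalizer):
import pandas as pd

class StandardNormalizerSketch:
    def __init__(self, train_cols: pd.DataFrame):
        # Statistics come exclusively from the training fold
        self.mean = train_cols.mean()
        self.std = train_cols.std()
        self._train_cols = train_cols

    def train_fit(self) -> pd.DataFrame:
        # Normalize the training columns with their own statistics
        return (self._train_cols - self.mean) / self.std

    def fit(self, cols: pd.DataFrame) -> pd.DataFrame:
        # Apply training-fold statistics; never re-learn on test data
        return (cols - self.mean) / self.std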
# This first for loop performs the NaiveBayes algorithm for un-shuffled data
jsonResults1 = {}
for dataSet in dataRetriever.getDataMenu():
    dataRetriever.retrieveData(dataSet)
    dataClass = dataRetriever.getDataClass()
    retrievedData = dataRetriever.getDataSet()
    numOfClassValues = len(retrievedData[dataRetriever.getDataClass()].unique())
    method = "macro"
    foldNum = 1
    jsonResults1[dataSet] = {}
    print(f"PRINTING RESULTS FOR THE CONTROL DATASET {dataSet}")

    for train, test in KFolds(retrievedData, 10):
        # Discretize continuous attributes with bins learned on the training fold only
        trainBin = BinDiscretizer(train[dataRetriever.getContinuousAttributes()], multi=True)
        trainBin.train_multi()
        train[dataRetriever.getContinuousAttributes()] = trainBin.fit_multi(
            train[dataRetriever.getContinuousAttributes()])
        test[dataRetriever.getContinuousAttributes()] = trainBin.fit_multi(
            test[dataRetriever.getContinuousAttributes()])

        naiveBayes = NaiveBayes(train, dataClass)
        answers = test[dataClass].to_numpy()[:]
        test = test.drop(columns=dataClass)
        predictions = naiveBayes.test(test)
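# 'method = "macro"' above suggests macro-averaged scoring of the predictions.
# A sketch of macro recall (the unweighted mean of per-class recall), assuming
# 'answers' and 'predictions' are aligned 1-D arrays; the project's own metric
# code may differ:
import numpy as np

def macro_recall_sketch(answers, predictions):
    answers = np.asarray(answers)
    predictions = np.asarray(predictions)
    recalls = [(predictions[answers == cls] == cls).mean()
               for cls in np.unique(answers)]
    return float(np.mean(recalls))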
# data = data.drop('idNumber', axis=1)
class_col = dataRetriever.getDataClass()
# data[class_col] = np.log(data[class_col] + 0.001)

centroidsTrain = pd.read_csv("CSVOutput/normalizedabaloneKMeansClustered.csv")
medoidsTrain = pd.read_csv("CSVOutput/normalizedabaloneMedoidsClustered.csv")

contAttr = dataRetriever.getContinuousAttributes()
discAttr = dataRetriever.getDescreteAttributes()
predictionType = dataRetriever.getPredictionType()

output_json = {}
iter_num = 0
for test, train in tqdm(KFolds(data, 10), total=10):
    k_vals = [1, 3, 5, 7, int(np.floor(np.sqrt(len(train))))]

    # Normalize data
    sn = StandardNormalizer(train[contAttr + [class_col]])
    train[contAttr + [class_col]] = sn.train_fit()
    test[contAttr + [class_col]] = sn.fit(test[contAttr + [class_col]])

    # print("KNN")
    KNN = KNearestNeighbor(test.drop(class_col, axis=1), train, k_vals,
                           contAttr, discAttr, unknown_col=class_col,
                           predictionType=predictionType)
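# The last entry of k_vals uses the common sqrt(n) heuristic for choosing k.
# A hedged refinement, not in the original: rounding it to an odd value reduces
# voting ties in binary classification.
import numpy as np

def sqrt_k(n_train: int) -> int:
    k = int(np.floor(np.sqrt(n_train)))
    return k if k % 2 == 1 else k + 1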