def test_q_calculation(self):
    dataRetriever = DataRetriever("../Datasets/metadata.json")
    dataRetriever.retrieveData("breastCancer")
    naiveBayes = NaiveBayes(dataRetriever.getDataSet(), dataRetriever.getDataClass())
    # seperatedByClass = naiveBayes.calculateQ()
    # print(seperatedByClass)
    self.assertEqual(dataRetriever.getDataMenu(),
                     ["breastCancer", "glass", "iris", "soybeanSmall", "vote"],
                     "should return list of data sets")
def testKMeans(self):
    data = DataRetriever("../Datasets/metadata.json")
    data.retrieveData("computerHardware")
    kValue = 15

    t = Timer()
    t.start()
    medoids = KMediods(data.getDataSet(), data.getDataClass(), data.getDescreteAttributes(),
                       data.getContinuousAttributes(), data.getPredictionType(), kValue, 100)
    t.stop()
    print(f"Time: {t}")

    print(medoids)
    medoids.to_csv('kmedoids.csv', index=False)
maxIter = 100
kValue = 78

# These are only used for image segmentation and abalone
# frac = .25
# random_state = 69
# kValue = m.floor(frac * kValue)

dataSetUnNormalized = data.getDataSet()
# This is for Forest Fires
# dataSetUnNormalized[data.getDataClass()] = np.log(dataSetUnNormalized[data.getDataClass()] + 0.001)

sn = StandardNormalizer(dataSetUnNormalized[data.getContinuousAttributes()])
dataSetUnNormalized[data.getContinuousAttributes()] = sn.train_fit()
dataSetNormalized = dataSetUnNormalized

# dataSetNormalized = dataSetNormalized.sample(frac=frac, random_state=random_state)
# dataSetNormalized = dataSetNormalized.reset_index()
# dataSetNormalized = dataSetNormalized.drop(["idNumber"], axis=1)  # For Glass

medoids = KMediods(dataSetNormalized, data.getDataClass(), data.getDescreteAttributes(),
                   data.getContinuousAttributes(), data.getPredictionType(), kValue, maxIter)

medoids.to_csv('./CSVOutput/normalized' + dataSetName + 'MedoidsClustered.csv', index=False)
print(f"CSV for {dataSetName} has been created!")
# This line is used to normalize the data for Forest Fires
# dataset[dataRetriever.getDataClass()] = np.log(dataset[dataRetriever.getDataClass()] + 0.1)

maxIter = 1
learning_rate = 1e-3
batch_size = 0.01
metrics = []
fold = 0

test_set = test_set.reset_index(drop=True)
train_set = train_set.reset_index(drop=True)
ohe = OneHotEncoder()
discrete_attr = dataRetriever.getDescreteAttributes()
if dataRetriever.getDataClass() in discrete_attr:
    discrete_attr.remove(dataRetriever.getDataClass())

train_set = ohe.train_fit(train_set, discrete_attr)
test_set = ohe.fit(test_set)

# Normalize Data
sn = StandardNormalizer(train_set[dataRetriever.getContinuousAttributes()])
train_set[dataRetriever.getContinuousAttributes()] = sn.train_fit()
test_set[dataRetriever.getContinuousAttributes()] = sn.fit(test_set[dataRetriever.getContinuousAttributes()])

# Train network and change architecture in respect to data set
nn = NeuralNetwork(train_set, 2, [6, 16], dataRetriever.getPredictionType(), dataRetriever.getDataClass())
fitness_matrix, average_fitness = nn._particle_swarm_optimize(70, max_iter=500)
maxIter = 100
kValue = 1330

# These are only used for image segmentation and abalone
frac = .25
random_state = 69
kValue = m.floor(frac * kValue)

dataSetUnNormalized = data.getDataSet()
# This is for Forest Fires
# dataSetUnNormalized[data.getDataClass()] = np.log(dataSetUnNormalized[data.getDataClass()] + 0.001)

sn = StandardNormalizer(dataSetUnNormalized[data.getContinuousAttributes()])
dataSetUnNormalized[data.getContinuousAttributes()] = sn.train_fit()
dataSetNormalized = dataSetUnNormalized

dataSetNormalized = dataSetNormalized.sample(frac=frac, random_state=random_state)
dataSetNormalized = dataSetNormalized.reset_index()
# dataSetNormalized = dataSetNormalized.drop(["idNumber"], axis=1)  # For Glass

centroids = KMeans(dataSetNormalized, data.getDataClass(), data.getDescreteAttributes(),
                   data.getContinuousAttributes(), data.getPredictionType(), kValue, maxIter)

centroids.to_csv('./CSVOutput/normalized' + dataSetName + 'KMeansClustered.csv', index=False)
print(f"CSV for {dataSetName} has been created!")
def run_driver(current_data_set,
               mutation_rate=0.5,
               maxIter=1000,
               batch_size=0.6,
               population_size=110,
               network_architecture=[15],
               pb_actor=None):
    cost_func = {
        "breastCancer": "bin_cross",
        "glass": "log_cosh",
        "soybeanSmall": "log_cosh",
        "abalone": "log_cosh",
        "forestFires": "log_cosh",
        "computerHardware": "log_cosh"
    }

    # Banner printed when training starts; the original source defined this as a
    # large multi-line ASCII-art title.
    title_text = "Genetic Algorithm"

    output_json = {}

    # ====================== Adjustable Variables ==============================
    # current_data_set = "abalone"
    # mutation_rate = .5
    # maxIter = 10
    # batch_size = .6
    # population_size = 110
    # network_architecture = []
    # ===========================================================================

    output_json["parameters"] = {
        "mutation_rate": mutation_rate,
        "population_size": population_size,
        "network_architecture": network_architecture,
        "cost_func": cost_func[current_data_set],
        "maxIter": maxIter,
        "batch_size": batch_size
    }

    # ================ Data pre-processing =================================================
    dataRetriever = DataRetriever("../../Datasets/metadata.json")
    dataRetriever.retrieveData(current_data_set)
    dataset = dataRetriever.getDataSet().dropna()

    discrete_attr = dataRetriever.getDescreteAttributes()
    cont_attributes = dataRetriever.getContinuousAttributes()

    # This block encodes the cyclical date features and log-normalizes the target for Forest Fires
    if current_data_set == "forestFires":
        discrete_attr.remove('month')
        discrete_attr.remove('day')
        dataset['month'] = (pd.to_datetime(dataset.month, format='%b').dt.month) - 1
        dataset["day"] = dataset['day'].apply(lambda x: list(calendar.day_abbr).index(x.capitalize()))
        dataset["month_sin"] = np.sin(dataset['month'])
        dataset["month_cos"] = np.cos(dataset['month'])
        dataset["day_sin"] = np.sin(dataset['day'])
        dataset["day_cos"] = np.cos(dataset['day'])
        dataset = dataset.drop('day', axis=1)
        dataset = dataset.drop('month', axis=1)

        cont_attributes.append('month_sin')
        cont_attributes.append('month_cos')
        cont_attributes.append('day_sin')
        cont_attributes.append('day_cos')

        dataset[dataRetriever.getDataClass()] = np.log(dataset[dataRetriever.getDataClass()] + 0.000001)
    elif current_data_set == "computerHardware":
        discrete_attr.remove('venderName')
        discrete_attr.remove('modelName')
        dataset = dataset.drop('venderName', axis=1)
        dataset = dataset.drop('modelName', axis=1)

    dataset = dataset.reset_index(drop=True)

    if dataRetriever.getDataClass() in discrete_attr:
        discrete_attr.remove(dataRetriever.getDataClass())

    # ======================= Train Neural Network ================
    print(title_text)
    fold = 0
    metrics = []

    for test_set, train_set in KFolds(dataset, 10):
        fold += 1
        fitness_file = f"../DataDump/GA/{current_data_set}_layer{len(network_architecture)}_fold{fold}_fitness.csv"
        output_file = f"../DataDump/GA/{current_data_set}_layer{len(network_architecture)}_fold{fold}_output.csv"
        metrics.append(
            multiprocess_func.remote(test_set, train_set, fold, fitness_file, output_file,
                                     dataRetriever, cost_func[current_data_set], current_data_set,
                                     mutation_rate, maxIter, batch_size, population_size,
                                     network_architecture, pb_actor=None))

    metrics = ray.get(metrics)
    print(metrics)
    print("Average Performance: ", np.asarray(metrics).mean())

    output_json["Metrics"] = metrics
    output_json["Average"] = np.asarray(metrics, dtype=np.float64).mean()
    output_json["Std"] = np.asarray(metrics, dtype=np.float64).std()

    with open(f"../DataDump/GA_{current_data_set}_layer{len(network_architecture)}.json", 'w') as f:
        json.dump(output_json, f, indent=4)
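# A minimal usage sketch (not from the original source): the fold workers are dispatched
# with multiprocess_func.remote and gathered with ray.get, so this assumes ray has been
# initialised in the calling process before run_driver is invoked.
if __name__ == "__main__":
    import ray
    ray.init()
    run_driver("breastCancer", mutation_rate=0.5, maxIter=1000,
               batch_size=0.6, population_size=110, network_architecture=[15])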
from MLAlgorithms.Utils.StandardNormalizer import StandardNormalizer
from MLAlgorithms.Utils.DataRetriever import DataRetriever
from MLAlgorithms.Utils.ClassifierAnalyzer import ClassifierAnalyzer
from MLAlgorithms.Utils.KFolds import KFolds  # used below; import path assumed to mirror the other Utils imports
import numpy as np
import json

dataRetriever = DataRetriever("../Datasets/metadata.json")
dataRetriever.retrieveData("vote")

data = dataRetriever.getDataSet()
data = data.dropna()
data = data.sample(frac=1.0, random_state=93)
data = data.reset_index(drop=True)
# data = data.drop('idNumber', axis=1)

class_col = dataRetriever.getDataClass()
# data[class_col] = np.log(data[class_col] + 0.001)
contAttr = dataRetriever.getContinuousAttributes()
discAttr = dataRetriever.getDescreteAttributes()
predictionType = dataRetriever.getPredictionType()

output_json = {}
iter_num = 0
for test, train in KFolds(data, 5, stratisfied=True, class_col=class_col):
    # KFolds doesn't have the capability of returning a validate set,
    # so K is set to the desired k/2 and the validate set is half of the test set
    sn = StandardNormalizer(train[contAttr])
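    # A minimal sketch (not in the original snippet) of the validate-set idea described in
    # the comment above: split each returned test fold in half, keeping one half for
    # validation and the other for the final test.
    half = len(test) // 2
    validate = test.iloc[:half].reset_index(drop=True)
    test = test.iloc[half:].reset_index(drop=True)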
def network_tuner(*nodes_per_hidden_layer):
    """
    This function is used to calculate the optimal network architecture.

    The user should input the data set they would like to operate with and change the
    performance metric in accordance with the data set type, i.e. regression or classification.
    """
    MSEs = []
    bestNetwork = {}
    learning_rate = 0.0001
    maxIter = 500
    batch_size = .5

    dataRetriever = DataRetriever("../Datasets/metadata.json")
    dataRetriever.retrieveData("glass")

    dataset = dataRetriever.getDataSet().dropna()
    dataset = dataset.reset_index(drop=True)

    # This line is used to normalize the data for Forest Fires
    # dataset[dataRetriever.getDataClass()] = np.log(dataset[dataRetriever.getDataClass()] + 0.1)

    dataset[dataRetriever.getContinuousAttributes()] = (
        dataset[dataRetriever.getContinuousAttributes()]
        - dataset[dataRetriever.getContinuousAttributes()].mean()
    ) / dataset[dataRetriever.getContinuousAttributes()].std()

    test_set = dataset.sample(frac=0.1, random_state=69)
    train_set = dataset.drop(test_set.index)
    test_set = test_set.reset_index(drop=True)
    train_set = train_set.reset_index(drop=True)

    ohe = OneHotEncoder()
    discrete_attr = dataRetriever.getDescreteAttributes()

    if dataRetriever.getDataClass() in discrete_attr:
        discrete_attr.remove(dataRetriever.getDataClass())

    datasetEncoded = ohe.train_fit(train_set, dataRetriever.getDescreteAttributes())
    testEncoded = ohe.fit(test_set)

    # Baseline network with no hidden layers
    output = None
    nn = NeuralNetwork(datasetEncoded, 0, [], dataRetriever.getPredictionType(), dataRetriever.getDataClass())
    for i in range(maxIter):
        # We don't call an initial feedforward because backpropagate starts with a feedforward call
        # batch_size represents the number of data points per batch
        output = nn._back_propagate(learning_rate=learning_rate, batch_size=batch_size)

    final = nn.test(testEncoded.drop(dataRetriever.getDataClass(), axis=1))
    output = nn._feed_forward(testEncoded.drop(dataRetriever.getDataClass(), axis=1), testing=True)
    actual = testEncoded[dataRetriever.getDataClass()]

    # ===================== Classification =================
    correct = 0
    acc = 0
    for i, row in enumerate(final):
        if row == actual.iloc[i]:
            correct += 1
    acc = correct / len(test_set)

    # final = final.reshape(final.shape[0])
    # MSE = ((actual-final)**2).mean()
    # MSEs.append(MSE)

    bestNetwork['network'] = nn
    bestNetwork['acc'] = acc
    bestNetwork['arc'] = [0]

    # ============ Compare Acc to Most Common Class ============
    values = test_set[dataRetriever.getDataClass()].value_counts()

    # USED FOR CLASSIFICATION
    # print(f'Accuracy: {acc}')
    # print(f'Max Class Prior: {values.max()/values.sum()}')
    # print(f"Class Distribution:\n{values}")
    # print("Final: ", final)
    # print("Actual: ", list(actual))
    # print()

    # Grid search over every combination of nodes per hidden layer
    numOfLayer = len(nodes_per_hidden_layer)
    print("Number of Hidden Layers: ", numOfLayer)
    for layer in range(numOfLayer):
        print(f"Layer Number: {layer + 1}")
        combinations = list(itertools.product(*nodes_per_hidden_layer[:layer + 1]))

        for combo in combinations:
            output = None
            print("Node Combination: ", list(combo))

            nn = NeuralNetwork(datasetEncoded, layer + 1, list(combo),
                               dataRetriever.getPredictionType(), dataRetriever.getDataClass())

            for i in range(maxIter):
                # We don't call an initial feedforward because backpropagate starts with a feedforward call
                # batch_size represents the number of data points per batch
                output = nn._back_propagate(learning_rate=learning_rate, batch_size=batch_size)

            final = nn.test(testEncoded.drop(dataRetriever.getDataClass(), axis=1))
            output = nn._feed_forward(testEncoded.drop(dataRetriever.getDataClass(), axis=1), testing=True)
            actual = testEncoded[dataRetriever.getDataClass()]

            # ===================== Classification =================
            correct = 0
            acc = 0
            for i, row in enumerate(final):
                if row == actual.iloc[i]:
                    correct += 1
            acc = correct / len(test_set)

            # ============ Compare Acc to Most Common Class ============
            values = test_set[dataRetriever.getDataClass()].value_counts()

            # USED FOR CLASSIFICATION
            # print(f'Accuracy: {acc}')
            # print(f'Max Class Prior: {values.max()/values.sum()}')
            # print(f"Class Distribution:\n{values}")
            # print("Final: ", final)
            # print("Actual: ", list(actual))
            # print()

            if acc > bestNetwork['acc']:
                bestNetwork['network'] = nn
                bestNetwork['acc'] = acc
                bestNetwork['arc'] = combo

            # final = final.reshape(final.shape[0])
            # MSE = ((actual-final)**2).mean()
            # MSEs.append(MSE)
            # if MSE < bestNetwork['acc']:
            #     bestNetwork['network'] = nn
            #     bestNetwork['acc'] = MSE
            #     bestNetwork['arc'] = combo

    return bestNetwork  # , MSEs
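# A minimal usage sketch (not from the original source): each positional argument is a
# list of candidate node counts for one hidden layer, so this call searches one- and
# two-hidden-layer architectures built from those counts.
best = network_tuner([2, 4, 8], [2, 4])
print("Best architecture: ", best['arc'], " accuracy: ", best['acc'])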
learning_rate = 1e-3
batch_size = 0.01
metrics = []
fold = 0

# Ten-Fold Cross Validation
for test_set, train_set in KFolds(dataset, 10):
    fold += 1
    print("Fold Num: ", fold)

    # Encode Data
    test_set = test_set.reset_index(drop=True)
    train_set = train_set.reset_index(drop=True)
    ohe = OneHotEncoder()
    discrete_attr = dataRetriever.getDescreteAttributes()
    if dataRetriever.getDataClass() in discrete_attr:
        discrete_attr.remove(dataRetriever.getDataClass())

    train_set = ohe.train_fit(train_set, discrete_attr)
    test_set = ohe.fit(test_set)

    # Normalize Data
    sn = StandardNormalizer(train_set[dataRetriever.getContinuousAttributes()])
    train_set[dataRetriever.getContinuousAttributes()] = sn.train_fit()
    test_set[dataRetriever.getContinuousAttributes()] = sn.fit(test_set[dataRetriever.getContinuousAttributes()])

    # Train network and change architecture in respect to data set
    nn = NeuralNetwork(train_set, 2, [2, 2], dataRetriever.getPredictionType(), dataRetriever.getDataClass())
    nn.train(maxIter, learning_rate, batch_size)
print("The Percent of Correct Predictions is {t}%".format( t=round((t * 100 / len(answers)), 1))) print("The Percent of Incorrect Predictions is {f}%\n".format( f=round((f * 100 / len(answers)), 1))) dataRetriever = DataRetriever("../Datasets/metadata.json") ################################################ Un-Shuffled Data ################################################ # This first for loop performs the NaiveBayes algorithm for un-shuffled data jsonResults1 = {} for dataSet in dataRetriever.getDataMenu(): dataRetriever.retrieveData(dataSet) dataClass = dataRetriever.getDataClass() retrievedData = dataRetriever.getDataSet() numOfClassValues = len( retrievedData[dataRetriever.getDataClass()].unique()) method = "macro" foldNum = 1 jsonResults1[dataSet] = {} print(f"PRINTING RESULTS FOR THE CONTROL DATASET {dataSet}") for train, test in KFolds(retrievedData, 10): trainBin = BinDiscretizer( train[dataRetriever.getContinuousAttributes()], multi=True)
if __name__ == "__main__": dataRetriever = DataRetriever("../Datasets/metadata.json") dataRetriever.retrieveData("computerHardware") data = dataRetriever.getDataSet() data = data.dropna() data = data.reset_index(drop=True)[[ "venderName", "modelName", "myct", "mmin", "mmax", "cach", "chmin", "chmax", "prp", "erp" ]] test = data.sample(frac=0.2) train = data.drop(test.index) test = test.reset_index(drop=True) train = train.reset_index(drop=True) VDM = ValueDifferenceMetric( data, unknown_col=dataRetriever.getDataClass(), prediction_type=dataRetriever.getPredictionType()) start = time.time() VDM.train() print(f"Training took: {time.time() - start} seconds") start = time.time() VDM.calc_distance_matrix(data["mmin"], data["mmin"]) print(f"Matrix took: {time.time() - start} seconds") # print(KNN.get_neighbors([5,10])) # print(KNN.test(augmented=True))
mutation_rate = .5
maxIter = 1000
batch_size = .6
population_size = 110
# ===========================================================================

# ================ Data pre-processing =================================================
dataRetriever = DataRetriever("../Datasets/metadata.json")
dataRetriever.retrieveData(current_data_set)
dataset = dataRetriever.getDataSet().dropna()

discrete_attr = dataRetriever.getDescreteAttributes()
cont_attributes = dataRetriever.getContinuousAttributes()

# This block encodes the cyclical date features and normalizes the data for Forest Fires
if current_data_set == "forestFires":
    zeros = dataset[dataset[dataRetriever.getDataClass()] < 1].index
    print(len(zeros) / len(dataset))
    dataset = dataset.drop(zeros)

    discrete_attr.remove('month')
    discrete_attr.remove('day')
    dataset['month'] = (pd.to_datetime(dataset.month, format='%b').dt.month) - 1
    dataset["day"] = dataset['day'].apply(lambda x: list(calendar.day_abbr).index(x.capitalize()))
    dataset["month_sin"] = np.sin(dataset['month'])
    dataset["month_cos"] = np.cos(dataset['month'])
    dataset["day_sin"] = np.sin(dataset['day'])
    dataset["day_cos"] = np.cos(dataset['day'])
    dataset = dataset.drop('day', axis=1)
learning_rate = 1e-3
batch_size = 0.01
metrics = []
fold = 0

# Ten-Fold Cross Validation
for test_set, train_set in KFolds(dataset, 10):
    fold += 1
    print("Fold Num: ", fold)

    # Encode Data
    test_set = test_set.reset_index(drop=True)
    train_set = train_set.reset_index(drop=True)
    ohe = OneHotEncoder()
    discrete_attr = dataRetriever.getDescreteAttributes()
    if dataRetriever.getDataClass() in discrete_attr:
        discrete_attr.remove(dataRetriever.getDataClass())

    train_set = ohe.train_fit(train_set, discrete_attr)
    test_set = ohe.fit(test_set)

    # Normalize Data
    sn = StandardNormalizer(train_set[dataRetriever.getContinuousAttributes()])
    train_set[dataRetriever.getContinuousAttributes()] = sn.train_fit()
    test_set[dataRetriever.getContinuousAttributes()] = sn.fit(test_set[dataRetriever.getContinuousAttributes()])

    # Train network and change architecture in respect to data set
    nn = NeuralNetwork(train_set, 0, [], dataRetriever.getPredictionType(), dataRetriever.getDataClass())
    nn.train(maxIter, learning_rate, batch_size)
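    # A minimal per-fold evaluation sketch (not part of the original snippet), assuming a
    # classification data set and the same nn.test interface used in network_tuner above:
    # score the trained network on the held-out fold and record its accuracy.
    final = nn.test(test_set.drop(dataRetriever.getDataClass(), axis=1))
    actual = test_set[dataRetriever.getDataClass()]
    correct = sum(1 for i, row in enumerate(final) if row == actual.iloc[i])
    metrics.append(correct / len(test_set))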