def Ionosphere(X):
    """Clean the ionosphere data: drop a few feature columns, encode the
    g/b label as 1/0, and split features from the target."""
    binaryCols = {"signal": {"g": 1, "b": 0}}
    X = X.copy()
    X = Processor.removeMissing(X)
    X = X.drop(columns=["col0", "col1", "col13"])
    X = Processor.toBinaryCol(X, binaryCols)
    Y = X["signal"]
    X = X.iloc[:, :-1]  # every column except the trailing "signal" label
    return [X, Y]
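# The Processor helpers used above live in Project1/src/Processor.py and are not
# shown here. As a rough, hypothetical sketch (assuming removeMissing drops rows
# containing the dataset's "?" placeholder and toBinaryCol applies a per-column
# value mapping), they might look like the standalone functions below; the
# project's real implementations may differ.


def _remove_missing_sketch(X, placeholder="?"):
    # Drop any row that contains the missing-value placeholder.
    return X[~X.isin([placeholder]).any(axis=1)]


def _to_binary_col_sketch(X, mappings):
    # mappings: {column_name: {original_value: numeric_value}}
    X = X.copy()
    for col, mapping in mappings.items():
        X[col] = X[col].map(mapping)
    return X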
def ttt(X):
    """Clean the tic-tac-toe endgame data: binarize the result label and
    one-hot encode the nine board-cell columns."""
    labels = {"o": 0, "b": 1, "x": 2}  # unused: the cells are one-hot encoded below
    encoding = {"result": {"positive": 1, "negative": 0}}
    X = X.copy()
    X = Processor.toBinaryCol(X, encoding)
    X = Processor.OHE(
        X, cols=["tl", "tm", "tr", "ml", "mm", "mr", "bl", "bm", "br"])
    Y = X["result"]
    X = X.drop(columns=["result"])
    return [X, Y]
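# Processor.OHE is also defined in Project1/src/Processor.py. A minimal sketch,
# assuming it one-hot encodes the given columns (or every object-typed column
# when cols is omitted) with pandas.get_dummies, could be:

import pandas as pd


def _ohe_sketch(X, cols=None):
    # One-hot encode the selected categorical columns; pandas infers them from
    # the object dtype when cols is None.
    return pd.get_dummies(X, columns=cols)

# get_dummies' default "column_value" naming would also explain column names
# such as "native-country_Cambodia" used in adult() below.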
def adult(X):
    """Clean the adult (census income) data: encode the binary columns,
    normalize continuous features, one-hot encode the categoricals, and drop
    the per-country indicator columns plus a few other features."""
    binaryCols = {
        "sex": {"Male": 0, "Female": 1},
        "salary": {">50K": 0, "<=50K": 1},
    }
    X = X.copy()
    X = Processor.removeMissing(X)
    X = Processor.toBinaryCol(X, binaryCols)
    X = Processor.normalize(X, ["fnlwgt", "hours-per-week"])
    Y = X["salary"]
    X = X.iloc[:, :-1]  # every column except the trailing "salary" label
    X = Processor.OHE(X)
    # One-hot native-country indicator columns to drop.
    countryCols = [
        "native-country_Cambodia", "native-country_England",
        "native-country_Puerto-Rico", "native-country_Canada",
        "native-country_Outlying-US(Guam-USVI-etc)", "native-country_India",
        "native-country_Japan", "native-country_Greece",
        "native-country_South", "native-country_China",
        "native-country_Cuba", "native-country_Iran",
        "native-country_Honduras", "native-country_Italy",
        "native-country_Poland", "native-country_Jamaica",
        "native-country_Vietnam", "native-country_Portugal",
        "native-country_Ireland", "native-country_France",
        "native-country_Dominican-Republic", "native-country_Laos",
        "native-country_Ecuador", "native-country_Taiwan",
        "native-country_Haiti", "native-country_Columbia",
        "native-country_Hungary", "native-country_Guatemala",
        "native-country_Nicaragua", "native-country_Scotland",
        "native-country_Thailand", "native-country_Yugoslavia",
        "native-country_El-Salvador", "native-country_Trinadad&Tobago",
        "native-country_Peru", "native-country_Hong",
        "native-country_Holand-Netherlands",
    ]
    X = X.drop(columns=(["capital-gain", "capital-loss", "education-num"] +
                        countryCols))
    return [X, Y]
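# Processor.normalize(X, cols) rescales the listed continuous columns; whether
# it uses min-max or z-score scaling is decided in Project1/src/Processor.py.
# A hypothetical min-max version for illustration only:


def _normalize_sketch(X, cols):
    X = X.copy()
    for col in cols:
        # Rescale each listed column to the [0, 1] range.
        col_min, col_max = X[col].min(), X[col].max()
        X[col] = (X[col] - col_min) / (col_max - col_min)
    return X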
def mam(X):
    """Clean the mammographic-mass data: impute missing values and split
    features from the result label."""
    X = X.copy()
    X = Processor.fillMissing(X)
    Y = X["result"]
    X = X.drop(columns=["result"])
    return [X, Y]
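# Processor.fillMissing imputes rather than drops. A rough sketch, assuming the
# "?" placeholder is replaced with each column's most frequent value (the
# actual strategy is defined in Project1/src/Processor.py):

import numpy as np


def _fill_missing_sketch(X, placeholder="?"):
    X = X.replace(placeholder, np.nan)
    # Fill each column with its mode; iloc[0] picks one value if there are ties.
    return X.apply(lambda col: col.fillna(col.mode().iloc[0]))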
    df = pd.DataFrame(data)
    df_to_table(df, 'time_table_all_final')
    # print(evaluate_acc(Processor.ToNumpyCol(Y_test), model.predict(X_test.to_numpy())))
    # print(cross_validation(5, X_train.to_numpy(), Processor.ToNumpyCol(Y_train), model))

elif ds == "ionosphere":
    path = "../datasets/ionosphere/ionosphere.data"
    header = ["{}{}".format("col", x) for x in range(33 + 1)]
    header.append("signal")

    All = Processor.read(path, header)
    [X, Y] = Clean.Ionosphere(All)
    [X_train, X_test, Y_train, Y_test] = Processor.split(X, Y, train=0.8)

    setup = '''
from Project1.src.NaiveBayes import NaiveBayes
from Project1.src.Processor import Processor
from Project1.src.Clean import Clean
from Project1.src.CrossValidation import cross_validation

path = "../datasets/ionosphere/ionosphere.data"
header = ["{}{}".format("col", x) for x in range(33 + 1)]
header.append("signal")
import matplotlib.pyplot as plt
import numpy as np

from Project1.src.LogisticRegression import LogisticRegression
from Project1.src.NaiveBayes import NaiveBayes
from Project1.src.Processor import Processor
from Project1.src.Clean import Clean
from Project1.src.CrossValidation import cross_validation
from Project1.src.CrossValidation import evaluate_acc

print("Analyzing the ionosphere data set")
path = "../datasets/ionosphere/ionosphere.data"
header = ["{}{}".format("col", x) for x in range(33 + 1)]
header.append("signal")

All = Processor.read(path, header)
[X, Y] = Clean.Ionosphere(All)
X = X.to_numpy()
Y = Processor.ToNumpyCol(Y)

# Train on progressively larger prefixes of the data set.
iters = np.arange(20, X.shape[0], 50)
# print(X.shape)
# print(Y.shape)
accuracies = []
for iter_ in iters:
    # rowsX = X[0:X.shape[0], :]
    # rowsY = Y[0:Y.shape[0], :]
    rowsX = X[0:iter_, :]
Learning rates and the gradient threshold were chosen using the results of the
hyperparameter tuning script.
"""
from Project1.src.LogisticRegression import LogisticRegression
from Project1.src.NaiveBayes import NaiveBayes
from Project1.src.CrossValidation import cross_validation
from Project1.src.Processor import Processor
from Project1.src.Clean import Clean
from Project1.src.HPTuning import df_to_table

import pandas as pd

# Find accuracies for the ionosphere data set
print("Analyzing the ionosphere data set")
path = "../datasets/ionosphere/ionosphere.data"
header = ["{}{}".format("col", x) for x in range(33 + 1)]
header.append("signal")

All = Processor.read(path, header)
[X, Y] = Clean.Ionosphere(All)

ionosphere_results = ['ionosphere']

acc, _, _ = cross_validation(5,
                             X.to_numpy(),
                             Processor.ToNumpyCol(Y),
                             LogisticRegression(),
                             learning_rate=1.0,
                             max_gradient=1e-2,
                             max_iters=50000)
ionosphere_results.append(round(acc, 2))

acc = cross_validation(5, X.to_numpy(), Processor.ToNumpyCol(Y), NaiveBayes())
ionosphere_results.append(round(acc, 2))

print(ionosphere_results)
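# cross_validation comes from Project1/src/CrossValidation.py; its exact return
# values are not shown here (the two calls above unpack it differently). As a
# hypothetical illustration of plain k-fold cross-validation that returns only
# the mean validation accuracy, assuming the model exposes fit/predict and y is
# a flat label array:

import numpy as np


def _k_fold_sketch(k, X, y, model, **fit_kwargs):
    # Split the row indices into k roughly equal folds.
    folds = np.array_split(np.arange(X.shape[0]), k)
    scores = []
    for i in range(k):
        val_idx = folds[i]
        train_idx = np.concatenate([folds[j] for j in range(k) if j != i])
        # Hypothetical interface: extra hyperparameters are forwarded to fit.
        model.fit(X[train_idx], y[train_idx], **fit_kwargs)
        preds = model.predict(X[val_idx])
        scores.append(np.mean(preds == y[val_idx]))
    return float(np.mean(scores))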