def rhc(problem, iterations, random_seed, graph_file, graph_title):
    """Run random hill climbing for each iteration budget and chart the result.

    For every value in ``iterations`` the optimiser is run from scratch with
    that value as ``max_iters``; the best fitness, the wall-clock duration and
    the value of the module-level ``eval_count`` after the run are recorded.
    A fitness-vs-iterations line is then drawn and handed to
    ``generate_graph``, and a short summary is printed.

    Args:
        problem: Optimisation problem passed straight to
            ``mlrose_hiive.random_hill_climb``.
        iterations: Iterable of ``max_iters`` values; also used as the x axis.
        random_seed: Seed forwarded as ``random_state``.
        graph_file: Prefix for the output graph name ("rhc" is appended).
        graph_title: Prefix for the graph title.

    Raises:
        ValueError: If ``iterations`` is empty (``max`` of an empty list).
    """
    # NOTE(review): eval_count is presumably incremented by the problem's
    # fitness function; this function only resets and reads it -- confirm.
    global eval_count

    scores = []
    durations = []
    evaluations = []

    for budget in iterations:
        eval_count = 0
        started = datetime.datetime.now()
        _, score, _ = mlrose_hiive.random_hill_climb(
            problem, max_iters=budget, random_state=random_seed)
        stopped = datetime.datetime.now()

        scores.append(score)
        durations.append((stopped - started).total_seconds())
        evaluations.append(eval_count)

    plt.plot(iterations, scores, label="Fitness score")
    plt.legend(loc="best")
    plt.grid()
    generate_graph(graph_file + "rhc", graph_title + "Random Hill Climbing",
                   "Iterations", "Fitness")

    # Report the first run that reached the overall best score.
    top_score = max(scores)
    print('Best score achieved: ', top_score)
    winner = scores.index(top_score)
    print('Time taken to achieve that: ', durations[winner])
    print('Function evaluations taken to achieve that: ', evaluations[winner])
def ga(problem, iterations, random_seed, graph_file, graph_title):
    """Sweep the genetic-algorithm mutation probability and chart the results.

    For each mutation probability in a fixed list, the genetic algorithm is
    run once per value in ``iterations`` (used as ``max_iters``). Each run
    records the best fitness, the wall-clock duration and the module-level
    ``eval_count`` after the run. Three graphs are produced through
    ``generate_graph``: fitness vs iterations (one line per probability),
    best score vs probability, and a best-score vs time scatter annotated
    with the probability. A summary is printed at the end.

    Args:
        problem: Optimisation problem passed to ``mlrose_hiive.genetic_alg``.
        iterations: Iterable of ``max_iters`` values; also used as the x axis.
        random_seed: Seed forwarded as ``random_state``.
        graph_file: Prefix for the output graph names.
        graph_title: Prefix for the graph titles.

    Raises:
        ValueError: If ``iterations`` is empty (``max`` of an empty list).
    """
    # NOTE(review): eval_count is presumably incremented by the problem's
    # fitness function; this function only resets and reads it -- confirm.
    global eval_count

    mutation_prob = [0.1, 0.2, 0.3, 0.4, 0.5]
    best_score = []
    time_taken = []
    fn_evals_taken = []

    for m in mutation_prob:
        fitness = []
        fit_time = []
        fn_evals = []
        for i in iterations:
            eval_count = 0
            start = datetime.datetime.now()
            _, best_fitness, _ = mlrose_hiive.genetic_alg(
                problem, mutation_prob=m, max_iters=i,
                random_state=random_seed)
            finish = datetime.datetime.now()
            fitness.append(best_fitness)
            fit_time.append((finish - start).total_seconds())
            fn_evals.append(eval_count)

        # Best score for this mutation probability, plus the time and the
        # evaluation count of the first run that reached it.
        top = max(fitness)
        index = fitness.index(top)
        best_score.append(top)
        time_taken.append(fit_time[index])
        fn_evals_taken.append(fn_evals[index])
        plt.plot(iterations, fitness, label="Mutation = " + str(m))

    plt.legend(loc="best", title='Mutation Probability')
    plt.grid()
    generate_graph(graph_file + "ga", graph_title + "Genetic Algorithm",
                   "Iterations", "Fitness")

    # Best score reached for each mutation probability.
    plt.plot(mutation_prob, best_score)
    plt.grid()
    generate_graph(graph_file + "ga_mut", graph_title + "Genetic Algorithm",
                   "Mutation Probability", "Best Score Achieved")

    # Score vs time trade-off, each point labelled with its probability.
    plt.scatter(time_taken, best_score)
    for prob, elapsed, score in zip(mutation_prob, time_taken, best_score):
        # The label is passed positionally: the keyword was renamed from
        # ``s`` to ``text`` in matplotlib, so ``s=`` fails on newer versions.
        plt.annotate(str(prob), xy=(elapsed, score))
    plt.legend(loc='best', title='Mutation Probability')
    plt.grid()
    generate_graph(graph_file + "ga_scatter",
                   graph_title + "Genetic Algorithm",
                   "Time Taken", "Best Score achieved")

    print('Mutation prob: ', mutation_prob)
    print('Best scores reached: ', best_score)
    print('Time taken to do that: ', time_taken)
    print('Function evaluations taken: ', fn_evals_taken)
def mimic(problem, iterations, random_seed, graph_file, graph_title):
    """Sweep MIMIC's ``keep_pct`` and chart the results.

    For each ``keep_pct`` in a fixed list, MIMIC is run once per value in
    ``iterations`` (used as ``max_iters``). Each run records the best
    fitness, the wall-clock duration and the module-level ``eval_count``
    after the run. Three graphs are produced through ``generate_graph``:
    fitness vs iterations (one line per ``keep_pct``), best score vs
    ``keep_pct``, and a best-score vs time scatter annotated with
    ``keep_pct``. A summary is printed at the end.

    Args:
        problem: Optimisation problem passed to ``mlrose_hiive.mimic``.
        iterations: Iterable of ``max_iters`` values; also used as the x axis.
        random_seed: Seed forwarded as ``random_state``.
        graph_file: Prefix for the output graph names.
        graph_title: Prefix for the graph titles.

    Raises:
        ValueError: If ``iterations`` is empty (``max`` of an empty list).
    """
    # NOTE(review): eval_count is presumably incremented by the problem's
    # fitness function; this function only resets and reads it -- confirm.
    global eval_count

    keep_pct = [0.1, 0.25, 0.50]
    best_score = []
    time_taken = []
    fn_evals_taken = []

    for k in keep_pct:
        fitness = []
        fit_time = []
        fn_evals = []
        for i in iterations:
            eval_count = 0
            start = datetime.datetime.now()
            _, best_fitness, _ = mlrose_hiive.mimic(
                problem, keep_pct=k, max_iters=i, random_state=random_seed)
            finish = datetime.datetime.now()
            fitness.append(best_fitness)
            fit_time.append((finish - start).total_seconds())
            fn_evals.append(eval_count)

        # Best score for this keep_pct, plus the time and the evaluation
        # count of the first run that reached it.
        top = max(fitness)
        index = fitness.index(top)
        best_score.append(top)
        time_taken.append(fit_time[index])
        fn_evals_taken.append(fn_evals[index])
        plt.plot(iterations, fitness, label="keep_pct = " + str(k))

    plt.legend(loc="best", title='Proportion of samples kept')
    plt.grid()
    generate_graph(graph_file + "mimic", graph_title + "MIMIC: ",
                   "Iterations", "Fitness")

    # Best score reached for each keep_pct.
    plt.plot(keep_pct, best_score)
    plt.grid()
    generate_graph(graph_file + "mimic_pct", graph_title + "MIMIC",
                   "Proportion of samples kept", "Best Score Achieved")

    # Score vs time trade-off, each point labelled with its keep_pct.
    plt.scatter(time_taken, best_score)
    for pct, elapsed, score in zip(keep_pct, time_taken, best_score):
        # The label is passed positionally: the keyword was renamed from
        # ``s`` to ``text`` in matplotlib, so ``s=`` fails on newer versions.
        plt.annotate(str(pct), xy=(elapsed, score))
    plt.legend(loc='best', title='Proportion of samples kept')
    plt.grid()
    generate_graph(graph_file + "mimic_scatter", graph_title + "MIMIC",
                   "Time Taken", "Best Score achieved")

    print('Proportion of samples kept: ', keep_pct)
    print('Best scores reached: ', best_score)
    print('Time taken to do that: ', time_taken)
    print('Function evaluations taken: ', fn_evals_taken)
def one_max():
    """Draw the three One Max algorithm-comparison bar charts.

    The figures (best score, running time, function evaluations) use
    hard-coded results for RHC, SA, GA and MIMIC; each chart is handed to
    ``generate_graph`` with its own output name, title and y-axis label.
    """
    labels = ['RHC', 'SA', 'GA', 'MIMIC']
    palette = ['coral', 'orange', 'mediumseagreen', 'cornflowerblue']
    positions = np.arange(4)

    # (bar heights, output name, title, y-axis label), in drawing order.
    charts = [
        ([46, 44, 50, 50],
         "one_max_score", "One Max - Best Scores", "Best Score Achieved"),
        ([0.00773, 0.006309, 0.554985, 19.869137],
         "one_max_time", "One Max - Running Time",
         "Time taken to achieve that"),
        ([88, 214, 6039, 3221],
         "one_max_evals", "One Max - Function evaluations",
         "Function evaluations taken"),
    ]

    for heights, file_name, title, y_label in charts:
        plt.bar(positions, height=heights, color=palette)
        plt.xticks(positions, labels)
        generate_graph(file_name, title, "Algorithms", y_label)
def ks():
    """Draw the three Knapsack algorithm-comparison bar charts.

    The figures (best score, running time, function evaluations) use
    hard-coded results for RHC, SA, GA and MIMIC; each chart is handed to
    ``generate_graph`` with its own output name, title and y-axis label.
    """
    labels = ['RHC', 'SA', 'GA', 'MIMIC']
    palette = ['coral', 'orange', 'mediumseagreen', 'cornflowerblue']
    positions = np.arange(4)

    # (bar heights, output name, title, y-axis label), in drawing order.
    charts = [
        ([41, 45, 50, 50],
         "ks_score", "Knapsack - Best Scores", "Best Score Achieved"),
        ([0.002853, 0.007676, 0.287459, 0.608017],
         "ks_time", "Knapsack - Running Time",
         "Time taken to achieve that"),
        ([18, 28, 2615, 2413],
         "ks_evals", "Knapsack - Function evaluations",
         "Function evaluations taken"),
    ]

    for heights, file_name, title, y_label in charts:
        plt.bar(positions, height=heights, color=palette)
        plt.xticks(positions, labels)
        generate_graph(file_name, title, "Algorithms", y_label)
def cp():
    """Draw the three Continuous Peaks algorithm-comparison bar charts.

    The figures (best score, running time, function evaluations) use
    hard-coded results for RHC, SA, GA and MIMIC; each chart is handed to
    ``generate_graph`` with its own output name, title and y-axis label.
    """
    labels = ['RHC', 'SA', 'GA', 'MIMIC']
    palette = ['coral', 'orange', 'mediumseagreen', 'cornflowerblue']
    positions = np.arange(4)

    # (bar heights, output name, title, y-axis label), in drawing order.
    charts = [
        ([56, 84, 94, 85],
         "cp_score", "Continuous Peaks - Best Scores",
         "Best Score Achieved"),
        ([0.002085, 0.048171, 1.746986, 43.326225],
         "cp_time", "Continuous Peaks - Running Time",
         "Time taken to achieve that"),
        ([13, 819, 12880, 6846],
         "cp_evals", "Continuous Peaks - Function evaluations",
         "Function evaluations taken"),
    ]

    for heights, file_name, title, y_label in charts:
        plt.bar(positions, height=heights, color=palette)
        plt.xticks(positions, labels)
        generate_graph(file_name, title, "Algorithms", y_label)
return False, None, None, None  # NOTE(review): tail of a definition that starts before this chunk


def get_manhattan_heuristic(node, goal):
    """Return the Manhattan distance between two nodes on an 8-wide grid.

    ``node`` and ``goal`` are converted with ``int`` and split into
    (row, column) by ``divmod(..., 8)``, i.e. they are treated as row-major
    cell indices of a grid with 8 columns. The result is the sum of the
    absolute row and column differences.
    """
    i, j = divmod(int(node), 8)
    i_goal, j_goal = divmod(int(goal), 8)
    i_delta = abs(i - i_goal)
    j_delta = abs(j - j_goal)
    manhattan_dist = i_delta + j_delta
    return manhattan_dist


if __name__ == '__main__':
    # Build the graph, then search from node '0' to node '61' with both
    # algorithms and report the path and the number of explored nodes.
    graph_neighbours = generate_graph()
    print("============ UCS Search ================")
    path_ucs, explored_ucs = uniform_cost_search(graph_neighbours, '0', '61')
    print("Path UCS:", path_ucs)
    # print("Explored Nodes UCS: ", explored_ucs)
    print(len(explored_ucs))
    print()
    print("============ AStar Search ================")
    path_astar, explored_astar = astar_search(graph_neighbours, '0', '61')
    print("Path_astar:", path_astar)
    print("Explored Nodes A Star: ", explored_astar)
    print(len(explored_astar))
    print()
def sa(problem, iterations, random_seed, graph_file, graph_title):
    """Sweep the simulated-annealing cooling rate and chart the results.

    For each decay value in a fixed list, an ``ArithDecay`` schedule with
    ``init_temp=1.0`` is built and simulated annealing is run once per value
    in ``iterations`` (used as ``max_iters``). Each run records the best
    fitness, the wall-clock duration and the module-level ``eval_count``
    after the run. Four graphs are produced through ``generate_graph``:
    fitness vs iterations (one line per decay), best score vs decay, time
    vs decay, and a best-score vs time scatter annotated with the decay.
    A summary is printed at the end.

    Args:
        problem: Optimisation problem passed to
            ``mlrose_hiive.simulated_annealing``.
        iterations: Iterable of ``max_iters`` values; also used as the x axis.
        random_seed: Seed forwarded as ``random_state``.
        graph_file: Prefix for the output graph names.
        graph_title: Prefix for the graph titles.

    Raises:
        ValueError: If ``iterations`` is empty (``max`` of an empty list).
    """
    # NOTE(review): eval_count is presumably incremented by the problem's
    # fitness function; this function only resets and reads it -- confirm.
    global eval_count

    decays = [0.001, 0.002, 0.003, 0.004, 0.005]
    best_score = []
    time_taken = []
    fn_evals_taken = []

    for decay in decays:
        schedule = mlrose_hiive.ArithDecay(init_temp=1.0, decay=decay)
        fitness = []
        fit_time = []
        fn_evals = []
        for i in iterations:
            eval_count = 0
            start = datetime.datetime.now()
            _, best_fitness, _ = mlrose_hiive.simulated_annealing(
                problem, schedule=schedule, max_iters=i,
                random_state=random_seed)
            finish = datetime.datetime.now()
            fn_evals.append(eval_count)
            fitness.append(best_fitness)
            fit_time.append((finish - start).total_seconds())

        # Best score for this decay, plus the time and the evaluation count
        # of the first run that reached it.
        top = max(fitness)
        index = fitness.index(top)
        best_score.append(top)
        time_taken.append(fit_time[index])
        fn_evals_taken.append(fn_evals[index])
        plt.plot(iterations, fitness, label="Cooling = " + str(decay))

    plt.legend(loc="best")
    plt.grid()
    generate_graph(graph_file + "sa_iter",
                   graph_title + "Simulated Annealing",
                   "Iterations", "Fitness")

    # Best score reached for each decay value.
    plt.plot(decays, best_score)
    plt.grid()
    generate_graph(graph_file + "sa_decays",
                   graph_title + "Simulated Annealing",
                   "Cooling Component", "Best Score Achieved")

    # Time of the best-scoring run for each decay value.
    plt.plot(decays, time_taken)
    plt.grid()
    generate_graph(graph_file + "sa_decay_time",
                   graph_title + "Simulated Annealing",
                   "Cooling Component", "Time taken to achieve that")

    # Score vs time trade-off, each point labelled with its decay value.
    plt.scatter(time_taken, best_score)
    for rate, elapsed, score in zip(decays, time_taken, best_score):
        # The label is passed positionally: the keyword was renamed from
        # ``s`` to ``text`` in matplotlib, so ``s=`` fails on newer versions.
        plt.annotate(str(rate), xy=(elapsed, score))
    plt.legend(loc='best', title='Cooling Component')
    plt.grid()
    generate_graph(graph_file + "sa_scatter",
                   graph_title + "Simulated Annealing",
                   "Time Taken", "Best Score achieved")

    print('decays: ', decays)
    print('Best scores reached: ', best_score)
    print('Time taken to do that: ', time_taken)
    print('Function evaluations taken: ', fn_evals_taken)
def pulsar_dataset():
    """Tune and evaluate an MLP classifier on the HTRU_2 dataset.

    Steps, in order:
      1. Load ``datasets/HTRU_2.csv``, drop rows with missing values and
         hold out 20% of the rows as the final test set.
      2. Plot a learning curve for a one-unit hidden layer.
      3. Split the remaining data 80/20 again and compare train/validation
         accuracy for several hidden layer sizes; print and plot the table.
      4. Plot a learning curve for the chosen size (3), then fit on all
         non-held-out data and print the accuracy on the held-out test set.
    """
    seed = 7

    frame = pd.read_csv('datasets/HTRU_2.csv').dropna()
    print('data size***********', frame.shape)

    # The last column is the label; everything before it is a feature.
    features = frame.iloc[:, :-1]
    labels = frame.iloc[:, -1]
    X, X_test, y, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=seed)

    # Baseline learning curve with a single hidden unit.
    train_sizes = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    baseline = MLPClassifier(hidden_layer_sizes=1, random_state=seed)
    util.plot_lc_nn(mlp_model=baseline, X=X, y=y, train_sizes=train_sizes,
                    graph_name='nn/nn_htru_')

    # Hyperparameter search over the hidden layer size.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, train_size=0.8, random_state=seed)
    hidden_layer_sizes = [1, 3, 5, 7, 10]
    train_score = []
    val_score = []
    for width in hidden_layer_sizes:
        candidate = MLPClassifier(hidden_layer_sizes=width, random_state=seed)
        candidate.fit(X=X_train, y=y_train)
        train_score.append(
            accuracy_score(y_train, candidate.predict(X_train)))
        val_score.append(accuracy_score(y_val, candidate.predict(X_val)))

    summary = pd.DataFrame({
        'Hidden layer sizes': hidden_layer_sizes,
        'train score': train_score,
        'validation score': val_score
    })
    print('Hidden layers**************')
    print(summary)

    # Accuracy against hidden layer size.
    plt.plot(hidden_layer_sizes, train_score, 'o-', color="r",
             label="Training score")
    plt.plot(hidden_layer_sizes, val_score, 'o-', color="g",
             label="Validation score")
    plt.legend(loc="best")
    util.generate_graph("nn/nn_htru_layers", "Hidden layer sizes Vs Accuracy",
                        "Hidden layer sizes", "Accuracy Score")

    # Chosen layer size = 3: learning curve for the tuned network.
    tuned = MLPClassifier(hidden_layer_sizes=3, random_state=seed)
    util.plot_lc_nn(mlp_model=tuned, X=X, y=y, train_sizes=train_sizes,
                    graph_name='nn/nn_htru_tuned_')

    # Final accuracy of a fresh tuned model on the held-out test set.
    final_model = MLPClassifier(hidden_layer_sizes=3, random_state=seed)
    final_model.fit(X, y)
    final_accuracy = accuracy_score(y_test, final_model.predict(X_test))
    print(
        "MLPClassifier - HTRU_2 Dataset - Final Accuracy score on the test set: ",
        final_accuracy)
def wine_dataset():
    """Tune and evaluate a k-nearest-neighbours classifier on white wine data.

    Steps, in order:
      1. Load ``datasets/winequality-white.csv`` (``;`` separated), drop rows
         with missing values and hold out 20% of the rows as the test set.
      2. Plot a learning curve for k = 3.
      3. Split the remaining data 80/20 again and compare train/test
         accuracy for k = 1..30; print and plot the table.
      4. Plot a learning curve for the chosen k = 14, then fit on all
         non-held-out data and print the accuracy on the held-out test set.
    """
    seed = 7

    frame = pd.read_csv('datasets/winequality-white.csv', sep=';').dropna()
    print('data size***********', frame.shape)

    # The last column is the label; everything before it is a feature.
    features = frame.iloc[:, :-1]
    labels = frame.iloc[:, -1]
    X, X_test, y, y_test = train_test_split(
        features, labels, train_size=0.8, random_state=seed)

    # Learning curve before tuning.
    baseline = KNeighborsClassifier(n_neighbors=3)
    train_sizes = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    util.plot_learning_curve(estimator=baseline, title='Learning Curve - KNN',
                             X=X, y=y, cv=3, train_sizes=train_sizes,
                             graph_name='knn/knn_wine_')

    # Tune the number of neighbours on a further 80/20 split.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, train_size=0.8, random_state=seed)
    k_neighbours = range(1, 31)
    train_score = []
    val_score = []
    for k in k_neighbours:
        candidate = KNeighborsClassifier(n_neighbors=k)
        candidate.fit(X=X_train, y=y_train)
        train_score.append(
            accuracy_score(y_train, candidate.predict(X_train)))
        val_score.append(accuracy_score(y_val, candidate.predict(X_val)))

    summary = pd.DataFrame({
        'No. neighbours': k_neighbours,
        'train score': train_score,
        'test score': val_score
    })
    print('K neighbours**************')
    print(summary)

    # Accuracy against k.
    plt.plot(k_neighbours, train_score, 'o-', color="r",
             label="Training score")
    plt.plot(k_neighbours, val_score, 'o-', color="g", label="Test score")
    plt.legend(loc="best")
    util.generate_graph("knn/knn_wine_nei", "K neighbours Vs Accuracy",
                        "K neighbours", "Accuracy Score")

    # At k = 14 the train and validation scores meet well.
    # Learning curve for the tuned model.
    tuned = KNeighborsClassifier(n_neighbors=14)
    train_sizes = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    util.plot_learning_curve(estimator=tuned, title='Learning Curve - KNN',
                             X=X, y=y, cv=3, train_sizes=train_sizes,
                             graph_name='knn/knn_wine_tuned_')

    # Final accuracy of a fresh k = 14 model on the held-out test set.
    final_model = KNeighborsClassifier(n_neighbors=14)
    final_model.fit(X, y)
    final_accuracy = accuracy_score(y_test, final_model.predict(X_test))
    print("KNeighborsClassifier - Wine Dataset - Final Accuracy score on the test set: ", final_accuracy)
# NOTE(review): this run of statements depends on names defined outside the
# visible chunk (X_train_scaled, y_train_hot, ...) and is cut off at the end.
iterations = range(1, 1001, 1)

# Run each optimiser; each result is used below as an array with a .shape
# and is plotted against `iterations`.
nn_rhc_fitness = rhc(X_train_scaled, X_test_scaled, y_train_hot, y_test_hot)
nn_sa_fitness = sa(X_train_scaled, X_test_scaled, y_train_hot, y_test_hot)
nn_ga_fitness = ga(X_train_scaled, X_test_scaled, y_train_hot, y_test_hot)
print('nn_rhc_fitness.shape: ', nn_rhc_fitness.shape)
print('nn_sa_fitness.shape: ', nn_sa_fitness.shape)
print('nn_ga_fitness.shape: ', nn_ga_fitness.shape)

# Plot the fitness vs iterations for each algorithm
plt.plot(iterations, nn_rhc_fitness, label="RHC")
plt.plot(iterations, nn_sa_fitness, label="SA")
plt.plot(iterations, nn_ga_fitness, label="GA")
plt.legend(loc="best")
plt.grid()
generate_graph("nn_fitness", "Neural Network - RHC, SA, GA", "Iterations", "Fitness")

# Algorithm comparison (hard-coded results, one entry per algorithm)
algorithms = ['RHC', 'SA', 'GA']
train_accuracy = [0.22077590607452782, 0.21311893823379274, 0.5063808065339459]
test_accuracy = [0.20204081632653062, 0.19795918367346937, 0.5387755102040817]
fit_times = [11.030777, 12.34732, 1274.722984]
x = np.arange(3)
colors = ['coral', 'orange', 'mediumseagreen']

# Train accuracy score
plt.bar(x, height= train_accuracy, color=colors)
plt.xticks(x, algorithms)
generate_graph("nn_train_score", "Neural Network - Train Accuracy Score", "Algorithms", "Accuracy score")

# Test accuracy score
def pulsar_dataset():
    """Compare SVM kernels on the HTRU_2 dataset and evaluate the chosen one.

    Steps, in order:
      1. Load ``datasets/HTRU_2.csv``, drop rows with missing values and
         hold out 20% of the rows as the final test set.
      2. Plot a learning curve for a linear-kernel SVC.
      3. Split the remaining data 80/20 again and compare train/test
         accuracy for the linear and rbf kernels; print and plot the table.
      4. Plot a learning curve for the rbf kernel, then fit on all
         non-held-out data and print the accuracy on the held-out test set.
    """
    random_seed = 7

    df = pd.read_csv('datasets/HTRU_2.csv')
    df = df.dropna()
    print('data size***********', df.shape)

    # Keep a test set aside; X, y are used for tuning with cross-validation.
    # The last column is the label; everything before it is a feature.
    data_X = df.iloc[:, :-1]
    data_y = df.iloc[:, -1]
    X, X_test, y, y_test = train_test_split(
        data_X, data_y, train_size=0.8, random_state=random_seed)

    train_sizes = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    # Learning curve before tuning (linear kernel). The title used to read
    # "Decision Trees", a copy-paste slip that mislabelled this SVM figure.
    svm_model = svm.SVC(kernel='linear', random_state=random_seed)
    util.plot_learning_curve(estimator=svm_model,
                             title='Learning Curve - SVM',
                             X=X, y=y, cv=3, train_sizes=train_sizes,
                             graph_name='svm/svm_htru_linear_')

    # Swap kernels on a further 80/20 split.
    X_train, X_val_test, y_train, y_val_test = train_test_split(
        X, y, train_size=0.8, random_state=random_seed)
    kernels = ['linear', 'rbf']
    train_score = []
    test_score = []
    for kernel in kernels:
        svm_model = svm.SVC(kernel=kernel, random_state=random_seed)
        svm_model.fit(X=X_train, y=y_train)
        train_score.append(
            accuracy_score(y_train, svm_model.predict(X_train)))
        test_score.append(
            accuracy_score(y_val_test, svm_model.predict(X_val_test)))

    df_kernels = pd.DataFrame({
        'SVM kernel': kernels,
        'train score': train_score,
        'test score': test_score
    })
    print('SVM Kernels**************')
    print(df_kernels)

    # Accuracy per kernel.
    plt.plot(kernels, train_score, 'o-', color="r", label="Training score")
    plt.plot(kernels, test_score, 'o-', color="g", label="Test score")
    plt.legend(loc="best")
    util.generate_graph("svm/svm_htru_kernels", "SVM Kernels Vs Accuracy",
                        "SVM Kernels", "Accuracy Score")

    # Author's note: accuracy is more or less the same for both kernels, but
    # the fit time for rbf (0.2s) is lower than for linear (8s), so rbf is
    # chosen.
    svm_model = svm.SVC(kernel='rbf', random_state=random_seed)
    util.plot_learning_curve(estimator=svm_model,
                             title='Learning Curve - SVM',
                             X=X, y=y, cv=3, train_sizes=train_sizes,
                             graph_name='svm/svm_htru_rbf_')

    # Final accuracy of a fresh rbf model on the held-out test set.
    svm_model = svm.SVC(kernel='rbf', random_state=random_seed)
    svm_model.fit(X, y)
    y_predict = svm_model.predict(X_test)
    final_accuracy = accuracy_score(y_test, y_predict)
    print("SVC - HTRU_2 Dataset - Final Accuracy score on the test set: ", final_accuracy)
def pulsar_dataset():
    """Tune and evaluate an AdaBoost classifier on the HTRU_2 dataset.

    Steps, in order:
      1. Load ``datasets/HTRU_2.csv``, drop rows with missing values and
         hold out 20% of the rows as the final test set.
      2. Plot a learning curve for 10 boosted, unrestricted decision trees.
      3. Split the remaining data 80/20 again and compare train/validation
         accuracy for several estimator counts, then for tree depths 1..30
         with 10 estimators; print and plot both tables.
      4. Plot a learning curve for the chosen model (depth 1, 10
         estimators), then fit on all non-held-out data and print the
         accuracy on the held-out test set.
    """
    random_seed = 7

    def make_model(n_estimators, max_depth=None):
        # The base tree is passed positionally because its keyword was
        # renamed from ``base_estimator`` to ``estimator`` in scikit-learn;
        # the first positional slot works under both names. The booster is
        # always seeded so every model in this function is reproducible.
        base_tree = DecisionTreeClassifier(max_depth=max_depth,
                                           random_state=random_seed)
        return AdaBoostClassifier(base_tree, n_estimators=n_estimators,
                                  random_state=random_seed)

    df = pd.read_csv('datasets/HTRU_2.csv')
    df = df.dropna()
    print('data size***********', df.shape)

    # Keep a test set aside; X, y are used for tuning with cross-validation.
    # The last column is the label; everything before it is a feature.
    data_X = df.iloc[:, :-1]
    data_y = df.iloc[:, -1]
    X, X_test, y, y_test = train_test_split(
        data_X, data_y, train_size=0.8, random_state=random_seed)

    train_sizes = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    # Learning curve before tuning.
    util.plot_learning_curve(estimator=make_model(10),
                             title='Learning Curve - Ada Boost Classifier',
                             X=X, y=y, cv=3, train_sizes=train_sizes,
                             graph_name='boost/boost_htru_')

    # Tuning split: training fraction 0.8.
    X_train, X_val_test, y_train, y_val_test = train_test_split(
        X, y, train_size=0.8, random_state=random_seed)

    # Tune the number of estimators.
    no_estimators = [10, 100, 150, 200]
    train_score = []
    test_score = []
    for count in no_estimators:
        boost_model = make_model(count)
        boost_model.fit(X=X_train, y=y_train)
        train_score.append(
            accuracy_score(y_train, boost_model.predict(X_train)))
        test_score.append(
            accuracy_score(y_val_test, boost_model.predict(X_val_test)))

    df_estimators = pd.DataFrame({
        'No Estimators': no_estimators,
        'train score': train_score,
        'validation score': test_score
    })
    print('No Estimators**************')
    print(df_estimators)

    # Accuracy against the number of estimators.
    plt.plot(no_estimators, train_score, 'o-', color="r",
             label="Training score")
    plt.plot(no_estimators, test_score, 'o-', color="g",
             label="Validation score")
    plt.legend(loc="best")
    util.generate_graph("boost/boost_htru_estimators",
                        "No of Estimators Vs Accuracy",
                        "No Estimators", "Accuracy Score")

    # With no_estimators = 10, tune the depth of the base tree.
    max_depths = range(1, 31)
    train_score = []
    test_score = []
    for max_depth in max_depths:
        boost_model = make_model(10, max_depth=max_depth)
        boost_model.fit(X=X_train, y=y_train)
        train_score.append(
            accuracy_score(y_train, boost_model.predict(X_train)))
        test_score.append(
            accuracy_score(y_val_test, boost_model.predict(X_val_test)))

    df_depth = pd.DataFrame({
        'max_depths': max_depths,
        'train score': train_score,
        'validation score': test_score
    })
    print('Max depth**************')
    print(df_depth)

    # Accuracy against the base-tree depth.
    plt.plot(max_depths, train_score, 'o-', color="r",
             label="Training score")
    plt.plot(max_depths, test_score, 'o-', color="g",
             label="Validation score")
    plt.legend(loc="best")
    util.generate_graph("boost/boost_htru_depth", "Max Depth Vs Accuracy",
                        "Max depth", "Accuracy Score")

    # Author's note: at max_depth = 1, test score = 0.976955 and
    # train = 0.978086; deeper trees change little, so the simplest tree is
    # kept to avoid overfitting.
    util.plot_learning_curve(estimator=make_model(10, max_depth=1),
                             title='Learning Curve - Ada Boost Classifier',
                             X=X, y=y, cv=3, train_sizes=train_sizes,
                             graph_name='boost/boost_htru_pruned_')

    # Final accuracy of a fresh tuned model on the held-out test set.
    boost_model = make_model(10, max_depth=1)
    boost_model.fit(X, y)
    y_predict = boost_model.predict(X_test)
    final_accuracy = accuracy_score(y_test, y_predict)
    print(
        "AdaBoostClassifier - HTRU_2 Dataset - Final Accuracy score on the test set: ",
        final_accuracy)
def pulsar_dataset():
    """Tune and evaluate a decision tree on the HTRU_2 dataset.

    Steps, in order:
      1. Load ``datasets/HTRU_2.csv``, drop rows with missing values and
         hold out 20% of the rows as the final test set.
      2. Plot a learning curve for an unrestricted tree.
      3. Split the remaining data 80/20 again and compare train/validation
         accuracy for tree depths 1..32; print and plot the table.
      4. Plot a learning curve for the chosen depth (1), then fit on all
         non-held-out data and print the accuracy on the held-out test set.
    """
    random_seed = 7

    df = pd.read_csv('datasets/HTRU_2.csv')
    df = df.dropna()
    print('data size***********', df.shape)

    # Keep a test set aside; X, y are used for tuning with cross-validation.
    # The last column is the label; everything before it is a feature.
    data_X = df.iloc[:, :-1]
    data_y = df.iloc[:, -1]
    X, X_test, y, y_test = train_test_split(
        data_X, data_y, train_size=0.8, random_state=random_seed)

    train_sizes = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    # Learning curve before pruning.
    DT_model = tree.DecisionTreeClassifier(random_state=random_seed)
    util.plot_learning_curve(estimator=DT_model,
                             title='Learning Curve - Decision Trees',
                             X=X, y=y, cv=3, train_sizes=train_sizes,
                             graph_name='dt/dt_htru_')

    # Pruning: tune max_depth on a further 80/20 split.
    X_train, X_val_test, y_train, y_val_test = train_test_split(
        X, y, train_size=0.8, random_state=random_seed)

    # Depths must be integers: np.linspace produced floats (1.0, 2.0, ...),
    # which newer scikit-learn parameter validation rejects for max_depth.
    max_depths = list(range(1, 33))
    train_score = []
    test_score = []
    for max_depth in max_depths:
        DT_model = tree.DecisionTreeClassifier(max_depth=max_depth,
                                               random_state=random_seed)
        DT_model.fit(X=X_train, y=y_train)
        train_score.append(
            accuracy_score(y_train, DT_model.predict(X_train)))
        test_score.append(
            accuracy_score(y_val_test, DT_model.predict(X_val_test)))

    df_depth = pd.DataFrame({
        'max depth': max_depths,
        'train score': train_score,
        'validation score': test_score
    })
    print('max depth**************')
    print(df_depth)

    # Accuracy against tree depth.
    plt.plot(max_depths, train_score, 'o-', color="r",
             label="Training score")
    plt.plot(max_depths, test_score, 'o-', color="g",
             label="Validation score")
    plt.legend(loc="best")
    util.generate_graph("dt/dt_htru_max_depths",
                        "Decision Tree Depths Vs Accuracy",
                        "Max Tree Depth", "Accuracy Score")

    # Chosen max_depth = 1: learning curve for the pruned tree.
    DT_model = tree.DecisionTreeClassifier(max_depth=1,
                                           random_state=random_seed)
    util.plot_learning_curve(estimator=DT_model,
                             title='Learning Curve - Decision Trees',
                             X=X, y=y, cv=3, train_sizes=train_sizes,
                             graph_name='dt/dt_htru_pruned_')

    # Final accuracy of a fresh max_depth = 1 tree on the held-out test set.
    DT_model = tree.DecisionTreeClassifier(max_depth=1,
                                           random_state=random_seed)
    DT_model.fit(X, y)
    y_predict = DT_model.predict(X_test)
    final_accuracy = accuracy_score(y_test, y_predict)
    print(
        "DecisionTreeClassifier - HTRU_2 Dataset - Final Accuracy score on the test set: ",
        final_accuracy)