def n_neighbors_analysis(self, range_=range(1, 50)):
    """Sweep k for uniform and distance weighting and record accuracies."""
    print("\n######")
    print("Testing different neighbor values")
    metrics = defaultdict(list)
    X_train, X_test, y_train, y_test = prep_data_for_clf(
        self.data, self.target, random_state=self.random)
    # KNN is distance based, so standardize the features first.
    sclr = StandardScaler()
    sclr.fit(X_train.astype('float'))
    X_train_std = sclr.transform(X_train.astype('float'))
    X_test_std = sclr.transform(X_test.astype('float'))
    for n in range_:
        clf = KNeighborsClassifier(n_neighbors=n, n_jobs=-1)
        clf.fit(X_train_std, y_train)
        metrics['train_acc_uniform'].append(clf.score(X_train_std, y_train))
        metrics['test_acc_uniform'].append(clf.score(X_test_std, y_test))
        clf = KNeighborsClassifier(n_neighbors=n, weights="distance",
                                   n_jobs=-1)
        clf.fit(X_train_std, y_train)
        metrics['train_acc_distance'].append(clf.score(X_train_std, y_train))
        metrics['test_acc_distance'].append(clf.score(X_test_std, y_test))
    for m in metrics.values():
        plt.plot(range_, m, 'o-')
    plt.legend(list(metrics), ncol=1, loc='best')
    plt.xlabel('Number of Neighbors')
    plt.ylabel('Accuracy scores (weighted)')
    plt.title('Accuracy scores of Train and Test for {}'.format(
        self.data.index.name))
    plt.show()
    # Pick the weighting scheme whose best score is highest, then the k that
    # produced it (argmax is an index into range_, which starts at 1).
    optimal_weight = np.argmax(np.max(list(metrics.values()), axis=1))
    self.optimal_weight = [i.split('_')[-1] for i in metrics][optimal_weight]
    self.optimal_n = range_[np.argmax(list(metrics.values())[optimal_weight])]
    print("Better weight metric is:", self.optimal_weight)
    print("Updated Learning Curves:")
    clf = KNeighborsClassifier(n_neighbors=self.optimal_n,
                               weights=self.optimal_weight, n_jobs=-1)
    plot_learning_curve(clf, '{} KNN Learning Curve ({})'.format(
        self.data.index.name, self.optimal_weight),
        self.data, self.target, cv=5)
    if self.save:
        pd.DataFrame(metrics, index=range_).to_csv(
            "./results/KNN/{}_KNN_neighbor_analysis.csv".format(
                self.data.index.name))
    return metrics

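# Every analysis in this module relies on the shared helper prep_data_for_clf,
# which is defined elsewhere in the repo. The commented sketch below is only a
# hedged reading of its assumed contract -- split the frame into features and
# target and return a reproducible train/test split; the 75/25 split and the
# stratification are assumptions, not confirmed behavior.
#
# from sklearn.model_selection import train_test_split
#
# def prep_data_for_clf(data, target, random_state=None, test_size=0.25):
#     """Assumed sketch: return X_train, X_test, y_train, y_test."""
#     X = data.drop(columns=[target])
#     y = data[target]
#     return train_test_split(X, y, test_size=test_size,
#                             stratify=y, random_state=random_state)
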
def depth_analysis(self, range_=range(2, 20)):
    print("\n######")
    print("Testing different max tree depths.")
    metrics = defaultdict(list)
    X_train, X_test, y_train, y_test = prep_data_for_clf(
        self.data, self.target, random_state=self.random)
    for d in range_:
        clf = XGBClassifier(max_depth=d, random_state=self.random, n_jobs=-1)
        clf.fit(X_train, y_train)
        preds_train = clf.predict(X_train)
        preds_test = clf.predict(X_test)
        metrics['train_acc'].append(
            accuracy_score(y_true=y_train, y_pred=preds_train))
        metrics['test_acc'].append(
            accuracy_score(y_true=y_test, y_pred=preds_test))
    self.plot_metric(metrics, range_, xlabel='Max Depth')
    if self.save:
        pd.DataFrame(metrics, index=range_).to_csv(
            "./results/XGB/{}_XGB_depth_analysis.csv".format(
                self.data.index.name))
    return metrics

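# self.plot_metric is shared by the sweep methods in this module but defined
# elsewhere in the class. The commented sketch below is a hedged guess at its
# behavior -- one line per recorded series plotted against the swept range --
# and its default arguments are assumptions.
#
# def plot_metric(self, metrics, range_, xlabel='', ylabel='Accuracy Score',
#                 title=None):
#     """Assumed sketch: plot each train/test series against range_."""
#     for name, series in metrics.items():
#         plt.plot(range_, series, 'o-', label=name)
#     plt.legend(loc='best')
#     plt.xlabel(xlabel)
#     plt.ylabel(ylabel)
#     plt.title(title or 'Train and Test Accuracy for {}'.format(
#         self.data.index.name))
#     plt.grid()
#     plt.show()
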
def lr_analysis(self, range_=np.linspace(0.01, 1.0, 23)):
    print("\n######")
    print("Testing different learning rates.")
    metrics = defaultdict(list)
    X_train, X_test, y_train, y_test = prep_data_for_clf(
        self.data, self.target, random_state=self.random)
    for lr in range_:
        clf = XGBClassifier(learning_rate=lr, random_state=self.random,
                            n_jobs=-1)
        clf.fit(X_train, y_train)
        preds_train = clf.predict(X_train)
        preds_test = clf.predict(X_test)
        metrics['train_acc'].append(
            accuracy_score(y_true=y_train, y_pred=preds_train))
        metrics['test_acc'].append(
            accuracy_score(y_true=y_test, y_pred=preds_test))
    self.plot_metric(metrics, range_, xlabel='Learning Rate')
    if self.save:
        pd.DataFrame(metrics, index=range_).to_csv(
            "./results/XGB/{}_XGB_lr_analysis.csv".format(
                self.data.index.name))
    return metrics

def n_estimator_analysis(self,
                         range_=np.linspace(10, 1000, 10).astype(int)):
    print("\n######")
    print("Testing different numbers of trees.")
    metrics = defaultdict(list)
    X_train, X_test, y_train, y_test = prep_data_for_clf(
        self.data, self.target, random_state=self.random)
    for n in range_:
        clf = XGBClassifier(n_estimators=int(n), random_state=self.random,
                            n_jobs=-1)
        clf.fit(X_train, y_train)
        preds_train = clf.predict(X_train)
        preds_test = clf.predict(X_test)
        metrics['train_acc'].append(
            accuracy_score(y_true=y_train, y_pred=preds_train))
        metrics['test_acc'].append(
            accuracy_score(y_true=y_test, y_pred=preds_test))
    self.plot_metric(metrics, range_, xlabel='Number of Trees')
    if self.save:
        pd.DataFrame(metrics, index=range_).to_csv(
            "./results/XGB/{}_XGB_n_estimator.csv".format(
                self.data.index.name))
    return metrics

def activation_analysis(self):
    """Compare activation functions with 10-fold cross-validation."""
    print("\n######")
    print("Testing Different Activation Functions with 10 Fold X-Val")
    X_train, X_test, y_train, y_test = prep_data_for_clf(
        self.data, self.target, random_state=self.random)
    sclr = StandardScaler()
    sclr.fit(X_train.astype('float'))
    X_train_std = sclr.transform(X_train.astype('float'))
    X_test_std = sclr.transform(X_test.astype('float'))
    activations = ['identity', 'logistic', 'tanh', 'relu']
    # n_outputs = self.data[self.target].nunique()
    try:
        hidden = self.best_n_nodes
    except AttributeError:
        # Fall back to the rule-of-thumb node count if no sweep has run yet.
        hidden = self.num_hidden(self.data, 1, 4)
    accuracy = []
    stdev = []
    for activation in activations:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            clf = MLPClassifier(activation=activation,
                                hidden_layer_sizes=(hidden, 2),
                                max_iter=1000,
                                early_stopping=True,
                                n_iter_no_change=20,
                                random_state=self.random)
            scores = cross_val_score(
                clf,
                pd.concat([pd.DataFrame(X_train_std),
                           pd.DataFrame(X_test_std)]),
                pd.concat([y_train, y_test]),
                cv=10, n_jobs=-1)
            accuracy.append(scores.mean())
            stdev.append(scores.std() * 2)
    results = pd.DataFrame(index=activations,
                           data=np.array([accuracy, stdev]).T,
                           columns=['acc', 'std'])
    if self.save:
        results.to_csv(
            "./results/MLP/{}_MLPC_activation_analysis.csv".format(
                self.data.index.name))
    return results

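# self.num_hidden(data, n_outputs, alpha) supplies the hidden node counts used
# in activation_analysis and hidden_layer_analysis. It is assumed to implement
# the common rule of thumb N_hidden = N_samples / (alpha * (N_inputs +
# N_outputs)); the commented sketch below is that assumption, not the repo's
# confirmed formula.
#
# def num_hidden(self, data, n_outputs, alpha):
#     """Assumed sketch: rule-of-thumb hidden node count."""
#     n_samples = len(data)
#     n_inputs = data.shape[1] - 1  # feature columns, excluding the target
#     return max(2, int(n_samples / (alpha * (n_inputs + n_outputs))))
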
def general_analysis(self):
    print("\n######")
    print("eXtreme Gradient Boosted Decision Tree Classifier:")
    print("Default baseline values")
    clf = XGBClassifier(n_jobs=-1)
    plot_learning_curve(clf, '{} XGB Learning Curve'.format(
        self.data.index.name), self.data, self.target, cv=5)
    print("~~~~~~")
    print("Execution time metrics")
    X_train, X_test, y_train, y_test = prep_data_for_clf(
        self.data, self.target, random_state=self.random)
    training_time, testing_time = measure_execution_time(
        clf, self.data.drop(columns=[self.target], axis=1),
        self.data[self.target], iterations=5)
    print("Training time input dim of {} : {:.4f} (+/- {:.4f})".format(
        X_train.shape, np.mean(training_time), np.std(training_time)))
    print("Testing time input dim of {}: {:.4f} (+/- {:.4f})".format(
        X_test.shape, np.mean(testing_time), np.std(testing_time)))
    print("\n~~~~~~")
    scores = cross_val_score(clf, pd.concat([X_train, X_test]),
                             pd.concat([y_train, y_test]),
                             cv=10, n_jobs=-1)
    print("10 Fold Cross Validation Accuracy: {:.4f} (+/- {:.4f})".format(
        scores.mean(), scores.std() * 2))
    clf.fit(X_train, y_train)
    preds_train = clf.predict(X_train)
    preds_test = clf.predict(X_test)
    print("Training Accuracy:",
          accuracy_score(y_true=y_train, y_pred=preds_train))
    print("Training F1:",
          f1_score(y_true=y_train, y_pred=preds_train, average='weighted'))
    print("Testing Accuracy:",
          accuracy_score(y_true=y_test, y_pred=preds_test))
    print("Testing F1:",
          f1_score(y_true=y_test, y_pred=preds_test, average='weighted'))
    print('~~~~~~\n')

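# measure_execution_time is another shared helper defined elsewhere in the
# repo. The commented sketch below shows the assumed contract -- time
# fit/predict over several iterations and return the per-run wall-clock
# timings -- so the printed means and standard deviations have a concrete
# reading; the signature details are assumptions.
#
# import time
#
# def measure_execution_time(clf, X, y, iterations=10):
#     """Assumed sketch: lists of fit and predict times in seconds."""
#     train_times, test_times = [], []
#     for _ in range(iterations):
#         start = time.time()
#         clf.fit(X, y)
#         train_times.append(time.time() - start)
#         start = time.time()
#         clf.predict(X)
#         test_times.append(time.time() - start)
#     return train_times, test_times
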
def max_node_analysis(self, range_=range(2, 200, 10)):
    print("\n######")
    print("Testing different maximum leaf nodes")
    metrics = defaultdict(list)
    X_train, X_test, y_train, y_test = prep_data_for_clf(
        self.data, self.target, random_state=self.random_state)
    for m in range_:
        clf = DecisionTreeClassifier(max_leaf_nodes=m,
                                     random_state=self.random_state)
        clf.fit(X_train, y_train)
        preds_train = clf.predict(X_train)
        preds_test = clf.predict(X_test)
        metrics['train_gini'].append(
            accuracy_score(y_true=y_train, y_pred=preds_train))
        metrics['test_gini'].append(
            accuracy_score(y_true=y_test, y_pred=preds_test))
        clf = DecisionTreeClassifier(criterion='entropy',
                                     max_leaf_nodes=m,
                                     random_state=self.random_state)
        clf.fit(X_train, y_train)
        preds_train = clf.predict(X_train)
        preds_test = clf.predict(X_test)
        metrics['train_entropy'].append(
            accuracy_score(y_true=y_train, y_pred=preds_train))
        metrics['test_entropy'].append(
            accuracy_score(y_true=y_test, y_pred=preds_test))
    self.plot_metric(
        metrics, range_, xlabel='Max Leaf Nodes',
        title='Accuracy vs Maximum Leaf Nodes for the {}'.format(
            self.data.index.name))
    if self.save:
        # File name reflects the swept parameter (max leaf nodes).
        pd.DataFrame(metrics, index=range_).to_csv(
            "./results/DT/{}_DT_max_node_analysis.csv".format(
                self.data.index.name))
    return metrics

def max_iteration_analysis(self, range_=range(100, 1100, 100)):
    print("\n######")
    print("Testing different max iterations.")
    metrics = defaultdict(list)
    X_train, X_test, y_train, y_test = prep_data_for_clf(
        self.data, self.target, random_state=self.random)
    sclr = StandardScaler()
    sclr.fit(X_train.astype('float'))
    X_train_std = sclr.transform(X_train.astype('float'))
    X_test_std = sclr.transform(X_test.astype('float'))
    for r in range_:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            clf = MLPClassifier(random_state=self.random, max_iter=r)
            # Fit on the standardized features, matching the data used for
            # prediction below.
            clf.fit(X_train_std, y_train)
            preds_train = clf.predict(X_train_std)
            preds_test = clf.predict(X_test_std)
            metrics['train_acc'].append(
                accuracy_score(y_true=y_train, y_pred=preds_train))
            metrics['test_acc'].append(
                accuracy_score(y_true=y_test, y_pred=preds_test))
    results = pd.DataFrame(metrics, index=range_)
    results.index.name = "max_iter"
    plt.gcf().set_size_inches(8, 5)
    for col in metrics:
        plt.plot(range_, metrics[col], 'o-')
    plt.legend(['Training', 'Testing'], ncol=1, loc=4)
    plt.xlabel('Max Number of Iterations')
    plt.ylabel('Accuracy Score (weighted)')
    plt.xticks(range_, rotation=45)
    plt.title('MLPC Train and Test Accuracy for {}'.format(
        self.data.index.name))
    plt.grid()
    plt.show()
    self.best_iter = results['test_acc'].idxmax()
    if self.save:
        results.to_csv("./results/MLP/{}_max_iter_analysis.csv".format(
            self.data.index.name))
    return results

def min_samples_analysis(self, range_=range(1, 31)):
    print("\n######")
    print("Testing different leaf sample size thresholds")
    metrics = defaultdict(list)  # keep track of test/train accuracies
    X_train, X_test, y_train, y_test = prep_data_for_clf(
        self.data, self.target, random_state=self.random_state)
    for m in range_:
        clf = DecisionTreeClassifier(min_samples_leaf=m,
                                     random_state=self.random_state)
        clf.fit(X_train, y_train)
        preds_train = clf.predict(X_train)
        preds_test = clf.predict(X_test)
        metrics['train_gini'].append(
            accuracy_score(y_true=y_train, y_pred=preds_train))
        metrics['test_gini'].append(
            accuracy_score(y_true=y_test, y_pred=preds_test))
        clf = DecisionTreeClassifier(criterion='entropy',
                                     min_samples_leaf=m,
                                     random_state=self.random_state)
        clf.fit(X_train, y_train)
        preds_train = clf.predict(X_train)
        preds_test = clf.predict(X_test)
        metrics['train_entropy'].append(
            accuracy_score(y_true=y_train, y_pred=preds_train))
        metrics['test_entropy'].append(
            accuracy_score(y_true=y_test, y_pred=preds_test))
    self.plot_metric(
        metrics, range_, xlabel='Min Samples per Leaf',
        title='Accuracy vs Minimum Samples per Leaf for the {}'.format(
            self.data.index.name))
    if self.save:
        pd.DataFrame(metrics, index=range_).to_csv(
            "./results/DT/{}_DT_min_sample_analysis.csv".format(
                self.data.index.name))
    return metrics

def hidden_layer_analysis(self, range_=range(2, 11)):
    print("\n######")
    print("Testing Different Node Numbers via Alpha Parameter")
    metrics = defaultdict(list)
    X_train, X_test, y_train, y_test = prep_data_for_clf(
        self.data, self.target, random_state=self.random)
    sclr = StandardScaler()
    sclr.fit(X_train.astype('float'))
    X_train_std = sclr.transform(X_train.astype('float'))
    X_test_std = sclr.transform(X_test.astype('float'))
    num_hidden_layers = []
    for alpha in range_:
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            hidden = self.num_hidden(self.data, 1, alpha)
            num_hidden_layers.append(hidden)
            clf = MLPClassifier(hidden_layer_sizes=(hidden, 2),
                                max_iter=1000,
                                early_stopping=True,
                                n_iter_no_change=20,
                                random_state=self.random)
            clf.fit(X_train_std, y_train)
            preds_train = clf.predict(X_train_std)
            preds_test = clf.predict(X_test_std)
            metrics['train_acc'].append(
                accuracy_score(y_true=y_train, y_pred=preds_train))
            metrics['test_acc'].append(
                accuracy_score(y_true=y_test, y_pred=preds_test))
    results = pd.DataFrame(metrics, index=num_hidden_layers)
    results['alpha'] = range_
    results.index.name = "n_hidden"
    self.plot_hl(num_hidden_layers, stats=results,
                 plt_title='MLPC Train and Test Accuracy for {}'.format(
                     self.data.index.name))
    self.best_n_nodes = results['test_acc'].idxmax()
    if self.save:
        results.to_csv(
            "./results/MLPC/{}_hidden_layer_analysis.csv".format(
                self.data.index.name))
    return results

def general_analysis(self):
    print("\n######")
    print("KNN Classifier:")
    print("Default Baseline values (5 neighbors)")
    clf = KNeighborsClassifier(n_jobs=-1)
    plot_learning_curve(clf, '{} KNN Learning Curve (uniform)'.format(
        self.data.index.name), self.data, self.target, cv=5, scale=True)
    clf = KNeighborsClassifier(weights='distance', n_jobs=-1)
    plot_learning_curve(clf, '{} KNN Learning Curve (distance)'.format(
        self.data.index.name), self.data, self.target, cv=5, scale=True)
    print("\n~~~~~~")
    print("Execution time metrics")
    X_train, X_test, y_train, y_test = prep_data_for_clf(
        self.data, self.target, random_state=self.random)
    sclr = StandardScaler()
    sclr.fit(X_train.astype('float'))
    X_train_std = sclr.transform(X_train.astype('float'))
    X_test_std = sclr.transform(X_test.astype('float'))
    training_time, testing_time = measure_execution_time(
        clf,
        pd.concat([pd.DataFrame(X_train_std), pd.DataFrame(X_test_std)]),
        pd.concat([y_train, y_test]))
    print("Training time input dim of {} : {:.4f} (+/- {:.4f})".format(
        X_train.shape, np.mean(training_time), np.std(training_time)))
    print("Testing time input dim of {}: {:.4f} (+/- {:.4f})".format(
        X_test.shape, np.mean(testing_time), np.std(testing_time)))
    for w in ['uniform', 'distance']:
        print("\n~~~~~~")
        print('{} weights:'.format(w.capitalize()))
        clf = KNeighborsClassifier(weights=w, n_jobs=-1)
        scores = cross_val_score(
            clf,
            pd.concat([pd.DataFrame(X_train_std),
                       pd.DataFrame(X_test_std)]),
            pd.concat([y_train, y_test]),
            cv=10, n_jobs=-1)
        print("10 Fold Cross Validation Accuracy: {:.4f} (+/- {:.4f})".format(
            scores.mean(), scores.std() * 2))
        clf.fit(X_train_std, y_train)
        preds_train = clf.predict(X_train_std)
        preds_test = clf.predict(X_test_std)
        print("Training Accuracy:",
              accuracy_score(y_true=y_train, y_pred=preds_train))
        print("Training F1:",
              f1_score(y_true=y_train, y_pred=preds_train,
                       average='weighted'))
        print("Testing Accuracy:",
              accuracy_score(y_true=y_test, y_pred=preds_test))
        print("Testing F1:",
              f1_score(y_true=y_test, y_pred=preds_test,
                       average='weighted'))
    print("~~~~~~\n")

def general_analysis(self):
    print("\n######")
    print("Decision Tree Classifier:")
    print("Default Baseline values (no max depth or max leaf nodes)\n")
    clf = DecisionTreeClassifier(random_state=self.random_state)
    plot_learning_curve(clf, '{} Decision Tree Learning Curve'.format(
        self.data.index.name), self.data, self.target, cv=5)
    print("\n~~~~~~")
    print("Execution time metrics")
    X_train, X_test, y_train, y_test = prep_data_for_clf(
        self.data, self.target, random_state=self.random_state)
    training_time, testing_time = measure_execution_time(
        clf, self.data.drop(columns=[self.target], axis=1),
        self.data[self.target])
    print("Training time input dim of {} : {:.4f} (+/- {:.4f})".format(
        X_train.shape, np.mean(training_time), np.std(training_time)))
    print("Testing time input dim of {}: {:.4f} (+/- {:.4f})".format(
        X_test.shape, np.mean(testing_time), np.std(testing_time)))
    print("\n~~~~~~")
    print('Split on Gini Importance:')
    scores = cross_val_score(clf, pd.concat([X_train, X_test]),
                             pd.concat([y_train, y_test]),
                             cv=10, n_jobs=-1)
    print("10 Fold Cross Validation Accuracy: {:.4f} (+/- {:.4f})".format(
        scores.mean(), scores.std() * 2))
    clf.fit(X_train, y_train)
    preds_train = clf.predict(X_train)
    preds_test = clf.predict(X_test)
    print("Training Accuracy:",
          accuracy_score(y_true=y_train, y_pred=preds_train))
    print("Training F1:",
          f1_score(y_true=y_train, y_pred=preds_train, average='weighted'))
    print("Testing Accuracy:",
          accuracy_score(y_true=y_test, y_pred=preds_test))
    print("Testing F1:",
          f1_score(y_true=y_test, y_pred=preds_test, average='weighted'))
    print('\n~~~~~~')
    print('Split on Entropy Gain:')
    # Use the shared random_state rather than a stray hard-coded seed.
    clf = DecisionTreeClassifier(criterion='entropy',
                                 random_state=self.random_state)
    scores = cross_val_score(clf, pd.concat([X_train, X_test]),
                             pd.concat([y_train, y_test]),
                             cv=10, n_jobs=-1)
    print("10 Fold Cross Validation Accuracy: {:.4f} (+/- {:.4f})".format(
        scores.mean(), scores.std() * 2))
    clf.fit(X_train, y_train)
    preds_train = clf.predict(X_train)
    preds_test = clf.predict(X_test)
    print("Training Accuracy:",
          accuracy_score(y_true=y_train, y_pred=preds_train))
    print("Training F1:",
          f1_score(y_true=y_train, y_pred=preds_train, average='weighted'))
    print("Testing Accuracy:",
          accuracy_score(y_true=y_test, y_pred=preds_test))
    print("Testing F1:",
          f1_score(y_true=y_test, y_pred=preds_test, average='weighted'))
    print("~~~~~~\n")

def general_analysis(self):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        print("\n######")
        print("Multilayer Perceptron Classifier:")
        print('Default Baseline values\n')
        clf = MLPClassifier(random_state=self.random, max_iter=1000)
        plot_learning_curve(clf, '{} MLP Learning Curve'.format(
            self.data.index.name), self.data, self.target, cv=5, scale=True)
        print("\n~~~~~~")
        print("Execution time metrics")
        X_train, X_test, y_train, y_test = prep_data_for_clf(
            self.data, self.target, random_state=self.random)
        sclr = StandardScaler()
        sclr.fit(X_train.astype('float'))
        X_train_std = sclr.transform(X_train.astype('float'))
        X_test_std = sclr.transform(X_test.astype('float'))
        training_time, testing_time = measure_execution_time(
            clf,
            pd.concat([pd.DataFrame(X_train_std),
                       pd.DataFrame(X_test_std)]),
            pd.concat([y_train, y_test]))
        print("Training time input dim of {} : {:.4f} (+/- {:.4f})".format(
            X_train.shape, np.mean(training_time), np.std(training_time)))
        print("Testing time input dim of {}: {:.4f} (+/- {:.4f})".format(
            X_test.shape, np.mean(testing_time), np.std(testing_time)))
        print("\n~~~~~~")
        scores = cross_val_score(
            clf,
            pd.concat([pd.DataFrame(X_train_std),
                       pd.DataFrame(X_test_std)]),
            pd.concat([y_train, y_test]),
            cv=10, n_jobs=-1)
        print("10 Fold Cross Validation Accuracy: {:.4f} (+/- {:.4f})".format(
            scores.mean(), scores.std() * 2))
        clf.fit(X_train_std, y_train)
        preds_train = clf.predict(X_train_std)
        preds_test = clf.predict(X_test_std)
        print("Training Accuracy:",
              accuracy_score(y_true=y_train, y_pred=preds_train))
        print("Training F1:",
              f1_score(y_true=y_train, y_pred=preds_train,
                       average='weighted'))
        print("Testing Accuracy:",
              accuracy_score(y_true=y_test, y_pred=preds_test))
        print("Testing F1:",
              f1_score(y_true=y_test, y_pred=preds_test,
                       average='weighted'))
        print('~~~~~~\n')

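# A hedged end-to-end usage sketch. The class names (KNNAnalysis, XGBAnalysis)
# and constructor arguments below are hypothetical stand-ins for whichever
# classes in this repo own the methods above; only the method names and the
# data/target/save pattern come from the code itself.
#
# if __name__ == "__main__":
#     df = pd.read_csv("./data/some_dataset.csv")  # hypothetical path
#     df.index.name = "Example Dataset"
#     target = "label"                             # hypothetical target column
#
#     knn = KNNAnalysis(df, target, save=True)     # hypothetical class
#     knn.general_analysis()
#     knn.n_neighbors_analysis()
#
#     xgb = XGBAnalysis(df, target, save=True)     # hypothetical class
#     xgb.general_analysis()
#     xgb.depth_analysis()
#     xgb.lr_analysis()
#     xgb.n_estimator_analysis()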