コード例 #1
0
    def n_neighbors_analysis(self, range_=range(1, 50)):
        """Sweep the neighbor count for KNN under both weighting schemes.

        Trains a KNeighborsClassifier for every ``n`` in *range_* with
        uniform and distance weights on standardized features, plots the
        four train/test accuracy curves, stores the best weighting in
        ``self.optimal_weight`` and the best neighbor count in
        ``self.optimal_n``, then plots a learning curve for that
        configuration.

        Returns:
            defaultdict(list): accuracy series keyed
            ``{train,test}_acc_{uniform,distance}``.
        """
        print("\n######")
        print("Testing different neighbor values")
        metrics = defaultdict(list)
        X_train, X_test, y_train, y_test = prep_data_for_clf(
            self.data, self.target, random_state=self.random)

        # KNN is distance based, so standardize features first.
        sclr = StandardScaler()
        sclr.fit(X_train.astype('float'))
        X_train_std = sclr.transform(X_train.astype('float'))
        X_test_std = sclr.transform(X_test.astype('float'))

        for n in range_:
            # Key insertion order (uniform first, then distance) matches
            # the original legend ordering.
            for weights in ('uniform', 'distance'):
                clf = KNeighborsClassifier(n_neighbors=n,
                                           weights=weights,
                                           n_jobs=-1)
                clf.fit(X_train_std, y_train)
                metrics['train_acc_' + weights].append(
                    clf.score(X_train_std, y_train))
                metrics['test_acc_' + weights].append(
                    clf.score(X_test_std, y_test))

        for m in metrics.values():
            plt.plot(range_, m, 'o-')

        plt.legend([i for i in metrics], ncol=1, loc='best')
        plt.xlabel('Number of Neighbors')
        plt.ylabel('Accuracy scores (weighted)')
        plt.title('Accuracy scores of Train and Test for {}'.format(
            self.data.index.name))
        plt.show()

        # BUG FIX: choose the winner from *test* accuracy only.  The
        # original argmax ran over training series as well, and distance
        # weighting scores ~1.0 on its own training set, which biased
        # the selection toward 'distance' regardless of test accuracy.
        test_keys = [k for k in metrics if k.startswith('test_')]
        best_key = max(test_keys, key=lambda k: max(metrics[k]))
        self.optimal_weight = best_key.split('_')[-1]
        # BUG FIX: np.argmax yields a positional index; map it back to
        # the actual neighbor count (range_ starts at 1, not 0).
        self.optimal_n = list(range_)[int(np.argmax(metrics[best_key]))]

        print("Better weight metric is:", self.optimal_weight)
        print("Updated Learning Curves:")
        clf = KNeighborsClassifier(n_neighbors=self.optimal_n,
                                   weights=self.optimal_weight,
                                   n_jobs=-1)
        # BUG FIX: title previously hard-coded "(distance)" even when
        # uniform weighting won; report the weighting actually used.
        plot_learning_curve(clf,
                            '{} KNN Learning Curve ({})'.format(
                                self.data.index.name, self.optimal_weight),
                            self.data,
                            self.target,
                            cv=5)

        if self.save:
            pd.DataFrame(metrics, index=range_).to_csv(
                "./results/KNN/{}_KNN_neighbor_analysis.csv".format(
                    self.data.index.name))

        return metrics
コード例 #2
0
ファイル: xgb_analysis.py プロジェクト: miketong08/CS7641
    def depth_analysis(self, range_=range(2, 20)):
        """Evaluate XGB train/test accuracy across maximum tree depths.

        Fits one XGBClassifier per depth in *range_*, scores it on both
        splits, plots the curves, and optionally saves the table.

        Returns:
            defaultdict(list): ``train_acc`` and ``test_acc`` series.
        """
        print("\n######")
        print("Testing different max tree depths.")
        metrics = defaultdict(list)
        X_train, X_test, y_train, y_test = prep_data_for_clf(
            self.data, self.target, random_state=self.random)

        for depth in range_:
            model = XGBClassifier(max_depth=depth,
                                  random_state=self.random,
                                  n_jobs=-1)
            model.fit(X_train, y_train)

            metrics['train_acc'].append(
                accuracy_score(y_true=y_train, y_pred=model.predict(X_train)))
            metrics['test_acc'].append(
                accuracy_score(y_true=y_test, y_pred=model.predict(X_test)))

        self.plot_metric(metrics, range_, xlabel='Max Depth')

        if self.save:
            csv_path = "./results/XGB/{}_XGB_depth_analysis.csv".format(
                self.data.index.name)
            pd.DataFrame(metrics, index=range_).to_csv(csv_path)

        return metrics
コード例 #3
0
ファイル: xgb_analysis.py プロジェクト: miketong08/CS7641
    def lr_analysis(self, range_=np.linspace(0.01, 1.0, 23)):
        """Evaluate XGB train/test accuracy across learning rates.

        Fits one XGBClassifier per learning rate in *range_*, records
        accuracy on the train and test splits, plots the curves, and
        optionally saves the table.

        Returns:
            defaultdict(list): ``train_acc`` and ``test_acc`` series.
        """
        print("\n######")
        print("Testing different learning rates.")
        metrics = defaultdict(list)
        X_train, X_test, y_train, y_test = prep_data_for_clf(
            self.data, self.target, random_state=self.random)

        for eta in range_:
            booster = XGBClassifier(learning_rate=eta,
                                    random_state=self.random,
                                    n_jobs=-1)
            booster.fit(X_train, y_train)

            train_preds = booster.predict(X_train)
            test_preds = booster.predict(X_test)
            metrics['train_acc'].append(
                accuracy_score(y_true=y_train, y_pred=train_preds))
            metrics['test_acc'].append(
                accuracy_score(y_true=y_test, y_pred=test_preds))

        self.plot_metric(metrics, range_, xlabel='Learning Rate')

        if self.save:
            csv_path = "./results/XGB/{}_XGB_lr_analysis.csv".format(
                self.data.index.name)
            pd.DataFrame(metrics, index=range_).to_csv(csv_path)

        return metrics
コード例 #4
0
ファイル: xgb_analysis.py プロジェクト: miketong08/CS7641
    def n_estimator_analysis(self,
                             range_=np.linspace(10, 1000, 10).astype(int)):
        """Evaluate XGB train/test accuracy across ensemble sizes.

        Fits one XGBClassifier per tree count in *range_* and records
        train/test accuracy, plotting and optionally saving the results.

        Returns:
            defaultdict(list): ``train_acc`` and ``test_acc`` series.
        """
        print("\n######")
        print("Testing different amounts of trees.")
        metrics = defaultdict(list)
        X_train, X_test, y_train, y_test = prep_data_for_clf(
            self.data, self.target, random_state=self.random)

        for n_trees in range_:
            # int() cast kept from the original; range_ holds numpy ints.
            model = XGBClassifier(n_estimators=int(n_trees),
                                  random_state=self.random,
                                  n_jobs=-1)
            model.fit(X_train, y_train)

            metrics['train_acc'].append(
                accuracy_score(y_true=y_train, y_pred=model.predict(X_train)))
            metrics['test_acc'].append(
                accuracy_score(y_true=y_test, y_pred=model.predict(X_test)))

        self.plot_metric(metrics, range_, xlabel='Number of Trees')

        if self.save:
            out_file = "./results/XGB/{}_XGB_n_estimator.csv".format(
                self.data.index.name)
            pd.DataFrame(metrics, index=range_).to_csv(out_file)

        return metrics
コード例 #5
0
ファイル: mlpc_analysis.py プロジェクト: miketong08/CS7641
    def activation_analysis(self):
        """Compare MLP activation functions via 10-fold cross-validation.

        Standardizes the features, then cross-validates an MLPClassifier
        (two hidden layers of previously determined width) once per
        activation in {identity, logistic, tanh, relu}.

        Returns:
            pd.DataFrame: mean accuracy ('acc') and 2*std ('std') per
            activation, indexed by activation name.
        """
        print("\n######")
        print("Testing Different Activation Functions with 10 Fold X-Val")
        X_train, X_test, y_train, y_test = prep_data_for_clf(
            self.data, self.target, random_state=self.random)

        sclr = StandardScaler()
        sclr.fit(X_train.astype('float'))
        X_train_std = sclr.transform(X_train.astype('float'))
        X_test_std = sclr.transform(X_test.astype('float'))

        activations = ['identity', 'logistic', 'tanh', 'relu']
        # BUG FIX: the original used a bare ``except:`` which swallowed
        # every exception (including KeyboardInterrupt/SystemExit).  Only
        # the absence of ``best_n_nodes`` (hidden_layer_analysis not run
        # yet) is the expected case, so handle exactly that.
        hidden = getattr(self, 'best_n_nodes', None)
        if hidden is None:
            hidden = self.num_hidden(self.data, 1, 4)

        accuracy = []
        stdev = []

        for activation in activations:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                clf = MLPClassifier(activation=activation,
                                    hidden_layer_sizes=(hidden, 2),
                                    max_iter=1000,
                                    early_stopping=True,
                                    n_iter_no_change=20,
                                    random_state=self.random)

                # Cross-validate on the full (standardized) data set.
                scores = cross_val_score(clf,
                                         pd.concat([
                                             pd.DataFrame(X_train_std),
                                             pd.DataFrame(X_test_std)
                                         ]),
                                         pd.concat([y_train, y_test]),
                                         cv=10,
                                         n_jobs=-1)

                accuracy.append(scores.mean())
                stdev.append(scores.std() * 2)  # 2-sigma half-width

        results = pd.DataFrame(index=activations,
                               data=np.array([accuracy, stdev]).T,
                               columns=['acc', 'std'])

        if self.save:
            results.to_csv(
                "./results/MLP/{}_MLPC_activation_analysis.csv".format(
                    self.data.index.name))

        return results
コード例 #6
0
ファイル: xgb_analysis.py プロジェクト: miketong08/CS7641
    def general_analysis(self):
        """Baseline XGBoost run: learning curve, timing, CV, and scores.

        Plots a learning curve for a default-parameter XGBClassifier,
        then reports execution times, 10-fold cross-validation accuracy,
        and train/test accuracy plus weighted F1.
        """
        print("\n######")
        print("eXtreme Gradient Boosted Decision Tree Classifier:")
        print("Default baseline values")

        clf = XGBClassifier(n_jobs=-1)
        curve_title = '{} XGB Learning Curve'.format(self.data.index.name)
        plot_learning_curve(clf, curve_title, self.data, self.target, cv=5)

        print("~~~~~~")
        print("Execution time metrics")
        X_train, X_test, y_train, y_test = prep_data_for_clf(
            self.data, self.target, random_state=self.random)

        features = self.data.drop(columns=[self.target], axis=1)
        labels = self.data[self.target]
        training_time, testing_time = measure_execution_time(
            clf, features, labels, iterations=5)
        print("Training time input dim of {} : {:.4f} (+/- {:.4f})".format(
            X_train.shape, np.mean(training_time), np.std(training_time)))
        print("Testing time input dim of {}: {:.4f} (+/- {:.4f})".format(
            X_test.shape, np.mean(testing_time), np.std(testing_time)))

        print("\n~~~~~~")
        scores = cross_val_score(clf,
                                 pd.concat([X_train, X_test]),
                                 pd.concat([y_train, y_test]),
                                 cv=10,
                                 n_jobs=-1)
        print("10 Fold Cross Validation Accuracy: {:.4f} (+/- {:.4f})".format(
            scores.mean(),
            scores.std() * 2))

        clf.fit(X_train, y_train)
        train_preds = clf.predict(X_train)
        test_preds = clf.predict(X_test)

        print("Training Accuracy:",
              accuracy_score(y_true=y_train, y_pred=train_preds))
        print("Training F1:",
              f1_score(y_true=y_train, y_pred=train_preds,
                       average='weighted'))
        print("Testing Accuracy:",
              accuracy_score(y_true=y_test, y_pred=test_preds))
        print("Testing F1:",
              f1_score(y_true=y_test, y_pred=test_preds, average='weighted'))
        print('~~~~~~\n')
コード例 #7
0
ファイル: dt_analysis.py プロジェクト: miketong08/CS7641
    def max_node_analysis(self, range_=range(2, 200, 10)):
        """Sweep max_leaf_nodes for gini- and entropy-split decision trees.

        For each leaf-node cap in *range_*, fits one DecisionTreeClassifier
        per split criterion, records train/test accuracy, and plots the
        four curves.

        Returns:
            defaultdict(list): series keyed ``{train,test}_{gini,entropy}``.

        BUG FIX: the original body indented statements with a mix of
        spaces and tabs, which raises TabError under Python 3; the whole
        method now uses spaces only.
        """
        print("\n######")
        print("Testing different maximum leaf nodes")
        metrics = defaultdict(list)
        X_train, X_test, y_train, y_test = prep_data_for_clf(
            self.data,
            self.target,
            random_state=self.random_state)

        for m in range_:
            # Key insertion order (gini first, then entropy) matches the
            # original plotting/legend order.
            for criterion in ('gini', 'entropy'):
                clf = DecisionTreeClassifier(criterion=criterion,
                                             max_leaf_nodes=m,
                                             random_state=self.random_state)
                clf.fit(X_train, y_train)
                metrics['train_' + criterion].append(
                    accuracy_score(y_true=y_train,
                                   y_pred=clf.predict(X_train)))
                metrics['test_' + criterion].append(
                    accuracy_score(y_true=y_test,
                                   y_pred=clf.predict(X_test)))

        self.plot_metric(metrics, range_,
                         xlabel='Max Leaf Node',
                         title='Accuracy vs Maximum Leaf Nodes for the {}'.format(
                             self.data.index.name))

        if self.save:
            # NOTE(review): filename says "max_depth" although this sweeps
            # max_leaf_nodes — kept byte-identical for backward
            # compatibility with existing result files.
            pd.DataFrame(metrics, index=range_).to_csv(
                "./results/DT/{}_DT_max_depth_analysis.csv".format(
                    self.data.index.name))

        return metrics
コード例 #8
0
ファイル: mlpc_analysis.py プロジェクト: miketong08/CS7641
    def max_iteration_analysis(self, range_=range(100, 1100, 100)):
        """Sweep MLP max_iter, plot train/test accuracy, store the best cap.

        Fits an MLPClassifier per iteration cap on standardized features,
        records accuracies, plots the curves, sets ``self.best_iter`` to
        the cap with the highest test accuracy, and optionally saves.

        Returns:
            pd.DataFrame: accuracies indexed by ``max_iter``.
        """
        print("\n######")
        print("Testing different max iterations.")
        metrics = defaultdict(list)
        X_train, X_test, y_train, y_test = prep_data_for_clf(
            self.data, self.target, random_state=self.random)

        sclr = StandardScaler()
        sclr.fit(X_train.astype('float'))
        X_train_std = sclr.transform(X_train.astype('float'))
        X_test_std = sclr.transform(X_test.astype('float'))

        for r in range_:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                clf = MLPClassifier(random_state=self.random, max_iter=r)
                # BUG FIX: the original fit on the *raw* X_train but then
                # predicted on the standardized arrays; the model must be
                # trained and queried on the same scaling.
                clf.fit(X_train_std, y_train)

                preds_train = clf.predict(X_train_std)
                preds_test = clf.predict(X_test_std)

                metrics['train_acc'].append(
                    accuracy_score(y_true=y_train, y_pred=preds_train))
                metrics['test_acc'].append(
                    accuracy_score(y_true=y_test, y_pred=preds_test))

        results = pd.DataFrame(metrics, index=range_)
        results.index.name = "max_iter"

        plt.gcf().set_size_inches(8, 5)
        for col in metrics:
            plt.plot(range_, metrics[col], 'o-')
        plt.legend(['Training', 'Testing'], ncol=1, loc=4)

        plt.xlabel('Max Number of Iterations')
        plt.ylabel('Accuracy Score (weighted)')
        plt.xticks(range_, rotation=45)
        plt.title('MLPC Train and Test Accuracy for {}'.format(
            self.data.index.name))

        plt.grid()
        plt.show()

        # Iteration cap with the highest held-out accuracy.
        self.best_iter = results['test_acc'].idxmax()

        if self.save:
            results.to_csv("./results/MLP/{}_max_iter_analysis.csv".format(
                self.data.index.name))

        return results
コード例 #9
0
ファイル: dt_analysis.py プロジェクト: miketong08/CS7641
    def min_samples_analysis(self, range_=range(1, 31)):
        """Sweep min_samples_leaf for gini- and entropy-split trees.

        For each leaf-size threshold in *range_*, fits one
        DecisionTreeClassifier per split criterion and records train/test
        accuracy, then plots the four curves and optionally saves them.

        Returns:
            defaultdict(list): series keyed ``{train,test}_{gini,entropy}``.
        """
        print("\n######")
        print("Testing different leaf sample size thresholds")
        metrics = defaultdict(list)  # keep track of test/train accuracies
        X_train, X_test, y_train, y_test = prep_data_for_clf(
            self.data,
            self.target,
            random_state=self.random_state)

        for threshold in range_:
            # gini first, then entropy — preserves the original key order.
            for criterion in ('gini', 'entropy'):
                tree = DecisionTreeClassifier(
                    criterion=criterion,
                    min_samples_leaf=threshold,
                    random_state=self.random_state)
                tree.fit(X_train, y_train)

                metrics['train_' + criterion].append(
                    accuracy_score(y_true=y_train,
                                   y_pred=tree.predict(X_train)))
                metrics['test_' + criterion].append(
                    accuracy_score(y_true=y_test,
                                   y_pred=tree.predict(X_test)))

        self.plot_metric(metrics, range_,
                         xlabel='Min Samples per Leaf',
                         title='Accuracy vs Minimum Samples per Leaf for the {}'.format(
                             self.data.index.name))

        if self.save:
            pd.DataFrame(metrics, index=range_).to_csv(
                "./results/DT/{}_DT_min_sample_analysis.csv".format(
                    self.data.index.name))

        return metrics
コード例 #10
0
ファイル: mlpc_analysis.py プロジェクト: miketong08/CS7641
    def hidden_layer_analysis(self, range_=range(2, 11)):
        """Sweep hidden-layer width (via the alpha heuristic) for the MLP.

        For each alpha in *range_*, derives a hidden-node count with
        ``self.num_hidden``, trains an MLPClassifier with two hidden
        layers of that width on standardized features, and records
        train/test accuracy.  Stores the best width in
        ``self.best_n_nodes``.

        Returns:
            pd.DataFrame: accuracies (plus the alpha column) indexed by
            hidden-node count (``n_hidden``).
        """
        print("\n######")
        print("Testing Different Node Numbers via Alpha Parameter")
        metrics = defaultdict(list)
        X_train, X_test, y_train, y_test = prep_data_for_clf(
            self.data, self.target, random_state=self.random)

        sclr = StandardScaler()
        sclr.fit(X_train.astype('float'))
        X_train_std = sclr.transform(X_train.astype('float'))
        X_test_std = sclr.transform(X_test.astype('float'))

        num_hidden_layers = []
        for alpha in range_:
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                hidden = self.num_hidden(self.data, 1, alpha)
                num_hidden_layers.append(hidden)

                clf = MLPClassifier(hidden_layer_sizes=(hidden, 2),
                                    max_iter=1000,
                                    early_stopping=True,
                                    n_iter_no_change=20,
                                    random_state=self.random)

                clf.fit(X_train_std, y_train)
                preds_train = clf.predict(X_train_std)
                preds_test = clf.predict(X_test_std)

                metrics['train_acc'].append(
                    accuracy_score(y_true=y_train, y_pred=preds_train))
                metrics['test_acc'].append(
                    accuracy_score(y_true=y_test, y_pred=preds_test))

        results = pd.DataFrame(metrics, index=num_hidden_layers)
        results['alpha'] = range_
        results.index.name = "n_hidden"

        self.plot_hl(num_hidden_layers,
                     stats=results,
                     plt_title='MLPC Train and Test Accuracy for {}'.format(
                         self.data.index.name))
        # Node count with the highest held-out accuracy.
        self.best_n_nodes = results['test_acc'].idxmax()
        if self.save:
            # CONSISTENCY FIX: every other MLP result in this class is
            # written under ./results/MLP/ (see the activation and
            # max_iter analyses); this method previously wrote to the
            # stray ./results/MLPC/ directory.
            results.to_csv(
                "./results/MLP/{}_hidden_layer_analysis.csv".format(
                    self.data.index.name))

        return results
コード例 #11
0
    def general_analysis(self):
        """Baseline KNN run: learning curves, timing, CV, and scores.

        Plots learning curves for uniform- and distance-weighted KNN,
        reports execution time on standardized features, then prints
        10-fold CV accuracy plus train/test accuracy and weighted F1 for
        both weighting schemes.
        """
        print("\n######")
        print("KNN Classifier:")
        print("Default Baseline values (5 neighbors)")

        clf = KNeighborsClassifier(n_jobs=-1)
        plot_learning_curve(clf,
                            '{} KNN Learning Curve (uniform)'.format(
                                self.data.index.name),
                            self.data,
                            self.target,
                            cv=5,
                            scale=True)

        clf = KNeighborsClassifier(weights='distance', n_jobs=-1)
        plot_learning_curve(clf,
                            '{} KNN Learning Curve (distance)'.format(
                                self.data.index.name),
                            self.data,
                            self.target,
                            cv=5,
                            scale=True)

        print("\n~~~~~~")
        print("Execution time metrics")
        X_train, X_test, y_train, y_test = prep_data_for_clf(
            self.data, self.target, random_state=self.random)

        # KNN is distance based, so standardize features before the
        # timing and cross-validation runs below.
        sclr = StandardScaler()
        sclr.fit(X_train.astype('float'))
        X_train_std = sclr.transform(X_train.astype('float'))
        X_test_std = sclr.transform(X_test.astype('float'))

        # NOTE(review): this times the last `clf` assigned above, i.e.
        # only the distance-weighted classifier — confirm that is the
        # intended baseline for the timing numbers.
        training_time, testing_time = measure_execution_time(
            clf,
            pd.concat([pd.DataFrame(X_train_std),
                       pd.DataFrame(X_test_std)]), pd.concat([y_train,
                                                              y_test]))
        print("Training time input dim of {} : {:.4f} (+/- {:.4f})".format(
            X_train.shape, np.mean(training_time), np.std(training_time)))
        print("Testing time input dim of {}: {:.4f} (+/- {:.4f})".format(
            X_test.shape, np.mean(testing_time), np.std(testing_time)))

        # Evaluate both weighting schemes on the standardized data.
        for w in ['uniform', 'distance']:
            print("\n~~~~~~")
            print('{} weights:'.format(w.capitalize()))
            clf = KNeighborsClassifier(weights=w, n_jobs=-1)
            scores = cross_val_score(clf,
                                     pd.concat([
                                         pd.DataFrame(X_train_std),
                                         pd.DataFrame(X_test_std)
                                     ]),
                                     pd.concat([y_train, y_test]),
                                     cv=10,
                                     n_jobs=-1)
            print("10 Fold Cross Validation Accuracy: {:.4f} (+/- {:.4f})".
                  format(scores.mean(),
                         scores.std() * 2))

            clf.fit(X_train_std, y_train)
            preds_train = clf.predict(X_train_std)
            preds_test = clf.predict(X_test_std)
            print("Training Accuracy:",
                  accuracy_score(y_true=y_train, y_pred=preds_train))
            print(
                "Training F1:",
                f1_score(y_true=y_train,
                         y_pred=preds_train,
                         average='weighted'))
            print("Testing Accuracy:",
                  accuracy_score(y_true=y_test, y_pred=preds_test))
            print(
                "Testing F1:",
                f1_score(y_true=y_test, y_pred=preds_test, average='weighted'))

        print("~~~~~~\n")
コード例 #12
0
ファイル: dt_analysis.py プロジェクト: miketong08/CS7641
    def general_analysis(self):
        """Baseline decision-tree run: learning curve, timing, CV, scores.

        Plots a learning curve for a default DecisionTreeClassifier, then
        reports execution time, 10-fold CV accuracy, and train/test
        accuracy / weighted F1 for both gini and entropy splits.
        """
        print("\n######")
        print("Decision Tree Classifier:")
        print("Default Baseline values (no max depth or max leaf nodes)\n")

        clf = DecisionTreeClassifier(random_state=self.random_state)
        plot_learning_curve(clf,
                            '{} Decision Tree Learning Curve'.format(
                                self.data.index.name),
                            self.data, self.target, cv=5)

        print("\n~~~~~~")
        print("Execution time metrics")
        X_train, X_test, y_train, y_test = prep_data_for_clf(
            self.data, self.target, random_state=self.random_state)

        training_time, testing_time = measure_execution_time(
            clf,
            self.data.drop(columns=[self.target], axis=1),
            self.data[self.target])
        print("Training time input dim of {} : {:.4f} (+/- {:.4f})".format(
            X_train.shape, np.mean(training_time), np.std(training_time)))
        print("Testing time input dim of {}: {:.4f} (+/- {:.4f})".format(
            X_test.shape, np.mean(testing_time), np.std(testing_time)))

        print("\n~~~~~~")
        print('Split on Gini Importance:')
        scores = cross_val_score(clf,
                                 pd.concat([X_train, X_test]),
                                 pd.concat([y_train, y_test]),
                                 cv=10, n_jobs=-1)
        print("10 Fold Cross Validation Accuracy: {:.4f} (+/- {:.4f})".format(
            scores.mean(), scores.std() * 2))

        clf.fit(X_train, y_train)
        preds_train = clf.predict(X_train)
        preds_test = clf.predict(X_test)

        print("Training Accuracy:",
              accuracy_score(y_true=y_train, y_pred=preds_train))
        print("Training F1:",
              f1_score(y_true=y_train, y_pred=preds_train, average='weighted'))
        print("Testing Accuracy:",
              accuracy_score(y_true=y_test, y_pred=preds_test))
        print("Testing F1:",
              f1_score(y_true=y_test, y_pred=preds_test, average='weighted'))

        print('\n~~~~~~')
        print('Split on Entropy Gain:')
        # BUG FIX: the entropy tree previously hard-coded
        # random_state=7308 instead of using the instance's configured
        # seed like every other classifier in this class.
        clf = DecisionTreeClassifier(criterion='entropy',
                                     random_state=self.random_state)
        scores = cross_val_score(clf,
                                 pd.concat([X_train, X_test]),
                                 pd.concat([y_train, y_test]),
                                 cv=10, n_jobs=-1)
        print("10 Fold Cross Validation Accuracy: {:.4f} (+/- {:.4f})".format(
            scores.mean(), scores.std() * 2))

        clf.fit(X_train, y_train)
        preds_train = clf.predict(X_train)
        preds_test = clf.predict(X_test)

        print("Training Accuracy:",
              accuracy_score(y_true=y_train, y_pred=preds_train))
        print("Training F1:",
              f1_score(y_true=y_train, y_pred=preds_train, average='weighted'))
        print("Testing Accuracy:",
              accuracy_score(y_true=y_test, y_pred=preds_test))
        print("Testing F1:",
              f1_score(y_true=y_test, y_pred=preds_test, average='weighted'))
        print("~~~~~~\n")
コード例 #13
0
ファイル: mlpc_analysis.py プロジェクト: miketong08/CS7641
    def general_analysis(self):
        """Baseline MLP run: learning curve, timing, CV, and scores.

        Plots a learning curve for a default MLPClassifier
        (max_iter=1000), reports execution time and 10-fold CV accuracy
        on standardized features, then prints train/test accuracy and
        weighted F1.  All warnings are suppressed for the duration.
        """
        with warnings.catch_warnings():
            # Silence warnings (e.g. non-convergence) for the entire
            # analysis below.
            warnings.simplefilter("ignore")
            print("\n######")
            print("Multilayer Perceptron Classifier:")
            print('Default Baseline values\n')

            clf = MLPClassifier(random_state=self.random, max_iter=1000)
            plot_learning_curve(clf,
                                '{} MLP Learning Curve'.format(
                                    self.data.index.name),
                                self.data,
                                self.target,
                                cv=5,
                                scale=True)

            print("\n~~~~~~")
            print("Execution time metrics")
            X_train, X_test, y_train, y_test = prep_data_for_clf(
                self.data, self.target, random_state=self.random)

            # Standardize features before timing and cross-validation.
            sclr = StandardScaler()
            sclr.fit(X_train.astype('float'))
            X_train_std = sclr.transform(X_train.astype('float'))
            X_test_std = sclr.transform(X_test.astype('float'))
            training_time, testing_time = measure_execution_time(
                clf,
                pd.concat(
                    [pd.DataFrame(X_train_std),
                     pd.DataFrame(X_test_std)]), pd.concat([y_train, y_test]))
            print("Training time input dim of {} : {:.4f} (+/- {:.4f})".format(
                X_train.shape, np.mean(training_time), np.std(training_time)))
            print("Testing time input dim of {}: {:.4f} (+/- {:.4f})".format(
                X_test.shape, np.mean(testing_time), np.std(testing_time)))

            print("\n~~~~~~")
            # Cross-validate on the full standardized data set.
            scores = cross_val_score(clf,
                                     pd.concat([
                                         pd.DataFrame(X_train_std),
                                         pd.DataFrame(X_test_std)
                                     ]),
                                     pd.concat([y_train, y_test]),
                                     cv=10,
                                     n_jobs=-1)

            print("10 Fold Cross Validation Accuracy: {:.4f} (+/- {:.4f})".
                  format(scores.mean(),
                         scores.std() * 2))

            clf.fit(X_train_std, y_train)
            preds_train = clf.predict(X_train_std)
            preds_test = clf.predict(X_test_std)

            print("Training Accuracy:",
                  accuracy_score(y_true=y_train, y_pred=preds_train))
            print(
                "Training F1:",
                f1_score(y_true=y_train,
                         y_pred=preds_train,
                         average='weighted'))
            print("Testing Accuracy:",
                  accuracy_score(y_true=y_test, y_pred=preds_test))
            print(
                "Testing F1:",
                f1_score(y_true=y_test, y_pred=preds_test, average='weighted'))
            print('~~~~~~\n')