Beispiel #1
0
def print_test_accuracy(show_example_errors=False,
                        show_confusion_matrix=False):
    """Print the classification accuracy on the test-set.

    Args:
        show_example_errors: When true, also plot some mis-classified
            examples via ``plot_functions.plot_example_errors``.
        show_confusion_matrix: When true, also plot the confusion matrix
            via ``plot_functions.plot_confusion_matrix``.
    """
    # Predicted classes for the test-set and whether each one is correct.
    correct, cls_pred = predict_cls_test()

    # Accuracy together with the count of correct classifications.
    acc, num_correct = classification_accuracy(correct)

    # One entry in `correct` per classified image.
    total_images = len(correct)

    report = "Accuracy on Test-Set: {0:.1%} ({1} / {2})".format(
        acc, num_correct, total_images)
    print(report)

    if show_example_errors:
        print("Example errors:")
        plot_functions.plot_example_errors(
            cls_pred=cls_pred,
            correct=correct,
            db=data["test"],
        )

    if show_confusion_matrix:
        print("Confusion Matrix:")
        plot_functions.plot_confusion_matrix(
            cls_pred=cls_pred,
            cls_true=data["test"]["cls"],
            num_classes=num_classes,
            class_names=class_names,
        )
def classify(X_train, X_test, y_train, y_test, vectorizer):
    """Fit a logistic-regression classifier and report how it performs.

    The model is trained on ``X_train``/``y_train`` and evaluated on
    ``X_test``/``y_test``. The confusion matrix and the most important
    words (looked up through ``vectorizer``) are plotted, then the
    confusion matrix and the summary metrics are printed.
    """
    model = LogisticRegression(
        C=30.0,
        class_weight='balanced',
        solver='newton-cg',
        multi_class='multinomial',
        n_jobs=-1,
        random_state=40,
    )
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Numeric evaluation of the predictions on the held-out split.
    accuracy, precision, recall, f1 = get_metrics(y_test, predictions)
    cm = confusion_matrix(y_test, predictions)

    # Ten highest- and lowest-scoring words according to the fitted model.
    relevant = get_relevant_features(vectorizer, model, n=10)
    top_scores, top_words, bottom_scores, bottom_words = relevant

    class_labels = ['Irrelevant', 'Disaster', 'Unsure']
    plot_confusion_matrix(cm,
                          classes=class_labels,
                          normalize=False,
                          title='Confusion matrix')
    plot_important_words(top_scores, top_words, bottom_scores, bottom_words,
                         "Most important words for relevance")

    print(cm)
    summary = (accuracy, precision, recall, f1)
    print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" %
          summary)
Beispiel #3
0
    def decision_tree(self, data, labels, num_features, num_folds=5):
        """Grid-search decision trees with stratified k-fold cross-validation.

        Every combination of ``criteria``, ``max_depths`` and
        ``min_samples_leaf`` from ``self.param_grid`` is evaluated. The mean
        accuracy and sensitivity over the folds are plotted per criterion,
        and the best combination (by mean accuracy) is written into
        ``self.best_solution`` together with its confusion matrix.

        Args:
            data: Samples, forwarded to ``prep.select_single_probes`` and
                ``prep.separate_and_prepare_data``.
            labels: Labels matching ``data``, forwarded to the same helpers.
            num_features: Forwarded to ``prep.separate_and_prepare_data``.
            num_folds: Number of cross-validation folds. A truthy value
                overrides ``self.param_grid['num_folds']``.

        Returns:
            ``self.best_solution`` after the search.

        Note:
            The accuracy and sensitivity figures have two columns, so at most
            two entries in ``param_grid['criteria']`` can be plotted.
        """
        _, axs_acc = plt.subplots(1, 2, figsize=(13, 4), squeeze=False)
        _, axs_sens = plt.subplots(1, 2, figsize=(13, 4), squeeze=False)

        if num_folds:
            self.param_grid['num_folds'] = num_folds
        # Read back the key that was just written. The previous code read
        # 'fold', so the num_folds argument never reached StratifiedKFold.
        # 'fold' is kept only as a fallback for grids that still define it.
        n_splits = self.param_grid.get('num_folds')
        if n_splits is None:
            n_splits = self.param_grid.get('fold')
        skf = StratifiedKFold(n_splits=n_splits)

        for criteria_index, criteria in enumerate(
                self.param_grid.get('criteria')):
            values_acc = {}
            values_sens = {}
            for depth in self.param_grid.get('max_depths'):
                accuracies_values = []
                sensitivities_values = []
                for num_samples in self.param_grid.get('min_samples_leaf'):
                    tree = DecisionTreeClassifier(min_samples_leaf=num_samples,
                                                  max_depth=depth,
                                                  criterion=criteria,
                                                  min_impurity_decrease=0.005)
                    (fold_accuracies, fold_sensitivities, fold_predictions,
                     fold) = evaluation.initialize_metrics(self)

                    # there are four probes for every patient, it's reasonable to take only one
                    single_data, single_labels = prep.select_single_probes(
                        data, labels)

                    for train_index, test_index in skf.split(
                            single_data, single_labels):
                        trn_x, tst_x, trn_y, tst_y = prep.separate_and_prepare_data(
                            data, labels, train_index, test_index,
                            num_features)

                        # CLASSIFICATION
                        tree.fit(trn_x, trn_y)
                        prd_y = tree.predict(tst_x)

                        # EVALUATION
                        evaluation.append_solution_for_fold(
                            fold_accuracies, fold_sensitivities,
                            fold_predictions, fold, tst_y, prd_y)
                        fold = fold + 1

                    # Fold averages for this parameter combination.
                    mean_accuracy = statistics.mean(fold_accuracies)
                    mean_sensitivity = statistics.mean(fold_sensitivities)

                    # Keep the combination with the highest mean accuracy.
                    if mean_accuracy > self.best_solution.get('accuracy'):
                        self.best_solution['criteria'] = criteria
                        self.best_solution['min_samples_leaf'] = num_samples
                        self.best_solution['max_depths'] = depth
                        self.best_solution['accuracy'] = mean_accuracy
                        self.best_solution['sensitivity'] = mean_sensitivity

                        TN, FP, FN, TP = evaluation.compute_confm_values(
                            fold_predictions)
                        self.best_solution['confusion_matrix'] = np.array(
                            ([TN, FP], [FN, TP]))
                    accuracies_values.append(mean_accuracy)
                    sensitivities_values.append(mean_sensitivity)

                # One line per depth, one point per min_samples_leaf value.
                values_acc[depth] = accuracies_values
                values_sens[depth] = sensitivities_values
            plot.multiple_line_chart(axs_acc[0, criteria_index],
                                     self.param_grid.get('min_samples_leaf'),
                                     values_acc,
                                     'Decision Trees with %s criteria' %
                                     criteria,
                                     'max_depths',
                                     'min_samples_leaf',
                                     'accuracy',
                                     percentage=True)

            plot.multiple_line_chart(axs_sens[0, criteria_index],
                                     self.param_grid.get('min_samples_leaf'),
                                     values_sens,
                                     'Decision Trees with %s criteria' %
                                     criteria,
                                     'max_depths',
                                     'min_samples_leaf',
                                     'sensitivity',
                                     percentage=True)
        plt.show()
        _, axs = plt.subplots(1, 1, figsize=(4, 4), squeeze=False)
        plot.plot_confusion_matrix(axs[0, 0],
                                   self.best_solution.get('confusion_matrix'),
                                   'Confusion matrix', [0, 1], True)
        plt.show()
        return self.best_solution
Beispiel #4
0
            # Calculate the kernel matrix for raw data
            start = time.time()
            K_train_rw = gk.fit_transform(G_train_rw)
            K_test_rw = gk.transform(G_test_rw)
            end = time.time()
            print("", end=".")

            # Initialise an SVM and fit.
            clf = svm.SVC(kernel='precomputed')
            clf.fit(K_train_rw, y_train_rw)
            print("", end=". ")

            # Predict and test.
            y_pred_rw = clf.predict(K_test_rw)
            print("Confusion Matrix: \n", confusion_matrix(y_test_rw, y_pred_rw))
            plot_confusion_matrix(y_test_rw, y_pred_rw, labels, title="Confusion Matrix Before Smoothing")

            # Calculate accuracy of classification.
            data_kernel_rw.append(
                sec_to_time(round(end - start, 2)) +
                " ~ " + str(round(accuracy_score(y_test_rw, y_pred_rw)*100, 2)) + "%")
            rw_ac.append((accuracy_score(y_test_rw, y_pred_rw)))
            print("Raw: ", data_kernel_rw[-1])

            # Calculate the kernel matrix for Smooth data
            start = time.time()
            K_train_sm = gk.fit_transform(G_train_sm)
            K_test_sm = gk.transform(G_test_sm)
            end = time.time()
            print("", end=".")
Beispiel #5
0
    def random_forest(self, data, labels, num_features, num_folds):
        """Grid-search random forests with stratified k-fold cross-validation.

        Every combination of ``max_features``, ``max_depths`` and
        ``num_estimators`` from ``self.param_grid`` is evaluated. The mean
        accuracy and sensitivity over the folds are plotted per
        ``max_features`` value, and the best combination (by mean accuracy)
        is written into ``self.best_solution`` with its confusion matrix.

        Args:
            data: Samples, forwarded to ``prep.select_single_probes`` and
                ``self.separate_and_prepare_data``.
            labels: Labels matching ``data``, forwarded to the same helpers.
            num_features: Forwarded to ``self.separate_and_prepare_data``.
            num_folds: Number of cross-validation folds. A truthy value
                overrides ``self.param_grid['num_folds']``.

        Returns:
            ``self.best_solution`` after the search.

        Note:
            The accuracy and sensitivity figures have two columns, so at most
            two entries in ``param_grid['max_features']`` can be plotted.
        """
        _, axs_acc = plt.subplots(1, 2, figsize=(10, 4), squeeze=False)
        _, axs_sens = plt.subplots(1, 2, figsize=(10, 4), squeeze=False)
        if num_folds:
            self.param_grid['num_folds'] = num_folds

        skf = StratifiedKFold(n_splits=self.param_grid.get('num_folds'))

        for max_features_index, max_features in enumerate(
                self.param_grid.get('max_features')):
            values_acc = {}
            values_sens = {}
            for depth in self.param_grid.get('max_depths'):
                accuracies_values = []
                sensitivities_values = []
                for num_estimators in self.param_grid.get('num_estimators'):
                    (fold_accuracies, fold_sensitivities, fold_predictions,
                     fold) = evaluation.initialize_metrics(self)

                    # there are four probes for every patient, it's reasonable to take only one
                    single_data, single_labels = prep.select_single_probes(data, labels)

                    for train_index, test_index in skf.split(single_data, single_labels):
                        trn_x, tst_x, trn_y, tst_y = self.separate_and_prepare_data(
                            data, labels, train_index, test_index,
                            num_features)

                        # CLASSIFICATION
                        rf = RandomForestClassifier(n_estimators=num_estimators,
                                                    max_depth=depth,
                                                    max_features=max_features)
                        rf.fit(trn_x, trn_y)
                        prd_y = rf.predict(tst_x)

                        # EVALUATION
                        evaluation.append_solution_for_fold(fold_accuracies,
                                                            fold_sensitivities,
                                                            fold_predictions,
                                                            fold,
                                                            tst_y,
                                                            prd_y)
                        fold = fold + 1

                    # Fold averages for this parameter combination.
                    mean_accuracy = statistics.mean(fold_accuracies)
                    mean_sensitivity = statistics.mean(fold_sensitivities)

                    # Keep the combination with the highest mean accuracy.
                    if mean_accuracy > self.best_solution.get('accuracy'):
                        # Each key stores the value of the parameter it names.
                        # Previously 'num_estimators' got the max_features
                        # index and 'max_features' got the estimator count.
                        self.best_solution['num_estimators'] = num_estimators
                        self.best_solution['max_depths'] = depth
                        self.best_solution['max_features'] = max_features
                        self.best_solution['accuracy'] = mean_accuracy
                        self.best_solution['sensitivity'] = mean_sensitivity

                        TN, FP, FN, TP = evaluation.compute_confm_values(fold_predictions)
                        self.best_solution['confusion_matrix'] = np.array(([TN, FP], [FN, TP]))

                    accuracies_values.append(mean_accuracy)
                    sensitivities_values.append(mean_sensitivity)

                # One line per depth, one point per num_estimators value.
                values_acc[depth] = accuracies_values
                values_sens[depth] = sensitivities_values
            plot.multiple_line_chart(axs_acc[0, max_features_index],
                                     self.param_grid.get('num_estimators'),
                                     values_acc,
                                     'Random Forests with %s features' % max_features,
                                     'max_depths',
                                     'nr estimators',
                                     'accuracy',
                                     percentage=True)
            # No plt.figure() here: both charts draw on axes created above,
            # so a new figure per iteration would only show up empty.
            plot.multiple_line_chart(axs_sens[0, max_features_index],
                                     self.param_grid.get('num_estimators'),
                                     values_sens,
                                     'Random Forests with %s features' % max_features,
                                     'max_depths',
                                     'nr estimators',
                                     'sensitivity',
                                     percentage=True)
        plt.show()
        _, axs = plt.subplots(1, 1, figsize=(4, 4), squeeze=False)
        plot.plot_confusion_matrix(axs[0, 0],
                                   self.best_solution.get('confusion_matrix'),
                                   'Confusion matrix', [0, 1], True)
        plt.show()
        return self.best_solution
Beispiel #6
0
    top_scores = sorted_contributions['Relevant']['supporters'][:10].tolist()
    bottom_words = sorted_contributions['Relevant']['detractors'][:10].index.tolist()
    bottom_scores = sorted_contributions['Relevant']['detractors'][:10].tolist()

    plot_important_words(top_scores, top_words, bottom_scores, bottom_words, "Most important words for relevance")


# Script entry point: train and evaluate a logistic-regression classifier on
# averaged word2vec embeddings of the texts stored in 'ready_data.pkl'.
# NOTE(review): the names bound here are module-level globals, and
# plot_important_words_with_lime() is called without arguments, so it
# presumably reads some of them - confirm before renaming anything.
if __name__ == '__main__':
    # NOTE(review): unpickling executes arbitrary code; only load a trusted file.
    questions = pd.read_pickle('ready_data.pkl')
    list_corpus = questions['text'].tolist()
    list_labels = questions['class_label'].tolist()

    # Tokenize every text, then reduce each token list to a single embedding.
    tokenized_corpus = [[tokens for tokens in gensim.utils.tokenize(text)] for text in list_corpus]
    embeddings = [get_average_word2vec(tokens) for tokens in tokenized_corpus]
    # Hold out 20% of the samples for testing; the split is seeded.
    X_train, X_test, y_train, y_test = train_test_split(embeddings, list_labels, test_size=0.2, random_state=40)
    plot_LSA(X_train, y_train)

    clf = LogisticRegression(C=30.0, class_weight='balanced', solver='newton-cg', multi_class='multinomial', n_jobs=-1,
                             random_state=40)
    clf.fit(X_train, y_train)
    y_predicted_counts = clf.predict(X_test)

    # Evaluate the predictions on the held-out split.
    accuracy, precision, recall, f1 = get_metrics(y_test, y_predicted_counts)
    cm = confusion_matrix(y_test, y_predicted_counts)

    plot_confusion_matrix(cm, classes=['Irrelevant', 'Disaster', 'Unsure'], normalize=False, title='Confusion matrix')
    plot_important_words_with_lime()

    print(cm)
    print("accuracy = %.3f, precision = %.3f, recall = %.3f, f1 = %.3f" % (accuracy, precision, recall, f1))
Beispiel #7
0
    def knn(self, data, labels, num_features, num_folds=5):
        """Grid-search k-nearest-neighbours with stratified k-fold CV.

        Every pairing of a distance metric (``param_grid['dist']``) with a
        neighbour count (``param_grid['n_neighbors']``) is cross-validated.
        Mean accuracy and sensitivity are charted per metric, the best
        pairing (by mean accuracy) is stored in ``self.best_solution`` with
        its confusion matrix, and ``self.best_solution`` is returned.
        """
        _, axs_acc = plt.subplots(1, 1, figsize=(6, 4), squeeze=False)
        _, axs_sens = plt.subplots(1, 1, figsize=(6, 4), squeeze=False)

        # A truthy num_folds overrides whatever the grid currently holds.
        if num_folds:
            self.param_grid['num_folds'] = num_folds
        splitter = StratifiedKFold(n_splits=self.param_grid.get('num_folds'))

        values_acc = {}
        values_sens = {}
        for dist in self.param_grid.get('dist'):
            acc_per_k = []
            sens_per_k = []
            for n_neigh in self.param_grid.get('n_neighbors'):
                (fold_accuracies, fold_sensitivities, fold_predictions,
                 fold) = evaluation.initialize_metrics(self)

                # Four probes exist per patient; split on a single one each.
                single_data, single_labels = prep.select_single_probes(
                    data, labels)

                for train_index, test_index in splitter.split(single_data,
                                                              single_labels):
                    trn_x, tst_x, trn_y, tst_y = self.separate_and_prepare_data(
                        data, labels, train_index, test_index, num_features)

                    # Fit a fresh classifier on this fold and predict.
                    model = KNeighborsClassifier(n_neighbors=n_neigh,
                                                 metric=dist)
                    model.fit(trn_x, trn_y)
                    prd_y = model.predict(tst_x)

                    # Record this fold's results.
                    evaluation.append_solution_for_fold(
                        fold_accuracies, fold_sensitivities,
                        fold_predictions, fold, tst_y, prd_y)
                    fold += 1

                mean_acc = statistics.mean(fold_accuracies)
                if mean_acc > self.best_solution.get('accuracy'):
                    TN, FP, FN, TP = evaluation.compute_confm_values(
                        fold_predictions)
                    self.best_solution['dist'] = dist
                    self.best_solution['n_neighbors'] = n_neigh
                    self.best_solution['accuracy'] = mean_acc
                    self.best_solution['sensitivity'] = statistics.mean(
                        fold_sensitivities)
                    self.best_solution['confusion_matrix'] = np.array(
                        ([TN, FP], [FN, TP]))

                # One point per neighbour count.
                acc_per_k.append(mean_acc)
                sens_per_k.append(statistics.mean(fold_sensitivities))

            # One line per distance metric.
            values_acc[dist] = acc_per_k
            values_sens[dist] = sens_per_k

        charts = (
            (axs_acc, values_acc, 'accuracy'),
            (axs_sens, values_sens, 'sensitivity'),
        )
        for axes, series, metric_name in charts:
            plot.multiple_line_chart(axes[0, 0],
                                     self.param_grid.get('n_neighbors'),
                                     series,
                                     'KNN for different number of neighbours',
                                     'Distance metrics',
                                     'nr neighbours',
                                     metric_name,
                                     percentage=False)
        plt.show()

        _, axs = plt.subplots(1, 1, figsize=(4, 4), squeeze=False)
        plot.plot_confusion_matrix(axs[0, 0],
                                   self.best_solution.get('confusion_matrix'),
                                   'Confusion matrix',
                                   [0, 1],
                                   True)
        plt.show()
        return self.best_solution