Example #1
    def train(self, importance_cutoff=0.15):
        classes, self.class_names = self.get_labels()
        self.class_names = sort_class_names(self.class_names)

        # Get items and labels, filtering out those for which we have no labels.
        X_iter, y_iter = split_tuple_iterator(self.items_gen(classes))

        # Extract features from the items.
        X = self.extraction_pipeline.fit_transform(list(X_iter))

        # Materialize the labels as a NumPy array.
        y = np.array(y_iter)

        print(f"X: {X.shape}, y: {y.shape}")

        is_multilabel = isinstance(y[0], np.ndarray)
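        # (If each target is itself an array of per-class indicators, the task is multilabel.)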

        # Split the dataset into training and test sets.
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.1,
                                                            random_state=0)
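        # When a sampler is configured, it is chained with the classifier (presumably via
        # imbalanced-learn's make_pipeline) so that cross-validation resamples only the
        # training folds.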
        if self.sampler is not None:
            pipeline = make_pipeline(self.sampler, self.clf)
        else:
            pipeline = self.clf

        tracking_metrics = {}

        # Use k-fold cross validation to evaluate results.
        if self.cross_validation_enabled:
            scorings = ["accuracy"]
            if len(self.class_names) == 2:
                scorings += ["precision", "recall"]

            scores = cross_validate(pipeline,
                                    X_train,
                                    y_train,
                                    scoring=scorings,
                                    cv=5)

            print("Cross Validation scores:")
            for scoring in scorings:
                score = scores[f"test_{scoring}"]
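                # Note: the value stored under "std" is two standard deviations, not one.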
                tracking_metrics[f"test_{scoring}"] = {
                    "mean": score.mean(),
                    "std": score.std() * 2,
                }
                print(
                    f"{scoring.capitalize()}: {score.mean()} (+/- {score.std() * 2})"
                )

        # If a sampler is provided, resample the training set before fitting.
        if self.sampler is not None:
            X_train, y_train = self.sampler.fit_resample(X_train, y_train)

        print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
        print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

        self.clf.fit(X_train, y_train)

        feature_names = self.get_human_readable_feature_names()
        if self.calculate_importance and len(feature_names):
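            # shap.TreeExplainer assumes the classifier is a tree-based model (e.g. a
            # gradient-boosted ensemble); other estimators would need a different explainer.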
            explainer = shap.TreeExplainer(self.clf)
            shap_values = explainer.shap_values(X_train)

            shap.summary_plot(
                shap_values,
                X_train.toarray(),
                feature_names=feature_names,
                class_names=self.class_names,
                plot_type="layered_violin"
                if not isinstance(shap_values, list) else None,
                show=False,
            )

            matplotlib.pyplot.savefig("feature_importance.png",
                                      bbox_inches="tight")

            important_features = self.get_important_features(
                importance_cutoff, shap_values)

            self.print_feature_importances(important_features, feature_names)

            # Save the important features in the metric report too
            feature_report = self.save_feature_importances(
                important_features, feature_names)

            tracking_metrics["feature_report"] = feature_report

        print("Test Set scores:")
        # Evaluate results on the test set.
        y_pred = self.clf.predict(X_test)

        if is_multilabel:
            assert isinstance(
                y_pred[0], np.ndarray), "The predictions should be multilabel"

        print(f"No confidence threshold - {len(y_test)} classified")
        if is_multilabel:
            confusion_matrix = metrics.multilabel_confusion_matrix(
                y_test, y_pred)
        else:
            confusion_matrix = metrics.confusion_matrix(
                y_test, y_pred, labels=self.class_names)

            print(
                classification_report_imbalanced(y_test,
                                                 y_pred,
                                                 labels=self.class_names))
            report = classification_report_imbalanced_values(
                y_test, y_pred, labels=self.class_names)

            tracking_metrics["report"] = report

        print_labeled_confusion_matrix(confusion_matrix,
                                       self.class_names,
                                       is_multilabel=is_multilabel)

        tracking_metrics["confusion_matrix"] = confusion_matrix.tolist()

        # Evaluate results on the test set for some confidence thresholds.
        for confidence_threshold in [0.6, 0.7, 0.8, 0.9]:
            y_pred_probas = self.clf.predict_proba(X_test)
            confidence_class_names = self.class_names + ["__NOT_CLASSIFIED__"]
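            # Predictions whose top probability falls below the threshold are reported
            # under the extra "__NOT_CLASSIFIED__" class.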

            y_pred_filter = []
            classified_indices = []
            for i in range(len(y_test)):
                argmax = np.argmax(y_pred_probas[i])
                if y_pred_probas[i][argmax] < confidence_threshold:
                    if not is_multilabel:
                        y_pred_filter.append("__NOT_CLASSIFIED__")
                    continue

                classified_indices.append(i)
                if is_multilabel:
                    y_pred_filter.append(y_pred[i])
                else:
                    y_pred_filter.append(argmax)

            if not is_multilabel:
                y_pred_filter = np.array(y_pred_filter)
                y_pred_filter[classified_indices] = self.le.inverse_transform(
                    np.array(y_pred_filter[classified_indices], dtype=int))

            print(
                f"\nConfidence threshold > {confidence_threshold} - {len(classified_indices)} classified"
            )
            if is_multilabel:
                confusion_matrix = metrics.multilabel_confusion_matrix(
                    y_test[classified_indices], np.asarray(y_pred_filter))
            else:
                confusion_matrix = metrics.confusion_matrix(
                    y_test.astype(str),
                    y_pred_filter.astype(str),
                    labels=confidence_class_names,
                )
                print(
                    classification_report_imbalanced(
                        y_test.astype(str),
                        y_pred_filter.astype(str),
                        labels=confidence_class_names,
                    ))
            print_labeled_confusion_matrix(confusion_matrix,
                                           confidence_class_names,
                                           is_multilabel=is_multilabel)

        joblib.dump(self, self.__class__.__name__.lower())

        return tracking_metrics
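
The confidence-threshold evaluation above can be reproduced in isolation. The sketch below is a toy built on assumptions (a scikit-learn synthetic dataset, a RandomForestClassifier, and -1 standing in for "__NOT_CLASSIFIED__"); it is not code from the example, only an illustration of the same filtering idea.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

X, y = make_classification(n_samples=200, n_classes=3, n_informative=5, random_state=0)
clf = RandomForestClassifier(random_state=0).fit(X[:150], y[:150])

probas = clf.predict_proba(X[150:])
threshold = 0.7
# Keep the argmax class only when its probability clears the threshold;
# everything else falls into the sentinel class (-1 here).
pred = np.where(probas.max(axis=1) >= threshold,
                clf.classes_[probas.argmax(axis=1)],
                -1)
print(f"{(pred != -1).sum()} of {len(pred)} classified with confidence >= {threshold}")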
Example #2
    def train(self, importance_cutoff=0.15):
        classes, class_names = self.get_labels()
        class_names = sorted(class_names, reverse=True)

        # Get items and labels, filtering out those for which we have no labels.
        X_iter, y_iter = split_tuple_iterator(self.items_gen(classes))

        # Extract features from the items.
        X = self.extraction_pipeline.fit_transform(X_iter)

        # Materialize the labels as a NumPy array.
        y = np.array(y_iter)

        print(f"X: {X.shape}, y: {y.shape}")

        # Split the dataset into training and test sets.
        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            test_size=0.1,
                                                            random_state=0)
        if self.sampler is not None:
            pipeline = make_pipeline(self.sampler, self.clf)
        else:
            pipeline = self.clf

        tracking_metrics = {}

        # Use k-fold cross validation to evaluate results.
        if self.cross_validation_enabled:
            scorings = ["accuracy"]
            if len(class_names) == 2:
                scorings += ["precision", "recall"]

            scores = cross_validate(pipeline,
                                    X_train,
                                    y_train,
                                    scoring=scorings,
                                    cv=5)

            print("Cross Validation scores:")
            for scoring in scorings:
                score = scores[f"test_{scoring}"]
                tracking_metrics[f"test_{scoring}"] = {
                    "mean": score.mean(),
                    "std": score.std() * 2,
                }
                print(
                    f"{scoring.capitalize()}: {score.mean()} (+/- {score.std() * 2})"
                )

        # If a sampler is provided, resample the training set before fitting.
        if self.sampler is not None:
            X_train, y_train = self.sampler.fit_resample(X_train, y_train)

        print(f"X_train: {X_train.shape}, y_train: {y_train.shape}")
        print(f"X_test: {X_test.shape}, y_test: {y_test.shape}")

        self.clf.fit(X_train, y_train)

        feature_names = self.get_feature_names()
        if self.calculate_importance and len(feature_names):
            explainer = shap.TreeExplainer(self.clf)
            shap_values = explainer.shap_values(X_train)

            # TODO: Actually implement feature importance visualization for multiclass problems.
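            # For multiclass models shap returns one matrix per class; summing the
            # absolute values collapses them into a single importance matrix.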
            if isinstance(shap_values, list):
                shap_values = np.sum(np.abs(shap_values), axis=0)

            important_features = self.get_important_features(
                importance_cutoff, shap_values)

            print(f"\nTop {len(important_features)} Features:")
            for i, [importance, index,
                    is_positive] in enumerate(important_features):
                print(
                    f'{i + 1}. \'{feature_names[int(index)]}\' ({"+" if (is_positive) else "-"}{importance})'
                )

        print("Test Set scores:")
        # Evaluate results on the test set.
        y_pred = self.clf.predict(X_test)

        print(f"No confidence threshold - {len(y_test)} classified")
        confusion_matrix = metrics.confusion_matrix(y_test,
                                                    y_pred,
                                                    labels=class_names)
        print(confusion_matrix)
        tracking_metrics["confusion_matrix"] = confusion_matrix.tolist()

        print(
            classification_report_imbalanced(y_test,
                                             y_pred,
                                             labels=class_names))
        report = classification_report_imbalanced_values(y_test,
                                                         y_pred,
                                                         labels=class_names)

        tracking_metrics["report"] = report

        # Evaluate results on the test set for some confidence thresholds.
        for confidence_threshold in [0.6, 0.7, 0.8, 0.9]:
            y_pred_probas = self.clf.predict_proba(X_test)

            y_test_filter = []
            y_pred_filter = []
            for i in range(len(y_test)):
                argmax = np.argmax(y_pred_probas[i])
                if y_pred_probas[i][argmax] < confidence_threshold:
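                    # Low-confidence predictions are dropped from this evaluation entirely.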
                    continue

                y_test_filter.append(y_test[i])
                y_pred_filter.append(argmax)

            y_pred_filter = self.le.inverse_transform(y_pred_filter)

            print(
                f"\nConfidence threshold > {confidence_threshold} - {len(y_test_filter)} classified"
            )
            print(
                metrics.confusion_matrix(y_test_filter,
                                         y_pred_filter,
                                         labels=class_names))
            print(
                classification_report_imbalanced(y_test_filter,
                                                 y_pred_filter,
                                                 labels=class_names))

        joblib.dump(self, self.__class__.__name__.lower())

        return tracking_metrics
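
Both variants are methods on a model base class: a concrete subclass is expected to supply extraction_pipeline, clf, le, sampler, the label/feature helpers, and the cross_validation_enabled/calculate_importance flags. A minimal invocation could look like the sketch below; the subclass name is purely hypothetical, not something defined in either example.

model = SomeConcreteModel()  # hypothetical subclass wiring up clf, sampler, extraction_pipeline, ...
tracking_metrics = model.train(importance_cutoff=0.15)
print(sorted(tracking_metrics.keys()))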