Example #1
    def score(self, estimator, X, y, took_log_of_y=False, advanced_scoring=False, verbose=2, name=None):
        X, y = utils.drop_missing_y_vals(X, y, output_column=None)

        if isinstance(estimator, GradientBoostingRegressor):
            X = X.toarray()

        predictions = estimator.predict(X)

        if took_log_of_y:
            for idx, val in enumerate(predictions):
                predictions[idx] = math.exp(val)

        try:
            score = self.scoring_func(y, predictions)
        except ValueError:

            bad_val_indices = []
            for idx, val in enumerate(y):
                if str(val) in bad_vals_as_strings:
                    bad_val_indices.append(idx)

            predictions = [val for idx, val in enumerate(predictions) if idx not in bad_val_indices]
            y = [val for idx, val in enumerate(y) if idx not in bad_val_indices]

            print('Found ' + str(len(bad_val_indices)) + ' null or infinity values in the y values. We will ignore these, and report the score on the rest of the dataset')
            score = self.scoring_func(y, predictions)

        if advanced_scoring:
            if hasattr(estimator, 'name'):
                print(estimator.name)
            advanced_scoring_regressors(predictions, y, verbose=verbose, name=name)
        return -1 * score
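The trailing sign flip follows the usual scikit-learn convention that a scorer should return a value where greater is better, so error metrics get negated. A minimal standalone sketch of the same idea, using sklearn's mean_squared_error as an assumed stand-in for scoring_func:

from sklearn.metrics import mean_squared_error

y_true = [3.0, 2.5, 4.0]
y_pred = [2.8, 2.7, 4.2]

mse = mean_squared_error(y_true, y_pred)  # error metric: lower is better
score = -1 * mse                          # negated so that higher is better for the optimizer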
Example #2
    def score(self, estimator, X, y, advanced_scoring=False):
        X, y = utils.drop_missing_y_vals(X, y, output_column=None)

        if isinstance(estimator, GradientBoostingClassifier):
            X = X.toarray()

        predictions = np.array(estimator.predict_proba(X))
        kwargs = {}

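        # Reshape y / predictions into the form the chosen scoring method expects: one-hot encoded y for log_loss / roc_auc, hard class predictions (with weighted averaging) for f1_score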
        if np.unique(y).size > 2 or predictions.ndim > 1:
            if self.scoring_method in ['log_loss', 'roc_auc']:
                y = np.array(y)
                y_one_hot = np.zeros((y.size, y.max() + 1))
                y_one_hot[np.arange(y.size), y] = 1
                y = y_one_hot
            if self.scoring_method in ['f1_score']:
                predictions = predictions.argmax(axis=1)
                kwargs['average'] = 'weighted'

        if self.scoring_method == 'brier_score_loss':
            # At the moment, Microsoft's LightGBM can return probabilities slightly above 1 or below 0, which can break some scoring functions. So we clamp each predicted probability into the [0, 1] range.
            probas = [max(min(row[1], 1), 0) for row in predictions]
            predictions = probas

        try:
            score = self.scoring_func(y, predictions, **kwargs)
        except ValueError as e:
            bad_val_indices = []
            for idx, val in enumerate(y):
                if str(val) in bad_vals_as_strings:
                    bad_val_indices.append(idx)

            predictions = [
                val for idx, val in enumerate(predictions)
                if idx not in bad_val_indices
            ]
            y = [
                val for idx, val in enumerate(y) if idx not in bad_val_indices
            ]

            print(
                'Found ' + str(len(bad_val_indices)) +
                ' null or infinity values in the y values. We will ignore these, and report the score on the rest of the dataset'
            )
            try:
                score = self.scoring_func(y, predictions)
            except ValueError:
                # Sometimes, particularly for a badly fit model using either too little data, or a really bad set of hyperparameters during a grid search, we can predict probas that are > 1 or < 0. We'll cap those here, while warning the user about them, because they're unlikely to occur in a model that's properly trained with enough data and reasonable params
                predictions = self.clean_probas(predictions)
                score = self.scoring_func(y, predictions)

        if self.scoring_method in [
                'accuracy', 'accuracy_score', 'roc_auc', 'f1_score'
        ]:
            score *= -1  # value needs to go up to optimize these

        if advanced_scoring:
            return (-1 * score, predictions)
        else:
            return -1 * score
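The y_one_hot construction above assumes integer class labels starting at 0. As a self-contained sketch of that indexing trick:

import numpy as np

y = np.array([0, 2, 1, 2])                   # integer class labels
y_one_hot = np.zeros((y.size, y.max() + 1))  # shape (n_samples, n_classes)
y_one_hot[np.arange(y.size), y] = 1          # one 1 per row, in the column of that row's label
# y_one_hot is now:
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]
#  [0. 0. 1.]]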
Example #3
    def score(self, estimator, X, y, advanced_scoring=False):

        X, y = utils.drop_missing_y_vals(X, y, output_column=None)

        if isinstance(estimator, GradientBoostingClassifier):
            X = X.toarray()

        predictions = estimator.predict_proba(X)

        if self.scoring_method == 'brier_score_loss':
            # At the moment, Microsoft's LightGBM can return probabilities slightly above 1 or below 0, which can break some scoring functions. So we clamp each predicted probability into the [0, 1] range.
            probas = [max(min(row[1], 1), 0) for row in predictions]
            predictions = probas

        try:
            score = self.scoring_func(y, predictions)
        except ValueError as e:
            bad_val_indices = []
            for idx, val in enumerate(y):
                if str(val) in bad_vals_as_strings:
                    bad_val_indices.append(idx)

            predictions = [
                val for idx, val in enumerate(predictions)
                if idx not in bad_val_indices
            ]
            y = [
                val for idx, val in enumerate(y) if idx not in bad_val_indices
            ]

            print(
                'Found ' + str(len(bad_val_indices)) +
                ' null or infinity values in the y values. We will ignore these, and report the score on the rest of the dataset'
            )
            try:
                score = self.scoring_func(y, predictions)
            except ValueError:
                # Sometimes, particularly for a badly fit model using either too little data, or a really bad set of hyperparameters during a grid search, we can predict probas that are > 1 or < 0. We'll cap those here, while warning the user about them, because they're unlikely to occur in a model that's properly trained with enough data and reasonable params
                predictions = self.clean_probas(predictions)
                score = self.scoring_func(y, predictions)

        if advanced_scoring:
            return (-1 * score, predictions)
        else:
            return -1 * score
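The brier_score_loss branch keeps only the positive-class column and clamps it into [0, 1]. An equivalent vectorized sketch with numpy, assuming a two-column predict_proba output:

import numpy as np

predictions = np.array([[0.30, 0.70],
                        [-0.02, 1.02],   # slightly out-of-range probabilities, e.g. from LightGBM
                        [0.90, 0.10]])
probas = np.clip(predictions[:, 1], 0, 1)  # same effect as max(min(row[1], 1), 0) per row
# probas -> array([0.7, 1. , 0.1])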
Example #4
    def score(self, X_test, y_test, advanced_scoring=True, verbose=2):

        if isinstance(X_test, list):
            X_test = pd.DataFrame(X_test)
        y_test = list(y_test)

        X_test, y_test = utils.drop_missing_y_vals(X_test, y_test,
                                                   self.output_column)

        if self._scorer is not None:
            if self.type_of_estimator == 'regressor':
                return self._scorer.score(self.trained_pipeline,
                                          X_test,
                                          y_test,
                                          self.took_log_of_y,
                                          advanced_scoring=advanced_scoring,
                                          verbose=verbose,
                                          name=self.name)

            elif self.type_of_estimator == 'classifier':
                # TODO: can probably refactor accuracy score now that we've turned scoring into its own class
                if self._scorer == accuracy_score:
                    predictions = self.trained_pipeline.predict(X_test)
                    return self._scorer.score(y_test, predictions)
                elif advanced_scoring:
                    score, probas = self._scorer.score(
                        self.trained_pipeline,
                        X_test,
                        y_test,
                        advanced_scoring=advanced_scoring)
                    utils_scoring.advanced_scoring_classifiers(probas,
                                                               y_test,
                                                               name=self.name)
                    return score
                else:
                    return self._scorer.score(
                        self.trained_pipeline,
                        X_test,
                        y_test,
                        advanced_scoring=advanced_scoring)
        else:
            return self.trained_pipeline.score(X_test, y_test)
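In the accuracy branch, hard class predictions from the pipeline are compared directly against y_test. A minimal sketch of the underlying metric, using sklearn's accuracy_score (the values below are made up for illustration):

from sklearn.metrics import accuracy_score

y_test = [0, 1, 1, 0]
predictions = [0, 1, 0, 0]
print(accuracy_score(y_test, predictions))  # 0.75 -- the fraction of exact matches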
Example #5
    def _prepare_for_training(self, X):

        # We accept input as either a DataFrame, or as a list of dictionaries. Internally, we use DataFrames. So if the user gave us a list, convert it to a DataFrame here.
        if isinstance(X, list):
            X_df = pd.DataFrame(X)
            del X
        else:
            X_df = X

        # To keep this as light in memory as possible, immediately remove any columns that the user has already told us should be ignored
        if len(self.cols_to_ignore) > 0:
            X_df = utils.safely_drop_columns(X_df, self.cols_to_ignore)

        # Having duplicate columns can really screw things up later. Remove them here, with user logging to tell them what we're doing
        X_df = utils.drop_duplicate_columns(X_df)

        # If we're writing training results to file, create the new empty file name here
        if self.write_gs_param_results_to_file:
            self.gs_param_file_name = 'most_recent_pipeline_grid_search_result.csv'
            try:
                os.remove(self.gs_param_file_name)
            except:
                pass

        # bad_rows = X_df[pd.isnull(X_df[self.output_column])]
        # if bad_rows.shape[0] > 0:
        #     print('We encountered a number of missing values for this output column')
        #     print('Specifically, here is the output column:')
        #     print(self.output_column)
        #     print('And here is the number of missing (nan, None, etc.) values for this column:')
        #     print(bad_rows.shape[0])
        #     print('We will remove these values, and continue with training on the cleaned dataset')
        # X_df = X_df.dropna(subset=[self.output_column])

        # Remove the output column from the dataset, and store it in the y variable
        y = list(X_df.pop(self.output_column))

        # Drop all rows that have an empty value for our output column
        # User logging so they can adjust if they pass in a bunch of bad values:
        X_df, y = utils.drop_missing_y_vals(X_df, y, self.output_column)

        # If this is a classifier, try to turn all the y values into proper ints
        # Some classifiers play more nicely if you give them category labels as ints rather than strings, so we'll make our jobs easier here if we can.
        if self.type_of_estimator == 'classifier':
            # The entire column must be turned into ints. If any value fails, don't convert anything in the column
            try:
                y_ints = []
                for val in y:
                    y_ints.append(int(val))
                y = y_ints
            except:
                pass
        else:
            # If this is a regressor, turn all the values into floats if possible, and remove this row if they cannot be turned into floats
            indices_to_delete = []
            y_floats = []
            bad_vals = []
            for idx, val in enumerate(y):
                try:
                    float_val = utils_data_cleaning.clean_val(val)
                    y_floats.append(float_val)
                except ValueError as err:
                    indices_to_delete.append(idx)
                    bad_vals.append(val)

            y = y_floats

            # Even more verbose logging here since these values are not just missing, they're strings for a regression problem
            if len(indices_to_delete) > 0:
                print(
                    'The y values given included some bad values that the machine learning algorithms will not be able to train on.'
                )
                print(
                    'The rows at these indices have been deleted because their y value could not be turned into a float:'
                )
                print(indices_to_delete)
                print('These were the bad values')
                print(bad_vals)
                X_df = X_df.drop(X_df.index[indices_to_delete])

        return X_df, y
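The regressor branch drops every row whose y value cannot be parsed as a float, mapping positional indices back onto the DataFrame's index labels. A small self-contained sketch of that pattern (plain float() is used here in place of utils_data_cleaning.clean_val, and the column name is made up):

import pandas as pd

X_df = pd.DataFrame({'feature': [10, 20, 30, 40]})
y = [1.5, 'not a number', 2.5, None]

indices_to_delete = []
y_floats = []
for idx, val in enumerate(y):
    try:
        y_floats.append(float(val))
    except (TypeError, ValueError):
        indices_to_delete.append(idx)

y = y_floats
# index[...] converts positions into index labels before dropping
X_df = X_df.drop(X_df.index[indices_to_delete])
# X_df now keeps rows 0 and 2, and y == [1.5, 2.5]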