def score(self, estimator, X, y, took_log_of_y=False, advanced_scoring=False, verbose=2, name=None):
    X, y = utils.drop_missing_y_vals(X, y, output_column=None)

    if isinstance(estimator, GradientBoostingRegressor):
        X = X.toarray()

    predictions = estimator.predict(X)

    if took_log_of_y:
        # Training happened on log(y), so invert the transform before scoring
        for idx, val in enumerate(predictions):
            predictions[idx] = math.exp(val)

    try:
        score = self.scoring_func(y, predictions)
    except ValueError:
        # Filter out rows whose y value is null/infinity, then score the rest
        bad_val_indices = []
        for idx, val in enumerate(y):
            if str(val) in bad_vals_as_strings:
                bad_val_indices.append(idx)

        predictions = [val for idx, val in enumerate(predictions) if idx not in bad_val_indices]
        y = [val for idx, val in enumerate(y) if idx not in bad_val_indices]

        print('Found ' + str(len(bad_val_indices)) + ' null or infinity values in the y values. We will ignore these, and report the score on the rest of the dataset')
        score = self.scoring_func(y, predictions)

    if advanced_scoring:
        if hasattr(estimator, 'name'):
            print(estimator.name)
        advanced_scoring_regressors(predictions, y, verbose=verbose, name=name)

    return -1 * score
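# --- Illustrative sketch, not part of the library above ---
# A minimal, standalone demonstration of the took_log_of_y inverse-transform:
# if the pipeline was trained on log(y), raw predictions live in log space and
# must be exponentiated before any metric sees them.
import math

log_space_predictions = [2.0, 3.0]
predictions = [math.exp(val) for val in log_space_predictions]
# predictions is now approximately [7.389, 20.086], back on the original scale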
def score(self, estimator, X, y, advanced_scoring=False):
    X, y = utils.drop_missing_y_vals(X, y, output_column=None)

    if isinstance(estimator, GradientBoostingClassifier):
        X = X.toarray()

    predictions = np.array(estimator.predict_proba(X))
    kwargs = {}

    if np.unique(y).size > 2 or predictions.ndim > 1:
        if self.scoring_method in ['log_loss', 'roc_auc']:
            # These metrics need per-class targets, so expand the integer
            # labels into a one-hot indicator matrix
            y = np.array(y)
            y_one_hot = np.zeros((y.size, y.max() + 1))
            y_one_hot[np.arange(y.size), y] = 1
            y = y_one_hot
        if self.scoring_method in ['f1_score']:
            # f1_score wants class labels, not probabilities
            predictions = predictions.argmax(axis=1)
            kwargs['average'] = 'weighted'

    if self.scoring_method == 'brier_score_loss':
        # At the moment, Microsoft's LightGBM returns probabilities > 1 and < 0, which can break some scoring functions. So we have to take the max of 1 and the pred, and the min of 0 and the pred.
        probas = [max(min(row[1], 1), 0) for row in predictions]
        predictions = probas

    try:
        score = self.scoring_func(y, predictions, **kwargs)
    except ValueError:
        bad_val_indices = []
        for idx, val in enumerate(y):
            if str(val) in bad_vals_as_strings:
                bad_val_indices.append(idx)

        predictions = [val for idx, val in enumerate(predictions) if idx not in bad_val_indices]
        y = [val for idx, val in enumerate(y) if idx not in bad_val_indices]

        print('Found ' + str(len(bad_val_indices)) + ' null or infinity values in the y values. We will ignore these, and report the score on the rest of the dataset')
        try:
            score = self.scoring_func(y, predictions)
        except ValueError:
            # Sometimes, particularly for a badly fit model using either too little data, or a really bad set of hyperparameters during a grid search, we can predict probas that are > 1 or < 0. We'll cap those here, while warning the user about them, because they're unlikely to occur in a model that's properly trained with enough data and reasonable params
            predictions = self.clean_probas(predictions)
            score = self.scoring_func(y, predictions)

    if self.scoring_method in ['accuracy', 'accuracy_score', 'roc_auc', 'f1_score']:
        score *= -1  # value needs to go up to optimize these

    if advanced_scoring:
        return (-1 * score, predictions)
    else:
        return -1 * score
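# --- Illustrative sketch, not part of the library above ---
# The one-hot expansion used for multiclass 'log_loss' / 'roc_auc', in
# isolation: integer labels become an indicator matrix with one column per
# class, matching the shape of predict_proba output.
import numpy as np

y = np.array([0, 2, 1, 2])
y_one_hot = np.zeros((y.size, y.max() + 1))
y_one_hot[np.arange(y.size), y] = 1
# y_one_hot:
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]
#  [0. 0. 1.]]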
def score(self, estimator, X, y, advanced_scoring=False):
    X, y = utils.drop_missing_y_vals(X, y, output_column=None)

    if isinstance(estimator, GradientBoostingClassifier):
        X = X.toarray()

    predictions = estimator.predict_proba(X)

    if self.scoring_method == 'brier_score_loss':
        # At the moment, Microsoft's LightGBM returns probabilities > 1 and < 0, which can break some scoring functions. So we have to take the max of 1 and the pred, and the min of 0 and the pred.
        probas = [max(min(row[1], 1), 0) for row in predictions]
        predictions = probas

    try:
        score = self.scoring_func(y, predictions)
    except ValueError:
        bad_val_indices = []
        for idx, val in enumerate(y):
            if str(val) in bad_vals_as_strings:
                bad_val_indices.append(idx)

        predictions = [val for idx, val in enumerate(predictions) if idx not in bad_val_indices]
        y = [val for idx, val in enumerate(y) if idx not in bad_val_indices]

        print('Found ' + str(len(bad_val_indices)) + ' null or infinity values in the y values. We will ignore these, and report the score on the rest of the dataset')
        try:
            score = self.scoring_func(y, predictions)
        except ValueError:
            # Sometimes, particularly for a badly fit model using either too little data, or a really bad set of hyperparameters during a grid search, we can predict probas that are > 1 or < 0. We'll cap those here, while warning the user about them, because they're unlikely to occur in a model that's properly trained with enough data and reasonable params
            predictions = self.clean_probas(predictions)
            score = self.scoring_func(y, predictions)

    if advanced_scoring:
        return (-1 * score, predictions)
    else:
        return -1 * score
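# --- Illustrative sketch, not part of the library above ---
# The brier_score_loss clamping step in isolation: probabilities are clipped
# into [0, 1] before scoring, since brier_score_loss rejects out-of-range
# values like the ones some LightGBM builds can emit.
from sklearn.metrics import brier_score_loss

predicted_probas = [[0.9, 1.02], [0.7, 0.3], [1.1, -0.05]]
clipped = [max(min(row[1], 1), 0) for row in predicted_probas]
# clipped == [1.0, 0.3, 0]
loss = brier_score_loss([1, 0, 0], clipped)  # 0.03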
def score(self, X_test, y_test, advanced_scoring=True, verbose=2):
    if isinstance(X_test, list):
        X_test = pd.DataFrame(X_test)
    y_test = list(y_test)

    X_test, y_test = utils.drop_missing_y_vals(X_test, y_test, self.output_column)

    if self._scorer is not None:
        if self.type_of_estimator == 'regressor':
            return self._scorer.score(self.trained_pipeline, X_test, y_test, self.took_log_of_y, advanced_scoring=advanced_scoring, verbose=verbose, name=self.name)

        elif self.type_of_estimator == 'classifier':
            # TODO: can probably refactor accuracy score now that we've turned scoring into its own class
            if self._scorer == accuracy_score:
                predictions = self.trained_pipeline.predict(X_test)
                return self._scorer.score(y_test, predictions)
            elif advanced_scoring:
                score, probas = self._scorer.score(self.trained_pipeline, X_test, y_test, advanced_scoring=advanced_scoring)
                utils_scoring.advanced_scoring_classifiers(probas, y_test, name=self.name)
                return score
            else:
                return self._scorer.score(self.trained_pipeline, X_test, y_test, advanced_scoring=advanced_scoring)
    else:
        return self.trained_pipeline.score(X_test, y_test)
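# --- Illustrative sketch, not part of the library above ---
# The entry-point conversion in score() above, in isolation: X_test may be
# passed as a list of row dictionaries and is turned into a DataFrame before
# scoring. The column names here are assumptions for illustration.
import pandas as pd

rows = [{'sqft': 1200, 'beds': 2}, {'sqft': 1800, 'beds': 3}]
X_test = pd.DataFrame(rows)
# X_test is a 2-row DataFrame with columns ['sqft', 'beds'], ready to hand to
# utils.drop_missing_y_vals and the trained pipeline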
def _prepare_for_training(self, X):
    # We accept input as either a DataFrame, or as a list of dictionaries. Internally, we use DataFrames. So if the user gave us a list, convert it to a DataFrame here.
    if isinstance(X, list):
        X_df = pd.DataFrame(X)
        del X
    else:
        X_df = X

    # To keep this as light in memory as possible, immediately remove any columns that the user has already told us should be ignored
    if len(self.cols_to_ignore) > 0:
        X_df = utils.safely_drop_columns(X_df, self.cols_to_ignore)

    # Having duplicate columns can really screw things up later. Remove them here, with user logging to tell them what we're doing
    X_df = utils.drop_duplicate_columns(X_df)

    # If we're writing training results to file, create the new empty file name here
    if self.write_gs_param_results_to_file:
        self.gs_param_file_name = 'most_recent_pipeline_grid_search_result.csv'
        try:
            os.remove(self.gs_param_file_name)
        except OSError:
            pass

    # Remove the output column from the dataset, and store it into the y variable
    y = list(X_df.pop(self.output_column))

    # Drop all rows that have an empty value for our output column
    # User logging so they can adjust if they pass in a bunch of bad values:
    X_df, y = utils.drop_missing_y_vals(X_df, y, self.output_column)

    # If this is a classifier, try to turn all the y values into proper ints
    # Some classifiers play more nicely if you give them category labels as ints rather than strings, so we'll make our jobs easier here if we can.
    if self.type_of_estimator == 'classifier':
        # The entire column must be turned into ints. If any value fails, don't convert anything in the column
        try:
            y_ints = []
            for val in y:
                y_ints.append(int(val))
            y = y_ints
        except (ValueError, TypeError):
            pass
    else:
        # If this is a regressor, turn all the values into floats if possible, and remove the row if they cannot be turned into floats
        indices_to_delete = []
        y_floats = []
        bad_vals = []
        for idx, val in enumerate(y):
            try:
                float_val = utils_data_cleaning.clean_val(val)
                y_floats.append(float_val)
            except ValueError:
                indices_to_delete.append(idx)
                bad_vals.append(val)

        y = y_floats

        # Even more verbose logging here since these values are not just missing, they're strings for a regression problem
        if len(indices_to_delete) > 0:
            print('The y values given included some bad values that the machine learning algorithms will not be able to train on.')
            print('The rows at these indices have been deleted because their y value could not be turned into a float:')
            print(indices_to_delete)
            print('These were the bad values')
            print(bad_vals)
            X_df = X_df.drop(X_df.index[indices_to_delete])

    return X_df, y
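# --- Illustrative sketch, not part of the library above ---
# The regressor branch of _prepare_for_training in miniature (plain float()
# stands in for utils_data_cleaning.clean_val): y values that cannot be parsed
# as floats are recorded by index, and the matching rows are dropped from X_df
# positionally via X_df.index[...].
import pandas as pd

X_df = pd.DataFrame({'feature': [10, 20, 30]})
y = ['1.5', 'not_a_number', '3.0']

indices_to_delete, y_floats = [], []
for idx, val in enumerate(y):
    try:
        y_floats.append(float(val))
    except ValueError:
        indices_to_delete.append(idx)

X_df = X_df.drop(X_df.index[indices_to_delete])
# X_df keeps rows 0 and 2; y_floats == [1.5, 3.0]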