Example #1
    def transform(self, X, y=None):

        # Single rows can be passed in as plain dictionaries; scale each known column in place
        if isinstance(X, dict):
            for col, col_dict in self.column_ranges.items():
                if col in X:
                    X[col] = scale_val(
                        val=X[col],
                        min_val=col_dict['min_val'],
                        total_range=col_dict['inner_range'],
                        truncate_large_values=self.truncate_large_values)
        else:

            # DataFrame input: drop any columns the user told us to ignore before scaling
            if len(self.cols_to_ignore) > 0:
                X = utils.safely_drop_columns(X, self.cols_to_ignore)

            # Min-max scale every column we saw during fit that is still present in X
            for col, col_dict in self.column_ranges.items():
                if col in X.columns:
                    min_val = col_dict['min_val']
                    inner_range = col_dict['inner_range']
                    X[col] = X[col].apply(
                        lambda x: scale_val(x, min_val, inner_range, self.truncate_large_values))

        return X
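The scale_val helper is not shown above. Here is a minimal sketch of the kind of min-max scaling it presumably performs; the clipping bounds used when truncate_large_values is set are illustrative assumptions, not the library's actual values:

def scale_val(val, min_val, total_range, truncate_large_values=False):
    # Illustrative only: map val into roughly [0, 1] using the range observed at fit time
    if total_range == 0:
        return 0.0
    scaled = (val - min_val) / float(total_range)
    if truncate_large_values:
        # Hypothetical clipping of values far outside the training range
        scaled = max(-1.0, min(scaled, 2.0))
    return scaled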
Example #2
    def train(self,
              raw_training_data,
              user_input_func=None,
              optimize_entire_pipeline=False,
              optimize_final_model=None,
              write_gs_param_results_to_file=True,
              perform_feature_selection=None,
              verbose=True,
              X_test=None,
              y_test=None,
              print_training_summary_to_viewer=True,
              ml_for_analytics=True,
              only_analytics=False,
              compute_power=3,
              take_log_of_y=None,
              model_names=None,
              perform_feature_scaling=True,
              ensembler=None):

        self.user_input_func = user_input_func
        self.optimize_final_model = optimize_final_model
        self.optimize_entire_pipeline = optimize_entire_pipeline
        self.write_gs_param_results_to_file = write_gs_param_results_to_file
        self.compute_power = compute_power
        self.ml_for_analytics = ml_for_analytics
        self.only_analytics = only_analytics
        self.X_test = X_test
        self.y_test = y_test
        self.print_training_summary_to_viewer = print_training_summary_to_viewer
        if self.type_of_estimator == 'regressor':
            self.take_log_of_y = take_log_of_y
        self.model_names = model_names
        self.perform_feature_scaling = perform_feature_scaling
        self.ensembler = ensembler

        if verbose:
            print(
                'Welcome to auto_ml! We\'re about to go through and make sense of your data using machine learning'
            )

        # We accept input as either a DataFrame, or as a list of dictionaries. Internally, we use DataFrames. So if the user gave us a list, convert it to a DataFrame here.
        if isinstance(raw_training_data, list):
            X_df = pd.DataFrame(raw_training_data)
            del raw_training_data
        else:
            X_df = raw_training_data

        # Automatically enable feature selection for wide datasets (50 or more columns),
        # unless the caller has already requested it explicitly
        if len(X_df.columns) < 50 and perform_feature_selection is not True:
            perform_feature_selection = False
        else:
            perform_feature_selection = True

        self.perform_feature_selection = perform_feature_selection

        # To keep this as light in memory as possible, immediately remove any columns that the user has already told us should be ignored
        if len(self.cols_to_ignore) > 0:
            X_df = utils.safely_drop_columns(X_df, self.cols_to_ignore)

        X_df, y = self._prepare_for_training(X_df)
        self.X_df = X_df
        self.y = y

        if self.take_log_of_y:
            y = [math.log(val) for val in y]
            self.took_log_of_y = True

        if verbose:
            print(
                'Successfully performed basic preparations and y-value cleaning'
            )

        if model_names is not None:
            estimator_names = model_names
        else:
            estimator_names = self._get_estimator_names()

        if self.type_of_estimator == 'classifier':
            if len(set(y)) > 2:
                scoring = accuracy_score
            else:
                scoring = utils_scoring.brier_score_loss_wrapper
            self._scorer = scoring
        else:
            scoring = utils_scoring.rmse_scoring
            self._scorer = scoring

        if verbose:
            print('Created estimator_names and scoring')

        self.perform_grid_search_by_model_names(estimator_names, scoring, X_df, y)

        # If we ran GridSearchCV, we will have to pick the best model
        # If we did not, the best trained pipeline will already be saved in self.trained_pipeline
        if self.fit_grid_search and len(self.grid_search_pipelines) > 1:
            # Once we have trained all the pipelines, select the best one based on its performance on (top priority first):
            # 1. Holdout data
            # 2. CV data

            # First, sort the tuples that hold our score(s) in their first position(s) and the trained pipeline in their final position.
            # Since a more positive score is better, sort with reverse=True so the highest score comes first.
            sorted_gs_pipeline_results = sorted(self.grid_search_pipelines,
                                                key=lambda x: x[0],
                                                reverse=True)

            # Next, grab the item at position 0 in our sorted list, which is itself a list of the score(s) plus the pipeline itself
            best_result_list = sorted_gs_pipeline_results[0]
            # Our best grid search result is the thing at the end of that list.
            best_trained_gs = best_result_list[-1]
            # And the pipeline is the best estimator within that grid search object.
            self.trained_pipeline = best_trained_gs.best_estimator_

        # Delete values that we no longer need that are just taking up space.
        del self.X_test
        del self.y_test
        del self.grid_search_pipelines
        del X_df
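A minimal usage sketch for this train method, assuming it belongs to auto_ml's Predictor class and that the constructor takes a column_descriptions dict marking the target column as 'output' (the dataset and column names below are purely illustrative):

import pandas as pd
from auto_ml import Predictor

# Toy dataset; 'price' is the column we want to predict
df_train = pd.DataFrame({
    'sqft': [850, 1200, 1500, 2000],
    'bedrooms': [2, 3, 3, 4],
    'price': [150000, 200000, 230000, 310000]
})

# Assumption: column_descriptions tells the Predictor which column is the target
column_descriptions = {'price': 'output'}

ml_predictor = Predictor(type_of_estimator='regressor',
                         column_descriptions=column_descriptions)

# Train using the defaults from the signature shown above
ml_predictor.train(df_train, verbose=True, perform_feature_scaling=True)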
Example #3
    def _prepare_for_training(self, X):

        # We accept input as either a DataFrame, or as a list of dictionaries. Internally, we use DataFrames. So if the user gave us a list, convert it to a DataFrame here.
        if isinstance(X, list):
            X_df = pd.DataFrame(X)
            del X
        else:
            X_df = X

        # To keep this as light in memory as possible, immediately remove any columns that the user has already told us should be ignored
        if len(self.cols_to_ignore) > 0:
            X_df = utils.safely_drop_columns(X_df, self.cols_to_ignore)

        # Having duplicate columns can really screw things up later. Remove them here, with user logging to tell them what we're doing
        X_df = utils.drop_duplicate_columns(X_df)

        # If we're writing training results to file, create the new empty file name here
        if self.write_gs_param_results_to_file:
            self.gs_param_file_name = 'most_recent_pipeline_grid_search_result.csv'
            try:
                os.remove(self.gs_param_file_name)
            except OSError:
                # The file may not exist yet; that's fine
                pass

        # bad_rows = X_df[pd.isnull(X_df[self.output_column])]
        # if bad_rows.shape[0] > 0:
        #     print('We encountered a number of missing values for this output column')
        #     print('Specifically, here is the output column:')
        #     print(self.output_column)
        #     print('And here is the number of missing (nan, None, etc.) values for this column:')
        #     print(bad_rows.shape[0])
        #     print('We will remove these values, and continue with training on the cleaned dataset')
        # X_df = X_df.dropna(subset=[self.output_column])

        # Remove the output column from the dataset, and store it in the y variable
        y = list(X_df.pop(self.output_column))

        # Drop all rows that have an empty value for our output column
        # User logging so they can adjust if they pass in a bunch of bad values:
        X_df, y = utils.drop_missing_y_vals(X_df, y, self.output_column)

        # If this is a classifier, try to turn all the y values into proper ints
        # Some classifiers play more nicely if you give them category labels as ints rather than strings, so we'll make our jobs easier here if we can.
        if self.type_of_estimator == 'classifier':
            # The entire column must convert cleanly to ints. If any single value fails, leave the whole column unconverted.
            try:
                y_ints = []
                for val in y:
                    y_ints.append(int(val))
                y = y_ints
            except (ValueError, TypeError):
                pass
        else:
            # If this is a regressor, turn all the values into floats if possible, and remove this row if they cannot be turned into floats
            indices_to_delete = []
            y_floats = []
            bad_vals = []
            for idx, val in enumerate(y):
                try:
                    float_val = utils_data_cleaning.clean_val(val)
                    y_floats.append(float_val)
                except ValueError:
                    indices_to_delete.append(idx)
                    bad_vals.append(val)

            y = y_floats

            # Even more verbose logging here since these values are not just missing, they're strings for a regression problem
            if len(indices_to_delete) > 0:
                print(
                    'The y values given included some bad values that the machine learning algorithms will not be able to train on.'
                )
                print(
                    'The rows at these indices have been deleted because their y value could not be turned into a float:'
                )
                print(indices_to_delete)
                print('These were the bad values')
                print(bad_vals)
                X_df = X_df.drop(X_df.index[indices_to_delete])

        return X_df, y
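As a note on the row-dropping fix at the end: DataFrame.index is subscripted with square brackets, not called. A small standalone illustration of the pattern on toy data (not part of the library):

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3, 4]}, index=['w', 'x', 'y', 'z'])
bad_positions = [1, 3]  # positional indices of rows whose y value could not be parsed

# df.index[bad_positions] maps positions to index labels ('x' and 'z' here),
# which is what DataFrame.drop expects
df_clean = df.drop(df.index[bad_positions])
print(df_clean)
#    a
# w  1
# y  3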