def get_data_summary(self) -> pd.DataFrame:
    """Get summary statistics and identified intent for the training dataset.

    Returns:
        a DataFrame in which each column represents the summary of a column

    """
    if not self.has_fitted:
        logging.info("The foreshadow object is not trained yet.")
        return None

    X_summary = self.X_preparer.cache_manager[AcceptedKey.SUMMARY]
    y_summary = self.y_preparer.cache_manager[AcceptedKey.SUMMARY]
    X_summary[y_summary.columns[0]] = y_summary
    return X_summary
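# Usage sketch (illustrative only, not part of the source). Assuming a
# Foreshadow instance ``fs`` constructed as elsewhere in this codebase and
# already fit on training data, the summary is an ordinary DataFrame keyed by
# column name; the column "age" below is hypothetical.
#
#     fs.fit(X_train, y_train)
#     summary = fs.get_data_summary()
#     print(summary["age"])  # per-column statistics plus the identified intent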
def execute_model(fs, X_train, y_train, X_test, y_test):
    """Execute the model produced by `generate_model()`.

    Fits the Foreshadow object on the training data, scores it on the test
    data, logs simple model accuracy metrics, and pickles the fitted pipeline
    to disk. (JSON serialization of the results is temporarily disabled; see
    the TODO below.)

    Args:
        fs (foreshadow.Foreshadow): An unfit foreshadow object.
        X_train (:obj:`DataFrame <pandas.DataFrame>`): The X train data.
        X_test (:obj:`DataFrame <pandas.DataFrame>`): The X test data.
        y_train (:obj:`DataFrame <pandas.DataFrame>`): The y train data.
        y_test (:obj:`DataFrame <pandas.DataFrame>`): The y test data.

    """
    logging.info("Fitting final model...")
    fs.fit(X_train, y_train)

    logging.info("Scoring final model...")
    score = fs.score(X_test, y_test)

    logging.info("Final Results: ")
    logging.info(score)

    # TODO Temporarily turn off the serialization as this requires additional
    #  changes and we may not need it at all.
    # json_file_location = "foreshadow.json"
    # fs.to_json(json_file_location)

    pickled_fitted_pipeline_location = "foreshadow_fitted_pipeline.p"
    fs.pickle_fitted_pipeline(pickled_fitted_pipeline_location)

    # logging.info(
    #     "Serialized foreshadow pipeline has been saved to {} "
    #     "and {}. Refer to docs to read and process.".format(
    #         json_file_location, pickled_fitted_pipeline_location
    #     )
    # )
    logging.info(
        "Serialized foreshadow pipeline has been saved to {}. Refer to docs "
        "to read and process.".format(pickled_fitted_pipeline_location)
    )
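# Downstream sketch (illustrative, not part of the source): the pickled
# pipeline written above is a standard pickle file, so it can presumably be
# reloaded with the stdlib pickle module and used for prediction. The variable
# names and the ``predict`` call below are assumptions.
#
#     import pickle
#
#     with open("foreshadow_fitted_pipeline.p", "rb") as f:
#         pipeline = pickle.load(f)
#     predictions = pipeline.predict(X_test)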
def resolve(self, X, *args, **kwargs):
    """Pick the appropriate transformer if necessary.

    Note:
        Column info sharer data is set based on the chosen transformer.

    Args:
        X: input observations
        *args: args to pass to resolve
        **kwargs: params to resolve

    """
    # Override the SmartTransformer resolve method to allow the setting of
    # column info sharer data when resolving.
    super().resolve(X, *args, **kwargs)
    column_name = self.column
    self.column_intent = self.transformer.__class__.__name__
    logging.info(
        "Column {} has intent type: {}".format(column_name, self.column_intent)
    )
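# Example of the resulting log line (illustrative; the column name and intent
# class name below are hypothetical and depend on which transformer the
# SmartTransformer selects):
#
#     Column age has intent type: Numeric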
def _has_column_in_cache_manager(self, column: str) -> bool:
    """Check if the column exists in the cache manager.

    If the foreshadow object has not been trained yet, this returns False.

    Args:
        column: the column name

    Returns:
        Whether the column exists in the cache manager

    """
    if not self.has_fitted:
        logging.info(
            "The foreshadow object is not trained yet. Please make sure "
            "the column {} exists to ensure the override takes "
            "effect.".format(column)
        )
        return False

    cache_manager = self.X_preparer.cache_manager
    return column in cache_manager[AcceptedKey.INTENT]
def get_intent(self, column_name: str) -> Union[str, None]:
    """Retrieve the intent of a column.

    Args:
        column_name: the column name

    Returns:
        str: the intent of the column, or None if it cannot be found

    """
    # Note: this retrieves the intent from the cache_manager. Only columns
    # that have been processed will be visible.
    cache_manager = self.X_preparer.cache_manager
    if self._has_column_in_cache_manager(column_name):
        return cache_manager[AcceptedKey.INTENT][column_name]
    else:
        logging.info(
            "No intent exists for column {}. Either the column doesn't "
            "exist or the foreshadow object has not been fitted "
            "yet.".format(column_name)
        )
        return None
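# Usage sketch (illustrative only): after fitting, the recorded intent can be
# queried per column; before fitting, None is returned. The column names and
# the intent value shown are hypothetical.
#
#     fs.fit(X_train, y_train)
#     fs.get_intent("city")      # e.g. "Categorical"
#     fs.get_intent("missing")   # logs an info message and returns None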
def _check_empty_columns(self, original_columns: List) -> List:
    """Identify the columns that the cleaner step marked to be dropped.

    Args:
        original_columns: the column names before cleaning

    Returns:
        a list of the column names that will be dropped

    Raises:
        ValueError: if every column would be dropped

    """
    empty_columns = []
    for cleaner_tuple in self.feature_processor.transformers_:
        _, cleaner, column_name = cleaner_tuple
        if isinstance(cleaner.transformer, DropCleaner):
            empty_columns.append(column_name)

    if len(empty_columns) == len(original_columns):
        error_message = (
            "All columns are dropped since they all have "
            "over 90% of missing values. Aborting foreshadow."
        )
        logging.error(error_message)
        raise ValueError(error_message)
    elif len(empty_columns) > 0:
        logging.info(
            "Identified columns with over 90% missing values: {} and "
            "they will be dropped.".format(",".join(empty_columns))
        )

    return empty_columns
def transform(self, X):
    """Execute the fancyimpute transformer on X data.

    Args:
        X (:obj:`pandas.DataFrame`): Input data

    Returns:
        :obj:`pandas.DataFrame`: Output data

    """
    if X.isnull().values.any():
        # This guard is a temporary fix: newer versions of the fancyimpute
        # package no longer throw an exception when there are no missing
        # values, but the pinned requirements keep us on an older version,
        # so we use this workaround until all the associated dependencies
        # can be upgraded.
        return self.imputer.complete(X)
    else:
        logging.info(
            "No missing value found in column {}".format(X.columns[0])
        )
        return X
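# Behavior sketch (illustrative only; ``imputer_step`` stands in for an
# instance of this transformer and is a hypothetical name): a single-column
# frame containing NaNs is passed to fancyimpute's ``complete``; a frame
# without NaNs is returned unchanged and an info message is logged.
#
#     frame = pd.DataFrame({"score": [1.0, None, 3.0]})
#     imputer_step.transform(frame)           # missing value is imputed
#     imputer_step.transform(frame.dropna())  # returned as-is, info logged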
def generate_model(args):  # noqa: C901
    """Process command line args and generate a Foreshadow model to fit.

    Args:
        args (list): A list of string arguments to process

    Returns:
        tuple: A tuple of `fs, X_train, y_train, X_test, y_test` which \
            represents the foreshadow model along with the split data.

    Raises:
        ValueError: if the data file cannot be read or the target column is
            invalid.

    """
    cargs = process_argument(args)

    if cargs.level == 3 and cargs.method is not None:
        warnings.warn(
            "WARNING: Level 3 model search enabled. Method will be ignored."
        )

    if cargs.level != 3 and cargs.time != 10:
        warnings.warn(
            "WARNING: Time parameter is not applicable to feature "
            "engineering. It only takes effect in level 3."
        )

    try:
        df = pd.read_csv(cargs.data)
    except Exception:
        raise ValueError(
            "Failed to load file. Please verify it exists and is a valid CSV."
        )

    try:
        X_df = df.drop(columns=cargs.target)
        y_df = df[[cargs.target]]
    except Exception:
        raise ValueError("Invalid target variable")

    X_train, X_test, y_train, y_test = train_test_split(
        X_df, y_df, test_size=0.2
    )

    if cargs.level == 1:
        # Default everything with a basic estimator
        fs = Foreshadow(
            problem_type=cargs.problem_type,
            estimator=get_method(
                cargs.method, y_train, cargs.family, cargs.problem_type
            ),
        )
    # elif cargs.level == 2:
    #     # Parameter search on all matched intents
    #
    #     if cargs.x_config is not None:
    #         try:
    #             with open(cargs.x_config, "r") as f:
    #                 X_search = Preprocessor(from_json=json.load(f))
    #         except Exception:
    #             raise ValueError(
    #                 "Could not read X config file {}".format(cargs.x_config)
    #             )
    #         print("Reading config for X Preprocessor")
    #     else:
    #         X_search = search_intents(X_train)
    #         print("Searching over valid intent space for X data")
    #
    #     if cargs.y_config is not None:
    #         try:
    #             with open(cargs.y_config, "r") as f:
    #                 y_search = Preprocessor(from_json=json.load(f))
    #         except Exception:
    #             raise ValueError(
    #                 "Could not read y config file {}".format(cargs.y_config)
    #             )
    #         print("Reading config for y Preprocessor")
    #     else:
    #         y_search = search_intents(y_train, y_var=True)
    #         print("Searching over valid intent space for y data")
    #
    #     # If level 3 also do model parameter search with AutoEstimator
    #     # Input time limit into Foreshadow to be passed into AutoEstimator
    #
    #     fs = Foreshadow(
    #         X_preparer=X_search,
    #         y_preparer=y_search,
    #         estimator=get_method(cargs.method, y_train),
    #         optimizer=GridSearchCV,
    #     )
    elif cargs.level == 3:
        # Default intent and advanced model search using a 3rd party AutoML
        # estimator.
        estimator = AutoEstimator(problem_type=cargs.problem_type, auto="tpot")
        estimator.construct_estimator(y_train)
        # TODO move this into the configure_estimator method. "max_time_mins"
        #  is an argument of the TPOT library; we cannot assign it based on
        #  the problem type here, so for testing purposes it is hardcoded for
        #  TPOT.
        # kwargs = (
        #     "max_time_mins"
        #     if estimator.problem_type == ProblemType.REGRESSION
        #     else "time_left_for_this_task"
        # )
        kwargs = "max_time_mins"
        estimator.estimator_kwargs = {
            kwargs: cargs.time,
            **estimator.estimator_kwargs,
        }
        fs = Foreshadow(problem_type=cargs.problem_type, estimator=estimator)
    else:
        raise ValueError("Invalid level. Only levels 1 and 3 are supported.")

    if cargs.multiprocess:
        fs.configure_multiprocessing(-1)
        logging.info("multiprocessing enabled.")

    return fs, X_train, y_train, X_test, y_test
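# End-to-end sketch (illustrative only): how the console helpers above are
# typically chained. The argument list mirrors what process_argument() parses;
# the exact flag names and the CSV/target values are assumptions.
#
#     fs, X_train, y_train, X_test, y_test = generate_model(
#         ["data.csv", "target_column", "--level", "1"]
#     )
#     execute_model(fs, X_train, y_train, X_test, y_test)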
def transform(self, X, y=None):
    """Clean string columns.

    Here, we assume that any list output means that these are desired to be
    new columns in our dataset. Contractually, this could change so that a
    boolean flag is passed to indicate when this is desired; as of right now,
    there should be no need to return a list for any case other than this
    case of desiring new columns. The same is assumed for dicts, where the
    key is the new column name and the value is the value for that row in
    that column. NaNs are automatically put into the columns that don't exist
    for given rows.

    Args:
        X (:obj:`pandas.Series`): X data
        y: input labels

    Returns:
        :obj:`pandas.DataFrame`: Transformed data

    Raises:
        InvalidDataFrame: If unexpected output is returned that was not
            handled correctly. This happens if the output specified by the
            child does not match what is actually returned. The child should
            ensure its implementation is consistent.

    """
    X = check_df(X, single_column=True)

    # Access the single column as a series and apply the cleaning
    # transformation to each row in the series.
    logging.info("Starting cleaning rows...")
    out = X[X.columns[0]].apply(self.transform_row, return_tuple=False)
    logging.info("Ending cleaning rows...")

    if any(
        [
            isinstance(out.iloc[i], (list, tuple))
            for i in range(out.shape[0])
        ]
    ):  # out values are lists == new columns
        if not all(
            [
                len(out.iloc[0]) == len(out.iloc[i])
                for i in range(out.shape[0])
            ]
        ):
            raise InvalidDataFrame(
                "length of lists: {}, returned not of same value.".format(
                    [out.iloc[i] for i in range(out.shape[0])]
                )
            )
        columns = self.output_columns
        if columns is None:
            # By default, pandas would have given a unique integer to each
            # column; instead, we keep the previous column name and append
            # that integer.
            columns = [
                X.columns[0] + str(c) for c in range(len(out.iloc[0]))
            ]
        # We need to set the index. Otherwise, the new data frame might
        # misalign with other columns.
        X = pd.DataFrame([*out.values], index=out.index, columns=columns)
    elif any(
        [isinstance(out.iloc[i], dict) for i in range(out.shape[0])]
    ):  # out values are dicts == named new columns
        all_keys = dict()
        for row in out:
            all_keys.update({key: True for key in row})  # get all columns
        columns = list(all_keys.keys())
        # By default, this creates a DataFrame where, if a row contains the
        # value, it is added; if not, NaN is added.
        out = pd.DataFrame([*out.values], columns=columns)
        out.columns = [X.columns[0] + "_" + c for c in columns]
        X = out
    else:  # no lists; still a single-column output
        X[X.columns[0]] = out

    return X
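# Behavior sketch (illustrative only; the cleaner instance and its
# transform_row behavior are hypothetical). A row function returning a list
# expands the column "name" into "name0", "name1", ...; one returning a dict
# expands it into "name_<key>" columns, with NaN where a key is missing for a
# given row.
#
#     X = pd.DataFrame({"name": ["john smith", "jane doe"]})
#     # list output:  columns "name0" = ["john", "jane"], "name1" = ["smith", "doe"]
#     # dict output:  columns "name_first" and "name_last"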
def _export_data(self, X, is_train=True):
    """Export a processed data frame to a CSV file.

    Args:
        X: the data frame to export
        is_train: whether the data is the training set

    """
    data_path = self._determine_export_path(is_train)
    X.to_csv(data_path, index=False)
    logging.info("Exported processed data to {}".format(data_path))