def test_fmt_sklearn_preds_classification(self):
    """test fmt_sklearn_preds on classification case"""
    modelobj_class = RandomForestClassifier()
    model_df = self.df.loc[:, self.df.columns != 'target']
    modelobj_class.fit(model_df, self.df_class.loc[:, 'target'])

    fmtd_outputs = fmt_model_outputs.fmt_sklearn_preds(getattr(modelobj_class, 'predict_proba'),
                                                       modelobj_class,
                                                       model_df,
                                                       self.df_class,
                                                       'target',
                                                       'classification')

    self.assertIn('predictedYSmooth', fmtd_outputs,
                  """fmt_sklearn_preds on classification case does not return predictions""")
def test_fmt_sklearn_preds_regression(self):
    """test fmt_sklearn_preds on regression case"""
    modelobj_regr = RandomForestRegressor()
    model_df = self.df.loc[:, self.df.columns != 'target']
    modelobj_regr.fit(model_df, self.df.loc[:, 'target'])

    fmtd_outputs = fmt_model_outputs.fmt_sklearn_preds(getattr(modelobj_regr, 'predict'),
                                                       modelobj_regr,
                                                       model_df,
                                                       self.df,
                                                       'target',
                                                       'regression')

    self.assertIn('predictedYSmooth', fmtd_outputs.columns.values,
                  """fmt_sklearn_preds on regression case does not return predictions""")
def run(self, output_type='html', output_path=''):
    """
    main run engine. Iterate over columns specified in keepfeaturelist,
    and perform analysis

    :param output_type: str output type:
            html - save html to output_path
            raw_data - return raw analysis dataframe
            agg_data - return aggregate analysis dataframe
    :param output_path: fpath to save output
    :return: pd.DataFrame or saved html output
    :rtype: pd.DataFrame or .html
    """
    # ensure supported output types
    if output_type not in md_utils.Settings.supported_out_types:
        error_out = """Output type {} not supported.
                        \nCurrently support {} output""".format(output_type,
                                                                md_utils.Settings.supported_out_types)
        logger.error(error_out)
        raise ValueError(error_out)

    # run the prediction function first to assign the errors to the dataframe
    self._cat_df = fmt_sklearn_preds(self.predict_engine,
                                     self.modelobj,
                                     self._model_df,
                                     self._cat_df,
                                     self.ydepend,
                                     self.model_type)

    # create placeholder for outputs
    placeholder = []
    # create placeholder for all insights
    insights_list = []

    logging.info("""Running main program. Iterating over columns """
                 """and applying functions dependent on datatype""")

    not_in_cols = ['errors', 'predictedYSmooth', self.ydepend]
    # filter columns to iterate through
    to_iter_cols = self._cat_df.columns[~self._cat_df.columns.isin(not_in_cols)]

    for idx, col in enumerate(to_iter_cols):

        # column placeholder
        colhold = []

        for groupby_var in self.groupbyvars:
            # if current column is not the groupby variable,
            # run the standard variable check
            if col != groupby_var:
                json_out = self._var_check(col=col,
                                           groupby_var=groupby_var)
                # append to placeholder
                colhold.append(json_out)
            else:
                logging.info("""Creating accuracy metric for groupby variable: {}""".format(groupby_var))
                # create error metrics for slices of groupby data
                acc = md_utils.create_accuracy(self.model_type,
                                               self._cat_df,
                                               self.error_type,
                                               groupby=groupby_var)
                # append to insights dataframe placeholder
                insights_list.append(acc)

            logger.info("""Run processed - Col: {} - groupby_var: {}""".format(col, groupby_var))

        # map all of the same column's errors to the first element and
        # append to placeholder
        # don't append if placeholder is empty due to col being the same as groupby
        if len(colhold) > 0:
            placeholder.append(formatting.FmtJson.flatten_json(colhold))

        # TODO redirect stdout so progress bar can output to single line
        md_utils.sysprint('Percent Complete: {per:2.0f}%'.format(
            per=(float(idx) / float(len(to_iter_cols))) * 100))

    md_utils.sysprint('Percent Complete: 100%')
    logging.info('Converting accuracy outputs to json format')

    # finally convert insights_list into a dataframe and then a json object
    insights_df = pd.concat(insights_list)
    insights_json = formatting.FmtJson.to_json(insights_df.round(self.round_num),
                                               html_type='accuracy',
                                               vartype='Accuracy',
                                               err_type=self.error_type,
                                               ydepend=self.ydepend,
                                               mod_type=self.model_type)
    # append to outputs
    placeholder.append(insights_json)
    # append percentiles
    placeholder.append(self.Percentiles.percentiles)
    # append groupby percentiles
    placeholder.append(self.Percentiles.group_percentiles_out)
    # assign placeholder final outputs to class instance
    self.outputs = placeholder

    # save outputs if specified
    if output_type == 'html':
        self._save(fpath=output_path)
    elif output_type == 'raw_data':
        return self.get_raw_df()
    elif output_type == 'agg_data':
        return self.get_agg_df()
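# A minimal, hypothetical usage sketch for the run method above. The class
# name ErrorViz, its import path, and its constructor keywords are assumptions
# for illustration only; the run(output_type=..., output_path=...) interface
# comes from the docstring above.
#
#     import pandas as pd
#     from sklearn.ensemble import RandomForestRegressor
#     from mdesc.eval import ErrorViz                 # assumed import path
#
#     clf = RandomForestRegressor()
#     clf.fit(model_df, df.loc[:, 'target'])          # model_df / df built elsewhere
#
#     viz = ErrorViz(modelobj=clf,                    # assumed constructor keywords
#                    model_df=model_df,
#                    ydepend='target',
#                    cat_df=df,
#                    groupbyvars=['some_category'])
#
#     viz.run(output_type='html', output_path='error_viz.html')   # save html report
#     raw = viz.run(output_type='raw_data')                       # returns pd.DataFrame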
def run(self, output_type='html', progbar=False, **kwargs):
    """
    main run engine. Iterate over columns specified in keepfeaturelist,
    and perform analysis

    :param output_type: str output type:
            html - save html to output_path
            raw_data - return raw analysis dataframe
            agg_data - return aggregate analysis dataframe
    :param output_path: fpath to save output
    :param progbar: bool - output progress bar
    :return: pd.DataFrame or saved html output
    :rtype: pd.DataFrame or .html
    """
    self._validate_params()

    # run the prediction function first to assign the errors to the dataframe
    self._cat_df = fmt_sklearn_preds(self.predict_engine,
                                     self._modelobj,
                                     self._model_df,
                                     self._cat_df,
                                     self.ydepend,
                                     self.model_type)
    # placeholders
    placeholder = {'res': [], 'insights': []}

    logger.info("""Running main program. Iterating over columns """
                """and applying functions dependent on datatype""")

    not_in_cols = ['errors', 'predictedYSmooth', self.ydepend]
    # filter columns to iterate through
    to_iter_cols = self._cat_df.columns[~self._cat_df.columns.isin(not_in_cols)]
    # create combinations of groupby and columns
    all_iter = list(itertools.product(to_iter_cols, self.groupbyvars))

    if progbar:
        pbar = md_utils.progress_bar()
        progress_bar = pbar(total=len(all_iter))

    for (col, groupby_var) in all_iter:

        col_indices = [col, 'errors', 'predictedYSmooth', groupby_var]

        key, value = self._base_runner(self._cat_df.loc[:, col_indices],
                                       col,
                                       groupby_var)

        placeholder[key].append(value)

        logger.info("""Run processed - Col: {} - groupby_var: {}""".format(col, groupby_var))

        if progbar:
            progress_bar.update(1)

    # convert placeholders to final output
    self._plc_hldr_out(placeholder['insights'],
                       placeholder['res'],
                       html_type='error')

    # save outputs if specified
    outputs = self._save(output_type=output_type,
                         fpath=kwargs.get('output_path', None))

    # underlying data selected, return it
    if isinstance(outputs, pd.DataFrame):
        return outputs
def run(self, output_type='html', progbar=False, **kwargs):
    """
    main run engine. Iterate over columns specified in keepfeaturelist,
    and perform analysis

    :param output_type: str output type:
            html - save html to output_path
            raw_data - return raw analysis dataframe
            agg_data - return aggregate analysis dataframe
    :param output_path: fpath to save output
    :param progbar: bool - output progress bar messages
    :return: pd.DataFrame or saved html output
    :rtype: pd.DataFrame or .html
    """
    # if output_type is a data format, force output_df to True and warn
    # (warnings is the standard-library module and must be imported at module level)
    if output_type in ['raw_data', 'agg_data'] and kwargs.get('output_df') is False:
        kwargs['output_df'] = True
        warnings.warn("""output_df must be set to True when returning a dataframe. Forcing to True""")

    self._validate_params()

    # run the prediction function first to assign the errors to the dataframe
    self._cat_df = fmt_sklearn_preds(self.predict_engine,
                                     self._modelobj,
                                     self._model_df,
                                     self._cat_df,
                                     self.ydepend,
                                     self.model_type)
    # placeholders
    placeholder = {'res': [], 'insights': []}

    logger.info("""Running main program. Iterating over columns """
                """and applying functions dependent on datatype""")

    # filter cols to iterate over
    to_iter = [val for val in self._keepfeaturelist if val != self.ydepend]
    # create col, groupby combos
    all_iter = list(itertools.product(to_iter, self.groupbyvars))

    # create container with synthetic prediction difference, row mask,
    # and incremental val
    preds_container = self._preds_container(to_iter)

    # set up pbar
    if progbar:
        pbar = md_utils.progress_bar()
        progress_bar = pbar(total=len(all_iter))

    for idx, (col, groupby_var) in enumerate(all_iter, 1):

        col_indices = [col, 'errors', 'predictedYSmooth', groupby_var, 'diff']
        # pull incremental val, diff, and mask from container
        incremental_val, diff, mask = preds_container[col]
        # update differences
        self._cat_df['diff'] = diff

        key, value = self._base_runner(self._cat_df.loc[mask, col_indices],
                                       col,
                                       groupby_var,
                                       **kwargs)

        # assign incremental value for output formatting
        if key == 'res':
            value['incremental_val'] = incremental_val

        placeholder[key].append(value)

        # logger.info("""Run processed - Col: {} - groupby_var: {}""".format(col, groupby_var))

        if progbar:
            progress_bar.update(1)

    # convert placeholders to final output
    self._plc_hldr_out(placeholder['insights'],
                       placeholder['res'],
                       html_type='sensitivity')

    # save outputs if specified
    self._save(output_type=output_type,
               fpath=kwargs.get('output_path', None))
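# A hypothetical call pattern for the sensitivity run method above. The
# instance name sviz and the file name are placeholders; only the keyword
# names (output_type, progbar, output_path, output_df) come from the method
# body and docstring.
#
#     sviz.run(output_type='html',
#              progbar=True,
#              output_path='sensitivity_viz.html')
#
#     # requesting a data output; output_df is forced to True if explicitly
#     # passed as False alongside a data output_type
#     sviz.run(output_type='raw_data', output_df=True)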