Example #1
    def test_fmt_sklearn_preds_classification(self):
        """test fmt_sklearn_preds on classification case"""

        modelobj_class = RandomForestClassifier()

        model_df = self.df_class.loc[:, self.df_class.columns != 'target']

        modelobj_class.fit(model_df,
                           self.df_class.loc[:, 'target'])

        fmtd_outputs = fmt_model_outputs.fmt_sklearn_preds(getattr(modelobj_class, 'predict_proba'),
                                                           modelobj_class,
                                                           model_df,
                                                           self.df_class,
                                                           'target',
                                                           'classification')

        self.assertIn('predictedYSmooth',
                      fmtd_outputs,
                      """fmt_sklearn_preds on classificaiton case does not return predictions""")
Example #2
    def test_fmt_sklearn_preds_regression(self):
        """test fmt_sklearn_preds on regression case"""

        modelobj_regr = RandomForestRegressor()

        model_df = self.df.loc[:, self.df.columns != 'target']

        modelobj_regr.fit(model_df,
                          self.df.loc[:, 'target'])

        fmtd_outputs = fmt_model_outputs.fmt_sklearn_preds(getattr(modelobj_regr, 'predict'),
                                                           modelobj_regr,
                                                           model_df,
                                                           self.df,
                                                           'target',
                                                           'regression')

        self.assertIn('predictedYSmooth',
                      fmtd_outputs.columns.values,
                      """fmt_sklearn_preds on regression case does not return predictions""")
Example #3
    def run(self, output_type='html', output_path=''):
        """
        main run engine. Iterate over columns specified in keepfeaturelist,
        and perform analysis
        :param output_type: str output type:
                html - save html to output_path
                raw_data - return raw analysis dataframe
                agg_data - return aggregate analysis dataframe
        :param output_path: str fpath to save output
        :return: pd.DataFrame or saved html output
        :rtype: pd.DataFrame or .html
        """
        # ensure supported output types
        if output_type not in md_utils.Settings.supported_out_types:
            error_out = """Output type {} not supported.
                                \nCurrently support {} output""".format(
                output_type, md_utils.Settings.supported_out_types)

            logger.error(error_out)

            raise ValueError(error_out)

        # run the prediction function first to assign the errors to the dataframe
        self._cat_df = fmt_sklearn_preds(self.predict_engine, self.modelobj,
                                         self._model_df, self._cat_df,
                                         self.ydepend, self.model_type)
        # create placeholder for outputs
        placeholder = []
        # create placeholder for all insights
        insights_list = []
        logging.info("""Running main program. Iterating over 
                    columns and applying functions depednent on datatype""")

        not_in_cols = ['errors', 'predictedYSmooth', self.ydepend]

        # filter columns to iterate through
        to_iter_cols = self._cat_df.columns[~self._cat_df.columns.
                                            isin(not_in_cols)]

        for idx, col in enumerate(to_iter_cols):

            # column placeholder
            colhold = []

            for groupby_var in self.groupbyvars:
                # if current column is the groupby variable,
                # create error metrics
                if col != groupby_var:
                    json_out = self._var_check(col=col,
                                               groupby_var=groupby_var)
                    # append to placeholder
                    colhold.append(json_out)

                else:
                    logging.info("""Creating accuracy metric for 
                                groupby variable: {}""".format(groupby_var))
                    # create error metrics for slices of groupby data
                    acc = md_utils.create_accuracy(self.model_type,
                                                   self._cat_df,
                                                   self.error_type,
                                                   groupby=groupby_var)
                    # append to insights dataframe placeholder
                    insights_list.append(acc)

                logger.info(
                    """Run processed - Col: {} - groupby_var: {}""".format(
                        col, groupby_var))

            # map all of the same columns errors to the first element and
            # append to placeholder
            # don't append if placeholder is empty due to col being the same as groupby
            if len(colhold) > 0:
                placeholder.append(formatting.FmtJson.flatten_json(colhold))
            # TODO redirect stdout so progress bar can output to single line
            md_utils.sysprint('Percent Complete: {per:2.0f}%'.format(
                per=(float(idx + 1) / float(len(to_iter_cols))) * 100))

        md_utils.sysprint('Percent Complete: 100%')
        logger.info('Converting accuracy outputs to json format')
        # finally convert insights_df into json object
        # convert insights list to dataframe
        insights_df = pd.concat(insights_list)
        insights_json = formatting.FmtJson.to_json(insights_df.round(
            self.round_num),
                                                   html_type='accuracy',
                                                   vartype='Accuracy',
                                                   err_type=self.error_type,
                                                   ydepend=self.ydepend,
                                                   mod_type=self.model_type)
        # append to outputs
        placeholder.append(insights_json)
        # append percentiles
        placeholder.append(self.Percentiles.percentiles)
        # append groupby percentiles
        placeholder.append(self.Percentiles.group_percentiles_out)
        # assign placeholder final outputs to class instance
        self.outputs = placeholder
        # save outputs if specified
        if output_type == 'html':
            self._save(fpath=output_path)
        elif output_type == 'raw_data':
            return self.get_raw_df()
        elif output_type == 'agg_data':
            return self.get_agg_df()
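
The guard at the top of run() can be exercised in isolation. A self-contained sketch follows, with a stand-in Settings class replacing md_utils.Settings (the supported list is assumed from the docstring above, not taken from that module):

    class Settings:
        # stand-in for md_utils.Settings; values assumed from run()'s docstring
        supported_out_types = ('html', 'raw_data', 'agg_data')


    def check_output_type(output_type):
        """Mirror run()'s guard: raise ValueError for unsupported output types."""
        if output_type not in Settings.supported_out_types:
            raise ValueError('Output type {} not supported.\n'
                             'Currently supported output types: {}'.format(
                                 output_type, Settings.supported_out_types))


    check_output_type('html')      # passes silently
    # check_output_type('csv')     # would raise ValueError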
Example #4
    def run(self,
            output_type='html',
            progbar=False,
            **kwargs):
        """
        main run engine. Iterate over columns specified in keepfeaturelist,
        and perform analysis
        :param output_type: str output type:
                html - save html to output_path
                raw_data - return raw analysis dataframe
                agg_data - return aggregate analysis dataframe
        :param output_path: str - fpath to save output (passed via **kwargs)
        :param progbar: bool - display a progress bar
        :return: pd.DataFrame or saved html output
        :rtype: pd.DataFrame or .html
        """
        self._validate_params()
        # run the prediction function first to assign the errors to the dataframe
        self._cat_df = fmt_sklearn_preds(self.predict_engine,
                                         self._modelobj,
                                         self._model_df,
                                         self._cat_df,
                                         self.ydepend,
                                         self.model_type)
        # placeholders
        placeholder = {'res': [],
                       'insights': []}

        logger.info("""Running main program. Iterating over 
                            columns and applying functions depednent on datatype""")

        not_in_cols = ['errors', 'predictedYSmooth', self.ydepend]

        # filter columns to iterate through
        to_iter_cols = self._cat_df.columns[~self._cat_df.columns.isin(not_in_cols)]

        # create combinations of groupby and columns
        all_iter = list(itertools.product(to_iter_cols, self.groupbyvars))

        if progbar:
            pbar = md_utils.progress_bar()
            progress_bar = pbar(total=len(all_iter))

        for (col, groupby_var) in all_iter:

            col_indices = [col, 'errors', 'predictedYSmooth', groupby_var]

            key, value = self._base_runner(self._cat_df.loc[:, col_indices],
                                           col,
                                           groupby_var)

            placeholder[key].append(value)

            logger.info("""Run processed - Col: {} - groupby_var: {}""".format(col, groupby_var))

            if progbar:
                progress_bar.update(1)

        # convert placeholders to final output
        self._plc_hldr_out(placeholder['insights'],
                           placeholder['res'],
                           html_type='error')

        # save outputs if specified
        outputs = self._save(output_type=output_type,
                             fpath=kwargs.get('output_path', None))
        # if underlying data was selected, return it
        if isinstance(outputs, pd.DataFrame):
            return outputs
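
The loop above pairs every kept column with every groupby variable via itertools.product. A minimal sketch of that iteration pattern, assuming tqdm as the progress-bar backend (what md_utils.progress_bar actually returns is not shown in the source) and with illustrative column and groupby lists:

    import itertools

    from tqdm import tqdm

    # illustrative column and groupby lists, not from the source
    to_iter_cols = ['age', 'income', 'tenure']
    groupbyvars = ['region', 'segment']

    all_iter = list(itertools.product(to_iter_cols, groupbyvars))
    progress_bar = tqdm(total=len(all_iter))
    for col, groupby_var in all_iter:
        # one analysis pass per (column, groupby) pair would run here
        progress_bar.update(1)
    progress_bar.close()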
Example #5
    def run(self,
            output_type='html',
            progbar=False,
            **kwargs):
        """
        main run engine. Iterate over columns specified in keepfeaturelist,
        and perform analysis
        :param output_type: str output type:
                html - save html to output_path
                raw_data - return raw analysis dataframe
                agg_data - return aggregate analysis dataframe
        :param output_path: str - fpath to save output (passed via **kwargs)
        :param progbar: bool - output progress bar messages
        :return: pd.DataFrame or saved html output
        :rtype: pd.DataFrame or .html
        """
        # if output_type is a data format, force output_df to True and warn
        # (requires a module-level `import warnings`)
        if output_type in ['raw_data', 'agg_data'] and kwargs.get('output_df') is False:
            kwargs['output_df'] = True
            # warn rather than raise: raising a Warning would abort run()
            # before the forced value could take effect
            warnings.warn("""output_df must be set to True when returning a dataframe. Forcing
                            to True""")

        self._validate_params()
        # run the prediction function first to assign the errors to the dataframe
        self._cat_df = fmt_sklearn_preds(self.predict_engine,
                                         self._modelobj,
                                         self._model_df,
                                         self._cat_df,
                                         self.ydepend,
                                         self.model_type)
        # placeholders
        placeholder = {'res': [],
                       'insights': []}

        logger.info("""Running main program. Iterating over 
                            columns and applying functions depednent on datatype""")

        # filter cols to iterate over
        to_iter = [val for val in self._keepfeaturelist if val != self.ydepend]
        # create col, groupby combos
        all_iter = list(itertools.product(to_iter, self.groupbyvars))
        # create container with synthetic prediction difference, row mask,
        # and incremental val
        preds_container = self._preds_container(to_iter)
        # import pbar
        if progbar:
            pbar = md_utils.progress_bar()
            progress_bar = pbar(total=len(all_iter))

        for (col, groupby_var) in all_iter:
            col_indices = [col, 'errors', 'predictedYSmooth', groupby_var, 'diff']

            # pull incremental val, diff, and mask from container
            incremental_val, diff, mask = preds_container[col]

            # update differences
            self._cat_df['diff'] = diff

            key, value = self._base_runner(self._cat_df.loc[mask, col_indices],
                                           col,
                                           groupby_var,
                                           **kwargs)
            # assign incremental value for output formatting
            if key == 'res':
                value['incremental_val'] = incremental_val

            placeholder[key].append(value)

            logger.info("""Run processed - Col: {} - groupby_var: {}""".format(col, groupby_var))
            if progbar:
                progress_bar.update(1)

        # convert placeholders to final output
        self._plc_hldr_out(placeholder['insights'],
                           placeholder['res'],
                           html_type='sensitivity')

        # save outputs if specified
        self._save(output_type=output_type,
                   fpath=kwargs.get('output_path', None))
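
The output_df guard at the top of this run() warns without stopping execution. A small demonstration of why warnings.warn is used there rather than raise Warning (Warning subclasses Exception, so raising one halts the method); the helper name below is a hypothetical stand-in, not part of the source API:

    import warnings


    def force_output_df(kwargs):
        # hypothetical helper mirroring the guard in run(); not part of the source API
        if kwargs.get('output_df') is False:
            kwargs['output_df'] = True
            # warn, don't raise: a raised Warning propagates like any exception
            warnings.warn('output_df must be True when returning a dataframe; forcing to True')
        return kwargs


    print(force_output_df({'output_df': False}))  # {'output_df': True}, plus a UserWarning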