def _dump_df_excel(obj, file, **kwargs):
    '''Dump one or more DataFrames to an Excel workbook.

    obj:
        2D array-like data, or a list of them (one sheet per item)
    file:
        str path or file object
    '''
    writer = pd.ExcelWriter(file)
    obj = get_flat_list(obj)

    sheet_name = kwargs.get('sheet_name')

    if sheet_name is None:
        sheet_name = ['sheet' + str(i + 1) for i in range(len(obj))]
    else:
        sheet_name = get_flat_list(sheet_name)
        check_consistent_length(obj, sheet_name)

    for data, name in zip(obj, sheet_name):
        try:
            data = pd.DataFrame(data)
            kw = get_kwargs(data.to_excel, **kwargs)
            kw.update({
                'sheet_name': name,
                'index': kwargs.get('index', False)
            })
            data.to_excel(writer, **kw)
        except Exception as e:
            print(repr(e))
            continue
    writer.close()  # close() also saves; ExcelWriter.save() was removed in pandas 2.0
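
# Usage sketch of the multi-sheet write that _dump_df_excel wraps. The frames
# and the 'report.xlsx' path are illustrative; the context-manager form of
# pd.ExcelWriter saves and closes the workbook automatically.
import pandas as pd

frames = {'scores': pd.DataFrame({'auc': [0.81, 0.83]}),
          'params': pd.DataFrame({'C': [0.1, 1.0]})}
with pd.ExcelWriter('report.xlsx') as writer:
    for name, df in frames.items():
        df.to_excel(writer, sheet_name=name, index=False)
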
    def _get_scorer(self, scoring):
        ''' return sklearn scorer, including custom scorer
        '''
        scorer = {}
        sk_scoring = []
        custom_scorer = get_custom_scorer()
        for i in get_flat_list(scoring):
            if i in custom_scorer:
                scorer.update({i: custom_scorer[i]})
            else:
                sk_scoring.append(i)
        if len(sk_scoring) > 0:
            s, _ = _validation._check_multimetric_scoring(self.estimator,
                                                          scoring=sk_scoring)
            scorer.update(s)
        return scorer
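
# Sketch of the scorer dict _get_scorer assembles: named sklearn scorers merged
# with custom ones. The KS scorer below is a stand-in for the project's
# get_custom_scorer() output (not shown in this section); needs_threshold asks
# for decision scores and is superseded by response_method in newer scikit-learn.
import numpy as np
from sklearn.metrics import get_scorer, make_scorer, roc_curve

def ks_stat(y_true, y_score):
    # Kolmogorov-Smirnov statistic: largest gap between TPR and FPR
    fpr, tpr, _ = roc_curve(y_true, y_score)
    return np.max(tpr - fpr)

custom_scorer = {'KS': make_scorer(ks_stat, needs_threshold=True)}
scoring = ['roc_auc', 'KS']
scorer = {s: custom_scorer[s] if s in custom_scorer else get_scorer(s)
          for s in scoring}
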
def _get_files(dirpath, suffix=None, subfolder=False):
    '''Return a file dict {filename: filepath}.

    dirpath - str
        directory to traverse
    suffix - str extension, or list of extensions, e.g. ['.xlsx', '.csv']
        file extensions to include; default None includes all extensions
    subfolder - bool
        True to traverse subfolders, False to scan only the given dirpath
    '''
    if subfolder:
        get_dirs = traverse_all_dirs
    else:
        get_dirs = traverse_dir

    rst = {
        k: v
        for k, v in get_dirs(dirpath).items()
        if os.path.splitext(v)[1] in get_flat_list(suffix) or not suffix
    }
    return rst
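
# Stand-alone sketch of the {filename: path} mapping _get_files builds; the
# traverse_* helpers are not shown in this section, so os.walk is used instead.
# list_files and its arguments are illustrative names.
import os

def list_files(dirpath, suffixes=None, subfolder=False):
    out = {}
    for root, _, files in os.walk(dirpath):
        for fname in files:
            ext = os.path.splitext(fname)[1]
            if suffixes is None or ext in suffixes:
                out[fname] = os.path.join(root, fname)
        if not subfolder:
            break  # stop after the top-level directory
    return out

# e.g. list_files('.', suffixes=['.xlsx', '.csv'], subfolder=True)
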
def plotter_score_path(df_score, title=None, cm=None, style='-.o'):
    '''
    df_score:
        DataFrame of metric scores, one numeric column per metric
    '''
    # plot
    data = df_score.select_dtypes(include='number')
    n = len(data.columns)
    i, j = plt.rcParams['figure.figsize']
    fig, ax = plt.subplots(n, 1, figsize=(i, j + 2.5 * (n // 2)))
    ax = get_flat_list(ax) if n == 1 else ax
    if cm is None:
        cm = plt.get_cmap('tab10')
    cmlist = [cm(i) for i in np.linspace(0, 1, n)]

    i = 0
    for ax0, col in zip(ax, data.columns):
        s = data[col]
        if api.is_numeric_dtype(s):

            s.plot(ax=ax0, color=cmlist[i], style=style)
            ax0.fill_between(s.index,
                             s - s.std(),
                             s + s.std(),
                             color='grey',
                             alpha=.3,
                             label=r'{} = {}$\pm$ {}'.format(
                                 col, round(s.mean(), 4), round(s.std(), 4)))
            plt.setp(ax0, ylabel=col)
            h, l = ax0.get_legend_handles_labels()
            ax0.legend([h[-1]], [l[-1]])
            i += 1
    ax[0].set_title(title)
    ax[-1].set_xlabel('index')
    plt.tight_layout(rect=(0, 0, 0.98, 0.96))
    return fig
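
# Toy sketch of the mean/std band drawn above: a numeric score Series plotted
# with a grey one-standard-deviation fill_between (the 'roc_auc' name and the
# values are illustrative).
import matplotlib.pyplot as plt
import pandas as pd

s = pd.Series([0.80, 0.82, 0.79, 0.83], name='roc_auc')
fig, ax = plt.subplots()
s.plot(ax=ax, style='-.o')
ax.fill_between(s.index, s - s.std(), s + s.std(), color='grey', alpha=.3,
                label=r'{} = {}$\pm$ {}'.format(s.name, round(s.mean(), 4),
                                                round(s.std(), 4)))
ax.legend()
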
def plotter_cv_results_(results,
                        train_style='mo-',
                        test_style='go-.',
                        title=None):
    '''plot cross-validated results of a single (univariate) parameter
    after a grid search of the model

    return
    -----
    list or array of ax
    '''
    scoring = results.filter(like='mean_train_').columns
    scoring = [i.replace('mean_train_', '') for i in scoring]
    df_param = results.filter(like='param_')
    param_array = df_param.columns
    if len(param_array) > 1:
        print('multiple parameters encountered ... ')
        print(df_param.apply(lambda x: pd.Series(pd.unique(x))))
    # plot
    n = len(scoring)
    i, j = plt.rcParams['figure.figsize']
    fig, ax = plt.subplots(n, 1, figsize=(i, j + 2.5 * (n // 2)))
    ax = get_flat_list(ax) if n == 1 else ax
    for s, ax0 in zip(scoring, ax):
        df = results[['mean_train_' + s, 'mean_test_' + s, 'std_test_' + s]]
        if len(param_array) == 1:
            df.index = results[param_array[0]]
            xlabel = param_array[0]
            num_param = api.is_numeric_dtype(df.index)
            if not num_param:
                df.index = np.arange(len(df.index))
        else:
            xlabel = ' + '.join([i.split('__')[-1] for i in param_array])

        df.sort_index(inplace=True)
        # plot
        mean = df['mean_test_' + s].values
        std = df.pop('std_test_' + s)
        x = df.index.to_numpy()  # Index.get_values() was removed in pandas 1.0
        df.plot.line(style=[train_style, test_style], ax=ax0)
        ax0.fill_between(x,
                         mean - std,
                         mean + std,
                         color='grey',
                         alpha=.2,
                         label=r'$\pm$ 1 std. dev.')
        # annotate
        x_max = df.index[np.argmax(mean)]
        best_score = np.max(mean)
        std = np.mean(std)
        h, l = ax0.get_legend_handles_labels()
        ax0.legend([h[-1]],
                   [r'score_max= %0.4f $\pm$ %0.2f' % (best_score, std)])
        ax0.axvline(x_max, linestyle='--', marker='x', color='y')
        ax0.annotate("%0.4f" % best_score, (x_max, best_score))
        ax0.set_xlim(x.min() - 0.5, x.max() + 0.5)
        plt.setp(ax0, ylabel=s)

    # set title
    ax[0].set_title(title, fontsize=13)
    # use fig legend
    fig.legend(h, ('train', 'test', r'$\pm$ 1 std. dev.'),
               loc='upper right',
               ncol=3,
               bbox_to_anchor=(0.98, 1))
    ax[-1].set_xlabel(xlabel)
    plt.tight_layout(rect=(0, 0, 1, 0.95))
    return ax
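
# Sketch of producing the `results` frame this plotter expects: a GridSearchCV
# run with return_train_score=True, converted to a DataFrame with columns such
# as mean_train_<score>, mean_test_<score>, std_test_<score> and param_<name>.
# The estimator and grid are illustrative.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=200, random_state=0)
gs = GridSearchCV(LogisticRegression(max_iter=1000),
                  param_grid={'C': [0.01, 0.1, 1, 10]},
                  scoring=['roc_auc'], refit='roc_auc',
                  return_train_score=True, cv=3)
gs.fit(X, y)
results = pd.DataFrame(gs.cv_results_)
# plotter_cv_results_(results)  # hand the frame to the plotter above
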
def plotter_auc(fpr,
                tpr,
                ax=None,
                alpha=0.95,
                lw=1.2,
                curve_label=None,
                title=None,
                cm=None):
    '''plot ROC curve(s) given fpr, tpr arrays, or lists of them

    cm:
        color map, default 'tab20'
    
    return
    ----
    ax
    '''
    fpr, tpr = get_flat_list(fpr), get_flat_list(tpr)
    if len(fpr) != len(tpr):
        raise ValueError("length of fpr and tpr doesn't match")
    n = len(fpr)
    names = range(n) if curve_label is None else get_flat_list(curve_label)
    if len(names) != n:
        print('number of curve labels does not match n_fpr/n_tpr')
        names = range(n)

    # -- plot each line
    if ax is None:
        fig, ax = plt.subplots(1, 1)
    aucs = []
    kss = []
    if cm is None:
        cm = plt.get_cmap('tab20')
    cmlist = [cm(i) for i in np.linspace(0, 1, n)]
    for i in range(n):
        if len(fpr[i]) != len(tpr[i]):
            print("length of {}th fpr and tpr doesn't match".format(i))
            continue
        else:
            auc_score = auc(fpr[i], tpr[i])
            ks_score = max(np.array(tpr[i]) - np.array(fpr[i]))
            aucs.append(auc_score)
            kss.append(ks_score)
            ax.plot(fpr[i],
                    tpr[i],
                    color=cmlist[i],
                    alpha=alpha,
                    lw=lw,
                    label='ROC %r (AUC=%0.2f;KS=%0.2f)' %
                    (names[i], auc_score, ks_score))
    # plot mean tpr line
    if n > 1:
        mean_fpr = np.linspace(0, 1, 100)
        # np.interp replaces the removed scipy.interp
        tprs = [np.interp(mean_fpr, x, y) for x, y in zip(fpr, tpr)]
        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[0] = 0.0
        mean_tpr[-1] = 1.0
        mean_auc = np.mean(aucs)
        std_auc = np.std(aucs)
        ax.plot(mean_fpr,
                mean_tpr,
                'b-.',
                alpha=1,
                lw=1.5,
                label=r'Mean ROC (AUC=%0.2f $\pm$ %0.2f)' % (mean_auc, std_auc))
        # plot variance band
        std_tpr = np.std(tprs, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        ax.fill_between(mean_fpr,
                        tprs_lower,
                        tprs_upper,
                        color='grey',
                        alpha=.3,
                        label=r'$\pm$ 1 std. dev.')
    # plot chance line
    ax.plot([0, 1], [0, 1], 'k--', lw=1.5, label='Chance (AUC=0.5)')
    # set property
    if title is None:
        title = 'Receiver operating characteristic'
    plt.setp(ax,
             xlabel='False Positive Rate',
             ylabel='True Positive Rate',
             xlim=[-0.05, 1.05],
             ylim=[-0.05, 1.05],
             title=title)
    plt.legend(loc="lower right", fontsize='medium', bbox_to_anchor=(1, 0))
    plt.tight_layout()
    return ax
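
# Usage sketch for plotter_auc: build one (fpr, tpr) pair per fold with
# sklearn's roc_curve and pass the lists in. The scores are synthetic, and the
# helpers plotter_auc relies on (get_flat_list, auc, plt, np) are assumed to be
# importable as in the rest of this module.
import numpy as np
from sklearn.metrics import roc_curve

rng = np.random.RandomState(0)
y_true = rng.randint(0, 2, 200)
fold_scores = [y_true * 0.6 + 0.8 * rng.rand(200) for _ in range(3)]
curves = [roc_curve(y_true, s) for s in fold_scores]
fprs = [fpr for fpr, _, _ in curves]
tprs = [tpr for _, tpr, _ in curves]
ax = plotter_auc(fprs, tprs, curve_label=['fold1', 'fold2', 'fold3'])
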
    def run_sensitivity(self,
                        train_set=None,
                        title=None,
                        param_grid=-1,
                        refit='roc_auc',
                        scoring=['roc_auc', 'KS'],
                        fit_params={},
                        n_jobs=2,
                        save_fig=True,
                        **kwargs):
        '''
        - run a sensitivity analysis over param_grid (if param_grid=-1, use the
          pre-defined grid);
        - update self.estimator to the best estimator & update self.gridcv_results;
        - dump plots/spreadsheets

        parameters
        ----
        train_set:
            2-element tuple (X, y) of train data
        param_grid:
            parameter grid space; if -1, use pipe_grid() to return the
            predefined param_grid
        **kwargs:
            GridSearchCV keywords
        '''

        L = locals().copy()
        L.pop('self')
        L.pop('param_grid')
        folder = self.folder
        #--
        if train_set is None:
            train_set = self._get_dataset('.traindata')[0]
        else:
            folder.write(train_set, 'data/0.traindata')

        if param_grid == -1:
            param_grid = []
            for k, v in self.estimator.named_steps.items():
                grid = pipe_grid(k)
                if grid is not None:
                    param_grid.extend(grid)

        if len(param_grid) == 0:
            print('no param_grid found, skip grid search')
            return

        # memory cache
        if hasattr(self.estimator, 'memory'):
            self.estimator.memory = os.path.relpath(
                os.path.join(self.folder.path_, 'tempfolder'))

        X, y = train_set
        cv_results = []
        for i, grid in enumerate(get_flat_list(param_grid)):
            self.grid_searchcv(X,
                               y=y,
                               param_grid=grid,
                               **get_kwargs(self.grid_searchcv, **L),
                               **kwargs)
            self.plot_gridcv(save_fig=save_fig, title=str(i))
            cv_results.append(self.gridcv_results)

        print('sensitivity results are being saved... ')
        title = 0 if title is None else str(title)
        folder.write(cv_results,
                     'spreadsheet/GridcvResults{}.xlsx'.format(title))
        self.save()
        self._shut_temp_folder()
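
# Self-contained sketch of the loop run_sensitivity performs: one GridSearchCV
# per parameter grid, collecting each cv_results_ table. The pipeline and grids
# below are illustrative, not the class's predefined pipe_grid() output.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = make_classification(n_samples=300, random_state=0)
pipe = Pipeline([('scale', StandardScaler()),
                 ('clf', LogisticRegression(max_iter=1000))])
param_grid = [{'clf__C': [0.1, 1, 10]},
              {'clf__class_weight': [None, 'balanced']}]
cv_results = []
for grid in param_grid:
    gs = GridSearchCV(pipe, grid, scoring='roc_auc', cv=3).fit(X, y)
    cv_results.append(pd.DataFrame(gs.cv_results_))
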
    def run_test(self,
                 test_set=None,
                 title=None,
                 use_self_bins=True,
                 cv=3,
                 scoring=['roc_auc', 'KS', 'average_precision'],
                 save_fig=True,
                 **kwargs):
        '''
        - evaluate test performance of the estimator;
        - dump lift curve and ROC curve for the test data under self.folder.path_;
        - optionally dump spreadsheets of the calculated data

        test_set:
            2-element tuple (X_test, y_test), or a list of them
        title:
            label(s) identifying each test set

        return
        ----
            Series of averaged scores, one per scoring metric
        '''
        L = locals().copy()
        L.pop('self')
        L.pop('title')
        folder = self.folder
        # --

        r = 0
        if test_set is None:
            test_set, title = self._get_dataset('.testdata')[0]
            r -= 1

        test_set_list = get_flat_list(test_set)
        if title is not None:
            title_list = get_flat_list(title)
        else:
            title_list = [str(i) for i in range(len(test_set_list))]
        check_consistent_length(test_set_list, title_list)
        if r == 0:
            folder.write([test_set_list, title_list],
                         'data/{}.testdata'.format(len(title_list)))

        testscore = []
        for i, j in zip(test_set_list, title_list):
            # test performance
            X_test = i[0]
            y_test = i[1]
            # plot test auc
            testcv = self.plot_auc_test(X_test,
                                        y_test,
                                        title=j,
                                        **get_kwargs(self.plot_auc_test, **L,
                                                     **kwargs))
            # plot lift curve
            test_lift = self.plot_lift(X_test,
                                       y_test,
                                       title=j,
                                       **get_kwargs(self.plot_lift, **L),
                                       **kwargs)
            # test scores
            scores = self.test_score(X_test, y_test, cv=cv, scoring=scoring)
            scores['group'] = str(j)
            testscore.append(scores)
            if self.verbose > 0:
                print('test cv scores & cv-split test data are being saved... ')
                folder.write(testcv[-1],
                             file='spreadsheet/TestSplits{}.xlsx'.format(j))
                if test_lift is None:
                    lift = pd.DataFrame()
                else:
                    lift = test_lift[-1]
                folder.write(
                    [lift, scores],
                    sheet_name=['lift_curve', 'test_score'],
                    file='spreadsheet/TestPerformance{}.xlsx'.format(j))

        testscore_all = pd.concat(testscore, axis=0, ignore_index=True)
        fig = plotter_score_path(testscore_all, title='score_path')
        if save_fig is True:
            folder.write(fig, 'plots/TestScore_path.pdf')
            plt.close()
        if self.verbose > 0 and len(testscore) > 1:
            folder.write(testscore_all, 'spreadsheet/TestPerformanceAll.xlsx')

        return testscore_all[scoring].mean()
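
# Sketch of the score-collection pattern run_test follows: score a fitted
# estimator on each named test set, stack the rows into one frame, and average
# per metric. The estimator, split and single roc_auc metric are toy stand-ins
# for the class's test_score() / plotter_score_path machinery.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

X, y = make_classification(n_samples=400, random_state=0)
est = LogisticRegression(max_iter=1000).fit(X[:200], y[:200])
test_sets = {'holdout1': (X[200:300], y[200:300]),
             'holdout2': (X[300:], y[300:])}

rows = []
for name, (X_t, y_t) in test_sets.items():
    rows.append({'group': name,
                 'roc_auc': roc_auc_score(y_t, est.predict_proba(X_t)[:, 1])})
testscore_all = pd.DataFrame(rows)
print(testscore_all[['roc_auc']].mean())
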
    def plot_auc_test(self,
                      X,
                      y,
                      cv=1,
                      groups=None,
                      title=None,
                      ax=None,
                      save_fig=False):
        '''plot ROC curves for a fitted estimator; the estimator must produce
        continuous predictions (decision_function or predict_proba) so the
        model can be evaluated with roc_auc. Iterables of X, y can be passed,
        or X, y can be split with cv > 1, to assess model fit performance.

        X
            - 2D array, or list of 2D ndarrays
        y
            - binary class labels, or list of them
        cv
            - int, cross-validation generator or an iterable
            - if cv > 1, splits are generated with StratifiedKFold
        title
            - title added to the plot header to identify (X, y)

        return
        --------
        ax, mean auc, std auc, data_splits

        data_splits:
            list of test data sets as DataFrames (combined X & y)
        '''
        L = locals().copy()
        L.pop('self')
        estimator = self.estimator
        # split test set by cv
        if cv > 1:
            xs = []
            ys = []
            data_splits = tuple(
                _split_cv(X, y=y, cv=cv, groups=groups,
                          random_state=self.seed))
            for x_set, y_set in data_splits:
                xs.append(x_set[1])
                ys.append(y_set[1])
            L.update({'X': xs, 'y': ys, 'cv': 1})
            return self.plot_auc_test(**L)

        self._check_fitted(estimator)
        X = get_flat_list(X)
        y = get_flat_list(y)
        validation.check_consistent_length(X, y)
        fprs = []
        tprs = []
        aucs = []
        n_sample = 0
        for i in range(len(X)):
            x0 = X[i]
            y0 = y[i]
            y_pre = self._pre_continueous(estimator, x0)
            fpr, tpr, threshold = roc_curve(y0, y_pre, drop_intermediate=True)
            fprs.append(fpr)
            tprs.append(tpr)
            aucs.append(auc(fpr, tpr))
            n_sample += len(x0)
        # -- plot
        if ax is None:
            fig, ax = plt.subplots(1, 1)
        ax = plotter_auc(fprs, tprs, ax=ax)

        header = '-'.join([
            _get_estimator_name(estimator), 'testCV',
            '{} samples'.format(n_sample)
        ])
        if isinstance(title, str):
            header = '-'.join([title, header])
        ax.set_title(header)

        data_splits = [
            pd.concat((pd.DataFrame(i) for i in item), axis=1)
            for item in zip(X, y)
        ]

        if save_fig is True:
            if isinstance(title, str):
                plot_name = 'plots/roc_test_' + title + '.pdf'
            else:
                plot_name = 'plots/roc_test.pdf'
            self.folder.write(plt.gcf(), plot_name)
            plt.close()
        return ax, np.mean(aucs), np.std(aucs), data_splits
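
# Sketch of the cv > 1 branch above: StratifiedKFold test splits scored with
# predict_proba, one (fpr, tpr) pair per fold, then handed to plotter_auc
# (defined earlier in this section). The estimator and data are illustrative.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve
from sklearn.model_selection import StratifiedKFold

X, y = make_classification(n_samples=500, random_state=0)
est = LogisticRegression(max_iter=1000).fit(X, y)
fprs, tprs = [], []
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
for _, test_idx in skf.split(X, y):
    y_score = est.predict_proba(X[test_idx])[:, 1]
    fpr, tpr, _ = roc_curve(y[test_idx], y_score)
    fprs.append(fpr)
    tprs.append(tpr)
# ax = plotter_auc(fprs, tprs, curve_label=['split0', 'split1', 'split2'])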