Exemple #1
0
def plot_scatter_matrix(df, y_continuous=True, y_var_name=None, colors=None):
    """ Plots scatter matrices of the continuous variables, five at a time.
        INPUT:
            df:
                dataframe
            y_continuous:
                passed through unchanged to plot_one_scatter_matrix
            y_var_name:
                string, the column name of the dependent y variable in
                the dataframe; the first column is used when it is not given
            colors:
                passed through unchanged to plot_one_scatter_matrix
        OUTPUT:
            None. Features are consumed in groups of five. Every group that
            still has features left after it is displayed with plt.show();
            the last group (at most five features) is drawn without calling
            plt.show().
    """
    target = y_var_name if y_var_name else df.columns[0]
    numeric_cols, _category_cols = sort_features(df.drop(target, axis=1))
    palette = make_color_wheel(df, target)
    subsample = take_sample(df)
    # Caveat noted by the original author: the subsample may end up holding
    # only one unique value in a column.
    group_size = 5
    offset = 0
    while len(numeric_cols) - offset > group_size:
        group = numeric_cols[offset:offset + group_size]
        plot_one_scatter_matrix(subsample[[target] + group], subsample,
                                target, palette, colors, y_continuous)
        plt.show()
        offset += group_size
    # Whatever is left (possibly nothing) goes into one final matrix.
    remainder = numeric_cols[offset:]
    plot_one_scatter_matrix(subsample[[target] + remainder], subsample,
                            target, palette, colors, y_continuous)
    return None
Exemple #2
0
def make_models(df, df_X, y, y_var_name, univariates,
                alphas=np.logspace(start=-2, stop=5, num=5)):
    """Pick the model family and scoring metric for the y variable.

        INPUT:
            df:
                dataframe, handed to plot_many_univariates
            df_X:
                dataframe whose columns are sorted into continuous and
                category features
            y:
                object exposing .unique(); its number of unique values
                is printed
            y_var_name:
                string, the column name of the dependent y variable
            univariates:
                truthy to draw the univariate plots (continuous y only)
            alphas:
                values forwarded to make_cont_models / make_cat_models
        OUTPUT:
            tuple (names_models, continuous_features, category_features,
            models, scoring, is_continuous, alphas)
    """
    print(len(y.unique()))
    continuous_features, category_features = sort_features(df_X)
    # y counts as continuous only when its name is among the continuous
    # features found in df_X.
    is_continuous = y_var_name in continuous_features
    if not is_continuous:
        print('Y VARIABLE: "' + y_var_name + '" IS CATEGORICAL')
        print()
        names_models = make_cat_models(alphas)
        scoring = 'accuracy'
    else:
        print('Y VARIABLE: "' + y_var_name + '" IS CONTINUOUS')
        print()
        if univariates:
            plot_many_univariates(df, y_var_name)
            plt.show()
        names_models = make_cont_models(alphas)
        scoring = 'neg_mean_squared_error'
    # The second item of every entry is the model object itself.
    models = []
    for entry in names_models:
        models.append(entry[1])
    return (names_models, continuous_features, category_features, models,
            scoring, is_continuous, alphas)
Exemple #3
0
def auto_spline_pipeliner(df_X, knots=10):
    """Build the feature pipeline for a dataframe of predictors.

        INPUT:
            df_X:
                dataframe of predictor columns
            knots:
                int, number of evenly spaced knots placed between the
                minimum and maximum of each continuous column
        OUTPUT:
            - a FeatureUnion of category specifications when only category
              features exist,
            - a Pipeline (spline FeatureUnion followed by StandardScaler)
              when only continuous features exist,
            - a FeatureUnion of both when both kinds exist,
            - the legacy message string when there are no features at all.
    """
    (continuous_features, category_features) = sort_features(df_X)
    has_continuous = len(continuous_features) > 0
    has_category = len(category_features) > 0

    continuous_pipelet = []
    for name in continuous_features:
        knotspace = list(np.linspace(df_X[name].min(), df_X[name].max(),
                                     knots))
        continuous_pipelet.append(
            (name + '_fit', simple_spline_specification(name, knotspace)))

    category_pipelet = []
    for name in category_features:
        category_pipe = simple_category_specification(
            name, list(df_X[name].unique()))
        category_pipelet.append((name + '_spec', category_pipe))

    if not has_continuous and not has_category:
        # Returned (not raised) to stay compatible with existing callers.
        return "(continuous_features == []) & (category_features == [])"

    # The unions are only built for non-empty transformer lists, so an empty
    # list is never handed to FeatureUnion.
    if not has_continuous:
        return FeatureUnion(category_pipelet)

    continuous_features_scaled = Pipeline([('continuous_features',
                                            FeatureUnion(continuous_pipelet)),
                                           ('standardizer', StandardScaler())])
    if not has_category:
        return continuous_features_scaled

    return FeatureUnion([
        ('continuous_features', continuous_features_scaled),
        ('category_features', FeatureUnion(category_pipelet))
    ])
Exemple #4
0
def plot_many_univariates(df, y_var_name):
    """ Draws one univariate plot per continuous column of the dataframe.

        Only continuous columns with more than two unique values are
        plotted; the drawing itself is delegated to plot_one_univariate.

        INPUT:
            df:
                dataframe
            y_var_name:
                string, the column name of the dependent y variable in
                the dataframe
        OUTPUT:
            None. With several features a two-column grid of axes is
            filled in order; with exactly one feature a single wide axis
            is used.
        RAISES:
            ValueError when no continuous column has more than two unique
            values.
    """
    continuous_features, _category_features = sort_features(df)
    plottable = [column for column in continuous_features
                 if len(df[column].unique()) > 2]
    n_plottable = len(plottable)

    if n_plottable == 0:
        raise ValueError('No Continous Features to Plot')

    if n_plottable == 1:
        only_feature = plottable[0]
        fig, ax = plt.subplots(1, 1, figsize=(14, 4.5))
        plot_one_univariate(ax, df, only_feature, y_var_name)
        ax.set_title("{}: Univariate Plot".format(only_feature))
        # The original author notes set_tight_layout "doesn't work" here and
        # that 'tight_layout' must be used in the calling script as well.
        fig.set_tight_layout(tight=True)
        fig.tight_layout(pad=2)
        return None

    # Two plots per row, rounding the row count up for an odd feature count.
    n_rows = (n_plottable + 1) // 2
    fig, axs = plt.subplots(n_rows, 2, figsize=(14, 3 * n_rows))
    flat_axes = axs.flatten()
    for position, feature in tqdm.tqdm(enumerate(plottable)):
        current_ax = flat_axes[position]
        plot_one_univariate(current_ax, df, feature, y_var_name)
        current_ax.set_title(f"{feature}: Univariate Plot")
    return None
Exemple #5
0
def plot_continuous_error_graphs(df,
                                 y,
                                 y_var_name,
                                 model,
                                 is_continuous,
                                 sample_limit=300,
                                 predicteds_vs_actuals=True,
                                 residuals=True):
    """Plot predicted-vs-actual and residual graphs for a continuous y.

        INPUT:
            df:
                dataframe containing the predictors and the y column
            y:
                sized collection of actual y values
            y_var_name:
                string, the column name of the dependent y variable in
                the dataframe
            model:
                fitted object exposing .predict()
            is_continuous:
                bool; nothing is computed or plotted when False
            sample_limit:
                int, maximum number of rows to sample from df; clamped to
                len(df) so small dataframes no longer raise
            predicteds_vs_actuals:
                bool, draw the predicted-vs-actual plots
            residuals:
                bool, draw the residual error plot
        OUTPUT:
            None. A message is printed instead of plotting when y is empty
            or its length differs from the number of sampled predictions.
    """
    # Guard clauses first: no sampling or prediction work is done unless a
    # plot can actually be produced.
    if not is_continuous:
        return None
    if len(y) == 0:
        print('No y, so no regressions included')
        return None

    # DataFrame.sample refuses to draw more rows than exist (without
    # replacement), so never ask for more than the dataframe holds.
    n_rows = sample_limit if sample_limit is None else min(sample_limit,
                                                           len(df))
    df_X_sample = df.sample(n_rows).drop(y_var_name, axis=1)
    y_hat_sample = model.predict(df_X_sample)

    if len(y) != len(y_hat_sample):
        print('len(y) != len(y_hat), so no regressions included')
        return None

    # NOTE(review): df.sample returns rows in random order while y is used
    # exactly as passed in, so y and the predictions are presumably not
    # row-aligned -- confirm against the caller.
    y_hat_flat = np.asarray(y_hat_sample).reshape(-1)

    if predicteds_vs_actuals:
        (continuous_features,
         category_features) = sort_features(df_X_sample)
        timeit(plot_many_predicteds_vs_actuals,
               df_X_sample,
               continuous_features,
               y,
               y_hat_flat,
               n_bins=50)
        plt.show()
        # add feature to jitter plot to categorical features
        # add cdf???
    if residuals:
        fig, ax = plt.subplots()
        # np.asarray lets y be a pandas Series or a list as well as an
        # ndarray; Series has no usable .reshape.
        timeit(plot_residual_error,
               ax,
               df_X_sample.values[:, 0].reshape(-1),
               np.asarray(y).reshape(-1),
               y_hat_flat,
               s=30)
        plt.show()
    return None
Exemple #6
0
def compare_predictions(df,
                        y_var_name,
                        percent_data=None,
                        category_limit=11,
                        knots=3,
                        alphas=np.logspace(start=-2, stop=10, num=50),
                        corr_matrix=True,
                        scatter_matrix=True,
                        bootstrap_coefs=True,
                        feature_importances=True,
                        partial_dep=True,
                        actual_vs_predicted=True,
                        residuals=True,
                        univariates=True,
                        compare_models=True,
                        ROC=True,
                        bootstraps=10):
    """Clean a dataframe, fit several models on it and plot diagnostics.

        INPUT:
            df:
                dataframe containing the predictors and the y column
            y_var_name:
                string, the column name of the dependent y variable in
                the dataframe
            percent_data:
                forwarded to clean_dataframe
            category_limit:
                forwarded to drop_category_exeeding_limit
            knots:
                accepted for compatibility; not used in this function
            alphas:
                regularisation strengths forwarded to make_models and
                plot_choose_alpha
            corr_matrix, scatter_matrix:
                bool, draw the plot when continuous features exist
            bootstrap_coefs:
                bool, plot bootstrapped coefficients (continuous y, models
                with coef_ only)
            feature_importances:
                bool, plot importances for models exposing
                feature_importances_
            partial_dep:
                bool, plot partial dependences
            actual_vs_predicted:
                bool, draw the predicted-vs-actual plots
            residuals:
                bool, draw the residual error plot
            univariates:
                bool, forwarded to make_models
            compare_models:
                bool, draw the model comparison plots
            ROC:
                bool, plot ROC curves when y is not continuous
            bootstraps:
                int, number of bootstraps forwarded to
                bootstrap_train_premade
        OUTPUT:
            tuple (names, results, fit_models, pipeline, df_X, y_hats,
            errors)
    """
    starttotal = time()
    df, sample_limit = clean_dataframe(df, y_var_name, percent_data)

    # REMEMBER OLD DATAFRAME (drop returns a new frame, df is untouched)
    df_X_unpiped = df.drop(y_var_name, axis=1)
    (unpiped_continuous_features,
     unpiped_category_features) = sort_features(df_X_unpiped)

    # REMOVE CATEGORICAL VARIABLES THAT HAVE TOO MANY CATEGORIES TO BE USEFUL
    df = drop_category_exeeding_limit(df, y_var_name, category_limit)

    # SHOW CORRELATION MATRIX
    if corr_matrix:
        if len(unpiped_continuous_features) > 0:
            timeit(plt.matshow, df.sample(sample_limit).corr())

    # MAKE SCATTER MATRIX
    if scatter_matrix:
        if len(unpiped_continuous_features) > 0:
            timeit(plot_scatter_matrix, df, y_var_name, colors=True)
            plt.show()

    # TRANSFORM DATAFRAME
    print('DF COLUMNS: \n' + str(list(df.columns)) + '\n')
    df, df_X, X, y, pipeline = use_spline(df, y_var_name)
    print('DF COLUMNS AFTER TRANSFORM: \n' + str(list(df.columns)) + '\n')

    # MAKE MODELS
    (names_models, continuous_features, category_features, models, scoring,
     is_continuous, alphas) = make_models(df, df_X, y, y_var_name, univariates,
                                          alphas)

    # evaluate each model in turn
    fit_models, results, names, y_hats, errors = [], [], [], [], []

    # random_state only matters together with shuffle=True, and recent
    # scikit-learn rejects it otherwise. The folds stay unshuffled, exactly
    # as before, and the splitter is loop-invariant so it is built once.
    kfold = model_selection.KFold(n_splits=10)

    for name, model in tqdm.tqdm(names_models):
        # if not linear: change df_X to df_X unpiped
        if name == 'RR' or name == 'LASSO':
            alpha, cv_results = timeit(plot_choose_alpha, df, model,
                                       y_var_name, alphas, kfold, scoring)
            # For these entries `model` is a factory; build the estimator
            # with the chosen alpha.
            model = model(alpha)
        else:
            cv_results = timeit(cross_val_score,
                                model,
                                X,
                                y,
                                cv=kfold,
                                scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: mean=%f std=%f" % (name, cv_results.mean(),
                                      cv_results.std())
        print(msg)

        # FIT MODEL WITH ALL DATA
        model.fit(X, y)
        fit_models.append(model)

        # PLOT PREDICTED VS ACTUALS (now honours actual_vs_predicted)
        if is_continuous and actual_vs_predicted:
            timeit(plot_predicted_vs_actuals, df, model, y_var_name,
                   sample_limit)
            plt.show()

        # MAKE BOOTSTRAPS (only needed by the two plots that consume them)
        if bootstrap_coefs or partial_dep:
            bootstrap_models = bootstrap_train_premade(model,
                                                       X,
                                                       y,
                                                       bootstraps=bootstraps,
                                                       fit_intercept=False)

        # PLOT COEFFICIENTS
        if hasattr(model, "coef_"):
            coefs = model.coef_
            columns = list(df.drop(y_var_name, axis=1).columns)
            # Unwrap nested coefficient containers down to a flat list.
            while isinstance(coefs[0], (list, np.ndarray)):
                coefs = list(coefs[0])
            timeit(plot_coefs, coefs=coefs, columns=columns, graph_name=name)
            plt.show()

            # PLOT BOOTSTRAP COEFFICIENTS
            if is_continuous and bootstrap_coefs:
                fig, axs = timeit(plot_bootstrap_coefs,
                                  bootstrap_models,
                                  df_X.columns,
                                  n_col=4)
                fig.tight_layout()
                plt.show()

        # PLOT FEATURE IMPORTANCES
        if feature_importances:
            if 'feature_importances_' in dir(model):
                timeit(plot_feature_importances, model, df_X)
                plt.show()

        # PLOT PARTIAL DEPENDENCIES
        if partial_dep:
            timeit(plot_partial_dependences,
                   model,
                   X=df_X_unpiped,
                   var_names=unpiped_continuous_features,
                   y=y,
                   bootstrap_models=bootstrap_models,
                   pipeline=pipeline,
                   n_points=250)
            plt.tight_layout()
            plt.show()

        # PLOT PREDICTED VS ACTUALS / RESIDUALS
        # The caller's flags are forwarded; they used to be hard-coded True.
        plot_continuous_error_graphs(df,
                                     y,
                                     y_var_name,
                                     model,
                                     is_continuous,
                                     sample_limit,
                                     predicteds_vs_actuals=actual_vs_predicted,
                                     residuals=residuals)
        df_X = df.drop(y_var_name, axis=1)

        # GET ERROR
        y_hat, error = get_error(name, model, df_X, y, is_continuous)
        y_hats.append(y_hat)
        errors.append(error)

    # --COMPARE MODELS--
    if compare_models:
        choose_box_and_violin_plots(names, scoring, compare_models, results,
                                    is_continuous)
    # ROC CURVE
    if ROC:
        if not is_continuous:
            # Use the fitted estimators: for 'RR'/'LASSO' entries `models`
            # still holds the uncalled factory, not the fitted model.
            timeit(plot_rocs, fit_models, df_X, y)
            plt.show()
    print(f'MAKE SUBSAMPLE TIME: {time() - starttotal}')
    return names, results, fit_models, pipeline, df_X, y_hats, errors