Ejemplo n.º 1
0
def clean_dataframe(df, y_var_name, percent_data):
    """Rename, subsample and clean ``df`` and compute its sample limit.

    INPUT:
        df:
            dataframe to prepare
        y_var_name:
            string, name of the target column; it is snake-cased here
            before being handed to ``cleandata.clean_df``
        percent_data:
            forwarded unchanged to ``take_subsample``
    OUTPUT:
        (df, sample_limit): the cleaned dataframe and the value returned
        by ``make_sample_limit`` for it
    """
    renamed = cleandata.rename_columns(df)
    target = stringcase.snakecase(y_var_name)
    # Two passes, so a run of up to four underscores collapses to one.
    for _ in range(2):
        target = target.replace('__', '_')
    subsampled = timeit(take_subsample, renamed, percent_data)
    cleaned = timeit(cleandata.clean_df, subsampled, target)
    return cleaned, make_sample_limit(cleaned)
Ejemplo n.º 2
0
def clean_dataframe(df, y_var_name, percent_data):
    """Rename and clean ``df`` and cap the plotting sample size at 300 rows.

    INPUT:
        df:
            dataframe to prepare
        y_var_name:
            string, name of the target column; it is snake-cased here
            before being handed to ``clean_df``
        percent_data:
            currently unused, because subsampling is switched off (see below)
    OUTPUT:
        (df, sample_limit): the cleaned dataframe and
        ``min(300, number of rows)``
    """
    renamed = rename_columns(df)
    target = stringcase.snakecase(y_var_name)
    # Two passes, so a run of up to four underscores collapses to one.
    for _ in range(2):
        target = target.replace('__', '_')
    # Subsampling (take_subsample with percent_data) is deliberately disabled:
    # it was observed to break the ROC plot.
    cleaned = timeit(clean_df, renamed, target)
    row_cap = np.min([300, cleaned.shape[0]])
    return cleaned, row_cap
Ejemplo n.º 3
0
 def test_rename_columns(self):
     """rename_columns should snake-case every column label and keep the data."""
     rows = [[1, 2, 3, 4], [1, 2, 3, 4]]
     raw_labels = ['First', 'Second_and THIRD', '__Fourth', 'FIF.TH aND sixth']
     snake_labels = ['first', 'second_and_third', '_fourth', 'fif_th_and_sixth']

     df = pd.DataFrame(rows, columns=raw_labels)
     df_snake = pd.DataFrame(rows, columns=snake_labels)
     df_rename = rename_columns(df)

     # Compare the labels pairwise first, then the full contents.
     label_pairs = zip(df_snake.columns, df_rename.columns)
     labels_match = all(expected == actual for expected, actual in label_pairs)
     self.assertTrue(labels_match)
     assert_dict_equal(df_snake.to_dict(), df_rename.to_dict())
Ejemplo n.º 4
0
def compare_predictions(df,
                        y_var_name,
                        percent_data=None,
                        category_limit=11,
                        knots=3,
                        corr_matrix=True,
                        scatter_matrix=True,
                        bootstrap_coefs=True,
                        feature_importances=True,
                        partial_dep=True,
                        actual_vs_predicted=True,
                        residuals=True,
                        univariates=True,
                        compare_models=True,
                        ROC=True,
                        bootstraps=10):
    """Clean a dataframe, cross-validate several models on it and plot
    diagnostics for each of them.

        INPUT:
            df:
                dataframe containing the predictors and the target column
            y_var_name:
                string, name of the target column (snake-cased internally so
                it matches the renamed columns)
            percent_data:
                fraction handed to `df.sample(frac=...)`. When None, the
                frame is repeatedly down-sampled to 10% while it has more
                than 1000 rows.
            category_limit:
                forwarded to `cleandata.drop_category_exeeding_limit`
            knots:
                int, forwarded to `auto_spline_pipeliner`
            corr_matrix, scatter_matrix, bootstrap_coefs,
            feature_importances, partial_dep, actual_vs_predicted,
            residuals, univariates, compare_models, ROC:
                booleans switching the corresponding plot on or off
            bootstraps:
                int, forwarded to `bootstrap_train_premade`
        OUTPUT:
            names:
                list of model labels, in evaluation order
            results:
                list of cross-validation score arrays, one per model
            models:
                list of the models, each fitted on all of the data
            pipeline:
                the fitted transformation pipeline
            df_X:
                the transformed predictors (without the target column)
    """
    df = cleandata.rename_columns(df)
    y_var_name = stringcase.snakecase(y_var_name).replace('__', '_')
    start = time()
    if percent_data is None:
        while len(df) > 1000:
            print(f"""'percent_data' NOT SPECIFIED AND len(df)=({len(df)})
                  IS > 1000: TAKING A RANDOM %10 OF THE SAMPLE""")
            df = df.sample(frac=.1)
    else:
        df = df.sample(frac=percent_data)
    print(f'MAKE SUBSAMPLE TIME: {time() - start}')
    start = time()
    df = cleandata.clean_df(df, y_var_name)
    print(f'CLEAN_DF TIME: {time()-start}')

    # REMEMBER THE UNTRANSFORMED PREDICTORS (used for partial dependences)
    df_X_unpiped = df.drop(y_var_name, axis=1)
    unpiped_continuous_features, _ = sort_features(df_X_unpiped)

    # REMOVE CATEGORICAL VARIABLES THAT HAVE TOO MANY CATEGORIES TO BE USEFUL
    df = cleandata.drop_category_exeeding_limit(df, y_var_name, category_limit)

    # The sample limit is needed by several later plots, so it must be
    # defined whether or not the correlation matrix is requested.
    sample_limit = min(len(df), 300)

    # SHOW CORRELATION MATRIX
    if corr_matrix:
        start = time()
        plt.matshow(df.sample(sample_limit).corr())
        plt.show()
        print(f'PLOT CORRELATION TIME: {time() - start}')

    # MAKE SCATTER MATRIX
    if scatter_matrix:
        start = time()
        galgraphs.plot_scatter_matrix(df, y_var_name)
        plt.show()
        print(f'MAKE SCATTER TIME: {time() - start}')
        print()

    print('DF COLUMNS: ')
    print(str(list(df.columns)))
    print()

    # TRANSFORM DATAFRAME
    df_X = df.drop(y_var_name, axis=1)
    pipeline = auto_spline_pipeliner(df_X, knots=knots)
    pipeline.fit(df_X)
    df_X = pipeline.transform(df_X)
    X = df_X.values
    y = df[y_var_name]
    # Copy before attaching the target so that df_X stays predictors-only;
    # otherwise the target column leaks into every use of df_X below.
    df = df_X.copy()
    df[y_var_name] = y
    print('DF COLUMNS AFTER TRANSFORM: ')
    print(str(list(df.columns)))
    print()

    # CHOOSE MODELS FOR CONTINUOUS OR CATEGORICAL Y
    names_models = []
    print(len(y.unique()))

    continuous_features, _ = sort_features(df)
    is_continuous = (y_var_name in continuous_features)
    if is_continuous:
        print('Y VARIABLE: "' + y_var_name + '" IS CONTINUOUS')
        print()
        if univariates:
            galgraphs.plot_many_univariates(df, y_var_name)
            plt.show()
        alphas = np.logspace(start=-2, stop=5, num=5)
        names_models.append(('RR', RidgeCV(alphas=alphas)))
        names_models.append(('LASSO', LassoCV(alphas=alphas)))
        names_models.append(('DT', DecisionTreeRegressor()))
        names_models.append(('RF', RandomForestRegressor()))
        names_models.append(('GB', GradientBoostingRegressor()))
        # Own label, so it is not confused with gradient boosting in plots.
        names_models.append(('AB', AdaBoostRegressor()))
        scoring = 'neg_mean_squared_error'
    else:
        alphas = np.logspace(start=-5, stop=5, num=5)
        print('Y VARIABLE: "' + y_var_name + '" IS CATEGORICAL')
        print()
        names_models.append(('LR', LogisticRegression()))
        # The L1 penalty needs a solver that supports it; 'liblinear' was
        # the default solver in older scikit-learn releases.
        names_models.append(
            ('LASSOish', LogisticRegression(penalty='l1',
                                            solver='liblinear')))
        names_models.append(('LDA', LinearDiscriminantAnalysis()))
        names_models.append(('RR', RidgeClassifierCV(alphas=alphas)))
        names_models.append(('KNN', KNeighborsClassifier()))
        names_models.append(('DT', DecisionTreeClassifier()))
        names_models.append(('RF', RandomForestClassifier()))
        names_models.append(('GB', GradientBoostingClassifier()))
        # Own label, so it is not confused with the decision tree in plots.
        names_models.append(('AB', AdaBoostClassifier()))
        scoring = 'accuracy'
    models = [model for _, model in names_models]

    # EVALUATE EACH MODEL IN TURN
    results = []
    names = []
    for name, model in tqdm.tqdm(names_models):

        # CROSS VALIDATE MODELS
        start = time()
        # No random_state: without shuffle=True it has no effect, and
        # current scikit-learn rejects that combination.
        kfold = model_selection.KFold(n_splits=10)
        cv_results = model_selection.cross_val_score(model,
                                                     X,
                                                     y,
                                                     cv=kfold,
                                                     scoring=scoring)

        results.append(cv_results)
        names.append(name)
        msg = "%s: mean=%f std=%f" % (name, cv_results.mean(),
                                      cv_results.std())
        print(msg)
        plt.show()

        print(f'CV CALC TIME: {time()-start}')

        # TODO: add a grid search here

        # FIT MODEL WITH ALL DATA
        model.fit(X, y)

        # PLOT PREDICTED VS ACTUALS
        start = time()
        if is_continuous:
            galgraphs.plot_predicted_vs_actuals(df, model, y_var_name,
                                                sample_limit)
            plt.show()

        print(f'PLOT PREDICTED VS ACTUALS TIME: {time() - start}')

        # MAKE BOOTSTRAPS
        if bootstrap_coefs or partial_dep:
            bootstrap_models = bootstrap_train_premade(model,
                                                       X,
                                                       y,
                                                       bootstraps=bootstraps,
                                                       fit_intercept=False)

        # PLOT COEFFICIANTS
        if hasattr(model, "coef_"):
            start = time()
            coefs = model.coef_
            # Unwrap nested coefficient arrays down to a flat list.
            while isinstance(coefs[0], (list, np.ndarray)):
                coefs = list(coefs[0])
            galgraphs.plot_coefs(coefs=coefs,
                                 columns=list(df_X.columns),
                                 graph_name=name)
            plt.show()
            print(f'PLOT COEFFICIANTS TIME: {time() - start}')

            # PLOT BOOTSTRAP COEFFICIANTS
            if is_continuous and bootstrap_coefs:
                start = time()
                fig, axs = plot_bootstrap_coefs(bootstrap_models,
                                                df_X.columns,
                                                n_col=4)
                fig.tight_layout()
                plt.show()
                print(f'PLOT BOOTSTRAP COEFFICIANTS TIME: {time() - start}')

        # PLOT FEATURE IMPORTANCES
        if feature_importances and hasattr(model, 'feature_importances_'):
            start = time()
            galgraphs.plot_feature_importances(model, df_X)
            plt.show()
            print(f'PLOT FEATURE IMPORTANCES TIME: {time() - start}')

        # PLOT PARTIAL DEPENDENCIES
        if partial_dep:
            start = time()
            plot_partial_dependences(model,
                                     X=df_X_unpiped,
                                     var_names=unpiped_continuous_features,
                                     y=y,
                                     bootstrap_models=bootstrap_models,
                                     pipeline=pipeline,
                                     n_points=250)
            plt.show()
            print(
                f'PLOT CONTINUOUS PARTIAL DEPENDENCIES TIME: {time() - start}')
            # TODO: partial dependences for categorical variables are not
            # implemented yet.

        # PLOT PREDICTEDS VS ACTUALS AND RESIDUALS ON A SAMPLE
        if is_continuous:
            # Take predictors and target from the same sampled rows so the
            # predictions line up with the actual values they are compared to.
            df_sample = df.sample(sample_limit)
            df_X_sample = df_sample.drop(y_var_name, axis=1)
            y_sample = df_sample[y_var_name]
            y_hat_sample = model.predict(df_X_sample)
            # NOTE(review): as before, these plots are only drawn when the
            # sample covers every row (at most 300 rows) - confirm whether
            # they should also be drawn for larger frames.
            if len(y) == 0:
                print('No y, so no regressions included')
            elif len(y) != len(y_hat_sample):
                print('len(y) != len(y_hat), so no regressions included')
            else:
                if actual_vs_predicted:
                    sample_continuous, _ = sort_features(df_X_sample)
                    start = time()
                    galgraphs.plot_many_predicteds_vs_actuals(
                        df_X_sample,
                        sample_continuous,
                        y_sample,
                        y_hat_sample.reshape(-1),
                        n_bins=50)
                    plt.show()
                    print(f'PLOT PREDICTEDS_VS_ACTUALS TIME: {time() - start}')
                if residuals:
                    start = time()
                    fig, ax = plt.subplots()
                    # Go through .values: a pandas Series has no reshape.
                    galgraphs.plot_residual_error(
                        ax,
                        df_X_sample.values[:, 0].reshape(-1),
                        y_sample.values.reshape(-1),
                        y_hat_sample.reshape(-1),
                        s=30)
                    plt.show()
                    print(f'PLOT RESIDUAL ERROR TIME: {time() - start}')

        # GET ERROR
        if is_continuous:
            y_hat = model.predict(df_X)
            print(f'{name}: MSE = {np.mean((y_hat-y)**2)}')
        else:
            # NOTE(review): both figures below are printed as "logloss" but
            # neither is the conventional negative log loss: the first is a
            # mean log-likelihood built from column 0 of predict_proba, the
            # second a squared error built from the first row of
            # decision_function. Left unchanged - confirm what is intended.
            if hasattr(model, 'predict_proba'):
                y_hat = model.predict_proba(df_X)[:, 0]
                logloss = np.mean(y * np.log(y_hat) +
                                  (1 - y) * np.log(1 - y_hat))
                print(f'{name}: logloss = {logloss}')
            if hasattr(model, 'decision_function'):
                d = model.decision_function(df_X)[0]
                y_hat = np.exp(d) / np.sum(np.exp(d))
                print(f'{name}: logloss = {np.mean((y_hat-y)**2)}')

    # --COMPARE MODELS--
    if compare_models:
        start = time()
        if is_continuous:
            # Scores are negated MSE; flip the sign so the plot shows MSE.
            negresults = [-1 * result for result in results]
            galgraphs.plot_box_and_violins(names, scoring, negresults)
        else:
            galgraphs.plot_box_and_violins(names, scoring, results)
        plt.show()
        print(f'PLOT BAR AND VIOLIN TIME: {time() - start}')

    # ROC CURVE
    if ROC and not is_continuous:
        start = time()
        galgraphs.plot_rocs(models, df_X, y)
        plt.show()
        print(f'PLOT ROC TIME: {time() - start}')

    return names, results, models, pipeline, df_X
Ejemplo n.º 5
0
import sys
import os
import autoregression
from autoregression import cleandata, galgraphs
import importlib
import warnings
import pandas
import matplotlib.pyplot as plt

# Load the Pima Indians diabetes CSV; column labels are supplied explicitly
# via `names` (presumably because the file has no header row - confirm).
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = 'preg plas pres skin test mass pedi age class'.split()
df = pandas.read_csv(url, names=names)
# Raw values captured before the columns are renamed.
array = df.values
df = cleandata.rename_columns(df)
# Preview of the first rows (only displayed in an interactive session).
df.head(3)
# Silence matplotlib's warning about too many open figures.
plt.rcParams['figure.max_open_warning'] = 0