Beispiel #1
0
def run_titanic():
    """Run the autoregression model comparison on the Titanic dataset.

    Downloads the Titanic CSV, keeps the target plus six predictor columns,
    recodes ``Pclass`` as a text label and ``Survived`` as a boolean, then
    calls ``autoregression.compare_predictions`` with most diagnostic plots
    switched on.
    """
    df_titanic = pd.read_csv(
        'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
    )
    # Work on an explicit copy so the column assignments below write to an
    # independent frame rather than triggering SettingWithCopyWarning.
    df_titanic = df_titanic[[
        'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'
    ]].copy()
    # Lookup table indexed by the integer class; index 0 is only a
    # placeholder so that classes 1..3 line up with their labels.
    pclass_labels = np.array(
        ['zero', 'class one', 'class two', 'class three'])
    df_titanic['Pclass'] = pclass_labels[df_titanic['Pclass']]
    # Index 0 -> False, index 1 -> True.  The previous table was
    # [True, False], which flipped the label (0 became True).
    df_titanic['Survived'] = np.array([False, True])[df_titanic['Survived']]
    autoregression.compare_predictions(
        df_titanic,
        # NOTE(review): the frame's column is 'Survived'; the lowercase name
        # is passed unchanged from the original call -- confirm that
        # compare_predictions normalizes column names.
        'survived',
        corr_matrix=True,
        # scatter_matrix is left off: it doesn't work if X has categorical
        # values of 4 groups.
        bootstrap_coefs=True,
        partial_dep=True,
        plot_alphas=True,
        plot_predicted_vs_actuals_flag=True,
        plot_coefs_flag=True,
        feature_importances=True,
        actual_vs_predicted=True,
        plot_predicteds_vs_actuals=True,
        residuals=True,
        univariates=True,
        compare_models=True,
        ROC=True,
        show_plots=True,
    )
def run_iris():
    """Run the autoregression model comparison on the iris dataset.

    Downloads the iris CSV, adds a random ``foods`` column drawn from a small
    pool of labels (including NaN and inf entries), and calls
    ``autoregression.compare_predictions`` with ``sepal_length`` as the
    target and most diagnostic plots switched on.
    """
    iris_df = pd.read_csv(
        'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'
    )
    # np.nan instead of np.NaN: the capitalised alias was removed in
    # NumPy 2.0 and raises AttributeError there; np.nan works everywhere.
    food_pool = ['hot dogs', 'bacon', 'sweets', np.nan, np.inf]
    iris_df['foods'] = np.random.choice(food_pool, iris_df.shape[0])
    autoregression.compare_predictions(
        iris_df,
        'sepal_length',
        percent_data=1,
        scatter_matrix=True,
        bootstrap_coefs=True,
        partial_dep=True,
        plot_predicted_vs_actuals_flag=True,
        plot_coefs_flag=True,
        feature_importances=True,
        actual_vs_predicted=True,
        plot_predicteds_vs_actuals=True,
        residuals=True,
        univariates=True,
        compare_models=True,
        ROC=True,
    )
Beispiel #3
0
def main():
    """Build the classroom feature tables, fit models, and save the results.

    Loads posts and classrooms, derives the target with
    ``make_y_was_any_posts_between`` over a half-year window, builds the
    "before now" and "before three months ago" classroom tables, runs
    ``autoregression.compare_predictions`` on the three-months-ago table with
    ``will_post_next_semester`` as the target, then pickles ``models2[3]`` to
    ``random_forrest.pkl`` and writes ``df_X2`` to ``df_X2.csv``.
    """
    posts = load_posts()
    classrooms = load_classrooms()
    # Truncate every post date to the first day of its month.
    posts['year_month'] = pd.to_datetime(posts['date']).map(lambda dt: dt.replace(day=1))
    classrooms_merged = pd.read_csv('../data_january/classrooms_merged_non_leak.csv')
    now = declare_now()
    six_months = timedelta(days=365//2)

    # NOTE(review): several results below (last_month_posts_total,
    # classrooms_sum_before_now_dropped, the first
    # classrooms_sum_before_three_months_ago) are never read.  The calls are
    # kept in their original order because the helpers' side effects are not
    # visible from here -- confirm before removing them.
    last_month_posts_total = get_posts_total_between_with_zeros(posts, now, six_months)
    full_y, y = make_y_was_any_posts_between(posts, classrooms, now, six_months)

    classrooms_merged_before_now = make_classrooms_merged_before(classrooms_merged, now)
    classrooms_merged_before_three_months_ago = make_classrooms_merged_before(classrooms_merged, now-timedelta(days=90))
    classrooms_merged_before_now_dropped = drop_collumns(classrooms_merged_before_now)
    classrooms_merged_before_three_months_ago = drop_collumns(classrooms_merged_before_three_months_ago)
    classrooms_sum_before_now = make_classrooms_merged_before(classrooms_merged, now)
    classrooms_sum_before_now_dropped = make_classrooms_merged_before(classrooms_merged_before_now_dropped, now)
    classrooms_sum_before_three_months_ago = make_classrooms_merged_before(classrooms_merged_before_three_months_ago, now)
    check_merge_lengths(classrooms, classrooms_merged, posts, classrooms_merged_before_now_dropped, classrooms_sum_before_now, y, full_y)

    classrooms_sum_before_three_months_ago = make_classrooms_sum_(classrooms_merged_before_three_months_ago, full_y)
    classrooms_sum_before_now = make_classrooms_sum_(classrooms_merged, full_y)

    # Only the correlation matrix is requested; every other plot is disabled.
    names2, results2, models2, pipeline2, df_X2 = autoregression.compare_predictions(
        classrooms_sum_before_three_months_ago,
        'will_post_next_semester',
        corr_matrix=True,
        scatter_matrix=False,
        bootstrap_coefs=False,
        feature_importances=False,
        partial_dep=False,
        actual_vs_predicted=False,
        residuals=False,
        univariates=False,
        compare_models=False,
        ROC=False,
    )

    # Context manager guarantees the file is closed even if pickling fails.
    # NOTE(review): index 3 is presumably the random forest, judging by the
    # file name -- confirm against the model order compare_predictions returns.
    with open('random_forrest.pkl', 'wb') as output:
        pickle.dump(models2[3], output)
    df_X2.to_csv('df_X2.csv')
    'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'
]]
# Recode the Titanic columns and run the model comparison.
# Lookup table indexed by the integer class; index 0 is only a placeholder
# so that classes 1..3 line up with their labels.
df_titanic['Pclass'] = np.array(
    ['zero', 'class one', 'class two', 'class three'])[df_titanic['Pclass']]
# Index 0 -> False, index 1 -> True.  The previous table was [True, False],
# which flipped the label (0 became True).
df_titanic['Survived'] = np.array([False, True])[df_titanic['Survived']]
names, results, fit_models, pipeline, df_X, y_hats, errors = autoregression.compare_predictions(
    df_titanic,
    # NOTE(review): the frame's column is 'Survived'; the lowercase name is
    # passed unchanged from the original call -- confirm that
    # compare_predictions normalizes column names.
    'survived',
    percent_data=1,
    corr_matrix=True,
    # scatter_matrix is left off: it doesn't work if X has categorical
    # values of 4 groups.
    bootstrap_coefs=True,
    partial_dep=True,
    plot_alphas=True,
    plot_predicted_vs_actuals_flag=True,
    plot_coefs_flag=True,
    feature_importances=True,
    actual_vs_predicted=True,
    plot_predicteds_vs_actuals=True,
    residuals=True,
    univariates=True,
    compare_models=True,
    ROC=True,
)
# autoregression.compare_predictions(iris_df,'sepal_length',
#                         feature_importances=False
# )

from autoregression import clean_dataframe
Beispiel #5
0
import autoregression
import pandas as pd
import matplotlib
import numpy as np
np.random.seed(seed=99)
# matplotlib.interactive(True)
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine

# Build a DataFrame from scikit-learn's wine dataset, add a random 'foods'
# column and a boolean target, then run the model comparison.
wine = load_wine()
df = pd.DataFrame(wine.data, columns=wine.feature_names)
# np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0
# and raises AttributeError there; np.nan works on every NumPy version.
df['foods'] = np.random.choice(['hot dogs', 'bacon', 'sweets', np.nan, np.inf],
                               df.shape[0])
# The raw target has 3 classes; collapse it to a boolean "is class_1" flag.
df['wine_class'] = (wine['target_names'][wine.target] == 'class_1')

autoregression.compare_predictions(df,
                                   'wine_class',
                                   corr_matrix=True,
                                   scatter_matrix=True,
                                   bootstrap_coefs=True,
                                   partial_dep=True,
                                   # NOTE(review): other calls spell this
                                   # plot_predicted_vs_actuals_flag -- confirm
                                   # which keyword compare_predictions accepts.
                                   plot_predicted_vs_actuals=True,
                                   plot_coefs_flag=True,
                                   feature_importances=True,
                                   actual_vs_predicted=True,
                                   plot_predicteds_vs_actuals=True,
                                   residuals=True,
                                   univariates=True,
                                   compare_models=True,
                                   ROC=True)