def run_titanic():
    """Run the autoregression diagnostics on the Titanic passenger data.

    Downloads the Titanic CSV, keeps seven columns, recodes ``Pclass`` as
    text labels and ``Survived`` as booleans, then calls
    ``autoregression.compare_predictions`` with most plot flags switched on.

    Returns:
        None. The result of ``compare_predictions`` is discarded.
    """
    df_titanic = pd.read_csv(
        'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'
    )
    # Work on an explicit copy so the column assignments below never write
    # into a slice of the downloaded frame (avoids SettingWithCopyWarning).
    df_titanic = df_titanic[[
        'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'
    ]].copy()

    # df_titanic['Pclass'] = df_titanic['Pclass'].map(lambda x: str(x)+' class')
    # Turn the integer class code into a categorical text label by using the
    # code as an index into the label array; index 0 is only a placeholder.
    class_labels = np.array(['zero', 'class one', 'class two', 'class three'])
    df_titanic['Pclass'] = class_labels[df_titanic['Pclass'].to_numpy()]

    # In this dataset Survived == 1 marks a passenger who survived, so code 1
    # must map to True. The lookup table used to be [True, False], which
    # flipped every label.
    survived_lookup = np.array([False, True])
    df_titanic['Survived'] = survived_lookup[df_titanic['Survived'].to_numpy()]

    # NOTE(review): the target is passed in lower case although the column is
    # 'Survived'; presumably compare_predictions normalises column names --
    # confirm against the autoregression package.
    autoregression.compare_predictions(
        df_titanic,
        'survived',
        # percent_data=1,
        corr_matrix=True,
        # scatter_matrix=True,  # doesn't work IF categorical values of 4 groups in X.
        bootstrap_coefs=True,
        partial_dep=True,
        plot_alphas=True,
        plot_predicted_vs_actuals_flag=True,
        plot_coefs_flag=True,
        feature_importances=True,
        actual_vs_predicted=True,
        plot_predicteds_vs_actuals=True,
        residuals=True,
        univariates=True,
        compare_models=True,
        ROC=True,
        show_plots=True,
    )
def run_iris():
    """Run the autoregression diagnostics on the iris data with a noisy column.

    Downloads the iris CSV, adds a randomly filled ``foods`` column that
    mixes ordinary strings with NaN and infinity entries, and calls
    ``autoregression.compare_predictions`` with ``sepal_length`` as target.

    Returns:
        None. The result of ``compare_predictions`` is discarded.
    """
    iris_df = pd.read_csv(
        'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/iris.csv'
    )
    # autoregression.compare_predictions(iris_df,'sepal_length')

    # `np.nan` replaces the `np.NaN` alias, which NumPy 2.0 removed; the
    # value is identical on every NumPy version.
    food_options = ['hot dogs', 'bacon', 'sweets', np.nan, np.inf]
    iris_df['foods'] = np.random.choice(food_options, iris_df.shape[0])

    autoregression.compare_predictions(
        iris_df,
        'sepal_length',
        percent_data=1,
        # corr_matrix=True,
        scatter_matrix=True,
        bootstrap_coefs=True,
        partial_dep=True,
        plot_predicted_vs_actuals_flag=True,
        plot_coefs_flag=True,
        feature_importances=True,
        actual_vs_predicted=True,
        plot_predicteds_vs_actuals=True,
        residuals=True,
        univariates=True,
        compare_models=True,
        ROC=True,
    )
def main():
    """Build the classroom feature tables, fit the models and save the outputs.

    Steps, as visible in this function:

    1. Load posts and classrooms through the project loaders and read the
       pre-merged classrooms CSV.
    2. Build the target with ``make_y_was_any_posts_between`` over a window
       of ``365 // 2`` days relative to ``declare_now()``.
    3. Derive the "before now" and "before three months ago" tables.
    4. Run ``autoregression.compare_predictions`` on the three-months-ago
       table with ``'will_post_next_semester'`` as target.
    5. Pickle ``models2[3]`` to ``random_forrest.pkl`` and write ``df_X2`` to
       ``df_X2.csv``.

    Several intermediate results are never read afterwards. Their calls are
    kept because the helpers are defined elsewhere and may have side effects.

    Returns:
        None.
    """
    posts = load_posts()
    classrooms = load_classrooms()
    # Snap every post date to the first day of its month.
    posts['year_month'] = pd.to_datetime(posts['date']).map(
        lambda dt: dt.replace(day=1)
    )
    classrooms_merged = pd.read_csv(
        '../data_january/classrooms_merged_non_leak.csv'
    )

    now = declare_now()
    six_months = timedelta(days=365 // 2)
    # one_year = timedelta(days=365)

    last_month_posts_total = get_posts_total_between_with_zeros(
        posts, now, six_months
    )
    full_y, y = make_y_was_any_posts_between(
        posts, classrooms, now, six_months
    )
    # print(find_y_hearts(posts, now))
    # print(find_y_hearts(posts[posts['classroom_id']==1 | posts['classroom_id']==3], now))
    # y_hearts = get_all_y_hearts(posts, now)

    classrooms_merged_before_now = make_classrooms_merged_before(
        classrooms_merged, now
    )
    classrooms_merged_before_three_months_ago = make_classrooms_merged_before(
        classrooms_merged, now - timedelta(days=90)
    )
    classrooms_merged_before_now_dropped = drop_collumns(
        classrooms_merged_before_now
    )
    classrooms_merged_before_three_months_ago = drop_collumns(
        classrooms_merged_before_three_months_ago
    )

    classrooms_sum_before_now = make_classrooms_merged_before(
        classrooms_merged, now
    )
    classrooms_sum_before_now_dropped = make_classrooms_merged_before(
        classrooms_merged_before_now_dropped, now
    )
    classrooms_sum_before_three_months_ago = make_classrooms_merged_before(
        classrooms_merged_before_three_months_ago, now
    )

    check_merge_lengths(
        classrooms,
        classrooms_merged,
        posts,
        classrooms_merged_before_now_dropped,
        classrooms_sum_before_now,
        y,
        full_y,
    )

    # These two assignments replace the "sum" tables computed above.
    classrooms_sum_before_three_months_ago = make_classrooms_sum_(
        classrooms_merged_before_three_months_ago, full_y
    )
    classrooms_sum_before_now = make_classrooms_sum_(classrooms_merged, full_y)

    # classrooms_sum_hearts_before_three_months_ago = merged_to_sum_before(classrooms_merged, now-timedelta(days=90), y_hearts)
    # classrooms_sum_hearts_before_three_months_ago = make_classrooms_sum_hearts_before_now(classrooms_merged_before_three_months_ago)
    # classrooms_sum_hearts_before_now = make_classrooms_sum_hearts_before_now(classrooms_merged)
    # names3, results3, models3, pipeline3, df_X3 = autoregression.compare_predictions(classrooms_sum_hearts_before_three_months_ago, 'will_post_next_semester')

    names2, results2, models2, pipeline2, df_X2 = autoregression.compare_predictions(
        classrooms_sum_before_three_months_ago,
        'will_post_next_semester',
        corr_matrix=True,
        scatter_matrix=False,
        bootstrap_coefs=False,
        feature_importances=False,
        partial_dep=False,
        actual_vs_predicted=False,
        residuals=False,
        univariates=False,
        compare_models=False,
        ROC=False,
    )

    # The context manager closes the file even if pickling raises.
    # NOTE(review): index 3 is presumably the random-forest model, going by
    # the file name -- confirm against compare_predictions' model order.
    with open('random_forrest.pkl', 'wb') as output:
        pickle.dump(models2[3], output)

    df_X2.to_csv('df_X2.csv')
'Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare' ]] # df_titanic['Pclass'] = df_titanic['Pclass'].map(lambda x: str(x)+' class') df_titanic['Pclass'] = np.array( ['zero', 'class one', 'class two', 'class three'])[df_titanic['Pclass']] df_titanic['Survived'] = np.array([True, False])[df_titanic['Survived']] names, results, fit_models, pipeline, df_X, y_hats, errors = autoregression.compare_predictions( df_titanic, 'survived', percent_data=1, corr_matrix=True, # scatter_matrix=True, #doesn't work IF categorical values of 4 groups in X. bootstrap_coefs=True, partial_dep=True, plot_alphas=True, plot_predicted_vs_actuals_flag=True, plot_coefs_flag=True, feature_importances=True, actual_vs_predicted=True, plot_predicteds_vs_actuals=True, residuals=True, univariates=True, compare_models=True, ROC=True, ) # autoregression.compare_predictions(iris_df,'sepal_length', # feature_importances=False # ) from autoregression import clean_dataframe
# Demo script: run the autoregression diagnostics on scikit-learn's wine data
# with a binary target and a deliberately noisy categorical column.
import autoregression
import pandas as pd
import matplotlib
import numpy as np
# matplotlib.interactive(True)
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine

# Fix the global NumPy seed so the random 'foods' column is reproducible.
np.random.seed(seed=99)

wine = load_wine()
df = pd.DataFrame(wine.data, columns=wine.feature_names)

# `np.nan` replaces the `np.NaN` alias, which NumPy 2.0 removed; the value is
# identical on every NumPy version.
df['foods'] = np.random.choice(
    ['hot dogs', 'bacon', 'sweets', np.nan, np.inf], df.shape[0]
)

# df['wine_class'] = wine['target_names'][wine.target]  # This has 3 classifications!!!
# Collapse the three wine classes to a boolean: is this sample 'class_1'?
df['wine_class'] = (wine['target_names'][wine.target] == 'class_1')

# NOTE(review): other callers of compare_predictions pass
# `plot_predicted_vs_actuals_flag`; here the keyword has no `_flag` suffix --
# confirm which spelling the installed autoregression version accepts.
autoregression.compare_predictions(
    df,
    'wine_class',
    corr_matrix=True,
    scatter_matrix=True,
    bootstrap_coefs=True,
    partial_dep=True,
    plot_predicted_vs_actuals=True,
    plot_coefs_flag=True,
    feature_importances=True,
    actual_vs_predicted=True,
    plot_predicteds_vs_actuals=True,
    residuals=True,
    univariates=True,
    compare_models=True,
    ROC=True,
)