Beispiel #1
0
 def test_clean_df_y_nan(self):
     """Expect the row whose target column 'fourth' is NaN to be dropped.

     The input has three rows, the first with NaN in 'fourth'; the
     expected frame holds only the remaining two rows, as floats.
     """
     column_names = ['first', 'second', 'third', 'fourth']
     raw_rows = [[1, 2, 3, np.nan], [5, 6, 7, 8], [9, 10, 11, 12]]
     expected_rows = [[5.0, 6.0, 7.0, 8.0], [9.0, 10.0, 11.0, 12.0]]

     df = pd.DataFrame(raw_rows, columns=column_names)
     df_clean_sample = pd.DataFrame(expected_rows, columns=column_names)

     # The second return value is not needed by this test.
     df_cleaned, _ = clean_dataframe(df, 'fourth', percent_data=1)

     # Index labels are reset on both sides so only the values are compared.
     actual = df_cleaned.reset_index(drop=True).to_dict()
     expected = df_clean_sample.reset_index(drop=True).to_dict()
     assert actual == expected
Beispiel #2
0
 def test_clean_df_y_x_inf(self):
     """Check cleaning when the target holds inf and features hold NaN.

     The input's first row has ``np.inf`` in the target column 'fourth'
     (and NaN in 'second'); its third row has NaN in 'third'. The
     expected frame omits the first row, carries a filled-in 'third'
     value for the third row, and adds a 'third_was_null' indicator
     column.
     """
     df = pd.DataFrame([[1, np.nan, 3, np.inf], [5, 6, 7, 8],
                        [9, 10, np.nan, 12], [13, 14, 15, 16]],
                       columns=['first', 'second', 'third', 'fourth'])
     df_clean_sample = pd.DataFrame(
         [[5.0, 6.0, 7.0, False, 8.0], [9.0, 10.0, 10.5, True, 12.0],
          [13.0, 14.0, 15.0, False, 16.0]],
         columns=['first', 'second', 'third', 'third_was_null', 'fourth'])
     # The prints are kept so that a failing run shows all three frames.
     print(df)
     df_clean_sample = df_clean_sample.reset_index(drop=True)
     print(df_clean_sample)
     # The second return value is not needed by this test.
     df_cleaned, _ = clean_dataframe(df, 'fourth', percent_data=1)
     # Restore row order and renumber so the index matches the fixture.
     df_cleaned = df_cleaned.sort_index().reset_index(drop=True)
     print(df_cleaned)
     # Compare against the expected fixture. The previous assertion compared
     # df_cleaned with itself and therefore could never fail.
     # NOTE(review): the 10.5 fill value in the fixture has not been
     # verified against clean_dataframe -- confirm it if this now fails.
     assert df_cleaned.to_dict() == df_clean_sample.to_dict()
Beispiel #3
0
 def test_clean_dataframe(self):
     """Check clean_dataframe on a frame with inf and NaN in feature columns.

     The input has ``np.inf`` in 'second' (first row) and NaN in 'third'
     (sixth row); the target column is 'first'. The expected frame has
     those cells filled with 3.5 and 4.4, adds the indicator columns
     'second_was_inf' and 'third_was_null', and places the target
     column 'first' last.
     """
     # np.nan is the canonical spelling; the np.NAN alias is not available
     # in newer NumPy releases.
     df1 = pd.DataFrame([[1, np.inf, 3], [2, 3, 4], [3, 4, 5], [2, 3, 4],
                         [3, 4, 5], [2, 3, np.nan], [3, 4, 5], [2, 3, 4],
                         [3, 4, 5], [2, 3, 4], [3, 4, 5]],
                        columns=['first', 'second', 'third'])
     y_var_name = 'first'
     # The second return value is not needed by this test.
     df2, _ = clean_dataframe(df1, y_var_name, percent_data=None)
     expected = pd.DataFrame(
         [[3.5, 3, True, False, 1], [3, 4, False, False, 2],
          [4, 5, False, False, 3], [3, 4, False, False, 2],
          [4, 5, False, False, 3], [3, 4.4, False, True, 2],
          [4, 5, False, False, 3], [3, 4, False, False, 2],
          [4, 5, False, False, 3], [3, 4, False, False, 2],
          [4, 5, False, False, 3]],
         columns=[
             'second', 'third', 'second_was_inf', 'third_was_null',
             'first'
         ])
     # Index labels are reset on both sides before the comparison.
     self.assertTrue(
         df2.reset_index(drop=True).equals(expected.reset_index(drop=True)))
    corr_matrix=True,
    # scatter_matrix=True, #doesn't work IF categorical values of 4 groups in X.
    bootstrap_coefs=True,
    partial_dep=True,
    plot_alphas=True,
    plot_predicted_vs_actuals_flag=True,
    plot_coefs_flag=True,
    feature_importances=True,
    actual_vs_predicted=True,
    plot_predicteds_vs_actuals=True,
    residuals=True,
    univariates=True,
    compare_models=True,
    ROC=True,
)
# autoregression.compare_predictions(iris_df,'sepal_length',
#                         feature_importances=False
# )

from autoregression import clean_dataframe

# Clean the Titanic frame with 'Survived' as the target column.
_cleaning_result = clean_dataframe(df_titanic, 'Survived', percent_data=1)
df_titanic_cleaned, sample_limit = _cleaning_result

# Pass the cleaned frame through the pipeline object defined earlier.
df_titanic_transformed = pipeline.transform(df_titanic_cleaned)

print(df_titanic_transformed.columns)
print(fit_models)

# Show predict_proba output of the first entry in fit_models.
_first_model = fit_models[0]
print(_first_model.predict_proba(df_titanic_transformed))