def test_imputation(self):
    """Compare ``utils.imputation`` on a NormalizedMatrix with a dense reference.

    Two scenarios are checked. In each one the reference matrix ``m`` is
    built by horizontally stacking ``s`` with the rows of every ``r[i]``
    selected by ``k[i]``; its NaNs are then filled by hand and the column
    sums are compared with the column sums of the imputed NormalizedMatrix.

    Both comparisons use ``assert_almost_equal``: the column sums are
    floating-point values, so an exact comparison could fail on rounding
    differences alone.
    """
    # Scenario 1: a single attribute table, imputation called without ``axis``.
    s = np.matrix(
        [[1.0, np.nan], [4.0, 3.0], [5.0, 6.0], [8.0, 7.0], [9.0, 1.0]]
    )
    k = self.k
    r = [np.matrix([[np.nan, 2.2], [3.3, 4.4]])]

    m = np.hstack([s, r[0][k[0]]])
    # Reference fill: every NaN becomes the mean of all non-NaN entries.
    m[np.isnan(m)] = np.nanmean(m)

    n_matrix = nm.NormalizedMatrix(s, r, k)
    assert_almost_equal(
        utils.imputation(n_matrix).sum(axis=0), m.sum(axis=0))

    # Scenario 2: two attribute tables, imputation called with ``axis=0``.
    # ``s`` is rebuilt rather than reused so this scenario starts from
    # untouched input.
    s = np.matrix(
        [[1.0, np.nan], [4.0, 3.0], [5.0, 6.0], [8.0, 7.0], [9.0, 1.0]]
    )
    k = [np.array([0, 1, 1, 0, 1]), np.array([0, 1, 1, 0, 1])]
    r = [
        np.matrix([[np.nan, 2.2], [3.3, 4.4]]),
        np.matrix([[np.nan, 2.2], [3.3, 4.4]]),
    ]

    m = np.hstack([s, r[0][k[0]], r[1][k[1]]])
    # Reference fill: each NaN becomes the NaN-ignoring mean of its own column.
    mean = np.nanmean(m, axis=0)
    inds = np.where(np.isnan(m))
    m[inds] = np.take(mean, inds[1])

    n_matrix = nm.NormalizedMatrix(s, r, k)
    assert_almost_equal(
        utils.imputation(n_matrix, axis=0).sum(axis=0), m.sum(axis=0))
# ---------------------------------------------------------------- min_price
# Training data without the max_price column, combined with the test set.
train_min = pd.read_csv("train.csv").drop(columns=['max_price'])
test_min = pd.read_csv("test.csv")
df = utils.merge_train_test(train_min, test_min, 'min_price')

# Column groups; every column of df that is not listed here counts as numeric.
cat_vars = [
    'name', 'brand', 'base_name', 'cpu', 'cpu_details',
    'gpu', 'os', 'os_details', 'screen_surface',
]
dummy_vars = ['touchscreen', 'detachable_keyboard', 'discrete_gpu']
target_vars = ['min_price', 'max_price']
target = 'min_price'
num_vars = [
    column
    for column in df.columns
    if column not in (*cat_vars, *dummy_vars, *target_vars)
]
# The same list objects are passed on to utils.drop_columns below.
variable_lists = [cat_vars, dummy_vars, target_vars, num_vars]

# Preprocessing steps, delegated to the project's utils helpers.
df = utils.imputation(df)
utils.drop_columns(df, ['name', 'base_name', 'pixels_y'], variable_lists)
# Alternative categorical handling, currently disabled:
# utils.decrease_cat_size_handling(df, cat_vars, target)
# df = utils.one_hot_encoding(df, cat_vars)
utils.smooth_handling(df, cat_vars, target)

# Gradient-boosted regressor with fixed hyper-parameters.
estimator = xgb.XGBRegressor(
    n_estimators=200,
    max_depth=4,
    gamma=0.3,
    colsample_bytree=0.6,
    subsample=1,
    min_child_weight=15,
)
df_min = utils.fit_predict(df, estimator, target, 'id', 'MIN')
df_complete_predictions = utils.get_predictions(
    df, estimator, target, 'id', 'min_price_pred'
)

# ---------------------------------------------------------------- max_price
# NOTE(review): the *_min names are reused here although this section drops
# min_price instead of max_price - confirm against the code that follows.
train_min = pd.read_csv("train.csv").drop(columns=['min_price'])
test_min = pd.read_csv("test.csv")
# Load the test set and run it through the project's preprocessing helper.
test_min = utils.preprocessing(pd.read_csv("test.csv"))
df = utils.merge_train_test(train_min, test_min, 'min_price')

# Column groups; every column of df that is not listed here counts as numeric.
cat_vars = [
    'name', 'brand', 'base_name', 'cpu', 'cpu_details',
    'gpu', 'os', 'os_details', 'screen_surface',
]
dummy_vars = ['touchscreen', 'detachable_keyboard', 'discrete_gpu']
target_vars = ['min_price', 'max_price']
target = 'min_price'
num_vars = [
    column
    for column in df.columns
    if column not in (*cat_vars, *dummy_vars, *target_vars)
]
# The same list objects are passed on to utils.drop_columns below.
variable_lists = [cat_vars, dummy_vars, target_vars, num_vars]

# Preprocessing steps; the column selections come from names defined elsewhere.
df = utils.imputation(df)
utils.drop_columns(df, cols_to_be_dropped, variable_lists)
utils.decrease_cat_size_handling(df, decrease_cat_vars, target)
df = utils.one_hot_encoding(df, one_hot_cat_vars)
utils.smooth_handling(df, smooth_cat_vars, target)

# Fit a fresh clone of the first configured estimator.
estimator = clone(estimators[0])
X_train, y_train, X_test = utils.split_train_test_res(df, target, 'id')
estimator.fit(X_train, y_train)

# After fitting, X_train is rebuilt directly from df: rows flagged train == 1,
# minus the target, 'id' and 'train' columns.
is_train_row = df['train'] == 1
X_train = df[is_train_row].drop(columns=[target, 'id', 'train'])

# SHAP values for those rows, then a force plot for the first one.
explainer = shap.TreeExplainer(estimator)
shap_values = explainer.shap_values(X_train.values)
shap.force_plot(
    explainer.expected_value,
    shap_values[0, :],
    X_train.iloc[0, :],
)
# Column groups; every column of df that is not listed here counts as numeric.
cat_vars = [
    'name', 'brand', 'base_name', 'cpu', 'cpu_details',
    'gpu', 'os', 'os_details', 'screen_surface',
]
dummy_vars = ['touchscreen', 'detachable_keyboard', 'discrete_gpu']
target_vars = ['min_price', 'max_price']
target = 'min_price'
num_vars = [
    column
    for column in df.columns
    if column not in (*cat_vars, *dummy_vars, *target_vars)
]
# The same list objects are passed on to utils.drop_columns below.
variable_lists = [cat_vars, dummy_vars, target_vars, num_vars]

# Work on a copy so the shared df is left untouched for the max_price pass.
df_min_in = utils.imputation(df.copy())
# max_price is dropped in addition to the generally dropped columns.
utils.drop_columns(
    df_min_in, cols_to_be_dropped + ['max_price'], variable_lists
)
utils.decrease_cat_size_handling(df_min_in, decrease_cat_vars, target)
df_min_in = utils.one_hot_encoding(df_min_in, one_hot_cat_vars)
utils.smooth_handling(df_min_in, smooth_cat_vars, target)

# Fresh clone of the first configured estimator, handed to the utils helpers.
estimator = clone(estimators[0])
df_min, mae_min = utils.fit_mae(df_min_in, estimator, target, 'id', 'MIN')
df_comp_min = utils.get_predictions(
    df_min_in, estimator, target, 'id', 'min_price_pred'
)

# max_price
df_max_in = df.copy()