Ejemplo n.º 1
0
    def test_imputation(self):
        """Compare ``utils.imputation`` on a normalized matrix with a dense reference.

        Two scenarios are checked, each by comparing column sums:

        1. One attribute table, default call: the dense reference has every
           NaN replaced by the mean of all non-NaN entries.
        2. Two attribute tables, ``axis=0``: the dense reference has every
           NaN replaced by the non-NaN mean of its own column.
        """
        # Scenario 1: the index arrays come from the fixture attribute
        # ``self.k`` (defined outside this method).
        entity = np.matrix([[1.0, np.nan], [4.0, 3.0], [5.0, 6.0], [8.0, 7.0],
                            [9.0, 1.0]])
        keys = self.k
        attributes = [np.matrix([[np.nan, 2.2], [3.3, 4.4]])]
        # Dense reference: entity columns next to the attribute rows selected
        # by the first index array.
        dense = np.hstack([entity, attributes[0][keys[0]]])
        nan_mask = np.isnan(dense)
        dense[nan_mask] = np.nanmean(dense)
        normalized = nm.NormalizedMatrix(entity, attributes, keys)

        imputed_sums = utils.imputation(normalized).sum(axis=0)
        assert_equal(imputed_sums, dense.sum(axis=0))

        # Scenario 2: two identical attribute tables with explicit index
        # arrays, imputed column by column.
        entity = np.matrix([[1.0, np.nan], [4.0, 3.0], [5.0, 6.0], [8.0, 7.0],
                            [9.0, 1.0]])
        row_index = [0, 1, 1, 0, 1]
        keys = [np.array(row_index), np.array(row_index)]
        attributes = [
            np.matrix([[np.nan, 2.2], [3.3, 4.4]]),
            np.matrix([[np.nan, 2.2], [3.3, 4.4]]),
        ]
        dense = np.hstack(
            [entity] + [table[idx] for table, idx in zip(attributes, keys)])

        # Fill each NaN with the nan-mean of the column it sits in.
        column_means = np.nanmean(dense, axis=0)
        nan_rows, nan_cols = np.where(np.isnan(dense))
        dense[nan_rows, nan_cols] = np.take(column_means, nan_cols)
        normalized = nm.NormalizedMatrix(entity, attributes, keys)
        imputed_sums = utils.imputation(normalized, axis=0).sum(axis=0)
        assert_almost_equal(imputed_sums, dense.sum(axis=0))

##### min_price
# Load the train/test CSVs, remove the other target ('max_price') from the
# training frame, and combine both frames via utils.merge_train_test.
train_min = pd.read_csv("train.csv")
train_min = train_min.drop(columns=['max_price'])
test_min = pd.read_csv("test.csv")
df = utils.merge_train_test(train_min, test_min, 'min_price')

# Column groups; every column not listed explicitly is treated as numeric.
cat_vars = [
    'name', 'brand', 'base_name', 'cpu', 'cpu_details', 'gpu', 'os',
    'os_details', 'screen_surface',
]
dummy_vars = ['touchscreen', 'detachable_keyboard', 'discrete_gpu']
target_vars = ['min_price', 'max_price']
target = 'min_price'
num_vars = [
    col for col in df.columns
    if not (col in cat_vars or col in dummy_vars or col in target_vars)
]
variable_lists = [cat_vars, dummy_vars, target_vars, num_vars]

# Preprocessing: impute, drop selected columns, then apply smooth handling
# to the categorical columns with respect to the target.
df = utils.imputation(df)
utils.drop_columns(df, ['name', 'base_name', 'pixels_y'], variable_lists)
# Alternative categorical handling, currently disabled:
# utils.decrease_cat_size_handling(df, cat_vars, target)
# df = utils.one_hot_encoding(df, cat_vars)
utils.smooth_handling(df, cat_vars, target)

estimator = xgb.XGBRegressor(
    n_estimators=200,
    max_depth=4,
    gamma=0.3,
    colsample_bytree=0.6,
    subsample=1,
    min_child_weight=15,
)

# Fit/predict with the configured estimator and collect the predictions
# under the 'min_price_pred' label.
df_min = utils.fit_predict(df, estimator, target, 'id', 'MIN')
df_complete_predictions = utils.get_predictions(
    df, estimator, target, 'id', 'min_price_pred')

##### max_price
# NOTE(review): this section is labelled max_price and drops 'min_price' from
# the training frame, yet 'min_price' is still passed to merge_train_test and
# set as `target` below -- confirm which target is intended.
train_min = pd.read_csv("train.csv")
train_min.drop(columns=['min_price'], inplace=True)
# Read test.csv once, then run it through the project preprocessing step.
test_min = pd.read_csv("test.csv")
test_min = utils.preprocessing(test_min)
df = utils.merge_train_test(train_min, test_min, 'min_price')


# Column groups; every column not listed explicitly is treated as numeric.
cat_vars = ['name', 'brand', 'base_name', 'cpu',
            'cpu_details', 'gpu', 'os', 'os_details', 'screen_surface']
dummy_vars = ['touchscreen', 'detachable_keyboard', 'discrete_gpu']
target_vars = ['min_price', 'max_price']
target = 'min_price'
num_vars = [col for col in df.columns if col not in cat_vars +
            dummy_vars + target_vars]
variable_lists = [cat_vars, dummy_vars, target_vars, num_vars]


# Preprocessing pipeline. cols_to_be_dropped, decrease_cat_vars,
# one_hot_cat_vars and smooth_cat_vars are expected to be defined earlier in
# the file (not visible in this section).
df = utils.imputation(df)
utils.drop_columns(df, cols_to_be_dropped, variable_lists)
utils.decrease_cat_size_handling(df, decrease_cat_vars, target)
df = utils.one_hot_encoding(df, one_hot_cat_vars)
utils.smooth_handling(df, smooth_cat_vars, target)

# Fit a fresh clone of the first configured estimator on the training split.
estimator = clone(estimators[0])
X_train, y_train, X_test = utils.split_train_test_res(df, target, 'id')
estimator.fit(X_train, y_train)
# Rebuild the training features directly from df (rows flagged train == 1,
# without the target/id/train columns); this replaces the X_train returned by
# split_train_test_res for the explanation step below.
X_train = df[df['train'] == 1].drop(columns=[target, 'id', 'train'])

# Compute SHAP values for the training features with a TreeExplainer.
explainer = shap.TreeExplainer(estimator)
shap_values = explainer.shap_values(X_train.values)

# Force plot for the first training row.
shap.force_plot(explainer.expected_value,
                shap_values[0, :], X_train.iloc[0, :])
Ejemplo n.º 4
0
# min_price: work on a copy so the shared `df` stays untouched.
df_min_in = df.copy()

# Column groups; every column of `df` not listed explicitly is treated as
# numeric.
cat_vars = ['name', 'brand', 'base_name', 'cpu', 'cpu_details', 'gpu', 'os',
            'os_details', 'screen_surface']
dummy_vars = ['touchscreen', 'detachable_keyboard', 'discrete_gpu']
target_vars = ['min_price', 'max_price']
target = 'min_price'
num_vars = [
    col for col in df.columns
    if not (col in cat_vars or col in dummy_vars or col in target_vars)
]
variable_lists = [cat_vars, dummy_vars, target_vars, num_vars]

# Preprocessing pipeline. cols_to_be_dropped, decrease_cat_vars,
# one_hot_cat_vars and smooth_cat_vars come from outside this section.
# 'max_price' is dropped in addition to the configured columns.
df_min_in = utils.imputation(df_min_in)
utils.drop_columns(
    df_min_in,
    cols_to_be_dropped + ['max_price'],
    variable_lists,
)
utils.decrease_cat_size_handling(df_min_in, decrease_cat_vars, target)
df_min_in = utils.one_hot_encoding(df_min_in, one_hot_cat_vars)
utils.smooth_handling(df_min_in, smooth_cat_vars, target)

# Fresh, unfitted copy of the first configured estimator.
estimator = clone(estimators[0])

# utils.fit_mae returns a frame and an MAE value; the predictions are then
# collected under the 'min_price_pred' label.
df_min, mae_min = utils.fit_mae(df_min_in, estimator, target, 'id', 'MIN')
df_comp_min = utils.get_predictions(
    df_min_in, estimator, target, 'id', 'min_price_pred')

# max_price

df_max_in = df.copy()