def load_train_test(nrows=None, silent=True, treat_cat_missing=False,
                    treat_num_missing=False, remove_duplicated_cols=False):
    """Load the train/test sets and derive date, outlier-rate and one-hot features.

    Parameters mirror ``pp.read_train_test`` plus preprocessing switches; the
    extra flags are currently unused in this loader but kept so its signature
    matches the project's other ``load_train_test`` variants.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
    """
    train, test = pp.read_train_test(nrows=nrows)

    # Only the test set is imputed here; fill missing first_active_month
    # with the most frequent month.
    test["first_active_month"].fillna(
        test["first_active_month"].mode().iloc[0], inplace=True)

    train['first_active_month'] = pd.to_datetime(
        train['first_active_month'], format='%Y-%m-%d')
    test['first_active_month'] = pd.to_datetime(
        test['first_active_month'], format='%Y-%m-%d')

    # FIX: take "today" once so train and test share the same reference
    # instant (the original called datetime.today() twice).
    today = datetime.datetime.today()
    for df in (train, test):
        # Simple date-derived features.
        df['elapsed_time'] = (today - df['first_active_month']).dt.days
        df["year"] = df["first_active_month"].dt.year
        df["month"] = df["first_active_month"].dt.month
        df['dayofweek'] = df['first_active_month'].dt.dayofweek
        # FIX: Series.dt.weekofyear was deprecated and removed in pandas 2.0;
        # isocalendar().week is the supported replacement (cast back to int
        # because isocalendar() returns UInt32).
        df['weekofyear'] = df['first_active_month'].dt.isocalendar().week.astype(int)

    # Mean-encode each raw feature by its outlier rate (target < -30),
    # then drop the temporary indicator column.
    train['outliers'] = 0
    train.loc[train['target'] < -30, 'outliers'] = 1
    for f in ['feature_1', 'feature_2', 'feature_3']:
        order_label = train.groupby([f])['outliers'].mean()
        train[f + "_"] = train[f].map(order_label)
        test[f + "_"] = test[f].map(order_label)
    train.drop(['outliers'], axis=1, inplace=True)

    train = pp.hot_encode(train, ["feature_1", "feature_2", "feature_3"])
    test = pp.hot_encode(test, ["feature_1", "feature_2", "feature_3"])
    return train, test
# Model libraries used by this training script.
from xgboost import XGBRegressor
import lightgbm as lgb
from sklearn.svm import SVR, LinearSVR, LinearSVC
from sklearn.linear_model import ElasticNet, Lasso, BayesianRidge, Ridge, SGDRegressor, LassoLars
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
import warnings


def ignore_warn(*args, **kwargs):
    # No-op replacement for warnings.warn (silences everything globally).
    pass


warnings.warn = ignore_warn  # ignore annoying warning (from sklearn and seaborn)

# Build the combined feature matrix: read, drop outliers, then run the
# project's preprocessing steps in order on the concatenated frame.
# NOTE(review): `pp` is presumably the project's preprocessing module,
# imported elsewhere in this file — confirm.
train, test = pp.read_train_test()
train = pp.drop_outliers(train)
all_data = pp.concat_train_test(train.drop(['SalePrice'], axis=1), test)
#ds = ds.drop(['Utilities'], axis=1)
#ds = ds.drop(high_occurance_missing(ds, 0.8), axis=1)
all_data = pp.convert_numeric2category(all_data)
# handle_missing/encode mutate all_data and also report what they touched.
was_missing_columns = pp.handle_missing(all_data)
all_data = pp.encode(all_data)
shrunk_columns = pp.shrink_scales(all_data)
engineered_columns = pp.add_engineered_features(all_data)
simplified_columns = pp.simplify_features(all_data)
# NOTE(review): chunk starts mid-file — the lines below are the tail of a
# hyper-parameter dict whose opening (e.g. "params = {") is outside this view.
    'colsample_bytree': 0.054,
    'colsample_bylevel': 0.50,
    'n_jobs': -1,
    'random_state': 456
}

# Early-stopping configuration passed to the booster's fit().
fit_params = {
    'early_stopping_rounds': 15,
    'eval_metric': 'rmse',
    'verbose': False
}

import matplotlib.pyplot as plt
plt.spy(train)  # visualize the sparsity pattern of the training frame

train, test = pp.read_train_test(train_file='train.csv', test_file='test.csv')
#train_sparse = train.replace(0, np.nan).to_sparse()
#test_sparse = test.replace(0, np.nan).to_sparse()
#train_X_sparse = train_sparse.drop(['ID','target'], axis=1)
#train_y_sparse = (np.log1p(train_sparse.target)).values
ids = list(test.ID)  # preserved for building the submission file later
train_X = train.drop(['ID','target'], axis=1)
# Train on log1p of the target (RMSE metric above is computed in log space).
train_y = (np.log1p(train.target)).values
test_X = test.drop(['ID'], axis=1)
# NOTE(review): chunk ends mid-statement — the Pipeline(...) argument list
# continues beyond this view.
pipe = Pipeline(
def load_train_test(nrows=None, silent=True, treat_cat_missing=False, treat_num_missing=False, remove_duplicated_cols=False):
    """Load and preprocess the Home Credit application train/test tables.

    Steps: drop train rows whose categories never appear in test, collapse
    rare categories, convert anomalous values to NaN, label-encode binary
    object columns, optionally impute missing values, one-hot encode, align
    the two frames column-wise, and add engineered features.

    Parameters
    ----------
    nrows : int or None
        Row limit forwarded to pp.read_train_test.
    silent : bool
        Suppress progress printing when True.
    treat_cat_missing : bool
        Impute NAME_TYPE_SUITE and emit NaN dummy columns in get_dummies.
    treat_num_missing : bool
        Median-impute numeric columns via pp.HandleMissingMedianTransformer.
    remove_duplicated_cols : bool
        Drop columns detected as duplicates in train (same names in test).

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
    """
    train, test = pp.read_train_test(train_file='application_train.csv', test_file='application_test.csv', nrows=nrows)

    # Remove some rows with values not present in test set
    train = train[train['CODE_GENDER'] != 'XNA']
    train = train[train['NAME_INCOME_TYPE'] != 'Maternity leave']
    train = train[train['NAME_FAMILY_STATUS'] != 'Unknown']

    if not silent:
        print("Train samples: {}, Test samples: {}".format(
            len(train), len(test)))

    # Decrease number of categories in ORGANIZATION_TYPE
    _, cat_cols_train = pp.get_feature_groups(train)
    _, cat_cols_test = pp.get_feature_groups(test)

    if not silent:
        print("Decreading the number of categories...")

    # Collapse categories covering < 1% of rows (in either frame) into a
    # single "Other 2" bucket, but only when at least two would be merged.
    for col in cat_cols_train:
        cat_values_table_train = pp.check_categorical_cols_values(train, col=col)
        s_low_values_train = set(cat_values_table_train[
            cat_values_table_train.loc[:, "% of Total"] < 1].index)
        cat_values_table_test = pp.check_categorical_cols_values(test, col=col)
        s_low_values_test = set(cat_values_table_test[
            cat_values_table_test.loc[:, "% of Total"] < 1].index)
        l_union = list(s_low_values_train.union(s_low_values_test))
        if len(l_union) >= 2:
            if not silent:
                print(
                    "Decreasing the number of categories in {}...".format(col))
                print("The following categories will be grouped: {}".format(
                    l_union))
            train.loc[train[col].isin(l_union), col] = "Other 2"
            test.loc[test[col].isin(l_union), col] = "Other 2"

    # Treat the application hour as categorical so it is one-hot encoded.
    train.loc[:, 'HOUR_APPR_PROCESS_START'] = train.loc[:, 'HOUR_APPR_PROCESS_START'].astype(
        'object')
    test.loc[:, 'HOUR_APPR_PROCESS_START'] = test.loc[:, 'HOUR_APPR_PROCESS_START'].astype(
        'object')

    train = treat_anomalies(train, columns=['DAYS_EMPLOYED'])
    test = treat_anomalies(test, columns=['DAYS_EMPLOYED'])

    # Convert implausible values to NaN so later imputation can handle them.
    train.loc[train['OWN_CAR_AGE'] > 80, 'OWN_CAR_AGE'] = np.nan
    train.loc[train['REGION_RATING_CLIENT_W_CITY'] < 0,
              'REGION_RATING_CLIENT_W_CITY'] = np.nan
    train.loc[train['AMT_INCOME_TOTAL'] > 1e8, 'AMT_INCOME_TOTAL'] = np.nan
    train.loc[train['AMT_REQ_CREDIT_BUREAU_QRT'] > 10,
              'AMT_REQ_CREDIT_BUREAU_QRT'] = np.nan
    train.loc[train['OBS_30_CNT_SOCIAL_CIRCLE'] > 40,
              'OBS_30_CNT_SOCIAL_CIRCLE'] = np.nan
    test.loc[test['OWN_CAR_AGE'] > 80, 'OWN_CAR_AGE'] = np.nan
    test.loc[test['REGION_RATING_CLIENT_W_CITY'] < 0,
             'REGION_RATING_CLIENT_W_CITY'] = np.nan
    test.loc[test['AMT_INCOME_TOTAL'] > 1e8, 'AMT_INCOME_TOTAL'] = np.nan
    test.loc[test['AMT_REQ_CREDIT_BUREAU_QRT'] > 10,
             'AMT_REQ_CREDIT_BUREAU_QRT'] = np.nan
    test.loc[test['OBS_30_CNT_SOCIAL_CIRCLE'] > 40,
             'OBS_30_CNT_SOCIAL_CIRCLE'] = np.nan

    # Per-row count of missing fields is itself used as a feature.
    train['COUNT_MISSING'] = train.isnull().sum(axis=1).values
    test['COUNT_MISSING'] = test.isnull().sum(axis=1).values

    # Label-encode object columns with at most 2 distinct values (NaN counted).
    cat_cols = pp.get_dtype_columns(train, [np.dtype(object)])
    cat_cols2encode = [
        c for c in cat_cols if len(train[c].value_counts(dropna=False)) <= 2
    ]
    if not silent:
        print("Label encoding {}".format(cat_cols2encode))
    le = LabelEncoder()
    for col in cat_cols2encode:
        # NOTE(review): fitted on train only — transform(test) raises on
        # categories unseen in train; confirm the data guarantees none exist.
        le.fit(train[col])
        train[col] = le.transform(train[col])
        test[col] = le.transform(test[col])

    # CATEGORICAL MISSING
    #print(pp.check_missing(train[pp.get_categorical_missing_cols(train)]))
    #print(pp.check_missing(test[pp.get_categorical_missing_cols(test)]))
    if (treat_cat_missing):
        if not silent:
            print("Treating categoricals missing...")
        train.NAME_TYPE_SUITE.fillna("Unaccompanied", inplace=True)
        test.NAME_TYPE_SUITE.fillna("Unaccompanied", inplace=True)

    # High density missing categorical columns - deserves a column when performing get_dummies
    # FONDKAPREMONT_MODE, WALLSMATERIAL_MODE, HOUSETYPE_MODE, EMERGENCYSTATE_MODE, OCCUPATION_TYPE
    if not silent:
        print("Creating dummies variables...")
    train = pd.get_dummies(train, dummy_na=treat_cat_missing, dtype='bool')
    test = pd.get_dummies(test, dummy_na=treat_cat_missing, dtype='bool')

    # Keep only columns present in both frames, then restore the target
    # (align with join='inner' would otherwise drop TARGET from train).
    train_labels = train['TARGET']
    train, test = train.align(test, join='inner', axis=1)
    train['TARGET'] = train_labels

    # NUMERICAL MISSING
    #print(pp.check_missing(train[pp.get_numerical_missing_cols(train)]))
    #print(pp.check_missing(test[pp.get_numerical_missing_cols(test)]))
    if (treat_num_missing):
        if not silent:
            print("Treating numericals missing...")
        num_missing_trans = pp.HandleMissingMedianTransformer()
        train = num_missing_trans.fit_transform(train)
        # NOTE(review): fit_transform on test re-fits medians on the test
        # set instead of reusing the train medians — confirm this is intended.
        test = num_missing_trans.fit_transform(test)

    # FEATURE ENGINEERING
    if not silent:
        print("Feature engineering...")
    train = pp.get_domain_knowledge_features(train)
    test = pp.get_domain_knowledge_features(test)

    if remove_duplicated_cols:
        # Duplicates are detected on train only; the same column names are
        # then dropped from both frames.
        duplicated_train = pp.duplicate_columns(train, verbose=not silent, progress=False)
        if not silent:
            print("Removing duplicated columns {}".format(duplicated_train))
        train.drop(list(duplicated_train.keys()), axis=1, inplace=True)
        test.drop(list(duplicated_train.keys()), axis=1, inplace=True)

    return train, test
import preprocessing as pp
from sklearn.preprocessing import Binarizer, LabelEncoder


def make_submission(model, X_train, y_train, X_test, filename='submission.csv'):
    """Fit *model* on the training data and write a Kaggle submission CSV.

    Parameters
    ----------
    model : estimator with fit / predict_proba
    X_train, y_train : training features and target
    X_test : features to score for the submission
    filename : str
        Output CSV path.
    """
    model.fit(X_train, y_train)
    # FIX: score the X_test argument — the original predicted on the
    # module-level global test_X and silently ignored its parameter.
    predicted = model.predict_proba(X_test)[:, 1]
    # NOTE(review): `ids` is a module-level sequence of SK_ID_CURR values
    # defined elsewhere — confirm it matches X_test's row order.
    my_submission = pd.DataFrame({'SK_ID_CURR': ids, 'TARGET': predicted})
    my_submission.to_csv(filename, index=False)


train, test = pp.read_train_test(train_file='application_train.csv',
                                 test_file='application_test.csv')

# Label-encode object columns with at most 3 distinct values (NaN counted).
cat_cols = pp.get_dtype_columns(train, [np.dtype(object)])
cat_cols2encode = [
    c for c in cat_cols if len(train[c].value_counts(dropna=False)) <= 3
]
le = LabelEncoder()
for col in cat_cols2encode:
    le.fit(train[col])
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

# CATEGORICAL MISSING
print(pp.check_missing(train[pp.get_categorical_missing_cols(train)]))
print(pp.check_missing(test[pp.get_categorical_missing_cols(test)]))