def fte_sales_channels(train, test, y, db_conn, folds, cache_file):
    cache_key_train = 'fte_sales_channels_train'
    cache_key_test = 'fte_sales_channels_test'

    # Check if the cache file exists and if data for this step is cached
    train_cached, test_cached = load_from_cache(cache_file, cache_key_train,
                                                cache_key_test)
    if train_cached is not None and test_cached is not None:
        logger.info('fte_sales_channels - Cache found, will use cached data')
        train = pd.concat([train, train_cached], axis=1, copy=False)
        test = pd.concat([test, test_cached], axis=1, copy=False)
        return train, test, y, db_conn, folds, cache_file

    logger.info('fte_sales_channels - Cache not found, will recompute from scratch')

    ########################################################
    def _trans(df, table):
        df['p_avg_seller_size'] = encode_average(df, db_conn, table,
                                                 'previous_application',
                                                 'SELLERPLACE_AREA')
        df['p_avg_channel_size'] = encode_average(df, db_conn, table,
                                                  'previous_application',
                                                  'CHANNEL_TYPE')

    _trans(train, "application_train")
    _trans(test, "application_test")
    ########################################################

    columns = ['p_avg_seller_size', 'p_avg_channel_size']

    logger.info(f'Caching features in {cache_file}')
    train_cache = train[columns]
    test_cache = test[columns]
    save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache,
                  test_cache)

    return train, test, y, db_conn, folds, cache_file
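# load_from_cache / save_to_cache are defined elsewhere in the project. The
# sketch below is one plausible implementation, assuming the cache is a single
# shelve file keyed by the per-step cache keys and that a miss returns
# (None, None); the real helpers (and the on-disk format) may differ.
import shelve


def load_from_cache(cache_file, key_train, key_test):
    """Return the cached (train, test) features for a step, or (None, None)."""
    with shelve.open(cache_file) as store:
        return store.get(key_train), store.get(key_test)


def save_to_cache(cache_file, key_train, key_test, train_data, test_data):
    """Persist the features computed by a step under its two cache keys."""
    with shelve.open(cache_file) as store:
        store[key_train] = train_data
        store[key_test] = test_data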
def fte_pos_cash_aggregate(train, test, y, db_conn, folds, cache_file):
    cache_key_train = 'fte_pos_cash_aggregate_train'
    cache_key_test = 'fte_pos_cash_aggregate_test'

    # Check if the cache file exists and if data for this step is cached
    train_cached, test_cached = load_from_cache(cache_file, cache_key_train,
                                                cache_key_test)
    if train_cached is not None and test_cached is not None:
        logger.info('fte_pos_cash_aggregate - Cache found, will use cached data')
        train = pd.concat([train, train_cached], axis=1, copy=False)
        test = pd.concat([test, test_cached], axis=1, copy=False)
        return train, test, y, db_conn, folds, cache_file

    logger.info('fte_pos_cash_aggregate - Cache not found, will recompute from scratch')

    ########################################################
    # SQLite doesn't have a stddev function, so revert to Pandas.
    # Note that the POS_CASH_balance table has ~10M rows.
    pos_cash = pd.read_sql_query(
        'select * FROM POS_CASH_balance ORDER BY SK_ID_CURR ASC;', db_conn)

    # Create the aggregate
    agg_POS_CASH = pos_cash.groupby('SK_ID_CURR').agg(
        dict(
            MONTHS_BALANCE=["sum", "mean", "max", "min", "std"],
            CNT_INSTALMENT=["sum", "mean", "max", "min", "std"],
            CNT_INSTALMENT_FUTURE=["sum", "mean", "max", "min", "std"],
            SK_DPD=["sum", "mean", "max", "std"],  # dropping min
            SK_DPD_DEF=["sum", "mean", "max", "min", "std"],
            SK_ID_CURR='count'))
    agg_POS_CASH.columns = pd.Index(
        [e[0] + "_" + e[1] for e in agg_POS_CASH.columns.tolist()])

    train = train.merge(agg_POS_CASH, left_on='SK_ID_CURR', right_index=True,
                        how='left', copy=False)
    test = test.merge(agg_POS_CASH, left_on='SK_ID_CURR', right_index=True,
                      how='left', copy=False)
    ########################################################

    logger.info(f'Caching features in {cache_file}')
    train_cache = train[agg_POS_CASH.columns]
    test_cache = test[agg_POS_CASH.columns]
    save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache,
                  test_cache)

    return train, test, y, db_conn, folds, cache_file
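# SQLite has no stddev aggregate, which is why fte_pos_cash_aggregate above
# pulls the whole ~10M-row table into pandas. A hedged alternative sketch (not
# what the step above does): compute E[x] and E[x^2] per SK_ID_CURR in SQL and
# finish the standard deviation in pandas. The function name and the single
# column shown are illustrative only.
def _pos_cash_dpd_std_in_sql(db_conn):
    query = """
    select
      SK_ID_CURR,
      avg(SK_DPD)          AS dpd_mean,
      avg(SK_DPD * SK_DPD) AS dpd_mean_sq
    from POS_CASH_balance
    GROUP BY SK_ID_CURR;
    """
    agg = pd.read_sql_query(query, db_conn, index_col='SK_ID_CURR')
    # Population std; note that pandas' .std() defaults to the sample std (ddof=1)
    agg['SK_DPD_std'] = (agg['dpd_mean_sq'] - agg['dpd_mean'] ** 2).clip(lower=0) ** 0.5
    return agg[['SK_DPD_std']]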
def fte_missed_installments(train, test, y, db_conn, folds, cache_file):
    cache_key_train = 'fte_missed_installments_train'
    cache_key_test = 'fte_missed_installments_test'

    # Check if the cache file exists and if data for this step is cached
    train_cached, test_cached = load_from_cache(cache_file, cache_key_train,
                                                cache_key_test)
    if train_cached is not None and test_cached is not None:
        logger.info('fte_missed_installments - Cache found, will use cached data')
        train = pd.concat([train, train_cached], axis=1, copy=False)
        test = pd.concat([test, test_cached], axis=1, copy=False)
        return train, test, y, db_conn, folds, cache_file

    logger.info('fte_missed_installments - Cache not found, will recompute from scratch')

    ########################################################
    query = """
    select
      SK_ID_CURR,
      AMT_INSTALMENT - AMT_PAYMENT AS DIFF_EXPECTED_PMT,
      DAYS_ENTRY_PAYMENT - DAYS_INSTALMENT AS DAYS_LATE
    from installments_payments
    """

    installments_diff = pd.read_sql_query(query, db_conn)

    agg_installments_diff = installments_diff.groupby('SK_ID_CURR').agg(
        ["sum", "mean", "max", "min", "std", "count"])
    agg_installments_diff.columns = pd.Index(
        [e[0] + "_" + e[1] for e in agg_installments_diff.columns.tolist()])

    train = train.merge(agg_installments_diff, left_on='SK_ID_CURR',
                        right_index=True, how='left', copy=False)
    test = test.merge(agg_installments_diff, left_on='SK_ID_CURR',
                      right_index=True, how='left', copy=False)
    ########################################################

    logger.info(f'Caching features in {cache_file}')
    train_cache = train[agg_installments_diff.columns]
    test_cache = test[agg_installments_diff.columns]
    save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache,
                  test_cache)

    return train, test, y, db_conn, folds, cache_file
def fte_app_categoricals(train, test, y, db_conn, folds, cache_file):
    cache_key_train = 'fte_app_categoricals_train'
    cache_key_test = 'fte_app_categoricals_test'

    # Check if the cache file exists and if data for this step is cached
    train_cached, test_cached = load_from_cache(cache_file, cache_key_train,
                                                cache_key_test)
    if train_cached is not None and test_cached is not None:
        logger.info('fte_app_categoricals - Cache found, will use cached data')
        train = pd.concat([train, train_cached], axis=1, copy=False)
        test = pd.concat([test, test_cached], axis=1, copy=False)
        return train, test, y, db_conn, folds, cache_file

    logger.info('fte_app_categoricals - Cache not found, will recompute from scratch')

    ########################################################
    def _trans(df, table):
        df['NAME_CONTRACT_TYPE'] = encode_categoricals(df, db_conn, table,
                                                       'NAME_CONTRACT_TYPE')
        df['NAME_TYPE_SUITE'] = encode_categoricals(df, db_conn, table,
                                                    'NAME_TYPE_SUITE')
        df['OCCUPATION_TYPE'] = encode_categoricals(df, db_conn, table,
                                                    'OCCUPATION_TYPE')
        df['ORGANIZATION_TYPE'] = encode_categoricals(df, db_conn, table,
                                                      'ORGANIZATION_TYPE')
        df['NAME_INCOME_TYPE'] = encode_categoricals(df, db_conn, table,
                                                     'NAME_INCOME_TYPE')
        df['NAME_EDUCATION_TYPE'] = encode_categoricals(df, db_conn, table,
                                                        'NAME_EDUCATION_TYPE')
        df['NAME_FAMILY_STATUS'] = encode_categoricals(df, db_conn, table,
                                                       'NAME_FAMILY_STATUS')
        df['NAME_HOUSING_TYPE'] = encode_categoricals(df, db_conn, table,
                                                      'NAME_HOUSING_TYPE')

    _trans(train, "application_train")
    _trans(test, "application_test")
    ########################################################

    columns = [
        'NAME_CONTRACT_TYPE', 'NAME_TYPE_SUITE', 'OCCUPATION_TYPE',
        'ORGANIZATION_TYPE', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE',
        'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE'
    ]

    logger.info(f'Caching features in {cache_file}')
    train_cache = train[columns]
    test_cache = test[columns]
    save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache,
                  test_cache)

    return train, test, y, db_conn, folds, cache_file
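# encode_categoricals is defined elsewhere. A minimal sketch of one plausible
# implementation, assuming it reads the raw categorical column from the SQLite
# table in SK_ID_CURR order and integer-encodes it. The real helper may instead
# keep a single mapping shared between application_train and application_test,
# which matters when the two tables do not contain exactly the same categories.
def encode_categoricals(df, db_conn, table, column):
    raw = pd.read_sql_query(
        f'select {column} from {table} ORDER BY SK_ID_CURR ASC;', db_conn)
    # sort=True so the integer codes follow a deterministic (alphabetical) order
    codes, _uniques = pd.factorize(raw[column], sort=True)
    return codes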
def tr_sentiment(train, test, y, folds, cache_file):
    print("############# Sentiment Analysis step ################")
    cache_key_train = 'nlp_sentiment_train'
    cache_key_test = 'nlp_sentiment_test'

    # Check if the cache file exists and if data for this step is cached
    dict_train, dict_test = load_from_cache(cache_file, cache_key_train,
                                            cache_key_test)
    if dict_train is not None and dict_test is not None:
        train_out = train.assign(**dict_train)
        test_out = test.assign(**dict_test)
        return train_out, test_out, y, folds, cache_file

    print('# No cache detected, computing from scratch #')

    def _trans(df):
        return {
            'sentiment_polarity':
                df['description'].apply(lambda x: TextBlob(x).sentiment.polarity),
            'sentiment_subjectivity':
                df['description'].apply(lambda x: TextBlob(x).sentiment.subjectivity)
        }

    nlp_sentiment_train = _trans(train)
    nlp_sentiment_test = _trans(test)

    print('Caching features in ' + cache_file)
    save_to_cache(cache_file, cache_key_train, cache_key_test,
                  nlp_sentiment_train, nlp_sentiment_test)

    print('Adding features to dataframe')
    train_out = train.assign(**nlp_sentiment_train)
    test_out = test.assign(**nlp_sentiment_test)
    return train_out, test_out, y, folds, cache_file
def fte_prev_app_process(train, test, y, db_conn, folds, cache_file):
    cache_key_train = 'fte_prev_app_process_train'
    cache_key_test = 'fte_prev_app_process_test'

    # Check if the cache file exists and if data for this step is cached
    train_cached, test_cached = load_from_cache(cache_file, cache_key_train,
                                                cache_key_test)
    if train_cached is not None and test_cached is not None:
        logger.info('fte_prev_app_process - Cache found, will use cached data')
        train = pd.concat([train, train_cached], axis=1, copy=False)
        test = pd.concat([test, test_cached], axis=1, copy=False)
        return train, test, y, db_conn, folds, cache_file

    logger.info('fte_prev_app_process - Cache not found, will recompute from scratch')

    ########################################################
    def _trans(df, table, columns):
        query = f"""
        select
          avg(-p.DAYS_DECISION) / 365.25 AS p_avg_years_decision,
          avg(case(p.NAME_CONTRACT_TYPE)
              when 'Revolving loans' then NULL
              else -p.DAYS_FIRST_DUE
              end) / 365.25 AS p_avg_years_first_due,
          avg(case(p.NAME_CONTRACT_TYPE)
              when 'Revolving loans' then NULL
              else -p.DAYS_LAST_DUE_1ST_VERSION
              end) / 365.25 AS p_avg_years_last_due_1st_version,
          avg(p.HOUR_APPR_PROCESS_START) AS p_avg_hour_start,
          sum(case(p.NAME_CONTRACT_TYPE)
              when 'Revolving loans' then 1
              else 0
              end) AS p_count_revolving_loans
        from {table} app
        left join previous_application p on app.SK_ID_CURR = p.SK_ID_CURR
        GROUP BY app.SK_ID_CURR
        ORDER BY app.SK_ID_CURR ASC;
        """
        df[columns] = pd.read_sql_query(query, db_conn)

    # TODO add currency, otherwise credit is not comparable
    columns = [
        'p_avg_years_decision', 'p_avg_years_first_due',
        'p_avg_years_last_due_1st_version', 'p_avg_hour_start',
        'p_count_revolving_loans'
    ]

    _trans(train, "application_train", columns)
    _trans(test, "application_test", columns)
    ########################################################

    logger.info(f'Caching features in {cache_file}')
    train_cache = train[columns]
    test_cache = test[columns]
    save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache,
                  test_cache)

    return train, test, y, db_conn, folds, cache_file
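# The fte_* steps above and below rely on the pattern
#   df[columns] = pd.read_sql_query(query, db_conn)
# which silently assumes the aggregate query returns exactly one row per
# application, in the same SK_ID_CURR order as the dataframe. A hypothetical
# guard (not part of the original pipeline) that could be called right after
# each read to fail fast on misalignment:
def _assert_one_row_per_application(df, sql_result):
    assert len(sql_result) == len(df), (
        f'expected {len(df)} rows from the aggregate query, '
        f'got {len(sql_result)}')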
def fte_prev_credit_situation(train, test, y, db_conn, folds, cache_file):
    cache_key_train = 'fte_prev_credit_situation_train'
    cache_key_test = 'fte_prev_credit_situation_test'

    # Check if the cache file exists and if data for this step is cached
    train_cached, test_cached = load_from_cache(cache_file, cache_key_train,
                                                cache_key_test)
    if train_cached is not None and test_cached is not None:
        logger.info('fte_prev_credit_situation - Cache found, will use cached data')
        train = pd.concat([train, train_cached], axis=1, copy=False)
        test = pd.concat([test, test_cached], axis=1, copy=False)
        return train, test, y, db_conn, folds, cache_file

    logger.info('fte_prev_credit_situation - Cache not found, will recompute from scratch')

    ########################################################
    def _trans(df, table, columns):
        query = f"""
        select
          IFNULL(sum(case(p.NAME_CONTRACT_TYPE) when 'Consumer loans' then 1 else 0 end), 0) AS p_total_consumer_loans,
          IFNULL(sum(case(p.NAME_CONTRACT_TYPE) when 'Cash loans' then 1 else 0 end), 0) AS p_total_cash_loans,
          IFNULL(sum(case(p.NAME_CONTRACT_STATUS) when 'Refused' then 1 else 0 end), 0) AS p_total_application_refused,
          IFNULL(sum(case(p.NAME_CONTRACT_STATUS) when 'Canceled' then 1 else 0 end), 0) AS p_total_application_canceled,
          IFNULL(sum(case(p.NAME_CONTRACT_STATUS) when 'Approved' then 1 else 0 end), 0) AS p_total_application_approved,
          -- IFNULL(sum(case(p.NAME_CONTRACT_STATUS) when 'Unused offer' then 1 else 0 end), 0) AS p_total_application_unused,
          IFNULL(avg(case(p.NAME_CONTRACT_STATUS) when 'Refused' then p.AMT_APPLICATION else 0 end), 0) AS p_avg_application_refused,
          -- IFNULL(avg(case(p.NAME_CONTRACT_STATUS) when 'Canceled' then p.AMT_APPLICATION else 0 end), 0) AS p_avg_application_canceled,
          IFNULL(avg(case(p.NAME_CONTRACT_STATUS) when 'Approved' then p.AMT_APPLICATION else 0 end), 0) AS p_avg_application_approved,
          IFNULL(avg(case(p.NAME_CONTRACT_STATUS) when 'Unused offer' then p.AMT_APPLICATION else 0 end), 0) AS p_avg_application_unused,
          avg(case(p.NAME_CONTRACT_STATUS)
              when 'Approved' then 1.0
              when 'Unused offer' then 1.0
              else 0.0
              end) AS p_ratio_app_approved_total,
          IFNULL(avg(p.AMT_APPLICATION - p.AMT_CREDIT), 0) AS p_avg_diff_asked_offered,
          IFNULL(avg(p.CNT_PAYMENT), 0) AS p_avg_payment_schedule,
          IFNULL(avg(p.AMT_ANNUITY), 0) AS p_avg_annuity,
          IFNULL(avg(p.CNT_PAYMENT), 0) / app.AMT_ANNUITY AS p_ratio_annuity_p_app,
          avg(p.AMT_DOWN_PAYMENT) AS p_avg_down_payment
        from {table} app
        left join previous_application p on app.SK_ID_CURR = p.SK_ID_CURR
        GROUP BY app.SK_ID_CURR
        ORDER BY app.SK_ID_CURR ASC;
        """
        df[columns] = pd.read_sql_query(query, db_conn)

    # TODO add currency, otherwise credit is not comparable
    columns = [
        'p_total_consumer_loans',
        'p_total_cash_loans',
        'p_total_application_refused',
        'p_total_application_canceled',
        'p_total_application_approved',
        # 'p_total_application_unused',
        'p_avg_application_refused',
        # 'p_avg_application_canceled',
        'p_avg_application_approved',
        'p_avg_application_unused',
        'p_ratio_app_approved_total',
        'p_avg_diff_asked_offered',
        'p_avg_payment_schedule',
        'p_avg_annuity',
        'p_ratio_annuity_p_app',
        'p_avg_down_payment'
    ]

    _trans(train, "application_train", columns)
    _trans(test, "application_test", columns)
    ########################################################

    logger.info(f'Caching features in {cache_file}')
    train_cache = train[columns]
    test_cache = test[columns]
    save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache,
                  test_cache)

    return train, test, y, db_conn, folds, cache_file
def tr_tfidf_lsa_lgb(train, test, y, folds, cache_file):
    print("############# TF-IDF + LSA step ################")
    cache_key_train = 'tfidf_lsa_lgb_train'
    cache_key_test = 'tfidf_lsa_lgb_test'

    # Check if the cache file exists and if data for this step is cached
    dict_train, dict_test = load_from_cache(cache_file, cache_key_train,
                                            cache_key_test)
    if dict_train is not None and dict_test is not None:
        train_out = train.assign(**dict_train)
        test_out = test.assign(**dict_test)
        return train_out, test_out, y, folds, cache_file

    print('# No cache detected, computing from scratch #')

    vectorizer = TfidfVectorizer(max_features=2**16,
                                 min_df=2,
                                 stop_words='english',
                                 use_idf=True)

    train_raw, test_raw = _clean_desc(train, test)
    train_vect = vectorizer.fit_transform(train_raw['CleanDesc'])
    test_vect = vectorizer.transform(test_raw['CleanDesc'])
    # print(vectorizer.get_feature_names())

    svd = TruncatedSVD(100)  # Recommended 100 dimensions for LSA
    lsa = make_pipeline(
        svd,
        # Normalizer(copy=False)  # Not needed for tree ensembles and leaky on CV
    )

    # Run SVD on the training data, then project the training data.
    X_train_lsa = lsa.fit_transform(train_vect)
    X_test_lsa = lsa.transform(test_vect)

    # explained_variance = svd.explained_variance_ratio_.sum()
    # print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

    le = LabelEncoder()
    y_encode = le.fit_transform(y)

    # Separate train in train + validation data
    X_train, X_val, y_train, y_val = train_test_split(X_train_lsa,
                                                      y_encode,
                                                      test_size=0.2,
                                                      random_state=42)

    # train
    gbm = LGBMClassifier(n_estimators=2048,
                         seed=42,
                         objective='multiclass',
                         colsample_bytree=0.8,
                         subsample=0.8)

    # Predict out-of-folds train data
    print('Start training - Number of folds: ', len(folds))
    train_predictions = out_of_fold_predict(gbm, X_train_lsa, y_encode, folds)
    tfidf_train_names = {
        'tfidf_' + le.classes_[0]: [row[0] for row in train_predictions],
        'tfidf_' + le.classes_[1]: [row[1] for row in train_predictions],
        'tfidf_' + le.classes_[2]: [row[2] for row in train_predictions]
    }

    gbm.fit(X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='multi_logloss',
            early_stopping_rounds=50,
            verbose=False)

    # Now validate the predicted values using the previously split validation set
    print('Start validating TF-IDF + LSA...')
    # predict
    y_pred = gbm.predict_proba(X_val, num_iteration=gbm.best_iteration)
    # eval
    print('We stopped at boosting round: ', gbm.best_iteration)
    print('The mlogloss of prediction is:', mlogloss(y_val, y_pred))

    # Now compute the value for the actual test data using out-of-folds predictions
    print('Start predicting TF-IDF + LSA...')
    test_predictions = gbm.predict_proba(X_test_lsa,
                                         num_iteration=gbm.best_iteration)
    tfidf_test_names = {
        'tfidf_' + le.classes_[0]: [row[0] for row in test_predictions],
        'tfidf_' + le.classes_[1]: [row[1] for row in test_predictions],
        'tfidf_' + le.classes_[2]: [row[2] for row in test_predictions]
    }

    print('Caching features in ' + cache_file)
    save_to_cache(cache_file, cache_key_train, cache_key_test,
                  tfidf_train_names, tfidf_test_names)

    print('Adding features to dataframe')
    train_out = train.assign(**tfidf_train_names)
    test_out = test.assign(**tfidf_test_names)
    return train_out, test_out, y, folds, cache_file
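# out_of_fold_predict is defined elsewhere. A minimal sketch of the usual
# out-of-fold scheme it appears to implement, assuming `folds` is a list of
# (train_idx, valid_idx) index pairs that together cover the whole training set:
import numpy as np
from sklearn.base import clone


def out_of_fold_predict(model, X, y, folds):
    """For each fold, fit a fresh copy of `model` on the other folds and
    predict_proba on the held-out rows, so every training row gets a
    prediction from a model that never saw it."""
    oof = np.zeros((X.shape[0], len(np.unique(y))))
    for train_idx, valid_idx in folds:
        fold_model = clone(model)
        fold_model.fit(X[train_idx], y[train_idx])
        oof[valid_idx] = fold_model.predict_proba(X[valid_idx])
    return oof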
def fte_bureau_credit_situation(train, test, y, db_conn, folds, cache_file):
    cache_key_train = 'fte_bureau_credit_situation_train'
    cache_key_test = 'fte_bureau_credit_situation_test'

    # Check if the cache file exists and if data for this step is cached
    train_cached, test_cached = load_from_cache(cache_file, cache_key_train,
                                                cache_key_test)
    if train_cached is not None and test_cached is not None:
        logger.info('fte_bureau_credit_situation - Cache found, will use cached data')
        train = pd.concat([train, train_cached], axis=1, copy=False)
        test = pd.concat([test, test_cached], axis=1, copy=False)
        return train, test, y, db_conn, folds, cache_file

    logger.info('fte_bureau_credit_situation - Cache not found, will recompute from scratch')

    ########################################################
    def _trans(df, table, columns):
        query = f"""
        select
          IFNULL(count(b.SK_ID_BUREAU), 0) AS b_total_prev_applications,
          IFNULL(sum(case CREDIT_ACTIVE when 'Active' then 1 else 0 end), 0) AS b_current_active_credit,
          IFNULL(sum(case CREDIT_ACTIVE when 'Sold' then 1 else 0 end), 0) AS b_sold_credit,
          IFNULL(sum(case CREDIT_ACTIVE when 'Bad debt' then 1 else 0 end), 0) AS b_bad_debt_credit,
          IFNULL(avg(case CREDIT_ACTIVE when 'Active' then 1 else 0 end), 0) AS b_active_applications_ratio,
          IFNULL(count(distinct CREDIT_TYPE), 0) AS b_nb_distinct_credit_type,
          IFNULL(sum(AMT_CREDIT_SUM), 0) AS b_total_prev_credit,
          IFNULL(sum(case CREDIT_ACTIVE when 'Active' then AMT_CREDIT_SUM else 0 end), 0) AS b_active_credit_amount,
          IFNULL(sum(AMT_CREDIT_SUM_DEBT), 0) AS b_current_debt,
          IFNULL(sum(AMT_CREDIT_MAX_OVERDUE), 0) AS b_current_overdue,
          IFNULL(max(-DAYS_CREDIT), 99 * 365.25) / 365.25 AS b_first_credit_years_ago,
          IFNULL(min(-DAYS_CREDIT), 99 * 365.25) / 365.25 AS b_last_credit_years_ago,
          IFNULL(max(DAYS_CREDIT_ENDDATE), -99 * 365.25) / 365.25 AS b_existing_credit_close_date,
          IFNULL(max(-DAYS_ENDDATE_FACT), 99 * 365.25) / 365.25 AS b_years_since_no_card_credit,
          -- IFNULL(min(-DAYS_CREDIT_UPDATE), 99 * 365.25) AS b_last_DAYS_CREDIT_UPDATE,
          case CREDIT_CURRENCY
            when 'currency_1' then 0
            when 'currency_2' then 1
            when 'currency_3' then 2
            else 3
            end b_currency,
          IFNULL(sum(CNT_CREDIT_PROLONG), 0) AS b_credit_days_extension
        from {table} app
        left join bureau b on app.SK_ID_CURR = b.SK_ID_CURR
        GROUP BY app.SK_ID_CURR
        ORDER BY app.SK_ID_CURR ASC;
        """
        df[columns] = pd.read_sql_query(query, db_conn)

    # TODO add currency, otherwise credit is not comparable
    columns = [
        'b_total_prev_applications',
        'b_current_active_credit',
        'b_sold_credit',
        'b_bad_debt_credit',
        'b_active_applications_ratio',
        'b_nb_distinct_credit_type',
        'b_total_prev_credit',
        'b_active_credit_amount',
        'b_current_debt',
        'b_current_overdue',
        'b_first_credit_years_ago',
        'b_last_credit_years_ago',
        'b_existing_credit_close_date',
        'b_years_since_no_card_credit',
        # 'b_last_DAYS_CREDIT_UPDATE',
        'b_currency',
        'b_credit_days_extension'
    ]

    _trans(train, "application_train", columns)
    _trans(test, "application_test", columns)
    ########################################################

    logger.info(f'Caching features in {cache_file}')
    train_cache = train[columns]
    test_cache = test[columns]
    save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache,
                  test_cache)

    return train, test, y, db_conn, folds, cache_file
def tr_managerskill(train, test, y, folds, cache_file):
    print("\n\n############# Manager skill step ################")
    cache_key_train = 'managerskill_train'
    cache_key_test = 'managerskill_test'

    # Check if the cache file exists and if data for this step is cached
    dict_train, dict_test = load_from_cache(cache_file, cache_key_train,
                                            cache_key_test)
    if dict_train is not None and dict_test is not None:
        train_out = train.assign(**dict_train)
        test_out = test.assign(**dict_test)
        return train_out, test_out, y, folds, cache_file

    print('# No cache detected, computing from scratch #')

    lb = LabelBinarizer(sparse_output=True)
    lb.fit(list(train['manager_id'].values) + list(test['manager_id'].values))
    X_train_mngr = lb.transform(train['manager_id']).astype(np.float32)
    X_test_mngr = lb.transform(test['manager_id']).astype(np.float32)

    le = LabelEncoder()
    y_encode = le.fit_transform(y)

    # Separate train in train + validation data
    X_train, X_val, y_train, y_val = train_test_split(X_train_mngr,
                                                      y_encode,
                                                      test_size=0.2,
                                                      random_state=42)

    # train
    gbm = LGBMClassifier(n_estimators=2048,
                         seed=42,
                         objective='multiclass',
                         colsample_bytree=0.8,
                         subsample=0.8)

    # Predict out-of-folds train data
    print('Start training - Number of folds: ', len(folds))
    train_predictions = out_of_fold_predict(gbm, X_train_mngr, y_encode, folds)
    mngr_train_names = {
        'mngr_' + le.classes_[0]: [row[0] for row in train_predictions],
        'mngr_' + le.classes_[1]: [row[1] for row in train_predictions],
        'mngr_' + le.classes_[2]: [row[2] for row in train_predictions],
    }
    mngr_train_names['mngr_skill'] = [
        2 * h + m for (h, m) in zip(mngr_train_names['mngr_high'],
                                    mngr_train_names['mngr_medium'])
    ]

    gbm.fit(X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric='multi_logloss',
            early_stopping_rounds=50,
            verbose=False)

    # Now validate the predicted values using the previously split validation set
    print('Start validating Manager skill...')
    # predict
    y_pred = gbm.predict_proba(X_val, num_iteration=gbm.best_iteration)
    # eval
    print('We stopped at boosting round: ', gbm.best_iteration)
    print('The mlogloss of prediction is:', mlogloss(y_val, y_pred))

    # Now compute the value for the actual test data using out-of-folds predictions
    print('Start predicting Manager skill...')
    test_predictions = gbm.predict_proba(X_test_mngr,
                                         num_iteration=gbm.best_iteration)
    mngr_test_names = {
        'mngr_' + le.classes_[0]: [row[0] for row in test_predictions],
        'mngr_' + le.classes_[1]: [row[1] for row in test_predictions],
        'mngr_' + le.classes_[2]: [row[2] for row in test_predictions]
    }
    mngr_test_names['mngr_skill'] = [
        2 * h + m for (h, m) in zip(mngr_test_names['mngr_high'],
                                    mngr_test_names['mngr_medium'])
    ]

    print('Caching features in ' + cache_file)
    save_to_cache(cache_file, cache_key_train, cache_key_test,
                  mngr_train_names, mngr_test_names)

    print('Adding features to dataframe')
    train_out = train.assign(**mngr_train_names)
    test_out = test.assign(**mngr_test_names)
    return train_out, test_out, y, folds, cache_file
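# mlogloss is defined elsewhere; a minimal sketch assuming it is the standard
# multi-class logarithmic loss over predicted class probabilities, delegating
# to scikit-learn:
from sklearn.metrics import log_loss


def mlogloss(y_true, y_proba):
    """Multi-class log loss of predicted probabilities against integer labels."""
    return log_loss(y_true, y_proba)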
def fte_application(train, test, y, db_conn, folds, cache_file):
    cache_key_train = 'fte_application_train'
    cache_key_test = 'fte_application_test'

    # Check if the cache file exists and if data for this step is cached
    train_cached, test_cached = load_from_cache(cache_file, cache_key_train,
                                                cache_key_test)
    if train_cached is not None and test_cached is not None:
        logger.info('fte_application - Cache found, will use cached data')
        train = pd.concat([train, train_cached], axis=1, copy=False)
        test = pd.concat([test, test_cached], axis=1, copy=False)
        return train, test, y, db_conn, folds, cache_file

    logger.info('fte_application - Cache not found, will recompute from scratch')

    ########################################################
    def _trans(df, table, columns):
        query = f"""
        select
          OBS_30_CNT_SOCIAL_CIRCLE,
          DEF_30_CNT_SOCIAL_CIRCLE,
          OBS_60_CNT_SOCIAL_CIRCLE,
          DEF_60_CNT_SOCIAL_CIRCLE,
          REGION_POPULATION_RELATIVE,
          --REGION_RATING_CLIENT,
          REGION_RATING_CLIENT_W_CITY,
          REG_CITY_NOT_LIVE_CITY,
          --REG_CITY_NOT_WORK_CITY,
          --REG_REGION_NOT_LIVE_REGION
          --REG_REGION_NOT_WORK_REGION,
          --LIVE_REGION_NOT_WORK_REGION,
          CNT_CHILDREN,
          CNT_FAM_MEMBERS,
          case(CODE_GENDER) when "F" THEN 1 else 0 end isWoman,
          -- case(FLAG_OWN_CAR) when "Y" THEN 1 else 0 end ownCar,
          case(FLAG_OWN_REALTY) when "Y" THEN 1 else 0 end ownRealEstate,
          FLAG_DOCUMENT_3,
          FLOORSMAX_AVG,
          FLOORSMIN_AVG,
          YEARS_BUILD_AVG,
          LIVINGAREA_MODE,
          YEARS_BEGINEXPLUATATION_MODE,
          LANDAREA_MODE,
          BASEMENTAREA_MODE,
          TOTALAREA_MODE,
          APARTMENTS_AVG,
          COMMONAREA_AVG,
          NONLIVINGAREA_MEDI,
          --LIVINGAPARTMENTS_MODE,
          --YEARS_BUILD_MODE,
          --ENTRANCES_AVG,
          --AMT_REQ_CREDIT_BUREAU_DAY,
          --AMT_REQ_CREDIT_BUREAU_HOUR,
          AMT_REQ_CREDIT_BUREAU_MON,
          AMT_REQ_CREDIT_BUREAU_QRT,
          --AMT_REQ_CREDIT_BUREAU_WEEK,
          AMT_REQ_CREDIT_BUREAU_YEAR,
          FLAG_MOBIL,
          --FLAG_EMP_PHONE,
          FLAG_WORK_PHONE,
          --FLAG_CONT_MOBILE,
          --FLAG_PHONE,
          --FLAG_EMAIL,
          EXT_SOURCE_1,
          EXT_SOURCE_2,
          EXT_SOURCE_3
        from {table}
        ORDER BY SK_ID_CURR ASC
        """
        df[columns] = pd.read_sql_query(query, db_conn)

    columns = [
        'OBS_30_CNT_SOCIAL_CIRCLE',
        'DEF_30_CNT_SOCIAL_CIRCLE',
        'OBS_60_CNT_SOCIAL_CIRCLE',
        'DEF_60_CNT_SOCIAL_CIRCLE',
        'REGION_POPULATION_RELATIVE',
        # 'REGION_RATING_CLIENT',
        'REGION_RATING_CLIENT_W_CITY',
        'REG_CITY_NOT_LIVE_CITY',
        # 'REG_CITY_NOT_WORK_CITY',
        # 'REG_REGION_NOT_LIVE_REGION',
        # 'REG_REGION_NOT_WORK_REGION',
        # 'LIVE_REGION_NOT_WORK_REGION',
        'CNT_CHILDREN',
        'CNT_FAM_MEMBERS',
        'isWoman',
        # 'FLAG_OWN_CAR',
        'FLAG_OWN_REALTY',
        'FLAG_DOCUMENT_3',
        'FLOORSMAX_AVG',
        'FLOORSMIN_AVG',
        'YEARS_BUILD_AVG',
        'LIVINGAREA_MODE',
        'YEARS_BEGINEXPLUATATION_MODE',
        'LANDAREA_MODE',
        'BASEMENTAREA_MODE',
        'TOTALAREA_MODE',
        'APARTMENTS_AVG',
        'COMMONAREA_AVG',
        'NONLIVINGAREA_MEDI',
        # 'LIVINGAPARTMENTS_MODE',
        # 'YEARS_BUILD_MODE',
        # 'ENTRANCES_AVG',
        # 'AMT_REQ_CREDIT_BUREAU_DAY',
        # 'AMT_REQ_CREDIT_BUREAU_HOUR',
        'AMT_REQ_CREDIT_BUREAU_MON',
        'AMT_REQ_CREDIT_BUREAU_QRT',
        # 'AMT_REQ_CREDIT_BUREAU_WEEK',
        'AMT_REQ_CREDIT_BUREAU_YEAR',
        'FLAG_MOBIL',
        # 'FLAG_EMP_PHONE',
        'FLAG_WORK_PHONE',
        # 'FLAG_CONT_MOBILE',
        # 'FLAG_PHONE',
        # 'FLAG_EMAIL',
        'EXT_SOURCE_1',
        'EXT_SOURCE_2',
        'EXT_SOURCE_3'
    ]

    _trans(train, "application_train", columns)
    _trans(test, "application_test", columns)
    ########################################################

    logger.info(f'Caching features in {cache_file}')
    train_cache = train[columns]
    test_cache = test[columns]
    save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache,
                  test_cache)

    return train, test, y, db_conn, folds, cache_file
def fte_withdrawals(train, test, y, db_conn, folds, cache_file):
    cache_key_train = 'fte_withdrawals_train'
    cache_key_test = 'fte_withdrawals_test'

    # Check if the cache file exists and if data for this step is cached
    train_cached, test_cached = load_from_cache(cache_file, cache_key_train,
                                                cache_key_test)
    if train_cached is not None and test_cached is not None:
        logger.info('fte_withdrawals - Cache found, will use cached data')
        train = pd.concat([train, train_cached], axis=1, copy=False)
        test = pd.concat([test, test_cached], axis=1, copy=False)
        return train, test, y, db_conn, folds, cache_file

    logger.info('fte_withdrawals - Cache not found, will recompute from scratch')

    ########################################################
    def _trans(df, table, columns):
        # Group and order by app.SK_ID_CURR so the result lines up row-for-row
        # with the application dataframe
        query = f"""
        with instalments_per_app AS (
          select
            SK_ID_CURR,
            max(CNT_INSTALMENT_MATURE_CUM) AS nb_installments
          from credit_card_balance
          GROUP BY SK_ID_CURR, SK_ID_PREV
          ORDER BY SK_ID_CURR
        )
        select
          count(distinct SK_ID_PREV) as cb_nb_of_credit_cards,
          avg(CNT_DRAWINGS_ATM_CURRENT) as cb_avg_atm_withdrawal_count,
          avg(AMT_DRAWINGS_ATM_CURRENT) as cb_avg_atm_withdrawal_amount,
          sum(AMT_DRAWINGS_ATM_CURRENT) as cb_sum_atm_withdrawal_amount,
          avg(CNT_DRAWINGS_CURRENT) as cb_avg_withdrawal_count,
          avg(AMT_DRAWINGS_CURRENT) as cb_avg_withdrawal_amount,
          avg(CNT_DRAWINGS_POS_CURRENT) as cb_avg_pos_withdrawal_count,
          avg(AMT_DRAWINGS_POS_CURRENT) as cb_avg_pos_withdrawal_amount,
          avg(SK_DPD) as cb_avg_day_past_due,
          avg(SK_DPD_DEF) as cb_avg_day_past_due_tolerated
          --sum(nb_installments) as instalments_per_app
        FROM {table} app
        left join credit_card_balance ccb on app.SK_ID_CURR = ccb.SK_ID_CURR
        left join instalments_per_app ipa on ipa.SK_ID_CURR = ccb.SK_ID_CURR
        --where
        --  MONTHS_BALANCE BETWEEN -3 and -1
        GROUP BY app.SK_ID_CURR
        ORDER BY app.SK_ID_CURR ASC
        """
        df[columns] = pd.read_sql_query(query, db_conn)

    columns = [
        'cb_nb_of_credit_cards',
        'cb_avg_atm_withdrawal_count',
        'cb_avg_atm_withdrawal_amount',
        'cb_sum_atm_withdrawal_amount',
        'cb_avg_withdrawal_count',
        'cb_avg_withdrawal_amount',
        'cb_avg_pos_withdrawal_count',
        'cb_avg_pos_withdrawal_amount',
        'cb_avg_day_past_due',
        'cb_avg_day_past_due_tolerated'
        # 'instalments_per_app'
    ]

    _trans(train, "application_train", columns)
    _trans(test, "application_test", columns)
    ########################################################

    logger.info(f'Caching features in {cache_file}')
    train_cache = train[columns]
    test_cache = test[columns]
    save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache,
                  test_cache)

    return train, test, y, db_conn, folds, cache_file
def fte_pos_cash_current_status(train, test, y, db_conn, folds, cache_file):
    ## Count the still active/completed/amortized debt credits
    cache_key_train = 'fte_pos_cash_current_status_train'
    cache_key_test = 'fte_pos_cash_current_status_test'

    # Check if the cache file exists and if data for this step is cached
    train_cached, test_cached = load_from_cache(cache_file, cache_key_train,
                                                cache_key_test)
    if train_cached is not None and test_cached is not None:
        logger.info('fte_pos_cash_current_status - Cache found, will use cached data')
        train = pd.concat([train, train_cached], axis=1, copy=False)
        test = pd.concat([test, test_cached], axis=1, copy=False)
        return train, test, y, db_conn, folds, cache_file

    logger.info('fte_pos_cash_current_status - Cache not found, will recompute from scratch')

    ########################################################
    # In SQLite we avoid joining on temporary tables/subqueries/WITH statements
    # as they are not indexed and super slow. An ORDER BY on the following
    # result is very slow too ...
    # (~>1 min vs 20 ms for just dumping the result on an i5-5257U)
    query = """
    SELECT
      SK_ID_CURR,
      -- SK_ID_PREV,
      NAME_CONTRACT_STATUS AS pos_cash_NAME_CONTRACT_STATUS
    FROM POS_CASH_balance
    GROUP BY SK_ID_PREV
    HAVING MONTHS_BALANCE = max(MONTHS_BALANCE)
    """

    pos_cash_current = pd.read_sql_query(query, db_conn)

    # Pivot
    pos_cash_current = pd.get_dummies(
        pos_cash_current,
        columns=['pos_cash_NAME_CONTRACT_STATUS']).groupby('SK_ID_CURR').sum()

    # TODO: add a proper feature selection phase
    pos_cash_current = pos_cash_current[[
        'pos_cash_NAME_CONTRACT_STATUS_Active',
        'pos_cash_NAME_CONTRACT_STATUS_Completed'
    ]]

    train = train.merge(pos_cash_current, left_on='SK_ID_CURR',
                        right_index=True, how='left', copy=False)
    test = test.merge(pos_cash_current, left_on='SK_ID_CURR',
                      right_index=True, how='left', copy=False)
    ########################################################

    logger.info(f'Caching features in {cache_file}')
    train_cache = train[pos_cash_current.columns]
    test_cache = test[pos_cash_current.columns]
    save_to_cache(cache_file, cache_key_train, cache_key_test, train_cache,
                  test_cache)

    return train, test, y, db_conn, folds, cache_file
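# The GROUP BY SK_ID_PREV ... HAVING MONTHS_BALANCE = max(MONTHS_BALANCE)
# pattern above is a SQLite idiom for keeping, per SK_ID_PREV, the row with the
# most recent MONTHS_BALANCE. A hedged pandas equivalent that can serve as a
# cross-check on a sample (the helper name is illustrative, not part of the
# pipeline):
def _latest_pos_cash_status_pandas(db_conn):
    pos_cash = pd.read_sql_query(
        'select SK_ID_CURR, SK_ID_PREV, MONTHS_BALANCE, NAME_CONTRACT_STATUS '
        'FROM POS_CASH_balance;', db_conn)
    # Sort ascending so tail(1) keeps the row with the max MONTHS_BALANCE per loan
    latest = pos_cash.sort_values('MONTHS_BALANCE').groupby('SK_ID_PREV').tail(1)
    return latest[['SK_ID_CURR', 'NAME_CONTRACT_STATUS']]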