def aggregate():
    df = utils.get_dummies(cre)

    # collect the one-hot columns that were generated from the categorical features
    li = []
    for c1 in df.columns:
        for c2 in col_cat:
            if c1.startswith(c2 + '_'):
                li.append(c1)
                break

    cat_aggregations = {}
    for cat in li:
        cat_aggregations[cat] = ['mean', 'sum']

    # aggregate numeric stats and categorical dummy means/sums per SK_ID_CURR
    df_agg = df.groupby(KEY).agg({
        **utils_agg.cre_num_aggregations,
        **cat_aggregations
    })
    # flatten the MultiIndex columns into single names
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    # number of credit card balance records per client
    df_agg['CRE_COUNT'] = df.groupby(KEY).size()
    df_agg.reset_index(inplace=True)

    # attach the aggregates to train / test and dump them as feature files
    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

    return
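# A toy, self-contained illustration of the column-flattening step used in
# aggregate() above. The frame, column names, and values here are made up for
# demonstration only (they are not credit_card_balance data): agg() with a
# dict of column -> list of stats yields a two-level column index, and joining
# each (column, stat) pair with "_" produces the flat feature names.
import pandas as pd

_toy = pd.DataFrame({'SK_ID_CURR': [1, 1, 2],
                     'AMT_BALANCE': [10.0, 30.0, 5.0]})
_toy_agg = _toy.groupby('SK_ID_CURR').agg({'AMT_BALANCE': ['mean', 'sum']})
_toy_agg.columns = pd.Index([e[0] + '_' + e[1]
                             for e in _toy_agg.columns.tolist()])
# _toy_agg.columns.tolist() -> ['AMT_BALANCE_mean', 'AMT_BALANCE_sum']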
def aggregate():
    df = utils.get_dummies(pos)

    # collect the one-hot columns that were generated from the categorical features
    li = []
    for c1 in df.columns:
        for c2 in col_cat:
            if c1.startswith(c2 + '_'):
                li.append(c1)
                break

    cat_aggregations = {}
    for cat in li:
        cat_aggregations[cat] = ['mean', 'sum']

    df_agg = df.groupby(KEY).agg({
        **utils_agg.pos_num_aggregations,
        **cat_aggregations
    })
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    # std / mean
    col_std = [c for c in df_agg.columns if c.endswith('_std')]
    for c in col_std:
        df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]

    # max / min
    col_max = [c for c in df_agg.columns if c.endswith('_max')]
    for c in col_max:
        try:
            df_agg[f'{c}-d-min'] = df_agg[c] / df_agg[c.replace('_max', '_min')]
        except Exception:
            pass

    # number of POS cash balance records per client
    df_agg['POS_COUNT'] = df.groupby(KEY).size()
    df_agg.reset_index(inplace=True)

    # drop near-constant and highly correlated columns before writing features
    utils.remove_feature(df_agg, var_limit=0, corr_limit=0.98,
                         sample_size=19999)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

    return
def aggregate(args):
    print(args)
    k, v, prefix = args

    # restrict the bureau table to rows where column k equals v
    df = utils.get_dummies(bure[bure[k] == v])

    # collect the one-hot columns that were generated from the categorical features
    li = []
    for c1 in df.columns:
        for c2 in col_cat:
            if c1.startswith(c2 + '_'):
                li.append(c1)
                break

    cat_aggregations = {}
    for cat in li:
        cat_aggregations[cat] = ['mean', 'sum']

    df_agg = df.groupby(KEY).agg({
        **utils_agg.bure_num_aggregations,
        **cat_aggregations
    })
    df_agg.columns = pd.Index(
        [prefix + e[0] + "_" + e[1] for e in df_agg.columns.tolist()])

    # std / mean
    col_std = [c for c in df_agg.columns if c.endswith('_std')]
    for c in col_std:
        df_agg[f'{c}-d-mean'] = df_agg[c] / df_agg[c.replace('_std', '_mean')]

    # max / min and max - min
    col_max = [c for c in df_agg.columns if c.endswith('_max')]
    for c in col_max:
        try:
            df_agg[f'{c}-d-min'] = df_agg[c] / df_agg[c.replace('_max', '_min')]
            df_agg[f'{c}-m-min'] = df_agg[c] - df_agg[c.replace('_max', '_min')]
        except Exception:
            pass

    # number of bureau records per client within this subset
    df_agg[f'{prefix}BURE_COUNT'] = df.groupby(KEY).size()
    df_agg.reset_index(inplace=True)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

    return
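# A hedged sketch of how aggregate(args) above is presumably driven: one
# (filter column, filter value, output prefix) tuple per bureau subset, mapped
# over a multiprocessing Pool. The tuples, the pool size, and the assumption
# that bure still holds the raw CREDIT_ACTIVE column are illustrative guesses,
# not the script's actual configuration.
if __name__ == '__main__':
    from multiprocessing import Pool

    argss = [('CREDIT_ACTIVE', 'Active', 'act_'),
             ('CREDIT_ACTIVE', 'Closed', 'cls_')]
    with Pool(len(argss)) as pool:
        pool.map(aggregate, argss)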
def aggregate():
    df = utils.get_dummies(ins)

    df_agg = df.groupby(KEY).agg({**num_aggregations})
    df_agg.columns = pd.Index(
        [e[0] + "_" + e[1] for e in df_agg.columns.tolist()])
    df_agg.reset_index(inplace=True)

    # utils.remove_feature(df_agg, var_limit=0, corr_limit=0.98, sample_size=19999)

    tmp = pd.merge(train, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/train')

    tmp = pd.merge(test, df_agg, on=KEY, how='left').drop(KEY, axis=1)
    utils.to_feature(tmp.add_prefix(PREF), '../feature/test')

    return
           'DAYS_ENDDATE_FACT', 'AMT_CREDIT_MAX_OVERDUE', 'CNT_CREDIT_PROLONG',
           'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT',
           'AMT_CREDIT_SUM_OVERDUE', 'DAYS_CREDIT_UPDATE', 'AMT_ANNUITY']

col_cat = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']

col_group = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']

# =============================================================================
# feature
# =============================================================================
bureau = utils.read_pickles('../data/bureau')
bureau = bureau[bureau['DAYS_CREDIT_ENDDATE'].between(-365, 0)]
bureau = utils.get_dummies(bureau)
bureau.drop('SK_ID_BUREAU', axis=1, inplace=True)

gr = bureau.groupby(KEY)

train = utils.load_train([KEY])
test = utils.load_test([KEY])


def nunique(x):
    return len(set(x))


# =============================================================================
# gr1
from multiprocessing import Pool
import utils
utils.start(__file__)
#==============================================================================

KEY = 'SK_ID_CURR'
PREF = 'prev_102'

# =============================================================================
# feature
# =============================================================================
prev = utils.read_pickles('../data/previous_application')
prev = prev[prev['NAME_CONTRACT_STATUS'] == 'Refused']
prev = utils.get_dummies(prev)
prev.columns = [c.replace('/', '') for c in prev.columns]
prev.drop('SK_ID_PREV', axis=1, inplace=True)

base = prev[[KEY]].drop_duplicates().set_index(KEY)

gr = prev.groupby(KEY)

train = utils.load_train([KEY])
test = utils.load_test([KEY])


def nunique(x):
    return len(set(x))
           'MONTHS_BALANCE', 'CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE',
           'SK_DPD', 'SK_DPD_DEF']

col_cat = ['NAME_CONTRACT_STATUS']

col_group = ['SK_ID_PREV', 'NAME_CONTRACT_STATUS']

# =============================================================================
# feature
# =============================================================================
pos = utils.read_pickles('../data/POS_CASH_balance')
pos = pos[pos['MONTHS_BALANCE'] > -12]
pos = utils.get_dummies(pos)
pos.drop('SK_ID_PREV', axis=1, inplace=True)

base = pos[[KEY]].drop_duplicates().set_index(KEY)

gr = pos.groupby(KEY)

train = utils.load_train([KEY])
test = utils.load_test([KEY])


def nunique(x):
    return len(set(x))
PREF = f'ins_{No}'

col_num = ['NUM_INSTALMENT_VERSION', 'NUM_INSTALMENT_NUMBER',
           'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT',
           'AMT_INSTALMENT', 'AMT_PAYMENT']

col_group = ['SK_ID_PREV', 'NUM_INSTALMENT_VERSION', 'NUM_INSTALMENT_NUMBER']

# =============================================================================
# feature
# =============================================================================
ins = utils.read_pickles('../data/installments_payments')
ins = ins[ins['DAYS_INSTALMENT'].between(-365, 0)]
ins = utils.get_dummies(ins)
ins.drop('SK_ID_PREV', axis=1, inplace=True)

base = ins[[KEY]].drop_duplicates().set_index(KEY)

gr = ins.groupby(KEY)

train = utils.load_train([KEY])
test = utils.load_test([KEY])


def nunique(x):
    return len(set(x))
KEY = 'SK_ID_CURR'
No = '301'
PREF = f'ins_{No}'
NTHREAD = 3

col_num = ['NUM_INSTALMENT_VERSION', 'NUM_INSTALMENT_NUMBER',
           'DAYS_INSTALMENT', 'DAYS_ENTRY_PAYMENT',
           'AMT_INSTALMENT', 'AMT_PAYMENT']

col_group = ['SK_ID_PREV', 'NUM_INSTALMENT_VERSION', 'NUM_INSTALMENT_NUMBER']

# =============================================================================
# feature
# =============================================================================
ins = utils.get_dummies(utils.read_pickles('../data/installments_payments'))
ins.drop('SK_ID_PREV', axis=1, inplace=True)

base = ins[[KEY]].drop_duplicates().set_index(KEY)

gr = ins.groupby(KEY)

train = utils.load_train([KEY])
test = utils.load_test([KEY])


def nunique(x):
    return len(set(x))
#           'AMT_DRAWINGS_CURRENT', 'AMT_DRAWINGS_OTHER_CURRENT',
#           'AMT_DRAWINGS_POS_CURRENT', 'AMT_INST_MIN_REGULARITY',
#           'AMT_PAYMENT_CURRENT', 'AMT_PAYMENT_TOTAL_CURRENT',
#           'AMT_RECEIVABLE_PRINCIPAL', 'AMT_RECIVABLE', 'AMT_TOTAL_RECEIVABLE',
#           'CNT_DRAWINGS_ATM_CURRENT', 'CNT_DRAWINGS_CURRENT',
#           'CNT_DRAWINGS_OTHER_CURRENT', 'CNT_DRAWINGS_POS_CURRENT',
#           'CNT_INSTALMENT_MATURE_CUM', 'SK_DPD', 'SK_DPD_DEF']
#
#col_cat = ['CNT_DRAWINGS_OTHER_CURRENT', 'NAME_CONTRACT_STATUS']
#
#col_group = ['SK_ID_PREV', 'CNT_DRAWINGS_OTHER_CURRENT', 'NAME_CONTRACT_STATUS']

# =============================================================================
# feature
# =============================================================================
cre = utils.get_dummies(utils.read_pickles('../data/credit_card_balance'))
cre.drop('SK_ID_PREV', axis=1, inplace=True)

base = cre[[KEY]].drop_duplicates().set_index(KEY)

gr = cre.groupby(KEY)

train = utils.load_train([KEY])
test = utils.load_test([KEY])


def nunique(x):
    return len(set(x))


# =============================================================================
KEY = 'SK_ID_CURR'
PREF = 'pos_201'
NTHREAD = 2

col_num = ['MONTHS_BALANCE', 'CNT_INSTALMENT', 'CNT_INSTALMENT_FUTURE',
           'SK_DPD', 'SK_DPD_DEF']

col_cat = ['NAME_CONTRACT_STATUS']

col_group = ['SK_ID_PREV', 'NAME_CONTRACT_STATUS']

# =============================================================================
# feature
# =============================================================================
pos = utils.get_dummies(utils.read_pickles('../data/POS_CASH_balance'))
pos.drop('SK_ID_PREV', axis=1, inplace=True)

base = pos[[KEY]].drop_duplicates().set_index(KEY)

gr = pos.groupby(KEY)

train = utils.load_train([KEY])
test = utils.load_test([KEY])


def nunique(x):
    return len(set(x))
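# The nunique helper above is shaped so it can be passed straight to a pandas
# aggregation. A small illustrative use (the MONTHS_BALANCE choice is an
# assumption for demonstration, not necessarily a feature this script builds):
# count how many distinct balance months each client appears with.
_n_months_per_client = gr['MONTHS_BALANCE'].agg(nunique)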
col_num = ['DAYS_CREDIT', 'CREDIT_DAY_OVERDUE', 'DAYS_CREDIT_ENDDATE',
           'DAYS_ENDDATE_FACT', 'AMT_CREDIT_MAX_OVERDUE', 'CNT_CREDIT_PROLONG',
           'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT',
           'AMT_CREDIT_SUM_OVERDUE', 'DAYS_CREDIT_UPDATE', 'AMT_ANNUITY']

col_cat = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']

col_group = ['CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE']

# =============================================================================
# feature
# =============================================================================
bureau = utils.get_dummies(utils.read_pickles('../data/bureau'))
bureau.drop('SK_ID_BUREAU', axis=1, inplace=True)

gr = bureau.groupby(KEY)

train = utils.load_train([KEY])
test = utils.load_test([KEY])


def nunique(x):
    return len(set(x))


# =============================================================================
# gr1
def run_training():
    # train on the GPU when available, otherwise fall back to the CPU
    if torch.cuda.is_available():
        DEVICE = 'cuda'
    else:
        DEVICE = 'cpu'

    df_train = pd.read_csv(PATH + 'train_features.csv')
    targets = pd.read_csv(PATH + 'train_targets_scored.csv')

    # one-hot encode the categorical experiment descriptors
    utils.get_dummies(df_train, ['cp_type', 'cp_dose', 'cp_time'])

    sig_ids = df_train['sig_id']
    df_train.drop('sig_id', axis=1, inplace=True)
    targets.drop('sig_id', axis=1, inplace=True)

    # TODO: use the unscored data for training as well
    X_train, X_val, y_train, y_val = train_test_split(
        df_train.values, targets.values, test_size=0.3, random_state=42)

    train_dataset = utils.ModelDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                              shuffle=True, num_workers=4)

    val_dataset = utils.ModelDataset(X_val, y_val)
    val_loader = DataLoader(val_dataset, batch_size=1)

    model = utils.Model(X_train.shape[1], y_train.shape[1],
                        num_layers, hidden_size)
    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30,
                                                gamma=0.1)

    engine = utils.Engine(model, optimizer, device=DEVICE)

    best_loss = np.inf
    early_stopping = 10
    early_stopping_counter = 0

    # TODO: use optuna for hyperparameter trials
    for epoch in range(EPOCHS):
        train_loss = engine.train(train_loader)
        val_loss = engine.validate(val_loader)
        # StepLR steps on epochs, not on a validation metric
        scheduler.step()
        print(f'Epoch {epoch}, train_loss {train_loss}, val_loss {val_loss}')

        if val_loss < best_loss:
            best_loss = val_loss
            early_stopping_counter = 0
            # checkpoint the best weights so far
            torch.save(model.state_dict(), '/models')
        else:
            early_stopping_counter += 1

        if early_stopping_counter > early_stopping:
            break

    print(f'best loss {best_loss}')
    return best_loss
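# run_training() leans on module-level constants defined elsewhere in the
# script (PATH, BATCH_SIZE, EPOCHS, num_layers, hidden_size). A minimal sketch
# of driving it is below; every concrete value here is an assumption for
# illustration, not a tuned setting from the original code.
if __name__ == '__main__':
    PATH = '../input/lish-moa/'   # assumed location of the competition CSVs
    BATCH_SIZE = 1024             # assumed batch size
    EPOCHS = 100                  # assumed epoch budget
    num_layers = 3                # assumed network depth
    hidden_size = 512             # assumed hidden width
    best = run_training()
    print(f'finished with best validation loss {best:.5f}')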