def __init__(self, tc=None, sample_df=None):
    """Prepare a genetic-algorithm run: checkpoints, data, genes, population.

    Args:
        tc: Training configuration. When ``None`` (the default) a fresh
            ``TrainConfig()`` is created for this instance.
        sample_df: Optional frame used instead of reading the ``all_data``
            CSV for ``tc.version``.

    Raises:
        AssertionError: if the seed chromosome or the population does not
            have the configured size.
        FileExistsError: if the per-run checkpoint folder already exists
            (``os.makedirs`` is called without ``exist_ok``).
    """
    # The config is mutated below (``ckpt_dir``), so it must not be a single
    # object created at definition time and shared by every instance.
    self.tc = TrainConfig() if tc is None else tc
    self.gam = self.tc.ga_model

    ckpt_root = get_file('ga_ckpt', v=self.tc.version)
    os.makedirs(ckpt_root, exist_ok=True)
    self.tc.ckpt_dir = self.ckpt_dir = os.path.join(ckpt_root, folder_date())
    os.makedirs(self.ckpt_dir)
    print('Logging to {}'.format(self.ckpt_dir))
    # Filled in later with ``.format(gen=..., score=...)``.
    self.ckpt_temp = os.path.join(self.ckpt_dir, '{gen}_{score}.pk')
    self.best_score = 0
    self.best_gen = 0
    self.workers = {}

    if sample_df is not None:
        self.odf = sample_df
    else:
        self.odf = pd.read_csv(get_file('all_data', self.tc.version))
    has_target = self.odf['TARGET'].notnull()
    self.train_ix = self.odf[has_target].index
    self.test_ix = self.odf[~has_target].index

    print('Generating genes')
    original_genes = self.odf.drop(['SK_ID_CURR', 'TARGET'],
                                   axis=1).columns.tolist()
    ogs = [Gene(name, df=self.odf) for name in original_genes]
    # Combine every original gene with each original gene that follows it.
    new_genes = []
    for ix, g in enumerate(ogs[:-1]):
        for og in ogs[ix + 1:]:
            new_genes.extend(g.apply_ops(og))
    # Build a new list instead of extending ``ogs`` in place: ``ogs`` must
    # keep holding only the original genes for the seed chromosome below.
    self.genes = ogs + new_genes
    print('Generated {} genes!'.format(len(self.genes)))

    # Generate initial generation
    self.population = Population(genes=self.genes, tc=self.tc)
    chromo_size = self.gam['chromosome_size']
    # Seed chromosome: original genes first, padded with randomly chosen
    # derived genes when there are fewer originals than the chromosome size.
    org_chromo = ogs[:chromo_size]
    diff = chromo_size - len(org_chromo)
    random.shuffle(new_genes)
    org_chromo += new_genes[:diff]
    assert len(org_chromo) == chromo_size, 'Error in original chromo size'
    self.population.members.append(
        Chromosome(org_chromo, self.gam['mate_method'], self.genes,
                   self.gam['mutate_scale']))
    assert len(self.population.members) == self.population.size, \
        'Discrepancies in population size'
    print('Generated a population of size {}'.format(self.population.size))

    if self.tc.use_gpu:
        alloc, rem = divmod(self.gam['workers'], len(self.tc.gpu_devices))
        # The device list is repeated ``alloc + rem`` times, which yields at
        # least ``workers`` entries.
        self.gpu_alloc = self.tc.gpu_devices * (alloc + rem)
def evaluate(self, x=None, y=None, update_rank=True):
    """Evaluate ``self.k_model`` and optionally persist the metrics.

    Args:
        x: Feature array. When ``None``, every column of ``self.data``
            except ``TARGET`` is used.
        y: Label array. When ``None``, the ``TARGET`` column of
            ``self.data`` is used.
        update_rank: When true, store the metrics under the current model
            folder in the ``board_models_ranks`` JSON file.

    Returns:
        dict mapping each name in ``self.k_model.metrics_names`` to the
        corresponding evaluation value.
    """
    if x is None or y is None:
        if self.data is None:
            self.load_train_data()
        # Explicit ``is None`` tests: ``x or default`` would call bool() on
        # an array, which raises for arrays with more than one element.
        if x is None:
            x = self.data.drop('TARGET', axis=1).values
        if y is None:
            y = self.data['TARGET'].values

    bm = self.tc.board_model
    self.evaluation = self.k_model.evaluate(
        x=x,
        y=y,
        batch_size=bm['batch_size'],
    )
    rank = dict(zip(self.k_model.metrics_names, self.evaluation))

    if update_rank:
        path = get_file('board_models_ranks', self.tc.version)
        try:
            with open(path) as fp:
                current_rank = json.load(fp)
        except (json.JSONDecodeError, FileNotFoundError):
            # A missing or unreadable rank file starts a new one.
            current_rank = {}
        current_rank[bm['model_folder']] = {
            'rank': rank,
            'model_ckpt': self.ckpt_model
        }
        with open(path, 'w') as fp:
            json.dump(current_rank, fp)
    return rank
def installments_payments(self, num_rows=None):
    """Aggregate the installments-payments table per ``SK_ID_CURR``.

    Args:
        num_rows: Passed to ``pd.read_csv`` as ``nrows``; ``None`` reads the
            whole file.

    Returns:
        Frame indexed by ``SK_ID_CURR`` whose columns are named
        ``INSTAL_<column>_<STAT>``, plus an ``INSTAL_COUNT`` column.
    """

    def _floor_at_zero(value):
        # Anything that is not strictly positive (NaN included) becomes 0.
        return value if value > 0 else 0

    ins = pd.read_csv(get_file('ins_pay'), nrows=num_rows)
    ins, cat_cols = self.one_hot_encoder(ins, nan_as_category=True)

    # Ratio and difference between the amount paid and the installment value.
    paid = ins['AMT_PAYMENT']
    due = ins['AMT_INSTALMENT']
    ins['PAYMENT_PERC'] = paid / due
    ins['PAYMENT_DIFF'] = due - paid

    # Days past due and days before due, floored at zero.
    entry_day = ins['DAYS_ENTRY_PAYMENT']
    due_day = ins['DAYS_INSTALMENT']
    ins['DPD'] = (entry_day - due_day).apply(_floor_at_zero)
    ins['DBD'] = (due_day - entry_day).apply(_floor_at_zero)

    aggregations = {
        'NUM_INSTALMENT_VERSION': ['nunique'],
        'DPD': ['max', 'mean', 'sum'],
        'DBD': ['max', 'mean', 'sum'],
        'PAYMENT_PERC': ['max', 'mean', 'sum', 'var'],
        'PAYMENT_DIFF': ['max', 'mean', 'sum', 'var'],
        'AMT_INSTALMENT': ['max', 'mean', 'sum'],
        'AMT_PAYMENT': ['min', 'max', 'mean', 'sum'],
        'DAYS_ENTRY_PAYMENT': ['max', 'mean', 'sum']
    }
    # One-hot columns are averaged.
    aggregations.update({cat: ['mean'] for cat in cat_cols})

    ins_agg = ins.groupby('SK_ID_CURR').agg(aggregations)
    # Flatten the (column, statistic) pairs into single names.
    flat_names = [
        'INSTAL_' + col + "_" + stat.upper()
        for col, stat in ins_agg.columns.tolist()
    ]
    ins_agg.columns = pd.Index(flat_names)

    # Number of installment rows per client.
    ins_agg['INSTAL_COUNT'] = ins.groupby('SK_ID_CURR').size()
    del ins
    gc.collect()
    return ins_agg
def gen_data(self, debug=False, version='v0'):
    """Build the full feature table and write it out as three CSV files.

    Args:
        debug: When true, every source table is read with at most 10000
            rows.
        version: Version key passed to ``get_file`` for the output paths.

    Returns:
        The joined frame after the listed unimportant features are dropped.
    """
    os.makedirs(get_file('curated', version), exist_ok=True)
    num_rows = 10000 if debug else None
    df = self.application_train_test(num_rows)

    # (timer label, builder method, text printed before the shape)
    stages = (
        ("Process bureau and bureau_balance", self.bureau_and_balance,
         "Bureau df shape:"),
        ("Process previous_applications", self.previous_applications,
         "Previous applications df shape:"),
        ("Process POS-CASH balance", self.pos_cash,
         "Pos-cash balance df shape:"),
        ("Process installments payments", self.installments_payments,
         "Installments payments df shape:"),
        ("Process credit card balance", self.credit_card_balance,
         "Credit card balance df shape:"),
    )
    for timer_label, build, shape_label in stages:
        with timer(timer_label):
            part = build(num_rows)
            print(shape_label, part.shape)
            df = df.join(part, how='left', on='SK_ID_CURR')
            # Drop the intermediate table before the next one is built.
            del part
            gc.collect()

    with timer("Saving data"):
        print(df.shape)
        print('Dropping unimportant features')
        df.drop(features_with_no_imp_at_least_twice, axis=1, inplace=True)
        gc.collect()
        print(df.shape)
        df.to_csv(get_file('all_data', version), index=False)
        # Rows with a TARGET go to the train file, the rest to the test file.
        labelled = df['TARGET'].notnull()
        df[labelled].to_csv(get_file('org_train', version), index=False)
        df[~labelled].to_csv(get_file('org_test', version), index=False)
    return df
def load_test_data(self):
    """Read the board test probabilities and split them into x / y arrays.

    Sets ``self.test_data`` (indexed by ``SK_ID_CURR``), ``self.test_x``
    (every column except ``TARGET``) and ``self.test_y`` (``TARGET``
    reshaped to a single column).
    """
    print('Loading test data')
    csv_path = get_file('board_test_prob', self.tc.version)
    frame = pd.read_csv(csv_path, index_col='SK_ID_CURR')
    self.test_data = frame
    print(frame.head())
    print('Test Data with shape {}'.format(frame.shape))
    self.test_x = frame.drop('TARGET', axis=1).values
    self.test_y = frame['TARGET'].values.reshape((-1, 1))
def __init__(self, odf, max_size=5, version='v1', is_sample=False):
    """Store the source frame and work out where the gene bins live.

    Args:
        odf: Source frame, kept as ``self.odf``.
        max_size: Stored as ``self.max_size``.
        version: Version key used to look up the bin directory.
        is_sample: Selects the ``bin_sample_{}.csv`` file-name template
            instead of ``bin_{}.csv``.
    """
    self.bins = {}
    self.odf = odf
    self.max_size = max_size
    self.version = version
    # The ``{}`` placeholder is left unformatted here.
    if is_sample:
        self.file = 'bin_sample_{}.csv'
    else:
        self.file = 'bin_{}.csv'
    bin_dir = get_file('genes_bin_dir', version)
    self.bin_path = os.path.join(bin_dir, self.file)
def __init__(self, version='v0'):
    """Predict with every board model and save the per-model probabilities.

    Reads the ``org_train`` / ``org_test`` CSVs, loads one LightGBM booster
    per entry of the ``board`` JSON, stores each model's predictions as a
    column, adds row-wise aggregate columns over the model columns, and
    writes both frames to the ``board_train_prob`` / ``board_test_prob``
    CSVs.

    Args:
        version: Version key passed to ``get_file``.

    Raises:
        ValueError: if the train probabilities contain inf or NaN.
    """
    train = pd.read_csv(get_file('org_train', v=version), index_col='index')
    test = pd.read_csv(get_file('org_test', v=version), index_col='index')
    # Turn the 'index' index back into a regular column.
    self.train_data = train.reset_index()
    self.test_data = test.reset_index()

    with open(get_file('board')) as fp:
        self.board = json.load(fp)
    self.models = {}
    for ix, model_file in enumerate(self.board):
        self.models['Model_{}'.format(ix)] = lgb.Booster(
            model_file=model_file)

    self.train_prob = pd.DataFrame()
    self.test_prob = pd.DataFrame()
    # NOTE(review): the full frames (including TARGET / SK_ID_CURR / index)
    # are passed to predict -- confirm this matches the training features.
    for name, booster in self.models.items():
        print('Model {} is predicting'.format(name))
        self.train_prob[name] = booster.predict(self.train_data)
        self.test_prob[name] = booster.predict(self.test_data)

    # Row-wise statistics across the per-model columns.
    model_cols = list(self.models)
    for stat in ('mean', 'median', 'max', 'min', 'std', 'var', 'mad', 'sum'):
        column = 'MODELS_' + stat.upper()
        self.train_prob[column] = self.train_prob[model_cols].agg(stat,
                                                                  axis=1)
        self.test_prob[column] = self.test_prob[model_cols].agg(stat, axis=1)

    for key in ('TARGET', 'SK_ID_CURR'):
        self.train_prob[key] = self.train_data[key]
    for key in ('TARGET', 'SK_ID_CURR'):
        self.test_prob[key] = self.test_data[key]

    has_inf = self.train_prob.isin([np.inf, -np.inf]).any().any()
    has_nan = self.train_prob.isna().any().any()
    if has_inf or has_nan:
        raise ValueError('Problem in data integrity!')

    self.train_prob.to_csv(get_file('board_train_prob', version),
                           index=False)
    self.test_prob.to_csv(get_file('board_test_prob', version), index=False)
def save_predicted(self,
                   x=None,
                   data=None,
                   path=None,
                   notes='',
                   include_rank=True):
    """Write a submission CSV and record it in the submissions metadata.

    Args:
        x: Forwarded to ``self.predict``.
        data: Frame whose index supplies ``SK_ID_CURR``; defaults to
            ``self.test_data``.
        path: Output CSV path; defaults to a generated file name inside the
            ``submission`` directory.
        notes: Free text stored in the metadata entry.
        include_rank: When true, ``self.evaluate()`` is run and its
            ``auc_roc`` value becomes part of the file name. When false,
            the name carries only the timestamp and ``rank`` is stored as
            ``None``.
    """
    os.makedirs(get_file('submission'), exist_ok=True)
    submission = pd.DataFrame()
    self.predict(x)
    rank = self.evaluate() if include_rank else None
    # ``data or ...`` would call bool() on a DataFrame, which raises.
    if data is None:
        data = self.test_data
    submission['SK_ID_CURR'] = data.index
    # presumably filled by the ``self.predict(x)`` call above -- TODO confirm
    submission['TARGET'] = self.predicted

    stamp = datetime.now().strftime('%m-%d_%H-%M-%S')
    if rank is not None:
        file = 'submission_{:.4f}_{}.csv'.format(rank['auc_roc'], stamp)
    else:
        # No evaluation was requested, so there is no score for the name.
        file = 'submission_{}.csv'.format(stamp)
    path = path or os.path.join(get_file('submission'), file)

    current_meta = safe_load_json(get_file('sub_meta'))
    current_meta[file] = {
        'path': path,
        'notes': notes,
        'model_path': self.tc.board_model['model_folder'],
        'model_ckpt': self.ckpt_model,
        'rank': rank
    }
    dump_json(current_meta, get_file('sub_meta'))
    submission.to_csv(path, index=False)
def gen_genetic_data(self, df=None, version='v0'):
    """Write one CSV "bin" of derived columns per original column.

    For each original column (all but the last), the column is combined
    with every later column through ``self.apply_operator`` and the result
    is saved to ``bin_{ix}.csv`` (``bin_sample_{ix}.csv`` when ``df`` is
    given). A JSON map from derived column name to bin index is written at
    the end.

    Args:
        df: Optional frame to use instead of the ``all_data`` CSV; also
            switches the output names to their ``sample`` variants.
        version: Version key passed to ``get_file``.
    """
    is_sample = df is not None
    bin_dir = get_file('genes_bin_dir', version)
    bin_file = 'bin_sample_{ix}.csv' if is_sample else 'bin_{ix}.csv'
    gene_file = 'genes_sample' if is_sample else 'genes'
    os.makedirs(bin_dir, exist_ok=True)
    operators = ['+', '-', '/', '*']
    if df is None:
        df = pd.read_csv(get_file('all_data', version))

    original_genes = df.drop(['SK_ID_CURR', 'TARGET'],
                             axis=1).columns.tolist()
    genes_map = {}
    og_len = len(original_genes)
    print('Starting')
    for ix, og in enumerate(original_genes[:-1]):
        # Report progress every tenth column (``ix // 10`` was truthy for
        # every index from 10 onwards).
        if ix % 10 == 0:
            print('{} of {}'.format(ix, og_len))
        # Only pair with later columns so each pair is produced once.
        cols = original_genes[ix + 1:]
        gen_cols = [og + op + c for c in cols for op in operators]
        # assumes apply_operator returns, per column of ``cols``, one result
        # per operator in the order of ``operators`` -- TODO confirm
        new_genes = self.apply_operator(df[og], df[cols])
        new_genes = [g for i in new_genes for g in i]
        gene_bin = pd.DataFrame(dict(zip(gen_cols, new_genes)))
        genes_map.update(dict.fromkeys(gen_cols, ix))
        gene_bin['SK_ID_CURR'] = df['SK_ID_CURR']
        gene_bin.to_csv(os.path.join(bin_dir, bin_file.format(ix=ix)),
                        index=False)
        del gene_bin
    dump_json(genes_map, get_file(gene_file, version))
def credit_card_balance(self, num_rows=None):
    """Aggregate the credit-card balance table per ``SK_ID_CURR``.

    Args:
        num_rows: Passed to ``pd.read_csv`` as ``nrows``; ``None`` reads the
            whole file.

    Returns:
        Frame indexed by ``SK_ID_CURR`` with ``CC_<column>_<STAT>`` columns
        (min, max, mean, sum and var of every remaining column) plus
        ``CC_COUNT``.
    """
    cc = pd.read_csv(get_file('cc_bal'), nrows=num_rows)
    cc, _ = self.one_hot_encoder(cc, nan_as_category=True)

    # SK_ID_PREV is dropped before aggregating.
    cc = cc.drop(['SK_ID_PREV'], axis=1)
    stats = ['min', 'max', 'mean', 'sum', 'var']
    cc_agg = cc.groupby('SK_ID_CURR').agg(stats)
    flat_names = [
        'CC_' + col + "_" + stat.upper()
        for col, stat in cc_agg.columns.tolist()
    ]
    cc_agg.columns = pd.Index(flat_names)

    # Number of credit-card rows per client.
    cc_agg['CC_COUNT'] = cc.groupby('SK_ID_CURR').size()
    del cc
    gc.collect()
    return cc_agg
def evolve(self):
    """Run the evolution loop until interrupted.

    Each pass assigns, starts and awaits the workers, sorts the population,
    reads the score of the first member, updates the best score/generation,
    writes the rank JSON and a pickle of the population members, prints a
    summary, then mates and mutates the members and increments the
    generation counter. When ``SAMPLE`` is truthy the loop stops after one
    pass. ``KeyboardInterrupt`` is caught and reported instead of
    propagating.
    """
    # NOTE(review): the original indentation of this block was lost. The
    # layout below assumes only the two ``best_*`` assignments belong to the
    # ``if`` and that the rank file and checkpoint are written on every
    # generation -- confirm against the original file.
    e = 1
    try:
        while e:
            self.assign_workers()
            self.start_workers()
            self.await_workers()
            self.population.sort()
            # After sorting, the first member is treated as the current best.
            cs = self.population.members[0].score
            gen = self.population.gen
            if cs >= self.best_score:
                self.best_score = cs
                self.best_gen = gen
            rank_file = 'sample_ga_rank' if SAMPLE else 'ga_rank'
            rank_path = get_file(rank_file, self.tc.version)
            current_rank = safe_load_json(rank_path)
            current_rank[self.population.members[0].model_path] = {
                'gen': self.population.gen,
                'score': cs
            }
            dump_json(current_rank, rank_path)
            # Checkpoint name comes from the '{gen}_{score}.pk' style
            # template held in ``self.ckpt_temp``.
            with open(self.ckpt_temp.format(gen=gen, score=cs), 'wb') as fp:
                pickle.dump(self.population.members, fp)
            print(
                '\n-----------------------------------------------------------------------------'
            )
            print(
                'Gen: {} | Best score: {} by {} | Current Best Score: {}'.
                format(gen, self.best_score, self.best_gen, cs))
            print(
                '-----------------------------------------------------------------------------\n'
            )
            self.population.mate_members()
            self.population.mutate_members()
            self.population.gen += 1
            gc.collect()
            # In sample mode the counter drops to 0 and ends the loop.
            if SAMPLE:
                e -= 1
    except KeyboardInterrupt:
        print('Exited Gracefully with score:{}'.format(self.best_score))
    print('Finished!')
def get_callbacks(self):
    """Create the model folders and return the training callbacks.

    Side effects: creates the board-models directory, the model folder and
    its ``logs`` / ``best`` sub-folders, and stores the model folder in
    ``self.tc.board_model['model_folder']``.

    Returns:
        list with a checkpoint callback for the model folder, a
        ``save_best_only`` checkpoint callback for the ``best`` folder, and
        a TensorBoard callback writing to ``logs``.
    """
    os.makedirs(get_file('board_models', self.tc.version), exist_ok=True)

    model_folder = self.get_model_folder()
    self.tc.board_model['model_folder'] = model_folder
    print('Model dir: {}'.format(model_folder))
    os.makedirs(model_folder, exist_ok=True)

    log_dir = os.path.join(model_folder, 'logs')
    print('Tensorboard log dir: {}'.format(log_dir))
    best_models_dir = os.path.join(model_folder, 'best')
    print('Best Model dir: {}'.format(best_models_dir))
    for folder in (log_dir, best_models_dir):
        os.makedirs(folder, exist_ok=True)

    # File-name template for the checkpoint files.
    save_format = 'weights.{epoch:02d}-{val_loss:.2f}-{auc_roc:.2f}.hdf5'
    every_epoch = ModelCheckpoint(os.path.join(model_folder, save_format))
    best_only = ModelCheckpoint(os.path.join(best_models_dir, save_format),
                                save_best_only=True)
    board = TensorBoard(log_dir=log_dir)
    return [every_epoch, best_only, board]
def pos_cash(self, num_rows=None):
    """Aggregate the POS-CASH balance table per ``SK_ID_CURR``.

    Args:
        num_rows: Passed to ``pd.read_csv`` as ``nrows``; ``None`` reads the
            whole file.

    Returns:
        Frame indexed by ``SK_ID_CURR`` with ``POS_<column>_<STAT>`` columns
        plus ``POS_COUNT``.
    """
    pos = pd.read_csv(get_file('pc_bal'), nrows=num_rows)
    pos, cat_cols = self.one_hot_encoder(pos, nan_as_category=True)

    aggregations = {
        'MONTHS_BALANCE': ['max', 'mean', 'size'],
        'SK_DPD': ['max', 'mean'],
        'SK_DPD_DEF': ['max', 'mean']
    }
    # One-hot columns are averaged.
    aggregations.update({cat: ['mean'] for cat in cat_cols})

    pos_agg = pos.groupby('SK_ID_CURR').agg(aggregations)
    flat_names = [
        'POS_' + col + "_" + stat.upper()
        for col, stat in pos_agg.columns.tolist()
    ]
    pos_agg.columns = pd.Index(flat_names)

    # Number of POS-CASH rows per client.
    pos_agg['POS_COUNT'] = pos.groupby('SK_ID_CURR').size()
    del pos
    gc.collect()
    return pos_agg
def kfold_lightgbm(tc=TrainConfig(),
                   manual=False,
                   test_df=None,
                   train_df=None):
    """Train LightGBM with K-fold cross-validation and save the artefacts.

    Args:
        tc: Training configuration, passed through ``load_train_config``.
        manual: When false, the train/test frames are read from the
            ``org_train`` / ``org_test`` CSVs for ``tc.version``; when true,
            ``train_df`` and ``test_df`` are used as given.
        test_df: Test frame, used when ``manual`` is true.
        train_df: Train frame, used when ``manual`` is true.

    Returns:
        Tuple ``(feature_importance_df, score, model_path)`` where ``score``
        is the AUC of the out-of-fold predictions on the train frame.
    """
    board = {}
    tc = load_train_config(tc)
    if not manual:
        train_df = pd.read_csv(get_file('org_train', v=tc.version),
                               index_col='index')
        test_df = pd.read_csv(get_file('org_test', v=tc.version),
                              index_col='index')
    print("Starting LightGBM. Train shape: {}, test shape: {}".format(
        train_df.shape, test_df.shape))

    # Every run gets its own timestamped folder under the models directory.
    models_root = get_file('models', v=tc.version)
    os.makedirs(models_root, exist_ok=True)
    model_dir = datetime.now().strftime('%m-%d_%H-%M-%S')
    model_path = os.path.join(models_root, model_dir)
    os.makedirs(model_path, exist_ok=True)
    sub_file = os.path.join(model_path, 'submission.csv')
    feat_imp_file = os.path.join(model_path, 'feature_importance.csv')
    model_txt = os.path.join(model_path, 'model-{score:.2f}.txt')
    tc_path = os.path.join(model_path, 'train_config.json')
    with open(tc_path, 'w') as tcp:
        json.dump(json.loads(tc.to_json()), tcp, indent=4)
    print('Model path: {}'.format(model_path))

    # Cross validation model
    splitter_cls = StratifiedKFold if tc.stratified else KFold
    folds = splitter_cls(n_splits=tc.num_folds,
                         shuffle=True,
                         random_state=1001)

    # Out-of-fold train predictions and fold-averaged test predictions.
    oof_preds = np.zeros(train_df.shape[0])
    sub_preds = np.zeros(test_df.shape[0])
    feature_importance_df = pd.DataFrame()
    excluded = ['TARGET', 'SK_ID_CURR', 'SK_ID_BUREAU', 'SK_ID_PREV', 'index']
    feats = [f for f in train_df.columns if f not in excluded]
    features = train_df[feats]
    target = train_df['TARGET']

    clf = None
    for n_fold, (train_idx, valid_idx) in enumerate(
            folds.split(features, target)):
        d_train = lgb.Dataset(data=features.iloc[train_idx],
                              label=target.iloc[train_idx],
                              free_raw_data=False,
                              silent=True)
        d_valid = lgb.Dataset(data=features.iloc[valid_idx],
                              label=target.iloc[valid_idx],
                              free_raw_data=False,
                              silent=True)
        clf = lgb.train(params=tc.params,
                        train_set=d_train,
                        num_boost_round=10000,
                        valid_sets=[d_train, d_valid],
                        early_stopping_rounds=200,
                        verbose_eval=False)

        oof_preds[valid_idx] = clf.predict(d_valid.data)
        sub_preds += clf.predict(test_df[feats]) / folds.n_splits

        fold_importance_df = pd.DataFrame({
            "feature": feats,
            "importance": clf.feature_importance(importance_type='gain'),
            "fold": n_fold + 1,
        })
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)
        print('Fold %2d AUC : %.6f' %
              (n_fold + 1, roc_auc_score(d_valid.label,
                                         oof_preds[valid_idx])))

    # Only the booster from the last fold is saved, named after the overall
    # out-of-fold score.
    score = roc_auc_score(target, oof_preds)
    mt = model_txt.format(score=score)
    board[mt] = score
    clf.save_model(mt)
    print('Full AUC score %.6f' % score)

    # Merge this run into the board file; a missing or unreadable file
    # starts a new board.
    try:
        with open(get_file('board')) as fp:
            current_board = json.load(fp)
    except (FileNotFoundError, json.JSONDecodeError):
        current_board = {}
    with open(get_file('board'), 'w') as fp:
        current_board.update(board)
        json.dump(current_board, fp)

    # Write the submission file and the feature importances.
    sub_df = test_df[['SK_ID_CURR']].copy()
    sub_df['TARGET'] = sub_preds
    sub_df[['SK_ID_CURR', 'TARGET']].to_csv(sub_file, index=False)
    feature_importance_df.to_csv(feat_imp_file, index=False)
    return feature_importance_df, score, model_path
def get_model_folder(self, model_type=None, time=None):
    """Return the folder path for a board model.

    Args:
        model_type: Folder-name prefix; falls back to ``self.model_type``
            when falsy.
        time: Folder-name suffix; falls back to the current time formatted
            as ``%m-%d_%H-%M-%S`` when falsy.

    Returns:
        ``<board_models dir>/<model_type>_<time>``.
    """
    stamp = time if time else datetime.now().strftime('%m-%d_%H-%M-%S')
    kind = model_type if model_type else self.model_type
    folder_name = '{type}_{time_format}'.format(type=kind,
                                                time_format=stamp)
    models_root = get_file('board_models', self.tc.version)
    return os.path.join(models_root, folder_name)
def previous_applications(self, num_rows=None):
    """Aggregate the previous-applications table per ``SK_ID_CURR``.

    Produces ``PREV_*`` aggregates over all rows, ``APPROVED_*`` and
    ``REFUSED_*`` aggregates over the rows with the matching contract
    status, and ``NEW_RATIO_PREV_*`` columns holding approved / refused.

    Args:
        num_rows: Passed to ``pd.read_csv`` as ``nrows``; ``None`` reads the
            whole file.

    Returns:
        Frame indexed by ``SK_ID_CURR``.
    """
    prev = pd.read_csv(get_file('pre_app'), nrows=num_rows)
    prev, cat_cols = self.one_hot_encoder(prev, nan_as_category=True)

    # Days 365243 values -> nan (presumably a missing-value placeholder).
    # Assign the result back instead of ``prev[col].replace(...,
    # inplace=True)``: that form modifies a possibly temporary Series and
    # does nothing under pandas Copy-on-Write.
    day_cols = ('DAYS_FIRST_DRAWING', 'DAYS_FIRST_DUE',
                'DAYS_LAST_DUE_1ST_VERSION', 'DAYS_LAST_DUE',
                'DAYS_TERMINATION')
    for col in day_cols:
        prev[col] = prev[col].replace(365243, np.nan)

    # Add feature: value asked / value received ratio
    prev['APP_CREDIT_PERC'] = prev['AMT_APPLICATION'] / prev['AMT_CREDIT']

    # Previous applications numeric features
    num_aggregations = {
        'AMT_ANNUITY': ['min', 'max', 'mean'],
        'AMT_APPLICATION': ['min', 'max', 'mean'],
        'AMT_CREDIT': ['min', 'max', 'mean'],
        'APP_CREDIT_PERC': ['min', 'max', 'mean', 'var'],
        'AMT_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'AMT_GOODS_PRICE': ['min', 'max', 'mean'],
        'HOUR_APPR_PROCESS_START': ['min', 'max', 'mean'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
    }
    # Previous applications categorical features
    cat_aggregations = {cat: ['mean'] for cat in cat_cols}

    prev_agg = prev.groupby('SK_ID_CURR').agg({
        **num_aggregations,
        **cat_aggregations
    })
    prev_agg.columns = pd.Index([
        'PREV_' + e[0] + "_" + e[1].upper()
        for e in prev_agg.columns.tolist()
    ])

    # Previous Applications: Approved Applications - only numerical features
    approved = prev[prev['NAME_CONTRACT_STATUS_Approved'] == 1]
    approved_agg = approved.groupby('SK_ID_CURR').agg(num_aggregations)
    # Keep the (column, statistic) pairs for the ratio columns below.
    cols = approved_agg.columns.tolist()
    approved_agg.columns = pd.Index(
        ['APPROVED_' + e[0] + "_" + e[1].upper() for e in cols])
    prev_agg = prev_agg.join(approved_agg, how='left', on='SK_ID_CURR')

    # Previous Applications: Refused Applications - only numerical features
    refused = prev[prev['NAME_CONTRACT_STATUS_Refused'] == 1]
    refused_agg = refused.groupby('SK_ID_CURR').agg(num_aggregations)
    refused_agg.columns = pd.Index([
        'REFUSED_' + e[0] + "_" + e[1].upper()
        for e in refused_agg.columns.tolist()
    ])
    prev_agg = prev_agg.join(refused_agg, how='left', on='SK_ID_CURR')
    del refused, refused_agg, approved, approved_agg, prev

    # Ratio of each approved aggregate to the matching refused aggregate.
    for e in cols:
        suffix = e[0] + "_" + e[1].upper()
        prev_agg['NEW_RATIO_PREV_' + suffix] = (
            prev_agg['APPROVED_' + suffix] / prev_agg['REFUSED_' + suffix])
    gc.collect()
    return prev_agg
def bureau_and_balance(self, num_rows=None, nan_as_category=True):
    """Aggregate the bureau and bureau-balance tables per ``SK_ID_CURR``.

    The bureau balance is first aggregated per ``SK_ID_BUREAU`` and joined
    onto the bureau rows. The result is then aggregated per client into
    ``BURO_*`` columns (all credits), ``ACTIVE_*`` and ``CLOSED_*`` columns
    (numeric aggregates of active / closed credits) and
    ``NEW_RATIO_BURO_*`` columns holding active / closed.

    Args:
        num_rows: Passed to ``pd.read_csv`` as ``nrows`` for both files.
        nan_as_category: Forwarded to ``self.one_hot_encoder``.

    Returns:
        Frame indexed by ``SK_ID_CURR``.
    """

    def _flat(columns, prefix=''):
        # Collapse (column, statistic) pairs into '<prefix>COLUMN_STAT'.
        return pd.Index(
            [prefix + e[0] + "_" + e[1].upper() for e in columns.tolist()])

    bureau = pd.read_csv(get_file('bureau'), nrows=num_rows)
    bb = pd.read_csv(get_file('bureau_bal'), nrows=num_rows)
    bb, bb_cat = self.one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = self.one_hot_encoder(bureau, nan_as_category)

    # Bureau balance: aggregate per bureau record and merge into bureau.
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    bb_aggregations.update({col: ['mean'] for col in bb_cat})
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    bb_agg.columns = _flat(bb_agg.columns)
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau = bureau.drop(['SK_ID_BUREAU'], axis=1)
    del bb, bb_agg
    gc.collect()

    # Bureau and bureau_balance numeric features
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    # Bureau and bureau_balance categorical features; the balance columns
    # carry the "_MEAN" suffix added by the per-record aggregation above.
    cat_aggregations = {cat: ['mean'] for cat in bureau_cat}
    cat_aggregations.update({cat + "_MEAN": ['mean'] for cat in bb_cat})

    all_aggregations = {**num_aggregations, **cat_aggregations}
    bureau_agg = bureau.groupby('SK_ID_CURR').agg(all_aggregations)
    bureau_agg.columns = _flat(bureau_agg.columns, 'BURO_')

    # Bureau: Active credits - using only numerical aggregations
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
    # Keep the (column, statistic) pairs for the ratio columns below.
    cols = active_agg.columns.tolist()
    active_agg.columns = _flat(active_agg.columns, 'ACTIVE_')
    bureau_agg = bureau_agg.join(active_agg, how='left', on='SK_ID_CURR')
    del active, active_agg
    gc.collect()

    # Bureau: Closed credits - using only numerical aggregations
    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
    closed_agg.columns = _flat(closed_agg.columns, 'CLOSED_')
    bureau_agg = bureau_agg.join(closed_agg, how='left', on='SK_ID_CURR')

    # Ratio of each active aggregate to the matching closed aggregate.
    for e in cols:
        suffix = e[0] + "_" + e[1].upper()
        bureau_agg['NEW_RATIO_BURO_' + suffix] = (
            bureau_agg['ACTIVE_' + suffix] / bureau_agg['CLOSED_' + suffix])
    del closed, closed_agg, bureau
    gc.collect()
    return bureau_agg
def application_train_test(self, num_rows=None, nan_as_category=False):
    """Load the train and test application tables and add derived features.

    The two tables are stacked, rows with ``CODE_GENDER == 'XNA'`` are
    removed, a set of ``NEW_*`` ratio/statistic columns is added, three
    two-valued columns are factorized, and the frame is passed through
    ``self.one_hot_encoder``.

    Args:
        num_rows: Passed to ``pd.read_csv`` as ``nrows`` for both files.
        nan_as_category: Forwarded to ``self.one_hot_encoder``.

    Returns:
        The combined, encoded frame.
    """
    # Read data and merge
    df = pd.read_csv(get_file('app_train'), nrows=num_rows)
    test_df = pd.read_csv(get_file('app_test'), nrows=num_rows)
    print("Train samples: {}, test samples: {}".format(
        len(df), len(test_df)))
    # ``DataFrame.append`` was removed in pandas 2.0; ``pd.concat`` stacks
    # the frames the same way.
    df = pd.concat([df, test_df]).reset_index()
    # Optional: Remove applications with XNA CODE_GENDER (train set)
    df = df[df['CODE_GENDER'] != 'XNA']

    docs = [_f for _f in df.columns if 'FLAG_DOC' in _f]
    live = [
        _f for _f in df.columns
        if 'FLAG_' in _f and 'FLAG_DOC' not in _f and '_FLAG_' not in _f
    ]

    # NaN values for DAYS_EMPLOYED: 365243 -> nan. Assigned back rather than
    # replaced "inplace" on the column, which acts on a possibly temporary
    # Series and does nothing under pandas Copy-on-Write.
    df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].replace(365243, np.nan)

    # Median income per organization type, mapped back onto the rows below.
    inc_by_org = df[[
        'AMT_INCOME_TOTAL', 'ORGANIZATION_TYPE'
    ]].groupby('ORGANIZATION_TYPE').median()['AMT_INCOME_TOTAL']
    ext_sources = ['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']

    df['NEW_CREDIT_TO_ANNUITY_RATIO'] = df['AMT_CREDIT'] / df['AMT_ANNUITY']
    df['NEW_CREDIT_TO_GOODS_RATIO'] = (df['AMT_CREDIT'] /
                                       df['AMT_GOODS_PRICE'])
    df['NEW_DOC_IND_AVG'] = df[docs].mean(axis=1)
    df['NEW_DOC_IND_STD'] = df[docs].std(axis=1)
    df['NEW_DOC_IND_KURT'] = df[docs].kurtosis(axis=1)
    df['NEW_LIVE_IND_SUM'] = df[live].sum(axis=1)
    df['NEW_LIVE_IND_STD'] = df[live].std(axis=1)
    df['NEW_LIVE_IND_KURT'] = df[live].kurtosis(axis=1)
    df['NEW_INC_PER_CHLD'] = df['AMT_INCOME_TOTAL'] / (1 +
                                                       df['CNT_CHILDREN'])
    df['NEW_INC_BY_ORG'] = df['ORGANIZATION_TYPE'].map(inc_by_org)
    df['NEW_EMPLOY_TO_BIRTH_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
    df['NEW_ANNUITY_TO_INCOME_RATIO'] = df['AMT_ANNUITY'] / (
        1 + df['AMT_INCOME_TOTAL'])
    df['NEW_SOURCES_PROD'] = (df['EXT_SOURCE_1'] * df['EXT_SOURCE_2'] *
                              df['EXT_SOURCE_3'])
    df['NEW_EXT_SOURCES_MEAN'] = df[ext_sources].mean(axis=1)
    df['NEW_SCORES_STD'] = df[ext_sources].std(axis=1)
    # Rows without a defined std get the column mean.
    df['NEW_SCORES_STD'] = df['NEW_SCORES_STD'].fillna(
        df['NEW_SCORES_STD'].mean())
    df['NEW_CAR_TO_BIRTH_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_BIRTH']
    df['NEW_CAR_TO_EMPLOY_RATIO'] = df['OWN_CAR_AGE'] / df['DAYS_EMPLOYED']
    df['NEW_PHONE_TO_BIRTH_RATIO'] = (df['DAYS_LAST_PHONE_CHANGE'] /
                                      df['DAYS_BIRTH'])
    df['NEW_PHONE_TO_EMPLOY_RATIO'] = (df['DAYS_LAST_PHONE_CHANGE'] /
                                       df['DAYS_EMPLOYED'])
    df['NEW_CREDIT_TO_INCOME_RATIO'] = (df['AMT_CREDIT'] /
                                        df['AMT_INCOME_TOTAL'])

    # Categorical features with Binary encode (0 or 1; two categories)
    for bin_feature in ['CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY']:
        df[bin_feature], _ = pd.factorize(df[bin_feature])
    # Categorical features with One-Hot encode
    df, cat_cols = self.one_hot_encoder(df, nan_as_category)

    del test_df
    gc.collect()
    return df
def load_train_data(self):
    """Read the board train probabilities into ``self.data``.

    The CSV for ``self.tc.version`` is loaded with ``SK_ID_CURR`` as the
    index.
    """
    print('Loading train data')
    csv_path = get_file('board_train_prob', self.tc.version)
    self.data = pd.read_csv(csv_path, index_col='SK_ID_CURR')