def prepareSmallPickle():
    # Build the reduced "Train-only" pickles used by the experiments below.
    joined = du.loadPickleDF("application")
    small = joined[joined["dataclass"] == "Train"].copy()
    small = small.drop(["dataclass"], axis=1)
    du.dfToPickle("small", small)

    skid = small[["SK_ID_CURR"]]
    du.dfToPickle("skidsmall", skid)

    # Restrict the aggregated bureau features to the same training ids.
    bbsmall = du.loadPickleDF("bureau")
    bbsmall = skid.merge(bbsmall, how='left', on='SK_ID_CURR')
    bbsmall = bbsmall.drop(["dataclass"], axis=1)
    du.dfToPickle("bureausmall", bbsmall)
    print("done")
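# du.loadPickleDF / du.dfToPickle come from the project's datatools.datastore_util
# module, which is not part of this file. A minimal sketch of the assumed
# behaviour, matching the '.pck' files read directly elsewhere in this file.
# The _sketch suffix flags that these are illustrative stand-ins, not the real
# helpers; the default path is an assumption.
import os


def dfToPickle_sketch(name, df, path="../data/pickle"):
    # Persist a DataFrame as <path>/<name>.pck.
    df.to_pickle(os.path.join(path, name + ".pck"))


def loadPickleDF_sketch(name, path="../data/pickle"):
    # Load a DataFrame previously stored by dfToPickle_sketch.
    return pd.read_pickle(os.path.join(path, name + ".pck"))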
def quickHack():
    """Tried to see if higher thresholds for feature inclusion in the pruned
    data set would result in the last features added having improved
    contributions. Didn't seem so."""
    resultPath = "./output/eval_prune_3006"
    # NOTE: totalauc/totalbl are never populated below, so their prints emit
    # NaN; they look like leftovers from an earlier version of this hack.
    totalauc = pd.Series()
    totalbl = pd.Series()
    v = []
    last10 = []
    for i in range(0, 5):
        file = "result_" + str(i)
        result = du.loadPickleDF(file, path=resultPath)
        feats = result[result["INCL"] == True]["COL"].values
        print(file + ":" + str(len(feats)) + ":" + str(result.iloc[-1]["SCORE"]))
        v.append(result.iloc[-1]["SCORE"])
        # Mean inclusion ratio over the last stretch of candidate features.
        rl10 = result.iloc[40:55]["RATIO"]
        last10.append(rl10.mean())
    print(str(np.mean(v)) + ":" + str(np.std(v)))
    print(str(np.mean(last10)) + ":" + str(np.std(last10)))
    print(str(totalauc.mean()) + " " + str(totalauc.std()) + " " +
          str(totalauc.mean() / totalauc.std()))

    resultPath = "./output/eval_prune_3106"
    totalauc = pd.Series()
    totalbl = pd.Series()
    for th in (0.2, 0.5, 1.0):
        v = []
        last10 = []
        for i in range(0, 5):
            file = "result_" + str(th) + "_" + str(i)
            result = du.loadPickleDF(file, path=resultPath)
            feats = result[result["INCL"] == True]["COL"].values
            print(file + ":" + str(len(feats)) + ":" +
                  str(result.iloc[-1]["SCORE"]))
            v.append(result.iloc[-1]["SCORE"])
            rl10 = result.iloc[40:55]["RATIO"]
            last10.append(rl10.mean())
        print(str(np.mean(v)) + ":" + str(np.std(v)))
        print(str(np.mean(last10)) + ":" + str(np.std(last10)))
        print(str(totalauc.mean()) + " " + str(totalauc.std()) + " " +
              str(totalauc.mean() / totalauc.std()))
def bureauBalanceLoanPredictPickle(num_rows=None, nan_as_category=True):
    # bureau = pd.read_csv('../data/csv/bureau.csv', nrows=num_rows)
    # bureau.to_pickle('../data/csv/bureau.pkl')
    bureau = pd.read_pickle('../data/pickle/bureau.pck')
    # bb = pd.read_csv('../data/csv/bureau_balance.csv', nrows=num_rows)
    # bb.to_pickle('../data/csv/bureau_balance.pkl')
    bb = pd.read_pickle('../data/pickle/bureau_balance.pck')  # loaded but unused here

    # Attach TARGET/dataclass from the application table to each bureau row,
    # then keep only training rows.
    joined = du.loadPickleDF("application")
    joined = joined[["SK_ID_CURR", "TARGET", "dataclass"]]
    # bureau = joined.merge(bureau, how='left', on="SK_ID_CURR")
    bureau = bureau.merge(joined, how='left', on="SK_ID_CURR")
    bureau = bureau[bureau["dataclass"] == "Train"]
    bureau = bureau.drop(["dataclass"], axis=1)
    ut.process_categories(bureau, ["TARGET"], checkCats=False)
    du.dfToPickle("bureausmall-loanpredict", bureau)
    print("done")
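# ut.process_categories comes from util.preprocess_util and is not shown in
# this file. Judging only from its in-place call above, it appears to encode
# the remaining object columns while leaving the listed columns untouched;
# this is a heavily hedged sketch of that assumed behaviour (checkCats is kept
# for signature compatibility, but its real meaning is unknown here):
def process_categories_sketch(df, skip_cols, checkCats=True):
    for col in df.columns:
        if col in skip_cols:
            continue
        if df[col].dtype == 'object':
            # Encode in place; missing values become the code -1.
            df[col] = df[col].astype('category').cat.codes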
def comparePrunedFeatureSet(train, target, params, scoreModel):
    # train/target/params/scoreModel were read from module scope in the
    # original; they are taken as parameters here so the local reassignment of
    # train/target below is well-defined. fs (the feature-selection module)
    # and randint (random.randint) are assumed module-level imports.
    import pickle
    resultPath = "./output/"

    # Hold out the first 30% of rows for evaluation.
    evalidx = int(len(train) * 0.3)
    evaltrain = train[:evalidx]
    evaltarget = target[:evalidx]
    train = train[evalidx:]
    target = target[evalidx:]

    # for th in (0.2, 0.5, 1.0):
    #     for i in range(0, 5):
    for th in (1.0, ):
        for i in range(2, 5):
            seed = randint(1, 60000)
            result = fs.createRandomFeatureSet(train, target, params,
                                               scoreModel, seed, th)
            du.dfToPickle("result_" + str(th) + "_" + str(i), result, resultPath)
            gc.collect()

    pruned = []
    baseline = []
    for th in (0.2, 0.5, 1.0):
        for i in range(0, 5):
            seed = randint(1, 60000)
            result = du.loadPickleDF("result_" + str(th) + "_" + str(i),
                                     path=resultPath)
            feats = result[result["INCL"] == True]["COL"].values
            # Score the full feature set against the pruned one, same seed.
            baseline.append(
                evalLGBModelAUC(train, target, evaltrain, evaltarget, params,
                                seed))
            pruned.append(
                evalLGBModelAUC(train[feats], target, evaltrain[feats],
                                evaltarget, params, seed))
            pickle.dump({"baseline": baseline, "pruned": pruned},
                        open(resultPath + "eval_" + str(th) + "_" + str(i) +
                             ".pck", "wb"))
    print("------ done")
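# evalLGBModelAUC is called by comparePrunedFeatureSet above but defined
# elsewhere in the project. A minimal sketch of the assumed behaviour: train a
# seeded LightGBM model and return AUC on the held-out split. The fixed round
# count and the way the seed enters params are assumptions, not project code.
import lightgbm as lgb
from sklearn.metrics import roc_auc_score


def evalLGBModelAUC_sketch(train, target, evaltrain, evaltarget, params, seed):
    trainParams = dict(params, seed=seed)  # assumed: seed injected into params
    booster = lgb.train(trainParams,
                        lgb.Dataset(train, label=target),
                        num_boost_round=500)  # assumed fixed round count
    return roc_auc_score(evaltarget, booster.predict(evaltrain))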
def bureauBalancePickle(num_rows=None, nan_as_category=True):
    bureau = pd.read_pickle('../data/pickle/bureau.pck')
    bb = pd.read_pickle('../data/pickle/bureau_balance.pck')
    bb, bb_cat = one_hot_encoder(bb, nan_as_category)
    bureau, bureau_cat = one_hot_encoder(bureau, nan_as_category)
    bureau = bureau[bureau["DAYS_CREDIT_UPDATE"] > -180]

    # Bureau balance: Perform aggregations and merge with bureau.csv
    bb_aggregations = {'MONTHS_BALANCE': ['min', 'max', 'size']}
    for col in bb_cat:
        bb_aggregations[col] = ['mean']
    bb_agg = bb.groupby('SK_ID_BUREAU').agg(bb_aggregations)
    bb_agg.columns = pd.Index(
        [e[0] + "_" + e[1].upper() for e in bb_agg.columns.tolist()])
    bureau = bureau.join(bb_agg, how='left', on='SK_ID_BUREAU')
    bureau.drop(['SK_ID_BUREAU'], axis=1, inplace=True)
    del bb, bb_agg
    gc.collect()

    # Bureau and bureau_balance numeric features
    num_aggregations = {
        'DAYS_CREDIT': ['min', 'max', 'mean', 'var'],
        'DAYS_CREDIT_ENDDATE': ['min', 'max', 'mean'],
        'DAYS_CREDIT_UPDATE': ['mean'],
        'CREDIT_DAY_OVERDUE': ['max', 'mean'],
        'AMT_CREDIT_MAX_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_DEBT': ['max', 'mean', 'sum'],
        'AMT_CREDIT_SUM_OVERDUE': ['mean'],
        'AMT_CREDIT_SUM_LIMIT': ['mean', 'sum'],
        'AMT_ANNUITY': ['max', 'mean'],
        'CNT_CREDIT_PROLONG': ['sum'],
        'MONTHS_BALANCE_MIN': ['min'],
        'MONTHS_BALANCE_MAX': ['max'],
        'MONTHS_BALANCE_SIZE': ['mean', 'sum']
    }
    # Bureau and bureau_balance categorical features
    cat_aggregations = {}
    for cat in bureau_cat:
        cat_aggregations[cat] = ['mean']
    for cat in bb_cat:
        cat_aggregations[cat + "_MEAN"] = ['mean']

    bureau_agg = bureau.groupby('SK_ID_CURR').agg({
        **num_aggregations,
        **cat_aggregations
    })
    bureau_agg.columns = pd.Index([
        'BURO_' + e[0] + "_" + e[1].upper()
        for e in bureau_agg.columns.tolist()
    ])
    # Bureau: Active credits - using only numerical aggregations
    active = bureau[bureau['CREDIT_ACTIVE_Active'] == 1]
    active_agg = active.groupby('SK_ID_CURR').agg(num_aggregations)
    active_agg.columns = pd.Index([
        'ACTIVE_' + e[0] + "_" + e[1].upper()
        for e in active_agg.columns.tolist()
    ])
    bureau_agg = bureau_agg.join(active_agg, how='left')
    del active, active_agg
    gc.collect()
    # Bureau: Closed credits - using only numerical aggregations
    closed = bureau[bureau['CREDIT_ACTIVE_Closed'] == 1]
    closed_agg = closed.groupby('SK_ID_CURR').agg(num_aggregations)
    closed_agg.columns = pd.Index([
        'CLOSED_' + e[0] + "_" + e[1].upper()
        for e in closed_agg.columns.tolist()
    ])
    bureau_agg = bureau_agg.join(closed_agg, how='left')
    del closed, closed_agg, bureau
    gc.collect()

    joined = du.loadPickleDF("application")
    joined = joined[["SK_ID_CURR", "TARGET", "dataclass"]]
    bureau_agg = joined.join(bureau_agg, how='left', on="SK_ID_CURR")
    du.dfToPickle("bureau", bureau_agg)
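# one_hot_encoder is used by bureauBalancePickle above but defined elsewhere.
# Its call shape (returns the dummified frame plus the list of newly created
# columns) matches the widely used Home Credit kernel helper; a sketch under
# that assumption:
def one_hot_encoder_sketch(df, nan_as_category=True):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns
                           if df[col].dtype == 'object']
    # One-hot encode the object columns, optionally with a NaN indicator column.
    df = pd.get_dummies(df, columns=categorical_columns,
                        dummy_na=nan_as_category)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns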
print("done") du.csvdirToPickle("../data/csv") applicationPickle() bureauBalancePickle() prepareSmallPickle() bureauBalanceLoanPredictPickle() debug = False numBoostRounds = 5000 coreMax = 15 # train = du.loadPickle("traintest") # train = du.loadPickleDF("small") # train = du.loadPickleDF("bureausmall") train = du.loadPickleDF("bureausmall-loanpredict") # feats=["TARGET","dataclass",'BURO_DAYS_CREDIT_ENDDATE_MIN' ,'BURO_DAYS_CREDIT_ENDDATE_MAX', 'BURO_DAYS_CREDIT_ENDDATE_MEAN','ACTIVE_DAYS_CREDIT_ENDDATE_MIN', 'ACTIVE_DAYS_CREDIT_ENDDATE_MAX', 'ACTIVE_DAYS_CREDIT_ENDDATE_MEAN', 'CLOSED_DAYS_CREDIT_ENDDATE_MIN' ,'CLOSED_DAYS_CREDIT_ENDDATE_MAX', 'CLOSED_DAYS_CREDIT_ENDDATE_MEAN','BURO_MONTHS_BALANCE_MIN_MIN', 'BURO_MONTHS_BALANCE_MAX_MAX' ,'BURO_MONTHS_BALANCE_SIZE_MEAN', 'BURO_MONTHS_BALANCE_SIZE_SUM', 'ACTIVE_MONTHS_BALANCE_MIN_MIN', 'ACTIVE_MONTHS_BALANCE_MAX_MAX', 'ACTIVE_MONTHS_BALANCE_SIZE_MEAN','ACTIVE_MONTHS_BALANCE_SIZE_SUM','CLOSED_MONTHS_BALANCE_MIN_MIN', 'CLOSED_MONTHS_BALANCE_MAX_MAX' ,'CLOSED_MONTHS_BALANCE_SIZE_MEAN', 'CLOSED_MONTHS_BALANCE_SIZE_SUM',] # feats=["TARGET","dataclass",'BURO_CNT_CREDIT_PROLONG_SUM','ACTIVE_CNT_CREDIT_PROLONG_SUM','CLOSED_CNT_CREDIT_PROLONG_SUM'] # train = train[feats] # vis.visualizeCategorical(train) # vis.visualizeNumerical(train) # sys.exit(0) if (debug): #train = train.iloc[:6000] train = train[[ "SK_ID_CURR", "TARGET", "DAYS_BIRTH", "AMT_GOODS_PRICE", "AMT_ANNUITY", "DAYS_EMPLOYED", "CODE_GENDER", "DAYS_ID_PUBLISH" ]] numBoostRounds = 500
import matplotlib.pyplot as plt
from sklearn import svm
import gc
import sys
import datatools.datastore_util as du
import datatools.model_util as modelUtil
import datatools.visualize as vis
import pandas as pd
import numpy as np
import util.preprocess_util as ut
import time
from datatools.transform_util import cart2sphere

# Small SVM experiment on two credit features; the load below was commented
# out in the original, leaving `train` undefined, so it is restored here.
train = du.loadPickleDF("small")
train = train[["TARGET", 'AMT_CREDIT', 'AMT_ANNUITY']][0:500]
train = train.dropna()
y = train["TARGET"]
feats = ['AMT_CREDIT', 'AMT_ANNUITY']
X = train[feats]
X = X.to_numpy()  # DataFrame.as_matrix() was removed from pandas; use to_numpy()
y = y.to_numpy()

# we create clusters with 1000 and 100 points
# rng = np.random.RandomState(0)
# n_samples_1 = 1000
# n_samples_2 = 100
# X = np.r_[1.5 * rng.randn(n_samples_1, 2),
#           0.5 * rng.randn(n_samples_2, 2) + [2, 2]]
# y = [0] * (n_samples_1) + [1] * (n_samples_2)
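# The commented block above mirrors scikit-learn's "SVM: separating hyperplane
# for unbalanced classes" example. A minimal assumed continuation: fit an
# unweighted and a class-weighted linear SVM on the two selected features and
# plot both separating lines (class_weight={1: 10} is illustrative, not tuned;
# TARGET=1 is the rare class in this data set).
clf = svm.SVC(kernel='linear', C=1.0)
clf.fit(X, y)

wclf = svm.SVC(kernel='linear', C=1.0, class_weight={1: 10})
wclf.fit(X, y)

plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Paired, edgecolors='k')
xx = np.linspace(X[:, 0].min(), X[:, 0].max())
for model, style, label in ((clf, 'k-', 'unweighted'),
                            (wclf, 'k--', 'weighted')):
    # For a linear kernel the boundary is w0*x + w1*y + b = 0.
    w = model.coef_[0]
    yy = -(w[0] * xx + model.intercept_[0]) / w[1]
    plt.plot(xx, yy, style, label=label)
plt.legend()
plt.show()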