# 'object' columns to dummies

# In[56]:

# 3.0 Read installments payments data first
ins = pd.read_csv(
                  'installments_payments.csv.zip',
                  nrows = num_rows
                  )

# 3.0.1 Reduce memory usage by appropriately
#       changing data-types per feature:
ins = reducing.Reducer().reduce(ins)

# In[57]:

# 3.1
ins.shape    # (13605401, 8)
ins.head()

# In[58]:

# 3.2 No 'object'-type columns here
ins.dtypes.value_counts()
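# The comment above says the reducer changes data-types per feature to
# cut memory. A minimal sketch of that kind of downcasting, written as
# an assumption about what such a reducer does; the helper name
# downcast_numerics is hypothetical and is not the reducing.Reducer API:

import pandas as pd

def downcast_numerics(frame):
    """Downcast each numeric column to the smallest
    dtype that still holds its values (hypothetical
    stand-in for reducing.Reducer().reduce())."""
    out = frame.copy()
    for col in out.select_dtypes(include='integer').columns:
        out[col] = pd.to_numeric(out[col], downcast='integer')
    for col in out.select_dtypes(include='float').columns:
        out[col] = pd.to_numeric(out[col], downcast='float')
    return out

# Usage (equivalent in spirit to reducing.Reducer().reduce(ins)):
# ins = downcast_numerics(ins)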
# 'object' columns to dummies

# In[89]:

# 3.0 Read application train data first
df = pd.read_csv(
                 'application_train.csv.zip',
                 nrows = num_rows
                 )

# 3.0.1 Reduce memory usage by appropriately
#       changing data-types per feature:
df = reducing.Reducer().reduce(df)

# In[90]:

# 3.0.2 Read application test data next
test_df = pd.read_csv(
                      'application_test.csv.zip',
                      nrows = num_rows
                      )

# 3.0.3 Reduce memory usage by appropriately
#       changing data-types per feature:
test_df = reducing.Reducer().reduce(test_df)
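# With train and test both in memory, a common next step is to encode
# them together so that pd.get_dummies produces identical dummy columns
# for both frames. A minimal sketch, assuming df, test_df and the
# nan_as_category flag defined above; not necessarily this notebook's
# exact code:

n_train = len(df)                                   # remember split point
combined = pd.concat([df, test_df], ignore_index=True)
combined = pd.get_dummies(combined, dummy_na=nan_as_category)
df, test_df = combined.iloc[:n_train], combined.iloc[n_train:]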
# In[44]:

# 2.2 Some constants
num_rows = None           # Implies read all rows
nan_as_category = True    # While transforming
                          # 'object' columns to dummies

# In[45]:

# 3.0 Read previous application data first
prev = pd.read_csv('previous_application.csv.zip', nrows=num_rows)

# 3.0.1 Reduce memory usage by appropriately
#       changing data-types per feature:
prev = reducing.Reducer().reduce(prev)

# In[46]:

# 3.0.2
prev.shape      # (rows: 1,670,214, cols: 37)
prev.head(5)
prev.columns

# In[47]:

# 3.1 Let us examine how many unique IDs exist
prev['SK_ID_PREV'].nunique()   # 1,670,214: every row is a unique previous application
prev['SK_ID_CURR'].nunique()   # 338,857: so many SK_ID_CURR values repeat;
                               # we have to aggregate over SK_ID_CURR
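# Since SK_ID_CURR repeats, prev must be collapsed to one row per
# current application before it can be joined back. A minimal sketch of
# such an aggregation; the aggregated columns chosen here are
# illustrative, not the notebook's actual feature set:

prev_agg = prev.groupby('SK_ID_CURR').agg(
    {
        'AMT_CREDIT':  ['mean', 'max'],   # illustrative choices
        'AMT_ANNUITY': ['mean'],
        'SK_ID_PREV':  ['count'],         # number of previous loans
    }
)

# Flatten the resulting MultiIndex column names, e.g. PREV_AMT_CREDIT_MEAN:
prev_agg.columns = ['PREV_' + '_'.join(col).upper() for col in prev_agg.columns]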
# In[92]:

# 2.1 Some constants
num_rows = None           # Implies read all rows
nan_as_category = True    # While transforming
                          # 'object' columns to dummies

# In[93]:

# 3.0 Read the previously processed application data first
df = pd.read_csv('processed_df.csv.zip', nrows=num_rows)

# 3.0.1 Reduce memory usage by appropriately
#       changing data-types per feature:
df = reducing.Reducer().reduce(df)

# In[94]:

# 3.1
df.shape    # (356251, 262)
df.head(2)

# In[95]:

# 3.2 Drop the stray index columns written out by an earlier to_csv()
df.columns
df.drop(columns=['Unnamed: 0', 'index'], inplace=True)
df.columns

# In[96]:
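# Aside on the 'Unnamed: 0' column dropped in 3.2: it appears when a
# frame is saved with its index. A small sketch of how to avoid it at
# either end, shown commented out so it does not rerun the pipeline;
# the file name matches the read above, the rest is illustrative:

# Either write without the index ...
# df.to_csv('processed_df.csv.zip', index=False)
# ... or treat the first column as the index when reading back:
# df = pd.read_csv('processed_df.csv.zip', index_col=0, nrows=num_rows)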
# 'object' columns to dummies

# In[66]:

# 3.2 Read bureau data first
bureau = pd.read_csv(
                     'bureau.csv.zip',
                     nrows = None    # Read all rows
                     )

# 3.2.1 Reduce memory usage by appropriately
#       changing data-types per feature:
bureau = reducing.Reducer().reduce(bureau)

# In[67]:

# 3.2.2 Explore data now
bureau.head(5)
bureau.shape    # (rows: 1,716,428, cols: 17)
bureau.dtypes

# In[68]:

# 3.2.3 In all, how many are categoricals?
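# The cell answering 3.2.3 is not shown above; a minimal sketch of one
# way to count the categorical ('object') columns, assuming bureau as
# loaded in 3.2:

bureau.dtypes.value_counts()       # columns per dtype
cat_cols = bureau.select_dtypes(include='object').columns.tolist()
len(cat_cols)                      # number of categorical columns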
num_rows = None           # Implies read all rows
nan_as_category = True    # While transforming
                          # 'object' columns to dummies

# ## About the data
# <blockquote>POS_CASH_balance: Monthly data about previous point-of-sale or cash loans that clients have had with <u>Home Credit</u>. Each row is <i>one month</i> of a previous point-of-sale or cash loan, and a single previous loan can have many rows. This dataset contrasts with the <i>bureau_balance</i> dataset, where the monthly balances were of loans held with the <u>bureau</u>.</blockquote>

# In[29]:

# 3.2 Read POS_CASH balance data first
pos = pd.read_csv('POS_CASH_balance.csv.zip', nrows=num_rows)

# 3.2.1 Reduce memory usage by appropriately
#       changing data-types per feature:
pos = reducing.Reducer().reduce(pos)

# In[30]:

# 3.3
pos.shape    # (rows: 10,001,358, cols: 8)
pos.head()

# ## Feature explanations
# SK_ID_PREV:            ID of previous credit in Home Credit related to a loan in our sample
#                        (one loan in our sample can have 0, 1, 2 or more previous loans in Home Credit)
# SK_ID_CURR:            ID of loan in our sample
# MONTHS_BALANCE:        Month of balance relative to application date (-1 means the freshest
#                        monthly snapshot, 0 means the information at application; often it will be
#                        the same as -1, as many banks do not update the information to the Credit
#                        Bureau regularly)
# CNT_INSTALMENT:        Term of previous credit (can change over time)
# CNT_INSTALMENT_FUTURE: Installments left to pay on the previous credit
# NAME_CONTRACT_STATUS:  Contract status during the month
# SK_DPD:                DPD (days past due) during the month of previous credit
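# NAME_CONTRACT_STATUS is the lone 'object' column in this table and,
# per the nan_as_category flag above, gets expanded into dummies. A
# minimal sketch of such an encoder; the helper name one_hot_encoder
# follows common Home Credit kernels and is an assumption, not
# necessarily this notebook's own function:

def one_hot_encoder(frame, nan_as_category=True):
    """Expand 'object' columns into 0/1 dummy columns;
    dummy_na adds an explicit column for missing values."""
    original_columns = list(frame.columns)
    cat_columns = [c for c in frame.columns if frame[c].dtype == 'object']
    frame = pd.get_dummies(frame, columns=cat_columns,
                           dummy_na=nan_as_category)
    new_columns = [c for c in frame.columns if c not in original_columns]
    return frame, new_columns

# Usage:
# pos, pos_cat_cols = one_hot_encoder(pos, nan_as_category)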