コード例 #1
0
                             #   'object' columns to dummies


# In[56]:


# 3.0 Load the installment-payments table
#     (num_rows caps how many rows are read)
ins = pd.read_csv('installments_payments.csv.zip', nrows=num_rows)

# 3.0.1 Cut memory usage: Reducer re-types each
#       feature with a more appropriate data-type
ins = reducing.Reducer().reduce(ins)


# In[57]:


# 3.1 Size and first rows of the table
ins.shape   # recorded output: (13605401, 8)
ins.head()


# In[58]:


# 3.2 Tally of column dtypes -- the original run
#     showed no 'object' type column
ins.dtypes.value_counts()
コード例 #2
0
                             #   'object' columns to dummies


# In[89]:


# 3.0 Load the training applications
#     (num_rows caps how many rows are read)
df = pd.read_csv('application_train.csv.zip', nrows=num_rows)

# 3.0.1 Cut memory usage: Reducer re-types each
#       feature with a more appropriate data-type
df = reducing.Reducer().reduce(df)


# In[90]:


# 3.0 Load the test applications in exactly the same way
test_df = pd.read_csv('application_test.csv.zip', nrows=num_rows)

# 3.0.1 Same memory reduction for the test frame
test_df = reducing.Reducer().reduce(test_df)
コード例 #3
0
# In[44]:

# 2.2 Notebook-wide settings
# Passed to pd.read_csv(nrows=...); None means all rows are read
num_rows = None
# Applies while transforming 'object' columns to dummies
nan_as_category = True

# In[45]:

# 3.0 Load the previous-applications table
prev = pd.read_csv(
    filepath_or_buffer='previous_application.csv.zip',
    nrows=num_rows,
)

# 3.0.1 Cut memory usage: Reducer re-types each
#       feature with a more appropriate data-type
prev = reducing.Reducer().reduce(prev)

# In[46]:

# 3.0.2 Size, first rows and column names
prev.shape  # recorded output: 1,670,214 rows x 37 columns
prev.head(n=5)
prev.columns

# In[47]:

# 3.1 Count the distinct IDs in each key column

prev['SK_ID_PREV'].nunique()  # recorded: 1670214 distinct values
prev['SK_ID_CURR'].nunique()  # recorded: 338857, so SK_ID_CURR values repeat
# Hence the table has to be aggregated over SK_ID_CURR
コード例 #4
0
# In[92]:

# 2.1 Notebook-wide settings
# Passed to pd.read_csv(nrows=...); None means all rows are read
num_rows = None
# Applies while transforming 'object' columns to dummies
nan_as_category = True

# In[93]:

# 3.0 Load the already-processed data set
df = pd.read_csv(
    filepath_or_buffer='processed_df.csv.zip',
    nrows=num_rows,
)

# 3.0.1 Cut memory usage: Reducer re-types each
#       feature with a more appropriate data-type
df = reducing.Reducer().reduce(df)

# In[94]:

# 3.1 Size and first two rows
df.shape  # recorded output: (356251, 262)
df.head(n=2)

# In[95]:

# 3.2 Remove the 'Unnamed: 0' and 'index' columns
#     (presumably index left-overs from writing processed_df -- confirm).
#     errors='ignore' keeps this cell re-runnable: once the columns are
#     gone, a second run is a no-op instead of raising KeyError.
#     `df` is rebound rather than mutated with inplace=True.
df.columns
df = df.drop(columns=['Unnamed: 0', 'index'], errors='ignore')
df.columns

# In[96]:
コード例 #5
0
                             #   'object' columns to dummies


# In[66]:


# 3.2 Load the bureau table; nrows=None reads all rows
bureau = pd.read_csv('bureau.csv.zip', nrows=None)

# 3.2.1 Cut memory usage: Reducer re-types each
#       feature with a more appropriate data-type
bureau = reducing.Reducer().reduce(bureau)


# In[67]:


# 3.2.2 First look at the data
bureau.head(n=5)
bureau.shape   # recorded output: 1,716,428 rows x 17 columns
bureau.dtypes


# In[68]:


# 3.2.3 In all, how many are categoricals?
コード例 #6
0
# Passed to pd.read_csv(nrows=...); None means all rows are read
num_rows = None
# Applies while transforming 'object' columns to dummies
nan_as_category = True

# ## About the data
# <blockquote>POS_CASH_BALANCE: Monthly data about previous point of sale or cash loans clients have had with <u>Home Credit</u>. Each row is <i>one month</i> of a previous point of sale or cash loan, and a single previous loan can have many rows. This dataset contrasts with <i>bureau_balance</i> dataset where monthly installments were of loans with <u>bureau</u>.</blockquote>

# In[29]:

# 3.2 Load the POS_CASH_balance table
pos = pd.read_csv(
    filepath_or_buffer='POS_CASH_balance.csv.zip',
    nrows=num_rows,
)

# 3.2.1 Cut memory usage: Reducer re-types each
#       feature with a more appropriate data-type
pos = reducing.Reducer().reduce(pos)

# In[30]:

# 3.3 Size and first rows of the table
pos.shape  # recorded output: 10,001,358 rows x 8 columns
pos.head()

# ## Feature explanations
# SK_ID_PREV : 	ID of previous credit in Home Credit related to loan in our sample. (One loan in our sample can have 0,1,2 or more previous loans in Home Credit)
# SK_ID_CURR: 	ID of loan in our sample
# MONTHS_BALANCE: 	Month of balance relative to the application date (-1 means the freshest monthly snapshot; 0 means the information at application, which will often be the same as -1 because many banks do not update the information to the Credit Bureau regularly)
# CNT_INSTALMENT: 	Term of previous credit (can change over time)
# CNT_INSTALMENT_FUTURE: 	Installments left to pay on the previous credit
# NAME_CONTRACT_STATUS: 	Contract status during the month
# SK_DPD: 	DPD (days past due) during the month of previous credit