Esempio n. 1
0
    def define_dataset(self):
        """Profile ``self.ds1_df`` and return its feature-parameter tables.

        The steps run in this order:

        1. Report the features with missing values via
           ``f.get_missing_value_feats``.
        2. Split the columns into numerical and categorical features.
        3. Re-type the columns with ``f.change_type`` using
           ``count_threshold=5``. Its return value is not used, so it
           presumably modifies ``self.ds1_df`` in place -- TODO confirm.
        4. Split the columns again (step 3 may have moved columns between
           the two groups) and build the parameter tables.

        Returns:
            tuple: ``(par_num_df_start, par_cat_df_start)`` exactly as
            produced by ``f.get_params`` for the numerical and categorical
            features respectively.
        """
        frame = self.ds1_df

        # Observe the features with missing values.
        f.get_missing_value_feats(frame)

        # First split: only the numerical list is needed for the re-typing.
        num_feats, _ = self.seperate_cat_num_var(frame)

        # Change the datatype of categorical and numerical values.
        f.change_type(frame, num_feats, count_threshold=5)

        # Second split, taken after the re-typing, feeds the parameter tables.
        num_feats, cat_feats = self.seperate_cat_num_var(frame)
        return f.get_params(frame, num_feats, cat_feats)
    def define_dataset(self, df=None, ch_type=False, cnt_threshold=2):
        """Profile a dataset and return its parameter tables with statistics.

        Args:
            df: Dataset to profile. When omitted (``None``), ``self.ds1_df``
                is used.
            ch_type: When true, the columns are re-typed with
                ``f.change_type`` before the parameter tables are built.
            cnt_threshold: Forwarded to ``f.change_type`` as
                ``count_threshold``; only used when ``ch_type`` is true.

        Returns:
            tuple: ``(par_num_df_start, par_cat_df_start)`` -- the two tables
            returned by ``self.define_params(df)``, each left-joined with the
            output of ``f.feature_stats(df)``.
        """
        # ``df == None`` on a DataFrame is an element-wise comparison whose
        # result has no single truth value (pandas raises ValueError), so the
        # default must be detected by identity.
        if df is None:
            df = self.ds1_df

        # Observe the features with missing values.
        f.get_missing_value_feats(df)

        # Separate the categorical and numerical features; only the numerical
        # list is needed here, for the optional re-typing.
        num_feats, _ = self.seperate_cat_num_var(df)

        # Change the datatype of categorical and numerical values on request.
        if ch_type:
            f.change_type(df, num_feats, count_threshold=cnt_threshold)

        # Build the parameter tables and enrich both with per-feature stats.
        par_num_df_start, par_cat_df_start = self.define_params(df)
        stats_df = f.feature_stats(df)

        # Left join keeps every row of the parameter tables even when
        # stats_df has no matching entry.
        par_num_df_start = par_num_df_start.join(stats_df, how='left')
        par_cat_df_start = par_cat_df_start.join(stats_df, how='left')

        return par_num_df_start, par_cat_df_start
# Report which features of x_df contain missing values.
f.get_missing_value_feats(x_df)


# In[5]:


# Split the columns into numerical and categorical feature lists, then take
# TARGET out of the numerical list before the type conversion below.
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')


# In[6]:


# Re-type the remaining numerical features (third argument: 10).
f.change_type(x_df, num_feats, 10)


# In[7]:


# Split again now that the types have been changed.
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))


# In[10]:


# One-hot encode the dataset, then keep only the rows where TARGET equals 1.
x_df_dum = pd.get_dummies(x_df)
x_df_Default_dum = x_df_dum.loc[x_df_dum['TARGET'] == 1]
# Keep only the balance rows whose SK_ID_BUREAU also appears in b_df.
# reset_index returns a new frame, so it is chained into the assignment;
# calling it on its own line and discarding the result leaves the index as is.
bal_df = bureau_balance_df.loc[
    bureau_balance_df['SK_ID_BUREAU'].isin(b_df.SK_ID_BUREAU)
].reset_index(drop=True)

# The full source frames are no longer needed; drop the references.
del bureau_df, bureau_balance_df

################################ CHANGING THE DATA TYPES ################################

# Separate the categorical and numerical features of b_df.
num_feats_b, cat_feats_b = f.distinct_feats(b_df)
print(len(num_feats_b), len(cat_feats_b))

# Change the datatype of categorical and numerical values.
f.change_type(b_df, num_feats_b, count_threshold=5)

# Split again after the type change and take the ID columns out of the
# numerical list, so they are not passed to get_params as numerical features.
num_feats_b, cat_feats_b = f.distinct_feats(b_df)
for i in ['SK_ID_BUREAU', 'SK_ID_CURR']:
    num_feats_b.remove(i)
print(len(num_feats_b), len(cat_feats_b))

# Build the parameter tables for the numerical and categorical features
# (per the original note: skew/kurtosis, missing values and outliers).
par_num_df_start, par_cat_df_start = f.get_params(b_df, num_feats_b, cat_feats_b)

############################# FEATURE TREATMENT AND EXTRACTION #########################
# The features are expected to be extracted and grouped at SK_ID_CURR level
# so that they line up with the Loan Application (client) level. Hence we need
# to extract aggregated, client-level information out of this dataset.
# This means that treatment of individual columns would not be generalised as
# Delete the original dataset and work with the sample (presumably x_df) to
# free some memory for the processing below.
del train_df

################################ CHANGING THE DATA TYPES ################################

# Report the features with missing values (the return value is not used here).
f.get_missing_value_feats(x_df)

# Separate the categorical and numerical features.
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
# Take the target and the ID column out of the numerical list so they are not
# passed to change_type below. list.remove raises ValueError if absent.
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')

# Change the datatype of categorical and numerical values.
f.change_type(x_df, num_feats, count_threshold=5)

# Separate the features again after the type change, and once more exclude
# the target and ID column before building the parameter tables.
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')

# Parameter tables for the numerical and categorical features.
par_num_df_start, par_cat_df_start = f.get_params(x_df, num_feats, cat_feats)
############################# IDENTIFYING MISSING FEATS #########################

# Identify the features where NA values exist and keep them in a list.
missing_value_feats = f.get_missing_value_feats(x_df)
# Bare expression (notebook-style display); it has no effect when this runs
# as a plain script.
missing_value_feats

# Calculate Missing Value percentage and Visualize
# Load the full POS-cash dataset.
pos_cash_df = pd.read_csv(pos_cash_dataset)

# Keep only the rows whose SK_ID_CURR appears in train_clean_bureau_df.
# reset_index returns a new frame, so it is chained into the assignment;
# calling it on its own line and discarding the result leaves the index as is.
p_df = pos_cash_df.loc[
    pos_cash_df['SK_ID_CURR'].isin(train_clean_bureau_df.SK_ID_CURR)
].reset_index(drop=True)

# The full frame is no longer needed; drop the reference.
del pos_cash_df

################################ CHANGING THE DATA TYPES ################################

# Separate the categorical and numerical features of p_df.
num_feats_p, cat_feats_p = f.distinct_feats(p_df)
print(len(num_feats_p), len(cat_feats_p))

# Change the datatype of categorical and numerical values.
f.change_type(p_df, num_feats_p, count_threshold=5)

# Split again after the type change and take the ID columns out of the
# numerical list, so they are not passed to get_params as numerical features.
num_feats_p, cat_feats_p = f.distinct_feats(p_df)
for i in ['SK_ID_PREV', 'SK_ID_CURR']:
    num_feats_p.remove(i)
print(len(num_feats_p), len(cat_feats_p))

# Build the parameter tables for the numerical and categorical features
# (per the original note: skew/kurtosis, missing values and outliers).
par_num_df_start, par_cat_df_start = f.get_params(p_df, num_feats_p,
                                                  cat_feats_p)

############################# FEATURE TREATMENT AND EXTRACTION #########################
# The features are expected to be extracted and grouped at SK_ID_CURR level
# so that they line up with the Loan Application (client) level. Hence we need
# to extract aggregated, client-level information out of this dataset.
# Load the full credit-card balance dataset.
cc_bal_df = pd.read_csv(cc_bal_dataset)

# Keep only the rows whose SK_ID_CURR appears in
# train_bureau_poscash_instpmt_clean_df.
# reset_index returns a new frame, so it is chained into the assignment;
# calling it on its own line and discarding the result leaves the index as is.
c_df = cc_bal_df.loc[
    cc_bal_df['SK_ID_CURR'].isin(
        train_bureau_poscash_instpmt_clean_df.SK_ID_CURR)
].reset_index(drop=True)

# The full frame is no longer needed; drop the reference.
del cc_bal_df

################################ CHANGING THE DATA TYPES ################################

# Separate the categorical and numerical features of c_df.
num_feats_c, cat_feats_c = f.distinct_feats(c_df)
print(len(num_feats_c), len(cat_feats_c))

# Change the datatype of categorical and numerical values.
f.change_type(c_df, num_feats_c, count_threshold=5)

# Split again after the type change and take the ID columns out of the
# numerical list, so they are not passed to get_params as numerical features.
num_feats_c, cat_feats_c = f.distinct_feats(c_df)
for i in ['SK_ID_CURR', 'SK_ID_PREV']:
    num_feats_c.remove(i)
print(len(num_feats_c), len(cat_feats_c))

# Build the parameter tables for the numerical and categorical features
# (per the original note: skew/kurtosis, missing values and outliers).
par_num_df_start, par_cat_df_start = f.get_params(c_df, num_feats_c,
                                                  cat_feats_c)

############################# FEATURE TREATMENT AND EXTRACTION #########################
# The features are expected to be extracted and grouped at SK_ID_CURR level
# so that they line up with the Loan Application (client) level. Hence we need
# to extract aggregated, client-level information out of this dataset.