def seperate_cat_num_var(self, df):
    """Split the features of ``df`` into numerical and categorical lists.

    The split itself is delegated to ``f.distinct_feats``. The sizes of both
    lists are printed as returned by that helper (before any exclusion), and
    every name listed in ``self.lst`` is then removed from the numerical
    list.

    Args:
        df: Data set handed to ``f.distinct_feats`` (presumably a pandas
            DataFrame -- confirm against the helper).

    Returns:
        tuple: ``(num_feats, cat_feats)``, where ``num_feats`` no longer
        contains the names found in ``self.lst``.
    """
    # Separate the categorical and numerical features
    num_feats, cat_feats = f.distinct_feats(df)
    print(len(num_feats), len(cat_feats))
    # Iterate over the excluded names directly instead of indexing by
    # position; names that are not numerical features are simply skipped.
    for excluded in self.lst:
        if excluded in num_feats:
            num_feats.remove(excluded)
    return num_feats, cat_feats
# Count the missing values of every column of x_df and order the counts from
# largest to smallest (assumes x_df is a pandas DataFrame created earlier in
# the notebook -- TODO confirm).
val = x_df.isna().sum().sort_values(ascending=False)
# Plot the per-column counts with the project helper.
# NOTE(review): (70,10) looks like a figure size and 30 like a label/font
# setting -- confirm against f.plot_bar.
f.plot_bar(val.index,(70,10),val,30)


# In[6]:


# Observe the features with missing values (project helper)
f.get_missing_value_feats(x_df)


# In[5]:


# Separate the categorical and numerical features
num_feats,cat_feats = f.distinct_feats(x_df)
print(len(num_feats),len(cat_feats))
# Take 'TARGET' out of the numerical feature list (presumably the label
# column); list.remove raises ValueError if the name is not present.
num_feats.remove('TARGET')


# In[6]:


# Change the datatype of the numerical features.
# NOTE(review): the third positional argument (10) is presumably the
# count threshold of f.change_type -- confirm.
f.change_type(x_df,num_feats,10)


# In[7]:


# Separate the categorical and numerical features again after the type change
num_feats,cat_feats = f.distinct_feats(x_df)
# Import the cleaned training data and the raw bureau datasets
train_clean_df = pd.read_csv(train_clean)
bureau_df = pd.read_csv(bureau_dataset)
bureau_balance_df = pd.read_csv(bureau_balance_dataset)

# Keep only the bureau rows whose SK_ID_CURR appears in the cleaned training
# data. reset_index returns a new DataFrame (it does not modify in place), so
# its result has to be assigned for the index to actually be renumbered.
b_df = bureau_df[
    bureau_df['SK_ID_CURR'].isin(train_clean_df['SK_ID_CURR'])
].reset_index(drop=True)

# Keep only the bureau balance rows that belong to the retained bureau rows.
bal_df = bureau_balance_df[
    bureau_balance_df['SK_ID_BUREAU'].isin(b_df['SK_ID_BUREAU'])
].reset_index(drop=True)

# Drop the references to the unfiltered frames to free memory
del bureau_df, bureau_balance_df

################################ CHANGING THE DATA TYPES ################################

# Separate the categorical and numerical features
num_feats_b,cat_feats_b = f.distinct_feats(b_df)
print(len(num_feats_b),len(cat_feats_b))

# Disabled: the same split for the bureau balance data (bal_df)
#num_feats_bal,cat_feats_bal = f.distinct_feats(bal_df)
#print(len(num_feats_bal),len(cat_feats_bal))

# Change the datatype of categorical and numerical values
# NOTE(review): presumably modifies b_df in place (the return value is not
# used) -- confirm against f.change_type.
f.change_type(b_df,num_feats_b,count_threshold=5)

# Separate the categorical and numerical features again after the type change,
# then take the two ID columns out of the numerical feature list.
# NOTE(review): the original note here mentioned creating a dataframe with
# skew/kurtosis, missing values and outliers; that step is not part of this
# fragment.
num_feats_b,cat_feats_b = f.distinct_feats(b_df)
for i in ['SK_ID_BUREAU','SK_ID_CURR']:
    num_feats_b.remove(i)
print(len(num_feats_b),len(cat_feats_b))
# Import working dataset
train_df = pd.read_csv(train_dataset)

# Work on a reproducible 10% random sample of the training data
# (random_state=1), re-indexed from 0
x_df = train_df.sample(frac=0.1, random_state=1).reset_index(drop=True)

# Delete the original dataset and work with the sample to free some space for processing.
del train_df

################################ CHANGING THE DATA TYPES ################################

# Observe the features with missing values
f.get_missing_value_feats(x_df)

# Separate the categorical and numerical features
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
# Take 'TARGET' and the 'SK_ID_CURR' identifier out of the numerical feature
# list so the type change below does not touch them
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')

# Change the datatype of categorical and numerical values
f.change_type(x_df, num_feats, count_threshold=5)

# Separate the categorical and numerical features again after the type change
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')

# Get the list of attributes and their properties to start
# NOTE(review): returns two objects, presumably one summary table for the
# numerical and one for the categorical features -- confirm against
# f.get_params.
par_num_df_start, par_cat_df_start = f.get_params(x_df, num_feats, cat_feats)
############################# IDENTIFYING MISSING FEATS #########################
# Example no. 5 -- NOTE(review): stray listing separator ("Esempio n. 5" / "0"), not part of the program
# Import User Libraries and Data Pre-Proccessing Scripts
import Model.FunctionLib as f
#import Model.Preprocessing_app_train
#import Model.Preprocessing_app_test
#import Model.Preprocessing_bureau

################################ IMPORT LATEST DATASET ################################
# NOTE(review): the path is built with Windows backslash separators and is
# therefore not portable to other platforms; wd is defined outside this
# fragment.
train_df = pd.read_csv(wd + "\\Output\\application_train_bureau_clean.csv")
# Drop, in place, every column whose name contains 'Unnamed'
train_df.drop(train_df.filter(like='Unnamed').columns, axis=1, inplace=True)

# Change the datatype of categorical and numerical values (NOT REQUIRED)
#f.change_type(train_df,num_feats,count_threshold=5)

# Separate the categorical and numerical features
num_feats, cat_feats = f.distinct_feats(train_df)
print(len(num_feats), len(cat_feats))
# Take 'TARGET' and the 'SK_ID_CURR' identifier out of the numerical feature list
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')

# Get the list of attributes and their properties to start
par_num_df_start, par_cat_df_start = f.get_params(train_df, num_feats,
                                                  cat_feats)

############# FEATURE CORRELATIONS ##########
# Code block to find the correlated features, including the correlations of each individual category
# This can be used to derive/impute NA values when the correlations with other features are strong, using sklearn's IterativeImputer
# Not using this approach for now as there are no strong correlations with missing value columns

# Convert the categorical columns of train_df into dummy/indicator columns
x_df_dum = pd.get_dummies(train_df)
# Keep only the rows where TARGET equals 1 (the name suggests the defaulted
# applicants -- confirm)
x_df_Default_dum = x_df_dum[x_df_dum['TARGET'] == 1]
import Model.FunctionLib as f

# Import working dataset
train_clean_bureau_df = pd.read_csv(train_clean_bureau)
pos_cash_df = pd.read_csv(pos_cash_dataset)

# Keep only the POS cash rows whose SK_ID_CURR appears in the cleaned
# train/bureau data. reset_index returns a new DataFrame (it does not modify
# in place), so its result has to be assigned for the index to actually be
# renumbered.
p_df = pos_cash_df[
    pos_cash_df['SK_ID_CURR'].isin(train_clean_bureau_df['SK_ID_CURR'])
].reset_index(drop=True)

# Drop the reference to the unfiltered frame to free memory
del pos_cash_df

################################ CHANGING THE DATA TYPES ################################

# Separate the categorical and numerical features
num_feats_p, cat_feats_p = f.distinct_feats(p_df)
print(len(num_feats_p), len(cat_feats_p))

# Change the datatype of categorical and numerical values
# NOTE(review): presumably modifies p_df in place (the return value is not
# used) -- confirm against f.change_type.
f.change_type(p_df, num_feats_p, count_threshold=5)

# Separate the categorical and numerical features again after the type change,
# then take the two ID columns out of the numerical feature list
num_feats_p, cat_feats_p = f.distinct_feats(p_df)
for i in ['SK_ID_PREV', 'SK_ID_CURR']:
    num_feats_p.remove(i)
print(len(num_feats_p), len(cat_feats_p))

# Create the starting parameter tables for the remaining features
# NOTE(review): per the original note these hold skew/kurtosis, missing
# values and outliers -- confirm against f.get_params.
par_num_df_start, par_cat_df_start = f.get_params(p_df, num_feats_p,
                                                  cat_feats_p)
# Import working dataset
train_bureau_poscash_instpmt_clean_df = pd.read_csv(
    train_bureau_poscash_instpmt_clean)
cc_bal_df = pd.read_csv(cc_bal_dataset)

# Keep only the credit card balance rows whose SK_ID_CURR appears in the
# cleaned training data. reset_index returns a new DataFrame (it does not
# modify in place), so its result has to be assigned for the index to
# actually be renumbered.
c_df = cc_bal_df[
    cc_bal_df['SK_ID_CURR'].isin(
        train_bureau_poscash_instpmt_clean_df['SK_ID_CURR'])
].reset_index(drop=True)

# Drop the reference to the unfiltered frame to free memory
del cc_bal_df

################################ CHANGING THE DATA TYPES ################################

# Separate the categorical and numerical features
num_feats_c, cat_feats_c = f.distinct_feats(c_df)
print(len(num_feats_c), len(cat_feats_c))

# Change the datatype of categorical and numerical values
# NOTE(review): presumably modifies c_df in place (the return value is not
# used) -- confirm against f.change_type.
f.change_type(c_df, num_feats_c, count_threshold=5)

# Separate the categorical and numerical features again after the type change,
# then take the two ID columns out of the numerical feature list
num_feats_c, cat_feats_c = f.distinct_feats(c_df)
for i in ['SK_ID_CURR', 'SK_ID_PREV']:
    num_feats_c.remove(i)
print(len(num_feats_c), len(cat_feats_c))

# Create the starting parameter tables for the remaining features
# NOTE(review): per the original note these hold skew/kurtosis, missing
# values and outliers -- confirm against f.get_params.
par_num_df_start, par_cat_df_start = f.get_params(c_df, num_feats_c,
                                                  cat_feats_c)