Example 1
    def define_dataset(self):
        # Observe the features with missing values
        f.get_missing_value_feats(self.ds1_df)

        # Separate the categorical and numerical features
        num_feats, cat_feats = self.seperate_cat_num_var(self.ds1_df)

        # Change the datatype of categorical and numerical values
        f.change_type(self.ds1_df, num_feats, count_threshold=5)

        # Re-separate the categorical and numerical features after the type change
        num_feats, cat_feats = self.seperate_cat_num_var(self.ds1_df)
        par_num_df_start, par_cat_df_start = f.get_params(
            self.ds1_df, num_feats, cat_feats)
        return par_num_df_start, par_cat_df_start
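
    # NOTE: seperate_cat_num_var is used above but not defined in this snippet.
    # A minimal sketch, assuming it splits the columns by pandas dtype (the
    # actual implementation may differ):
    def seperate_cat_num_var(self, df):
        # 'object'/'category' dtypes are categorical, everything else numerical
        cat_feats = df.select_dtypes(include=['object', 'category']).columns.tolist()
        num_feats = df.select_dtypes(exclude=['object', 'category']).columns.tolist()
        return num_feats, cat_feats
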
    def missing_value_imputations(self):
        #################################### MISSING VALUES #############################
        # The numerical univariate distributions are now symmetrical, with no
        # difference between median and mean, so impute all numerical missing
        # values with the mean.
        # Record missing values for further validations:
        #indicator = MissingIndicator(missing_values=np.nan)
        #mask_missing_values_only = indicator.fit_transform(self.ds1_df)
        #mask_missing_values_only.shape

        num_feats_imp_df, cat_feats_imp_df = self.seperate_cat_num_var(
            self.ds1_df)
        # Num missing values imputations
        self.ds1_df[num_feats_imp_df] = self.ds1_df[num_feats_imp_df].fillna(
            value=self.ds1_df[num_feats_imp_df].mean())

        # The remaining missing values are categorical.
        missing_feats_cat = f.get_missing_value_feats(self.ds1_df)

        par_num_df, par_cat_df = f.get_params(self.ds1_df, num_feats_imp_df,
                                              cat_feats_imp_df)
        # Categorical features where the mode frequency exceeds 80%: impute NAs with the mode.
        # Otherwise, use a KNN model to impute the values.

        mode_threshold = 80
        for feature in missing_feats_cat:
            if par_cat_df.loc[feature]['MODE_PERCENTAGE'] > mode_threshold:
                self.ds1_df[feature] = self.ds1_df[feature].fillna(
                    value=par_cat_df.loc[feature]['MODE'])
                print("Method : MODE , Feature : {} , Mode_Percentage : {}".
                      format(feature,
                             par_cat_df.loc[feature]['MODE_PERCENTAGE']))

            else:
                imp_list, score = f.impute_knn_classifier(
                    self.ds1_df, feature, 5)
                self.ds1_df[feature] = self.ds1_df[feature].fillna(value=imp_list)
                print(
                    "Method : KNN , Feature : {} , Imputation Accuracy Score : {}"
                    .format(feature, score))
        return par_num_df, par_cat_df

    def define_params(self, df):
        num_feats, cat_feats = self.seperate_cat_num_var(df)
        par_num_df_start, par_cat_df_start = f.get_params(
            df, num_feats, cat_feats)
        return par_num_df_start, par_cat_df_start
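
# f.impute_knn_classifier (used in missing_value_imputations above) is not
# shown in these snippets. A plausible sketch, assuming it fits a
# KNeighborsClassifier on the complete rows and predicts the missing
# categories; the predictor choice, scoring and return contract are
# assumptions:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

def impute_knn_classifier(df, feature, n_neighbors):
    # Use fully populated numeric columns as predictors
    predictors = df.select_dtypes(include='number').dropna(axis=1).columns
    known = df[df[feature].notna()]
    unknown = df[df[feature].isna()]
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(known[predictors], known[feature])
    # Mean CV accuracy on the known rows as a rough imputation quality score
    score = cross_val_score(knn, known[predictors], known[feature], cv=3).mean()
    # Predictions indexed like the missing rows, so fillna() can align on index
    imp_list = pd.Series(knn.predict(unknown[predictors]), index=unknown.index)
    return imp_list, score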
print(len(num_feats_b), len(cat_feats_b))

#num_feats_bal,cat_feats_bal = f.distinct_feats(bal_df)
#print(len(num_feats_bal),len(cat_feats_bal))

# Change the datatype of categorical and numerical values
f.change_type(b_df, num_feats_b, count_threshold=5)
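
# f.change_type is not defined here; presumably it casts numeric columns with
# at most count_threshold distinct values to 'category'. A sketch under that
# assumption:
def change_type(df, num_feats, count_threshold=5):
    # Treat low-cardinality numeric columns as categorical, in place
    for col in num_feats:
        if df[col].nunique() <= count_threshold:
            df[col] = df[col].astype('category')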

# Re-separate the categorical and numerical features after the type change
# Create a dataframe with skew, kurtosis, missing values and outliers for the numerical features
num_feats_b, cat_feats_b = f.distinct_feats(b_df)
for i in ['SK_ID_BUREAU','SK_ID_CURR']:
    num_feats_b.remove(i)
print(len(num_feats_b), len(cat_feats_b))

par_num_df_start, par_cat_df_start = f.get_params(b_df, num_feats_b, cat_feats_b)
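
# f.get_params is used throughout but not defined in these snippets. A rough
# sketch, assuming it tabulates skew, kurtosis, missing counts and IQR
# outliers per numerical feature, plus mode statistics per categorical
# feature (the MODE/MODE_PERCENTAGE column names are taken from the usage in
# missing_value_imputations; everything else is an assumption):
def get_params(df, num_feats, cat_feats):
    num_rows, cat_rows = [], []
    for col in num_feats:
        s = df[col]
        q1, q3 = s.quantile(0.25), s.quantile(0.75)
        iqr = q3 - q1
        outliers = ((s < q1 - 1.5 * iqr) | (s > q3 + 1.5 * iqr)).sum()
        num_rows.append({'FEATURE': col, 'SKEW': s.skew(), 'KURT': s.kurt(),
                         'MISSING': s.isna().sum(), 'OUTLIERS': outliers})
    for col in cat_feats:
        s = df[col]
        mode = s.mode().iloc[0] if not s.mode().empty else None
        mode_pct = 100.0 * (s == mode).sum() / max(s.notna().sum(), 1)
        cat_rows.append({'FEATURE': col, 'MODE': mode,
                         'MODE_PERCENTAGE': mode_pct,
                         'MISSING': s.isna().sum()})
    par_num_df = pd.DataFrame(num_rows).set_index('FEATURE')
    par_cat_df = pd.DataFrame(cat_rows).set_index('FEATURE')
    return par_num_df, par_cat_df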

############################# FEATURE TREATMENT AND EXTRACTION #########################
# The features need to be extracted and grouped at the SK_ID_CURR level to
# synchronise with the loan application (client) level, so we extract
# aggregated information at the client level from this dataset.
# This means treatment of individual columns should not be generalised, as we
# might lose information. Hence we extract aggregated features first and then
# apply data corrections (missing values, outliers, etc.) at the aggregate
# level, based on the features qualitatively.

b_agg_df = pd.DataFrame() 
# Create an object for aggregation at the SK_ID_CURR level
#b_agg = b_df.groupby('SK_ID_CURR')

# Aggregating bureau data at the customer (SK_ID_CURR) level
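
# A minimal sketch of the aggregation itself, following the commented-out
# groupby above (the choice of statistics is an illustrative assumption):
b_agg = b_df.groupby('SK_ID_CURR')
b_agg_df = b_agg[num_feats_b].agg(['mean', 'max', 'sum'])
# Flatten the MultiIndex columns, e.g. ('AMT_X', 'mean') -> 'AMT_X_mean'
b_agg_df.columns = ['_'.join(col) for col in b_agg_df.columns]
b_agg_df['BUREAU_LOAN_COUNT'] = b_agg.size()
b_agg_df = b_agg_df.reset_index()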
# Separate the categorical and numerical features
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')

# Change the datatype of categorical and numerical values
f.change_type(x_df, num_feats, count_threshold=5)

# Re-separate the categorical and numerical features after the type change
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')

par_num_df_start, par_cat_df_start = f.get_params(x_df, num_feats, cat_feats)
############################# IDENTIFYING MISSING FEATS #########################

# Identify the features with NA values and collect them in a list
missing_value_feats = f.get_missing_value_feats(x_df)
print(missing_value_feats)

# Calculate Missing Value percentage and Visualize
missing_values_perc_df = f.missing_val_perc(missing_value_feats, x_df)
val = missing_values_perc_df[0].sort_values(ascending=False)
f.plot_bar(val.index, (50, 10), val)
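
# f.missing_val_perc is assumed to return a DataFrame whose column 0 holds the
# missing-value percentage per feature (inferred from the
# missing_values_perc_df[0] usage above); a sketch under that assumption:
def missing_val_perc(feats, df):
    # Percentage of missing rows per feature; the unnamed Series becomes
    # column 0 of the resulting DataFrame
    perc = df[feats].isna().sum() * 100.0 / len(df)
    return pd.DataFrame(perc)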

#################### REMOVING THE VALUES DIRECTLY ##########################
# Apply direct imputations: drop records for attributes that contain less than
# 5% null values, and drop attributes that contain more than 65% null values.
imp_df = f.impute_values(x_df, missing_value_feats, 65, action=True)
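
# f.impute_values applies the 5% / 65% rule from the comment above. A possible
# sketch, treating action=True as "apply the changes in place" (an
# assumption):
def impute_values(df, missing_feats, drop_threshold, action=False):
    for col in missing_feats:
        perc = df[col].isna().sum() * 100.0 / len(df)
        if perc > drop_threshold and action:
            df.drop(columns=col, inplace=True)     # too sparse: drop the column
        elif perc < 5 and action:
            df.dropna(subset=[col], inplace=True)  # few NAs: drop the records
    return df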
Example 6
################################ IMPORT LATEST DATASET ################################
train_df = pd.read_csv(wd + "\\Output\\application_train_bureau_clean.csv")
train_df.drop(train_df.filter(like='Unnamed').columns, axis=1, inplace=True)

# Change the datatype of categorical and numerical values (NOT REQUIRED)
#f.change_type(train_df,num_feats,count_threshold=5)

# Separate the categorical and numerical features
num_feats, cat_feats = f.distinct_feats(train_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')

# Get the list of attributes and their properties to start
par_num_df_start, par_cat_df_start = f.get_params(train_df, num_feats,
                                                  cat_feats)

############# FEATURE CORRELATIONS ##########
# Code block to find correlated features, including per-category correlations.
# These could be used to derive/impute NA values with sklearn.impute.IterativeImputer
# when correlations with other features are strong.
# Not using this approach for now, as there are no strong correlations with the
# missing-value columns.
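
# For reference, the IterativeImputer approach would look roughly like this
# (not applied in this pipeline, per the comment above):
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

num_cols = train_df.select_dtypes(include='number').columns
it_imp = IterativeImputer(max_iter=10, random_state=0)
imputed = pd.DataFrame(it_imp.fit_transform(train_df[num_cols]),
                       columns=num_cols, index=train_df.index)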

x_df_dum = pd.get_dummies(train_df)
x_df_Default_dum = x_df_dum[x_df_dum['TARGET'] == 1]

x_df_dum.columns = x_df_dum.columns.map(f.remove_space)
x_df_Default_dum.columns = x_df_Default_dum.columns.map(f.remove_space)

# General correlations vs. correlations in the case of default.
x_corr_default = x_df_Default_dum.corr()
x_corr = x_df_dum.corr()
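
# Quick inspection of the strongest relationships: features most correlated
# with TARGET, by absolute value (illustrative, not part of the original flow):
top_corr = x_corr['TARGET'].drop('TARGET').abs().sort_values(ascending=False)
print(top_corr.head(20))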
# Separate the categorical and numerical features
num_feats_p, cat_feats_p = f.distinct_feats(p_df)
print(len(num_feats_p), len(cat_feats_p))

# Change the datatype of categorical and numerical values
f.change_type(p_df, num_feats_p, count_threshold=5)

# Re-separate the categorical and numerical features after the type change
# Create a dataframe with skew, kurtosis, missing values and outliers for the numerical features
num_feats_p, cat_feats_p = f.distinct_feats(p_df)
for i in ['SK_ID_PREV', 'SK_ID_CURR']:
    num_feats_p.remove(i)
print(len(num_feats_p), len(cat_feats_p))

par_num_df_start, par_cat_df_start = f.get_params(p_df, num_feats_p,
                                                  cat_feats_p)

############################# FEATURE TREATMENT AND EXTRACTION #########################
# The features need to be extracted and grouped at the SK_ID_CURR level to
# synchronise with the loan application (client) level, so we extract
# aggregated information at the client level from this dataset.
# This means treatment of individual columns should not be generalised, as we
# might lose information. Hence we extract aggregated features first and then
# apply data corrections (missing values, outliers, etc.) at the aggregate
# level, based on the features qualitatively.

p_agg_df = pd.DataFrame()
# Create an object for aggregation at the SK_ID_CURR level
#p_agg = p_df.groupby('SK_ID_CURR')

# Aggregating this dataset at the customer (SK_ID_CURR) level
# Separate the categorical and numerical features
num_feats_c, cat_feats_c = f.distinct_feats(c_df)
print(len(num_feats_c), len(cat_feats_c))

# Change the datatype of categorical and numerical values
f.change_type(c_df, num_feats_c, count_threshold=5)

# Re-separate the categorical and numerical features after the type change
# Create a dataframe with skew, kurtosis, missing values and outliers for the numerical features
num_feats_c, cat_feats_c = f.distinct_feats(c_df)
for i in ['SK_ID_CURR', 'SK_ID_PREV']:
    num_feats_c.remove(i)
print(len(num_feats_c), len(cat_feats_c))

par_num_df_start, par_cat_df_start = f.get_params(c_df, num_feats_c,
                                                  cat_feats_c)

############################# FEATURE TREATMENT AND EXTRACTION #########################
# The features need to be extracted and grouped at the SK_ID_CURR level to
# synchronise with the loan application (client) level, so we extract
# aggregated information at the client level from this dataset.
# This means treatment of individual columns should not be generalised, as we
# might lose information. Hence we extract aggregated features first and then
# apply data corrections (missing values, outliers, etc.) at the aggregate
# level, based on the features qualitatively.

c_agg_df = pd.DataFrame()
# Create an object for aggregation at the SK_ID_CURR level
#c_agg = c_df.groupby('SK_ID_CURR')

# Aggregating this dataset at the customer (SK_ID_CURR) level