def missing_value_treatment(self, min_threshold):
    """Analyse and treat missing values in ``self.ds1_df``.

    Prints the features containing NAs, plots the per-feature
    missing-value percentage, applies direct imputation/dropping via
    ``f.impute_values`` controlled by ``min_threshold``, and reports how
    many rows still carry more than 30% NaN values afterwards.

    Parameters
    ----------
    min_threshold :
        Threshold forwarded to ``f.impute_values``; presumably the
        null-percentage cut-off used there — TODO confirm in FunctionLib.

    Returns
    -------
    None. ``self.ds1_df`` is replaced/mutated in place.
    """
    # Identify features that contain NA values.
    missing_value_feats = f.get_missing_value_feats(self.ds1_df)
    print(missing_value_feats)

    # Calculate missing-value percentage per feature and visualize it.
    missing_values_perc_df = f.missing_val_perc(missing_value_feats, self.ds1_df)
    val = missing_values_perc_df[0].sort_values(ascending=False)
    f.plot_bar(val.index, (50, 10), val)

    # Direct imputations: e.g. drop records for attributes with <5% nulls,
    # or drop attributes with >65% nulls (logic lives in f.impute_values).
    self.ds1_df = f.impute_values(self.ds1_df, missing_value_feats,
                                  min_threshold, action=True)
    # BUG FIX: reset_index returns a new frame; the original call discarded
    # the result, so the index was never actually reset.
    self.ds1_df = self.ds1_df.reset_index(drop=True)

    # Count rows having more than 30% NaN values.
    na_row_cnt = f.get_rowcnt_most_missing_val(self.ds1_df, 30)
    print('No of rows having more than 30% NA Values', na_row_cnt)

    # Re-identify features that still contain NA values after treatment.
    missing_value_feats = f.get_missing_value_feats(self.ds1_df)
    print(missing_value_feats)
def define_dataset(self):
    """Profile ``self.ds1_df`` and return its parameter summaries.

    Reports features with missing values, splits features into
    numerical/categorical, lets ``f.change_type`` adjust dtypes
    (count_threshold=5), re-splits after the dtype pass, and builds the
    per-feature parameter frames via ``f.get_params``.

    Returns
    -------
    tuple
        ``(par_num_df_start, par_cat_df_start)`` — numerical and
        categorical parameter DataFrames.

    NOTE(review): a later definition with the same name exists in this
    module and will shadow this one at class-creation time — verify which
    variant is intended to survive.
    """
    # Observe the features with missing values (printed/returned by helper).
    f.get_missing_value_feats(self.ds1_df)

    # Separate the categorical and numerical features.
    # (Removed dead-code expression `self.ds1_df.shape`, whose result
    # was discarded.)
    num_feats, cat_feats = self.seperate_cat_num_var(self.ds1_df)

    # Change the datatype of categorical and numerical values.
    f.change_type(self.ds1_df, num_feats, count_threshold=5)

    # Re-separate after dtype changes may have reclassified features.
    num_feats, cat_feats = self.seperate_cat_num_var(self.ds1_df)

    par_num_df_start, par_cat_df_start = f.get_params(
        self.ds1_df, num_feats, cat_feats)
    return par_num_df_start, par_cat_df_start
def define_dataset(self, df=None, ch_type=False, cnt_threshold=2):
    """Profile a dataset and return parameter summaries joined with stats.

    Parameters
    ----------
    df : DataFrame, optional
        Dataset to profile; defaults to ``self.ds1_df``.
    ch_type : bool, optional
        When True, let ``f.change_type`` adjust dtypes using
        ``cnt_threshold``.
    cnt_threshold : int, optional
        ``count_threshold`` forwarded to ``f.change_type``.

    Returns
    -------
    tuple
        ``(par_num_df_start, par_cat_df_start)`` — numerical and
        categorical parameter frames, each left-joined with
        ``f.feature_stats`` output.
    """
    # BUG FIX: the original used `if df == None`, which on a DataFrame is
    # an element-wise comparison and raises ValueError inside `if`
    # (truth value of a DataFrame is ambiguous). Identity check is the
    # correct and idiomatic test.
    if df is None:
        df = self.ds1_df

    # Observe the features with missing values.
    f.get_missing_value_feats(df)

    # Separate the categorical and numerical features.
    num_feats, cat_feats = self.seperate_cat_num_var(df)

    # Optionally change the datatype of categorical and numerical values.
    if ch_type:
        f.change_type(df, num_feats, count_threshold=cnt_threshold)

    # Build parameter frames and enrich them with per-feature statistics.
    par_num_df_start, par_cat_df_start = self.define_params(df)
    stats_df = f.feature_stats(df)
    par_num_df_start = par_num_df_start.join(stats_df, how='left')
    par_cat_df_start = par_cat_df_start.join(stats_df, how='left')
    return par_num_df_start, par_cat_df_start
def missing_value_imputations(self):
    """Impute all remaining missing values in ``self.ds1_df``.

    Numerical features are filled with their column mean (the comment in
    the original notes the distributions are symmetric at this stage, so
    mean ≈ median). Categorical features are filled with the mode when
    the mode covers more than 80% of values, otherwise with a KNN-based
    prediction from ``f.impute_knn_classifier``.

    Returns
    -------
    tuple
        ``(par_num_df, par_cat_df)`` parameter frames computed *before*
        the categorical imputation loop.
    """
    #################################### MISSING VALUES #############################
    num_feats_imp_df, cat_feats_imp_df = self.seperate_cat_num_var(
        self.ds1_df)

    # Numerical missing values: impute with the column mean.
    self.ds1_df[num_feats_imp_df] = self.ds1_df[num_feats_imp_df].fillna(
        value=self.ds1_df[num_feats_imp_df].mean())

    # Remaining missing values are categorical.
    missing_feats_cat = f.get_missing_value_feats(self.ds1_df)
    par_num_df, par_cat_df = f.get_params(self.ds1_df, num_feats_imp_df,
                                          cat_feats_imp_df)

    # Categorical values where mode frequency is more than 80%: impute NA
    # with the mode; otherwise use a KNN model to impute the values.
    mode_threshold = 80
    for feature in missing_feats_cat:
        if par_cat_df.loc[feature]['MODE_PERCENTAGE'] > mode_threshold:
            # FIX: `df[col].fillna(..., inplace=True)` operates on a
            # column selection (chained-assignment pattern) — unreliable
            # under pandas copy-on-write; assign the result back instead.
            self.ds1_df[feature] = self.ds1_df[feature].fillna(
                value=par_cat_df.loc[feature]['MODE'])
            print("Method : MODE , Feature : {} , Mode_Percentage : {}".
                  format(feature, par_cat_df.loc[feature]['MODE_PERCENTAGE']))
        else:
            imp_list, score = f.impute_knn_classifier(
                self.ds1_df, feature, 5)
            self.ds1_df[feature] = self.ds1_df[feature].fillna(value=imp_list)
            print(
                "Method : KNN , Feature : {} , Imputation Accuracy Score : {}"
                .format(feature, score))
    return par_num_df, par_cat_df
# Palette used by plotting helpers downstream.
color_list = ['green', 'blue', 'orange', 'yellow', 'red', 'violet', 'cyan']

# In[5]:

# Per-feature NA counts, largest first, rendered as a bar chart.
na_counts = x_df.isna().sum()
val = na_counts.sort_values(ascending=False)
f.plot_bar(val.index, (70, 10), val, 30)

# In[6]:

# List the features that contain missing values.
f.get_missing_value_feats(x_df)

# In[5]:

# Split features into numerical vs categorical; TARGET is the label and
# must not be treated as a plain numeric feature.
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')

# In[6]:

# Let the helper re-type low-cardinality numeric columns (threshold 10).
f.change_type(x_df, num_feats, 10)
# Import Libraries
# FIX: pd.read_csv below requires pandas, which was never imported in
# this chunk's import section.
import pandas as pd

import Model.FunctionLib as f

# Import working dataset.
# NOTE(review): `train_dataset` (the CSV path) is not defined in this
# chunk — presumably assigned earlier in the file; verify.
train_df = pd.read_csv(train_dataset)

# Create a new dataset same as train data: work on a 10% sample
# (fixed random_state for reproducibility) with a fresh index.
x_df = train_df.sample(frac=0.1, random_state=1).reset_index(drop=True)

# Delete the original dataset and work with the sample to free some
# space for processing.
del train_df

################################ CHANGING THE DATA TYPES ################################

# Observe the features with missing values.
f.get_missing_value_feats(x_df)

# Separate the categorical and numerical features; TARGET (label) and
# SK_ID_CURR (row identifier) are excluded from the numeric feature set.
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')

# Change the datatype of categorical and numerical values.
f.change_type(x_df, num_feats, count_threshold=5)

# Re-separate after the dtype pass — change_type may have reclassified
# some features — and exclude the same non-feature columns again.
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')