def baseline_model(self, scoring, SEED, result_col_nm):
    # Run the baseline models on the unbalanced dataset
    models = f.GetBasedModel()
    names, results = f.get_model_performance(self.X_train, self.y_train,
                                             models, SEED, scoring)
    f.PlotBoxR().PlotResult(names, results)
    _score = f.ScoreDataFrame(names, results, result_col_nm)
    return _score
def define_dataset(self):
    # Observe the features with missing values
    f.get_missing_value_feats(self.ds1_df)

    # Separate the categorical and numerical features
    num_feats, cat_feats = self.seperate_cat_num_var(self.ds1_df)

    # Change the datatype of categorical and numerical values
    f.change_type(self.ds1_df, num_feats, count_threshold=5)

    # Re-separate after the type changes, then compute feature parameters
    num_feats, cat_feats = self.seperate_cat_num_var(self.ds1_df)
    par_num_df_start, par_cat_df_start = f.get_params(self.ds1_df,
                                                      num_feats, cat_feats)
    return par_num_df_start, par_cat_df_start
def seperate_cat_num_var(self, df):
    # Separate the categorical and numerical features, then drop any
    # excluded columns (e.g. IDs, target) from the numerical list
    num_feats, cat_feats = f.distinct_feats(df)
    print(len(num_feats), len(cat_feats))
    for col in self.lst:
        if col in num_feats:
            num_feats.remove(col)
    return num_feats, cat_feats
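# f.distinct_feats lives in Model.FunctionLib and is not shown here; a
# minimal sketch of what it presumably does (split column names by dtype),
# offered as an assumption rather than the repo's actual implementation:
def distinct_feats_sketch(df):
    # Numeric columns by dtype; everything else is treated as categorical
    num_feats = df.select_dtypes(include='number').columns.tolist()
    cat_feats = [c for c in df.columns if c not in num_feats]
    return num_feats, cat_feats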
def missing_value_treatment(self, min_threshold):
    # Identify features with NA values and collect them in a list
    missing_value_feats = f.get_missing_value_feats(self.ds1_df)
    print(missing_value_feats)

    # Calculate the missing-value percentage per feature and visualize it
    missing_values_perc_df = f.missing_val_perc(missing_value_feats,
                                                self.ds1_df)
    val = missing_values_perc_df[0].sort_values(ascending=False)
    f.plot_bar(val.index, (50, 10), val)

    # Apply direct imputations: drop the records for attributes that
    # contain less than 5% null values, and drop attributes that contain
    # more than 65% null values.
    self.ds1_df = f.impute_values(self.ds1_df, missing_value_feats,
                                  min_threshold, action=True)
    self.ds1_df = self.ds1_df.reset_index(drop=True)

    # Count rows having more than 30% NaN values
    na_row_cnt = f.get_rowcnt_most_missing_val(self.ds1_df, 30)
    print('No of rows having more than 30% NA values:', na_row_cnt)

    # Re-check which features still contain NA values
    missing_value_feats = f.get_missing_value_feats(self.ds1_df)
    print(missing_value_feats)
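# f.impute_values is defined in Model.FunctionLib; a minimal sketch of the
# threshold rule described in the comment above (drop records for features
# with under ~5% NAs, drop features with over ~65% NAs). The helper name,
# signature and cut-offs here are assumptions, not the actual implementation:
def impute_values_sketch(df, feats, min_threshold=5, max_threshold=65):
    for col in feats:
        na_pct = df[col].isna().mean() * 100
        if na_pct < min_threshold:
            df = df[df[col].notna()]       # few NAs: drop those records
        elif na_pct > max_threshold:
            df = df.drop(columns=[col])    # mostly NA: drop the attribute
    return df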
def _cat_feature_extraction(self, cat_feats_b, b_df, b_agg_df, p_id):
    # For every categorical feature, count the occurrences of each
    # distinct value per ID and attach the counts as new columns
    for feature in cat_feats_b:
        b_agg_cat = b_df.groupby(p_id)[feature].value_counts()
        _unique_items_list = f.get_unique_val_list(b_df, feature)
        for i in _unique_items_list:
            col = f'{feature}_{i}_count'
            b_agg_df[col] = b_agg_cat.xs(key=i, level=1)
            b_agg_df[col] = b_agg_df[col].fillna(value=0)
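# Tiny made-up example of what the loop above produces: one
# '<feature>_<value>_count' column per distinct categorical value, with
# missing ID/value combinations filled with 0. The data is illustrative only.
import pandas as pd
_b = pd.DataFrame({'SK_ID_CURR': [1, 1, 2],
                   'STATUS': ['Active', 'Closed', 'Active']})
_counts = _b.groupby('SK_ID_CURR')['STATUS'].value_counts()
_agg = pd.DataFrame(index=_b['SK_ID_CURR'].unique())
for v in ['Active', 'Closed']:
    _agg[f'STATUS_{v}_count'] = _counts.xs(key=v, level=1)
    _agg[f'STATUS_{v}_count'] = _agg[f'STATUS_{v}_count'].fillna(0)
print(_agg)  # ID 1 -> Active=1, Closed=1; ID 2 -> Active=1, Closed=0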
def define_dataset(self, df=None, ch_type=False, cnt_threshold=2):
    # Observe the features with missing values
    if df is None:
        df = self.ds1_df
    f.get_missing_value_feats(df)

    # Separate the categorical and numerical features
    num_feats, cat_feats = self.seperate_cat_num_var(df)

    # Change the datatype of categorical and numerical values
    if ch_type:
        f.change_type(df, num_feats, count_threshold=cnt_threshold)

    # Build the parameter summaries and join the per-feature statistics
    par_num_df_start, par_cat_df_start = self.define_params(df)
    stats_df = f.feature_stats(df)
    par_num_df_start = par_num_df_start.join(stats_df, how='left')
    par_cat_df_start = par_cat_df_start.join(stats_df, how='left')
    return par_num_df_start, par_cat_df_start
def missing_value_imputations(self):
    ############################ MISSING VALUES ############################
    # The numerical univariate distributions are now symmetrical, with no
    # gap between median and mean, so impute all numerical NAs with the mean.

    # Record missing values for further validation (optional):
    #indicator = MissingIndicator(missing_values=np.nan)
    #mask_missing_values_only = indicator.fit_transform(self.ds1_df)
    #mask_missing_values_only.shape
    num_feats_imp_df, cat_feats_imp_df = self.seperate_cat_num_var(
        self.ds1_df)

    # Numerical missing-value imputations
    self.ds1_df[num_feats_imp_df] = self.ds1_df[num_feats_imp_df].fillna(
        value=self.ds1_df[num_feats_imp_df].mean())

    # The remaining missing values are categorical
    missing_feats_cat = f.get_missing_value_feats(self.ds1_df)
    par_num_df, par_cat_df = f.get_params(self.ds1_df, num_feats_imp_df,
                                          cat_feats_imp_df)

    # Where the mode frequency exceeds 80%, impute NAs with the mode;
    # otherwise use a KNN model to predict the missing values
    mode_threshold = 80
    for feature in missing_feats_cat:
        if par_cat_df.loc[feature]['MODE_PERCENTAGE'] > mode_threshold:
            self.ds1_df[feature].fillna(
                value=par_cat_df.loc[feature]['MODE'], inplace=True)
            print("Method : MODE , Feature : {} , Mode_Percentage : {}".
                  format(feature, par_cat_df.loc[feature]['MODE_PERCENTAGE']))
        else:
            imp_list, score = f.impute_knn_classifier(self.ds1_df, feature, 5)
            self.ds1_df[feature].fillna(value=imp_list, inplace=True)
            print("Method : KNN , Feature : {} , Imputation Accuracy Score : {}"
                  .format(feature, score))
    return par_num_df, par_cat_df
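# f.impute_knn_classifier is part of Model.FunctionLib; a minimal sketch of
# the idea, assuming it trains a KNN on rows where the label is known and
# returns the predicted labels plus a validation accuracy. All names and
# the return contract are assumptions inferred from the call above.
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

def impute_knn_classifier_sketch(df, feature, n_neighbors=5):
    X = df.drop(columns=[feature]).select_dtypes('number').fillna(0)
    known = df[feature].notna()
    X_tr, X_val, y_tr, y_val = train_test_split(
        X[known], df.loc[known, feature], test_size=0.2, random_state=0)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_tr, y_tr)
    score = knn.score(X_val, y_val)  # held-out imputation accuracy
    imputed = pd.Series(knn.predict(X[~known]), index=df.index[~known])
    return imputed, score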
def outlier_treatment(self, normalized_feats):
    # Find the numerical and categorical features of the imputed dataframe
    num_feats_imp_df, cat_feats_imp_df = self.seperate_cat_num_var(
        self.ds1_df)
    other_feats = [x for x in num_feats_imp_df
                   if x not in normalized_feats]

    # Anomalies and data correction:
    # DAYS_EMPLOYED has the abnormal sentinel value 365243, which is set
    # to NaN here to be imputed at a later stage
    feature = 'DAYS_EMPLOYED'
    self.ds1_df.loc[self.ds1_df[feature] == 365243, feature] = np.nan

    # 'XNA' values exist in ORGANIZATION_TYPE; replace them with NaN to be imputed
    self.ds1_df['ORGANIZATION_TYPE'].replace("XNA", np.nan, inplace=True)

    # Log-transform all highly skewed, non-normalized numerical features
    # to pull in outliers, then drop the original columns
    for feature in other_feats:
        print('log_transform', feature)
        self.ds1_df = f.log_transform(self.ds1_df, feature)
        self.ds1_df.drop(columns=[feature], inplace=True)

    # Cap the remaining outliers at the Tukey fences
    num_feats_imp_df, cat_feats_imp_df = self.seperate_cat_num_var(
        self.ds1_df)
    for i in num_feats_imp_df:
        print(i)
        out_l, out_r, fence_min, fence_max = f.TurkyOutliers(self.ds1_df, i,
                                                             drop=False)
        if len(out_l) > 0 or len(out_r) > 0:
            self.ds1_df.loc[out_l, i] = round(fence_min, 3)
            self.ds1_df.loc[out_r, i] = round(fence_max, 3)
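# f.TurkyOutliers is defined in Model.FunctionLib; a minimal sketch of the
# Tukey IQR rule it presumably applies. The return contract (left/right
# outlier indices plus the two fences) is inferred from the call above;
# the implementation itself is an assumption.
def tukey_outliers_sketch(df, col, k=1.5):
    q1, q3 = df[col].quantile([0.25, 0.75])
    iqr = q3 - q1
    lo, hi = q1 - k * iqr, q3 + k * iqr  # Tukey fences
    out_l = df.index[df[col] < lo]       # indices below the lower fence
    out_r = df.index[df[col] > hi]       # indices above the upper fence
    return out_l, out_r, lo, hi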
def create_dataset_remove_corr_feats(self, target_var, filter_val,
                                     corr_threshold, feats_ignore):
    # One-hot encode and keep a view filtered to the target class
    df = self.df.copy()
    x_df_dum = pd.get_dummies(df)
    x_df_Default_dum = x_df_dum[x_df_dum[target_var] == filter_val]
    x_df_dum.columns = x_df_dum.columns.map(f.remove_space)
    x_df_Default_dum.columns = x_df_Default_dum.columns.map(f.remove_space)

    # Find features whose Pearson correlation exceeds the threshold
    _corr_threshold = corr_threshold
    get_highly_corr_feats = f.corr_feats(x_df_dum, x_df_dum.columns,
                                         _corr_threshold)
    get_highly_corr_feats = pd.DataFrame(get_highly_corr_feats)
    print('Highly correlated feature pairs with Pearson r above',
          _corr_threshold)

    # From each correlated pair, mark one feature for removal
    corr_lst = []
    for i in range(len(get_highly_corr_feats.index) - 1):
        lst_feat = get_highly_corr_feats.iloc[i, 0]
        lst_corr_feat = get_highly_corr_feats.iloc[i, 1]
        for j in range(len(lst_corr_feat)):
            _str = f.match_strings(lst_feat, lst_corr_feat[j])
            if len(_str) > f.min_len_col(df.drop(columns=feats_ignore)):
                corr_lst.append(lst_corr_feat[j])
    corr_lst = pd.DataFrame(corr_lst)[0].unique().tolist()
    print(corr_lst)

    # Drop the marked features and store the reduced dataset
    _train_drop_cols_df = x_df_dum.copy()
    _train_drop_cols_df.drop(columns=corr_lst, inplace=True)
    self.dim_red_by_corr_df = _train_drop_cols_df.copy()
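# f.corr_feats comes from Model.FunctionLib; a minimal sketch of the
# pairwise Pearson listing its output appears to be (each feature paired
# with the columns correlated above the threshold). Names and the exact
# output shape are assumptions based on how the result is indexed above.
import pandas as pd

def corr_feats_sketch(df, threshold):
    corr = df.corr(numeric_only=True).abs()
    pairs = []
    for col in corr.columns:
        partners = [c for c in corr.columns
                    if c != col and corr.loc[col, c] > threshold]
        if partners:
            pairs.append((col, partners))
    return pairs  # [(feature, [highly correlated partners]), ...]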
def _num_feature_extraction(self, num_feats_b, b_df, b_agg_df, p_id):
    # Aggregate each numerical feature per ID (mean, median, std, ...)
    for feature in num_feats_b:
        print(feature)
        b_agg_df = f.get_aggregate_features_num(b_df, b_agg_df, feature,
                                                p_id)

        # A NaN std together with mean == median indicates a single record
        # per ID, so set the standard deviation to 0 in that case
        b_agg_df[feature + '_std'] = np.where(
            (b_agg_df[feature + '_std'].isna()) &
            (b_agg_df[feature + '_mean'] == b_agg_df[feature + '_median']),
            0, b_agg_df[feature + '_std'])

    b_agg_df.insert(0, p_id, b_agg_df.index)
    b_agg_df.reset_index(drop=True, inplace=True)
    return b_agg_df
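# f.get_aggregate_features_num is part of Model.FunctionLib; a minimal
# sketch of the per-ID aggregation it presumably performs. The '_mean',
# '_median' and '_std' suffixes mirror the columns referenced above; the
# rest is an assumption.
def aggregate_num_sketch(b_df, b_agg_df, feature, p_id):
    # Aggregate one numeric feature per ID and join onto the running frame
    agg = b_df.groupby(p_id)[feature].agg(['mean', 'median', 'std'])
    agg.columns = [f'{feature}_{stat}' for stat in agg.columns]
    return b_agg_df.join(agg, how='left')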
# Change the datatype of categorical and numerical values (NOT REQUIRED)
#f.change_type(train_df, num_feats, count_threshold=5)

# Get the list of attributes and their properties to start
par_num_start, par_cat_start = baseline_prep.define_params(train_df)

###################### TRAIN AND TEST SET SPLIT ######################
SEED = 7
X = train_df.drop(columns=['TARGET', 'SK_ID_CURR']).copy()
Y = train_df[['TARGET']]
X_train, X_test, y_train, y_test = f.train_test_split(
    X, Y, test_size=0.25, random_state=0, stratify=train_df['TARGET'])

######################### RUN BASE MODELS ###########################
# Run the baseline models on the unbalanced dataset
basedLineF1Score = def_model.baseline_model('f1_weighted', SEED,
                                            'baseline_f1_Score')
def define_params(self, df):
    # Split features by type and compute their starting parameters
    num_feats, cat_feats = self.seperate_cat_num_var(df)
    par_num_df_start, par_cat_df_start = f.get_params(df, num_feats,
                                                      cat_feats)
    return par_num_df_start, par_cat_df_start
def scaled_model_with_CW_factor(self, scoring, SEED, scalar):
    # Scaled models with factorized class weights, scored via cross-validation
    models = f.GetScaledModelwithfactorizedCW(scalar)
    names, results = f.cv_score(self.X_train, self.y_train, models,
                                scoring, SEED)
    _score = f.cv_metrics(names, results)
    return _score
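# f.GetScaledModelwithfactorizedCW lives in Model.FunctionLib; a minimal
# sketch of what such a factory might return: (name, pipeline) pairs that
# couple a scaler with class-weighted classifiers to counter the imbalanced
# TARGET. Model choices and names here are assumptions, not the actual list.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

def get_scaled_cw_models_sketch(scalar='standard', class_weight='balanced'):
    scaler = StandardScaler() if scalar == 'standard' else MinMaxScaler()
    return [
        ('LR', Pipeline([('scaler', scaler),
                         ('clf', LogisticRegression(class_weight=class_weight,
                                                    max_iter=1000))])),
        ('RF', Pipeline([('scaler', scaler),
                         ('clf', RandomForestClassifier(
                             class_weight=class_weight))])),
    ]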
import Model.FunctionLib as f

# Import the working datasets
train_clean_bureau_df = pd.read_csv(train_clean_bureau)
pos_cash_df = pd.read_csv(pos_cash_dataset)

# Keep only POS_CASH records whose SK_ID_CURR appears in the cleaned train set
p_df = pos_cash_df[pos_cash_df['SK_ID_CURR'].isin(
    train_clean_bureau_df.SK_ID_CURR)]
p_df = p_df.reset_index(drop=True)
del pos_cash_df

####################### CHANGING THE DATA TYPES #######################
# Separate the categorical and numerical features
num_feats_p, cat_feats_p = f.distinct_feats(p_df)
print(len(num_feats_p), len(cat_feats_p))

# Change the datatype of categorical and numerical values
f.change_type(p_df, num_feats_p, count_threshold=5)

# Re-separate, then build the parameter dataframe (skew, kurtosis,
# missing values and outliers) excluding the ID columns
num_feats_p, cat_feats_p = f.distinct_feats(p_df)
for i in ['SK_ID_PREV', 'SK_ID_CURR']:
    num_feats_p.remove(i)
print(len(num_feats_p), len(cat_feats_p))
par_num_df_start, par_cat_df_start = f.get_params(p_df, num_feats_p,
                                                  cat_feats_p)
# Import user libraries and data pre-processing scripts
import Model.FunctionLib as f
#import Model.Preprocessing_app_train
#import Model.Preprocessing_app_test
#import Model.Preprocessing_bureau

####################### IMPORT LATEST DATASET #######################
train_df = pd.read_csv(wd + "\\Output\\application_train_bureau_clean.csv")
train_df.drop(train_df.filter(like='Unnamed').columns, axis=1, inplace=True)

# Change the datatype of categorical and numerical values (NOT REQUIRED)
#f.change_type(train_df, num_feats, count_threshold=5)

# Separate the categorical and numerical features
num_feats, cat_feats = f.distinct_feats(train_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')

# Get the list of attributes and their properties to start
par_num_df_start, par_cat_df_start = f.get_params(train_df, num_feats,
                                                  cat_feats)

###################### FEATURE CORRELATIONS ######################
# Find correlated features, including per-category correlations of the
# one-hot encoded dummies. Strong correlations could be used to impute
# NA values via sklearn.impute.IterativeImputer, but that approach is not
# used here since the missing-value columns show no strong correlations.
x_df_dum = pd.get_dummies(train_df)
x_df_Default_dum = x_df_dum[x_df_dum['TARGET'] == 1]
def scaled_model(self, scoring, SEED, result_col_nm, scalar):
    # Run the baseline models on scaled features
    models = f.GetScaledModel(scalar)
    names, results = f.get_model_performance(self.X_train, self.y_train,
                                             models, SEED, scoring)
    _score = f.ScoreDataFrame(names, results, result_col_nm)
    return _score
# Delete the original dataset and work with the sample to free some space
# for processing.
del train_df

# In[4]:
color_list = ['green', 'blue', 'orange', 'yellow', 'red', 'violet', 'cyan']

# In[5]:
# Visualize missing-value counts per feature
val = x_df.isna().sum().sort_values(ascending=False)
f.plot_bar(val.index, (70, 10), val, 30)

# In[6]:
f.get_missing_value_feats(x_df)

# In[7]:
# Separate the categorical and numerical features
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
train_clean_df = pd.read_csv(train_clean)
bureau_df = pd.read_csv(bureau_dataset)
bureau_balance_df = pd.read_csv(bureau_balance_dataset)

# Keep only bureau records tied to the cleaned train set, and only
# balance records tied to those bureau entries
b_df = bureau_df[bureau_df['SK_ID_CURR'].isin(train_clean_df.SK_ID_CURR)]
b_df = b_df.reset_index(drop=True)
bal_df = bureau_balance_df[
    bureau_balance_df['SK_ID_BUREAU'].isin(b_df.SK_ID_BUREAU)]
bal_df = bal_df.reset_index(drop=True)
del bureau_df, bureau_balance_df

####################### CHANGING THE DATA TYPES #######################
# Separate the categorical and numerical features
num_feats_b, cat_feats_b = f.distinct_feats(b_df)
print(len(num_feats_b), len(cat_feats_b))
#num_feats_bal, cat_feats_bal = f.distinct_feats(bal_df)
#print(len(num_feats_bal), len(cat_feats_bal))

# Change the datatype of categorical and numerical values
f.change_type(b_df, num_feats_b, count_threshold=5)

# Re-separate, then build the parameter dataframe (skew, kurtosis,
# missing values and outliers) excluding the ID columns
num_feats_b, cat_feats_b = f.distinct_feats(b_df)
for i in ['SK_ID_BUREAU', 'SK_ID_CURR']:
    num_feats_b.remove(i)
print(len(num_feats_b), len(cat_feats_b))
# Import libraries
import Model.FunctionLib as f

# Import the working dataset
train_df = pd.read_csv(train_dataset)

# Create a 10% sample of the train data to work with
x_df = train_df.sample(frac=0.1, random_state=1).reset_index(drop=True)

# Delete the original dataset and work with the sample to free some space
# for processing.
del train_df

####################### CHANGING THE DATA TYPES #######################
# Observe the features with missing values
f.get_missing_value_feats(x_df)

# Separate the categorical and numerical features
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')

# Change the datatype of categorical and numerical values
f.change_type(x_df, num_feats, count_threshold=5)

# Re-separate after the type changes
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')
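# f.change_type is part of Model.FunctionLib; judging by the count_threshold
# argument, it likely recasts low-cardinality numeric columns as categoricals.
# A minimal sketch under that assumption (not the actual implementation):
def change_type_sketch(df, num_feats, count_threshold=5):
    for col in num_feats:
        if df[col].nunique() <= count_threshold:
            df[col] = df[col].astype('category')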