def baseline_model(self, scoring, SEED, result_col_nm):
        # Run the baseline models on the unbalanced dataset
        models = f.GetBasedModel()
        names, results = f.get_model_performance(self.X_train, self.y_train,
                                                 models, SEED, scoring)
        f.PlotBoxR().PlotResult(names, results)

        _score = f.ScoreDataFrame(names, results, result_col_nm)
        return _score
    def define_dataset(self):
        # Observe the features with missing values
        f.get_missing_value_feats(self.ds1_df)

        # Separate the categorical and numerical features
        print(self.ds1_df.shape)
        num_feats, cat_feats = self.seperate_cat_num_var(self.ds1_df)

        # Change the datatype of categorical and numerical values
        f.change_type(self.ds1_df, num_feats, count_threshold=5)

        # Separate the categorical and numerical features
        num_feats, cat_feats = self.seperate_cat_num_var(self.ds1_df)
        par_num_df_start, par_cat_df_start = f.get_params(
            self.ds1_df, num_feats, cat_feats)
        return par_num_df_start, par_cat_df_start
    def seperate_cat_num_var(self, df):
        # Separate the categorical and numerical features
        num_feats, cat_feats = f.distinct_feats(df)
        print(len(num_feats), len(cat_feats))
        for feat in self.lst:
            if feat in num_feats:
                num_feats.remove(feat)
        return num_feats, cat_feats
    def missing_value_treatment(self, min_threshold):
        # Identify na values exist and add them to a list

        missing_value_feats = f.get_missing_value_feats(self.ds1_df)
        print(missing_value_feats)
        # Calculate Missing Value percentage and Visualize
        missing_values_perc_df = f.missing_val_perc(missing_value_feats,
                                                    self.ds1_df)
        val = missing_values_perc_df[0].sort_values(ascending=False)
        f.plot_bar(val.index, (50, 10), val)

        # Apply direct imputations: drop records for attributes that contain
        # less than 5% null values, and drop attributes that contain more
        # than 65% null values.
        self.ds1_df = f.impute_values(self.ds1_df,
                                      missing_value_feats,
                                      min_threshold,
                                      action=True)
        self.ds1_df = self.ds1_df.reset_index(drop=True)

        # Count rows in the dataframe having more than 30% NaN values
        na_row_cnt = f.get_rowcnt_most_missing_val(self.ds1_df, 30)
        print('No of rows having more than 30% NA Values', na_row_cnt)

        # Identify na values exist and add them to a list
        missing_value_feats = f.get_missing_value_feats(self.ds1_df)
        print(missing_value_feats)
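
    # A minimal sketch (hypothetical; not part of the original pipeline) of
    # the thresholding logic that f.impute_values is assumed to implement:
    # drop rows for features with few nulls and drop features that are mostly
    # null. The helper name and defaults here are illustrative assumptions.
    def _impute_by_threshold_sketch(self, df, feats, min_threshold=5,
                                    max_threshold=65):
        for feat in feats:
            null_pct = df[feat].isna().mean() * 100
            if null_pct > max_threshold:
                df = df.drop(columns=[feat])     # too sparse to keep
            elif null_pct < min_threshold:
                df = df.dropna(subset=[feat])    # cheap to drop a few rows
        return df.reset_index(drop=True)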
    def _cat_feature_extraction(self, cat_feats_b, b_df, b_agg_df, p_id):

        for feature in cat_feats_b:
            b_agg_cat = b_df.groupby(p_id)[feature].value_counts()
            _unique_items_list = f.get_unique_val_list(b_df, feature)

            for i in _unique_items_list:
                col = f'{feature}_{i}_count'
                b_agg_df[col] = b_agg_cat.xs(key=i, level=1)
                b_agg_df[col] = b_agg_df[col].fillna(0)
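
    # Note: the .xs loop above can be written as a single pivot. A sketch of
    # the equivalent (hypothetical helper, same `<feature>_<level>_count`
    # naming as above):
    def _cat_counts_sketch(self, b_df, p_id, feature):
        # Pivot the value counts so each category level becomes a column,
        # with absent levels filled as 0.
        return (b_df.groupby(p_id)[feature]
                    .value_counts()
                    .unstack(fill_value=0)
                    .add_prefix(feature + '_')
                    .add_suffix('_count'))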
    def define_dataset(self, df=None, ch_type=False, cnt_threshold=2):
        # Observe the features with missing values
        if df is None:
            df = self.ds1_df
        f.get_missing_value_feats(df)

        # Separate the categorical and numerical features
        num_feats, cat_feats = self.seperate_cat_num_var(df)

        # Change the datatype of categorical and numerical values
        if ch_type:
            f.change_type(df, num_feats, count_threshold=cnt_threshold)

        # Separate the categorical and numerical features
        par_num_df_start, par_cat_df_start = self.define_params(df)
        stats_df = f.feature_stats(df)

        par_num_df_start = par_num_df_start.join(stats_df, how='left')
        par_cat_df_start = par_cat_df_start.join(stats_df, how='left')

        return par_num_df_start, par_cat_df_start
    def missing_value_imputations(self):
        #################################### MISSING VALUES #############################
        # The numerical univariate distributions are now symmetrical, with
        # little difference between median and mean, so impute all numerical
        # missing values with the mean.
        # Record missing values for further validation:
        #indicator = MissingIndicator(missing_values=np.nan)
        #mask_missing_values_only = indicator.fit_transform(self.ds1_df)
        #mask_missing_values_only.shape

        num_feats_imp_df, cat_feats_imp_df = self.seperate_cat_num_var(
            self.ds1_df)
        # Num missing values imputations
        self.ds1_df[num_feats_imp_df] = self.ds1_df[num_feats_imp_df].fillna(
            value=self.ds1_df[num_feats_imp_df].mean())

        # Remaining missing values are categorical.
        missing_feats_cat = f.get_missing_value_feats(self.ds1_df)

        par_num_df, par_cat_df = f.get_params(self.ds1_df, num_feats_imp_df,
                                              cat_feats_imp_df)
        # Categorical values where mode frequency is more than 80% - Impute na with Mode
        # If not then use the KNN model to impute the values

        mode_threshold = 80
        for feature in missing_feats_cat:
            if par_cat_df.loc[feature]['MODE_PERCENTAGE'] > mode_threshold:
                self.ds1_df[feature].fillna(
                    value=par_cat_df.loc[feature]['MODE'], inplace=True)
                print("Method : MODE , Feature : {} , Mode_Percentage : {}".
                      format(feature,
                             par_cat_df.loc[feature]['MODE_PERCENTAGE']))

            else:
                imp_list, score = f.impute_knn_classifier(
                    self.ds1_df, feature, 5)
                self.ds1_df[feature].fillna(value=imp_list, inplace=True)
                print(
                    "Method : KNN , Feature : {} , Imputation Accuracy Score : {}"
                    .format(feature, score))
        return par_num_df, par_cat_df
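
    # Sketch of what f.impute_knn_classifier is assumed to do (hypothetical
    # reimplementation, not the library's code): fit a KNN classifier on rows
    # where `feature` is present, report a CV accuracy, and predict the rest.
    def _impute_knn_sketch(self, df, feature, k=5):
        from sklearn.model_selection import cross_val_score
        from sklearn.neighbors import KNeighborsClassifier

        X = df.select_dtypes('number').drop(columns=[feature], errors='ignore')
        X = X.fillna(X.mean())                   # KNN needs complete inputs
        known = df[feature].notna()
        clf = KNeighborsClassifier(n_neighbors=k)
        score = cross_val_score(clf, X[known], df.loc[known, feature],
                                cv=3, scoring='accuracy').mean()
        clf.fit(X[known], df.loc[known, feature])
        # Index-aligned Series, so fillna(value=...) above slots values in.
        imputed = pd.Series(clf.predict(X[~known]), index=df.index[~known])
        return imputed, score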
    def outlier_treatment(self, normalized_feats):
        # Find the num and cat feats for imp_df

        num_feats_imp_df, cat_feats_imp_df = self.seperate_cat_num_var(
            self.ds1_df)
        other_feats = [
            x for x in num_feats_imp_df if x not in normalized_feats
        ]

        # Anomalies and data correction.
        # DAYS_EMPLOYED has the sentinel value 365243, which is set to NaN
        # here for imputation at a later stage.
        feature = 'DAYS_EMPLOYED'
        self.ds1_df.loc[self.ds1_df[feature] == 365243, feature] = np.nan

        # XNA values exist in the ORGANIZATION_TYPE feature; replace them
        # with np.nan to be imputed.
        self.ds1_df['ORGANIZATION_TYPE'].replace("XNA", np.nan, inplace=True)

        # Log-transform all non-normalized, highly skewed numerical features
        # to reduce the impact of outliers.
        for feature in other_feats:
            print('log_transform', feature)
            self.ds1_df = f.log_transform(self.ds1_df, feature)
            self.ds1_df.drop(columns=[feature], inplace=True)

        #normalized_num_feats_imp_df = [x for x in normalized_feats if x in num_feats_imp_df]
        num_feats_imp_df, cat_feats_imp_df = self.seperate_cat_num_var(
            self.ds1_df)

        for i in num_feats_imp_df:
            print(i)
            out_l, out_r, min_val, max_val = f.TurkyOutliers(self.ds1_df,
                                                             i,
                                                             drop=False)
            if len(out_l) > 0 or len(out_r) > 0:
                self.ds1_df.loc[out_l, i] = round(min_val, 3)
                self.ds1_df.loc[out_r, i] = round(max_val, 3)
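
    # Assumed behavior of f.TurkyOutliers: Tukey/IQR fences. Values below
    # Q1 - 1.5*IQR or above Q3 + 1.5*IQR are flagged, and the loop above
    # clips them to the fence values instead of dropping rows. A sketch:
    def _tukey_outliers_sketch(self, df, col, k=1.5):
        q1, q3 = df[col].quantile([0.25, 0.75])
        iqr = q3 - q1
        lo, hi = q1 - k * iqr, q3 + k * iqr
        out_l = df.index[df[col] < lo]           # below the lower fence
        out_r = df.index[df[col] > hi]           # above the upper fence
        return out_l, out_r, lo, hi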
    def create_dataset_remove_corr_feats(self, target_var, filter_val,
                                         corr_threshold, feats_ignore):
        df = self.df.copy()
        x_df_dum = pd.get_dummies(df)
        x_df_Default_dum = x_df_dum[x_df_dum[target_var] == filter_val]

        x_df_dum.columns = x_df_dum.columns.map(f.remove_space)
        x_df_Default_dum.columns = x_df_Default_dum.columns.map(f.remove_space)

        _corr_threshold = corr_threshold
        get_highly_corr_feats = f.corr_feats(x_df_dum, x_df_dum.columns,
                                             _corr_threshold)

        get_highly_corr_feats = pd.DataFrame(get_highly_corr_feats)
        print('Highly correlated feature pairs with Pearson r above',
              _corr_threshold)

        corr_lst = []
        for i in range(len(get_highly_corr_feats.index)):

            lst_feat = get_highly_corr_feats.iloc[i, 0]
            lst_corr_feat = get_highly_corr_feats.iloc[i, 1]

            for j in range(len(lst_corr_feat)):
                _str = f.match_strings(lst_feat, lst_corr_feat[j])
                if len(_str) > f.min_len_col(df.drop(feats_ignore, axis=1)):
                    corr_lst.append(lst_corr_feat[j])

        corr_lst = pd.DataFrame(corr_lst)[0].unique().tolist()
        print(corr_lst)
        _train_drop_cols_df = x_df_dum.copy()
        _train_drop_cols_df.drop(columns=corr_lst, inplace=True)
        self.dim_red_by_corr_df = _train_drop_cols_df.copy()
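
    # A common, simpler alternative to the string-matching heuristic above
    # (a sketch, not the author's method): drop one feature of each pair
    # whose absolute Pearson correlation exceeds the threshold.
    def _drop_highly_correlated_sketch(self, df, threshold=0.9):
        corr = df.corr(numeric_only=True).abs()
        upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
        to_drop = [c for c in upper.columns if (upper[c] > threshold).any()]
        return df.drop(columns=to_drop)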
    def _num_feature_extraction(self, num_feats_b, b_df, b_agg_df, p_id):

        for feature in num_feats_b:
            print(feature)
            b_agg_df = f.get_aggregate_features_num(b_df, b_agg_df, feature,
                                                    p_id)
            # For single-record groups the std is NaN while mean == median;
            # set those stds to 0 instead of leaving them missing.
            b_agg_df[feature + '_std'] = np.where(
                b_agg_df[feature + '_std'].isna() &
                (b_agg_df[feature + '_mean'] == b_agg_df[feature + '_median']),
                0, b_agg_df[feature + '_std'])
        b_agg_df.insert(0, p_id, b_agg_df.index)
        b_agg_df.reset_index(drop=True, inplace=True)
        return b_agg_df
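
# Assumed shape of f.get_aggregate_features_num (hypothetical sketch): per-ID
# mean/median/std aggregates of one numeric feature, named `<feature>_<stat>`
# and joined onto the aggregate frame.
def aggregate_features_num_sketch(b_df, b_agg_df, feature, p_id):
    agg = b_df.groupby(p_id)[feature].agg(['mean', 'median', 'std'])
    agg.columns = ['{}_{}'.format(feature, stat) for stat in agg.columns]
    return b_agg_df.join(agg, how='left')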
# Change the datatype of categorical and numerical values (NOT REQUIRED)
#f.change_type(train_df,num_feats,count_threshold=5)

# Separate the categorical and numerical features
# Get the list of attributes and their properties to start
par_num_start, par_cat_start = baseline_prep.define_params(train_df)

 
###################### TRAIN AND TEST SET SPLIT #####################################

SEED = 7

X = train_df.drop(['TARGET', 'SK_ID_CURR'], axis=1).copy()
Y = train_df[['TARGET']]

X_train, X_test, y_train, y_test = f.train_test_split(X, Y,
                                                      test_size=0.25,
                                                      random_state=0,
                                                      stratify=train_df['TARGET'])

############# RUN BASE MODELS ##################

# Run the baseline models on the unbalanced dataset 
#models = f.GetBasedModel()
#names,results = f.get_model_performance(X_train, y_train,models, SEED, 'f1_weighted')
#f.PlotBoxR().PlotResult(names,results)
#
#basedLineF1Score = f.ScoreDataFrame(names,results,'baseline_f1_Score')
basedLineF1Score = def_model.baseline_model('f1_weighted', 
                                            SEED, 
                                            'baseline_f1_Score')

#models = f.GetBasedModel()
    def define_params(self, df):
        num_feats, cat_feats = self.seperate_cat_num_var(df)
        par_num_df_start, par_cat_df_start = f.get_params(
            df, num_feats, cat_feats)
        return par_num_df_start, par_cat_df_start

    def scaled_model_with_CW_factor(self, scoring, SEED, scalar):
        models = f.GetScaledModelwithfactorizedCW(scalar)
        names, results = f.cv_score(self.X_train, self.y_train, models,
                                    scoring, SEED)
        _score = f.cv_metrics(names, results)
        return _score
import Model.FunctionLib as f

# Import working dataset
train_clean_bureau_df = pd.read_csv(train_clean_bureau)
pos_cash_df = pd.read_csv(pos_cash_dataset)

p_df = pos_cash_df[pos_cash_df['SK_ID_CURR'].isin(
    train_clean_bureau_df.SK_ID_CURR)]
p_df = p_df.reset_index(drop=True)

del pos_cash_df

################################ CHANGING THE DATA TYPES ################################

# Separate the categorical and numerical features
num_feats_p, cat_feats_p = f.distinct_feats(p_df)
print(len(num_feats_p), len(cat_feats_p))

# Change the datatype of categorical and numerical values
f.change_type(p_df, num_feats_p, count_threshold=5)

# Separate the categorical and numerical features
# Create a dataframe with skew, kurtosis, missing values and outliers for num_feats_imp_df
num_feats_p, cat_feats_p = f.distinct_feats(p_df)
for i in ['SK_ID_PREV', 'SK_ID_CURR']:
    num_feats_p.remove(i)
print(len(num_feats_p), len(cat_feats_p))

par_num_df_start, par_cat_df_start = f.get_params(p_df, num_feats_p,
                                                  cat_feats_p)
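
# Sketch of the per-feature summary f.get_params is assumed to build
# (hypothetical helper): skew, kurtosis and missing-value percentage for the
# numeric features.
def numeric_feature_stats_sketch(df, num_feats):
    return pd.DataFrame({
        'SKEW': df[num_feats].skew(),
        'KURTOSIS': df[num_feats].kurt(),
        'MISSING_PCT': df[num_feats].isna().mean() * 100,
    })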
# Import User Libraries and Data Pre-Proccessing Scripts
import Model.FunctionLib as f
#import Model.Preprocessing_app_train
#import Model.Preprocessing_app_test
#import Model.Preprocessing_bureau

################################ IMPORT LATEST DATASET ################################
train_df = pd.read_csv(wd + "\\Output\\application_train_bureau_clean.csv")
train_df.drop(train_df.filter(like='Unnamed').columns, axis=1, inplace=True)

# Change the datatype of categorical and numerical values (NOT REQUIRED)
#f.change_type(train_df,num_feats,count_threshold=5)

# Separate the categorical and numerical features
num_feats, cat_feats = f.distinct_feats(train_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')

# Get the list of attributes and their properties to start
par_num_df_start, par_cat_df_start = f.get_params(train_df, num_feats,
                                                  cat_feats)

############# FEATURE CORRELATIONS ##########
# Code block to find correlated features, including correlations for each
# dummy-encoded category level.
# These could be used to derive/impute NA values when correlations with other
# features are strong, e.g. via sklearn.impute.IterativeImputer.
# Not using this approach for now, as there are no strong correlations with
# the missing-value columns.

x_df_dum = pd.get_dummies(train_df)
x_df_Default_dum = x_df_dum[x_df_dum['TARGET'] == 1]
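
# If strong correlations existed, sklearn's IterativeImputer (mentioned in the
# note above) could fill NaNs from correlated features. A minimal sketch, not
# applied in this pipeline; x_df_dum's numeric columns serve as inputs:
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer

_imputer = IterativeImputer(max_iter=10, random_state=0)
_num_cols = x_df_dum.select_dtypes('number').columns
x_df_imputed = pd.DataFrame(_imputer.fit_transform(x_df_dum[_num_cols]),
                            columns=_num_cols, index=x_df_dum.index)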
    def scaled_model(self, scoring, SEED, result_col_nm, scalar):
        models = f.GetScaledModel(scalar)
        names, results = f.get_model_performance(self.X_train, self.y_train,
                                                 models, SEED, scoring)
        _score = f.ScoreDataFrame(names, results, result_col_nm)
        return _score
# Delete the original dataset and work with Sample to free some space for processing.
del train_df

color_list = ['green', 'blue', 'orange', 'yellow', 'red', 'violet', 'cyan']


val = x_df.isna().sum().sort_values(ascending=False)
f.plot_bar(val.index, (70, 10), val, 30)


f.get_missing_value_feats(x_df)


# Separate the categorical and numerical features
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
train_clean_df = pd.read_csv(train_clean)
bureau_df = pd.read_csv(bureau_dataset)
bureau_balance_df = pd.read_csv(bureau_balance_dataset)

b_df = bureau_df[bureau_df['SK_ID_CURR'].isin(train_clean_df.SK_ID_CURR)]
b_df = b_df.reset_index(drop=True)

bal_df = bureau_balance_df[bureau_balance_df['SK_ID_BUREAU'].isin(b_df.SK_ID_BUREAU)]
bal_df = bal_df.reset_index(drop=True)

del bureau_df, bureau_balance_df

################################ CHANGING THE DATA TYPES ################################

# Seperate the categorical and numerical features
num_feats_b,cat_feats_b = f.distinct_feats(b_df)
print(len(num_feats_b),len(cat_feats_b))

#num_feats_bal,cat_feats_bal = f.distinct_feats(bal_df)
#print(len(num_feats_bal),len(cat_feats_bal))

# Change the datatype of categorical and numerical values
f.change_type(b_df, num_feats_b, count_threshold=5)

# Separate the categorical and numerical features
# Create a dataframe with skew, kurtosis, missing values and outliers for num_feats_imp_df
num_feats_b, cat_feats_b = f.distinct_feats(b_df)
for i in ['SK_ID_BUREAU', 'SK_ID_CURR']:
    num_feats_b.remove(i)
print(len(num_feats_b), len(cat_feats_b))
# Import Libraries
import Model.FunctionLib as f

# Import working dataset
train_df = pd.read_csv(train_dataset)

# Create a new dataset same as train data
x_df = train_df.sample(frac=0.1, random_state=1).reset_index(drop=True)

# Delete the original dataset and work with Sample to free some space for processing.
del train_df

################################ CHANGING THE DATA TYPES ################################

# Observe the features with missing values
f.get_missing_value_feats(x_df)

# Separate the categorical and numerical features
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')

# Change the datatype of categorical and numerical values
f.change_type(x_df, num_feats, count_threshold=5)

# Separate the categorical and numerical features
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')