def missing_value_treatment(self, min_threshold):
        # Identify the features containing NaN values and collect them in a list

        missing_value_feats = f.get_missing_value_feats(self.ds1_df)
        print(missing_value_feats)
        # Calculate Missing Value percentage and Visualize
        missing_values_perc_df = f.missing_val_perc(missing_value_feats,
                                                    self.ds1_df)
        val = missing_values_perc_df[0].sort_values(ascending=False)
        f.plot_bar(val.index, (50, 10), val)

        # Apply direct imputations: drop the records for attributes containing
        # less than 5% null values, and drop attributes containing more than
        # 65% null values.
        self.ds1_df = f.impute_values(self.ds1_df,
                                      missing_value_feats,
                                      min_threshold,
                                      action=True)
        self.ds1_df = self.ds1_df.reset_index(drop=True)

        # Count the rows in the dataframe having more than 30% NaN values
        na_row_cnt = f.get_rowcnt_most_missing_val(self.ds1_df, 30)
        print('No of rows having more than 30% NA Values', na_row_cnt)

        # Re-check which features still contain NaN values after imputation
        missing_value_feats = f.get_missing_value_feats(self.ds1_df)
        print(missing_value_feats)
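
# get_missing_value_feats and missing_val_perc come from the project's
# Model.FunctionLib module, which is not shown in this snippet. A minimal
# sketch of what they might look like, assuming the first returns the list of
# columns containing NaNs and the second returns the missing percentage per
# feature as a single-column DataFrame (hence the `[0]` access above):

import pandas as pd

def get_missing_value_feats(df):
    # Columns with at least one NaN value
    return df.columns[df.isna().any()].tolist()

def missing_val_perc(feats, df):
    # Percentage of missing values per feature, as a one-column DataFrame
    return pd.DataFrame(df[feats].isna().sum() / len(df) * 100)
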
    def define_dataset(self):
        # Observe the features with missing values
        f.get_missing_value_feats(self.ds1_df)

        # Separate the categorical and numerical features
        print(self.ds1_df.shape)
        num_feats, cat_feats = self.seperate_cat_num_var(self.ds1_df)

        # Change the datatype of categorical and numerical values
        f.change_type(self.ds1_df, num_feats, count_threshold=5)

        # Re-separate the categorical and numerical features after the type change
        num_feats, cat_feats = self.seperate_cat_num_var(self.ds1_df)
        par_num_df_start, par_cat_df_start = f.get_params(
            self.ds1_df, num_feats, cat_feats)
        return par_num_df_start, par_cat_df_start
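
# get_params is another FunctionLib helper that is not defined here. Judging by
# the later use of par_cat_df.loc[feature]['MODE'] and ['MODE_PERCENTAGE'], it
# likely returns one summary frame per group of features, indexed by feature
# name. A hedged sketch under that assumption:

import pandas as pd

def get_params(df, num_feats, cat_feats):
    # Numerical features: basic central-tendency statistics
    par_num_df = pd.DataFrame({
        'MEAN': df[num_feats].mean(),
        'MEDIAN': df[num_feats].median(),
    })
    # Categorical features: the mode and how dominant it is (in percent)
    par_cat_df = pd.DataFrame({
        'MODE': {c: df[c].mode().iloc[0] for c in cat_feats},
        'MODE_PERCENTAGE': {c: df[c].value_counts(normalize=True).iloc[0] * 100
                            for c in cat_feats},
    })
    return par_num_df, par_cat_df
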
    def define_dataset(self, df=None, ch_type=False, cnt_threshold=2):
        # Observe the features with missing values
        if df is None:
            df = self.ds1_df
        f.get_missing_value_feats(df)

        # Separate the categorical and numerical features
        num_feats, cat_feats = self.seperate_cat_num_var(df)

        # Change the datatype of categorical and numerical values
        if ch_type:
            f.change_type(df, num_feats, count_threshold=cnt_threshold)

        # Compute the parameter summaries for the numerical and categorical features
        par_num_df_start, par_cat_df_start = self.define_params(df)
        stats_df = f.feature_stats(df)

        par_num_df_start = par_num_df_start.join(stats_df, how='left')
        par_cat_df_start = par_cat_df_start.join(stats_df, how='left')

        return par_num_df_start, par_cat_df_start
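
# feature_stats is also part of FunctionLib. Since its result is joined onto the
# parameter frames by feature name, it presumably returns one row of summary
# statistics per column. A minimal sketch under that assumption:

import pandas as pd

def feature_stats(df):
    return pd.DataFrame({
        'COUNT': df.count(),             # non-null values per feature
        'NULL_COUNT': df.isna().sum(),   # missing values per feature
        'UNIQUE_COUNT': df.nunique(),    # distinct values per feature
    })
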
    def missing_value_imputations(self):
        #################################### MISSING VALUES #############################
        # The numerical univariate distributions are now symmetrical, with no
        # difference between median and mean, so impute all numerical missing
        # values with the mean.
        # Record missing values for further validations:
        #indicator = MissingIndicator(missing_values=np.nan)
        #mask_missing_values_only = indicator.fit_transform(self.ds1_df)
        #mask_missing_values_only.shape

        num_feats_imp_df, cat_feats_imp_df = self.seperate_cat_num_var(
            self.ds1_df)
        # Num missing values imputations
        self.ds1_df[num_feats_imp_df] = self.ds1_df[num_feats_imp_df].fillna(
            value=self.ds1_df[num_feats_imp_df].mean())

        # The remaining missing values are categorical.
        missing_feats_cat = f.get_missing_value_feats(self.ds1_df)

        par_num_df, par_cat_df = f.get_params(self.ds1_df, num_feats_imp_df,
                                              cat_feats_imp_df)
        # For categorical features where the mode frequency is above 80%, impute
        # NaN with the mode; otherwise use a KNN model to impute the values.

        mode_threshold = 80
        for feature in missing_feats_cat:
            if par_cat_df.loc[feature]['MODE_PERCENTAGE'] > mode_threshold:
                self.ds1_df[feature] = self.ds1_df[feature].fillna(
                    value=par_cat_df.loc[feature]['MODE'])
                print("Method : MODE , Feature : {} , Mode_Percentage : {}".
                      format(feature,
                             par_cat_df.loc[feature]['MODE_PERCENTAGE']))

            else:
                imp_list, score = f.impute_knn_classifier(
                    self.ds1_df, feature, 5)
                self.ds1_df[feature] = self.ds1_df[feature].fillna(value=imp_list)
                print(
                    "Method : KNN , Feature : {} , Imputation Accuracy Score : {}"
                    .format(feature, score))
        return par_num_df, par_cat_df
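
# impute_knn_classifier is not shown in this snippet either. One plausible
# implementation, sketched here as an assumption: train a KNeighborsClassifier
# on the rows where the feature is known, score it on a hold-out split, and
# predict the missing rows. The returned Series is indexed by the NaN rows so
# that fillna(value=imp_list) fills exactly those positions.

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

def impute_knn_classifier(df, feature, n_neighbors=5):
    # Keep the sketch simple: use only numeric predictors, with NaNs filled by 0
    predictors = df.drop(columns=[feature]).select_dtypes('number').fillna(0)
    known = df[feature].notna()

    X_train, X_val, y_train, y_val = train_test_split(
        predictors[known], df.loc[known, feature],
        test_size=0.2, random_state=1)
    knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
    score = knn.score(X_val, y_val)

    missing_idx = df.index[~known]
    imp_list = pd.Series(knn.predict(predictors.loc[missing_idx]),
                         index=missing_idx)
    return imp_list, score
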

color_list = ['green','blue','orange','yellow','red','violet','cyan']


# In[5]:


# Count missing values per feature and visualize them as a bar chart
val = x_df.isna().sum().sort_values(ascending=False)
f.plot_bar(val.index, (70, 10), val, 30)
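
# plot_bar is a FunctionLib plotting helper; given the calls above, its
# signature appears to be (labels, figsize, values, rotation). A plausible
# matplotlib sketch of it, offered as an assumption:

import matplotlib.pyplot as plt

def plot_bar(labels, figsize, values, rotation=90):
    plt.figure(figsize=figsize)
    plt.bar(labels, values)
    plt.xticks(rotation=rotation)
    plt.tight_layout()
    plt.show()
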


# In[6]:


# Observe the features with missing values
f.get_missing_value_feats(x_df)


# In[5]:


# Separate the categorical and numerical features
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
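
# distinct_feats is not defined in this snippet; the assumed behaviour is a
# simple split of the dataframe's columns into numeric and non-numeric lists:

from pandas.api.types import is_numeric_dtype

def distinct_feats(df):
    num_feats = [c for c in df.columns if is_numeric_dtype(df[c])]
    cat_feats = [c for c in df.columns if not is_numeric_dtype(df[c])]
    return num_feats, cat_feats
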


# In[6]:


# Change the datatype of low-cardinality numerical features
f.change_type(x_df, num_feats, count_threshold=10)
# Import libraries
import pandas as pd

import Model.FunctionLib as f

# Import the working dataset (train_dataset is the path to the training CSV,
# not defined in this snippet)
train_df = pd.read_csv(train_dataset)

# Work with a 10% sample of the training data
x_df = train_df.sample(frac=0.1, random_state=1).reset_index(drop=True)

# Delete the original dataframe and work with the sample to free memory for processing.
del train_df

################################ CHANGING THE DATA TYPES ################################

# Observe the features with missing values
f.get_missing_value_feats(x_df)

# Separate the categorical and numerical features
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')

# Change the datatype of categorical and numerical values
f.change_type(x_df, num_feats, count_threshold=5)

# Re-separate the categorical and numerical features after the type change
num_feats, cat_feats = f.distinct_feats(x_df)
print(len(num_feats), len(cat_feats))
num_feats.remove('TARGET')
num_feats.remove('SK_ID_CURR')
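
# change_type is the remaining FunctionLib helper used above. A rough sketch of
# the assumed behaviour: numeric columns with at most count_threshold distinct
# values are treated as low-cardinality and recast as categorical; everything
# else is left untouched.

def change_type(df, num_feats, count_threshold=5):
    for col in num_feats:
        if df[col].nunique() <= count_threshold:
            df[col] = df[col].astype('category')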