Python FunctionLib.TurkyOutliers Examples

Programming Language: Python

Namespace/Package Name: Model

Class/Type: FunctionLib

Method/Function: TurkyOutliers

Examples at hotexamples.com: 2

Python FunctionLib.TurkyOutliers - 2 examples found. These are the top-rated real-world Python examples of Model.FunctionLib.TurkyOutliers, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

get_params(8)

distinct_feats(7)

change_type(7)

get_missing_value_feats(6)

ScoreDataFrame(3)

get_aggregate_features_num(3)

get_model_performance(3)

TurkyOutliers(2)

impute_knn_classifier(2)

GetScaledModel(2)

get_rowcnt_most_missing_val(2)

GetBasedModel(2)

cv_score(2)

corr_feats(2)

GetScaledModelwithfactorizedCW(2)

plot_bar(2)

missing_val_perc(2)

impute_values(2)

log_transform(2)

PlotBoxR(2)

match_strings(1)

hist_perc(1)

hist_compare(1)

get_unique_val_list(1)

plot_stats(1)

min_len_col(1)

AdaBoostClassifier(1)

get_corr(1)

feature_stats(1)

default_ratio(1)

cv_metrics(1)

concat_model_score(1)

RandomSearch(1)

RandomForestClassifier(1)

LogisticRegression(1)

KNeighborsClassifier(1)

GridSearch(1)

GradientBoostingClassifier(1)

GetScaledModelwithbestparams(1)

train_test_split(1)

Example #1

Show file

File: Preprocessing.py Project: rkparyani/KAGGLE---Home-Credit-Default-Risk

    def outlier_treatment(self, normalized_feats):
        """Clean anomalies, log-transform skewed features and cap outliers.

        Works on ``self.ds1_df`` and performs, in order:

        1. Replace the placeholder value 365243 in ``DAYS_EMPLOYED`` and the
           string "XNA" in ``ORGANIZATION_TYPE`` with NaN so that they can be
           imputed at a later stage.
        2. Pass every numerical feature that is not listed in
           ``normalized_feats`` through ``f.log_transform`` and then drop the
           original column (presumably the helper adds a transformed column
           next to it -- confirm against ``f.log_transform``).
        3. For each numerical feature left afterwards, overwrite the rows that
           ``f.TurkyOutliers`` reports as left / right outliers with the
           lower / upper value it returns, rounded to 3 decimals.

        Every write uses a single ``.loc[rows, column]`` call (or a plain
        column assignment) instead of chained indexing such as
        ``df[col].loc[rows] = value``. Chained assignment may operate on a
        temporary copy and is a silent no-op under pandas Copy-on-Write.

        Args:
            normalized_feats: Names of the features that are already
                normalized and must be skipped by the log transformation.

        Returns:
            None. ``self.ds1_df`` is updated.
        """
        # Split the current columns into numerical / categorical features;
        # only the numerical ones are needed here.
        num_feats, _ = self.seperate_cat_num_var(self.ds1_df)
        other_feats = [x for x in num_feats if x not in normalized_feats]

        # Anomalies and data correction.
        # DAYS_EMPLOYED carries the abnormal value 365243, which is turned
        # into NaN so it can be imputed at a later stage.
        feature = 'DAYS_EMPLOYED'
        self.ds1_df.loc[self.ds1_df[feature] == 365243, feature] = np.nan

        # "XNA" values in ORGANIZATION_TYPE are replaced by NaN to be imputed.
        self.ds1_df['ORGANIZATION_TYPE'] = (
            self.ds1_df['ORGANIZATION_TYPE'].replace("XNA", np.nan))

        # Log transformation of all numerical, non-normalized features to
        # reduce the influence of outliers.
        for feature in other_feats:
            print('log_transform', feature)
            self.ds1_df = f.log_transform(self.ds1_df, feature)
            self.ds1_df.drop(columns=[feature], inplace=True)

        # The column set changed above, so the numerical features are
        # determined again before the outlier pass.
        num_feats, _ = self.seperate_cat_num_var(self.ds1_df)

        for col in num_feats:
            print(col)
            # lower_cap / upper_cap are named so that the builtins min / max
            # are not shadowed.
            out_l, out_r, lower_cap, upper_cap = f.TurkyOutliers(
                self.ds1_df, col, drop=False)
            if len(out_l) or len(out_r):
                self.ds1_df.loc[out_l, col] = round(lower_cap, 3)
                self.ds1_df.loc[out_r, col] = round(upper_cap, 3)

Example #2

Show file

File: Preprocessing_app_train.py Project: rkparyani/KAGGLE---Home-Credit-Default-Risk

# Log-transform every feature in other_feats and drop the original column
# afterwards (presumably f.log_transform adds a transformed column next to
# it -- confirm against the helper).
for feature in other_feats:
    print(feature)
    imp_df = f.log_transform(imp_df, feature)
    imp_df.drop(columns=[feature], inplace=True)

# The column set changed above, so the numerical / categorical feature lists
# are derived again. TARGET and SK_ID_CURR are taken out of the numerical list
# (presumably the label and the record id) so the loop below skips them.
num_feats_imp_df, cat_feats_imp_df = f.distinct_feats(imp_df)
num_feats_imp_df.remove('TARGET')
num_feats_imp_df.remove('SK_ID_CURR')
print(len(num_feats_imp_df), len(cat_feats_imp_df))

# Cap outliers: rows reported as left / right outliers are overwritten with
# the lower / upper value returned by f.TurkyOutliers, rounded to 3 decimals.
# Writes use a single .loc[rows, column] call; chained df[col].loc[rows] = v
# may act on a temporary copy and is a no-op under pandas Copy-on-Write.
for i in num_feats_imp_df:
    print(i)
    # lower_cap / upper_cap are named so that the builtins min / max are not
    # rebound at module level.
    out_l, out_r, lower_cap, upper_cap = f.TurkyOutliers(imp_df, i, drop=False)
    if len(out_l) or len(out_r):
        imp_df.loc[out_l, i] = round(lower_cap, 3)
        imp_df.loc[out_r, i] = round(upper_cap, 3)

#################################### MISSING VALUES #############################
# Author's rationale: after the transformations above, the univariate
# distributions of the numerical features are taken to be symmetrical (mean
# and median roughly equal), so numerical missing values are to be imputed
# with the mean.

# Record which entries are missing before imputing, for later validation.
# NOTE(review): MissingIndicator is presumably sklearn.impute.MissingIndicator;
# its import is outside this view -- confirm.
indicator = MissingIndicator(missing_values=np.nan)
mask_missing_values_only = indicator.fit_transform(imp_df)
# Bare expression: displays the mask's shape in an interactive session only;
# it has no effect when the file is run as a script.
mask_missing_values_only.shape

# Num missing values imputations
imp_df[num_feats_imp_df] = imp_df[num_feats_imp_df].fillna(