def featexp_method(dataframe, target_name):
    """Print and return trend statistics for a train/test split of ``dataframe``.

    The data is split with ``generate_training_testing_set``. The target
    values are then attached back onto the feature frames under
    ``target_name`` so that features and target live in one frame, which is
    the form passed to ``get_trend_stats``.

    :param dataframe: data forwarded to ``generate_training_testing_set``
    :param target_name: name of the target column
    :return: the object produced by ``get_trend_stats`` (it is also printed)
    """
    X_train, X_test, y_train, y_test = generate_training_testing_set(
        dataframe, target_name)

    # Both sides get a fresh 0..n-1 index so the target is aligned with the
    # features by position rather than by whatever index the split left behind.
    data_train = X_train.reset_index(drop=True)
    data_train[target_name] = y_train.reset_index(drop=True)
    data_test = X_test.reset_index(drop=True)
    data_test[target_name] = y_test.reset_index(drop=True)

    # The target column is named after ``target_name`` instead of a fixed
    # literal, so the function works for any target, not just one dataset.
    stats = get_trend_stats(data_train, target_col=target_name,
                            data_test=data_test)
    print(stats)
    return stats
def noise_feature_find(train, test, label, features, bins=10, corr=None):
    """Plot univariate trends and optionally identify noisy features.

    The univariate plots are always drawn. Trend statistics are only
    computed when ``corr`` is given; they are restricted to ``features`` and
    use the same ``bins`` as the plots, so the returned list can only contain
    features the caller asked about.

    :param train: DataFrame | training set
    :param test: DataFrame | test set
    :param label: string | name of the label (target) column
    :param features: list | features to analyse
    :param bins: int | number of bins
    :param corr: float | features whose trend correlation is below ``corr``
        are reported as noise; default None
    :return: list of noisy feature names, or None when ``corr`` is None
    """
    get_univariate_plots(data=train, target_col=label, data_test=test,
                         features_list=features, bins=bins)
    if corr is None:
        return None

    # Forward features/bins so the statistics describe exactly what was plotted.
    stats = get_trend_stats(data=train, target_col=label, data_test=test,
                            features_list=features, bins=bins)
    return list(stats[stats.Trend_correlation < corr].Feature)
sns.countplot(df['column']) #Feature understanding - see how the variable affects the target variable from featexp import get_univariate_plots # Plots drawn for all features if nothing is passed in feature_list parameter. get_univariate_plots(data=data_train, target_col='target', features_list=['DAYS_BIRTH'], bins=10) get_univariate_plots(data=data_train, target_col='target', data_test=data_test, features_list=['DAYS_EMPLOYED']) from featexp import get_trend_stats stats = get_trend_stats(data=data_train, target_col='target', data_test=data_test) #Fix or remove outliers sns.boxplot(df['feature1']) sns.boxplot(df['feature2']) plt.scatter('var1', 'y') #Do this for all variables against y def replace_outlier(df, column, value, threshold, direction='max'): #value could be the mean if direction == 'max': df[column] = df[column].apply(lambda x: value if x > threshold else x)
def _trend_consistency(Train, Valid, target_col='target'):
    """Compute trend statistics between a training and a validation set.

    :param Train: training data, forwarded to ``get_trend_stats`` as ``data``
    :param Valid: validation data, forwarded as ``data_test``
    :param target_col: name of the target column; defaults to 'target'
    :return: the result of ``get_trend_stats``, unchanged
    """
    trend_stats = get_trend_stats(
        data=Train,
        target_col=target_col,
        data_test=Valid,
    )
    return trend_stats
def _trend_consistency(self, Train, Valid, target_col):
    """Compute trend statistics for train vs. validation, keyed by feature.

    :param Train: training data, forwarded to ``get_trend_stats`` as ``data``
    :param Valid: validation data, forwarded as ``data_test``
    :param target_col: name of the target column
    :return: the ``get_trend_stats`` result re-indexed on its 'Feature' column
    """
    trend_stats = get_trend_stats(
        data=Train,
        target_col=target_col,
        data_test=Valid,
    )
    # Index by feature name so callers can look rows up by feature.
    indexed_stats = trend_stats.set_index('Feature')
    return indexed_stats