Beispiel #1
0
def featexp_method(dataframe, target_name):
    """Print trend statistics for a train/test split of ``dataframe``.

    The split comes from ``generate_training_testing_set``. Each feature
    partition is re-indexed from zero and gets its labels attached under the
    fixed column name ``'Readmission_1'`` before both frames are handed to
    ``get_trend_stats``.

    :param dataframe: forwarded unchanged to ``generate_training_testing_set``
    :param target_name: forwarded unchanged to
        ``generate_training_testing_set``
    :return: None; the statistics are only printed
    """
    # NOTE(review): the label column name is hard-coded instead of being
    # derived from ``target_name`` -- confirm this is intended.
    label_col = 'Readmission_1'

    def _attach_labels(features, labels):
        # Both sides get a fresh 0..n-1 index so the label assignment lines
        # up row by row.
        frame = features.reset_index(drop=True)
        frame[label_col] = labels.reset_index(drop=True)
        return frame

    X_train, X_test, y_train, y_test = generate_training_testing_set(
        dataframe, target_name)
    data_train = _attach_labels(X_train, y_train)
    data_test = _attach_labels(X_test, y_test)

    # Optional: plot a single feature with
    # get_univariate_plots(data=data_train, target_col='Readmission_1',
    #                      features_list=data_train.columns[21:22],
    #                      data_test=data_test)

    print(get_trend_stats(data_train,
                          target_col=label_col,
                          data_test=data_test))
Beispiel #2
0
def noise_feature_find(train, test, label, features, bins=10, corr=None):
    """
    Analyse features for noise by comparing their train/test trends.

    Univariate plots are always drawn for ``features``. When ``corr`` is
    given, trend statistics are computed for the same features with the same
    number of bins, and every feature whose trend correlation is below
    ``corr`` is reported as noise.

    :param train: DataFrame | training set
    :param test: DataFrame | test set
    :param label: string | name of the label (target) column
    :param features: list | features to analyse
    :param bins: int | number of bins
    :param corr: float | features with a trend correlation below ``corr`` are
        treated as noise; default None (only the plots are drawn)
    :return: list of noise feature names, or None when ``corr`` is None
    """
    get_univariate_plots(data=train, target_col=label, data_test=test,
                         features_list=features, bins=bins)
    if corr is None:
        return None

    # Use the same feature subset and binning as the plots above; otherwise
    # the statistics would cover every column with the default bin count and
    # could disagree with what was just plotted.
    stats = get_trend_stats(data=train, target_col=label, data_test=test,
                            features_list=features, bins=bins)
    is_noisy = stats['Trend_correlation'] < corr
    return stats.loc[is_noisy, 'Feature'].tolist()
# Count plot of a single column. The series is passed as ``x=`` because
# recent seaborn releases treat the only positional argument as ``data``.
sns.countplot(x=df['column'])

# Feature understanding - see how the variable affects the target variable
from featexp import get_univariate_plots
# Plots are drawn for all features if nothing is passed in the features_list
# parameter.
get_univariate_plots(data=data_train,
                     target_col='target',
                     features_list=['DAYS_BIRTH'],
                     bins=10)
# Passing data_test as well compares the train and test data per feature.
get_univariate_plots(data=data_train,
                     target_col='target',
                     data_test=data_test,
                     features_list=['DAYS_EMPLOYED'])
from featexp import get_trend_stats
stats = get_trend_stats(data=data_train,
                        target_col='target',
                        data_test=data_test)

# Fix or remove outliers
sns.boxplot(x=df['feature1'])
sns.boxplot(x=df['feature2'])
# Pass the actual columns: bare strings without ``data=`` are plotted as a
# single categorical point. Assumes 'var1' and 'y' are columns of ``df``.
plt.scatter(df['var1'], df['y'])  # Do this for all variables against y


def replace_outlier(df,
                    column,
                    value,
                    threshold,
                    direction='max'):
    """Replace outliers in ``df[column]`` with ``value``, modifying ``df``.

    :param df: DataFrame to modify; ``df[column]`` is reassigned
    :param column: name of the column to clean
    :param value: replacement for outlying entries (could be the mean)
    :param threshold: cut-off that defines an outlier
    :param direction: ``'max'`` replaces entries greater than ``threshold``;
        ``'min'`` replaces entries less than ``threshold``
    :raises ValueError: if ``direction`` is neither ``'max'`` nor ``'min'``
    :return: None
    """
    series = df[column]
    if direction == 'max':
        is_outlier = series > threshold
    elif direction == 'min':
        is_outlier = series < threshold
    else:
        # An unknown direction used to be ignored silently, which hid typos.
        raise ValueError(
            "direction must be 'max' or 'min', got {!r}".format(direction))
    # Comparisons against NaN are False, so missing values are left untouched.
    df[column] = series.mask(is_outlier, value)
Beispiel #4
0
def _trend_consistency(Train, Valid, target_col='target'):
    """Return the ``get_trend_stats`` result for ``Train`` against ``Valid``.

    :param Train: passed to ``get_trend_stats`` as ``data``
    :param Valid: passed to ``get_trend_stats`` as ``data_test``
    :param target_col: name of the target column, default ``'target'``
    :return: whatever ``get_trend_stats`` returns, unchanged
    """
    trend_stats = get_trend_stats(data=Train,
                                  data_test=Valid,
                                  target_col=target_col)
    return trend_stats
Beispiel #5
0
 def _trend_consistency(self, Train, Valid, target_col):
     """Return the ``get_trend_stats`` result re-indexed by ``'Feature'``.

     :param Train: passed to ``get_trend_stats`` as ``data``
     :param Valid: passed to ``get_trend_stats`` as ``data_test``
     :param target_col: name of the target column
     :return: the statistics with the ``'Feature'`` column set as the index
     """
     trend_stats = get_trend_stats(data=Train,
                                   data_test=Valid,
                                   target_col=target_col)
     return trend_stats.set_index('Feature')