Beispiel #1
0
def clean_data(df, use_fs=True):
    """Encode the 'thal' column, drop two fixed columns and optionally plot importances.

    Arguments:
        df: pandas DataFrame to clean. It is modified in place: the 'thal'
            column (when present) is overwritten and the two dropped columns
            are removed from the object that was passed in unless a new frame
            was produced by the one-hot encoding step.
        use_fs: when truthy, fit a FeatureSelector on the cleaned frame and
            plot the feature importances.

    Returns:
        The cleaned DataFrame.

    Note:
        The feature-selection branch reads ``y`` from the enclosing scope; it
        is not a parameter of this function.
    """
    # Object -> categorical conversion, followed by one-hot encoding.
    if 'thal' in df.columns:
        thal_cols = ['thal']
        df[thal_cols] = df[thal_cols].apply(categorize_label, axis=0)
        df = pd.get_dummies(df, drop_first=True)

    # Columns that are always discarded.
    df.drop(
        [
            'fasting_blood_sugar_gt_120_mg_per_dl',
            'slope_of_peak_exercise_st_segment',
        ],
        axis=1,
        inplace=True,
    )

    if not use_fs:
        return df

    selector = FeatureSelector(data=df, labels=y)
    selector.identify_zero_importance(
        task='classification',
        eval_metric='auc',
        n_iterations=10,
        early_stopping=False,
    )
    selector.plot_feature_importances(threshold=0.99, plot_n=14)
    return df
def select_top_features(train_data):
    """Return the zero- and low-importance feature lists for a dataset.

    Arguments:
        train_data: indexable whose element 0 is the feature data and whose
            element 1 is the labels; both are handed to FeatureSelector.

    Returns:
        A 2-tuple ``(zero_importance, low_importance)`` taken from the
        selector's ``ops`` mapping.
    """
    features = train_data[0]
    labels = train_data[1]
    selector = FeatureSelector(features, labels)

    importance_settings = {
        'task': 'classification',
        'eval_metric': 'auc',
        'n_iterations': 6,
        'early_stopping': True,
    }
    selector.identify_zero_importance(**importance_settings)
    selector.identify_low_importance(cumulative_importance=0.99)

    zero_importance = selector.ops['zero_importance']
    low_importance = selector.ops['low_importance']
    return zero_importance, low_importance
Beispiel #3
0
def featureselect(datas, target):
    """Run every FeatureSelector check on ``datas`` and return the reduced data.

    Arguments:
        datas: feature data passed to FeatureSelector as ``data``.
        target: labels passed to FeatureSelector as ``labels``.

    Returns:
        Whatever ``FeatureSelector.remove(methods='all')`` returns.

    Note:
        The process working directory is changed to a hard-coded path before
        ``feature_selector`` is imported and is never restored.
    """
    import os

    # NOTE(review): presumably needed so the local feature_selector module can
    # be imported -- confirm before changing this path.
    os.chdir('c:\\Users\\SA\\python\\練習py')
    from feature_selector import FeatureSelector

    selector = FeatureSelector(data=datas, labels=target)
    selector.identify_missing(missing_threshold=0.6)
    selector.identify_collinear(correlation_threshold=0.9)
    selector.identify_zero_importance(
        task='classification',
        eval_metric='auc',
        n_iterations=10,
        early_stopping=False,
    )
    selector.identify_low_importance(cumulative_importance=0.9)

    return selector.remove(methods='all')
Beispiel #4
0
def featuresSel(train, train_labels, name):
    """Plot the curve for the important features and save it as a PNG.

    Arguments:
        train {pandas.Dataframe} -- Dataset
        train_labels {numpy.ndarray} -- Labels for the dataset
        name {string} -- Name for file; it is inserted into the output
            file name ``../../data/figures/rank_<name>.png``
    """
    print('>>> Feature Selection...')

    selector = FeatureSelector(data=train, labels=train_labels)
    selector.identify_zero_importance(
        task='classification',
        eval_metric='auc',
        n_iterations=10,
        early_stopping=True,
    )

    # Open a large figure, draw the ranking, write it to disk, then close it.
    plt.figure(figsize=(15, 15))
    selector.plot_feature_importances(threshold=0.99, plot_n=50, name=name)
    figure_path = '../../data/figures/rank_{}.png'.format(name)
    plt.savefig(figure_path)
    plt.close()
Beispiel #5
0
def Bestfeature_from_cummulative_importance(inFile, outFile):
    """Drop zero-importance features from a tab-separated dataset.

    Reads ``inFile`` (tab-separated, must contain a ``class_label`` column),
    fits a FeatureSelector, prints the position of the first feature whose
    cumulative importance exceeds 0.99, removes the zero-importance features
    and writes the remaining features plus ``class_label`` to ``outFile``
    (tab-separated, without the index).

    Arguments:
        inFile: path (or buffer) accepted by ``pandas.read_csv``.
        outFile: path (or buffer) accepted by ``DataFrame.to_csv``.

    Returns:
        None.
    """
    df = pd.read_csv(inFile, sep='\t')
    print(df.shape)
    train_labels = df['class_label']
    train = df.drop(columns=['class_label'])

    fs = FeatureSelector(data=train, labels=train_labels)
    fs.identify_zero_importance(task='classification',
                                eval_metric='auc',
                                n_iterations=10,
                                early_stopping=True)

    # First row of the importance table that crosses the 0.99 cumulative mark.
    importance_index = np.min(
        np.where(fs.feature_importances['cumulative_importance'] > 0.99))
    fs.identify_low_importance(cumulative_importance=0.99)
    print(importance_index)

    # NOTE(review): low-importance features are identified above but only the
    # zero-importance ones are removed here -- confirm this is intended.
    reduced = fs.remove(methods=['zero_importance'], keep_one_hot=False)
    reduced = pd.concat([reduced, train_labels], axis=1)
    # index=False is the documented way to omit the index column.
    reduced.to_csv(outFile, sep='\t', index=False)
Beispiel #6
0
    def runFeatureSelector(self, df):
        """Run all FeatureSelector checks on ``df`` and dump the results to CSV.

        Each step writes its record to a hard-coded Windows-style path under
        ``.\\utils\\csv\\``. The labels come from ``self.targets``.

        Arguments:
            df: feature data passed to FeatureSelector as ``data``.

        Returns:
            The result of ``FeatureSelector.remove(methods='all')`` when
            ``self.drop == 1``; otherwise ``df`` unchanged.
        """
        logging.info("Running Feature Selection")
        selector = FeatureSelector(data=df, labels=self.targets)

        # Missing values.
        selector.identify_missing(missing_threshold=0.6)

        # Collinear features.
        selector.identify_collinear(correlation_threshold=0.98)
        selector.record_collinear.to_csv(".\\utils\\csv\\record_collinear.csv")

        # Features with a single unique value.
        selector.identify_single_unique()
        selector.record_single_unique.to_csv(
            ".\\utils\\csv\\record_single_unique.csv")

        # Zero-importance features.
        selector.identify_zero_importance(
            task='classification',
            eval_metric='multi_logloss',
            n_iterations=10,
            early_stopping=True,
        )
        selector.record_zero_importance.to_csv(
            ".\\utils\\csv\\record_zero_importance.csv")

        # Low-importance features.
        selector.identify_low_importance(cumulative_importance=0.99)
        selector.feature_importances.to_csv(
            ".\\utils\\csv\\feature_importance.csv")

        # One-row-per-operation summary of everything flagged above.
        ops_summary = pd.DataFrame.from_dict(selector.ops, orient='index')
        ops_summary.to_csv(".\\utils\\csv\\summary.csv")

        # Only strip the flagged features when the drop flag is set to 1.
        return selector.remove(methods='all') if self.drop == 1 else df
Beispiel #7
0
    def remove_unnecessary_features(self, auto=False):
        """Shrink ``self.processed_data`` by dropping unneeded feature columns.

        Arguments:
            auto: when truthy, simply drop the columns listed in
                ``self.predefined_skip_features``. Otherwise run the
                FeatureSelector checks against the "label" column and remove
                everything they flag, re-attaching "label" afterwards.

        Returns:
            None; the result is stored back on ``self.processed_data``.
        """
        if auto:
            skip = self.predefined_skip_features
            self.processed_data = self.processed_data.drop(columns=skip)
            return

        feature_frame = self.processed_data.drop("label", axis=1)
        selector = FeatureSelector(data=feature_frame,
                                   labels=self.processed_data["label"])

        selector.identify_missing(missing_threshold=0.6)
        selector.identify_collinear(correlation_threshold=0.98)
        selector.identify_zero_importance(
            task='classification',
            eval_metric='auc',
            n_iterations=10,
            early_stopping=False,
        )
        selector.identify_low_importance(cumulative_importance=0.99)
        selector.identify_single_unique()

        # remove() returns a new frame without the label, so keep a handle on
        # the label column and put it back afterwards.
        label_column = self.processed_data["label"]
        self.processed_data = selector.remove(methods='all')
        self.processed_data["label"] = label_column
Beispiel #8
0
def select_best_features(data_file_path, saveto_path="Default"):
    """Write a reduced copy of a labelled CSV with uninformative features removed.

    The input is first passed through ``strip_header`` to obtain a temporary
    CSV that pandas can read. Features flagged as single-valued, collinear
    (threshold 0.98), zero-importance or low-importance (cumulative 0.99) are
    removed, the ``Label`` column is re-attached, and the result is written to
    ``saveto_path`` preceded by one metadata line ``<rows>,<feature columns>``.
    The temporary file is deleted at the end.

    Arguments:
        data_file_path: path of the original data file.
        saveto_path: output path; the sentinel ``"Default"`` means derive it
            from ``data_file_path`` via ``replace_ext(..., '_reduced.csv')``.

    Returns:
        None.
    """
    mod_data_file_path = strip_header(data_file_path)

    if saveto_path == "Default":
        saveto_path = replace_ext(data_file_path, '_reduced.csv')

    X = pd.read_csv(mod_data_file_path)
    y = X['Label']
    X = X.drop(columns=['Label'])

    feature_selector = FeatureSelector(data=X, labels=y)
    feature_selector.identify_single_unique()
    feature_selector.identify_collinear(correlation_threshold=0.98)
    feature_selector.identify_zero_importance(task='classification',
                                              eval_metric='auc',
                                              n_iterations=10,
                                              early_stopping=True)
    feature_selector.identify_low_importance(cumulative_importance=0.99)

    X_dash = feature_selector.remove(
        methods=[
            'single_unique', 'collinear', 'zero_importance', 'low_importance'
        ],
        keep_one_hot=False)
    X_dash['Label'] = y

    X_dash.to_csv(saveto_path, index=False)

    # Prepend the "<rows>,<feature columns>" metadata line; the label column
    # is excluded from the column count.
    meta_data = [str(X_dash.shape[0]), str(X_dash.shape[1] - 1)]
    with open(saveto_path, 'r') as fh:
        contents = fh.read()
    contents = ','.join(meta_data) + '\n' + contents
    with open(saveto_path, 'w') as fh:
        fh.write(contents)

    # Delete the temporary file without going through a shell, so paths with
    # spaces or shell metacharacters are handled and it works off POSIX too.
    try:
        os.remove(mod_data_file_path)
    except FileNotFoundError:
        # Same as `rm -f`: an already-missing file is not an error.
        pass
    except OSError as err:
        # `rm -f` only reported other failures; keep the cleanup best-effort.
        print("could not remove {}: {}".format(mod_data_file_path, err))
Beispiel #9
0
    
    
    fs.identify_single_unique()
    single_unique = fs.ops['single_unique']
    single_unique
    fs.plot_unique()
    
    
    
    fs.identify_collinear(correlation_threshold=0.975)
    correlated_features = fs.ops['collinear']
    correlated_features[:5]
    fs.plot_collinear()
    fs.record_collinear.head()
    
    fs.identify_zero_importance(task = 'classification', eval_metric = 'auc', 
                                n_iterations = 10, early_stopping = True)
    one_hot_features = fs.one_hot_features
    base_features = fs.base_features
    print('There are %d original features' % len(base_features))
    print('There are %d one-hot features' % len(one_hot_features))
    fs.plot_feature_importances(threshold = 0.99, plot_n = 12)
    fs.feature_importances.head(10)





# Optional exploration step, executed only when DO_DATA_EXPLORATION is truthy:
# calls boxplot() on df with the column names 'target' and 'feat_3766'
# (presumably the discrete and the continuous variable, going by the names).
if DO_DATA_EXPLORATION:
    discVar = 'target'
    contVar = 'feat_3766'
    boxplot(df, discVar,contVar)
Beispiel #10
0
# Collinearity statistics: flag features using a threshold of 0.95.
fs.identify_collinear(0.95)
# Prints whatever plot_collinear() returns next to the label
# NOTE(review): that return value is probably None -- confirm.
print('plot_collinear()', fs.plot_collinear())

# list of collinear features to remove
collinear_features = fs.ops['collinear']
print('collinear_features', collinear_features)

# dataframe of collinear features, strongest correlation first
df_collinear_features = fs.record_collinear.sort_values('corr_value',
                                                        ascending=False)
print('df_collinear_features', df_collinear_features.head(50))

# Zero-importance feature statistics
# Pass in the appropriate parameters
# (eval_metric is the custom callable tpr_weight_funtion_lc defined elsewhere)
fs.identify_zero_importance(task='classification',
                            eval_metric=tpr_weight_funtion_lc,
                            n_iterations=10,
                            early_stopping=True)
# list of zero importance features
zero_importance_features = fs.ops['zero_importance']
print('zero_importance_features', zero_importance_features)

# Low-importance feature statistics
fs.identify_low_importance(cumulative_importance=0.99)
df_low_importance = fs.feature_importances
# Show the 20 most important features.
print(df_low_importance.sort_values('importance', ascending=False).head(20))

# Run all of the identification methods in one go
print('go')
fs.identify_all(
    selection_params={
        'missing_threshold': 0.7,
Beispiel #11
0
from sklearn.feature_selection import SelectKBest, chi2
import pandas as pd
import numpy as np

# Alternative column layout with separate away (A_) and home (H_) statistics:
# columns = ['A_TS%', 'A_eFG%', 'A_3PAr', 'A_FTr', 'A_ORB%', 'A_DRB%',
#                              'A_TRB%', 'A_AST%', 'A_STL%', 'A_BLK%', 'A_TOV%', 'A_ORtg', 'A_DRtg',
#                              'H_TS%', 'H_eFG%', 'H_3PAr', 'H_FTr', 'H_ORB%', 'H_DRB%',
#                              'H_TRB%', 'H_AST%', 'H_STL%', 'H_BLK%', 'H_TOV%', 'H_ORtg', 'H_DRtg'
#                              ]
columns = ['TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%',
           'AST%', 'STL%', 'BLK%', 'TOV%', 'ORtg', 'DRtg']

# Build the feature and label frames from the project helpers and name
# their columns.
features = pd.DataFrame(adv_diff_features(None))
labels = pd.DataFrame(adv_diff_labels())

features.columns = columns
labels.columns = ['POINT_DIFF']

print(len(features), len(labels))

# First selector: missing-value and collinearity checks only.
fs = FeatureSelector(data=features, labels=labels)
fs.identify_missing(missing_threshold=0.9)
fs.identify_collinear(correlation_threshold=0.5)
fs.plot_collinear()

# Second selector: model-based importances for the regression target.
# The original passed `labels[:,]`, a tuple-of-slice key that DataFrame
# indexing rejects; pass the single label column explicitly instead.
fs2 = FeatureSelector(data=features, labels=labels['POINT_DIFF'])
fs2.identify_zero_importance(eval_metric='l2', task='regression')
# fs2.identify_low_importance()

print(fs.record_collinear.head())
Beispiel #12
0
# Section banner plus explanatory notes (printed in Chinese) describing what
# identify_zero_importance does and which parameters it takes.
print("# identify_zero_importance")
print("使用LightGBM库训练GB集成算法,评价特征之间的重要性")
print("1. 为了降低随机性,模型默认会训练10次")
print(
    "2. 模型默认会采用 early stopping 的操作形式,使用15%的数据作为 validation data 去获取 optimal number of estimators"
)
print("3. 需要使用到的参数")
print("    task: classification or regression , metrics 与这个是相关的")
print(
    "    eval_metric: 用于 early stopping 的指标,auc for classification, L2 for regression"
)
print("    n_iterations:训练的次数,默认是10次,feature importances会取10次计算结果的平均值")
print("    early_stopping: 默认在训练的时候是使用 early stopping 模式的,early stopping")
print("                    可以理解成一个 regulation,为了防止训练数据的过拟合")
# Fit the importance model on the existing selector `fs`.
fs.identify_zero_importance(task="classification",
                            eval_metric="auc",
                            n_iterations=10,
                            early_stopping=True)

# Dump the zero-importance feature names to a text file, one per line.
# Each line carries the enumerate() position, which starts at 0.
zero_importance_features = fs.ops["zero_importance"]
with open("zero_importance.txt", "w") as f:
    for index, zero_importance_feature in enumerate(zero_importance_features):
        f.write("特征个数:{}  特征名称:{}\n".format(index, zero_importance_feature))
# Plot the importances, save the current figure as a JPEG, then show it.
fs.plot_feature_importances(threshold=0.99, plot_n=20)
plt.savefig("feature_importance.jpg", dpi=300)
plt.show()

# Feature names for index labels up to and including 99 (.loc slicing is
# label-based and inclusive).
# NOTE(review): this is "the top 100" only if feature_importances is sorted
# by importance with a default 0..n-1 index -- confirm.
one_hundred_features = list(fs.feature_importances.loc[:99, "feature"])

print("\n")

# Banner for the next section of the script.
print("# identify_low_importance")
Beispiel #13
0
# Columns of x whose values are all equal to 0 (the message below calls them
# "constant", but only the all-zero case is detected here).
constant_train = x.loc[:, (x == 0).all()].columns.tolist()
print('Number of constant columns in the train set:', len(constant_train))

# %%
# Missing-value check: flag features using a threshold of 0.6, then plot.
# The bare .head() expression only displays output in an interactive cell.
fs = FeatureSelector(x, y)
fs.identify_missing(missing_threshold=0.6)
fs.record_missing.head()
fs.plot_missing()

# %%
# Collinearity check with a correlation threshold of 0.9.
fs.identify_collinear(correlation_threshold=0.9)
fs.record_collinear.head()

# %%
# Model-based importances for a regression task, evaluated with RMSE.
fs.identify_zero_importance(task='regression',
                            eval_metric='rmse',
                            n_iterations=10,
                            early_stopping=True)
# list of zero importance features
zero_importance_features = fs.ops['zero_importance']
#%%
# Low-importance check with a cumulative-importance cutoff of 0.99999.
fs.identify_low_importance(cumulative_importance=0.99999)
fs.record_low_importance.head()
# NOTE(review): the positional 50 is presumably the number of features to
# plot -- confirm against the plot_feature_importances signature.
fs.plot_feature_importances(50)

# %%
# Remove everything flagged by any of the checks above.
train_removed = fs.remove(methods='all')
X_clean = train_removed
# %%
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
Beispiel #14
0
    return (np.transpose(tt))

# Summarise missing data for the raw frame.
miss_data(df)

# %%
# Missing-value check (threshold 0.6) on a fresh selector, followed by a plot.
fs = FeatureSelector(x, y)
fs.identify_missing(missing_threshold=0.6)
fs.record_missing.head()
fs.plot_missing()

# %%
# Collinearity check (threshold 0.9).
fs.identify_collinear(correlation_threshold=0.9)
fs.record_collinear.head()

# %%
# Model-based importances for a classification task, without early stopping.
fs.identify_zero_importance(
    task='classification',
    n_iterations=10,
    early_stopping=False,
)
# Features that received zero importance.
zero_importance_features = fs.ops['zero_importance']

# %%
# Low-importance check with a cumulative-importance cutoff of 0.99.
fs.identify_low_importance(cumulative_importance=0.99)
fs.record_low_importance.head()
fs.plot_feature_importances(100)

# %%
# Remove everything flagged above; both names refer to the same result.
X_clean = train_removed = fs.remove(methods='all')

import seaborn as sns
import matplotlib.pyplot as plt