Example 1
0
def clean_data(df, use_fs=True, labels=None):
    """Clean the dataframe and optionally run a feature-importance analysis.

    One-hot encodes the string-typed 'thal' column (if present), drops two
    low-signal columns in place, and optionally runs FeatureSelector's
    zero-importance analysis.

    Arguments:
        df {pandas.DataFrame} -- dataset to clean (mutated in place by the drop)
        use_fs {bool} -- when True, run the FeatureSelector importance analysis
        labels {array-like} -- target labels for FeatureSelector; when None,
            falls back to the module-level `y` (the previous implicit behaviour)

    Returns:
        pandas.DataFrame -- the cleaned dataframe
    """
    # Convert the object-typed 'thal' column to categorical, then one-hot
    # encode (drop_first=True avoids a redundant, collinear dummy column).
    if 'thal' in df.columns:
        string_labels = ['thal']
        df[string_labels] = df[string_labels].apply(categorize_label, axis=0)
        df = pd.get_dummies(df, drop_first=True)
    # Drop columns judged uninformative by earlier analysis.
    to_drop = ['fasting_blood_sugar_gt_120_mg_per_dl', 'slope_of_peak_exercise_st_segment']
    df.drop(to_drop, axis=1, inplace=True)
    if use_fs:
        # BUGFIX: the labels used to be read from the undefined global `y`,
        # raising NameError unless it happened to exist at module level.
        # Accept them as a parameter; keep the global fallback for
        # backward compatibility with existing callers.
        fs_labels = y if labels is None else labels
        fs = FeatureSelector(data=df, labels=fs_labels)
        fs.identify_zero_importance(task='classification', eval_metric='auc',
                                    n_iterations=10, early_stopping=False)
        fs.plot_feature_importances(threshold=0.99, plot_n=14)
    return df
Example 2
0
def featuresSel(train, train_labels, name):
    """Plot the feature-importance curve for the dataset and save it to disk.

    Runs FeatureSelector's zero-importance analysis and writes the ranked
    importance plot to ../../data/figures/rank_<name>.png.

    Arguments:
        train {pandas.DataFrame} -- dataset
        train_labels {numpy.ndarray} -- labels for the dataset
        name {string} -- name used for the output figure file
    """
    print('>>> Feature Selection...')
    fs = FeatureSelector(data=train, labels=train_labels)
    # Identify features that never contribute importance to the classifier.
    fs.identify_zero_importance(task='classification',
                                eval_metric='auc',
                                n_iterations=10,
                                early_stopping=True)
    plt.figure(figsize=(15, 15))
    # NOTE(review): the `name=` kwarg assumes a customized
    # plot_feature_importances signature — confirm against the
    # FeatureSelector implementation used by this project.
    fs.plot_feature_importances(threshold=0.99, plot_n=50, name=name)
    plt.savefig('../../data/figures/rank_{}.png'.format(name))
    plt.close()
Example 3
0
    
    
    
    # Flag pairs of features whose pairwise correlation exceeds 0.975.
    fs.identify_collinear(correlation_threshold=0.975)
    correlated_features = fs.ops['collinear']
    correlated_features[:5]  # bare expression: a no-op outside a notebook cell
    fs.plot_collinear()
    fs.record_collinear.head()
    
    # Identify features with zero importance for the classification task.
    fs.identify_zero_importance(task = 'classification', eval_metric = 'auc', 
                                n_iterations = 10, early_stopping = True)
    one_hot_features = fs.one_hot_features
    base_features = fs.base_features
    print('There are %d original features' % len(base_features))
    print('There are %d one-hot features' % len(one_hot_features))
    fs.plot_feature_importances(threshold = 0.99, plot_n = 12)
    fs.feature_importances.head(10)  # bare expression: no-op outside a notebook





# Optional exploratory plots; requires the project-level helpers
# boxplot/scatter/plot2dHist, the dataframe `df`, and the flag
# DO_DATA_EXPLORATION, all defined elsewhere in the file.
if DO_DATA_EXPLORATION:
    # Distribution of one continuous feature split by the discrete target.
    discVar = 'target'
    contVar = 'feat_3766'
    boxplot(df, discVar,contVar)
    
    # Pairwise relationship between two continuous features.
    contVar1 = 'feat_1717'
    contVar2 = 'feat_3766'
    scatter(df,contVar1,contVar2)
    plot2dHist(df,contVar1,contVar2)
Example 4
0
# Notebook-exported cells (In[28]-In[33]): feature-importance analysis on the
# scaled dataset. `fsDataScale` is a feature-selector object built in an
# earlier, unseen cell.
fsDataScale.identify_feat_imp()


# Identify zero-importance features (original garbled note: "zero feath with 0").

# In[29]:


# Flag features outside 95% cumulative importance (per the method name —
# confirm against the selector's implementation).
fsDataScale.identify_low_importance(0.95)


# In[30]:


# Plot the top-300 feature importances.
fsDataScale.plot_feature_importances(plot_n=300)


# In[31]:


# Bare expression (notebook display): maximum of the recorded scores.
max(fsDataScale.scores)


# In[32]:


# Keep a handle on the ranked feature-importance table.
abc = fsDataScale.feature_importances


# In[33]:
Example 5
0
File: test.py Project: vic7894/vic
# %%
# Flag feature pairs correlated above 0.9.
fs.identify_collinear(correlation_threshold=0.9)
fs.record_collinear.head()

# %%
# Zero-importance features for the regression task (RMSE metric).
fs.identify_zero_importance(task='regression',
                            eval_metric='rmse',
                            n_iterations=10,
                            early_stopping=True)
# list of zero importance features
zero_importance_features = fs.ops['zero_importance']
#%%
# Features beyond 99.999% cumulative importance are flagged as removable.
fs.identify_low_importance(cumulative_importance=0.99999)
fs.record_low_importance.head()
fs.plot_feature_importances(50)

# %%
# Drop everything flagged by any of the identification steps above.
train_removed = fs.remove(methods='all')
X_clean = train_removed
# %%
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
#%%
# NOTE(review): make_scorer(mean_squared_error) defaults to
# greater_is_better=True — for an error metric you normally want
# greater_is_better=False; confirm the intent before using with CV.
scores = make_scorer(mean_squared_error)
models = [
    RandomForestRegressor(n_estimators=200,
# Feature selection on the community-crime dataset.
# `X_train_community` / `y_train_community` come from an earlier (unseen) split.
fs = FeatureSelector(data=X_train_community, labels=y_train_community)

# Flag feature pairs correlated above 0.8.
fs.identify_collinear(correlation_threshold=0.8)
collinear_features = fs.ops['collinear']
fs.record_collinear.head()

# Pass in the appropriate parameters
fs.identify_zero_importance(task='classification',
                            eval_metric='auc',
                            n_iterations=10,
                            early_stopping=True)
# list of zero importance features
zero_importance_features = fs.ops['zero_importance']

# plot the feature importances
fs.plot_feature_importances(threshold=0.90, plot_n=10)
"""#Building the Model with Limited Features

###Discrete Targets
"""

# Keep only the hand-picked high-importance columns plus the target column.
limited_data = community_data_class[[
    "PctIlleg", "PctKids2Par", "racePctWhite", "PctPopUnderPov",
    "PctVacantBoarded", "ViolentCrimeLevel"
]]
limited_data.head()

#Create X and y datasets with limited data
X_limited = limited_data.copy()
X_limited = X_limited.drop(['ViolentCrimeLevel'], axis=1)
y_limited = limited_data['ViolentCrimeLevel'].copy()