Exemple #1
0
def select_features_without_label(features: pd.DataFrame, missing_threshold=0.7,
                                  correlation_threshold=0.95) -> pd.DataFrame:
    """Remove features flagged as 'missing' or 'single_unique'.

    No labels are supplied to the selector, so only the two label-free
    identification steps below are run.

    Args:
        features: Frame of candidate features handed to ``FeatureSelector``.
        missing_threshold: Forwarded to ``FeatureSelector.identify_missing``.
        correlation_threshold: Accepted for interface compatibility but not
            used; the collinearity step is disabled in this version.

    Returns:
        The result of ``FeatureSelector.remove`` for the 'missing' and
        'single_unique' methods.
    """
    selector = FeatureSelector(data=features)
    selector.identify_missing(missing_threshold)
    selector.identify_single_unique()

    # Collinear removal is intentionally left out here (step disabled).
    removal_methods = ['missing', 'single_unique']
    return selector.remove(methods=removal_methods)
def select_features_without_label(features: pd.DataFrame,
                                  missing_threshold=0.90,
                                  correlation_threshold=1) -> pd.DataFrame:
    """Remove 'missing' and 'single_unique' features, optionally 'collinear' too.

    Args:
        features: Frame of candidate features handed to ``FeatureSelector``.
        missing_threshold: Forwarded to ``FeatureSelector.identify_missing``;
            also printed to stdout before the selector is built.
        correlation_threshold: When strictly below 1, forwarded to
            ``FeatureSelector.identify_collinear`` and the 'collinear' method
            is added to the removal list. At 1 or above (the default) the
            collinearity step is skipped entirely.

    Returns:
        The result of ``FeatureSelector.remove`` for the selected methods.
    """
    print(missing_threshold)

    selector = FeatureSelector(data=features)
    selector.identify_missing(missing_threshold)
    selector.identify_single_unique()

    removal_methods = ['missing', 'single_unique']
    if correlation_threshold < 1:
        # Only run (and remove by) the collinearity check when requested.
        selector.identify_collinear(correlation_threshold)
        removal_methods.append("collinear")

    return selector.remove(methods=removal_methods)
Exemple #3
0
def featureselect(datas, target):
    """Run a fixed FeatureSelector pipeline and return the reduced data.

    Steps, in order: missing (threshold 0.6), collinear (threshold 0.9),
    zero importance (classification, AUC, 10 iterations, no early stopping),
    low importance (cumulative importance 0.9), then removal with
    ``methods='all'``.

    Args:
        datas: Feature data passed to ``FeatureSelector`` as ``data``.
        target: Labels passed to ``FeatureSelector`` as ``labels``.

    Returns:
        The result of ``FeatureSelector.remove(methods='all')``.
    """
    import os

    # NOTE(review): this changes the process-wide working directory to a
    # hard-coded path and never restores it -- presumably so the import
    # below can find feature_selector; confirm before reusing elsewhere.
    os.chdir('c:\\Users\\SA\\python\\練習py')
    from feature_selector import FeatureSelector

    selector = FeatureSelector(data=datas, labels=target)

    selector.identify_missing(missing_threshold=0.6)
    selector.identify_collinear(correlation_threshold=0.9)

    importance_options = {
        'task': 'classification',
        'eval_metric': 'auc',
        'n_iterations': 10,
        'early_stopping': False,
    }
    selector.identify_zero_importance(**importance_options)
    selector.identify_low_importance(cumulative_importance=0.9)

    return selector.remove(methods='all')
Exemple #4
0
    def runFeatureSelector(self, df):
        """Run the feature-selection steps, dump their records to CSV, and
        optionally drop the flagged features.

        Reads ``self.targets`` (labels for the selector) and ``self.drop``
        (removal flag). Writes several CSV files under ``.\\utils\\csv\\``.

        Args:
            df: Feature data passed to ``FeatureSelector`` as ``data``.

        Returns:
            ``FeatureSelector.remove(methods='all')`` when ``self.drop == 1``;
            otherwise ``df`` unchanged.
        """
        logging.info("Running Feature Selection")
        selector = FeatureSelector(data=df, labels=self.targets)

        # Step 1: missing values (no record file is written for this step).
        selector.identify_missing(missing_threshold=0.6)

        # Step 2: collinear features.
        selector.identify_collinear(correlation_threshold=0.98)
        selector.record_collinear.to_csv(
            ".\\utils\\csv\\record_collinear.csv")

        # Step 3: features with a single unique value.
        selector.identify_single_unique()
        selector.record_single_unique.to_csv(
            ".\\utils\\csv\\record_single_unique.csv")

        # Step 4: zero-importance features.
        zero_importance_options = {
            'task': 'classification',
            'eval_metric': 'multi_logloss',
            'n_iterations': 10,
            'early_stopping': True,
        }
        selector.identify_zero_importance(**zero_importance_options)
        selector.record_zero_importance.to_csv(
            ".\\utils\\csv\\record_zero_importance.csv")

        # Step 5: low-importance features; the full importance table is saved.
        selector.identify_low_importance(cumulative_importance=0.99)
        selector.feature_importances.to_csv(
            ".\\utils\\csv\\feature_importance.csv")

        # Summary of everything recorded in selector.ops, one row per key.
        ops_summary = pd.DataFrame.from_dict(selector.ops, orient='index')
        ops_summary.to_csv(".\\utils\\csv\\summary.csv")

        # Only remove the suggested features when the drop flag is exactly 1.
        if self.drop != 1:
            return df
        return selector.remove(methods='all')
Exemple #5
0
    def remove_unnecessary_features(self, auto=False):
        """Shrink ``self.processed_data`` in place; returns nothing.

        Args:
            auto: When truthy, simply drop the columns listed in
                ``self.predefined_skip_features``. Otherwise run a
                ``FeatureSelector`` pipeline on every column except "label"
                and re-attach the "label" column to the reduced frame.
        """
        if auto:
            self.processed_data = self.processed_data.drop(
                columns=self.predefined_skip_features)
            return

        # Separate the target column from the candidate features.
        label_column = self.processed_data["label"]
        feature_frame = self.processed_data.drop("label", axis=1)

        selector = FeatureSelector(data=feature_frame, labels=label_column)
        selector.identify_missing(missing_threshold=0.6)
        selector.identify_collinear(correlation_threshold=0.98)
        selector.identify_zero_importance(
            task='classification',
            eval_metric='auc',
            n_iterations=10,
            early_stopping=False,
        )
        selector.identify_low_importance(cumulative_importance=0.99)
        selector.identify_single_unique()

        # Drop everything flagged by the steps above, then restore the labels.
        self.processed_data = selector.remove(methods='all')
        self.processed_data["label"] = label_column
Exemple #6
0
#-- Create a FeatureSelector over the feature frame and its training labels
fs = FeatureSelector(data = df_feats, labels = train_labels)

#-- Identify redundant features
if(USE_LEARNER_FOR_FEATURE_SELECTION):
    # NOT COMPLETE (original author's note)
    # Run all identification steps in one call with the parameters below.
    fs.identify_all(selection_params = {'missing_threshold': 0.6, 'correlation_threshold': 0.98, 
                                    'task': 'classification', 'eval_metric': 'auc', 
                                     'cumulative_importance': 0.99})
    #-- Get valuable features: remove everything flagged, with keep_one_hot=True
    X = fs.remove(methods = 'all', keep_one_hot = True)

else:
    #-- Features with missing values greater than threshold
    fs.identify_missing(missing_threshold = MISSING_VALUE_THRESHOLD)
    #-- Correlated features
    fs.identify_collinear(correlation_threshold = CORRELATION_THRESHOLD)
    #-- Single unique value
    fs.identify_single_unique()
    
    #-- The available keys can be listed with fs.ops.keys()
    missing_features = list(fs.ops['missing'])
    corelated_features = list(fs.ops['collinear'])
    single_value = list(fs.ops['single_unique'])
    
    #-- Union of all flagged feature names; `flatten` is defined outside this snippet
    r = set(flatten([missing_features,corelated_features,single_value]))
    #-- NOTE(review): the drop below is commented out, so X is not assigned in this branch
    #X = df_feats.drop(r, axis=1)    
    

     
Exemple #7
0
# DataFrame.head() shows the first five rows by default; train is presumably
# already a pandas DataFrame, so .head() can be called on it directly.
print(train.head())

# Use DataFrame.drop to remove the TARGET column.
train = train.drop(['TARGET'], axis=1)

# In pandas, row labels are the index and column labels are the columns,
# e.g. the common construction:
# df = pd.DataFrame(np.random.randn(5,3),index = list('abcde'),columns = ['one','two','three'])

#Create the Instance
fs = FeatureSelector(data=train, labels=train_labels)

#   1   Missing Values

fs.identify_missing(missing_threshold=0.6)

#The features identified for removal can be accessed through the ops dictionary of the FeatureSelector object.
missing_features = fs.ops['missing']
print(missing_features[:20])

fs.plot_missing()  # call plt.show() after each plotting call to display the figure
plt.show()
print(fs.missing_stats.head(20))

#   2   Single Unique Value

fs.identify_single_unique()

# Features flagged by the single-unique step, read back from the ops dictionary.
single_unique = fs.ops['single_unique']
print(single_unique)
Exemple #8
0
# Load the test and train CSV files from the directory prefix path0.
test = pd.read_csv(path0 + 'test.csv')
train = pd.read_csv(path0 + 'train.csv')

# Show how many rows carry each value of the 'Tag' column.
print('tag_value_counts', train['Tag'].value_counts())

# Split off the 'Tag' column as labels and drop it, along with 'UID', from the features.
train_labels = train['Tag']
train = train.drop(['UID', 'Tag'], axis=1)

# Test features: everything except the 'UID' column.
X_loc_test = test.drop('UID', axis=1)

from feature_selector import FeatureSelector
# Features are in train and labels are in train_labels
fs = FeatureSelector(data=train, labels=train_labels)

# Missing-value statistics
fs.identify_missing(0.5)
df_miss_value = fs.missing_stats.sort_values('missing_fraction',
                                             ascending=False)
print('df_miss_value', df_miss_value.head(15))
missing_features = fs.ops['missing']
print('missing_features to remove', missing_features[:20])

# Single-unique-value feature statistics
fs.identify_single_unique()
# NOTE(review): this prints the return value of the plotting call, which is
# presumably None -- confirm whether only the plot side effect is wanted.
print('fs.plot_unique()', fs.plot_unique())

fs.identify_collinear(0.95)
# NOTE(review): same pattern as above -- prints whatever plot_collinear() returns.
print('plot_collinear()', fs.plot_collinear())

# list of collinear features to remove
collinear_features = fs.ops['collinear']
Exemple #9
0
# Attach the K-Means cluster assignments to the working frame as "Is_Male".
data_new2["Is_Male"] = y_pred_kmeans

# Take an explicit copy of the selected columns so that adding a column below
# writes to an independent frame instead of to a selection of data_new2
# (assigning on the bare selection triggers pandas' SettingWithCopyWarning).
test = data_new2[important_features].copy()
test["Is_Male"] = y_pred_kmeans

# Build the dataset for the ANN: a copy of `data` plus the cluster labels,
# without the customer_id column.
data_ANN = data.copy()
data_ANN["Is_Male"] = y_pred_kmeans
data_ANN.drop(columns="customer_id", inplace=True)
# Positional split: the first 42 columns are the features, the column at
# position 42 is the target.
# NOTE(review): assumes "Is_Male" lands at position 42 after the drop -- confirm.
X = data_ANN.iloc[:, :42]
y = data_ANN.iloc[:, 42]
# Step 1 - Feature selection
from feature_selector import FeatureSelector
fts = FeatureSelector(X, y)
fts.identify_missing(missing_threshold=0.9)

# Collinear features at a 0.7 correlation threshold, plotted and then read
# back from the ops dictionary.
fts.identify_collinear(correlation_threshold=0.7)
fts.plot_collinear()
collinear_features = fts.ops['collinear']

# Zero-importance features (classification task, AUC metric, 30 iterations,
# early stopping enabled).
fts.identify_zero_importance(task='classification',
                             eval_metric='auc',
                             n_iterations=30,
                             early_stopping=True)
zero_importance_features = fts.ops['zero_importance']

fts.plot_feature_importances(threshold=0.99, plot_n=12)
# First 28 entries of the "feature" column of the importance table
# (presumably ordered by importance -- confirm against the library).
Most_important_Features = list(fts.feature_importances["feature"].head(28))

# Restrict the ANN dataset to those 28 columns.
Data_ANN_2 = data_ANN[Most_important_Features]
# In[7]:

from feature_selector import FeatureSelector

# In[8]:

# Wrap data_np in a DataFrame so it can be handed to FeatureSelector.
data = pd.DataFrame(data_np)

# In[27]:

# Selector over the frame above, with `label` supplied as the labels.
fc = FeatureSelector(data, labels=label)

# In[10]:

# Flag features using a missing threshold of 0.95.
fc.identify_missing(missing_threshold=0.95)

# In[11]:

# Inspect the first rows of the per-feature missing statistics.
fc.missing_stats.head()

# In[12]:

# Flag collinear features using a correlation threshold of 0.98.
fc.identify_collinear(correlation_threshold=0.98)

# In[17]:

# Flag zero-importance features (classification task, AUC metric,
# 10 iterations, early stopping enabled).
fc.identify_zero_importance(task='classification',
                            eval_metric='auc',
                            n_iterations=10,
                            early_stopping=True)