Esempio n. 1
0
def select_features_without_label(features: pd.DataFrame, missing_threshold=0.7,
                                  correlation_threshold=0.95) -> pd.DataFrame:
    """Drop columns flagged as mostly-missing or single-valued.

    Builds a ``FeatureSelector`` over ``features`` (no labels are supplied),
    runs the missing-value and single-unique-value checks, and returns
    whatever ``FeatureSelector.remove`` produces for those two methods.

    Args:
        features: Frame of candidate feature columns.
        missing_threshold: Value forwarded to ``identify_missing``.
        correlation_threshold: Accepted for interface compatibility but
            currently unused; the collinearity check is disabled here.

    Returns:
        The result of ``remove(methods=['missing', 'single_unique'])``.
    """
    selector = FeatureSelector(data=features)

    # Each identify_* call records its findings on the selector; remove()
    # then acts on the methods named below.
    selector.identify_missing(missing_threshold)
    selector.identify_single_unique()

    active_methods = ['missing', 'single_unique']
    return selector.remove(methods=active_methods)
def select_features_without_label(features: pd.DataFrame,
                                  missing_threshold=0.90,
                                  correlation_threshold=1) -> pd.DataFrame:
    """Drop columns flagged as mostly-missing, single-valued or collinear.

    Builds a ``FeatureSelector`` over ``features`` (no labels are supplied)
    and always runs the missing-value and single-unique-value checks. The
    collinearity check is run, and its findings removed, only when
    ``correlation_threshold`` is strictly below 1.

    Args:
        features: Frame of candidate feature columns.
        missing_threshold: Value forwarded to ``identify_missing``; it is
            also printed to stdout on every call.
        correlation_threshold: Value forwarded to ``identify_collinear``.
            A value of 1 or more skips the collinearity step entirely.

    Returns:
        The result of ``FeatureSelector.remove`` for the methods that ran.
    """
    print(missing_threshold)

    selector = FeatureSelector(data=features)
    selector.identify_missing(missing_threshold)
    selector.identify_single_unique()

    active_methods = ['missing', 'single_unique']
    if correlation_threshold < 1:
        selector.identify_collinear(correlation_threshold)
        active_methods.append("collinear")

    return selector.remove(methods=active_methods)
Esempio n. 3
0
    def runFeatureSelector(self, df):
        """Run the FeatureSelector checks on ``df`` and dump each report to CSV.

        The checks run in this order: missing values, collinearity, single
        unique value, zero importance, low importance. After each of the
        last four, the corresponding record (or the feature importances) is
        written as a CSV file under ``./utils/csv``; a summary built from
        ``fs.ops`` is written last.

        Args:
            df: Feature frame handed to ``FeatureSelector`` together with
                ``self.targets`` as the labels.

        Returns:
            The result of ``fs.remove(methods='all')`` when ``self.drop == 1``;
            otherwise ``df`` exactly as it was passed in.
        """
        # Local import so this method does not depend on a module-level
        # ``import os`` that may not exist in the file.
        import os

        logging.info("Running Feature Selection")
        fs = FeatureSelector(data=df, labels=self.targets)

        # Build the report paths with os.path.join instead of hard-coded
        # backslashes: on Windows this yields the same ".\utils\csv\..."
        # strings as before, while on POSIX the files land in ./utils/csv
        # rather than in one oddly named file in the working directory.
        out_dir = os.path.join(".", "utils", "csv")
        os.makedirs(out_dir, exist_ok=True)

        # Identify Missing Values
        fs.identify_missing(missing_threshold=0.6)

        # Identify Collinearity
        fs.identify_collinear(correlation_threshold=0.98)
        fs.record_collinear.to_csv(
            os.path.join(out_dir, "record_collinear.csv"))

        # Identify Single Unique
        fs.identify_single_unique()
        fs.record_single_unique.to_csv(
            os.path.join(out_dir, "record_single_unique.csv"))

        # Zero importance
        fs.identify_zero_importance(task='classification',
                                    eval_metric='multi_logloss',
                                    n_iterations=10,
                                    early_stopping=True)
        fs.record_zero_importance.to_csv(
            os.path.join(out_dir, "record_zero_importance.csv"))

        # Low Importance (must follow the zero-importance step above)
        fs.identify_low_importance(cumulative_importance=0.99)
        fs.feature_importances.to_csv(
            os.path.join(out_dir, "feature_importance.csv"))

        # Generate a summary of all operations
        summary = pd.DataFrame.from_dict(fs.ops, orient='index')
        summary.to_csv(os.path.join(out_dir, "summary.csv"))

        # If the drop flag is 1, go ahead and remove the suggested features
        if self.drop == 1:
            df = fs.remove(methods='all')

        return df
Esempio n. 4
0
    def remove_unnecessary_features(self, auto=False):
        """Prune columns from ``self.processed_data`` in place.

        Args:
            auto: When truthy, simply drop the columns listed in
                ``self.predefined_skip_features``. Otherwise run the
                ``FeatureSelector`` checks against the ``"label"`` column
                and keep whatever ``remove(methods='all')`` returns, with
                the label column re-attached.

        Returns:
            None. The result is stored back on ``self.processed_data``.
        """
        if auto:
            self.processed_data = self.processed_data.drop(
                columns=self.predefined_skip_features)
            return

        # Split the frame into features and target once, up front.
        feature_frame = self.processed_data.drop("label", axis=1)
        label_column = self.processed_data["label"]
        selector = FeatureSelector(data=feature_frame, labels=label_column)

        selector.identify_missing(missing_threshold=0.6)
        selector.identify_collinear(correlation_threshold=0.98)
        selector.identify_zero_importance(task='classification',
                                          eval_metric='auc',
                                          n_iterations=10,
                                          early_stopping=False)
        selector.identify_low_importance(cumulative_importance=0.99)
        selector.identify_single_unique()

        # Remove the features flagged by every method (returns a frame),
        # then put the target column back.
        self.processed_data = selector.remove(methods='all')
        self.processed_data["label"] = label_column
Esempio n. 5
0
def select_best_features(data_file_path, saveto_path="Default"):
    """Reduce the feature columns of a labelled CSV and save the result.

    The input is first passed through ``strip_header``, which returns the
    path of the file that is actually parsed; that intermediate file is
    deleted before this function returns, even if an error occurs. The
    ``Label`` column is used as the target. Features flagged by the
    single-unique, collinear, zero-importance and low-importance checks
    are removed.

    The output file starts with one metadata line, ``<rows>,<features>``
    (the feature count excludes ``Label``), followed by the reduced data
    as CSV with ``Label`` as the last column.

    Args:
        data_file_path: Path of the source data file.
        saveto_path: Destination path. The sentinel ``"Default"`` derives
            it from ``data_file_path`` via ``replace_ext(..., '_reduced.csv')``.

    Returns:
        None.
    """
    mod_data_file_path = strip_header(data_file_path)

    try:
        if saveto_path == "Default":
            saveto_path = replace_ext(data_file_path, '_reduced.csv')

        X = pd.read_csv(mod_data_file_path)
        y = X['Label']
        X = X.drop(columns=['Label'])

        feature_selector = FeatureSelector(data=X, labels=y)
        feature_selector.identify_single_unique()
        feature_selector.identify_collinear(correlation_threshold=0.98)
        feature_selector.identify_zero_importance(task='classification',
                                                  eval_metric='auc',
                                                  n_iterations=10,
                                                  early_stopping=True)
        # Low importance builds on the zero-importance results above.
        feature_selector.identify_low_importance(cumulative_importance=0.99)

        X_dash = feature_selector.remove(
            methods=[
                'single_unique', 'collinear', 'zero_importance',
                'low_importance'
            ],
            keep_one_hot=False)
        X_dash['Label'] = y

        meta_data = [str(X_dash.shape[0]), str(X_dash.shape[1] - 1)]

        # Write the metadata line and the CSV body in a single pass instead
        # of writing the CSV, reading it back and rewriting it. newline=''
        # lets pandas control the line terminator (os.linesep by default),
        # and the metadata line uses the same terminator for consistency.
        with open(saveto_path, 'w', newline='', encoding='utf-8') as fh:
            fh.write(','.join(meta_data) + os.linesep)
            X_dash.to_csv(fh, index=False)
    finally:
        # Portable, shell-free replacement for `rm -f`: the path is never
        # interpreted by a shell, and, like `rm -f` whose exit status was
        # ignored, a failed cleanup does not abort the caller.
        try:
            os.remove(mod_data_file_path)
        except OSError:
            pass
Esempio n. 6
0
#-- Identify redundant features
# Two strategies, chosen by USE_LEARNER_FOR_FEATURE_SELECTION:
#   * truthy: run every FeatureSelector check and keep the reduced frame in X;
#   * falsy:  run only the label-free checks and collect the flagged column
#             names in the set `r` (nothing is dropped in this branch).
if USE_LEARNER_FOR_FEATURE_SELECTION:
    # NOT COMPLETE (original author's note)
    fs.identify_all(
        selection_params={
            'missing_threshold': 0.6,
            'correlation_threshold': 0.98,
            'task': 'classification',
            'eval_metric': 'auc',
            'cumulative_importance': 0.99,
        })
    #-- Get valuable features
    X = fs.remove(methods='all', keep_one_hot=True)
else:
    #-- Features with missing values greater than threshold
    fs.identify_missing(missing_threshold=MISSING_VALUE_THRESHOLD)
    #-- Correlated features
    fs.identify_collinear(correlation_threshold=CORRELATION_THRESHOLD)
    #-- Single unique value
    fs.identify_single_unique()

    #-- The available keys can be listed with fs.ops.keys()
    missing_features, corelated_features, single_value = (
        list(fs.ops[method])
        for method in ('missing', 'collinear', 'single_unique')
    )

    #-- Union of every column flagged by the three checks
    r = set(flatten([missing_features, corelated_features, single_value]))
    # NOTE: X is not assigned in this branch; the drop is left disabled:
    #X = df_feats.drop(r, axis=1)


rnk_pval = getPvalStats(df, 'target')


feat_types = pd.DataFrame(df_feats.dtypes)