Esempio n. 1
0
def prepare_data():
    """Read, clean and prepare the UNSW-NB15 datasets for training.

    Loads the feature-description, training and testing CSV files, drops
    collinear columns from the training set (mirroring the same column
    selection on the test set), shuffles both frames and delegates the
    final cleaning to clean_nominals_and_create_our_datasets.

    Returns:
        train_x, train_y, test_x, test_y, labels
    """
    # Load the raw CSV files; the "id" column carries no information.
    features_df = pd.read_csv("UNSW_NB15_features.csv", encoding="ISO-8859-1")
    training_df = pd.read_csv("training.csv").drop("id", axis=1)
    testing_df = pd.read_csv("testing.csv").drop("id", axis=1)

    # Drop highly correlated features from the training frame and keep
    # the test frame aligned with the surviving columns.
    selector = FeatureSelector(data=training_df)
    selector.identify_collinear(correlation_threshold=0.85)
    training_df = selector.remove(methods=['collinear'], keep_one_hot=True)
    testing_df = testing_df[list(training_df)]

    # Shuffle both frames before building the final datasets.
    training_df = training_df.sample(frac=1)
    testing_df = testing_df.sample(frac=1)
    train_x, train_y, test_x, test_y, labels = \
        clean_nominals_and_create_our_datasets(training_df, testing_df)

    # Report the feature names actually used (target columns excluded).
    training_df = training_df.drop(["attack_cat", "label"], axis=1)
    print("The features we will use are: ", np.array(list(training_df)))

    return train_x, train_y, test_x, test_y, labels
Esempio n. 2
0
def test():
    """Smoke-test FeatureSelector collinearity detection on menori.xlsx.

    Loads the spreadsheet, separates the "nums" column as the label,
    flags features correlated above 0.98 and prints them.
    """
    frame = pd.read_excel('../data/menori.xlsx')
    target = frame["nums"]
    print(frame.head())
    frame = frame.drop(columns=["nums"])
    selector = FeatureSelector(data=frame, labels=target)
    selector.identify_collinear(correlation_threshold=0.98)
    print(selector.ops['collinear'])
def select_features_without_label(features: pd.DataFrame,
                                  missing_threshold=0.99,
                                  correlation_threshold=1.0) -> pd.DataFrame:
    """Drop uninformative columns without using any target variable.

    Removes features that are mostly missing or single-valued; when
    ``correlation_threshold`` is below 1, collinear features are removed
    as well.

    Args:
        features: input DataFrame.
        missing_threshold: fraction of missing values above which a
            column is dropped.
        correlation_threshold: pairwise-correlation cutoff; 1.0 (the
            default) disables the collinearity check entirely.

    Returns:
        DataFrame with the flagged columns removed.
    """
    selector = FeatureSelector(data=features)
    selector.identify_missing(missing_threshold)
    selector.identify_single_unique()

    methods = ['missing', 'single_unique']
    if correlation_threshold < 1:
        selector.identify_collinear(correlation_threshold)
        methods.append('collinear')
    return selector.remove(methods=methods)
Esempio n. 4
0
def transform_to_nominal():
    """Derive class labels and nominal columns from the training CSV.

    Reads training.csv, prunes collinear features (threshold 0.85),
    shuffles, strips the target columns and extracts the class labels
    plus the nominal column names via retrieve_classes.

    Returns:
        labels, nominal_cols, columnList
    """
    frame = pd.read_csv("training.csv").drop("id", axis=1)

    # Remove highly correlated features before inspecting the columns.
    selector = FeatureSelector(data=frame)
    selector.identify_collinear(correlation_threshold=0.85)
    frame = selector.remove(methods=['collinear'], keep_one_hot=True)

    # Shuffle, then drop the targets so only predictors remain.
    frame = frame.sample(frac=1)
    frame = frame.drop(["attack_cat", "label"], axis=1)
    columnList = list(frame)
    labels, nominal_cols = retrieve_classes(frame)

    return labels, nominal_cols, columnList
Esempio n. 5
0
def featureselect(datas, target):
    """Run the full FeatureSelector pipeline and return the pruned frame.

    Flags features that are >60% missing, collinear above 0.9, of zero
    importance for a LightGBM AUC classifier, or beyond 90% cumulative
    importance, then removes all of them at once.

    Args:
        datas: feature DataFrame.
        target: label Series aligned with ``datas``.

    Returns:
        DataFrame with all flagged features removed.
    """
    import os
    import sys
    # Make the directory containing feature_selector.py importable without
    # changing the process working directory: the original os.chdir() hack
    # silently altered global state for every subsequent relative path in
    # the program.
    module_dir = 'c:\\Users\\SA\\python\\練習py'
    if module_dir not in sys.path:
        sys.path.insert(0, module_dir)
    from feature_selector import FeatureSelector

    fs = FeatureSelector(data=datas, labels=target)

    fs.identify_missing(missing_threshold=0.6)
    fs.identify_collinear(correlation_threshold=0.9)
    fs.identify_zero_importance(task='classification',
                                eval_metric='auc',
                                n_iterations=10,
                                early_stopping=False)
    fs.identify_low_importance(cumulative_importance=0.9)

    train_removed = fs.remove(methods='all')
    return train_removed
Esempio n. 6
0
    def runFeatureSelector(self, df):
        """Run all FeatureSelector analyses on ``df``, dumping reports to CSV.

        Identifies missing-value, collinear, single-unique, zero-importance
        and low-importance features; writes each intermediate record under
        .\\utils\\csv\\. When ``self.drop`` equals 1 the flagged features
        are removed from the returned frame, otherwise ``df`` is returned
        unchanged.

        Args:
            df: feature DataFrame; labels come from ``self.targets``.

        Returns:
            The (possibly pruned) DataFrame.
        """
        # Fixed: the original wrapped the message in redundant parentheses.
        logging.info("Running Feature Selection")
        fs = FeatureSelector(data=df, labels=self.targets)

        # Identify Missing Values
        fs.identify_missing(missing_threshold=0.6)

        # Identify Collinearity
        fs.identify_collinear(correlation_threshold=0.98)
        fs.record_collinear.to_csv(".\\utils\\csv\\record_collinear.csv")

        # Identify Single Unique
        fs.identify_single_unique()
        fs.record_single_unique.to_csv(
            ".\\utils\\csv\\record_single_unique.csv")

        # Zero importance (LightGBM-based)
        fs.identify_zero_importance(task='classification',
                                    eval_metric='multi_logloss',
                                    n_iterations=10,
                                    early_stopping=True)
        fs.record_zero_importance.to_csv(
            ".\\utils\\csv\\record_zero_importance.csv")

        # Low Importance
        fs.identify_low_importance(cumulative_importance=0.99)
        fs.feature_importances.to_csv(".\\utils\\csv\\feature_importance.csv")

        # Generate a summary of all operations
        summary = pd.DataFrame.from_dict(fs.ops, orient='index')
        summary.to_csv(".\\utils\\csv\\summary.csv")

        # Only drop the flagged features when explicitly requested
        # (the original dead "else: pass" branch has been removed).
        if self.drop == 1:
            df = fs.remove(methods='all')

        return df
Esempio n. 7
0
    def remove_unnecessary_features(self, auto=False):
        """Prune uninformative columns from ``self.processed_data``.

        When ``auto`` is true, the predefined skip list is dropped
        outright. Otherwise a FeatureSelector pass (missing, collinear,
        zero/low importance, single-unique) decides which columns to
        remove, and the "label" column is re-attached afterwards.
        """
        if auto:
            self.processed_data = self.processed_data.drop(
                columns=self.predefined_skip_features)
            return

        target = self.processed_data["label"]
        selector = FeatureSelector(
            data=self.processed_data.drop("label", axis=1),
            labels=target)
        selector.identify_missing(missing_threshold=0.6)
        selector.identify_collinear(correlation_threshold=0.98)

        selector.identify_zero_importance(task='classification',
                                          eval_metric='auc',
                                          n_iterations=10,
                                          early_stopping=False)

        selector.identify_low_importance(cumulative_importance=0.99)
        selector.identify_single_unique()

        # remove() returns a new frame without the target column, so put
        # the saved labels back on the pruned data.
        self.processed_data = selector.remove(methods='all')
        self.processed_data["label"] = target
Esempio n. 8
0
def select_best_features(data_file_path, saveto_path="Default"):
    """Reduce a labelled CSV to its most informative feature columns.

    Strips the header metadata from ``data_file_path``, runs a
    FeatureSelector pass (single-unique, collinear, zero-importance,
    low-importance) against the "Label" column, writes the pruned data to
    ``saveto_path`` with a "rows,feature_count" metadata line prepended,
    then deletes the intermediate stripped file.

    Args:
        data_file_path: path to the raw CSV (with extra header lines).
        saveto_path: output path; "Default" derives "<input>_reduced.csv".
    """
    mod_data_file_path = strip_header(data_file_path)

    if saveto_path == "Default":
        saveto_path = replace_ext(data_file_path, '_reduced.csv')

    X = pd.read_csv(mod_data_file_path)
    y = X['Label']
    X = X.drop(columns=['Label'])

    feature_selector = FeatureSelector(data=X, labels=y)
    feature_selector.identify_single_unique()
    feature_selector.identify_collinear(correlation_threshold=0.98)
    feature_selector.identify_zero_importance(task='classification',
                                              eval_metric='auc',
                                              n_iterations=10,
                                              early_stopping=True)
    features_1hot = feature_selector.one_hot_features
    features_base = feature_selector.base_features
    feature_selector.identify_low_importance(cumulative_importance=0.99)

    X_dash = feature_selector.remove(methods=[
        'single_unique', 'collinear', 'zero_importance', 'low_importance'
    ],
                                     keep_one_hot=False)
    X_dash['Label'] = y

    X_dash.to_csv(saveto_path, index=False)

    # Prepend "<n_rows>,<n_features>" as a metadata line.
    meta_data = [str(X_dash.shape[0]), str(X_dash.shape[1] - 1)]
    with open(saveto_path, 'r') as fh:
        contents = fh.read()
    contents = ','.join(meta_data) + '\n' + contents
    with open(saveto_path, 'w') as fh:
        fh.write(contents)

    # Delete the intermediate file portably. The original
    # os.system("rm -f " + path) was shell-dependent and unsafe for paths
    # containing spaces or shell metacharacters.
    try:
        os.remove(mod_data_file_path)
    except FileNotFoundError:
        pass  # mirror "rm -f": a missing file is not an error
Esempio n. 9
0
# Features are in train and labels are in train_labels
fs = FeatureSelector(data=train, labels=train_labels)

# Missing-value statistics: flag features with more than 50% missing values
fs.identify_missing(0.5)
df_miss_value = fs.missing_stats.sort_values('missing_fraction',
                                             ascending=False)
print('df_miss_value', df_miss_value.head(15))
missing_features = fs.ops['missing']
print('missing_features to remove', missing_features[:20])

# Single-unique-value feature statistics
fs.identify_single_unique()
print('fs.plot_unique()', fs.plot_unique())

# Flag features with pairwise correlation above 0.95
fs.identify_collinear(0.95)
print('plot_collinear()', fs.plot_collinear())

# list of collinear features to remove
collinear_features = fs.ops['collinear']
print('collinear_features', collinear_features)

# dataframe of collinear features
df_collinear_features = fs.record_collinear.sort_values('corr_value',
                                                        ascending=False)
print('df_collinear_features', df_collinear_features.head(50))

# Zero-importance feature statistics
# Pass in the appropriate parameters
fs.identify_zero_importance(task='classification',
                            eval_metric=tpr_weight_funtion_lc,
Esempio n. 10
0
                                                    test_size=0.2,
                                                    random_state=0)

# Feature scaling
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training set only, then apply the same transform
# to the test set so no test-set statistics leak into training.
sc = StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(sc.transform(X_test), columns=X_test.columns)

# Feature selection (remove highly correlated features)
from feature_selector import FeatureSelector

n = len(X_train.T)  # number of features before selection
fs = FeatureSelector(data=X_train)
fs.identify_collinear(
    correlation_threshold=0.7)  # select features from training set
corr = fs.ops['collinear']
X_train = fs.remove(methods=['collinear'
                             ])  # remove selected features from training set
to_remove = pd.unique(
    fs.record_collinear['drop_feature'])  # features to remove
X_test = X_test.drop(
    columns=to_remove)  # remove selected features from test set

# Create the artificial neural network
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

num_input_nodes = len(X_train.T)  # features remaining after selection
Esempio n. 11
0
from feature_selector import FeatureSelector

# In[8]:

train_labels = train_data['label']
# Drop identifiers, the target, and the one-hot encoded phone-brand
# columns so only the model features remain.
train_features = train_data.drop(columns=[
    'user', 'product_nbr', 'last_year_capture_user_flag', 'label',
    'pro_brand_-1', 'pro_brand_Apple', 'pro_brand_三星', 'pro_brand_其他',
    'pro_brand_华为', 'pro_brand_小米', 'pro_brand_未知厂商', 'pro_brand_欧珀',
    'pro_brand_维沃'
])

# In[14]:

fs = FeatureSelector(data=train_features, labels=train_labels)
fs.identify_collinear(correlation_threshold=0.9, one_hot=False)
# Plot a correlation heatmap of the selected features
fs.plot_collinear()
# List the collinear features to be removed
collinear_features = fs.ops['collinear']
# Inspect the dataframe describing the collinear feature pairs
fs.record_collinear

# In[20]:

train_data = train_data.drop(columns=collinear_features)

# In[21]:

train_data.shape
Esempio n. 12
0
def featureSelect(x, y):
    """Return the collinear features of ``x`` (correlation above 0.99)."""
    selector = FeatureSelector(data=x, labels=y)
    selector.identify_collinear(correlation_threshold=0.99)
    return selector.ops['collinear']
Esempio n. 13
0
test = data_new2[important_features]
test["Is_Male"] = y_pred_kmeans

# Building the ANN now: attach the cluster-derived gender prediction as
# the target and drop the identifier column.
data_ANN = data.copy()
data_ANN["Is_Male"] = y_pred_kmeans
data_ANN.drop(columns="customer_id", inplace=True)
X = data_ANN.iloc[:, :42]  # first 42 columns are the features
y = data_ANN.iloc[:, 42]  # column 42 is the "Is_Male" target
# Step 1 - Feature selection
from feature_selector import FeatureSelector
fts = FeatureSelector(X, y)
fts.identify_missing(missing_threshold=0.9)

# Flag features with pairwise correlation above 0.7
fts.identify_collinear(correlation_threshold=0.7)
fts.plot_collinear()
collinear_features = fts.ops['collinear']

# Flag zero-importance features via a LightGBM AUC classifier
fts.identify_zero_importance(task='classification',
                             eval_metric='auc',
                             n_iterations=30,
                             early_stopping=True)
zero_importance_features = fts.ops['zero_importance']

# Keep only the 28 most important features for the ANN input
fts.plot_feature_importances(threshold=0.99, plot_n=12)
Most_important_Features = list(fts.feature_importances["feature"].head(28))

Data_ANN_2 = data_ANN[Most_important_Features]
X = Data_ANN_2.iloc[:, :]
y = data_ANN.iloc[:, 42]
Esempio n. 14
0
In this notebook we will test all the five function, to start we need to create an instance.

***Examples on how to use Feature Selector are in the end of this notebook***
"""

fs = FeatureSelector(data = df, labels = df_label)

"""###Removing Features

***Once we have identified the features to remove, we have a number of ways to drop the features. We can access any of the feature lists in the removal_ops dictionary and remove the columns manually. We also can use the remove method, passing in the methods that identified the features we want to remove***

Now, we will apply collinear, zero importance, low importance and sigle unique feature importance to selecti which columns to remove
"""

# Flag features with pairwise correlation above 0.975
fs.identify_collinear(correlation_threshold=0.975)

# Flag features with zero importance for a LightGBM AUC classifier
fs.identify_zero_importance(task = 'classification', eval_metric = 'auc', 
                            n_iterations = 10, early_stopping = True)

# Flag features beyond 99% cumulative importance
fs.identify_low_importance(cumulative_importance = 0.99)

# Flag features with a single unique value
fs.identify_single_unique()

# List every feature queued for removal across all checks
to_remove = fs.check_removal()

feature_df = fs.remove(
    methods = ['collinear', 'zero_importance', 'low_importance', 'single_unique'],
    keep_one_hot=False
)
Esempio n. 15
0
fs = FeatureSelector(data = df_feats, labels = train_labels)

#-- Identify redundant features
if(USE_LEARNER_FOR_FEATURE_SELECTION):
    # NOT COMPLETE
    # Run every FeatureSelector check in one learner-based pass.
    fs.identify_all(selection_params = {'missing_threshold': 0.6, 'correlation_threshold': 0.98, 
                                    'task': 'classification', 'eval_metric': 'auc', 
                                     'cumulative_importance': 0.99})
    #-- Get valuable features   
    X = fs.remove(methods = 'all', keep_one_hot = True)

else:
    #-- Features with missing values greater than threshold 
    fs.identify_missing(missing_threshold = MISSING_VALUE_THRESHOLD)
    #-- Correlated features
    fs.identify_collinear(correlation_threshold = CORRELATION_THRESHOLD)
    #-- Single unique value
    fs.identify_single_unique()
    
    #-- To list available keys: fs.ops.keys()
    missing_features = list(fs.ops['missing'])
    corelated_features = list(fs.ops['collinear'])
    single_value = list(fs.ops['single_unique'])
    
    # Union of all flagged features (deduplicated)
    r = set(flatten([missing_features,corelated_features,single_value]))
    #X = df_feats.drop(r, axis=1)    
    


rnk_pval = getPvalStats(df, 'target')    
# Re-index the monthly returns by (date, symbol) and derive outcomes:
# "win" = monthly return beat the S&P return; "rtoversp" = excess return.
dfm__ = dfm_.reset_index().set_index(['date', 'symbol'])
dfm__['win'] = (dfm__['trt1m'] > dfm__['sprtrn']).astype(np.int64)
dfm__['rtoversp'] = dfm__['trt1m'] - dfm__['sprtrn']
dfm__ = dfm__.dropna()
dfm__.isna().sum()

# Copy the derived columns onto the quarterly frame; drop incomplete rows.
df_mrq['win'] = dfm__.win
df_mrq['trt1m'] = dfm__.trt1m
df_mrq['sprtrn'] = dfm__.sprtrn
df_mrq['rtoversp'] = dfm__.rtoversp
df_mrq = df_mrq.dropna()

train = df_mrq.drop(columns=['dimension', 'win', 'rtoversp'])
train_labels = df_mrq['win']

# Collinearity-only feature selection on the quarterly data.
fs = FeatureSelector(data=train, labels=train_labels)
fs.identify_collinear(correlation_threshold=0.975)

#fs.plot_collinear(plot_all=True)

#fs.identify_zero_importance(task = 'regression', eval_metric = 'auc', n_iterations = 10, early_stopping = True)

#fs.identify_low_importance(cumulative_importance = 0.99)

all_to_remove = fs.check_removal()
print(all_to_remove)

df_mrq_pruned = df_mrq.drop(columns=all_to_remove)

# df_mrq_pruned.to_csv('data/SHARADAR_SF1_montly_combined_universe_MRQ.labelled.csv')
# In[27]:

fc = FeatureSelector(data, labels=label)

# In[10]:

# Flag features that are at least 95% missing
fc.identify_missing(missing_threshold=0.95)

# In[11]:

fc.missing_stats.head()

# In[12]:

# Flag features with pairwise correlation above 0.98
fc.identify_collinear(correlation_threshold=0.98)

# In[17]:

# Flag zero-importance features via a LightGBM AUC classifier
fc.identify_zero_importance(task='classification',
                            eval_metric='auc',
                            n_iterations=10,
                            early_stopping=True)

# In[18]:

# Flag features beyond 95% cumulative importance
fc.identify_low_importance(cumulative_importance=0.95)

# In[19]:

fc.identify_single_unique()