Example No. 1
def prepare_data():
	"""
	This function is the entry point of this module. It calls the functions above
	to read, clean, and save our data in a usable form.
	It exists so that dataset_prepare.py can be used as a Python module in our main program.

	Returns: training X,Y dataset and testing X,Y dataset
	"""

	# read our csv files
	features_df = pd.read_csv("UNSW_NB15_features.csv",encoding = "ISO-8859-1")
	training_df = pd.read_csv("training.csv").drop("id",axis=1)
	testing_df = pd.read_csv("testing.csv").drop("id",axis=1)

	fs = FeatureSelector(data = training_df)
	fs.identify_collinear(correlation_threshold=0.85)
	training_df = fs.remove(methods = ['collinear'],keep_one_hot = True)
	columnlist = list(training_df)
	testing_df = testing_df[columnlist]
	
	training_df = training_df.sample(frac=1)
	testing_df = testing_df.sample(frac=1)
	train_x,train_y,test_x,test_y, labels = clean_nominals_and_create_our_datasets(training_df,testing_df)

	training_df = training_df.drop(["attack_cat","label"], axis=1)
	print("The features we will use are: ", np.array(list(training_df)))

	return train_x,train_y,test_x,test_y,labels
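# prepare_data relies on clean_nominals_and_create_our_datasets, which is
# defined elsewhere in the module. A minimal sketch of what such a helper
# might look like (hypothetical reconstruction, assuming the UNSW-NB15
# nominal columns "proto", "service", "state"; not the author's code):
import pandas as pd

def clean_nominals_and_create_our_datasets(training_df, testing_df):
    # one-hot encode the nominal columns consistently across both frames
    combined = pd.concat([training_df, testing_df], keys=["train", "test"])
    combined = pd.get_dummies(combined, columns=["proto", "service", "state"])
    train, test = combined.loc["train"], combined.loc["test"]
    labels = sorted(training_df["attack_cat"].astype(str).unique())
    train_x = train.drop(["attack_cat", "label"], axis=1).values
    train_y = train["label"].values
    test_x = test.drop(["attack_cat", "label"], axis=1).values
    test_y = test["label"].values
    return train_x, train_y, test_x, test_y, labels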
Example No. 2
def test():
    train = pd.read_excel('../data/menori.xlsx')
    train_labels = train["nums"]
    print(train.head())
    train = train.drop(columns=["nums"])
    fs = FeatureSelector(data=train, labels=train_labels)
    fs.identify_collinear(correlation_threshold=0.98)
    correlated_features = fs.ops['collinear']
    print(correlated_features)
Example No. 3
def select_features_without_label(features: pd.DataFrame,
                                  missing_threshold=0.99,
                                  correlation_threshold=1.0) -> pd.DataFrame:
    fs = FeatureSelector(data=features)
    fs.identify_missing(missing_threshold)
    fs.identify_single_unique()
    if correlation_threshold < 1:
        fs.identify_collinear(correlation_threshold)
        return fs.remove(methods=['missing', 'single_unique', "collinear"])
    else:
        return fs.remove(methods=['missing', 'single_unique'])
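# A minimal usage sketch for select_features_without_label (toy data, not from
# the original source): columns over the missing threshold, constant columns,
# and, when correlation_threshold < 1, collinear columns are dropped in one call.
import numpy as np
import pandas as pd

raw = pd.DataFrame({
    'a': [1, 2, 3, 4],
    'b': [1, 1, 1, 1],                   # single unique value -> removed
    'c': [1, 2, 3, 4],                   # perfectly collinear with 'a' -> removed
    'd': [np.nan, np.nan, np.nan, 1.0],  # 75% missing -> removed at threshold 0.7
})
selected = select_features_without_label(raw, missing_threshold=0.7,
                                         correlation_threshold=0.95)
print(selected.columns.tolist())  # expected: ['a']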
Example No. 4
def transform_to_nominal(): 
    # read our csv files
    training_df = pd.read_csv("training.csv").drop("id",axis=1)
    
    # Feature selector
    fs = FeatureSelector(data = training_df)
    fs.identify_collinear(correlation_threshold=0.85)
    training_df = fs.remove(methods = ['collinear'],keep_one_hot = True)

    training_df = training_df.sample(frac=1)
    training_df = training_df.drop(["attack_cat","label"], axis=1)
    columnList = list(training_df)
    labels,nominal_cols = retrieve_classes(training_df)
    
    return labels,nominal_cols,columnList
Example No. 5
def clean_data(df, y=None, use_fs=True):
    # convert object to categorical data
    if 'thal' in df.columns:
        string_labels = ['thal']
        df[string_labels] = df[string_labels].apply(categorize_label, axis=0)
        df = pd.get_dummies(df, drop_first=True)
    # drop some columns
    to_drop = ['fasting_blood_sugar_gt_120_mg_per_dl', 'slope_of_peak_exercise_st_segment']
    df.drop(to_drop, axis=1, inplace=True)
    # normalize high variance columns
    # high_variance_cols = ['resting_blood_pressure']
    # df[high_variance_cols] = np.log(df[high_variance_cols])
    # convert int to float
    # df = df.apply(lambda c : c.astype(float), axis=1)
    if use_fs:
        fs = FeatureSelector(data=df, labels=y)
        fs.identify_zero_importance(task='classification', eval_metric='auc',
                                    n_iterations=10, early_stopping=False)
        fs.plot_feature_importances(threshold=0.99, plot_n=14)
    # print(train_removed_all_once)
    # standard scaling
    # scaler = RobustScaler()
    # df[df.columns] = scaler.fit_transform(df[df.columns])
    # print(df.info())
    # print('\nFeature Selector analysis')
    return df
Example No. 6
def feature_engineering(self, x_data, y_data, train=None):
    # feature selection
    cols = x_data.columns
    # consumption
    consume_col = cols[0:10]
    # recruitment
    recruit_col = cols[10:22]
    # acceleration
    acceleration_col = cols[22:32]
    # building
    build_col = cols[32:48]
    # technology
    science_col = cols[48:97]
    # pvp
    pvp_col = cols[97:103]
    # label
    # label_col = cols[108]
    # payment
    pay_col = cols[103:106]
    if train:
        fs = FeatureSelector(data=x_data, labels=DataFrame(y_data))
        fs.identify_all(
            selection_params={
                'missing_threshold': 0.6,
                'correlation_threshold': 0.98,
                'task': 'classification',
                'eval_metric': 'auc',
                'cumulative_importance': 0.99
            })
        self.drop_columns = fs.ops
        with open('drop_columns.pkl', 'wb') as file:
            pickle.dump(self.drop_columns, file)
        self.feature_df = fs.remove(methods='all', keep_one_hot=False)
    else:
        # at prediction time, drop the same columns that were removed during
        # training (self.drop_columns is assumed to have been restored, e.g.
        # from drop_columns.pkl)
        drop_list = []
        for key in self.drop_columns.keys():
            for value in self.drop_columns[key]:
                drop_list.append(value)
        self.feature_df = x_data.drop(drop_list, axis=1)
    print(self.drop_columns)
Example No. 7
def select_top_features(train_data):
    fs = FeatureSelector(train_data[0], train_data[1])
    fs.identify_zero_importance(task='classification',
                                eval_metric='auc',
                                n_iterations=6,
                                early_stopping=True)
    fs.identify_low_importance(cumulative_importance=0.99)

    return fs.ops['zero_importance'], fs.ops['low_importance']
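# Hypothetical usage sketch for select_top_features (synthetic data, not from
# the original source): pass a (features, labels) pair and drop both lists.
import numpy as np
import pandas as pd

X = pd.DataFrame(np.random.rand(200, 5), columns=list('abcde'))
y = pd.Series(np.random.randint(0, 2, 200))
zero_imp, low_imp = select_top_features((X, y))
X_reduced = X.drop(columns=set(zero_imp + low_imp))
print(X_reduced.shape)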
Example No. 8
def heatmap_ftr_slcor(df):  # heatmap feature-selector function
    le = {}
    le_df = df.drop(columns='ANNEE')
    le_df['ADR_CP'] = le_df['ADR_CP'].astype(object)
    for col in le_df.columns:  ### cai's code
        if le_df.dtypes[col] == 'object':
            le_df[col] = le_df[col].str.upper()
            le[col] = LabelEncoder()
            result = le[col].fit_transform(le_df[le_df[col].notnull()][col])
            le_df.loc[le_df[le_df[col].notnull()].index, col] = result

    fs = FeatureSelector(data=le_df, labels=df['REMUNERATION'])
    cor_out = le_df.corr()
    #cor_out.drop(columns=['idCSV','ID_ANO','id','PAYS','SUJET','idCSVDescript'],inplace=True)         ## dropping unwanted columns
    cor_out.drop(columns=[
        'idCSV', 'ID_ANO', 'id', 'PAYS', 'SUJET', 'idCSVDescript', 'SITE_LON',
        'SITE_LAT', 'ADR_LAT', 'ADR_LON', 'ENT_LAT', 'ENT_LON'
    ],
                 inplace=True)
    # print(cor_out.columns)
    new_df = pd.DataFrame(columns=['group', 'variable',
                                   'value'])  # new dataframe
    new_df.columns
    k = 0
    li = list(cor_out.columns)
    # print(li)
    length = len(li)
    #cor_out.reset_index(inplace=True, drop=True)
    i_ind = 0
    k = 0

    while i_ind < length:  # group the variables as laid out in "indu.csv", so the result can be fed to the heatmap
        #print(li[i_ind])
        for i in li:
            new_df.loc[k, 'group'] = li[i_ind]
            new_df.loc[k, 'variable'] = i
            # the correlation values are all very small, so differences barely
            # show up in the heatmap; scaling them (e.g. multiplying by 10) was
            # suggested but STILL HAS TO BE CHECKED
            new_df.loc[k, 'value'] = cor_out.loc[i, li[i_ind]]
            k = k + 1
        i_ind = i_ind + 1
    # print(new_df.head(3))
    new_df.to_csv(os.path.join(BASE_DIR,
                               r'DjangoWeb V1\Interface\static\indu.csv'),
                  index=False)
    return None
Example No. 9
def Bestfeature_from_cummulative_importance(inFile, outFile):

    df = pd.read_csv(inFile, sep='\t')
    print(df.shape)
    train_labels = df['class_label']
    train = df.drop(columns=['class_label'])
    fs = FeatureSelector(data=train, labels=train_labels)
    fs.identify_zero_importance(task='classification',
                                eval_metric='auc',
                                n_iterations=10,
                                early_stopping=True)
    zero_importance_features = fs.ops['zero_importance']
    #fs.plot_feature_importances(threshold = 0.99, plot_n = 12)
    importance_index = np.min(
        np.where(fs.feature_importances['cumulative_importance'] > 0.99))
    fs.identify_low_importance(cumulative_importance=0.99)
    print(importance_index)
    train_removed_all = fs.remove(methods=['zero_importance'],
                                  keep_one_hot=False)
    train_removed_all = pd.concat([train_removed_all, train_labels], axis=1)
    train_removed_all.to_csv(outFile, sep='\t', index=False)
Example No. 10
def featuresSel(train, train_labels, name):
    """Plots the curve for the important features

    Arguments:
        train {pandas.DataFrame} -- Dataset
        train_labels {numpy.ndarray} -- Labels for the dataset
        name {string} -- Name for the output file
    """
    print('>>> Feature Selection...')
    fs = FeatureSelector(data=train, labels=train_labels)
    fs.identify_zero_importance(task='classification',
                                eval_metric='auc',
                                n_iterations=10,
                                early_stopping=True)
    plt.figure(figsize=(15, 15))
    fs.plot_feature_importances(threshold=0.99, plot_n=50, name=name)
    plt.savefig('../../data/figures/rank_{}.png'.format(name))
    plt.close()
Example No. 11
y_dev = np.array([x - 1 for x in y_dev])

# In[11]:

#Train Distribution

d = {'y_train': y_train}
df_y_train = pd.DataFrame(d)
print(df_y_train["y_train"].value_counts())
df_y_train["y_train"].value_counts().plot.bar(figsize=(10, 8), rot=45)

# ## Features

# In[12]:

fs = FeatureSelector(data=x_train, labels=y_train)

# ### Missing Values
# The first method for finding features to remove is straightforward: find features with a fraction of missing values above a specified threshold.

# In[13]:

fs.identify_missing(missing_threshold=0.1)

# ### Collinear Features
# Collinear features are features that are highly correlated with one another. In machine learning, these lead to decreased generalization performance on the test set due to high variance and less model interpretability.

# In[14]:

fs.identify_collinear(correlation_threshold=0.70)
fs.plot_collinear()
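# In[15]:

# The notebook identifies features but stops before removing them; a removal
# step would typically follow (a sketch using the same feature_selector API):
x_train_removed = fs.remove(methods=['missing', 'collinear'])
print('kept {} of {} features'.format(x_train_removed.shape[1], x_train.shape[1]))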
Example No. 12
# -*- coding: utf-8 -*-
"""
Created on Wed Jan  2 20:34:58 2019

@author: Animesh
"""
features = []
results = []
file = open('Training Dataset.arff').read()
lines = file.split('\n')
data = np.array(lines)
data1 = [i.split(',') for i in data]
data1 = data1[0:-1]
for i in data1:
    results.append(i[30])
data1 = np.array(data1)
features = data1[:, :-1]

x = features[:, [
    0, 1, 2, 3, 4, 5, 6, 8, 9, 11, 12, 13, 14, 15, 16, 17, 22, 23, 24, 25, 27,
    29
]]
y = []
from feature_selector import FeatureSelector
# Features are in train and labels are in train_labels
fs = FeatureSelector(data=train, labels=train_labels)
Example No. 13
from evaluation import tpr_weight_funtion_lc

path0 = '../results/'
test = pd.read_csv(path0 + 'test.csv')
train = pd.read_csv(path0 + 'train.csv')

print('tag_value_counts', train['Tag'].value_counts())

train_labels = train['Tag']
train = train.drop(['UID', 'Tag'], axis=1)

X_loc_test = test.drop('UID', axis=1)

from feature_selector import FeatureSelector
# Features are in train and labels are in train_labels
fs = FeatureSelector(data=train, labels=train_labels)

# missing-value statistics
fs.identify_missing(0.5)
df_miss_value = fs.missing_stats.sort_values('missing_fraction',
                                             ascending=False)
print('df_miss_value', df_miss_value.head(15))
missing_features = fs.ops['missing']
print('missing_features to remove', missing_features[:20])

# single-unique-value feature statistics
fs.identify_single_unique()
print('fs.plot_unique()', fs.plot_unique())

fs.identify_collinear(0.95)
print('plot_collinear()', fs.plot_collinear())
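# A natural next step, following the pattern of the other examples (a sketch,
# not part of the original snippet): pull the flagged features from fs.ops
# and remove them.
correlated_features = fs.ops['collinear']
print('collinear features to remove', correlated_features[:20])
train = fs.remove(methods=['missing', 'single_unique', 'collinear'])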
Example No. 14
x = train_data.drop(['isNew', 'target'], axis=1)

# after inspecting how residentAddr is encoded in the data, the feature is rebuilt
df.loc[df['isNew'] == 0,
       'residentAddr'] = df[df['isNew'] == 0]['residentAddr'].apply(
           lambda x: x if x == -999 else x - 300000)

# feature selection; the parameters used below are explained here:
"""
missing_threshold is the missing-value fraction threshold: a feature missing more than 60% of its values is dropped
correlation_threshold is the threshold on the pairwise correlation between features
task is the kind of task being run, and eval_metric is the evaluation metric to use
cumulative_importance keeps the top features, sorted by importance, until their cumulative importance reaches 0.95
"""

fs = FeatureSelector(data=x, labels=y)
fs.identify_all(
    selection_params={
        'missing_threshold': 0.6,
        'correlation_threshold': 0.9,
        'task': 'regression',
        'eval_metric': 'mse',
        'cumulative_importance': 0.95
    })

choose = fs.remove(methods=['missing', 'single_unique', 'zero_importance'],
                   keep_one_hot=True)

# build the training data and the prediction set from the selected feature set
x = x[choose.columns.values]
X_predict = df_predict[choose.columns.values]
Example No. 15
from sklearn.feature_selection import SelectKBest, chi2
import pandas as pd
import numpy as np

# columns = ['A_TS%', 'A_eFG%', 'A_3PAr', 'A_FTr', 'A_ORB%', 'A_DRB%',
#                              'A_TRB%', 'A_AST%', 'A_STL%', 'A_BLK%', 'A_TOV%', 'A_ORtg', 'A_DRtg',
#                              'H_TS%', 'H_eFG%', 'H_3PAr', 'H_FTr', 'H_ORB%', 'H_DRB%',
#                              'H_TRB%', 'H_AST%', 'H_STL%', 'H_BLK%', 'H_TOV%', 'H_ORtg', 'H_DRtg'
#                              ]
columns = ['TS%', 'eFG%', '3PAr', 'FTr', 'ORB%', 'DRB%', 'TRB%',
                             'AST%', 'STL%', 'BLK%', 'TOV%', 'ORtg', 'DRtg']

features = pd.DataFrame(adv_diff_features(None))
labels = pd.DataFrame(adv_diff_labels())

features.columns = columns
labels.columns = ['POINT_DIFF']

print(len(features), len(labels))

fs = FeatureSelector(data=features, labels=labels)
fs.identify_missing(missing_threshold=0.9)
fs.identify_collinear(correlation_threshold=0.5)
fs.plot_collinear()

fs2 = FeatureSelector(data=features, labels=labels['POINT_DIFF'])
fs2.identify_zero_importance(eval_metric='l2', task='regression')
# fs2.identify_low_importance()

print(fs.record_collinear.head())
Example No. 16
####################################
#--       FEATURE SELECTION
####################################



#-- Separate features from labels
y = df['target']
train_labels = y
df_feats = df.drop(columns = ['target'])

#-- Create an instance
fs = FeatureSelector(data = df_feats, labels = train_labels)

#-- Identify redundant features
if USE_LEARNER_FOR_FEATURE_SELECTION:
    # NOT COMPLETE
    fs.identify_all(selection_params = {'missing_threshold': 0.6, 'correlation_threshold': 0.98, 
                                    'task': 'classification', 'eval_metric': 'auc', 
                                     'cumulative_importance': 0.99})
    #-- Get valuable features   
    X = fs.remove(methods = 'all', keep_one_hot = True)

else:
    #-- Features with missing values greater than threshold 
    fs.identify_missing(missing_threshold = MISSING_VALUE_THRESHOLD)
    #-- Correlated features
    fs.identify_collinear(correlation_threshold = CORRELATION_THRESHOLD)
Example No. 17
x.loc[:, 'county'].replace(county_dic.keys(), county_dic.values(), inplace=True)
x.loc[:, 'state_perm'].replace(state_perm_dic.keys(), state_perm_dic.values(), inplace=True)
x.loc[:, 'major_type10'].replace(major_type10_dic.keys(), major_type10_dic.values(), inplace=True)
x.loc[:, 'major_basic'].replace(major_basic_dic.keys(), major_basic_dic.values(), inplace=True)
x.loc[:, 'DegreeCompletionTermDescr'].replace(DegreeCompletionTermDescr_dic.keys(), DegreeCompletionTermDescr_dic.values(), inplace=True)
x.loc[:, 'DegreeAcadPlan'].replace(DegreeAcadPlan_dic.keys(), DegreeAcadPlan_dic.values(), inplace=True)
x.loc[:, 'DegreeDeptName'].replace(DegreeDeptName_dic.keys(), DegreeDeptName_dic.values(), inplace=True)
x.loc[:, 'DegreeSchoolCollegeName'].replace(DegreeSchoolCollegeName_dic.keys(), DegreeSchoolCollegeName_dic.values(), inplace=True)
x.loc[:, 'DegreeAcadProgramDescr'].replace(DegreeAcadProgramDescr_dic.keys(), DegreeAcadProgramDescr_dic.values(), inplace=True)
x.loc[:, 'DegreeSubPlan'].replace(DegreeSubPlan_dic.keys(), DegreeSubPlan_dic.values(), inplace=True)

# fill Nan
# x = x.fillna(0)
# print(x)
# Features are in train and labels are in labels
fs = FeatureSelector(data=x, labels=label)
# missing-feature analysis
fs.identify_missing(missing_threshold=0.6)
# shows the features with the heaviest missing data
print(fs.missing_stats[:10])
missing_features = fs.ops['missing']
print(missing_features[:5])
fs.plot_missing()
plt.show()

# collinear-feature analysis
# for each pair of correlated features, one of the two is flagged for removal
fs.identify_collinear(correlation_threshold=0.7)
# fs.plot_collinear(plot_all=True)
# plt.show()
# list of collinear features to remove
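# (the original snippet stops at the comment above; as in the other examples,
# the list itself would come from the ops dictionary)
collinear_features = fs.ops['collinear']
print(collinear_features[:10])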
Example No. 18
                                max_depth=2,
                                verbose=1,
                                n_jobs=1)

col_m = []
for col in feature:
    if 'CHANGE_TYPE' not in col:
        col_m.append(col)
feature_matrix = feature[col_m]

# %%
# use feature_selector to screen features
import os
os.chdir('c:\\Users\\SA\\python\\練習py')
from feature_selector import FeatureSelector
fs = FeatureSelector(data=feature_matrix, labels=y)
#%%
# handle missing values
fs.identify_missing(missing_threshold=0.6)
fs.record_missing.head()
fs.plot_missing()
# %%
# handle collinearity
fs.identify_collinear(correlation_threshold=0.8)
fs.record_collinear.head()
fs.plot_collinear()
# %%
# use the LightGBM algorithm
fs.identify_zero_importance(task='classification',
                            eval_metric='auc',
                            n_iterations=10,
                            early_stopping=False)  # assumed: the original snippet is cut off mid-call
Example No. 19
def featureselect(datas, target):
    import os
    os.chdir('c:\\Users\\SA\\python\\練習py')
    from feature_selector import FeatureSelector
    fs = FeatureSelector(data=datas, labels=target)

    fs.identify_missing(missing_threshold=0.6)
    fs.identify_collinear(correlation_threshold=0.9)
    fs.identify_zero_importance(task='classification',
                                eval_metric='auc',
                                n_iterations=10,
                                early_stopping=False)
    fs.identify_low_importance(cumulative_importance=0.9)

    train_removed = fs.remove(methods='all')
    return train_removed
Example No. 20
metrics=lambda a,b:AUCPRC(a,b,withACC=T,withAUC=T,withSS=T)


# In[24]:


aucprc=lambda y_true, y_pred:tuple(list(AUCPRC(y_true,y_pred)[0])+[T])


# ###### Feature Importance

# In[25]:


fsDataScale = FeatureSelector(data = XTrain.dropCol("Amount"), labels=YTrain)


# We redefine the function that computes the importances, this time including SMOTE

# In[26]:


def identify_feat_imp(self,n_splits=10):
    data=self.data
    dataClass=self.labels
    skf=StratifiedKFold(n_splits=n_splits,random_state=42,shuffle=T)
    feature_names=list(data.columns)
    feature_importance_values = np.zeros(len(feature_names))
    scores = np.zeros(n_splits)
    for i,(train_index, test_index) in tqdm_notebook(enumerate(skf.split(data, dataClass))):        
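# In[27]:

# The cell above is cut off mid-loop. A self-contained sketch of the
# SMOTE-plus-LightGBM importance loop described by the comment above
# (hypothetical reconstruction; imblearn and lightgbm are assumed):
import numpy as np
import lightgbm as lgb
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import StratifiedKFold

def identify_feat_imp_sketch(data, labels, n_splits=10):
    skf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
    importances = np.zeros(data.shape[1])
    for train_index, test_index in skf.split(data, labels):
        # oversample only the training fold, then fit and accumulate importances
        X_res, y_res = SMOTE(random_state=42).fit_resample(
            data.iloc[train_index], labels.iloc[train_index])
        model = lgb.LGBMClassifier(n_estimators=200)
        model.fit(X_res, y_res)
        importances += model.feature_importances_ / n_splits
    return importances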
Example No. 21
    train_data.append(df)

# Define the names of the 12 feature sets
file_name = ["AtomPairs2D","AtomPairs2DCount","EState", "Extended", "Fingerprinterd", "GraphOnly",
"KlekotaRoth", "KlekotaRothCount", "MACCS", "Pubchem", "Substructure", "SubstructureCount"]
file_name = sorted(file_name)  # sort the names

#################
# Load one training file to get the labels
train_label = pd.read_csv("Data/DILI_data/DILI_train_MF/DILI_train_AtomPairs2D.csv")

# Run feature selection and add labels for each training dataset
for train, name in zip(train_data, file_name):
    feature_columns = []
    labels = train_label["class."]
    X_train = train.drop(labels = "Name", axis = 1)
    fs = FeatureSelector(data = X_train, labels = labels)
    fs.identify_all(selection_params = {'missing_threshold': 0.8, 'correlation_threshold': 0.98, 
                                        'task': 'classification', 'eval_metric': 'auc', 
                                        'cumulative_importance': 0.99,'num_threads':-1})
    train_removed_all = fs.remove(methods = 'all', keep_one_hot=False) 
    print('Original Number of Features', train.shape[1]) 
    print('Final Number of Features: ', train_removed_all.shape[1]) 
    train_removed_all.head()
    feature_columns.extend(train_removed_all.columns)
    feature_columns = pd.DataFrame(feature_columns,index=None)
    feature_columns.to_csv('Features_'+ name+'.csv',index = False, header = name)
    train_removed_all['class.']=labels
    train_removed_all.to_csv('Data/Feature_Data/Feature_Data/Feature_Train_'+ name + '.csv', index=False, header=True)

Example No. 22
    fs.identify_collinear(correlation_threshold=0.98)
    fs.record_collinear.to_csv("csv//record_collinear.csv")

    #Identify Single Unique
    fs.identify_single_unique()
    fs.record_single_unique.to_csv("csv//record_single_unique.csv")

    #Zero importance
    fs.identify_zero_importance(task='classification',
                                eval_metric='multi_logloss',
                                n_iterations=10,
                                early_stopping=True)
    fs.record_zero_importance.to_csv("csv//record_zero_importance.csv")

    #Low Importance
    fs.identify_low_importance(cumulative_importance=0.99)
    fs.feature_importances.to_csv("csv//feature_importance.csv")

    #Identified features for removal
    summary = pd.DataFrame.from_dict(fs.ops, orient='index')
    summary.to_csv("csv//summary.csv")


if __name__ == '__main__':

    __AAPL__ = "D:\\Dropbox\\9. Data\\Mercury Data\\CSV\\CIQ_AAPL.csv"
    data = DataLoader(__AAPL__, window=10, threshold=0.03, drop=1)
    fs = FeatureSelector(data=data.df, labels=data.targets)

    main()
Example No. 23
# for f in dis_data or NAN_data_test or NAN_data_train:
# 	encoder=OneHotEncoder(sparse=False)
# 	one_hot_data=train_data[f].values.reshape(-1,1)
# 	encoder.fit(one_hot_data)
# 	train_data[f]=encoder.transform(one_hot_data)

print("data preprocessing done!")

print("starting feature selection...")
#feature selection
#correlation calculation(skip)

#feature_selector
from feature_selector import FeatureSelector
fs = FeatureSelector(data=train_data, labels=train_label)

# find features with a single unique value (zero variance)
fs.identify_single_unique()

# identify features with zero importance under a gradient boosting model
fs.identify_zero_importance(task='classification',
                            eval_metric='auc',
                            n_iterations=5,
                            early_stopping=True)
print("finish zero importance analysis")
fs.identify_low_importance(cumulative_importance=0.99)
print("finish low importance analysis")
train_data = fs.remove(methods='all')
print("finish removing train_data")
Example No. 24
from feature_selector import FeatureSelector

# In[8]:

train_labels = train_data['label']
train_features = train_data.drop(columns=[
    'user', 'product_nbr', 'last_year_capture_user_flag', 'label',
    'pro_brand_-1', 'pro_brand_Apple', 'pro_brand_三星', 'pro_brand_其他',
    'pro_brand_华为', 'pro_brand_小米', 'pro_brand_未知厂商', 'pro_brand_欧珀',
    'pro_brand_维沃'
])

# In[14]:

fs = FeatureSelector(data=train_features, labels=train_labels)
fs.identify_collinear(correlation_threshold=0.9, one_hot=False)
# plot a correlation heatmap of the selected features
fs.plot_collinear()
# list the collinear features to remove
collinear_features = fs.ops['collinear']
# inspect the dataframe of collinear feature pairs
fs.record_collinear

# In[20]:

train_data = train_data.drop(columns=collinear_features)

# In[21]:

train_data.shape
Example No. 25
    # data_label[emb_list].to_csv('emb_testb.csv',index=False)


print('feature done')

train_label = data_label[:num]
test_label = data_label[num:]
features = [x for x in train_label.columns if x not in ['ship','type','time','x','y','diff_time','date','day_nig','direction','speed','hour',
                                                       'speed_many','dire_diff','direction_str','speed_str','dis','x_speed','y_speed'] ]
target = 'type'
# print(len(features), ','.join(features))


from feature_selector import FeatureSelector
fs = FeatureSelector(data = train_label[features], labels = train_label[target])
fs.identify_zero_importance(task = 'classification', eval_metric = 'multiclass',
                            n_iterations = 10, early_stopping = True)
fs.identify_low_importance(cumulative_importance = 0.97)
low_importance_features = fs.ops['low_importance']
print('====low_importance_features=====')
print(low_importance_features)
for i in low_importance_features:
    features.remove(i)



print('feature number',len(features))
gc.collect()

Example No. 26
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=0)

# Feature scaling
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(sc.transform(X_test), columns=X_test.columns)

# Feature selection (remove highly correlated features)
from feature_selector import FeatureSelector

n = len(X_train.T)
fs = FeatureSelector(data=X_train)
fs.identify_collinear(
    correlation_threshold=0.7)  # select features from training set
corr = fs.ops['collinear']
X_train = fs.remove(methods=['collinear'
                             ])  # remove selected features from training set
to_remove = pd.unique(
    fs.record_collinear['drop_feature'])  # features to remove
X_test = X_test.drop(
    columns=to_remove)  # remove selected features from test set

# Create the artificial neural network
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
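# (the original snippet is truncated after these imports; a minimal sketch of
# the kind of network they suggest, with illustrative layer sizes that are
# not the author's)
classifier = Sequential()
classifier.add(Dense(units=16, activation='relu', input_dim=X_train.shape[1]))
classifier.add(Dropout(rate=0.1))
classifier.add(Dense(units=1, activation='sigmoid'))
classifier.compile(optimizer='adam', loss='binary_crossentropy',
                   metrics=['accuracy'])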
Example No. 27
train = pd.read_csv(
    'C:/Users/Administrator/Scikit_learn/feature-selector-master/credit_example.csv'
)
train_labels = train['TARGET']

# DataFrame.head() shows the first five rows by default; train is already a DataFrame, so .head() works directly
print(train.head())

# pandas drop usage: delete the TARGET column
train = train.drop(['TARGET'], axis=1)

# in pandas, rows are labelled by the index and columns by columns
# e.g. df = pd.DataFrame(np.random.randn(5,3), index=list('abcde'), columns=['one','two','three'])

#Create the Instance
fs = FeatureSelector(data=train, labels=train_labels)

#   1   Missing Values

fs.identify_missing(missing_threshold=0.6)

#The features identified for removal can be accessed through the ops dictionary of the FeatureSelector object.
missing_features = fs.ops['missing']
print(missing_features[:20])

fs.plot_missing()  # add plt.show() after each plotting call
plt.show()
print(fs.missing_stats.head(20))

#   2   Single Unique Value
Example No. 28
    def runFeatureSelector(self, df):
        logging.info(("Running Feature Selection"))
        fs = FeatureSelector(data=df, labels=self.targets)

        # Identify Missing Values
        fs.identify_missing(missing_threshold=0.6)

        # Identify Collinearity
        fs.identify_collinear(correlation_threshold=0.98)
        fs.record_collinear.to_csv(".\\utils\\csv\\record_collinear.csv")

        # Identify Single Unique
        fs.identify_single_unique()
        fs.record_single_unique.to_csv(
            ".\\utils\\csv\\record_single_unique.csv")

        # Zero importance
        fs.identify_zero_importance(task='classification',
                                    eval_metric='multi_logloss',
                                    n_iterations=10,
                                    early_stopping=True)
        fs.record_zero_importance.to_csv(
            ".\\utils\\csv\\record_zero_importance.csv")

        # Low Importance
        fs.identify_low_importance(cumulative_importance=0.99)
        fs.feature_importances.to_csv(".\\utils\\csv\\feature_importance.csv")

        #generate summary of all operations
        summary = pd.DataFrame.from_dict(fs.ops, orient='index')
        summary.to_csv(".\\utils\\csv\\summary.csv")

        #if drop flag is 1, go ahead and remove the suggested features
        if self.drop == 1:
            df = fs.remove(methods='all')
        else:
            pass

        return df
Example No. 29
dfm__ = dfm_.reset_index().set_index(['date', 'symbol'])
dfm__['win'] = (dfm__['trt1m'] > dfm__['sprtrn']).astype(np.int64)
dfm__['rtoversp'] = dfm__['trt1m'] - dfm__['sprtrn']
dfm__ = dfm__.dropna()
dfm__.isna().sum()

df_mrq['win'] = dfm__.win
df_mrq['trt1m'] = dfm__.trt1m
df_mrq['sprtrn'] = dfm__.sprtrn
df_mrq['rtoversp'] = dfm__.rtoversp
df_mrq = df_mrq.dropna()

train = df_mrq.drop(columns=['dimension', 'win', 'rtoversp'])
train_labels = df_mrq['win']

fs = FeatureSelector(data=train, labels=train_labels)
fs.identify_collinear(correlation_threshold=0.975)

#fs.plot_collinear(plot_all=True)

#fs.identify_zero_importance(task = 'regression', eval_metric = 'auc', n_iterations = 10, early_stopping = True)

#fs.identify_low_importance(cumulative_importance = 0.99)

all_to_remove = fs.check_removal()
print(all_to_remove)

df_mrq_pruned = df_mrq.drop(columns=all_to_remove)

# df_mrq_pruned.to_csv('data/SHARADAR_SF1_montly_combined_universe_MRQ.labelled.csv')
Example No. 30
        f.write("feature: {} dtype: {}\n".format(index + 1, data_type))
print("#-----------------------------------------#")
print("\n")

print("#-----------------------------------------#")
print("Inspect the distribution of the numeric columns")
columne_value_describe = data.describe()
columne_value_describe.to_csv("columne_value_describe.csv",
                              index=True,
                              header=True)
print("#-----------------------------------------#")
print("\n")

print("#-----------------------------------------#")
print("Preprocess the data with FeatureSelector")
fs = FeatureSelector(data=data, labels=y_labels)
print("# identify_missing")
fs.identify_missing(missing_threshold=0.65)
missing_features = fs.ops["missing"]
missing_stats = fs.missing_stats
fs.plot_missing()
plt.savefig("missing_features.jpg", dpi=300)
plt.show()

print(fs.missing_stats.head())
print("\n")

print("# identify_single_unique")
fs.identify_single_unique()
single_uniques = fs.ops["single_unique"]
with open("single_unique_feature_count.txt", "w") as f:
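    # (truncated in the original; presumably the count of single-unique
    # features is what gets written)
    f.write("number of single-unique features: {}\n".format(len(single_uniques)))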