def main():
    """Run HSIC Lasso feature selection on ``./user_data_new.csv``.

    Columns ``c1`` .. ``c10`` of the CSV are declared as output variables and
    every remaining column is treated as an input feature.  After solving the
    regression problem the selected feature indices, their scores and their
    names are printed, and the matching rows of ``X_in`` are written to
    ``X_select.txt`` with five decimal places.
    """
    hsic_lasso = HSICLasso()

    # NOTE: the fifth entry used to be 'c5,' (comma inside the string), which
    # cannot match a header of a comma-separated file.
    output_columns = [
        'c1', 'c2', 'c3', 'c4', 'c5',
        'c6', 'c7', 'c8', 'c9', 'c10',
    ]
    hsic_lasso.input("./user_data_new.csv", output_list=output_columns)

    # First positional argument is the number of features requested; B is
    # forwarded unchanged (block size in pyHSICLasso -- confirm against docs).
    hsic_lasso.regression(100, B=50)
    hsic_lasso.dump()

    select_index = hsic_lasso.get_index()
    print(select_index)
    print(hsic_lasso.get_index_score())
    # hsic_lasso.plot_path()
    print(hsic_lasso.get_features())

    # X_in is indexed by feature along its first axis here
    # (presumably features x samples -- verify against pyHSICLasso).
    X_select = hsic_lasso.X_in[select_index, :]
    np.savetxt('X_select.txt', X_select, fmt='%.5f', encoding='utf-8')
def HSICLasso(self):
    """Rank features with HSIC Lasso and report their importance scores.

    Fits an ``HSICLasso`` solver on ``self.X_train`` / ``self.Y_train`` in
    classification or regression mode depending on ``self.type``, asking for
    ``self.num_top_features`` features.  The selected features are mapped
    back to the column labels of ``self.data`` (with the last column
    relabelled ``'class'``) and handed to ``self.report_feature_importance``
    together with their scores under the label ``"HSICLasso"``.
    """
    # Only the column labels are needed, so no copy of the frame is made.
    columns = list(self.data.columns)[:-1] + ['class']

    # Inside the method body this name is looked up at module level, i.e. it
    # is expected to be the imported solver class, not this method.
    hsic_lasso = HSICLasso()
    hsic_lasso.input(self.X_train.values, self.Y_train.values)

    if self.type == CLASSIFICATION:
        hsic_lasso.classification(self.num_top_features)
    elif self.type == REGRESSION:
        hsic_lasso.regression(self.num_top_features)

    # get_features() is converted with int(val) - 1, i.e. the solver's
    # feature names are treated as 1-based positions
    # (assumed default naming for array input -- TODO confirm).
    feats = [columns[int(val) - 1] for val in hsic_lasso.get_features()]

    # Previously `features_` was written to without ever being created in
    # this method; build the mapping locally instead.
    features_ = dict(zip(feats, hsic_lasso.get_index_score()))

    self.report_feature_importance(features_,
                                   self.num_top_features,
                                   label="HSICLasso")
def hsic_lasso_matric(self, data, n_jobs=2, n_sample=False, frac_sample=False):
    """Build a pairwise HSIC Lasso score matrix over every column of ``data``.

    Each column is used in turn as the target and regressed on all remaining
    columns with HSIC Lasso.  The scores obtained for target ``i`` become row
    ``i`` of the result, so the matrix is generally not symmetric and has to
    be read row by row.  Variables for which the solver returns no score get
    0.0, and the diagonal (a variable against itself) is set to 1.0.

    Rows containing missing values are dropped before anything else.  The
    result is stored on ``self.hsic_lasso`` and also returned.

    Parameters
    ----------
    data : numpy.ndarray or pandas.DataFrame
        Table holding all explanatory and objective variables.
    n_jobs : int
        Forwarded to ``HSICLasso.regression``.  (The original note said
        "-1 for GPU" -- confirm against the pyHSICLasso documentation.)
    n_sample : int or False
        If truthy, that many rows are drawn at random with replacement.
    frac_sample : float or False
        If truthy, that fraction of the rows is drawn at random with
        replacement.  Must not be combined with ``n_sample``.

    Returns
    -------
    numpy.ndarray
        Square array with one row and one column per variable.

    Raises
    ------
    ValueError
        If both ``n_sample`` and ``frac_sample`` are given.
    """
    frame = pd.DataFrame(copy(data)).dropna()

    # Optional bootstrap-style resampling of the rows.
    if n_sample and frac_sample:
        raise ValueError(
            'Please enter a value for `frac` OR `n`, not both')
    if n_sample:
        frame = frame.sample(n=n_sample, replace=True)
    elif frac_sample:
        frame = frame.sample(frac=frac_sample, replace=True)

    # Validate and convert to a float ndarray.
    values = check_array(frame, accept_sparse="csc", dtype=float)

    n_vars = values.shape[1]
    score_rows = np.empty((0, n_vars - 1), float)

    for target in range(n_vars):
        predictors = np.delete(values, obj=target, axis=1)
        response = values[:, target]
        n_predictors = predictors.shape[1]

        solver = HSICLasso()
        solver.input(predictors, response)
        solver.regression(num_feat=n_predictors,
                          discrete_x=False,
                          n_jobs=n_jobs)

        # Pair each returned index with its score: one (index, score) row
        # per selected predictor.
        pairs = np.array([solver.get_index(), solver.get_index_score()]).T

        # Predictors the solver did not return get an explicit 0.0 score.
        missing = set(range(n_predictors)) - set(pairs[:, 0])
        if missing:
            padding = np.array([[idx, 0.0] for idx in missing])
            pairs = np.concatenate((pairs, padding), axis=0)

        # Restore predictor order, then keep only the scores.
        ordered = pairs[pairs[:, 0].argsort()]
        score_rows = np.concatenate(
            (score_rows, ordered[:, 1].reshape(1, -1)), axis=0)

    # Each row lacks its own variable.  In the flattened array the diagonal
    # entry of row i belongs just before element n_row * i, so all diagonal
    # 1.0 values can be inserted in a single call.
    n_row = score_rows.shape[0]
    diagonal_slots = [n_row * i for i in range(n_row)]
    flat_with_diagonal = np.insert(score_rows, diagonal_slots, 1.0)

    self.hsic_lasso = flat_with_diagonal.reshape(n_row, -1)
    return self.hsic_lasso