def fit(self, X, y):
    """Select HSIC Lasso features shared by all splits, rebalance, and fit the classifier.

    For every stratified shuffle split, HSIC Lasso is run on the training
    rows.  The selected features are extended with those of their related
    ("neighbor") features whose relatedness score exceeds
    ``self.neighbor_threshold``.  Only features found in *every* split are
    kept in ``self.hsic_idx_``.  The reduced data is then optionally
    upsampled with ADASYN and used to fit an ``LGBMClassifier`` that is
    stored in ``self.clf_``.

    Reads ``self.hsic_splits``, ``self.n_features``, ``self.B``, ``self.M``,
    ``self.neighbor_threshold`` and ``self.adasyn_neighbors``.

    Parameters
    ----------
    X : array indexed as ``X[rows]`` and ``X[:, columns]``
        Feature matrix.
    y : array indexed as ``y[rows]``
        Class labels.

    Returns
    -------
    self
    """
    sss = StratifiedShuffleSplit(n_splits=self.hsic_splits, random_state=42)
    common_idx = None
    for train_index, _ in sss.split(X, y):
        hsic_lasso2 = HSICLasso()
        hsic_lasso2.input(X[train_index], y[train_index])
        hsic_lasso2.classification(self.n_features, B=self.B, M=self.M)

        # not just best features - get their neighbors (similar features) too
        selected = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
        found = [selected]
        for rank in range(len(selected)):
            neighbors = np.array(
                hsic_lasso2.get_index_neighbors(
                    feat_index=rank, num_neighbors=10),
                dtype=int)
            # Scores must stay floating point: an integer cast would truncate
            # fractional relatedness scores to 0 before the threshold test.
            scores = np.array(
                hsic_lasso2.get_index_neighbors_score(
                    feat_index=rank, num_neighbors=10),
                dtype=float)
            found.append(neighbors[scores > self.neighbor_threshold])
        split_idx = np.unique(np.concatenate(found))

        # Keep only the features that every split agrees on.
        if common_idx is None:
            common_idx = split_idx
        else:
            common_idx = np.intersect1d(split_idx, common_idx)
    self.hsic_idx_ = common_idx

    print("HSIC done.", len(self.hsic_idx_))
    print("Upsampling with ADASYN... (features: " +
          str(len(self.hsic_idx_)) + ")")
    sX, sy = X[:, self.hsic_idx_], y
    if self.adasyn_neighbors > 0:
        sm = ADASYN(sampling_strategy="minority",
                    n_neighbors=self.adasyn_neighbors,
                    n_jobs=-1)
        # One initial pass plus (number of classes - 1) further passes, since
        # each pass only upsamples the currently smallest class.
        n_rounds = len(np.unique(y))
        try:
            for _ in range(n_rounds):
                sX, sy = sm.fit_resample(sX, sy)
        except Exception as exc:
            # Upsampling is best effort: keep the result of the last
            # successful pass, but do not hide why it stopped.
            print("ADASYN upsampling stopped early:", repr(exc))
    print("ADASYN done. Starting clf")
    self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
    print("done")
    return self
def HSIC_lasso():
    """Run HSIC Lasso classification on the module-level data and report it.

    Uses the module-level names ``data``, ``labels``, ``treshold`` and
    ``header``.  Prints the number of selected features and the elapsed
    time, and hands the selection to ``transform_and_save`` when fewer
    features than ``len(header)`` were selected.
    """
    selector = HSICLasso()
    selector.input(data, labels)

    # B and M are for incremental (block-wise) processing of a large dataset;
    # B divides the number of samples.  The classic algorithm uses B=0, M=1.
    started = datetime.datetime.now()
    selector.classification(num_feat=treshold,
                            B=0,
                            M=1,
                            max_neighbors=10,
                            discrete_x=False)
    elapsed = datetime.datetime.now() - started

    print("HSIC Lasso")
    chosen = selector.get_index()
    n_chosen = len(chosen)
    print(n_chosen)
    print("cas: " + str(elapsed))
    print('\n')

    if n_chosen >= len(header):
        return
    transform_and_save(chosen, "HSIC_Lasso")
def main():
    """Run HSIC Lasso regression on ``./user_data_new.csv`` and save the result.

    Columns ``c1`` .. ``c10`` of the CSV file are used as outputs.  The
    selected feature indices, their scores and the feature names are
    printed, and the matching rows of ``hsic_lasso.X_in`` are written to
    ``X_select.txt`` with five decimals.
    """
    hsic_lasso = HSICLasso()

    # Generate the output column names instead of typing them by hand: a
    # hand-written list is prone to typos such as 'c5,' (comma inside the
    # quotes), which does not name the column 'c5'.
    # Raise the upper bound (e.g. to 31 or 51) to use more output columns.
    output_columns = ['c' + str(i) for i in range(1, 11)]
    hsic_lasso.input("./user_data_new.csv", output_list=output_columns)

    hsic_lasso.regression(100, B=50)
    hsic_lasso.dump()

    select_index = hsic_lasso.get_index()
    print(select_index)
    print(hsic_lasso.get_index_score())
    # hsic_lasso.plot_path() can be called here to plot the regularisation path.
    print(hsic_lasso.get_features())

    # X_in is indexed by feature along its first axis here.
    X_select = hsic_lasso.X_in[select_index, :]
    np.savetxt('X_select.txt', X_select, fmt='%.5f', encoding='utf-8')
class HSICLasso:
    """Fit/transform style wrapper around an ``HLasso`` feature selector.

    ``k`` is the number of features requested from the wrapped model's
    ``classification`` call.  After ``fit`` the selected column indices are
    available as ``self.index``.
    """

    def __init__(self, k=10):
        self.model = HLasso()
        self.k = k
        self.modelname = f"HSICLasso_{k}"

    def fit(self, X, y):
        """Run the wrapped selector on ``(X, y)`` and remember the selected indices."""
        selector = self.model
        selector.input(X, y)
        selector.classification(self.k)
        selected = selector.get_index()
        self.index = np.array(selected)
        return self

    def transform(self, X):
        """Return the columns of ``X`` chosen by the last ``fit``."""
        columns = self.index
        return X[:, columns]

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return ``X`` reduced to the selected columns."""
        self.fit(X, y)
        reduced = self.transform(X)
        return reduced
def fit(self, X, y):
    """Select stable HSIC Lasso features, rebalance, and fit the classifier.

    Steps:

    1. If ``X`` has more than 10000 columns, an ``LGBMClassifier`` is fitted
       first and only columns with a positive feature importance are kept.
    2. For every stratified shuffle split, HSIC Lasso is run on the training
       rows.  The selected features are extended with those of their related
       ("neighbor") features whose relatedness score exceeds
       ``self.neighbor_threshold``.
    3. A feature is kept in ``self.hsic_idx_`` when it occurs in more than
       ``self.stability_minimum_across_splits`` splits.  If no feature
       qualifies, the requirement is lowered by one until at least one does.
    4. The reduced data is optionally upsampled with ADASYN and used to fit
       an ``LGBMClassifier`` stored in ``self.clf_``.

    Reads ``self.hsic_splits``, ``self.n_features``, ``self.B``, ``self.M``,
    ``self.neighbor_threshold``, ``self.stability_minimum_across_splits`` and
    ``self.adasyn_neighbors``.

    Parameters
    ----------
    X : array with a ``shape`` attribute, indexed as ``X[:, columns]``
        Feature matrix.
    y : array indexed as ``y[rows]``
        Class labels.

    Returns
    -------
    self

    Raises
    ------
    ValueError
        If HSIC Lasso selects no feature in any split.
    """
    if X.shape[1] > 10000:
        clf = LGBMClassifier(n_estimators=1000, n_jobs=-1).fit(X, y)
        relevant = np.where(clf.feature_importances_ > 0)[0]
        print("relevant ft:", len(relevant), "/", X.shape[1])
    else:
        relevant = np.arange(X.shape[1])
    # Column subset is the same for every split, so build it once.
    X_relevant = X[:, relevant]

    sss = StratifiedShuffleSplit(n_splits=self.hsic_splits, random_state=42)
    idxs = []
    for train_index, _ in sss.split(X, y):
        hsic_lasso2 = HSICLasso()
        hsic_lasso2.input(X_relevant[train_index], y[train_index])
        hsic_lasso2.classification(self.n_features, B=self.B, M=self.M)

        # not just best features - get their neighbors (similar features) too
        selected = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
        found = [selected]
        for rank in range(len(selected)):
            neighbors = np.array(
                hsic_lasso2.get_index_neighbors(
                    feat_index=rank, num_neighbors=10),
                dtype=int)
            # Scores must stay floating point: an integer cast would truncate
            # fractional relatedness scores to 0 before the threshold test.
            scores = np.array(
                hsic_lasso2.get_index_neighbors_score(
                    feat_index=rank, num_neighbors=10),
                dtype=float)
            found.append(neighbors[scores > self.neighbor_threshold])
        split_idx = np.unique(np.concatenate(found))
        # Map positions within the pre-filtered columns back to columns of X.
        idxs.append(relevant[split_idx])

    # Every per-split array holds unique indices, so the count of a value in
    # the concatenation is the number of splits that selected it.
    featurecandidates, occurrences = np.unique(np.concatenate(idxs),
                                               return_counts=True)
    if featurecandidates.size == 0:
        # Without this guard the relaxation loop below would never end.
        raise ValueError("HSIC Lasso selected no features in any split")

    stability_concession = 0
    stable = []
    while not stable:
        required = self.stability_minimum_across_splits - stability_concession
        stable = list(featurecandidates[occurrences > required])
        # failed to find commonly occurring features - reduce threshold
        stability_concession += 1
    self.hsic_idx_ = stable

    print("HSIC done.", len(self.hsic_idx_), "(out of ",
          len(featurecandidates), " candidates)")
    print("Upsampling with ADASYN... (features: " +
          str(len(self.hsic_idx_)) + ")")
    sX, sy = X[:, self.hsic_idx_], y
    if self.adasyn_neighbors > 0:
        sm = ADASYN(sampling_strategy="minority",
                    n_neighbors=self.adasyn_neighbors,
                    n_jobs=-1)
        # One initial pass plus (number of classes - 1) further passes, since
        # each pass only upsamples the currently smallest class.
        n_rounds = len(np.unique(y))
        try:
            for _ in range(n_rounds):
                sX, sy = sm.fit_resample(sX, sy)
        except Exception as exc:
            # Upsampling is best effort: keep the result of the last
            # successful pass, but do not hide why it stopped.
            print("ADASYN upsampling stopped early:", repr(exc))
    print("ADASYN done. Starting clf")
    self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
    print("done")
    return self
def hsic_lasso_matric(self, data, n_jobs=2, n_sample=False, frac_sample=False):
    '''Calculate the HSIC Lasso score matrix between all columns of ``data``.

    Each column is used in turn as the objective variable and all other
    columns as explanatory variables.  The resulting matrix is not
    symmetric and is read row-wise: row ``i`` holds the scores of every
    other variable with respect to variable ``i``.  Variables that HSIC
    Lasso did not select get 0.0, and the diagonal is set to 1.0.

    The result is stored in ``self.hsic_lasso`` and returned.

    data : (numpy or pandas)
        Data that contains all explanatory and objective variables.
        Rows with missing values are dropped.
    n_jobs : (int)
        Forwarded to ``HSICLasso.regression`` as ``n_jobs``.
        NOTE(review): an earlier note said "-1 for GPU"; this is not
        verifiable from this code - confirm against pyHSICLasso.
    n_sample : (int)
        Number of rows to draw (with replacement). False for no sampling.
    frac_sample : [0 ~ 1] (float)
        Fraction of rows to draw (with replacement). False for no sampling.
        Must not be combined with n_sample.

    Raises ValueError when both n_sample and frac_sample are given.
    '''
    if n_sample and frac_sample:
        raise ValueError(
            'Please enter a value for `frac` OR `n`, not both')

    data = pd.DataFrame(copy(data)).dropna()
    if n_sample:
        data = data.sample(n=n_sample, replace=True)
    elif frac_sample:
        data = data.sample(frac=frac_sample, replace=True)

    # Convert to numpy.ndarray
    data = check_array(data, accept_sparse="csc", dtype=float)
    n_col = data.shape[1]

    # Start from the identity: a variable's score with itself is 1.0.
    hsic_matrix = np.eye(n_col, dtype=float)
    for i in range(n_col):
        X = np.delete(data, obj=i, axis=1)
        y = data[:, i]

        # Calculation of hsic_lasso
        hsic_lasso = HSICLasso()
        hsic_lasso.input(X, y)
        hsic_lasso.regression(num_feat=X.shape[1],
                              discrete_x=False,
                              n_jobs=n_jobs)

        # HSIC Lasso reports only the selected features, in descending score
        # order; scatter them back by index and leave the rest at 0.0.
        scores = np.zeros(X.shape[1], dtype=float)
        selected = np.asarray(hsic_lasso.get_index(), dtype=int)
        scores[selected] = hsic_lasso.get_index_score()

        # Column j of X is column j of data for j < i and column j + 1
        # otherwise, so the scores are written around the diagonal entry.
        hsic_matrix[i, :i] = scores[:i]
        hsic_matrix[i, i + 1:] = scores[i:]

    self.hsic_lasso = hsic_matrix
    return self.hsic_lasso
def hsic(train, test, K):
    """Return the indices of the ``K`` features chosen by HSIC Lasso classification.

    ``train[0]`` and ``train[1]`` are passed to ``HSICLasso.input`` as the
    data and the labels.  ``test`` is accepted but not used.
    """
    selector = HSICLasso()
    features, targets = train[0], train[1]
    selector.input(features, targets)
    selector.classification(K, n_jobs=-1)
    return selector.get_index()
def featureSelection(X, y, method = 'lasso', select = 500):
    """Select feature indices with the given method and save them to disk.

    The indices are written with ``np.savetxt`` as a flat list to
    ``selected/<select>/X_<method>.dat`` (the directory is created when
    missing) and the elapsed time is printed.

    Parameters
    ----------
    X : array indexed as ``X[:, columns]``
        Feature matrix.
    y : array
        Regression target.
    method : str
        One of ``'lasso'``, ``'rf'``, ``'svm'`` or ``'hsiclasso'``.
    select : int
        Maximum number of features to keep.  For ``'lasso'`` it also picks
        the regularisation strength (0.861 when 500, otherwise 0.0755).

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    import os

    known_methods = ('lasso', 'rf', 'svm', 'hsiclasso')
    if method not in known_methods:
        raise ValueError(
            "unknown method %r, expected one of %s" % (method, known_methods))

    t0 = time.time()

    if method == 'lasso':
        # sparse (15 seconds)
        from sklearn import linear_model
        a = 0.861 if select == 500 else 0.0755
        lasso = linear_model.Lasso(alpha=a)
        lasso.fit(X, y)
        indices = np.where(lasso.coef_ != 0)[0]
        if len(indices) > select:
            # Rank by magnitude: ranking by the signed value would prefer
            # zero coefficients over strongly negative ones.
            indices = np.argsort(-np.abs(lasso.coef_))[:select]
    elif method == 'rf':
        # non-sparse (157 seconds)
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.feature_selection import SelectFromModel
        t = ExtraTreesRegressor(n_estimators=50)
        t.fit(X, y)
        model = SelectFromModel(t, prefit=True, max_features=select)
        indices = np.where(model.get_support())[0]
    elif method == 'svm':
        # non-sparse (8.5 seconds)
        from sklearn.svm import SVR
        from sklearn.feature_selection import SelectFromModel
        SVMReg = SVR(kernel='linear', gamma='scale', C=1.0, epsilon=0.2)
        SVMReg.fit(X, y)
        model = SelectFromModel(SVMReg, prefit=True, max_features=select)
        indices = np.where(model.get_support())[0]
    else:
        # 'hsiclasso': wrapper model (preset number of features)
        # (1000 seconds / 5000 seconds)
        from pyHSICLasso import HSICLasso
        hsic_lasso = HSICLasso()
        hsic_lasso.input(X, y)
        hsic_lasso.regression(select)
        indices = hsic_lasso.get_index()

    # Possible further dimensionality-reduction methods: PCA, MDS, PLS, DWT.

    out_dir = 'selected/' + str(select)
    # Create the target directory so a long run is not lost at save time.
    os.makedirs(out_dir, exist_ok=True)
    np.savetxt(out_dir + '/X_' + method + '.dat', indices)
    print("--- %s seconds ---" % (time.time() - t0))