def main(): hsic_lasso = HSICLasso() hsic_lasso.input("../tests/test_data/csv_data_mv.csv", output_list=['output1', 'output2']) hsic_lasso.regression(5) hsic_lasso.dump() hsic_lasso.plot_path()
def main(): hsic_lasso = HSICLasso() hsic_lasso.input("../tests/test_data/matlab_data.mat") #Single core processing hsic_lasso.regression(5, n_jobs=1) #Multi-core processing. Use all available cores (default) hsic_lasso.regression(5, n_jobs=-1)
def hsic(num_features, hsic_data, method='regression'):
    hsic_lasso = HSICLasso()
    hsic_lasso.input(hsic_data)
    if method == 'regression':
        hsic_lasso.regression(num_features)
    else:
        hsic_lasso.classification(num_features)
    return hsic_lasso.get_features()
def hsic_sel(csv, no_features, method='classification'):
    hsic_lasso = HSICLasso()
    hsic_lasso.input(csv)
    if method == 'regression':
        hsic_lasso.regression(no_features)
    else:
        hsic_lasso.classification(no_features)
    return hsic_lasso.get_features()
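# Hedged usage sketch for the two wrappers above; "my_data.csv" is a
# hypothetical file that follows pyHSICLasso's CSV convention (the target
# stored in a column named "class").
top_regression = hsic(10, "my_data.csv", method='regression')
top_classification = hsic_sel("my_data.csv", 10)
print(top_regression)
print(top_classification)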
def fit(self, X, y):
    sss = StratifiedShuffleSplit(n_splits=self.hsic_splits, random_state=42)
    idxs = []
    hsics = []
    for train_index, test_index in sss.split(X, y):
        hsic_lasso2 = HSICLasso()
        hsic_lasso2.input(X[train_index], y[train_index])
        hsic_lasso2.classification(self.n_features, B=self.B, M=self.M)
        hsics.append(hsic_lasso2)
        # not just best features - get their neighbors (similar features) too
        all_ft_idx = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
        for i in range(len(all_ft_idx)):
            idx = np.array(hsic_lasso2.get_index_neighbors(
                feat_index=i, num_neighbors=10), dtype=int)
            # neighbor scores are floats in [0, 1]; an int cast would
            # truncate them all to 0 before the threshold test
            score = np.array(hsic_lasso2.get_index_neighbors_score(
                feat_index=i, num_neighbors=10), dtype=float)
            idx = idx[np.where(score > self.neighbor_threshold)[0]]
            all_ft_idx = np.concatenate((all_ft_idx, idx))
        all_ft_idx = np.unique(all_ft_idx)
        idxs.append(all_ft_idx)
        # keep only the features selected in every split so far
        if len(idxs) == 1:
            self.hsic_idx_ = idxs[0]
        else:
            self.hsic_idx_ = np.intersect1d(idxs[-1], self.hsic_idx_)
    print("HSIC done.", len(self.hsic_idx_))
    print("Upsampling with ADASYN... (features: " + str(len(self.hsic_idx_)) + ")")
    sm = ADASYN(sampling_strategy="minority",
                n_neighbors=self.adasyn_neighbors, n_jobs=-1)
    sX, sy = X[:, self.hsic_idx_], y
    if self.adasyn_neighbors > 0:
        try:
            sX, sy = sm.fit_resample(X[:, self.hsic_idx_], y)
            # resample once per remaining minority class
            for i in range(len(np.unique(y)) - 1):
                sX, sy = sm.fit_resample(sX, sy)
        except Exception:
            # ADASYN can fail on very small classes; fall back to the raw data
            pass
    print("ADASYN done. Starting clf")
    self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
    print("done")
    return self
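# The neighbor expansion above, isolated as a standalone helper: given a
# fitted HSICLasso instance, collect each selected feature's strongly scoring
# neighbors so near-duplicates of good features survive. A sketch assuming
# the same get_index_neighbors* accessors used in fit(); names are illustrative.
import numpy as np


def expand_with_neighbors(hl, num_neighbors=10, threshold=0.9):
    selected = np.array(hl.get_index(), dtype=int).ravel()
    expanded = selected.copy()
    for i in range(len(selected)):
        neighbors = np.array(hl.get_index_neighbors(
            feat_index=i, num_neighbors=num_neighbors), dtype=int)
        scores = np.array(hl.get_index_neighbors_score(
            feat_index=i, num_neighbors=num_neighbors), dtype=float)
        expanded = np.concatenate((expanded, neighbors[scores > threshold]))
    return np.unique(expanded)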
def main(): hsic_lasso = HSICLasso() hsic_lasso.input("../tests/test_data/matlab_data.mat") #max_neighbors=0 means that we only use the HSIC Lasso features to plot heatmap hsic_lasso.regression(5, max_neighbors=0) #Compute linkage hsic_lasso.linkage() #Run Hierarchical clustering # Features are clustered by using HSIC scores # Samples are clusterd by using Euclid distance hsic_lasso.plot_heatmap()
def HSIC_lasso():
    hsic = HSICLasso()
    hsic.input(data, labels)
    before = datetime.datetime.now()
    # B and M control block-wise processing for large datasets: B splits the
    # samples into blocks; B=0, M=1 runs the classic (non-block) algorithm.
    hsic.classification(num_feat=treshold, B=0, M=1,
                        max_neighbors=10, discrete_x=False)
    after = datetime.datetime.now()
    print("HSIC Lasso")
    selected = hsic.get_index()
    print(len(selected))
    print("time: " + str(after - before))
    print('\n')
    if len(selected) < len(header):
        transform_and_save(selected, "HSIC_Lasso")
import scipy.io as sio


def main():
    # NumPy array input example
    hsic_lasso = HSICLasso()
    data = sio.loadmat("../tests/test_data/matlab_data.mat")
    X = data['X'].transpose()
    Y = data['Y'][0]
    featname = ['Feat%d' % x for x in range(1, X.shape[1] + 1)]
    hsic_lasso.input(X, Y, featname=featname)
    hsic_lasso.regression(5)
    hsic_lasso.dump()
    hsic_lasso.plot_path()

    # Save parameters
    hsic_lasso.save_param()
def main():
    hsic_lasso = HSICLasso()
    # out_list = ['c' + str(i) for i in range(1, 51)]
    # print(out_list)
    hsic_lasso.input("./user_data_new.csv",
                     output_list=['c1', 'c2', 'c3', 'c4', 'c5',
                                  'c6', 'c7', 'c8', 'c9', 'c10'])
    # ,'c11', 'c12', 'c13', 'c14', 'c15', 'c16', 'c17', 'c18', 'c19', 'c20',
    # 'c21', 'c22', 'c23', 'c24', 'c25', 'c26', 'c27', 'c28', 'c29', 'c30'])
    hsic_lasso.regression(100, B=50)
    hsic_lasso.dump()
    select_index = hsic_lasso.get_index()
    print(select_index)
    print(hsic_lasso.get_index_score())
    # hsic_lasso.plot_path()
    print(hsic_lasso.get_features())
    # X_in is stored features-by-samples, so rows are features
    X_select = hsic_lasso.X_in[select_index, :]
    np.savetxt('X_select.txt', X_select, fmt='%.5f', encoding='utf-8')
def HSICLasso(self):
    df_ = self.data.copy()
    cols = list(df_.columns)[:-1] + ['class']
    df_.columns = cols
    hsic_lasso = HSICLasso()
    hsic_lasso.input(self.X_train.values, self.Y_train.values)
    if self.type == CLASSIFICATION:
        hsic_lasso.classification(self.num_top_features)
    elif self.type == REGRESSION:
        hsic_lasso.regression(self.num_top_features)
    # map pyHSICLasso's 1-based default feature names back to column names
    feats = [df_.columns[int(val) - 1] for val in hsic_lasso.get_features()]
    features_ = {}
    for feat, imp in zip(feats, hsic_lasso.get_index_score()):
        features_[feat] = imp
    self.report_feature_importance(features_, self.num_top_features,
                                   label="HSICLasso")
def fit(self, X, y):
    # pre-filter with a cheap model when the feature space is very large
    if X.shape[1] > 10000:
        # clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1).fit(X, y)
        clf = LGBMClassifier(n_estimators=1000, n_jobs=-1).fit(X, y)
        ftimp = clf.feature_importances_
        relevant = np.where(ftimp > 0)[0]
        print("relevant ft:", len(relevant), "/", X.shape[1])
    else:
        relevant = np.arange(X.shape[1])
    sss = StratifiedShuffleSplit(n_splits=self.hsic_splits, random_state=42)
    idxs = []
    hsics = []
    for train_index, test_index in sss.split(X, y):
        hsic_lasso2 = HSICLasso()
        hsic_lasso2.input(X[:, relevant][train_index], y[train_index])
        hsic_lasso2.classification(self.n_features, B=self.B, M=self.M)
        hsics.append(hsic_lasso2)
        # not just best features - get their neighbors (similar features) too
        all_ft_idx = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
        for i in range(len(all_ft_idx)):
            idx = np.array(hsic_lasso2.get_index_neighbors(
                feat_index=i, num_neighbors=10), dtype=int)
            # neighbor scores are floats in [0, 1]; an int cast would
            # truncate them all to 0 before the threshold test
            score = np.array(hsic_lasso2.get_index_neighbors_score(
                feat_index=i, num_neighbors=10), dtype=float)
            idx = idx[np.where(score > self.neighbor_threshold)[0]]
            all_ft_idx = np.concatenate((all_ft_idx, idx))
        all_ft_idx = np.unique(all_ft_idx)
        idxs.append(relevant[all_ft_idx])
    # stability selection: keep features that occur in enough splits,
    # relaxing the threshold until at least one feature survives
    self.hsic_idx_ = []
    stability_concession = 0
    while len(self.hsic_idx_) == 0:
        featurecandidates = np.unique(np.concatenate(idxs))
        for candidate in featurecandidates:
            occurrences = np.sum([1 if candidate in idx else 0 for idx in idxs])
            if occurrences > self.stability_minimum_across_splits - stability_concession:
                self.hsic_idx_.append(candidate)
        if len(self.hsic_idx_) > 1:
            break
        else:
            # failed to find commonly occurring features - reduce the threshold
            stability_concession += 1
    print("HSIC done.", len(self.hsic_idx_),
          "(out of", len(featurecandidates), "candidates)")
    print("Upsampling with ADASYN... (features: " + str(len(self.hsic_idx_)) + ")")
    sm = ADASYN(sampling_strategy="minority",
                n_neighbors=self.adasyn_neighbors, n_jobs=-1)
    sX, sy = X[:, self.hsic_idx_], y
    if self.adasyn_neighbors > 0:
        try:
            sX, sy = sm.fit_resample(X[:, self.hsic_idx_], y)
            # resample once per remaining minority class
            for i in range(len(np.unique(y)) - 1):
                sX, sy = sm.fit_resample(sX, sy)
        except Exception:
            # ADASYN can fail on very small classes; fall back to the raw data
            pass
    print("ADASYN done. Starting clf")
    self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
    print("done")
    return self
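# The stability filter above as a standalone sketch: keep features that appear
# in enough of the per-split index lists, relaxing the occurrence threshold
# until at least one feature survives. Names are illustrative.
import numpy as np


def stable_features(idxs, minimum_occurrences):
    concession = 0
    while True:
        candidates = np.unique(np.concatenate(idxs))
        kept = [c for c in candidates
                if sum(c in idx for idx in idxs) > minimum_occurrences - concession]
        if kept:
            return np.array(kept, dtype=int)
        concession += 1  # relax until something passes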
def setUp(self):
    self.hsic_lasso = HSICLasso()
from copy import copy

import numpy as np
import pandas as pd
from pyHSICLasso import HSICLasso
from sklearn.utils import check_array


def hsic_lasso_matric(self, data, n_jobs=2, n_sample=False, frac_sample=False):
    '''Calculate an HSIC Lasso score matrix between explanatory variables.

    The resulting matrix is not symmetric, so read it row-wise: row 0 holds
    the scores between variable 0 and every other variable, row 1 the scores
    between variable 1 and every other variable, and so on.

    n_jobs : (int) Number of cores used for the computation (-1 uses all available cores).
    data : (numpy or pandas) A data frame containing all explanatory and objective variables.
    n_sample : (int) Number of rows to sample at random; False to disable sampling.
    frac_sample : (float in [0, 1]) Fraction of rows to sample. Cannot be combined with n_sample.
    '''
    data = copy(data)
    data = pd.DataFrame(data).dropna()
    # optional sampling, controlled by n_sample / frac_sample
    if not n_sample:
        if not frac_sample:
            pass  # n_sample=False, frac_sample=False: no sampling
        else:
            # n_sample=False, frac_sample=float
            data = data.sample(frac=frac_sample, replace=True)
    else:
        if not frac_sample:
            # n_sample=int, frac_sample=False
            data = data.sample(n=n_sample, replace=True)
        else:
            # n_sample=int, frac_sample=float
            raise ValueError('Please enter a value for `frac` OR `n`, not both')
    # convert to numpy.ndarray
    data = check_array(data, accept_sparse="csc", dtype=float)
    n_col = data.shape[1]
    hsic_array = np.empty((0, n_col - 1), float)
    for i in range(n_col):
        # regress each variable on all the others
        X = np.delete(data, obj=i, axis=1)
        y = data[:, i]
        hsic_lasso = HSICLasso()
        hsic_lasso.input(X, y)
        hsic_lasso.regression(num_feat=X.shape[1], discrete_x=False, n_jobs=n_jobs)
        # HSIC Lasso returns results in descending score order, so re-sort by index
        hsic_ = np.array([hsic_lasso.get_index(), hsic_lasso.get_index_score()]).T
        # features that received no score are filled in with 0.0
        lack_set = set(range(X.shape[1])) - set(hsic_[:, 0])
        for lack in lack_set:
            hsic_ = np.append(hsic_, np.array([[lack, 0.0]]), axis=0)
        hsic_ = hsic_[np.argsort(hsic_[:, 0])]  # sort by index
        hsic_array = np.append(hsic_array, hsic_[:, 1].reshape(1, -1), axis=0)
    # the diagonal (self-correlation) is missing, so insert 1.0 there
    n_row = hsic_array.shape[0]
    for i in range(n_row):
        insert_i = (n_row + 1) * i
        hsic_array = np.insert(hsic_array, insert_i, 1.0)
    self.hsic_lasso = hsic_array.reshape(n_row, -1)
    return self.hsic_lasso
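# Hedged usage sketch for hsic_lasso_matric on a small synthetic frame.
# `Holder` is a hypothetical stand-in for the class the method belongs to;
# the original shows only the method itself.
import numpy as np
import pandas as pd


class Holder:
    hsic_lasso_matric = hsic_lasso_matric  # reuse the method defined above


rng = np.random.RandomState(0)
df = pd.DataFrame(rng.normal(size=(100, 4)), columns=list('abcd'))
matrix = Holder().hsic_lasso_matric(df, n_jobs=1)
print(matrix.shape)  # (4, 4), with 1.0 on the diagonal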
import time

import numpy as np


def featureSelection(X, y, method='lasso', select=500):
    t0 = time.time()

    # sparse (~15 seconds)
    if method == 'lasso':
        from sklearn import linear_model
        # alpha presets for the two select sizes used in this project
        a = 0.861 if select == 500 else 0.0755
        lasso = linear_model.Lasso(alpha=a)
        lasso.fit(X, y)
        XSelected = X[:, lasso.coef_ != 0]
        indices = np.where(lasso.coef_ != 0)[0]
        # if more features survive than requested, keep the largest coefficients
        if len(indices) > select:
            indices = np.argsort(-np.abs(lasso.coef_))[:select]

    # non-sparse (~157 seconds)
    if method == 'rf':
        from sklearn.ensemble import ExtraTreesRegressor
        from sklearn.feature_selection import SelectFromModel
        t = ExtraTreesRegressor(n_estimators=50)
        t.fit(X, y)
        model = SelectFromModel(t, prefit=True, max_features=select)
        XSelected = model.transform(X)
        indices = np.where(model.get_support())[0]

    # non-sparse (~8.5 seconds)
    if method == 'svm':
        from sklearn.svm import SVR
        from sklearn.feature_selection import SelectFromModel
        SVMReg = SVR(kernel='linear', gamma='scale', C=1.0, epsilon=0.2)
        SVMReg.fit(X, y)
        model = SelectFromModel(SVMReg, prefit=True, max_features=select)
        XSelected = model.transform(X)
        indices = np.where(model.get_support())[0]

    # wrapper model with a preset number of features (~1000 s / ~5000 s)
    if method == 'hsiclasso':
        from pyHSICLasso import HSICLasso
        hsic_lasso = HSICLasso()
        hsic_lasso.input(X, y)
        hsic_lasso.regression(select)
        XSelected = X[:, hsic_lasso.get_index()]
        indices = hsic_lasso.get_index()

    # possible dimensionality-reduction alternatives: PCA, MDS, PLS, DWT
    # (XSelected and indices could also be written to
    # 'selected/<select>/X_<method>.hdf5' instead of the .dat file below)

    np.savetxt('selected/' + str(select) + '/X_' + method + '.dat', indices)
    print("--- %s seconds ---" % (time.time() - t0))
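# Hedged usage sketch for featureSelection; X and y are synthetic stand-ins,
# and the output directory is created first so np.savetxt can write to it.
import os

import numpy as np

os.makedirs('selected/500', exist_ok=True)
X = np.random.randn(100, 1000)
y = X[:, :5].sum(axis=1)
featureSelection(X, y, method='lasso', select=500)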
def hsic(train, test, K):
    hsic_lasso = HSICLasso()
    hsic_lasso.input(train[0], train[1])
    hsic_lasso.classification(K, n_jobs=-1)
    indices = hsic_lasso.get_index()
    return indices
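# Hedged usage sketch: `train` and `test` are (X, y) tuples, as the signature
# above implies; K is the number of features to keep. The data is synthetic.
import numpy as np

X = np.random.randn(120, 30)
y = (X[:, 0] > 0).astype(int)
train, test = (X[:80], y[:80]), (X[80:], y[80:])
print(hsic(train, test, K=10))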
- X_TRAIN: path of a numpy array with x.
- Y_TRAIN: path of a numpy array with y.
- FEATNAMES: path of a numpy array with feature names.
- MODE: regression or classification.
- HL_SELECT: number of features to select.
- HL_B: size of the block.
- HL_M: number of permutations.

Output files:
- features_hl.npy: numpy array with the 0-based index of the selected features.
'''

import numpy as np
from pyHSICLasso import HSICLasso

hl = HSICLasso()
np.random.seed(0)

# pyHSICLasso stores X_in as (features, samples) and Y_in as (1, samples)
hl.X_in = np.load("${X_TRAIN}").T
hl.Y_in = np.load("${Y_TRAIN}").T
hl.Y_in = np.expand_dims(hl.Y_in, 0)
hl.featname = np.load("${FEATNAMES}")

try:
    hl.${MODE}($HL_SELECT, B=$HL_B, M=$HL_M, max_neighbors=50)
except MemoryError:
    import sys
    import traceback
    traceback.print_exc()
    # write an empty selection and exit with a recognizable error code
    np.save('features_hl.npy', np.array([]))
    sys.exit(77)
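# A short sketch of consuming the script's output downstream, assuming the
# selection succeeded (on MemoryError the script writes an empty array and
# exits with code 77, per the docstring above).
import numpy as np

selected = np.load('features_hl.npy')
if selected.size:
    X = np.load("${X_TRAIN}")
    X_selected = X[:, selected.astype(int)]
    print(X_selected.shape)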
from pyHSICLasso import HSICLasso

hsic_lasso = HSICLasso()
hsic_lasso.input("SNR-26415.csv")
print(hsic_lasso.classification(100))

l = []
l.append(hsic_lasso.get_features())
print(hsic_lasso.get_features())
print(len(l))

temp = 0
hsic_lasso.dump()
for i in range(0, len(l)):
    print(l[i])
    temp = temp + 1
print(temp)
def main(): hsic_lasso = HSICLasso() hsic_lasso.input("../tests/test_data/matlab_data.mat") hsic_lasso.regression(5) hsic_lasso.dump() hsic_lasso.plot_path()