def hsic(num_features, hsic_data, method='regression'):
    """Run HSIC Lasso on ``hsic_data`` and return the selected features.

    Args:
        num_features: Number of features requested from the solver.
        hsic_data: Whatever ``HSICLasso.input`` accepts as a single argument.
        method: ``'regression'`` runs the regression solver; any other value
            runs the classification solver.

    Returns:
        The value of ``HSICLasso.get_features()`` after the solver has run.
    """
    selector = HSICLasso()
    selector.input(hsic_data)
    # Pick the solver first, then invoke it once with the requested size.
    solve = (selector.regression if method == 'regression'
             else selector.classification)
    solve(num_features)
    return selector.get_features()
def hsic_sel(csv, no_features, method='classification'):
    """Select features with HSIC Lasso from the input ``csv``.

    Args:
        csv: Passed unchanged to ``HSICLasso.input``.
        no_features: Number of features requested from the solver.
        method: ``'regression'`` runs the regression solver; every other
            value (including the default) runs the classification solver.

    Returns:
        The value of ``HSICLasso.get_features()`` after the solver has run.
    """
    model = HSICLasso()
    model.input(csv)
    if method != 'regression':
        model.classification(no_features)
    else:
        model.regression(no_features)
    return model.get_features()
def fit(self, X, y):
    """Select HSIC Lasso features shared by all splits, rebalance, fit LightGBM.

    Steps, as implemented below:

    1. For each of ``self.hsic_splits`` stratified shuffle splits, run HSIC
       Lasso classification on the training part and collect the selected
       feature indices plus those neighbours whose score exceeds
       ``self.neighbor_threshold``.
    2. Keep only the indices present in every split (running intersection)
       in ``self.hsic_idx_``.
    3. If ``self.adasyn_neighbors > 0``, oversample with ADASYN
       (best effort: a failure keeps whatever samples were produced so far).
    4. Fit an ``LGBMClassifier`` on the result and store it in ``self.clf_``.

    Args:
        X: 2-D feature array indexable as ``X[rows]`` and ``X[:, cols]``.
        y: Label array aligned with the rows of ``X``.

    Returns:
        ``self``.
    """
    sss = StratifiedShuffleSplit(n_splits=self.hsic_splits, random_state=42)
    idxs = []
    for train_index, _ in sss.split(X, y):
        hsic_lasso2 = HSICLasso()
        hsic_lasso2.input(X[train_index], y[train_index])
        hsic_lasso2.classification(self.n_features, B=self.B, M=self.M)

        # not just best features - get their neighbors (similar features) too
        selected = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
        pieces = [selected]
        for i in range(len(selected)):
            idx = np.array(
                hsic_lasso2.get_index_neighbors(
                    feat_index=i, num_neighbors=10),
                dtype=int)
            # Keep scores as floats: the previous ``dtype=int`` cast truncated
            # fractional scores before they were compared to the threshold.
            score = np.array(
                hsic_lasso2.get_index_neighbors_score(
                    feat_index=i, num_neighbors=10),
                dtype=float)
            pieces.append(idx[np.where(score > self.neighbor_threshold)[0]])
        all_ft_idx = np.unique(np.concatenate(pieces))
        idxs.append(all_ft_idx)

        # Running intersection: only features found in every split survive.
        if len(idxs) == 1:
            self.hsic_idx_ = idxs[0]
        else:
            self.hsic_idx_ = np.intersect1d(idxs[-1], self.hsic_idx_)
    print("HSIC done.", len(self.hsic_idx_))

    print("Upsampling with ADASYN... (features: " +
          str(len(self.hsic_idx_)) + ")")
    sm = ADASYN(sampling_strategy="minority",
                n_neighbors=self.adasyn_neighbors,
                n_jobs=-1)
    sX, sy = X[:, self.hsic_idx_], y
    if self.adasyn_neighbors > 0:
        try:
            # One initial pass plus ``n_classes - 1`` follow-up passes. The
            # original wrote ``len(np.unique(y) - 1)``, where the misplaced
            # parenthesis made the ``- 1`` a no-op.
            for _ in range(len(np.unique(y))):
                sX, sy = sm.fit_resample(sX, sy)
        except Exception as err:
            # Best effort, as before: keep the samples from the last
            # successful pass, but no longer swallow KeyboardInterrupt or
            # SystemExit, and say what went wrong.
            print("ADASYN stopped early:", err)
    print("ADASYN done. Starting clf")

    self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
    print("done")
    return self
def HSIC_lasso():
    """Run HSIC Lasso classification on the module-level data and time it.

    Reads the names ``data``, ``labels``, ``treshold`` and ``header`` from the
    enclosing scope. Prints the number of selected indices and the elapsed
    time, and calls ``transform_and_save`` when fewer indices were selected
    than ``header`` has entries.
    """
    selector = HSICLasso()
    selector.input(data, labels)

    # Original author's note (translated): B and M are for incremental loading
    # when the dataset is large; B divides the number of samples; for the
    # classic algorithm use B=0, M=1.
    started = datetime.datetime.now()
    selector.classification(num_feat=treshold, B=0, M=1,
                            max_neighbors=10, discrete_x=False)
    elapsed = datetime.datetime.now() - started

    print("HSIC Lasso")
    selected = selector.get_index()
    print(len(selected))
    print("cas: " + str(elapsed))
    print('\n')

    if len(selected) < len(header):
        transform_and_save(selected, "HSIC_Lasso")
class HSICLasso:
    """Fit/transform style wrapper around an ``HLasso`` model.

    After ``fit``, ``self.index`` holds the array of indices returned by the
    underlying model and ``transform`` selects those columns.
    """

    def __init__(self, k=10):
        """Create the underlying model and remember ``k`` features to request."""
        self.model = HLasso()
        self.k = k
        self.modelname = "HSICLasso_{}".format(k)

    def fit(self, X, y):
        """Run classification for ``self.k`` features and store the indices."""
        lasso = self.model
        lasso.input(X, y)
        lasso.classification(self.k)
        chosen = lasso.get_index()
        self.index = np.array(chosen)
        return self

    def transform(self, X):
        """Return the columns of ``X`` listed in ``self.index``."""
        columns = self.index
        return X[:, columns]

    def fit_transform(self, X, y):
        """Fit on ``(X, y)`` and return ``X`` reduced to the selected columns."""
        return self.fit(X, y).transform(X)
def HSICLasso(self):
    """Rank features with HSIC Lasso and report their importance.

    Runs the classification or regression solver depending on ``self.type``
    for ``self.num_top_features`` features on ``self.X_train`` /
    ``self.Y_train``, maps the returned feature names back to column names of
    ``self.data`` and passes the resulting ``{column: score}`` mapping to
    ``self.report_feature_importance``.
    """
    df_ = self.data.copy()
    # Rename the last column to 'class'; all other column names are kept.
    cols = list(df_.columns)[:-1] + ['class']
    df_.columns = cols

    hsic_lasso = HSICLasso()
    hsic_lasso.input(self.X_train.values, self.Y_train.values)
    if self.type == CLASSIFICATION:
        hsic_lasso.classification(self.num_top_features)
    elif self.type == REGRESSION:
        hsic_lasso.regression(self.num_top_features)

    # The returned feature names are converted with ``int(val) - 1`` to get a
    # column position -- presumably 1-based numeric names; confirm against
    # the library.
    feats = [
        df_.columns[int(val) - 1] for val in hsic_lasso.get_features()
    ]

    # ``features_`` was previously written to without ever being created in
    # this method (NameError unless a module global existed); build it here.
    features_ = dict(zip(feats, hsic_lasso.get_index_score()))
    self.report_feature_importance(features_,
                                   self.num_top_features,
                                   label="HSICLasso")
from pyHSICLasso import HSICLasso

# Run HSIC Lasso classification on SNR-26415.csv (100 features requested)
# and print the selected features.
selector = HSICLasso()
selector.input("SNR-26415.csv")
print(selector.classification(100))

selector.get_features()  # result intentionally unused, as in the original
feature_runs = [selector.get_features()]
print(selector.get_features())
print(len(feature_runs))

selector.dump()

# Print every collected feature list, then how many were printed.
shown = 0
for entry in feature_runs:
    print(entry)
    shown += 1
print(shown)
def fit(self, X, y):
    """Select stable HSIC Lasso features, rebalance with ADASYN, fit LightGBM.

    Steps, as implemented below:

    1. If ``X`` has more than 10000 columns, pre-filter to the columns that
       get a non-zero importance from an ``LGBMClassifier``.
    2. For each of ``self.hsic_splits`` stratified shuffle splits, run HSIC
       Lasso classification and collect the selected feature indices plus
       those neighbours whose score exceeds ``self.neighbor_threshold``.
    3. Keep the candidates that occur in more than
       ``self.stability_minimum_across_splits`` splits; if none qualify, the
       requirement is lowered by one until at least one does. The result is
       stored in ``self.hsic_idx_``.
    4. If ``self.adasyn_neighbors > 0``, oversample with ADASYN
       (best effort: a failure keeps whatever samples were produced so far).
    5. Fit an ``LGBMClassifier`` on the result and store it in ``self.clf_``.

    Args:
        X: 2-D feature array indexable as ``X[rows]`` and ``X[:, cols]``.
        y: Label array aligned with the rows of ``X``.

    Returns:
        ``self``.

    Raises:
        ValueError: If HSIC Lasso produced no candidate feature in any split
            (previously this case looped forever).
    """
    if X.shape[1] > 10000:
        clf = LGBMClassifier(n_estimators=1000, n_jobs=-1).fit(X, y)
        ftimp = clf.feature_importances_
        relevant = np.where(ftimp > 0)[0]
        print("relevant ft:", len(relevant), "/", X.shape[1])
    else:
        relevant = np.arange(X.shape[1])
    # Slice the relevant columns once instead of once per split.
    X_relevant = X[:, relevant]

    sss = StratifiedShuffleSplit(n_splits=self.hsic_splits, random_state=42)
    idxs = []
    for train_index, _ in sss.split(X, y):
        hsic_lasso2 = HSICLasso()
        hsic_lasso2.input(X_relevant[train_index], y[train_index])
        hsic_lasso2.classification(self.n_features, B=self.B, M=self.M)

        # not just best features - get their neighbors (similar features) too
        selected = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
        pieces = [selected]
        for i in range(len(selected)):
            idx = np.array(
                hsic_lasso2.get_index_neighbors(
                    feat_index=i, num_neighbors=10),
                dtype=int)
            # Keep scores as floats: the previous ``dtype=int`` cast truncated
            # fractional scores before they were compared to the threshold.
            score = np.array(
                hsic_lasso2.get_index_neighbors_score(
                    feat_index=i, num_neighbors=10),
                dtype=float)
            pieces.append(idx[np.where(score > self.neighbor_threshold)[0]])
        all_ft_idx = np.unique(np.concatenate(pieces))
        # Map positions within the pre-filtered matrix back to columns of X.
        idxs.append(relevant[all_ft_idx])

    # Each per-split array holds unique indices, so the count of a value in
    # the concatenation equals the number of splits that selected it.
    featurecandidates, occurrences = np.unique(np.concatenate(idxs),
                                               return_counts=True)
    if len(featurecandidates) == 0:
        raise ValueError("HSIC Lasso selected no features in any split")

    self.hsic_idx_ = []
    stability_concession = 0
    while len(self.hsic_idx_) == 0:
        required = self.stability_minimum_across_splits - stability_concession
        self.hsic_idx_ = list(featurecandidates[occurrences > required])
        # failed to find commonly occurring features - reduce threshold
        stability_concession += 1
    print("HSIC done.", len(self.hsic_idx_), "(out of ",
          len(featurecandidates), " candidates)")

    print("Upsampling with ADASYN... (features: " +
          str(len(self.hsic_idx_)) + ")")
    sm = ADASYN(sampling_strategy="minority",
                n_neighbors=self.adasyn_neighbors,
                n_jobs=-1)
    sX, sy = X[:, self.hsic_idx_], y
    if self.adasyn_neighbors > 0:
        try:
            # One initial pass plus ``n_classes - 1`` follow-up passes. The
            # original wrote ``len(np.unique(y) - 1)``, where the misplaced
            # parenthesis made the ``- 1`` a no-op.
            for _ in range(len(np.unique(y))):
                sX, sy = sm.fit_resample(sX, sy)
        except Exception as err:
            # Best effort, as before: keep the samples from the last
            # successful pass, but no longer swallow KeyboardInterrupt or
            # SystemExit, and say what went wrong.
            print("ADASYN stopped early:", err)
    print("ADASYN done. Starting clf")

    self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
    print("done")
    return self
class ClassificationTest(unittest.TestCase):
    """Checks ``HSICLasso.classification`` against fixed expected selections."""

    def setUp(self):
        # Every test starts from a fresh, input-less model.
        self.hsic_lasso = HSICLasso()

    def test_classification(self):
        """Run classification with and without blocks and compare ``A``."""
        # Seed NumPy's global RNG before any solver call.
        np.random.seed(0)

        # Calling classification before any input is expected to raise.
        with self.assertRaises(UnboundLocalError):
            self.hsic_lasso.classification()

        # No blocks: 5 and then 10 requested features.
        self.hsic_lasso.input("test_data/csv_data.csv")
        self.hsic_lasso.classification(5, discrete_x=True, n_jobs=1)
        self.assertEqual(self.hsic_lasso.A, [764, 1422, 512, 248, 1581])

        self.hsic_lasso.input("test_data/csv_data.csv")
        self.hsic_lasso.classification(10, discrete_x=True, n_jobs=1)
        self.assertEqual(
            self.hsic_lasso.A,
            [764, 1422, 512, 248, 1581, 1670, 1771, 896, 779, 266])

        # Blocks
        # B is half of X_in.shape[1]; the third positional argument is 10.
        self.hsic_lasso.input("test_data/csv_data.csv")
        B = int(self.hsic_lasso.X_in.shape[1] / 2)
        self.hsic_lasso.classification(5, B, 10, discrete_x=True)
        self.assertEqual(self.hsic_lasso.A, [764, 1422, 512, 248, 266])

        self.hsic_lasso.input("test_data/csv_data.csv")
        B = int(self.hsic_lasso.X_in.shape[1] / 2)
        self.hsic_lasso.classification(10, B, 10, discrete_x=True)
        self.assertEqual(
            self.hsic_lasso.A,
            [764, 1422, 512, 248, 1670, 1581, 266, 896, 1771, 779])

        # use non-divisor as block size
        # Exactly one RuntimeWarning with the message below is expected.
        with warnings.catch_warnings(record=True) as w:
            self.hsic_lasso.input("test_data/csv_data.csv")
            B = int(self.hsic_lasso.X_in.shape[1] / 2) - 1
            n = self.hsic_lasso.X_in.shape[1]
            numblocks = n / B
            self.hsic_lasso.classification(10, B, 10, discrete_x=True)
            self.assertEqual(
                self.hsic_lasso.A,
                [1422, 764, 512, 248, 1670, 1581, 896, 266, 1771, 779])
            self.assertEqual(len(w), 1)
            self.assertEqual(w[-1].category, RuntimeWarning)
            # The backslash continues the literal onto the next line, which
            # must start at column 0 so no extra spaces enter the message.
            self.assertEqual(
                str(w[-1].message), "B {} must be an exact divisor of the \
number of samples {}. Number of blocks {} will be approximated to {}.".format(
                    B, n, numblocks, int(numblocks)))
def hsic(train, test, K):
    """Return the indices HSIC Lasso classification selects on ``train``.

    Args:
        train: Indexable pair; ``train[0]`` and ``train[1]`` are passed to
            ``HSICLasso.input`` in that order.
        test: Unused; kept for interface compatibility.
        K: Number of features requested from the solver.

    Returns:
        The value of ``HSICLasso.get_index()`` after classification.
    """
    selector = HSICLasso()
    inputs, targets = train[0], train[1]
    selector.input(inputs, targets)
    selector.classification(K, n_jobs=-1)
    return selector.get_index()
#!/usr/bin/env python
"""Run HSIC Lasso classification on breast.mat and save the model's ``A``."""
import numpy as np
from pyHSICLasso import HSICLasso

selector = HSICLasso()
selector.input("breast.mat")
selector.classification(50)

# Persist the model's ``A`` attribute as a NumPy file.
selection = selector.A
np.save('features_hl.npy', selection)