Beispiel #1
0
def hsic(num_features, hsic_data, method='regression'):
    """Run HSIC Lasso on ``hsic_data`` and return the selected features.

    Args:
        num_features: Passed as the only argument to the solver's
            ``regression`` / ``classification`` call.
        hsic_data: Passed unchanged to ``HSICLasso.input``.
        method: ``'regression'`` runs the regression solver; any other
            value runs the classification solver.

    Returns:
        Whatever ``HSICLasso.get_features()`` returns after the run.
    """
    selector = HSICLasso()
    selector.input(hsic_data)

    # Pick the bound solver once, then invoke it.
    if method == 'regression':
        solve = selector.regression
    else:
        solve = selector.classification
    solve(num_features)

    return selector.get_features()
Beispiel #2
0
def hsic_sel(csv, no_features, method='classification'):
    """Select features from ``csv`` with HSIC Lasso.

    Args:
        csv: Passed unchanged to ``HSICLasso.input``.
        no_features: Passed as the only argument to the chosen solver.
        method: ``'regression'`` selects the regression solver; every other
            value (including the default) selects classification.

    Returns:
        The result of ``HSICLasso.get_features()``.
    """
    solver = HSICLasso()
    solver.input(csv)

    use_regression = method == 'regression'
    if not use_regression:
        solver.classification(no_features)
    else:
        solver.regression(no_features)

    return solver.get_features()
    def fit(self, X, y):
        """Select features with HSIC Lasso, rebalance with ADASYN, fit LightGBM.

        For every stratified shuffle split, HSIC Lasso is run on the training
        rows. The selected feature indices are extended with those neighbours
        whose score exceeds ``self.neighbor_threshold``. The indices common
        to all splits are stored in ``self.hsic_idx_`` and an
        ``LGBMClassifier`` trained on those columns is stored in
        ``self.clf_``.

        Reads ``self.hsic_splits``, ``self.n_features``, ``self.B``,
        ``self.M``, ``self.neighbor_threshold`` and ``self.adasyn_neighbors``.

        Args:
            X: 2-D array supporting ``X[rows]`` and ``X[:, cols]`` indexing.
            y: Labels supporting ``y[rows]`` indexing.

        Returns:
            self
        """
        sss = StratifiedShuffleSplit(n_splits=self.hsic_splits,
                                     random_state=42)
        common_idx = None
        for train_index, _ in sss.split(X, y):
            hsic_lasso2 = HSICLasso()
            hsic_lasso2.input(X[train_index], y[train_index])
            hsic_lasso2.classification(self.n_features, B=self.B, M=self.M)

            # Not just the best features - collect their neighbours
            # (similar features) too.
            selected = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
            pieces = [selected]
            for rank in range(len(selected)):
                idx = np.array(hsic_lasso2.get_index_neighbors(
                    feat_index=rank, num_neighbors=10), dtype=int)
                # Scores must stay floating point: casting them to int would
                # truncate fractional scores before the threshold comparison.
                score = np.array(hsic_lasso2.get_index_neighbors_score(
                    feat_index=rank, num_neighbors=10), dtype=float)
                pieces.append(idx[score > self.neighbor_threshold])
            all_ft_idx = np.unique(np.concatenate(pieces))

            # Keep only the features that every split so far agrees on.
            if common_idx is None:
                common_idx = all_ft_idx
            else:
                common_idx = np.intersect1d(all_ft_idx, common_idx)
        self.hsic_idx_ = common_idx
        print("HSIC done.", len(self.hsic_idx_))

        print("Upsampling with ADASYN... (features: " +
              str(len(self.hsic_idx_)) + ")")
        sm = ADASYN(sampling_strategy="minority",
                    n_neighbors=self.adasyn_neighbors,
                    n_jobs=-1)
        sX, sy = X[:, self.hsic_idx_], y
        if self.adasyn_neighbors > 0:
            try:
                sX, sy = sm.fit_resample(sX, sy)
                # One extra pass per remaining class; each pass only
                # upsamples the current minority class.
                for _ in range(len(np.unique(y)) - 1):
                    sX, sy = sm.fit_resample(sX, sy)
            except Exception as exc:
                # Best effort: keep the result of the last successful pass,
                # but say why resampling stopped instead of hiding it.
                print("ADASYN stopped early:", exc)
            print("ADASYN done. Starting clf")

        self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
        print("done")
        return self
Beispiel #4
0
def HSIC_lasso():
    """Time an HSIC Lasso classification run and save the reduced dataset.

    Uses the module-level names ``data``, ``labels``, ``treshold``,
    ``header`` and ``transform_and_save``. Prints the number of selected
    indices and the elapsed time, then calls ``transform_and_save`` when
    fewer indices were selected than there are entries in ``header``.
    """
    selector = HSICLasso()
    selector.input(data, labels)

    # B and M are for incremental loading when the dataset is large (B divides
    # the number of samples); for the classic algorithm use B=0, M=1.
    started = datetime.datetime.now()
    selector.classification(num_feat=treshold,
                            B=0,
                            M=1,
                            max_neighbors=10,
                            discrete_x=False)
    elapsed = datetime.datetime.now() - started

    print("HSIC Lasso")
    selected = selector.get_index()
    n_selected = len(selected)
    print(n_selected)
    print("cas: " + str(elapsed))
    print('\n')

    if len(header) > n_selected:
        transform_and_save(selected, "HSIC_Lasso")
Beispiel #5
0
class HSICLasso:
    """Fit/transform style wrapper around an ``HLasso`` feature selector.

    Attributes set by this class:
        model: The wrapped ``HLasso`` instance.
        k: Value passed to ``model.classification`` during ``fit``.
        modelname: ``"HSICLasso_<k>"``.
        index: Array of selected column indices; only set by ``fit``.
    """

    def __init__(self, k=10):
        """Create the wrapped selector and remember ``k``."""
        self.model = HLasso()
        self.k = k
        self.modelname = "HSICLasso_{}".format(self.k)

    def fit(self, X, y):
        """Run the classification solver on ``(X, y)`` and store the indices.

        Returns:
            self
        """
        selector = self.model
        selector.input(X, y)
        selector.classification(self.k)

        chosen = selector.get_index()
        self.index = np.array(chosen)
        return self

    def transform(self, X):
        """Return the columns of ``X`` listed in ``self.index``."""
        columns = self.index
        return X[:, columns]

    def fit_transform(self, X, y):
        """Fit on ``(X, y)``, then return ``X`` reduced to the chosen columns."""
        return self.fit(X, y).transform(X)
Beispiel #6
0
    def HSICLasso(self):
        """Rank features with HSIC Lasso and report their scores.

        Fits an ``HSICLasso`` model on ``self.X_train`` / ``self.Y_train``
        using the classification or regression solver according to
        ``self.type``. Each selected feature is mapped to its index score
        and the mapping is handed to ``self.report_feature_importance``
        under the label ``"HSICLasso"``.

        Column labels are taken from ``self.data`` with the last label
        replaced by ``'class'``.
        """
        # Only the labels are needed, so the frame itself is not copied.
        cols = list(self.data.columns)[:-1] + ['class']

        hsic_lasso = HSICLasso()
        hsic_lasso.input(self.X_train.values, self.Y_train.values)

        if self.type == CLASSIFICATION:
            hsic_lasso.classification(self.num_top_features)
        elif self.type == REGRESSION:
            hsic_lasso.regression(self.num_top_features)

        # get_features() values are converted with int() and shifted by one
        # to index into the column labels.
        # NOTE(review): assumes they are 1-based feature numbers - confirm.
        feats = [cols[int(val) - 1] for val in hsic_lasso.get_features()]

        # The mapping was previously written to without ever being created
        # in this method; build it locally so the call cannot hit NameError.
        features_ = {}
        for feat, imp in zip(feats, hsic_lasso.get_index_score()):
            features_[feat] = imp

        self.report_feature_importance(features_,
                                       self.num_top_features,
                                       label="HSICLasso")
from pyHSICLasso import HSICLasso
# Run HSIC Lasso classification on the CSV file and print the outcome.
hsic_lasso = HSICLasso()
hsic_lasso.input("SNR-26415.csv")
print(hsic_lasso.classification(100))

# The selected features are kept as the single entry of a list, so the
# length printed below is the number of stored results, not of features.
feature_sets = [hsic_lasso.get_features()]
print(hsic_lasso.get_features())
print(len(feature_sets))

hsic_lasso.dump()

for features in feature_sets:
    print(features)
# Same value the original hand-rolled loop counter ended up with.
print(len(feature_sets))
    def fit(self, X, y):
        """Select stable HSIC Lasso features, rebalance with ADASYN, fit LightGBM.

        Steps:
            1. When ``X`` has more than 10000 columns, an ``LGBMClassifier``
               is fit on all of ``X`` and only columns with non-zero feature
               importance remain candidates.
            2. For every stratified shuffle split, HSIC Lasso is run on the
               training rows. Selected indices are extended with neighbours
               whose score exceeds ``self.neighbor_threshold``.
            3. A candidate is kept when it appears in strictly more than
               ``self.stability_minimum_across_splits`` splits. If none
               qualifies, the requirement is lowered by one until at least
               one does. The result is stored in ``self.hsic_idx_``.
            4. The kept columns are optionally upsampled with ADASYN and an
               ``LGBMClassifier`` is fit and stored in ``self.clf_``.

        Also reads ``self.hsic_splits``, ``self.n_features``, ``self.B``,
        ``self.M`` and ``self.adasyn_neighbors``.

        Args:
            X: 2-D array supporting ``X[rows]`` and ``X[:, cols]`` indexing.
            y: Labels supporting ``y[rows]`` indexing.

        Returns:
            self

        Raises:
            ValueError: If HSIC Lasso produced no candidate feature in any
                split (the voting loop could otherwise never terminate).
        """
        if X.shape[1] > 10000:
            clf = LGBMClassifier(n_estimators=1000, n_jobs=-1).fit(X, y)
            ftimp = clf.feature_importances_
            relevant = np.where(ftimp > 0)[0]
            print("relevant ft:", len(relevant), "/", X.shape[1])
        else:
            relevant = np.arange(X.shape[1])

        # Slice the candidate columns once instead of once per split.
        X_relevant = X[:, relevant]

        sss = StratifiedShuffleSplit(n_splits=self.hsic_splits,
                                     random_state=42)
        idxs = []
        for train_index, _ in sss.split(X, y):
            hsic_lasso2 = HSICLasso()
            hsic_lasso2.input(X_relevant[train_index], y[train_index])
            hsic_lasso2.classification(self.n_features, B=self.B, M=self.M)

            # Not just the best features - collect their neighbours
            # (similar features) too.
            selected = np.array(hsic_lasso2.get_index(), dtype=int).ravel()
            pieces = [selected]
            for rank in range(len(selected)):
                idx = np.array(hsic_lasso2.get_index_neighbors(
                    feat_index=rank, num_neighbors=10), dtype=int)
                # Scores must stay floating point: casting them to int would
                # truncate fractional scores before the threshold comparison.
                score = np.array(hsic_lasso2.get_index_neighbors_score(
                    feat_index=rank, num_neighbors=10), dtype=float)
                pieces.append(idx[score > self.neighbor_threshold])
            all_ft_idx = np.unique(np.concatenate(pieces))

            # Map positions within the pre-filtered columns back to X.
            idxs.append(relevant[all_ft_idx])

        # Every per-split array is duplicate-free, so the count of a value
        # in the concatenation equals the number of splits that selected it.
        featurecandidates, occurrences = np.unique(np.concatenate(idxs),
                                                   return_counts=True)
        if featurecandidates.size == 0:
            raise ValueError(
                "HSIC Lasso selected no features in any split")

        required = self.stability_minimum_across_splits
        while True:
            stable = featurecandidates[occurrences > required]
            if len(stable) > 0:
                break
            # Failed to find commonly occurring features - reduce threshold.
            required -= 1
        self.hsic_idx_ = list(stable)
        print("HSIC done.", len(self.hsic_idx_), "(out of ",
              len(featurecandidates), " candidates)")

        print("Upsampling with ADASYN... (features: " +
              str(len(self.hsic_idx_)) + ")")
        sm = ADASYN(sampling_strategy="minority",
                    n_neighbors=self.adasyn_neighbors,
                    n_jobs=-1)
        sX, sy = X[:, self.hsic_idx_], y
        if self.adasyn_neighbors > 0:
            try:
                sX, sy = sm.fit_resample(sX, sy)
                # One extra pass per remaining class; each pass only
                # upsamples the current minority class.
                for _ in range(len(np.unique(y)) - 1):
                    sX, sy = sm.fit_resample(sX, sy)
            except Exception as exc:
                # Best effort: keep the result of the last successful pass,
                # but say why resampling stopped instead of hiding it.
                print("ADASYN stopped early:", exc)
            print("ADASYN done. Starting clf")

        self.clf_ = LGBMClassifier(n_estimators=1000).fit(sX, sy)
        print("done")
        return self
class ClassificationTest(unittest.TestCase):
    """Checks ``HSICLasso.classification`` against fixed expected selections."""

    def setUp(self):
        self.hsic_lasso = HSICLasso()

    def test_classification(self):
        """Exercise plain, block-wise and non-divisor-block classification."""
        csv_path = "test_data/csv_data.csv"
        model = self.hsic_lasso

        np.random.seed(0)

        # Calling the solver before any data was loaded must fail.
        with self.assertRaises(UnboundLocalError):
            model.classification()

        # Plain runs without blocks.
        model.input(csv_path)
        model.classification(5, discrete_x=True, n_jobs=1)
        expected = [764, 1422, 512, 248, 1581]
        self.assertEqual(model.A, expected)

        model.input(csv_path)
        model.classification(10, discrete_x=True, n_jobs=1)
        expected = [764, 1422, 512, 248, 1581, 1670, 1771, 896, 779, 266]
        self.assertEqual(model.A, expected)

        # Blocks: B is half of X_in.shape[1].
        model.input(csv_path)
        half = int(model.X_in.shape[1] / 2)
        model.classification(5, half, 10, discrete_x=True)
        expected = [764, 1422, 512, 248, 266]
        self.assertEqual(model.A, expected)

        model.input(csv_path)
        half = int(model.X_in.shape[1] / 2)
        model.classification(10, half, 10, discrete_x=True)
        expected = [764, 1422, 512, 248, 1670, 1581, 266, 896, 1771, 779]
        self.assertEqual(model.A, expected)

        # Use a non-divisor as block size: exactly one RuntimeWarning with a
        # fixed message is expected.
        with warnings.catch_warnings(record=True) as caught:

            model.input(csv_path)
            block = int(model.X_in.shape[1] / 2) - 1
            n_samples = model.X_in.shape[1]
            numblocks = n_samples / block

            model.classification(10, block, 10, discrete_x=True)
            expected = [1422, 764, 512, 248, 1670, 1581, 896, 266, 1771, 779]
            self.assertEqual(model.A, expected)

            self.assertEqual(len(caught), 1)
            last_warning = caught[-1]
            self.assertEqual(last_warning.category, RuntimeWarning)

            expected_message = (
                "B {} must be an exact divisor of the "
                "number of samples {}. Number of blocks {} will be "
                "approximated to {}.").format(block, n_samples, numblocks,
                                              int(numblocks))
            self.assertEqual(str(last_warning.message), expected_message)
Beispiel #10
0
def hsic(train, test, K):
    """Return the feature indices HSIC Lasso selects on the training data.

    Args:
        train: Indexable whose items 0 and 1 are passed, in that order, to
            ``HSICLasso.input``.
        test: Unused by this function.
        K: Passed as the first argument to the classification solver.

    Returns:
        The result of ``HSICLasso.get_index()``.
    """
    selector = HSICLasso()
    selector.input(train[0], train[1])
    selector.classification(K, n_jobs=-1)
    return selector.get_index()
Beispiel #11
0
#!/usr/bin/env python
import numpy as np
from pyHSICLasso import HSICLasso

# Select features from breast.mat with the HSIC Lasso classification solver
# and store the solver's ``A`` attribute as a .npy file.
hsic_lasso = HSICLasso()
hsic_lasso.input("breast.mat")
hsic_lasso.classification(50)

np.save('features_hl.npy', hsic_lasso.A)