Example #1
def selectFeatureLapScore(filename, num_feature, num_cluster):

    # Load the pickle saved to disk containing the samples and ALL the features extracted by TSFresh. THIS IS WHAT WE WILL WORK ON
    all_features_train = pd.read_pickle(
        "./pickle/feature_complete/TRAIN/{0}_TRAIN_FeatureComplete.pkl".format(
            filename))
    all_features_test = pd.read_pickle(
        "./pickle/feature_complete/TEST/{0}_TEST_FeatureComplete.pkl".format(
            filename))

    # Drop columns containing NaN values
    all_features_train = all_features_train.dropna(axis=1)
    all_features_test = all_features_test.dropna(axis=1)

    # Build the affinity matrix W to feed to the Laplacian score
    kwargs_W = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(all_features_train.values, **kwargs_W)

    # Run the Laplacian score algorithm: we obtain a score for each feature.
    featurePesate = lap_score.lap_score(all_features_train.values, W=W)

    # Rank the features in ascending order of Laplacian score (smaller is better)
    idx = lap_score.feature_ranking(featurePesate)

    idxSelected = idx[0:num_feature]  # keep only the desired number of features

    # Extract the names of the selected features
    nomiFeatureSelezionate = []

    for i in idxSelected:
        nomiFeatureSelezionate.append(all_features_train.columns[i])

    # Build a dataframe containing only the selected features
    dataframeFeatureSelezionate = all_features_train.loc[:, nomiFeatureSelezionate]

    # Restrict the test set to the selected features as well
    all_features_test = all_features_test.loc[:, nomiFeatureSelezionate]

    # Extract the known class labels
    labelConosciute = estrattoreClassiConosciute.estraiLabelConosciute(
        "./UCRArchive_2018/{0}/{0}_TEST.tsv".format(filename))

    # K-means on the selected features
    print("\nResults with features we selected using the Laplacian score")
    print("Number of features: {0}".format(all_features_test.shape[1]))
    testFeatureSelection(X_selected=dataframeFeatureSelezionate.values,
                         X_test=all_features_test.values,
                         num_clusters=num_cluster,
                         y=labelConosciute)
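A minimal sketch of how this example might be invoked, assuming the usual imports behind it (pandas plus scikit-feature's lap_score and construct_W modules; the project-local helpers estrattoreClassiConosciute and testFeatureSelection are assumed importable, and "Coffee" is only an illustrative UCR Archive dataset name with 2 classes):

import pandas as pd
from skfeature.function.similarity_based import lap_score
from skfeature.utility import construct_W

# Rank TSFresh features for one UCR dataset, keep the 50 best, and cluster.
selectFeatureLapScore("Coffee", num_feature=50, num_cluster=2)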
Example #2
def SKF_lap(X, y):
    # construct affinity matrix
    kwargs_W = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        't': 1
    }
    W = construct_W(X, **kwargs_W)
    # obtain the scores of features
    score = lap_score.lap_score(X, W=W)
    return lap_score.feature_ranking(score)
Example #3
def get_lap_score(data, k=5, t=1, top_feature=30):
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": k, 't': t}
    W = construct_W.construct_W(data, **kwargs_W)
    score = lap_score.lap_score(data, W=W)
    ranking = lap_score.feature_ranking(score)

    dfscores = pd.DataFrame(score)
    dfcolumns = pd.DataFrame(data.columns)

    featureScores = pd.concat([dfcolumns, dfscores], axis=1)
    featureScores.columns = ['Feature', 'Score']  # name the dataframe columns
    # The Laplacian score is smaller for more important features, so the
    # best features are the ones with the smallest scores.
    result = featureScores.nsmallest(top_feature, 'Score')

    return result, ranking
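A short, self-contained sketch of calling this helper on a feature DataFrame (the column names, sizes, and random data are purely illustrative):

import numpy as np
import pandas as pd
from skfeature.function.similarity_based import lap_score
from skfeature.utility import construct_W

data = pd.DataFrame(np.random.rand(100, 40),
                    columns=[f"f{i}" for i in range(40)])
scores, ranking = get_lap_score(data, k=5, t=1, top_feature=10)
print(scores)        # Feature/Score table for the 10 best features
print(ranking[:10])  # column indices, most important first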
Example #4
def laplacian_score(X, y=None, **kwargs):
    # construct affinity matrix
    kwargs_W = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(X, **kwargs_W)

    # obtain the scores of features
    score = lap_score.lap_score(X, W=W)

    # rank features in ascending order of Laplacian score (smaller means more important)
    idx = lap_score.feature_ranking(score)

    return idx
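A minimal call, assuming the usual scikit-feature imports (the matrix sizes are illustrative):

import numpy as np
from skfeature.function.similarity_based import lap_score
from skfeature.utility import construct_W

X = np.random.rand(150, 60)   # 150 samples, 60 features
idx = laplacian_score(X)      # feature indices, most important first
X_selected = X[:, idx[0:20]]  # keep the 20 best-ranked features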
Example #5
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']  # data
    X = X.astype(float)
    y = mat['Y']  # label
    y = y[:, 0]

    # construct affinity matrix
    kwargs_W = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(X, **kwargs_W)

    # obtain the scores of features
    score = lap_score.lap_score(X, W=W)

    # rank features in ascending order of Laplacian score (smaller means more important)
    idx = lap_score.feature_ranking(score)

    # perform evaluation on clustering task
    num_fea = 100  # number of selected features
    num_cluster = 20  # number of clusters, it is usually set as the number of classes in the ground truth

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform kmeans clustering based on the selected features and repeats 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', nmi_total / 20)
    print('ACC:', acc_total / 20)
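unsupervised_evaluation comes from skfeature.utility; if it is unavailable, a rough stand-in built on scikit-learn and SciPy might look like the sketch below (evaluate_clustering is a hypothetical name; clustering accuracy uses the usual Hungarian label matching):

import numpy as np
from scipy.optimize import linear_sum_assignment
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score

def evaluate_clustering(X_selected, n_clusters, y):
    # cluster on the selected features and compare with the ground-truth labels
    pred = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(X_selected)
    y = np.asarray(y)
    nmi = normalized_mutual_info_score(y, pred)
    # clustering accuracy: match cluster ids to classes with the Hungarian algorithm
    classes, clusters = np.unique(y), np.unique(pred)
    overlap = np.array([[np.sum((pred == c) & (y == k)) for k in classes]
                        for c in clusters])
    row, col = linear_sum_assignment(-overlap)
    acc = overlap[row, col].sum() / len(y)
    return nmi, acc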
Example #6
    def lap_score_filtering(self, vt_data, num_features):
        vt_numpy = vt_data.to_numpy()
        # construct affinity matrix
        kwargs_W = {
            "metric": "cosine",
            "neighbor_mode": "knn",
            "weight_mode": "cosine",
            "k": 40,
            't': 500
        }
        print(
            "We perform Laplacian score filtering using the following parameters: "
            + str(kwargs_W))
        W = construct_W.construct_W(vt_numpy, **kwargs_W)
        score = lap_score.lap_score(vt_numpy, W=W)
        idx = lap_score.feature_ranking(score)  # rank features
        filtered_data = vt_data.iloc[:, idx[0:num_features]].copy()
        print("\nThe data now has " + str(filtered_data.shape[1]) +
              " features after Laplacian score filtering.")
        return filtered_data
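As a usage sketch: this method clearly belongs to some preprocessing class, so selector below stands in for a hypothetical instance of that class, and the data shape is illustrative.

import numpy as np
import pandas as pd

vt_data = pd.DataFrame(np.random.rand(300, 2000))  # 300 samples, 2000 features
filtered = selector.lap_score_filtering(vt_data, num_features=200)
print(filtered.shape)  # (300, 200); the original column labels are preserved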
Example #7
def main():
    # load data
    mat = scipy.io.loadmat("../data/COIL20.mat")
    X = mat["X"]  # data
    X = X.astype(float)
    y = mat["Y"]  # label
    y = y[:, 0]

    # construct affinity matrix
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, "t": 1}
    W = construct_W.construct_W(X, **kwargs_W)

    # obtain the scores of features
    score = lap_score.lap_score(X, W=W)

    # rank features in ascending order of Laplacian score (smaller means more important)
    idx = lap_score.feature_ranking(score)

    # perform evaluation on clustering task
    num_fea = 100  # number of selected features
    num_cluster = 20  # number of clusters, it is usually set as the number of classes in the ground truth

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform kmeans clustering based on the selected features and repeats 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print "NMI:", float(nmi_total) / 20
    print "ACC:", float(acc_total) / 20
Example #8
    def predict(self, X):
        """
        :param X: shape [n_row*n_clm, n_band]
        :return:
        """
        # n_row, n_column, __n_band = X.shape
        # XX = X.reshape((n_row * n_column, -1))  # n_sample * n_band
        XX = X

        kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
        W = construct_W.construct_W(XX, **kwargs_W)

        # obtain the scores of features
        score = lap_score.lap_score(XX, W=W)

        # rank features in ascending order of Laplacian score (smaller means more important)
        idx = lap_score.feature_ranking(score)

        # obtain the dataset on the selected features
        selected_features = XX[:, idx[0:self.n_band]]

        # selected_features.reshape((self.n_band, n_row, n_column))
        # selected_features = np.transpose(selected_features, axes=(1, 2, 0))
        return selected_features
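A hedged usage sketch: predict reads like part of a band-selection class for hyperspectral data, so BandSelector and its constructor below are assumptions for illustration, not from the source.

import numpy as np

selector = BandSelector(n_band=10)  # hypothetical class exposing predict() above
cube = np.random.rand(32, 32, 64)   # (n_row, n_clm, n_band)
X = cube.reshape(-1, 64)            # flatten to (n_row*n_clm, n_band), per the docstring
X_selected = selector.predict(X)    # shape (1024, 10): the 10 best-ranked bands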
Example #9
        y_train, y_test = labels[train_index], labels[test_index]
        start_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        acc = []

        # lap_score
        method = 'lap_score'
        kwargs_W = {
            "metric": "euclidean",
            "neighbor_mode": "knn",
            "weight_mode": "heat_kernel",
            "k": 5,
            't': 1
        }
        W = construct_W.construct_W(X_train, **kwargs_W)
        score = lap_score.lap_score(X_train, W=W)
        idx = lap_score.feature_ranking(score)
        selected_fea_train = X_train[:, idx[0:num_features]]
        selected_fea_test = X_test[:, idx[0:num_features]]
        clf.fit(selected_fea_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

        # fisher_score
        score = fisher_score.fisher_score(X_train, y_train)
        idx = fisher_score.feature_ranking(score)
        selected_fea_train = X_train[:, idx[0:num_features]]
        selected_fea_test = X_test[:, idx[0:num_features]]
        clf.fit(selected_fea_train, y_train)
        acc.append(accuracy_score(y_test, clf.predict(selected_fea_test)))

        # reliefF
        score = reliefF.reliefF(X_train, y_train)
Example #10
    def bench(self, X, X_norm, y, n=2):
        num_feats = 20
        output_data = {'method': list(), 'features': list(), 'time': list(), self.test_att: list(), 'supervised': list()}

        # ----------------------------------------------------------------
        # CFS
        # start = time.perf_counter()
        # idx = cfs(X_norm.to_numpy(), y.to_numpy())[0]
        # print(idx)
        # selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
        # output_data['method'].append('CFS')
        # output_data['time'].append(time.perf_counter() - start)
        # output_data['features'].append(selected_features)
        # output_data[self.test_att].append(self.train_real_data(selected_features, X))

        # LA: Laplacian Score
        start = time.perf_counter()
        kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
        W = construct_W.construct_W(X_norm.to_numpy(), **kwargs_W)
        score = lap_score.lap_score(X_norm.to_numpy(), W=W)
        idx = lap_score.feature_ranking(score)
        selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
        output_data['method'].append('Laplacian Score')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(selected_features)
        output_data['supervised'].append(False)
        output_data[self.test_att].append(self.train_real_data(selected_features, X))
        print(output_data)

        # FCBF: Fast Correlation-Based Filter
        # start = time.perf_counter()
        # idx = fcbf(X_norm.to_numpy(), y.to_numpy(), n_selected_features=num_feats)[0]
        # selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
        # output_data['method'].append('FCBF')
        # output_data['time'].append(time.perf_counter() - start)
        # output_data['features'].append(selected_features)
        # output_data['supervised'].append(True)
        # output_data[self.test_att].append(self.train_real_data(selected_features, X))
        # print(output_data)
        # output_data['method'].append('FCBF')
        # output_data['time'].append(9999999)
        # output_data['features'].append([])
        # output_data['supervised'].append(True)
        # output_data[self.test_att].append(0.0)

        # UDFS: Unsupervised Discriminative Feature Selection
        start = time.perf_counter()
        Weight = udfs(X_norm.to_numpy(), gamma=0.1, n_clusters=n)
        idx = feature_ranking(Weight)
        selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
        output_data['method'].append('UDFS')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(selected_features)
        output_data['supervised'].append(False)
        output_data[self.test_att].append(self.train_real_data(selected_features, X))
        print(output_data)

        # SPEC: Spectral Feature Selection
        start = time.perf_counter()
        score = spec(X_norm.to_numpy())
        idx = feature_ranking_spec(score)
        selected_features = X_norm.iloc[:, idx[0: num_feats]].columns.tolist()
        output_data['method'].append('SPEC')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(selected_features)
        output_data['supervised'].append(False)
        output_data[self.test_att].append(self.train_real_data(selected_features, X))
        print(output_data)

        # mRMR: minimum redundancy maximum relevance
        start = time.perf_counter()
        mrmr = pymrmr.mRMR(X_norm, 'MIQ', num_feats)
        output_data['method'].append('MRMR(MIQ)')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(mrmr)
        output_data['supervised'].append(False)
        output_data[self.test_att].append(self.train_real_data(mrmr, X))
        print(output_data)

        # mRMR: minimum redundancy maximum relevance
        start = time.perf_counter()
        mrmr = pymrmr.mRMR(X_norm, 'MID', num_feats)
        output_data['method'].append('MRMR(MID)')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(mrmr)
        output_data['supervised'].append(False)
        output_data[self.test_att].append(self.train_real_data(mrmr, X))
        print(output_data)

        # recursive feature elimination(RFE):

        from sklearn.feature_selection import RFE
        from sklearn.linear_model import LogisticRegression
        rfe_selector = RFE(estimator=LogisticRegression(), n_features_to_select=num_feats, step=10, verbose=5)
        start = time.perf_counter()
        rfe_selector.fit(X_norm, y)
        rfe_support = rfe_selector.get_support()
        rfe_feature = X_norm.loc[:, rfe_support].columns.tolist()
        output_data['method'].append('RFE')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(rfe_feature)
        output_data['supervised'].append(True)
        output_data[self.test_att].append(self.train_real_data(rfe_feature, X))
        print(output_data)

        # ----------------------------------------------------------------
        # Lasso: SelectFromModel:

        from sklearn.feature_selection import SelectFromModel
        from sklearn.linear_model import LogisticRegression

        embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1"), max_features=num_feats)
        start = time.perf_counter()
        embeded_lr_selector.fit(X_norm, y)

        embeded_lr_support = embeded_lr_selector.get_support()
        embeded_lr_feature = X_norm.loc[:, embeded_lr_support].columns.tolist()
        output_data['method'].append('Lasso')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(embeded_lr_feature)
        output_data['supervised'].append(True)
        output_data[self.test_att].append(self.train_real_data(embeded_lr_feature, X))
        print(output_data)
        print(str(len(embeded_lr_feature)), 'selected features')

        # -----------------------------------------------------------------------------
        # Tree - based: SelectFromModel:

        from sklearn.feature_selection import SelectFromModel
        from sklearn.ensemble import RandomForestClassifier

        embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_feats)
        start = time.perf_counter()
        embeded_rf_selector.fit(X_norm, y)

        embeded_rf_support = embeded_rf_selector.get_support()
        embeded_rf_feature = X_norm.loc[:, embeded_rf_support].columns.tolist()
        output_data['method'].append('Tree_Based_RF')
        output_data['time'].append(time.perf_counter() - start)
        output_data['features'].append(embeded_rf_feature)
        output_data['supervised'].append(True)
        output_data[self.test_att].append(self.train_real_data(embeded_rf_feature, X))
        print(output_data)
        print(str(len(embeded_rf_feature)), 'selected features')

        # -------------------------------------------------------------------------------
        # also tree based:

        from sklearn.feature_selection import SelectFromModel
        from lightgbm import LGBMClassifier

        lgbc = LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
                              reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)

        embeded_lgb_selector = SelectFromModel(lgbc, max_features=num_feats)
        start = time.perf_counter()
        embeded_lgb_selector.fit(X_norm, y)

        embeded_lgb_support = embeded_lgb_selector.get_support()
        embeded_lgb_feature = X_norm.loc[:, embeded_lgb_support].columns.tolist()
        output_data['method'].append('Tree_Based_lightGBM')
        output_data['time'].append(time.perf_counter() - start)
        output_data['supervised'].append(True)
        output_data['features'].append(embeded_lgb_feature)
        output_data[self.test_att].append(self.train_real_data(embeded_lgb_feature, X))
        print(output_data)
        print(str(len(embeded_lgb_feature)), 'selected features')

        return output_data
Example #11
    ########################### Apply Feature Selection methods: ReliefF, Laplacian score & Fisher
    #ReliefF
    score_rel = reliefF.reliefF(X_train, y_train)
    idx_rel = reliefF.feature_ranking(score_rel)
    #Laplacian score
    kwargs_W = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "k": 7,
        't': 1,
        'reliefF': True
    }
    W = construct_W.construct_W(X_train, **kwargs_W)
    score_lap = lap_score.lap_score(X_train, W=W)
    idx_lap = lap_score.feature_ranking(score_lap)
    #Fisher
    score_fish = fisher_score.fisher_score(X_train, y_train)
    print(score_fish)
    idx_fish = fisher_score.feature_ranking(score_fish)
    ###################################### Feature Integration
    idxM = idx_rel[:threshold]
    idxN = idx_lap[:threshold]
    idxO = idx_fish[:threshold]

    if combination_method == 1:
        #AND
        idx_and = reduce(np.intersect1d, (idxO, idxM, idxN))
        idx = idx_and
        print("number of selectes features (bins) = ", idx.shape[0])