Example #1
    def predict(self, X):
        """
        Select the top self.n_band bands with NDFS.

        :param X: array of shape [n_row*n_clm, n_band]
        :return: array of shape [n_row*n_clm, self.n_band] with the selected bands
        """
        # construct affinity matrix
        kwargs = {
            "metric": "euclidean",
            "neighborMode": "knn",
            "weightMode": "heatKernel",
            "k": 5,
            't': 1
        }
        W = construct_W.construct_W(X, **kwargs)

        # obtain the feature weight matrix
        Weight = NDFS.ndfs(X, W=W, n_clusters=self.n_cluster)

        # rank the features in descending order of their scores (most informative first)
        idx = feature_ranking(Weight)

        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:self.n_band]]
        return selected_features
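The predict method above belongs to a selector class that the example does not show. Below is a minimal self-contained sketch, assuming a hypothetical wrapper class NDFSBandSelector whose n_band and n_cluster attributes are inferred from the method body; the data is synthetic.

import numpy as np
from skfeature.function.sparse_learning_based import NDFS
from skfeature.utility import construct_W
from skfeature.utility.sparse_learning import feature_ranking

class NDFSBandSelector:
    # hypothetical wrapper; only predict() appears in the original example
    def __init__(self, n_band, n_cluster):
        self.n_band = n_band        # number of spectral bands to keep
        self.n_cluster = n_cluster  # number of pseudo-label clusters for NDFS

    def predict(self, X):
        kwargs = {"metric": "euclidean", "neighborMode": "knn",
                  "weightMode": "heatKernel", "k": 5, 't': 1}
        W = construct_W.construct_W(X, **kwargs)
        Weight = NDFS.ndfs(X, W=W, n_clusters=self.n_cluster)
        idx = feature_ranking(Weight)
        return X[:, idx[0:self.n_band]]

# synthetic "image cube" flattened to [n_row*n_clm, n_band]
X = np.random.rand(10 * 10, 50)
X_selected = NDFSBandSelector(n_band=5, n_cluster=3).predict(X)
print(X_selected.shape)  # (100, 5)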
Example #2
def calc_NDFS(data, n_clusters=20):
    # construct affinity matrix
    kwargs = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(data, **kwargs)

    # obtain the feature weight matrix
    Weight = NDFS.ndfs(data, W=W, n_clusters=n_clusters)
    # NDFS score per feature: the squared l2-norm of each row of the weight matrix
    return (Weight * Weight).sum(1)
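calc_NDFS returns one score per feature (higher means more informative), so a top-k selection is a single argsort away. A small sketch on synthetic data, assuming calc_NDFS as defined above:

import numpy as np

data = np.random.rand(80, 40)              # 80 samples, 40 features
scores = calc_NDFS(data, n_clusters=5)     # one NDFS score per feature
top_k = np.argsort(scores)[::-1][:10]      # indices of the 10 best features
data_selected = data[:, top_k]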
Example #3
def SKF_ndfs(X, y):
    # construct affinity matrix
    kwargs = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(X, **kwargs)
    # the number of clusters is usually set to the number of classes in the ground truth
    num_cluster = len(set(y))
    # obtain the feature weight matrix
    Weight = NDFS.ndfs(X, W=W, n_clusters=num_cluster)
    return sparse_learning.feature_ranking(Weight)
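Note that SKF_ndfs uses the labels y only to pick the number of clusters, so the scoring itself stays unsupervised. A usage sketch on synthetic labeled data, assuming SKF_ndfs as defined above:

import numpy as np

X = np.random.rand(60, 25)
y = np.repeat([0, 1, 2], 20)     # three ground-truth classes -> n_clusters = 3
ranking = SKF_ndfs(X, y)         # feature indices, best first
X_reduced = X[:, ranking[:10]]   # keep the ten top-ranked features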
Example #4
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']  # data
    X = X.astype(float)
    y = mat['Y']  # label
    y = y[:, 0]

    # construct affinity matrix
    kwargs = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(X, **kwargs)

    # obtain the feature weight matrix
    Weight = NDFS.ndfs(X, W=W, n_clusters=20)

    # rank the features in descending order of their scores (most informative first)
    idx = feature_ranking(Weight)

    # perform evaluation on clustering task
    num_fea = 100  # number of selected features
    num_cluster = 20  # number of clusters, it is usually set as the number of classes in the ground truth

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform kmeans clustering based on the selected features and repeats 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total) / 20)
    print('ACC:', float(acc_total) / 20)
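The 20 repeats average out the sensitivity of k-means to its random initialization inside unsupervised_evaluation.evaluation. A sketch that factors this averaging into a reusable helper and also reports the spread; evaluate_selection is our name, not part of scikit-feature:

import numpy as np
from skfeature.utility import unsupervised_evaluation

def evaluate_selection(X_selected, n_clusters, y, n_repeats=20):
    # repeat the k-means based evaluation and report mean and std of NMI/ACC
    nmis, accs = [], []
    for _ in range(n_repeats):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=X_selected, n_clusters=n_clusters, y=y)
        nmis.append(nmi)
        accs.append(acc)
    return np.mean(nmis), np.std(nmis), np.mean(accs), np.std(accs)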
Example #5
def ndfs_score(diheds):
    from numpy import mean
    from skfeature.function.sparse_learning_based import NDFS
    from skfeature.utility import construct_W
    from skfeature.utility.sparse_learning import feature_ranking

    idx = []
    kwargs = {"metric": "euclidean", "neighborMode": "knn", "weightMode": "heatKernel", "k": 5, 't': 1}
    # change the path for every system to be run, e.g.:
    # os.chdir('/home/anu/Downloads/DESRES-Trajectory_GTT-1-protein/GTT-1-protein')

    # score every 5th frame of the trajectory
    for i in range(0, len(diheds), 5):
        X = diheds[i]
        W = construct_W.construct_W(X, **kwargs)
        score = NDFS.ndfs(X, W=W, n_clusters=20)
        idx.append(score)

    # average the NDFS weight matrices over the sampled frames, then rank
    col_mean = mean(idx, axis=0)
    imp_features = feature_ranking(col_mean)
    return col_mean, imp_features
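ndfs_score expects a sequence of per-frame feature matrices (e.g. dihedral angles along an MD trajectory) and scores every 5th frame. A usage sketch with synthetic shapes, assuming ndfs_score as defined above:

import numpy as np

# 12 frames, each with 200 samples of 30 dihedral features (synthetic stand-in)
diheds = [np.random.rand(200, 30) for _ in range(12)]
col_mean, imp_features = ndfs_score(diheds)   # frames 0, 5 and 10 are scored
print(imp_features[:5])                       # five most important dihedrals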
Example #6
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]

    # construct affinity matrix
    kwargs = {"metric": "euclidean", "neighborMode": "knn", "weightMode": "heatKernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X, **kwargs)

    # obtain the feature weight matrix
    Weight = NDFS.ndfs(X, W=W, n_clusters=20)

    # rank the features in descending order of their scores (most informative first)
    idx = feature_ranking(Weight)

    # perform evaluation on clustering task
    num_fea = 100    # number of selected features
    num_cluster = 20    # number of clusters, it is usually set as the number of classes in the ground truth

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform kmeans clustering based on the selected features and repeats 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total) / 20)
    print('ACC:', float(acc_total) / 20)
Example #7
def selectFeatureNDFS(filename, num_feature, num_cluster):

    # Load the pickles saved to disk with the samples and ALL the features extracted by TSFresh. This is what we will work on.
    all_features_train = pd.read_pickle(
        "./pickle/feature_complete/TRAIN/{0}_TRAIN_FeatureComplete.pkl".format(
            filename))
    all_features_test = pd.read_pickle(
        "./pickle/feature_complete/TEST/{0}_TEST_FeatureComplete.pkl".format(
            filename))

    # Drop columns containing NaN values
    all_features_train = all_features_train.dropna(axis=1)
    all_features_test = all_features_test.dropna(axis=1)

    # Build the affinity matrix W to pass to NDFS
    kwargs = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(all_features_train.values, **kwargs)

    # Run NDFS several times so as to really obtain the best features
    dizionarioOccorrenzeFeature = {}

    for i in range(0, 10):

        # Run the NDFS algorithm to obtain the per-cluster feature weights.
        featurePesate = NDFS.ndfs(all_features_train.values, n_clusters=20, W=W)

        # rank the features in descending order
        idx = feature_ranking(featurePesate)

        # take the desired number of features
        idxSelected = idx[0:num_feature]

        # update this feature's occurrence count in the dictionary
        for feature in idxSelected:
            dizionarioOccorrenzeFeature[feature] = dizionarioOccorrenzeFeature.get(feature, 0) + 1

    # Sort the dictionary in descending order so that the most frequent feature comes first.
    # This yields a list of (featureName, occurrenceCount) tuples.
    dizionarioOccorrenzeFeature_sorted = sorted(
        dizionarioOccorrenzeFeature.items(), key=lambda kv: -kv[1])

    # Put all the feature names from the dictionary into an array
    featureFrequenti = []
    for key, value in dizionarioOccorrenzeFeature_sorted:
        featureFrequenti.append(key)

    # select the desired number of features
    idxSelected = featureFrequenti[0:num_feature]

    # Extract the names of the chosen features
    nomiFeatureSelezionate = []

    for i in idxSelected:
        nomiFeatureSelezionate.append(all_features_train.columns[i])

    # Build the dataframe containing only the selected features
    dataframeFeatureSelezionate = all_features_train.loc[:,
                                                         nomiFeatureSelezionate]

    # Restrict the test set to the chosen features as well
    all_features_test = all_features_test.loc[:, nomiFeatureSelezionate]

    # Extract the known class labels
    labelConosciute = estrattoreClassiConosciute.estraiLabelConosciute(
        "./UCRArchive_2018/{0}/{0}_TEST.tsv".format(filename))

    # K-means on the selected features
    print("\nResults with the features we selected via NDFS")
    print("Number of features: {0}".format(all_features_test.shape[1]))
    testFeatureSelection(X_selected=dataframeFeatureSelezionate.values,
                         X_test=all_features_test.values,
                         num_clusters=num_cluster,
                         y=labelConosciute)
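A sketch of how selectFeatureNDFS might be called; the dataset name ECG5000 is illustrative, and the hard-coded pickle and UCRArchive_2018 paths inside the function must exist for the call to succeed:

# hypothetical invocation for one UCR dataset
selectFeatureNDFS("ECG5000", num_feature=30, num_cluster=5)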