def predict(self, X):
    """
    :param X: shape [n_row*n_clm, n_band]
    :return: array of shape [n_row*n_clm, self.n_band], the data restricted to the selected bands
    """
    # construct the affinity matrix with a kNN heat-kernel weighting
    kwargs = {"metric": "euclidean", "neighborMode": "knn",
              "weightMode": "heatKernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X, **kwargs)

    # obtain the feature weight matrix
    Weight = NDFS.ndfs(X, W=W, n_clusters=self.n_cluster)

    # rank the features in descending order of their scores
    idx = feature_ranking(Weight)

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:self.n_band]]
    return selected_features
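# The predict() method above evidently belongs to an unsupervised band-selection
# class for hyperspectral data. Below is a minimal, hypothetical driver sketch:
# the wrapper class name `NDFSBandSelector`, its constructor, and the toy cube
# sizes are assumptions, not part of the original source; only the pipeline
# (construct_W -> NDFS -> feature_ranking -> column slice) mirrors the method above.
import numpy as np
from skfeature.function.sparse_learning_based import NDFS
from skfeature.utility import construct_W
from skfeature.utility.sparse_learning import feature_ranking


class NDFSBandSelector:
    def __init__(self, n_band=10, n_cluster=5):
        self.n_band = n_band        # number of spectral bands to keep
        self.n_cluster = n_cluster  # pseudo-cluster count passed to NDFS

    def predict(self, X):
        # same pipeline as the predict() method shown above
        kwargs = {"metric": "euclidean", "neighborMode": "knn",
                  "weightMode": "heatKernel", "k": 5, 't': 1}
        W = construct_W.construct_W(X, **kwargs)
        Weight = NDFS.ndfs(X, W=W, n_clusters=self.n_cluster)
        idx = feature_ranking(Weight)
        return X[:, idx[0:self.n_band]]


if __name__ == '__main__':
    cube = np.random.rand(20, 20, 32)   # toy hyperspectral image: 20x20 pixels, 32 bands
    X = cube.reshape(-1, 32)            # flatten to [n_row*n_clm, n_band]
    X_selected = NDFSBandSelector(n_band=8, n_cluster=5).predict(X)
    print(X_selected.shape)             # (400, 8)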
def calc_NDFS(data, n_clusters=20):
    # construct the affinity matrix with a kNN heat-kernel weighting
    kwargs = {"metric": "euclidean", "neighborMode": "knn",
              "weightMode": "heatKernel", "k": 5, 't': 1}
    W = construct_W.construct_W(data, **kwargs)

    # obtain the feature weight matrix
    Weight = NDFS.ndfs(data, W=W, n_clusters=n_clusters)

    # score each feature by the squared L2 norm of its row in the weight matrix
    return (Weight * Weight).sum(1)
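# calc_NDFS() above returns one score per feature -- the squared L2 norm of each
# row of the NDFS weight matrix, which is exactly the quantity feature_ranking()
# sorts on elsewhere in this file. A minimal usage sketch, assuming calc_NDFS and
# its skfeature imports are in scope; the toy data shape and top_k value are
# illustrative assumptions.
import numpy as np

data = np.random.rand(100, 50)            # toy data: 100 samples, 50 features
scores = calc_NDFS(data, n_clusters=5)    # one score per feature, higher = more discriminative
top_k = 10
idx = np.argsort(scores)[::-1]            # descending order, mirroring feature_ranking
selected = data[:, idx[:top_k]]           # keep the 10 highest-scoring features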
def SKF_ndfs(X, y):
    # construct the affinity matrix with a kNN heat-kernel weighting
    kwargs = {"metric": "euclidean", "neighborMode": "knn",
              "weightMode": "heatKernel", "k": 5, 't': 1}
    W = construct_W(X, **kwargs)

    # the number of clusters is usually set to the number of classes in the ground truth
    num_cluster = len(set(y))

    # obtain the feature weight matrix
    Weight = NDFS.ndfs(X, W=W, n_clusters=num_cluster)

    # return feature indices ranked in descending order of importance
    return sparse_learning.feature_ranking(Weight)
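# SKF_ndfs() above derives the cluster count from the ground-truth labels and
# returns feature indices ranked best-first. A small usage sketch, assuming the
# snippet's imports (construct_W, NDFS, sparse_learning) are in scope; the toy
# data and the choice of 5 features are assumptions.
import numpy as np

X = np.random.rand(60, 20)        # toy data: 60 samples, 20 features
y = np.repeat([0, 1, 2], 20)      # 3 classes -> NDFS is run with n_clusters=3
idx = SKF_ndfs(X, y)              # ranked feature indices, best first
X_top5 = X[:, idx[:5]]            # keep the 5 top-ranked features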
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]

    # construct affinity matrix
    kwargs = {"metric": "euclidean", "neighborMode": "knn",
              "weightMode": "heatKernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X, **kwargs)

    # obtain the feature weight matrix
    Weight = NDFS.ndfs(X, W=W, n_clusters=20)

    # rank the features in descending order of their scores
    idx = feature_ranking(Weight)

    # perform evaluation on the clustering task
    num_fea = 100       # number of selected features
    num_cluster = 20    # number of clusters, usually set to the number of classes in the ground truth

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform k-means clustering on the selected features, repeated 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total) / 20)
    print('ACC:', float(acc_total) / 20)
def ndfs_score(diheds):
    from numpy import mean

    from skfeature.function.sparse_learning_based import NDFS
    from skfeature.utility import construct_W
    from skfeature.utility.sparse_learning import feature_ranking

    kwargs = {"metric": "euclidean", "neighborMode": "knn",
              "weightMode": "heatKernel", "k": 5, 't': 1}

    # change the path for every system to be run, e.g.:
    # os.chdir('/home/anu/Downloads/DESRES-Trajectory_GTT-1-protein/GTT-1-protein')

    # score every 5th feature matrix and collect the NDFS weight matrices
    idx = []
    for i in range(0, len(diheds), 5):
        X = diheds[i]
        W = construct_W.construct_W(X, **kwargs)
        score = NDFS.ndfs(X, W=W, n_clusters=20)
        idx.append(score)

    # average the weight matrices and rank features on the averaged weights
    col_mean = mean(idx, axis=0)
    imp_features = feature_ranking(col_mean)
    return col_mean, imp_features
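# ndfs_score() above averages NDFS weight matrices over every 5th feature matrix
# in `diheds` and ranks features on the averaged weights. A toy usage sketch;
# the list length, frame count, and feature count below are assumptions standing
# in for real dihedral-angle matrices.
import numpy as np

diheds = [np.random.rand(200, 30) for _ in range(10)]   # 10 chunks, 200 frames, 30 dihedral features each
col_mean, imp_features = ndfs_score(diheds)
print(imp_features[:5])   # indices of the five highest-scoring dihedral features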
def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]

    # construct affinity matrix
    kwargs = {"metric": "euclidean", "neighborMode": "knn",
              "weightMode": "heatKernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X, **kwargs)

    # obtain the feature weight matrix
    Weight = NDFS.ndfs(X, W=W, n_clusters=20)

    # rank the features in descending order of their scores
    idx = feature_ranking(Weight)

    # perform evaluation on the clustering task
    num_fea = 100       # number of selected features
    num_cluster = 20    # number of clusters, usually set to the number of classes in the ground truth

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform k-means clustering on the selected features, repeated 20 times
    nmi_total = 0
    acc_total = 0
    for i in range(0, 20):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total) / 20)
    print('ACC:', float(acc_total) / 20)
def selectFeatureNDFS(filename, num_feature, num_cluster):
    # Load the pickles saved to disk with the samples and ALL the features
    # extracted by TSFresh. This is what we will work on.
    all_features_train = pd.read_pickle(
        "./pickle/feature_complete/TRAIN/{0}_TRAIN_FeatureComplete.pkl".format(filename))
    all_features_test = pd.read_pickle(
        "./pickle/feature_complete/TEST/{0}_TEST_FeatureComplete.pkl".format(filename))

    # Drop columns containing NaN values
    all_features_train = all_features_train.dropna(axis=1)
    all_features_test = all_features_test.dropna(axis=1)

    # Build the affinity matrix W to feed to NDFS
    kwargs = {"metric": "euclidean", "neighborMode": "knn",
              "weightMode": "heatKernel", "k": 5, 't': 1}
    W = construct_W.construct_W(all_features_train.values, **kwargs)

    # Run NDFS several times so that we really obtain the best features
    dizionarioOccorrenzeFeature = {}
    for i in range(0, 10):
        # Run the NDFS algorithm: we obtain the per-cluster feature weights
        featurePesate = NDFS.ndfs(all_features_train, n_clusters=20, W=W)

        # rank the features in descending order
        idx = feature_ranking(featurePesate)

        # take the requested number of features
        idxSelected = idx[0:num_feature]

        # update each feature's occurrence count in the dictionary
        for feature in idxSelected:
            if feature in dizionarioOccorrenzeFeature:
                dizionarioOccorrenzeFeature[feature] = dizionarioOccorrenzeFeature[feature] + 1
            else:
                dizionarioOccorrenzeFeature[feature] = 1

    # Sort the dictionary in descending order so that the most frequently selected
    # feature comes first. This gives a list of (featureIndex, occurrenceCount) tuples.
    dizionarioOccorrenzeFeature_sorted = sorted(
        dizionarioOccorrenzeFeature.items(), key=lambda kv: -kv[1])

    # Put all the feature indices from the dictionary into an array
    featureFrequenti = []
    for key, value in dizionarioOccorrenzeFeature_sorted:
        featureFrequenti.append(key)

    # select the desired number of features
    idxSelected = featureFrequenti[0:num_feature]

    # Extract the names of the selected features
    nomiFeatureSelezionate = []
    for i in idxSelected:
        nomiFeatureSelezionate.append(all_features_train.columns[i])

    # Build the dataframe with only the selected features
    dataframeFeatureSelezionate = all_features_train.loc[:, nomiFeatureSelezionate]

    # Restrict the test set to the selected features as well
    all_features_test = all_features_test.loc[:, nomiFeatureSelezionate]

    # Extract the known class labels
    labelConosciute = estrattoreClassiConosciute.estraiLabelConosciute(
        "./UCRArchive_2018/{0}/{0}_TEST.tsv".format(filename))

    # K-means on the selected features
    print("\nResults with features selected by us with NDFS")
    print("Number of features: {0}".format(all_features_test.shape[1]))
    testFeatureSelection(X_selected=dataframeFeatureSelezionate.values,
                         X_test=all_features_test.values,
                         num_clusters=num_cluster,
                         y=labelConosciute)
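# The loop above stabilizes NDFS (whose k-means initialization is random) by
# running it several times and voting on the selected features. The same idea
# as a standalone helper, sketched with collections.Counter; the function name
# `stable_ndfs_selection` and its defaults are assumptions, not from the source.
from collections import Counter

from skfeature.function.sparse_learning_based import NDFS
from skfeature.utility import construct_W
from skfeature.utility.sparse_learning import feature_ranking


def stable_ndfs_selection(X, num_feature, n_clusters=20, n_runs=10):
    # build the affinity matrix once and reuse it across runs
    kwargs = {"metric": "euclidean", "neighborMode": "knn",
              "weightMode": "heatKernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X, **kwargs)

    votes = Counter()
    for _ in range(n_runs):
        weight = NDFS.ndfs(X, n_clusters=n_clusters, W=W)
        votes.update(feature_ranking(weight)[:num_feature])   # vote for this run's top features

    # feature indices ordered by how often they landed in a run's top `num_feature`
    return [feat for feat, _ in votes.most_common(num_feature)]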