def main():
    """Select features from COIL20 with MCFS and report clustering quality.

    Loads ``../data/COIL20.mat``, builds an affinity matrix, computes the
    MCFS feature weights, keeps the first ``num_fea`` ranked features and
    prints the NMI and ACC averaged over 20 calls to
    ``unsupervised_evaluation.evaluation``.
    """
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X'].astype(float)  # data
    y = mat['Y'][:, 0]  # label

    num_fea = 100  # specify the number of selected features
    # number of clusters, usually set as the number of classes in the ground truth
    num_cluster = 20
    num_runs = 20  # number of repeated clustering evaluations

    # construct affinity matrix
    # NOTE: construct_W reads snake_case option names ("neighbor_mode",
    # "weight_mode", value "heat_kernel"); camelCase spellings are not
    # recognised, so the requested heat-kernel weighting would not be applied.
    kwargs = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        't': 1,
    }
    W = construct_W.construct_W(X, **kwargs)

    # obtain the feature weight matrix (same cluster count as the evaluation)
    Weight = MCFS.mcfs(X, n_selected_features=num_fea, W=W, n_clusters=num_cluster)

    # rank the features from the weight matrix; the leading entries are kept
    idx = MCFS.feature_ranking(Weight)

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform kmeans clustering based on the selected features, repeated num_runs times
    nmi_total = 0
    acc_total = 0
    for _ in range(num_runs):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    # (format-based print calls behave the same on Python 2 and Python 3)
    print('NMI: {}'.format(float(nmi_total) / num_runs))
    print('ACC: {}'.format(float(acc_total) / num_runs))
def test_spec():
    """Run SPEC scoring as a ``SelectKBest`` step inside a ``Pipeline``.

    Loads ``./data/COIL20.mat``, selects ``num_fea`` features through the
    pipeline (with an all-zero ``y`` to show the selection is unsupervised)
    and prints the NMI and ACC averaged over 20 calls to
    ``unsupervised_evaluation.evaluation``.
    """
    # load data
    mat = scipy.io.loadmat('./data/COIL20.mat')
    X = mat['X'].astype(float)  # data
    y = mat['Y'][:, 0]  # label

    # perform evaluation on clustering task
    num_fea = 100  # number of selected features
    # number of clusters, usually set as the number of classes in the ground truth
    num_cluster = 20
    num_runs = 20  # number of repeated clustering evaluations

    # pre-bind the SPEC options so the scorer can be handed to SelectKBest
    kwargs = {'style': 0}
    spec_partial = partial(SPEC.spec, **kwargs)

    pipeline = [
        ('select top k', SelectKBest(score_func=spec_partial, k=num_fea)),
    ]
    model = Pipeline(pipeline)

    # set y param to be 0 to demonstrate that this works in unsupervised sense.
    selected_features = model.fit_transform(X, y=np.zeros(X.shape[0]))

    # perform kmeans clustering based on the selected features, repeated num_runs times
    nmi_total = 0
    acc_total = 0
    for _ in range(num_runs):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC as formatted lines
    # (previously a tuple was printed, giving "('NMI:', 0.5)"-style output)
    print('NMI: {}'.format(float(nmi_total) / num_runs))
    print('ACC: {}'.format(float(acc_total) / num_runs))
def main():
    """Apply low-variance feature selection to BASEHOCK and report clustering quality.

    Loads ``../data/BASEHOCK.mat``, filters the features with
    ``low_variance.low_variance_feature_selection`` using the threshold
    ``p * (1 - p)``, and prints the NMI and ACC averaged over 20 calls to
    ``unsupervised_evaluation.evaluation``.
    """
    # load data
    mat = scipy.io.loadmat('../data/BASEHOCK.mat')
    X = mat['X'].astype(float)  # data
    y = mat['Y'][:, 0]  # label

    p = 0.1  # specify the threshold p to be 0.1
    num_cluster = 2  # specify the number of clusters to be 2
    num_runs = 20  # number of repeated clustering evaluations

    # perform feature selection and obtain the dataset on the selected features
    selected_features = low_variance.low_variance_feature_selection(X, p * (1 - p))

    # perform kmeans clustering based on the selected features, repeated num_runs times
    nmi_total = 0
    acc_total = 0
    for _ in range(num_runs):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    # (format-based print calls behave the same on Python 2 and Python 3)
    print('NMI: {}'.format(float(nmi_total) / num_runs))
    print('ACC: {}'.format(float(acc_total) / num_runs))
def main():
    """Select features from COIL20 with UDFS and report clustering quality.

    Loads ``../data/COIL20.mat``, computes the UDFS feature weight matrix,
    keeps the first ``num_fea`` ranked features and prints the NMI and ACC
    averaged over 20 calls to ``unsupervised_evaluation.evaluation``.
    """
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X'].astype(float)  # data
    y = mat['Y'][:, 0]  # label

    # perform evaluation on clustering task
    num_fea = 100  # number of selected features
    # number of clusters, usually set as the number of classes in the ground truth
    num_cluster = 20
    num_runs = 20  # number of repeated clustering evaluations

    # obtain the feature weight matrix
    Weight = UDFS.udfs(X, gamma=0.1, n_clusters=num_cluster)

    # rank the features from the weight matrix; the leading entries are kept
    idx = feature_ranking(Weight)

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform kmeans clustering based on the selected features, repeated num_runs times
    nmi_total = 0
    acc_total = 0
    for _ in range(num_runs):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    # (format-based print calls behave the same on Python 2 and Python 3)
    print('NMI: {}'.format(float(nmi_total) / num_runs))
    print('ACC: {}'.format(float(acc_total) / num_runs))
def eval_subset(train, test):
    """Score a feature subset with classification, regression and clustering metrics.

    Parameters
    ----------
    train, test : sequence
        Indexed the way the body uses them: ``[0]`` is the feature matrix fed
        to every estimator, ``[1]`` is the regression / reconstruction
        target, ``[2]`` holds the class labels.

    Returns
    -------
    tuple
        ``(MSELR, MSE, acc, DTacc, cnmi, cacc)``:
        linear-regression test MSE, test MSE of the output of ``decoder``,
        1-nearest-neighbour test accuracy, extra-trees test accuracy, and the
        NMI / ACC of ``unsupervised_evaluation.evaluation`` on the training
        features averaged over 10 runs.
    """
    n_clusters = len(np.unique(train[2]))

    # extra-trees accuracy on the held-out split
    clf = ExtraTreesClassifier(n_estimators=50, n_jobs=-1)
    clf.fit(train[0], train[2])
    DTacc = float(clf.score(test[0], test[2]))

    # 1-nearest-neighbour accuracy on the held-out split
    clf = KNeighborsClassifier(n_neighbors=1, algorithm='brute', n_jobs=1)
    clf.fit(train[0], train[2])
    acc = float(clf.score(test[0], test[2]))

    # linear-regression error against test[1]
    LR = LinearRegression(n_jobs=-1)
    LR.fit(train[0], train[1])
    MSELR = float(((LR.predict(test[0]) - test[1])**2).mean())

    # error of whatever decoder(...) returns, compared against test[1]
    MSE = float((((decoder((train[0], train[1]), (test[0], test[1])) - test[1])**2).mean()))

    max_iters = 10
    cnmi, cacc = 0.0, 0.0
    for _ in range(max_iters):
        # Use dedicated names here: unpacking into `acc` would overwrite the
        # 1-NN accuracy computed above before it is printed and returned.
        run_nmi, run_acc = unsupervised_evaluation.evaluation(
            train[0], n_clusters=n_clusters, y=train[2])
        cnmi += run_nmi / max_iters
        cacc += run_acc / max_iters

    print('nmi = {:.3f}, acc = {:.3f}'.format(cnmi, cacc))
    print('acc = {:.3f}, DTacc = {:.3f}, MSELR = {:.3f}, MSE = {:.3f}'.format(
        acc, DTacc, MSELR, MSE))
    return MSELR, MSE, acc, DTacc, float(cnmi), float(cacc)
def test_low_variance():
    """Run low-variance feature selection as a ``Pipeline`` step on BASEHOCK.

    Loads ``./data/BASEHOCK.mat``, transforms the data through a one-step
    pipeline (with an all-zero ``y`` to show the selection is unsupervised),
    evaluates the result with 20 calls to
    ``unsupervised_evaluation.evaluation`` and asserts that the average NMI
    is above 0.0 and the average ACC is above 0.5.
    """
    # load data
    mat = scipy.io.loadmat('./data/BASEHOCK.mat')
    X = mat['X'].astype(float)  # data
    y = mat['Y'][:, 0]  # label

    p = 0.1  # specify the threshold p to be 0.1
    num_cluster = 2  # specify the number of clusters to be 2
    num_runs = 20  # number of repeated clustering evaluations

    # build pipeline
    # this is equivalent to `pipeline.append(('low_variance', VarianceThreshold(threshold=p*(1-p))))`
    pipeline = [
        ('low_variance', low_variance.low_variance_feature_selection(threshold=p*(1-p))),
    ]
    model = Pipeline(pipeline)

    # set y param to be 0 to demonstrate that this works in unsupervised sense.
    # perform feature selection and obtain the dataset on the selected features
    selected_features = model.fit_transform(X, y=np.zeros(X.shape[0]))

    # perform kmeans clustering based on the selected features, repeated num_runs times
    nmi_total = 0
    acc_total = 0
    for _ in range(num_runs):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    mean_nmi = float(nmi_total) / num_runs
    mean_acc = float(acc_total) / num_runs

    # output the average NMI and average ACC as formatted lines
    # (previously a tuple was printed, giving "('NMI:', 0.5)"-style output)
    print('NMI: {}'.format(mean_nmi))
    print('ACC: {}'.format(mean_acc))

    assert_true(mean_nmi > 0.0)
    assert_true(mean_acc > 0.5)
def main():
    """Rank COIL20 features with SPEC and report clustering quality.

    Reads ``../data/COIL20.mat``, scores and ranks the features with SPEC
    (``style`` 0), keeps the first 100 ranked features and prints the NMI and
    ACC averaged over 20 calls to ``unsupervised_evaluation.evaluation``.
    """
    n_runs = 20  # how many times the clustering evaluation is repeated
    num_fea = 100  # number of selected features
    # number of clusters, usually the number of classes in the ground truth
    num_cluster = 20

    # read the data matrix and the label vector
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X'].astype(float)
    y = mat['Y'][:, 0]

    # second ranking function: uses all except the 1st eigenvalue
    kwargs = {'style': 0}

    # score the features, then order them (descending, per the ranking style)
    score = SPEC.spec(X, **kwargs)
    idx = SPEC.feature_ranking(score, **kwargs)

    # restrict the data to the selected features
    selected_features = X[:, idx[0:num_fea]]

    # repeat the kmeans-based evaluation and collect every (nmi, acc) pair
    results = [
        unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        for _ in range(n_runs)
    ]
    nmi_total = sum(nmi for nmi, _ in results)
    acc_total = sum(acc for _, acc in results)

    # report the averages; the numerator is already a float, so "/" is true division
    print('NMI:', float(nmi_total) / n_runs)
    print('ACC:', float(acc_total) / n_runs)
def main():
    """Rank COIL20 features with SPEC and report clustering quality.

    Loads ``../data/COIL20.mat``, scores and ranks the features with SPEC
    (``style`` 0), keeps the first ``num_fea`` ranked features and prints the
    NMI and ACC averaged over 20 calls to
    ``unsupervised_evaluation.evaluation``.
    """
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X'].astype(float)  # data
    y = mat['Y'][:, 0]  # label

    # specify the second ranking function which uses all except the 1st eigenvalue
    kwargs = {'style': 0}

    # obtain the scores of features
    score = SPEC.spec(X, **kwargs)

    # sort the feature scores in a descending order according to the feature scores
    idx = SPEC.feature_ranking(score, **kwargs)

    # perform evaluation on clustering task
    num_fea = 100  # number of selected features
    # number of clusters, usually set as the number of classes in the ground truth
    num_cluster = 20
    num_runs = 20  # number of repeated clustering evaluations

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform kmeans clustering based on the selected features, repeated num_runs times
    nmi_total = 0
    acc_total = 0
    for _ in range(num_runs):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    # (format-based print calls behave the same on Python 2 and Python 3)
    print('NMI: {}'.format(float(nmi_total) / num_runs))
    print('ACC: {}'.format(float(acc_total) / num_runs))
def main():
    """Select features from COIL20 with UDFS and report clustering quality.

    Loads ``../data/COIL20.mat``, computes the UDFS feature weight matrix,
    keeps the first ``num_fea`` ranked features and prints the NMI and ACC
    averaged over 20 calls to ``unsupervised_evaluation.evaluation``.
    """
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X'].astype(float)  # data
    y = mat['Y'][:, 0]  # label

    # perform evaluation on clustering task
    num_fea = 100  # number of selected features
    # number of clusters, usually set as the number of classes in the ground truth
    num_cluster = 20
    num_runs = 20  # number of repeated clustering evaluations

    # obtain the feature weight matrix
    Weight = UDFS.udfs(X, gamma=0.1, n_clusters=num_cluster)

    # rank the features from the weight matrix; the leading entries are kept
    idx = feature_ranking(Weight)

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform kmeans clustering based on the selected features, repeated num_runs times
    nmi_total = 0
    acc_total = 0
    for _ in range(num_runs):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    # (format-based print calls behave the same on Python 2 and Python 3)
    print('NMI: {}'.format(float(nmi_total) / num_runs))
    print('ACC: {}'.format(float(acc_total) / num_runs))
def evaluate_clustering(selected_features, y, n_runs=20):
    """Repeat the clustering evaluation and summarise NMI and ACC.

    Parameters
    ----------
    selected_features : array-like
        Data restricted to the selected features; forwarded unchanged as
        ``X_selected`` to ``unsupervised_evaluation.evaluation``.
    y : array-like
        Ground-truth labels; the number of distinct values is used as the
        number of clusters.
    n_runs : int, optional
        Number of repeated evaluations (default 20, the previous fixed value).

    Returns
    -------
    tuple
        ``((nmi_mean, nmi_std), (acc_mean, acc_std))`` over the ``n_runs`` runs.

    Raises
    ------
    ValueError
        If ``n_runs`` is smaller than 1 (mean/std of zero runs is undefined).
    """
    if n_runs < 1:
        raise ValueError('n_runs must be at least 1, got {}'.format(n_runs))

    # the cluster count does not change between runs, so compute it once
    n_clusters = len(np.unique(y))

    nmi_total = np.zeros(n_runs)
    acc_total = np.zeros(n_runs)
    for i in range(n_runs):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=n_clusters, y=y)
        nmi_total[i] = nmi
        acc_total[i] = acc

    # mean and standard deviation of NMI and ACC across the runs
    return (np.mean(nmi_total), np.std(nmi_total)), (np.mean(acc_total), np.std(acc_total))
def test_lap_score():
    """Run Laplacian-score selection as a ``SelectKBest`` step in a ``Pipeline``.

    Loads ``./data/COIL20.mat``, builds an affinity matrix, selects
    ``num_fea`` features through the pipeline (with an all-zero ``y`` to show
    the selection is unsupervised), evaluates the result with 20 calls to
    ``unsupervised_evaluation.evaluation`` and asserts that both the average
    NMI and the average ACC exceed 0.5.
    """
    from functools import partial

    # load data
    mat = scipy.io.loadmat('./data/COIL20.mat')
    X = mat['X'].astype(float)  # data
    y = mat['Y'][:, 0]  # label

    # construct affinity matrix
    kwargs_W = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(X, **kwargs_W)

    num_fea = 100  # number of selected features
    num_runs = 20  # number of repeated clustering evaluations

    # partial function required for SelectKBest to work correctly.
    lap_score_partial = partial(lap_score.lap_score, W=W)
    pipeline = [
        ('select top k', SelectKBest(score_func=lap_score_partial, k=num_fea)),
    ]
    model = Pipeline(pipeline)

    # set y param to be 0 to demonstrate that this works in unsupervised sense.
    selected_features = model.fit_transform(X, y=np.zeros(X.shape[0]))
    print(selected_features.shape)

    # perform evaluation on clustering task
    # number of clusters, usually set as the number of classes in the ground truth
    num_cluster = 20

    # perform kmeans clustering based on the selected features, repeated num_runs times
    nmi_total = 0
    acc_total = 0
    for _ in range(num_runs):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    mean_nmi = float(nmi_total) / num_runs
    mean_acc = float(acc_total) / num_runs

    # output the average NMI and average ACC as formatted lines
    # (previously a tuple was printed, giving "('NMI:', 0.5)"-style output)
    print('NMI: {}'.format(mean_nmi))
    print('ACC: {}'.format(mean_acc))

    assert_true(mean_nmi > 0.5)
    assert_true(mean_acc > 0.5)
def main():
    """Select features from COIL20 with NDFS and report clustering quality.

    Loads ``../data/COIL20.mat``, builds an affinity matrix, computes the
    NDFS feature weight matrix, keeps the first ``num_fea`` ranked features
    and prints the NMI and ACC averaged over 20 calls to
    ``unsupervised_evaluation.evaluation``.
    """
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X'].astype(float)  # data
    y = mat['Y'][:, 0]  # label

    num_fea = 100  # number of selected features
    # number of clusters, usually set as the number of classes in the ground truth
    num_cluster = 20
    num_runs = 20  # number of repeated clustering evaluations

    # construct affinity matrix
    # NOTE: construct_W reads snake_case option names ("neighbor_mode",
    # "weight_mode", value "heat_kernel"); camelCase spellings are not
    # recognised, so the requested heat-kernel weighting would not be applied.
    kwargs = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(X, **kwargs)

    # obtain the feature weight matrix (same cluster count as the evaluation)
    Weight = NDFS.ndfs(X, W=W, n_clusters=num_cluster)

    # rank the features from the weight matrix; the leading entries are kept
    idx = feature_ranking(Weight)

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform kmeans clustering based on the selected features, repeated num_runs times
    nmi_total = 0
    acc_total = 0
    for _ in range(num_runs):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    print('NMI:', float(nmi_total) / num_runs)
    print('ACC:', float(acc_total) / num_runs)
def main():
    """Rank COIL20 features by Laplacian score and report clustering quality.

    Loads ``../data/COIL20.mat``, builds an affinity matrix, scores and ranks
    the features with ``lap_score``, keeps the first ``num_fea`` ranked
    features and prints the NMI and ACC averaged over 20 calls to
    ``unsupervised_evaluation.evaluation``.
    """
    # load data
    mat = scipy.io.loadmat("../data/COIL20.mat")
    X = mat["X"].astype(float)  # data
    y = mat["Y"][:, 0]  # label

    # construct affinity matrix
    kwargs_W = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        "t": 1,
    }
    W = construct_W.construct_W(X, **kwargs_W)

    # obtain the scores of features
    score = lap_score.lap_score(X, W=W)

    # sort the feature scores in an ascending order according to the feature scores
    idx = lap_score.feature_ranking(score)

    # perform evaluation on clustering task
    num_fea = 100  # number of selected features
    # number of clusters, usually set as the number of classes in the ground truth
    num_cluster = 20
    num_runs = 20  # number of repeated clustering evaluations

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform kmeans clustering based on the selected features, repeated num_runs times
    nmi_total = 0
    acc_total = 0
    for _ in range(num_runs):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC
    # (format-based print calls behave the same on Python 2 and Python 3)
    print("NMI: {}".format(float(nmi_total) / num_runs))
    print("ACC: {}".format(float(acc_total) / num_runs))
# Script: low-variance feature selection on BASEHOCK followed by a repeated
# clustering evaluation; prints the average NMI and ACC.
import scipy.io

from skfeature.utility import unsupervised_evaluation
# NOTE: the local Low_Variance_FS implementation is used here instead of
# skfeature.function.statistical_based.low_variance.
from Statistical_Based.Low_Variance.LowVarianceZeal import Low_Variance_FS

# read the data matrix and the label vector
mat = scipy.io.loadmat('../Datasets/BASEHOCK.mat')
X = mat['X'].astype(float)
y = mat['Y'][:, 0]

p = 0.1  # threshold parameter p
num_cluster = 2  # number of clusters

# keep only the features retained by the variance threshold p * (1 - p)
selected_features = Low_Variance_FS(X, p * (1 - p))

# run the kmeans-based evaluation 20 times, accumulating both metrics
nmi_total = 0
acc_total = 0
for i in range(20):
    nmi, acc = unsupervised_evaluation.evaluation(
        X_selected=selected_features,
        n_clusters=num_cluster,
        y=y,
    )
    nmi_total = nmi_total + nmi
    acc_total = acc_total + acc

# report the averages over the 20 runs
print('NMI:', float(nmi_total) / 20)
print('ACC:', float(acc_total) / 20)