import scipy.io

from skfeature.function.similarity_based import lap_score
from skfeature.utility import construct_W
from skfeature.utility import unsupervised_evaluation


def main():
    # load matlab data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    y = mat['Y']    # label
    y = y[:, 0]
    X = X.astype(float)
    n_samples, n_features = X.shape

    # construct affinity matrix
    kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X, **kwargs_W)

    # feature selection
    score = lap_score.lap_score(X, W=W)
    idx = lap_score.feature_ranking(score)

    # evaluation
    num_fea = 100
    selected_features = X[:, idx[0:num_fea]]
    ari, nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=20, y=y)
    print('ARI:', ari)
    print('NMI:', nmi)
    print('ACC:', acc)


if __name__ == '__main__':
    main()
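# For reference, a minimal sketch of the kind of affinity matrix the kwargs_W
# above request: a k-nearest-neighbor graph with heat kernel weights
# w_ij = exp(-||x_i - x_j||^2 / (2*t^2)). This is our own illustration of what
# the parameters mean, not the library's construct_W implementation.
import numpy as np
from scipy.spatial.distance import cdist

def knn_heat_kernel_affinity(X, k=5, t=1.0):
    dist = cdist(X, X, metric='euclidean')
    W = np.exp(-dist**2 / (2 * t**2))
    # keep only each sample's k nearest neighbors (column 0 of the sort is the sample itself)
    keep = np.zeros(W.shape, dtype=bool)
    order = np.argsort(dist, axis=1)
    for i in range(X.shape[0]):
        keep[i, order[i, 1:k+1]] = True
    # symmetrize: an edge survives if either endpoint selected the other
    return W * np.maximum(keep, keep.T)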
import numpy as np
import scipy.linalg
from sklearn import linear_model

from skfeature.utility.construct_W import construct_W


def mcfs(X, n_selected_features, **kwargs):
    """
    This function implements unsupervised feature selection for multi-cluster data.

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    n_selected_features: {int}
        number of features to select
    kwargs: {dictionary}
        W: {sparse matrix}, shape (n_samples, n_samples)
            affinity matrix
        n_clusters: {int}
            number of clusters (default is 5)

    Output
    ------
    W: {numpy array}, shape (n_features, n_clusters)
        feature weight matrix

    Reference
    ---------
    Cai, Deng et al. "Unsupervised Feature Selection for Multi-Cluster Data." KDD 2010.
    """
    # use the default affinity matrix if none is provided
    if 'W' not in kwargs:
        W = construct_W(X)
    else:
        W = kwargs['W']
    # default number of clusters is 5
    if 'n_clusters' not in kwargs:
        n_clusters = 5
    else:
        n_clusters = kwargs['n_clusters']

    # solve the eigen-decomposition problem and get the top K eigenvectors with
    # respect to the smallest eigenvalues of the normalized graph Laplacian
    # (equivalently, the largest eigenvalues of the normalized affinity matrix)
    W = W.toarray()
    W = (W + W.T) / 2
    W_norm = np.diag(np.sqrt(1 / W.sum(1)))
    W = np.dot(W_norm, np.dot(W, W_norm))
    WT = W.T
    W[W < WT] = WT[W < WT]
    eigen_value, ul = scipy.linalg.eigh(a=W)
    # skip the trivial top eigenvector and keep the next n_clusters
    Y = np.dot(W_norm, ul[:, -1*n_clusters-1:-1])

    # solve K L1-regularized regression problems (one per cluster) using the
    # LARS algorithm with the cardinality constraint set to n_selected_features
    n_sample, n_feature = X.shape
    W = np.zeros((n_feature, n_clusters))
    for i in range(n_clusters):
        clf = linear_model.Lars(n_nonzero_coefs=n_selected_features)
        clf.fit(X, Y[:, i])
        W[:, i] = clf.coef_
    return W
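# The MCFS example below calls MCFS.feature_ranking(S) on the weight matrix
# returned above. Per the MCFS paper, each feature is scored by its largest
# coefficient magnitude across the K regression problems; a minimal sketch of
# such a ranking function (our own, not necessarily the library's exact code):
import numpy as np

def feature_ranking(W):
    mcfs_score = np.absolute(W).max(1)    # MCFS score: max |w_jk| over clusters
    return np.argsort(mcfs_score)[::-1]   # descending: best features first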
import numpy as np
from scipy.sparse import diags

from skfeature.utility.construct_W import construct_W


def lap_score(X, **kwargs):
    """
    This function implements the laplacian score feature selection, steps are as follows:
    1. Construct the affinity matrix W if it is not specified
    2. For the r-th feature, we define fr = X(:,r), D = diag(W*ones), ones = [1,...,1]', L = D - W
    3. Let fr_hat = fr - (fr'*D*ones)*ones/(ones'*D*ones)
    4. Laplacian score for the r-th feature is Lr = (fr_hat'*L*fr_hat)/(fr_hat'*D*fr_hat)

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    kwargs: {dictionary}
        W: {sparse matrix}, shape (n_samples, n_samples)
            input affinity matrix

    Output
    ------
    score: {numpy array}, shape (n_features,)
        laplacian score for each feature

    Reference
    ---------
    He, Xiaofei et al. "Laplacian Score for Feature Selection." NIPS 2005.
    """
    # if 'W' is not specified, construct the default affinity matrix W
    if 'W' not in kwargs.keys():
        W = construct_W(X)
    else:
        W = kwargs['W']

    # build the diagonal D matrix from affinity matrix W
    D = np.array(W.sum(axis=1))
    L = W    # note: this is the affinity matrix; the laplacian enters via 1 - L_prime/D_prime below
    tmp = np.dot(np.transpose(D), X)
    D = diags(np.transpose(D), [0])
    Xt = np.transpose(X)
    t1 = np.transpose(np.dot(Xt, D.todense()))
    t2 = np.transpose(np.dot(Xt, L.todense()))
    # D_prime = fr_hat'*D*fr_hat, the denominator of Lr
    D_prime = np.sum(np.multiply(t1, X), 0) - np.multiply(tmp, tmp)/D.sum()
    # L_prime = fr_hat'*W*fr_hat, so that Lr = 1 - L_prime/D_prime
    L_prime = np.sum(np.multiply(t2, X), 0) - np.multiply(tmp, tmp)/D.sum()
    # avoid the denominator of Lr being 0
    D_prime[D_prime < 1e-12] = 10000
    # compute the laplacian score for all features
    score = 1 - np.array(np.multiply(L_prime, 1/D_prime))[0, :]
    return np.transpose(score)
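# A dense, feature-by-feature transcription of steps 2-4 in the docstring above,
# useful for sanity-checking the vectorized version on small inputs. This is an
# illustrative sketch (the helper name is ours), not part of the library:
import numpy as np

def lap_score_direct(X, W):
    W = np.asarray(W.todense()) if hasattr(W, 'todense') else np.asarray(W)
    d = W.sum(1)                # degrees, D = diag(d)
    L = np.diag(d) - W          # graph laplacian L = D - W
    scores = np.zeros(X.shape[1])
    for r in range(X.shape[1]):
        fr = X[:, r]
        fr_hat = fr - np.dot(fr, d) / d.sum()    # fr - (fr'*D*ones)*ones/(ones'*D*ones)
        num = np.dot(fr_hat, np.dot(L, fr_hat))
        den = np.dot(fr_hat, d * fr_hat)
        scores[r] = num / max(den, 1e-12)        # guard against constant features
    return scores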
import numpy as np
from scipy.sparse import diags

from skfeature.utility.construct_W import construct_W


def fisher_score(X, y):
    """
    This function implements the fisher score feature selection, steps are as follows:
    1. Construct the affinity matrix W in the fisher score way
    2. For the r-th feature, we define fr = X(:,r), D = diag(W*ones), ones = [1,...,1]', L = D - W
    3. Let fr_hat = fr - (fr'*D*ones)*ones/(ones'*D*ones)
    4. The laplacian score for the r-th feature is Lr = (fr_hat'*L*fr_hat)/(fr_hat'*D*fr_hat),
       and the fisher score is 1/Lr - 1

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels

    Output
    ------
    score: {numpy array}, shape (n_features,)
        fisher score for each feature
    """
    # construct the weight matrix W in a fisher score way
    kwargs = {"neighbor_mode": "supervised", "fisher_score": True, 'y': y}
    W = construct_W(X, **kwargs)

    # build the diagonal D matrix from affinity matrix W
    D = np.array(W.sum(axis=1))
    L = W    # note: this is the affinity matrix; the laplacian enters via 1 - L_prime/D_prime below
    tmp = np.dot(np.transpose(D), X)
    D = diags(np.transpose(D), [0])
    Xt = np.transpose(X)
    t1 = np.transpose(np.dot(Xt, D.todense()))
    t2 = np.transpose(np.dot(Xt, L.todense()))
    # D_prime = fr_hat'*D*fr_hat, the denominator of Lr
    D_prime = np.sum(np.multiply(t1, X), 0) - np.multiply(tmp, tmp)/D.sum()
    # L_prime = fr_hat'*W*fr_hat, so that Lr = 1 - L_prime/D_prime
    L_prime = np.sum(np.multiply(t2, X), 0) - np.multiply(tmp, tmp)/D.sum()
    # avoid the denominator of Lr being 0
    D_prime[D_prime < 1e-12] = 10000
    lap_score = 1 - np.array(np.multiply(L_prime, 1/D_prime))[0, :]

    # compute the fisher score from the laplacian score, where fisher_score = 1/lap_score - 1
    score = 1.0/lap_score - 1
    return np.transpose(score)
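# Hedged usage sketch for fisher_score, mirroring the main() examples elsewhere
# in this repo (the data path and module layout are assumptions carried over from them):
import numpy as np
import scipy.io
from skfeature.function.similarity_based import fisher_score

mat = scipy.io.loadmat('../data/COIL20.mat')
X = mat['X'].astype(float)
y = mat['Y'][:, 0]
score = fisher_score.fisher_score(X, y)
idx = np.argsort(score)[::-1]    # larger fisher score = more discriminative
X_selected = X[:, idx[0:100]]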
import scipy.io

from skfeature.function.sparse_learning_based import MCFS
from skfeature.utility import construct_W
from skfeature.utility import unsupervised_evaluation


def main():
    # load matlab data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']
    X = X.astype(float)
    y = mat['Y']
    y = y[:, 0]

    # construct W
    kwargs = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X, **kwargs)

    # mcfs feature selection
    n_selected_features = 100
    S = MCFS.mcfs(X, n_selected_features, W=W, n_clusters=20)
    idx = MCFS.feature_ranking(S)

    # evaluation
    X_selected = X[:, idx[0:n_selected_features]]
    ari, nmi, acc = unsupervised_evaluation.evaluation(X_selected=X_selected, n_clusters=20, y=y)
    print('ARI:', ari)
    print('NMI:', nmi)
    print('ACC:', acc)


if __name__ == '__main__':
    main()
import scipy.io

from skfeature.function.sparse_learning_based import NDFS
from skfeature.utility import construct_W
from skfeature.utility.sparse_learning import feature_ranking
from skfeature.utility.unsupervised_evaluation import evaluation


def main():
    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']    # data
    X = X.astype(float)
    y = mat['Y']    # label
    y = y[:, 0]

    # construct affinity matrix W
    kwargs = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X, **kwargs)

    # NDFS feature selection
    W = NDFS.ndfs(X, W=W, n_clusters=20, verbose=False)
    idx = feature_ranking(W)

    # evaluation
    n_selected_features = 100
    X_selected = X[:, idx[0:n_selected_features]]
    ari, nmi, acc = evaluation(X_selected=X_selected, n_clusters=20, y=y)
    print('ARI:', ari)
    print('NMI:', nmi)
    print('ACC:', acc)


if __name__ == '__main__':
    main()
import math

import numpy as np

from skfeature.utility.construct_W import construct_W


def ndfs(X, **kwargs):
    """
    This function implements unsupervised feature selection using nonnegative spectral analysis, i.e.,
    min_{F,W} Tr(F^T L F) + alpha*(||XW-F||_F^2 + beta*||W||_{2,1}) + gamma/2 * ||F^T F - I||_F^2
    s.t. F >= 0

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    kwargs: {dictionary}
        W: {sparse matrix}, shape (n_samples, n_samples)
            affinity matrix
        alpha: {float}
            parameter alpha in the objective function
        beta: {float}
            parameter beta in the objective function
        gamma: {float}
            a very large number used to force F^T F = I
        F0: {numpy array}, shape (n_samples, n_clusters)
            initialization of the pseudo label matrix F, if not provided
        n_clusters: {int}
            number of clusters
        verbose: {boolean}
            True if user wants to print out the objective function value in each iteration, False if not

    Output
    ------
    W: {numpy array}, shape (n_features, n_clusters)
        feature weight matrix

    Reference
    ---------
    Li, Zechao et al. "Unsupervised Feature Selection Using Nonnegative Spectral Analysis." AAAI 2012.
    """
    # default gamma is 10e8, i.e., a very large number
    if 'gamma' not in kwargs:
        gamma = 10e8
    else:
        gamma = kwargs['gamma']
    # use the default affinity matrix if none is provided
    if 'W' not in kwargs:
        W = construct_W(X)
    else:
        W = kwargs['W']
    if 'alpha' not in kwargs:
        alpha = 1
    else:
        alpha = kwargs['alpha']
    if 'beta' not in kwargs:
        beta = 1
    else:
        beta = kwargs['beta']
    if 'F0' not in kwargs:
        if 'n_clusters' not in kwargs:
            raise ValueError("either F0 or n_clusters should be provided")
        # initialize F by k-means clustering
        n_clusters = kwargs['n_clusters']
        F = kmeans_initialization(X, n_clusters)
    else:
        F = kwargs['F0']
        n_clusters = F.shape[1]
    if 'verbose' not in kwargs:
        verbose = False
    else:
        verbose = kwargs['verbose']

    n_samples, n_features = X.shape

    # initialize D as an identity matrix
    D = np.identity(n_features)
    I = np.identity(n_samples)

    # build the graph laplacian matrix L = diag(W*ones) - W
    if hasattr(W, 'toarray'):
        W = W.toarray()
    L = np.diag(W.sum(1)) - W

    max_iter = 1000
    obj = np.zeros(max_iter)
    for iter_step in range(max_iter):
        # update W
        T = np.linalg.inv(np.dot(X.transpose(), X) + beta * D)
        W = np.dot(np.dot(T, X.transpose()), F)
        # update D
        temp = np.sqrt((W*W).sum(1))
        temp[temp < 1e-16] = 1e-16
        temp = 0.5 / temp
        D = np.diag(temp)
        # update M
        M = L + alpha * (I - np.dot(np.dot(X, T), X.transpose()))
        M = (M + M.transpose())/2
        # update F with the multiplicative rule, then rescale its columns
        denominator = np.dot(M, F) + gamma*np.dot(np.dot(F, F.transpose()), F)
        temp = np.divide(gamma*F, denominator)
        F = F*np.array(temp)
        temp = np.diag(np.sqrt(np.diag(1 / (np.dot(F.transpose(), F) + 1e-16))))
        F = np.dot(F, temp)

        # calculate the objective function value
        obj[iter_step] = np.trace(np.dot(np.dot(F.transpose(), M), F)) + gamma/4*np.linalg.norm(np.dot(F.transpose(), F)-np.identity(n_clusters), 'fro')
        if verbose:
            print('obj at iter ' + str(iter_step+1) + ': ' + str(obj[iter_step]))
        # stop when the objective has converged
        if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3:
            break
    return W
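# ndfs() above calls kmeans_initialization(X, n_clusters) to seed the pseudo
# label matrix F; that helper is defined elsewhere in the module. A minimal
# sketch of such an initialization (our own, assuming scikit-learn's KMeans):
# cluster the samples, one-hot encode the labels, and rescale the columns so
# that F^T F = I, shifting slightly off zero so the multiplicative updates
# are not stuck at exact zeros.
import numpy as np
from sklearn.cluster import KMeans

def kmeans_initialization(X, n_clusters):
    n_samples = X.shape[0]
    labels = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(X)
    Y = np.zeros((n_samples, n_clusters))
    Y[np.arange(n_samples), labels] = 1          # one-hot cluster indicator
    F = Y / np.sqrt(Y.sum(0, keepdims=True))     # orthonormal columns: F^T F = I
    return F + 0.02                              # keep F strictly positive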
import numpy as np

from skfeature.utility.construct_W import construct_W


def trace_ratio(X, y, n_selected_features, **kwargs):
    """
    This function implements the trace ratio criterion for feature selection

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels
    n_selected_features: {int}
        number of features to select
    kwargs: {dictionary}
        style: {string}
            style == 'fisher', build the between-class matrix and within-class affinity matrix in a fisher score way
            style == 'laplacian', build the between-class matrix and within-class affinity matrix in a laplacian score way
        verbose: {boolean}
            True if user wants to print out the objective function value in each iteration, False if not

    Output
    ------
    feature_idx: {numpy array}, shape (n_features,)
        the ranked (descending order) feature index based on subset-level score
    feature_score: {numpy array}, shape (n_features,)
        the feature-level score
    subset_score: {float}
        the subset-level score

    Reference
    ---------
    Feiping Nie et al. "Trace Ratio Criterion for Feature Selection." AAAI 2008.
    """
    # if 'style' is not specified, use the fisher score way to build the two affinity matrices
    if 'style' not in kwargs.keys():
        kwargs['style'] = 'fisher'
    # get the way to build the affinity matrices, 'fisher' or 'laplacian'
    style = kwargs['style']
    n_samples, n_features = X.shape
    # if 'verbose' is not specified, do not output the value of the objective function
    if 'verbose' not in kwargs:
        kwargs['verbose'] = False
    verbose = kwargs['verbose']

    if style == 'fisher':
        kwargs_within = {"neighbor_mode": "supervised", "fisher_score": True, 'y': y}
        # build the within-class and between-class laplacian matrices L_w and L_b
        W_within = construct_W(X, **kwargs_within)
        L_within = np.eye(n_samples) - W_within
        L_tmp = np.eye(n_samples) - np.ones([n_samples, n_samples])/n_samples
        L_between = L_within - L_tmp

    if style == 'laplacian':
        kwargs_within = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
        # build the within-class and between-class laplacian matrices L_w and L_b
        W_within = construct_W(X, **kwargs_within)
        D_within = np.diag(np.array(W_within.sum(1))[:, 0])
        L_within = D_within - W_within
        W_between = np.dot(np.dot(D_within, np.ones([n_samples, n_samples])), D_within)/np.sum(D_within)
        D_between = np.diag(np.array(W_between.sum(1)))
        L_between = D_between - W_between

    # build X'*L_within*X and X'*L_between*X
    L_within = (np.transpose(L_within) + L_within)/2
    L_between = (np.transpose(L_between) + L_between)/2
    S_within = np.array(np.dot(np.dot(np.transpose(X), L_within), X))
    S_between = np.array(np.dot(np.dot(np.transpose(X), L_between), X))

    # S_within reflects the within-class or local affinity relationship encoded on the graph, Sw = X'*Lw*X
    S_within = (np.transpose(S_within) + S_within)/2
    # S_between reflects the between-class or global affinity relationship encoded on the graph, Sb = X'*Lb*X
    S_between = (np.transpose(S_between) + S_between)/2

    # take the absolute values of the diagonals
    s_within = np.absolute(S_within.diagonal())
    s_between = np.absolute(S_between.diagonal())
    s_between[s_between == 0] = 1e-14    # this number is from the authors' code

    # preprocessing
    fs_idx = np.argsort(np.divide(s_between, s_within), 0)[::-1]
    k = np.sum(s_between[0:n_selected_features])/np.sum(s_within[0:n_selected_features])
    s_within = s_within[fs_idx[0:n_selected_features]]
    s_between = s_between[fs_idx[0:n_selected_features]]

    # iterate until convergence
    count = 0
    while True:
        score = np.sort(s_between-k*s_within)[::-1]
        I = np.argsort(s_between-k*s_within)[::-1]
        idx = I[0:n_selected_features]
        old_k = k
        k = np.sum(s_between[idx])/np.sum(s_within[idx])
        if verbose:
            print('obj at iter ' + str(count+1) + ': ' + str(k))
        count += 1
        if abs(k - old_k) < 1e-3:
            break

    # get the feature index, feature-level score and subset-level score
    feature_idx = fs_idx[I]
    feature_score = score
    subset_score = k
    return feature_idx, feature_score, subset_score
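# Hedged usage sketch for trace_ratio, in the style of the other main()
# examples in this repo (the data path and module layout are assumptions
# carried over from them):
import scipy.io
from skfeature.function.similarity_based import trace_ratio

mat = scipy.io.loadmat('../data/COIL20.mat')
X = mat['X'].astype(float)
y = mat['Y'][:, 0]
n_selected_features = 100
idx, feature_score, subset_score = trace_ratio.trace_ratio(X, y, n_selected_features, style='fisher')
X_selected = X[:, idx[0:n_selected_features]]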