def cfs(X, y, mode="index", **kwargs): """ This function uses a correlation based heuristic to evaluate the worth of features which is called CFS Input ----- X: {numpy array}, shape (n_samples, n_features) input data y: {numpy array}, shape (n_samples,) input class labels Output ------ F: {numpy array} index of selected features Reference --------- Zhao, Zheng et al. "Advancing Feature Selection Research - ASU Feature Selection Repository" 2010. """ if 'n_selected_features' in list(kwargs.keys()): n_selected_features = kwargs['n_selected_features'] else: n_selected_features = 0 n_samples, n_features = X.shape F = [] # M stores the merit values M = [] while True: merit = -100000000000 idx = -1 for i in range(n_features): if i not in F: F.append(i) # calculate the merit of current selected features try: t = merit_calculation(X[:, F], y) except ZeroDivisionError: t = -10000000000 if t > merit: merit = t idx = i F.pop() F.append(idx) M.append(merit) if len(M) == n_selected_features and n_selected_features != 0: break else: if n_selected_features == 0 and len(M) == n_features: break if mode == "index": return np.array(F) else: return reverse_argsort(F, X.shape[1])
def t_score(X, y, mode='rank'):
    """
    This function calculates the t-score of each feature; the t-score is only defined for binary problems:
    t_score = |mean1 - mean2| / sqrt((std1^2 / n1) + (std2^2 / n2))

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels

    Output
    ------
    F: {numpy array}, shape (n_features,)
        t-score of each feature
    """
    def feature_ranking(F):
        """
        Rank features in descending order according to t-score; the higher the t-score,
        the more important the feature is.
        """
        idx = np.argsort(F)
        return idx[::-1]

    n_samples, n_features = X.shape
    F = np.zeros(n_features)
    c = np.unique(y)
    if len(c) == 2:
        for i in range(n_features):
            f = X[:, i]
            # class0 contains instances belonging to the first class
            # class1 contains instances belonging to the second class
            class0 = f[y == c[0]]
            class1 = f[y == c[1]]
            mean0 = np.mean(class0)
            mean1 = np.mean(class1)
            std0 = np.std(class0)
            std1 = np.std(class1)
            n0 = len(class0)
            n1 = len(class1)
            t = mean0 - mean1
            t0 = np.true_divide(std0**2, n0)
            t1 = np.true_divide(std1**2, n1)
            F[i] = np.true_divide(t, (t0 + t1)**0.5)
    else:
        raise ValueError('y should be guaranteed to be a binary class vector')
    if mode == "index":
        # note: in this mode the raw |t|-scores are returned, not indices
        return np.array(np.abs(F))
    elif mode == 'feature_ranking':
        return feature_ranking(np.array(np.abs(F)))
    else:
        return reverse_argsort(feature_ranking(np.array(np.abs(F))), X.shape[1])
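# A usage sketch for t_score (illustrative; y must be binary, and this module's
# reverse_argsort helper is assumed for mode='rank'):
#
#   >>> import numpy as np
#   >>> X = np.random.randn(30, 4)
#   >>> y = np.array([0]*15 + [1]*15)
#   >>> t_score(X, y, mode='index')             # |t|-score of each feature
#   >>> t_score(X, y, mode='feature_ranking')   # feature indices, best first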
def decision_tree_backward(X, y, mode="rank", n_selected_features=None):
    """
    This function implements the backward feature selection algorithm based on a decision tree.

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels
    n_selected_features: {int}
        number of selected features

    Output
    ------
    F: {numpy array}, shape (n_features,)
        index of selected features
    """
    n_samples, n_features = X.shape
    if n_selected_features is None:
        n_selected_features = n_features
    # use 10-fold cross validation
    kfold = KFold(n_splits=10, shuffle=True)
    # choose a decision tree as the classifier
    clf = DecisionTreeClassifier()
    # selected feature set, initialized to contain all features
    F = list(range(n_features))
    count = n_features
    while count > n_selected_features:
        max_acc = 0
        for i in range(n_features):
            if i in F:
                F.remove(i)
                X_tmp = X[:, F]
                results = cross_val_score(clf, X_tmp, y, cv=kfold)
                acc = results.mean()
                F.append(i)
                # record the feature whose removal yields the largest accuracy
                if acc > max_acc:
                    max_acc = acc
                    idx = i
        # delete the feature whose removal yields the largest accuracy
        F.remove(idx)
        count -= 1
    if mode == "index":
        return np.array(F)
    else:
        return reverse_argsort(F)
def svm_forward(X, y, mode="rank", n_selected_features=None):
    """
    This function implements the forward feature selection algorithm based on SVM.

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels
    n_selected_features: {int}
        number of selected features

    Output
    ------
    F: {numpy array}, shape (n_features,)
        index of selected features
    """
    n_samples, n_features = X.shape
    if n_selected_features is None:
        n_selected_features = n_features
    # use 10-fold cross validation
    kfold = KFold(n_splits=10, shuffle=True)
    # choose SVM as the classifier
    clf = SVC()
    # selected feature set, initialized to be empty
    F = []
    count = 0
    # greedily add features until n_selected_features are chosen
    while count < n_selected_features:
        max_acc = 0
        for i in range(n_features):
            if i not in F:
                F.append(i)
                X_tmp = X[:, F]
                results = cross_val_score(clf, X_tmp, y, cv=kfold)
                acc = results.mean()
                F.pop()
                # record the feature which results in the largest accuracy
                if acc > max_acc:
                    max_acc = acc
                    idx = i
        # add the feature which results in the largest accuracy
        F.append(idx)
        count += 1
    if mode == "index":
        return np.array(F)
    else:
        return reverse_argsort(F, X.shape[1])
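# A usage sketch for the two wrapper methods above (illustrative; both repeatedly
# run 10-fold cross validation, so they are slow on large data, and they assume
# scikit-learn's KFold, cross_val_score, and the respective classifiers are imported):
#
#   >>> import numpy as np
#   >>> X = np.random.randn(100, 8)
#   >>> y = np.random.randint(0, 2, size=100)
#   >>> decision_tree_backward(X, y, mode="index", n_selected_features=4)  # surviving features
#   >>> svm_forward(X, y, mode="index", n_selected_features=4)             # greedily added features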
def cfs(X, y, mode="rank"): """ This function uses a correlation based heuristic to evaluate the worth of features which is called CFS Input ----- X: {numpy array}, shape (n_samples, n_features) input data y: {numpy array}, shape (n_samples,) input class labels Output ------ F: {numpy array} index of selected features Reference --------- Zhao, Zheng et al. "Advancing Feature Selection Research - ASU Feature Selection Repository" 2010. """ n_samples, n_features = X.shape F = [] # M stores the merit values M = [] while True: merit = -100000000000 idx = -1 for i in range(n_features): if i not in F: F.append(i) # calculate the merit of current selected features t = merit_calculation(X[:, F], y) if t > merit: merit = t idx = i F.pop() F.append(idx) M.append(merit) if len(M) > 5: if M[len(M)-1] <= M[len(M)-2]: if M[len(M)-2] <= M[len(M)-3]: if M[len(M)-3] <= M[len(M)-4]: if M[len(M)-4] <= M[len(M)-5]: break if mode == "index": return np.array(F) else: return reverse_argsort(F, X.shape[1])
def mifs(X, y, mode="rank", **kwargs): """ This function implements the MIFS feature selection Input ----- X: {numpy array}, shape (n_samples, n_features) input data, guaranteed to be discrete y: {numpy array}, shape (n_samples,) input class labels kwargs: {dictionary} n_selected_features: {int} number of features to select Output ------ F: {numpy array}, shape (n_features,) index of selected features, F[0] is the most important feature J_CMI: {numpy array}, shape: (n_features,) corresponding objective function value of selected features MIfy: {numpy array}, shape: (n_features,) corresponding mutual information between selected features and response Reference --------- Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012. """ if 'beta' not in list(kwargs.keys()): beta = 0.5 else: beta = kwargs['beta'] if 'n_selected_features' in list(kwargs.keys()): n_selected_features = kwargs['n_selected_features'] F, J_CMI, MIfy = LCSI.lcsi(X, y, beta=beta, gamma=0, n_selected_features=n_selected_features) else: F, J_CMI, MIfy = LCSI.lcsi(X, y, beta=beta, gamma=0) if mode == "index": return np.array(F, dtype=int) else: # make sure that F is the same size?? return reverse_argsort(F, size=X.shape[1])
def icap(X, y, mode="rank", **kwargs): """ This function implements the ICAP feature selection. The scoring criteria is calculated based on the formula j_icap = I(f;y) - max_j(0,(I(fj;f)-I(fj;f|y))) Input ----- X: {numpy array}, shape (n_samples, n_features) input data, guaranteed to be a discrete data matrix y: {numpy array}, shape (n_samples,) input class labels kwargs: {dictionary} n_selected_features: {int} number of features to select Output ------ F: {numpy array}, shape (n_features,) index of selected features, F[0] is the most important feature J_ICAP: {numpy array}, shape: (n_features,) corresponding objective function value of selected features MIfy: {numpy array}, shape: (n_features,) corresponding mutual information between selected features and response """ n_samples, n_features = X.shape # index of selected features, initialized to be empty F = [] # Objective function value for selected features J_ICAP = [] # Mutual information between feature and response MIfy = [] # indicate whether the user specifies the number of features is_n_selected_features_specified = False if 'n_selected_features' in list(kwargs.keys()): n_selected_features = kwargs['n_selected_features'] is_n_selected_features_specified = True # t1 contains I(f;y) for each feature f t1 = np.zeros(n_features) # max contains max_j(0,(I(fj;f)-I(fj;f|y))) for each feature f max = np.zeros(n_features) for i in range(n_features): f = X[:, i] t1[i] = midd(f, y) # make sure that j_cmi is positive at the very beginning j_icap = 1 while True: if len(F) == 0: # select the feature whose mutual information is the largest idx = np.argmax(t1) F.append(idx) J_ICAP.append(t1[idx]) MIfy.append(t1[idx]) f_select = X[:, idx] if is_n_selected_features_specified is True: if len(F) == n_selected_features: break if is_n_selected_features_specified is not True: if j_icap <= 0: break # we assign an extreme small value to j_icap to ensure it is smaller than all possible values of j_icap j_icap = -1000000000000 for i in range(n_features): if i not in F: f = X[:, i] t2 = midd(f_select, f) t3 = cmidd(f_select, f, y) if t2-t3 > max[i]: max[i] = t2-t3 # calculate j_icap for feature i (not in F) t = t1[i] - max[i] # record the largest j_icap and the corresponding feature index if t > j_icap: j_icap = t idx = i F.append(idx) J_ICAP.append(j_icap) MIfy.append(t1[idx]) f_select = X[:, idx] if mode=="index": return np.array(F, dtype=int) else: # make sure that F is the same size?? return reverse_argsort(F, size=X.shape[1])
def fcbf(X, y, mode="rank", **kwargs): """ This function implements Fast Correlation Based Filter algorithm Input ----- X: {numpy array}, shape (n_samples, n_features) input data, guaranteed to be discrete y: {numpy array}, shape (n_samples,) input class labels kwargs: {dictionary} delta: {float} delta is a threshold parameter, the default value of delta is 0 Output ------ F: {numpy array}, shape (n_features,) index of selected features, F[0] is the most important feature SU: {numpy array}, shape (n_features,) symmetrical uncertainty of selected features Reference --------- Yu, Lei and Liu, Huan. "Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution." ICML 2003. """ n_samples, n_features = X.shape if 'delta' in list(kwargs.keys()): delta = kwargs['delta'] else: # the default value of delta is 0 delta = 0 # t1[:,0] stores index of features, t1[:,1] stores symmetrical uncertainty of features t1 = np.zeros((n_features, 2)) for i in range(n_features): f = X[:, i] t1[i, 0] = i t1[i, 1] = su_calculation(f, y) s_list = np.array(t1[t1[:, 1] > delta, :], dtype=int) # index of selected features, initialized to be empty F = [] # Symmetrical uncertainty of selected features SU = [] while len(s_list) != 0: # select the largest su inside s_list idx = np.argmax(s_list[:, 1]) # record the index of the feature with the largest su fp = X[:, s_list[idx, 0]] np.delete(s_list, idx, 0) F.append(s_list[idx, 0]) SU.append(s_list[idx, 1]) for i in s_list[:, 0]: fi = X[:, i] if su_calculation(fp, fi) >= t1[i, 1]: # construct the mask for feature whose su is larger than su(fp,y) idx = s_list[:, 0] != i idx = np.array([idx, idx]) idx = np.transpose(idx) # delete the feature by using the mask s_list = s_list[idx] length = len(s_list) / 2 s_list = s_list.reshape((int(length), 2)) if mode == "index": return np.array(F, dtype=int) else: # make sure that F is the same size?? return reverse_argsort(F, size=X.shape[1])
def disr(X, y, mode="rank", **kwargs): """ This function implement the DISR feature selection. The scoring criteria is calculated based on the formula j_disr=sum_j(I(f,fj;y)/H(f,fj,y)) Input ----- X: {numpy array}, shape (n_samples, n_features) input data, guaranteed to be a discrete data matrix y: {numpy array}, shape (n_samples,) input class labels kwargs: {dictionary} n_selected_features: {int} number of features to select Output ------ F: {numpy array}, shape (n_features, ) index of selected features, F[0] is the most important feature J_DISR: {numpy array}, shape: (n_features,) corresponding objective function value of selected features MIfy: {numpy array}, shape: (n_features,) corresponding mutual information between selected features and response Reference --------- Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012. """ n_samples, n_features = X.shape # index of selected features, initialized to be empty F = [] # Objective function value for selected features J_DISR = [] # Mutual information between feature and response MIfy = [] # indicate whether the user specifies the number of features is_n_selected_features_specified = False if 'n_selected_features' in list(kwargs.keys()): n_selected_features = kwargs['n_selected_features'] is_n_selected_features_specified = True # sum stores sum_j(I(f,fj;y)/H(f,fj,y)) for each feature f sum = np.zeros(n_features) # make sure that j_cmi is positive at the very beginning j_disr = 1 while True: if len(F) == 0: # t1 stores I(f;y) for each feature f t1 = np.zeros(n_features) for i in range(n_features): f = X[:, i] t1[i] = midd(f, y) # select the feature whose mutual information is the largest idx = np.argmax(t1) F.append(idx) J_DISR.append(t1[idx]) MIfy.append(t1[idx]) f_select = X[:, idx] if is_n_selected_features_specified is True: if len(F) == n_selected_features: break if is_n_selected_features_specified is not True: if j_disr <= 0: break # we assign an extreme small value to j_disr to ensure that it is smaller than all possible value of j_disr j_disr = -1E30 for i in range(n_features): if i not in F: f = X[:, i] t2 = midd(f_select, y) + cmidd(f, y, f_select) t3 = entropyd(f) + conditional_entropy(f_select, f) + (conditional_entropy(y, f_select) - cmidd(y, f, f_select)) sum[i] += np.true_divide(t2, t3) # record the largest j_disr and the corresponding feature index if sum[i] > j_disr: j_disr = sum[i] idx = i F.append(idx) J_DISR.append(j_disr) MIfy.append(t1[idx]) f_select = X[:, idx] if mode=="index": return F else: return reverse_argsort(F, X.shape[1])
def lap_score(X, y=None, mode="rank", **kwargs):
    """
    This function implements the laplacian score feature selection; the steps are as follows:
    1. Construct the affinity matrix W if it is not specified
    2. For the r-th feature, define fr = X(:,r), D = diag(W*ones), ones = [1,...,1]', L = D - W
    3. Let fr_hat = fr - (fr'*D*ones)*ones/(ones'*D*ones)
    4. The laplacian score of the r-th feature is score = (fr_hat'*L*fr_hat)/(fr_hat'*D*fr_hat)

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    kwargs: {dictionary}
        W: {sparse matrix}, shape (n_samples, n_samples)
            input affinity matrix

    Output
    ------
    score: {numpy array}, shape (n_features,)
        laplacian score of each feature

    Reference
    ---------
    He, Xiaofei et al. "Laplacian Score for Feature Selection." NIPS 2005.
    """
    def feature_ranking(score):
        """
        Rank features in ascending order according to their laplacian scores; the smaller
        the laplacian score, the more important the feature is.
        """
        idx = np.argsort(score, 0)
        return idx

    # use the affinity matrix from kwargs if provided, otherwise construct the default W
    if 'W' not in kwargs:
        W = construct_W(X)
    else:
        W = kwargs['W']
    # build the diagonal D matrix from the affinity matrix W
    D = np.array(W.sum(axis=1))
    L = W
    tmp = np.dot(np.transpose(D), X)
    D = diags(np.transpose(D), [0])
    Xt = np.transpose(X)
    t1 = np.transpose(np.dot(Xt, D.todense()))
    t2 = np.transpose(np.dot(Xt, L.todense()))
    # compute the numerator of Lr
    D_prime = np.sum(np.multiply(t1, X), 0) - np.multiply(tmp, tmp)/D.sum()
    # compute the denominator of Lr
    L_prime = np.sum(np.multiply(t2, X), 0) - np.multiply(tmp, tmp)/D.sum()
    # avoid a zero denominator
    D_prime[D_prime < 1e-12] = 10000
    # compute the laplacian score of each feature
    score = 1 - np.array(np.multiply(L_prime, 1/D_prime))[0, :]
    F = feature_ranking(np.transpose(score))
    if mode == "index":
        return np.array(F, dtype=int)
    else:
        return reverse_argsort(F, size=X.shape[1])
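# A usage sketch for lap_score (illustrative; unsupervised, so no y is needed;
# this package's construct_W is assumed, and the keyword options shown here are
# the usual affinity-graph parameters, passed through as assumptions):
#
#   >>> import numpy as np
#   >>> X = np.random.randn(50, 8)
#   >>> W = construct_W(X, metric="euclidean", neighbor_mode="knn", k=5,
#   ...                 weight_mode="heat_kernel", t=1)
#   >>> lap_score(X, W=W, mode="index")  # features sorted by ascending laplacian score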
def spec(X, y=None, mode='rank', **kwargs):
    """
    This function implements the SPEC feature selection.

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    kwargs: {dictionary}
        style: {int}
            style == -1, the first feature ranking function, use all eigenvalues
            style == 0, the second feature ranking function, use all except the 1st eigenvalue
            style >= 2, the third feature ranking function, use the first k except the 1st eigenvalue
        W: {sparse matrix}, shape (n_samples, n_samples)
            input affinity matrix

    Output
    ------
    w_fea: {numpy array}, shape (n_features,)
        SPEC feature score for each feature

    Reference
    ---------
    Zhao, Zheng and Liu, Huan. "Spectral Feature Selection for Supervised and Unsupervised Learning." ICML 2007.
    """
    def feature_ranking(score, **kwargs):
        if 'style' not in kwargs:
            kwargs['style'] = 0
        style = kwargs['style']
        # if style is -1 or 0, rank features in descending order: the higher the score, the more important the feature
        if style == -1 or style == 0:
            idx = np.argsort(score, 0)
            return idx[::-1]
        # otherwise rank features in ascending order: the lower the score, the more important the feature
        else:
            idx = np.argsort(score, 0)
            return idx

    if 'style' not in kwargs:
        kwargs['style'] = 0
    if 'is_classification' not in kwargs:
        # if y is available, run the supervised SPEC algorithm
        kwargs['is_classification'] = True
    if 'W' not in kwargs:
        if y is None:
            kwargs['W'] = rbf_kernel(X, gamma=1)
        elif kwargs['is_classification']:
            kwargs['W'] = similiarity_classification(X, y)
        else:
            kwargs['W'] = similarity_regression(X, y, kwargs.get('n_neighbors', None))

    style = kwargs['style']
    W = kwargs['W']
    if isinstance(W, np.ndarray):
        W = csc_matrix(W)

    n_samples, n_features = X.shape

    # build the degree matrix
    X_sum = np.array(W.sum(axis=1))
    D = np.zeros((n_samples, n_samples))
    for i in range(n_samples):
        D[i, i] = X_sum[i]

    # build the laplacian matrix
    L = D - W
    d1 = np.power(np.array(W.sum(axis=1)), -0.5)
    d1[np.isinf(d1)] = 0
    d2 = np.power(np.array(W.sum(axis=1)), 0.5)
    v = np.dot(np.diag(d2[:, 0]), np.ones(n_samples))
    v = v / LA.norm(v)

    # build the normalized laplacian matrix
    L_hat = (np.matlib.repmat(d1, 1, n_samples)) * np.array(L) * np.matlib.repmat(np.transpose(d1), n_samples, 1)

    # calculate and construct the spectral information
    s, U = np.linalg.eigh(L_hat)
    s = np.flipud(s)
    U = np.fliplr(U)

    # begin to select features
    w_fea = np.ones(n_features) * 1000

    for i in range(n_features):
        f = X[:, i]
        F_hat = np.dot(np.diag(d2[:, 0]), f)
        l = LA.norm(F_hat)
        if l < 100 * np.spacing(1):
            w_fea[i] = 1000
            continue
        else:
            F_hat = F_hat / l
        a = np.array(np.dot(np.transpose(F_hat), U))
        a = np.multiply(a, a)
        a = np.transpose(a)

        # use the f'Lf formulation
        if style == -1:
            w_fea[i] = np.sum(a * s)
        # use all eigenvalues except the 1st
        elif style == 0:
            a1 = a[0:n_samples - 1]
            w_fea[i] = np.sum(a1 * s[0:n_samples - 1]) / (1 - np.power(np.dot(np.transpose(F_hat), v), 2))
        # use the first k eigenvalues except the 1st
        else:
            a1 = a[n_samples - style:n_samples - 1]
            w_fea[i] = np.sum(a1 * (2 - s[n_samples - style:n_samples - 1]))

    if style != -1 and style != 0:
        w_fea[w_fea == 1000] = -1000

    if mode == 'raw':
        return w_fea
    elif mode == 'index':
        # forward the style so the ranking direction matches the scoring function
        return feature_ranking(w_fea, style=style)
    else:
        return reverse_argsort(feature_ranking(w_fea, style=style))
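# A usage sketch for spec (illustrative; with y omitted an RBF-kernel affinity is
# used, and `style` picks the ranking function described in the docstring):
#
#   >>> import numpy as np
#   >>> X = np.random.randn(40, 6)
#   >>> spec(X, style=0, mode='raw')    # raw SPEC scores
#   >>> spec(X, style=0, mode='index')  # feature indices, most important first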
def fisher_score(X, y, mode="rank"): """ This function implements the fisher score feature selection, steps are as follows: 1. Construct the affinity matrix W in fisher score way 2. For the r-th feature, we define fr = X(:,r), D = diag(W*ones), ones = [1,...,1]', L = D - W 3. Let fr_hat = fr - (fr'*D*ones)*ones/(ones'*D*ones) 4. Fisher score for the r-th feature is score = (fr_hat'*D*fr_hat)/(fr_hat'*L*fr_hat)-1 Input ----- X: {numpy array}, shape (n_samples, n_features) input data y: {numpy array}, shape (n_samples,) input class labels Output ------ score: {numpy array}, shape (n_features,) fisher score for each feature Reference --------- He, Xiaofei et al. "Laplacian Score for Feature Selection." NIPS 2005. Duda, Richard et al. "Pattern classification." John Wiley & Sons, 2012. """ def feature_ranking(score): idx = np.argsort(score, 0) return idx[::-1] # Construct weight matrix W in a fisherScore way kwargs = {"neighbor_mode": "supervised", "fisher_score": True, 'y': y} W = construct_W(X, **kwargs) # build the diagonal D matrix from affinity matrix W D = np.array(W.sum(axis=1)) L = W tmp = np.dot(np.transpose(D), X) D = diags(np.transpose(D), [0]) Xt = np.transpose(X) t1 = np.transpose(np.dot(Xt, D.todense())) t2 = np.transpose(np.dot(Xt, L.todense())) # compute the numerator of Lr D_prime = np.sum(np.multiply(t1, X), 0) - np.multiply(tmp, tmp)/D.sum() # compute the denominator of Lr L_prime = np.sum(np.multiply(t2, X), 0) - np.multiply(tmp, tmp)/D.sum() # avoid the denominator of Lr to be 0 D_prime[D_prime < 1e-12] = 10000 lap_score = 1 - np.array(np.multiply(L_prime, 1/D_prime))[0, :] # compute fisher score from laplacian score, where fisher_score = 1/lap_score - 1 score = 1.0/lap_score - 1 """ Rank features in descending order according to fisher score, the larger the fisher score, the more important the feature is """ F = feature_ranking(np.transpose(score)) if mode=="index": return F else: return reverse_argsort(F, X.shape[1])
def proximal_gradient_descent(X, Y_flat, z, mode="rank", **kwargs):
    """
    This function implements supervised sparse feature selection via the l2,1 norm, i.e.,
    min_{W} sum_{i} log(1 + exp(-yi*(W'*x+C))) + z*||W||_{2,1}

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    Y_flat: {numpy array}, shape (n_samples,)
        input class labels, converted internally to a one-hot label matrix
    z: {float}
        regularization parameter
    kwargs: {dictionary}
        verbose: {boolean}
            True if the user wants to print the objective function value in each iteration, False if not

    Output
    ------
    W: {numpy array}, shape (n_features, n_classes)
        weight matrix
    obj: {numpy array}, shape (n_iterations,)
        objective function value during iterations
    value_gamma: {numpy array}, shape (n_iterations,)
        suitable step size during iterations

    Reference
    ---------
    Liu, Jun, et al. "Multi-Task Feature Learning Via Efficient l2,1-Norm Minimization." UAI. 2009.
    """
    verbose = kwargs.get('verbose', False)

    # starting point initialization
    # convert Y_flat to a one-hot encoded label matrix
    Y = construct_label_matrix_pan(Y_flat)
    n_samples, n_features = X.shape
    n_samples, n_classes = Y.shape

    # the indices of positive samples
    p_flag = (Y == 1)
    # the total number of positive samples
    n_positive_samples = np.sum(p_flag, 0)
    # the total number of negative samples
    n_negative_samples = n_samples - n_positive_samples
    n_positive_samples = n_positive_samples.astype(float)
    n_negative_samples = n_negative_samples.astype(float)

    # initialize a starting point
    W = np.zeros((n_features, n_classes))
    C = np.log(np.divide(n_positive_samples, n_negative_samples))

    # compute XW = X*W
    XW = np.dot(X, W)

    # start the main program: the Armijo-Goldstein line search scheme + accelerated gradient descent
    # the initial guess of the Lipschitz constant of the gradient
    gamma = 1.0 / (n_samples * n_classes)

    # assign Wp with W, and XWp with XW
    XWp = XW
    WWp = np.zeros((n_features, n_classes))
    CCp = np.zeros((1, n_classes))

    alphap = 0
    alpha = 1

    # indicates whether the gradient step only changes a little
    flag = False

    max_iter = 1000
    value_gamma = np.zeros(max_iter)
    obj = np.zeros(max_iter)
    for iter_step in range(max_iter):
        # step 1: compute the search point S based on Wp and W (with beta)
        beta = (alphap - 1) / alpha
        S = W + beta * WWp
        SC = C + beta * CCp

        # step 2: line search for gamma and compute the new approximate solution W
        XS = XW + beta * (XW - XWp)
        aa = -np.multiply(Y, XS + np.tile(SC, (n_samples, 1)))
        # fun_S is the logistic loss at the search point
        bb = np.maximum(aa, 0)
        fun_S = np.sum(np.log(np.exp(-bb) + np.exp(aa - bb)) + bb) / (n_samples * n_classes)
        # compute prob = [p_1; p_2; ...; p_m]
        prob = 1.0 / (1 + np.exp(aa))
        b = np.multiply(-Y, (1 - prob)) / (n_samples * n_classes)
        # compute the gradient of C
        GC = np.sum(b, 0)
        # compute the gradient of W as X'*b
        G = np.dot(np.transpose(X), b)

        # copy W and XW to Wp and XWp
        Wp = W
        XWp = XW
        Cp = C

        while True:
            # take a step along the antigradient of S to get V, then apply the l1/l2-norm regularized projection
            V = S - G / gamma
            C = SC - GC / gamma
            W = euclidean_projection(V, n_features, n_classes, z, gamma)
            # the difference between the new approximate solution W and the search point S
            V = W - S
            # compute XW = X*W
            XW = np.dot(X, W)
            aa = -np.multiply(Y, XW + np.tile(C, (n_samples, 1)))
            # fun_W is the logistic loss at the new approximate solution
            bb = np.maximum(aa, 0)
            fun_W = np.sum(np.log(np.exp(-bb) + np.exp(aa - bb)) + bb) / (n_samples * n_classes)

            r_sum = (LA.norm(V, 'fro')**2 + LA.norm(C - SC, 2)**2) / 2
            l_sum = fun_W - fun_S - np.sum(np.multiply(V, G)) - np.inner((C - SC), GC)

            # determine whether the gradient step makes little improvement
            if r_sum <= 1e-20:
                flag = True
                break

            # the condition is fun_W <= fun_S + <V, G> + <C-SC, GC> + gamma/2 * (<V,V> + <C-SC, C-SC>)
            if l_sum < r_sum * gamma:
                break
            else:
                gamma = max(2 * gamma, l_sum / r_sum)
        value_gamma[iter_step] = gamma

        # step 3: update alpha and alphap, and check whether the algorithm converges
        alphap = alpha
        alpha = (1 + math.sqrt(4 * alpha * alpha + 1)) / 2
        WWp = W - Wp
        CCp = C - Cp

        # calculate obj
        obj[iter_step] = fun_W
        obj[iter_step] += z * calculate_l21_norm(W)

        if verbose:
            print('obj at iter {0}: {1}'.format(iter_step + 1, obj[iter_step]))

        if flag is True:
            break

        # determine whether the algorithm converges
        if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step - 1]) < 1e-3:
            break

    if mode == "raw":
        return W, obj, value_gamma
    elif mode == "rank":
        # features are ranked according to the weight matrix
        idx = feature_ranking(W).tolist()
        return reverse_argsort(idx, size=X.shape[1])
    else:
        raise ValueError("Invalid mode {} selected, should be one of \"raw\" or \"rank\"".format(mode))
def proximal_gradient_descent(X, Y_flat, z, mode="rank", **kwargs):
    """
    This function implements supervised sparse feature selection via the l2,1 norm, i.e.,
    min_{W} ||XW-Y||_F^2 + z*||W||_{2,1}

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data, guaranteed to be a numpy array
    Y_flat: {numpy array}, shape (n_samples,)
        input class labels, converted internally to a one-hot label matrix
    z: {float}
        regularization parameter
    kwargs: {dictionary}
        verbose: {boolean}
            True if the user wants to print the objective function value in each iteration, False if not

    Output
    ------
    W: {numpy array}, shape (n_features, n_classes)
        weight matrix
    obj: {numpy array}, shape (n_iterations,)
        objective function value during iterations
    value_gamma: {numpy array}, shape (n_iterations,)
        suitable step size during iterations

    Reference
    ---------
    Liu, Jun, et al. "Multi-Task Feature Learning Via Efficient l2,1-Norm Minimization." UAI. 2009.
    """
    def init_factor(W_norm, XW, Y, z):
        """
        Initialize the starting point of W, according to the authors' code.
        """
        n_samples, n_classes = XW.shape
        a = np.inner(np.reshape(XW, n_samples * n_classes), np.reshape(Y, n_samples * n_classes)) - z * W_norm
        b = LA.norm(XW, 'fro')**2
        ratio = a / b
        return ratio

    verbose = kwargs.get('verbose', False)

    # convert Y_flat to a one-hot encoded label matrix
    Y = construct_label_matrix_pan(Y_flat)

    # starting point initialization
    n_samples, n_features = X.shape
    n_samples, n_classes = Y.shape

    # compute X'Y
    XtY = np.dot(np.transpose(X), Y)

    # initialize a starting point
    W = XtY

    # compute XW = X*W
    XW = np.dot(X, W)

    # compute the l2,1 norm of W
    W_norm = calculate_l21_norm(W)

    if W_norm >= 1e-6:
        ratio = init_factor(W_norm, XW, Y, z)
        W = ratio * W
        XW = ratio * XW

    # start the main program: the Armijo-Goldstein line search scheme + accelerated gradient descent
    # initialize the step size
    gamma = 1

    # assign Wp with W, and XWp with XW
    XWp = XW
    WWp = np.zeros((n_features, n_classes))

    alphap = 0
    alpha = 1

    # indicates whether the gradient step only changes a little
    flag = False

    max_iter = 1000
    value_gamma = np.zeros(max_iter)
    obj = np.zeros(max_iter)
    for iter_step in range(max_iter):
        # step 1: compute the search point S based on Wp and W (with beta)
        beta = (alphap - 1) / alpha
        S = W + beta * WWp

        # step 2: line search for gamma and compute the new approximate solution W
        XS = XW + beta * (XW - XWp)
        # compute X'*XS
        XtXS = np.dot(np.transpose(X), XS)
        # obtain the gradient G
        G = XtXS - XtY

        # copy W and XW to Wp and XWp
        Wp = W
        XWp = XW

        while True:
            # take a step along the antigradient of S to get V, then apply the l1/l2-norm regularized projection
            V = S - G / gamma
            W = euclidean_projection(V, n_features, n_classes, z, gamma)
            # the difference between the new approximate solution W and the search point S
            V = W - S
            # compute XW = X*W
            XW = np.dot(X, W)
            XV = XW - XS
            r_sum = LA.norm(V, 'fro')**2
            l_sum = LA.norm(XV, 'fro')**2

            # determine whether the gradient step makes little improvement
            if r_sum <= 1e-20:
                flag = True
                break

            # the condition is ||XV||_2^2 <= gamma * ||V||_2^2
            if l_sum < r_sum * gamma:
                break
            else:
                gamma = max(2 * gamma, l_sum / r_sum)
        value_gamma[iter_step] = gamma

        # step 3: update alpha and alphap, and check whether the algorithm converges
        alphap = alpha
        alpha = (1 + math.sqrt(4 * alpha * alpha + 1)) / 2
        WWp = W - Wp
        XWY = XW - Y

        # calculate obj
        obj[iter_step] = LA.norm(XWY, 'fro')**2 / 2
        obj[iter_step] += z * calculate_l21_norm(W)

        if verbose:
            print('obj at iter {0}: {1}'.format(iter_step + 1, obj[iter_step]))

        if flag is True:
            break

        # determine whether the algorithm converges
        if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step - 1]) < 1e-3:
            break

    if mode == "raw":
        return W, obj, value_gamma
    elif mode == "rank":
        # features are ranked according to the weight matrix
        idx = feature_ranking(W).tolist()
        return reverse_argsort(idx, size=X.shape[1])
    else:
        raise ValueError("Invalid mode {} selected, should be one of \"raw\" or \"rank\"".format(mode))
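# A usage sketch shared by the two proximal_gradient_descent variants above
# (illustrative; Y_flat holds plain class labels, which are one-hot encoded
# internally, and this package's euclidean_projection and
# construct_label_matrix_pan helpers are assumed):
#
#   >>> import numpy as np
#   >>> X = np.random.randn(80, 10)
#   >>> y = np.random.randint(0, 3, size=80)
#   >>> W, obj, gammas = proximal_gradient_descent(X, y, z=0.1, mode="raw")
#   >>> W.shape  # (n_features, n_classes); rows with large l2 norm are important features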
def gini_index(X, y, mode="index"): """ This function implements the gini index feature selection. Input ---------- X: {numpy array}, shape (n_samples, n_features) input data y: {numpy array}, shape (n_samples,) input class labels Output ---------- gini: {numpy array}, shape (n_features, ) gini index value of each feature """ def feature_ranking(W): """ Rank features in descending order according to their gini index values, the smaller the gini index, the more important the feature is """ idx = np.argsort(W) return idx n_samples, n_features = X.shape # initialize gini_index for all features to be 1 gini = np.ones(n_features) # For i-th feature we define fi = x[:,i] ,v include all unique values in fi for i in range(n_features): v = np.unique(X[:, i]) for j in range(len(v)): # left_y contains labels of instances whose i-th feature value is less than or equal to v[j] left_y = y[X[:, i] <= v[j]] # right_y contains labels of instances whose i-th feature value is larger than v[j] right_y = y[X[:, i] > v[j]] # gini_left is sum of square of probability of occurrence of v[i] in left_y # gini_right is sum of square of probability of occurrence of v[i] in right_y gini_left = 0 gini_right = 0 for k in range(np.min(y), np.max(y)+1): if len(left_y) != 0: # t1_left is probability of occurrence of k in left_y t1_left = np.true_divide(len(left_y[left_y == k]), len(left_y)) t2_left = np.power(t1_left, 2) gini_left += t2_left if len(right_y) != 0: # t1_right is probability of occurrence of k in left_y t1_right = np.true_divide(len(right_y[right_y == k]), len(right_y)) t2_right = np.power(t1_right, 2) gini_right += t2_right gini_left = 1 - gini_left gini_right = 1 - gini_right # weighted average of len(left_y) and len(right_y) t1_gini = (len(left_y) * gini_left + len(right_y) * gini_right) # compute the gini_index for the i-th feature value = np.true_divide(t1_gini, len(y)) if value < gini[i]: gini[i] = value F = feature_ranking(gini) if mode=="index": return np.array(F, dtype=int) else: # make sure that F is the same size?? return reverse_argsort(F, size=X.shape[1])
def trace_ratio(X, y, n_selected_features=None, mode='rank', **kwargs):
    """
    This function implements the trace ratio criterion for feature selection.

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels
    n_selected_features: {int}
        number of features to select
    kwargs: {dictionary}
        style: {string}
            style == 'fisher', build the between-class matrix and within-class affinity matrix in a fisher score way
            style == 'laplacian', build the between-class matrix and within-class affinity matrix in a laplacian score way
        verbose: {boolean}
            True if the user wants to print the objective function value in each iteration, False if not

    Output
    ------
    feature_idx: {numpy array}, shape (n_features,)
        the feature indices ranked in descending order by subset-level score
    feature_score: {numpy array}, shape (n_features,)
        the feature-level score
    subset_score: {float}
        the subset-level score

    Reference
    ---------
    Feiping Nie et al. "Trace Ratio Criterion for Feature Selection." AAAI 2008.
    """
    if n_selected_features is None:
        n_selected_features = X.shape[1]
    # if 'style' is not specified, use the fisher score way to build the two affinity matrices
    if 'style' not in kwargs:
        kwargs['style'] = 'fisher'
    # the way to build the affinity matrices, 'fisher' or 'laplacian'
    style = kwargs['style']
    n_samples, n_features = X.shape
    # if 'verbose' is not specified, do not print the objective function value
    if 'verbose' not in kwargs:
        kwargs['verbose'] = False
    verbose = kwargs['verbose']

    if style == 'fisher':
        kwargs_within = {"neighbor_mode": "supervised", "fisher_score": True, 'y': y}
        # build the within-class and between-class laplacian matrices L_w and L_b
        W_within = construct_W(X, **kwargs_within)
        L_within = np.eye(n_samples) - W_within
        L_tmp = np.eye(n_samples) - np.ones([n_samples, n_samples])/n_samples
        L_between = L_within - L_tmp
    if style == 'laplacian':
        kwargs_within = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
        # build the within-class and between-class laplacian matrices L_w and L_b
        W_within = construct_W(X, **kwargs_within)
        D_within = np.diag(np.array(W_within.sum(1))[:, 0])
        L_within = D_within - W_within
        W_between = np.dot(np.dot(D_within, np.ones([n_samples, n_samples])), D_within)/np.sum(D_within)
        D_between = np.diag(np.array(W_between.sum(1)))
        L_between = D_between - W_between

    # build X'*L_within*X and X'*L_between*X
    L_within = (np.transpose(L_within) + L_within)/2
    L_between = (np.transpose(L_between) + L_between)/2
    S_within = np.array(np.dot(np.dot(np.transpose(X), L_within), X))
    S_between = np.array(np.dot(np.dot(np.transpose(X), L_between), X))

    # S_within reflects the within-class or local affinity relationship encoded on the graph, Sw = X'*Lw*X
    S_within = (np.transpose(S_within) + S_within)/2
    # S_between reflects the between-class or global affinity relationship encoded on the graph, Sb = X'*Lb*X
    S_between = (np.transpose(S_between) + S_between)/2

    # take the absolute values of the diagonals
    s_within = np.absolute(S_within.diagonal())
    s_between = np.absolute(S_between.diagonal())
    s_between[s_between == 0] = 1e-14  # this number is from the authors' code

    # preprocessing
    fs_idx = np.argsort(np.divide(s_between, s_within), 0)[::-1]
    k = np.sum(s_between[0:n_selected_features])/np.sum(s_within[0:n_selected_features])
    s_within = s_within[fs_idx[0:n_selected_features]]
    s_between = s_between[fs_idx[0:n_selected_features]]

    # iterate until convergence
    count = 0
    while True:
        score = np.sort(s_between - k*s_within)[::-1]
        I = np.argsort(s_between - k*s_within)[::-1]
        idx = I[0:n_selected_features]
        old_k = k
        k = np.sum(s_between[idx])/np.sum(s_within[idx])
        if verbose:
            print('obj at iter {0}: {1}'.format(count+1, k))
        count += 1
        if abs(k - old_k) < 1e-3:
            break

    # get the feature indices, feature-level score and subset-level score
    feature_idx = fs_idx[I]
    feature_score = score
    subset_score = k

    if mode == 'raw':
        return feature_idx, feature_score, subset_score
    elif mode == 'index':
        return feature_idx
    else:
        return reverse_argsort(feature_idx)
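# A usage sketch for trace_ratio (illustrative; style='fisher' is the default way
# to build the two affinity matrices, and this package's construct_W is assumed):
#
#   >>> import numpy as np
#   >>> X = np.random.randn(60, 8)
#   >>> y = np.random.randint(0, 2, size=60)
#   >>> idx, scores, subset = trace_ratio(X, y, n_selected_features=4, mode='raw', style='fisher')
#   >>> idx  # the 4 features selected by the trace ratio criterion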
def rfs(X, Y_flat, mode='rank', **kwargs):
    """
    This function implements efficient and robust feature selection via joint l2,1-norms minimization:
    min_W ||X W - Y||_2,1 + gamma*||W||_2,1

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    Y_flat: {numpy array}, shape (n_samples,)
        input class labels, converted internally to a one-hot label matrix
    kwargs: {dictionary}
        gamma: {float}
            parameter in RFS
        verbose: {boolean}
            True if the user wants to print the objective function value, False if not

    Output
    ------
    W: {numpy array}, shape (n_features, n_classes)
        feature weight matrix

    Reference
    ---------
    Nie, Feiping et al. "Efficient and Robust Feature Selection via Joint l2,1-Norms Minimization." NIPS 2010.
    """
    def calculate_obj(X, Y, W, gamma):
        """
        This function calculates the objective function of rfs.
        """
        temp = np.dot(X, W) - Y
        return calculate_l21_norm(temp) + gamma*calculate_l21_norm(W)

    # convert Y_flat to a one-hot encoded label matrix
    Y = construct_label_matrix_pan(Y_flat)
    # the default gamma is 1
    gamma = kwargs.get('gamma', 1)
    verbose = kwargs.get('verbose', False)

    n_samples, n_features = X.shape
    A = np.zeros((n_samples, n_samples + n_features))
    A[:, 0:n_features] = X
    A[:, n_features:n_features+n_samples] = gamma*np.eye(n_samples)
    D = np.eye(n_features+n_samples)

    max_iter = 1000
    obj = np.zeros(max_iter)
    for iter_step in range(max_iter):
        # update U as U = D^{-1} A^T (A D^{-1} A^T)^{-1} Y
        D_inv = LA.inv(D)
        temp = LA.inv(np.dot(np.dot(A, D_inv), A.T) + 1e-6*np.eye(n_samples))  # (A D^{-1} A^T)^{-1}
        U = np.dot(np.dot(np.dot(D_inv, A.T), temp), Y)
        # update D as D_ii = 1 / (2 * ||U(i,:)||)
        D = generate_diagonal_matrix(U)

        obj[iter_step] = calculate_obj(X, Y, U[0:n_features, :], gamma)
        if verbose:
            print('obj at iter {0}: {1}'.format(iter_step+1, obj[iter_step]))

        if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3:
            break

    # the first n_features rows of U are the feature weights
    W = U[0:n_features, :]
    if mode == "raw":
        return W
    elif mode == "index":
        return feature_ranking(W)
    elif mode == "rank":
        return reverse_argsort(feature_ranking(W))
def mcfs(X, y=None, n_selected_features=None, mode="rank", **kwargs):
    """
    This function implements unsupervised feature selection for multi-cluster data.

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    n_selected_features: {int}
        number of features to select
    kwargs: {dictionary}
        W: {sparse matrix}, shape (n_samples, n_samples)
            affinity matrix
        n_clusters: {int}
            number of clusters (default is 5)

    Output
    ------
    W: {numpy array}, shape (n_features, n_clusters)
        feature weight matrix

    Reference
    ---------
    Cai, Deng et al. "Unsupervised Feature Selection for Multi-Cluster Data." KDD 2010.
    """
    def feature_ranking(W):
        """
        This function computes the MCFS score and ranks features according to the feature weight matrix W.
        """
        mcfs_score = W.max(1)
        idx = np.argsort(mcfs_score, 0)
        idx = idx[::-1]
        return idx

    if n_selected_features is None:
        n_selected_features = int(X.shape[1])
    # use the default affinity matrix if none is provided
    if 'W' not in kwargs:
        W = construct_W(X)
    else:
        W = kwargs['W']
    # the default number of clusters is 5
    if 'n_clusters' not in kwargs:
        n_clusters = 5
    else:
        n_clusters = kwargs['n_clusters']

    # solve the generalized eigen-decomposition problem and get the top K
    # eigenvectors with respect to the smallest eigenvalues
    W = W.toarray()
    W = (W + W.T) / 2
    W_norm = np.diag(np.sqrt(1 / W.sum(1)))
    W = np.dot(W_norm, np.dot(W, W_norm))
    WT = W.T
    W[W < WT] = WT[W < WT]
    eigen_value, ul = scipy.linalg.eigh(a=W)
    Y = np.dot(W_norm, ul[:, -1*n_clusters-1:-1])

    # solve K L1-regularized regression problems using the LARs algorithm,
    # with the cardinality constraint set to n_selected_features
    n_sample, n_feature = X.shape
    W = np.zeros((n_feature, n_clusters))
    for i in range(n_clusters):
        clf = linear_model.Lars(n_nonzero_coefs=n_selected_features)
        clf.fit(X, Y[:, i])
        W[:, i] = clf.coef_

    if mode == "raw":
        return W
    elif mode == "index":
        return feature_ranking(W)
    elif mode == "rank":
        W_idx = feature_ranking(W)
        return reverse_argsort(W_idx, X.shape[1])
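# A usage sketch for mcfs (illustrative; unsupervised, so y is ignored;
# scikit-learn's linear_model.Lars and this package's construct_W are assumed):
#
#   >>> import numpy as np
#   >>> X = np.random.randn(60, 10)
#   >>> mcfs(X, n_selected_features=5, mode="index", n_clusters=3)  # ranked feature indices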
def cmim(X, y, mode="rank", **kwargs): """ This function implements the CMIM feature selection. The scoring criteria is calculated based on the formula j_cmim=I(f;y)-max_j(I(fj;f)-I(fj;f|y)) Input ----- X: {numpy array}, shape (n_samples, n_features) Input data, guaranteed to be a discrete numpy array y: {numpy array}, shape (n_samples,) guaranteed to be a numpy array kwargs: {dictionary} n_selected_features: {int} number of features to select Output ------ F: {numpy array}, shape (n_features,) index of selected features, F[0] is the most important feature J_CMIM: {numpy array}, shape: (n_features,) corresponding objective function value of selected features MIfy: {numpy array}, shape: (n_features,) corresponding mutual information between selected features and response Reference --------- Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012. """ n_samples, n_features = X.shape # index of selected features, initialized to be empty F = [] # Objective function value for selected features J_CMIM = [] # Mutual information between feature and response MIfy = [] # indicate whether the user specifies the number of features is_n_selected_features_specified = False if 'n_selected_features' in list(kwargs.keys()): n_selected_features = kwargs['n_selected_features'] is_n_selected_features_specified = True # t1 stores I(f;y) for each feature f t1 = np.zeros(n_features) # max stores max(I(fj;f)-I(fj;f|y)) for each feature f # we assign an extreme small value to max[i] ito make it is smaller than possible value of max(I(fj;f)-I(fj;f|y)) max = -10000000*np.ones(n_features) for i in range(n_features): f = X[:, i] t1[i] = midd(f, y) # make sure that j_cmi is positive at the very beginning j_cmim = 1 while True: if len(F) == 0: # select the feature whose mutual information is the largest idx = np.argmax(t1) F.append(idx) J_CMIM.append(t1[idx]) MIfy.append(t1[idx]) f_select = X[:, idx] if is_n_selected_features_specified: if len(F) == n_selected_features: break else: if j_cmim <= 0: break # we assign an extreme small value to j_cmim to ensure it is smaller than all possible values of j_cmim j_cmim = -1000000000000 for i in range(n_features): if i not in F: f = X[:, i] t2 = midd(f_select, f) t3 = cmidd(f_select, f, y) if t2-t3 > max[i]: max[i] = t2-t3 # calculate j_cmim for feature i (not in F) t = t1[i] - max[i] # record the largest j_cmim and the corresponding feature index if t > j_cmim: j_cmim = t idx = i F.append(idx) J_CMIM.append(j_cmim) MIfy.append(t1[idx]) f_select = X[:, idx] if mode=="index": return np.array(F) else: return reverse_argsort(F)
def udfs(X, y=None, mode='rank', **kwargs):
    """
    This function implements l2,1-norm regularized discriminative feature selection for unsupervised learning, i.e.,
    min_W Tr(W^T M W) + gamma*||W||_{2,1}, s.t. W^T W = I

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    kwargs: {dictionary}
        gamma: {float}
            parameter in the objective function of UDFS (default is 0.1)
        n_clusters: {int}
            number of clusters
        k: {int}
            number of nearest neighbors
        verbose: {boolean}
            True if the user wants to print the objective function value, False if not

    Output
    ------
    W: {numpy array}, shape (n_features, n_clusters)
        feature weight matrix

    Reference
    ---------
    Yang, Yi et al. "l2,1-Norm Regularized Discriminative Feature Selection for Unsupervised Learning." AAAI 2012.
    """
    def construct_M(X, k, gamma):
        """
        This function constructs the M matrix described in the paper.
        """
        n_sample, n_feature = X.shape
        Xt = X.T
        D = pairwise_distances(X)
        # sort the distance matrix D in ascending order
        idx = np.argsort(D, axis=1)
        # choose the k nearest neighbors of each instance
        idx_new = idx[:, 0:k+1]
        H = np.eye(k+1) - 1/(k+1) * np.ones((k+1, k+1))
        I = np.eye(k+1)
        Mi = np.zeros((n_sample, n_sample))
        for i in range(n_sample):
            Xi = Xt[:, idx_new[i, :]]
            Xi_tilde = np.dot(Xi, H)
            Bi = np.linalg.inv(np.dot(Xi_tilde.T, Xi_tilde) + gamma*I)
            Si = np.zeros((n_sample, k+1))
            for q in range(k+1):
                # mark the q-th nearest neighbor of instance i in the selection matrix
                Si[idx_new[i, q], q] = 1
            Mi = Mi + np.dot(np.dot(Si, np.dot(np.dot(H, Bi), H)), Si.T)
        M = np.dot(np.dot(X.T, Mi), X)
        return M

    def calculate_obj(X, W, M, gamma):
        """
        This function calculates the objective function of UDFS described in the paper.
        """
        return np.trace(np.dot(np.dot(W.T, M), W)) + gamma*calculate_l21_norm(W)

    # the default gamma is 0.1
    gamma = kwargs.get('gamma', 0.1)
    # the default k is 5
    k = kwargs.get('k', 5)
    # the default number of clusters is 5
    n_clusters = kwargs.get('n_clusters', 5)
    verbose = kwargs.get('verbose', False)

    # construct M
    n_sample, n_feature = X.shape
    M = construct_M(X, k, gamma)

    D = np.eye(n_feature)
    max_iter = 1000
    obj = np.zeros(max_iter)
    for iter_step in range(max_iter):
        # update W as the eigenvectors of P corresponding to the first n_clusters smallest eigenvalues
        P = M + gamma*D
        eigen_value, eigen_vector = scipy.linalg.eigh(a=P)
        W = eigen_vector[:, 0:n_clusters]
        # update D as D_ii = 1 / (2 * ||W(i,:)||)
        D = generate_diagonal_matrix(W)

        obj[iter_step] = calculate_obj(X, W, M, gamma)
        if verbose:
            print('obj at iter {0}: {1}'.format(iter_step+1, obj[iter_step]))

        if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3:
            break

    if mode == 'raw':
        return W
    elif mode == 'index':
        return feature_ranking(W)
    elif mode == 'rank':
        return reverse_argsort(feature_ranking(W))
def ndfs(X, y=None, mode="rank", **kwargs): """ This function implement unsupervised feature selection using nonnegative spectral analysis, i.e., min_{F,W} Tr(F^T L F) + alpha*(||XW-F||_F^2 + beta*||W||_{2,1}) + gamma/2 * ||F^T F - I||_F^2 s.t. F >= 0 Input ----- X: {numpy array}, shape (n_samples, n_features) input data kwargs: {dictionary} W: {sparse matrix}, shape {n_samples, n_samples} affinity matrix alpha: {float} Parameter alpha in objective function beta: {float} Parameter beta in objective function gamma: {float} a very large number used to force F^T F = I F0: {numpy array}, shape (n_samples, n_clusters) initialization of the pseudo label matirx F, if not provided n_clusters: {int} number of clusters verbose: {boolean} True if user want to print out the objective function value in each iteration, false if not Output ------ W: {numpy array}, shape(n_features, n_clusters) feature weight matrix Reference: Li, Zechao, et al. "Unsupervised Feature Selection Using Nonnegative Spectral Analysis." AAAI. 2012. """ def kmeans_initialization(X, n_clusters): """ This function uses kmeans to initialize the pseudo label Input ----- X: {numpy array}, shape (n_samples, n_features) input data n_clusters: {int} number of clusters Output ------ Y: {numpy array}, shape (n_samples, n_clusters) pseudo label matrix """ n_samples, n_features = X.shape kmeans = sklearn.cluster.KMeans(n_clusters=n_clusters, init='k-means++', n_init=10, max_iter=300, tol=0.0001, precompute_distances=True, verbose=0, random_state=None, copy_x=True, n_jobs=1) kmeans.fit(X) labels = kmeans.labels_ Y = np.zeros((n_samples, n_clusters)) for row in range(0, n_samples): Y[row, labels[row]] = 1 T = np.dot(Y.transpose(), Y) F = np.dot(Y, np.sqrt(np.linalg.inv(T))) F = F + 0.02 * np.ones((n_samples, n_clusters)) return F def calculate_obj(X, W, F, L, alpha, beta): """ This function calculates the objective function of NDFS """ # Tr(F^T L F) T1 = np.trace(np.dot(np.dot(F.transpose(), L), F)) T2 = np.linalg.norm(np.dot(X, W) - F, 'fro') T3 = (np.sqrt((W * W).sum(1))).sum() obj = T1 + alpha * (T2 + beta * T3) return obj # default gamma is 10e8 if 'gamma' not in kwargs: gamma = 10e8 else: gamma = kwargs['gamma'] # use the default affinity matrix if 'W' not in kwargs: W = construct_W(X) else: W = kwargs['W'] if 'alpha' not in kwargs: alpha = 1 else: alpha = kwargs['alpha'] if 'beta' not in kwargs: beta = 1 else: beta = kwargs['beta'] if 'F0' not in kwargs: if 'n_clusters' not in kwargs: raise Exception("either F0 or n_clusters should be provided") else: # initialize F n_clusters = kwargs['n_clusters'] F = kmeans_initialization(X, n_clusters) else: F = kwargs['F0'] if 'verbose' not in kwargs: verbose = False else: verbose = kwargs['verbose'] n_samples, n_features = X.shape # initialize D as identity matrix D = np.identity(n_features) I = np.identity(n_samples) # build laplacian matrix L = np.array(W.sum(1))[:, 0] - W max_iter = 1000 obj = np.zeros(max_iter) for iter_step in range(max_iter): # update W T = np.linalg.inv( np.dot(X.transpose(), X) + beta * D + 1e-6 * np.eye(n_features)) W = np.dot(np.dot(T, X.transpose()), F) # update D temp = np.sqrt((W * W).sum(1)) temp[temp < 1e-16] = 1e-16 temp = 0.5 / temp D = np.diag(temp) # update M M = L + alpha * (I - np.dot(np.dot(X, T), X.transpose())) M = (M + M.transpose()) / 2 # update F denominator = np.dot(M, F) + gamma * np.dot(np.dot(F, F.transpose()), F) temp = np.divide(gamma * F, denominator) F = F * np.array(temp) temp = np.diag(np.sqrt(np.diag(1 / (np.dot(F.transpose(), F) + 1e-16)))) F 
= np.dot(F, temp) # calculate objective function obj[iter_step] = np.trace(np.dot(np.dot( F.transpose(), M), F)) + gamma / 4 * np.linalg.norm( np.dot(F.transpose(), F) - np.identity(n_clusters), 'fro') if verbose: print('obj at iter {0}: {1}'.format(iter_step + 1, obj[iter_step])) if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step - 1]) < 1e-3: break F = feature_ranking(W) if mode == "index": return np.array(F, dtype=int) elif mode == "raw": return W else: # make sure that F is the same size?? return reverse_argsort(F, size=X.shape[1])
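# A usage sketch for ndfs (illustrative; unsupervised, so y is ignored; either F0
# or n_clusters must be given, and sklearn.cluster.KMeans plus this package's
# construct_W are assumed):
#
#   >>> import numpy as np
#   >>> X = np.random.randn(50, 8)
#   >>> W = ndfs(X, mode="raw", n_clusters=3)  # (n_features, n_clusters) weight matrix
#   >>> ndfs(X, mode="rank", n_clusters=3)     # per-feature ranks instead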
def reliefF(self, X, y, **kwargs):
    """
    This function implements the reliefF feature selection.

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels
    kwargs: {dictionary}
        parameters of reliefF:
        k: {int}
            the number of neighbors (default k = 5)

    Output
    ------
    score: {numpy array}, shape (n_features,)
        reliefF score of each feature

    Reference
    ---------
    Robnik-Sikonja, Marko et al. "Theoretical and empirical analysis of relieff and rrelieff." Machine Learning 2003.
    Zhao, Zheng et al. "On Similarity Preserving Feature Selection." TKDE 2013.
    """
    k = kwargs.get("k", 5)
    n_samples, n_features = X.shape

    # calculate pairwise distances between instances
    distance = pairwise_distances(X, metric='manhattan')

    # accumulate the score across all sampled instances
    score = np.zeros(n_features)

    # the number of sampled instances is equal to the number of total instances
    for idx in range(n_samples):
        near_hit = []
        near_miss = dict()

        self_fea = X[idx, :]
        c = np.unique(y).tolist()

        stop_dict = dict()
        for label in c:
            stop_dict[label] = 0
        del c[c.index(y[idx])]

        p_dict = dict()
        p_label_idx = float(len(y[y == y[idx]]))/float(n_samples)

        for label in c:
            p_label_c = float(len(y[y == label]))/float(n_samples)
            p_dict[label] = p_label_c/(1-p_label_idx)
            near_miss[label] = []

        distance_sort = []
        distance[idx, idx] = np.max(distance[idx, :])

        for i in range(n_samples):
            distance_sort.append([distance[idx, i], int(i), y[i]])
        distance_sort.sort(key=lambda x: x[0])

        for i in range(n_samples):
            # find the k nearest hit points
            if distance_sort[i][2] == y[idx]:
                if len(near_hit) < k:
                    near_hit.append(distance_sort[i][1])
                elif len(near_hit) == k:
                    stop_dict[y[idx]] = 1
            else:
                # find the k nearest miss points for each label
                if len(near_miss[distance_sort[i][2]]) < k:
                    near_miss[distance_sort[i][2]].append(distance_sort[i][1])
                else:
                    if len(near_miss[distance_sort[i][2]]) == k:
                        stop_dict[distance_sort[i][2]] = 1
            stop = True
            for (key, value) in list(stop_dict.items()):
                if value != 1:
                    stop = False
            if stop:
                break

        # update the reliefF score
        near_hit_term = np.zeros(n_features)
        for ele in near_hit:
            near_hit_term = np.array(abs(self_fea - X[ele, :])) + np.array(near_hit_term)

        near_miss_term = dict()
        for (label, miss_list) in list(near_miss.items()):
            near_miss_term[label] = np.zeros(n_features)
            for ele in miss_list:
                near_miss_term[label] = np.array(abs(self_fea - X[ele, :])) + np.array(near_miss_term[label])
            score += near_miss_term[label]/(k*p_dict[label])
        score -= near_hit_term/k

    self.scoreList.append(score)
    self.feature_ranking()

    if self.mode == 'raw':
        return score
    elif self.mode == 'index':
        return feature_ranking(score)
    elif self.mode == 'rank':
        return reverse_argsort(feature_ranking(score), X.shape[1])
def reliefF(X, y, dist_params, mode="rank", **kwargs):
    """
    This function implements the reliefF feature selection.

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels
    dist_params: {dictionary}
        keyword arguments forwarded to the pairwise distance computation
    kwargs: {dictionary}
        parameters of reliefF:
        k: {int}
            the number of neighbors (default k = 5)

    Output
    ------
    score: {numpy array}, shape (n_features,)
        reliefF score of each feature

    Reference
    ---------
    Robnik-Sikonja, Marko et al. "Theoretical and empirical analysis of relieff and rrelieff." Machine Learning 2003.
    Zhao, Zheng et al. "On Similarity Preserving Feature Selection." TKDE 2013.
    """
    def feature_ranking(score):
        """
        Rank features in descending order according to reliefF score; the higher the
        reliefF score, the more important the feature is.
        """
        idx = np.argsort(score, 0)
        return idx[::-1]

    def per_feature_distance(a, b):
        """
        Per-feature distance between two instances: a 0/1 mismatch indicator for
        string (categorical) features, the absolute difference otherwise.
        """
        dist = np.zeros(len(a))
        for i in range(len(a)):
            if isinstance(a[i], str):
                dist[i] = 0 if a[i] == b[i] else 1
            else:
                dist[i] = abs(a[i] - b[i])
        return dist

    k = kwargs.get("k", 5)
    n_samples, n_features = X.shape

    # calculate pairwise distances between instances
    distance = cdist(X, X, metric=partial_distance, **dist_params)
    distance = np.clip(distance, 0, X.shape[1])
    X = np.nan_to_num(X)
    score = np.zeros(n_features)

    # the number of sampled instances is equal to the number of total instances
    for idx in range(n_samples):
        near_hit = []
        near_miss = dict()

        self_fea = X[idx, :]
        c = np.unique(y).tolist()

        stop_dict = dict()
        for label in c:
            stop_dict[label] = 0
        del c[c.index(y[idx])]

        p_dict = dict()
        p_label_idx = float(len(y[y == y[idx]])) / float(n_samples)

        for label in c:
            p_label_c = float(len(y[y == label])) / float(n_samples)
            p_dict[label] = p_label_c / (1 - p_label_idx)
            near_miss[label] = []

        distance_sort = []
        distance[idx, idx] = np.max(distance[idx, :])

        for i in range(n_samples):
            distance_sort.append([distance[idx, i], int(i), y[i]])
        distance_sort.sort(key=lambda x: x[0])

        for i in range(n_samples):
            # find the k nearest hit points
            if distance_sort[i][2] == y[idx]:
                if len(near_hit) < k:
                    near_hit.append(distance_sort[i][1])
                elif len(near_hit) == k:
                    stop_dict[y[idx]] = 1
            else:
                # find the k nearest miss points for each label
                if len(near_miss[distance_sort[i][2]]) < k:
                    near_miss[distance_sort[i][2]].append(distance_sort[i][1])
                else:
                    if len(near_miss[distance_sort[i][2]]) == k:
                        stop_dict[distance_sort[i][2]] = 1
            stop = True
            for (key, value) in list(stop_dict.items()):
                if value != 1:
                    stop = False
            if stop:
                break

        # update the reliefF score
        near_hit_term = np.zeros(n_features)
        for ele in near_hit:
            near_hit_term += per_feature_distance(self_fea, X[ele, :])

        near_miss_term = dict()
        for (label, miss_list) in list(near_miss.items()):
            near_miss_term[label] = np.zeros(n_features)
            for ele in miss_list:
                near_miss_term[label] = per_feature_distance(self_fea, X[ele, :]) + np.array(near_miss_term[label])
            score += near_miss_term[label] / (k * p_dict[label])
        score -= near_hit_term / k

    if mode == 'raw':
        return score
    elif mode == 'index':
        return feature_ranking(score)
    elif mode == 'rank':
        return reverse_argsort(feature_ranking(score), X.shape[1])
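# A usage sketch for the functional reliefF above (illustrative; it assumes this
# module's partial_distance metric and reverse_argsort helper plus scipy's cdist
# are importable, and that dist_params may simply be empty):
#
#   >>> import numpy as np
#   >>> X = np.random.randn(40, 5)
#   >>> y = np.random.randint(0, 2, size=40)
#   >>> reliefF(X, y, dist_params={}, mode="raw", k=5)  # reliefF score per feature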