Example no. 1

# Imports needed by this snippet: construct_W is the local helper module
# used throughout these examples; the skfeature paths match Example no. 7.
# unsupervised_evaluation is assumed to come from skfeature.utility (or a
# local variant whose evaluation() also returns ARI).
import scipy.io
from construct_W import construct_W
from skfeature.function.sparse_learning_based import MCFS
from skfeature.utility import unsupervised_evaluation

def main():
    # load matlab data
    print 'Loading Data !'
    mat = scipy.io.loadmat('../data/COIL20.mat')
    print 'Data Loaded !'
    X = mat['X']
    X = X.astype(float)
    y = mat['Y']
    y = y[:, 0]

    # construct W
    # note: the keys must be snake_case ("neighbor_mode", "weight_mode"),
    # as in the other examples; camelCase keys would be silently ignored
    # by construct_W and replaced by its defaults
    kwargs = {
        "metric": "euclidean",
        "neighbor_mode": "knn",
        "weight_mode": "heat_kernel",
        "k": 5,
        "t": 0.1
    }
    W = construct_W(X, **kwargs)

    # mcfs feature selection
    n_selected_features = 100
    print 'Training Model !'
    S = MCFS.mcfs(X, n_selected_features, W=W, n_clusters=20)
    print 'Model Trained !'
    idx = MCFS.feature_ranking(S)

    # evaluation
    X_selected = X[:, idx[0:n_selected_features]]
    ari, nmi, acc = unsupervised_evaluation.evaluation(X_selected=X_selected,
                                                       n_clusters=20,
                                                       y=y)
    # print 'ARI:', ari
    # print 'NMI:', nmi
    print 'Accuracy:', round(acc * 100.0, 2), '%'
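
# The construct_W call above builds a kNN affinity matrix; with
# weight_mode 'heat_kernel' each retained edge is weighted by a Gaussian
# of the pairwise distance, conventionally exp(-||xi - xj||^2 / (2*t^2)).
# A minimal dense sketch of that weighting (an illustration only:
# construct_W itself keeps just the k nearest neighbours and returns a
# sparse matrix):
def heat_kernel_affinity(X, t=0.1):
    import numpy as np
    from sklearn.metrics.pairwise import pairwise_distances
    # squared Euclidean distances between all sample pairs
    D2 = pairwise_distances(X, metric='euclidean') ** 2
    return np.exp(-D2 / (2 * t * t))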
Example no. 2

# Imports needed by this snippet; CW and FS are assumed aliases for the
# graph-construction and Fisher-score modules used in main() below.
import numpy as np
import scipy.io
import construct_W
import construct_W as CW
from skfeature.function.similarity_based import fisher_score as FS
from sklearn import cross_validation
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

def reliefF(X, y):
    """
    This function implements reliefF feature selection; the steps are as follows:
    1. Construct the affinity matrix W in the reliefF way
    2. For the r-th feature f_r = X(:, r), the reliefF score is f_r' * W * f_r - 1

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels

    Output
    ------
    score: {numpy array}, shape (n_features,)
        reliefF score for each feature

    Reference
    ---------
    Zhao, Zheng et al. "On Similarity Preserving Feature Selection." TKDE 2013.
    """

    # construct the affinity matrix W
    kwargs = {"neighbor_mode": "supervised", "reliefF": True, 'y': y}
    W = construct_W.construct_W(X, **kwargs)
    n_samples, n_features = X.shape
    score = np.zeros(n_features)
    for i in range(n_features):
        score[i] = -1 + np.dot(np.transpose(X[:, i]), W.dot(X[:, i]))
    return score
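
# The loop above computes score_r = f_r' * W * f_r - 1 one feature at a
# time; the same scores can be obtained in one shot (a sketch, assuming W
# is a scipy sparse matrix as returned by construct_W):
def reliefF_vectorized(X, W):
    WX = np.asarray(W.dot(X))  # shape (n_samples, n_features)
    # column-wise inner products <X[:, r], WX[:, r]> give f_r' * W * f_r
    return np.einsum('ij,ij->j', np.asarray(X), WX) - 1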
def main():
    # load matlab data
    print '-----------------------------------------'
    print 'Loading \'COIL20\' Data !'
    mat = scipy.io.loadmat('../data/COIL20.mat')
    print 'Data Loaded !'
    print '-----------------------------------------'
    X = mat['X']  # data
    y = mat['Y']  # label
    y = y[:, 0]
    X = X.astype(float)
    n_samples, n_features = X.shape

    # split data
    print 'Splitting data into 10 folds !'
    ss = cross_validation.KFold(n_samples, n_folds=10, shuffle=True)
    print 'Data Split !'
    print '-----------------------------------------'

    # evaluation
    num_fea = 100
    print 'Initializing KNN !'
    neigh = KNeighborsClassifier(n_neighbors=10)
    print 'KNN Initialized !'
    print '-----------------------------------------'
    correct = 0

    fold_no = 0
    for train, test in ss:
        print '\tFold No.', fold_no
        kwargs = {
            "neighbor_mode": "supervised",
            "fisher_score": True,
            'y': y[train]
        }
        print 'Constructing Affinity Matrix !'
        # W = construct_W.construct_W(X[train], **kwargs)
        W = CW.construct_W(X[train], **kwargs)
        print 'Affinity Matrix Constructed !'

        print 'Calculating Fisher score and ranking...'
        # score = fisher_score.fisher_score(X[train], y[train])
        score = FS.fisher_score(X[train], y[train])
        # idx = fisher_score.feature_ranking(score)
        idx = FS.feature_ranking(score)
        print 'Fisher score and ranking calculated !'

        selected_features = X[:, idx[0:num_fea]]
        neigh.fit(selected_features[train], y[train])
        y_predict = neigh.predict(selected_features[test])
        acc = accuracy_score(y[test], y_predict)
        print acc
        correct = correct + acc
        fold_no += 1
        print '-----------------------------------------'

    print '10 fold Cross - Validation Accuracy:', round(
        (float(correct) / 10) * 100.0, 2), '%'
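
# Note: sklearn.cross_validation was removed in scikit-learn 0.20; on
# current versions the equivalent split would use model_selection
# (a sketch, not part of the original Python 2 snippet):
#
#     from sklearn.model_selection import KFold
#     kf = KFold(n_splits=10, shuffle=True)
#     for train, test in kf.split(X):
#         ...  # same fold body as above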
Example no. 4

# Imports needed by the three functions below.
import numpy as np
import scipy.linalg
from scipy.sparse import diags
from sklearn import linear_model
from construct_W import construct_W

def mcfs(X, n_selected_features, **kwargs):
    """
    This function implements unsupervised feature selection for multi-cluster data.

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    n_selected_features: {int}
        number of features to select
    kwargs: {dictionary}
        W: {sparse matrix}, shape (n_samples, n_samples)
            affinity matrix
        n_clusters: {int}
            number of clusters (default is 5)

    Output
    ------
    W: {numpy array}, shape(n_features, n_clusters)
        feature weight matrix

    Reference
    ---------
    Cai, Deng et al. "Unsupervised Feature Selection for Multi-Cluster Data." KDD 2010.
    """

    # use the default affinity matrix
    if 'W' not in kwargs:
        W = construct_W(X)
    else:
        W = kwargs['W']
    # default number of clusters is 5
    if 'n_clusters' not in kwargs:
        n_clusters = 5
    else:
        n_clusters = kwargs['n_clusters']

    # solve the generalized eigen-decomposition problem and get the top K
    # eigen-vectors with respect to the smallest eigenvalues
    W = W.toarray()
    W = (W + W.T) / 2
    W_norm = np.diag(np.sqrt(1 / W.sum(1)))
    W = np.dot(W_norm, np.dot(W, W_norm))
    WT = W.T
    W[W < WT] = WT[W < WT]
    eigen_value, ul = scipy.linalg.eigh(a=W)
    Y = np.dot(W_norm, ul[:, -1*n_clusters-1:-1])

    # solve K L1-regularized regression problem using LARs algorithm with cardinality constraint being d
    n_sample, n_feature = X.shape
    W = np.zeros((n_feature, n_clusters))
    for i in range(n_clusters):
        clf = linear_model.Lars(n_nonzero_coefs=n_selected_features)
        clf.fit(X, Y[:, i])
        W[:, i] = clf.coef_
    return W
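
# The weight matrix returned by mcfs is consumed by a companion ranking
# step: score each feature by its largest coefficient across the K
# regressions and sort in descending order (a sketch of the convention
# behind MCFS.feature_ranking used in Example no. 1):
def feature_ranking(W):
    mcfs_score = W.max(1)                 # per-feature score
    return np.argsort(mcfs_score)[::-1]   # best features first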
def fisher_score(X, y):
    import construct_W
    # Construct weight matrix W in a fisherScore way
    kwargs = {"neighbor_mode": "supervised", "fisher_score": True, 'y': y}
    W = construct_W.construct_W(X, **kwargs)
    # build the diagonal D matrix from affinity matrix W
    D = np.array(W.sum(axis=1))
    L = W
    tmp = np.dot(np.transpose(D), X)
    D = diags(np.transpose(D), [0])
    Xt = np.transpose(X)
    t1 = np.transpose(np.dot(Xt, D.todense()))
    t2 = np.transpose(np.dot(Xt, L.todense()))
    # compute the numerator of Lr
    D_prime = np.sum(np.multiply(t1, X), 0) - np.multiply(tmp, tmp) / D.sum()
    # compute the denominator of Lr
    L_prime = np.sum(np.multiply(t2, X), 0) - np.multiply(tmp, tmp) / D.sum()
    # guard against a zero denominator in the Laplacian score ratio
    D_prime[D_prime < 1e-12] = 10000
    lap_score = 1 - np.array(np.multiply(L_prime, 1 / D_prime))[0, :]
    # compute fisher score from laplacian score, where fisher_score = 1/lap_score - 1
    score = 1.0 / lap_score - 1
    return np.transpose(score)
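
# A toy sanity check on hypothetical data: only feature 0 separates the
# two classes, so it should receive the highest Fisher score.
def _fisher_score_demo():
    X_demo = np.vstack([np.random.randn(10, 3) + [5.0, 0.0, 0.0],
                        np.random.randn(10, 3)])
    y_demo = np.array([0] * 10 + [1] * 10)
    print fisher_score(X_demo, y_demo)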
def trace_ratio(X, y, n_selected_features, **kwargs):
    import construct_W
    # if 'style' is not specified, use the fisher-score way to build the two affinity matrices
    if 'style' not in kwargs.keys():
        kwargs['style'] = 'fisher'
    # get the way to build affinity matrix, 'fisher' or 'laplacian'
    style = kwargs['style']
    n_samples, n_features = X.shape

    # if 'verbose' is not specified, do not output the value of objective function
    if 'verbose' not in kwargs:
        kwargs['verbose'] = False
    verbose = kwargs['verbose']

    if style == 'fisher':
        kwargs_within = {
            "neighbor_mode": "supervised",
            "fisher_score": True,
            'y': y
        }
        # build within class and between class laplacian matrix L_w and L_b
        W_within = construct_W.construct_W(X, **kwargs_within)
        L_within = np.eye(n_samples) - W_within
        L_tmp = np.eye(n_samples) - np.ones([n_samples, n_samples]) / n_samples
        L_between = L_within - L_tmp

    if style == 'laplacian':
        kwargs_within = {
            "metric": "euclidean",
            "neighbor_mode": "knn",
            "weight_mode": "heat_kernel",
            "k": 5,
            't': 1
        }
        # build within class and between class laplacian matrix L_w and L_b
        W_within = construct_W.construct_W(X, **kwargs_within)
        D_within = np.diag(np.array(W_within.sum(1))[:, 0])
        L_within = D_within - W_within
        W_between = np.dot(np.dot(D_within, np.ones([n_samples, n_samples])),
                           D_within) / np.sum(D_within)
        D_between = np.diag(np.array(W_between.sum(1)))
        L_between = D_between - W_between

    # build X'*L_within*X and X'*L_between*X
    L_within = (np.transpose(L_within) + L_within) / 2
    L_between = (np.transpose(L_between) + L_between) / 2
    S_within = np.array(np.dot(np.dot(np.transpose(X), L_within), X))
    S_between = np.array(np.dot(np.dot(np.transpose(X), L_between), X))

    # reflect the within-class or local affinity relationship encoded on graph, Sw = X*Lw*X'
    S_within = (np.transpose(S_within) + S_within) / 2
    # reflect the between-class or global affinity relationship encoded on graph, Sb = X*Lb*X'
    S_between = (np.transpose(S_between) + S_between) / 2

    # take the absolute values of diagonal
    s_within = np.absolute(S_within.diagonal())
    s_between = np.absolute(S_between.diagonal())
    s_between[s_between == 0] = 1e-14  # this value is from the authors' code

    # preprocessing
    fs_idx = np.argsort(np.divide(s_between, s_within), 0)[::-1]
    k = np.sum(s_between[0:n_selected_features]) / np.sum(
        s_within[0:n_selected_features])
    s_within = s_within[fs_idx[0:n_selected_features]]
    s_between = s_between[fs_idx[0:n_selected_features]]

    # iterate until convergence
    count = 0
    while True:
        score = np.sort(s_between - k * s_within)[::-1]
        I = np.argsort(s_between - k * s_within)[::-1]
        idx = I[0:n_selected_features]
        old_k = k
        k = np.sum(s_between[idx]) / np.sum(s_within[idx])
        if verbose:
            print('obj at iter {0}: {1}'.format(count + 1, k))
        count += 1
        if abs(k - old_k) < 1e-3:
            break

    # get feature index, feature-level score and subset-level score
    feature_idx = fs_idx[I]
    feature_score = score
    subset_score = k
    return feature_idx, feature_score, subset_score
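
# A minimal call on hypothetical data; the selected subset is simply the
# first n_selected_features entries of feature_idx:
def _trace_ratio_demo():
    X_demo = np.random.rand(50, 20)
    y_demo = np.repeat(np.arange(5), 10)
    idx, f_score, s_score = trace_ratio(X_demo, y_demo, 5,
                                        style='fisher', verbose=True)
    print X_demo[:, idx[0:5]].shape  # (50, 5)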
Example no. 7
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

import numpy as np
import scipy.io
import construct_W
from skfeature.function.sparse_learning_based.MCFS import mcfs

mat = scipy.io.loadmat("COIL20.mat")
X = mat['X']
kwargs_W = {
    "metric": "euclidean",
    "neighbor_mode": "knn",
    "weight_mode": "heat_kernel",
    "k": 5,
    "t": 1
}
W = construct_W.construct_W(X, **kwargs_W)
print W
weightMat = mcfs(X, 10, W=W, n_clusters=20)
print weightMat
print weightMat.shape
np.savetxt("a.txt", weightMat, fmt='%.5f')
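
# To turn the saved weight matrix into an actual feature subset, rank the
# rows of weightMat by their largest MCFS weight across clusters (the
# same convention as feature_ranking above; a sketch):
idx = np.argsort(weightMat.max(1))[::-1]
X_selected = X[:, idx[0:10]]  # the 10 requested features
print X_selected.shape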