Code example #1
def cfs(X, y, mode="index", **kwargs):
    """
    This function uses a correlation-based heuristic, known as CFS, to evaluate the worth of features

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels

    Output
    ------
    F: {numpy array}
        index of selected features

    Reference
    ---------
    Zhao, Zheng et al. "Advancing Feature Selection Research - ASU Feature Selection Repository" 2010.
    """

    if 'n_selected_features' in kwargs:
        n_selected_features = kwargs['n_selected_features']
    else:
        n_selected_features = 0
    n_samples, n_features = X.shape
    F = []
    # M stores the merit values
    M = []
    while True:
        merit = -100000000000
        idx = -1
        for i in range(n_features):
            if i not in F:
                F.append(i)
                # calculate the merit of current selected features
                try:
                    t = merit_calculation(X[:, F], y)
                except ZeroDivisionError:
                    t = -10000000000
                if t > merit:
                    merit = t
                    idx = i
                F.pop()
        F.append(idx)
        M.append(merit)
        # stop after n_selected_features picks, or after all features when no count was given
        if n_selected_features != 0 and len(M) == n_selected_features:
            break
        elif n_selected_features == 0 and len(M) == n_features:
            break

    if mode == "index":
        return np.array(F)
    else:
        return reverse_argsort(F, X.shape[1])
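Every wrapper in these listings returns either the selected feature indices (mode="index") or a per-feature rank via reverse_argsort, which is imported elsewhere in the repository and never shown here. The sketch below is a hypothetical stand-in for the behaviour the callers appear to assume (an importance-ordered index list mapped to one rank per feature); it is not the repository's actual implementation.

import numpy as np

def reverse_argsort_sketch(order, size=None):
    # Hypothetical helper: given feature indices ordered from most to least
    # important, return an array whose entry j is the rank of feature j.
    if size is None:
        size = len(order)
    ranks = np.full(size, len(order))  # features never selected share the worst rank
    for rank, feature in enumerate(order):
        ranks[feature] = rank
    return ranks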
Code example #2
def t_score(X, y, mode='rank'):
    """
    This function calculates t_score for each feature, where t_score is only defined for binary problems
    t_score = |mean1 - mean2| / sqrt((std1^2 / n1) + (std2^2 / n2))

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels

    Output
    ------
    F: {numpy array}, shape (n_features,)
        t-score for each feature
    """
    def feature_ranking(F):
        """
        Rank features in descending order according to t-score, the higher the t-score, the more important the feature is
        """
        idx = np.argsort(F)
        return idx[::-1]

    n_samples, n_features = X.shape
    F = np.zeros(n_features)
    c = np.unique(y)
    if len(c) == 2:
        for i in range(n_features):
            f = X[:, i]
            # class0 contains instances belonging to the first class
            # class1 contains instances belonging to the second class
            class0 = f[y == c[0]]
            class1 = f[y == c[1]]
            mean0 = np.mean(class0)
            mean1 = np.mean(class1)
            std0 = np.std(class0)
            std1 = np.std(class1)
            n0 = len(class0)
            n1 = len(class1)
            t = mean0 - mean1
            t0 = np.true_divide(std0**2, n0)
            t1 = np.true_divide(std1**2, n1)
            F[i] = np.true_divide(t, (t0 + t1)**0.5)
    else:
        raise ValueError('y should be guaranteed to be a binary class vector')
    if mode == "index":
        return np.array(np.abs(F))
    elif mode == 'feature_ranking':
        return feature_ranking(np.array(np.abs(F)))
    else:
        return reverse_argsort(feature_ranking(np.array(np.abs(F))),
                               X.shape[1])
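A small worked check of the t-score formula above, using the t_score function from this listing on synthetic data (mode="index" returns the absolute t-scores directly, so reverse_argsort is not needed):

import numpy as np

# Feature 0 separates the two classes, feature 1 does not.
X = np.array([[1.0, 5.0],
              [1.2, 4.0],
              [0.9, 6.0],
              [3.1, 5.5],
              [3.0, 4.5],
              [2.9, 5.0]])
y = np.array([0, 0, 0, 1, 1, 1])
scores = t_score(X, y, mode="index")  # |mean1-mean2|/sqrt(std1^2/n1 + std2^2/n2) per feature
# scores[0] is large, scores[1] is close to 0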
Code example #3
def decision_tree_backward(X, y, mode="rank", n_selected_features=None):
    """
    This function implements the backward feature selection algorithm based on decision tree

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels
    n_selected_features : {int}
        number of selected features

    Output
    ------
    F: {numpy array}, shape (n_features, )
        index of selected features
    """

    n_samples, n_features = X.shape
    if n_selected_features is None:
        n_selected_features = n_features
    # using 10 fold cross validation
    kfold = KFold(n_splits=10, shuffle=True)
    # choose decision tree as the classifier
    clf = DecisionTreeClassifier()

    # selected feature set, initialized to contain all features
    F = list(range(n_features))
    count = n_features

    while count > n_selected_features:
        max_acc = 0
        for i in range(n_features):
            if i in F:
                F.remove(i)
                X_tmp = X[:, F]
                results = cross_val_score(clf, X_tmp, y, cv=kfold)
                acc = results.mean()
                F.append(i)
                # record the feature which results in the largest accuracy
                if acc > max_acc:
                    max_acc = acc
                    idx = i
        # delete the feature which results in the largest accuracy
        F.remove(idx)
        count -= 1
    if mode == "index":
        return np.array(F)
    else:
        return reverse_argsort(F, X.shape[1])
Code example #4
def svm_forward(X, y, mode="rank", n_selected_features=None):
    """
    This function implements the forward feature selection algorithm based on SVM

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels
    n_selected_features: {int}
        number of selected features

    Output
    ------
    F: {numpy array}, shape (n_features, )
        index of selected features
    """

    n_samples, n_features = X.shape
    if n_selected_features is None:
        n_selected_features = n_features
    # using 10 fold cross validation
    kfold = KFold(n_splits=10, shuffle=True)
    # choose SVM as the classifier
    clf = SVC()

    # selected feature set, initialized to be empty
    F = []
    count = 0
    while count < n_selected_features-1:
        max_acc = 0
        for i in range(n_features):
            if i not in F:
                F.append(i)
                X_tmp = X[:, F]
                results = cross_val_score(clf, X_tmp, y, cv=kfold)
                acc = results.mean()     
                F.pop()
                # record the feature which results in the largest accuracy
                if acc > max_acc:
                    max_acc = acc
                    idx = i
        # add the feature which results in the largest accuracy
        F.append(idx)
        count += 1
    if mode == "index":
        return np.array(F)
    else:
        return reverse_argsort(F, X.shape[1])
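The two wrapper selectors above (decision_tree_backward and svm_forward) rely on scikit-learn's KFold, cross_val_score, DecisionTreeClassifier and SVC. A usage sketch with those imports on a small synthetic problem (mode="rank" additionally needs the repository's reverse_argsort helper); note that wrapper methods refit the classifier many times, so they get slow on wide data:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold, cross_val_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

X, y = make_classification(n_samples=100, n_features=10, n_informative=3,
                           random_state=0)
# backward elimination down to 4 features with a decision tree
kept = decision_tree_backward(X, y, mode="index", n_selected_features=4)
# greedy forward selection driven by cross-validated SVM accuracy
ranks = svm_forward(X, y, n_selected_features=4)  # default mode="rank": one rank per feature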
Code example #5
def cfs(X, y, mode="rank"):
    """
    This function uses a correlation-based heuristic, known as CFS, to evaluate the worth of features

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels

    Output
    ------
    F: {numpy array}
        index of selected features

    Reference
    ---------
    Zhao, Zheng et al. "Advancing Feature Selection Research - ASU Feature Selection Repository" 2010.
    """

    n_samples, n_features = X.shape
    F = []
    # M stores the merit values
    M = []
    while True:
        merit = -100000000000
        idx = -1
        for i in range(n_features):
            if i not in F:
                F.append(i)
                # calculate the merit of current selected features
                t = merit_calculation(X[:, F], y)
                if t > merit:
                    merit = t
                    idx = i
                F.pop()
        F.append(idx)
        M.append(merit)
        # stop once the merit has not improved over the last five evaluations
        if len(M) > 5 and all(M[-j] <= M[-j - 1] for j in range(1, 5)):
            break
    
    if mode == "index":
        return np.array(F)
    else:
        return reverse_argsort(F, X.shape[1])
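Both CFS variants above delegate to a merit_calculation helper that is not shown in these listings. In CFS the merit of a k-feature subset is merit = k*rcf / sqrt(k + k*(k-1)*rff), where rcf is the average feature-class correlation and rff the average feature-feature correlation, both measured with symmetrical uncertainty in scikit-feature. A sketch under that assumption (su_calculation is assumed to return the symmetrical uncertainty of two discrete variables):

import numpy as np

def merit_calculation_sketch(X, y):
    # CFS merit of the feature subset contained in X, assuming su_calculation(a, b) exists.
    n_samples, n_features = X.shape
    rcf = np.mean([su_calculation(X[:, i], y) for i in range(n_features)])
    if n_features == 1:
        rff = 0.0
    else:
        rff = np.mean([su_calculation(X[:, i], X[:, j])
                       for i in range(n_features)
                       for j in range(i + 1, n_features)])
    k = n_features
    return k * rcf / np.sqrt(k + k * (k - 1) * rff)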
Code example #6
File: MIFS.py Project: EricSchles/scikit-feature-1
def mifs(X, y, mode="rank", **kwargs):
    """
    This function implements the MIFS feature selection

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data, guaranteed to be discrete
    y: {numpy array}, shape (n_samples,)
        input class labels
    kwargs: {dictionary}
        n_selected_features: {int}
            number of features to select

    Output
    ------
    F: {numpy array}, shape (n_features,)
        index of selected features, F[0] is the most important feature
    J_CMI: {numpy array}, shape: (n_features,)
        corresponding objective function value of selected features
    MIfy: {numpy array}, shape: (n_features,)
        corresponding mutual information between selected features and response

    Reference
    ---------
    Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012.
    """

    if 'beta' not in kwargs:
        beta = 0.5
    else:
        beta = kwargs['beta']
    if 'n_selected_features' in kwargs:
        n_selected_features = kwargs['n_selected_features']
        F, J_CMI, MIfy = LCSI.lcsi(X,
                                   y,
                                   beta=beta,
                                   gamma=0,
                                   n_selected_features=n_selected_features)
    else:
        F, J_CMI, MIfy = LCSI.lcsi(X, y, beta=beta, gamma=0)

    if mode == "index":
        return np.array(F, dtype=int)
    else:
        # expand F into a ranking over all X.shape[1] features
        return reverse_argsort(F, size=X.shape[1])
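MIFS, like the other mutual-information selectors below, expects a discrete data matrix. A usage sketch, assuming the LCSI helper imported by the module is available:

import numpy as np

rng = np.random.RandomState(0)
X = rng.randint(0, 3, size=(80, 6))      # discrete features
y = (X[:, 0] + X[:, 1] > 2).astype(int)  # labels driven by features 0 and 1
F = mifs(X, y, mode="index", n_selected_features=2)
# F holds the indices of the two selected features, most important first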
Code example #7
def icap(X, y, mode="rank", **kwargs):
    """
    This function implements the ICAP feature selection.
    The scoring criterion is calculated based on the formula j_icap = I(f;y) - max_j(0, I(fj;f) - I(fj;f|y))

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data, guaranteed to be a discrete data matrix
    y: {numpy array}, shape (n_samples,)
        input class labels
    kwargs: {dictionary}
        n_selected_features: {int}
            number of features to select

    Output
    ------
    F: {numpy array}, shape (n_features,)
        index of selected features, F[0] is the most important feature
    J_ICAP: {numpy array}, shape: (n_features,)
        corresponding objective function value of selected features
    MIfy: {numpy array}, shape: (n_features,)
        corresponding mutual information between selected features and response
    """
    n_samples, n_features = X.shape
    # index of selected features, initialized to be empty
    F = []
    # Objective function value for selected features
    J_ICAP = []
    # Mutual information between feature and response
    MIfy = []
    # indicate whether the user specifies the number of features
    is_n_selected_features_specified = False
    if 'n_selected_features' in kwargs:
        n_selected_features = kwargs['n_selected_features']
        is_n_selected_features_specified = True

    # t1 contains I(f;y) for each feature f
    t1 = np.zeros(n_features)
    # max_term[i] stores max_j(0, I(fj;fi) - I(fj;fi|y)) for feature fi (renamed to avoid shadowing built-in max)
    max_term = np.zeros(n_features)
    for i in range(n_features):
        f = X[:, i]
        t1[i] = midd(f, y)

    # make sure that j_cmi is positive at the very beginning
    j_icap = 1

    while True:
        if len(F) == 0:
            # select the feature whose mutual information is the largest
            idx = np.argmax(t1)
            F.append(idx)
            J_ICAP.append(t1[idx])
            MIfy.append(t1[idx])
            f_select = X[:, idx]

        if is_n_selected_features_specified:
            if len(F) == n_selected_features:
                break
        else:
            if j_icap <= 0:
                break

        # we assign an extreme small value to j_icap to ensure it is smaller than all possible values of j_icap
        j_icap = -1000000000000
        for i in range(n_features):
            if i not in F:
                f = X[:, i]
                t2 = midd(f_select, f)
                t3 = cmidd(f_select, f, y)
                if t2-t3 > max_term[i]:
                    max_term[i] = t2-t3
                # calculate j_icap for feature i (not in F)
                t = t1[i] - max_term[i]
                # record the largest j_icap and the corresponding feature index
                if t > j_icap:
                    j_icap = t
                    idx = i
        F.append(idx)
        J_ICAP.append(j_icap)
        MIfy.append(t1[idx])
        f_select = X[:, idx]

    if mode=="index":
        return np.array(F, dtype=int)
    else:
        # make sure that F is the same size??
        return reverse_argsort(F, size=X.shape[1])
Code example #8
File: FCBF.py Project: EricSchles/scikit-feature-1
def fcbf(X, y, mode="rank", **kwargs):
    """
    This function implements Fast Correlation Based Filter algorithm

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data, guaranteed to be discrete
    y: {numpy array}, shape (n_samples,)
        input class labels
    kwargs: {dictionary}
        delta: {float}
            delta is a threshold parameter, the default value of delta is 0

    Output
    ------
    F: {numpy array}, shape (n_features,)
        index of selected features, F[0] is the most important feature
    SU: {numpy array}, shape (n_features,)
        symmetrical uncertainty of selected features

    Reference
    ---------
        Yu, Lei and Liu, Huan. "Feature Selection for High-Dimensional Data: A Fast Correlation-Based Filter Solution." ICML 2003.
    """

    n_samples, n_features = X.shape
    if 'delta' in kwargs:
        delta = kwargs['delta']
    else:
        # the default value of delta is 0
        delta = 0

    # t1[:,0] stores index of features, t1[:,1] stores symmetrical uncertainty of features
    t1 = np.zeros((n_features, 2))
    for i in range(n_features):
        f = X[:, i]
        t1[i, 0] = i
        t1[i, 1] = su_calculation(f, y)
    # keep only features whose SU with the class exceeds delta
    # (the SU column must stay float so np.argmax below compares real SU values)
    s_list = t1[t1[:, 1] > delta, :]
    # index of selected features, initialized to be empty
    F = []
    # Symmetrical uncertainty of selected features
    SU = []
    while len(s_list) != 0:
        # select the largest su inside s_list
        idx = np.argmax(s_list[:, 1])
        # record the index of the feature with the largest su
        fp_idx = int(s_list[idx, 0])
        fp = X[:, fp_idx]
        F.append(fp_idx)
        SU.append(s_list[idx, 1])
        # fp itself is removed by the redundancy check below, since su(fp, fp) >= su(fp, y)
        for i in s_list[:, 0].astype(int):
            fi = X[:, i]
            if su_calculation(fp, fi) >= t1[i, 1]:
                # remove feature i: its su with the selected fp dominates su(fi, y)
                s_list = s_list[s_list[:, 0] != i]
    if mode == "index":
        return np.array(F, dtype=int)
    else:
        # expand F into a ranking over all X.shape[1] features
        return reverse_argsort(F, size=X.shape[1])
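FCBF first keeps every feature whose symmetrical uncertainty with the class exceeds delta, then prunes features that are more strongly correlated with an already selected feature than with the class. A usage sketch on discrete data (su_calculation is assumed to come from scikit-feature's utilities):

import numpy as np

rng = np.random.RandomState(1)
X = rng.randint(0, 2, size=(100, 8))      # discrete (binary) features
flip = rng.rand(100) < 0.1
y = np.where(flip, 1 - X[:, 3], X[:, 3])  # feature 3 with 10% label noise
F = fcbf(X, y, mode="index", delta=0.0)   # delta=0 keeps any feature with SU > 0
# F lists the selected, non-redundant features, strongest first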
Code example #9
def disr(X, y, mode="rank", **kwargs):
    """
    This function implements the DISR feature selection.
    The scoring criterion is calculated based on the formula j_disr = sum_j(I(f,fj;y)/H(f,fj,y))

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data, guaranteed to be a discrete data matrix
    y: {numpy array}, shape (n_samples,)
        input class labels

    kwargs: {dictionary}
        n_selected_features: {int}
            number of features to select

    Output
    ------
    F: {numpy array}, shape (n_features, )
        index of selected features, F[0] is the most important feature
    J_DISR: {numpy array}, shape: (n_features,)
        corresponding objective function value of selected features
    MIfy: {numpy array}, shape: (n_features,)
        corresponding mutual information between selected features and response

    Reference
    ---------
    Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012.
    """

    n_samples, n_features = X.shape
    # index of selected features, initialized to be empty
    F = []
    # Objective function value for selected features
    J_DISR = []
    # Mutual information between feature and response
    MIfy = []
    # indicate whether the user specifies the number of features
    is_n_selected_features_specified = False

    if 'n_selected_features' in kwargs:
        n_selected_features = kwargs['n_selected_features']
        is_n_selected_features_specified = True

    # sum_disr[i] accumulates sum_j(I(fi,fj;y)/H(fi,fj,y)) for feature fi (renamed to avoid shadowing built-in sum)
    sum_disr = np.zeros(n_features)

    # make sure that j_cmi is positive at the very beginning
    j_disr = 1

    while True:
        if len(F) == 0:
            # t1 stores I(f;y) for each feature f
            t1 = np.zeros(n_features)
            for i in range(n_features):
                f = X[:, i]
                t1[i] = midd(f, y)
            # select the feature whose mutual information is the largest
            idx = np.argmax(t1)
            F.append(idx)
            J_DISR.append(t1[idx])
            MIfy.append(t1[idx])
            f_select = X[:, idx]

        if is_n_selected_features_specified:
            if len(F) == n_selected_features:
                break
        else:
            if j_disr <= 0:
                break

        # we assign an extremely small value to j_disr to ensure that it is smaller than all possible values of j_disr
        j_disr = -1E30
        for i in range(n_features):
            if i not in F:
                f = X[:, i]
                t2 = midd(f_select, y) + cmidd(f, y, f_select)
                t3 = entropyd(f) + conditional_entropy(f_select, f) + (conditional_entropy(y, f_select) - cmidd(y, f, f_select))
                sum_disr[i] += np.true_divide(t2, t3)
                # record the largest j_disr and the corresponding feature index
                if sum_disr[i] > j_disr:
                    j_disr = sum_disr[i]
                    idx = i
        F.append(idx)
        J_DISR.append(j_disr)
        MIfy.append(t1[idx])
        f_select = X[:, idx]

    if mode=="index":
        return F
    else:
        return reverse_argsort(F, X.shape[1])
Code example #10
def lap_score(X, y=None, mode="rank", **kwargs):
    """
    This function implements the laplacian score feature selection, steps are as follows:
    1. Construct the affinity matrix W if it is not specified
    2. For the r-th feature, we define fr = X(:,r), D = diag(W*ones), ones = [1,...,1]', L = D - W
    3. Let fr_hat = fr - (fr'*D*ones)*ones/(ones'*D*ones)
    4. Laplacian score for the r-th feature is score = (fr_hat'*L*fr_hat)/(fr_hat'*D*fr_hat)

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    kwargs: {dictionary}
        W: {sparse matrix}, shape (n_samples, n_samples)
            input affinity matrix

    Output
    ------
    score: {numpy array}, shape (n_features,)
        laplacian score for each feature

    Reference
    ---------
    He, Xiaofei et al. "Laplacian Score for Feature Selection." NIPS 2005.
    """

    def feature_ranking(score):
        """
        Rank features in ascending order according to their laplacian scores, the smaller the laplacian score is, the more
        important the feature is
        """
        idx = np.argsort(score, 0)
        return idx
    
    # if 'W' is not specified, construct the default affinity matrix
    if 'W' not in kwargs:
        W = construct_W(X)
    else:
        W = kwargs['W']
    # build the diagonal D matrix from affinity matrix W
    D = np.array(W.sum(axis=1))
    L = W
    tmp = np.dot(np.transpose(D), X)
    D = diags(np.transpose(D), [0])
    Xt = np.transpose(X)
    t1 = np.transpose(np.dot(Xt, D.todense()))
    t2 = np.transpose(np.dot(Xt, L.todense()))
    # compute the numerator of Lr
    D_prime = np.sum(np.multiply(t1, X), 0) - np.multiply(tmp, tmp)/D.sum()
    # compute the denominator of Lr
    L_prime = np.sum(np.multiply(t2, X), 0) - np.multiply(tmp, tmp)/D.sum()
    # avoid the denominator of Lr to be 0
    D_prime[D_prime < 1e-12] = 10000

    # compute laplacian score for all features
    score = 1 - np.array(np.multiply(L_prime, 1/D_prime))[0, :]
    F = feature_ranking(np.transpose(score))

    if mode=="index":
        return np.array(F, dtype=int)
    else:
        # make sure that F is the same size??
        return reverse_argsort(F, size=X.shape[1])
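Laplacian score is unsupervised: it only needs a sample-affinity matrix W. A usage sketch building W with scikit-feature's construct_W (the kwargs shown are the same ones used in the trace_ratio listing further below):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(50, 10)
W = construct_W(X, metric="euclidean", neighbor_mode="knn", k=5,
                weight_mode="heat_kernel", t=1)
idx = lap_score(X, mode="index", W=W)
# idx orders features by ascending Laplacian score (smaller = more important)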
Code example #11
File: SPEC.py Project: EricSchles/scikit-feature-1
def spec(X, y=None, mode='rank', **kwargs):
    """
    This function implements the SPEC feature selection

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    kwargs: {dictionary}
        style: {int}
            style == -1, the first feature ranking function, use all eigenvalues
            style == 0, the second feature ranking function, use all except the 1st eigenvalue
            style >= 2, the third feature ranking function, use the first k except 1st eigenvalue
        W: {sparse matrix}, shape (n_samples, n_samples)
            input affinity matrix

    Output
    ------
    w_fea: {numpy array}, shape (n_features,)
        SPEC feature score for each feature

    Reference
    ---------
    Zhao, Zheng and Liu, Huan. "Spectral Feature Selection for Supervised and Unsupervised Learning." ICML 2007.
    """
    def feature_ranking(score, **kwargs):
        if 'style' not in kwargs:
            kwargs['style'] = 0
        style = kwargs['style']

        # if style = -1 or 0, ranking features in descending order, the higher the score, the more important the feature is
        if style == -1 or style == 0:
            idx = np.argsort(score, 0)
            return idx[::-1]
        # if style != -1 and 0, ranking features in ascending order, the lower the score, the more important the feature is
        elif style != -1 and style != 0:
            idx = np.argsort(score, 0)
            return idx

    if 'style' not in kwargs:
        kwargs['style'] = 0
    if 'is_classification' not in kwargs:
        # if y is available then we do supervised SPEC algo.
        kwargs['is_classification'] = True
    if 'W' not in kwargs:
        if y is None:
            kwargs['W'] = rbf_kernel(X, gamma=1)
        elif kwargs['is_classification']:
            kwargs['W'] = similiarity_classification(X, y)
        else:
            kwargs['W'] = similarity_regression(
                X, y, kwargs.get('n_neighbors', None))

    style = kwargs['style']
    W = kwargs['W']
    if type(W) is numpy.ndarray:
        W = csc_matrix(W)

    n_samples, n_features = X.shape

    # build the degree matrix
    X_sum = np.array(W.sum(axis=1))
    D = np.zeros((n_samples, n_samples))
    for i in range(n_samples):
        D[i, i] = X_sum[i]

    # build the laplacian matrix
    L = D - W
    d1 = np.power(np.array(W.sum(axis=1)), -0.5)
    d1[np.isinf(d1)] = 0
    d2 = np.power(np.array(W.sum(axis=1)), 0.5)
    v = np.dot(np.diag(d2[:, 0]), np.ones(n_samples))
    v = v / LA.norm(v)

    # build the normalized laplacian matrix
    L_hat = (np.matlib.repmat(d1, 1,
                              n_samples)) * np.array(L) * np.matlib.repmat(
                                  np.transpose(d1), n_samples, 1)

    # calculate and construct spectral information
    s, U = np.linalg.eigh(L_hat)
    s = np.flipud(s)
    U = np.fliplr(U)

    # begin to select features
    w_fea = np.ones(n_features) * 1000

    for i in range(n_features):
        f = X[:, i]
        F_hat = np.dot(np.diag(d2[:, 0]), f)
        l = LA.norm(F_hat)
        if l < 100 * np.spacing(1):
            w_fea[i] = 1000
            continue
        else:
            F_hat = F_hat / l
        a = np.array(np.dot(np.transpose(F_hat), U))
        a = np.multiply(a, a)
        a = np.transpose(a)

        # use f'Lf formulation
        if style == -1:
            w_fea[i] = np.sum(a * s)
        # using all eigenvalues except the 1st
        elif style == 0:
            a1 = a[0:n_samples - 1]
            w_fea[i] = np.sum(a1 * s[0:n_samples - 1]) / (
                1 - np.power(np.dot(np.transpose(F_hat), v), 2))
        # use first k except the 1st
        else:
            a1 = a[n_samples - style:n_samples - 1]
            w_fea[i] = np.sum(a1 * (2 - s[n_samples - style:n_samples - 1]))

    if style != -1 and style != 0:
        w_fea[w_fea == 1000] = -1000

    if mode == 'raw':
        return w_fea
    elif mode == 'index':
        return feature_ranking(w_fea)
    else:
        return reverse_argsort(feature_ranking(w_fea))
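SPEC runs both unsupervised (y=None builds an RBF affinity matrix) and supervised; style picks the ranking function described in the docstring. A usage sketch, assuming the rbf_kernel / similarity helpers imported by the module are available:

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(60, 12)

w = spec(X, mode="raw", style=0)      # raw SPEC score per feature (unsupervised)
idx = spec(X, mode="index", style=0)  # with style 0, higher score = more relevant, best first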
Code example #12
def fisher_score(X, y, mode="rank"):
    """
    This function implements the fisher score feature selection, steps are as follows:
    1. Construct the affinity matrix W in fisher score way
    2. For the r-th feature, we define fr = X(:,r), D = diag(W*ones), ones = [1,...,1]', L = D - W
    3. Let fr_hat = fr - (fr'*D*ones)*ones/(ones'*D*ones)
    4. Fisher score for the r-th feature is score = (fr_hat'*D*fr_hat)/(fr_hat'*L*fr_hat)-1

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels

    Output
    ------
    score: {numpy array}, shape (n_features,)
        fisher score for each feature

    Reference
    ---------
    He, Xiaofei et al. "Laplacian Score for Feature Selection." NIPS 2005.
    Duda, Richard et al. "Pattern classification." John Wiley & Sons, 2012.
    """
    def feature_ranking(score):
        idx = np.argsort(score, 0)
        return idx[::-1]
    
    # Construct weight matrix W in a fisherScore way
    kwargs = {"neighbor_mode": "supervised", "fisher_score": True, 'y': y}
    W = construct_W(X, **kwargs)

    # build the diagonal D matrix from affinity matrix W
    D = np.array(W.sum(axis=1))
    L = W
    tmp = np.dot(np.transpose(D), X)
    D = diags(np.transpose(D), [0])
    Xt = np.transpose(X)
    t1 = np.transpose(np.dot(Xt, D.todense()))
    t2 = np.transpose(np.dot(Xt, L.todense()))
    # compute the numerator of Lr
    D_prime = np.sum(np.multiply(t1, X), 0) - np.multiply(tmp, tmp)/D.sum()
    # compute the denominator of Lr
    L_prime = np.sum(np.multiply(t2, X), 0) - np.multiply(tmp, tmp)/D.sum()
    # avoid the denominator of Lr to be 0
    D_prime[D_prime < 1e-12] = 10000
    lap_score = 1 - np.array(np.multiply(L_prime, 1/D_prime))[0, :]

    # compute fisher score from laplacian score, where fisher_score = 1/lap_score - 1
    score = 1.0/lap_score - 1

    """
    Rank features in descending order according to fisher score, the larger the fisher score, the more important the
    feature is
    """
    F = feature_ranking(np.transpose(score))
    
    if mode=="index":
        return F
    else:
        return reverse_argsort(F, X.shape[1])
Code example #13
def proximal_gradient_descent(X, Y_flat, z, mode="rank", **kwargs):
    """
    This function implements supervised sparse feature selection via l2,1 norm, i.e.,
    min_{W} sum_{i}log(1+exp(-yi*(W'*x+C))) + z*||W||_{2,1}

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    Y: {numpy array}, shape (n_samples, n_classes)
        input class labels, each row is a one-hot-coding class label, guaranteed to be a numpy array
    z: {float}
        regularization parameter
    kwargs: {dictionary}
        verbose: {boolean}
            True if user want to print out the objective function value in each iteration, false if not

    Output
    ------
    W: {numpy array}, shape (n_features, n_classes)
        weight matrix
    obj: {numpy array}, shape (n_iterations,)
        objective function value during iterations
    value_gamma: {numpy array}, shape (n_iterations,)
        suitable step size during iterations


    Reference:
        Liu, Jun, et al. "Multi-Task Feature Learning Via Efficient l2,1-Norm Minimization." UAI. 2009.
    """

    if 'verbose' not in kwargs:
        verbose = False
    else:
        verbose = kwargs['verbose']

    # Starting point initialization #

    # convert Y_flat to one hot encoded
    Y = construct_label_matrix_pan(Y_flat)
    n_samples, n_features = X.shape
    n_samples, n_classes = Y.shape

    # the indices of positive samples
    p_flag = (Y == 1)
    # the total number of positive samples
    n_positive_samples = np.sum(p_flag, 0)
    # the total number of negative samples
    n_negative_samples = n_samples - n_positive_samples
    n_positive_samples = n_positive_samples.astype(float)
    n_negative_samples = n_negative_samples.astype(float)

    # initialize a starting point
    W = np.zeros((n_features, n_classes))
    C = np.log(np.divide(n_positive_samples, n_negative_samples))

    # compute XW = X*W
    XW = np.dot(X, W)

    # starting the main program, the Armijo Goldstein line search scheme + accelerated gradient descent
    # the initial guess of the Lipschitz constant of the gradient
    gamma = 1.0 / (n_samples * n_classes)

    # assign Wp with W, and XWp with XW
    XWp = XW
    WWp = np.zeros((n_features, n_classes))
    CCp = np.zeros((1, n_classes))

    alphap = 0
    alpha = 1

    # indicates whether the gradient step only changes a little
    flag = False

    max_iter = 1000
    value_gamma = np.zeros(max_iter)
    obj = np.zeros(max_iter)
    for iter_step in range(max_iter):
        # step1: compute search point S based on Wp and W (with beta)
        beta = (alphap - 1) / alpha
        S = W + beta * WWp
        SC = C + beta * CCp

        # step2: line search for gamma and compute the new approximation solution W
        XS = XW + beta * (XW - XWp)
        aa = -np.multiply(Y, XS + np.tile(SC, (n_samples, 1)))
        # fun_S is the logistic loss at the search point
        bb = np.maximum(aa, 0)
        fun_S = np.sum(np.log(np.exp(-bb) + np.exp(aa - bb)) +
                       bb) / (n_samples * n_classes)
        # compute prob = [p_1;p_2;...;p_m]
        prob = 1.0 / (1 + np.exp(aa))

        b = np.multiply(-Y, (1 - prob)) / (n_samples * n_classes)
        # compute the gradient of C
        GC = np.sum(b, 0)
        # compute the gradient of W as X'*b
        G = np.dot(np.transpose(X), b)

        # copy W and XW to Wp and XWp
        Wp = W
        XWp = XW
        Cp = C

        while True:
            # let S walk in a step in the antigradient of S to get V and then do the L1/L2-norm regularized projection
            V = S - G / gamma
            C = SC - GC / gamma
            W = euclidean_projection(V, n_features, n_classes, z, gamma)

            # the difference between the new approximate solution W and the search point S
            V = W - S
            # compute XW = X*W
            XW = np.dot(X, W)
            aa = -np.multiply(Y, XW + np.tile(C, (n_samples, 1)))
            # fun_W is the logistic loss at the new approximate solution
            bb = np.maximum(aa, 0)
            fun_W = np.sum(np.log(np.exp(-bb) + np.exp(aa - bb)) +
                           bb) / (n_samples * n_classes)

            r_sum = (LA.norm(V, 'fro')**2 + LA.norm(C - SC, 2)**2) / 2
            l_sum = fun_W - fun_S - np.sum(np.multiply(V, G)) - np.inner(
                (C - SC), GC)

            # determine whether the gradient step makes little improvement
            if r_sum <= 1e-20:
                flag = True
                break

            # the condition is fun_W <= fun_S + <V, G> + <C ,GC> + gamma/2 * (<V,V> + <C-SC,C-SC> )
            if l_sum < r_sum * gamma:
                break
            else:
                gamma = max(2 * gamma, l_sum / r_sum)
        value_gamma[iter_step] = gamma

        # step3: update alpha and alphap, and check whether it has converged
        alphap = alpha
        alpha = (1 + math.sqrt(4 * alpha * alpha + 1)) / 2

        WWp = W - Wp
        CCp = C - Cp

        # calculate obj
        obj[iter_step] = fun_W
        obj[iter_step] += z * calculate_l21_norm(W)

        if verbose:
            print('obj at iter {0}: {1}'.format(iter_step + 1, obj[iter_step]))

        if flag is True:
            break

        # determine whether it has converged
        if iter_step >= 1 and math.fabs(obj[iter_step] -
                                        obj[iter_step - 1]) < 1e-3:
            break
    if mode == "raw":
        return W, obj, value_gamma
    elif mode == "rank":
        # convert the weight matrix W into a per-feature ranking
        idx = feature_ranking(W).tolist()
        return reverse_argsort(idx, size=X.shape[1])
    else:
        print("Invalid mode {} selected, should be one of \"raw\" or \"rank\"".
              format(mode))
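A usage sketch for the l2,1-regularized logistic selector above. Y_flat is an ordinary label vector (it is one-hot encoded internally via construct_label_matrix_pan), z controls the sparsity of W, and mode="raw" returns the weight matrix together with the objective and step-size traces (the euclidean_projection and feature_ranking helpers from scikit-feature's sparse_learning utilities are assumed to be importable):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(100, 20)
y = rng.randint(0, 3, size=100)   # flat 3-class labels

W, obj, value_gamma = proximal_gradient_descent(X, y, z=0.1, mode="raw")
# row i of W holds feature i's weights across classes; rows with larger norms
# correspond to more important features
importance = np.linalg.norm(W, axis=1)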
Code example #14
def proximal_gradient_descent(X, Y_flat, z, mode="rank", **kwargs):
    """
    This function implements supervised sparse feature selection via l2,1 norm, i.e.,
    min_{W} ||XW-Y||_F^2 + z*||W||_{2,1}

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data, guaranteed to be a numpy array
    Y: {numpy array}, shape (n_samples, n_classes)
        input class labels, each row is a one-hot-coding class label
    z: {float}
        regularization parameter
    kwargs: {dictionary}
        verbose: {boolean}
            True if user want to print out the objective function value in each iteration, false if not

    Output
    ------
        W: {numpy array}, shape (n_features, n_classes)
            weight matrix
        obj: {numpy array}, shape (n_iterations,)
            objective function value during iterations
        value_gamma: {numpy array}, shape (n_iterations,)
            suitable step size during iterations

    Reference
    ---------
        Liu, Jun, et al. "Multi-Task Feature Learning Via Efficient l2,1-Norm Minimization." UAI. 2009.
    """
    def init_factor(W_norm, XW, Y, z):
        """
        Initialize the starting point of W, according to the author's code
        """
        n_samples, n_classes = XW.shape
        a = np.inner(np.reshape(XW, n_samples * n_classes),
                     np.reshape(Y, n_samples * n_classes)) - z * W_norm
        b = LA.norm(XW, 'fro')**2
        ratio = a / b
        return ratio

    if 'verbose' not in kwargs:
        verbose = False
    else:
        verbose = kwargs['verbose']

    # convert Y_flat to one hot encoded
    Y = construct_label_matrix_pan(Y_flat)
    # starting point initialization
    n_samples, n_features = X.shape
    n_samples, n_classes = Y.shape

    # compute X'Y
    XtY = np.dot(np.transpose(X), Y)

    # initialize a starting point
    W = XtY

    # compute XW = X*W
    XW = np.dot(X, W)

    # compute l2,1 norm of W
    W_norm = calculate_l21_norm(W)

    if W_norm >= 1e-6:
        ratio = init_factor(W_norm, XW, Y, z)
        W = ratio * W
        XW = ratio * XW

    # starting the main program, the Armijo Goldstein line search scheme + accelerated gradient descent
    # initialize step size gamma = 1
    gamma = 1

    # assign Wp with W, and XWp with XW
    XWp = XW
    WWp = np.zeros((n_features, n_classes))
    alphap = 0
    alpha = 1

    # indicate whether the gradient step only changes a little
    flag = False

    max_iter = 1000
    value_gamma = np.zeros(max_iter)
    obj = np.zeros(max_iter)
    for iter_step in range(max_iter):
        # step1: compute search point S based on Wp and W (with beta)
        beta = (alphap - 1) / alpha
        S = W + beta * WWp

        # step2: line search for gamma and compute the new approximation solution W
        XS = XW + beta * (XW - XWp)
        # compute X'* XS
        XtXS = np.dot(np.transpose(X), XS)
        # obtain the gradient g
        G = XtXS - XtY
        # copy W and XW to Wp and XWp
        Wp = W
        XWp = XW

        while True:
            # let S walk in a step in the antigradient of S to get V and then do the L1/L2-norm regularized projection
            V = S - G / gamma
            W = euclidean_projection(V, n_features, n_classes, z, gamma)
            # the difference between the new approximate solution W and the search point S
            V = W - S
            # compute XW = X*W
            XW = np.dot(X, W)
            XV = XW - XS
            r_sum = LA.norm(V, 'fro')**2
            l_sum = LA.norm(XV, 'fro')**2

            # determine whether the gradient step makes little improvement
            if r_sum <= 1e-20:
                flag = True
                break

            # the condition is ||XV||_2^2 <= gamma * ||V||_2^2
            if l_sum < r_sum * gamma:
                break
            else:
                gamma = max(2 * gamma, l_sum / r_sum)
        value_gamma[iter_step] = gamma

        # step3: update alpha and alphap, and check whether it has converged
        alphap = alpha
        alpha = (1 + math.sqrt(4 * alpha * alpha + 1)) / 2

        WWp = W - Wp
        XWY = XW - Y

        # calculate obj
        obj[iter_step] = LA.norm(XWY, 'fro')**2 / 2
        obj[iter_step] += z * calculate_l21_norm(W)

        if verbose:
            print('obj at iter {0}: {1}'.format(iter_step + 1, obj[iter_step]))
        if flag is True:
            break

        # determine whether it has converged
        if iter_step >= 1 and math.fabs(obj[iter_step] -
                                        obj[iter_step - 1]) < 1e-3:
            break
    if mode == "raw":
        return W, obj, value_gamma
    elif mode == "rank":
        # convert the weight matrix W into a per-feature ranking
        idx = feature_ranking(W).tolist()
        return reverse_argsort(idx, size=X.shape[1])
    else:
        print("Invalid mode {} selected, should be one of \"raw\" or \"rank\"".
              format(mode))
Code example #15
def gini_index(X, y, mode="index"):
    """
    This function implements the gini index feature selection.

    Input
    ----------
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels

    Output
    ----------
    gini: {numpy array}, shape (n_features, )
        gini index value of each feature
    """
    def feature_ranking(W):
        """
        Rank features in ascending order of their gini index values; the smaller the gini index,
        the more important the feature is
        """
        idx = np.argsort(W)
        return idx

    n_samples, n_features = X.shape

    # initialize gini_index for all features to be 1
    gini = np.ones(n_features)

    # For i-th feature we define fi = x[:,i] ,v include all unique values in fi
    for i in range(n_features):
        v = np.unique(X[:, i])
        for j in range(len(v)):
            # left_y contains labels of instances whose i-th feature value is less than or equal to v[j]
            left_y = y[X[:, i] <= v[j]]
            # right_y contains labels of instances whose i-th feature value is larger than v[j]
            right_y = y[X[:, i] > v[j]]

            # gini_left is the sum of squared class probabilities in left_y
            # gini_right is the sum of squared class probabilities in right_y
            gini_left = 0
            gini_right = 0

            for k in range(np.min(y), np.max(y)+1):
                if len(left_y) != 0:
                    # t1_left is probability of occurrence of k in left_y
                    t1_left = np.true_divide(len(left_y[left_y == k]), len(left_y))
                    t2_left = np.power(t1_left, 2)
                    gini_left += t2_left

                if len(right_y) != 0:
                    # t1_right is probability of occurrence of k in right_y
                    t1_right = np.true_divide(len(right_y[right_y == k]), len(right_y))
                    t2_right = np.power(t1_right, 2)
                    gini_right += t2_right

            gini_left = 1 - gini_left
            gini_right = 1 - gini_right

            # weighted sum of gini_left and gini_right, weighted by the subset sizes
            t1_gini = (len(left_y) * gini_left + len(right_y) * gini_right)

            # compute the gini_index for the i-th feature
            value = np.true_divide(t1_gini, len(y))

            if value < gini[i]:
                gini[i] = value
    F = feature_ranking(gini)
    if mode=="index":
        return np.array(F, dtype=int)
    else:
        # make sure that F is the same size??
        return reverse_argsort(F, size=X.shape[1])
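A tiny worked example for the gini index above: a feature that splits the classes perfectly reaches a weighted gini impurity of 0, the best possible value. The labels must be integers because of the range(np.min(y), np.max(y)+1) loop:

import numpy as np

X = np.array([[0.1, 7.0],
              [0.2, 3.0],
              [0.9, 5.0],
              [1.0, 4.0]])
y = np.array([0, 0, 1, 1])              # feature 0 separates the classes perfectly

order = gini_index(X, y, mode="index")  # features sorted by ascending gini value
# order[0] == 0: splitting feature 0 at 0.2 gives gini 0; feature 1 stays at 1/3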
Code example #16
def trace_ratio(X, y, n_selected_features=None, mode='rank', **kwargs):
    """
    This function implements the trace ratio criterion for feature selection

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels
    n_selected_features: {int}
        number of features to select
    kwargs: {dictionary}
        style: {string}
            style == 'fisher', build between-class matrix and within-class affinity matrix in a fisher score way
            style == 'laplacian', build between-class matrix and within-class affinity matrix in a laplacian score way
        verbose: {boolean}
            True if user want to print out the objective function value in each iteration, False if not

    Output
    ------
    feature_idx: {numpy array}, shape (n_features,)
        the ranked (descending order) feature index based on subset-level score
    feature_score: {numpy array}, shape (n_features,)
        the feature-level score
    subset_score: {float}
        the subset-level score

    Reference
    ---------
    Feiping Nie et al. "Trace Ratio Criterion for Feature Selection." AAAI 2008.
    """
    if n_selected_features is None:
        n_selected_features = X.shape[1]
    # if 'style' is not specified, use the fisher score way to build the two affinity matrices
    if 'style' not in kwargs:
        kwargs['style'] = 'fisher'
    # get the way to build affinity matrix, 'fisher' or 'laplacian'
    style = kwargs['style']
    n_samples, n_features = X.shape

    # if 'verbose' is not specified, do not output the value of objective function
    if 'verbose' not in kwargs:
        kwargs['verbose'] = False
    verbose = kwargs['verbose']

    if style == 'fisher':
        kwargs_within = {"neighbor_mode": "supervised", "fisher_score": True, 'y': y}
        # build within class and between class laplacian matrix L_w and L_b
        W_within = construct_W(X, **kwargs_within)
        L_within = np.eye(n_samples) - W_within
        L_tmp = np.eye(n_samples) - np.ones([n_samples, n_samples])/n_samples
        L_between = L_within - L_tmp

    if style == 'laplacian':
        kwargs_within = {"metric": "euclidean", "neighbor_mode": "knn", "weight_mode": "heat_kernel", "k": 5, 't': 1}
        # build within class and between class laplacian matrix L_w and L_b
        W_within = construct_W(X, **kwargs_within)
        D_within = np.diag(np.array(W_within.sum(1))[:, 0])
        L_within = D_within - W_within
        W_between = np.dot(np.dot(D_within, np.ones([n_samples, n_samples])), D_within)/np.sum(D_within)
        D_between = np.diag(np.array(W_between.sum(1)))
        L_between = D_between - W_between

    # build X'*L_within*X and X'*L_between*X
    L_within = (np.transpose(L_within) + L_within)/2
    L_between = (np.transpose(L_between) + L_between)/2
    S_within = np.array(np.dot(np.dot(np.transpose(X), L_within), X))
    S_between = np.array(np.dot(np.dot(np.transpose(X), L_between), X))

    # reflect the within-class or local affinity relationship encoded on graph, Sw = X*Lw*X'
    S_within = (np.transpose(S_within) + S_within)/2
    # reflect the between-class or global affinity relationship encoded on graph, Sb = X*Lb*X'
    S_between = (np.transpose(S_between) + S_between)/2

    # take the absolute values of diagonal
    s_within = np.absolute(S_within.diagonal())
    s_between = np.absolute(S_between.diagonal())
    s_between[s_between == 0] = 1e-14  # this value is from the authors' code

    # preprocessing
    fs_idx = np.argsort(np.divide(s_between, s_within), 0)[::-1]
    k = np.sum(s_between[0:n_selected_features])/np.sum(s_within[0:n_selected_features])
    s_within = s_within[fs_idx[0:n_selected_features]]
    s_between = s_between[fs_idx[0:n_selected_features]]

    # iterate until convergence
    count = 0
    while True:
        score = np.sort(s_between-k*s_within)[::-1]
        I = np.argsort(s_between-k*s_within)[::-1]
        idx = I[0:n_selected_features]
        old_k = k
        k = np.sum(s_between[idx])/np.sum(s_within[idx])
        if verbose:
            print('obj at iter {0}: {1}'.format(count+1, k))
        count += 1
        if abs(k - old_k) < 1e-3:
            break

    # get feature index, feature-level score and subset-level score
    feature_idx = fs_idx[I]
    feature_score = score
    subset_score = k
    
    if mode == 'raw':
        return feature_idx, feature_score, subset_score
    elif mode == 'index':
        return feature_idx
    else:
        return reverse_argsort(feature_idx)
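A usage sketch for trace_ratio. With style='fisher' it needs class labels; mode='raw' also returns the per-feature scores and the converged subset-level score (construct_W from scikit-feature is assumed to be importable):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(60, 10)
y = rng.randint(0, 2, size=60)

feat_idx, feat_score, subset_score = trace_ratio(
    X, y, n_selected_features=5, mode="raw", style="fisher", verbose=False)
# feat_idx: the 5 requested features ranked by the trace-ratio criterion, best first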
Code example #17
File: RFS.py Project: EricSchles/scikit-feature-1
def rfs(X, Y_flat, mode='rank', **kwargs):
    """
    This function implements efficient and robust feature selection via joint l2,1-norms minimization,
    min_W ||X W - Y||_2,1 + gamma*||W||_2,1

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    Y: {numpy array}, shape (n_samples, n_classes)
        input class label matrix, each row is a one-hot-coding class label
    kwargs: {dictionary}
        gamma: {float}
            parameter in RFS
        verbose: boolean
            True if want to display the objective function value, false if not

    Output
    ------
    W: {numpy array}, shape (n_features, n_classes)
        feature weight matrix

    Reference
    ---------
    Nie, Feiping et al. "Efficient and Robust Feature Selection via Joint l2,1-Norms Minimization" NIPS 2010.
    """
    def calculate_obj(X, Y, W, gamma):
        """
        This function calculates the objective function of rfs
        """
        temp = np.dot(X, W) - Y
        return calculate_l21_norm(temp) + gamma*calculate_l21_norm(W)
    
    # convert Y_flat to one hot encoded
    Y = construct_label_matrix_pan(Y_flat)
    # default gamma is 1
    if 'gamma' not in kwargs:
        gamma = 1
    else:
        gamma = kwargs['gamma']
    if 'verbose' not in kwargs:
        verbose = False
    else:
        verbose = kwargs['verbose']

    n_samples, n_features = X.shape
    A = np.zeros((n_samples, n_samples + n_features))
    A[:, 0:n_features] = X
    A[:, n_features:n_features+n_samples] = gamma*np.eye(n_samples)
    D = np.eye(n_features+n_samples)

    max_iter = 1000
    obj = np.zeros(max_iter)
    for iter_step in range(max_iter):
        # update U as U = D^{-1} A^T (A D^-1 A^T)^-1 Y
        D_inv = LA.inv(D)
        temp = LA.inv(np.dot(np.dot(A, D_inv), A.T) + 1e-6*np.eye(n_samples))  # (A D^-1 A^T)^-1
        U = np.dot(np.dot(np.dot(D_inv, A.T), temp), Y)
        # update D as D_ii = 1 / 2 / ||U(i,:)||
        D = generate_diagonal_matrix(U)

        obj[iter_step] = calculate_obj(X, Y, U[0:n_features, :], gamma)

        if verbose:
            print('obj at iter {0}: {1}'.format(iter_step+1, obj[iter_step]))
        if iter_step >= 1 and math.fabs(obj[iter_step] - obj[iter_step-1]) < 1e-3:
            break
    
    # the first d rows of U are the feature weights
    W = U[0:n_features, :]
    if mode=="raw":
        return W
    elif mode =="index":
        return feature_ranking(W)
    elif mode == "rank":
        return reverse_argsort(feature_ranking(W))
Code example #18
File: MCFS.py Project: EricSchles/scikit-feature-1
def mcfs(X, y=None, n_selected_features=None, mode="rank", **kwargs):
    """
    This function implements unsupervised feature selection for multi-cluster data.

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    n_selected_features: {int}
        number of features to select
    kwargs: {dictionary}
        W: {sparse matrix}, shape (n_samples, n_samples)
            affinity matrix
        n_clusters: {int}
            number of clusters (default is 5)

    Output
    ------
    W: {numpy array}, shape(n_features, n_clusters)
        feature weight matrix

    Reference
    ---------
    Cai, Deng et al. "Unsupervised Feature Selection for Multi-Cluster Data." KDD 2010.
    """
    def feature_ranking(W):
        """
        This function computes MCFS score and ranking features according to feature weights matrix W
        """
        mcfs_score = W.max(1)
        idx = np.argsort(mcfs_score, 0)
        idx = idx[::-1]
        return idx

    if n_selected_features is None:
        n_selected_features = int(X.shape[1])

    # use the default affinity matrix
    if 'W' not in kwargs:
        W = construct_W(X)
    else:
        W = kwargs['W']
    # default number of clusters is 5
    if 'n_clusters' not in kwargs:
        n_clusters = 5
    else:
        n_clusters = kwargs['n_clusters']

    # solve the generalized eigen-decomposition problem and get the top K
    # eigen-vectors with respect to the smallest eigenvalues
    W = W.toarray()
    W = (W + W.T) / 2
    W_norm = np.diag(np.sqrt(1 / W.sum(1)))
    W = np.dot(W_norm, np.dot(W, W_norm))
    WT = W.T
    W[W < WT] = WT[W < WT]
    eigen_value, ul = scipy.linalg.eigh(a=W)
    Y = np.dot(W_norm, ul[:, -1 * n_clusters - 1:-1])

    # solve K L1-regularized regression problem using LARs algorithm with cardinality constraint being d
    n_sample, n_feature = X.shape
    W = np.zeros((n_feature, n_clusters))
    for i in range(n_clusters):
        clf = linear_model.Lars(n_nonzero_coefs=n_selected_features)
        clf.fit(X, Y[:, i])
        W[:, i] = clf.coef_

    if mode == "raw":
        return W
    elif mode == "index":
        return feature_ranking(W)
    elif mode == "rank":
        W_idx = feature_ranking(W)
        return reverse_argsort(W_idx, X.shape[1])
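MCFS is unsupervised: it embeds the affinity graph spectrally and then solves one LARS regression per cluster, with n_selected_features capping the number of non-zero LARS coefficients. A usage sketch (construct_W and sklearn's linear_model.Lars are assumed to be the module's imports):

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(100, 15)

idx = mcfs(X, n_selected_features=5, mode="index", n_clusters=3)
# idx orders features by their largest LARS coefficient across the 3 clusters;
# mode="raw" would return the (n_features, n_clusters) weight matrix instead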
Code example #19
def cmim(X, y, mode="rank", **kwargs):
    """
    This function implements the CMIM feature selection.
    The scoring criterion is calculated based on the formula j_cmim = I(f;y) - max_j(I(fj;f) - I(fj;f|y))

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        Input data, guaranteed to be a discrete numpy array
    y: {numpy array}, shape (n_samples,)
        guaranteed to be a numpy array
    kwargs: {dictionary}
        n_selected_features: {int}
            number of features to select

    Output
    ------
    F: {numpy array}, shape (n_features,)
        index of selected features, F[0] is the most important feature
    J_CMIM: {numpy array}, shape: (n_features,)
        corresponding objective function value of selected features
    MIfy: {numpy array}, shape: (n_features,)
        corresponding mutual information between selected features and response

    Reference
    ---------
    Brown, Gavin et al. "Conditional Likelihood Maximisation: A Unifying Framework for Information Theoretic Feature Selection." JMLR 2012.
    """

    n_samples, n_features = X.shape
    # index of selected features, initialized to be empty
    F = []
    # Objective function value for selected features
    J_CMIM = []
    # Mutual information between feature and response
    MIfy = []
    # indicate whether the user specifies the number of features
    is_n_selected_features_specified = False

    if 'n_selected_features' in kwargs:
        n_selected_features = kwargs['n_selected_features']
        is_n_selected_features_specified = True

    # t1 stores I(f;y) for each feature f
    t1 = np.zeros(n_features)

    # max_term[i] stores max_j(I(fj;fi) - I(fj;fi|y)) for feature fi (renamed to avoid shadowing built-in max)
    # it is initialized to an extremely small value so that it starts below any attainable value
    max_term = -10000000*np.ones(n_features)
    for i in range(n_features):
        f = X[:, i]
        t1[i] = midd(f, y)

    # make sure that j_cmi is positive at the very beginning
    j_cmim = 1

    while True:
        if len(F) == 0:
            # select the feature whose mutual information is the largest
            idx = np.argmax(t1)
            F.append(idx)
            J_CMIM.append(t1[idx])
            MIfy.append(t1[idx])
            f_select = X[:, idx]

        if is_n_selected_features_specified:
            if len(F) == n_selected_features:
                break
        else:
            if j_cmim <= 0:
                break

        # we assign an extreme small value to j_cmim to ensure it is smaller than all possible values of j_cmim
        j_cmim = -1000000000000
        for i in range(n_features):
            if i not in F:
                f = X[:, i]
                t2 = midd(f_select, f)
                t3 = cmidd(f_select, f, y)
                if t2-t3 > max_term[i]:
                    max_term[i] = t2-t3
                # calculate j_cmim for feature i (not in F)
                t = t1[i] - max_term[i]
                # record the largest j_cmim and the corresponding feature index
                if t > j_cmim:
                    j_cmim = t
                    idx = i
        F.append(idx)
        J_CMIM.append(j_cmim)
        MIfy.append(t1[idx])
        f_select = X[:, idx]
    if mode=="index":
        return np.array(F)
    else:
        return reverse_argsort(F)
Code example #20
def udfs(X, y=None, mode='rank', **kwargs):
    """
    This function implements l2,1-norm regularized discriminative feature
    selection for unsupervised learning, i.e., min_W Tr(W^T M W) + gamma ||W||_{2,1}, s.t. W^T W = I

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    kwargs: {dictionary}
        gamma: {float}
            parameter in the objective function of UDFS (default is 1)
        n_clusters: {int}
            Number of clusters
        k: {int}
            number of nearest neighbor
        verbose: {boolean}
            True if want to display the objective function value, false if not

    Output
    ------
    W: {numpy array}, shape(n_features, n_clusters)
        feature weight matrix

    Reference
    ---------
    Yang, Yi et al. "l2,1-Norm Regularized Discriminative Feature Selection for Unsupervised Learning." AAAI 2012.
    """
    def construct_M(X, k, gamma):
        """
        This function constructs the M matrix described in the paper
        """
        n_sample, n_feature = X.shape
        Xt = X.T
        D = pairwise_distances(X)
        # sort the distance matrix D in ascending order
        idx = np.argsort(D, axis=1)
        # choose the k-nearest neighbors for each instance
        idx_new = idx[:, 0:k + 1]
        H = np.eye(k + 1) - 1 / (k + 1) * np.ones((k + 1, k + 1))
        I = np.eye(k + 1)
        Mi = np.zeros((n_sample, n_sample))
        for i in range(n_sample):
            Xi = Xt[:, idx_new[i, :]]
            Xi_tilde = np.dot(Xi, H)
            Bi = np.linalg.inv(np.dot(Xi_tilde.T, Xi_tilde) + gamma * I)
            Si = np.zeros((n_sample, k + 1))
            for q in range(k + 1):
                Si[idx_new[i, q], q] = 1
            Mi = Mi + np.dot(np.dot(Si, np.dot(np.dot(H, Bi), H)), Si.T)
        M = np.dot(np.dot(X.T, Mi), X)
        return M

    def calculate_obj(X, W, M, gamma):
        """
        This function calculates the objective function of UDFS described in the paper
        """
        return np.trace(np.dot(np.dot(W.T, M),
                               W)) + gamma * calculate_l21_norm(W)

    # default gamma is 0.1
    if 'gamma' not in kwargs:
        gamma = 0.1
    else:
        gamma = kwargs['gamma']
    # default k is set to be 5
    if 'k' not in kwargs:
        k = 5
    else:
        k = kwargs['k']
    if 'n_clusters' not in kwargs:
        n_clusters = 5
    else:
        n_clusters = kwargs['n_clusters']
    if 'verbose' not in kwargs:
        verbose = False
    else:
        verbose = kwargs['verbose']

    # construct M
    n_sample, n_feature = X.shape
    M = construct_M(X, k, gamma)

    D = np.eye(n_feature)
    max_iter = 1000
    obj = np.zeros(max_iter)
    for iter_step in range(max_iter):
        # update W as the eigenvectors of P corresponding to the first n_clusters
        # smallest eigenvalues
        P = M + gamma * D
        eigen_value, eigen_vector = scipy.linalg.eigh(a=P)
        W = eigen_vector[:, 0:n_clusters]
        # update D as D_ii = 1 / 2 / ||W(i,:)||
        D = generate_diagonal_matrix(W)

        obj[iter_step] = calculate_obj(X, W, M, gamma)
        if verbose:
            print('obj at iter {0}: {1}'.format(iter_step + 1, obj[iter_step]))

        if iter_step >= 1 and math.fabs(obj[iter_step] -
                                        obj[iter_step - 1]) < 1e-3:
            break
    if mode == 'raw':
        return W
    elif mode == 'index':
        return feature_ranking(W)
    elif mode == 'rank':
        return reverse_argsort(feature_ranking(W))
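A minimal usage sketch for udfs on random data (X_demo and the parameter values below are illustrative assumptions, not part of the original; feature_ranking and reverse_argsort are the module helpers used above):

import numpy as np

X_demo = np.random.rand(50, 20)                                    # 50 samples, 20 features
ranks = udfs(X_demo, mode='rank', gamma=0.1, k=5, n_clusters=3)    # rank of each feature
idx = udfs(X_demo, mode='index', n_clusters=3)                     # feature indices, most important first
W_demo = udfs(X_demo, mode='raw', n_clusters=3)                    # (n_features, n_clusters) weight matrix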
Code example #21
def ndfs(X, y=None, mode="rank", **kwargs):
    """
    This function implements unsupervised feature selection using nonnegative spectral analysis, i.e.,
    min_{F,W} Tr(F^T L F) + alpha*(||XW-F||_F^2 + beta*||W||_{2,1}) + gamma/2 * ||F^T F - I||_F^2
    s.t. F >= 0
    
    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: ignored; accepted only to keep a consistent interface (NDFS is unsupervised)
    kwargs: {dictionary}
        W: {sparse matrix}, shape (n_samples, n_samples)
            affinity matrix (constructed with construct_W(X) if not provided)
        alpha: {float}
            parameter alpha in the objective function (default is 1)
        beta: {float}
            parameter beta in the objective function (default is 1)
        gamma: {float}
            a very large number used to force F^T F = I (default is 10e8)
        F0: {numpy array}, shape (n_samples, n_clusters)
            initialization of the pseudo label matrix F; if not provided, F is
            initialized by k-means and n_clusters must be given
        n_clusters: {int}
            number of clusters
        verbose: {boolean}
            True to print the objective function value at each iteration, False otherwise (default is False)

    Output
    ------
    W: {numpy array}, shape(n_features, n_clusters)
        feature weight matrix
        
    Reference
    ---------
    Li, Zechao, et al. "Unsupervised Feature Selection Using Nonnegative Spectral Analysis." AAAI 2012.
    """
    def kmeans_initialization(X, n_clusters):
        """
        This function uses kmeans to initialize the pseudo label

        Input
        -----
        X: {numpy array}, shape (n_samples, n_features)
            input data
        n_clusters: {int}
            number of clusters

        Output
        ------
        Y: {numpy array}, shape (n_samples, n_clusters)
            pseudo label matrix
        """

        n_samples, n_features = X.shape
        # cluster the samples with k-means to obtain an initial pseudo label assignment
        kmeans = sklearn.cluster.KMeans(n_clusters=n_clusters,
                                        init='k-means++',
                                        n_init=10,
                                        max_iter=300,
                                        tol=0.0001,
                                        verbose=0,
                                        random_state=None,
                                        copy_x=True)
        kmeans.fit(X)
        labels = kmeans.labels_
        Y = np.zeros((n_samples, n_clusters))
        for row in range(0, n_samples):
            Y[row, labels[row]] = 1
        T = np.dot(Y.transpose(), Y)
        F = np.dot(Y, np.sqrt(np.linalg.inv(T)))
        F = F + 0.02 * np.ones((n_samples, n_clusters))
        return F

    def calculate_obj(X, W, F, L, alpha, beta):
        """
        This function calculates the objective function of NDFS
        """
        # Tr(F^T L F)
        T1 = np.trace(np.dot(np.dot(F.transpose(), L), F))
        T2 = np.linalg.norm(np.dot(X, W) - F, 'fro')
        T3 = (np.sqrt((W * W).sum(1))).sum()
        obj = T1 + alpha * (T2 + beta * T3)
        return obj

    # default gamma is 10e8
    if 'gamma' not in kwargs:
        gamma = 10e8
    else:
        gamma = kwargs['gamma']
    # use the default affinity matrix
    if 'W' not in kwargs:
        W = construct_W(X)
    else:
        W = kwargs['W']
    if 'alpha' not in kwargs:
        alpha = 1
    else:
        alpha = kwargs['alpha']
    if 'beta' not in kwargs:
        beta = 1
    else:
        beta = kwargs['beta']
    if 'F0' not in kwargs:
        if 'n_clusters' not in kwargs:
            raise Exception("either F0 or n_clusters should be provided")
        else:
            # initialize F
            n_clusters = kwargs['n_clusters']
            F = kmeans_initialization(X, n_clusters)
    else:
        F = kwargs['F0']
        # infer the number of clusters from the provided initialization F0
        n_clusters = F.shape[1]
    if 'verbose' not in kwargs:
        verbose = False
    else:
        verbose = kwargs['verbose']

    n_samples, n_features = X.shape

    # initialize D as identity matrix
    D = np.identity(n_features)
    I = np.identity(n_samples)

    # build the laplacian matrix L = D - W, where D is the diagonal degree matrix
    L = np.diag(np.array(W.sum(1)).flatten()) - W

    max_iter = 1000
    obj = np.zeros(max_iter)
    for iter_step in range(max_iter):
        # update W
        T = np.linalg.inv(
            np.dot(X.transpose(), X) + beta * D + 1e-6 * np.eye(n_features))
        W = np.dot(np.dot(T, X.transpose()), F)
        # update D
        temp = np.sqrt((W * W).sum(1))
        temp[temp < 1e-16] = 1e-16
        temp = 0.5 / temp
        D = np.diag(temp)
        # update M
        M = L + alpha * (I - np.dot(np.dot(X, T), X.transpose()))
        M = (M + M.transpose()) / 2
        # update F
        denominator = np.dot(M,
                             F) + gamma * np.dot(np.dot(F, F.transpose()), F)
        temp = np.divide(gamma * F, denominator)
        F = F * np.array(temp)
        temp = np.diag(np.sqrt(np.diag(1 /
                                       (np.dot(F.transpose(), F) + 1e-16))))
        F = np.dot(F, temp)

        # calculate objective function
        obj[iter_step] = np.trace(np.dot(np.dot(
            F.transpose(), M), F)) + gamma / 4 * np.linalg.norm(
                np.dot(F.transpose(), F) - np.identity(n_clusters), 'fro')
        if verbose:
            print('obj at iter {0}: {1}'.format(iter_step + 1, obj[iter_step]))
        if iter_step >= 1 and math.fabs(obj[iter_step] -
                                        obj[iter_step - 1]) < 1e-3:
            break
    F = feature_ranking(W)

    if mode == "index":
        return np.array(F, dtype=int)
    elif mode == "raw":
        return W
    else:
        # feature_ranking(W) orders all n_features indices, so the rank vector has length X.shape[1]
        return reverse_argsort(F, size=X.shape[1])
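A minimal usage sketch for ndfs (X_demo and the n_clusters value are illustrative assumptions, not part of the original; the affinity matrix defaults to construct_W(X) as in the code above):

import numpy as np

X_demo = np.random.rand(60, 30)                     # 60 samples, 30 features
idx = ndfs(X_demo, mode='index', n_clusters=4)      # feature indices, most important first
W_demo = ndfs(X_demo, mode='raw', n_clusters=4)     # (n_features, n_clusters) weight matrix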
Code example #22
    def reliefF(self, X, y, **kwargs):
        """
        This function implements the reliefF feature selection

        Input
        -----
        X: {numpy array}, shape (n_samples, n_features)
            input data
        y: {numpy array}, shape (n_samples,)
            input class labels
        kwargs: {dictionary}
            parameters of reliefF:
            k: {int}
                choices for the number of neighbors (default k = 5)

        Output
        ------
        score: {numpy array}, shape (n_features,)
            reliefF score for each feature

        Reference
        ---------
        Robnik-Sikonja, Marko et al. "Theoretical and empirical analysis of relieff and rrelieff." Machine Learning 2003.
        Zhao, Zheng et al. "On Similarity Preserving Feature Selection." TKDE 2013.
        """


        if "k" not in list(kwargs.keys()):
            k = 5
        else:
            k = kwargs["k"]
        n_samples, n_features = X.shape

        # calculate pairwise distances between instances
        distance = pairwise_distances(X, metric='manhattan')

        # the number of sampled instances is equal to the number of total instances
        for idx in range(n_samples):
            score = np.zeros(n_features)
            near_hit = []
            near_miss = dict()

            self_fea = X[idx, :]
            c = np.unique(y).tolist()

            stop_dict = dict()
            for label in c:
                stop_dict[label] = 0
            del c[c.index(y[idx])]

            p_dict = dict()
            p_label_idx = float(len(y[y == y[idx]]))/float(n_samples)

            for label in c:
                p_label_c = float(len(y[y == label]))/float(n_samples)
                p_dict[label] = p_label_c/(1-p_label_idx)
                near_miss[label] = []

            distance_sort = []
            distance[idx, idx] = np.max(distance[idx, :])

            for i in range(n_samples):
                distance_sort.append([distance[idx, i], int(i), y[i]])
            distance_sort.sort(key=lambda x: x[0])

            for i in range(n_samples):
                # find k nearest hit points
                if distance_sort[i][2] == y[idx]:
                    if len(near_hit) < k:
                        near_hit.append(distance_sort[i][1])
                    elif len(near_hit) == k:
                        stop_dict[y[idx]] = 1
                else:
                    # find k nearest miss points for each label
                    if len(near_miss[distance_sort[i][2]]) < k:
                        near_miss[distance_sort[i][2]].append(distance_sort[i][1])
                    else:
                        if len(near_miss[distance_sort[i][2]]) == k:
                            stop_dict[distance_sort[i][2]] = 1
                stop = True
                for (key, value) in list(stop_dict.items()):
                    if value != 1:
                        stop = False
                if stop:
                    break

            # update reliefF score
            near_hit_term = np.zeros(n_features)
            for ele in near_hit:
                near_hit_term = np.array(abs(self_fea-X[ele, :]))+np.array(near_hit_term)

            near_miss_term = dict()
            for (label, miss_list) in list(near_miss.items()):
                near_miss_term[label] = np.zeros(n_features)
                for ele in miss_list:
                    near_miss_term[label] = np.array(abs(self_fea-X[ele, :]))+np.array(near_miss_term[label])
                score += near_miss_term[label]/(k*p_dict[label])
            score -= near_hit_term/k
            self.scoreList.append(score)
            self.feature_ranking()
        if self.mode == 'raw':
            return score
        elif self.mode == 'index':
            return feature_ranking(score)
        elif self.mode == 'rank':
            return reverse_argsort(feature_ranking(score), X.shape[1])
def reliefF(X, y, dist_params, mode="rank", **kwargs):
    """
    This function implements the reliefF feature selection

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels
    dist_params: {dictionary}
        keyword arguments forwarded to the partial_distance metric when computing
        pairwise distances
    mode: {string}
        'raw', 'index' or 'rank' (default 'rank'); controls the form of the returned result
    kwargs: {dictionary}
        parameters of reliefF:
        k: {int}
            choices for the number of neighbors (default k = 5)

    Output
    ------
    score: {numpy array}, shape (n_features,)
        reliefF score for each feature

    Reference
    ---------
    Robnik-Sikonja, Marko et al. "Theoretical and empirical analysis of relieff and rrelieff." Machine Learning 2003.
    Zhao, Zheng et al. "On Similarity Preserving Feature Selection." TKDE 2013.
    """
    def feature_ranking(score):
        """
        Rank features in descending order according to reliefF score, the higher the reliefF score, the more important the
        feature is
        """
        idx = np.argsort(score, 0)
        return idx[::-1]

    if "k" not in list(kwargs.keys()):
        k = 5
    else:
        k = kwargs["k"]
    n_samples, n_features = X.shape

    # calculate pairwise distances between instances
    distance = cdist(X, X, metric=partial_distance, **dist_params)
    distance = np.clip(distance, 0, X.shape[1])
    X = np.nan_to_num(X)

    score = np.zeros(n_features)

    # the number of sampled instances is equal to the number of total instances
    for idx in range(n_samples):
        near_hit = []
        near_miss = dict()

        self_fea = X[idx, :]
        c = np.unique(y).tolist()

        stop_dict = dict()
        for label in c:
            stop_dict[label] = 0
        del c[c.index(y[idx])]

        p_dict = dict()
        p_label_idx = float(len(y[y == y[idx]])) / float(n_samples)

        for label in c:
            p_label_c = float(len(y[y == label])) / float(n_samples)
            p_dict[label] = p_label_c / (1 - p_label_idx)
            near_miss[label] = []

        distance_sort = []
        distance[idx, idx] = np.max(distance[idx, :])

        for i in range(n_samples):
            distance_sort.append([distance[idx, i], int(i), y[i]])
        distance_sort.sort(key=lambda x: x[0])

        for i in range(n_samples):
            # find k nearest hit points
            if distance_sort[i][2] == y[idx]:
                if len(near_hit) < k:
                    near_hit.append(distance_sort[i][1])
                elif len(near_hit) == k:
                    stop_dict[y[idx]] = 1
            else:
                # find k nearest miss points for each label
                if len(near_miss[distance_sort[i][2]]) < k:
                    near_miss[distance_sort[i][2]].append(distance_sort[i][1])
                else:
                    if len(near_miss[distance_sort[i][2]]) == k:
                        stop_dict[distance_sort[i][2]] = 1
            stop = True
            for (key, value) in list(stop_dict.items()):
                if value != 1:
                    stop = False
            if stop:
                break

        # update reliefF score
        near_hit_term = np.zeros(n_features)
        for ele in near_hit:
            dist = np.zeros(X.shape[1])
            for i in range(X.shape[1]):
                if isinstance(self_fea[i], str):
                    dist[i] = 0 if self_fea[i] == X[ele, i] else 1
                else:
                    dist[i] = abs(self_fea[i] - X[ele, i])

            near_hit_term += dist

        near_miss_term = dict()
        for (label, miss_list) in list(near_miss.items()):
            near_miss_term[label] = np.zeros(n_features)
            for ele in miss_list:
                dist = np.zeros(X.shape[1])
                for i in range(X.shape[1]):
                    if isinstance(self_fea[i], str):
                        dist[i] = 0 if self_fea[i] == X[ele, i] else 1
                    else:
                        dist[i] = abs(self_fea[i] - X[ele, i])
                near_miss_term[label] = dist + np.array(near_miss_term[label])
            score += near_miss_term[label] / (k * p_dict[label])
        score -= near_hit_term / k
    if mode == 'raw':
        return score
    elif mode == 'index':
        return feature_ranking(score)
    elif mode == 'rank':
        return reverse_argsort(feature_ranking(score), X.shape[1])
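A minimal usage sketch for this reliefF variant (X_demo and y_demo are illustrative; the empty dist_params dictionary is only an assumption, since the valid keys depend on how partial_distance is defined elsewhere in this module):

import numpy as np

X_demo = np.random.rand(40, 10)                                    # 40 samples, 10 numeric features
y_demo = np.random.randint(0, 2, size=40)                          # binary class labels
ranks = reliefF(X_demo, y_demo, dist_params={}, mode='rank', k=5)  # rank of each feature
scores = reliefF(X_demo, y_demo, dist_params={}, mode='raw')       # raw reliefF score per feature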