Beispiel #1
0
def main():
    """Run MCFS feature selection on COIL20 and report clustering quality.

    Loads ``../data/COIL20.mat``, builds a kNN / heat-kernel affinity matrix,
    obtains the MCFS feature weights, keeps the first ``num_fea`` ranked
    features and prints the NMI and ACC averaged over ``num_runs`` k-means
    evaluations.
    """
    num_fea = 100       # specify the number of selected features
    num_cluster = 20    # specify the number of clusters, it is usually set as the number of classes in the ground truth
    num_runs = 20       # how many times the k-means evaluation is repeated

    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X'].astype(float)    # data
    y = mat['Y'][:, 0]            # label: first column of 'Y'

    # construct affinity matrix
    kwargs = {"metric": "euclidean", "neighborMode": "knn", "weightMode": "heatKernel", "k": 5, 't': 1}
    W = construct_W.construct_W(X, **kwargs)

    # obtain the feature weight matrix (same cluster count as the evaluation below)
    Weight = MCFS.mcfs(X, n_selected_features=num_fea, W=W, n_clusters=num_cluster)

    # rank the features from their weights; the leading indices are the ones kept
    idx = MCFS.feature_ranking(Weight)

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform kmeans clustering based on the selected features, repeated num_runs times
    nmi_total = 0
    acc_total = 0
    for _ in range(num_runs):
        nmi, acc = unsupervised_evaluation.evaluation(X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC.
    # A single pre-formatted string prints identically under the Python 2
    # print statement and the Python 3 print function.
    print('NMI: %s' % (float(nmi_total) / num_runs))
    print('ACC: %s' % (float(acc_total) / num_runs))
    def featureSelection(listNewTimeSeries, numFeature, numCluster=5):
        """
        @description  : select features using the MCFS algorithm.
        ---------
        @param  : listNewTimeSeries -- the data; it is converted with np.array
                                       and its columns are treated as features.
                  numFeature -- how many features are to be selected.
                  numCluster -- cluster count handed to MCFS, default is 5.
        -------
        @Returns  : selected_features -- the data restricted to the first
                                         numFeature ranked columns.
                    idx -- the index array returned by MCFS.feature_ranking
                           (not truncated to numFeature).
        -------
        """
        samples = np.array(listNewTimeSeries)

        # kNN graph with heat-kernel weights
        affinity = construct_W.construct_W(samples,
                                           metric="euclidean",
                                           neighborMode="knn",
                                           weightMode="heatKernel",
                                           k=5,
                                           t=1)

        weights = MCFS.mcfs(samples,
                            n_selected_features=numFeature,
                            W=affinity,
                            n_clusters=numCluster)
        ranking = MCFS.feature_ranking(weights)

        return samples[:, ranking[0:numFeature]], ranking
Beispiel #3
0
def calc_MCFS(data, n_features, n_clusters=20):
    """Return the row-wise maximum of the MCFS weight matrix of ``data``.

    The affinity matrix is built with a euclidean kNN graph (k=5) and
    heat-kernel weights (t=1); ``n_features`` and ``n_clusters`` are passed
    straight to ``MCFS.mcfs``.
    """
    affinity = construct_W.construct_W(data,
                                       metric="euclidean",
                                       neighbor_mode="knn",
                                       weight_mode="heat_kernel",
                                       k=5,
                                       t=1)
    weights = MCFS.mcfs(data,
                        n_selected_features=n_features,
                        W=affinity,
                        n_clusters=n_clusters)
    # collapse the second axis: one value per row of the weight matrix
    return weights.max(1)
Beispiel #4
0
def mcfs(trnin, num_fea):
    """Return the indices of the first ``num_fea`` features ranked by MCFS.

    ``trnin`` is the data handed to ``construct_W`` and ``MCFS.mcfs``; the
    affinity graph is a euclidean kNN graph (k=5) with heat-kernel weights
    (t=1).
    """
    from skfeature.utility import construct_W
    affinity = construct_W.construct_W(trnin,
                                       metric="euclidean",
                                       neighbor_mode="knn",
                                       weight_mode="heat_kernel",
                                       k=5,
                                       t=1)

    from skfeature.function.sparse_learning_based import MCFS
    ranking = MCFS.feature_ranking(MCFS.mcfs(trnin, num_fea, W=affinity))
    return ranking[0:num_fea]
Beispiel #5
0
def main():
    """Run MCFS feature selection on COIL20 and report clustering quality.

    Loads ``../data/COIL20.mat``, builds a kNN / heat-kernel affinity matrix,
    obtains the MCFS feature weights, keeps the first ``num_fea`` ranked
    features and prints the NMI and ACC averaged over ``num_runs`` k-means
    evaluations.
    """
    num_fea = 100  # specify the number of selected features
    num_cluster = 20  # specify the number of clusters, it is usually set as the number of classes in the ground truth
    num_runs = 20  # how many times the k-means evaluation is repeated

    # load data
    mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X'].astype(float)  # data
    y = mat['Y'][:, 0]  # label: first column of 'Y'

    # construct affinity matrix
    kwargs = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1
    }
    W = construct_w.construct_w(X, **kwargs)

    # obtain the feature weight matrix; use num_cluster so that selection and
    # evaluation cannot drift apart when the setting is edited
    Weight = MCFS.mcfs(X,
                       n_selected_features=num_fea,
                       W=W,
                       n_clusters=num_cluster)

    # rank the features from their weights; the leading indices are the ones kept
    idx = MCFS.feature_ranking(Weight)

    # obtain the dataset on the selected features
    selected_features = X[:, idx[0:num_fea]]

    # perform kmeans clustering based on the selected features, repeated num_runs times
    nmi_total = 0
    acc_total = 0
    for _ in range(num_runs):
        nmi, acc = unsupervised_evaluation.evaluation(
            X_selected=selected_features, n_clusters=num_cluster, y=y)
        nmi_total += nmi
        acc_total += acc

    # output the average NMI and average ACC (the float() numerator makes
    # this true division on every Python version)
    print('NMI:', float(nmi_total) / num_runs)
    print('ACC:', float(acc_total) / num_runs)
def SKF_mcfs(X, y):
    """Rank all features of ``X`` with MCFS.

    The number of clusters is the number of distinct values in ``y`` and the
    number of selected features is ``X.shape[1]``. Returns whatever
    ``MCFS.feature_ranking`` produces for the resulting weight matrix.
    """
    # affinity matrix: euclidean kNN graph (k=5) with heat-kernel weights (t=1)
    affinity = construct_W(X,
                           metric="euclidean",
                           neighborMode="knn",
                           weightMode="heatKernel",
                           k=5,
                           t=1)

    n_features = X.shape[1]
    # usually the number of classes in the ground truth
    n_classes = len(set(y))

    weights = MCFS.mcfs(X,
                        n_selected_features=n_features,
                        W=affinity,
                        n_clusters=n_classes)
    return MCFS.feature_ranking(weights)
Beispiel #7
0
def MCFS_FS(X_train, k, n_clusters=20):
    """Compute MCFS feature weights for ``X_train`` and rank the features.

    Parameters
    ----------
    X_train : data handed to ``construct_W`` and ``MCFS.mcfs``.
    k : number of selected features passed to ``MCFS.mcfs``.
    n_clusters : number of clusters passed to ``MCFS.mcfs``; it is usually
        set as the number of classes in the ground truth. Defaults to 20,
        the value that used to be hard-coded.

    Returns
    -------
    (idx, Weight) : the result of ``MCFS.feature_ranking`` and the weight
        matrix it was computed from.
    """
    # construct affinity matrix
    kwargs = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1
    }
    W = construct_W.construct_W(X_train, **kwargs)

    # obtain the feature weight matrix
    Weight = MCFS.mcfs(X_train,
                       n_selected_features=k,
                       W=W,
                       n_clusters=n_clusters)

    # rank the features from their weights
    idx = MCFS.feature_ranking(Weight)
    return (idx, Weight)
Beispiel #8
0
def mcfs_score(diheds, step=5, n_selected_features=20, n_clusters=20):
    """Average MCFS weight matrices over a strided subset of ``diheds``.

    Every ``step``-th element of ``diheds`` (indices 0, step, 2*step, ...)
    is treated as a data matrix: an affinity graph is built for it and
    ``MCFS.mcfs`` is run on it. The resulting weight matrices are averaged
    element-wise and the average is ranked.

    Parameters
    ----------
    diheds : indexable sequence of data matrices.
    step : stride used to subsample ``diheds`` (default 5).
    n_selected_features : passed to ``MCFS.mcfs`` (default 20).
    n_clusters : passed to ``MCFS.mcfs`` (default 20).

    Returns
    -------
    (col_mean, imp_features) : the averaged weight matrix and the result of
        ``MCFS.feature_ranking`` applied to it.

    Raises
    ------
    ValueError : if ``diheds`` is empty, since there is nothing to average.
    """
    from numpy import mean
    from skfeature.function.sparse_learning_based import MCFS
    from skfeature.utility import construct_W

    if len(diheds) == 0:
        raise ValueError("mcfs_score needs at least one data matrix in diheds")

    kwargs = {"metric": "euclidean", "neighborMode": "knn", "weightMode": "heatKernel", "k": 5, 't': 1}

    weight_matrices = []
    for i in range(0, len(diheds), step):
        X = diheds[i]
        W = construct_W.construct_W(X, **kwargs)
        weight_matrices.append(
            MCFS.mcfs(X, n_selected_features=n_selected_features, W=W, n_clusters=n_clusters))

    # element-wise mean across the sampled matrices
    col_mean = mean(weight_matrices, axis=0)
    imp_features = MCFS.feature_ranking(col_mean)
    return col_mean, imp_features
Beispiel #9
0
def MCFS(X, y=None, **kwargs):
    """Rank all features of ``X`` with MCFS.

    The number of clusters is the number of unique values in ``y`` and the
    number of selected features is ``X.shape[1]``. Returns the result of
    ``MCFS_CLASS.feature_ranking``.

    NOTE(review): caller-supplied ``**kwargs`` are accepted but never read;
    the fixed affinity settings below are always used. Presumably the
    signature exists for interface uniformity -- confirm with callers.
    NOTE(review): with the default ``y=None`` the cluster count comes from
    ``np.unique(None)`` -- confirm that callers always pass labels.
    """
    # affinity matrix settings (fixed)
    affinity_options = {
        "metric": "euclidean",
        "neighborMode": "knn",
        "weightMode": "heatKernel",
        "k": 5,
        't': 1
    }
    affinity = construct_W.construct_W(X, **affinity_options)

    n_classes = len(np.unique(y))

    # feature weight matrix, then the ranking derived from it
    weights = MCFS_CLASS.mcfs(X,
                              n_selected_features=X.shape[1],
                              W=affinity,
                              n_clusters=n_classes)
    return MCFS_CLASS.feature_ranking(weights)
Beispiel #10
0
def select(dataset, features_number, clusters_number):
    """Select features with MCFS for ``dataset`` and evaluate them with k-means.

    Reads the pickled train and test feature tables of ``dataset``, treats
    column 0 as the target and the remaining columns as features, ranks the
    train features with MCFS, keeps the first ``features_number`` of them
    and hands the reduced train/test matrices to
    ``test_feature_selection.testFeatureSelectionWithRepeatedKMeans``.

    Parameters
    ----------
    dataset : name used to build the pickle file names and log messages.
    features_number : how many ranked features to keep.
    clusters_number : cluster count for MCFS and for the k-means evaluation.
    """
    app_logger.info(
        'STARTED [MCFS Selection] on {0} with features number = {1}'.format(
            dataset, features_number),
        extra=LOGGER_EXTRA_OBJECT)

    # Retrieving all features extracted by tsfresh from the pickles on the disk.
    # The pickles are addressed relative to the project directory; from any
    # other working directory they are looked up one level higher.
    # os.path.basename honours the platform's path separator, whereas
    # splitting on '\\' only ever matched on Windows.
    project_dir = 'MCFS-Unsupervisioned-Feature-Selection'
    if os.path.basename(os.getcwd()) == project_dir:
        pickle_root = 'Pickle/AllFeatures'
    else:
        pickle_root = '../Pickle/AllFeatures'
    all_features_train = pd.read_pickle(
        '{0}/Train/{1}.pkl'.format(pickle_root, dataset))
    all_features_test = pd.read_pickle(
        '{0}/Test/{1}.pkl'.format(pickle_root, dataset))

    app_logger.info(
        'All features (including target column) trainset shape: {0}'.format(
            all_features_train.shape),
        extra=LOGGER_EXTRA_OBJECT)
    app_logger.info(
        'All features (including target column) testset shape: {0}'.format(
            all_features_test.shape),
        extra=LOGGER_EXTRA_OBJECT)

    # np.savetxt(r'testDataFrame.txt', all_features_test.values, fmt='%d')

    # Retrieving independent columns of both sets and known labels of the test set
    independent_columns_train = all_features_train.iloc[:, 1:]
    independent_columns_test = all_features_test.iloc[:, 1:]
    known_labels_test = all_features_test.iloc[:, 0]

    # Building matrix W for MCFS algorithm
    kwargs = {
        'metric': 'euclidean',
        'neighbor_mode': 'knn',
        'weight_mode': 'binary',
        'k': 3
        # 'weight_mode': 'heat_kernel',
        # 'k': 5,
        # 't': 1
    }
    W = construct_W.construct_W(independent_columns_train.values, **kwargs)

    # MCFS gives a weight to each feature
    weighted_features = MCFS.mcfs(independent_columns_train.values,
                                  features_number,
                                  W=W,
                                  n_clusters=clusters_number)

    # Ordering the features according to their weight
    ordered_features = MCFS.feature_ranking(weighted_features)

    # Getting only the first 'features_number' features
    selected_features = ordered_features[0:features_number]

    # Getting names of selected features
    names_selected_features = [
        independent_columns_train.columns[feature_index]
        for feature_index in selected_features
    ]

    # Keeping only the selected features on the train set
    selected_features_train = independent_columns_train.loc[:, names_selected_features]
    app_logger.info('Selected features trainset: {0}'.format(
        selected_features_train.shape),
                    extra=LOGGER_EXTRA_OBJECT)

    # Keeping only the selected features on the test set
    selected_features_test = independent_columns_test.loc[:, names_selected_features]
    app_logger.info('Selected features testset: {0}'.format(
        selected_features_test.shape),
                    extra=LOGGER_EXTRA_OBJECT)

    # Disabled: pickles for rfd
    # if selected_features_train.shape[0] > 1000:
    #     print('Test-set')
    #     selected_features_test.to_pickle('../rfd/Pickle_rfd/MCFS/{0}.pkl'.format(dataset))
    # else:
    #     print('Train-set')
    #     selected_features_train.to_pickle('../rfd/Pickle_rfd/MCFS/{0}.pkl'.format(dataset))
    # exit()

    # Running k-means according to selected features
    test_feature_selection.testFeatureSelectionWithRepeatedKMeans(
        'MCFS', features_number, dataset, selected_features_train.values,
        selected_features_test.values, clusters_number, known_labels_test)

    app_logger.info('ENDED [MCFS Selection] on {0}'.format(dataset),
                    extra=LOGGER_EXTRA_OBJECT)

# Testing
#select('TwoPatterns', 10, 4)
def generate_result_dist(dataset, x,y,num_select, zero_mean=False, N=1000, t=0.6, thresh=0.1):
    """Measure how well several feature selectors preserve pairwise distances.

    For the ranking-based selector (over a grid of ``preserve_pctg`` and
    ``num_use`` settings), Laplacian score, SPEC and MCFS (over several
    cluster counts), the data are restricted to the leading selected
    features for every feature count in a schedule derived from
    ``num_select``. The distance matrix of each restricted data set is
    compared against that of the full data with ``ef.dif_dist`` under the
    'l1', 'l2' and 'lmax' modes. Each result array is saved under
    ``./result/<dataset>/`` and all of them are returned.

    Parameters
    ----------
    dataset : name of the sub-directory of ``./result/`` to save into.
    x : 2-D data matrix; columns are the features being selected.
    y, t, thresh : accepted but not used by this function.
    num_select : number of features each method selects; also picks the
        schedule of evaluated feature counts.
    zero_mean : when truthy ``x`` goes through ``standardize_feature``,
        otherwise through ``normalize(x, axis=0)``.
    N : passed to ``ranking_selection``.

    Returns
    -------
    (rank_result, rank_result_l1, rank_result_l2, rank_result_lmax,
     lap_score_result, SPEC_result, MCFS_result)
        Each array has an axis of length 7 of which only rows 0-2
        ('l1', 'l2', 'lmax') are filled here; its last axis runs over the
        evaluated feature counts.
    """
    x = standardize_feature(x) if zero_mean else normalize(x, axis=0)
    _, d = x.shape

    # (start_dim, step) of the evaluated feature counts, keyed by num_select
    schedule = {300: (20, 20), 200: (20, 10), 100: (10, 10), 50: (10, 5), 20: (4, 2)}
    start_dim, step = schedule.get(num_select, (5, 1))
    dimension_list = list(range(start_dim, num_select + 1, step))
    n_dims = len(dimension_list)

    out_dir = './result/' + dataset + '/'

    # distance matrix of the full data: the reference for every comparison
    D0 = compute_dist(x)

    def record(out, k, columns):
        """Store the l1/l2/lmax discrepancies for ``columns`` in ``out[0:3, k]``."""
        D_sub = compute_dist(x[:, columns])
        for row, norm in enumerate(('l1', 'l2', 'lmax')):
            out[row, k] = ef.dif_dist(D0, D_sub, norm)

    #########  rank: parameter  preserve_pctg, num_use  #########
    preserve_pctg_list = [0.2, 0.4, 0.6, 0.8, 1]   # dimension 0
    num_use_list = [0.1, 0.2, 0.3, 0.4, 0.5]       # dimension 1

    grid_shape = [len(preserve_pctg_list), len(num_use_list), 7, n_dims]
    rank_result = np.zeros(grid_shape)
    rank_result_l1 = np.zeros(grid_shape)
    rank_result_l2 = np.zeros(grid_shape)
    rank_result_lmax = np.zeros(grid_shape)

    for i, preserve_pctg in enumerate(preserve_pctg_list):
        for j, num_use in enumerate(num_use_list):
            print(i, j)
            rank_selected, rank_selected_l1, rank_selected_l2, rank_selected_lmax = ranking_selection(
                x, num_select, N=N, num_use=int(num_use*d+1), sample_pctg=1, preserve_pctg=preserve_pctg)
            # only the plain ranking is reversed before use
            rank_selected = list(rank_selected)[::-1]

            variants = (
                (rank_result, rank_selected),
                (rank_result_l1, rank_selected_l1),
                (rank_result_l2, rank_selected_l2),
                (rank_result_lmax, rank_selected_lmax),
            )
            for k, dimension in enumerate(dimension_list):      # performance using different number of features
                for result, selected in variants:
                    # result[i, j] is a view, so the helper writes in place
                    record(result[i, j], k, selected[:dimension])

    np.save(out_dir + 'rank_dist', rank_result)
    np.save(out_dir + 'rank_l1_dist', rank_result_l1)
    np.save(out_dir + 'rank_l2_dist', rank_result_l2)
    np.save(out_dir + 'rank_lmax_dist', rank_result_lmax)

    ########  lap_score  ###########
    lap_score_result = np.zeros([7, n_dims])
    lap_score_selected = list(np.argsort(lap_score.lap_score(x))[:num_select])    # find minimum
    for k, dimension in enumerate(dimension_list):
        record(lap_score_result, k, lap_score_selected[:dimension])
    np.save(out_dir + 'lap_score_dist', lap_score_result)

    ########  SPEC  ###########
    SPEC_result = np.zeros([7, n_dims])
    SPEC_selected = list(np.argsort(SPEC.spec(x))[:num_select])    # find minimum
    for k, dimension in enumerate(dimension_list):
        record(SPEC_result, k, SPEC_selected[:dimension])
    np.save(out_dir + 'SPEC_dist', SPEC_result)

    #######  MCFS  parameter: num_clusters  ##############
    num_clusters_list = [5, 10, 20, 30]
    MCFS_result = np.zeros([len(num_clusters_list), 7, n_dims])
    for i, num_clusters in enumerate(num_clusters_list):
        MCFS_W = MCFS.mcfs(x, num_select, n_clusters=num_clusters)
        # score of a feature = largest absolute weight in its row; the loop
        # variable is deliberately not called ``x`` so the data matrix is
        # never shadowed
        MCFS_scores = [np.max(np.abs(row)) for row in MCFS_W]     # find maximum
        MCFS_selected = list(np.argsort(MCFS_scores)[-num_select:])[::-1]
        for k, dimension in enumerate(dimension_list):
            record(MCFS_result[i], k, MCFS_selected[:dimension])
    np.save(out_dir + 'MCFS_dist', MCFS_result)

    return rank_result, rank_result_l1, rank_result_l2,rank_result_lmax,lap_score_result, SPEC_result, MCFS_result
def compare_methods(x,y,num_select,pctg=0.5,sample_pctg=1, num_clusters=5,zero_mean=False,dim=1,t=0.8,thresh=0.1):
    """Select features with several methods and print comparison metrics.

    Runs random selection, ranking selection (plus its l1/l2/lmax variants),
    Laplacian score, SPEC and MCFS on ``x``, printing the running time of
    each, and then prints for the selections: KNN accuracy, purity score,
    and the 'W' and 'B' diagram distances to the diagram of the full data.
    Nothing is returned.

    Parameters
    ----------
    x : 2-D data matrix; columns are the features being selected.
    y : labels passed to the ``ef`` evaluation helpers.
    num_select : number of features each method selects.
    pctg : passed to ``random_selection`` (``pctg``) and
        ``ranking_selection`` (``preserve_pctg``).
    sample_pctg : accepted but not used.
        NOTE(review): ``ranking_selection`` is always called with
        ``sample_pctg=1`` -- confirm whether this parameter was meant to be
        forwarded.
    num_clusters : cluster count for MCFS.
    zero_mean : when truthy ``x`` goes through ``standardize_feature``,
        otherwise through ``normalize(x, axis=0)``.
    dim, t, thresh : passed to ``ef.compute_dgm`` / ``ef.dgm_distance``.
    """
    # time.clock() was removed in Python 3.8; prefer perf_counter and only
    # fall back to clock on interpreters that lack it.
    timer = getattr(time, 'perf_counter', None) or time.clock

    x = standardize_feature(x) if zero_mean else normalize(x, axis=0)
    _, d = x.shape

    ###########  calculate  ######################
    start_time = timer()
    rf_result = random_selection(x, num_select, N=500, num_use=int(0.5*d), pctg=pctg, two_sided=False)
    print('rf running time:', timer() - start_time)

    start_time = timer()
    rank_result, l1, l2, lmax = ranking_selection(x, num_select, N=500, num_use=int(0.5*d), sample_pctg=1, preserve_pctg=pctg)
    print('rank running time:', timer() - start_time)

    start_time = timer()
    lap_score_result = lap_score.lap_score(x)
    lap_score_result = np.argsort(lap_score_result)[:num_select]    # find minimum
    print('lap_score running time:', timer() - start_time)

    start_time = timer()
    SPEC_result = SPEC.spec(x)
    print('SPEC running time:', timer() - start_time)
    SPEC_result = np.argsort(SPEC_result)[:num_select]     # find minimum

    # sparse learning based
    start_time = timer()
    MCFS_W = MCFS.mcfs(x, num_select, n_clusters=num_clusters)
    print('MCFS running time:', timer() - start_time)
    # score of a feature = largest absolute weight in its row; the loop
    # variable is deliberately not called ``x`` so the data matrix is never
    # shadowed
    MCFS_result = [np.max(np.abs(row)) for row in MCFS_W]     # find maximum
    MCFS_result = np.argsort(MCFS_result)[-num_select:]

    # The NDFS / UDFS comparisons and the connectivity report were already
    # disabled and are not run.

    # (label, selected column indices) in reporting order
    selections = [
        ('rf', rf_result),
        ('rank', rank_result),
        ('l1', l1),
        ('l2', l2),
        ('lmax', lmax),
        ('lap_score', lap_score_result),
        ('SPEC', SPEC_result),
        ('MCFS', MCFS_result),
    ]
    # data restricted to each selection
    subsets = {label: x[:, list(columns)] for label, columns in selections}

    print('KNN accuracy')
    for label, columns in selections:
        print(label, ef.knn_accuracy(x, y, columns))

    print('purity score | NMI')
    print('origin', ef.purity_score(x, y))
    for label in ('rf', 'rank', 'lap_score', 'SPEC', 'MCFS'):
        print(label, ef.purity_score(subsets[label], y))

    # all diagrams are computed before any distance is printed
    dgm = ef.compute_dgm(x, t, dim, thresh)
    diagrams = [(label, ef.compute_dgm(subsets[label], t, dim, thresh))
                for label, _ in selections]

    print('dgm distance')
    for label, dgm_selected in diagrams:
        print(label, ef.dgm_distance(dgm, dgm_selected, 'W', dim), '  ', ef.dgm_distance(dgm, dgm_selected, 'B', dim))
Beispiel #13
0
def compare_methods(x,
                    y,
                    num_select,
                    pctg=0.1,
                    pack_size=1,
                    num_clusters=5,
                    two_sided=False):
    """Select features on a random train split and print comparison metrics.

    The rows of ``x`` / ``y`` are shuffled and split 70 % / 30 % into train
    and test parts. Laplacian score, random selection, SPEC, cut-SPEC and
    MCFS are run on the train part (the running time of each is printed).
    Then class separability and connectivity are printed for the train part
    and KNN accuracy for the test part. Nothing is returned.

    Parameters
    ----------
    x : 2-D data matrix; columns are the features being selected.
    y : labels, indexable by the same row permutation as ``x``.
    num_select : number of features each method selects.
    pctg : passed to ``random_selection`` and ``ef.connectivity``.
    pack_size, num_clusters : accepted but not used by this function.
    two_sided : passed to ``random_selection`` and ``ef.connectivity``.
    """
    # time.clock() was removed in Python 3.8; prefer perf_counter and only
    # fall back to clock on interpreters that lack it.
    timer = getattr(time, 'perf_counter', None) or time.clock

    n, d = x.shape
    idx = np.random.permutation(n)
    x, y = x[idx], y[idx]

    #########  split train and test  #########
    train_num = int(n * 0.7)
    # the rows after the training part form the test part
    x_test, y_test = x[train_num:, :], y[train_num:]
    x, y = x[:train_num, :], y[:train_num]

    ###########  other methods  ######################
    # Similarity based: lap_score  SPEC
    start_time = timer()
    lap_score_result = lap_score.lap_score(x)
    lap_score_result = np.argsort(lap_score_result)[:num_select]
    print('lap_score running time:', timer() - start_time)

    start_time = timer()
    rf_result = random_selection(x,
                                 num_select,
                                 N=300,
                                 num_use=int(d / 2),
                                 pctg=pctg,
                                 two_sided=two_sided)
    print('rf running time:', timer() - start_time)

    start_time = timer()
    SPEC_result = SPEC.spec(x)
    print('SPEC running time:', timer() - start_time)
    SPEC_result = np.argsort(SPEC_result)[:num_select]  # find minimum

    start_time = timer()
    CSPEC_result = cut_spec(x, pctg=0.15)
    print('cut-SPEC running time:', timer() - start_time)
    CSPEC_result = np.argsort(CSPEC_result)[:num_select]  # find minimum

    # sparse learning based
    start_time = timer()
    MCFS_W = MCFS.mcfs(x, num_select)
    print('MCFS running time:', timer() - start_time)
    # score of a feature = largest absolute weight in its row; the loop
    # variable is deliberately not called ``x`` so the data matrix is never
    # shadowed
    MCFS_result = [np.max(np.abs(row)) for row in MCFS_W]  # find maximum
    MCFS_result = np.argsort(MCFS_result)[-num_select:]

    # The NDFS / UDFS comparisons were already disabled and are not run.

    # (label, selected column indices) in reporting order
    selections = [
        ('rf', rf_result),
        ('lap_score', lap_score_result),
        ('SPEC', SPEC_result),
        ('cut-SPEC', CSPEC_result),
        ('MCFS', MCFS_result),
    ]
    # training data restricted to each selection
    subsets = {label: x[:, list(columns)] for label, columns in selections}

    print('\n')
    print('Class Seperability')
    for label, _ in selections:
        print(label, ef.class_seperability(subsets[label], y))

    print('\n')
    print('KNN accuracy')
    for label, columns in selections:
        print(label, ef.knn_accuracy(x_test, y_test, columns))

    print('\n')
    print('connectivity')
    for label, _ in selections:
        print(label, ef.connectivity(x, subsets[label], pctg, two_sided))
Beispiel #14
0
    data = np.loadtxt("./data/GaussianTopologyNode.txt")
    edge = np.loadtxt("./data/GaussianTopologyEdge.txt")
    timeStart = datetime.datetime.now()

    if useEdge:
        W = ConstructWbyEdge.ConstructWbyEdge(data, edge, t=1)
    else:
        kwrags_W = {
            "metric": "euclidean",
            "neighbor_mode": "knn",
            "weight_mode": "heat_kernel",
            "k": 3,
            "t": 1
        }
        W = construct_W(data, **kwrags_W)
    result = MCFS.mcfs(data, n_selected_features=2, W=W, n_clusters=2)
    print result

    timeEnd = datetime.datetime.now()
    print "Run Time: ", timeEnd - timeStart

elif dataSet == 1:
    data = np.loadtxt("./data/SwissRollTopologyNode.txt")
    edge = np.loadtxt("./data/SwissRollTopologyEdge.txt")
    timeStart = datetime.datetime.now()

    if useEdge:
        W = ConstructWbyEdge.ConstructWbyEdge(data, edge, t=1)
    else:
        kwrags_W = {
            "metric": "euclidean",
def mcfs(train_set, test_set, features_number, clusters_number):
    """Select features with MCFS, drop the excluded ones and run k-means.

    Column 0 of ``train_set`` / ``test_set`` is treated as the label column
    and the remaining columns as features. The train features are ranked
    with MCFS and the first ``features_number`` are kept, minus every
    feature named on the command line from ``sys.argv[3]`` onwards. If the
    number of remaining features is not exactly the number selected minus
    the number of names to delete, an error is logged and nothing else
    happens; otherwise ``run_kmeans`` is called on the reduced train and
    test matrices.
    """
    # Features to delete: every command-line argument from index 3 on
    features_to_delete = sys.argv[3:]

    # Feature columns of both sets and the known labels of the test set
    train_features = train_set.iloc[:, 1:]
    test_features = test_set.iloc[:, 1:]
    known_labels_test = test_set.iloc[:, 0]

    # Matrix W for the MCFS algorithm: binary-weighted kNN graph
    affinity = construct_W.construct_W(train_features.values,
                                       metric='euclidean',
                                       neighbor_mode='knn',
                                       weight_mode='binary',
                                       k=3)

    # MCFS gives a weight to each feature
    weighted_features = MCFS.mcfs(train_features.values,
                                  features_number,
                                  W=affinity,
                                  n_clusters=clusters_number)

    # Order the features by weight and keep the first 'features_number'
    selected = MCFS.feature_ranking(weighted_features)[0:features_number]

    # Names of the selected features, without the ones to delete
    kept_names = [
        name
        for name in (train_features.columns[index] for index in selected)
        if name not in features_to_delete
    ]

    # Every name to delete must have removed exactly one selected feature
    if len(kept_names) != len(selected) - len(features_to_delete):
        kmeans_rfd_logger.error(
            'One or more feature "to delete" is/are not correct.')
        return

    # Restrict both sets to the kept features
    reduced_train = train_features.loc[:, kept_names]
    reduced_test = test_features.loc[:, kept_names]

    kmeans_rfd_logger.info(
        '(Deleted features: {0})'.format(features_to_delete))

    # Running k-means according to selected features
    run_kmeans(len(kept_names), reduced_train.values, reduced_test.values,
               clusters_number, known_labels_test)