############################# Parameter Setting ##############################
# The number of clusters in KMeans
K = 4
##############################################################################

t0 = time()
# Load training set
file_data_train = open("/home/changyale/dataset/COPDGene/data_"+\
        "train_continuous.pkl","rb")
data_con_use, features_name_use = pickle.load(file_data_train)

# Prepare reference dataset for continuous features
# Random sample with replacement from training set to form a reference dataset
data_con_use_ref = np.zeros((data_con_use.shape[0], data_con_use.shape[1]))
for j in range(data_con_use.shape[1]):
    tp_index = sample_wr(range(data_con_use_ref.shape[0]),\
            data_con_use_ref.shape[0])
    for i in range(len(tp_index)):
        data_con_use_ref[i, j] = data_con_use[tp_index[i], j]
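
###############################################################################
# NOTE: the helper `sample_wr` used above comes from
# python.COPDGene.utils.sample_wr and is not defined in this snippet.
# A minimal stand-in sketch (an assumption about its behaviour): draw `k`
# items from `population` uniformly at random, *with* replacement.
import random

def _sample_wr_sketch(population, k):
    """Return a list of k items sampled from population with replacement."""
    population = list(population)
    return [random.choice(population) for _ in range(k)]
###############################################################################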

t1 = time()
print(["Preparing data takes " + str(t1 - t0) + " seconds"])

# Forward search for continuous features
# Normalization of the continuous dataset
data = scale(data_con_use)
n_instances, n_features = data.shape

data_ref = scale(data_con_use_ref)

# Start with the empty feature set
bfs = []
data_con, features_name_con, features_type_con = info_con

# Choose only 'continuous' features for backward search
data_con_use = []
features_name_use = []
for j in range(len(features_type_con)):
    if features_type_con[j] == 'continuous':
        data_con_use.append(data_con[:,j])
        features_name_use.append(features_name_con[j])
data_con_use = np.array(data_con_use).T

# Prepare reference dataset for continuous features
# Random sample with replacement from training set to form a reference dataset
data_con_use_ref = np.zeros((data_con_use.shape[0],data_con_use.shape[1]))
for j in range(data_con_use.shape[1]):
    tp_index = sample_wr(range(data_con_use_ref.shape[0]),\
            data_con_use_ref.shape[0])
    for i in range(len(tp_index)):
        data_con_use_ref[i,j] = data_con_use[tp_index[i],j]
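
# A vectorized equivalent of the two loops above (an illustrative sketch, not
# the original code): draw row indices with replacement for every column at
# once via np.random.randint, then pick them out with fancy indexing.
idx_ref = np.random.randint(0, data_con_use.shape[0],
        size=(data_con_use.shape[0], data_con_use.shape[1]))
data_con_use_ref_alt = data_con_use[idx_ref, np.arange(data_con_use.shape[1])]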

t1 = time()
print(["Preparing data takes "+str(t1-t0)+" seconds"])

# Forward search for continuous features
# Normalization of the continuous dataset
data = scale(data_con_use)
#data = data_con_use
n_instances, n_features = data.shape

data_ref = scale(data_con_use_ref)

# Obtain gold value
Example #3
file_hsic.close()
file_hsic = open("data/mtr_hsic_nhsic_dis.pkl", "rb")
mtr_hsic_dis, mtr_nhsic_dis = pickle.load(file_hsic)
file_hsic.close()

# Load information about continuous and discrete features
file_data_train = open("/home/changyale/dataset/COPDGene/data_train.pkl", "rb")
info_con, info_dis, gold = pickle.load(file_data_train)
file_data_train.close()
data_con, features_name_con, features_type_con = info_con
data_dis, features_name_dis, features_type_dis = info_dis

# Random sample with replacement from data_train to form a reference dataset
data_train_ref = np.zeros((data_train.shape[0], data_train.shape[1]))
for j in range(data_train_ref.shape[1]):
    tp_index = sample_wr(range(data_train_ref.shape[0]),
                         data_train_ref.shape[0])
    for i in range(len(tp_index)):
        data_train_ref[i, j] = data_train[tp_index[i], j]

# Label data_train as class 0 and data_train_ref as class 1, resulting in a
# combined dataset "data_use" and its label vector "labels"
labels = []
data_use = np.zeros((data_train.shape[0]+data_train_ref.shape[0],\
        data_train.shape[1]))
for i in range(data_train.shape[0]):
    data_use[i, :] = data_train[i, :]
    labels.append(0)
for i in range(data_train_ref.shape[0]):
    data_use[data_train.shape[0] + i, :] = data_train_ref[i, :]
    labels.append(1)
labels = np.array(labels)
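
# Equivalent construction using NumPy stacking (an illustrative sketch, not
# part of the original script):
data_use_alt = np.vstack((data_train, data_train_ref))
labels_alt = np.concatenate((np.zeros(data_train.shape[0], dtype=int),
        np.ones(data_train_ref.shape[0], dtype=int)))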
Example #4
file_hsic.close()
file_hsic = open("data/mtr_hsic_nhsic_dis.pkl","rb")
mtr_hsic_dis,mtr_nhsic_dis = pickle.load(file_hsic)
file_hsic.close()

# Load information about continuous and discrete features
file_data_train = open("/home/changyale/dataset/COPDGene/data_train.pkl","rb")
info_con,info_dis,gold = pickle.load(file_data_train)
file_data_train.close()
data_con, features_name_con, features_type_con = info_con
data_dis, features_name_dis, features_type_dis = info_dis

# Random sample with replacement from data_train to form a reference dataset
data_train_ref = np.zeros((data_train.shape[0],data_train.shape[1]))
for j in range(data_train_ref.shape[1]):
    tp_index = sample_wr(range(data_train_ref.shape[0]),data_train_ref.shape[0])
    for i in range(len(tp_index)):
        data_train_ref[i,j] = data_train[tp_index[i],j]

# Label data_train as class 0 and data_train_ref as class 1, resulting in a
# combined dataset "data_use" and its label vector "labels"
labels = []
data_use = np.zeros((data_train.shape[0]+data_train_ref.shape[0],\
        data_train.shape[1]))
for i in range(data_train.shape[0]):
    data_use[i,:] = data_train[i,:]
    labels.append(0)
for i in range(data_train_ref.shape[0]):
    data_use[data_train.shape[0]+i,:] = data_train_ref[i,:]
    labels.append(1)
labels = np.array(labels)
list_m_N = []
list_n_iter = []
list_time = []

for experiment in range(n_experiment):
    print(experiment, "Iteration")
    n_iter = 0
    max_iter = n_instances
    tol = 0.0001
    flag_ratio = np.infty
    t1 = time()
    while n_iter<max_iter and flag_ratio>tol:
        n_iter += 1
        
        # sample a data point x_n uniformly from the dataset
        index_row = sample_wr(range(n_instances),1)
        x_n = data[index_row,:].T

        # Derive S_N and m_N from the global parameters (just a different
        # representation)
        S_N = -0.5*np.linalg.inv(lambda_1)
        m_N = np.dot(S_N,lambda_0)

        # Update local variables
        tp = S_N+m_N.reshape(n_features,1)*m_N.reshape(1,n_features)
        ks = np.dot(np.dot(x_n.T,tp),x_n)
        var_local_n = np.sqrt(ks)

        # Compute the intermediate global parameters as though x_n is replicated N
        # times
        tp_lambda_0 = np.dot(S_0_inv,m_0)+n_instances*(labels[index_row[0]]-0.5)*x_n
        tp = 0.5/var_local_n*(1./(1+math.exp(-var_local_n))-0.5)
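        # The factor above is the Jaakkola-Jordan variational coefficient
        # lambda(xi) = (1 / (2 * xi)) * (sigmoid(xi) - 1/2), evaluated at
        # xi = var_local_n; written with NumPy instead of math.exp it reads
        #     tp = 0.5 / var_local_n * (1.0 / (1.0 + np.exp(-var_local_n)) - 0.5)
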
Example #6
import numpy as np
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.preprocessing import scale
from sklearn.metrics import normalized_mutual_info_score
import matplotlib.pyplot as plt
from python.COPDGene.utils.sample_wr import sample_wr

# import iris dataset
iris = datasets.load_iris()
data_raw = iris.data
labels_true = iris.target

# Normalization of the original dataset
data = scale(data_raw)

# extract reference distribution
data_ref = []
tp_row_id = sample_wr(range(data.shape[0]),data.shape[0])
for i in range(len(tp_row_id)):
    data_ref.append(list(data[tp_row_id[i],:]))
data_ref = np.array(data_ref)

n_clusters_range = range(2,11)
inertia = [0]*len(n_clusters_range)
inertia_ref = [0]*len(n_clusters_range)
score = [0]*len(n_clusters_range)

for i in range(len(n_clusters_range)):
    # Apply Kmeans on the original dataset
    estimator = KMeans(n_clusters=n_clusters_range[i],init='random',\
            n_init=10,n_jobs=-1)
    estimator.fit(data)
    inertia[i] = estimator.inertia_
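    # A plausible continuation (an assumption -- the original example is
    # truncated here): fit KMeans on the reference data as well and compare
    # the two inertias with a gap-statistic style score.
    estimator_ref = KMeans(n_clusters=n_clusters_range[i], init='random',
            n_init=10, n_jobs=-1)
    estimator_ref.fit(data_ref)
    inertia_ref[i] = estimator_ref.inertia_
    score[i] = np.log(inertia_ref[i]) - np.log(inertia[i])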