Example #1
def prepare_T(y,x,P,params):
    delta = params['delta']
    D,N = y.shape
    IN = np.ones((N,1))
    K,M = delta.shape
    C = np.zeros((D*K,D*K))
    for i in range(K):
        for j in range(i,K):
            C[i*D:(i+1)*D,j*D:(j+1)*D] = \
            x.dot(np.diag((Tr(IN).dot(P).flatten())*delta[i,:]*delta[j,:])).dot(Tr(x))

    for i in range(1,K):
        for j in range(i):
            C[i*D:(i+1)*D,j*D:(j+1)*D] = C[j*D:(j+1)*D,i*D:(i+1)*D]

    G = np.zeros((D,D*K))
    B = np.zeros((D,M))
    for m in range(M):
        for k in range(K):
            B[:,m] = B[:,m] + delta[k,m]*params['t'][:,k]
        
    for i in range(K):
        G[:,i*D:(i+1)*D] = y.dot(P).dot(Tr(x*delta[i,:]))\
            - B.dot(np.diag((Tr(IN).dot(P).flatten())*delta[i,:])).dot(Tr(x))
    
    return C,G
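These examples are excerpts from the CHIMERA clustering package and assume module-level helpers the listing does not show (numpy imported as np, Tr as an alias for np.transpose, and the package's opu utility module). Under those assumptions, a minimal shape sketch of what prepare_T returns, with illustrative sizes:

import numpy as np
Tr = np.transpose

D, N, M, K = 3, 10, 8, 2          # feature dim, patients, controls, clusters
rng = np.random.default_rng(0)
y = rng.standard_normal((D, N))   # patients
x = rng.standard_normal((D, M))   # normal controls
P = rng.random((N, M))            # responsibility matrix, as produced by Estep
params = {'delta': rng.random((K, M)), 't': rng.standard_normal((D, K))}

C, G = prepare_T(y, x, P, params)
assert C.shape == (D * K, D * K)  # block Gram matrix of the K transformations
assert G.shape == (D, D * K)      # matching right-hand side used by solve_T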
Example #2
def prepare_t(y, x, P, params):
    D, N = y.shape
    delta = params['delta']
    K, M = delta.shape

    IM = np.ones((M, 1))
    IN = np.ones((N, 1))

    P2 = np.sum(P, axis=0)
    W = np.zeros((K, K))
    Z = np.zeros((K, D))
    for m in range(M):
        for i in range(K):
            for j in range(i, K):
                W[i, j] = W[i, j] + P2[m] * delta[i, m] * delta[j, m]
    for i in range(1, K):
        for j in range(i):
            W[i, j] = W[j, i]

    x2 = opu.transform2(x, params)

    for k in range(K):
        Z[k,:] = (y.dot(np.diag((delta[k,:].dot(Tr(P))).flatten())).dot(IN)\
                - x2.dot(np.diag(delta[k,:]*(Tr(P).dot(IN).flatten()))).dot(IM)).flatten()
    return W, Z
Example #3
def solve_sigsq(y, yd, ys, tx, xd, xs, P, params, config):
    D, N = y.shape
    d = 0
    K, M = params['delta'].shape
    IM = np.ones((M, 1))
    IN = np.ones((N, 1))
    tmp = 0
    for i in range(K):
        tmp = tmp + np.power(
            np.linalg.norm(params['T'][:, :, i] - np.eye(D), 'fro'), 2)

    P1 = np.diag(np.dot(P, IM).flatten())
    P2 = np.diag(np.dot(Tr(P), IN).flatten())

    term1 = np.trace(
        y.dot(P1).dot(Tr(y)) - 2 * y.dot(P).dot(Tr(tx)) +
        tx.dot(P2).dot(Tr(tx)))
    term2 = 0
    if config['r'] != 0:
        d = yd.shape[0]
        term2 = config['r'] * np.trace(
            yd.dot(P1).dot(Tr(yd)) - 2 * yd.dot(P).dot(Tr(xd)) +
            xd.dot(P2).dot(Tr(xd)))
    term3 = 0
    if config['rs'] != 0:
        term3 = config['rs'] * np.trace(  # np.trace added for consistency with term1/term2
            ys.dot(P1).dot(Tr(ys)) - 2 * ys.dot(P).dot(Tr(xs)) +
            xs.dot(P2).dot(Tr(xs)))
    sigsq = 1.0 / N / (D + d) * (
        term1 + term2 + term3 +
        config['lambda1'] * np.power(np.linalg.norm(params['t'], 'fro'), 2) +
        config['lambda2'] * tmp)
    return sigsq
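The three trace terms are P-weighted squared distances written in matrix form: with P1 and P2 the diagonal row- and column-sum matrices of P, trace(y P1 Tr(y) - 2 y P Tr(tx) + tx P2 Tr(tx)) equals the sum over n, m of P[n, m] * ||y[:, n] - tx[:, m]||^2. A quick numerical check of that identity (assuming Tr = np.transpose as above):

import numpy as np
Tr = np.transpose

rng = np.random.default_rng(1)
D, N, M = 3, 5, 4
y = rng.standard_normal((D, N))
tx = rng.standard_normal((D, M))
P = rng.random((N, M))
P1 = np.diag(P.sum(axis=1))  # same as np.diag(np.dot(P, IM).flatten())
P2 = np.diag(P.sum(axis=0))  # same as np.diag(np.dot(Tr(P), IN).flatten())

matrix_form = np.trace(y.dot(P1).dot(Tr(y)) - 2 * y.dot(P).dot(Tr(tx))
                       + tx.dot(P2).dot(Tr(tx)))
direct = sum(P[n, m] * np.sum((y[:, n] - tx[:, m]) ** 2)
             for n in range(N) for m in range(M))
assert np.isclose(matrix_form, direct)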
Example #4
def solve_T(y,x,P,params,config):
    K = params['delta'].shape[0]
    D = x.shape[0]
    I = np.eye(K*D)
    I2 = np.tile(np.eye(D),K)
        
    C,G = prepare_T(y,x,P,params)
    # solve A*(lambda2*I + C) = lambda2*I2 + G via the transposed system,
    # avoiding an explicit matrix inverse
    A = Tr(np.linalg.solve(Tr(config['lambda2']*I + C), Tr(config['lambda2']*I2 + G)))
    
    T = np.zeros((D,D,K))
    for i in range(K):
        T[:,:,i] = A[:,i*D:(i+1)*D]
    
    return T
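The transposed solve is a standard trick for computing A = (lambda2*I2 + G)(lambda2*I + C)^{-1} without forming an inverse: np.linalg.solve handles systems of the form M X = B, so solving Tr(lambda2*I + C) Tr(A) = Tr(lambda2*I2 + G) and transposing gives the right-multiplied solution. A self-contained check of the trick with random data:

import numpy as np
Tr = np.transpose

rng = np.random.default_rng(2)
D, K = 2, 3
C = rng.random((D * K, D * K))
C = C + Tr(C)                  # symmetric, like the C built by prepare_T
I = np.eye(D * K)
I2 = np.tile(np.eye(D), K)
G = rng.standard_normal((D, D * K))
lam = 1.0

A = Tr(np.linalg.solve(Tr(lam * I + C), Tr(lam * I2 + G)))
assert np.allclose(A.dot(lam * I + C), lam * I2 + G)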
Example #5
def prepare_T(y, x, P, params):
    D, N = y.shape
    delta = params['delta']
    K, M = delta.shape

    P2 = np.sum(P, axis=0)
    x2 = x * x
    W = np.zeros((D, K, K))

    for m in range(M):
        core_w = np.zeros((K, K))
        for i in range(K):
            for j in range(i, K):
                core_w[i, j] = P2[m] * delta[i, m] * delta[j, m]
                core_w[j, i] = core_w[i, j]
        for J in range(D):
            W[J] += core_w * x2[J, m]

    U1 = np.zeros((K, D))
    ZX = params['t'].dot(delta) * x
    for k in range(K):
        # keep the weight vector 1-D so the row assignment is shape-correct
        U1[k, :] = (P2 * delta[k, :]).dot(Tr(ZX))
    U2 = np.zeros((K, D))
    for k in range(K):
        U2[k, :] = np.sum((y.dot(P) * delta[k, :]) * x, axis=1)
    U = U1 - U2
    return W, U
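This variant of prepare_T appears to assume per-dimension (diagonal) transformations: instead of one (D*K)-by-(D*K) system it stacks D independent K-by-K systems in W, with the per-cluster right-hand sides in U. A shape sketch under the same assumptions as for Example #1:

import numpy as np
Tr = np.transpose

rng = np.random.default_rng(3)
D, N, M, K = 3, 10, 8, 2
y = rng.standard_normal((D, N))
x = rng.standard_normal((D, M))
P = rng.random((N, M))
params = {'delta': rng.random((K, M)), 't': rng.standard_normal((D, K))}

W, U = prepare_T(y, x, P, params)  # the variant defined just above
assert W.shape == (D, K, K) and U.shape == (K, D)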
Example #6
def solve_t(y, x, P, params, config):
    K = params['delta'].shape[0]
    I = np.eye(K)

    W, Z = prepare_t(y, x, P, params)

    t = Tr(np.linalg.solve(config['lambda1'] * I + W, Z))
    return t
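solve_t is the closed-form update of the cluster offsets t: prepare_t (Example #2) supplies the symmetric K-by-K matrix W and the K-by-D right-hand side Z, and t is chosen so that (lambda1*I + W) Tr(t) = Z. A standalone check of that normal-equation form with random, suitably shaped inputs:

import numpy as np
Tr = np.transpose

rng = np.random.default_rng(4)
K, D = 3, 4
W = rng.random((K, K))
W = W + Tr(W)                 # symmetric, like the W built by prepare_t
Z = rng.standard_normal((K, D))
lambda1 = 0.5

t = Tr(np.linalg.solve(lambda1 * np.eye(K) + W, Z))  # same update as solve_t
assert np.allclose((lambda1 * np.eye(K) + W).dot(Tr(t)), Z)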
Example #7
def solve_delta(y,x,P,params):
    K,M = params['delta'].shape
    tx = opu.transform(x,params)
    delta = np.copy(params['delta'])
    
    P2 = np.sum(P,axis=0)
    for m in range(M):
        tx2 = opu.transform3(x[:,m],params)
        tmp = y - tx[:,m].reshape(-1,1)
        d_delta = Tr(P[:,m]).dot(Tr(tmp)).dot(-tx2) / params['sigsq']  # gradient w.r.t. delta[:,m]

        Hm = 1.0/params['sigsq'] * P2[m] * (Tr(tx2)).dot(tx2)  # approximate (Gauss-Newton) Hessian

        # damped Newton step, then projection back onto the probability simplex
        v = params['delta'][:,m] - np.linalg.solve(Hm + 0.001*np.eye(K), Tr(d_delta))
        delta[:,m] = project_simplex(v)

    return delta
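project_simplex is not shown in this listing; solve_delta relies on it to map each updated column v back onto the probability simplex (non-negative entries that sum to one). A minimal sketch of the standard sort-and-threshold Euclidean projection, offered as an assumption about what the helper does rather than as the package's actual implementation:

import numpy as np

def project_simplex(v):
    # Euclidean projection of v onto {w : w >= 0, sum(w) == 1}
    v = np.asarray(v, dtype=float).flatten()
    u = np.sort(v)[::-1]                        # sort descending
    css = np.cumsum(u)
    j = np.arange(1, len(v) + 1)
    rho = np.nonzero(u + (1.0 - css) / j > 0)[0][-1]
    theta = (1.0 - css[rho]) / (rho + 1.0)
    return np.maximum(v + theta, 0.0)

w = project_simplex(np.array([0.3, 1.4, -0.2]))
assert np.all(w >= 0) and np.isclose(w.sum(), 1.0)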
Example #8
def calc_obj(x, y, xd, yd, xs, ys, P, params, config):
    """Objective function calculation
    """
    lambda1 = config['lambda1']
    lambda2 = config['lambda2']
    r = config['r']
    rs = config['rs']
    K = config['K']

    D, N = y.shape
    M = x.shape[1]
    d = 0
    ds = 0

    IM = np.ones((M, 1))
    IN = np.ones((N, 1))

    tx = transform(x, params)
    tmp = 0
    for i in range(K):
        tmp = tmp + np.power(
            np.linalg.norm(params['T'][:, :, i] - np.eye(D), 'fro'), 2)

    P1 = np.diag(np.dot(P, IM).flatten())
    P2 = np.diag(np.dot(Tr(P), IN).flatten())

    term1 = np.trace(
        y.dot(P1).dot(Tr(y)) - 2 * y.dot(P).dot(Tr(tx)) +
        tx.dot(P2).dot(Tr(tx)))
    term2 = 0
    if r != 0:
        d = xd.shape[0]
        term2 = r * np.trace(
            yd.dot(P1).dot(Tr(yd)) - 2 * yd.dot(P).dot(Tr(xd)) +
            xd.dot(P2).dot(Tr(xd)))
    term3 = 0
    if rs != 0:
        ds = 1
        term3 = rs * np.trace(
            ys.dot(P1).dot(Tr(ys)) - 2 * ys.dot(P).dot(Tr(xs)) +
            xs.dot(P2).dot(Tr(xs)))
    obj = 0.5/params['sigsq'] * ( term1 + term2 + term3 \
           + lambda1*np.power(np.linalg.norm(params['t'],'fro'),2) +lambda2*tmp) \
           + N*(D+d+ds)/2.0*np.log(params['sigsq'])

    return obj
Example #9
def Estep(y, yd, ys, tx, xd, xs, sigsq, r, rs):
    """Expectation calculation.
    """
    M = tx.shape[1]
    N = y.shape[1]

    #> calculate RBF kernel distance based on imaging features:
    #  tmp1[n, m] = ||y[:, n] - tx[:, m]||^2, expanded as ||a||^2 - 2*a.b + ||b||^2
    D1 = np.diag(np.dot(Tr(y), y))
    D2 = np.diag(np.dot(Tr(tx), tx))
    Mid = 2 * np.dot(Tr(y), tx)
    tmp1 = D1.reshape(-1, 1).repeat(M, axis=1) - Mid + D2.reshape(
        1, -1).repeat(N, axis=0)

    #> calculate RBF kernel distance based on covariate features
    tmp2 = np.zeros(tmp1.shape)
    if r != 0:
        D1 = np.diag(np.dot(Tr(yd), yd))
        D2 = np.diag(np.dot(Tr(xd), xd))
        Mid = 2 * np.dot(Tr(yd), xd)
        tmp2 = D1.reshape(-1, 1).repeat(M, axis=1) - Mid + D2.reshape(
            1, -1).repeat(N, axis=0)

    #> calculate RBF kernel distance based on set information
    tmp3 = np.zeros(tmp1.shape)
    if rs != 0:
        D1 = np.diag(np.dot(Tr(ys), ys))
        D2 = np.diag(np.dot(Tr(xs), xs))
        Mid = 2 * np.dot(Tr(ys), xs)
        tmp3 = D1.reshape(-1, 1).repeat(M, axis=1) - Mid + D2.reshape(
            1, -1).repeat(N, axis=0)

    #> combine distances and normalize each row to a probability distribution
    P = np.exp(
        (-tmp1 - r * tmp2 - rs * tmp3) / 2 / sigsq) + np.finfo(float).tiny
    P = P / np.sum(P, axis=1).reshape(-1, 1)

    return P
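Estep returns the soft correspondence matrix P: entry P[n, m] is the probability that patient n matches (transformed) control m, so every row sums to one. A small sanity sketch, assuming Tr = np.transpose as in the other examples and passing r = rs = 0 so the unused covariate and set arguments can be placeholders:

import numpy as np
Tr = np.transpose

rng = np.random.default_rng(5)
D, N, M = 3, 6, 4
y = rng.standard_normal((D, N))
tx = rng.standard_normal((D, M))

P = Estep(y, None, None, tx, None, None, sigsq=1.0, r=0, rs=0)
assert P.shape == (N, M) and np.allclose(P.sum(axis=1), 1.0)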
Example #10
def clustering_test(dataFile, outFile, modelFile):
    """Test function of CHIMERA
       Please be extremely careful when using this function.
       The ordering of normal controls should be exactly the same as training phase
    """
    #================================= Reading Data ======================================================
    sys.stdout.write('\treading model...\n')
    with open(modelFile, 'rb') as f:  # pickle data must be read in binary mode
        model = cPickle.load(f)
    trainData = model['trainData']
    params = model['model'][0]
    config = model['config']

    sys.stdout.write('\treading data...\n')
    feat_cov = None
    feat_set = None
    ID = None
    with open(dataFile) as f:
        data = list(csv.reader(f))
        header = np.asarray(data[0])
        if 'IMG' not in header:
            sys.stdout.write(
                'Error: image features not found. Please check csv header line for field "IMG".\n'
            )
            sys.exit(1)
        data = np.asarray(data[1:])

        feat_img = (data[:, np.nonzero(header == 'IMG')[0]]).astype(float)
        feat_cov = []
        feat_set = []
        if len(trainData['xd']) != 0:
            if 'COVAR' not in header:
                sys.stdout.write(
                    'Error: covariate features not found. Please check csv header line for field "COVAR".\n'
                )
                sys.exit(1)
            feat_cov = (data[:, np.nonzero(header == 'COVAR')[0]]).astype(
                float)
        if len(trainData['xs']) != 0:
            if 'Set' not in header:
                sys.stdout.write(
                    'Error: dataset ID not found. Please check csv header line for field "Set".\n'
                )
                sys.exit(1)
            feat_set = data[:, np.nonzero(header == 'Set')[0]].flatten()
            datasetID = np.copy(feat_set)
            feat_set = np.zeros((len(datasetID), len(trainData['datasetID'])))
            for i in range(len(trainData['datasetID'])):
                feat_set[np.nonzero(datasetID == trainData['datasetID'][i])[0],
                         i] = 1
        if 'ID' in header:
            ID = data[:, np.nonzero(header == 'ID')[0]]

    #================================= Normalizing Data ======================================================
    if config['norm'] != 0:
        feat_img, feat_cov = data_normalization_test(feat_img, feat_cov, model,
                                                     config)

    #============================ Preparing Data ======================================================
    # separate data into patient and normal groups
    x = trainData['x']  # normal controls
    y = np.transpose(feat_img)  # patients
    xd = trainData['xd']
    yd = np.transpose(feat_cov)
    xs = trainData['xs']
    ys = np.transpose(feat_set)

    #================================Perform Clustering ========================================
    sys.stdout.write('\tclustering...\n')
    tx = opu.transform(x, params)
    P = opu.Estep(y, yd, ys, tx, xd, xs, params['sigsq'], config['r'],
                  config['rs'])
    membership = np.dot(P, Tr(params['delta']))
    label = np.argmax(membership, axis=1)

    #================================ Finalizing and Save =====================================
    sys.stdout.write('\tsaving results...\n')
    with open(outFile, 'w') as f:
        if ID is None:
            f.write('Cluster\n')
            for i in range(len(label)):
                f.write('%d\n' % (label[i] + 1))
        else:
            f.write('ID,Cluster\n')
            for i in range(len(label)):
                f.write('%s,%d\n' % (ID[i][0], label[i] + 1))
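A hypothetical invocation (the file names are placeholders; the input CSV must carry the same header fields the function checks for, i.e. 'IMG', plus 'COVAR'/'Set' when the model was trained with them):

clustering_test('new_patients.csv', 'cluster_labels.csv', 'chimera_model.pkl')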
Example #11
def clustering(dataFile, outFile, config):
    """Core function of CHIMERA, performs:
        1) read and preprocess data
        2) clustering
        3) save results
    """
    #================================= Reading Data ======================================================
    sys.stdout.write('\treading data...\n')
    feat_cov = None
    feat_set = None
    ID = None
    with open(dataFile) as f:
        data = list(csv.reader(f))
        header = np.asarray(data[0])
        if 'Group' not in header:
            sys.stdout.write(
                'Error: group information not found. Please check csv header line for field "Group".\n'
            )
            sys.exit(1)
        if 'IMG' not in header:
            sys.stdout.write(
                'Error: image features not found. Please check csv header line for field "IMG".\n'
            )
            sys.exit(1)
        data = np.asarray(data[1:])

        group = (data[:, np.nonzero(header == 'Group')[0]].flatten()).astype(
            np.int8)
        feat_img = (data[:, np.nonzero(header == 'IMG')[0]]).astype(float)
        if 'COVAR' in header:
            feat_cov = (data[:, np.nonzero(header == 'COVAR')[0]]).astype(
                float)
        if 'ID' in header:
            ID = data[:, np.nonzero(header == 'ID')[0]]
            ID = ID[group == 1]
        if 'Set' in header:
            feat_set = data[:, np.nonzero(header == 'Set')[0]].flatten()

    #================================= Normalizing Data ======================================================
    model = {}  # also holds normalization parameters when config['norm'] != 0
    if config['norm'] != 0:
        model, feat_img, feat_cov = data_normalization(feat_img, feat_cov,
                                                       config)

    #================================= Prepare Dataset ID ======================================================
    if feat_set is None:
        config['rs'] = 0
        unique_ID = []  # no dataset IDs; referenced when saving the model below
    else:
        unique_ID = np.unique(feat_set)
        datasetID = np.copy(feat_set)
        feat_set = np.zeros((len(datasetID), len(unique_ID)))
        for i in range(len(unique_ID)):
            feat_set[np.nonzero(datasetID == unique_ID[i])[0], i] = 1

    #================================= Calculate auto weight ==================================================
    if feat_cov is None:
        config['r'] = 0
    else:
        if config['r'] == -1.0:
            config['r'] = np.sum(np.var(feat_cov, axis=0)) / np.sum(
                np.var(feat_img, axis=0))

    #================================= Verbose information ==================================================
    if config['verbose']:
        sys.stdout.write(
            '\t\t================= data summary ==================\n')
        sys.stdout.write('\t\tnumber of patients: %d\n' % sum(group == 1))
        sys.stdout.write('\t\tnumber of normal controls: %d\n' %
                         sum(group == 0))
        sys.stdout.write('\t\timaging feature dimension: %d\n' %
                         feat_img.shape[1])
        if feat_cov is not None:
            sys.stdout.write('\t\tcovariates dimension: %d\n' %
                             feat_cov.shape[1])
        if feat_set is not None:
            sys.stdout.write('\t\tunique data set id: %d\n' % len(unique_ID))
        sys.stdout.write(
            '\t\t================ configurations =================\n')
        sys.stdout.write('\t\tnumber of clusters: %d\n' % config['K'])
        sys.stdout.write('\t\tnumber of runs: %d\n' % config['numRun'])
        sys.stdout.write('\t\tmax number of iterations: %d\n' %
                         config['max_iter'])
        sys.stdout.write('\t\tdistance ratio covar/img = %.4f\n' % config['r'])
        sys.stdout.write('\t\tdistance ratio set/img = %.4f\n' % config['rs'])
        sys.stdout.write('\t\tlambda1 = %.2f\tlambda2 = %.2f\n' %
                         (config['lambda1'], config['lambda2']))
        sys.stdout.write('\t\ttransformation chosen: %s\n' %
                         config['transform'])
        sys.stdout.write(
            '\t\t=================================================\n')

    #============================ Preparing Data ======================================================
    # separate data into patient and normal groups
    feat_img = np.transpose(feat_img)
    x = feat_img[:, group == 0]  # normal controls
    y = feat_img[:, group == 1]  # patients
    xd = []
    yd = []
    xs = []
    ys = []
    if feat_cov is not None:
        feat_cov = np.transpose(feat_cov)
        xd = feat_cov[:, group == 0]
        yd = feat_cov[:, group == 1]
    if feat_set is not None:
        feat_set = np.transpose(feat_set)
        xs = feat_set[:, group == 0]
        ys = feat_set[:, group == 1]

    #================================Perform Clustering (2 modes available)=================================
    sys.stdout.write('\tclustering...\n')
    if config['mode'] == 2:  # keep the run that yields the minimal energy
        obj = float('inf')
        for i in range(config['numRun']):
            cur_result = optimize(x, xd, xs, y, yd, ys, config)
            cur_obj = cur_result[2].min()
            if config['verbose']:
                sys.stdout.write('\t\tRun id %d, obj = %f\n' % (i, cur_obj))
            else:
                time_bar(i, config['numRun'])
            if cur_obj < obj:
                result = cur_result
                obj = cur_obj
        sys.stdout.write('\n')
        membership = np.dot(result[1], Tr(result[0]['delta']))
        label = np.argmax(membership, axis=1)
    else:  # keep the most reproducible run (highest average pairwise ARI)
        label_mat = []
        results = []
        for i in range(config['numRun']):
            cur_result = optimize(x, xd, xs, y, yd, ys, config)
            membership = np.dot(cur_result[1], Tr(cur_result[0]['delta']))
            label = np.argmax(membership, axis=1)
            label_mat.append(label)
            results.append(cur_result)
            time_bar(i, config['numRun'])
        sys.stdout.write('\n')
        label_mat = np.asarray(label_mat)
        ari_mat = np.zeros((config['numRun'], config['numRun']))
        for i in range(config['numRun']):
            for j in range(i + 1, config['numRun']):
                ari_mat[i, j] = ARI(label_mat[i, :], label_mat[j, :])
                ari_mat[j, i] = ari_mat[i, j]
        ave_ari = np.sum(ari_mat, axis=0) / (config['numRun'] - 1)
        idx = np.argmax(ave_ari)
        if config['verbose']:
            sys.stdout.write('\t\tBest average ARI is %f\n' % (max(ave_ari)))
        label = label_mat[idx, :]
        result = results[idx]

    #================================ Finalizing and Save =====================================
    sys.stdout.write('\tsaving results...\n')
    with open(outFile, 'w') as f:
        if ID is None:
            f.write('Cluster\n')
            for i in range(len(label)):
                f.write('%d\n' % (label[i] + 1))
        else:
            f.write('ID,Cluster\n')
            for i in range(len(label)):
                f.write('%s,%d\n' % (ID[i][0], label[i] + 1))
    if config['modelFile'] != "":
        trainData = {'x': x, 'xd': xd, 'xs': xs, 'datasetID': unique_ID}
        model.update({'trainData': trainData})
        model.update({'model': result})
        model.update({'config': config})
        with open(config['modelFile'], 'wb') as f:
            cPickle.dump(model, f, 2)
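A hypothetical end-to-end call, for orientation only. The values are illustrative; the keys are the ones clustering itself reads above (r = -1.0 triggers the automatic covariate/imaging weighting, mode 2 keeps the lowest-energy run instead of the most reproducible one, and the 'transform' value is a placeholder that must match the package's supported options):

config = {
    'K': 2,                 # number of clusters
    'numRun': 10,           # number of random restarts
    'max_iter': 100,        # cap on iterations per run
    'lambda1': 10.0,        # regularization on the offsets t
    'lambda2': 10.0,        # regularization on the transformations T
    'r': -1.0,              # -1 -> automatic covariate/imaging ratio
    'rs': 1.0,              # weight of the dataset-ID term
    'norm': 1,              # normalize features before clustering
    'mode': 1,              # 1: most reproducible run, 2: lowest energy
    'verbose': True,
    'transform': 'affine',  # placeholder value
    'modelFile': 'chimera_model.pkl',
}
clustering('train_subjects.csv', 'cluster_labels.csv', config)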