import csv
import sys

import numpy as np
from sklearn.metrics import adjusted_rand_score as ARI

try:
    import cPickle  # Python 2
except ImportError:
    import pickle as cPickle  # Python 3 fallback

import optimization_utils as opu  # helper module referenced as opu.* below

# Matrix-transpose shorthand used throughout this module. time_bar,
# data_normalization and data_normalization_test are provided elsewhere
# in the package.
Tr = np.transpose


def prepare_T(y, x, P, params):
    """Assemble the quadratic term C and linear term G of the T subproblem."""
    delta = params['delta']
    D, N = y.shape
    IN = np.ones((N, 1))
    K, M = delta.shape
    C = np.zeros((D * K, D * K))
    for i in range(K):
        for j in range(i, K):
            C[i * D:(i + 1) * D, j * D:(j + 1) * D] = \
                x.dot(np.diag((Tr(IN).dot(P).flatten()) * delta[i, :] * delta[j, :])).dot(Tr(x))
    # each block is symmetric, so the lower triangle mirrors the upper one
    for i in range(1, K):
        for j in range(i):
            C[i * D:(i + 1) * D, j * D:(j + 1) * D] = C[j * D:(j + 1) * D, i * D:(i + 1) * D]
    G = np.zeros((D, D * K))
    B = np.zeros((D, M))
    for m in range(M):
        for k in range(K):
            B[:, m] = B[:, m] + delta[k, m] * params['t'][:, k]
    for i in range(K):
        G[:, i * D:(i + 1) * D] = y.dot(P).dot(Tr(x * delta[i, :])) \
            - B.dot(np.diag((Tr(IN).dot(P).flatten()) * delta[i, :])).dot(Tr(x))
    return C, G

def prepare_t(y, x, P, params):
    """Assemble the quadratic term W and linear term Z of the t subproblem."""
    D, N = y.shape
    delta = params['delta']
    K, M = delta.shape
    IM = np.ones((M, 1))
    IN = np.ones((N, 1))
    P2 = np.sum(P, axis=0)
    W = np.zeros((K, K))
    Z = np.zeros((K, D))
    for m in range(M):
        for i in range(K):
            for j in range(i, K):
                W[i, j] = W[i, j] + P2[m] * delta[i, m] * delta[j, m]
    # W is symmetric: copy the upper triangle into the lower one
    for i in range(1, K):
        for j in range(i):
            W[i, j] = W[j, i]
    x2 = opu.transform2(x, params)
    for k in range(K):
        Z[k, :] = (y.dot(np.diag((delta[k, :].dot(Tr(P))).flatten())).dot(IN)
                   - x2.dot(np.diag(delta[k, :] * (Tr(P).dot(IN).flatten()))).dot(IM)).flatten()
    return W, Z

def solve_sigsq(y, yd, ys, tx, xd, xs, P, params, config):
    """M-step update of the noise variance sigma^2."""
    D, N = y.shape
    d = 0
    ds = 0
    K, M = params['delta'].shape
    IM = np.ones((M, 1))
    IN = np.ones((N, 1))
    tmp = 0
    for i in range(K):
        tmp = tmp + np.power(
            np.linalg.norm(params['T'][:, :, i] - np.eye(D), 'fro'), 2)
    P1 = np.diag(np.dot(P, IM).flatten())
    P2 = np.diag(np.dot(Tr(P), IN).flatten())
    term1 = np.trace(
        y.dot(P1).dot(Tr(y)) - 2 * y.dot(P).dot(Tr(tx)) + tx.dot(P2).dot(Tr(tx)))
    term2 = 0
    if config['r'] != 0:
        d = yd.shape[0]
        term2 = config['r'] * np.trace(
            yd.dot(P1).dot(Tr(yd)) - 2 * yd.dot(P).dot(Tr(xd)) + xd.dot(P2).dot(Tr(xd)))
    term3 = 0
    if config['rs'] != 0:
        ds = 1
        # np.trace and the ds term below were added for consistency with
        # calc_obj; without the trace, term3 would be a matrix
        term3 = config['rs'] * np.trace(
            ys.dot(P1).dot(Tr(ys)) - 2 * ys.dot(P).dot(Tr(xs)) + xs.dot(P2).dot(Tr(xs)))
    sigsq = 1.0 / N / (D + d + ds) * (
        term1 + term2 + term3
        + config['lambda1'] * np.power(np.linalg.norm(params['t'], 'fro'), 2)
        + config['lambda2'] * tmp)
    return sigsq

def solve_T(y, x, P, params, config):
    """M-step update of the K linear transformations T_k."""
    K = params['delta'].shape[0]
    D = x.shape[0]
    I = np.eye(K * D)
    I2 = np.tile(np.eye(D), K)
    C, G = prepare_T(y, x, P, params)
    # solve A * (lambda2*I + C) = lambda2*I2 + G via the transposed system
    A = Tr(np.linalg.solve(Tr(config['lambda2'] * I + C),
                           Tr(config['lambda2'] * I2 + G)))
    T = np.zeros((D, D, K))
    for i in range(K):
        T[:, :, i] = A[:, i * D:(i + 1) * D]
    return T

def prepare_T(y, x, P, params):  # noqa: F811
    """Alternative per-dimension formulation of the T subproblem.

    Note: this redefines prepare_T above. If both versions live in the same
    module, this one shadows the block formulation that solve_T expects, so
    they presumably come from different variants of the solver.
    """
    D, N = y.shape
    delta = params['delta']
    K, M = delta.shape
    P2 = np.sum(P, axis=0)
    x2 = x * x
    W = np.zeros((D, K, K))
    for m in range(M):
        core_w = np.zeros((K, K))
        for i in range(K):
            for j in range(i, K):
                core_w[i, j] = P2[m] * delta[i, m] * delta[j, m]
                core_w[j, i] = core_w[i, j]
        for J in range(D):
            W[J] += core_w * x2[J, m]
    U1 = np.zeros((K, D))
    ZX = params['t'].dot(delta) * x
    for k in range(K):
        PD = (P2 * delta[k, :]).reshape(1, -1)
        U1[k, :] = PD.dot(Tr(ZX))
    U2 = np.zeros((K, D))
    for k in range(K):
        U2[k, :] = np.sum((y.dot(P) * delta[k, :]) * x, axis=1)
    U = U1 - U2
    return W, U

def solve_t(y, x, P, params, config):
    """M-step update of the K translation vectors t_k."""
    K = params['delta'].shape[0]
    I = np.eye(K)
    W, Z = prepare_t(y, x, P, params)
    t = Tr(np.linalg.solve(config['lambda1'] * I + W, Z))
    return t

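# project_simplex is called by solve_delta below but is not defined in this
# section. The following is a minimal sketch, assuming it is the standard
# sort-based Euclidean projection onto the probability simplex
# {w : w >= 0, sum(w) = 1}; treat it as a reconstruction, not the original
# implementation.
def project_simplex(v):
    K = v.shape[0]
    u = np.sort(v)[::-1]                    # sort descending
    css = np.cumsum(u)
    rho = np.nonzero(u * np.arange(1, K + 1) > (css - 1))[0][-1]
    theta = (css[rho] - 1.0) / (rho + 1.0)  # shift that enforces sum(w) = 1
    return np.maximum(v - theta, 0)
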
def solve_delta(y, x, P, params):
    """Newton-style update of delta, projected back onto the simplex."""
    K, M = params['delta'].shape
    tx = opu.transform(x, params)
    delta = np.copy(params['delta'])
    P2 = np.sum(P, axis=0)
    for m in range(M):
        tx2 = opu.transform3(x[:, m], params)
        tmp = y - tx[:, m].reshape(-1, 1)
        d_delta = Tr(P[:, m]).dot(Tr(tmp)).dot(-tx2) / params['sigsq']
        # small ridge keeps the Hessian invertible
        Hm = 1.0 / params['sigsq'] * P2[m] * (Tr(tx2)).dot(tx2)
        v = params['delta'][:, m] - np.linalg.inv(Hm + 0.001 * np.eye(K)).dot(Tr(d_delta))
        delta[:, m] = project_simplex(v)
    return delta

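# The transform helpers are referenced via opu.* (and bare transform in
# calc_obj) but are not defined in this section. The sketches below are
# reconstructed from how they are used in prepare_T, prepare_t and
# solve_delta; the originals live in optimization_utils, so treat these
# as assumptions.
def transform(x, params):
    # assumed: tx_m = sum_k delta[k, m] * (T_k x_m + t_k)
    D, M = x.shape
    K = params['delta'].shape[0]
    tx = np.zeros((D, M))
    for k in range(K):
        tx += params['delta'][k, :] * (params['T'][:, :, k].dot(x)
                                       + params['t'][:, k].reshape(-1, 1))
    return tx


def transform2(x, params):
    # assumed: linear part only, x2_m = sum_k delta[k, m] * T_k x_m (cf. prepare_t)
    D, M = x.shape
    K = params['delta'].shape[0]
    x2 = np.zeros((D, M))
    for k in range(K):
        x2 += params['delta'][k, :] * params['T'][:, :, k].dot(x)
    return x2


def transform3(xm, params):
    # assumed: D x K matrix whose k-th column is T_k xm + t_k (cf. solve_delta)
    D, K = params['t'].shape
    out = np.zeros((D, K))
    for k in range(K):
        out[:, k] = params['T'][:, :, k].dot(xm) + params['t'][:, k]
    return out
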
def calc_obj(x, y, xd, yd, xs, ys, P, params, config):
    """Objective function: weighted matching distances plus the lambda1/lambda2
    regularizers on t and (T_k - I), and the log-variance term."""
    lambda1 = config['lambda1']
    lambda2 = config['lambda2']
    r = config['r']
    rs = config['rs']
    K = config['K']
    D, N = y.shape
    M = x.shape[1]
    d = 0
    ds = 0
    IM = np.ones((M, 1))
    IN = np.ones((N, 1))
    tx = transform(x, params)
    tmp = 0
    for i in range(K):
        tmp = tmp + np.power(
            np.linalg.norm(params['T'][:, :, i] - np.eye(D), 'fro'), 2)
    P1 = np.diag(np.dot(P, IM).flatten())
    P2 = np.diag(np.dot(Tr(P), IN).flatten())
    term1 = np.trace(
        y.dot(P1).dot(Tr(y)) - 2 * y.dot(P).dot(Tr(tx)) + tx.dot(P2).dot(Tr(tx)))
    term2 = 0
    if r != 0:
        d = xd.shape[0]
        term2 = r * np.trace(
            yd.dot(P1).dot(Tr(yd)) - 2 * yd.dot(P).dot(Tr(xd)) + xd.dot(P2).dot(Tr(xd)))
    term3 = 0
    if rs != 0:
        ds = 1
        term3 = rs * np.trace(
            ys.dot(P1).dot(Tr(ys)) - 2 * ys.dot(P).dot(Tr(xs)) + xs.dot(P2).dot(Tr(xs)))
    obj = 0.5 / params['sigsq'] * (
        term1 + term2 + term3
        + lambda1 * np.power(np.linalg.norm(params['t'], 'fro'), 2)
        + lambda2 * tmp) \
        + N * (D + d + ds) / 2.0 * np.log(params['sigsq'])
    return obj

def Estep(y, yd, ys, tx, xd, xs, sigsq, r, rs):
    """Expectation step: soft match probabilities P (N patients x M controls)."""
    M = tx.shape[1]
    N = y.shape[1]
    #> squared Euclidean distance (Gaussian kernel exponent) on imaging features
    D1 = np.diag(np.dot(Tr(y), y))
    D2 = np.diag(np.dot(Tr(tx), tx))
    Mid = 2 * np.dot(Tr(y), tx)
    tmp1 = D1.reshape(-1, 1).repeat(M, axis=1) - Mid \
        + D2.reshape(1, -1).repeat(N, axis=0)
    #> squared Euclidean distance on covariate features
    tmp2 = np.zeros(tmp1.shape)
    if r != 0:
        D1 = np.diag(np.dot(Tr(yd), yd))
        D2 = np.diag(np.dot(Tr(xd), xd))
        Mid = 2 * np.dot(Tr(yd), xd)
        tmp2 = D1.reshape(-1, 1).repeat(M, axis=1) - Mid \
            + D2.reshape(1, -1).repeat(N, axis=0)
    #> squared Euclidean distance on set information
    tmp3 = np.zeros(tmp1.shape)
    if rs != 0:
        D1 = np.diag(np.dot(Tr(ys), ys))
        D2 = np.diag(np.dot(Tr(xs), xs))
        Mid = 2 * np.dot(Tr(ys), xs)
        tmp3 = D1.reshape(-1, 1).repeat(M, axis=1) - Mid \
            + D2.reshape(1, -1).repeat(N, axis=0)
    #> combine distances and normalize each row to a probability distribution
    P = np.exp((-tmp1 - r * tmp2 - rs * tmp3) / 2 / sigsq) + np.finfo(float).tiny
    P = P / np.sum(P, axis=1).reshape(-1, 1)
    return P

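# optimize() is called by clustering() below but is not part of this section.
# The sketch below is one plausible EM loop assembled from the solvers above;
# the initialization scheme, update order, and 'tol' key are assumptions. The
# returned triple matches how clustering() indexes the result:
# (params, P, objective history).
def optimize(x, xd, xs, y, yd, ys, config):
    D, M = x.shape
    K = config['K']
    params = {
        'delta': Tr(np.random.dirichlet(np.ones(K), size=M)),  # K x M, columns on the simplex
        'T': np.tile(np.eye(D)[:, :, None], (1, 1, K)),        # start at identity
        't': np.zeros((D, K)),
        'sigsq': 1.0,
    }
    objs = []
    for _ in range(config['max_iter']):
        tx = transform(x, params)
        P = Estep(y, yd, ys, tx, xd, xs, params['sigsq'], config['r'], config['rs'])
        params['T'] = solve_T(y, x, P, params, config)
        params['t'] = solve_t(y, x, P, params, config)
        params['delta'] = solve_delta(y, x, P, params)
        params['sigsq'] = solve_sigsq(y, yd, ys, transform(x, params),
                                      xd, xs, P, params, config)
        objs.append(calc_obj(x, y, xd, yd, xs, ys, P, params, config))
        if len(objs) > 1 and abs(objs[-2] - objs[-1]) < config.get('tol', 1e-6):
            break
    return params, P, np.asarray(objs)
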
def clustering_test(dataFile, outFile, modelFile):
    """Test function of CHIMERA.

    Please be extremely careful when using this function: the ordering of
    normal controls must be exactly the same as in the training phase.
    """
    #================================= Reading Data ======================================================
    sys.stdout.write('\treading model...\n')
    with open(modelFile, 'rb') as f:  # pickled with protocol 2, so open in binary mode
        model = cPickle.load(f)
    trainData = model['trainData']
    params = model['model'][0]
    config = model['config']

    sys.stdout.write('\treading data...\n')
    feat_cov = None
    feat_set = None
    ID = None
    with open(dataFile) as f:
        data = list(csv.reader(f))
        header = np.asarray(data[0])
        if 'IMG' not in header:
            sys.stdout.write(
                'Error: image features not found. Please check csv header line for field "IMG".\n')
            sys.exit(1)
        data = np.asarray(data[1:])
        feat_img = (data[:, np.nonzero(header == 'IMG')[0]]).astype(float)
        feat_cov = []
        feat_set = []
        if len(trainData['xd']) != 0:
            if 'COVAR' not in header:
                sys.stdout.write(
                    'Error: covariate features not found. Please check csv header line for field "COVAR".\n')
                sys.exit(1)
            feat_cov = (data[:, np.nonzero(header == 'COVAR')[0]]).astype(float)
        if len(trainData['xs']) != 0:
            if 'Set' not in header:
                sys.stdout.write(
                    'Error: dataset ID not found. Please check csv header line for field "Set".\n')
                sys.exit(1)
            feat_set = data[:, np.nonzero(header == 'Set')[0]].flatten()
            datasetID = np.copy(feat_set)
            # one-hot encode the dataset ID using the training-phase ordering
            feat_set = np.zeros((len(datasetID), len(trainData['datasetID'])))
            for i in range(len(trainData['datasetID'])):
                feat_set[np.nonzero(datasetID == trainData['datasetID'][i])[0], i] = 1
        if 'ID' in header:
            ID = data[:, np.nonzero(header == 'ID')[0]]

    #================================= Normalizing Data ======================================================
    if config['norm'] != 0:
        feat_img, feat_cov = data_normalization_test(feat_img, feat_cov, model, config)

    #============================ Preparing Data ======================================================
    # separate data into patient and normal groups
    x = trainData['x']           # normal controls
    y = np.transpose(feat_img)   # patients
    xd = trainData['xd']
    yd = np.transpose(feat_cov)
    xs = trainData['xs']
    ys = np.transpose(feat_set)

    #================================ Perform Clustering ========================================
    sys.stdout.write('\tclustering...\n')
    tx = opu.transform(x, params)
    P = opu.Estep(y, yd, ys, tx, xd, xs, params['sigsq'], config['r'], config['rs'])
    membership = np.dot(P, Tr(params['delta']))
    label = np.argmax(membership, axis=1)

    #================================ Finalizing and Save =====================================
    sys.stdout.write('\tsaving results...\n')
    with open(outFile, 'w') as f:
        if ID is None:
            f.write('Cluster\n')
            for i in range(len(label)):
                f.write('%d\n' % (label[i] + 1))
        else:
            f.write('ID,Cluster\n')
            for i in range(len(label)):
                f.write('%s,%d\n' % (ID[i][0], label[i] + 1))

def clustering(dataFile, outFile, config):
    """Core function of CHIMERA, performs:
    1) read and preprocess data
    2) clustering
    3) save results
    """
    #================================= Reading Data ======================================================
    sys.stdout.write('\treading data...\n')
    feat_cov = None
    feat_set = None
    ID = None
    unique_ID = []  # stays empty when no "Set" column is present
    model = {}      # stays empty when normalization is disabled
    with open(dataFile) as f:
        data = list(csv.reader(f))
        header = np.asarray(data[0])
        if 'Group' not in header:
            sys.stdout.write(
                'Error: group information not found. Please check csv header line for field "Group".\n')
            sys.exit(1)
        if 'IMG' not in header:
            sys.stdout.write(
                'Error: image features not found. Please check csv header line for field "IMG".\n')
            sys.exit(1)
        data = np.asarray(data[1:])
        group = (data[:, np.nonzero(header == 'Group')[0]].flatten()).astype(np.int8)
        feat_img = (data[:, np.nonzero(header == 'IMG')[0]]).astype(float)
        if 'COVAR' in header:
            feat_cov = (data[:, np.nonzero(header == 'COVAR')[0]]).astype(float)
        if 'ID' in header:
            ID = data[:, np.nonzero(header == 'ID')[0]]
            ID = ID[group == 1]
        if 'Set' in header:
            feat_set = data[:, np.nonzero(header == 'Set')[0]].flatten()

    #================================= Normalizing Data ======================================================
    if config['norm'] != 0:
        model, feat_img, feat_cov = data_normalization(feat_img, feat_cov, config)

    #================================= Prepare Dataset ID ======================================================
    if feat_set is None:
        config['rs'] = 0
    else:
        unique_ID = np.unique(feat_set)
        datasetID = np.copy(feat_set)
        feat_set = np.zeros((len(datasetID), len(unique_ID)))
        for i in range(len(unique_ID)):
            feat_set[np.nonzero(datasetID == unique_ID[i])[0], i] = 1

    #================================= Calculate auto weight ==================================================
    if feat_cov is None:
        config['r'] = 0
    else:
        if config['r'] == -1.0:
            # auto weight: ratio of total covariate variance to total imaging variance
            config['r'] = np.sum(np.var(feat_cov, axis=0)) / np.sum(np.var(feat_img, axis=0))

    #================================= Verbose information ==================================================
    if config['verbose']:
        sys.stdout.write('\t\t================= data summary ==================\n')
        sys.stdout.write('\t\tnumber of patients: %d\n' % sum(group == 1))
        sys.stdout.write('\t\tnumber of normal controls: %d\n' % sum(group == 0))
        sys.stdout.write('\t\timaging feature dimension: %d\n' % feat_img.shape[1])
        if feat_cov is not None:
            sys.stdout.write('\t\tcovariates dimension: %d\n' % feat_cov.shape[1])
        if feat_set is not None:
            sys.stdout.write('\t\tunique data set id: %d\n' % len(unique_ID))
        sys.stdout.write('\t\t================ configurations =================\n')
        sys.stdout.write('\t\tnumber of clusters: %d\n' % config['K'])
        sys.stdout.write('\t\tnumber of runs: %d\n' % config['numRun'])
        sys.stdout.write('\t\tmax number of iterations: %d\n' % config['max_iter'])
        sys.stdout.write('\t\tdistance ratio covar/img = %.4f\n' % config['r'])
        sys.stdout.write('\t\tdistance ratio set/img = %.4f\n' % config['rs'])
        sys.stdout.write('\t\tlambda1 = %.2f\tlambda2 = %.2f\n' % (config['lambda1'], config['lambda2']))
        sys.stdout.write('\t\ttransformation chosen: %s\n' % config['transform'])
        sys.stdout.write('\t\t=================================================\n')

    #============================ Preparing Data ======================================================
    # separate data into patient and normal groups
    feat_img = np.transpose(feat_img)
    x = feat_img[:, group == 0]  # normal controls
    y = feat_img[:, group == 1]  # patients
    xd = []
    yd = []
    xs = []
    ys = []
    if feat_cov is not None:
        feat_cov = np.transpose(feat_cov)
        xd = feat_cov[:, group == 0]
        yd = feat_cov[:, group == 1]
    if feat_set is not None:
        feat_set = np.transpose(feat_set)
        xs = feat_set[:, group == 0]
        ys = feat_set[:, group == 1]

    #================================ Perform Clustering (2 modes available) =================================
    sys.stdout.write('\tclustering...\n')
    if config['mode'] == 2:
        # save the result that yields minimal energy
        obj = float('inf')
        for i in range(config['numRun']):
            cur_result = optimize(x, xd, xs, y, yd, ys, config)
            cur_obj = cur_result[2].min()
            if config['verbose']:
                sys.stdout.write('\t\tRun id %d, obj = %f\n' % (i, cur_obj))
            else:
                time_bar(i, config['numRun'])
            if cur_obj < obj:
                result = cur_result
                obj = cur_obj
        sys.stdout.write('\n')
        membership = np.dot(result[1], Tr(result[0]['delta']))
        label = np.argmax(membership, axis=1)
    else:
        # save the most reproducible result
        label_mat = []
        results = []
        for i in range(config['numRun']):
            cur_result = optimize(x, xd, xs, y, yd, ys, config)
            membership = np.dot(cur_result[1], Tr(cur_result[0]['delta']))
            label = np.argmax(membership, axis=1)
            label_mat.append(label)
            results.append(cur_result)
            time_bar(i, config['numRun'])
        sys.stdout.write('\n')
        label_mat = np.asarray(label_mat)
        ari_mat = np.zeros((config['numRun'], config['numRun']))
        for i in range(config['numRun']):
            for j in range(i + 1, config['numRun']):
                ari_mat[i, j] = ARI(label_mat[i, :], label_mat[j, :])
                ari_mat[j, i] = ari_mat[i, j]
        ave_ari = np.sum(ari_mat, axis=0) / (config['numRun'] - 1)
        idx = np.argmax(ave_ari)
        if config['verbose']:
            sys.stdout.write('\t\tBest average ARI is %f\n' % (max(ave_ari)))
        label = label_mat[idx, :]
        result = results[idx]

    #================================ Finalizing and Save =====================================
    sys.stdout.write('\tsaving results...\n')
    with open(outFile, 'w') as f:
        if ID is None:
            f.write('Cluster\n')
            for i in range(len(label)):
                f.write('%d\n' % (label[i] + 1))
        else:
            f.write('ID,Cluster\n')
            for i in range(len(label)):
                f.write('%s,%d\n' % (ID[i][0], label[i] + 1))
    if config['modelFile'] != "":
        trainData = {'x': x, 'xd': xd, 'xs': xs, 'datasetID': unique_ID}
        model.update({'trainData': trainData})
        model.update({'model': result})
        model.update({'config': config})
        with open(config['modelFile'], 'wb') as f:
            cPickle.dump(model, f, 2)
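
# Hedged usage sketch: the config keys below mirror the ones consumed above,
# but the values (and the 'transform' name) are illustrative assumptions, not
# the package's documented defaults.
if __name__ == '__main__':
    demo_config = {
        'K': 3, 'numRun': 10, 'max_iter': 50,
        'r': -1.0, 'rs': 0.0,
        'lambda1': 10.0, 'lambda2': 10.0,
        'norm': 1, 'mode': 2, 'verbose': True,
        'transform': 'affine', 'modelFile': '',
    }
    clustering('data.csv', 'clusters.csv', demo_config)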