def predict(self, X):
    """
    Predict the cluster of each observation (row) in X.

    :param pd.DataFrame X: data
    :return: the most probable cluster for each observation and the matrix of cluster probabilities
    :rtype: np.ndarray, np.ndarray
    """
    assert self.var_moments is not None, 'Model has not been fit'

    # sort features so continuous variables come first, then binary, followed by non-negative discrete
    X, n_cts_data, n_bin_data, n_ord_data = sort_features(X)
    n = len(X)
    n_clusters = len(self.var_moments)
    n_cts = len(self.var_moments[0]['mu'])
    n_bin = len(self.var_moments[0]['lnp'])
    n_ord = len(self.var_moments[0]['lam'])
    assert n_cts + n_bin + n_ord == len(X.columns), \
        'The features in X do not match the features of the model'

    # log factorials of Poisson features
    lnFactorials = X.apply(
        lambda row: [math.log(math.factorial(xi)) for xi in row[(n_cts + n_bin):]],
        axis=1)

    # update E(z_nk) for all n,k
    Ez = Parallel(n_jobs=self.n_jobs)(
        delayed(update_expectation)(x=X.iloc[idx],
                                    lnFactorial=lnFactorials[idx],
                                    pars=self.var_moments) for idx in range(n))
    Ez = np.array(Ez)

    # extract E[ln(1-v)] for each cluster and lnP(c=t|v) for each t=1..max_clusters
    lnivs = [self.var_moments[t]['lniv'] for t in range(n_clusters)]
    lnPc = [
        self.var_moments[t]['lnv'] + np.sum(lnivs[:t]) for t in range(n_clusters)
    ]

    # add row vector lnP(c=[1..max_clusters]|v) to each row of Ez
    Ez = Ez + lnPc

    # the exp-normalize trick for preventing underflow
    rowMax = np.max(Ez, axis=1).reshape((n, 1))
    Ez = np.exp(Ez - rowMax)

    # normalize rows of Ez
    rowSums = Ez.sum(axis=1)[:, None]  # sum of probabilities for each observation
    # if all probabilities for an observation are below machine epsilon then it won't be assigned a cluster
    assert 0 not in rowSums, \
        str(list(rowSums).count(0)) + ' observations could not be assigned to a cluster. Increase max_clusters'
    Ez = Ez / rowSums

    # pick the most probable cluster for each observation
    c = np.array(Ez.argmax(axis=1)).reshape(n)
    return c, Ez
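# A minimal, self-contained sketch (not part of the model code) illustrating the
# exp-normalize trick used above: subtracting the row maximum before exponentiating
# keeps large negative log-probabilities from underflowing to zero. The toy array
# `log_probs` below is illustrative only.
import numpy as np

log_probs = np.array([[-1000.0, -1001.0, -1002.0],
                      [-3.0, -1.0, -2.0]])
row_max = log_probs.max(axis=1, keepdims=True)
probs = np.exp(log_probs - row_max)          # safe: the largest entry becomes exp(0) = 1
probs /= probs.sum(axis=1, keepdims=True)    # rows now sum to 1
# a naive np.exp(log_probs) would return all zeros for the first row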
def generate_embeddings(fc, schaefer_lab, yeo7_lab):
    """Generate embeddings and cluster into 7 ICNs.

    Parameters
    ----------
    fc : ndarray of shape (n_subjects, n_parcels, n_parcels)
        Functional connectivity matrices, one per subject.
    schaefer_lab : ndarray of shape (n_vertices,)
        Schaefer parcellation with `n_parcels` parcels.
    yeo7_lab : ndarray of shape (n_vertices,)
        Yeo 7 network parcellation.

    Returns
    -------
    grad_ind : list of ndarray of shape (n_parcels, n_eigenvectors)
        Individual embeddings.
    grad_ref : ndarray of shape (n_parcels, n_eigenvectors)
        Reference embedding.
    prob_ind : ndarray of shape (n_subjects, n_parcels, 7)
        ICN probabilities for individual embeddings.
    prob_ref : ndarray of shape (n_parcels, 7)
        ICN probabilities for reference embedding.
    lab_ind : ndarray of shape (n_subjects, n_parcels)
        ICN labels for individual embeddings.
    lab_ref : ndarray of shape (n_parcels,)
        ICN labels for reference embedding.
    """
    n_subjects = fc.shape[0]

    # embedding
    kwargs = {'keep': .1, 'alpha': 1, 'nc': 30, 'dt': 1}
    fc_ref = fc.mean(0)
    evec_ref, grad_ref = _embed_one(fc_ref, **kwargs)[:-1]

    grad_ind = [None] * n_subjects
    for i, x in enumerate(fc):
        ev1, grad1 = _embed_one(x, **kwargs)[:-1]
        grad_ind[i] = grad1 @ ev1.T @ evec_ref  # change of basis

    # clustering
    init_prob = _get_prob_icn(schaefer_lab, yeo7_lab)
    prob_ind = Parallel(n_jobs=-1)(delayed(gmm_cluster)(si, emb, init_prob)
                                   for si, emb in enumerate(grad_ind))
    prob_ind = np.stack(prob_ind, 0).astype(np.float32)
    prob_ref = gmm_cluster(1, grad_ref, init_prob).astype(np.float32)

    lab_ind = (prob_ind.argmax(-1) + 1).astype(np.uint8)
    lab_ref = (prob_ref.argmax(-1) + 1).astype(np.uint8)

    return grad_ind, grad_ref, prob_ind, prob_ref, lab_ind, lab_ref
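# A hedged sketch (illustrative shapes only, not the actual _embed_one output) of the
# change-of-basis step used above: an individual embedding expressed in its own
# eigenvector basis is re-expressed in the reference eigenvector basis via
# grad1 @ ev1.T @ evec_ref, so embeddings become comparable across subjects.
import numpy as np

rng = np.random.default_rng(0)
n_parcels, n_eigenvectors = 100, 30  # assumed sizes, matching nc=30 above
ev1 = np.linalg.qr(rng.normal(size=(n_parcels, n_eigenvectors)))[0]       # individual basis
evec_ref = np.linalg.qr(rng.normal(size=(n_parcels, n_eigenvectors)))[0]  # reference basis
grad1 = rng.normal(size=(n_parcels, n_eigenvectors))                      # individual embedding

grad_aligned = grad1 @ ev1.T @ evec_ref
print(grad_aligned.shape)  # (100, 30): same shape, now in the reference basis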
def _run_single_trial_model(ddict, cfg, logger):
    """ Single-trial estimation. """
    logger.info("Running single-trial estimation")
    n_runs = np.unique(ddict['run_idx']).size
    K = ddict['denoised_func'].shape[1]

    if cfg['hrf_model'] == 'kay':  # try to optimize HRF selection
        logger.info("Going to optimize the HRF (using Kay's 20-HRF basis set)")

        # First, get R2 values for each HRF-based model (20 in total)
        # r2: list (n_runs) of 2D (20 x voxels) arrays
        r2 = Parallel(n_jobs=cfg['n_cpus'])(
            delayed(_optimize_hrf_within)(run, ddict, cfg, logger)
            for run in range(n_runs)
        )

        if cfg['save_all']:  # save to disk for inspection
            for run, this_r2 in enumerate(r2):  # hrf-wise r2 per run
                save_data(this_r2, cfg, ddict, par_dir='best', run=run + 1,
                          desc='hrf', dtype='r2')

        # Stack into 3D array: M (runs) x 20 (hrfs) x K (voxels)
        r2 = np.stack(r2)

        if cfg['regularize_hrf_model']:  # same voxel-specific HRF for each run
            logger.info("Regularizing HRF model")
            # IDEA: variance-weighted? So (r2_mean / r2_std).argmax(axis=0)?
            # IDEA: rank-transform per run
            r2_median = np.median(r2 - r2.mean(axis=0), axis=0)  # median across runs

            # 1D array of size K (voxels) with best HRF index
            best_hrf_idx = r2_median.argmax(axis=0).astype(int)
            if cfg['save_all']:  # save per-run statistics
                save_data(r2_median, cfg, ddict, par_dir='best', run=None,
                          desc='hrf', dtype='r2')
                save_data(best_hrf_idx, cfg, ddict, par_dir='best', run=None,
                          desc='opt', dtype='hrf')
        else:  # specific HRF for each voxel and run (2D array: runs x voxels)
            best_hrf_idx = r2.argmax(axis=1).astype(int)
    else:  # bit of a hack: set all voxels to the same HRF (index: 0)
        best_hrf_idx = np.zeros(K).astype(int)

    # Now, fit the single-trial models for real, using a voxel- (and possibly run-)
    # specific HRF or using a "fixed" one (if not --regularize-hrf-model)
    Parallel(n_jobs=cfg['n_cpus'])(
        delayed(_run_single_trial_model_parallel)(run, best_hrf_idx, ddict, cfg, logger)
        for run in range(n_runs)
    )
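# A small standalone sketch (toy data, not real fMRI output) of the HRF-selection step
# above: per-run (n_hrfs x n_voxels) R2 arrays are stacked, each (hrf, voxel) entry is
# centered by its mean across runs, the median across runs is taken, and the best HRF
# per voxel is the argmax over the HRF axis.
import numpy as np

rng = np.random.default_rng(42)
n_runs, n_hrfs, n_voxels = 4, 20, 1000
r2 = [rng.uniform(0, 0.5, size=(n_hrfs, n_voxels)) for _ in range(n_runs)]

r2 = np.stack(r2)                                    # (n_runs, n_hrfs, n_voxels)
r2_median = np.median(r2 - r2.mean(axis=0), axis=0)  # (n_hrfs, n_voxels)
best_hrf_idx = r2_median.argmax(axis=0).astype(int)  # (n_voxels,): best of the 20 HRFs per voxel
print(best_hrf_idx.shape, best_hrf_idx.min(), best_hrf_idx.max())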
def svm_ova_from_kernel(ktrain, train_labels,
                        ktest, test_labels,
                        C=DEFAULT_REGULARIZATION,
                        bkg_categories=None):

    def sighandler_svm(signum, frame):
        logger.info('Caught signal %i while training SVMs in parallel.' % signum)

    signal.signal(signal.SIGTERM, sighandler_svm)

    n_test = ktest.shape[0]
    categories = np.unique(train_labels)

    # -- remove background categories
    if bkg_categories is not None:
        categories = list(set(categories).difference(set(bkg_categories)))

    n_categories = len(categories)
    cat_index = {}
    predictions = np.empty((n_test, n_categories))

    # -- train OVA SVMs in parallel
    predictions = Parallel(n_jobs=-1)(
        delayed(one_svm)(ktrain, train_labels.reshape(-1), ktest, cat, C)
        for cat in categories)
    predictions = np.array(predictions).T

    # -- iterate over categories
    for icat, cat in enumerate(categories):
        cat_index[cat] = icat

    gt = np.array([cat_index[e] for e in test_labels.reshape(-1)]).astype('int')
    pred = predictions.argmax(axis=1)
    acc = (pred == gt).sum() / float(n_test)

    return acc, predictions, gt
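# A hedged usage-style sketch (synthetic decision values, no real SVMs trained) of the
# one-vs-all readout above: each column of `predictions` holds one category's decision
# values on the test set, the predicted label is the column-wise argmax, and accuracy
# is the fraction of argmax labels matching the ground-truth indices.
import numpy as np

rng = np.random.default_rng(7)
n_test, n_categories = 6, 3
predictions = rng.normal(size=(n_test, n_categories))  # stand-in for stacked OVA outputs
gt = rng.integers(0, n_categories, size=n_test)        # stand-in ground-truth category indices

pred = predictions.argmax(axis=1)
acc = (pred == gt).sum() / float(n_test)
print(acc)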
def _mfvi(self, X, hyperparameters, ml=[]):
    """
    Clusters X with variational inference given must-link constraints ml.

    :param pd.DataFrame X: data
    :param dict hyperparameters: hyperparameters for the prior distributions
    :param list ml: each element is a list of indices of points that must be in the same cluster
    :return: cluster assignment for each observation, variational parameters and moments, and final ELBO
    :rtype: MFVICluster
    """
    n = len(X)
    n_cts = len(hyperparameters['nu'])
    n_bin = len(hyperparameters['gamma'])
    ml_flat = [idx for link in ml for idx in link]
    np.random.seed(self.random_state)  # set random seed

    # initialize expectation matrix. Ez[i,j] is the probability observation i is in cluster j
    Ez = np.zeros((n, self.max_clusters))
    Ez[np.arange(n), np.random.randint(0, self.max_clusters, n)] = 1

    # stick-breaking points ~ Beta(1, alpha)
    hyperparameters['sb1'] = 1
    hyperparameters['sb2'] = self.alpha

    # initialize hyperparameters & parameters for each cluster
    phi_hyper = [hyperparameters] * self.max_clusters
    phi_par = [update_parameters(hyperparameters)] * self.max_clusters

    # calculate log factorials for Poisson variates
    lnFactorials = X.apply(
        lambda row: [math.log(math.factorial(xi)) for xi in row[(n_cts + n_bin):]],
        axis=1)

    ELBO = float('-inf')  # initialize ELBO
    for i in range(self.iterations):
        print("Running iteration %s ... " % str(i + 1), end="")
        prevELBO = ELBO

        # update hyperparameters
        phi_hyper = Parallel(n_jobs=self.n_jobs)(
            delayed(update_hyperparameters)(df=X,
                                            hypers=hyperparameters,
                                            Ez=Ez,
                                            k=k,
                                            alpha=self.alpha,
                                            e_mu=phi_par[k]['mu'])
            for k in range(self.max_clusters))

        # update parameters
        phi_par = Parallel(n_jobs=self.n_jobs)(
            delayed(update_parameters)(hypers=phi_hyper[k])
            for k in range(self.max_clusters))

        # update E(z_nk) for all n,k
        Ez = Parallel(n_jobs=self.n_jobs)(
            delayed(update_expectation)(x=X.iloc[idx],
                                        lnFactorial=lnFactorials[idx],
                                        pars=phi_par) for idx in range(n))
        Ez = np.array(Ez)

        # calculate joint (log) probabilities for must-link data
        for link in ml:
            Ez[link, :] = np.sum(Ez[link, :], axis=0)

        # extract E[ln(1-v)] for each cluster and lnP(c=t|v) for each t=1..max_clusters
        lnivs = [phi_par[t]['lniv'] for t in range(self.max_clusters)]
        lnPc = [
            phi_par[t]['lnv'] + np.sum(lnivs[:t])
            for t in range(self.max_clusters)
        ]

        # add row vector lnP(c=[1..max_clusters]|v) to each row of Ez
        Ez = Ez + lnPc

        # the exp-normalize trick for preventing underflow
        rowMax = np.max(Ez, axis=1).reshape((n, 1))
        Ez = np.exp(Ez - rowMax)

        # normalize rows of Ez
        rowSums = Ez.sum(axis=1)[:, None]  # sum of probabilities for each observation
        # if all probabilities for an observation are below machine epsilon then it won't be assigned a cluster
        assert 0 not in rowSums, \
            str(list(rowSums).count(0)) + ' observations could not be assigned to a cluster. Increase max_clusters'
        Ez = Ez / rowSums

        # create a matrix of cluster probabilities without duplicated rows for must-link constraints
        c_mat = np.delete(Ez, ml_flat, axis=0)
        for link in ml:
            c_mat = np.append(c_mat, [Ez[link[0], :]], axis=0)

        # calculate ELBO & gain
        ELBO = elbo(df=X,
                    pars=phi_par,
                    var_hypers=phi_hyper,
                    prior_hypers=hyperparameters,
                    e_z=Ez,
                    c_mat=c_mat,
                    lnPc=lnPc,
                    alpha=self.alpha,
                    cores=self.n_jobs)
        ELBOgain = ELBO - prevELBO
        print("ELBO: %s ... gained %s" % (str(ELBO), str(ELBOgain)))
        if ELBOgain < self.tol:
            print("ELBO converged!")
            break

    # assign each observation to the cluster it's most likely to belong to from the expectation matrix
    c = np.array(Ez.argmax(axis=1)).reshape(n)

    # map cluster indices so they're all consecutive integers
    uniq_c = np.unique(c)

    # filter hyperparameters & expectations
    var_hyper = [phi_hyper[j] for j in uniq_c]
    self.var_moments = [phi_par[j] for j in uniq_c]

    # a mapping to help read the E(z_nk) matrix
    cluster_map = {}
    for clust in uniq_c:
        cluster_map[clust] = np.where(uniq_c == clust)[0][0]
    clusters = list(map(lambda clust_idx: cluster_map[clust_idx], c))

    solution = MFVICluster(clusters, var_hyper, self.var_moments, Ez, cluster_map, ELBO)
    if solution.n_clusters == self.max_clusters:
        print("Warning: max_clusters reached, you may need to cluster again "
              "with a higher max_clusters.")
    print("Done")
    return solution