Example #1
    def predict(self, X):
        """
        Predict the cluster of each observation (row) in X

        :param pd.DataFrame X: data
        :return: cluster assignment for each observation and the matrix of cluster probabilities
        :rtype: (np.ndarray, np.ndarray)
        """

        assert self.var_moments is not None, 'Model has not been fit'
        # sort features so continuous variables come first, then binary, followed by non-negative discrete
        X, n_cts_data, n_bin_data, n_ord_data = sort_features(X)
        n = len(X)
        n_clusters = len(self.var_moments)
        n_cts = len(self.var_moments[0]['mu'])
        n_bin = len(self.var_moments[0]['lnp'])
        n_ord = len(self.var_moments[0]['lam'])
        assert n_cts + n_bin + n_ord == len(
            X.columns
        ), 'The features in X do not match the features of the model'
        # log factorials of Poisson features
        lnFactorials = X.apply(
            lambda row:
            [math.log(math.factorial(xi)) for xi in row[(n_cts + n_bin):]],
            axis=1)
        # update E(z_nk) for all n,k
        Ez = Parallel(n_jobs=self.n_jobs)(
            delayed(update_expectation)(x=X.iloc[idx],
                                        lnFactorial=lnFactorials[idx],
                                        pars=self.var_moments)
            for idx in range(n))
        Ez = np.array(Ez)
        # extract E[ln(1-v)] for each cluster and lnP(c=t|v) for each t=1..max_clusters
        lnivs = [self.var_moments[t]['lniv'] for t in range(n_clusters)]
        lnPc = [
            self.var_moments[t]['lnv'] + np.sum(lnivs[:t])
            for t in range(n_clusters)
        ]
        # Add row vector lnP(c=[1..max_clusters]|v) to each row of Ez
        Ez = Ez + lnPc
        # the exp-normalize trick for preventing underflow
        rowMax = np.max(Ez, axis=1).reshape((n, 1))
        Ez = np.exp(Ez - rowMax)
        # normalize rows of Ez
        rowSums = Ez.sum(
            axis=1)[:, None]  # sum probabilities for each observation
        # if all probabilities for an observation are below machine epsilon then it won't be assigned a cluster
        assert 0 not in rowSums, \
            str(list(rowSums).count(0)) + ' observations could not be assigned to a cluster. Increase max_clusters'
        Ez = Ez / rowSums
        # pick most probable cluster for each observation
        c = np.array(Ez.argmax(axis=1)).reshape(n)
        return c, Ez
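The "exp-normalize trick" commented above is small enough to show on its own; a standalone sketch with made-up log scores, not tied to the class:

import numpy as np

# made-up log-responsibilities for 4 observations and 3 clusters
log_scores = np.array([[-1000.0, -1001.0, -1005.0],
                       [-2.0, -1.0, -3.0],
                       [-50.0, -50.0, -50.0],
                       [-700.0, -600.0, -650.0]])

# a naive softmax would underflow: np.exp(-1000.0) == 0.0 for every entry of row 0
row_max = log_scores.max(axis=1, keepdims=True)
probs = np.exp(log_scores - row_max)           # largest entry per row becomes exp(0) = 1
probs /= probs.sum(axis=1, keepdims=True)      # rows now sum to 1 without underflow
print(probs.argmax(axis=1))                    # most probable cluster per observation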
Example #2
def generate_embeddings(fc, schaefer_lab, yeo7_lab):
    """Generate embedding and cluster in 7 ICNs.

    Parameters
    ----------
    fc: ndarray of shape (n_subjects, n_parcels, n_parcels)
        Functional connectivity matrices, one per subject.
    schaefer_lab: ndarray of shape (n_vertices,)
        Schaefer parcellation with `n_parcels` parcels.
    yeo7_lab: ndarray of shape (n_vertices,)
        Yeo 7 network parcellation.

    Returns
    -------
    grad_ind: list of ndarray of shape (n_parcels, n_eigenvectors)
        Individual embeddings.
    grad_ref: ndarray of shape (n_parcels, n_eigenvectors)
        Reference embedding.
    prob_ind: ndarray of shape (n_subjects, n_parcels, 7)
        ICN probability for individual embeddings.
    prob_ref: ndarray of shape (n_parcels, 7)
        ICN probability for reference embedding.
    lab_ind: ndarray of shape (n_subjects, n_parcels)
        ICN labels for individual embeddings.
    lab_ref: ndarray of shape (n_parcels,)
        ICN labels for reference embedding.
    """

    n_subjects = fc.shape[0]

    # embedding
    kwargs = {'keep': .1, 'alpha': 1, 'nc': 30, 'dt': 1}

    fc_ref = fc.mean(0)
    evec_ref, grad_ref = _embed_one(fc_ref, **kwargs)[:-1]

    grad_ind = [None] * n_subjects
    for i, x in enumerate(fc):
        ev1, grad1 = _embed_one(x, **kwargs)[:-1]
        grad_ind[i] = grad1 @ ev1.T @ evec_ref  # change of basis

    # clustering
    init_prob = _get_prob_icn(schaefer_lab, yeo7_lab)
    prob_ind = Parallel(n_jobs=-1)(delayed(gmm_cluster)(si, emb, init_prob)
                                   for si, emb in enumerate(grad_ind))

    prob_ind = np.stack(prob_ind, 0).astype(np.float32)
    prob_ref = gmm_cluster(1, grad_ref, init_prob).astype(np.float32)

    lab_ind = (prob_ind.argmax(-1) + 1).astype(np.uint8)
    lab_ref = (prob_ref.argmax(-1) + 1).astype(np.uint8)

    return grad_ind, grad_ref, prob_ind, prob_ref, lab_ind, lab_ref
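The change-of-basis line (`grad1 @ ev1.T @ evec_ref`) re-expresses each subject's embedding in the reference eigenvector basis so subjects become comparable. A minimal numpy sketch of the same operation; the QR-built orthonormal bases and the names `ev_ind`/`ev_ref` are purely illustrative assumptions, since `_embed_one` is not shown here:

import numpy as np

rng = np.random.default_rng(0)
n_parcels, n_eigenvectors = 100, 10

# hypothetical orthonormal eigenvector bases for one subject and the reference
ev_ind, _ = np.linalg.qr(rng.standard_normal((n_parcels, n_eigenvectors)))
ev_ref, _ = np.linalg.qr(rng.standard_normal((n_parcels, n_eigenvectors)))

# a subject embedding expressed in its own basis
grad_ind = rng.standard_normal((n_parcels, n_eigenvectors))

# re-express the embedding in the reference basis (change of basis)
grad_aligned = grad_ind @ ev_ind.T @ ev_ref
print(grad_aligned.shape)  # (100, 10)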
Example #3
def _run_single_trial_model(ddict, cfg, logger):
    """ Single-trial estimation. """

    logger.info(f"Running single-trial estimation")

    n_runs = np.unique(ddict['run_idx']).size
    K = ddict['denoised_func'].shape[1]

    if cfg['hrf_model'] == 'kay':  # try to optimize HRF selection
        logger.info(f"Going to optimize the HRF (using Kay's 20-HRF basis set)")
        # First, get R2 values for each HRF-based model (20 in total)
        # r2: list (n_runs) of 2D (20 x voxels) arrays
        r2 = Parallel(n_jobs=cfg['n_cpus'])(
            delayed(_optimize_hrf_within)(run, ddict, cfg, logger)
            for run in range(n_runs)
        )
        if cfg['save_all']:  # save to disk for inspection
            for run, this_r2 in enumerate(r2):  # hrf-wise r2 per run
                save_data(this_r2, cfg, ddict, par_dir='best', run=run+1,
                          desc='hrf', dtype='r2')

        # Stack into 3D array: M (runs) x 20 (hrfs) x K (voxels)
        r2 = np.stack(r2)

        if cfg['regularize_hrf_model']:  # same voxel-specific HRF for each run
            logger.info("Regularizing HRF model")
            # IDEA: variance-weighted? So (r2_mean / r2_std).argmax(axis=0)?
            # IDEA: rank-transform per run
            r2_median = np.median(r2 - r2.mean(axis=0), axis=0)  # median across runs

            # 1D array of size K (voxels) with best HRF index
            best_hrf_idx = r2_median.argmax(axis=0).astype(int)

            if cfg['save_all']:  # save per-run statistics
                save_data(r2_median, cfg, ddict, par_dir='best', run=None, desc='hrf', dtype='r2')
                save_data(best_hrf_idx, cfg, ddict, par_dir='best', run=None, desc='opt', dtype='hrf')
        else:  # specific HRF for each voxel and run (2D array: runs x voxels)
            best_hrf_idx = r2.argmax(axis=1).astype(int)
    else:
        # bit of a hack: set all voxels to the same HRF (index: 0)
        best_hrf_idx = np.zeros(K).astype(int)
    
    # Now, fit the single-trial models for real, using a voxel- (and possibly run-)
    # specific HRF or using a "fixed" one (if not --regularize-hrf-model)
    Parallel(n_jobs=cfg['n_cpus'])(
        delayed(_run_single_trial_model_parallel)(run, best_hrf_idx, ddict, cfg, logger)
        for run in range(n_runs)
    )
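The Kay-HRF branch above reduces an (n_runs, 20, n_voxels) stack of R2 values to a best-HRF index per voxel (and per run when not regularized). A toy sketch of that reduction with synthetic R2 values; the shapes follow the comments above, the numbers are made up:

import numpy as np

rng = np.random.default_rng(42)
n_runs, n_hrfs, n_voxels = 4, 20, 1000

# synthetic R2 values for every run, HRF, and voxel
r2 = rng.uniform(0, 1, size=(n_runs, n_hrfs, n_voxels))

# subtract the across-run mean, then take the median across runs (regularized branch)
r2_median = np.median(r2 - r2.mean(axis=0), axis=0)   # shape: (n_hrfs, n_voxels)

# one HRF index per voxel, shared across runs
best_hrf_idx = r2_median.argmax(axis=0).astype(int)    # shape: (n_voxels,)

# without regularization: one HRF index per run and voxel
best_hrf_per_run = r2.argmax(axis=1).astype(int)       # shape: (n_runs, n_voxels)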
Example #4
def svm_ova_from_kernel(ktrain, train_labels,
                        ktest, test_labels,
                        C=DEFAULT_REGULARIZATION,
                        bkg_categories=None):
    """Train one-vs-all SVMs per category from precomputed kernels and
    return test accuracy, per-category prediction scores, and ground truth."""

    def sighandler_svm(signum, frame):
        logger.info('Caught signal %i while training SVMs in parallel.'
                    % signum)

    signal.signal(signal.SIGTERM, sighandler_svm)

    n_test = ktest.shape[0]

    categories = np.unique(train_labels)

    # -- remove background categories
    if bkg_categories is not None:
        categories = list(set(categories).difference(set(bkg_categories)))

    n_categories = len(categories)

    cat_index = {}

    # -- train OVA SVMs in parallel
    predictions = Parallel(n_jobs=-1)(
        delayed(one_svm)(ktrain, train_labels.reshape(-1), ktest, cat, C)
        for cat in categories)

    predictions = np.array(predictions).T

    # -- map each category label to its column index
    for icat, cat in enumerate(categories):
        cat_index[cat] = icat

    gt = np.array([cat_index[e]
                        for e in test_labels.reshape(-1)]).astype('int')
    pred = predictions.argmax(axis=1)
    acc = (pred == gt).sum() / float(n_test)

    return acc, predictions, gt
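`one_svm` is not defined in this example. A plausible sketch of such a one-vs-all worker using scikit-learn's precomputed-kernel SVC; the signature matches the call above, but the body is an assumption, not the original implementation:

import numpy as np
from sklearn.svm import SVC

def one_svm(ktrain, train_labels, ktest, cat, C):
    """Hypothetical one-vs-all worker: decision scores for category `cat`."""
    # binary labels: +1 for the current category, -1 for everything else
    y = np.where(train_labels == cat, 1, -1)
    clf = SVC(kernel='precomputed', C=C)
    clf.fit(ktrain, y)                   # ktrain: (n_train, n_train) kernel matrix
    return clf.decision_function(ktest)  # ktest: (n_test, n_train) kernel matrix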
Example #5
    def _mfvi(self, X, hyperparameters, ml=None):
        """
        Clusters X with variational inference given must-link constraints ml

        :param pd.DataFrame X: data
        :param dict hyperparameters: hyperparameters for the prior distributions
        :param list ml: each element is a list of indices of points that must be in the same cluster
        :return: cluster assignment for each observation, variational parameters and moments, and final ELBO
        :rtype: MFVICluster
        """

        if ml is None:  # avoid a mutable default argument
            ml = []
        n = len(X)
        n_cts = len(hyperparameters['nu'])
        n_bin = len(hyperparameters['gamma'])
        ml_flat = [idx for link in ml for idx in link]
        np.random.seed(self.random_state)  # set random seed
        # initialize expectation matrix. Ez[i,j] is the probability observation i is in cluster j
        Ez = np.zeros((n, self.max_clusters))
        Ez[np.arange(n), np.random.randint(0, self.max_clusters, n)] = 1
        # stick-break points ~ Beta(1,alpha)
        hyperparameters['sb1'] = 1
        hyperparameters['sb2'] = self.alpha
        # initialize hyperparameters & parameters for each cluster
        phi_hyper = [hyperparameters] * self.max_clusters
        phi_par = [update_parameters(hyperparameters)] * self.max_clusters
        # calculate factorials for Poisson variates
        lnFactorials = X.apply(
            lambda row:
            [math.log(math.factorial(xi)) for xi in row[(n_cts + n_bin):]],
            axis=1)
        ELBO = float('-inf')  # initialize ELBO
        for i in range(self.iterations):
            print("Running iteration %s ... " % str(i + 1), end="")
            prevELBO = ELBO
            # update hyperparameters
            phi_hyper = Parallel(n_jobs=self.n_jobs)(
                delayed(update_hyperparameters)(df=X,
                                                hypers=hyperparameters,
                                                Ez=Ez,
                                                k=k,
                                                alpha=self.alpha,
                                                e_mu=phi_par[k]['mu'])
                for k in range(self.max_clusters))
            # update parameters
            phi_par = Parallel(n_jobs=self.n_jobs)(
                delayed(update_parameters)(hypers=phi_hyper[k])
                for k in range(self.max_clusters))
            # update E(z_nk) for all n,k
            Ez = Parallel(n_jobs=self.n_jobs)(delayed(update_expectation)(
                x=X.iloc[idx], lnFactorial=lnFactorials[idx], pars=phi_par)
                                              for idx in range(n))
            Ez = np.array(Ez)
            # calculate joint probabilities for must-link data
            for link in ml:
                Ez[link, :] = np.sum(Ez[link, :], axis=0)
            # extract E[ln(1-v)] for each cluster and lnP(c=t|v) for each t=1..max_clusters
            lnivs = [phi_par[t]['lniv'] for t in range(self.max_clusters)]
            lnPc = [
                phi_par[t]['lnv'] + np.sum(lnivs[:t])
                for t in range(self.max_clusters)
            ]
            # Add row vector lnP(c=[1..max_clusters]|v) to each row of Ez
            Ez = Ez + lnPc
            # the exp-normalize trick for preventing underflow
            rowMax = np.max(Ez, axis=1).reshape((n, 1))
            Ez = np.exp(Ez - rowMax)
            # normalize rows of Ez
            rowSums = Ez.sum(
                axis=1)[:, None]  # sum probabilities for each observation
            # if all probabilities for an observation are below machine epsilon then it won't be assigned a cluster
            assert 0 not in rowSums, \
                str(list(rowSums).count(0))+' observations could not be assigned to a cluster. Increase max_clusters'
            Ez = Ez / rowSums
            # create a matrix of cluster probabilities without duplicated rows for must-link constraints
            c_mat = np.delete(Ez, ml_flat, axis=0)
            for link in ml:
                c_mat = np.append(c_mat, [Ez[link[0], :]], axis=0)
            # calculate ELBO & gain
            ELBO = elbo(df=X,
                        pars=phi_par,
                        var_hypers=phi_hyper,
                        prior_hypers=hyperparameters,
                        e_z=Ez,
                        c_mat=c_mat,
                        lnPc=lnPc,
                        alpha=self.alpha,
                        cores=self.n_jobs)
            ELBOgain = ELBO - prevELBO
            print("ELBO: %s ... gained %s" % (str(ELBO), str(ELBOgain)))
            if ELBOgain < self.tol:
                print("ELBO converged!")
                break
        # assign to each observation, the cluster it's most likely to belong to from the expectation matrix
        c = np.array(Ez.argmax(axis=1)).reshape(n)
        # map cluster indices so they're all consecutive integers
        uniq_c = np.unique(c)
        # filter hyperparameters & expectations
        var_hyper = [phi_hyper[j] for j in uniq_c]
        self.var_moments = [phi_par[j] for j in uniq_c]
        # a mapping to help read the E(z_nk) matrix
        cluster_map = {}
        for clust in uniq_c:
            cluster_map[clust] = np.where(uniq_c == clust)[0][0]

        clusters = list(map(lambda clust_idx: cluster_map[clust_idx], c))
        solution = MFVICluster(clusters, var_hyper, self.var_moments, Ez,
                               cluster_map, ELBO)
        if solution.n_clusters == self.max_clusters:
            print(
                "Warning: max_clusters reached, you may need to cluster again with a higher max_clusters."
            )
        print("Done")
        return solution
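The must-link bookkeeping above (summing the log-domain expectations over each linked group, then dropping the duplicated rows before computing the ELBO) can be shown standalone; a small sketch with a made-up matrix:

import numpy as np

# made-up log-expectations for 5 observations and 3 clusters
Ez = np.arange(15, dtype=float).reshape(5, 3)
ml = [[0, 2], [1, 4]]                 # observations 0&2 and 1&4 must co-cluster
ml_flat = [idx for link in ml for idx in link]

# linked observations share the summed log-expectations of their group
for link in ml:
    Ez[link, :] = Ez[link, :].sum(axis=0)

# keep a single row per must-link group for the ELBO computation
c_mat = np.delete(Ez, ml_flat, axis=0)
for link in ml:
    c_mat = np.append(c_mat, [Ez[link[0], :]], axis=0)

print(c_mat.shape)  # (3, 3): one row for observation 3 plus one per linked group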