Example #1
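All three examples below assume the imports sketched here; the slalom-internal module paths are an assumption and may differ between versions of the package.

# Assumed imports (a sketch; slalom-internal paths may vary by version):
import scipy as SP                        # the examples access scipy under the SP alias
from sklearn.decomposition import PCA     # used by preTrain in Example #3
import slalom                             # provides slalom.CSparseFA
from slalom.bayesnet.vbfa import CGauss   # Gaussian data node; this import path is an assumption
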
def initFromPi(Y, terms, pi, gene_ids=None, nHidden=3, nHiddenSparse=0, pruneGenes=True, FPR=0.99, FNR=0.001,
               noise='gauss', minGenes=20, do_preTrain=True, nFix=None, initZ=None):

    init_factors = {}
    init_factors['initZ'] = initZ

    #terms[terms=="hidden"] = ['%s%s' % t for t in zip(terms[terms=="hidden"], SP.arange(SP.sum(terms=="hidden")))]
    #terms[terms=="hiddenSparse"] = ['%s%s' % t for t in zip(terms[terms=="hiddenSparse"], SP.arange(SP.sum(terms=="hiddenSparse")))]

    init = {
        'init_data': CGauss(Y),
        'Pi': pi,
        'terms': terms,
        'noise': noise,
        'init_factors': init_factors
    }
    if gene_ids is not None:
        gene_ids = SP.array(gene_ids)
    FA = slalom.CSparseFA(components=pi.shape[1],
                          idx_genes=None,
                          gene_ids=gene_ids)
    FA.saveInit = True
    FA.init(**init)

    return FA
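
A minimal usage sketch for initFromPi, assuming a log-space expression matrix and a hand-built prior matrix `pi`; the shapes and term names are illustrative only, and the final `iterate()` call mirrors the commented-out call in Example #3.

# Hypothetical usage (synthetic shapes: 100 cells x 50 genes, 4 factors):
Y = SP.random.randn(100, 50)
pi = SP.ones((50, 4)) * 0.001      # background prior, cf. FNR in initFA below
pi[:10, 0] = 0.99                  # genes annotated to the first factor, cf. FPR
terms = SP.array(['hidden0', 'hidden1', 'factorA', 'factorB'])
FA = initFromPi(Y, terms, pi)
FA.iterate()                       # assumed training entry point, as in Example #3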
Example #2
def initFA(Y, terms, I, gene_ids=None, nHidden=3, nHiddenSparse=0, pruneGenes=True, FPR=0.99, FNR=0.001,
           noise='gauss', minGenes=20, do_preTrain=True, nFix=None, priors=None, covariates=None, dropFactors=True, learnPi=False):
    """Initialise the slalom factor analysis model.

    The three required inputs are, first, a gene expression matrix `Y` containing normalised count values of `N` cells and `G`
    variable genes in log-space; second, a vector `terms` containing the names of all annotated gene sets (corresponding to annotated factors);
    and third, a binary indicator matrix `I` linking `G` genes to `K` terms by indicating which genes are annotated to each factor.
    A variety of options can be specified as described below.

    Args:
        Y (array_like): Matrix of normalised count values of `N` cells 
                                 and `G` variable genes in log-space.
                                 Dimension (:math:`N\\times G`).
        terms    (vector_like): Names of `K` annotated gene sets. Length
                                 :math:`K`.
        I           (array_like): Indicator matrix specifying
                                 whether a gene is annotated to a specific factor.
                                 Dimension (:math:`G\\times K`).
        gene_ids   (array_like): Gene identifiers (optional, defaults to `None`).
        FNR             (float): False negative rate of annotations.
                                 Defaults to 0.001.
        FPR             (float): False positive rate of annotations.
                                 Defaults to 0.99.
        nHidden           (int): Number of unannotated dense factors. Defaults to 3.
        nHiddenSparse       (int): Number of unannotated sparse factors. Defaults to 0.
                                 This value should be changed to e.g. 5 if the diagnostics fail.
        pruneGenes         (bool): Prune genes that are not annotated to at least one factor. This option allows fast inference
                                   and should be set to `True` either if the key objective is to rank factors or if the
                                   annotations cover all genes of interest. Defaults to `True`.
        dropFactors         (bool): Drop factors from the update schedule once they are shut off. In practice, factors that are
                                   switched off at some point during inference are usually not switched on again. Allows faster
                                   inference. Defaults to `True`. Currently only supported for the Gaussian noise model.
        noise              (str): Specifies the observation noise model. Should be one of `'gauss'`, `'hurdle'` or `'poisson'`.
                                 Defaults to `'gauss'`.
        minGenes          (int): Minimum number of genes required per term to retain it.
                                 Defaults to `20`.
        do_preTrain      (bool): Boolean switch indicating whether pre-training should be used to establish the initial
                                update order. Can be set to `False` for very large datasets.
                                Defaults to `True`.
        priors      (dict): Dictionary containing the hyperparameters of the priors for `Alpha`, `Eps` and `Pi` (`PiDense` and `PiSparse`).
                            Defaults to `None`; in this case default values are used.
        covariates  (array_like): Matrix with known covariates that are controlled for when fitting the model. Defaults to `None`.
        learnPi          (bool): Learn sparsity of sparse hidden factors. Defaults to `False`.


    Returns:
        A :class:`slalom.CSparseFA` instance.
    """

    #check for consistency of input parameters
    [num_cells, num_genes] = Y.shape
    num_terms = I.shape[1]

    assert I.shape[0] == num_genes, 'annotation needs to be matched to gene input dimension'

    assert noise in ['gauss', 'hurdle', 'poisson'], 'invalid noise model'
    assert 0 < FNR < 1, 'FNR is required to be between 0 and 1'
    assert 0 < FPR < 1, 'FPR is required to be between 0 and 1'
    if noise == "hurdle" and dropFactors:
        dropFactors = False
        print("dropFactors is only supported for the gauss noise model. Set to False.")

    #make sure the annotation is boolean
    I = (I > .5)
    # filter annotation by min number of required genes
    Iok = I.sum(axis=0) > minGenes
    terms = terms[Iok]
    I = I[:, Iok]
    num_terms = I.shape[1]

    #create initial pi matrix, which corresponds to the effective prior probability of an annotated link
    pi = SP.zeros([num_genes, num_terms], dtype='float')
    #default FNR
    pi[:] = FNR
    #active links
    pi[I] = FPR
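    # Note: despite their names, FPR is used as the prior probability that an
    # annotated link is active (0.99 by default), while FNR is the small prior
    # probability assigned to unannotated links (0.001 by default).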

    #prune genes?
    if pruneGenes:
        idx_genes = SP.sum(I, 1) > 0
        Y = Y[:, idx_genes]
        pi = pi[idx_genes, :]
        if gene_ids is not None:
            gene_ids = SP.array(gene_ids)[idx_genes]
    else:
        idx_genes = SP.arange(Y.shape[1])
        if Y.shape[1] > 10000:
            print("For large datasets we recommend setting the pruneGenes option to True.")

    #center data for Gaussian observation noise
    if noise == 'gauss':
        Y -= SP.mean(Y, 0)

    #include hidden variables
    if nHiddenSparse > 0:
        piSparse = SP.ones((Y.shape[1], nHiddenSparse)) * .01
        idxVar = SP.argsort(-Y.var(0))
        for iH in range(piSparse.shape[1]):
            idxOnH = SP.random.choice(idxVar[:100], 20, replace=False)
            piSparse[idxOnH, iH] = 0.99
        pi = SP.hstack([piSparse, pi])
        thiddenSparse = SP.repeat('hiddenSparse', nHiddenSparse)
        termsHiddenSparse = [
            '%s%s' % t for t in zip(thiddenSparse, SP.arange(nHiddenSparse))
        ]
        terms = SP.hstack([termsHiddenSparse, terms])
        num_terms += nHiddenSparse

    thidden = SP.repeat('hidden', nHidden)
    termsHidden = ['%s%s' % t for t in zip(thidden, SP.arange(nHidden))]
    terms = SP.hstack([termsHidden, terms])

    pi = SP.hstack([SP.ones((Y.shape[1], nHidden)) * .99, pi])
    num_terms += nHidden

    if covariates is not None:
        if len(covariates.shape) == 1:
            covariates = covariates[:, SP.newaxis]
        nKnown = covariates.shape[1]
        pi = SP.hstack([SP.ones((Y.shape[1], nKnown)) * .99, pi])
        num_terms += nKnown
        tcovariates = SP.repeat('covariate', nKnown)
        termsCovariates = [
            '%s%s' % t for t in zip(tcovariates, SP.arange(nKnown))
        ]
        terms = SP.hstack([termsCovariates, terms])


    # mean term for non-Gaussian noise models
    if noise != 'gauss':
        terms = SP.hstack(['bias', terms])
        pi = SP.hstack([SP.ones((Y.shape[1], 1)) * (1. - 1e-10), pi])
        num_terms += 1

    if do_preTrain:
        Ilabel = preTrain(Y,
                          terms,
                          pi,
                          noise=noise,
                          nFix=nFix,
                          priors=priors,
                          covariates=covariates)
        pi = pi[:, Ilabel]
        terms = terms[Ilabel]

    init = {
        'init_data': CGauss(Y),
        'Pi': pi,
        'terms': terms,
        'noise': noise,
        'covariates': covariates,
        "dropFactors": dropFactors
    }
    if gene_ids is not None:
        gene_ids = SP.array(gene_ids)

    FA = slalom.CSparseFA(components=num_terms,
                          idx_genes=idx_genes,
                          gene_ids=gene_ids,
                          priors=priors,
                          learnPi=learnPi)
    FA.saveInit = False
    FA.init(**init)

    return FA
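
A minimal end-to-end sketch of calling initFA on synthetic data; the annotation matrix and term names are placeholders, and the `iterate()` call follows the pattern visible in Example #3.

# Hypothetical call (synthetic data; shapes illustrative):
N, G, K = 200, 500, 3                      # cells, genes, annotated terms
Y = SP.random.randn(N, G)                  # stand-in for log-space expression
I = SP.random.rand(G, K) > 0.9             # random gene-to-term annotation
terms = SP.array(['termA', 'termB', 'termC'])
FA = initFA(Y, terms, I, nHidden=3, pruneGenes=True, do_preTrain=False)
FA.iterate()                               # assumed training entry point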
Example #3
def preTrain(Y,
             terms,
             P_I,
             noise='gauss',
             nFix=None,
             priors=None,
             covariates=None):
    """Pre-train the slalom factor analysis model.

    Helper function to pre-train the slalom factor analysis model to achieve 
    faster convergence and obtain an initial update order. Called by `initFA`.

    Args:
        Y          (array_like): Matrix of normalised count values of `N` cells 
                                 and `G` variable genes in log-space.
                                 Dimension (:math:`N\\times G`).
        terms     (vector_like): Names of `K` annotated gene sets. Length
                                 :math:`K`.
        P_I        (array_like): Matrix specifying the likelihood of 
                                 whether a gene is annotated to a specific factor.
                                 Dimension (:math:`G\\times K`).
        noise              (str): Specifies the observation noise model. Should be one of `'gauss'`, `'hurdle'` or `'poisson'`.
                                 Defaults to `'gauss'`.
        nFix               (int): Number of terms which should be fixed and updated first. Defaults to `None`,
                                  resulting in the number of unannotated factors being updated first.
    Returns:
        A vector containing the initial update order of the terms.
    """

    init_params = {}
    init_params['noise'] = noise
    init_params['iLatent'] = SP.where(terms == 'hidden')[0]
    init_params['iLatentSparse'] = SP.array([])  # SP.where(terms=='hiddenSparse')[0]
    if covariates is not None:
        init_params['Known'] = covariates
    learnPi = False

    pi = P_I.copy()
    K = pi.shape[1]

    # saturate prior probabilities to effectively hard on/off values
    # for the sparseFA instance
    pi[pi > .8] = 1 - 1e-100
    pi[pi < .2] = 1e-100

    init = {
        'init_data': CGauss(Y),
        'Pi': pi,
        'terms': terms,
        'noise': noise,
        'covariates': covariates
    }
    sigmaOff = 1E-3
    sparsity = 'VB'

    #prior on noise level
    if priors is None:
        priors = {'Eps': {'priors': [1E-3, 1E-3]}}
    # network initialisation: 'pcaRand' (passed as initType below)
    terms0 = terms
    pi0 = pi.copy()
    FA0 = slalom.CSparseFA(components=K,
                           sigmaOff=sigmaOff,
                           sigmaOn=SP.ones(pi.shape[1]) * 1.0,
                           sparsity=sparsity,
                           nIterations=50,
                           permutation_move=False,
                           priors=priors,
                           initType='pcaRand',
                           learnPi=learnPi)
    FA0.init(**init)
    if nFix is None:
        nFix = FA0.nKnown + FA0.nLatent


    # Fit PCA (first principal component) to the expected latent states
    pca = PCA(n_components=1)
    pca.fit(FA0.Z.E1)
    X = pca.transform(FA0.Z.E1)

    #Sort by correlation to PC1
    MPC = abs(vcorrcoef(FA0.S.E1[:, SP.argsort(FA0.W.Ilabel)].T, X.T))[nFix:]
    Ipi = SP.argsort(-MPC.ravel())
    IpiRev = SP.argsort(MPC.ravel())

    mRange = list(range(FA0.components))
    mRange[nFix:] = Ipi + nFix

    mRangeRev = list(range(FA0.components))
    mRangeRev[nFix:] = IpiRev + nFix

    #Run model for 50 iterations
    pi = pi0[:, mRange]
    terms = terms0[mRange]
    init = {'init_data': CGauss(Y), 'Pi': pi, 'terms': terms, 'noise': noise}
    FA = slalom.CSparseFA(components=K,
                          sigmaOff=sigmaOff,
                          sigmaOn=SP.ones(pi.shape[1]) * 1.0,
                          sparsity=sparsity,
                          nIterations=50,
                          permutation_move=False,
                          priors=priors,
                          initType='pcaRand',
                          learnPi=learnPi)
    FA.shuffle = True
    FA.nScale = 30

    FA.init(**init)
    for j in range(50):
        FA.update()

    #Run reverse model for 50 iterations
    pi = pi0[:, mRangeRev]
    terms = terms0[mRangeRev]
    init = {'init_data': CGauss(Y), 'Pi': pi, 'terms': terms, 'noise': noise}
    FArev = slalom.CSparseFA(components=K,
                             sigmaOff=sigmaOff,
                             sigmaOn=SP.ones(pi.shape[1]) * 1.0,
                             sparsity=sparsity,
                             nIterations=50,
                             permutation_move=False,
                             priors=priors,
                             initType='pcaRand',
                             learnPi=learnPi)
    FArev.shuffle = True
    FArev.nScale = 30
    FArev.init(**init)

    #FArev.iterate(forceIterations=True, nIterations=nIterations)
    for j in range(50):
        FArev.update()

    # rank the non-fixed terms by average relevance (1/Alpha, the inverse
    # ARD precision) across the forward and reverse runs
    IpiM = (-(0.5 * (1. / FArev.Alpha.E1[SP.argsort(mRangeRev)][nFix:]) + .5 *
              (1. / FA.Alpha.E1[SP.argsort(mRange)][nFix:]))).argsort()

    #    IpiM = (-(0.5*(1./FArev.Alpha.E1[SP.argsort(mRangeRev)][nFix:]*FArev.S.E1[:,SP.argsort(mRangeRev)][:,nFix:].std(0))+.5*(1./FA.Alpha.E1[SP.argsort(mRange)][nFix:]*FA.S.E1[:,SP.argsort(mRange)][:,nFix:].std(0)))).argsort()
    Ilabel = SP.hstack([SP.arange(nFix), IpiM + nFix])

    return Ilabel
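
Example #3 uses a helper vcorrcoef that is not shown on this page; below is a minimal sketch under the assumption that it computes the row-wise Pearson correlation between the rows of a matrix and a single vector (slalom ships its own implementation).

def vcorrcoef(X, y):
    # Assumed behaviour: Pearson correlation of each row of X with the
    # (row) vector y; returns one coefficient per row of X.
    Xm = X - X.mean(axis=1, keepdims=True)
    ym = y - y.mean()
    r_num = (Xm * ym).sum(axis=1)
    r_den = SP.sqrt((Xm ** 2).sum(axis=1) * (ym ** 2).sum())
    return r_num / r_den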