def initFromPi(Y, terms, pi, gene_ids=None, nHidden=3, nHiddenSparse=0,
               pruneGenes=True, FPR=0.99, FNR=0.001, noise='gauss',
               minGenes=20, do_preTrain=True, nFix=None, initZ=None):
    """Initialise a slalom factor analysis model from a pre-computed prior
    matrix `pi` (and, optionally, an initial factor matrix `initZ`).
    Companion to `initFA`, which builds `pi` from an annotation matrix."""
    init_factors = {}
    init_factors['initZ'] = initZ
    #terms[terms=="hidden"] = ['%s%s' % t for t in zip(terms[terms=="hidden"], SP.arange(SP.sum(terms=="hidden")))]
    #terms[terms=="hiddenSparse"] = ['%s%s' % t for t in zip(terms[terms=="hiddenSparse"], SP.arange(SP.sum(terms=="hiddenSparse")))]

    init = {
        'init_data': CGauss(Y),
        'Pi': pi,
        'terms': terms,
        'noise': noise,
        'init_factors': init_factors
    }
    if gene_ids is not None:
        gene_ids = SP.array(gene_ids)

    FA = slalom.CSparseFA(components=pi.shape[1], idx_genes=None,
                          gene_ids=gene_ids)
    FA.saveInit = True
    FA.init(**init)

    return FA
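
# Usage sketch for `initFromPi` (illustrative only): re-initialise a model
# from a previously computed prior matrix `pi`, optionally warm-starting the
# factors from a saved matrix. Here `Y`, `terms`, `pi`, `gene_ids` and
# `Z_saved` are assumed to exist, e.g. from an earlier `initFA` run
# (`Z_saved` is a hypothetical name), and `iterate` mirrors the call
# referenced in the commented-out line in `preTrain` below.
#
#   FA = initFromPi(Y, terms, pi, gene_ids=gene_ids, initZ=Z_saved)
#   FA.iterate(forceIterations=True, nIterations=50)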
def initFA(Y, terms, I, gene_ids=None, nHidden=3, nHiddenSparse=0,
           pruneGenes=True, FPR=0.99, FNR=0.001, noise='gauss', minGenes=20,
           do_preTrain=True, nFix=None, priors=None, covariates=None,
           dropFactors=True, learnPi=False):
    """Initialise the slalom factor analysis model.

    Three inputs are required: first, a gene expression matrix `Y`
    containing normalised count values of `N` cells and `G` variable genes
    in log-space; second, a vector `terms` containing the names of all
    annotated gene sets (corresponding to annotated factors); and third, a
    binary indicator matrix `I` linking `G` genes to `K` terms by indicating
    which genes are annotated to each factor. A variety of options can be
    specified as described below.

    Args:
        Y (array_like): Matrix of normalised count values of `N` cells
            and `G` variable genes in log-space.
            Dimension (:math:`N\\times G`).
        terms (vector_like): Names of `K` annotated gene sets.
            Dimension (:math:`K\\times 1`).
        I (array_like): Indicator matrix specifying whether a gene is
            annotated to a specific factor.
            Dimension (:math:`G\\times K`).
        gene_ids (array_like): Gene identifiers (optional, defaults to
            `None`).
        FNR (float): False negative rate of annotations.
            Defaults to 0.001.
        FPR (float): False positive rate of annotations.
            Defaults to 0.99.
        nHidden (int): Number of unannotated dense factors.
            Defaults to 3.
        nHiddenSparse (int): Number of unannotated sparse factors.
            Defaults to 0. This value should be increased to e.g. 5 if the
            diagnostics fail.
        pruneGenes (bool): Prune genes that are not annotated to at least
            one factor. This option allows fast inference and should be set
            to `True` either if the key objective is to rank factors or if
            the annotations cover all genes of interest.
            Defaults to `True`.
        dropFactors (bool): Drop factors from the update schedule once they
            are shut off. In practice, factors that are switched off at some
            point during inference are usually not switched on again, so
            this allows faster inference. Defaults to `True`. Currently only
            supported for the Gaussian noise model.
        noise (str): Specifies the observation noise model. Should be either
            `'gauss'`, `'hurdle'` or `'poisson'`.
            Defaults to `'gauss'`.
        minGenes (int): Minimum number of genes required per term in order
            to retain it. Defaults to `20`.
        do_preTrain (bool): Boolean switch indicating whether pre-training
            should be used to establish the initial update order. Can be set
            to `False` for very large datasets.
            Defaults to `True`.
        priors (dict): Dictionary containing the hyperparameters of the
            priors for `Alpha`, `Eps` and `Pi` (`PiDense` and `PiSparse`).
            Defaults to `None`; in this case default values are used.
        covariates (array_like): Matrix with known covariates that are
            controlled for when fitting the model. Defaults to `None`.
        learnPi (bool): Learn the sparsity of sparse hidden factors.
            Defaults to `False`.

    Returns:
        A :class:`slalom.CSparseFA` instance.
    """

    # check for consistency of input parameters
    [num_cells, num_genes] = Y.shape
    num_terms = I.shape[1]

    assert I.shape[0] == num_genes, \
        'annotation needs to be matched to gene input dimension'
    assert noise in ['gauss', 'hurdle', 'poisson'], 'invalid noise model'
    assert 0 < FNR < 1, 'FNR is required to be between 0 and 1'
    assert 0 < FPR < 1, 'FPR is required to be between 0 and 1'

    if noise == "hurdle" and dropFactors == True:
        dropFactors = False
        print("dropFactors only supported for gauss noise model. Set to False.")

    # make sure the annotation is boolean
    I = (I > .5)
    # filter annotation by the minimum number of required genes per term
    Iok = I.sum(axis=0) > minGenes
    terms = terms[Iok]
    I = I[:, Iok]
    num_terms = I.shape[1]

    # create the initial pi matrix, which corresponds to the effective prior
    # probability of an annotated link
    pi = SP.zeros([num_genes, num_terms], dtype='float')
    # default FNR for unannotated links
    pi[:] = FNR
    # active (annotated) links
    pi[I] = FPR

    # prune genes?
    if pruneGenes == True:
        idx_genes = SP.sum(I, 1) > 0
        Y = Y[:, idx_genes]
        pi = pi[idx_genes, :]
        if gene_ids is not None:
            gene_ids = SP.array(gene_ids)[idx_genes]
    else:
        idx_genes = SP.arange(Y.shape[1])
        if Y.shape[1] > 10000:
            print("For large datasets we recommend setting the pruneGenes "
                  "option to True.")

    # center data for the Gaussian observation noise model
    if noise == 'gauss':
        Y -= SP.mean(Y, 0)

    # include unannotated sparse hidden factors
    if nHiddenSparse > 0:
        piSparse = SP.ones((Y.shape[1], nHiddenSparse)) * .01
        idxVar = SP.argsort(-Y.var(0))
        for iH in range(piSparse.shape[1]):
            idxOnH = SP.random.choice(idxVar[:100], 20, replace=False)
            piSparse[idxOnH, iH] = 0.99
        pi = SP.hstack([piSparse, pi])
        thiddenSparse = SP.repeat('hiddenSparse', nHiddenSparse)
        termsHiddenSparse = ['%s%s' % t
                             for t in zip(thiddenSparse, SP.arange(nHiddenSparse))]
        terms = SP.hstack([termsHiddenSparse, terms])
        num_terms += nHiddenSparse

    # include unannotated dense hidden factors
    thidden = SP.repeat('hidden', nHidden)
    termsHidden = ['%s%s' % t for t in zip(thidden, SP.arange(nHidden))]
    terms = SP.hstack([termsHidden, terms])
    pi = SP.hstack([SP.ones((Y.shape[1], nHidden)) * .99, pi])
    num_terms += nHidden

    # include known covariates as fixed factors
    if covariates is not None:
        if len(covariates.shape) == 1:
            covariates = covariates[:, SP.newaxis]
        nKnown = covariates.shape[1]
        pi = SP.hstack([SP.ones((Y.shape[1], nKnown)) * .99, pi])
        num_terms += nKnown
        tcovariates = SP.repeat('covariate', nKnown)
        termsCovariates = ['%s%s' % t
                           for t in zip(tcovariates, SP.arange(nKnown))]
        terms = SP.hstack([termsCovariates, terms])

    # mean (bias) term for non-Gaussian noise models
    if noise != 'gauss':
        terms = SP.hstack(['bias', terms])
        pi = SP.hstack([SP.ones((Y.shape[1], 1)) * (1. - 1e-10), pi])
        num_terms += 1

    # pre-train to establish the initial update order
    if do_preTrain == True:
        Ilabel = preTrain(Y, terms, pi, noise=noise, nFix=nFix,
                          priors=priors, covariates=covariates)
        pi = pi[:, Ilabel]
        terms = terms[Ilabel]

    init = {
        'init_data': CGauss(Y),
        'Pi': pi,
        'terms': terms,
        'noise': noise,
        'covariates': covariates,
        'dropFactors': dropFactors
    }
    if gene_ids is not None:
        gene_ids = SP.array(gene_ids)

    FA = slalom.CSparseFA(components=num_terms, idx_genes=idx_genes,
                          gene_ids=gene_ids, priors=priors, learnPi=learnPi)
    FA.saveInit = False
    FA.init(**init)

    return FA
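
# Minimal usage sketch for `initFA` (illustrative only: the matrix sizes,
# the annotation density and the iteration count are arbitrary, and in
# practice `Y`, `I` and `terms` would come from real data and a gene-set
# annotation; `SP` is the scipy alias used throughout this module). The
# `iterate` call mirrors the commented-out line in `preTrain` below.
#
#   N, G, K = 100, 500, 10
#   Y = SP.random.randn(N, G)                 # log-space expression
#   I = SP.random.rand(G, K) > 0.9            # toy annotation matrix
#   terms = SP.array(['term%d' % k for k in range(K)])
#   FA = initFA(Y, terms, I, noise='gauss', nHidden=3, minGenes=20)
#   FA.iterate(forceIterations=True, nIterations=100)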
def preTrain(Y, terms, P_I, noise='gauss', nFix=None, priors=None,
             covariates=None):
    """Pre-train the slalom factor analysis model.

    Helper function to pre-train the slalom factor analysis model in order
    to achieve faster convergence and obtain an initial update order. Called
    by `initFA`.

    Args:
        Y (array_like): Matrix of normalised count values of `N` cells
            and `G` variable genes in log-space.
            Dimension (:math:`N\\times G`).
        terms (vector_like): Names of `K` annotated gene sets.
            Dimension (:math:`K\\times 1`).
        P_I (array_like): Matrix specifying the likelihood of whether a gene
            is annotated to a specific factor.
            Dimension (:math:`G\\times K`).
        noise (str): Specifies the observation noise model. Should be either
            `'gauss'`, `'hurdle'` or `'poisson'`.
            Defaults to `'gauss'`.
        nFix (int): Number of terms which should be fixed and updated first.
            Defaults to `None`, in which case the unannotated factors (and
            any known covariates) are fixed and updated first.

    Returns:
        A vector containing the initial update order of the terms.
    """

    init_params = {}
    init_params['noise'] = noise
    init_params['iLatent'] = SP.where(terms == 'hidden')[0]
    init_params['iLatentSparse'] = SP.array([])  # SP.where(terms=='hiddenSparse')[0]
    if covariates is not None:
        init_params['Known'] = covariates
    learnPi = False

    pi = P_I.copy()
    K = pi.shape[1]

    # data for the sparseFA instance: saturate the prior probabilities
    pi[pi > .8] = 1 - 1e-100
    pi[pi < .2] = 1e-100

    init = {
        'init_data': CGauss(Y),
        'Pi': pi,
        'terms': terms,
        'noise': noise,
        'covariates': covariates
    }
    sigmaOff = 1E-3
    sparsity = 'VB'

    # prior on the noise level
    if priors is None:
        priors = {'Eps': {'priors': [1E-3, 1E-3]}}

    terms0 = terms
    pi0 = pi.copy()
    FA0 = slalom.CSparseFA(components=K, sigmaOff=sigmaOff,
                           sigmaOn=SP.ones(pi.shape[1]) * 1.0,
                           sparsity=sparsity, nIterations=50,
                           permutation_move=False, priors=priors,
                           initType='pcaRand', learnPi=learnPi)
    FA0.init(**init)
    if nFix is None:
        nFix = FA0.nKnown + FA0.nLatent

    # fit PCA and project the initial reconstruction onto PC1
    pca = PCA(n_components=1)  # ,svd_solver='full')
    pca.fit(FA0.Z.E1)
    X = pca.transform(FA0.Z.E1)

    # sort factors by their absolute correlation with PC1
    MPC = abs(vcorrcoef(FA0.S.E1[:, SP.argsort(FA0.W.Ilabel)].T, X.T))[nFix:]
    Ipi = SP.argsort(-MPC.ravel())
    IpiRev = SP.argsort(MPC.ravel())

    mRange = list(range(FA0.components))
    mRange[nFix:] = Ipi + nFix

    mRangeRev = list(range(FA0.components))
    mRangeRev[nFix:] = IpiRev + nFix

    # run the model for 50 iterations with factors ordered by decreasing
    # correlation with PC1
    pi = pi0[:, mRange]
    terms = terms0[mRange]
    init = {'init_data': CGauss(Y), 'Pi': pi, 'terms': terms, 'noise': noise}
    FA = slalom.CSparseFA(components=K, sigmaOff=sigmaOff,
                          sigmaOn=SP.ones(pi.shape[1]) * 1.0,
                          sparsity=sparsity, nIterations=50,
                          permutation_move=False, priors=priors,
                          initType='pcaRand', learnPi=learnPi)
    FA.shuffle = True
    FA.nScale = 30
    FA.init(**init)
    for j in range(50):
        FA.update()

    # run the model with the reverse ordering for 50 iterations
    pi = pi0[:, mRangeRev]
    terms = terms0[mRangeRev]
    init = {'init_data': CGauss(Y), 'Pi': pi, 'terms': terms, 'noise': noise}
    FArev = slalom.CSparseFA(components=K, sigmaOff=sigmaOff,
                             sigmaOn=SP.ones(pi.shape[1]) * 1.0,
                             sparsity=sparsity, nIterations=50,
                             permutation_move=False, priors=priors,
                             initType='pcaRand', learnPi=learnPi)
    FArev.shuffle = True
    FArev.nScale = 30
    FArev.init(**init)
    #FArev.iterate(forceIterations=True, nIterations=nIterations)
    for j in range(50):
        FArev.update()

    # rank the non-fixed terms by their average relevance (inverse ARD
    # precision) across the forward and reverse runs
    IpiM = (-(0.5 * (1. / FArev.Alpha.E1[SP.argsort(mRangeRev)][nFix:]) +
              .5 * (1. / FA.Alpha.E1[SP.argsort(mRange)][nFix:]))).argsort()

    Ilabel = SP.hstack([SP.arange(nFix), IpiM + nFix])

    return Ilabel
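
# The ordering above can be read as a standalone scoring rule: for each
# non-fixed factor, average 1/alpha (the expected scale of a factor under
# the ARD prior; small alpha means the factor stays active) across the
# forward and reverse runs, then update the most relevant factors first.
# A minimal sketch of just this step, with hypothetical alpha vectors:
#
#   alpha_fwd = SP.array([0.5, 10., 2.])   # forward run, original order
#   alpha_rev = SP.array([0.4, 8., 3.])    # reverse run, original order
#   score = 0.5 / alpha_fwd + 0.5 / alpha_rev
#   order = (-score).argsort()             # => factor 0 first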