def getNBPValue(mean0, var0, mean1, lower=False, log=False):
    """
    Use negative binomial to calculate p-value
    Reference:
    http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.nbinom.html#scipy.stats.nbinom
    """
    from scipy.stats import nbinom
    n = len(mean0)
    nb_p = [mean0[i] / var0[i] for i in range(n)]  # consistent with R
    nb_n0 = [mean0[i] * mean0[i] / (var0[i] - mean0[i]) for i in range(n)]
    # Clamp the size parameter to at least 1
    nb_n = [t if t >= 1 else 1 for t in nb_n0]

    if lower:
        if log:
            nb_p_low = nbinom.logcdf(mean1, nb_n, nb_p)
        else:
            nb_p_low = nbinom.cdf(mean1, nb_n, nb_p)
        return list(nb_p_low)
    else:
        if log:
            nb_p_low = nbinom.logsf(mean1, nb_n, nb_p)
        else:
            nb_p_low = nbinom.sf(mean1, nb_n, nb_p)
        return list(nb_p_low)
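# A minimal usage sketch for getNBPValue; the mean/variance/count values below are made up
# for illustration only. The moment formulas assume each control variance exceeds its mean
# (overdispersion); otherwise the n = mean^2 / (var - mean) step is not meaningful.
mean0 = [10.0, 25.0, 5.0]    # control means
var0 = [30.0, 60.0, 12.0]    # control variances (each larger than the corresponding mean)
mean1 = [40.0, 20.0, 30.0]   # observed treatment values to test against the control model

upper_pv = getNBPValue(mean0, var0, mean1)               # upper-tail p-values via nbinom.sf
lower_pv = getNBPValue(mean0, var0, mean1, lower=True)   # lower-tail p-values via nbinom.cdf
log_upper = getNBPValue(mean0, var0, mean1, log=True)    # log of the upper-tail p-values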
import numpy as np
from scipy.stats import nbinom


def nbinom_logsf(x, mu, dispersion):
    """Negative binomial log survival function, parameterised by mean and dispersion."""
    # Positions with zero mean would give degenerate parameters; assign them a floor of -100 instead
    res = -100.0 * np.ones(mu.shape, dtype=np.float64)
    p = dispersion / (dispersion + mu)
    res[mu > 0.0] = nbinom.logsf(x[mu > 0.0], dispersion, p[mu > 0.0])
    return res
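# Illustrative check of how nbinom_logsf's mean/dispersion parameterisation maps onto scipy's
# (n, p) convention; the arrays below are made-up example values.
mu = np.array([0.0, 5.0, 20.0])   # per-position means; the zero entry receives the -100 floor
x = np.array([3.0, 10.0, 15.0])   # observed counts
dispersion = 2.0                  # negative binomial dispersion (scipy's n parameter)

log_tail = nbinom_logsf(x, mu, dispersion)

# Where mu > 0 the result equals scipy's logsf with n = dispersion and p = dispersion / (dispersion + mu)
manual = nbinom.logsf(x[mu > 0], dispersion, dispersion / (dispersion + mu[mu > 0]))
assert np.allclose(log_tail[mu > 0], manual)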
import h5py
import numpy as np
from scipy.stats import nbinom

# StackData, PreloadSequencesForGene, GetModelIx, ComputeStatsForSite, EvaluateSite,
# get_max_position, emission_prob and diag_event_model are assumed to be provided by the
# surrounding package.


def GetSitesForGene(data):
    '''
    This function determines for each gene the score of the sites
    '''

    # Computing the probabilities for the current gene
    Sites, gene, nr_of_genes, gene_nr, seq_file, bck_file, EmissionParameters, TransitionParameters, TransitionTypeFirst, fg_state, merge_neighbouring_sites, minimal_site_length = data

    # Turn the Sequences and Background objects into dictionaries again so that the subsequent methods that use them do not need to be modified
    if len(Sites) == 0:
        return gene, []

    NrOfStates = EmissionParameters['NrOfStates']
    Sites = dict([(gene, Sites)])

    Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
    Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

    Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
    Background_per_gene = PreloadSequencesForGene(Background, gene)

    Ix = GetModelIx(Sequences_per_gene, Type='all')
    if np.sum(Ix) == 0:
        return gene, []

    if EmissionParameters['FilterSNPs']:
        Ix = GetModelIx(Sequences_per_gene, Type='no_snps_conv', snps_thresh=EmissionParameters['SnpRatio'], snps_min_cov=EmissionParameters['SnpAbs'], Background=Background_per_gene)
    else:
        Ix = GetModelIx(Sequences_per_gene, Type='Conv')

    # Only compute the emission probability for regions where a site is
    ix_sites = np.zeros_like(Ix)
    ix_sites_len = Ix.shape[0]
    for currsite in Sites[gene]:
        ix_sites[max(0, currsite[0] - 1): min(ix_sites_len, currsite[1] + 1)] = 1
    ix_sites = ix_sites == 1

    # 2) Compute the probabilities for both states
    EmmisionProbGene = np.log(np.ones((NrOfStates, Ix.shape[0])) * (1 / np.float64(NrOfStates)))

    CurrStackSum = StackData(Sequences_per_gene)
    CurrStackVar = StackData(Sequences_per_gene, add='no')
    CurrStackSumBck = StackData(Background_per_gene, add='only_cov')
    CurrStackVarSumm = StackData(Sequences_per_gene, add='only_var_summed')

    EmmisionProbGeneDir = np.zeros_like(EmmisionProbGene)

    if EmissionParameters['glm_weight'] < 0.0:
        weight1 = 1.0
        weight2 = 1.0
    elif EmissionParameters['glm_weight'] == 0.0:
        weight1 = 0.0000001
        weight2 = 1.0 - weight1
    elif EmissionParameters['glm_weight'] == 1.0:
        weight1 = 0.9999999
        weight2 = 1.0 - weight1
    else:
        weight1 = EmissionParameters['glm_weight']
        weight2 = (1.0 - EmissionParameters['glm_weight'])

    for State in range(NrOfStates):
        EmmisionProbGene[State, ix_sites] = np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSum[:, ix_sites], State, nr_of_genes, gene_nr, EmissionParameters)
        if EmissionParameters['BckType'] == 'Coverage':
            EmmisionProbGene[State, ix_sites] += np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSumBck[:, ix_sites], State, nr_of_genes, gene_nr, EmissionParameters, 'bg')
        if EmissionParameters['BckType'] == 'Coverage_bck':
            EmmisionProbGene[State, ix_sites] += np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSumBck[:, ix_sites], State, nr_of_genes, gene_nr, EmissionParameters, 'bg')
        EmmisionProbGeneDir[State, Ix] = np.log(weight2) + diag_event_model.pred_log_lik(CurrStackVar[:, Ix], State, EmissionParameters)
        EmmisionProbGene[State, Ix] += np.log(weight2) + EmmisionProbGeneDir[State, Ix]

    Counts = StackData(Sequences_per_gene, add='all')
    Score = EmmisionProbGene
    CurrStack = CurrStackVar

    # Compute the scores when staying in the same state
    # RowIx = list(range(16)) + list(range(17, 38)) + list(range(39, 44))
    strand = Sequences_per_gene['strand']

    # Get the coverages for the foreground and background
    CountsSeq = StackData(Sequences_per_gene, add='only_cov')
    CountsBck = StackData(Background_per_gene, add='only_cov')

    if strand == 0:
        strand = -1

    # Since the transition probability is the same for all states, we do not need to compute it for the Bayes factor

    # This list contains the returned sites
    sites = []
    for currsite in Sites[gene]:
        mean_mat_fg, var_mat_fg, mean_mat_bg, var_mat_bg, counts_fg, counts_bg = ComputeStatsForSite(CountsSeq, CountsBck, currsite, fg_state, nr_of_genes, gene_nr, EmissionParameters)

        site = {}
        site['Start'] = currsite[0]
        site['Stop'] = currsite[1]
        site['Strand'] = strand
        site['SiteScore'] = EvaluateSite(Score, currsite, fg_state)
        site['Coverage'] = np.sum(np.sum(Counts[:, site['Start']: site['Stop']], axis=0))
        site['Variants'] = np.sum(CurrStackVarSumm[:, site['Start']: site['Stop']], axis=1)
        site['mean_mat_fg'] = mean_mat_fg
        site['var_mat_fg'] = var_mat_fg
        site['mean_mat_bg'] = mean_mat_bg
        site['var_mat_bg'] = var_mat_bg
        site['counts_fg'] = counts_fg
        site['counts_bg'] = counts_bg
        # Moment matching to scipy's negative binomial: p = mean / var, n = mean^2 / (var - mean)
        p = mean_mat_fg / var_mat_fg
        n = (mean_mat_fg ** 2) / (var_mat_fg - mean_mat_fg)
        site['pv'] = nbinom.logsf(counts_fg, n, p)
        site['max_pos'] = get_max_position(Score, currsite, fg_state, strand)
        site['dir_score'] = EvaluateSite(EmmisionProbGeneDir, currsite, fg_state)
        if site['SiteScore'] < 0.0:
            continue
        sites.append(site)

    Sequences.close()
    Background.close()

    return gene, sites
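# The per-site p-value above converts an estimated foreground mean/variance into scipy's
# negative binomial parameters by moment matching. A small standalone sketch of that step;
# the numbers are illustrative only and the formulas assume var > mean.
import numpy as np
from scipy.stats import nbinom

mean_mat_fg = np.array([8.0, 15.0])    # estimated foreground means (illustrative)
var_mat_fg = np.array([20.0, 45.0])    # estimated foreground variances (must exceed the means)
counts_fg = np.array([30.0, 12.0])     # observed foreground counts

# For a negative binomial with parameters (n, p): mean = n * (1 - p) / p and var = n * (1 - p) / p**2,
# so p = mean / var and n = mean**2 / (var - mean).
p = mean_mat_fg / var_mat_fg
n = mean_mat_fg ** 2 / (var_mat_fg - mean_mat_fg)

log_pv = nbinom.logsf(counts_fg, n, p)  # log upper-tail probability of counts this extreme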
from scipy.stats import nbinom


def np_nbinom_logsf(x, mu, dispersion):
    """Negative binomial log survival function for the mean/dispersion parameterisation, without masking."""
    # Convert (mean, dispersion) to scipy's (n, p) parameterisation
    p = dispersion / (dispersion + mu)
    return nbinom.logsf(x, dispersion, p)
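# A quick illustrative check (values made up) that np_nbinom_logsf agrees with the masked
# nbinom_logsf variant above wherever the mean is positive.
import numpy as np

x = np.array([4.0, 9.0, 1.0])
mu = np.array([2.0, 6.0, 0.5])   # all positive, so the -100 floor never applies
dispersion = 1.5

assert np.allclose(np_nbinom_logsf(x, mu, dispersion), nbinom_logsf(x, mu, dispersion))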