def getNBPValue(mean0, var0, mean1, lower=False, log=False):
    """
    Use the negative binomial distribution to calculate a p-value.
    Reference:
    http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.nbinom.html#scipy.stats.nbinom
    """
    from scipy.stats import nbinom
    n = len(mean0)
    # Parameterization consistent with R: prob = mean/var, size = mean^2/(var - mean)
    nb_p = [mean0[i] / var0[i] for i in range(n)]
    nb_n0 = [mean0[i] * mean0[i] / (var0[i] - mean0[i]) for i in range(n)]
    # Clamp the size parameter to at least 1
    nb_n = [max(x, 1) for x in nb_n0]
    #
    if lower:
        if log:
            nb_p_low = nbinom.logcdf(mean1, nb_n, nb_p)
        else:
            nb_p_low = nbinom.cdf(mean1, nb_n, nb_p)
        return list(nb_p_low)
    else:
        if log:
            nb_p_low = nbinom.logsf(mean1, nb_n, nb_p)
        else:
            nb_p_low = nbinom.sf(mean1, nb_n, nb_p)
        return list(nb_p_low)
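A minimal usage sketch (the input values below are hypothetical; each var0 entry must exceed the corresponding mean0 entry so the size parameter stays positive):

# Hypothetical control means/variances and observed treatment means
mean0 = [10.0, 25.0, 40.0]
var0 = [30.0, 50.0, 120.0]
mean1 = [35.0, 20.0, 90.0]

pvals = getNBPValue(mean0, var0, mean1)                 # upper tail: P(X > mean1)
log_pvals = getNBPValue(mean0, var0, mean1, log=True)   # same, on the log scale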
Example #2
import numpy as np
from scipy.stats import nbinom


def nbinom_logsf(x, mu, dispersion):

    # Entries with zero mu keep a placeholder log survival value of -100
    res = -100.0 * np.ones(mu.shape, dtype=np.float64)

    # Convert (mu, dispersion) to scipy's (n, p) parameterization
    p = dispersion / (dispersion + mu)

    res[mu > 0.0] = nbinom.logsf(x[mu > 0.0], dispersion, p[mu > 0.0])

    return res
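A minimal call sketch, assuming x and mu are NumPy arrays of equal shape and dispersion is a scalar size parameter (these assumptions are mine, not stated in the snippet):

x = np.array([5.0, 0.0, 12.0])
mu = np.array([2.0, 0.0, 3.0])     # the zero-mean entry keeps the -100 placeholder
dispersion = 1.5

log_tail = nbinom_logsf(x, mu, dispersion)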
Example #3
def getNBPValue(mean0,var0,mean1, lower=False,log=False):
  """
  Use negative binomial to calculate p-value
  Reference:
  http://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.nbinom.html#scipy.stats.nbinom
  """
  from scipy.stats import nbinom
  n=len(mean0)
  nb_p=[mean0[i]/var0[i] for i in range(n)]  # consistent with R
  nb_n0=[mean0[i]*mean0[i]/(var0[i]-mean0[i]) for i in range(n)]
  nb_n=[ (lambda t: t if t>=1 else 1)(x) for x in nb_n0]
  #
  if lower==True:
    if log==False:
      nb_p_low=nbinom.cdf(mean1,nb_n,nb_p)
    else:
      nb_p_low=nbinom.logcdf(mean1,nb_n,nb_p)
    return list(nb_p_low)
  else:
    if log==False:
      nb_p_low=nbinom.sf(mean1,nb_n,nb_p)
    else:
      nb_p_low=nbinom.logsf(mean1,nb_n,nb_p)
    return list(nb_p_low)
Example #4
def GetSitesForGene(data):
    '''
    This function determines for each gene the score of the sites
    '''

    #Computing the probabilities for the current gene

    Sites, gene, nr_of_genes, gene_nr, seq_file, bck_file, EmissionParameters, TransitionParameters, TransitionTypeFirst, fg_state, merge_neighbouring_sites, minimal_site_length = data
    #Turn the Sequences and Background objects into dictionaries again so that the subsequent methods that use them do not need to be modified
    if len(Sites) == 0:
        return gene, []

    NrOfStates = EmissionParameters['NrOfStates']
    
    Sites = dict([(gene, Sites)])

    Sequences = h5py.File(EmissionParameters['DataOutFile_seq'], 'r')
    Background = h5py.File(EmissionParameters['DataOutFile_bck'], 'r')

    Sequences_per_gene = PreloadSequencesForGene(Sequences, gene)
    Background_per_gene = PreloadSequencesForGene(Background, gene)

    Ix = GetModelIx(Sequences_per_gene, Type='all')

    if np.sum(Ix) == 0:
        return gene, []

    if EmissionParameters['FilterSNPs']:
        Ix = GetModelIx(Sequences_per_gene, Type='no_snps_conv', snps_thresh=EmissionParameters['SnpRatio'], snps_min_cov=EmissionParameters['SnpAbs'], Background=Background_per_gene)
    else:
        Ix = GetModelIx(Sequences_per_gene, Type='Conv')

    #Only compute the emission probability for regions where a site is present
    ix_sites = np.zeros_like(Ix)
    ix_sites_len = Ix.shape[0]
    for currsite in Sites[gene]:
        ix_sites[max(0, currsite[0] - 1) : min(ix_sites_len, currsite[1] + 1)] = 1
    ix_sites = ix_sites == 1

    #2) Compute the probabilities for both states
    EmmisionProbGene = np.log(np.ones((NrOfStates, Ix.shape[0])) * (1 / np.float64(NrOfStates)))
    CurrStackSum = StackData(Sequences_per_gene)
    CurrStackVar = StackData(Sequences_per_gene, add = 'no')
    CurrStackSumBck = StackData(Background_per_gene, add = 'only_cov')

    CurrStackVarSumm = StackData(Sequences_per_gene, add = 'only_var_summed')
    EmmisionProbGeneDir = np.zeros_like(EmmisionProbGene)

    if EmissionParameters['glm_weight'] < 0.0:
        weight1 = 1.0
        weight2 = 1.0
    elif EmissionParameters['glm_weight'] == 0.0:
        weight1 = 0.0000001
        weight2 = 1.0 - weight1 
    elif EmissionParameters['glm_weight'] == 1.0:
        weight1 = 0.9999999
        weight2 = 1.0 - weight1 
    else:
        weight1 = EmissionParameters['glm_weight'] 
        weight2 = (1.0 - EmissionParameters['glm_weight']) 

        
    for State in range(NrOfStates):
        EmmisionProbGene[State, ix_sites] = np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSum[:, ix_sites], State, nr_of_genes, gene_nr, EmissionParameters)
        if EmissionParameters['BckType'] == 'Coverage':
            EmmisionProbGene[State, ix_sites] += np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSumBck[:, ix_sites], State, nr_of_genes, gene_nr, EmissionParameters, 'bg')
        if EmissionParameters['BckType'] == 'Coverage_bck':
            EmmisionProbGene[State, ix_sites] += np.log(weight1) + emission_prob.predict_expression_log_likelihood_for_gene(CurrStackSumBck[:, ix_sites], State, nr_of_genes, gene_nr, EmissionParameters, 'bg')
        EmmisionProbGeneDir[State, Ix] = np.log(weight2) + diag_event_model.pred_log_lik(CurrStackVar[:, Ix], State, EmissionParameters)
        EmmisionProbGene[State, Ix] += np.log(weight2) + EmmisionProbGeneDir[State, Ix]

    Counts = StackData(Sequences_per_gene, add = 'all')
    

    Score = EmmisionProbGene
    CurrStack = CurrStackVar
    #Compute the scores when staying in the same state
    #RowIx = list(range(16)) + list(range(17, 38)) + list(range(39,44))
    strand = Sequences_per_gene['strand']

    #Get the coverages for the foreground and background
    CountsSeq = StackData(Sequences_per_gene, add = 'only_cov')
    CountsBck = StackData(Background_per_gene, add = 'only_cov')

    if strand == 0:
        strand = -1
    #Since the transition probability is the same for all states, we do not need to compute it for the Bayes factor
    #this list contains the returned sites
    sites = []
    for currsite in Sites[gene]:
        mean_mat_fg, var_mat_fg, mean_mat_bg, var_mat_bg, counts_fg, counts_bg = ComputeStatsForSite(CountsSeq, CountsBck, currsite, fg_state, nr_of_genes, gene_nr, EmissionParameters)

        site = {}
        site['Start'] = currsite[0]
        site['Stop'] = currsite[1]
        site['Strand'] = strand
        site['SiteScore'] = EvaluateSite(Score, currsite, fg_state)
        site['Coverage'] = np.sum(np.sum(Counts[:, site['Start'] : site['Stop']], axis=0))
        site['Variants'] = np.sum(CurrStackVarSumm[:, site['Start'] : site['Stop']], axis=1)
        site['mean_mat_fg'] = mean_mat_fg
        site['var_mat_fg'] = var_mat_fg
        site['mean_mat_bg'] = mean_mat_bg
        site['var_mat_bg'] = var_mat_bg
        site['counts_fg'] = counts_fg
        site['counts_bg'] = counts_bg

        p = mean_mat_fg / var_mat_fg
        n = (mean_mat_fg ** 2) / (var_mat_fg - mean_mat_fg)
        site['pv'] = nbinom.logsf(counts_fg, n, p)
        site['max_pos'] = get_max_position(Score, currsite, fg_state, strand)
        site['dir_score'] = EvaluateSite(EmmisionProbGeneDir, currsite, fg_state)
        if site['SiteScore'] < 0.0:
            continue
        sites.append(site)

    Sequences.close()
    Background.close()

    return gene, sites
Example #5
from scipy.stats import nbinom


def np_nbinom_logsf(x, mu, dispersion):

    # Convert (mu, dispersion) to scipy's (n, p) parameterization and take the log survival function
    p = dispersion / (dispersion + mu)
    return nbinom.logsf(x, dispersion, p)
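For context, this mu/dispersion form and the mean/variance form used in getNBPValue describe the same distribution: with size = dispersion and p = dispersion / (dispersion + mu), the variance works out to mu + mu**2 / dispersion. A quick consistency check with hypothetical numbers:

import numpy as np

mu, dispersion = 8.0, 2.0
var = mu + mu ** 2 / dispersion                                # NB mean/variance relation
assert np.isclose(dispersion / (dispersion + mu), mu / var)    # same prob parameter
assert np.isclose(mu ** 2 / (var - mu), dispersion)            # same size parameter
print(np_nbinom_logsf(12, mu, dispersion))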