Example #1
def _significance_direct(n_on, mu_bkg):
    """Compute significance directly via Poisson probability.

    Use this method for small ``n_on < 10``.
    In this case the Li & Ma formula isn't correct any more.

    TODO: add large unit test coverage (where is it numerically precise enough)?
    TODO: check coverage with MC simulation

    Note: this gives a positive significance for zero observed counts and
    small ``mu_bkg`` because ``poisson.sf(n_on, mu_bkg)`` is the exclusive
    tail probability P(X > n_on); the inclusive tail P(X >= n_on) would
    require ``poisson.sf(n_on - 1, mu_bkg)``.

    >>> stats.poisson._significance_direct(0, 2)
    -1.1015196284987503

    >>> stats.poisson._significance_direct(0, 0.1)
    1.309617799458493

    """
    from scipy.stats import norm, poisson

    # Compute tail probability to see n_on or more counts
    probability = poisson.sf(n_on, mu_bkg)

    # Convert probability to a significance
    significance = norm.isf(probability)

    return significance
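A minimal sketch (not part of the project above) making the note in the docstring concrete: ``poisson.sf(k, mu)`` is the exclusive tail P(X > k), which is why zero observed counts can still yield a positive significance. The inputs mirror the second doctest.

from scipy.stats import norm, poisson

n_on, mu_bkg = 0, 0.1

# Exclusive tail: P(X > 0) = 1 - exp(-0.1) ~ 0.095, which maps to a positive z-value.
p_exclusive = poisson.sf(n_on, mu_bkg)

# Inclusive tail: P(X >= 0) = 1, which maps to -inf, i.e. no evidence for a signal.
p_inclusive = poisson.sf(n_on - 1, mu_bkg)

print(norm.isf(p_exclusive))   # ~ 1.31, matching the doctest above
print(norm.isf(p_inclusive))   # -inf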
Example #2
def _error(value):
    '''Construct frequentist errors using the Poisson distribution.'''
    # up error: smallest lambda for which P(n<=nobs|lambda) < (1-0.68268...)/2 = 0.15865...
    # down error: largest lambda for which P(n>=nobs|lambda) < (1-0.68268...)/2 = 0.15865...
    lambda_up, lambda_down, step_size = 1.1 * value, 0.9 * value, float(value) / 10
    if value == 0:
        return (0, 1.8410216450100005)  # save time with a precomputed value
    if value < 1:
        lambda_up, lambda_down, step_size = 1.8, 0.0, 0.1
    for i in range(5):
        lambda_up -= step_size
        lambda_down += step_size
        step_size /= 10
        while poisson.cdf(value, lambda_up) > 0.15865525393145705:
            lambda_up += step_size
        while poisson.sf(value - 1, lambda_down) > 0.15865525393145705:
            lambda_down -= step_size
    return (value - lambda_down, lambda_up - value)
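A quick check (not part of the original module) of the precomputed upper error returned for ``value == 0``: with zero observed counts P(n <= 0 | lambda) = exp(-lambda), so the quoted 1.8410216450100005 sits essentially at the 0.15865525... boundary.

from scipy.stats import poisson

lam_up = 1.8410216450100005
print(poisson.cdf(0, lam_up))          # ~ 0.1586552539..., essentially at the threshold
print(poisson.cdf(0, lam_up - 1e-4))   # slightly above the threshold, so the search would continue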
Example #3
def _significance_direct(n_on, mu_bkg):
    """Compute significance directly via Poisson probability.

    Reference: TODO (is this ever used?)
    """
    from scipy.stats import norm, poisson

    # Compute tail probability to see n_on or more counts.
    # Note that we're using ``k = n_on - 1`` to get the probability
    # for n_on or more counts, because ``poisson.sf(k)`` returns the
    # probability for more than k counts, with k itself excluded.
    # For ``n_on = 0`` this returns ``probability = 1``.
    probability = poisson.sf(n_on - 1, mu_bkg)

    # Convert probability to a significance
    return norm.isf(probability)
Example #4
    def find_phase_range(periodic_blocks, phases, prob_2peak=0.01):
        """ xx and yy are the bayesian block decomposition of
            a pulsar light curve. 
            
            The off pulse phase range is defined 
            as the lowest block with 10% removed from either
            side. """
        xx = periodic_blocks.xx
        yy = periodic_blocks.yy

        ranges = [PhaseRange(a, b) for a, b in zip(xx[0::2], xx[1::2])]

        heights = yy[::2]

        if np.allclose(heights[0], heights[-1]):
            ranges[0] += ranges.pop(-1)
            heights = heights[:-1]

        sorted = np.argsort(heights)
        min_phase = ranges[sorted[0]]

        if len(sorted) < 3:
            # if only 2 blocks, no need to merge
            phase = min_phase
        else:

            second_min_phase = ranges[sorted[1]]

            ncounts = len([p for p in phases if p in min_phase])
            second_ncounts = len([p for p in phases if p in second_min_phase])

            predicted_second_counts = ncounts * second_min_phase.phase_fraction / min_phase.phase_fraction

            prob = poisson.sf(second_ncounts, predicted_second_counts)
            print "Probability of there being a second peak from %s is %s" % (str(second_min_phase), prob)
            region_too_small = second_min_phase.phase_fraction < 0.5 * min_phase.phase_fraction
            height_too_different = prob < prob_2peak

            if height_too_different or region_too_small:
                if height_too_different:
                    print "Rejecting second peak - heights are inconsistent"
                if region_too_small:
                    print "Rejecting second peak - region too small"
                phase = min_phase
            else:
                print "Adding second peak!"
                phase = min_phase + second_min_phase

        return phase.trim(fraction=0.1)
Example #5
def _significance_direct(n_observed, mu_background):
    """Compute significance directly via Poisson probability.

    Use this method for small n_observed < 10.
    In this case the Li & Ma formula isn't correct any more.

    TODO: add large unit test coverage (where is it numerically precise enough)?
    TODO: check coverage with MC simulation
    """
    from scipy.stats import norm, poisson

    # Compute tail probability to see n_on or more counts
    probability = poisson.sf(n_observed, mu_background)

    # Convert probability to a significance
    significance = norm.isf(probability)

    return significance
Example #7
def prob_return(mu_return):
    """
    p[s, s'] = the probability of transitioning from state s to state s' after cars have been returned.

    Notes:
        Car returns are bounded by max_evening (=20).
        The probabilities are given by the Poisson distribution with mu = mu_return.
    """
    prob = np.zeros((nM, nE))
    for afternoon in range(nM):
        for returned in range(nE - afternoon):
            evening = afternoon + returned
            prob[afternoon, evening] = poisson.pmf(returned, mu_return)
        # Excess returns beyond what can be kept are captured by the survival function (== 1 - CDF)
        assert evening == max_evening
        prob[afternoon, evening] += poisson.sf(evening - afternoon, mu_return)
    assert np.isclose(prob.sum(axis=1), 1).all()
    return prob
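A self-contained sketch of the same truncation idea; ``nM``, ``nE`` and ``max_evening`` are module-level constants in the original, so the sizes below are assumptions for illustration. Folding the excess mass into the last reachable state is what makes each row sum to one.

import numpy as np
from scipy.stats import poisson

max_evening = 20               # assumed capacity, mirroring the docstring above
nM = nE = max_evening + 1

def prob_return_sketch(mu_return):
    prob = np.zeros((nM, nE))
    for afternoon in range(nM):
        for returned in range(nE - afternoon):
            prob[afternoon, afternoon + returned] = poisson.pmf(returned, mu_return)
        # Returns that would overflow the lot are credited to the capped last state.
        prob[afternoon, max_evening] += poisson.sf(max_evening - afternoon, mu_return)
    return prob

assert np.allclose(prob_return_sketch(3.0).sum(axis=1), 1.0)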
Example #8
def prob_request(mu_request):
    """
    p[r, s, s'] = the probability of fulfilling r rental requests and transitioning from state s to state s'.

    Notes:
        Rental requests are bounded by the state s.
        The probabilities are given by the Poisson distribution with mu = mu_request.
    """
    prob = np.zeros((nM, nM, nM))
    for morning in range(nM):
        for rented in range(morning + 1):
            afternoon = morning - rented
            prob[rented, morning, afternoon] = poisson.pmf(rented, mu_request)
        # Excess requests beyond what's available are captured by the survival function (== 1 - CDF)
        assert afternoon == 0
        prob[morning, morning, afternoon] += poisson.sf(morning, mu_request)
    assert np.isclose(prob.sum(axis=(0, 2)), 1).all()
    return prob
Example #9
def cost_function(mu, n, target):
    """
    Calculates the squared distance between the
    survival function (1 - cdf) of a Poisson
    distribution with rate mu, evaluated at n
    observed events, and the target value.

    Arguments:
        mu {float} -- Event rate, a.k.a. rate parameter
        n {int} -- Number of observed events
        target {float} -- Estimated/desired survival rate

    Returns:
        float -- Squared distance between the survival rate
        and the target
    """

    return square(poisson.sf(n, mu) - target)
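A hedged usage sketch (the bracket and target below are arbitrary): because the cost is a squared distance, the rate whose upper tail beyond n matches the target can be recovered with a bounded scalar minimiser.

from numpy import square
from scipy.optimize import minimize_scalar
from scipy.stats import poisson

def cost_function(mu, n, target):
    # restated from the example above so the sketch is self-contained
    return square(poisson.sf(n, mu) - target)

res = minimize_scalar(cost_function, bounds=(1e-6, 50), args=(5, 0.1), method='bounded')
print(res.x, poisson.sf(5, res.x))   # recovered mu and its survival probability (~0.1)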
Example #10
def test_region(
    insertions,  # type: List[Insertion]
    reference_seq,  # type: pyfaidx.Fasta
    region,  # type: Tuple[str, int, int]
    pattern=None,  # type: Optional[str]
    intervals=None,  # type: Optional[Iterable[Tuple[str, int, int]]]
    total=None,  # type: Optional[int]
    filters=None,  # type: Optional[List[Callable]]
    insertion_trees=None  # type: GenomicIntervalTree
):  # type: (...) -> float
    """Tests a given genomic region for enrichment in insertions."""

    if total is None:
        total = count_total(reference_seq,
                            pattern=pattern,
                            intervals=intervals)

    # Count pattern in region.
    region_count = count_region(reference_seq, region=region, pattern=pattern)

    # Sub-select insertions for region.
    if insertion_trees is None:
        insertion_trees = GenomicIntervalTree.from_objects_position(
            insertions, chrom_attr='seqname')

    region_ins = set(interval[2]
                     for interval in insertion_trees.search(*region))

    # Apply additional filter functions to insertions if given
    # (such as filtering on gene name/id for example).
    if filters is not None:
        for filter_func in filters:
            region_ins = set(ins for ins in region_ins if filter_func(ins))

    # Calculate p-value.
    x = len(list(region_ins))
    mu = len(insertions) * (region_count / total)

    # Note here we use loc=1, because we are interested in
    # calculating P(X >= x), not P(X > x) (the default
    # survival function).
    p_val = poisson.sf(x, mu=mu, loc=1)  # type: float

    return p_val
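A quick check (not from the project) of the ``loc=1`` trick in the comment above: shifting the distribution by one turns the exclusive tail into the inclusive one, so it matches ``poisson.sf(x - 1, mu)``.

import numpy as np
from scipy.stats import poisson

x, mu = 7, 3.2   # arbitrary values
assert np.isclose(poisson.sf(x, mu, loc=1), poisson.sf(x - 1, mu))   # both equal P(X >= x)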
Example #11
    def __init__(self):
        # dynamics of the MDP process for Jack-Car rental Problem
        self.number_of_locations = 3
        self.rental_credit = 10
        self.expected_rental_requests = [3, 2, 2]
        self.expected_rental_returns = [3, 1, 1]
        self.capacity = [19, 9, 9]
        self.max_car_moved = 5
        self.gamma = 0.9
        self.cost_of_moving = [2, 0, 2]

        # available actions : actions can be accessed through the index
        self.actions = self.generate_actions()

        # available states : available states can be accessed through the index
        self.states = [
            i for i in itertools.product(range(self.capacity[0] +
                                               1), range(self.capacity[1] + 1),
                                         range(self.capacity[2] + 1))
        ]

        # initializing the values of the states
        self.V = np.zeros(tuple(np.array(self.capacity) + 1), dtype=float)

        # initializing the policy array
        self.policy = np.zeros(tuple(np.array(self.capacity) + 1),
                               dtype=int)

        # poisson precompute
        self.poisson_pmf = dict()
        self.poisson_sf = dict()

        for n, lam in itertools.product(
                range(-1,
                      max(self.capacity) + 1),
                range(
                    max(self.expected_rental_requests +
                        self.expected_rental_returns) + 1)):
            self.poisson_pmf[(n, lam)] = poisson.pmf(n, lam)
            self.poisson_sf[(n, lam)] = poisson.sf(n, lam)

        # printing the dynamics
        self.print_dynamics()
Example #12
    def consistent(x,xphase,y,yphase,probability = 0.05, quiet=True):
        """ Assuming x counts are observed in a phase range xphase
            and y counts are observed in phase range yphase,
            decides if the regions are consistent.
            
            The regions are consistent if the probability of obtaining
            as many or more counts in the second region, given the counts
            in the first region, is > 5% (so there is a > 5% probability
            that the second region is not unusually large). """
            
        y_predicted = x*(yphase/xphase)
            
        poisson_likelihood = poisson.sf(y,y_predicted)

        if not quiet:
            print 'poisson likelihood=%.2f' % poisson_likelihood

        if poisson_likelihood > probability: return True
        return False
Example #13
def test_region(
        insertions,  # type: List[Insertion]
        reference_seq,  # type: pyfaidx.Fasta
        region,  # type: Tuple[str, int, int]
        pattern=None,  # type: Optional[str]
        intervals=None,  # type: Optional[Iterable[Tuple[str, int, int]]]
        total=None,  # type: Optional[int]
        filters=None,  # type: Optional[List[Callable]]
        insertion_trees=None  # type: GenomicIntervalTree
):  # type: (...) -> float
    """Tests a given genomic region for enrichment in insertions."""

    if total is None:
        total = count_total(
            reference_seq, pattern=pattern, intervals=intervals)

    # Count pattern in region.
    region_count = count_region(reference_seq, region=region, pattern=pattern)

    # Sub-select insertions for region.
    if insertion_trees is None:
        insertion_trees = GenomicIntervalTree.from_objects_position(
            insertions, chrom_attr='seqname')

    region_ins = set(interval[2]
                     for interval in insertion_trees.search(*region))

    # Apply additional filter functions to insertions if given
    # (such as filtering on gene name/id for example).
    if filters is not None:
        for filter_func in filters:
            region_ins = set(ins for ins in region_ins if filter_func(ins))

    # Calculate p-value.
    x = len(list(region_ins))
    mu = len(insertions) * (region_count / total)

    # Note here we use loc=1, because we are interested in
    # calculating P(X >= x), not P(X > x) (the default
    # survival function).
    p_val = poisson.sf(x, mu=mu, loc=1)  # type: float

    return p_val
Example #14
File: fdr.py Project: daler/epic
def compute_fdr(df, total_chip_reads, total_input_reads, args):

    df.to_csv("for_fdr_test.csv", sep=" ")
    print("total_chip_reads", total_chip_reads)
    print("total_input_reads", total_input_reads)

    total_island_input_reads = df.Input.sum()

    # Hack needed in case we run on test data
    # TODO: why does SICER not need this? Different genome versions?
    # run with FDR=1 on original SICER and get this island:
    # chr7    61606400        61606799        3       2       0.167427550906  1.40719467956   0.1674275 50906
    # does it not show up with epic.
    if total_island_input_reads == 0:
        total_island_input_reads = 2

    scaling_factor = (total_chip_reads * 1.0) / total_input_reads

    effective_genome_size = get_effective_genome_length(args.genome)
    zero_controls_multiplier = total_input_reads * 1.0 / effective_genome_size

    avg_0_denom = (df.End - df.Start + 1) * zero_controls_multiplier
    avg_0_denom[avg_0_denom > 0.25] = 0.25
    avg_0_denom = avg_0_denom * scaling_factor

    avg = df.Input * scaling_factor
    avg[df.Input == 0] = avg_0_denom[df.Input == 0]

    df.P_value = poisson.sf(df.ChIP, avg)
    no_differential_expression = df.ChIP <= avg
    df.loc[no_differential_expression, "P_value"] = 1

    df.Fold_change = df.ChIP / avg

    ranked_p_values = rankdata(df.P_value)
    df.FDR_value = df.P_value * len(df) / ranked_p_values
    fdr_too_high = df.FDR_value > 1
    df.loc[fdr_too_high, "FDR_value"] = 1

    df = df[df.FDR_value < args.false_discovery_rate_cutoff]

    return df
Example #15
    def event_significance(self, nevents=10, rank_fcn=None):
        """
		Calculate the Poissonian significance of the 'on source' trial set for up to the loudest nevents.
		"""
        if rank_fcn is None:
            rank_fcn = lambda e: e.snr

        offtime = float(abs(segments.segmentlist(self.offsource.keys())))
        offsource = sorted(chain(*self.offsource.values()),
                           key=lambda sb: -sb.snr)
        offrate = zip(offsource,
                      map(lambda i: i / offtime, range(1,
                                                       len(offsource) + 1)))
        offrate = offrate[::-1]
        offsource = offsource[::-1]
        offsnr = map(rank_fcn, offsource)

        ontime = float(abs(segments.segmentlist(self.onsource.keys())))
        if ontime == 0:
            return []
        onsource = sorted(chain(*self.onsource.values()),
                          key=lambda sb: -sb.snr)
        onsnr = map(rank_fcn, onsource)
        onrate = []
        for snr in onsnr:
            try:
                onrate.append(offrate[bisect_left(offsnr, snr)][1])
            except IndexError:  # on SNR > max off SNR
                onrate.append(0)

        onsource_sig = []
        for i, sb in enumerate(onsource[:nevents]):
            # From Gaussian
            #exp_num = chi2.cdf(sb.chisq_dof, sb.snr)*len(onsource)
            # From off-source
            exp_num = onrate[i] * ontime
            # FIXME: requires scipy >= 0.10
            #onsource_sig.append([sb.snr, -poisson.logsf(i, exp_num)])
            onsource_sig.append(
                [rank_fcn(sb), -numpy.log(poisson.sf(i, exp_num))])

        return onsource_sig
Example #16
 def pval_calculator(self, v):
     """
     Calculate the p-value for the number of V-motifs between two vertices.

     :param v: a list containing the index of the first vertex, the index of the second vertex, and the number of V-motifs between them.
     :returns: a list containing the index of the first vertex, the index of the second vertex, and the corresponding p-value.
     """
     i = v[0]
     j = v[1]
     if self.method == 'poisson':
         if self.light_mode:
             avg_v = np.sum(
                 v_probs_from_fitnesses(self.x[i], self.x[j], self.y))
         else:
             avg_v = self.avg_v_mat[i, j]
         return i, j, poisson.sf(k=v[2] - 1, mu=avg_v)
     elif self.method == 'normal':
         if self.light_mode:
             probs = v_probs_from_fitnesses(self.x[i], self.x[j], self.y)
         else:
             probs = self.avg_mat[i] * self.avg_mat[j]
         avg_v = np.sum(probs)
         sigma_v = np.sqrt(np.sum(probs * (1 - probs)))
         return i, j, norm.cdf((v[2] + 0.5 - avg_v) / sigma_v)
     elif self.method == 'rna':
         if self.light_mode:
             probs = v_probs_from_fitnesses(self.x[i], self.x[j], self.y)
         else:
             probs = self.avg_mat[i] * self.avg_mat[j]
         avg_v = np.sum(probs)
         var_v_arr = probs * (1 - probs)
         sigma_v = np.sqrt(np.sum(var_v_arr))
         gamma_v = (sigma_v**(-3)) * np.sum(var_v_arr * (1 - 2 * probs))
         eval_x = (v[2] + 0.5 - avg_v) / sigma_v
         pval_temp = norm.cdf(
             eval_x) + gamma_v * (1 - eval_x**2) * norm.pdf(eval_x) / 6
         if pval_temp < 0:
             return i, j, 0
         elif pval_temp > 1:
             return i, j, 1
         else:
             return i, j, pval_temp
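A side note on the 'poisson' versus Gaussian branches above (the values below are arbitrary, not from the library): for a reasonably large expected count the exact Poisson tail and a continuity-corrected normal tail agree closely.

import numpy as np
from scipy.stats import norm, poisson

mu, k = 40.0, 52
exact = poisson.sf(k - 1, mu)                    # P(X >= k)
approx = norm.sf((k - 0.5 - mu) / np.sqrt(mu))   # continuity-corrected Gaussian tail
print(exact, approx)                             # close for a mean this large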
Example #17
def glscore_V3(sids, otuID, otutab):
    '''Write something informative here'''
    sids = dropNA(sids)
    coverages = [int(otutab.getSampleCoverage(sid)) for sid in sids]
    abnds = [int(otutab.getOTUabundance(otuID, sid)) for sid in sids]
    t0expAbnd = average([(float(abnds[i]) / coverages[i]) * coverages[0]
                         for i in range(1, len(sids))
                         ])  # `max' instead of `average'?
    tTexpAbnds = [(float(abnds[0]) / coverages[0]) * coverages[i]
                  for i in range(1, len(sids))]
    if abnds[0] == 0:
        lsc = 0
        # If we don't expect to see much, if any, of the OTU at t0 due
        # to low sample coverage, then we should penalise the gain score
        penalty = poisson.sf(0, t0expAbnd)  # ceiling(t0expAbnd, sigC) / sigC
        # If the OTU isn't consistently there, then it could be
        # an OTU that fluctuates naturally and wasn't gained
        prss = [
            presence_sc(sids[i], sids[i + 1], otuID, otutab)
            for i in range(1,
                           len(sids) - 1)
        ]
        prss = [prss[i] * prss[i + 1] for i in range(len(prss) - 1)]
        prs = average(prss)
        gsc = prs * penalty
    if abnds[0] > 0:
        gsc = 0
        # I want to penalise the loss scores of anything that could
        # have been lost because of uneven sample coverage
        lscs = [
            loss_sc(abnds[i + 1], tTexpAbnds[i])
            for i in range(len(tTexpAbnds))
        ]
        lscs = [lscs[i] * lscs[i + 1] for i in range(len(lscs) - 1)]
        lsc = average(lscs)
    output = add_block(sids) + add_block(coverages) + add_block(abnds) \
             + add_block([t0expAbnd] + tTexpAbnds)
    output.append('')
    output.append('{0:.5f}'.format(gsc))
    output.append('{0:.5f}'.format(lsc))
    return output
Example #18
File: fdr.py Project: endrebak/epic
def compute_fdr(df, total_chip_reads, total_input_reads, args):
    # type: (pd.DataFrame, int, int, Namespace) -> pd.DataFrame

    total_island_input_reads = df.Input.sum()

    # Hack needed in case we run on test data
    # TODO: why does SICER not need this? Different genome versions?
    # run with FDR=1 on original SICER and get this island:
    # chr7    61606400        61606799        3       2       0.167427550906  1.40719467956   0.1674275 50906
    # does it not show up with epic?
    if total_island_input_reads == 0:
        total_island_input_reads = 2

    scaling_factor = (total_chip_reads * 1.0) / total_input_reads

    zero_controls_multiplier = total_input_reads * 1.0 / args.effective_genome_fraction

    avg_0_denom = (df.End - df.Start + 1) * zero_controls_multiplier
    avg_0_denom[avg_0_denom > 0.25] = 0.25
    avg_0_denom = avg_0_denom * scaling_factor

    avg = df.Input * scaling_factor
    avg[df.Input == 0] = avg_0_denom[df.Input == 0]

    fold_change = df.ChIP / avg
    log2FC = log2(fold_change)
    df.insert(len(df.columns), "Log2FC", log2FC)

    p_vals = pd.Series(poisson.sf(df.ChIP, avg), index=df.index)

    p_vals[df.ChIP <= avg] = 1
    df.insert(len(df.columns), "P", p_vals)

    ranked_p_values = rankdata(p_vals)
    fdr = p_vals * len(df) / ranked_p_values
    fdr[fdr > 1] = 1
    df.insert(len(df.columns), "FDR", fdr)

    df = df[df.FDR < args.false_discovery_rate_cutoff]

    return df
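The FDR column above is a Benjamini-Hochberg style scaling written with rankdata; a standalone sketch of just that step on toy p-values (note that, like compute_fdr, it skips the usual step-up monotonicity pass over the sorted values).

import numpy as np
from scipy.stats import rankdata

p_vals = np.array([0.001, 0.04, 0.03, 0.20, 0.8])
fdr = p_vals * len(p_vals) / rankdata(p_vals)   # p * m / rank, as in compute_fdr
fdr[fdr > 1] = 1
print(fdr)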
Example #20
def get_pvals_chunk(counts_series_lchunk):
    """
    Parameters:
    -----------
    counts_series_lchunk : pd.Series(int)
        Series of raw pixel counts where the name of the Series
        is pd.Interval of the lambda-bin where the pixel belong.
        I.e. counts_series_lchunk.name.right - is the upper limit of the chunk
        and is used as "expected" in Poisson distribution to estimate p-value.

    Returns:
    --------
    pvals: ndarray[float]
        array of p-values for each pixel

    Notes:
    ------
    poisson.sf = 1.0 - poisson.cdf
    """
    return poisson.sf(counts_series_lchunk.values,
                      counts_series_lchunk.name.right)
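The identity quoted in the Notes can be checked directly (the arrays below are arbitrary): the survival function is the complement of the cdf, and scipy evaluates it directly, which typically preserves precision when the tail is very small.

import numpy as np
from scipy.stats import poisson

counts, lam = np.array([0, 3, 10, 25]), 6.5
assert np.allclose(poisson.sf(counts, lam), 1.0 - poisson.cdf(counts, lam))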
Example #21
def estSigOneChr(rs, jdf, pre, dis=0, win=5):
    """
    Estimating the significances for the loops in one chromosome.
    """
    # all variables with suffix t are treatment, those with suffix c are control
    logger.info("Building genomic coverage model for %s" % jdf)
    model, N = getGenomeCoverage(jdf, dis)
    ds = {}
    i = 0
    for key, r in rs.items():
        i += 1
        if i % 100 == 0:
            report = "Estimating %s loops for %s" % (i, pre)
            cFlush(report)
        chrom = r[0]
        iva = [r[1], r[2]]
        ivb = [r[4], r[5]]
        ra, rb, rab = getPETsforRegions(iva, ivb, model)
        ivas, ivbs = getNearbyPairRegions(iva, ivb, win=win)
        mrab = getPermutatedBg(ivas, ivbs, model)
        if mrab > 0:
            es = rab / mrab
        else:
            es = 100
        pop = max([1e-300, poisson.sf(rab - 1.0, mrab)])
        niva = "%s:%s-%s" % (chrom, iva[0], iva[1])
        nivb = "%s:%s-%s" % (chrom, ivb[0], ivb[1])
        ds[key] = {
            "iva": niva,
            "ivb": nivb,
            "ra": ra,
            "rb": rb,
            "rab": rab,
            "ES": es,
            "poisson_p-value": pop,
        }
    if len(ds) == 0:
        return None
    ds = pd.DataFrame(ds).T
    return ds
Example #22
def test_enrich(expected, observed, columns):
    """ tests whether genes are enriched with de novo mutations
    
    Args:
        expected: pandas dataframe of expected numbers of mutations per gene,
            given expected mutation rates for each gene.
        observed: pandas data frame with tally of de novo mutations per gene
            for each of the mutation types: lof_snv, lof_indel, missense_snv,
            missense_indel.
        columns: list of columns to use to calculate enrichment within, such as
            the loss-of-function columns ["lof_snv", "lof_indel"].
    
    Returns:
        pandas Series of P-values from testing for enrichment.
    """
    
    # recode the columns in the expected mutations table, so merging the
    # expected and observed datasets doesn't have conflicting column names.
    expected_columns = [ x + "_expected" for x in columns ]
    rename = dict(zip(columns, expected_columns))
    expected = expected.rename(columns=rename)
    
    if 'hgnc' not in observed:
        observed['hgnc'] = observed['symbol']
    
    enriched = observed.merge(expected, how="left", on=["hgnc", "chrom"])
    
    # account for how different pandas versions sum series with only NA
    kwargs = {}
    if pandas.__version__ >= '0.22.0':
        kwargs = {'min_count': 1}
    
    # sum the observed and expected de novo mutations per gene
    observed = enriched[columns].sum(axis=1, **kwargs)
    expected = enriched[expected_columns].sum(axis=1, **kwargs)
    
    # calculate the probability of getting the observed number of de novos,
    # given the expected rate of mutations.
    return poisson.sf(observed - 1, expected)
Example #23
def compute_small_belief(c, m, l):
    mu = l * m
    csd = c * (l * m)**0.5
    mu_all_but_one = l * (m - 1)
    max_text = 10000

    delta1 = poisson.sf(mu_all_but_one + csd, mu_all_but_one)
    delta2 = poisson.cdf(mu_all_but_one - csd, mu_all_but_one)

    #delta1 = poisson.pmf(l-1,l) * poisson.sf(mu_all_but_one + csd, mu_all_but_one)
    #for i in xrange(1,max_text):
    #       delta1 += poisson.pmf(l + i,l) * poisson.pmf(mu_all_but_one + csd - i, mu_all_but_one)
    #delta1 += poisson.pmf(l + max_text,l)
    #
    #delta2 = poisson.pmf(0,l) * poisson.sf(mu + csd, mu_all_but_one)
    #for i in xrange(1,max_text):
    #       delta2 += poisson.pmf(l + i,l) * poisson.pmf(mu + csd - i, mu_all_but_one)
    #delta2 += poisson.pmf(l + max_text,l)

    delta = (delta1 + delta2)
    epsilon = math.log(1 + (float(c) / (m * l)**0.5))
    return epsilon, delta
Example #24
def countTEs(f, repf, fout, psedo=1, ext=5):
    """
    Count reads located in TEs and get their enrichment.
    """
    t, model = getCov(f)
    reps = pd.read_table(repf, index_col=0, sep="\t")
    ds = {}
    for rep in tqdm(list(reps.itertuples())):
        rid = rep[0]
        iv = HTSeq.GenomicInterval(rep[1], rep[2], rep[3])
        c, rpkm = getCount(t, model, iv)
        if c == 0:
            continue
        upiv = HTSeq.GenomicInterval(rep[1], rep[2] - iv.length * ext, rep[2])
        upc, uprpkm = getCount(t, model, upiv)
        downiv = HTSeq.GenomicInterval(rep[1], rep[3],
                                       rep[3] + iv.length * ext)
        downc, downrpkm = getCount(t, model, downiv)
        if upc + downc > 0:
            es = c / 1.0 / (upc + downc) * 2 * ext
            p = max([1e-300, poisson.sf(c, (upc + downc) / 2.0 / ext)])
        else:
            es = c / 1.0 / psedo
            p = 1e-300
        ds[rid] = {
            "length": iv.length,
            "count": c,
            "RPKM": rpkm,
            "up_count_ext%s" % ext: upc,
            "up_RPKM_ext%s" % ext: uprpkm,
            "down_count_ext%s" % ext: downc,
            "down_RPKM_ext%s" % ext: downrpkm,
            "ES": es,
            "poisson_p-value": p,
            #"ES": rpkm / 1.0 /
            #(uprpkm + downrpkm + psedo) * 2,  #psedo count to avoid divid zero
        }
    ds = pd.DataFrame(ds).T
    ds.to_csv(fout + ".txt", sep="\t")
Example #25
def test_enrich(expected, observed, columns):
    """ tests whether genes are enriched with de novo mutations
    
    Args:
        expected: pandas dataframe of expected numbers of mutations per gene,
            given expected mutation rates for each gene.
        observed: pandas data frame with tally of de novo mutations per gene
            for each of the mutation types: lof_snv, lof_indel, missense_snv,
            missense_indel.
        columns: list of columns to use to calculate enrichment within, such as
            the loss-of-function columns ["lof_snv", "lof_indel"].
    
    Returns:
        pandas Series of P-values from testing for enrichment.
    """

    # recode the columns in the expected mutations table, so merging the
    # expected and observed datasets doesn't have conflicting column names.
    expected_columns = [x + "_expected" for x in columns]
    rename = dict(zip(columns, expected_columns))
    expected = expected.rename(columns=rename)

    if 'hgnc' not in observed:
        observed['hgnc'] = observed['symbol']

    enriched = observed.merge(expected, how="left", on=["hgnc", "chrom"])

    # account for how different pandas versions sum series with only NA
    kwargs = {}
    if pandas.__version__ >= '0.22.0':
        kwargs = {'min_count': 1}

    # sum the observed and expected de novo mutations per gene
    observed = enriched[columns].sum(axis=1, **kwargs)
    expected = enriched[expected_columns].sum(axis=1, **kwargs)

    # calculate the probability of getting the observed number of de novos,
    # given the expected rate of mutations.
    return poisson.sf(observed - 1, expected)
def stestat(detector,background):

    stat = OrderedDict() 
    stat['Name'] = detector.name
    stat['actual sigma +'] = detector.actualsig_pos
    stat['actual sigma -'] = detector.actualsig_neg
    stat['Alarm Setting'] = detector.alarm
    stat['Micro Rad Per Hr'] = ( (100*detector.t_energy*conv) / (detector.mass_kg*(detector.source_time/3600)) )*(10**6)    
    stat['Sigma Rad'] = ((100*detector.sig_energy*conv) / (detector.mass_kg*(detector.source_time/3600)) )*(10**6)    
    stat['Source Hits Per Sec'] = detector.rate
    stat['Source Sigma Hit Rate'] = np.sqrt(detector.counts)/detector.source_time 
    stat['Background Hit Rate'] = background.rate
    stat['Sigma Background Hit Rate'] = background.sig_counts
    stat['Combined Hit Rate'] = (detector.rate) + (background.rate)
    stat['Sigma Combined Hit Rate'] = np.sqrt( stat['Source Sigma Hit Rate']**2 + (background.sig_counts)**2 )
    stat['Sigma Above Background'] = detector.rate*detector.itime / background.sig_counts
    stat['Hits Required to alarm'] = stat['Sigma Background Hit Rate']*stat['Alarm Setting'] 
    mean = stat['Combined Hit Rate']
    sigma = stat['Sigma Combined Hit Rate']
    alarm =  background.sig_counts*detector.alarm # value needed to set off alarm
    stat['Probability to Alarm'] = poisson.sf(alarm,mean)

    return stat 
def enrichment(observed, expected):
    ''' assess enrichment of de novo mutations
    '''

    groups = {
        'PTV': ['lof_snv', 'lof_indel'],
        'PAV': ['missense_snv', 'missense_indel']
    }

    data = {}
    for x in groups:
        obs = sum([observed[x].sum() for x in groups[x]])
        exp = sum([expected[x].sum() for x in groups[x]])

        ratio = obs / exp
        p_value = poisson.sf(obs - 1, exp)
        data[x] = {
            'ratio': ratio,
            'p_value': p_value,
            'observed': obs,
            'expected': exp
        }

    return data
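In both enrichment helpers above, ``poisson.sf(obs - 1, exp)`` is the one-sided probability of observing at least ``obs`` de novo events when ``exp`` are expected; a tiny worked example with made-up numbers:

from scipy.stats import poisson

obs, exp = 12, 5.4
ratio = obs / exp
p_value = poisson.sf(obs - 1, exp)   # P(X >= 12 | mu = 5.4)
print(ratio, p_value)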
Example #28
def _significance_direct(n_on, mu_bkg):
    """Compute significance directly via Poisson probability.

    Use this method for small ``n_on < 10``.
    In this case the Li & Ma formula isn't correct any more.

    TODO: add large unit test coverage (where is it numerically precise enough)?
    TODO: check coverage with MC simulation

    I'm getting a positive significance for zero observed counts and small mu_bkg.
    That doesn't make too much sense ...

    >>> stats.poisson._significance_direct(0, 2)
    -1.1015196284987503
    >>> stats.poisson._significance_direct(0, 0.1)
    1.309617799458493
    """
    # Compute tail probability to see n_on or more counts
    probability = poisson.sf(n_on, mu_bkg)

    # Convert probability to a significance
    significance = norm.isf(probability)

    return significance
def ministat(detector, background):
    stat = OrderedDict()
    stat['Name'] = detector.name
    stat['Alarm Setting'] = detector.alarm
    stat['Actual Distances'] = detector.actualdist
    stat['Actual Probability'] = detector.actualprob
    stat['Actual Sigma +'] = detector.actualsig_pos
    stat['Actual Sigma -'] = detector.actualsig_neg
    stat['Micro Rad Per Hr'] = ( (100*detector.t_energy*conv) / (detector.mass_kg*(detector.source_time/3600)) )*(10**6)    
    stat['Sigma Rad'] = ((100*detector.sig_energy*conv) / (detector.mass_kg*(detector.source_time/3600)) )*(10**6)    
    stat['Source Hits Per Sec'] = detector.rate
    stat['Source Sigma Hit Rate'] = np.sqrt(detector.counts)/detector.source_time 
    stat['Background Hit Rate'] = background.rate
    stat['Sigma Background Hit Rate'] = background.sig_counts
    stat['Combined Hit Rate'] = (detector.rate) + (background.rate)
    stat['Sigma Combined Hit Rate'] = np.sqrt( stat['Source Sigma Hit Rate']**2 + (background.sig_counts)**2 )
    stat['Base Alarm Count'] = background.base_alarm
    stat['Rate Above Base Alarm Rate'] = stat['Combined Hit Rate'] - stat['Base Alarm Count'] 
    stat['alarm level'] = detector.alarm_lv(background)
    mean = stat['Combined Hit Rate']
    alarm = stat['Base Alarm Count']  # value needed to set off alarm
    stat['Probability to Alarm'] = poisson.sf(alarm,mean)

    return stat
def cdf(a, r, mu):
    if (a > 0):
        return poisson.sf(r - 1, a / mu)
        
    elif (a == 0):
        return 0
def process_one_replicon(rep_name, replicon_length, masked_bases,
                         start_positions, window_size, out_dir):
    random.shuffle(start_positions)
    start_position_count = len(start_positions) // 2
    start_positions = start_positions[:start_position_count]

    read_starts_per_window = get_read_starts_per_window(
        replicon_length, window_size, start_positions)

    all_read_starts = []
    for window, read_starts in read_starts_per_window.items():
        masked_window = False
        for i in range(window, window + window_size):
            if i in masked_bases:
                masked_window = True
        if not masked_window:
            all_read_starts.append(read_starts)
    mean_read_starts_per_window = statistics.mean(all_read_starts)
    sig_threshold = 0.05 / len(all_read_starts)
    neg_log10_sig_threshold = -math.log10(sig_threshold)

    print(
        f'{rep_name}\t{mean_read_starts_per_window}\t{sig_threshold}\t{neg_log10_sig_threshold}'
    )

    out_filename = out_dir / (rep_name + '.tsv')
    with open(out_filename, 'wt') as out_file:
        out_file.write(
            'window\tread_starts\tp_val\tneg_log10_p_val\tsigned_neg_log10_p_val\n'
        )
        for window, read_starts in read_starts_per_window.items():
            masked_window = False
            for i in range(window, window + window_size):
                if i in masked_bases:
                    masked_window = True

            # High numbers of read-starts
            if read_starts > mean_read_starts_per_window:
                p_val = poisson.sf(read_starts - 1,
                                   mean_read_starts_per_window)
                try:
                    neg_log10_p_val = -math.log10(p_val)
                    signed_neg_log10_p_val = neg_log10_p_val
                except ValueError:
                    neg_log10_p_val = 'inf'
                    signed_neg_log10_p_val = 'inf'

            # Low numbers of read-starts
            else:
                p_val = poisson.cdf(read_starts, mean_read_starts_per_window)
                try:
                    neg_log10_p_val = -math.log10(p_val)
                    signed_neg_log10_p_val = -neg_log10_p_val
                except ValueError:
                    neg_log10_p_val = 'inf'
                    signed_neg_log10_p_val = '-inf'

            if not masked_window:
                out_file.write(
                    f'{window}\t{read_starts}\t{p_val}\t{neg_log10_p_val}\t{signed_neg_log10_p_val}\n'
                )
            else:
                out_file.write(f'{window}\t{read_starts}\tn/a\tn/a\tn/a\n')
plt.ylabel('Probability')
ax = plt.gca()
line_top = ax.get_ylim()[1]
format_axis(ax)
ax.yaxis.set_ticks_position('none')
ax.set_yticklabels([])
ax.yaxis.labelpad = 0
plt.vlines(min_pore_size, 0, line_top, linestyle='--')
plt.ylim([0, line_top])
plt.subplots_adjust(left=0.12, bottom=0.17)
# Now make inset plot showing dye release at different average Bax/lipo
# ratios:
ax = plt.axes([0.55, 0.55, 0.3, 0.3])
format_axis(ax)
bax_ratios2 = np.linspace(0, 20, 50)
plt.plot(bax_ratios2, poisson.sf(sub_pore_size, bax_ratios2), color='k')
for br in bax_ratios:
    plt.plot(br, poisson.sf(sub_pore_size, br), marker='o', markersize=4)
ax.set_yticks([0, 0.5, 1.0])
plt.ylim([-0.08, 1.05])
plt.xlabel(r'$\langle$Bax/Lipo$\rangle$')
plt.ylabel('Max release')
# Save the plot
plt.savefig('poisson_bax_fmax.pdf')

# Now, plot best fit of 140311 Fmax curve with Poisson funcs
(fmax_arr, conc_list) = get_twoexp_fmax_arr()
fmax_means = np.mean(fmax_arr, axis=0)
bax_ratios = conc_list / 5.16
log_ratios = np.log10(bax_ratios)
fmax_arr[:, 0] = [0, 0, 0]
def erlang_cdf(a, k, mu):
    if (a > 0):
        return(poisson.sf(k = k - 1, mu = a / mu))
    else:
        return(0)
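Both ``cdf(a, r, mu)`` earlier and ``erlang_cdf`` here lean on the Gamma-Poisson duality: an Erlang(k, scale=mu) waiting time is at most ``a`` exactly when at least ``k`` events of mean spacing ``mu`` occur within ``a``. A quick check against scipy's gamma cdf (values are arbitrary):

import numpy as np
from scipy.stats import gamma, poisson

a, k, mu = 7.0, 3, 2.0   # elapsed time, required events, mean time per event
assert np.isclose(poisson.sf(k - 1, a / mu), gamma.cdf(a, k, scale=mu))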
Example #34
 def test_logpmf_zero(self):
     poisson_logpmf = poisson.logpmf(2, 2) - np.log(poisson.sf(0, 2))
     tpoisson_logpmf = truncatedpoisson.logpmf(2, 2, 0)
     assert_allclose(poisson_logpmf, tpoisson_logpmf, rtol=1e-7)
Example #35
def fl_cycle_application_decision():
    """use the temporary req_join endpoint to mockup:

    - reject if worker does not satisfy 'minimum_upload_speed' and/or 'minimum_download_speed'
    - is a part of current or recent cycle according to 'do_not_reuse_workers_until_cycle'
    - selects according to pool_selection
    - is under max_workers (with some padding to account for the expected percentage of workers that do not report back successfully)
    """

    # parse query strings (for now), eventually this will be parsed from the request body
    model_id = request.args.get("model_id")
    up_speed = request.args.get("up_speed")
    down_speed = request.args.get("down_speed")
    worker_id = request.args.get("worker_id")
    worker_ping = request.args.get("ping")
    _cycle = cycle_manager.last(model_id)
    _accept = False
    """
    MVP variable stubs:
        we will stub these with hard-coded numbers first, then make functions to dynamically query/update in subsequent PRs
    """
    # this will be replaced with a function that checks for the same (model_id, version_#) tuple when the worker last participated
    last_participation = 1
    # how late is too late into the cycle time to give a worker "new work", if only 5 seconds left probably don't bother, set this intelligently later
    MINIMUM_CYCLE_TIME_LEFT = 500
    # the historical amount of workers that fail to report (out of time, offline, too slow etc...),
    # could be modified to be worker/model specific later, track across overall pygrid instance for now
    EXPECTED_FAILURE_RATE = 0.2

    dummy_server_config = {
        "max_workers": 100,
        "pool_selection": "random",  # or "iterate"
        "num_cycles": 5,
        "do_not_reuse_workers_until_cycle": 4,
        "cycle_length": 8 * 60 * 60,  # 8 hours
        "minimum_upload_speed": 2000,  # 2 mbps
        "minimum_download_speed": 4000,  # 4 mbps
    }
    """  end of variable stubs """

    _server_config = dummy_server_config

    # query-string values arrive as strings, so cast before comparing
    up_speed_check = float(up_speed) > _server_config["minimum_upload_speed"]
    down_speed_check = float(down_speed) > _server_config["minimum_download_speed"]
    cycle_valid_check = ((
        last_participation + _server_config["do_not_reuse_workers_until_cycle"]
        >= _cycle.get(
            "cycle_sequence",
            99999)  # this should return the current cycle sequence number
    ) * (_cycle.get("cycle_sequence", 99999) <= _server_config["num_cycles"]) *
                         (_cycle.cycle_time > MINIMUM_CYCLE_TIME_LEFT) *
                         (worker_id not in _cycle._workers))

    if up_speed_check * down_speed_check * cycle_valid_check:
        if _server_config["pool_selection"] == "iterate" and len(
                _cycle._workers) < _server_config["max_workers"] * (
                    1 + EXPECTED_FAILURE_RATE):
            """first come first serve selection mode."""
            _accept = True
        elif _server_config["pool_selection"] == "random":
            """probabilistic model for rejection rate:

                - model the rate of worker's request to join as lambda in a poisson process
                - set probabilistic reject rate such that we can expect enough workers will request to join and be accepted
                    - between now and ETA till end of _server_config['cycle_length']
                    - such that we can expect (,say with 95% confidence) successful completion of the cycle
                    - while accounting for EXPECTED_FAILURE_RATE (% of workers that join cycle but never successfully report diff)

            EXPECTED_FAILURE_RATE = moving average with exponential decay based on historical data (maybe: noised up weights for security)

            k' = max_workers * (1+EXPECTED_FAILURE_RATE) # expected failure adjusted max_workers = var: k_prime

            T_left = T_cycle_end - T_now # how much time is left (in the same unit as below)

            normalized_lambda_actual = (recent) historical rate of request / unit time

            lambda' = number of requests / unit of time that would satisfy the below equation

            probability of receiving at least k' requests per unit time:
                P(K>=k') = 0.95 = e ^ ( - lambda' * T_left) * ( lambda' * T_left) ^ k' / k'! = 1 - P(K<k')

            var: lambda_approx = lambda' * T_left

            solve for lambda':
                use numerical approximation (newton's method) or just repeatedly call prob = poisson.sf(x, lambda') via scipy

            reject_probability = 1 - lambda_approx / (normalized_lambda_actual * T_left)
            """

            # time base units = 1 hr, assumes lambda_actual and lambda_approx have the same unit as T_left
            k_prime = _server_config["max_workers"] * (1 +
                                                       EXPECTED_FAILURE_RATE)
            T_left = _cycle.get("cycle_time", 0)

            # TODO: remove magic number = 5 below... see block comment above re: how
            normalized_lambda_actual = 5
            lambda_actual = (
                normalized_lambda_actual * T_left
            )  # makes lambda_actual have same unit as lambda_approx
            # @hyperparam: valid_range => (0, 1) | (+) => more certainty to have completed cycle, (-) => more efficient use of worker as computational resource
            confidence = 0.95  # P(K>=k')
            pois = lambda l: poisson.sf(k_prime, l) - confidence
            """
            _bisect_approximator because:
                - solving for lambda given P(K>=k') has no algebraic solution (that I know of) => need approximation
                - scipy's optimizers are not stable for this problem (I tested a few) => need custom approximation
                - at this MVP stage we are not likely to experience performance problems, binary search is log(N)
            refactor notes:
                - implementing a smarter approximator using Lambert's W or Newton's method will take more time
                - if we do need to scale then we can refactor to the above ^
            """
            # @hyperparam: valid_range => (0, 1) | (+) => get a faster but lower quality approximation
            _search_tolerance = 0.01

            def _bisect_approximator(arr, search_tolerance=_search_tolerance):
                """uses binary search to find lambda_actual within
                search_tolerance."""
                n = len(arr)
                L = 0
                R = n - 1

                while L <= R:
                    mid = floor((L + R) / 2)
                    if pois(arr[mid]) > 0 and pois(
                            arr[mid]) < search_tolerance:
                        return mid
                    elif pois(arr[mid]) > 0 and pois(
                            arr[mid]) > search_tolerance:
                        R = mid - 1
                    else:
                        L = mid + 1
                return None

            """
            if the number of workers is relatively small:
                - approximation methods are not necessary / we can find an exact solution fast
                - and search_tolerance is not guaranteed because lambda has to be int()
            """
            if k_prime < 50:
                lambda_approx = np.argmin(
                    [abs(pois(x)) for x in range(floor(k_prime * 3))])
            else:
                lambda_approx = _bisect_approximator(range(floor(k_prime * 3)))

            rej_prob = (
                (1 - lambda_approx / lambda_actual)
                if lambda_actual > lambda_approx else
                0  # don't reject if we expect to be short on worker requests
            )

            # additional security:
            if (k_prime > 50
                    and abs(poisson.sf(k_prime, lambda_approx) - confidence) >
                    _search_tolerance):
                """something went wrong, fall back to safe default."""
                rej_prob = 0.1
                WARN = "_bisect_approximator failed unexpectedly, reset rej_prob to default"
                logging.exception(WARN)  # log error

            if random.random_sample() < rej_prob:
                _accept = True

    if _accept:
        return Response(
            json.dumps({"status": "accepted"
                        }),  # leave out other accept keys/values for now
            status=200,
            mimetype="application/json",
        )

    # reject by default
    return Response(
        json.dumps({"status":
                    "rejected"}),  # leave out other accpet keys/values for now
        status=400,
        mimetype="application/json",
    )
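The block comment inside the handler solves P(K >= k') = 0.95 for lambda' with a hand-rolled bisection; an alternative sketch (not part of the project) gets the same number from a standard root finder, since poisson.sf(k', lambda) increases monotonically in lambda.

from scipy.optimize import brentq
from scipy.stats import poisson

k_prime = 100 * (1 + 0.2)   # failure-adjusted max_workers, mirroring the handler's dummy config
confidence = 0.95

lambda_approx = brentq(lambda lam: poisson.sf(k_prime, lam) - confidence, 1e-6, 10 * k_prime)
print(lambda_approx, poisson.sf(k_prime, lambda_approx))   # ~ confidence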
Example #36
def call_peaks(foreground_read_counts,
               total_foreground_reads,
               background_read_counts,
               total_background_reads,
               bin_size,
               p_value_extend,
               q_value_seed,
               min_gap,
               min_expected_reads,
               use_broad_window_for_background=False):

    SHORT_WINDOW = max(1, 500 / bin_size)  # 1 kb / 2
    MEDIUM_WINDOW = max(1, 2500 / bin_size)  # 5 kb / 2
    LONG_WINDOW = max(1, 10000 / bin_size)  # 20 kb / 2

    if use_broad_window_for_background:
        background_read_counts = foreground_read_counts
        total_background_reads = total_foreground_reads
        LONG_WINDOW = max(1, 25000 / bin_size)  # 50 kb / 2

    pseudo_one_read = float(
        min_expected_reads * total_background_reads) / total_foreground_reads

    n_total_bins = sum(
        len(bins) for bins in foreground_read_counts.itervalues())

    mean_background_reads = float(total_background_reads) / n_total_bins

    expected_read_counts = dict((c, [0] * len(foreground_read_counts[c]))
                                for c in foreground_read_counts)

    if total_background_reads == 0:
        echo('Using average reads per bin as expected:',
             total_foreground_reads / float(n_total_bins))

    peaks = {}
    poisson_cache = {}
    echo('Calling significant bins')
    for chrom in foreground_read_counts:
        peaks[chrom] = [0] * len(foreground_read_counts[chrom])

        short_window = sum(background_read_counts[chrom][:SHORT_WINDOW])
        short_window_length = SHORT_WINDOW

        medium_window = sum(background_read_counts[chrom][:MEDIUM_WINDOW])
        medium_window_length = MEDIUM_WINDOW

        long_window = sum(background_read_counts[chrom][:LONG_WINDOW])
        long_window_length = LONG_WINDOW

        for bin_idx in xrange(len(foreground_read_counts[chrom])):

            fgr_reads = foreground_read_counts[chrom][bin_idx]

            if bin_idx >= SHORT_WINDOW:
                short_window -= background_read_counts[chrom][bin_idx -
                                                              SHORT_WINDOW]
            else:
                short_window_length += 1

            if bin_idx + SHORT_WINDOW < len(background_read_counts[chrom]):
                short_window += background_read_counts[chrom][bin_idx +
                                                              SHORT_WINDOW]
            else:
                short_window_length -= 1

            if bin_idx >= MEDIUM_WINDOW:
                medium_window -= background_read_counts[chrom][bin_idx -
                                                               MEDIUM_WINDOW]
            else:
                medium_window_length += 1

            if bin_idx + MEDIUM_WINDOW < len(background_read_counts[chrom]):
                medium_window += background_read_counts[chrom][bin_idx +
                                                               MEDIUM_WINDOW]
            else:
                medium_window_length -= 1

            if bin_idx >= LONG_WINDOW:
                long_window -= background_read_counts[chrom][bin_idx -
                                                             LONG_WINDOW]
            else:
                long_window_length += 1

            if bin_idx + LONG_WINDOW < len(background_read_counts[chrom]):
                long_window += background_read_counts[chrom][bin_idx +
                                                             LONG_WINDOW]
            else:
                long_window_length -= 1

            if use_broad_window_for_background:
                bgr_reads = max(
                    float(long_window) / long_window_length,
                    mean_background_reads, pseudo_one_read)
                expected_reads = total_foreground_reads * bgr_reads / float(
                    total_background_reads)
            else:
                if total_background_reads > 0:

                    bgr_reads = max(
                        float(short_window) / short_window_length,
                        float(medium_window) / medium_window_length,
                        float(long_window) / long_window_length,
                        mean_background_reads, pseudo_one_read)

                    expected_reads = total_foreground_reads * bgr_reads / float(
                        total_background_reads)
                else:
                    expected_reads = max(
                        1., total_foreground_reads / float(n_total_bins))

            # cache the Poisson test
            key = (fgr_reads - 1, expected_reads)
            if key not in poisson_cache:
                poisson_cache[key] = poisson.sf(fgr_reads - 1,
                                                mu=expected_reads)

            peaks[chrom][bin_idx] = poisson_cache[key]

            expected_read_counts[chrom][bin_idx] = expected_reads

    echo('Computing p-value threshold at FDR of', q_value_seed)
    sorted_p_values = sorted([p for chrom in peaks for p in peaks[chrom]])
    n = len(sorted_p_values)

    q_value_strong = None

    for i, p_value in enumerate(sorted_p_values):

        if float(n * p_value) / (i + 1) <= q_value_seed:
            q_value_strong = p_value

    echo('p-value threshold:', q_value_strong)

    if q_value_strong is None:
        echo(
            'ERROR: No significant peaks are found for this time point!\n'
            'Please, check your data and consider removing this time point or '
            'relaxing the FDR threshold with the --q-value-seed option.')
        exit(1)

    merged_peaks = {}
    for chrom in peaks:

        chrom_peaks = peaks[chrom]

        peak_bins = []
        in_peak = False
        peak_start = None
        n_bins = len(peaks[chrom])

        for bin_idx in xrange(n_bins):
            is_significant = (chrom_peaks[bin_idx] <= q_value_strong)

            if not in_peak and is_significant:
                in_peak = True
                peak_start = bin_idx

            if (not is_significant or bin_idx == n_bins - 1) and in_peak:
                peak_bins.append([peak_start, bin_idx])
                in_peak = False

        for peak_idx in xrange(len(peak_bins)):
            peak_start, peak_end = peak_bins[peak_idx]
            boundary = peak_start
            while boundary >= 0 and chrom_peaks[boundary] <= p_value_extend:
                boundary -= 1

            peak_start = boundary + 1

            boundary = peak_end
            while boundary < n_bins and chrom_peaks[boundary] <= p_value_extend:
                boundary += 1

            peak_end = boundary
            peak_bins[peak_idx] = [peak_start, peak_end]

        merged_peaks[chrom] = merge_intervals(peak_bins, min_gap=min_gap)

    return merged_peaks, expected_read_counts
Example #37
 def __call__(self, m):
     #lower_limits = np.ceil(m[:,None]*self.Ls[None,:]/self.Lavg)-1+0.1
     #return (poisson.sf(lower_limits, self.expected_ns[None,:])).sum(axis=1)
     lower_limits = np.ceil(m[:,None]*self.Ls[None,:]/self.Lavg)-2+0.1
     return (poisson.sf(lower_limits, self.expected_ns[None,:])*self.ps[None,:]).sum(axis=1)
    ])
    mu = np.mean(obs)

    print('mu = {}'.format(mu))

    # Show the distribution
    sns.set(style="white", palette="muted", color_codes=True)
    fig, ax = plt.subplots(figsize=(14, 7), frameon=False)

    sns.distplot(obs, kde=True, color="b", ax=ax)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    plt.show()

    # Print some probabilities
    print('P(more than 8 trains) = {}'.format(poisson.sf(8, mu)))
    print('P(more than 9 trains) = {}'.format(poisson.sf(9, mu)))
    print('P(more than 10 trains) = {}'.format(poisson.sf(10, mu)))
    print('P(more than 11 trains) = {}'.format(poisson.sf(11, mu)))

    # Add new observations
    new_obs = np.array([
        13, 14, 11, 10, 11, 13, 13, 9, 11, 14, 12, 11, 12, 14, 8, 13, 10, 14,
        12, 13, 10, 9, 14, 13, 11, 14, 13, 14
    ])

    obs = np.concatenate([obs, new_obs])
    mu = np.mean(obs)

    print('mu = {}'.format(mu))
def LoopSEDLikelihood(name, observed, tpl_dnde_log, tpl_index, eref, ra, dec, king, livetime, suffix, nside, redshift, addregular, dnde_pri_min, dnde_pri_max, idx_pri_min, idx_pri_max, binw=0.294, enr_min=4.00, nbin=4, t_start=770., t_stop=8233., nmaxevt_ebin=20):
    #t_stop will be included in the time window.

    TPL_CLASS = ('CalOnlyR100',) #, 'CalOnlyR30', 'CalOnlyR10')
    NREBIN = 1
    #SCALE_FLUX = 1.0e-13
    prob_skip = 1.0E-5

    FILE_IN = ROOT.TFile(observed, 'READ')
    HTG_OBS = FILE_IN.Get('spectrum_observed')
    print HTG_OBS, 'has been found.'
    HTG_OBS_ENR = HTG_OBS.ProjectionY('{0}_projEnr'.format(HTG_OBS.GetName()), HTG_OBS.GetXaxis().FindBin(t_start), HTG_OBS.GetXaxis().FindBin(t_stop))
    nobs = HTG_OBS_ENR.Integral(HTG_OBS_ENR.GetXaxis().FindBin(enr_min), HTG_OBS_ENR.GetXaxis().FindBin(enr_min+binw*nbin))
    print nobs, 'observed events.'
    HTG_OBS_ENR.Rebin(NREBIN)
    HTG_OBS_ENR.SetLineWidth(0)
    HTG_OBS_ENR.SetLineColor(ROOT.kRed)
    HTG_OBS_ENR.SetMarkerColor(ROOT.kRed)
    HTG_OBS_ENR.SetMarkerStyle(20)
    HTG_OBS_ENR.SetFillStyle(0)

    PATH_FILE_OUT = 'LoopLikelihood_{0}{1}'.format(name, suffix)
    FILE_OUT = ROOT.TFile('{0}.root'.format(PATH_FILE_OUT), 'RECREATE')
    FILE_OUT.cd()

    # Histogram for results
    #xaxis = array('d', tpl_dnde)
    xaxis = np.array(tpl_dnde_log+(2.*tpl_dnde_log[-1]-tpl_dnde_log[-2],), dtype=float)
    #xaxis_scaled = xaxis/SCALE_FLUX
    #yaxis = array('d', tpl_index)
    yaxis = np.array(tpl_index+(2.*tpl_index[-1]-tpl_index[-2],), dtype=float)
    dct_htg_likeresult = {}
    dct_htg_likeratio = {}
    dct_htg_likerfrac = {}
    dct_htg_unlikerfrac = {}
    dct_htg_likecoverage = {}
    dct_cvs_likeresult = {}
    dct_cvs_likeratio = {}
    likelihood_max = {}
    xlocmax = {}
    ylocmax = {}

    for cla in TPL_CLASS:
        #dct_htg_likeresult[cla] = ROOT.TH2D('htg_likeresult', 'Likelihood;log_{{10}}dN/dE at {0:1.2e} MeV;PWL index'.format(eref), len(tpl_dnde_log), xaxis, len(tpl_index), yaxis)
        dct_htg_likeresult[cla] = ROOT.TGraph2D()
        dct_htg_likeresult[cla].SetName('htg_likeresult')
        dct_htg_likeresult[cla].SetTitle('Likelihood for data')
        dct_htg_likeresult[cla].GetXaxis().SetTitle('log_{{10}}dN/dE at {0:1.2e} MeV'.format(eref))
        dct_htg_likeresult[cla].GetYaxis().SetTitle('PWL index')

#        dct_htg_likeratio[cla] = ROOT.TH2D('htg_likeratio', 'Likelihood Ratio;log_{{10}}dN/dE at {0:1.2e} MeV;PWL index'.format(eref), len(tpl_dnde_log), xaxis, len(tpl_index), yaxis)
        dct_htg_likeratio[cla] = ROOT.TGraph2D()
        dct_htg_likeratio[cla].SetName('htg_likeratio')
        dct_htg_likeratio[cla].SetTitle('Likelihood ratio with physically possible ideal case')
        dct_htg_likeratio[cla].GetXaxis().SetTitle('log_{{10}}dN/dE at {0:1.2e} MeV'.format(eref))
        dct_htg_likeratio[cla].GetYaxis().SetTitle('PWL index')

        dct_htg_likerfrac[cla] = ROOT.TGraph2D()
        dct_htg_likerfrac[cla].SetName('htg_likerfrac')
        dct_htg_likerfrac[cla].SetTitle('Fraction of cases liker than data')
        dct_htg_likerfrac[cla].GetXaxis().SetTitle('log_{{10}}dN/dE at {0:1.2e} MeV'.format(eref))
        dct_htg_likerfrac[cla].GetYaxis().SetTitle('PWL index')

        dct_htg_unlikerfrac[cla] = ROOT.TGraph2D()
        dct_htg_unlikerfrac[cla].SetName('htg_unlikerfrac')
        dct_htg_unlikerfrac[cla].SetTitle('Fraction of cases unliker than data')
        dct_htg_unlikerfrac[cla].GetXaxis().SetTitle('log_{{10}}dN/dE at {0:1.2e} MeV'.format(eref))
        dct_htg_unlikerfrac[cla].GetYaxis().SetTitle('PWL index')

        dct_htg_likecoverage[cla] = ROOT.TGraph2D()
        dct_htg_likecoverage[cla].SetName('htg_likecoverage')
        dct_htg_likecoverage[cla].SetTitle('Fraction of cases covered by calculation')
        dct_htg_likecoverage[cla].GetXaxis().SetTitle('log_{{10}}dN/dE at {0:1.2e} MeV'.format(eref))
        dct_htg_likecoverage[cla].GetYaxis().SetTitle('PWL index')

        dct_cvs_likeresult[cla] = ROOT.TCanvas('cvs_likeresult_{0}'.format(cla), '{0} Likelihood'.format(cla), 750, 750)
        dct_cvs_likeratio[cla] = ROOT.TCanvas('cvs_likeratio_{0}'.format(cla), '{0} Likelihood Ratio'.format(cla), 750, 750)

        likelihood_max[cla] = 0.0
        xlocmax[cla] = 0.0
        ylocmax[cla] = 0.0

    likelihood_ceil = math.exp(-HTG_OBS_ENR.Integral())
    for ienr in range(1, HTG_OBS_ENR.GetNbinsX()+1):
        ni = HTG_OBS_ENR.GetBinContent(ienr)
        likelihood_ceil = likelihood_ceil * math.pow(ni, ni)/math.factorial(ni)
    print 'Ideal maximum likelihood =', likelihood_ceil

    # Possible ideal likelihood (independent for model)
    nda_likelihood_bestpossible = []
    nda_likelihood_best_directprod = np.ones(nmaxevt_ebin)
    for ienr in range(1, HTG_OBS_ENR.GetNbinsX()+1):
        print 'Energy range (observed): 10^{0} - 10^{1}'.format(HTG_OBS_ENR.GetXaxis().GetBinLowEdge(ienr), HTG_OBS_ENR.GetXaxis().GetBinUpEdge(ienr))
        nda_likelihood_bestpossible.append(np.ones(nmaxevt_ebin))
        for mevt in range(nmaxevt_ebin):
            nda_likelihood_bestpossible[-1][mevt] = nda_likelihood_bestpossible[-1][mevt] * math.exp(-mevt)*math.pow(mevt, mevt)/math.factorial(mevt)
        # Make a direct product array
        nda_likelihood_bestpossible_t = nda_likelihood_bestpossible[-1] # Transposing matrix
        if ienr>1:
            for jenr in range(ienr-1):
                nda_likelihood_bestpossible_t = nda_likelihood_bestpossible_t[:, np.newaxis]
        nda_likelihood_best_directprod = nda_likelihood_best_directprod * nda_likelihood_bestpossible_t # Broadcasting of np array
        #print nda_likelihood_best_directprod
    print 'Likelihood of physically ideal cases:'
    print nda_likelihood_best_directprod # This array's indices correspond to the observable count in each energy bin


    # Loop over dN/dE and PWL-index
    for (ix, dnde_log) in enumerate(tpl_dnde_log):
        dnde = 10**dnde_log
        print '===================='
        print 'dN/dE = {0:1.2e} at {1:1.1e} MeV'.format(dnde, eref)
        for (iy, idx_pl) in enumerate(tpl_index):
            print '--------------------'
            print 'PWL index = {0}'.format(idx_pl)
            lst_flux_itgl = ExtrapolateFlux.ExtrapolateFlux(eref, dnde, idx_pl, binw, enr_min, nbin, redshift)
            htg_flux = ROOT.TH1D('htg_flux', 'Integral flux', nbin, enr_min, enr_min+nbin*binw)
            for ibin in range(1, htg_flux.GetNbinsX()+1):
                htg_flux.SetBinContent(ibin, lst_flux_itgl[ibin-1])
                htg_flux.SetBinError(ibin, 0)
            str_fp = 'dNdE{0:0>12d}_PWL{1}{2:0>3d}'.format(int(dnde*1e20+0.5), "n" if idx_pl<0 else "p", int(idx_pl*100+0.5))
            suffix_fp = suffix + str_fp
            dct_htg_model = ModelPointSource.ModelPointSource(name, htg_flux, ra, dec, king, livetime, suffix_fp, nside, addregular)
            print dct_htg_model
            hs = ROOT.THStack('spectrum_{0}'.format(str_fp), 'log_{{10}}dN/dE={0:.2f} at {1} MeV, PWL-index={2:+f};log_{{10}}Energy [MeV];[counts]'.format(dnde_log, eref, idx_pl))
            hs.Add(HTG_OBS_ENR)
            for (icla,cla) in enumerate(TPL_CLASS):
                print cla
                htg_model = dct_htg_model[cla]
                htg_model.Rebin(NREBIN)
                htg_model.SetLineWidth(2)
                htg_model.SetLineColor(ROOT.kGray)
                htg_model.SetLineStyle(icla+1)
                htg_model.SetMarkerColor(ROOT.kGray)
                hs.Add(htg_model)
                factor_expected_total = math.exp(-htg_model.Integral())
                likelihood_data = factor_expected_total
                likelihood_data_highcut = poisson.cdf(nobs-1, htg_model.Integral())
                likelihood_data_lowcut = poisson.sf(nobs-1, htg_model.Integral())
                if likelihood_data_highcut<prob_skip or likelihood_data_lowcut<prob_skip:
                    print 'Detection probability of', nobs, 'events is smaller than', min(likelihood_data_highcut, likelihood_data_lowcut)*100, '%.'
                    print 'Calculation is skipped...'
                    dct_htg_likeresult[cla].SetPoint(dct_htg_likeresult[cla].GetN(), dnde_log, idx_pl, 0.)
                    dct_htg_likeratio[cla].SetPoint(dct_htg_likeratio[cla].GetN(), dnde_log, idx_pl, 0.)
                    dct_htg_likerfrac[cla].SetPoint(dct_htg_likerfrac[cla].GetN(), dnde_log, idx_pl, 1.-prob_skip)
                    dct_htg_unlikerfrac[cla].SetPoint(dct_htg_unlikerfrac[cla].GetN(), dnde_log, idx_pl, 0.+prob_skip)
                    dct_htg_likecoverage[cla].SetPoint(dct_htg_likecoverage[cla].GetN(), dnde_log, idx_pl, 1.-prob_skip)
                    continue

                nda_likelihood_allpossible = []
                nda_likelihood_allpossible_t = []
                nda_likelihood_all_directprod = np.ones(nmaxevt_ebin)
                
                for ienr in range(1, htg_model.GetNbinsX()+1):
                    print 'Energy range (model): 10^{0} - 10^{1}'.format(htg_model.GetXaxis().GetBinLowEdge(ienr), htg_model.GetXaxis().GetBinUpEdge(ienr))
                    print 'Energy range (observed): 10^{0} - 10^{1}'.format(HTG_OBS_ENR.GetXaxis().GetBinLowEdge(ienr), HTG_OBS_ENR.GetXaxis().GetBinUpEdge(ienr))
                    mi = htg_model.GetBinContent(ienr)
                    ni = HTG_OBS_ENR.GetBinContent(ienr)
                    likelihood_data = likelihood_data * math.pow(mi, ni)/math.factorial(ni)

                    # For likelihood RATIO ordering
                    nda_likelihood_allpossible.append(np.ones(nmaxevt_ebin))
                    for mevt in range(nmaxevt_ebin):
                        nda_likelihood_allpossible[-1][mevt] = nda_likelihood_allpossible[-1][mevt] * math.pow(mi, mevt)/math.factorial(mevt)
                    # Make a direct product array
                    nda_likelihood_allpossible_t = nda_likelihood_allpossible[-1] # Transposing matrix
                    if ienr>1:
                        for jenr in range(ienr-1):
                            nda_likelihood_allpossible_t = nda_likelihood_allpossible_t[:, np.newaxis]
                    nda_likelihood_all_directprod = nda_likelihood_all_directprod * nda_likelihood_allpossible_t # Broadcasting of np array
                    #print nda_likelihood_all_directprod
                print 'Likelihood of model and data =', likelihood_data
                nda_likelihood_all_directprod = nda_likelihood_all_directprod * factor_expected_total
                print 'Possible likelihood values:'
                print nda_likelihood_all_directprod # Array indices correspond to the observable count in each energy bin
                nda_likelihood_ratio_directprod = nda_likelihood_all_directprod / nda_likelihood_best_directprod
                print 'Possible likelihood ratio:'
                print nda_likelihood_ratio_directprod

                likelihood_ratio_data = likelihood_data / likelihood_ceil
                fprob_liker = 0.
                fprob_unliker = 0.
                for itpl, rvalue in enumerate(nda_likelihood_ratio_directprod.flat):
                    if rvalue > likelihood_ratio_data:
                        fprob_liker+=nda_likelihood_all_directprod.flat[itpl]
                    else:
                        fprob_unliker+=nda_likelihood_all_directprod.flat[itpl]
                print 'Outcomes liker than the data carry', fprob_liker*100., '% of the probability; the data is excluded from the acceptance interval at the', fprob_unliker*100., '% level.'
                fprob_coverage = fprob_liker + fprob_unliker
                print 'Calculation covers', fprob_coverage*100., '% of total possibility.'

                dct_htg_likeresult[cla].SetPoint(dct_htg_likeresult[cla].GetN(), dnde_log, idx_pl, likelihood_data)
                dct_htg_likeratio[cla].SetPoint(dct_htg_likeratio[cla].GetN(), dnde_log, idx_pl, likelihood_data/likelihood_ceil)
                dct_htg_likerfrac[cla].SetPoint(dct_htg_likerfrac[cla].GetN(), dnde_log, idx_pl, fprob_liker)
                dct_htg_unlikerfrac[cla].SetPoint(dct_htg_unlikerfrac[cla].GetN(), dnde_log, idx_pl, fprob_unliker)
                dct_htg_likecoverage[cla].SetPoint(dct_htg_likecoverage[cla].GetN(), dnde_log, idx_pl, fprob_liker+fprob_unliker)
                if likelihood_data>likelihood_max[cla]:
                    likelihood_max[cla] = likelihood_data
                    xlocmax[cla] = dnde_log
                    ylocmax[cla] = idx_pl

            FILE_OUT.cd()
            hs.Write()
            del dct_htg_model
            del htg_flux
            
    FILE_OUT.cd()
    #likelihood_max = 0.0
    #likelihood_temp = 0.0
    #xlocmax = ROOT.Long()
    #ylocmax = ROOT.Long()
    #zlocmax = ROOT.Long()
    for cla in TPL_CLASS:
        dct_htg_likeresult[cla].Write()
        dct_htg_likeratio[cla].Write()
        dct_htg_likerfrac[cla].Write()
        dct_htg_unlikerfrac[cla].Write()
        dct_htg_likecoverage[cla].Write()
        dct_cvs_likeresult[cla].cd()
        dct_cvs_likeresult[cla].SetLogz()
        dct_htg_likeresult[cla].Draw("colz")
        #likelihood_max = dct_htg_likeresult[cla].GetMaximum()
        #dct_htg_likeresult[cla].GetMaximumBin(xlocmax, ylocmax, zlocmax)
        print '===== Maximum likelihood ====='
        print 'dNdE =', xlocmax[cla], 'at', eref, 'MeV'
        print 'PWL-index =', ylocmax[cla]
        dct_htg_likeresult[cla].GetZaxis().SetRangeUser(0.001*likelihood_max[cla], likelihood_max[cla])
        dct_cvs_likeresult[cla].Write()
        dct_cvs_likeratio[cla].cd()
        dct_cvs_likeratio[cla].SetLogz()
        dct_htg_likeratio[cla].Draw("colz")
        dct_htg_likeratio[cla].GetZaxis().SetRangeUser(0.05, 0.68)
        dct_cvs_likeratio[cla].Write()
    return dct_htg_likeresult
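The heart of the loop above is a likelihood-ratio ordering: every possible count outcome per energy bin gets a Poisson likelihood under the model, that likelihood is divided by the saturated ("physically ideal") one, and the probability of outcomes liker than the data is accumulated. A compact sketch of the same ordering for two energy bins, using hypothetical expectations mu and observed counts n_obs:

import numpy as np
from scipy.stats import poisson

mu = np.array([1.4, 0.6])   # hypothetical model expectations per energy bin
n_obs = np.array([2, 0])    # hypothetical observed counts per energy bin
nmax = 20                   # truncate the outcome space at nmax counts per bin

counts = np.arange(nmax)
# Probability of every outcome (n1, n2) under the model, via an outer product
prob = np.outer(poisson.pmf(counts, mu[0]), poisson.pmf(counts, mu[1]))
# Saturated likelihood: each bin's expectation set equal to its own count
best = np.outer(poisson.pmf(counts, counts), poisson.pmf(counts, counts))
ratio = prob / best

like_data = np.prod(poisson.pmf(n_obs, mu))
ratio_data = like_data / np.prod(poisson.pmf(n_obs, n_obs))

frac_liker = prob[ratio > ratio_data].sum()
frac_unliker = prob[ratio <= ratio_data].sum()
print(frac_liker, frac_unliker, frac_liker + frac_unliker)  # coverage approaches 1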
Example #40
0
File: zt.py Project: cyrus/learning
 def _cdf(self, x, mu):
     k = floor(x)
     if k == 0:
         return 0.0
     else:
         return (poisson.cdf(k, mu) - poisson.pmf(0, mu)) / poisson.sf(0, mu) 
Example #41
0
File: zt.py Project: cyrus/learning
 def _ppf(self, q, mu):
     return poisson.ppf(poisson.sf(0, mu) * q + poisson.pmf(0, mu), mu)
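The two zt.py methods above implement the CDF and PPF of a zero-truncated Poisson (conditioning on X >= 1). A quick consistency sketch comparing the _cdf formula against the renormalized pmf, under an assumed value of mu:

import numpy as np
from scipy.stats import poisson

mu = 3.0
k = np.arange(1, 10)
# Zero-truncated CDF as in _cdf above: P(X <= k | X >= 1)
cdf_zt = (poisson.cdf(k, mu) - poisson.pmf(0, mu)) / poisson.sf(0, mu)
# Same quantity built from the renormalized pmf of the truncated distribution
pmf_zt = poisson.pmf(k, mu) / poisson.sf(0, mu)
assert np.allclose(cdf_zt, np.cumsum(pmf_zt))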
Example #42
0
 def test_logpmf(self):
     poisson_logpmf = poisson.logpmf(4, 6) - np.log(poisson.sf(2, 6))
     tpoisson_logpmf = truncatedpoisson.logpmf(4, 6, 2)
     assert_allclose(poisson_logpmf, tpoisson_logpmf, rtol=1e-7)
def polistat(poli,background):
    # for one second integration

    stat_1sec = OrderedDict()
    stat_1sec['Actual Distances'] = poli.actualdist
    stat_1sec['Actual Probability'] = poli.actualprob
    stat_1sec['Actual Sigma +'] = poli.actualsig_pos
    stat_1sec['Actual Sigma -'] = poli.actualsig_neg

    stat_1sec['Name'] = poli.name
    stat_1sec['Title'] = '1 Second Integration time'
    stat_1sec['Alarm Setting'] = poli.alarm

    stat_1sec['Channel 1 Bin'] = poli.ch1_cut
    stat_1sec['Channel 2 Bin'] = poli.ch2_cut
    stat_1sec['Channel 3 Bin'] = poli.ch3_cut
    stat_1sec['Channel 4 Bin'] = poli.ch4_cut

    stat_1sec['Channel 1 Source Hits Per Sec'] = poli.ch1_rate #source rate
    stat_1sec['Channel 2 Source Hits Per Sec'] = poli.ch2_rate 
    stat_1sec['Channel 3 Source Hits Per Sec'] = poli.ch3_rate 
    stat_1sec['Channel 4 Source Hits Per Sec'] = poli.ch4_rate 
    stat_1sec['Channel 1 Source Sigma Hit Rate'] = np.sqrt(poli.ch1_rate*poli.itime1)/poli.s_time
    stat_1sec['Channel 2 Source Sigma Hit Rate'] = np.sqrt(poli.ch2_rate*poli.itime1)/poli.s_time  
    stat_1sec['Channel 3 Source Sigma Hit Rate'] = np.sqrt(poli.ch3_rate*poli.itime1)/poli.s_time 
    stat_1sec['Channel 4 Source Sigma Hit Rate'] = np.sqrt(poli.ch4_rate*poli.itime1)/poli.s_time 

    stat_1sec['Channel 1 Background Hit Rate'] = background.rate[0] 
    stat_1sec['Channel 2 Background Hit Rate'] = background.rate[1] 
    stat_1sec['Channel 3 Background Hit Rate'] = background.rate[2] 
    stat_1sec['Channel 4 Background Hit Rate'] = background.rate[3] 
    stat_1sec['Channel 1 Sigma Background Hit Rate'] = background.sig_ch1 
    stat_1sec['Channel 2 Sigma Background Hit Rate'] = background.sig_ch2 
    stat_1sec['Channel 3 Sigma Background Hit Rate'] = background.sig_ch3 
    stat_1sec['Channel 4 Sigma Background Hit Rate'] = background.sig_ch4 
    stat_1sec['Regular Background Sigma'] = background.real_sig



    stat_1sec['Combined Hit Rate Channel 1'] = poli.ch1_rate + background.rate[0]
    stat_1sec['Combined Hit Rate Channel 2'] = poli.ch2_rate + background.rate[1]
    stat_1sec['Combined Hit Rate Channel 3'] = poli.ch3_rate + background.rate[2]
    stat_1sec['Combined Hit Rate Channel 4'] = poli.ch4_rate + background.rate[3]
    stat_1sec['Sigma Combined Hit Rate Channel 1'] = np.sqrt( stat_1sec['Channel 1 Source Sigma Hit Rate']**2 + (background.sig_ch1)**2 )
    stat_1sec['Sigma Combined Hit Rate Channel 2'] = np.sqrt( stat_1sec['Channel 2 Source Sigma Hit Rate']**2 + (background.sig_ch2)**2 )
    stat_1sec['Sigma Combined Hit Rate Channel 3'] = np.sqrt( stat_1sec['Channel 3 Source Sigma Hit Rate']**2 + (background.sig_ch3)**2 )
    stat_1sec['Sigma Combined Hit Rate Channel 4'] = np.sqrt( stat_1sec['Channel 4 Source Sigma Hit Rate']**2 + (background.sig_ch4)**2 )


    stat_1sec['Channel 1 Sigma Above Background'] = stat_1sec['Channel 1 Source Hits Per Sec'] / background.sig_ch1
    stat_1sec['Channel 2 Sigma Above Background'] = stat_1sec['Channel 2 Source Hits Per Sec'] / background.sig_ch2
    stat_1sec['Channel 3 Sigma Above Background'] = stat_1sec['Channel 3 Source Hits Per Sec'] / background.sig_ch3
    stat_1sec['Channel 4 Sigma Above Background'] = stat_1sec['Channel 4 Source Hits Per Sec'] / background.sig_ch4
    stat_1sec['Channel 1 Source Rate Required to alarm'] = stat_1sec['Channel 1 Sigma Background Hit Rate']*stat_1sec['Alarm Setting'] 
    stat_1sec['Channel 2 Source Rate Required to alarm'] = stat_1sec['Channel 2 Sigma Background Hit Rate']*stat_1sec['Alarm Setting'] 
    stat_1sec['Channel 3 Source Rate Required to alarm'] = stat_1sec['Channel 3 Sigma Background Hit Rate']*stat_1sec['Alarm Setting'] 
    stat_1sec['Channel 4 Source Rate Required to alarm'] = stat_1sec['Channel 4 Sigma Background Hit Rate']*stat_1sec['Alarm Setting'] 

    ch1_mean = stat_1sec['Combined Hit Rate Channel 1']
    ch2_mean = stat_1sec['Combined Hit Rate Channel 2']
    ch3_mean = stat_1sec['Combined Hit Rate Channel 3']
    ch4_mean = stat_1sec['Combined Hit Rate Channel 4']
    ch1_sigma = stat_1sec['Sigma Combined Hit Rate Channel 1']
    ch2_sigma = stat_1sec['Sigma Combined Hit Rate Channel 2']
    ch3_sigma = stat_1sec['Sigma Combined Hit Rate Channel 3']
    ch4_sigma = stat_1sec['Sigma Combined Hit Rate Channel 4']

    ch1_alarm =  stat_1sec['Channel 1 Source Rate Required to alarm'] # value needed to set off alarm
    ch2_alarm =  stat_1sec['Channel 2 Source Rate Required to alarm'] 
    ch3_alarm =  stat_1sec['Channel 3 Source Rate Required to alarm'] 
    ch4_alarm =  stat_1sec['Channel 4 Source Rate Required to alarm'] 

    stat_1sec['Channel 1 Probability to Alarm'] = poisson.sf(ch1_alarm,ch1_mean)
    stat_1sec['Channel 2 Probability to Alarm'] = poisson.sf(ch2_alarm,ch2_mean)
    stat_1sec['Channel 3 Probability to Alarm'] = poisson.sf(ch3_alarm,ch3_mean)
    stat_1sec['Channel 4 Probability to Alarm'] = poisson.sf(ch4_alarm,ch4_mean)


    # for two second integration time
    stat_2sec = OrderedDict() 
    stat_2sec['Name'] = poli.name
    stat_2sec['Title'] = '2 Second Integration time'
    stat_2sec['Alarm Setting'] = poli.alarm

    stat_2sec['Channel 1 Bin'] = poli.ch1_cut
    stat_2sec['Channel 2 Bin'] = poli.ch2_cut
    stat_2sec['Channel 3 Bin'] = poli.ch3_cut
    stat_2sec['Channel 4 Bin'] = poli.ch4_cut

    stat_2sec['Channel 1 Source Hits Per Sec'] = poli.ch1_rate #source rate
    stat_2sec['Channel 2 Source Hits Per Sec'] = poli.ch2_rate 
    stat_2sec['Channel 3 Source Hits Per Sec'] = poli.ch3_rate 
    stat_2sec['Channel 4 Source Hits Per Sec'] = poli.ch4_rate 
    stat_2sec['Channel 1 Source Sigma Hit Rate'] = np.sqrt(poli.ch1_rate*poli.itime2) / poli.itime2  
    stat_2sec['Channel 2 Source Sigma Hit Rate'] = np.sqrt(poli.ch2_rate*poli.itime2) / poli.itime2  
    stat_2sec['Channel 3 Source Sigma Hit Rate'] = np.sqrt(poli.ch3_rate*poli.itime2) / poli.itime2 
    stat_2sec['Channel 4 Source Sigma Hit Rate'] = np.sqrt(poli.ch4_rate*poli.itime2) / poli.itime2
    stat_2sec['ch1 sigma stat all rate'] =  [ np.sqrt(poli.ch1_rate*poli.itime2) / poli.itime2, np.sqrt(poli.ch2_rate*poli.itime2) / poli.itime2, np.sqrt(poli.ch3_rate*poli.itime2) / poli.itime2, np.sqrt(poli.ch4_rate*poli.itime2) / poli.itime2]


    stat_2sec['Channel 1 Background Hit Rate'] = background.rate[0] 
    stat_2sec['Channel 2 Background Hit Rate'] = background.rate[1]
    stat_2sec['Channel 3 Background Hit Rate'] = background.rate[2]
    stat_2sec['Channel 4 Background Hit Rate'] = background.rate[3]
    stat_2sec['Channel 1 Sigma Background Hit Rate'] = background.sig_ch1 
    stat_2sec['Channel 2 Sigma Background Hit Rate'] = background.sig_ch2 
    stat_2sec['Channel 3 Sigma Background Hit Rate'] = background.sig_ch3 
    stat_2sec['Channel 4 Sigma Background Hit Rate'] = background.sig_ch4 

    stat_2sec['Combined Hit Rate Channel 1'] = poli.ch1_rate + background.rate[0]
    stat_2sec['Combined Hit Rate Channel 2'] = poli.ch2_rate + background.rate[1]
    stat_2sec['Combined Hit Rate Channel 3'] = poli.ch3_rate + background.rate[2]
    stat_2sec['Combined Hit Rate Channel 4'] = poli.ch4_rate + background.rate[3]
    stat_2sec['Sigma Combined Hit Rate Channel 1'] = np.sqrt( stat_2sec['Channel 1 Source Sigma Hit Rate']**2 + (background.sig_ch1)**2 )
    stat_2sec['Sigma Combined Hit Rate Channel 2'] = np.sqrt( stat_2sec['Channel 2 Source Sigma Hit Rate']**2 + (background.sig_ch2)**2 )
    stat_2sec['Sigma Combined Hit Rate Channel 3'] = np.sqrt( stat_2sec['Channel 3 Source Sigma Hit Rate']**2 + (background.sig_ch3)**2 )
    stat_2sec['Sigma Combined Hit Rate Channel 4'] = np.sqrt( stat_2sec['Channel 4 Source Sigma Hit Rate']**2 + (background.sig_ch4)**2 )

    stat_2sec['Channel 1 Sigma Above Background'] = stat_2sec['Channel 1 Source Hits Per Sec'] / background.sig_ch1
    stat_2sec['Channel 2 Sigma Above Background'] = stat_2sec['Channel 2 Source Hits Per Sec'] / background.sig_ch2
    stat_2sec['Channel 3 Sigma Above Background'] = stat_2sec['Channel 3 Source Hits Per Sec'] / background.sig_ch3
    stat_2sec['Channel 4 Sigma Above Background'] = stat_2sec['Channel 4 Source Hits Per Sec'] / background.sig_ch4

    stat_2sec['Channel 1 Source Rate Required to alarm'] = stat_2sec['Channel 1 Sigma Background Hit Rate']*stat_2sec['Alarm Setting'] 
    stat_2sec['Channel 2 Source Rate Required to alarm'] = stat_2sec['Channel 2 Sigma Background Hit Rate']*stat_2sec['Alarm Setting'] 
    stat_2sec['Channel 3 Source Rate Required to alarm'] = stat_2sec['Channel 3 Sigma Background Hit Rate']*stat_2sec['Alarm Setting'] 
    stat_2sec['Channel 4 Source Rate Required to alarm'] = stat_2sec['Channel 4 Sigma Background Hit Rate']*stat_2sec['Alarm Setting'] 

    ch1_mean = stat_2sec['Combined Hit Rate Channel 1']
    ch2_mean = stat_2sec['Combined Hit Rate Channel 2']
    ch3_mean = stat_2sec['Combined Hit Rate Channel 3']
    ch4_mean = stat_2sec['Combined Hit Rate Channel 4']
    ch1_sigma = stat_2sec['Sigma Combined Hit Rate Channel 1']
    ch2_sigma = stat_2sec['Sigma Combined Hit Rate Channel 2']
    ch3_sigma = stat_2sec['Sigma Combined Hit Rate Channel 3']
    ch4_sigma = stat_2sec['Sigma Combined Hit Rate Channel 4']

    ch1_alarm =  stat_2sec['Channel 1 Source Rate Required to alarm'] # value needed to set off alarm
    ch2_alarm =  stat_2sec['Channel 2 Source Rate Required to alarm'] # 
    ch3_alarm =  stat_2sec['Channel 3 Source Rate Required to alarm'] # 
    ch4_alarm =  stat_2sec['Channel 4 Source Rate Required to alarm'] # 
    stat_2sec['Channel 1 Probability to Alarm'] = poisson.sf(ch1_alarm,ch1_mean)
    stat_2sec['Channel 2 Probability to Alarm'] = poisson.sf(ch2_alarm,ch2_mean)
    stat_2sec['Channel 3 Probability to Alarm'] = poisson.sf(ch3_alarm,ch3_mean)
    stat_2sec['Channel 4 Probability to Alarm'] = poisson.sf(ch4_alarm,ch4_mean)

    return stat_1sec, stat_2sec
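Note that the alarm thresholds fed to poisson.sf above are real-valued rates, and poisson.sf floors its first argument, so the stored quantity is P(counts > floor(threshold)). If the intended quantity is the probability of meeting or exceeding the threshold, one might round up first; a small sketch under that assumption, with hypothetical numbers:

import numpy as np
from scipy.stats import poisson

def prob_to_alarm(threshold, mean_rate):
    """P(counts >= threshold) for a real-valued threshold (hypothetical helper)."""
    # poisson.sf floors its argument, so sf(ceil(t) - 1, mu) gives P(X >= ceil(t))
    return poisson.sf(np.ceil(threshold) - 1, mean_rate)

# e.g. an alarm threshold of 3.2 counts with a combined mean of 5 counts
print(prob_to_alarm(3.2, 5.0))  # P(X >= 4)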
def poisson_pore_fit(min_pore_size):
    return np.sum((fmax_means[1:] -
                   poisson.sf(min_pore_size-1, bax_ratios[1:])) ** 2)
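poisson_pore_fit is a least-squares objective over the module-level fmax_means and bax_ratios arrays, which are not shown here. A hedged sketch of how it might be minimized, with synthetic placeholder data standing in for those globals; since poisson.sf floors its argument, a scan over integer pore sizes is the natural minimizer:

import numpy as np
from scipy.stats import poisson

# Placeholder data standing in for the module-level fmax_means / bax_ratios
bax_ratios = np.array([0.0, 1.0, 2.0, 4.0, 8.0])
fmax_means = poisson.sf(2, bax_ratios)   # synthetic "observed" plateau values

def poisson_pore_fit(min_pore_size):
    return np.sum((fmax_means[1:] -
                   poisson.sf(min_pore_size - 1, bax_ratios[1:])) ** 2)

candidates = np.arange(1, 11)
errors = [poisson_pore_fit(m) for m in candidates]
print(candidates[int(np.argmin(errors))])  # recovers 3 for this synthetic data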
Example #45
0
                       })
        df = df.sort_values(
            ["chrom", "start", "end", "TTAA_chrom", "TTAA_start", "TTAA_end"])
        groups = df.groupby(["chrom", "start", "end"])
        first = groups.nth(0)["TTAA_start"]
        last = groups.nth(-1)["TTAA_end"]
        joined = pd.concat([first, last], axis=1).reset_index()
        refined = joined[["chrom", "TTAA_start", "TTAA_end"]]
        blocks = BedTool.from_dataframe(refined)

    data = blocks.intersect(experiment, c=True).intersect(background, c=True)
    df = data.to_dataframe()
    df = df.rename(index=str, columns={"name": "expHops", "score": "bgHops"})
    df["norm_bgHops"] = df["bgHops"] * scaleFactor + args.pseudocount
    df["-log10pValue"] = -np.log10(
        poisson.sf(df["expHops"] - 1, df["norm_bgHops"]))
    outdf = df[df["-log10pValue"] >= args.pValueCutoff]
    outbed = BedTool.from_dataframe(
        df[df["-log10pValue"] >= args.pValueCutoff])

    if args.distance:
        outbed = outbed.merge(d=args.distance)

    if args.minSize:
        minSize = args.minSize
    else:
        minSize = 0

    if args.maxSize:
        maxSize = args.maxSize
    else:
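The fragment above scores each candidate region by the Poisson tail probability of its experimental hop count given the scaled background, then filters on -log10 of that p-value. A minimal pandas-only sketch of the scoring step, with hypothetical counts, scale factor, pseudocount, and cutoff:

import numpy as np
import pandas as pd
from scipy.stats import poisson

# Hypothetical per-region hop counts
df = pd.DataFrame({"expHops": [12, 3, 40], "bgHops": [2, 4, 5]})
scale_factor, pseudocount, p_value_cutoff = 1.0, 0.2, 4.0  # placeholder parameters

df["norm_bgHops"] = df["bgHops"] * scale_factor + pseudocount
# P(X >= expHops) under a Poisson with the normalized background as its mean
df["-log10pValue"] = -np.log10(poisson.sf(df["expHops"] - 1, df["norm_bgHops"]))
print(df[df["-log10pValue"] >= p_value_cutoff])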
Example #46
0
 def test_pmf_zero(self):
     poisson_pmf = poisson.pmf(2, 2) / poisson.sf(0, 2)
     tpoisson_pmf = truncatedpoisson.pmf(2, 2, 0)
     assert_allclose(poisson_pmf, tpoisson_pmf, rtol=1e-7)
def paramPaTau(pa, tau, Rmax=100, show=False, showProb=False):

    ph = 1 - pa

    # C1
    a = 0
    b = int(ph * tau)
    k = (a + b) // 2
    while b - a > 1:
        if poisson.cdf(k, ph * tau) < epsilon:
            a = k
            k = (a + b) // 2
        else:
            b = k
            k = (a + b) // 2
    mumax = a / tau

    # C3
    a = int(2 * tau)
    b = int(tau)
    k = (a + b) // 2
    while a - b > 1:
        if poisson.sf(k, tau) < epsilon:
            a = k
            k = (a + b) // 2
        else:
            b = k
            k = (a + b) // 2
    mumin = a / (2 * tau)

    mu = mumax

    # C2
    a = int(2 * pa * tau)
    b = int(pa * tau)
    k = int((a + b) // 2)
    while a - b > 1:
        if poisson.sf(k, pa * tau) < epsilon:
            a = k
            k = (a + b) // 2
        else:
            b = k
            k = (a + b) // 2
    if mu > 0:
        alpha = a / (mu * tau)
    else:
        alpha = np.nan

    # Upper bound on Xh1t
    pmin = (1 - alpha) / 2
    a = 0
    b = pmin * ph * tau
    k = (a + b) // 2
    while b - a > 1:
        if poisson.cdf(k, pmin * ph * tau) < epsilon:
            a = k
            k = (a + b) // 2
        else:
            b = k
            k = (a + b) // 2
    xh1t = a

    # C4
    n = (1 - alpha) * mu * tau
    M = xh1t
    N = 2 * mu * tau
    a = 0
    b = M
    k = (a + b) // 2
    while (b - a) > 1:  #tau/10**3 :
        # print(b-a)
        if hypergeom.cdf(k, N, M, n) * 12000 < epsilon:
            a = k
            k = (a + b) // 2
        else:
            b = k
            k = (a + b) // 2
    if mu > 0:
        lbd = (a / (mu * tau))
    else:
        lbd = np.nan

    # rmax
    if lbd == 0 or np.isnan(lbd):
        rmax = np.nan
    else:
        rmax = 3 * int((alpha / lbd + 1))

    if show:
        print("tau = ", tau)
        print("mumin = ", int(10000 * mumin) / 100, "%")
        print("mumax = ", int(10000 * mumax) / 100, "%")
        print("alpha = ", int(10000 * alpha) / 100, "%")
        print("pmin = ", int(10000 * pmin) / 100, "%")
        print("xh1t = ", 100 * xh1t / tau, "%")
        print("lbd = ", int(10000 * lbd) / 100, "%")
        print("rmax = ", rmax)
        print()
        # print("lbd*mu = ", int(10000*lbd*mu)/10000)

    if showProb:
        print("C1: ", poisson.cdf(mumax * tau, ph * tau))
        print("C2: ", poisson.sf(alpha * mu * tau, pa * tau))
        print("C3: ", poisson.sf(2 * mumin * tau, tau))
        print("Xh1t: ", poisson.cdf(xh1t, pmin * ph * tau))
        print("C4: ", 12000 * hypergeom.cdf(lbd * mu * tau, N, M, n))
        print()
    return (tau, mumin, mumax, alpha, lbd, rmax)
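Each bisection loop above searches for a Poisson quantile: the largest k with cdf(k, lam) < epsilon, or the smallest k with sf(k, lam) < epsilon. scipy's ppf and isf return these thresholds directly (up to tie-breaking at exact equality), so the hand-rolled bisection can be cross-checked as in this sketch with placeholder epsilon and lam:

from scipy.stats import poisson

epsilon = 1e-6   # placeholder tolerance (the original uses a module-level epsilon)
lam = 500.0      # placeholder Poisson mean, e.g. ph * tau

# Largest k with cdf(k, lam) < epsilon (the C1-style lower-tail searches)
k_lo = int(poisson.ppf(epsilon, lam)) - 1
# Smallest k with sf(k, lam) < epsilon (the C2/C3-style upper-tail searches)
k_hi = int(poisson.isf(epsilon, lam))

print(k_lo, poisson.cdf(k_lo, lam), poisson.cdf(k_lo + 1, lam))  # epsilon lies between the two cdf values
print(k_hi, poisson.sf(k_hi - 1, lam), poisson.sf(k_hi, lam))    # epsilon lies between the two sf values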
Example #48
0
 def func(ll):
     y = range(0,nn)
     z = y[-1]
     y = np.append( poisson.pmf(y, ll) , [poisson.sf(z, ll)] )
     return y 
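The helper above bins the Poisson pmf at counts 0 .. nn-1 and adds one overflow bucket poisson.sf(nn-1, ll) for counts of nn or more, so the returned probabilities sum to one. A quick check with assumed nn and ll:

import numpy as np
from scipy.stats import poisson

nn, ll = 10, 3.5  # assumed bin count and Poisson mean
probs = np.append(poisson.pmf(range(nn), ll), [poisson.sf(nn - 1, ll)])
assert len(probs) == nn + 1
assert np.isclose(probs.sum(), 1.0)  # cdf(nn-1) + sf(nn-1) == 1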
Example #49
0
from scipy.stats import poisson

# Average of 2 car accidents per day

# Probability of exactly 3 in a day
poisson.pmf(3, 2)

# Probability of 3 or fewer in a day
poisson.cdf(3, 2)

# Probability of more than 3 in a day
poisson.sf(3, 2)
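Since sf is the complement of cdf at the same point, the last two quantities sum to one:

from scipy.stats import poisson

# P(X <= 3) + P(X > 3) == 1 for a Poisson mean of 2
assert abs(poisson.cdf(3, 2) + poisson.sf(3, 2) - 1.0) < 1e-12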
Example #50
0
import numpy as np
from scipy.stats import poisson

p = 0.95
alpha = 0.99

y = 20
num_trial = 1
success = 0.
count = 0

for i in range(num_trial):
    print(i + 1, end='\r')
    tmp_y = 20
    k = 0
    while tmp_y > 0 and tmp_y < 200:
        _k = np.random.poisson(alpha) - 1
        k += _k
        tmp_y += _k

    if tmp_y >= 200:
        success += np.exp(alpha - p) * np.power(p / alpha, k)
        count += 1

prob = success / num_trial
print(prob)
print(count)

prob = poisson.sf(18, p)
print(prob)