def test_pmf_accuracy():
    """Compare accuracy of the probability mass function.

    Compare the results with the accuracy check proposed in [Hong2013]_,
    equation (15).

    Three groups of identical success probabilities are drawn at random.
    The reference value at a random count ``k`` is built by convolving the
    three corresponding binomial pmfs and is compared, to ten decimals,
    with ``PoiBin.pmf(k)`` for the concatenated probability list.
    """
    [p1, p2, p3] = np.around(np.random.random_sample(size=3), decimals=2)
    # ``np.random.random_integers`` is deprecated; ``randint`` excludes its
    # upper bound, so 11 keeps the original inclusive range [1, 10].
    [n1, n2, n3] = np.random.randint(1, 11, size=3)
    nn = n1 + n2 + n3
    l1 = [p1 for _ in range(n1)]
    l2 = [p2 for _ in range(n2)]
    l3 = [p3 for _ in range(n3)]
    p = l1 + l2 + l3
    b1 = binom(n=n1, p=p1)
    b2 = binom(n=n2, p=p2)
    b3 = binom(n=n3, p=p3)
    k = np.random.randint(0, nn + 1)
    # Double sum over (i, j): convolution of the three binomial pmfs.
    chi_bn = 0
    for j in range(0, k + 1):
        for i in range(0, j + 1):
            chi_bn += b1.pmf(i) * b2.pmf(j - i) * b3.pmf(k - j)
    pb = PoiBin(p)
    chi_pb = pb.pmf(k)
    assert np.all(
        np.around(chi_bn, decimals=10) == np.around(chi_pb, decimals=10))
def test_cdf():
    """Test the cumulative distribution function.

    With both success probabilities equal to 1, the expected cdf values
    are 0 at one success and 1 at two successes.
    """
    p = [1, 1]
    pb = PoiBin(p)
    tolerance = 4 * np.finfo(float).eps
    # Compare absolute deviations: with a signed difference any result that
    # is too small would pass the check unnoticed.
    assert np.all(np.abs(pb.cdf([1, 2]) - np.array([0., 1.])) < tolerance)
    assert np.abs(pb.cdf(2) - 1.) < tolerance
def test_check_xi_are_real():
    """Check that ``check_xi_are_real`` accepts real ``xi`` and rejects complex ones."""
    pb = PoiBin([0])

    # All imaginary parts are zero: must be accepted.
    purely_real = np.array([1 + 0j, 1.8 + 0j], dtype=complex)
    assert pb.check_xi_are_real(purely_real)

    # One entry has a large imaginary part: must be rejected.
    with_imaginary = np.array([1 + 99j, 1.8 + 0j], dtype=complex)
    assert not pb.check_xi_are_real(with_imaginary)
def test_pmf_accuracy():
    """Compare accuracy of the probability mass function.

    Compare the results with the accuracy check proposed in [Hong2013]_,
    equation (15).

    The reference value at a random count ``k`` is the convolution of three
    binomial pmfs (one per group of equal probabilities). It is compared,
    to ten decimals, with ``PoiBin.pmf(k)`` for the combined list.
    """
    [p1, p2, p3] = np.around(np.random.random_sample(size=3), decimals=2)
    # Deprecated ``random_integers(1, 10)`` is inclusive of 10; ``randint``
    # is exclusive of its upper bound, hence 11.
    [n1, n2, n3] = np.random.randint(1, 11, size=3)
    nn = n1 + n2 + n3
    l1 = [p1 for _ in range(n1)]
    l2 = [p2 for _ in range(n2)]
    l3 = [p3 for _ in range(n3)]
    p = l1 + l2 + l3
    b1 = binom(n=n1, p=p1)
    b2 = binom(n=n2, p=p2)
    b3 = binom(n=n3, p=p3)
    k = np.random.randint(0, nn + 1)
    # Convolve the three binomial pmfs to get the reference probability.
    chi_bn = 0
    for j in range(0, k + 1):
        for i in range(0, j + 1):
            chi_bn += b1.pmf(i) * b2.pmf(j - i) * b3.pmf(k - j)
    pb = PoiBin(p)
    chi_pb = pb.pmf(k)
    assert np.all(np.around(chi_bn, decimals=10) ==
                  np.around(chi_pb, decimals=10))
def test_pval():
    """Test the p-values function.

    With both success probabilities equal to 1, the expected p-value is 1
    for one and for two successes.
    """
    p = [1, 1]
    pb = PoiBin(p)
    tolerance = 4 * np.finfo(float).eps
    # Use the absolute deviation so that results below the reference are
    # caught as well as results above it.
    assert np.all(np.abs(pb.pval([1, 2]) - np.array([1., 1.])) < tolerance)
    assert np.abs(pb.pval(2) - 1.) < tolerance
def test_pval():
    """Test the p-values function.

    For ``p = [1, 1]`` the test expects a p-value of 1 at both one and two
    successes, within four machine epsilons.
    """
    p = [1, 1]
    pb = PoiBin(p)
    eps4 = 4 * np.finfo(float).eps
    # A signed difference would accept any value smaller than the reference;
    # take the absolute value to make the check two-sided.
    deviation = np.abs(pb.pval([1, 2]) - np.array([1., 1.]))
    assert np.all(deviation < eps4)
    assert np.abs(pb.pval(2) - 1.) < eps4
def test_pmf():
    """Test the probability mass function.

    Checks the output size for a vector input, then compares pmf values with
    reference numbers obtained with the poibin R package [Rpoibin]_.
    """
    # A two-element query must produce a two-element result.
    assert PoiBin([1, 1]).pmf([1, 2]).size == 2

    # (probabilities, queried counts, reference pmf from R poibin [Rpoibin]_)
    reference_cases = (
        (
            [0.4163448, 0.3340270, 0.9689613],
            [0, 1, 2, 3],
            [0.0120647, 0.39129134, 0.46189012, 0.13475384],
        ),
        (
            [0.9955901, 0.5696224, 0.8272597, 0.3818746,
             0.4290036, 0.8707646, 0.8858267, 0.7557183],
            [0, 2, 7, 8],
            [4.17079659e-07, 2.46250608e-03, 2.02460933e-01, 4.48023378e-02],
        ),
    )
    for probabilities, counts, expected in reference_cases:
        computed = PoiBin(probabilities).pmf(counts)
        assert np.all(np.abs(computed - np.array(expected)) < 1e-8)
def test_check_input_prob():
    """Test that invalid probability inputs make ``PoiBin`` raise ``ValueError``.

    Covers a two-dimensional input, a negative probability and a probability
    larger than one.
    """
    invalid_inputs = (
        ([[1, 1], [1, 2]], "Input must be an one-dimensional array or a list"),
        ([1, -1], "Input probabilities have to be non negative."),
        ([1, 2], "Input probabilities have to be smaller than 1."),
    )
    for probabilities, reason in invalid_inputs:
        with pytest.raises(ValueError):
            PoiBin(probabilities)
            # Only reached when the constructor did not raise.
            pytest.fail(reason)
def win_oe_pval(win_probs, outcomes):
    """
    Compare observed wins against predicted win probabilities.

    The lower-tail probability of the observed win count is computed under a
    Poisson binomial model (one Bernoulli trial per match) and under a
    Poisson model whose mean is the expected win count. Each is turned into
    a P value by doubling whichever of ``cdf`` or ``1 - cdf`` applies.

    NOTE(review): the upper tail is taken as ``1 - cdf(observed)``, which
    excludes the observed count itself -- confirm this is intended.

    Args:
        win_probs (numpy.ndarray): 1D array of win probabilities of each match.
        outcomes (numpy.ndarray): 1D binary array of match outcomes.

    Returns:
        int: Total number of matches.
        int: Observed number of wins.
        float: Expected number of wins.
        float: Poisson binomial P value for the observed count.
        float: Poisson distribution P value for the observed count.
    """
    assert len(win_probs) == len(outcomes)

    def _doubled_tail(lower_tail):
        # Double the lower tail when it is below one half, otherwise double
        # its complement.
        if lower_tail < 0.5:
            return lower_tail * 2
        return (1 - lower_tail) * 2

    expected = np.sum(win_probs)
    observed = np.sum(outcomes)

    poibin_pval = _doubled_tail(PoiBin(win_probs).cdf(observed))
    pois_pval = _doubled_tail(scipy.stats.poisson.cdf(observed, expected))

    return len(win_probs), observed, expected, poibin_pval, pois_pval
def test_pmf_pb_binom():
    """Compare the probability mass function with the binomial limit case."""

    def four_digits(value):
        # Keep four digits after the decimal point, truncating the rest.
        return int(value * 10000)

    # Equal probabilities p_j: the Poisson Binomial distribution reduces to
    # the Binomial one, so both pmfs must agree at zero.
    equal_probs = [0.5, 0.5]
    poisson_binomial = PoiBin(equal_probs)
    binomial = binom(n=2, p=equal_probs[0])
    assert four_digits(binomial.pmf(0)) == four_digits(poisson_binomial.pmf(0))

    # Different probabilities p_j: the two distributions must differ.
    poisson_binomial = PoiBin([0.5, 0.8])
    binomial = binom(2, p=0.5)
    assert four_digits(binomial.pmf(0)) != four_digits(poisson_binomial.pmf(0))
def test_pval_pb_binom():
    """Compare the p-values with the binomial limit case.

    When every success probability is the same, the Poisson Binomial p-value
    must match the Binomial one. The probability, the number of trials and
    the queried count are all drawn at random.
    """
    prob = np.around(np.random.random_sample(), decimals=2)
    n_trials = np.random.randint(5, 500)
    count = np.random.randint(0, n_trials)

    # Binomial reference: P(X >= count) = 1 - cdf(count) + pmf(count).
    binomial = binom(n=n_trials, p=prob)
    expected_pval = 1 - binomial.cdf(count) + binomial.pmf(count)

    poisson_binomial = PoiBin([prob for _ in range(n_trials)])
    actual_pval = poisson_binomial.pval(count)

    rounded_expected = np.around(expected_pval, decimals=10)
    rounded_actual = np.around(actual_pval, decimals=10)
    assert np.all(rounded_expected == rounded_actual)
def test_pval_pb_binom():
    """Compare the p-values with the binomial limit case.

    For identical success probabilities the Poisson Binomial p-value is
    checked, to ten decimals, against the Binomial upper-tail probability.
    """
    # Random scenario: shared probability, number of trials, queried count.
    shared_p = np.around(np.random.random_sample(), decimals=2)
    trials = np.random.randint(5, 500)
    k = np.random.randint(0, trials)

    reference = binom(n=trials, p=shared_p)
    # Upper tail including k itself.
    pval_reference = 1 - reference.cdf(k) + reference.pmf(k)

    pval_under_test = PoiBin([shared_p for _ in range(trials)]).pval(k)

    assert np.all(
        np.around(pval_reference, decimals=10)
        == np.around(pval_under_test, decimals=10)
    )
def test_get_pmf_xi():
    """Test that ``get_pmf_xi`` returns the expected pmf vector.

    Each case pairs a list of success probabilities with the full pmf over
    all possible success counts.
    """
    cases = (
        ([0.2, 0.5], [0.4, 0.5, 0.1]),
        ([0.3, 0.8], [0.14, 0.62, 0.24]),
        ([0.3, 0.8, 0.3], [0.098, 0.476, 0.354, 0.072]),
    )
    for probabilities, expected_pmf in cases:
        pmf = PoiBin(probabilities).get_pmf_xi()
        assert np.all(np.abs(pmf - np.array(expected_pmf)) < 1e-10)
def test_check_rv_input():
    """Test that ``check_rv_input`` accepts valid counts and rejects invalid ones.

    Integer inputs (scalar or list) must pass. A negative value and a
    non-integer value must raise ``AssertionError``.
    """
    pb = PoiBin([1, 1])

    for valid in ([1, 2], 2):
        assert pb.check_rv_input(valid)

    rejected = (
        (-1, "Input value cannot be negative."),
        (1.7, "Input value must be an integer."),
    )
    for value, reason in rejected:
        with pytest.raises(AssertionError):
            pb.check_rv_input(value)
            # Only reached when no AssertionError was raised.
            pytest.fail(reason)
def _run_binom_test(self, alternative="null"):
    """Run the binomial test on the number of results with p < .025.

    With ``alternative="null"`` the result is the binomial survival function
    (n = ``self._n_tests['p05']``, success probability .5) evaluated at one
    below the observed count, i.e. the probability of at least that many.

    For any other ``alternative`` the result is the Poisson binomial CDF at
    the observed count. The per-test probabilities are three times the
    output of ``self._compute_prop_lower_33`` at .025, restricted to rows
    with ``p < .05``.
    """
    results = self._df_results
    family = results["family"].values
    df1, df2, p, ncp33 = results[["df1", "df2", "p", "ncp33"]].to_numpy().T
    observed_below_25 = self._n_tests['p025']

    if alternative != "null":
        expected_props = 3 * self._compute_prop_lower_33(
            .025, family, df1, df2, p, ncp33)
        significant_only = expected_props[p < .05]
        return PoiBin(significant_only).cdf(observed_below_25)

    null_distribution = binom(n=self._n_tests['p05'], p=.5)
    return null_distribution.sf(observed_below_25 - 1)
def test_pmf():
    """Test the probability mass function.

    Some outcomes are compared with reference values produced by the poibin
    R package [Rpoibin]_.
    """

    def assert_matches_reference(probabilities, counts, reference):
        # Element-wise agreement with the reference to within 1e-8.
        result = PoiBin(probabilities).pmf(counts)
        assert np.all(np.abs(result - np.array(reference)) < 1e-8)

    # Vector input yields one value per queried count.
    assert PoiBin([1, 1]).pmf([1, 2]).size == 2

    # Reference values obtained with the R poibin package [Rpoibin]_
    assert_matches_reference(
        [0.4163448, 0.3340270, 0.9689613],
        [0, 1, 2, 3],
        [0.0120647, 0.39129134, 0.46189012, 0.13475384],
    )
    assert_matches_reference(
        [0.9955901, 0.5696224, 0.8272597, 0.3818746, 0.4290036, 0.8707646,
         0.8858267, 0.7557183],
        [0, 2, 7, 8],
        [4.17079659e-07, 2.46250608e-03, 2.02460933e-01, 4.48023378e-02],
    )
def test_check_rv_input():
    """Test that ``check_rv_input`` accepts valid counts and rejects invalid ones.

    Integer inputs (scalar or list) must pass. A negative value and a
    non-integer value must raise ``AssertionError``.
    """
    p = [1, 1]
    pb = PoiBin(p)
    assert pb.check_rv_input([1, 2])
    assert pb.check_rv_input(2)
    # ``pytest.raises`` no longer accepts a ``message`` argument (removed in
    # pytest 5). The documented replacement is an explicit ``pytest.fail``
    # inside the block, reached only when nothing was raised.
    with pytest.raises(AssertionError):
        pb.check_rv_input(-1)
        pytest.fail("Input value cannot be negative.")
    with pytest.raises(AssertionError):
        pb.check_rv_input(1.7)
        pytest.fail("Input value must be an integer.")
def test_cdf_accuracy():
    """Compare accuracy of the cumulative distribution function.

    Compare the results with the ones obtained with the R poibin package
    [Rpoibin]_. Every case requires agreement to within 1e-10.
    """
    p = [0.1, 0.1]
    pb = PoiBin(p)
    assert np.all(np.abs(pb.cdf([0, 2]) - np.array([0.81, 1.])) < 1e-10)

    # The next three checks originally used ``==`` instead of ``-`` inside
    # ``np.abs``. That compared booleans with 1e-10 and so passed only when
    # the values were NOT equal.
    p = [0.5, 1.0]
    pb = PoiBin(p)
    assert np.all(np.abs(pb.cdf([1, 2]) - np.array([0.5, 1.])) < 1e-10)

    p = [0.1, 0.5]
    pb = PoiBin(p)
    assert np.all(
        np.abs(pb.cdf([0, 1, 2]) - np.array([0.45, 0.95, 1.])) < 1e-10)

    p = [0.1, 0.5, 0.7]
    pb = PoiBin(p)
    assert np.all(
        np.abs(pb.cdf([0, 1, 2]) - np.array([0.135, 0.6, 0.965])) < 1e-10)
def test_get_pmf_xi():
    """Test that the correct pmf elements are obtained from ``get_pmf_xi``."""

    def pmf_is_close(probabilities, expected):
        # True when every pmf entry is within 1e-10 of the expected value.
        difference = PoiBin(probabilities).get_pmf_xi() - np.array(expected)
        return np.all(np.abs(difference) < 1e-10)

    assert pmf_is_close([0.2, 0.5], [0.4, 0.5, 0.1])
    assert pmf_is_close([0.3, 0.8], [0.14, 0.62, 0.24])
    assert pmf_is_close([0.3, 0.8, 0.3], [0.098, 0.476, 0.354, 0.072])
def test_skew_pb_binom():
    """Compare the skew function with the binomial limit case."""

    def four_digits(value):
        # Keep four digits after the decimal point, truncating the rest.
        return int(value * 10000)

    # Equal probabilities p_j: the Poisson Binomial distribution reduces to
    # the Binomial one, so the skewness must agree.
    equal_probs = [0.5, 0.5, 0.5, 0.5]
    poisson_binomial = PoiBin(equal_probs)
    binomial = binom(n=4, p=equal_probs[0])
    assert (four_digits(binomial.stats(moments='s'))
            == four_digits(poisson_binomial.skew()))

    # Different probabilities p_j: the skewness must differ.
    poisson_binomial = PoiBin([0.5, 0.5, 0.8, 0.8])
    binomial = binom(4, p=0.5)
    assert (four_digits(binomial.stats(moments='s'))
            != four_digits(poisson_binomial.skew()))
def test_argmax_pb_binom():
    """Compare the argmax function with the binomial limit case."""
    support = [0, 1, 2, 3, 4]

    def scaled(value):
        # Same scaling and truncation as the other binomial-limit tests.
        return int(value * 10000)

    # Equal probabilities p_j: the Poisson Binomial distribution reduces to
    # the Binomial one, so the most likely count must be the same.
    equal_probs = [0.5, 0.5, 0.5, 0.5]
    poisson_binomial = PoiBin(equal_probs)
    binomial = binom(n=4, p=equal_probs[0])
    assert (scaled(np.argmax(binomial.pmf(support)))
            == scaled(poisson_binomial.argmax()))

    # Different probabilities p_j: the most likely count must differ.
    poisson_binomial = PoiBin([0.5, 0.5, 0.8, 0.8])
    binomial = binom(4, p=0.5)
    assert (scaled(np.argmax(binomial.pmf(support)))
            != scaled(poisson_binomial.argmax()))
def test_argmax():
    """Test the ``amax`` function against a fixed reference value."""
    p = [0.1, 0.1, 0.1, 0.9, 0.9, 0.9]
    pb = PoiBin(p)
    # Use the absolute deviation: the signed difference would accept any
    # result smaller than the reference.
    assert np.all(
        np.abs(pb.amax() - np.array([0.59122])) < 4 * np.finfo(float).eps)
def test_cdf_accuracy():
    """Compare accuracy of the cumulative distribution function.

    Compare the results with the ones obtained with the R poibin package
    [Rpoibin]_, requiring agreement to within 1e-10 in every case.
    """
    # (success probabilities, queried counts, reference cdf values)
    cases = (
        ([0.1, 0.1], [0, 2], [0.81, 1.]),
        ([0.5, 1.0], [1, 2], [0.5, 1.]),
        ([0.1, 0.5], [0, 1, 2], [0.45, 0.95, 1.]),
        ([0.1, 0.5, 0.7], [0, 1, 2], [0.135, 0.6, 0.965]),
    )
    for probabilities, counts, reference in cases:
        pb = PoiBin(probabilities)
        # Subtract, do not compare with ``==``. The original
        # ``np.abs(a == b) < 1e-10`` was only true where the values
        # differed, inverting the intended check.
        assert np.all(np.abs(pb.cdf(counts) - np.array(reference)) < 1e-10)
def test_skew():
    """Test the skew function against a fixed reference value."""
    p = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6]
    pb = PoiBin(p)
    # Use the absolute deviation: the signed difference would accept any
    # skewness smaller than the reference.
    assert np.all(
        np.abs(pb.skew() - np.array([0.1941243876059742]))
        < 4 * np.finfo(float).eps)
def test_get_cdf():
    """Test that ``get_cdf`` turns ``[1, 1, 1]`` into the running totals ``[1, 2, 3]``."""
    distribution = PoiBin([1, 1])
    cumulative = distribution.get_cdf([1, 1, 1])
    expected = np.array([1., 2., 3.])
    assert np.all(cumulative == expected)
# uses output by ANGSD run with options "-doPost 2 -doGeno 11"
# output: contig, position, number non-missing samples, number "hard-called" heterozygotes,
# expected num heterozygotes, probability of heterozygote majority
# (last behaves slightly differently for odd/even numbers of samples)
# requires numpy, poibin, scipy
# put provided poibin.py into your PYTHONPATH location
from sys import stdin, stdout

from numpy import array, array_split, exp
try:
    from scipy.special import logsumexp
except ImportError:  # older SciPy releases only provide it in scipy.misc
    from scipy.misc import logsumexp
from poibin import PoiBin

for line in stdin:
    line = line.strip().split("\t")
    chrom, pos = line[0], line[1]
    # Columns from the fifth on are split into one group of four values per
    # sample. Integer division keeps the group count an int on Python 3.
    # presumably each group is (genotype call, three posteriors) -- TODO confirm
    gl = array_split(array(line[4:], "float"), (len(line) - 4) // 4)
    # Samples whose first value is negative are treated as missing.
    pr_heteroz = array([x[2] for x in gl if not x[0] < 0.])
    num_heteroz = sum([int(x[0]) for x in gl if x[0] == 1])
    n_called = len(pr_heteroz)
    # Without non-missing samples the mean is undefined; report NaN instead
    # of crashing on a division by zero.
    h_expected = sum(pr_heteroz) / n_called if n_called else 'NaN'
    try:
        pois_binom = PoiBin(pr_heteroz)
        # ``//`` keeps the threshold an integer; with true division the float
        # argument would presumably be rejected by PoiBin on Python 3 and
        # every site would silently be reported as NaN.
        utail_prob = pois_binom.pval(n_called // 2 + 1)
    except Exception:
        # Best effort: sites PoiBin cannot handle are reported as NaN, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        utail_prob = 'NaN'
    stdout.write("\t".join([
        chrom, pos, str(n_called), str(num_heteroz), str(h_expected),
        str(utail_prob)
    ]) + "\n")
# fuzzy calculator of probability that heterozygotes constitute
# the majority of calls for a given site
# (for filtering out lumped paralogs)
# by Nathaniel "Nate" S. Pope ([email protected])
# uses output by ANGSD run with options "-doPost 2 -doGeno 11"
# output: contig, position, number non-missing samples, number "hard-called" heterozygotes,
# expected num heterozygotes, probability of heterozygote majority
# (last behaves slightly differently for odd/even numbers of samples)
# requires numpy, poibin, scipy
# put provided poibin.py into your PYTHONPATH location
from sys import stdin, stdout

from numpy import array, array_split, exp
try:
    from scipy.special import logsumexp
except ImportError:  # older SciPy releases only provide it in scipy.misc
    from scipy.misc import logsumexp
from poibin import PoiBin

for line in stdin:
    line = line.strip().split("\t")
    chrom, pos = line[0], line[1]
    # Everything after the fourth column is cut into groups of four values,
    # one group per sample. ``//`` keeps the group count an integer on
    # Python 3.
    # presumably each group is (genotype call, three posteriors) -- TODO confirm
    gl = array_split(array(line[4:], "float"), (len(line) - 4) // 4)
    # A negative first value marks a sample as missing.
    pr_heteroz = array([x[2] for x in gl if not x[0] < 0.])
    num_heteroz = sum([int(x[0]) for x in gl if x[0] == 1])
    n_called = len(pr_heteroz)
    # Guard against sites without any non-missing sample (division by zero).
    h_expected = sum(pr_heteroz) / n_called if n_called else 'NaN'
    try:
        pois_binom = PoiBin(pr_heteroz)
        # Integer threshold "more than half of the samples"; true division
        # would pass a float on Python 3, which PoiBin presumably rejects,
        # turning every result into NaN.
        utail_prob = pois_binom.pval(n_called // 2 + 1)
    except Exception:
        # Best effort: report NaN for sites PoiBin cannot handle, without
        # swallowing KeyboardInterrupt / SystemExit.
        utail_prob = 'NaN'
    stdout.write("\t".join([
        chrom, pos, str(n_called), str(num_heteroz), str(h_expected),
        str(utail_prob)
    ]) + "\n")
def test_mean():
    """Test the mean function: three certain successes must give a mean of 3."""
    distribution = PoiBin([0, 0, 0, 1, 1, 1])
    expected_mean = np.array([3])
    assert (distribution.mean() == expected_mean)
def test_var():
    """Test the variance function against the reference value 0.54."""
    p = [0.1, 0.1, 0.1, 0.9, 0.9, 0.9]
    pb = PoiBin(p)
    # Compare with a tolerance: the variance is a floating-point computation
    # and exact equality depends on the order of operations.
    assert np.all(np.abs(pb.var() - np.array([0.54])) < 1e-10)
def test_cdf():
    """Test the cumulative distribution function.

    For ``p = [1, 1]`` the test expects a cdf of 0 at one success and 1 at
    two successes, within four machine epsilons.
    """
    p = [1, 1]
    pb = PoiBin(p)
    eps4 = 4 * np.finfo(float).eps
    # Take absolute deviations; a signed difference lets results that are
    # too small slip through.
    deviation = np.abs(pb.cdf([1, 2]) - np.array([0., 1.]))
    assert np.all(deviation < eps4)
    assert np.abs(pb.cdf(2) - 1.) < eps4
for i in range(len(justices)): feature_master = pd.DataFrame.from_records(feature_info_master, columns = feature_columns_info_master) feature_master.to_csv('OutcomeReport_{}_FeatureImportInfo{}.csv'.format(unique_report, current_justice), mode = 'w+') master_probas = master_probas.fillna(2) ps = dict.fromkeys(list(master_probas.index.values), 0) for ind, row in master_probas.iterrows(): lista = [] for c in master_probas.columns: if row[c] != 2: lista.append(row[c]) ps[ind] = lista outcomes = {} for k in ps.keys(): pb = PoiBin(ps[k]) if len(ps[k]) == 9: outcomes[k] = sum(pb.pmf([5, 6, 7, 8, 9])) elif len(ps[k]) == 8: outcomes[k] = sum(pb.pmf([5, 6, 7, 8])) elif len(ps[k]) == 7: outcomes[k] = sum(pb.pmf([4, 5, 6, 7])) elif len(ps[k]) == 6: outcomes[k] = sum(pb.pmf([4, 5, 6])) elif len(ps[k]) == 5: outcomes[k] = sum(pb.pmf([3, 4, 5])) elif len(ps[k]) == 4: outcomes[k] = sum(pb.pmf([3, 4])) elif len(ps[k]) == 3: outcomes[k] = sum(pb.pmf([2, 3])) elif len(ps[k]) == 2:
def calculate_pov_exact(self):
    """Return one minus the Poisson binomial CDF at ``floor(n / 2)``.

    ``self.theta_T`` is first replaced by ``round_probabilities(self.theta_T)``
    and then used as the success probabilities of the distribution.
    Assuming ``PoiBin.cdf`` is the usual cumulative distribution function,
    the result is the probability of more than ``floor(n / 2)`` successes.
    """
    # Side effect: the rounded probabilities are stored back on the instance.
    self.theta_T = round_probabilities(self.theta_T)
    distribution = PoiBin(self.theta_T)
    half = math.floor(self.n / 2)
    prob_at_most_half = distribution.cdf(half)
    return 1 - prob_at_most_half
def test_std():
    """Test the standard deviation function against ``sqrt(0.54)``."""
    p = [0.1, 0.1, 0.1, 0.9, 0.9, 0.9]
    pb = PoiBin(p)
    # Compare with a tolerance: exact equality of two independently computed
    # floating-point square roots is not guaranteed.
    assert np.all(np.abs(pb.std() - np.sqrt(0.54)) < 1e-10)