def test_heterozygosity_observed(self):
    """Check per-variant observed heterozygosity for 2- and 3-ploid calls.

    Both genotype arrays hold 11 variants x 2 samples and are built so
    that they share one expected result: the fraction of called samples
    that are heterozygous at each variant.  Samples whose alleles are
    all ``-1`` are left out of the rate, and a variant with no called
    sample at all is reported as the ``fill`` value (``-1`` here).
    """
    # One expected vector serves both ploidy levels.
    expected = [0, 0, 0, .5, .5, .5, 1, 1, 0, 1, -1]

    # diploid
    diploid = GenotypeArray(
        [
            [[0, 0], [0, 0]],
            [[1, 1], [1, 1]],
            [[1, 1], [2, 2]],
            [[0, 0], [0, 1]],
            [[0, 0], [0, 2]],
            [[1, 1], [1, 2]],
            [[0, 1], [0, 1]],
            [[0, 1], [1, 2]],
            [[0, 0], [-1, -1]],
            [[0, 1], [-1, -1]],
            [[-1, -1], [-1, -1]],
        ],
        dtype='i1',
    )
    aeq(expected, allel.heterozygosity_observed(diploid, fill=-1))

    # polyploid
    polyploid = GenotypeArray(
        [
            [[0, 0, 0], [0, 0, 0]],
            [[1, 1, 1], [1, 1, 1]],
            [[1, 1, 1], [2, 2, 2]],
            [[0, 0, 0], [0, 0, 1]],
            [[0, 0, 0], [0, 0, 2]],
            [[1, 1, 1], [0, 1, 2]],
            [[0, 0, 1], [0, 1, 1]],
            [[0, 1, 1], [0, 1, 2]],
            [[0, 0, 0], [-1, -1, -1]],
            [[0, 0, 1], [-1, -1, -1]],
            [[-1, -1, -1], [-1, -1, -1]],
        ],
        dtype='i1',
    )
    aeq(expected, allel.heterozygosity_observed(polyploid, fill=-1))
def test_observed_heterozygosity__scikit_allel_comparison(
    n_variant, n_sample, missing_pct, window_size, seed
):
    """Compare windowed observed heterozygosity against scikit-allel.

    A diploid call dataset is simulated, every sample is assigned to a
    single cohort (index 0), and the dataset is windowed with
    ``window_size``.  The per-window ``stat_observed_heterozygosity``
    values must match the per-variant scikit-allel statistic summed
    over windows of the same size.

    Parameters
    ----------
    n_variant, n_sample : int
        Dimensions of the simulated dataset.
    missing_pct : float
        Passed to the simulator as the missing-call proportion.
    window_size : int
        Window size used for both implementations.
    seed : int
        Seed passed to the simulator.
    """
    ds = simulate_genotype_call_dataset(
        n_variant=n_variant,
        n_sample=n_sample,
        n_ploidy=2,
        missing_pct=missing_pct,
        seed=seed,
    )
    # Put every sample in cohort 0 so there is a single cohort column.
    ds["sample_cohort"] = (["samples"], np.zeros(n_sample, int))
    ds = window(ds, size=window_size)
    ho_sg = observed_heterozygosity(ds)["stat_observed_heterozygosity"].values

    # The statistic is windowed along the variants axis (the scikit-allel
    # input below has one value per variant), so a ragged final window
    # exists exactly when n_variant -- not n_sample -- is not a multiple
    # of the window size.
    if n_variant % window_size:
        # scikit-allel will drop the ragged end
        ho_sg = ho_sg[:-1]

    # calculate with scikit-allel
    ho_sa = allel.moving_statistic(
        allel.heterozygosity_observed(ds["call_genotype"]),
        np.sum,
        size=window_size,
    )

    # add cohort dimension to scikit-allel result
    np.testing.assert_almost_equal(ho_sg, ho_sa[..., None])
# Build haplotype / genotype views from the tree sequence `ts1`.
haps = np.array(ts1.genotype_matrix())
positions = np.array([site.position for site in ts1.sites()])
genotypes = allel.HaplotypeArray(haps).to_genotypes(ploidy=2)
allele_counts = genotypes.count_alleles()
subpop_allele_counts = genotypes.count_alleles_subpops(subpops=pops)
genotype_allele_counts = genotypes.to_allele_counts()

## SNP stats
# Number of rows (sites) in the genotype array.
segsites = np.shape(genotypes)[0]
pi = allel.sequence_diversity(positions, allele_counts, start=1, stop=1e7)
# tajima_d only honours start/stop when `pos` is also given; without it
# the region bounds are silently ignored and D would be computed over all
# sites, inconsistent with pi and thetaW.
tajD = allel.tajima_d(ac=allele_counts, pos=positions, start=1, stop=1e7)
thetaW = allel.watterson_theta(pos=positions, ac=allele_counts, start=1, stop=1e7)
het_o = np.mean(allel.heterozygosity_observed(genotypes))
# Two groups of sample indices (0-19 and 20-39); the third argument (100)
# is the block length used by scikit-allel's block-jackknife.  Only the
# first element of the returned tuple (the Fst estimate) is kept.
fst = allel.stats.fst.average_weir_cockerham_fst(
    genotypes,
    [list(range(20)), list(range(20, 40))],
    100,
)[0]
dxy = allel.stats.diversity.sequence_divergence(
    positions, subpop_allele_counts['0'], subpop_allele_counts['1'])

## Identical-by-state haplotype block length stats (i.e. LD-type stats)
# between population comparisons: every (pop '0', pop '1') index pair
bw_pairs = list(itertools.product(pops['0'], pops['1']))
ibs_bw = []
def traditional_stats(data):
    """
    Calculates lots of (mostly) traditional statistics,
    that are summaries of the site frequency spectrum.

    Arguments
    ---------
    data: Named tuple of results (made by collate_results function).
        The attributes read here are ``positions``, ``allele_counts``
        and ``genotypes``; the latter two are indexed by population name.

    Returns
    ---------
    Nested dictionary of statistics, keyed first by statistic name and
    then by population name (one- and three-way statistics) or by
    comparison name such as "domestic_wild" (two-way statistics).
    """
    pop_names = ["domestic", "wild", "captive", "all_pops"]
    stat_names = [
        "sfs_mean",
        "diversity",
        "wattersons_theta",
        "tajimas_d",
        "observed_heterozygosity",
        "expected_heterozygosity",
        "segregating_sites",
        "monomorphic_sites",
        "roh_mean",
        "roh_iqr",
        "r2",
        "f3",
        "divergence",
        "fst",
        "f2",
    ]
    # Every statistic starts as an empty dict; some are not filled in here.
    stats = {name: {} for name in stat_names}

    for pop in pop_names:
        ac = data.allele_counts[pop]

        # One way statistics
        stats["sfs_mean"][pop] = binned_sfs_mean(ac)
        stats["diversity"][pop] = allel.sequence_diversity(data.positions, ac)
        stats["wattersons_theta"][pop] = allel.watterson_theta(
            data.positions, ac)
        stats["tajimas_d"][pop] = allel.tajima_d(ac, data.positions)
        stats["observed_heterozygosity"][pop] = allel.heterozygosity_observed(
            data.genotypes[pop]).mean()
        stats["expected_heterozygosity"][pop] = allel.heterozygosity_expected(
            ac.to_frequencies(), ploidy=2).mean()
        # Store per population; assigning to stats["segregating_sites"]
        # directly would replace the whole dict with a single number.
        stats["segregating_sites"][pop] = ac.count_segregating()

        if pop != "all_pops":  # all_pops has no monomorphic sites
            stats["monomorphic_sites"][pop] = ac.count_non_segregating()

            # Three way statistics: the remaining two real populations are
            # passed as the second and third arguments of patterson_f3.
            # NOTE(review): f3 is computed only for the three real
            # populations (not "all_pops") -- confirm this matches intent.
            other_pops = [
                pop_name for pop_name in pop_names
                if pop_name not in ["all_pops", pop]
            ]
            t, b = allel.patterson_f3(ac,
                                      data.allele_counts[other_pops[0]],
                                      data.allele_counts[other_pops[1]])
            stats["f3"][pop] = np.sum(t) / np.sum(b)

    # Two way statistics
    for comparison in ["domestic_wild", "domestic_captive", "wild_captive"]:
        first, second = comparison.split("_")
        ac_first = data.allele_counts[first]
        ac_second = data.allele_counts[second]

        stats["divergence"][comparison] = allel.sequence_divergence(
            data.positions, ac_first, ac_second)
        # Ratio of sums over the per-variant numerators and denominators.
        num, den = allel.hudson_fst(ac_first, ac_second)
        stats["fst"][comparison] = np.sum(num) / np.sum(den)
        stats["f2"][comparison] = allel.patterson_f2(ac_first,
                                                     ac_second).mean()

    return stats