Beispiel #1
0
    def test_heterozygosity_observed(self):
        """Check per-variant observed heterozygosity for two ploidies.

        Every row of the input is one variant holding two sample calls. The
        diploid and triploid matrices are constructed so that both produce
        the same expected values. ``fill=-1`` is passed, and the last row
        (all calls missing) is expected to come back as -1.
        """
        # Rows 0-2: no heterozygous call; rows 3-5: one of two calls is
        # heterozygous; rows 6-7: both are; rows 8-9: one call is missing;
        # row 10: both calls are missing.
        expected = [0, 0, 0, .5, .5, .5, 1, 1, 0, 1, -1]

        diploid_calls = [
            [[0, 0], [0, 0]], [[1, 1], [1, 1]], [[1, 1], [2, 2]],
            [[0, 0], [0, 1]], [[0, 0], [0, 2]], [[1, 1], [1, 2]],
            [[0, 1], [0, 1]], [[0, 1], [1, 2]], [[0, 0], [-1, -1]],
            [[0, 1], [-1, -1]], [[-1, -1], [-1, -1]],
        ]
        triploid_calls = [
            [[0, 0, 0], [0, 0, 0]], [[1, 1, 1], [1, 1, 1]],
            [[1, 1, 1], [2, 2, 2]], [[0, 0, 0], [0, 0, 1]],
            [[0, 0, 0], [0, 0, 2]], [[1, 1, 1], [0, 1, 2]],
            [[0, 0, 1], [0, 1, 1]], [[0, 1, 1], [0, 1, 2]],
            [[0, 0, 0], [-1, -1, -1]], [[0, 0, 1], [-1, -1, -1]],
            [[-1, -1, -1], [-1, -1, -1]],
        ]

        # Diploid first, then polyploid, mirroring the order of the checks.
        for calls in (diploid_calls, triploid_calls):
            genotypes = GenotypeArray(calls, dtype='i1')
            observed = allel.heterozygosity_observed(genotypes, fill=-1)
            aeq(expected, observed)
Beispiel #2
0
def test_observed_heterozygosity__scikit_allel_comparison(
        n_variant, n_sample, missing_pct, window_size, seed):
    """Compare windowed observed heterozygosity with scikit-allel.

    A diploid dataset is simulated, all samples are placed in a single
    cohort (index 0), and the dataset is windowed with ``window_size``.
    The resulting ``stat_observed_heterozygosity`` values are compared with
    scikit-allel's per-variant observed heterozygosity summed over
    non-overlapping windows of the same size.

    Parameters
    ----------
    n_variant, n_sample, missing_pct, seed
        Forwarded to ``simulate_genotype_call_dataset``.
    window_size
        Window size used for both the dataset windowing and
        ``allel.moving_statistic``.
    """
    ds = simulate_genotype_call_dataset(
        n_variant=n_variant,
        n_sample=n_sample,
        n_ploidy=2,
        missing_pct=missing_pct,
        seed=seed,
    )
    # Assign every sample to cohort 0 so there is exactly one cohort.
    ds["sample_cohort"] = (
        ["samples"],
        np.zeros(n_sample, int),
    )
    ds = window(ds, size=window_size)
    ho_sg = observed_heterozygosity(ds)["stat_observed_heterozygosity"].values
    # The scikit-allel statistic below is windowed over an array with one
    # entry per variant, so whether a ragged final window exists depends on
    # the number of variants, not the number of samples.
    if n_variant % window_size:
        # scikit-allel will drop the ragged end
        ho_sg = ho_sg[0:-1]
    # calculate with scikit-allel
    ho_sa = allel.moving_statistic(
        allel.heterozygosity_observed(ds["call_genotype"]),
        np.sum,
        size=window_size,
    )
    # add cohort dimension to scikit-allel result
    np.testing.assert_almost_equal(ho_sg, ho_sa[..., None])
# Build scikit-allel containers from the simulation output.
# NOTE(review): `ts1` and `pops` are defined outside this excerpt; `ts1` is
# presumably a tskit tree sequence and `pops` a mapping with keys '0' and
# '1' holding sample indices -- confirm against the earlier code.
haps = np.array(ts1.genotype_matrix())
positions = np.array([s.position for s in ts1.sites()])
# Reinterpret the haplotypes as diploid genotype calls.
genotypes = allel.HaplotypeArray(haps).to_genotypes(ploidy=2)
allele_counts = genotypes.count_alleles()
subpop_allele_counts = genotypes.count_alleles_subpops(subpops=pops)
genotype_allele_counts = genotypes.to_allele_counts()

## SNP stats
# First axis of the genotype array is used as the number of sites.
segsites = np.shape(genotypes)[0]
pi = allel.sequence_diversity(positions, allele_counts, start=1, stop=1e7)
# NOTE(review): start/stop are passed here without `pos`, unlike pi and
# thetaW; check whether scikit-allel applies the region limits in that case.
tajD = allel.tajima_d(ac=allele_counts, start=1, stop=1e7)
thetaW = allel.watterson_theta(pos=positions,
                               ac=allele_counts,
                               start=1,
                               stop=1e7)
# Mean over sites of the per-site observed heterozygosity.
het_o = np.mean(allel.heterozygosity_observed(genotypes))
# Weir & Cockerham Fst between sample indices 0-19 and 20-39, with 100 as
# the third positional argument; [0] keeps the first element of the result.
fst = allel.stats.fst.average_weir_cockerham_fst(
    genotypes,
    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
     [
         20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
         37, 38, 39
     ]], 100)[0]
# Divergence between subpopulations '0' and '1'.
# NOTE(review): no start/stop is given here, unlike pi and thetaW above;
# confirm the two are meant to be computed over the same region.
dxy = allel.stats.diversity.sequence_divergence(positions,
                                                subpop_allele_counts['0'],
                                                subpop_allele_counts['1'])

## Identical-by-state haplotype block length stats (i.e. LD-type stats)
# Between-population comparisons: every pairing of a member of pops['0']
# with a member of pops['1'].
bw_pairs = list(itertools.product(pops['0'], pops['1']))
# Starts empty here; nothing in this excerpt adds to it.
ibs_bw = []
Beispiel #4
0
def traditional_stats(data):
    """
    Calculates lots of (mostly) traditional statistics,
    that are summaries of the site frequency spectrum.

    Arguments
    ---------
    data: Named tuple of results (made by collate_results function).
        This function reads ``data.positions`` and, for each of
        "domestic", "wild", "captive" and "all_pops",
        ``data.allele_counts[pop]`` and ``data.genotypes[pop]``.

    Returns
    ---------
    Nested dictionary of statistics, keyed first by statistic name and
    then by population name (one-way and three-way statistics) or by
    comparison name such as "domestic_wild" (two-way statistics).
    The "roh_mean", "roh_iqr" and "r2" entries are created empty and are
    not filled in by this function.
    """
    pop_names = ["domestic", "wild", "captive", "all_pops"]

    stats = {
        "sfs_mean": {},
        "diversity": {},
        "wattersons_theta": {},
        "tajimas_d": {},
        "observed_heterozygosity": {},
        "expected_heterozygosity": {},
        "segregating_sites": {},
        "monomorphic_sites": {},
        "roh_mean": {},
        "roh_iqr": {},
        "r2": {},
        "f3": {},
        "divergence": {},
        "fst": {},
        "f2": {},
    }

    for pop in pop_names:
        ac = data.allele_counts[pop]

        # One way statistics
        stats["sfs_mean"][pop] = binned_sfs_mean(ac)
        stats["diversity"][pop] = allel.sequence_diversity(
            data.positions, ac)
        stats["wattersons_theta"][pop] = allel.watterson_theta(
            data.positions, ac)
        stats["tajimas_d"][pop] = allel.tajima_d(ac, data.positions)
        stats["observed_heterozygosity"][pop] = allel.heterozygosity_observed(
            data.genotypes[pop]).mean()
        stats["expected_heterozygosity"][pop] = allel.heterozygosity_expected(
            ac.to_frequencies(), ploidy=2).mean()
        # Store per population; assigning to stats["segregating_sites"]
        # directly would replace the dict with a single scalar.
        stats["segregating_sites"][pop] = ac.count_segregating()

        if pop != "all_pops":  # all_pops has no monomorphic sites
            stats["monomorphic_sites"][pop] = ac.count_non_segregating()

            # Three way statistics: this population against the two other
            # named populations (never "all_pops").
            other_pops = [
                pop_name for pop_name in pop_names
                if pop_name not in ["all_pops", pop]
            ]
            t, b = allel.patterson_f3(ac,
                                      data.allele_counts[other_pops[0]],
                                      data.allele_counts[other_pops[1]])
            stats["f3"][pop] = np.sum(t) / np.sum(b)

    # Two way statistics
    for comparison in ["domestic_wild", "domestic_captive", "wild_captive"]:
        # Comparison names are "<pop_a>_<pop_b>".
        p = comparison.split("_")
        ac_a = data.allele_counts[p[0]]
        ac_b = data.allele_counts[p[1]]

        stats["divergence"][comparison] = allel.sequence_divergence(
            data.positions, ac_a, ac_b)

        # Ratio of the summed numerator and denominator terms.
        num, den = allel.hudson_fst(ac_a, ac_b)
        stats["fst"][comparison] = np.sum(num) / np.sum(den)
        stats["f2"][comparison] = allel.patterson_f2(ac_a, ac_b).mean()

    return stats