예제 #1
0
        samples=metadata,
        numbers=numbers,
        ploidy=ploidy,
        qualflt=qualflt,
        missingfltprop=missingprop)

    #### Genome-wide statistics (seqDiv, Wattersons Theta, LD, inbreeding coefficient) ####
    # NOTE(review): the heading mentions LD, but no LD statistic is computed in
    # the lines visible here -- confirm whether it is handled elsewhere.
    # Per-population results, keyed by the values of metadata['treatment'].
    seqdivdict = {}
    thetadict = {}
    coefdict = {}
    allcoef = defaultdict(list)

    for pop in metadata['treatment'].unique():

        # Sequence diversity
        seqdivdict[pop] = allel.sequence_diversity(pos, acsubpops[pop])

        # Wattersons theta
        thetadict[pop] = allel.watterson_theta(pos, acsubpops[pop])

        # Inbreeding coefficient
        # Only computed when ploidy > 1; otherwise coefdict and allcoef get no
        # entry for this population.
        if ploidy > 1:
            # Select this population's columns (axis=1) from the genotype data.
            gn = geno.take(subpops[pop], axis=1)
            # Apply allel.inbreeding_coefficient over moving windows
            # (size=1000, step=100).
            coef = allel.moving_statistic(
                gn,
                statistic=allel.inbreeding_coefficient,
                size=1000,
                step=100)
            # NaN-ignoring mean along axis 1, then a plain mean over the result.
            # NOTE(review): np.mean (not nanmean) is used for the overall value,
            # so a single all-NaN row would presumably make coefdict[pop] NaN --
            # confirm this is intended.
            coef = np.nanmean(coef, axis=1)
            coefdict[pop] = np.mean(coef)
            allcoef[pop].append(np.array(coef))
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
    '1': [
        20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
        38, 39
    ]
}
# Genotype matrix and per-site positions taken from the `ts1` object.
haps = np.array(ts1.genotype_matrix())
positions = np.array([s.position for s in ts1.sites()])
# NOTE(review): to_genotypes(ploidy=2) should pair haplotype columns into
# diploid samples, so `genotypes` would have half as many samples as `haps`
# has columns -- confirm that the sample indices 0-39 used below (and in
# `pops`) are valid for the resulting array.
genotypes = allel.HaplotypeArray(haps).to_genotypes(ploidy=2)
# Allele counts over all samples, per subpopulation in `pops`, and per genotype.
allele_counts = genotypes.count_alleles()
subpop_allele_counts = genotypes.count_alleles_subpops(subpops=pops)
genotype_allele_counts = genotypes.to_allele_counts()

##SNP stats
# Size of the first axis of the genotype array.
segsites = np.shape(genotypes)[0]
# Diversity and Watterson's theta are evaluated with start=1, stop=1e7.
pi = allel.sequence_diversity(positions, allele_counts, start=1, stop=1e7)
# NOTE(review): no `pos` is passed to tajima_d here, so start/stop may have no
# effect on which variants are used -- check against the scikit-allel docs.
tajD = allel.tajima_d(ac=allele_counts, start=1, stop=1e7)
thetaW = allel.watterson_theta(pos=positions,
                               ac=allele_counts,
                               start=1,
                               stop=1e7)
# Mean of the observed heterozygosity values.
het_o = np.mean(allel.heterozygosity_observed(genotypes))
# Weir & Cockerham Fst between sample indices 0-19 and 20-39 with a block
# length of 100; only the first element of the returned tuple is kept.
fst = allel.stats.fst.average_weir_cockerham_fst(
    genotypes,
    [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
     [
         20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
         37, 38, 39
     ]], 100)[0]
dxy = allel.stats.diversity.sequence_divergence(positions,
                                                subpop_allele_counts['0'],
예제 #3
0
        # fst and dxy per gene between each comparison
        # `gene_bool` selects the rows of the allele-count arrays (and entries
        # of `pos`) used for the current gene; it is defined outside this view.
        for comp1,comp2 in comparisons:
            name = comp1 + "_" + comp2
            ac1 = acsubpops[comp1].compress(gene_bool, axis=0)
            ac2 = acsubpops[comp2].compress(gene_bool, axis=0)

            # Keep the first two returned values (stored as fst and its se);
            # the remaining two are discarded. Block length is 1.
            fst_per_comp[name], se_per_comp[name],_,_= allel.average_hudson_fst(ac1, ac2, blen=1)
            
            dxy_per_comp[name] = allel.sequence_divergence(pos[gene_bool], ac1, ac2)

        # tajimas d and sequence diversity per gene for each subpop(i.e treatment)
        for subpop in subpops:
            # NOTE(review): unlike the calls above, compress() is called without
            # axis=0 here -- confirm the default axis gives the same selection.
            ac = acsubpops[subpop].compress(gene_bool)
            genepos = pos[gene_bool]
            tajd_per_pop[subpop] = allel.tajima_d(ac=ac, pos=genepos)
            gdiv_per_pop[subpop] = allel.sequence_diversity(ac=ac, pos=genepos)

        # pbs for each gene for each pbc comparison as defined in config.yaml
        if pbs is True:
            for pbscomp in pbscomps:
                # Comparison names are split on "_" into exactly three parts:
                # pop1, pop2 and the outgroup population.
                pop1, pop2, outpop = pbscomp.split("_")
                # NOTE(review): `se` is assigned but not used in the visible code.
                pbs_per_comp[pbscomp],se,_,_ = rnaseqpop.meanPBS(acsubpops[pop1].compress(gene_bool, axis=0),
                                          acsubpops[pop2].compress(gene_bool, axis=0),
                                          acsubpops[outpop].compress(gene_bool, axis=0),
                                                     window_size=1,
                                                    normalise=True)
        # store inner dict in outer dicts
        # dict(...) takes a shallow copy, so the stored results are separate
        # objects from the working dicts.
        fst_per_gene[ID] = dict(fst_per_comp)
        se_per_gene[ID] = dict(se_per_comp)
        if pbs is True : pbs_per_gene[ID] = dict(pbs_per_comp)
        tajd_per_gene[ID] = dict(tajd_per_pop)
# Boolean masks over the entries of oc_effvars_seg_typ, by annotated effect.
varbool_syn   = np.asarray([oc_effvars_seg_typ[i] == "synonymous_variant" for i,_ in enumerate(oc_effvars_seg_typ)])
# NOTE(review): despite its name, this mask is True for synonymous OR missense
# variants, so "pi_n" below also includes synonymous sites -- confirm this is
# intended before interpreting the pi_n / pi_s ratio.
varbool_nosyn = np.logical_or(varbool_syn , np.asarray([oc_effvars_seg_typ[i] == "missense_variant" for i,_ in enumerate(oc_effvars_seg_typ)]))

# The variant masks, haplotype subsets and position subsets do not depend on
# the cluster or on the sample being left out, so they are built once here
# rather than on every iteration of the loops below.
j_mask_n = np.logical_and(varbool_loc, varbool_nosyn)
j_mask_s = np.logical_and(varbool_loc, varbool_syn)
j_hap_n  = oc_haploty_hap_seg.subset(sel0=j_mask_n)
j_hap_s  = oc_haploty_hap_seg.subset(sel0=j_mask_s)
j_pos_n  = oc_hapvars_seg["POS"].subset(sel0=j_mask_n)
j_pos_s  = oc_hapvars_seg["POS"].subset(sel0=j_mask_s)

for popi in ["cluster_103","cluster_231","cluster_233"]:

    # One leave-one-out replicate per sample listed for this cluster.
    j_run    = len(popdich_clu[popi])
    j_pi_s   = np.zeros(shape=j_run)
    j_pi_n   = np.zeros(shape=j_run)
    j_pi_n_s = np.zeros(shape=j_run)
    for i in range(j_run):
        # All of the cluster's samples except the i-th one.
        j_sel1      = popdich_clu[popi][0:i] + popdich_clu[popi][i+1:j_run]
        j_dic       = dict()
        j_dic[popi] = j_sel1
        j_dic_n_ac  = j_hap_n.count_alleles_subpops(subpops=j_dic)
        j_dic_s_ac  = j_hap_s.count_alleles_subpops(subpops=j_dic)
        j_pi_n[i]   = allel.sequence_diversity(pos=j_pos_n, ac=j_dic_n_ac[popi])
        j_pi_s[i]   = allel.sequence_diversity(pos=j_pos_s, ac=j_dic_s_ac[popi])
        j_pi_n_s[i] = j_pi_n[i] / j_pi_s[i]

    # Report mean, SE, CI bounds and n for each statistic across replicates.
    j_av,j_se,j_cl,j_cu,j_nu = mean_se_ci_report(j_pi_n)
    print("pi_n %s \t = %.3E +/- %.3E SE, %.3E-%.3E CI95, n=%i" % (popi, j_av, j_se, j_cl, j_cu, j_nu))
    j_av,j_se,j_cl,j_cu,j_nu = mean_se_ci_report(j_pi_s)
    print("pi_s %s \t = %.3E +/- %.3E SE, %.3E-%.3E CI95, n=%i" % (popi, j_av, j_se, j_cl, j_cu, j_nu))
    j_av,j_se,j_cl,j_cu,j_nu = mean_se_ci_report(j_pi_n_s)
    print("r_n_s %s\t = %.3E +/- %.3E SE, %.3E-%.3E CI95, n=%i" % (popi, j_av, j_se, j_cl, j_cu, j_nu))



# Can we phase additional mutations?
# If we find variants that are in high linkage disequilibrium with specimens that are 0 or 2 for the P4 mutation, we can use them to infer whether a particular haplotype has ZZB, dups, or indels.
# First, load info of extra mutations:
예제 #5
0
def traditional_stats(data):
    """
    Calculates lots of (mostly) traditional statistics,
    that are summaries of the site frequency spectrum.

    Arguments
    ---------
    data: Named tuple of results (made by collate_results function).
        This function reads ``data.positions``, and ``data.allele_counts[pop]``
        and ``data.genotypes[pop]`` for each of the populations "domestic",
        "wild", "captive" and "all_pops".

    Returns
    ---------
    Nested dictionary of statistics: the outer key is the statistic name and
    the inner key is a population name (one-way and three-way statistics) or
    a "popA_popB" comparison name (two-way statistics). The "roh_mean",
    "roh_iqr" and "r2" entries are created empty and are not filled in here.
    """
    pop_names = ["domestic", "wild", "captive", "all_pops"]

    stats = {
        "sfs_mean": {},
        "diversity": {},
        "wattersons_theta": {},
        "tajimas_d": {},
        "observed_heterozygosity": {},
        "expected_heterozygosity": {},
        "segregating_sites": {},
        "monomorphic_sites": {},
        "roh_mean": {},
        "roh_iqr": {},
        "r2": {},
        "f3": {},
        "divergence": {},
        "fst": {},
        "f2": {},
    }

    for pop in pop_names:
        allele_counts = data.allele_counts[pop]

        # One way statistics
        stats["sfs_mean"][pop] = binned_sfs_mean(allele_counts)
        stats["diversity"][pop] = allel.sequence_diversity(
            data.positions, allele_counts)
        stats["wattersons_theta"][pop] = allel.watterson_theta(
            data.positions, allele_counts)
        stats["tajimas_d"][pop] = allel.tajima_d(allele_counts,
                                                 data.positions)
        stats["observed_heterozygosity"][pop] = allel.heterozygosity_observed(
            data.genotypes[pop]).mean()
        stats["expected_heterozygosity"][pop] = allel.heterozygosity_expected(
            allele_counts.to_frequencies(), ploidy=2).mean()
        # Store per population. Assigning to stats["segregating_sites"]
        # directly would replace the inner dict with a single number.
        stats["segregating_sites"][pop] = allele_counts.count_segregating()

        if pop != "all_pops":  # all_pops has no monomorphic sites
            stats["monomorphic_sites"][pop] = (
                allele_counts.count_non_segregating())

            # Three way statistics
            # f3 with `pop` as the first argument and the two remaining
            # single populations as the second and third.
            other_pops = [
                pop_name for pop_name in pop_names
                if pop_name not in ["all_pops", pop]
            ]
            t, b = allel.patterson_f3(allele_counts,
                                      data.allele_counts[other_pops[0]],
                                      data.allele_counts[other_pops[1]])
            stats["f3"][pop] = np.sum(t) / np.sum(b)

    # Two way statistics
    for comparison in ["domestic_wild", "domestic_captive", "wild_captive"]:
        p = comparison.split("_")
        ac_a = data.allele_counts[p[0]]
        ac_b = data.allele_counts[p[1]]

        stats["divergence"][comparison] = allel.sequence_divergence(
            data.positions, ac_a, ac_b)

        # Fst is taken as the ratio of the summed numerators to the summed
        # denominators returned by hudson_fst.
        num, den = allel.hudson_fst(ac_a, ac_b)
        stats["fst"][comparison] = np.sum(num) / np.sum(den)
        stats["f2"][comparison] = allel.patterson_f2(ac_a, ac_b).mean()

    return stats