def test_sequence_divergence(self):
    """Exercise ``sequence_divergence`` on two populations with opposite alleles.

    Three variants at positions 2, 4 and 8 are used; the first population
    carries only the first allele at every variant and the second
    population carries only the second allele. The result is checked over
    all variants, over the window ``start=0, stop=5``, and over the same
    window with explicitly supplied per-variant allele-number totals.
    """
    from allel import sequence_divergence

    positions = [2, 4, 8]
    counts_a = AlleleCountsArray([[2, 0], [2, 0], [2, 0]])
    counts_b = AlleleCountsArray([[0, 2], [0, 2], [0, 2]])

    # No window: every variant contributes.
    expected = 3 / 7
    actual = sequence_divergence(positions, counts_a, counts_b)
    assert expected == actual

    # Explicit window; both windowed checks below share the same expectation.
    window = dict(start=0, stop=5)
    expected = 2 / 6
    actual = sequence_divergence(positions, counts_a, counts_b, **window)
    assert expected == actual

    # Same window, with the per-variant allele totals passed in by the caller.
    totals_a = counts_a.sum(axis=1)
    totals_b = counts_b.sum(axis=1)
    actual = sequence_divergence(
        positions, counts_a, counts_b, an1=totals_a, an2=totals_b, **window
    )
    assert expected == actual
continue # store number of snps per gene n_dict[ID] = nsnps # store midpoint positions of gene pos_dict[ID] = (gene['start'] + gene['end'])/2 # fst and dxy per gene between each comparison for comp1,comp2 in comparisons: name = comp1 + "_" + comp2 ac1 = acsubpops[comp1].compress(gene_bool, axis=0) ac2 = acsubpops[comp2].compress(gene_bool, axis=0) fst_per_comp[name], se_per_comp[name],_,_= allel.average_hudson_fst(ac1, ac2, blen=1) dxy_per_comp[name] = allel.sequence_divergence(pos[gene_bool], ac1, ac2) # tajimas d and sequence diversity per gene for each subpop(i.e treatment) for subpop in subpops: ac = acsubpops[subpop].compress(gene_bool) genepos = pos[gene_bool] tajd_per_pop[subpop] = allel.tajima_d(ac=ac, pos=genepos) gdiv_per_pop[subpop] = allel.sequence_diversity(ac=ac, pos=genepos) # pbs for each gene for each pbc comparison as defined in config.yaml if pbs is True: for pbscomp in pbscomps: pop1, pop2, outpop = pbscomp.split("_") pbs_per_comp[pbscomp],se,_,_ = rnaseqpop.meanPBS(acsubpops[pop1].compress(gene_bool, axis=0), acsubpops[pop2].compress(gene_bool, axis=0), acsubpops[outpop].compress(gene_bool, axis=0),
def traditional_stats(data):
    """
    Calculates lots of (mostly) traditional statistics,
    that are summaries of the site frequency spectrum.

    Arguments
    ---------
    data: Named tuple of results (made by collate_results function).
        This function reads ``data.positions`` and, for each of the
        populations "domestic", "wild", "captive" and "all_pops",
        ``data.allele_counts[pop]`` and ``data.genotypes[pop]``.

    Returns
    ---------
    Nested dictionary of statistics. The outer key is the statistic name;
    the inner key is a population name (one-way and three-way statistics)
    or a comparison name such as "domestic_wild" (two-way statistics).
    """
    pop_names = ["domestic", "wild", "captive", "all_pops"]
    comparisons = ["domestic_wild", "domestic_captive", "wild_captive"]

    stats = {
        "sfs_mean": {},
        "diversity": {},
        "wattersons_theta": {},
        "tajimas_d": {},
        "observed_heterozygosity": {},
        "expected_heterozygosity": {},
        "segregating_sites": {},
        "monomorphic_sites": {},
        "roh_mean": {},
        "roh_iqr": {},
        "r2": {},
        "f3": {},
        "divergence": {},
        "fst": {},
        "f2": {},
    }

    for pop in pop_names:
        ac = data.allele_counts[pop]

        # One way statistics
        stats["sfs_mean"][pop] = binned_sfs_mean(ac)
        stats["diversity"][pop] = allel.sequence_diversity(data.positions, ac)
        stats["wattersons_theta"][pop] = allel.watterson_theta(
            data.positions, ac)
        stats["tajimas_d"][pop] = allel.tajima_d(ac, data.positions)
        stats["observed_heterozygosity"][pop] = allel.heterozygosity_observed(
            data.genotypes[pop]).mean()
        stats["expected_heterozygosity"][pop] = allel.heterozygosity_expected(
            ac.to_frequencies(), ploidy=2).mean()
        # Stored per population, like every other statistic. Assigning to
        # stats["segregating_sites"] itself would replace the dict with a
        # single count and keep only the last population's value.
        stats["segregating_sites"][pop] = ac.count_segregating()

        if pop != "all_pops":  # all_pops has no monomorphic sites
            stats["monomorphic_sites"][pop] = ac.count_non_segregating()

            # Three way statistics
            # Dropping "all_pops" and the focal population leaves exactly
            # the two remaining populations, so this only applies to a
            # single real population.
            other_pops = [
                pop_name for pop_name in pop_names
                if pop_name not in ["all_pops", pop]
            ]
            t, b = allel.patterson_f3(ac,
                                      data.allele_counts[other_pops[0]],
                                      data.allele_counts[other_pops[1]])
            # Ratio of the summed numerator and denominator terms.
            stats["f3"][pop] = np.sum(t) / np.sum(b)

    # Two way statistics
    for comparison in comparisons:
        first, second = comparison.split("_")
        ac1 = data.allele_counts[first]
        ac2 = data.allele_counts[second]

        stats["divergence"][comparison] = allel.sequence_divergence(
            data.positions, ac1, ac2)
        num, den = allel.hudson_fst(ac1, ac2)
        # Ratio of sums over variants rather than a mean of per-variant ratios.
        stats["fst"][comparison] = np.sum(num) / np.sum(den)
        stats["f2"][comparison] = allel.patterson_f2(ac1, ac2).mean()

    return stats