samples=metadata, numbers=numbers, ploidy=ploidy, qualflt=qualflt, missingfltprop=missingprop) #### Genome-wide statistics (seqDiv, Wattersons Theta, LD, inbreeding coefficient) #### seqdivdict = {} thetadict = {} coefdict = {} allcoef = defaultdict(list) for pop in metadata['treatment'].unique(): # Sequence diversity seqdivdict[pop] = allel.sequence_diversity(pos, acsubpops[pop]) # Wattersons theta thetadict[pop] = allel.watterson_theta(pos, acsubpops[pop]) # Inbreeding coefficient if ploidy > 1: gn = geno.take(subpops[pop], axis=1) coef = allel.moving_statistic( gn, statistic=allel.inbreeding_coefficient, size=1000, step=100) coef = np.nanmean(coef, axis=1) coefdict[pop] = np.mean(coef) allcoef[pop].append(np.array(coef))
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], '1': [ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39 ] } haps = np.array(ts1.genotype_matrix()) positions = np.array([s.position for s in ts1.sites()]) genotypes = allel.HaplotypeArray(haps).to_genotypes(ploidy=2) allele_counts = genotypes.count_alleles() subpop_allele_counts = genotypes.count_alleles_subpops(subpops=pops) genotype_allele_counts = genotypes.to_allele_counts() ##SNP stats segsites = np.shape(genotypes)[0] pi = allel.sequence_diversity(positions, allele_counts, start=1, stop=1e7) tajD = allel.tajima_d(ac=allele_counts, start=1, stop=1e7) thetaW = allel.watterson_theta(pos=positions, ac=allele_counts, start=1, stop=1e7) het_o = np.mean(allel.heterozygosity_observed(genotypes)) fst = allel.stats.fst.average_weir_cockerham_fst( genotypes, [[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], [ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39 ]], 100)[0] dxy = allel.stats.diversity.sequence_divergence(positions, subpop_allele_counts['0'],
# fst and dxy per gene between each comparison for comp1,comp2 in comparisons: name = comp1 + "_" + comp2 ac1 = acsubpops[comp1].compress(gene_bool, axis=0) ac2 = acsubpops[comp2].compress(gene_bool, axis=0) fst_per_comp[name], se_per_comp[name],_,_= allel.average_hudson_fst(ac1, ac2, blen=1) dxy_per_comp[name] = allel.sequence_divergence(pos[gene_bool], ac1, ac2) # tajimas d and sequence diversity per gene for each subpop(i.e treatment) for subpop in subpops: ac = acsubpops[subpop].compress(gene_bool) genepos = pos[gene_bool] tajd_per_pop[subpop] = allel.tajima_d(ac=ac, pos=genepos) gdiv_per_pop[subpop] = allel.sequence_diversity(ac=ac, pos=genepos) # pbs for each gene for each pbc comparison as defined in config.yaml if pbs is True: for pbscomp in pbscomps: pop1, pop2, outpop = pbscomp.split("_") pbs_per_comp[pbscomp],se,_,_ = rnaseqpop.meanPBS(acsubpops[pop1].compress(gene_bool, axis=0), acsubpops[pop2].compress(gene_bool, axis=0), acsubpops[outpop].compress(gene_bool, axis=0), window_size=1, normalise=True) # store inner dict in outer dicts fst_per_gene[ID] = dict(fst_per_comp) se_per_gene[ID] = dict(se_per_comp) if pbs is True : pbs_per_gene[ID] = dict(pbs_per_comp) tajd_per_gene[ID] = dict(tajd_per_pop)
# Boolean masks over the annotated effect type of each segregating variant.
varbool_syn = np.asarray([oc_effvars_seg_typ[i] == "synonymous_variant" for i,_ in enumerate(oc_effvars_seg_typ)])
# NOTE(review): despite its name this mask is "synonymous OR missense", so the
# "pi_n" figures below also include synonymous sites - confirm this is intended.
varbool_nosyn = np.logical_or(varbool_syn , np.asarray([oc_effvars_seg_typ[i] == "missense_variant" for i,_ in enumerate(oc_effvars_seg_typ)]))

# The variant selections do not depend on the cluster or on the jackknife
# replicate, so build the masks and the subsetted haplotypes/positions once
# instead of on every iteration of the inner loop.
j_mask_n = np.logical_and(varbool_loc, varbool_nosyn)
j_mask_s = np.logical_and(varbool_loc, varbool_syn)
j_hap_n = oc_haploty_hap_seg.subset(sel0=j_mask_n)
j_hap_s = oc_haploty_hap_seg.subset(sel0=j_mask_s)
j_pos_n = oc_hapvars_seg["POS"].subset(sel0=j_mask_n)
j_pos_s = oc_hapvars_seg["POS"].subset(sel0=j_mask_s)

for popi in ["cluster_103","cluster_231","cluster_233"]:

    # One leave-one-out replicate per member of the cluster.
    j_run = len(popdich_clu[popi])
    j_pi_s = np.zeros(shape=j_run)
    j_pi_n = np.zeros(shape=j_run)
    j_pi_n_s = np.zeros(shape=j_run)

    for i in range(j_run):

        # All members of the cluster except the i-th one.
        j_sel1 = popdich_clu[popi][0:i] + popdich_clu[popi][i+1:j_run]
        j_dic = {popi: j_sel1}

        # Allele counts of the reduced cluster for each variant class.
        j_dic_n_ac = j_hap_n.count_alleles_subpops(subpops=j_dic)
        j_dic_s_ac = j_hap_s.count_alleles_subpops(subpops=j_dic)

        # Sequence diversity for each class and their ratio for this replicate.
        j_pi_n[i] = allel.sequence_diversity(pos=j_pos_n, ac=j_dic_n_ac[popi])
        j_pi_s[i] = allel.sequence_diversity(pos=j_pos_s, ac=j_dic_s_ac[popi])
        j_pi_n_s[i] = j_pi_n[i] / j_pi_s[i]

    # Summarise the replicates of each statistic and report them.
    j_av,j_se,j_cl,j_cu,j_nu = mean_se_ci_report(j_pi_n)
    print("pi_n %s \t = %.3E +/- %.3E SE, %.3E-%.3E CI95, n=%i" % (popi, j_av, j_se, j_cl, j_cu, j_nu))
    j_av,j_se,j_cl,j_cu,j_nu = mean_se_ci_report(j_pi_s)
    print("pi_s %s \t = %.3E +/- %.3E SE, %.3E-%.3E CI95, n=%i" % (popi, j_av, j_se, j_cl, j_cu, j_nu))
    j_av,j_se,j_cl,j_cu,j_nu = mean_se_ci_report(j_pi_n_s)
    print("r_n_s %s\t = %.3E +/- %.3E SE, %.3E-%.3E CI95, n=%i" % (popi, j_av, j_se, j_cl, j_cu, j_nu))

# Can we phase additional mutations?
# If we find variants that are in high linkage disequilibrium with specimens that are 0 or 2 for the P4 mutation, we can use them to infer whether a particular haplotype has ZZB, dups, or indels.
# First, load info of extra mutations:
def traditional_stats(data):
    """
    Calculates lots of (mostly) traditional statistics,
    that are summaries of the site frequency spectrum.

    Arguments
    ---------
    data: Named tuple of results (made by collate_results function).
        This function reads ``data.positions`` and the per-population
        mappings ``data.allele_counts`` and ``data.genotypes``, keyed by
        "domestic", "wild", "captive" and "all_pops".

    Returns
    ---------
    Nested dictionary of statistics, ``stats[statistic][population]`` for the
    one- and three-way statistics and ``stats[statistic][comparison]`` for the
    two-way statistics ("domestic_wild", "domestic_captive", "wild_captive").
    The "roh_mean", "roh_iqr" and "r2" entries are created empty and are not
    filled in by this function.
    """
    pop_names = ["domestic", "wild", "captive", "all_pops"]

    stats = {
        "sfs_mean": {},
        "diversity": {},
        "wattersons_theta": {},
        "tajimas_d": {},
        "observed_heterozygosity": {},
        "expected_heterozygosity": {},
        "segregating_sites": {},
        "monomorphic_sites": {},
        "roh_mean": {},
        "roh_iqr": {},
        "r2": {},
        "f3": {},
        "divergence": {},
        "fst": {},
        "f2": {},
    }

    for pop in pop_names:
        ac = data.allele_counts[pop]

        # One way statistics
        stats["sfs_mean"][pop] = binned_sfs_mean(ac)
        stats["diversity"][pop] = allel.sequence_diversity(data.positions, ac)
        stats["wattersons_theta"][pop] = allel.watterson_theta(
            data.positions, ac)
        stats["tajimas_d"][pop] = allel.tajima_d(ac, data.positions)
        stats["observed_heterozygosity"][pop] = allel.heterozygosity_observed(
            data.genotypes[pop]).mean()
        stats["expected_heterozygosity"][pop] = allel.heterozygosity_expected(
            ac.to_frequencies(), ploidy=2).mean()
        # Stored per population; assigning to stats["segregating_sites"]
        # directly would replace the whole dict with a single count on every
        # iteration.
        stats["segregating_sites"][pop] = ac.count_segregating()

        if pop != "all_pops":  # all_pops has no monomorphic sites
            stats["monomorphic_sites"][pop] = ac.count_non_segregating()

            # Three way statistics: the current population against the two
            # remaining real populations.
            other_pops = [
                pop_name for pop_name in pop_names
                if pop_name not in ["all_pops", pop]
            ]
            t, b = allel.patterson_f3(ac,
                                      data.allele_counts[other_pops[0]],
                                      data.allele_counts[other_pops[1]])
            # Ratio of sums over all variants rather than a mean of ratios.
            stats["f3"][pop] = np.sum(t) / np.sum(b)

    # Two way statistics
    for comparison in ["domestic_wild", "domestic_captive", "wild_captive"]:
        p = comparison.split("_")
        ac_a = data.allele_counts[p[0]]
        ac_b = data.allele_counts[p[1]]

        stats["divergence"][comparison] = allel.sequence_divergence(
            data.positions, ac_a, ac_b)

        # Ratio of the summed numerators and denominators over all variants.
        num, den = allel.hudson_fst(ac_a, ac_b)
        stats["fst"][comparison] = np.sum(num) / np.sum(den)
        stats["f2"][comparison] = allel.patterson_f2(ac_a, ac_b).mean()

    return stats