def test_windowed_tajima_d(self):
    """Exercise ``windowed_tajima_d`` on a five-variant toy data set.

    Covers four situations: a window layout where D can be computed, windows
    holding too few sites, windows holding too few segregating sites, and the
    ``min_sites`` override.
    """
    from allel import windowed_tajima_d

    positions = np.array([1, 11, 21, 31, 41])

    # example with calculable value: both alleles are observed at every site
    counts = AlleleCountsArray([[1, 3], [2, 2], [3, 1], [1, 3], [2, 2]])
    wanted = np.full(3, 0.168)
    got, _, _ = windowed_tajima_d(positions, counts, size=25, step=10)
    assert_array_almost_equal(wanted, got, decimal=3)

    # too few sites: with size=15 every one of the four windows is NaN
    got, _, _ = windowed_tajima_d(positions, counts, size=15, step=10)
    assert len(got) == 4
    assert np.isnan(got).all()

    # too few segregating sites: the first and fourth variants are fixed
    counts = AlleleCountsArray([[4, 0], [2, 2], [3, 1], [4, 0], [2, 2]])
    got, _, _ = windowed_tajima_d(positions, counts, size=25, step=10)
    assert len(got) == 3
    assert np.isnan(got).all()

    # allow people to override if they really want to
    wanted = np.full(3, 0.592)
    got, _, _ = windowed_tajima_d(positions, counts, size=25, step=10,
                                  min_sites=2)
    assert_array_almost_equal(wanted, got, decimal=3)
def tajd(c, chrsize, ac_subpops, pos, pop2color, plot=False, blenw=1000,
         nwindow=100):
    """Compute block-jackknifed and windowed Tajima's D per subpopulation.

    Parameters
    ----------
    c
        Forwarded unchanged to ``div_plot`` (only used when `plot` is true).
    chrsize : int
        Upper bound (``stop``) of the windowed scan; also sets the window
        length as ``int(chrsize / nwindow)``.
    ac_subpops : mapping
        Subpopulation name -> allele counts array.
    pos : array
        Variant positions, indexed with the same boolean filter as the counts.
    pop2color
        Forwarded unchanged to ``div_plot``.
    plot : bool, optional
        When true, call ``div_plot`` with the collected results.
    blenw : int, optional
        Window size used for the values handed to ``jackknife``.
    nwindow : int, optional
        Number of windows the range ``1..chrsize`` is divided into.

    Returns
    -------
    dict
        Subpopulation name -> ``(d_m, d_se, (windowed_D, windows))`` where
        ``d_m`` and ``d_se`` are the first two values returned by
        ``jackknife`` (presumably mean and standard error -- confirm against
        that helper).
    """
    window_length = int(chrsize / nwindow)
    results = {}

    for pop_name, pop_counts in ac_subpops.items():
        # Keep segregating sites whose highest observed allele index is 1.
        keep = pop_counts.is_segregating() & (pop_counts.max_allele() == 1)
        print('TajD : retaining', np.count_nonzero(keep), 'SNPs')

        kept_pos = pos[keep]
        # Only the first two allele columns are retained after filtering.
        kept_counts = allel.AlleleCountsArray(
            pop_counts.compress(keep, axis=0)[:, :2])

        # Fixed-size blocks feed the jackknife estimate.
        block_stats = allel.windowed_tajima_d(kept_pos, kept_counts,
                                              size=blenw)
        d_m, d_se, *_ = jackknife(block_stats[0])

        # Scan over 1..chrsize with windows of ``window_length``.
        scan = allel.windowed_tajima_d(kept_pos, kept_counts,
                                       size=window_length, start=1,
                                       stop=chrsize)
        results[pop_name] = (d_m, d_se, (scan[0], scan[1]))

    if plot:
        div_plot(results, pop2color, list(ac_subpops.keys()), c, chrsize,
                 "Tajima's D")
    return results
def population_statistics(synthetic_population_code, synthetic_genotypes,
                          reference_genotypes, synthetic_positions,
                          reference_positions, reference_samples,
                          classification_map, window_size=2e5):
    """Plot windowed pi and Tajima's D for synthetic vs. reference genotypes.

    Two figures are written into ``FIGURES_DIR``:
    ``<code>.pi.png`` and ``<code>.tajima_d.png``.

    Parameters
    ----------
    synthetic_population_code
        Population label; selects the reference samples to compare against
        and is used in legend labels and output file names.
    synthetic_genotypes, reference_genotypes
        Genotype data accepted by ``allel.GenotypeArray``. The reference
        array is subset along its second axis by population label.
    synthetic_positions, reference_positions
        Variant positions matching the respective genotype arrays.
    reference_samples
        Iterable of sample identifiers, one per reference column.
    classification_map
        Looked up as ``classification_map.loc[sample]['population']``
        (presumably a pandas DataFrame indexed by sample -- confirm).
    window_size : number, optional
        Window size; coerced to ``int`` before use.
    """
    window_size = int(window_size)
    code = synthetic_population_code

    def save_comparison(title, y_label, synthetic_values, reference_values,
                        name_template):
        # Draw both series against 1-based window indices, save, then close.
        plt.title(title)
        plt.plot(np.arange(1, len(synthetic_values) + 1), synthetic_values,
                 label='Synthetic {}'.format(code))
        plt.plot(np.arange(1, len(reference_values) + 1), reference_values,
                 label='{}'.format(code))
        plt.xlabel('Windows ({}kb)'.format(window_size // 1000))
        plt.ylabel(y_label)
        plt.legend()
        plt.savefig(os.path.join(FIGURES_DIR, name_template.format(code)))
        plt.close(plt.gcf())

    # Restrict the reference panel to samples of the requested population.
    labels = np.array([
        classification_map.loc[sample]['population']
        for sample in reference_samples
    ])
    matching_reference = reference_genotypes[:, labels == code]

    synthetic_ac = allel.GenotypeArray(synthetic_genotypes).count_alleles()
    reference_ac = allel.GenotypeArray(matching_reference).count_alleles()

    synthetic_pi = allel.windowed_diversity(
        synthetic_positions, synthetic_ac, size=window_size)[0]
    reference_pi = allel.windowed_diversity(
        reference_positions, reference_ac, size=window_size)[0]
    save_comparison('Nucleotide Diversity Sliding Window Analysis',
                    'Nucleotide Diversity (π)',
                    synthetic_pi, reference_pi, '{}.pi.png')

    synthetic_d = allel.windowed_tajima_d(
        synthetic_positions, synthetic_ac, size=window_size)[0]
    reference_d = allel.windowed_tajima_d(
        reference_positions, reference_ac, size=window_size)[0]
    save_comparison("Tajima's D Sliding Window Analysis",
                    "Tajima's D",
                    synthetic_d, reference_d, '{}.tajima_d.png')
def tajimaD(pos, gt, win_size, length_bp):
    """Summarise windowed Tajima's D by its mean and standard deviation.

    Parameters
    ----------
    pos : array
        Variant positions; passed to ``get_seg`` together with `gt`.
    gt : array
        Genotypes; ``get_seg`` must return an object offering
        ``count_alleles()`` (presumably the segregating subset -- confirm
        against ``get_seg``).
    win_size : int
        Window size handed to ``allel.windowed_tajima_d`` as ``size``.
    length_bp : int
        Last position of the scan (``stop``); windows start at 1.

    Returns
    -------
    tajd_mean : float
        Mean of the per-window values, ignoring NaN windows.
    tajd_std : float
        Standard deviation of the per-window values, ignoring NaN windows.
    """
    seg_gt, seg_pos = get_seg(gt, pos)
    allele_counts = seg_gt.count_alleles()
    # Only the per-window D values are needed; windows and counts are dropped.
    window_d = allel.windowed_tajima_d(
        seg_pos, allele_counts, size=win_size, start=1, stop=length_bp)[0]
    return np.nanmean(window_d), np.nanstd(window_d)
size=winsize, start=start, stop=stop, step=int(winsize / 2)) new_dat = format_results(stat=thetaW, stat_name="thetaW", chrom=chrom, windows=windows, nvar=counts, pop=pop) df_list.append(new_dat) if 'tajD' in args.s: tajD, windows, counts = allel.windowed_tajima_d( pos, ac, size=winsize, start=start, stop=stop, step=int(winsize / 2)) new_dat = format_results(stat=tajD, stat_name="tajD", chrom=chrom, windows=windows, nvar=counts, pop=pop) df_list.append(new_dat) if 'dxy' in args.s and args.p2 != "None": dxy, windows, n_bases, counts = allel.windowed_divergence( pos, ac, ac2,
def win_pi_sims(path, neut_mut, n_pops, n_sims, T, win_size, L, N):
    """Compute windowed pi, Tajima's D, dxy and Fst for simulations; pickle them.

    For every entry of `T` and every simulation index, exactly one tree
    sequence file matching
    ``<path><T>N_sim_<i>_RAND_*[0-9]_overlaid.trees`` is loaded, allele
    counts are derived with ``ac_from_ts`` and the windowed statistics are
    stored in pre-allocated arrays. The arrays are then written to
    ``<path><base>_pis.pkl``, ``_tajd.pkl``, ``_div.pkl`` and ``_fst.pkl``.

    Parameters
    ----------
    path : str
        Directory prefix. Its last character is dropped to derive the base
        file name and file names are appended to it directly, so it is
        expected to end with a path separator.
    neut_mut
        Unused; kept for interface compatibility.
    n_pops : int
        Number of populations; all unordered pairs are compared.
    n_sims : int
        Number of simulations per entry of `T`.
    T : sequence
        Values whose ``str()`` form appears in the input file names.
    win_size : int
        Window size passed to the scikit-allel windowed functions.
    L : int
        Last position of the scan (``stop``); windows start at 1.
    N
        Forwarded to ``ac_from_ts``.

    Raises
    ------
    AssertionError
        If the glob for a simulation does not match exactly one file.
    """
    foname = os.path.basename(path[:-1])
    print(("Base filename:" + foname), flush=True)

    combs = list(itertools.combinations(np.arange(n_pops), 2))

    # NOTE(review): the arrays assume every windowed call returns exactly
    # int(L / win_size) windows, which presumably requires L to be a
    # multiple of win_size -- confirm against the callers.
    n_windows = int(L / win_size)
    pis = np.zeros((len(T), n_sims, n_pops, n_windows))
    div = np.zeros((len(T), n_sims, len(combs), n_windows))
    fst = np.zeros((len(T), n_sims, len(combs), n_windows))
    tajd = np.zeros((len(T), n_sims, n_pops, n_windows))

    for t, t_value in enumerate(T):
        for i in range(n_sims):
            files = glob(path + str(t_value) + "N_sim_" + str(i) +
                         "_RAND_*[0-9]_overlaid.trees")
            print(files)
            assert (len(files) == 1), str(
                len(files)) + " file(s) found with glob T: " + str(
                    t_value) + " sim:" + str(i)
            filename = files[0]
            print(filename)
            ts = pyslim.load(filename).simplify()

            # Per-population statistics.
            s1 = timer()
            acs, pos = ac_from_ts(ts, n_pops, N)
            for j in range(n_pops):
                pi, _, _, _ = allel.windowed_diversity(
                    pos, acs[j], size=win_size, start=1, stop=L)
                pis[t, i, j, :] = pi
                D, _, _ = allel.windowed_tajima_d(
                    pos, acs[j], size=win_size, start=1, stop=L)
                tajd[t, i, j, :] = D
            s2 = timer()
            print(("Calculating windowed Pi/TajD... Time elapsed (min):" +
                   str(round((s2 - s1) / 60, 3))), flush=True)

            # Pairwise statistics.
            s1 = timer()
            for k, (pop_a, pop_b) in enumerate(combs):
                dxy, _, _, _ = allel.windowed_divergence(
                    pos, acs[pop_a], acs[pop_b],
                    size=win_size, start=1, stop=L)
                div[t, i, k, :] = dxy
                fstat, _, _ = allel.windowed_hudson_fst(
                    pos, acs[pop_a], acs[pop_b],
                    size=win_size, start=1, stop=L)
                fst[t, i, k, :] = fstat
            s2 = timer()
            print(("Calculating windowed Dxy and Fst... Time elapsed (min):" +
                   str(round((s2 - s1) / 60, 3))), flush=True)

    # Started outside the loops so the save timing below is always defined,
    # even when T or n_sims is empty.
    s1 = timer()
    print((pis.shape), flush=True)
    print((tajd.shape), flush=True)
    print((div.shape), flush=True)

    # Context managers guarantee each file is closed even if dumping fails.
    for suffix, stat in (('_pis.pkl', pis), ('_tajd.pkl', tajd),
                         ('_div.pkl', div), ('_fst.pkl', fst)):
        with open(path + foname + suffix, 'wb') as output:
            pickle.dump(stat, output)

    # Debugging aid, intentionally disabled (was ``if (0):``). It hard-codes
    # time indices 0 and 9, so it needs len(T) >= 10 if re-enabled.
    if False:
        plt.subplot(2, 1, 1)
        plt.plot(np.transpose(pis[0, 0, :]), "-")
        plt.title('0N after split')
        plt.ylabel('Pi')
        plt.subplot(2, 1, 2)
        plt.plot(np.transpose(pis[9, 0, :]), "-")
        plt.title('10N after split')
        plt.xlabel('Window')
        plt.ylabel('Pi')
        plt.tight_layout()
        plt.savefig(path + foname + '_landscape.pdf')
        plt.close()

    s2 = timer()
    print(("Saving stats and plots to file... Time elapsed (min):" +
           str(round((s2 - s1) / 60, 3))), flush=True)