def pi(c, chrsize, ac_subpops, pos, pop2color, plot=False, blenw=1000, nwindow=100):
    """Summarise nucleotide diversity (pi) for each subpopulation.

    For every entry of ``ac_subpops`` the variants are restricted to
    segregating sites whose highest observed allele index is 1, and pi is
    computed twice with ``allel.windowed_diversity``:

    * in blocks of ``blenw`` bp, whose values are handed to ``jackknife``;
    * in windows of ``int(chrsize / nwindow)`` bp covering 1..``chrsize``.

    Parameters
    ----------
    c
        Forwarded unchanged to ``div_plot`` (presumably the chromosome
        label -- confirm against ``div_plot``).
    chrsize
        Stop coordinate of the windowed scan; also sets the window length.
    ac_subpops : dict
        Maps a subpopulation key to its allele counts array.
    pos
        Variant positions; indexed with the per-population boolean filter.
    pop2color
        Forwarded unchanged to ``div_plot``.
    plot : bool
        When true, ``div_plot`` is called with the collected results.
    blenw : int
        Block size (bp) of the scan that feeds ``jackknife``.
    nwindow : int
        Number of windows the range 1..``chrsize`` is divided into.

    Returns
    -------
    dict
        ``key -> (m, se, (windowed_pi, windows))`` where ``m`` and ``se`` are
        the first two values returned by ``jackknife`` (named mean and
        standard error in the original code).
    """
    window_len = int(chrsize / nwindow)
    results = {}

    for pop, counts in ac_subpops.items():
        keep = counts.is_segregating() & (counts.max_allele() == 1)
        print('PI : retaining', np.count_nonzero(keep), 'SNPs')

        kept_pos = pos[keep]
        # Only the first two allele columns are kept after filtering.
        biallelic = allel.AlleleCountsArray(
            counts.compress(keep, axis=0)[:, :2])

        # Block-wise pi, summarised by the jackknife helper.
        block_scan = allel.windowed_diversity(kept_pos, biallelic, size=blenw)
        pi_mean, pi_se, *_ = jackknife(block_scan[0])

        # Fixed number of windows across the whole chromosome.
        chrom_scan = allel.windowed_diversity(
            kept_pos, biallelic, size=window_len, start=1, stop=chrsize)

        results[pop] = (pi_mean, pi_se, (chrom_scan[0], chrom_scan[1]))

    if plot:
        div_plot(results, pop2color, list(ac_subpops), c, chrsize, "pi")
    return results
def _plot_windowed_statistic(synthetic_values, reference_values,
                             population_code, window_size, title, ylabel,
                             filename):
    """Plot one windowed statistic for both samples and save it in FIGURES_DIR."""
    plt.title(title)
    plt.plot(np.arange(1, len(synthetic_values) + 1), synthetic_values,
             label='Synthetic {}'.format(population_code))
    plt.plot(np.arange(1, len(reference_values) + 1), reference_values,
             label='{}'.format(population_code))
    plt.xlabel('Windows ({}kb)'.format(window_size // 1000))
    plt.ylabel(ylabel)
    plt.legend()
    plt.savefig(os.path.join(FIGURES_DIR, filename))
    plt.close(plt.gcf())


def population_statistics(synthetic_population_code, synthetic_genotypes,
                          reference_genotypes, synthetic_positions,
                          reference_positions, reference_samples,
                          classification_map, window_size=2e5):
    """Compare windowed pi and Tajima's D of a synthetic and a reference population.

    The reference genotypes are restricted to the samples whose
    ``'population'`` entry in ``classification_map`` equals
    ``synthetic_population_code``. Nucleotide diversity and Tajima's D are
    then computed in non-overlapping windows for both data sets and written
    as ``<code>.pi.png`` and ``<code>.tajima_d.png`` inside ``FIGURES_DIR``.

    Both data sets are scanned over the same window boundaries, spanning the
    smallest to the largest position found in either position array. Without
    explicit bounds each scan would start at its own first variant, and the
    curves, which are plotted against the window index, would not describe
    the same genomic intervals.

    Parameters
    ----------
    synthetic_population_code
        Population label; used to select reference samples, in the plot
        legends and in the output file names.
    synthetic_genotypes
        Genotype data accepted by ``allel.GenotypeArray``.
    reference_genotypes
        Genotype data whose second axis is aligned with
        ``reference_samples``.
    synthetic_positions, reference_positions
        Variant positions of the respective genotype data.
    reference_samples
        Sample identifiers, looked up with ``classification_map.loc``.
    classification_map
        Table indexed by sample with a ``'population'`` entry.
    window_size : number
        Window size in bp; converted to ``int``.

    Returns
    -------
    None
        The function only writes the two figures.
    """
    window_size = int(window_size)

    reference_population_labels = np.array(
        [classification_map.loc[sample]['population']
         for sample in reference_samples])
    original_reference_genotypes = reference_genotypes[
        :, reference_population_labels == synthetic_population_code]

    synthetic_allele_counts = allel.GenotypeArray(
        synthetic_genotypes).count_alleles()
    reference_allele_counts = allel.GenotypeArray(
        original_reference_genotypes).count_alleles()

    # Shared window grid so that window i means the same interval in both scans.
    window_start = int(min(np.min(synthetic_positions),
                           np.min(reference_positions)))
    window_stop = int(max(np.max(synthetic_positions),
                          np.max(reference_positions)))
    window_kwargs = dict(size=window_size, start=window_start,
                         stop=window_stop)

    synthetic_pi, _, _, _ = allel.windowed_diversity(
        synthetic_positions, synthetic_allele_counts, **window_kwargs)
    reference_pi, _, _, _ = allel.windowed_diversity(
        reference_positions, reference_allele_counts, **window_kwargs)
    _plot_windowed_statistic(
        synthetic_pi, reference_pi, synthetic_population_code, window_size,
        title='Nucleotide Diversity Sliding Window Analysis',
        ylabel='Nucleotide Diversity (π)',
        filename='{}.pi.png'.format(synthetic_population_code))

    synthetic_D, _, _ = allel.windowed_tajima_d(
        synthetic_positions, synthetic_allele_counts, **window_kwargs)
    reference_D, _, _ = allel.windowed_tajima_d(
        reference_positions, reference_allele_counts, **window_kwargs)
    _plot_windowed_statistic(
        synthetic_D, reference_D, synthetic_population_code, window_size,
        title='Tajima\'s D Sliding Window Analysis',
        ylabel='Tajima\'s D',
        filename='{}.tajima_d.png'.format(synthetic_population_code))
def test_masked_windowed_diversity(self):
    """Size-10 windows with a masked second half must match every other size-5 window."""
    # Four haplotypes give six pairwise comparisons per site.
    haplotypes = allel.HaplotypeArray([[0, 0, 0, 0],
                                       [0, 0, 0, 1],
                                       [0, 0, 1, 1],
                                       [0, 1, 1, 1],
                                       [1, 1, 1, 1],
                                       [0, 0, 1, 2],
                                       [0, 1, 1, 2],
                                       [0, 1, -1, -1],
                                       [-1, -1, -1, -1]])
    allele_counts = haplotypes.count_alleles()
    # Per-site mean pairwise diversity:
    # [0, 3/6, 4/6, 3/6, 0, 5/6, 5/6, 1, -1]
    positions = SortedIndex([2, 4, 7, 14, 15, 18, 19, 25, 27])

    # Five True followed by five False, repeated three times.
    accessible = np.tile(np.repeat(np.array([True, False]), 5), 3)

    # Reference values: unmasked size-5 windows, keeping every other one.
    size5_pi, _windows, _n_bases, _counts = allel.windowed_diversity(
        positions, allele_counts, size=5, start=1, stop=31)
    expected = size5_pi[::2]

    # Under test: size-10 windows scanned with the accessibility mask.
    observed, _windows, _n_bases, _counts = allel.windowed_diversity(
        positions, allele_counts, size=10, start=1, stop=31,
        is_accessible=accessible)

    assert_array_almost_equal(expected, observed)
def pi_window(pos, gt, win_size, length_bp):
    """Return the mean and standard deviation of windowed nucleotide diversity.

    The genotypes and positions are first passed through ``get_seg``
    (presumably a segregating-site filter -- confirm against its
    definition). Pi is then computed in windows of ``win_size`` bp from
    position 1 to ``length_bp``.

    Parameters
    ----------
    pos
        Variant positions matching ``gt``.
    gt
        Genotype container accepted by ``get_seg``; the filtered result must
        provide ``count_alleles()``.
    win_size
        Window size handed to ``allel.windowed_diversity``.
    length_bp
        Stop coordinate of the scan.

    Returns
    -------
    pi_mean
        ``np.nanmean`` of the per-window pi values.
    pi_std
        ``np.nanstd`` of the per-window pi values.
    """
    seg_gt, seg_pos = get_seg(gt, pos)
    windowed_pi = allel.windowed_diversity(
        seg_pos, seg_gt.count_alleles(),
        size=win_size, start=1, stop=length_bp)[0]
    return np.nanmean(windowed_pi), np.nanstd(windowed_pi)
def pi_fx(pos, gt, win_size, length_bp):
    """Return per-window nucleotide diversity for one population.

    Allele counts are taken from ``gt.count_alleles()`` and scanned with
    ``allel.windowed_diversity`` in windows of ``win_size`` from position 1
    to ``length_bp``. Only the first element of the result (the pi values)
    is returned.
    """
    allele_counts = gt.count_alleles()
    windowed_pi, *_ = allel.windowed_diversity(
        pos, allele_counts, size=win_size, start=1, stop=length_bp)
    return windowed_pi
def test_fully_masked_windowed_diversty(self):
    """The first window must report NaN when its mask entries are all False."""
    counts = allel.AlleleCountsArray(
        np.array([[5, 5],
                  [5, 5],
                  [1, 9],
                  [1, 9]]))
    positions = np.array([1, 2, 3, 4])
    # The first two mask entries are False, the last two True.
    accessible = np.array([False, False, True, True])

    diversity, _windows, _n_bases, _counts = allel.windowed_diversity(
        positions, counts, size=2, start=1, stop=5,
        is_accessible=accessible)

    first_window = diversity[0]
    self.assertTrue(np.isnan(first_window))
def test_windowed_diversity(self):
    """Check windowed pi against hand-computed values for three windows."""
    # Four haplotypes give six pairwise comparisons per site.
    haplotypes = HaplotypeArray([[0, 0, 0, 0],
                                 [0, 0, 0, 1],
                                 [0, 0, 1, 1],
                                 [0, 1, 1, 1],
                                 [1, 1, 1, 1],
                                 [0, 0, 1, 2],
                                 [0, 1, 1, 2],
                                 [0, 1, -1, -1],
                                 [-1, -1, -1, -1]])
    allele_counts = haplotypes.count_alleles()
    # Per-site mean pairwise diversity:
    # [0, 3/6, 4/6, 3/6, 0, 5/6, 5/6, 1, -1]
    positions = SortedIndex([2, 4, 7, 14, 15, 18, 19, 25, 27])

    # Per-window sums of the per-site values divided by the window length:
    # sites 2, 4, 7 -> 7/6 over 10 bp; sites 14, 15, 18, 19 -> 13/6 over
    # 10 bp; sites 25, 27 -> 1 over the final 11 bp.
    expected = [(7 / 6) / 10, (13 / 6) / 10, 1 / 11]

    observed, _windows, _n_bases, _counts = allel.windowed_diversity(
        positions, allele_counts, size=10, start=1, stop=31)

    assert_array_almost_equal(expected, observed)
def pltPi(chromlist, window_size=10):
    """Plot windowed nucleotide diversity (pi) along each chromosome.

    For every chromosome label in ``chromlist`` the file
    ``PNG.phased.autosomal.recode.<c>.h5`` is opened, allele counts are taken
    from ``calldata/GT``, and pi is computed with
    ``allel.windowed_diversity`` in windows of ``window_size`` bp. The values
    are plotted at each window's midpoint and saved as ``PNG.<c>.pi.pdf``.

    The previous body plotted an undefined ``h12_pos`` and labelled and saved
    the figure as "H12", while the pi values it computed were never used.
    The function now plots what its name says.

    Parameters
    ----------
    chromlist : iterable
        Chromosome labels substituted into the input and output file names.
    window_size : int, optional
        Window size in bp. Defaults to 10, the value that was hard-coded.

    Returns
    -------
    None
        One PDF is written per chromosome.
    """
    for c in chromlist:
        # Read everything needed while the file is open; the context manager
        # closes the handle even if the computation fails.
        # For X-chromosome data use "PNG.phased.X.recode.{}.h5" instead.
        with h5py.File("PNG.phased.autosomal.recode.{}.h5".format(c),
                       mode='r') as callset:
            g = allel.GenotypeChunkedArray(callset["calldata/GT"])
            pos = allel.SortedIndex(callset["variants/POS"][:])
            acc = g.count_alleles()
            pi_values, windows, _n_bases, _counts = allel.windowed_diversity(
                pos, acc, size=window_size)

        # One x coordinate per window: the centre of its [start, stop] range.
        window_mid = np.asarray(windows).mean(axis=1)

        plt.plot(window_mid, pi_values)
        plt.xlabel("{} genomic position".format(c))
        plt.ylabel("pi")
        plt.savefig("PNG.{}.pi.pdf".format(c))
        plt.clf()
biallelic = get_biallelic( args.z, chrom, np.concatenate((loc_samples, loc2_samples), axis=0)) ac = ac.compress(biallelic, axis=0)[:, :2] ac2 = ac2.compress(biallelic, axis=0)[:, :2] pos = pos.get_mask_selection(biallelic) else: biallelic = get_biallelic(args.z, chrom, loc_samples) ac = ac.compress(biallelic, axis=0)[:, :2] pos = pos.get_mask_selection(biallelic) if 'pi' in args.s: pi, windows, n_bases, counts = allel.windowed_diversity( pos, ac, size=winsize, start=start, stop=stop, step=int(winsize / 2)) new_dat = format_results(stat=pi, stat_name="pi", chrom=chrom, windows=windows, nvar=counts, pop=pop) df_list.append(new_dat) if 'thetaW' in args.s: thetaW, windows, n_bases, counts = allel.windowed_watterson_theta( pos, ac, size=winsize,
## Data preprocessing ## non_het_variants = gt.count_het(axis=1) == 0 gt_clean = gt[ non_het_variants ] ## filter out sites with heterozygous calls, because TB is haploid gt_clean = gt_clean.haploidify_samples() ## Convert to haploid calls pos = callset["variants/POS"][non_het_variants] ## The retained variant positions #######Plot 1: nucleotide diversity######### allele_counts = gt_clean.count_alleles() window_size = int((pos[-1] - pos[0]) / 100) # We want about 100 windows pi, windows, n_bases, n_counts = allel.windowed_diversity( pos, allele_counts, size=window_size, start=pos[0], stop=pos[-1] ) plot_windowed_pi(pi, windows) ######Plot 2: site frequency spectrum######## ## Filtering : only keep biallelic variants (at most two alleles segregate in the set of samples) max_2_alleles = [sum(row != 0) <= 2 for row in allele_counts] filtered_ac = allele_counts[max_2_alleles] bi_counts = allel.AlleleCountsArray(np.ndarray((filtered_ac.shape[0], 2), dtype=int)) index = 0 for row in filtered_ac: picked = [i for i in row if i != 0] assert len(picked) <= 2 if len(picked) == 1: picked = picked + [0]
def win_pi_sims(path, neut_mut, n_pops, n_sims, T, win_size, L, N):
    """Compute windowed pi, Tajima's D, Dxy and Fst for a grid of simulations.

    For every time point in ``T`` and every simulation index below
    ``n_sims``, exactly one tree-sequence file matching
    ``<path><T>N_sim_<i>_RAND_*[0-9]_overlaid.trees`` is loaded. Allele
    counts per population come from ``ac_from_ts``, and the statistics are
    computed with scikit-allel in windows of ``win_size`` from 1 to ``L``.
    The four result arrays are pickled to ``<path><foname>_pis.pkl``,
    ``_tajd.pkl``, ``_div.pkl`` and ``_fst.pkl``, where ``foname`` is
    ``os.path.basename(path[:-1])``.

    Parameters
    ----------
    path : str
        Prefix for both input and output files. The last character is
        dropped before taking the basename, so it presumably ends with a
        path separator -- confirm against the caller.
    neut_mut
        Not used by this function; kept for interface compatibility.
    n_pops : int
        Number of populations; pairwise statistics use all 2-combinations.
    n_sims : int
        Number of simulation replicates per time point.
    T : sequence
        Time-point labels substituted into the file names.
    win_size, L
        Window size and stop coordinate of every windowed scan.
    N
        Forwarded to ``ac_from_ts``.

    Returns
    -------
    None
        Results are written to disk only.

    Raises
    ------
    AssertionError
        If the glob for a (time point, simulation) pair does not match
        exactly one file.
    """
    foname = os.path.basename(path[:-1])
    print(("Base filename:" + foname), flush=True)

    combs = list(itertools.combinations(np.arange(n_pops), 2))

    # NOTE(review): the arrays assume every windowed scan returns exactly
    # int(L / win_size) windows; otherwise the slice assignments below raise.
    n_windows = int(L / win_size)
    pis = np.zeros((len(T), n_sims, n_pops, n_windows))
    div = np.zeros((len(T), n_sims, len(combs), n_windows))
    fst = np.zeros((len(T), n_sims, len(combs), n_windows))
    tajd = np.zeros((len(T), n_sims, n_pops, n_windows))

    for t, t_label in enumerate(T):
        for i in range(n_sims):
            files = glob(path + str(t_label) + "N_sim_" + str(i) +
                         "_RAND_*[0-9]_overlaid.trees")
            print(files)
            assert (len(files) == 1), str(
                len(files)) + " file(s) found with glob T: " + str(
                    t_label) + " sim:" + str(i)
            filename = files[0]
            print(filename)
            ts = pyslim.load(filename).simplify()

            # Per-population statistics.
            s1 = timer()
            acs, pos = ac_from_ts(ts, n_pops, N)
            for j in range(n_pops):
                pi, windows, n_bases, counts = allel.windowed_diversity(
                    pos, acs[j], size=win_size, start=1, stop=L)
                pis[t, i, j, :] = pi
                D, windows, counts = allel.windowed_tajima_d(
                    pos, acs[j], size=win_size, start=1, stop=L)
                tajd[t, i, j, :] = D
            s2 = timer()
            print(("Calculating windowed Pi/TajD... Time elapsed (min):" +
                   str(round((s2 - s1) / 60, 3))), flush=True)

            # Pairwise statistics for every population pair.
            s1 = timer()
            for k, (pop_a, pop_b) in enumerate(combs):
                dxy, windows, n_bases, counts = allel.windowed_divergence(
                    pos, acs[pop_a], acs[pop_b],
                    size=win_size, start=1, stop=L)
                div[t, i, k, :] = dxy
                fstat, windows, counts = allel.windowed_hudson_fst(
                    pos, acs[pop_a], acs[pop_b],
                    size=win_size, start=1, stop=L)
                fst[t, i, k, :] = fstat
            s2 = timer()
            print(("Calculating windowed Dxy and Fst... Time elapsed (min):" +
                   str(round((s2 - s1) / 60, 3))), flush=True)

    s1 = timer()
    print((pis.shape), flush=True)
    print((tajd.shape), flush=True)
    print((div.shape), flush=True)

    # Context managers guarantee each handle is closed even if dumping fails.
    for suffix, stat in (('_pis.pkl', pis), ('_tajd.pkl', tajd),
                         ('_div.pkl', div), ('_fst.pkl', fst)):
        with open(path + foname + suffix, 'wb') as output:
            pickle.dump(stat, output)

    # Disabled diagnostic plot, kept from the original. It hard-codes time
    # index 9, so enabling it requires at least ten entries in T.
    if (0):
        plt.subplot(2, 1, 1)
        plt.plot(np.transpose(pis[0, 0, :]), "-")
        plt.title('0N after split')
        plt.ylabel('Pi')
        plt.subplot(2, 1, 2)
        plt.plot(np.transpose(pis[9, 0, :]), "-")
        plt.title('10N after split')
        plt.xlabel('Window')
        plt.ylabel('Pi')
        plt.tight_layout()
        plt.savefig(path + foname + '_landscape.pdf')
        plt.close()

    s2 = timer()
    print(("Saving stats and plots to file... Time elapsed (min):" +
           str(round((s2 - s1) / 60, 3))), flush=True)