def test_divergence__windowed_scikit_allel_comparison(sample_size, n_cohorts, chunks):
    """Compare windowed ``divergence`` against a scikit-allel moving statistic.

    The dataset is simulated, split into ``n_cohorts`` cohorts and windowed
    in blocks of 25 variants. Only the off-diagonal (between-cohort) entries
    are compared, so the diagonal of every window is replaced with NaN.
    """
    ts = simulate_ts(sample_size, length=200)
    ds = ts_to_dataset(ts, chunks)  # type: ignore[no-untyped-call]
    ds, _ = add_cohorts(ds, ts, n_cohorts)  # type: ignore[no-untyped-call]
    ds = window(ds, size=25)
    ds = divergence(ds)
    div = ds["stat_divergence"].values
    # test off-diagonal entries, by replacing diagonal with NaNs
    # (index with n_cohorts rather than a hard-coded 2 so the whole diagonal
    # is masked whatever the parametrisation)
    cohort_idx = np.arange(n_cohorts)
    div[:, cohort_idx, cohort_idx] = np.nan

    # Calculate divergence using scikit-allel moving_statistic
    # (Don't use windowed_divergence, since it treats the last window differently)
    ds1 = count_variant_alleles(
        ts_to_dataset(ts, samples=ts.samples()[:1])  # type: ignore[no-untyped-call]
    )
    ds2 = count_variant_alleles(
        ts_to_dataset(ts, samples=ts.samples()[1:])  # type: ignore[no-untyped-call]
    )
    ac1 = ds1["variant_allele_count"].values
    ac2 = ds2["variant_allele_count"].values
    mpd = allel.mean_pairwise_difference_between(ac1, ac2, fill=0)
    ska_div = allel.moving_statistic(mpd, np.sum, size=25)
    # TODO: investigate why numbers are different
    np.testing.assert_allclose(
        div[:-1], ska_div
    )  # scikit-allel has final window missing
def test_observed_heterozygosity__scikit_allel_comparison(
    n_variant, n_sample, missing_pct, window_size, seed
):
    """Compare windowed ``observed_heterozygosity`` with scikit-allel.

    All samples are placed in a single cohort (cohort 0). The scikit-allel
    reference is a moving sum, over ``window_size`` variants, of the
    per-variant observed heterozygosity.
    """
    ds = simulate_genotype_call_dataset(
        n_variant=n_variant,
        n_sample=n_sample,
        n_ploidy=2,
        missing_pct=missing_pct,
        seed=seed,
    )
    ds["sample_cohort"] = (
        ["samples"],
        np.zeros(n_sample, int),
    )
    ds = window(ds, size=window_size)
    ho_sg = observed_heterozygosity(ds)["stat_observed_heterozygosity"].values
    # The reference below is windowed over per-variant values, so whether a
    # ragged final window exists depends on the number of variants, not the
    # number of samples.
    if n_variant % window_size:
        # scikit-allel will drop the ragged end
        ho_sg = ho_sg[0:-1]
    # calculate with scikit-allel
    ho_sa = allel.moving_statistic(
        allel.heterozygosity_observed(ds["call_genotype"]),
        np.sum,
        size=window_size,
    )
    # add cohort dimension to scikit-allel result
    np.testing.assert_almost_equal(ho_sg, ho_sa[..., None])
def test_diversity__windowed(sample_size):
    """Check windowed ``diversity`` against both tskit and scikit-allel.

    Windows hold a fixed number of variants (25); the first cohort
    (``co_0``) is the one compared.
    """
    ts = simulate_ts(sample_size, length=200)
    dataset = ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    dataset, _subsets = add_cohorts(
        dataset, ts, cohort_key_names=["cohorts"]
    )  # type: ignore[no-untyped-call]
    dataset = diversity(window(dataset, size=25))
    div = dataset["stat_diversity"].sel(cohorts="co_0").compute()

    # Reference 1: tskit, with window edges placed at every 25th site so
    # that each window contains a fixed number of variants.
    site_positions = ts.tables.sites.position
    inner_edges = site_positions[::25][1:]
    window_edges = np.concatenate(([0], inner_edges, [ts.sequence_length]))
    expected_tskit = ts.diversity(windows=window_edges, span_normalise=False)
    np.testing.assert_allclose(div, expected_tskit)

    # Reference 2: scikit-allel moving_statistic
    # (windowed_diversity is avoided because it treats the last window
    # differently)
    counts_ds = count_variant_alleles(
        ts_to_dataset(ts)  # type: ignore[no-untyped-call]
    )
    allele_counts = counts_ds["variant_allele_count"].values
    per_variant = allel.mean_pairwise_difference(allele_counts, fill=0)
    expected_allel = allel.moving_statistic(per_variant, np.sum, size=25)
    # scikit-allel has the final window missing, so drop ours as well
    np.testing.assert_allclose(div[:-1], expected_allel)
def test_moving_statistic_1d(length, chunks, size, step, dtype):
    """Moving sum over a 1-D dask array must match scikit-allel's result."""
    values = da.from_array(np.arange(length, dtype=dtype), chunks=chunks)
    result = moving_statistic(
        values, np.sum, size=size, step=step, dtype=values.dtype
    ).compute()

    windows_tile_exactly = length % size == 0 and size == step
    if not windows_tile_exactly:
        # scikit-allel misses final window in this case
        result = result[:-1]
    assert result.dtype == dtype

    expected = allel.moving_statistic(
        np.arange(length), np.sum, size=size, step=step
    )
    np.testing.assert_equal(result, expected)
def test_moving_statistic_2d(length, chunks, size, step, dtype):
    """Column-wise moving sum over a 2-D dask array must match scikit-allel."""
    source = np.arange(length * 3, dtype=dtype).reshape(length, 3)

    def sum_cols(block):
        # Reduce over the window (row) axis, keeping the three columns.
        return np.sum(block, axis=0)

    values = da.from_array(source, chunks=chunks)
    result = moving_statistic(
        values, sum_cols, size=size, step=step, dtype=values.dtype
    ).compute()

    windows_tile_exactly = length % size == 0 and size == step
    if not windows_tile_exactly:
        # scikit-allel misses final window in this case
        result = result[:-1]
    assert result.dtype == dtype

    expected = allel.moving_statistic(source, sum_cols, size=size, step=step)
    np.testing.assert_equal(result, expected)
qualflt=qualflt, missingfltprop=missingprop)
# NOTE(review): the line above is the tail of a call that starts outside this
# fragment; it is left untouched.

#### Fst in windows ####
# For every (sus, res) pair in `comparisons`, compute Hudson's Fst between the
# two cohorts' allele counts in moving windows, once per configured window
# scheme (name, size, step), and hand the values to rnaseqpop.plotWindowed.
# NOTE(review): `chrom`, `pos` and `acsubpops` are defined outside this
# fragment -- presumably by an enclosing per-chromosome loop; confirm.
for sus, res in comparisons:
    name = sus + "_" + res
    cohortText = f"{sus} v {res}"
    print(f"Calculating Fst values in sliding windows for {name}\n")
    for wname, size, step in zip(windownames, windowsizes, windowsteps):
        FstArray = allel.moving_hudson_fst(acsubpops[sus], acsubpops[res],
                                           size=size, step=step)
        # Median position of the variants in each window, computed with the
        # same size/step as the Fst windows so the two arrays line up.
        midpoint = allel.moving_statistic(pos, np.median, size=size, step=step)
        cohortNoSpaceText = name + "." + wname
        rnaseqpop.plotWindowed(
            statName="Fst", cohortText=cohortText,
            cohortNoSpaceText=cohortNoSpaceText, values=FstArray,
            midpoints=midpoint, colour='dodgerblue',
            prefix="results/variantAnalysis/selection/fst", chrom=chrom,
            ylim=0.5, save=True)
def selective_sweep(chroms, pop, samples, haplo=True, plot=False, inaccessible=False):
    """
    Function to calculate H12 statistic across chromosome for given population.
    Currently not standardised or normalised.

    For each chromosome in ``chroms`` the genotype and position zarr arrays
    are opened read-only, the haplotypes of the samples whose
    ``samples.population`` equals ``pop`` are extracted, and the Garud H
    statistics are computed in moving windows of 1000 SNPs. The mean H12 is
    printed for every chromosome.

    Parameters
    ----------
    chroms : iterable of str
        Chromosome names; each is used as a directory name in the zarr paths.
    pop : str
        Population label compared against ``samples.population``.
    samples : object with a ``population`` attribute
        Sample metadata (presumably a pandas DataFrame -- confirm).
    haplo : bool
        If ``True`` the haplotype array is included in the return value.
    plot : bool
        If ``True`` a line plot of H12 is written to
        ``../data/{pop}/{chrom}/{pop}_{chrom}_H12_scatter.png``.
    inaccessible : bool
        ``False`` reads the ``.pass`` store under ``/home/sanj``; any other
        value reads the store under ``/media/sanj``.

    Returns
    -------
    tuple
        ``(pop_haplo, h12, rounded_median_pos, positions)`` when ``haplo`` is
        ``True``, otherwise ``(h12, rounded_median_pos, positions)``.

    NOTE(review): the return sits after the chromosome loop, so with several
    chromosomes only the last one's arrays are returned -- confirm this
    matches the intended layout.
    """
    # Identity comparison kept on purpose: only a literal ``False`` selects
    # the first store, exactly as before.
    if inaccessible is False:
        store_root = "/home/sanj/ag1000g/data/ag1000g.phase2.ar1.pass"
    else:
        store_root = "/media/sanj/Sanj_HDD/Ag1000g/ag1000g.phase2.ar1"

    for chrom in chroms:
        ############ Read zarrs #############
        Ag_store = zarr.open_array(f"{store_root}/{chrom}/calldata/GT/", mode='r')
        positions = zarr.open_array(f"{store_root}/{chrom}/variants/POS", mode='r')[:]

        print("--------------------------------------------------")
        print(f"Zarrs loaded: {pop}, Chromosome {chrom}")

        ############ Load intro gen.array and compute statistics ###########
        ag_geno = allel.GenotypeChunkedArray(Ag_store)
        pop_bool = samples.population == pop

        print("Constructing HaplotypeArray")
        pop_geno = ag_geno.compress(pop_bool, axis=1)
        pop_haplo = pop_geno.to_haplotypes()

        print("Computing statistics")
        h1, h12, h123, h2_h1 = allel.moving_garud_h(pop_haplo, size=1000)
        # Same window size as the H statistics so positions and values align.
        median_pos = allel.moving_statistic(positions, np.median, size=1000)

        print(f"mean {chrom} h12", np.mean(h12))

        if plot is True:
            print("Producing figure")
            sns.set_palette("muted")
            xtick = np.arange(0, median_pos.max(), 1000000)
            plt.figure(figsize=(30, 10))
            # x/y passed by keyword: positional data arguments are rejected by
            # recent seaborn releases.
            sns.lineplot(x=median_pos, y=h12).set_title(
                f'{pop} {chrom} H12 in 1000 snp windows')
            plt.xticks(xtick)
            plt.savefig(f"../data/{pop}/{chrom}/{pop}_{chrom}_H12_scatter.png",
                        dpi=800)
            # Actually call close(); the bare attribute access never released
            # the figure.
            plt.close()

    if haplo is True:
        return (pop_haplo, h12, np.around(median_pos), positions)
    else:
        return (h12, np.around(median_pos), positions)
# Per-population summary statistics: sequence diversity, Watterson's theta
# and (for ploidy > 1) a windowed inbreeding coefficient.
# NOTE(review): `chrom`, `pos`, `geno`, `subpops`, `acsubpops`, `ploidy` and
# the *dict / *dictchrom containers are defined outside this fragment --
# presumably by an enclosing per-chromosome loop; confirm.
allcoef = defaultdict(list)
for pop in metadata['treatment'].unique():
    # Sequence diversity
    seqdivdict[pop] = allel.sequence_diversity(pos, acsubpops[pop])
    # Wattersons theta
    thetadict[pop] = allel.watterson_theta(pos, acsubpops[pop])
    # Inbreeding coefficient
    if ploidy > 1:
        # Genotypes restricted to this population's sample indices.
        gn = geno.take(subpops[pop], axis=1)
        # Inbreeding coefficient in windows of 1000 variants, stepping by 100.
        coef = allel.moving_statistic(
            gn, statistic=allel.inbreeding_coefficient, size=1000, step=100)
        # NaN-ignoring mean within each window, then the mean over windows.
        coef = np.nanmean(coef, axis=1)
        coefdict[pop] = np.mean(coef)
        allcoef[pop].append(np.array(coef))
    print(f"{pop} | {chrom} | Nucleotide Diversity (Pi) =", seqdivdict[pop])
    print(f"{pop} | {chrom} | Wattersons Theta =", thetadict[pop])
    if ploidy > 1:
        print(f"{pop} | {chrom} | Inbreeding Coef =", np.mean(coef), "\n")
# Store a copy of the per-population results under the current chromosome.
seqdivdictchrom[chrom] = dict(seqdivdict)
thetadictchrom[chrom] = dict(thetadict)
if ploidy > 1:
    coefdictchrom[chrom] = dict(coefdict)
def _style_d_axis(ax, xlim, title=None):
    """Apply the shared D-statistic styling and region markers to *ax*."""
    sns.despine(ax=ax, offset=10)
    if title is not None:
        ax.set_title(title)
    ax.set_xlim(*xlim)
    ax.set_ylim(-1, 1)
    ax.set_xlabel("Mb")
    ax.set_ylabel("D")
    ax.axhline(0, color='k', linestyle="--", label="")
    # Locus and inversion boundaries come from module-level coordinates.
    ax.axvline(loc_start / 1e6, color='red', linestyle=":", label="Rdl")
    ax.axvline(loc_end / 1e6, color='red', linestyle=":", label="")
    ax.axvline(inv_start / 1e6, color='orange', linestyle=":", label="inversion")
    ax.axvline(inv_end / 1e6, color='orange', linestyle=":", label="")


def loop_D_statistic3(name,
                      popA_list,
                      popB_list,
                      popC_list,
                      popD_list,
                      popA_ac,
                      popB_ac,
                      popC_ac,
                      popD_ac,
                      pos,
                      block_len_snp,
                      step_len_snp,
                      cycle="C",
                      blen=100,
                      color=("blue", "darkorange", "turquoise", "crimson",
                             "magenta", "limegreen", "forestgreen",
                             "slategray", "orchid", "darkblue")):
    """Plot windowed Patterson's D for every (A, B, D) population triple.

    One figure (one PDF page) is produced per combination of ``popD``,
    ``popB`` and ``popA``. Each page has a whole-chromosome panel and a panel
    zoomed on the locus, with one step line per ``popC``; the zoomed panel's
    legend reports the block-jackknife average D inside the locus (estimate,
    standard error, Z-score and two-sided p-value). Pairs with
    ``popA == popB`` still get a page, but without any D lines.

    Parameters
    ----------
    name : str
        Suffix used in the output file name
        ``{outdir}/{outcode}.Dstat_{name}.pdf``.
    popA_list, popB_list, popC_list, popD_list : sequence
        Population keys to iterate over.
    popA_ac, popB_ac, popC_ac, popD_ac : mapping
        Allele counts per population key; only the first two allele columns
        are used.
    pos : array-like
        Variant positions, aligned with the allele-count rows.
    block_len_snp, step_len_snp : int
        Window size and step (in SNPs) for the moving D statistic.
    cycle, color :
        Unused; kept so existing callers keep working. ``color`` is now an
        immutable tuple instead of a shared mutable list.
    blen : int
        Block length passed to ``allel.average_patterson_d``.

    Relies on the module-level names ``loc_start``, ``loc_end``,
    ``inv_start``, ``inv_end``, ``outdir``, ``outcode`` and ``chrom``.
    """
    # First position of each window, used as the x coordinate of the steps.
    windows_pos = allel.moving_statistic(pos,
                                         statistic=lambda v: v[0],
                                         size=block_len_snp,
                                         step=step_len_snp)

    # calculate pvalues and focus in this region: duplicated region proper
    is_locus = np.logical_and(pos > loc_start, pos < loc_end)  # gene region

    colors = cm.rainbow(np.linspace(0, 1, len(popC_list)))

    # The context manager guarantees the PDF is finalised even if a page fails.
    with PdfPages("%s/%s.Dstat_%s.pdf" % (outdir, outcode, name)) as pdf:
        for popD in popD_list:
            for popB in popB_list:
                for popA in popA_list:

                    print("(((%s,%s),X),%s) chr" % (popA, popB, popD))

                    fig = plt.figure(figsize=(10, 2))

                    # whole chromosome: frame
                    ax1 = plt.subplot(1, 2, 1)
                    _style_d_axis(ax1, (0, 50),
                                  title="Chr %s (((%s,%s),X),%s)" %
                                  (chrom, popA, popB, popD))

                    # zoomed region: frame
                    ax2 = plt.subplot(1, 4, 3)
                    _style_d_axis(ax2,
                                  (loc_start / 1e6 - 1, loc_end / 1e6 + 1))

                    if popA != popB:
                        # These slices do not depend on popC.
                        aca = popA_ac[popA][:, 0:2]
                        acb = popB_ac[popB][:, 0:2]
                        acd = popD_ac[popD][:, 0:2]

                        for cn, popC in enumerate(popC_list):
                            acc = popC_ac[popC][:, 0:2]

                            # block-wise patterson D (normalised)
                            admix_pd_n_win = allel.moving_patterson_d(
                                aca=aca, acb=acb, acc=acc, acd=acd,
                                size=block_len_snp, step=step_len_snp)

                            # whole chromosome: plot
                            ax1.step(windows_pos / 1e6,
                                     admix_pd_n_win,
                                     color=colors[cn])

                            # estimated D in locus with pval
                            admix_pd_av_indup = allel.average_patterson_d(
                                aca=aca[is_locus],
                                acb=acb[is_locus],
                                acc=acc[is_locus],
                                acd=acd[is_locus],
                                blen=blen)
                            # convert Z-score (num of SD from 0) to pval (two-sided)
                            admix_pd_av_indup_pval = scipy.stats.norm.sf(
                                abs(admix_pd_av_indup[2])) * 2

                            # zoomed region: plot
                            ax2.step(
                                windows_pos / 1e6,
                                admix_pd_n_win,
                                color=colors[cn],
                                where="post",
                                label="%s\nD = %.3f +/- %.3f | Z = %.3f | p = %.3E"
                                % (popC, admix_pd_av_indup[0],
                                   admix_pd_av_indup[1], admix_pd_av_indup[2],
                                   admix_pd_av_indup_pval))

                    ax2.axhline(0, color='k', linestyle="--", label="")
                    ax2.legend(loc='center left', bbox_to_anchor=(1.1, 0.5))

                    # save pdf
                    pdf.savefig(fig, bbox_inches='tight')
                    # Release the figure; otherwise one stays open per triple.
                    plt.close(fig)
# Haplotype diversity per haplotype cluster, plus the two extra groups
# "no_wt" and "no_alt": a windowed profile along the chromosome and a
# leave-one-out estimate inside the focus region.
# NOTE(review): this fragment ends inside the loop body; `clu_varbool`,
# `clu_varbool_focus`, `rmv_miss_bool`, `popdich_clu` and the oc_* objects
# are defined outside it.
fig = plt.figure(figsize=(8,12))
ax9 = plt.subplot(3, 1, 1)
j=0
for i,clui in enumerate(np.append(clu_list_ids_fil,np.append("no_wt","no_alt"))):
    # which cluster
    clu_key = "cluster_"+str(clui)
    # which haplotypes to include in the cluster-wise analysis of selection?
    # (boolean mask over haplotype indices, True for members of this cluster)
    clu_sambool = np.isin(range(0,oc_haploty_hap_seg.n_haplotypes),test_elements=popdich_clu[clu_key])
    # additionally require the haplotype to pass rmv_miss_bool
    clu_sambool = np.logical_and(clu_sambool,rmv_miss_bool)
    # hap div along chromosome
    # (first position of each 50-variant window, stepping by 10, so that the
    # positions line up with the windowed diversity values below)
    clu_pos_wib = allel.moving_statistic(oc_hapvars_seg["POS"].subset(sel0=clu_varbool), statistic=lambda v: v[0], size=50, step=10)
    clu_hdi_wib = allel.moving_haplotype_diversity(oc_haploty_hap_seg.subset(sel0=clu_varbool,sel1=clu_sambool), size=50, step=10)
    # hap div in focus region
    # leave-one-out: recompute the diversity once per haplotype, each time
    # with that haplotype removed from the cluster
    j_index = np.array(popdich_clu[clu_key]).tolist()
    j_run = len(j_index)
    j_hdi = np.zeros(shape=j_run)
    for k in range(j_run):
        j_sel1 = j_index[0:k] + j_index[k+1:j_run]
        j_hdi[k] = allel.haplotype_diversity(oc_haploty_hap_seg.subset(sel0=clu_varbool_focus, sel1=j_sel1))
    # summarise the leave-one-out values; the label below reports them as
    # mean, SE, CI95 bounds and n
    j_av,j_se,j_cl,j_cu,j_nu = mean_se_ci_report(j_hdi)
    clu_label = "%s\nh = %.6f +/- %.6f SE, %.6f-%.6f CI95, n=%i" % (clu_key, j_av, j_se, j_cl, j_cu, j_nu)
    print(clu_label)
    # plot
    plt.subplot(3, 1, 1)
def _read_sample_labels(path, label):
    """Return ``(sample_id, label)`` pairs, one per non-blank line of *path*."""
    with open(path, 'r') as handle:
        # The sample id is the first whitespace-separated field of each line.
        return [(line.split()[0], label) for line in handle if line.strip()]


def getPCADist(vcf, fpop1, fpop2, window_size):
    """Compute a windowed PCA-based distance between two populations.

    Reads the genotypes of the samples listed in ``fpop1`` and ``fpop2`` from
    ``vcf`` and, chromosome by chromosome, keeps only variants with no
    missing genotype calls and a non-zero count for every allele. The
    alternate-allele counts are passed to ``runPCA`` in non-overlapping
    windows of ``window_size`` variants. The result is written to the
    tab-separated file ``calculatePCADist.out`` in the working directory.

    Parameters
    ----------
    vcf : str
        Path to the VCF file.
    fpop1, fpop2 : str
        Text files whose first whitespace-separated field on each line is a
        sample id of population 1 / population 2. Blank lines are ignored.
    window_size : int or str
        Number of variants per window; converted with ``int``.

    Returns
    -------
    None

    Notes
    -----
    Chromosomes with fewer than ``window_size`` retained variants yield no
    window and are skipped.
    """
    window_size = int(window_size)

    # Getting the samples (a sample listed in both files ends up in pop2,
    # because later entries overwrite earlier ones)
    Pops = dict(
        _read_sample_labels(fpop1, 'pop1') + _read_sample_labels(fpop2, 'pop2'))
    Samples = list(Pops)

    print("Reading vcf")
    callset = allel.read_vcf(
        vcf, ['samples', 'variants/CHROM', 'variants/POS', 'calldata/GT'],
        samples=Samples)
    samples = callset['samples']
    chromosomes = callset['variants/CHROM']
    positions = callset['variants/POS']
    gts = callset['calldata/GT']

    idx = allel.ChromPosIndex(chromosomes, positions)

    # Unique chromosome names in order of first appearance.
    chroms = list(dict.fromkeys(chromosomes))

    # Getting sample indices: column index of each sample in the callset
    # (first occurrence), grouped by population label.
    first_index = {}
    for column, sample in enumerate(samples):
        first_index.setdefault(sample, column)
    dpops = defaultdict(list)
    for sample in samples:
        dpops[Pops.get(sample, 'other')].append(first_index[sample])

    print("Calculating pop distance from the centroid")
    Dist = []
    for chrom in chroms:
        chr_slice = idx.locate_key(chrom)
        chr_vars = positions[chr_slice]

        # Getting genotypes
        chr_gts = gts[chr_slice]

        # Filtering out rows (positions) with missing genotypes
        missing = allel.GenotypeArray(chr_gts).is_missing()
        bool_missing = missing.any(axis=1)
        chr_nomissing = chr_gts[~bool_missing]
        chr_vars_nomissing = chr_vars[~bool_missing]

        # Retaining rows (positions) with segregating genotypes
        segs = allel.GenotypeArray(chr_nomissing).count_alleles() > 0
        bool_segs = segs.all(axis=1)
        chr_segregating = chr_nomissing[bool_segs]
        chr_vars_segregating = chr_vars_nomissing[bool_segs]

        # Converting genotypes to one code number
        chr_nalt = allel.GenotypeArray(chr_segregating).to_n_alt()

        # Calculating distance
        win_stat = allel.moving_statistic(chr_nalt,
                                          runPCA,
                                          size=window_size,
                                          pop_1=dpops['pop1'],
                                          pop_2=dpops['pop2'])
        if len(win_stat) == 0:
            # No complete window on this chromosome; np.concatenate would
            # fail on an empty sequence.
            continue
        flat_stat = np.concatenate(win_stat)

        # First and last variant position of every complete window.
        starts = chr_vars_segregating[::window_size]
        stops = chr_vars_segregating[window_size - 1::window_size]

        wf = pd.DataFrame({
            'chrom': chrom,
            'dist': flat_stat,
            'SNP_start': starts[:len(flat_stat)],
            'SNP_stop': stops[:len(flat_stat)],
            'SNPs': window_size
        })
        Dist.append(wf)

    dW = pd.concat(Dist)
    dW['mid'] = dW['SNP_start'] + (dW['SNP_stop'] - dW['SNP_start']) / 2
    # Running window number across all chromosomes.
    dW['window'] = list(range(len(dW['dist'])))
    dW.to_csv('calculatePCADist.out',
              sep='\t',
              index=False,
              header=True,
              columns=[
                  'chrom', 'SNP_start', 'SNP_stop', 'mid', 'window', 'SNPs',
                  'dist'
              ])