import numpy as np
import allel
from numpy.testing import assert_array_equal


def test_sfs_folded():
    ac = [[0, 3], [1, 2], [2, 1]]
    expect = [1, 2]
    actual = allel.sfs_folded(ac)
    assert_array_equal(expect, actual)
    for dtype in 'u2', 'i2', 'u8', 'i8':
        aca = np.asarray(ac, dtype=dtype)
        actual = allel.sfs_folded(aca)
        assert_array_equal(expect, actual)
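# The expected values can be reproduced by hand: the folded SFS bins variants
# by minor allele count, so row [0, 3] falls in bin 0 and rows [1, 2] and
# [2, 1] both fall in bin 1, giving [1, 2]. Minimal sketch recomputing this
# with plain numpy (illustrative only, not part of the original test suite):
ac_arr = np.array([[0, 3], [1, 2], [2, 1]])
minor_counts = ac_arr.min(axis=1)    # per-variant minor allele count
print(np.bincount(minor_counts))     # -> [1 2], matching `expect`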
def asfs_stats(gt, pos, fold):
    """Calculate the allele frequency spectrum.

    Future implementations will utilize the breakpoints from the msprime
    tree object to find unlinked positions.

    Parameters
    ----------
    gt : allel.GenotypeArray
        genotype array
    pos : array_like
        variant positions corresponding to gt
    fold : bool
        if True, return folded SFS

    Returns
    -------
    sfs : ndarray
        site frequency spectrum, normalized to sum to 1

    """
    # TODO: random sample OR use msprime breakpoints to reduce linkage
    gtseg, pos_s = get_seg(gt, pos)
    # sfs
    if fold:
        sfsp = (allel.sfs_folded(gtseg.count_alleles(), gtseg.shape[1]))[1:]
    else:
        sfsp = (allel.sfs(gtseg.count_alleles()[:, 1], gtseg.shape[1]))[1:-1]
    tots = np.sum(sfsp)
    sfs = sfsp / tots
    return sfs
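# asfs_stats depends on an external helper, get_seg, that is not defined in
# this snippet (presumably it subsets gt/pos to segregating sites). A minimal
# sketch of just the core folded, normalized SFS computation on a toy genotype
# array, assuming segregating sites have already been selected:
gt_small = allel.GenotypeArray([[[0, 0], [0, 1], [1, 1]],
                                [[0, 1], [0, 1], [0, 0]],
                                [[0, 0], [0, 0], [0, 1]]])
ac_small = gt_small.count_alleles()
sfs_small = allel.sfs_folded(ac_small)[1:]   # drop the monomorphic bin
sfs_small = sfs_small / sfs_small.sum()      # normalize, as in asfs_stats
print(sfs_small)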
def binned_sfs_mean(ac, bin_no=5):
    """
    Calculates the mean allele counts in bins across the site frequency
    spectrum. Ignores position 0, which corresponds to monomorphic sites
    (these can be non-zero in subpopulations).

    Arguments
    -------------
    ac: allele counts (in the format returned from scikit-allel)
    bin_no: number of roughly equally spaced bins

    Returns
    ----------
    dictionary with the bin range as the key and the bin mean as the value
    """
    sfs = allel.sfs_folded(ac)[1:]  # drop monomorphic sites
    if bin_no > len(sfs):
        raise ValueError(
            "The number of bins cannot exceed the length of the site "
            "frequency spectrum.")
    split_sfs = np.array_split(sfs, bin_no)  # splits into roughly equal bins
    idx = 0
    stats = {}
    for array in split_sfs:
        bin_mean = array.mean()
        bin_label = f"{idx}_{idx + len(array)}"
        stats[bin_label] = bin_mean
        idx += len(array)
    return stats
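# A minimal usage sketch for binned_sfs_mean on a toy allele-counts array
# (values here are illustrative, not from a real dataset):
ac_bins = allel.AlleleCountsArray([[5, 1], [4, 2], [3, 3], [2, 4],
                                   [1, 5], [0, 6], [6, 0], [5, 1]])
print(binned_sfs_mean(ac_bins, bin_no=3))
# keys look like "0_1", "1_2", "2_3": half-open index ranges into the
# folded spectrum (after the monomorphic bin is dropped)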
def sfs_plot(c, ac_subpops, save=True, fold=True, scale=True):
    """
    Plot the site frequency spectrum for each population.

    note: should filter on segregating sites if only using a subset of pops
    note: keep only biallelic sites if >1 alternate allele:
        is_biallelic_01 = ac_seg['all'].is_biallelic_01()[:]
        ac1 = ac_seg['BFM'].compress(is_biallelic_01, axis=0)[:, :2]
        ac2 = ac_seg['AOM'].compress(is_biallelic_01, axis=0)[:, :2]
    """
    sfsdict = {}
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.despine(ax=ax, offset=10)
    for pop in ac_subpops.keys():
        acu = ac_subpops[pop]
        flt = acu.is_segregating() & (acu.max_allele() == 1)
        print('SFS : retaining', np.count_nonzero(flt), 'SNPs')
        # ac1 = allel.AlleleCountsArray(ac_subpops[pop].compress(flt, axis=0)[:, :2])
        ac1 = allel.AlleleCountsArray(ac_subpops[pop].compress(flt, axis=0))
        if fold and scale:
            sfs = allel.sfs_folded_scaled(ac1)
        elif fold and not scale:
            sfs = allel.sfs_folded(ac1)
        elif not fold and not scale:
            sfs = allel.sfs(ac1[:, 1])
        elif not fold and scale:
            sfs = allel.sfs_scaled(ac1[:, 1])
        sfsdict[pop] = sfs
        # note: the plotting helper below always assumes a folded, scaled spectrum
        allel.stats.plot_sfs_folded_scaled(sfsdict[pop], ax=ax, label=pop,
                                           n=ac1.sum(axis=1).max())
    ax.legend()
    ax.set_title('{} Scaled folded site frequency spectra'.format(c))
    ax.set_xlabel('minor allele frequency')
    if save:
        fig.savefig("ScaledSFS-{}.pdf".format(c), bbox_inches='tight')
    return sfsdict
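# A minimal usage sketch for sfs_plot, assuming a toy GenotypeArray and two
# made-up subpopulations (the population names and chromosome label are
# hypothetical). ac_subpops is built with GenotypeArray.count_alleles_subpops:
import matplotlib.pyplot as plt
import seaborn as sns

gt_pops = allel.GenotypeArray([
    [[0, 0], [0, 1], [1, 1], [0, 0], [0, 1], [0, 1]],
    [[0, 1], [0, 1], [0, 0], [0, 1], [1, 1], [0, 0]],
    [[0, 0], [0, 0], [0, 1], [0, 1], [0, 1], [1, 1]],
    [[0, 1], [1, 1], [1, 1], [0, 1], [0, 1], [0, 1]],
])
subpops = {'popA': [0, 1, 2], 'popB': [3, 4, 5]}
ac_subpops_toy = gt_pops.count_alleles_subpops(subpops)
sfsdict_toy = sfs_plot('chr_toy', ac_subpops_toy, save=False)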
def plot_sfs(segregating_biallelic_snp_acs_by_pop_id, metadataObj):
    # joint_sfs for two populations (works)
    for (pop_id_A, pop_id_B) in itertools.combinations(
            [pop_id for pop_id in metadataObj.pop_ids_order], 2):
        fig, ax = plt.subplots(figsize=(8, 5))
        sns.despine(ax=ax)
        sfs1 = allel.sfs_folded(
            segregating_biallelic_snp_acs_by_pop_id[pop_id_A])
        # allel.plot_sfs_folded(sfs1, ax=ax, label=pop_id_A,
        #                       n=segregating_biallelic_snp_acs_by_pop_id[pop_id_A].sum(axis=1).max())
        allel.plot_sfs_folded(sfs1, ax=ax, label=pop_id_A)
        sfs2 = allel.sfs_folded(
            segregating_biallelic_snp_acs_by_pop_id[pop_id_B])
        # allel.plot_sfs_folded(sfs2, ax=ax, label=pop_id_B,
        #                       n=segregating_biallelic_snp_acs_by_pop_id[pop_id_B].sum(axis=1).max())
        allel.plot_sfs_folded(sfs2, ax=ax, label=pop_id_B)
        ax.legend()
        ax.set_title('Folded site frequency spectra')
        # workaround bug in scikit-allel re axis naming
        ax.set_xlabel('Minor allele frequency')
        ax.set_ylim(min([min(sfs1), min(sfs2)]), max([max(sfs1), max(sfs2)]))
        fig.tight_layout()
        fig.savefig('%s.sfsfs.%s_%s.png' % (metadataObj.prefix, pop_id_A, pop_id_B),
                    format="png")
        # n : int, optional. Number of chromosomes sampled. If provided, the x axis
        # will be plotted as allele frequency, otherwise as allele count.
        jsfs = allel.joint_sfs_folded(
            segregating_biallelic_snp_acs_by_pop_id[pop_id_A][:],
            segregating_biallelic_snp_acs_by_pop_id[pop_id_B][:])
        fig, ax = plt.subplots(figsize=(6, 6))
        ax = allel.plot_joint_sfs_folded(jsfs, ax=ax,
                                         imshow_kwargs={'cmap': 'Blues'})
        total = np.sum(jsfs)
        for i in range(len(jsfs)):
            for j in range(len(jsfs[i])):
                ax.text(j, i, "{:.2f}".format(jsfs[i, j] / total),
                        ha="center", va="center", color="black", fontsize=12)
        ax.set_ylabel('Alternate allele count, %s' % pop_id_A)
        ax.set_xlabel('Alternate allele count, %s' % pop_id_B)
        fig.tight_layout()
        fig.savefig('%s.jsfs.%s_%s.png' % (metadataObj.prefix, pop_id_A, pop_id_B),
                    format="png")
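# plot_sfs above expects a metadata object exposing .pop_ids_order and .prefix;
# the core call it wraps is allel.joint_sfs_folded. A minimal sketch of that
# call on two toy allele-counts arrays (all values illustrative):
ac_A = allel.AlleleCountsArray([[3, 1], [2, 2], [1, 3]])
ac_B = allel.AlleleCountsArray([[4, 0], [2, 2], [3, 1]])
jsfs_toy = allel.joint_sfs_folded(ac_A, ac_B)
print(jsfs_toy)  # 2-D array indexed by minor allele count in each population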
def plot_sfs(allele_counts,
             dest=os.path.join(plot_dir, "site_freq_spectrum.pdf")):
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.despine(ax=ax, offset=10)
    sfs = allel.sfs_folded(allele_counts)
    # sfs = sfs[1:-1]
    x = np.arange(0, sfs.shape[0])
    n = allele_counts.sum(axis=1).max()
    x = x / n  # to frequencies
    # sns.barplot(x, sfs, ax=ax, color=sns.color_palette("Blues")[3])
    ax.plot(x, sfs)
    # allel.plot_sfs_folded(sfs, ax=ax, label='TB', n=allele_counts.sum(axis=1).max())
    # ax.legend()
    ax.set_title("Folded site frequency spectrum")
    # workaround bug in scikit-allel re axis naming
    ax.set_xlabel("minor allele frequency")
    ax.set_ylabel("num sites")
    fig.savefig(dest)
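# This plot_sfs assumes module-level os/plt/sns imports and a plot_dir variable
# for its default destination. A minimal usage sketch with dest passed
# explicitly and a toy allele-counts array (output filename is hypothetical):
ac_plot = allel.AlleleCountsArray([[5, 1], [3, 3], [4, 2], [1, 5]])
plot_sfs(ac_plot, dest="site_freq_spectrum_toy.pdf")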
def sfs(haplotype, ac, nindiv=None, folded=False):
    """Compute the SFS for a SNP matrix."""
    if nindiv is None:
        nindiv = haplotype.shape[1]
    tmp_df = pd.DataFrame({"N_indiv": range(1, nindiv)})
    if folded:
        df_sfs = pd.DataFrame(allel.sfs_folded(ac), columns=["count_SNP"])
        df_sfs["i_xi"] = allel.sfs_folded_scaled(ac)
    else:
        df_sfs = pd.DataFrame(allel.sfs(ac.T[1]), columns=["count_SNP"])
        df_sfs["i_xi"] = allel.sfs_scaled(ac.T[1])
    df_sfs.index.name = "N_indiv"
    df_sfs.reset_index(inplace=True)
    df_sfs = df_sfs.merge(tmp_df, on="N_indiv", how="right").fillna(0).astype(int)
    df_sfs["freq_indiv"] = df_sfs.N_indiv / nindiv
    return df_sfs
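# A minimal usage sketch for sfs() on a toy haplotype matrix (values are
# illustrative; ac is the corresponding allele-counts array):
import pandas as pd

hap_toy = allel.HaplotypeArray([[0, 1, 0, 1],
                                [0, 0, 0, 1],
                                [1, 1, 1, 0]])
ac_hap = hap_toy.count_alleles()
print(sfs(hap_toy, ac_hap, folded=True))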