def simulate(out_path, species, model, genetic_map, seed, chrmStr,
             sample_size=20, population=0, ld_thresh=1.0, max_workers=1):
    """Simulate one chromosome with msprime, save it and plot its SFS.

    The tree sequence is written to ``out_path``. The site frequency
    spectrum (without the zero bin) of all sites is always plotted to
    ``out_path + ".sfs.pdf"``. When ``ld_thresh`` is below 1.0 a second
    spectrum, restricted to the sites selected by ``unlinked``, is added
    to the same plot and the selection is pickled to
    ``out_path + ".r2Mask.p"``.

    Parameters
    ----------
    out_path : str
        Destination of the tree sequence; also the prefix of the mask and
        plot files.
    species, model, genetic_map
        Project objects; only ``species.genome.chromosomes``,
        ``model.asdict()`` and ``genetic_map.name`` are used here.
    seed : int
        Random seed forwarded to ``msp.simulate``.
    chrmStr : str
        Key of the chromosome in ``species.genome.chromosomes``.
    sample_size : int
        Number of samples, all taken at time 0 from ``population``.
    population : int
        Population index the samples are drawn from.
    ld_thresh : float
        Threshold forwarded to ``unlinked``; masking is skipped at 1.0 or
        above.
    max_workers : int
        Forwarded to ``unlinked``.
    """
    mask_path = out_path + ".r2Mask.p"
    sfs_path = out_path + ".sfs.pdf"
    chrom = species.genome.chromosomes[chrmStr]
    samples = [msp.Sample(population=population, time=0)] * sample_size

    print("Simulating...")
    ts = msp.simulate(
        samples=samples,
        recombination_map=chrom.recombination_map(genetic_map.name),
        mutation_rate=chrom.default_mutation_rate,
        random_seed=seed,
        **model.asdict())
    ts.dump(out_path)

    haps = allel.HaplotypeArray(ts.genotype_matrix())
    SFSs = []
    SFSs.append(allel.sfs(haps.count_alleles()[:, 1])[1:])
    print("Simulation finished!")

    if ld_thresh < 1.0:
        # `ul` is used below as a row selector on the haplotype array.
        ul = unlinked(ts, ld_thresh, max_workers)
        # Close the file deterministically so the pickle is fully flushed
        # before anything else tries to read it.
        with open(mask_path, "wb") as mask_file:
            pickle.dump(ul, mask_file)
        SFSs.append(allel.sfs(haps[ul, :].count_alleles()[:, 1])[1:])
    plot_sfs(SFSs, sfs_path)
def test_sfs(self):
    """Check allel.sfs on a plain list and on several integer dtypes."""
    derived_counts = [0, 1, 2, 1]
    expected = [1, 2, 1]

    aeq(expected, allel.sfs(derived_counts))

    # The spectrum must not depend on the integer dtype of the input.
    for type_code in ('u2', 'i2', 'u8', 'i8'):
        typed_counts = np.asarray(derived_counts, dtype=type_code)
        aeq(expected, allel.sfs(typed_counts))
def ts_to_stairway(self, ts_path, num_bootstraps=1, mask_file=None):
    """
    Converts the specified tskit tree sequences to text files used by
    stairway plot.

    For every path in ``ts_path`` the tree sequence is loaded, sites that
    fall inside a masked interval are dropped, and the derived allele
    counts of the remaining sites are pooled. An SFS plot (unmasked and
    masked spectrum) is written next to each input as ``<path>.sfs.pdf``.
    One stairway input file is written for the observed data (index 0)
    and one per bootstrap replicate (sites resampled with replacement).

    Parameters
    ----------
    ts_path : iterable of str
        Paths of tree sequence files. The part of the file name before
        the first "." is matched against column 0 of the mask table.
    num_bootstraps : int
        Number of bootstrap replicates in addition to the observed data.
    mask_file : optional
        Tab separated, header-less table; column 0 is compared with the
        chromosome name, columns 1 and 2 are used as interval bounds.
        The summed interval length is subtracted from the total length.

    Returns
    -------
    list
        Paths ``self.workdir / "sfs_<k>.txt"`` of the written files.
    """
    derived_counts_all = [[] for _ in range(num_bootstraps + 1)]
    total_length = 0
    num_samples = 0

    # The mask table does not depend on the tree sequence, so read it once
    # instead of once per input file.
    mask_table = None
    if mask_file:
        mask_table = pd.read_csv(mask_file, sep="\t", header=None)

    for ts_p in ts_path:
        ts = tskit.load(ts_p)
        total_length += ts.sequence_length
        num_samples = ts.num_samples
        haps = ts.genotype_matrix()

        # Masking. The genotype matrix and ts.variants() are both indexed
        # by site, so the mask must have one entry per site (the mutation
        # count can differ when a site carries several mutations).
        masked = np.full(ts.num_sites, False)
        if mask_table is not None:
            chrom = ts_p.split("/")[-1].split(".")[0]
            sub = mask_table[mask_table[0] == chrom]
            mask_ints = pd.IntervalIndex.from_arrays(sub[1], sub[2])
            snp_locs = [int(x.site.position) for x in ts.variants()]
            # IntervalIndex.contains is elementwise in current pandas (one
            # flag per interval); np.any reduces it to "inside any masked
            # interval" and also accepts the scalar older versions return.
            in_mask = np.array(
                [np.any(mask_ints.contains(x)) for x in snp_locs],
                dtype=bool)
            masked = np.logical_or(masked, in_mask)
            total_length -= np.sum(mask_ints.length)
        retain = np.logical_not(masked)

        SFSs = []
        # append unmasked SFS
        SFSs.append(
            allel.sfs(allel.HaplotypeArray(haps).count_alleles()[:, 1])[1:])
        # get masked allele counts and append SFS
        allele_counts = allel.HaplotypeArray(haps[retain, :]).count_alleles()
        SFSs.append(allel.sfs(allele_counts[:, 1])[1:])
        sfs_path = ts_p + ".sfs.pdf"
        plots.plot_sfs(SFSs, sfs_path)

        # Bootstrap allele counts
        derived_counts_all[0].extend(allele_counts[:, 1])
        nsites = np.shape(allele_counts)[0]
        for j in range(1, num_bootstraps + 1):
            bootset = np.random.choice(
                np.arange(0, nsites, 1), nsites, replace=True)
            bootac = allele_counts[bootset, :]
            derived_counts_all[j].extend(bootac[:, 1])

    # Get the SFS minus the 0 bin and write output
    stairway_files = []
    for idx, derived_counts in enumerate(derived_counts_all):
        sfs = allel.sfs(derived_counts)[1:]
        filename = self.workdir / "sfs_{}.txt".format(idx)
        write_stairway_sfs(total_length, num_samples, sfs, filename)
        stairway_files.append(filename)
    return stairway_files
def test_sfs():
    """Check allel.sfs with and without an explicit chromosome count."""
    derived_counts = [0, 1, 2, 1]

    # Without n the spectrum length follows the largest observed count.
    expected = [1, 2, 1]
    assert_array_equal(expected, allel.sfs(derived_counts))
    for type_code in ('u2', 'i2', 'u8', 'i8'):
        typed_counts = np.asarray(derived_counts, dtype=type_code)
        assert_array_equal(expected, allel.sfs(typed_counts))

    # explicitly provide number of chromosomes
    expected_padded = [1, 2, 1, 0]
    assert_array_equal(expected_padded, allel.sfs(derived_counts, n=3))

    # n smaller than the largest observed count is rejected.
    with pytest.raises(ValueError):
        allel.sfs(derived_counts, n=1)
def asfs_stats(gt, pos, fold):
    """Calculate the normalized allele frequency spectrum.

    The inputs are first reduced by ``get_seg`` (presumably to segregating
    sites - confirm against that helper). The spectrum is computed with
    the number of columns of the reduced array as chromosome count, the
    fixed classes are trimmed, and the remaining bins are divided by
    their sum.

    Parameters
    ----------
    gt : array-like
        Passed to ``get_seg``; the array it returns must provide
        ``count_alleles()`` and a two-dimensional ``shape``.
    pos : array-like
        Passed to ``get_seg``; the positions it returns are not used.
    fold : bool
        if True, return folded SFS

    Returns
    -------
    sfs : ndarray
        Proportion of sites per frequency class. The folded spectrum
        drops its first bin, the unfolded one its first and last bin.

    """
    # TODO: random sample OR use msprime breakpoint to reduce linkage
    gtseg, _ = get_seg(gt, pos)
    n_chrom = gtseg.shape[1]

    # sfs
    if fold:
        class_counts = allel.sfs_folded(gtseg.count_alleles(), n_chrom)[1:]
    else:
        derived = gtseg.count_alleles()[:, 1]
        class_counts = allel.sfs(derived, n_chrom)[1:-1]

    return class_counts / np.sum(class_counts)
def asfsStatsSeg(gt, pops, chrm, rand=True, plot=False):
    """Aggregate SFS: singleton and doubleton fractions per population.

    For each entry of ``pops`` the genotype columns of that population
    are selected, restricted to segregating sites, and the unfolded SFS
    of a (sub)sample of those sites is computed and printed.

    Parameters
    ----------
    gt : genotype array providing ``take``, ``compress`` and
        ``count_alleles``
    pops : iterable of column index collections, one per population
    chrm : unused here
    rand : bool
        If True, at most 100000 segregating sites are drawn without
        replacement; otherwise all segregating sites are used.
    plot : bool
        If True, draw each spectrum on a new 6x6 figure.

    Returns
    -------
    tuple of two lists
        Per population, SFS bin 1 and SFS bin 2 divided by the SFS total.
    """
    print("asfs")
    singleton_fracs = []
    doubleton_fracs = []
    max_snps = 100000  # number of SNPs to choose randomly

    for pop_columns in pops:
        gt_pop = gt.take(pop_columns, axis=1)
        is_seg = gt_pop.count_alleles().is_segregating()
        gt_seg = gt_pop.compress(is_seg)
        n_seg = gt_seg.shape[0]

        # random snps
        if not rand:
            picked = np.random.choice(n_seg, n_seg, replace=False)
        else:
            try:
                picked = np.random.choice(n_seg, max_snps, replace=False)
            except ValueError:
                # Fewer segregating sites than requested: take all of them.
                picked = np.random.choice(n_seg, n_seg, replace=False)
        picked.sort()

        gt_sample = gt_seg.take(picked, axis=0)
        spectrum = allel.sfs(gt_sample.count_alleles()[:, 1])
        print(spectrum)

        if plot:
            fig, ax = plt.subplots(figsize=(6, 6))
            allel.stats.plot_sfs(spectrum, ax=ax)

        total = np.sum(spectrum)
        singleton_fracs.append(spectrum[1] / total)
        doubleton_fracs.append(spectrum[2] / total)

    return (singleton_fracs, doubleton_fracs)
def sfs_plot(c, ac_subpops, save=True, fold=True, scale=True):
    """Compute and plot one site frequency spectrum per population.

    For every population, sites are kept when they are segregating and
    their highest observed allele index is 1. The spectrum selected by
    ``fold`` / ``scale`` is computed from the kept sites and drawn on a
    shared axis with the matching scikit-allel plot function, so axis
    labels and title agree with the spectrum that was computed.

    Notes carried over from the original author:
    - should filter on segregating if only using subset of pops
    - only biallelic if >1 allele, e.g.
        is_biallelic_01 = ac_seg['all'].is_biallelic_01()[:]
        ac1 = ac_seg['BFM'].compress(is_biallelic_01, axis=0)[:, :2]
        ac2 = ac_seg['AOM'].compress(is_biallelic_01, axis=0)[:, :2]

    Parameters
    ----------
    c : object
        Label formatted into the plot title and the output file name.
    ac_subpops : dict
        Maps population name to an allele counts array.
    save : bool
        If True, write the figure to ``ScaledSFS-<c>.pdf``.
    fold : bool
        Folded (minor allele) spectrum if True, unfolded otherwise.
    scale : bool
        Scaled spectrum if True, raw counts otherwise.

    Returns
    -------
    dict
        Maps population name to the computed spectrum.
    """
    # (fold, scale) -> (spectrum function, plot function, title prefix)
    variants = {
        (True, True): (allel.sfs_folded_scaled,
                       allel.stats.plot_sfs_folded_scaled,
                       'Scaled folded'),
        (True, False): (allel.sfs_folded,
                        allel.stats.plot_sfs_folded,
                        'Folded'),
        (False, True): (allel.sfs_scaled,
                        allel.stats.plot_sfs_scaled,
                        'Scaled'),
        (False, False): (allel.sfs,
                         allel.stats.plot_sfs,
                         'Unfolded'),
    }
    compute_sfs, draw_sfs, title_prefix = variants[(bool(fold), bool(scale))]

    sfsdict = {}
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.despine(ax=ax, offset=10)
    for pop, acu in ac_subpops.items():
        flt = acu.is_segregating() & (acu.max_allele() == 1)
        print('SFS : retaining', np.count_nonzero(flt), 'SNPs')
        # Kept sites have no allele above index 1, so columns beyond the
        # first two are all zero and can be dropped without loss. The
        # folded SFS functions only accept two-column allele counts.
        ac1 = allel.AlleleCountsArray(acu.compress(flt, axis=0)[:, :2])
        # Folded spectra take the full counts, unfolded ones the derived
        # allele counts only.
        spectrum_input = ac1 if fold else ac1[:, 1]
        sfsdict[pop] = compute_sfs(spectrum_input)
        draw_sfs(sfsdict[pop], ax=ax, label=pop, n=ac1.sum(axis=1).max())
    ax.legend()
    ax.set_title('{} {} site frequency spectra'.format(c, title_prefix))
    ax.set_xlabel(
        'minor allele frequency' if fold else 'derived allele frequency')
    if save:
        # File name kept as-is so existing consumers still find the output.
        fig.savefig("ScaledSFS-{}.pdf".format(c), bbox_inches='tight')
    return sfsdict
def site_frequency_spectrum(genotypes: np.ndarray, population: str=None) -> np.ndarray:
    """Return the normalized site frequency spectrum of ``genotypes``.

    All axes after the first are flattened and summed, giving one allele
    count per entry of the first axis. The product of the trailing axis
    sizes is passed to ``allel.sfs`` as the number of chromosomes.

    Parameters
    ----------
    genotypes : np.ndarray
        Array whose first axis indexes sites; the remaining axes are
        summed over.
    population : str, optional
        When given, the spectrum is also plotted and saved to
        ``FIGURES_DIR/<population>.sfs.png`` (spaces replaced by
        underscores), and the current figure is cleared afterwards.

    Returns
    -------
    np.ndarray
        The spectrum divided by its sum.
    """
    allele_counts = genotypes.reshape(genotypes.shape[0], -1).sum(1)
    # np.product was removed in NumPy 2.0; np.prod is the supported name.
    # int() keeps the count integral even for an empty trailing shape,
    # where np.prod returns the float 1.0.
    n_chromosomes = int(np.prod(genotypes.shape[1:]))
    sfs = allel.sfs(allele_counts, n_chromosomes)

    if population is not None:
        plt.title('{} site frequency spectrum'.format(population))
        ax = plt.gca()
        ax = allel.plot_sfs(sfs, ax=ax)
        figure_name = '{}.sfs.png'.format(population.replace(' ', '_'))
        plt.savefig(os.path.join(FIGURES_DIR, figure_name))
        plt.clf()

    return sfs / sfs.sum()
def sfs(directory, vcffile):
    """Create a site frequency spectrum figure for a VCF file.

    Reads ``directory + vcffile + ".vcf"``, counts alleles over all
    genotype calls, computes the unfolded spectrum of the counts in
    allele column 1 and saves a line plot to
    ``directory + "/" + vcffile + "_sfs.jpg"``.

    Parameters
    ----------
    directory : str
        Location of the VCF file.
        NOTE(review): the input path is built by plain concatenation
        while the output path inserts a "/", so ``directory`` apparently
        has to end with a path separator - confirm with callers.
    vcffile : str
        File name without the ".vcf" extension.
    """
    callset = allel.read_vcf(directory + vcffile + ".vcf")
    gt = allel.GenotypeArray(callset['calldata/GT'])
    ac = gt.count_alleles()[:]
    derived = ac[:, 1]
    sfslist = allel.sfs(derived)
    # Entry k of the spectrum is the number of sites with k derived
    # alleles, so the x values start at 0, not 1.
    derived_allele_counts = list(range(len(sfslist)))
    plt.plot(derived_allele_counts, list(sfslist))
    plt.xlabel("K value")
    plt.ylabel("Number of variants")
    plt.savefig(directory + "/" + vcffile + "_sfs.jpg")
def ts_to_stairway(self, ts_path, num_bootstraps=1):
    """
    Converts the specified tskit tree sequences to text files used by
    stairway plot.

    For every path in ``ts_path`` the tree sequence is loaded and its
    allele counts are pooled. If a file ``<path>.unlinkedMask.p`` exists,
    the pickled object it contains is used to select rows (sites) of the
    genotype matrix before counting. One stairway input file is written
    for the observed data (index 0) and one per bootstrap replicate
    (sites resampled with replacement).

    Parameters
    ----------
    ts_path : iterable of str
        Paths of tree sequence files.
    num_bootstraps : int
        Number of bootstrap replicates in addition to the observed data.

    Returns
    -------
    list
        Paths ``self.workdir / "sfs_<k>.txt"`` of the written files.
    """
    derived_counts_all = [[] for _ in range(num_bootstraps + 1)]
    total_length = 0
    num_samples = 0
    for ts_p in ts_path:
        ts = tskit.load(ts_p)
        total_length += ts.sequence_length
        num_samples = ts.num_samples
        haps = ts.genotype_matrix()

        # Mask high-ld sites and return genotypes
        mask_path = ts_p + ".unlinkedMask.p"
        if os.path.exists(mask_path):
            # NOTE(security): pickle.load can execute arbitrary code; only
            # point this at mask files produced by a trusted pipeline.
            with open(mask_path, "rb") as mask_file:
                ul = pickle.load(mask_file)
            allele_counts = allel.HaplotypeArray(
                haps[ul, :]).count_alleles()
        else:
            allele_counts = allel.HaplotypeArray(haps).count_alleles()

        # Bootstrap allele counts
        derived_counts_all[0].extend(allele_counts[:, 1])
        nsites = np.shape(allele_counts)[0]
        for j in range(1, num_bootstraps + 1):
            bootset = np.random.choice(
                np.arange(0, nsites, 1), nsites, replace=True)
            bootac = allele_counts[bootset, :]
            derived_counts_all[j].extend(bootac[:, 1])

    # Get the SFS minus the 0 bin and write output
    stairway_files = []
    for idx, derived_counts in enumerate(derived_counts_all):
        sfs = allel.sfs(derived_counts)[1:]
        filename = self.workdir / "sfs_{}.txt".format(idx)
        write_stairway_sfs(total_length, num_samples, sfs, filename)
        stairway_files.append(filename)
    return stairway_files
def ts_to_stairway(self, ts_path, num_bootstraps=1):
    """
    Write stairway plot input files for a collection of tree sequences.

    Every tree sequence in ``ts_path`` is loaded, its haplotypes are
    grouped into diploid genotypes and the allele counts are pooled
    across inputs. Slot 0 of the pooled data holds the observed derived
    allele counts; slots 1..num_bootstraps hold counts of sites resampled
    with replacement. One ``sfs_<k>.txt`` file per slot is written to
    ``self.workdir`` and the list of those paths is returned.
    """
    derived_counts_all = [[] for _ in range(num_bootstraps + 1)]
    total_length = 0
    num_samples = 0

    for ts_p in ts_path:
        ts = tskit.load(ts_p)
        total_length += ts.sequence_length
        num_samples = ts.num_samples

        # Observed derived allele counts.
        haplotypes = allel.HaplotypeArray(ts.genotype_matrix())
        allele_counts = haplotypes.to_genotypes(ploidy=2).count_alleles()
        derived_counts_all[0].extend(allele_counts[:, 1])

        # Bootstrap replicates: resample sites with replacement.
        nsites = np.shape(allele_counts)[0]
        for replicate_counts in derived_counts_all[1:]:
            resampled_sites = np.random.choice(
                np.arange(0, nsites, 1), nsites, replace=True)
            resampled_ac = allele_counts[resampled_sites, :]
            replicate_counts.extend(resampled_ac[:, 1])

    # One output file per slot, spectrum written without its 0 bin.
    stairway_files = []
    for slot, pooled_counts in enumerate(derived_counts_all):
        spectrum = allel.sfs(pooled_counts)[1:]
        target = self.workdir / "sfs_{}.txt".format(slot)
        write_stairway_sfs(total_length, num_samples, spectrum, target)
        stairway_files.append(target)
    return stairway_files
def sfs(haplotype, ac, nindiv=None, folded=False):
    """
    Compute sfs for SNP matrix

    Parameters
    ----------
    haplotype : array
        Only ``haplotype.shape[1]`` is read, as the default for
        ``nindiv``.
    ac : allele counts array
        Used whole for the folded spectrum; ``ac.T[1]`` is used for the
        unfolded spectrum.
    nindiv : int, optional
        Sample size; defaults to the number of columns of ``haplotype``.
    folded : bool
        Compute the folded instead of the unfolded spectrum.

    Returns
    -------
    pandas.DataFrame
        One row per ``N_indiv`` in ``1 .. nindiv - 1`` with columns
        ``N_indiv``, ``count_SNP``, ``i_xi`` (scaled spectrum) and
        ``freq_indiv`` (``N_indiv / nindiv``). Classes absent from the
        spectrum are filled with 0.
    """
    if nindiv is None:
        nindiv = haplotype.shape[1]
    tmp_df = pd.DataFrame({"N_indiv": range(1, nindiv)})

    # Only the spectrum functions differ between the two modes; the table
    # is assembled the same way afterwards.
    if folded:
        counts = allel.sfs_folded(ac)
        scaled = allel.sfs_folded_scaled(ac)
    else:
        counts = allel.sfs(ac.T[1])
        scaled = allel.sfs_scaled(ac.T[1])

    df_sfs = pd.DataFrame(counts, columns=["count_SNP"])
    df_sfs["i_xi"] = scaled
    df_sfs.index.name = "N_indiv"
    df_sfs.reset_index(inplace=True)
    # The right merge keeps exactly the classes 1 .. nindiv - 1.
    # NOTE(review): astype(int) applies to every column, so fractional
    # i_xi values are truncated - confirm this is intended.
    df_sfs = (df_sfs.merge(tmp_df, on="N_indiv", how="right")
              .fillna(0)
              .astype(int))
    df_sfs["freq_indiv"] = df_sfs.N_indiv / nindiv
    return df_sfs
type=float, help="mutation rate (per base per generation)") parser.add_argument('generation_time', type=float, help="generation time") parser.add_argument('plot', type=bool, help='plot Ne ~ t ? T/F') args = parser.parse_args() np.random.seed(args.seed) #read in tree sequence ts = msp.load(args.infile) #count alleles, bootstrap over sites, return the SFS minus the 0% bin haps = np.array(ts.genotype_matrix()) genotypes = allel.HaplotypeArray(haps).to_genotypes(ploidy=2) allele_counts = genotypes.count_alleles() sfs = allel.sfs(allele_counts[:, 1]) sfs = sfs[1:len(sfs)] #tmp directory for input files command = ("cd " + args.outdir + ";" + "mkdir infiles") subprocess.run(command, shell=True) #write stairwayplot input out = open((join(args.outdir, "infiles", splitext(basename(args.infile))[0]) + "_strwyplt.txt"), "w") out.write( ("msp" + "\t" + str(ts.num_samples) + "\t" + str(int(ts.sequence_length)) + "\t" + str(1) + "\t" + str(ts.num_samples - 1) + "\n") ) #order is name,n_samples,sequence_length,lowest_sfs_bin,highest_sfs_bin for x in sfs: out.write(str(int(x)) + "\t")
# Build allele count arrays for the simulated and the generated YRI
# samples, then compare their site frequency spectra with the real data.

# Keep only the rows labelled "YRI" in the simulation metadata.
simYRI = simYRI[simdf['pop'] == "YRI"]
# Column-wise sum over the kept rows.
simYRI_ac_all = np.apply_along_axis(sum, 0, simYRI)
simYRI_ac_all = np.array(simYRI_ac_all, dtype="i")
# Complementary count per column.
# NOTE(review): 100 looks like the number of sampled chromosomes - confirm.
tmp = np.array([100 - x for x in simYRI_ac_all], dtype="i")
# Two columns per site: (100 - count, count).
simYRI_ac_all = allel.AlleleCountsArray(
    np.transpose(np.vstack((tmp, simYRI_ac_all))))

# Same procedure for the generated data, selected through `pred`.
genYRI = bingen
genYRI = genYRI[pred['pop'] == "YRI"]
genYRI_ac_all = np.apply_along_axis(sum, 0, genYRI)
genYRI_ac_all = np.array(genYRI_ac_all, dtype="i")
tmp = np.array([100 - x for x in genYRI_ac_all], dtype="i")
genYRI_ac_all = allel.AlleleCountsArray(
    np.transpose(np.vstack((tmp, genYRI_ac_all))))

# Unfolded spectra from column 1 of each allele count array.
# YRI_ac_all is defined outside this excerpt.
realsfs = allel.sfs(YRI_ac_all[:, 1])
gensfs = allel.sfs(genYRI_ac_all[:, 1])
simsfs = allel.sfs(simYRI_ac_all[:, 1])

# One table with the three spectra side by side plus the bin index.
sfs = pd.DataFrame()
sfs['real'] = realsfs
sfs['VAE'] = gensfs
sfs['simulation'] = simsfs
sfs['bin'] = np.arange(0, len(realsfs))
sfs.to_csv("out/1kg/1kg_sfs.csv", index=False)

################### LD decay ####################
# Histogram of `pos` (defined outside this excerpt); the indexed return
# value is not stored.
plt.hist(pos, bins=100)[2]
# Bounds of a masked interval; presumably base-pair coordinates - confirm
# against the code that uses them.
maskstart = 4.4e7
maskstop = 4.45e7
#get LD and pairwise distance for a subset of 1000 SNPs