def countN1N2N3(gt, pops, mle):
    """Average the ``estimCandK`` estimates over haplotype configurations.

    For every haplotype index ``hap1`` in ``range(len(pops[0]))`` and every
    unordered pair of haplotypes of the second population, the joint site
    frequency spectrum of the allele-1 counts is computed, collapsed into the
    three counts ``n1``, ``n2`` and ``n3`` and passed to ``estimCandK``.

    Parameters
    ----------
    gt : genotype array
        Object providing ``take(..., axis=1)`` and ``to_haplotypes()``
        (presumably an ``allel.GenotypeArray`` -- confirm against callers).
    pops : sequence
        ``pops[0]`` and ``pops[1]`` hold the column (sample) indices of the
        first and second population.
    mle :
        Forwarded unchanged to ``estimCandK``.

    Returns
    -------
    tuple
        ``(np.mean(c_hats), np.mean(k_hats))`` over all configurations.
    """
    # make gt arrays for each subpop, then haplotype arrays
    gtA = gt.take(pops[0], axis=1)
    htA = gtA.to_haplotypes()
    gtB = gt.take(pops[1], axis=1)
    htB = gtB.to_haplotypes()

    # Two haplotype columns per sample.  ``len(pops[1]) * 2`` (not
    # ``len(pops[1] * 2)``) so the count is also right when pops[1] is a
    # NumPy array, where ``* 2`` multiplies element-wise instead of repeating.
    n_hap_b = len(pops[1]) * 2
    # With a single sample this yields exactly one pair, [0, 1], so the
    # one-sample case needs no separate code path.
    hap2_pairs = [list(pair) for pair in combinations(range(n_hap_b), 2)]

    clist = []
    klist = []
    # NOTE(review): hap1 runs over len(pops[0]) indices although htA is
    # expected to have two columns per sample -- confirm this is intended.
    for hap1 in range(len(pops[0])):
        # Depends only on hap1, so compute it once per outer iteration.
        ma = htA[:, [hap1]].count_alleles(max_allele=1)
        for hap2 in hap2_pairs:
            mb = htB[:, hap2].count_alleles(max_allele=1)
            jsfs = allel.joint_sfs(ma[:, 1], mb[:, 1])
            # joint_sfs may return fewer than 2 x 3 cells when the largest
            # observed counts are small; zero-pad so every cell exists.
            z = np.zeros((2, 3), dtype=int)
            sub = jsfs[:2, :3]
            z[:sub.shape[0], :sub.shape[1]] = sub
            n3 = z[0, 2] + z[1, 0]
            n2 = z[0, 1] + z[1, 1]
            n1 = z[0, 0] + z[1, 2]
            c_hat, k_hat = estimCandK(n1, n2, n3, mle)
            clist.append(c_hat)
            klist.append(k_hat)
    return (np.mean(clist), np.mean(klist))
def _n1_n2_n3_from_jsfs(jsfs):
    """Return ``(n1, n2, n3)`` from a joint SFS zero-padded to 2 x 3 cells."""
    padded = np.zeros((2, 3), dtype=int)
    sub = jsfs[:2, :3]
    padded[:sub.shape[0], :sub.shape[1]] = sub
    n1 = padded[0, 0] + padded[1, 2]
    n2 = padded[0, 1] + padded[1, 1]
    n3 = padded[0, 2] + padded[1, 0]
    return n1, n2, n3


def countN1N2N3(gt, pops, mle):
    """Return the mean ``estimCandK`` outputs over haplotype configurations.

    One haplotype of the first population (index ``hap1`` in
    ``range(len(pops[0]))``) is compared with each unordered pair of
    haplotypes of the second population.  For each combination the joint
    site frequency spectrum of allele-1 counts is reduced to ``n1``, ``n2``
    and ``n3`` and handed to ``estimCandK``.

    Parameters
    ----------
    gt : genotype array
        Object providing ``take(..., axis=1)`` and ``to_haplotypes()``
        (presumably an ``allel.GenotypeArray`` -- confirm against callers).
    pops : sequence
        ``pops[0]`` / ``pops[1]`` are the column (sample) indices of the two
        populations.
    mle :
        Passed through to ``estimCandK`` untouched.

    Returns
    -------
    tuple
        ``(np.mean(c_hats), np.mean(k_hats))``.
    """
    # make gt arrays for each subpop, then haplotype arrays
    htA = gt.take(pops[0], axis=1).to_haplotypes()
    htB = gt.take(pops[1], axis=1).to_haplotypes()

    # Multiply the *length* by two: ``len(pops[1] * 2)`` only doubles for
    # lists, whereas a NumPy array is scaled element-wise and keeps its size.
    n_hap_b = len(pops[1]) * 2
    # A single sample produces the single pair [0, 1]; no special case needed.
    hap2_pairs = [list(pair) for pair in combinations(range(n_hap_b), 2)]

    clist = []
    klist = []
    # NOTE(review): only the first len(pops[0]) haplotype columns of htA are
    # visited -- confirm that skipping the remaining columns is intended.
    for hap1 in range(len(pops[0])):
        ma = htA[:, [hap1]].count_alleles(max_allele=1)  # invariant in hap2
        for hap2 in hap2_pairs:
            mb = htB[:, hap2].count_alleles(max_allele=1)
            jsfs = allel.joint_sfs(ma[:, 1], mb[:, 1])
            n1, n2, n3 = _n1_n2_n3_from_jsfs(jsfs)
            c_hat, k_hat = estimCandK(n1, n2, n3, mle)
            clist.append(c_hat)
            klist.append(k_hat)
    return (np.mean(clist), np.mean(klist))
def joint_site_frequency_spectrum(genotypes1: np.ndarray, genotypes2: np.ndarray, population1: str = 'population1', population2: str = 'population2') -> np.ndarray:
    """Compute, plot and normalise the joint SFS of two populations.

    Parameters
    ----------
    genotypes1, genotypes2 : np.ndarray
        Arrays whose first axis indexes variants.  All remaining axes are
        flattened and summed to obtain a per-variant allele count, so the
        entries are assumed to be per-chromosome allele indicators
        (TODO confirm against callers).
    population1, population2 : str
        Labels passed to ``plot_joint_sfs`` and used (spaces replaced by
        underscores) in the saved figure's file name.

    Returns
    -------
    np.ndarray
        The joint site frequency spectrum divided by its total.

    Notes
    -----
    Side effect: writes ``<population1>.<population2>.joint_sfs.png`` into
    ``FIGURES_DIR`` and clears the current matplotlib figure.
    """
    allele_counts1 = genotypes1.reshape(genotypes1.shape[0], -1).sum(1)
    allele_counts2 = genotypes2.reshape(genotypes2.shape[0], -1).sum(1)

    # Number of chromosomes per variant = product of the trailing axes.
    # np.prod replaces np.product (removed in NumPy 2.0); int() also guards
    # against np.prod(()) being the float 1.0 for 1-D input.
    n_chrom1 = int(np.prod(genotypes1.shape[1:]))
    n_chrom2 = int(np.prod(genotypes2.shape[1:]))

    joint_sfs = allel.joint_sfs(allele_counts1, allele_counts2, n_chrom1, n_chrom2)

    plot_joint_sfs(joint_sfs, population1, population2)
    figure_name = '{}.{}.joint_sfs.png'.format(
        population1.replace(' ', '_'), population2.replace(' ', '_'))
    plt.savefig(os.path.join(FIGURES_DIR, figure_name))
    plt.clf()

    return joint_sfs / joint_sfs.sum()
def jsfsStats(gt, pops, chrm, fold=False, plot=False):
    """Summarise the joint SFS of every population pair into 23 bins.

    Up to 100000 variants are drawn at random (without replacement) from
    ``gt``.  For each pair of populations the joint site frequency spectrum
    is computed with scikit-allel, zero-padded to the full expected size and
    reduced to 23 sums over fixed regions of the matrix.

    Parameters
    ----------
    gt : genotype array
        Object providing ``shape``, ``take`` and ``count_alleles``
        (presumably an ``allel.GenotypeArray`` -- confirm against callers).
    pops : sequence of sequences
        Column (sample) indices of each population; every unordered pair is
        processed.
    chrm :
        Not used by this function.
    fold : bool, optional
        Use ``allel.joint_sfs_folded`` instead of ``allel.joint_sfs``.
    plot : bool, optional
        Draw each padded spectrum with ``allel.stats.plot_joint_sfs`` on a
        new 6 x 6 figure.

    Returns
    -------
    list of np.ndarray
        One length-23 float array per population pair.
    """
    print("jsfs")
    n = 100000  # maximum number of SNPs to choose randomly
    n_variants = gt.shape[0]
    # Use every variant when fewer than n are available.
    vidx = np.random.choice(n_variants, min(n, n_variants), replace=False)
    vidx.sort()
    gtr = gt.take(vidx, axis=0)

    # Regions of the spectrum, expressed as slices along either axis.
    low, mid, high = slice(1, 3), slice(3, -3), slice(-3, -1)
    bins = [
        (0, low), (low, 0), (0, mid), (mid, 0), (0, high), (high, 0),
        (low, low), (low, mid), (mid, low), (high, mid), (mid, high),
        (low, high), (high, low), (mid, mid), (high, high),
        (0, -1), (-1, 0), (-1, low), (low, -1), (-1, mid), (mid, -1),
        (-1, high), (high, -1),
    ]

    jsfslist = []
    for i, j in combinations(pops, 2):
        gtpop1 = gtr.take(i, axis=1)
        gtpop2 = gtr.take(j, axis=1)
        ac1 = gtpop1.count_alleles()
        ac2 = gtpop2.count_alleles()
        if fold:
            # The unfolded branch treats each sample as two chromosomes, so a
            # minor-allele count is at most len(pop).  The previous
            # ``len(i) / 2`` is a float on Python 3 and made np.zeros raise.
            # NOTE(review): confirm the intended folded size.
            popsizeA, popsizeB = len(i), len(j)
            jsfs = allel.joint_sfs_folded(ac1, ac2)
        else:
            popsizeA, popsizeB = len(i) * 2, len(j) * 2
            jsfs = allel.joint_sfs(ac1[:, 1], ac2[:, 1])
        # pad for allel as well: the returned spectrum may be smaller than
        # the full (popsizeA + 1) x (popsizeB + 1) grid.
        fs = np.zeros((popsizeA + 1, popsizeB + 1), dtype=int)
        fs[:jsfs.shape[0], :jsfs.shape[1]] = jsfs
        if plot:
            fig, ax = plt.subplots(figsize=(6, 6))
            allel.stats.plot_joint_sfs(fs, ax=ax)
        jsfsarray = np.array([np.sum(fs[rows, cols]) for rows, cols in bins],
                             dtype=float)
        jsfslist.append(jsfsarray)
    return jsfslist
def ts_to_dadi_sfs(ts_path, out_path, out_path_nonvariant, sample_size=20, mask_file=None):
    """Convert a tree sequence into two-population joint SFS files for dadi.

    Parameters
    ----------
    ts_path : str
        Path of the tree sequence loaded with ``tskit.load``.  When a mask is
        used, the chromosome name is the file name up to its first ``.``.
    out_path : str
        Destination of the spectrum as computed from the retained sites.
    out_path_nonvariant : str
        Destination of the same spectrum with entry ``[0, 0]`` replaced by
        the number of non-variant positions.
    sample_size : int, optional
        Number of leading haplotype columns assigned to the first
        population; the remaining columns form the second one.
    mask_file : str, optional
        Tab-separated, header-less file whose columns 0, 1 and 2 are read as
        chromosome, interval start and interval end.  Sites inside any
        interval of the matching chromosome are dropped and the summed
        interval lengths are subtracted from the sequence length.
    """
    ts = tskit.load(ts_path)
    haps = ts.genotype_matrix()
    total_length = ts.sequence_length

    # One flag per site: the genotype matrix and ts.variants() are both
    # per-site, so the mask must not be sized by the mutation count.
    masked = np.zeros(ts.num_sites, dtype=bool)
    if mask_file:
        mask_table = pd.read_csv(mask_file, sep="\t", header=None)
        chrom = ts_path.split("/")[-1].split(".")[0]
        sub = mask_table[mask_table[0] == chrom]
        # NOTE(review): from_arrays uses pandas' default interval closure --
        # confirm that matches the mask file's coordinate convention.
        mask_ints = pd.IntervalIndex.from_arrays(sub[1], sub[2])
        snp_locs = [int(x.site.position) for x in ts.variants()]
        # IntervalIndex.contains(x) yields one boolean per interval in
        # current pandas; a site is masked if it falls in any interval.
        masked = np.array(
            [bool(np.any(mask_ints.contains(x))) for x in snp_locs],
            dtype=bool)
        total_length -= np.sum(mask_ints.length)

    haps_pops_joint = np.array(haps[~masked, :])

    # Break up the haplotypes into separate populations based on sample_size
    haps_pop0_joint = haps_pops_joint[:, :sample_size]
    haps_pop1_joint = haps_pops_joint[:, sample_size:]

    genotypes_pop0_joint = allel.HaplotypeArray(haps_pop0_joint).to_genotypes(
        ploidy=2)
    allele_counts_pop0_joint = genotypes_pop0_joint.count_alleles()
    genotypes_pop1_joint = allel.HaplotypeArray(haps_pop1_joint).to_genotypes(
        ploidy=2)
    allele_counts_pop1_joint = genotypes_pop1_joint.count_alleles()

    sfs_joint = allel.joint_sfs(allele_counts_pop0_joint[:, 1],
                                allele_counts_pop1_joint[:, 1])
    num_sites = sfs_joint.sum()

    sfs_joint = dadi.Spectrum(sfs_joint)
    sfs_joint.to_file(out_path)
    # need to get the number of nonvariant sites for the [0,0] entry
    sfs_joint[0, 0] = total_length - num_sites
    sfs_joint.to_file(out_path_nonvariant)
def test_joint_sfs():
    """Check ``joint_sfs`` on mixed integer dtypes with warnings as errors.

    ``dac2`` is unsigned 64-bit while ``dac1`` uses the default integer
    dtype; any warning raised while computing the spectrum fails the test.
    """
    # https://github.com/cggh/scikit-allel/issues/144
    warnings.resetwarnings()
    warnings.simplefilter('error')
    try:
        dac1 = np.array([0, 1, 2, 3, 4])
        dac2 = np.array([1, 2, 1, 2, 3], dtype='u8')
        s = joint_sfs(dac1, dac2)
        e = [[0, 1, 0, 0],
             [0, 0, 1, 0],
             [0, 1, 0, 0],
             [0, 0, 1, 0],
             [0, 0, 0, 1]]
        assert_array_equal(e, s)
    finally:
        # Restore the filters even when the assertion (or a warning turned
        # into an error) aborts the test, so later tests are not affected.
        warnings.resetwarnings()
        warnings.simplefilter('always')
def jsfs_stats(p1, gt, pos, fold):
    """Summarise the joint site frequency spectrum of two populations.

    Parameters
    ----------
    p1 : int
        Number of leading columns belonging to the first population; all
        remaining columns form the second population.
    gt : array
        Passed to ``get_seg`` together with ``pos``; the first returned
        value must provide ``shape``, ``take`` and ``count_alleles``.
    pos : array
        Passed to ``get_seg`` together with ``gt``.
    fold : bool
        If true use ``allel.joint_sfs_folded``, otherwise
        ``allel.joint_sfs`` on the allele-1 counts.

    Returns
    -------
    props
        Whatever ``summarizejsfs`` returns for the computed spectrum.
    """
    seg_gt, _ = get_seg(gt, pos)

    first = seg_gt.take(range(p1), axis=1)
    second = seg_gt.take(range(p1, seg_gt.shape[1]), axis=1)
    counts_first = first.count_alleles()
    counts_second = second.count_alleles()

    # The column counts are handed to scikit-allel as n1 / n2 so the
    # spectrum has a fixed size regardless of the observed counts.
    n_first = first.shape[1]
    n_second = second.shape[1]

    if not fold:
        spectrum = allel.joint_sfs(counts_first[:, 1], counts_second[:, 1],
                                   n_first, n_second)
    else:
        spectrum = allel.joint_sfs_folded(counts_first, counts_second,
                                          n_first, n_second)

    return summarizejsfs(spectrum)
def msprime_to_dadi_simulation(path, seed, org, chrom, sample_size=20):
    """Simulate two populations with msprime and save their joint SFS for dadi.

    Parameters
    ----------
    path : str
        Output file written with ``dadi.Spectrum.to_file``.
    seed : int
        Random seed passed to ``msprime.simulate``.
    org : str
        ``"<species module>_<model class>"``: the text before the last
        underscore names an attribute of ``stdpopsim``, the text after it
        names the model class inside that attribute.
    chrom : str
        Key into the species' ``genome.chromosomes``.
    sample_size : int, optional
        Number of samples drawn from each of populations 0 and 1.
    """
    species_name, _, model_name = org.rpartition('_')
    chromosome = getattr(stdpopsim, species_name).genome.chromosomes[chrom]
    model = getattr(getattr(stdpopsim, species_name), model_name)()

    pop0_samples = [msprime.Sample(population=0, time=0)] * sample_size
    pop1_samples = [msprime.Sample(population=1, time=0)] * sample_size

    tree_seq = msprime.simulate(
        samples=pop0_samples + pop1_samples,
        recombination_map=chromosome.recombination_map(),
        mutation_rate=chromosome.default_mutation_rate,
        random_seed=seed,
        **model.asdict())
    haplotypes = np.array(tree_seq.genotype_matrix())

    # Split the haplotype columns into the two populations by sample_size.
    per_pop_counts = []
    for pop_haps in (haplotypes[:, :sample_size], haplotypes[:, sample_size:]):
        genotypes = allel.HaplotypeArray(pop_haps).to_genotypes(ploidy=2)
        per_pop_counts.append(genotypes.count_alleles())
    counts_pop0, counts_pop1 = per_pop_counts

    spectrum = allel.joint_sfs(counts_pop0[:, 1], counts_pop1[:, 1])
    dadi.Spectrum(spectrum).to_file(path)