def calcfst(pops, posdict, gtdict, L, snp=True, hud=True):
    """Observed pairwise FST per replicate; also prints per-population pi scaled by locus length L."""
    ix = 0
    pw = len(pops)
    fstarray = np.zeros([len(posdict.keys()), int((pw * (pw - 1)) / 2)])
    # Observed FST
    pix = 0
    popdict = {}
    for p in pops:
        popdict[pix] = list(range(ix, ix + p))
        ix += p
        pix += 1
    for r in gtdict.keys():
        fst_obs = []
        if snp:
            pos_snp = [(np.random.choice(posdict[r]))]
            gt_snp = np.where(posdict[r] == pos_snp)[0]
            if len(gt_snp) > 1:
                gt_snp = gt_snp[:1]  # keep an array so the column slice stays 2-D
        else:
            pos_snp = list(posdict[r])
            gt_snp = np.arange(len(pos_snp))  # use every site when not sampling a single SNP
        for x, y in combinations(popdict.keys(), 2):
            popX = gtdict[r][popdict[x]]
            popY = gtdict[r][popdict[y]]
            sdfst = simData()
            geno = np.vstack([popX, popY])
            geno_fst = geno[:, gt_snp]
            gtpop = [''.join(str(n) for n in y) for y in geno_fst]
            gtpop_fst = [i.encode() for i in gtpop]
            sdfst.assign_sep(pos_snp, gtpop_fst)
            size = [popX.shape[0], popY.shape[0]]
            f1 = fst(sdfst, size)
            if hud:
                fst_obs.append(f1.hsm())
            else:
                fst_obs.append(f1.slatkin())
                # fst_obs.append(f1.hbk())
        # pi
        for z in popdict.keys():
            sdpop = simData()
            popZ = gtdict[r][popdict[z]]
            geno = popZ[:, gt_snp]
            gtpop = [''.join(str(n) for n in y) for y in geno]
            gtpop_pi = [i.encode() for i in gtpop]
            sdpop.assign_sep(pos_snp, gtpop_pi)
            pspopr = polySIM(sdpop)
            pi = pspopr.thetapi()
            print("pop {} pi:{}".format(z, pi / L))
        fstarray[int(r), :] = fst_obs
    return fstarray
def permtest(gtdict, posdict, pops, n_perm, fstarray):
    """Permutation test of observed FST against a resampled null distribution."""
    nhap = sum(pops)
    fst_t = []
    r = random.choice(list(gtdict.keys()))
    # FST random permutations
    for p in range(n_perm):
        popX = gtdict[r][np.random.randint(0, nhap, pops[0])]
        popY = gtdict[r][np.random.randint(0, nhap, pops[1])]
        sdfst = simData()
        geno_fst = np.vstack([popX, popY])
        gtpop_fst = [''.join(str(n) for n in y) for y in geno_fst]
        sdfst.assign_sep(posdict[r], gtpop_fst)
        size = [popX.shape[0], popY.shape[0]]
        f1 = fst(sdfst, size)
        # fst_t.append(f1.slatkin())
        fst_t.append(f1.hsm())
        # fst_t.append(f1.hbk())
    # mark significant FST
    fst_tnp = np.array(fst_t)
    Fstdist = [len(np.where(f > fst_tnp)[0]) for f in fstarray]
    return [1 - (f / float(n_perm)) for f in Fstdist]
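# A hedged usage sketch of calcfst()/permtest() above, with tiny toy inputs.
# The data shapes and the import paths below are assumptions (an older
# pylibseq API, and the way the functions index posdict/gtdict), not taken
# verbatim from the source.
import random
from itertools import combinations

import numpy as np
from libsequence.polytable import simData   # assumed pylibseq module paths
from libsequence.summstats import polySIM
from libsequence.fst import fst

pops = [10, 10]                                    # haplotypes per population
nsites = 25
posdict = {"0": np.sort(np.random.random(nsites))}                # positions per replicate
gtdict = {"0": np.random.randint(0, 2, size=(sum(pops), nsites))}  # 0/1 genotype matrix

fstarray = calcfst(pops, posdict, gtdict, L=1e4, snp=False)   # observed FST per replicate
pvals = permtest(gtdict, posdict, pops, n_perm=100, fstarray=fstarray[0])
print(pvals)  # permutation p-value(s) for the observed FST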
def testShared(self):
    x = [(0.1, "0011"), (0.2, "1100"),
         (0.3, "0100"), (0.4, "1101"),
         (0.5, "0101")]
    d = simData()
    d.assign(x)
    f = fst(d, [2, 2])
    self.assertEqual(f.shared(0, 1), [0.5])
def calcfst(pops, posdict, gtdict):
    """Observed pairwise FST (Slatkin) per replicate."""
    ix = 0
    popiix = []
    pw = len(pops)
    fstarray = np.zeros([len(posdict.keys()), int((pw * (pw - 1)) / 2)])
    # Observed FST
    for p in pops:
        popiix.append(range(ix, ix + p))
        ix += p
    for r in gtdict.keys():
        fst_obs = []
        for i, pix in enumerate(popiix):
            for j, jix in enumerate(popiix):
                if i > j:
                    popX = gtdict[r][pix]
                    popY = gtdict[r][jix]
                    sdfst = simData()
                    geno_fst = np.vstack([popX, popY])
                    gtpop_fst = [''.join(str(n) for n in y) for y in geno_fst]
                    sdfst.assign_sep(posdict[r], gtpop_fst)
                    size = [popX.shape[0], popY.shape[0]]
                    f1 = fst(sdfst, size)
                    fst_obs.append(f1.slatkin())
        fstarray[int(r), :] = fst_obs
    return fstarray
def testShared(self):
    x = [(0.1, b"0011"), (0.2, b"1100"),
         (0.3, b"0100"), (0.4, b"1101"),
         (0.5, b"0101")]
    d = simData()
    d.assign(x)
    f = fst(d, [2, 2])
    self.assertEqual(f.shared(0, 1), [0.5])
def testPriv(self):
    x = [(0.1, "0011"), (0.2, "1100"),
         (0.3, "0100"), (0.4, "1101"),
         (0.5, "0101")]
    d = simData()
    d.assign(x)
    f = fst(d, [2, 2])
    p = f.priv(0, 1)
    expected = {0: [0.3], 1: [0.4]}
    self.assertEqual(p, expected)
def testFixed(self):
    x = [(0.1, "0011"), (0.2, "1100"),
         (0.3, "0100"), (0.4, "1101"),
         (0.5, "0101")]
    d = simData()
    d.assign(x)
    f = fst(d, [2, 2])
    p = f.fixed(0, 1)
    expected = [0.1, 0.2]
    self.assertEqual(p, expected)
def testExceptionPriv(self):
    with self.assertRaises(RuntimeError):
        x = [(0.1, "0011"), (0.2, "1100"),
             (0.3, "0100"), (0.4, "1101"),
             (0.5, "0101")]
        d = simData()
        d.assign(x)
        f = fst(d, [2, 2])
        # 2 is out of range.
        sh = f.priv(2, 1)
def testExceptionPriv(self):
    with self.assertRaises(RuntimeError):
        x = [(0.1, b"0011"), (0.2, b"1100"),
             (0.3, b"0100"), (0.4, b"1101"),
             (0.5, b"0101")]
        d = simData()
        d.assign(x)
        f = fst(d, [2, 2])
        # 2 is out of range.
        sh = f.priv(2, 1)
def testFixed(self):
    x = [(0.1, b"0011"), (0.2, b"1100"),
         (0.3, b"0100"), (0.4, b"1101"),
         (0.5, b"0101")]
    d = simData()
    d.assign(x)
    f = fst(d, [2, 2])
    p = f.fixed(0, 1)
    expected = [0.1, 0.2]
    self.assertEqual(p, expected)
def testPriv(self):
    x = [(0.1, b"0011"), (0.2, b"1100"),
         (0.3, b"0100"), (0.4, b"1101"),
         (0.5, b"0101")]
    d = simData()
    d.assign(x)
    f = fst(d, [2, 2])
    p = f.priv(0, 1)
    expected = {0: [0.3], 1: [0.4]}
    self.assertEqual(p, expected)
def testException1(self):
    with self.assertRaises(RuntimeError):
        x = [(0.1, "0011"), (0.2, "1100"),
             (0.3, "0100"), (0.4, "1101"),
             (0.5, "0101")]
        d = simData()
        d.assign(x)
        ## the second argument's sum is > total sample size
        ## libsequence will throw a SeqException here,
        ## which gets translated to a RuntimeError
        f = fst(d, [2, 3])
def testException1(self):
    with self.assertRaises(RuntimeError):
        x = [(0.1, b"0011"), (0.2, b"1100"),
             (0.3, b"0100"), (0.4, b"1101"),
             (0.5, b"0101")]
        d = simData()
        d.assign(x)
        ## the second argument's sum is > total sample size
        ## libsequence will throw a SeqException here,
        ## which gets translated to a RuntimeError
        f = fst(d, [2, 3])
def make_simData(g):
    """
    Construct a :class:`libsequence.polytable.simData` from the output of msprime.

    :param g: The output from msprime

    .. note:: Thanks to Jerome Kelleher for pointing out the quick
       implementation using msprime >= 0.4.0.

    Example:

    >>> import msprime as msp
    >>> from libsequence.msprime import make_simData
    >>> g = msp.simulate(sample_size=10, Ne=1e6, recombination_rate=1e-8, mutation_rate=1e-8, length=1e4)
    >>> s = make_simData(g)
    """
    return simData([(v.position, v.genotypes) for v in g.variants(as_bytes=True)])
def calc_otherstats(positions, gtdict, popsizelist):
    """Calculate stats from an ms-type file.

    Parameters
    ----------
    positions : array, float
        matrix of mutational positions
    gtdict : defaultdict
        dictionary of genotypes per pop; gtdict['pop1'].append((gt_string))
    popsizelist : list, int
        haplotype sample size per population

    Returns
    -------
    tuple
        (garudH12, garudH1, garudH21, sfs_summ, hap_summ, derivedfreq)
    """
    hapconfig_common = []
    gH12 = []
    gH1 = []
    gH21 = []
    sfs = []
    hap = []
    dfreq = []
    popiix = []
    ix = 0
    pos = 0.5  # need -Sp 0.5 and -Smark
    for pop in popsizelist:
        popiix.append(range(ix, ix + pop))
        ix += pop
    for rep in range(len(gtdict.keys())):
        dfreqt = []
        sfst = []
        hapt = []
        gH12t = []
        gH1t = []
        gH21t = []
        for iix in popiix:
            gtarray = gtdict[str(rep + 1)][iix]
            sdpop1 = simData()
            gtpop1 = [''.join(str(n) for n in y) for y in gtarray]
            sdpop1.assign_sep(positions[rep], gtpop1)
            # pspop1 = polySIM(sdpop1)
            # stats
            garudStats_t = garudStats(sdpop1)  # garud 2015
            gH12t.append(garudStats_t['H12'])
            gH1t.append(garudStats_t['H1'])
            gH21t.append(garudStats_t['H2H1'])
            # lhaf_t = lhaf(sdpop1, 1)  # 1-HAF is most common; Ronen 2016
            # sfs
            sfsarray, derived = site_freqspec_fx(gtarray, pos)
            sfst.append(sfsarray)
            # hapconfig
            hapt.append(haploconfig_fx(gtarray))
            # derived
            dfreqt.append(derived)
        gH12.append(gH12t)
        gH1.append(gH1t)
        gH21.append(gH21t)
        sfs.append(sfst)
        hap.append(hapt)
        dfreq.append(dfreqt)
    garudH12 = [np.mean(pop) for pop in zip(*gH12)]
    garudH1 = [np.mean(pop) for pop in zip(*gH1)]
    garudH21 = [np.mean(pop) for pop in zip(*gH21)]
    sfs_summ = [np.mean(pop, axis=0) / (popsizelist[i])
                for i, pop in enumerate(zip(*sfs))]
    hap_summ = [np.mean(pop, axis=0) for i, pop in enumerate(zip(*hap))]
    haparray = [np.vstack(i) for i in zip(*hap)]
    for config in haparray:
        uniq, hapfreq = hapconfigfx(config)
        hapconfig_common.append(list(zip(uniq, hapfreq)))
    derivedfreq = [np.mean(pop) for pop in zip(*dfreq)]
    # jointsfs_fx(gtdict, popiix)
    return (garudH12, garudH1, garudH21, sfs_summ, hap_summ, derivedfreq)
def hapbaxVmig_stats(gtdict, posdict, demesizelist, sp, origcount, sel, mig):
    """Calculate the haplotype diversity of haplotypes carrying the resistant
    allele. Also: number of origins, total haplotype diversity, and the
    resistant haplotype config.
    """
    pdist = []
    hapbax = []
    for rep in range(len(gtdict.keys())):
        rep = str(rep)
        smark = np.where(posdict[rep] == sp)[0]
        if len(smark) > 1:
            print("\nSkipping rep {}, smark gt 1\n".format(rep))
            continue
        piix = gtdict[rep][0:demesizelist[0]]  # index for the first pop
        riix = np.where(piix[:, smark] > 0)[0]  # location of the selected
        if riix.any():
            hapr = gtdict[rep][riix]
            uniqhaps_IBS = np.array(
                [np.array(x) for x in set(tuple(x) for x in hapr)])
            hapfreq_IBS = np.array(
                [len(hapr[np.all(hapr == x, axis=1)]) for x in uniqhaps_IBS],
                dtype=int)
            uallel = hapr[:][:, smark]
            puallel = gtdict[rep][:, smark]
            hapr[:, smark] = 1  # sites that are IBS are read as uniqhaps
            # change all origins to 1 even if >1
            uniqhaps = np.array(
                [np.array(x) for x in set(tuple(x) for x in hapr)])
            hapfreq = np.array(
                [len(hapr[np.all(hapr == x, axis=1)]) for x in uniqhaps],
                dtype=int)
            print("\n#rep {}".format(rep))
            print("\nsel: {}\nmig: {}".format(sel, mig))
            print("#number of ORIGINS: {}".format(origcount[int(rep)]))
            print("#number of unique resistant ALLELES across ALL pops: {}".
                  format(len(np.unique(puallel[puallel > 0]))))
            print("#number of unique resistant ALLELES in SAMPLE pop: {}".
                  format(len(np.unique(uallel[uallel > 0]))))
            if uniqhaps_IBS.shape[0] > uniqhaps.shape[0]:
                print("#number of resistant HAPLOTYPES (IBS) in SAMPLE pop: "
                      "{}, frequencies {}".format(len(uniqhaps_IBS),
                                                  hapfreq_IBS))
                print("#number of hidden resistant HAPLOTYPES (IBS): {}".
                      format(uniqhaps_IBS.shape[0] - uniqhaps.shape[0]))
            print("#number of observable resistant HAPLOTYPES in SAMPLE pop: "
                  "{}, frequencies {}".format(len(uniqhaps), hapfreq))
            hapbax.append(len(uniqhaps))
            # full hap config
            n = sum(hapfreq)
            C_freq, C_count = np.unique(hapfreq, return_counts=True)
            C = np.zeros(piix.shape[0])
            C[C_freq - 1] = C_count
            # haplotype diversity
            Hd = 1 - sum([(((i + 1) / float(n))**2) * c
                          for i, c in enumerate(C)])
            M = max(np.nonzero(C)[0]) + 1  # greatest non-zero position
            K = sum(C)  # number of haps
            # evenness from Chattopadhyay 2007
            lambda_e = sum([(float(hf) / sum(hapfreq))**2 for hf in hapfreq])
            Ds = 1.0 / lambda_e
            Ev = Ds / uniqhaps.shape[0]
            rAfreq = riix.shape[0] / float(piix.shape[0])
            com = "#stats_r: Hd:{}\tKhaps:{}\tMaxAbsFreq:{}\tEv:{}\trFreq:{}\n"
            print(com.format(Hd, K, M, Ev, rAfreq))
            # fill pdist with pairwise differences among resistant haplotypes
            pdist.extend([np.count_nonzero(a != b)
                          for i, a in enumerate(hapr)
                          for j, b in enumerate(hapr) if j > i])
            # popgen stats resistant
            gtpopr = [''.join(str(n) for n in y) for y in hapr]
            sdpopr = simData()
            sdpopr.assign_sep(posdict[rep], gtpopr)
            pspopr = polySIM(sdpopr)
            theta_r = pspopr.thetaw()
            tajd_r = pspopr.tajimasd()
            hprime_r = pspopr.hprime()
            pi_r = pspopr.thetapi()
            garudStats_r = garudStats(sdpopr)  # garud 2015
            # popgen stats all
            piix[:, smark] = 1
            gtpopa = [''.join(str(v) for v in y) for y in piix]
            sdpopa = simData()
            sdpopa.assign_sep(posdict[rep], gtpopa)
            pspopa = polySIM(sdpopa)
            theta_a = pspopa.thetaw()
            tajd_a = pspopa.tajimasd()
            hprime_a = pspopa.hprime()
            pi_a = pspopa.thetapi()
            # hapdiv_a = pspopa.hapdiv()
            # nhaps_a = pspopa.nhaps()
            garudStats_a = garudStats(sdpopa)  # garud 2015
            print("#popgen_r: theta_w:{}\ttheta_pi:{}\ttajD:{}\tfaywuH:{}".
                  format(theta_r, pi_r, tajd_r, hprime_r))
            print("#popgen_all: theta_w:{}\ttheta_pi:{}\ttajD:{}\tfaywuH:{}".
                  format(theta_a, pi_a, tajd_a, hprime_a))
            print("#garud2015_r: H12:{}\tH1:{}\tH2H1:{}".format(
                garudStats_r['H12'], garudStats_r['H1'],
                garudStats_r['H2H1']))
            print("#garud2015_all: H12:{}\tH1:{}\tH2H1:{}\n".format(
                garudStats_a['H12'], garudStats_a['H1'],
                garudStats_a['H2H1']))
        else:
            C = np.zeros(piix.shape[0])
    hapbaxm = np.mean(hapbax)
    hapbaxSE = np.std(hapbax) / len(hapbax)
    return (hapbaxm, hapbaxSE)
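# A small worked example of the haplotype-diversity (Hd) and evenness (Ev)
# quantities computed inside the function above. The toy haplotype
# frequencies and sample size below are assumptions for illustration only.
import numpy as np

hapfreq = np.array([5, 3, 1, 1])        # counts of each resistant haplotype
n = hapfreq.sum()
C = np.zeros(20)                        # haplotype count configuration
freqs, counts = np.unique(hapfreq, return_counts=True)
C[freqs - 1] = counts                   # C[i-1] = number of haplotypes seen i times
Hd = 1 - sum((((i + 1) / float(n)) ** 2) * c for i, c in enumerate(C))
lambda_e = sum((hf / float(n)) ** 2 for hf in hapfreq)
Ev = (1.0 / lambda_e) / len(hapfreq)    # evenness sensu Chattopadhyay 2007
print(Hd, Ev)                           # 0.64 and ~0.694 for these toy counts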
def hapmigVsel_stats(gtdict, posdict, demesizelist, sp, origcount, sel, mig):
    """Calculate the haplotype diversity of haplotypes carrying the resistant
    allele. Also: number of origins, total haplotype diversity, and the
    resistant haplotype config.
    """
    rfreq = []  # frequency of resistant allele
    Rplot = np.array([], dtype=np.int64).reshape(0, demesizelist[0])
    Splot = []
    pdist = []
    for rep in range(len(gtdict.keys())):
        rep = str(rep)
        smark = np.where(posdict[rep] == sp)[0]
        if len(smark) > 1:
            print("\nSkipping rep {}, smark gt 1\n".format(rep))
            continue
        piix = gtdict[rep][0:demesizelist[0]]  # index for the first pop
        riix = np.where(piix[:, smark] > 0)[0]  # location of the selected
        if riix.any():
            hapr = gtdict[rep][riix]
            uniqhaps_IBS = np.array(
                [np.array(x) for x in set(tuple(x) for x in hapr)])
            hapfreq_IBS = np.array(
                [len(hapr[np.all(hapr == x, axis=1)]) for x in uniqhaps_IBS],
                dtype=int)
            uallel = hapr[:][:, smark]
            puallel = gtdict[rep][:, smark]
            hapr[:, smark] = 1  # sites that are IBS are read as uniqhaps
            # change all origins to 1 even if >1
            uniqhaps = np.array(
                [np.array(x) for x in set(tuple(x) for x in hapr)])
            hapfreq = np.array(
                [len(hapr[np.all(hapr == x, axis=1)]) for x in uniqhaps],
                dtype=int)
            print("\n#rep {}".format(rep))
            print("\nsel: {}\nmig: {}".format(sel, mig))
            print("#number of ORIGINS: {}".format(origcount[int(rep)]))
            print("#number of unique resistant ALLELES across ALL pops: {}".
                  format(len(np.unique(puallel[puallel > 0]))))
            print("#number of unique resistant ALLELES in SAMPLE pop: {}".
                  format(len(np.unique(uallel[uallel > 0]))))
            if uniqhaps_IBS.shape[0] > uniqhaps.shape[0]:
                print("#number of resistant HAPLOTYPES (IBS) in SAMPLE pop: "
                      "{}, frequencies {}".format(len(uniqhaps_IBS),
                                                  hapfreq_IBS))
                print("#number of hidden resistant HAPLOTYPES (IBS): {}".
                      format(uniqhaps_IBS.shape[0] - uniqhaps.shape[0]))
            print("#number of observable resistant HAPLOTYPES in SAMPLE pop: "
                  "{}, frequencies {}".format(len(uniqhaps), hapfreq))
            # full hap config
            n = sum(hapfreq)
            C_freq, C_count = np.unique(hapfreq, return_counts=True)
            C = np.zeros(piix.shape[0])
            C[C_freq - 1] = C_count
            # haplotype diversity
            Hd = 1 - sum([(((i + 1) / float(n))**2) * c
                          for i, c in enumerate(C)])
            M = max(np.nonzero(C)[0]) + 1  # greatest non-zero position
            K = sum(C)  # number of haps
            # evenness from Chattopadhyay 2007
            lambda_e = sum([(float(hf) / sum(hapfreq))**2 for hf in hapfreq])
            Ds = 1.0 / lambda_e
            Ev = Ds / uniqhaps.shape[0]
            print("#stats_r: Hd:{}\tKhaps:{}\tMaxAbsFreq:{}\tEv:{}\n".format(
                Hd, K, M, Ev))
            # fill pdist with pairwise differences among resistant haplotypes
            pdist.extend([np.count_nonzero(a != b)
                          for i, a in enumerate(hapr)
                          for j, b in enumerate(hapr) if j > i])
            # popgen stats resistant
            gtpopr = [''.join(str(n) for n in y) for y in hapr]
            sdpopr = simData()
            sdpopr.assign_sep(posdict[rep], gtpopr)
            pspopr = polySIM(sdpopr)
            theta_r = pspopr.thetaw()
            tajd_r = pspopr.tajimasd()
            hprime_r = pspopr.hprime()
            pi_r = pspopr.thetapi()
            garudStats_r = garudStats(sdpopr)  # garud 2015
            # popgen stats all
            piix[:, smark] = 1
            gtpopa = [''.join(str(n) for n in y) for y in piix]
            sdpopa = simData()
            sdpopa.assign_sep(posdict[rep], gtpopa)
            pspopa = polySIM(sdpopa)
            theta_a = pspopa.thetaw()
            tajd_a = pspopa.tajimasd()
            hprime_a = pspopa.hprime()
            pi_a = pspopa.thetapi()
            # hapdiv_a = pspopa.hapdiv()
            # nhaps_a = pspopa.nhaps()
            garudStats_a = garudStats(sdpopa)  # garud 2015
            print("#popgen_r: theta_w:{}\ttheta_pi:{}\ttajD:{}\tfaywuH:{}".
                  format(theta_r, pi_r, tajd_r, hprime_r))
            print("#popgen_all: theta_w:{}\ttheta_pi:{}\ttajD:{}\tfaywuH:{}".
                  format(theta_a, pi_a, tajd_a, hprime_a))
            print("#garud2015_r: H12:{}\tH1:{}\tH2H1:{}".format(
                garudStats_r['H12'], garudStats_r['H1'],
                garudStats_r['H2H1']))
            print("#garud2015_all: H12:{}\tH1:{}\tH2H1:{}\n".format(
                garudStats_a['H12'], garudStats_a['H1'],
                garudStats_a['H2H1']))
        else:
            C = np.zeros(piix.shape[0])
        Rplot = np.vstack((Rplot, C))
        Splot.append(piix.shape[0] - riix.shape[0])
        rfreq.append(riix.shape[0] / float(piix.shape[0]))
    # plot of singletons, doubletons ...
    Piplot = np.append(np.sum(Splot), np.sum(Rplot, axis=0))
    Rfreq = np.repeat(np.mean(rfreq), len(Piplot))
    # for pi plot
    pichart = [(np.nonzero(Rplot[sar])[0][::-1] + 1)
               for sar in range(Rplot.shape[0]) if Rplot[sar].size]
    # itertools.zip_longest is the Python 3 name (izip_longest in Python 2)
    avgresist = [np.average(i)
                 for i in itertools.zip_longest(*pichart, fillvalue=0)]
    rstr = ["R" + str(nx) for nx in range(0, len(avgresist) + 1)]
    rarray = np.array(avgresist)
    rarray = np.insert(rarray, 0, demesizelist[0] - sum(avgresist))
    # pairwise diff
    p_dist = np.unique(pdist, return_counts=True)
    return (Piplot, Rfreq, rarray, rstr, p_dist)
print(BigTable)


# ## Summary statistics from samples
#
# We will use the [pylibseq](http://molpopgen.github.io/pylibseq/) package to
# calculate summary statistics.  pylibseq is a Python wrapper around
# [libsequence](http://molpopgen.github.io/libsequence/).

# In[15]:

import libsequence.polytable as polyt
import libsequence.summstats as sstats

# Convert neutral mutations into libsequence "SimData" objects,
# which are intended to handle binary (0/1) data like
# what comes out of these simulations
n = [polyt.simData(i[0]) for i in samples]

# Create "factories" for calculating the summary stats
an = [sstats.polySIM(i) for i in n]

## Collect a bunch of summary stats into a pandas.DataFrame:
NeutralMutStats = pandas.DataFrame([
    {'thetapi': i.thetapi(), 'npoly': i.numpoly(), 'thetaw': i.thetaw()}
    for i in an
])

NeutralMutStats


# ### The average $\pi$ under the model
#
# Under the BGS model, the expectation of $\pi$ is
# $E[\pi]=\pi_0e^{-\frac{U}{2sh+r}},$ where $U$ is the mutation rate to
# strongly-deleterious variants, $\pi_0$ is the value expected in the absence
# of BGS (_i.e._ $\pi_0 = \theta = 4N_e\mu$), $s$ and $h$ are the selection
# and dominance coefficients, and $r$ is the recombination rate.
#
# Note that the definition of $U$ is _per diploid_, meaning twice the
# per-gamete rate.  (See Hudson and Kaplan (1995) PMC1206891 for details.)
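# As a quick numerical sketch of the formula above, $E[\pi]$ can be computed
# directly.  The parameter values below are illustrative assumptions, not
# taken from this notebook.

import numpy as np

pi0 = 100.0      # assumed pi_0 = 4*N_e*mu in the absence of BGS
U = 0.04         # assumed deleterious mutation rate, per DIPLOID (2x per gamete)
s, h = 0.1, 0.5  # assumed selection and dominance coefficients
r = 0.01         # assumed recombination rate

expected_pi = pi0 * np.exp(-U / (2.0 * s * h + r))
print(expected_pi)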
# In[30]:

for i in samples:
    windows = []
    start = 0
    while start < 3:
        ## We will only look at neutral mutations, which are element 0 of each sample
        window = [j[0] for j in i[0] if (j[0] >= start and j[0] < start + 0.1)]
        windows.append(window)
        start += 0.1
    ## We now have a full set of windows that we can do something with
    print(len(windows))  ## There should be 30, and many will be empty


# ### Using [pylibseq](https://github.com/molpopgen/pylibseq)

# In[31]:

from libsequence.windows import Windows
from libsequence.polytable import simData

for i in samples:
    ## We need to convert our list of tuples
    ## into types that pylibseq/libsequence understand:
    windows = Windows(simData(i[0]), 0.1, 0.1, 0, 3)
    ## Now, you can analyze the windows, etc.
    print(len(windows))


# Well, the pylibseq version is clearly more compact.  Of course, you
# can/should abstract the pure Python version into a standalone function.
#
# Why would you ever use the manual version?  It can save you memory.  The
# pylibseq version constructs an iterable list of windows, meaning that there
# is an object allocated for each window.  For the manual version above, we
# grew a list of objects, but we could just as easily have processed them and
# let them go out of scope.
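# To make the memory point above concrete, here is a hedged sketch of the
# manual approach written as a generator (assuming `samples` has the same
# structure as above), so each window can be processed and then go out of
# scope instead of being stored in a list:

def neutral_windows(sample, window_size=0.1, step=0.1, end=3.0):
    """Yield windows of neutral mutation positions one at a time."""
    start = 0.0
    while start < end:
        # element 0 of each sample holds the neutral mutations as (pos, ...) tuples
        yield [j[0] for j in sample[0] if start <= j[0] < start + window_size]
        start += step

for i in samples:
    for w in neutral_windows(i):
        pass  # summarize each window here; it is freed after the iteration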