def __call__(self, pop):
    """
    Record windowed summary statistics for ``pop``.

    Statistics are taken every 50th invocation and, additionally,
    whenever ``pop.generation`` equals ``self.final``.  Each recorded
    row is the scaled generation stamp followed by one value per window.
    """
    if self.counter % 50 == 0 or pop.generation == self.final:
        # Generation expressed in units of Nstart, relative to set_gen.
        actual_gen = np.around(
            [(pop.generation - self.set_gen) / self.Nstart], decimals=3)
        # Sample a fixed number of chromosomes from the population.
        chr_sampled = 400
        samp = fp11.sampling.sample_separate(
            self.__rng, pop, chr_sampled, True)
        # Neutral variants are element 0 of the sample.
        neutral_sample = polyt.SimData(
            [str2byte(mut, 'utf-8') for mut in samp[0]])
        # Non-overlapping windows covering [beginning, nwindows).
        w = Windows(neutral_sample,
                    window_size=1 / self.val_per_window,
                    step_len=1 / self.val_per_window,
                    starting_pos=self.beginning,
                    ending_pos=self.nwindows)
        # One PolySIM per window, reused for all three statistics.
        per_window = [PolySIM(w[idx]) for idx in range(len(w))]
        window_pi = np.around(
            [ps.thetapi() for ps in per_window], decimals=3)
        window_singleton = np.around(
            [ps.numsingletons() for ps in per_window], decimals=3)
        window_tajimasD = np.around(
            [ps.tajimasd() for ps in per_window], decimals=3)
        # Prepend the generation stamp and store each row.
        self.pi.append(np.append(actual_gen, window_pi))
        self.singleton.append(np.append(actual_gen, window_singleton))
        self.tajimasD.append(np.append(actual_gen, window_tajimasD))
    self.counter += 1
def get_summstats(pop, repid, nsam, temp):
    """
    The 'genome scan' bit.

    Samples ``nsam`` diploids from ``pop`` and, for each 1.0-wide window
    in each locus, computes Tajima's D, H', theta-pi, Garud's H
    statistics, and the mean and max of bin-normalized nSL scores.
    Rows are staged in the pre-allocated structured array ``temp`` (one
    row per window) and concatenated onto the return value per locus.

    :param pop: simulated population (provides N, generation,
        locus_boundaries) -- project type, assumed fwdpy11-like.
    :param repid: replicate ID recorded in every output row.
    :param nsam: number of diploids to sample (2*nsam haplotypes).
    :param temp: pre-allocated structured array whose dtype defines the
        output record layout.
    :return: structured numpy array, one record per window per locus.
    """
    ind = np.random.choice(pop.N, nsam, replace=False)
    s = sample_separate(pop, ind)
    rv = np.array([], dtype=temp.dtype)
    for locus_index, (si, bi) in enumerate(zip(s, pop.locus_boundaries)):
        neut, sel = si
        # Pool neutral + selected variants, sorted by position.
        neut.extend(sel)
        neut = sorted(neut, key=lambda x: x[0])
        sd = SimData(neut)
        w = Windows(sd, 1.0, 1.0, bi[0], bi[1])
        for i in range(len(w)):
            ps = PolySIM(w[i])
            gs = garudStats(w[i])
            # Keep only finite nSL scores where the derived allele is
            # present more than 3 times in the sample.
            raw_nSL = [x for x in nSLiHS(w[i])
                       if np.isfinite(x[0]) and x[2] > 3]
            nSL = np.array(raw_nSL)
            mean_nSL = np.nan
            max_nSL = np.nan
            if len(nSL) > 0:
                mean_nSL = nSL[:, 0].mean()
                # Normalize within derived-allele-count bins of width 5.
                # Was `2 * args.nsam` (module-level global); use the
                # nsam parameter so the function is self-contained.
                bins = np.digitize(nSL[:, 2], np.arange(0, 2 * nsam, 5))
                for b in set(bins):
                    binscores = nSL[:, 0][np.where(bins == b)[0]]
                    if len(binscores) > 1:
                        bmean = binscores.mean()
                        # Renamed from `sd`, which shadowed the SimData
                        # object above.
                        bsd = binscores.std()
                        if np.isfinite(bsd) and bsd > 0.0:
                            binscores = (binscores - bmean) / bsd
                            # Largest-|z| score in this bin.
                            bmax = binscores[np.where(
                                np.abs(binscores) ==
                                max(np.abs(binscores)))[0]][0]
                            if np.isnan(max_nSL) or \
                                    np.abs(bmax) > np.abs(max_nSL):
                                max_nSL = bmax
            temp[i] = np.array(
                [(locus_index, int(i), repid, pop.generation,
                  ps.tajimasd(), ps.hprime(), ps.thetapi(),
                  gs['H1'], gs['H12'], gs['H2H1'],
                  mean_nSL, max_nSL)],
                dtype=temp.dtype)
        rv = np.concatenate([rv, temp.copy()])
    return rv
def get_outlier_nSL(pop, repid, nsam):
    """
    The 'genome scan' bit.

    For each 1.0-wide window in each locus, computes the proportion of
    bin-normalized nSL z-scores with |z| >= 2 and |z| >= 3 among
    neutral variants, returning one Datum per window.

    :param pop: simulated population (provides N, generation,
        locus_boundaries) -- project type, assumed fwdpy11-like.
    :param repid: replicate ID recorded in every Datum.
    :param nsam: number of diploids to sample (2*nsam haplotypes).
    :return: list of Datum records.
    """
    ind = np.random.choice(pop.N, nsam, replace=False)
    s = sample_separate(pop, ind)
    rv = []
    for locus_index, (si, bi) in enumerate(zip(s, pop.locus_boundaries)):
        neut, sel = si
        # NOTE: unlike get_summstats, selected variants are deliberately
        # excluded here (the extend/sort lines were commented out in the
        # original):
        ## neut.extend([i for i in sel])
        ## neut = sorted(neut, key=lambda x: x[0])
        sd = SimData(neut)
        w = Windows(sd, 1.0, 1.0, bi[0], bi[1])
        for i in range(len(w)):
            # Removed unused `ps = PolySIM(w[i])` local.
            # Keep only finite nSL scores where the derived allele is
            # present more than 3 times in the sample.
            raw_nSL = [X for X in nSLiHS(w[i])
                       if np.isfinite(X[0]) and X[2] > 3]
            nSL = np.array(raw_nSL)
            nbig2 = 0
            nbig3 = 0
            nscores = 0
            if len(nSL) > 0:
                # Normalize within derived-allele-count bins of width 5.
                # Was `2 * args.nsam` (module-level global); use the
                # nsam parameter so the function is self-contained.
                bins = np.digitize(nSL[:, 2], np.arange(0, 2 * nsam, 5))
                for b in set(bins):
                    binscores = nSL[:, 0][np.where(bins == b)[0]]
                    if len(binscores) > 1:
                        bmean = binscores.mean()
                        # Renamed from `sd`, which shadowed the SimData
                        # object above.
                        bsd = binscores.std()
                        if bsd > 0.0 and np.isfinite(bsd):
                            binscores = (binscores - bmean) / bsd
                            nscores += len(binscores)
                            nbig2 += len(
                                np.where(np.abs(binscores) >= 2.0)[0])
                            nbig3 += len(
                                np.where(np.abs(binscores) >= 3.0)[0])
            if nscores > 0:
                rv.append(Datum(repid, pop.generation, locus_index, i,
                                nbig2 / nscores, nbig3 / nscores))
            else:
                rv.append(Datum(repid, pop.generation, locus_index, i,
                                0, 0))
    return rv
def get_omega_max_per_window(pop, repid, nsam):
    """
    The 'genome scan' bit.

    Computes omega_max in every 1.0-wide window of every locus for a
    random sample of nsam diploids, returning one Datum per window.
    """
    chosen = np.random.choice(pop.N, nsam, replace=False)
    per_locus = sample_separate(pop, chosen)
    out = []
    for locus_idx, (pair, bounds) in enumerate(
            zip(per_locus, pop.locus_boundaries)):
        neutral, selected = pair
        # Pool neutral + selected variants, sorted by position.
        neutral.extend(list(selected))
        combined = sorted(neutral, key=lambda rec: rec[0])
        windows = Windows(SimData(combined), 1.0, 1.0,
                          bounds[0], bounds[1])
        for win_idx in range(len(windows)):
            om = omega_max(windows[win_idx])
            out.append(Datum(repid, pop.generation,
                             locus_idx, win_idx, om[0]))
    return out
def get_summstats(pop, nsam, generation):
    """
    The 'genome scan' bit.

    For each 1.0-wide window of each locus, records Tajima's D, H',
    and Garud's H1/H12/H2H1 as a GenomeScanDataPoint.
    """
    chosen = np.random.choice(pop.N, nsam, replace=False)
    per_locus = sample_separate(pop, chosen)
    results = []
    for locus, (pair, bounds) in enumerate(
            zip(per_locus, pop.locus_boundaries)):
        neutral, selected = pair
        # Pool neutral + selected variants, sorted by position.
        neutral.extend(list(selected))
        merged = sorted(neutral, key=lambda rec: rec[0])
        windows = Windows(SimData(merged), 1.0, 1.0,
                          bounds[0], bounds[1])
        for win_idx in range(len(windows)):
            stats = PolySIM(windows[win_idx])
            hstats = garudStats(windows[win_idx])
            results.append(
                GenomeScanDataPoint(generation, locus, win_idx,
                                    stats.tajimasd(), stats.hprime(),
                                    hstats['H1'], hstats['H12'],
                                    hstats['H2H1']))
    return results
def get_summstats(pop, repid, nsam):
    """
    The 'genome scan' bit.

    Normalized mean nSL per interior window.  The procedure is:
      1. Get raw nSL for all windows at all loci.
      2. For each locus, pool values from the first/last window to
         build the normalization reference (mean/sd per DAF bin).
      3. For all other windows per locus, report the mean z-score.

    :param pop: simulated population (provides N, generation,
        locus_boundaries) -- project type, assumed fwdpy11-like.
    :param repid: replicate ID recorded in every DataRecord.
    :param nsam: number of diploids to sample (2*nsam haplotypes).
    :return: list of DataRecord, one per interior window with data.
    """
    ind = np.random.choice(pop.N, nsam, replace=False)
    s = sample_separate(pop, ind)
    reference_daf = []
    reference_values = []
    staged = []
    for locus, (si, bi) in enumerate(zip(s, pop.locus_boundaries)):
        neut, sel = si
        # Pool neutral + selected variants, sorted by position.
        neut.extend(sel)
        neut = sorted(neut, key=lambda x: x[0])
        sd = SimData(neut)
        # Was `2 * args.nsam` (module-level global); use the nsam
        # parameter so the function is self-contained.
        assert sd.size() == 2 * nsam, "sample size error"
        w = Windows(sd, 1.0, 1.0, bi[0], bi[1])
        for i in range(len(w)):
            # Filter out non-finite values and values where the derived
            # allele is present fewer than 4 times.  (Comprehension
            # variable renamed from `i`, which shadowed the window
            # index.)
            raw_nSL = [x for x in nSLiHS(w[i])
                       if np.isfinite(x[0]) and x[2] > 3]
            nSL = np.array(raw_nSL)
            if len(nSL) > 0:
                if i == 0 or i == len(w) - 1:
                    # Edge windows define the normalization reference.
                    reference_values.extend(nSL[:, 0].tolist())
                    reference_daf.extend(nSL[:, 2].tolist())
                else:
                    staged.append(TempRecord(locus, i, nSL))
    # Bin the reference data by derived allele count (bins of width 10).
    rdaf = np.array(reference_daf)
    rdaf_bins = np.digitize(rdaf, np.arange(0, 2 * nsam, 10))
    rstats = np.array(reference_values)
    mean_sd = {}
    for b in set(rdaf_bins):
        # Index array renamed from `w`, which shadowed the Windows
        # object above.
        idx = np.where(rdaf_bins == b)[0]
        if len(idx) > 0:
            m = rstats[idx].mean()
            sdev = rstats[idx].std()
            if np.isfinite(sdev) and sdev > 0.0:
                mean_sd[b] = (m, sdev)
    rv = []
    # Package up the data: z-score each staged window against the
    # matching reference bins.
    for t in staged:
        tb = np.digitize(t.values[:, 2], np.arange(0, 2 * nsam, 10))
        zscores_win = np.array([])
        for b in set(tb):
            hits = np.where(tb == b)[0]
            if b in mean_sd:
                m, sdev = mean_sd[b]
                zscores = (t.values[:, 0][hits] - m) / sdev
                zscores_win = np.concatenate((zscores_win, zscores))
        # Guard: if no bin matched the reference, report NaN explicitly
        # (same value as before) without numpy's mean-of-empty-slice
        # RuntimeWarning.
        mz = zscores_win.mean() if zscores_win.size > 0 else np.nan
        rv.append(DataRecord(pop.generation, repid, t.locus,
                             t.window, mz))
    return rv
# In[30]: for i in samples: windows = [] start = 0 while start < 3: ##We will only look at neutral mutations, which are element 0 of each sampl window = [j[0] for j in i[0] if (j[0] >= start and j[0] < start + 0.1)] windows.append(window) start += 0.1 ##We now have a full set of windows that we can do something with print(len(windows)) ##There should be 30, and many will be empy # ### Using [pylibseq](https://github.com/molpopgen/pylibseq) # In[31]: from libsequence.windows import Windows from libsequence.polytable import simData for i in samples: ##We need to convert our list of tuples ##into types that pylibseq/libsequence understand: windows = Windows(simData(i[0]), 0.1, 0.1, 0, 3) ##Now, you can analyze the windows, etc. print(len(windows)) # Well, the pylibseq version is clearly more compact. Of course, you can/should abstract the pure Python version into a standalone function. # # Why would you ever use the manual version? It can save you memory. The pylibseq version constructs an iterable list of windows, meaning that there is an object allocated for each window. For the manual version above, we grew a list of objects, but we could just have easily processed them and let them go out of scope.