def process_replicate(argtuple):
    """
    Compute summary statistics for every population pickled in one member
    of a tar archive.

    :param argtuple: A ``(seed, args, infile)`` tuple.  ``seed`` seeds
        numpy's global random number generator, ``args`` must provide
        ``tarfile`` (path of the archive) and ``nsam`` (number of
        individuals to sample), and ``infile`` is the name of the
        lzma-compressed archive member to process.

    :return: A list of dicts with the keys ``rep``, ``locus``,
        ``generation``, ``stat`` and ``value``: one dict per statistic,
        per analysed locus, per pickled ``(rep, pop)`` record.

    The member is extracted relative to the current working directory and
    is deleted again once reading has finished, even if processing fails.
    """
    seed, args, infile = argtuple
    np.random.seed(seed)

    # Extract the member, then release the archive handle right away.
    with tarfile.open(args.tarfile, 'r') as tf:
        tf.extract(tf.getmember(infile))

    rv = []
    statnames = ['thetapi', 'tajimasd', 'hprime']
    try:
        with lzma.open(infile, 'rb') as f:
            while True:
                # NOTE: pickle.load can execute arbitrary code, so only
                # archives written by trusted simulation runs should be
                # processed here.
                try:
                    rep, pop = pickle.load(f)
                except EOFError:
                    # No more records in the file.  Any other error is a
                    # real failure and is allowed to propagate.
                    break
                ind = np.random.choice(pop.N, args.nsam, replace=False)
                s = sample_separate(pop, ind)
                # Only loci 0, 5, and len(s)-1 can have
                # any neutral variants
                for locus in [0, 5, len(s) - 1]:
                    ps = PolySIM(SimData(s[locus][0]))
                    for stat in statnames:
                        rv.append({
                            'rep': rep,
                            'locus': locus,
                            'generation': pop.generation,
                            'stat': stat,
                            'value': getattr(ps, stat)(),
                        })
    finally:
        # Never leave the extracted copy behind.
        os.remove(infile)
    return rv
def get_summstats(pop, repid, nsam, temp):
    """
    The 'genome scan' bit.

    Sample ``nsam`` individuals from ``pop`` without replacement and
    compute summary statistics for every window of every locus.

    :param pop: The population.  ``pop.N``, ``pop.generation`` and
        ``pop.locus_boundaries`` are read here.
    :param repid: Replicate identifier, stored in every record.
    :param nsam: Number of individuals to sample.  The frequency bins used
        to standardize nSL span ``[0, 2 * nsam)`` in steps of 5.
    :param temp: Structured numpy array used as scratch space.  Element
        ``i`` is overwritten with the record for window ``i``, and the
        array's dtype defines the layout of the output.
        NOTE(review): presumably this has exactly one element per
        window -- confirm against the caller.

    :return: A numpy array with ``temp``'s dtype, made by appending a copy
        of the whole of ``temp`` once per locus.  The fields are filled,
        in order, with: locus, window, repid, generation, Tajima's D,
        H', theta pi, H1, H12, H2H1, mean nSL and the standardized nSL
        with the largest absolute value (NaN if none could be computed).
    """
    ind = np.random.choice(pop.N, nsam, replace=False)
    s = sample_separate(pop, ind)
    rv = np.array([], dtype=temp.dtype)
    # The bin edges depend only on the sample size, so build them once.
    bin_edges = np.arange(0, 2 * nsam, 5)
    for locus, (si, bi) in enumerate(zip(s, pop.locus_boundaries)):
        neut, sel = si
        # Merge the selected variants into the neutral ones and order
        # the result by the first field of each entry.
        neut.extend(sel)
        neut = sorted(neut, key=lambda x: x[0])
        w = Windows(SimData(neut), 1.0, 1.0, bi[0], bi[1])
        for i in range(len(w)):
            ps = PolySIM(w[i])
            gs = garudStats(w[i])
            # Keep only entries whose first field (the score) is finite
            # and whose third field exceeds 3.
            # NOTE(review): the third field is presumably the derived
            # allele count -- confirm against nSLiHS.
            nSL = np.array([
                x for x in nSLiHS(w[i]) if np.isfinite(x[0]) and x[2] > 3
            ])
            mean_nSL = np.nan
            max_nSL = np.nan
            if len(nSL) > 0:
                scores = nSL[:, 0]
                mean_nSL = scores.mean()
                bins = np.digitize(nSL[:, 2], bin_edges)
                for b in set(bins):
                    binscores = scores[bins == b]
                    if len(binscores) < 2:
                        continue
                    bin_sd = binscores.std()
                    if not bin_sd > 0.0:
                        # A zero (or NaN) spread cannot be standardized.
                        continue
                    z = (binscores - binscores.mean()) / bin_sd
                    # First z-score in the bin with the largest magnitude.
                    bmax = z[np.argmax(np.abs(z))]
                    if np.isnan(max_nSL) or np.abs(bmax) > np.abs(max_nSL):
                        max_nSL = bmax
            temp[i] = np.array(
                [(locus, int(i), repid, pop.generation, ps.tajimasd(),
                  ps.hprime(), ps.thetapi(), gs['H1'], gs['H12'],
                  gs['H2H1'], mean_nSL, max_nSL)],
                dtype=temp.dtype)
        rv = np.concatenate([rv, temp.copy()])
    return rv
def get_outlier_nSL(pop, repid, nsam):
    """
    The 'genome scan' bit, counting outlier nSL scores.

    Sample ``nsam`` individuals from ``pop`` without replacement and, for
    every window of every locus, report the fraction of standardized nSL
    scores whose absolute value is at least 2 and at least 3.

    :param pop: The population.  ``pop.N``, ``pop.generation`` and
        ``pop.locus_boundaries`` are read here.
    :param repid: Replicate identifier, stored in every record.
    :param nsam: Number of individuals to sample.  The frequency bins used
        to standardize nSL span ``[0, 2 * nsam)`` in steps of 5.

    :return: A list of ``Datum(repid, generation, locus, window,
        fraction_ge_2, fraction_ge_3)``.  Both fractions are 0 for a
        window in which no score could be standardized.
    """
    ind = np.random.choice(pop.N, nsam, replace=False)
    s = sample_separate(pop, ind)
    rv = []
    # The bin edges depend only on the sample size, so build them once.
    bin_edges = np.arange(0, 2 * nsam, 5)
    for locus_index, (si, bi) in enumerate(zip(s, pop.locus_boundaries)):
        # Only the neutral variants are analysed here; the selected ones
        # are deliberately not merged in.
        neut, _ = si
        w = Windows(SimData(neut), 1.0, 1.0, bi[0], bi[1])
        for i in range(len(w)):
            # Keep only entries whose first field (the score) is finite
            # and whose third field exceeds 3.
            # NOTE(review): the third field is presumably the derived
            # allele count -- confirm against nSLiHS.
            nSL = np.array([
                x for x in nSLiHS(w[i]) if np.isfinite(x[0]) and x[2] > 3
            ])
            nbig2 = 0
            nbig3 = 0
            nscores = 0
            if len(nSL) > 0:
                scores = nSL[:, 0]
                bins = np.digitize(nSL[:, 2], bin_edges)
                for b in set(bins):
                    binscores = scores[bins == b]
                    if len(binscores) < 2:
                        continue
                    bin_sd = binscores.std()
                    if not (bin_sd > 0.0 and np.isfinite(bin_sd)):
                        # This bin cannot be standardized, so it does not
                        # contribute to the totals.
                        continue
                    z = np.abs((binscores - binscores.mean()) / bin_sd)
                    nscores += len(z)
                    nbig2 += int(np.count_nonzero(z >= 2.0))
                    nbig3 += int(np.count_nonzero(z >= 3.0))
            # NOTE(review): every window gets a record, including windows
            # with no usable scores -- confirm this matches the intended
            # nesting of the original.
            if nscores > 0:
                rv.append(
                    Datum(repid, pop.generation, locus_index, i,
                          nbig2 / nscores, nbig3 / nscores))
            else:
                rv.append(Datum(repid, pop.generation, locus_index, i, 0, 0))
    return rv
def get_omega_max_per_window(pop, repid, nsam):
    """
    The 'genome scan' bit, for omega_max.

    Draw ``nsam`` individuals from ``pop`` without replacement and call
    ``omega_max`` on every window of every locus.

    :param pop: The population.  ``pop.N``, ``pop.generation`` and
        ``pop.locus_boundaries`` are read here.
    :param repid: Replicate identifier, stored in every record.
    :param nsam: Number of individuals to sample.

    :return: A list of ``Datum(repid, generation, locus, window, value)``
        where ``value`` is the first element of what ``omega_max``
        returns for that window.
    """
    sampled = np.random.choice(pop.N, nsam, replace=False)
    per_locus = sample_separate(pop, sampled)
    records = []
    for locus_index, (variants, bounds) in enumerate(
            zip(per_locus, pop.locus_boundaries)):
        neutral, selected = variants
        # Pool the selected variants with the neutral ones, ordered by
        # the first field of each entry.
        neutral.extend(selected)
        pooled = sorted(neutral, key=lambda entry: entry[0])
        start, stop = bounds[0], bounds[1]
        windows = Windows(SimData(pooled), 1.0, 1.0, start, stop)
        nwindows = len(windows)
        for window_index in range(nwindows):
            result = omega_max(windows[window_index])
            records.append(
                Datum(repid, pop.generation, locus_index, window_index,
                      result[0]))
    return records
def get_summstats(pop, nsam, generation):
    """
    The 'genome scan' bit.

    Draw ``nsam`` individuals from ``pop`` without replacement and
    compute polymorphism and haplotype statistics for every window of
    every locus.

    :param pop: The population.  ``pop.N`` and ``pop.locus_boundaries``
        are read here.
    :param nsam: Number of individuals to sample.
    :param generation: Generation label stored in every record.

    :return: A list of ``GenomeScanDataPoint(generation, locus, window,
        tajimasd, hprime, H1, H12, H2H1)``.
    """
    sampled = np.random.choice(pop.N, nsam, replace=False)
    per_locus = sample_separate(pop, sampled)
    points = []
    for locus, (variants, bounds) in enumerate(
            zip(per_locus, pop.locus_boundaries)):
        neutral, selected = variants
        # Pool the selected variants with the neutral ones, ordered by
        # the first field of each entry.
        neutral.extend(selected)
        pooled = sorted(neutral, key=lambda entry: entry[0])
        start, stop = bounds[0], bounds[1]
        windows = Windows(SimData(pooled), 1.0, 1.0, start, stop)
        nwindows = len(windows)
        for window_index in range(nwindows):
            poly = PolySIM(windows[window_index])
            garud = garudStats(windows[window_index])
            point = GenomeScanDataPoint(
                generation, locus, window_index, poly.tajimasd(),
                poly.hprime(), garud['H1'], garud['H12'], garud['H2H1'])
            points.append(point)
    return points
def get_summstats(pop, repid, nsam):
    """
    The 'genome scan' bit, using nSL standardized against flanking windows.

    Sample ``nsam`` individuals from ``pop`` without replacement and
    compute, for every interior window, the mean z-score of its nSL
    values.  The first and last window of each locus supply the reference
    distribution; the reference values are pooled over all loci.

    :param pop: The population.  ``pop.N``, ``pop.generation`` and
        ``pop.locus_boundaries`` are read here.
    :param repid: Replicate identifier, stored in every record.
    :param nsam: Number of individuals to sample.  The sample is asserted
        to have size ``2 * nsam``, and the frequency bins span
        ``[0, 2 * nsam)`` in steps of 10.

    :return: A list of ``DataRecord(generation, repid, locus, window,
        mean_z)``, one per interior window with at least one usable nSL
        value.  ``mean_z`` is NaN when none of the window's frequency
        bins has a usable reference mean and standard deviation.
    """
    ind = np.random.choice(pop.N, nsam, replace=False)
    s = sample_separate(pop, ind)
    # The bin edges depend only on the sample size, so build them once.
    bin_edges = np.arange(0, 2 * nsam, 10)

    # The procedure here is:
    # 1. Get raw nSL for all windows at all loci
    # 2. Collect values from the first/last window of each locus
    #    to use for normalizing.
    # 3. For all other windows per locus, get mean z-score
    reference_daf = []
    reference_values = []
    temp = []
    for locus, (si, bi) in enumerate(zip(s, pop.locus_boundaries)):
        neut, sel = si
        # Merge the selected variants into the neutral ones and order
        # the result by the first field of each entry.
        neut.extend(sel)
        neut = sorted(neut, key=lambda x: x[0])
        sd = SimData(neut)
        assert sd.size() == 2 * nsam, "sample size error"
        w = Windows(sd, 1.0, 1.0, bi[0], bi[1])
        last_window = len(w) - 1
        for i in range(len(w)):
            # Keep only finite scores at sites whose third field (used
            # below as the derived allele frequency) exceeds 3.
            nSL = np.array([
                x for x in nSLiHS(w[i]) if np.isfinite(x[0]) and x[2] > 3
            ])
            if len(nSL) == 0:
                continue
            if i == 0 or i == last_window:
                reference_values.extend(nSL[:, 0].tolist())
                reference_daf.extend(nSL[:, 2].tolist())
            else:
                temp.append(TempRecord(locus, i, nSL))

    # bin the reference data
    rdaf_bins = np.digitize(np.array(reference_daf), bin_edges)
    rstats = np.array(reference_values)
    mean_sd = {}
    for b in set(rdaf_bins):
        in_bin = rstats[rdaf_bins == b]
        sdev = in_bin.std()
        # A bin is only usable for normalizing if its spread is a
        # finite, positive number.
        if np.isfinite(sdev) and sdev > 0.0:
            mean_sd[b] = (in_bin.mean(), sdev)

    # package up the data
    rv = []
    for t in temp:
        tb = np.digitize(t.values[:, 2], bin_edges)
        zscores_win = np.array([])
        for b in set(tb):
            if b not in mean_sd:
                # No usable reference for this frequency bin.
                continue
            m, sdev = mean_sd[b]
            zscores = (t.values[:, 0][tb == b] - m) / sdev
            zscores_win = np.concatenate((zscores_win, zscores))
        # Report NaN explicitly instead of taking the mean of an empty
        # array, which gives the same value but emits a RuntimeWarning.
        mz = zscores_win.mean() if zscores_win.size > 0 else np.nan
        rv.append(DataRecord(pop.generation, repid, t.locus, t.window, mz))
    return rv