def run_simstats(ms_files, msexe, outpath, nprocs):
    """Calculate summary statistics from existing ms-formatted output files.

    Parameters
    ----------
    ms_files : List
        paths of ms-formatted output files to read
    msexe : str
        path to the ms executable that produced the files
    outpath : Path
        base path used to name the stats output file
    nprocs : int
        how many processors to run with MP

    Returns
    -------
    None.

    """
    global header_len
    global header
    # read in all the files; stats_dt is the module-level dict populated by
    # read_config_stats
    length_bp = stats_dt["length_bp"]
    nhaps = stats_dt["num_haps"]
    ms_dict = read_ms(ms_files, msexe, nhaps, length_bp)
    sim_number = len(ms_dict)
    # write headers
    outfile = outpath.parent / f"{outpath.stem}.pop_stats.txt"
    pops_outfile = open(outfile, 'w')
    pops_outfile, header_, header_ls = headers(pops_outfile, stats_dt)
    header_len = header_
    header = header_ls
    if nprocs == 1:
        for ms in tqdm(ms_dict.values()):
            pop_stats_arr = calc_simstats(ms)
            pops_outfile = stats_out(pop_stats_arr, pops_outfile, nprocs)
        pops_outfile.close()
    else:
        # chunk into rounds of nprocs * 10 jobs and multiprocess each round
        nk = nprocs * 10
        ms_vals = list(ms_dict.values())
        chunk_list = [ms_vals[i:i + nk] for i in range(0, len(ms_vals), nk)]
        chunksize = ceil(nk / nprocs)
        pool = multiprocessing.Pool(nprocs)
        for i, args in enumerate(chunk_list):
            pop_stats_arr = pool.map(calc_simstats, args, chunksize=chunksize)
            pops_outfile = stats_out(pop_stats_arr, pops_outfile, nprocs)
            print(i)  # chunk progress
        pool.close()
        pops_outfile.close()
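
# Example usage of run_simstats (a minimal sketch; the file names below are
# hypothetical, and the module-level stats_dt must already be populated,
# e.g. by read_config_stats, before calling):
#
#   from pathlib import Path
#   ms_files = sorted(Path("sims").glob("*.msout"))
#   run_simstats(ms_files, "msmove", Path("sims/model1"), nprocs=4)
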
def simulate_msmove(ms_path, model_dict, demo_dataframe, param_df,
                    sim_number, outfile, nprocs, stats_config, dryrun):
    """Run msmove simulations and calculate summary statistics.

    Parameters
    ----------
    ms_path : str
        path to the msmove executable
    model_dict : Dict
        Dict holding information on model specs from config file
    demo_dataframe : DataFrame
        Dataframe with info from model file
    param_df : DataFrame
        Dataframe that holds tbi values and draws
    sim_number : int
        how many independent sims to run
    outfile : str
        file name for output
    nprocs : int
        how many processors to run with MP
    stats_config : str
        path to stats config file; if empty, only ms command lines are written
    dryrun : bool
        if True, run a single simulation and exit

    Returns
    -------
    None.

    """
    # =========================================================================
    #  Globals for model params
    # =========================================================================
    global ms_exe
    ms_exe = ms_path
    global demo_df
    demo_df = demo_dataframe
    global model_dt
    model_dt = model_dict
    global dry_run
    dry_run = dryrun

    # model pops
    global nhaps
    nhaps = sum(model_dt["sampleSize"])
    global sample_sizes
    sample_sizes = model_dt["sampleSize"]
    global npops
    npops = len(sample_sizes)

    # set mutation rate
    global mu
    mut_rate = model_dt["mutation_rate"]
    if isinstance(mut_rate, list):
        if len(mut_rate) == 2:
            low, high = mut_rate
            mu = np.random.uniform(low, high, sim_number)
        else:
            mu = mut_rate
    else:
        mu = [mut_rate]

    # set recombination rate
    global rec
    rec_rate = model_dt["recombination_rate"]
    if isinstance(rec_rate, list):
        if len(rec_rate) == 2:
            low, high = rec_rate
            rec = np.random.uniform(low, high, sim_number)
        else:
            rec = rec_rate
    else:
        # rec = np.random.exponential(rec_rate, sim_number)
        rec = [rec_rate]

    # set effective population size
    global ploidy
    ploidy = model_dt["ploidy"]
    global init_sizes
    init_sizes = [size * ploidy for size in model_dt["initialSize"]]
    global scaled_Ne
    effective_size = model_dt["eff_size"]
    if isinstance(effective_size, list):
        if len(effective_size) == 2:
            low, high = effective_size
            scaled_Ne = np.random.randint(low, high, sim_number) * ploidy
        else:
            # scale each entry; list(effective_size * ploidy) would repeat
            # the list ploidy times instead of scaling it
            scaled_Ne = [size * ploidy for size in effective_size]
    else:
        scaled_Ne = [effective_size * ploidy]
    global pfileout
    # sims record their Ne/mu/rec draws here via the global
    pfileout = open(f"{outfile}.ne_mu_rec.out", 'w')

    # =========================================================================
    #  Main simulations
    # =========================================================================
    # set up generator fx for MP
    event = param_df["event"].values
    pops = param_df["pops"].values
    time_arr = list(zip(*param_df["time"].values))
    value_arr = list(zip(*param_df["value"].values))
    param_gen = ({"time": time_arr[i], "event": event,
                  "pops": pops, "value": value_arr[i]}
                 for i in range(sim_number))
    param_gen = list(param_gen)

    # check that there are not more processors requested than available
    if nprocs > multiprocessing.cpu_count():
        print(f"not {nprocs} processors available, setting to "
              f"{multiprocessing.cpu_count()}")
        nprocs = multiprocessing.cpu_count()

    global statsconfig
    statsconfig = ''
    global stats_dt
    global header_len
    global header
    # perform sims
    if dry_run:
        for param in param_gen:
            run_simulation(param)
            break  # a single sim is enough for a dry run
    elif stats_config:
        stats_dt = read_config_stats(stats_config)
        statsconfig = stats_config
        # write headers
        pops_outfile = open(f"{outfile}.pop_stats.txt", 'w')
        pops_outfile, header_, header_ls = headers(pops_outfile, stats_dt)
        header_len = header_
        header = header_ls
        if nprocs == 1:
            for param in tqdm(param_gen):
                pop_stats_arr = run_simulation(param)
                pops_outfile = stats_out(pop_stats_arr, pops_outfile, nprocs)
        else:
            # chunk into rounds of nprocs * 10 jobs and multiprocess each round
            nk = nprocs * 10
            chunk_list = [param_gen[i:i + nk]
                          for i in range(0, len(param_gen), nk)]
            chunksize = ceil(nk / nprocs)
            pool = multiprocessing.Pool(nprocs)
            for i, args in enumerate(chunk_list):
                pop_stats_arr = pool.map(run_simulation, args,
                                         chunksize=chunksize)
                pops_outfile = stats_out(pop_stats_arr, pops_outfile, nprocs)
                print(i)  # chunk progress
            pool.close()
        pops_outfile.close()
    else:
        with open(f"{outfile}.{sim_number}.sims.cmd.txt", 'w') as sims_outfile:
            for param in tqdm(param_gen):
                mscmd = run_simulation(param)
                sims_outfile.write(f"{mscmd} >> {outfile}\n")
    pfileout.close()
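
# Example usage of simulate_msmove (a hedged sketch; model_dict,
# demo_dataframe, and param_df are built upstream from the config and model
# files, and the file names here are hypothetical):
#
#   simulate_msmove("msmove", model_dict, demo_dataframe, param_df,
#                   sim_number=1000, outfile="out/model1", nprocs=4,
#                   stats_config="stats.cfg", dryrun=False)
#
# With an empty stats_config, no stats are computed; the ms command lines are
# written to out/model1.1000.sims.cmd.txt instead.
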
def simulate_msprime(model_dict, demo_dataframe, param_df, sim_number: int,
                     outfile: str, nprocs: int, stats_config: str,
                     dryrun: bool, order: bool):
    """Run code for simulating msprime.

    Parameters
    ----------
    model_dict : Dict
        Dict holding information on model specs from config file
    demo_dataframe : DataFrame
        Dataframe with info from model file
    param_df : DataFrame
        Dataframe that holds tbi values and draws
    sim_number : int
        how many independent sims to run
    outfile : str
        file name for output
    nprocs : int
        how many processors to run with MP
    stats_config : str
        path to stats config file; if empty, VCFs are written instead
    dryrun : bool
        if True, run a single simulation and exit
    order : bool
        if True, keep parameter draws in file order and truncate sim_number
        to the shortest parameter list

    Returns
    -------
    None.
        Writes summary stats or VCFs from msprime to file

    """
    # =========================================================================
    #  Globals for model params
    # =========================================================================
    # set info dicts
    global model_dt
    model_dt = model_dict
    global demo_df
    demo_df = demo_dataframe

    # set dryrun
    global dry_run
    dry_run = dryrun

    # set models and switching
    global initial_model
    initial_model = "hudson"
    global hybrid_model
    hybrid_model = "hudson"  # dtwf, smc, smc_prime
    global hybrid_switch_over
    if dryrun:
        hybrid_switch_over = ''  # demo debug does not handle hybrid models
    else:
        hybrid_switch_over = ''  # int of gens, e.g., 500

    # set mutation rate
    global mu
    l_mu = np.nan
    mut_rate = model_dt["mutation_rate"]
    if isinstance(mut_rate, list):
        if len(mut_rate) == 2:
            low, high = mut_rate
            mu = np.random.uniform(low, high, sim_number)
        else:
            if len(mut_rate) < sim_number:
                mu = np.random.choice(mut_rate, sim_number)
            elif order:
                mu = mut_rate  # keep file order; record length for truncation
                l_mu = len(mu)
            else:
                mu = mut_rate
    else:
        mu = [mut_rate] * sim_number

    # set recombination rate
    global rec
    l_rec = np.nan
    rec_rate = model_dt["recombination_rate"]
    if isinstance(rec_rate, list):
        if len(rec_rate) == 2:
            low, high = rec_rate
            rec = np.random.uniform(low, high, sim_number)
            # rec = np.random.exponential(rec_rate, sim_number)
        else:
            if len(rec_rate) < sim_number:
                rec = np.random.choice(rec_rate, sim_number)
            elif order:
                rec = rec_rate  # keep file order; record length for truncation
                l_rec = len(rec)
            else:
                rec = rec_rate
    else:
        rec = [rec_rate] * sim_number

    # set ploidy
    global ploidy
    ploidy = model_dt["ploidy"]

    # set effective pop size
    global scaled_Ne
    l_ne = np.nan
    effective_size = model_dt["eff_size"]
    if isinstance(effective_size, list):
        if len(effective_size) == 2:
            low, high = effective_size
            scaled_Ne = np.random.randint(low, high, sim_number) * ploidy
        else:
            if len(effective_size) < sim_number:
                scaled_Ne = list(np.random.choice(effective_size, sim_number)
                                 * ploidy)
            elif order:
                l_ne = len(effective_size)
                scaled_Ne = [size * ploidy for size in effective_size]
            else:
                # scale each entry; list(effective_size * ploidy) would
                # repeat the list ploidy times instead of scaling it
                scaled_Ne = [size * ploidy for size in effective_size]
    else:
        scaled_Ne = [effective_size * ploidy] * sim_number

    # =========================================================================
    #  Main simulations
    # =========================================================================
    # set up generator fx for MP
    if order:
        l_min = np.nanmin([l_mu, l_rec, l_ne])
        sim_number = int(l_min)
        print(f"order requested, setting sim_number to shortest param file: "
              f"{l_min}")

    with open(f"{outfile}.ne_mu_rec.out", 'w') as pfile:
        pfile.write("Ne\tmu\trec\n")
        for i in range(sim_number):
            pfile.write(f"{int(scaled_Ne[i])}\t{mu[i]}\t{rec[i]}\n")

    event = param_df["event"].values
    pops = param_df["pops"].values
    time_arr = list(zip(*param_df["time"].values))
    value_arr = list(zip(*param_df["value"].values))
    param_gen = ({"ne_t": scaled_Ne[i], "mu_t": mu[i], "rec_t": rec[i],
                  "time": time_arr[i], "event": event, "pops": pops,
                  "value": value_arr[i]}
                 for i in range(sim_number))
    param_gen = list(param_gen)

    # check that there are not more processors requested than available
    if nprocs > multiprocessing.cpu_count():
        print(f"not {nprocs} processors available, setting to "
              f"{multiprocessing.cpu_count()}")
        nprocs = multiprocessing.cpu_count()

    # perform sims
    global stats_dt
    global header_len
    global header
    global vcf
    vcf = False
    if dry_run:
        for param in param_gen:
            run_simulation(param)
            break  # a single sim is enough for a dry run
    elif stats_config:
        stats_dt = read_config_stats(stats_config)
        # write headers
        pops_outfile = open(f"{outfile}.pop_stats.txt", 'w')
        pops_outfile, header_, header_ls = headers(pops_outfile, stats_dt)
        header_len = header_
        header = header_ls
        if nprocs == 1:
            for param in tqdm(param_gen):
                pop_stats_arr = run_simulation(param)
                pops_outfile = stats_out(pop_stats_arr, pops_outfile, nprocs)
        else:
            # chunk into rounds of nprocs * 10 jobs and multiprocess each round
            nk = nprocs * 10
            chunk_list = [param_gen[i:i + nk]
                          for i in range(0, len(param_gen), nk)]
            chunksize = ceil(nk / nprocs)
            pool = multiprocessing.Pool(nprocs)
            for i, args in enumerate(chunk_list):
                pop_stats_arr = pool.map(run_simulation, args,
                                         chunksize=chunksize)
                pops_outfile = stats_out(pop_stats_arr, pops_outfile, nprocs)
                print(i)  # chunk progress
            pool.close()
        pops_outfile.close()
    else:
        print("No stats file given with msprime, default VCF")
        vcf = True
        for contig, param in enumerate(param_gen):
            mts = run_simulation(param)
            with open(f"{outfile}.contig_{contig}.vcf", "w") as vcf_file:
                mts.write_vcf(vcf_file, contig_id=f"{contig}")
def calc_obsStats(vcfpath, chrom, pops, coord_bed, zarrpath, outpath):
    """Calculate observed stats from a VCF file.

    Parameters
    ----------
    vcfpath : Path
        path to the VCF file
    chrom : str
        chromosome name; used as the group in the zarr store
    pops : str
        path to a tab-separated file with 'sampleID' and 'population' columns
    coord_bed : str
        path to a bed-like file of windows: chrom, start, stop, sites
    zarrpath : Path
        path to the zarr store; reused if it already exists
    outpath : Path
        base path used to name the stats output file

    Returns
    -------
    outfile : Path
        path of the written stats file

    """
    # reuse the zarr store if present, otherwise convert the VCF
    if not zarrpath.exists():
        allel.vcf_to_zarr(str(vcfpath),
                          str(zarrpath),
                          group=chrom,
                          fields='*',
                          alt_number=2,
                          log=sys.stdout,
                          compressor=numcodecs.Blosc(cname='zstd',
                                                     clevel=1,
                                                     shuffle=False))
    zarrfile = zarrpath
    # load pop info
    panel = pd.read_csv(pops, sep='\t', usecols=['sampleID', 'population'])
    # load zarr
    callset = zarr.open_group(str(zarrfile), mode='r')
    samples = callset[f'{chrom}/samples'][:]
    samples_list = list(samples)
    samples_callset_index = [samples_list.index(s) for s in panel['sampleID']]
    panel['callset_index'] = samples_callset_index
    panel = panel.sort_values(by='callset_index')
    # load gt
    pos = allel.SortedIndex(callset[f'{chrom}/variants/POS'])
    gt = allel.GenotypeArray(callset[f'{chrom}/calldata/GT'])
    # separate haplotypes for each population; assumes diploid calls, so
    # each sample contributes two haplotype columns
    ix_s = 0
    pop_dt = {}
    pop_ix = []
    for p in panel["population"].unique():
        p_ix = panel[panel["population"] == p]["callset_index"].values
        ix_e = len(p_ix) * 2 + ix_s
        pop_ix.append(list(range(ix_s, ix_e)))
        pop_dt[p] = gt.take(p_ix, axis=1).to_haplotypes()
        ix_s = ix_e
    # combine and transpose
    haps = np.concatenate(list(pop_dt.values()), axis=1).T
    # prep progress bar
    ln_count = 0
    with open(coord_bed, 'r') as cb:
        for line in cb:
            if not line.startswith("chrom"):
                ln_count += 1
    progressbar = tqdm(total=ln_count, desc="window number", unit='window')
    # update stats_dt
    stats_dt["num_haps"] = haps.shape[0]
    stats_dt["pop_config"] = pop_ix
    # length taken from the last window line read above; may be shorter than
    # expected due to the last window
    stats_dt["length_bp"] = int(line.split()[-1])
    stats_dt["reps"] = ln_count
    # write headers
    outfile = outpath.parent / f"{outpath.stem}.Obs.pop_stats.txt"
    pops_outfile = open(outfile, 'w')
    pops_outfile, header_, header_ls = headers(pops_outfile, stats_dt,
                                               pop_names=list(pop_dt.keys()),
                                               obs=True)
    # calc stats
    # TODO: parallel
    chrom_ls = []
    i = 0
    stat_mat = np.zeros([ln_count, len(header_ls) - 1])
    with open(coord_bed, 'r') as cb:
        for line in cb:
            if not line.startswith("chrom"):
                cb_lin = line.split()
                chrom = cb_lin[0]
                chrom_ls.append(chrom)
                start = int(cb_lin[1])
                stop = int(cb_lin[2])
                len_bp = stop - start
                stats_dt["length_bp"] = len_bp
                sites = int(cb_lin[3])
                try:
                    pos_ix = pos.locate_range(start, stop)
                except KeyError:
                    continue
                pos_t = pos[pos_ix] - start
                haps_t = haps[:, pos_ix]
                counts_t = haps_t.sum(axis=0).astype(int)
                # run stats
                stats_ls = [start, stop, sites]
                popsumstats = PopSumStats(pos_t, haps_t, counts_t, stats_dt)
                for stat in stats_dt["calc_stats"]:
                    stat_fx = getattr(popsumstats, stat)
                    try:
                        ss = stat_fx()
                        # print(f"{stat} = {len(ss)}")
                    except IndexError:
                        ss = [np.nan] * len(stats_dt["pw_quants"])
                    stats_ls.extend(ss)
                try:
                    stat_mat[i, :] = stats_ls
                    i += 1
                    progressbar.update()
                except ValueError:
                    continue
    # write stats out
    stat_mean = np.round(np.nanmean(stat_mat, axis=0), 5)
    stats_str = "\t".join(map(str, stat_mean[3:]))
    pops_outfile.write(f"mean_{chrom}\t{int(stat_mat[0, 0])}\t{stop}\t"
                       f"{np.sum(stat_mat[:, 2])}\t{stats_str}\n")
    for stat in range(stat_mat.shape[0]):
        chrom = chrom_ls[stat]
        start = int(stat_mat[stat, 0])
        stop = int(stat_mat[stat, 1])
        sites = int(stat_mat[stat, 2])
        rd = [round(num, 5) for num in stat_mat[stat, 3:]]
        stats_str = "\t".join(map(str, rd))
        pops_outfile.write(f"{chrom}\t{start}\t{stop}\t{sites}\t{stats_str}\n")
    progressbar.close()
    pops_outfile.close()
    return outfile
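
# Example usage of calc_obsStats (a sketch; the paths are hypothetical).
# The pops file is tab-separated with 'sampleID' and 'population' columns,
# and coord_bed holds one window per line as: chrom start stop sites.
#
#   from pathlib import Path
#   stats_file = calc_obsStats(Path("data/chr2L.vcf.gz"), "2L",
#                              "data/pops.txt", "data/windows.bed",
#                              Path("data/chr2L.zarr"), Path("out/chr2L"))
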