コード例 #1
0
def run_simstats(ms_files, msexe, outpath, nprocs):
    """Calculate summary statistics for previously simulated data.

    The simulations are loaded with ``read_ms``, statistics are computed for
    each of them with ``calc_simstats`` and written through ``stats_out`` to
    ``<outpath.stem>.pop_stats.txt`` next to ``outpath``.

    Parameters
    ----------
    ms_files :
        Passed unchanged to ``read_ms``.
    msexe :
        Passed unchanged to ``read_ms``.
    outpath : path-like object with ``parent`` and ``stem`` attributes
        Used only to derive the location and name of the output file.
    nprocs : int
        ``1`` runs serially; any other value is used as the size of a
        ``multiprocessing.Pool``.

    Notes
    -----
    Reads the module-level ``stats_dt`` (keys ``"length_bp"`` and
    ``"num_haps"``) and sets the module-level ``header_len`` and ``header``
    from the values returned by ``headers``.

    """
    global header_len
    global header
    # read in all the files
    length_bp = stats_dt["length_bp"]
    nhaps = stats_dt["num_haps"]
    ms_dict = read_ms(ms_files, msexe, nhaps, length_bp)
    # write headers
    outfile = outpath.parent / f"{outpath.stem}.pop_stats.txt"
    pops_outfile = open(outfile, 'w')
    # ``headers``/``stats_out`` hand the file object back, so the name is
    # rebound repeatedly; try/finally closes whatever it ends up bound to,
    # even when a simulation or a worker raises.
    try:
        pops_outfile, header_, header_ls = headers(pops_outfile, stats_dt)
        header_len = header_
        header = header_ls
        if nprocs == 1:
            for ms in tqdm(ms_dict.values()):
                pop_stats_arr = calc_simstats(ms)
                pops_outfile = stats_out(pop_stats_arr, pops_outfile, nprocs)
        else:
            # chunk and MP: results are written after every chunk of
            # ``nk`` simulations instead of only once at the very end
            nk = nprocs * 10
            ms_vals = list(ms_dict.values())
            chunk_list = [
                ms_vals[i:i + nk] for i in range(0, len(ms_vals), nk)
            ]
            chunksize = ceil(nk / nprocs)
            pool = multiprocessing.Pool(nprocs)
            try:
                for i, args in enumerate(chunk_list):
                    pop_stats_arr = pool.map(calc_simstats,
                                             args,
                                             chunksize=chunksize)
                    pops_outfile = stats_out(pop_stats_arr, pops_outfile,
                                             nprocs)
                    print(i)
            finally:
                # always release the worker processes
                pool.close()
                pool.join()
    finally:
        pops_outfile.close()
コード例 #2
0
def simulate_msmove(ms_path, model_dict, demo_dataframe, param_df, sim_number,
                    outfile, nprocs, stats_config, dryrun):
    """
    Drive ``run_simulation`` for every parameter draw of an msmove model.

    The model settings are published as module-level globals (the worker
    function is called with the per-simulation parameters only), one
    parameter dict is built per simulation and each dict is passed to
    ``run_simulation``.

    Parameters
    ----------
    ms_path :
        Stored in the global ``ms_exe``.
    model_dict : dict
        Must provide the keys ``"sampleSize"``, ``"mutation_rate"``,
        ``"recombination_rate"``, ``"ploidy"``, ``"initialSize"`` and
        ``"eff_size"``. The three rates/sizes may be a scalar, a two-item
        list ``[low, high]`` to draw from, or a longer list used as given.
    demo_dataframe :
        Stored in the global ``demo_df``; not used otherwise in this function.
    param_df : DataFrame
        Needs the columns ``"event"``, ``"pops"``, ``"time"`` and
        ``"value"``. Each entry of ``"time"`` and ``"value"`` is a sequence
        indexed by simulation number.
    sim_number : int
        Number of simulations to set up.
    outfile : str
        Prefix of every file written: ``.ne_mu_rec.out``,
        ``.pop_stats.txt`` and ``.<sim_number>.sims.cmd.txt``.
    nprocs : int
        Number of worker processes; capped at ``multiprocessing.cpu_count()``.
    stats_config :
        If truthy, passed to ``read_config_stats`` and statistics are written
        to ``<outfile>.pop_stats.txt``; otherwise the value returned by
        ``run_simulation`` is written as a command line.
    dryrun :
        If truthy only the first parameter set is passed to
        ``run_simulation`` and nothing else is run.

    Returns
    -------
    None.

    """
    # =========================================================================
    #  Globals for model params
    # =========================================================================
    global ms_exe
    ms_exe = ms_path
    global demo_df
    demo_df = demo_dataframe
    global model_dt
    model_dt = model_dict
    global dry_run
    dry_run = dryrun

    # model pops
    global nhaps
    nhaps = sum(model_dt["sampleSize"])
    global sample_sizes
    sample_sizes = model_dt["sampleSize"]
    global npops
    npops = len(sample_sizes)

    # set mutation rate
    global mu
    mut_rate = model_dt["mutation_rate"]
    if isinstance(mut_rate, list):
        if len(mut_rate) == 2:
            # a two-item list is read as the bounds of a uniform prior
            low, high = mut_rate
            mu = np.random.uniform(low, high, sim_number)
        else:
            mu = mut_rate
    else:
        mu = [mut_rate]

    # set recombination rate
    global rec
    rec_rate = model_dt["recombination_rate"]
    if isinstance(rec_rate, list):
        if len(rec_rate) == 2:
            low, high = rec_rate
            rec = np.random.uniform(low, high, sim_number)
        else:
            rec = rec_rate
    else:
        rec = [rec_rate]

    # set effective population size
    global ploidy
    ploidy = model_dt["ploidy"]
    global init_sizes
    init_sizes = [size * ploidy for size in model_dt["initialSize"]]
    global scaled_Ne
    effective_size = model_dt["eff_size"]
    if isinstance(effective_size, list):
        if len(effective_size) == 2:
            low, high = effective_size
            scaled_Ne = np.random.randint(low, high, sim_number) * ploidy
        else:
            # scale every entry; ``list * int`` would repeat the list instead
            scaled_Ne = [size * ploidy for size in effective_size]
    else:
        scaled_Ne = [effective_size * ploidy]

    # the handle is exposed as a global; it is not written to in this function
    global pfileout
    pfileout = open(f"{outfile}.ne_mu_rec.out", 'w')
    try:
        # =====================================================================
        #  Main simulations
        # =====================================================================
        # one parameter dict per simulation
        event = param_df["event"].values
        pops = param_df["pops"].values
        # transpose so that index i holds the draws of simulation i
        time_arr = list(zip(*param_df["time"].values))
        value_arr = list(zip(*param_df["value"].values))
        param_gen = [{
            "time": time_arr[i],
            "event": event,
            "pops": pops,
            "value": value_arr[i]
        } for i in range(sim_number)]

        # check that there are not more processors requested than available
        available = multiprocessing.cpu_count()
        if nprocs > available:
            print(
                f"not {nprocs} processors available, setting to {available}"
            )
            nprocs = available

        global statsconfig
        statsconfig = ''
        global stats_dt
        global header_len
        global header
        # perform sims
        if dry_run:
            # only the first parameter set is needed for a dry run
            if param_gen:
                run_simulation(param_gen[0])
        elif stats_config:
            stats_dt = read_config_stats(stats_config)
            statsconfig = stats_config
            # write headers
            pops_outfile = open(f"{outfile}.pop_stats.txt", 'w')
            try:
                pops_outfile, header_, header_ls = headers(
                    pops_outfile, stats_dt)
                header_len = header_
                header = header_ls
                if nprocs == 1:
                    for param in tqdm(param_gen):
                        pop_stats_arr = run_simulation(param)
                        pops_outfile = stats_out(pop_stats_arr, pops_outfile,
                                                 nprocs)
                else:
                    # chunk and MP: flush results after every ``nk`` sims
                    nk = nprocs * 10
                    chunk_list = [
                        param_gen[i:i + nk]
                        for i in range(0, len(param_gen), nk)
                    ]
                    chunksize = ceil(nk / nprocs)
                    pool = multiprocessing.Pool(nprocs)
                    try:
                        for i, args in enumerate(chunk_list):
                            pop_stats_arr = pool.map(run_simulation,
                                                     args,
                                                     chunksize=chunksize)
                            pops_outfile = stats_out(pop_stats_arr,
                                                     pops_outfile, nprocs)
                            print(i)
                    finally:
                        pool.close()
                        pool.join()
            finally:
                pops_outfile.close()
        else:
            with open(f"{outfile}.{sim_number}.sims.cmd.txt",
                      'w') as sims_outfile:
                for param in tqdm(param_gen):
                    mscmd = run_simulation(param)
                    sims_outfile.write(f"{mscmd} >> {outfile}\n")
    finally:
        # close the parameter log even when a simulation fails
        pfileout.close()
コード例 #3
0
ファイル: sim_msprime.py プロジェクト: stsmall/abc_scripts2
def simulate_msprime(model_dict, demo_dataframe, param_df, sim_number: int,
                     outfile: str, nprocs: int, stats_config: str,
                     dryrun: bool, order: bool):
    """Run code for simulating msprime.

    Model settings are published as module-level globals, one parameter dict
    is built per simulation and every dict is passed to ``run_simulation``.

    Parameters
    ----------
    model_dict : Dict
        Dict holding information on model specs from config file. The keys
        ``"mutation_rate"``, ``"recombination_rate"``, ``"ploidy"`` and
        ``"eff_size"`` are read here. The rates and the size may each be a
        scalar, a two-item list ``[low, high]`` to draw from, or a longer
        list of values.
    demo_dataframe : DataFrame
        Dataframe with info from model file; stored in the global ``demo_df``.
    param_df : DataFrame
        Dataframe that holds tbi values and draws. Needs the columns
        ``"event"``, ``"pops"``, ``"time"`` and ``"value"``.
    sim_number : int
        how many independent sims to run
    outfile : str
        prefix of the output files (``.ne_mu_rec.out``, ``.pop_stats.txt``,
        ``.contig_<i>.vcf``)
    nprocs : int
        how many processors to run with MP; capped at the number available
    stats_config : str
        If truthy, passed to ``read_config_stats`` and summary statistics are
        written; otherwise one VCF is written per simulation.
    dryrun : bool
        If true only the first parameter set is passed to ``run_simulation``.
    order : bool
        If true, a value list that is at least ``sim_number`` long is used in
        the given order and ``sim_number`` is reset to the length of the
        shortest such list.

    Returns
    -------
    None. Writes ``<outfile>.ne_mu_rec.out`` and, unless ``dryrun`` is set,
    either the statistics file or the VCF files.

    """
    # =========================================================================
    #  Globals for model params
    # =========================================================================
    # set info dicts
    global model_dt
    model_dt = model_dict
    global demo_df
    demo_df = demo_dataframe

    # set dryrun
    global dry_run
    dry_run = dryrun

    # set models and switching
    global initial_model
    initial_model = "hudson"
    global hybrid_model
    hybrid_model = "hudson"  # dtwf, smc, smc_prime
    global hybrid_switch_over
    # '' disables switching; an int of gens (e.g., 500) would enable it, but
    # demo debug (dry run) does not handle hybrid models
    hybrid_switch_over = ''

    # set mutation rate
    global mu
    l_mu = np.nan  # stays NaN unless an ordered list is used
    mut_rate = model_dt["mutation_rate"]
    if isinstance(mut_rate, list):
        if len(mut_rate) == 2:
            # a two-item list is read as the bounds of a uniform prior
            low, high = mut_rate
            mu = np.random.uniform(low, high, sim_number)
        elif len(mut_rate) < sim_number:
            # too few values supplied: resample them
            mu = np.random.choice(mut_rate, sim_number)
        else:
            mu = mut_rate
            if order:
                l_mu = len(mut_rate)
    else:
        mu = [mut_rate] * sim_number

    # set recombination rate
    global rec
    l_rec = np.nan
    rec_rate = model_dt["recombination_rate"]
    if isinstance(rec_rate, list):
        if len(rec_rate) == 2:
            low, high = rec_rate
            rec = np.random.uniform(low, high, sim_number)
        elif len(rec_rate) < sim_number:
            rec = np.random.choice(rec_rate, sim_number)
        else:
            rec = rec_rate
            if order:
                l_rec = len(rec_rate)
    else:
        rec = [rec_rate] * sim_number

    # set ploidy
    global ploidy
    ploidy = model_dt["ploidy"]

    # set effective pop size
    global scaled_Ne
    l_ne = np.nan
    effective_size = model_dt["eff_size"]
    if isinstance(effective_size, list):
        if len(effective_size) == 2:
            low, high = effective_size
            scaled_Ne = np.random.randint(low, high, sim_number) * ploidy
        elif len(effective_size) < sim_number:
            scaled_Ne = np.random.choice(effective_size, sim_number)
            scaled_Ne = list(scaled_Ne * ploidy)
        else:
            # scale every entry; ``list * int`` would repeat the list instead
            scaled_Ne = [size * ploidy for size in effective_size]
            if order:
                l_ne = len(scaled_Ne)
    else:
        scaled_Ne = [effective_size * ploidy] * sim_number
    # =========================================================================
    #  Main simulations
    # =========================================================================
    if order:
        ordered_lengths = np.array([l_mu, l_rec, l_ne], dtype=float)
        # all-NaN means no parameter supplied an ordered list, so there is
        # nothing to shorten to and sim_number is kept
        if not np.isnan(ordered_lengths).all():
            l_min = np.nanmin(ordered_lengths)
            # NOTE(review): parameters that were drawn or repeated above have
            # the original sim_number entries; confirm callers never request
            # fewer sims than the shortest ordered list.
            sim_number = int(l_min)
            print(
                f"order requested, setting sim_number to shortest param file: {l_min}"
            )

    with open(f"{outfile}.ne_mu_rec.out", 'w') as pfile:
        pfile.write("Ne\tmu\trec\n")
        for i in range(sim_number):
            pfile.write(f"{int(scaled_Ne[i])}\t{mu[i]}\t{rec[i]}\n")

    # one parameter dict per simulation
    event = param_df["event"].values
    pops = param_df["pops"].values
    # transpose so that index i holds the draws of simulation i
    time_arr = list(zip(*param_df["time"].values))
    value_arr = list(zip(*param_df["value"].values))
    param_gen = [{
        "ne_t": scaled_Ne[i],
        "mu_t": mu[i],
        "rec_t": rec[i],
        "time": time_arr[i],
        "event": event,
        "pops": pops,
        "value": value_arr[i]
    } for i in range(sim_number)]

    # check that there are not more processors requested than available
    available = multiprocessing.cpu_count()
    if nprocs > available:
        print(f"not {nprocs} processors available, setting to {available}")
        nprocs = available
    # perform sims
    global stats_dt
    global header_len
    global header
    global vcf
    vcf = False
    if dry_run:
        # only the first parameter set is needed for a dry run
        if param_gen:
            run_simulation(param_gen[0])
    elif stats_config:
        stats_dt = read_config_stats(stats_config)
        # write headers
        pops_outfile = open(f"{outfile}.pop_stats.txt", 'w')
        # ``headers``/``stats_out`` hand the file object back; try/finally
        # closes it even when a simulation fails
        try:
            pops_outfile, header_, header_ls = headers(pops_outfile, stats_dt)
            header_len = header_
            header = header_ls
            if nprocs == 1:
                for param in tqdm(param_gen):
                    pop_stats_arr = run_simulation(param)
                    pops_outfile = stats_out(pop_stats_arr, pops_outfile,
                                             nprocs)
            else:
                # chunk and MP
                nk = nprocs * 10  # tricky, how many jobs for each processor
                chunk_list = [
                    param_gen[i:i + nk] for i in range(0, len(param_gen), nk)
                ]
                chunksize = ceil(nk / nprocs)
                pool = multiprocessing.Pool(nprocs)
                try:
                    for i, args in enumerate(chunk_list):
                        pop_stats_arr = pool.map(run_simulation,
                                                 args,
                                                 chunksize=chunksize)
                        pops_outfile = stats_out(pop_stats_arr, pops_outfile,
                                                 nprocs)
                        print(i)
                finally:
                    pool.close()
                    pool.join()
        finally:
            pops_outfile.close()
    else:
        print("No stats file given with msprime, default VCF")
        vcf = True
        for contig, param in enumerate(param_gen):
            mts = run_simulation(param)
            with open(f"{outfile}.contig_{contig}.vcf", "w") as vcf_file:
                mts.write_vcf(vcf_file, contig_id=f"{contig}")
コード例 #4
0
def calc_obsStats(vcfpath, chrom, pops, coord_bed, zarrpath, outpath):
    """Calculate per-window summary statistics from a VCF file.

    Parameters
    ----------
    vcfpath :
        VCF to convert; only read when ``zarrpath`` does not exist yet.
    chrom : str
        Name of the zarr group that holds the call set.
    pops :
        Tab-separated file with the columns ``sampleID`` and ``population``.
    coord_bed :
        Whitespace-separated window file. Lines starting with ``"chrom"`` are
        skipped; the first four fields of every other line are read as
        chromosome, start, stop and number of sites.
    zarrpath : path-like object with ``exists()``
        Location of the zarr store; it is created from ``vcfpath`` if missing.
    outpath : path-like object with ``parent`` and ``stem`` attributes
        Used only to derive the name of the output file.

    Returns
    -------
    The path of the written file ``<outpath.stem>.Obs.pop_stats.txt``.

    Raises
    ------
    ValueError
        If ``coord_bed`` contains no window lines.

    Notes
    -----
    Updates the module-level ``stats_dt`` (``"num_haps"``, ``"pop_config"``,
    ``"length_bp"``, ``"reps"``). Windows for which no variant position is
    found, or whose statistics do not fit the header, are left out of the
    output.

    """
    # reuse an existing zarr store, otherwise convert the VCF first
    zarrfile = zarrpath
    if not zarrpath.exists():
        allel.vcf_to_zarr(str(vcfpath),
                          str(zarrpath),
                          group=chrom,
                          fields='*',
                          alt_number=2,
                          log=sys.stdout,
                          compressor=numcodecs.Blosc(cname='zstd',
                                                     clevel=1,
                                                     shuffle=False))

    # load pop info
    panel = pd.read_csv(pops, sep='\t', usecols=['sampleID', 'population'])

    # load zarr and map every panel sample to its column in the call set
    callset = zarr.open_group(str(zarrfile), mode='r')
    samples = callset[f'{chrom}/samples'][:]
    samples_list = list(samples)
    samples_callset_index = [samples_list.index(s) for s in panel['sampleID']]
    panel['callset_index'] = samples_callset_index
    panel = panel.sort_values(by='callset_index')

    # load gt
    pos = allel.SortedIndex(callset[f'{chrom}/variants/POS'])
    gt = allel.GenotypeArray(callset[f'{chrom}/calldata/GT'])

    # separate gt for each population
    ix_s = 0
    pop_dt = {}
    pop_ix = []
    for p in panel["population"].unique():
        p_ix = panel[panel["population"] == p]["callset_index"].values
        # two haplotypes per sample are assumed here -- TODO confirm for
        # non-diploid data
        ix_e = len(p_ix) * 2 + ix_s
        pop_ix.append(list(range(ix_s, ix_e)))
        pop_dt[p] = gt.take(p_ix, axis=1).to_haplotypes()
        ix_s = ix_e

    # combine and transpose: one row per haplotype
    haps = np.concatenate(list(pop_dt.values()), axis=1).T

    # count the windows for the progress bar
    ln_count = 0
    with open(coord_bed, 'r') as cb:
        for line in cb:
            if not line.startswith("chrom"):
                ln_count += 1
    if ln_count == 0:
        raise ValueError(f"no windows found in {coord_bed}")

    progressbar = tqdm(total=ln_count, desc="window numb", unit='window')

    # update stats_dt
    stats_dt["num_haps"] = haps.shape[0]
    stats_dt["pop_config"] = pop_ix
    # ``line`` is still the last line of the bed file here
    stats_dt["length_bp"] = int(
        line.split()[-1])  # may be shorter than expected due to last window
    stats_dt["reps"] = ln_count

    # write headers
    outfile = outpath.parent / f"{outpath.stem}.Obs.pop_stats.txt"
    pops_outfile = open(outfile, 'w')
    try:
        pops_outfile, header_, header_ls = headers(pops_outfile,
                                                   stats_dt,
                                                   pop_names=list(
                                                       pop_dt.keys()),
                                                   obs=True)

        # calc stats
        # TODO: parallel
        chrom_ls = []
        n_done = 0  # number of windows stored in stat_mat so far
        stat_mat = np.zeros([ln_count, len(header_ls) - 1])
        with open(coord_bed, 'r') as cb:
            for line in cb:
                if line.startswith("chrom"):
                    continue
                cb_lin = line.split()
                chrom = cb_lin[0]
                start = int(cb_lin[1])
                stop = int(cb_lin[2])
                stats_dt["length_bp"] = stop - start
                sites = int(cb_lin[3])
                try:
                    pos_ix = pos.locate_range(start, stop)
                except KeyError:
                    # no variant position inside this window
                    continue
                pos_t = pos[pos_ix] - start
                haps_t = haps[:, pos_ix]
                counts_t = haps_t.sum(axis=0).astype(int)
                # run stats
                stats_ls = [start, stop, sites]
                popsumstats = PopSumStats(pos_t, haps_t, counts_t, stats_dt)
                for stat in stats_dt["calc_stats"]:
                    stat_fx = getattr(popsumstats, stat)
                    try:
                        ss = stat_fx()
                    except IndexError:
                        ss = [np.nan] * len(stats_dt["pw_quants"])
                    stats_ls.extend(ss)
                try:
                    stat_mat[n_done, :] = stats_ls
                except ValueError:
                    # number of values does not match the header: skip window
                    continue
                # record the chromosome only once the row is stored so that
                # chrom_ls[k] always belongs to stat_mat[k]
                chrom_ls.append(chrom)
                n_done += 1
                progressbar.update()

        # drop the rows reserved for skipped windows; they would otherwise be
        # averaged and written out as all-zero windows
        stat_mat = stat_mat[:n_done]

        # write stats out
        if n_done:
            stat_mean = np.round(np.nanmean(stat_mat, axis=0), 5)
            stats_str = "\t".join(map(str, stat_mean[3:]))
            pops_outfile.write(
                f"mean_{chrom}\t{int(stat_mat[0, 0])}\t{stop}\t{np.sum(stat_mat[:, 2])}\t{stats_str}\n"
            )
            for row in range(n_done):
                chrom = chrom_ls[row]
                start = int(stat_mat[row, 0])
                stop = int(stat_mat[row, 1])
                sites = int(stat_mat[row, 2])
                rd = [round(num, 5) for num in stat_mat[row, 3:]]
                stats_str = "\t".join(map(str, rd))
                pops_outfile.write(
                    f"{chrom}\t{start}\t{stop}\t{sites}\t{stats_str}\n")
    finally:
        progressbar.close()
        pops_outfile.close()

    return outfile