Example #1
0
def main():
    """Parse the command line and run either the "sim" or the "obs" workflow.

    The arguments come from ``parse_args(sys.argv[1:])``.  Whatever the mode,
    the stats configuration named by ``configFile`` is loaded into the
    module-level ``stats_dt`` before any work starts.

    * ``mode == "sim"``: ``ms_file`` may be one file or a directory; for a
      directory every ``*.msout`` file inside it is processed.  The files are
      handed to ``run_simstats``.
    * any other mode: the observed-data arguments are read, and
      ``calc_obsStats`` is called only when the mode is exactly ``"obs"``.
    """
    argsDict = parse_args(sys.argv[1:])
    sim_mode = argsDict["mode"] == "sim"

    # ---- collect the arguments that belong to the selected mode ----
    if sim_mode:
        ms_source = Path(argsDict["ms_file"])
        config_path = argsDict["configFile"]
        ms_binary = argsDict["ms"]
        out_target = Path(argsDict["outfile"])
        n_workers = argsDict["nprocs"]
    else:
        vcf_source = Path(argsDict["vcfFileIn"])
        chrom_arm = argsDict["chr_arm"]
        config_path = argsDict["configFile"]
        pops_file = argsDict["pops_file"]
        coords = argsDict["coords_bed"]
        zarr_dir = Path(argsDict["zarr_path"])
        out_target = Path(argsDict["outfile"])

    # ---- stats configuration is shared through a module-level global ----
    global stats_dt
    stats_dt = read_config_stats(config_path)

    # ---- dispatch ----
    if sim_mode:
        # a directory means "every file ending in .msout"; otherwise one file
        ms_files = (list(ms_source.glob("*.msout"))
                    if ms_source.is_dir() else [ms_source])
        run_simstats(ms_files, ms_binary, out_target, n_workers)
        return

    if argsDict["mode"] == "obs":
        calc_obsStats(vcf_source, chrom_arm, pops_file, coords, zarr_dir,
                      out_target)
Example #2
0
def _msmove_rate_draws(rate, sim_number):
    """Expand a configured rate into the per-simulation values.

    A two-element list is read as ``[low, high]`` and ``sim_number`` values
    are drawn uniformly between them; any other list is returned unchanged;
    a scalar is wrapped in a one-element list.
    """
    if isinstance(rate, list):
        if len(rate) == 2:
            low, high = rate
            return np.random.uniform(low, high, sim_number)
        return rate
    return [rate]


def simulate_msmove(ms_path, model_dict, demo_dataframe, param_df, sim_number,
                    outfile, nprocs, stats_config, dryrun):
    """
    Set up module-level model state and run the msmove simulations.

    The function communicates with ``run_simulation`` through module-level
    globals (``ms_exe``, ``demo_df``, ``model_dt``, ``mu``, ``rec``,
    ``scaled_Ne``, ``pfileout`` ...), which it (re)assigns on every call.

    Parameters
    ----------
    ms_path : str
        Stored in the global ``ms_exe``; presumably the path of the ms
        executable (not used directly here).
    model_dict : dict
        Model specification.  The keys read here are ``"sampleSize"``,
        ``"mutation_rate"``, ``"recombination_rate"``, ``"ploidy"``,
        ``"initialSize"`` and ``"eff_size"``.
    demo_dataframe : object
        Stored in the global ``demo_df``; not inspected here.
    param_df : mapping
        Must provide ``"event"``, ``"pops"``, ``"time"`` and ``"value"``
        entries, each exposing ``.values``.  Every ``"time"``/``"value"``
        entry is a sequence with one draw per simulation.
    sim_number : int
        Number of simulations (and number of random draws per parameter).
    outfile : str
        Prefix for all output files.
    nprocs : int
        Requested number of worker processes; capped at the CPU count.
    stats_config : str
        Passed to ``read_config_stats`` when truthy; stats are then written
        to ``{outfile}.pop_stats.txt``.  When falsy, the command returned by
        each ``run_simulation`` call is written to
        ``{outfile}.{sim_number}.sims.cmd.txt`` instead.
    dryrun : bool
        When truthy only the first parameter set is passed to
        ``run_simulation`` and nothing else is run.

    Returns
    -------
    None.

    """
    global ms_exe, demo_df, model_dt, dry_run
    global nhaps, sample_sizes, npops
    global mu, rec
    global ploidy, init_sizes, scaled_Ne
    global pfileout
    global statsconfig, stats_dt, header_len, header

    # =========================================================================
    #  Globals for model params
    # =========================================================================
    ms_exe = ms_path
    demo_df = demo_dataframe
    model_dt = model_dict
    dry_run = dryrun

    # model pops
    sample_sizes = model_dt["sampleSize"]
    nhaps = sum(sample_sizes)
    npops = len(sample_sizes)

    # mutation rate first, then recombination rate (keeps RNG draw order)
    mu = _msmove_rate_draws(model_dt["mutation_rate"], sim_number)
    rec = _msmove_rate_draws(model_dt["recombination_rate"], sim_number)

    # set effective population size
    ploidy = model_dt["ploidy"]
    init_sizes = [size * ploidy for size in model_dt["initialSize"]]
    effective_size = model_dt["eff_size"]
    if isinstance(effective_size, list):
        if len(effective_size) == 2:
            low, high = effective_size
            scaled_Ne = np.random.randint(low, high, sim_number) * ploidy
        else:
            # scale every entry; ``list * int`` would repeat the list instead
            scaled_Ne = [size * ploidy for size in effective_size]
    else:
        scaled_Ne = [effective_size * ploidy]

    # Global handle; this function only opens and closes it.
    # NOTE(review): presumably written to by run_simulation - confirm.
    pfileout = open(f"{outfile}.ne_mu_rec.out", 'w')
    try:
        # =====================================================================
        #  Main simulations
        # =====================================================================
        event = param_df["event"].values
        pops = param_df["pops"].values
        # transpose: one tuple of per-event draws for each simulation
        time_arr = list(zip(*param_df["time"].values))
        value_arr = list(zip(*param_df["value"].values))
        param_gen = [{
            "time": time_arr[i],
            "event": event,
            "pops": pops,
            "value": value_arr[i]
        } for i in range(sim_number)]

        # check that there are not more processors requested than available
        available = multiprocessing.cpu_count()
        if nprocs > available:
            print(
                f"not {nprocs} processors available, setting to {available}"
            )
            nprocs = available

        statsconfig = ''
        # perform sims
        if dry_run:
            # only the first parameter set is exercised
            if param_gen:
                run_simulation(param_gen[0])
        elif stats_config:
            stats_dt = read_config_stats(stats_config)
            statsconfig = stats_config
            pops_outfile = open(f"{outfile}.pop_stats.txt", 'w')
            try:
                # write headers
                pops_outfile, header_len, header = headers(
                    pops_outfile, stats_dt)
                if nprocs == 1:
                    for param in tqdm(param_gen):
                        pop_stats_arr = run_simulation(param)
                        pops_outfile = stats_out(pop_stats_arr, pops_outfile,
                                                 nprocs)
                else:
                    # chunk and MP: results are flushed after every chunk
                    nk = nprocs * 10
                    chunk_list = [
                        param_gen[i:i + nk]
                        for i in range(0, len(param_gen), nk)
                    ]
                    chunksize = ceil(nk / nprocs)
                    with multiprocessing.Pool(nprocs) as pool:
                        for i, args in enumerate(chunk_list):
                            pop_stats_arr = pool.map(run_simulation,
                                                     args,
                                                     chunksize=chunksize)
                            pops_outfile = stats_out(pop_stats_arr,
                                                     pops_outfile, nprocs)
                            print(i)
            finally:
                pops_outfile.close()
        else:
            cmd_path = f"{outfile}.{sim_number}.sims.cmd.txt"
            with open(cmd_path, 'w') as sims_outfile:
                for param in tqdm(param_gen):
                    mscmd = run_simulation(param)
                    sims_outfile.write(f"{mscmd} >> {outfile}\n")
    finally:
        # close the parameter file even when a simulation fails
        pfileout.close()
Example #3
0
def _msprime_rate_draws(rate, sim_number, order):
    """Expand a configured rate into per-simulation values for msprime.

    Returns ``(values, ordered_len)``.  ``ordered_len`` is the length of the
    supplied list when it is used as given and ``order`` is set, otherwise
    ``np.nan``.

    * scalar: repeated ``sim_number`` times
    * two-element list: ``[low, high]`` bounds of a uniform draw
    * list shorter than ``sim_number``: resampled with ``np.random.choice``
    * any other list: used unchanged
    """
    if not isinstance(rate, list):
        return [rate] * sim_number, np.nan
    if len(rate) == 2:
        low, high = rate
        return np.random.uniform(low, high, sim_number), np.nan
    if len(rate) < sim_number:
        return np.random.choice(rate, sim_number), np.nan
    return rate, (len(rate) if order else np.nan)


def simulate_msprime(model_dict, demo_dataframe, param_df, sim_number: int,
                     outfile: str, nprocs: int, stats_config: str,
                     dryrun: bool, order: bool):
    """Run code for simulating msprime.

    Model state is shared with ``run_simulation`` through module-level
    globals (``model_dt``, ``demo_df``, ``mu``, ``rec``, ``scaled_Ne`` ...),
    which are (re)assigned on every call.

    Parameters
    ----------
    model_dict : Dict
        Dict holding information on model specs from config file.  Keys read
        here: ``"mutation_rate"``, ``"recombination_rate"``, ``"ploidy"``
        and ``"eff_size"``.
    demo_dataframe : DataFrame
        Dataframe with info from model file (stored in ``demo_df``)
    param_df : DataFrame
        Dataframe that holds tbi values and draws; columns ``"event"``,
        ``"pops"``, ``"time"`` and ``"value"`` are read
    sim_number : int
        how many independent sims to run
    outfile : str
        file name prefix for output
    nprocs : int
        how many processors to run with MP (capped at the CPU count)
    stats_config : str
        stats config passed to ``read_config_stats``; when falsy a VCF is
        written per simulation instead of stats
    dryrun : bool
        when truthy only the first parameter set is passed to
        ``run_simulation``
    order : bool
        when truthy and a parameter list is used as given (more than two
        entries and at least ``sim_number`` long), ``sim_number`` is reset
        to the shortest such list

    Returns
    -------
    None.  Writes ``{outfile}.ne_mu_rec.out`` and then either
    ``{outfile}.pop_stats.txt`` or one ``{outfile}.contig_{i}.vcf`` per
    simulation.

    """
    global model_dt, demo_df, dry_run
    global initial_model, hybrid_model, hybrid_switch_over
    global mu, rec, ploidy, scaled_Ne
    global stats_dt, header_len, header, vcf

    # =========================================================================
    #  Globals for model params
    # =========================================================================
    # set info dicts
    model_dt = model_dict
    demo_df = demo_dataframe

    # set dryrun
    dry_run = dryrun

    # set models and switching
    initial_model = "hudson"
    hybrid_model = "hudson"  # dtwf, smc, smc_prime
    # Empty for dry runs (demo debug does not handle hybrid models) and
    # currently also otherwise; may be an int of gens, e.g., 500.
    hybrid_switch_over = ''

    # mutation rate first, then recombination rate (keeps RNG draw order)
    mu, l_mu = _msprime_rate_draws(model_dt["mutation_rate"], sim_number,
                                   order)
    rec, l_rec = _msprime_rate_draws(model_dt["recombination_rate"],
                                     sim_number, order)

    # set ploidy
    ploidy = model_dt["ploidy"]

    # set effective pop size
    l_ne = np.nan
    effective_size = model_dt["eff_size"]
    if isinstance(effective_size, list):
        if len(effective_size) == 2:
            low, high = effective_size
            scaled_Ne = np.random.randint(low, high, sim_number) * ploidy
        elif len(effective_size) < sim_number:
            scaled_Ne = np.random.choice(effective_size, sim_number)
            scaled_Ne = list(scaled_Ne * ploidy)
        else:
            # scale every entry; ``list * int`` would repeat the list instead
            scaled_Ne = [size * ploidy for size in effective_size]
            if order:
                l_ne = len(effective_size)
    else:
        scaled_Ne = [effective_size * ploidy] * sim_number

    # =========================================================================
    #  Main simulations
    # =========================================================================
    if order:
        ordered_lengths = [
            n for n in (l_mu, l_rec, l_ne) if not np.isnan(n)
        ]
        # With no ordered list there is nothing to align to: keep sim_number
        # (int(nan) would raise).
        if ordered_lengths:
            sim_number = int(min(ordered_lengths))
            # NOTE(review): parameters that were drawn with the original
            # sim_number hold only that many values; if the new sim_number
            # is larger the indexing below can fail - confirm intended use.
            print(
                f"order requested, setting sim_number to shortest param file: {sim_number}"
            )

    with open(f"{outfile}.ne_mu_rec.out", 'w') as pfile:
        pfile.write("Ne\tmu\trec\n")
        for i in range(sim_number):
            pfile.write(f"{int(scaled_Ne[i])}\t{mu[i]}\t{rec[i]}\n")

    event = param_df["event"].values
    pops = param_df["pops"].values
    # transpose: one tuple of per-event draws for each simulation
    time_arr = list(zip(*param_df["time"].values))
    value_arr = list(zip(*param_df["value"].values))
    param_gen = [{
        "ne_t": scaled_Ne[i],
        "mu_t": mu[i],
        "rec_t": rec[i],
        "time": time_arr[i],
        "event": event,
        "pops": pops,
        "value": value_arr[i]
    } for i in range(sim_number)]

    # check that there are not more processors requested than available
    available = multiprocessing.cpu_count()
    if nprocs > available:
        print(
            f"not {nprocs} processors available, setting to {available}"
        )
        nprocs = available

    # perform sims
    vcf = False
    if dry_run:
        # only the first parameter set is exercised
        if param_gen:
            run_simulation(param_gen[0])
    elif stats_config:
        stats_dt = read_config_stats(stats_config)
        pops_outfile = open(f"{outfile}.pop_stats.txt", 'w')
        try:
            # write headers
            pops_outfile, header_len, header = headers(pops_outfile, stats_dt)
            if nprocs == 1:
                for param in tqdm(param_gen):
                    pop_stats_arr = run_simulation(param)
                    pops_outfile = stats_out(pop_stats_arr, pops_outfile,
                                             nprocs)
            else:
                # chunk and MP: results are flushed after every chunk
                nk = nprocs * 10  # tricky, how many jobs for each processor
                chunk_list = [
                    param_gen[i:i + nk] for i in range(0, len(param_gen), nk)
                ]
                chunksize = ceil(nk / nprocs)
                with multiprocessing.Pool(nprocs) as pool:
                    for i, args in enumerate(chunk_list):
                        pop_stats_arr = pool.map(run_simulation,
                                                 args,
                                                 chunksize=chunksize)
                        pops_outfile = stats_out(pop_stats_arr, pops_outfile,
                                                 nprocs)
                        print(i)
        finally:
            pops_outfile.close()
    else:
        print("No stats file given with msprime, default VCF")
        vcf = True
        for contig, param in enumerate(param_gen):
            mts = run_simulation(param)
            with open(f"{outfile}.contig_{contig}.vcf", "w") as vcf_file:
                mts.write_vcf(vcf_file, contig_id=f"{contig}")