Exemple #1
0
    def test_simulate_with_mask(self):
        """Check that both engines honour exclusion and inclusion masks.

        The same contig is simulated twice per engine: first with the lower
        half of the sequence excluded, then with the lower half as the only
        included region. Mutation positions must fall in the permitted half.
        """
        seq_len = 1000
        midpoint = seq_len // 2

        def mutation_positions(tree_seq):
            # Genomic position of the site that carries each mutation.
            return np.array(
                [tree_seq.site(mut.site).position
                 for mut in tree_seq.mutations()])

        contig = stdpopsim.get_species("HomSap").get_contig(length=seq_len)
        contig.mutation_rate = 1e-3
        contig.recombination_map = msprime.RateMap.uniform(seq_len, 0)
        sample_sets = [msprime.SampleSet(2, population=0, ploidy=1)]
        demography = stdpopsim.PiecewiseConstantSize(100)

        for engine_name in ("msprime", "slim"):
            engine = stdpopsim.get_engine(engine_name)

            # Exclusion mask over [0, midpoint): every mutation must land
            # outside the masked interval, i.e. at or beyond the midpoint.
            contig.inclusion_mask = None
            contig.exclusion_mask = np.array([[0, midpoint]])
            ts = engine.simulate(
                demographic_model=demography,
                contig=contig,
                samples=sample_sets,
            )
            assert np.all(mutation_positions(ts) >= midpoint)

            # Inclusion mask over [0, midpoint): every mutation must land
            # inside the masked interval, i.e. strictly below the midpoint.
            contig.exclusion_mask = None
            contig.inclusion_mask = np.array([[0, midpoint]])
            ts = engine.simulate(
                demographic_model=demography,
                contig=contig,
                samples=sample_sets,
            )
            assert np.all(mutation_positions(ts) < midpoint)
Exemple #2
0
    def get_samples(self, *args):
        """
        Returns a list of ``msprime.SampleSet`` objects, with the number of
        samples from each population determined by the positional arguments.
        For instance, ``model.get_samples(2, 5, 7)`` would return three sample
        sets totalling 14 samples: two from the model's first population
        (i.e., ``model.populations[0]``), five from the model's second
        population, and seven from the model's third population.

        One sample set is produced for every argument whose population allows
        sampling, even when the requested count is zero. Each sample set uses
        the population's ``sampling_time`` and is haploid (``ploidy=1``).

        The number of arguments must be less than or equal to the number of
        "sampling" populations, ``model.num_sampling_populations``;
        if the number of arguments is less than the number of sampling
        populations, then remaining numbers are treated as zero.

        :param args: Number of samples to draw from each population, given in
            the order of ``self.populations``.
        :return: A list of ``msprime.SampleSet`` objects.
        :raises ValueError: If a positive number of samples is requested from
            a population that does not allow sampling.
        """
        samples = []
        for pop_index, n in enumerate(args):
            population = self.populations[pop_index]
            if population.allow_samples:
                samples.append(
                    msprime.SampleSet(
                        num_samples=n,
                        population=pop_index,
                        time=population.sampling_time,
                        ploidy=1,  # Avoid breaking too much at once.
                    ))
            elif n > 0:
                # The f-prefix is required so the offending index is actually
                # interpolated into the message.
                raise ValueError(
                    f"Samples requested from non-sampling population {pop_index}"
                )
        return samples
Exemple #3
0
 def test_wf_hudson_ancient_samples(self):
     """DTWF-then-Hudson simulation with samples of increasing age.

     One sample is taken at each integer time 0..19. The first 10 time units
     run under the discrete-time Wright-Fisher model, after which the
     simulation switches to the Hudson coalescent. Node times inside the
     DTWF phase must be whole numbers; node times after it must not be.
     """
     dtwf_duration = 10
     num_samples = 20
     sample_sets = [
         msprime.SampleSet(1, time=age, population=0)
         for age in range(num_samples)
     ]
     ts = msprime.sim_ancestry(
         sample_sets,
         population_size=10,
         model=[
             msprime.DiscreteTimeWrightFisher(duration=dtwf_duration),
             "hudson",
         ],
         random_seed=2,
     )
     assert ts.first().num_roots == 1

     # Only look at nodes with no flags set (i.e. skip the sample nodes).
     node_table = ts.tables.nodes
     unflagged_times = node_table.time[node_table.flags == 0]

     in_dtwf_phase = np.logical_and(
         unflagged_times > 0, unflagged_times < dtwf_duration)
     dtwf_times = unflagged_times[in_dtwf_phase]
     assert dtwf_times.shape[0] > 0
     assert np.all(np.floor(dtwf_times) == dtwf_times)

     coalescent_times = unflagged_times[unflagged_times > dtwf_duration]
     assert coalescent_times.shape[0] > 0
     assert np.all(np.floor(coalescent_times) != coalescent_times)
def _discoal_str_to_msprime(args):
    # takes discoal command line as input and returns an iterator over the
    # msprime tree sequences.

    tokens = args.split(" ")
    # positional args
    sample_size = int(tokens[0])
    nreps = int(tokens[1])
    seq_length = int(tokens[2])
    # parse discoal command line for params
    # init ones we definitely need for comparison
    theta = rho = alpha = sweep_site = sweep_mod_time = None
    refsize = 1e6
    for i in range(3, len(tokens)):
        # pop size change case
        if tokens[i] == "-en":
            raise ValueError(
                "sweeps with population size changes remain unimplemented")
        # migration rate case
        if (tokens[i] == "-m") or (tokens[i] == "-p"):
            raise ValueError(
                "sweeps with multiple populations remain unimplemented")
        # split or admixture case
        if (tokens[i] == "-ea") or (tokens[i] == "-ed"):
            raise ValueError("sweeps with splits or admixture not supported")
        # sweep params
        if tokens[i] == "-x":
            sweep_site = float(tokens[i + 1])
        if (tokens[i] == "-ws") or (tokens[i] == "-wd") or (tokens[i]
                                                            == "-wn"):
            sweep_mod_time = float(tokens[i + 1])
        if tokens[i] == "-a":
            alpha = float(tokens[i + 1])
        if tokens[i] == "-N":
            refsize = float(tokens[i + 1])
        # coalescent params
        if tokens[i] == "-t":
            theta = float(tokens[i + 1])
        if tokens[i] == "-r":
            rho = float(tokens[i + 1])
    mod_list = []
    if alpha is not None:
        # sweep model
        s = alpha / (2 * refsize)
        mod = msprime.SweepGenicSelection(
            position=np.floor(sweep_site * seq_length),
            start_frequency=1.0 / (2 * refsize),
            end_frequency=1.0 - (1.0 / (2 * refsize)),
            s=s * 2,  # discoal fitness model is 1, 1+s, 1+2s
            dt=1e-6,
        )
        mod_list.append(msprime.StandardCoalescent(duration=sweep_mod_time))
        mod_list.append(mod)
        # if an event is defined from discoal line
        # best thing to do is rescale to Ne=0.25
        # so that time scale are consistent
        # see note at msprime/cli.py line 626
        # and following for alternate solution
        if sweep_mod_time > 0:
            refsize = 0.25
            mod.s = alpha / refsize
    # append final model
    mod_list.append("hudson")
    # scale theta and rho
    recomb_rate = rho / (4 * refsize * (seq_length - 1))
    mu = theta / (4 * refsize * seq_length)
    # We're only interested in ancestry sim here.
    assert mu == 0
    replicates = msprime.sim_ancestry(
        [msprime.SampleSet(sample_size, ploidy=1)],
        population_size=refsize,
        model=mod_list,
        recombination_rate=recomb_rate,
        sequence_length=seq_length,
        discrete_genome=False,
        num_replicates=nreps,
    )
    return replicates
Exemple #5
0
####################
# Run

# Simulate an ancestral history for 3 diploid samples under the coalescent
# with recombination on a seqlength region with parameters defined above.
# The species tree joins "true" and "query" at time Tqt, and joins their
# ancestor with "false" at time Ttf.
newick = f"((true:{Tqt!s},query:{Tqt!s}):{Ttf - Tqt!s},false:{Ttf!s})"
demography = msprime.Demography.from_species_tree(
    newick,
    time_units="myr",
    initial_size=pop_size,
    generation_time=generation_length,
)

# One individual per population, each sampled at its own age.
ts = msprime.sim_ancestry(
    samples=[
        msprime.SampleSet(1, population="true", time=true_age),
        msprime.SampleSet(1, population="query", time=query_age),
        msprime.SampleSet(1, population="false", time=false_age),
    ],
    demography=demography,
    recombination_rate=recomb_rate,
    ploidy=2,
    sequence_length=seqlength,
    random_seed=123456,
)

# No mutation model is passed, so msprime's default (msprime.JC69) is used.
mts = msprime.sim_mutations(ts, rate=mutation_rate, random_seed=5678)

# Create arbitrary sequence of same length, because msprime doesn't bother
# to simulate non-variable sites.
bases = ["A", "C", "T", "G"]
bgseq = random.choices(bases, k=seqlength)