def log_arg_likelihood(arg, recombination_rate, Ne=1):
    """
    Returns the result of the low-level ``log_likelihood_arg`` routine evaluated
    on the tables of the specified ARG.

    :param arg: Object exposing ``tables.asdict()``; presumably a tree sequence
        storing a full ARG (TODO confirm against callers).
    :param recombination_rate: Passed through unchanged to the low-level routine.
    :param Ne: Passed through unchanged to the low-level routine (default: 1).
    :return: Whatever ``_msprime.log_likelihood_arg`` returns.
    """
    # Convert the tables into the interchange format used by the low-level code.
    ll_tables = _msprime.LightweightTableCollection()
    ll_tables.fromdict(arg.tables.asdict())
    return _msprime.log_likelihood_arg(
        ll_tables,
        recombination_rate=recombination_rate,
        Ne=Ne,
    )
def log_arg_likelihood(ts, recombination_rate, Ne=1):
    """
    Returns the log probability of the stored tree sequence under the Hudson ARG.
    An exact expression for this probability is given in equation (1) of
    `Kuhner et al. (2000) <https://www.genetics.org/content/156/3/1393>`_.

    Branch lengths are assumed to be stored in generations, which gives a
    coalescence rate of :math:`1 / (2 N_e)` per pair of lineages.

    .. warning::
        The stored tree sequence must store the full realisation of the ARG,
        including all recombination events and all common ancestor events,
        regardless of whether the recombinations cause a change in the
        ancestral tree or whether the common ancestor events cause coalescence
        of ancestral material. See :ref:`sec_tutorial_record_full_arg` for
        details of this data structure, and how to generate them using
        ``msprime``.

    :param tskit.TreeSequence ts: The tree sequence object.
    :param float recombination_rate: The per-link, per-generation recombination
        probability. Must be non-negative.
    :param float Ne: The diploid effective population size.
    :return: The log probability of the tree sequence under the Hudson ancestral
        recombination graph model. If the recombination rate is zero and the
        tree sequence contains at least one recombination event, then returns
        `-float("inf")`.
    """
    # The low-level code works on a LightweightTableCollection, so the tables
    # are handed over through their dictionary representation.
    ll_tables = _msprime.LightweightTableCollection()
    ll_tables.fromdict(ts.tables.asdict())
    return _msprime.log_likelihood_arg(
        ll_tables,
        recombination_rate=recombination_rate,
        Ne=Ne,
    )
def test_top_keys_match(self):
    """
    The top-level keys of the example tables' dict survive a round trip
    through a LightweightTableCollection.
    """
    source = get_example_tables().asdict()
    lwt = c_module.LightweightTableCollection()
    lwt.fromdict(source)
    round_tripped = lwt.asdict()
    self.assertEqual(source.keys(), round_tripped.keys())
def test_missing_sequence_length(self):
    """
    ``fromdict`` must raise ValueError when the ``sequence_length`` entry is
    absent from the input dictionary.
    """
    table_data = get_example_tables().asdict()
    table_data.pop("sequence_length")
    lwt = c_module.LightweightTableCollection()
    with self.assertRaises(ValueError):
        lwt.fromdict(table_data)
def verify_optional_column(self, tables, table_len, table_name, col_name):
    """
    Checks that setting ``col_name`` of ``table_name`` to None is accepted by
    ``fromdict`` and that the column reads back as ``table_len`` values of -1.
    """
    table_data = tables.asdict()
    table_data[table_name][col_name] = None
    lwt = c_module.LightweightTableCollection()
    lwt.fromdict(table_data)
    result = lwt.asdict()
    # A column supplied as None is expected to come back filled with -1.
    expected = np.full(table_len, -1, dtype=np.int32)
    self.assertTrue(np.array_equal(result[table_name][col_name], expected))
def test_missing_tables(self):
    """
    Removing any single table from the input dictionary must make ``fromdict``
    raise ValueError.
    """
    tables = get_example_tables()
    # Every top-level key other than sequence_length names a table.
    table_names = set(tables.asdict()) - {"sequence_length"}
    for missing in table_names:
        table_data = tables.asdict()
        del table_data[missing]
        lwt = c_module.LightweightTableCollection()
        with self.assertRaises(ValueError):
            lwt.fromdict(table_data)
def log_arg_likelihood(arg, recombination_rate, Ne=0.25):
    """
    Returns the result of the low-level ``log_likelihood_arg`` routine evaluated
    on the tables of the specified ARG.

    :param arg: Object exposing ``tables.asdict()``; presumably a tree sequence
        storing a full ARG (TODO confirm against callers).
    :param recombination_rate: Passed through unchanged to the low-level routine.
    :param Ne: Passed through unchanged to the low-level routine (default: 0.25).
    :return: Whatever ``_msprime.log_likelihood_arg`` returns.
    """
    # TODO: Ne should default to 1 for compatibility with msprime.simulate. It
    # is set to 1/4 for now to keep the tests working.

    # Convert the tables into the interchange format used by the low-level code.
    ll_tables = _msprime.LightweightTableCollection()
    ll_tables.fromdict(arg.tables.asdict())
    return _msprime.log_likelihood_arg(
        ll_tables,
        recombination_rate=recombination_rate,
        Ne=Ne,
    )
def test_bad_top_level_types(self):
    """
    Replacing any top-level entry of the input dictionary with a list must make
    ``fromdict`` raise TypeError.
    """
    tables = get_example_tables()
    for key in tables.asdict():
        bad_type_dict = tables.asdict()
        # A list is the wrong type for both the tables and sequence_length,
        # and is expected to be rejected with a TypeError.
        bad_type_dict[key] = ["12345"]
        lwt = c_module.LightweightTableCollection()
        with self.assertRaises(TypeError):
            lwt.fromdict(bad_type_dict)
def verify_offset_pair(self, tables, table_len, table_name, col_name):
    """
    Checks how ``fromdict`` treats the column ``col_name`` of ``table_name``
    together with its companion ``<col_name>_offset`` column.
    """
    offset_col = col_name + "_offset"

    # Both columns None: accepted, and read back as an empty column with an
    # all-zero offset array of length table_len + 1.
    table_data = tables.asdict()
    table_data[table_name][col_name] = None
    table_data[table_name][offset_col] = None
    lwt = c_module.LightweightTableCollection()
    lwt.fromdict(table_data)
    result = lwt.asdict()[table_name]
    self.assertEqual(result[col_name].shape, (0,))
    expected_offset = np.zeros(table_len + 1, dtype=np.uint32)
    self.assertTrue(np.array_equal(result[offset_col], expected_offset))

    # Setting only one of the pair to None raises a TypeError.
    for lone_none in (col_name, offset_col):
        table_data = tables.asdict()
        table_data[table_name][lone_none] = None
        lwt = c_module.LightweightTableCollection()
        with self.assertRaises(TypeError):
            lwt.fromdict(table_data)

    # An offset array whose leading entries are reversed (final entry left in
    # place) must be rejected with a LibraryError.
    table_data = tables.asdict()
    original_offset = table_data[table_name][offset_col]
    scrambled = np.zeros_like(original_offset)
    scrambled[:-1] = original_offset[:-1][::-1]
    scrambled[-1] = original_offset[-1]
    table_data[table_name][offset_col] = scrambled
    lwt = c_module.LightweightTableCollection()
    with self.assertRaises(c_module.LibraryError):
        lwt.fromdict(table_data)
def verify_required_columns(self, tables, table_name, required_cols):
    """
    Checks that ``table_name`` loads when only ``required_cols`` are supplied
    (every other column None), and that setting any one required column to
    None makes ``fromdict`` raise TypeError.
    """
    table_data = tables.asdict()
    # Blank out every column, then restore only the required ones.
    minimal = dict.fromkeys(table_data[table_name])
    minimal.update({col: table_data[table_name][col] for col in required_cols})
    table_data[table_name] = minimal
    lwt = c_module.LightweightTableCollection()
    lwt.fromdict(table_data)
    round_tripped = lwt.asdict()
    for col in required_cols:
        self.assertTrue(
            np.array_equal(round_tripped[table_name][col], minimal[col])
        )

    # Removing any one of these required columns gives an error.
    for col in required_cols:
        table_data = tables.asdict()
        table_data[table_name] = {**minimal, col: None}
        lwt = c_module.LightweightTableCollection()
        with self.assertRaises(TypeError):
            lwt.fromdict(table_data)
def verify_columns(self, value):
    """
    For every column of every table in the example tables, substitutes
    ``value`` for that column and checks that ``fromdict`` raises ValueError.
    """
    tables = get_example_tables()
    reference = tables.asdict()
    table_names = set(reference) - {"sequence_length"}
    for table_name in table_names:
        columns = reference[table_name]
        for colname in columns:
            # Exactly one column is swapped out; the rest stay untouched.
            altered = {**columns, colname: value}
            lwt = c_module.LightweightTableCollection()
            table_data = tables.asdict()
            table_data[table_name] = altered
            with self.assertRaises(ValueError):
                lwt.fromdict(table_data)
def test_missing_columns(self):
    """
    Deleting any single column from any table must make ``fromdict`` raise
    ValueError.
    """
    tables = get_example_tables()
    reference = tables.asdict()
    table_names = set(reference) - {"sequence_length"}
    for table_name in table_names:
        columns = reference[table_name]
        for colname in columns:
            # Keep every column except the one under test.
            remaining = {
                name: data for name, data in columns.items() if name != colname
            }
            lwt = c_module.LightweightTableCollection()
            table_data = tables.asdict()
            table_data[table_name] = remaining
            with self.assertRaises(ValueError):
                lwt.fromdict(table_data)
def verify(self, num_rows):
    """
    For every column of every table in the example tables, truncates that one
    column to its first ``num_rows`` entries and checks that ``fromdict``
    raises ValueError.
    """
    tables = get_example_tables()
    reference = tables.asdict()
    table_names = set(reference) - {"sequence_length"}
    for table_name in sorted(table_names):
        columns = reference[table_name]
        for colname in sorted(columns):
            # Only this column is shortened, so its length no longer agrees
            # with the other columns of the table.
            shortened = columns[colname][:num_rows].copy()
            altered = {**columns, colname: shortened}
            lwt = c_module.LightweightTableCollection()
            table_data = tables.asdict()
            table_data[table_name] = altered
            with self.assertRaises(ValueError):
                lwt.fromdict(table_data)
def test_table_columns_match(self):
    """
    Each listed table keeps the same set of column names after a round trip
    through a LightweightTableCollection.
    """
    source = get_example_tables().asdict()
    lwt = c_module.LightweightTableCollection()
    lwt.fromdict(source)
    round_tripped = lwt.asdict()
    table_names = (
        "individuals",
        "nodes",
        "edges",
        "migrations",
        "sites",
        "mutations",
        "populations",
        "provenances",
    )
    for table_name in table_names:
        self.assertEqual(
            source[table_name].keys(), round_tripped[table_name].keys()
        )
def verify(self, tables):
    """
    Checks that ``tables`` compares equal to the TableCollection rebuilt from
    a round trip through a LightweightTableCollection.
    """
    lwt = c_module.LightweightTableCollection()
    lwt.fromdict(tables.asdict())
    round_tripped = lwt.asdict()
    rebuilt = tskit.TableCollection.fromdict(round_tripped)
    self.assertEqual(tables, rebuilt)
def mutate(
    tree_sequence,
    rate=None,
    random_seed=None,
    model=None,
    keep=False,
    start_time=None,
    end_time=None,
    discrete=False,
):
    """
    Simulates mutations on the specified ancestry and returns the resulting
    :class:`tskit.TreeSequence`. Mutations are generated at the specified rate in
    measured generations. Mutations are generated under the infinite sites
    model, and so the rate of new mutations is per unit of sequence length per
    generation.

    If a random seed is specified, this is used to seed the random number
    generator. If the same seed is specified and all other parameters are equal
    then the same mutations will be generated. If no random seed is specified
    then one is generated automatically.

    If the ``model`` parameter is specified, this determines the model under
    which mutations are generated; it must be a :class:`.MutationModel`
    instance. This parameter is useful if you wish to obtain sequences with
    letters from the nucleotide alphabet rather than the default 0/1 states.
    By default mutations from the infinite sites model with a binary alphabet
    are generated.

    By default, sites and mutations in the parameter tree sequence are
    discarded. If the ``keep`` parameter is true, however, *additional*
    mutations are simulated. Under the infinite sites mutation model, all new
    mutations generated will occur at distinct positions from each other and
    from any existing mutations (by rejection sampling).

    The time interval over which mutations can occur may be controlled
    using the ``start_time`` and ``end_time`` parameters. The ``start_time``
    defines the lower bound (in time-ago) on this interval and ``end_time``
    the upper bound. Note that we may have mutations associated with
    nodes with time <= ``start_time`` since mutations store the node at the
    bottom (i.e., towards the leaves) of the branch that they occur on.

    :param tskit.TreeSequence tree_sequence: The tree sequence onto which we
        wish to throw mutations.
    :param float rate: The rate of mutation per generation, as either a
        single number (for a uniform rate) or as a
        :class:`.MutationMap`. (Default: 0).
    :param int random_seed: The random seed. If this is `None`, a
        random seed will be automatically generated. Valid random
        seeds must be between 1 and :math:`2^{32} - 1`.
    :param MutationModel model: The mutation model to use when generating
        mutations. If not specified or None, the :class:`.BinaryMutations`
        mutation model is used.
    :param bool keep: Whether to keep existing mutations (default: False).
    :param float start_time: The minimum time ago at which a mutation can
        occur. (Default: no restriction.)
    :param float end_time: The maximum time ago at which a mutation can occur
        (Default: no restriction).
    :param bool discrete: Whether to generate mutations at only integer positions
        along the genome. Default is False, which produces infinite-sites
        mutations at floating-point positions.
    :return: The :class:`tskit.TreeSequence` object resulting from overlaying
        mutations on the input tree sequence.
    :rtype: :class:`tskit.TreeSequence`
    :raises ValueError: If ``tree_sequence`` has no ``tables`` attribute, or if
        ``start_time`` is greater than ``end_time``.
    :raises TypeError: If ``rate`` is neither convertible to float nor a
        :class:`.MutationMap`, or if ``model`` is not a
        :class:`.MutationModel`.
    """
    try:
        tables = tree_sequence.tables
    except AttributeError as err:
        # Chain explicitly so the underlying attribute error stays visible.
        raise ValueError("First argument must be a TreeSequence instance.") from err
    seed = random_seed
    if random_seed is None:
        seed = core.get_random_seed()
    else:
        seed = int(seed)

    if rate is None:
        rate = 0
    try:
        # A plain number becomes a uniform map over the whole sequence.
        rate = float(rate)
        rate_map = MutationMap(
            position=[0.0, tree_sequence.sequence_length], rate=[rate, 0.0]
        )
    except TypeError:
        # Not convertible to float: treat the argument as the map itself and
        # let the isinstance check below reject anything else.
        rate_map = rate
    if not isinstance(rate_map, MutationMap):
        raise TypeError("rate must be a float or a MutationMap")

    # With no bound given, use the widest representable float range.
    if start_time is None:
        start_time = -sys.float_info.max
    else:
        start_time = float(start_time)
    if end_time is None:
        end_time = sys.float_info.max
    else:
        end_time = float(end_time)
    if start_time > end_time:
        raise ValueError("start_time must be <= end_time")
    keep = bool(keep)
    discrete = bool(discrete)

    if model is None:
        model = BinaryMutations()
    if not isinstance(model, MutationModel):
        raise TypeError("model must be a MutationModel")

    # Provenance records the current value of every argument of this call, as
    # found in the frame's locals, i.e. after the normalisation above. Do not
    # move this above the conversions.
    argspec = inspect.getargvalues(inspect.currentframe())
    parameters = {
        "command": "mutate",
        **{arg: argspec.locals[arg] for arg in argspec.args},
    }
    # Store the seed actually used, which may have been generated above.
    parameters["random_seed"] = seed
    encoded_provenance = provenance.json_encode_provenance(
        provenance.get_provenance_dict(parameters)
    )

    rng = _msprime.RandomGenerator(seed)
    mutation_generator = _msprime.MutationGenerator(
        random_generator=rng, rate_map=rate_map._ll_map, model=model
    )
    # The generator operates on a LightweightTableCollection, so the tables
    # are passed in and read back out through their dictionary form.
    lwt = _msprime.LightweightTableCollection()
    lwt.fromdict(tables.asdict())
    mutation_generator.generate(
        lwt,
        keep=keep,
        start_time=start_time,
        end_time=end_time,
        discrete=discrete,
    )

    tables = tskit.TableCollection.fromdict(lwt.asdict())
    tables.provenances.add_row(encoded_provenance)
    return tables.tree_sequence()
def mutate(
    tree_sequence,
    rate=None,
    random_seed=None,
    model=None,
    keep=False,
    start_time=None,
    end_time=None,
):
    """
    Simulates mutations on the specified ancestry and returns the resulting
    :class:`tskit.TreeSequence`. Mutations are generated at the specified rate in
    measured generations. Mutations are generated under the infinite sites
    model, and so the rate of new mutations is per unit of sequence length per
    generation.

    If a random seed is specified, this is used to seed the random number
    generator. If the same seed is specified and all other parameters are equal
    then the same mutations will be generated. If no random seed is specified
    then one is generated automatically.

    If the ``model`` parameter is specified, this determines the model under
    which mutations are generated. Currently only the :class:`.InfiniteSites`
    mutation model is supported. This parameter is useful if you wish to obtain
    sequences with letters from the nucleotide alphabet rather than the default
    0/1 states. By default mutations from the infinite sites model with a binary
    alphabet are generated.

    By default, sites and mutations in the parameter tree sequence are
    discarded. If the ``keep`` parameter is true, however, *additional*
    mutations are simulated. Under the infinite sites mutation model, all new
    mutations generated will occur at distinct positions from each other and
    from any existing mutations (by rejection sampling).

    The time interval over which mutations can occur may be controlled
    using the ``start_time`` and ``end_time`` parameters. The ``start_time``
    defines the lower bound (in time-ago) on this interval and ``end_time``
    the upper bound. Note that we may have mutations associated with
    nodes with time <= ``start_time`` since mutations store the node at the
    bottom (i.e., towards the leaves) of the branch that they occur on.

    :param tskit.TreeSequence tree_sequence: The tree sequence onto which we
        wish to throw mutations.
    :param float rate: The rate of mutation per generation. (Default: 0).
    :param int random_seed: The random seed. If this is `None`, a
        random seed will be automatically generated. Valid random
        seeds must be between 1 and :math:`2^{32} - 1`.
    :param MutationModel model: The mutation model to use when generating
        mutations. If not specified or None, the :class:`.InfiniteSites`
        mutation model is used.
    :param bool keep: Whether to keep existing mutations (default: False).
    :param float start_time: The minimum time at which a mutation can
        occur. (Default: no restriction.)
    :param float end_time: The maximum time at which a mutation can occur
        (Default: no restriction).
    :return: The :class:`tskit.TreeSequence` object resulting from overlaying
        mutations on the input tree sequence.
    :rtype: :class:`tskit.TreeSequence`
    :raises ValueError: If ``tree_sequence`` has no ``tables`` attribute, or if
        ``start_time`` is greater than ``end_time``.
    :raises TypeError: If ``model`` has no ``alphabet`` attribute.
    """
    try:
        tables = tree_sequence.tables
    except AttributeError as err:
        # Chain explicitly so the underlying attribute error stays visible.
        raise ValueError("First argument must be a TreeSequence instance.") from err
    if random_seed is None:
        random_seed = simulations._get_random_seed()
    random_seed = int(random_seed)

    rng = _msprime.RandomGenerator(random_seed)
    if model is None:
        model = InfiniteSites()
    try:
        alphabet = model.alphabet
    except AttributeError as err:
        raise TypeError("model must be an InfiniteSites instance") from err
    if rate is None:
        rate = 0
    rate = float(rate)
    keep = bool(keep)

    parameters = {
        "command": "mutate",
        "rate": rate,
        "random_seed": random_seed,
        "keep": keep,
    }

    # The time bounds are recorded in provenance only when given explicitly;
    # otherwise the widest representable float range is used.
    if start_time is None:
        start_time = -sys.float_info.max
    else:
        start_time = float(start_time)
        parameters["start_time"] = start_time

    if end_time is None:
        end_time = sys.float_info.max
    else:
        end_time = float(end_time)
        parameters["end_time"] = end_time
    # TODO Add a JSON representation of the model to the provenance.
    provenance_dict = provenance.get_provenance_dict(parameters)

    if start_time > end_time:
        raise ValueError("start_time must be <= end_time")

    mutation_generator = _msprime.MutationGenerator(
        rng, rate, alphabet=alphabet, start_time=start_time, end_time=end_time
    )
    # The generator operates on a LightweightTableCollection, so the tables
    # are passed in and read back out through their dictionary form.
    lwt = _msprime.LightweightTableCollection()
    lwt.fromdict(tables.asdict())
    mutation_generator.generate(lwt, keep=keep)

    tables = tskit.TableCollection.fromdict(lwt.asdict())
    tables.provenances.add_row(json.dumps(provenance_dict))
    return tables.tree_sequence()
# Writes a pickle of the LightweightTableCollection dictionary for a small
# two-population simulation, named after the running msprime version, into
# the directory containing this script.
import pathlib
import pickle

import _msprime
import msprime

# Two populations of five samples each, with symmetric migration.
pop_configs = [msprime.PopulationConfiguration(5) for _ in range(2)]
migration_matrix = [
    [0, 1],
    [1, 0],
]
ts = msprime.simulate(
    population_configurations=pop_configs,
    migration_matrix=migration_matrix,
    mutation_rate=1,
    record_migrations=True,
    random_seed=1,
)

# Round the tables through the low-level collection so the pickled dictionary
# is the one produced by LightweightTableCollection.asdict().
lwt = _msprime.LightweightTableCollection()
lwt.fromdict(ts.tables.asdict())

test_dir = pathlib.Path(__file__).parent
with open(test_dir / f"msprime-{msprime.__version__}.pkl", "wb") as f:
    pickle.dump(lwt.asdict(), f)