def _get_upgrade_provenance(root):
    """
    Builds the encoded provenance record for upgrading the specified HDF5
    file.

    The ``format_version`` attribute of ``root`` is converted to a list of
    ints and recorded as the ``source_version`` parameter of an ``"upgrade"``
    provenance dictionary. The dictionary is serialised to JSON and returned
    as bytes.
    """
    # TODO add more parameters here like filename, etc.
    source_version = [int(part) for part in root.attrs["format_version"]]
    record = provenance.get_provenance_dict(
        "upgrade", {"source_version": source_version})
    return json.dumps(record).encode()
def _get_v2_provenance(command, attrs):
    """
    Returns the V2 tree provenance attributes reformatted as a provenance
    record.

    :param command: The command name handed to
        ``provenance.get_provenance_dict``.
    :param attrs: A mapping providing the ``"environment"`` and
        ``"parameters"`` provenance strings, each expected to hold JSON.
    :return: The provenance record serialised to JSON and encoded as bytes.
        Its ``"version"`` entry is taken from the ``msprime_version`` key of
        the decoded environment (``"Unknown_version"`` if absent) and its
        ``"environment"`` entry is the decoded environment itself.
    """
    environment = {}
    parameters = {}
    # Try to get the provenance strings. Malformed JSON should not prevent us
    # from finishing the conversion, so a decoding failure is only logged and
    # the corresponding empty dict is used instead.
    # NOTE: logging.warning is used rather than the deprecated logging.warn
    # alias, which emits a DeprecationWarning.
    try:
        environment = json.loads(str(attrs["environment"]))
    except ValueError:
        logging.warning("Failed to convert environment provenance")
    try:
        parameters = json.loads(str(attrs["parameters"]))
    except ValueError:
        logging.warning("Failed to convert parameters provenance")
    provenance_dict = provenance.get_provenance_dict(command, parameters)
    provenance_dict["version"] = environment.get(
        "msprime_version", "Unknown_version")
    provenance_dict["environment"] = environment
    return json.dumps(provenance_dict).encode()
def mutate(
        tree_sequence, rate=None, random_seed=None, model=None, keep=False,
        start_time=None, end_time=None):
    """
    Simulates mutations on the specified ancestry and returns the resulting
    :class:`tskit.TreeSequence`. Mutations are generated at the specified rate
    in measured generations. Mutations are generated under the infinite sites
    model, and so the rate of new mutations is per unit of sequence length per
    generation.

    If a random seed is specified, this is used to seed the random number
    generator. If the same seed is specified and all other parameters are
    equal then the same mutations will be generated. If no random seed is
    specified then one is generated automatically.

    If the ``model`` parameter is specified, this determines the model under
    which mutations are generated. Currently only the :class:`.InfiniteSites`
    mutation model is supported. This parameter is useful if you wish to
    obtain sequences with letters from the nucleotide alphabet rather than the
    default 0/1 states. By default mutations from the infinite sites model
    with a binary alphabet are generated.

    By default, sites and mutations in the parameter tree sequence are
    discarded. If the ``keep`` parameter is true, however, *additional*
    mutations are simulated. Under the infinite sites mutation model, all new
    mutations generated will occur at distinct positions from each other and
    from any existing mutations (by rejection sampling).

    The time interval over which mutations can occur may be controlled using
    the ``start_time`` and ``end_time`` parameters. The ``start_time`` defines
    the lower bound (in time-ago) on this interval and ``end_time`` the upper
    bound. Note that we may have mutations associated with nodes with
    time <= ``start_time`` since mutations store the node at the bottom (i.e.,
    towards the leaves) of the branch that they occur on.

    :param tskit.TreeSequence tree_sequence: The tree sequence onto which we
        wish to throw mutations.
    :param float rate: The rate of mutation per generation. (Default: 0).
    :param int random_seed: The random seed. If this is `None`, a random seed
        will be automatically generated. Valid random seeds must be between 1
        and :math:`2^{32} - 1`.
    :param MutationModel model: The mutation model to use when generating
        mutations. If not specified or None, the :class:`.InfiniteSites`
        mutation model is used.
    :param bool keep: Whether to keep existing mutations (default: False).
    :param float start_time: The minimum time at which a mutation can
        occur. (Default: no restriction.)
    :param float end_time: The maximum time at which a mutation can occur
        (Default: no restriction).
    :return: The :class:`tskit.TreeSequence` object resulting from overlaying
        mutations on the input tree sequence.
    :rtype: :class:`tskit.TreeSequence`
    :raises ValueError: If the first argument has no ``tables`` attribute, or
        if ``start_time`` is greater than ``end_time``.
    :raises TypeError: If ``model`` has no ``alphabet`` attribute.
    """
    try:
        tables = tree_sequence.tables
    except AttributeError:
        raise ValueError("First argument must be a TreeSequence instance.")

    seed = simulations._get_random_seed() if random_seed is None else random_seed
    seed = int(seed)
    rng = _msprime.RandomGenerator(seed)

    mutation_model = InfiniteSites() if model is None else model
    try:
        alphabet = mutation_model.alphabet
    except AttributeError:
        raise TypeError("model must be an InfiniteSites instance")

    mutation_rate = float(0 if rate is None else rate)
    keep_existing = bool(keep)

    parameters = dict(
        command="mutate", rate=mutation_rate, random_seed=seed,
        keep=keep_existing)
    # A bound is only added to the provenance parameters when the caller
    # supplied it; a missing bound falls back to the float extremes.
    if start_time is not None:
        start_time = float(start_time)
        parameters["start_time"] = start_time
    else:
        start_time = -sys.float_info.max
    if end_time is not None:
        end_time = float(end_time)
        parameters["end_time"] = end_time
    else:
        end_time = sys.float_info.max

    # TODO Add a JSON representation of the model to the provenance.
    provenance_dict = provenance.get_provenance_dict(parameters)
    if end_time < start_time:
        raise ValueError("start_time must be <= end_time")

    generator = _msprime.MutationGenerator(
        rng, mutation_rate, alphabet=alphabet, start_time=start_time,
        end_time=end_time)
    # Round-trip the tables through the low-level collection, which is what
    # the generator operates on.
    lightweight = _msprime.LightweightTableCollection()
    lightweight.fromdict(tables.asdict())
    generator.generate(lightweight, keep=keep_existing)
    mutated = tskit.TableCollection.fromdict(lightweight.asdict())
    mutated.provenances.add_row(json.dumps(provenance_dict))
    return mutated.tree_sequence()
def add_provenance(provenance_table, method_name):
    """
    Appends a provenance row for the given ``tsutil`` method to the table.

    The record is obtained from ``provenance.get_provenance_dict`` using the
    name ``"tsutil.<method_name>"`` and is stored as a JSON string through
    ``provenance_table.add_row``.
    """
    qualified_name = "tsutil.{}".format(method_name)
    record = json.dumps(provenance.get_provenance_dict(qualified_name))
    provenance_table.add_row(record)
def simulate(
        sample_size=None, Ne=1, length=None, recombination_rate=None,
        recombination_map=None, mutation_rate=None,
        population_configurations=None, migration_matrix=None,
        demographic_events=None, samples=None, model=None,
        record_migrations=False, random_seed=None, mutation_generator=None,
        num_replicates=None):
    """
    Simulates the coalescent with recombination under the specified model
    parameters and returns the resulting :class:`.TreeSequence`.

    :param int sample_size: The number of individuals in our sample.
        If not specified or None, this defaults to the sum of the
        subpopulation sample sizes. Either ``sample_size``,
        ``population_configurations`` or ``samples`` must be specified.
    :param float Ne: The effective (diploid) population size for the
        reference population. This determines the factor by which the
        per-generation recombination and mutation rates are scaled in the
        simulation. This defaults to 1 if not specified.
    :param float length: The length of the simulated region in bases.
        This parameter cannot be used along with ``recombination_map``.
        Defaults to 1 if not specified.
    :param float recombination_rate: The rate of recombination per base
        per generation. This parameter cannot be used along with
        ``recombination_map``. Defaults to 0 if not specified.
    :param recombination_map: The map describing the changing rates of
        recombination along the simulated chromosome. This parameter cannot
        be used along with the ``recombination_rate`` or ``length``
        parameters, as these values are encoded within the map. Defaults to
        a uniform rate as described in the ``recombination_rate`` parameter
        if not specified.
    :type recombination_map: :class:`.RecombinationMap`
    :param float mutation_rate: The rate of mutation per base per
        generation. If not specified, no mutations are generated. Cannot be
        combined with ``mutation_generator``.
    :param list population_configurations: The list of
        :class:`.PopulationConfiguration` instances describing the sampling
        configuration, relative sizes and growth rates of the populations to
        be simulated. If this is not specified, a single population with a
        sample of size ``sample_size`` is assumed.
    :type population_configurations: list or None.
    :param list migration_matrix: The matrix describing the rates of
        migration between all pairs of populations. If :math:`N`
        populations are defined in the ``population_configurations``
        parameter, then the migration matrix must be an
        :math:`N\\times N` matrix consisting of :math:`N` lists of
        length :math:`N` or an :math:`N\\times N` numpy array.
    :param list demographic_events: The list of demographic events to
        simulate. Demographic events describe changes to the populations
        in the past. Events should be supplied in non-decreasing
        order of time. Events with the same time value will be applied
        sequentially in the order that they were supplied before the
        simulation algorithm continues with the next time step. If not
        specified or None, an empty list is used.
    :param list samples: The list specifying the location and time of
        all samples. This parameter may be used to specify historical
        samples, and cannot be used in conjunction with the ``sample_size``
        parameter. Each sample is a (``population``, ``time``) pair
        such that the sample in position ``j`` in the list of samples
        is drawn in the specified population at the specified time. Time
        is measured in generations, as elsewhere.
    :param model: The simulation model; passed through unchanged to the
        simulator factory.
    :param bool record_migrations: Passed through unchanged to the simulator
        factory (default: False).
    :param int random_seed: The random seed. If this is `None`, a
        random seed will be automatically generated. Valid random
        seeds must be between 1 and :math:`2^{32} - 1`.
    :param mutation_generator: A pre-built mutation generator to use instead
        of constructing one from ``mutation_rate``. Cannot be combined with
        ``mutation_rate``.
    :param int num_replicates: The number of replicates of the specified
        parameters to simulate. If this is not specified or None,
        no replication is performed and a :class:`.TreeSequence` object
        returned. If :obj:`num_replicates` is provided, the specified
        number of replicates is performed, and an iterator over the
        resulting :class:`.TreeSequence` objects returned.
    :return: The :class:`.TreeSequence` object representing the results
        of the simulation if no replication is performed, or an
        iterator over the independent replicates simulated if the
        :obj:`num_replicates` parameter has been used.
    :rtype: :class:`.TreeSequence` or an iterator over
        :class:`.TreeSequence` replicates.
    :raises ValueError: If both ``mutation_rate`` and ``mutation_generator``
        are specified.
    :warning: If using replication, do not store the results of the
        iterator in a list! For performance reasons, the same
        underlying object may be used for every TreeSequence
        returned which will most likely lead to unexpected behaviour.
    """
    # Use a fresh list per call rather than a mutable default argument that
    # would be shared between every invocation of this function.
    if demographic_events is None:
        demographic_events = []

    seed = random_seed
    if random_seed is None:
        seed = _get_random_seed()
    # To support numpy integer inputs here too we convert to integer.
    rng = RandomGenerator(int(seed))
    sim = simulator_factory(
        sample_size=sample_size, random_generator=rng,
        Ne=Ne, length=length,
        recombination_rate=recombination_rate,
        recombination_map=recombination_map,
        population_configurations=population_configurations,
        migration_matrix=migration_matrix,
        demographic_events=demographic_events,
        samples=samples, model=model,
        record_migrations=record_migrations)
    # The provenance API is very tentative, and only included now as a
    # pre-alpha feature.
    parameters = {"TODO": "encode simulation parameters"}
    provenance_dict = provenance.get_provenance_dict("simulate", parameters)
    if mutation_generator is None:
        mu = 0 if mutation_rate is None else mutation_rate
        mutation_generator = MutationGenerator(rng, mu)
    elif mutation_rate is not None:
        raise ValueError(
            "Cannot specify both mutation_rate and mutation_generator")
    if num_replicates is None:
        # A single simulation is just the first item of a one-replicate run.
        return next(
            _replicate_generator(sim, mutation_generator, 1, provenance_dict))
    return _replicate_generator(
        sim, mutation_generator, num_replicates, provenance_dict)
def mutate(
        tree_sequence, rate=None, random_seed=None, model=None, keep=False,
        start_time=None, end_time=None):
    """
    Simulates mutations on the specified ancestry and returns the resulting
    :class:`tskit.TreeSequence`. Mutations are generated at the specified rate
    in measured generations. Mutations are generated under the infinite sites
    model, and so the rate of new mutations is per unit of sequence length per
    generation.

    If a random seed is specified, this is used to seed the random number
    generator. If the same seed is specified and all other parameters are
    equal then the same mutations will be generated. If no random seed is
    specified then one is generated automatically.

    If the ``model`` parameter is specified, this determines the model under
    which mutations are generated. Currently only the :class:`.InfiniteSites`
    mutation model is supported. This parameter is useful if you wish to
    obtain sequences with letters from the nucleotide alphabet rather than the
    default 0/1 states. By default mutations from the infinite sites model
    with a binary alphabet are generated.

    By default, sites and mutations in the parameter tree sequence are
    discarded. If the ``keep`` parameter is true, however, *additional*
    mutations are simulated. Under the infinite sites mutation model, all new
    mutations generated will occur at distinct positions from each other and
    from any existing mutations (by rejection sampling).

    The time interval over which mutations can occur may be controlled using
    the ``start_time`` and ``end_time`` parameters. The ``start_time`` defines
    the lower bound (in time-ago) on this interval and ``end_time`` the upper
    bound. Note that we may have mutations associated with nodes with
    time <= ``start_time`` since mutations store the node at the bottom (i.e.,
    towards the leaves) of the branch that they occur on.

    :param tskit.TreeSequence tree_sequence: The tree sequence onto which we
        wish to throw mutations.
    :param float rate: The rate of mutation per generation. (Default: 0).
    :param int random_seed: The random seed. If this is `None`, a random seed
        will be automatically generated. Valid random seeds must be between 1
        and :math:`2^{32} - 1`.
    :param MutationModel model: The mutation model to use when generating
        mutations. If not specified or None, the :class:`.InfiniteSites`
        mutation model is used.
    :param bool keep: Whether to keep existing mutations (default: False).
    :param float start_time: The minimum time at which a mutation can
        occur. (Default: no restriction.)
    :param float end_time: The maximum time at which a mutation can occur
        (Default: no restriction).
    :return: The :class:`tskit.TreeSequence` object resulting from overlaying
        mutations on the input tree sequence.
    :rtype: :class:`tskit.TreeSequence`
    :raises ValueError: If the first argument has no ``tables`` attribute, or
        if ``start_time`` is greater than ``end_time``.
    :raises TypeError: If ``model`` has no ``alphabet`` attribute.
    """
    try:
        source_tables = tree_sequence.tables
    except AttributeError:
        raise ValueError("First argument must be a TreeSequence instance.")

    seed_value = int(
        simulations._get_random_seed() if random_seed is None else random_seed)
    rng = _msprime.RandomGenerator(seed_value)

    chosen_model = model
    if chosen_model is None:
        chosen_model = InfiniteSites()
    try:
        alphabet = chosen_model.alphabet
    except AttributeError:
        raise TypeError("model must be an InfiniteSites instance")

    rate_value = float(rate) if rate is not None else float(0)
    keep_flag = bool(keep)

    parameters = {"command": "mutate"}
    parameters["rate"] = rate_value
    parameters["random_seed"] = seed_value
    parameters["keep"] = keep_flag

    # Only bounds given by the caller are written to the provenance
    # parameters; an omitted bound defaults to the largest finite float
    # magnitude on the relevant side.
    if start_time is None:
        lower = -sys.float_info.max
    else:
        lower = float(start_time)
        parameters["start_time"] = lower
    if end_time is None:
        upper = sys.float_info.max
    else:
        upper = float(end_time)
        parameters["end_time"] = upper

    # TODO Add a JSON representation of the model to the provenance.
    provenance_dict = provenance.get_provenance_dict(parameters)
    if lower > upper:
        raise ValueError("start_time must be <= end_time")

    mutation_generator = _msprime.MutationGenerator(
        rng, rate_value, alphabet=alphabet, start_time=lower, end_time=upper)
    # The generator works on the low-level table collection, so copy the
    # tables in, generate, and copy the result back out.
    low_level_tables = _msprime.LightweightTableCollection()
    low_level_tables.fromdict(source_tables.asdict())
    mutation_generator.generate(low_level_tables, keep=keep_flag)
    result_tables = tskit.TableCollection.fromdict(low_level_tables.asdict())
    result_tables.provenances.add_row(json.dumps(provenance_dict))
    return result_tables.tree_sequence()