class TestGenome(unittest.TestCase, test_species.GenomeTestMixin):
    """
    Tests for the human genome.
    """

    genome = stdpopsim.get_species("HomSap").genome

    def test_basic_attributes(self):
        # The HomSap genome is expected to list 25 chromosomes.
        chromosome_count = len(self.genome.chromosomes)
        self.assertEqual(chromosome_count, 25)

    def test_recombination_rates(self):
        # Rebuild each contig with the HapMapII_GRCh37 map and compare the
        # map's mean rate against the rate stored on the chromosome.
        map_id = "HapMapII_GRCh37"
        homsap = stdpopsim.get_species("HomSap")
        for chromosome in self.genome.chromosomes:
            if chromosome.id != "chrY":
                contig = homsap.get_contig(chromosome.id, genetic_map=map_id)
            else:
                # Requesting chrY with this map must emit a warning.
                with self.assertWarns(Warning):
                    contig = homsap.get_contig(chromosome.id, genetic_map=map_id)
            mean_rate = contig.recombination_map.mean_recombination_rate
            # NOTE(review): assertAlmostEqual rounds the difference to 7
            # decimal places by default; if the rates are much smaller than
            # 1e-7 this check is very permissive -- confirm the tolerance.
            self.assertAlmostEqual(chromosome.recombination_rate, mean_rate)
def main():
    """
    Benchmark the ``stdpopsim`` command line for every demographic model of
    a fixed list of species.

    For each species the shortest chromosome is simulated once per model;
    the CPU time and RAM reported by ``time_cmd`` and the size of the output
    file are collected into a table that is written to
    ``data/benchmark.csv`` and printed.
    """
    num_samples = 100
    seed = 42
    columns = ("species", "model", "cpu_time", "ram", "file_size")
    data = {column: [] for column in columns}

    for species_id in ["PonAbe", "HomSap", "DroMel", "AraTha"]:
        species = stdpopsim.get_species(species_id)
        # Get the shortest chromosome
        by_length = sorted(species.genome.chromosomes, key=lambda x: x.length)
        chrom = by_length[0]
        assert chrom.recombination_rate > 0

        for model in species.demographic_models:
            with tempfile.NamedTemporaryFile() as out:
                cmd = (
                    f"{species_id} -d {model.id} -c {chrom.id} {num_samples} "
                    f"-s {seed} -o {out.name}"
                )
                argv = ["stdpopsim"] + cmd.split()
                cpu_time, ram = time_cmd(argv)
                # Measure the output while the temporary file still exists.
                file_size = os.path.getsize(out.name)

            record = (species.id, model.id, cpu_time, ram, file_size)
            for column, value in zip(columns, record):
                data[column].append(value)

    df = pd.DataFrame(data)
    df.to_csv("data/benchmark.csv")
    print(df)
def _twopop_IM(
    engine_id,
    out_dir,
    seed,
    NA=1000,
    N1=500,
    N2=5000,
    T=1000,
    M12=0,
    M21=0,
    pulse=None,
    **sim_kwargs,
):
    """
    Simulate a two-population isolation-with-migration model on a short
    AraTha chr5 contig and dump the tree sequence to ``out_dir``.

    :param engine_id: Identifier passed to ``stdpopsim.get_engine``.
    :param out_dir: Directory-like object supporting ``/``; the output is
        written to ``out_dir / f"{seed}.trees"``.
    :param seed: Seed forwarded to the engine; also names the output file.
    :param NA, N1, N2, T, M12, M21: Forwarded unchanged to
        ``stdpopsim.IsolationWithMigration``.
    :param pulse: Optional extra demographic event; when given it is appended
        to the model's events, which are then re-sorted by ``time``.
    :param sim_kwargs: Extra keyword arguments forwarded to
        ``engine.simulate``.
    :return: ``(out_file, elapsed)`` where ``elapsed`` is the
        ``time.perf_counter`` duration of the ``simulate`` call only.
    """
    species = stdpopsim.get_species("AraTha")
    contig = species.get_contig("chr5", length_multiplier=0.01)  # ~270 kb
    model = stdpopsim.IsolationWithMigration(
        NA=NA, N1=N1, N2=N2, T=T, M12=M12, M21=M21
    )
    if pulse is not None:
        events = model.demographic_events
        events.append(pulse)
        events.sort(key=lambda x: x.time)
    model.generation_time = species.generation_time
    samples = model.get_samples(50, 50, 0)
    engine = stdpopsim.get_engine(engine_id)

    started = time.perf_counter()
    ts = engine.simulate(model, contig, samples, seed=seed, **sim_kwargs)
    elapsed = time.perf_counter() - started

    out_file = out_dir / f"{seed}.trees"
    ts.dump(out_file)
    return out_file, elapsed
def homsap_composite_model(length, sample_counts, seed, model=None):
    """
    Assemble the species, demographic model, contig and samples for a
    simulation of the hominin composite model.

    :param length: Forwarded to ``random_autosomal_chunk`` as the chunk length.
    :param sample_counts: Mapping from population id to the number of samples
        requested. If it contains ``"Nea"`` the count must be exactly 4.
    :param seed: Forwarded to ``random_autosomal_chunk``.
    :param model: Demographic model to use. When ``None`` (the default) a
        fresh ``hominin_composite()`` model is built on each call.
        Previously this argument was silently overwritten and its default was
        evaluated once at definition time.
    :return: The tuple ``(species, model, contig, samples)``.
    :raises RuntimeError: If ``sample_counts["Nea"]`` is present and not 4.
    """
    if "Nea" in sample_counts and sample_counts["Nea"] != 4:
        raise RuntimeError(
            "Must have one sample each for the Vindija and Altai Neanderthals"
        )

    species = stdpopsim.get_species("HomSap")
    if model is None:
        # Build the default lazily so every call gets its own model object.
        model = hominin_composite()
    contig = random_autosomal_chunk(species, "HapMapII_GRCh37", length, seed)

    # Neanderthal samples are added separately below with explicit sampling
    # times, and populations without a sampling time get no samples.
    samples = model.get_samples(
        *[
            sample_counts.get(p.id, 0)
            if p.id != "Nea" and p.sampling_time is not None
            else 0
            for p in model.populations
        ]
    )

    if "Nea" in sample_counts:
        # Altai and Vindija Neanderthal dates from Prüfer et al. 2017.
        T_Altai = 115e3 / model.generation_time
        T_Vindija = 55e3 / model.generation_time
        pop = {p.id: i for i, p in enumerate(model.populations)}
        samples.extend(
            [
                msprime.Sample(pop["Nea"], T_Altai),
                msprime.Sample(pop["Nea"], T_Altai),
                msprime.Sample(pop["Nea"], T_Vindija),
                msprime.Sample(pop["Nea"], T_Vindija),
            ]
        )

    return species, model, contig, samples
def test_number_of_calls(self):
    # Test that genetic map citations are converted: fetch_bibtex should be
    # called once per distinct DOI among the relevant citations.
    species = stdpopsim.get_species("HomSap")
    genetic_map = species.get_genetic_map("HapMapII_GRCh37")
    contig = species.get_contig("chr22", genetic_map=genetic_map.id)
    model = stdpopsim.PiecewiseConstantSize(species.population_size)
    engine = stdpopsim.get_default_engine()

    citation_groups = (
        genetic_map.citations,
        model.citations,
        engine.citations,
        species.genome.mutation_rate_citations,
        species.genome.recombination_rate_citations,
        species.genome.assembly_citations,
    )
    unique_dois = {ref.doi for group in citation_groups for ref in group}
    ncite = len(unique_dois)

    # Patch out writing to a file, then ensure that the method is called
    # the correct number of times.
    with mock.patch("builtins.open", mock.mock_open()):
        with open("tmp.bib", "w") as bib:
            fetch_patch = mock.patch.object(
                stdpopsim.citations.Citation, "fetch_bibtex"
            )
            with fetch_patch as mock_bib:
                cli.write_bibtex(engine, model, contig, species, bib)
                self.assertEqual(mock_bib.call_count, ncite)
def OutOfAfrica_3G09_with_DFE(seed):
    """
    The Gutenkunst et al. HomSap/OutOfAfrica_3G09 model, simulated with a DFE.

    :param seed: Random seed forwarded to the SLiM engine.
    :return: The tree sequence returned by ``engine.simulate``.
    """
    homsap = stdpopsim.get_species("HomSap")
    demography = homsap.get_demographic_model("OutOfAfrica_3G09")
    chunk = homsap.get_contig("chr1", length_multiplier=0.001)
    sample_sets = demography.get_samples(100, 100, 100)  # YRI, CEU, CHB
    dfe = KimDFE()

    # Simulate.
    # Pass slim_script=True as well to print the script instead of running it.
    slim = stdpopsim.get_engine("slim")
    return slim.simulate(
        demography,
        chunk,
        sample_sets,
        seed=seed,
        mutation_types=dfe,
        slim_scaling_factor=10,
        slim_burn_in=10,
    )
class TestGenomeData(test_species.GenomeTestBase):
    """Genome QC tests for AnoGam; both rate checks are currently skipped."""

    genome = stdpopsim.get_species("AnoGam").genome

    @pytest.mark.skip("Recombination rate QC not done yet")
    @pytest.mark.parametrize(
        ["name", "rate"],
        [
            ("2L", -1),
            ("2R", -1),
            ("3L", -1),
            ("3R", -1),
            ("X", -1),
            ("Mt", -1),
        ],
    )
    def test_recombination_rate(self, name, rate):
        # -1 is a placeholder until the QC values are filled in.
        chromosome = self.genome.get_chromosome(name)
        assert rate == pytest.approx(chromosome.recombination_rate)

    @pytest.mark.skip("Mutation rate QC not done yet")
    @pytest.mark.parametrize(
        ["name", "rate"],
        [
            ("2L", -1),
            ("2R", -1),
            ("3L", -1),
            ("3R", -1),
            ("X", -1),
            ("Mt", -1),
        ],
    )
    def test_mutation_rate(self, name, rate):
        # -1 is a placeholder until the QC values are filled in.
        chromosome = self.genome.get_chromosome(name)
        assert rate == pytest.approx(chromosome.mutation_rate)
def test_recombination_rates(self, chr_id):
    """
    Check that the stored recombination rate of chromosome ``chr_id`` matches
    the mean rate of the HapMapII_GRCh37 map for that chromosome.
    """
    # We should recast this test and just hard code in the values.
    # Tests should be *obvious* not clever.

    # recompute recombination rates from HapMapII_GRCh37 map then
    # compare the results to the current recombination rates for each chromosome
    genetic_map = "HapMapII_GRCh37"
    species = stdpopsim.get_species("HomSap")
    chrom = species.genome.get_chromosome(chr_id)
    if chr_id in ["X", "Y", "MT"]:
        with pytest.warns(stdpopsim.NonAutosomalWarning):
            contig = species.get_contig(chr_id, genetic_map=genetic_map)
    elif chr_id in ["3", "5", "7", "11", "16", "17", "18", "20"]:
        contig = species.get_contig(chr_id, genetic_map=genetic_map)
    else:
        # The rest of the chromosomes are currently emitting a warning about
        # the mismatch in chromosome lengths because of the fact that we're
        # on 37 for the map. This should be resolved when we start using the
        # lifted over map.
        with pytest.warns(UserWarning, match="longer than chromosome length"):
            contig = species.get_contig(chr_id, genetic_map=genetic_map)
    # pytest.approx must be *compared* against a value: a bare
    # ``assert pytest.approx(a, b)`` treats ``b`` as the relative tolerance
    # and asserts the truthiness of the approx object, so it never fails.
    assert chrom.recombination_rate == pytest.approx(
        contig.recombination_map.mean_rate
    )
def test_download_over_cache(self):
    # Download the map twice; it must be reported as cached after the
    # first download and again after downloading over the cached copy.
    dromel = stdpopsim.get_species("DroMel")
    gm = dromel.get_genetic_map("ComeronCrossover_dm6")
    for _attempt in range(2):
        gm.download()
        self.assertTrue(gm.is_cached())
def test_bad_genetic_map(self):
    # An unknown map id must call cli.exit exactly once, with a message
    # that lists every genetic map id available for the species.
    homsap = stdpopsim.get_species("HomSap")
    with mock.patch("stdpopsim.cli.exit", autospec=True) as mocked_exit:
        cli.get_genetic_map_wrapper(homsap, "XXX")
        known_ids = [gm.id for gm in homsap.genetic_maps]
        available_maps = ", ".join(known_ids)
        expected = f"GeneticMap 'HomSap/XXX' not in catalog ({available_maps})"
        mocked_exit.assert_called_once_with(expected)
class TestGetChromosomeAnnotations(tests.CacheReadingTest):
    """
    Tests if we get chromosome level annotations
    using the Ensembl_GRCh38 human GFF.
    """

    # TODO: The HomSap annotations are huge. Once we include a smaller
    # annotation set, we should instead use that, so tests are faster.
    species = stdpopsim.get_species("HomSap")
    an = species.get_annotations("Ensembl_GRCh38_gff3")

    def test_known_chromosome(self):
        # A bare chromosome id yields a DataFrame.
        table = self.an.get_chromosome_annotations("21")
        self.assertIsInstance(table, pandas.DataFrame)

    def test_known_chromosome_prefix(self):
        # The "chr"-prefixed id yields a DataFrame as well.
        table = self.an.get_chromosome_annotations("chr21")
        self.assertIsInstance(table, pandas.DataFrame)

    def test_unknown_chromosome(self):
        # Empty, unknown and None ids must all be rejected.
        rejected = ["", "ABD", None]
        for bad_chrom in rejected:
            with self.assertRaises(ValueError):
                self.an.get_chromosome_annotations(bad_chrom)

    def test_get_genes(self):
        genes = self.an.get_genes_from_chromosome("21")
        self.assertIsInstance(genes, pandas.DataFrame)

    def test_get_genes_full(self):
        genes = self.an.get_genes_from_chromosome("21", full_table=True)
        self.assertIsInstance(genes, pandas.DataFrame)

    def test_bad_annot_type(self):
        # An unrecognised annotation type must raise ValueError.
        with self.assertRaises(ValueError):
            self.an.get_annotation_type_from_chromomosome("foo", "21")
def test_bad_params(self):
    # Non-positive scaling factors and negative burn-in values must be
    # rejected with ValueError, even on a dry run.
    engine = stdpopsim.get_engine("slim")
    species = stdpopsim.get_species("HomSap")
    contig = species.get_contig("chr1")
    model = stdpopsim.PiecewiseConstantSize(species.population_size)
    samples = model.get_samples(10)

    invalid_settings = [
        {"slim_scaling_factor": value} for value in (0, -1, -1e-6)
    ] + [{"slim_burn_in": value} for value in (-1, -1e-6)]

    for bad_kwargs in invalid_settings:
        with self.assertRaises(ValueError):
            engine.simulate(
                demographic_model=model,
                contig=contig,
                samples=samples,
                dry_run=True,
                **bad_kwargs,
            )
class TestGetContig(unittest.TestCase):
    """
    Tests for the get contig method.
    """

    species = stdpopsim.get_species("HomSap")

    def test_length_multiplier(self):
        # The last map position must scale linearly with the multiplier.
        reference = self.species.get_contig("chr22")
        for multiplier in [0.125, 1.0, 2.0]:
            scaled = self.species.get_contig(
                "chr22", length_multiplier=multiplier
            )
            expected_end = (
                reference.recombination_map.get_positions()[-1] * multiplier
            )
            actual_end = scaled.recombination_map.get_positions()[-1]
            self.assertEqual(expected_end, actual_end)

    def test_length_multiplier_on_empirical_map(self):
        # A length multiplier combined with a named genetic map is an error.
        with self.assertRaises(ValueError):
            self.species.get_contig(
                "chr1", genetic_map="HapMapII_GRCh37", length_multiplier=2
            )

    def test_genetic_map(self):
        # TODO we should use a different map here so we're not hitting the cache.
        contig = self.species.get_contig("chr22", genetic_map="HapMapII_GRCh37")
        recomb_map = contig.recombination_map
        self.assertIsInstance(recomb_map, msprime.RecombinationMap)
def test_get_known_genetic_map(self):
    # Each known map name must resolve to a GeneticMap carrying that name.
    homsap = stdpopsim.get_species("homsap")
    for expected_name in ("HapmapII_GRCh37", "Decode_2010_sex_averaged"):
        found = homsap.get_genetic_map(expected_name)
        self.assertIsInstance(found, stdpopsim.GeneticMap)
        self.assertEqual(found.name, expected_name)
def test_bad_model(self):
    # An unknown model id must call cli.exit exactly once with the
    # "not in catalog" message.
    homsap = stdpopsim.get_species("HomSap")
    expected = "DemographicModel 'HomSap/XXX' not in catalog"
    with mock.patch("stdpopsim.cli.exit", autospec=True) as mocked_exit:
        cli.get_model_wrapper(homsap, "XXX")
        mocked_exit.assert_called_once_with(expected)
def test_known_chromosome(self):
    # The map for CanFam chromosome "1" must be an msprime.RateMap whose
    # sequence length equals the chromosome length.
    canfam = stdpopsim.get_species("CanFam")
    gmap = canfam.get_genetic_map("Campbell2016_CanFam3_1")
    chromosome = canfam.genome.get_chromosome("1")
    rate_map = gmap.get_chromosome_map(chromosome.id)
    self.assertIsInstance(rate_map, msprime.RateMap)
    self.assertEqual(chromosome.length, rate_map.sequence_length)
def test_bad_genetic_map(self):
    # An unknown genetic map id must call cli.exit exactly once with the
    # "not in catalog" message.
    homsap = stdpopsim.get_species("HomSap")
    expected = "Genetic map 'HomSap/XXX' not in catalog"
    with mock.patch("stdpopsim.cli.exit", autospec=True) as mocked_exit:
        cli.get_genetic_map_wrapper(homsap, "XXX")
        mocked_exit.assert_called_once_with(expected)
def test_required_params(self):
    # Calling simulate() without samples must raise TypeError on every
    # registered engine.
    homsap = stdpopsim.get_species("HomSap")
    demography = homsap.get_demographic_model("AshkSub_7G19")
    # NOTE(review): the contig is wrapped in a 1-tuple here, exactly as in
    # the original test -- confirm whether the tuple is intentional.
    contig = (homsap.get_contig("chr1"),)
    for candidate in stdpopsim.all_engines():
        with self.assertRaises(TypeError):
            candidate.simulate(demography, contig)
def setup_sample_file(args):
    """
    Return a Thousand Genomes Project sample data file, the corresponding
    recombination rate array, a prefix to use for files, and None.

    ``args.sample_file`` must end with ``.samples``. The recombination map is
    read from ``args.genetic_map`` when that is set; otherwise, if the file
    name contains ``chr<digits>``, the matching chromosome of the
    HapMapII_GRCh37 map is used (downloaded if not cached). With neither,
    rho is the spacing between inference positions divided by the sequence
    length.

    :raises ValueError: If the sample file name does not end in ``.samples``.
    """
    suffix = ".samples"
    filename = args.sample_file
    hapmap_path = args.genetic_map
    if not filename.endswith(suffix):
        raise ValueError("Sample data file must end with '.samples'")

    sd = tsinfer.load(filename)
    inference_pos = sd.sites_position[:][sd.sites_inference[:]]
    chrom_match = re.search(r"(chr\d+)", filename)

    if hapmap_path is None and not chrom_match:
        # No map information at all: fall back to physical distances.
        steps = np.diff(inference_pos) / sd.sequence_length
    else:
        if hapmap_path is not None:
            chr_map = msprime.RecombinationMap.read_hapmap(hapmap_path)
        else:
            chrom_name = chrom_match.group(1)
            print(
                f"Using {chrom_name} from HapMapII_GRCh37 for the recombination map"
            )
            catalog_map = stdpopsim.get_species("HomSap").get_genetic_map(
                id="HapMapII_GRCh37"
            )
            if not catalog_map.is_cached():
                catalog_map.download()
            chr_map = catalog_map.get_chromosome_map(chrom_name)
        steps = np.diff(physical_to_genetic(chr_map, inference_pos))

    # The first site has no predecessor, so its entry is zero.
    rho = np.concatenate(([0.0], steps))
    prefix = filename[: -len(suffix)]
    return sd, rho, prefix, None
class PiecewiseConstantSizeMixin:
    """
    Mixin that sets up a simple demographic model.
    """

    species = stdpopsim.get_species("HomSap")
    contig = species.get_contig("chr22", length_multiplier=0.001)  # ~50 kb
    N0 = 1000  # size in the present
    N1 = 500  # ancestral size
    T = 500  # generations since size change occurred
    T_mut = 300  # introduce a mutation at this generation
    model = stdpopsim.PiecewiseConstantSize(N0, (T, N1))
    model.generation_time = 1
    samples = model.get_samples(100)
    mutation_types = [stdpopsim.ext.MutationType(convert_to_substitution=False)]
    mut_id = len(mutation_types)

    def allele_frequency(self, ts):
        """
        Return the fraction of sample nodes that carry the single mutation
        in ``ts``; asserts that ``ts`` has exactly one mutation.
        """
        assert ts.num_mutations == 1
        sample_nodes = ts.samples()
        mutation = next(ts.mutations())
        # Look at the tree covering the mutated site's position.
        tree = ts.at(ts.site(mutation.site).position)
        carriers = sum(
            1 for node in sample_nodes if tree.is_descendant(node, mutation.node)
        )
        return carriers / len(sample_nodes)
def run(self):
    """
    Build the catalog section for the species named by the directive's
    first argument: a summary followed by Genome, Genetic Maps and Models
    subsections, separated by transitions.

    :return: A two-element list holding the species target node and the
        assembled section node.
    """
    species = stdpopsim.get_species(self.arguments[0])
    sid = f"sec_catalog_{species.id}"
    species_target = self.get_target(sid)

    def titled_section(suffix, title):
        # Create an empty subsection with the catalog id scheme and a title.
        subsection = nodes.section(ids=[f"sec_catalog_{species.id}_{suffix}"])
        subsection += nodes.title(text=title)
        return subsection

    section = nodes.section(ids=[sid], names=[sid])
    section += nodes.title(text=species.name)
    section += self.species_summary(species)

    genome_section = titled_section("genome", "Genome")
    genome_section += self.chromosomes_table(species)
    section += genome_section
    section += nodes.transition()

    maps_section = titled_section("genetic_maps", "Genetic Maps")
    maps_section += self.genetic_maps_table(species)
    for gmap in species.genetic_maps:
        maps_section += self.genetic_map_section(species, gmap)
    section += maps_section
    section += nodes.transition()

    models_section = titled_section("models", "Models")
    models_section += self.models_table(species)
    for model in species.demographic_models:
        models_section += self.model_section(species, model)
    section += models_section

    return [species_target, section]
def test_get_known_genetic_map(self):
    # Each known map id must resolve to a GeneticMap carrying that id.
    homsap = stdpopsim.get_species("HomSap")
    for expected_id in ("HapMapII_GRCh37", "DeCodeSexAveraged_GRCh36"):
        found = homsap.get_genetic_map(expected_id)
        self.assertIsInstance(found, stdpopsim.GeneticMap)
        self.assertEqual(found.id, expected_id)
def test_script_generation(self):
    # With slim_script=True the captured output must contain a
    # registerLateEvent call for a simple model and two catalog models.
    engine = stdpopsim.get_engine("slim")
    species = stdpopsim.get_species("HomSap")
    contig = species.get_contig("chr1")

    def check_script(model, samples):
        # Generate the script for one model and look for the late event.
        out, _ = capture_output(
            engine.simulate,
            demographic_model=model,
            contig=contig,
            samples=samples,
            slim_script=True,
        )
        self.assertTrue("sim.registerLateEvent" in out)

    model = stdpopsim.PiecewiseConstantSize(species.population_size)
    samples = model.get_samples(10)
    model.generation_time = species.generation_time
    check_script(model, samples)

    model = species.get_demographic_model("AncientEurasia_9K19")
    check_script(model, model.get_samples(1, 2, 3, 4, 5, 6, 7))

    model = species.get_demographic_model("AmericanAdmixture_4B11")
    check_script(model, model.get_samples(10, 10, 10))
class TestGenomeData(test_species.GenomeTestBase):
    """
    Checks the per-chromosome recombination and mutation rates of the AedAeg
    genome against hard-coded expected values.
    """

    genome = stdpopsim.get_species("AedAeg").genome

    @pytest.mark.parametrize(
        ["name", "rate"],
        {
            "1": 0.306e-8,
            "2": 0.249e-8,
            "3": 0.291e-8,
            "MT": 0.0,
        }.items(),
    )
    def test_recombination_rate(self, name, rate):
        # pytest.approx must be compared with ``==``; a bare
        # ``assert pytest.approx(a, b)`` uses ``b`` as the relative tolerance
        # and is always truthy, so the original assertion could never fail.
        assert rate == pytest.approx(
            self.genome.get_chromosome(name).recombination_rate
        )

    @pytest.mark.parametrize(
        ["name", "rate"],
        {
            "1": 3.5e-9,
            "2": 3.5e-9,
            "3": 3.5e-9,
            "MT": 3.5e-9,
        }.items(),
    )
    def test_mutation_rate(self, name, rate):
        # Same fix as above: compare against approx instead of asserting it.
        assert rate == pytest.approx(self.genome.get_chromosome(name).mutation_rate)
def test_number_of_calls(self):
    """
    Test that genetic map citations are converted: fetch_bibtex must be
    called once per merged citation, and the CLI must gather as many
    citations as the locally merged list.
    """
    species = stdpopsim.get_species("HomSap")
    genetic_map = species.get_genetic_map("HapMapII_GRCh37")
    contig = species.get_contig("chr20", genetic_map=genetic_map.id)
    model = stdpopsim.PiecewiseConstantSize(species.population_size)
    engine = stdpopsim.get_default_engine()

    all_cites = (
        [stdpopsim.citations._stdpopsim_citation]
        + genetic_map.citations
        + model.citations
        + engine.citations
        + species.genome.citations
        + species.citations
    )
    local_cites = stdpopsim.Citation.merge(all_cites)
    ncite = len({ref.doi for ref in local_cites})
    # After merging, every citation should have its own DOI.
    assert ncite == len(local_cites)

    cli_cites = cli.get_citations(engine, model, contig, species)
    assert len(cli_cites) == len(local_cites)

    # Patch out writing to a file, then ensure that the method is called
    # the correct number of times.
    fetch_patch = mock.patch.object(
        stdpopsim.citations.Citation, "fetch_bibtex", autospec=True
    )
    with mock.patch("builtins.open", mock.mock_open()):
        with open("tmp.bib", "w") as bib:
            with fetch_patch as mock_bib:
                cli.write_bibtex(engine, model, contig, species, bib)
                assert mock_bib.call_count == ncite
def get_models_help(species_id, model_id):
    """
    Generate help text for the specified species. If model_id is None,
    generate help for all models. Otherwise, it must be a string with a
    valid model ID.

    For every model the text holds its id and description, the wrapped long
    description, and the populations that allow samples.
    """
    species = stdpopsim.get_species(species_id)
    if model_id is not None:
        header = "\nModel description\n\n"
        wanted_ids = [model_id]
    else:
        header = f"\nAll simulation models for {species.name}\n\n"
        wanted_ids = [model.id for model in species.demographic_models]

    # TODO improve this text formatting.
    indent = " " * 4
    wrapper = textwrap.TextWrapper(initial_indent=indent, subsequent_indent=indent)

    pieces = [header]
    for wanted_id in wanted_ids:
        model = get_model_wrapper(species, wanted_id)
        pieces.append(f"{model.id}: {model.description}\n")
        pieces.append(wrapper.fill(textwrap.dedent(model.long_description)))
        pieces.append("\n\n")
        pieces.append(indent + "Populations:\n")
        # Only populations that can be sampled from are listed.
        pieces.extend(
            indent * 2 + f"{population.id}: {population.description}\n"
            for population in model.populations
            if population.allow_samples
        )
        pieces.append("\n")

    return "".join(pieces)
class TestGenome(test_species.GenomeTestBase):
    """
    Tests for the HomSap genome: chromosome count and per-chromosome
    recombination rates compared with the HapMapII_GRCh37 map.
    """

    genome = stdpopsim.get_species("HomSap").genome

    def test_basic_attributes(self):
        assert len(self.genome.chromosomes) == 25

    @pytest.mark.parametrize("chr_id", [chrom.id for chrom in genome.chromosomes])
    def test_recombination_rates(self, chr_id):
        # recompute recombination rates from HapMapII_GRCh37 map then
        # compare the results to the current recombination rates for each chromosome
        genetic_map = "HapMapII_GRCh37"
        species = stdpopsim.get_species("HomSap")
        chrom = species.genome.get_chromosome(chr_id)
        if chr_id in ["X", "Y", "MT"]:
            with pytest.warns(stdpopsim.NonAutosomalWarning):
                contig = species.get_contig(chr_id, genetic_map=genetic_map)
        elif chr_id in ["3", "5", "7", "11", "16", "17", "18", "20"]:
            contig = species.get_contig(chr_id, genetic_map=genetic_map)
        else:
            # The rest of the chromosomes are currently emitting a warning about
            # the mismatch in chromosome lengths because of the fact that we're
            # on 37 for the map. This should be resolved when we start using the
            # lifted over map.
            with pytest.warns(UserWarning, match="longer than chromosome length"):
                contig = species.get_contig(chr_id, genetic_map=genetic_map)
        # pytest.approx must be *compared* against a value: a bare
        # ``assert pytest.approx(a, b)`` treats ``b`` as the relative
        # tolerance and is always truthy, so it could never fail.
        assert chrom.recombination_rate == pytest.approx(
            contig.recombination_map.mean_rate
        )
class TestSpeciesData(test_species.SpeciesTestBase):
    """Species-level metadata and QC checks for AnaPla."""

    species = stdpopsim.get_species("AnaPla")

    def test_ensembl_id(self):
        expected = "anas_platyrhynchos"
        assert self.species.ensembl_id == expected

    def test_name(self):
        expected = "Anas platyrhynchos"
        assert self.species.name == expected

    def test_common_name(self):
        expected = "Mallard"
        assert self.species.common_name == expected

    # QC Tests. These tests are performed by another contributor
    # independently referring to the citations provided in the
    # species definition, filling in the appropriate values
    # and deleting the pytest "skip" annotations.
    # Both QC tests below are active: their skip markers
    # ("Population size QC not done yet" / "Generation time QC not done yet")
    # are disabled.

    def test_qc_population_size(self):
        expected = 156000
        assert self.species.population_size == expected

    def test_qc_generation_time(self):
        expected = 4
        assert self.species.generation_time == expected
class TestGenome(unittest.TestCase, test_species.GenomeTestMixin):
    """
    Tests for the Pongo abelii genome.
    """

    genome = stdpopsim.get_species("PonAbe").genome

    def test_basic_attributes(self):
        # The PonAbe genome is expected to list 24 chromosomes.
        chromosome_count = len(self.genome.chromosomes)
        self.assertEqual(chromosome_count, 24)

    def test_chromosome_lengths(self):
        # Expected lengths, checked in the listed order; the first mismatch
        # fails the test.
        expected_lengths = (
            ("chr1", 229942017),
            ("chr2a", 113028656),
            ("chr2b", 135000294),
            ("chr3", 202140232),
            ("chr4", 198332218),
            ("chr5", 183952662),
            ("chr6", 174210431),
            ("chr7", 157549271),
            ("chr8", 153482349),
            ("chr9", 135191526),
            ("chr10", 133410057),
            ("chr11", 132107971),
            ("chr12", 136387465),
            ("chr13", 117095149),
            ("chr14", 108868599),
            ("chr15", 99152023),
            ("chr16", 77800216),
            ("chr17", 73212453),
            ("chr18", 94050890),
            ("chr19", 60714840),
            ("chr20", 62736349),
            ("chr21", 48394510),
            ("chr22", 46535552),
            ("chrX", 156195299),
        )
        genome = self.genome
        for chrom_id, length in expected_lengths:
            self.assertEqual(genome.get_chromosome(chrom_id).length, length)
def get_rho(ancestors, filename):
    """
    Return the per-site recombination array for the sites in ``ancestors``.

    The chromosome name is taken from the first ``chr<digits>`` match in
    ``filename``. If the module-level ``params.genetic_map`` is set, the
    HapMap-format file ``params.genetic_map + <chr> + ".txt"`` is read;
    otherwise the matching chromosome of the HomSap HapMapII_GRCh37 map is
    used, downloading it first if it is not cached. Site positions are
    passed through ``physical_to_genetic`` and rho holds the successive
    differences of the result, with a leading 0.0 for the first site.

    The original fallback branch (physical distances scaled by an undefined
    ``sd.sequence_length``) was unreachable, because a missing chromosome
    name already raises, and has been removed.

    :param ancestors: Object whose ``sites_position`` supports ``[:]``.
    :param filename: File name containing the chromosome, e.g. ``chr20``.
    :return: Array with one entry per site.
    :raises ValueError: If ``filename`` has no ``chr<digits>`` component, or
        if any two consecutive sites have zero distance between them.
    """
    inference_pos = ancestors.sites_position[:]

    chrom_match = re.search(r"(chr\d+)", filename)
    if chrom_match is None:
        raise ValueError("chr must be in filename")
    chrom = chrom_match.group(1)

    map_prefix = params.genetic_map
    if map_prefix is not None:
        print(f"Using {chrom} from GRCh38 for the recombination map")
        chr_map = msprime.RecombinationMap.read_hapmap(map_prefix + chrom + ".txt")
    else:
        print(f"Using {chrom} from HapMapII_GRCh37 for the recombination map")
        genetic_map = stdpopsim.get_species("HomSap").get_genetic_map(
            id="HapMapII_GRCh37"
        )
        if not genetic_map.is_cached():
            genetic_map.download()
        chr_map = genetic_map.get_chromosome_map(chrom)

    inference_distances = physical_to_genetic(chr_map, inference_pos)
    d = np.diff(inference_distances)
    rho = np.concatenate(([0.0], d))

    if np.any(d == 0):
        w = np.where(d == 0)
        raise ValueError("Zero recombination rates at", w, inference_pos[w])
    return rho