def main():
    """
    Command-line entry point for querying Ensembl chromosome lengths.

    With ``--list-species`` the Ensembl id of every species in stdpopsim is
    printed and nothing else happens. Otherwise ``write_ensembl_release`` is
    called once, followed by ``write_genome_data`` for each requested id
    (lower-cased), or for every stdpopsim species when no id is given.
    """
    # Command-line interface that controls which species get updated.
    cli = argparse.ArgumentParser(
        description='Query ensembl API for chromosome lengths')
    cli.add_argument(
        'species', type=str, nargs='*',
        help=("Ensembl ids for species to update with underscores in place "
              "of spaces (e.g. homo_sapiens)"))
    cli.add_argument(
        '--list-species', dest='list_species', action='store_true',
        help='list all species defined in stdpopsim')
    options = cli.parse_args()

    # Listing mode: print the ids and stop before anything is written.
    if options.list_species:
        for known in stdpopsim.all_species():
            print(known.ensembl_id)
        return

    # Explicit ids are normalised to lower case; none given means "all".
    requested = options.species
    if len(requested) > 0:
        ensembl_ids = [name.lower() for name in requested]
    else:
        ensembl_ids = [known.ensembl_id for known in stdpopsim.all_species()]

    # Write the release first, then the genome data of each species.
    data_writer = DataWriter()
    data_writer.write_ensembl_release()
    for ensembl_id in ensembl_ids:
        data_writer.write_genome_data(ensembl_id)
def test_params_match_docs_tables(self):
    """
    Compare catalog models with the CSV tables under ./docs/parameter_tables.

    For every demographic model that has a QC model, the table
    ``<species id>/<model id>.csv`` must exist. Its "Mutation rate" row must
    equal the model's mutation rate, and its "Generation time" row (taken
    as 1 when the table has none) must equal the model's generation time.
    """
    for species in stdpopsim.all_species():
        for model in species.demographic_models:
            csv_path = pathlib.Path(
                os.path.join(
                    "./docs/parameter_tables", species.id, model.id + ".csv"
                )
            )
            # Only models with a QC implementation are checked.
            if model.qc_model is None:
                continue
            assert csv_path.exists()
            with open(csv_path) as handle:
                rows = list(csv.reader(handle))

            # Value documented for each row label; stays None when the table
            # has no such row, and the last matching row wins.
            documented = {"Generation time": None, "Mutation rate": None}
            for row in rows:
                for label in documented:
                    if row[0].startswith(label):
                        documented[label] = float(row[1])

            assert model.mutation_rate == documented["Mutation rate"]
            expected_generation_time = documented["Generation time"]
            if expected_generation_time is None:
                # default is 1 if unspecified
                expected_generation_time = 1
            assert model.generation_time == expected_generation_time
def list_species():
    """
    List species in stdpopsim with their Ensembl IDs.
    """
    # Header line first, then one "<id> <ensembl id>" row per species.
    click.echo("ID Ensembl ID")
    for entry in stdpopsim.all_species():
        row = f"{entry.id} {entry.ensembl_id}"
        click.echo(row)
def run_download_genetic_maps(args):
    """
    Download genetic maps according to the parsed command-line arguments.

    ``args.species`` selects a single species, or every species in stdpopsim
    when it is None. ``args.genetic_maps`` lists the map ids to download for
    each selected species; when it is empty, all maps of that species are
    downloaded.
    """
    if args.species is None:
        species_ids = [sp.id for sp in stdpopsim.all_species()]
    else:
        species_ids = [args.species]

    for species_id in species_ids:
        species = get_species_wrapper(species_id)
        map_ids = args.genetic_maps
        if len(map_ids) == 0:
            # No explicit selection: take every map this species defines.
            map_ids = [gmap.id for gmap in species.genetic_maps]
        for map_id in map_ids:
            get_genetic_map_wrapper(species, map_id).download()
def update_genome_data(species):
    """
    Update the species genome data from Ensembl. Ensembl IDs for species can
    optionally be provided, e.g. update-genome-data homo_sapiens will update
    the genome data for humans. Multiple species can be specified. By default
    all species are updated.
    """
    # Explicit ids are lower-cased; an empty selection means every species.
    if len(species) > 0:
        ensembl_ids = [name.lower() for name in species]
    else:
        ensembl_ids = [known.ensembl_id for known in stdpopsim.all_species()]

    # Genome data for each species first, the Ensembl release afterwards.
    data_writer = DataWriter()
    for ensembl_id in ensembl_ids:
        data_writer.write_genome_data(ensembl_id)
    data_writer.write_ensembl_release()
def download_process_annotations():
    """
    Loop through all species and process their annotations.

    For each annotation of each species the GFF records are obtained with
    ``get_gff_recarray`` and reduced to the rows whose source and type match
    the annotation. For every chromosome that has matching rows, the
    intervals are validated and written as integers to a file named from
    ``an.file_pattern`` inside ``<annot_path>/<species id>``. That directory
    is then packed into ``<annotation id>.tar.gz`` with ``make_tarfile`` and
    its ``*.txt`` files are removed.
    """
    for spc in stdpopsim.all_species():
        if spc.annotations:
            for an in spc.annotations:
                CHROM_IDS = [chrom.id for chrom in spc.genome.chromosomes]
                logger.info(f"Downloading GFF file {an.id}")
                gff = get_gff_recarray(an.url, an.gff_sha256)
                logger.info(f"extracting annotations {an.id}")
                # Keep only rows matching both the annotation's source and
                # its feature type.
                exons = gff[np.where(
                    np.logical_and(
                        gff.source == an.annotation_source,
                        gff.type == an.annotation_type,
                    ))]
                logger.info(f"merging overlapping regions {an.id}")
                # Per-species output directory for the interval files.
                spc_name_path = os.path.join(annot_path, spc.id)
                os.makedirs(spc_name_path, exist_ok=True)
                for chrom_id in CHROM_IDS:
                    chrom_exons = exons[np.where(exons.seqid == chrom_id)]
                    # Chromosomes without matching rows get no output file.
                    if len(chrom_exons) == 0:
                        continue
                    intervals = gff_recarray_to_stdpopsim_intervals(
                        chrom_exons)
                    # double check that the intervals can be used in stdpopsim
                    stdpopsim.utils.check_intervals_validity(intervals)
                    out_file = os.path.join(
                        spc_name_path, an.file_pattern.format(id=chrom_id))
                    np.savetxt(out_file, intervals, fmt="%d")
                # Pack the species directory into one tarball for this
                # annotation.
                tf = spc_name_path + f"/{an.id}.tar.gz"
                make_tarfile(tf, spc_name_path, "")
                logger.info("made tarball at " + spc_name_path)
                # Remove the *.txt files from the directory.
                # NOTE(review): presumably these are the interval files
                # written above; confirm an.file_pattern yields .txt names.
                for f in glob.glob(spc_name_path + "/*.txt"):
                    logger.info("removing " + f)
                    os.remove(f)
def stdpopsim_cli_parser():
    """
    Assemble and return the argparse parser for the stdpopsim command line.

    The parser has a version option, a counted verbosity option, and one
    required subcommand; a subcommand is registered for each species through
    ``add_simulate_species_parser``.
    """
    # TODO: this hierarchical CLI definition is clumsy, but it's the best
    # I could figure out. It can definitely be improved!
    parser = argparse.ArgumentParser(
        description="Command line interface for stdpopsim.")
    parser.add_argument(
        "-V", "--version", action='version',
        version=f"%(prog)s {stdpopsim.__version__}")
    parser.add_argument(
        "-v", "--verbosity", action='count', default=0,
        help="Increase the verbosity")

    # A subcommand must always be given.
    commands = parser.add_subparsers(dest="subcommand")
    commands.required = True
    for species in stdpopsim.all_species():
        add_simulate_species_parser(commands, species)

    return parser
def test_defaults(self):
    """
    Run the download with an empty argument string.

    The total number of genetic maps over all species must be positive and
    is passed to ``run_download`` (presumably as the expected number of
    downloads).
    """
    maps_per_species = [
        len(species.genetic_maps) for species in stdpopsim.all_species()
    ]
    total_maps = sum(maps_per_species)
    assert total_maps > 0
    self.run_download("", total_maps)
# NOTE(review): the three assertions below are the tail of a test method
# whose beginning lies outside this excerpt; ts, mut_info and num_nonneutral
# are defined there.
assert ts.num_populations == self.model.num_populations
assert len(mut_info.keys()) > 0  # number of mutations
assert num_nonneutral > 0  # nonneutral mutations


@pytest.mark.skipif(IS_WINDOWS, reason="SLiM not available on windows")
class CatalogDFEModelTestMixin(DFEModelTestMixin):
    """
    Mixin for DFEs in the catalog.
    """

    def test_id_valid(self):
        # The DFE attached to the generated test class must have a valid ID.
        assert utils.is_valid_dfe_id(self.dfe.id)


# Build one test class per DFE of every species. Each class pairs the DFE
# with a fresh PiecewiseConstantSize(1000) model through class attributes.
qc_test_classes = []
for species in stdpopsim.all_species():
    for dfe in species.dfes:
        model = stdpopsim.PiecewiseConstantSize(1000)
        superclasses = []
        superclasses.append(CatalogDFEModelTestMixin)
        classname = f"Test{species.id}{model.id}{dfe.id}"
        cls = type(classname, tuple(superclasses), dict(model=model, dfe=dfe))
        qc_test_classes.append(cls)
# Basic sanity checks to double check that no errors get introduced
# that lead to these qc tests being skipped silently.
assert len(qc_test_classes) > 0
for cls in qc_test_classes:
    assert issubclass(cls, DFEModelTestMixin)
    # Insert the class into the current test module's namespace.
    setattr(sys.modules[__name__], cls.__name__, cls)
def test_all_species_model_help(self):
    """
    Run ``<species id> --help-models`` through the CLI for every species.
    """
    for species in stdpopsim.all_species():
        # Address the subcommand by the species ID. Interpolating the Species
        # object itself would pass its string representation to the CLI.
        self.run_stdpopsim(f"{species.id} --help-models")
def test_all_species_genetic_maps_help(self):
    """
    Run ``<species id> --help-genetic-maps`` through the CLI for every
    species.
    """
    for species in stdpopsim.all_species():
        # Address the subcommand by the species ID. Interpolating the Species
        # object itself would pass its string representation to the CLI.
        self.run_stdpopsim(f"{species.id} --help-genetic-maps")
def test_str(self):
    """
    The string form of every species' genome must be a non-empty ``str``.
    """
    for species in stdpopsim.all_species():
        genome = species.genome
        rendered = str(genome)
        assert isinstance(rendered, str)
        assert len(rendered) > 0
assert not (citation.doi in log_output)


# The following two tests use the "caplog" pytest fixture, which captures
# the logging output. The caplog param is automatically passed to test_*()
# methods by pytest, which we pass through to
# verify_noQC_citations_not_written().
@pytest.mark.filterwarnings("ignore::stdpopsim.QCMissingWarning")
@pytest.mark.usefixtures("caplog")
def test_noQC_citations_not_written(self, caplog):
    self.verify_noQC_citations_not_written(
        "EscCol -d FakeModel -D 10 -L 10", caplog
    )


@pytest.mark.filterwarnings("ignore::stdpopsim.QCMissingWarning")
@pytest.mark.usefixtures("caplog")
def test_noQC_citations_not_written_verbose(self, caplog):
    self.verify_noQC_citations_not_written(
        "-vv EscCol -d FakeModel -D 10 -L 10", caplog
    )


@pytest.mark.parametrize(
    "species_id", [species.id for species in stdpopsim.all_species()]
)
def test_species_simulation(species_id):
    """
    Smoke test: run a small simulation for ``species_id`` through the CLI.

    The test passes when ``stdpopsim_main`` returns without raising; whatever
    it writes to the binary stdout buffer is discarded.
    """
    cmd = f"-q {species_id} -L 1 --seed 1234 10"
    # Just check to see if the simulation runs. The devnull handle is managed
    # by a ``with`` block so it is closed even if the simulation fails.
    with open(os.devnull, "wb") as devnull:
        with mock.patch("sys.stdout", autospec=True) as stdout:
            stdout.buffer = devnull
            stdpopsim.cli.stdpopsim_main(cmd.split())
import allel
import zarr
import numpy as np
import stdpopsim as stp
import logging
import warnings
import urllib.error
import urllib.request
import os

logger = logging.getLogger(__name__)

# make root directory for zarr annotations
annot_path = "annotations"
# exist_ok so that re-running the script does not fail on an existing directory
os.makedirs(annot_path, exist_ok=True)

# loop through species and download
for spc in stp.all_species():
    if spc.annotations:
        # Only the first annotation of each species is processed here.
        address = spc.annotations[0].url
        # The second dot-separated field of the file name is used as the
        # genome version in the output file name.
        genome_version = os.path.basename(address).split(".")[1]
        logger.info(f"Downloading GFF file {spc.id}")
        tmp_path = f"{spc.id}.tmp.gff.gz"
        try:
            x, y = urllib.request.urlretrieve(address, tmp_path)
        except (urllib.error.URLError, FileNotFoundError) as err:
            # urlretrieve reports unreachable or missing URLs as URLError.
            # Without the download there is nothing to convert, so skip this
            # species rather than fail on the missing temporary file below.
            warnings.warn(f"can't connect to url {address}: {err}")
            continue
        logger.info(f"creating zarr arrays {spc.id}")
        # create zarr store and zarr root
        spc_path = os.path.join(annot_path, spc.id + "." + genome_version + ".zip")
        store = zarr.ZipStore(spc_path)
        root = zarr.group(store=store, overwrite=True)
        x = allel.gff3_to_dataframe(tmp_path)
def main():
    """
    Call ``write_ensembl_release`` once, then ``write_genome_data`` for every
    species in stdpopsim, all on a single ``DataWriter``.
    """
    data_writer = DataWriter()
    data_writer.write_ensembl_release()
    for entry in stdpopsim.all_species():
        data_writer.write_genome_data(entry)
def stdpopsim_cli_parser():
    """
    Assemble and return the argparse parser for the stdpopsim command line.

    The parser carries the global options (version, verbosity, cache
    directory, engine), one argument group per simulation engine, one
    subcommand per species and the ``download-genetic-maps`` subcommand.
    A subcommand is required.
    """
    # TODO: this hierarchical CLI definition is clumsy, but it's the best
    # I could figure out. It can definitely be improved!
    parser = argparse.ArgumentParser(
        description="Command line interface for stdpopsim.")

    # Global options.
    parser.add_argument(
        "-V", "--version", action='version',
        version=f"%(prog)s {stdpopsim.__version__}")
    parser.add_argument(
        "-v", "--verbosity", action='count', default=0,
        help="Increase the verbosity")
    cache_dir_help = (
        "Set the cache directory to the specified value. "
        "Note that this can also be set using the environment variable "
        "STDPOPSIM_CACHE. If both the environment variable and this "
        "option are set, the option takes precedence. "
        f"Default: {stdpopsim.get_cache_dir()}")
    parser.add_argument(
        "-c", "--cache-dir", type=str, default=None, help=cache_dir_help)
    default_engine_id = stdpopsim.get_default_engine().id
    engine_ids = [engine.id for engine in stdpopsim.all_engines()]
    parser.add_argument(
        "-e", "--engine", default=default_engine_id, choices=engine_ids,
        help="Specify a simulation engine.")

    # Every engine contributes its own options in a dedicated group.
    for engine in stdpopsim.all_engines():
        engine_group = parser.add_argument_group(
            f"{engine.id} specific parameters")
        engine.add_arguments(engine_group)

    # Subcommands: one per species, plus the genetic map downloader.
    commands = parser.add_subparsers(dest="subcommand")
    commands.required = True
    for species in stdpopsim.all_species():
        add_simulate_species_parser(commands, species)

    download_description = (
        "Download genetic maps and store them in the cache directory. "
        "Maps are downloaded regardless of whether they are already "
        "in the cache or not. Please use the --cache-dir option to "
        "download maps to a specific directory. ")
    downloader = commands.add_parser(
        "download-genetic-maps",
        help="Download genetic maps",
        description=download_description)
    downloader.add_argument(
        "species", nargs="?",
        help=("Download genetic maps for this species. If not specified "
              "download all known genetic maps."))
    downloader.add_argument(
        "genetic_maps", type=str, nargs="*",
        help=("If specified, download these genetic maps. If no maps "
              "are provided, download all maps for this species."))
    downloader.set_defaults(runner=run_download_genetic_maps)

    return parser
def test_all_species_annots_help(self):
    """
    Run ``<species id> --help-annotations`` through the CLI for every
    species.
    """
    for species in stdpopsim.all_species():
        # Address the subcommand by the species ID. Interpolating the Species
        # object itself would pass its string representation to the CLI.
        self.run_stdpopsim(f"{species.id} --help-annotations")
def test_str(self):
    """
    The string form of every species' genome must be a non-empty ``str``.
    """
    for species in stdpopsim.all_species():
        genome = species.genome
        rendered = str(genome)
        self.assertIsInstance(rendered, str)
        self.assertGreater(len(rendered), 0)
def stdpopsim_cli_parser():
    """
    Build and return the top-level argparse parser for the stdpopsim CLI.

    The parser defines the global options (version, verbosity, cache
    directory, engine), an msprime option group, a SLiM option group (only
    when not on Windows), one subcommand per species and the
    ``download-genetic-maps`` subcommand. A subcommand is required.
    """
    # TODO: the CLI defined by this is hierarchical and clumsy, but it's the
    # best I could figure out. It can definitely be improved!
    top_parser = argparse.ArgumentParser(
        description="Command line interface for stdpopsim.")
    top_parser.add_argument(
        "-V", "--version", action='version',
        version='%(prog)s {}'.format(stdpopsim.__version__))
    # Verbosity is counted and starts at 1, not 0.
    top_parser.add_argument(
        "-v", "--verbosity", action='count', default=1,
        help="Increase the verbosity")
    top_parser.add_argument(
        "-c", "--cache-dir", type=str, default=None,
        help=("Set the cache directory to the specified value. "
              "Note that this can also be set using the environment variable "
              "STDPOPSIM_CACHE. If both the environment variable and this "
              "option are set, the option takes precedence. "
              f"Default: {stdpopsim.get_cache_dir()}"))
    top_parser.add_argument(
        "-e", "--engine", default=stdpopsim.get_default_engine().id,
        choices=[e.id for e in stdpopsim.all_engines()],
        help="Specify a simulation engine.")

    # msprime options; the first supported model is the default.
    supported_models = stdpopsim.get_engine("msprime").supported_models
    msprime_parser = top_parser.add_argument_group(
        "msprime specific parameters")
    msprime_parser.add_argument(
        "--msprime-model", default=supported_models[0],
        choices=supported_models,
        help="Specify the simulation model used by msprime. "
             "See msprime API documentation for details.")

    def time_or_model(arg, _arg_is_time=[True, ], parser=top_parser):
        """
        Type converter for the two values of ``--msprime-change-model``.

        The mutable default ``_arg_is_time`` is deliberate: it persists
        between calls and is flipped after each successful conversion, so
        successive calls alternate between parsing a time (converted to
        float) and validating a model name. This relies on the converter
        being called on the T and MODEL values in order.
        """
        if _arg_is_time[0]:
            try:
                arg = float(arg)
            except ValueError:
                parser.error(f"`{arg}' is not a number")
        else:
            if arg not in supported_models:
                parser.error(f"`{arg}' is not a supported model")
        # Flip the expectation for the next value.
        _arg_is_time[0] = not _arg_is_time[0]
        return arg

    msprime_parser.add_argument(
        "--msprime-change-model", metavar=("T", "MODEL"), type=time_or_model,
        default=[], action="append", nargs=2,
        help="Change to the specified simulation MODEL at generation T. "
             "This option may provided multiple times.")

    # SLiM is not available for windows.
    if not IS_WINDOWS:
        def slim_exec(path):
            # Hack to set the SLIM environment variable at parse time,
            # before get_version() can be called.
            os.environ["SLIM"] = path
            return path

        slim_parser = top_parser.add_argument_group("SLiM specific parameters")
        slim_parser.add_argument(
            "--slim-path", metavar="PATH", type=slim_exec, default=None,
            help="Full path to `slim' executable.")
        slim_parser.add_argument(
            "--slim-script", action="store_true", default=False,
            help="Write script to stdout and exit without running SLiM.")
        slim_parser.add_argument(
            "--slim-scaling-factor", metavar="Q", default=1, type=float,
            help="Rescale model parameters by Q to speed up simulation. "
                 "See SLiM manual: `5.5 Rescaling population sizes to "
                 "improve simulation performance`. "
                 "[default=%(default)s].")
        slim_parser.add_argument(
            "--slim-burn-in", metavar="X", default=10, type=float,
            help="Length of the burn-in phase, in units of N generations "
                 "[default=%(default)s].")

    # Subcommands: one per species, plus the genetic map downloader.
    subparsers = top_parser.add_subparsers(dest="subcommand")
    subparsers.required = True
    for species in stdpopsim.all_species():
        add_simulate_species_parser(subparsers, species)

    download_maps_parser = subparsers.add_parser(
        "download-genetic-maps",
        help="Download genetic maps",
        description=(
            "Download genetic maps and store them in the cache directory. "
            "Maps are downloaded regardless of whether they are already "
            "in the cache or not. Please use the --cache-dir option to "
            "download maps to a specific directory. "))
    download_maps_parser.add_argument(
        "species", nargs="?",
        help=("Download genetic maps for this species. If not specified "
              "download all known genetic maps."))
    download_maps_parser.add_argument(
        "genetic_maps", type=str, nargs="*",
        help=("If specified, download these genetic maps. If no maps "
              "are provided, download all maps for this species."))
    # The parsed namespace carries the function that runs this subcommand.
    download_maps_parser.set_defaults(runner=run_download_genetic_maps)

    return top_parser
# shouldn't be offered to the user.
with self.assertLogs() as logs:
    out, err = capture_output(stdpopsim.cli.stdpopsim_main, cmd.split())
log_output = "\n".join(logs.output)
for citation in self.model.citations:
    self.assertFalse(citation.author in out)
    self.assertFalse(citation.doi in out)
    self.assertFalse(citation.author in err)
    self.assertFalse(citation.doi in err)
    self.assertFalse(citation.author in log_output)
    self.assertFalse(citation.doi in log_output)


def test_noQC_citations_not_written(self):
    self.verify_noQC_citations_not_written(
        "EscCol -d FakeModel -D 10 -L 10")


def test_noQC_citations_not_written_verbose(self):
    self.verify_noQC_citations_not_written(
        "-vv EscCol -d FakeModel -D 10 -L 10")


@pytest.mark.parametrize("species_id",
                         [species.id for species in stdpopsim.all_species()])
def test_species_simulation(species_id):
    """
    Smoke test: run a small simulation for ``species_id`` through the CLI.

    The test passes when ``stdpopsim_main`` returns without raising; whatever
    it writes to the binary stdout buffer is discarded.
    """
    cmd = f"-q {species_id} -L 1 --seed 1234 10"
    # Just check to see if the simulation runs. The devnull handle is managed
    # by a ``with`` block so it is closed even if the simulation fails.
    with open(os.devnull, "wb") as devnull:
        with mock.patch("sys.stdout", autospec=True) as stdout:
            stdout.buffer = devnull
            stdpopsim.cli.stdpopsim_main(cmd.split())