Example #1
0
def main():
    """
    Command-line entry point for refreshing genome data from Ensembl.

    With ``--list-species`` the Ensembl id of every stdpopsim species is
    printed and nothing is written. Otherwise the Ensembl release is written
    first, followed by the genome data of each selected species: the ids
    given on the command line (lower-cased), or every species when none are
    given.
    """
    cli = argparse.ArgumentParser(
        description='Query ensembl API for chromosome lengths')
    cli.add_argument(
        'species',
        type=str,
        nargs='*',
        help="""Ensembl ids for species to update with underscores in place of
                        spaces (e.g. homo_sapiens)""")
    cli.add_argument(
        '--list-species',
        dest='list_species',
        action='store_true',
        help='list all species defined in stdpopsim')
    options = cli.parse_args()

    # Listing mode: print the known ids and stop without writing anything.
    if options.list_species:
        for known in stdpopsim.all_species():
            print(known.ensembl_id)
        return

    # Explicit ids win; an empty positional list means "all species".
    if options.species:
        targets = [name.lower() for name in options.species]
    else:
        targets = [known.ensembl_id for known in stdpopsim.all_species()]

    writer = DataWriter()
    writer.write_ensembl_release()
    for ensembl_id in targets:
        writer.write_genome_data(ensembl_id)
Example #2
0
 def test_params_match_docs_tables(self):
     """
     Compare every QC'd demographic model with its CSV parameter table.

     The table is read from ``./docs/parameter_tables/<species>/<model>.csv``.
     Rows whose first column starts with "Generation time" or
     "Mutation rate" supply the expected values; a missing generation
     time row means the model is expected to use 1.
     """
     for species in stdpopsim.all_species():
         for model in species.demographic_models:
             table_path = pathlib.Path(
                 os.path.join(
                     "./docs/parameter_tables", species.id, model.id + ".csv"
                 )
             )
             # Models without a QC counterpart are not checked.
             if model.qc_model is None:
                 continue
             assert table_path.exists()
             with open(table_path) as csv_file:
                 rows = list(csv.reader(csv_file))
             generation_time = None
             mutation_rate = None
             for row in rows:
                 label = row[0]
                 if label.startswith("Generation time"):
                     generation_time = float(row[1])
                 if label.startswith("Mutation rate"):
                     mutation_rate = float(row[1])
             assert model.mutation_rate == mutation_rate
             # default is 1 if unspecified
             expected_generation_time = (
                 1 if generation_time is None else generation_time
             )
             assert model.generation_time == expected_generation_time
Example #3
0
def list_species():
    """
    List species in stdpopsim with their Ensembl IDs.
    """
    # NOTE(review): the docstring is kept verbatim — presumably it doubles as
    # the help text of a click command; confirm before rewording it.
    header = "ID       Ensembl ID"
    click.echo(header)
    for entry in stdpopsim.all_species():
        row = f"{entry.id}   {entry.ensembl_id}"
        click.echo(row)
Example #4
0
def run_download_genetic_maps(args):
    """
    Download the genetic maps selected by the parsed arguments.

    ``args.species`` names a single species, or is None to cover every
    species. ``args.genetic_maps`` lists the map ids to fetch; when it is
    empty, all genetic maps of the species are downloaded.
    """
    if args.species is None:
        species_ids = [entry.id for entry in stdpopsim.all_species()]
    else:
        species_ids = [args.species]

    for species_id in species_ids:
        species = get_species_wrapper(species_id)
        wanted_maps = args.genetic_maps
        if len(wanted_maps) == 0:
            # Nothing requested explicitly: take every map of this species.
            wanted_maps = [gmap.id for gmap in species.genetic_maps]
        for map_id in wanted_maps:
            get_genetic_map_wrapper(species, map_id).download()
Example #5
0
def update_genome_data(species):
    """
    Update the species genome data from Ensembl. Ensembl IDs for
    species can optionally be provided, e.g.

    update-genome-data homo_sapiens

    will update the genome data for humans. Multiple species can
    be specified. By default all species are updated.
    """
    # NOTE(review): docstring kept verbatim — it reads like command help
    # text, so it may be shown to users; confirm before rewording.
    embl_ids = (
        [entry.ensembl_id for entry in stdpopsim.all_species()]
        if len(species) == 0
        else [name.lower() for name in species]
    )
    writer = DataWriter()
    for ensembl_id in embl_ids:
        writer.write_genome_data(ensembl_id)
    # The Ensembl release is recorded after all genome data has been written.
    writer.write_ensembl_release()
Example #6
0
def download_process_annotations():
    """
    Download and post-process the annotations of every species.

    For each annotation, the GFF records are fetched and narrowed to the rows
    matching the annotation's source and type. The matching rows are then
    converted, chromosome by chromosome, into interval files under
    ``<annot_path>/<species id>``. That directory is packed into
    ``<annotation id>.tar.gz`` and any ``*.txt`` files in it are removed.
    """
    for spc in stdpopsim.all_species():
        # Species without annotations have nothing to process.
        if not spc.annotations:
            continue
        for annot in spc.annotations:
            chrom_ids = [chrom.id for chrom in spc.genome.chromosomes]
            logger.info(f"Downloading GFF file {annot.id}")
            gff = get_gff_recarray(annot.url, annot.gff_sha256)
            logger.info(f"extracting annotations {annot.id}")
            matches_annotation = np.logical_and(
                gff.source == annot.annotation_source,
                gff.type == annot.annotation_type,
            )
            exons = gff[np.where(matches_annotation)]
            logger.info(f"merging overlapping regions {annot.id}")
            # Per-species output directory, created on demand.
            species_dir = os.path.join(annot_path, spc.id)
            os.makedirs(species_dir, exist_ok=True)
            for chrom_id in chrom_ids:
                on_chrom = exons[np.where(exons.seqid == chrom_id)]
                # Chromosomes with no matching rows produce no file.
                if len(on_chrom) == 0:
                    continue
                intervals = gff_recarray_to_stdpopsim_intervals(on_chrom)
                # Double check that the intervals can be used in stdpopsim.
                stdpopsim.utils.check_intervals_validity(intervals)
                file_name = annot.file_pattern.format(id=chrom_id)
                out_file = os.path.join(species_dir, file_name)
                np.savetxt(out_file, intervals, fmt="%d")
            tarball = species_dir + f"/{annot.id}.tar.gz"
            make_tarfile(tarball, species_dir, "")
            logger.info("made tarball at " + species_dir)
            # Remove the loose text files now that the tarball exists.
            for leftover in glob.glob(species_dir + "/*.txt"):
                logger.info("removing " + leftover)
                os.remove(leftover)
Example #7
0
def stdpopsim_cli_parser():
    """
    Build and return the top-level argparse parser for the stdpopsim CLI.

    The parser accepts ``-V/--version`` and a countable ``-v/--verbosity``
    flag, and requires a subcommand. Subcommands are registered by calling
    ``add_simulate_species_parser`` once per species.
    """
    # TODO: the CLI defined here is hierarchical and clumsy; it can
    # definitely be improved.
    top_parser = argparse.ArgumentParser(
        description="Command line interface for stdpopsim.")

    version_text = '%(prog)s {}'.format(stdpopsim.__version__)
    top_parser.add_argument(
        "-V", "--version", action='version', version=version_text)
    top_parser.add_argument(
        "-v", "--verbosity",
        action='count',
        default=0,
        help="Increase the verbosity")

    # A subcommand must always be given.
    subparsers = top_parser.add_subparsers(dest="subcommand")
    subparsers.required = True
    for entry in stdpopsim.all_species():
        add_simulate_species_parser(subparsers, entry)

    return top_parser
Example #8
0
 def test_defaults(self):
     # Total number of genetic maps across all species; this count is what
     # run_download is given alongside an empty argument string.
     total_maps = 0
     for species in stdpopsim.all_species():
         total_maps += len(species.genetic_maps)
     assert total_maps > 0
     self.run_download("", total_maps)
Example #9
0
        assert ts.num_populations == self.model.num_populations
        assert len(mut_info.keys()) > 0  # number of mutations
        assert num_nonneutral > 0  # nonneutral mutations


@pytest.mark.skipif(IS_WINDOWS, reason="SLiM not available on windows")
class CatalogDFEModelTestMixin(DFEModelTestMixin):
    """
    Extra checks for test classes whose ``dfe`` attribute comes from the
    catalog.
    """
    def test_id_valid(self):
        # The DFE's id must pass the shared id validator.
        dfe_id = self.dfe.id
        assert utils.is_valid_dfe_id(dfe_id)


# Build one test class per (species, DFE) pair; each class carries the DFE
# together with a freshly created PiecewiseConstantSize(1000) model.
qc_test_classes = []
for species in stdpopsim.all_species():
    for dfe in species.dfes:
        model = stdpopsim.PiecewiseConstantSize(1000)
        superclasses = [CatalogDFEModelTestMixin]
        classname = f"Test{species.id}{model.id}{dfe.id}"
        cls = type(
            classname,
            tuple(superclasses),
            {"model": model, "dfe": dfe},
        )
        qc_test_classes.append(cls)

# Guard against silently generating no tests at all, and make sure every
# generated class really derives from the DFE test mixin.
assert len(qc_test_classes) > 0
for cls in qc_test_classes:
    assert issubclass(cls, DFEModelTestMixin)
    # Publish the class under its own name in this module's namespace.
    setattr(sys.modules[__name__], cls.__name__, cls)
Example #10
0
 def test_all_species_model_help(self):
     # Run the --help-models invocation once per species.
     for species in stdpopsim.all_species():
         cmd = f"{species} --help-models"
         self.run_stdpopsim(cmd)
Example #11
0
 def test_all_species_genetic_maps_help(self):
     # Run the --help-genetic-maps invocation once per species.
     for species in stdpopsim.all_species():
         cmd = f"{species} --help-genetic-maps"
         self.run_stdpopsim(cmd)
Example #12
0
 def test_str(self):
     # str() of every species' genome must yield a non-empty string.
     for species in stdpopsim.all_species():
         rendered = str(species.genome)
         assert isinstance(rendered, str)
         assert len(rendered) > 0
Example #13
0
            assert not (citation.doi in log_output)

    # The following two tests use the "caplog" pytest fixture, which captures
    # the logging output. The caplog param is automatically passed to test_*()
    # methods by pytest, which we pass through to verify_noQC_citations_not_written().

    @pytest.mark.filterwarnings("ignore::stdpopsim.QCMissingWarning")
    @pytest.mark.usefixtures("caplog")
    def test_noQC_citations_not_written(self, caplog):
        # Default verbosity; caplog is forwarded to the shared verifier.
        cmd = "EscCol -d FakeModel -D 10 -L 10"
        self.verify_noQC_citations_not_written(cmd, caplog)

    @pytest.mark.filterwarnings("ignore::stdpopsim.QCMissingWarning")
    @pytest.mark.usefixtures("caplog")
    def test_noQC_citations_not_written_verbose(self, caplog):
        # Same command with -vv prepended; caplog is forwarded as well.
        cmd = "-vv EscCol -d FakeModel -D 10 -L 10"
        self.verify_noQC_citations_not_written(cmd, caplog)


@pytest.mark.parametrize(
    "species_id", [species.id for species in stdpopsim.all_species()]
)
def test_species_simulation(species_id):
    """
    Smoke-test a small simulation for one species through the CLI.

    The test passes as long as ``stdpopsim_main`` runs without raising;
    output written to ``sys.stdout.buffer`` is sent to the null device.

    :param species_id: id of the species to simulate, supplied by the
        parametrization over all species.
    """
    cmd = f"-q {species_id} -L 1 --seed 1234 10"
    # Just check to see if the simulation runs.
    # The null-device handle is opened in a context manager so that it is
    # closed when the test ends instead of being leaked.
    with open(os.devnull, "wb") as devnull:
        with mock.patch("sys.stdout", autospec=True) as stdout:
            stdout.buffer = devnull
            stdpopsim.cli.stdpopsim_main(cmd.split())
import allel
import zarr
import numpy as np
import stdpopsim as stp
import logging
import warnings
import urllib.request
import os

# Script body: for every species that has annotations, download the GFF file
# of its first annotation and open a zip-backed zarr group for it.
# NOTE(review): the code visible here stops right after the GFF is loaded
# into a dataframe; nothing is written to the zarr group in this view.
logger = logging.getLogger(__name__)
# make root directory for zarr annotations
annot_path = "annotations"
# NOTE(review): os.mkdir raises FileExistsError when the directory already
# exists, so a second run fails here unless the directory is removed first.
os.mkdir(annot_path)
# loop through species and download
for spc in stp.all_species():
    if spc.annotations:
        # Only the first annotation of the species is used.
        address = spc.annotations[0].url
        # Second dot-separated field of the URL's file name
        # (presumably the assembly/genome version — TODO confirm).
        genome_version = os.path.basename(address).split(".")[1]
        logger.info(f"Downloading GFF file {spc.id}")
        tmp_path = f"{spc.id}.tmp.gff.gz"
        try:
            x, y = urllib.request.urlretrieve(address, tmp_path)
        except FileNotFoundError:
            # Only a warning is issued; execution continues below even
            # though tmp_path may not have been written.
            # NOTE(review): confirm which exception urlretrieve raises on a
            # failed connection — it may not be FileNotFoundError.
            warnings.warn("can't connnect to url")
        logger.info(f"creating zarr arrays {spc.id}")
        # create zarr store and zarr root
        spc_path = os.path.join(annot_path,
                                spc.id + "." + genome_version + ".zip")
        store = zarr.ZipStore(spc_path)
        root = zarr.group(store=store, overwrite=True)
        # Load the downloaded GFF; this rebinds x, discarding the value
        # returned by urlretrieve above.
        x = allel.gff3_to_dataframe(tmp_path)
def main():
    """
    Write the Ensembl release, then the genome data of every species.
    """
    writer = DataWriter()
    writer.write_ensembl_release()
    # Each species object itself is handed to the writer.
    for entry in stdpopsim.all_species():
        writer.write_genome_data(entry)
Example #16
0
def stdpopsim_cli_parser():
    """
    Assemble and return the argparse parser for the stdpopsim CLI.

    Top-level options cover version, verbosity, cache directory and
    simulation engine, followed by one argument group per engine that the
    engine fills in itself. A subcommand is mandatory: subcommands are
    registered via ``add_simulate_species_parser`` (called once per
    species), plus ``download-genetic-maps``, whose ``runner`` default is
    ``run_download_genetic_maps``.
    """
    # TODO: the CLI defined here is hierarchical and clumsy; it can
    # definitely be improved.
    top_parser = argparse.ArgumentParser(
        description="Command line interface for stdpopsim.")

    top_parser.add_argument(
        "-V", "--version",
        action='version',
        version='%(prog)s {}'.format(stdpopsim.__version__))
    top_parser.add_argument(
        "-v", "--verbosity",
        action='count',
        default=0,
        help="Increase the verbosity")

    cache_dir_help = (
        "Set the cache directory to the specified value. "
        "Note that this can also be set using the environment variable "
        "STDPOPSIM_CACHE. If both the environment variable and this "
        "option are set, the option takes precedence. "
        f"Default: {stdpopsim.get_cache_dir()}")
    top_parser.add_argument(
        "-c", "--cache-dir",
        type=str,
        default=None,
        help=cache_dir_help)

    top_parser.add_argument(
        "-e", "--engine",
        default=stdpopsim.get_default_engine().id,
        choices=[e.id for e in stdpopsim.all_engines()],
        help="Specify a simulation engine.")

    # Every engine contributes its own options to a dedicated group.
    for engine in stdpopsim.all_engines():
        engine_group = top_parser.add_argument_group(
            f"{engine.id} specific parameters")
        engine.add_arguments(engine_group)

    # A subcommand must always be given.
    subparsers = top_parser.add_subparsers(dest="subcommand")
    subparsers.required = True

    for entry in stdpopsim.all_species():
        add_simulate_species_parser(subparsers, entry)

    maps_description = (
        "Download genetic maps and store them in the cache directory. "
        "Maps are downloaded regardless of whether they are already "
        "in the cache or not. Please use the --cache-dir option to "
        "download maps to a specific directory. ")
    maps_parser = subparsers.add_parser(
        "download-genetic-maps",
        help="Download genetic maps",
        description=maps_description)

    species_help = (
        "Download genetic maps for this species. If not specified "
        "download all known genetic maps.")
    maps_parser.add_argument("species", nargs="?", help=species_help)

    genetic_maps_help = (
        "If specified, download these genetic maps. If no maps "
        "are provided, download all maps for this species.")
    maps_parser.add_argument(
        "genetic_maps", type=str, nargs="*", help=genetic_maps_help)

    maps_parser.set_defaults(runner=run_download_genetic_maps)

    return top_parser
Example #17
0
 def test_all_species_annots_help(self):
     # Run the --help-annotations invocation once per species.
     for species in stdpopsim.all_species():
         cmd = f"{species} --help-annotations"
         self.run_stdpopsim(cmd)
Example #18
0
 def test_str(self):
     # str() of every species' genome must yield a non-empty string.
     for species in stdpopsim.all_species():
         rendered = str(species.genome)
         self.assertIsInstance(rendered, str)
         self.assertGreater(len(rendered), 0)
Example #19
0
def stdpopsim_cli_parser():
    """
    Build and return the top-level argparse parser for the stdpopsim CLI.

    The parser defines the global options (version, verbosity, cache
    directory, engine), an msprime option group, a SLiM option group
    (skipped when ``IS_WINDOWS`` is true) and a mandatory subcommand.
    Subcommands are registered by ``add_simulate_species_parser`` (called
    once per species) plus ``download-genetic-maps``, whose ``runner``
    default is ``run_download_genetic_maps``.

    :return: the configured ``argparse.ArgumentParser``.
    """

    # TODO the CLI defined by this is hierarchical and clumsy, but it's the
    # best I could figure out. It can definitely be improved!
    top_parser = argparse.ArgumentParser(
        description="Command line interface for stdpopsim.")
    top_parser.add_argument("-V",
                            "--version",
                            action='version',
                            version='%(prog)s {}'.format(
                                stdpopsim.__version__))
    # Counted flag: the value starts at the default of 1 when -v is absent.
    top_parser.add_argument("-v",
                            "--verbosity",
                            action='count',
                            default=1,
                            help="Increase the verbosity")
    top_parser.add_argument(
        "-c",
        "--cache-dir",
        type=str,
        default=None,
        help=("Set the cache directory to the specified value. "
              "Note that this can also be set using the environment variable "
              "STDPOPSIM_CACHE. If both the environment variable and this "
              "option are set, the option takes precedence. "
              f"Default: {stdpopsim.get_cache_dir()}"))

    top_parser.add_argument("-e",
                            "--engine",
                            default=stdpopsim.get_default_engine().id,
                            choices=[e.id for e in stdpopsim.all_engines()],
                            help="Specify a simulation engine.")

    # Models accepted by the msprime engine; the first one is the default.
    supported_models = stdpopsim.get_engine("msprime").supported_models
    msprime_parser = top_parser.add_argument_group(
        "msprime specific parameters")
    msprime_parser.add_argument(
        "--msprime-model",
        default=supported_models[0],
        choices=supported_models,
        help="Specify the simulation model used by msprime. "
        "See msprime API documentation for details.")

    # Converter for the two values of --msprime-change-model. The
    # one-element list default is deliberate: it is shared between calls and
    # acts as a toggle, so successive calls alternate between treating the
    # value as a time (converted to float) and as a model name (checked
    # against supported_models). Failures are reported via parser.error.
    def time_or_model(arg, _arg_is_time=[
        True,
    ], parser=top_parser):
        if _arg_is_time[0]:
            try:
                arg = float(arg)
            except ValueError:
                parser.error(f"`{arg}' is not a number")
        else:
            if arg not in supported_models:
                parser.error(f"`{arg}' is not a supported model")
        # Flip the toggle for the next value.
        _arg_is_time[0] = not _arg_is_time[0]
        return arg

    # Takes exactly two values (T, MODEL) per use; repeated uses are
    # appended to a list.
    msprime_parser.add_argument(
        "--msprime-change-model",
        metavar=("T", "MODEL"),
        type=time_or_model,
        default=[],
        action="append",
        nargs=2,
        help="Change to the specified simulation MODEL at generation T. "
        "This option may provided multiple times.")

    # SLiM is not available for windows.
    if not IS_WINDOWS:

        # Used as the argparse ``type`` for --slim-path: returns the path
        # unchanged after exporting it through the SLIM environment variable.
        def slim_exec(path):
            # Hack to set the SLIM environment variable at parse time,
            # before get_version() can be called.
            os.environ["SLIM"] = path
            return path

        slim_parser = top_parser.add_argument_group("SLiM specific parameters")
        slim_parser.add_argument("--slim-path",
                                 metavar="PATH",
                                 type=slim_exec,
                                 default=None,
                                 help="Full path to `slim' executable.")
        slim_parser.add_argument(
            "--slim-script",
            action="store_true",
            default=False,
            help="Write script to stdout and exit without running SLiM.")
        slim_parser.add_argument(
            "--slim-scaling-factor",
            metavar="Q",
            default=1,
            type=float,
            help="Rescale model parameters by Q to speed up simulation. "
            "See SLiM manual: `5.5 Rescaling population sizes to "
            "improve simulation performance`. "
            "[default=%(default)s].")
        slim_parser.add_argument(
            "--slim-burn-in",
            metavar="X",
            default=10,
            type=float,
            help="Length of the burn-in phase, in units of N generations "
            "[default=%(default)s].")

    # A subcommand must always be given.
    subparsers = top_parser.add_subparsers(dest="subcommand")
    subparsers.required = True

    for species in stdpopsim.all_species():
        add_simulate_species_parser(subparsers, species)

    download_maps_parser = subparsers.add_parser(
        "download-genetic-maps",
        help="Download genetic maps",
        description=(
            "Download genetic maps and store them in the cache directory. "
            "Maps are downloaded regardless of whether they are already "
            "in the cache or not. Please use the --cache-dir option to "
            "download maps to a specific directory. "))
    # Optional single species, followed by zero or more genetic map ids.
    download_maps_parser.add_argument(
        "species",
        nargs="?",
        help=("Download genetic maps for this species. If not specified "
              "download all known genetic maps."))
    download_maps_parser.add_argument(
        "genetic_maps",
        type=str,
        nargs="*",
        help=("If specified, download these genetic maps. If no maps "
              "are provided, download all maps for this species."))

    # Stored on the parsed namespace as ``runner`` for this subcommand.
    download_maps_parser.set_defaults(runner=run_download_genetic_maps)

    return top_parser
Example #20
0
        # shouldn't be offered to the user.
        with self.assertLogs() as logs:
            out, err = capture_output(stdpopsim.cli.stdpopsim_main,
                                      cmd.split())
        log_output = "\n".join(logs.output)
        for citation in self.model.citations:
            self.assertFalse(citation.author in out)
            self.assertFalse(citation.doi in out)
            self.assertFalse(citation.author in err)
            self.assertFalse(citation.doi in err)
            self.assertFalse(citation.author in log_output)
            self.assertFalse(citation.doi in log_output)

    def test_noQC_citations_not_written(self):
        # Default verbosity.
        cmd = "EscCol -d FakeModel -D 10 -L 10"
        self.verify_noQC_citations_not_written(cmd)

    def test_noQC_citations_not_written_verbose(self):
        # Same command with -vv prepended.
        cmd = "-vv EscCol -d FakeModel -D 10 -L 10"
        self.verify_noQC_citations_not_written(cmd)


@pytest.mark.parametrize("species_id",
                         [species.id for species in stdpopsim.all_species()])
def test_species_simulation(species_id):
    """
    Smoke-test a small simulation for one species through the CLI.

    The test passes as long as ``stdpopsim_main`` runs without raising;
    output written to ``sys.stdout.buffer`` is sent to the null device.

    :param species_id: id of the species to simulate, supplied by the
        parametrization over all species.
    """
    cmd = f"-q {species_id} -L 1 --seed 1234 10"
    # Just check to see if the simulation runs.
    # The null-device handle is opened in a context manager so that it is
    # closed when the test ends instead of being leaked.
    with open(os.devnull, "wb") as devnull:
        with mock.patch("sys.stdout", autospec=True) as stdout:
            stdout.buffer = devnull
            stdpopsim.cli.stdpopsim_main(cmd.split())