def test_external_db_mapping_transcript(
        self, new_pipegraph, mock_download, shared_prebuild):
     """Check one ENA_FEATURE_TRANSCRIPT entry maps to the expected gene id.

     Builds Ustilago_maydis revision 33, runs the pipegraph and looks up a
     single external id in the ENA_FEATURE_TRANSCRIPT mapping.
     """
     genome = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
     ppg.run_pipegraph()
     mapping = genome.get_external_db_to_gene_id_mapping(
         "ENA_FEATURE_TRANSCRIPT")
     assert mapping["CM003155.1:CDS:9690..12623"] == {"UMAG_05624"}
 def test_same_same(self, new_pipegraph):
     """Identical arguments give the identical object within one pipegraph.

     After a new pipegraph has been started, the same arguments must
     produce a different object.
     """
     first = EnsemblGenome("Ustilago_maydis", 33)
     second = EnsemblGenome("Ustilago_maydis", 33)
     assert first is second
     # Starting over with a fresh pipegraph must not return the old object.
     new_pipegraph.new_pipegraph()
     third = EnsemblGenome("Ustilago_maydis", 33)
     assert first is not third
 def test_additional_fasta(self, mock_download, shared_prebuild):
     """Sequence U37796.1 is retrievable and comes back upper-cased.

     NOTE(review): judging by the test name, U37796.1 presumably stems from
     an additional fasta file rather than the Ensembl download - confirm.
     """
     genome = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
     ppg.run_pipegraph()
     expected = (
         "taatcgtgaattgagctaggggcgccaagttacgtggcaaaagcgggctgactggcggcgaagatgtgt"
         "tggtctgcacctgagttcacgaacctgagac"
     ).upper()
     actual = genome.get_genome_sequence("U37796.1", 0, 100)
     assert actual == expected
    def test_get_additional_gene_gtfs_land_in_df_genes(
            self, mock_download, shared_prebuild):
        """Genes and transcripts of the one additional GTF reach the tables.

        Ustilago_maydis revision 33 must report exactly one additional GTF,
        and after the pipegraph ran, "A2_pra2" / "A2_pra2.1" must be present
        in the gene / transcript index respectively.
        """
        genome = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
        extra_gtfs = genome.get_additional_gene_gtfs()
        assert len(extra_gtfs) == 1
        ppg.run_pipegraph()

        genes = genome.df_genes
        # Shown by pytest only on failure; lists the chromosomes seen.
        print(genes.chr.unique())
        assert "A2_pra2" in genes.index
        assert "A2_pra2.1" in genome.df_transcripts.index
 def test_external_db_mapping(
         self, new_pipegraph, mock_download, shared_prebuild):
     """GOA ids map to gene ids; an unknown database name raises KeyError."""
     genome = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
     ppg.run_pipegraph()
     goa_to_gene = genome.get_external_db_to_gene_id_mapping("GOA")
     assert goa_to_gene["A0A0D1CJ64"] == {"UMAG_05734"}
     # A database name that does not exist must not be silently accepted.
     with pytest.raises(KeyError):
         genome.get_external_db_to_gene_id_mapping("GOAnosuchthing")
 def test_external_db_mapping_translation(
         self, new_pipegraph, mock_download, shared_prebuild):
     """One Uniprot/SWISSPROT id resolves to a gene id and a translation id."""
     genome = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
     ppg.run_pipegraph()
     database = "Uniprot/SWISSPROT"
     by_gene = genome.get_external_db_to_gene_id_mapping(database)
     assert by_gene["P30598"] == {"UMAG_10718"}
     by_translation = genome.get_external_db_to_translation_id_mapping(
         database)
     assert by_translation["P30598"] == {"KIS66849.N"}
    def test_transcript_ids(self, mock_download, shared_prebuild):
        """At least one Ustilago gene lists more than one transcript id."""
        genome = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
        ppg.run_pipegraph()

        transcripts_per_gene = genome.df_genes.transcript_stable_ids.apply(len)
        assert (transcripts_per_gene > 1).any()
 def test_outside_of_ppg_after_download(
         self, mock_download, shared_prebuild):
     """Tables built within a pipegraph stay readable without a pipegraph.

     The gene, transcript and protein tables are sized once after a
     pipegraph run, then again from a new genome object created while
     ``ppg.util.global_pipegraph`` is None; the sizes must agree.
     """
     species = "Ashbya_gossypii"
     genome = EnsemblGenome(species, "41", prebuild_manager=shared_prebuild)
     ppg.run_pipegraph()
     gene_count = len(genome.df_genes)
     transcript_count = len(genome.df_transcripts)
     protein_count = len(genome.df_proteins)
     for count in (gene_count, transcript_count, protein_count):
         assert count > 0
     # From here on there is no global pipegraph any more.
     ppg.util.global_pipegraph = None
     genome = EnsemblGenome(species, "41", prebuild_manager=shared_prebuild)
     assert gene_count == len(genome.df_genes)
     assert transcript_count == len(genome.df_transcripts)
     assert protein_count == len(genome.df_proteins)
 def test_transcript_iterator(self, mock_download, shared_prebuild):
     """``genome.transcripts`` and ``df_transcripts`` cover the same ids."""
     genome = EnsemblGenome("Ashbya_gossypii", 41, shared_prebuild)
     ppg.run_pipegraph()
     transcript_objects = list(genome.transcripts.values())
     assert len(transcript_objects) == len(genome.df_transcripts)
     seen_ids = {entry.transcript_stable_id for entry in transcript_objects}
     assert seen_ids == set(genome.df_transcripts.index)
    def test_multiple_exon_transcripts(self, mock_download, shared_prebuild):
        """At least one Ustilago transcript consists of more than one exon."""
        genome = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
        ppg.run_pipegraph()

        transcripts = genome.df_transcripts
        # Shown by pytest only on failure; dumps the exon column.
        print(transcripts.exons)
        exons_per_transcript = transcripts.exons.apply(len)
        assert (exons_per_transcript > 1).any()
 def test_df_exons(self, mock_download, shared_prebuild):
     """``df_exons`` has as many rows as exons listed in ``df_transcripts``."""
     genome = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
     ppg.run_pipegraph()
     graph = ppg.util.global_pipegraph
     graph.test_keep_output = True
     graph.dump_runtimes("logs/runtimes.txt")
     expected_rows = sum(
         len(exons) for exons in genome.df_transcripts["exons"])
     exon_table = genome.df_exons
     assert len(exon_table) > 0
     assert len(genome.df_exons) == expected_rows
     # Looked up on the class rather than the instance, the attribute must
     # expose ``__call__``.
     assert hasattr(type(genome).df_exons, "__call__")
 def test_newest_gene_ids(
         self, new_pipegraph, mock_download, shared_prebuild):
     """``newest_stable_ids_for`` returns id sets and rejects unknown ids."""
     genome = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
     ppg.run_pipegraph()
     expectations = [
         ("UM05644", {"UMAG_05644"}),
         ("UMAG_05629", {"UMAG_05629"}),
         # These two ids are known but resolve to nothing.
         ("UM06501P0", set()),
         ("UM04933T0", set()),
     ]
     for queried_id, newest_ids in expectations:
         assert genome.newest_stable_ids_for(queried_id) == newest_ids
     with pytest.raises(KeyError):
         genome.newest_stable_ids_for("no_such_gene")
 def test_get_true_chromosomes(self, mock_download, shared_prebuild):
     """True chromosomes exclude the loose contigs of Ustilago_maydis.

     ``get_true_chromosomes`` must yield chromosomes "1".."23" plus
     "U37796.1", while ``get_chromosome_lengths`` must additionally cover
     four ``um_scaf_contig_*`` sequences. Both are compared as sets, so
     order is irrelevant.
     """
     # we need something with contigs and chromosomes
     genome = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
     ppg.run_pipegraph()
     true_chromosomes = [str(number) for number in range(1, 24)]
     # it is a true chromosome if there's chromosome: in the fasta description
     true_chromosomes.append("U37796.1")
     contigs_only = [
         "um_scaf_contig_1.256",
         "um_scaf_contig_1.265",
         "um_scaf_contig_1.271",
         "um_scaf_contig_1.264",
     ]
     assert set(genome.get_true_chromosomes()) == set(true_chromosomes)
     assert set(genome.get_chromosome_lengths()) == set(
         true_chromosomes + contigs_only)
 def test_get_external_dbs(
         self, new_pipegraph, mock_download, shared_prebuild):
     """``get_external_dbs`` returns exactly this list, in this order."""
     expected_dbs = [
         "BRENDA",
         "BioCyc",
         "ChEMBL",
         "ENA_FEATURE_GENE",
         "ENA_FEATURE_PROTEIN",
         "ENA_FEATURE_TRANSCRIPT",
         "ENA_GENE",
         "Ensembl_Fungi",
         "GO",
         "GOA",
         "IntAct",
         "IntEnz",
         "Interpro",
         "KEGG",
         "KEGG_Enzyme",
         "MEROPS",
         "MINT",
         "MetaCyc",
         "NCBI_TAXONOMY",
         "PHI",
         "PHIE",
         "PHIP",
         "PRIDE",
         "PUBMED",
         "PeroxiBase",
         "Reactome",
         "SWISS_MODEL",
         "UniParc",
         "UniPathway",
         "Uniprot/SPTREMBL",
         "Uniprot/SWISSPROT",
         "protein_id",
     ]
     genome = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
     ppg.run_pipegraph()
     assert genome.get_external_dbs() == expected_dbs
    def test_transcript_sequence(self, shared_prebuild):
        """Compare the ``mrna`` of two Ustilago transcripts to fixed sequences.

        Checks transcripts KIS71708 and KIS71709 against hard-coded
        nucleotide strings, and that ``mrna`` equals ``cdna`` for KIS71709.

        NOTE(review): unlike most sibling tests this one does not request
        the ``mock_download`` fixture - confirm that is intentional.
        """
        g = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
        ppg.run_pipegraph()

        # Expected sequence is one string built from adjacent literals.
        assert g.transcripts["KIS71708"].mrna == (
            "ATGAACGTCAAGCTTGCGCCGCCCGACGAGATGAACGGCGAGATCATTGCCATCCTCATT"
            "CTCTTCAACTTTCATTGCCGCGTTCAACTTAATGACGAGGCACAAGCCGCTCGCAAAAAG"
            "TTGCTCCTCTTCCTCATGGACAAAATTTACCAGACACGCGCACCTGCGCCTTCGTATGCA"
            "GCATTTGCAGACGAGCTCGGGGCCGCGATGGAAGGAGACGAGGACAATCACCGTATCATG"
            "ACCGACTACCTCGAGACCATGCTCGATCTGCTCCATGTTCCCGATGGTTTGACCAAGCTG"
            "TTCAACGAAAAGCTGAACAGAATCCTGCCTAGCTACGAGCCCATGGGATTGCTCAATGCC"
            "ACCGACATCTTCTTCGAGCGAAGGTCCTTCTTCGGCCTCTTTTTCCGTCGCATCAAGTTG"
            "ATCTTTGACAGTCTGGATTTGCAAACAAGGGATCACTTGACAATAGCTGCGCGCGCATGG"
            "AAAGAGGGACAGGCTTTCGACTTGAATGACAGCGAGCTCTCCGGTATCGACTCGGCGCAT"
            "TTGCTCGATGCAAGGCTTGGCGCGTTCCGAGACTACCAGCTTGGCTTACTCCGCGGCGAC"
            "TATACCATGGCAAAGGACAACATGGAAAGGTTCTTTGACTTTTACGCTCCTGGCGCTGAC"
            "CGCGAGCTTCATCAACACACGCTTCTTCATCTAGCCGCTTTCCATGTCAGGACTGAGAGC"
            "TTCTGCGCAGCCAAAGCAGCCTTGGACGAGGCTATAAGCCTTGCGCGGTCGGCAAACGAC"
            "AGCGAGTGCATCTCTGCCTGCGAGAGCCTCATGCAATACATCCAAGGTGTGGGCACTAGC"
            "ACCCTCGCTTCTGTTCCGGGAGCAGCCAATGTCTCGACGAACGAGAGGCAACGCCGACCT"
            "ATCTATGACGCGGTGTGGCAGACCCGTTGCAGCCTAGCTAAGGGCCGCTCAGCAATCGAG"
            "ACACTTCAAGACTTGGAGGATTGTGCTGCACCTTCTCAGCCTTCTAGGGACTCGTTGGCG"
            "GCCAGCGAGGCCTCCTTGCAGCTCATTCAGGATGCCAAACGACGTCTCGGCAGGGATACA"
            "TTTCCAAGCGACGGTGAAGTCGCGCGTCTTTGGGATACCTTGGGCCAGCCAGCGCTTGCT"
            "GACGTCTATCGGAATCGGAACATTGCAGGTGCAAATGGTCGAGCACGCTCAGCTTTGCAA"
            "GAAGAAAGCCGAATCGATTGCATCTGTCACAAAGCCAAAACG"
            "CTTGCACGTGCGGGAGAGTACGAAGCGGCGCTCAGCTTGCTGGTCTCACCGGCCACTTTC"
            "GAAGCGATCTCATTCTGCGAGTACACGATCTGGCACAGGGCAATCGCTGAGGTCCTGCGC"
            "TTACGAGCAACGCGCAGACAAGATGTAGCGACCTTGCAATTGTTGGCAGAGAGCCTGCCG"
            "GGTTCCGACCAGGCTCATGTCGATCGCGACGTTGAAGATGCTGTCGATTCGCCGAGCGCT"
            "TTGGTCGAGCTCGCGTTGCGCTGCCTCAAGTCCGGAAAATCGAGTGCGACTGAGAAGAAT"
            "TTCGGATTGCGATTCAAAGAGTACAGCCGACTTACAGCAGAGCAAGTGGCAGAAGCGCTG"
            "CTGAACAAAGCTGCGATAAGGATGAAACGTGGTCGACCGATCCTCGCACTGATGCCTACA"
            "CTGGCAAGTCTATCGATTGCCAAGGACATGGAGTGCAATCGGTTGATCCTCAATGCAAGA"
            "GTTCAGCTAGCTGAGGCATTGGGGCTTCAGCTCAAGATGCCAGATGGAGCACGGCTGCTC"
            "TTGGAATCGGATCTGCCCAACTGCCTGTCGAGTGATGATGTCGAGCTGAGAGCGCGCGCG"
            "AAGTGGACGTATGCACGGATGCTGCTCTCATGCTCGGACAAGCAAGAGCGCGAAGATCTG"
            "ACCAAGGTGCTCTACTGGTTGCGAGAAGCCGAAAGAG"
            "ACGCACAACAGGCCGAATGCCTCGAGCTTCACACGCAGATTCTCTACTATATGTTGCGGC"
            "TGCACCACCACCTGGGCGATGACAGAGAAACAATCTCTGTGACAGCTCGTTTAGATACAG"
            "TAGAGCGCGCTTGGACTCGCTTGGATGCTTCGCAGGATCAAGCCCATCTGCAACAGGTTC"
            "GCCAAATCCTAGATATCGTTGTCTCTGTCGCAGGTTATGTGGCTAGCGGAGAAGCTGCGA"
            "ACAAGCGTCTTGAAATGGTCTAG")

        # Second, much shorter transcript.
        assert (g.transcripts["KIS71709"].mrna == "ATGGCATTTTCAGAAGATACCAAG"
                "GAGCGAATCATTAAGGCGGTCGATGTCTCCAAGACTTTGTTGCATTACGGCTG"
                "GGTGCCTTTCGTTCTTTACATCGGCTTCACCCGAAGCACGCCCCAGCCTAGCTTGATCAA"
                "GCTCATCAGTCCTCTCGCATGA")
        # For this transcript the mrna and cdna attributes must be equal.
        assert g.transcripts["KIS71709"].mrna == g.transcripts["KIS71709"].cdna
    def test_all_transcripts(self, mock_download, shared_prebuild):
        """Spot-check size, dtypes and one row of ``df_transcripts``."""
        genome = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
        ppg.run_pipegraph()

        transcripts = genome.df_transcripts
        assert "gene_stable_id" in transcripts.columns
        # the extra 4 come from the a2 locus
        assert len(transcripts) == 6928 + 4
        for column in ("chr", "biotype"):
            assert transcripts[column].dtype.name == "category"

        # All remaining checks look at the same transcript.
        row = transcripts.loc["KIS71021"]
        assert row.chr == "2"
        assert row.strand == 1
        assert row.start == 354_742
        assert row.stop == 356_690
        assert row.gene_stable_id == "UMAG_12118"
        assert row.biotype == "protein_coding"
        assert row.exons == ((354_742, 354_936), (355_222, 356_690))
        assert row.exon_stable_ids == ("KIS71021-1", "KIS71021-2")
    def test_all_genes(self, mock_download, shared_prebuild):
        """Spot-check size, caching, dtypes and three rows of ``df_genes``."""
        genome = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
        ppg.run_pipegraph()

        genes = genome.df_genes
        genes_again = genome.df_genes
        # the extra 4 come from the a2 locus
        assert len(genes) == 6910 + 4
        # Repeated access must hand out the very same (cached) object.
        assert genes is genes_again

        forward_gene = genes.loc["UMAG_12015"]
        assert forward_gene.strand == 1
        assert forward_gene.tss == 4370
        assert forward_gene.tes == 5366
        assert forward_gene.transcript_stable_ids == ("KIS66832", )

        # On the reverse strand tss is the larger coordinate.
        reverse_gene = genes.loc["UMAG_00663"]
        assert reverse_gene.strand == -1
        assert reverse_gene.tss == 1_947_590
        assert reverse_gene.tes == 1_945_040
        assert reverse_gene.transcript_stable_ids == ("KIS72250", )

        two_transcript_gene = genes.loc["UMAG_03168"]
        assert two_transcript_gene.transcript_stable_ids == (
            "KIS68597", "KIS68596")
        for column in ("chr", "biotype"):
            assert genes[column].dtype.name == "category"
    def test_download(self, new_pipegraph, mock_download, shared_prebuild):
        """End-to-end check of download, prebuild lookup, indexing and access.

        First pipegraph: download Ashbya_gossypii revision "41", register an
        extra prebuild holding a truncated copy of the genome fasta, build
        Subread indices for two Subread versions, run, then verify file
        checksums, index files and the find_file / find_prebuild lookups.

        Second pipegraph: recreate the genome on a new PrebuildManager that
        points at the same prebuilt path and check index reuse across
        Subread versions, chromosome lengths, sequence accessors and gene
        metadata without running the graph again.
        """
        species = "Ashbya_gossypii"  # the smallest eukaryotic species at the time of writing this at 2.8 mb
        g = EnsemblGenome(species, "41", prebuild_manager=shared_prebuild)

        # Prebuild callback: copy the first 100 KiB worth of characters of
        # the genome fasta into test.fasta. ``g`` is resolved when the
        # callback runs, not when it is defined, so after the rebinding
        # further down it refers to the second genome object.
        def shorten_genome_fasta(output_path):
            with open(g.find_file("genome.fasta")) as op:
                head = op.read(1024 * 100)
            (output_path / "test.fasta").write_text(head)

        test_fasta_job = g.prebuild_manager.prebuild(
            f"ensembl/{g.species}_{g.revision}/test_fasta",
            "1",
            [],
            ["test.fasta"],
            shorten_genome_fasta,
        )
        # The truncated fasta can only be produced once the genome is there.
        test_fasta_job.depends_on(g.download_genome())
        # Register the job so that find_file / find_prebuild can see it.
        g._prebuilds.append(test_fasta_job)

        # Two Subread versions whose indices are expected to differ.
        subread = Subread(version="1.6.3")
        index = g.build_index(subread, "test.fasta")
        subread_old = Subread(version="1.4.3-p1")
        index_old = g.build_index(subread_old, "test.fasta")

        new_pipegraph.run()
        # Note that these are not the checksums from the CHECKSUMS files (those
        # are for the gzipped variants; we keep the files un-gzipped and let the
        # filesystem handle compression, since we can't rely on downstream
        # tools reading gzip).
        assert (checksum_file(
            g.find_file("genome.fasta")) == "584a734589964a654c7c1dc23b0167ab")
        assert (checksum_file(
            g.find_file("cdna.fasta")) == "3fc1f19ab829573169cb2488abe39211")
        assert (checksum_file(
            g.find_file("genes.gtf")) == "8bdeec9b3db5278668dbff8b34e9d93b")
        # NOTE(review): the next assertion repeats the genes.gtf check above
        # verbatim - possibly meant for a different file; confirm.
        assert (checksum_file(
            g.find_file("genes.gtf")) == "8bdeec9b3db5278668dbff8b34e9d93b")
        assert (checksum_file(
            g.find_file("pep.fasta")) == "9580fd44832d419c38469d657f6e2484")
        # Unknown file names must raise instead of returning a path.
        with pytest.raises(OSError):
            g.find_file("no such file")
        # Both indices were built ...
        assert index.name_file("subread_index.reads").exists()
        assert index.name_file("subread_index.files").exists()
        assert index.name_file("subread_index.00.b.array").exists()
        assert index_old.name_file("subread_index.reads").exists()
        assert index_old.name_file("subread_index.files").exists()
        assert index_old.name_file("subread_index.00.b.array").exists()
        # ... and they live in different places.
        assert index.name_file("subread_index.reads") != index_old.name_file(
            "subread_index.reads")
        assert g.find_file("test.fasta.md5sum").exists()
        with pytest.raises(OSError):
            assert g.find_file("test.fasta.md5sum.nosuchfile").exists()
        assert g.find_prebuild("test.fasta") is test_fasta_job
        with pytest.raises(OSError):
            assert g.find_prebuild("test.fasta.md5sum.nosuchfile").exists()
        # .fai index files must exist next to the fasta files.
        assert g.find_file("genome.fasta.fai").exists()
        assert g.find_file("cdna.fasta.fai").exists()

        # Second round: new pipegraph, new PrebuildManager on the same path.
        new_pipegraph.new_pipegraph()
        pb = PrebuildManager(shared_prebuild.prebuilt_path)
        g = EnsemblGenome(species, "41", prebuild_manager=pb)
        test_fasta_job = g.prebuild_manager.prebuild(
            f"ensembl/{g.species}_{g.revision}/test_fasta",
            "1",
            [],
            ["test.fasta"],
            shorten_genome_fasta,
        )
        g._prebuilds.append(test_fasta_job)

        # Subread 1.5.0 must resolve to the same index file as 1.4.3-p1.
        subread_intermediate = Subread(version="1.5.0")
        index_intermediate = g.build_index(subread_intermediate, "test.fasta")
        assert index_intermediate.name_file(
            "subread_index.reads") == index_old.name_file(
                "subread_index.reads")
        # Without an explicit fasta name the index path contains "/genome/".
        index_genome = g.build_index(subread_intermediate)
        assert "/genome/" in str(index_genome.filenames[0])

        assert g.get_chromosome_lengths() == {
            "IV": 1_467_287,
            "MT": 23564,
            "V": 1_519_140,
            "III": 907_494,
            "II": 870_771,
            "VII": 1_800_949,
            "I": 693_414,
            "VI": 1_836_693,
        }

        # Sequence accessors for genome, cDNA, protein and CDS.
        assert g.get_genome_sequence("VI", 20, 30) == "ACCGCTGAGA"
        assert (g.get_cdna_sequence("EFAGOT00000000349") ==
                "GCTCGCGTGGCGTAATGGCAACGCGTCTGACTTCTAATCAGAAGATTGTGGGTTCGACCC"
                "CCACCGTGAGTG")
        assert (g.get_protein_sequence("AAS53315") ==
                "MFSTRICSLLARPFMVPIVPRFGSALLQKPLNGVVVPQFTRGFKVRTSVKKFCAHCYIVR"
                "RKGRVYVYCKSNNKHKQRQG")
        # Translating the CDS must reproduce the protein sequence.
        assert (g.genetic_code.translate_dna(g.get_cds_sequence("AAS53315")) ==
                "MFSTRICSLLARPFMVPIVPRFGSALLQKPLNGVVVPQFTRGFKVRTSVKKFCAHCYIVR"
                "RKGRVYVYCKSNNKHKQRQG")

        # Same again, this time passing the protein's table row explicitly.
        assert (g.genetic_code.translate_dna(
            g.get_cds_sequence("AAS53315", g.df_proteins.loc["AAS53315"])) ==
                "MFSTRICSLLARPFMVPIVPRFGSALLQKPLNGVVVPQFTRGFKVRTSVKKFCAHCYIVR"
                "RKGRVYVYCKSNNKHKQRQG")
        # A row belonging to a different protein must be rejected.
        with pytest.raises(ValueError):
            g.get_cds_sequence("AAS53315", g.df_proteins.loc["AAS53316"])

        assert (
            g.df_genes_meta.loc["AGOS_ADL186C"]["description"] ==
            "Restriction of telomere capping protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q75AV6]"
        )
 def test_species_formating(self, shared_prebuild):
     """The all-lowercase name "ashbya_gossypii" is rejected with ValueError.

     NOTE(review): presumably because the species name must start with a
     capital letter, as in the other tests - confirm against the class.
     """
     lowercase_species = "ashbya_gossypii"
     with pytest.raises(ValueError):
         EnsemblGenome(
             lowercase_species, "41", prebuild_manager=shared_prebuild)
 def test_unknown_species_raises(self, mock_download, shared_prebuild):
     """Downloading "Unknown_unknown" makes the pipegraph run fail.

     Creating the object and requesting the download job does not raise;
     the failure surfaces as ``ppg.RuntimeError`` when the graph is run.
     """
     genome = EnsemblGenome(
         "Unknown_unknown", "41", prebuild_manager=shared_prebuild)
     genome.download_genome()
     with pytest.raises(ppg.RuntimeError):
         ppg.run_pipegraph()
    def test_get_canonical_ids(self, new_pipegraph, mock_download,
                               shared_prebuild):
        """Gene names resolve to one canonical Ensembl id for Homo_sapiens 96.

        Two names resolve directly. Four others raise ValueError unless
        ``True`` is passed as the second argument, in which case a fixed id
        is returned.
        """
        genome = EnsemblGenome("Homo_sapiens", 96, shared_prebuild)
        # The needed jobs are invoked by hand through their callbacks
        # instead of running the pipegraph.
        genome._pb_find_server().callback()
        sql_tables = (
            "gene",
            "alt_allele",
            "seq_region",
            "seq_region_attrib",
            "attrib_type",
        )
        for table in sql_tables:
            genome._pb_download_sql_table(table).callback()
        genome._pb_download_gtf().callback()
        genome._pb_download_sql_table_definitions().callback()
        genome.job_genes().callback()
        genome.job_transcripts().callback()

        assert genome.name_to_canonical_id("DSEL") == "ENSG00000171451"
        assert genome.name_to_canonical_id("THEMIS") == "ENSG00000172673"

        needs_second_argument = [
            ("SOD2", "ENSG00000112096"),
            ("IGF2", "ENSG00000167244"),
            ("ABCF2", "ENSG00000033050"),
            ("TBCE", "ENSG00000284770"),
        ]
        # test the breakage
        for name, _ in needs_second_argument:
            with pytest.raises(ValueError):
                genome.name_to_canonical_id(name)

        for name, stable_id in needs_second_argument:
            assert genome.name_to_canonical_id(name, True) == stable_id
    def test_all_proteins(self, mock_download, shared_prebuild):
        """Every row of ``df_proteins`` has a strand of either 1 or -1."""
        genome = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
        ppg.run_pipegraph()

        strands = genome.df_proteins.strand
        assert strands.isin([1, -1]).all()
    def test_get_additional_gene_gtfs(self, mock_download, shared_prebuild):
        """The additional GTF depends on species and revision.

        Each case names the substring expected in the first additional
        GTF's file name, or None when no additional GTF is expected.
        """
        cases = [
            ("Ashbya_gossypii", 33, None),
            ("Homo_sapiens", 74, "ribosomal_genes_grch37"),
            ("Homo_sapiens", 75, "ribosomal_genes_grch38"),
            ("Mus_musculus", 68, "ribosomal_genes_mm10"),
            ("Mus_musculus", 67, None),
        ]
        for species, revision, expected_tag in cases:
            genome = EnsemblGenome(species, revision, shared_prebuild)
            if expected_tag is None:
                assert len(genome.get_additional_gene_gtfs()) == 0
            else:
                assert (expected_tag
                        in genome.get_additional_gene_gtfs()[0].name)
                assert genome.get_additional_gene_gtfs()[0].exists()
 def test_genes_iterator(self, mock_download, shared_prebuild):
     """``genome.genes`` and ``df_genes`` cover the same gene ids."""
     genome = EnsemblGenome("Ashbya_gossypii", 41, shared_prebuild)
     ppg.run_pipegraph()
     gene_objects = list(genome.genes.values())
     assert len(gene_objects) == len(genome.df_genes)
     seen_ids = {entry.gene_stable_id for entry in gene_objects}
     assert seen_ids == set(genome.df_genes.index)
    def test_get_true_chromosomes_genome_without_chromosomes(
            self, mock_download, shared_prebuild):
        """Every sequence counts as a true chromosome in Giardia_lamblia 43.

        The expected names are ctg02_1 through ctg02_306, and
        ``get_true_chromosomes`` must cover the same names as
        ``get_chromosome_lengths``. Both comparisons are on sets, so the
        order of the names is irrelevant.
        """
        # Unlike the Ustilago case, this one needs a genome where the
        # true-chromosome list and the full sequence list coincide.
        g = EnsemblGenome("Giardia_lamblia", 43, shared_prebuild)
        ppg.run_pipegraph()
        # Previously spelled out as 306 literals in arbitrary order; the
        # generated list contains exactly the same names.
        should = [f"ctg02_{number}" for number in range(1, 307)]

        assert set(g.get_true_chromosomes()) == set(should)
        assert set(g.get_true_chromosomes()) == set(g.get_chromosome_lengths())
 def test_download_jobs_called_init(
         self, new_pipegraph, mock_download, shared_prebuild):
     """``find_prebuild`` works on a new genome before any pipegraph run."""
     genome = EnsemblGenome(
         "Ashbya_gossypii", "41", prebuild_manager=shared_prebuild)
     # Not raising here is the entire test.
     genome.find_prebuild("genome.fasta")