コード例 #1
0
 def test_get_true_chromosomes(self, mock_download, shared_prebuild):
     """Check that true chromosomes are told apart from other contigs.

     Uses Ustilago maydis (release 33), a genome that has both contigs
     and chromosomes. ``get_true_chromosomes()`` must name only the
     chromosomes, while ``get_chromosome_lengths()`` must name the
     chromosomes and the remaining contigs.
     """
     g = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
     ppg.run_pipegraph()

     # Numbered chromosomes "1" .. "23".
     chromosomes = {str(number) for number in range(1, 24)}
     # A sequence is a true chromosome if there is "chromosome:" in its
     # fasta description, whatever its name looks like.
     chromosomes.add("U37796.1")

     other_contigs = {
         "um_scaf_contig_1.256",
         "um_scaf_contig_1.265",
         "um_scaf_contig_1.271",
         "um_scaf_contig_1.264",
     }

     assert set(g.get_true_chromosomes()) == chromosomes
     assert set(g.get_chromosome_lengths()) == chromosomes | other_contigs
コード例 #2
0
    def test_get_true_chromosomes_genome_without_chromosomes(
            self, mock_download, shared_prebuild):
        """Check ``get_true_chromosomes()`` on a genome without chromosomes.

        Giardia lamblia (release 43) is expected to report the 306
        sequences ``ctg02_1`` .. ``ctg02_306`` as its true chromosomes,
        and that must be exactly the set of names known to
        ``get_chromosome_lengths()``.
        """
        g = EnsemblGenome("Giardia_lamblia", 43, shared_prebuild)
        ppg.run_pipegraph()

        # Only set membership is compared, so the names are generated
        # instead of being spelled out one by one.
        expected_names = {f"ctg02_{number}" for number in range(1, 307)}

        reported_names = set(g.get_true_chromosomes())
        assert reported_names == expected_names

        all_names = set(g.get_chromosome_lengths())
        assert set(g.get_true_chromosomes()) == all_names
コード例 #3
0
    def test_download(self, new_pipegraph, mock_download, shared_prebuild):
        """Download a small genome, index it and query the results.

        Phase 1 registers a truncated ``test.fasta`` prebuild, builds
        Subread indices for two Subread versions, runs the pipegraph and
        checks file checksums, index files and lookup failures.

        Phase 2 starts a fresh pipegraph with a new ``PrebuildManager``
        on the same prebuilt path and checks index naming for a third
        Subread version, chromosome lengths, sequence accessors and
        gene metadata.
        """
        # The smallest eukaryotic species at the time of writing this,
        # at 2.8 mb.
        species = "Ashbya_gossypii"
        g = EnsemblGenome(species, "41", prebuild_manager=shared_prebuild)

        def shorten_genome_fasta(output_path):
            # Writes the first 1024 * 100 characters of genome.fasta to
            # test.fasta. ``g`` is looked up when this runs, so it refers
            # to whichever genome the name ``g`` is bound to at that time.
            with open(g.find_file("genome.fasta")) as op:
                head = op.read(1024 * 100)
            (output_path / "test.fasta").write_text(head)

        test_fasta_job = g.prebuild_manager.prebuild(
            f"ensembl/{g.species}_{g.revision}/test_fasta",
            "1",
            [],
            ["test.fasta"],
            shorten_genome_fasta,
        )
        test_fasta_job.depends_on(g.download_genome())
        g._prebuilds.append(test_fasta_job)

        subread = Subread(version="1.6.3")
        index = g.build_index(subread, "test.fasta")
        subread_old = Subread(version="1.4.3-p1")
        index_old = g.build_index(subread_old, "test.fasta")

        new_pipegraph.run()

        # Note that these are not the checksums from the CHECKSUMS files
        # (those are for the gzipped variants). We keep the files
        # ungzipped and let the filesystem handle the compression, since
        # we can't rely on the downstream reading gzip.
        expected_checksums = {
            "genome.fasta": "584a734589964a654c7c1dc23b0167ab",
            "cdna.fasta": "3fc1f19ab829573169cb2488abe39211",
            "genes.gtf": "8bdeec9b3db5278668dbff8b34e9d93b",
            "pep.fasta": "9580fd44832d419c38469d657f6e2484",
        }
        for filename, expected_md5 in expected_checksums.items():
            assert checksum_file(g.find_file(filename)) == expected_md5, filename

        with pytest.raises(OSError):
            g.find_file("no such file")

        # Both Subread versions must have produced their index files.
        index_files = (
            "subread_index.reads",
            "subread_index.files",
            "subread_index.00.b.array",
        )
        for built_index in (index, index_old):
            for index_file in index_files:
                assert built_index.name_file(index_file).exists()
        # ... and the two versions must not share the same index path.
        assert index.name_file("subread_index.reads") != index_old.name_file(
            "subread_index.reads")

        assert g.find_file("test.fasta.md5sum").exists()
        # The lookups themselves are expected to raise, so nothing is
        # done with their results.
        with pytest.raises(OSError):
            g.find_file("test.fasta.md5sum.nosuchfile")
        assert g.find_prebuild("test.fasta") is test_fasta_job
        with pytest.raises(OSError):
            g.find_prebuild("test.fasta.md5sum.nosuchfile")
        assert g.find_file("genome.fasta.fai").exists()
        assert g.find_file("cdna.fasta.fai").exists()

        # Phase 2: fresh pipegraph and a new prebuild manager on the same
        # prebuilt path. ``g`` is rebound on purpose (see the closure above).
        new_pipegraph.new_pipegraph()
        pb = PrebuildManager(shared_prebuild.prebuilt_path)
        g = EnsemblGenome(species, "41", prebuild_manager=pb)
        test_fasta_job = g.prebuild_manager.prebuild(
            f"ensembl/{g.species}_{g.revision}/test_fasta",
            "1",
            [],
            ["test.fasta"],
            shorten_genome_fasta,
        )
        g._prebuilds.append(test_fasta_job)

        # Subread 1.5.0 must resolve to the same index path as 1.4.3-p1.
        subread_intermediate = Subread(version="1.5.0")
        index_intermediate = g.build_index(subread_intermediate, "test.fasta")
        assert index_intermediate.name_file(
            "subread_index.reads") == index_old.name_file(
                "subread_index.reads")
        # Without an explicit fasta name the index is built for the genome.
        index_genome = g.build_index(subread_intermediate)
        assert "/genome/" in str(index_genome.filenames[0])

        assert g.get_chromosome_lengths() == {
            "IV": 1_467_287,
            "MT": 23564,
            "V": 1_519_140,
            "III": 907_494,
            "II": 870_771,
            "VII": 1_800_949,
            "I": 693_414,
            "VI": 1_836_693,
        }

        assert g.get_genome_sequence("VI", 20, 30) == "ACCGCTGAGA"
        assert (g.get_cdna_sequence("EFAGOT00000000349") ==
                "GCTCGCGTGGCGTAATGGCAACGCGTCTGACTTCTAATCAGAAGATTGTGGGTTCGACCC"
                "CCACCGTGAGTG")

        # The protein sequence must be the same whether it is read
        # directly or translated from the CDS.
        expected_protein = (
            "MFSTRICSLLARPFMVPIVPRFGSALLQKPLNGVVVPQFTRGFKVRTSVKKFCAHCYIVR"
            "RKGRVYVYCKSNNKHKQRQG")
        assert g.get_protein_sequence("AAS53315") == expected_protein
        assert (g.genetic_code.translate_dna(g.get_cds_sequence("AAS53315")) ==
                expected_protein)
        assert (g.genetic_code.translate_dna(
            g.get_cds_sequence("AAS53315", g.df_proteins.loc["AAS53315"])) ==
                expected_protein)
        # Passing the protein row of a different protein must be rejected.
        with pytest.raises(ValueError):
            g.get_cds_sequence("AAS53315", g.df_proteins.loc["AAS53316"])

        assert (
            g.df_genes_meta.loc["AGOS_ADL186C"]["description"] ==
            "Restriction of telomere capping protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q75AV6]"
        )