def test_cdna_creation(self):
        """Build a genome without a cDNA fasta and verify the resulting cdna.fasta.

        The fourth positional argument of ``FileBasedGenome`` is ``None`` here.
        After the pipegraph has run, every record of the reference
        ``cdna.all.fa.gz`` sample must be present in ``cdna.fasta`` with an
        identical sequence; additional records in ``cdna.fasta`` are tolerated.
        """
        genome = FileBasedGenome(
            "Candidatus_carsonella",
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
            ),
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
            ),
            None,
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
            ),
        )
        genome.download_genome()
        genome.job_transcripts()
        ppg.run_pipegraph()

        genome_fasta = genome.find_file("genome.fasta")
        assert genome_fasta.exists()
        assert genome_fasta.with_suffix(".fasta.fai").exists()

        transcripts = genome.df_transcripts
        assert "BAF35033" in transcripts.index
        assert transcripts.loc["BAF35033"].exons == ((1313, 2816), )

        reference_records = iter_fasta(
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
            ))
        # Keep only the identifier, i.e. the text before the first space.
        # partition() leaves a header without any space intact, whereas
        # slicing with find() == -1 would silently drop its last byte.
        should = {
            header.partition(b" ")[0]: sequence
            for (header, sequence) in reference_records
        }
        actual = dict(iter_fasta(genome.find_file("cdna.fasta")))
        if actual != should:
            # Every reference record has to be there; we just have more (tRNA...)
            assert not set(should).difference(actual)
            for name, sequence in should.items():
                assert actual[name] == sequence
    def test_multiple_fasta_files(self, new_pipegraph):
        """Combine two genome fasta files, then check that editing one is detected.

        First run: the sample genome plus a temporary one-record fasta
        (``Extra``) are passed as a list; both sequences must be retrievable.
        Second run (fresh pipegraph): the temporary fasta is rewritten with a
        longer sequence, and running the pipegraph must raise
        ``UpstreamChangedError``.
        """
        import tempfile

        # The context manager guarantees the temporary file is closed and
        # removed even when an assertion below fails.
        with tempfile.NamedTemporaryFile(suffix=".fasta") as tf:
            tf.write(b">Extra\nAGTC")
            tf.flush()
            g = FileBasedGenome(
                "Candidatus_carsonella",
                [
                    get_sample_data(
                        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
                    ),
                    tf.name,
                ],
                get_sample_data(
                    "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
                ),
                get_sample_data(
                    "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
                ),
                get_sample_data(
                    "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
                ),
            )
            g.download_genome()
            ppg.run_pipegraph()
            assert g.get_genome_sequence("Extra", 0, 4) == "AGTC"
            assert g.get_chromosome_lengths() == {
                "Extra": 4,
                "Chromosome": 159662,
            }

            # test that changing the fasta leads to an explosion
            new_pipegraph.new_pipegraph()
            tf.seek(0, 0)
            tf.write(b">Extra\nAGTCA")
            tf.flush()
            g = FileBasedGenome(
                "Candidatus_carsonella",
                [
                    get_sample_data(
                        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
                    ),
                    tf.name,
                ],
                get_sample_data(
                    "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
                ),
                None,
            )
            g.download_genome()
            with pytest.raises(UpstreamChangedError):
                ppg.run_pipegraph()
    def test_job_creating_fasta(self, new_pipegraph):
        """Feed the genome fasta through a FileGeneratingJob and read a cDNA back.

        The job copies the sample genome fasta to ``shu.fasta.gz``; the job
        object itself is handed to ``FileBasedGenome`` in place of a path.
        The cDNA slot is ``None``, and the cDNA sequence reported for
        ``BAF35032`` must match the expected literal below.
        """
        new_pipegraph.quiet = False

        def copy_sample_fasta():
            import shutil

            source = get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
            )
            shutil.copy(source, "shu.fasta.gz")

        genome = FileBasedGenome(
            "Candidatus_carsonella",
            ppg.FileGeneratingJob("shu.fasta.gz", copy_sample_fasta),
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
            ),
            None,
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
            ),
        )
        genome.download_genome()
        ppg.run_pipegraph()

        expected_cdna = (
            "ATGAATACTATATTTTCAAGAATAACACCATTAGGAAATGGTACGTTATGTGTTATAAGAAT"
            "TTCTGGAAAAAATGTAAAATTTTTAATACAAAAAATTGTAAAAAAAAATATAAAAGAAAAAATAG"
            "CTACTTTTTCTAAATTATTTTTAGATAAAGAATGTGTAGATTATGCAATGATTATTTTTTTTAAA"
            "AAACCAAATACGTTCACTGGAGAAGATATAATCGAATTTCATATTCACAATAATGAAACTATTGT"
            "AAAAAAAATAATTAATTATTTATTATTAAATAAAGCAAGATTTGCAAAAGCTGGCGAATTTTTAG"
            "AAAGACGATATTTAAATGGAAAAATTTCTTTAATAGAATGCGAATTAATAAATAATAAAATTTTA"
            "TATGATAATGAAAATATGTTTCAATTAACAAAAAATTCTGAAAAAAAAATATTTTTATGTATAAT"
            "TAAAAATTTAAAATTTAAAATAAATTCTTTAATAATTTGTATTGAAATCGCAAATTTTAATTTTA"
            "GTTTTTTTTTTTTTAATGATTTTTTATTTATAAAATATACATTTAAAAAACTATTAAAACTTTTA"
            "AAAATATTAATTGATAAAATAACTGTTATAAATTATTTAAAAAAGAATTTCACAATAATGATATT"
            "AGGTAGAAGAAATGTAGGAAAGTCTACTTTATTTAATAAAATATGTGCACAATATGACTCGATTG"
            "TAACTAATATTCCTGGTACTACAAAAAATATTATATCAAAAAAAATAAAAATTTTATCTAAAAAA"
            "ATAAAAATGATGGATACAGCAGGATTAAAAATTAGAACTAAAAATTTAATTGAAAAAATTGGAAT"
            "TATTAAAAATATAAATAAAATTTATCAAGGAAATTTAATTTTGTATATGATTGATAAATTTAATA"
            "TTAAAAATATATTTTTTAACATTCCAATAGATTTTATTGATAAAATTAAATTAAATGAATTAATA"
            "ATTTTAGTTAACAAATCAGATATTTTAGGAAAAGAAGAAGGAGTTTTTAAAATAAAAAATATATT"
            "AATAATTTTAATTTCTTCTAAAAATGGAACTTTTATAAAAAATTTAAAATGTTTTATTAATAAAA"
            "TCGTTGATAATAAAGATTTTTCTAAAAATAATTATTCTGATGTTAAAATTCTATTTAATAAATTT"
            "TCTTTTTTTTATAAAGAATTTTCATGTAACTATGATTTAGTGTTATCAAAATTAATTGATTTTCA"
            "AAAAAATATATTTAAATTAACAGGAAATTTTACTAATAAAAAAATAATAAATTCTTGTTTTAGAA"
            "ATTTTTGTATTGGTAAATGA"
        )
        assert genome.get_cdna_sequence("BAF35032") == expected_cdna
 def test_transcript_wrong_order(self):
     """A GTF from the ``transcript_wrong_order`` sample must fail the transcripts job.

     The pipegraph run is expected to raise ``ppg.RuntimeError`` and the
     exception stored on the transcripts job must mention ``start > stop``.
     """
     fasta_path = get_sample_data(
         "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
     )
     broken_gtf_path = get_sample_data(
         "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.transcript_wrong_order.gtf.gz"
     )
     cdna_path = get_sample_data(
         "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
     )
     protein_path = get_sample_data(
         "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
     )
     genome = FileBasedGenome(
         "Candidatus_carsonella",
         fasta_path,
         broken_gtf_path,
         cdna_path,
         protein_path,
     )
     transcripts_job = genome.job_transcripts()
     with pytest.raises(ppg.RuntimeError):
         ppg.run_pipegraph()
     failure_text = str(transcripts_job.exception)
     assert "start > stop" in failure_text
 def test_transcript_transcript_outside_gene(self):
     """A GTF from the ``transcript_outside_gene`` sample must fail the transcripts job.

     The pipegraph run is expected to raise ``ppg.RuntimeError``; the
     exception stored on the transcripts job must be a ``ValueError`` whose
     text contains ``Transcript outside of gene``.
     """
     fasta_path = get_sample_data(
         "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
     )
     broken_gtf_path = get_sample_data(
         "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.transcript_outside_gene.gtf.gz"
     )
     cdna_path = get_sample_data(
         "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
     )
     protein_path = get_sample_data(
         "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
     )
     genome = FileBasedGenome(
         "Candidatus_carsonella",
         fasta_path,
         broken_gtf_path,
         cdna_path,
         protein_path,
     )
     transcripts_job = genome.job_transcripts()
     with pytest.raises(ppg.RuntimeError):
         ppg.run_pipegraph()
     # Message first, type second - same order of checks as before.
     assert "Transcript outside of gene" in str(transcripts_job.exception)
     assert isinstance(transcripts_job.exception, ValueError)
 def test_transcripts_unique_check(self):
     """Adding the ``more_transcripts`` GTF must trip the transcript uniqueness check.

     ``get_additional_gene_gtfs`` is overridden on the instance to return the
     ``more_transcripts`` sample GTF. The pipegraph run must raise
     ``ppg.RuntimeError`` and the transcripts job's exception must mention
     that the transcript_stable_ids were not unique.
     """
     genome = FileBasedGenome(
         "Candidatus_carsonella",
         get_sample_data(
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
         ),
         get_sample_data(
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
         ),
         get_sample_data(
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
         ),
         get_sample_data(
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
         ),
     )

     def additional_gtfs():
         # Evaluated lazily, each time the genome asks for its extra GTFs.
         return [
             get_sample_data(
                 "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.more_transcripts.gtf.gz"
             )
         ]

     genome.get_additional_gene_gtfs = additional_gtfs
     genome.download_genome()
     transcripts_job = genome.job_transcripts()
     with pytest.raises(ppg.RuntimeError):
         ppg.run_pipegraph()
     failure_text = str(transcripts_job.exception)
     assert "transcript_stable_ids were not unique" in failure_text
 def test_get_gtf_using_additional_gtf(self):
     """Genes from an additional GTF must end up in ``df_genes``.

     ``get_additional_gene_gtfs`` is overridden on the instance to return the
     ``additional`` sample GTF. The test first checks that one of the
     prerequisites of ``job_genes()`` lists that file in its ``filenames``,
     then runs the pipegraph and expects gene ``TEST1_001`` in the genes
     table.
     """
     # Resolved once: the same path is used for the override and for the
     # prerequisite check below.
     additional_gtf = get_sample_data(
         "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.additional.gtf.gz"
     )
     g = FileBasedGenome(
         "Candidatus_carsonella",
         get_sample_data(
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
         ),
         get_sample_data(
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
         ),
         get_sample_data(
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
         ),
         get_sample_data(
             "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
         ),
     )
     g.get_additional_gene_gtfs = lambda: [additional_gtf]
     g.download_genome()
     genes_job = g.job_genes()
     # Only prerequisites exposing ``filenames`` can reference the GTF.
     depends_on_additional_gtf = any(
         additional_gtf in prerequisite.filenames
         for prerequisite in genes_job.prerequisites
         if hasattr(prerequisite, "filenames")
     )
     assert depends_on_additional_gtf, (
         "job_genes() has no prerequisite covering the additional GTF"
     )
     ppg.run_pipegraph()
     assert "TEST1_001" in g.df_genes.index
    def test_protein_creation(self):
        """Build a genome without a protein fasta and verify the resulting pep.fasta.

        The fifth positional argument of ``FileBasedGenome`` is ``None`` and a
        ``ProkaryoticCode()`` instance is passed as the sixth. After the
        pipegraph has run, ``pep.fasta`` must equal the reference
        ``pep.all.fa.gz`` sample exactly; on any difference the mismatching
        records are printed and the test fails.
        """
        g = FileBasedGenome(
            "Candidatus_carsonella",
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
            ),
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
            ),
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
            ),
            None,
            ProkaryoticCode(),
        )
        g.download_genome()
        g.job_transcripts()
        ppg.run_pipegraph()

        reference_records = iter_fasta(
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
            ))
        # Keep only the identifier, i.e. the text before the first space.
        # partition() leaves a header without any space intact, whereas
        # slicing with find() == -1 would silently drop its last byte.
        should = {
            header.partition(b" ")[0]: sequence
            for (header, sequence) in reference_records
        }
        actual = dict(iter_fasta(g.find_file("pep.fasta")))
        if actual != should:
            # A missing reference record is reported before sequence diffs.
            assert not set(should).difference(actual)
            for name, expected_sequence in should.items():
                if actual[name] != expected_sequence:
                    # Diagnostics for the failure raised below.
                    print(name)
                    print(len(actual[name]))
                    print(len(expected_sequence))

                    print(actual[name])
                    print(expected_sequence)
            raise AssertionError(
                "generated pep.fasta differs from the reference protein fasta"
            )
Exemple #9
0
def get_Candidatus_carsonella_ruddii_pv(name=None, **kwargs):
    """Return a FileBasedGenome over the bundled Candidatus carsonella samples.

    Used by other libraries for their tests. ``name`` falls back to
    ``"Candidatus_carsonella"`` when it is ``None``; all further keyword
    arguments are forwarded unchanged to ``FileBasedGenome``.
    """
    from mbf_genomes import FileBasedGenome

    if name is None:  # pragma: no cover
        name = "Candidatus_carsonella"

    genome_fasta = get_sample_path(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
    )
    gtf = get_sample_path(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
    )
    cdna_fasta = get_sample_path(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
    )
    protein_fasta = get_sample_path(
        "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
    )
    return FileBasedGenome(
        name, genome_fasta, gtf, cdna_fasta, protein_fasta, **kwargs
    )
    def test_indexing(self):
        """End-to-end check of a fully specified FileBasedGenome.

        Before the pipegraph has run, ``df_transcripts`` and ``df_genes`` must
        raise ``ValueError``. Afterwards the test verifies that the genome,
        cDNA and protein fasta files and their ``.fai`` indices exist, that
        the genome and cDNA fasta contents match the sample inputs, and it
        spot-checks the transcript table, the gene table and one protein
        sequence.
        """
        g = FileBasedGenome(
            "Candidatus_carsonella",
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
            ),
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz"
            ),
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
            ),
            get_sample_data(
                "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz"
            ),
        )
        g.download_genome()
        g.job_transcripts()
        g.job_genes()
        # The tables are not available until the pipegraph has run.
        with pytest.raises(ValueError):
            g.df_transcripts
        with pytest.raises(ValueError):
            g.df_genes
        ppg.run_pipegraph()
        assert g.find_file("genome.fasta").exists()
        assert g.find_prebuild("genome.fasta") == g.genome_fasta_dependencies
        assert g.find_file("genome.fasta").with_suffix(".fasta.fai").exists()
        for should_file, actual_file in [
            (
                get_sample_data(
                    "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
                ),
                g.find_file("genome.fasta"),
            ),
            (
                get_sample_data(
                    "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz"
                ),
                g.find_file("cdna.fasta"),
            ),
        ]:
            # Keep only the identifier, i.e. the text before the first space.
            # partition() leaves a header without any space intact, whereas
            # slicing with find() == -1 would silently drop its last byte.
            should = {
                header.partition(b" ")[0]: sequence
                for (header, sequence) in iter_fasta(should_file)
            }
            actual = dict(iter_fasta(actual_file))
            if should != actual:
                # Narrow down the kind of mismatch before failing outright.
                assert len(should) == len(actual)
                assert set(should) == set(actual)
                raise AssertionError(
                    "sequences differ from sample file %s" % (should_file,)
                )
        tf = g.df_transcripts
        assert "BAF35033" in tf.index
        assert not hasattr(g, "_transcripts")
        assert tf.loc["BAF35033"].exons == ((1313, 2816), )

        gf = g.df_genes
        assert len(gf) == 246
        # transcript_stable_ids is tuples, this genome has only one transcript
        # per gene
        assert {len(x) for x in gf.transcript_stable_ids} == {1}

        assert g.find_file("pep.fasta").exists()
        assert g.find_prebuild("pep.fasta") == g.protein_fasta_dependencies
        assert g.find_file("pep.fasta").with_suffix(".fasta.fai").exists()
        assert (
            g.get_protein_sequence("BAF35037") ==
            "MFKFINRFLNLKKRYFYIFLINFFYFFNKCNFIKKKKIYKKIITKKFENYLLKLIIQKYAK")
 def test_empty_gtf_and_cdna_and_protein(self):
     """A genome given only a fasta must yield empty annotation tables.

     Both the GTF and cDNA arguments are ``None`` and no protein fasta is
     passed. The transcript, gene and protein jobs must still be creatable,
     and after the pipegraph run the GTF and all three tables are empty.
     """
     fasta_path = get_sample_data(
         "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz"
     )
     genome = FileBasedGenome("Candidatus_carsonella", fasta_path, None, None)
     genome.download_genome()
     assert genome.gtf_filename is None
     assert genome.cdna_fasta_filename is None

     genome.job_transcripts()
     genome.job_genes()
     genome.job_proteins()
     ppg.run_pipegraph()

     # Explicit length checks, same as the original assertions.
     transcript_count = len(genome.df_transcripts)
     assert transcript_count == 0
     gtf_row_count = len(genome.get_gtf())
     assert gtf_row_count == 0
     gene_count = len(genome.df_genes)
     assert gene_count == 0
     protein_count = len(genome.df_proteins)
     assert protein_count == 0