def test_cdna_creation(self): g = FileBasedGenome( "Candidatus_carsonella", get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz" ), None, get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz" ), ) g.download_genome() g.job_transcripts() ppg.run_pipegraph() assert g.find_file("genome.fasta").exists() assert g.find_file("genome.fasta").with_suffix(".fasta.fai").exists() tf = g.df_transcripts assert "BAF35033" in tf.index assert tf.loc["BAF35033"].exons == ((1313, 2816), ) should = dict( iter_fasta( get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz" ))) should = {k[:k.find(b" ")]: v for (k, v) in should.items()} actual = dict(iter_fasta(g.find_file("cdna.fasta"))) if actual != should: assert not set(should.keys()).difference(set(actual.keys( ))) # they are all here, we just have more (tRNA...) for k in should: assert actual[k] == should[k]
def test_multiple_fasta_files(self, new_pipegraph): import tempfile tf = tempfile.NamedTemporaryFile(suffix=".fasta") tf.write(b">Extra\nAGTC") tf.flush() g = FileBasedGenome( "Candidatus_carsonella", [ get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz" ), tf.name, ], get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz" ), ) g.download_genome() ppg.run_pipegraph() assert g.get_genome_sequence("Extra", 0, 4) == "AGTC" assert g.get_chromosome_lengths() == {"Extra": 4, "Chromosome": 159662} # test that changing the fasta leads to an explosion new_pipegraph.new_pipegraph() tf.seek(0, 0) tf.write(b">Extra\nAGTCA") tf.flush() g = FileBasedGenome( "Candidatus_carsonella", [ get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz" ), tf.name, ], get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz" ), None, ) g.download_genome() with pytest.raises(UpstreamChangedError): ppg.run_pipegraph()
def test_job_creating_fasta(self, new_pipegraph): new_pipegraph.quiet = False def gen_fasta(): import shutil shutil.copy( get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz" ), "shu.fasta.gz", ) fasta_job = ppg.FileGeneratingJob("shu.fasta.gz", gen_fasta) g = FileBasedGenome( "Candidatus_carsonella", fasta_job, get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz" ), None, get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz" ), ) g.download_genome() ppg.run_pipegraph() assert ( g.get_cdna_sequence("BAF35032") == "ATGAATACTATATTTTCAAGAATAACACCATTAGGAAATGGTACGTTATGTGTTATAAGAAT" "TTCTGGAAAAAATGTAAAATTTTTAATACAAAAAATTGTAAAAAAAAATATAAAAGAAAAAATAG" "CTACTTTTTCTAAATTATTTTTAGATAAAGAATGTGTAGATTATGCAATGATTATTTTTTTTAAA" "AAACCAAATACGTTCACTGGAGAAGATATAATCGAATTTCATATTCACAATAATGAAACTATTGT" "AAAAAAAATAATTAATTATTTATTATTAAATAAAGCAAGATTTGCAAAAGCTGGCGAATTTTTAG" "AAAGACGATATTTAAATGGAAAAATTTCTTTAATAGAATGCGAATTAATAAATAATAAAATTTTA" "TATGATAATGAAAATATGTTTCAATTAACAAAAAATTCTGAAAAAAAAATATTTTTATGTATAAT" "TAAAAATTTAAAATTTAAAATAAATTCTTTAATAATTTGTATTGAAATCGCAAATTTTAATTTTA" "GTTTTTTTTTTTTTAATGATTTTTTATTTATAAAATATACATTTAAAAAACTATTAAAACTTTTA" "AAAATATTAATTGATAAAATAACTGTTATAAATTATTTAAAAAAGAATTTCACAATAATGATATT" "AGGTAGAAGAAATGTAGGAAAGTCTACTTTATTTAATAAAATATGTGCACAATATGACTCGATTG" "TAACTAATATTCCTGGTACTACAAAAAATATTATATCAAAAAAAATAAAAATTTTATCTAAAAAA" "ATAAAAATGATGGATACAGCAGGATTAAAAATTAGAACTAAAAATTTAATTGAAAAAATTGGAAT" "TATTAAAAATATAAATAAAATTTATCAAGGAAATTTAATTTTGTATATGATTGATAAATTTAATA" "TTAAAAATATATTTTTTAACATTCCAATAGATTTTATTGATAAAATTAAATTAAATGAATTAATA" "ATTTTAGTTAACAAATCAGATATTTTAGGAAAAGAAGAAGGAGTTTTTAAAATAAAAAATATATT" "AATAATTTTAATTTCTTCTAAAAATGGAACTTTTATAAAAAATTTAAAATGTTTTATTAATAAAA" "TCGTTGATAATAAAGATTTTTCTAAAAATAATTATTCTGATGTTAAAATTCTATTTAATAAATTT" "TCTTTTTTTTATAAAGAATTTTCATGTAACTATGATTTAGTGTTATCAAAATTAATTGATTTTCA" "AAAAAATATATTTAAATTAACAGGAAATTTTACTAATAAAAAAATAATAAATTCTTGTTTTAGAA" "ATTTTTGTATTGGTAAATGA")
def test_transcript_wrong_order(self): g = FileBasedGenome( "Candidatus_carsonella", get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.transcript_wrong_order.gtf.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz" ), ) job = g.job_transcripts() with pytest.raises(ppg.RuntimeError): ppg.run_pipegraph() assert "start > stop" in str(job.exception)
def test_transcript_transcript_outside_gene(self): g = FileBasedGenome( "Candidatus_carsonella", get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.transcript_outside_gene.gtf.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz" ), ) job = g.job_transcripts() with pytest.raises(ppg.RuntimeError): ppg.run_pipegraph() assert "Transcript outside of gene" in str(job.exception) assert isinstance(job.exception, ValueError)
def test_transcripts_unique_check(self): g = FileBasedGenome( "Candidatus_carsonella", get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz" ), ) g.get_additional_gene_gtfs = lambda: [ get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.more_transcripts.gtf.gz" ) ] g.download_genome() job = g.job_transcripts() with pytest.raises(ppg.RuntimeError): ppg.run_pipegraph() assert "transcript_stable_ids were not unique" in str(job.exception)
def test_get_gtf_using_additional_gtf(self): g = FileBasedGenome( "Candidatus_carsonella", get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz" ), ) g.get_additional_gene_gtfs = lambda: [ get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.additional.gtf.gz" ) ] g.download_genome() j = g.job_genes() for x in j.prerequisites: if hasattr(x, "filenames"): print(x, x.filenames) if (get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.additional.gtf.gz" )) in x.filenames: break else: assert False # wrong preqs ppg.run_pipegraph() assert "TEST1_001" in g.df_genes.index
def test_protein_creation(self): g = FileBasedGenome( "Candidatus_carsonella", get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz" ), None, ProkaryoticCode(), ) g.download_genome() g.job_transcripts() ppg.run_pipegraph() should = dict( iter_fasta( get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz" ))) should = {k[:k.find(b" ")]: v for (k, v) in should.items()} actual = dict(iter_fasta(g.find_file("pep.fasta"))) if actual != should: assert not set(should.keys()).difference(set(actual.keys( ))) # they are all here, we just have more (tRNA...) for k in should: if actual[k] != should[k]: print(k) print(len(actual[k])) print(len(should[k])) print(actual[k]) print(should[k]) # print(g.get_cds_sequence(k.decode('utf-8'))) # else: # print('ok', k) # assert actual[k] == should[k] assert False
def get_Candidatus_carsonella_ruddii_pv(name=None, **kwargs): """A FilebasedGenome used by other libraries for their tests""" from mbf_genomes import FileBasedGenome if name is None: # pragma: no cover name = "Candidatus_carsonella" return FileBasedGenome( name, get_sample_path( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz" ), get_sample_path( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz" ), get_sample_path( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz" ), get_sample_path( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz" ), **kwargs, )
def test_indexing(self): g = FileBasedGenome( "Candidatus_carsonella", get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.42.gtf.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz" ), get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.pep.all.fa.gz" ), ) g.download_genome() g.job_transcripts() g.job_genes() with pytest.raises(ValueError): g.df_transcripts with pytest.raises(ValueError): g.df_genes ppg.run_pipegraph() assert g.find_file("genome.fasta").exists() assert g.find_prebuild("genome.fasta") == g.genome_fasta_dependencies assert g.find_file("genome.fasta").with_suffix(".fasta.fai").exists() for should_file, actual_file in [ ( get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz" ), g.find_file("genome.fasta"), ), ( get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.cdna.all.fa.gz" ), g.find_file("cdna.fasta"), ), ]: should = dict(iter_fasta(should_file)) should = {k[:k.find(b" ")]: v for (k, v) in should.items()} actual = dict(iter_fasta(actual_file)) if should != actual: assert len(should) == len(actual) assert set(should.keys()) == set(actual.keys()) assert False == should_file # noqa:E712 tf = g.df_transcripts assert "BAF35033" in tf.index assert not hasattr(g, "_transcripts") assert tf.loc["BAF35033"].exons == ((1313, 2816), ) gf = g.df_genes assert len(gf) == 246 # transcript_stable_ids is tuples, this genome has only one transcript # per gene assert set([len(x) for x in gf.transcript_stable_ids]) == set([1]) assert g.find_file("pep.fasta").exists() assert g.find_prebuild("pep.fasta") == g.protein_fasta_dependencies assert g.find_file("pep.fasta").with_suffix(".fasta.fai").exists() assert ( g.get_protein_sequence("BAF35037") == "MFKFINRFLNLKKRYFYIFLINFFYFFNKCNFIKKKKIYKKIITKKFENYLLKLIIQKYAK")
def test_empty_gtf_and_cdna_and_protein(self): g = FileBasedGenome( "Candidatus_carsonella", get_sample_data( "mbf_genomes/Candidatus_carsonella_ruddii_pv.ASM1036v1.dna.toplevel.fa.gz" ), None, None, ) g.download_genome() assert g.gtf_filename is None assert g.cdna_fasta_filename is None g.job_transcripts() g.job_genes() g.job_proteins() ppg.run_pipegraph() assert len(g.df_transcripts) == 0 assert len(g.get_gtf()) == 0 assert len(g.df_genes) == 0 assert len(g.df_proteins) == 0