def test_sequences(test_data_dir, tmp_path, gbk, flavor):
    """Round-trip a GenBank file through the exporter and compare the models.

    The input file is parsed into annotation collections, written back out
    with ``collection_to_genbank`` using ``flavor``, and parsed a second time.
    Only the first collection of each parse is compared.
    """

    def _load(path):
        # Parse a GenBank file on disk into a list of annotation collections.
        with open(path, "r") as handle:
            records = parse_genbank(handle)
            return list(ParsedAnnotationRecord.parsed_annotation_records_to_model(records))

    original = _load(test_data_dir / gbk)

    round_trip_path = tmp_path / "tmp.gbk"
    with open(round_trip_path, "w") as handle:
        collection_to_genbank(original, handle, flavor)

    reparsed = _load(round_trip_path)

    before = original[0]
    after = reparsed[0]
    assert len(before.genes) == len(after.genes)

    for old_gene, new_gene in zip(before, after):
        assert old_gene._location == new_gene._location

        # Only the first transcript of each gene is inspected.
        old_tx = old_gene.transcripts[0]
        new_tx = new_gene.transcripts[0]
        assert old_tx._location == new_tx._location

        if old_tx.is_coding:
            assert old_tx.cds._location == new_tx.cds._location
            assert old_tx.get_protein_sequence() == new_tx.get_protein_sequence()

        assert old_tx.get_transcript_sequence() == new_tx.get_transcript_sequence()
def test_missing_translation(test_data_dir, tmp_path):
    """Check how ``update_translations`` affects exported translation qualifiers.

    The fixture is exported twice from the same parsed model: first with
    ``update_translations=False``, then with ``update_translations=True``.
    After each export the file is re-parsed and the ``translation`` qualifier
    of the first transcript of the first two genes is inspected.
    """

    def _parse_sorted(path):
        # Parse a GenBank file with the SORTED parser into a list of collections.
        with open(path, "r") as handle:
            records = parse_genbank(handle, gbk_type=GenBankParserType.SORTED)
            return list(ParsedAnnotationRecord.parsed_annotation_records_to_model(records))

    def _export_and_reload(collections, update_translations):
        # Write the collections to the scratch file, then read back the first one.
        with open(scratch_gbk, "w") as handle:
            collection_to_genbank(collections, handle, update_translations=update_translations)
        return _parse_sorted(scratch_gbk)[0]

    scratch_gbk = tmp_path / "tmp.gbk"
    source_collections = _parse_sorted(test_data_dir / "INSC1003_wrong_missing_translation.gbk")

    # First pass: translations are exported as they were parsed.
    untouched_genes = _export_and_reload(source_collections, False).genes
    first_qualifiers = untouched_genes[0].transcripts[0].qualifiers
    second_qualifiers = untouched_genes[1].transcripts[0].qualifiers

    assert "translation" not in first_qualifiers
    assert "translation" in second_qualifiers
    assert second_qualifiers["translation"] == {
        "MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGRFADKLPSEPRENIVYQCWERFCQELGK"
        "QIPVAMTLEKNMPIGSGLGSSACSVVAALMAMNEHCGKPLNDTRLLALMGELEGRISGSIHYDNVAPCFLGGMQLMIEE"
        "NDIPELAAKLMKDVIAEPYRERLLPGFRQARQAVAEIGAVASGISGSGPTLFALCDKPDTAQRVADWLGKNYLQNQEGF"
        "VHICRLDTAGARVLEN"
    }

    # Second pass: export again and force the translations to be recalculated.
    recalculated_genes = _export_and_reload(source_collections, True).genes

    assert recalculated_genes[0].transcripts[0].qualifiers["translation"] == {
        "MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDALPNISDAERIFAELLTGLAAA"
        "QPGFPLAQLKTFVDQEFAQIKHVLHGISLLGQCPDSINAALICRGEKMSIAIMAGVLEARGHNVTVIDPVEKLLAVGHYLE"
        "STVDIAESTRRIAASRIPADHMVLMAGFTAGNEKGELVVLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQVPDAR"
        "LLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPCLIKNTGNPQAPGTLIGASRDEDELPVKGISNLNNMAMFSVSGPGMK"
        "GMVGMAARVFAAMSRARISVVLITQSSSEYSISFCVPQSDCVRAERAMQEEFYLELKEGLLEPLAVTERLAIISVVGDGMRT"
        "LRGISAKFFAALARANINIVAIAQGSSERSISVVVNNDDATTGVRVTHQMLFNTDQVIEVFVIGVGGVGGALLEQLKRQQSWL"
        "KNKHIDLRVCGVANSKALLTNVHGLNLENWQEELAQAKEPFNLGRLIRLVKEYHLLNPVIVDCTSSQAVADQYADFLREGFHV"
        "VTPNKKANTSSMDYYHLLRHAAEKSRRKFLYDTNVGAGLPVIENLQNLLNAGDELMKFSGILSGSLSYIFGKLDEGMSFSEATT"
        "LAREMGYTEPDPRDDLSGMDVARKLLILARETGRELELADIEIEPVLPAEFNAEGDVAAFMANLSQLDDLFAARVAKARDEGKVL"
        "RYVGNIDEDGACRVKIAEVDGNDPLFKVKNGENALAFYSHYYQPLPLVLRGYGAGNDVTAAGVFADLLRTLSWKLGV*"
    }
    assert recalculated_genes[1].transcripts[0].qualifiers["translation"] == {
        "MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGRFADKLPSEPRENIVYQCWERFCQELGK"
        "QIPVAMTLEKNMPIGSGLGSSACSVVAALMAMNEHCGKPLNDTRLLALMGELEGRISGSIHYDNVAPCFLGGMQLMIEE"
        "NDIISQQVPGFDEWLWVLAYPGIKVSTAEARAILPAQYRRQDCIAHGRHLAGFIHACYSRQPELAAKLMKDVIAEPYRE"
        "RLLPGFRQARQAVAEIGAVASGISGSGPTLFALCDKPDTAQRVADWLGKNYLQNQEGFVHICRLDTAGARVLEN*"
    }
def test_reserved_attributes(self, test_data_dir, tmp_path):
    """Exercise both settings of ``raise_on_reserved_attributes`` on GFF3 export.

    With the flag off the export is expected to emit ``ReservedKeyWarning``;
    with the flag on it is expected to raise ``GFF3ExportException``.
    """
    source = test_data_dir / "INSC1020_subset_gff3.gb"
    collections = [record.to_annotation_collection() for record in parse_genbank(source)]
    out_path = tmp_path / "tmp.gff"

    # Lenient mode: the export goes ahead but must warn.
    with pytest.warns(ReservedKeyWarning), open(out_path, "w") as out:
        collection_to_gff3(collections, out, raise_on_reserved_attributes=False)

    # Strict mode: the same export must fail.
    with pytest.raises(GFF3ExportException), open(out_path, "w") as out:
        collection_to_gff3(collections, out, raise_on_reserved_attributes=True)
def test_broken_frameshift(self, test_data_dir):
    """Translating a CDS whose location was merged must raise.

    The CDS location of the primary transcript is replaced by its
    ``merge_overlapping()`` result, after which ``translate()`` is expected to
    raise ``MismatchedFrameException`` because the frames list no longer
    matches the location.
    """
    source = test_data_dir / "insO_frameshift.gbk"
    with open(source, "r") as handle:
        models = list(ParsedAnnotationRecord.parsed_annotation_records_to_model(parse_genbank(handle)))
    collection = models[0]

    primary_cds = collection.genes[0].get_primary_transcript().cds
    # Overwrite the private location in place to desynchronise it from the frames.
    primary_cds._location = primary_cds._location.merge_overlapping()

    with pytest.raises(MismatchedFrameException):
        primary_cds.translate()
def test_records_to_fasta_from_genbank_fasta_header(test_data_dir, tmp_path):
    """FASTA written straight from parsed records must equal ``INSC1003.fa``.

    Exporting from the ``ParsedAnnotationRecord`` objects directly retains the
    FASTA comments, so the whole file is compared verbatim.
    """
    source = test_data_dir / "INSC1003.gbk"
    fasta_out = tmp_path / "tmp.fasta"

    # The parser output is consumed while the GenBank handle is still open.
    with open(source, "r") as gbk_handle, open(fasta_out, "w") as fasta_handle:
        for record in parse_genbank(gbk_handle):
            record.to_fasta(fasta_handle)

    with open(fasta_out, "r") as produced, open(test_data_dir / "INSC1003.fa", "r") as reference:
        assert produced.read() == reference.read()
def test_genbank_to_gff(self, test_data_dir, tmp_path, gbk, gff3, add_sequences):
    """Export a parsed GenBank file to GFF3 and compare it to a reference file.

    INSC1006_chrI.gff3 and INSC1003.gff3 were created from INSC1006_chrI.gbff
    and INSC1003.gbk respectively, so we can compare to the source file.

    Args:
        test_data_dir: Directory holding the input and reference files.
        tmp_path: Scratch directory the GFF3 output is written to.
        gbk: File name of the GenBank input, relative to ``test_data_dir``.
        gff3: File name of the reference GFF3, relative to ``test_data_dir``.
        add_sequences: Forwarded to ``collection_to_gff3``.
    """
    gbk = test_data_dir / gbk
    with open(gbk, "r") as fh:
        parsed = list(ParsedAnnotationRecord.parsed_annotation_records_to_model(parse_genbank(fh)))

    tmp_gff = tmp_path / "tmp.gff"
    with open(tmp_gff, "w") as fh:
        collection_to_gff3(parsed, fh, add_sequences=add_sequences)

    # Open both files under a context manager so the handles are closed
    # deterministically instead of being left for the garbage collector.
    with open(tmp_gff) as observed_fh, open(test_data_dir / gff3) as expected_fh:
        # NOTE(review): zip() stops at the shorter file, so extra trailing
        # lines in either file go unnoticed -- confirm whether that is intended.
        for l1, l2 in zip(observed_fh, expected_fh):
            assert l1 == l2
def test_tbl_export_from_genbank(test_data_dir, tmp_path, genbank, expected_tbl):
    """Export a GenBank file to TBL and compare it to the expected TBL verbatim.

    The export uses fixed ``locus_tag_prefix``, ``submitter_lab_name`` and
    ``random_seed`` values.
    """
    source = test_data_dir / genbank
    records = list(ParsedAnnotationRecord.parsed_annotation_records_to_model(parse_genbank(source)))

    tbl_out = tmp_path / "tmp.tbl"
    with open(tbl_out, "w") as out:
        collection_to_tbl(
            records,
            out,
            locus_tag_prefix="test",
            submitter_lab_name="inscripta",
            random_seed=123,
        )

    with open(tbl_out) as produced, open(test_data_dir / expected_tbl) as reference:
        assert produced.read() == reference.read()
def test_collection_to_fasta_from_genbank(test_data_dir, tmp_path):
    """FASTA exported from the model must equal ``INSC1006_chrI.fa`` exactly.

    The export matches byte for byte because there are no FASTA comments.
    """
    source = test_data_dir / "INSC1006_chrI.gbff"
    with open(source, "r") as handle:
        collections = list(ParsedAnnotationRecord.parsed_annotation_records_to_model(parse_genbank(handle)))

    fasta_out = tmp_path / "tmp.fasta"
    with open(fasta_out, "w") as out:
        collection_to_fasta(collections, out)

    with open(fasta_out, "r") as produced, open(test_data_dir / "INSC1006_chrI.fa", "r") as reference:
        assert produced.read() == reference.read()
def test_records_to_fasta_from_genbank(test_data_dir, tmp_path):
    """FASTA written from parsed records matches the reference except its header.

    INSC1006_chrI.gbff will have FASTA comments when exported by BioPython, so
    the sequence lines match ``INSC1006_chrI.fa`` while the first line differs
    after its first whitespace-separated token.
    """
    source = test_data_dir / "INSC1006_chrI.gbff"
    fasta_out = tmp_path / "tmp.fasta"

    # The parser output is consumed while the GenBank handle is still open.
    with open(source, "r") as gbk_handle, open(fasta_out, "w") as fasta_handle:
        for record in parse_genbank(gbk_handle):
            record.to_fasta(fasta_handle)

    with open(fasta_out, "r") as produced, open(test_data_dir / "INSC1006_chrI.fa", "r") as reference:
        produced_lines = list(produced)
        reference_lines = list(reference)

    # Everything after the header line is identical.
    assert produced_lines[1:] == reference_lines[1:]
    # The header lines differ as a whole but share their first token.
    assert produced_lines[0] != reference_lines[0]
    assert produced_lines[0].split()[0] == reference_lines[0].split()[0]
def test_collection_to_fasta_from_genbank_fasta_header(test_data_dir, tmp_path):
    """FASTA exported from the model matches the reference except its header.

    INSC1003.fa has FASTA comments, so the sequence lines match while the
    comments are lost: the first line differs after its first
    whitespace-separated token.
    """
    source = test_data_dir / "INSC1003.gbk"
    with open(source, "r") as handle:
        collections = list(ParsedAnnotationRecord.parsed_annotation_records_to_model(parse_genbank(handle)))

    fasta_out = tmp_path / "tmp.fasta"
    with open(fasta_out, "w") as out:
        collection_to_fasta(collections, out)

    with open(fasta_out, "r") as produced, open(test_data_dir / "INSC1003.fa", "r") as reference:
        produced_lines = list(produced)
        reference_lines = list(reference)

    # Everything after the header line is identical.
    assert produced_lines[1:] == reference_lines[1:]
    # The header lines differ as a whole but share their first token.
    assert produced_lines[0] != reference_lines[0]
    assert produced_lines[0].split()[0] == reference_lines[0].split()[0]
def test_parse_inso(self, test_data_dir):
    """The GenBank and GFF3 versions of insO must give the same primary protein.

    Both files are parsed into models and the primary protein of the first
    gene of each is compared to a hard-coded expected sequence. This proves we
    handle frame and phase.
    """
    gbk_path = test_data_dir / "insO_frameshift.gbk"
    gff3_path = test_data_dir / "insO_frameshift.gff3"

    with open(gbk_path, "r") as handle:
        from_genbank = list(ParsedAnnotationRecord.parsed_annotation_records_to_model(parse_genbank(handle)))[0]

    # The GFF3 parser is handed the path itself rather than an open handle.
    from_gff3 = list(
        ParsedAnnotationRecord.parsed_annotation_records_to_model(parse_gff3_embedded_fasta(gff3_path))
    )[0]

    expected_protein = (
        "MKKRNFSAEFKRESAQLVVDQKYTVADAAKAMDVGLSTMTRWVKQLRDERQGKTPKASPITPEQIEIRKLRKKLQRIEMENEILKKNRP"
        "EKPDGRRAVLRSQVLELHGISHGSAGARSIATMATRRGYQMGRWLAGRLMKELGLVSCQQPTHRYKRGGHEHVAIPNYLERQFAVTEPNQV"
        "WCGDVTYIWTGKRWAYLAVVLDLFARKPVGWAMSFSPDSRLTMKALEMAWETRGKPVGVMFQSDQGSHYTSRQFRQLLWRYRIRQSMSRR"
        "GNCWDNSPMERFFRSLKNEWVPATGYVSFSDAAHAITDYIVGYYSALRPHEYNGGLPPNESENRYWKNSNAEASFS*"
    )

    genbank_protein = str(from_genbank.genes[0].get_primary_protein())
    gff3_protein = str(from_gff3.genes[0].get_primary_protein())
    assert genbank_protein == gff3_protein == expected_protein