Example #1
0
def test_sequences(test_data_dir, tmp_path, gbk, flavor):
    """Round-trip a GenBank file through the exporter and compare the two parses.

    The input is parsed into the model, written back out with
    ``collection_to_genbank`` using ``flavor``, and parsed again. Only the
    first collection of each parse is compared, and only the first transcript
    of every gene.
    """
    source_path = test_data_dir / gbk
    with open(source_path, "r") as in_handle:
        records = parse_genbank(in_handle)
        collections = list(
            ParsedAnnotationRecord.parsed_annotation_records_to_model(records))

    round_trip_path = tmp_path / "tmp.gbk"
    with open(round_trip_path, "w") as out_handle:
        collection_to_genbank(collections, out_handle, flavor)

    with open(round_trip_path, "r") as in_handle:
        records = parse_genbank(in_handle)
        new_collection = list(
            ParsedAnnotationRecord.parsed_annotation_records_to_model(records))

    before, after = collections[0], new_collection[0]
    # Guard against zip() below silently hiding a dropped or added gene.
    assert len(before.genes) == len(after.genes)

    for original_gene, exported_gene in zip(before, after):
        assert original_gene._location == exported_gene._location

        original_tx = original_gene.transcripts[0]
        exported_tx = exported_gene.transcripts[0]
        assert original_tx._location == exported_tx._location

        if original_tx.is_coding:
            assert original_tx.cds._location == exported_tx.cds._location
            original_protein = original_tx.get_protein_sequence()
            assert original_protein == exported_tx.get_protein_sequence()

        original_rna = original_tx.get_transcript_sequence()
        assert original_rna == exported_tx.get_transcript_sequence()
Example #2
0
def test_missing_translation(test_data_dir, tmp_path):
    """Check the ``translation`` qualifier after export with and without ``update_translations``.

    With ``update_translations=False`` the first gene's first transcript must
    come back without a ``translation`` qualifier, while the second gene's
    must equal the value expected below. With ``update_translations=True``
    both must equal the translations expected below.
    """
    source_path = test_data_dir / "INSC1003_wrong_missing_translation.gbk"
    with open(source_path, "r") as in_handle:
        collections = list(
            ParsedAnnotationRecord.parsed_annotation_records_to_model(
                parse_genbank(in_handle, gbk_type=GenBankParserType.SORTED)))

    out_path = tmp_path / "tmp.gbk"

    def export_and_reload(update_translations):
        """Write ``collections`` to ``out_path``, re-parse it, and return the first record's genes."""
        with open(out_path, "w") as out_handle:
            collection_to_genbank(collections,
                                  out_handle,
                                  update_translations=update_translations)
        with open(out_path, "r") as reload_handle:
            reloaded = list(
                ParsedAnnotationRecord.parsed_annotation_records_to_model(
                    parse_genbank(reload_handle,
                                  gbk_type=GenBankParserType.SORTED)))
            return reloaded[0].genes

    second_gene_without_update = (
        "MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGRFADKLPSEPRENIVYQCWERFCQELGK"
        "QIPVAMTLEKNMPIGSGLGSSACSVVAALMAMNEHCGKPLNDTRLLALMGELEGRISGSIHYDNVAPCFLGGMQLMIEE"
        "NDIPELAAKLMKDVIAEPYRERLLPGFRQARQAVAEIGAVASGISGSGPTLFALCDKPDTAQRVADWLGKNYLQNQEGF"
        "VHICRLDTAGARVLEN"
    )
    first_gene_with_update = (
        "MRVLKFGGTSVANAERFLRVADILESNARQGQVATVLSAPAKITNHLVAMIEKTISGQDALPNISDAERIFAELLTGLAAA"
        "QPGFPLAQLKTFVDQEFAQIKHVLHGISLLGQCPDSINAALICRGEKMSIAIMAGVLEARGHNVTVIDPVEKLLAVGHYLE"
        "STVDIAESTRRIAASRIPADHMVLMAGFTAGNEKGELVVLGRNGSDYSAAVLAACLRADCCEIWTDVDGVYTCDPRQVPDAR"
        "LLKSMSYQEAMELSYFGAKVLHPRTITPIAQFQIPCLIKNTGNPQAPGTLIGASRDEDELPVKGISNLNNMAMFSVSGPGMK"
        "GMVGMAARVFAAMSRARISVVLITQSSSEYSISFCVPQSDCVRAERAMQEEFYLELKEGLLEPLAVTERLAIISVVGDGMRT"
        "LRGISAKFFAALARANINIVAIAQGSSERSISVVVNNDDATTGVRVTHQMLFNTDQVIEVFVIGVGGVGGALLEQLKRQQSWL"
        "KNKHIDLRVCGVANSKALLTNVHGLNLENWQEELAQAKEPFNLGRLIRLVKEYHLLNPVIVDCTSSQAVADQYADFLREGFHV"
        "VTPNKKANTSSMDYYHLLRHAAEKSRRKFLYDTNVGAGLPVIENLQNLLNAGDELMKFSGILSGSLSYIFGKLDEGMSFSEATT"
        "LAREMGYTEPDPRDDLSGMDVARKLLILARETGRELELADIEIEPVLPAEFNAEGDVAAFMANLSQLDDLFAARVAKARDEGKVL"
        "RYVGNIDEDGACRVKIAEVDGNDPLFKVKNGENALAFYSHYYQPLPLVLRGYGAGNDVTAAGVFADLLRTLSWKLGV*"
    )
    second_gene_with_update = (
        "MVKVYAPASSANMSVGFDVLGAAVTPVDGALLGDVVTVEAAETFSLNNLGRFADKLPSEPRENIVYQCWERFCQELGK"
        "QIPVAMTLEKNMPIGSGLGSSACSVVAALMAMNEHCGKPLNDTRLLALMGELEGRISGSIHYDNVAPCFLGGMQLMIEE"
        "NDIISQQVPGFDEWLWVLAYPGIKVSTAEARAILPAQYRRQDCIAHGRHLAGFIHACYSRQPELAAKLMKDVIAEPYRE"
        "RLLPGFRQARQAVAEIGAVASGISGSGPTLFALCDKPDTAQRVADWLGKNYLQNQEGFVHICRLDTAGARVLEN*"
    )

    # First pass: translations are written exactly as they are held in the model.
    genes = export_and_reload(update_translations=False)
    first_qualifiers = genes[0].transcripts[0].qualifiers
    second_qualifiers = genes[1].transcripts[0].qualifiers
    assert "translation" not in first_qualifiers
    assert "translation" in second_qualifiers
    assert second_qualifiers["translation"] == {second_gene_without_update}

    # Second pass: export to the same path again, forcing the translations to be recalculated.
    genes = export_and_reload(update_translations=True)
    first_qualifiers = genes[0].transcripts[0].qualifiers
    second_qualifiers = genes[1].transcripts[0].qualifiers
    assert first_qualifiers["translation"] == {first_gene_with_update}
    assert second_qualifiers["translation"] == {second_gene_with_update}
 def test_reserved_attributes(self, test_data_dir, tmp_path):
     """GFF3 export warns about reserved keys by default-off mode and raises when asked to.

     With ``raise_on_reserved_attributes=False`` a ``ReservedKeyWarning`` must be
     emitted; with ``raise_on_reserved_attributes=True`` a
     ``GFF3ExportException`` must be raised.
     """
     genbank_path = test_data_dir / "INSC1020_subset_gff3.gb"
     records = list(parse_genbank(genbank_path))
     collections = [record.to_annotation_collection() for record in records]
     out_path = tmp_path / "tmp.gff"

     with pytest.warns(ReservedKeyWarning), open(out_path, "w") as out_handle:
         collection_to_gff3(collections, out_handle, raise_on_reserved_attributes=False)

     with pytest.raises(GFF3ExportException), open(out_path, "w") as out_handle:
         collection_to_gff3(collections, out_handle, raise_on_reserved_attributes=True)
Example #4
0
    def test_broken_frameshift(self, test_data_dir):
        """Translating a CDS whose location was merged after parsing must raise.

        Once the overlapping blocks of the CDS location are merged, the frames
        list no longer matches the location, so ``translate()`` is expected to
        raise ``MismatchedFrameException``.
        """
        genbank_path = test_data_dir / "insO_frameshift.gbk"
        with open(genbank_path, "r") as handle:
            records = ParsedAnnotationRecord.parsed_annotation_records_to_model(
                parse_genbank(handle))
            first_record = list(records)[0]

        primary_cds = first_record.genes[0].get_primary_transcript().cds
        merged_location = primary_cds._location.merge_overlapping()
        primary_cds._location = merged_location

        with pytest.raises(MismatchedFrameException):
            primary_cds.translate()
Example #5
0
def test_records_to_fasta_from_genbank_fasta_header(test_data_dir, tmp_path):
    """FASTA written straight from each parsed record must equal ``INSC1003.fa`` exactly.

    Exporting from a ParsedAnnotationRecord directly retains the FASTA
    comments, so the whole file is compared, header line included.
    """
    fasta_path = tmp_path / "tmp.fasta"
    with open(test_data_dir / "INSC1003.gbk", "r") as genbank_handle:
        records = parse_genbank(genbank_handle)
        with open(fasta_path, "w") as fasta_handle:
            for record in records:
                record.to_fasta(fasta_handle)

    expected_path = test_data_dir / "INSC1003.fa"
    with open(fasta_path, "r") as produced, open(expected_path, "r") as expected:
        produced_text = produced.read()
        expected_text = expected.read()
    assert produced_text == expected_text
    def test_genbank_to_gff(self, test_data_dir, tmp_path, gbk, gff3, add_sequences):
        """
        INSC1006_chrI.gff3 and INSC1003.gff3 were created from INSC1006_chrI.gbff and INSC1003.gbk
        respectively, so we can compare to the source file.

        The GenBank file ``gbk`` is parsed into the model, exported to GFF3 with the given
        ``add_sequences`` setting, and compared line by line against the expected file ``gff3``.
        """
        gbk = test_data_dir / gbk
        with open(gbk, "r") as fh:
            parsed = list(ParsedAnnotationRecord.parsed_annotation_records_to_model(parse_genbank(fh)))

        tmp_gff = tmp_path / "tmp.gff"
        with open(tmp_gff, "w") as fh:
            collection_to_gff3(parsed, fh, add_sequences=add_sequences)

        # Open both files in a context manager so the handles are closed even when an
        # assertion fails (the bare open() calls previously leaked them).
        with open(tmp_gff) as produced, open(test_data_dir / gff3) as expected:
            # NOTE(review): zip() stops at the shorter file, so trailing lines of the longer
            # one are never compared -- presumably relied on when add_sequences is false;
            # confirm before tightening this to a strict comparison.
            for line_number, (l1, l2) in enumerate(zip(produced, expected), start=1):
                assert l1 == l2, f"GFF3 output differs from {gff3} at line {line_number}"
Example #7
0
def test_tbl_export_from_genbank(test_data_dir, tmp_path, genbank,
                                 expected_tbl):
    """Export a parsed GenBank file to TBL and compare it with ``expected_tbl``.

    The export uses a fixed ``random_seed`` together with a fixed locus tag
    prefix and submitter lab name, and the result must match the expected
    file exactly.
    """
    records = parse_genbank(test_data_dir / genbank)
    collections = list(
        ParsedAnnotationRecord.parsed_annotation_records_to_model(records))

    out_path = tmp_path / "tmp.tbl"
    export_options = {
        "locus_tag_prefix": "test",
        "submitter_lab_name": "inscripta",
        "random_seed": 123,
    }
    with open(out_path, "w") as out_handle:
        collection_to_tbl(collections, out_handle, **export_options)

    with open(out_path) as produced, open(test_data_dir / expected_tbl) as expected:
        produced_text = produced.read()
        expected_text = expected.read()
    assert produced_text == expected_text
Example #8
0
def test_collection_to_fasta_from_genbank(test_data_dir, tmp_path):
    """FASTA exported from the model of ``INSC1006_chrI.gbff`` must equal ``INSC1006_chrI.fa``.

    The match is exact because there are no FASTA comments.
    """
    with open(test_data_dir / "INSC1006_chrI.gbff", "r") as genbank_handle:
        records = parse_genbank(genbank_handle)
        collections = list(
            ParsedAnnotationRecord.parsed_annotation_records_to_model(records))

    fasta_path = tmp_path / "tmp.fasta"
    with open(fasta_path, "w") as fasta_handle:
        collection_to_fasta(collections, fasta_handle)

    expected_path = test_data_dir / "INSC1006_chrI.fa"
    with open(fasta_path, "r") as produced, open(expected_path, "r") as expected:
        produced_text = produced.read()
        expected_text = expected.read()
    assert produced_text == expected_text
Example #9
0
def test_records_to_fasta_from_genbank(test_data_dir, tmp_path):
    """FASTA written from each parsed record matches ``INSC1006_chrI.fa`` except for the header.

    INSC1006_chrI.gbff will have FASTA comments when exported by BioPython,
    so the sequence lines must be identical while the header lines differ,
    sharing only their first whitespace-separated token.
    """
    fasta_path = tmp_path / "tmp.fasta"
    with open(test_data_dir / "INSC1006_chrI.gbff", "r") as genbank_handle:
        records = parse_genbank(genbank_handle)
        with open(fasta_path, "w") as fasta_handle:
            for record in records:
                record.to_fasta(fasta_handle)

    expected_path = test_data_dir / "INSC1006_chrI.fa"
    with open(fasta_path, "r") as produced, open(expected_path, "r") as expected:
        produced_lines = produced.readlines()
        expected_lines = expected.readlines()

    produced_header, expected_header = produced_lines[0], expected_lines[0]
    assert produced_lines[1:] == expected_lines[1:]
    assert produced_header != expected_header
    assert produced_header.split()[0] == expected_header.split()[0]
Example #10
0
def test_collection_to_fasta_from_genbank_fasta_header(test_data_dir,
                                                       tmp_path):
    """FASTA exported from the model of ``INSC1003.gbk`` matches ``INSC1003.fa`` except for the header.

    INSC1003.fa has FASTA comments, so the sequence lines must be identical
    while the header lines differ, sharing only their first
    whitespace-separated token.
    """
    with open(test_data_dir / "INSC1003.gbk", "r") as genbank_handle:
        records = parse_genbank(genbank_handle)
        collections = list(
            ParsedAnnotationRecord.parsed_annotation_records_to_model(records))

    fasta_path = tmp_path / "tmp.fasta"
    with open(fasta_path, "w") as fasta_handle:
        collection_to_fasta(collections, fasta_handle)

    expected_path = test_data_dir / "INSC1003.fa"
    with open(fasta_path, "r") as produced, open(expected_path, "r") as expected:
        produced_lines = produced.readlines()
        expected_lines = expected.readlines()

    produced_header, expected_header = produced_lines[0], expected_lines[0]
    assert produced_lines[1:] == expected_lines[1:]
    assert produced_header != expected_header
    assert produced_header.split()[0] == expected_header.split()[0]
Example #11
0
    def test_parse_inso(self, test_data_dir):
        """The GenBank and GFF3 versions of insO must both yield the expected protein.

        This proves we handle frame and phase.
        """
        expected_protein = (
            "MKKRNFSAEFKRESAQLVVDQKYTVADAAKAMDVGLSTMTRWVKQLRDERQGKTPKASPITPEQIEIRKLRKKLQRIEMENEILKKNRP"
            "EKPDGRRAVLRSQVLELHGISHGSAGARSIATMATRRGYQMGRWLAGRLMKELGLVSCQQPTHRYKRGGHEHVAIPNYLERQFAVTEPNQV"
            "WCGDVTYIWTGKRWAYLAVVLDLFARKPVGWAMSFSPDSRLTMKALEMAWETRGKPVGVMFQSDQGSHYTSRQFRQLLWRYRIRQSMSRR"
            "GNCWDNSPMERFFRSLKNEWVPATGYVSFSDAAHAITDYIVGYYSALRPHEYNGGLPPNESENRYWKNSNAEASFS*"
        )
        genbank_path = test_data_dir / "insO_frameshift.gbk"
        gff3_path = test_data_dir / "insO_frameshift.gff3"

        with open(genbank_path, "r") as genbank_handle:
            genbank_records = list(
                ParsedAnnotationRecord.parsed_annotation_records_to_model(
                    parse_genbank(genbank_handle)))
            genbank_record = genbank_records[0]

        gff3_records = list(
            ParsedAnnotationRecord.parsed_annotation_records_to_model(
                parse_gff3_embedded_fasta(gff3_path)))
        gff3_record = gff3_records[0]

        genbank_protein = str(genbank_record.genes[0].get_primary_protein())
        gff3_protein = str(gff3_record.genes[0].get_primary_protein())
        assert genbank_protein == gff3_protein == expected_protein