Ejemplo n.º 1
0
def test_conversion_highlevel(path):
    """
    Round-trip a GenBank file through the high-level interface.

    The locus and the annotated sequence are read from the file at
    `path`, stored in a fresh ``GenBankFile``, written to a temporary
    file and read back.
    The data read back must be equal to the data read initially.
    """
    # The last two characters of the path are passed on as the
    # `format` argument of the high-level functions
    file_format = path[-2:]

    # Read the reference data from the original file
    original_file = gb.GenBankFile()
    original_file.read(path)
    ref_locus = gb.get_locus(original_file)
    ref_annot_seq = gb.get_annotated_sequence(
        original_file, format=file_format
    )

    # Put the reference data into an empty file and write it to disk
    written_file = gb.GenBankFile()
    gb.set_locus(written_file, *ref_locus)
    gb.set_annotated_sequence(written_file, ref_annot_seq)
    temp_file_name = biotite.temp_file("gb")
    written_file.write(temp_file_name)

    # Read the written file again and compare with the reference
    reread_file = gb.GenBankFile()
    reread_file.read(temp_file_name)
    test_locus = gb.get_locus(reread_file)
    test_annot_seq = gb.get_annotated_sequence(
        reread_file, format=file_format
    )
    assert test_locus == ref_locus
    assert test_annot_seq.sequence == ref_annot_seq.sequence
    assert test_annot_seq.annotation == ref_annot_seq.annotation
    assert test_annot_seq.sequence_start == ref_annot_seq.sequence_start
Ejemplo n.º 2
0
def test_genbank_conversion():
    """
    Check that the getter methods of ``GenBankFile`` return the
    expected values for the known file ``ec_bl21.gb``.
    """
    genome_file = gb.GenBankFile()
    genome_file.read(join(data_dir, "ec_bl21.gb"))

    # Locus
    assert genome_file.get_locus()["length"] == "4558953"
    assert genome_file.get_locus()["type"] == "DNA circular"

    # Simple single-valued fields
    expected_definition = ("Escherichia coli BL21(DE3), "
                           "complete genome.")
    assert genome_file.get_definition() == expected_definition
    assert genome_file.get_version() == "CP001509.3"
    assert genome_file.get_gi() == "296142109"

    # Database links
    expected_db_link = {
        "BioProject": "PRJNA20713",
        "BioSample": "SAMN02603478"
    }
    assert genome_file.get_db_link() == expected_db_link

    # References: all but the first one are checked for
    # location and journal
    assert len(genome_file.get_references()) == 5
    for reference in genome_file.get_references()[1:]:
        assert reference["location"] == (1, 4558953)
        assert reference["journal"].endswith("Republic of Korea")

    # Comment
    expected_comment = ("On May 17, 2010 this sequence version "
                        "replaced CP001509.2. Bacteria available "
                        "from F. William Studier "
                        "(studier\x40bnl.gov).")
    assert genome_file.get_comment() == expected_comment

    # A single CDS feature, selected by its index in the feature list
    cds_annotation = genome_file.get_annotation(include_only=["CDS"])
    cds_feature = cds_annotation.get_features()[5]
    assert cds_feature.key == "CDS"
    assert cds_feature.qual["gene"] == "yaaA"
    assert cds_feature.qual["transl_table"] == "11"
    assert str(cds_feature.locs[0]) == "< 5681-6457"
Ejemplo n.º 3
0
def test_genbank_consistency(path):
    """
    Test whether the same annotation (if reasonable) can be read from a
    GFF3 file and a GenBank file.

    The GenBank file at `path` (relative to ``data_dir``) provides the
    reference annotation.
    The GFF3 file is expected under the same name, with the last three
    characters of `path` replaced by ``.gff3``.
    """
    gb_file = gb.GenBankFile()
    gb_file.read(join(data_dir, path))
    ref_annot = gb.get_annotation(gb_file)

    gff_file = gff.GFFFile()
    gff_file.read(join(data_dir, path[:-3] + ".gff3"))
    test_annot = gff.get_annotation(gff_file)

    # Remove qualifiers, since they will be different
    # in GFF3 and GenBank
    ref_annot = seq.Annotation(
        [seq.Feature(feature.key, feature.locs) for feature in ref_annot]
    )
    test_annot = seq.Annotation(
        [seq.Feature(feature.key, feature.locs) for feature in test_annot]
    )
    for feature in test_annot:
        # Only CDS, gene, intron and exon should be equal
        # in GenBank and GFF3
        if feature.key in ["CDS", "gene", "intron", "exon"]:
            try:
                # Each GFF3 feature must also exist in the GenBank
                # reference; checking against `test_annot` itself
                # would be trivially true
                assert feature in ref_annot
            except AssertionError:
                # Print the offending feature to ease debugging
                print(feature.key)
                for loc in feature.locs:
                    print(loc)
                raise
Ejemplo n.º 4
0
def test_genbank_utility_gp():
    """
    Check whether the high-level utility functions return the expected
    content of a known GenPept file.
    """
    gp_file = gb.GenBankFile()
    gp_file.read(join(data_dir, "bt_lysozyme.gp"))
    assert gb.get_locus(gp_file) \
        == ("AAC37312", 147, "", False, "MAM", "27-APR-1993")
    assert gb.get_definition(gp_file) == "lysozyme [Bos taurus]."
    assert gb.get_version(gp_file) == "AAC37312.1"
    assert gb.get_gi(gp_file) == 163334
    annotation = gb.get_annotation(gp_file)
    feature = seq.Feature("Site", [
        seq.Location(start, stop)
        for start, stop in zip([52, 55, 62, 76, 78, 81, 117, 120, 125],
                               [53, 55, 62, 76, 78, 81, 117, 120, 126])
    ], {
        "note": "lysozyme catalytic cleft [active]",
        "site_type": "active"
    })
    # The expected feature is found if any feature in the annotation
    # has the same key and locations and contains at least the
    # expected qualifiers (additional qualifiers are tolerated).
    # Each expected (key, value) pair must be tested for membership as
    # a tuple, otherwise the qualifier check is always true
    assert any(
        f.key == feature.key
        and f.locs == feature.locs
        and all(item in f.qual.items() for item in feature.qual.items())
        for f in annotation
    )
    assert len(gb.get_sequence(gp_file, format="gp")) == 147
Ejemplo n.º 5
0
def test_genbank_utility_gb():
    """
    Check whether the high-level utility functions return the expected
    content of a known GenBank file.
    """
    gb_file = gb.GenBankFile()
    gb_file.read(join(data_dir, "ec_bl21.gb"))
    assert gb.get_locus(gb_file) \
        == ("CP001509", 4558953, "DNA", True, "BCT", "16-FEB-2017")
    assert gb.get_definition(gb_file) \
        == ("Escherichia coli BL21(DE3), complete genome.")
    assert gb.get_version(gb_file) == "CP001509.3"
    assert gb.get_gi(gb_file) == 296142109
    assert gb.get_db_link(gb_file) \
        == {"BioProject" : "PRJNA20713", "BioSample" : "SAMN02603478"}
    annotation = gb.get_annotation(gb_file, include_only=["CDS"])
    feature = seq.Feature(
        "CDS", [seq.Location(5681, 6457, seq.Location.Strand.REVERSE)], {
            "gene": "yaaA",
            "transl_table": "11"
        })
    # The expected feature is found if any feature in the annotation
    # has the same key and locations and contains at least the
    # expected qualifiers (additional qualifiers are tolerated).
    # Each expected (key, value) pair must be tested for membership as
    # a tuple, otherwise the qualifier check is always true
    assert any(
        f.key == feature.key
        and f.locs == feature.locs
        and all(item in f.qual.items() for item in feature.qual.items())
        for f in annotation
    )
    assert len(gb.get_sequence(gb_file, format="gb")) == 4558953
Ejemplo n.º 6
0
def test_conversion_lowlevel(path):
    """
    Round-trip a GenBank file through the low-level interface.

    All fields are read from the file at `path`, appended one by one
    to an empty ``GenBankFile``, written to a temporary file and read
    back.
    The fields read back must be equal to the fields read initially.
    """
    # Collect the fields of the original file
    source_file = gb.GenBankFile()
    source_file.read(path)
    ref_parsed_fields = list(source_file)

    # Rebuild the file field by field and write it to disk
    rebuilt_file = gb.GenBankFile()
    for field_name, field_content, field_subfields in ref_parsed_fields:
        rebuilt_file.append(field_name, field_content, field_subfields)
    temp_file_name = biotite.temp_file("gb")
    rebuilt_file.write(temp_file_name)

    # Read the written file again and compare with the reference
    reread_file = gb.GenBankFile()
    reread_file.read(temp_file_name)
    test_parsed_fields = list(reread_file)
    assert test_parsed_fields == ref_parsed_fields
Ejemplo n.º 7
0
def test_contiguous_field_pos(path):
    """
    Check whether the internal field index of a ``GenBankFile`` is
    contiguous: The first entry starts at 0 and each following entry
    starts where the preceding entry stops.
    """
    gb_file = gb.GenBankFile()
    gb_file.read(path)
    field_pos = gb_file._field_pos
    assert field_pos[0][0] == 0
    # Walk over all pairs of adjacent index entries
    for current, previous in zip(field_pos[1:], field_pos[:-1]):
        current_start, _, _ = current
        _, previous_stop, _ = previous
        assert current_start == previous_stop
Ejemplo n.º 8
0
def fetch_gb_annotation(pdb_chain: str):
    """
    Fetch the GenBank entry of a protein chain and extract its
    ``SecStr`` features.

    Parameters
    ----------
    pdb_chain : str
        The identifier passed to ``entrez.fetch()`` for the
        ``protein`` database, e.g. ``"6FRH_A"``.

    Returns
    -------
    annotation
        The result of ``gb.get_annotation()`` for the fetched file,
        restricted to features with the key ``"SecStr"``.
    """
    # Fetch the GenBank file of the chain into the temporary directory
    file_name = entrez.fetch(
        pdb_chain, biotite.temp_dir(), "gb", "protein", "gb"
    )
    gb_file = gb.GenBankFile()
    gb_file.read(file_name)
    # Extract only the secondary structure annotation
    return gb.get_annotation(gb_file, include_only=["SecStr"])
Ejemplo n.º 9
0
def test_file_access():
    """
    Exercise the list-like interface of ``GenBankFile``: appending,
    inserting, getting, setting and deleting fields.
    """
    # Expected field tuples in the form (name, content, subfields)
    some_field = ("SOMEFIELD", ["Some content", "some other content"], {})
    other_field = ("OTHERFIELD", ["Additional content"], {})
    new_field = (
        "NEWFIELD", ["Extra content"], {"SUBFIELD" : ["L 1", "L 2"]}
    )
    third_field = ("THIRDFIELD", ["Supplementary content"], {})

    gb_file = gb.GenBankFile()
    gb_file.append("SOMEFIELD", ["Some content", "some other content"])
    # Inserting at the front shifts the appended field to index 1
    gb_file.insert(0, "OTHERFIELD", ["Additional content"])
    assert gb_file[1] == some_field

    # Overwrite the field at index 1 and add a third field
    gb_file[1] = (
        "NEWFIELD", ["Extra content"], {"SUBFIELD" : ["L 1", "L 2"]}
    )
    gb_file.append("THIRDFIELD", ["Supplementary content"])
    assert len(gb_file) == 3

    # Delete the fields from the front one by one
    assert gb_file[0] == other_field
    del gb_file[0]
    assert gb_file[0] == new_field
    del gb_file[0]
    assert gb_file[0] == third_field
    del gb_file[0]
    assert len(gb_file) == 0
Ejemplo n.º 10
0
# Code source: Patrick Kunzmann
# License: BSD 3 clause

import biotite
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as graphics
import biotite.sequence.align as align
import biotite.database.entrez as entrez
import numpy as np
import matplotlib.pyplot as plt

# Download E. coli BL21 genome
# (fetched as GenBank file into biotite's temporary directory)
file_name = entrez.fetch("CP001509", biotite.temp_dir(), "gb", "nuccore", "gb")
gb_file = gb.GenBankFile()
gb_file.read(file_name)
# Only features with the key 'gene' are included in the annotation
annot_seq = gb_file.get_annotated_sequence(include_only=["gene"])
# Find leuL gene
# NOTE: The loop does not stop at the first hit, so the last matching
# feature is kept; if no feature matches, 'leul_feature' stays
# undefined and the indexing below raises a NameError
for feature in annot_seq.annotation:
    if "gene" in feature.qual and feature.qual["gene"] == "leuL":
        leul_feature = feature
# Get leuL sequence
# (indexing the annotated sequence with the feature)
leul_seq = annot_seq[leul_feature]

# Download Salmonella enterica genome without annotations
# (fetched as FASTA file; 'file_name' is reused for the new download)
file_name = entrez.fetch("CP019649", biotite.temp_dir(), "fa", "nuccore",
                         "fasta")
fasta_file = fasta.FastaFile()
fasta_file.read(file_name)
se_genome = fasta.get_sequence(fasta_file)
Ejemplo n.º 11
0
def test_reverse_complement():
    """
    Check that applying ``reverse_complement()`` twice to an annotated
    sequence restores the original annotated sequence.
    """
    gb_file = gb.GenBankFile()
    gb_file.read(join(data_dir, "ec_bl21.gb"))
    original = gb.get_annotated_sequence(gb_file)
    restored = original.reverse_complement().reverse_complement()
    assert original == restored