def test_conversion_highlevel(path):
    """
    Test whether the high-level GenBank interface can properly read
    the locus, annotation and sequence from GenBank file and write
    these properties to a file, without data changing.

    Parameters
    ----------
    path : str
        Path of the file to be read.
        Its last two characters are passed on as `format` argument
        to the high-level functions.
    """
    suffix = path[-2:]
    gb_file = gb.GenBankFile.read(path)
    ref_locus = gb.get_locus(gb_file)
    ref_annot_seq = gb.get_annotated_sequence(gb_file, format=suffix)

    # Build a fresh file exclusively from the extracted properties
    gb_file = gb.GenBankFile()
    gb.set_locus(gb_file, *ref_locus)
    gb.set_annotated_sequence(gb_file, ref_annot_seq)

    # The context manager closes the temporary file
    # even if writing or re-reading raises an exception
    with TemporaryFile("w+") as temp:
        gb_file.write(temp)
        # Rewind, so the content that was just written is read again
        temp.seek(0)
        gb_file = gb.GenBankFile.read(temp)

    test_locus = gb.get_locus(gb_file)
    test_annot_seq = gb.get_annotated_sequence(gb_file, format=suffix)
    assert test_locus == ref_locus
    assert test_annot_seq.sequence == ref_annot_seq.sequence
    assert test_annot_seq.annotation == ref_annot_seq.annotation
    assert test_annot_seq.sequence_start == ref_annot_seq.sequence_start
def test_genbank_utility_gp():
    """
    Check whether the high-level utility functions return the expected
    content of a known GenPept file.
    """
    gp_file = gb.GenBankFile.read(
        join(data_dir("sequence"), "bt_lysozyme.gp")
    )
    assert gb.get_locus(gp_file) \
        == ("AAC37312", 147, "", False, "MAM", "27-APR-1993")
    assert gb.get_definition(gp_file) == "lysozyme [Bos taurus]."
    assert gb.get_version(gp_file) == "AAC37312.1"
    assert gb.get_gi(gp_file) == 163334
    annotation = gb.get_annotation(gp_file)
    feature = seq.Feature(
        "Site",
        [seq.Location(start, stop) for start, stop in zip(
            [52,55,62,76,78,81,117,120,125],
            [53,55,62,76,78,81,117,120,126]
        )],
        {"note": "lysozyme catalytic cleft [active]", "site_type": "active"}
    )
    # The expected feature is considered found, if a feature with the
    # same key and locations exists, that contains at least every
    # expected qualifier with the same value.
    # The explicit '(key, val)' tuple is required for the membership
    # test: a bare 'key, val in ...' would create an always truthy
    # tuple and the qualifiers would never be checked
    in_annotation = any(
        f.key == feature.key
        and f.locs == feature.locs
        and all(
            (key, val) in f.qual.items()
            for key, val in feature.qual.items()
        )
        for f in annotation
    )
    assert in_annotation
    assert len(gb.get_sequence(gp_file, format="gp")) == 147
def test_genbank_utility_gb():
    """
    Check whether the high-level utility functions return the expected
    content of a known GenBank file.
    """
    gb_file = gb.GenBankFile.read(
        join(data_dir("sequence"), "ec_bl21.gb")
    )
    assert gb.get_locus(gb_file) \
        == ("CP001509", 4558953, "DNA", True, "BCT", "16-FEB-2017")
    assert gb.get_definition(gb_file) \
        == ("Escherichia coli BL21(DE3), complete genome.")
    assert gb.get_version(gb_file) == "CP001509.3"
    assert gb.get_gi(gb_file) == 296142109
    assert gb.get_db_link(gb_file) \
        == {"BioProject" : "PRJNA20713", "BioSample" : "SAMN02603478"}
    annotation = gb.get_annotation(gb_file, include_only=["CDS"])
    feature = seq.Feature(
        "CDS",
        [seq.Location(5681, 6457, seq.Location.Strand.REVERSE)],
        {"gene": "yaaA", "transl_table": "11"}
    )
    # The expected feature is considered found, if a feature with the
    # same key and locations exists, that contains at least every
    # expected qualifier with the same value.
    # The explicit '(key, val)' tuple is required for the membership
    # test: a bare 'key, val in ...' would create an always truthy
    # tuple and the qualifiers would never be checked
    in_annotation = any(
        f.key == feature.key
        and f.locs == feature.locs
        and all(
            (key, val) in f.qual.items()
            for key, val in feature.qual.items()
        )
        for f in annotation
    )
    assert in_annotation
    assert len(gb.get_sequence(gb_file, format="gb")) == 4558953
"protein", ret_type="gb") # Array that will hold for each of the genes and each of the 4 domains # the first and last position # The array is initally filled with -1, as the value -1 will indicate # that the domain does not exist in the sigma factor domain_pos = np.full((len(genes), 4, 2), -1, dtype=int) # Array that will hold the total sequence length of each sigma factor seq_lengths = np.zeros(len(genes), dtype=int) # Read the merged file containing multiple GenBank entries multi_file = gb.MultiFile() multi_file.read(file_name) # Iterate over each GenBank entry for i, gb_file in enumerate(multi_file): _, length, _, _, _, _ = gb.get_locus(gb_file) seq_lengths[i] = length annotation = gb.get_annotation(gb_file) # Find features, that represent a sigma factor domain for feature in annotation: if feature.key == "Region" and "note" in feature.qual \ and "Sigma-70 factor domain" in feature.qual["note"]: # Extract the domain number # and decrement for 0-based indexing # # e.g. 'Sigma-70 factor domain-2.' => 1 # ^ domain_index = int( re.findall("(?<=Sigma-70 factor domain-)\d+", feature.qual["note"])[0]) - 1 # Expect a single contiguous location of the domain
import biotite.sequence.graphics as graphics
import biotite.database.entrez as entrez


PLASMID_URL = (
    "https://media.addgene.org/snapgene-media/"
    "v1.6.2-0-g4b4ed87/sequences/67/17/246717/"
    "addgene-plasmid-26094-sequence-246717.gbk"
)


response = requests.get(PLASMID_URL)
# Fail with a clear HTTP error, if the download was not successful,
# instead of trying to parse an error page as GenBank file
response.raise_for_status()
gb_file = gb.GenBankFile.read(io.StringIO(response.text))
annotation = gb.get_annotation(gb_file, include_only=[
    "promoter", "terminator", "protein_bind",
    "RBS", "CDS", "rep_origin", "primer_bind"
])
# The second element of the locus tuple is used as sequence length
_, seq_length, _, _, _, _ = gb.get_locus(gb_file)
# AddGene stores the plasmid name in the 'KEYWORDS' field
# [0][0][0] ->
# The first (and only) 'KEYWORDS' field
# The first entry in the tuple
# The first (and only) line in the field
plasmid_name = gb_file.get_fields("KEYWORDS")[0][0][0]


def custom_feature_formatter(feature):
    # AddGene stores the feature label in the '\label' qualifier
    label = feature.qual.get("label")
    if feature.key == "promoter":
        return True, biotite.colors["dimgreen"], "black", label
    elif feature.key == "terminator":
        return True, "firebrick", "black", label