import biotite
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb
import biotite.sequence.graphics as graphics
import biotite.application.clustalo as clustalo
import biotite.database.entrez as entrez

# Search for protein products of the lexA gene
# in the UniProtKB/Swiss-Prot database
query = entrez.SimpleQuery("lexA", "Gene Name") \
        & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
# Search for the first 200 hits
# More than 200 UIDs are not recommended for the EFetch service
uids = entrez.search(query, db_name="protein", number=200)
file_name = entrez.fetch_single_file(
    uids, biotite.temp_file("lexa.gb"), db_name="protein", ret_type="gb"
)
# The file contains multiple concatenated GenPept files
# -> Usage of MultiFile
multi_file = gb.MultiFile("gp")
multi_file.read(file_name)
# Separate MultiFile into single GenPeptFile instances
files = [f for f in multi_file]
print("Definitions:")
for file in files[:10]:
    print(file.get_definition())
print()
print("Sources:")
for file in files[:10]:
    print(file.get_source())

########################################################################
# The names of the sources are too long to be properly displayed later
# on. Therefore, we write a function that creates a proper abbreviation
# for a species name.
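A minimal sketch of such an abbreviation function, assuming source strings of the form "Escherichia coli (strain K12)"; the helper name abbreviate and the strain-stripping rule are illustrative and not taken from the original example:

import re

def abbreviate(species):
    # Strip a possible strain suffix in parentheses,
    # e.g. "Escherichia coli (strain K12)" -> "Escherichia coli"
    species = species.split("(")[0].strip()
    genus, _, epithet = species.partition(" ")
    # Abbreviate the genus to its initial letter, e.g. "E. coli"
    return f"{genus[0]}. {epithet}" if epithet else genus

For example, abbreviate("Thermus thermophilus (strain HB8)") would return "T. thermophilus".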
uids += ids
# Download corresponding GenBank files as single, merged file
file_name = entrez.fetch_single_file(
    uids, biotite.temp_file("gb"), "protein", ret_type="gb"
)

# Array that will hold for each of the genes and each of the 4 domains
# the first and last position
# The array is initially filled with -1, as the value -1 will indicate
# that the domain does not exist in the sigma factor
domain_pos = np.full((len(genes), 4, 2), -1, dtype=int)
# Array that will hold the total sequence length of each sigma factor
seq_lengths = np.zeros(len(genes), dtype=int)

# Read the merged file containing multiple GenBank entries
multi_file = gb.MultiFile()
multi_file.read(file_name)
# Iterate over each GenBank entry
for i, gb_file in enumerate(multi_file):
    _, length, _, _, _, _ = gb.get_locus(gb_file)
    seq_lengths[i] = length
    annotation = gb.get_annotation(gb_file)
    # Find features that represent a sigma factor domain
    for feature in annotation:
        if feature.key == "Region" and "note" in feature.qual \
           and "Sigma-70 factor domain" in feature.qual["note"]:
            # Extract the domain number
            # and decrement for 0-based indexing
            #
            # e.g. 'Sigma-70 factor domain-2.' => 1
            #                              ^
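The actual extraction statement is cut off above; a minimal sketch of how the domain number could be pulled out of such a note string with a regular expression (the names note and domain_index are illustrative, not from the original script):

import re

# Hypothetical note string, matching the example in the comment above
note = "Sigma-70 factor domain-2."
match = re.search(r"Sigma-70 factor domain-(\d+)", note)
if match is not None:
    # Decrement for 0-based indexing: "...domain-2." -> index 1
    domain_index = int(match.group(1)) - 1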
def test_multi_file():
    multi_file = gb.MultiFile(file_type="gp")
    multi_file.read(join(data_dir, "multifile.gp"))
    accessions = [f.get_accession() for f in multi_file]
    assert accessions == ["1L2Y_A", "3O5R_A", "5UGO_A"]