Exemple #1
0
import biotite
import biotite.application.clustalo as clustalo
import biotite.database.entrez as entrez
import biotite.sequence.graphics as graphics
import biotite.sequence.io.fasta as fasta
import biotite.sequence.io.genbank as gb
# Search for protein products of LexA gene in UniProtKB/Swiss-Prot database
query = (
    entrez.SimpleQuery("lexA", "Gene Name")
    & entrez.SimpleQuery("srcdb_swiss-prot", "Properties")
)
# Restrict the search to the first 200 hits:
# More than 200 UIDs are not recommended for the EFetch service
uids = entrez.search(query, db_name="protein", number=200)
# Download all hits into a single temporary file
# NOTE: 'biotite.temp_file()' needs the top-level 'biotite' package to
# be imported itself - an 'import biotite.x.y as z' statement only binds
# the alias 'z', not the name 'biotite'
file_name = entrez.fetch_single_file(
    uids, biotite.temp_file("lexa.gb"), db_name="protein", ret_type="gb"
)
# The file contains multiple concatenated GenPept files
# -> Usage of MultiFile
multi_file = gb.MultiFile("gp")
multi_file.read(file_name)
# Separate MultiFile into single GenPeptFile instances
files = list(multi_file)
# Print definition and source of the first 10 entries as a preview
print("Definitions:")
for file in files[:10]:
    print(file.get_definition())
print()
print("Sources:")
for file in files[:10]:
    print(file.get_source())

########################################################################
# The names of the sources are too long to be properly displayed later
# on. Therefore, we write a function that creates a proper abbreviation
# for a species name.
Exemple #2
0
    uids += ids
# Fetch the GenBank entries for all collected UIDs from the 'protein'
# database and store them merged in one temporary file
file_name = entrez.fetch_single_file(
    uids, biotite.temp_file("gb"), "protein", ret_type="gb"
)

# Total sequence length of each sigma factor, one value per gene
seq_lengths = np.zeros(shape=len(genes), dtype=int)
# First and last position of each of the 4 domains for each gene
# The array is initially filled with -1, since -1 marks a domain
# that does not exist in the respective sigma factor
domain_pos = np.full(shape=(len(genes), 4, 2), fill_value=-1, dtype=int)
# Parse the merged file, which contains multiple GenBank entries
multi_file = gb.MultiFile()
multi_file.read(file_name)
# Iterate over each GenBank entry
for i, gb_file in enumerate(multi_file):
    _, length, _, _, _, _ = gb.get_locus(gb_file)
    seq_lengths[i] = length
    annotation = gb.get_annotation(gb_file)
    # Find the features that represent a sigma factor domain
    for feature in annotation:
        if feature.key == "Region" and "note" in feature.qual \
           and "Sigma-70 factor domain" in feature.qual["note"]:
            # Extract the domain number
            # and decrement for 0-based indexing
            #
            # e.g. 'Sigma-70 factor domain-2.' => 1
            #                              ^
Exemple #3
0
def test_multi_file():
    """
    Read a GenPept file containing multiple entries and check that
    iterating over the :class:`MultiFile` yields the entries in the
    expected order, identified by their accessions.
    """
    path = join(data_dir, "multifile.gp")
    expected_accessions = ["1L2Y_A", "3O5R_A", "5UGO_A"]

    genpept_entries = gb.MultiFile(file_type="gp")
    genpept_entries.read(path)

    found_accessions = [
        entry.get_accession() for entry in genpept_entries
    ]
    assert found_accessions == expected_accessions