def test_get_sequence():
    """
    Read the sequences from the ``5ugo`` and ``4gxy`` CIF files via
    ``pdbx.get_sequence()`` and compare the first five of them, in
    order, against their expected string representation and exact
    sequence class.
    """
    cif_5ugo = pdbx.PDBxFile.read(join(data_dir("structure"), "5ugo.cif"))
    sequences = pdbx.get_sequence(cif_5ugo)
    cif_4gxy = pdbx.PDBxFile.read(join(data_dir("structure"), "4gxy.cif"))
    sequences += pdbx.get_sequence(cif_4gxy)

    # One (expected string, expected class) entry per checked sequence;
    # the position in this list is the index into 'sequences'
    expected = [
        ("CCGACGGCGCATCAGC", seq.NucleotideSequence),
        ("GCTGATGCGCC", seq.NucleotideSequence),
        ("GTCGG", seq.NucleotideSequence),
        (
            "MSKRKAPQETLNGGITDMLTELANFEKNVSQAIHKYN"
            "AYRKAASVIAKYPHKIKSGAEAKKLPGVGTKIAEKIDEFLATGKLRKLEKIRQD"
            "DTSSSINFLTRVSGIGPSAARKFVDEGIKTLEDLRKNEDKLNHHQRIGLKYFGD"
            "FEKRIPREEMLQMQDIVLNEVKKVDSEYIATVCGSFRRGAESSGDMDVLLTHPS"
            "FTSESTKQPKLLHQVVEQLQKVHFITDTLSKGETKFMGVCQLPSKNDEKEYPHR"
            "RIDIRLIPKDQYYCGVLYFTGSDIFNKNMRAHALEKGFTINEYTIRPLGVTGVA"
            "GEPLPVDSEKDIFDYIQWKYREPKDRSE",
            seq.ProteinSequence,
        ),
        (
            "GGCGGCAGGTGCTCCCGACCCTGCGGTCGGGAGTTAA"
            "AAGGGAAGCCGGTGCAAGTCCGGCACGGTCCCGCCACTGTGACGGGGAGTCGCC"
            "CCTCGGGATGTGCCACTGGCCCGAAGGCCGGGAAGGCGGAGGGGCGGCGAGGAT"
            "CCGGAGTCAGGAAACCTGCCTGCCGTC",
            seq.NucleotideSequence,
        ),
    ]

    for index, (expected_string, expected_class) in enumerate(expected):
        # Index explicitly, so that a too short result raises
        # instead of being skipped silently
        actual = sequences[index]
        assert str(actual) == expected_string
        # The exact class is required, subclasses do not count
        assert type(actual) is expected_class
def test_search_sequence():
    """
    Run an RCSB sequence search with the first sequence of the ``1l2y``
    CIF file as query and check that each returned entry fulfills the
    requested minimum sequence identity.

    For every returned ID the FASTA file is fetched and its sequence is
    aligned to the query sequence. The identity of that alignment
    (``mode="shortest"``) must not be below the cutoff passed to the
    search.
    """
    IDENTITY_CUTOFF = 0.9

    pdbx_file = pdbx.PDBxFile.read(join(data_dir("structure"), "1l2y.cif"))
    ref_sequence = pdbx.get_sequence(pdbx_file)[0]
    query = rcsb.SequenceQuery(
        ref_sequence, "protein", min_identity=IDENTITY_CUTOFF
    )
    test_ids = rcsb.search(query)
    # Without any hit the loop below would never run
    # and the test would pass without checking anything
    assert len(test_ids) > 0

    # The substitution matrix does not depend on the hit:
    # create it once instead of once per iteration
    matrix = align.SubstitutionMatrix.std_protein_matrix()
    for pdb_id in test_ids:
        fasta_file = fasta.FastaFile.read(rcsb.fetch(pdb_id, "fasta"))
        test_sequence = fasta.get_sequence(fasta_file)
        # Only the first of the returned alignments is evaluated
        alignment = align.align_optimal(
            ref_sequence, test_sequence, matrix, terminal_penalty=False
        )[0]
        identity = align.get_sequence_identity(alignment, mode="shortest")
        assert identity >= IDENTITY_CUTOFF
import matplotlib.colors as colors
import biotite
import biotite.structure as struc
import biotite.structure.io.pdbx as pdbx
import biotite.sequence as seq
import biotite.sequence.io.fasta as fasta
import biotite.sequence.align as align
import biotite.sequence.graphics as graphics
import biotite.application.blast as blast
import biotite.application.clustalo as clustalo
import biotite.database.rcsb as rcsb
import biotite.database.entrez as entrez

# Get structure and sequence:
# Fetch the entry '1GUU' in mmCIF format from the RCSB
# and take the first sequence found in the file as query
pdbx_file = pdbx.PDBxFile.read(rcsb.fetch("1GUU", "mmcif"))
sequence = pdbx.get_sequence(pdbx_file)[0]
# 'use_author_fields' is set to false,
# to ensure that values in the 'res_id' annotation point to the sequence
structure = pdbx.get_structure(pdbx_file, model=1, use_author_fields=False)
# Keep only the atoms selected by the amino acid filter
structure = structure[struc.filter_amino_acids(structure)]

# Identity threshold for a sequence to be counted as homologous sequence
# NOTE(review): the name is misspelled ('THESHOLD'); it is left as it is,
# because code outside of this view may reference it
IDENTITY_THESHOLD = 0.4
# Find homologous proteins in SwissProt via BLAST
app = blast.BlastWebApp("blastp", sequence, database="swissprot")
app.start()
# Presumably blocks until the BLAST job has finished - TODO confirm
app.join()
alignments = app.get_alignments()
# The hit lists start with an entry for the query itself
hit_seqs = [sequence]
hit_ids = ["Query"]
# Presumably the 1-based position where each hit starts;
# verify against the code that extends these lists
hit_starts = [1]