def detect_mutations(input: str, db: str, out: str):
    """Run ``blastx`` of *input* against *db* and write the mismatches to *out*.

    One line is written per BLAST query, in the form
    ``<query>;<mutation>,<mutation>,...``. Each mutation is
    ``<label> <subject residue><subject position><query residue>``, where
    ``<label>`` is the part of the hit title before the first ``|``. Only the
    first HSP of every alignment is inspected, and query residues ``X``,
    ``-`` and ``*`` are never reported.

    Args:
        input: Path of the query file handed to ``blastx -query``.
        db: Path of the subject file handed to ``blastx -subject``.
        out: Path of the text report to create (overwritten if present).

    Raises:
        subprocess.CalledProcessError: If ``blastx`` exits with a non-zero
            status.
    """
    # Local imports: the block must not rely on the file already importing
    # these modules.
    import subprocess
    import tempfile

    ignored_query_residues = ('X', '-', '*')

    # A private temporary file replaces the fixed "tmp.blast.xml", so two
    # concurrent runs in the same directory cannot clobber each other.
    fd, xml_path = tempfile.mkstemp(suffix=".blast.xml")
    try:
        with os.fdopen(fd, "w") as xml_out:
            # An argument list (no shell) keeps paths with spaces or shell
            # metacharacters intact; "~" is expanded explicitly because the
            # shell used to do it.
            subprocess.run(
                [
                    "blastx",
                    "-query", os.path.expanduser(input),
                    "-subject", os.path.expanduser(db),
                    "-subject_besthit",
                    "-max_hsps", "1",
                    "-outfmt", "5",
                ],
                stdout=xml_out,
                check=True,
            )

        with open(xml_path, "r") as f, open(out, "w") as fh:
            for p in parse(f):
                mutations = []
                for alignment in p.alignments:
                    hsp = alignment.hsps[0]
                    query = hsp.query
                    subject = hsp.sbjct
                    if len(query) != len(subject):
                        continue
                    label = alignment.title.split('|')[0]
                    # Subject coordinate of the current column. It advances
                    # only on real subject residues, so gaps in the aligned
                    # subject no longer shift later positions. A column whose
                    # subject side is a gap is reported at the coordinate of
                    # the next subject residue.
                    position = hsp.sbjct_start
                    for q_res, s_res in zip(query, subject):
                        if q_res not in ignored_query_residues and q_res != s_res:
                            mutations.append(f"{label} {s_res}{position}{q_res}")
                        if s_res != '-':
                            position += 1
                fh.write(f"{p.query};")
                fh.write(','.join(mutations))
                fh.write("\n")
    finally:
        # Remove the intermediate XML even when blastx or parsing failed.
        os.unlink(xml_path)
def create_blast_dict():
    """Pickle identity/coverage figures for every HSP in the cached BLAST XML.

    Reads the BLAST XML at ``blast_cache`` and builds one dict per HSP with
    the keys ``"accession"``, ``"identity"`` and ``"coverage"``:
    ``identity`` is ``identities / align_length`` and ``coverage`` is
    ``align_length / query_length``, both as percentages rounded to two
    decimals. The resulting list is pickled to ``dict_cache``.

    Hits of all records in the XML are collected. Previously the list was
    reassigned for each record, so only the last record's hits were stored,
    and the name was unbound when the XML held no records.
    """
    #seq = acc2sequence(acc)
    with open(blast_cache, mode="r") as xml_handle:
        records = list(parse(xml_handle))

    homologs = []
    for record in records:
        homologs.extend(
            {
                "accession": alignment.accession,
                "identity": round((hsp.identities / hsp.align_length) * 100, 2),
                "coverage": round((hsp.align_length / record.query_length) * 100, 2),
            }
            for alignment in record.alignments
            for hsp in alignment.hsps
        )

    with open(dict_cache, mode="wb") as pickle_handle:
        pickle.dump(homologs, pickle_handle)
    print("cached the homologs dictionary")
def _score_codon(consensus_seq, position, mutability_score):
    """Accumulate substitution scores for the codon starting at *position*."""
    codon0 = consensus_seq[position:position + 3]
    try:
        aa0 = codon_table[codon0]
    except KeyError:
        # We do not calculate if there is an ambiguous nucleotide in the
        # consensus seq. We could have calculated for the other nucleotides
        # in the codon, however it requires more detailed calculation.
        return

    for pos_in_codon in range(3):
        index = position + pos_in_codon
        if mutability_score[index] != UNCONSERVED_SCORE:
            # This position was already processed (several proteins may align
            # here); there is no reason to process it again.
            continue
        mutability_score[index] = 0
        for putative_nuc, subs_score in SUBSTITUTION_MODEL[codon0[pos_in_codon]].items():
            codonX = replace(codon0, putative_nuc, pos_in_codon)
            aaX = codon_table[codonX]
            if aaX == "_":
                continue
            # Self conversion (aa0 == aaX) is deliberately included, as both
            # matrices include it in their calculations.
            try:
                aa_score = aa_score_method(aa0, aaX)
            except Exception:
                # pam30 is a triangular matrix, so we check the reversed
                # pair. The scoring callable is opaque here, hence the broad
                # (but no longer bare) handler.
                aa_score = aa_score_method(aaX, aa0)
            mutability_score[index] += aa_score * subs_score


def calc_immutability(consensus_filename, protein_seqs_filename, CUTOFF):
    """Score every consensus nucleotide and mask the ones scoring >= CUTOFF.

    The proteins in *protein_seqs_filename* are aligned to the first record
    of *consensus_filename* with a local tblastn run (results are written to
    ``results.xml``). For the first HSP of the first hit of each protein,
    every aligned codon is scored nucleotide by nucleotide: each alternative
    nucleotide listed in ``SUBSTITUTION_MODEL`` contributes
    ``aa_score_method(original_aa, new_aa) * substitution_score``, skipping
    substitutions whose codon maps to ``"_"`` in ``codon_table``. Positions
    never covered keep ``UNCONSERVED_SCORE``.

    A text report (ruler, translation, sequence, shading by distance from
    the mean score, masked sequence and summary statistics) is printed.

    Args:
        consensus_filename: FASTA file; only its first record is used.
        protein_seqs_filename: Protein query file passed to tblastn.
        CUTOFF: Positions with a score below this value keep their
            nucleotide in the returned sequence; all others become ``"N"``.

    Returns:
        ``(immutable_seq, mutability_score)``: the masked sequence as a
        string and the per-nucleotide score list.

    Raises:
        AssertionError: If an aligned subject string contains a blank.
    """
    out = NcbitblastnCommandline(subject=consensus_filename,
                                 query=protein_seqs_filename,
                                 out="results.xml",
                                 outfmt=5)  # evalue=?
    out()  # NCBI tblastn is run locally to align proteins to the consensus sequence

    # str() instead of the private ``Seq._data``: the private attribute is
    # not guaranteed to be a str on every Biopython version.
    consensus_seq = str(next(SeqIO.parse(consensus_filename, "fasta")).seq)

    with open(out.out, "r") as blast_xml:
        # Queries without any hit contribute nothing and are skipped rather
        # than raising IndexError.
        protein_alignments = [
            record.alignments[0].hsps[0]
            for record in parse(blast_xml)
            if record.alignments
        ]

    mutability_score = [UNCONSERVED_SCORE] * len(consensus_seq)

    for alignment in protein_alignments:
        position = alignment.sbjct_start - 1
        for aa in alignment.sbjct:
            if aa == " ":
                # Explicit raise keeps the check alive under ``python -O``.
                raise AssertionError("empty aa in alignment subject")
            if aa == "-":
                # A gap in the translated subject consumes no nucleotides.
                continue
            if aa != "X":
                _score_codon(consensus_seq, position, mutability_score)
            # NOTE(review): the increment is applied to "X" residues as well;
            # confirm against the original indentation.
            position += 3

    print("\nconsevation scores complete:")
    scored = [x for x in mutability_score if x is not None]
    standart_dev = std(scored)
    average = mean(scored)
    seq_len = len(consensus_seq)

    # Ruler: the index of every 100th position, padded to 100 columns.
    print("".join(str(i).ljust(100) for i in range(0, seq_len, 100)))
    print("".join("|" if i % 100 == 0 else " " for i in range(seq_len)))

    # TODO: automatically find where protein starts, add all one by one.
    # The code below assumes everything has ORF +1. It would not work for
    # all cases.
    prot_parts = []
    for i in range(2, seq_len, 3):
        d = " "
        if mutability_score[i] is not None:
            try:
                d = " " + codon_table[consensus_seq[i:i + 3]] + " "
            except KeyError:
                pass
        prot_parts.append(d)
    print(" " + "".join(prot_parts))

    print(consensus_seq)

    # Shading: the darker the block, the further the score lies below the mean.
    shades = []
    for score in mutability_score:
        d = "_"
        if score is not None:
            d = " "
            if score < average + standart_dev * 2:
                d = "░"
            if score < average + standart_dev:
                d = "▒"
            if score < average - standart_dev:
                d = "▓"
            if score < average - standart_dev * 2:
                d = "█"
        shades.append(d)
    print("".join(shades))

    immutable_seq = "".join(
        nuc if score is not None and score < CUTOFF else "N"
        for nuc, score in zip(consensus_seq, mutability_score)
    )
    print(immutable_seq, end="")

    print("\n<<<REPORT>>>")
    print("settings - Score for unconserved nucleotides:", UNCONSERVED_SCORE,
          "Cutoff score:", CUTOFF)
    print("max score:", max(scored), "min score:", min(scored))
    print("mean score", average, "standart dev:", standart_dev)
    print("number of non-Ns in consensus seq:",
          len(consensus_seq) - consensus_seq.count("N"))
    print("number of non-Ns in immutable seq:",
          len(immutable_seq) - immutable_seq.count("N"))
    print("scores of coding regions summed:", sum(scored))
    return immutable_seq, mutability_score
"""Submit each sequence of ./apoe.fas to NCBI BLAST and print its alignments."""
from Bio import SeqIO
from Bio.Blast.NCBIWWW import qblast
from Bio.Blast.NCBIXML import parse

records = SeqIO.parse("./apoe.fas", "fasta")
PROGRAM = 'blastp'
DATABASE = 'nr'

for rec in records:
    # One remote BLAST request per FASTA record.
    xml_result = qblast(PROGRAM, DATABASE, rec.seq)
    results = parse(xml_result)
    # Walk the parsed records lazily and print every alignment in order.
    hits = (
        alignment
        for record in results
        for alignment in record.alignments
    )
    for hit in hits:
        print(hit)
def _main(self):
    """Fetch cluster segments around a gene, or search primers in them.

    When ``CO-clusters.gb`` does not exist, the gene ``TBCH5v1_1369`` is
    extracted from the hard-coded genome and BLASTed (a saved
    ``blast.results.xml`` is reused if present). HSPs with an alignment
    length of 700 or less are filtered out, a query with the configured
    start/end offsets is built per remaining alignment, and the fetched
    records are written to ``CO-clusters.gb``.

    Otherwise the segments file and three local genomes are loaded, forward
    primers are searched for ``transF`` and reverse primers for ``cooS``,
    the primers are printed and both alignments are saved with the primers
    added.

    Returns:
        ``0`` after fetching the segments, ``1`` when no forward or no
        reverse primers were found, ``None`` after saving the alignments.

    Raises:
        RuntimeError: If the genome cannot be loaded, the gene is not found
            or BLAST returns no results.
    """
    email = '*****@*****.**'
    genome_dir = '/home/allis/Dropbox/Science/Микра/Thermococcus/sequence/GenBank/Thermococcales/Thermococcus/'
    genome = 'Thermococcus_barophilus_Ch5.gb'
    gene = 'TBCH5v1_1369'  #cooS
    database = 'nr'
    segment = [3200, 12000]
    seq = SeqLoader.load_file(os.path.join(genome_dir, genome))
    if not seq:
        raise RuntimeError('No genome loaded')
    seq = seq[0]
    index = get_indexes_of_genes(seq, gene)
    if not index:
        raise RuntimeError('No gene found')
    feature = seq.features[index[0]]
    query = feature.extract(seq)
    segments_file = 'CO-clusters.gb'
    #get cluster variants if needed
    if not os.path.isfile(segments_file):
        blast_file = 'blast.results.xml'
        if os.path.isfile(blast_file):
            # Close the cached XML once it is parsed instead of leaking the
            # handle.
            with open(blast_file) as blast_handle:
                blast = list(parse(blast_handle))
        else:
            # Save to the same path that the cache check above looks for.
            blast = BlastCLI.blast_seq(query, database, 100,
                                       remote=True, task='blastn',
                                       parse_results=True,
                                       save_results_to=blast_file)
        if not blast:
            raise RuntimeError('Blast returned no results')
        flt = BlastFilter(lambda hsp, r: hsp.align_length > 700, filter_hsps=True)
        flt(blast)
        queries = []
        for ali in BlastCLI.iter_alignments(blast):
            q = BlastCLI.Query(ali, 'hsp',
                               start_offset=segment[0],
                               end_offset=segment[1])
            if q:
                queries.append(q)
                print(queries[-1])
        segments = BlastWWW.fetch_queries(email, queries)
        safe_write(segments, segments_file)
        for r in segments:
            print('[%s] %s: %dbp' % (r.id, pretty_rec_name(r), len(r)))
        # NOTE(review): stops after fetching, so the primer search needs a
        # second run; confirm this return belongs inside this branch.
        return 0
    #find primers in alignments of the selected features
    local_files = [
        os.path.join(genome_dir, f)
        for f in ('Thermococcus_barophilus_DT4-complete-genome.gb',
                  'Thermococcus_ST-423.gb',
                  'Thermococcus_CH1-complete.gb')
    ]
    loader = SeqLoader(self.abort_event)
    segments = loader.load_files([segments_file] + local_files)
    fprimers, transF_ali = find_primers(
        segments, 'transF',
        dict(plen=(20, 30), max_mismatches=5,
             min_first_matches=3, AT_first=True))
    rprimers, cooS_ali = find_primers(
        segments, 'cooS',
        dict(plen=(20, 30), max_mismatches=4,
             min_first_matches=3, AT_first=True),
        reverse=True)
    if not fprimers:
        print('\nNo forward primers found')
        return 1
    if not rprimers:
        print('\nNo reverse primers found')
        return 1
    print('\nForward primers:')
    for p in fprimers:
        print('%s: %s' % (p.id, p))
    print('\nReverse primers:')
    for p in rprimers:
        print('%s: %s' % (p.id, p))
    print()
    #add primers to alignments and save them
    transF_ali = PrimerFinder.add_primers_to_alignment(fprimers, transF_ali)
    cooS_ali = PrimerFinder.add_primers_to_alignment(rprimers, cooS_ali,
                                                     reverse=True)
    AlignmentUtils.save(transF_ali, 'transF.aln')
    AlignmentUtils.save(cooS_ali, 'cooS.aln')