def run_blastp(query, subject, outfile, evalue=1e-4):
    """Run blastp of *query* against *subject* and count the result lines.

    :param query: file that contains query sequences
    :param subject: file that contains the sequences used for blasting
    :param outfile: path of the file that receives the blast results
        (written with ``outfmt=6``)
    :param evalue: threshold value for blast results, default is 1e-4
    :return: number of lines in ``outfile``; 0 when the file is empty
    :raises Exception: if ``outfile`` does not exist after blastp has run
    """
    cmd = blastp(query=query, subject=subject, evalue=evalue, outfmt=6,
                 out=outfile)
    # The captured stdout/stderr are not used; results go to ``outfile``.
    cmd()
    if not os.path.isfile(outfile):
        raise Exception('No BLAST output')
    if os.path.getsize(outfile) == 0:
        return 0
    # Read through a context manager so the handle is closed immediately
    # instead of being left for the garbage collector.
    with open(outfile) as handle:
        return len(handle.read().splitlines())
def blast_gene(seq, database):
    """Blast *seq* against *database* and summarise the reported hits.

    The sequence is written to ``temp.fasta`` and searched with blastp; the
    XML report is written to ``temp.xml`` and parsed with ``NCBIXML``.
    Neither temporary file is removed.

    :param seq: record(s) accepted by ``SeqIO.write``
    :param database: value passed to blastp as ``db``
    :return: flat list with three strings per hit description:
        species label, description text and e-value
    """
    with open('temp.fasta', 'w') as tempfasta:
        SeqIO.write(seq, tempfasta, 'fasta')
    run = blastp(query='temp.fasta', db=database, num_descriptions=5,
                 num_threads=6, outfmt=5, out='temp.xml')
    run()
    # Parse inside a context manager so the report handle is always closed.
    with open('temp.xml') as result_handle:
        result = NCBIXML.read(result_handle)

    # (marker tested in the title, text the description starts at, label).
    # Order matters: the first marker found in the title wins.
    known_titles = (
        ('Tfl|', 'Tfl', 'T. flavus'),
        ('Pfu|', 'Pfu', 'P. funiculosum'),
        ('PMAA_', 'PMAA', 'T. marneffei'),
    )

    rets = []
    for hit in result.descriptions:
        ttl = hit.title
        for marker, prefix, label in known_titles:
            if marker in ttl:
                species = label
                d = ttl[ttl.find(prefix):]
                break
        else:
            # Generic title: species is the text in square brackets, the
            # description is what sits between '| ' and the bracket.
            species = ttl[ttl.find('[') + 1:ttl.find(']')]
            d = ttl[ttl.find('| ') + 1:ttl.find('[') - 1]
        rets.extend((species, d, str(hit.e)))
    return rets
def Blast(type, protein_sequence, start, end, genomic_sequence):
    """Blast an ORF (from its first 'M') and describe the best hit.

    The protein sequence is trimmed to start at its first ``M``, terminated
    with ``*``, written to ``temp.ORF`` and searched with blastp against
    ``./Blast/DB.blast.txt``. The XML report is written to
    ``type + ".BLAST"`` and the first ``<Hit_def>`` element decides the
    result.

    :param type: label for this ORF; also the prefix of the report file
    :param protein_sequence: translated ORF as a string
    :param start: offset of the ORF in ``genomic_sequence``
    :param end: end offset of the ORF in ``genomic_sequence``
    :param genomic_sequence: nucleotide sequence the ORF was taken from
    :return: ``[]`` when there is no ``M`` or no hit; otherwise
        ``[type, hit_def, "start..end", nucleotides, protein]``. For an
        ``L1`` hit the last three entries describe the (possibly
        splice-trimmed) L1 region and its translation.
    """
    result = []
    M = re.search('M', protein_sequence)
    if not M:
        return result

    query = protein_sequence[M.start():] + "*"
    with open("temp.ORF", "w") as temp:
        temp.write('>blasting\n')
        temp.write(query + '\n')

    # The query value carries its own quotes because the command line is
    # executed through the shell.
    cline = blastp(query="'temp.ORF'", db="./Blast/DB.blast.txt",
                   evalue=0.01, outfmt=5, out=type + ".BLAST")
    os.system(str(cline))

    # Close the report handle deterministically (it used to be leaked).
    with open(type + ".BLAST") as blast_out:
        string = str(blast_out.read())
    DEF = re.search("<Hit_def>((.*))</Hit_def>", string)
    if not DEF:
        return result

    # Three nucleotides per skipped residue.
    real_start = start + 3 * M.start()
    result.append(type)

    if DEF.group(1) != 'L1':
        result.append(DEF.group(1))
        result.append(str(real_start + 1) + ".." + str(end))
        result.append(genomic_sequence[int(real_start):int(end)])
        result.append(query)
        return result

    result.append('L1')
    L1_pre = genomic_sequence[real_start:int(end)]
    spliced = re.search('(C|T)(C|T)(A|C|G|T)(C|T)AG(A)TG', str(L1_pre))

    # Default: report the untrimmed region.
    L1_post = L1_pre
    location = str(real_start + 1) + ".." + str(end)
    if spliced:
        start_L1 = int(spliced.start()) + 6
        if start_L1 % 3 == 0:
            if start_L1 > 600:
                # NOTE(review): the sequence stays untrimmed here but the
                # location starts at start_L1 (relative to L1_pre, not the
                # genome) -- kept as in the original; confirm intent.
                location = str(start_L1) + ".." + str(end)
            else:
                L1_post = L1_pre[start_L1:]
                location = (str(int(real_start + 1) + int(start_L1)) +
                            ".." + str(end))
    result.append(location)
    result.append(str(L1_post))
    result.append(Seq(str(L1_post)).translate())
    return result
def do_blast(self, subject):
    """Blast *subject* against ``self.standard`` unless its report exists.

    The XML report (``outfmt=5``) is written to ``subject + '.xml'``. When
    that file is already present the search is not run. Returns ``None``.
    """
    print('Running BLAST...')
    report_path = subject + '.xml'
    search = blastp()
    search.db = self.standard
    search.query = subject
    search.outfmt = 5
    search.out = report_path
    search.comp_based_stats = "0"
    if not os.path.exists(search.out):
        search()
def secondary_blast(self):
    """Blast ``self.substract`` against a database built from orthologs.faa.

    Nothing is done when ``orthologs.faa`` is missing or when
    ``secondary.xml`` already exists. Otherwise ``makeblastdb`` is run on
    ``orthologs.faa`` and the XML report is written to ``secondary.xml``
    using ``self.expect_diff`` as the e-value threshold.
    """
    have_orthologs = os.path.exists('orthologs.faa')
    already_done = os.path.exists('secondary.xml')
    if already_done or not have_orthologs:
        return

    os.system('makeblastdb -dbtype prot -in orthologs.faa')

    search = blastp()
    search.db = 'orthologs.faa'
    search.query = self.substract
    search.outfmt = 5
    search.out = 'secondary.xml'
    search.evalue = self.expect_diff
    search.comp_based_stats = "0"
    search()
def primary_blast(self):
    """Blast the first genome against ``self.db`` into ``primary.xml``.

    The query is ``self.genomes[0][1]`` and the e-value threshold is
    ``self.expect``. The search is skipped, silently, when ``primary.xml``
    already exists.
    """
    if not os.path.exists('primary.xml'):
        print('Running BLAST...')
        search = blastp()
        search.db = self.db
        search.query = self.genomes[0][1]
        search.outfmt = 5
        search.out = 'primary.xml'
        search.evalue = self.expect
        search.comp_based_stats = "0"
        search()
def circle_blast(self, loc):
    """Blast every genome in ``self.genomes`` against ``loc + '/circle'``.

    For each genome ``self.build_genome(loc, subject)`` is called first,
    then the genome is searched and its XML report (``outfmt=5``) written to
    ``loc/<subject>.xml``. Genomes whose report already exists are skipped.

    :param loc: directory holding the ``circle`` database and the reports
    """
    print('Running Circualr BLAST...')
    blast = blastp()
    blast.db = loc + '/circle'
    blast.outfmt = 5
    blast.comp_based_stats = "0"
    for subject in self.genomes:
        self.build_genome(loc, subject)
        blast.query = subject
        blast.out = loc + '/' + subject + '.xml'
        if os.path.exists(blast.out):
            # Skip only this genome. A ``return`` here abandoned every
            # remaining genome as soon as one report was found, so an
            # interrupted run could never be completed.
            continue
        blast()
def match_fasta_position(query, subject, num=None):
    """Return the residue-by-residue position matching of two fasta files.

    blastp is run with *query* against *subject* (one HSP per hit); the
    first HSP of every hit is expanded into one row per alignment column.
    The temporary report ``<query>_<subject>.xml`` is always removed.

    :param query: path of the query fasta file
    :param subject: path of the subject fasta file
    :param num: optional iterable of query positions (converted with
        ``int``) to select
    :return: when ``num`` is None, a DataFrame indexed by ``query`` with
        columns ``sequence``, ``query_res``, ``query_num``, ``hit_res`` and
        ``hit_num`` (gap columns carry NaN as position); otherwise a list
        of row dictionaries for the rows whose ``query_num`` is in ``num``
    """
    xml_path = query + '_' + subject + '.xml'
    blastp(query=query, subject=subject, out=xml_path, outfmt=5,
           max_hsps=1)()
    try:
        xml = SearchIO.read(xml_path, "blast-xml")
        rows = []
        for hit in xml:
            x = hit[0]
            # 1-based positions covered by the HSP on each sequence.
            hit_num = list(np.arange(x.hit_start + 1, x.hit_end + 1))
            q_num = list(np.arange(x.query_start + 1, x.query_end + 1))
            # Pad with NaN at every gap column so that the position lists
            # line up with the aligned (gapped) sequences.
            for i, value in enumerate(x.hit):
                if value == '-':
                    hit_num.insert(i, np.nan)
            for i, value in enumerate(x.query):
                if value == '-':
                    q_num.insert(i, np.nan)
            width = len(x.query)
            # The description must be wrapped in a list: repeating the bare
            # string made zip() hand out one *character* per row.
            rows.extend(
                zip(width * [query], width * [hit.description], x.query,
                    q_num, x.hit, hit_num))
    finally:
        # Do not leave the temporary report behind if parsing fails.
        os.remove(xml_path)

    df = pd.DataFrame(rows, columns=[
        'query', 'sequence', 'query_res', 'query_num', 'hit_res', 'hit_num'
    ])
    df.set_index(['query'], inplace=True)
    if num is None:
        return df
    wanted = list(map(int, num))
    return df[df['query_num'].isin(wanted)].to_dict('records')
def run_blastp(query, subject, outfile, evalue=1e-4):
    """Run blastp of *query* against *subject* and count the result lines.

    The command line is printed before it is executed.

    :param query: file that contains query sequences
    :param subject: file that contains the sequences used for blasting
    :param outfile: path of the file that receives the blast results
        (written with ``outfmt=6``)
    :param evalue: threshold value for blast results, default is 1e-4
    :return: number of lines in ``outfile``; 0 when the file is empty
    :raises Exception: if ``outfile`` does not exist after blastp has run
    """
    cmd = blastp(query=query, subject=subject, evalue=evalue, outfmt=6,
                 out=outfile)
    print(cmd)
    # The captured stdout/stderr are not used; results go to ``outfile``.
    cmd()
    if not os.path.isfile(outfile):
        raise Exception('No BLAST output')
    if os.path.getsize(outfile) == 0:
        return 0
    # Use a context manager so the result file is closed right away.
    with open(outfile) as handle:
        return len(handle.read().splitlines())
def blast_gene(seq, database):
    """Blast *seq* against *database* and return its single best hit.

    The sequence is written to ``temp.fasta`` and searched with blastp
    (``max_target_seqs=1``); the XML report goes to ``temp.xml``. Neither
    temporary file is removed.

    :param seq: record(s) accepted by ``SeqIO.write``
    :param database: value passed to blastp as ``db``
    :return: ``[title_from_'GQ', evalue_string]`` for the first hit
        description, or ``['none']`` when nothing was found
    """
    with open('temp.fasta', 'w') as tempfasta:
        SeqIO.write(seq, tempfasta, 'fasta')
    run = blastp(query='temp.fasta', db=database, max_target_seqs=1,
                 num_threads=6, outfmt=5, out='temp.xml')
    run()
    # Parse inside a context manager so the report handle is always closed.
    with open('temp.xml') as result_handle:
        result = NCBIXML.read(result_handle)

    if not result.descriptions:
        return ['none']
    best = result.descriptions[0]
    title = best.title
    # Keep the title from the first occurrence of 'GQ' onwards.
    return [title[title.find('GQ'):], str(best.e)]
def blast_gene(ids, eval, database, of):
    """Blast one record of *database* against it and export the hits.

    The record whose id equals *ids* is written to ``temp.fasta`` and
    searched with blastp against *database*. The first space-separated word
    of every hit title is printed, and all records of *database* whose id is
    one of those words are written to *of* in fasta format. Both temporary
    files (``temp.fasta`` and ``temp.xml``) are removed.

    :param ids: id of the record to use as query
    :param eval: e-value threshold passed to blastp
    :param database: path of the fasta file, also passed to blastp as ``db``
    :param of: output target passed to ``SeqIO.write``
    """
    with open(database) as handle:
        for seq in SeqIO.parse(handle, "fasta"):
            if seq.id == ids:
                SeqIO.write(seq, "temp.fasta", "fasta")

    run = blastp(query='temp.fasta', db=database, num_threads=6, outfmt=5,
                 word_size=4, evalue=eval, out='temp.xml')
    run()
    # Close the report before deleting it; removing a file that is still
    # open fails on some platforms.
    with open('temp.xml') as result_handle:
        result = NCBIXML.read(result_handle)
    os.remove('temp.fasta')
    os.remove('temp.xml')

    wanted = set()
    for hit in result.descriptions:
        species = hit.title.split(' ')[0]
        print(species)
        wanted.add(species)

    with open(database) as handle:
        seqs = [seq for seq in SeqIO.parse(handle, "fasta")
                if seq.id in wanted]
    SeqIO.write(seqs, of, "fasta")
def Blast(type, protein_sequence, start, end, genomic_sequence):
    """Identify an ORF by blasting it from its first methionine onwards.

    The part of ``protein_sequence`` starting at the first ``M`` (plus a
    trailing ``*``) is written to ``temp.ORF`` and searched with blastp
    against ``./Blast/DB.blast.txt``; the XML report is stored in
    ``type + ".BLAST"``. The first ``<Hit_def>`` in the report names the hit.

    :param type: label for this ORF; also the prefix of the report file
    :param protein_sequence: translated ORF as a string
    :param start: offset of the ORF in ``genomic_sequence``
    :param end: end offset of the ORF in ``genomic_sequence``
    :param genomic_sequence: nucleotide sequence the ORF was taken from
    :return: an empty list when the protein has no ``M`` or the report has
        no hit; otherwise ``[type, hit_def, "start..end", nucleotides,
        protein]``, where for an ``L1`` hit the last three entries describe
        the (possibly splice-trimmed) L1 region and its translation
    """
    result = []
    M = re.search('M', protein_sequence)
    if M:
        query = protein_sequence[M.start():] + "*"
        with open("temp.ORF", "w") as temp:
            temp.write('>blasting\n')
            temp.write(query + '\n')

        # The query is quoted because the command runs through the shell.
        cline = blastp(query="'temp.ORF'", db="./Blast/DB.blast.txt",
                       evalue=0.01, outfmt=5, out=type + ".BLAST")
        os.system(str(cline))

        # The report handle used to be left open; read and close it here.
        with open(type + ".BLAST") as blast_out:
            string = str(blast_out.read())

        DEF = re.search("<Hit_def>((.*))</Hit_def>", string)
        if DEF:
            hit_name = DEF.group(1)
            # Each residue skipped before the first M is three nucleotides.
            real_start = start + 3 * M.start()
            whole_region = str(real_start + 1) + ".." + str(end)
            result.append(type)
            result.append(hit_name)
            if hit_name == 'L1':
                L1_pre = genomic_sequence[real_start:int(end)]
                splice = '(C|T)(C|T)(A|C|G|T)(C|T)AG(A)TG'
                spliced = re.search(splice, str(L1_pre))
                start_L1 = int(spliced.start()) + 6 if spliced else None
                in_frame = start_L1 is not None and start_L1 % 3 == 0
                if in_frame and start_L1 > 600:
                    # NOTE(review): untrimmed sequence but a location that
                    # starts at start_L1 -- preserved from the original;
                    # confirm this is intended.
                    L1_post = L1_pre
                    location = str(start_L1) + ".." + str(end)
                elif in_frame:
                    L1_post = L1_pre[start_L1:]
                    location = (str(int(real_start + 1) + int(start_L1)) +
                                ".." + str(end))
                else:
                    # No usable in-frame splice site: keep the whole region.
                    L1_post = L1_pre
                    location = whole_region
                result.append(location)
                result.append(str(L1_post))
                result.append(Seq(str(L1_post)).translate())
            else:
                result.append(whole_region)
                result.append(genomic_sequence[int(real_start):int(end)])
                result.append(query)
    return result
def get_bsr(work_path, job_name, reference_fasta, comparison_fastas,
            strain_names, force_redo=False, matrix='PAM30'):
    """Return a BSR_RecordSet containing BSR_Record objects with the results
    of a BSR analysis.

    Args:
        work_path (str): The path were generated files will be saved to.
        job_name (str): Prefix used for all generated file names, and stored
            in the returned BSR_Record
        reference_fasta (str): Path of the FastA file used as the reference
        comparison_fastas (iter[str]): List or tuple of paths to FastA files
            for which BSR will be generated. A single non-list value is
            treated as a one-element list.
        strain_names (iter[str]): Strain/species names of organisms in the
            analysis, reference first. Used when generating graphs and
            optionally to obtain data of that strain.
        force_redo (bool): If True the analysis will be performed from the
            start, overwriting any files in the workpath. Defaults to False.
        matrix (str) optional, 'PAM30' (default) or 'BLOSUM62': Specifies
            the scoring matrix used by BLAST. PAM30 is better for more
            similar sequences.

    Returns:
        BSR_RecordSet. See documentation for this object for more info

    Raises:
        AssertionError: if the number of comparison fastas is not one less
            than the number of strain names.

    A series of blastp are performed using the reference_fasta as the query
    against the comparison_fastas. Files archiving the process are created
    and saved to disc using pickle. If the process is interupted then it can
    be continued by running the same command.
    """
    # Normalise first. A tuple (documented as accepted) used to be wrapped
    # whole into a one-element list, which then crashed in os.path.split.
    if isinstance(comparison_fastas, tuple):
        comparison_fastas = list(comparison_fastas)
    elif type(comparison_fastas) is not list:
        comparison_fastas = [comparison_fastas]
    # Checked after normalisation so a single path is counted as one file
    # rather than by its number of characters.
    assert len(comparison_fastas) == len(strain_names) - 1, \
        'expected one strain name per fasta file (reference included)'
    #matrix = MATRICES[matrix]
    created_files = []

    # Create (or load) the initial BSR_records from the prime fasta
    fprefix = work_path + job_name
    init_bsr_records_fn = fprefix + ' - ref BSR records.pickle'
    bsr_records = _do_next_step_pickle(
        init_bsr_records_fn,
        get_initial_BSR_Records_with_ref_scores,
        (reference_fasta, strain_names[0], matrix),  # *args as a tuple
        force_redo=force_redo)
    created_files.append(init_bsr_records_fn)

    # Go through each proteome to be compared to the reference and get BSR.
    total = len(comparison_fastas)
    for comparison_num, subject_fasta in enumerate(comparison_fastas, 1):
        # Get a blast of every reference sequences against the comparison
        # proteome
        blast_results_path = (fprefix + ' - BLAST ref vs ' +
                              os.path.split(subject_fasta)[1] + '.xml')
        created_files.append(blast_results_path)
        if not os.path.isfile(blast_results_path) or force_redo:
            blast_query = blastp(query=reference_fasta,
                                 subject=subject_fasta,
                                 remote=False,
                                 outfmt=5,
                                 max_hsps=1,
                                 matrix=matrix,
                                 comp_based_stats="0")
            # Run the blast; the XML arrives on stdout (first element).
            blast_results = blast_query()
            with open(blast_results_path, 'w') as f:
                f.write(blast_results[0])
            print('BLAST results file saved:', blast_results_path)

        bsr_records_path = fprefix + ' - {} of {} BSR calculated.pickle'.format(
            comparison_num, total)
        created_files.append(bsr_records_path)
        # strain_names[0] is the reference, so comparison k (1-based) uses
        # strain_names[k].
        bsr_records = _do_next_step_pickle(
            bsr_records_path,
            get_bsr_for_strain,
            (bsr_records, blast_results_path, strain_names[comparison_num]),
            force_redo=force_redo)

    # NOTE(review): this path is recorded in created_files but the file
    # actually written below is '... - Final record set.pickle' -- confirm
    # which name is intended.
    final_results_path = fprefix + ' - complete BSR results.pickle'
    created_files.append(final_results_path)
    final_results = BSR_RecordSet(job_name, bsr_records, reference_fasta,
                                  comparison_fastas, strain_names,
                                  created_files, matrix)
    fn = work_path + job_name + ' - Final record set.pickle'
    with open(fn, 'wb') as f:
        pickle.dump(final_results, f)
    print('File saved:', fn)
    return final_results