def _run_all(out_dir, query, blastdb, num_iterations, num_threads, template,
             flann_x, flann_y, flann_index, num_neighbors, score_out_name,
             score_matrix, open_penalty, extend_penalty, alignment_out_name):
    """Build PSSMs for query and template, then score and align them.

    PSI-BLAST is run once for ``query`` and once for ``template``; each run
    writes ``<stem>.mtx`` into ``out_dir``.  The two matrices are handed to
    ``machina.predict.predict_scores`` and afterwards to
    ``machina.generate_alignment.alignment_local_and_save``.

    The gap penalties are negated before they are passed to the aligner.
    """
    work_dir = Path(out_dir).expanduser().absolute()
    work_dir.mkdir(exist_ok=True, parents=True)

    # Query first, template second -- same order as the downstream arguments.
    matrices = []
    for fasta in (query, template):
        fasta_path = Path(fasta).expanduser().absolute()
        matrix_path = work_dir / (fasta_path.stem + '.mtx')
        run_psiblast = NcbipsiblastCommandline(
            query=fasta_path,
            db=blastdb,
            num_iterations=num_iterations,
            num_threads=num_threads,
            out_ascii_pssm=matrix_path,
            save_pssm_after_last_round=True)
        run_psiblast()
        matrices.append(matrix_path)
    query_mtx, template_mtx = matrices

    machina.predict.predict_scores(
        query_mtx,
        template_mtx,
        flann_x=Path(flann_x),
        flann_y=Path(flann_y),
        flann_index=Path(flann_index),
        num_neighbors=num_neighbors,
        out_dir=work_dir,
        out_name=Path(score_out_name))

    machina.generate_alignment.alignment_local_and_save(
        work_dir / score_matrix,
        query_mtx,
        template_mtx,
        -open_penalty,
        -extend_penalty,
        work_dir,
        Path(alignment_out_name))
def generate_pairwise_alignment(self, query_id: str, target_id: str, out_dir: str, pssm_dir: str):
    """Align one query against one target with PSI-BLAST or DELTA-BLAST.

    The result is written as BLAST XML to
    ``{out_dir}/{query_id}/{target_id}.xml``.  Which program runs depends on
    ``self.algo`` (``'psiblast'`` or ``'deltablast'``); any other value runs
    nothing.

    Args:
        query_id: Identifier of the query; characters 2-3 select the
            sub-directory of ``pssm_dir``.
        target_id: Identifier of the subject, resolved the same way.
        out_dir: Root directory for the XML output.
        pssm_dir: Root directory holding ``.mtx`` inputs and cached ``.pssm``
            checkpoints.
    """
    Path(f'{out_dir}/{query_id}').mkdir(parents=True, exist_ok=True)
    out_xml = f'{out_dir}/{query_id}/{target_id}.xml'
    query_pssm = f'{pssm_dir}/{query_id[2:4]}/{query_id}.pssm'
    scratch_files = (Path('query.fasta'), Path('subject.fasta'))
    try:
        # `_pp` turns the .mtx file into the sequence used for the record.
        SeqIO.write(
            SeqRecord(_pp(f'{pssm_dir}/{query_id[2:4]}/{query_id}.mtx'),
                      id=query_id), 'query.fasta', 'fasta')
        SeqIO.write(
            SeqRecord(_pp(f'{pssm_dir}/{target_id[2:4]}/{target_id}.mtx'),
                      id=target_id), 'subject.fasta', 'fasta')
        if self.algo == 'psiblast':
            # Build the checkpoint PSSM only once per query; it is reused
            # for every target.
            if not Path(query_pssm).exists():
                NcbipsiblastCommandline(
                    db=f'{self.blast_db_dir}/uniref90',
                    num_iterations=3,
                    out_pssm=query_pssm,
                    query='query.fasta',
                    save_pssm_after_last_round=True,
                    num_threads=os.cpu_count())()
            NcbipsiblastCommandline(
                in_pssm=query_pssm,
                evalue=99999,
                subject='subject.fasta',
                outfmt=5,
                out=out_xml)()
        elif self.algo == 'deltablast':
            NcbideltablastCommandline(
                subject='subject.fasta',
                rpsdb=f'{self.blast_db_dir}/cdd_delta',
                evalue=99999,
                outfmt=5,
                out=out_xml,
                query='query.fasta')()
    finally:
        # Always remove the scratch FASTA files, even when BLAST fails, so a
        # failed run does not leave stale input in the working directory.
        for scratch in scratch_files:
            if scratch.exists():
                scratch.unlink()
def create_profile(query, template, blastdb, num_iterations, cmd, out_dir):
    """Write ASCII PSSMs for ``query`` and ``template`` into ``out_dir``.

    Each input FASTA is searched against ``blastdb`` with PSI-BLAST and the
    matrix of the last round is stored as ``<input stem>.mtx``.  ``cmd`` is
    accepted but not used by this function.
    """
    target_dir = Path(out_dir).expanduser().absolute()
    target_dir.mkdir(exist_ok=True, parents=True)

    for source in (query, template):
        fasta_path = Path(source).expanduser().absolute()
        run_psiblast = NcbipsiblastCommandline(
            query=fasta_path,
            db=blastdb,
            num_iterations=num_iterations,
            out_ascii_pssm=target_dir / (fasta_path.stem + '.mtx'),
            save_pssm_after_last_round=True)
        run_psiblast()
def do_psiblast(dirpath, rec):
    """Run a PSI-BLAST query on the given sequence.

    The record is saved as ``<dirpath>/<rec.id>.fasta`` and searched against
    the ``cpdb2_db`` database; hits are written as BLAST XML to
    ``<dirpath>/<rec.id>_blast.xml``.

    Returns:
        1 on success, -1 if building or running the PSI-BLAST command failed.
    """
    query_basepath = str(Path(dirpath, rec.id))

    # save the query to a fasta file
    query = query_basepath + ".fasta"
    SeqIO.write(rec, query, "fasta")

    try:
        psib_cline = NcbipsiblastCommandline(
            query=query,
            db="cpdb2_db",
            evalue=0.001,
            outfmt=5,
            out=query_basepath + "_blast.xml",
            num_threads=6,
            num_iterations=3)
        psib_cline()
    except Exception:
        # Deliberately not a bare `except:` so Ctrl-C / SystemExit still
        # propagate instead of being reported as a failed record.
        print("Failed to run PSIBLAST on record %s" % rec.id)
        return -1
    return 1
def process_input_user(seq_file, dir):
    """Generate one ASCII PSSM per sequence of ``seq_file`` inside ``dir``.

    Record number ``i`` (0-based) is written to ``<dir>/<i>_pssm.txt``;
    records whose PSSM file already exists are skipped.  A scratch query file
    ``<dir>/tempseq.fasta`` is rewritten for every PSI-BLAST run.

    Args:
        seq_file: FASTA file with the input sequences.
        dir: Output directory; created if missing.  Paths are joined with
            ``os.path.join``, so a trailing separator is optional.
    """
    os.makedirs(dir, exist_ok=True)
    inputfile = os.path.join(dir, 'tempseq.fasta')

    for index, seq_record in enumerate(SeqIO.parse(seq_file, "fasta")):
        print("in loop, processing" + str(index + 1) + "\n")
        pssmfile = os.path.join(dir, str(index) + "_pssm.txt")
        if os.path.exists(pssmfile):
            continue

        # SeqIO.write truncates the scratch file, so no explicit removal is
        # needed before reusing it.
        SeqIO.write(seq_record, inputfile, 'fasta')
        try:
            psiblast_cline = NcbipsiblastCommandline(
                query=inputfile,
                db='./db/swissprot/swissprot',
                num_iterations=3,
                evalue=0.001,
                out_ascii_pssm=pssmfile,
                num_threads=4)
            psiblast_cline()
        except Exception:
            # Best effort: report the record and carry on with the next one.
            # The id is used because a SeqRecord cannot be concatenated to a
            # str (that raised TypeError from inside the old handler).
            print("invalid protein: " + str(seq_record.id))
def run_psiblast(path, database, in_pssm, out_xml):
    """Search ``<path>/<database>.db`` starting from a saved PSSM.

    ``in_pssm`` and ``out_xml`` are file names relative to ``path``.  Up to
    10000 target sequences are reported, in BLAST XML (``outfmt=5``).
    """
    options = {
        'db': f'{path}/{database}.db',
        'in_pssm': f'{path}/{in_pssm}',
        'out': f'{path}/{out_xml}',
        'max_target_seqs': 10000,
        'outfmt': 5,
    }
    NcbipsiblastCommandline(**options)()
def psi_blaster(in_file, out_file):
    """Run a 3-iteration PSI-BLAST against the local ``refseq_protein.00`` db.

    The argument vector is built directly rather than by splitting a rendered
    command string on spaces, so input and output paths that contain spaces
    are passed to ``psiblast`` intact.

    Args:
        in_file: Path to the FASTA query.
        out_file: Path of the BLAST XML (``-outfmt 5``) report to write.

    Returns:
        subprocess.CompletedProcess: result of the finished ``psiblast`` run.

    Raises:
        subprocess.CalledProcessError: if ``psiblast`` exits with a non-zero
            status.
    """
    print("Initiating PSI-BLAST...")
    cmd = [
        "psiblast",
        "-query", str(in_file),
        "-db", "refseq_protein.00",
        "-num_iterations", "3",
        "-outfmt", "5",
        "-out", str(out_file),
    ]
    return subprocess.run(cmd, check=True)
def gen_pssm(fname, pssm_dir):
    """Create ``<pssm_dir>/<record id>_pssm.txt`` for every record of ``fname``.

    Records whose PSSM file already exists are not searched again.  Each
    search goes through the scratch file ``input.fasta`` in the current
    working directory.  A progress line is printed after every record.
    """
    records = list(SeqIO.parse(fname, 'fasta'))
    total = len(records)
    scratch_fasta = 'input.fasta'

    for done, record in enumerate(records, start=1):
        if not os.path.exists(pssm_dir):
            os.mkdir(pssm_dir)

        # psi-blast output file
        target = pssm_dir + "/" + record.id + "_pssm.txt"
        if not os.path.exists(target):
            # psi-blast input file
            if os.path.exists(scratch_fasta):
                os.remove(scratch_fasta)
            SeqIO.write(record, scratch_fasta, 'fasta')

            run_psiblast = NcbipsiblastCommandline(
                query=scratch_fasta,
                db='Swissprot',
                evalue=0.001,
                num_iterations=3,
                out_ascii_pssm=target)
            run_psiblast()

        print("{:.2f}% ====> {}/{} finished".format(
            done / total * 100, done, total))
def psiblast(query, db, out_dir, out_file):
    """Run a 3-iteration PSI-BLAST and return the path of the ASCII PSSM.

    The matrix of the last round is written to ``<out_dir>/<out_file>.pssm``.
    """
    pssm_path = os.path.join(out_dir, "{}.pssm".format(out_file))
    options = dict(
        db=db,
        query=query,
        evalue=0.001,
        num_iterations=3,
        save_pssm_after_last_round=True,
        out_ascii_pssm=pssm_path,
    )
    NcbipsiblastCommandline('psiblast', **options)()
    return pssm_path
def pssm_gen_ncbi(database, input_fasta, out_pssm, num_iter, num_thr):
    """Generate an ASCII PSSM for ``input_fasta`` with NCBI PSI-BLAST.

    Args:
        database: BLAST protein database to search.
        input_fasta: FASTA file containing the query sequence.
        out_pssm: Destination of the ASCII PSSM.
        num_iter: Number of PSI-BLAST iterations.
        num_thr: Number of threads given to PSI-BLAST.
    """
    from Bio.Blast.Applications import NcbipsiblastCommandline

    psi_cline = NcbipsiblastCommandline(
        'psiblast',
        db=database,
        # The parameter is `input_fasta`; the previous `inp_fasta` was an
        # undefined name and raised NameError on every call.
        query=input_fasta,
        num_threads=num_thr,
        num_iterations=num_iter,
        outfmt=5,
        out_ascii_pssm=out_pssm)
    psi_cline()
    return
def getPSSMFiles(fastafile, outfileprefix='', dbName='swissprot'):
    """Write ``<outfileprefix><record id>.txt`` for every record of ``fastafile``.

    PSI-BLAST is run for each record through the scratch file ``input.fasta``;
    an existing output file is deleted first.  When PSI-BLAST leaves no
    output file, a substitute is written from the module-level ``alphabet``
    and ``blosumMatrix``: a blank line, a title line, the alphabet, then one
    row of matrix scores per residue.
    """
    scratch_fasta = 'input.fasta'

    for record in SeqIO.parse(fastafile, 'fasta'):
        print('\r{} '.format(record.id), end="")

        # psi-blast input file
        if os.path.exists(scratch_fasta):
            os.remove(scratch_fasta)
        SeqIO.write(record, scratch_fasta, 'fasta')

        # psi-blast output file
        target = outfileprefix + record.id + '.txt'
        if os.path.exists(target):
            os.remove(target)

        run_psiblast = NcbipsiblastCommandline(
            query=scratch_fasta,
            db=dbName,
            evalue=0.001,
            num_iterations=3,
            out_ascii_pssm=target)
        run_psiblast()

        if os.path.exists(target):
            continue

        # No PSSM was produced: fall back to rows of the substitution matrix.
        print('\r{} does not have pssm'.format(record.id))
        with open(target, 'w') as handle:
            handle.write(" \n")
            handle.write(
                "last position-specific scoring matrix computed, weighted \n")
            handle.write(alphabet + '\n')
            for position, residue in enumerate(record.seq, start=1):
                # NOTE(review): str.find returns -1 for a residue missing
                # from `alphabet`, which then selects the last matrix row.
                row = alphabet.find(residue)
                scores = ''.join(
                    str(blosumMatrix[row][col]) + ' '
                    for col in range(len(alphabet)))
                handle.write(
                    str(position) + ' ' + residue + ' ' + scores + '\n')
def upSampling_pssm(num_upsamp, upsamp_file, fpos, vrate, pssmdir):
    """Write ``num_upsamp`` synthetic sequences derived from PSSMs.

    Records of ``fpos`` are drawn at random (with replacement).  For each
    draw the record's PSSM (cached as ``<pssmdir>/<id>_pssm.txt``, generated
    with PSI-BLAST when absent) and its sequence are passed to ``genSeq``
    together with ``vrate``; the result is stored as record ``fake<i>``.
    All generated records are written to ``upsamp_file`` in FASTA format.

    Args:
        num_upsamp: Number of sequences to generate; values <= 0 produce an
            empty output file.
        upsamp_file: Destination FASTA file.
        fpos: FASTA file with the source sequences.
        vrate: Passed through to ``genSeq``.
        pssmdir: Directory used to cache the PSSM files.

    Raises:
        ValueError: if sequences are requested but ``fpos`` has no records.
        RuntimeError: if PSI-BLAST produced no PSSM for any source record,
            which would otherwise make the sampling loop run forever.
    """
    seq_records = list(SeqIO.parse(fpos, 'fasta'))
    N = len(seq_records)
    if num_upsamp > 0 and N == 0:
        raise ValueError("no sequences found in {}".format(fpos))

    inputfile = 'input.fasta'
    os.makedirs(pssmdir, exist_ok=True)

    proteins = []
    without_pssm = set()  # indices for which PSI-BLAST produced no matrix
    while len(proteins) < num_upsamp:
        k = np.random.randint(0, N)
        if k in without_pssm:
            continue
        seq_record = seq_records[k]

        # psi-blast output file
        pssmfile = os.path.join(pssmdir, seq_record.id + "_pssm.txt")
        if not os.path.exists(pssmfile):
            # psi-blast input file
            if os.path.exists(inputfile):
                os.remove(inputfile)
            SeqIO.write(seq_record, inputfile, 'fasta')

            psiblast_cline = NcbipsiblastCommandline(query=inputfile,
                                                     db='Swissprot',
                                                     evalue=0.001,
                                                     num_iterations=3,
                                                     out_ascii_pssm=pssmfile)
            psiblast_cline()

            # PSI-BLAST did not construct a PSSM for this record.
            if not os.path.exists(pssmfile):
                without_pssm.add(k)
                if len(without_pssm) == N:
                    raise RuntimeError(
                        "PSI-BLAST produced no PSSM for any sequence in "
                        "{}".format(fpos))
                continue

        pssm = readPSSM(pssmfile)
        prot = genSeq(str(seq_record.seq), pssm, vrate)
        fake_seq = Seq(prot, IUPAC.ExtendedIUPACProtein)
        proteins.append(SeqRecord(fake_seq, id='fake' + str(len(proteins))))

        done = len(proteins)
        print("{:.2f}% ====> {}/{} finished".format(done / num_upsamp * 100,
                                                    done, num_upsamp))

    SeqIO.write(proteins, upsamp_file, 'fasta')
def _psi_blast_pssm(input):
    """Run PSI-BLAST for one FASTA file and return its parsed PSSM.

    ``input`` is an indexable of eight items, read by position: file name,
    base path, BLAST database, e-value, number of iterations, sigmoid flag,
    FASTA sub-path and PSSM sub-path.

    Returns:
        The matrix from ``_parse_pssm_ascii`` (passed through ``_sigmoid``
        when the flag is truthy), or ``None`` if PSI-BLAST wrote no PSSM.
    """
    (file, path, blast_db_path, evalue, num_iterations, sigmoid,
     tmp_fastas_path, tmp_pssms_path) = (input[i] for i in range(8))

    # The PSSM is named after the part of the file name before the first dot.
    pssm_path = path + tmp_pssms_path + file.split('.')[0]

    # create and call command
    run_psiblast = NcbipsiblastCommandline(
        cmd='psiblast',
        query=path + tmp_fastas_path + file,
        db=blast_db_path,
        evalue=evalue,
        num_iterations=num_iterations,
        outfmt=5,
        num_threads=3,
        out_ascii_pssm=pssm_path)
    run_psiblast()

    if not os.path.isfile(pssm_path):
        # TODO: what to do in this case?! e.g. no sequences returned by
        # psiblast due to too low evalue
        return None

    pssm = _parse_pssm_ascii(pssm_path)
    return _sigmoid(pssm) if sigmoid else pssm
def generate_sequence_profiles(structural_alignment_path, pssm_dir, blastdb='uniref90'):
    """Create a PSI-BLAST ASCII PSSM for every domain named in an alignment file.

    Record ids of ``structural_alignment_path`` have the form ``A&B``; both
    parts are collected as domains.  For each domain without an existing
    ``{pssm_dir}/{domain[2:4]}/{domain}.mtx``, the sequence of the first
    record whose id starts with ``{domain}&`` is piped to PSI-BLAST on stdin.

    Args:
        structural_alignment_path: FASTA file indexed with ``SeqIO.index``.
        pssm_dir: Root directory of the generated ``.mtx`` files.
        blastdb: BLAST database to search.

    Raises:
        IndexError: if a record id contains no ``&``, or a domain never
            occurs as the first part of any record id.
    """
    seq_index = SeqIO.index(structural_alignment_path, 'fasta')

    # One pass over the ids: collect every domain and remember the first
    # record keyed by each leading domain, instead of rescanning the whole
    # index once per domain.
    domains = set()
    first_record_id = {}
    for record_id in seq_index:
        parts = record_id.split('&')
        domains.add(parts[0])
        domains.add(parts[1])
        first_record_id.setdefault(parts[0], record_id)

    for domain in tqdm(domains):
        mtx_path = Path(f'{pssm_dir}/{domain[2:4]}/{domain}.mtx')
        if mtx_path.exists():
            continue
        Path(f'{pssm_dir}/{domain[2:4]}').mkdir(parents=True, exist_ok=True)
        if domain not in first_record_id:
            raise IndexError(f'no record id starts with "{domain}&"')
        seq_record = seq_index[first_record_id[domain]]
        NcbipsiblastCommandline(
            db=blastdb,
            num_threads=os.cpu_count(),
            num_iterations=3,
            out_ascii_pssm=mtx_path.as_posix(),
            save_pssm_after_last_round=True)(stdin=str(seq_record.seq))
def _compress_and_remove(path):
    """Gzip ``path`` to ``path + '.gz'`` and delete the uncompressed file."""
    with open(path, "rb") as raw, gzip.open(path + ".gz", 'wb') as packed:
        packed.writelines(raw)
    os.remove(path)


def _decompress_and_remove(path):
    """Expand ``path + '.gz'`` to ``path`` and delete the compressed file."""
    with gzip.open(path + ".gz", 'rb') as packed, open(path, 'wb') as raw:
        raw.write(packed.read())
    os.remove(path + ".gz")


def BLAST_submission(task):
    """Run a single PSI-BLAST search

    Parameters
    ----------
    task : array
        List/array with, in order: protein name, query (sent to PSI-BLAST on
        stdin), hit output file, PSSM output file, e-value, maximum number of
        hits, overwrite flag, database, remote flag, compress flag.

    Saved Files and Figures
    -----------------------
    PROTEIN_NAME.blast_result.xml : xml file containing PSI-BLAST hits (e.g. A2A2V5.blast_result.xml)
    PROTEIN_NAME.blast_result.pssm : pssm file containing PSI-BLAST similarity scoring matrix (e.g. A2A2V5.blast_result.pssm)

    Both files are replaced by ``.gz`` versions when the compress flag is set.
    When the overwrite flag is false and a non-empty hit file (plain or
    gzipped) already exists, no search is run; the existing files are only
    converted to the requested (compressed / uncompressed) form.
    """
    # Obtain query and parameters
    (protein_name, query, output_hit_file, output_pssm_file, evalue, hitsize,
     overwrite_results, database, remote_tag, compress_results) = task
    outputs = (output_hit_file, output_pssm_file)

    if not overwrite_results:
        # A non-empty compressed hit file exists: skip the search.
        gz_hit_file = output_hit_file + ".gz"
        if os.path.exists(gz_hit_file) and os.path.getsize(gz_hit_file) > 0:
            if not compress_results:
                # Uncompressed results were requested. The PSSM may be
                # missing (not every search produces one), so check each
                # file before expanding it.
                for path in outputs:
                    if os.path.exists(path + ".gz"):
                        _decompress_and_remove(path)
            return
        # A non-empty uncompressed hit file exists: skip the search.
        if os.path.exists(
                output_hit_file) and os.path.getsize(output_hit_file) > 0:
            if compress_results:
                for path in outputs:
                    if os.path.exists(path):
                        _compress_and_remove(path)
            return
    else:
        # Overwrite requested: remove raw and compressed leftovers first.
        for path in outputs:
            for leftover in (path, path + ".gz"):
                if os.path.exists(leftover):
                    os.remove(leftover)

    # Run PSI-BLAST search
    psiblast_cline = NcbipsiblastCommandline('psiblast',
                                             db=database,
                                             evalue=evalue,
                                             max_target_seqs=hitsize,
                                             outfmt=5,
                                             out=output_hit_file,
                                             out_ascii_pssm=output_pssm_file,
                                             remote=remote_tag,
                                             inclusion_ethresh=evalue,
                                             num_iterations=3,
                                             use_sw_tback=True,
                                             seg="no")
    out, err = psiblast_cline(stdin=query)

    # Results go to files, so any text on stdout/stderr is treated as a
    # problem report: log it and leave the result files untouched.
    if out or err:
        logging.warning(out)
        logging.warning(err)
        return

    if compress_results:
        for path in outputs:
            if os.path.exists(path):
                _compress_and_remove(path)
selected.append(splited[0]) AC_BIN_dict = {} for line in inf: line = line.strip('\n') splited = line.split('\t') AC_BIN_dict[splited[0]] = splited[1] outf = open(outf_name, "w") os.chdir(sequences_dir) print "Current directory changed into " + sequences_dir + "!" percent_count = 0 for item in selected: outf.write(item + "\t") # cline = NcbipsiblastCommandline(query = item + ".fasta", db = "swissprot", max_hsps_per_subject = 1, num_threads = threads_count, outfmt = 0, out = "tmp.log") cline = NcbipsiblastCommandline(query=item + ".fasta", db="swissprot", num_threads=threads_count, outfmt=0, out=item + ".log") if percent_count % (len(selected) / 100) == 0: print str(percent_count / (len(selected) / 100)) + "% finished!" if percent_count == 0: print "Time elasped: Unknown" print "Time remaining: Unknown" else: m, s = divmod(time.time() - start_time, 60) h, m = divmod(m, 60) print "Time elasped: %d:%02d:%02d" % (h, m, s) del h, m, s m, s = divmod( (float(len(selected)) / percent_count) * (time.time() - start_time) - (time.time() - start_time), 60)
def get_pssm(self, fasta_dir='fasta/', outdir='pssm_raw/', num_iterations=3, run=True):
    """Compute the PSSM files

    PSI-BLAST is configured from ``self.blast`` (executable) and ``self.db``
    (database), with per-query settings from
    ``self._get_psiblast_parameters``.

    Args:
        fasta_dir (str, optional): directory, relative to ``self.caseID``,
            where the fasta queries are stored
        outdir (str, optional): output directory, relative to
            ``self.caseID``, where to store the pssm files
        num_iterations (int, optional): number of iterations for the blast
            calculations
        run (bool, optional): when False the commands are only built and
            validated, not executed
    """
    fasta_dir = os.path.join(self.caseID, fasta_dir)
    outdir = os.path.join(self.caseID, outdir)
    if not os.path.isdir(outdir):
        os.mkdir(outdir)

    for q in os.listdir(fasta_dir):
        # get the fasta query
        query = os.path.join(fasta_dir, q)
        name = os.path.splitext(os.path.basename(query))[0]

        # set up the output names
        out_ascii_pssm = os.path.join(outdir, name + '.pssm')
        out_pssm = os.path.join(outdir, name + '.cptpssm')
        out_xml = os.path.join(outdir, name + '.xml')

        # get the parameters
        blast_param = self._get_psiblast_parameters(query)

        # set up the psiblast calculation
        psi_cline = NcbipsiblastCommandline(
            self.blast,
            db=self.db,
            query=query,
            evalue=0.0001,
            word_size=blast_param['wordSize'],
            gapopen=blast_param['gapOpen'],
            gapextend=blast_param['gapExtend'],
            matrix=blast_param['scoringMatrix'],
            outfmt=7,
            comp_based_stats='T',
            max_target_seqs=2000,
            save_each_pssm=True,
            num_iterations=num_iterations,
            save_pssm_after_last_round=True,
            out_ascii_pssm=out_ascii_pssm,
            out_pssm=out_pssm,
            out=out_xml)

        # check that it's correct
        psi_cline._validate()

        if not run:
            continue

        # run the blast query
        psi_cline()

        # save_each_pssm writes one "<name>.pssm.<round>" file per completed
        # round.  The search can converge before `num_iterations`, so pick
        # the highest round actually present rather than assuming the last
        # one exists.  With no numbered file at all, fall back to the
        # expected name so the copy fails exactly as it did before.
        rounds = {}
        for filename in glob.glob(out_ascii_pssm + '.*'):
            suffix = filename.rsplit('.', 1)[-1]
            if suffix.isdigit():
                rounds[int(suffix)] = filename
        if rounds:
            final_round_file = rounds[max(rounds)]
        else:
            final_round_file = out_ascii_pssm + '.%d' % num_iterations

        # copy the final pssm to its final name
        shutil.copy2(final_round_file, out_ascii_pssm)

        # remove all the other files
        for filename in glob.glob(out_pssm + '.*'):
            os.remove(filename)
        for filename in glob.glob(out_ascii_pssm + '.*'):
            os.remove(filename)
        os.remove(out_xml)
from Bio.Blast.Applications import NcbipsiblastCommandline import re feature_num = 20 threads_count = 4 input_filename = "input.fasta" output_filename = "test_features.npy" inf_name = "../AC_BIN.dat" inf = open(inf_name, "r") AC_BIN_dict = {} for line in inf: line = line.strip('\n') splited = line.split('\t') AC_BIN_dict[splited[0]] = splited[1] cline = NcbipsiblastCommandline(query=input_filename, db="swissprot", num_threads=threads_count, outfmt=0, out="temp.log") stdout, stderr = cline() blastf = open("temp.log", "r") outf = open(output_filename, "w") AC = "" expect = "" count = 0 for line in blastf: line = line.strip("\n") matched = re.match( r'^sp\|(\w+)\.\w+\|\w+?\s+RecName:\s+.+?\s+[0-9]+(\.[0-9])?\s+([0-9]+.[0-9]+|[0-9]+e-[0-9]+)\s*$', line) if matched: if count < feature_num:
def searchPSIBLAST(self, id, psiblast):
    """Psi-Blast over a local database or over the internet.

    ``psiblast == "local"`` runs the local ``psiblast`` program against
    ``refseq_protein``; any other value submits the last sequence of the
    input file to NCBI through ``NCBIWWW.qblast``.  Either way the report is
    written to ``<dirname><id>.xml`` and the input ``<dirname><id>.fa`` ends
    up as ``<dirname><id>.fasta``.

    Raises:
        ValueError: (remote mode) if the input file contains no sequence.
    """
    fa_path = self.dirname + id + ".fa"
    fasta_path = self.dirname + id + ".fasta"
    output = self.dirname + id + ".xml"
    reference_protein = "refseq_protein"

    def _retire_query_file():
        # Keep an existing .fasta and drop the .fa; otherwise rename the .fa.
        # (`open.close()` used to raise AttributeError here, so the file was
        # always moved and the opened handle leaked.)
        try:
            with open(fasta_path):
                pass
        except IOError:
            move(fa_path, fasta_path)
        else:
            remove(fa_path)

    if psiblast == "local":
        threads = LP(self.parameterfile, "psiblast_threading")
        evalue = LP(self.parameterfile, "psiblast_evalue")
        # NOTE(review): the e-value setting is passed as `threshold`, which
        # BLAST+ documents as the word-score threshold, not as `evalue` --
        # confirm this is intended.
        options = dict(query=fa_path,
                       db=reference_protein,
                       outfmt=5,
                       threshold=evalue,
                       out=output)
        if threads != False:
            # Use the thread setting only if it is a usable positive
            # integer; otherwise run without it.
            try:
                thread_count = int(threads)
            except (TypeError, ValueError):
                thread_count = 0
            if thread_count >= 1:
                options["num_threads"] = thread_count
        NcbipsiblastCommandline(**options)()
        _retire_query_file()
    else:
        evalue = LP(self.parameterfile, "psiblast_evalue")
        sequence = None
        # Only the last record of the file is searched.
        for seq_record in SeqIO.parse(fa_path, "fasta", IUPAC.protein):
            sequence = seq_record.seq
        if sequence is None:
            raise ValueError("no sequence found in " + fa_path)
        result_handle = NCBIWWW.qblast("blastp",
                                       reference_protein,
                                       sequence,
                                       service="psi",
                                       expect=evalue,
                                       hitlist_size=500)
        _retire_query_file()
        try:
            with open(output, "w") as saveblast:
                saveblast.write(result_handle.read())
        finally:
            result_handle.close()
import pandas as pd
import sys

#~~~~~~~~~~~~~~~~~~~~~~~~~~
# DEFINING SOME VARIABLES:
#~~~~~~~~~~~~~~~~~~~~~~~~~~
# The query name is the first command-line argument; the PSSM input and the
# TSV output file names are derived from it.
query = sys.argv[1]
pssm = query + '-pssm.asn1'
output = 'psiblast-' + query + '.tsv'
# Tabular (format 6) output columns.  The value carries its own single quotes
# and surrounding spaces.
out_format = " '6 qaccver saccver bitscore evalue qlen slen length qcovs pident staxids sscinames' "
# Column names only: drops the leading " '6 " and the trailing "' ".
out_columns = out_format[4:-2]
database = '~/datalocal/nr_db/nr'
e_cut = 1e-3

#~~~~~~~~~~~~~~~~~~~~~
# RUN 1 BLASTP ROUND:
#~~~~~~~~~~~~~~~~~~~~~
print('about to run')
# Build a one-iteration PSI-BLAST command that starts from the saved PSSM.
# NOTE(review): in the lines visible here the command object is only printed,
# never called -- confirm it is executed elsewhere, otherwise the
# 'psiblast done' message is misleading.
psiblast_cline = NcbipsiblastCommandline(in_pssm=pssm,
                                         num_iterations='1',
                                         out=output,
                                         outfmt=out_format,
                                         db=database,
                                         evalue=e_cut)
print(psiblast_cline)
print('psiblast done')
import os, sys, glob, subprocess, shutil
from os.path import dirname, abspath, isdir
import pandas as pd
from Bio.Blast.Applications import NcbipsiblastCommandline
from Bio import SeqIO

#~~~~~~~~~~~~~~~~~~~~~~~~~~
# DEFINE NAMES OF FILES :
#~~~~~~~~~~~~~~~~~~~~~~~~~~
# Name of the query sequence ('PflA' or 'PflB'), taken from the command line.
query = sys.argv[1]
# Directory one level above the one containing this script.
parentdir = dirname(dirname(abspath(__file__)))
query_fa = f'{parentdir}/1_bidirectional-best-hits/Cj{query}.faa'
blastdb = f'{query}-db/{query}-hits'
pssm = f'{query}-pssm.asn1'

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# CONVERT TO PSSM WITH PSI-BLAST:
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Search the original query (CjPflA or CjPflB) against the hits database and
# save the checkpoint PSSM that PSI-BLAST builds.  Note that num_iterations
# is passed as '2'.
psiblast_options = {
    'query': query_fa,
    'db': blastdb,
    'num_iterations': '2',
    'out_pssm': pssm,
}
psiblast_cline = NcbipsiblastCommandline(**psiblast_options)
stdout, stderr = psiblast_cline()
def blast(algo):
    """Benchmark a BLAST flavour on one random domain per superfamily.

    For every superfamily in ``data/train/scop40_1fold_hie.pkl`` one domain
    is sampled and searched against the SCOP40 database; the last iteration's
    first 500 hits are labelled by superfamily and a ROC AUC is computed.
    The results are pickled to ``auc_result_<algo>_<unix time>.pkl``.

    Args:
        algo: ``'psiblast'`` or ``'deltablast'``.

    Raises:
        ValueError: if ``algo`` is not one of the supported names.
    """
    commandlines = {
        'psiblast': NcbipsiblastCommandline,
        'deltablast': NcbideltablastCommandline,
    }
    # Validate up front instead of failing inside the loop after files have
    # already been written.
    if algo not in commandlines:
        raise ValueError(f'Invalid algorithm ({algo})')
    commandline = commandlines[algo]

    seqindex = SeqIO.index('data/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75.fa',
                           'fasta',
                           key_function=lambda x: x.split()[0])
    with Path('data/train/scop40_1fold_hie.pkl').open('rb') as handle:
        hie = pickle.load(handle)
    scop = Scop(dir_path='data/scop', version='1.75')
    tmpdir = Path(f'.{algo}')
    tmpdir.mkdir(exist_ok=True)

    auc_result = {}
    for sf in tqdm(hie):
        px_list = hie[sf]
        if len(px_list) < 1:
            continue
        sid = random.sample(px_list, 1)[0]
        record = seqindex[sid]
        f_fasta = tmpdir / f'{sid}.fasta'
        f_xml = tmpdir / f'{sid}.xml'
        SeqIO.write(record, f_fasta.as_posix(), 'fasta')
        try:
            commandline(query=f_fasta.as_posix(),
                        db='astral-scopdom-seqres-gd-sel-gs-bib-40-1.75',
                        num_threads=int(os.cpu_count()),
                        num_iterations=3,
                        evalue=999999,
                        outfmt=5,
                        out=f_xml.as_posix())()
        except ApplicationError as e:
            logging.error(e)
            # The report may not exist if the program failed early.
            if f_xml.exists():
                f_xml.unlink()
            continue
        finally:
            f_fasta.unlink()

        results = SearchIO.parse(f_xml.as_posix(), 'blast-xml')
        results = list(results)[-1]  # final iteration
        results = list(results)[:500]
        sf_sccs = scop.getNodeBySunid(sf).sccs
        roc_score = []
        roc_label = []
        for result in results:
            result_sf_sccs = result.description.split(' ')[0][:-2]
            roc_score.append(-result.hsps[0].evalue)
            roc_label.append(1 if result_sf_sccs == sf_sccs else 0)

        # roc_auc_score needs both classes; handle one-class rankings here.
        # NOTE(review): an empty hit list also takes the first branch
        # (np.all of an empty array is True) and is scored 1.0 -- confirm.
        if np.all(np.array(roc_label) == 1):
            auc = 1.0
        elif np.all(np.array(roc_label) == 0):
            auc = 0.0
        else:
            auc = metrics.roc_auc_score(roc_label, roc_score)
        auc_result[sf_sccs] = {'auc': auc, 'sample': sid, 'num': len(results)}
        f_xml.unlink()

    now = int(time.time())
    with Path(f'auc_result_{algo}_{now}.pkl').open('wb') as handle:
        pickle.dump(auc_result, handle)
def searchPSIBLAST(self, id, psiblast):
    """Psi-Blast over a local database or over the internet.

    ``psiblast == "local"`` runs the local ``psiblast`` program against
    ``refseq_protein``; any other value submits the last sequence of the
    input file to NCBI through ``NCBIWWW.qblast``.  Either way the report is
    written to ``./Data/<id>.xml`` and the input ``./Data/<id>.fa`` ends up
    as ``./Data/<id>.fasta``.  The thread count and e-value come from the
    module-level ``psiblast_threading`` and ``psiblast_evalue`` settings
    (edit them in Parameters.py).

    Raises:
        ValueError: (remote mode) if the input file contains no sequence.
    """
    fa_path = "./Data/" + id + ".fa"
    fasta_path = "./Data/" + id + ".fasta"
    output = "./Data/" + id + ".xml"
    reference_protein = "refseq_protein"
    evalue = psiblast_evalue

    def _retire_query_file():
        # Keep an existing .fasta and drop the .fa; otherwise rename the .fa.
        # (`open.close()` used to raise AttributeError here, so the file was
        # always moved and the opened handle leaked.)
        try:
            with open(fasta_path):
                pass
        except IOError:
            move(fa_path, fasta_path)
        else:
            remove(fa_path)

    if psiblast == "local":
        threads = psiblast_threading
        # NOTE(review): the e-value setting is passed as `threshold`, which
        # BLAST+ documents as the word-score threshold, not as `evalue` --
        # confirm this is intended.
        options = dict(query=fa_path,
                       db=reference_protein,
                       outfmt=5,
                       threshold=evalue,
                       out=output)
        if threads != False:
            # Use the thread setting only if it is a usable positive
            # integer; otherwise run without it.
            try:
                thread_count = int(threads)
            except (TypeError, ValueError):
                thread_count = 0
            if thread_count >= 1:
                options["num_threads"] = thread_count
        NcbipsiblastCommandline(**options)()
        _retire_query_file()
    else:
        sequence = None
        # Only the last record of the file is searched.
        for seq_record in SeqIO.parse(fa_path, "fasta", IUPAC.protein):
            sequence = seq_record.seq
        if sequence is None:
            raise ValueError("no sequence found in " + fa_path)
        result_handle = NCBIWWW.qblast("blastp",
                                       reference_protein,
                                       sequence,
                                       service="psi",
                                       expect=evalue,
                                       hitlist_size=500)
        _retire_query_file()
        try:
            with open(output, "w") as saveblast:
                saveblast.write(result_handle.read())
        finally:
            result_handle.close()
def generate_profiles(in_dataframe, out_path):
    """Generate one PSI-BLAST profile file per row of ``in_dataframe``.

    Rather complicated and, quite honestly, ugly-looking function used for
    generating the profiles from a given set of sequences.  Intended to be
    used internally.

    Steps, as implemented below:
      1. Unpack the Swiss-Prot FASTA and build a protein BLAST database.
      2. For each row, write the sequence to a temporary FASTA file and run
         PSI-BLAST to get an ASCII PSSM.
      3. Rows without a PSSM are dropped from the dataframe **in place**.
      4. Otherwise the PSSM is reshaped into a table and saved as
         ``<out_path>/profile/<name>.profile`` (tab-separated).
      5. The (possibly shortened) dataframe is dumped to ``dump_path``.

    Args:
        in_dataframe: DataFrame read via its 'Sequence' and 'Structure'
            columns; a tuple index is unpacked as (sample id, chain).
        out_path: Directory receiving the ``profile`` folder and the dump.
    """
    out_path = Path(out_path)
    # NOTE: `dataset` is the same object as `in_dataframe`, so the drops
    # below modify the caller's dataframe.
    dataset = in_dataframe
    s = Sultan()
    print('Unpacking and generating Uniprot DB.')
    s.gunzip('-fk ../data/swiss-prot/uniprot_sprot.fasta.gz').run()
    cmd = NcbimakeblastdbCommandline(
        input_file='../data/swiss-prot/uniprot_sprot.fasta', dbtype='prot')
    cmd()
    if not (out_path / 'profile').exists():
        s.mkdir(out_path / 'profile').run()
    with TemporaryDirectory() as psi_temp:
        for _, sample in tqdm(dataset.iterrows(),
                              total=len(dataset),
                              desc='Generating profiles'):
            with NamedTemporaryFile(mode='w') as blast_in:
                # Output name and dump file depend on the index type.
                # NOTE(review): `dump_path` is only assigned inside this
                # loop, so an empty dataframe leaves it undefined at the
                # final dump -- confirm callers never pass one.
                if isinstance(sample.name, tuple):
                    sample_id, chain = sample.name[0], sample.name[1]
                    out_name = f'{sample_id}_{chain}'
                    dump_path = out_path / 'full_test_summary.joblib'
                else:
                    sample_id = sample.name
                    out_name = sample_id
                    dump_path = out_path / 'jpred_summary.joblib'
                sequence, structure = sample[['Sequence', 'Structure']]
                # Leading blank: the first parsed line (presumably the PSSM
                # column-header row) consumes structure[0].
                structure = ' ' + structure
                print(f'>{out_name}', file=blast_in)
                print(sequence, file=blast_in)
                # Presumably here to flush the buffered FASTA text to disk
                # before PSI-BLAST reads the file by name.
                blast_in.seek(0)
                cmd = NcbipsiblastCommandline(
                    query=blast_in.name,
                    db='../data/swiss-prot/uniprot_sprot.fasta',
                    evalue=0.01,
                    num_iterations=3,
                    out_ascii_pssm=f'{psi_temp}/{out_name}.pssm',
                    num_descriptions=10000,
                    num_alignments=10000,
                    # out=f'{psi_temp}{out_name}.alns.blast',
                    num_threads=8)
                cmd()
                # No PSSM file: drop the row and move on.
                if not os.path.exists(
                        os.path.join(psi_temp, out_name + '.pssm')):
                    tqdm.write(
                        f'Unable to generate profile for {out_name}. No hits in the database.'
                    )
                    dataset.drop(index=sample.name, inplace=True)
                    continue
                with open(f'{psi_temp}/{out_name}.pssm', 'r') as pssm_file:
                    # Skip the first two lines of the PSSM file.
                    pssm_file.readline()
                    pssm_file.readline()
                    profile = []
                    offset = False
                    position = 0
                    # Read rows until the first blank line, appending one
                    # character of `structure` to each row.
                    for line in pssm_file:
                        line = line.rstrip()
                        if not line:
                            break
                        line = line.split()
                        line.append(structure[position])
                        position += 1
                        # Only the first row gets two empty leading cells --
                        # presumably to align the header with the data rows.
                        if not offset:
                            for i in range(2):
                                line.insert(0, '')
                            offset = True
                        profile.append(line)
                profile = pd.DataFrame(profile)
                # Column pruning is positional and order-dependent: drop
                # columns 2..21, then the 3rd- and 2nd-to-last, then the
                # first.
                profile.drop(
                    (profile.columns[col] for col in range(2, 22)),
                    axis=1,
                    inplace=True)
                profile.drop((profile.columns[-3:-1]), axis=1, inplace=True)
                profile.drop((profile.columns[0]), axis=1, inplace=True)
                # Promote the first row to column names and remove it.
                profile.columns = profile.iloc[0]
                profile = profile[1:]
                profile.rename(columns={profile.columns[0]: "Sequence"},
                               inplace=True)
                profile.rename(columns={profile.columns[-1]: "Structure"},
                               inplace=True)
                # Move 'Structure' to the front.
                profile = profile[
                    ['Structure'] +
                    [col for col in profile.columns if col != 'Structure']]
                # Scale the columns labelled 'A' through 'V' by 1/100.
                profile.loc[:, 'A':'V'] = profile.loc[:, 'A':'V'].astype(
                    float).divide(100)
                profile.to_csv(out_path / 'profile' / (out_name + '.profile'),
                               sep='\t',
                               index=False)
    print(
        f'Dumping clean test to {dump_path}. Profiles are generated in {out_path}/profile'
    )
    dump(dataset, dump_path)
def blast(query, algo, iter, key_name, roc_top, result_dict_cover, blast_ranking):
    """Search one test query and compute its ROC points and normalised AUC.

    The BLAST XML report is cached as ``.{algo}_iter{iter}/{query}.xml`` and
    only generated when missing.  Hits of the final iteration are labelled
    by comparing their superfamily with that of ``query`` (looked up in the
    module-level ``scop100``); labelling stops once ``roc_top`` negatives
    have been seen.

    Side effects:
        ``blast_ranking[key_name][query]`` is replaced by one
        ``{hit id: ('', '', evalue)}`` entry per hit, and the query coverage
        of every true-positive hit visited is appended to
        ``result_dict_cover[key_name]``.

    Returns:
        ``(fpc, tpc, auc)``: the two count lists and the AUC divided by
        ``roc_top`` and by the superfamily size taken from ``hie``.

    Raises:
        ValueError: if the report must be generated and ``algo`` is unknown.
    """
    database = 'data/blastdb/astral-scopdom-seqres-gd-sel-gs-bib-40-1.75_train_only'
    cache_dir = Path(f'.{algo}_iter{iter}')
    cache_dir.mkdir(exist_ok=True)
    f_fasta = Path(f'data/test/{query}.fasta')
    f_xml = cache_dir / f'{query}.xml'

    if not f_xml.exists():
        if algo not in ('psiblast', 'deltablast', 'psiblast_pssm'):
            raise ValueError(f'Invalid algorithm ({algo})')
        shared = dict(db=database,
                      num_threads=int(os.cpu_count()),
                      num_iterations=iter,
                      max_target_seqs=500,
                      evalue=999999,
                      outfmt=5,
                      out=f_xml.as_posix())
        if algo == 'psiblast':
            search = NcbipsiblastCommandline(query=f_fasta.as_posix(),
                                             **shared)
        elif algo == 'deltablast':
            search = NcbideltablastCommandline(query=f_fasta.as_posix(),
                                               rpsdb='data/blastdb/cdd_delta',
                                               **shared)
        else:
            # psiblast_pssm: start from a stored PSSM instead of a sequence.
            f_pssm = Path(f'data/test/{query}.pssm')
            search = NcbipsiblastCommandline(in_pssm=f_pssm.as_posix(),
                                             **shared)
        search()

    iterations = list(SearchIO.parse(f_xml.as_posix(), 'blast-xml'))
    hits = list(iterations[-1])  # final iteration

    superfamily = scop100.getDomainBySid(query).getAscendent('sf')
    sf_sccs = superfamily.sccs
    all_tp_count = len(hie[superfamily.sunid])

    roc_score, roc_label = [], []
    assert len(hits) >= roc_top

    blast_ranking[key_name][query] = []
    blast_ranking[key_name][query].extend(
        {hit.id: ('', '', hit.hsps[0].evalue)} for hit in hits)

    for hit in hits:
        # Superfamily string = first description token minus its last field.
        hit_sf_sccs = '.'.join(hit.description.split(' ')[0].split('.')[:-1])
        roc_score.append(-hit.hsps[0].evalue)
        is_true_positive = hit_sf_sccs == sf_sccs
        roc_label.append(1 if is_true_positive else 0)
        if is_true_positive:
            aligned_length = len(hit.hsps[0].query.seq.ungap('-'))
            query_length = len(SeqIO.read(f_fasta.as_posix(), 'fasta').seq)
            result_dict_cover[key_name].append(aligned_length / query_length)
        if roc_label.count(0) == roc_top:
            break

    fpc = [0] + list(range(0, roc_top))
    tpc = [0] + [roc_label[:i + 1].count(1) for i in range(0, roc_top)]
    auc = metrics.auc(fpc, tpc) / roc_top / all_tp_count
    return fpc, tpc, auc
# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""
from Bio.Blast.Applications import NcbipsiblastCommandline
from Bio import SeqIO
import os

# Scratch query file, rewritten for every record.
inputfile = 'input.fasta'

for seq_record in SeqIO.parse('PDNA-224.fasta', 'fasta'):
    print('{} is calculating pssm'.format(seq_record.id))

    # Start from a clean scratch file.
    if os.path.exists(inputfile):
        os.remove(inputfile)

    # One ASCII PSSM per record: pssm_<id>.txt in the working directory.
    pssmfile = 'pssm' + '_' + seq_record.id + '.txt'
    SeqIO.write(seq_record, inputfile, 'fasta')

    psiblast_options = dict(query=inputfile,
                            db='swissprot',
                            evalue=0.001,
                            num_iterations=3,
                            out_ascii_pssm=pssmfile)
    psiblast_cline = NcbipsiblastCommandline(**psiblast_options)
    stdout, stderr = psiblast_cline()
if REMOTE: db = BLAST_CONFIG['default_db'] kwargs = {} else: db = config['paths']['blast']['dbs'][ BLAST_CONFIG['default_db']] kwargs = { 'num_iterations': BLAST_CONFIG['num_iterations'], 'num_threads': BLAST_CONFIG['num_threads'] } cline = NcbipsiblastCommandline( cmd=os.path.join(BLAST_PATH, 'bin', 'psiblast'), query=in_file.name, db=db, num_alignments=1, out_ascii_pssm=out_file_name, remote=REMOTE, **kwargs) print(cline) stdout, stderr = cline() # Note: It's possible that out_file_name doesn't get generated at all, if there are no hits found. # In this case, generate an empty .pssm file so we don't try to regenerate this file the next time this # gene/protein is encountered. We'll also have to check for 0 sized files to make this work of course. open(out_file_name, 'a').close( ) # no-op if file exists, creates an empty file if it doesn't # Copy all 'valid' .pssm files (with size>0) to the output folder for downstream processing. if os.path.getsize(out_file_name) > 0: shutil.copyfile(out_file_name,
def __generate_sequence_profiles_old():
    """Generate PSI-BLAST PSSMs (``.mtx``) for two sets of SCOP domains.

    First, the ungapped sequences of ``data/scop40_structural_alignment.fasta``
    (keyed by the part of the record id before ``&``) are written to
    ``data/scop40_scopdom_pdbatom_seq.fasta`` and profiled.  Second, the
    domains listed in ``data/one_domain_superfamily.pkl`` are profiled from
    the sequence obtained by aligning each ``.ent`` file with itself via
    TMalign.  Existing ``.mtx`` files are kept; failures are logged and the
    domain is skipped.
    """
    mtx_dir_name = 'pssm_deltablast'

    def _mtx_path(sid):
        # Create and return data/<mtx_dir_name>/<sid[2:4]>/<sid>.mtx's path.
        mtx_dir = Path(f'data/{mtx_dir_name}/{sid[2:4]}')
        mtx_dir.mkdir(exist_ok=True, parents=True)
        return mtx_dir / f'{sid}.mtx'

    def _run_psiblast(sid, mtx_file):
        # Profile the scratch file <sid>.fasta against uniref90.
        NcbipsiblastCommandline(query=f'{sid}.fasta',
                                db='uniref90',
                                num_threads=int(os.cpu_count()),
                                num_iterations=3,
                                out_ascii_pssm=mtx_file.as_posix(),
                                save_pssm_after_last_round=True)()

    def _remove_scratch(sid):
        # Delete the scratch FASTA if it was written.
        scratch = Path(f'{sid}.fasta')
        if scratch.exists():
            scratch.unlink()

    DB_INDEX = SeqIO.index('data/scop40_structural_alignment.fasta', 'fasta')
    records = {}
    for i in DB_INDEX:
        domkey = i.split('&')[0]
        records[domkey] = SeqRecord(DB_INDEX[i].seq.ungap('-'),
                                    id=domkey,
                                    name='',
                                    description='')
    with Path('data/scop40_scopdom_pdbatom_seq.fasta').open('w') as f:
        SeqIO.write(records.values(), f, 'fasta')

    DB_INDEX = SeqIO.index('data/scop40_scopdom_pdbatom_seq.fasta', 'fasta')
    for sid in tqdm(list(DB_INDEX)):
        mtx_file = _mtx_path(sid)
        if mtx_file.exists():
            logging.debug(f'PSSM already exists: {mtx_file}')
            continue
        try:
            SeqIO.write(DB_INDEX[sid], f'{sid}.fasta', 'fasta')
            _run_psiblast(sid, mtx_file)
        except Exception as e:
            logging.exception(e)
            continue
        finally:
            _remove_scratch(sid)

    logging.info('')

    # Close the pickle file deterministically instead of leaking the handle.
    with Path('data/one_domain_superfamily.pkl').open('rb') as handle:
        single_domain_sids = pickle.load(handle)
    for sid in tqdm(single_domain_sids):
        mtx_file = _mtx_path(sid)
        if mtx_file.exists():
            logging.debug(f'PSSM already exists: {mtx_file}')
            continue
        try:
            # Self-alignment is used only to obtain the structure's sequence.
            tmalign = TMalignCommandLine(f'data/scop_e/{sid[2:4]}/{sid}.ent',
                                         f'data/scop_e/{sid[2:4]}/{sid}.ent')
            tmalign.run()
            assert str(tmalign.alignment[0].seq).find('-') == -1
            SeqIO.write(tmalign.alignment[0], f'{sid}.fasta', 'fasta')
            _run_psiblast(sid, mtx_file)
        except Exception as e:
            logging.error(f'sid={sid}')
            logging.exception(e)
            continue
        finally:
            _remove_scratch(sid)