def getting_blastn_cline2_rRNA_vs_geno_getting_positions(pathMain, operon_list, fasta_files, uniqueGenoNames):
    """Run blastn for each operon/genome pair and collect the tabular hits.

    The three lists are walked in lockstep. For every triple the operon file
    is used as query and the genome FASTA as subject (both paths are built by
    plain concatenation with ``pathMain``), with ``outfmt=6`` and
    ``max_hsps=2``.

    :param pathMain: path prefix prepended to every file name
    :param operon_list: query file names
    :param fasta_files: subject file names
    :param uniqueGenoNames: keys used in the returned dictionary
    :returns: dict mapping each genome name to the list of output lines
    """
    dict_blast_split_lines = {}
    # zip() rather than itertools.izip(): izip only exists on Python 2,
    # while zip() is available (and equivalent here) on both 2 and 3.
    for operonfile, fastafile, uniqueGenoName in zip(operon_list, fasta_files, uniqueGenoNames):
        finalPath = pathMain + operonfile
        fastaPath = pathMain + fastafile
        # Calling the command line object returns (stdout, stderr).
        blast_stdout = NcbiblastnCommandline(query=finalPath,
                                             subject=fastaPath,
                                             outfmt=6,
                                             max_hsps=2)()[0]
        dict_blast_split_lines[uniqueGenoName] = blast_stdout.splitlines()
    return dict_blast_split_lines
def RunCommand(self):
    """Run the BLAST search.

    Writes the query (``self.command_data[0]``) to a temporary FASTA file,
    builds the command line wrapper for the selected program
    (``self.command_data[1]``) against the database
    (``self.command_data[2]``), applies the extra options given as a
    whitespace separated ``key value ...`` string in
    ``self.command_data[3]`` and starts a ``BlastWorker``.

    Returns None. Nothing is started when the program name is not
    recognised or when an extra option is rejected with ``ValueError``.
    """
    self.fh_in, self.infile = tempfile.mkstemp()
    self.fh_out, self.outfile = tempfile.mkstemp()
    with open(self.infile, 'w+') as f:
        f.write('>Name\n')
        f.write(self.command_data[0])
    blast_program = self.command_data[1]
    database = self.command_data[2]

    # Check if user supplied additional options and extract them.
    # Tokens are paired up (key, value); a trailing unpaired token is
    # dropped.
    options = {}
    if self.command_data[3]:
        tokens = self.command_data[3].split()
        options = dict(zip(tokens[0::2], tokens[1::2]))

    args, kwargs = blast_program, {'query': self.infile,
                                   'db': database,
                                   'out': self.outfile}
    # The translated programs must be tested first: 'tblastn' also ends
    # with 'blastn' and 'tblastx' with 'blastx', so testing the shorter
    # suffix first would make the tblastn/tblastx branches unreachable.
    if blast_program.endswith('tblastn'):
        blast_cmd = NcbitblastnCommandline(args, **kwargs)
    elif blast_program.endswith('tblastx'):
        blast_cmd = NcbitblastxCommandline(args, **kwargs)
    elif blast_program.endswith('blastn'):
        blast_cmd = NcbiblastnCommandline(args, **kwargs)
    elif blast_program.endswith('blastp'):
        blast_cmd = NcbiblastpCommandline(args, **kwargs)
    elif blast_program.endswith('blastx'):
        blast_cmd = NcbiblastxCommandline(args, **kwargs)
    else:
        return

    if options:
        try:
            for key in options:
                blast_cmd.set_parameter(key, options[key])
        except ValueError as e:
            messagebox.showerror('xbb tools',
                                 'Commandline error:\n\n' + str(e))
            self.tid.destroy()
            return

    self.worker = BlastWorker(blast_cmd)
    self.worker.start()

    self.UpdateResults()
def RunCommand(self):
    """Run the BLAST search described by ``self.command_data``.

    ``self.command_data`` is read as (query sequence, BLAST program,
    database, extra options). The query is written to a temporary FASTA
    file, the matching command line wrapper is created, the extra options
    (a whitespace separated ``key value ...`` string) are applied and a
    ``BlastWorker`` is started.

    Returns None. Nothing is started when the program name is not
    recognised or when an extra option is rejected with ``ValueError``.
    """
    self.fh_in, self.infile = tempfile.mkstemp()
    self.fh_out, self.outfile = tempfile.mkstemp()
    with open(self.infile, "w+") as f:
        f.write(">Name\n")
        f.write(self.command_data[0])
    blast_program = self.command_data[1]
    database = self.command_data[2]

    # Check if user supplied additional options and extract them.
    # Tokens are paired up (key, value); a trailing unpaired token is
    # dropped.
    options = {}
    if self.command_data[3]:
        tokens = self.command_data[3].split()
        options = dict(zip(tokens[0::2], tokens[1::2]))

    args, kwargs = blast_program, {"query": self.infile,
                                   "db": database,
                                   "out": self.outfile}
    # Test the translated programs first: "tblastn" also ends with
    # "blastn" and "tblastx" with "blastx", so the shorter suffixes would
    # otherwise shadow them and those branches could never run.
    if blast_program.endswith("tblastn"):
        blast_cmd = NcbitblastnCommandline(args, **kwargs)
    elif blast_program.endswith("tblastx"):
        blast_cmd = NcbitblastxCommandline(args, **kwargs)
    elif blast_program.endswith("blastn"):
        blast_cmd = NcbiblastnCommandline(args, **kwargs)
    elif blast_program.endswith("blastp"):
        blast_cmd = NcbiblastpCommandline(args, **kwargs)
    elif blast_program.endswith("blastx"):
        blast_cmd = NcbiblastxCommandline(args, **kwargs)
    else:
        return

    if options:
        try:
            for key in options:
                blast_cmd.set_parameter(key, options[key])
        except ValueError as e:
            messagebox.showerror("xbb tools",
                                 "Commandline error:\n\n" + str(e))
            self.tid.destroy()
            return

    self.worker = BlastWorker(blast_cmd)
    self.worker.start()

    self.UpdateResults()
def directed_local_alignment(proteome_file, ref_seq_file, proteome_name,
                             workdir, mode, source, strand,
                             per_proteome_sequences):
    """
    This function does the main lifting, running Supermatcher or BLAST on
    your proteomes/genomes.

    :param proteome_file: sequence file to search in (for BLAST a database
        with the same base name must sit in the same directory)
    :param ref_seq_file: query sequence file
    :param proteome_name: name given to the returned records; suffixed with
        ``_<index>`` when several hits are returned
    :param workdir: directory for the temporary alignment/FASTA files
    :param mode: "supermatcher" or "BLAST" (case-insensitive)
    :param source: "protein"/"proteome" or "nucleotide"/"genome"
    :param strand: Supermatcher only; falsy means search the reverse
        complement of every record
    :param per_proteome_sequences: maximum number of hits to keep, or None
        for all of them
    :returns: list of tuples ``(score, SeqRecord, identity_fraction, evalue,
        length_fraction)``; Supermatcher results carry ``0`` and ``1`` in
        the last two positions. Empty when the proteome has no records.
    :raises NoMatchForSeqException: when the alignment output cannot be
        parsed
    :raises BaseException: when ``mode`` is not a supported algorithm
    """
    # Result file shared by both algorithms
    alignment_file = os.path.join(workdir, "curr_alignment.aln")
    source_kind = source.lower()
    is_protein = source_kind in ("protein", "proteome")
    is_nucleotide = source_kind in ("nucleotide", "genome")
    # Hits get an index suffix only when more than one hit may be returned
    multiple_allowed = (not per_proteome_sequences
                        or per_proteome_sequences > 1)

    # If Supermatcher was chosen as the alignment algorithm
    if mode.lower() == "supermatcher":
        # Unlike BLAST, Supermatcher only searches one strand, so a temp
        # file holding either the plus or the minus strand (depending on
        # the "strand" parameter) is used as the supermatcher bsequence.
        records = open_proteome(proteome_file)
        if not records:
            return []
        if not strand:
            records = [
                i.reverse_complement(id=True,
                                     name=True,
                                     description=True,
                                     features=True,
                                     annotations=True,
                                     letter_annotations=True,
                                     dbxrefs=True) for i in records
            ]
        proteome_fasta_file = os.path.join(workdir, "curr_proteome.fasta")
        SeqIO.write(records, proteome_fasta_file, "fasta")

        # Pick the scoring matrix
        if is_protein:
            matrix = "EBLOSUM62"  # AA matrix
        elif is_nucleotide:
            matrix = "EDNAFULL"

        cmd = SuperMatcherCommandline(asequence=ref_seq_file,
                                      bsequence=proteome_fasta_file,
                                      gapopen=10,
                                      gapextend=0.5,
                                      datafile=matrix,
                                      outfile=alignment_file)
        stdout, stderr = cmd()

        alignments = []
        try:
            # List of MultipleSeqAlignment objects (may be empty)
            align_seq_list = list(AlignIO.parse(alignment_file,
                                                "amir_emboss"))
            # Only the first alignment is used if
            # per_proteome_sequences == 1, all of them if it is None
            for number, alignment in enumerate(
                    align_seq_list[0:per_proteome_sequences]):
                # SeqRecord objects, [0] is query and [1] is sbjct
                align_seq = alignment[1]
                # remove gaps
                align_seq._set_seq(align_seq.seq.ungap("-"))
                if multiple_allowed and len(align_seq_list) > 1:
                    usable_name = proteome_name + "_" + str(number)
                else:
                    usable_name = proteome_name
                align_seq.name = usable_name
                align_seq.id = usable_name
                score, identity_percentage = parse_supermatcher_result(
                    alignment_file, number)
                # 1 is given arbitrarily as gene_percentage
                alignments.append(
                    (score, align_seq, identity_percentage, 0, 1))
            # delete temp files
            os.remove(alignment_file)
            os.remove(proteome_fasta_file)
            return alignments
        # A tuple is required here: "except ValueError or IndexError"
        # evaluates to "except ValueError" and lets IndexError escape.
        except (ValueError, IndexError):
            raise NoMatchForSeqException(proteome_fasta_file, ref_seq_file)

    # If BLAST was chosen as the alignment algorithm
    elif mode.upper() == "BLAST":
        # XXXX.file_type.gz and XXXX.file_type both map to db name XXXX
        if proteome_file.endswith(".gz"):
            db_name_temp = ".".join(proteome_file.split(".")[:-2])
        else:
            db_name_temp = ".".join(proteome_file.split(".")[:-1])
        directory = os.path.dirname(proteome_file)
        db_base = os.path.basename(db_name_temp)
        # Find the database index file (.nhr nucleotide / .phr protein)
        for dir_file in os.listdir(directory):
            if db_base in dir_file and ".nhr" in dir_file:
                db_name = directory + "/" + dir_file.split(".nhr")[0]
            elif db_base in dir_file and ".phr" in dir_file:
                db_name = directory + "/" + dir_file.split(".phr")[0]

        records = open_proteome(proteome_file)
        if not records:
            return []

        # Build matching command
        if is_protein:
            cmd = NcbiblastpCommandline(query=ref_seq_file,
                                        db=db_name,
                                        out=alignment_file,
                                        outfmt=5)
        elif is_nucleotide:
            cmd = NcbiblastnCommandline(query=ref_seq_file,
                                        db=db_name,
                                        out=alignment_file,
                                        outfmt=5,
                                        task="blastn")
        stdout, stderr = cmd()

        # Open result; the context manager closes the handle even when
        # parsing fails.
        try:
            with open(alignment_file) as result_handle:
                blast_record = list(NCBIXML.parse(result_handle))[0]
        except ValueError:
            raise NoMatchForSeqException(proteome_file, ref_seq_file)

        alignments = []
        try:
            # Only the first alignment is used if
            # per_proteome_sequences == 1, all of them if it is None
            for number, alignment in enumerate(
                    blast_record.alignments[0:per_proteome_sequences]):
                # HSP contains all the details about the alignment
                hsp = alignment.hsps[0]
                align_length = hsp.align_length
                identity_percentage = float(hsp.identities) / align_length
                if multiple_allowed and len(blast_record.alignments) > 1:
                    name = proteome_name + "_" + str(number)
                else:
                    name = proteome_name
                # fraction of the query covered by the alignment
                percentage = float(align_length) / blast_record.query_letters
                align_seq = SeqRecord(Seq(hsp.sbjct, IUPAC.protein),
                                      id=name,
                                      name=name,
                                      description=name)
                align_seq._set_seq(align_seq.seq.ungap("-"))  # remove gaps
                alignments.append((hsp.score, align_seq,
                                   identity_percentage, hsp.expect,
                                   percentage))
        except IndexError:
            # An alignment without HSPs ends the collection; whatever was
            # gathered so far is returned.
            pass
        # remove temp file
        os.remove(alignment_file)
        return alignments

    # not BLAST or Supermatcher!
    else:
        raise BaseException(
            "Only Supermatcher and BLAST modes are currently supported.")
def final_fasta(self, path_to_fasta):
    """Write the final FASTA for ``path_to_fasta``.

    Every record whose id does not contain "Contig" is copied to
    ``self.path_to_final``. For prefixes that only occur among the "Contig"
    records (no "_TR_1_x_" record shares them), the candidate contigs are
    taken from the file of the same name in ``self.path_to_prime``, the
    copied records are searched against them with blastn, and the contig
    selected by ``self.__get_best_contig`` is appended to the final FASTA.

    :param path_to_fasta: path object (``.name`` and ``.stem`` are used)
    :returns: None
    """
    records_id = [
        record.id for record in SeqIO.parse(path_to_fasta, "fasta")
    ]
    rank_prefixes = {
        rank_id.split("_")[0] for rank_id in records_id
        if "_TR_1_x_" in rank_id
    }
    other_id = [other for other in records_id if "Contig" in other]
    # Set copy for the membership test in the record loop below
    other_id_lookup = set(other_id)
    require_pref = {oth_id.split("_")[0]
                    for oth_id in other_id} - rank_prefixes

    ffasta_path = self.path_to_final.joinpath(path_to_fasta.name)
    prime_tmp_fasta_path = self.path_to_prime.joinpath(
        f"{path_to_fasta.stem}_tmp.fasta")
    prime_fasta_path = self.path_to_prime.joinpath(path_to_fasta.name)

    with open(ffasta_path, "w") as ffasta:
        for record in SeqIO.parse(path_to_fasta, "fasta"):
            if record.id not in other_id_lookup:
                SeqIO.write(record, ffasta, "fasta")

    if not require_pref:
        return

    tmp_id = self.__tmp_other_id(require_pref, other_id)
    with open(prime_tmp_fasta_path, "w") as tmp, \
            open(prime_fasta_path) as src:
        for record in SeqIO.parse(src, "fasta"):
            if record.id in tmp_id:
                SeqIO.write(record, tmp, "fasta")

    cols = ["qseqid", "sseqid", "slen", "qcovhsp"]
    outfmt = "6 qseqid sseqid slen qcovhsp"
    cline = NcbiblastnCommandline(query=ffasta_path,
                                  subject=prime_tmp_fasta_path,
                                  out="-",
                                  outfmt=outfmt,
                                  task=self.task)
    output = cline()[0].strip()
    rows = [line.split() for line in output.splitlines()]
    data_types = {
        "qseqid": str,
        "sseqid": str,
        "slen": int,
        "qcovhsp": float
    }
    # The columns are named at construction time, so no rename is needed.
    b_tab = pd.DataFrame(rows, columns=cols).astype(data_types)
    if b_tab.empty:
        return

    best_contig_id = self.__get_best_contig(b_tab)
    with open(ffasta_path, "a") as ffasta, \
            open(prime_tmp_fasta_path) as tmp:
        for record in SeqIO.parse(tmp, "fasta"):
            if record.id == best_contig_id:
                SeqIO.write(record, ffasta, "fasta")
def identify_DNA_chain(test_seq, evalue=100):
    '''
    performs local blast search via known NPS

    input:
        test_seq - either BIO.Seq object, or string
        evalue - blast evalue threshold (default 100)
    returns:
        (sequence_name, direction) where direction is 'DNAtop' or 'DNAbot';
        (None, None) when neither BLAST nor the telomeric repeat check
        identifies the sequence
    raises:
        TypeError if test_seq is neither a Bio.Seq nor a string

    # TODO: check for telomeric sequence is stupid
    '''
    # Merge all sequences to one
    nps_seeds = {
        'all': os.path.join(DATA_PATH, 'positioning_sequences',
                            'sequences.fasta')
    }
    nps_seq_records = []
    for nps_seed in nps_seeds.values():
        nps_seq_records.extend(SeqIO.parse(nps_seed, "fasta"))

    # remove gaps from MSA
    for record in nps_seq_records:
        record.seq = record.seq.ungap("-")

    n1 = str(uuid.uuid4())
    n2 = str(uuid.uuid4())

    # Accept both documented input types; the previous combined condition
    # rejected a Seq instance with TypeError.
    if isinstance(test_seq, str):
        test_seq = Seq(test_seq)
    elif not isinstance(test_seq, Seq):
        raise TypeError("Test sequence must be either Bio.Seq or string")

    with tempfile.TemporaryDirectory() as TEMP:
        query_fasta = os.path.join(TEMP, n2 + '.fasta')
        xml_path = os.path.join(TEMP, n1 + '.xml')
        SeqIO.write([SeqRecord(test_seq, id='Query', name='Query')],
                    query_fasta, 'fasta')
        SeqIO.write(nps_seq_records, os.path.join(TEMP, n1 + '.faa'),
                    "fasta")
        os.system(
            'makeblastdb -dbtype nucl -in %s.faa -out %s.db > /dev/null' %
            (os.path.join(TEMP, n1), os.path.join(TEMP, n1)))
        blastn_cline = NcbiblastnCommandline(
            query=query_fasta,
            db=os.path.join(TEMP, n1 + '.db'),
            evalue=evalue,
            outfmt=5,
            strand='both',
            word_size=20,
            perc_identity=90,
            out=xml_path)
        stdout, stderr = blastn_cline(cwd=TEMP)
        # Parse while the temporary directory still exists and close the
        # handle afterwards.
        with open(xml_path, 'r') as xml_handle:
            blast_record = NCBIXML.read(xml_handle)

    snames = []
    evalues = []
    hsp_list = []
    for alignment in blast_record.alignments:
        for hsp in alignment.hsps:
            snames.append(alignment.title)
            evalues.append(hsp.expect)
            hsp_list.append(hsp)

    if evalues:
        # First hit carrying the lowest e-value
        best = evalues.index(min(evalues))
        nps_identified = snames[best].split()[1]
        strand = hsp_list[best].strand[1]
        # Stays None when the parser reports neither 'Plus' nor 'Minus';
        # previously that case failed with an unbound local at return.
        direction = None
        if strand == 'Plus':
            direction = 'DNAtop'
        elif strand == 'Minus':
            direction = 'DNAbot'
    elif 'TTAGGGTTAGGGTTAGGGTTAGGGTTAGGGTTAGGG' in str(test_seq):
        direction = 'DNAtop'
        nps_identified = 'telomeric_human'
    elif 'CCCTAACCCTAACCCTAACCCTAACCCTAACCCTAA' in str(test_seq):
        direction = 'DNAbot'
        nps_identified = 'telomeric_human'
    else:
        direction = nps_identified = None
    return nps_identified, direction
def blast(dbname, blast_program, query, evalue_threshold=0.001):
    """Run blastn (or tblastn) for one query sequence and collect the hits.

    :param dbname: BLAST database to search
    :param blast_program: 'tblastn' selects tblastn, anything else blastn
    :param query: raw query sequence (written to a temporary FASTA file)
    :param evalue_threshold: e-value cut-off handed to BLAST
    :returns: list of ``Blast_Result`` objects; empty list if BLAST exits
        with a non-zero status
    """
    with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
        infile = f.name
        f.write(">Query\n%s\n" % query)

    outfile = "%s.out.xml" % infile
    if blast_program == 'tblastn':
        wrapper = NcbitblastnCommandline
    else:
        wrapper = NcbiblastnCommandline
    blast_cl = wrapper(query=infile, db=dbname, evalue=evalue_threshold,
                       word_size=6, outfmt=5, out=outfile)

    cl = "%s/%s" % (settings.NCBI_BIN_DIR, str(blast_cl))
    # NOTE(review): splitting on single spaces presumably breaks when a
    # path in the command line contains a space - confirm.
    try:
        r = subprocess.call(cl.split(" "))
    finally:
        # The query file is no longer needed, even if the call failed.
        os.unlink(infile)

    if r != 0:
        # print() with one argument behaves the same on Python 2 and 3
        print("Blast failed: %s" % cl)
        # Do not leave a partial result file behind.
        if os.path.exists(outfile):
            os.unlink(outfile)
        return []

    results = []
    try:
        with open(outfile, "r") as handle:
            blast_record = NCBIXML.read(handle)
        for alignment in blast_record.alignments:
            accession = Blast_Accession(alignment.accession)
            for hsp in alignment.hsps:
                if accession.fragment_length is not None:
                    if hsp.sbjct_start > accession.fragment_length and \
                       hsp.sbjct_end > accession.fragment_length:
                        continue
                    # don't apply '% accession.fragment_length' to
                    # sbjct_start/end. Blast_Result#strand compares
                    # sbjct_start and sbjct_end to determine which strand
                    # the hit is on. Caller should just handle when
                    # sbjct_start/end is greater than fragment length.
                    # alternatively, we can store strand explicit, but that
                    # also creates complexity when using sbjct_start/end
                    # coordinates.
                results.append(
                    Blast_Result(fragment_id=accession.fragment_id,
                                 fragment_length=accession.fragment_length,
                                 hit_def=alignment.hit_def,
                                 query_start=hsp.query_start,
                                 query_end=hsp.query_end,
                                 subject_start=hsp.sbjct_start,
                                 subject_end=hsp.sbjct_end,
                                 evalue=hsp.expect,
                                 alignment=dict(
                                     query=hsp.query,
                                     match=hsp.match,
                                     matchi=inverse_match(hsp.match),
                                     subject=hsp.sbjct)))
    finally:
        os.unlink(outfile)
    return results
def test_primers(args):
    """BLAST the primers against the reference and optionally score them.

    Steps: build a nucleotide BLAST database from the reference, run a
    ``blastn-short`` search of the primers against it, write the hits with
    a header line to ``<primers>.blast.tsv`` and, unless ``args.skip_tm``
    is set, write the thermodynamic results to ``<primers>.blast.TM.tsv``.

    :param args: parsed arguments; ``Reference``, ``Primers``, ``Output``,
        ``processes``, ``tm_offset`` and ``tm_size`` are read as one-element
        sequences, ``skip_tm`` as a flag
    :returns: None
    """
    ref = args.Reference[0]
    primers = args.Primers[0]
    out = args.Output[0]
    nproc = args.processes[0]
    tm_offset = args.tm_offset[0]
    tm_size = args.tm_size[0]
    skip_tm = args.skip_tm

    # File Handler
    iTFH = TFH(ref, primers, out)
    primers_base = os.path.basename(iTFH.primers)

    # 2. Run blastmakedb
    db = os.path.join(iTFH.outdir, os.path.basename(iTFH.ref) + ".db")
    cline = NcbimakeblastdbCommandline(dbtype="nucl",
                                       input_file=iTFH.ref,
                                       out=db)
    print("Building BLAST Database...")
    print(cline)
    run(str(cline))

    # 3. Run short-blast
    result = os.path.join(iTFH.outdir, primers_base + ".blast.tsv")
    result_tmp = os.path.join(iTFH.outdir, primers_base + ".tmp")
    cline = NcbiblastnCommandline(
        query=iTFH.primers,
        db=db,
        task="blastn-short",
        num_threads=nproc,
        outfmt="6 qseqid sseqid sstart send mismatch qlen length pident qseq sseq",
        out=result_tmp)
    print("Running short-BLAST...")
    print(cline)
    run(str(cline))

    # Prepend the header to the raw BLAST table; both handles are closed
    # by their context managers.
    with open(result, "w") as f:
        f.write(
            "#PrimerName\tTargetName\tTargetStart\tTargetEnd\t#Mismatches\tPrimerLength\tAlignedLength\t%Identity\tPrimerSeq\tContigSeq\n"
        )
        with open(result_tmp, "r") as raw_hits:
            f.writelines(raw_hits)
    os.remove(result_tmp)

    if skip_tm:
        return

    # 4. Thermodynamics of BLAST results
    print("Running thermodynamic check on blast results...")
    tm_result_file = os.path.join(iTFH.outdir,
                                  primers_base + ".blast.TM.tsv")
    chunks = to_chunks(result, nproc)
    tm_result = run_thermodynamics(chunks, nproc, iTFH.ref, tm_offset,
                                   tm_size)
    with open(tm_result_file, "w") as f:
        f.write(
            "#PrimerName\tTargetName\tTargetStart\tTargetEnd\t#Mismatches\tPrimerLength\tAlignedLength\t%Identity\tPrimerSeq\tContigSeq\tPrimerTM\tHeteroDimerTM\tHeteroDimerDG\t3EndStabilityTM\t3EndStabilityDG\n"
        )
        for line in tm_result:
            f.write(line + "\n")
def blast_product(product, tmp_dir, db, string_min, string_max, subunit_length):
    """BLAST a candidate product against ``db`` and classify every hit.

    An alignment with exactly one HSP is classified from that HSP as
    'Self alignment', 'Conflicting hits', 'Match exceeding subunit length'
    or 'Good'; any other HSP count is marked 'Multiple HPSs'.

    :param product: record written as the BLAST query
    :param tmp_dir: directory in which a scratch directory is created
    :param db: BLAST database name
    :param string_min: lower identity bound (percent) for a conflicting hit
    :param string_max: upper identity bound (percent) for a conflicting hit
    :param subunit_length: longest tolerated run of identical bases
    :returns: ``(blast_data, None)`` on success, ``('', stderr_text)`` when
        blastn fails with ApplicationError
    """
    blast_dir = mkdtemp(dir=tmp_dir)
    queryFileName = blast_dir + '/query'
    outFileName = blast_dir + '/output.xml'
    SeqIO.write(product, queryFileName, 'fasta')

    cline = NcbiblastnCommandline(
        cmd='blastn', query=queryFileName, out=outFileName, outfmt=5,
        db=db, evalue=0.01)
    try:
        stdout, stderr = cline()
    except ApplicationError as err:
        # Do not leave the scratch directory behind on failure.
        shutil.rmtree(blast_dir, ignore_errors=True)
        return('', err.stderr)

    with open(outFileName) as result_handle:
        blast_record = NCBIXML.read(result_handle)

    midline_regex = re.compile(r"\|{20,}")
    alignment_status = ''
    self_alignments = []
    conflicting_alignments = []
    matching_alignments = []
    # NOTE(review): one list is shared by all alignments, so every
    # alignment's 'reasons' entry refers to the same cumulative list.
    reasons = []

    selfhits = 0      # number of self hits
    conflicting = 0   # number of conflicting hits
    matching = 0      # hits with a match exceeding subunit length

    for alignment in blast_record.alignments:
        alignment_data = {
            'accession': alignment.hit_id,
            'description': alignment.hit_def,
            'subj_length': alignment.length,
        }

        hsp_count = 0
        hsp_idents = []
        hsp_match_lengths = []  # lengths of consecutive bases...
        hsp_alignments = []
        hsp_hit_lengths = []

        # Original RNAit implementation reports single value for identity,
        # which is tricky without tiling HSPs. We'll use some slightly
        # different criteria here.
        # >1 hsp suggests the alignment is to a repetitive sequence which
        # is unlikely to amplify cleanly so mark these as bad
        for hsp in alignment.hsps:
            hsp_count += 1
            # check for matches of >20bp identity by checking for stretches
            # of >20 '|' characters in the HSP midline
            match = midline_regex.search(hsp.match)
            # search() returns None when the midline has no such stretch;
            # that is a run length of 0, not an error.
            match_len = match.end() - match.start() if match else 0
            hsp_idents.append(hsp.identities / hsp.align_length)
            hsp_match_lengths.append(match_len)
            hsp_hit_lengths.append(hsp.align_length)
            # pretty format alignment
            hsp_alignments.append(format_alignment(hsp))

        if hsp_count == 1:
            length_cov = hsp_match_lengths[0] / blast_record.query_letters
            if hsp_idents[0] > 0.99 and length_cov == 1:
                alignment_status = 'Self alignment'
                self_alignments.append(alignment_data)
                selfhits += 1
                if selfhits > 1:
                    reasons.append('Multiple self hits')
            elif (hsp_idents[0] * 100 > string_min
                  and hsp_idents[0] * 100 < string_max):
                alignment_status = 'Conflicting hits'
                conflicting_alignments.append(alignment_data)
                conflicting += 1
                reasons.append("Identity is %s" % (hsp_idents[0]))
            elif hsp_match_lengths[0] > subunit_length:
                alignment_status = 'Match exceeding subunit length'
                matching_alignments.append(alignment_data)
                matching += 1
                reasons.append(
                    "%s bp identical sequence" % (hsp_match_lengths[0]))
            else:
                alignment_status = 'Good'
        else:
            alignment_status = 'Multiple HPSs'

        hsp_idents = list(map(format_ident, hsp_idents))

        alignment_data['status'] = alignment_status
        alignment_data['reasons'] = reasons
        alignment_data['hsps'] = hsp_count
        alignment_data['ident'] = ";".join(map(str, hsp_idents))
        alignment_data['hsp_alignments'] = hsp_alignments
        alignment_data['hsp_hit_lengths'] = ";".join(
            map(str, hsp_hit_lengths))

    if selfhits > 1 or conflicting or matching:
        primer_status = 'Bad'
    else:
        primer_status = 'Suitable'

    blast_data = {
        'record': blast_record,
        'primer_status': primer_status,
        'self_hits': selfhits,
        'self_alignments': self_alignments,
        'conflicting_alignments': conflicting_alignments,
        'matching_alignments': matching_alignments,
    }

    shutil.rmtree(blast_dir)
    return(blast_data, None)
# Make Blast DB
leader_filename = '../Genome/' + ECgenome
blast_database_file = leader_filename + ".fasta"
# The database itself is expected to exist already; it can be built with:
#   makeblastdb -in <leader_filename>.fasta -dbtype nucl
#       -title <leader_filename>_BLAST_DB -out <leader_filename>_BLAST_DB

#
# Do BLAST via BioPython with reading in XML format (although file is larger
# than necessary in this case ... does make it more portable).
#
blast_output_file = blast_leader + '_' + ECgenome + ".blast_output"
blastn_cline = NcbiblastnCommandline(query=blast_input_file,
                                     db=leader_filename + "_BLAST_DB",
                                     task='blastn-short',
                                     evalue='0.1',
                                     outfmt=5,
                                     out=blast_output_file)
print(blastn_cline)

# Should not really produce standard output or error.
stdout, stderr = blastn_cline()

# NCBIXML.parse() is lazy, so the records are consumed while the file is
# still open; the handle is closed as soon as the map is built.
with open(blast_output_file) as result_handle:
    blast_records = NCBIXML.parse(result_handle)
    # Keyed by the part of the query title before the first ':'
    recordmap = {
        record.query.split(':')[0]: record
        for record in blast_records
    }
# For every entry of `testseqs` (whitespace separated "name sequence"): make
# sure `identdict` has a dict for the name, record the name in `seqnams`,
# write the sequence to query.txt, convert ancestorT1.nex with
# convertformat.py, blastn query.txt against ancestorT1.fasta (XML output in
# test.xml) and append each HSP bit score to identdict[name][node], where
# `node` is the last word of the alignment title.
# NOTE(review): nesting cannot be recovered from this line-collapsed form,
# and the final `for nam in identdict:` loop appears to continue past what is
# visible here - confirm against the original file.
# NOTE(review): `result_handle` is opened but no close() is visible.
for item in testseqs: nam = item.split()[0] if nam not in identdict: identdict[nam] = {} seqnams.add(nam) fi = open("query.txt", 'w') fi.write(item.split()[1]) fi.close() for i in range(1, 2): call([ "python", "convertformat.py", "-i", "nexus", "fasta", "ancestorT{:d}.nex".format(i) ]) blastn_run = NcbiblastnCommandline( query="query.txt", subject="ancestorT{}.fasta".format(i), evalue=0.001, outfmt=5, out="test.xml") stdout, stderr = blastn_run() result_handle = open("test.xml") blast_records = NCBIXML.parse(result_handle) for rec in blast_records: for alignment in rec.alignments: node = alignment.title.split()[-1] if node not in identdict[nam]: identdict[nam][node] = [] for hsp in alignment.hsps: identdict[nam][node].append(hsp.bits) for nam in identdict: sumdict = []
def split_to_exons():
    """Cut the best hits into exons and extract the matching sequences.

    Reads the module-level settings ``separat_exons``, ``best_separate_exons``,
    ``best_hits``, ``probes``, ``blast_task``, ``result_file`` and
    ``result_file2``. The exons of every best hit are written to
    ``best_separate_exons`` and searched against a BLAST database built from
    ``probes``; the hit table is filtered, ordered and reduced to one hit per
    exon, then rewritten in place. Finally the hit region of each probe goes
    to ``result_file`` and the hit region of each exon to ``result_file2``.
    """
    print('Splitting best hits to exons...')

    def column(line, index):
        """Return one whitespace separated field of a hit line."""
        return line.split()[index]

    def exon_number(line):
        """Return the exon number encoded in the query name of a hit line."""
        return int(column(line, 0).split('-')[3].split('_')[1])

    with open(separat_exons) as exons_handle:
        exon_records = SeqIO.to_dict(
            SeqIO.parse(exons_handle, 'fasta', generic_dna))

    # One FASTA entry per exon of every best-hit locus
    with open(best_separate_exons, 'w') as exons_out:
        for best_hit in best_hits:
            locus = best_hit.split()[1].split('-')[0]
            probe = best_hit.split()[0]
            matching = [
                record for key, record in exon_records.items()
                if locus in key
            ]
            for record in matching:
                exons_out.write(f'>{probe}_{str(record.id)}\n{str(record.seq)}\n')

    blast_table = f'{best_separate_exons}_against_{probes}.txt'
    NcbimakeblastdbCommandline(dbtype='nucl',
                               input_file=probes,
                               out=probes,
                               parse_seqids=True)()
    NcbiblastnCommandline(
        task=blast_task,
        query=best_separate_exons,
        db=probes,
        out=blast_table,
        num_threads=4,
        outfmt='6 qaccver saccver pident qcovhsp evalue bitscore sstart send qstart qend'
    )()

    with open(blast_table) as table:
        raw_hits = table.readlines()

    # Keep only hits of an exon against the probe it was derived from
    cleaned_hits = [
        line for line in raw_hits
        if column(line, 0).split('_')[0] == column(line, 1)
    ]
    # Successive stable sorts: the last key applied is the primary one
    cleaned_hits.sort(key=lambda line: float(column(line, 5)), reverse=True)
    cleaned_hits.sort(key=lambda line: float(column(line, 4)))
    cleaned_hits.sort(key=lambda line: float(column(line, 2)), reverse=True)
    cleaned_hits.sort(key=lambda line: float(column(line, 3)), reverse=True)
    cleaned_hits.sort(key=exon_number)
    cleaned_hits.sort(key=lambda line: column(line, 0).split('-')[2])

    # First (best ranked) hit per exon only
    seen_exons = set()
    cleaned_dedup_hits = []
    for line in cleaned_hits:
        exon_id = column(line, 0)
        if exon_id in seen_exons:
            continue
        cleaned_dedup_hits.append(line)
        seen_exons.add(exon_id)
    cleaned_dedup_hits.sort(key=exon_number)
    cleaned_dedup_hits.sort(key=lambda line: column(line, 1).split('-')[1])

    with open(blast_table, 'w') as table:
        for line in cleaned_dedup_hits:
            table.write(line)

    with open(probes) as probes_handle:
        probes_as_dict = SeqIO.to_dict(
            SeqIO.parse(probes_handle, 'fasta', generic_dna))
    with open(best_separate_exons) as exons_handle:
        best_exons_as_dict = SeqIO.to_dict(
            SeqIO.parse(exons_handle, 'fasta', generic_dna))

    with open(result_file, 'w') as resultfile, \
            open(result_file2, 'w') as resultfile2:
        for line in cleaned_dedup_hits:
            fields = line.split()
            name_of_locus = fields[1]
            name_of_exon = fields[0]
            num_exon = fields[0].split('-')[3].split('_')[1]
            header = f'>{name_of_locus}_exon_{num_exon}'

            # Subject (probe) coordinates; reversed order means minus strand
            s_first, s_second = int(fields[6]), int(fields[7])
            if s_first > s_second:
                sequence = str(probes_as_dict[name_of_locus]
                               [s_second - 1:s_first].seq.reverse_complement())
            else:
                sequence = str(
                    probes_as_dict[name_of_locus][s_first - 1:s_second].seq)
            resultfile.write(f'{header}\n{sequence}\n')

            # Query (exon) coordinates, handled the same way
            q_first, q_second = int(fields[8]), int(fields[9])
            if q_first > q_second:
                sequence_opt = str(best_exons_as_dict[name_of_exon]
                                   [q_second - 1:q_first].seq.reverse_complement())
            else:
                sequence_opt = str(
                    best_exons_as_dict[name_of_exon][q_first - 1:q_second].seq)
            resultfile2.write(f'{header}\n{sequence_opt}\n')
    print('Done')
def _build_option_parser():
    """Create the command line parser used by main()."""
    usage = ''' usage: %prog [options] arg \nProgram parses blast XML file, translates exons boundaries (annotated on reference sequences) to query sequences (e.g. transcript)" '''
    parser = OptionParser(usage, version='%prog version 1.0')
    parser.add_option("-r", "--reference_fasta", dest="REFERENCE_FASTA",
                      help="reference in fasta format")
    parser.add_option("-g", "--gff_reference_fasta",
                      dest="GFF_REFERENCE_FASTA",
                      help="annotation for reference in gff3 format")
    parser.add_option("-q", "--query_fasta", dest="QUERY_FASTA",
                      help="query in fasta format")
    # This option has no default, so the help text must not advertise one.
    parser.add_option("-b", "--blast_db_path_and_name",
                      dest="BLAST_DB_PATH_AND_NAME", help="blast+ database")
    parser.add_option("-v", "--blast_xml_file", dest="BLAST_XML_FILE",
                      help="blast results in xml file format",
                      default="blast_out.xml")
    parser.add_option("-f", "--output_file", dest="OUTPUT_FILE",
                      help="output file", action="store", type="string",
                      default=str(__name__) + ".txt")
    parser.add_option("-o", "--output_folder", dest="OUTPUT_FOLDER",
                      help="output folder", default="./")
    parser.add_option("-e", "--e_value_thresh", dest="E_VALUE_THRESH",
                      help="threshold e-value", default=1e-8)
    # NOTE(review): any value given on the command line arrives as a
    # non-empty string and is therefore truthy - confirm intended usage.
    parser.add_option("-a", "--only_best_Alignment",
                      dest="ONLY_BEST_ALIGNMENT",
                      help="take only 1, best q-s pair", default=True)
    parser.add_option("-w", "--blast_word_size", dest="BLAST_WORD_SIZE",
                      help="blast word_size", default=11)
    parser.add_option("-n", "--blast_num_threads", dest="BLAST_NUM_THREADS",
                      help="number of threads", default=2)
    parser.add_option("-m", "--blast_match_score", dest="BLAST_MATCH_SCORE",
                      help="reward for nt match", default=1)
    parser.add_option("-s", "--blast_mismatch_score",
                      dest="BLAST_MISMATCH_SCORE",
                      help="penalty for nt mismatch", default=-3)
    parser.add_option("-y", "--blast_gap_open", dest="BLAST_GAP_OPEN",
                      help="cost of opening a gap", default=5)
    parser.add_option("-x", "--blast_gap_extend", dest="BLAST_GAP_EXTEND",
                      help="cost of gap extension", default=2)
    return parser


def _parse_gff_exons(gff_path, log):
    """Map every gene to {coordinate: [[exon_name, strand, last_exon], ...]}."""
    gff_dic = {}
    s_prev_gen = ""
    with open(gff_path, "r") as gff_ref_hlr:
        for line_gff in gff_ref_hlr:
            # GFF3 comment/directive lines and blank lines carry no feature
            if not line_gff.strip() or line_gff.startswith("#"):
                continue
            line_gff_list = line_gff.split("\t")
            gene_name = line_gff_list[0]
            exon_start = int(line_gff_list[3])
            exon_end = int(line_gff_list[4])
            exon_strand = line_gff_list[6]
            attributes = line_gff_list[8].split(";")
            exon_name = attributes[0].split("=")[1]
            last_exon = attributes[1].strip()
            info_pack = [exon_name, exon_strand, last_exon]

            coord_exons_dic = gff_dic.setdefault(gene_name, {})
            for x_coord in range(exon_start, exon_end + 1):
                coord_exons_dic.setdefault(x_coord, []).append(info_pack)

            if gene_name != s_prev_gen:
                s_prev_gen = gene_name
                log("Parsing gff for gene: {}\n".format(s_prev_gen))
    return gff_dic


def main(args=None):
    """Transfer exon boundaries from a reference annotation to the queries.

    Parses the reference GFF3, runs blastn of the query FASTA against the
    given BLAST database (XML output) and, for every HSP below the e-value
    threshold, writes the aligned query sequence to a FASTA file and the
    transferred exon annotation to a local and a global GFF3 file in the
    output folder.

    :param args: list of command line arguments; None or an empty list
        means "no arguments" (``sys.argv`` is not consulted)
    :returns: None
    """
    if args is None:
        args = []
    (options, arg) = _build_option_parser().parse_args(args)

    # --- Entering program
    t_st = time.time()
    if not os.path.isdir(options.OUTPUT_FOLDER):
        sys.stdout.write('\nWrong output directory!')
        return
    # NOTE(review): after this chdir (and the one below) relative paths,
    # including OUTPUT_FOLDER itself, resolve against the new working
    # directory - confirm this is intended.
    os.chdir(options.OUTPUT_FOLDER)

    logging_file = "log_output"
    if options.OUTPUT_FILE != "":
        logging_file = options.OUTPUT_FILE

    with open(options.OUTPUT_FOLDER + os.sep + logging_file + ".log",
              "w") as log_info_hlr:

        def log(message):
            """Write a message to stdout and to the log file."""
            sys.stdout.write(message)
            log_info_hlr.write(message)

        log("Entering program: {}\n".format(os.path.basename(__file__)))
        log("\nUsed options: {}\n".format("\n".join(
            str(options).split(","))))

        # --- workspace
        s = os.path.join(os.path.dirname(__file__), '.')
        os.chdir(s)
        print(os.getcwd())

        # --- parsing gff3 file
        log("Parsing {}\n".format(options.GFF_REFERENCE_FASTA))
        gff_dic = _parse_gff_exons(options.GFF_REFERENCE_FASTA, log)

        # --- blast analysis for each sequence
        from Bio.Blast.Applications import NcbiblastnCommandline
        blast_db_source = options.BLAST_DB_PATH_AND_NAME
        blastx_cline = NcbiblastnCommandline(
            query=options.QUERY_FASTA,
            db=blast_db_source,
            evalue=float(options.E_VALUE_THRESH),
            outfmt=5,
            out=options.BLAST_XML_FILE,
            word_size=options.BLAST_WORD_SIZE,
            num_threads=options.BLAST_NUM_THREADS,
            reward=options.BLAST_MATCH_SCORE,
            penalty=options.BLAST_MISMATCH_SCORE,
            gapopen=options.BLAST_GAP_OPEN,
            gapextend=options.BLAST_GAP_EXTEND)
        stdout, stderr = blastx_cline()

        # --- analysis of blast alignment
        out_file_core = "exons_alignment_by_blast_out"
        out_prefix = options.OUTPUT_FOLDER + os.sep + out_file_core
        evalue_limit = float(options.E_VALUE_THRESH)
        with open(out_prefix + "_local.gff3", "w") as out_local_gff3_hlr, \
                open(out_prefix + "_global.gff3", "w") as out_global_gff3_hlr, \
                open(out_prefix + ".fasta", "w") as out_fasta_hlr, \
                open(options.BLAST_XML_FILE) as result_handle:
            for blast_record in NCBIXML.parse(result_handle):
                for alignment in blast_record.alignments:
                    for hsp in alignment.hsps:
                        if hsp.expect < evalue_limit:
                            alignment_geneName = str(alignment.hit_def)
                            print('sequence:', alignment.title)
                            print("len hsp.query", len(hsp.query))
                            print("len hsp.sbjct", len(hsp.sbjct))
                            print("len hsp.match", len(hsp.match))
                            print("hsp.sbjct", str(hsp.sbjct))
                            print("hsp.match", str(hsp.match))
                            print("hsp.query", str(hsp.query))
                            # coordinates: subject
                            print("hsp.sbjct_start", hsp.sbjct_start)
                            print("hsp.sbjct_end", hsp.sbjct_end)
                            # coordinates: query
                            print("hsp.query_start", hsp.query_start)
                            print("hsp.query_end", hsp.query_end)

                            coord_exons_dic = gff_dic[alignment_geneName]
                            # generate alignment objects list
                            query_geneName = str(blast_record.query)
                            query_geneName_local = (
                                query_geneName + "__q[" +
                                str(hsp.query_start) + ":" +
                                str(hsp.query_end) + "]" + "_s[" +
                                str(hsp.sbjct_start) + ":" +
                                str(hsp.sbjct_end) + "]" + "_" +
                                alignment_geneName)
                            alignment_object_list = \
                                get_hsp_alignment_object_list(
                                    hsp, alignment_geneName, query_geneName,
                                    query_geneName_local)
                            query_seq = "".join(
                                [xx.q for xx in alignment_object_list])
                            out_fasta_hlr.write(
                                ">" + query_geneName_local + "\n")
                            out_fasta_hlr.write(query_seq + "\n")
                            # set exons info into alignment objects
                            for al_obj_in_hsp in alignment_object_list:
                                if al_obj_in_hsp.position_subject in coord_exons_dic:
                                    al_obj_in_hsp.set_exons(coord_exons_dic[
                                        al_obj_in_hsp.position_subject])
                            # global & local gff output
                            extract_and_write_gff(alignment_object_list,
                                                  out_global_gff3_hlr,
                                                  out_local_gff3_hlr)
                    # NOTE(review): placed so that only the first (best)
                    # query-subject pair is processed, following the
                    # option's help text - confirm against the original
                    # indentation.
                    if options.ONLY_BEST_ALIGNMENT:
                        break

    # --- closing program
    t_end = time.time()
    sys.stdout.write("\n\nWork done...")
    sys.stdout.write("\nProcess time [s]: " + str(t_end - t_st))
# Appends a FASTA entry for `gene` to db_fna and db_trna, writes db_fna,
# db_faa and db_trna to blast_db/db.fna, blast_db/db.faa and
# blast_db/db_tRNA.fna, builds one BLAST database per file with
# makeblastdb.exe, then runs each FASTA file against its own database and
# stores the tabular results under blast_output/.
# NOTE(review): the first two statements look like the tail of a loop whose
# header is outside this view - confirm their indentation.
# NOTE(review): `gapope` in the outfmt strings is presumably meant to be the
# BLAST+ specifier `gapopen` - confirm before changing, downstream parsing
# may depend on the current column layout.
# NOTE(review): aa_blast drives blastp.exe through NcbiblastnCommandline;
# the protein wrapper is probably what was intended - confirm.
db_fna += ">" + gene.name_id + '\n' + str(gene.seq) + '\n' db_trna += ">" + gene.name_id + '\n' + str(gene.seq) + '\n' with open("blast_db/db.fna", 'w') as outfile_fna: outfile_fna.write(db_fna) with open("blast_db/db.faa", 'w') as outfile_faa: outfile_faa.write(db_faa) with open("blast_db/db_tRNA.fna", 'w') as outfile_trna: outfile_trna.write(db_trna) #Call BLAST to make blastDBs subprocess.call("makeblastdb.exe -in blast_db/db.fna -dbtype nucl -out blast_db/nt_db", shell=True) subprocess.call("makeblastdb.exe -in blast_db/db.faa -dbtype prot -out blast_db/aa_db", shell=True) subprocess.call("makeblastdb.exe -in blast_db/db_tRNA.fna -dbtype nucl -out blast_db/trna_db", shell=True) #Names of BLAST dbs #blast_db/nt_db #blast_db/aa_db #blast_db/trna_db ######################################### #BLAST the databases against themselves nt_blast = NcbiblastnCommandline(cmd="blastn.exe", task = 'dc-megablast', out="blast_output/nt_blast.txt", outfmt='"6 qseqid sseqid pident qlen length mismatch gapope evalue bitscore qcovhsp"' , query= "blast_db/db.fna", db="blast_db/nt_db") aa_blast = NcbiblastnCommandline(cmd="blastp.exe", out="blast_output/aa_blast.txt", outfmt='"6 qseqid sseqid pident qlen length mismatch gapope evalue bitscore qcovhsp"' , query= "blast_db/db.faa", db="blast_db/aa_db") trna_blast = NcbiblastnCommandline(cmd="blastn.exe", task = 'dc-megablast', out="blast_output/trna_blast.txt", outfmt='"6 qseqid sseqid pident qlen length mismatch gapope evalue bitscore qcovhsp"' , query= "blast_db/db_tRNA.fna", db="blast_db/trna_db") nt_blast() aa_blast() trna_blast() ################################################################
def blastSingle(item, query_virus_dir, output_dir, seqid, numThreads):
    """Run blastn for one query file and summarise its hits per host genome.

    The query ``item`` (a file name inside ``query_virus_dir``) is searched
    against the module-level ``db_host_prefix`` database; the tabular output
    is written to ``<output_dir>/<query_name>.blast`` where ``query_name`` is
    the part of ``item`` before its first dot.

    :param item: file name of the query sequence
    :param query_virus_dir: directory containing ``item``
    :param output_dir: directory receiving the ``.blast`` output file
    :param seqid: value for blastn's ``seqidlist`` option, or None to omit it
    :param numThreads: value for blastn's ``num_threads`` option
    :returns: ``(False, None)`` when the BLAST output file is empty, otherwise
        ``(True, frame)`` where ``frame`` is a one-row DataFrame labelled
        ``query_name``
    """
    query_name = item.split('.')[0]
    query_file = os.path.join(query_virus_dir, item)
    output_file = os.path.join(output_dir, query_name) + '.blast'

    # One option set for both cases; seqidlist is only added when requested.
    blast_options = dict(
        query=query_file,
        db=db_host_prefix,
        out=output_file,
        outfmt="6 qacc sacc qstart qend qlen",
        evalue=0.01,
        gapopen=10,
        penalty=-1,
        reward=1,
        gapextend=2,
        word_size=11,
        perc_identity=90,
        num_threads=numThreads)
    if seqid is not None:
        blast_options['seqidlist'] = seqid
    NcbiblastnCommandline(**blast_options)()

    # Parse blast results for a single file
    if os.stat(output_file).st_size == 0:
        return False, None

    # Length of the whole query file as read (every character is counted).
    with open(query_file) as handle:
        query_len = len(handle.read())

    hits = pd.read_table(output_file, header=None)
    # Map subject accessions (column 1) to genome names.
    hits[1] = [dict_genome[accession] for accession in list(hits[1])]

    # Per (query, genome): 0-based start/end tuples plus the minimum of column 4.
    positions = hits.groupby([0, 1]).agg({
        2: lambda x: tuple(x - 1),
        3: lambda x: tuple(x - 1),
        4: min
    })
    positions.index = positions.index.droplevel()

    coverage = positions.apply(lambda x: cal_perc(x), axis=1) / query_len
    per_genome = coverage.groupby(level=0, sort=False).apply(sum)
    return True, pd.DataFrame({query_name: per_genome}).T
def run_BLAST(query, database, args, cons_run):
    """
    Given a mfa of query sequences of interest & a database, search for them.

    Important to note:
        * Turns the low-complexity filter off (``dust`` for blastn, ``seg``
          for tblastn/tblastx),
        * Only a single target sequence (top hit),
        * Output in XML format as blast.xml.

    # TODO: Add evalue filtering ?
    # TODO: add task='blastn' to use blastn scoring ?

    .. warning:: default is megablast

    .. warning:: tblastx functionality has not been checked

    :param query: the fullpath to the vf.mfa
    :param database: the full path of the database to search for the vf in
    :param args: the arguments parsed to argparse
    :param cons_run: part of a mapping consensus run

    :type query: string
    :type database: string
    :type args: argparse args (dictionary)
    :type cons_run: boolean

    :returns: the path of the blast.xml file
    """
    tmp1 = os.path.splitext(query.split('/')[-1])[0]
    tmp2 = os.path.splitext(database.split('/')[-1])[0]
    prefix = "cons_DB=" if cons_run else "DB="
    outfile = os.path.join("BLAST_results/",
                           prefix + tmp1 + "ID=" + tmp2 + "_blast.xml")

    protein = False
    # File type not specified, determine using util.is_protein()
    if args.reftype is None:
        if SeqFindr.util.is_protein(query) != -1:
            protein = True
            # Newline added so the following "Using ..." message starts on
            # its own line, as in the explicit 'prot' branch below.
            sys.stderr.write('%s is protein\n' % (query))
    elif args.reftype == 'prot':
        protein = True
        sys.stderr.write('%s is protein\n' % (query))

    # Options shared by every search flavour.
    shared = {
        'query': query,
        'db': database,
        'outfmt': 5,
        'num_threads': args.BLAST_THREADS,
        'max_target_seqs': 1,
        'out': outfile,
    }

    if protein:
        sys.stderr.write('Using tblastn\n')
        run_command = NcbitblastnCommandline(seg='no', evalue=args.evalue,
                                             **shared)
    elif args.tblastx:
        sys.stderr.write('Using tblastx\n')
        run_command = NcbitblastxCommandline(seg='no', evalue=args.evalue,
                                             **shared)
    else:
        sys.stderr.write('Using blastn\n')
        # Deliberately `== False` (not `not args.short`): any other value,
        # including None, selects the short-query settings as before.
        if args.short == False:  # noqa: E712
            run_command = NcbiblastnCommandline(dust='no',
                                                evalue=args.evalue, **shared)
        else:
            sys.stderr.write('Optimising for short query sequences\n')
            # Short queries: small word size and a fixed, permissive e-value.
            run_command = NcbiblastnCommandline(dust='no', word_size=7,
                                                evalue=1000, **shared)

    sys.stderr.write(str(run_command)+"\n")
    run_command()
    return os.path.join(os.getcwd(), outfile)
sys.exit() mst_type = args.mst_type tmp_path = "tmp" os.mkdir(tmp_path) if "".join(mst_type) == "all": for i in range(1, 31): db_file = "mst" + str(i) + ".fasta" tmp_out = "mst" + str(i) + ".tab" if not os.path.isfile(args.db_directory / db_file): print("Missing database file {}".format(db_file)) sys.exit() else: query = NcbiblastnCommandline(query=args.query_sequence, db=args.db_directory / db_file, evalue=0.001, outfmt=6, out=tmp_path / tmp_out, ungapped=True) else: for i in mst_type: if int(i) not in list(range(1, 31)): print("Invalid argument MST type {}".format(i)) else: db_file = "mst" + str(i) + ".fasta" tmp_out = "mst" + str(i) + ".tab" if not os.path.isfile(args.db_directory / db_file): print("Missing database file {}".format(db_file)) sys.exit() else: query = NcbiblastnCommandline(query=args.query_sequence, db=args.db_directory / db_file,
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML
import pandas as pd
import csv
import os.path

# Input FASTA files and the BLAST XML file that links the two steps below.
thermal = "16S_thermophile.fasta"
meso = "16S_mesophile.fasta"
output = "16S_blast.xml"

# Set to True to (re)run the BLAST search; otherwise an existing XML is parsed.
_new = False

if _new:
    # 16S gene blast
    NcbiblastnCommandline(query=thermal, subject=meso, outfmt=5, out=output)()[0]
    print("blast finished!")

# Both handles are closed even if parsing fails part-way through.
with open(output, "r") as xml_handle, open("16S_blast_org.csv", 'w') as f:
    # csv.writer quotes fields that contain commas (hit definitions often do),
    # so every row keeps exactly six columns.
    writer = csv.writer(f, lineterminator="\n")
    writer.writerow(
        ["query_seq", "hit_seq", "hit_len", "identity", "score", "evalue"])
    blast_records = NCBIXML.parse(xml_handle)
    for blast_record in blast_records:
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                # Keep only gapped HSPs between differently named sequences.
                if hsp.gaps != 0 and blast_record.query != alignment.hit_def:
                    writer.writerow([
                        blast_record.query,
                        alignment.hit_def,
                        hsp.align_length,
                        hsp.identities,
                        hsp.score,
                        hsp.expect,
                    ])
#help(NcbiblastnCommandline)
import os
import sys

# The LAST command-line argument is taken as the data directory.
# Change this directory as needed. Note that local blast may experience issues
# if you're not working in the blast directory (here NCBI/blast-2.2.30+)
#directory = "C:/Users/madeleine/Documents/NCBI/phageParser/data"
directory = sys.argv[-1]

for fn in os.listdir("%s/spacers" % directory):
    query1 = "%s/spacers/%s" % (directory, fn)

    # Output name: everything before the first '.', plus ".txt".
    # Built from the stem rather than str.replace(), which would also rewrite
    # any earlier occurrence of the extension text inside the name, and
    # str.index(), which raises ValueError for names without a dot.
    stem = fn.partition('.')[0]
    outfile1 = "%s.txt" % stem
    outfile = "%s/phages/%s" % (directory, outfile1)

    # These parameters are more or less the same as the ones on PhagesDB.org
    blastn_obj = NcbiblastnCommandline(query=query1,
                                       db="phagedb",
                                       evalue=10,
                                       num_descriptions=100,
                                       num_alignments=100,
                                       dust="no",
                                       task="blastn",
                                       reward=1,
                                       penalty=-3,
                                       out=outfile)
    stdout, stderr = blastn_obj()
def blast_pacbio(input_data_filename, blast_database_name,
                 output_data_filename, max_hsps, max_target_seqs):
    """Run blastn on ``input_data_filename`` and write a custom tabular report.

    The search uses database ``blast_database_name`` and writes its result to
    ``output_data_filename``; ``max_hsps`` and ``max_target_seqs`` are passed
    straight through to blastn. Returns None.
    """
    # Tabular (format 6) columns, wrapped in double quotes because the
    # specifier contains spaces.
    quoted_format = '"%s"' % (
        '6 qseqid sseqid pident qcovhsp length mismatch gapopen '
        'qstart qend sstart send evalue bitscore staxids')

    run_blastn = NcbiblastnCommandline(
        cmd='blastn',
        query=input_data_filename,
        db=blast_database_name,
        max_hsps=max_hsps,
        max_target_seqs=max_target_seqs,
        outfmt=quoted_format,
        out=output_data_filename)
    run_blastn()
    return None
def blast2circos_file(self, blast, reference, blastn=False, identity_cutoff=80):
    '''
    Search ``blast`` against ``reference`` and write the hits as circos tracks.

    tblastn vs contigs by default, can be switched to blastn. A nucleotide
    BLAST database is built from ``reference`` first. Hits are written to
    ``circos_blast.txt`` (contig, start, end) and
    ``circos_blast_labels.txt`` (contig, start, end, gene) in the current
    directory; ``blast.tmp`` holds the raw tabular BLAST output.

    :param blast: path of the query fasta file
    :param reference: path of the reference (contigs) fasta file
    :param blastn: use blastn instead of tblastn when True
    :param identity_cutoff: minimum identity value (inclusive) for a hit to be written
    :return: None
    '''

    from TPutils import shell_command
    from TPutils import blast_utils
    from Bio.Blast.Applications import NcbitblastnCommandline
    from Bio.Blast.Applications import NcbiblastnCommandline

    # todo catch IO errors, other potential errors
    a, b, c = shell_command.shell_command(
        'makeblastdb -in %s -dbtype nucl' % (reference))
    print(c)

    if not blastn:
        blast_cline = NcbitblastnCommandline(
            query=blast,
            db=reference,
            evalue=0.00000001,  # 0.001
            outfmt=6,
            out="blast.tmp",
            max_target_seqs=1)
        print(blast_cline)
    else:
        blast_cline = NcbiblastnCommandline(query=blast,
                                            db=reference,
                                            evalue=0.001,
                                            outfmt=6,
                                            out="blast.tmp")

    stdout, stderr = blast_cline()

    print('############## BLAST ###################')

    blast2data, queries = blast_utils.remove_blast_redundancy(
        ["blast.tmp"], check_overlap=False)

    # Both output files are closed on every exit path, including errors.
    with open('circos_blast.txt', "w") as o, \
            open('circos_blast_labels.txt', "w") as l:
        for contig in blast2data:
            # self.contigs_add is keyed by the contig name without '|'.
            cname = contig.replace("|", "")
            offset = self.contigs_add[cname][0]
            for gene in blast2data[contig]:
                hit = blast2data[contig][gene]
                if float(hit[0]) >= identity_cutoff:  # 80,20
                    location = sorted(hit[1:3])
                    o.write("%s\t%s\t%s\n" % (contig,
                                              location[0] + offset,
                                              location[1] + offset))
                    l.write("%s\t%s\t%s\t%s\n" % (contig,
                                                  location[0] + offset,
                                                  location[1] + offset,
                                                  gene))
else: # Unsuccessful. Stdout will be '1'. Entry was not found or some other error blastn_log.error("There was an error with : %s" % str(Accession[2])) # Log it. break # Create/Open a XML file that stores BLAST data for a particular Organism. # By opening for writing, we can overwrite already existing xml files. save_file = open("%s_%s.xml" % (Accession[1], Org), "w") # Create a copy of the gi list file per taxonomy id to be used in blast os.system("cp " + h + "/data/gi-lists/" + TAX + "gi " + TAX + "gi") # Use Biopython's NCBIBlastnCommandline tool result_handle1 = NcbiblastnCommandline(query="temp.fasta", db="refseq_rna", strand="plus", evalue=0.001, # DONT GO LOWER out="%s_%s.xml" % (Accession[1], Org), outfmt=5, gilist=TAX + "gi", max_target_seqs=10, task="blastn") stdout_str, stderr_str = result_handle1() #blastn_log.info(result_handle1) # log the result handle as a check. # Remove the gi list obinary file from the current directory os.remove(TAX + "gi") blastn_log.info(TAX + "gi file has been deleted." + "\n") # Remove the temp.fasta file in the directory os.remove("temp.fasta") blastn_log.info("The temp.fasta file has been deleted." + "\n") blastn_log.info("%s_%s.xml" % (Accession[1],Org) + " is being parsed." + "\n") # ------------------------------------------------------------------------------
# if gene_name not in genefiles: seq1 = SeqRecord(gene_seq, id=gene_name, name=read.name, description=read.description.translate( string.maketrans( "", "", ), bad_chard)) gene = gene_dir + gene_name + ".fasta" SeqIO.write(seq1, gene, "fasta") # Run BLAST and parse the output as XML output = NcbiblastnCommandline( query=gene, subject="LacO_T0.fasta", dust="no", soft_masking="false", outfmt=5)()[0] # -dust no -soft_masking false -outmft 6 blast_result_record = NCBIXML.read(StringIO(output)) # Print some information on the result if len(blast_result_record.alignments) == 0: # no significant alignments lost_genes.update({gene_name: {'LacO': 'Lost'}}) else: tot_len = 0 for alignment in blast_result_record.alignments: for hsp in alignment.hsps: tot_len += hsp.align_length if hsp.align_length - hsp.identities > nmis: if gene_name not in lost_genes: lost_genes.update({
def map_primers_to_genome(self,blast_db,outfile=None,search_set=None,default_to_PCR=False,temp_dir = None, keep_temp=False, tolerance=1):
    """Locate PCR and sequencing primers in a genome via blastn-short.

    For every locus in ``search_set`` the PCR primers are BLASTed first and
    their best hits are grouped into candidate amplification ranges
    (``region_record`` objects). The sequencing primers of each subregion are
    then BLASTed and the hits that fall inside those ranges define the
    exported start/stop coordinates.

    :param blast_db: BLAST database passed to blastn as ``db``; its base name
        is also used when naming exported regions and kept temp tables
    :param outfile: if not None (an empty string is treated as None), the
        combined table of reported hits is written there as CSV
    :param search_set: loci to process; defaults to all keys of
        ``self.primers_dict``
    :param default_to_PCR: when True and a locus has no ``'All'`` entry in its
        ``'Seq'`` primers, the ``'All'`` PCR primers are used for sequencing
    :param temp_dir: working directory for the temporary query/result files;
        defaults to ``self.tempDirObj.name``
    :param keep_temp: when True, the BLAST table of every PCR primer is also
        saved in the working directory
    :param tolerance: PCR-primer hits with ``bit score >= tolerance * best``
        are treated as best hits
    :returns: dict mapping locus -> {subregion -> {region name ->
        {'contig', 'start', 'stop'}}, 'OuterRange' -> list of ranges}
    """
    workingDir = temp_dir if temp_dir is not None else self.tempDirObj.name
    if outfile == '':
        outfile = None
    if search_set == None:
        search_set = set(self.primers_dict.keys())
    # Scratch files, rewritten for every primer and removed at the end.
    temp_infile = os.path.join(workingDir,'tmp_primer.fasta')
    temp_outfile = os.path.join(workingDir,'tmp_primer_blast.fasta')
    blast_combined = blankBLASTtable()
    ql_head = 'query_length' #new column to add
    fh_head = 'forward hit'
    export_regions = dict() #name for region, coordinates of innermost nucleotide on outermost primers (draw data from seq_borders dict in the sequencing reaction)
    for locus in search_set:
        if locus not in self.primers_dict.keys():
            print("Error: {} is not in the set of primer loci".format(locus))
        # NOTE(review): after the error message above the lookup below still
        # runs for the unknown locus (there is no `continue`) - confirm intent.
        locus_dict = self.primers_dict[locus].copy() #so that I can modify it
        if default_to_PCR: #Make sure there are primers for sequencing the entire region
            seq_dict = locus_dict['Seq']
            # NOTE(review): if .copy() above is a shallow dict copy, this
            # assignment also changes the nested dict in self.primers_dict - verify.
            if 'All' not in seq_dict.keys():
                seq_dict['All'] = locus_dict['PCR']['All']
        export_regions[locus] = dict()
        ##Evaluate PCR dict first to find general range in which sequencing primers can bind
        PCR_dict = locus_dict['PCR']
        range_list = []
        ## Create a master range limit if specified
        has_range = ('range_contig' in locus_dict.keys() and 'range_from' in locus_dict.keys() and 'range_to' in locus_dict.keys())
        if has_range:
            master_range = region_record(locus_dict['range_contig'],locus_dict['range_from'],locus_dict['range_to'])
            range_list.append(master_range)
        ## Place BLAST hits into ranges
        for (subregion, subregion_dict) in PCR_dict.items(): ##Only one region: "all"
            for (primer,sequence) in subregion_dict.items():
                #Write query file
                my_seq = SeqRecord(Seq(sequence,IUPAC.ambiguous_dna),id="-".join([locus,'PCR',subregion,primer]))
                with open(temp_infile,"w") as fout:
                    SeqIO.write(my_seq,fout,'fasta')
                #Search BLAST
                blast_cline = NcbiblastnCommandline(query=temp_infile,db=blast_db,outfmt=6,out=temp_outfile,task='blastn-short',evalue=1,reward=1,penalty=-1,gapopen=3,gapextend=2)
                blast_cline() ##Should only print for errors
                blast_table = loadBLASTtableToDataFrame(temp_outfile)
                if keep_temp:
                    named_file = '{}_{}.tab'.format("-".join([locus,'PCR',subregion,primer]),os.path.basename(blast_db))
                    utilities.safeOverwriteTable(os.path.join(workingDir,named_file), blast_table, 'tab')
                ##Place best hits into ranges
                if len(blast_table) > 0:
                    ##Add some extra info to table
                    blast_table[ql_head] = len(my_seq)
                    # A hit is "forward" when subject start precedes subject end.
                    blast_table[fh_head] = blast_table['s. start'] < blast_table['s. end']
                    ## Limit table to best hits
                    best = blast_table.sort_values(by=['bit score'],ascending=False).iloc[0]
                    best_table = blast_table[blast_table['bit score'] >= tolerance*best['bit score']] #This may be too stringent; may need to revisit
                    ## Add best hits to ranges
                    for _,this_hit in best_table.iterrows():
                        finished = False #if we found a range for it
                        for this_range in range_list:
                            if not finished: #stop upon success or if range is exclusive
                                finished = this_range.try_add_primer(this_hit['subject id'],this_hit['s. start'],this_hit[fh_head],True)
                                if this_range.exclusive and not finished:
                                    # An exclusive range that rejects the hit ends the search for this hit.
                                    finished = True
                                    if len(best_table) == 1:
                                        print("Warning: an exclusive hit failed to map to the prespecified region. Please report to developer(s)")
                        if not finished:
                            # No existing range accepted the hit: start a new range with it.
                            new_range = region_record()
                            new_range.try_add_primer(this_hit['subject id'],this_hit['s. start'],this_hit[fh_head],True)
                            range_list.append(new_range)
                    ## Record best hits for reporting
                    blast_combined = pd.concat([blast_combined,best_table],sort=True)##Note: this is compatible with pandas 0.23 +; older versions will fail. Without sort, it makes FutureWarning and exception.
                else:
                    print("Warning: zero hits for {}".format(my_seq.id))
        ##Merge any ranges that are close/overlapping; test if ranges are valid (primer pairs)
        i = 0
        ValidRanges = set()
        while i < len(range_list):
            this_range = range_list[i]
            # Walk backwards so deleting range_list[j] does not disturb indices still to be visited.
            j = len(range_list)-1
            while j > i:
                merger = this_range.try_merge_regions(range_list[j])
                if merger:
                    print("Warning: this is an exceptional situation and has not been tested, please report to developer(s). Range merger")
                    del(range_list[j])
                j-=1
            #Test validity of this_range
            # Valid = has at least one forward and one reverse entry and min < max.
            if (len(this_range.For_list) > 0 and len(this_range.Rev_list) > 0):
                if this_range.get_min() < this_range.get_max():
                    ValidRanges.add(i)
            i+=1
        #Remove invalid ranges
        range_list = [range_list[i] for i in ValidRanges]
        #Report oddities
        if len(range_list) == 0:
            print("Warning: Unable to find an amplification region for {}".format(locus))
        # NOTE(review): only exactly two ranges trigger this warning; three or
        # more pass silently - confirm whether `>= 2` was intended.
        elif len(range_list) == 2:
            print("Warning: Detected multiple amplification regions for {}".format(locus))
        for this_range in range_list:
            vprint('\n'+locus + ": Potential amplicon region")
            vprint(this_range)
        ## Find the sequencing sites within the defined ranges
        Seq_dict = locus_dict['Seq']
        for (subregion, subregion_dict) in Seq_dict.items():
            export_regions[locus][subregion] = dict()
            seq_borders = dict() ##Use range as key to track where sequencing of subregion starts. Values outside of range indicate no matches
            seq_primers = dict() ##primer names corresponding to border positions
            for (primer,sequence) in subregion_dict.items():
                my_seq = SeqRecord(Seq(sequence,IUPAC.ambiguous_dna),id="-".join([locus,'Seq',subregion,primer]))
                with open(temp_infile,"w") as fout:
                    SeqIO.write(my_seq,fout,'fasta')
                blast_cline = NcbiblastnCommandline(query=temp_infile,db=blast_db,outfmt=6,out=temp_outfile,task='blastn-short',evalue=1,reward=1,penalty=-1,gapopen=3,gapextend=2)
                blast_cline() ##Should only print for errors
                blast_table = loadBLASTtableToDataFrame(temp_outfile)
                if len(blast_table) > 0:
                    ##Add some extra info to table
                    blast_table[ql_head] = len(my_seq)
                    blast_table[fh_head] = blast_table['s. start'] < blast_table['s. end']
                    for my_range in range_list:
                        ## Limit table to hits in range
                        r_min = my_range.get_min()
                        r_max = my_range.get_max()
                        if my_range not in seq_borders: ##TODO: this should probably be initialized immediately after declaration. Need to check that it doesn't break the downstream features
                            # Sentinels just outside the range mean "no primer placed yet".
                            seq_borders[my_range] = [r_min -1, r_max+1]
                            seq_primers[my_range] = ['None','None']
                        range_table = blast_table[blast_table['subject id'] == my_range.contig]
                        range_table = range_table[range_table['s. end'] >= r_min]
                        range_table = range_table[range_table['s. end'] <= r_max]
                        if len(range_table) > 0:
                            ## Limit table to best hits
                            best_in_range = range_table.sort_values(by=['bit score'],ascending=False).iloc[0]
                            range_table = range_table[range_table['bit score'] >= best_in_range['bit score']] #This may be too stringent; may need to revisit
                            if len(range_table) > 0:
                                if len(range_table) > 1:
                                    export_line = "Warning: sequencing primer maps to multiple locations within PCR primers. Using outermost site: {}".format(my_seq.id)
                                    # if __name__ != "__main__": ##Being called from an outside procedure...indent to indicated subsidiary position
                                    #     export_line = '\t'+export_line
                                    print(export_line)
                                for _, hit in range_table.iterrows():
                                    q_end = hit['q. end']
                                    # Number of query bases after the end of the alignment.
                                    gap = len(my_seq) - q_end
                                    s_end = hit['s. end']
                                    is_for = hit[fh_head]
                                    if is_for:
                                        # Low border still at its sentinel: first forward primer for this range.
                                        if seq_borders[my_range][0] < my_range.get_min():
                                            seq_borders[my_range][0] = s_end
                                            seq_primers[my_range][0] = primer
                                            if gap > 0:
                                                vprint("Warning: sequencing primer does not match template at 3' end. Sequence probably needs trimming on the low end: {}".format(my_seq.id))
                                        else:
                                            # Keep whichever forward site has the lower coordinate.
                                            if seq_borders[my_range][0] > s_end:
                                                seq_borders[my_range][0] = s_end
                                                seq_primers[my_range][0] = primer
                                                if gap > 0:
                                                    vprint("Warning: sequencing primer does not match template at 3' end. Sequence probably needs trimming on the low end: {}".format(my_seq.id))
                                            vprint("Warning: multiple sequencing primers map in forward direction on template. Using outermost site: {}".format("-".join([locus,'Seq',subregion,seq_primers[my_range][0]])))
                                    else:
                                        # High border still at its sentinel: first reverse primer for this range.
                                        if seq_borders[my_range][1] > my_range.get_max():
                                            seq_borders[my_range][1] = s_end
                                            seq_primers[my_range][1] = primer
                                            if gap > 0:
                                                vprint("Warning: sequencing primer does not match template at 3' end. Sequence probably needs trimming on the high end: {}".format(my_seq.id))
                                        else:
                                            # Keep whichever reverse site has the higher coordinate.
                                            if seq_borders[my_range][1] < s_end:
                                                seq_borders[my_range][1] = s_end
                                                seq_primers[my_range][1] = primer
                                                if gap > 0:
                                                    vprint("Warning: sequencing primer does not match template at 3' end. Sequence probably needs trimming on the high end: {}".format(my_seq.id))
                                            vprint("Warning: multiple sequencing primers map in reverse direction on template. Using outermost site: {}".format("-".join([locus,'Seq',subregion,seq_primers[my_range][1]])))
                            else:
                                print("Warning: sequencing primer failed to map within PCR primers: {}".format(my_seq.id))
                            ## Record best hits for reporting
                            # Taken from the whole table (not only this range) using the in-range best score.
                            best_table = blast_table[blast_table['bit score'] >= best_in_range['bit score']] #This may be too stringent; may need to revisit
                            #~ print("Identified {} hits above threshold used for best in range".format(len(best_table)))
                            blast_combined = pd.concat([blast_combined,best_table],sort=True) ##Note: this is compatible with pandas 0.23 +; older versions will fail.
                        else:
                            print("Warning: sequencing primer does not map to within PCR product. Exporting all matches for {}".format(my_seq.id))
                            blast_combined = pd.concat([blast_combined,blast_table],sort=True) ##Note: this is compatible with pandas 0.23 +; older versions will fail.
            ##Export sequencing start sites
            basename = locus
            if subregion != 'All':
                basename += '_' + subregion
            for my_range in range_list:
                if my_range in seq_primers:
                    name = basename
                    name += '_{}_{}_{}'.format(seq_primers[my_range][0],seq_primers[my_range][1],os.path.basename(os.path.splitext(blast_db)[0])) ##Convoluted way to get the genome name
                    # Exported coordinates exclude the primer-end positions themselves (+1 / -1).
                    export_regions[locus][subregion][name] = {'contig':my_range.contig,'start':seq_borders[my_range][0]+1,'stop':seq_borders[my_range][1]-1}
                else: ##seq_primers never got initialized because there is no match.
                    print("Notice: No sequencing primers for {} mapped with in the defined range for {}.".format(subregion,locus))
        #I could add a way to orient the sequences (identify a reference primer)
        export_regions[locus]['OuterRange'] = range_list
    # NOTE(review): these removals assume at least one primer was BLASTed so
    # that both temp files exist - confirm behaviour for an empty search set.
    os.remove(temp_infile)
    os.remove(temp_outfile)
    if outfile != None:
        blast_combined.to_csv(outfile,index=False) ##columns=blast_default_headers+[ql_head,fh_head]
        export_line = 'Exported primer locations to '+outfile
        # if __name__ != "__main__": ##Being called from an outside procedure...indent to indicated subsidiary position
        #     export_line = '\t'+export_line
        print(export_line)
    # current_verbose = default_verbose
    return export_regions
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio import SearchIO

# BLAST database to search against.
humdb = "/mithril/Data/Pacbio/Aligned/151019_proc/blast/humiso_blast"

# Search temp.fasta against humdb and store the result as BLAST XML.
blastn_cline = NcbiblastnCommandline(
    query="temp.fasta",
    db=humdb,
    gapopen=1,
    gapextend=2,
    word_size=9,
    reward=1,
    evalue=10,
    outfmt=5,
    out="try.xml",
)
stdout, stderr = blastn_cline()

# Convert the XML result to BLAST tabular format.
bres = SearchIO.read("try.xml", 'blast-xml')
SearchIO.write(bres, 'try.tsv', 'blast-tab')

##ok - this was nice, but can't output because blast is pairwise, and I think we actually want a MAF
def ensure_amplicons_not_in_exclusion(exclusion_blastdb, potential_amplicons, confirmed_amplicons, max_potential_amplicons=200):
    """
    Given a blast database of sequences we do not want amplicons to match to and a fasta file containing our
    potential amplicons, will blastn potential amplicons to make sure that they don't match too closely to
    the exclusion blastdb.
    Criteria for this: Top hit length can't be more than 40 base pairs (anything more than that might start
    getting amplified if we're really unlucky) and if more than one hit, can't have any two hits within 5000bp
    of each other, as those could also potentially amplify if we're really unlucky.
    Amplicons confirmed to meet these criteria will get written to confirmed_amplicons, which will overwrite
    any file that was already there.
    :param exclusion_blastdb: Path to exclusion blast database. In this pipeline, should have been created by
    make_all_exclusion_blast_db
    :param potential_amplicons: Path to potential amplicon fasta file. In this pipeline, should have been created by
    split_sequences_into_amplicons
    :param confirmed_amplicons: Path to your desired output confirmed amplicon file. Overwrites file if something
    was already there.
    :param max_potential_amplicons: Stop screening once this many amplicons have been confirmed.
    """
    confirmed = []  # FASTA lines, joined once at the end
    sequence_id = 1
    # The command does not depend on the sequence (it is fed through stdin),
    # so it is built once and reused.
    blastn = NcbiblastnCommandline(db=exclusion_blastdb, task='blastn', outfmt=5)
    for potential_sequence in SeqIO.parse(potential_amplicons, 'fasta'):
        stdout, stderr = blastn(stdin=str(potential_sequence.seq))
        top_hit_length = 999999  # Start this at ridiculously high value
        # The hit location dict will store the locations of every blast hit to each contig.
        # Each contig is an entry into the dict, with each entry being a list of locations.
        hit_location_dict = dict()
        for record in NCBIXML.parse(StringIO(stdout)):
            try:
                top_hit_length = record.alignments[0].hsps[0].align_length
            except IndexError:  # Should happen if we don't have any hits at all.
                top_hit_length = 0
            for alignment in record.alignments:
                for hsp in alignment.hsps:
                    hit_location_dict.setdefault(alignment.title, []).append(hsp.sbjct_start)

        # Two hits on a contig are within 5000bp of each other exactly when
        # some pair of neighbours in the sorted start positions is, so the
        # sorted neighbours are checked instead of every pair.
        matches_too_close = False
        for locations in hit_location_dict.values():
            ordered = sorted(locations)
            if any(later - earlier < 5000 for earlier, later in zip(ordered, ordered[1:])):
                matches_too_close = True
                break

        # Allow writing if we have no hits longer than roughly two pcr primers (so 40ish bp)
        # and no two matches to the same contig within 5000bp of each other.
        if top_hit_length < 40 and not matches_too_close:
            confirmed.append('>sequence' + str(sequence_id) + '\n')
            confirmed.append(str(potential_sequence.seq) + '\n')
            sequence_id += 1
        if sequence_id > max_potential_amplicons:
            break
    with open(confirmed_amplicons, 'w') as f:
        f.write(''.join(confirmed))
e_val = 10
name = sys.argv[1] #input_file
db_name = sys.argv[2]
haveXML = False  # set to True to reuse an existing XML file instead of running BLAST
out_file_name = sys.argv[3]
xml_file_name = out_file_name + ".xml"
novel_file_name = out_file_name + "_novel"
mature_file_name = out_file_name + "_mature"
db_fasta = sys.argv[4] #mirBASE (mature.fa)
#time_file_name = out_file_name + "_execution_time.txt"
#time_handle = open(time_file_name, "w")
#start = timeit.default_timer()
if not haveXML:
    blastn_cline = NcbiblastnCommandline(penalty=-5, reward=4, max_target_seqs=100, word_size=11, query=name, db=db_name, evalue=e_val,outfmt=5, out=xml_file_name)
    stdout, stderr = blastn_cline()

# `success` must exist even when the XML file cannot be opened; otherwise the
# `if success:` test below raises NameError instead of skipping the parse.
success = False
try:
    handle = open (xml_file_name)
    success = True
except IOError:
    pass

if success:
    novel = []
    mature = []
    records = NCBIXML.parse(handle)
    record_index = SeqIO.index(name, "fasta")
    mirbase_index = SeqIO.index(db_fasta, "fasta")
    mir_dict = dict()
    mature_summary = open (mature_file_name+".csv",'w')
def blast (species_path_name,busco_result,query_file,species_out_path_name,pav_excel_name):
    '''
    BLAST the query genes against each strain's contigs and record a
    presence/absence code per gene and strain in an Excel workbook.

    input:1 contig
    input 2 busco_result provide species list
    input 3 blastn query file,pan gene
    intermediate_out: species_out_path=Path('../Pan_genome_data/c_blast_present_contig/')
    output 1: pav_excel name

    Codes passed to out_to_excel, as assigned below:
        0 - no alignment, or first HSP shorter than 100 and covering < 50% of the query
        4 - first HSP covers the whole query with 100% identity
        3 - first HSP covers the whole query without gaps
        2 - first HSP longer than the query or at most 50 shorter
        1 - any other aligned case
    '''
    # The active worksheet is published as a module-level global.
    # presumably read by excel_species_name / out_to_excel - verify.
    global pan_sh
    species_path=Path(species_path_name)
    species_95_list=extract_strain_id(busco_result)
    species_95_list.append("70-15")
    species_95_list.append("ina168")
    # NOTE(review): list.remove raises ValueError if this id is absent - confirm it is always present.
    species_95_list.remove("magnaporthe_oryzae_70-15_8_proteins_T0")
    species_out_path=Path(species_out_path_name)
    excel_book=Workbook()
    pan_sh=excel_book.active
    species_count=1
    for species_id in species_95_list:
        # print (species_id+"\t")
        for species_file in species_path.glob(species_id.strip('\n')+".fasta"):
            # print (str(species_file)+"\n")
            species_count=species_count+1
            species_name=species_file.stem
            # NOTE(review): the name is written with species_count+1 while results
            # below use species_count - confirm the two helpers index consistently.
            excel_species_name(species_count+1,species_name)
            species_file_path=str(species_file)
            species_out=species_out_path/(species_name+'.xml')
            # species_out_1=os.path.join("../test_why_00005_so_many_1/",species_name+'.txt')
            # blastdb(species_file_path)
            blast_cmd=NcbiblastnCommandline(
                cmd='/mnt/d/zhes_learning_space/software_in_ubuntu/ncbi-blast-2.9.0+/bin/blastn',
                query=query_file,
                db=species_file_path,
                outfmt=5,
                out=species_out
                # perc_identity=95
            )
            # Run BLAST only when the XML result is missing, and again if it is empty.
            if species_out.exists() is False:
                blast_cmd()
            if os.path.getsize(str(species_out)) == 0:
                blast_cmd()
            with open(species_out) as fl:
                for record in NCBIXML.parse(fl):
                    gene_name=record.query.split()[0]
                    if record.alignments:
                        # max_flag == -1 marks "first HSP not seen yet"; only that HSP is classified.
                        max_flag=-1
                        #out_to_excel(species_count,record.query,1)
                        for alignment in record.alignments:
                            for hsp in alignment.hsps:
                                if max_flag == -1:
                                    # Fraction of the query covered, and identity within the alignment.
                                    identity_discriminant_for_length=hsp.align_length/record.query_length
                                    identity_discriminant_for_identity_perscent=hsp.identities/hsp.align_length
                                    max_flag=max_flag+2
                                    if hsp.align_length < 100 and identity_discriminant_for_length < 0.5:
                                        out_to_excel(species_count,gene_name,0)
                                    elif identity_discriminant_for_length==1 and identity_discriminant_for_identity_perscent==1:
                                        out_to_excel(species_count,gene_name,4)
                                    elif identity_discriminant_for_length==1 and hsp.gaps == 0:
                                        out_to_excel(species_count,gene_name,3)
                                    elif hsp.align_length>record.query_length or record.query_length-hsp.align_length <=50:
                                        out_to_excel(species_count,gene_name,2)
                                    else:
                                        out_to_excel(species_count,gene_name,1)
                    else:
                        # No alignment at all for this gene.
                        out_to_excel(species_count,gene_name,0)
    excel_book.save(pav_excel_name)
return (min_, max_) def _srange(begin, end): """ Return a set based on range """ return set(range(begin, end)) def _hit_overlap(hsp1, hsp2): """ Determine whether the hits of two hsps overlap """ hit1_begin, hit1_end = _minmax(hsp1.sbjct_start, hsp1.sbjct_end) hit2_begin, hit2_end = _minmax(hsp2.sbjct_start, hsp2.sbjct_end) hit1_range = _srange(hit1_begin, hit1_end) hit2_range = _srange(hit2_begin, hit2_end) return not hit1_range.isdisjoint(hit2_range) if __name__ == '__main__': from Bio.Blast.Applications import NcbiblastnCommandline from Bio import SeqIO cmd = NcbiblastnCommandline(query='test/sul2_1_AF542061.fasta', db='test/102637-001-018_k64-contigs.fa', evalue=0.001) with pyBlastFlat(cmd, rm_tmp=False, min_cov=0.5, verbose=True) as pb: for record in pb: fasta = pyBlastFlat.fasta(record) print(fasta) print(SeqIO.write(fasta, sys.stdout, 'fasta'))
#!/usr/bin/env python3
# author : Andrew Smith
# date : 111820 @ 12:57
# file : blast.py
# description : provide initial statistics of a query against a blast searchable database
# NOTE: this script is specifically configured for one query and one subject.
import sys # command line arguments
from time import time # execution time
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML

if (len(sys.argv) != 3):
    print("Usage : ./main.py <DBname> <QueryFile>")
    sys.exit()

start_time = time()
dbname = sys.argv[1]
queryfile = sys.argv[2]

# Run blastn and store the result as BLAST XML in results.xml.
blastnCommandLine = NcbiblastnCommandline(query=queryfile, db=dbname, outfmt=5, out="results.xml")
stdout, stderr = blastnCommandLine()

# Print every HSP of every alignment; the result file is closed when done.
with open("results.xml") as results_handle:
    for query in NCBIXML.parse(results_handle):
        for alignment in query.alignments:
            for hsp in alignment.hsps:
                print(hsp)
def getBlastHits(self):
    """ Blast the handle sequence and count near-identical, near-full-length hits.

    The sequence in ``self.sequence`` is searched with blastn, either against
    a local copy of the nt database (current hard-wired default) or remotely
    through NCBI qblast.  Every HSP with at least 90 % identity and an
    alignment length of at least 90 % of the query length is counted, and
    the total is stored in ``self.blastHits``.

    Side effects: replaces ``sys.stdout`` with an ``Unbuffered`` wrapper and,
    in local mode, writes the query to ``tmp.fa`` in the working directory
    (the file is left in place).
    """
    import sys
    from cStringIO import StringIO
    from Bio.Blast import NCBIWWW
    from Bio.Blast import NCBIXML

    sys.stdout = Unbuffered(sys.stdout)

    local = True
    if local:
        from Bio.Blast.Applications import NcbiblastnCommandline

        # alternative database location: /sw/data/uppnex/blast_databases/nt
        database = '/Users/erikborgstrom/localBioInfo/BLASTnt/nt'

        # Write the query as a one-record FASTA file for blastn to read.
        with open('tmp.fa', 'w') as infile:
            infile.write('>tmp\n' + self.sequence + '\n')

        # Earlier 'strict'/'sloppy' command lines were always overwritten by
        # this one before being run, so only the effective command is built.
        cline = NcbiblastnCommandline(cmd='blastn', outfmt=5, query=infile.name, db=database,
                                      gapopen=5, gapextend=2, culling_limit=2)
        print(str(cline))

        # Calling the command line object returns (stdout, stderr); the XML
        # report is on stdout because no `out` file is given.
        blast_output = cline()
        blast_handle = StringIO(blast_output[0])
    else:
        sys.stdout.write('getting blast hits for handle#'+str(self.id)+'\n')
        result_handle = NCBIWWW.qblast("blastn", "nr", '>tmp\n'+self.sequence, format_type='XML')
        sys.stdout.write('start parsing blast for handle#'+str(self.id)+'\n')
        blast_handle = StringIO(result_handle.read())

    hits = 0
    for blast_record in NCBIXML.parse(blast_handle):
        for alignment in blast_record.alignments:
            for hsp in alignment.hsps:
                perc_identity = float(hsp.identities) / float(hsp.align_length) * 100
                perc_coverage = float(hsp.align_length) / float(blast_record.query_letters) * 100
                if perc_identity >= 90 and perc_coverage >= 90:
                    hits += 1
    self.blastHits = hits
def search(sequence_filename, output_filename):
    """ Search for matches to known organisms from included 16S database.

    Parameters
    ----------
    sequence_filename : str
        Path to file with 16S rDNA sequences for unique OTUs
    output_filename: str
        Path to file where blast output file is saved

    Returns
    -------
    list of str
        List of PATRIC genome IDs for known organisms
    pandas.DataFrame
        Similarity information with OTU ID, genome ID, and percent similarity of match
        (one row per OTU, taken from the first hit reported for that OTU)

    Raises
    ------
    Bio.Application.ApplicationError
        When there is an error running the blast command
    """

    # Run blast to search for matches to known organisms.
    # @todo Should it make me nervous to not use a fully-qualified path here?
    cmdline = NcbiblastnCommandline(
        cmd='blastn',
        query=sequence_filename,
        db=join(pkg_resources.resource_filename(__name__, 'data/db'), '16Sdb'),
        out=output_filename,
        outfmt=6,
        max_target_seqs=1,
        num_threads=4)
    cmdline()  # Raises ApplicationError when there is a problem

    # Parse the blast output file with the results. In output format 6, the first
    # field is the OTU ID from the query. The second field is the ID of match in
    # target database. In our case that is the PATRIC genome ID of the organism
    # with the matching 16S sequence. The third field is the percent similarity.
    genome_ids = set()
    first_match = {}  # OTU ID -> genome ID of the first hit seen for that OTU
    rows = []  # collected once, turned into a DataFrame after the loop
    with open(output_filename, 'r') as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # Tolerate blank lines instead of failing with IndexError.
                continue
            otu_id, genome_id = fields[0], fields[1]
            genome_ids.add(genome_id)
            if otu_id not in first_match:
                first_match[otu_id] = genome_id
                rows.append([otu_id, genome_id, float(fields[2])])
            elif first_match[otu_id] != genome_id:
                warn(
                    'OTU {0} matches already matched genome {1} and also matches genome {2}'
                    .format(otu_id, first_match[otu_id], genome_id))

    # Building the frame in one go avoids the quadratic row-by-row
    # DataFrame.append, which no longer exists in pandas >= 2.0.
    similarity = pd.DataFrame(rows, columns=similarity_columns)
    return list(genome_ids), similarity
## For each genome in command line
for genome in args.genomes:

    ## 1. Save organism name by reading first line of genome file
    with open(genome, 'r') as genome_file:  ## open genome file (closed automatically)
        first_line = genome_file.readline()  ## read first line
    first_line = first_line.rstrip('\n')
    first_line = first_line.split(' ')  ## split first line
    genus = first_line[1]  ## save genus
    species = first_line[2]  ## save species
    organism = ("{}_{}").format(genus, species)  ## name
    print (organism)

    ## 2. Blast on genome file
    blastn_cline = NcbiblastnCommandline(query=args.query_sequence, db=genome,
                                         outfmt=" '7 std sseq' ",
                                         out=organism + "_blast_results.tsv",
                                         soft_masking=True)
    ## '7 std sseq' to obtain a tabular output file with comments line, with standard information and with sequence of aligned part of subject sequence
    ## max_target_seqs=10 save only first 10 best results
    ## soft_masking true as suggested by literature
    os.system(str(blastn_cline))  ## save output

    ## 3. Compare line to save in FASTA format only the result with highest bit-score or lowest e-value
    ## Both files are opened in a with-block so they are closed at the end of
    ## every genome iteration instead of leaking one handle pair per genome.
    with open(organism + "_blast_results.tsv", 'r') as blast_results, \
            open(organism + "_ortholog_sequence.fa", 'w') as fasta_sequence:
        for line in blast_results:
            line = line.rstrip('\n')
            if not line.startswith('#'):  ## if not header
                ## NOTE(review): these values are overwritten for every result
                ## line and nothing is written to fasta_sequence here - confirm
                ## whether the comparison/writing step is missing.
                first_line = line.split('\t')
                first_e_value = float(first_line[10])  ## save lowest e-value
                first_bit_score = float(first_line[11])  ## save highest bit-score
                subject_id = organism + "," + first_line[1]  ## save ID
def _blast_primer(primer_fasta_path, db_path, evalue_cutoff=1000, min_total_mismatch_portion=0.2,
                  min_total_mismatch=6, min_prime_3_mismatch=2, prime_3_length=5,
                  alt_pos_cutoff=2000, max_product_size=5000, word_size=7):
    """
    Take a fasta file as input, query genome db and count qualified hits

    The blastn XML report is written next to the primer fasta file as
    ``<fasta stem>_blast_result.xml``.

    Parameters
    ----------
    primer_fasta_path
        Path (str or pathlib.Path) of the primer fasta file. Record ids are
        split at their last ``_``; a suffix of ``l`` marks a left primer,
        anything else a right primer.
    db_path
        BLAST database passed to blastn as ``db``.
    evalue_cutoff
        E-value passed to blastn.
    min_total_mismatch_portion
        Fraction of the primer length used as mismatch allowance.
    min_total_mismatch
        Absolute mismatch allowance. An HSP is dropped when its mismatch count
        exceeds the larger of this value and
        ``min_total_mismatch_portion * primer length``.
    min_prime_3_mismatch
        An HSP is dropped when it has more than this many mismatches within
        the last ``prime_3_length`` primer positions.
    prime_3_length
        Number of primer positions at the 3' end inspected for mismatches.
    alt_pos_cutoff
        Primers with more than this many retained left or right hits are
        left out of the result.
    max_product_size
        Plus/minus strand hit pairs on the same subject further apart than
        this are not counted as potential products.
    word_size
        Word size passed to blastn.

    Returns
    -------
    pandas.DataFrame
        One row per primer name with the columns LEFT_GENOME_HITS,
        RIGHT_GENOME_HITS, POTENTIAL_PRODUCTS and POTENTIAL_PRODUCT_LENGTHS
        (product sizes joined with ``|``).
    """
    # Normalise once so both str and Path inputs work (`.stem` needs a Path).
    primer_fasta_path = pathlib.Path(primer_fasta_path)
    result_path = primer_fasta_path.parent / (primer_fasta_path.stem + "_blast_result.xml")

    # run blastn for all primers
    blast_cline = NcbiblastnCommandline(query=str(primer_fasta_path),
                                        db=db_path,
                                        evalue=evalue_cutoff,
                                        outfmt=5,
                                        word_size=word_size,
                                        out=str(result_path),
                                        task='blastn')
    blast_cline()

    # parse blast result
    blast_results = SearchIO.parse(str(result_path), "blast-xml")

    # primer name -> [HSPs of the left primer, HSPs of the right primer]
    primer_hit_dict = {}
    for blast_result in blast_results:
        primer_length = blast_result.seq_len
        primer_total_mismatch = max(min_total_mismatch,
                                    min_total_mismatch_portion * primer_length)
        alternate_hsps = []
        for hit in blast_result:
            for hsp in hit.hsps:
                # Pad the match line with blanks for the primer positions that
                # lie outside the aligned query range, so unaligned ends count
                # as mismatches.
                align_anno = (' ' * hsp.query_range[0]
                              + ''.join(hsp.aln_annotation['similarity'])
                              + ' ' * (primer_length - hsp.query_range[1]))
                total_mismatch = primer_length - align_anno.count('|')
                if total_mismatch > primer_total_mismatch:
                    continue
                prime_3_mismatch = prime_3_length - align_anno[-prime_3_length:].count('|')
                if prime_3_mismatch > min_prime_3_mismatch:
                    continue
                alternate_hsps.append(hsp)

        # Everything before the last underscore is the primer name, the rest
        # is the direction tag.
        primer_name, _, direction = blast_result.id.rpartition('_')
        append_pos = 0 if direction == 'l' else 1
        primer_hit_dict.setdefault(primer_name, [[], []])[append_pos].extend(alternate_hsps)

    primer_hit_records = {}
    for primer, (left_hits, right_hits) in primer_hit_dict.items():
        if len(left_hits) > alt_pos_cutoff or len(right_hits) > alt_pos_cutoff:
            continue

        all_hits = left_hits + right_hits
        positive_strand_hit = [hit for hit in all_hits if hit.hit_strand == 1]
        negative_strand_hit = [hit for hit in all_hits if hit.hit_strand == -1]

        valid_product_lengths = []
        for positive_hit in positive_strand_hit:
            for negative_hit in negative_strand_hit:
                # hit not in same chrom
                if positive_hit.hit_id != negative_hit.hit_id:
                    continue
                product_size = abs(positive_hit.hit_range[0] - negative_hit.hit_range[1])
                # left right too far away
                if product_size > max_product_size:
                    continue
                valid_product_lengths.append(str(product_size))

        primer_hit_records[primer] = {
            'LEFT_GENOME_HITS': len(left_hits),
            'RIGHT_GENOME_HITS': len(right_hits),
            'POTENTIAL_PRODUCTS': len(valid_product_lengths),
            'POTENTIAL_PRODUCT_LENGTHS': '|'.join(valid_product_lengths),
        }

    primer_hit_df = pd.DataFrame(primer_hit_records).T
    return primer_hit_df