def mcl_predict(blast_results_file, min_ident, min_cov, evalue, min_length, tmp_dir):
    """Filter a BLAST result table and cluster the surviving hits with MCL.

    Args:
        blast_results_file: Path of the BLAST output to read.
        min_ident: Accepted for signature compatibility; not used here.
        min_cov: Minimum ``qcovs`` value a hit must have.
        evalue: Accepted for signature compatibility; not used here.
        min_length: Minimum value required of both ``length`` and ``qlen``.
        tmp_dir: Directory receiving ``filtered_mcl_blast.txt`` and handed
            to ``mcl``.

    Returns:
        An empty dict when ``blast_results_file`` is zero bytes; otherwise
        whatever ``mcl(...).getclusters()`` returns.
    """
    if not os.path.getsize(blast_results_file):
        return {}

    hits = BlastReader(blast_results_file).df
    hits = hits[hits['length'] >= min_length]
    # Queries longer than 400 kb are discarded.
    hits = hits[hits['qlen'] <= 400000]
    hits = hits[hits['qlen'] >= min_length]
    hits = hits[hits['qcovs'] >= min_cov]
    hits = hits.reset_index(drop=True)

    # The second field of every row is expected to look like
    # "<seqid>|<cluster_id>"; only the cluster id is kept, stored in the
    # 'sseqid' column. The unpacking fails on anything but exactly two parts.
    for row_pos, record in hits.iterrows():
        _seq_id, cluster_id = record[1].split('|')
        sseqid_pos = hits.columns.get_loc('sseqid')
        hits.iloc[row_pos, sseqid_pos] = cluster_id

    filtered_blast = os.path.join(tmp_dir, 'filtered_mcl_blast.txt')
    # NOTE(review): pandas renamed `line_terminator` to `lineterminator` in
    # 1.5 and removed the old name in 2.0 - confirm the pinned version.
    hits.to_csv(filtered_blast, sep='\t', header=False, line_terminator='\n', index=False)

    return mcl(filtered_blast, tmp_dir).getclusters()
def blastn(input_fasta, blastdb, min_ident, min_cov, evalue, min_length, out_dir, blast_results_file, logging, seq_filterfile=None, num_threads=1, max_length=400000, min_hsp_cov=1):
    """Run a megablast search and rewrite the results file with filtered hits.

    Args:
        input_fasta: Query FASTA path.
        blastdb: Path of the nucleotide database to search.
        min_ident: Minimum ``pident`` a hit must have.
        min_cov: Minimum ``qcovs`` a hit must have.
        evalue: Maximum ``evalue`` a hit may have.
        min_length: Minimum ``length`` a hit must have.
        out_dir: Working directory handed to ``BlastRunner``.
        blast_results_file: Output path; overwritten with the filtered table.
        logging: Logger object forwarded to the runner and the reader.
        seq_filterfile: Optional sequence-id file forwarded to the runner.
        num_threads: Thread count forwarded to the runner.
        max_length: Maximum ``qlen`` a hit may have.
        min_hsp_cov: Minimum ``qcovhsp`` a hit must have.

    Returns:
        ``False`` when BLAST wrote a zero-byte file (the file is deleted),
        ``True`` once the filtered table has been written.
    """
    runner = BlastRunner(input_fasta, out_dir)
    runner.run_blast(
        query_fasta_path=input_fasta,
        blast_task='megablast',
        db_path=blastdb,
        db_type='nucl',
        min_cov=min_cov,
        min_ident=min_ident,
        evalue=evalue,
        blast_outfile=blast_results_file,
        logging=logging,
        num_threads=num_threads,
        word_size=11,
        seq_id_file=seq_filterfile,
    )

    if not os.path.getsize(blast_results_file):
        os.remove(blast_results_file)
        return False

    hits = BlastReader(blast_results_file, logging).df

    # (column, bound, bound_is_minimum) - applied one after another, in order.
    criteria = (
        ('length', min_length, True),
        ('qlen', max_length, False),
        ('qcovs', min_cov, True),
        ('qcovhsp', min_hsp_cov, True),
        ('evalue', evalue, False),
        ('pident', min_ident, True),
    )
    for column, bound, bound_is_minimum in criteria:
        values = hits[column]
        if bound_is_minimum:
            keep = values >= bound
        else:
            keep = values <= bound
        hits = hits.loc[keep]

    hits = fixStart(hits.reset_index(drop=True))
    # NOTE(review): pandas renamed `line_terminator` to `lineterminator` in
    # 1.5 and removed the old name in 2.0 - confirm the pinned version.
    hits.to_csv(blast_results_file, sep='\t', header=True, line_terminator='\n', index=False)
    return True
def tblastn(input_fasta, blastdb, min_ident, min_covs, evalue, out_dir, blast_results_file, logging, num_threads=1, min_covhsp=25, seq_id_file=None):
    """Run ``BlastRunner.run_tblastn`` and rewrite the results with filtered hits.

    Args:
        input_fasta: Query FASTA path.
        blastdb: Path of the database to search.
        min_ident: Minimum ``pident`` a hit must have.
        min_covs: Minimum ``qcovs`` a hit must have.
        evalue: Maximum ``evalue`` a hit may have.
        out_dir: Working directory handed to ``BlastRunner``.
        blast_results_file: Output path; overwritten with the filtered table.
        logging: Logger object forwarded to the runner and the reader.
        num_threads: Thread count forwarded to the runner.
        min_covhsp: Minimum ``qcovhsp`` a hit must have.
        seq_id_file: Optional sequence-id file forwarded to the runner.

    Returns:
        ``False`` when the search wrote a zero-byte file (the file is
        deleted), ``True`` once the filtered, sorted table has been written.
    """
    runner = BlastRunner(input_fasta, out_dir)
    runner.run_tblastn(
        query_fasta_path=input_fasta,
        blast_task='megablast',
        db_path=blastdb,
        db_type='protein',
        min_cov=min_covs,
        min_ident=min_ident,
        evalue=evalue,
        blast_outfile=blast_results_file,
        num_threads=num_threads,
        seq_id_file=seq_id_file,
        logging=logging,
    )

    if not os.path.getsize(blast_results_file):
        os.remove(blast_results_file)
        return False

    hits = BlastReader(blast_results_file, logging).df
    lower_bounds = (
        ('pident', min_ident),
        ('qcovs', min_covs),
        ('qcovhsp', min_covhsp),
    )
    for column, floor in lower_bounds:
        hits = hits.loc[hits[column] >= floor]
    hits = hits.loc[hits['evalue'] <= evalue]

    hits = fixStart(hits)
    # Order by subject and position; among ties the highest bitscore is first.
    hits = hits.sort_values(
        by=['sseqid', 'sstart', 'send', 'bitscore'],
        ascending=[True, True, True, False],
    )
    hits = hits.reset_index(drop=True)
    # NOTE(review): pandas renamed `line_terminator` to `lineterminator` in
    # 1.5 and removed the old name in 2.0 - confirm the pinned version.
    hits.to_csv(blast_results_file, sep='\t', header=True, line_terminator='\n', index=False)
    return True
def repetitive_blast(input_fasta, ref_db, min_ident, min_cov, evalue, min_length, tmp_dir, blast_results_file, num_threads=1):
    """Megablast the queries against ``ref_db`` and pick one hit per query.

    Args:
        input_fasta: Query FASTA path.
        ref_db: Path of the nucleotide database to search.
        min_ident: Minimum ``pident`` a hit must have.
        min_cov: Minimum ``qcovs`` a hit must have.
        evalue: E-value forwarded to the runner (not re-applied afterwards).
        min_length: Minimum ``length`` a hit must have.
        tmp_dir: Working directory handed to ``BlastRunner``.
        blast_results_file: Path the runner writes its output to.
        num_threads: Thread count forwarded to the runner.

    Returns:
        A dict keyed by ``qseqid``; each value is a dict with the keys
        ``id``, ``score``, ``contig_start`` and ``contig_end`` taken from
        the selected hit. Empty when BLAST wrote a zero-byte file.
    """
    runner = BlastRunner(input_fasta, tmp_dir)
    runner.run_blast(
        query_fasta_path=input_fasta,
        blast_task='megablast',
        db_path=ref_db,
        db_type='nucl',
        min_cov=min_cov,
        min_ident=min_ident,
        evalue=evalue,
        blast_outfile=blast_results_file,
        num_threads=num_threads,
    )

    if not os.path.getsize(blast_results_file):
        return {}

    hits = BlastReader(blast_results_file).df
    lower_bounds = (
        ('length', min_length),
        ('pident', min_ident),
        ('qcovs', min_cov),
        ('qcovhsp', 25),
    )
    for column, floor in lower_bounds:
        hits = hits.loc[hits[column] >= floor]

    hits = fixStart(hits)
    hits = hits.sort_values(
        by=['sseqid', 'sstart', 'send', 'bitscore'],
        ascending=[True, True, True, False],
    )
    hits = hits.reset_index(drop=True)

    contig_list = {}
    for _, hit in hits.iterrows():
        query_id = hit['qseqid']
        stored = contig_list.get(query_id)
        # NOTE(review): an existing entry is replaced only when its score is
        # GREATER than the new hit's bitscore, so the lowest-scoring hit per
        # query wins. Confirm this is intended and not an inverted comparison.
        if stored is not None and not (stored['score'] > hit['bitscore']):
            continue
        contig_list[query_id] = {
            'id': hit['sseqid'],
            'score': hit['bitscore'],
            'contig_start': hit['sstart'],
            'contig_end': hit['send'],
        }
    return contig_list
def filter_blast(blast_results_file, min_ident, min_cov, evalue, overlap):
    """Load a BLAST table, filter it, and prune overlapping records.

    Args:
        blast_results_file: Path of the BLAST output to read.
        min_ident: Minimum ``pident`` a hit must have.
        min_cov: Minimum ``qcovhsp`` a hit must have.
        evalue: Accepted for signature compatibility; not used here.
        overlap: Overlap threshold forwarded to ``filter_overlaping_records``.

    Returns:
        An empty dict when the file is zero bytes; otherwise the filtered
        DataFrame. Note the two different return types.
    """
    if not os.path.getsize(blast_results_file):
        return {}

    hits = BlastReader(blast_results_file).df
    hits = hits[hits['pident'] >= min_ident]
    hits = hits[hits['qcovhsp'] >= min_cov]
    hits = fixStart(hits)
    hits = hits.sort_values(
        by=['sseqid', 'sstart', 'send', 'bitscore'],
        ascending=[True, True, True, False],
    )
    hits = hits.reset_index(drop=True)

    # Prune at least once, then keep going until a pass leaves the row
    # count unchanged.
    while True:
        rows_before = len(hits)
        hits = filter_overlaping_records(hits, overlap, 'sseqid', 'sstart', 'send', 'bitscore')
        if len(hits) == rows_before:
            break
    return hits
def mob_blast(input_fasta, ref_db, min_ident, min_cov, evalue, tmp_dir, blast_results_file, overlap=5, num_threads=1):
    """Build a BLAST database from ``ref_db``, search it, and prune overlaps.

    Args:
        input_fasta: Query FASTA path.
        ref_db: Sequence file turned into a nucleotide database and searched.
        min_ident: Minimum ``pident`` a hit must have.
        min_cov: Minimum ``qcovs`` a hit must have.
        evalue: E-value forwarded to the runner (not re-applied afterwards).
        tmp_dir: Working directory handed to ``BlastRunner``.
        blast_results_file: Path the runner writes its output to.
        overlap: Overlap threshold forwarded to ``filter_overlaping_records``.
        num_threads: Ignored - see the note in the body.

    Returns:
        An empty dict when the search wrote a zero-byte file; otherwise the
        filtered DataFrame. Note the two different return types.
    """
    # NOTE(review): the caller's num_threads is overwritten, so the search
    # always runs with a single thread - confirm this is intentional.
    num_threads = 1

    runner = BlastRunner(input_fasta, tmp_dir)
    runner.makeblastdb(ref_db, 'nucl')
    runner.run_tblastn(
        query_fasta_path=input_fasta,
        blast_task='megablast',
        db_path=ref_db,
        db_type='nucl',
        min_cov=min_cov,
        min_ident=min_ident,
        evalue=evalue,
        blast_outfile=blast_results_file,
        num_threads=num_threads,
    )

    if not os.path.getsize(blast_results_file):
        return {}

    hits = BlastReader(blast_results_file).df
    lower_bounds = (
        ('pident', min_ident),
        ('qcovs', min_cov),
        ('qcovhsp', 25),
    )
    for column, floor in lower_bounds:
        hits = hits.loc[hits[column] >= floor]

    hits = fixStart(hits)
    hits = hits.sort_values(
        by=['sseqid', 'sstart', 'send', 'bitscore'],
        ascending=[True, True, True, False],
    )
    hits = hits.reset_index(drop=True)

    # One unconditional pruning pass, then repeat (at least once more) until
    # a pass leaves the row count unchanged.
    hits = filter_overlaping_records(hits, overlap, 'sseqid', 'sstart', 'send', 'bitscore')
    while True:
        rows_before = len(hits)
        hits = filter_overlaping_records(hits, overlap, 'sseqid', 'sstart', 'send', 'bitscore')
        if len(hits) == rows_before:
            break
    return hits
def contig_blast(input_fasta, plasmid_db, min_ident, min_cov, evalue, min_length, tmp_dir, blast_results_file, num_threads=1):
    """Megablast contigs against ``plasmid_db`` and write the filtered hits.

    The filtered table is written, without a header, to
    ``<tmp_dir>/filtered_blast.txt``.

    Args:
        input_fasta: Query FASTA path.
        plasmid_db: Path of the nucleotide database to search.
        min_ident: Identity threshold forwarded to the runner.
        min_cov: Minimum ``qcovs`` a hit must have.
        evalue: E-value forwarded to the runner.
        min_length: Minimum value required of both ``length`` and ``qlen``.
        tmp_dir: Working directory; also receives ``filtered_blast.txt``.
        blast_results_file: Path the runner writes its raw output to.
        num_threads: Thread count forwarded to the runner.

    Returns:
        An empty dict when BLAST wrote a zero-byte file (an empty
        ``filtered_blast.txt`` is still created); ``None`` otherwise.
    """
    filtered_blast = os.path.join(tmp_dir, 'filtered_blast.txt')

    blast_runner = BlastRunner(input_fasta, tmp_dir)
    blast_runner.run_blast(
        query_fasta_path=input_fasta,
        blast_task='megablast',
        db_path=plasmid_db,
        db_type='nucl',
        min_cov=min_cov,
        min_ident=min_ident,
        evalue=evalue,
        blast_outfile=blast_results_file,
        num_threads=num_threads,
        word_size=11,
    )

    if os.path.getsize(blast_results_file) == 0:
        # Leave an empty filtered file behind; the context manager guarantees
        # the handle is closed even if the write fails.
        with open(filtered_blast, 'w', encoding="utf-8") as fh:
            fh.write('')
        return dict()

    blast_df = BlastReader(blast_results_file).df
    blast_df = blast_df.loc[blast_df['length'] >= min_length]
    # Queries longer than 400 kb are discarded.
    blast_df = blast_df.loc[blast_df['qlen'] <= 400000]
    blast_df = blast_df.loc[blast_df['qlen'] >= min_length]
    blast_df = blast_df.loc[blast_df['qcovs'] >= min_cov]
    blast_df = blast_df.reset_index(drop=True)
    # NOTE(review): pandas renamed `line_terminator` to `lineterminator` in
    # 1.5 and removed the old name in 2.0 - confirm the pinned version.
    blast_df.to_csv(filtered_blast, sep='\t', header=False, line_terminator='\n', index=False)
def run_blast(self, input_fasta, output_path, blast_results_file, logging, min_cov=1, min_ident=1, evalue=1, num_threads=1, min_length=25):
    """Megablast ``input_fasta`` against itself and keep sufficiently long hits.

    A nucleotide database is built from ``input_fasta``, the same file is
    used as the query, and ``blast_results_file`` is rewritten in place as a
    header-less TSV containing only hits with ``length >= min_length``.

    Args:
        input_fasta: FASTA path used as both query and database.
        output_path: Working directory handed to ``BlastRunner``.
        blast_results_file: Output path; overwritten with the filtered table.
        logging: Logger object forwarded to the runner and the reader.
        min_cov: Coverage threshold forwarded to the runner.
        min_ident: Identity threshold forwarded to the runner.
        evalue: E-value forwarded to the runner.
        num_threads: Thread count forwarded to the runner.
        min_length: Minimum ``length`` a hit must have.

    Returns:
        An empty dict when BLAST wrote a zero-byte file; ``None`` otherwise.
    """
    blast_runner = BlastRunner(input_fasta, output_path)
    blast_runner.makeblastdb(input_fasta, 'nucl', logging)
    blast_runner.run_blast(
        query_fasta_path=input_fasta,
        blast_task='megablast',
        db_path=input_fasta,
        db_type='nucl',
        min_cov=min_cov,
        min_ident=min_ident,
        evalue=evalue,
        blast_outfile=blast_results_file,
        num_threads=num_threads,
        word_size=11,
        logging=logging,
    )

    if os.path.getsize(blast_results_file) == 0:
        # Rewrite the (already empty) results file; the context manager
        # guarantees the handle is closed even if the write fails.
        with open(blast_results_file, 'w', encoding="utf-8") as fh:
            fh.write('')
        return dict()

    blast_df = BlastReader(blast_results_file, logging).df
    blast_df = blast_df.loc[blast_df['length'] >= min_length]
    blast_df = blast_df.reset_index(drop=True)
    # NOTE(review): pandas renamed `line_terminator` to `lineterminator` in
    # 1.5 and removed the old name in 2.0 - confirm the pinned version.
    blast_df.to_csv(blast_results_file, sep='\t', header=False, line_terminator='\n', index=False)