def blastn(input_fasta, blastdb, min_ident, min_cov, evalue, min_length, out_dir, blast_results_file, logging,
           seq_filterfile=None, num_threads=1, max_length=400000, min_hsp_cov=1):
    """Run megablast of ``input_fasta`` against ``blastdb`` and overwrite
    ``blast_results_file`` with the threshold-filtered hit table.

    Returns:
        bool: True when filtered hits were written; False when the raw blast
        produced no output at all (the empty results file is deleted).
    """
    runner = BlastRunner(input_fasta, out_dir)
    runner.run_blast(query_fasta_path=input_fasta, blast_task='megablast', db_path=blastdb,
                     db_type='nucl', min_cov=min_cov, min_ident=min_ident, evalue=evalue,
                     blast_outfile=blast_results_file, logging=logging, num_threads=num_threads,
                     word_size=11, seq_id_file=seq_filterfile)
    # No hits: remove the empty file and signal failure to the caller.
    if os.path.getsize(blast_results_file) == 0:
        os.remove(blast_results_file)
        return False
    hits = BlastReader(blast_results_file, logging).df
    # Apply all hit-quality thresholds in a single boolean mask.
    keep = ((hits['length'] >= min_length)
            & (hits['qlen'] <= max_length)
            & (hits['qcovs'] >= min_cov)
            & (hits['qcovhsp'] >= min_hsp_cov)
            & (hits['evalue'] <= evalue)
            & (hits['pident'] >= min_ident))
    hits = hits.loc[keep].reset_index(drop=True)
    hits = fixStart(hits)
    hits.to_csv(blast_results_file, sep='\t', header=True, line_terminator='\n', index=False)
    return True
def repetitive_blast(input_fasta, ref_db, min_ident, min_cov, evalue, min_length, tmp_dir,
                     blast_results_file, num_threads=1):
    """Blast ``input_fasta`` against the repetitive-element database and return
    the single best-scoring repetitive hit for each query contig.

    Returns:
        dict: {qseqid: {'id': sseqid, 'score': bitscore,
                        'contig_start': sstart, 'contig_end': send}}
        Empty dict when blast produced no output.
    """
    blast_runner = BlastRunner(input_fasta, tmp_dir)
    # NOTE(review): ref_db is assumed pre-formatted — the makeblastdb call was
    # already commented out in the original.
    blast_runner.run_blast(query_fasta_path=input_fasta, blast_task='megablast', db_path=ref_db,
                           db_type='nucl', min_cov=min_cov, min_ident=min_ident, evalue=evalue,
                           blast_outfile=blast_results_file, num_threads=num_threads)
    if os.path.getsize(blast_results_file) == 0:
        return dict()
    blast_df = BlastReader(blast_results_file).df
    blast_df = blast_df.loc[blast_df['length'] >= min_length]
    blast_df = blast_df.loc[blast_df['pident'] >= min_ident]
    blast_df = blast_df.loc[blast_df['qcovs'] >= min_cov]
    blast_df = blast_df.loc[blast_df['qcovhsp'] >= 25]
    blast_df = fixStart(blast_df)
    blast_df = blast_df.sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                                    ascending=[True, True, True, False])
    blast_df = blast_df.reset_index(drop=True)
    contig_list = dict()
    for index, row in blast_df.iterrows():
        qid = row['qseqid']
        hit = {
            'id': row['sseqid'],
            'score': row['bitscore'],
            'contig_start': row['sstart'],
            'contig_end': row['send']
        }
        # Keep the highest-scoring hit per contig.
        # BUG FIX: the original comparison was inverted
        # (`stored_score > bitscore` replaced the stored hit), so the *lowest*
        # scoring hit ended up being kept.
        if qid not in contig_list or row['bitscore'] > contig_list[qid]['score']:
            contig_list[qid] = hit
    return contig_list
def filter_blast(blast_results_file, min_ident, min_cov, evalue, overlap):
    """Read blast results and return a DataFrame of hits filtered by identity
    and HSP coverage, with overlapping subject intervals removed.

    NOTE: returns an empty dict (not a DataFrame) when the results file is
    empty, so callers must handle both types.  The ``evalue`` parameter is
    accepted but unused (kept for signature compatibility).
    """
    if os.path.getsize(blast_results_file) == 0:
        return dict()
    blast_df = BlastReader(blast_results_file).df
    blast_df = blast_df.loc[blast_df['pident'] >= min_ident]
    blast_df = blast_df.loc[blast_df['qcovhsp'] >= min_cov]
    blast_df = fixStart(blast_df)
    blast_df = blast_df.sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                                    ascending=[True, True, True, False])
    blast_df = blast_df.reset_index(drop=True)
    # Drop overlapping hits repeatedly until the row count stops shrinking.
    # (The original compared str(len(...)) against an int sentinel; plain ints
    # give the identical iteration count and are clearer.)
    prev_size = -1
    size = len(blast_df)
    while size != prev_size:
        blast_df = filter_overlaping_records(blast_df, overlap, 'sseqid', 'sstart', 'send', 'bitscore')
        prev_size = size
        size = len(blast_df)
    return blast_df
def mob_blast(input_fasta, ref_db, min_ident, min_cov, evalue, tmp_dir, blast_results_file, overlap=5,
              num_threads=1):
    """tblastn the mobilization (relaxase) queries against ``ref_db`` and return
    the filtered, de-overlapped hit DataFrame (empty dict when blast found
    nothing).
    """
    # The num_threads argument is deliberately overridden to a single thread —
    # presumably a workaround for tblastn multi-threading issues; TODO confirm.
    num_threads = 1
    runner = BlastRunner(input_fasta, tmp_dir)
    runner.makeblastdb(ref_db, 'nucl')
    runner.run_tblastn(query_fasta_path=input_fasta, blast_task='megablast', db_path=ref_db,
                       db_type='nucl', min_cov=min_cov, min_ident=min_ident, evalue=evalue,
                       blast_outfile=blast_results_file, num_threads=num_threads)
    if os.path.getsize(blast_results_file) == 0:
        return dict()
    hits = BlastReader(blast_results_file).df
    # One combined mask instead of three sequential .loc filters.
    keep = ((hits['pident'] >= min_ident)
            & (hits['qcovs'] >= min_cov)
            & (hits['qcovhsp'] >= 25))
    hits = fixStart(hits.loc[keep])
    hits = hits.sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                            ascending=[True, True, True, False]).reset_index(drop=True)
    # Collapse overlapping subject intervals until the table stops shrinking.
    hits = filter_overlaping_records(hits, overlap, 'sseqid', 'sstart', 'send', 'bitscore')
    prev_count = -1
    count = len(hits)
    while count != prev_count:
        hits = filter_overlaping_records(hits, overlap, 'sseqid', 'sstart', 'send', 'bitscore')
        prev_count = count
        count = len(hits)
    return hits
def contig_blast(input_fasta, plasmid_db, min_ident, min_cov, evalue, min_length, tmp_dir,
                 blast_results_file, num_threads=1):
    """Blast assembly contigs against the plasmid database and write the
    length/coverage-filtered hits to ``<tmp_dir>/filtered_blast.txt``.

    Returns an empty dict when blast produced no hits; otherwise returns None
    after writing the filtered table (callers read the output file, not the
    return value).
    """
    filtered_blast = os.path.join(tmp_dir, 'filtered_blast.txt')
    blast_runner = BlastRunner(input_fasta, tmp_dir)
    blast_runner.run_blast(query_fasta_path=input_fasta, blast_task='megablast', db_path=plasmid_db,
                           db_type='nucl', min_cov=min_cov, min_ident=min_ident, evalue=evalue,
                           blast_outfile=blast_results_file, num_threads=num_threads, word_size=11)
    if os.path.getsize(blast_results_file) == 0:
        # Leave an explicitly empty filtered file so downstream readers do not
        # fail; the context manager replaces the original open/close pair.
        with open(filtered_blast, 'w', encoding="utf-8") as fh:
            fh.write('')
        return dict()
    blast_df = BlastReader(blast_results_file).df
    blast_df = blast_df.loc[blast_df['length'] >= min_length]
    # 400000 bp is a hard-coded maximum plasmid/query size — presumably an
    # upper plausibility bound; TODO confirm and consider parameterizing.
    blast_df = blast_df.loc[blast_df['qlen'] <= 400000]
    blast_df = blast_df.loc[blast_df['qlen'] >= min_length]
    blast_df = blast_df.loc[blast_df['qcovs'] >= min_cov]
    blast_df = blast_df.reset_index(drop=True)
    blast_df.to_csv(filtered_blast, sep='\t', header=False, line_terminator='\n', index=False)
def mcl_predict(blast_results_file, min_ident, min_cov, evalue, min_length, tmp_dir):
    """Filter blast hits, strip cluster ids out of the subject names and run
    MCL clustering on the filtered table.

    Returns:
        MCL cluster structure from ``mcl(...).getclusters()``; empty dict when
        the blast results file is empty.  ``min_ident`` and ``evalue`` are
        accepted but unused (kept for signature compatibility).
    """
    if os.path.getsize(blast_results_file) == 0:
        return dict()
    blast_df = BlastReader(blast_results_file).df
    blast_df = blast_df.loc[blast_df['length'] >= min_length]
    blast_df = blast_df.loc[blast_df['qlen'] <= 400000]
    # The qlen >= min_length filter was duplicated three times in the
    # original; applying it once is equivalent.
    blast_df = blast_df.loc[blast_df['qlen'] >= min_length]
    blast_df = blast_df.loc[blast_df['qcovs'] >= min_cov]
    blast_df = blast_df.reset_index(drop=True)
    for index, row in blast_df.iterrows():
        # Subject ids are formatted '<seqid>|<cluster_id>' (row[1] is the
        # sseqid column by position); keep only the cluster id.
        (seqid, clust_id) = row[1].split('|')
        blast_df.iloc[index, blast_df.columns.get_loc('sseqid')] = clust_id
    filtered_blast = os.path.join(tmp_dir, 'filtered_mcl_blast.txt')
    blast_df.to_csv(filtered_blast, sep='\t', header=False, line_terminator='\n', index=False)
    mcl_clusters = mcl(filtered_blast, tmp_dir).getclusters()
    return mcl_clusters
def tblastn(input_fasta, blastdb, min_ident, min_covs, evalue, out_dir, blast_results_file, logging,
            num_threads=1, min_covhsp=25, seq_id_file=None):
    """Run tblastn of protein queries against ``blastdb`` and overwrite
    ``blast_results_file`` with the threshold-filtered, sorted hit table.

    Returns:
        bool: True when filtered hits were written; False when the raw blast
        produced no output (the empty results file is deleted).
    """
    runner = BlastRunner(input_fasta, out_dir)
    # NOTE(review): db_type='protein' looks odd for tblastn (the subject db is
    # normally nucleotide); passed through unchanged — confirm against
    # BlastRunner.run_tblastn's expectations.
    runner.run_tblastn(query_fasta_path=input_fasta, blast_task='megablast', db_path=blastdb,
                       db_type='protein', min_cov=min_covs, min_ident=min_ident, evalue=evalue,
                       blast_outfile=blast_results_file, num_threads=num_threads,
                       seq_id_file=seq_id_file, logging=logging)
    if os.path.getsize(blast_results_file) == 0:
        os.remove(blast_results_file)
        return False
    hits = BlastReader(blast_results_file, logging).df
    keep = ((hits['pident'] >= min_ident)
            & (hits['qcovs'] >= min_covs)
            & (hits['qcovhsp'] >= min_covhsp)
            & (hits['evalue'] <= evalue))
    hits = fixStart(hits.loc[keep])
    hits = hits.sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                            ascending=[True, True, True, False]).reset_index(drop=True)
    hits.to_csv(blast_results_file, sep='\t', header=True, line_terminator='\n', index=False)
    return True
def run_blast(self, input_fasta, output_path, blast_results_file, logging, min_cov=1, min_ident=1,
              evalue=1, num_threads=1, min_length=25):
    """Self-blast ``input_fasta`` (megablast against a db built from itself)
    and write hits at least ``min_length`` long back to ``blast_results_file``
    (tab-separated, no header).

    Returns an empty dict when blast found nothing; otherwise returns None
    after rewriting the results file.
    """
    blast_runner = BlastRunner(input_fasta, output_path)
    blast_runner.makeblastdb(input_fasta, 'nucl', logging)
    blast_runner.run_blast(query_fasta_path=input_fasta, blast_task='megablast', db_path=input_fasta,
                           db_type='nucl', min_cov=min_cov, min_ident=min_ident, evalue=evalue,
                           blast_outfile=blast_results_file, num_threads=num_threads, word_size=11,
                           logging=logging)
    if os.path.getsize(blast_results_file) == 0:
        # Truncate to an explicitly empty file; the context manager replaces
        # the original unguarded open/write/close sequence.
        with open(blast_results_file, 'w', encoding="utf-8") as fh:
            fh.write('')
        return dict()
    blast_df = BlastReader(blast_results_file, logging).df
    blast_df = blast_df.loc[blast_df['length'] >= min_length]
    blast_df = blast_df.reset_index(drop=True)
    blast_df.to_csv(blast_results_file, sep='\t', header=False, line_terminator='\n', index=False)
def overhangDetection(self, blast_results_file, min_length=25):
    """Scan self-blast hits for contigs whose ends align to themselves and
    return {contig_id: circularity description}.
    """
    if os.path.getsize(blast_results_file) == 0:
        return dict()
    hits = BlastReader(blast_results_file).df
    circular_contigs = {}
    for _, hit in hits.iterrows():
        qid = hit['qseqid']
        sid = hit['sseqid']
        s_start = hit['sstart']
        s_end = hit['send']
        q_start = hit['qstart']
        q_end = hit['qend']
        qlen = hit['qlen']
        aln_len = hit['length']
        half = int(qlen / 2)
        # Only self-hits anchored at the very start of the contig matter.
        if qid != sid:
            continue
        if q_start != 1 or aln_len < min_length:
            continue
        # Skip the trivial full-length self alignment.
        if q_start == s_start and q_end == s_end:
            continue
        if (q_start == 1 and q_end == half) or \
                (q_start == half + 1 and q_end == qlen):
            # First half aligns to second half: the contig is a doubled copy.
            circular_contigs[qid] = 'Circular: Complete concatemer'
        elif q_start == 1 and s_end == qlen:
            # Leading bases re-align at the tail: terminal overlap.
            circular_contigs[qid] = 'Circular: Overlap {} bp'.format(aln_len)
    return circular_contigs
def overhangDetection(self, blast_results_file, logging, min_length=25):
    """Scan self-blast hits for contigs whose leading bases re-align at their
    own tail and return {contig_id: 'Circular: Overlap N bp'}.
    """
    if os.path.getsize(blast_results_file) == 0:
        return dict()
    hits = BlastReader(blast_results_file, logging).df.sort_values(
        ['qseqid', 'qstart', 'qend', 'bitscore'],
        ascending=[True, True, True, False])
    circular_contigs = {}
    for _, hit in hits.iterrows():
        qid = hit['qseqid']
        sid = hit['sseqid']
        s_start = int(hit['sstart'])
        s_end = int(hit['send'])
        q_start = int(hit['qstart'])
        q_end = int(hit['qend'])
        qlen = int(hit['qlen'])
        aln_len = int(hit['length'])
        # Subject must be the same contig, allowing the 'ref|<id>|' form
        # some blast databases use for the subject id.
        if qid != sid and sid != "ref|{}|".format(qid):
            continue
        # Only hits anchored at base 1 and long enough to be meaningful.
        if q_start != 1 or aln_len < min_length:
            continue
        # Skip the trivial full-length self alignment.
        if q_start == s_start and q_end == s_end:
            continue
        if q_start == 1 and s_end == qlen:
            circular_contigs[qid] = 'Circular: Overlap {} bp'.format(aln_len)
    return circular_contigs
def contig_blast_group(blast_results_file, overlap_threshold):
    """Assign each contig to its best-scoring reference cluster.

    Reads blast hits whose subject ids are formatted '<plasmid_id>|<cluster_id>',
    removes overlapping hits on subject coordinates, then returns
    {contig_id: {cluster_id: best_bitscore}} with exactly one cluster kept per
    contig (clusters are visited best-score-first, and the first cluster present
    in a contig wins).

    Returns an empty dict when the results file is empty.

    Cleanup notes: the original also built 'hits' and 'groups' accumulators
    (including a suspicious ``covered_bases += score``) that were never read —
    they have been removed; the str/int size-loop counters are now plain ints
    (identical iteration count).
    """
    if os.path.getsize(blast_results_file) == 0:
        return dict()
    blast_df = BlastReader(blast_results_file).df
    blast_df = blast_df.sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                                    ascending=[True, True, True, False])
    # Drop overlapping hits repeatedly until the row count is stable.
    blast_df = filter_overlaping_records(blast_df, overlap_threshold, 'sseqid', 'sstart', 'send', 'bitscore')
    prev_size = -1
    size = len(blast_df)
    while size != prev_size:
        blast_df = filter_overlaping_records(blast_df, overlap_threshold, 'sseqid', 'sstart', 'send', 'bitscore')
        prev_size = size
        size = len(blast_df)
    cluster_scores = dict()
    contigs = dict()
    for index, row in blast_df.iterrows():
        pID, clust_id = row['sseqid'].split('|')
        score = row['bitscore']
        contig_id = row['qseqid']
        # Best score seen for each cluster overall ...
        if clust_id not in cluster_scores or score > cluster_scores[clust_id]:
            cluster_scores[clust_id] = score
        # ... and the best score of each cluster per contig.
        if contig_id not in contigs:
            contigs[contig_id] = dict()
        contigs[contig_id].setdefault(clust_id, 0)
        if contigs[contig_id][clust_id] < score:
            contigs[contig_id][clust_id] = score
    # Visit clusters from best to worst; the first cluster found in a contig
    # collapses that contig's mapping to that single cluster.
    sorted_d = OrderedDict(
        sorted(iter(list(cluster_scores.items())), key=lambda x: x[1], reverse=True))
    for clust_id in sorted_d:
        for contig_id in contigs:
            if clust_id in contigs[contig_id]:
                contigs[contig_id] = {clust_id: contigs[contig_id][clust_id]}
    return contigs
def identify_biomarkers(contig_info, fixed_fasta, tmp_dir, min_length, logging, replicon_ref, min_rep_ident,
                        min_rep_cov, min_rep_evalue, replicon_blast_results, mob_ref, min_mob_ident,
                        min_mob_cov, min_mob_evalue, mob_blast_results, mpf_ref, min_mpf_ident, min_mpf_cov,
                        min_mpf_evalue, mpf_blast_results, repetitive_mask_file, min_rpp_ident, min_rpp_cov,
                        min_rpp_evalue, plasmid_orit, orit_blast_results, repetitive_blast_results,
                        num_threads=1):
    """Annotate ``contig_info`` with plasmid biomarker hits.

    Runs five biomarker searches against ``fixed_fasta`` — replicon (blastn),
    relaxase (tblastn), MPF (tblastn), oriT (blastn) and, when a mask file is
    supplied, repetitive elements — filters each hit table and merges results
    into ``contig_info`` via ``add_biomarker_results``.  Contigs matching the
    repetitive database are flagged for filtering.

    Returns:
        dict: the updated ``contig_info``.
    """
    # blast replicon database
    logging.info("Blasting replicon sequences {} against {}".format(replicon_ref, fixed_fasta))
    blastn(input_fasta=replicon_ref, blastdb=fixed_fasta, min_ident=min_rep_ident, min_cov=min_rep_cov,
           evalue=min_rep_evalue, min_length=80, out_dir=tmp_dir, blast_results_file=replicon_blast_results,
           num_threads=num_threads, logging=logging, min_hsp_cov=30)
    logging.info("Filtering replicon blast results {} ".format(replicon_blast_results))
    rep_blast_df = BlastReader(replicon_blast_results, logging=logging).df
    if len(rep_blast_df) > 0:
        # drop(0) removes the first row — presumably the header row written by
        # blastn() (header=True); confirm against BlastReader's parsing.
        rep_blast_df = rep_blast_df.drop(0)
        rep_blast_df = fixStart(rep_blast_df)
        rep_blast_df = remove_split_hits(rep_blast_df, 'qseqid', 'sseqid', 'sstart', 'send',
                                         'bitscore').sort_values(
            ['sseqid', 'sstart', 'send', 'bitscore'], ascending=[True, True, True, False])
        rep_blast_df = recursive_filter_overlap_records(rep_blast_df, 5, 'sseqid', 'sstart', 'send',
                                                        'bitscore')
        contig_info = add_biomarker_results(biomarker_df=rep_blast_df, df_column_name_biomarker='qseqid',
                                            df_column_name_seqid='sseqid', contig_info=contig_info,
                                            contig_info_type_key='rep_type(s)',
                                            contig_info_acs_key='rep_type_accession(s)', delimeter='|')
        del (rep_blast_df)
    # blast relaxase database
    logging.info("Blasting relaxase sequences {} against {}".format(mob_ref, fixed_fasta))
    tblastn(input_fasta=mob_ref, blastdb=fixed_fasta, min_ident=min_mob_ident, min_covs=min_mob_cov,
            evalue=min_mob_evalue, out_dir=tmp_dir, logging=logging,
            blast_results_file=mob_blast_results, num_threads=num_threads)
    logging.info("Filtering relaxase blast results {} ".format(mob_blast_results))
    mob_blast_df = BlastReader(mob_blast_results, logging).df
    if len(mob_blast_df) > 0:
        mob_blast_df = fixStart(mob_blast_df.drop(0).sort_values(
            ['sseqid', 'sstart', 'send', 'bitscore'], ascending=[True, True, True, False]))
        mob_blast_df = remove_split_hits(mob_blast_df, 'qseqid', 'sseqid', 'sstart', 'send',
                                         'bitscore').sort_values(
            ['sseqid', 'sstart', 'send', 'bitscore'], ascending=[True, True, True, False])
        mob_blast_df = recursive_filter_overlap_records(mob_blast_df, 5, 'sseqid', 'sstart', 'send',
                                                        'bitscore')
        add_biomarker_results(biomarker_df=mob_blast_df, df_column_name_biomarker='qseqid',
                              df_column_name_seqid='sseqid', contig_info=contig_info,
                              contig_info_type_key='relaxase_type(s)',
                              contig_info_acs_key='relaxase_type_accession(s)', delimeter='|')
        del (mob_blast_df)
    # blast mpf database
    logging.info("Blasting MPF sequences {} against {}".format(mpf_ref, fixed_fasta))
    tblastn(input_fasta=mpf_ref, blastdb=fixed_fasta, min_ident=min_mpf_ident, min_covs=min_mpf_cov,
            evalue=min_mpf_evalue, out_dir=tmp_dir, blast_results_file=mpf_blast_results,
            num_threads=num_threads, logging=logging)
    mpf_blast_df = BlastReader(mpf_blast_results, logging).df
    if len(mpf_blast_df) > 0:
        mpf_blast_df = fixStart(mpf_blast_df.drop(0)).sort_values(
            ['sseqid', 'sstart', 'send', 'bitscore'], ascending=[True, True, True, False])
        mpf_blast_df = remove_split_hits(mpf_blast_df, 'qseqid', 'sseqid', 'sstart', 'send',
                                         'bitscore').sort_values(
            ['sseqid', 'sstart', 'send', 'bitscore'], ascending=[True, True, True, False])
        mpf_blast_df = recursive_filter_overlap_records(mpf_blast_df, 5, 'sseqid', 'sstart', 'send',
                                                        'bitscore')
        logging.info("Filtering MPF blast results {} ".format(mpf_blast_results))
        add_biomarker_results(biomarker_df=mpf_blast_df, df_column_name_biomarker='qseqid',
                              df_column_name_seqid='sseqid', contig_info=contig_info,
                              contig_info_type_key='mpf_type',
                              contig_info_acs_key='mpf_type_accession(s)', delimeter='|')
        # BUG FIX: the original did `del (mpf_blast_results)`, deleting the
        # *filename* variable instead of the DataFrame — inconsistent with the
        # sibling del statements and leaving the large frame alive.
        del (mpf_blast_df)
    # blast orit database
    logging.info("Blasting orit sequences {} against {}".format(plasmid_orit, fixed_fasta))
    blastn(input_fasta=plasmid_orit, blastdb=fixed_fasta, min_ident=min_rep_ident, min_cov=min_rep_cov,
           evalue=min_rep_evalue, min_length=80, out_dir=tmp_dir, blast_results_file=orit_blast_results,
           num_threads=num_threads, logging=logging)
    logging.info("Filtering orit blast results {} ".format(orit_blast_results))
    orit_blast_df = BlastReader(orit_blast_results, logging).df
    if len(orit_blast_df) > 0:
        orit_blast_df = recursive_filter_overlap_records(fixStart(
            orit_blast_df.drop(0).sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                                              ascending=[True, True, True, False])),
            5, 'sseqid', 'sstart', 'send', 'bitscore')
        add_biomarker_results(biomarker_df=orit_blast_df, df_column_name_biomarker='qseqid',
                              df_column_name_seqid='sseqid', contig_info=contig_info,
                              contig_info_type_key='orit_type(s)',
                              contig_info_acs_key='orit_accession(s)', delimeter='|')
        del (orit_blast_df)
    # blast repetitive database
    if repetitive_mask_file is not None:
        logging.info("Blasting contigs against repetitive sequences db: {}".format(repetitive_mask_file))
        blastn(input_fasta=fixed_fasta, blastdb=repetitive_mask_file, min_ident=min_rpp_ident,
               min_cov=min_rpp_cov, evalue=min_rpp_evalue, min_length=min_length, out_dir=tmp_dir,
               blast_results_file=repetitive_blast_results, num_threads=num_threads, logging=logging)
        logging.info("Filtering repetitive blast results {} ".format(repetitive_blast_results))
        repetitive_blast_df = BlastReader(repetitive_blast_results, logging).df
        if len(repetitive_blast_df) > 0:
            # Note: de-overlapped on *query* coordinates here, unlike the
            # subject-coordinate filtering used above.
            repetitive_blast_df = recursive_filter_overlap_records(fixStart(
                repetitive_blast_df.drop(0).sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                                                        ascending=[True, True, True, False])),
                5, 'qseqid', 'qstart', 'qend', 'bitscore')
            repetitive_list = repetitive_blast_df['qseqid'].tolist()
            # add filtering flag to contigs which are primarially a repetitive element
            for contig_id in repetitive_list:
                if contig_id in contig_info:
                    logging.info('Filtering contig: {} due to repetitive sequence'.format(contig_id))
                    # (sic: 'repetitve' typo preserved — downstream code may
                    # match this exact string)
                    contig_info[contig_id]['filtering_reason'] = 'repetitve element'
                else:
                    logging.error('Contig: {} not found in contig_df this is likely an error'.format(contig_id))
            add_biomarker_results(biomarker_df=repetitive_blast_df, df_column_name_biomarker='sseqid',
                                  df_column_name_seqid='qseqid', contig_info=contig_info,
                                  contig_info_type_key='repetitive_dna_type',
                                  contig_info_acs_key='repetitive_dna_id', delimeter='|',
                                  type_col_num=2, type_acs_num=1)
            del (repetitive_blast_df)
    return contig_info