Example #1
def blastn(input_fasta, blastdb, min_ident, min_cov, evalue, min_length, out_dir, blast_results_file, logging,
           seq_filterfile=None, num_threads=1, max_length=400000, min_hsp_cov=1):
    blast_runner = BlastRunner(input_fasta, out_dir)
    blast_runner.run_blast(query_fasta_path=input_fasta, blast_task='megablast', db_path=blastdb,
                           db_type='nucl', min_cov=min_cov, min_ident=min_ident, evalue=evalue,
                           blast_outfile=blast_results_file, logging=logging, num_threads=num_threads, word_size=11,
                           seq_id_file=seq_filterfile)

    if os.path.getsize(blast_results_file) == 0:
        os.remove(blast_results_file)
        return False

    blast_df = BlastReader(blast_results_file, logging).df

    blast_df = blast_df.loc[blast_df['length'] >= min_length]
    blast_df = blast_df.loc[blast_df['qlen'] <= max_length]
    blast_df = blast_df.loc[blast_df['qcovs'] >= min_cov]
    blast_df = blast_df.loc[blast_df['qcovhsp'] >= min_hsp_cov]
    blast_df = blast_df.loc[blast_df['evalue'] <= evalue]
    blast_df = blast_df.loc[blast_df['pident'] >= min_ident]

    blast_df = blast_df.reset_index(drop=True)
    blast_df = fixStart(blast_df)
    blast_df.to_csv(blast_results_file, sep='\t', header=True, line_terminator='\n', index=False)

    return True
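
A minimal call sketch for the blastn helper above, assuming the mob_suite-style helpers it depends on (BlastRunner, BlastReader, fixStart) are importable. Everything except the function signature is an assumption: the file paths, thresholds, and logger name are hypothetical, and the nucleotide BLAST database behind blastdb is expected to exist already.

import logging
import os

logger = logging.getLogger('blastn_demo')                      # hypothetical logger
out_dir = 'blast_tmp'                                          # hypothetical working directory
os.makedirs(out_dir, exist_ok=True)

has_hits = blastn(input_fasta='replicons.fasta',               # hypothetical query FASTA
                  blastdb='assembly_db',                       # hypothetical pre-built nucleotide db
                  min_ident=80,
                  min_cov=60,
                  evalue=0.00001,
                  min_length=100,
                  out_dir=out_dir,
                  blast_results_file=os.path.join(out_dir, 'blastn_results.txt'),
                  logging=logger,
                  num_threads=4)

if not has_hits:
    logger.info('No hits passed the identity, coverage and length filters')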
Example #2
def repetitive_blast(input_fasta,
                     ref_db,
                     min_ident,
                     min_cov,
                     evalue,
                     min_length,
                     tmp_dir,
                     blast_results_file,
                     num_threads=1):
    blast_runner = BlastRunner(input_fasta, tmp_dir)
    #blast_runner.makeblastdb(ref_db, 'nucl')
    blast_runner.run_blast(query_fasta_path=input_fasta,
                           blast_task='megablast',
                           db_path=ref_db,
                           db_type='nucl',
                           min_cov=min_cov,
                           min_ident=min_ident,
                           evalue=evalue,
                           blast_outfile=blast_results_file,
                           num_threads=num_threads)
    if os.path.getsize(blast_results_file) == 0:
        return dict()

    blast_df = BlastReader(blast_results_file).df
    blast_df = blast_df.loc[blast_df['length'] >= min_length]
    blast_df = blast_df.loc[blast_df['pident'] >= min_ident]
    blast_df = blast_df.loc[blast_df['qcovs'] >= min_cov]
    blast_df = blast_df.loc[blast_df['qcovhsp'] >= 25]
    blast_df = fixStart(blast_df)
    blast_df = blast_df.sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                                    ascending=[True, True, True, False])
    blast_df = blast_df.reset_index(drop=True)

    contig_list = dict()
    for index, row in blast_df.iterrows():
        if not row['qseqid'] in contig_list:
            contig_list[row['qseqid']] = {
                'id': row['sseqid'],
                'score': row['bitscore'],
                'contig_start': row['sstart'],
                'contig_end': row['send']
            }
        else:
            # keep the highest-scoring hit for each query contig
            if row['bitscore'] > contig_list[row['qseqid']]['score']:
                contig_list[row['qseqid']] = {
                    'id': row['sseqid'],
                    'score': row['bitscore'],
                    'contig_start': row['sstart'],
                    'contig_end': row['send']
                }

    return contig_list
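
A hedged usage sketch for repetitive_blast. The paths, database name, and cutoffs are placeholders; only the signature and the shape of the returned dictionary (one best hit per query contig) come from the code above.

repeat_hits = repetitive_blast(input_fasta='contigs.fasta',        # hypothetical assembly
                               ref_db='repetitive_db',             # hypothetical pre-built BLAST db
                               min_ident=80,
                               min_cov=80,
                               evalue=0.00001,
                               min_length=300,
                               tmp_dir='blast_tmp',
                               blast_results_file='blast_tmp/repetitive_blast.txt',
                               num_threads=4)

for contig_id, hit in repeat_hits.items():
    print(contig_id, hit['id'], hit['score'], hit['contig_start'], hit['contig_end'])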
Example #3
def filter_blast(blast_results_file, min_ident, min_cov, evalue, overlap):
    if os.path.getsize(blast_results_file) == 0:
        return dict()
    blast_df = BlastReader(blast_results_file).df
    blast_df = blast_df.loc[blast_df['pident'] >= min_ident]
    blast_df = blast_df.loc[blast_df['qcovhsp'] >= min_cov]
    blast_df = fixStart(blast_df)
    blast_df = blast_df.sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                                    ascending=[True, True, True, False])
    blast_df = blast_df.reset_index(drop=True)
    size = len(blast_df)
    prev_size = -1
    while size != prev_size:
        blast_df = filter_overlaping_records(blast_df, overlap, 'sseqid',
                                             'sstart', 'send', 'bitscore')
        prev_size = size
        size = len(blast_df)

    return blast_df
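
The while loop above reapplies the overlap filter until the number of rows stops changing, i.e. it iterates to a fixed point. A self-contained sketch of the same pattern on a plain Python list, with a toy stand-in for filter_overlaping_records (both the data and the helper are fabricated for illustration):

def drop_adjacent_overlaps(intervals, overlap):
    """Toy stand-in for filter_overlaping_records: drop an interval that
    overlaps its predecessor by more than `overlap` bases."""
    kept = []
    for start, end in sorted(intervals):
        if kept and start <= kept[-1][1] - overlap:
            continue
        kept.append((start, end))
    return kept

intervals = [(1, 100), (50, 150), (140, 240), (500, 600)]
size = len(intervals)
prev_size = -1
while size != prev_size:            # reapply the filter until no more rows are removed
    intervals = drop_adjacent_overlaps(intervals, overlap=5)
    prev_size = size
    size = len(intervals)

print(intervals)                    # [(1, 100), (140, 240), (500, 600)]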
Example #4
def mob_blast(input_fasta,
              ref_db,
              min_ident,
              min_cov,
              evalue,
              tmp_dir,
              blast_results_file,
              overlap=5,
              num_threads=1):
    num_threads = 1
    blast_runner = BlastRunner(input_fasta, tmp_dir)
    blast_runner.makeblastdb(ref_db, 'nucl')
    blast_runner.run_tblastn(query_fasta_path=input_fasta,
                             blast_task='megablast',
                             db_path=ref_db,
                             db_type='nucl',
                             min_cov=min_cov,
                             min_ident=min_ident,
                             evalue=evalue,
                             blast_outfile=blast_results_file,
                             num_threads=num_threads)
    if os.path.getsize(blast_results_file) == 0:
        return dict()
    blast_df = BlastReader(blast_results_file).df
    blast_df = blast_df.loc[blast_df['pident'] >= min_ident]
    blast_df = blast_df.loc[blast_df['qcovs'] >= min_cov]
    blast_df = blast_df.loc[blast_df['qcovhsp'] >= 25]
    blast_df = fixStart(blast_df)
    blast_df = blast_df.sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                                    ascending=[True, True, True, False])
    blast_df = blast_df.reset_index(drop=True)
    blast_df = filter_overlaping_records(blast_df, overlap, 'sseqid', 'sstart',
                                         'send', 'bitscore')
    prev_size = -1
    size = len(blast_df)
    while size != prev_size:
        blast_df = filter_overlaping_records(blast_df, overlap, 'sseqid',
                                             'sstart', 'send', 'bitscore')
        prev_size = size
        size = len(blast_df)
    #print(blast_df)
    return blast_df
Example #5
def contig_blast(input_fasta, plasmid_db, min_ident, min_cov, evalue, min_length, tmp_dir, blast_results_file,
                 num_threads=1):
    filtered_blast = os.path.join(tmp_dir, 'filtered_blast.txt')
    blast_runner = BlastRunner(input_fasta, tmp_dir)
    blast_runner.run_blast(query_fasta_path=input_fasta, blast_task='megablast', db_path=plasmid_db,
                           db_type='nucl', min_cov=min_cov, min_ident=min_ident, evalue=evalue,
                           blast_outfile=blast_results_file, num_threads=num_threads, word_size=11)
    if os.path.getsize(blast_results_file) == 0:
        fh = open(filtered_blast, 'w', encoding="utf-8")
        fh.write('')
        fh.close()
        return dict()
    blast_df = BlastReader(blast_results_file).df
    blast_df = blast_df.loc[blast_df['length'] >= min_length]
    blast_df = blast_df.loc[blast_df['qlen'] <= 400000]
    blast_df = blast_df.loc[blast_df['qlen'] >= min_length]
    blast_df = blast_df.loc[blast_df['qcovs'] >= min_cov]
    blast_df = blast_df.reset_index(drop=True)
    blast_df.to_csv(filtered_blast, sep='\t', header=False, line_terminator='\n', index=False)
Example #6
def mcl_predict(blast_results_file, min_ident, min_cov, evalue, min_length,
                tmp_dir):
    if os.path.getsize(blast_results_file) == 0:
        return dict()

    blast_df = BlastReader(blast_results_file).df
    blast_df = blast_df.loc[blast_df['length'] >= min_length]
    blast_df = blast_df.loc[blast_df['qlen'] <= 400000]
    blast_df = blast_df.loc[blast_df['qlen'] >= min_length]
    blast_df = blast_df.loc[blast_df['qcovs'] >= min_cov]
    blast_df = blast_df.reset_index(drop=True)
    for index, row in blast_df.iterrows():
        (seqid, clust_id) = row['sseqid'].split('|')
        blast_df.iloc[index, blast_df.columns.get_loc('sseqid')] = clust_id

    filtered_blast = os.path.join(tmp_dir, 'filtered_mcl_blast.txt')
    blast_df.to_csv(filtered_blast,
                    sep='\t',
                    header=False,
                    line_terminator='\n',
                    index=False)
    mcl_clusters = mcl(filtered_blast, tmp_dir).getclusters()

    return mcl_clusters
Example #7
def tblastn(input_fasta, blastdb, min_ident, min_covs, evalue, out_dir, blast_results_file, logging, num_threads=1,
            min_covhsp=25, seq_id_file=None):
    blast_runner = BlastRunner(input_fasta, out_dir)

    blast_runner.run_tblastn(query_fasta_path=input_fasta, blast_task='megablast', db_path=blastdb,
                             db_type='protein', min_cov=min_covs, min_ident=min_ident, evalue=evalue,
                             blast_outfile=blast_results_file,
                             num_threads=num_threads, seq_id_file=seq_id_file, logging=logging)

    if os.path.getsize(blast_results_file) == 0:
        os.remove(blast_results_file)
        return False

    blast_df = BlastReader(blast_results_file, logging).df

    blast_df = blast_df.loc[blast_df['pident'] >= min_ident]
    blast_df = blast_df.loc[blast_df['qcovs'] >= min_covs]
    blast_df = blast_df.loc[blast_df['qcovhsp'] >= min_covhsp]
    blast_df = blast_df.loc[blast_df['evalue'] <= evalue]
    blast_df = fixStart(blast_df)
    blast_df = blast_df.sort_values(['sseqid', 'sstart', 'send', 'bitscore'], ascending=[True, True, True, False])
    blast_df = blast_df.reset_index(drop=True)
    blast_df.to_csv(blast_results_file, sep='\t', header=True, line_terminator='\n', index=False)

    return True
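
A hedged call sketch for the tblastn wrapper above, mirroring how it is invoked in identify_biomarkers further down; the protein query FASTA, assembly path, logger name, and thresholds are placeholders, and the surrounding BLAST helpers are assumed to be importable.

import logging
import os

logger = logging.getLogger('tblastn_demo')                     # hypothetical logger
os.makedirs('blast_tmp', exist_ok=True)

ok = tblastn(input_fasta='relaxases.faa',                      # hypothetical protein queries
             blastdb='assembly.fasta',                         # hypothetical nucleotide subject db
             min_ident=80,
             min_covs=80,
             evalue=0.00001,
             out_dir='blast_tmp',
             blast_results_file=os.path.join('blast_tmp', 'tblastn_results.txt'),
             logging=logger,
             num_threads=4)

if not ok:
    logger.info('tblastn produced no hits above the thresholds')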
Example #8
    def run_blast(self,
                  input_fasta,
                  output_path,
                  blast_results_file,
                  logging,
                  min_cov=1,
                  min_ident=1,
                  evalue=1,
                  num_threads=1,
                  min_length=25):
        blast_runner = BlastRunner(input_fasta, output_path)
        blast_runner.makeblastdb(input_fasta, 'nucl', logging)
        blast_runner.run_blast(query_fasta_path=input_fasta,
                               blast_task='megablast',
                               db_path=input_fasta,
                               db_type='nucl',
                               min_cov=min_cov,
                               min_ident=min_ident,
                               evalue=evalue,
                               blast_outfile=blast_results_file,
                               num_threads=num_threads,
                               word_size=11,
                               logging=logging)

        if os.path.getsize(blast_results_file) == 0:
            fh = open(blast_results_file, 'w', encoding="utf-8")
            fh.write('')
            fh.close()
            return dict()

        blast_df = BlastReader(blast_results_file, logging).df
        blast_df = blast_df.loc[blast_df['length'] >= min_length]
        blast_df = blast_df.reset_index(drop=True)
        blast_df.to_csv(blast_results_file,
                        sep='\t',
                        header=False,
                        line_terminator='\n',
                        index=False)
Example #9
    def overhangDetection(self, blast_results_file, min_length=25):
        if os.path.getsize(blast_results_file) == 0:
            return dict()

        blast_df = BlastReader(blast_results_file).df

        circular_contigs = {}

        for index, row in blast_df.iterrows():
            contig_id_query = row['qseqid']
            contig_id_subject = row['sseqid']
            contig_start_subject = row['sstart']
            contig_end_subject = row['send']
            contig_start_query = row['qstart']
            contig_end_query = row['qend']
            contig_length = row['qlen']
            mid_point = int(contig_length / 2)
            length = row['length']

            if contig_id_query != contig_id_subject:
                continue

            if contig_start_query != 1 or length < min_length:
                continue

            if contig_start_query == contig_start_subject and contig_end_query == contig_end_subject:
                continue


            if (contig_start_query == 1 and contig_end_query == mid_point) or \
                (contig_start_query == mid_point+1 and contig_end_query == contig_length):
                circular_contigs[
                    contig_id_query] = 'Circular: Complete concatemer'
            elif contig_start_query == 1 and contig_end_subject == contig_length:
                circular_contigs[
                    contig_id_query] = 'Circular: Overlap {} bp'.format(length)

        return circular_contigs
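
The circularity check above relies on self-hits: a contig is reported as circular either when one half aligns to the other half (a complete concatemer) or when its start re-aligns at its end (a terminal overlap). A self-contained toy walk-through of those two conditions on fabricated hit tuples (not real BLAST output):

# Fabricated self-hit records in outfmt-6-style fields (illustrative only).
toy_hits = [
    # qseqid, sseqid, qstart, qend, sstart, send, qlen, length
    ('contigA', 'contigA', 1, 500, 501, 1000, 1000, 500),   # first half matches second half
    ('contigB', 'contigB', 1, 120, 881, 1000, 1000, 120),   # start re-aligns at the end
]

for qseqid, sseqid, qstart, qend, sstart, send, qlen, length in toy_hits:
    mid_point = qlen // 2
    if qseqid != sseqid or qstart != 1:
        continue
    if (qstart, qend) == (sstart, send):
        continue                                            # trivial full-length self-alignment
    if (qstart == 1 and qend == mid_point) or (qstart == mid_point + 1 and qend == qlen):
        print(qseqid, '-> Circular: Complete concatemer')
    elif qstart == 1 and send == qlen:
        print(qseqid, '-> Circular: Overlap {} bp'.format(length))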
Example #10
    def overhangDetection(self, blast_results_file, logging, min_length=25):
        if os.path.getsize(blast_results_file) == 0:
            return dict()

        blast_df = BlastReader(blast_results_file, logging).df.sort_values(
            ['qseqid', 'qstart', 'qend', 'bitscore'],
            ascending=[True, True, True, False])

        circular_contigs = {}

        for index, row in blast_df.iterrows():
            contig_id_query = row['qseqid']
            contig_id_subject = row['sseqid']
            contig_start_subject = int(row['sstart'])
            contig_end_subject = int(row['send'])
            contig_start_query = int(row['qstart'])
            contig_end_query = int(row['qend'])
            contig_length = int(row['qlen'])
            length = int(row['length'])

            if contig_id_query != contig_id_subject and contig_id_subject != "ref|{}|".format(
                    contig_id_query):
                continue

            if contig_start_query != 1 or length < min_length:
                continue

            if contig_start_query == contig_start_subject and contig_end_query == contig_end_subject:
                continue

            if contig_start_query == 1 and contig_end_subject == contig_length:
                circular_contigs[
                    contig_id_query] = 'Circular: Overlap {} bp'.format(length)

        return circular_contigs
Example #11
def contig_blast_group(blast_results_file, overlap_threshold):
    if os.path.getsize(blast_results_file) == 0:
        return dict()
    blast_df = BlastReader(blast_results_file).df
    blast_df = blast_df.sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                                    ascending=[True, True, True, False])

    blast_df = filter_overlaping_records(blast_df, overlap_threshold, 'sseqid',
                                         'sstart', 'send', 'bitscore')
    size = len(blast_df)
    prev_size = -1
    while size != prev_size:
        blast_df = filter_overlaping_records(blast_df, overlap_threshold,
                                             'sseqid', 'sstart', 'send',
                                             'bitscore')
        prev_size = size
        size = len(blast_df)

    cluster_scores = dict()
    groups = dict()
    hits = dict()
    contigs = dict()
    for index, row in blast_df.iterrows():
        query = row['qseqid']
        pID, clust_id = row['sseqid'].split('|')
        score = row['bitscore']
        pLen = row['slen']
        contig_id = row['qseqid']

        if not pID in hits:
            hits[pID] = {
                'score': 0,
                'length': pLen,
                'covered_bases': 0,
                'clust_id': clust_id
            }

        if not clust_id in cluster_scores:
            cluster_scores[clust_id] = score
        elif score > cluster_scores[clust_id]:
            cluster_scores[clust_id] = score

        if not clust_id in groups:
            groups[clust_id] = dict()

        if not query in groups[clust_id]:
            groups[clust_id][query] = dict()

        if not contig_id in contigs:
            contigs[contig_id] = dict()

        if not clust_id in contigs[contig_id]:
            contigs[contig_id][clust_id] = 0

        if contigs[contig_id][clust_id] < score:
            contigs[contig_id][clust_id] = score

        groups[clust_id][query][contig_id] = score

        hits[pID]['score'] += score
        hits[pID]['covered_bases'] += score

    sorted_d = OrderedDict(
        sorted(iter(list(cluster_scores.items())),
               key=lambda x: x[1],
               reverse=True))

    for clust_id in sorted_d:
        score = sorted_d[clust_id]
        for contig_id in contigs:
            if clust_id in contigs[contig_id]:
                contigs[contig_id] = {clust_id: contigs[contig_id][clust_id]}

    return contigs
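
The final loop in contig_blast_group collapses every contig to a single cluster, walking clusters from the highest to the lowest score so that the best-scoring cluster wins. A self-contained toy version of just that collapse step, with fabricated cluster and contig scores:

from collections import OrderedDict

# Fabricated scores: per-cluster best bitscore, and per-contig scores by cluster.
cluster_scores = {'clust1': 950.0, 'clust2': 400.0}
contigs = {
    'contig_1': {'clust1': 950.0, 'clust2': 300.0},
    'contig_2': {'clust2': 400.0},
}

sorted_clusters = OrderedDict(sorted(cluster_scores.items(), key=lambda x: x[1], reverse=True))

for clust_id in sorted_clusters:
    for contig_id in contigs:
        if clust_id in contigs[contig_id]:
            # Collapse the contig to the first (highest-scoring) cluster it is seen in.
            contigs[contig_id] = {clust_id: contigs[contig_id][clust_id]}

print(contigs)   # {'contig_1': {'clust1': 950.0}, 'contig_2': {'clust2': 400.0}}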
Example #12
def identify_biomarkers(contig_info, fixed_fasta, tmp_dir, min_length, logging,
                        replicon_ref, min_rep_ident, min_rep_cov, min_rep_evalue, replicon_blast_results,
                        mob_ref, min_mob_ident, min_mob_cov, min_mob_evalue, mob_blast_results,
                        mpf_ref, min_mpf_ident, min_mpf_cov, min_mpf_evalue, mpf_blast_results,
                        repetitive_mask_file, min_rpp_ident, min_rpp_cov, min_rpp_evalue,
                        plasmid_orit, orit_blast_results, repetitive_blast_results,
                        num_threads=1):
    # blast replicon database
    logging.info("Blasting replicon sequences {} against {}".format(replicon_ref, fixed_fasta))
    blastn(input_fasta=replicon_ref, blastdb=fixed_fasta, min_ident=min_rep_ident, min_cov=min_rep_cov,
           evalue=min_rep_evalue, min_length=80, out_dir=tmp_dir,
           blast_results_file=replicon_blast_results, num_threads=num_threads, logging=logging, min_hsp_cov=30)

    logging.info("Filtering replicon blast results {} ".format(replicon_blast_results))
    rep_blast_df = BlastReader(replicon_blast_results, logging=logging).df
    if len(rep_blast_df) > 0:
        rep_blast_df = rep_blast_df.drop(0)
        rep_blast_df = fixStart(rep_blast_df)
        rep_blast_df = remove_split_hits(rep_blast_df, 'qseqid', 'sseqid', 'sstart', 'send', 'bitscore').sort_values(
            ['sseqid', 'sstart', 'send', 'bitscore'], ascending=[True, True, True, False])

        rep_blast_df = recursive_filter_overlap_records(rep_blast_df, 5, 'sseqid', 'sstart', 'send', 'bitscore')

        contig_info = add_biomarker_results(biomarker_df=rep_blast_df, df_column_name_biomarker='qseqid',
                                            df_column_name_seqid='sseqid', contig_info=contig_info,
                                            contig_info_type_key='rep_type(s)',
                                            contig_info_acs_key='rep_type_accession(s)', delimeter='|')

    del (rep_blast_df)

    # blast relaxase database
    logging.info("Blasting relaxase sequences {} against {}".format(mob_ref, fixed_fasta))
    tblastn(input_fasta=mob_ref, blastdb=fixed_fasta, min_ident=min_mob_ident, min_covs=min_mob_cov,
            evalue=min_mob_evalue, out_dir=tmp_dir, logging=logging,
            blast_results_file=mob_blast_results, num_threads=num_threads)

    logging.info("Filtering relaxase blast results {} ".format(mob_blast_results))

    mob_blast_df = BlastReader(mob_blast_results, logging).df
    if len(mob_blast_df) > 0:
        mob_blast_df = fixStart(mob_blast_df.drop(0).sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                                                                 ascending=[True, True, True, False]))
        mob_blast_df = remove_split_hits(mob_blast_df, 'qseqid', 'sseqid', 'sstart', 'send', 'bitscore').sort_values(
            ['sseqid', 'sstart', 'send', 'bitscore'], ascending=[True, True, True, False])
        mob_blast_df = recursive_filter_overlap_records(mob_blast_df, 5, 'sseqid', 'sstart', 'send',
                                                        'bitscore')

        add_biomarker_results(biomarker_df=mob_blast_df, df_column_name_biomarker='qseqid',
                              df_column_name_seqid='sseqid', contig_info=contig_info,
                              contig_info_type_key='relaxase_type(s)', contig_info_acs_key='relaxase_type_accession(s)',
                              delimeter='|')

    del (mob_blast_df)

    # blast mpf database
    logging.info("Blasting MPF sequences {} against {}".format(mpf_ref, fixed_fasta))
    tblastn(input_fasta=mpf_ref, blastdb=fixed_fasta, min_ident=min_mpf_ident, min_covs=min_mpf_cov,
            evalue=min_mpf_evalue, out_dir=tmp_dir,
            blast_results_file=mpf_blast_results, num_threads=num_threads, logging=logging)

    mpf_blast_df = BlastReader(mpf_blast_results, logging).df

    if len(mpf_blast_df) > 0:
        mpf_blast_df = fixStart(mpf_blast_df.drop(0)).sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                                                                  ascending=[True, True, True, False])
        mpf_blast_df = remove_split_hits(mpf_blast_df, 'qseqid', 'sseqid', 'sstart', 'send', 'bitscore').sort_values(
            ['sseqid', 'sstart', 'send', 'bitscore'], ascending=[True, True, True, False])
        mpf_blast_df = recursive_filter_overlap_records(mpf_blast_df, 5, 'sseqid', 'sstart', 'send',
                                                        'bitscore')

        logging.info("Filtering MPF blast results {} ".format(mpf_blast_results))

        add_biomarker_results(biomarker_df=mpf_blast_df, df_column_name_biomarker='qseqid',
                              df_column_name_seqid='sseqid', contig_info=contig_info,
                              contig_info_type_key='mpf_type', contig_info_acs_key='mpf_type_accession(s)',
                              delimeter='|')
    del (mpf_blast_df)

    # blast orit database
    logging.info("Blasting orit sequences {} against {}".format(plasmid_orit, fixed_fasta))
    blastn(input_fasta=plasmid_orit, blastdb=fixed_fasta, min_ident=min_rep_ident, min_cov=min_rep_cov,
           evalue=min_rep_evalue, min_length=80, out_dir=tmp_dir,
           blast_results_file=orit_blast_results, num_threads=num_threads, logging=logging)

    logging.info("Filtering orit blast results {} ".format(orit_blast_results))

    orit_blast_df = BlastReader(orit_blast_results, logging).df
    if len(orit_blast_df) > 0:
        orit_blast_df = recursive_filter_overlap_records(fixStart(
            orit_blast_df.drop(0).sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                                              ascending=[True, True, True, False])), 5, 'sseqid', 'sstart', 'send',
                                                         'bitscore')
        add_biomarker_results(biomarker_df=orit_blast_df, df_column_name_biomarker='qseqid',
                              df_column_name_seqid='sseqid', contig_info=contig_info,
                              contig_info_type_key='orit_type(s)', contig_info_acs_key='orit_accession(s)',
                              delimeter='|')

    del (orit_blast_df)

    # blast repetitive database
    if repetitive_mask_file is not None:
        logging.info("Blasting contigs against repetitive sequences db: {}".format(repetitive_mask_file))
        blastn(input_fasta=fixed_fasta, blastdb=repetitive_mask_file, min_ident=min_rpp_ident, min_cov=min_rpp_cov,
               evalue=min_rpp_evalue, min_length=min_length, out_dir=tmp_dir,
               blast_results_file=repetitive_blast_results, num_threads=num_threads, logging=logging)
        logging.info("Filtering repetitive blast results {} ".format(repetitive_blast_results))

        repetitive_blast_df = BlastReader(repetitive_blast_results, logging).df
        if len(repetitive_blast_df) > 0:
            repetitive_blast_df = recursive_filter_overlap_records(fixStart(
                repetitive_blast_df.drop(0).sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                                                        ascending=[True, True, True, False])), 5, 'qseqid', 'qstart',
                                                                   'qend',
                                                                   'bitscore')

            repetitive_list = repetitive_blast_df['qseqid'].tolist()

            # add filtering flag to contigs which are primarily a repetitive element
            for contig_id in repetitive_list:
                if contig_id in contig_info:
                    logging.info('Filtering contig: {} due to repetitive sequence'.format(contig_id))
                    contig_info[contig_id]['filtering_reason'] = 'repetitive element'
                else:
                    logging.error('Contig: {} not found in contig_info, this is likely an error'.format(contig_id))

            add_biomarker_results(biomarker_df=repetitive_blast_df, df_column_name_biomarker='sseqid',
                                  df_column_name_seqid='qseqid', contig_info=contig_info,
                                  contig_info_type_key='repetitive_dna_type', contig_info_acs_key='repetitive_dna_id',
                                  delimeter='|', type_col_num=2, type_acs_num=1)

        del (repetitive_blast_df)
    return contig_info