Ejemplo n.º 1
0
def tblastn(input_fasta, blastdb, min_ident, min_covs, evalue, out_dir, blast_results_file, logging, num_threads=1,
            min_covhsp=25, seq_id_file=None):
    """Run a tblastn search and rewrite the results file with only the hits that pass the filters.

    Args:
        input_fasta: Path of the query FASTA file.
        blastdb: Path of the BLAST database to search.
        min_ident: Minimum ``pident`` a hit must reach to be kept.
        min_covs: Minimum ``qcovs`` a hit must reach to be kept.
        evalue: Maximum ``evalue`` a hit may have to be kept.
        out_dir: Working directory handed to ``BlastRunner``.
        blast_results_file: Path BLAST writes to; overwritten with the filtered,
            sorted, tab-separated table (with header) on success.
        logging: Logger object forwarded to ``BlastRunner.run_tblastn`` and ``BlastReader``.
        num_threads: Thread count forwarded to the BLAST run.
        min_covhsp: Minimum ``qcovhsp`` a hit must reach to be kept.
        seq_id_file: Optional sequence-id file forwarded to the BLAST run.

    Returns:
        ``False`` if BLAST produced an empty results file (the file is deleted),
        ``True`` once the filtered table has been written back.
    """
    blast_runner = BlastRunner(input_fasta, out_dir)

    blast_runner.run_tblastn(query_fasta_path=input_fasta, blast_task='megablast', db_path=blastdb,
                             db_type='protein', min_cov=min_covs, min_ident=min_ident, evalue=evalue,
                             blast_outfile=blast_results_file,
                             num_threads=num_threads, seq_id_file=seq_id_file, logging=logging)

    # An empty output file means there is nothing to parse; remove it so no
    # zero-byte file is left behind.
    if os.path.getsize(blast_results_file) == 0:
        os.remove(blast_results_file)
        return False

    blast_df = BlastReader(blast_results_file, logging).df

    keep = (
        (blast_df['pident'] >= min_ident)
        & (blast_df['qcovs'] >= min_covs)
        & (blast_df['qcovhsp'] >= min_covhsp)
        & (blast_df['evalue'] <= evalue)
    )
    blast_df = fixStart(blast_df.loc[keep])
    blast_df = blast_df.sort_values(['sseqid', 'sstart', 'send', 'bitscore'], ascending=[True, True, True, False])
    blast_df = blast_df.reset_index(drop=True)

    # pandas 1.5 renamed ``line_terminator`` to ``lineterminator`` and pandas 2.0
    # removed the old spelling; use whichever the installed version accepts.
    import inspect
    if 'lineterminator' in inspect.signature(blast_df.to_csv).parameters:
        newline_kwarg = 'lineterminator'
    else:
        newline_kwarg = 'line_terminator'
    blast_df.to_csv(blast_results_file, sep='\t', header=True, index=False, **{newline_kwarg: '\n'})

    return True
Ejemplo n.º 2
0
def repetitive_blast(input_fasta,
                     ref_db,
                     min_ident,
                     min_cov,
                     evalue,
                     min_length,
                     tmp_dir,
                     blast_results_file,
                     num_threads=1,
                     min_covhsp=25):
    """BLAST the input against a nucleotide reference and report one hit per query sequence.

    Args:
        input_fasta: Path of the query FASTA file.
        ref_db: Path of the nucleotide BLAST database (used as-is; it is not
            built here).
        min_ident: Minimum ``pident`` a hit must reach to be kept.
        min_cov: Minimum ``qcovs`` a hit must reach to be kept.
        evalue: E-value threshold forwarded to the BLAST run.
        min_length: Minimum alignment ``length`` a hit must reach to be kept.
        tmp_dir: Working directory handed to ``BlastRunner``.
        blast_results_file: Path the BLAST output is written to and read from.
        num_threads: Thread count forwarded to the BLAST run.
        min_covhsp: Minimum ``qcovhsp`` a hit must reach to be kept
            (previously a fixed 25).

    Returns:
        A dict keyed by ``qseqid``; each value is a dict with the keys ``id``
        (``sseqid``), ``score`` (``bitscore``), ``contig_start`` (``sstart``)
        and ``contig_end`` (``send``) of the selected hit. Empty when BLAST
        produced no output.
    """
    blast_runner = BlastRunner(input_fasta, tmp_dir)
    blast_runner.run_blast(query_fasta_path=input_fasta,
                           blast_task='megablast',
                           db_path=ref_db,
                           db_type='nucl',
                           min_cov=min_cov,
                           min_ident=min_ident,
                           evalue=evalue,
                           blast_outfile=blast_results_file,
                           num_threads=num_threads)
    if os.path.getsize(blast_results_file) == 0:
        return dict()

    blast_df = BlastReader(blast_results_file).df
    keep = (
        (blast_df['length'] >= min_length)
        & (blast_df['pident'] >= min_ident)
        & (blast_df['qcovs'] >= min_cov)
        & (blast_df['qcovhsp'] >= min_covhsp)
    )
    blast_df = fixStart(blast_df.loc[keep])
    blast_df = blast_df.sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                                    ascending=[True, True, True, False])
    blast_df = blast_df.reset_index(drop=True)

    contig_list = dict()
    for _, row in blast_df.iterrows():
        query_id = row['qseqid']
        current = contig_list.get(query_id)
        # NOTE(review): an existing entry is replaced only when the new hit has a
        # strictly LOWER bitscore, so the lowest-scoring hit per query wins.
        # This looks inverted, but is kept as-is because callers may depend on
        # it -- confirm the intent before flipping the comparison.
        if current is None or current['score'] > row['bitscore']:
            contig_list[query_id] = {
                'id': row['sseqid'],
                'score': row['bitscore'],
                'contig_start': row['sstart'],
                'contig_end': row['send']
            }

    return contig_list
Ejemplo n.º 3
0
def filter_blast(blast_results_file, min_ident, min_cov, evalue, overlap):
    """Filter an existing BLAST results file and prune overlapping hits.

    Args:
        blast_results_file: Path of the BLAST output to read.
        min_ident: Minimum ``pident`` a hit must reach to be kept.
        min_cov: Minimum ``qcovhsp`` a hit must reach to be kept.
        evalue: Maximum ``evalue`` a hit may have to be kept. Skipped when
            ``None`` or when the table has no ``evalue`` column.
        overlap: Overlap threshold forwarded to ``filter_overlaping_records``.

    Returns:
        An empty dict when the results file is empty; otherwise the filtered
        DataFrame, sorted by subject id, start, end and descending bitscore.
    """
    if os.path.getsize(blast_results_file) == 0:
        return dict()

    blast_df = BlastReader(blast_results_file).df
    blast_df = blast_df.loc[blast_df['pident'] >= min_ident]
    blast_df = blast_df.loc[blast_df['qcovhsp'] >= min_cov]
    # The e-value threshold used to be accepted but never applied.
    if evalue is not None and 'evalue' in blast_df.columns:
        blast_df = blast_df.loc[blast_df['evalue'] <= evalue]
    blast_df = fixStart(blast_df)
    blast_df = blast_df.sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                                    ascending=[True, True, True, False])
    blast_df = blast_df.reset_index(drop=True)

    # Re-run the overlap filter until a pass no longer changes the row count.
    # The ``None`` sentinel guarantees at least one pass.
    prev_size = None
    size = len(blast_df)
    while size != prev_size:
        blast_df = filter_overlaping_records(blast_df, overlap, 'sseqid',
                                             'sstart', 'send', 'bitscore')
        prev_size, size = size, len(blast_df)

    return blast_df
Ejemplo n.º 4
0
def mob_blast(input_fasta,
              ref_db,
              min_ident,
              min_cov,
              evalue,
              tmp_dir,
              blast_results_file,
              overlap=5,
              num_threads=1,
              min_covhsp=25):
    """Build the reference database, run tblastn against it and return the pruned hits.

    Args:
        input_fasta: Path of the query FASTA file.
        ref_db: Path of the reference; a nucleotide BLAST database is built
            from it before searching.
        min_ident: Minimum ``pident`` a hit must reach to be kept.
        min_cov: Minimum ``qcovs`` a hit must reach to be kept.
        evalue: E-value threshold forwarded to the BLAST run.
        tmp_dir: Working directory handed to ``BlastRunner``.
        blast_results_file: Path the BLAST output is written to and read from.
        overlap: Overlap threshold forwarded to ``filter_overlaping_records``.
        num_threads: Currently ignored -- the search always runs with one
            thread (see the note in the body).
        min_covhsp: Minimum ``qcovhsp`` a hit must reach to be kept
            (previously a fixed 25).

    Returns:
        An empty dict when BLAST produced no output; otherwise the filtered
        DataFrame, sorted by subject id, start, end and descending bitscore.
    """
    # NOTE(review): the caller's num_threads is overridden unconditionally.
    # Kept because it may be a deliberate workaround; confirm and either drop
    # the override or remove the parameter.
    num_threads = 1
    blast_runner = BlastRunner(input_fasta, tmp_dir)
    blast_runner.makeblastdb(ref_db, 'nucl')
    blast_runner.run_tblastn(query_fasta_path=input_fasta,
                             blast_task='megablast',
                             db_path=ref_db,
                             db_type='nucl',
                             min_cov=min_cov,
                             min_ident=min_ident,
                             evalue=evalue,
                             blast_outfile=blast_results_file,
                             num_threads=num_threads)
    if os.path.getsize(blast_results_file) == 0:
        return dict()

    blast_df = BlastReader(blast_results_file).df
    keep = (
        (blast_df['pident'] >= min_ident)
        & (blast_df['qcovs'] >= min_cov)
        & (blast_df['qcovhsp'] >= min_covhsp)
    )
    blast_df = fixStart(blast_df.loc[keep])
    blast_df = blast_df.sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                                    ascending=[True, True, True, False])
    blast_df = blast_df.reset_index(drop=True)

    # One unconditional pass, then repeat until a pass no longer changes the
    # row count. The ``None`` sentinel guarantees at least one pass in the loop.
    blast_df = filter_overlaping_records(blast_df, overlap, 'sseqid', 'sstart',
                                         'send', 'bitscore')
    prev_size = None
    size = len(blast_df)
    while size != prev_size:
        blast_df = filter_overlaping_records(blast_df, overlap, 'sseqid',
                                             'sstart', 'send', 'bitscore')
        prev_size, size = size, len(blast_df)

    return blast_df
Ejemplo n.º 5
0
def contig_blast_group(blast_results_file, overlap_threshold):
    """Assign every query contig to a single cluster based on BLAST bitscores.

    Hits are read from ``blast_results_file``, sorted, and passed through
    ``filter_overlaping_records`` until the row count stops changing. Each
    ``sseqid`` must have the form ``<id>|<cluster_id>``. A contig that hit
    several clusters is assigned to the one whose best bitscore over all hits
    is highest; ties keep the cluster encountered first.

    Args:
        blast_results_file: Path of the BLAST output to read.
        overlap_threshold: Overlap threshold forwarded to
            ``filter_overlaping_records``.

    Returns:
        A dict mapping each ``qseqid`` to a one-entry dict
        ``{cluster_id: score}``, where ``score`` is the highest bitscore that
        contig obtained against the cluster (never below 0). Empty when the
        results file is empty.

    Raises:
        ValueError: If an ``sseqid`` does not split into exactly two parts
            on ``'|'``.
    """
    if os.path.getsize(blast_results_file) == 0:
        return dict()

    blast_df = BlastReader(blast_results_file).df
    blast_df = blast_df.sort_values(['sseqid', 'sstart', 'send', 'bitscore'],
                                    ascending=[True, True, True, False])

    # One unconditional pass, then repeat until a pass no longer changes the
    # row count. The ``None`` sentinel guarantees at least one pass in the loop.
    blast_df = filter_overlaping_records(blast_df, overlap_threshold, 'sseqid',
                                         'sstart', 'send', 'bitscore')
    prev_size = None
    size = len(blast_df)
    while size != prev_size:
        blast_df = filter_overlaping_records(blast_df, overlap_threshold,
                                             'sseqid', 'sstart', 'send',
                                             'bitscore')
        prev_size, size = size, len(blast_df)

    cluster_scores = dict()  # cluster id -> best bitscore over all hits
    contigs = dict()         # contig id -> {cluster id -> best bitscore}
    for _, row in blast_df.iterrows():
        contig_id = row['qseqid']
        # Only the cluster part is needed; the unpacking still enforces the
        # two-part '<id>|<cluster_id>' format.
        _, clust_id = row['sseqid'].split('|')
        score = row['bitscore']

        if clust_id not in cluster_scores or score > cluster_scores[clust_id]:
            cluster_scores[clust_id] = score

        per_cluster = contigs.setdefault(contig_id, dict())
        if per_cluster.setdefault(clust_id, 0) < score:
            per_cluster[clust_id] = score

    # Rank clusters by best bitscore, highest first. ``sorted`` is stable, so
    # equal scores keep their first-seen order.
    ranked = sorted(cluster_scores, key=cluster_scores.get, reverse=True)
    rank_of = {clust_id: position for position, clust_id in enumerate(ranked)}

    # Keep, for each contig, only its best-ranked cluster.
    assigned = dict()
    for contig_id, per_cluster in contigs.items():
        best_cluster = min(per_cluster, key=rank_of.__getitem__)
        assigned[contig_id] = {best_cluster: per_cluster[best_cluster]}

    return assigned