Beispiel #1
0
def align_rna_metagenomics(
    inBam,
    db,
    taxDb,
    outReport,
    dupeReport=None,
    outBam=None,
    dupeLca=None,
    outLca=None,
    sensitive=None,
    JVMmemory=None,
    numThreads=None,
    picardOptions=None,
    min_score_to_output=None,
):
    '''
        Align to metagenomics bwa index, mark duplicates, and generate LCA report

        inBam: reads handed to bwa.mem.
        db: bwa index handed to bwa.mem.
        taxDb: directory passed to TaxonomyDb as tax_dir.
        outReport / outLca: outputs of sam_lca_report on the deduplicated
            alignments.
        dupeReport / dupeLca: when dupeReport is set, sam_lca_report is also
            run (with unique_only=False) on a name-sorted copy of the
            alignments before duplicate marking.
        outBam: if given, the MarkDuplicates output is written here and kept;
            otherwise a temp file is used and removed.
        sensitive: if truthy, adds '-k 12 -A 1 -B 1 -O 1 -E 1' to the bwa
            options.
        JVMmemory: passed to MarkDuplicates; the tool's jvmMemDefault is used
            when it is not given.
        numThreads: passed to samtools.sort for the pre-dedup report only.
        picardOptions: extra options for MarkDuplicates (None means none).
        min_score_to_output: passed to bwa.mem as min_qual; any falsy value
            (including 0) is replaced by 30.
    '''
    picardOptions = picardOptions if picardOptions else []

    bwa = tools.bwa.Bwa()
    samtools = tools.samtools.SamtoolsTool()

    bwa_opts = ['-a']
    if sensitive:
        bwa_opts += '-k 12 -A 1 -B 1 -O 1 -E 1'.split()

    map_threshold = min_score_to_output or 30

    aln_bam = util.file.mkstempfname('.bam')
    bwa.mem(inBam, db, aln_bam, options=bwa_opts, min_qual=map_threshold)

    tax_db = TaxonomyDb(tax_dir=taxDb, load_names=True, load_nodes=True)

    if dupeReport:
        # Optional report on the alignments as they are before duplicate
        # marking; it works from a name-sorted temp copy.
        aln_bam_sorted = util.file.mkstempfname('.align_namesorted.bam')
        samtools.sort(aln_bam, aln_bam_sorted, args=['-n'], threads=numThreads)
        sam_lca_report(tax_db,
                       aln_bam_sorted,
                       outReport=dupeReport,
                       outLca=dupeLca,
                       unique_only=False)
        os.unlink(aln_bam_sorted)

    aln_bam_deduped = outBam if outBam else util.file.mkstempfname(
        '.align_deduped.bam')
    opts = list(picardOptions)
    dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
    pic = tools.picard.MarkDuplicatesTool()
    # Honor the caller-supplied JVM memory setting; previously the argument
    # was accepted but silently ignored in favor of the tool default.
    pic.execute([aln_bam],
                aln_bam_deduped,
                dupe_removal_out_metrics,
                picardOptions=opts,
                JVMmemory=JVMmemory or pic.jvmMemDefault)
    os.unlink(aln_bam)
    # The metrics file is not exposed to the caller, so do not leave it behind.
    if os.path.isfile(dupe_removal_out_metrics):
        os.unlink(dupe_removal_out_metrics)

    sam_lca_report(tax_db, aln_bam_deduped, outReport=outReport, outLca=outLca)

    if not outBam:
        os.unlink(aln_bam_deduped)
Beispiel #2
0
def bwamem_idxstats(inBam,
                    refFasta,
                    outBam=None,
                    outStats=None,
                    min_score_to_filter=None,
                    aligner_options=None):
    ''' Take reads, align to reference with BWA-MEM and perform samtools idxstats.

        inBam: reads handed to bwa.mem.
        refFasta: reference FASTA; it is copied to a temp file and the copy
            is indexed, so no index files are written next to refFasta.
        outBam: if given, the alignment is written here and kept; otherwise
            a temp file is used and removed.
        outStats: if given, samtools idxstats output for the alignment.
        min_score_to_filter: passed through to bwa.mem.
        aligner_options: whitespace-separated string of extra bwa options.

        Raises AssertionError if neither outBam nor outStats is given.
    '''

    assert outBam or outStats, "Either outBam or outStats must be specified"

    if outBam is None:
        bam_aligned = mkstempfname('.aligned.bam')
    else:
        bam_aligned = outBam

    samtools = tools.samtools.SamtoolsTool()
    bwa = tools.bwa.Bwa()

    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(refFasta, ref_indexed)

    try:
        bwa.index(ref_indexed)

        bwa_opts = [] if aligner_options is None else aligner_options.split()
        # Align against the copy that was just indexed. The original code
        # passed refFasta here, which has no index built by this function.
        bwa.mem(inBam,
                ref_indexed,
                bam_aligned,
                options=bwa_opts,
                min_score_to_filter=min_score_to_filter)

        if outStats is not None:
            samtools.idxstats(bam_aligned, outStats)
    finally:
        # Remove the temp reference copy (empty extension) and the index
        # side files bwa wrote next to it.
        for ext in ('.amb', '.ann', '.bwt', '.bwa', '.pac', '.sa', ''):
            file_to_remove = ref_indexed + ext
            if os.path.isfile(file_to_remove):
                os.unlink(file_to_remove)

        if outBam is None and os.path.isfile(bam_aligned):
            os.unlink(bam_aligned)
Beispiel #3
0
def align_rna_metagenomics(
    inBam,
    db,
    taxDb,
    outReport,
    dupeReport=None,
    outBam=None,
    dupeLca=None,
    outLca=None,
    sensitive=None,
    JVMmemory=None,
    numThreads=None,
    picardOptions=None,
    min_score_to_output=None,
):
    """
        Align to metagenomics bwa index, mark duplicates, and generate LCA report

        inBam: reads handed to bwa.mem.
        db: bwa index handed to bwa.mem.
        taxDb: directory passed to TaxonomyDb as tax_dir.
        outReport / outLca: outputs of sam_lca_report on the deduplicated
            alignments.
        dupeReport / dupeLca: when dupeReport is set, sam_lca_report is also
            run (with unique_only=False) on a name-sorted copy of the
            alignments before duplicate marking.
        outBam: if given, the MarkDuplicates output is written here and kept;
            otherwise a temp file is used and removed.
        sensitive: if truthy, adds "-k 12 -A 1 -B 1 -O 1 -E 1" to the bwa
            options.
        JVMmemory: passed to MarkDuplicates; the tool's jvmMemDefault is used
            when it is not given.
        numThreads: passed to samtools.sort for the pre-dedup report only.
        picardOptions: extra options for MarkDuplicates (None means none).
        min_score_to_output: passed to bwa.mem as min_qual; any falsy value
            (including 0) is replaced by 30.
    """
    picardOptions = picardOptions if picardOptions else []

    bwa = tools.bwa.Bwa()
    samtools = tools.samtools.SamtoolsTool()

    bwa_opts = ["-a"]
    if sensitive:
        bwa_opts += "-k 12 -A 1 -B 1 -O 1 -E 1".split()

    map_threshold = min_score_to_output or 30

    aln_bam = util.file.mkstempfname(".bam")
    bwa.mem(inBam, db, aln_bam, options=bwa_opts, min_qual=map_threshold)

    tax_db = TaxonomyDb(tax_dir=taxDb, load_names=True, load_nodes=True)

    if dupeReport:
        # Optional report on the alignments as they are before duplicate
        # marking; it works from a name-sorted temp copy.
        aln_bam_sorted = util.file.mkstempfname(".align_namesorted.bam")
        samtools.sort(aln_bam, aln_bam_sorted, args=["-n"], threads=numThreads)
        sam_lca_report(tax_db, aln_bam_sorted, outReport=dupeReport, outLca=dupeLca, unique_only=False)
        os.unlink(aln_bam_sorted)

    aln_bam_deduped = outBam if outBam else util.file.mkstempfname(".align_deduped.bam")
    opts = list(picardOptions)
    dupe_removal_out_metrics = util.file.mkstempfname(".metrics")
    pic = tools.picard.MarkDuplicatesTool()
    # Honor the caller-supplied JVM memory setting; previously the argument
    # was accepted but silently ignored in favor of the tool default.
    pic.execute(
        [aln_bam],
        aln_bam_deduped,
        dupe_removal_out_metrics,
        picardOptions=opts,
        JVMmemory=JVMmemory or pic.jvmMemDefault,
    )
    os.unlink(aln_bam)
    # The metrics file is not exposed to the caller, so do not leave it behind.
    if os.path.isfile(dupe_removal_out_metrics):
        os.unlink(dupe_removal_out_metrics)

    sam_lca_report(tax_db, aln_bam_deduped, outReport=outReport, outLca=outLca)

    if not outBam:
        os.unlink(aln_bam_deduped)
Beispiel #4
0
def bwamem_idxstats(inBam, refFasta, outBam=None, outStats=None):
    ''' Take reads, align to reference with BWA-MEM and perform samtools idxstats.

        inBam: reads handed to bwa.mem.
        refFasta: reference handed to bwa.mem as-is.
        outBam: if given, the alignment is written here and kept; otherwise
            a temp file is used and removed.
        outStats: if given, samtools idxstats output for the alignment.
    '''
    if outBam is None:
        bam_aligned = mkstempfname('.aligned.bam')
    else:
        bam_aligned = outBam

    samtools = tools.samtools.SamtoolsTool()
    bwa = tools.bwa.Bwa()

    try:
        bwa.mem(inBam, refFasta, bam_aligned)

        if outStats is not None:
            samtools.idxstats(bam_aligned, outStats)
    finally:
        # Remove the temp alignment even when bwa or samtools raised, so a
        # failed run does not leave it behind.
        if outBam is None and os.path.isfile(bam_aligned):
            os.unlink(bam_aligned)
Beispiel #5
0
def bwamem_idxstats(inBam, refFasta, outBam=None, outStats=None):
    ''' Take reads, align to reference with BWA-MEM and perform samtools idxstats.

        inBam: reads handed to bwa.mem.
        refFasta: reference handed to bwa.mem as-is.
        outBam: if given, the alignment is written here and kept; otherwise
            a temp file is used and removed.
        outStats: if given, samtools idxstats output for the alignment.

        Raises AssertionError if neither outBam nor outStats is given.
    '''

    assert outBam or outStats, "Either outBam or outStats must be specified"

    if outBam is None:
        bam_aligned = mkstempfname('.aligned.bam')
    else:
        bam_aligned = outBam

    samtools = tools.samtools.SamtoolsTool()
    bwa = tools.bwa.Bwa()

    try:
        bwa.mem(inBam, refFasta, bam_aligned)

        if outStats is not None:
            samtools.idxstats(bam_aligned, outStats)
    finally:
        # Remove the temp alignment even when bwa or samtools raised, so a
        # failed run does not leave it behind.
        if outBam is None and os.path.isfile(bam_aligned):
            os.unlink(bam_aligned)
Beispiel #6
0
def align_rna_metagenomics(
    inBam,
    db,
    taxDb,
    outReport,
    dupeReport=None,
    outBam=None,
    dupeReads=None,
    outReads=None,
    sensitive=None,
    JVMmemory=None,
    numThreads=None,
    picardOptions=None,
):
    '''
        Align to metagenomics bwa index, mark duplicates, and generate LCA report

        inBam: reads handed to bwa.mem.
        db: bwa index handed to bwa.mem.
        taxDb: directory passed to TaxonomyDb as tax_dir.
        outReport / outReads: outputs of sam_lca_report on the name-sorted,
            deduplicated alignments.
        dupeReport / dupeReads: when dupeReport is set, sam_lca_report is
            also run (with unique_only=False) on a name-sorted copy of the
            alignments before duplicate marking.
        outBam: if given, the MarkDuplicates output is written here and kept;
            otherwise a temp file is used and removed.
        sensitive: if truthy, adds '-k 12 -A 1 -B 1 -O 1 -E 1' to the bwa
            options.
        JVMmemory: passed through to MarkDuplicates.
        numThreads: passed to both samtools.sort calls.
        picardOptions: extra options for MarkDuplicates (None means none).
    '''
    picardOptions = picardOptions if picardOptions else []

    bwa = tools.bwa.Bwa()
    samtools = tools.samtools.SamtoolsTool()

    bwa_opts = ['-a']
    if sensitive:
        bwa_opts += '-k 12 -A 1 -B 1 -O 1 -E 1'.split()

    # TODO: Use bwa.mem's min_score_to_filter argument to decrease false
    # positives in the output. Currently, it works by summing the alignment
    # score across all alignments output by bwa for each query (reads in a
    # pair, supplementary, and secondary alignments). This is not reasonable
    # for reads with secondary alignments because it will be easier for those
    # reads/queries to exceed the threshold given by the value of the argument.
    # In this context, bwa is called using '-a' as an option and its output
    # will likely include many secondary alignments. One option is to add
    # another argument to bwa.mem, similar to min_score_to_filter, that sets a
    # threshold on the alignment score of output alignments but only filters on
    # a per-alignment level (i.e., not by summing alignment scores across all
    # alignments for each query).

    aln_bam = util.file.mkstempfname('.bam')
    bwa.mem(inBam, db, aln_bam, options=bwa_opts)

    tax_db = TaxonomyDb(tax_dir=taxDb, load_names=True, load_nodes=True)

    if dupeReport:
        # Optional report on the alignments as they are before duplicate
        # marking; it works from a name-sorted temp copy.
        aln_bam_sorted = util.file.mkstempfname('.align_namesorted.bam')
        samtools.sort(aln_bam, aln_bam_sorted, args=['-n'], threads=numThreads)
        sam_lca_report(tax_db,
                       aln_bam_sorted,
                       outReport=dupeReport,
                       outReads=dupeReads,
                       unique_only=False)
        os.unlink(aln_bam_sorted)

    aln_bam_deduped = outBam if outBam else util.file.mkstempfname(
        '.align_deduped.bam')
    opts = list(picardOptions)
    dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
    pic = tools.picard.MarkDuplicatesTool()
    pic.execute([aln_bam],
                aln_bam_deduped,
                dupe_removal_out_metrics,
                picardOptions=opts,
                JVMmemory=JVMmemory)

    os.unlink(aln_bam)
    # The metrics file is not exposed to the caller, so do not leave it behind.
    if os.path.isfile(dupe_removal_out_metrics):
        os.unlink(dupe_removal_out_metrics)

    # The final report is generated from a name-sorted copy of the
    # MarkDuplicates output.
    aln_bam_dd_sorted = util.file.mkstempfname('.bam')
    samtools.sort(aln_bam_deduped,
                  aln_bam_dd_sorted,
                  args=['-n'],
                  threads=numThreads)
    sam_lca_report(tax_db,
                   aln_bam_dd_sorted,
                   outReport=outReport,
                   outReads=outReads)
    # The name-sorted copy is an intermediate; the original never removed it.
    os.unlink(aln_bam_dd_sorted)

    if not outBam:
        os.unlink(aln_bam_deduped)
Beispiel #7
0
def align_and_plot_coverage(
    out_plot_file,
    plot_format,
    plot_data_style,
    plot_style,
    plot_width,
    plot_height,
    plot_dpi,
    plot_title,
    base_q_threshold,
    mapping_q_threshold,
    max_coverage_depth,
    read_length_threshold,
    out_summary,
    in_bam,
    ref_fasta,
    out_bam=None,
    sensitive=False,
    excludeDuplicates=False,
    JVMmemory=None,
    picardOptions=None,
    min_score_to_output=None
):
    '''
        Take reads, align to reference with BWA-MEM, and generate a coverage plot

        The plot_* arguments, the *_threshold arguments, max_coverage_depth,
        out_plot_file and out_summary are forwarded unchanged to plot_coverage.

        in_bam: reads handed to bwa.mem.
        ref_fasta: reference FASTA; it is copied to a temp file and the copy
            is indexed, so no index files are written next to ref_fasta.
        out_bam: if given, the sorted, indexed alignment is written here and
            kept; otherwise a temp file is used and removed.
        sensitive: if truthy, adds "-k 12 -A 1 -B 1 -O 1 -E 1" to the bwa
            options.
        excludeDuplicates: if truthy, run Picard MarkDuplicates on the
            filtered alignments before sorting.
        JVMmemory / picardOptions: passed to MarkDuplicates (picardOptions
            of None means no extra options).
        min_score_to_output: used for bwa's -T and for the samtools view -q
            filter; any falsy value (including 0) is replaced by 30.
    '''
    if out_bam is None:
        bam_aligned = util.file.mkstempfname('.aligned.bam')
    else:
        bam_aligned = out_bam

    ref_indexed = util.file.mkstempfname('.reference.fasta')
    shutil.copyfile(ref_fasta, ref_indexed)

    bwa = tools.bwa.Bwa()
    samtools = tools.samtools.SamtoolsTool()

    bwa.index(ref_indexed)

    bwa_opts = []
    if sensitive:
        # '+=' rather than '+': the original built a new list and threw it
        # away, so the sensitive options never reached bwa.
        bwa_opts += "-k 12 -A 1 -B 1 -O 1 -E 1".split()

    map_threshold = min_score_to_output or 30

    # Same fix here: actually append the -T threshold to the options.
    bwa_opts += ["-T", str(map_threshold)]

    aln_bam = util.file.mkstempfname('.bam')

    # NOTE(review): keyword is 'opts' here; confirm it matches the signature
    # of Bwa.mem in this version of the tools module.
    bwa.mem(in_bam, ref_indexed, aln_bam, opts=bwa_opts)

    # @haydenm says:
    # For some reason (particularly when the --sensitive option is on), bwa
    # doesn't listen to its '-T' flag and outputs alignments with score less
    # than the '-T 30' threshold. So filter these:
    # NOTE(review): samtools view -q thresholds on MAPQ, not on alignment
    # score -- confirm this is the intended filter.
    aln_bam_filtered = util.file.mkstempfname('.filtered.bam')
    samtools.view(["-b", "-h", "-q", str(map_threshold)], aln_bam, aln_bam_filtered)
    os.unlink(aln_bam)

    if excludeDuplicates:
        # Only allocate the temp output when it is used, so no orphaned temp
        # name is created in the non-dedup path.
        aln_bam_dupe_processed = util.file.mkstempfname('.filtered_dupe_processed.bam')
        # picardOptions defaults to None; list(None) would raise TypeError.
        opts = list(picardOptions) if picardOptions else []
        dupe_removal_out_metrics = util.file.mkstempfname('.metrics')
        tools.picard.MarkDuplicatesTool().execute(
            [aln_bam_filtered], aln_bam_dupe_processed,
            dupe_removal_out_metrics, picardOptions=opts,
            JVMmemory=JVMmemory
        )
        # The metrics file is not exposed to the caller; do not leave it behind.
        if os.path.isfile(dupe_removal_out_metrics):
            os.unlink(dupe_removal_out_metrics)
    else:
        aln_bam_dupe_processed = aln_bam_filtered

    samtools.sort(aln_bam_dupe_processed, bam_aligned)
    os.unlink(aln_bam_filtered)

    if excludeDuplicates:
        os.unlink(aln_bam_dupe_processed)

    samtools.index(bam_aligned)

    # -- call plot function --
    plot_coverage(
        bam_aligned, out_plot_file, plot_format, plot_data_style, plot_style, plot_width, plot_height, plot_dpi, plot_title,
        base_q_threshold, mapping_q_threshold, max_coverage_depth, read_length_threshold, excludeDuplicates, out_summary
    )

    # remove the output bam, unless it is needed
    if out_bam is None:
        os.unlink(bam_aligned)

    # remove the files created by bwa index.
    # The empty extension removes the temp copy of the reference itself
    # (ref_indexed); the caller's ref_fasta is never touched.
    for ext in [".amb", ".ann", ".bwt", ".bwa", ".pac", ".sa", ""]:
        file_to_remove = ref_indexed + ext
        if os.path.isfile(file_to_remove):
            os.unlink(file_to_remove)