Example #1
    def test_mvicuna_canned_input(self):
        samtools = tools.samtools.SamtoolsTool()

        input_bam = os.path.join(util.file.get_test_input_path(self), 'input.bam')
        expected_bam = os.path.join(util.file.get_test_input_path(self), 'expected.bam')
        output_bam = util.file.mkstempfname("output.bam")
        read_utils.rmdup_mvicuna_bam(
            input_bam,
            output_bam
        )

        self.assertEqual(samtools.count(output_bam), samtools.count(expected_bam))
Example #3
def align_and_count_hits(inBam, refFasta, outCounts, includeZeros=False,
                         JVMmemory=None, threads=1):
    ''' Take reads, align to reference with Novoalign and return aligned
        read counts for each reference sequence.
    '''

    bam_aligned = mkstempfname('.aligned.bam')
    tools.novoalign.NovoalignTool().execute(
        inBam,
        refFasta,
        bam_aligned,
        options=['-r', 'Random'],
        JVMmemory=JVMmemory)

    samtools = tools.samtools.SamtoolsTool()
    seqs = list(dict(x.split(':', 1) for x in row[1:])['SN']
                for row in samtools.getHeader(bam_aligned)
                if row[0] == '@SQ')

    with util.file.open_or_gzopen(outCounts, 'w') as outf:
        for seq in seqs:
            n = samtools.count(bam_aligned, regions=[seq])
            if n>0 or includeZeros:
                outf.write("{}\t{}\n".format(seq, n))

    os.unlink(bam_aligned)
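The generator that extracts reference sequence names from the BAM header is fairly dense. As a minimal sketch with hypothetical header rows (the real rows come from samtools.getHeader), the same idiom looks like this:

# Hypothetical header rows: each row is a list of tab-separated SAM header fields.
header = [
    ['@HD', 'VN:1.6', 'SO:coordinate'],
    ['@SQ', 'SN:chr1', 'LN:248956422'],
    ['@SQ', 'SN:chr2', 'LN:242193529'],
]
# For each @SQ row, build a {tag: value} dict and pull out the SN (sequence name) tag.
seqs = [dict(x.split(':', 1) for x in row[1:])['SN']
        for row in header if row[0] == '@SQ']
assert seqs == ['chr1', 'chr2']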
Example #4
    def downsample_to_approx_count(
        self, inBam, outBam, read_count, picardOptions=None,
        JVMmemory=None
    ):    # pylint: disable=W0221

        samtools = tools.samtools.SamtoolsTool()
        total_read_count = samtools.count(inBam)

        if total_read_count == 0:
            _log.info("Input BAM has no reads. Copying to output.")
            shutil.copyfile(inBam, outBam)
            return

        probability = Decimal(int(read_count)) / Decimal(total_read_count)
        probability = 1 if probability > 1 else probability

        assert probability >= 0

        if probability < 1:
            # per the Picard docs, HighAccuracy is recommended for read counts <50k
            strategy = "HighAccuracy" if total_read_count < 50000 else "Chained"
            _log.info("Setting downsample accuracy to %s based on read count of %s" % (strategy, total_read_count))
            
            self.execute(inBam, outBam, probability, strategy=strategy, accuracy=0.00001, picardOptions=picardOptions, JVMmemory=JVMmemory)
        else:
            _log.info("Requested downsample count exceeds number of reads. Including all reads in output.")
            shutil.copyfile(inBam, outBam)
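A worked sketch of the probability computation above, using hypothetical counts; Decimal keeps the ratio exact before it is handed on to Picard:

from decimal import Decimal

read_count = 100          # hypothetical requested read count
total_read_count = 400    # hypothetical reads present in the input BAM
probability = Decimal(int(read_count)) / Decimal(total_read_count)
probability = 1 if probability > 1 else probability
assert probability == Decimal('0.25')  # keep each read with p = 0.25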
Example #6
def align_and_count_hits(inBam,
                         refFasta,
                         outCounts,
                         includeZeros=False,
                         JVMmemory=None):
    ''' Take reads, align to reference with Novoalign and return aligned
        read counts for each reference sequence.
    '''

    bam_aligned = mkstempfname('.aligned.bam')
    tools.novoalign.NovoalignTool().execute(inBam,
                                            refFasta,
                                            bam_aligned,
                                            options=['-r', 'Random'],
                                            JVMmemory=JVMmemory)

    samtools = tools.samtools.SamtoolsTool()
    seqs = list(
        dict(x.split(':', 1) for x in row[1:])['SN']
        for row in samtools.getHeader(bam_aligned) if row[0] == '@SQ')

    with util.file.open_or_gzopen(outCounts, 'w') as outf:
        for seq in seqs:
            n = samtools.count(bam_aligned, regions=[seq])
            if n > 0 or includeZeros:
                outf.write("{}\t{}\n".format(seq, n))

    os.unlink(bam_aligned)
Example #7
 def test_revert_bam_empty_input(self):
     empty_bam = os.path.join(util.file.get_test_input_path(), 'empty.bam')
     out_bam = util.file.mkstempfname()
     tools.picard.RevertSamTool().execute(
         empty_bam,
         out_bam,
         picardOptions=['SORT_ORDER=queryname', 'SANITIZE=true'])
     samtools = tools.samtools.SamtoolsTool()
     assert samtools.count(out_bam) == 0
Example #8
 def test_cdhit_empty_input(self):
     samtools = tools.samtools.SamtoolsTool()
     empty_bam = os.path.join(util.file.get_test_input_path(), 'empty.bam')
     output_bam = util.file.mkstempfname("output.bam")
     read_utils.rmdup_cdhit_bam(
         empty_bam,
         output_bam
     )
     self.assertEqual(samtools.count(output_bam), 0)
Example #11
    def downsample_to_approx_count(
        self, inBam, outBam, read_count, picardOptions=None,
        JVMmemory=None
    ):    # pylint: disable=W0221

        samtools = tools.samtools.SamtoolsTool()
        total_read_count = samtools.count(inBam)

        probability = Decimal(int(read_count)) / Decimal(total_read_count)

        self.execute(inBam, outBam, probability, accuracy=0.00001, picardOptions=picardOptions, JVMmemory=JVMmemory)
Example #12
    def test_sam_downsample(self):
        desired_count = 100
        tolerance = 0.1

        in_sam = os.path.join(util.file.get_test_input_path(), 'G5012.3.subset.bam')
        out_bam = util.file.mkstempfname('.bam')

        samtools = tools.samtools.SamtoolsTool()

        samtools.downsample_to_approx_count(in_sam, out_bam, desired_count)

        assert samtools.count(out_bam) in range(
            int(desired_count - (desired_count * tolerance)), int(desired_count + (desired_count * tolerance))+1
        ), "Downsampled bam file does not contain the expected number of reads within tolerance: %s" % tolerance
Example #14
    def test_filterByCigarString(self):
        # The test input contains three reads to remove; one each: 
        #   leading indel, trailing indel, both leading and trailing
        # It also has a cigar string with an indel between alignment matches
        in_sam = os.path.join(util.file.get_test_input_path(self), 'indel_cigar.sam')
        out_bam = util.file.mkstempfname('.bam')

        samtools = tools.samtools.SamtoolsTool()

        # We'll use the default regex, which matches leading or trailing indels.
        # It is reproduced here in case the default changes:
        # '^((?:[0-9]+[ID]){1}(?:[0-9]+[MNIDSHPX=])+)|((?:[0-9]+[MNIDSHPX=])+(?:[0-9]+[ID]){1})$'
        samtools.filterByCigarString(in_sam, out_bam)

        assert samtools.count(out_bam) == 39, "Output read count does not match the expected count."
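To make the filtering behavior concrete, here is a minimal sketch applying the default regex quoted in the comment above to a few hypothetical CIGAR strings:

import re

# Default pattern from the comment above: leading or trailing indel operations.
cigar_re = r'^((?:[0-9]+[ID]){1}(?:[0-9]+[MNIDSHPX=])+)|((?:[0-9]+[MNIDSHPX=])+(?:[0-9]+[ID]){1})$'

assert re.search(cigar_re, '5I45M')         # leading insertion: read is removed
assert re.search(cigar_re, '45M5D')         # trailing deletion: read is removed
assert not re.search(cigar_re, '20M1I20M')  # indel between matches only: read is kept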
Example #16
def split_bam(inBam, outBams):
    '''Split BAM file equally into several output BAM files. '''
    samtools = tools.samtools.SamtoolsTool()
    picard = tools.picard.PicardTools()

    # get totalReadCount and maxReads
    # maxReads = totalReadCount / num files, but round up to the nearest
    # even number in order to keep read pairs together (assuming the input
    # is sorted in query order and has no unmated reads, which can be
    # accomplished by Picard RevertSam with SANITIZE=true)
    totalReadCount = samtools.count(inBam)
    maxReads = int(math.ceil(float(totalReadCount) / len(outBams) / 2) * 2)
    log.info("splitting %d reads into %d files of %d reads each", totalReadCount, len(outBams), maxReads)

    # load BAM header into memory
    header = samtools.getHeader(inBam)
    if 'SO:queryname' not in header[0]:
        raise Exception('Input BAM file must be sorted in queryname order')

    # dump to bigsam
    bigsam = mkstempfname('.sam')
    samtools.view([], inBam, bigsam)

    # split bigsam into little ones
    with util.file.open_or_gzopen(bigsam, 'rt') as inf:
        for outBam in outBams:
            log.info("preparing file " + outBam)
            tmp_sam_reads = mkstempfname('.sam')
            with open(tmp_sam_reads, 'wt') as outf:
                for row in header:
                    outf.write('\t'.join(row) + '\n')
                for _ in range(maxReads):
                    line = inf.readline()
                    if not line:
                        break
                    outf.write(line)
                if outBam == outBams[-1]:
                    for line in inf:
                        outf.write(line)
            picard.execute(
                "SamFormatConverter", [
                    'INPUT=' + tmp_sam_reads, 'OUTPUT=' + outBam, 'VERBOSITY=WARNING'
                ],
                JVMmemory='512m'
            )
            os.unlink(tmp_sam_reads)
    os.unlink(bigsam)
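The maxReads rounding is the subtle part of split_bam; a worked sketch with hypothetical counts:

import math

totalReadCount = 1001  # hypothetical total reads in the input BAM
n_files = 3            # hypothetical number of output BAMs
# Round the per-file share up to the nearest even number so that read pairs
# (adjacent in a queryname-sorted BAM) are never split across files.
maxReads = int(math.ceil(float(totalReadCount) / n_files / 2) * 2)
assert maxReads == 334  # 1001 / 3 = 333.67 -> 167 pairs -> 334 reads per file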
Example #17
def split_bam(inBam, outBams):
    '''Split BAM file equally into several output BAM files. '''
    samtools = tools.samtools.SamtoolsTool()
    picard = tools.picard.PicardTools()

    # get totalReadCount and maxReads
    # maxReads = totalReadCount / num files, but round up to the nearest
    # even number in order to keep read pairs together (assuming the input
    # is sorted in query order and has no unmated reads, which can be
    # accomplished by Picard RevertSam with SANITIZE=true)
    totalReadCount = samtools.count(inBam)
    maxReads = int(math.ceil(float(totalReadCount) / len(outBams) / 2) * 2)
    log.info("splitting %d reads into %d files of %d reads each",
             totalReadCount, len(outBams), maxReads)

    # load BAM header into memory
    header = samtools.getHeader(inBam)
    if 'SO:queryname' not in header[0]:
        raise Exception('Input BAM file must be sorted in queryname order')

    # dump to bigsam
    bigsam = mkstempfname('.sam')
    samtools.view([], inBam, bigsam)

    # split bigsam into little ones
    with util.file.open_or_gzopen(bigsam, 'rt') as inf:
        for outBam in outBams:
            log.info("preparing file " + outBam)
            tmp_sam_reads = mkstempfname('.sam')
            with open(tmp_sam_reads, 'wt') as outf:
                for row in header:
                    outf.write('\t'.join(row) + '\n')
                for _ in range(maxReads):
                    line = inf.readline()
                    if not line:
                        break
                    outf.write(line)
                if outBam == outBams[-1]:
                    for line in inf:
                        outf.write(line)
            picard.execute("SamFormatConverter", [
                'INPUT=' + tmp_sam_reads, 'OUTPUT=' + outBam,
                'VERBOSITY=WARNING'
            ],
                           JVMmemory='512m')
            os.unlink(tmp_sam_reads)
    os.unlink(bigsam)
Example #18
def plot_coverage(in_bam,
                  out_plot_file,
                  plot_format,
                  plot_data_style,
                  plot_style,
                  plot_width,
                  plot_height,
                  plot_dpi,
                  plot_title,
                  plot_x_limits,
                  plot_y_limits,
                  base_q_threshold,
                  mapping_q_threshold,
                  max_coverage_depth,
                  read_length_threshold,
                  plot_only_non_duplicates=False,
                  bin_large_plots=False,
                  binning_summary_statistic="max",
                  out_summary=None):
    ''' 
        Generate a coverage plot from an aligned bam file
    '''
    samtools = tools.samtools.SamtoolsTool()

    # check if in_bam is aligned, if not raise an error
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        raise Exception(
            """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if the plot input bam file contains reads and you don't mind a simple bwa alignment. \n File: %s"""
            % in_bam)

    if out_summary is None:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')
    else:
        coverage_tsv_file = out_summary

    bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
    if plot_only_non_duplicates:
        # TODO: this is probably not necessary since "samtools depth" does not count marked duplicates
        # write a new bam file; exclude reads with the 1024 flag set (PCR or optical duplicates)
        samtools.view(["-F", "1024", '-@', '3'], in_bam, bam_dupe_processed)
    else:
        bam_dupe_processed = in_bam

    # only sort if not sorted
    bam_sorted = util.file.mkstempfname('.sorted.bam')
    should_remove_sorted = True
    if not util.file.bam_is_sorted(bam_dupe_processed):
        samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])
        if plot_only_non_duplicates:
            os.unlink(bam_dupe_processed)
    else:
        bam_sorted = bam_dupe_processed
        if not plot_only_non_duplicates:
            # in this case we are passing through the original in_bam directly
            should_remove_sorted = False

    # call samtools index
    samtools.index(bam_sorted)

    # call samtools depth
    opts = []
    opts += ['-aa']  # report coverage at "absolutely all" positions
    if base_q_threshold:
        if not plot_only_non_duplicates:
            # Note: "bedtools genomecov" will count depth including duplicates, but does
            # not expose options for filtering by quality. When duplicates
            # are excluded, "samtools depth" is used which does support quality filtering
            # We use either samtools or bedtools, because the former ignores marked duplicates
            # from its depth count while bedtools includes them.
            log.warning("'-q' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-q", str(base_q_threshold)]
    if mapping_q_threshold:
        if not plot_only_non_duplicates:
            log.warning("'-Q' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-Q", str(mapping_q_threshold)]
    if max_coverage_depth:
        if not plot_only_non_duplicates:
            log.warning("'-m' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-m", str(max_coverage_depth)]
    if read_length_threshold:
        if not plot_only_non_duplicates:
            log.warning("'-l' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-l", str(read_length_threshold)]

    # add option here for bedtools to report coverage w/ duplicates
    # (and then samtools for no-dups)
    #
    # Ex.
    #   samtools depth -aa mapped-to-ref.with-dups.tmp.bam
    #   bedtools genomecov -ibam mapped-to-ref.with-dups.tmp.bam -d
    if not plot_only_non_duplicates:
        bt = BedTool(bam_sorted)
        # "d=True" is the equivalent of passing "-d" to the bedtools CLI
        bt.genome_coverage(d=True).saveas(coverage_tsv_file)
    else:
        samtools.depth(bam_sorted, coverage_tsv_file, opts)

    # only remove the sorted bam if it is not the original input bam,
    # which we use directly in some cases
    if should_remove_sorted:
        os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    segment_depths = OrderedDict()
    domain_max = 0
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            segment_depths.setdefault(row[0], []).append(float(row[2]))
            domain_max += 1

    with matplotlib.pyplot.style.context(plot_style):
        fig = matplotlib.pyplot.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(
            float(plot_width) / float(DPI),
            float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        ax = matplotlib.pyplot.subplot()  # Defines ax variable by creating an empty plot

        # Set the tick labels font
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        # Binning
        bin_size = 1
        if bin_large_plots:
            # Bin locations and take summary value (maximum or minimum) in each bin
            binning_fn = {
                "min": min,
                "max": max,
                "mean": mean,
                "median": median
            }
            binning_action = binning_fn.get(binning_summary_statistic, max)

            inner_plot_width_inches = ax.get_window_extent().transformed(
                fig.dpi_scale_trans.inverted()).width
            inner_plot_width_px = inner_plot_width_inches * fig.dpi  # width of actual plot (sans whitespace and y axis text)
            bins_per_pixel = 1  # increase to make smaller (but less visible) bins
            bin_size = 1 + int(domain_max /
                               (inner_plot_width_px * bins_per_pixel))

            binned_segment_depths = OrderedDict()
            for segment_num, (segment_name, position_depths) in enumerate(
                    segment_depths.items()):
                summary_depths_in_bins = [
                    binning_action(position_depths[i:i + bin_size])
                    for i in range(0, len(position_depths), bin_size)
                ]
                binned_segment_depths[segment_name] = summary_depths_in_bins
            segment_depths = binned_segment_depths

        # Plotting
        domain_max = 0
        for segment_num, (segment_name, position_depths) in enumerate(
                segment_depths.items()):
            prior_domain_max = domain_max
            domain_max += len(position_depths)

            colors = list(
                matplotlib.pyplot.rcParams['axes.prop_cycle'].by_key()
                ['color'])  # get the colors for this style
            segment_color = colors[
                segment_num %
                len(colors)]  # pick a color, offset by the segment index

            x_values = range(prior_domain_max, domain_max)
            x_values = [x * bin_size for x in x_values]

            if plot_data_style == "filled":
                matplotlib.pyplot.fill_between(x_values,
                                               position_depths,
                                               [0] * len(position_depths),
                                               linewidth=0,
                                               antialiased=True,
                                               color=segment_color)
            elif plot_data_style == "line":
                matplotlib.pyplot.plot(x_values,
                                       position_depths,
                                       antialiased=True,
                                       color=segment_color)
            elif plot_data_style == "dots":
                matplotlib.pyplot.plot(x_values,
                                       position_depths,
                                       'ro',
                                       antialiased=True,
                                       color=segment_color)

        matplotlib.pyplot.title(plot_title, fontsize=font_size * 1.2)
        matplotlib.pyplot.xlabel("bp", fontsize=font_size * 1.1)

        ylabel = "read depth"
        if bin_size > 1:
            ylabel = "read depth ({summary} in {size}-bp bin)".format(
                size=bin_size, summary=binning_summary_statistic)
        matplotlib.pyplot.ylabel(ylabel, fontsize=font_size * 1.1)

        if plot_x_limits is not None:
            x_min, x_max = plot_x_limits
            matplotlib.pyplot.xlim(x_min, x_max)
        if plot_y_limits is not None:
            y_min, y_max = plot_y_limits
            matplotlib.pyplot.ylim(y_min, y_max)

        # to squash a backend renderer error on OSX related to tight layout
        if matplotlib.pyplot.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        matplotlib.pyplot.savefig(out_plot_file, format=plot_format,
                                  dpi=DPI)  #, bbox_inches='tight')
        log.info("Coverage plot saved to: " + out_plot_file)

    if not out_summary:
        os.unlink(coverage_tsv_file)
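The binning step reduces each fixed-size window of per-position depths to a single summary value; a minimal sketch with hypothetical depths:

position_depths = [1, 4, 2, 9, 3, 5, 7]  # hypothetical per-position read depths
bin_size = 3
binning_action = max  # the "max" summary statistic
summary_depths_in_bins = [
    binning_action(position_depths[i:i + bin_size])
    for i in range(0, len(position_depths), bin_size)
]
assert summary_depths_in_bins == [4, 9, 7]  # max of [1,4,2], [9,3,5], [7]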
Example #19
def get_assembly_stats(sample,
                       cov_thresholds=(1, 5, 20, 100),
                       assembly_dir='data/02_assembly',
                       assembly_tmp='tmp/02_assembly',
                       align_dir='data/02_align_to_self',
                       reads_dir='data/01_per_sample',
                       raw_reads_dir='data/00_raw'):
    ''' Fetch assembly-level statistics for a given sample '''
    out = {'sample': sample}
    samtools = tools.samtools.SamtoolsTool()
    header = [
        'sample',
        'reads_raw',
        'reads_cleaned',
        'reads_taxfilt',
        'assembled_trinity',
        'trinity_in_reads',
        'n_contigs',
        'contig_len',
        'unambig_bases',
        'pct_unambig',
        'aln2self_reads_tot',
        'aln2self_reads_aln',
        'aln2self_reads_rmdup',
        'aln2self_pct_nondup',
        'aln2self_cov_median',
        'aln2self_cov_mean',
        'aln2self_cov_mean_non0',
    ] + ['aln2self_cov_%dX' % t for t in cov_thresholds]

    # per-sample unaligned read stats
    for adj in ('cleaned', 'taxfilt'):
        reads_bam = os.path.join(reads_dir, '.'.join((sample, adj, 'bam')))
        if os.path.isfile(reads_bam):
            out['reads_' + adj] = samtools.count(reads_bam)
    if os.path.isdir(raw_reads_dir):
        out['reads_raw'] = sum(
            samtools.count(bam)
            # correct issue where sample names containing other sample names as substrings leads
            # to extra files being included in the count
            #
            # add a dot before the wildcard, and assume the sample name is found before the dot.
            # this works for now since dots are the filename field separators
            # and leading/trailing dots are stripped from sample names in util.file.string_to_file_name()
            # TODO: replace this with better filtering?
            for bam in glob.glob(os.path.join(raw_reads_dir, sample +
                                              ".*.bam")))
        sample_raw_fname = os.path.join(raw_reads_dir, sample + ".bam")
        if os.path.isfile(sample_raw_fname):
            # if "00_raw/sample.bam" exists, these were not demuxed by snakemake
            if out['reads_raw']:
                # if sample.bam AND sample.library.flowcell.lane.bam exist, we have a problem!
                out['reads_raw'] = 'ambiguous filenames in raw reads directory!'
            else:
                # just count the sample.bam reads
                out['reads_raw'] = samtools.count(sample_raw_fname)

    # pre-assembly stats
    out['assembled_trinity'] = os.path.isfile(
        os.path.join(assembly_tmp,
                     sample + '.assembly1-trinity.fasta')) and 1 or 0
    sub_bam = os.path.join(assembly_tmp, sample + '.subsamp.bam')
    if os.path.isfile(sub_bam):
        out['trinity_in_reads'] = samtools.count(sub_bam)

    # assembly stats
    assembly_fname = os.path.join(assembly_dir, sample + '.fasta')
    if not os.path.isfile(assembly_fname):
        assembly_fname = os.path.join(assembly_tmp,
                                      sample + '.assembly2-scaffolded.fasta')
        if not os.path.isfile(assembly_fname):
            out['n_contigs'] = 0
    if os.path.isfile(assembly_fname):
        with open(assembly_fname, 'rt') as inf:
            counts = [(len(s), util.misc.unambig_count(s.seq))
                      for s in Bio.SeqIO.parse(inf, 'fasta') if len(s) > 0]
        out['n_contigs'] = len(counts)
        out['contig_len'] = ','.join(str(x) for x, y in counts)
        out['unambig_bases'] = ','.join(str(y) for x, y in counts)
        out['pct_unambig'] = ','.join(str(float(y) / x) for x, y in counts)

    # read counts from align-to-self
    bam_fname = os.path.join(align_dir, sample + '.bam')
    if os.path.isfile(bam_fname):
        out['aln2self_reads_tot'] = samtools.count(bam_fname)
        out['aln2self_reads_aln'] = samtools.count(bam_fname, opts=['-F', '4'])
        out['aln2self_reads_rmdup'] = samtools.count(bam_fname,
                                                     opts=['-F', '1028'])
        if out['aln2self_reads_aln']:
            out['aln2self_pct_nondup'] = float(
                out['aln2self_reads_rmdup']) / out['aln2self_reads_aln']

    # genome coverage stats
    bam_fname = os.path.join(align_dir, sample + '.mapped.bam')
    if os.path.isfile(bam_fname):
        with pysam.AlignmentFile(bam_fname, 'rb') as bam:
            coverages = list([pcol.nsegments for pcol in bam.pileup()])
        if coverages:
            out['aln2self_cov_median'] = median(coverages)
            out['aln2self_cov_mean'] = "%0.3f" % mean(coverages)
            out['aln2self_cov_mean_non0'] = "%0.3f" % mean(
                [n for n in coverages if n > 0])
            for thresh in cov_thresholds:
                out['aln2self_cov_%dX' % thresh] = sum(1 for n in coverages
                                                       if n >= thresh)

    return (header, out)
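The comment about substring sample names deserves a concrete illustration. A minimal sketch with hypothetical file names, using fnmatch (which applies the same wildcard rules as glob):

import fnmatch

files = ['G1.lib1.flowcell1.lane1.bam', 'G12.lib1.flowcell1.lane1.bam']
# Without the dot, sample 'G1' also picks up files belonging to sample 'G12':
assert fnmatch.filter(files, 'G1*.bam') == files
# With the dot after the sample name, only sample G1's own files match:
assert fnmatch.filter(files, 'G1.*.bam') == ['G1.lib1.flowcell1.lane1.bam']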
Example #20
    def align_mem_one_rg(self, inBam, refDb, outBam, rgid=None, options=None, min_qual=30, threads=None, JVMmemory=None):
        """
            Performs an alignment of one read group in a bam file to a reference fasta file

            TODO: With the addition of a third aligner to viral-ngs, the functionality
            common to this method and to the comparable method in the Novoalign wrapper should
            be broken out as an "aligner" superclass, capable of aligning bam or fastq files with an arbitrary
            aligner, while preserving read groups. 
        """
        options = options or []

        samtools = tools.samtools.SamtoolsTool()

        # Require exactly one RG
        rgs = samtools.getReadGroups(inBam)
        if len(rgs) == 0:
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))
        elif len(rgs) == 1:
            if not rgid:
                rgid = list(rgs.keys())[0]
        elif not rgid:
            raise InvalidBamHeaderError("{} has {} read groups, but we require exactly one".format(inBam, len(rgs)))
        if rgid not in rgs:
            raise InvalidBamHeaderError("{} has read groups, but not {}".format(inBam, rgid))

        headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))
        # Strip inBam to just one RG (if necessary)
        removeInput = False
        if len(rgs) == 1:
            one_rg_inBam = inBam
            tools.samtools.SamtoolsTool().dumpHeader(one_rg_inBam, headerFile)
        else:
            # strip inBam to one read group
            tmp_bam = util.file.mkstempfname('.onebam.bam')
            samtools.view(['-b', '-r', rgid], inBam, tmp_bam)
            # special exit if this file is empty
            if samtools.count(tmp_bam) == 0:
                return
            # simplify BAM header otherwise Novoalign gets confused
            one_rg_inBam = util.file.mkstempfname('.{}.in.bam'.format(rgid))
            removeInput = True
            
            with open(headerFile, 'wt') as outf:
                for row in samtools.getHeader(inBam):
                    if len(row) > 0 and row[0] == '@RG':
                        if rgid != list(x[3:] for x in row if x.startswith('ID:'))[0]:
                            # skip all read groups that are not rgid
                            continue
                    outf.write('\t'.join(row) + '\n')
            samtools.reheader(tmp_bam, headerFile, one_rg_inBam)
            os.unlink(tmp_bam)

        # perform actual alignment

        # get the read group line to give to BWA
        readgroup_line = ""
        with open(headerFile) as inf:
            for line in inf:
                if line.startswith("@RG"):
                    readgroup_line = line

        assert len(readgroup_line) > 0
        
        aln_bam_prefilter = util.file.mkstempfname('.prefiltered.bam')
        # rather than reheader the alignment bam file later so it has the readgroup information
        # from the original bam file, we'll pass the RG line to bwa to write out
        self.mem(one_rg_inBam, refDb, aln_bam_prefilter, options=options+['-R', readgroup_line.rstrip("\n").rstrip("\r")], min_qual=min_qual, threads=threads)

        # if there was more than one RG in the input, we had to create a temporary file with the one RG specified
        # and we can safely delete this file
        # if there was only one RG in the input, we used it directly and should not delete it
        if removeInput:
            os.unlink(one_rg_inBam)

        # @haydenm says: 
        # For some reason (particularly when the --sensitive option is on), bwa
        # doesn't listen to its '-T' flag and outputs alignments with score less
        # than the '-T 30' threshold. So filter these:
        tmp_bam_aligned = util.file.mkstempfname('.aligned.bam')
        if min_qual > 0:
            tools.samtools.SamtoolsTool().view(["-b", "-h", "-q", str(min_qual)], aln_bam_prefilter, tmp_bam_aligned)
            os.unlink(aln_bam_prefilter)
        else:
            shutil.move(aln_bam_prefilter, tmp_bam_aligned)

        # if the aligned bam file contains no reads after filtering
        # just create an empty file
        if tools.samtools.SamtoolsTool().count(tmp_bam_aligned) == 0:
            util.file.touch(outBam)
        else:
            # samtools reheader seems to segfault on some alignments created by bwa
            # so rather than reheader, BWA will write out the RG given to it via '-R'
            # reheadered_bam = util.file.mkstempfname('.reheadered.bam')
            # tools.samtools.SamtoolsTool().reheader(tmp_bam_aligned, headerFile, reheadered_bam)
            # os.unlink(tmp_bam_aligned)
            # os.unlink(headerFile)
            # os.system("samtools view -h {} > /Users/tomkinsc/Desktop/test_reheader.bam".format(reheadered_bam))

            # sort
            sorter = tools.picard.SortSamTool()
            sorter.execute(
                tmp_bam_aligned,
                outBam,
                sort_order='coordinate',
                picardOptions=['CREATE_INDEX=true', 'VALIDATION_STRINGENCY=SILENT'],
                JVMmemory=JVMmemory
            )
Example #21
    def align_one_rg_bam(self, inBam, refFasta, outBam, rgid=None, rgs=None, options=None, min_qual=0, JVMmemory=None):
        ''' Execute Novoalign on BAM inputs and outputs.
            Requires that only one RG exists (will error otherwise).
            Use Picard to sort and index the output BAM.
            If min_qual>0, use Samtools to filter on mapping quality.
        '''
        options = options or ["-r", "Random"]

        samtools = tools.samtools.SamtoolsTool()

        # Require exactly one RG
        rgs = rgs if rgs is not None else samtools.getReadGroups(inBam)
        if len(rgs) == 0:
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))
        elif len(rgs) == 1:
            if not rgid:
                rgid = list(rgs.keys())[0]
        elif not rgid:
            raise InvalidBamHeaderError("{} has {} read groups, but we require exactly one".format(inBam, len(rgs)))
        if rgid not in rgs:
            raise InvalidBamHeaderError("{} has read groups, but not {}".format(inBam, rgid))
        #rg = rgs[rgid]

        # Strip inBam to just one RG (if necessary)
        if len(rgs) == 1:
            one_rg_inBam = inBam
        else:
            # strip inBam to one read group
            tmp_bam = util.file.mkstempfname('.onebam.bam')
            samtools.view(['-b', '-r', rgid], inBam, tmp_bam)
            # special exit if this file is empty
            if samtools.count(tmp_bam) == 0:
                return

            # simplify BAM header otherwise Novoalign gets confused
            one_rg_inBam = util.file.mkstempfname('.{}.in.bam'.format(rgid))
            headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))
            with open(headerFile, 'wt') as outf:
                for row in samtools.getHeader(inBam):
                    if len(row) > 0 and row[0] == '@RG':
                        if rgid != list(x[3:] for x in row if x.startswith('ID:'))[0]:
                            # skip all read groups that are not rgid
                            continue
                    outf.write('\t'.join(row) + '\n')
            samtools.reheader(tmp_bam, headerFile, one_rg_inBam)
            os.unlink(tmp_bam)
            os.unlink(headerFile)

        # Novoalign
        tmp_sam = util.file.mkstempfname('.novoalign.sam')
        tmp_sam_err = util.file.mkstempfname('.novoalign.sam.err')
        cmd = [self.install_and_get_path(), '-f', one_rg_inBam] + list(map(str, options))
        cmd = cmd + ['-F', 'BAM', '-d', self._fasta_to_idx_name(refFasta), '-o', 'SAM']
        _log.debug(' '.join(cmd))
        with open(tmp_sam, 'wt') as outf:
            util.misc.run_and_save(cmd, outf=outf)

        # Samtools filter (optional)
        if min_qual:
            tmp_bam2 = util.file.mkstempfname('.filtered.bam')
            cmd = [samtools.install_and_get_path(), 'view', '-b', '-S', '-1', '-q', str(min_qual), tmp_sam]
            _log.debug('%s > %s', ' '.join(cmd), tmp_bam2)
            with open(tmp_bam2, 'wb') as outf:
                util.misc.run_and_save(cmd, outf=outf)
            os.unlink(tmp_sam)
            tmp_sam = tmp_bam2

        # Picard SortSam
        sorter = tools.picard.SortSamTool()
        sorter.execute(
            tmp_sam,
            outBam,
            sort_order='coordinate',
            picardOptions=['CREATE_INDEX=true', 'VALIDATION_STRINGENCY=SILENT'],
            JVMmemory=JVMmemory
        )
Example #22
    def align_mem_one_rg(self,
                         inBam,
                         refDb,
                         outBam,
                         rgid=None,
                         options=None,
                         min_score_to_filter=None,
                         threads=None,
                         JVMmemory=None,
                         invert_filter=False,
                         should_index=True):
        """
            Performs an alignment of one read group in a bam file to a reference fasta file

            TODO: With the addition of a third aligner to viral-ngs, the functionality
            common to this method and to the comparable method in the Novoalign wrapper should
            be broken out as an "aligner" superclass, capable of aligning bam or fastq files with an arbitrary
            aligner, while preserving read groups. 
        """
        options = options or []

        samtools = tools.samtools.SamtoolsTool()

        # Require exactly one RG
        rgs = samtools.getReadGroups(inBam)
        if len(rgs) == 0:
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))
        elif len(rgs) == 1:
            if not rgid:
                rgid = list(rgs.keys())[0]
        elif not rgid:
            raise InvalidBamHeaderError(
                "{} has {} read groups, but we require exactly one".format(
                    inBam, len(rgs)))
        if rgid not in rgs:
            raise InvalidBamHeaderError(
                "{} has read groups, but not {}".format(inBam, rgid))

        headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))
        # Strip inBam to just one RG (if necessary)
        removeInput = False
        if len(rgs) == 1:
            one_rg_inBam = inBam
            tools.samtools.SamtoolsTool().dumpHeader(one_rg_inBam, headerFile)
        else:
            # strip inBam to one read group
            with util.file.tempfname('.onebam.bam') as tmp_bam:
                samtools.view(['-b', '-r', rgid], inBam, tmp_bam)
                # special exit if this file is empty
                if samtools.count(tmp_bam) == 0:
                    log.warning("No reads present for RG %s in file: %s", rgid,
                                inBam)
                    return
                # simplify BAM header otherwise Novoalign gets confused
                one_rg_inBam = util.file.mkstempfname(
                    '.{}.in.bam'.format(rgid))
                removeInput = True

                with open(headerFile, 'wt') as outf:
                    for row in samtools.getHeader(inBam):
                        if len(row) > 0 and row[0] == '@RG':
                            if rgid != list(x[3:] for x in row
                                            if x.startswith('ID:'))[0]:
                                # skip all read groups that are not rgid
                                continue
                        outf.write('\t'.join(row) + '\n')
                samtools.reheader(tmp_bam, headerFile, one_rg_inBam)

        # perform actual alignment

        # get the read group line to give to BWA
        readgroup_line = ""
        with open(headerFile) as inf:
            for line in inf:
                if line.startswith("@RG"):
                    readgroup_line = line

        assert len(readgroup_line) > 0

        #with util.file.tempfname('.aligned.bam') as tmp_bam_aligned:
        # rather than reheader the alignment bam file later so it has the readgroup information
        # from the original bam file, we'll pass the RG line to bwa to write out
        self.mem(
            one_rg_inBam,
            refDb,
            outBam,
            options=options +
            ['-R', readgroup_line.rstrip("\r\n").replace('\t', '\\t')],
            min_score_to_filter=min_score_to_filter,
            threads=threads,
            invert_filter=invert_filter,
            should_index=should_index)

        # if there was more than one RG in the input, we had to create a temporary
        # file containing just that RG, and we can safely delete it here;
        # if there was only one RG in the input, we used it directly and must not delete it
        if removeInput:
            os.unlink(one_rg_inBam)

        return (rgid, outBam)
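A small sketch of the read-group escaping passed to bwa above, with a hypothetical @RG line: bwa mem interprets the two-character sequence \t in its -R argument as a tab, so the code strips the trailing newline and converts real tabs to backslash-t escapes:

readgroup_line = '@RG\tID:rg1\tSM:sample1\tPL:ILLUMINA\n'  # hypothetical @RG header line
rg_arg = readgroup_line.rstrip('\r\n').replace('\t', '\\t')
assert rg_arg == '@RG\\tID:rg1\\tSM:sample1\\tPL:ILLUMINA'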
Example #23
def plot_coverage(in_bam,
                  out_plot_file,
                  plot_format,
                  plot_data_style,
                  plot_style,
                  plot_width,
                  plot_height,
                  plot_dpi,
                  plot_title,
                  base_q_threshold,
                  mapping_q_threshold,
                  max_coverage_depth,
                  read_length_threshold,
                  plot_only_non_duplicates=False,
                  out_summary=None):
    ''' 
        Generate a coverage plot from an aligned bam file
    '''

    # TODO: remove this:
    #coverage_tsv_file = "/Users/tomkinsc/Downloads/plottest/test_multisegment.tsv"

    samtools = tools.samtools.SamtoolsTool()

    # check if in_bam is aligned, if not raise an error
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        raise Exception(
            """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if you don't mind a simple bwa alignment. \n File: %s"""
            % in_bam)

    if out_summary is None:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')
    else:
        coverage_tsv_file = out_summary

    bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
    if plot_only_non_duplicates:
        # TODO: this is probably not necessary since "samtools depth" does not count marked duplicates
        # write a new bam file; exclude reads with the 1024 flag set (PCR or optical duplicates)
        samtools.view(["-F", "1024"], in_bam, bam_dupe_processed)
    else:
        bam_dupe_processed = in_bam

    # call samtools sort
    bam_sorted = util.file.mkstempfname('.sorted.bam')
    samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])

    if plot_only_non_duplicates:
        os.unlink(bam_dupe_processed)

    # call samtools index
    samtools.index(bam_sorted)

    # call samtools depth
    opts = []
    opts += ['-aa']  # report coverage at "absolutely all" positions
    if base_q_threshold:
        if not plot_only_non_duplicates:
            # Note: "bedtools genomecov" will count depth including duplicates, but does
            # not expose options for filtering by quality. When duplicates
            # are excluded, "samtools depth" is used which does support quality filtering
            # We use either samtools or bedtools, because the former ignores marked duplicates
            # from its depth count while bedtools includes them.
            log.warning("'-q' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-q", str(base_q_threshold)]
    if mapping_q_threshold:
        if not plot_only_non_duplicates:
            log.warning("'-Q' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-Q", str(mapping_q_threshold)]
    if max_coverage_depth:
        if not plot_only_non_duplicates:
            log.warning("'-m' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-m", str(max_coverage_depth)]
    if read_length_threshold:
        if not plot_only_non_duplicates:
            log.warning("'-l' ignored since --plotOnlyNonDuplicates is absent")
        opts += ["-l", str(read_length_threshold)]

    # add option here for bedtools to report coverage w/ duplicates
    # (and then samtools for no-dups)
    #
    # Ex.
    #   samtools depth -aa mapped-to-ref.with-dups.tmp.bam
    #   bedtools genomecov -ibam mapped-to-ref.with-dups.tmp.bam -d
    if not plot_only_non_duplicates:
        bt = BedTool(bam_sorted)
        # "d=True" is the equivalent of passing "-d" to the bedtools CLI
        bt.genome_coverage(d=True).saveas(coverage_tsv_file)
    else:
        samtools.depth(bam_sorted, coverage_tsv_file, opts)
    os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    segment_depths = OrderedDict()
    domain_max = 0
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            segment_depths.setdefault(row[0], []).append(int(row[2]))
            domain_max += 1

    domain_max = 0
    with plt.style.context(plot_style):
        fig = plt.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(
            float(plot_width) / float(DPI),
            float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        ax = plt.subplot()  # Defines ax variable by creating an empty plot

        # Set the tick labels font
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        for segment_num, (segment_name, position_depths) in enumerate(
                segment_depths.items()):
            prior_domain_max = domain_max
            domain_max += len(position_depths)

            colors = list(plt.rcParams['axes.prop_cycle'].by_key()
                          ['color'])  # get the colors for this style
            segment_color = colors[
                segment_num %
                len(colors)]  # pick a color, offset by the segment index

            if plot_data_style == "filled":
                plt.fill_between(range(prior_domain_max, domain_max),
                                 position_depths, [0] * len(position_depths),
                                 linewidth=0,
                                 antialiased=True,
                                 color=segment_color)
            elif plot_data_style == "line":
                plt.plot(range(prior_domain_max, domain_max),
                         position_depths,
                         antialiased=True,
                         color=segment_color)
            elif plot_data_style == "dots":
                plt.plot(range(prior_domain_max, domain_max),
                         position_depths,
                         'ro',
                         antialiased=True,
                         color=segment_color)

        plt.title(plot_title, fontsize=font_size * 1.2)
        plt.xlabel("bp", fontsize=font_size * 1.1)
        plt.ylabel("read depth", fontsize=font_size * 1.1)

        # to squash a backend renderer error on OSX related to tight layout
        if plt.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        plt.savefig(out_plot_file, format=plot_format,
                    dpi=DPI)  #, bbox_inches='tight')
        log.info("Coverage plot saved to: " + out_plot_file)

    if not out_summary:
        os.unlink(coverage_tsv_file)
Example #24
    def align_mem_one_rg(self, inBam, refDb, outBam, rgid=None, options=None,
                         min_score_to_filter=None, threads=None, JVMmemory=None, invert_filter=False, should_index=True):
        """
            Performs an alignment of one read group in a bam file to a reference fasta file

            TODO: With the addition of a third aligner to viral-ngs, the functionality
            common to this method and to the comparable method in the Novoalign wrapper should
            be broken out as an "aligner" superclass, capable of aligning bam or fastq files with an arbitrary
            aligner, while preserving read groups. 
        """
        options = options or []

        samtools = tools.samtools.SamtoolsTool()

        # Require exactly one RG
        rgs = samtools.getReadGroups(inBam)
        if len(rgs) == 0:
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))
        elif len(rgs) == 1:
            if not rgid:
                rgid = list(rgs.keys())[0]
        elif not rgid:
            raise InvalidBamHeaderError("{} has {} read groups, but we require exactly one".format(inBam, len(rgs)))
        if rgid not in rgs:
            raise InvalidBamHeaderError("{} has read groups, but not {}".format(inBam, rgid))

        headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))
        # Strip inBam to just one RG (if necessary)
        removeInput = False
        if len(rgs) == 1:
            one_rg_inBam = inBam
            tools.samtools.SamtoolsTool().dumpHeader(one_rg_inBam, headerFile)
        else:
            # strip inBam to one read group
            with util.file.tempfname('.onebam.bam') as tmp_bam:
                samtools.view(['-b', '-r', rgid], inBam, tmp_bam)
                # special exit if this file is empty
                if samtools.count(tmp_bam) == 0:
                    log.warning("No reads present for RG %s in file: %s", rgid, inBam)
                    return
                # simplify BAM header otherwise Novoalign gets confused
                one_rg_inBam = util.file.mkstempfname('.{}.in.bam'.format(rgid))
                removeInput = True
                
                with open(headerFile, 'wt') as outf:
                    for row in samtools.getHeader(inBam):
                        if len(row) > 0 and row[0] == '@RG':
                            if rgid != list(x[3:] for x in row if x.startswith('ID:'))[0]:
                                # skip all read groups that are not rgid
                                continue
                        outf.write('\t'.join(row) + '\n')
                samtools.reheader(tmp_bam, headerFile, one_rg_inBam)

        # perform actual alignment

        # get the read group line to give to BWA
        readgroup_line = ""
        with open(headerFile) as inf:
            for line in inf:
                if line.startswith("@RG"):
                    readgroup_line = line

        assert len(readgroup_line) > 0

        #with util.file.tempfname('.aligned.bam') as tmp_bam_aligned:
        # rather than reheader the alignment bam file later so it has the readgroup information
        # from the original bam file, we'll pass the RG line to bwa to write out
        self.mem(one_rg_inBam, refDb, outBam, options=options+['-R',
                 readgroup_line.rstrip("\r\n").replace('\t','\\t')],
                 min_score_to_filter=min_score_to_filter, threads=threads, invert_filter=invert_filter, should_index=should_index)

        # if there was more than one RG in the input, we had to create a temporary
        # file containing just that RG, and we can safely delete it here;
        # if there was only one RG in the input, we used it directly and must not delete it
        if removeInput:
            os.unlink(one_rg_inBam)

        return (rgid, outBam)
Example #25
def get_assembly_stats(sample,
        cov_thresholds=(1,5,20,100),
        assembly_dir='data/02_assembly', assembly_tmp='tmp/02_assembly',
        align_dir='data/02_align_to_self', reads_dir='data/01_per_sample',
        raw_reads_dir='data/00_raw'):
    ''' Fetch assembly-level statistics for a given sample '''
    out = {'sample':sample}
    samtools = tools.samtools.SamtoolsTool()
    header = ['sample', 'reads_raw', 'reads_cleaned', 'reads_taxfilt',
        'assembled_trinity', 'trinity_in_reads',
        'n_contigs', 'contig_len', 'unambig_bases', 'pct_unambig',
        'aln2self_reads_tot', 'aln2self_reads_aln', 'aln2self_reads_rmdup', 'aln2self_pct_nondup',
        'aln2self_cov_median', 'aln2self_cov_mean', 'aln2self_cov_mean_non0',
        ] + ['aln2self_cov_%dX'%t for t in cov_thresholds]
    
    # per-sample unaligned read stats
    for adj in ('cleaned', 'taxfilt'):
        reads_bam = os.path.join(reads_dir, '.'.join((sample, adj, 'bam')))
        if os.path.isfile(reads_bam):
            out['reads_'+adj] = samtools.count(reads_bam)
    out['reads_raw'] = sum(samtools.count(bam)
        for bam in glob.glob(os.path.join(raw_reads_dir, sample+"*.bam")))
    
    # pre-assembly stats
    out['assembled_trinity'] = os.path.isfile(os.path.join(assembly_tmp,
        sample + '.assembly1-trinity.fasta')) and 1 or 0
    sub_bam = os.path.join(assembly_tmp, sample + '.subsamp.bam')
    if os.path.isfile(sub_bam):
        out['trinity_in_reads'] = samtools.count(sub_bam)    
    
    # assembly stats
    assembly_fname = os.path.join(assembly_dir, sample + '.fasta')
    if not os.path.isfile(assembly_fname):
        assembly_fname = os.path.join(assembly_tmp, sample + '.assembly2-vfat.fasta')
        if not os.path.isfile(assembly_fname):
            out['n_contigs'] = 0
            return (header, out)
    with open(assembly_fname, 'rt') as inf:
        counts = [(len(s), assembly.unambig_count(s.seq))
            for s in Bio.SeqIO.parse(inf, 'fasta')
            if len(s)>0]
    out['n_contigs'] = len(counts)
    out['contig_len'] = ','.join(str(x) for x,y in counts)
    out['unambig_bases'] = ','.join(str(y) for x,y in counts)
    out['pct_unambig'] = ','.join(str(float(y)/x) for x,y in counts)
    
    # read counts from align-to-self
    bam_fname = os.path.join(align_dir, sample + '.bam')
    if not os.path.isfile(bam_fname):
        return (header, out)
    out['aln2self_reads_tot'] = samtools.count(bam_fname)
    out['aln2self_reads_aln'] = samtools.count(bam_fname, opts=['-F', '4'])
    out['aln2self_reads_rmdup'] = samtools.count(bam_fname, opts=['-F', '1028'])
    if out['aln2self_reads_aln']:
        out['aln2self_pct_nondup'] = float(out['aln2self_reads_rmdup']) / out['aln2self_reads_aln']
    
    # genome coverage stats
    bam_fname = os.path.join(align_dir, sample + '.mapped.bam')
    with pysam.AlignmentFile(bam_fname, 'rb') as bam:
        coverages = list([pcol.nsegments for pcol in bam.pileup()])
    out['aln2self_cov_median'] = median(coverages)
    out['aln2self_cov_mean'] = "%0.3f"%mean(coverages)
    out['aln2self_cov_mean_non0'] = "%0.3f"%mean([n for n in coverages if n>0])
    for thresh in cov_thresholds:
        out['aln2self_cov_%dX'%thresh] = sum(1 for n in coverages if n>=thresh)
    
    return (header, out)
Example #26
    def align_mem_one_rg(self, inBam, refDb, outBam, rgid=None, options=None,
                         min_score_to_filter=None, threads=None, JVMmemory=None):
        """
            Performs an alignment of one read group in a bam file to a reference fasta file

            TODO: With the addition of a third aligner to viral-ngs, the functionality
            common to this method and to the comparable method in the Novoalign wrapper should
            be broken out as an "aligner" superclass, capable of aligning bam or fastq files with an arbitrary
            aligner, while preserving read groups. 
        """
        options = options or []

        samtools = tools.samtools.SamtoolsTool()

        # Require exactly one RG
        rgs = samtools.getReadGroups(inBam)
        if len(rgs) == 0:
            raise InvalidBamHeaderError("{} lacks read groups".format(inBam))
        elif len(rgs) == 1:
            if not rgid:
                rgid = list(rgs.keys())[0]
        elif not rgid:
            raise InvalidBamHeaderError("{} has {} read groups, but we require exactly one".format(inBam, len(rgs)))
        if rgid not in rgs:
            raise InvalidBamHeaderError("{} has read groups, but not {}".format(inBam, rgid))

        headerFile = util.file.mkstempfname('.{}.header.txt'.format(rgid))
        # Strip inBam to just one RG (if necessary)
        removeInput = False
        if len(rgs) == 1:
            one_rg_inBam = inBam
            samtools.dumpHeader(one_rg_inBam, headerFile)
        else:
            # strip inBam to one read group
            tmp_bam = util.file.mkstempfname('.onebam.bam')
            samtools.view(['-b', '-r', rgid], inBam, tmp_bam)
            # special exit if this file is empty
            if samtools.count(tmp_bam) == 0:
                log.warning("No reads present for RG %s in file: %s", rgid, inBam)
                return
            # simplify the BAM header to this one read group, otherwise downstream tools can get confused
            one_rg_inBam = util.file.mkstempfname('.{}.in.bam'.format(rgid))
            removeInput = True
            
            with open(headerFile, 'wt') as outf:
                for row in samtools.getHeader(inBam):
                    if len(row) > 0 and row[0] == '@RG':
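                        # pull the ID: value out of this @RG header row for comparison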
                        if rgid != list(x[3:] for x in row if x.startswith('ID:'))[0]:
                            # skip all read groups that are not rgid
                            continue
                    outf.write('\t'.join(row) + '\n')
            samtools.reheader(tmp_bam, headerFile, one_rg_inBam)
            os.unlink(tmp_bam)

        # perform actual alignment

        # get the read group line to give to BWA
        readgroup_line = ""
        with open(headerFile) as inf:
            for line in inf:
                if line.startswith("@RG"):
                    readgroup_line = line

        assert len(readgroup_line) > 0

        tmp_bam_aligned = util.file.mkstempfname('.aligned.bam')
        # rather than reheader the alignment bam file later so it has the readgroup information
        # from the original bam file, we'll pass the RG line to bwa to write out
        self.mem(one_rg_inBam, refDb, tmp_bam_aligned, options=options+['-R',
                 readgroup_line.rstrip("\r\n")],
                 min_score_to_filter=min_score_to_filter, threads=threads)

        # if there was more than one RG in the input, we had to create a temporary file with the one RG specified
        # and we can safely delete this file
        # if there was only one RG in the input, we used it directly and should not delete it
        if removeInput:
            os.unlink(one_rg_inBam)

        # if the aligned bam file contains no reads after filtering
        # just create an empty file
        if samtools.count(tmp_bam_aligned) == 0:
            util.file.touch(outBam)
        else:
            # samtools reheader seems to segfault on some alignments created by bwa
            # so rather than reheader, BWA will write out the RG given to it via '-R'

            # sort
            sorter = tools.picard.SortSamTool()
            sorter.execute(
                tmp_bam_aligned,
                outBam,
                sort_order='coordinate',
                picardOptions=['CREATE_INDEX=true', 'VALIDATION_STRINGENCY=SILENT'],
                JVMmemory=JVMmemory
            )
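
align_mem_one_rg aligns a single read group; multi-RG inputs need a driver that calls it once per group and merges the results. A sketch under stated assumptions: the helper name align_mem_all_rgs is hypothetical, and the merge step assumes a tools.picard.MergeSamFilesTool wrapper with this calling convention; treat both as assumptions rather than the library's confirmed API:

    def align_mem_all_rgs(self, inBam, refDb, outBam, threads=None, JVMmemory=None):
        # hypothetical driver: align each read group separately, then merge
        samtools = tools.samtools.SamtoolsTool()
        rg_bams = []
        for rgid in samtools.getReadGroups(inBam):
            tmp_bam = util.file.mkstempfname('.{}.bam'.format(rgid))
            self.align_mem_one_rg(inBam, refDb, tmp_bam, rgid=rgid,
                                  threads=threads, JVMmemory=JVMmemory)
            # align_mem_one_rg may return without output (empty RG) or touch an empty file
            if os.path.isfile(tmp_bam) and os.path.getsize(tmp_bam) > 0:
                rg_bams.append(tmp_bam)
        if not rg_bams:
            util.file.touch(outBam)
        elif len(rg_bams) == 1:
            shutil.copyfile(rg_bams[0], outBam)
        else:
            # assumed Picard wrapper; the per-RG headers written by bwa are preserved
            tools.picard.MergeSamFilesTool().execute(
                rg_bams, outBam, picardOptions=['SORT_ORDER=coordinate'], JVMmemory=JVMmemory)
        for bam in rg_bams:
            os.unlink(bam)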
Example #27
0
def plot_coverage(
    in_bam,
    out_plot_file,
    plot_format,
    plot_data_style,
    plot_style,
    plot_width,
    plot_height,
    plot_dpi,
    plot_title,
    base_q_threshold,
    mapping_q_threshold,
    max_coverage_depth,
    read_length_threshold,
    plot_only_non_duplicates=False,
    out_summary=None
):
    ''' 
        Generate a coverage plot from an aligned bam file
    '''


    samtools = tools.samtools.SamtoolsTool()

    # check if in_bam is aligned, if not raise an error
    num_mapped_reads = samtools.count(in_bam, opts=["-F", "4"])
    if num_mapped_reads == 0:
        raise Exception(
            """The bam file specified appears to have zero mapped reads. 'plot_coverage' requires an aligned bam file. You can try 'align_and_plot_coverage' if you don't mind a simple bwa alignment. \n File: %s"""
            % in_bam
        )

    if out_summary is None:
        coverage_tsv_file = util.file.mkstempfname('.summary.tsv')
    else:
        coverage_tsv_file = out_summary

    if plot_only_non_duplicates:
        # write a new bam file; exclude reads with the 1024 flag set (PCR or optical duplicates)
        bam_dupe_processed = util.file.mkstempfname('.dupe_processed.bam')
        samtools.view(["-F", "1024"], in_bam, bam_dupe_processed)
    else:
        bam_dupe_processed = in_bam

    # call samtools sort
    bam_sorted = util.file.mkstempfname('.sorted.bam')
    samtools.sort(bam_dupe_processed, bam_sorted, args=["-O", "bam"])

    if plot_only_non_duplicates:
        os.unlink(bam_dupe_processed)

    # call samtools index
    samtools.index(bam_sorted)

    # call samtools depth
    opts = []
    opts += ['-aa']    # report coverage at "absolutely all" positions
    if base_q_threshold:
        opts += ["-q", str(base_q_threshold)]
    if mapping_q_threshold:
        opts += ["-Q", str(mapping_q_threshold)]
    if max_coverage_depth:
        opts += ["-m", str(max_coverage_depth)]
    if read_length_threshold:
        opts += ["-l", str(read_length_threshold)]

    samtools.depth(bam_sorted, coverage_tsv_file, opts)
    os.unlink(bam_sorted)

    # ---- create plot based on coverage_tsv_file ----

    segment_depths = OrderedDict()
    with open(coverage_tsv_file, "r") as tabfile:
        for row in csv.reader(tabfile, delimiter='\t'):
            segment_depths.setdefault(row[0], []).append(int(row[2]))

    domain_max = 0
    with plt.style.context(plot_style):
        fig = plt.gcf()
        DPI = plot_dpi or fig.get_dpi()
        fig.set_size_inches(float(plot_width) / float(DPI), float(plot_height) / float(DPI))

        font_size = (2.5 * plot_height) / float(DPI)

        ax = plt.subplot()    # Defines ax variable by creating an empty plot

        # Set the tick labels font
        for label in (ax.get_xticklabels() + ax.get_yticklabels()):
            label.set_fontsize(font_size)

        for segment_num, (segment_name, position_depths) in enumerate(segment_depths.items()):
            prior_domain_max = domain_max
            domain_max += len(position_depths)

            colors = list(plt.rcParams['axes.prop_cycle'].by_key()['color'])    # get the colors for this style
            segment_color = colors[segment_num % len(colors)]    # pick a color, offset by the segment index

            if plot_data_style == "filled":
                plt.fill_between(
                    range(prior_domain_max, domain_max),
                    position_depths, [0] * len(position_depths),
                    linewidth=0,
                    antialiased=True,
                    color=segment_color
                )
            elif plot_data_style == "line":
                plt.plot(range(prior_domain_max, domain_max), position_depths, antialiased=True, color=segment_color)
            elif plot_data_style == "dots":
                plt.plot(
                    range(prior_domain_max, domain_max),
                    position_depths,
                    'ro',
                    antialiased=True,
                    color=segment_color
                )

        plt.title(plot_title, fontsize=font_size * 1.2)
        plt.xlabel("bp", fontsize=font_size * 1.1)
        plt.ylabel("read depth", fontsize=font_size * 1.1)

        # to squash a backend renderer error on OSX related to tight layout
        if plt.get_backend().lower() in ['agg', 'macosx']:
            fig.set_tight_layout(True)
        else:
            fig.tight_layout()

        plt.savefig(out_plot_file, format=plot_format, dpi=DPI)
        log.info("Coverage plot saved to: " + out_plot_file)

    if not out_summary:
        os.unlink(coverage_tsv_file)
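
A minimal sketch of calling plot_coverage directly; the file names and style values below are illustrative (in viral-ngs this function is normally invoked via the command-line parser):

plot_coverage(
    in_bam='sample.mapped.bam',
    out_plot_file='sample.coverage.png',
    plot_format='png',
    plot_data_style='filled',      # one of 'filled', 'line', 'dots'
    plot_style='ggplot',           # any style name known to matplotlib
    plot_width=1024,
    plot_height=768,
    plot_dpi=100,
    plot_title='sample coverage',
    base_q_threshold=None,
    mapping_q_threshold=None,
    max_coverage_depth=1000000,
    read_length_threshold=None,
    plot_only_non_duplicates=True,
    out_summary='sample.coverage.tsv'
)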
Example #28
0
def get_assembly_stats(sample,
                       cov_thresholds=(1, 5, 20, 100),
                       assembly_dir='data/02_assembly', assembly_tmp='tmp/02_assembly',
                       align_dir='data/02_align_to_self', reads_dir='data/01_per_sample',
                       raw_reads_dir='data/00_raw'):
    ''' Fetch assembly-level statistics for a given sample '''
    out = {'sample': sample}
    samtools = tools.samtools.SamtoolsTool()
    header = ['sample',
              'reads_raw',
              'reads_cleaned',
              'reads_taxfilt',
              'assembled_trinity',
              'trinity_in_reads',
              'n_contigs',
              'contig_len',
              'unambig_bases',
              'pct_unambig',
              'aln2self_reads_tot',
              'aln2self_reads_aln',
              'aln2self_reads_rmdup',
              'aln2self_pct_nondup',
              'aln2self_cov_median',
              'aln2self_cov_mean',
              'aln2self_cov_mean_non0',] + ['aln2self_cov_%dX' % t for t in cov_thresholds]

    # per-sample unaligned read stats
    for adj in ('cleaned', 'taxfilt'):
        reads_bam = os.path.join(reads_dir, '.'.join((sample, adj, 'bam')))
        if os.path.isfile(reads_bam):
            out['reads_' + adj] = samtools.count(reads_bam)
    if os.path.isdir(raw_reads_dir):
        out['reads_raw'] = sum(samtools.count(bam)
            # correct an issue where sample names containing other sample names
            # as substrings lead to extra files being included in the count
            #
            # add a dot before the wildcard, and assume the sample name is found before the dot.
            # this works for now since dots are the filename field separators
            # and leading/trailing dots are stripped from sample names in util.file.string_to_file_name()
            # TODO: replace this with better filtering?
            for bam in glob.glob(os.path.join(raw_reads_dir, sample + ".*.bam")))

    # pre-assembly stats
    out['assembled_trinity'] = int(os.path.isfile(
        os.path.join(assembly_tmp, sample + '.assembly1-trinity.fasta')))
    sub_bam = os.path.join(assembly_tmp, sample + '.subsamp.bam')
    if os.path.isfile(sub_bam):
        out['trinity_in_reads'] = samtools.count(sub_bam)

    # assembly stats
    assembly_fname = os.path.join(assembly_dir, sample + '.fasta')
    if not os.path.isfile(assembly_fname):
        assembly_fname = os.path.join(assembly_tmp, sample + '.assembly2-scaffolded.fasta')
        if not os.path.isfile(assembly_fname):
            out['n_contigs'] = 0
    if os.path.isfile(assembly_fname):
        with open(assembly_fname, 'rt') as inf:
            counts = [(len(s), assembly.unambig_count(s.seq)) for s in Bio.SeqIO.parse(inf, 'fasta') if len(s) > 0]
        out['n_contigs'] = len(counts)
        out['contig_len'] = ','.join(str(x) for x, y in counts)
        out['unambig_bases'] = ','.join(str(y) for x, y in counts)
        out['pct_unambig'] = ','.join(str(float(y) / x) for x, y in counts)

    # read counts from align-to-self
    bam_fname = os.path.join(align_dir, sample + '.bam')
    if os.path.isfile(bam_fname):
        out['aln2self_reads_tot'] = samtools.count(bam_fname)
        out['aln2self_reads_aln'] = samtools.count(bam_fname, opts=['-F', '4'])
        out['aln2self_reads_rmdup'] = samtools.count(bam_fname, opts=['-F', '1028'])
        if out['aln2self_reads_aln']:
            out['aln2self_pct_nondup'] = float(out['aln2self_reads_rmdup']) / out['aln2self_reads_aln']

    # genome coverage stats
    bam_fname = os.path.join(align_dir, sample + '.mapped.bam')
    if os.path.isfile(bam_fname):
        with pysam.AlignmentFile(bam_fname, 'rb') as bam:
            coverages = [pcol.nsegments for pcol in bam.pileup()]
        out['aln2self_cov_median'] = median(coverages)
        out['aln2self_cov_mean'] = "%0.3f" % mean(coverages)
        out['aln2self_cov_mean_non0'] = "%0.3f" % mean([n for n in coverages if n > 0])
        for thresh in cov_thresholds:
            out['aln2self_cov_%dX' % thresh] = sum(1 for n in coverages if n >= thresh)

    return (header, out)
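
The dotted glob above guards against substring collisions between sample names. A small self-contained illustration with fnmatch (the file names are invented):

import fnmatch

files = ['S1.raw.bam', 'S10.raw.bam', 'S1.l1.raw.bam']
# the naive pattern also catches S10, whose name merely starts with 'S1'
print(fnmatch.filter(files, 'S1*.bam'))    # ['S1.raw.bam', 'S10.raw.bam', 'S1.l1.raw.bam']
# anchoring on the dot separator keeps only true S1 files
print(fnmatch.filter(files, 'S1.*.bam'))   # ['S1.raw.bam', 'S1.l1.raw.bam']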
Example #29
0
def get_assembly_stats(sample,
                       cov_thresholds=(1, 5, 20, 100),
                       assembly_dir='data/02_assembly',
                       assembly_tmp='tmp/02_assembly',
                       align_dir='data/02_align_to_self',
                       reads_dir='data/01_per_sample',
                       raw_reads_dir='data/00_raw'):
    ''' Fetch assembly-level statistics for a given sample '''
    out = {'sample': sample}
    samtools = tools.samtools.SamtoolsTool()
    header = [
        'sample',
        'reads_raw',
        'reads_cleaned',
        'reads_taxfilt',
        'assembled_trinity',
        'trinity_in_reads',
        'n_contigs',
        'contig_len',
        'unambig_bases',
        'pct_unambig',
        'aln2self_reads_tot',
        'aln2self_reads_aln',
        'aln2self_reads_rmdup',
        'aln2self_pct_nondup',
        'aln2self_cov_median',
        'aln2self_cov_mean',
        'aln2self_cov_mean_non0',
    ] + ['aln2self_cov_%dX' % t for t in cov_thresholds]

    # per-sample unaligned read stats
    for adj in ('cleaned', 'taxfilt'):
        reads_bam = os.path.join(reads_dir, '.'.join((sample, adj, 'bam')))
        if os.path.isfile(reads_bam):
            out['reads_' + adj] = samtools.count(reads_bam)
    out['reads_raw'] = sum(
        samtools.count(bam)
        for bam in glob.glob(os.path.join(raw_reads_dir, sample + "*.bam")))

    # pre-assembly stats
    out['assembled_trinity'] = int(os.path.isfile(
        os.path.join(assembly_tmp,
                     sample + '.assembly1-trinity.fasta')))
    sub_bam = os.path.join(assembly_tmp, sample + '.subsamp.bam')
    if os.path.isfile(sub_bam):
        out['trinity_in_reads'] = samtools.count(sub_bam)

    # assembly stats
    assembly_fname = os.path.join(assembly_dir, sample + '.fasta')
    if not os.path.isfile(assembly_fname):
        assembly_fname = os.path.join(assembly_tmp,
                                      sample + '.assembly2-vfat.fasta')
        if not os.path.isfile(assembly_fname):
            out['n_contigs'] = 0
            return (header, out)
    with open(assembly_fname, 'rt') as inf:
        counts = [(len(s), assembly.unambig_count(s.seq))
                  for s in Bio.SeqIO.parse(inf, 'fasta') if len(s) > 0]
    out['n_contigs'] = len(counts)
    out['contig_len'] = ','.join(str(x) for x, y in counts)
    out['unambig_bases'] = ','.join(str(y) for x, y in counts)
    out['pct_unambig'] = ','.join(str(float(y) / x) for x, y in counts)

    # read counts from align-to-self
    bam_fname = os.path.join(align_dir, sample + '.bam')
    if not os.path.isfile(bam_fname):
        return (header, out)
    out['aln2self_reads_tot'] = samtools.count(bam_fname)
    out['aln2self_reads_aln'] = samtools.count(bam_fname, opts=['-F', '4'])
    out['aln2self_reads_rmdup'] = samtools.count(bam_fname,
                                                 opts=['-F', '1028'])
    if out['aln2self_reads_aln']:
        out['aln2self_pct_nondup'] = float(
            out['aln2self_reads_rmdup']) / out['aln2self_reads_aln']

    # genome coverage stats
    bam_fname = os.path.join(align_dir, sample + '.mapped.bam')
    with pysam.AlignmentFile(bam_fname, 'rb') as bam:
        coverages = [pcol.nsegments for pcol in bam.pileup()]
    out['aln2self_cov_median'] = median(coverages)
    out['aln2self_cov_mean'] = "%0.3f" % mean(coverages)
    out['aln2self_cov_mean_non0'] = "%0.3f" % mean(
        [n for n in coverages if n > 0])
    for thresh in cov_thresholds:
        out['aln2self_cov_%dX' % thresh] = sum(1 for n in coverages
                                               if n >= thresh)

    return (header, out)
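
The coverage block reduces a per-position depth vector to a few scalars. A self-contained recap of that arithmetic on a toy depth list (values invented; the real code takes depths from pysam's pileup):

from statistics import mean, median

coverages = [0, 0, 3, 7, 12, 12, 25, 40]                # toy per-position depths
print(median(coverages))                                # 9.5
print("%0.3f" % mean(coverages))                        # 12.375
print("%0.3f" % mean([n for n in coverages if n > 0]))  # 16.500
for thresh in (1, 5, 20, 100):
    # count of positions covered by at least thresh reads
    print('cov_%dX' % thresh, sum(1 for n in coverages if n >= thresh))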