Example #1
0
    def test_ref_assisted_assembly(self):
        ''' Run refine_assembly on a small read set and sanity-check the output.

            Installs Novoalign, indexes a temporary copy of the
            'ebov-makona.fasta' test input, runs the refine_assembly command
            on 'G5012.3.testreads.bam', and then checks that the refined
            FASTA exists, is larger than 1000 bytes, holds a sequence longer
            than 17000 bases, and that assembly.unambig_count() reports more
            than 95% of that length.
        '''
        novoalign = tools.novoalign.NovoalignTool()
        novoalign.install()

        # prep inputs
        input_dir = util.file.get_test_input_path()
        orig_ref = os.path.join(input_dir, 'ebov-makona.fasta')
        # index a temp copy of the reference rather than the original
        # (presumably so index files are not written into the test-input dir)
        refGenome = util.file.mkstempfname('.ref.fasta')
        shutil.copyfile(orig_ref, refGenome)
        novoalign.index_fasta(refGenome)
        inBam = os.path.join(input_dir, 'G5012.3.testreads.bam')
        outFasta = util.file.mkstempfname('.refined.fasta')

        # run refine_assembly
        args = [refGenome, inBam, outFasta, "--chr_names", 'G5012.3', "--min_coverage", '3', "--novo_params",
                "-r Random -l 30 -g 40 -x 20 -t 502"]
        args = assembly.parser_refine_assembly().parse_args(args)
        args.func_main(args)
        self.assertTrue(os.path.isfile(outFasta))
        # assertGreater reports the actual size on failure, unlike assertTrue(a > b)
        self.assertGreater(os.path.getsize(outFasta), 1000)

        # check assembly quality
        with open(outFasta, 'rt') as inf:
            # SeqIO.read expects exactly one record in the file
            seq = Bio.SeqIO.read(inf, 'fasta')
            self.assertGreater(len(seq), 17000)
            self.assertGreater(assembly.unambig_count(seq.seq), len(seq) * 0.95)
Example #2
0
    def test_ref_assisted_assembly(self):
        ''' End-to-end check of the refine_assembly command on bundled test data.

            A temporary copy of 'ebov-makona.fasta' is indexed with Novoalign
            and passed, together with 'G5012.3.testreads.bam', to the parser
            returned by assembly.parser_refine_assembly(). The test then
            asserts that the output FASTA exists, exceeds 1000 bytes, contains
            a sequence of more than 17000 bases, and that
            assembly.unambig_count() covers more than 95% of it.
        '''
        novoalign = tools.novoalign.NovoalignTool()
        novoalign.install()

        # prep inputs
        test_inputs = util.file.get_test_input_path()
        orig_ref = os.path.join(test_inputs, 'ebov-makona.fasta')
        refGenome = util.file.mkstempfname('.ref.fasta')
        # the index is built on the temp copy, not on the original input file
        shutil.copyfile(orig_ref, refGenome)
        novoalign.index_fasta(refGenome)
        inBam = os.path.join(test_inputs, 'G5012.3.testreads.bam')
        outFasta = util.file.mkstempfname('.refined.fasta')

        # run refine_assembly
        args = [refGenome, inBam, outFasta, "--chr_names", 'G5012.3', "--min_coverage", '3', "--novo_params",
                "-r Random -l 30 -g 40 -x 20 -t 502"]
        args = assembly.parser_refine_assembly().parse_args(args)
        args.func_main(args)
        self.assertTrue(os.path.isfile(outFasta))
        # use assertGreater so a failure message shows the observed file size
        self.assertGreater(os.path.getsize(outFasta), 1000)

        # check assembly quality
        with open(outFasta, 'rt') as inf:
            # a single FASTA record is expected here
            seq = Bio.SeqIO.read(inf, 'fasta')
            self.assertGreater(len(seq), 17000)
            self.assertGreater(assembly.unambig_count(seq.seq), len(seq) * 0.95)
Example #3
0
def get_assembly_stats(sample,
                       cov_thresholds=(1, 5, 20, 100),
                       assembly_dir='data/02_assembly', assembly_tmp='tmp/02_assembly',
                       align_dir='data/02_align_to_self', reads_dir='data/01_per_sample',
                       raw_reads_dir='data/00_raw'):
    ''' Fetch assembly-level statistics for a given sample.

        Each group of statistics is read from a file that may or may not
        exist; a missing input leaves the corresponding keys out of the
        result instead of raising.

        Args:
            sample: sample name, used as the filename prefix in every directory.
            cov_thresholds: coverage depths; for each threshold t the result
                may get an 'aln2self_cov_<t>X' entry counting the pileup
                columns with at least t segments.
            assembly_dir: directory searched for '<sample>.fasta'.
            assembly_tmp: directory searched for the Trinity contigs, the
                subsampled BAM and the fallback scaffolded assembly.
            align_dir: directory searched for '<sample>.bam' and
                '<sample>.mapped.bam'.
            reads_dir: directory searched for '<sample>.cleaned.bam' and
                '<sample>.taxfilt.bam'.
            raw_reads_dir: directory globbed for '<sample>.*.bam'.

        Returns:
            (header, out): header is the ordered list of all column names;
            out is a dict holding 'sample' plus whichever of those columns
            could be computed.
    '''
    out = {'sample': sample}
    samtools = tools.samtools.SamtoolsTool()
    header = ['sample',
              'reads_raw',
              'reads_cleaned',
              'reads_taxfilt',
              'assembled_trinity',
              'trinity_in_reads',
              'n_contigs',
              'contig_len',
              'unambig_bases',
              'pct_unambig',
              'aln2self_reads_tot',
              'aln2self_reads_aln',
              'aln2self_reads_rmdup',
              'aln2self_pct_nondup',
              'aln2self_cov_median',
              'aln2self_cov_mean',
              'aln2self_cov_mean_non0',] + ['aln2self_cov_%dX' % t for t in cov_thresholds]

    # per-sample unaligned read stats
    for adj in ('cleaned', 'taxfilt'):
        reads_bam = os.path.join(reads_dir, '.'.join((sample, adj, 'bam')))
        if os.path.isfile(reads_bam):
            out['reads_' + adj] = samtools.count(reads_bam)
    if os.path.isdir(raw_reads_dir):
        # correct issue where sample names containing other sample names as substrings leads
        # to extra files being included in the count
        #
        # add a dot before the wildcard, and assume the sample name is found before the dot.
        # this works for now since dots are the filename field separators
        # and leading/trailing dots are stripped from sample names in util.file.string_to_file_name()
        # TODO: replace this with better filtering?
        raw_bams = glob.glob(os.path.join(raw_reads_dir, sample + ".*.bam"))
        out['reads_raw'] = sum(samtools.count(bam) for bam in raw_bams)

    # pre-assembly stats
    trinity_fasta = os.path.join(assembly_tmp, sample + '.assembly1-trinity.fasta')
    # 1/0 flag rather than True/False
    out['assembled_trinity'] = int(os.path.isfile(trinity_fasta))
    sub_bam = os.path.join(assembly_tmp, sample + '.subsamp.bam')
    if os.path.isfile(sub_bam):
        out['trinity_in_reads'] = samtools.count(sub_bam)

    # assembly stats: prefer the final assembly, fall back to the scaffolded one
    assembly_fname = os.path.join(assembly_dir, sample + '.fasta')
    if not os.path.isfile(assembly_fname):
        assembly_fname = os.path.join(assembly_tmp, sample + '.assembly2-scaffolded.fasta')
        if not os.path.isfile(assembly_fname):
            out['n_contigs'] = 0
    if os.path.isfile(assembly_fname):
        with open(assembly_fname, 'rt') as inf:
            # (length, unambiguous count) per non-empty record; skipping empty
            # records also keeps the pct_unambig division below safe
            counts = [(len(s), assembly.unambig_count(s.seq)) for s in Bio.SeqIO.parse(inf, 'fasta') if len(s) > 0]
        out['n_contigs'] = len(counts)
        out['contig_len'] = ','.join(str(x) for x, y in counts)
        out['unambig_bases'] = ','.join(str(y) for x, y in counts)
        out['pct_unambig'] = ','.join(str(float(y) / x) for x, y in counts)

    # read counts from align-to-self
    bam_fname = os.path.join(align_dir, sample + '.bam')
    if os.path.isfile(bam_fname):
        out['aln2self_reads_tot'] = samtools.count(bam_fname)
        out['aln2self_reads_aln'] = samtools.count(bam_fname, opts=['-F', '4'])
        out['aln2self_reads_rmdup'] = samtools.count(bam_fname, opts=['-F', '1028'])
        if out['aln2self_reads_aln']:
            out['aln2self_pct_nondup'] = float(out['aln2self_reads_rmdup']) / out['aln2self_reads_aln']

    # genome coverage stats
    bam_fname = os.path.join(align_dir, sample + '.mapped.bam')
    if os.path.isfile(bam_fname):
        with pysam.AlignmentFile(bam_fname, 'rb') as bam:
            coverages = [pcol.nsegments for pcol in bam.pileup()]
        # a BAM that yields no pileup columns gives an empty list; skip the
        # summary stats instead of asking median()/mean() to summarise nothing
        if coverages:
            out['aln2self_cov_median'] = median(coverages)
            out['aln2self_cov_mean'] = "%0.3f" % mean(coverages)
            nonzero = [n for n in coverages if n > 0]
            if nonzero:
                out['aln2self_cov_mean_non0'] = "%0.3f" % mean(nonzero)
            for thresh in cov_thresholds:
                out['aln2self_cov_%dX' % thresh] = sum(1 for n in coverages if n >= thresh)

    return (header, out)
Example #4
0
def get_assembly_stats(sample,
                       cov_thresholds=(1, 5, 20, 100),
                       assembly_dir='data/02_assembly',
                       assembly_tmp='tmp/02_assembly',
                       align_dir='data/02_align_to_self',
                       reads_dir='data/01_per_sample',
                       raw_reads_dir='data/00_raw'):
    ''' Collect per-sample assembly statistics from the pipeline's output files.

        Looks for a fixed set of files named after `sample` in the given
        directories and records a statistic for each one that is present.
        Inputs that are absent simply leave their columns out of the result.

        Returns a tuple (header, out): `header` is the full ordered list of
        column names (including one 'aln2self_cov_<t>X' column per entry of
        `cov_thresholds`), and `out` is a dict with 'sample' plus the columns
        that could be filled in.
    '''
    samtools = tools.samtools.SamtoolsTool()
    out = {'sample': sample}

    fixed_columns = [
        'sample',
        'reads_raw',
        'reads_cleaned',
        'reads_taxfilt',
        'assembled_trinity',
        'trinity_in_reads',
        'n_contigs',
        'contig_len',
        'unambig_bases',
        'pct_unambig',
        'aln2self_reads_tot',
        'aln2self_reads_aln',
        'aln2self_reads_rmdup',
        'aln2self_pct_nondup',
        'aln2self_cov_median',
        'aln2self_cov_mean',
        'aln2self_cov_mean_non0',
    ]
    header = fixed_columns + ['aln2self_cov_%dX' % t for t in cov_thresholds]

    # --- read counts of the per-sample unaligned BAMs ---
    for stage in ('cleaned', 'taxfilt'):
        stage_bam = os.path.join(reads_dir, sample + '.' + stage + '.bam')
        if os.path.isfile(stage_bam):
            out['reads_' + stage] = samtools.count(stage_bam)

    if os.path.isdir(raw_reads_dir):
        # The dot in front of the wildcard keeps a sample whose name is a
        # prefix of another sample's name from picking up that sample's files.
        # This relies on dots being the filename field separators and on
        # leading/trailing dots being stripped from sample names in
        # util.file.string_to_file_name().
        # TODO: replace this with better filtering?
        lane_pattern = os.path.join(raw_reads_dir, sample + ".*.bam")
        lane_total = 0
        for lane_bam in glob.glob(lane_pattern):
            lane_total += samtools.count(lane_bam)
        out['reads_raw'] = lane_total

        whole_sample_bam = os.path.join(raw_reads_dir, sample + ".bam")
        if os.path.isfile(whole_sample_bam):
            # "00_raw/sample.bam" exists, so these were not demuxed by snakemake
            if not out['reads_raw']:
                # only sample.bam contributes reads: count it
                out['reads_raw'] = samtools.count(whole_sample_bam)
            else:
                # both sample.bam AND sample.library.flowcell.lane.bam exist: a problem!
                out['reads_raw'] = 'ambiguous filenames in raw reads directory!'

    # --- pre-assembly ---
    trinity_fasta = os.path.join(assembly_tmp, sample + '.assembly1-trinity.fasta')
    out['assembled_trinity'] = 1 if os.path.isfile(trinity_fasta) else 0
    subsamp_bam = os.path.join(assembly_tmp, sample + '.subsamp.bam')
    if os.path.isfile(subsamp_bam):
        out['trinity_in_reads'] = samtools.count(subsamp_bam)

    # --- assembly: final FASTA first, scaffolded FASTA as the fallback ---
    candidates = (
        os.path.join(assembly_dir, sample + '.fasta'),
        os.path.join(assembly_tmp, sample + '.assembly2-scaffolded.fasta'),
    )
    assembly_fname = next((p for p in candidates if os.path.isfile(p)), None)
    if assembly_fname is None:
        out['n_contigs'] = 0
    else:
        contig_lens = []
        unambig = []
        with open(assembly_fname, 'rt') as inf:
            for record in Bio.SeqIO.parse(inf, 'fasta'):
                # zero-length records are ignored entirely
                if len(record) > 0:
                    contig_lens.append(len(record))
                    unambig.append(assembly.unambig_count(record.seq))
        out['n_contigs'] = len(contig_lens)
        out['contig_len'] = ','.join(map(str, contig_lens))
        out['unambig_bases'] = ','.join(map(str, unambig))
        out['pct_unambig'] = ','.join(
            str(float(n_unambig) / n_total)
            for n_total, n_unambig in zip(contig_lens, unambig))

    # --- align-to-self read counts ---
    self_bam = os.path.join(align_dir, sample + '.bam')
    if os.path.isfile(self_bam):
        out['aln2self_reads_tot'] = samtools.count(self_bam)
        out['aln2self_reads_aln'] = samtools.count(self_bam, opts=['-F', '4'])
        out['aln2self_reads_rmdup'] = samtools.count(self_bam, opts=['-F', '1028'])
        n_aligned = out['aln2self_reads_aln']
        if n_aligned:
            out['aln2self_pct_nondup'] = float(out['aln2self_reads_rmdup']) / n_aligned

    # --- coverage over the mapped reads ---
    mapped_bam = os.path.join(align_dir, sample + '.mapped.bam')
    if os.path.isfile(mapped_bam):
        with pysam.AlignmentFile(mapped_bam, 'rb') as bam:
            depths = [column.nsegments for column in bam.pileup()]
        if depths:
            out['aln2self_cov_median'] = median(depths)
            out['aln2self_cov_mean'] = "%0.3f" % mean(depths)
            covered = [d for d in depths if d > 0]
            out['aln2self_cov_mean_non0'] = "%0.3f" % mean(covered)
            for min_depth in cov_thresholds:
                column_name = 'aln2self_cov_%dX' % min_depth
                out[column_name] = len([d for d in depths if d >= min_depth])

    return (header, out)
Example #5
0
def get_assembly_stats(sample,
        cov_thresholds=(1,5,20,100),
        assembly_dir='data/02_assembly', assembly_tmp='tmp/02_assembly',
        align_dir='data/02_align_to_self', reads_dir='data/01_per_sample',
        raw_reads_dir='data/00_raw'):
    ''' Fetch assembly-level statistics for a given sample.

        Statistics are gathered in stages (raw/cleaned reads, Trinity,
        assembly, align-to-self read counts, coverage). The function returns
        early, with whatever has been gathered so far, as soon as the
        assembly FASTA, the align-to-self BAM or the mapped BAM is missing.

        Args:
            sample: sample name, used as the filename prefix in every directory.
            cov_thresholds: coverage depths; each threshold t may produce an
                'aln2self_cov_<t>X' entry counting pileup columns with at
                least t segments.
            assembly_dir: directory searched for '<sample>.fasta'.
            assembly_tmp: directory searched for the Trinity contigs, the
                subsampled BAM and the fallback '.assembly2-vfat.fasta'.
            align_dir: directory searched for '<sample>.bam' and
                '<sample>.mapped.bam'.
            reads_dir: directory searched for '<sample>.cleaned.bam' and
                '<sample>.taxfilt.bam'.
            raw_reads_dir: directory globbed for '<sample>*.bam'.

        Returns:
            (header, out): header is the ordered list of all column names;
            out is a dict holding 'sample' plus the columns that were computed.
    '''
    out = {'sample':sample}
    samtools = tools.samtools.SamtoolsTool()
    header = ['sample', 'reads_raw', 'reads_cleaned', 'reads_taxfilt',
        'assembled_trinity', 'trinity_in_reads',
        'n_contigs', 'contig_len', 'unambig_bases', 'pct_unambig',
        'aln2self_reads_tot', 'aln2self_reads_aln', 'aln2self_reads_rmdup', 'aln2self_pct_nondup',
        'aln2self_cov_median', 'aln2self_cov_mean', 'aln2self_cov_mean_non0',
        ] + ['aln2self_cov_%dX'%t for t in cov_thresholds]

    # per-sample unaligned read stats
    for adj in ('cleaned', 'taxfilt'):
        reads_bam = os.path.join(reads_dir, '.'.join((sample, adj, 'bam')))
        if os.path.isfile(reads_bam):
            out['reads_'+adj] = samtools.count(reads_bam)
    # NOTE(review): the pattern sample+"*.bam" also matches files of any other
    # sample whose name merely starts with this sample's name (e.g. 'S1'
    # matches 'S10.bam'), which would inflate this count -- confirm the
    # naming convention in raw_reads_dir before tightening the pattern.
    out['reads_raw'] = sum(samtools.count(bam)
        for bam in glob.glob(os.path.join(raw_reads_dir, sample+"*.bam")))

    # pre-assembly stats
    trinity_fasta = os.path.join(assembly_tmp, sample + '.assembly1-trinity.fasta')
    # 1/0 flag rather than True/False
    out['assembled_trinity'] = int(os.path.isfile(trinity_fasta))
    sub_bam = os.path.join(assembly_tmp, sample + '.subsamp.bam')
    if os.path.isfile(sub_bam):
        out['trinity_in_reads'] = samtools.count(sub_bam)

    # assembly stats: prefer the final assembly, fall back to the vfat one
    assembly_fname = os.path.join(assembly_dir, sample + '.fasta')
    if not os.path.isfile(assembly_fname):
        assembly_fname = os.path.join(assembly_tmp, sample + '.assembly2-vfat.fasta')
        if not os.path.isfile(assembly_fname):
            out['n_contigs'] = 0
            return (header, out)
    with open(assembly_fname, 'rt') as inf:
        # (length, unambiguous count) per non-empty record; skipping empty
        # records also keeps the pct_unambig division below safe
        counts = [(len(s), assembly.unambig_count(s.seq))
            for s in Bio.SeqIO.parse(inf, 'fasta')
            if len(s)>0]
    out['n_contigs'] = len(counts)
    out['contig_len'] = ','.join(str(x) for x,y in counts)
    out['unambig_bases'] = ','.join(str(y) for x,y in counts)
    out['pct_unambig'] = ','.join(str(float(y)/x) for x,y in counts)

    # read counts from align-to-self
    bam_fname = os.path.join(align_dir, sample + '.bam')
    if not os.path.isfile(bam_fname):
        return (header, out)
    out['aln2self_reads_tot'] = samtools.count(bam_fname)
    out['aln2self_reads_aln'] = samtools.count(bam_fname, opts=['-F', '4'])
    out['aln2self_reads_rmdup'] = samtools.count(bam_fname, opts=['-F', '1028'])
    if out['aln2self_reads_aln']:
        out['aln2self_pct_nondup'] = float(out['aln2self_reads_rmdup']) / out['aln2self_reads_aln']

    # genome coverage stats
    bam_fname = os.path.join(align_dir, sample + '.mapped.bam')
    # the mapped BAM is optional like every other input: do not try to open
    # a file that is not there
    if not os.path.isfile(bam_fname):
        return (header, out)
    with pysam.AlignmentFile(bam_fname, 'rb') as bam:
        coverages = [pcol.nsegments for pcol in bam.pileup()]
    # no pileup columns at all: nothing to summarise
    if not coverages:
        return (header, out)
    out['aln2self_cov_median'] = median(coverages)
    out['aln2self_cov_mean'] = "%0.3f"%mean(coverages)
    nonzero = [n for n in coverages if n>0]
    if nonzero:
        out['aln2self_cov_mean_non0'] = "%0.3f"%mean(nonzero)
    for thresh in cov_thresholds:
        out['aln2self_cov_%dX'%thresh] = sum(1 for n in coverages if n>=thresh)

    return (header, out)
Example #6
0
def get_assembly_stats(sample,
                       cov_thresholds=(1, 5, 20, 100),
                       assembly_dir='data/02_assembly',
                       assembly_tmp='tmp/02_assembly',
                       align_dir='data/02_align_to_self',
                       reads_dir='data/01_per_sample',
                       raw_reads_dir='data/00_raw'):
    ''' Fetch assembly-level statistics for a given sample.

        The statistics are collected stage by stage: read counts, Trinity
        outputs, the assembly FASTA, align-to-self read counts and finally
        coverage. If the assembly FASTA, the align-to-self BAM or the mapped
        BAM cannot be found, the function returns right away with the
        statistics collected up to that point.

        Args:
            sample: sample name, used as the filename prefix in every directory.
            cov_thresholds: coverage depths; each threshold t may produce an
                'aln2self_cov_<t>X' entry counting pileup columns with at
                least t segments.
            assembly_dir: directory searched for '<sample>.fasta'.
            assembly_tmp: directory searched for the Trinity contigs, the
                subsampled BAM and the fallback '.assembly2-vfat.fasta'.
            align_dir: directory searched for '<sample>.bam' and
                '<sample>.mapped.bam'.
            reads_dir: directory searched for '<sample>.cleaned.bam' and
                '<sample>.taxfilt.bam'.
            raw_reads_dir: directory globbed for '<sample>*.bam'.

        Returns:
            (header, out): header is the ordered list of all column names;
            out is a dict holding 'sample' plus the columns that were computed.
    '''
    out = {'sample': sample}
    samtools = tools.samtools.SamtoolsTool()
    header = [
        'sample',
        'reads_raw',
        'reads_cleaned',
        'reads_taxfilt',
        'assembled_trinity',
        'trinity_in_reads',
        'n_contigs',
        'contig_len',
        'unambig_bases',
        'pct_unambig',
        'aln2self_reads_tot',
        'aln2self_reads_aln',
        'aln2self_reads_rmdup',
        'aln2self_pct_nondup',
        'aln2self_cov_median',
        'aln2self_cov_mean',
        'aln2self_cov_mean_non0',
    ] + ['aln2self_cov_%dX' % t for t in cov_thresholds]

    # per-sample unaligned read stats
    for adj in ('cleaned', 'taxfilt'):
        reads_bam = os.path.join(reads_dir, '.'.join((sample, adj, 'bam')))
        if os.path.isfile(reads_bam):
            out['reads_' + adj] = samtools.count(reads_bam)
    # NOTE(review): sample + "*.bam" also matches the files of any other sample
    # whose name merely begins with this sample's name (e.g. 'S1' matches
    # 'S10.bam'), which would inflate this count -- confirm the naming
    # convention in raw_reads_dir before tightening the pattern.
    out['reads_raw'] = sum(
        samtools.count(bam)
        for bam in glob.glob(os.path.join(raw_reads_dir, sample + "*.bam")))

    # pre-assembly stats
    trinity_fasta = os.path.join(assembly_tmp,
                                 sample + '.assembly1-trinity.fasta')
    # stored as a 1/0 flag rather than True/False
    out['assembled_trinity'] = int(os.path.isfile(trinity_fasta))
    sub_bam = os.path.join(assembly_tmp, sample + '.subsamp.bam')
    if os.path.isfile(sub_bam):
        out['trinity_in_reads'] = samtools.count(sub_bam)

    # assembly stats: the final assembly wins, the vfat one is the fallback
    assembly_fname = os.path.join(assembly_dir, sample + '.fasta')
    if not os.path.isfile(assembly_fname):
        assembly_fname = os.path.join(assembly_tmp,
                                      sample + '.assembly2-vfat.fasta')
        if not os.path.isfile(assembly_fname):
            out['n_contigs'] = 0
            return (header, out)
    with open(assembly_fname, 'rt') as inf:
        # one (length, unambiguous count) pair per non-empty record; dropping
        # empty records also protects the pct_unambig division below
        counts = [(len(s), assembly.unambig_count(s.seq))
                  for s in Bio.SeqIO.parse(inf, 'fasta') if len(s) > 0]
    out['n_contigs'] = len(counts)
    out['contig_len'] = ','.join(str(x) for x, y in counts)
    out['unambig_bases'] = ','.join(str(y) for x, y in counts)
    out['pct_unambig'] = ','.join(str(float(y) / x) for x, y in counts)

    # read counts from align-to-self
    bam_fname = os.path.join(align_dir, sample + '.bam')
    if not os.path.isfile(bam_fname):
        return (header, out)
    out['aln2self_reads_tot'] = samtools.count(bam_fname)
    out['aln2self_reads_aln'] = samtools.count(bam_fname, opts=['-F', '4'])
    out['aln2self_reads_rmdup'] = samtools.count(bam_fname,
                                                 opts=['-F', '1028'])
    if out['aln2self_reads_aln']:
        out['aln2self_pct_nondup'] = float(
            out['aln2self_reads_rmdup']) / out['aln2self_reads_aln']

    # genome coverage stats
    bam_fname = os.path.join(align_dir, sample + '.mapped.bam')
    # treat the mapped BAM as optional, like every other input, instead of
    # letting pysam fail on a missing file
    if not os.path.isfile(bam_fname):
        return (header, out)
    with pysam.AlignmentFile(bam_fname, 'rb') as bam:
        coverages = [pcol.nsegments for pcol in bam.pileup()]
    # without any pileup columns there is nothing to summarise
    if not coverages:
        return (header, out)
    out['aln2self_cov_median'] = median(coverages)
    out['aln2self_cov_mean'] = "%0.3f" % mean(coverages)
    nonzero = [n for n in coverages if n > 0]
    if nonzero:
        out['aln2self_cov_mean_non0'] = "%0.3f" % mean(nonzero)
    for thresh in cov_thresholds:
        out['aln2self_cov_%dX' % thresh] = sum(1 for n in coverages
                                               if n >= thresh)

    return (header, out)