Beispiel #1
0
    def _subcontigs_from_strand_bias(self, bam, ctg_name):
        '''Trim or split one contig according to its per-strand coverage.

        Forward and reverse coverage of the contig are read from bam and
        passed to self._good_intervals_from_strand_coverage(). Interval
        end points are treated as inclusive.

        - Exactly one interval: the contig's sequence is cut down to that
          interval in place and an empty list is returned.
        - More than one interval: the contig itself is left untouched and
          each interval spanning at least 100 bases is returned as a new
          pyfastaq Fasta named "<ctg_name>.<n>", where n is the 1-based
          position of the interval in the list (intervals that are too
          short are skipped but still counted).
        - No intervals: nothing is changed and an empty list is returned.
        '''
        contig = self.contigs[ctg_name]
        ctg_length = len(contig)
        fwd_cov = mapping.get_bam_region_coverage(bam, ctg_name, ctg_length)
        rev_cov = mapping.get_bam_region_coverage(bam,
                                                  ctg_name,
                                                  ctg_length,
                                                  rev=True)
        good_intervals = self._good_intervals_from_strand_coverage(
            fwd_cov, rev_cov)
        new_contigs = []

        if len(good_intervals) == 1:
            only = good_intervals[0]
            contig.fa.seq = contig.fa.seq[only[0]:only[1] + 1]
        elif len(good_intervals) > 1:
            for number, interval in enumerate(good_intervals, start=1):
                start = interval[0]
                end = interval[1]
                # Fragments shorter than 100 bases are dropped.
                if end - start + 1 >= 100:
                    new_contigs.append(
                        pyfastaq.sequences.Fasta(
                            ctg_name + '.' + str(number),
                            contig.fa[start:end + 1]))

        return new_contigs
Beispiel #2
0
    def _trim_contig_for_strand_bias(self, bam, ctg_name):
        '''Trim badly supported bases off both ends of a contig, in place.

        A position counts as good when its total (forward + reverse)
        coverage is at least self.ext_min_cov and the less covered strand
        accounts for at least a fraction self.strand_bias of that total.
        The contig's sequence is replaced by the span running from the
        first good position to the last good position, both inclusive.
        If no position is good, the sequence ends up empty.

        Nothing is done when ctg_name is already in
        self.contigs_trimmed_for_strand_bias. Returns None.
        '''
        assert os.path.exists(bam)
        if ctg_name in self.contigs_trimmed_for_strand_bias:
            return
        contig = self.contigs[ctg_name]
        ctg_length = len(contig)
        fwd_cov = mapping.get_bam_region_coverage(bam, ctg_name, ctg_length)
        rev_cov = mapping.get_bam_region_coverage(bam,
                                                  ctg_name,
                                                  ctg_length,
                                                  rev=True)

        def is_good(pos):
            fwd = fwd_cov[pos]
            rev = rev_cov[pos]
            total_cov = fwd + rev
            # An uncovered position has no defined strand ratio; treat it
            # as bad instead of dividing by zero (which could happen when
            # self.ext_min_cov is 0 or negative).
            if total_cov <= 0:
                return False
            return (total_cov >= self.ext_min_cov
                    and min(fwd, rev) / total_cov >= self.strand_bias)

        # Walk in from the left end until a good position is found.
        first_good_base = 0
        while first_good_base < ctg_length and not is_good(first_good_base):
            first_good_base += 1

        # Walk in from the right end, never crossing first_good_base.
        last_good_base = ctg_length - 1
        while last_good_base > first_good_base and not is_good(
                last_good_base):
            last_good_base -= 1

        if self.verbose >= 2:
            print('Trimming strand biased ends of contig', ctg_name,
                  '- good base range is', first_good_base + 1, 'to',
                  last_good_base + 1, 'from', ctg_length, 'bases')
        # NOTE(review): this block never records ctg_name in
        # self.contigs_trimmed_for_strand_bias - confirm the caller does.
        contig.fa.seq = contig.fa.seq[first_good_base:last_good_base + 1]
Beispiel #3
0
 def test_get_bam_region_coverage_fwd_And_rev(self):
     '''Test get_bam_region_coverage both strands'''
     bam = os.path.join(data_dir, 'mapping_test.smalt.out.sorted.bam')
     cov = mapping.get_bam_region_coverage(bam, 'ref', 190, verbose=3, both_strands=True)
     expected_file = os.path.join(data_dir, 'mapping_test.smalt.out.sorted.bam.fwd_and_rev.cov')
     # The expected coverage is stored as a pickle in the test data
     # directory; the with-block closes the file even if loading fails.
     with open(expected_file, 'rb') as f:
         expected = pickle.load(f)
     self.assertListEqual(cov, expected)
Beispiel #4
0
    def _subcontigs_from_strand_bias(self, bam, ctg_name):
        '''Trim or split one contig according to its per-strand coverage.

        Forward and reverse coverage of the contig are read from bam and
        passed to self._good_intervals_from_strand_coverage(). Interval
        end points are treated as inclusive.

        With exactly one interval, the contig's sequence is cut down to that
        interval in place and an empty list is returned. With several
        intervals, the contig is left untouched and every interval of at
        least 100 bases is returned as a new pyfastaq Fasta named
        "<ctg_name>.<n>", n being the 1-based position of the interval in
        the list (short intervals are skipped but still counted). With no
        intervals, nothing changes and an empty list is returned.
        '''
        contig = self.contigs[ctg_name]
        ctg_length = len(contig)
        fwd_cov = mapping.get_bam_region_coverage(bam, ctg_name, ctg_length)
        rev_cov = mapping.get_bam_region_coverage(bam, ctg_name, ctg_length, rev=True)
        good_intervals = self._good_intervals_from_strand_coverage(fwd_cov, rev_cov)
        new_contigs = []

        if len(good_intervals) == 1:
            only = good_intervals[0]
            contig.fa.seq = contig.fa.seq[only[0]:only[1]+1]
        elif len(good_intervals) > 1:
            for number, interval in enumerate(good_intervals, start=1):
                start = interval[0]
                end = interval[1]
                # Fragments shorter than 100 bases are dropped.
                if end - start + 1 >= 100:
                    new_contigs.append(pyfastaq.sequences.Fasta(ctg_name + '.' + str(number), contig.fa[start:end+1]))

        return new_contigs
Beispiel #5
0
    def _trim_contig_for_strand_bias(self, bam, ctg_name):
        '''Trim badly supported bases off both ends of a contig, in place.

        A position counts as good when its total (forward + reverse)
        coverage is at least self.ext_min_cov and the less covered strand
        accounts for at least a fraction self.strand_bias of that total.
        The contig's sequence is replaced by the span running from the
        first good position to the last good position, both inclusive.
        If no position is good, the sequence ends up empty.

        Nothing is done when ctg_name is already in
        self.contigs_trimmed_for_strand_bias. Returns None.
        '''
        assert os.path.exists(bam)
        if ctg_name in self.contigs_trimmed_for_strand_bias:
            return
        contig = self.contigs[ctg_name]
        ctg_length = len(contig)
        fwd_cov = mapping.get_bam_region_coverage(bam, ctg_name, ctg_length)
        rev_cov = mapping.get_bam_region_coverage(bam, ctg_name, ctg_length, rev=True)

        def is_good(pos):
            fwd = fwd_cov[pos]
            rev = rev_cov[pos]
            total_cov = fwd + rev
            # An uncovered position has no defined strand ratio; treat it
            # as bad instead of dividing by zero (which could happen when
            # self.ext_min_cov is 0 or negative).
            if total_cov <= 0:
                return False
            return total_cov >= self.ext_min_cov and min(fwd, rev) / total_cov >= self.strand_bias

        # Walk in from the left end until a good position is found.
        first_good_base = 0
        while first_good_base < ctg_length and not is_good(first_good_base):
            first_good_base += 1

        # Walk in from the right end, never crossing first_good_base.
        last_good_base = ctg_length - 1
        while last_good_base > first_good_base and not is_good(last_good_base):
            last_good_base -= 1

        if self.verbose >= 2:
            print('Trimming strand biased ends of contig', ctg_name, '- good base range is', first_good_base + 1, 'to', last_good_base + 1, 'from', ctg_length, 'bases')
        # NOTE(review): this block never records ctg_name in
        # self.contigs_trimmed_for_strand_bias - confirm the caller does.
        contig.fa.seq = contig.fa.seq[first_good_base:last_good_base+1]
Beispiel #6
0
def _trim_ends(fasta_in,
               fasta_out,
               to_trim,
               min_length=100,
               min_dist_to_end=25,
               window_length=10,
               min_pc=90):
    '''Trim sequences off contig ends.

    to_trim is mapped against fasta_in with mapping.map_reads() inside a
    temporary directory created under the current working directory. For
    each sequence in fasta_in, the both-strands coverage from the resulting
    BAM is given to _coverage_to_trimmed_coords(), together with
    min_dist_to_end, window_length and min_pc (forwarded unchanged).

    Sequences for which that call returns None are dropped. The others are
    cut down to the returned inclusive coordinate pair and written to
    fasta_out if at least min_length bases remain.

    The output file is closed and the temporary directory removed even if
    mapping or trimming raises.
    '''
    tmpdir = tempfile.mkdtemp(prefix='tmp.adapter_trim.', dir=os.getcwd())
    try:
        tmp_prefix = os.path.join(tmpdir, 'out')
        sorted_bam = tmp_prefix + '.bam'
        mapping.map_reads(to_trim,
                          None,
                          fasta_in,
                          tmp_prefix,
                          index_k=9,
                          index_s=1,
                          threads=1,
                          minid=0.75,
                          sort=True,
                          extra_smalt_map_ops='-d -1 -m 10')

        f_out = pyfastaq.utils.open_file_write(fasta_out)
        try:
            for seq in pyfastaq.sequences.file_reader(fasta_in):
                coverage = mapping.get_bam_region_coverage(sorted_bam,
                                                           seq.id,
                                                           len(seq),
                                                           both_strands=True)
                good_coords = _coverage_to_trimmed_coords(
                    coverage,
                    min_dist_to_end=min_dist_to_end,
                    window_length=window_length,
                    min_pc=min_pc)
                if good_coords is None:
                    continue

                seq.seq = seq.seq[good_coords[0]:good_coords[1] + 1]
                if len(seq) >= min_length:
                    print(seq, file=f_out)
        finally:
            pyfastaq.utils.close(f_out)
    finally:
        # Always clean up the scratch directory, including on failure.
        shutil.rmtree(tmpdir)
Beispiel #7
0
def _trim_ends(fasta_in, fasta_out, to_trim, min_length=100, min_dist_to_end=25, window_length=10, min_pc=90):
    '''Trim sequences off contig ends.

    to_trim is mapped against fasta_in with mapping.map_reads() inside a
    temporary directory created under the current working directory. For
    each sequence in fasta_in, the both-strands coverage from the resulting
    BAM is given to _coverage_to_trimmed_coords(), together with
    min_dist_to_end, window_length and min_pc (forwarded unchanged).

    Sequences for which that call returns None are dropped. The others are
    cut down to the returned inclusive coordinate pair and written to
    fasta_out if at least min_length bases remain.

    The output file is closed and the temporary directory removed even if
    mapping or trimming raises.
    '''
    tmpdir = tempfile.mkdtemp(prefix='tmp.adapter_trim.', dir=os.getcwd())
    try:
        tmp_prefix = os.path.join(tmpdir, 'out')
        sorted_bam = tmp_prefix + '.bam'
        mapping.map_reads(to_trim, None, fasta_in, tmp_prefix, index_k=9, index_s=1, threads=1, minid=0.75, sort=True, extra_smalt_map_ops='-d -1 -m 10')

        f_out = pyfastaq.utils.open_file_write(fasta_out)
        try:
            for seq in pyfastaq.sequences.file_reader(fasta_in):
                coverage = mapping.get_bam_region_coverage(sorted_bam, seq.id, len(seq), both_strands=True)
                good_coords = _coverage_to_trimmed_coords(coverage, min_dist_to_end=min_dist_to_end, window_length=window_length, min_pc=min_pc)
                if good_coords is None:
                    continue

                seq.seq = seq.seq[good_coords[0]:good_coords[1]+1]
                if len(seq) >= min_length:
                    print(seq, file=f_out)
        finally:
            pyfastaq.utils.close(f_out)
    finally:
        # Always clean up the scratch directory, including on failure.
        shutil.rmtree(tmpdir)