def _subcontigs_from_strand_bias(self, bam, ctg_name):
    '''Split or trim contig ctg_name according to its per-strand coverage.

    Forward and reverse coverage of the contig are read from bam and passed
    to self._good_intervals_from_strand_coverage(). Each interval it returns
    is used here as (start, end), with end inclusive.

    - Exactly one interval: the contig's sequence is cut down to that
      interval in place and an empty list is returned.
    - More than one interval: the contig itself is left untouched and a list
      of new pyfastaq Fasta objects is returned, one per interval that is at
      least 100 bases long. Each is named ctg_name.N, where N is the 1-based
      position of the interval in the list (short intervals still use up
      their number).
    - No intervals: nothing is changed and an empty list is returned.
    '''
    contig = self.contigs[ctg_name]
    ctg_length = len(contig)
    fwd_cov = mapping.get_bam_region_coverage(bam, ctg_name, ctg_length)
    rev_cov = mapping.get_bam_region_coverage(bam, ctg_name, ctg_length, rev=True)
    good_intervals = self._good_intervals_from_strand_coverage(fwd_cov, rev_cov)

    if len(good_intervals) == 1:
        only = good_intervals[0]
        contig.fa.seq = contig.fa.seq[only[0]:only[1] + 1]
        return []

    if len(good_intervals) < 1:
        return []

    return [
        pyfastaq.sequences.Fasta(
            ctg_name + '.' + str(number),
            contig.fa[interval[0]:interval[1] + 1])
        for number, interval in enumerate(good_intervals, start=1)
        if interval[1] - interval[0] + 1 >= 100
    ]
def _trim_contig_for_strand_bias(self, bam, ctg_name):
    '''Trim strand-biased bases off both ends of contig ctg_name, in place.

    A position counts as "good" when its total (forward + reverse) coverage
    is non-zero and at least self.ext_min_cov, and the less-covered strand
    contributes at least a fraction self.strand_bias of that total. The
    contig's sequence is replaced by the slice running from the first good
    position to the last good position. If no position is good, the sequence
    becomes empty.

    Does nothing if ctg_name is already in
    self.contigs_trimmed_for_strand_bias. This method does not add the name
    to that collection itself.

    bam must be the path of an existing file (checked with an assert).
    '''
    assert os.path.exists(bam)
    if ctg_name in self.contigs_trimmed_for_strand_bias:
        return

    ctg_length = len(self.contigs[ctg_name])
    fwd_cov = mapping.get_bam_region_coverage(bam, ctg_name, ctg_length)
    rev_cov = mapping.get_bam_region_coverage(bam, ctg_name, ctg_length, rev=True)

    def is_good_base(pos):
        # Shared test for both ends of the contig.
        total_cov = fwd_cov[pos] + rev_cov[pos]
        # The explicit total_cov > 0 check avoids a ZeroDivisionError at
        # uncovered positions when self.ext_min_cov is 0 (or negative).
        return (
            total_cov > 0
            and total_cov >= self.ext_min_cov
            and min(fwd_cov[pos], rev_cov[pos]) / total_cov >= self.strand_bias
        )

    # Walk in from the left until a good base is found (or we run off the end).
    first_good_base = 0
    while first_good_base < ctg_length and not is_good_base(first_good_base):
        first_good_base += 1

    # Walk in from the right, never going past the first good base.
    last_good_base = ctg_length - 1
    while last_good_base > first_good_base and not is_good_base(last_good_base):
        last_good_base -= 1

    if self.verbose >= 2:
        print('Trimming strand biased ends of contig', ctg_name, '- good base range is', first_good_base + 1, 'to', last_good_base + 1, 'from', ctg_length, 'bases')

    self.contigs[ctg_name].fa.seq = self.contigs[ctg_name].fa.seq[first_good_base:last_good_base + 1]
def test_get_bam_region_coverage_fwd_And_rev(self):
    '''Test get_bam_region_coverage both strands'''
    bam = os.path.join(data_dir, 'mapping_test.smalt.out.sorted.bam')
    cov = mapping.get_bam_region_coverage(bam, 'ref', 190, verbose=3, both_strands=True)
    # The expected coverage is stored as a pickle next to the BAM file.
    # Using a context manager closes the handle even if unpickling fails.
    expected_file = os.path.join(data_dir, 'mapping_test.smalt.out.sorted.bam.fwd_and_rev.cov')
    with open(expected_file, 'rb') as f:
        expected = pickle.load(f)
    self.assertListEqual(cov, expected)
def _subcontigs_from_strand_bias(self, bam, ctg_name):
    '''Use per-strand read coverage to trim contig ctg_name or break it up.

    Coverage on each strand is fetched from bam, then
    self._good_intervals_from_strand_coverage() turns the two coverage
    arrays into a list of intervals. Each interval is indexed here as
    [0] = start and [1] = end, with end inclusive.

    Returns a list of new pyfastaq Fasta objects. The list is only non-empty
    when there is more than one interval: each interval of length >= 100
    becomes a new sequence named ctg_name.N, with N counting every interval
    from 1, including those that are too short to be kept. With exactly one
    interval the existing contig is trimmed to it in place instead. With no
    intervals nothing happens.
    '''
    contig = self.contigs[ctg_name]
    ctg_length = len(contig)
    fwd_cov = mapping.get_bam_region_coverage(bam, ctg_name, ctg_length)
    rev_cov = mapping.get_bam_region_coverage(bam, ctg_name, ctg_length, rev=True)
    good_intervals = self._good_intervals_from_strand_coverage(fwd_cov, rev_cov)
    interval_count = len(good_intervals)
    new_contigs = []

    if interval_count == 1:
        first, last = good_intervals[0][0], good_intervals[0][1]
        contig.fa.seq = contig.fa.seq[first:last + 1]
    elif interval_count > 1:
        for number, interval in enumerate(good_intervals, start=1):
            start = interval[0]
            end = interval[1]
            if end - start + 1 < 100:
                # Too short to be worth keeping as its own contig.
                continue
            name = ctg_name + '.' + str(number)
            new_contigs.append(pyfastaq.sequences.Fasta(name, contig.fa[start:end + 1]))

    return new_contigs
def _trim_contig_for_strand_bias(self, bam, ctg_name):
    '''Trim strand-biased bases off both ends of contig ctg_name, in place.

    A position counts as "good" when its total (forward + reverse) coverage
    is non-zero and at least self.ext_min_cov, and the less-covered strand
    contributes at least a fraction self.strand_bias of that total. The
    contig's sequence is replaced by the slice running from the first good
    position to the last good position. If no position is good, the sequence
    becomes empty.

    Does nothing if ctg_name is already in
    self.contigs_trimmed_for_strand_bias. This method does not add the name
    to that collection itself.

    bam must be the path of an existing file (checked with an assert).
    '''
    assert os.path.exists(bam)
    if ctg_name in self.contigs_trimmed_for_strand_bias:
        return

    ctg_length = len(self.contigs[ctg_name])
    fwd_cov = mapping.get_bam_region_coverage(bam, ctg_name, ctg_length)
    rev_cov = mapping.get_bam_region_coverage(bam, ctg_name, ctg_length, rev=True)

    def is_good_base(pos):
        # Shared test for both ends of the contig.
        total_cov = fwd_cov[pos] + rev_cov[pos]
        # The explicit total_cov > 0 check avoids a ZeroDivisionError at
        # uncovered positions when self.ext_min_cov is 0 (or negative).
        return (
            total_cov > 0
            and total_cov >= self.ext_min_cov
            and min(fwd_cov[pos], rev_cov[pos]) / total_cov >= self.strand_bias
        )

    # Walk in from the left until a good base is found (or we run off the end).
    first_good_base = 0
    while first_good_base < ctg_length and not is_good_base(first_good_base):
        first_good_base += 1

    # Walk in from the right, never going past the first good base.
    last_good_base = ctg_length - 1
    while last_good_base > first_good_base and not is_good_base(last_good_base):
        last_good_base -= 1

    if self.verbose >= 2:
        print('Trimming strand biased ends of contig', ctg_name, '- good base range is', first_good_base + 1, 'to', last_good_base + 1, 'from', ctg_length, 'bases')

    self.contigs[ctg_name].fa.seq = self.contigs[ctg_name].fa.seq[first_good_base:last_good_base + 1]
def _trim_ends(fasta_in, fasta_out, to_trim, min_length=100, min_dist_to_end=25, window_length=10, min_pc=90):
    '''Trim sequences off contig ends.

    Calls mapping.map_reads() with to_trim and fasta_in, writing a sorted BAM
    into a temporary directory created in the current working directory
    (presumably to_trim is mapped as reads against fasta_in as the
    reference; confirm against mapping.map_reads). For every sequence in
    fasta_in, the combined-strand coverage from that BAM is given to
    _coverage_to_trimmed_coords(), together with min_dist_to_end,
    window_length and min_pc, to obtain the (start, end) coordinates to keep,
    end inclusive.

    A sequence is written to fasta_out only if coordinates were returned
    (not None) and the trimmed sequence is at least min_length long.

    The output handle is closed and the temporary directory removed even if
    mapping or trimming raises.
    '''
    tmpdir = tempfile.mkdtemp(prefix='tmp.adapter_trim.', dir=os.getcwd())
    try:
        tmp_prefix = os.path.join(tmpdir, 'out')
        sorted_bam = tmp_prefix + '.bam'
        mapping.map_reads(to_trim, None, fasta_in, tmp_prefix, index_k=9, index_s=1, threads=1, minid=0.75, sort=True, extra_smalt_map_ops='-d -1 -m 10')

        f_out = pyfastaq.utils.open_file_write(fasta_out)
        try:
            seq_reader = pyfastaq.sequences.file_reader(fasta_in)
            for seq in seq_reader:
                coverage = mapping.get_bam_region_coverage(sorted_bam, seq.id, len(seq), both_strands=True)
                good_coords = _coverage_to_trimmed_coords(
                    coverage,
                    min_dist_to_end=min_dist_to_end,
                    window_length=window_length,
                    min_pc=min_pc)
                if good_coords is None:
                    # No usable region: drop this sequence from the output.
                    continue

                seq.seq = seq.seq[good_coords[0]:good_coords[1] + 1]
                if len(seq) >= min_length:
                    print(seq, file=f_out)
        finally:
            pyfastaq.utils.close(f_out)
    finally:
        # mkdtemp() never cleans up after itself, so always remove the directory.
        shutil.rmtree(tmpdir)
def _trim_ends(fasta_in, fasta_out, to_trim, min_length=100, min_dist_to_end=25, window_length=10, min_pc=90):
    '''Trim sequences off contig ends.

    Calls mapping.map_reads() with to_trim and fasta_in, writing a sorted BAM
    into a temporary directory created in the current working directory
    (presumably to_trim is mapped as reads against fasta_in as the
    reference; confirm against mapping.map_reads). For every sequence in
    fasta_in, the combined-strand coverage from that BAM is given to
    _coverage_to_trimmed_coords(), together with min_dist_to_end,
    window_length and min_pc, to obtain the (start, end) coordinates to keep,
    end inclusive.

    A sequence is written to fasta_out only if coordinates were returned
    (not None) and the trimmed sequence is at least min_length long.

    The output handle is closed and the temporary directory removed even if
    mapping or trimming raises.
    '''
    tmpdir = tempfile.mkdtemp(prefix='tmp.adapter_trim.', dir=os.getcwd())
    try:
        tmp_prefix = os.path.join(tmpdir, 'out')
        sorted_bam = tmp_prefix + '.bam'
        mapping.map_reads(to_trim, None, fasta_in, tmp_prefix, index_k=9, index_s=1, threads=1, minid=0.75, sort=True, extra_smalt_map_ops='-d -1 -m 10')

        f_out = pyfastaq.utils.open_file_write(fasta_out)
        try:
            seq_reader = pyfastaq.sequences.file_reader(fasta_in)
            for seq in seq_reader:
                coverage = mapping.get_bam_region_coverage(sorted_bam, seq.id, len(seq), both_strands=True)
                good_coords = _coverage_to_trimmed_coords(
                    coverage,
                    min_dist_to_end=min_dist_to_end,
                    window_length=window_length,
                    min_pc=min_pc)
                if good_coords is None:
                    # No usable region: drop this sequence from the output.
                    continue

                seq.seq = seq.seq[good_coords[0]:good_coords[1] + 1]
                if len(seq) >= min_length:
                    print(seq, file=f_out)
        finally:
            pyfastaq.utils.close(f_out)
    finally:
        # mkdtemp() never cleans up after itself, so always remove the directory.
        shutil.rmtree(tmpdir)