def _break_reads(self, contig, position, fout, min_read_length=250):
    '''Write all reads from contig to fout, breaking reads that span position.

    A read whose aligned interval strictly contains position (0-based, in
    the reference) is cut into a ".left" and a ".right" piece, and each
    piece is written only if it passes its length check. Every other read
    fetched from the contig is written whole. The break point is
    approximate: it is the reference offset from the alignment start, so
    indels in the alignment are ignored.

    contig: name of the reference sequence whose reads are fetched.
    position: 0-based reference position at which reads are broken.
    fout: writable text file object that receives the reads.
    min_read_length: length threshold used when deciding whether a piece
        of a broken read is written.
    '''
    def as_sequence(aligned):
        # revcomp=False: presumably keeps the sequence in alignment
        # orientation so the split offset applies; reverse-strand reads
        # are flipped just before writing, below.
        return mapping.aligned_read_to_read(
            aligned, revcomp=False, ignore_quality=not self.fastq_out)

    sam_reader = pysam.Samfile(self.bam, "rb")
    try:
        for read in sam_reader.fetch(contig):
            if read.pos < position < read.reference_end - 1:
                split_point = position - read.pos
                seqs = []
                # Left piece is [0, split_point) of the read.
                if split_point - 1 >= min_read_length:
                    left = as_sequence(read).subseq(0, split_point)
                    left.id += '.left'
                    seqs.append(left)
                # Right piece is the remainder of the read.
                if read.query_length - split_point >= min_read_length:
                    right = as_sequence(read).subseq(
                        split_point, read.query_length)
                    right.id += '.right'
                    seqs.append(right)
            else:
                # Read does not span the break position: keep it intact.
                seqs = [as_sequence(read)]

            for seq in seqs:
                if read.is_reverse:
                    seq.revcomp()
                print(seq, file=fout)
    finally:
        # Release the BAM handle even if conversion or writing fails.
        sam_reader.close()
def test_aligned_read_to_read(self):
    '''aligned_read_to_read should give back the expected reads for both test alignments'''
    bam_path = os.path.join(data_dir, 'mapping_test_aligned_read_to_read.bam')
    bam = pysam.Samfile(bam_path, "rb")
    first_aln, second_aln = list(bam.fetch())

    # First record: the same sequence is expected with and without revcomp.
    read1_bases = 'TGTGTAACACTCCACCTCTGGTTCCCAGAGTTCGGTATCCGGCCGATACTTGAGGATAGC'
    read1_quals = 'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGHFEDCBA'
    expected_fq = pyfastaq.sequences.Fastq('read1', read1_bases, read1_quals)
    expected_fa = pyfastaq.sequences.Fasta('read1', read1_bases)

    got = mapping.aligned_read_to_read(first_aln)
    self.assertEqual(expected_fq, got)
    got = mapping.aligned_read_to_read(first_aln, revcomp=False)
    self.assertEqual(expected_fq, got)
    got = mapping.aligned_read_to_read(first_aln, ignore_quality=True)
    self.assertEqual(expected_fa, got)

    # Second record: revcomp=False is expected to give the reverse
    # complement of the default result.
    expected_fwd = pyfastaq.sequences.Fastq(
        'read2',
        'GATCGTCACGAAAGAACCAAGCCGGATCGTGGGAGGGGTACAACTCAGGTGAATTAACGT',
        'HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGFEDC')
    expected_rev = copy.copy(expected_fwd)
    expected_rev.revcomp()

    got = mapping.aligned_read_to_read(second_aln)
    self.assertEqual(expected_fwd, got)
    got = mapping.aligned_read_to_read(second_aln, revcomp=False)
    self.assertEqual(expected_rev, got)
def _all_reads_from_contig(self, contig, fout):
    '''Gets all reads from contig called "contig" and writes to fout.

    contig: name of the reference sequence whose reads are fetched.
    fout: writable text file object that receives the reads.

    Qualities are dropped unless self.fastq_out is true.
    '''
    sam_reader = pysam.Samfile(self.bam, "rb")
    try:
        for read in sam_reader.fetch(contig):
            converted = mapping.aligned_read_to_read(
                read, ignore_quality=not self.fastq_out)
            print(converted, file=fout)
    finally:
        # Release the BAM handle even if conversion or writing fails.
        sam_reader.close()
def _get_region(self, contig, start, end, fout, min_length=250):
    '''Writes reads mapping to given region of contig, trimming part of read not in the region.

    When start is 0 the tail of each read that runs past end is trimmed;
    otherwise the head of each read that lies before start is trimmed.
    Trimming is done in read coordinates using the reference overhang, so
    indels inside the alignment are not accounted for.

    contig: name of the reference sequence to fetch reads from.
    start, end: region passed to the BAM fetch call.
    fout: writable text file object that receives the reads.
    min_length: a trimmed read is written only if it is at least this long.
    '''
    # A region starting at 0 means only the read ends may need trimming.
    trimming_end = (start == 0)
    sam_reader = pysam.Samfile(self.bam, "rb")
    try:
        for read in sam_reader.fetch(contig, start, end):
            # revcomp=False: trim first, reverse-complement afterwards.
            seq = mapping.aligned_read_to_read(
                read, ignore_quality=not self.fastq_out, revcomp=False)

            if trimming_end:
                bases_off_end = max(0, read.reference_end - 1 - end)
                seq = seq.subseq(0, read.query_alignment_end - bases_off_end)
            else:
                bases_off_start = max(0, start - read.pos + 1)
                seq = seq.subseq(
                    bases_off_start + read.query_alignment_start, len(seq))

            if read.is_reverse:
                seq.revcomp()

            if len(seq) >= min_length:
                print(seq, file=fout)
    finally:
        # Release the BAM handle even if conversion or writing fails.
        sam_reader.close()
def _get_all_unmapped_reads(self, fout):
    '''Writes all unmapped reads to fout.

    Every record in the BAM is scanned (fetch with until_eof=True) and
    only those flagged as unmapped are written. Qualities are always
    dropped (ignore_quality=True).

    fout: writable text file object that receives the reads.
    '''
    sam_reader = pysam.Samfile(self.bam, "rb")
    try:
        for read in sam_reader.fetch(until_eof=True):
            if not read.is_unmapped:
                continue
            print(mapping.aligned_read_to_read(read, ignore_quality=True),
                  file=fout)
    finally:
        # Release the BAM handle even if conversion or writing fails.
        sam_reader.close()
def test_aligned_read_to_read(self):
    '''Check aligned_read_to_read against the two records in the test BAM'''
    reader = pysam.Samfile(
        os.path.join(data_dir, 'mapping_test_aligned_read_to_read.bam'), "rb")
    records = list(reader.fetch())
    aln_a, aln_b = records

    fastq_a = pyfastaq.sequences.Fastq(
        'read1',
        'TGTGTAACACTCCACCTCTGGTTCCCAGAGTTCGGTATCCGGCCGATACTTGAGGATAGC',
        'IIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIGHFEDCBA')
    fasta_a = pyfastaq.sequences.Fasta(
        'read1',
        'TGTGTAACACTCCACCTCTGGTTCCCAGAGTTCGGTATCCGGCCGATACTTGAGGATAGC')
    # (expected record, keyword options) for the first alignment.
    for wanted, options in (
            (fastq_a, {}),
            (fastq_a, {'revcomp': False}),
            (fasta_a, {'ignore_quality': True})):
        self.assertEqual(
            wanted, mapping.aligned_read_to_read(aln_a, **options))

    fastq_b = pyfastaq.sequences.Fastq(
        'read2',
        'GATCGTCACGAAAGAACCAAGCCGGATCGTGGGAGGGGTACAACTCAGGTGAATTAACGT',
        'HHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHGFEDC')
    fastq_b_rc = copy.copy(fastq_b)
    fastq_b_rc.revcomp()
    # (expected record, keyword options) for the second alignment.
    for wanted, options in (
            (fastq_b, {}),
            (fastq_b_rc, {'revcomp': False})):
        self.assertEqual(
            wanted, mapping.aligned_read_to_read(aln_b, **options))
def _exclude_region(self, contig, start, end, fout):
    '''Writes reads not mapping to the given region of contig, start and end as per python convention.

    Reads fetched from contig are written only when their aligned interval
    does not intersect [start, end). Qualities are always dropped
    (ignore_quality=True).

    contig: name of the reference sequence whose reads are fetched.
    start: 0-based first position of the excluded region.
    end: position one past the last excluded position.
    fout: writable text file object that receives the reads.
    '''
    # Both intervals are built with inclusive ends, hence the "- 1".
    exclude_interval = pyfastaq.intervals.Interval(start, end - 1)
    sam_reader = pysam.Samfile(self.bam, "rb")
    try:
        for read in sam_reader.fetch(contig):
            read_interval = pyfastaq.intervals.Interval(
                read.pos, read.reference_end - 1)
            if read_interval.intersects(exclude_interval):
                continue
            print(mapping.aligned_read_to_read(read, ignore_quality=True),
                  file=fout)
    finally:
        # Release the BAM handle even if conversion or writing fails.
        sam_reader.close()
def _exclude_region(self, contig, start, end, fout):
    '''Writes reads not mapping to the given region of contig, start and end as per python convention.

    A read from contig is kept when the interval it covers on the
    reference does not overlap the half-open region [start, end). Kept
    reads are written to fout with qualities dropped (ignore_quality=True).

    contig: name of the reference sequence whose reads are fetched.
    start: 0-based first position of the region to exclude.
    end: position one past the end of the region to exclude.
    fout: writable text file object that receives the reads.
    '''
    sam_reader = pysam.Samfile(self.bam, "rb")
    try:
        # Inclusive-end interval for the half-open [start, end) region.
        exclude_interval = pyfastaq.intervals.Interval(start, end - 1)
        for read in sam_reader.fetch(contig):
            # reference_end - 1 is the inclusive end of the read's interval.
            read_interval = pyfastaq.intervals.Interval(
                read.pos, read.reference_end - 1)
            if not read_interval.intersects(exclude_interval):
                kept = mapping.aligned_read_to_read(read, ignore_quality=True)
                print(kept, file=fout)
    finally:
        # Release the BAM handle even if conversion or writing fails.
        sam_reader.close()
def _break_reads(self, contig, position, fout, min_read_length=250):
    '''Get all reads from contig, but break them all at given position (0-based) in the reference. Writes to fout.

    Only reads whose aligned interval strictly contains position are
    broken, into a ".left" and a ".right" piece; a piece is written only
    if it passes its length check. All other reads from the contig are
    written unchanged. Currently approximate where it breaks (ignores
    indels in the alignment). Qualities are always dropped
    (ignore_quality=True).

    contig: name of the reference sequence whose reads are fetched.
    position: 0-based reference position at which reads are broken.
    fout: writable text file object that receives the reads.
    min_read_length: length threshold used when deciding whether a piece
        of a broken read is written.
    '''
    # Shared conversion options. revcomp=False because reverse-strand
    # reads are reverse-complemented after splitting, just before writing.
    convert_opts = {'revcomp': False, 'ignore_quality': True}

    sam_reader = pysam.Samfile(self.bam, "rb")
    try:
        for read in sam_reader.fetch(contig):
            seqs = []
            spans_break = read.pos < position < read.reference_end - 1
            if spans_break:
                split_point = position - read.pos
                if split_point - 1 >= min_read_length:
                    sequence = mapping.aligned_read_to_read(
                        read, **convert_opts).subseq(0, split_point)
                    sequence.id += '.left'
                    seqs.append(sequence)
                if read.query_length - split_point >= min_read_length:
                    sequence = mapping.aligned_read_to_read(
                        read, **convert_opts).subseq(
                            split_point, read.query_length)
                    sequence.id += '.right'
                    seqs.append(sequence)
            else:
                # Read does not span the break position: keep it intact.
                seqs.append(
                    mapping.aligned_read_to_read(read, **convert_opts))

            for seq in seqs:
                if read.is_reverse:
                    seq.revcomp()
                print(seq, file=fout)
    finally:
        # Release the BAM handle even if conversion or writing fails.
        sam_reader.close()
def _get_region(self, contig, start, end, fout, min_length=250):
    '''Writes reads mapping to given region of contig, trimming part of read not in the region.

    When start is 0 the tail of each read that runs past end is cut off;
    otherwise the head of each read that lies before start is cut off.
    The cut is computed from the reference overhang, so indels inside the
    alignment are not accounted for. Qualities are always dropped
    (ignore_quality=True).

    contig: name of the reference sequence to fetch reads from.
    start, end: region passed to the BAM fetch call.
    fout: writable text file object that receives the reads.
    min_length: a trimmed read is written only if it is at least this long.
    '''
    # A region starting at 0 means only the read ends may need trimming.
    trimming_end = (start == 0)
    sam_reader = pysam.Samfile(self.bam, "rb")
    try:
        for read in sam_reader.fetch(contig, start, end):
            seq = mapping.aligned_read_to_read(
                read, ignore_quality=True, revcomp=False)

            # Only the bases are sliced; presumably fine because the
            # record carries no qualities here -- TODO confirm.
            if trimming_end:
                bases_off_end = max(0, read.reference_end - 1 - end)
                seq.seq = seq.seq[:read.query_alignment_end - bases_off_end]
            else:
                bases_off_start = max(0, start - read.pos + 1)
                seq.seq = seq.seq[bases_off_start + read.query_alignment_start:]

            if read.is_reverse:
                seq.revcomp()

            if len(seq) >= min_length:
                print(seq, file=fout)
    finally:
        # Release the BAM handle even if conversion or writing fails.
        sam_reader.close()
def _all_reads_from_contig(self, contig, fout):
    '''Gets all reads from contig called "contig" and writes to fout.

    contig: name of the reference sequence whose reads are fetched.
    fout: writable text file object that receives the reads.

    Qualities are always dropped (ignore_quality=True).
    '''
    sam_reader = pysam.Samfile(self.bam, "rb")
    try:
        for read in sam_reader.fetch(contig):
            print(mapping.aligned_read_to_read(read, ignore_quality=True),
                  file=fout)
    finally:
        # Release the BAM handle even if conversion or writing fails.
        sam_reader.close()