def __init__(
    self,
    poly_a_clipping=False,
    min_read_length=20,
    paired_end=False,
    min_phred_score=None,
    adapter=None,
    reverse_complement=False,
):
    """Store the read-processing options on the instance.

    Every argument is kept unchanged under the private attribute of the
    same name; no validation is done here.  A ``PolyAClipper`` is also
    created and kept as ``self._poly_a_clipper``.
    """
    # Helper object that performs the poly-A clipping.
    self._poly_a_clipper = PolyAClipper()

    # On/off switches.
    self._poly_a_clipping = poly_a_clipping
    self._paired_end = paired_end
    self._reverse_complement = reverse_complement

    # Thresholds and the adapter sequence (None is the default for the
    # phred score and the adapter).
    self._min_read_length = min_read_length
    self._min_phred_score = min_phred_score
    self._adapter = adapter
def __init__(
    self,
    poly_a_clipping=False,
    min_read_length=12,
    paired_end=False,
    fastq=False,
    min_phred_score=None,
    adapter=None,
    reverse_complement=False,
):
    """Store the read-processing options on the instance.

    Every argument is kept unchanged under the private attribute of the
    same name; no validation is done here.  A ``PolyAClipper`` is also
    created and kept as ``self._poly_a_clipper``.
    """
    # Helper object that performs the poly-A clipping.
    self._poly_a_clipper = PolyAClipper()

    # On/off switches.
    self._poly_a_clipping = poly_a_clipping
    self._paired_end = paired_end
    self._fastq = fastq
    self._reverse_complement = reverse_complement

    # Thresholds and the adapter sequence (None is the default for the
    # phred score and the adapter).
    self._min_read_length = min_read_length
    self._min_phred_score = min_phred_score
    self._adapter = adapter
def test_poly_a_clipper():
    """Check the three clipping helpers of ``PolyAClipper``.

    The checks run in this order: ``clip_poly_a_stretch``,
    ``_aaaa_starting_substrings`` and ``remove_3_prime_a``.
    """
    clipper = PolyAClipper()

    # Sequences that clip_poly_a_stretch is expected to return unchanged.
    no_stretch_seq = (
        "ATAGTAGGAGATTTAGACCAGATGACGATGACACAC" "AATGGTTAGGTACAGATAG")
    short_stretch_seq = (
        "ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAA" "TTTAGACGACG")

    # (input, expected output) pairs for clip_poly_a_stretch.
    stretch_cases = [
        # No poly-A stretch at all: nothing changes.
        (no_stretch_seq, no_stretch_seq),
        # Empty input stays empty.
        ("", ""),
        # A long enough A stretch: it and everything after it is removed.
        ("ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAAATTTAGACGACG",
         "ATAGTAGGAGATTTAGACCAGATGACGATGACAC"),
        # An A stretch that is too short: nothing changes.
        (short_stretch_seq, short_stretch_seq),
        # A sequence made of As only is clipped down to nothing.
        ("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", ""),
    ]
    for seq, expected in stretch_cases:
        assert clipper.clip_poly_a_stretch(seq) == expected

    # Substrings of length 11 starting with 'AAAA', with their offsets.
    aaaa_seq = "TTTAAAATTTTTTTTAAAACCCCCCCCCCAAAAC"
    found_substrings = list(clipper._aaaa_starting_substrings(aaaa_seq, 11))
    assert found_substrings == [['AAAATTTTTTT', 3], ['AAAACCCCCCC', 15]]

    # (input, expected output) pairs for remove_3_prime_a.
    three_prime_cases = [
        # No terminal A: nothing changes.
        ("AAAAATTTTCCGCCCGGGAAATTTT", "AAAAATTTTCCGCCCGGGAAATTTT"),
        # One terminal A is removed.
        ("AAAAATTTTCCGCCCGGGAAATTTTA", "AAAAATTTTCCGCCCGGGAAATTTT"),
        # Several terminal As are removed.
        ("AAAAATTTTCCGCCCGGGAAATTTTAAAAAA", "AAAAATTTTCCGCCCGGGAAATTTT"),
    ]
    for seq, expected in three_prime_cases:
        assert clipper.remove_3_prime_a(seq) == expected
def setUp(self):
    """Create the ``PolyAClipper`` stored as ``self.poly_a_clipper``."""
    clipper = PolyAClipper()
    self.poly_a_clipper = clipper
class TestPolyAClipper(unittest.TestCase):
    """Unit tests for the clipping helpers of ``PolyAClipper``."""

    def setUp(self):
        """Create the clipper instance used by the test methods."""
        self.poly_a_clipper = PolyAClipper()

    def _check_stretch_clipping(self, seq, expected):
        """Assert that ``clip_poly_a_strech(seq)`` equals ``expected``."""
        clipped = self.poly_a_clipper.clip_poly_a_strech(seq)
        self.assertEqual(clipped, expected)

    def _check_3_prime_removal(self, seq, expected):
        """Assert that ``remove_3_prime_a(seq)`` equals ``expected``."""
        trimmed = self.poly_a_clipper.remove_3_prime_a(seq)
        self.assertEqual(trimmed, expected)

    def test_clip_poly_a_strech_no_change(self):
        """A sequence without a poly-A stretch is returned unchanged."""
        seq = "ATAGTAGGAGATTTAGACCAGATGACGATGACACACAATGGTTAGGTACAGATAG"
        self._check_stretch_clipping(seq, seq)

    def test_clip_poly_a_strech_empt(self):
        """An empty sequence is returned unchanged."""
        self._check_stretch_clipping("", "")

    def test_clip_poly_a_strech_terminal_10_a(self):
        """A sufficiently long terminal A stretch is clipped off."""
        self._check_stretch_clipping(
            "ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAAAA",
            "ATAGTAGGAGATTTAGACCAGATGACGATGACAC",
        )

    def test_clip_poly_a_strech_internal_10_a(self):
        """A long internal A stretch and everything after it is clipped."""
        self._check_stretch_clipping(
            "ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAAATTTAGACGACG",
            "ATAGTAGGAGATTTAGACCAGATGACGATGACAC",
        )

    def test_clip_poly_a_strech_terminal_09_a(self):
        """A terminal A stretch that is too short is left in place."""
        seq = "ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAAA"
        self._check_stretch_clipping(seq, seq)

    def test_clip_poly_a_strecht_internal_(self):
        """An internal A stretch that is too short is left in place."""
        seq = "ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAATTTAGACGACG"
        self._check_stretch_clipping(seq, seq)

    def test_clip_poly_a_strecht_internal_09_a(self):
        """A sequence consisting only of As is clipped to the empty string."""
        self._check_stretch_clipping(
            "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", ""
        )

    def test_aaaa_starting_substrings(self):
        """Substrings of length 11 starting with AAAA come with offsets."""
        seq = "TTTAAAATTTTTTTTAAAACCCCCCCCCCAAAAC"
        found = list(self.poly_a_clipper._aaaa_starting_substrings(seq, 11))
        expected = [["AAAATTTTTTT", 3], ["AAAACCCCCCC", 15]]
        self.assertEqual(found, expected)

    def test_remove_3_prime_a_no_change(self):
        """Without terminal As nothing is removed."""
        seq = "AAAAATTTTCCGCCCGGGAAATTTT"
        self._check_3_prime_removal(seq, seq)

    def test_remove_3_prime_a_one_a(self):
        """A single terminal A is removed."""
        self._check_3_prime_removal(
            "AAAAATTTTCCGCCCGGGAAATTTTA", "AAAAATTTTCCGCCCGGGAAATTTT"
        )

    def test_remove_3_prime_a_multiple_as(self):
        """A terminal run of several As is removed."""
        self._check_3_prime_removal(
            "AAAAATTTTCCGCCCGGGAAATTTTAAAAAA", "AAAAATTTTCCGCCCGGGAAATTTT"
        )
class ReadProcessor(object):
    """Clean up sequencing reads and write the survivors as gzipped FASTA.

    Depending on the options, reads are quality-trimmed (FASTQ input
    only), reverse-complemented, cut at the first adapter occurrence and,
    for single-end input, poly-A clipped.  Reads that are shorter than
    ``min_read_length`` after processing are dropped.  Counters describing
    a run are collected in ``self._stats`` and returned by the public
    ``process_*`` methods.
    """

    def __init__(self, poly_a_clipping=False, min_read_length=12,
                 paired_end=False, fastq=False, min_phred_score=None,
                 adapter=None, reverse_complement=False):
        """Store the processing options.

        Args:
            poly_a_clipping: If true, single-end reads are poly-A clipped.
            min_read_length: Reads shorter than this after processing are
                discarded.
            paired_end: Stored only; not read by any method of this class.
            fastq: If true the input is parsed as FASTQ, otherwise FASTA.
            min_phred_score: Quality threshold for trimming FASTQ reads;
                ``None`` disables quality trimming.
            adapter: Adapter sequence to cut at; ``None`` disables adapter
                clipping.
            reverse_complement: If true, reads are reverse-complemented.
        """
        self._poly_a_clipping = poly_a_clipping
        self._min_read_length = min_read_length
        self._paired_end = paired_end
        self._fastq = fastq
        self._min_phred_score = min_phred_score
        self._adapter = adapter
        self._poly_a_clipper = PolyAClipper()
        self._reverse_complement = reverse_complement

    def process_single_end(self, input_path, output_path):
        """Process one read file and return the statistics dictionary.

        The output is always written gzip-compressed.  Both file handles
        are closed when processing ends, also on error.
        """
        self._init_stat_dict()
        with gzip.open(output_path, "wb") as output_fh, \
                self._input_fh(input_path) as input_fh:
            self._process_single_end(input_fh, output_fh)
        return self._stats

    def process_paired_end(self, input_path_pair, output_path_pair):
        """Process a pair of mate files and return the statistics.

        ``input_path_pair`` and ``output_path_pair`` are indexable pairs
        (mate 1, mate 2).  Outputs are written gzip-compressed and all
        four file handles are closed when processing ends.
        """
        self._init_stat_dict()
        with gzip.open(output_path_pair[0], "wb") as output_p1_fh, \
                gzip.open(output_path_pair[1], "wb") as output_p2_fh, \
                self._input_fh(input_path_pair[0]) as input_p1_fh, \
                self._input_fh(input_path_pair[1]) as input_p2_fh:
            self._process_paired_end(
                input_p1_fh, input_p2_fh, output_p1_fh, output_p2_fh)
        return self._stats

    def _init_stat_dict(self):
        """Reset ``self._stats`` with all counters at zero."""
        self._stats = defaultdict(int)
        # Create the keys explicitly so they are reported even when zero.
        for counter in ("total_no_of_reads", "polya_removed",
                        "single_a_removed", "unmodified", "too_short",
                        "long_enough"):
            self._stats[counter] = 0
        # Histograms: read length -> number of reads with that length.
        self._stats[
            "read_length_before_processing_and_freq"] = defaultdict(int)
        self._stats["read_length_after_processing_and_freq"] = defaultdict(int)

    def _input_fh(self, input_path):
        """Return a text-mode file handle for ``input_path``.

        Files ending in ``.gz`` or ``.bz2`` are decompressed on the fly;
        anything else is opened as plain text.
        """
        if input_path.endswith(".gz"):
            return gzip.open(input_path, "rt")
        elif input_path.endswith(".bz2"):
            return bz2.open(input_path, "rt")
        return open(input_path)

    def _trim_by_quality(self, seq, qualities):
        """Return the part of ``seq`` before the first low-quality base.

        A base is low-quality when its entry in ``qualities`` is below
        ``self._min_phred_score``.
        """
        good_nucl = []
        for nucl, qual in zip(seq, qualities):
            if qual < self._min_phred_score:
                break
            good_nucl.append(nucl)
        return "".join(good_nucl)

    def _clip_adapter(self, seq):
        """Cut ``seq`` at the first occurrence of the adapter, if any."""
        adapter_start_pos = seq.find(self._adapter)
        if adapter_start_pos == -1:
            return seq
        return seq[:adapter_start_pos]

    def _trim_and_clip_adapter(self, seq, qualities):
        """Apply quality trimming, reverse complement and adapter clipping.

        Each step only runs when the corresponding option is set.
        """
        if self._fastq and self._min_phred_score is not None:
            seq = self._trim_by_quality(seq, qualities)
        if self._reverse_complement:
            seq = str(Seq(seq).reverse_complement())
        if self._adapter is not None:
            seq = self._clip_adapter(seq)
        return seq

    def _process_single_end(self, input_fh, output_fh):
        """Process every read of ``input_fh`` and write survivors as FASTA."""
        stats = self._stats
        for header, seq, qualities in self._parse_sequences(input_fh):
            raw_seq_len = len(seq)
            stats["total_no_of_reads"] += 1
            seq = self._trim_and_clip_adapter(seq, qualities)
            if self._poly_a_clipping:
                seq = self._poly_a_clipper.clip_poly_a_strech(seq)
                seq = self._poly_a_clipper.remove_3_prime_a(seq)
            clipped_seq_len = len(seq)
            # The classification is based on the total length change only,
            # so shortening by any processing step is counted here.
            if clipped_seq_len == raw_seq_len - 1:
                stats["single_a_removed"] += 1
            elif clipped_seq_len < raw_seq_len - 1:
                stats["polya_removed"] += 1
            else:
                stats["unmodified"] += 1
            if clipped_seq_len < self._min_read_length:
                stats["too_short"] += 1
                continue
            stats["long_enough"] += 1
            stats["read_length_before_processing_and_freq"][raw_seq_len] += 1
            stats["read_length_after_processing_and_freq"][
                clipped_seq_len] += 1
            # Encoding to bytes is necessary due to saving via gzip
            output_fh.write((">%s\n%s\n" % (header, seq)).encode())

    def _parse_sequences(self, input_fh):
        """Yield ``(description, sequence, qualities)`` for every record.

        ``qualities`` is the phred quality list for FASTQ input and
        ``None`` for FASTA input.
        """
        if self._fastq:
            for seq_record in SeqIO.parse(input_fh, "fastq"):
                yield (seq_record.description, str(seq_record.seq),
                       seq_record.letter_annotations["phred_quality"])
        else:
            for seq_record in SeqIO.parse(input_fh, "fasta"):
                yield (seq_record.description, str(seq_record.seq), None)

    def _process_paired_end(
            self, input_p1_fh, input_p2_fh, output_p1_fh, output_p2_fh):
        """Process mate pairs in lockstep and write surviving pairs.

        A pair is dropped when either mate is shorter than
        ``self._min_read_length`` after processing.  No poly-A clipping is
        done for paired-end reads.
        """
        stats = self._stats
        # zip() stops at the shorter input, so surplus reads in the longer
        # file are ignored.
        read_pairs = zip(self._parse_sequences(input_p1_fh),
                         self._parse_sequences(input_p2_fh))
        for entry_p1, entry_p2 in read_pairs:
            header_p1, seq_p1, qualities_p1 = entry_p1
            # Each mate is trimmed with its own quality values.
            header_p2, seq_p2, qualities_p2 = entry_p2
            raw_seq_p1_len = len(seq_p1)
            raw_seq_p2_len = len(seq_p2)
            stats["total_no_of_reads"] += 1
            # NOTE(review): pairs are always counted as unmodified, even
            # when trimming shortened them - confirm this is intended.
            stats["unmodified"] += 1
            seq_p1 = self._trim_and_clip_adapter(seq_p1, qualities_p1)
            seq_p2 = self._trim_and_clip_adapter(seq_p2, qualities_p2)
            # Filter on the processed lengths so that reads trimmed below
            # the minimum are not written out.
            processed_p1_len = len(seq_p1)
            processed_p2_len = len(seq_p2)
            if (processed_p1_len < self._min_read_length
                    or processed_p2_len < self._min_read_length):
                stats["too_short"] += 1
                continue
            stats["long_enough"] += 1
            before = stats["read_length_before_processing_and_freq"]
            after = stats["read_length_after_processing_and_freq"]
            before[raw_seq_p1_len] += 1
            after[processed_p1_len] += 1
            before[raw_seq_p2_len] += 1
            after[processed_p2_len] += 1
            # Encoding to bytes is necessary due to saving via gzip
            output_p1_fh.write((">%s\n%s\n" % (header_p1, seq_p1)).encode())
            output_p2_fh.write((">%s\n%s\n" % (header_p2, seq_p2)).encode())
class ReadProcessor(object):
    """Clean up sequencing reads and write the survivors as gzipped FASTA.

    The input format (FASTA or FASTQ) is detected from the first line of
    each input file.  Depending on the options, reads are quality-trimmed
    (FASTQ input only), reverse-complemented, cut at the first adapter
    occurrence and, for single-end input, poly-A clipped.  Reads that are
    shorter than ``min_read_length`` after processing are dropped.
    Counters describing a run are collected in ``self._stats`` and
    returned by the public ``process_*`` methods.
    """

    def __init__(self, poly_a_clipping=False, min_read_length=20,
                 paired_end=False, min_phred_score=None, adapter=None,
                 reverse_complement=False):
        """Store the processing options.

        Args:
            poly_a_clipping: If true, single-end reads are poly-A clipped.
            min_read_length: Reads shorter than this after processing are
                discarded.
            paired_end: Stored only; not read by any method of this class.
            min_phred_score: Quality threshold for trimming FASTQ reads;
                ``None`` disables quality trimming.
            adapter: Adapter sequence to cut at; ``None`` disables adapter
                clipping.
            reverse_complement: If true, reads are reverse-complemented.
        """
        self._poly_a_clipping = poly_a_clipping
        self._min_read_length = min_read_length
        self._paired_end = paired_end
        self._min_phred_score = min_phred_score
        self._adapter = adapter
        self._poly_a_clipper = PolyAClipper()
        self._reverse_complement = reverse_complement
        # Overwritten by _input_fh() once an input file has been probed;
        # defined here so the attribute always exists.
        self._fastq = False

    def process_single_end(self, input_path, output_path):
        """Process one read file and return the statistics dictionary.

        The output is always written gzip-compressed.  Both file handles
        are closed when processing ends, also on error.
        """
        self._init_stat_dict()
        with gzip.open(output_path, "wb") as output_fh, \
                self._input_fh(input_path) as input_fh:
            self._process_single_end(input_fh, output_fh)
        return self._stats

    def process_paired_end(self, input_path_pair, output_path_pair):
        """Process a pair of mate files and return the statistics.

        ``input_path_pair`` and ``output_path_pair`` are indexable pairs
        (mate 1, mate 2).  Outputs are written gzip-compressed and all
        four file handles are closed when processing ends.
        """
        self._init_stat_dict()
        with gzip.open(output_path_pair[0], "wb") as output_p1_fh, \
                gzip.open(output_path_pair[1], "wb") as output_p2_fh, \
                self._input_fh(input_path_pair[0]) as input_p1_fh, \
                self._input_fh(input_path_pair[1]) as input_p2_fh:
            self._process_paired_end(
                input_p1_fh, input_p2_fh, output_p1_fh, output_p2_fh)
        return self._stats

    def _init_stat_dict(self):
        """Reset ``self._stats`` with all counters at zero."""
        self._stats = defaultdict(int)
        # Create the keys explicitly so they are reported even when zero.
        for counter in ("total_no_of_reads", "polya_removed",
                        "single_a_removed", "unmodified", "too_short",
                        "long_enough"):
            self._stats[counter] = 0
        # Histograms: read length -> number of reads with that length.
        self._stats[
            "read_length_before_processing_and_freq"] = defaultdict(int)
        self._stats["read_length_after_processing_and_freq"] = defaultdict(int)

    def _input_fh(self, input_path):
        """Return a text-mode file handle and record the input format.

        Files ending in ``.gz``, ``.bz2`` or ``.xz`` are decompressed on
        the fly; anything else is opened as plain text.  The first line
        is inspected through the same opener: if it starts with ``@`` the
        file is treated as FASTQ (``self._fastq = True``), otherwise as
        FASTA.  An empty file is treated as FASTA.
        """
        if input_path.endswith(".gz"):
            opener = gzip.open
        elif input_path.endswith(".bz2"):
            opener = bz2.open
        elif input_path.endswith(".xz"):
            opener = lzma.open
        else:
            opener = open
        # Probe with a separate, short-lived handle so the handle given
        # to the caller starts at the beginning of the file.
        with opener(input_path, "rt") as probe_fh:
            first_line = probe_fh.readline()
        # startswith() also copes with an empty first line.
        self._fastq = first_line.startswith("@")
        return opener(input_path, "rt")

    def _trim_by_quality(self, seq, qualities):
        """Return the part of ``seq`` before the first low-quality base.

        A base is low-quality when its entry in ``qualities`` is below
        ``self._min_phred_score``.
        """
        good_nucl = []
        for nucl, qual in zip(seq, qualities):
            if qual < self._min_phred_score:
                break
            good_nucl.append(nucl)
        return "".join(good_nucl)

    def _clip_adapter(self, seq):
        """Cut ``seq`` at the first occurrence of the adapter, if any."""
        adapter_start_pos = seq.find(self._adapter)
        if adapter_start_pos == -1:
            return seq
        return seq[:adapter_start_pos]

    def _trim_and_clip_adapter(self, seq, qualities):
        """Apply quality trimming, reverse complement and adapter clipping.

        Each step only runs when the corresponding option is set.
        """
        if self._fastq and self._min_phred_score is not None:
            seq = self._trim_by_quality(seq, qualities)
        if self._reverse_complement:
            seq = str(Seq(seq).reverse_complement())
        if self._adapter is not None:
            seq = self._clip_adapter(seq)
        return seq

    def _process_single_end(self, input_fh, output_fh):
        """Process every read of ``input_fh`` and write survivors as FASTA."""
        stats = self._stats
        for header, seq, qualities in self._parse_sequences(input_fh):
            raw_seq_len = len(seq)
            stats["total_no_of_reads"] += 1
            seq = self._trim_and_clip_adapter(seq, qualities)
            if self._poly_a_clipping:
                seq = self._poly_a_clipper.clip_poly_a_stretch(seq)
                seq = self._poly_a_clipper.remove_3_prime_a(seq)
            clipped_seq_len = len(seq)
            # The classification is based on the total length change only,
            # so shortening by any processing step is counted here.
            if clipped_seq_len == raw_seq_len - 1:
                stats["single_a_removed"] += 1
            elif clipped_seq_len < raw_seq_len - 1:
                stats["polya_removed"] += 1
            else:
                stats["unmodified"] += 1
            if clipped_seq_len < self._min_read_length:
                stats["too_short"] += 1
                continue
            stats["long_enough"] += 1
            stats["read_length_before_processing_and_freq"][raw_seq_len] += 1
            stats["read_length_after_processing_and_freq"][
                clipped_seq_len] += 1
            # Encoding to bytes is necessary due to saving via gzip
            output_fh.write((">%s\n%s\n" % (header, seq)).encode())

    def _parse_sequences(self, input_fh):
        """Yield ``(description, sequence, qualities)`` for every record.

        ``qualities`` is the phred quality list for FASTQ input and
        ``None`` for FASTA input.
        """
        if self._fastq:
            for seq_record in SeqIO.parse(input_fh, "fastq"):
                yield (seq_record.description, str(seq_record.seq),
                       seq_record.letter_annotations["phred_quality"])
        else:
            for seq_record in SeqIO.parse(input_fh, "fasta"):
                yield (seq_record.description, str(seq_record.seq), None)

    def _process_paired_end(
            self, input_p1_fh, input_p2_fh, output_p1_fh, output_p2_fh):
        """Process mate pairs in lockstep and write surviving pairs.

        A pair is dropped when either mate is shorter than
        ``self._min_read_length`` after processing.  No poly-A clipping is
        done for paired-end reads.
        """
        stats = self._stats
        # zip() stops at the shorter input, so surplus reads in the longer
        # file are ignored.
        read_pairs = zip(self._parse_sequences(input_p1_fh),
                         self._parse_sequences(input_p2_fh))
        for entry_p1, entry_p2 in read_pairs:
            header_p1, seq_p1, qualities_p1 = entry_p1
            # Each mate is trimmed with its own quality values.
            header_p2, seq_p2, qualities_p2 = entry_p2
            raw_seq_p1_len = len(seq_p1)
            raw_seq_p2_len = len(seq_p2)
            stats["total_no_of_reads"] += 1
            # NOTE(review): pairs are always counted as unmodified, even
            # when trimming shortened them - confirm this is intended.
            stats["unmodified"] += 1
            seq_p1 = self._trim_and_clip_adapter(seq_p1, qualities_p1)
            seq_p2 = self._trim_and_clip_adapter(seq_p2, qualities_p2)
            # Filter on the processed lengths so that reads trimmed below
            # the minimum are not written out.
            processed_p1_len = len(seq_p1)
            processed_p2_len = len(seq_p2)
            if (processed_p1_len < self._min_read_length
                    or processed_p2_len < self._min_read_length):
                stats["too_short"] += 1
                continue
            stats["long_enough"] += 1
            before = stats["read_length_before_processing_and_freq"]
            after = stats["read_length_after_processing_and_freq"]
            before[raw_seq_p1_len] += 1
            after[processed_p1_len] += 1
            before[raw_seq_p2_len] += 1
            after[processed_p2_len] += 1
            # Encoding to bytes is necessary due to saving via gzip
            output_p1_fh.write((">%s\n%s\n" % (header_p1, seq_p1)).encode())
            output_p2_fh.write((">%s\n%s\n" % (header_p2, seq_p2)).encode())
class TestPolyAClipper(unittest.TestCase):
    """Unit tests for the clipping helpers of ``PolyAClipper``."""

    def setUp(self):
        """Create the clipper instance used by the test methods."""
        self.poly_a_clipper = PolyAClipper()

    def _expect_stretch_result(self, seq, expected):
        """Assert that ``clip_poly_a_strech(seq)`` equals ``expected``."""
        observed = self.poly_a_clipper.clip_poly_a_strech(seq)
        self.assertEqual(observed, expected)

    def _expect_3_prime_result(self, seq, expected):
        """Assert that ``remove_3_prime_a(seq)`` equals ``expected``."""
        observed = self.poly_a_clipper.remove_3_prime_a(seq)
        self.assertEqual(observed, expected)

    def test_clip_poly_a_strech_no_change(self):
        """A sequence without a poly-A stretch is returned unchanged."""
        unchanged = "ATAGTAGGAGATTTAGACCAGATGACGATGACACACAATGGTTAGGTACAGATAG"
        self._expect_stretch_result(unchanged, unchanged)

    def test_clip_poly_a_strech_empt(self):
        """An empty sequence is returned unchanged."""
        self._expect_stretch_result("", "")

    def test_clip_poly_a_strech_terminal_10_a(self):
        """A sufficiently long terminal A stretch is clipped off."""
        self._expect_stretch_result(
            "ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAAAA",
            "ATAGTAGGAGATTTAGACCAGATGACGATGACAC")

    def test_clip_poly_a_strech_internal_10_a(self):
        """A long internal A stretch and everything after it is clipped."""
        self._expect_stretch_result(
            "ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAAATTTAGACGACG",
            "ATAGTAGGAGATTTAGACCAGATGACGATGACAC")

    def test_clip_poly_a_strech_terminal_09_a(self):
        """A terminal A stretch that is too short is left in place."""
        unchanged = "ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAAA"
        self._expect_stretch_result(unchanged, unchanged)

    def test_clip_poly_a_strecht_internal_(self):
        """An internal A stretch that is too short is left in place."""
        unchanged = "ATAGTAGGAGATTTAGACCAGATGACGATGACACAAAAAAAAATTTAGACGACG"
        self._expect_stretch_result(unchanged, unchanged)

    def test_clip_poly_a_strecht_internal_09_a(self):
        """A sequence consisting only of As is clipped to the empty string."""
        self._expect_stretch_result("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", "")

    def test_aaaa_starting_substrings(self):
        """Substrings of length 11 starting with AAAA come with offsets."""
        source_seq = "TTTAAAATTTTTTTTAAAACCCCCCCCCCAAAAC"
        substrings = self.poly_a_clipper._aaaa_starting_substrings(
            source_seq, 11)
        self.assertEqual(
            list(substrings),
            [['AAAATTTTTTT', 3], ['AAAACCCCCCC', 15]])

    def test_remove_3_prime_a_no_change(self):
        """Without terminal As nothing is removed."""
        unchanged = "AAAAATTTTCCGCCCGGGAAATTTT"
        self._expect_3_prime_result(unchanged, unchanged)

    def test_remove_3_prime_a_one_a(self):
        """A single terminal A is removed."""
        self._expect_3_prime_result(
            "AAAAATTTTCCGCCCGGGAAATTTTA", "AAAAATTTTCCGCCCGGGAAATTTT")

    def test_remove_3_prime_a_multiple_as(self):
        """A terminal run of several As is removed."""
        self._expect_3_prime_result(
            "AAAAATTTTCCGCCCGGGAAATTTTAAAAAA", "AAAAATTTTCCGCCCGGGAAATTTT")