def group_pairs_by_name(seqs, all_pairs_same_n_seqs=False):
    '''It yields lists of consecutive reads that share the same pair name.

    The name of every read is obtained with _parse_pair_direction_and_name.
    A read for which that call raises PairDirectionError is treated as
    nameless and always ends up in a group of its own.

    seqs -- an iterable of reads.
    all_pairs_same_n_seqs -- if True, an InterleaveError is raised as soon as
        a group has a different number of reads than the first group.
    '''
    error_template = 'Pair had different number of reads: %s'
    paired_seqs = []
    prev_name = None
    n_seqs_per_pair = None
    for seq in seqs:
        try:
            name = _parse_pair_direction_and_name(seq)[0]
        except PairDirectionError:
            name = None

        # A group is closed only when there is something to close. Without
        # this guard a nameless first read would record a group size of 0
        # and every later group would be reported as a mismatch.
        if paired_seqs and (name is None or name != prev_name):
            if all_pairs_same_n_seqs:
                if n_seqs_per_pair is None:
                    n_seqs_per_pair = len(paired_seqs)
                elif n_seqs_per_pair != len(paired_seqs):
                    # prev_name can be None (nameless read), so it is
                    # formatted instead of concatenated.
                    raise InterleaveError(error_template % (prev_name,))
            yield paired_seqs
            paired_seqs = []
        paired_seqs.append(seq)
        prev_name = name

    # The reads left after the input is exhausted form the last group.
    if paired_seqs:
        if all_pairs_same_n_seqs and n_seqs_per_pair is not None:
            if n_seqs_per_pair != len(paired_seqs):
                raise InterleaveError(error_template % (prev_name,))
        yield paired_seqs
def _check_name_and_direction_match(seq1, seq2):
    '''It raises an InterleaveError unless the two reads form a valid pair.

    Both reads are parsed with _parse_pair_direction_and_name. The pair is
    rejected when the names differ or when both reads have the same direction.
    '''
    first_name, first_direction = _parse_pair_direction_and_name(seq1)
    second_name, second_direction = _parse_pair_direction_and_name(seq2)

    # The names are compared first, so a name mismatch is the error reported
    # when both conditions fail.
    if first_name != second_name:
        template = 'The reads from the two files do not match: {}, {}'
        raise InterleaveError(template.format(first_name, second_name))

    if first_direction == second_direction:
        first_read = first_name + ' ' + first_direction
        second_read = second_name + ' ' + second_direction
        template = 'Two paired reads have the same direction: {}, {}'
        raise InterleaveError(template.format(first_read, second_read))
def group_pairs(seqs, n_seqs_in_pair=None, check_all_same_n_seqs=True,
                check_name_matches=True):
    '''It yields the reads found in seqs grouped in pairs.

    seqs -- an iterable of reads.
    n_seqs_in_pair -- number of reads in every pair. If None, the first pair
        is obtained with _get_first_pair_by_name, it is yielded as returned
        by that function and its length is used for the remaining pairs.
    check_all_same_n_seqs -- if True, an InterleaveError is raised when a
        pair has a different number of reads than n_seqs_in_pair.
    check_name_matches -- if True, every pair built here is validated with
        _check_name_and_direction_match.

    Both checks are disabled when the pairs have only one read. The pairs
    built by this function are yielded as lists.
    '''
    seqs = iter(seqs)
    if n_seqs_in_pair is None:
        # presumably next_read is the read that follows the first pair, or
        # None when there is no such read -- verify against
        # _get_first_pair_by_name
        first_pair, next_read = _get_first_pair_by_name(seqs)
        if first_pair is not None:
            yield first_pair
            n_seqs_in_pair = len(first_pair)
        # None is never a real read (None items are dropped below), so it is
        # not pushed back: it would create an empty pair or a spurious
        # 'fewer reads' error.
        if next_read is not None:
            seqs = chain([next_read], seqs)

    if n_seqs_in_pair == 1:
        # No need to check anything, a pair cannot have less than one read
        # or more than one name
        check_all_same_n_seqs = False
        check_name_matches = False

    if n_seqs_in_pair:
        pairs = group_in_packets_fill_last(seqs, packet_size=n_seqs_in_pair)
        for pair in pairs:
            # None items (presumably the filling of the last packet) are
            # removed. A list is built so len() also works where filter()
            # is lazy.
            pair = [seq for seq in pair if seq is not None]
            if check_all_same_n_seqs and n_seqs_in_pair != len(pair):
                msg = 'The last pair has fewer reads'
                raise InterleaveError(msg)
            if check_name_matches:
                _check_name_and_direction_match(*pair)
            yield pair
def _check_name_and_direction_match(*seqs):
    '''It raises an InterleaveError unless the given reads form a valid pair.

    Every read is parsed with _parse_pair_direction_and_name. The reads are
    rejected when more than one distinct name is found or when the number of
    distinct directions differs from the number of reads.
    '''
    parsed_reads = [_parse_pair_direction_and_name(read) for read in seqs]
    read_names = set(name for name, _ in parsed_reads)
    read_directions = set(direction for _, direction in parsed_reads)

    if len(read_names) > 1:
        msg = 'The read names from a pair do not match: %s'
        raise InterleaveError(msg % ','.join(read_names))

    # Fewer distinct directions than reads means that a direction is repeated.
    if len(read_directions) != len(seqs):
        raise InterleaveError('A pair has repeated directions: ' +
                              first(read_names))
def interleave_pairs(seqs1, seqs2, skip_checks=False):
    '''A generator that alternates the reads taken from two iterables.

    Unless skip_checks is True, an InterleaveError is raised when one
    iterable runs out before the other, and every pair of reads is validated
    with _check_name_and_direction_match.
    With skip_checks the reads left in the longer iterable are still yielded.
    '''
    run_checks = not skip_checks
    for read_pair in izip_longest(seqs1, seqs2, fillvalue=None):
        forward, reverse = read_pair
        if run_checks:
            # None marks the side that has been exhausted
            if forward is None or reverse is None:
                raise InterleaveError(
                    'The files had a different number of sequences')
            _check_name_and_direction_match(forward, reverse)
        for read in read_pair:
            if read is not None:
                yield read
def deinterleave_pairs(seqs, out_fhand1, out_fhand2, out_format):
    '''It splits an iterable with alternating paired reads in two files.

    The reads are taken two at a time. The first one of every pair is written
    to out_fhand1 and the second one to out_fhand2 using write_seqs with the
    given out_format. Both file handles are flushed once all the reads have
    been written.

    An InterleaveError is raised if the number of reads is odd. Every pair is
    validated with _check_name_and_direction_match before being written.
    '''
    # iter() and next() accept any iterable, not only objects that have a
    # next method.
    seqs = iter(seqs)
    while True:
        seq1 = next(seqs, None)
        seq2 = next(seqs, None)
        if seq1 is None:
            break   # we have consumed the input iterator completely
        if seq2 is None:
            msg = 'The file had an odd number of sequences'
            raise InterleaveError(msg)
        _check_name_and_direction_match(seq1, seq2)
        write_seqs([seq1], out_fhand1, out_format)
        write_seqs([seq2], out_fhand2, out_format)
    out_fhand1.flush()
    out_fhand2.flush()