def match_pairs(reads, out_fhand, orphan_out_fhand, out_format, ordered=True,
                check_order_buffer_size=0, max_reads_memory=None,
                temp_dir=None):
    """Write matched read pairs to out_fhand and lone reads to orphan_out_fhand.

    The groups are produced by _get_paired_and_orphan; a group of two reads
    is a pair and a group of one read is an orphan.  When ``ordered`` is
    true the orphan names are used to detect unsorted input: the first
    ``check_order_buffer_size`` orphan names are registered in a KeyedSet and
    later orphans must not appear in it.

    Raises ItemsNotSortedError when the order check fails.
    """
    not_sorted_msg = "Reads are not ordered by pairs.Use unordered option"
    n_registered = 0
    seen_orphans = KeyedSet()
    groups = _get_paired_and_orphan(reads, ordered, max_reads_memory, temp_dir)
    for group in groups:
        group_size = len(group)
        if group_size == 2:
            write_seqs(group, out_fhand, out_format)
            continue
        if group_size != 1:
            # groups of any other size are silently ignored
            continue

        write_seqs(group, orphan_out_fhand, out_format)
        orphan = group[0]
        try:
            name = _parse_pair_direction_and_name(orphan)[0]
        except PairDirectionError:
            # the read name carries no pair direction, use it as it is
            name = get_name(orphan)

        if not ordered:
            continue
        if n_registered < check_order_buffer_size:
            n_registered += 1
            # presumably check_add returns a false value when the name was
            # already registered -- verify against KeyedSet
            is_sorted = seen_orphans.check_add(name)
        else:
            is_sorted = name not in seen_orphans
        if not is_sorted:
            raise ItemsNotSortedError(not_sorted_msg)

    flush_fhand(orphan_out_fhand)
    flush_fhand(out_fhand)
def match_pairs(reads, out_fhand, orphan_out_fhand, out_format, ordered=True,
                check_order_buffer_size=0, max_reads_memory=None,
                temp_dir=None):
    '''It splits the reads into matched pairs and orphans.

    Pairs (groups of two reads) go to out_fhand and orphans (groups of one
    read) go to orphan_out_fhand.  With ordered=True the names of the orphans
    are checked against a buffer of the first check_order_buffer_size orphan
    names and ItemsNotSortedError is raised if the check fails.
    '''
    def _orphan_name(read):
        'It returns the pair name of the read or its plain name'
        try:
            return _parse_pair_direction_and_name(read)[0]
        except PairDirectionError:
            return get_name(read)

    counts = 0
    name_buffer = KeyedSet()
    for item in _get_paired_and_orphan(reads, ordered, max_reads_memory,
                                       temp_dir):
        if len(item) == 1:
            write_seqs(item, orphan_out_fhand, out_format)
            name = _orphan_name(item[0])
            if ordered:
                if counts < check_order_buffer_size:
                    # still filling the buffer with orphan names
                    counts += 1
                    failed = not name_buffer.check_add(name)
                else:
                    # buffer complete, the name should not be found in it
                    failed = name in name_buffer
                if failed:
                    msg = 'Reads are not ordered by pairs.Use unordered option'
                    raise ItemsNotSortedError(msg)
        elif len(item) == 2:
            write_seqs(item, out_fhand, out_format)

    for fhand in (orphan_out_fhand, out_fhand):
        flush_fhand(fhand)
def filter_duplicates(in_fhands, out_fhand, paired_reads, n_seqs_packet=None,
                      tempdir=None):
    '''It writes to out_fhand the pairs read from in_fhands without duplicates.

    The pairs are sorted with _get_pair_key (using tempdir and keeping at
    most n_seqs_packet items in memory) and only one pair per key is
    written.

    Raises ValueError if no input file is given.
    '''
    if not in_fhands:
        raise ValueError('At least one input fhand is required')

    sorted_pairs = sorted_items(_read_pairs(in_fhands, paired_reads),
                                key=_get_pair_key, tempdir=tempdir,
                                max_items_in_memory=n_seqs_packet)
    unique_pairs = unique(sorted_pairs, key=_get_pair_key)
    for unique_pair in unique_pairs:
        write_seqs(unique_pair, out_fhand)
def deinterleave_pairs(seqs, out_fhand1, out_fhand2, out_format):
    """Split an interleaved sequence iterator into two output files.

    The reads are grouped two by two with group_pairs; the first read of
    every group is written to out_fhand1 and the second one to out_fhand2.
    Both file handles are flushed at the end.
    """
    grouped = group_pairs(seqs, n_seqs_in_pair=2)
    for reads in grouped:
        first_read = (reads[0],)
        write_seqs(first_read, out_fhand1, out_format)
        second_read = (reads[1],)
        write_seqs(second_read, out_fhand2, out_format)

    out_fhand1.flush()
    out_fhand2.flush()
def deinterleave_pairs(seqs, out_fhand1, out_fhand2, out_format):
    '''It writes the alternating paired reads into two different files.

    Read number 0 of every group created by group_pairs goes to out_fhand1
    and read number 1 goes to out_fhand2.  The output files are flushed
    once all the groups have been written.
    '''
    out_fhands = (out_fhand1, out_fhand2)
    for pair in group_pairs(seqs, n_seqs_in_pair=2):
        for position, fhand in enumerate(out_fhands):
            write_seqs((pair[position], ), fhand, out_format)

    for fhand in out_fhands:
        fhand.flush()
def match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                memory_limit=get_setting('DEFAULT_SEQS_IN_MEM_LIMIT')):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.

    Reads whose pair direction can not be parsed are written directly to
    orphan_out_fhand.  The other reads are kept in one buffer per direction
    until their mate shows up; then the pair is written to out_fhand and the
    reads that were waiting before the mate are written as orphans.

    Every buffer is a dict with the buffered reads ('items') and a mapping
    from read name to its position in that list ('index').

    Raises MaxNumReadsInMem when memory_limit is not None and the number of
    buffered reads reaches it.
    '''
    buf_fwd = {'index': {}, 'items': []}
    buf_rev = {'index': {}, 'items': []}
    buf1, buf2 = buf_fwd, buf_rev    # for the all orphan case
    for seq in seqs:
        try:
            seq_name, direction = _parse_pair_direction_and_name(seq)
        except PairDirectionError:
            write_seqs([seq], orphan_out_fhand, out_format)
            continue

        # buf1 -> buffer with the reads of the opposite direction, the mate
        #         of the current read would be stored there
        # buf2 -> buffer with the reads of the same direction
        if direction == FWD:
            buf1, buf2 = buf_rev, buf_fwd
        else:
            buf1, buf2 = buf_fwd, buf_rev

        matching_seq_index = buf1['index'].get(seq_name)

        if matching_seq_index is None:
            # no mate yet, the read has to wait in its buffer
            buf2['items'].append(seq)
            buf2['index'][seq_name] = len(buf2['items']) - 1
            # check mem limit, counting without building a joined list
            sum_items = len(buf1['items']) + len(buf2['items'])
            if memory_limit is not None and sum_items >= memory_limit:
                error_msg = 'There are too many consecutive non matching seqs'
                error_msg += ' in your input. We have reached the memory limit'
                raise MaxNumReadsInMem(error_msg)
        else:
            # everything buffered before the mate is an orphan
            orphan_seqs = buf1['items'][:matching_seq_index]
            matching_seq = buf1['items'][matching_seq_index]
            write_seqs(orphan_seqs, orphan_out_fhand, out_format)
            write_seqs([matching_seq, seq], out_fhand, out_format)

            # the reads stored after the mate keep waiting.  The index has
            # to be rebuilt keyed by read name (not by the read object),
            # otherwise the lookup done above would never find them again
            remaining = buf1['items'][matching_seq_index + 1:]
            buf1['items'] = remaining
            buf1['index'] = {_parse_pair_direction_and_name(item)[0]: idx
                             for idx, item in enumerate(remaining)}

            # the reads with the direction of the current one are orphans
            write_seqs(buf2['items'], orphan_out_fhand, out_format)
            buf2['items'] = []
            buf2['index'] = {}

    # the loop has no break, so this always runs: what is left is orphan
    orphan_seqs = buf1['items'] + buf2['items']
    write_seqs(orphan_seqs, orphan_out_fhand, out_format)

    orphan_out_fhand.flush()
    flush_fhand(out_fhand)
def match_pairs_unordered(seq_fpath, out_fhand, orphan_out_fhand, out_format):
    '''It writes the paired and the orphan reads of a file to two files.

    The sequence file is indexed with _index_seq_file and the titles of the
    paired and orphan reads are obtained from that index.  The records are
    fetched from the index lazily while they are being written.
    '''
    index_ = _index_seq_file(seq_fpath)
    paired, orphans = _get_paired_and_orphan(index_)

    def _wrap_records(titles):
        'It yields the indexed records wrapped as SEQRECORD SeqWrappers'
        for title in titles:
            yield SeqWrapper(SEQRECORD, index_[title], None)

    # first the paired reads and then the orphans
    write_seqs(_wrap_records(paired), out_fhand, out_format)
    write_seqs(_wrap_records(orphans), orphan_out_fhand, out_format)
def _pre_trim(self, trim_packet):
    '''It maps the reads of the packet with bwa mem and keeps the sorted bam.

    The reads are written to a temporary file that is used as interleaved
    input for bwa.  The resulting bam, sorted by query name, is stored in
    self._bam_fhand.
    '''
    reads = []
    for read_group in trim_packet[SEQS_PASSED]:
        for read in read_group:
            reads.append(read)

    tmp_reads_fhand = NamedTemporaryFile(dir=self._tempdir,
                                         suffix='.trimming')
    write_seqs(reads, tmp_reads_fhand)
    tmp_reads_fhand.flush()

    bwa_process = map_with_bwamem(self._index_fpath,
                                  interleave_fpath=tmp_reads_fhand.name)
    sorted_bam_fhand = NamedTemporaryFile(dir=self._tempdir)
    map_process_to_sortedbam(bwa_process, sorted_bam_fhand.name,
                             key='queryname', tempdir=self._tempdir)
    self._bam_fhand = sorted_bam_fhand

    # the reads file is not needed once the mapping has been stored
    tmp_reads_fhand.close()
def filter_duplicates(in_fhands, out_fhand, paired_reads, use_length=None,
                      n_seqs_packet=None, tempdir=None):
    '''It writes to out_fhand the pairs read from in_fhands without duplicates.

    The key used to compare the pairs is built by _PairKeyGetter with the
    given use_length.  When n_seqs_packet is None the duplicates are removed
    with unique_unordered; otherwise the pairs are sorted first (keeping at
    most n_seqs_packet items in memory and using tempdir) and filtered with
    unique.

    Raises ValueError if no input file is given.
    '''
    if not in_fhands:
        raise ValueError('At least one input fhand is required')

    pairs = _read_pairs(in_fhands, paired_reads)
    get_pair_key = _PairKeyGetter(use_length=use_length)

    if n_seqs_packet is not None:
        unique_pairs = unique(sorted_items(pairs, key=get_pair_key,
                                           tempdir=tempdir,
                                           max_items_in_memory=n_seqs_packet),
                              key=get_pair_key)
    else:
        unique_pairs = unique_unordered(pairs, key=get_pair_key)

    for unique_pair in unique_pairs:
        write_seqs(unique_pair, out_fhand)
def __call__(self, seqs):
    '''It splits a list of sequences with the provided linkers.

    The sequences are written to a fasta file that is blasted against
    self.linkers.  Every sequence with matched segments is split with
    self._split_by_mate_linker; the others are returned untouched.
    '''
    seq_fhand = write_seqs(seqs, file_format='fasta')
    seq_fhand.flush()

    min_identity = 87.0
    min_len = 13
    length_filter = {'kind': 'min_length', 'min_num_residues': min_len,
                     'length_in_query': False, 'filter_match_parts': True}
    identity_filter = {'kind': 'score_threshold', 'score_key': 'identity',
                       'min_score': min_identity}
    matcher = BlasterForFewSubjects(seq_fhand.name, self.linkers,
                                    program='blastn',
                                    filters=[length_filter, identity_filter],
                                    params={'task': 'blastn-short'},
                                    elongate_for_global=True,
                                    seqs_type=NUCL)

    new_seqs = []
    for seq in seqs:
        segments = matcher.get_matched_segments_for_read(get_name(seq))
        if segments is None:
            new_seqs.append(seq)
        else:
            new_seqs.extend(self._split_by_mate_linker(seq, segments))
    return new_seqs
def test_all_orphan():
    "Reads without a mate must all be written to the orphan file"
    records = [SeqRecord(Seq("ACT"), id=name) for name in ("seq1", "seq2")]
    seqs = list(assing_kind_to_seqs(SEQRECORD, records, None))

    # ordered algorithm
    paired_out = StringIO()
    orphan_out = StringIO()
    match_pairs(seqs, paired_out, orphan_out, out_format="fasta")
    assert orphan_out.getvalue() == ">seq1\nACT\n>seq2\nACT\n"

    # unordered algorithm, it reads the seqs from a file path
    seq_fhand = NamedTemporaryFile(suffix=".fasta")
    write_seqs(seqs, seq_fhand, file_format="fasta")
    seq_fhand.flush()
    paired_out = StringIO()
    orphan_out = StringIO()
    match_pairs_unordered(seq_fhand.name, paired_out, orphan_out,
                          out_format="fasta")
    orphan_text = orphan_out.getvalue()
    assert ">seq1\nACT\n" in orphan_text
    assert ">seq2\nACT\n" in orphan_text
def _do_blast_2(db_fpath, queries, program, dbtype=None, blast_format=None,
                params=None, remote=False):
    '''It returns an alignment result with the blast.

    It is an alternative interface to the one based on fpaths.
    db_fpath should be a plain sequence file.
    queries should be a SeqRecord list.
    If an alternative blast output format is given it should be tabular, so
    blast_format is a list of fields.

    The given params dict is not modified, the output format is added to a
    copy of it.

    It returns a tuple with the blast parser and the file handle of the
    blast result.
    '''
    query_fhand = write_seqs(queries, file_format='fasta')
    query_fhand.flush()

    if remote:
        blastdb = db_fpath
        fmt = 'XML' if blast_format is None else blast_format.upper()
    else:
        blastdb = get_or_create_blastdb(db_fpath, dbtype=dbtype)
        if blast_format is None:
            blast_format = ['query', 'subject', 'query_length',
                            'subject_length', 'query_start', 'query_end',
                            'subject_start', 'subject_end', 'expect',
                            'identity', ]
        fmt = generate_tabblast_format(blast_format)

    # work on a copy, the caller could be reusing its params dict
    params = {} if params is None else dict(params)
    params['outfmt'] = fmt

    blast_fhand = tempfile.NamedTemporaryFile(suffix='.blast')
    do_blast(query_fhand.name, blastdb, program, blast_fhand.name, params,
             remote=remote)
    if remote:
        blasts = BlastParser(blast_fhand)
    else:
        blasts = TabularBlastParser(blast_fhand, blast_format)

    return blasts, blast_fhand
def test_all_orphan():
    'It checks that reads with no mate end up in the orphan file'
    seq1 = SeqRecord(Seq('ACT'), id='seq1')
    seq2 = SeqRecord(Seq('ACT'), id='seq2')
    seqs = list(assing_kind_to_seqs(SEQRECORD, [seq1, seq2], None))

    out_fhand, orphan_out_fhand = StringIO(), StringIO()
    match_pairs(seqs, out_fhand, orphan_out_fhand, out_format='fasta')
    expected = '>seq1\nACT\n>seq2\nACT\n'
    assert orphan_out_fhand.getvalue() == expected

    # the unordered version needs the reads in a file
    seq_fhand = NamedTemporaryFile(suffix='.fasta')
    write_seqs(seqs, seq_fhand, file_format='fasta')
    seq_fhand.flush()

    out_fhand, orphan_out_fhand = StringIO(), StringIO()
    match_pairs_unordered(seq_fhand.name, out_fhand, orphan_out_fhand,
                          out_format='fasta')
    for fasta_record in ('>seq1\nACT\n', '>seq2\nACT\n'):
        assert fasta_record in orphan_out_fhand.getvalue()
def _run_estscan(seqs, pep_out_fpath, dna_out_fpath, matrix_fpath):
    '''It runs estscan in the input seqs.

    The seqs are written to the fasta file returned by write_seqs and that
    file is given to the estscan binary together with the peptide output
    path (-t), the dna output path (-o) and the matrix path (-M).

    The sequence file is closed even if estscan fails.
    '''
    seq_fhand = write_seqs(seqs, file_format='fasta')
    try:
        seq_fhand.flush()
        binary = get_binary_path('estscan')

        cmd = [binary, '-t', pep_out_fpath, '-o', dna_out_fpath,
               '-M', matrix_fpath, seq_fhand.name]
        process = popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        check_process_finishes(process, binary=cmd[0])
    finally:
        # do not leak the sequence file when the process check raises
        seq_fhand.close()
def _pre_trim(self, trim_packet):
    """Prepare the blast matcher for the reads found in the packet.

    The reads are written to a fasta file that acts as the blast subject
    and self.oligos are matched against it.  The matcher is stored in
    self._matcher.
    """
    reads = []
    for read_group in trim_packet[SEQS_PASSED]:
        reads.extend(read_group)

    db_fhand = write_seqs(reads, file_format="fasta")
    db_fhand.flush()

    identity_filter = {"kind": "score_threshold", "score_key": "identity",
                       "min_score": 87}
    length_filter = {"kind": "min_length", "min_num_residues": 13,
                     "length_in_query": False}
    blast_params = {"task": "blastn-short", "expect": "0.0001"}
    self._matcher = BlasterForFewSubjects(
        db_fhand.name,
        self.oligos,
        program="blastn",
        filters=[identity_filter, length_filter],
        params=blast_params,
        elongate_for_global=True,
    )
def test_seqitems_io(self):
    'It checks that the seq classes survive a read and write round trip'
    fasta = '>s1\nACTG\n>s2 desc\nACTG\n'

    # SeqItem
    in_fhand = StringIO(fasta)
    seqs = list(read_seqs([in_fhand], prefered_seq_classes=[SEQITEM]))
    assert seqs[0].kind == SEQITEM
    out_fhand = StringIO()
    write_seqs(seqs, out_fhand)
    assert out_fhand.getvalue() == fasta
    assert seqs[0].object.name == 's1'

    # SeqRecord
    in_fhand = StringIO(fasta)
    seqs = list(read_seqs([in_fhand], prefered_seq_classes=[SEQRECORD]))
    assert seqs[0].kind == SEQRECORD
    out_fhand = StringIO()
    write_seqs(seqs, out_fhand, 'fasta')
    assert out_fhand.getvalue() == fasta

    # seqitem not possible with different input and output formats
    in_fhand = StringIO(fasta)
    try:
        seqs = list(read_seqs([in_fhand], out_format='fastq',
                              prefered_seq_classes=[SEQITEM]))
    except ValueError:
        pass
    else:
        self.fail('ValueError expected')

    # with the same input and output format it should work
    in_fhand = StringIO(fasta)
    seqs = list(read_seqs([in_fhand], out_format='fasta',
                          prefered_seq_classes=[SEQITEM]))
    out_fhand = StringIO()
    write_seqs(seqs, out_fhand)
    assert out_fhand.getvalue() == fasta
def _setup_checks(self, filterpacket):
    '''It maps the reads of the packet with bowtie2 and stores the mapped ones.

    The reads are written to a temporary file, mapped against the index
    found in self._index_fpath and the result of _get_mapped_reads (called
    with self.min_mapq) is stored in self.mapped_reads.

    Raises RuntimeError for SeqItems that are neither fasta nor fastq and
    NotImplementedError for any other kind of sequence.
    '''
    index_fpath = self._index_fpath
    get_or_create_bowtie2_index(index_fpath)

    seqs = [s for seqs in filterpacket[SEQS_PASSED] for s in seqs]
    # the first read decides the format used for the whole packet
    seq_class = seqs[0].kind
    extra_params = []
    # Which format do we need for the bowtie2 input read file fasta or
    # fastq?
    if seq_class == SEQRECORD:
        # membership is tested on the mapping itself, the Python 2 only
        # viewkeys() call is not needed for that
        if 'phred_quality' in seqs[0].object.letter_annotations:
            file_format = 'fastq'
        else:
            extra_params.append('-f')
            file_format = 'fasta'
    elif seq_class == SEQITEM:
        file_format = get_file_format(seqs[0])
        if 'illumina' in file_format:
            extra_params.append('--phred64')
        elif 'fasta' in file_format:
            extra_params.append('-f')
        elif 'fastq' in file_format:
            # plain fastq needs no extra bowtie2 parameter
            pass
        else:
            msg = 'For FilterBowtie2Match and SeqItems fastq or fasta '
            msg += 'files are required'
            raise RuntimeError(msg)
    else:
        raise NotImplementedError()

    reads_fhand = NamedTemporaryFile(suffix=file_format)
    write_seqs(seqs, reads_fhand, file_format=file_format)
    reads_fhand.flush()

    bam_fhand = NamedTemporaryFile(suffix='.bam')
    map_with_bowtie2(index_fpath, bam_fhand.name,
                     unpaired_fpaths=[reads_fhand.name],
                     extra_params=extra_params)

    self.mapped_reads = _get_mapped_reads(bam_fhand.name, self.min_mapq)
def classify_chimeras(in_fhand, index_fpath, mate_distance, out_fhand,
                      chimeras_fhand=None, unknown_fhand=None, tempdir=None,
                      threads=None, settings=get_setting('CHIMERAS_SETTINGS')):
    '''It maps the reads, classifies the pairs and writes them by class.

    The interleaved reads of in_fhand are mapped with bwa mem and the result
    is stored in a temporary bam sorted by query name.  The non chimeric
    pairs are written to out_fhand; the chimeric and unknown pairs are
    written only when their file handle is given.
    '''
    sorted_bam_fhand = NamedTemporaryFile(suffix='.bam')
    bwa_process = map_with_bwamem(index_fpath, interleave_fpath=in_fhand.name,
                                  extra_params=['-a', '-M'])
    map_process_to_sortedbam(bwa_process, sorted_bam_fhand.name,
                             key='queryname', tempdir=tempdir)

    classified = classify_mapped_reads(sorted_bam_fhand, settings=settings,
                                       mate_distance=mate_distance)
    for pair, kind in classified:
        if kind is NON_CHIMERIC:
            write_seqs(pair, out_fhand)
            continue

        # the other kinds are optional outputs
        if kind is CHIMERA:
            optional_fhand = chimeras_fhand
        elif kind is UNKNOWN:
            optional_fhand = unknown_fhand
        else:
            optional_fhand = None
        if optional_fhand is not None:
            write_seqs(pair, optional_fhand)
def _setup_checks(self, filterpacket):
    '''It maps the packet reads with bowtie2 and keeps the mapped reads.

    The output of bowtie2 is converted to a temporary bam with
    map_process_to_bam and the result of _get_mapped_reads (called with
    self.min_mapq) is stored in self.mapped_reads.

    Raises RuntimeError for SeqItems that are neither fasta nor fastq and
    NotImplementedError for any other kind of sequence.
    '''
    index_fpath = self._index_fpath
    get_or_create_bowtie2_index(index_fpath)

    seqs = []
    for seq_group in filterpacket[SEQS_PASSED]:
        seqs.extend(seq_group)

    # bowtie2 needs to know if the reads file is fasta or fastq, the first
    # read decides it for the whole packet
    first_seq = seqs[0]
    seq_class = first_seq.kind
    extra_params = []
    if seq_class == SEQRECORD:
        annotations = first_seq.object.letter_annotations
        has_quality = 'phred_quality' in annotations.viewkeys()
        if has_quality:
            file_format = 'fastq'
        else:
            extra_params.append('-f')
            file_format = 'fasta'
    elif seq_class == SEQITEM:
        file_format = get_file_format(first_seq)
        if 'illumina' in file_format:
            extra_params.append('--phred64')
        elif 'fasta' in file_format:
            extra_params.append('-f')
        elif 'fastq' not in file_format:
            msg = 'For FilterBowtie2Match and SeqItems fastq or fasta '
            msg += 'files are required'
            raise RuntimeError(msg)
    else:
        raise NotImplementedError()

    reads_fhand = NamedTemporaryFile(suffix=file_format)
    write_seqs(seqs, reads_fhand, file_format=file_format)
    reads_fhand.flush()

    bam_fhand = NamedTemporaryFile(suffix='.bam')
    bowtie2_process = map_with_bowtie2(index_fpath,
                                       unpaired_fpaths=[reads_fhand.name],
                                       extra_params=extra_params)
    map_process_to_bam(bowtie2_process, bam_fhand.name)

    self.mapped_reads = _get_mapped_reads(bam_fhand.name, self.min_mapq)
def _pre_trim(self, trim_packet):
    '''It creates the blast matcher for the reads of the packet.

    The reads are written to a fasta file and self.oligos are blasted
    against it.  The resulting matcher is stored in self._matcher.
    '''
    packet_seqs = []
    for seq_group in trim_packet[SEQS_PASSED]:
        for seq in seq_group:
            packet_seqs.append(seq)

    db_fhand = write_seqs(packet_seqs, file_format='fasta')
    db_fhand.flush()

    filters = []
    filters.append({'kind': 'score_threshold', 'score_key': 'identity',
                    'min_score': 87})
    filters.append({'kind': 'min_length', 'min_num_residues': 13,
                    'length_in_query': False})
    params = {'task': 'blastn-short', 'expect': '0.0001'}

    matcher = BlasterForFewSubjects(db_fhand.name, self.oligos,
                                    program='blastn', filters=filters,
                                    params=params, elongate_for_global=True)
    self._matcher = matcher
def test_all_orphan():
    'Two reads with different names are both reported as orphans'
    seqs = []
    for name in ('seq1', 'seq2'):
        seqs.append(SeqRecord(Seq('ACT'), id=name))
    seqs = list(assing_kind_to_seqs(SEQRECORD, seqs, None))

    # reads given as an iterable
    pairs_fhand = StringIO()
    orphans_fhand = StringIO()
    match_pairs(seqs, pairs_fhand, orphans_fhand, out_format='fasta')
    assert orphans_fhand.getvalue() == '>seq1\nACT\n>seq2\nACT\n'

    # reads given as a file path
    seq_fhand = NamedTemporaryFile(suffix='.fasta')
    write_seqs(seqs, seq_fhand, file_format='fasta')
    seq_fhand.flush()
    pairs_fhand = StringIO()
    orphans_fhand = StringIO()
    match_pairs_unordered(seq_fhand.name, pairs_fhand, orphans_fhand,
                          out_format='fasta')
    orphans = orphans_fhand.getvalue()
    assert '>seq1\nACT\n' in orphans
    assert '>seq2\nACT\n' in orphans
def deinterleave_pairs(seqs, out_fhand1, out_fhand2, out_format):
    '''It splits a sequence iterator with alternating paired reads in two.

    The reads are taken two by two: the first one is written to out_fhand1
    and the second one to out_fhand2.  Every couple of reads is checked with
    _check_name_and_direction_match before being written, so it will fail if
    forward and reverse reads are not alternating.

    seqs can be any iterable, not only an iterator.

    Raises InterleaveError if the number of sequences is odd.
    '''
    # the builtins iter and next work with lists and generators and with
    # Python 2 and 3, the .next() method exists only in Python 2 iterators
    seqs = iter(seqs)
    no_mate = object()
    for seq1 in seqs:
        seq2 = next(seqs, no_mate)
        if seq2 is no_mate:
            msg = 'The file had an odd number of sequences'
            raise InterleaveError(msg)
        _check_name_and_direction_match(seq1, seq2)
        write_seqs([seq1], out_fhand1, out_format)
        write_seqs([seq2], out_fhand2, out_format)
    out_fhand1.flush()
    out_fhand2.flush()
def _setup_checks(self, filterpacket):
    '''It creates the blast matcher used to look for the oligos in the reads.

    The reads of the packet are written to a fasta file and self.oligos are
    matched against it.  The matcher is stored in self._matcher.
    '''
    reads = []
    for read_group in filterpacket[SEQS_PASSED]:
        reads.extend(read_group)

    # the reads become the blast database and the oligos are the queries
    db_fhand = write_seqs(reads, file_format='fasta')
    db_fhand.flush()

    identity_filter = {'kind': 'score_threshold', 'score_key': 'identity',
                       'min_score': 87}
    length_filter = {'kind': 'min_length', 'min_num_residues': 13,
                     'length_in_query': False}
    blast_params = {'task': 'blastn-short', 'expect': '0.0001'}

    self._matcher = BlasterForFewSubjects(db_fhand.name, self.oligos,
                                          program='blastn',
                                          filters=[identity_filter,
                                                   length_filter],
                                          params=blast_params,
                                          elongate_for_global=False)
def _do_blast_2(db_fpath, queries, program, dbtype=None, blast_format=None,
                params=None, remote=False):
    """It returns an alignment result with the blast.

    It is an alternative interface to the one based on fpaths.
    db_fpath should be a plain sequence file.
    queries should be a SeqRecord list.
    If an alternative blast output format is given it should be tabular, so
    blast_format is a list of fields.

    The params dict given by the caller is left untouched, the output format
    is set in a copy.

    It returns the blast parser and the file handle with the blast result.
    """
    query_fhand = write_seqs(queries, file_format="fasta")
    query_fhand.flush()

    if remote:
        blastdb = db_fpath
        fmt = "XML" if blast_format is None else blast_format.upper()
    else:
        blastdb = get_or_create_blastdb(db_fpath, dbtype=dbtype)
        if blast_format is None:
            blast_format = [
                "query",
                "subject",
                "query_length",
                "subject_length",
                "query_start",
                "query_end",
                "subject_start",
                "subject_end",
                "expect",
                "identity",
            ]
        fmt = generate_tabblast_format(blast_format)

    # copy before adding outfmt, so the dict of the caller is not modified
    blast_params = dict(params) if params is not None else {}
    blast_params["outfmt"] = fmt

    blast_fhand = tempfile.NamedTemporaryFile(suffix=".blast")
    do_blast(query_fhand.name, blastdb, program, blast_fhand.name,
             blast_params, remote=remote)

    if remote:
        blasts = BlastParser(blast_fhand)
    else:
        blasts = TabularBlastParser(blast_fhand, blast_format)
    return blasts, blast_fhand
def __call__(self, seqs):
    '''It marks the segments matched by the oligos to be trimmed.

    The seqs are written to a fasta file and self.oligos are blasted against
    it.  For every seq with matches, the first item returned by the matcher
    is added as trim segments of kind VECTOR.  The same seqs are returned.
    '''
    db_fhand = write_seqs(seqs, file_format='fasta')
    db_fhand.flush()

    identity_filter = {'kind': 'score_threshold', 'score_key': 'identity',
                       'min_score': 89}
    length_filter = {'kind': 'min_length', 'min_num_residues': 13,
                     'length_in_query': False}
    blast_params = {'task': 'blastn-short', 'expect': '0.0001'}
    matcher = BlasterForFewSubjects(db_fhand.name, self.oligos,
                                    program='blastn',
                                    filters=[identity_filter, length_filter],
                                    params=blast_params,
                                    elongate_for_global=True)

    for seq in seqs:
        matched = matcher.get_matched_segments_for_read(get_name(seq))
        if matched is None:
            continue
        _add_trim_segments(matched[0], seq, kind=VECTOR)
    return seqs
def test_seqitems_io(self):
    'It checks the reading and writing of the different seq classes'
    fasta_text = '>s1\nACTG\n>s2 desc\nACTG\n'

    def _read(**kwargs):
        'It reads the fasta text with the given read_seqs options'
        return list(read_seqs([StringIO(fasta_text)], 'fasta', **kwargs))

    # SeqItem
    seqs = _read(prefered_seq_classes=[SEQITEM])
    assert seqs[0].kind == SEQITEM
    fhand = StringIO()
    write_seqs(seqs, fhand)
    assert fhand.getvalue() == fasta_text
    assert seqs[0].object.name == 's1'

    # SeqRecord
    seqs = _read(prefered_seq_classes=[SEQRECORD])
    assert seqs[0].kind == SEQRECORD
    fhand = StringIO()
    write_seqs(seqs, fhand, 'fasta')
    assert fhand.getvalue() == fasta_text

    # seqitem not possible with different input and output formats
    try:
        seqs = _read(out_format='fastq', prefered_seq_classes=[SEQITEM])
    except ValueError:
        pass
    else:
        self.fail('ValueError expected')

    seqs = _read(out_format='fasta', prefered_seq_classes=[SEQITEM])
    fhand = StringIO()
    write_seqs(seqs, fhand)
    assert fhand.getvalue() == fasta_text
def filter_chimeras(ref_fpath, out_fhand, chimeras_fhand, in_fhands,
                    unknown_fhand, unpaired=False, paired_result=True,
                    settings=get_setting('CHIMERAS_SETTINGS'),
                    min_seed_len=None, directory=None):
    '''It splits the read pairs in non chimeric, chimeric and unknown.

    The reads are mapped against ref_fpath.  If unpaired is False the input
    is deinterleaved into two temporary files that are mapped as paired
    reads; otherwise the paths of in_fhands are mapped as unpaired reads.

    The non chimeric pairs are written to out_fhand.  The chimeric and the
    unknown pairs are written only if chimeras_fhand and unknown_fhand are
    not None, but they are always counted in the statistics printed to the
    standard output.
    '''
    file_format = get_format(in_fhands[0])
    if unpaired:
        unpaired_fpaths = [fhand.name for fhand in in_fhands]
        paired_fpaths = None
    else:
        f_fhand = NamedTemporaryFile()
        r_fhand = NamedTemporaryFile()
        seqs = read_seqs(in_fhands)
        deinterleave_pairs(seqs, f_fhand, r_fhand, file_format)
        paired_fpaths = [f_fhand.name, r_fhand.name]
        unpaired_fpaths = None
    bamfile = _sorted_mapped_reads(ref_fpath, paired_fpaths, unpaired_fpaths,
                                   directory, file_format, min_seed_len)

    total = 0
    chimeric = 0
    unknown = 0
    for pair, kind in classify_mapped_reads(bamfile, settings=settings,
                                            paired_result=paired_result,
                                            file_format=file_format):
        if kind is NON_CHIMERIC:
            write_seqs(pair, out_fhand)
        elif kind is CHIMERA:
            # counted even when there is no file to write them, otherwise
            # they would be reported as non chimeric
            chimeric += 1
            if chimeras_fhand is not None:
                write_seqs(pair, chimeras_fhand)
        elif kind is UNKNOWN:
            unknown += 1
            if unknown_fhand is not None:
                write_seqs(pair, unknown_fhand)
        total += 1
    mapped = total - chimeric - unknown

    def _fraction(count):
        'It returns count / total, or 0.0 when no pair was analyzed'
        return count / float(total) if total else 0.0

    # every print gets one already formatted string, so the statement works
    # both as a Python 2 print statement and as a Python 3 function call
    print('Total pairs analyzed:  %s' % total)
    print('Chimeric pairs filtered:  %s \t%s' % (chimeric,
                                                 _fraction(chimeric)))
    print('Unknown pairs found:  %s \t%s' % (unknown, _fraction(unknown)))
    print('Non-chimeric pairs:  %s \t%s' % (mapped, _fraction(mapped)))
def match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                memory_limit=get_setting('DEFAULT_SEQS_IN_MEM_LIMIT')):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.

    Reads whose pair direction can not be parsed are written directly to
    orphan_out_fhand.  The other reads wait in one buffer per direction
    until their mate arrives; then the pair is written to out_fhand and the
    rest of the buffered reads are written as orphans.

    Every buffer is a dict with the buffered reads ('items') and a mapping
    from read name to its position in that list ('index').

    Raises MaxNumReadsInMem when memory_limit is not None and the number of
    buffered reads reaches it, and MalformedFile when the mate found is not
    the last read stored in its buffer.
    '''
    buf_fwd = {'index': {}, 'items': []}
    buf_rev = {'index': {}, 'items': []}
    buf1, buf2 = buf_rev, buf_fwd    # for the all orphan case
    for seq in seqs:
        try:
            seq_name, direction = _parse_pair_direction_and_name(seq)
        except PairDirectionError:
            write_seqs([seq], orphan_out_fhand, out_format)
            continue

        # buf1 -> buffer for the reads with the same orientation as the
        #         current one
        # buf2 -> buffer for the reads with the reverse orientation as the
        #         current one
        if direction == FWD:
            buf1, buf2 = buf_fwd, buf_rev
        else:
            buf1, buf2 = buf_rev, buf_fwd

        matching_seq_index = buf2['index'].get(seq_name)

        if matching_seq_index is None:
            # add to buff
            buf1['items'].append(seq)
            buf1['index'][seq_name] = len(buf1['items']) - 1
            # check mem limit, counting without building a joined list
            sum_items = len(buf2['items']) + len(buf1['items'])
            if memory_limit is not None and sum_items >= memory_limit:
                error_msg = 'There are too many consecutive non matching seqs'
                error_msg += ' in your input. We have reached the memory limit.'
                error_msg += ' Are you sure that the reads are sorted and '
                error_msg += 'interleaved?. You could try with the unordered'
                error_msg += ' algorithm'
                raise MaxNumReadsInMem(error_msg)
        else:
            # the reads stored before the mate are orphans
            orphan_seqs = buf2['items'][:matching_seq_index]
            matching_seq = buf2['items'][matching_seq_index]
            write_seqs(orphan_seqs, orphan_out_fhand, out_format)
            write_seqs([matching_seq, seq], out_fhand, out_format)

            # the mate has to be the last read stored in its buffer
            if matching_seq_index != len(buf2['items']) - 1:
                msg = 'The given files are not sorted (ordered) and '
                msg += 'interleaved. You could try with the unordered algorithm'
                raise MalformedFile(msg)
            # empty the buffer in place: buf2 is an alias of buf_fwd or
            # buf_rev and rebinding the local name would leave the reads
            # that have been already written in the real buffer
            buf2['items'] = []
            buf2['index'] = {}

            # the reads with the orientation of the current one are orphans
            write_seqs(buf1['items'], orphan_out_fhand, out_format)
            buf1['items'] = []
            buf1['index'] = {}

    # the loop has no break, so this always runs: what is left is orphan
    orphan_seqs = buf1['items'] + buf2['items']
    write_seqs(orphan_seqs, orphan_out_fhand, out_format)

    orphan_out_fhand.flush()
    flush_fhand(out_fhand)
def match_pairs(seqs, out_fhand, orphan_out_fhand, out_format,
                memory_limit=get_setting('DEFAULT_SEQS_IN_MEM_LIMIT')):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.

    Reads whose pair direction can not be parsed are written directly to
    orphan_out_fhand.  The other reads wait in one buffer per direction
    until their mate arrives; then the pair is written to out_fhand, the
    rest of the buffered reads are written as orphans and both buffers are
    replaced by empty ones.

    Every buffer is a dict with the buffered reads ('items') and a mapping
    from read name to its position in that list ('index').

    Raises MaxNumReadsInMem when memory_limit is not None and the number of
    buffered reads reaches it, and MalformedFile when the mate found is not
    the last read stored in its buffer.
    '''
    buf_fwd = {'index': {}, 'items': []}
    buf_rev = {'index': {}, 'items': []}
    buf1, buf2 = buf_rev, buf_fwd    # for the all orphan case
    for seq in seqs:
        try:
            seq_name, direction = _parse_pair_direction_and_name(seq)
        except PairDirectionError:
            write_seqs([seq], orphan_out_fhand, out_format)
            continue

        # buf1 -> buffer for the reads with the same orientation as the
        #         current one
        # buf2 -> buffer for the reads with the reverse orientation as the
        #         current one
        if direction == FWD:
            buf1 = buf_fwd
            buf2 = buf_rev
        else:
            buf1 = buf_rev
            buf2 = buf_fwd

        try:
            matching_seq_index = buf2['index'][seq_name]
        except KeyError:
            matching_seq_index = None

        if matching_seq_index is None:
            # add to buff
            buf1['items'].append(seq)
            buf1['index'][seq_name] = len(buf1['items']) - 1
            # check mem limit, counting without building a joined list
            sum_items = len(buf2['items']) + len(buf1['items'])
            if memory_limit is not None and sum_items >= memory_limit:
                error_msg = ('There are too many consecutive non matching '
                             'seqs in your input. We have reached the memory '
                             'limit. Are you sure that the reads are sorted '
                             'and interleaved?. You could try with the '
                             'unordered algorithm')
                raise MaxNumReadsInMem(error_msg)
        else:
            # the reads stored before the mate are orphans
            orphan_seqs = buf2['items'][:matching_seq_index]
            matching_seq = buf2['items'][matching_seq_index]
            write_seqs(orphan_seqs, orphan_out_fhand, out_format)
            write_seqs([matching_seq, seq], out_fhand, out_format)

            # the mate has to be the last read stored in its buffer
            if matching_seq_index != len(buf2['items']) - 1:
                # both halves of the message are kept, the second one used
                # to overwrite the first
                msg = ('The given files are not sorted (ordered) and '
                       'interleaved. You could try with the unordered '
                       'algorithm')
                raise MalformedFile(msg)
            buf2 = {'index': {}, 'items': []}

            # the reads with the orientation of the current one are orphans
            write_seqs(buf1['items'], orphan_out_fhand, out_format)
            buf1 = {'index': {}, 'items': []}

            # the new empty buffers replace the old ones
            if direction == FWD:
                buf_fwd = buf1
                buf_rev = buf2
            else:
                buf_rev = buf1
                buf_fwd = buf2

    # the loop has no break, so this always runs: what is left is orphan
    orphan_seqs = buf1['items'] + buf2['items']
    write_seqs(orphan_seqs, orphan_out_fhand, out_format)

    orphan_out_fhand.flush()
    flush_fhand(out_fhand)