def __call__(self, seqs):
    """Split each sequence of a packet at its matched linker segments.

    The packet is dumped to a FASTA file and handed, together with
    ``self.linkers``, to a ``BlastMatcher`` running ``blastn`` with the
    ``blastn-short`` task.  Matches are kept only if they pass a minimum
    length of 17 residues and a minimum identity of 87.0.

    ``seqs`` is traversed again after being given to ``write_seqrecords``,
    so it presumably has to be a re-iterable collection such as a list
    (TODO confirm against callers).

    Returns a new list: for every input record either the pieces returned
    by ``self._split_by_mate_linker`` or, when the matcher reports
    ``None`` for that read, the record itself.

    Side effects on ``self._stats``: one processed packet per call, one
    processed sequence per input record and one yielded sequence per
    output record.
    """
    counters = self._stats
    counters[PROCESSED_PACKETS] += 1

    fasta_fhand = write_seqrecords(seqs, file_format='fasta')
    # Flush so the file can be read through its name by the matcher.
    fasta_fhand.flush()

    length_filter = {
        'kind': 'min_length',
        'min_num_residues': 17,
        'length_in_query': False,
        'filter_match_parts': True,
    }
    identity_filter = {
        'kind': 'score_threshold',
        'score_key': 'identity',
        'min_score': 87.0,
    }
    matcher = BlastMatcher(
        fasta_fhand.name,
        self.linkers,
        program='blastn',
        filters=[length_filter, identity_filter],
        params={'task': 'blastn-short'},
        elongate_for_global=True,
    )

    result = []
    for record in seqs:
        counters[PROCESSED_SEQS] += 1
        matches = matcher.get_matched_segments_for_read(record.id)
        # None means the matcher has nothing for this read: pass it through.
        pieces = (
            [record]
            if matches is None
            else self._split_by_mate_linker(record, matches)
        )
        for piece in pieces:
            result.append(piece)
            counters[YIELDED_SEQS] += 1
    return result
def test_matching_segments(self):
    """BlastMatcher reports where the linker lies inside read 'seq1'.

    The expected region starts at ``len(five_prime)`` and ends at the
    last linker position (inclusive), i.e. the linker is expected right
    after the 5' sequence.  Presumably 'seq1' in the mate-pair fixture is
    built as 5' sequence + TITANIUM_LINKER + ... (verify against
    ``create_a_matepair_file``).
    """
    five_prime = 'CTAGTCTAGTCGTAGTCATGGCTGTAGTCTAGTCTACGATTCGTATCAGTTGTGTGAC'
    matepair_fhand = create_a_matepair_file()

    linker_start = len(five_prime)
    linker_end = linker_start + len(TITANIUM_LINKER) - 1
    expected = [(linker_start, linker_end)]

    matcher = BlastMatcher(
        matepair_fhand.name,
        LINKERS,
        program='blastn',
        elongate_for_global=True,
    )
    # Only the first element of the matcher result is checked.
    found = matcher.get_matched_segments_for_read('seq1')[0]
    assert found == expected
def __call__(self, seqrecords):
    """Register the regions matched by ``self.oligos`` as VECTOR trim segments.

    The records are written to a FASTA file that is passed, together with
    ``self.oligos``, to a ``BlastMatcher`` running ``blastn`` with the
    ``blastn-short`` task and an ``expect`` of ``'0.0001'``.  Matches
    must reach an identity score of 89 and a length of 13 residues.

    For every record with a matcher result other than ``None``, element 0
    of that result is forwarded to ``_add_trim_segments`` with
    ``kind=VECTOR``.  That helper presumably annotates the record in
    place (TODO confirm); nothing is cut here.

    ``seqrecords`` is iterated again after being given to
    ``write_seqrecords``, so it presumably must be re-iterable, e.g. a
    list (TODO confirm against callers).

    Returns the same ``seqrecords`` object that was passed in.  Both the
    processed and the yielded counters in ``self.stats`` grow by one per
    record.
    """
    counters = self.stats

    fasta_fhand = write_seqrecords(seqrecords, file_format='fasta')
    # Flush so the file can be read through its name by the matcher.
    fasta_fhand.flush()

    identity_filter = {
        'kind': 'score_threshold',
        'score_key': 'identity',
        'min_score': 89,
    }
    length_filter = {
        'kind': 'min_length',
        'min_num_residues': 13,
        'length_in_query': False,
    }
    blast_params = {'task': 'blastn-short', 'expect': '0.0001'}
    matcher = BlastMatcher(
        fasta_fhand.name,
        self.oligos,
        program='blastn',
        filters=[identity_filter, length_filter],
        params=blast_params,
        elongate_for_global=True,
    )

    for record in seqrecords:
        counters[PROCESSED_SEQS] += 1
        matches = matcher.get_matched_segments_for_read(record.id)
        if matches is not None:
            # Only element 0 of the matcher result is used.
            _add_trim_segments(matches[0], record, kind=VECTOR)
        # Every record is handed back, matched or not.
        counters[YIELDED_SEQS] += 1
    return seqrecords