def test_classify_paired_reads(self):
    index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
    # non-chimeric
    query1 = '>seq1 1:N:0:GATCAG\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT\n'
    query2 = '>seq1 2:N:0:GATCAG\nAGGAGGGATCGGGCACCCACGGCGCGGTAGACTGAGGCCTTCTCGAACT\n'
    # chimeric
    query3 = '>seq2 1:N:0:GATCAG\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC\n'
    query4 = '>seq2 2:N:0:GATCAG\nACGTGGATGCGGCGACGGCCCTACGGCACATACTGTTATTAGGGTCACT\n'
    # unknown
    query5 = '>seq3 1:N:0:GATCAG\nAGTGACCCTAATAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGT\n'
    query6 = '>seq3 2:N:0:GATCAG\nGTCGTGCGCAGCCATTGAGACCTTCCTAGGGTTTTCCCCATGGAATCGG\n'
    query = query1 + query2 + query5 + query6 + query3 + query4
    in_fhand = NamedTemporaryFile()
    in_fhand.write(query)
    in_fhand.flush()
    bam_fhand = NamedTemporaryFile(suffix='.bam')
    extra_params = ['-a', '-M']
    bwa = map_with_bwamem(index_fpath, interleave_fpath=in_fhand.name,
                          extra_params=extra_params)
    map_process_to_sortedbam(bwa, bam_fhand.name, key='queryname')
    result = classify_mapped_reads(bam_fhand, mate_distance=2000)
    for pair, kind in result:
        if kind == NON_CHIMERIC:
            assert 'seq1' in get_name(pair[0])
        elif kind == UNKNOWN:
            assert 'seq3' in get_name(pair[0])
        elif kind == CHIMERA:
            assert 'seq2' in get_name(pair[0])
        else:
            self.fail()

def test_giuseppe_reads():
    'It splits some real reads'
    seq_fpath = os.path.join(TEST_DATA_DIR, '454_reads.fastq')
    linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta')
    linkers = list(read_seqs([open(linker_fpath)]))
    splitter = MatePairSplitter(linkers=linkers)
    new_seqs = []
    for packet in read_seq_packets([open(seq_fpath)], 2):
        new_seqs.extend(splitter(packet))
    seq_names = [get_name(seq) for seq in new_seqs]
    assert 'G109AZL01BJHT8\\1' in seq_names
    assert 'G109AZL01BJHT8\\2' in seq_names
    assert len(new_seqs) == 20

    # test with process_seq_packets
    seq_fpath = os.path.join(TEST_DATA_DIR, '454_reads.fastq')
    linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta')
    linkers = list(read_seqs([open(linker_fpath)]))
    splitter = MatePairSplitter(linkers=linkers)
    seq_packets = read_seq_packets([open(seq_fpath)], 2)
    seq_packets = process_seq_packets(seq_packets, [splitter])[0]
    new_seqs = [seq for packet in list(seq_packets) for seq in packet]
    seq_names = [get_name(seq) for seq in new_seqs]
    assert 'G109AZL01BJHT8\\1' in seq_names
    assert 'G109AZL01BJHT8\\2' in seq_names
    assert len(new_seqs) == 20

    # reads 2
    seq_fpath = os.path.join(TEST_DATA_DIR, '454_reads2.fastq')
    linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta')
    linkers = list(read_seqs([open(linker_fpath)]))
    splitter = MatePairSplitter(linkers=linkers)
    new_seqs = []
    for packet in read_seq_packets([open(seq_fpath)], 2):
        new_seqs.extend(splitter(packet))
    seq_names = [get_name(seq) for seq in new_seqs]
    assert 'G109AZL01D8U3X\\1' in seq_names
    assert 'G109AZL01D8U3X\\2' in seq_names
    assert len(new_seqs) == 20

    # test with process_seq_packets
    seq_fpath = os.path.join(TEST_DATA_DIR, '454_reads2.fastq')
    linker_fpath = os.path.join(TEST_DATA_DIR, 'linkers.fasta')
    linkers = list(read_seqs([open(linker_fpath)]))
    splitter = MatePairSplitter(linkers=linkers)
    seq_packets = read_seq_packets([open(seq_fpath)], 2)
    seq_packets = process_seq_packets(seq_packets, [splitter])[0]
    new_seqs = [seq for packet in list(seq_packets) for seq in packet]
    seq_names = [get_name(seq) for seq in new_seqs]
    assert 'G109AZL01D8U3X\\1' in seq_names
    assert 'G109AZL01D8U3X\\2' in seq_names
    assert len(new_seqs) == 20

def test_sort_by_position_in_ref(self):
    index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
    # with fasta format
    query1 = '>seq1\nGAGAATTAAGCCTATCTGGAGAGCGGTACCAACAGGGAAACACCGACTCA\n'
    query2 = '>seq2\nTAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGTTATCGGAAGGGC\n'
    query3 = '>seq3\nTACGGCCGTCCCCCTGCTGCTTATCATCAGGCGACGATAGTCAGCTCCGC\n'
    query4 = '>seq4\nTGCAGAGACCGACATGCGAAAGGAGTGACTATCACCGTCAATGGCGTGCC\n'
    query5 = '>seq5\nAATAAATAATCTGGGTATGTACTCGGAGTCTACGTAAGCGCGCTTAAATT\n'
    query6 = '>seq6\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
    query = query6 + query1 + query2 + query3 + query4 + query5
    in_fhand = NamedTemporaryFile()
    in_fhand.write(query)
    in_fhand.flush()
    sorted_names = []
    for seq in sort_fastx_files(in_fhand, 'coordinate', index_fpath):
        sorted_names.append(get_name(seq))
    expected_names = ['seq2', 'seq3', 'seq1', 'seq5', 'seq4', 'seq6']
    assert sorted_names == expected_names
    # it fails because bwa somehow gives a position to an unmapped seq

    # with fastq format
    query1 += '+\n??????????????????????????????????????????????????\n'
    query2 += '+\n??????????????????????????????????????????????????\n'
    query3 += '+\n??????????????????????????????????????????????????\n'
    query4 += '+\n??????????????????????????????????????????????????\n'
    query5 += '+\n??????????????????????????????????????????????????\n'
    query6 += '+\n??????????????????????????????????????????????????\n'
    query = query6 + query1 + query2 + query3 + query4 + query5
    in_fhand = NamedTemporaryFile()
    in_fhand.write(query)
    in_fhand.flush()
    sorted_names = []
    for seq in sort_fastx_files(in_fhand, 'coordinate', index_fpath):
        sorted_names.append(get_name(seq))
    expected_names = ['seq2', 'seq3', 'seq1', 'seq5', 'seq4', 'seq6']
    assert sorted_names == expected_names

    # sort by sequence
    sorted_names = []
    for seq in sort_fastx_files([in_fhand], key='seq', directory=None,
                                max_items_in_memory=None, tempdir=None):
        sorted_names.append(get_name(seq))
    expected_names = ['seq6', 'seq5', 'seq1', 'seq2', 'seq3', 'seq4']
    assert sorted_names == expected_names

def test_sort_by_position_in_ref(self):
    reference = GENOME
    ref_fhand = NamedTemporaryFile()
    ref_fhand.write(reference)
    ref_fhand.flush()
    # with fasta format
    query1 = '>seq1\nGAGAATTAAGCCTATCTGGAGAGCGGTACCAACAGGGAAACACCGACTCA\n'
    query2 = '>seq2\nTAACAGTATGTGCCGTAGGGCCGTCGCCGCATCCACGTTATCGGAAGGGC\n'
    query3 = '>seq3\nTACGGCCGTCCCCCTGCTGCTTATCATCAGGCGACGATAGTCAGCTCCGC\n'
    query4 = '>seq4\nTGCAGAGACCGACATGCGAAAGGAGTGACTATCACCGTCAATGGCGTGCC\n'
    query5 = '>seq5\nAATAAATAATCTGGGTATGTACTCGGAGTCTACGTAAGCGCGCTTAAATT\n'
    query6 = '>seq6\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
    query = query6 + query1 + query2 + query3 + query4 + query5
    in_fhand = NamedTemporaryFile()
    in_fhand.write(query)
    in_fhand.flush()
    sorted_names = []
    for seq in sort_fastx_files([in_fhand], 'coordinate', ref_fhand.name):
        sorted_names.append(get_name(seq))
    expected_names = ['seq2', 'seq3', 'seq1', 'seq5', 'seq4', 'seq6']
    assert sorted_names == expected_names

    # with fastq format
    query1 += '+\n??????????????????????????????????????????????????\n'
    query2 += '+\n??????????????????????????????????????????????????\n'
    query3 += '+\n??????????????????????????????????????????????????\n'
    query4 += '+\n??????????????????????????????????????????????????\n'
    query5 += '+\n??????????????????????????????????????????????????\n'
    query6 += '+\n??????????????????????????????????????????????????\n'
    query = query6 + query1 + query2 + query3 + query4 + query5
    in_fhand = NamedTemporaryFile()
    in_fhand.write(query)
    in_fhand.flush()
    sorted_names = []
    for seq in sort_fastx_files([in_fhand], 'coordinate', ref_fhand.name):
        sorted_names.append(get_name(seq))
    expected_names = ['seq2', 'seq3', 'seq1', 'seq5', 'seq4', 'seq6']
    assert sorted_names == expected_names

    # sort by sequence
    sorted_names = []
    for seq in sort_fastx_files([in_fhand], key='seq', directory=None,
                                max_items_in_memory=None, tempdir=None):
        sorted_names.append(get_name(seq))
    expected_names = ['seq6', 'seq5', 'seq1', 'seq2', 'seq3', 'seq4']
    assert sorted_names == expected_names

def __call__(self, seqs):
    'It splits a list of sequences with the provided linkers'
    seq_fhand = write_seqs(seqs, file_format='fasta')
    seq_fhand.flush()
    min_identity = 87.0
    min_len = 13
    filters = [{'kind': 'min_length', 'min_num_residues': min_len,
                'length_in_query': False, 'filter_match_parts': True},
               {'kind': 'score_threshold', 'score_key': 'identity',
                'min_score': min_identity}]
    matcher = BlasterForFewSubjects(seq_fhand.name, self.linkers,
                                    program='blastn', filters=filters,
                                    params={'task': 'blastn-short'},
                                    elongate_for_global=True,
                                    seqs_type=NUCL)
    new_seqs = []
    for seq in seqs:
        segments = matcher.get_matched_segments_for_read(get_name(seq))
        if segments is not None:
            split_seqs = self._split_by_mate_linker(seq, segments)
        else:
            split_seqs = [seq]
        new_seqs.extend(split_seqs)
    return new_seqs

def test_many_reads(self):
    'It splits lots of reads to check that blast finds everything'
    linker = TITANIUM_LINKER

    def create_seq(index):
        'It creates a random seq with a linker'
        seq1 = ''.join(choice('ACTG') for i in range(100))
        seq2 = ''.join(choice('ACTG') for i in range(100))
        seq = seq1 + linker + seq2
        seq = SeqRecord(id='seq_' + str(index), seq=Seq(seq))
        seq = SeqWrapper(SEQRECORD, seq, None)
        return seq

    # We want to test that blast reports all reads
    packet_size = get_setting('PACKET_SIZE')
    default_blast_max_target_size = 500
    assert packet_size > default_blast_max_target_size
    seqs = [create_seq(i) for i in range(1000)]
    splitter = MatePairSplitter()
    for index, seq in enumerate(splitter(seqs)):
        seq_index = index // 2
        pair_index = (index % 2) + 1
        expected_id = 'seq_' + str(seq_index) + '\\' + str(pair_index)
        assert get_name(seq) == expected_id

def __call__(self, seqs):
    'It runs the actual annotations'
    if not seqs:
        return seqs
    pep_fhand = NamedTemporaryFile()
    dna_fhand = NamedTemporaryFile()
    _run_estscan(seqs, pep_fhand.name, dna_fhand.name, self._usage_matrix)
    # now we read the result files
    estscan_result = _read_estcan_results(open(pep_fhand.name),
                                          open(dna_fhand.name))
    for seq in seqs:
        seq_name = get_name(seq)
        orfs = estscan_result.get(seq_name, {})
        feats = []
        for (start, end, strand), str_seqs in orfs.viewitems():
            # ESTScan reports 1-based inclusive coordinates, whereas
            # FeatureLocation is 0-based half-open, so only the start needs
            # shifting; the end is fine as it is
            start -= 1
            feat = SeqFeature(location=FeatureLocation(start, end, strand),
                              type='ORF', qualifiers=str_seqs)
            feats.append(feat)
        if feats:
            seq.object.features.extend(feats)
    dna_fhand.close()
    pep_fhand.close()
    return seqs

def match_pairs(
    reads,
    out_fhand,
    orphan_out_fhand,
    out_format,
    ordered=True,
    check_order_buffer_size=0,
    max_reads_memory=None,
    temp_dir=None,
):
    """It matches the seq pairs in an iterator and splits the orphan seqs."""
    counts = 0
    check_order_buffer = KeyedSet()
    for pair in _get_paired_and_orphan(reads, ordered, max_reads_memory,
                                       temp_dir):
        if len(pair) == 1:
            write_seqs(pair, orphan_out_fhand, out_format)
            try:
                name = _parse_pair_direction_and_name(pair[0])[0]
            except PairDirectionError:
                name = get_name(pair[0])
            if ordered and counts < check_order_buffer_size:
                counts += 1
                if not check_order_buffer.check_add(name):
                    msg = "Reads are not ordered by pairs. Use the unordered option"
                    raise ItemsNotSortedError(msg)
            elif ordered and counts >= check_order_buffer_size:
                if name in check_order_buffer:
                    msg = "Reads are not ordered by pairs. Use the unordered option"
                    raise ItemsNotSortedError(msg)
        elif len(pair) == 2:
            write_seqs(pair, out_fhand, out_format)
    flush_fhand(orphan_out_fhand)
    flush_fhand(out_fhand)

def append_to_description(seqrecord, text):
    'It appends the text to the seqrecord description'
    desc = get_description(seqrecord)
    if desc in (None, get_name(seqrecord), '<unknown description>'):
        desc = ''
    desc += text
    seqrecord.object.description = desc

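# A minimal usage sketch for append_to_description (the record below is
# illustrative, not taken from the test data, and it assumes get_description
# lives in crumbs.seq alongside the other accessors). Placeholder
# descriptions such as Biopython's '<unknown description>' are discarded
# rather than appended to.
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

from crumbs.seq import SeqWrapper, get_description
from crumbs.utils.tags import SEQRECORD

seq = SeqWrapper(SEQRECORD, SeqRecord(Seq('ACTG'), id='s1'), None)
append_to_description(seq, 'trimmed oligo region')
assert get_description(seq) == 'trimmed oligo region'
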
def match_pairs(reads, out_fhand, orphan_out_fhand, out_format,
                ordered=True, check_order_buffer_size=0,
                max_reads_memory=None, temp_dir=None):
    '''It matches the seq pairs in an iterator and splits the orphan seqs.'''
    counts = 0
    check_order_buffer = KeyedSet()
    for pair in _get_paired_and_orphan(reads, ordered, max_reads_memory,
                                       temp_dir):
        if len(pair) == 1:
            write_seqs(pair, orphan_out_fhand, out_format)
            try:
                name = _parse_pair_direction_and_name(pair[0])[0]
            except PairDirectionError:
                name = get_name(pair[0])
            if ordered and counts < check_order_buffer_size:
                counts += 1
                if not check_order_buffer.check_add(name):
                    msg = 'Reads are not ordered by pairs. Use the unordered option'
                    raise ItemsNotSortedError(msg)
            elif ordered and counts >= check_order_buffer_size:
                if name in check_order_buffer:
                    msg = 'Reads are not ordered by pairs. Use the unordered option'
                    raise ItemsNotSortedError(msg)
        elif len(pair) == 2:
            write_seqs(pair, out_fhand, out_format)
    flush_fhand(orphan_out_fhand)
    flush_fhand(out_fhand)

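# A usage sketch for match_pairs (the file names are assumptions, and
# read_seqs is assumed to come from crumbs.seqio as the other readers do).
# The reads arrive as a single iterator with mates adjacent; complete pairs
# go to one file and reads whose mate is missing go to the orphan file.
from crumbs.seqio import read_seqs

reads = read_seqs([open('interleaved.fastq')])
out_fhand = open('paired.fastq', 'w')
orphan_fhand = open('orphans.fastq', 'w')
match_pairs(reads, out_fhand, orphan_fhand, out_format='fastq')
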
def test_filter_chimeras(self):
    reference_seq = GENOME
    # typical non-chimeric
    query1 = '>seq1 1:Y:18:ATCACG\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTA'
    query1 += 'CATTGAACTT\n'
    query2 = '>seq1 2:Y:18:ATCACG\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAG'
    query2 += 'GGTTGTAACG\n'
    # typical chimeric
    query3 = '>seq2 1:Y:18:ATCACG\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGT'
    query3 += 'CTGCGATCCCTG'
    query3 += 'GGTAGACTGAGGCCTTCTCGAACTACAAATCATCACCAGACCATGTCCGA\n'
    query4 = '>seq2 2:Y:18:ATCACG\nTTAAGGCACGTACGGTACCTAAATCGGCCTGATGGTATT'
    query4 += 'GATGCTGAACTT'
    query4 += 'ATTGCGGCTCACACACCCCTACGTTACACGCAAATGCTGCCCGAAACGTTAT\n'
    # unknown, the 3' end does not map, impossible to know if it is chimeric
    query13 = '>seq7 1:Y:18:ATCACG\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCT'
    query13 += 'ACATTGAACTT'
    query13 += 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
    query14 = '>seq7 2:Y:18:ATCACG\nATCATTGCATAAGTAACACTCAACCAACAGTGCTACAG'
    query14 += 'GGTTGTAACGCC'
    query14 += 'CCTCGAAGGTACCTTTGCCAGACTGGGCTACAGGACACCCAGTCTCCCGGGAGTCT\n'
    query = query1 + query2 + query3 + query4 + query13 + query14
    in_fhand = NamedTemporaryFile()
    in_fhand.write(query)
    in_fhand.flush()
    ref_fhand = NamedTemporaryFile()
    ref_fhand.write(reference_seq)
    ref_fhand.flush()
    out_fhand = NamedTemporaryFile()
    chimeras_fhand = NamedTemporaryFile()
    unknown_fhand = NamedTemporaryFile()
    filter_chimeras(ref_fhand.name, out_fhand, chimeras_fhand, [in_fhand],
                    unknown_fhand)
    result = read_seqs([out_fhand])
    chimeric = read_seqs([chimeras_fhand])
    unknown = read_seqs([unknown_fhand])
    for seq in result:
        assert get_name(seq) in ['seq1.f', 'seq1.r']
    for seq in chimeric:
        assert get_name(seq) in ['seq2.f', 'seq2.r']
    for seq in unknown:
        assert get_name(seq) in ['seq7.f', 'seq7.r']

def _do_check(self, seq):
    count = self._read_counts[get_name(seq)]
    kb_len = count['length'] / 1000
    rpk = count['mapped_reads'] / kb_len  # reads per kilobase
    rpkm = rpk / self._million_reads  # reads per kilobase per million reads
    return True if rpkm >= self._min_rpkm else False

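# Worked example of the RPKM arithmetic above (all numbers are made up): a
# 2000 bp transcript with 400 mapped reads in an 8-million-read library.
kb_len = 2000 / 1000.0  # transcript length in kilobases
rpk = 400 / kb_len      # 200.0 reads per kilobase
rpkm = rpk / 8.0        # 25.0 reads per kilobase per million mapped reads
assert rpkm >= 20       # would pass an example min_rpkm threshold of 20
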
def test_trim_seqs():
    'It tests the trim seq function'
    seqs = []
    seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('aaCTTTC')), None)])
    seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('CTTCaa')), None)])
    seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('aaCTCaa')), None)])
    seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('actg')), None)])
    seqs.append([SeqWrapper(SEQRECORD, SeqRecord(Seq('AC')), None)])
    trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
    trim_lowercased_seqs = TrimLowercasedLetters()
    trim = TrimOrMask()  # pylint: disable=W0141
    trim_packet = trim(trim_lowercased_seqs(trim_packet))
    res = [get_str_seq(s) for l in trim_packet[SEQS_PASSED] for s in l]
    assert res == ['CTTTC', 'CTTC', 'CTC', 'AC']

    seqs = []
    seq = SeqItem('s', ['>s\n', 'aaCTTTC\n'])
    seqs.append([SeqWrapper(SEQITEM, seq, 'fasta')])
    trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
    trim_packet = trim(trim_lowercased_seqs(trim_packet))
    res = [get_str_seq(s) for l in trim_packet[SEQS_PASSED] for s in l]
    assert res == ['CTTTC']

    # with pairs
    seq = SeqItem('s.f', ['>s.f\n', 'aaCTTTC\n'])
    seq1 = SeqItem('s.r', ['>s.r\n', 'aaCTTTC\n'])
    seq2 = SeqItem('s1.f', ['>s1.f\n', 'aa\n'])
    seq3 = SeqItem('s1.r', ['>s1.r\n', 'aaCTTTC\n'])
    seqs = []
    seqs.append([SeqWrapper(SEQITEM, seq, 'fasta'),
                 SeqWrapper(SEQITEM, seq1, 'fasta')])
    seqs.append([SeqWrapper(SEQITEM, seq2, 'fasta'),
                 SeqWrapper(SEQITEM, seq3, 'fasta')])
    trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
    trim_lowercased_seqs = TrimLowercasedLetters()
    trim = TrimOrMask()  # pylint: disable=W0141
    trim_packet = trim(trim_lowercased_seqs(trim_packet))
    res = [get_str_seq(s) for l in trim_packet[SEQS_PASSED] for s in l]
    orphan_res = [get_str_seq(s) for s in trim_packet[ORPHAN_SEQS]]
    assert orphan_res == ['CTTTC']
    assert ['CTTTC', 'CTTTC'] == res

    # no drag
    trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
    trim_lowercased_seqs = TrimLowercasedLetters()
    trim = TrimOrMask()  # pylint: disable=W0141
    trim_packet = trim(trim_lowercased_seqs(trim_packet))
    res = [get_str_seq(s) for l in trim_packet[SEQS_PASSED] for s in l]
    orphan_res = [get_name(s) for s in trim_packet[ORPHAN_SEQS]]
    assert orphan_res == ['s1.r']
    assert ['CTTTC', 'CTTTC'] == res

def test_split_mate(self):
    'It tests the function that splits seqs using segments'
    # pylint: disable=W0212
    seq = 'aaatttccctt'
    seq = SeqWrapper(SEQRECORD, SeqRecord(Seq(seq), id='seq'), None)
    # fake class to test
    splitter = MatePairSplitter([seq])

    # segment at the beginning
    seqs = splitter._split_by_mate_linker(seq, ([(0, 3)], False))
    assert get_str_seq(seqs[0]) == 'ttccctt'
    assert get_name(seqs[0]) == 'seq'

    # segment at the end
    seqs = splitter._split_by_mate_linker(seq, ([(7, 11)], False))
    assert get_str_seq(seqs[0]) == 'aaatttc'
    assert get_name(seqs[0]) == 'seq'

    # segment in the middle
    seqs = splitter._split_by_mate_linker(seq, ([(4, 7)], True))
    assert get_str_seq(seqs[0]) == 'aaat'
    assert get_str_seq(seqs[1]) == 'ctt'
    assert get_name(seqs[0]) == 'seq_pl.part1'
    assert get_name(seqs[1]) == 'seq_pl.part2'

    seqs = splitter._split_by_mate_linker(seq, ([(4, 7)], False))
    assert get_name(seqs[0]) == r'seq\1'
    assert get_name(seqs[1]) == r'seq\2'

    seqs = splitter._split_by_mate_linker(seq, ([(4, 6), (8, 9)], False))
    assert get_str_seq(seqs[0]) == 'aaat'
    assert get_str_seq(seqs[1]) == 'c'
    assert get_str_seq(seqs[2]) == 't'
    assert get_name(seqs[0]) == 'seq_mlc.part1'

    # the whole sequence is linker
    seqs = splitter._split_by_mate_linker(seq, ([(0, 10)], False))
    assert not get_str_seq(seqs[0])

    # there are no segments
    seqs = splitter._split_by_mate_linker(seq, ([], False))
    assert get_name(seq) == get_name(seqs[0])
    assert get_str_seq(seq) == get_str_seq(seqs[0])

def __call__(self, seqrecords):
    'It does the work'
    if not seqrecords:
        return seqrecords
    matcher = Blaster(seqrecords, self.blastdb, self._program, self._dbtype,
                      filters=self._filters, params=self._params,
                      remote=self._remote)
    blasts = matcher.blasts
    blastdb = os.path.basename(self.blastdb)
    for seqrecord in seqrecords:
        align_result = blasts.get(get_name(seqrecord), None)
        if not align_result:
            continue
        match_counter = 0
        for match in align_result['matches']:
            subject = match['subject']['name']
            match_counter += 1
            for match_part in match['match_parts']:
                if match_part['subject_end'] < match_part['subject_start']:
                    strand = -1
                    subject_start = match_part['subject_end']
                    subject_end = match_part['subject_start']
                else:
                    strand = 1
                    subject_start = match_part['subject_start']
                    subject_end = match_part['subject_end']
                query_start = match_part['query_start']
                query_end = match_part['query_end']
                qualifiers = {}
                qualifiers['Target'] = {'start': subject_start,
                                        'end': subject_end,
                                        'name': subject}
                qualifiers['score'] = match_part['scores']['expect']
                qualifiers['identity'] = match_part['scores']['identity']
                qualifiers['blastdb'] = blastdb
                location = FeatureLocation(query_start, query_end, strand)
                feature = SeqFeature(location=location, type='match_part',
                                     qualifiers=qualifiers,
                                     id='match{0:03d}'.format(match_counter))
                seqrecord.object.features.append(feature)
    return seqrecords

def test_blastmatch_filter():
    'It tests the filter by blast short-oligo match'
    seq = 'CCAAAGTACGGTCTCCCAAGCGGTCTCTTACCGGACACCGTCACCGATTTCACCCTCT'
    oligo = 'GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT'
    seq_oligo = seq + oligo
    oligo = SeqRecord(Seq(oligo))
    oligo = SeqWrapper(SEQRECORD, oligo, None)
    seq = SeqRecord(Seq(seq), id='seq')
    seq = SeqWrapper(object=seq, kind=SEQRECORD, file_format=None)
    seq_oligo = SeqRecord(Seq(seq_oligo), id='seq_oligo')
    seq_oligo = SeqWrapper(object=seq_oligo, kind=SEQRECORD,
                           file_format=None)
    seqs = {SEQS_PASSED: [[seq], [seq_oligo]], SEQS_FILTERED_OUT: []}
    filter_ = FilterBlastShort([oligo])
    filt_packet = filter_(seqs)
    passed = [get_name(pair[0]) for pair in filt_packet[SEQS_PASSED]]
    fail = [get_name(pair[0]) for pair in filt_packet[SEQS_FILTERED_OUT]]
    assert passed == ['seq']
    assert fail == ['seq_oligo']

def _do_check(self, seq):
    seq_object = seq.object
    try:
        quals = seq_object.letter_annotations['phred_quality']
    except KeyError:
        msg = 'Some of the input sequences do not have qualities: {}'
        msg = msg.format(get_name(seq))
        raise WrongFormatError(msg)
    if self.ignore_masked:
        str_seq = str(seq_object.seq)
        seg_quals = [quals[segment[0]: segment[1] + 1]
                     for segment in get_uppercase_segments(str_seq)]
        qual = sum(sum(q) * len(q) for q in seg_quals) / len(quals)
    else:
        qual = sum(quals) / len(quals)
    return True if qual >= self.threshold else False

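# Worked example of the masked-average branch above (made-up values): for the
# read 'acGGTa' only the uppercase segment at positions 2-4 ('GGT')
# contributes, and each segment sum is weighted by its own length before
# dividing by the full read length.
quals = [10, 10, 30, 40, 20, 10]  # per-base Phred scores for 'acGGTa'
seg_quals = [[30, 40, 20]]        # qualities under the uppercase segment
qual = sum(sum(q) * len(q) for q in seg_quals) / len(quals)
assert qual == 45                 # (30 + 40 + 20) * 3 / 6
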
def _do_trim(self, seq): "It trims the masked segments of the seqrecords." window = self.window threshold = self.threshold trim_left = self.trim_left trim_right = self.trim_right try: quals = list(get_int_qualities(seq)) except KeyError: msg = "Some of the input sequences do not have qualities: {}" msg = msg.format(get_name(seq)) segments = _get_bad_quality_segments(quals, window, threshold, trim_left, trim_right) if segments is not None: _add_trim_segments(segments, seq, kind=QUALITY) return seq
def _do_trim(self, seq):
    'It trims the masked segments of the seqrecords.'
    window = self.window
    threshold = self.threshold
    trim_left = self.trim_left
    trim_right = self.trim_right
    try:
        quals = list(get_int_qualities(seq))
    except KeyError:
        msg = 'Some of the input sequences do not have qualities: {}'
        msg = msg.format(get_name(seq))
        raise WrongFormatError(msg)
    segments = _get_bad_quality_segments(quals, window, threshold,
                                         trim_left, trim_right)
    if segments is not None:
        _add_trim_segments(segments, seq, kind=QUALITY)
    return seq

def __call__(self, seqs):
    'It trims the masked segments of the SeqWrappers.'
    db_fhand = write_seqs(seqs, file_format='fasta')
    db_fhand.flush()
    params = {'task': 'blastn-short', 'expect': '0.0001'}
    filters = [{'kind': 'score_threshold', 'score_key': 'identity',
                'min_score': 89},
               {'kind': 'min_length', 'min_num_residues': 13,
                'length_in_query': False}]
    matcher = BlasterForFewSubjects(db_fhand.name, self.oligos,
                                    program='blastn', filters=filters,
                                    params=params,
                                    elongate_for_global=True)
    for seq in seqs:
        segments = matcher.get_matched_segments_for_read(get_name(seq))
        if segments is not None:
            _add_trim_segments(segments[0], seq, kind=VECTOR)
    return seqs

def __call__(self, seqs):
    'It trims the masked segments of the seqrecords.'
    window = self.window
    threshold = self.threshold
    trim_left = self.trim_left
    trim_right = self.trim_right
    trimmed_seqs = []
    for seq in seqs:
        try:
            quals = list(get_qualities(seq))
        except KeyError:
            msg = 'Some of the input sequences do not have qualities: {}'
            msg = msg.format(get_name(seq))
            raise WrongFormatError(msg)
        segments = _get_bad_quality_segments(quals, window, threshold,
                                             trim_left, trim_right)
        if segments is not None:
            _add_trim_segments(segments, seq, kind=QUALITY)
        trimmed_seqs.append(seq)
    return trimmed_seqs

def __call__(self, seqrecords):
    'It does the work'
    if not seqrecords:
        return seqrecords
    matcher = Blaster(seqrecords, self.blastdb, self._program, self._dbtype,
                      filters=self._filters, params=self._params,
                      remote=self._remote)
    blasts = matcher.blasts
    blastdb = os.path.basename(self.blastdb)
    for seqrecord in seqrecords:
        align_result = blasts.get(get_name(seqrecord), None)
        if not align_result:
            continue
        match_counter = 0
        for match in align_result['matches']:
            subject = match['subject']['name']
            match_counter += 1
            for match_part in match['match_parts']:
                if match_part['subject_end'] < match_part['subject_start']:
                    strand = -1
                    subject_start = match_part['subject_end']
                    subject_end = match_part['subject_start']
                else:
                    strand = 1
                    subject_start = match_part['subject_start']
                    subject_end = match_part['subject_end']
                query_start = match_part['query_start']
                query_end = match_part['query_end']
                qualifiers = {}
                qualifiers['Target'] = {'start': subject_start,
                                        'end': subject_end,
                                        'name': subject}
                qualifiers['score'] = match_part['scores']['expect']
                qualifiers['identity'] = match_part['scores']['identity']
                qualifiers['blastdb'] = blastdb
                location = FeatureLocation(query_start, query_end, strand)
                feature = SeqFeature(location=location, type='match_part',
                                     qualifiers=qualifiers,
                                     id='match{0:03d}'.format(match_counter))
                seqrecord.object.features.append(feature)
    return seqrecords

def _read_estcan_result(fhand, result, file_type):
    'It reads a dna or pep ESTscan result file'
    for seq in read_seqs([fhand], file_format='fasta'):
        items = [i.strip() for i in get_description(seq).split(';')]
        strand = -1 if 'minus strand' in items else 1
        start, end = items[0].split(' ', 3)[1:3]
        # estscan changes the name, we have to fix it
        seqid = get_name(seq).strip(';')
        try:
            seq_orfs = result[seqid]
        except KeyError:
            seq_orfs = {}
            result[seqid] = seq_orfs
        orf_key = (int(start), int(end), strand)
        if orf_key in seq_orfs:
            orf = seq_orfs[orf_key]
        else:
            orf = {}
            seq_orfs[orf_key] = orf
        orf[file_type] = get_str_seq(seq)

def _get_chrom_lengths(self):
    chrom_lens = OrderedDict()
    if self._ref_fhand is None:
        # without a reference, approximate each chromosome length with the
        # rightmost position seen for it in the VCF, e.g. data lines at
        # ('chr1', 100), ('chr1', 250), ('chr2', 75) yield
        # {'chr1': 250, 'chr2': 75}
        vcf_fhand = gzip.open(self._reader.fhand.name)
        for line in vcf_fhand:
            line = line.strip()
            if line.startswith('#'):
                continue
            items = line.split()
            chrom = items[0]
            loc = int(items[1])
            if chrom not in chrom_lens:
                chrom_lens[chrom] = loc
            elif loc > chrom_lens[chrom]:
                chrom_lens[chrom] = loc
    else:
        for read in read_seqs([self._ref_fhand]):
            chrom_lens[get_name(read)] = get_length(read)
    return chrom_lens

def _do_trim(self, seq):
    'It trims the masked segments of the SeqWrappers.'
    segments = self._matcher.get_matched_segments_for_read(get_name(seq))
    if segments is not None:
        _add_trim_segments(segments[0], seq, kind=VECTOR)
    return seq

def test_classify_paired_reads(self):
    reference_seq = GENOME
    # typical non-chimeric
    query1 = '>seq1 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT\n'
    query2 = '>seq1 r\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG\n'
    # typical chimeric
    query3 = '>seq2 f\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCCTG'
    query3 += 'GGTAGACTGAGGCCTTCTCGAACTACAAATCATCACCAGACCATGTCCGA\n'
    query4 = '>seq2 r\nTTAAGGCACGTACGGTACCTAAATCGGCCTGATGGTATTGATGCTGAACTT'
    query4 += 'ATTGCGGCTCACACACCCCTACGTTACACGCAAATGCTGCCCGAAACGTTAT\n'
    # PE-like chimera, the 5' end does not map
    query5 = '>seq3 f\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
    query5 += 'AAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCCTG\n'
    query6 = '>seq3 r\nTTAAGGCACGTACGGTACCTAAATCGGCCTGATGGTATTGATGCTGAACTT'
    query6 += 'ATTGCGGCTCACACACCCCTACGTTACACGCAAATGCTGCCCGAAACGTTAT\n'
    # non-chimeric read fragmented into two different sequences
    query7 = '>seq4 f\nCAAATCATCACCAGACCATGTCCGATCCCGGGAGTCTTTTCCAAGGTGTGC'
    # first part of the f sequence not detected -> unknown instead of mapped
    query7 += 'TCTTTATCCGGCCCTTGCTCAAGGGTATGTTAAAACGGCAAGAGCTGCCTGAGCGCG\n'
    query8 = '>seq4 r\nTGTTCTGCAATCGATACAACGATCGAATTTAATCTGAGTAACTGCCAATTC'
    query8 += 'TGAGTAATATTATAGAAAGT\n'
    # chimeric reads mapping to different reference sequences
    query9 = '>seq5 f\nTTTATCCGGCCCTTGCTCAAGGGTATGTTAAAACGGCAAGAGCTGC'
    query9 += 'CTGAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCCTGTGG\n'
    query10 = '>seq5 r\nACTTATTGCGGCTCACACACCCCTACGTTACACGCAAATGCTGCCCGAAA'
    query10 += 'CGTTATCTGCGGTGAAATGATGTTCGCGGAGCTGACTATCGTCGCCTGATGATAAG\n'
    query11 = '>seq6 f\nACGCACTGATTGTGCTAGGGCCACAGTAGCGGAGATGATTAAGCAGCGAC'
    query11 += 'AACTACAAATCATCACCAGACCATGTCCGATCCCGGGAGTCTTTTCCAAGGTGTGC\n'
    query12 = '>seq6 r\nAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA'
    query12 += 'TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT\n'
    # unknown, the 3' end does not map, impossible to know if it is chimeric
    query13 = '>seq7 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
    query13 += 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
    query14 = '>seq7 r\nATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACGCC'
    query14 += 'CCTCGAAGGTACCTTTGCCAGACTGGGCTACAGGACACCCAGTCTCCCGGGAGTCT\n'
    # chimeric sequences with wrong direction
    query15 = '>seq8 f\nTAGGGCCGTCGCCGCATCCACGTTATCGGAAGGGCAACTTCGTCTCTCCA'
    query15 += 'ATCAGCTACCGAATTGGGACCTCTACGGGAGTATGGAACGATTGA\n'
    query16 = '>seq8 r\nAGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
    query16 += 'GATGATTTGTAGTTCGAGAAGGCCTCAGTCTACCGCGCCGTGGGTGCCCGATCCCT\n'
    # chimeric sequences with wrong direction
    query18 = '>seq9 r\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCCT'
    query18 += 'GTGGACTTTCTATAATATTACTCAGAATTGGCAGTTACTCAGATTAAATTCG\n'
    query17 = '>seq9 f\nGCACACCTTGGAAAAGACTCCCGGGATCGGACATGGTCTGGTGATGATTT'
    query17 += 'GTAGTTCGAGAAGGCCTCAGTCTACCGCGCCGTGGGTGCCCGATCCCTCCTCTAGC\n'
    # unknown, wrong relative positions <== = =>
    query19 = '>seq10 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
    query19 += '\n'
    query20 = '>seq10 r\nATGTAATACGGGCTAGCCGGGGATGCCGACGATTAAACACGCTGTCATA'
    query20 += 'GTAGCGTCTTCCTAGGGTTTTCCCCATGGAATCGGTTATCGTGATACGTTAAATTT\n'
    # unknown, wrong relative positions ==> <= =
    query21 = '>seq11 f\nAAGTTCAATGTAGCAAGGGTACATGCTGACGAGATGGGTCTGCGATCCC'
    query21 += '\n'
    query22 = '>seq11 r\nAAATTTAACGTATCACGATAACCGATTCCATGGGGAAAACCCTAGGAAG'
    query22 += 'ACGCTACTATGACAGCGTGTTTAATCGTCGGCATCCCCGGCTAGCCCGTATTACAT\n'
    forward = query1 + query3 + query5 + query7 + query9 + query11
    forward += query13 + query15 + query17 + query19 + query21
    reverse = query2 + query4 + query6 + query8 + query10 + query12
    reverse += query14 + query16 + query18 + query20 + query22
    f_fhand = NamedTemporaryFile()
    f_fhand.write(forward)
    f_fhand.flush()
    r_fhand = NamedTemporaryFile()
    r_fhand.write(reverse)
    r_fhand.flush()
    paired_fpaths = [f_fhand.name, r_fhand.name]
    ref_fhand = NamedTemporaryFile()
    ref_fhand.write(reference_seq)
    ref_fhand.flush()
    # the kind is given per pair of mates
    bamfile = _sorted_mapped_reads(ref_fhand.name,
                                   paired_fpaths=paired_fpaths)
    result = classify_mapped_reads(bamfile, file_format='fasta')
    mapped = [['seq1.f', 'seq1.r'], ['seq4.f', 'seq4.r']]
    non_contiguous = [['seq2.f', 'seq2.r'], ['seq3.f', 'seq3.r'],
                      ['seq5.f', 'seq5.r'], ['seq6.f', 'seq6.r'],
                      ['seq10.f', 'seq10.r'], ['seq11.f', 'seq11.r'],
                      ['seq8.f', 'seq8.r']]
    unknown = [['seq7.f', 'seq7.r'], ['seq9.f', 'seq9.r'],
               ['seq4.f', 'seq4.r']]
    expected = {'non_chimeric': mapped, 'chimera': non_contiguous,
                'unknown': unknown}
    for pair in result:
        try:
            names = [get_name(read) for read in pair[0]]
            assert names in expected[pair[1]]
        except AssertionError:
            str_names = ' '.join(names)
            msg = str_names + ' not expected to be ' + pair[1]
            raise AssertionError(msg)

def _do_check(self, seq):
    return True if get_name(seq) in self.seq_ids else False

import unittest

from crumbs.filters import (FilterByLength, FilterById, FilterByQuality,
                            FilterBlastMatch, FilterDustComplexity,
                            seq_to_filterpackets, FilterByRpkm, FilterByBam,
                            FilterBowtie2Match, FilterByFeatureTypes)
from crumbs.utils.bin_utils import BIN_DIR
from crumbs.utils.test_utils import TEST_DATA_DIR
from crumbs.utils.tags import (NUCL, SEQS_FILTERED_OUT, SEQS_PASSED, SEQITEM,
                               SEQRECORD)
from crumbs.utils.file_utils import TemporaryDir
from crumbs.seq import get_name, get_str_seq, SeqWrapper
from crumbs.mapping import get_or_create_bowtie2_index
from crumbs.seqio import read_seq_packets

_seqs_to_names = lambda seqs: [get_name(s) for pair in seqs for s in pair]
_seqs_to_str_seqs = lambda seqs: [get_str_seq(s) for pair in seqs
                                  for s in pair]


class PacketConversionTest(unittest.TestCase):
    'It tests the seqs and filter packet conversion'

    def test_seqs_to_filter_packets(self):
        'It converts seq packets into filter packets'
        seqpackets = [['ACT'], ['CTG', 'TTT']]
        filter_packets = list(seq_to_filterpackets(iter(seqpackets)))
        expected = [[['ACT']], [['CTG'], ['TTT']]]
        assert [p[SEQS_PASSED] for p in filter_packets] == expected
        assert [p[SEQS_FILTERED_OUT] for p in filter_packets] == [[], []]


def _create_seqrecord(string):

def _do_check(self, seq):
    return False if get_name(seq) in self.mapped_reads else True

def _do_check(self, seq):
    segments = self._matcher.get_matched_segments(get_name(seq))
    return True if segments is None else False

import unittest

from Bio.SeqFeature import SeqFeature, FeatureLocation

from crumbs.filters import (FilterByLength, FilterById, FilterByQuality,
                            FilterBlastMatch, FilterBlastShort,
                            FilterDustComplexity, seq_to_filterpackets,
                            FilterByRpkm, FilterByBam, FilterAllNs,
                            FilterBowtie2Match, FilterByFeatureTypes)
from crumbs.utils.bin_utils import BIN_DIR
from crumbs.utils.test_utils import TEST_DATA_DIR
from crumbs.utils.tags import (NUCL, SEQS_FILTERED_OUT, SEQS_PASSED, SEQITEM,
                               SEQRECORD)
from crumbs.seq import get_name, get_str_seq, SeqWrapper
from crumbs.seqio import read_seq_packets

_seqs_to_names = lambda seqs: [get_name(s) for pair in seqs for s in pair]
_seqs_to_str_seqs = lambda seqs: [get_str_seq(s) for pair in seqs
                                  for s in pair]


class PacketConversionTest(unittest.TestCase):
    'It tests the seqs and filter packet conversion'

    def test_seqs_to_filter_packets(self):
        'It converts seq packets into filter packets'
        seqpackets = [['ACT'], ['CTG', 'TTT']]
        filter_packets = list(seq_to_filterpackets(iter(seqpackets)))
        expected = [[('ACT',)], [('CTG',), ('TTT',)]]
        assert [p[SEQS_PASSED] for p in filter_packets] == expected
        assert [p[SEQS_FILTERED_OUT] for p in filter_packets] == [[], []]


def _create_seqrecord(string):

def _get_seq_lengths(fhand):
    return {get_name(seq): get_length(seq) for seq in read_seqs([fhand])}

class MatePairSplitter(object):
    'It splits the input sequences with the provided linkers.'

    def __init__(self, linkers=None):
        'The initiator'
        if linkers is None:
            linkers = get_setting('LINKERS')
            linkers = [SeqItem(str(i), '>%d\n%s\n' % (i, l))
                       for i, l in enumerate(linkers)]
            linkers = assing_kind_to_seqs(SEQITEM, linkers, 'fasta')
        self.linkers = list(linkers)

    def __call__(self, seqs):
        'It splits a list of sequences with the provided linkers'
        seq_fhand = write_seqs(seqs, file_format='fasta')
        seq_fhand.flush()
        min_identity = 87.0
        min_len = 13
        filters = [{'kind': 'min_length', 'min_num_residues': min_len,
                    'length_in_query': False, 'filter_match_parts': True},
                   {'kind': 'score_threshold', 'score_key': 'identity',
                    'min_score': min_identity}]
        matcher = BlasterForFewSubjects(seq_fhand.name, self.linkers,
                                        program='blastn', filters=filters,
                                        params={'task': 'blastn-short'},
                                        elongate_for_global=True,
                                        seqs_type=NUCL)
        new_seqs = []
        for seq in seqs:
            segments = matcher.get_matched_segments_for_read(get_name(seq))
            if segments is not None:
                split_seqs = self._split_by_mate_linker(seq, segments)
            else:
                split_seqs = [seq]
            new_seqs.extend(split_seqs)
        return new_seqs

    def _split_by_mate_linker(self, seq, (segments, is_partial)):
        'It splits the seqs using segments'
        if not segments:
            return [copy_seq(seq)]
        elongated_match = is_partial
        if len(segments) == 1:
            segment_start = segments[0][0]
            segment_end = segments[0][1]
            seq_end = get_length(seq) - 1
            if segment_start == 0:
                new_seq = slice_seq(seq, segment_end + 1, None)
                return [new_seq]
            elif segment_end == seq_end:
                new_seq = slice_seq(seq, None, segment_start)
                return [new_seq]
            elif segment_end > seq_end:
                msg = 'The segment ends after the sequence has ended'
                raise RuntimeError(msg)
            else:
                new_seq1 = slice_seq(seq, None, segment_start)
                new_seq2 = slice_seq(seq, segment_end + 1, None)
                if elongated_match:
                    name = get_name(seq) + '_pl'
                else:
                    name = get_name(seq)
                new_seq1 = copy_seq(new_seq1, name=name + r'\1')
                new_seq2 = copy_seq(new_seq2, name=name + r'\2')
                return [new_seq1, new_seq2]
        else:
            seqs = []
            counter = 1
            seq_start = 0
            for segment_start, segment_end in segments:
                if segment_start == 0:
                    continue
                new_seq = slice_seq(seq, seq_start, segment_start)
                seq_name = get_name(seq) + '_mlc.part{0:d}'.format(counter)
                new_seq = copy_seq(new_seq, name=seq_name)
                seqs.append(new_seq)
                counter += 1
                seq_start = segment_end + 1
            else:
                # the for loop never breaks, so this always runs after the
                # last linker segment: add the remaining 3' fragment
                if segment_end != get_length(seq) + 1:
                    new_seq = slice_seq(seq, segment_end + 1, None)
                    name = get_name(seq) + '_mlc.part{0:d}'.format(counter)
                    new_seq = copy_seq(new_seq, name=name)
                    seqs.append(new_seq)
            return seqs

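# A minimal usage sketch for MatePairSplitter (the file paths are
# assumptions, and write_seqs is assumed to accept an output handle plus a
# file_format keyword as it does elsewhere in this codebase): split 454
# mate-pair reads on the linkers, packet by packet, as the tests above do.
from crumbs.seqio import read_seq_packets, read_seqs, write_seqs

linkers = list(read_seqs([open('linkers.fasta')]))
splitter = MatePairSplitter(linkers=linkers)
split_seqs = []
for packet in read_seq_packets([open('454_reads.fastq')], 100):
    split_seqs.extend(splitter(packet))
out_fhand = open('split_reads.fastq', 'w')
write_seqs(split_seqs, out_fhand, file_format='fastq')
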