def test_slice(self):
    'Slicing works for fasta, fastq and multiline fastq SeqItems.'
    # Fasta record whose sequence is split over two lines.
    fasta = SeqWrapper(SEQITEM,
                       SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n']),
                       'fasta')
    expected = SeqWrapper(SEQITEM,
                          SeqItem(name='s1', lines=['>s1\n', 'CTGG\n']),
                          'fasta')
    assert slice_seq(fasta, 1, 5) == expected

    # Fastq: the sequence and the quality lines are both sliced.
    fastq = SeqWrapper(SEQITEM,
                       SeqItem(name='seq',
                               lines=['@seq\n', 'aata\n', '+\n', '!?!?\n']),
                       'fastq')
    sliced = slice_seq(fastq, 1, 3)
    assert list(get_qualities(sliced)) == [30, 0]
    assert get_str_seq(sliced) == 'at'
    assert sliced.object.lines == ['@seq\n', 'at\n', '+\n', '?!\n']

    # Multiline fastq.
    multiline_lines = ['@seq\n', 'aaat\n', 'caaa\n', '+\n', '@AAA\n',
                       'BBBB\n']
    multiline = SeqWrapper(SEQITEM,
                           SeqItem(name='seq', lines=multiline_lines),
                           'fastq-illumina-multiline')
    sliced = slice_seq(multiline, 1, 5)
    assert list(get_qualities(sliced)) == [1, 1, 1, 2]
    assert get_str_seq(sliced) == get_str_seq(multiline)[1:5]

    # Open-ended slices: either bound may be None.
    raw = 'aCTG'
    open_ended = SeqWrapper(SEQITEM, SeqItem('seq', ['>seq\n', raw]), 'fasta')
    assert get_str_seq(slice_seq(open_ended, 1, None)) == raw[1:]
    assert get_str_seq(slice_seq(open_ended, None, 1)) == raw[:1]
def test_trimming(self):
    'The sequences are trimmed according to the recommendations.'
    record = SeqRecord(Seq('gggtctcatcatcaggg'.upper()),
                       annotations={TRIMMING_RECOMMENDATIONS: {}})
    seq = SeqWrapper(SEQRECORD, record, None)
    trim_rec = get_annotations(seq)[TRIMMING_RECOMMENDATIONS]
    seq_trimmer = TrimOrMask()

    def trim_with(**recommendations):
        # The recommendations are stored in the annotations again before
        # every run of the trimmer.
        trim_rec.update(recommendations)
        get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
        return seq_trimmer([seq])

    trimmed = trim_with(vector=[(0, 3), (8, 13)])
    assert get_str_seq(trimmed[0]) == 'CTCA'

    trimmed = trim_with(vector=[(0, 0), (8, 13)])
    assert get_str_seq(trimmed[0]) == 'GGTCTCA'

    # Vector plus quality recommendations: no sequence is returned.
    trimmed = trim_with(vector=[(0, 1), (8, 12)],
                        quality=[(1, 8), (13, 17)])
    assert not trimmed

    trimmed = trim_with(vector=[(0, 0), (8, 13)], quality=[])
    assert get_str_seq(trimmed[0]) == 'GGTCTCA'
    assert TRIMMING_RECOMMENDATIONS not in get_annotations(trimmed[0])
def test_slice(self):
    'Slicing works for fasta, fastq and multiline fastq SeqItems.'
    # Fasta record whose sequence is split over two lines.
    fasta = SeqWrapper(SEQITEM,
                       SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n']),
                       'fasta')
    expected = SeqWrapper(SEQITEM,
                          SeqItem(name='s1', lines=['>s1\n', 'CTGG\n']),
                          'fasta')
    assert slice_seq(fasta, 1, 5) == expected

    # Fastq: the sequence and the quality lines are both sliced.
    fastq = SeqWrapper(SEQITEM,
                       SeqItem(name='seq',
                               lines=['@seq\n', 'aata\n', '+\n', '!?!?\n']),
                       'fastq')
    sliced = slice_seq(fastq, 1, 3)
    assert list(get_qualities(sliced)) == [30, 0]
    assert get_str_seq(sliced) == 'at'
    assert sliced.object.lines == ['@seq\n', 'at\n', '+\n', '?!\n']

    # Multiline fastq.
    multiline_lines = ['@seq\n', 'aaat\n', 'caaa\n', '+\n', '@AAA\n',
                       'BBBB\n']
    multiline = SeqWrapper(SEQITEM,
                           SeqItem(name='seq', lines=multiline_lines),
                           'fastq-illumina-multiline')
    sliced = slice_seq(multiline, 1, 5)
    assert list(get_qualities(sliced)) == [1, 1, 1, 2]
    assert get_str_seq(sliced) == get_str_seq(multiline)[1:5]

    # Open-ended slices: either bound may be None.
    raw = 'aCTG'
    open_ended = SeqWrapper(SEQITEM, SeqItem('seq', ['>seq\n', raw]), 'fasta')
    assert get_str_seq(slice_seq(open_ended, 1, None)) == raw[1:]
    assert get_str_seq(slice_seq(open_ended, None, 1)) == raw[:1]
def _seqitem_pairs_equal(pair1, pair2):
    'It tells if both pairs hold the same sequence strings in the same order'
    if len(pair1) != len(pair2):
        return False
    # Reads are compared position by position, stopping at the first mismatch.
    return all(get_str_seq(left) == get_str_seq(right)
               for left, right in zip(pair1, pair2))
def test_trim_seqs():
    'It tests the lowercase trimming on single reads and on pairs'

    def passed_strs(packet):
        return [get_str_seq(read)
                for group in packet[SEQS_PASSED] for read in group]

    # SeqRecord based sequences, one read per group.
    seqs = [[SeqWrapper(SEQRECORD, SeqRecord(Seq(raw)), None)]
            for raw in ('aaCTTTC', 'CTTCaa', 'aaCTCaa', 'actg', 'AC')]
    trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
    trim_lowercased_seqs = TrimLowercasedLetters()
    trim = TrimOrMask()
    # pylint: disable=W0141
    trim_packet = trim(trim_lowercased_seqs(trim_packet))
    assert passed_strs(trim_packet) == ['CTTTC', 'CTTC', 'CTC', 'AC']

    # A single SeqItem based sequence.
    seq = SeqItem('s', ['>s\n', 'aaCTTTC\n'])
    seqs = [[SeqWrapper(SEQITEM, seq, 'fasta')]]
    trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
    trim_packet = trim(trim_lowercased_seqs(trim_packet))
    assert passed_strs(trim_packet) == ['CTTTC']

    # with pairs
    seq = SeqItem('s.f', ['>s.f\n', 'aaCTTTC\n'])
    seq1 = SeqItem('s.r', ['>s.r\n', 'aaCTTTC\n'])
    seq2 = SeqItem('s1.f', ['>s1.f\n', 'aa\n'])
    seq3 = SeqItem('s1.r', ['>s1.r\n', 'aaCTTTC\n'])
    seqs = [
        [SeqWrapper(SEQITEM, seq, 'fasta'),
         SeqWrapper(SEQITEM, seq1, 'fasta')],
        [SeqWrapper(SEQITEM, seq2, 'fasta'),
         SeqWrapper(SEQITEM, seq3, 'fasta')],
    ]
    trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
    trim_lowercased_seqs = TrimLowercasedLetters()
    trim = TrimOrMask()
    # pylint: disable=W0141
    trim_packet = trim(trim_lowercased_seqs(trim_packet))
    orphan_strs = [get_str_seq(read) for read in trim_packet[ORPHAN_SEQS]]
    assert orphan_strs == ['CTTTC']
    assert ['CTTTC', 'CTTTC'] == passed_strs(trim_packet)

    # no drag
    trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
    trim_lowercased_seqs = TrimLowercasedLetters()
    trim = TrimOrMask()
    # pylint: disable=W0141
    trim_packet = trim(trim_lowercased_seqs(trim_packet))
    orphan_names = [get_name(read) for read in trim_packet[ORPHAN_SEQS]]
    assert orphan_names == ['s1.r']
    assert ['CTTTC', 'CTTTC'] == passed_strs(trim_packet)
def test_str_seq(self):
    'get_str_seq returns the sequence string of fasta and fastq SeqItems.'
    cases = (
        # (name, lines, file format, expected sequence string)
        ('s1', ['>s1\n', 'ACTGGTAC\n'], 'fasta', 'ACTGGTAC'),
        ('seq', ['@seq\n', 'aaaa\n', '+\n', '????\n'], 'fastq', 'aaaa'),
    )
    for name, lines, file_format, expected in cases:
        wrapped = SeqWrapper(SEQITEM, SeqItem(name=name, lines=lines),
                             file_format)
        assert get_str_seq(wrapped) == expected
def test_pair_grouper():
    "It groups four reads (s1.f, s1.r, s2.f, s2.r) into two pairs."
    specs = (
        ("s1", ">s1.f\n", "A\n"),
        ("s1", ">s1.r\n", "C\n"),
        ("s2", ">s2.f\n", "T\n"),
        ("s2", ">s2.r\n", "G\n"),
    )
    seqs = tuple(SeqWrapper(SEQITEM, SeqItem(name, [title, residues]), "fasta")
                 for name, title, residues in specs)
    pairs = list(group_seqs_in_pairs(seqs))
    assert [get_str_seq(read) for read in pairs[0]] == ["A", "C"]
    assert [get_str_seq(read) for read in pairs[1]] == ["T", "G"]
    assert len(pairs) == 2
def test_pair_grouper():
    'It groups four reads (s1.f, s1.r, s2.f, s2.r) into two pairs.'
    specs = (
        ('s1', '>s1.f\n', 'A\n'),
        ('s1', '>s1.r\n', 'C\n'),
        ('s2', '>s2.f\n', 'T\n'),
        ('s2', '>s2.r\n', 'G\n'),
    )
    seqs = tuple(SeqWrapper(SEQITEM, SeqItem(name, [title, residues]), 'fasta')
                 for name, title, residues in specs)
    pairs = list(group_seqs_in_pairs(seqs))
    assert [get_str_seq(read) for read in pairs[0]] == ['A', 'C']
    assert [get_str_seq(read) for read in pairs[1]] == ['T', 'G']
    assert len(pairs) == 2
def test_pair_grouper(self):
    'group_pairs groups the reads with the default and with a group size of 1.'

    def strs(reads):
        return [get_str_seq(read) for read in reads]

    # Default grouping.
    reads = _build_some_paired_seqs()
    pairs = list(group_pairs(reads))
    assert strs(pairs[0]) == ['A', 'C']
    assert strs(pairs[1]) == ['T', 'G']
    assert len(pairs) == 2

    # Groups of a single read, with the name check switched on.
    reads = _build_some_paired_seqs()
    singles = list(group_pairs(reads, n_seqs_in_pair=1,
                               check_name_matches=True))
    flattened = [get_str_seq(read) for group in singles for read in group]
    assert flattened == ['A', 'C', 'T', 'G']
def test_pair_grouper(self):
    'group_pairs groups the reads with the default and with a group size of 1.'

    def strs(reads):
        return [get_str_seq(read) for read in reads]

    # Default grouping.
    reads = _build_some_paired_seqs()
    pairs = list(group_pairs(reads))
    assert strs(pairs[0]) == ['A', 'C']
    assert strs(pairs[1]) == ['T', 'G']
    assert len(pairs) == 2

    # Groups of a single read, with the name check switched on.
    reads = _build_some_paired_seqs()
    singles = list(group_pairs(reads, n_seqs_in_pair=1,
                               check_name_matches=True))
    flattened = [get_str_seq(read) for group in singles for read in group]
    assert flattened == ['A', 'C', 'T', 'G']
def test_n_seqs_check(self):
    'An incomplete last pair raises unless check_all_same_n_seqs is False.'
    # The last read is dropped, so the final group holds a single read.
    reads = _build_some_paired_seqs()[:-1]
    try:
        list(group_pairs(reads, n_seqs_in_pair=2))
    except InterleaveError:
        pass
    else:
        self.fail('InterleaveError expected')

    pairs = list(group_pairs(reads, n_seqs_in_pair=2,
                             check_all_same_n_seqs=False))
    assert [get_str_seq(read) for read in pairs[0]] == ['A', 'C']
    assert [get_str_seq(read) for read in pairs[1]] == ['T']
def test_n_seqs_check(self):
    'An incomplete last pair raises unless check_all_same_n_seqs is False.'
    # The last read is dropped, so the final group holds a single read.
    reads = _build_some_paired_seqs()[:-1]
    try:
        list(group_pairs(reads, n_seqs_in_pair=2))
    except InterleaveError:
        pass
    else:
        self.fail('InterleaveError expected')

    pairs = list(group_pairs(reads, n_seqs_in_pair=2,
                             check_all_same_n_seqs=False))
    assert [get_str_seq(read) for read in pairs[0]] == ['A', 'C']
    assert [get_str_seq(read) for read in pairs[1]] == ['T']
def test_pair_grouper(self):
    'group_pairs_by_name groups adjacent reads and can require full pairs.'

    def strs(reads):
        return [get_str_seq(read) for read in reads]

    # Reads in order: two complete pairs.
    reads = _build_some_paired_seqs()
    groups = list(group_pairs_by_name(reads))
    assert strs(groups[0]) == ['A', 'C']
    assert strs(groups[1]) == ['T', 'G']
    assert len(groups) == 2

    # The two middle reads are swapped: every read ends up on its own.
    shuffled = reads[0], reads[2], reads[1], reads[3]
    groups = list(group_pairs_by_name(shuffled))
    assert strs(groups[0]) == ['A']
    assert strs(groups[1]) == ['T']
    assert strs(groups[2]) == ['C']
    assert strs(groups[3]) == ['G']
    assert len(groups) == 4

    # Last read missing: the final group holds one read.
    truncated = _build_some_paired_seqs()[:-1]
    groups = list(group_pairs_by_name(truncated))
    assert strs(groups[0]) == ['A', 'C']
    assert strs(groups[1]) == ['T']

    # The same input fails when all the pairs must have the same size.
    truncated = _build_some_paired_seqs()[:-1]
    try:
        groups = list(group_pairs_by_name(truncated,
                                          all_pairs_same_n_seqs=True))
    except InterleaveError:
        pass
    else:
        self.fail('InterleaveError expected')
def test_pair_grouper(self):
    'group_pairs_by_name groups adjacent reads and can require full pairs.'

    def strs(reads):
        return [get_str_seq(read) for read in reads]

    # Reads in order: two complete pairs.
    reads = _build_some_paired_seqs()
    groups = list(group_pairs_by_name(reads))
    assert strs(groups[0]) == ['A', 'C']
    assert strs(groups[1]) == ['T', 'G']
    assert len(groups) == 2

    # The two middle reads are swapped: every read ends up on its own.
    shuffled = reads[0], reads[2], reads[1], reads[3]
    groups = list(group_pairs_by_name(shuffled))
    assert strs(groups[0]) == ['A']
    assert strs(groups[1]) == ['T']
    assert strs(groups[2]) == ['C']
    assert strs(groups[3]) == ['G']
    assert len(groups) == 4

    # Last read missing: the final group holds one read.
    truncated = _build_some_paired_seqs()[:-1]
    groups = list(group_pairs_by_name(truncated))
    assert strs(groups[0]) == ['A', 'C']
    assert strs(groups[1]) == ['T']

    # The same input fails when all the pairs must have the same size.
    truncated = _build_some_paired_seqs()[:-1]
    try:
        groups = list(group_pairs_by_name(truncated,
                                          all_pairs_same_n_seqs=True))
    except InterleaveError:
        pass
    else:
        self.fail('InterleaveError expected')
def guess_seq_type(fhand):
    '''It guesses if the file is nucleotide or protein.

    Only the first 1024 characters of the file are peeked and parsed.

    It returns 'prot' as soon as a letter found only in the extended protein
    alphabet is seen, and 'nucl' when more than 80% of the letters are one of
    'gcatnuGCATNU'.

    It raises UnknownFormatError if the file is empty and RuntimeError when
    the type can not be decided (that includes a chunk with no letters).
    '''
    rna = set(ambiguous_rna_letters)
    dna = set(ambiguous_dna_letters)
    rna_dna = rna.union(dna)

    protein = set(extended_protein_letters)
    # Kept as a set: it is only used for membership tests.
    only_prot = protein.difference(rna_dna)

    chunk_size = 1024
    chunk = peek_chunk_from_file(fhand, chunk_size)
    if not chunk:
        raise UnknownFormatError('The file is empty')
    fhand_ = cStringIO.StringIO(chunk)

    total_letters = 0
    nucleotides = 0
    for seq in read_seqs([fhand_]):
        for letter in get_str_seq(seq):
            total_letters += 1
            if letter in 'gcatnuGCATNU':
                nucleotides += 1
            if letter in only_prot:
                return 'prot'

    # Without letters there is nothing to base the guess on; this used to
    # fail with a ZeroDivisionError.
    if not total_letters:
        raise RuntimeError('unable to guess the seq type')

    # float() forces a true division even where "/" truncates the integers.
    nucl_freq = float(nucleotides) / total_letters
    if nucl_freq > 0.8:
        return 'nucl'
    raise RuntimeError('unable to guess the seq type')
def _annotate_polya(seq, min_len, max_cont_mismatches):
    '''It annotates the polyA with the EMBOSS trimest method.

    A tail is looked for at both ends of the sequence and the longest one is
    added to the features as a 'polyA_sequence'. When both have the same
    non-zero length one of them is picked at random. Nothing is added when no
    tail is chosen.
    '''
    str_seq = get_str_seq(seq)
    tail_a = _detect_polya_tail(str_seq, THREE_PRIME, min_len,
                                max_cont_mismatches)
    tail_t = _detect_polya_tail(str_seq, FIVE_PRIME, min_len,
                                max_cont_mismatches)

    len_a = (tail_a[1] - tail_a[0]) if tail_a else 0
    len_t = (tail_t[1] - tail_t[0]) if tail_t else 0

    if len_a == len_t:
        if not len_a:
            # No tail at all, there is nothing to annotate.
            return
        # Tie: pick one at random.
        use_a = bool(randint(0, 1))
    else:
        use_a = len_a > len_t

    if use_a:
        tail, strand = tail_a, 1
    else:
        tail, strand = tail_t, -1
    start, end = tail
    location = FeatureLocation(start, end, strand)
    feat = SeqFeature(location=location, type='polyA_sequence')
    # We're assuming that the seq has a SeqRecord in it
    seq.object.features.append(feat)
def guess_seq_type(fhand):
    '''It guesses the file's seq type.

    Only the first 1024 characters of the file are peeked and parsed.

    It returns 'prot' as soon as a letter found only in the extended protein
    alphabet is seen, and 'nucl' when more than 80% of the letters are one of
    'gcatnuGCATNU'.

    It raises UnknownFormatError if the file is empty and RuntimeError when
    the type can not be decided (that includes a chunk with no letters).
    '''
    rna = set(ambiguous_rna_letters)
    dna = set(ambiguous_dna_letters)
    rna_dna = rna.union(dna)

    protein = set(extended_protein_letters)
    # Kept as a set: it is only used for membership tests.
    only_prot = protein.difference(rna_dna)

    chunk_size = 1024
    chunk = peek_chunk_from_file(fhand, chunk_size)
    if not chunk:
        raise UnknownFormatError('The file is empty')
    fhand_ = cStringIO.StringIO(chunk)

    total_letters = 0
    nucleotides = 0
    for seq in read_seqs([fhand_]):
        for letter in get_str_seq(seq):
            total_letters += 1
            if letter in 'gcatnuGCATNU':
                nucleotides += 1
            if letter in only_prot:
                return 'prot'

    # Without letters there is nothing to base the guess on; this used to
    # fail with a ZeroDivisionError.
    if not total_letters:
        raise RuntimeError('unable to guess the seq type')

    # float() forces a true division even where "/" truncates the integers.
    nucl_freq = float(nucleotides) / total_letters
    if nucl_freq > 0.8:
        return 'nucl'
    raise RuntimeError('unable to guess the seq type')
def __call__(self, pair):
    '''It returns a tuple with the sequence string of every read in the pair.

    When self._use_length is not None only that many leading characters of
    each sequence are kept.
    '''
    # Slicing with a None stop keeps the whole string, so no branch is needed.
    return tuple(get_str_seq(read)[:self._use_length] for read in pair)
def test_case_change(self):
    'It changes the case of the sequences'

    def changed(records, action):
        wrapped = assing_kind_to_seqs(SEQRECORD, records, None)
        change_case = ChangeCase(action=action)
        return [get_str_seq(seq) for seq in change_case(wrapped)]

    # The first record also carries a letter annotation.
    annotated = [SeqRecord(Seq('aCCg'), letter_annotations={'dummy': 'dddd'})]
    assert changed(annotated, UPPERCASE) == ['ACCG']

    assert changed([SeqRecord(Seq('aCCg'))], LOWERCASE) == ['accg']

    assert changed([SeqRecord(Seq('aCCg'))], SWAPCASE) == ['AccG']
def _do_check(self, seq):
    '''It tells if the sequence has any character besides N, n, - and *.

    An empty sequence passes the check.
    '''
    str_seq = get_str_seq(seq)
    if not str_seq:
        return True
    uninformative = frozenset(('N', 'n', '-', '*'))
    return any(char not in uninformative for char in str_seq)
def test_trim_chimeras_bin(self):
    'The trim_mp_chimeras binary runs with one and with several threads.'
    trim_chimeras_bin = os.path.join(BIN_DIR, 'trim_mp_chimeras')
    assert 'usage' in check_output([trim_chimeras_bin, '-h'])

    index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
    forward_read = (
        '@seq2 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
        'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
        '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
        '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
    )
    reverse_read = (
        '@seq2 r\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG\n'
        '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
    )

    in_fhand = NamedTemporaryFile()
    in_fhand.write(forward_read + reverse_read)
    in_fhand.flush()
    out_fhand = NamedTemporaryFile()

    expected_seqs = [
        'GGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT',
        'CATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG',
    ]

    def run_and_check(cmd):
        # Every sequence written must be an expected one and the output
        # must not be empty.
        check_output(cmd, stdin=in_fhand)
        counts = 0
        for seq in read_seqs([open(out_fhand.name)]):
            assert get_str_seq(seq) in expected_seqs
            counts += 1
        assert counts != 0

    base_cmd = [trim_chimeras_bin, in_fhand.name, '-r', index_fpath,
                '-o', out_fhand.name]
    run_and_check(base_cmd)

    # With several threads
    run_and_check(base_cmd + ['-p', '2'])
def test_split_mate(self):
    'It tests the function that splits seqs using segments'
    # pylint: disable=W0212
    seq = SeqWrapper(SEQRECORD, SeqRecord(Seq('aaatttccctt'), id='seq'), None)
    # fake class to test
    splitter = MatePairSplitter([seq])

    def split(segments, flag):
        # The second item of the tuple is passed through as given.
        return splitter._split_by_mate_linker(seq, (segments, flag))

    # segment at the beginning
    parts = split([(0, 3)], False)
    assert get_str_seq(parts[0]) == 'ttccctt'
    assert get_name(parts[0]) == 'seq'

    # segment at the end
    parts = split([(7, 11)], False)
    assert get_str_seq(parts[0]) == 'aaatttc'
    assert get_name(parts[0]) == 'seq'

    # segment in the middle
    parts = split([(4, 7)], True)
    assert get_str_seq(parts[0]) == 'aaat'
    assert get_str_seq(parts[1]) == 'ctt'
    assert get_name(parts[0]) == 'seq_pl.part1'
    assert get_name(parts[1]) == 'seq_pl.part2'

    parts = split([(4, 7)], False)
    assert get_name(parts[0]) == r'seq\1'
    assert get_name(parts[1]) == r'seq\2'

    # two segments
    parts = split([(4, 6), (8, 9)], False)
    assert get_str_seq(parts[0]) == 'aaat'
    assert get_str_seq(parts[1]) == 'c'
    assert get_str_seq(parts[2]) == 't'
    assert get_name(parts[0]) == 'seq_mlc.part1'

    # all sequence is linker
    parts = split([(0, 10)], False)
    assert not get_str_seq(parts[0])

    # there are no segments
    parts = split([], False)
    assert get_name(seq) == get_name(parts[0])
    assert get_str_seq(seq) == get_str_seq(parts[0])
def _do_check(self, seq):
    '''It tells if the sequence length is within self.min and self.max.

    A limit set to None is not checked. When self.ignore_masked is true the
    length is the one returned by uppercase_length for the sequence string.
    '''
    lower, upper = self.min, self.max
    if self.ignore_masked:
        length = uppercase_length(get_str_seq(seq))
    else:
        length = get_length(seq)

    too_short = lower is not None and length < lower
    too_long = upper is not None and length > upper
    return not (too_short or too_long)
def test_name_check(self):
    'Grouping four reads together raises unless the name check is disabled.'
    reads = _build_some_paired_seqs()
    try:
        list(group_pairs(reads, n_seqs_in_pair=4))
    except InterleaveError:
        pass
    else:
        self.fail('InterleaveError expected')

    reads = _build_some_paired_seqs()
    groups = list(group_pairs(reads, n_seqs_in_pair=4,
                              check_name_matches=False))
    first_group = [get_str_seq(read) for read in groups[0]]
    assert first_group == ['A', 'C', 'T', 'G']
def test_name_check(self):
    'Grouping four reads together raises unless the name check is disabled.'
    reads = _build_some_paired_seqs()
    try:
        list(group_pairs(reads, n_seqs_in_pair=4))
    except InterleaveError:
        pass
    else:
        self.fail('InterleaveError expected')

    reads = _build_some_paired_seqs()
    groups = list(group_pairs(reads, n_seqs_in_pair=4,
                              check_name_matches=False))
    first_group = [get_str_seq(read) for read in groups[0]]
    assert first_group == ['A', 'C', 'T', 'G']
def test_trim_seqs():
    'It tests the lowercase trimming on plain lists of sequences'
    # SeqRecord based sequences.
    seqs = [SeqWrapper(SEQRECORD, SeqRecord(Seq(raw)), None)
            for raw in ('aaCTTTC', 'CTTCaa', 'aaCTCaa', 'actg', 'AC')]
    trim_lowercased_seqs = TrimLowercasedLetters()
    trim = TrimOrMask()
    # pylint: disable=W0141
    trimmed = trim(trim_lowercased_seqs(seqs))
    assert [get_str_seq(seq) for seq in trimmed] == ['CTTTC', 'CTTC', 'CTC',
                                                     'AC']

    # A SeqItem based sequence.
    item = SeqItem('s', ['>s\n', 'aaCTTTC\n'])
    seqs = [SeqWrapper(SEQITEM, item, 'fasta')]
    trimmed = trim(trim_lowercased_seqs(seqs))
    assert [get_str_seq(seq) for seq in trimmed] == ['CTTTC']
def test_trimming(self):
    'The sequences are trimmed according to the recommendations.'
    record = SeqRecord(Seq('gggtctcatcatcaggg'.upper()),
                       annotations={TRIMMING_RECOMMENDATIONS: {}})
    seq = SeqWrapper(SEQRECORD, record, None)
    trim_packet = {SEQS_PASSED: [[seq]], ORPHAN_SEQS: []}
    trim_rec = get_annotations(seq)[TRIMMING_RECOMMENDATIONS]
    seq_trimmer = TrimOrMask()

    def trim_with(**recommendations):
        # The recommendations are stored in the annotations again before
        # every run of the trimmer.
        trim_rec.update(recommendations)
        get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
        return seq_trimmer(trim_packet)

    def passed_strs(packet):
        return [get_str_seq(read)
                for group in packet[SEQS_PASSED] for read in group]

    trimmed_packet = trim_with(vector=[(0, 3), (8, 13)])
    assert passed_strs(trimmed_packet) == ['CTCA']

    trimmed_packet = trim_with(vector=[(0, 0), (8, 13)])
    assert passed_strs(trimmed_packet) == ['GGTCTCA']

    # Vector plus quality recommendations: no sequence passes.
    trimmed_packet = trim_with(vector=[(0, 1), (8, 12)],
                               quality=[(1, 8), (13, 17)])
    assert not trimmed_packet[SEQS_PASSED]

    trimmed_packet = trim_with(vector=[(0, 0), (8, 13)], quality=[])
    assert passed_strs(trimmed_packet) == ['GGTCTCA']
    first_seq = trimmed_packet[SEQS_PASSED][0][0]
    assert TRIMMING_RECOMMENDATIONS not in get_annotations(first_seq)
def test_trimming(self):
    'The sequences are trimmed according to the recommendations.'
    record = SeqRecord(Seq('gggtctcatcatcaggg'.upper()),
                       annotations={TRIMMING_RECOMMENDATIONS: {}})
    seq = SeqWrapper(SEQRECORD, record, None)
    trim_packet = {SEQS_PASSED: [[seq]], ORPHAN_SEQS: []}
    trim_rec = get_annotations(seq)[TRIMMING_RECOMMENDATIONS]
    seq_trimmer = TrimOrMask()

    def trim_with(**recommendations):
        # The recommendations are stored in the annotations again before
        # every run of the trimmer.
        trim_rec.update(recommendations)
        get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
        return seq_trimmer(trim_packet)

    def passed_strs(packet):
        return [get_str_seq(read)
                for group in packet[SEQS_PASSED] for read in group]

    trimmed_packet = trim_with(vector=[(0, 3), (8, 13)])
    assert passed_strs(trimmed_packet) == ['CTCA']

    trimmed_packet = trim_with(vector=[(0, 0), (8, 13)])
    assert passed_strs(trimmed_packet) == ['GGTCTCA']

    # Vector plus quality recommendations: no sequence passes.
    trimmed_packet = trim_with(vector=[(0, 1), (8, 12)],
                               quality=[(1, 8), (13, 17)])
    assert not trimmed_packet[SEQS_PASSED]

    trimmed_packet = trim_with(vector=[(0, 0), (8, 13)], quality=[])
    assert passed_strs(trimmed_packet) == ['GGTCTCA']
    first_seq = trimmed_packet[SEQS_PASSED][0][0]
    assert TRIMMING_RECOMMENDATIONS not in get_annotations(first_seq)
def test_trim_chimeras_bin(self):
    'The trim_mp_chimeras binary runs with one and with several threads.'
    trim_chimeras_bin = os.path.join(BIN_DIR, 'trim_mp_chimeras')
    assert 'usage' in check_output([trim_chimeras_bin, '-h'])

    index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
    forward_read = (
        '@seq2 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
        'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
        '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
        '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
    )
    reverse_read = (
        '@seq2 r\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG\n'
        '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
    )

    in_fhand = NamedTemporaryFile()
    in_fhand.write(forward_read + reverse_read)
    in_fhand.flush()
    out_fhand = NamedTemporaryFile()

    expected_seqs = [
        'GGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT',
        'CATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG',
    ]

    def run_and_check(cmd):
        # Every sequence written must be an expected one and the output
        # must not be empty.
        check_output(cmd, stdin=in_fhand)
        counts = 0
        for seq in read_seqs([open(out_fhand.name)]):
            assert get_str_seq(seq) in expected_seqs
            counts += 1
        assert counts != 0

    base_cmd = [trim_chimeras_bin, in_fhand.name, '-r', index_fpath,
                '-o', out_fhand.name]
    run_and_check(base_cmd)

    # With several threads
    run_and_check(base_cmd + ['-p', '2'])
def __call__(self, filter_packet):
    '''It splits the pairs into seen-before and new, by sequence strings.

    Every pair is recorded in self._prev_pairs. New pairs pass the filter
    unless self.reverse is set, in which case only the repeated ones pass.
    It returns a new packet; the filtered out list is a copy of the given
    one plus the pairs rejected here.
    '''
    rejected = filter_packet[SEQS_FILTERED_OUT][:]
    kept = []
    for pair in filter_packet[SEQS_PASSED]:
        key = tuple(get_str_seq(read) for read in pair)
        seen_before = key in self._prev_pairs
        self._prev_pairs.add(key)
        if self.reverse:
            keep = seen_before
        else:
            keep = not seen_before
        target = kept if keep else rejected
        target.append(pair)
    return {SEQS_PASSED: kept, SEQS_FILTERED_OUT: rejected}
def __call__(self, filter_packet):
    '''It splits the pairs into seen-before and new, by sequence strings.

    Every pair is recorded in self._prev_pairs. New pairs pass the filter
    unless self.reverse is set, in which case only the repeated ones pass.
    It returns a new packet; the filtered out list is a copy of the given
    one plus the pairs rejected here.
    '''
    rejected = filter_packet[SEQS_FILTERED_OUT][:]
    kept = []
    for pair in filter_packet[SEQS_PASSED]:
        key = tuple(get_str_seq(read) for read in pair)
        seen_before = key in self._prev_pairs
        self._prev_pairs.add(key)
        if self.reverse:
            keep = seen_before
        else:
            keep = not seen_before
        target = kept if keep else rejected
        target.append(pair)
    return {SEQS_PASSED: kept, SEQS_FILTERED_OUT: rejected}
def __call__(self, seqs):
    '''It changes the case of the seqrecords.

    It returns a list with a copy of every sequence with its string changed
    according to self.action (UPPERCASE, LOWERCASE or SWAPCASE). Any other
    action raises NotImplementedError when the first sequence is processed.
    '''
    action = self.action

    def convert(text):
        if action == UPPERCASE:
            return text.upper()
        if action == LOWERCASE:
            return text.lower()
        if action == SWAPCASE:
            return text.swapcase()
        raise NotImplementedError()

    return [copy_seq(seq, seq=convert(get_str_seq(seq))) for seq in seqs]
def _do_trim(self, seq):
    '''It marks for trimming everything outside the longest uppercase segment.

    The segments are given as (start, end) tuples with an inclusive end, as
    shown by the (segment[1] + 1, len_seq - 1) tuple built for the right
    flank. When the sequence has no uppercase segment the whole sequence is
    marked.

    The segments are handed to _add_trim_segments with kind=OTHER and the
    same seq object is returned.
    '''
    str_seq = get_str_seq(seq)
    unmasked_segments = get_uppercase_segments(str_seq)
    segment = get_longest_segment(unmasked_segments)
    last_index = len(str_seq) - 1

    if segment is not None:
        segments = []
        if segment[0] != 0:
            segments.append((0, segment[0] - 1))
        if segment[1] != last_index:
            segments.append((segment[1] + 1, last_index))
    else:
        # The end used to be len(seq), the length of the wrapper object and
        # not the length of the sequence string used everywhere else here.
        # max() keeps the end non-negative for an empty sequence.
        segments = [(0, max(last_index, 0))]

    _add_trim_segments(segments, seq, kind=OTHER)
    return seq
def __call__(self, seqs):
    '''It trims the masked segments of the seqrecords.

    For every sequence the flanks outside the longest uppercase segment are
    handed to _add_trim_segments with kind=OTHER. Sequences with no
    uppercase segment are returned with nothing added. The returned list
    holds the same sequence objects in the same order.
    '''
    processed = []
    for seq in seqs:
        str_seq = get_str_seq(seq)
        longest = get_longest_segment(get_uppercase_segments(str_seq))
        if longest is None:
            processed.append(seq)
            continue

        last_index = len(str_seq) - 1
        flanks = []
        if longest[0] != 0:
            flanks.append((0, longest[0] - 1))
        if longest[1] != last_index:
            flanks.append((longest[1] + 1, last_index))
        _add_trim_segments(flanks, seq, kind=OTHER)
        processed.append(seq)
    return processed
def _mask_sequence(seq, segments):
    '''It masks the given segments of the sequence.

    The masked segments are lowercased. With no segments the given seq is
    returned as is, otherwise a copy with the new sequence is returned.
    '''
    if not segments:
        return seq

    merged = merge_overlaping_segments(segments)
    all_segments = get_all_segments(merged, get_length(seq))
    str_seq = get_str_seq(seq)

    pieces = []
    for limits, is_masked in ((item[0], item[1]) for item in all_segments):
        # The limits are inclusive, hence the + 1.
        piece = str_seq[limits[0]:limits[1] + 1]
        pieces.append(piece.lower() if is_masked else piece)
    masked = ''.join(pieces)

    if seq.kind == SEQRECORD:
        masked = Seq(masked, alphabet=seq.object.seq.alphabet)
    return copy_seq(seq, seq=masked)
def _read_estcan_result(fhand, result, file_type):
    '''It reads a dna or pep ESTscan result file.

    The result dict is filled in place: result[seqid][(start, end, strand)]
    is a dict in which the sequence string is stored under file_type.
    '''
    for seq in read_seqs([fhand], file_format='fasta'):
        fields = [field.strip() for field in get_description(seq).split(';')]
        strand = 1 if 'minus strand' not in fields else -1
        start, end = fields[0].split(' ', 3)[1:3]
        # estscan changes the name, we have to fix it
        seqid = get_name(seq).strip(';')

        try:
            seq_orfs = result[seqid]
        except KeyError:
            seq_orfs = result[seqid] = {}

        orf = seq_orfs.setdefault((int(start), int(end), strand), {})
        orf[file_type] = get_str_seq(seq)
def calculate_dust_score(seq):
    '''It returns the dust score.

    From: "A Fast and Symmetric DUST Implementation to Mask Low-Complexity DNA
    Sequences"
    doi:10.1089/cmb.2006.13.1028

    and re-implemented from PRINSEQ

    Sequences of length 3 score 0 and any other sequence of 5 or fewer
    letters returns None. Longer sequences are scored window by window
    (DUST_WINDOWSIZE and DUST_WINDOWSTEP settings); the stretch left after
    the last window start is scored too if it is longer than 5 letters. The
    mean of the scores is rescaled so that the maximum is 100.
    '''
    seq = get_str_seq(seq)
    length = len(seq)
    if length == 3:
        return 0
    if length <= 5:
        return None

    windowsize = get_setting('DUST_WINDOWSIZE')
    windowstep = get_setting('DUST_WINDOWSTEP')

    dustscores = []
    if length > windowsize:
        windows = 0
        for seq_in_win in rolling_window(seq, windowsize, windowstep):
            score = _calculate_rawscore(seq_in_win)
            dustscores.append(score / (windowsize - 2))
            windows += 1
        remaining_seq = seq[windows * windowstep:]
    else:
        remaining_seq = seq

    # The length has to be compared, not the string itself: "a_str > 5" is
    # always true in Python 2, so short leftovers were scored and the ones
    # with 2 or 3 letters divided by zero.
    remaining_length = len(remaining_seq)
    if remaining_length > 5:
        score = _calculate_rawscore(remaining_seq)
        dustscore = (score / (remaining_length - 3) * (windowsize - 2) /
                     (remaining_length - 2))
        dustscores.append(dustscore)

    # max score should be 100 not 31
    dustscore = sum(dustscores) / len(dustscores) * 100 / 31
    return dustscore
def test_edge_trimming(self):
    'It trims or masks the edges of the sequences'

    def passed_strs(packet):
        return [get_str_seq(read)
                for group in packet[SEQS_PASSED] for read in group]

    # Trimming: the edges are removed.
    trim = TrimOrMask()
    trimming_cases = (
        ({'left': 1}, ['CCG', 'AACCCGGG']),
        ({'right': 1}, ['ACC', 'AAACCCGG']),
        ({'left': 1, 'right': 1}, ['CC', 'AACCCGG']),
        ({'left': 2, 'right': 2}, ['ACCCG']),
        ({'left': 3, 'right': 3}, ['CCC']),
    )
    for edges, expected in trimming_cases:
        trim_edges = TrimEdges(**edges)
        trim_packet = trim(trim_edges(self._some_seqs()))
        assert passed_strs(trim_packet) == expected

    # Masking: the edges are lowercased.
    trim = TrimOrMask(mask=True)
    masking_cases = (
        ({'left': 1}, ['aCCG', 'aAACCCGGG']),
        ({'right': 1}, ['ACCg', 'AAACCCGGg']),
        ({'left': 1, 'right': 1}, ['aCCg', 'aAACCCGGg']),
        ({'left': 2, 'right': 2}, ['accg', 'aaACCCGgg']),
        ({'left': 3, 'right': 3}, ['accg', 'aaaCCCggg']),
    )
    for edges, expected in masking_cases:
        trim_edges = TrimEdges(**edges)
        trim_packet = trim(trim_edges(self._some_seqs()))
        assert passed_strs(trim_packet) == expected

    # test overlapping mask
    trim1 = TrimEdges(left=3, right=3)
    trim2 = TrimEdges(left=4, right=4)
    trim_packet = trim(trim2(trim1(self._some_seqs())))
    assert passed_strs(trim_packet) == ['accg', 'aaacCcggg']

    # With a SeqItem
    trim = TrimOrMask(mask=False)
    trim_edges = TrimEdges(left=1, right=1)
    item = SeqItem('s', ['>s\n', 'ACTTTC\n'])
    trim_packet = {SEQS_PASSED: [[SeqWrapper(SEQITEM, item, 'fasta')]],
                   ORPHAN_SEQS: []}
    trim_packet = trim(trim_edges(trim_packet))
    assert passed_strs(trim_packet) == ['CTTT']

    trim = TrimOrMask(mask=True)
    item = SeqItem('s', ['>s\n', 'ACTTTC\n'])
    trim_packet = {SEQS_PASSED: [[SeqWrapper(SEQITEM, item, 'fasta')]],
                   ORPHAN_SEQS: []}
    trim_packet = trim(trim_edges(trim_packet))
    assert passed_strs(trim_packet) == ['aCTTTc']
from crumbs.filters import (FilterByLength, FilterById, FilterByQuality,
                            FilterBlastMatch, FilterDustComplexity,
                            seq_to_filterpackets, FilterByRpkm, FilterByBam,
                            FilterBowtie2Match, FilterByFeatureTypes)
from crumbs.utils.bin_utils import BIN_DIR
from crumbs.utils.test_utils import TEST_DATA_DIR
from crumbs.utils.tags import (NUCL, SEQS_FILTERED_OUT, SEQS_PASSED, SEQITEM,
                               SEQRECORD)
from crumbs.utils.file_utils import TemporaryDir
from crumbs.seq import get_name, get_str_seq, SeqWrapper
from crumbs.mapping import get_or_create_bowtie2_index
from crumbs.seqio import read_seq_packets

# Helpers that flatten a list of pairs (lists of seqs) into a flat list with
# the name or the sequence string of every seq.
_seqs_to_names = lambda seqs: [get_name(s) for pair in seqs for s in pair]
_seqs_to_str_seqs = lambda seqs: [get_str_seq(s) for pai in seqs for s in pai]


class PacketConversionTest(unittest.TestCase):
    'It tests the seqs and filter packet conversion'

    def test_seqs_to_filter_packets(self):
        'It converts seq packets into filter packets'
        seqpackets = [['ACT'], ['CTG', 'TTT']]
        filter_packets = list(seq_to_filterpackets(iter(seqpackets)))
        # Every seq ends up wrapped in its own list inside SEQS_PASSED and
        # nothing is filtered out yet.
        expected = [[['ACT']], [['CTG'], ['TTT']]]
        assert [p[SEQS_PASSED] for p in filter_packets] == expected
        assert [p[SEQS_FILTERED_OUT] for p in filter_packets] == [[], []]


def _create_seqrecord(string):
    'Given an string it returns a SeqRecord'
def test_no_name(self):
    'A read named just "s" is grouped on its own wherever it is placed.'

    def strs(reads):
        return [get_str_seq(read) for read in reads]

    built = _build_some_paired_seqs()
    loner = SeqWrapper(SEQITEM, SeqItem('s', ['>s\n', 'N\n']), 'fasta')

    # The loner splits the second pair.
    reads = built[0], built[1], built[2], loner, built[3]
    groups = list(group_pairs_by_name(reads))
    assert strs(groups[0]) == ['A', 'C']
    assert strs(groups[1]) == ['T']
    assert strs(groups[2]) == ['N']
    assert strs(groups[3]) == ['G']

    # The loner splits the first pair.
    built = _build_some_paired_seqs()
    reads = built[0], loner, built[1], built[2], built[3]
    groups = list(group_pairs_by_name(reads))
    assert strs(groups[0]) == ['A']
    assert strs(groups[1]) == ['N']
    assert strs(groups[2]) == ['C']
    assert strs(groups[3]) == ['T', 'G']

    # The loner goes first: both pairs stay complete.
    built = _build_some_paired_seqs()
    reads = loner, built[0], built[1], built[2], built[3]
    groups = list(group_pairs_by_name(reads))
    assert strs(groups[0]) == ['N']
    assert strs(groups[1]) == ['A', 'C']
    assert strs(groups[2]) == ['T', 'G']
def test_bin_transcrip_orientator(self):
    '''it tests the transcript orientator binary

    The binary is run with all the annotators, with badly defined blast
    parameters, without annotator parameters, only with the orf annotator
    and with two processes.  After every successful run the output seqs are
    compared with the input ones (the seq with index 2 is never checked).
    '''
    orientate_bin = os.path.join(BIN_DIR, 'orientate_transcripts')
    assert 'usage' in check_output([orientate_bin, '-h'])

    in_fpath = os.path.join(TEST_DATA_DIR, 'seqs_to_orientate.fasta')
    estscan_matrix = os.path.join(TEST_DATA_DIR,
                                  'Arabidopsis_thaliana.smat')
    blastdb1 = os.path.join(TEST_DATA_DIR, 'blastdbs', 'arabidopsis_genes')
    blastdb2 = os.path.join(TEST_DATA_DIR, 'blastdbs', 'calabaza')

    def read_records(fpath):
        'It reads all the seqs of a file making sure that it gets closed'
        with open(fpath) as fhand:
            return list(read_seqs([fhand],
                                  prefered_seq_classes=[SEQRECORD]))

    def read_out_and_init(out_fpath):
        'It returns the seqs written by the binary and the input ones'
        out_seqs = read_records(out_fpath)
        init_seqs = read_records(in_fpath)
        return out_seqs, init_seqs

    def assert_oriented(init_seqs, out_seqs, expectations):
        '''It checks the output seqs against the input ones.

        Every expectation is a tuple with: the index of the seq, whether
        the output is expected to be the reverse complement of the input
        and the text that should appear in the output description (or None)
        '''
        for index, is_reversed, tag in expectations:
            out_seq = out_seqs[index].object.seq
            if is_reversed:
                out_seq = out_seq.reverse_complement()
            assert str(init_seqs[index].object.seq) == str(out_seq)
            if tag is not None:
                assert tag in out_seqs[index].object.description

    # expected result when the polyA, the orf and the blasts are all used
    all_annotators = [(1, True, 'polyA'),
                      (3, False, None),
                      (4, True, 'estscan_orf'),
                      (5, False, None),
                      (6, True, 'blast arabidopsis_genes')]

    # arguments shared by the runs that use the estscan and the blasts
    annotator_args = [orientate_bin, '-u', estscan_matrix, '-d', blastdb1,
                      '-d', blastdb2, '-g', 'blastn', '-g', 'blastn',
                      '-v', '0.0001']

    out_fhand = NamedTemporaryFile()
    cmd = annotator_args + ['-v', '0.0001', in_fpath, '-o', out_fhand.name,
                            '--polya_min_len', '4']
    check_output(cmd)
    out_seqs, init_seqs = read_out_and_init(out_fhand.name)
    assert get_str_seq(init_seqs[0]) == get_str_seq(out_seqs[0])
    assert_oriented(init_seqs, out_seqs, all_annotators)

    # two blast databases but only one evalue: the binary should fail
    cmd = annotator_args + [in_fpath]
    stderr = NamedTemporaryFile()
    try:
        check_output(cmd, stderr=stderr)
        self.fail()
    except CalledProcessError:
        with open(stderr.name) as stderr_fhand:
            stde = stderr_fhand.read()
        assert 'Blast parameters are not well defined' in stde

    # without parameters
    out_fhand = NamedTemporaryFile()
    check_output([orientate_bin, in_fpath, '-o', out_fhand.name,
                  '--polya_min_len', '4'])
    out_seqs, init_seqs = read_out_and_init(out_fhand.name)
    assert_oriented(init_seqs, out_seqs,
                    [(0, False, None), (1, True, None), (3, False, None),
                     (4, False, None), (5, False, None), (6, False, None)])

    # only with orf annotator (the previous output file is reused)
    check_output([orientate_bin, in_fpath, '-o', out_fhand.name,
                  '-u', estscan_matrix, '--polya_min_len', '4'])
    out_seqs, init_seqs = read_out_and_init(out_fhand.name)
    assert_oriented(init_seqs, out_seqs,
                    [(0, False, None), (1, True, None), (3, False, None),
                     (4, True, None), (5, False, None), (6, False, None)])

    # multiprocessor
    out_fhand = NamedTemporaryFile()
    cmd = annotator_args + ['-v', '0.0001', in_fpath, '-o', out_fhand.name,
                            '-p', '2', '--polya_min_len', '4']
    check_output(cmd)
    out_seqs, init_seqs = read_out_and_init(out_fhand.name)
    assert_oriented(init_seqs, out_seqs,
                    [(0, False, None)] + all_annotators)
def test_transcriptome_orientator(self):
    '''tests the orientator class'''
    estscan_matrix = os.path.join(TEST_DATA_DIR,
                                  'Arabidopsis_thaliana.smat')

    def build_seq(str_seq, name):
        'It creates a wrapped SeqRecord with the given seq and id'
        return _wrap_seq(SeqRecord(seq=Seq(str_seq), id=name))

    seq1 = build_seq('atccgtcagcatcCAATAAAAA', 'seq1_polia+')
    seq2 = build_seq('TTTTcTTcatccgtcag', 'seq2_polia-')
    seq3 = build_seq('cTTcatccgtcag', 'seq3')

    orf_forward = (
        'CATAGGGTCACCAATGGCTTCTTCTTTGCTTGCACTCTTCTCCTGTCTCTTCCTC'
        'TCTCTCTTATCTCTCTCCTCCTCCCTAAATCTCCGCCGTCCGATCTTCTCTCAA'
        'TCCAACGACCTCGATCTCTTCTCTTCTCTAAATCTCGACCGTCCATCTCTCGCC'
        'GCCGATGACATCCACGATCTTCTCCCACGCTACGGATTCCCGAAAGGTCTTCTT'
        'CCCAACAACGTCAAATCGTACACTATCTCCGACGACGGCGATTTCACCGTTGAC'
        'CTGATTTCCAGTTGCTACGTCAAGTTCTCCGATCAACTCGTTTTCTACGGCAAG'
        'AATATCGCCGGAAAACTCAGTTACGGATCTGTTAAAGACGTCCGTGGAATCCAA'
        'GCTAAAGAAGCTTTCCTTTGGCTACCAATCACCGCCATGGAATCGGATCCAAGC'
        'TCTGCCACGGTTGTGTTCTCCGTCGGATTTGTGTCCAAGACTTTACCTGCTTCC'
        'ATGTTCGAAAATGTTCCTTCTTGCTCAAGAAACCTAAATCTTCAAGACTCTTGA'
        'ATCCACCTGAAACGATCTCAAGATTCAACATTCCCTCCACCCTTTATAGTTTTG'
        'TATTTCAGAAGTATTTTGCTTGGTTTCGTAGATATAGGTTCGAATTGGAAAAGA'
        'TACTATCTTAATTATTCGAATCAGATTATGTTATACTGCCCAAA')
    orf_reverse = (
        'TTTGGGCAGTATAACATAATCTGATTCGAATAATTAAGATAGTATCTTTTCCAAT'
        'TCGAACCTATATCTACGAAACCAAGCAAAATACTTCTGAAATACAAAACTATAA'
        'AGGGTGGAGGGAATGTTGAATCTTGAGATCGTTTCAGGTGGATTCAAGAGTCTT'
        'GAAGATTTAGGTTTCTTGAGCAAGAAGGAACATTTTCGAACATGGAAGCAGGTA'
        'AAGTCTTGGACACAAATCCGACGGAGAACACAACCGTGGCAGAGCTTGGATCCG'
        'ATTCCATGGCGGTGATTGGTAGCCAAAGGAAAGCTTCTTTAGCTTGGATTCCAC'
        'GGACGTCTTTAACAGATCCGTAACTGAGTTTTCCGGCGATATTCTTGCCGTAGA'
        'AAACGAGTTGATCGGAGAACTTGACGTAGCAACTGGAAATCAGGTCAACGGTGA'
        'AATCGCCGTCGTCGGAGATAGTGTACGATTTGACGTTGTTGGGAAGAAGACCTT'
        'TCGGGAATCCGTAGCGTGGGAGAAGATCGTGGATGTCATCGGCGGCGAGAGATG'
        'GACGGTCGAGATTTAGAGAAGAGAAGAGATCGAGGTCGTTGGATTGAGAGAAGA'
        'TCGGACGGCGGAGATTTAGGGAGGAGGAGAGAGATAAGAGAGAGAGGAAGAGAC'
        'AGGAGAAGAGTGCAAGCAAAGAAGAAGCCATTGGTGACCCTATG')
    seq4 = build_seq(orf_forward, 'seq_orf_forward')
    seq5 = build_seq(orf_reverse, 'seq_orf_reverse')

    blast_forward = (
        'CTAAATCTCCGCCGTCCGATCTTCTCTCAATCCAACGACCTCGATCTCTTCTCTT'
        'TCTCCGATCAACTCGTTTTCTACGGCAAGAATATCGCCGGAAAACTCAGTTACG')
    blast_reverse = (
        'TTTAACAGATCCGTAACTGAGTTTTCCGGCGATATTCTTGCCGTAGAAAACGAGT'
        'CGGAGATTTAG')
    seq6 = build_seq(blast_forward, 'seq_blast_forward')
    seq7 = build_seq(blast_reverse, 'seq_blast_reverse')

    blast2_forward = (
        'GTTCGTTTCTCTTCTGAATTTCTGTAATCTGTAACGATGTCTCAGACTACTG'
        'TCCTCAAGGTTGCTATGTCATGTCAG')
    blast2_reverse = 'AGGCAGTCTTCTTCCCAGTTTTCGAAACGGTTTGGAAAACTACATCGC'
    seq8 = build_seq(blast2_forward, 'seq_blast2_forward')
    seq9 = build_seq(blast2_reverse, 'seq_blast2_reverse')

    seqrecords = [seq1, seq2, seq3, seq4, seq5, seq6, seq7, seq8, seq9]

    # parameters for the three kinds of annotators given to the orientator
    polya_params = {'min_len': 4,
                    'max_cont_mismatches': POLYA_ANNOTATOR_MISMATCHES}
    estscan_params = {'usage_matrix': estscan_matrix}
    blastdbs_dir = os.path.join(TEST_DATA_DIR, 'blastdbs')
    ara_blastdb = os.path.join(blastdbs_dir, 'arabidopsis_genes')
    cala_blastdb = os.path.join(blastdbs_dir, 'calabaza')
    filters = [{'kind': 'score_threshold', 'score_key': 'expect',
                'max_score': 1e-10}]
    blast_params = [{'blastdb': ara_blastdb, 'program': 'blastn',
                     'filters': filters},
                    {'blastdb': cala_blastdb, 'program': 'blastn'}]

    orientator = TranscriptOrientator(polya_params, estscan_params,
                                      blast_params)
    oriented = orientator(seqrecords)

    # (index in the result, original seq, is the result expected reversed?)
    # seq3, seq8 and seq9 are processed, but their result is not checked
    checks = [(0, seq1, False), (1, seq2, True),
              (3, seq4, False), (4, seq5, True),
              (5, seq6, False), (6, seq7, True)]
    for index, original, is_reversed in checks:
        if is_reversed:
            result = str(oriented[index].object.seq.reverse_complement())
            assert get_str_seq(original) == result
        else:
            assert get_str_seq(original) == get_str_seq(oriented[index])
from crumbs.filters import (FilterByLength, FilterById, FilterByQuality,
                            FilterBlastMatch, FilterBlastShort,
                            FilterDustComplexity, seq_to_filterpackets,
                            FilterByRpkm, FilterByBam, FilterAllNs,
                            FilterBowtie2Match, FilterByFeatureTypes)
from crumbs.utils.bin_utils import BIN_DIR
from crumbs.utils.test_utils import TEST_DATA_DIR
from crumbs.utils.tags import (NUCL, SEQS_FILTERED_OUT, SEQS_PASSED, SEQITEM,
                               SEQRECORD)
from crumbs.seq import get_name, get_str_seq, SeqWrapper
from crumbs.seqio import read_seq_packets


def _seqs_to_names(seqs):
    'It returns the names of all the seqs found in the given groups of seqs'
    names = []
    for group in seqs:
        for seq in group:
            names.append(get_name(seq))
    return names


def _seqs_to_str_seqs(seqs):
    'It returns the str seqs of all the seqs found in the given groups'
    str_seqs = []
    for group in seqs:
        for seq in group:
            str_seqs.append(get_str_seq(seq))
    return str_seqs


class PacketConversionTest(unittest.TestCase):
    'It tests the seqs and filter packet conversion'

    def test_seqs_to_filter_packets(self):
        'It converts seq packets into filter packets'
        seq_packets = iter([['ACT'], ['CTG', 'TTT']])
        packets = list(seq_to_filterpackets(seq_packets))

        # every seq ends up alone in its own tuple inside the passed seqs
        passed = [packet[SEQS_PASSED] for packet in packets]
        assert passed == [[('ACT',)], [('CTG',), ('TTT',)]]

        # nothing has been filtered out yet
        filtered_out = [packet[SEQS_FILTERED_OUT] for packet in packets]
        assert filtered_out == [[], []]


def _create_seqrecord(string):
    'Given a string it returns a SeqRecord'
def calculate_sequence_stats(seqs, kmer_size=None, do_dust_stats=False,
                             nxs=None):
    '''It calculates some stats for the given seqs.

    seqs -- iterable with the seqs, it is traversed only once.
    kmer_size -- when truthy, the kmers of that size are counted.
    do_dust_stats -- when true, a dust score is calculated for every seq.
    nxs -- optional iterable with the integer Nx values to report
           (e.g. 50 for the N50).

    It returns a dict with the text reports under the keys 'length',
    'quality', 'nucl_freq', 'qual_boxplot', 'kmer' and 'dustscore'.  The
    quality, boxplot, kmer and dustscore reports are empty strings when no
    data was gathered for them.
    '''
    # get data
    lengths = IntCounter()
    quals_per_pos = IntBoxplot()
    nucl_freq = NuclFreqsPlot()
    kmer_counter = KmerCounter(kmer_size) if kmer_size else None
    dustscores = IntCounter()
    for seq in seqs:
        lengths[get_length(seq)] += 1
        try:
            quals = get_qualities(seq)
        except AttributeError:
            # presumably raised by the seqs that carry no qualities; they
            # just do not contribute to the quality stats
            quals = []
        for index, qual in enumerate(quals):
            quals_per_pos.append(index + 1, qual)
        str_seq = get_str_seq(seq)
        for index, nucl in enumerate(str_seq):
            nucl_freq.append(index, nucl)
        if kmer_counter is not None:
            kmer_counter.count_seq(str_seq)
        if do_dust_stats:
            dustscore = calculate_dust_score(seq)
            if dustscore is not None:
                dustscores[int(dustscore)] += 1

    lengths.update_labels({'sum': 'tot. residues', 'items': 'num. seqs.'})

    # length distribution
    length_parts = ['Length stats and distribution.\n',
                    '------------------------------\n']
    for nx in (sorted(nxs) if nxs else []):
        length_parts.append('N{:d}: {:d}\n'.format(nx,
                                                   calculate_nx(lengths, nx)))
    length_parts.append(str(lengths))
    length_parts.append('\n')
    lengths_srt = ''.join(length_parts)

    # agregate quals
    if quals_per_pos:
        quals = quals_per_pos.aggregated_array
        quals.update_labels({'sum': None, 'items': 'tot. base pairs'})
        q30 = quals.count_relative_to_value(30, operator.ge) / quals.count
        q30 *= 100
        q20 = quals.count_relative_to_value(20, operator.ge) / quals.count
        q20 *= 100

        # qual distribution
        qual_str = ''.join(['Quality stats and distribution.\n',
                            '-------------------------------\n',
                            'Q20: {:.2f}\n'.format(q20),
                            'Q30: {:.2f}\n'.format(q30),
                            str(quals),
                            '\n'])

        # qual per position boxplot
        qual_boxplot = 'Boxplot for quality per position.\n'
        qual_boxplot += '---------------------------------\n'
        qual_boxplot += quals_per_pos.ascii_plot
        qual_boxplot += '\n'
    else:
        qual_str = ''
        qual_boxplot = ''

    # nucl freqs
    freq_str = 'Nucleotide frequency per position.\n'
    freq_str += '----------------------------------\n'
    freq_str += nucl_freq.ascii_plot
    freq_str += '\n'

    # kmer distribution
    kmer_str = ''
    if kmer_counter is not None:
        kmers = IntCounter(kmer_counter.values)
        if kmers:
            kmers.update_labels({'sum': None, 'items': 'num. kmers'})
            kmer_parts = ['Kmer distribution\n',
                          '-----------------\n',
                          str(kmers),
                          '\n',
                          'Most common kmers:\n']
            for kmer, number in kmer_counter.most_common(20):
                kmer_parts.append('\t{}: {}\n'.format(kmer, number))
            kmer_str = ''.join(kmer_parts)

    dust_str = ''
    if dustscores:
        dustscores.update_labels({'sum': None, 'items': 'num. seqs.'})
        dust7 = (dustscores.count_relative_to_value(7, operator.gt) /
                 dustscores.count)
        # the report line announces a percentage, so the ratio is scaled
        # in the same way as it is done for Q20 and Q30
        dust7 *= 100
        dust_str = ''.join(['Dustscores stats and distribution.\n',
                            '----------------------------------\n',
                            '% above 7 (low complexity): {:.2f}\n'.format(dust7),
                            str(dustscores),
                            '\n'])

    return {'length': lengths_srt,
            'quality': qual_str,
            'nucl_freq': freq_str,
            'qual_boxplot': qual_boxplot,
            'kmer': kmer_str,
            'dustscore': dust_str}