Beispiel #1
0
    def test_slice(self):
        'It slices fasta, fastq and multiline fastq SeqItems'
        # fasta with the sequence in two lines, the slice spans both of them
        fasta_item = SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n'])
        fasta = SeqWrapper(SEQITEM, fasta_item, 'fasta')
        wanted_item = SeqItem(name='s1', lines=['>s1\n', 'CTGG\n'])
        wanted = SeqWrapper(SEQITEM, wanted_item, 'fasta')
        assert slice_seq(fasta, 1, 5) == wanted

        # fastq, sequence and qualities have to be sliced together
        fastq_lines = ['@seq\n', 'aata\n', '+\n', '!?!?\n']
        fastq = SeqWrapper(SEQITEM, SeqItem(name='seq', lines=fastq_lines),
                           'fastq')
        sliced = slice_seq(fastq, 1, 3)
        assert list(get_qualities(sliced)) == [30, 0]
        assert get_str_seq(sliced) == 'at'
        assert sliced.object.lines == ['@seq\n', 'at\n', '+\n', '?!\n']

        # multiline fastq
        multi_lines = ['@seq\n', 'aaat\n', 'caaa\n', '+\n', '@AAA\n',
                       'BBBB\n']
        multiline = SeqWrapper(SEQITEM,
                               SeqItem(name='seq', lines=multi_lines),
                               'fastq-illumina-multiline')
        multiline_slice = slice_seq(multiline, 1, 5)
        assert list(get_qualities(multiline_slice)) == [1, 1, 1, 2]
        assert get_str_seq(multiline_slice) == get_str_seq(multiline)[1:5]

        # a None start or stop is expected to work as in a str slice
        raw_seq = 'aCTG'
        open_ended = SeqWrapper(SEQITEM, SeqItem('seq', ['>seq\n', 'aCTG']),
                                'fasta')
        assert get_str_seq(slice_seq(open_ended, 1, None)) == raw_seq[1:]

        assert get_str_seq(slice_seq(open_ended, None, 1)) == raw_seq[:1]
Beispiel #2
0
    def test_trimming(self):
        'It trims a seq following its trimming recommendations.'
        record = SeqRecord(Seq('gggtctcatcatcaggg'.upper()),
                           annotations={TRIMMING_RECOMMENDATIONS: {}})
        seq = SeqWrapper(SEQRECORD, record, None)

        recommendations = get_annotations(seq)[TRIMMING_RECOMMENDATIONS]
        trimmer = TrimOrMask()

        def trim_with(**segments):
            'It sets the given recommendations in the seq and trims it'
            recommendations.update(segments)
            # The annotation is set again before every run
            # NOTE(review): presumably because the trimmer can remove it from
            # the seq - confirm against TrimOrMask
            get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = recommendations
            return trimmer([seq])

        trimmed = trim_with(vector=[(0, 3), (8, 13)])
        assert get_str_seq(trimmed[0]) == 'CTCA'

        trimmed = trim_with(vector=[(0, 0), (8, 13)])
        assert get_str_seq(trimmed[0]) == 'GGTCTCA'

        # vector and quality segments together leave nothing to return
        trimmed = trim_with(vector=[(0, 1), (8, 12)],
                            quality=[(1, 8), (13, 17)])
        assert not trimmed

        trimmed = trim_with(vector=[(0, 0), (8, 13)], quality=[])
        assert get_str_seq(trimmed[0]) == 'GGTCTCA'
        assert TRIMMING_RECOMMENDATIONS not in get_annotations(trimmed[0])
Beispiel #3
0
    def test_slice(self):
        'It slices fasta, fastq and multiline fastq SeqItems'
        # fasta with the sequence in two lines, the slice spans both of them
        fasta_item = SeqItem(name='s1', lines=['>s1\n', 'ACTG\n', 'GTAC\n'])
        fasta = SeqWrapper(SEQITEM, fasta_item, 'fasta')
        wanted_item = SeqItem(name='s1', lines=['>s1\n', 'CTGG\n'])
        wanted = SeqWrapper(SEQITEM, wanted_item, 'fasta')
        assert slice_seq(fasta, 1, 5) == wanted

        # fastq, sequence and qualities have to be sliced together
        fastq_lines = ['@seq\n', 'aata\n', '+\n', '!?!?\n']
        fastq = SeqWrapper(SEQITEM, SeqItem(name='seq', lines=fastq_lines),
                           'fastq')
        sliced = slice_seq(fastq, 1, 3)
        assert list(get_qualities(sliced)) == [30, 0]
        assert get_str_seq(sliced) == 'at'
        assert sliced.object.lines == ['@seq\n', 'at\n', '+\n', '?!\n']

        # multiline fastq
        multi_lines = ['@seq\n', 'aaat\n', 'caaa\n', '+\n', '@AAA\n',
                       'BBBB\n']
        multiline = SeqWrapper(SEQITEM,
                               SeqItem(name='seq', lines=multi_lines),
                               'fastq-illumina-multiline')
        multiline_slice = slice_seq(multiline, 1, 5)
        assert list(get_qualities(multiline_slice)) == [1, 1, 1, 2]
        assert get_str_seq(multiline_slice) == get_str_seq(multiline)[1:5]

        # a None start or stop is expected to work as in a str slice
        raw_seq = 'aCTG'
        open_ended = SeqWrapper(SEQITEM, SeqItem('seq', ['>seq\n', 'aCTG']),
                                'fasta')
        assert get_str_seq(slice_seq(open_ended, 1, None)) == raw_seq[1:]

        assert get_str_seq(slice_seq(open_ended, None, 1)) == raw_seq[:1]
Beispiel #4
0
def _seqitem_pairs_equal(pair1, pair2):
    'It returns True if both pairs have the same seqs in the same order.'
    # pairs with a different number of reads can not be equal
    if len(pair1) != len(pair2):
        return False
    # only the sequence strings are compared, read by read
    return all(get_str_seq(left) == get_str_seq(right)
               for left, right in zip(pair1, pair2))
Beispiel #5
0
def _seqitem_pairs_equal(pair1, pair2):
    'It returns True if both pairs have the same seqs in the same order.'
    # pairs with a different number of reads can not be equal
    if len(pair1) != len(pair2):
        return False
    # only the sequence strings are compared, read by read
    return all(get_str_seq(left) == get_str_seq(right)
               for left, right in zip(pair1, pair2))
Beispiel #6
0
    def test_trim_seqs():
        'It trims the lowercased regions of single and paired reads'
        def wrap_record(str_seq):
            'It builds a wrapped SeqRecord with the given sequence'
            return SeqWrapper(SEQRECORD, SeqRecord(Seq(str_seq)), None)

        def wrap_fasta(name, str_seq):
            'It builds a wrapped one-line fasta SeqItem'
            lines = ['>' + name + '\n', str_seq + '\n']
            return SeqWrapper(SEQITEM, SeqItem(name, lines), 'fasta')

        def new_packet(groups):
            'It builds a trim packet with no orphans'
            return {SEQS_PASSED: groups, ORPHAN_SEQS: []}

        def passed_seqs(packet):
            'It returns the str seqs of all the reads that passed'
            return [get_str_seq(read) for group in packet[SEQS_PASSED]
                    for read in group]

        # single reads, the all-lowercase one is not expected in the result
        singles = [[wrap_record(str_seq)] for str_seq in
                   ('aaCTTTC', 'CTTCaa', 'aaCTCaa', 'actg', 'AC')]
        packet = new_packet(singles)
        trim_lowercased_seqs = TrimLowercasedLetters()
        trim = TrimOrMask()
        # pylint: disable=W0141
        packet = trim(trim_lowercased_seqs(packet))
        assert passed_seqs(packet) == ['CTTTC', 'CTTC', 'CTC', 'AC']

        # a SeqItem goes through the very same trimmer instances
        packet = new_packet([[wrap_fasta('s', 'aaCTTTC')]])
        packet = trim(trim_lowercased_seqs(packet))
        assert passed_seqs(packet) == ['CTTTC']

        # with pairs, the mate of a completely trimmed read becomes an orphan
        pairs = [[wrap_fasta('s.f', 'aaCTTTC'), wrap_fasta('s.r', 'aaCTTTC')],
                 [wrap_fasta('s1.f', 'aa'), wrap_fasta('s1.r', 'aaCTTTC')]]
        packet = new_packet(pairs)
        trim_lowercased_seqs = TrimLowercasedLetters()
        trim = TrimOrMask()
        packet = trim(trim_lowercased_seqs(packet))
        res = passed_seqs(packet)
        orphan_res = [get_str_seq(read) for read in packet[ORPHAN_SEQS]]
        assert orphan_res == ['CTTTC']
        assert ['CTTTC', 'CTTTC'] == res

        # no drag, the same pairs with new trimmers give the same result
        packet = new_packet(pairs)
        trim_lowercased_seqs = TrimLowercasedLetters()
        trim = TrimOrMask()
        packet = trim(trim_lowercased_seqs(packet))
        res = passed_seqs(packet)
        orphan_res = [get_name(read) for read in packet[ORPHAN_SEQS]]
        assert orphan_res == ['s1.r']
        assert ['CTTTC', 'CTTTC'] == res
Beispiel #7
0
    def test_trim_seqs():
        'It trims the lowercased regions of single and paired reads'
        def wrap_record(str_seq):
            'It builds a wrapped SeqRecord with the given sequence'
            return SeqWrapper(SEQRECORD, SeqRecord(Seq(str_seq)), None)

        def wrap_fasta(name, str_seq):
            'It builds a wrapped one-line fasta SeqItem'
            lines = ['>' + name + '\n', str_seq + '\n']
            return SeqWrapper(SEQITEM, SeqItem(name, lines), 'fasta')

        def new_packet(groups):
            'It builds a trim packet with no orphans'
            return {SEQS_PASSED: groups, ORPHAN_SEQS: []}

        def passed_seqs(packet):
            'It returns the str seqs of all the reads that passed'
            return [get_str_seq(read) for group in packet[SEQS_PASSED]
                    for read in group]

        # single reads, the all-lowercase one is not expected in the result
        singles = [[wrap_record(str_seq)] for str_seq in
                   ('aaCTTTC', 'CTTCaa', 'aaCTCaa', 'actg', 'AC')]
        packet = new_packet(singles)
        trim_lowercased_seqs = TrimLowercasedLetters()
        trim = TrimOrMask()
        # pylint: disable=W0141
        packet = trim(trim_lowercased_seqs(packet))
        assert passed_seqs(packet) == ['CTTTC', 'CTTC', 'CTC', 'AC']

        # a SeqItem goes through the very same trimmer instances
        packet = new_packet([[wrap_fasta('s', 'aaCTTTC')]])
        packet = trim(trim_lowercased_seqs(packet))
        assert passed_seqs(packet) == ['CTTTC']

        # with pairs, the mate of a completely trimmed read becomes an orphan
        pairs = [[wrap_fasta('s.f', 'aaCTTTC'), wrap_fasta('s.r', 'aaCTTTC')],
                 [wrap_fasta('s1.f', 'aa'), wrap_fasta('s1.r', 'aaCTTTC')]]
        packet = new_packet(pairs)
        trim_lowercased_seqs = TrimLowercasedLetters()
        trim = TrimOrMask()
        packet = trim(trim_lowercased_seqs(packet))
        res = passed_seqs(packet)
        orphan_res = [get_str_seq(read) for read in packet[ORPHAN_SEQS]]
        assert orphan_res == ['CTTTC']
        assert ['CTTTC', 'CTTTC'] == res

        # no drag, the same pairs with new trimmers give the same result
        packet = new_packet(pairs)
        trim_lowercased_seqs = TrimLowercasedLetters()
        trim = TrimOrMask()
        packet = trim(trim_lowercased_seqs(packet))
        res = passed_seqs(packet)
        orphan_res = [get_name(read) for read in packet[ORPHAN_SEQS]]
        assert orphan_res == ['s1.r']
        assert ['CTTTC', 'CTTTC'] == res
Beispiel #8
0
    def test_str_seq(self):
        'It gets the sequence str out of fasta and fastq SeqItems'
        # with fasta
        fasta_item = SeqItem(name='s1', lines=['>s1\n', 'ACTGGTAC\n'])
        fasta = SeqWrapper(SEQITEM, fasta_item, 'fasta')
        assert get_str_seq(fasta) == 'ACTGGTAC'

        # with fastq, the quality line is not part of the result
        fastq_lines = ['@seq\n', 'aaaa\n', '+\n', '????\n']
        fastq = SeqWrapper(SEQITEM, SeqItem(name='seq', lines=fastq_lines),
                           'fastq')
        assert get_str_seq(fastq) == 'aaaa'
 def test_pair_grouper():
     'It groups interleaved forward and reverse reads in pairs'
     # (SeqItem name, title line, sequence line) for every read
     read_specs = (("s1", ">s1.f\n", "A\n"), ("s1", ">s1.r\n", "C\n"),
                   ("s2", ">s2.f\n", "T\n"), ("s2", ">s2.r\n", "G\n"))
     reads = tuple(SeqWrapper(SEQITEM, SeqItem(name, [title, bases]), "fasta")
                   for name, title, bases in read_specs)
     pairs = list(group_seqs_in_pairs(reads))
     assert [get_str_seq(read) for read in pairs[0]] == ["A", "C"]
     assert [get_str_seq(read) for read in pairs[1]] == ["T", "G"]
     assert len(pairs) == 2
Beispiel #10
0
 def test_pair_grouper():
     'It groups interleaved forward and reverse reads in pairs'
     # (SeqItem name, title line, sequence line) for every read
     read_specs = (('s1', '>s1.f\n', 'A\n'), ('s1', '>s1.r\n', 'C\n'),
                   ('s2', '>s2.f\n', 'T\n'), ('s2', '>s2.r\n', 'G\n'))
     reads = tuple(SeqWrapper(SEQITEM, SeqItem(name, [title, bases]), 'fasta')
                   for name, title, bases in read_specs)
     pairs = list(group_seqs_in_pairs(reads))
     assert [get_str_seq(read) for read in pairs[0]] == ['A', 'C']
     assert [get_str_seq(read) for read in pairs[1]] == ['T', 'G']
     assert len(pairs) == 2
 def test_pair_grouper():
     'It groups interleaved forward and reverse reads in pairs'
     # (SeqItem name, title line, sequence line) for every read
     read_specs = (('s1', '>s1.f\n', 'A\n'), ('s1', '>s1.r\n', 'C\n'),
                   ('s2', '>s2.f\n', 'T\n'), ('s2', '>s2.r\n', 'G\n'))
     reads = tuple(SeqWrapper(SEQITEM, SeqItem(name, [title, bases]), 'fasta')
                   for name, title, bases in read_specs)
     pairs = list(group_seqs_in_pairs(reads))
     assert [get_str_seq(read) for read in pairs[0]] == ['A', 'C']
     assert [get_str_seq(read) for read in pairs[1]] == ['T', 'G']
     assert len(pairs) == 2
Beispiel #12
0
    def test_str_seq(self):
        'It gets the sequence str out of fasta and fastq SeqItems'
        # with fasta
        fasta_item = SeqItem(name='s1', lines=['>s1\n', 'ACTGGTAC\n'])
        fasta = SeqWrapper(SEQITEM, fasta_item, 'fasta')
        assert get_str_seq(fasta) == 'ACTGGTAC'

        # with fastq, the quality line is not part of the result
        fastq_lines = ['@seq\n', 'aaaa\n', '+\n', '????\n']
        fastq = SeqWrapper(SEQITEM, SeqItem(name='seq', lines=fastq_lines),
                           'fastq')
        assert get_str_seq(fastq) == 'aaaa'
Beispiel #13
0
    def test_pair_grouper(self):
        'It groups the reads with group_pairs'
        # default options
        reads = _build_some_paired_seqs()
        pairs = list(group_pairs(reads))

        assert [get_str_seq(read) for read in pairs[0]] == ['A', 'C']
        assert [get_str_seq(read) for read in pairs[1]] == ['T', 'G']
        assert len(pairs) == 2

        # groups of just one read keep the reads in the original order
        reads = _build_some_paired_seqs()
        grouper = group_pairs(reads, n_seqs_in_pair=1,
                              check_name_matches=True)
        singles = list(grouper)
        flattened = [get_str_seq(read) for single in singles
                     for read in single]
        assert flattened == ['A', 'C', 'T', 'G']
Beispiel #14
0
    def test_pair_grouper(self):
        'It groups the reads with group_pairs'
        # default options
        reads = _build_some_paired_seqs()
        pairs = list(group_pairs(reads))

        assert [get_str_seq(read) for read in pairs[0]] == ['A', 'C']
        assert [get_str_seq(read) for read in pairs[1]] == ['T', 'G']
        assert len(pairs) == 2

        # groups of just one read keep the reads in the original order
        reads = _build_some_paired_seqs()
        grouper = group_pairs(reads, n_seqs_in_pair=1,
                              check_name_matches=True)
        singles = list(grouper)
        flattened = [get_str_seq(read) for single in singles
                     for read in single]
        assert flattened == ['A', 'C', 'T', 'G']
Beispiel #15
0
    def test_n_seqs_check(self):
        'It checks the number of seqs in every pair, unless told otherwise'
        # without the last read the last pair is incomplete
        incomplete = _build_some_paired_seqs()[:-1]
        try:
            list(group_pairs(incomplete, n_seqs_in_pair=2))
        except InterleaveError:
            pass
        else:
            self.fail('InterleaveError expected')

        # with the check disabled the incomplete pair is returned as it is
        grouper = group_pairs(incomplete, n_seqs_in_pair=2,
                              check_all_same_n_seqs=False)
        pairs = list(grouper)
        assert [get_str_seq(read) for read in pairs[0]] == ['A', 'C']
        assert [get_str_seq(read) for read in pairs[1]] == ['T']
Beispiel #16
0
    def test_n_seqs_check(self):
        'It checks the number of seqs in every pair, unless told otherwise'
        # without the last read the last pair is incomplete
        incomplete = _build_some_paired_seqs()[:-1]
        try:
            list(group_pairs(incomplete, n_seqs_in_pair=2))
        except InterleaveError:
            pass
        else:
            self.fail('InterleaveError expected')

        # with the check disabled the incomplete pair is returned as it is
        grouper = group_pairs(incomplete, n_seqs_in_pair=2,
                              check_all_same_n_seqs=False)
        pairs = list(grouper)
        assert [get_str_seq(read) for read in pairs[0]] == ['A', 'C']
        assert [get_str_seq(read) for read in pairs[1]] == ['T']
Beispiel #17
0
    def test_pair_grouper(self):
        'It groups the reads in pairs with group_pairs_by_name'
        # interleaved reads
        reads = _build_some_paired_seqs()
        pairs = list(group_pairs_by_name(reads))
        assert [get_str_seq(read) for read in pairs[0]] == ['A', 'C']
        assert [get_str_seq(read) for read in pairs[1]] == ['T', 'G']
        assert len(pairs) == 2

        # the mates are not consecutive, so every read ends up alone
        shuffled = reads[0], reads[2], reads[1], reads[3]
        pairs = list(group_pairs_by_name(shuffled))
        assert [get_str_seq(read) for read in pairs[0]] == ['A']
        assert [get_str_seq(read) for read in pairs[1]] == ['T']
        assert [get_str_seq(read) for read in pairs[2]] == ['C']
        assert [get_str_seq(read) for read in pairs[3]] == ['G']
        assert len(pairs) == 4

        # the last read is missing, by default that is allowed
        truncated = _build_some_paired_seqs()[:-1]
        pairs = list(group_pairs_by_name(truncated))
        assert [get_str_seq(read) for read in pairs[0]] == ['A', 'C']
        assert [get_str_seq(read) for read in pairs[1]] == ['T']

        # but not when all the pairs are required to have the same size
        truncated = _build_some_paired_seqs()[:-1]
        try:
            list(group_pairs_by_name(truncated, all_pairs_same_n_seqs=True))
        except InterleaveError:
            pass
        else:
            self.fail('InterleaveError expected')
Beispiel #18
0
    def test_pair_grouper(self):
        'It groups the reads in pairs with group_pairs_by_name'
        # interleaved reads
        reads = _build_some_paired_seqs()
        pairs = list(group_pairs_by_name(reads))
        assert [get_str_seq(read) for read in pairs[0]] == ['A', 'C']
        assert [get_str_seq(read) for read in pairs[1]] == ['T', 'G']
        assert len(pairs) == 2

        # the mates are not consecutive, so every read ends up alone
        shuffled = reads[0], reads[2], reads[1], reads[3]
        pairs = list(group_pairs_by_name(shuffled))
        assert [get_str_seq(read) for read in pairs[0]] == ['A']
        assert [get_str_seq(read) for read in pairs[1]] == ['T']
        assert [get_str_seq(read) for read in pairs[2]] == ['C']
        assert [get_str_seq(read) for read in pairs[3]] == ['G']
        assert len(pairs) == 4

        # the last read is missing, by default that is allowed
        truncated = _build_some_paired_seqs()[:-1]
        pairs = list(group_pairs_by_name(truncated))
        assert [get_str_seq(read) for read in pairs[0]] == ['A', 'C']
        assert [get_str_seq(read) for read in pairs[1]] == ['T']

        # but not when all the pairs are required to have the same size
        truncated = _build_some_paired_seqs()[:-1]
        try:
            list(group_pairs_by_name(truncated, all_pairs_same_n_seqs=True))
        except InterleaveError:
            pass
        else:
            self.fail('InterleaveError expected')
Beispiel #19
0
def guess_seq_type(fhand):
    '''It guesses if the file is nucleotide or protein.

    Only the first chunk of the file (1024 characters are requested to
    peek_chunk_from_file) is parsed, so the guess is based on the sequences
    found in it.

    It returns 'prot' as soon as a letter that is valid for proteins, but
    not for DNA or RNA, is found and 'nucl' if more than 80% of the letters
    are one of gcatnuGCATNU.

    It raises an UnknownFormatError if the file is empty and a RuntimeError
    if the type can not be guessed, that includes a chunk with no letters.
    '''
    rna_dna = set(ambiguous_rna_letters).union(ambiguous_dna_letters)

    # letters found in proteins, but never in nucleic acids, a set makes the
    # membership test done for every letter cheap
    only_prot = set(extended_protein_letters).difference(rna_dna)

    chunk_size = 1024
    chunk = peek_chunk_from_file(fhand, chunk_size)
    if not chunk:
        raise UnknownFormatError('The file is empty')
    fhand_ = cStringIO.StringIO(chunk)
    total_letters = 0
    nucleotides = 0
    for seq in read_seqs([fhand_]):
        for letter in get_str_seq(seq):
            total_letters += 1
            if letter in 'gcatnuGCATNU':
                nucleotides += 1
            if letter in only_prot:
                return 'prot'

    # float() is required because with the Python 2 integer division the
    # frequency would be truncated to 0 unless all letters were nucleotides.
    # With no letters at all there is no frequency to calculate.
    if total_letters and float(nucleotides) / total_letters > 0.8:
        return 'nucl'

    raise RuntimeError('unable to guess the seq type')
Beispiel #20
0
def _annotate_polya(seq, min_len, max_cont_mismatches):
    '''It adds a polyA_sequence feature for the longest tail detected.

    Tails are looked for with _detect_polya_tail for THREE_PRIME (polyA) and
    for FIVE_PRIME (polyT). The longest one is annotated, with strand 1 for
    the polyA and -1 for the polyT. When both have the same length one of
    them is chosen at random. Nothing is added when no tail is found.
    '''
    sequence = get_str_seq(seq)
    tail_a = _detect_polya_tail(sequence, THREE_PRIME, min_len,
                                max_cont_mismatches)
    tail_t = _detect_polya_tail(sequence, FIVE_PRIME, min_len,
                                max_cont_mismatches)

    def _tail_len(tail):
        'Length of a (start, end) tail, 0 if no tail was detected'
        return tail[1] - tail[0] if tail else 0

    len_a, len_t = _tail_len(tail_a), _tail_len(tail_t)

    if len_a == len_t:
        if not len_a:
            # no tail at all, there is nothing to annotate
            return
        # a tie is broken by chance
        use_polya = bool(randint(0, 1))
    else:
        use_polya = len_a > len_t

    if use_polya:
        (start, end), strand = tail_a, 1
    else:
        (start, end), strand = tail_t, -1

    location = FeatureLocation(start, end, strand)
    # We're assuming that the seq has a SeqRecord in it
    seq.object.features.append(SeqFeature(location=location,
                                          type='polyA_sequence'))
Beispiel #21
0
def _annotate_polya(seq, min_len, max_cont_mismatches):
    '''It adds a polyA_sequence feature for the longest tail detected.

    Tails are looked for with _detect_polya_tail for THREE_PRIME (polyA) and
    for FIVE_PRIME (polyT). The longest one is annotated, with strand 1 for
    the polyA and -1 for the polyT. When both have the same length one of
    them is chosen at random. Nothing is added when no tail is found.
    '''
    sequence = get_str_seq(seq)
    tail_a = _detect_polya_tail(sequence, THREE_PRIME, min_len,
                                max_cont_mismatches)
    tail_t = _detect_polya_tail(sequence, FIVE_PRIME, min_len,
                                max_cont_mismatches)

    def _tail_len(tail):
        'Length of a (start, end) tail, 0 if no tail was detected'
        return tail[1] - tail[0] if tail else 0

    len_a, len_t = _tail_len(tail_a), _tail_len(tail_t)

    if len_a == len_t:
        if not len_a:
            # no tail at all, there is nothing to annotate
            return
        # a tie is broken by chance
        use_polya = bool(randint(0, 1))
    else:
        use_polya = len_a > len_t

    if use_polya:
        (start, end), strand = tail_a, 1
    else:
        (start, end), strand = tail_t, -1

    location = FeatureLocation(start, end, strand)
    # We're assuming that the seq has a SeqRecord in it
    seq.object.features.append(SeqFeature(location=location,
                                          type='polyA_sequence'))
Beispiel #22
0
def guess_seq_type(fhand):
    '''It guesses the file's seq type: 'nucl' or 'prot'.

    Only the first chunk of the file (1024 characters are requested to
    peek_chunk_from_file) is parsed, so the guess is based on the sequences
    found in it.

    It returns 'prot' as soon as a letter that is valid for proteins, but
    not for DNA or RNA, is found and 'nucl' if more than 80% of the letters
    are one of gcatnuGCATNU.

    It raises an UnknownFormatError if the file is empty and a RuntimeError
    if the type can not be guessed, that includes a chunk with no letters.
    '''
    rna_dna = set(ambiguous_rna_letters).union(ambiguous_dna_letters)

    # letters found in proteins, but never in nucleic acids, a set makes the
    # membership test done for every letter cheap
    only_prot = set(extended_protein_letters).difference(rna_dna)

    chunk_size = 1024
    chunk = peek_chunk_from_file(fhand, chunk_size)
    if not chunk:
        raise UnknownFormatError('The file is empty')
    fhand_ = cStringIO.StringIO(chunk)
    total_letters = 0
    nucleotides = 0
    for seq in read_seqs([fhand_]):
        for letter in get_str_seq(seq):
            total_letters += 1
            if letter in 'gcatnuGCATNU':
                nucleotides += 1
            if letter in only_prot:
                return 'prot'

    # float() is required because with the Python 2 integer division the
    # frequency would be truncated to 0 unless all letters were nucleotides.
    # With no letters at all there is no frequency to calculate.
    if total_letters and float(nucleotides) / total_letters > 0.8:
        return 'nucl'

    raise RuntimeError('unable to guess the seq type')
Beispiel #23
0
 def __call__(self, pair):
     'It returns a tuple with the str seq of every read in the pair.'
     def _key_part(read):
         'The seq of the read, cut to _use_length letters if it is set'
         str_seq = get_str_seq(read)
         if self._use_length is None:
             return str_seq
         return str_seq[:self._use_length]

     return tuple(_key_part(read) for read in pair)
Beispiel #24
0
 def __call__(self, pair):
     'It returns a tuple with the str seq of every read in the pair.'
     def _key_part(read):
         'The seq of the read, cut to _use_length letters if it is set'
         str_seq = get_str_seq(read)
         if self._use_length is None:
             return str_seq
         return str_seq[:self._use_length]

     return tuple(_key_part(read) for read in pair)
    def test_case_change(self):
        'It changes the seqs to upper, lower and swapped case'
        def changed_case(records, action):
            'It returns the str seqs once the action has been applied'
            wrapped = assing_kind_to_seqs(SEQRECORD, records, None)
            change_case = ChangeCase(action=action)
            return [get_str_seq(seq) for seq in change_case(wrapped)]

        # this record also carries per letter annotations
        annotated = SeqRecord(Seq('aCCg'),
                              letter_annotations={'dummy': 'dddd'})
        assert changed_case([annotated], UPPERCASE) == ['ACCG']

        assert changed_case([SeqRecord(Seq('aCCg'))], LOWERCASE) == ['accg']

        assert changed_case([SeqRecord(Seq('aCCg'))], SWAPCASE) == ['AccG']
Beispiel #26
0
 def _do_check(self, seq):
     '''It tells if the seq passes the filter.

     Empty seqs pass. A non empty seq passes only when it has at least one
     character different from N, n, - and *.
     '''
     str_seq = get_str_seq(seq)
     if not str_seq:
         return True
     # what is left once the uninformative characters are taken away
     informative = set(str_seq) - set(('N', 'n', '-', '*'))
     return bool(informative)
Beispiel #27
0
    def test_trim_chimeras_bin(self):
        '''It runs the trim_mp_chimeras binary with one and several threads.

        Every read written by the binary should be one of the expected
        seqs and at least one read should be written.
        '''
        trim_chimeras_bin = os.path.join(BIN_DIR, 'trim_mp_chimeras')
        assert 'usage' in check_output([trim_chimeras_bin, '-h'])
        index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
        query1 = '@seq2 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
        query1 += 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
        query1 += '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
        query1 += '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
        query2 = '@seq2 r\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG\n'
        query2 += '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
        query = query1 + query2
        expected_seqs = [
            'GGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT',
            'CATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG'
        ]

        in_fhand = NamedTemporaryFile()
        out_fhand = NamedTemporaryFile()
        # The temporary files are closed, and so removed, even if an
        # assertion fails
        try:
            in_fhand.write(query)
            in_fhand.flush()

            base_cmd = [
                trim_chimeras_bin, in_fhand.name, '-r', index_fpath, '-o',
                out_fhand.name
            ]
            # First with the default options and then with several threads
            for extra_args in ([], ['-p', '2']):
                check_output(base_cmd + extra_args, stdin=in_fhand)
                counts = 0
                # The result is read inside the with, so the file handle is
                # not leaked
                with open(out_fhand.name) as result_fhand:
                    for seq in read_seqs([result_fhand]):
                        assert get_str_seq(seq) in expected_seqs
                        counts += 1
                assert counts != 0
        finally:
            in_fhand.close()
            out_fhand.close()
    def test_split_mate(self):
        'It splits a seq using the segments covered by the linker'
        # pylint: disable=W0212
        read = SeqWrapper(SEQRECORD,
                          SeqRecord(Seq('aaatttccctt'), id='seq'), None)
        # the splitter is only built to get access to the private method
        splitter = MatePairSplitter([read])
        split = splitter._split_by_mate_linker

        # linker at the beginning, one part is returned with the same name
        parts = split(read, ([(0, 3)], False))
        assert get_str_seq(parts[0]) == 'ttccctt'
        assert get_name(parts[0]) == 'seq'

        # linker at the end
        parts = split(read, ([(7, 11)], False))
        assert get_str_seq(parts[0]) == 'aaatttc'
        assert get_name(parts[0]) == 'seq'

        # linker in the middle, with True the parts are named _pl.partN
        parts = split(read, ([(4, 7)], True))
        assert get_str_seq(parts[0]) == 'aaat'
        assert get_str_seq(parts[1]) == 'ctt'
        assert get_name(parts[0]) == 'seq_pl.part1'
        assert get_name(parts[1]) == 'seq_pl.part2'

        # the same segment with False, the parts are named \1 and \2
        parts = split(read, ([(4, 7)], False))
        assert get_name(parts[0]) == r'seq\1'
        assert get_name(parts[1]) == r'seq\2'

        # two segments, three parts named _mlc.partN
        parts = split(read, ([(4, 6), (8, 9)], False))
        assert get_str_seq(parts[0]) == 'aaat'
        assert get_str_seq(parts[1]) == 'c'
        assert get_str_seq(parts[2]) == 't'
        assert get_name(parts[0]) == 'seq_mlc.part1'

        # all sequence is linker
        parts = split(read, ([(0, 10)], False))
        assert not get_str_seq(parts[0])

        # without segments the seq is returned untouched
        parts = split(read, ([], False))
        assert get_name(read) == get_name(parts[0])
        assert get_str_seq(read) == get_str_seq(parts[0])
    def test_case_change(self):
        'It changes the seqs to upper, lower and swapped case'
        def changed_case(records, action):
            'It returns the str seqs once the action has been applied'
            wrapped = assing_kind_to_seqs(SEQRECORD, records, None)
            change_case = ChangeCase(action=action)
            return [get_str_seq(seq) for seq in change_case(wrapped)]

        # this record also carries per letter annotations
        annotated = SeqRecord(Seq('aCCg'),
                              letter_annotations={'dummy': 'dddd'})
        assert changed_case([annotated], UPPERCASE) == ['ACCG']

        assert changed_case([SeqRecord(Seq('aCCg'))], LOWERCASE) == ['accg']

        assert changed_case([SeqRecord(Seq('aCCg'))], SWAPCASE) == ['AccG']
    def test_split_mate(self):
        'It splits a seq using the segments covered by the linker'
        # pylint: disable=W0212
        read = SeqWrapper(SEQRECORD,
                          SeqRecord(Seq('aaatttccctt'), id='seq'), None)
        # the splitter is only built to get access to the private method
        splitter = MatePairSplitter([read])
        split = splitter._split_by_mate_linker

        # linker at the beginning, one part is returned with the same name
        parts = split(read, ([(0, 3)], False))
        assert get_str_seq(parts[0]) == 'ttccctt'
        assert get_name(parts[0]) == 'seq'

        # linker at the end
        parts = split(read, ([(7, 11)], False))
        assert get_str_seq(parts[0]) == 'aaatttc'
        assert get_name(parts[0]) == 'seq'

        # linker in the middle, with True the parts are named _pl.partN
        parts = split(read, ([(4, 7)], True))
        assert get_str_seq(parts[0]) == 'aaat'
        assert get_str_seq(parts[1]) == 'ctt'
        assert get_name(parts[0]) == 'seq_pl.part1'
        assert get_name(parts[1]) == 'seq_pl.part2'

        # the same segment with False, the parts are named \1 and \2
        parts = split(read, ([(4, 7)], False))
        assert get_name(parts[0]) == r'seq\1'
        assert get_name(parts[1]) == r'seq\2'

        # two segments, three parts named _mlc.partN
        parts = split(read, ([(4, 6), (8, 9)], False))
        assert get_str_seq(parts[0]) == 'aaat'
        assert get_str_seq(parts[1]) == 'c'
        assert get_str_seq(parts[2]) == 't'
        assert get_name(parts[0]) == 'seq_mlc.part1'

        # all sequence is linker
        parts = split(read, ([(0, 10)], False))
        assert not get_str_seq(parts[0])

        # without segments the seq is returned untouched
        parts = split(read, ([], False))
        assert get_name(read) == get_name(parts[0])
        assert get_str_seq(read) == get_str_seq(parts[0])
Beispiel #31
0
    def _do_check(self, seq):
        '''It tells if the length of the seq is within the min and max limits.

        A limit set to None is not checked. If ignore_masked is set the
        length is the one given by uppercase_length for the str seq,
        otherwise the one given by get_length.
        '''
        lower, upper = self.min, self.max
        if self.ignore_masked:
            length = uppercase_length(get_str_seq(seq))
        else:
            length = get_length(seq)

        if lower is not None and length < lower:
            return False
        if upper is not None and length > upper:
            return False
        return True
Beispiel #32
0
    def _do_check(self, seq):
        '''It tells if the length of the seq is within the min and max limits.

        A limit set to None is not checked. If ignore_masked is set the
        length is the one given by uppercase_length for the str seq,
        otherwise the one given by get_length.
        '''
        lower, upper = self.min, self.max
        if self.ignore_masked:
            length = uppercase_length(get_str_seq(seq))
        else:
            length = get_length(seq)

        if lower is not None and length < lower:
            return False
        if upper is not None and length > upper:
            return False
        return True
Beispiel #33
0
    def test_name_check(self):
        'It checks that the names of the reads in a group match'
        # the four reads in one group is an error with the default options
        reads = _build_some_paired_seqs()
        try:
            list(group_pairs(reads, n_seqs_in_pair=4))
        except InterleaveError:
            pass
        else:
            self.fail('InterleaveError expected')

        # without the name check they are grouped together
        reads = _build_some_paired_seqs()
        grouper = group_pairs(reads, n_seqs_in_pair=4,
                              check_name_matches=False)
        groups = list(grouper)
        first_group = [get_str_seq(read) for read in groups[0]]
        assert first_group == ['A', 'C', 'T', 'G']
Beispiel #34
0
    def test_name_check(self):
        'It checks that the names of the reads in a group match'
        # the four reads in one group is an error with the default options
        reads = _build_some_paired_seqs()
        try:
            list(group_pairs(reads, n_seqs_in_pair=4))
        except InterleaveError:
            pass
        else:
            self.fail('InterleaveError expected')

        # without the name check they are grouped together
        reads = _build_some_paired_seqs()
        grouper = group_pairs(reads, n_seqs_in_pair=4,
                              check_name_matches=False)
        groups = list(grouper)
        first_group = [get_str_seq(read) for read in groups[0]]
        assert first_group == ['A', 'C', 'T', 'G']
Beispiel #35
0
    def test_trim_seqs():
        'It trims the lowercased regions of the seqs'
        records = [SeqWrapper(SEQRECORD, SeqRecord(Seq(str_seq)), None)
                   for str_seq in ('aaCTTTC', 'CTTCaa', 'aaCTCaa', 'actg',
                                   'AC')]

        trim_lowercased_seqs = TrimLowercasedLetters()
        trim = TrimOrMask()
        # pylint: disable=W0141

        def trimmed_strs(reads):
            'It returns the str seqs left once the reads have been trimmed'
            return [get_str_seq(read)
                    for read in trim(trim_lowercased_seqs(reads))]

        # the all-lowercase seq is not expected in the result
        assert trimmed_strs(records) == ['CTTTC', 'CTTC', 'CTC', 'AC']

        # a SeqItem goes through the very same trimmer instances
        item = SeqItem('s', ['>s\n', 'aaCTTTC\n'])
        assert trimmed_strs([SeqWrapper(SEQITEM, item, 'fasta')]) == ['CTTTC']
Beispiel #36
0
    def test_trimming(self):
        'It trims a seq following its trimming recommendations.'
        record = SeqRecord(Seq('gggtctcatcatcaggg'.upper()),
                           annotations={TRIMMING_RECOMMENDATIONS: {}})
        seq = SeqWrapper(SEQRECORD, record, None)
        # the very same packet is given to the trimmer in every run
        packet = {SEQS_PASSED: [[seq]], ORPHAN_SEQS: []}

        recommendations = get_annotations(seq)[TRIMMING_RECOMMENDATIONS]
        trimmer = TrimOrMask()

        def trim_with(**segments):
            'It sets the given recommendations in the seq and trims'
            recommendations.update(segments)
            # The annotation is set again before every run
            # NOTE(review): presumably because the trimmer can remove it from
            # the seq - confirm against TrimOrMask
            get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = recommendations
            return trimmer(packet)

        def passed_seqs(result):
            'It returns the str seqs of all the reads that passed'
            return [get_str_seq(read) for group in result[SEQS_PASSED]
                    for read in group]

        result = trim_with(vector=[(0, 3), (8, 13)])
        assert passed_seqs(result) == ['CTCA']

        result = trim_with(vector=[(0, 0), (8, 13)])
        assert passed_seqs(result) == ['GGTCTCA']

        # vector and quality segments together leave no seq in the result
        result = trim_with(vector=[(0, 1), (8, 12)],
                           quality=[(1, 8), (13, 17)])
        assert not result[SEQS_PASSED]

        result = trim_with(vector=[(0, 0), (8, 13)], quality=[])
        assert passed_seqs(result) == ['GGTCTCA']
        # the recommendations should be gone from the trimmed seq
        survivor = result[SEQS_PASSED][0][0]
        assert TRIMMING_RECOMMENDATIONS not in get_annotations(survivor)
Beispiel #37
0
    def test_trimming(self):
        'The sequences are trimmed according to the recommendations.'
        upper_seq = 'gggtctcatcatcaggg'.upper()
        annotations = {TRIMMING_RECOMMENDATIONS: {}}
        seq = SeqWrapper(SEQRECORD,
                         SeqRecord(Seq(upper_seq), annotations=annotations),
                         None)
        trim_packet = {SEQS_PASSED: [[seq]], ORPHAN_SEQS: []}

        trim_rec = get_annotations(seq)[TRIMMING_RECOMMENDATIONS]
        seq_trimmer = TrimOrMask()

        def run_trimmer(vector, quality=None):
            'It sets the recommendations, trims and returns the new packet.'
            trim_rec['vector'] = vector
            if quality is not None:
                trim_rec['quality'] = quality
            get_annotations(seq)[TRIMMING_RECOMMENDATIONS] = trim_rec
            return seq_trimmer(trim_packet)

        def str_seqs_in(packet):
            'It flattens the passed groups into a list of str seqs.'
            str_seqs = []
            for group in packet[SEQS_PASSED]:
                str_seqs.extend(get_str_seq(read) for read in group)
            return str_seqs

        # Only vector recommendations.
        assert str_seqs_in(run_trimmer([(0, 3), (8, 13)])) == ['CTCA']
        assert str_seqs_in(run_trimmer([(0, 0), (8, 13)])) == ['GGTCTCA']

        # Vector plus quality recommendations: nothing is expected to pass.
        trim_packet2 = run_trimmer([(0, 1), (8, 12)],
                                   quality=[(1, 8), (13, 17)])
        assert not trim_packet2[SEQS_PASSED]

        # An empty quality recommendation.
        trim_packet2 = run_trimmer([(0, 0), (8, 13)], quality=[])
        assert str_seqs_in(trim_packet2) == ['GGTCTCA']
        trimmed_seq = trim_packet2[SEQS_PASSED][0][0]
        assert TRIMMING_RECOMMENDATIONS not in get_annotations(trimmed_seq)
Beispiel #38
0
    def test_trim_chimeras_bin(self):
        'It runs the trim_mp_chimeras binary and checks the seqs it writes.'
        trim_chimeras_bin = os.path.join(BIN_DIR, 'trim_mp_chimeras')
        assert 'usage' in check_output([trim_chimeras_bin, '-h'])
        index_fpath = os.path.join(TEST_DATA_DIR, 'ref_example.fasta')
        query1 = '@seq2 f\nGGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT'
        query1 += 'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n'
        query1 += '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$'
        query1 += '$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
        query2 = '@seq2 r\nCATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG\n'
        query2 += '+\n$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$\n'
        query = query1 + query2
        in_fhand = NamedTemporaryFile()
        in_fhand.write(query)
        in_fhand.flush()

        out_fhand = NamedTemporaryFile()
        expected_seqs = ['GGGATCGCAGACCCATCTCGTCAGCATGTACCCTTGCTACATTGAACTT',
                         'CATCATTGCATAAGTAACACTCAACCAACAGTGCTACAGGGTTGTAACG']

        def count_expected_seqs():
            'It counts the written seqs, checking that each one is expected.'
            counts = 0
            # The result file is closed as soon as it has been read instead
            # of leaving the handle open until garbage collection.
            with open(out_fhand.name) as result_fhand:
                for seq in read_seqs([result_fhand]):
                    assert get_str_seq(seq) in expected_seqs
                    counts += 1
            return counts

        cmd = [trim_chimeras_bin, in_fhand.name, '-r', index_fpath,
               '-o', out_fhand.name]
        check_output(cmd, stdin=in_fhand)
        assert count_expected_seqs() != 0

        # With several threads
        cmd = [trim_chimeras_bin, in_fhand.name, '-r', index_fpath,
               '-o', out_fhand.name, '-p', '2']
        check_output(cmd, stdin=in_fhand)
        assert count_expected_seqs() != 0
Beispiel #39
0
    def __call__(self, filter_packet):
        '''It splits the passed pairs according to their duplication status.

        Every pair is identified by the tuple of the str seqs of its reads
        and it is remembered in self._prev_pairs. A pair already seen is
        filtered out, or kept instead when self.reverse is set.
        '''
        kept = []
        rejected = filter_packet[SEQS_FILTERED_OUT][:]
        keep_duplicates = bool(self.reverse)
        for pair in filter_packet[SEQS_PASSED]:
            pair_key = tuple(get_str_seq(seq) for seq in pair)
            seen_before = pair_key in self._prev_pairs
            self._prev_pairs.add(pair_key)

            if seen_before == keep_duplicates:
                kept.append(pair)
            else:
                rejected.append(pair)

        return {SEQS_PASSED: kept, SEQS_FILTERED_OUT: rejected}
Beispiel #40
0
    def __call__(self, filter_packet):
        '''It filters the pairs whose str seqs have been seen before.

        The result is a new packet: the pairs that pass go to SEQS_PASSED
        and the rest are added to a copy of the SEQS_FILTERED_OUT list.
        With self.reverse set the already seen pairs are the ones that pass.
        '''
        passed_pairs = []
        failed_pairs = filter_packet[SEQS_FILTERED_OUT][:]
        for pair in filter_packet[SEQS_PASSED]:
            str_seqs = tuple(get_str_seq(seq) for seq in pair)
            is_repeated = str_seqs in self._prev_pairs
            self._prev_pairs.add(str_seqs)

            if self.reverse:
                passes = is_repeated
            else:
                passes = not is_repeated

            destination = passed_pairs if passes else failed_pairs
            destination.append(pair)

        return {SEQS_PASSED: passed_pairs, SEQS_FILTERED_OUT: failed_pairs}
Beispiel #41
0
 def __call__(self, seqs):
     '''It changes the case of the seqrecords.

     It returns a list with a copy of every seq whose str seq has been
     converted according to self.action. It raises NotImplementedError
     when a seq is processed and the action is not one of UPPERCASE,
     LOWERCASE or SWAPCASE.
     '''
     action = self.action

     def _recase(text):
         'It applies the configured case action to one str seq.'
         if action == UPPERCASE:
             return text.upper()
         if action == LOWERCASE:
             return text.lower()
         if action == SWAPCASE:
             return text.swapcase()
         raise NotImplementedError()

     return [copy_seq(seq, seq=_recase(get_str_seq(seq))) for seq in seqs]
Beispiel #42
0
 def __call__(self, seqs):
     '''It changes the case of the seqrecords.

     The str seq of every seq is upper-cased, lower-cased or case-swapped
     depending on self.action and a copy with the new str seq is added to
     the returned list. An unknown action raises NotImplementedError as
     soon as the first seq is processed.
     '''
     action = self.action
     # action -> name of the str method that does the conversion
     converters = ((UPPERCASE, 'upper'),
                   (LOWERCASE, 'lower'),
                   (SWAPCASE, 'swapcase'))
     recased_seqs = []
     for seq in seqs:
         text = get_str_seq(seq)
         for known_action, method_name in converters:
             if action == known_action:
                 text = getattr(text, method_name)()
                 break
         else:
             raise NotImplementedError()
         recased_seqs.append(copy_seq(seq, seq=text))
     return recased_seqs
Beispiel #43
0
    def _do_trim(self, seq):
        '''It adds the trim segments that surround the kept segment.

        The kept segment is the one that get_longest_segment selects among
        the segments returned by get_uppercase_segments for the str seq.
        Everything to its left and to its right is registered with
        _add_trim_segments (kind OTHER). When no segment is selected the
        whole sequence is registered for trimming.

        The segments use inclusive end indexes. The same seq is returned.
        '''
        str_seq = get_str_seq(seq)
        last_index = len(str_seq) - 1
        segment = get_longest_segment(get_uppercase_segments(str_seq))
        if segment is None:
            # The limits must come from the str seq, like in the other
            # branch: len(seq) would measure the wrapper object, not the
            # sequence, and it is not an inclusive end index.
            segments = [(0, max(last_index, 0))]
        else:
            segments = []
            if segment[0] != 0:
                segments.append((0, segment[0] - 1))
            if segment[1] != last_index:
                segments.append((segment[1] + 1, last_index))

        _add_trim_segments(segments, seq, kind=OTHER)
        return seq
Beispiel #44
0
    def __call__(self, seqs):
        '''It trims the masked segments of the seqrecords.

        For every seq the regions at both sides of the segment chosen by
        get_longest_segment are registered with _add_trim_segments. The
        seqs for which no segment is chosen are left out of the result.
        '''
        trimmed_seqs = []
        for seq in seqs:
            str_seq = get_str_seq(seq)
            longest = get_longest_segment(get_uppercase_segments(str_seq))
            if longest is None:
                continue

            to_trim = []
            # region to the left of the kept segment
            if longest[0] != 0:
                to_trim.append((0, longest[0] - 1))
            # region to the right of the kept segment
            final_index = len(str_seq) - 1
            if longest[1] != final_index:
                to_trim.append((longest[1] + 1, final_index))

            _add_trim_segments(to_trim, seq, kind=OTHER)
            trimmed_seqs.append(seq)
        return trimmed_seqs
Beispiel #45
0
    def _do_trim(self, seq):
        '''It registers for trimming everything but the kept segment.

        get_uppercase_segments is run on the str seq and the segment picked
        by get_longest_segment is kept. The regions before and after it are
        passed to _add_trim_segments with kind OTHER. If no segment is
        picked the complete sequence is registered for trimming.

        All the segments have inclusive limits. It returns the given seq.
        '''
        str_seq = get_str_seq(seq)
        len_seq = len(str_seq)
        unmasked_segments = get_uppercase_segments(str_seq)
        segment = get_longest_segment(unmasked_segments)

        if segment is not None:
            segments = []
            if segment[0] != 0:
                segments.append((0, segment[0] - 1))
            if segment[1] != len_seq - 1:
                segments.append((segment[1] + 1, len_seq - 1))
        else:
            # Whole sequence: the end is the last index of the str seq.
            # len(seq) is the length of the wrapper object and it would
            # not be an inclusive limit anyway.
            segments = [(0, max(len_seq - 1, 0))]

        _add_trim_segments(segments, seq, kind=OTHER)
        return seq
Beispiel #46
0
def _mask_sequence(seq, segments):
    '''It masks the given segments of the sequence

    The given segments are merged and completed with get_all_segments, the
    flagged ones are lowercased and a copy of the seq with the resulting
    sequence is returned. Without segments the seq is returned untouched.
    '''
    if not segments:
        return seq

    merged = merge_overlaping_segments(segments)
    all_segments = get_all_segments(merged, get_length(seq))
    str_seq = get_str_seq(seq)

    pieces = []
    for segment in all_segments:
        limits = segment[0]
        # the segment limits are inclusive
        piece = str_seq[limits[0]:limits[1] + 1]
        pieces.append(piece.lower() if segment[1] else piece)
    masked = ''.join(pieces)

    if seq.kind == SEQRECORD:
        masked = Seq(masked, alphabet=seq.object.seq.alphabet)
    return copy_seq(seq, seq=masked)
Beispiel #47
0
def _read_estcan_result(fhand, result, file_type):
    '''It reads a dna or pep ESTscan result file

    The result dict is filled in place:
    result[seqid][(start, end, strand)][file_type] = str seq.
    '''
    for seq in read_seqs([fhand], file_format='fasta'):
        fields = [field.strip() for field in get_description(seq).split(';')]
        if 'minus strand' in fields:
            strand = -1
        else:
            strand = 1
        start, end = fields[0].split(' ', 3)[1:3]
        # estscan changes the name, we have to fix it
        seqid = get_name(seq).strip(';')
        seq_orfs = result.setdefault(seqid, {})
        orf = seq_orfs.setdefault((int(start), int(end), strand), {})
        orf[file_type] = get_str_seq(seq)
Beispiel #48
0
def _mask_sequence(seq, segments):
    '''It masks the given segments of the sequence

    It returns the given seq when there are no segments. Otherwise it
    returns a copy of the seq in which the segments flagged by
    get_all_segments have been lowercased.
    '''
    if not segments:
        return seq

    segments = get_all_segments(merge_overlaping_segments(segments),
                                get_length(seq))
    str_seq = get_str_seq(seq)

    def _masked_piece(segment):
        'It returns the piece for the segment, lowercased if flagged.'
        start = segment[0][0]
        stop = segment[0][1] + 1     # inclusive limit -> slice limit
        piece = str_seq[start:stop]
        if segment[1]:
            return piece.lower()
        return piece

    new_seq = ''.join([_masked_piece(segment) for segment in segments])
    if seq.kind == SEQRECORD:
        new_seq = Seq(new_seq, alphabet=seq.object.seq.alphabet)
    return copy_seq(seq, seq=new_seq)
Beispiel #49
0
def _read_estcan_result(fhand, result, file_type):
    '''It reads a dna or pep ESTscan result file

    Every seq read is stored in the given result, indexed first by the seq
    name, then by the (start, end, strand) tuple taken from its description
    and finally by the given file_type.
    '''
    for seq in read_seqs([fhand], file_format='fasta'):
        description = get_description(seq)
        items = [item.strip() for item in description.split(';')]
        strand = -1 if 'minus strand' in items else 1
        location = items[0].split(' ', 3)[1:3]
        start, end = location
        # estscan changes the name, we have to fix it
        seqid = get_name(seq).strip(';')
        try:
            orfs_for_seq = result[seqid]
        except KeyError:
            orfs_for_seq = result[seqid] = {}
        orf_key = (int(start), int(end), strand)
        if orf_key not in orfs_for_seq:
            orfs_for_seq[orf_key] = {}
        orf = orfs_for_seq[orf_key]
        orf[file_type] = get_str_seq(seq)
Beispiel #50
0
def calculate_dust_score(seq):
    '''It returns the dust score.

    From: "A Fast and Symmetric DUST Implementation to Mask Low-Complexity DNA
    Sequences"
    doi:10.1089/cmb.2006.13.1028

    and re-implemented from PRINSEQ

    It returns 0 for a sequence of length 3 and None for any other sequence
    of length 5 or less. For longer sequences the raw score of every window
    (DUST_WINDOWSIZE and DUST_WINDOWSTEP settings) and of the remaining
    sequence is calculated and their mean, rescaled by 100 / 31, is
    returned.
    '''
    seq = get_str_seq(seq)
    length = len(seq)
    if length == 3:
        return 0
    if length <= 5:
        return None

    windowsize = get_setting('DUST_WINDOWSIZE')
    windowstep = get_setting('DUST_WINDOWSTEP')

    dustscores = []
    if length > windowsize:
        windows = 0
        for seq_in_win in rolling_window(seq, windowsize, windowstep):
            score = _calculate_rawscore(seq_in_win)
            dustscores.append(score / (windowsize - 2))
            windows += 1
        remaining_seq = seq[windows * windowstep:]
    else:
        remaining_seq = seq

    # The length of the remaining sequence is what has to be checked.
    # Comparing the string itself with 5 is always true in Python 2 (and
    # a TypeError in Python 3), which lets short remainders reach the
    # divisions by (length - 3) and (length - 2) below.
    if len(remaining_seq) > 5:
        length = len(remaining_seq)
        score = _calculate_rawscore(remaining_seq)
        dustscore = score / (length - 3) * (windowsize - 2) / (length - 2)
        dustscores.append(dustscore)

    # max score should be 100 not 31
    dustscore = sum(dustscores) / len(dustscores) * 100 / 31
    return dustscore
Beispiel #51
0
    def test_edge_trimming(self):
        'It trims the edges'
        def passed_strs(packet):
            'It returns the str seqs of all the seqs found in SEQS_PASSED.'
            return [get_str_seq(seq)
                    for group in packet[SEQS_PASSED] for seq in group]

        def check_cases(trim, cases):
            'It trims self._some_seqs() with every edge setup in the cases.'
            for edges, expected in cases:
                trim_edges = TrimEdges(**edges)
                trim_packet = trim(trim_edges(self._some_seqs()))
                assert passed_strs(trim_packet) == expected

        # The edges are removed
        check_cases(TrimOrMask(), [
            (dict(left=1), ['CCG', 'AACCCGGG']),
            (dict(right=1), ['ACC', 'AAACCCGG']),
            (dict(left=1, right=1), ['CC', 'AACCCGG']),
            (dict(left=2, right=2), ['ACCCG']),
            (dict(left=3, right=3), ['CCC']),
        ])

        # The edges are masked
        trim = TrimOrMask(mask=True)
        check_cases(trim, [
            (dict(left=1), ['aCCG', 'aAACCCGGG']),
            (dict(right=1), ['ACCg', 'AAACCCGGg']),
            (dict(left=1, right=1), ['aCCg', 'aAACCCGGg']),
            (dict(left=2, right=2), ['accg', 'aaACCCGgg']),
            (dict(left=3, right=3), ['accg', 'aaaCCCggg']),
        ])

        # test overlapping mask
        trim1 = TrimEdges(left=3, right=3)
        trim2 = TrimEdges(left=4, right=4)
        trim_packet = trim(trim2(trim1(self._some_seqs())))
        assert passed_strs(trim_packet) == ['accg', 'aaacCcggg']

        # With a SeqItem
        trim_edges = TrimEdges(left=1, right=1)
        seqitem_cases = [(TrimOrMask(mask=False), ['CTTT']),
                         (TrimOrMask(mask=True), ['aCTTTc'])]
        for trim, expected in seqitem_cases:
            seq = SeqItem('s', ['>s\n', 'ACTTTC\n'])
            seqs = [[SeqWrapper(SEQITEM, seq, 'fasta')]]
            trim_packet = {SEQS_PASSED: seqs, ORPHAN_SEQS: []}
            trim_packet = trim(trim_edges(trim_packet))
            assert passed_strs(trim_packet) == expected
Beispiel #52
0
from crumbs.filters import (FilterByLength, FilterById, FilterByQuality,
                            FilterBlastMatch, FilterDustComplexity,
                            seq_to_filterpackets, FilterByRpkm, FilterByBam,
                            FilterBowtie2Match, FilterByFeatureTypes)
from crumbs.utils.bin_utils import BIN_DIR
from crumbs.utils.test_utils import TEST_DATA_DIR
from crumbs.utils.tags import (NUCL, SEQS_FILTERED_OUT, SEQS_PASSED, SEQITEM,
                               SEQRECORD)
from crumbs.utils.file_utils import TemporaryDir
from crumbs.seq import get_name, get_str_seq, SeqWrapper
from crumbs.mapping import get_or_create_bowtie2_index
from crumbs.seqio import read_seq_packets


def _seqs_to_names(seqs):
    'It returns the names of all the seqs found in the given groups.'
    return [get_name(seq) for group in seqs for seq in group]


def _seqs_to_str_seqs(seqs):
    'It returns the str seqs of all the seqs found in the given groups.'
    return [get_str_seq(seq) for group in seqs for seq in group]


class PacketConversionTest(unittest.TestCase):
    'It tests the seqs and filter packet conversion'
    def test_seqs_to_filter_packets(self):
        'It converts seq packets into filter packets'
        packets = list(seq_to_filterpackets(iter([['ACT'], ['CTG', 'TTT']])))

        # every seq ends up alone in its own group
        passed = [packet[SEQS_PASSED] for packet in packets]
        assert passed == [[['ACT']], [['CTG'], ['TTT']]]

        # nothing has been filtered out yet
        filtered_out = [packet[SEQS_FILTERED_OUT] for packet in packets]
        assert filtered_out == [[], []]


def _create_seqrecord(string):
    'Given a string it returns a SeqRecord'
Beispiel #53
0
    def test_no_name(self):
        'It groups by name when an extra seq is mixed with the paired seqs.'
        def check_groups(seqs, expected):
            'It checks the str seqs of the first groups that are created.'
            paired_seqs = list(group_pairs_by_name(seqs))
            found = [[get_str_seq(s) for s in group]
                     for group in paired_seqs[:len(expected)]]
            assert found == expected

        extra = SeqWrapper(SEQITEM, SeqItem('s', ['>s\n', 'N\n']), 'fasta')

        # the extra seq splits the second pair
        seqs = _build_some_paired_seqs()
        seqs = seqs[0], seqs[1], seqs[2], extra, seqs[3]
        check_groups(seqs, [['A', 'C'], ['T'], ['N'], ['G']])

        # the extra seq splits the first pair
        seqs = _build_some_paired_seqs()
        seqs = seqs[0], extra, seqs[1], seqs[2], seqs[3]
        check_groups(seqs, [['A'], ['N'], ['C'], ['T', 'G']])

        # the extra seq goes before both pairs
        seqs = _build_some_paired_seqs()
        seqs = extra, seqs[0], seqs[1], seqs[2], seqs[3]
        check_groups(seqs, [['N'], ['A', 'C'], ['T', 'G']])
Beispiel #54
0
    def test_no_name(self):
        'It checks group_pairs_by_name with an extra seq between the pairs.'
        def strs(group):
            'It returns the str seqs of the seqs in the group.'
            return [get_str_seq(s) for s in group]

        paired = _build_some_paired_seqs()
        lone_seq = SeqWrapper(SEQITEM, SeqItem('s', ['>s\n', 'N\n']), 'fasta')

        # lone seq in the middle of the second pair
        mixed = paired[0], paired[1], paired[2], lone_seq, paired[3]
        groups = list(group_pairs_by_name(mixed))
        assert strs(groups[0]) == ['A', 'C']
        assert strs(groups[1]) == ['T']
        assert strs(groups[2]) == ['N']
        assert strs(groups[3]) == ['G']

        # lone seq in the middle of the first pair
        paired = _build_some_paired_seqs()
        mixed = paired[0], lone_seq, paired[1], paired[2], paired[3]
        groups = list(group_pairs_by_name(mixed))
        assert strs(groups[0]) == ['A']
        assert strs(groups[1]) == ['N']
        assert strs(groups[2]) == ['C']
        assert strs(groups[3]) == ['T', 'G']

        # lone seq first
        paired = _build_some_paired_seqs()
        mixed = lone_seq, paired[0], paired[1], paired[2], paired[3]
        groups = list(group_pairs_by_name(mixed))
        assert strs(groups[0]) == ['N']
        assert strs(groups[1]) == ['A', 'C']
        assert strs(groups[2]) == ['T', 'G']
    def test_bin_transcrip_orientator(self):
        'it tests the transcript orientator binary'
        def load(fpath):
            'It reads all the seqs in the file asking for SEQRECORDs.'
            return list(read_seqs([open(fpath)],
                                  prefered_seq_classes=[SEQRECORD]))

        def plain(seq):
            'It returns the sequence as a str.'
            return str(seq.object.seq)

        def revcomp(seq):
            'It returns the reverse complement of the sequence as a str.'
            return str(seq.object.seq.reverse_complement())

        def check_orientation(init_seqs, out_seqs, indexes, reversed_idxs):
            'The output seqs match the input ones, reversed where given.'
            for index in indexes:
                if index in reversed_idxs:
                    out_str = revcomp(out_seqs[index])
                else:
                    out_str = plain(out_seqs[index])
                assert plain(init_seqs[index]) == out_str

        orientate_bin = os.path.join(BIN_DIR, 'orientate_transcripts')
        assert 'usage' in check_output([orientate_bin, '-h'])

        in_fpath = os.path.join(TEST_DATA_DIR, 'seqs_to_orientate.fasta')
        estscan_matrix = os.path.join(TEST_DATA_DIR,
                                      'Arabidopsis_thaliana.smat')
        blastdb1 = os.path.join(TEST_DATA_DIR, 'blastdbs', 'arabidopsis_genes')
        blastdb2 = os.path.join(TEST_DATA_DIR, 'blastdbs', 'calabaza')

        # options shared by all the runs that use estscan and both blastdbs
        base_cmd = [orientate_bin, '-u', estscan_matrix, '-d', blastdb1, '-d',
                    blastdb2, '-g', 'blastn', '-g', 'blastn', '-v', '0.0001']

        # with all the annotators
        out_fhand = NamedTemporaryFile()
        check_output(base_cmd + ['-v', '0.0001', in_fpath, '-o',
                                 out_fhand.name, '--polya_min_len', '4'])
        out_seqs = load(out_fhand.name)
        init_seqs = load(in_fpath)

        assert get_str_seq(init_seqs[0]) == get_str_seq(out_seqs[0])
        check_orientation(init_seqs, out_seqs, (1, 3, 4, 5, 6), (1, 4, 6))
        assert 'polyA' in out_seqs[1].object.description
        assert 'estscan_orf' in out_seqs[4].object.description
        assert 'blast arabidopsis_genes' in out_seqs[6].object.description

        # one '-v' value is missing, so the binary has to fail
        stderr = NamedTemporaryFile()
        try:
            check_output(base_cmd + [in_fpath], stderr=stderr)
        except CalledProcessError:
            stde = open(stderr.name).read()
            assert 'Blast parameters are not well defined' in stde
        else:
            self.fail()

        # without parameters
        out_fhand = NamedTemporaryFile()
        check_output([orientate_bin, in_fpath, '-o', out_fhand.name,
                      '--polya_min_len', '4'])
        out_seqs = load(out_fhand.name)
        init_seqs = load(in_fpath)
        check_orientation(init_seqs, out_seqs, (0, 1, 3, 4, 5, 6), (1,))

        # only with orf annotator
        check_output([orientate_bin, in_fpath, '-o', out_fhand.name, '-u',
                      estscan_matrix, '--polya_min_len', '4'])
        out_seqs = load(out_fhand.name)
        init_seqs = load(in_fpath)
        check_orientation(init_seqs, out_seqs, (0, 1, 3, 4, 5, 6), (1, 4))

        # multiprocessor
        out_fhand = NamedTemporaryFile()
        check_output(base_cmd + ['-v', '0.0001', in_fpath, '-o',
                                 out_fhand.name, '-p', '2',
                                 '--polya_min_len', '4'])
        out_seqs = load(out_fhand.name)
        init_seqs = load(in_fpath)

        check_orientation(init_seqs, out_seqs, (0, 1, 3, 4, 5, 6), (1, 4, 6))
        assert 'polyA' in out_seqs[1].object.description
        assert 'estscan_orf' in out_seqs[4].object.description
        assert 'blast arabidopsis_genes' in out_seqs[6].object.description
    def test_transcriptome_orientator(self):
        '''tests the orientator class'''
        def build_seq(str_seq, name):
            'It creates a wrapped SeqRecord with the given seq and id.'
            return _wrap_seq(SeqRecord(seq=Seq(str_seq), id=name))

        estscan_matrix = os.path.join(TEST_DATA_DIR,
                                      'Arabidopsis_thaliana.smat')

        # seqs for the polyA annotator
        seq1 = build_seq('atccgtcagcatcCAATAAAAA', 'seq1_polia+')
        seq2 = build_seq('TTTTcTTcatccgtcag', 'seq2_polia-')
        seq3 = build_seq('cTTcatccgtcag', 'seq3')

        # seqs for the orf annotator
        orf_forward = (
            'CATAGGGTCACCAATGGCTTCTTCTTTGCTTGCACTCTTCTCCTGTCTCTTCCTC'
            'TCTCTCTTATCTCTCTCCTCCTCCCTAAATCTCCGCCGTCCGATCTTCTCTCAA'
            'TCCAACGACCTCGATCTCTTCTCTTCTCTAAATCTCGACCGTCCATCTCTCGCC'
            'GCCGATGACATCCACGATCTTCTCCCACGCTACGGATTCCCGAAAGGTCTTCTT'
            'CCCAACAACGTCAAATCGTACACTATCTCCGACGACGGCGATTTCACCGTTGAC'
            'CTGATTTCCAGTTGCTACGTCAAGTTCTCCGATCAACTCGTTTTCTACGGCAAG'
            'AATATCGCCGGAAAACTCAGTTACGGATCTGTTAAAGACGTCCGTGGAATCCAA'
            'GCTAAAGAAGCTTTCCTTTGGCTACCAATCACCGCCATGGAATCGGATCCAAGC'
            'TCTGCCACGGTTGTGTTCTCCGTCGGATTTGTGTCCAAGACTTTACCTGCTTCC'
            'ATGTTCGAAAATGTTCCTTCTTGCTCAAGAAACCTAAATCTTCAAGACTCTTGA'
            'ATCCACCTGAAACGATCTCAAGATTCAACATTCCCTCCACCCTTTATAGTTTTG'
            'TATTTCAGAAGTATTTTGCTTGGTTTCGTAGATATAGGTTCGAATTGGAAAAGA'
            'TACTATCTTAATTATTCGAATCAGATTATGTTATACTGCCCAAA')
        orf_reverse = (
            'TTTGGGCAGTATAACATAATCTGATTCGAATAATTAAGATAGTATCTTTTCCAAT'
            'TCGAACCTATATCTACGAAACCAAGCAAAATACTTCTGAAATACAAAACTATAA'
            'AGGGTGGAGGGAATGTTGAATCTTGAGATCGTTTCAGGTGGATTCAAGAGTCTT'
            'GAAGATTTAGGTTTCTTGAGCAAGAAGGAACATTTTCGAACATGGAAGCAGGTA'
            'AAGTCTTGGACACAAATCCGACGGAGAACACAACCGTGGCAGAGCTTGGATCCG'
            'ATTCCATGGCGGTGATTGGTAGCCAAAGGAAAGCTTCTTTAGCTTGGATTCCAC'
            'GGACGTCTTTAACAGATCCGTAACTGAGTTTTCCGGCGATATTCTTGCCGTAGA'
            'AAACGAGTTGATCGGAGAACTTGACGTAGCAACTGGAAATCAGGTCAACGGTGA'
            'AATCGCCGTCGTCGGAGATAGTGTACGATTTGACGTTGTTGGGAAGAAGACCTT'
            'TCGGGAATCCGTAGCGTGGGAGAAGATCGTGGATGTCATCGGCGGCGAGAGATG'
            'GACGGTCGAGATTTAGAGAAGAGAAGAGATCGAGGTCGTTGGATTGAGAGAAGA'
            'TCGGACGGCGGAGATTTAGGGAGGAGGAGAGAGATAAGAGAGAGAGGAAGAGAC'
            'AGGAGAAGAGTGCAAGCAAAGAAGAAGCCATTGGTGACCCTATG')
        seq4 = build_seq(orf_forward, 'seq_orf_forward')
        seq5 = build_seq(orf_reverse, 'seq_orf_reverse')

        # seqs for the first blast database
        blast_forward = (
            'CTAAATCTCCGCCGTCCGATCTTCTCTCAATCCAACGACCTCGATCTCTTCTCTT'
            'TCTCCGATCAACTCGTTTTCTACGGCAAGAATATCGCCGGAAAACTCAGTTACG')
        blast_reverse = (
            'TTTAACAGATCCGTAACTGAGTTTTCCGGCGATATTCTTGCCGTAGAAAACGAGT'
            'CGGAGATTTAG')
        seq6 = build_seq(blast_forward, 'seq_blast_forward')
        seq7 = build_seq(blast_reverse, 'seq_blast_reverse')

        # seqs for the second blast database
        blast2_forward = (
            'GTTCGTTTCTCTTCTGAATTTCTGTAATCTGTAACGATGTCTCAGACTACTG'
            'TCCTCAAGGTTGCTATGTCATGTCAG')
        blast2_reverse = 'AGGCAGTCTTCTTCCCAGTTTTCGAAACGGTTTGGAAAACTACATCGC'
        seq8 = build_seq(blast2_forward, 'seq_blast2_forward')
        seq9 = build_seq(blast2_reverse, 'seq_blast2_reverse')

        seqrecords = [seq1, seq2, seq3, seq4, seq5, seq6, seq7, seq8, seq9]
        # the given seqs are remembered in their own tuple for the checks
        originals = tuple(seqrecords)

        estscan_params = {'usage_matrix': estscan_matrix}
        polya_params = {'min_len': 4,
                        'max_cont_mismatches': POLYA_ANNOTATOR_MISMATCHES}
        blastdbs_dir = os.path.join(TEST_DATA_DIR, 'blastdbs')
        ara_blastdb = os.path.join(blastdbs_dir, 'arabidopsis_genes')
        cala_blastdb = os.path.join(blastdbs_dir, 'calabaza')
        filters = [{'kind': 'score_threshold', 'score_key': 'expect',
                    'max_score': 1e-10}]
        blast_params = [{'blastdb': ara_blastdb, 'program': 'blastn',
                         'filters': filters},
                        {'blastdb': cala_blastdb, 'program': 'blastn'}]

        orientator = TranscriptOrientator(polya_params, estscan_params,
                                          blast_params)
        seqs = orientator(seqrecords)

        # (index of the seq, is it expected to come back reversed?)
        expectations = [(0, False), (1, True), (3, False), (4, True),
                        (5, False), (6, True)]
        for index, is_reversed in expectations:
            if is_reversed:
                result = str(seqs[index].object.seq.reverse_complement())
            else:
                result = get_str_seq(seqs[index])
            assert get_str_seq(originals[index]) == result
Beispiel #57
0
from crumbs.filters import (FilterByLength, FilterById, FilterByQuality,
                            FilterBlastMatch, FilterBlastShort,
                            FilterDustComplexity, seq_to_filterpackets,
                            FilterByRpkm, FilterByBam, FilterAllNs,
                            FilterBowtie2Match, FilterByFeatureTypes)
from crumbs.utils.bin_utils import BIN_DIR
from crumbs.utils.test_utils import TEST_DATA_DIR
from crumbs.utils.tags import (NUCL, SEQS_FILTERED_OUT, SEQS_PASSED, SEQITEM,
                               SEQRECORD)
from crumbs.seq import get_name, get_str_seq, SeqWrapper
from crumbs.seqio import read_seq_packets


def _seqs_to_names(seqs):
    'It flattens the groups of seqs into a list with the seq names'
    return [get_name(seq) for group in seqs for seq in group]


def _seqs_to_str_seqs(seqs):
    'It flattens the groups of seqs into a list with the str seqs'
    return [get_str_seq(seq) for group in seqs for seq in group]


class PacketConversionTest(unittest.TestCase):
    'It checks the conversion of seq packets into filter packets'
    def test_seqs_to_filter_packets(self):
        'Every seq ends up as a one item tuple and nothing is filtered out'
        seq_packets = iter([['ACT'], ['CTG', 'TTT']])
        converted = list(seq_to_filterpackets(seq_packets))

        # all the seqs should be in the passed side of each packet
        passed = [packet[SEQS_PASSED] for packet in converted]
        assert passed == [[('ACT',)], [('CTG',), ('TTT',)]]

        # the conversion by itself should not filter anything out
        filtered_out = [packet[SEQS_FILTERED_OUT] for packet in converted]
        assert filtered_out == [[], []]


def _create_seqrecord(string):
    'Given a string it returns a SeqRecord'
Beispiel #58
0
def calculate_sequence_stats(seqs, kmer_size=None, do_dust_stats=False,
                             nxs=None):
    """It calculates some stats for the given seqs.

    Arguments:
        seqs: an iterable with the sequences. It is traversed only once.
        kmer_size: when it is truthy the kmers of that size are counted,
            otherwise the kmer report is left empty.
        do_dust_stats: when it is true a dust score is calculated for every
            sequence.
        nxs: the Nx values to report (a line 'N<nx>: <value>' is written
            for each one, in ascending order). None or empty means no Nx.

    It returns a dict with one text report per key: 'length', 'quality',
    'nucl_freq', 'qual_boxplot', 'kmer' and 'dustscore'. The 'length' and
    'nucl_freq' reports are always built. The other ones are empty strings
    when there is no data for them.
    """
    # get data
    lengths = IntCounter()
    quals_per_pos = IntBoxplot()
    nucl_freq = NuclFreqsPlot()
    kmer_counter = KmerCounter(kmer_size) if kmer_size else None
    dustscores = IntCounter()
    for seq in seqs:
        lengths[get_length(seq)] += 1
        try:
            quals = get_qualities(seq)
        except AttributeError:
            # this seq adds nothing to the quality stats
            quals = []
        for index, qual in enumerate(quals):
            # the quality positions are stored starting at 1
            quals_per_pos.append(index + 1, qual)
        str_seq = get_str_seq(seq)
        for index, nucl in enumerate(str_seq):
            nucl_freq.append(index, nucl)
        if kmer_counter is not None:
            kmer_counter.count_seq(str_seq)
        if do_dust_stats:
            dustscore = calculate_dust_score(seq)
            if dustscore is not None:
                # the scores are binned by their integer part
                dustscores[int(dustscore)] += 1

    lengths.update_labels({'sum': 'tot. residues', 'items': 'num. seqs.'})

    # length distribution
    lengths_srt = 'Length stats and distribution.\n'
    lengths_srt += '------------------------------\n'
    # nxs is sorted just once, a missing nxs means that no Nx is reported
    for nx in sorted(nxs or []):
        lengths_srt += 'N{:d}: {:d}\n'.format(nx, calculate_nx(lengths, nx))
    lengths_srt += str(lengths)
    lengths_srt += '\n'

    # aggregate quals
    if quals_per_pos:
        quals = quals_per_pos.aggregated_array
        quals.update_labels({'sum': None, 'items': 'tot. base pairs'})

        # percentage of the qualities that are equal or above 30 and 20
        q30 = quals.count_relative_to_value(30, operator.ge) / quals.count
        q30 *= 100

        q20 = quals.count_relative_to_value(20, operator.ge) / quals.count
        q20 *= 100

        # qual distribution
        qual_str = 'Quality stats and distribution.\n'
        qual_str += '-------------------------------\n'
        qual_str += 'Q20: {:.2f}\n'.format(q20)
        qual_str += 'Q30: {:.2f}\n'.format(q30)
        qual_str += str(quals)
        qual_str += '\n'

        # qual per position boxplot
        qual_boxplot = 'Boxplot for quality per position.\n'
        qual_boxplot += '---------------------------------\n'
        qual_boxplot += quals_per_pos.ascii_plot
        qual_boxplot += '\n'
    else:
        qual_str = ''
        qual_boxplot = ''

    # nucl freqs
    freq_str = 'Nucleotide frequency per position.\n'
    freq_str += '----------------------------------\n'
    freq_str += nucl_freq.ascii_plot
    freq_str += '\n'

    # kmer distribution
    kmer_str = ''
    if kmer_counter is not None:
        kmers = IntCounter(kmer_counter.values)
        if kmers:
            kmers.update_labels({'sum': None, 'items': 'num. kmers'})
            kmer_str = 'Kmer distribution\n'
            kmer_str += '-----------------\n'
            kmer_str += str(kmers)
            kmer_str += '\n'
            kmer_str += 'Most common kmers:\n'
            for kmer, number in kmer_counter.most_common(20):
                kmer_str += '\t{}: {}\n'.format(kmer, number)

    # dust score distribution
    dust_str = ''
    if dustscores:
        dustscores.update_labels({'sum': None, 'items': 'num. seqs.'})
        dust_str = 'Dustscores stats and distribution.\n'
        dust_str += '----------------------------------\n'
        dust7 = (dustscores.count_relative_to_value(7, operator.gt) /
                 dustscores.count)
        # The line is labelled as a percentage, so the fraction is scaled
        # in the same way that it is done for Q20 and Q30
        dust7 *= 100
        dust_str += '% above 7 (low complexity): {:.2f}\n'.format(dust7)
        dust_str += str(dustscores)
        dust_str += '\n'

    return {'length': lengths_srt,
            'quality': qual_str,
            'nucl_freq': freq_str,
            'qual_boxplot': qual_boxplot,
            'kmer': kmer_str,
            'dustscore': dust_str}