Beispiel #1
0
    def test_copy(self):
        """copy_seq replaces the sequence in fasta, fastq and multiline fastq."""
        # fasta: both sequence lines are replaced by a single one and the
        # annotations are carried over
        fasta_item = SeqItem(name='s1',
                             lines=['>s1\n', 'ACTG\n', 'GTAC\n'],
                             annotations={'a': 'b'})
        wrapped = SeqWrapper(SEQITEM, fasta_item, 'fasta')
        copied = copy_seq(wrapped, seq='ACTG')
        expected = SeqItem(name='s1', lines=['>s1\n', 'ACTG\n'],
                           annotations={'a': 'b'})
        assert copied.object == expected
        # the copy shares neither the item nor its lines list with the source
        assert wrapped.object is not copied.object
        assert wrapped.object.lines is not copied.object.lines

        # fastq: only the sequence line changes, the quality line is kept
        fastq_item = SeqItem(name='seq',
                             lines=['@seq\n', 'aaaa\n', '+\n', '!???\n'])
        wrapped = SeqWrapper(SEQITEM, fastq_item, 'fastq')
        copied = copy_seq(wrapped, seq='ACTG')
        expected = SeqItem(name='seq',
                           lines=['@seq\n', 'ACTG\n', '+\n', '!???\n'])
        assert copied.object == expected

        # multiline fastq: the expected copy holds the sequence in one line
        # and the two quality lines joined in one line
        multiline_item = SeqItem(name='seq',
                                 lines=['@seq\n', 'aaaa\n', 'aaaa\n', '+\n',
                                        '@AAA\n', 'BBBB\n'])
        wrapped = SeqWrapper(SEQITEM, multiline_item,
                             'fastq-illumina-multiline')
        copied = copy_seq(wrapped, seq='ACTGactg')
        expected = SeqItem(name='seq',
                           lines=['@seq\n', 'ACTGactg\n', '+\n',
                                  '@AAABBBB\n'])
        assert copied.object == expected
Beispiel #2
0
    def test_copy(self):
        """Check the items produced by copy_seq when a new seq is given."""

        def wrap(file_format, **item_fields):
            # builds the SeqItem and wraps it as a SEQITEM of the given format
            return SeqWrapper(SEQITEM, SeqItem(**item_fields), file_format)

        # fasta input with two sequence lines and one annotation
        source = wrap('fasta', name='s1',
                      lines=['>s1\n', 'ACTG\n', 'GTAC\n'],
                      annotations={'a': 'b'})
        result = copy_seq(source, seq='ACTG')
        assert result.object == SeqItem(name='s1',
                                        lines=['>s1\n', 'ACTG\n'],
                                        annotations={'a': 'b'})
        # neither the item nor its list of lines may be the very same object
        assert source.object is not result.object
        assert source.object.lines is not result.object.lines

        # single line fastq, the quality line is expected unchanged
        source = wrap('fastq', name='seq',
                      lines=['@seq\n', 'aaaa\n', '+\n', '!???\n'])
        result = copy_seq(source, seq='ACTG')
        assert result.object == SeqItem(
            name='seq', lines=['@seq\n', 'ACTG\n', '+\n', '!???\n'])

        # multiline fastq, sequence and quality are expected in one line each
        source = wrap('fastq-illumina-multiline', name='seq',
                      lines=['@seq\n', 'aaaa\n', 'aaaa\n', '+\n', '@AAA\n',
                             'BBBB\n'])
        result = copy_seq(source, seq='ACTGactg')
        assert result.object == SeqItem(
            name='seq', lines=['@seq\n', 'ACTGactg\n', '+\n', '@AAABBBB\n'])
Beispiel #3
0
    def test_change_name(self):
        """copy_seq with a new name must rewrite the header of the item."""
        # (file format, original lines, lines expected after the renaming)
        # In the fastq case the expected '+' line no longer carries the name.
        cases = [
            ('fastq',
             ['@seq\n', 'aaaa\n', '+seq\n', '!???\n'],
             ['@seq2\n', 'aaaa\n', '+\n', '!???\n']),
            ('fasta',
             ['>seq\n', 'aaaa\n'],
             ['>seq2\n', 'aaaa\n']),
        ]
        for file_format, original_lines, renamed_lines in cases:
            item = SeqItem(name='seq', lines=original_lines)
            wrapped = SeqWrapper(SEQITEM, item, file_format)
            renamed = copy_seq(wrapped, name='seq2')
            # the item is compared against a plain (name, lines, annotations)
            # tuple
            assert renamed.object == ('seq2', renamed_lines, {})
Beispiel #4
0
    def test_change_name(self):
        """Renaming through copy_seq is checked for a fastq and a fasta item."""

        def rename_to_seq2(file_format, lines):
            # wraps a SeqItem named 'seq' and returns the renamed copy's item
            wrapped = SeqWrapper(SEQITEM, SeqItem(name='seq', lines=lines),
                                 file_format)
            return copy_seq(wrapped, name='seq2').object

        # fastq: the header gets the new name and the '+' line is expected
        # without any name
        fastq_item = rename_to_seq2('fastq',
                                    ['@seq\n', 'aaaa\n', '+seq\n', '!???\n'])
        assert fastq_item == ('seq2',
                              ['@seq2\n', 'aaaa\n', '+\n', '!???\n'],
                              {})

        # fasta: only the header line is expected to change
        fasta_item = rename_to_seq2('fasta', ['>seq\n', 'aaaa\n'])
        assert fasta_item == ('seq2', ['>seq2\n', 'aaaa\n'], {})
Beispiel #5
0
 def __call__(self, seqs):
     """It returns copies of the seqs with the case of their sequence changed.

     The change applied depends on self.action: UPPERCASE, LOWERCASE or
     SWAPCASE. Any other action raises NotImplementedError as soon as the
     first sequence is processed.
     """
     action = self.action

     def recase(text):
         # the action is checked for every sequence, so an empty input never
         # raises even if the action is not supported
         if action == UPPERCASE:
             return text.upper()
         if action == LOWERCASE:
             return text.lower()
         if action == SWAPCASE:
             return text.swapcase()
         raise NotImplementedError()

     return [copy_seq(seq, seq=recase(get_str_seq(seq))) for seq in seqs]
Beispiel #6
0
 def __call__(self, seqs):
     """It changes the case of the given seqs and returns them in a list.

     self.action selects the conversion (UPPERCASE, LOWERCASE or SWAPCASE);
     NotImplementedError is raised for any other action once a sequence is
     being processed.
     """
     action = self.action
     recased_seqs = []
     for seq in seqs:
         text = get_str_seq(seq)
         # pick the str method that carries out the requested conversion
         if action == UPPERCASE:
             convert = text.upper
         elif action == LOWERCASE:
             convert = text.lower
         elif action == SWAPCASE:
             convert = text.swapcase
         else:
             raise NotImplementedError()
         recased_seqs.append(copy_seq(seq, seq=convert()))
     return recased_seqs
Beispiel #7
0
def _mask_sequence(seq, segments):
    """It returns a copy of the seq with the given segments in lowercase.

    When there are no segments the very same seq is returned, not a copy.
    """
    if not segments:
        return seq

    merged = merge_overlaping_segments(segments)
    # every item is indexed as ((start, end), flag); end is inclusive and a
    # true flag marks a region to be lowercased
    labelled_segments = get_all_segments(merged, get_length(seq))
    residues = get_str_seq(seq)

    pieces = []
    for labelled in labelled_segments:
        limits = labelled[0]
        piece = residues[limits[0]:limits[1] + 1]
        pieces.append(piece.lower() if labelled[1] else piece)
    masked = ''.join(pieces)

    if seq.kind == SEQRECORD:
        # SeqRecords get a Seq built with the alphabet of the original one
        masked = Seq(masked, alphabet=seq.object.seq.alphabet)
    return copy_seq(seq, seq=masked)
Beispiel #8
0
def _mask_sequence(seq, segments):
    """It lowercases the regions of the seq covered by the given segments.

    The seq itself is returned untouched when no segments are given; in any
    other case a copy with the new sequence is created with copy_seq.
    """
    if not segments:
        return seq

    all_segments = get_all_segments(merge_overlaping_segments(segments),
                                    get_length(seq))
    whole_str = get_str_seq(seq)

    def render(segment):
        # segment[0] holds the (start, inclusive end) limits and segment[1]
        # tells whether the region has to be lowercased
        first = segment[0][0]
        stop = segment[0][1] + 1
        region = whole_str[first:stop]
        if segment[1]:
            return region.lower()
        return region

    masked_seq = ''.join(render(segment) for segment in all_segments)
    if seq.kind == SEQRECORD:
        # keep the alphabet of the original Seq
        masked_seq = Seq(masked_seq, alphabet=seq.object.seq.alphabet)
    return copy_seq(seq, seq=masked_seq)
Beispiel #9
0
    def __call__(self, seqs):
        """It trims or masks the seqs following their trimming recommendations.

        Seqs without TRIMMING_RECOMMENDATIONS in their annotations are just
        copied. For the rest the recommendation is removed from the
        annotations and, depending on self.mask, the recommended segments are
        masked or the seq is sliced to the longest segment left. Seqs with no
        sequence left after the trimming are dropped from the result.
        """
        mask = self.mask
        result = []
        for seq in seqs:
            annotations = get_annotations(seq)
            if TRIMMING_RECOMMENDATIONS not in annotations:
                result.append(copy_seq(seq))
                continue

            # the recommendation is taken out of the annotations, it would no
            # longer be valid once the seq has been processed
            recommendations = annotations[TRIMMING_RECOMMENDATIONS]
            del annotations[TRIMMING_RECOMMENDATIONS]

            to_trim = [segment
                       for kind in TRIMMING_KINDS
                       for segment in recommendations.get(kind, [])]

            if mask:
                seq = _mask_sequence(seq, to_trim)
            elif to_trim:
                limits = get_longest_complementary_segment(to_trim,
                                                           get_length(seq))
                if limits is None:
                    # there's no sequence left, so the seq is not returned
                    continue
                if limits:
                    # limits are inclusive, so the slice ends one past them
                    seq = slice_seq(seq, limits[0], limits[1] + 1)

            result.append(seq)

        return result
Beispiel #10
0
class MatePairSplitter(object):
    """It splits the input sequences with the provided linkers."""

    def __init__(self, linkers=None):
        """It stores the linkers that will be looked for in the reads.

        linkers: an iterable with the linker sequences. When it is None the
            'LINKERS' setting is read and every linker found there is wrapped
            in a fasta SeqItem named after its position in the setting.
        """
        if linkers is None:
            linkers = get_setting('LINKERS')
            # NOTE(review): lines is given here as a single string and not as
            # a list of lines -- confirm that the SeqItem consumers accept it.
            linkers = [SeqItem(str(index), '>%d\n%s\n' % (index, linker))
                       for index, linker in enumerate(linkers)]
            linkers = assing_kind_to_seqs(SEQITEM, linkers, 'fasta')
        self.linkers = list(linkers)

    def __call__(self, seqs):
        """It splits a list of sequences with the provided linkers.

        seqs is written to a fasta file and then iterated again, so it must
        support being iterated twice (e.g. a list).

        It returns a list with the resulting seqs. The reads for which the
        matcher returns None are kept as they are.
        """
        seq_fhand = write_seqs(seqs, file_format='fasta')
        seq_fhand.flush()

        # thresholds handed over to the matcher filters
        min_identity = 87.0
        min_len = 13
        filters = [{'kind': 'min_length', 'min_num_residues': min_len,
                    'length_in_query': False, 'filter_match_parts': True},
                   {'kind': 'score_threshold', 'score_key': 'identity',
                    'min_score': min_identity}]

        matcher = BlasterForFewSubjects(seq_fhand.name, self.linkers,
                                        program='blastn', filters=filters,
                                        params={'task': 'blastn-short'},
                                        elongate_for_global=True,
                                        seqs_type=NUCL)
        new_seqs = []
        for seq in seqs:
            segments = matcher.get_matched_segments_for_read(get_name(seq))
            if segments is None:
                new_seqs.append(seq)
            else:
                # extend avoids rebinding the loop variable seq
                new_seqs.extend(self._split_by_mate_linker(seq, segments))
        return new_seqs

    def _split_by_mate_linker(self, seq, segments_and_partial):
        """It splits the seq removing the regions covered by the segments.

        segments_and_partial: a (segments, is_partial) pair. segments is a
            sequence of (start, end) linker positions with inclusive ends
            and is_partial is the flag that adds the '_pl' suffix to the
            names of the two mates. The pair is unpacked in the body because
            tuple parameters are not valid syntax in Python 3.

        It returns a list of seqs. With one internal segment the two mates
        are returned, with more than one segment every part is named with an
        '_mlc.partN' suffix.

        It raises a RuntimeError if a single segment that does not start at
        0 ends after the end of the sequence.
        """
        segments, is_partial = segments_and_partial

        if not segments:
            return [copy_seq(seq)]

        seq_end = get_length(seq) - 1
        if len(segments) == 1:
            segment_start = segments[0][0]
            segment_end = segments[0][1]
            if segment_start == 0:
                # linker at the beginning, only the right side is kept
                return [slice_seq(seq, segment_end + 1, None)]
            elif segment_end == seq_end:
                # linker at the end, only the left side is kept
                return [slice_seq(seq, None, segment_start)]
            elif segment_end > seq_end:
                msg = 'The segment ends after the sequence has ended'
                raise RuntimeError(msg)
            else:
                new_seq1 = slice_seq(seq, None, segment_start)
                new_seq2 = slice_seq(seq, segment_end + 1, None)
                if is_partial:
                    name = get_name(seq) + '_pl'
                else:
                    name = get_name(seq)
                new_seq1 = copy_seq(new_seq1, name=name + r'\1')
                new_seq2 = copy_seq(new_seq2, name=name + r'\2')
                return [new_seq1, new_seq2]

        parts = []
        counter = 1
        seq_start = 0
        for segment_start, segment_end in segments:
            if segment_start != 0:
                part = slice_seq(seq, seq_start, segment_start)
                part_name = get_name(seq) + '_mlc.part{0:d}'.format(counter)
                parts.append(copy_seq(part, name=part_name))
                counter += 1
            # The next part always starts after this linker, even when the
            # linker opens the read and no part is emitted for it; otherwise
            # the leading linker would end up inside the first part.
            seq_start = segment_end + 1

        # segment_end is the inclusive end of the last linker: there is a
        # trailing part only if that linker stops before the last residue.
        if segment_end < seq_end:
            part = slice_seq(seq, segment_end + 1, None)
            part_name = get_name(seq) + '_mlc.part{0:d}'.format(counter)
            parts.append(copy_seq(part, name=part_name))
        return parts