Esempio n. 1
0
def map_sequence(short_seq, long_seq, max_mismatch=0.9):
    """Map short_seq onto long_seq using a pairwise alignment.

    The two sequences are aligned with ``funcs.align`` and the span of the
    aligned short sequence (its first to last non-gap column) is located.

    Returns a 5-tuple ``(start_pos, end_pos, mismatch, inserts, dels)``:

        start_pos -- index of the first non-gap column of the aligned
                     short sequence
        end_pos   -- index of the last non-gap column of the aligned
                     short sequence
        mismatch  -- number of columns in ``range(start_pos, end_pos)``
                     where the two aligned sequences differ
        inserts   -- number of gaps in the aligned long sequence within
                     ``[start_pos:end_pos]``
        dels      -- number of gaps in the aligned short sequence within
                     ``[start_pos:end_pos]``

    If the mismatch fraction over that span exceeds ``max_mismatch`` the
    function returns ``(-1, -1, mismatch, -1, -1)`` instead.

    Raises RuntimeError when the aligned short sequence spans fewer than
    two columns (including the case where it has no non-gap column).
    """

    sseq, lseq, _score = funcs.align([short_seq, long_seq])

    # Elements are compared against the integer code of '-'; this assumes
    # indexing an aligned sequence yields ints (e.g. bytes/bytearray)
    # -- TODO confirm against funcs.align.
    gap = ord('-')
    n_cols = len(sseq)

    # First non-gap column, scanning from the left.
    start_pos = next((i for i in range(n_cols) if sseq[i] != gap), -1)

    # Last non-gap column, scanning from the right.  The scan must go all
    # the way down to column 0, otherwise a sequence whose only non-gap
    # column is the first one ends up with end_pos == -1 and a negative span.
    end_pos = next(
        (i for i in range(n_cols - 1, -1, -1) if sseq[i] != gap), -1)

    if start_pos == end_pos:
        # Either nothing was found (-1 == -1) or the span is a single
        # column, which would make the mismatch ratio below divide by zero.
        raise RuntimeError(
            'short_seq aligned to fewer than two columns')

    # NOTE(review): end_pos is the index of the last non-gap column but is
    # used as an exclusive bound below, so that final column is not counted.
    # Kept as-is because callers depend on the returned values -- confirm
    # whether this is intended.
    inserts = lseq[start_pos:end_pos].count(gap)
    dels = sseq[start_pos:end_pos].count(gap)
    mismatch = sum(
        1 for i in range(start_pos, end_pos) if lseq[i] != sseq[i])

    if mismatch / (end_pos - start_pos) > max_mismatch:
        return (-1, -1, mismatch, -1, -1)

    return (start_pos, end_pos, mismatch, inserts, dels)
Esempio n. 2
0
    def align(self, indexes, method=None, matrix=None):
        """Align the members selected by ``indexes`` and store the results.

        When ``matrix`` is None it defaults to ``'DNA'`` if ``self.type()``
        is DNA or RNA, and to ``'BLOSUM62'`` otherwise.  The selected
        members are passed to ``funcs.align`` together with ``method`` and
        ``matrix``; each returned item is written back to the corresponding
        member through ``set_sequence``.  Nothing is returned.
        """
        if matrix is None:
            matrix = 'DNA' if self.type() in (DNA, RNA) else 'BLOSUM62'

        selected = [self[i] for i in indexes]

        # Imported here rather than at module level, as in the original code.
        from seqpy.core import funcs
        aligned = funcs.align(selected, method, matrix)

        # Pair every requested index with its aligned counterpart.
        for i, new_sequence in zip(indexes, aligned):
            self[i].set_sequence(new_sequence)
Esempio n. 3
0
def align_ref(ref, query, max_mismatch=0.5):
    """Align ``query`` against ``ref`` and report where each one spans.

    Returns a 7-tuple:

        ref_start, ref_end     -- values returned by ``start_end_pos`` for
                                  the aligned reference (presumably its
                                  first/last aligned positions -- confirm
                                  against ``start_end_pos``)
        query_start, query_end -- the same for the aligned query
        rseq, qseq             -- aligned reference and aligned query as
                                  returned by ``funcs.align``
        normalized score       -- the alignment score divided by
                                  ``min(ref_end - query_start,
                                  query_end - ref_start)``

    ``max_mismatch`` is accepted but not used by this function.
    """

    aligned_ref, aligned_query, raw_score = funcs.align([ref, query])

    ref_start, ref_end = start_end_pos(aligned_ref)
    query_start, query_end = start_end_pos(aligned_query)

    # Length used to normalize the raw alignment score.
    span = min(ref_end - query_start, query_end - ref_start)

    return (ref_start, ref_end, query_start, query_end,
            aligned_ref, aligned_query, raw_score / span)
Esempio n. 4
0
def recircularize_sequence(seq, ref, max_mismatch):
    """Re-anchor ``seq`` on ``ref`` and merge any overhanging ends back in.

    ``seq`` and its reverse complement are both aligned to ``ref`` with
    ``align_ref``; the orientation with the higher score is kept.  The part
    of the aligned query lying within the reference span becomes the
    working result.  Aligned query columns before the span (upstream) or
    after it (downstream) are, when longer than 15 columns, aligned against
    the reference window and folded in with ``funcs.merged``.

    The aligned sequences and intermediate merge inputs are printed to
    stdout as diagnostics.  Returns the resulting sequence.
    """

    rc_seq = funcs.reverse_complemented(seq)

    (ref_start, ref_end, query_start, query_end,
     aligned_ref, aligned_query, fwd_score) = align_ref(
         ref, seq, max_mismatch)
    (rc_ref_start, rc_ref_end, rc_query_start, rc_query_end,
     rc_aligned_ref, rc_aligned_query, rc_score) = align_ref(
         ref, rc_seq, max_mismatch)

    if rc_score > fwd_score:
        # The reverse complement aligns better: switch every coordinate and
        # both aligned sequences over to that orientation.
        cerr('-> use reverse complement seq')
        ref_start, ref_end, query_start, query_end = (
            rc_ref_start, rc_ref_end, rc_query_start, rc_query_end)
        aligned_ref, aligned_query = rc_aligned_ref, rc_aligned_query

    # From here on aligned_query is the aligned query in the chosen
    # orientation; ref_stop is the exclusive end of the reference span.
    ref_stop = ref_end + 1
    circularized_seq = aligned_query[ref_start:ref_stop]

    print(aligned_ref)
    print(aligned_query)

    # Query columns that precede the reference span (sequence has a head).
    upstream = aligned_query[0:ref_start] if ref_start > query_start else ''
    # Query columns that follow the reference span (sequence has a tail).
    downstream = aligned_query[ref_stop:] if ref_end < query_end else ''

    print('ustream ->', upstream)
    print('dstream ->', downstream)

    ref_window = aligned_ref[ref_start:ref_stop]

    if len(upstream) > 15:
        head_ref, head_query, _ = funcs.align(
            [ref_window, upstream], degap=False)
        print('merged_1 >< merged_2 >< circularized_seq')
        print(head_ref)
        print(head_query)
        print(circularized_seq)
        circularized_seq = funcs.merged([circularized_seq, head_query])

    if len(downstream) > 15:
        # Align the tail against the degapped reference window first ...
        tail_ref, tail_query, _ = funcs.align(
            [funcs.degapped(ref_window), downstream], degap=False)
        print('merged_3 >< merged_4')
        print(tail_ref)
        print(tail_query)
        # ... then align the current result against that aligned reference.
        body_ref, body_query, _ = funcs.align(
            [tail_ref, funcs.degapped(circularized_seq)], degap=False)
        print('merged_5 >< merged_6')
        print(body_ref)
        print(body_query)
        print('circ >< merged_4 >< merged_6')
        print(circularized_seq)
        print(tail_query)
        print(body_query)
        circularized_seq = funcs.merged([tail_query, body_query])

    return circularized_seq
Esempio n. 5
0
def recircularize_sequence(seq, ref, match_len=30, max_mismatch=0.9):
    """Recircularize a circular DNA sequence so that it starts like ref.

    The first ``match_len`` positions of ``ref`` (the "head") are mapped
    onto ``seq`` with ``map_sequence``; if that fails, ``seq`` is replaced
    by its reverse complement and the mapping is retried.

    Returns the recircularized sequence.  When the head cannot be found in
    either orientation, or when the overlap between both ends cannot be
    resolved, a message is written with ``cerr`` and ``seq`` is returned
    as-is (reverse-complemented if that orientation was tried).

    ``map_sequence`` may raise RuntimeError; it is not caught here.
    """

    head = ref[:match_len]

    # first, map the head of ref to seq to find the head position(s)
    head_start, _, _, _, _ = map_sequence(head, seq, max_mismatch)
    if head_start < 0:
        # not found in this orientation: retry on the reverse complement
        seq = funcs.reverse_complemented(seq)
        head_start, _, _, _, _ = map_sequence(head, seq, max_mismatch)
        if head_start < 0:
            cerr('>>> head not found!')
            return seq

    head_start2 = -1
    if len(seq) - head_start >= len(ref):
        # the remaining seq is at least as long as ref, so a 2nd head
        # position may exist; search only past the first head match
        offset = head_start + match_len
        head_start2, _, _, _, _ = map_sequence(head, seq[offset:],
                                               max_mismatch)
        if head_start2 >= 0:
            # translate back to coordinates of the full seq
            head_start2 += offset

    if head_start2 > head_start:
        # this part deals with the case where the full sequence appears
        # in seq, delimited by the two head positions

        circularized_seq = seq[head_start:head_start2]

        # merge in what lies after the 2nd head and before the 1st head
        upstream_part = seq[head_start2:]
        downstream_part = seq[:head_start]
        if len(upstream_part) > 0:
            merged_1, merged_2, _ = funcs.align(
                [circularized_seq, upstream_part])
            circularized_seq = funcs.merged([merged_1, merged_2])
        if len(downstream_part) > 0:
            merged_3, merged_4, _ = funcs.align(
                [circularized_seq, downstream_part])
            circularized_seq = funcs.merged([merged_3, merged_4])

        return circularized_seq

    # only one head position: rotate seq at the head and resolve the
    # overlap between the end of the rotated sequence and its beginning
    upstream_part = seq[head_start:]
    downstream_part = seq[:head_start]
    overlap_len = min(len(seq) - len(ref) + 15, len(upstream_part),
                      len(downstream_part))

    if overlap_len <= 0:
        # Nothing to overlap.  A negative value (seq more than 15 shorter
        # than ref) must not be used as a slice bound below, so it is
        # handled the same way as zero.
        return upstream_part + downstream_part

    # map the end of upstream_part and the start of downstream_part to ref
    overlap_start1, overlap_end1, _, _, _ = map_sequence(
        upstream_part[-overlap_len:], ref)
    overlap_start2, _, _, _, _ = map_sequence(
        downstream_part[:overlap_len], ref)

    # map_sequence reports a failed mapping as -1; such a value must not be
    # used as a slice index, so treat it like an unresolvable overlap.
    if (overlap_start1 < 0 or overlap_start2 < 0
            or overlap_start2 >= overlap_end1):
        cerr('>> algorithm problem for this sample!')
        return seq

    overlap_span = overlap_end1 - overlap_start2
    upstream_tobe_merged, downstream_tobe_merged, _ = funcs.align([
        upstream_part[overlap_start2:overlap_end1],
        downstream_part[:overlap_span]
    ])
    merged_segment = funcs.merged(
        [upstream_tobe_merged, downstream_tobe_merged])

    return (upstream_part[:overlap_start2] + merged_segment
            + downstream_part[overlap_span:])