Example #1
0
def get_aln_data(t_seq, q_seq):
    aln_data = []
    K = 8
    seq0 = t_seq
    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
    sa_ptr = kup.allocate_seq( len(seq0) )
    sda_ptr = kup.allocate_seq_addr( len(seq0) )
    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
    q_id = "dummy"
    
    kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, sda_ptr, lk_ptr)
    kmer_match = kmer_match_ptr[0]
    aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 12)
    aln_range = aln_range_ptr[0]
    x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count)] )
    kup.free_kmer_match(kmer_match_ptr)
    s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
    
    if e1 - s1 > 100:

        alignment = DWA.align(q_seq[s1:e1], e1-s1,
                              seq0[s2:e2], e2-s2,
                              1500,1)

        if alignment[0].aln_str_size > 100:
            aln_data.append( ( q_id, 0, s1, e1, len(q_seq), s2, e2, len(seq0), alignment[0].aln_str_size, alignment[0].dist ) )
            aln_str1 = alignment[0].q_aln_str
            aln_str0 = alignment[0].t_aln_str

        DWA.free_alignment(alignment)

    kup.free_kmer_lookup(lk_ptr)
    kup.free_seq_array(sa_ptr)
    kup.free_seq_addr_array(sda_ptr)
    return aln_data, x, y
Example #2
0
def get_aln_data(t_seq, q_seq):
    aln_data = []
    #x = []
    #y = []
    K = 8
    seq0 = t_seq
    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
    sa_ptr = kup.allocate_seq(len(seq0))
    sda_ptr = kup.allocate_seq_addr(len(seq0))
    kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
    q_id = "dummy"

    kmer_match_ptr = kup.find_kmer_pos_for_seq(
        q_seq, len(q_seq), K, sda_ptr, lk_ptr)
    kmer_match = kmer_match_ptr[0]

    if kmer_match.count != 0:
        aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K * 5, 12)
        aln_range = aln_range_ptr[0]
        #x, y = list(zip(* [(kmer_match.query_pos[i], kmer_match.target_pos[i])
        #              for i in range(kmer_match.count)]))

        s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2

        log('Mapped (q, s1 = {}, e1 = {}, len1 = {}, (e1 - s1) = {}, t, s2 = {}, e2 = {}, (e2 - s2) = {}, len2 = {})'.format(
                s1, e1, e1 - s1, len(q_seq), s2, e2, e2 - s2, len(t_seq)))

        max_len = 250000 # to keep allocations < 16GB, given band_tol=1500
        if (e1 - s1) >= max_len or (e2 - s2) >= max_len:
            # DW.align() would crash, so raise here.
            # (500000 is the approx. upper bound for int overflow,
            #  but some users run out of memory anyway.)
            raise TooLongError('q_len={} or t_len={} are too big, over 500k'.format(
                (e1-s1), (e2-s2)))
        if e1 - s1 > 100:
            log('Calling DW_banded.align(q, s1 = {}, e1 = {}, len1 = {}, (e1 - s1) = {}, t, s2 = {}, e2 = {}, (e2 - s2) = {}, len2 = {})'.format(
                s1, e1, e1 - s1, len(q_seq), s2, e2, e2 - s2, len(t_seq)))
            alignment = DWA.align(q_seq[s1:e1], e1 - s1,
                                  seq0[s2:e2], e2 - s2,
                                  1500, 1)

            if alignment[0].aln_str_size > 100:
                aln_data.append((q_id, 0, s1, e1, len(q_seq), s2, e2, len(
                    seq0), alignment[0].aln_str_size, alignment[0].dist))
                aln_str1 = alignment[0].q_aln_str
                aln_str0 = alignment[0].t_aln_str

            DWA.free_alignment(alignment)

        kup.free_aln_range(aln_range_ptr)

    kup.free_kmer_match(kmer_match_ptr)
    kup.free_kmer_lookup(lk_ptr)
    kup.free_seq_array(sa_ptr)
    kup.free_seq_addr_array(sda_ptr)
    return aln_data #, x, y
Example #3
0
def get_aln_data(t_seq, q_seq):
    aln_data = []
    #x = []
    #y = []
    K = 8
    seq0 = t_seq
    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
    sa_ptr = kup.allocate_seq(len(seq0))
    sda_ptr = kup.allocate_seq_addr(len(seq0))
    kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
    q_id = "dummy"

    kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, sda_ptr,
                                               lk_ptr)
    kmer_match = kmer_match_ptr[0]

    if kmer_match.count != 0:
        aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K * 5, 12)
        aln_range = aln_range_ptr[0]
        #x, y = list(zip(* [(kmer_match.query_pos[i], kmer_match.target_pos[i])
        #              for i in range(kmer_match.count)]))

        s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2

        max_len = 250000  # to keep allocations < 16GB, given band_tol=1500
        if (e1 - s1) >= max_len or (e2 - s2) >= max_len:
            # DW.align() would crash, so raise here.
            # (500000 is the approx. upper bound for int overflow,
            #  but some users run out of memory anyway.)
            raise TooLongError(
                'q_len={} or t_len={} are too big, over 500k'.format(
                    (e1 - s1), (e2 - s2)))
        if e1 - s1 > 100:
            log('Calling DW_banded.align(q, {}, t, {}, 1500, 1)'.format(
                e1 - s1, e2 - s2))
            alignment = DWA.align(q_seq[s1:e1], e1 - s1, seq0[s2:e2], e2 - s2,
                                  1500, 1)

            if alignment[0].aln_str_size > 100:
                aln_data.append(
                    (q_id, 0, s1, e1, len(q_seq), s2, e2, len(seq0),
                     alignment[0].aln_str_size, alignment[0].dist))
                aln_str1 = alignment[0].q_aln_str
                aln_str0 = alignment[0].t_aln_str

            DWA.free_alignment(alignment)

        kup.free_aln_range(aln_range_ptr)

    kup.free_kmer_match(kmer_match_ptr)
    kup.free_kmer_lookup(lk_ptr)
    kup.free_seq_array(sa_ptr)
    kup.free_seq_addr_array(sda_ptr)
    return aln_data  #, x, y
Example #4
0
def get_alignment(seq1, seq0):

    K = 8
    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
    sa_ptr = kup.allocate_seq(len(seq0))
    sda_ptr = kup.allocate_seq_addr(len(seq0))
    kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)

    kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr,
                                               lk_ptr)
    kmer_match = kmer_match_ptr[0]
    aln_range = kup.find_best_aln_range(kmer_match_ptr, K, K * 5, 50)
    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
    kup.free_kmer_match(kmer_match_ptr)
    s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
    if e1 - s1 > 500:
        #s1 = 0 if s1 < 14 else s1 - 14
        #s2 = 0 if s2 < 14 else s2 - 14
        e1 = len(seq1) if e1 >= len(seq1) - 2 * K else e1 + K * 2
        e2 = len(seq0) if e2 >= len(seq0) - 2 * K else e2 + K * 2

        alignment = DWA.align(seq1[s1:e1], e1 - s1, seq0[s2:e2], e2 - s2, 100,
                              0)
        #print seq1[s1:e1]
        #print seq0[s2:e2]
        #if alignment[0].aln_str_size > 500:

        #aln_str1 = alignment[0].q_aln_str
        #aln_str0 = alignment[0].t_aln_str
        aln_size = alignment[0].aln_str_size
        aln_dist = alignment[0].dist
        aln_q_s = alignment[0].aln_q_s
        aln_q_e = alignment[0].aln_q_e
        aln_t_s = alignment[0].aln_t_s
        aln_t_e = alignment[0].aln_t_e
        assert aln_q_e - aln_q_s <= alignment[
            0].aln_str_size or aln_t_e - aln_t_s <= alignment[0].aln_str_size
        #print aln_str1
        #print aln_str0

        DWA.free_alignment(alignment)

    kup.free_seq_addr_array(sda_ptr)
    kup.free_seq_array(sa_ptr)
    kup.free_kmer_lookup(lk_ptr)
    if e1 - s1 > 500 and aln_size > 500:
        return s1, s1 + aln_q_e - aln_q_s, s2, s2 + aln_t_e - aln_t_s, aln_size, aln_dist
    else:
        return None
Example #5
0
def get_alignment(seq1, seq0):

    K = 8
    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
    sa_ptr = kup.allocate_seq( len(seq0) )
    sda_ptr = kup.allocate_seq_addr( len(seq0) )
    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)

    kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
    kmer_match = kmer_match_ptr[0]
    aln_range = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 50)
    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
    kup.free_kmer_match(kmer_match_ptr)
    s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2
    if e1 - s1 > 500:
        #s1 = 0 if s1 < 14 else s1 - 14
        #s2 = 0 if s2 < 14 else s2 - 14
        e1 = len(seq1) if e1 >= len(seq1)-2*K else e1 + K*2
        e2 = len(seq0) if e2 >= len(seq0)-2*K else e2 + K*2
        
        alignment = DWA.align(seq1[s1:e1], e1-s1,
                              seq0[s2:e2], e2-s2,
                              100, 0)
        #print seq1[s1:e1]
        #print seq0[s2:e2]
        #if alignment[0].aln_str_size > 500:

        #aln_str1 = alignment[0].q_aln_str
        #aln_str0 = alignment[0].t_aln_str
        aln_size = alignment[0].aln_str_size
        aln_dist = alignment[0].dist
        aln_q_s = alignment[0].aln_q_s
        aln_q_e = alignment[0].aln_q_e
        aln_t_s = alignment[0].aln_t_s
        aln_t_e = alignment[0].aln_t_e
        assert aln_q_e- aln_q_s <= alignment[0].aln_str_size or aln_t_e- aln_t_s <= alignment[0].aln_str_size
        #print aln_str1
        #print aln_str0
    
        DWA.free_alignment(alignment)

    kup.free_seq_addr_array(sda_ptr)
    kup.free_seq_array(sa_ptr)
    kup.free_kmer_lookup(lk_ptr)
    if e1 - s1 > 500 and aln_size > 500:
        return s1, s1+aln_q_e-aln_q_s, s2, s2+aln_t_e-aln_t_s, aln_size, aln_dist
    else:
        return None
Example #6
0
def get_aln_data(t_seq, q_seq):
    aln_data = []
    x = []
    y = []
    K = 8
    seq0 = t_seq
    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
    sa_ptr = kup.allocate_seq(len(seq0))
    sda_ptr = kup.allocate_seq_addr(len(seq0))
    kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
    q_id = "dummy"

    kmer_match_ptr = kup.find_kmer_pos_for_seq(q_seq, len(q_seq), K, sda_ptr,
                                               lk_ptr)
    kmer_match = kmer_match_ptr[0]

    if kmer_match.count != 0:
        aln_range_ptr = kup.find_best_aln_range(kmer_match_ptr, K, K * 5, 12)
        aln_range = aln_range_ptr[0]
        x, y = zip(*[(kmer_match.query_pos[i], kmer_match.target_pos[i])
                     for i in range(kmer_match.count)])

        s1, e1, s2, e2 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2

        if e1 - s1 > 100:

            alignment = DWA.align(q_seq[s1:e1], e1 - s1, seq0[s2:e2], e2 - s2,
                                  1500, 1)

            if alignment[0].aln_str_size > 100:
                aln_data.append(
                    (q_id, 0, s1, e1, len(q_seq), s2, e2, len(seq0),
                     alignment[0].aln_str_size, alignment[0].dist))
                aln_str1 = alignment[0].q_aln_str
                aln_str0 = alignment[0].t_aln_str

            DWA.free_alignment(alignment)

        kup.free_aln_range(aln_range_ptr)

    kup.free_kmer_match(kmer_match_ptr)
    kup.free_kmer_lookup(lk_ptr)
    kup.free_seq_array(sa_ptr)
    kup.free_seq_addr_array(sda_ptr)
    return aln_data, x, y
    monomer_db[rid].append((rng, r.sequence, orientation))

print len(seq_db), " sequences read.",\
    "Reads with monomers:", len(monomer_db.keys())

# RUN OVER ALL READS THAT CONTAIN MONOMERS #
for rid, monomers in monomer_db.items():
    aln_data = []
    range_list = []
    total_monomer_len = 0
    # Align all monomers on the read to each other.
    for i in range(len(monomers)):
        for j in range(i + 1, len(monomers)):
            t_seq = monomers[i][1]
            q_seq = monomers[j][1]
            alignment = DWA.align(q_seq, len(q_seq), t_seq, len(t_seq), 50, 1)
            aln_str1 = alignment[0].q_aln_str
            aln_str0 = alignment[0].t_aln_str
            aln_size = alignment[0].aln_str_size
            aln_dist = alignment[0].dist
            aln_q_s = alignment[0].aln_q_s
            aln_q_e = alignment[0].aln_q_e
            aln_t_s = alignment[0].aln_t_s
            aln_t_e = alignment[0].aln_t_e

            # Skip if no alignment
            if aln_size != 0:
                # Store the alignment score for the monomer pair: (i, j, score)
                aln_data.append((i, j, 1 - (1.0*aln_dist/aln_size)))
            DWA.free_alignment(alignment)
Example #8
0
def get_ovelap_alignment(seq1, seq0):

    K = 8
    lk_ptr = kup.allocate_kmer_lookup( 1 << (K * 2) )
    sa_ptr = kup.allocate_seq( len(seq0) )
    sda_ptr = kup.allocate_seq_addr( len(seq0) )
    kup.add_sequence( 0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)

    kmer_match_ptr = kup.find_kmer_pos_for_seq(seq1, len(seq1), K, sda_ptr, lk_ptr)
    kmer_match = kmer_match_ptr[0]
    aln_range = kup.find_best_aln_range(kmer_match_ptr, K, K*5, 50)
    #x,y = zip( * [ (kmer_match.query_pos[i], kmer_match.target_pos[i]) for i in range(kmer_match.count )] )
    kup.free_kmer_match(kmer_match_ptr)
    s1, e1, s0, e0 = aln_range.s1, aln_range.e1, aln_range.s2, aln_range.e2  
    len_1 = len(seq1)
    len_0 = len(seq0)
    do_aln = False
    contain_status = "none" 
    if e1 - s1 > 500:
        if s1 < 100 and len_1 - e1 < 100:
            do_aln = False
            contain_status = "contains"
        elif s0 < 100 and len_0 - e0 < 100:
            do_aln = False
            contain_status = "contained"
        else:
            do_aln = True
            if s0 < s1:
                s1 -= s0 #assert s1 > 0
                s0 = 0
                e1 = len_1
                e0 = len_1 - s1 if len_1 - s1 < len_0 else len_0
                if e0 == len_0:
                    do_aln = False
                    contain_status = "contained"
                
            if s1 <= s0:
                s0 -= s1 #assert s1 > 0
                s1 = 0
                e0 = len_0
                e1 = len_0 - s0 if len_0 - s0 < len_1 else len_1
                if e1 == len_1:
                    do_aln = False
                    contain_status = "contains"


        if do_aln:
            alignment = DWA.align(seq1[s1:e1], e1-s1,
                                  seq0[s0:e0], e0-s0,
                                  500, 0)
            #print seq1[s1:e1]
            #print seq0[s2:e2]
            #if alignment[0].aln_str_size > 500:
    
            #aln_str1 = alignment[0].q_aln_str
            #aln_str0 = alignment[0].t_aln_str
            aln_size = alignment[0].aln_str_size
            aln_dist = alignment[0].dist
            aln_q_s = alignment[0].aln_q_s
            aln_q_e = alignment[0].aln_q_e
            aln_t_s = alignment[0].aln_t_s
            aln_t_e = alignment[0].aln_t_e
            assert aln_q_e- aln_q_s <= alignment[0].aln_str_size or aln_t_e- aln_t_s <= alignment[0].aln_str_size
            #print aln_str1
            #print aln_str0
            if aln_size > 500: 
                contain_status = "overlap"            
            DWA.free_alignment(alignment)
        
    kup.free_seq_addr_array(sda_ptr)
    kup.free_seq_array(sa_ptr)
    kup.free_kmer_lookup(lk_ptr)

    if e1 - s1 > 500 and do_aln and aln_size > 500:
        #return s1, s1+aln_q_e-aln_q_s, s2, s2+aln_t_e-aln_t_s, aln_size, aln_dist, x, y
        return s1, s1+aln_q_e-aln_q_s, s0, s0+aln_t_e-aln_t_s, aln_size, aln_dist, contain_status
    else:
        return 0, 0, 0, 0, 0, 0, contain_status 
    monomer_db[rid].append((rng, r.sequence, orientation))

print len(seq_db), " sequences read.",\
    "Reads with monomers:", len(monomer_db.keys())

# RUN OVER ALL READS THAT CONTAIN MONOMERS #
for rid, monomers in monomer_db.items():
    aln_data = []
    range_list = []
    total_monomer_len = 0
    # Align all monomers on the read to each other.
    for i in range(len(monomers)):
        for j in range(i + 1, len(monomers)):
            t_seq = monomers[i][1]
            q_seq = monomers[j][1]
            alignment = DWA.align(q_seq, len(q_seq), t_seq, len(t_seq), 50, 1)
            aln_str1 = alignment[0].q_aln_str
            aln_str0 = alignment[0].t_aln_str
            aln_size = alignment[0].aln_str_size
            aln_dist = alignment[0].dist
            aln_q_s = alignment[0].aln_q_s
            aln_q_e = alignment[0].aln_q_e
            aln_t_s = alignment[0].aln_t_s
            aln_t_e = alignment[0].aln_t_e

            # Skip if no alignment
            if aln_size != 0:
                # Store the alignment score for the monomer pair: (i, j, score)
                aln_data.append((i, j, 1 - (1.0 * aln_dist / aln_size)))
            DWA.free_alignment(alignment)