def get_aln_data(t_seq, q_seq):
    """Map ``q_seq`` onto ``t_seq`` through shared 8-mers and align the best range.

    ``t_seq`` is indexed with ``kup``, the k-mer hits of ``q_seq`` are
    collected, and ``kup.find_best_aln_range`` picks the query/target range.
    When the query range is longer than 100, that range is aligned with
    ``DWA.align``.

    Returns:
        A tuple ``(aln_data, x, y)``.  ``aln_data`` is a list holding at most
        one record ``(q_id, 0, s1, e1, len(q_seq), s2, e2, len(t_seq),
        aln_str_size, dist)``, appended only when the alignment string is
        longer than 100.  ``x`` and ``y`` are the query and target positions
        of every k-mer hit; both are empty lists when there is no hit.
    """
    aln_data = []
    x = []
    y = []
    K = 8
    seq0 = t_seq
    q_id = "dummy"

    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
    sa_ptr = kup.allocate_seq(len(seq0))
    sda_ptr = kup.allocate_seq_addr(len(seq0))
    try:
        kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
        kmer_match_ptr = kup.find_kmer_pos_for_seq(
            q_seq, len(q_seq), K, sda_ptr, lk_ptr)
        try:
            kmer_match = kmer_match_ptr[0]
            # Without any hit there is no range to pick, and unpacking
            # zip(*[]) into x, y would raise ValueError.
            if kmer_match.count != 0:
                aln_range_ptr = kup.find_best_aln_range(
                    kmer_match_ptr, K, K * 5, 12)
                try:
                    aln_range = aln_range_ptr[0]
                    x, y = zip(*[(kmer_match.query_pos[i],
                                  kmer_match.target_pos[i])
                                 for i in range(kmer_match.count)])
                    s1, e1 = aln_range.s1, aln_range.e1
                    s2, e2 = aln_range.s2, aln_range.e2
                    if e1 - s1 > 100:
                        alignment = DWA.align(q_seq[s1:e1], e1 - s1,
                                              seq0[s2:e2], e2 - s2,
                                              1500, 1)
                        try:
                            if alignment[0].aln_str_size > 100:
                                aln_data.append(
                                    (q_id, 0, s1, e1, len(q_seq),
                                     s2, e2, len(seq0),
                                     alignment[0].aln_str_size,
                                     alignment[0].dist))
                        finally:
                            DWA.free_alignment(alignment)
                finally:
                    # The range struct is allocated by kup and must be
                    # handed back explicitly.
                    kup.free_aln_range(aln_range_ptr)
        finally:
            kup.free_kmer_match(kmer_match_ptr)
    finally:
        # Release the index even when something above raised.
        kup.free_kmer_lookup(lk_ptr)
        kup.free_seq_array(sa_ptr)
        kup.free_seq_addr_array(sda_ptr)
    return aln_data, x, y
def get_aln_data(t_seq, q_seq):
    """Map ``q_seq`` onto ``t_seq`` through shared 8-mers and align the best range.

    ``t_seq`` is indexed with ``kup``; if ``q_seq`` has at least one k-mer
    hit, ``kup.find_best_aln_range`` picks the query/target range and, when
    the query range is longer than 100, that range is aligned with
    ``DWA.align``.

    Returns:
        A list holding at most one record ``(q_id, 0, s1, e1, len(q_seq),
        s2, e2, len(t_seq), aln_str_size, dist)``, appended only when the
        alignment string is longer than 100.

    Raises:
        TooLongError: if the query or target range is 250000 or longer.
            All C-side buffers are released before the error propagates.
    """
    aln_data = []
    K = 8
    seq0 = t_seq
    q_id = "dummy"
    range_fmt = ('(q, s1 = {}, e1 = {}, len1 = {}, (e1 - s1) = {}, '
                 't, s2 = {}, e2 = {}, (e2 - s2) = {}, len2 = {})')

    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
    sa_ptr = kup.allocate_seq(len(seq0))
    sda_ptr = kup.allocate_seq_addr(len(seq0))
    try:
        kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
        kmer_match_ptr = kup.find_kmer_pos_for_seq(
            q_seq, len(q_seq), K, sda_ptr, lk_ptr)
        try:
            kmer_match = kmer_match_ptr[0]
            if kmer_match.count != 0:
                aln_range_ptr = kup.find_best_aln_range(
                    kmer_match_ptr, K, K * 5, 12)
                try:
                    aln_range = aln_range_ptr[0]
                    s1, e1 = aln_range.s1, aln_range.e1
                    s2, e2 = aln_range.s2, aln_range.e2
                    # Argument order follows the placeholders: len1 is the
                    # full query length, then the mapped span.
                    range_txt = range_fmt.format(
                        s1, e1, len(q_seq), e1 - s1,
                        s2, e2, e2 - s2, len(t_seq))
                    log('Mapped ' + range_txt)

                    max_len = 250000  # to keep allocations < 16GB, given band_tol=1500
                    if (e1 - s1) >= max_len or (e2 - s2) >= max_len:
                        # DW.align() would crash, so raise here.
                        # (500000 is the approx. upper bound for int overflow,
                        # but some users run out of memory anyway.)
                        raise TooLongError(
                            'q_len={} or t_len={} are too big, must be under {}'.format(
                                (e1 - s1), (e2 - s2), max_len))

                    if e1 - s1 > 100:
                        log('Calling DW_banded.align' + range_txt)
                        alignment = DWA.align(q_seq[s1:e1], e1 - s1,
                                              seq0[s2:e2], e2 - s2,
                                              1500, 1)
                        try:
                            if alignment[0].aln_str_size > 100:
                                aln_data.append(
                                    (q_id, 0, s1, e1, len(q_seq),
                                     s2, e2, len(seq0),
                                     alignment[0].aln_str_size,
                                     alignment[0].dist))
                        finally:
                            DWA.free_alignment(alignment)
                finally:
                    kup.free_aln_range(aln_range_ptr)
        finally:
            kup.free_kmer_match(kmer_match_ptr)
    finally:
        # Runs on the TooLongError path too, so the index is never leaked.
        kup.free_kmer_lookup(lk_ptr)
        kup.free_seq_array(sa_ptr)
        kup.free_seq_addr_array(sda_ptr)
    return aln_data
def get_aln_data(t_seq, q_seq):
    """Map ``q_seq`` onto ``t_seq`` through shared 8-mers and align the best range.

    ``t_seq`` is indexed with ``kup``; if ``q_seq`` has at least one k-mer
    hit, ``kup.find_best_aln_range`` picks the query/target range and, when
    the query range is longer than 100, that range is aligned with
    ``DWA.align``.

    Returns:
        A list holding at most one record ``(q_id, 0, s1, e1, len(q_seq),
        s2, e2, len(t_seq), aln_str_size, dist)``, appended only when the
        alignment string is longer than 100.

    Raises:
        TooLongError: if the query or target range is 250000 or longer.
            All C-side buffers are released before the error propagates.
    """
    aln_data = []
    K = 8
    seq0 = t_seq
    q_id = "dummy"

    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
    sa_ptr = kup.allocate_seq(len(seq0))
    sda_ptr = kup.allocate_seq_addr(len(seq0))
    try:
        kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
        kmer_match_ptr = kup.find_kmer_pos_for_seq(
            q_seq, len(q_seq), K, sda_ptr, lk_ptr)
        try:
            if kmer_match_ptr[0].count != 0:
                aln_range_ptr = kup.find_best_aln_range(
                    kmer_match_ptr, K, K * 5, 12)
                try:
                    aln_range = aln_range_ptr[0]
                    s1, e1 = aln_range.s1, aln_range.e1
                    s2, e2 = aln_range.s2, aln_range.e2
                    q_span = e1 - s1
                    t_span = e2 - s2

                    max_len = 250000  # to keep allocations < 16GB, given band_tol=1500
                    if q_span >= max_len or t_span >= max_len:
                        # DW.align() would crash, so raise here.
                        # (500000 is the approx. upper bound for int overflow,
                        # but some users run out of memory anyway.)
                        raise TooLongError(
                            'q_len={} or t_len={} are too big, must be under {}'.format(
                                q_span, t_span, max_len))

                    if q_span > 100:
                        log('Calling DW_banded.align(q, {}, t, {}, 1500, 1)'.format(
                            q_span, t_span))
                        alignment = DWA.align(q_seq[s1:e1], q_span,
                                              seq0[s2:e2], t_span,
                                              1500, 1)
                        try:
                            if alignment[0].aln_str_size > 100:
                                aln_data.append(
                                    (q_id, 0, s1, e1, len(q_seq),
                                     s2, e2, len(seq0),
                                     alignment[0].aln_str_size,
                                     alignment[0].dist))
                        finally:
                            DWA.free_alignment(alignment)
                finally:
                    kup.free_aln_range(aln_range_ptr)
        finally:
            kup.free_kmer_match(kmer_match_ptr)
    finally:
        # Runs on the TooLongError path too, so the index is never leaked.
        kup.free_kmer_lookup(lk_ptr)
        kup.free_seq_array(sa_ptr)
        kup.free_seq_addr_array(sda_ptr)
    return aln_data
def get_aln_data(t_seq, q_seq):
    """Map ``q_seq`` onto ``t_seq`` through shared 8-mers and score the best range.

    Inputs in bytes.

    ``t_seq`` is indexed with ``kup``; if ``q_seq`` has at least one k-mer
    hit, ``kup.find_best_aln_range`` picks the query/target range and, when
    the query range is longer than 100, the two ranges are globally aligned
    with ``edlib.align(..., mode="NW")``.

    Returns:
        A list holding at most one record ``(q_id, 0, s1, e1, len(q_seq),
        s2, e2, len(t_seq), delta_l, idt, cov)`` where ``delta_l`` is
        ``len(q_seq) - len(t_seq)``, ``cov`` is the mapped query span over
        the query length, and ``idt`` is ``(span - editDistance) / span``.
    """
    aln_data = []
    K = 8
    seq0 = t_seq
    q_id = "dummy"
    range_fmt = ('(q, s1 = {}, e1 = {}, len1 = {}, (e1 - s1) = {}, '
                 't, s2 = {}, e2 = {}, (e2 - s2) = {}, len2 = {})')

    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
    sa_ptr = kup.allocate_seq(len(seq0))
    sda_ptr = kup.allocate_seq_addr(len(seq0))
    try:
        kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
        kmer_match_ptr = kup.find_kmer_pos_for_seq(
            q_seq, len(q_seq), K, sda_ptr, lk_ptr)
        try:
            if kmer_match_ptr[0].count != 0:
                aln_range_ptr = kup.find_best_aln_range(
                    kmer_match_ptr, K, K * 5, 12)
                try:
                    aln_range = aln_range_ptr[0]
                    s1, e1 = aln_range.s1, aln_range.e1
                    s2, e2 = aln_range.s2, aln_range.e2
                    # Argument order follows the placeholders: len1 is the
                    # full query length, then the mapped span.
                    range_txt = range_fmt.format(
                        s1, e1, len(q_seq), e1 - s1,
                        s2, e2, e2 - s2, len(t_seq))
                    log('Mapped ' + range_txt)

                    if e1 - s1 > 100:
                        log('Calling edlib.align' + range_txt)
                        # Align using Edlib instead of DWA.
                        edlib_result = edlib.align(
                            q_seq[s1:e1], seq0[s2:e2], mode="NW")

                        delta_l = len(q_seq) - len(t_seq)
                        cov = float(e1 - s1) / float(len(q_seq))
                        idt = (float(e1 - s1 - edlib_result['editDistance'])
                               / float(e1 - s1))
                        aln_data.append(
                            (q_id, 0, s1, e1, len(q_seq),
                             s2, e2, len(seq0), delta_l, idt, cov))
                finally:
                    kup.free_aln_range(aln_range_ptr)
        finally:
            kup.free_kmer_match(kmer_match_ptr)
    finally:
        # Release the index even when logging or edlib raised.
        kup.free_kmer_lookup(lk_ptr)
        kup.free_seq_array(sa_ptr)
        kup.free_seq_addr_array(sda_ptr)
    return aln_data
def get_alignment(seq1, seq0):
    """Align ``seq1`` against ``seq0`` around their best shared-8-mer range.

    ``seq0`` is indexed with ``kup`` and ``kup.find_best_aln_range`` picks
    the query/target range for ``seq1``.  When the query range is longer
    than 500, both range ends are pushed out by up to ``2 * K`` (clamped to
    the sequence ends) and the ranges are aligned with ``DWA.align``.

    Returns:
        ``(q_start, q_end, t_start, t_end, aln_size, aln_dist)`` when the
        alignment string is longer than 500, otherwise ``None``.  The end
        coordinates are the range starts plus the aligned lengths reported
        by ``DWA``.
    """
    K = 8

    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
    sa_ptr = kup.allocate_seq(len(seq0))
    sda_ptr = kup.allocate_seq_addr(len(seq0))
    try:
        kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
        kmer_match_ptr = kup.find_kmer_pos_for_seq(
            seq1, len(seq1), K, sda_ptr, lk_ptr)
        try:
            # No shared k-mer means there is no range to look for.
            if kmer_match_ptr[0].count == 0:
                return None
            aln_range = kup.find_best_aln_range(kmer_match_ptr, K, K * 5, 50)
            s1, e1 = aln_range.s1, aln_range.e1
            s2, e2 = aln_range.s2, aln_range.e2
        finally:
            kup.free_kmer_match(kmer_match_ptr)

        if e1 - s1 <= 500:
            return None

        # Extend the right ends by 2*K without running past either sequence.
        e1 = len(seq1) if e1 >= len(seq1) - 2 * K else e1 + K * 2
        e2 = len(seq0) if e2 >= len(seq0) - 2 * K else e2 + K * 2

        alignment = DWA.align(seq1[s1:e1], e1 - s1,
                              seq0[s2:e2], e2 - s2,
                              100, 0)
        try:
            aln_size = alignment[0].aln_str_size
            aln_dist = alignment[0].dist
            aln_q_s = alignment[0].aln_q_s
            aln_q_e = alignment[0].aln_q_e
            aln_t_s = alignment[0].aln_t_s
            aln_t_e = alignment[0].aln_t_e
            assert (aln_q_e - aln_q_s <= aln_size
                    or aln_t_e - aln_t_s <= aln_size)
        finally:
            # Freed even if the sanity check above fails.
            DWA.free_alignment(alignment)
    finally:
        kup.free_seq_addr_array(sda_ptr)
        kup.free_seq_array(sa_ptr)
        kup.free_kmer_lookup(lk_ptr)

    if aln_size > 500:
        return (s1, s1 + aln_q_e - aln_q_s,
                s2, s2 + aln_t_e - aln_t_s,
                aln_size, aln_dist)
    return None
def get_alignment(seq1, seq0):
    """Align ``seq1`` against ``seq0`` around their best shared-8-mer range.

    ``seq0`` is indexed with ``kup`` and ``kup.find_best_aln_range`` picks
    the query/target range for ``seq1``.  When the query range is longer
    than 500, both range ends are pushed out by up to ``2 * K`` (clamped to
    the sequence ends) and the ranges are aligned with ``DWA.align``.

    Returns:
        ``(q_start, q_end, t_start, t_end, aln_size, aln_dist)`` when the
        alignment string is longer than 500, otherwise ``None``.  The end
        coordinates are the range starts plus the aligned lengths reported
        by ``DWA``.
    """
    K = 8
    len_1 = len(seq1)
    len_0 = len(seq0)

    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
    sa_ptr = kup.allocate_seq(len_0)
    sda_ptr = kup.allocate_seq_addr(len_0)
    try:
        kup.add_sequence(0, K, seq0, len_0, sda_ptr, sa_ptr, lk_ptr)
        kmer_match_ptr = kup.find_kmer_pos_for_seq(
            seq1, len_1, K, sda_ptr, lk_ptr)
        try:
            # No shared k-mer means there is no range to look for.
            if kmer_match_ptr[0].count == 0:
                return None
            aln_range = kup.find_best_aln_range(kmer_match_ptr, K, K * 5, 50)
            s1, e1 = aln_range.s1, aln_range.e1
            s2, e2 = aln_range.s2, aln_range.e2
        finally:
            kup.free_kmer_match(kmer_match_ptr)

        if e1 - s1 <= 500:
            return None

        # Extend the right ends by 2*K without running past either sequence.
        e1 = len_1 if e1 >= len_1 - 2 * K else e1 + K * 2
        e2 = len_0 if e2 >= len_0 - 2 * K else e2 + K * 2

        alignment = DWA.align(seq1[s1:e1], e1 - s1,
                              seq0[s2:e2], e2 - s2,
                              100, 0)
        try:
            result = alignment[0]
            aln_size = result.aln_str_size
            aln_dist = result.dist
            q_aligned = result.aln_q_e - result.aln_q_s
            t_aligned = result.aln_t_e - result.aln_t_s
            assert q_aligned <= aln_size or t_aligned <= aln_size
        finally:
            # Freed even if the sanity check above fails.
            DWA.free_alignment(alignment)
    finally:
        kup.free_seq_addr_array(sda_ptr)
        kup.free_seq_array(sa_ptr)
        kup.free_kmer_lookup(lk_ptr)

    if aln_size > 500:
        return s1, s1 + q_aligned, s2, s2 + t_aligned, aln_size, aln_dist
    return None
def get_aln_data(t_seq, q_seq):
    """Map ``q_seq`` onto ``t_seq`` through shared 8-mers and align the best range.

    ``t_seq`` is indexed with ``kup``; if ``q_seq`` has at least one k-mer
    hit, ``kup.find_best_aln_range`` picks the query/target range and, when
    the query range is longer than 100, that range is aligned with
    ``DWA.align``.

    Returns:
        A tuple ``(aln_data, x, y)``.  ``aln_data`` is a list holding at most
        one record ``(q_id, 0, s1, e1, len(q_seq), s2, e2, len(t_seq),
        aln_str_size, dist)``, appended only when the alignment string is
        longer than 100.  ``x`` and ``y`` are the query and target positions
        of every k-mer hit (tuples), or empty lists when there is no hit.
    """
    aln_data = []
    x = []
    y = []
    K = 8
    seq0 = t_seq
    q_id = "dummy"

    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
    sa_ptr = kup.allocate_seq(len(seq0))
    sda_ptr = kup.allocate_seq_addr(len(seq0))
    try:
        kup.add_sequence(0, K, seq0, len(seq0), sda_ptr, sa_ptr, lk_ptr)
        kmer_match_ptr = kup.find_kmer_pos_for_seq(
            q_seq, len(q_seq), K, sda_ptr, lk_ptr)
        try:
            kmer_match = kmer_match_ptr[0]
            if kmer_match.count != 0:
                aln_range_ptr = kup.find_best_aln_range(
                    kmer_match_ptr, K, K * 5, 12)
                try:
                    aln_range = aln_range_ptr[0]
                    x, y = zip(*[(kmer_match.query_pos[i],
                                  kmer_match.target_pos[i])
                                 for i in range(kmer_match.count)])
                    s1, e1 = aln_range.s1, aln_range.e1
                    s2, e2 = aln_range.s2, aln_range.e2
                    if e1 - s1 > 100:
                        alignment = DWA.align(q_seq[s1:e1], e1 - s1,
                                              seq0[s2:e2], e2 - s2,
                                              1500, 1)
                        try:
                            if alignment[0].aln_str_size > 100:
                                aln_data.append(
                                    (q_id, 0, s1, e1, len(q_seq),
                                     s2, e2, len(seq0),
                                     alignment[0].aln_str_size,
                                     alignment[0].dist))
                        finally:
                            DWA.free_alignment(alignment)
                finally:
                    kup.free_aln_range(aln_range_ptr)
        finally:
            kup.free_kmer_match(kmer_match_ptr)
    finally:
        # Release the index even when something above raised.
        kup.free_kmer_lookup(lk_ptr)
        kup.free_seq_array(sa_ptr)
        kup.free_seq_addr_array(sda_ptr)
    return aln_data, x, y
def get_aln_data(t_seq, q_seq):
    """Map ``q_seq`` onto ``t_seq`` through shared 8-mers and score the best range.

    ``t_seq`` is indexed with ``kup``; if ``q_seq`` has at least one k-mer
    hit, ``kup.find_best_aln_range`` picks the query/target range and, when
    the query range is longer than 100, the two ranges are globally aligned
    with ``edlib.align(..., mode="NW")``.

    Returns:
        A list holding at most one record ``(q_id, 0, s1, e1, len(q_seq),
        s2, e2, len(t_seq), delta_l, idt, cov)`` where ``delta_l`` is
        ``len(q_seq) - len(t_seq)``, ``cov`` is the mapped query span over
        the query length, and ``idt`` is ``(span - editDistance) / span``.
    """
    aln_data = []
    K = 8
    seq0 = t_seq
    q_id = "dummy"
    q_len = len(q_seq)
    t_len = len(t_seq)
    range_fmt = ('(q, s1 = {}, e1 = {}, len1 = {}, (e1 - s1) = {}, '
                 't, s2 = {}, e2 = {}, (e2 - s2) = {}, len2 = {})')

    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
    sa_ptr = kup.allocate_seq(t_len)
    sda_ptr = kup.allocate_seq_addr(t_len)
    try:
        kup.add_sequence(0, K, seq0, t_len, sda_ptr, sa_ptr, lk_ptr)
        kmer_match_ptr = kup.find_kmer_pos_for_seq(
            q_seq, q_len, K, sda_ptr, lk_ptr)
        try:
            if kmer_match_ptr[0].count != 0:
                aln_range_ptr = kup.find_best_aln_range(
                    kmer_match_ptr, K, K * 5, 12)
                try:
                    aln_range = aln_range_ptr[0]
                    s1, e1 = aln_range.s1, aln_range.e1
                    s2, e2 = aln_range.s2, aln_range.e2
                    q_span = e1 - s1
                    # Argument order follows the placeholders: len1 is the
                    # full query length, then the mapped span.
                    range_txt = range_fmt.format(
                        s1, e1, q_len, q_span, s2, e2, e2 - s2, t_len)
                    log('Mapped ' + range_txt)

                    if q_span > 100:
                        log('Calling edlib.align' + range_txt)
                        # Align using Edlib instead of DWA.
                        edlib_result = edlib.align(
                            q_seq[s1:e1], seq0[s2:e2], mode="NW")

                        delta_l = q_len - t_len
                        cov = float(q_span) / float(q_len)
                        idt = (float(q_span - edlib_result['editDistance'])
                               / float(q_span))
                        aln_data.append(
                            (q_id, 0, s1, e1, q_len,
                             s2, e2, t_len, delta_l, idt, cov))
                finally:
                    kup.free_aln_range(aln_range_ptr)
        finally:
            kup.free_kmer_match(kmer_match_ptr)
    finally:
        # Release the index even when logging or edlib raised.
        kup.free_kmer_lookup(lk_ptr)
        kup.free_seq_array(sa_ptr)
        kup.free_seq_addr_array(sda_ptr)
    return aln_data
def get_ovelap_alignment(seq1, seq0):
    """Classify how ``seq1`` and ``seq0`` relate and align their overlap.

    ``seq0`` is indexed with ``kup`` and ``kup.find_best_aln_range`` picks
    the matching range of ``seq1`` (``s1:e1``) and ``seq0`` (``s0:e0``).
    If the ``seq1`` range is longer than 500:

    * a range within 100 of both ends of ``seq1`` gives ``"contains"``;
    * a range within 100 of both ends of ``seq0`` gives ``"contained"``;
    * otherwise the ranges are stretched to the sequence ends to form a
      dovetail overlap; if that stretch covers a whole sequence the status
      becomes ``"contained"`` / ``"contains"``, else the stretched ranges
      are aligned with ``DWA.align`` and an alignment string longer than
      500 gives ``"overlap"``.

    Returns:
        ``(s1, e1_aln, s0, e0_aln, aln_size, aln_dist, contain_status)``
        when an alignment longer than 500 was produced, otherwise
        ``(0, 0, 0, 0, 0, 0, contain_status)``.  ``contain_status`` is one
        of ``"none"``, ``"contains"``, ``"contained"``, ``"overlap"``.
    """
    K = 8
    do_aln = False
    contain_status = "none"
    len_1 = len(seq1)
    len_0 = len(seq0)

    lk_ptr = kup.allocate_kmer_lookup(1 << (K * 2))
    sa_ptr = kup.allocate_seq(len_0)
    sda_ptr = kup.allocate_seq_addr(len_0)
    try:
        kup.add_sequence(0, K, seq0, len_0, sda_ptr, sa_ptr, lk_ptr)
        kmer_match_ptr = kup.find_kmer_pos_for_seq(
            seq1, len_1, K, sda_ptr, lk_ptr)
        try:
            # No shared k-mer means there is no range to look for.
            if kmer_match_ptr[0].count == 0:
                return 0, 0, 0, 0, 0, 0, contain_status
            aln_range = kup.find_best_aln_range(kmer_match_ptr, K, K * 5, 50)
            s1, e1 = aln_range.s1, aln_range.e1
            s0, e0 = aln_range.s2, aln_range.e2
        finally:
            kup.free_kmer_match(kmer_match_ptr)

        if e1 - s1 > 500:
            if s1 < 100 and len_1 - e1 < 100:
                contain_status = "contains"
            elif s0 < 100 and len_0 - e0 < 100:
                contain_status = "contained"
            else:
                do_aln = True
                if s0 < s1:
                    # seq0 starts inside seq1: anchor seq0 at 0 and run to
                    # the end of seq1 (or of seq0, whichever comes first).
                    s1 -= s0
                    s0 = 0
                    e1 = len_1
                    e0 = len_1 - s1 if len_1 - s1 < len_0 else len_0
                    if e0 == len_0:
                        do_aln = False
                        contain_status = "contained"
                else:
                    # seq1 starts inside (or level with) seq0: mirror case.
                    s0 -= s1
                    s1 = 0
                    e0 = len_0
                    e1 = len_0 - s0 if len_0 - s0 < len_1 else len_1
                    if e1 == len_1:
                        do_aln = False
                        contain_status = "contains"

            if do_aln:
                alignment = DWA.align(seq1[s1:e1], e1 - s1,
                                      seq0[s0:e0], e0 - s0,
                                      500, 0)
                try:
                    aln_size = alignment[0].aln_str_size
                    aln_dist = alignment[0].dist
                    aln_q_s = alignment[0].aln_q_s
                    aln_q_e = alignment[0].aln_q_e
                    aln_t_s = alignment[0].aln_t_s
                    aln_t_e = alignment[0].aln_t_e
                    assert (aln_q_e - aln_q_s <= aln_size
                            or aln_t_e - aln_t_s <= aln_size)
                    if aln_size > 500:
                        contain_status = "overlap"
                finally:
                    # Freed even if the sanity check above fails.
                    DWA.free_alignment(alignment)
    finally:
        kup.free_seq_addr_array(sda_ptr)
        kup.free_seq_array(sa_ptr)
        kup.free_kmer_lookup(lk_ptr)

    if e1 - s1 > 500 and do_aln and aln_size > 500:
        return (s1, s1 + aln_q_e - aln_q_s,
                s0, s0 + aln_t_e - aln_t_s,
                aln_size, aln_dist, contain_status)
    else:
        return 0, 0, 0, 0, 0, 0, contain_status