def map_pair(seq1, seq1_len, seq2, seq2_len, k):
    """Return the distance between the two reads of a pair on a shared contig.

    Each read is scanned left to right for its first k-mer whose
    representative (``kmer_rep``) is a key of the module-level
    ``kmer_str_dict``.  From that hit a contig id, a position and an
    orientation flag are derived.

    Args:
        seq1: first read; sliced as ``seq1[i:i + k]``.
        seq1_len: length used to bound the scan of ``seq1``.
        seq2: second read.
        seq2_len: length used to bound the scan of ``seq2``.
        k: k-mer length.

    Returns:
        The absolute difference of the two derived positions, or ``-1`` when
        either read has no indexed k-mer, the hits lie on different contigs,
        or both hits carry the same orientation flag.
    """

    def first_hit(read, read_len):
        # Return (contig_id, position, orientation) for the first indexed
        # k-mer of `read`, or None when no k-mer of the read is indexed.
        for offset in range(read_len - k + 1):
            window = read[offset:offset + k]
            rep = kmer_rep(window)
            if rep not in kmer_str_dict:
                continue
            info = kmer_str_dict[rep]
            contig = info.contig_id
            if (rep == window) == info.sense_in_contig:
                # Read k-mer has the same orientation as the contig.
                return contig, info.pos_in_contig - offset, 1
            # Read k-mer is the twin of what the contig stores.
            return contig, info.pos_in_contig + k + offset, 0
        return None

    hit1 = first_hit(seq1, seq1_len)
    if hit1 is None:
        return -1

    # The second read is only scanned once the first one produced a hit.
    hit2 = first_hit(seq2, seq2_len)
    if hit2 is None:
        return -1

    contig1, pos1, sense1 = hit1
    contig2, pos2, sense2 = hit2

    if contig1 != contig2:
        return -1
    # Both flags are 0 or 1 here; equal flags mean both reads point the
    # same way, which is rejected.
    if sense1 == sense2:
        return -1

    return pos1 - pos2 if pos1 > pos2 else pos2 - pos1
def fix_contigs(k):
    """Split every contig that some transcript covers only partially.

    Operates on module-level state: ``contig_list``,
    ``contig_find_trans_list``, ``ecs_list`` and ``kmer_str_dict``.

    For each contig that existed when the function was entered, the
    ``[start_in_tran, stop_in_tran)`` intervals recorded in
    ``contig_find_trans_list`` are compared with the full k-mer range
    ``[0, n_of_kmer)`` of the contig.  If any interval differs, the contig is
    cut at every distinct interval endpoint.  The first piece replaces the
    original contig in place; the remaining pieces are appended to
    ``contig_list`` (with a ``-1`` placeholder appended to ``ecs_list``) and
    their transcript lists are appended to ``contig_find_trans_list``.
    The k-mer entries of each piece are re-pointed to the piece.

    Args:
        k: k-mer length; a piece holding ``n`` k-mers spans ``n + k - 1``
            characters of the original contig sequence.

    Returns:
        None.  Calls ``program_stop("ecs.py-4")`` if a k-mer of a piece is
        missing from ``kmer_str_dict``.
    """
    # Snapshot the count: pieces appended below already have full coverage
    # and must not be visited again.
    original_count = len(contig_find_trans_list)

    for contig_id in range(original_count):
        whole_len = contig_list[contig_id].n_of_kmer
        old_trans_info = contig_find_trans_list[contig_id]

        fully_covered = all(
            info.start_in_tran == 0 and info.stop_in_tran == whole_len
            for info in old_trans_info
        )
        if fully_covered:
            continue

        break_points = sorted(
            {
                point
                for info in old_trans_info
                for point in (info.start_in_tran, info.stop_in_tran)
            }
        )
        old_contig_seq = contig_list[contig_id].seq

        for j in range(len(break_points) - 1):
            piece_start = break_points[j]
            piece_stop = break_points[j + 1]

            # A fresh object per piece: reusing one Contig would make every
            # piece in contig_list alias the values of the last piece.
            new_contig = Contig()
            # A slice end is an absolute index (not a length), so the piece
            # ends k - 1 characters past its last k-mer's start offset + 1.
            new_contig.seq = old_contig_seq[piece_start:piece_stop + k - 1]
            new_contig.n_of_kmer = piece_stop - piece_start

            if j == 0:
                # The first piece keeps the id/slot of the original contig.
                new_contig.id = contig_id
                contig_list[contig_id] = new_contig
            else:
                new_contig.id = len(contig_list)
                contig_list.append(new_contig)
                ecs_list.append(-1)

            # Re-point every k-mer of the piece to its new contig/offset.
            for m in range(len(new_contig.seq) - k + 1):
                curr_rep = kmer_rep(new_contig.seq[m:m + k])
                if curr_rep in kmer_str_dict:
                    entry = kmer_str_dict[curr_rep]
                    entry.contig_id = new_contig.id
                    entry.pos_in_contig = m
                    entry.n_of_kmer_in_contig = new_contig.n_of_kmer
                else:
                    program_stop("ecs.py-4")

            # A transcript belongs to the piece when its old interval
            # overlaps [piece_start, piece_stop); it then covers the piece
            # entirely because the piece lies between adjacent break points.
            new_trans_info_list = []
            for info in old_trans_info:
                if (info.start_in_tran < piece_stop
                        and info.stop_in_tran > piece_start):
                    new_trans_info = ContigFindTrans()
                    new_trans_info.sense_in_tran = info.sense_in_tran
                    new_trans_info.tran_id = info.tran_id
                    new_trans_info.start_in_tran = 0
                    new_trans_info.stop_in_tran = new_contig.n_of_kmer
                    new_trans_info_list.append(new_trans_info)

            if j == 0:
                contig_find_trans_list[contig_id] = new_trans_info_list
            else:
                contig_find_trans_list.append(new_trans_info_list)
def fwstep(kmer):
    """Try to advance one step from ``kmer`` along an unbranched path.

    A step is taken only when exactly one k-mer yielded by
    ``contig_fw(kmer)`` has its representative in ``kmer_str_dict``, exactly
    one k-mer yielded by ``contig_bw`` of that successor has its
    representative in ``kmer_str_dict``, and the successor differs from
    ``kmer``.

    Args:
        kmer: the k-mer to extend from.

    Returns:
        ``1`` when a step was taken, in which case the module-level
        ``fw_step_kmer`` is set to the successor; ``0`` otherwise
        (``fw_step_kmer`` is left untouched).
    """
    global fw_step_kmer

    successors = []
    for candidate in contig_fw(kmer):
        if kmer_rep(candidate) in kmer_str_dict:
            successors.append(candidate)
            if len(successors) > 1:
                # Branching forward: stop as soon as a second hit shows up.
                return 0
    if len(successors) != 1:
        return 0
    successor = successors[0]

    # Every backward neighbour of the successor is examined (no early exit).
    predecessor_hits = 0
    for neighbour in contig_bw(successor):
        if kmer_rep(neighbour) in kmer_str_dict:
            predecessor_hits += 1
    if predecessor_hits != 1:
        return 0

    if successor != kmer:
        fw_step_kmer = successor
        return 1
    return 0
def build_ecs(seqs, k):
    """Assign an equivalence class to every contig and link contigs to sequences.

    Operates on module-level state: reads ``kmer_str_dict`` and
    ``contig_list``; appends to ``ec_map`` and ``contig_find_trans_list``;
    writes ``ec_inv_dict``, ``ecs_list``, each contig's ``ecs_id`` and each
    contig's ``include_trans`` list.  Calls ``fix_contigs(k)`` between the
    first and second phase.

    Phases:
        1. Walk each sequence k-mer by k-mer, recording for every contig it
           touches a ``ContigFindTrans`` with the ``[start, stop)`` k-mer
           interval of the contig that the sequence covers.  Each sequence
           index ``i`` also gets the singleton class ``[i]``.
        2. After ``fix_contigs`` has split partially covered contigs, give
           each contig the class whose member list equals the sorted,
           de-duplicated sequence ids recorded for it (creating the class
           when it does not exist yet).
        3. Walk each sequence again, jumping contig by contig, and append a
           ``ContigIncludeTrans`` (sequence id, offset in sequence, relative
           orientation) to every contig visited.

    Args:
        seqs: indexable collection of sequences; each element is sliced and
            the slice converted with ``str``.
        k: k-mer length.

    Returns:
        None.  Calls ``program_stop`` ("ecs.py-2" / "ecs.py-3") when a k-mer
        of a sequence is missing from ``kmer_str_dict``.
    """
    # ---- Phase 1: which part of which contig does each sequence cover? ----
    contig_find_trans_dict = {idx: [] for idx in range(len(contig_list))}

    for tran_id, seq in enumerate(seqs):
        singleton = [tran_id]
        ec_map.append(singleton)
        ec_inv_dict[tran_id] = singleton

        n_of_kmers_of_seq = len(seq) - k + 1
        j = 0
        while j < n_of_kmers_of_seq:
            curr_kmer = str(seq[j:j + k])
            curr_rep = kmer_rep(curr_kmer)
            if curr_rep in kmer_str_dict:
                kmer_info = kmer_str_dict[curr_rep]
                curr_contig_id = kmer_info.contig_id
                contig_seq = contig_list[curr_contig_id].seq
                # Number of k-mer windows in the contig sequence.
                kmers_in_contig_len = max(len(contig_seq) - k + 1, 0)
                remaining_in_seq = n_of_kmers_of_seq - j

                hit = ContigFindTrans()
                hit.tran_id = tran_id
                if (curr_kmer == curr_rep) == kmer_info.sense_in_contig:
                    hit.sense_in_tran = 1
                    # Offset of the first window equal to the k-mer; the
                    # k-mer has length k, so this equals its index in the
                    # list of the contig's k-mers (ValueError if absent).
                    hit.start_in_tran = contig_seq.index(curr_kmer)
                    if kmers_in_contig_len - hit.start_in_tran > remaining_in_seq:
                        # The sequence ends inside the contig: record a
                        # partial interval so fix_contigs can split there.
                        hit.stop_in_tran = hit.start_in_tran + remaining_in_seq
                        j = n_of_kmers_of_seq
                    else:
                        # The sequence runs to the end of the contig; resume
                        # at the k-mer right after it.
                        hit.stop_in_tran = kmers_in_contig_len
                        j = j + hit.stop_in_tran - hit.start_in_tran
                else:
                    hit.sense_in_tran = 0
                    # [start, stop) is half-open, hence the + 1 on the
                    # offset of the twin inside the contig.
                    hit.stop_in_tran = contig_seq.index(kmer_twin(curr_kmer)) + 1
                    if hit.stop_in_tran > remaining_in_seq:
                        hit.start_in_tran = hit.stop_in_tran - remaining_in_seq
                        j = n_of_kmers_of_seq
                    else:
                        hit.start_in_tran = 0
                        j = j + hit.stop_in_tran - hit.start_in_tran
                contig_find_trans_dict[curr_contig_id].append(hit)
            else:
                print(curr_rep)
                print(j)
                program_stop("ecs.py-2")

    for idx in range(len(contig_list)):
        contig_find_trans_list.append(contig_find_trans_dict[idx])

    fix_contigs(k)

    # ---- Phase 2: one equivalence class per distinct set of sequences. ----
    # Local reverse index (members -> class id) so each contig is resolved
    # with one hash lookup instead of a scan over all existing classes.
    ec_of_members = {
        tuple(members): ec_id for ec_id, members in ec_inv_dict.items()
    }
    for curr_contig_id in range(len(contig_find_trans_list)):
        members = sorted(
            {info.tran_id for info in contig_find_trans_list[curr_contig_id]}
        )
        key = tuple(members)
        ec = ec_of_members.get(key)
        if ec is None:
            ec = len(ec_inv_dict)
            ec_inv_dict[ec] = members
            ec_map.append(members)
            ec_of_members[key] = ec
        ecs_list[curr_contig_id] = ec
        contig_list[curr_contig_id].ecs_id = ec

    # ---- Phase 3: record, per contig, the sequences that run through it. ----
    for tran_id, seq in enumerate(seqs):
        n_of_kmers_of_seq = len(seq) - k + 1
        j = 0
        while j < n_of_kmers_of_seq:
            curr_kmer = str(seq[j:j + k])
            curr_rep = kmer_rep(curr_kmer)
            if curr_rep in kmer_str_dict:
                curr_kmer_info = kmer_str_dict[curr_rep]
                contig = contig_list[curr_kmer_info.contig_id]

                contig_include_trans = ContigIncludeTrans()
                contig_include_trans.tran_id = tran_id
                contig_include_trans.pos_in_tran = j
                contig_include_trans.sense_in_tran = int(
                    (curr_kmer == curr_rep) == curr_kmer_info.sense_in_contig
                )
                contig.include_trans.append(contig_include_trans)
                # Jump over the whole contig to the next one in the sequence.
                j = j + contig.n_of_kmer
            else:
                program_stop("ecs.py-3")
def match(curr_read, curr_read_len, k):
    """Collect the indexed k-mers hit by a read, probing ahead along contigs.

    Reads the module-level ``kmer_str_dict`` and ``skip`` and calls
    ``kmer_rep`` and ``get_dist``; none of these are defined in this block.

    Args:
        curr_read: the read; sliced as ``curr_read[i:i + k]``.
        curr_read_len: read length used for every bounds check.
        k: k-mer length.

    Returns:
        A list of ``(kmer_info, position)`` tuples.  ``kmer_info`` is a value
        taken from ``kmer_str_dict``; ``position`` is an offset into the read
        (for probe results it is the probed/jump offset, not necessarily the
        offset of the k-mer that produced ``kmer_info``).
    """
    # Disabled debug counter:
    # global num_read
    # num_read = num_read + 1
    # if num_read %2 == 0:
    # print(num_read)
    v = []
    i = 0
    again = 0
    while i < curr_read_len - k + 1:
        kit = curr_read[i:i + k]
        kit_rep = kmer_rep(kit)
        if kit_rep in kmer_str_dict:
            curr_kit_info = kmer_str_dict[kit_rep]
            pos = i
            v.append((curr_kit_info, pos))
            # presumably the number of k-mers left on the contig in the
            # read's direction — TODO confirm against get_dist
            dist = get_dist(int((kit == kit_rep)), curr_kit_info)
            if dist > 2:
                pos2 = pos + dist
                if pos2 + k >= curr_read_len:  # out of range: probe the last k-mer of the read instead
                    pos2 = curr_read_len - k
                kit2 = curr_read[pos2:pos2 + k]
                kit2_rep = kmer_rep(kit2)
                if kit2_rep in kmer_str_dict:
                    found2 = 0
                    found2pos = pos + dist
                    kit2_info = kmer_str_dict[kit2_rep]
                    if curr_kit_info.contig_id == kit2_info.contig_id:
                        # Probe landed on the same contig as the current hit.
                        found2 = 1
                        found2pos = pos + dist
                else:
                    # Probe k-mer is not indexed at all: treated as a
                    # successful probe anchored at the current position.
                    found2 = 1
                    found2pos = pos
                if found2:
                    if found2pos >= curr_read_len - k:
                        # Jump target is at/after the last k-mer: record the
                        # last offset with the current hit and finish.
                        v.append((curr_kit_info, curr_read_len - k))
                        return v
                    else:
                        # NOTE(review): i is not advanced to pos2 here, so
                        # the scan continues from pos + 1 — confirm intended.
                        v.append((curr_kit_info, found2pos))
                else:
                    # Probe hit a different contig; try the midpoint k-mer.
                    found_middle = 0
                    if dist > 4:
                        middle_pos = int((pos + pos2) / 2)
                        found3pos = pos + dist
                        kit3 = curr_read[middle_pos:middle_pos + k]
                        kit3_rep = kmer_rep(kit3)
                        if kit3_rep in kmer_str_dict:
                            kit3_info = kmer_str_dict[kit3_rep]
                            middle_contig_id = kit3_info.contig_id
                            if middle_contig_id == curr_kit_info.contig_id:
                                found_middle = 1
                                found3pos = middle_pos
                            elif middle_contig_id == kit2_info.contig_id:
                                found_middle = 1
                                found3pos = pos + dist
                            else:
                                pass
                            if found_middle:
                                v.append((kit3_info, found3pos))
                                if pos2 >= curr_read_len - k:
                                    return v
                                else:
                                    # Resume after the probed offset (the
                                    # loop footer adds 1).
                                    i = pos2
                            else:
                                pass
                        else:
                            pass
                    else:
                        pass
                    if not found_middle:
                        i = i + 1
                        again = 1
                        if again:
                            # Fallback: rescan the read from offset 0 up to
                            # pos2.  A k-mer is looked up whenever the
                            # counter j is 0; j wraps to 0 on reaching the
                            # module-level `skip`.
                            # NOTE(review): the rescan starts at 0, so hits
                            # already in v can be appended again, and the
                            # function returns at pos2 without examining the
                            # rest of the read — confirm intended.
                            j = 0
                            m = 0
                            while j < curr_read_len and m < curr_read_len:
                                if j == skip:
                                    j = 0
                                if j == 0:
                                    again_kit = curr_read[m:m + k]
                                    again_kit_rep = kmer_rep(again_kit)
                                    if again_kit_rep in kmer_str_dict:
                                        again_kit_info = kmer_str_dict[
                                            again_kit_rep]
                                        v.append((again_kit_info, m))
                                if m == pos2:
                                    again = 0
                                    return v
                                m = m + 1
                                j = j + 1
            else:
                pass
        else:
            pass
        i = i + 1
    return v
def build_dbg(seqs, k):
    """Index every k-mer of ``seqs`` and chain the k-mers into contigs.

    Operates on module-level state: fills ``kmer_str_dict`` with one
    ``KmerInfo`` per distinct representative k-mer, appends one ``Contig``
    per chain to ``contig_list`` and a ``-1`` placeholder per contig to
    ``ecs_list``.  Uses the module-level ``fw_step_kmer`` as the cursor that
    ``fwstep`` advances.

    Args:
        seqs: iterable of sequences; each k-length slice is converted with
            ``str`` before use.
        k: k-mer length.

    Returns:
        None.  Calls ``program_stop("contig.py")`` if a chained k-mer is
        missing from ``kmer_str_dict``.
    """
    global fw_step_kmer

    # Break the sequences into the set of representative k-mers.
    rep_kmers = set()
    for seq in seqs:
        for offset in range(len(seq) - k + 1):
            rep_kmers.add(kmer_rep(str(seq[offset:offset + k])))

    # Attach an info record to every k-mer.
    for rep in rep_kmers:
        kmer_str_dict[rep] = KmerInfo()

    # Chain k-mers into contigs, seeding from each k-mer not yet placed.
    for seed, seed_info in kmer_str_dict.items():
        if seed_info.contig_id != -1:
            continue

        seed_twin = kmer_twin(seed)

        # Extend forward from the seed.
        is_loop = 0
        forward = [seed]
        previous = seed
        fw_step_kmer = seed
        while fwstep(fw_step_kmer):
            if fw_step_kmer == seed:
                # Came back to the seed: the chain is a cycle.
                is_loop = 1
                break
            if fw_step_kmer == seed_twin:
                # Reached the seed's twin; only counted as a loop when the
                # chain already holds more than the seed itself.
                is_loop = (len(forward) > 1)
                break
            if fw_step_kmer == kmer_twin(previous):
                break
            forward.append(fw_step_kmer)
            previous = fw_step_kmer

        # Extend "backward" by walking forward from the seed's twin.
        fw_step_kmer = seed_twin
        backward = []
        previous = seed_twin
        if not is_loop:
            while fwstep(fw_step_kmer):
                if (fw_step_kmer == seed_twin
                        or fw_step_kmer == seed
                        or fw_step_kmer == kmer_twin(previous)):
                    break
                backward.append(fw_step_kmer)
                previous = fw_step_kmer

        # Backward k-mers are flipped and reversed so the whole chain reads
        # in the seed's orientation.
        chain = [kmer_twin(km) for km in reversed(backward)]
        chain.extend(forward)
        chain_len = len(chain)

        new_id = len(contig_list)
        contig = Contig()
        contig.id = new_id
        contig.n_of_kmer = chain_len
        # First k-mer in full, then the last character of each later one.
        contig.seq = chain[0] + "".join(km[-1] for km in chain[1:])

        # Store where each k-mer sits in the contig.
        for position, km in enumerate(chain):
            rep = kmer_rep(km)
            if rep in kmer_str_dict:
                entry = kmer_str_dict[rep]
                entry.contig_id = new_id
                entry.pos_in_contig = position
                entry.n_of_kmer_in_contig = chain_len
                entry.sense_in_contig = 1 if rep == km else 0
            else:
                program_stop("contig.py")

        contig_list.append(contig)
        ecs_list.append(-1)