def update_clouds(self, left_kmer, kmers):
    """Merge the clouds attached to each k-mer into the left or right cloud set.

    ``kmers`` maps a k-mer to the clouds (reads) associated with it.  Each key
    is compared, by ``tools.hamming_distance``, against ``left_kmer`` and
    against ``tools.bool_reverse(left_kmer)``:

    * strictly closer to ``left_kmer``      -> clouds go to ``self.left_clouds``
    * strictly closer to the reversed k-mer -> clouds go to ``self.right_clouds``
    * equal distance                        -> the clouds are not assigned
    """
    right_kmer = tools.bool_reverse(left_kmer)
    for candidate, clouds in kmers.items():
        # assumes tools.hamming_distance is a pure function, so each
        # distance is computed once per k-mer -- TODO confirm
        to_left = tools.hamming_distance(candidate, left_kmer)
        to_right = tools.hamming_distance(candidate, right_kmer)
        if to_left < to_right:
            self.left_clouds.update(clouds)
        elif to_left > to_right:
            self.right_clouds.update(clouds)
def find_indel_pair_kmer(mapK, uniqK_1mers, k):
    """Group each unique (k-1)-mer with the k-mers recorded under the same key.

    Args:
        mapK: dict mapping a key to a list of ``(kmer, coverage)`` tuples.
            The lists are filtered IN PLACE (same list objects), so the
            caller sees the filtered content afterwards.
        uniqK_1mers: iterable of ``(k_1mer, coverage)`` pairs.
        k: k-mer length; ``int(k / 2)`` is used as the middle index.

    Returns:
        A list with one entry per (k-1)-mer that is also a key of ``mapK``
        with at least one surviving k-mer.  Each entry is a deep copy of the
        surviving ``(kmer, coverage)`` list with ``(k_1mer, coverage)``
        appended as its last element.
    """
    mid = int(k / 2)
    print("before filter mapK")
    uniqMapK = {}
    for key, pairs in mapK.items():
        kept = []
        for pair in pairs:
            kmer, cov = pair
            leftHalf = kmer[:mid]
            rightHalf = kmer[mid + 1:]
            # mutation => delete: drop the k-mer when either half is within
            # Hamming distance 1 of the same-length window shifted by one.
            if (tools.hamming_distance(rightHalf, kmer[mid:-1]) <= 1 or
                    tools.hamming_distance(leftHalf, kmer[1:mid + 1]) <= 1):
                continue
            kept.append(pair)
        # Write back through a slice so the list object is mutated in place.
        # Removing items from the list while iterating it would skip the
        # element that follows every removal.
        pairs[:] = kept
        if len(pairs) >= 1:
            uniqMapK[key] = pairs
    print("after filter mapK")

    print("uniq K-1 mer size", len(uniqK_1mers))
    # Later duplicates overwrite earlier ones, as with repeated assignment.
    mapK_1 = {key: cov for (key, cov) in uniqK_1mers}

    indelPair = []
    print("map K-1 size", len(mapK_1))
    for key in mapK_1:
        if key in uniqMapK:
            # Copy so appending the (k-1)-mer does not modify mapK's list.
            temp = copy.deepcopy(uniqMapK[key])
            temp.append((key, mapK_1[key]))
            indelPair.append(temp)
    return indelPair
def calc_MEC(self):
    """Recompute the MEC value, store it in ``self.MEC`` and return it.

    The value is the sum of ``tools.hamming_distance`` between each cloud's
    sequence and the slice ``self[cloud.start:cloud.end + 1]``:

    * clouds in ``left_clouds`` or ``unsure_clouds`` are compared as-is;
    * clouds in ``right_clouds`` are passed through ``tools.list_reverse``
      before the comparison.
    """
    total = 0

    # Left side: unsure clouds are scored together with the left clouds.
    for cloud in self.left_clouds.union(self.unsure_clouds):
        total += tools.hamming_distance(
            cloud.seq, self[cloud.start:cloud.end + 1])

    # Right side: score the reversed cloud sequence.
    for cloud in self.right_clouds:
        flipped = tools.list_reverse(cloud.seq)
        total += tools.hamming_distance(
            flipped, self[cloud.start:cloud.end + 1])

    self.MEC = total
    return total
def deal_unsure_clouds(self):
    """Re-examine the unsure clouds and move the decidable ones to a side.

    For every cloud in ``self.unsure_clouds`` the cloud sequence and its
    ``tools.list_reverse`` are compared with the slice
    ``self[cloud.start:cloud.end + 1]``.  A cloud strictly closer as-is is
    added to ``self.left_clouds``; one strictly closer when reversed is added
    to ``self.right_clouds``; ties stay in ``self.unsure_clouds``.

    The three sequences of every examined cloud are printed for debugging.
    Each print call takes a single pre-formatted string so the output is the
    same under Python 2 and Python 3.
    """
    print("deal unsure clouds")
    resolved = []
    for cloud in self.unsure_clouds:
        seq = cloud.seq
        flipped = tools.list_reverse(seq)
        hap = self[cloud.start:cloud.end + 1]  # haplotype seq
        print("%s %s %s" % (seq, flipped, hap))
        d_same = tools.hamming_distance(seq, hap)
        d_flip = tools.hamming_distance(flipped, hap)
        if d_same < d_flip:
            self.left_clouds.add(cloud)
            resolved.append(cloud)
        elif d_same > d_flip:
            self.right_clouds.add(cloud)
            resolved.append(cloud)
    # Remove after the loop: the container must not change while iterated.
    for cloud in resolved:
        self.unsure_clouds.remove(cloud)
def assign_clouds_part_region(self, clouds_at_index, start, end):
    """Assign the clouds of a region to a side and return the resulting MEC.

    ``self.clouds`` is replaced by the result of
    ``self.get_clouds_part_region(clouds_at_index, start, end)``;
    ``self.left_clouds`` and ``self.right_clouds`` are reset to empty sets.
    Each cloud is then compared, as-is and after ``tools.list_reverse``,
    with ``self[cloud.start:cloud.end + 1]`` and placed in the left set
    (closer as-is), the right set (closer reversed) or ``self.unsure_clouds``
    (tie).  Returns ``self.calc_MEC()``.
    """
    self.clouds = self.get_clouds_part_region(clouds_at_index, start, end)
    self.left_clouds = set()
    self.right_clouds = set()
    # NOTE(review): unsure_clouds is not reset here, so ties are added to
    # whatever it already contains -- confirm this is intended.
    for cloud in self.clouds:
        forward = cloud.seq
        flipped = tools.list_reverse(forward)
        hap = self[cloud.start:cloud.end + 1]  # haplotype seq
        d_forward = tools.hamming_distance(forward, hap)
        d_flipped = tools.hamming_distance(flipped, hap)
        if d_forward < d_flipped:
            bucket = self.left_clouds
        elif d_forward > d_flipped:
            bucket = self.right_clouds
        else:
            bucket = self.unsure_clouds
        bucket.add(cloud)
    return self.calc_MEC()
continue elif state == 1: kmer1 = line.strip() state = 2 line = f.readline() line = f.readline() continue elif state == 2: kmer2 = line.strip() if mutation == "SNP": snpNumber += 1 #print (ID, kmer1, kmer2) assert len(kmer1) == len(kmer2) #kmer1 = kmer1[15:-15] # if we want middle 31mer #kmer2 = kmer2[15:-15] # this two line if tools.hamming_distance(kmer1, kmer2) != 1: #print ("not isolated with 31mer") #print (ID, kmer1, kmer2) state = 0 line = f.readline() continue smallerKmer1, smallerKmer2 = tools.get_smaller_pair_kmer( kmer1, kmer2) snpPairKmer.append((smallerKmer1, smallerKmer2)) #print (ID, smallerKmer1, smallerKmer2) elif mutation == "INDEL" and indelLen == 1: kmer2 = kmer2[1:] #kmer1 = kmer1[15:-15] # for middle 31mer #kmer2 = kmer2[15:-15] smallerKmer1, smallerKmer2 = tools.get_smaller_pair_kmer(
def find_pair_indel_kmer(kmerFile, k_1merFile, output_filename):
    """Pair k-mers with (k-1)-mers obtained by deleting their middle base.

    Args:
        kmerFile: text file, one ``<kmer> <coverage>`` per line
            (whitespace separated).
        k_1merFile: text file, one ``<k_1mer> <coverage>`` per line.
        output_filename: file to write; one ``kmer k_1mer cov_kmer cov_k_1mer``
            line per accepted pair, sorted.

    A k-mer is indexed under ``kmer[:mid] + kmer[mid + 1:]`` unless either
    half is within Hamming distance 1 of the same-length window shifted by
    one.  A pair is reported when a (k-1)-mer matches a key that has exactly
    one k-mer and the (k-1)-mer is not simply the k-mer without its first or
    last base.  Progress counters are printed to stdout.

    Raises:
        AssertionError: when a k-mer / key compares greater than
            ``tools.reverse`` of itself, or a (k-1)-mer is listed twice.
    """
    kmers = {}
    count = 0
    with open(kmerFile, "r") as f:
        for line in f:
            words = line.strip().split()
            kmer = words[0]
            # Sanity check inherited from the original code, which notes it
            # "can be deleted".
            assert tools.reverse(kmer) >= kmer
            # NOTE: `mid` is deliberately left bound after the loop; the
            # output stage below reuses the value from the last line read.
            mid = int(len(kmer) / 2)
            leftHalf = kmer[:mid]
            rightHalf = kmer[mid + 1:]
            # mutation => delete
            if (tools.hamming_distance(rightHalf, kmer[mid:-1]) <= 1 or
                    tools.hamming_distance(leftHalf, kmer[1:mid + 1]) <= 1):
                continue
            key = leftHalf + rightHalf
            assert key <= tools.reverse(key)
            kmers.setdefault(key, []).append((kmer, words[1]))
            count += 1
    print("unique kmer number", count)
    print("unique k_1mer in kmers", len(kmers))

    k_1mers = {}
    count = 0
    with open(k_1merFile, "r") as f:
        for line in f:
            words = line.strip().split()
            key = words[0]
            assert tools.reverse(key) >= key  # same optional sanity check
            assert key not in k_1mers
            k_1mers[key] = words[1]  # words[1] is coverage
            count += 1
    print("unique k_1mer number", count)

    pairKmers = []
    for key in k_1mers:
        if key not in kmers:
            continue
        if len(kmers[key]) != 1:
            print("more than 1", key, k_1mers[key], kmers[key])
            continue
        kmer, c = kmers[key][0]
        # remove head or tail, they are same
        if kmer[:-1] == key or kmer[1:] == key:
            continue
        pairKmers.append((kmer, key, c, k_1mers[key]))

    countU, countD, count = 0, 0, 0
    # Context manager so the file is closed even if writing fails.
    with open(output_filename, "w") as fout:
        for (k1, k2, c1, c2) in sorted(pairKmers):
            # countD: middle base equals one of its neighbours; countU: not.
            if k1[mid] == k1[mid + 1] or k1[mid] == k1[mid - 1]:
                countD += 1
            else:
                countU += 1
            count += 1
            fout.write("%s %s %s %s\n" % (k1, k2, c1, c2))
    print(countU, countD, count)
def _shift_groups_match(group1, group2):
    """Return True when every aligned pair of shifted k-mers differs by one base.

    Entries are compared position by position up to the shorter group.  A
    position passes when the two k-mers (element 0 of each entry) are at
    Hamming distance exactly 1, either directly or after ``tools.reverse``
    is applied to the second one.
    """
    for entry1, entry2 in zip(group1, group2):
        kmer1, kmer2 = entry1[0], entry2[0]
        if (tools.hamming_distance(kmer1, kmer2) != 1 and
                tools.hamming_distance(kmer1, tools.reverse(kmer2)) != 1):
            return False
    return True


def group_shift_kmer(pair_filename, unique_filename, output_filename, NGS_kmer):
    """Extend each k-mer pair into two groups of shifted k-mers and write them out.

    Args:
        pair_filename: input, one ``h1 h2 cov1 cov2`` per line.
        unique_filename: passed to ``read_unique_kmer`` to build the left and
            right unique indexes used by ``shift_kmer``.
        output_filename: receives every accepted group as three lines:
            ``group <n> h1 h2 cov1 cov2``, then the ``kmer tag`` entries of
            the first group, then those of the second group.
        NGS_kmer: receives ``kmer id`` lines for every k-mer that occurs in
            exactly one group entry.

    A pair is accepted when its left and right shift groups both pass
    ``_shift_groups_match`` and both combined groups hold at least
    ``kmerSize`` entries.  Groups sharing a k-mer with another entry are
    collected, and a copy of ``output_filename`` without them is written to
    ``filter_filename``.

    NOTE(review): ``kmerSize``, ``filter_filename``, ``read_unique_kmer`` and
    ``shift_kmer`` are not defined in this function; they are expected to be
    module-level names -- confirm.
    """
    left_unique_index, right_unique_index = read_unique_kmer(unique_filename)
    count = 0
    kmers = {}
    with open(output_filename, "w") as fout, open(pair_filename, "r") as f:
        for line in f:
            words = line.strip().split()
            h1, h2 = words[0], words[1]
            leftGroup1 = shift_kmer(h1, left_unique_index, right_unique_index,
                                    "l")
            leftGroup2 = shift_kmer(h2, left_unique_index, right_unique_index,
                                    "l")
            # The coverage values are only needed by a coverage filter that
            # is disabled; they are still parsed so a malformed line fails
            # here exactly as before.
            int(words[2])
            int(words[3])
            if not _shift_groups_match(leftGroup1, leftGroup2):
                continue
            rightGroup1 = shift_kmer(h1, left_unique_index, right_unique_index,
                                     "r")
            rightGroup2 = shift_kmer(h2, left_unique_index, right_unique_index,
                                     "r")
            if not _shift_groups_match(rightGroup1, rightGroup2):
                continue

            group1 = [(h1, 'f')] + list(leftGroup1) + list(rightGroup1)
            group2 = [(h2, 'f')] + list(leftGroup2) + list(rightGroup2)
            if len(group1) <= kmerSize - 1 or len(group2) <= kmerSize - 1:
                continue

            count += 1
            fout.write("group %s %s %s %s %s\n" %
                       (count, h1, h2, words[2], words[3]))
            print("group %s %s %s %s %s" %
                  (count, len(leftGroup1), len(rightGroup1), len(leftGroup2),
                   len(rightGroup2)))
            # Suffix 'A' marks the first group (zero), 'B' the second (one).
            for group, suffix in ((group1, 'A'), (group2, 'B')):
                for ele in group:
                    fout.write("%s %s " % (ele[0], ele[1]))
                    kmers.setdefault(ele[0], []).append(
                        str(count) + ele[1] + suffix)
                fout.write("\n")

    filterGroup = set()
    with open(NGS_kmer, "w") as fout:
        print("total group number %s" % count)
        for ele in sorted(kmers.items()):
            kmer, ids = ele
            if len(ids) >= 2:
                # k-mer shared by several group entries: report it and mark
                # every group it belongs to for filtering.
                print(ele)
                for ID in ids:
                    # Drop the last two characters, leaving the group number
                    # (assumes the tag in ele[1] is one character -- TODO
                    # confirm).
                    filterGroup.add(ID[:-2])
                continue
            fout.write("%s" % kmer)
            for e in ids:
                fout.write(" %s" % e)
            fout.write("\n")
    print("filter group size %s" % len(filterGroup))

    # Both files are managed by `with`, so the filter output is always closed.
    with open(filter_filename, "w") as foutFilter, \
            open(output_filename, "r") as f:
        # state 0: expecting a "group" header; 1 / 2: copying the first /
        # second member line of a kept group; -1: header of a filtered group
        # was just seen.
        state = 0
        for line in f:
            if state == 0 and line.startswith("group"):
                words = line.split()
                if words[1] not in filterGroup:
                    foutFilter.write(line)
                    state = 1
                else:
                    state = -1
            elif state == 1:
                foutFilter.write(line)
                state = 2
            elif state == 2:
                foutFilter.write(line)
                state = 0
            elif state == -1:
                # The next member line falls through every branch (state 0,
                # not a header) and is skipped as well.
                state = 0