Ejemplo n.º 1
0
 def update_clouds(self, left_kmer, kmers):
     """Hand the clouds of every k-mer in ``kmers`` to the closer side.

     ``kmers`` maps a k-mer to its clouds (reads). Each key is compared, by
     Hamming distance, against ``left_kmer`` and against
     ``tools.bool_reverse(left_kmer)``. The clouds of a key go into
     ``self.left_clouds`` when the key is strictly closer to ``left_kmer``
     and into ``self.right_clouds`` when it is strictly closer to the
     reversed k-mer.
     """
     opposite_kmer = tools.bool_reverse(left_kmer)
     for candidate in kmers:
         to_left = tools.hamming_distance(candidate, left_kmer)
         to_right = tools.hamming_distance(candidate, opposite_kmer)
         if to_left == to_right:
             # Equidistant k-mers carry no evidence for either side.
             continue
         side = self.left_clouds if to_left < to_right else self.right_clouds
         side.update(kmers[candidate])
Ejemplo n.º 2
0
def find_indel_pair_kmer(mapK, uniqK_1mers, k):
    """Collect, per (k-1)-mer, the k-mers filed under it plus the (k-1)-mer.

    Args:
        mapK: dict mapping a key (presumably a k-mer with its middle base
            removed -- confirm against the caller) to a list of
            ``(kmer, cov)`` tuples. The lists are filtered IN PLACE, as in
            the original implementation.
        uniqK_1mers: sized iterable of ``(k_1mer, cov)`` pairs.
        k: k-mer length; ``int(k / 2)`` is used as the middle index.

    Returns:
        A list with one entry per (k-1)-mer that is also a key of ``mapK``
        with at least one surviving k-mer. Each entry is a deep copy of the
        surviving ``(kmer, cov)`` list with ``(k_1mer, cov)`` appended.

    A k-mer is dropped when either half is within Hamming distance 1 of the
    equally long window shifted by one base (``kmer[mid:-1]`` for the right
    half, ``kmer[1:mid + 1]`` for the left half); the original comment
    labels this case "mutation => delete".
    """
    mid = int(k / 2)
    print("before filter mapK")
    uniqMapK = {}
    for key, entries in mapK.items():
        # Build the survivors in a fresh list: removing from ``entries``
        # while iterating over it skips the element that follows every
        # removal, letting unwanted k-mers slip through the filter.
        kept = []
        for entry in entries:
            kmer = entry[0]
            leftHalf = kmer[:mid]
            rightHalf = kmer[mid + 1:]
            if (tools.hamming_distance(rightHalf, kmer[mid:-1]) <= 1
                    or tools.hamming_distance(leftHalf,
                                              kmer[1:mid + 1]) <= 1):
                continue
            kept.append(entry)
        # Slice assignment keeps the caller-visible in-place filtering.
        entries[:] = kept
        if kept:
            uniqMapK[key] = entries

    print("after filter mapK")

    print("uniq K-1 mer size", len(uniqK_1mers))
    # Later duplicates overwrite earlier ones, as in a plain assignment loop.
    mapK_1 = {key: cov for (key, cov) in uniqK_1mers}

    indelPair = []

    print("map K-1 size", len(mapK_1))
    for key, cov in mapK_1.items():
        if key in uniqMapK:
            # Deep copy so that appending the (k-1)-mer does not leak back
            # into ``mapK``.
            group = copy.deepcopy(uniqMapK[key])
            group.append((key, cov))
            indelPair.append(group)
    return indelPair
Ejemplo n.º 3
0
    def calc_MEC(self):
        """Recompute the MEC score, store it in ``self.MEC`` and return it.

        The score is the sum of Hamming distances between every cloud and
        the slice ``self[c.start:c.end + 1]`` it covers. Clouds in
        ``left_clouds`` or ``unsure_clouds`` are compared as-is; clouds in
        ``right_clouds`` are compared after ``tools.list_reverse``.
        """
        left_total = sum(
            tools.hamming_distance(cloud.seq,
                                   self[cloud.start:cloud.end + 1])
            for cloud in self.left_clouds.union(self.unsure_clouds))
        right_total = sum(
            tools.hamming_distance(tools.list_reverse(cloud.seq),
                                   self[cloud.start:cloud.end + 1])
            for cloud in self.right_clouds)

        total = left_total + right_total
        self.MEC = total
        return total
Ejemplo n.º 4
0
 def deal_unsure_clouds(self):
     """Move unsure clouds that now clearly favour one side.

     Each cloud in ``self.unsure_clouds`` is compared with the slice
     ``self[c.start:c.end + 1]`` both as-is and after
     ``tools.list_reverse``. A strictly smaller distance as-is moves the
     cloud to ``self.left_clouds``; a strictly smaller distance when
     reversed moves it to ``self.right_clouds``; a tie leaves it unsure.
     """
     print("deal unsure clouds")
     resolved = []
     for c in self.unsure_clouds:
         s0 = c.seq
         s1 = tools.list_reverse(s0)
         s2 = self[c.start:c.end + 1]  # haplotype seq
         # Explicit formatting prints the same text as the former Python 2
         # ``print s0, s1, s2`` statement on both Python 2 and Python 3.
         print("%s %s %s" % (s0, s1, s2))
         d_forward = tools.hamming_distance(s0, s2)
         d_flipped = tools.hamming_distance(s1, s2)
         if d_forward < d_flipped:
             self.left_clouds.add(c)
         elif d_forward > d_flipped:
             self.right_clouds.add(c)
         else:
             continue
         resolved.append(c)
     # Remove after the loop: the collection must not change while it is
     # being iterated.
     for c in resolved:
         self.unsure_clouds.remove(c)
Ejemplo n.º 5
0
    def assign_clouds_part_region(self, clouds_at_index, start, end):
        """Re-assign the clouds of a region to a side and return the MEC.

        ``self.clouds`` is replaced by the result of
        ``get_clouds_part_region`` and the left/right sets are rebuilt from
        scratch. Each cloud goes to the side with the strictly smaller
        Hamming distance to ``self[c.start:c.end + 1]`` (as-is for left,
        after ``tools.list_reverse`` for right); ties are added to
        ``self.unsure_clouds``. Returns ``self.calc_MEC()``.
        """
        self.clouds = self.get_clouds_part_region(clouds_at_index, start, end)
        self.left_clouds, self.right_clouds = set(), set()
        # NOTE(review): unsure_clouds is not reset here, unlike the two sets
        # above -- confirm that carrying over earlier entries is intended.
        for cloud in self.clouds:
            forward = cloud.seq
            flipped = tools.list_reverse(forward)
            haplotype = self[cloud.start:cloud.end + 1]
            forward_dist = tools.hamming_distance(forward, haplotype)
            flipped_dist = tools.hamming_distance(flipped, haplotype)
            if forward_dist == flipped_dist:
                target = self.unsure_clouds
            elif forward_dist < flipped_dist:
                target = self.left_clouds
            else:
                target = self.right_clouds
            target.add(cloud)
        return self.calc_MEC()
            continue
        elif state == 1:
            kmer1 = line.strip()
            state = 2
            line = f.readline()
            line = f.readline()
            continue
        elif state == 2:
            kmer2 = line.strip()
            if mutation == "SNP":
                snpNumber += 1
                #print (ID, kmer1, kmer2)
                assert len(kmer1) == len(kmer2)
                #kmer1 = kmer1[15:-15] # if we want middle 31mer
                #kmer2 = kmer2[15:-15] # this two line
                if tools.hamming_distance(kmer1, kmer2) != 1:
                    #print ("not isolated with 31mer")
                    #print (ID, kmer1, kmer2)
                    state = 0
                    line = f.readline()
                    continue
                smallerKmer1, smallerKmer2 = tools.get_smaller_pair_kmer(
                    kmer1, kmer2)
                snpPairKmer.append((smallerKmer1, smallerKmer2))
                #print (ID, smallerKmer1, smallerKmer2)
            elif mutation == "INDEL" and indelLen == 1:
                kmer2 = kmer2[1:]

                #kmer1 = kmer1[15:-15] # for middle 31mer
                #kmer2 = kmer2[15:-15]
                smallerKmer1, smallerKmer2 = tools.get_smaller_pair_kmer(
Ejemplo n.º 7
0
def find_pair_indel_kmer(kmerFile, k_1merFile, output_filename):
    """Pair k-mers with the (k-1)-mer left after deleting their middle base.

    Args:
        kmerFile: text file with one ``<kmer> <coverage>`` record per line.
        k_1merFile: text file with one ``<k_1mer> <coverage>`` record per
            line; every (k-1)-mer must appear only once.
        output_filename: destination file. Each line is
            ``kmer k_1mer kmer_coverage k_1mer_coverage``, in sorted order.

    A k-mer is ignored when either half is within Hamming distance 1 of the
    equally long window shifted by one base (the original comment labels
    this "mutation => delete"). A pair is written only when exactly one
    k-mer maps to the (k-1)-mer and the (k-1)-mer is not simply the k-mer
    without its first or last base. Blank lines in both inputs are skipped.

    Three progress counters are printed at the end: pairs whose middle base
    differs from both neighbours, pairs whose middle base equals a
    neighbour, and the total.
    """
    kmers = {}
    count = 0
    with open(kmerFile, "r") as f:
        for line in f:
            words = line.split()
            if not words:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            kmer = words[0]
            # Sanity check kept from the original ("can be deleted");
            # presumably asserts the k-mer is in canonical form.
            assert tools.reverse(kmer) >= kmer

            mid = len(kmer) // 2
            leftHalf = kmer[:mid]
            rightHalf = kmer[mid + 1:]

            if (tools.hamming_distance(rightHalf, kmer[mid:-1]) <= 1
                    or  # mutation => delete
                    tools.hamming_distance(leftHalf,
                                           kmer[1:mid + 1]) <= 1):
                continue

            # The (k-1)-mer obtained by dropping the middle base.
            key = leftHalf + rightHalf
            assert key <= tools.reverse(key)
            kmers.setdefault(key, []).append((kmer, words[1]))
            count += 1

    print("unique kmer number", count)
    print("unique k_1mer in kmers", len(kmers))

    k_1mers = {}
    count = 0
    with open(k_1merFile, "r") as f:
        for line in f:
            words = line.split()
            if not words:
                continue
            key = words[0]
            assert tools.reverse(key) >= key  # same sanity check as above
            assert key not in k_1mers
            k_1mers[key] = words[1]  # words[1] is coverage
            count += 1
    print("unique k_1mer number", count)

    pairKmers = []
    for key, k_1cov in k_1mers.items():
        candidates = kmers.get(key)
        if candidates is None:
            continue
        if len(candidates) != 1:
            print("more than 1", key, k_1cov, candidates)
            continue
        kmer, c = candidates[0]
        if kmer[:-1] == key or kmer[1:] == key:
            # Removing the head or tail gives the same (k-1)-mer.
            continue
        pairKmers.append((kmer, key, c, k_1cov))

    countU, countD, count = 0, 0, 0
    # ``with`` guarantees the output file is closed even if a write fails.
    with open(output_filename, "w") as fout:
        for (k1, k2, c1, c2) in sorted(pairKmers):
            # Derive the middle index from this k-mer rather than reusing
            # the value left over from the last line of the input file.
            mid = len(k1) // 2
            if k1[mid] == k1[mid + 1] or k1[mid] == k1[mid - 1]:
                countD += 1
            else:
                countU += 1
            count += 1
            fout.write("%s %s %s %s\n" % (k1, k2, c1, c2))
    print(countU, countD, count)
Ejemplo n.º 8
0
def group_shift_kmer(pair_filename, unique_filename, output_filename,
                     NGS_kmer):
    """Build shifted k-mer groups for each k-mer pair and write them out.

    For every line of ``pair_filename`` (two k-mers followed by two integer
    coverages, whitespace separated) the two k-mers are extended to the left
    and to the right with ``shift_kmer``. A pair is kept only when, position
    by position, the shifted k-mers of the two sides are at Hamming distance
    exactly 1 (directly or after ``tools.reverse`` of the second one) and
    both resulting groups hold more than ``kmerSize - 1`` entries.

    Files written:
      * ``output_filename``: three lines per kept pair -- a ``group`` header
        followed by one line per group.
      * ``NGS_kmer``: one line per k-mer that carries exactly one tag,
        followed by that tag.
      * ``filter_filename``: the groups from ``output_filename`` whose
        number is not in the set collected from multi-tag k-mers.

    NOTE(review): ``kmerSize`` and ``filter_filename`` are not parameters;
    presumably module-level globals -- confirm. ``read_unique_kmer`` and
    ``shift_kmer`` are defined elsewhere.
    """

    left_unique_index, right_unique_index = read_unique_kmer(unique_filename)

    fout = open(output_filename, "w")
    count = 0  # number of groups written so far; doubles as the group id
    kmers = {}  # k-mer -> list of tags "<group id><ele[1]><A|B>"
    with open(pair_filename, "r") as f:
        for line in f:
            words = line.strip().split()
            h1 = words[0]
            h2 = words[1]
            leftGroup1 = shift_kmer(h1, left_unique_index, right_unique_index,
                                    "l")
            leftGroup2 = shift_kmer(h2, left_unique_index, right_unique_index,
                                    "l")

            cov1 = int(words[2])
            cov2 = int(words[3])
            # Only referenced by the disabled coverage filter below.
            sum_coverage = cov1 + cov2

            # Flag stays True while the shifted k-mers of both sides agree.
            Flag = True
            i = 0
            lenG = min(len(leftGroup1), len(leftGroup2))
            while i < lenG:
                # Reject when neither orientation is at distance exactly 1.
                if (tools.hamming_distance(leftGroup1[i][0], leftGroup2[i][0])
                        != 1 and tools.hamming_distance(
                            leftGroup1[i][0], tools.reverse(
                                leftGroup2[i][0])) != 1):
                    Flag = False
                    break
                i += 1
            # The string literal below is a disabled coverage filter; as a
            # bare expression it has no effect at runtime.
            '''
            # 60 equal to lowest coverage
            if cov1 - cov2 >= 60 or cov2 - cov1 >= 60: # or float(cov2/cov1) >=1.7 or float(cov1/cov2) >=1.7:
                continue
            if sum_coverage <= 180 or sum_coverage >= 280:
                continue
            '''
            if Flag == False:
                continue

            rightGroup1 = shift_kmer(h1, left_unique_index, right_unique_index,
                                     "r")
            rightGroup2 = shift_kmer(h2, left_unique_index, right_unique_index,
                                     "r")

            # Same position-by-position check for the right-shifted k-mers.
            i = 0
            lenG = min(len(rightGroup1), len(rightGroup2))
            while i < lenG:
                if (tools.hamming_distance(rightGroup1[i][0],
                                           rightGroup2[i][0]) != 1
                        and tools.hamming_distance(
                            rightGroup1[i][0], tools.reverse(
                                rightGroup2[i][0])) != 1):
                    Flag = False
                    break
                i += 1
            if Flag == False:
                continue

            # Each group starts with the pair's own k-mer tagged 'f',
            # followed by its left- and right-shifted k-mers.
            group1, group2 = [(h1, 'f')], [(h2, 'f')]
            group1.extend(leftGroup1)
            group2.extend(leftGroup2)
            group1.extend(rightGroup1)
            group2.extend(rightGroup2)

            gSize1 = len(group1)
            gSize2 = len(group2)
            # Skip pairs whose groups are too small.
            if gSize1 <= kmerSize - 1 or gSize2 <= kmerSize - 1:
                #if gSize1 <=kmerSize/2 or gSize2 <= kmerSize/2:
                continue
            count += 1
            fout.write("group %s %s %s %s %s\n" %
                       (count, h1, h2, words[2], words[3]))
            print "group", count, len(leftGroup1), len(rightGroup1), len(
                leftGroup2), len(rightGroup2)
            for ele in group1:
                fout.write("%s %s " % (ele[0], ele[1]))
                if ele[0] not in kmers:
                    kmers[ele[0]] = []
                kmers[ele[0]].append(str(count) + ele[1] + 'A')  # A is zero
            fout.write("\n")

            for ele in group2:
                fout.write("%s %s " % (ele[0], ele[1]))
                if ele[0] not in kmers:
                    kmers[ele[0]] = []
                kmers[ele[0]].append(str(count) + ele[1] + 'B')  # B is one
            fout.write("\n")
    fout.close()
    fout = open(NGS_kmer, "w")
    print "total group number", count
    sortedKmers = sorted(kmers.items())
    filterGroup = set()  # group ids that share a k-mer with another tag
    for ele in sortedKmers:
        if len(ele[1]) >= 2:
            # A k-mer with several tags is ambiguous: report it, remember
            # the groups involved and leave it out of the NGS file.
            print ele
            for ID in ele[1]:
                # Drops the last two characters of the tag; assumes ele[1]
                # was a single character so the group id remains -- TODO
                # confirm against shift_kmer.
                filterGroup.add(ID[:-2])
            continue
        fout.write("%s" % ele[0])
        l = ele[1]
        for e in l:
            fout.write(" %s" % e)
        fout.write("\n")
    fout.close()

    print "filter group size", len(filterGroup)
    foutFilter = open(filter_filename, "w")
    with open(output_filename, "r") as f:
        # Small state machine over the three-line group records:
        #   0 = expecting a "group" header, 1/2 = copying the two group
        #   lines of a kept group, -1 = skipping after a filtered header.
        state = 0
        for line in f:
            if state == 0 and line.startswith("group"):
                words = line.split()
                if words[1] not in filterGroup:
                    foutFilter.write(line)
                    state = 1
                else:
                    state = -1
            elif state == 1:
                foutFilter.write(line)
                state = 2
            elif state == 2:
                foutFilter.write(line)
                state = 0
            elif state == -1:
                state = 0