Esempio n. 1
0
def check(text):
    """Report one fixed error located at line 1, column 1.

    ``reverse`` is called on ``text`` and its result is discarded; the
    return value does not depend on the input.

    Returns:
        A one-element list holding ``(line, column, error_code, message)``.
    """
    reverse(text)

    code = "example.first"
    message = "First line always has an error."
    return [(1, 1, code, message)]
Esempio n. 2
0
def find_pair_kmer(input_filename, output_filename):
    """Pair up k-mers that are identical except for their middle character.

    Every input line must hold at least two whitespace-separated fields: a
    k-mer and its coverage. K-mers are grouped by the string obtained after
    removing the character at index ``len(kmer) // 2``. Groups of exactly
    two k-mers are written to ``output_filename`` as
    ``"<smaller> <larger> <cov_smaller> <cov_larger>"`` lines, sorted.
    Groups of one or of more than two are only counted and reported.

    Args:
        input_filename: path of the "kmer coverage" text file to read.
        output_filename: path of the pair file to (over)write.

    Raises:
        AssertionError: if a k-mer compares greater than
            ``tools.reverse(kmer)``.
        IndexError: if a line has fewer than two fields.
    """
    groups = {}
    # count2 is reported below but never incremented (kept from the original).
    count, count2 = 0, 0
    with open(input_filename, "r") as f:
        for line in f:
            words = line.strip().split()
            kmer = words[0]
            # Sanity check on the input; the original marks it as optional.
            assert tools.reverse(kmer) >= kmer
            # Floor division: a plain "/" gives a float index on Python 3.
            mid = len(kmer) // 2
            key = kmer[:mid] + kmer[mid + 1:]
            groups.setdefault(key, []).append((kmer, words[1]))
            count += 1

    # %-formatting prints the same text on Python 2 and Python 3.
    print("unique kmer number %s" % count)
    print("not small kmer (compare reverse kmer) %s" % count2)
    print("total number possible pair kmer %s" % len(groups))

    pairKmers = []
    count1, countLarge2 = 0, 0
    for key in groups:
        entries = groups[key]
        if len(entries) == 1:
            count1 += 1
        elif len(entries) == 2:
            (k1, c1), (k2, c2) = entries
            # Same optional sanity check as above, applied to both members.
            assert k1 <= tools.reverse(k1) and k2 <= tools.reverse(k2)
            if k1 < k2:
                pairKmers.append((k1, k2, c1, c2))
            else:
                pairKmers.append((k2, k1, c2, c1))
        else:
            countLarge2 += 1

    print("kmer cannot find pair number %s" % count1)
    print("more than one mutation in middle %s" % countLarge2)

    # The with-block closes the output even if a write fails.
    with open(output_filename, "w") as fout:
        for (k1, k2, c1, c2) in sorted(pairKmers):
            fout.write("%s %s %s %s\n" % (k1, k2, c1, c2))
Esempio n. 3
0
def extend_to_right(h1, left_index, right_index, k):
    """Extend ``h1`` on its right-hand side using two lookup tables.

    The walk runs for at most ``int(k/2)`` steps. Each step looks up the
    last ``k-1`` characters of the growing string in ``left_index`` and the
    ``tools.reverse`` of that key in ``right_index``. A step is accepted
    only when exactly one of the two lookups hits; otherwise the walk stops.

    Args:
        h1: starting string.
        left_index: mapping from key to a sequence whose first element is
            appended to the forward string.
        right_index: mapping from key to a sequence whose first element is
            prepended to the reversed string.
        k: k-mer size; sets both the key length (``k-1``) and the step cap.

    Returns:
        ``(temp, add)``: the extended string and the part that was added by
        accepted steps.

    NOTE(review): ``tools.reverse`` is presumably a reverse complement --
    confirm against the ``tools`` module.
    """
    #print (type(k))
    mid = int(k/2)
    key = h1[-(k-1):]
    Rkey = tools.reverse(key)
    # temp / Rtemp hold the same sequence in forward and reversed form.
    temp, Rtemp = h1, tools.reverse(h1)
    add, Radd = "", ""
    for i in range(0, mid):
        flag, flagR = False, False
        # Forward hit: append the stored element to the forward string.
        if key in left_index:
            temp = temp + left_index[key][0]
            Rtemp = tools.reverse(temp)
            flag = True

        # Reverse hit: prepend the stored element to the reversed string.
        # This runs on top of any forward extension made just above.
        if Rkey in right_index:
            Rtemp = right_index[Rkey][0] + Rtemp
            temp = tools.reverse(Rtemp)
            flagR = True

        if flag == True and flagR == False:
            # Only the forward lookup hit: record it and advance the keys.
            add = add + left_index[key][0]
            Radd = tools.reverse(add)
            key = temp[-(k-1):]
            Rkey = tools.reverse(key)
        elif flag == False and flagR == True: 
            # Only the reverse lookup hit: record it and advance the keys.
            Radd = right_index[Rkey][0] + Radd
            add = tools.reverse(Radd)
            Rkey = Rtemp[:(k-1)]
            key = tools.reverse(Rkey)
        elif flag == flagR:
            # Both hit or neither hit: stop extending.
            if flag == True:
                # Both lookups extended temp; drop one trailing character
                # (and the matching leading character of Rtemp). `add` is
                # not updated for this step.
                # NOTE(review): only one of the two extensions is removed
                # here -- confirm that is intended.
                temp = temp[:-1]
                Rtemp = Rtemp[1:]
            break
    return temp, add
Esempio n. 4
0
def find_non_pair_kmer(uniqKmer, k):
    """Index k-mers by their leading half, then merge them into pairs.

    Each ``(kmer, cov)`` entry is filed twice: once under the first
    ``int(k/2)`` characters of the k-mer itself and once under the same
    prefix of ``tools.reverse(kmer)``. The resulting table goes through
    ``build_map_merge`` and ``merge_pair``.

    Args:
        uniqKmer: iterable of ``(kmer, coverage)`` tuples.
        k: k-mer size.

    Returns:
        Whatever ``merge_pair`` produces for the merged map.
    """
    half = int(k / 2)
    by_prefix = {}
    for kmer, cov in uniqKmer:
        by_prefix.setdefault(kmer[:half], []).append((kmer, cov))
        flipped = tools.reverse(kmer)
        by_prefix.setdefault(flipped[:half], []).append((flipped, cov))

    # Build the map keyed on the overlap, then merge it into pairs.
    merged = build_map_merge(by_prefix, k)
    return merge_pair(merged)
Esempio n. 5
0
def get_FP_position(k1, k2,
                    refFilename="/media/yanbo/Data/reference/hg37/chr22.fa"):
    """Print where ``k1``/``k2`` and their reverses occur in a reference.

    The single FASTA record in ``refFilename`` is loaded and upper-cased.
    For each of ``k1``, ``tools.reverse(k1)``, ``k2`` and
    ``tools.reverse(k2)``, in that order, a line is printed when the string
    occurs in the reference: the string, its partner from the other k-mer,
    the occurrence count and the index of the first occurrence.

    Args:
        k1: first k-mer of the pair.
        k2: second k-mer of the pair.
        refFilename: path of the reference FASTA file; defaults to the path
            the original hard-coded.
    """
    # The with-block releases the file handle; it was left open before.
    with open(refFilename) as handle:
        record = SeqIO.read(handle, "fasta")
    print(record.id)
    seq = str(record.seq).upper()

    Rk1 = tools.reverse(k1)
    Rk2 = tools.reverse(k2)
    # (string searched for, partner printed next to it)
    for shown, partner in ((k1, k2), (Rk1, Rk2), (k2, k1), (Rk2, Rk1)):
        hits = seq.count(shown)
        if hits > 0:
            print(shown, partner, hits, seq.index(shown))
Esempio n. 6
0
def update_map_merge(left_i, left_j, key1, key2, mapMerge):
    """File one k-mer pair under its normalised key pair in ``mapMerge``.

    The key pair comes from ``tools.get_smaller_pair_kmer(key1, key2)``.
    The pair is stored in the first orientation (as given, swapped,
    both reversed, both reversed and swapped) in which the first k-mer
    contains the first key exactly once and the second k-mer contains the
    second key exactly once. If no orientation fits, a message is printed
    and the process exits.

    Args:
        left_i: ``(kmer, coverage)`` tuple.
        left_j: ``(kmer, coverage)`` tuple.
        key1: first overlap key.
        key2: second overlap key.
        mapMerge: dict updated in place; values are lists of
            ``(kmer_a, kmer_b, cov_a, cov_b)`` tuples.
    """
    k1, cov1 = left_i
    k2, cov2 = left_j
    minkey1, minkey2 = tools.get_smaller_pair_kmer(key1, key2)
    bucket = mapMerge.setdefault((minkey1, minkey2), [])
    Rk1, Rk2 = tools.reverse(k1), tools.reverse(k2)

    # Orientations in the order they are tried.
    candidates = (
        (k1, k2, cov1, cov2),
        (k2, k1, cov2, cov1),
        (Rk1, Rk2, cov1, cov2),
        (Rk2, Rk1, cov2, cov1),
    )
    for candidate in candidates:
        first, second = candidate[0], candidate[1]
        if first.count(minkey1) == 1 and second.count(minkey2) == 1:
            bucket.append(candidate)
            return

    print("something wrong 1")
    sys.exit()
Esempio n. 7
0
    def Reverse(self, request, context):
        """
        Serve the Reverse call declared in the proto file: reply with the
        request string together with its reversed form.
        """
        text = request.request_string
        flipped = reverse(text)
        return pb.ResponseString(original_string=text, reversed_string=flipped)
Esempio n. 8
0
def pick_smaller_unique_kmer(input_filename, low, high, output_filename):
    """Filter k-mers by coverage and write each in its smaller orientation.

    Every input line must hold a k-mer and an integer coverage. Lines whose
    coverage lies outside ``[low, high]`` are skipped. For the rest, the
    smaller of the k-mer and ``tools.reverse(kmer)`` is written followed by
    the coverage text exactly as it was read.

    Args:
        input_filename: path of the "kmer coverage" file to read.
        low: smallest coverage kept (inclusive).
        high: largest coverage kept (inclusive).
        output_filename: path of the file to (over)write.
    """
    # Both handles live in one with-statement so the output is closed even
    # when a line fails to parse. The output is opened first, as before.
    with open(output_filename, "w") as fout, open(input_filename, "r") as f:
        for line in f:
            words = line.strip().split()
            coverage = int(words[1])
            if coverage < low or coverage > high:
                continue
            kmer = words[0]
            newkmer = tools.reverse(kmer)
            if kmer > newkmer:
                kmer = newkmer
            fout.write("%s %s\n" % (kmer, words[1]))
Esempio n. 9
0
def pick_smaller_unique_kmer(input_filename, low, high):
    """Load k-mers whose coverage lies within ``[low, high]``.

    Every input line must hold a k-mer and an integer coverage. Each kept
    k-mer is replaced by ``tools.reverse(kmer)`` when that compares smaller.

    Args:
        input_filename: path of the "kmer coverage" file to read.
        low: smallest coverage kept (inclusive).
        high: largest coverage kept (inclusive).

    Returns:
        List of ``(kmer, coverage)`` tuples in file order, coverage as int.
    """
    kept = []
    with open(input_filename, "r") as handle:
        for raw in handle:
            fields = raw.strip().split()
            depth = int(fields[1])
            if low <= depth <= high:
                forward = fields[0]
                kept.append((min(forward, tools.reverse(forward)), depth))
    return kept
Esempio n. 10
0
def problem004c():
    """Return the largest palindrome that is a product of two 3-digit numbers.

    Counts down from 999*999 and stops at the first palindrome that has a
    3-digit divisor whose cofactor is also 3 digits. The answer is printed
    and returned. If nothing qualifies above 100001 (the smallest 6-digit
    palindrome), 100001 is returned.
    """
    output = 999 * 999
    while output > 100001:
        digits = str(output)
        # Palindrome test on the decimal digits, done locally so it does not
        # depend on an external ``reverse`` helper accepting integers.
        if digits == digits[::-1]:
            # Exact divisibility is checked with % and //, so the result
            # does not depend on how "/" divides integers.
            if any(output % x == 0 and 100 <= output // x <= 999
                   for x in range(100, 1000)):
                print(output)
                break
        output -= 1

    return output
Esempio n. 11
0
def find_snp_pair_kmer(uniqKmers, k):
    """Group k-mers that are identical except for their middle character.

    K-mers are grouped by the string left after removing the character at
    index ``int(k/2)``. Groups of exactly two k-mers are collected as
    ``(smaller, larger, cov_smaller, cov_larger)`` tuples; groups of one or
    of more than two are only counted.

    Args:
        uniqKmers: sized collection of ``(kmer, coverage)`` tuples.
        k: k-mer size.

    Raises:
        AssertionError: if a k-mer compares greater than
            ``tools.reverse(kmer)``.

    NOTE(review): the visible code ends without returning or writing
    ``pairKmers``; the function looks cut short. Confirm against the
    original file before relying on a return value.
    """
    groups = {}
    mid = int(k / 2)
    print("before build mapK")
    for (kmer, cov) in uniqKmers:
        # Sanity check on the input; the original marks it as optional.
        assert tools.reverse(kmer) >= kmer
        key = kmer[:mid] + kmer[mid + 1:]
        groups.setdefault(key, []).append((kmer, cov))
    print("after build mapK")
    print("unique kmer number", len(uniqKmers))
    print("total number possible pair kmer", len(groups))

    pairKmers = []
    count1, countLarge2 = 0, 0
    # A triple-quoted block of disabled code used to sit between the `if`
    # and the `elif` below, which made the whole function a SyntaxError.
    for key in groups:
        entries = groups[key]
        size = len(entries)
        if size == 1:
            count1 += 1
        elif size == 2:
            (k1, c1), (k2, c2) = entries
            if k1 < k2:
                pairKmers.append((k1, k2, c1, c2))
            else:
                pairKmers.append((k2, k1, c2, c1))
        else:
            countLarge2 += 1
Esempio n. 12
0
    def run(self):
        """Match this worker's reads against ``NGS`` and write the hits.

        ``self.temp_list`` is walked as a two-state machine: a line starting
        with "@" (seen in state 0) supplies the read ID, and the line right
        after it is taken as the sequence. Any other line is ignored.

        For every window start ``i`` in ``range(len(seq) - 21)`` the 21-char
        window, or ``tools.reverse`` of it when that compares smaller, is
        looked up with ``binarySearch(NGS, key)``; ``-1`` means no hit.
        Hits go through ``decide_kmer``. Reads with more than one resulting
        position are written to ``part_matrix_<thread_id>``.
        """
        kmer_len = 21  # window length used for every lookup
        state = 0
        count = 0
        # The with-block closes the output even if a lookup raises.
        with open("part_matrix_" + str(self.thread_id), "w") as fout:
            for text in self.temp_list:
                if state == 0 and text.startswith("@"):
                    readID = text.strip()[1:]
                    state = 1
                    count += 1
                    if count % 1000 == 0:
                        print("thread ", self.thread_id, "deal reads ", count)
                elif state == 1:
                    state = 0
                    seq = text.strip()
                    intersection = []
                    # NOTE(review): this range stops one window short of the
                    # end of the read (the last start index is skipped).
                    # Kept as in the original; confirm whether intended.
                    for i in range(len(seq) - kmer_len):
                        key = str(seq[i:i + kmer_len])
                        Rkey = tools.reverse(key)
                        # Always look up the smaller of the two orientations.
                        flipped = key > Rkey
                        if flipped:
                            key = Rkey
                        # Named `hit` so the stdlib module name `re` is not
                        # shadowed.
                        hit = binarySearch(NGS, key)
                        if hit == -1:
                            continue
                        label = hit[1]
                        if flipped:
                            # The reversed k-mer runs opposite to the read,
                            # so the second-to-last label character is
                            # passed through tools.reverse_ward.
                            label = (label[:-2]
                                     + tools.reverse_ward(label[-2])
                                     + label[-1])
                        # i is the window's position in the read.
                        intersection.append((label, i))

                    if len(intersection) > 0:
                        # Single-argument call form: prints the same text on
                        # Python 2 and 3.
                        print(intersection)
                    PosList = decide_kmer(intersection)
                    if len(PosList) <= 1:
                        continue
                    fout.write("%s %s " % (len(PosList), readID))
                    for (p, binary, pos) in PosList:
                        fout.write("%s %s %s " % (p, binary, pos))
                    # One '4' per reported position closes the line.
                    score = len(PosList) * '4'
                    fout.write("%s\n" % score)
# File Name: fastq2Reversefasta.py
# Author: Yanbo Li
# mail: [email protected]
# Created Time: Mon 29 Jul 2019 17:02:56 AEST
#########################################################################
#!/bin/bash
import os
import sys
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import tools
# Usage: <script> <input.fastq> <output.fasta>
# Writes every read without an 'N' twice in FASTA form: once as read
# ("<id>_1") and once passed through tools.reverse ("<id>_2").
inFile = sys.argv[1]
outFile = sys.argv[2]
# The with-block closes the output even if parsing fails part-way through.
with open(outFile, "w") as fout:
    for record in SeqIO.parse(inFile, "fastq"):
        print(record.id)
        # Reads containing an 'N' are dropped entirely.
        if record.seq.count('N') > 0:
            continue
        rec1 = SeqRecord(record.seq, id=record.id + "_1")
        rec2 = SeqRecord(tools.reverse(record.seq), id=record.id + "_2")
        fout.write(">%s\n" % rec1.id)
        fout.write("%s\n" % rec1.seq)
        fout.write(">%s\n" % rec2.id)
        fout.write("%s\n" % rec2.seq)
Esempio n. 14
0
def find_pair_indel_kmer(kmerFile, k_1merFile, output_filename):
    """Pair each k-mer with the (k-1)-mer obtained by deleting its middle.

    Both inputs hold "sequence coverage" lines. For every k-mer the
    character at index ``len(kmer) // 2`` is removed to form a key. K-mers
    are skipped when either half lies within Hamming distance 1 of the
    neighbouring window shifted by one, because a deletion there looks like
    a substitution. A key that occurs in the (k-1)-mer file and belongs to
    exactly one k-mer yields an output line ``"kmer k_1mer cov_k cov_k_1"``,
    unless the (k-1)-mer is simply the k-mer without its first or last
    character. Lines are written sorted.

    Args:
        kmerFile: path of the k-mer file.
        k_1merFile: path of the (k-1)-mer file.
        output_filename: path of the pair file to (over)write.

    Raises:
        AssertionError: if a sequence or key compares greater than its
            ``tools.reverse``, or a (k-1)-mer appears twice.
    """
    kmers = {}
    count = 0
    with open(kmerFile, "r") as f:
        for line in f:
            words = line.strip().split()
            kmer = words[0]
            # Sanity check on the input; the original marks it as optional.
            assert tools.reverse(kmer) >= kmer

            mid = len(kmer) // 2
            leftHalf = kmer[:mid]
            rightHalf = kmer[mid + 1:]

            # mutation => delete: skip these ambiguous cases.
            if (tools.hamming_distance(rightHalf, kmer[mid:-1]) <= 1
                    or tools.hamming_distance(leftHalf,
                                              kmer[1:mid + 1]) <= 1):
                continue

            key = leftHalf + rightHalf
            assert key <= tools.reverse(key)
            kmers.setdefault(key, []).append((kmer, words[1]))
            count += 1

    print("unique kmer number", count)
    print("unique k_1mer in kmers", len(kmers))

    k_1mers = {}
    count = 0
    with open(k_1merFile, "r") as f:
        for line in f:
            words = line.strip().split()
            key = words[0]
            # Sanity check on the input; the original marks it as optional.
            assert tools.reverse(key) >= key
            assert key not in k_1mers
            k_1mers[key] = words[1]  # words[1] is coverage
            count += 1
    print("unique k_1mer number", count)

    pairKmers = []
    for key in k_1mers:
        if key not in kmers:
            continue
        if len(kmers[key]) == 1:
            kmer, c = kmers[key][0]
            # Removing the head or the tail gives the same (k-1)-mer, so
            # this is not a middle deletion.
            if kmer[:-1] == key or kmer[1:] == key:
                continue
            pairKmers.append((kmer, key, c, k_1mers[key]))
        else:
            print("more than 1", key, k_1mers[key], kmers[key])

    countU, countD, count = 0, 0, 0
    # The with-block closes the output even if a write fails.
    with open(output_filename, "w") as fout:
        for (k1, k2, c1, c2) in sorted(pairKmers):
            # Take the middle from this k-mer rather than reusing whatever
            # `mid` held after the last line of the reading loop.
            mid = len(k1) // 2
            # countD: the deleted character equals one of its neighbours.
            if k1[mid] == k1[mid + 1] or k1[mid] == k1[mid - 1]:
                countD += 1
            else:
                countU += 1
            count += 1
            fout.write("%s %s %s %s\n" % (k1, k2, c1, c2))
    print(countU, countD, count)
Esempio n. 15
0
def shift_kmer(h, left_unique_index, right_unique_index, label):
    """Collect the k-mers reached by shifting ``h`` one step at a time.

    Starting from ``h``, the walk shifts by one character per step, for at
    most ``len(h) // 2`` steps. A step is taken only when exactly one of
    two lookups has a single candidate: the current key in one index, or
    ``tools.reverse`` of it in the other. The walk stops when both or
    neither lookup succeeds.

    Args:
        h: starting k-mer.
        left_unique_index: mapping from a key to the candidates that can
            follow it on the right.
        right_unique_index: mapping from a key to the candidates that can
            precede it on the left.
        label: ``"l"`` drops the first character of ``h`` and extends on
            the right; ``"r"`` drops the last character and extends on the
            left. Any other value yields an empty result.

    Returns:
        List of ``(kmer, tag)`` tuples in walk order. ``tag`` is ``'f'``
        when the step came from the forward key and ``'b'`` when it came
        from the reversed key.
    """
    group = []
    temp = h
    # Floor division: range() rejects the float that "/" gives on Python 3.
    length = len(h) // 2

    # Caveat from the original author: a k-mer x can be smaller than its
    # reverse x' while the shifted k-mer y is larger than y'
    # (x: AAAATC....TTATT, y: AAATC....TTATT). The original notes disagree
    # on whether the indexes must hold both orientations of every unique
    # k-mer or only the smaller one -- TODO confirm with the index builder.

    if label == "l":
        key = temp[1:]
        Rkey = tools.reverse(key)
        for _ in range(0, length):
            flag, flagR = False, False
            if key in left_unique_index and len(left_unique_index[key]) == 1:
                temp = key + left_unique_index[key][0]
                flag = True

            if (Rkey in right_unique_index
                    and len(right_unique_index[Rkey]) == 1):
                Rtemp = right_unique_index[Rkey][0] + Rkey
                flagR = True

            if flag and flagR:
                # Both orientations have a successor: ambiguous, stop.
                # Diagnostics are printed only for longer walks; the
                # %-formatting prints the same text on Python 2 and 3.
                if len(group) >= 5:
                    print("shift stop %s %s orginal %s" % (key, temp, h))
                    print("forward and backward both have next")
                    print("%s %s" % (Rkey, Rtemp))
                break
            elif flag:
                key = temp[1:]
                Rkey = tools.reverse(key)
                group.append((temp, 'f'))
            elif flagR:
                Rkey = Rtemp[:-1]
                key = tools.reverse(Rkey)
                group.append((Rtemp, 'b'))
            else:
                # Neither orientation has a unique successor: stop.
                if len(group) >= 5:
                    print("shift stop %s %s orginal %s" % (key, Rkey, h))
                    print("%s %s" % (key in left_unique_index,
                                     Rkey in right_unique_index))
                break

    if label == "r":
        key = temp[:-1]
        Rkey = tools.reverse(key)
        for _ in range(0, length):
            flag, flagR = False, False

            if key in right_unique_index and len(right_unique_index[key]) == 1:
                temp = right_unique_index[key][0] + key
                flag = True

            if Rkey in left_unique_index and len(left_unique_index[Rkey]) == 1:
                Rtemp = Rkey + left_unique_index[Rkey][0]
                flagR = True

            if flag and flagR:
                break
            elif flag:
                key = temp[:-1]
                Rkey = tools.reverse(key)
                group.append((temp, 'f'))
            elif flagR:
                Rkey = Rtemp[1:]
                key = tools.reverse(Rkey)
                group.append((Rtemp, 'b'))
            else:
                break

    assert len(group) <= len(h)
    return group
Esempio n. 16
0
def group_shift_kmer(pair_filename, unique_filename, output_filename,
                     NGS_kmer):

    left_unique_index, right_unique_index = read_unique_kmer(unique_filename)

    fout = open(output_filename, "w")
    count = 0
    kmers = {}
    with open(pair_filename, "r") as f:
        for line in f:
            words = line.strip().split()
            h1 = words[0]
            h2 = words[1]
            leftGroup1 = shift_kmer(h1, left_unique_index, right_unique_index,
                                    "l")
            leftGroup2 = shift_kmer(h2, left_unique_index, right_unique_index,
                                    "l")

            cov1 = int(words[2])
            cov2 = int(words[3])
            sum_coverage = cov1 + cov2

            Flag = True
            i = 0
            lenG = min(len(leftGroup1), len(leftGroup2))
            while i < lenG:
                if (tools.hamming_distance(leftGroup1[i][0], leftGroup2[i][0])
                        != 1 and tools.hamming_distance(
                            leftGroup1[i][0], tools.reverse(
                                leftGroup2[i][0])) != 1):
                    Flag = False
                    break
                i += 1
            '''
            # 60 equal to lowest coverage
            if cov1 - cov2 >= 60 or cov2 - cov1 >= 60: # or float(cov2/cov1) >=1.7 or float(cov1/cov2) >=1.7:
                continue
            if sum_coverage <= 180 or sum_coverage >= 280:
                continue
            '''
            if Flag == False:
                continue

            rightGroup1 = shift_kmer(h1, left_unique_index, right_unique_index,
                                     "r")
            rightGroup2 = shift_kmer(h2, left_unique_index, right_unique_index,
                                     "r")

            i = 0
            lenG = min(len(rightGroup1), len(rightGroup2))
            while i < lenG:
                if (tools.hamming_distance(rightGroup1[i][0],
                                           rightGroup2[i][0]) != 1
                        and tools.hamming_distance(
                            rightGroup1[i][0], tools.reverse(
                                rightGroup2[i][0])) != 1):
                    Flag = False
                    break
                i += 1
            if Flag == False:
                continue

            group1, group2 = [(h1, 'f')], [(h2, 'f')]
            group1.extend(leftGroup1)
            group2.extend(leftGroup2)
            group1.extend(rightGroup1)
            group2.extend(rightGroup2)

            gSize1 = len(group1)
            gSize2 = len(group2)
            if gSize1 <= kmerSize - 1 or gSize2 <= kmerSize - 1:
                #if gSize1 <=kmerSize/2 or gSize2 <= kmerSize/2:
                continue
            count += 1
            fout.write("group %s %s %s %s %s\n" %
                       (count, h1, h2, words[2], words[3]))
            print "group", count, len(leftGroup1), len(rightGroup1), len(
                leftGroup2), len(rightGroup2)
            for ele in group1:
                fout.write("%s %s " % (ele[0], ele[1]))
                if ele[0] not in kmers:
                    kmers[ele[0]] = []
                kmers[ele[0]].append(str(count) + ele[1] + 'A')  # A is zore
            fout.write("\n")

            for ele in group2:
                fout.write("%s %s " % (ele[0], ele[1]))
                if ele[0] not in kmers:
                    kmers[ele[0]] = []
                kmers[ele[0]].append(str(count) + ele[1] + 'B')  # B is one
            fout.write("\n")
    fout.close()
    fout = open(NGS_kmer, "w")
    print "total group number", count
    sortedKmers = sorted(kmers.items())
    filterGroup = set()
    for ele in sortedKmers:
        if len(ele[1]) >= 2:
            print ele
            for ID in ele[1]:
                filterGroup.add(ID[:-2])
            continue
        fout.write("%s" % ele[0])
        l = ele[1]
        for e in l:
            fout.write(" %s" % e)
        fout.write("\n")
    fout.close()

    print "filter group size", len(filterGroup)
    foutFilter = open(filter_filename, "w")
    with open(output_filename, "r") as f:
        state = 0
        for line in f:
            if state == 0 and line.startswith("group"):
                words = line.split()
                if words[1] not in filterGroup:
                    foutFilter.write(line)
                    state = 1
                else:
                    state = -1
            elif state == 1:
                foutFilter.write(line)
                state = 2
            elif state == 2:
                foutFilter.write(line)
                state = 0
            elif state == -1:
                state = 0