Beispiel #1
0
def clusterUnitTest():
    """Build a noisy read set for the clustering test and save it.

    Draws 30 random reads of 40 bases (coded 1-4), asks
    ``dataGen.addIndelNoise`` for 10 noisy versions of the whole set and
    stacks them, left-aligned and zero-padded to 80 columns, into a single
    int8 matrix that is passed to ``logging.rawDataSave``.
    """
    numTypes, baseCount, numCopies = 30, 40, 10
    noiseRate = 0.01
    paddedWidth = 2 * baseCount

    # Bases are drawn row by row, so the random stream is consumed in the
    # same order as a nested per-read / per-base loop.
    cleanReads = np.array(
        [[random.randint(1, 4) for _ in range(baseCount)]
         for _ in range(numTypes)],
        dtype=np.int8)

    stacked = np.zeros((numTypes * numCopies, paddedWidth), dtype=np.int8)
    for copyNo in range(numCopies):
        firstRow = copyNo * numTypes
        noisyBatch = dataGen.addIndelNoise(cleanReads, noiseRate)
        for offset in range(numTypes):
            noisyRead = noisyBatch[offset]
            # Noisy reads may differ in length; the unused tail stays 0.
            stacked[firstRow + offset][0:len(noisyRead)] = noisyRead[:]

    logging.rawDataSave("clusterReadsUnitTest", "", "", stacked, 'n')
def clusterUnitTest():
    """Generate and save the noisy reads used by the clustering test.

    30 random source reads (40 bases each, values 1-4) are passed 10 times
    through ``dataGen.addIndelNoise``; each noisy batch fills the next 30
    rows of a zero-initialised 300 x 80 int8 array, which is then saved with
    ``logging.rawDataSave``.
    """
    typeLen = 30
    readLen = 40
    copyLen = 10
    p = 0.01

    rawReads = np.zeros((typeLen, readLen), dtype=np.int8)
    # np.ndindex walks the grid in row-major order, i.e. read by read.
    for row, col in np.ndindex(typeLen, readLen):
        rawReads[row, col] = random.randint(1, 4)

    noisyReads = np.zeros((typeLen * copyLen, 2 * readLen), dtype=np.int8)
    for batchNo in range(copyLen):
        batch = dataGen.addIndelNoise(rawReads, p)
        for k in range(typeLen):
            noisy = batch[k]
            # Left-align the noisy read; columns past its end remain 0.
            noisyReads[batchNo * typeLen + k, 0:len(noisy)] = noisy[:]

    logging.rawDataSave("clusterReadsUnitTest", "", "", noisyReads, 'n')
def _copySegment(genome, source, target, length):
    """Overwrite ``genome[target:target + length]`` with the window at ``source``."""
    # Every caller keeps target >= source + length, so the two windows are
    # disjoint and one slice copy equals a base-by-base copy.
    if length > 0:
        genome[target:target + length] = genome[source:source + length]


def _readGenomeFile(filename):
    """Return ``(header, sequence)`` read from a text genome file.

    The first line is returned unchanged as the header. All remaining lines
    are joined into one string after removing their line terminators only,
    so a final line without a trailing newline keeps its last base and CRLF
    files do not leave stray carriage returns in the sequence.
    """
    with open(filename, 'r') as handle:
        header = handle.readline()
        rows = handle.readlines()
    return header, "".join(row.rstrip("\r\n") for row in rows)


def generateData(typeOfGen, detail, parameterRobot):
    """Create a test genome, sample reads from it and save everything.

    A random genome of ``parameterRobot.G`` bases (coded 1-4) is always drawn
    first; ``typeOfGen`` then selects how it is modified. ``detail`` is a
    '-'-separated parameter string whose fields depend on the type:

    * ``'r'`` -- ``"len"``: one segment of that length is copied to a second,
      later location.
    * ``'i'`` -- ``"lrepeat-linter"``: two segments are each duplicated, laid
      out as repeat, inter-repeat, repeat copy, inter-repeat copy.
    * ``'t'`` -- ``"ltriple"``: one segment appears three times.
    * ``'d'`` -- ``"filename-start-end"``: the genome is read from a file
      (first line is a header) and cut to ``[start, end)``; A/C/G/T map to
      1/2/3/4 and any other character to 0. Because ``detail`` is split on
      '-', the file name itself must not contain '-'.
    * ``'m'`` -- ``"lrepeat-lsnp-lint"``: three segments are duplicated and
      the copy of the middle one is changed at offsets ``lsnp // 4`` and
      ``3 * lsnp // 4``.
    * ``'a'`` -- ``"ltandem1-copies1-ltandem2-copies2"``: two tandem-repeat
      regions.

    Any other ``typeOfGen`` prints "Error in Type " and continues with the
    unmodified random genome.

    Returns ``(motherGen, reads, noisyReads)``, where the reads come from
    ``readGen(N, L, p, motherGen, indel)``. The three arrays are also passed
    to ``logging.rawDataSave`` under ``parameterRobot.defaultFolder + "UnitTest"``.
    """
    N, G, L, p = parameterRobot.N, parameterRobot.G, parameterRobot.L, parameterRobot.p
    detailArr = detail.split('-')

    # Drawn for every type (even 'd', which replaces it) so the sequence of
    # random numbers consumed stays the same as before.
    motherGen = np.array([random.randint(1, 4) for _ in range(G)],
                         dtype=np.int8)

    if typeOfGen == 'r':
        longestRepeatlength = int(detailArr[0])

        repeatLoc1 = random.randint(0, G - 2 * longestRepeatlength)
        repeatLoc2 = random.randint(repeatLoc1 + longestRepeatlength,
                                    G - longestRepeatlength)

        print("repeatLoc1, repeatLoc2, detail %s %s %s"
              % (repeatLoc1, repeatLoc2, detail))

        _copySegment(motherGen, repeatLoc1, repeatLoc2, longestRepeatlength)

    elif typeOfGen == 'i':
        lrepeat = int(detailArr[0])
        linter = int(detailArr[1])

        repeatLoc1 = random.randint(0, G - 2 * lrepeat - 2 * linter)
        intrepeatLoc1 = random.randint(repeatLoc1 + lrepeat,
                                       G - lrepeat - 2 * linter)
        repeatLoc2 = random.randint(intrepeatLoc1 + linter,
                                    G - lrepeat - linter)
        intrepeatLoc2 = random.randint(repeatLoc2 + lrepeat, G - linter)

        _copySegment(motherGen, repeatLoc1, repeatLoc2, lrepeat)
        _copySegment(motherGen, intrepeatLoc1, intrepeatLoc2, linter)

        print("repeatLoc1,repeatLoc2,intrepeatLoc1,intrepeatLoc2, lrepeat,linter"
              " %s %s %s %s %s %s"
              % (repeatLoc1, repeatLoc2, intrepeatLoc1, intrepeatLoc2,
                 lrepeat, linter))

    elif typeOfGen == 't':
        ltriple = int(detailArr[0])

        repeatLoc1 = random.randint(0, G - 3 * ltriple)
        repeatLoc2 = random.randint(repeatLoc1 + ltriple, G - 2 * ltriple)
        repeatLoc3 = random.randint(repeatLoc2 + ltriple, G - ltriple)

        _copySegment(motherGen, repeatLoc1, repeatLoc2, ltriple)
        _copySegment(motherGen, repeatLoc1, repeatLoc3, ltriple)

        print("repeatLoc1, repeatLoc2, repeatLoc3, ltriple %s %s %s %s"
              % (repeatLoc1, repeatLoc2, repeatLoc3, ltriple))

    elif typeOfGen == 'd':
        filename = detailArr[0]
        truncatestart = int(detailArr[1])
        truncateend = int(detailArr[2])

        header, motherStr = _readGenomeFile(filename)
        print("%s %s" % ("Genome Detail ", header))
        print("genomeLen %s" % len(motherStr))

        G = truncateend - truncatestart
        print("G: %s" % G)
        motherGen = np.zeros(G, dtype=np.int8)

        baseCodes = {'A': 1, 'C': 2, 'G': 3, 'T': 4}
        # Indexed access (not slicing) so a window that runs past the end of
        # the file still raises IndexError instead of silently shrinking.
        for offset, position in enumerate(range(truncatestart, truncateend)):
            motherGen[offset] = baseCodes.get(motherStr[position], 0)

    elif typeOfGen == 'm':
        lrepeat = int(detailArr[0])
        lsnp = int(detailArr[1])
        lint = int(detailArr[2])

        randLoc1 = random.randint(0, G - 2 * lrepeat - 2 * lsnp - 2 * lint)
        randLocSnp1 = random.randint(randLoc1 + lrepeat,
                                     G - lrepeat - 2 * lsnp - 2 * lint)
        randLocint1 = random.randint(randLocSnp1 + lsnp,
                                     G - lrepeat - lsnp - 2 * lint)

        randLoc2 = random.randint(randLocint1 + lint,
                                  G - lrepeat - lsnp - lint)
        randLocSnp2 = random.randint(randLoc2 + lrepeat, G - lsnp - lint)
        randLocint2 = random.randint(randLocSnp2 + lsnp, G - lint)

        _copySegment(motherGen, randLoc1, randLoc2, lrepeat)
        _copySegment(motherGen, randLocSnp1, randLocSnp2, lsnp)
        _copySegment(motherGen, randLocint1, randLocint2, lint)

        # Two single-base changes in the second copy of the SNP segment,
        # a quarter and three quarters of the way in; base 4 wraps to 1.
        quarter = lsnp // 4
        threeQuarter = (3 * lsnp) // 4
        for offset in (quarter, threeQuarter):
            position = randLocSnp2 + offset
            motherGen[position] = motherGen[position] + 1
            if motherGen[position] == 5:
                motherGen[position] = 1

        print("randLocSnp1 + lsnp/4, randLocSnp2 + lsnp/4 %s %s"
              % (randLocSnp1 + quarter, randLocSnp2 + quarter))
        print("%s %s" % (motherGen[randLocSnp1 + quarter],
                         motherGen[randLocSnp2 + quarter]))

        print("randLocSnp1 + 3*lsnp/4, randLocSnp2 + 3*lsnp/4 %s %s"
              % (randLocSnp1 + threeQuarter, randLocSnp2 + threeQuarter))
        print("%s %s" % (motherGen[randLocSnp1 + threeQuarter],
                         motherGen[randLocSnp2 + threeQuarter]))

    elif typeOfGen == 'a':
        print("Tandem Repeat")
        ltandem1 = int(detailArr[0])
        lcopyNum1 = int(detailArr[1])
        ltandem2 = int(detailArr[2])
        lcopyNum2 = int(detailArr[3])

        lrepeat1 = ltandem1 * lcopyNum1
        lrepeat2 = ltandem2 * lcopyNum2

        randLoc1 = random.randint(0, G - lrepeat1 - lrepeat2)
        randLoc2 = random.randint(randLoc1 + lrepeat1, G - lrepeat2)

        for start, unit, copies in ((randLoc1, ltandem1, lcopyNum1),
                                    (randLoc2, ltandem2, lcopyNum2)):
            # Copy 0 is the unit itself, so replication starts at copy 1.
            for copyindex in range(1, copies):
                _copySegment(motherGen, start, start + copyindex * unit, unit)

        print("randLoc1, randLoc2 %s %s" % (randLoc1, randLoc2))

    else:
        # Unknown type: report it and carry on with the plain random genome.
        print("Error in Type ")

    indel = parameterRobot.indel
    reads, noisyReads = readGen(N, L, p, motherGen, indel)

    logging.rawDataSave(parameterRobot.defaultFolder + "UnitTest", motherGen,
                        reads, noisyReads)

    return motherGen, reads, noisyReads
def _copySegment(genome, source, target, length):
    """Overwrite ``genome[target:target + length]`` with the window at ``source``."""
    # Every caller keeps target >= source + length, so the two windows are
    # disjoint and one slice copy equals a base-by-base copy.
    if length > 0:
        genome[target:target + length] = genome[source:source + length]


def _readGenomeFile(filename):
    """Return ``(header, sequence)`` read from a text genome file.

    The first line is returned unchanged as the header. All remaining lines
    are joined into one string after removing their line terminators only,
    so a final line without a trailing newline keeps its last base and CRLF
    files do not leave stray carriage returns in the sequence.
    """
    with open(filename, 'r') as handle:
        header = handle.readline()
        rows = handle.readlines()
    return header, "".join(row.rstrip("\r\n") for row in rows)


def generateData(typeOfGen, detail, parameterRobot):
    """Create a test genome, sample reads from it and save everything.

    A random genome of ``parameterRobot.G`` bases (coded 1-4) is always drawn
    first; ``typeOfGen`` then selects how it is modified. ``detail`` is a
    '-'-separated parameter string whose fields depend on the type:

    * ``'r'`` -- ``"len"``: one segment of that length is copied to a second,
      later location.
    * ``'i'`` -- ``"lrepeat-linter"``: two segments are each duplicated, laid
      out as repeat, inter-repeat, repeat copy, inter-repeat copy.
    * ``'t'`` -- ``"ltriple"``: one segment appears three times.
    * ``'d'`` -- ``"filename-start-end"``: the genome is read from a file
      (first line is a header) and cut to ``[start, end)``; A/C/G/T map to
      1/2/3/4 and any other character to 0. Because ``detail`` is split on
      '-', the file name itself must not contain '-'.
    * ``'m'`` -- ``"lrepeat-lsnp-lint"``: three segments are duplicated and
      the copy of the middle one is changed at offsets ``lsnp // 4`` and
      ``3 * lsnp // 4``.
    * ``'a'`` -- ``"ltandem1-copies1-ltandem2-copies2"``: two tandem-repeat
      regions.

    Any other ``typeOfGen`` prints "Error in Type " and continues with the
    unmodified random genome.

    Returns ``(motherGen, reads, noisyReads)``, where the reads come from
    ``readGen(N, L, p, motherGen, indel)``. The three arrays are also passed
    to ``logging.rawDataSave`` under ``parameterRobot.defaultFolder + "UnitTest"``.
    """
    N, G, L, p = parameterRobot.N, parameterRobot.G, parameterRobot.L, parameterRobot.p
    detailArr = detail.split('-')

    # Drawn for every type (even 'd', which replaces it) so the sequence of
    # random numbers consumed stays the same as before.
    motherGen = np.array([random.randint(1, 4) for _ in range(G)],
                         dtype=np.int8)

    if typeOfGen == 'r':
        longestRepeatlength = int(detailArr[0])

        repeatLoc1 = random.randint(0, G - 2 * longestRepeatlength)
        repeatLoc2 = random.randint(repeatLoc1 + longestRepeatlength,
                                    G - longestRepeatlength)

        print("repeatLoc1, repeatLoc2, detail %s %s %s"
              % (repeatLoc1, repeatLoc2, detail))

        _copySegment(motherGen, repeatLoc1, repeatLoc2, longestRepeatlength)

    elif typeOfGen == 'i':
        lrepeat = int(detailArr[0])
        linter = int(detailArr[1])

        repeatLoc1 = random.randint(0, G - 2 * lrepeat - 2 * linter)
        intrepeatLoc1 = random.randint(repeatLoc1 + lrepeat,
                                       G - lrepeat - 2 * linter)
        repeatLoc2 = random.randint(intrepeatLoc1 + linter,
                                    G - lrepeat - linter)
        intrepeatLoc2 = random.randint(repeatLoc2 + lrepeat, G - linter)

        _copySegment(motherGen, repeatLoc1, repeatLoc2, lrepeat)
        _copySegment(motherGen, intrepeatLoc1, intrepeatLoc2, linter)

        print("repeatLoc1,repeatLoc2,intrepeatLoc1,intrepeatLoc2, lrepeat,linter"
              " %s %s %s %s %s %s"
              % (repeatLoc1, repeatLoc2, intrepeatLoc1, intrepeatLoc2,
                 lrepeat, linter))

    elif typeOfGen == 't':
        ltriple = int(detailArr[0])

        repeatLoc1 = random.randint(0, G - 3 * ltriple)
        repeatLoc2 = random.randint(repeatLoc1 + ltriple, G - 2 * ltriple)
        repeatLoc3 = random.randint(repeatLoc2 + ltriple, G - ltriple)

        _copySegment(motherGen, repeatLoc1, repeatLoc2, ltriple)
        _copySegment(motherGen, repeatLoc1, repeatLoc3, ltriple)

        print("repeatLoc1, repeatLoc2, repeatLoc3, ltriple %s %s %s %s"
              % (repeatLoc1, repeatLoc2, repeatLoc3, ltriple))

    elif typeOfGen == 'd':
        filename = detailArr[0]
        truncatestart = int(detailArr[1])
        truncateend = int(detailArr[2])

        header, motherStr = _readGenomeFile(filename)
        print("%s %s" % ("Genome Detail ", header))
        print("genomeLen %s" % len(motherStr))

        G = truncateend - truncatestart
        print("G: %s" % G)
        motherGen = np.zeros(G, dtype=np.int8)

        baseCodes = {'A': 1, 'C': 2, 'G': 3, 'T': 4}
        # Indexed access (not slicing) so a window that runs past the end of
        # the file still raises IndexError instead of silently shrinking.
        for offset, position in enumerate(range(truncatestart, truncateend)):
            motherGen[offset] = baseCodes.get(motherStr[position], 0)

    elif typeOfGen == 'm':
        lrepeat = int(detailArr[0])
        lsnp = int(detailArr[1])
        lint = int(detailArr[2])

        randLoc1 = random.randint(0, G - 2 * lrepeat - 2 * lsnp - 2 * lint)
        randLocSnp1 = random.randint(randLoc1 + lrepeat,
                                     G - lrepeat - 2 * lsnp - 2 * lint)
        randLocint1 = random.randint(randLocSnp1 + lsnp,
                                     G - lrepeat - lsnp - 2 * lint)

        randLoc2 = random.randint(randLocint1 + lint,
                                  G - lrepeat - lsnp - lint)
        randLocSnp2 = random.randint(randLoc2 + lrepeat, G - lsnp - lint)
        randLocint2 = random.randint(randLocSnp2 + lsnp, G - lint)

        _copySegment(motherGen, randLoc1, randLoc2, lrepeat)
        _copySegment(motherGen, randLocSnp1, randLocSnp2, lsnp)
        _copySegment(motherGen, randLocint1, randLocint2, lint)

        # Two single-base changes in the second copy of the SNP segment,
        # a quarter and three quarters of the way in; base 4 wraps to 1.
        quarter = lsnp // 4
        threeQuarter = (3 * lsnp) // 4
        for offset in (quarter, threeQuarter):
            position = randLocSnp2 + offset
            motherGen[position] = motherGen[position] + 1
            if motherGen[position] == 5:
                motherGen[position] = 1

        print("randLocSnp1 + lsnp/4, randLocSnp2 + lsnp/4 %s %s"
              % (randLocSnp1 + quarter, randLocSnp2 + quarter))
        print("%s %s" % (motherGen[randLocSnp1 + quarter],
                         motherGen[randLocSnp2 + quarter]))

        print("randLocSnp1 + 3*lsnp/4, randLocSnp2 + 3*lsnp/4 %s %s"
              % (randLocSnp1 + threeQuarter, randLocSnp2 + threeQuarter))
        print("%s %s" % (motherGen[randLocSnp1 + threeQuarter],
                         motherGen[randLocSnp2 + threeQuarter]))

    elif typeOfGen == 'a':
        print("Tandem Repeat")
        ltandem1 = int(detailArr[0])
        lcopyNum1 = int(detailArr[1])
        ltandem2 = int(detailArr[2])
        lcopyNum2 = int(detailArr[3])

        lrepeat1 = ltandem1 * lcopyNum1
        lrepeat2 = ltandem2 * lcopyNum2

        randLoc1 = random.randint(0, G - lrepeat1 - lrepeat2)
        randLoc2 = random.randint(randLoc1 + lrepeat1, G - lrepeat2)

        for start, unit, copies in ((randLoc1, ltandem1, lcopyNum1),
                                    (randLoc2, ltandem2, lcopyNum2)):
            # Copy 0 is the unit itself, so replication starts at copy 1.
            for copyindex in range(1, copies):
                _copySegment(motherGen, start, start + copyindex * unit, unit)

        print("randLoc1, randLoc2 %s %s" % (randLoc1, randLoc2))

    else:
        # Unknown type: report it and carry on with the plain random genome.
        print("Error in Type ")

    indel = parameterRobot.indel
    reads, noisyReads = readGen(N, L, p, motherGen, indel)

    logging.rawDataSave(parameterRobot.defaultFolder + "UnitTest", motherGen,
                        reads, noisyReads)

    return motherGen, reads, noisyReads