예제 #1
0
    def logEdges(self, folderName, stagename):
        print "Logging edges"
        logList = []
        
        if stagename == "XResolution":
            mapDummyToRealDic =self.readInJSON(folderName, "mapDummyToRealDic.json")
        else:
            mapDummyToRealDic = {}

        for eachnode in self.graphNodesList:
            tmpNodeIndexList = []
            for kk in eachnode.nodeIndexList:
                if kk >= self.N1:
                    tmpNodeIndexList += mapDummyToRealDic[str(kk-self.N1)][1]
                else:
                    tmpNodeIndexList += [kk]

            if len(tmpNodeIndexList) >= 2:
                for i in range(len(tmpNodeIndexList)-1):
                    currentName = tmpNodeIndexList[i]
                    nextName =  tmpNodeIndexList[i+1]
                    cName =  abunHouseKeeper.parseIDToName(currentName,'C',0)
                    nName =  abunHouseKeeper.parseIDToName(nextName,'C',0)
                    logList.append([cName, nName])

        with open( folderName + stagename + ".json", 'w') as f:
            json.dump(logList, f)    
예제 #2
0
def getCtAgg(inList, myCountDic, Gnew, lenDic):
    newInList = []
    print "New getCtAgg", len(inList)
    for i in inList:
        tmp1 = convert4to1base(i)
        tmp2 = convert4to2base(i)
        covTmp = 0
        lenTmp = 0

        print "len(Gnew.graphNodesList[tmp2].nodeIndexList), len(inList)", len(
            Gnew.graphNodesList[tmp2].nodeIndexList), len(inList)

        for eachindex in Gnew.graphNodesList[tmp2].nodeIndexList:

            if lenDic[abunHouseKeeper.parseIDToName(eachindex, 'C',
                                                    len(lenDic))] > lenTmp:
                lenTmp = lenDic[abunHouseKeeper.parseIDToName(
                    eachindex, 'C', len(lenDic))]

                name = "Segkk" + str(eachindex / 2)
                covTmp = myCountDic[name]

        newInList.append([tmp2, covTmp])

    return newInList
예제 #3
0
    def condenseEdgeRemove(self, G_ContigRead, folderName, mummerLink, contigFilename):
        print "condenseEdgeRemove"
        thresPass = 100
        thresForStrangeCut = 5000
        ### kkdebug

        toRemoveList = []
        
        for eachnode in self.graphNodesList:
            if len(eachnode.nodeIndexList) > 0:
                if len(eachnode.listOfNextNodes) ==1  :
                    nextNodeIndex = eachnode.listOfNextNodes[0][0]
                    nextNode= self.graphNodesList[nextNodeIndex]
                    if len(nextNode.listOfPrevNodes) == 1 : 
                        currentName = eachnode.nodeIndex
                        nextName =  nextNode.nodeIndex

                        contigReadPaths = findAllPathK(currentName,nextName, G_ContigRead, 5)

                        cName =  abunHouseKeeper.parseIDToName(currentName,'C',0)
                        nName =  abunHouseKeeper.parseIDToName(nextName,'C',0)

                        noGoNext = self.readInJSON(folderName, "noGoNext.json")
                        noGoPrev = self.readInJSON(folderName, "noGoPrev.json")

                        overlap = [-1, -1]
                        ctr = 0 

                        for eachpath in contigReadPaths:
                            if len(eachpath) > 2: 
                                ctr = ctr + 1 
                                
                            elif len(eachpath) == 2:     
                                
                                contigName = cName
                                leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)

                                contigName = nName
                                rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
                                
                                overlap = IORobot.align(leftSeg, rightSeg, folderName, mummerLink)


                        if ctr <= thresPass and  (cName in noGoNext or nName in noGoPrev or overlap[0] > thresForStrangeCut ):
                    
                            self.removeEdge(currentName, nextName)
                            toRemoveList.append([currentName, nextName])


        ### kkdebug
        #with open( "dataFolder/toRemoveList.json", 'w') as f:
        #    json.dump(toRemoveList, f)    

        self.findAdjList()
예제 #4
0
    def condenseEdgeRemove(self, G_ContigRead, folderName, mummerLink, contigFilename):
        print "condenseEdgeRemove"
        thresPass = 100
        thresForStrangeCut = 5000
        ### kkdebug

        toRemoveList = []
        
        for eachnode in self.graphNodesList:
            if len(eachnode.nodeIndexList) > 0:
                if len(eachnode.listOfNextNodes) ==1  :
                    nextNodeIndex = eachnode.listOfNextNodes[0][0]
                    nextNode= self.graphNodesList[nextNodeIndex]
                    if len(nextNode.listOfPrevNodes) == 1 : 
                        currentName = eachnode.nodeIndex
                        nextName =  nextNode.nodeIndex

                        contigReadPaths = findAllPathK(currentName,nextName, G_ContigRead, 5)

                        cName =  abunHouseKeeper.parseIDToName(currentName,'C',0)
                        nName =  abunHouseKeeper.parseIDToName(nextName,'C',0)

                        noGoNext = self.readInJSON(folderName, "noGoNext.json")
                        noGoPrev = self.readInJSON(folderName, "noGoPrev.json")

                        overlap = [-1, -1]
                        ctr = 0 

                        for eachpath in contigReadPaths:
                            if len(eachpath) > 2: 
                                ctr = ctr + 1 
                                
                            elif len(eachpath) == 2:     
                                
                                contigName = cName
                                leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)

                                contigName = nName
                                rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
                                
                                overlap = IORobot.align(leftSeg, rightSeg, folderName, mummerLink)


                        if ctr <= thresPass and  (cName in noGoNext or nName in noGoPrev or overlap[0] > thresForStrangeCut ):
                    
                            self.removeEdge(currentName, nextName)
                            toRemoveList.append([currentName, nextName])


        ### kkdebug
        #with open( "dataFolder/toRemoveList.json", 'w') as f:
        #    json.dump(toRemoveList, f)    

        self.findAdjList()
예제 #5
0
def findNoGoByNoHeads(noGoList, side, folderName):
    noGoListNew = []

    sortedContigList,  sortedReadList, sortedContigDic, sortedReadDic =\
     formSortedDataList(folderName)

    lenDicContig = IORobot.obtainLength(folderName, "mFixed_Double.fasta")
    lenDicRead = IORobot.obtainLength(folderName,
                                      "phasingSeedName_Double.fasta")

    for x in noGoList:
        rList = findAttachedReads(x, side, folderName, sortedContigList,
                                  sortedContigDic, lenDicContig, lenDicRead)
        cList = findAttachedContigs(rList, side, folderName, sortedReadList,
                                    sortedReadDic, lenDicContig, lenDicRead)

        if bestMatchContigOnly == False:
            bestContigIDList = findBreakContigAdv(cList)
        else:
            bestContigIDList = findBreakContig(cList)

        if len(rList) > 0 and len(cList) > 0:
            print "x, side, len(rList), len(cList), len(bestContigIDList)",\
              abunHouseKeeper.parseIDToName(x,'C',0), side, len(rList), len(cList), len(bestContigIDList)
            print "cList", bestContigIDList

        noGoListNew = noGoListNew + bestContigIDList

    return noGoListNew
예제 #6
0
def findAttachedReads(x, side, folderName, sortedContigList, sortedContigDic,
                      lenDicContig, lenDicRead):
    """Return the distinct last-column tags of the rows of ``x`` that pass ``overlapCR``.

    ``x`` is converted to a name with ``abunHouseKeeper.parseIDToName(x, 'C', 0)``.
    ``sortedContigDic`` maps that name to the position of its first row in
    ``sortedContigList``; rows are scanned from there for as long as their
    second-to-last field equals the name. A row's last field is collected
    when ``overlapCR(row, side, thresMiddleContig, lenDicContig, lenDicRead)``
    is true, and the collected values are de-duplicated with
    ``abunHouseKeeper.getDistinct``.

    Row layout, as recorded in the original note (presumably the alignment
    table the rows are parsed from -- TODO confirm):

      [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [ IDY]  | [TAGS]
    =====================================================================================
         1      562  |      819     1418  |      562      600  |    84.72  | Contig0_d  Read121_d
         1      562  |     4077     3478  |      562      600  |    84.72  | Contig0_d  Read121_p
         1      564  |      656       68  |      564      589  |    90.13  | Contig0_d  Read382_d
         1      564  |     6996     7584  |      564      589  |    90.13  | Contig0_d  Read382_p
         1      571  |     1386      815  |      571      572  |    86.60  | Contig0_d  Read421_d

    ``folderName`` is accepted but not used here.

    Returns:
        The de-duplicated list, or ``[]`` when the name has no entry in
        ``sortedContigDic``.
    """
    overlapThres = thresMiddleContig

    contigKey = abunHouseKeeper.parseIDToName(x, 'C', 0)
    if contigKey not in sortedContigDic:
        return []

    attached = []
    pos = sortedContigDic[contigKey]

    # Rows of one contig are contiguous: stop at the first foreign row.
    while pos < len(sortedContigList) and sortedContigList[pos][-2] == contigKey:
        row = sortedContigList[pos]
        if overlapCR(row, side, overlapThres, lenDicContig, lenDicRead):
            attached.append(row[-1])
        pos += 1

    return abunHouseKeeper.getDistinct(attached)
예제 #7
0
def findNoGoByNoHeads(noGoList, side, folderName):
	noGoListNew = []

	sortedContigList,  sortedReadList, sortedContigDic, sortedReadDic =\
		formSortedDataList(folderName)


	lenDicContig = IORobot.obtainLength(folderName, "mFixed_Double.fasta" )
	lenDicRead = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta")

	for x in noGoList:
		rList = findAttachedReads(x, side, folderName,sortedContigList,sortedContigDic, lenDicContig,lenDicRead)
		cList = findAttachedContigs(rList, side, folderName, sortedReadList, sortedReadDic, lenDicContig,lenDicRead)

		if bestMatchContigOnly == False:
			bestContigIDList = findBreakContigAdv(cList)
		else:
			bestContigIDList = findBreakContig(cList)

		if len(rList) > 0 and len(cList) > 0:
			print "x, side, len(rList), len(cList), len(bestContigIDList)",\
				 abunHouseKeeper.parseIDToName(x,'C',0), side, len(rList), len(cList), len(bestContigIDList)
			print "cList", bestContigIDList
 
		noGoListNew = noGoListNew + bestContigIDList


	return noGoListNew
예제 #8
0
    def bipartiteLocalResolve(self, resolvedList, inList, outList, folderName):
        """Clear the edges around a local neighbourhood and insert the resolved ones.

        ``self.clearOut(u // 2)`` is called for every ``u`` in ``inList`` and
        ``self.clearIn(v // 2)`` for every ``v`` in ``outList``. Afterwards,
        for each entry ``e`` of ``resolvedList``, an edge from ``e[0]`` to
        ``e[-1]`` is inserted with ``self.insertEdge(e[0], e[-1], 1997)``.

        ``folderName`` is unused: it was only needed by a noGoNext/noGoPrev
        filter on the inserted edges, which is disabled, so every resolved
        pair is inserted.

        NOTE(review): the clearing happens even when ``resolvedList`` is
        empty, in which case nothing is re-inserted -- confirm this is
        intended.
        NOTE(review): ``inList``/``outList`` entries are halved before use
        while the endpoints in ``resolvedList`` are passed on unchanged --
        presumably they live in different index spaces; verify against the
        caller.
        """
        # Floor division keeps the node index an integer even when true
        # division is in effect (identical to "/" for Python 2 ints).
        for u in inList:
            self.clearOut(u // 2)

        for v in outList:
            self.clearIn(v // 2)

        for e in resolvedList:
            self.insertEdge(e[0], e[-1], 1997)
예제 #9
0
    def bipartiteLocalResolve(self, resolvedList, inList, outList, folderName):
        """Replace the edges of a local neighbourhood by the resolved pairs.

        Nothing happens when ``resolvedList`` is empty. Otherwise
        ``self.clearOut(u // 2)`` is called for every ``u`` in ``inList``,
        ``self.clearIn(v // 2)`` for every ``v`` in ``outList``, and then an
        edge from ``e[0]`` to ``e[-1]`` is inserted for every entry ``e`` of
        ``resolvedList`` via ``self.insertEdge(e[0], e[-1], 1997)``.

        ``folderName`` is unused: it was only needed by a noGoNext/noGoPrev
        filter on the inserted edges, which is disabled, so every resolved
        pair is inserted.

        NOTE(review): ``inList``/``outList`` entries are halved before use
        while the endpoints in ``resolvedList`` are passed on unchanged --
        presumably they live in different index spaces; verify against the
        caller.
        """
        # Without any resolved pair the existing edges are left untouched.
        if len(resolvedList) == 0:
            return

        # Floor division keeps the node index an integer even when true
        # division is in effect (identical to "/" for Python 2 ints).
        for u in inList:
            self.clearOut(u // 2)

        for v in outList:
            self.clearIn(v // 2)

        for e in resolvedList:
            self.insertEdge(e[0], e[-1], 1997)
예제 #10
0
def getCtAgg(inList, myCountDic, Gnew, lenDic):
    newInList = []
    print "New getCtAgg", len(inList)
    for i in inList:
        tmp1 = convert4to1base(i)
        tmp2 = convert4to2base(i)
        covTmp = 0
        lenTmp = 0

        print "len(Gnew.graphNodesList[tmp2].nodeIndexList), len(inList)", len(
            Gnew.graphNodesList[tmp2].nodeIndexList
        ), len(inList)

        for eachindex in Gnew.graphNodesList[tmp2].nodeIndexList:

            if lenDic[abunHouseKeeper.parseIDToName(eachindex, "C", len(lenDic))] > lenTmp:
                lenTmp = lenDic[abunHouseKeeper.parseIDToName(eachindex, "C", len(lenDic))]

                name = "Segkk" + str(eachindex / 2)
                covTmp = myCountDic[name]

        newInList.append([tmp2, covTmp])

    return newInList
예제 #11
0
def findAttachedReads(x, side, folderName,sortedContigList,sortedContigDic, lenDicContig,lenDicRead):
    """List, without duplicates, the last-column tags of ``x``'s rows accepted by ``overlapCR``.

    The name of ``x`` is ``abunHouseKeeper.parseIDToName(x, 'C', 0)``. If
    ``sortedContigDic`` has no entry for it, ``[]`` is returned. Otherwise
    the entry is the position of the first row to inspect in
    ``sortedContigList``, and the scan continues while the second-to-last
    field of the row equals the name. The last field of a row is kept when
    ``overlapCR(row, side, thresMiddleContig, lenDicContig, lenDicRead)``
    holds; the kept values go through ``abunHouseKeeper.getDistinct``.

    Row layout, as recorded in the original note (presumably the alignment
    table the rows are parsed from -- TODO confirm):

      [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [ IDY]  | [TAGS]
    =====================================================================================
         1      562  |      819     1418  |      562      600  |    84.72  | Contig0_d  Read121_d
         1      562  |     4077     3478  |      562      600  |    84.72  | Contig0_d  Read121_p
         1      564  |      656       68  |      564      589  |    90.13  | Contig0_d  Read382_d
         1      564  |     6996     7584  |      564      589  |    90.13  | Contig0_d  Read382_p
         1      571  |     1386      815  |      571      572  |    86.60  | Contig0_d  Read421_d

    ``folderName`` is part of the signature but is not used.
    """
    cutoff = thresMiddleContig

    name = abunHouseKeeper.parseIDToName(x, 'C', 0)
    if name not in sortedContigDic:
        return []

    hits = []
    cursor = sortedContigDic[name]

    # All rows of one name sit next to each other; leave at the first other row.
    while cursor < len(sortedContigList) and sortedContigList[cursor][-2] == name:
        record = sortedContigList[cursor]
        if overlapCR(record, side, cutoff, lenDicContig, lenDicRead):
            hits.append(record[-1])
        cursor = cursor + 1

    return abunHouseKeeper.getDistinct(hits)
예제 #12
0
def singleGapLookUp(eachmatchpair,folderName, N1,  mummerLink,  contigReadGraph, contigFilename,readsetFilename):
    #print eachmatchpair
    leftCtgIndex ,rightCtgIndex, leftEnd, rightStart, middleContent = eachmatchpair[0],eachmatchpair[-1],0,0,""
    
    succReadsList = []
    G = seqGraphWt(0)
    G.loadFromFile(folderName, contigReadGraph)
    succReadsList = BFS(leftCtgIndex,rightCtgIndex, G, N1)

    if len(succReadsList) > 0:
        succReadsList.pop(0)
        succReadsList.pop(-1)
    else:
        print "interesting item for future study"

    print "succReadsList" , succReadsList
    
    if len(succReadsList) == 0:
        contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, 'C', N1)
        leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)

        contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, 'C', N1)
        rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
        
        overlap = IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) )
        
        print "overlap contig : ", overlap
        
        leftEnd = len(leftSeg) - overlap[0]
        middleContent = ""
        
    else:
        
        contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, 'C', N1)
        print contigName
        leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
        
        readName = abunHouseKeeper.parseIDToName(succReadsList[0], 'R', N1)
        print readName
        rightSeg  = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
        
        overlap = IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) )
        
        print "overlap start read : ", overlap
        
        leftEnd = len(leftSeg) - overlap[0]
        
        middleContent = ""
        
        for i in range(len(succReadsList)-1):
            readName = abunHouseKeeper.parseIDToName(succReadsList[i], 'R', N1)
            leftSeg  = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
        
            readName = abunHouseKeeper.parseIDToName(succReadsList[i+1], 'R', N1)
            rightSeg  = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
            
            overlap = IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) )
            print "overlap middle read : ", overlap
            middleContent = middleContent + leftSeg[0:len(leftSeg)-overlap[0]] 
        
        
        readName = abunHouseKeeper.parseIDToName(succReadsList[-1], 'R', N1)
        leftSeg  = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
        
        contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, 'C', N1)
        rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
        
        overlap = IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) )
        print "overlap end read : ", overlap
        
        middleContent = middleContent + leftSeg[0:len(leftSeg)-overlap[0]]

    return [leftCtgIndex ,rightCtgIndex, leftEnd, rightStart, middleContent]
예제 #13
0
def singleGapLookUp(eachmatchpair, folderName, N1, mummerLink, contigReadGraph, contigFilename, readsetFilename):

    print eachmatchpair
    leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent = eachmatchpair[0], eachmatchpair[-1], 0, 0, ""

    succReadsList = abunGraphLib.findPathBtwEnds(folderName, leftCtgIndex, rightCtgIndex, contigReadGraph, N1)

    succReadsList = []
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    allPaths = abunGraphLib.findAllPathK(leftCtgIndex, rightCtgIndex, G, 5)
    # shuffle(allPaths)

    print "allPaths", allPaths

    possibleList = []
    for p in allPaths:
        noContig = True
        for pp in p[1:-1]:
            if pp < N1:
                noContig = False
        if noContig == True:
            possibleList.append(p)
    print "possibleList", possibleList

    minListLen = 1000
    for p in possibleList:
        if len(p) < minListLen:
            succReadsList = p
            minListLen = len(p)

    if len(succReadsList) > 0:
        succReadsList.pop(0)
        succReadsList.pop(-1)
    else:
        print "interesting item for future study"

    print "succReadsList", succReadsList

    if len(succReadsList) == 0:
        contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, "C", N1)
        leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)

        contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, "C", N1)
        rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)

        overlap = IORobot.alignWithName(
            leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex)
        )

        print "overlap contig : ", overlap

        leftEnd = len(leftSeg) - overlap[0]
        middleContent = ""

    else:

        contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, "C", N1)
        print contigName
        leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)

        readName = abunHouseKeeper.parseIDToName(succReadsList[0], "R", N1)
        print readName
        rightSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)

        overlap = IORobot.alignWithName(
            leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex)
        )

        print "overlap start read : ", overlap

        leftEnd = len(leftSeg) - overlap[0]

        middleContent = ""

        for i in range(len(succReadsList) - 1):
            readName = abunHouseKeeper.parseIDToName(succReadsList[i], "R", N1)
            leftSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)

            readName = abunHouseKeeper.parseIDToName(succReadsList[i + 1], "R", N1)
            rightSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)

            overlap = IORobot.alignWithName(
                leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex)
            )
            print "overlap middle read : ", overlap
            middleContent = middleContent + leftSeg[0 : len(leftSeg) - overlap[0]]

        readName = abunHouseKeeper.parseIDToName(succReadsList[-1], "R", N1)
        leftSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)

        contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, "C", N1)
        rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)

        overlap = IORobot.alignWithName(
            leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex)
        )
        print "overlap end read : ", overlap

        middleContent = middleContent + leftSeg[0 : len(leftSeg) - overlap[0]]

    return [leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent]