Example No. 1
def findNoGoByNoHeads(noGoList, side, folderName):
	noGoListNew = []

	sortedContigList,  sortedReadList, sortedContigDic, sortedReadDic =\
		formSortedDataList(folderName)


	lenDicContig = IORobot.obtainLength(folderName, "mFixed_Double.fasta" )
	lenDicRead = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta")

	for x in noGoList:
		rList = findAttachedReads(x, side, folderName,sortedContigList,sortedContigDic, lenDicContig,lenDicRead)
		cList = findAttachedContigs(rList, side, folderName, sortedReadList, sortedReadDic, lenDicContig,lenDicRead)

		if bestMatchContigOnly == False:
			bestContigIDList = findBreakContigAdv(cList)
		else:
			bestContigIDList = findBreakContig(cList)

		if len(rList) > 0 and len(cList) > 0:
			print "x, side, len(rList), len(cList), len(bestContigIDList)",\
				 abunHouseKeeper.parseIDToName(x,'C',0), side, len(rList), len(cList), len(bestContigIDList)
			print "cList", bestContigIDList
 
		noGoListNew = noGoListNew + bestContigIDList


	return noGoListNew
Example No. 2
def findNoGoByNoHeads(noGoList, side, folderName):
    noGoListNew = []

    sortedContigList,  sortedReadList, sortedContigDic, sortedReadDic =\
     formSortedDataList(folderName)

    lenDicContig = IORobot.obtainLength(folderName, "mFixed_Double.fasta")
    lenDicRead = IORobot.obtainLength(folderName,
                                      "phasingSeedName_Double.fasta")

    for x in noGoList:
        rList = findAttachedReads(x, side, folderName, sortedContigList,
                                  sortedContigDic, lenDicContig, lenDicRead)
        cList = findAttachedContigs(rList, side, folderName, sortedReadList,
                                    sortedReadDic, lenDicContig, lenDicRead)

        if bestMatchContigOnly == False:
            bestContigIDList = findBreakContigAdv(cList)
        else:
            bestContigIDList = findBreakContig(cList)

        if len(rList) > 0 and len(cList) > 0:
            print "x, side, len(rList), len(cList), len(bestContigIDList)",\
              abunHouseKeeper.parseIDToName(x,'C',0), side, len(rList), len(cList), len(bestContigIDList)
            print "cList", bestContigIDList

        noGoListNew = noGoListNew + bestContigIDList

    return noGoListNew
Example No. 3
def findCoverageFromRawData(folderName):
    contigLenDic = IORobot.obtainLength(folderName, "contigs.fasta")
    readLenDic = IORobot.obtainLength(folderName, "raw_reads.fasta")

    G = 0
    NL = 0
    for eachitem in contigLenDic:
        G = G + contigLenDic[eachitem]

    for eachitem in readLenDic:
        NL = NL + readLenDic[eachitem]

    c = (NL * 1.0) / G
    print c
    return c
Example No. 4
def findCoverageFromRawData(folderName):
    contigLenDic  = IORobot.obtainLength(folderName, "contigs.fasta")
    readLenDic = IORobot.obtainLength(folderName, "raw_reads.fasta")

    G = 0 
    NL = 0 
    for eachitem in contigLenDic:
        G = G+ contigLenDic[eachitem]
        
    for eachitem in readLenDic:
        NL = NL+ readLenDic[eachitem]
    
    c = (NL*1.0)/G 
    print c 
    return c 
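
The coverage estimate above is just total read bases over total contig bases, c = NL / G. A minimal standalone sketch of that arithmetic (the length dictionaries here are made up; in the real pipeline they come from IORobot.obtainLength):

def coverageFromLenDics(contigLenDic, readLenDic):
    # G : total bases in the assembled contigs, NL : total bases in the raw reads
    G = sum(contigLenDic.values())
    NL = sum(readLenDic.values())
    return (NL * 1.0) / G

# Toy numbers: 100 kb of contigs, 3,000 kb of reads -> roughly 30x coverage
contigLenDic = {"Contig0": 60000, "Contig1": 40000}
readLenDic = dict(("Read" + str(i), 10000) for i in range(300))
print(coverageFromLenDics(contigLenDic, readLenDic))   # 30.0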
Example No. 5
def abunSplitAdvResolve(folderName, mummerLink, myCountDic, contigReadGraph,
                        contigFilename, readsetFilename):
    '''
    Algorithm: 
    1)Load ContigReadGraph and form xResolvedGraph
    2)Transitive reduction and remove double pointers
    3)Bipartite resolution
    4)xResolve 
    5)Form gapLookUp 
    6)Read contigs out from graph
    7)CheckAns and get it done today again... 
    '''
    if abunHouseKeeper.abunGlobalRunEM == True:
        emalgo.generateAssociatedReadDic(folderName)

    lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDic)

    Gnew = graphSurgery(myCountDic, folderName, contigReadGraph, mummerLink,
                        readsetFilename, contigFilename)
    Gnew.logEdges(folderName, "graphsurgery")

    #Gnew.reportEdge()
    #assert(False)

    Gnew = BResolution(Gnew, folderName, contigReadGraph, N1, myCountDic,
                       lenDic, mummerLink)
    Gnew.logEdges(folderName, "BResolution")

    XResolution(folderName, contigReadGraph, Gnew, myCountDic, lenDic, N1,
                mummerLink)
    Gnew.logEdges(folderName, "XResolution")

    readContigForAbunSplit(folderName, mummerLink, contigFilename,
                           readsetFilename, N1, contigReadGraph)
Example No. 6
def generateGapContentLookup(
    folderName, mummerLink, oldResolvedList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic={}
):
    gapContentLookUpList = []

    contigLenDic = IORobot.obtainLength(folderName, contigFilename + ".fasta")
    N1 = len(contigLenDic) * 2

    resolvedList = []

    print "mapDummyToRealDic", mapDummyToRealDic
    for eachmatchpair in oldResolvedList:
        tmpList = []

        if eachmatchpair[0] >= N1:
            tmpList = tmpList + mapDummyToRealDic[str(eachmatchpair[0] - N1)][1]
        else:
            tmpList.append(eachmatchpair[0])

        if eachmatchpair[-1] >= N1:
            tmpList = tmpList + mapDummyToRealDic[str(eachmatchpair[-1] - N1)][1]
        else:
            tmpList.append(eachmatchpair[-1])

        for ii in range(len(tmpList) - 1):
            resolvedList.append([tmpList[ii], tmpList[ii + 1]])

    gapContentLookUpList = parallelGapLookUp(
        resolvedList, folderName, N1, mummerLink, contigReadGraph, contigFilename, readsetFilename
    )

    return gapContentLookUpList
Example No. 7
def filterEdge(adjacencyList, folderName, contigFilename):
    lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    thresFoPhase = 2000
    smallList, largeList = [], []
    for eachitem in lenDic:
        id = abunHouseKeeper.parseEdgeNameToID(eachitem, 'C')
        if lenDic[eachitem] < thresFoPhase:
            smallList.append(id)
        else:
            largeList.append(id)
    
    newAdjacencyList = [[] for i in range(len(adjacencyList))]
    
    for i in largeList:
        for eachitem in adjacencyList[i]:
            ######## IMPORTANT:
            if eachitem in largeList and eachitem / 2 != i / 2:
                ######## NEED TO REMOVE IN PRODUCTION if True
                newAdjacencyList[i].append(eachitem)

    print "len(smallList), len(largeList): ", len(smallList), len(largeList)
    print "lenDic: ", lenDic
    
    for eachitem in newAdjacencyList:
        print "newAdjacencyList :", eachitem 
        
    return newAdjacencyList
Example No. 8
def filterEdge(adjacencyList, folderName, contigFilename):
    lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    thresFoPhase = 2000
    smallList, largeList = [], []
    for eachitem in lenDic:
        id = abunHouseKeeper.parseEdgeNameToID(eachitem, 'C')
        if lenDic[eachitem] < thresFoPhase:
            smallList.append(id)
        else:
            largeList.append(id)

    newAdjacencyList = [[] for i in range(len(adjacencyList))]

    for i in largeList:
        for eachitem in adjacencyList[i]:
            ######## IMPORTANT:
            if eachitem in largeList and eachitem / 2 != i / 2:
                ######## NEED TO REMOVE IN PRODUCTION if True
                newAdjacencyList[i].append(eachitem)

    print "len(smallList)  , len(largeList): ", len(smallList), len(largeList)
    print "lenDic: ", lenDic

    for eachitem in newAdjacencyList:
        print "newAdjacencyList :", eachitem

    return newAdjacencyList
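
In filterEdge, the test eachitem / 2 != i / 2 (integer division) drops edges between a contig and its own double: in the *_Double.fasta naming, nodes 2k and 2k+1 appear to be the _p and _d copies of contig k. A self-contained sketch of the same filtering rule on a hand-made adjacency list and length table:

def filterAdjacency(adjacencyList, lengths, thresFoPhase=2000):
    # lengths[i] is the length of node i; nodes 2k and 2k+1 are the two copies of contig k
    largeSet = set(i for i in range(len(lengths)) if lengths[i] >= thresFoPhase)
    newAdjacencyList = [[] for _ in adjacencyList]
    for i in largeSet:
        for j in adjacencyList[i]:
            if j in largeSet and j // 2 != i // 2:   # keep long targets from a different contig
                newAdjacencyList[i].append(j)
    return newAdjacencyList

lengths = [5000, 5000, 800, 800, 3000, 3000]            # contig 1 (nodes 2, 3) is below threshold
adjacency = [[1, 4], [0, 5], [4], [5], [0, 5], [1, 4]]  # hypothetical reachability edges
print(filterAdjacency(adjacency, lengths))              # [[4], [5], [], [], [0], [1]]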
Example No. 9
def formExtraEdges(
        folderName="/home/kakitfive/kkdata2/MetaFinisherSC/dataFolderBackup/",
        optTypeFileHeader="phaseString",
        contigFilename="improved3",
        G=[],
        N1=0):

    dataList = alignerRobot.extractMumData(folderName,
                                           optTypeFileHeader + "CR" + "Out")
    dataList.sort(key=itemgetter(-2))
    lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")

    count = 0
    tmpItem = []
    embedContig2ReadDic, read2EmbedContigDic = {}, {}

    for key, items in groupby(dataList, itemgetter(-2)):
        isEmbedded = False
        for eachitem in items:
            #print eachitem
            if eachitem[4] > lenDic[key] - 300:
                isEmbedded = True
                tmpItem = eachitem

        if isEmbedded:
            count = count + 1
            readName = tmpItem[-1]
            embedContig2ReadDic[key] = readName
            read2EmbedContigDic[readName] = key

    print "len(embedContig2ReadDic)", len(embedContig2ReadDic)

    #assert(False)

    for contigName in embedContig2ReadDic:
        readName = embedContig2ReadDic[contigName]

        readIndex, contigIndex = abunHouseKeeper.parseEdgeNameToID(
            readName, 'R'), abunHouseKeeper.parseEdgeNameToID(contigName, 'C')

        for eachprev in G.graphNodesList[readIndex].listOfPrevNodes:
            idNode, wt = eachprev[0], eachprev[1]
            if idNode < N1:
                G.insertEdge(idNode, contigIndex, wt)

        for eachnext in G.graphNodesList[readIndex].listOfNextNodes:
            idNode, wt = eachnext[0], eachnext[1]
            if idNode < N1:
                G.insertEdge(contigIndex, idNode, wt)

    return G
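
formExtraEdges treats a contig as embedded in a read when one alignment covers all but roughly 300 bases of it (eachitem[4] > lenDic[key] - 300, where field 4 looks like the match length on the contig and field -2 is the contig name), and then copies the read's edges onto that contig. A minimal sketch of just the embedded-contig detection, using hypothetical rows in the same column layout:

from itertools import groupby
from operator import itemgetter

def findEmbeddedContigs(dataList, lenDic, slack=300):
    # Rows look like [s1, e1, s2, e2, matchLen1, matchLen2, identity, contigName, readName]
    embedContig2ReadDic = {}
    for contig, rows in groupby(sorted(dataList, key=itemgetter(-2)), itemgetter(-2)):
        for row in rows:
            if row[4] > lenDic[contig] - slack:        # alignment nearly spans the whole contig
                embedContig2ReadDic[contig] = row[-1]
    return embedContig2ReadDic

# Made-up data: Contig0_p is almost fully covered by Read7_p, Contig1_p is not
dataList = [
    [1, 950, 10, 960, 950, 955, 99.0, "Contig0_p", "Read7_p"],
    [1, 400, 500, 900, 400, 402, 98.0, "Contig1_p", "Read3_d"],
]
lenDic = {"Contig0_p": 1000, "Contig1_p": 5000}
print(findEmbeddedContigs(dataList, lenDic))   # {'Contig0_p': 'Read7_p'}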
Example No. 10
def colorNodes(folderName, mummerPath,sourceFilename, contigFilename, readsetFilename):
    print "colorNodes"
    lenDic = IORobot.obtainLength(folderName, sourceFilename+".fasta")
    print lenDic
    thresForShort = 15000
    shortList = []
    longList = []
    for eachitem in lenDic:
        if lenDic[eachitem] > thresForShort:
            longList.append(eachitem)
        else:
            shortList.append(eachitem)
    
    IORobot.putListToFileO(folderName, sourceFilename+".fasta", contigFilename, longList)
    IORobot.putListToFileO(folderName, sourceFilename+".fasta", readsetFilename, shortList)
Example No. 11
def colorNodes(folderName, mummerPath, sourceFilename, contigFilename, readsetFilename):
    print "colorNodes"
    lenDic = IORobot.obtainLength(folderName, sourceFilename + ".fasta")
    print lenDic
    thresForShort = 15000
    shortList = []
    longList = []
    for eachitem in lenDic:
        if lenDic[eachitem] > thresForShort:
            longList.append(eachitem)
        else:
            shortList.append(eachitem)

    IORobot.putListToFileO(folderName, sourceFilename + ".fasta", contigFilename, longList)
    IORobot.putListToFileO(folderName, sourceFilename + ".fasta", readsetFilename, shortList)
Example No. 12
def decideCut(folderName, mummerPath):
    
    '''
    Input : directPath.fasta, indirectPath.fasta
    Output : toDelete 
    '''
    thres = 50
    
    if True:
        alignerRobot.useMummerAlign(mummerPath, folderName, \
            "indirectvsdirect", "indirectPath.fasta", "directPath.fasta", specialForRaw = False, specialName = "", refinedVersion= True)
    
    dataList =  alignerRobot.extractMumData(folderName , "indirectvsdirectOut")
    lenDic = IORobot.obtainLength(folderName, "directPath.fasta")

    ctr =0 
    ctrindirect = 0 

    dataList.sort(key = itemgetter(-1))

    toDelete = True

    for key, items in groupby(dataList, itemgetter(-1)):
        print "key", key 
        ctr = ctr + 1
        isFound = False
        for eachitem in items:
            if eachitem[2] < thres and eachitem[3] > lenDic[key] - thres:
                isFound = True

        if isFound:
            ctrindirect = ctrindirect + 1


    epsilon = 1.1

    print "ctrindirect, ctr", ctrindirect, ctr

    if ctrindirect*1.0/ctr < (1- epsilon):
        toDelete = False
    else:
        toDelete = True


    return toDelete
Example No. 13
def decideCut(folderName, mummerPath):
    
    '''
    Input : directPath.fasta, indirectPath.fasta
    Output : toDelete 
    '''
    thres = 50
    
    if True:
        alignerRobot.useMummerAlign(mummerPath, folderName, \
            "indirectvsdirect", "indirectPath.fasta", "directPath.fasta", specialForRaw = False, specialName = "", refinedVersion= True)
    
    dataList =  alignerRobot.extractMumData(folderName , "indirectvsdirectOut")
    lenDic = IORobot.obtainLength(folderName, "directPath.fasta")

    ctr =0 
    ctrindirect = 0 

    dataList.sort(key = itemgetter(-1))

    toDelete = True

    for key, items in groupby(dataList, itemgetter(-1)):
        print "key", key 
        ctr = ctr + 1
        isFound = False
        for eachitem in items:
            if eachitem[2] < thres and eachitem[3] > lenDic[key] - thres:
                isFound = True

        if isFound:
            ctrindirect = ctrindirect + 1


    epsilon = 1.1

    print "ctrindirect, ctr", ctrindirect, ctr

    if ctrindirect*1.0/ctr < (1- epsilon):
        toDelete = False
    else:
        toDelete = True


    return toDelete
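
decideCut counts, for each direct path, whether some indirect-path alignment starts within thres of its beginning and reaches within thres of its end, then compares the confirmed fraction ctrindirect / ctr against 1 - epsilon. With epsilon = 1.1 that threshold is negative, so toDelete always stays True; the sketch below isolates the ratio test with a hypothetical epsilon of 0.1 purely to illustrate the arithmetic:

def decideCutFromCounts(ctrindirect, ctr, epsilon=0.1):
    # epsilon = 0.1 is a hypothetical value; the function above uses 1.1,
    # which makes the comparison always come out True.
    if ctr == 0:
        return True
    return ctrindirect * 1.0 / ctr >= 1 - epsilon

print(decideCutFromCounts(9, 10))   # True : 0.9 >= 0.9
print(decideCutFromCounts(5, 10))   # False: 0.5 <  0.9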
Example No. 14
    def runningTestSet(self, myFolderName, ctexpected, commandList, matchingContigFile):

        print "Integration test on RepeatPhaserMain:  " + myFolderName
        self.sourceFolder = myFolderName

        os.system("rm -rf " + self.testingFolder)
        os.system("mkdir " + self.testingFolder)

        for eachitem in self.listOfFiles:
            os.system("cp " + self.sourceFolder + eachitem + " " + self.testingFolder)

        for eachcommand in commandList:
            os.system(eachcommand)

        lenDic = IORobot.obtainLength(self.testingFolder, matchingContigFile)

        assert(len(lenDic) == ctexpected)
        os.system("rm -rf " + self.testingFolder)
Example No. 15
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph,
                   repeatFilename, optionToRun):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } 
    Algorithm: 
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    '''

    # ## (a) reachability test to find partners
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # G.reportEdge()
    lenDicCC = IORobot.obtainLength(folderName,
                                    contigFilename + "_Double.fasta")

    adjacencyList = [[] for i in range(len(lenDicCC))]

    N1 = len(lenDicCC)

    # # Debug
    # for i in range(14):
    #    debugGraphPath(i, 2, G, N1)
    # # End Debug

    for i in range(len(lenDicCC)):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G)
        print "i, adjacencyList[i] : ", i, adjacencyList[i]

    # ## (b) formation of bipartite graph
    if optionToRun == "tandem":
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase":
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName,
                                                   contigFilename)

    # cut here

    adjListToRepeatList(newAdjacencyList, folderName, repeatFilename)
Example No. 16
    def runningTestSet(self, myFolderName, ctexpected, commandList,
                       matchingContigFile):

        print "Integration test on RepeatPhaserMain:  " + myFolderName
        self.sourceFolder = myFolderName

        os.system("rm -rf " + self.testingFolder)
        os.system("mkdir " + self.testingFolder)

        for eachitem in self.listOfFiles:
            os.system("cp " + self.sourceFolder + eachitem + " " +
                      self.testingFolder)

        for eachcommand in commandList:
            os.system(eachcommand)

        lenDic = IORobot.obtainLength(self.testingFolder, matchingContigFile)

        assert (len(lenDic) == ctexpected)
        os.system("rm -rf " + self.testingFolder)
Example No. 17
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } 
    Algorithm: 
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    '''
    
    # ## (a) reachability test to find partners 
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # G.reportEdge()
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    
    adjacencyList = [[] for i in range(len(lenDicCC))]
    
    N1 = len(lenDicCC)
    
    
    # # Debug
    # for i in range(14):
    #    debugGraphPath(i, 2, G, N1)
    # # End Debug
    
    for i in range(len(lenDicCC)):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) 
        print "i, adjacencyList[i] : ", i , adjacencyList[i]
    
    # ## (b) formation of bipartite graph
    if optionToRun == "tandem" :
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase": 
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename)
    

    # cut here

    adjListToRepeatList(newAdjacencyList,folderName,repeatFilename )
Example No. 18
def abunSplitAdvResolve(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):

    """
    Algorithm: 
    1)Load ContigReadGraph and form xResolvedGraph
    2)Transitive reduction and remove double pointers
    3)Bipartite resolution
    4)xResolve 
    5)Form gapLookUp 
    6)Read contigs out from graph
    7)CheckAns and get it done today again... 
    """

    lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDic)

    Gnew = graphSurgery(myCountDic, folderName, contigReadGraph, mummerLink, readsetFilename, contigFilename)
    Gnew = BResolution(Gnew, folderName, contigReadGraph, N1, myCountDic, lenDic)
    XResolution(folderName, contigReadGraph, Gnew, myCountDic, lenDic, N1)

    readContigForAbunSplit(folderName, mummerLink, contigFilename, readsetFilename, N1, contigReadGraph)
Example No. 19
def checkPathLength(path, G, N1, folderName):
    
    lenDicRR = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta")
    sumLength = 0
    overlapLength = 0
    for index, i in zip(path, range(len(path))):
        header = "Read" + str((index - N1) / 2) + "_"
        if (index - N1) % 2 == 0:
            header = header + "p"
        else:
            header = header + "d"
        print "lenDicRR[header], ", lenDicRR[header], header 
        print (index - N1) * 2 + 1, (index - N1) * 2 + 2
        sumLength = sumLength + lenDicRR[header]
        
        if i != len(path) - 1:
            for eachnext in G.graphNodesList[index].listOfNextNodes:
                if eachnext[0] == path[i + 1]:
                    overlapLength = overlapLength + eachnext[1]
                    break 
    print sumLength, overlapLength, sumLength - overlapLength
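
The header construction above maps a graph node index back to a read name: reads sit at indices N1 and above, two nodes per read, with even offsets named "_p" and odd offsets "_d" (presumably the two strands in the *_Double files). A standalone sketch of that mapping:

def nodeToReadName(index, N1):
    # Read k occupies nodes N1 + 2k ("_p") and N1 + 2k + 1 ("_d")
    offset = index - N1
    return "Read" + str(offset // 2) + ("_p" if offset % 2 == 0 else "_d")

print(nodeToReadName(10, 10))   # Read0_p
print(nodeToReadName(13, 10))   # Read1_d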
Example No. 20
def checkPathLength(path, G, N1, folderName):

    lenDicRR = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta")
    sumLength = 0
    overlapLength = 0
    for index, i in zip(path, range(len(path))):
        header = "Read" + str((index - N1) / 2) + "_"
        if (index - N1) % 2 == 0:
            header = header + "p"
        else:
            header = header + "d"
        print "lenDicRR[header], ", lenDicRR[header], header
        print (index - N1) * 2 + 1, (index - N1) * 2 + 2
        sumLength = sumLength + lenDicRR[header]

        if i != len(path) - 1:
            for eachnext in G.graphNodesList[index].listOfNextNodes:
                if eachnext[0] == path[i + 1]:
                    overlapLength = overlapLength + eachnext[1]
                    break
    print sumLength, overlapLength, sumLength - overlapLength
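
The printed totals amount to: spanned length = sum of the read lengths on the path minus the overlaps stored on consecutive edges. A toy version of that bookkeeping with hard-coded numbers in place of the graph and length-dictionary lookups:

def pathSpan(readLengths, overlaps):
    # overlaps[i] is the overlap between read i and read i + 1 on the path
    assert len(overlaps) == len(readLengths) - 1
    return sum(readLengths) - sum(overlaps)

print(pathSpan([12000, 15000, 9000], [3000, 2500]))   # 30500 bases spanned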
Example No. 21
def generateGapContentLookup(folderName,
                             mummerLink,
                             oldResolvedList,
                             contigReadGraph,
                             contigFilename,
                             readsetFilename,
                             mapDummyToRealDic={}):
    gapContentLookUpList = []

    contigLenDic = IORobot.obtainLength(folderName, contigFilename + ".fasta")
    N1 = len(contigLenDic) * 2

    resolvedList = []

    print "mapDummyToRealDic", mapDummyToRealDic
    for eachmatchpair in oldResolvedList:
        tmpList = []

        if eachmatchpair[0] >= N1:
            tmpList = tmpList + mapDummyToRealDic[str(eachmatchpair[0] -
                                                      N1)][1]
        else:
            tmpList.append(eachmatchpair[0])

        if eachmatchpair[-1] >= N1:
            tmpList = tmpList + mapDummyToRealDic[str(eachmatchpair[-1] -
                                                      N1)][1]
        else:
            tmpList.append(eachmatchpair[-1])

        for ii in range(len(tmpList) - 1):
            resolvedList.append([tmpList[ii], tmpList[ii + 1]])

    gapContentLookUpList = abunGraphLib.parallelGapLookUp(
        resolvedList, folderName, N1, mummerLink, contigReadGraph,
        contigFilename, readsetFilename)

    return gapContentLookUpList
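
Endpoints with index >= N1 are dummy nodes; mapDummyToRealDic maps each one back to a list of real contig nodes, and the expanded chain is then broken into consecutive pairs for the gap lookup. A small standalone sketch of that expansion with hypothetical inputs (the dictionary values are indexed with [1] for the real-node list, as in the code above; the [0] entry here is just a placeholder):

def expandToPairs(matchPair, N1, mapDummyToRealDic):
    tmpList = []
    for end in (matchPair[0], matchPair[-1]):
        if end >= N1:
            tmpList = tmpList + mapDummyToRealDic[str(end - N1)][1]   # dummy -> real node list
        else:
            tmpList.append(end)
    return [[tmpList[ii], tmpList[ii + 1]] for ii in range(len(tmpList) - 1)]

# Hypothetical: 10 real nodes, dummy node 0 (graph index 10) stands for the chain [2, 7]
mapDummyToRealDic = {"0": ["placeholder", [2, 7]]}
print(expandToPairs([4, 10], 10, mapDummyToRealDic))   # [[4, 2], [2, 7]]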
Example No. 22
def test1():
	lenDic = {}
	coverageDic = {}
	
	lenDic = IORobot.obtainLength("/Users/kakitlam/", "abun.fasta")
	
	f = open("/Users/kakitlam/Documents/abundata", 'r')
	tmp = f.readline()
	
	while len(tmp) > 0:
		if len(tmp) > 10:
			myitem = tmp[0:-1].split()
			coverageDic[myitem[0]] = float(myitem[1])
		tmp = f.readline()
	
	f.close()
	
	myList = []
	baseCt = {}
	
	for eachitem in lenDic:
		myList.append(lenDic[eachitem]*coverageDic[eachitem])
		baseCt[eachitem] = lenDic[eachitem]*coverageDic[eachitem]
	
	
	for eachitem in lenDic :
		print eachitem,  baseCt[eachitem]
	
	
	
	for eachitem in lenDic :
		print eachitem, lenDic[eachitem]
	
	
	for eachitem in lenDic :
	        print eachitem, coverageDic[eachitem]
Example No. 23
def viewLenDic():
	
	folderName = "Apr10Test/"
	json_data = open(folderName + "myCountDic.json", 'r')
	myCountDic = json.load(json_data)
	
	contigLenDic = IORobot.obtainLength(folderName,  "LC_n.fasta")
	
	toPlotListX = []
	toPlotListY = []
	
	for eachitem in contigLenDic:
		toPlotListX.append(myCountDic[eachitem])
		toPlotListY.append(contigLenDic[eachitem])
	
	print toPlotListX, toPlotListY
	
	
	with open(folderName + "toPlotListX.json", 'w') as f:
		json.dump(toPlotListX, f)
		
	
	with open(folderName + "toPlotListY.json", 'w') as f:
		json.dump(toPlotListY, f)
Example No. 24
def continuousIntegration():
	if False:
		G = graphLib.seqGraph(10)
		for i in range(5):
			G.insertEdge(i,i+1,1997)
			G.insertEdge(i,i+2, 1997)

		resultList = abunGraphLib.BFS_revisit(1,3,G,1)

		print "resultList", resultList 

	if False : 

		folderName, mummerPath, directPathList, indirectPathList, contigFile, readFile = \
			"Apr10Test/", "/usr/bin/", [[1, 486, 217], [1, 8642, 217], [1, 13465, 217]], [[1, 486, 217]], "improved3_Double.fasta", "phasingSeedName_Double.fasta"

		abunGraphLib.formPathSeq(folderName, mummerPath, directPathList, indirectPathList, contigFile, readFile)
    
		if False:
			lenDic = IORobot.obtainLength(folderName , contigFile)
			N1 = len(lenDic)

			print "N1", N1

			G = graphLib.seqGraph(0)
			G.loadFromFile(folderName, "phaseStringGraph1")

			adj = [[] for i in range(N1)]

			for i in range(N1): 
			    adj[i] = abunGraphLib.findAllReachable(i, N1, G)

			Gnew = abunGraphLib.seqGraphDynamic(N1)

			for i in range(N1):
			    for j in adj[i]:
			        Gnew.insertEdge(i,j,1997)


			Gnew.initAdv()    
			Gnew.doubleEdgeReduction()

			contigPaths = abunGraphLib.findAllPathK(1, 217, Gnew, 3)
			contigReadPaths = abunGraphLib.findAllPathK(1, 217, G, 5)

			print "contigPaths", contigPaths
			print "contigReadPaths", contigReadPaths

			Gnew.transitiveReduction()

	if False:
		toDelete = abunGraphLib.decideCut("Apr10Test/", "/usr/bin/")
		print toDelete

	if False:
		G = graphLib.seqGraph(0)
		G.loadFromFile("Apr10TestA/", "xResolvedGraph")

		if False:
			for i in range(len(G.graphNodesList)):

				v = G.graphNodesList[i]

				if len(v.nodeIndexList) > 0:
					print i , v.listOfPrevNodes , v.listOfNextNodes

		G.reportEdge()
		lenDic = IORobot.obtainLength("Apr10TestA/", "improved3_Double.fasta")
		mylist = [401, 207, 405, 407, 344]

		json_data = open("Apr10TestA/" + "myCountDic.json", 'r')
		myCountDic = json.load(json_data)

		for x in mylist:
			print x, lenDic["Contig"+str(x/2)+"_p"], myCountDic["Segkk"+str(x/2)]


	if False:
		folderName = "Apr10TestA/"
		G = graphLib.seqGraph(0)
		G.loadFromFile(folderName , "xResolvedGraph")

		json_data = open(folderName + "mapDummyToRealDic.json", 'r')
		mapDummyToRealDic = json.load(json_data)

		lenDic = IORobot.obtainLength(folderName, "improved3_Double.fasta")
		print len(G.graphNodesList)
		print len(mapDummyToRealDic)
		
		print "fake N1 , real N1 ", len(G.graphNodesList) - len(mapDummyToRealDic), len(lenDic)


	if False:
		abunSplitter.mainFlow("Apr10TestB/", "/usr/bin/")

	if False: 
		nonRedundantResolver.removeEmbedded("Apr10TestD/", "/usr/bin/")

	if False:
		folderName, contigReadGraph = "Apr10TestA/", "phaseStringGraph1"
		G = graphLib.seqGraph(0)
		kthres, edgeThres = 3, 1
		G.loadFromFile(folderName, contigReadGraph)
		lenDic = IORobot.obtainLength(folderName , "improved3_Double.fasta")

		N1 = len(lenDic)

		adj = [[] for i in range(N1)]

		for i in range(N1): 
		    tmpList = abunGraphLib.findAllReachable(i, N1, G)
		    
		    for j in tmpList:
		        if len(abunGraphLib.findAllPathK(i,j,G,kthres)) >= edgeThres:
		            adj[i].append(j) 

		    #print i, adj[i]

	    ### Filter adaptor skipped case 

		adaptorPair = []

		for i in range(len(adj)):
		    if  i % 2 == 0:
		        if i + 1 in adj[i]:
		            adj[i].remove(i+1)
		            adaptorPair.append([i, i+1])
		    elif i % 2 ==1: 
		        if i-1 in adj[i] :
		            adj[i].remove(i-1)
		            adaptorPair.append([i, i-1])

		Gnew = abunGraphLib.seqGraphDynamic(N1)

		for i in range(N1):
		    for j in adj[i]:
		        Gnew.insertEdge(i,j,1997)

		for eachpair in adaptorPair:
		    u, v = eachpair[0], eachpair[1]
		    for x in Gnew.graphNodesList[u].listOfPrevNodes:
		        xIndex = x[0]
		        Gnew.removeEdge(xIndex, v)
		    for y in Gnew.graphNodesList[v].listOfNextNodes:
		        yIndex = y[0]
		        Gnew.removeEdge(u, yIndex)


        #Gnew.reportEdge()
		count2 = 0
		for i in range(len(Gnew.graphNodesList)):
			if  len(Gnew.graphNodesList[i].listOfPrevNodes) == 2 and  len(Gnew.graphNodesList[i].listOfNextNodes) == 2:
				count2 = count2 + 1
				print str(i)+"{color:red}"

		print "count2, ", count2

		### End filter adaptor skipped case 
	if True:
		nonRedundantResolver.removeRedundantWithFile("May11TestB/" , "/usr/bin/", "abun", "abunDebug", "abunNoEmbed")
Example No. 25
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName):
    
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm: 
        a) Form double reads and contigs                            V
        b) Mummer the data and extract dataList three times         V
        c) Use the subroutine to output a graph                     V
        d) Output the graph to a file phasing_String_graph.graph    V
    '''

    G = []

    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads")
    
    
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta" , contigFilename + "_Double.fasta"
    if True:
        alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)

    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)
    
    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta" , readsetFilename + "_Double.fasta"
    
    
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")
    
    if not abunHouseKeeper.abunGlobalRRDisable:
        if True:
            alignerSubRoutine(folderName ,referenceFile,  queryFile, mummerLink, header )
    
        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        for eachitem in dataListRR:
            if eachitem[-2] == "Read164_p" and eachitem[-1] == "Read159_p" :    
                print "debug" , eachitem
            if eachitem[-1] == "Read164_p" and eachitem[-2] == "Read159_p" :    
                print "debug" , eachitem
            
        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)

    else:
        dataListRR = []
    
    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta" , readsetFilename + "_Double.fasta"
    if True:
        alignerSubRoutine(folderName ,referenceFile,  queryFile, mummerLink, header )
        #alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
            
    numberOfNodes = len(lenDicCR) 
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    
    '''
    
    # print dataListCC[0]
    # print dataListRR[0]
    # print dataListCR[0]
    
    # for eachitem in dataListCC:
    #    print eachitem
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    # for eachitem in dataListRR[0:10]:
    #    print eachitem , lenDicRR[eachitem[-2]], lenDicRR[eachitem[-1]]
    
    addDataToList(dataListRR, G, N1, N1, 'R', 'R')
    
    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # G.reportEdge()
    G.saveToFile(folderName, graphName)
    
    checkGraphLength(G, N1, lenDicRR)
    
    # print len(G.graphNodesList[0].listOfPrevNodes), len(G.graphNodesList[0].listOfNextNodes)
    print "len(G.graphNodesList)", len(G.graphNodesList)
    
    
    
Example No. 26
def getAllAssociatedReads(folderName, mummerLink,forFastaName):
    '''
    Input : relatedReads.fasta, raw_reads.fasta 
    Output : all_associated_reads.fasta
    
     Algorithm : 
        a) Get all the associated reads
        b) Loop for N=1 times : ==> this correspond 4 reads to link between the bridge in total
            i) Align the raws and tmp_seedReads
            ii) Put the new reads into the SeedReads
    '''
    
    header, referenceFile, queryFile = "seedReads", forFastaName + ".fasta" , "raw_reads.fasta"
    command = "cp " + folderName + "relatedReads.fasta " + folderName + referenceFile
    os.system(command)
    
    N = abunHouseKeeper.abunGlobalReadSearchDepth
    
    print "N: ", N
    if N >0 :
        for trial in range(N):
            print "trial", trial
            numberOfFiles = 20
            
            if True: 
                workerList = []
                
                for dummyI in range(1, numberOfFiles + 1):
                    indexOfMum = ""
                    if dummyI < 10:
                        indexOfMum = "0" + str(dummyI)
                    else:
                        indexOfMum = str(dummyI)
                    
                    outputName, referenceName, queryName, specialName= header+indexOfMum, referenceFile, "raw_reads.part-"+ indexOfMum + ".fasta",  header + indexOfMum
                    workerList.append([outputName, referenceName, queryName, specialName])
    
                alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,False)
            
            dataList = []
            
            for i in range(1, 1+numberOfFiles): 
                if i < 10:
                    indexOfMum = "0" + str(i)
                else:
                    indexOfMum = str(i)
                dataList = dataList+ alignerRobot.extractMumData(folderName, header+ str(indexOfMum)+"Out")
            
            
            filterList = []
            
            lenDicRR = IORobot.obtainLength(folderName, queryFile)
            
            print "len(dataList)", len(dataList)
            for eachitem in dataList:
                if checkSatisfy(eachitem, lenDicRR):
                    filterList.append(eachitem)
                
            filterList.sort(key=itemgetter(-1))
            newReads = []
            
            for key, items in groupby(filterList, itemgetter(-1)):
                newReads.append(key)
                                        
            
            f = open(folderName + forFastaName + ".txt", 'w')
            
            for eachitem in newReads:
                f.write(eachitem + "\n")
            f.close()
                
            command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + forFastaName + ".txt " + folderName + "raw_reads.fasta > " + folderName + forFastaName + ".fasta"
            os.system(command)
    else:
        os.system("cp " + folderName + "relatedReads.fasta " + folderName + forFastaName + ".fasta")
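
The perl one-liner near the end pulls the reads listed in forFastaName.txt out of raw_reads.fasta by record name. A rough pure-Python equivalent of that extraction step (a sketch only; the pipeline itself relies on the perl command above):

def extractNamedReads(namesFile, fastaFile, outFile):
    with open(namesFile) as f:
        wanted = set(line.strip() for line in f if line.strip())
    with open(fastaFile) as fin, open(outFile, 'w') as fout:
        keep = False
        for line in fin:
            if line.startswith(">"):
                tokens = line[1:].split()
                keep = bool(tokens) and tokens[0] in wanted   # match the first header token
            if keep:
                fout.write(line)

# Hypothetical usage mirroring the perl command:
# extractNamedReads(folderName + forFastaName + ".txt",
#                   folderName + "raw_reads.fasta",
#                   folderName + forFastaName + ".fasta")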
Example No. 27
def defineRepeatAndFlanking(folderName, mummerLink,contigFilename,contigReadGraph,repeatFilename,repeatSpec ):
    '''
    Input : 
V        a) String graph : G                
V        b) Repeat Pairing : repeatList     
        
    Output : 
V        a) chain of repeat indices (e.g. [S= R1, R33, R45, R24= E]) 
V        b) chain of flanking region indices for in1/2 out1/2 middle (e.g. [C1, R2, R4] )
V        c) in1/2 out1/2 and middle reads per repeat (e.g. [R1, R33, R45, R24])  
        
    Algorithm : 
V        1. Find repeat by graph operations
V        2. Find flanking region by graph operations
V        3. Find associated reads by graph operations
    '''
    
    print "defineRepeatAndFlanking: "


    
    
    # 0. Load previous data
    G = abunGraphLib.seqGraphWt(0)
    G.loadFromFile(folderName, contigReadGraph)
    Grev = abunGraphLib.formReverseGraph(G)
    
    json_data = open(folderName + repeatFilename, 'r')
    repeatList = json.load(json_data)
    
    lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta")
    N1 = len(lenDicCC)
    
    
    print "repeatList: ", repeatList
    print "len(G.graphNodesList)", len(G.graphNodesList)
     
    bigDumpList = []
    
    print "len(repeatList)", len(repeatList) , repeatList
    for r in repeatList:
        rIn, rOut = [], []
        for eachitem in r[0]:
            rIn.append(eachitem / 2)
        for eachitem in r[1]:
            rOut.append(eachitem / 2)
        
        if ( len(rIn) == 2 and len(rOut) == 2) or (len(rIn) == 1 and len(rOut) == 1):
            print rIn, rOut
            if  (len(rIn) == 1 and len(rOut) == 1):
                rIn = [rIn[0], rIn[0]]
                rOut = [rOut[0], rOut[0]]
            
            # 1. Records reachable indices
            kkIn , kkOut = [], []
            for eachkk in rIn:
                kkIn.append(str(eachkk)+"_"+"in")
            
            for eachkk in rOut:
                kkOut.append(str(eachkk)+"_"+"out")
                
            
            abunGraphLib.markReachableIndices(G, Grev, kkIn, kkOut, N1)
            
            # 2. Marks inside nodes
            singleMissList, allPassList = abunGraphLib.markInsideNodes(G, kkIn, kkOut)
            for i in range(4): 
                print "len(singleMissList[i]), len(allPassList)", len(singleMissList[i]), len(allPassList)

            # 3. Finds start/end of repeat
            myStartIndex, myEndIndex = abunGraphLib.markStartEndNodes(G, rIn, rOut, singleMissList, allPassList)
            print myStartIndex, myEndIndex
            
            # 4. Find repeat interior by shortest path joining S/E
            repeatPathway = abunGraphLib.markInterior(G , myStartIndex, myEndIndex, N1)
            print "repeatPathway", repeatPathway
            #checkPathLength(repeatPathway, G, N1, folderName)
            
            # 5. Find flanking region by shortest path search again
            flankingPathsList = abunGraphLib.markFlankingRegion(G, rIn, rOut, myStartIndex, myEndIndex, N1)
            print flankingPathsList
            
            # 6. Find associated reads by graph node query
            flankingList, repeatList = abunGraphLib.markAssociatedReads(G, singleMissList, allPassList)
            
            # ## Experimental
            repeatList = allPassList
            
            # ## End Experimental
            for eachlist in flankingList:
                print len(eachlist), len(repeatList)
            
            bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList])
        

     


    # 7. Format return and move on to the phasing 
    with open(folderName + repeatSpec, 'w') as outfile:
        json.dump(bigDumpList, outfile)
Example No. 28
def resolvingTandem(folderName, mummerPath, contigReadGraph,contigFilename, readsetFilename, optTypeFileHeader, repeatSpec):
    print "resolvingTandem"
    '''
    Input : repeat info 
    Output : count, join. 
    
    Algorithm: 
    1. Find loops
    2. Form repeat
    3. Form chain of repeat copies back to back
    4. Align reads
    5. Calculate extra bases beyond flanking region
    6. Calculate count
    7. Join the contigs
    '''
    # 0 ) Load all the data
    thres = 5 
    
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta")
    N1 = len(lenDicCC)

    maxDuplicate = 10
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"
    


    myContigsDic = IORobot.loadContigsFromFile(folderName, readsetFilename+"_Double.fasta")    
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")
    
    header = optTypeFileHeader + "RR"
    dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
    dataListRRDic = {}
    for eachitem in dataListRR: 
        if eachitem[2] < thres:
            dataListRRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    header = optTypeFileHeader + "CR"
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    dataListCRDic = {}
    for eachitem in dataListCR: 
        if eachitem[2] < thres:
            dataListCRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    print dataListCRDic



    json_data = open(folderName + repeatSpec, 'r')
    loadData = json.load(json_data)
    
    contigsTmp = IORobot.loadContigsFromFile(folderName, contigFilename+"_Double.fasta")
    readTmp = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")

    happyTandemList = {}
    
    
    
    for eachrepProfile in loadData:
        # 1) 
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)
       
        # 2) 
        if isTerminate:
            v = returnPathList[-1]
            i =0 
            tandemPath = []
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    i = len(returnPathList)
                i = i +1
                
            print returnPathList
            print tandemPath
        # 3) [fix it when have time later ; to just use graph; bug at the min thing]
        
        repeatContent = ""
    
        for kk in range(len(tandemPath[0:-1])): 
            eachitem = tandemPath[kk]- N1
            nextitem = tandemPath[kk+1] - N1
            readName = "Read" + str(eachitem/2) + "_"
            nextReadName = "Read" + str(nextitem/2) + "_"
            if eachitem %2 ==0 :
                readName = readName + "p"
            elif eachitem %2 ==1:
                readName = readName + "d"
            
            if nextitem %2 ==0 :
                nextReadName = nextReadName + "p"
            elif nextitem %2 ==1:
                nextReadName = nextReadName + "d"
            
            overlap = dataListRRDic[readName + ";" + nextReadName]
            print overlap
            repeatContent = repeatContent +  myContigsDic[readName][0:-overlap]
            
        print "len(repeatContent)", len(repeatContent)
        
        fout = open(folderName + repeatTempFilename, 'w')
        fout.write(">RepeatSegment\n")
        repeatContentLarge = ""
        
        for i in range(maxDuplicate):
            fout.write(repeatContent)
            repeatContentLarge= repeatContentLarge + repeatContent
        fout.close()
        
        # 4)
        repeatReadList =  eachrepProfile[1]
        
        myList= []
        for eachitem in repeatReadList:
            
            readName = "Read" + str((eachitem- N1)/2) + "_"
    
            if eachitem %2 ==0 :
                readName = readName + "p"
            elif eachitem %2 ==1:
                readName = readName + "d"
            myList.append(readName)
            
        IORobot.putListToFileO(folderName, readsetFilename+"_Double.fasta", "toAlignReads", myList)
        
        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName,mummerFile , repeatTempFilename, "toAlignReads.fasta")
        
        dataList = alignerRobot.extractMumData(folderName, mummerFile+"Out")
        
        
        # 5)
        totalBasesMatch = 0
        lrepeat = len(repeatContent)
        c = findCoverageFromRawData(folderName)
        
        
        # print "dataList[0]", dataList[0]
        dataList.sort(key = itemgetter(-1))
        for key, values in  groupby(dataList,itemgetter(-1)):
            maxValue = -1
            for eachsub in values:
                if eachsub[5] > maxValue:
                    maxValue = eachsub[5]
    
            #print key, maxValue
            totalBasesMatch = totalBasesMatch + maxValue
        
    
        print c, lrepeat, totalBasesMatch
        ct = totalBasesMatch*1.0/(c*lrepeat)
        print "BIG NUMBER of THE DAY: ", ct
    
        # 6) 
        # a) find the starting point 
        startContig = eachrepProfile[-1][0][0]
        firstRead = eachrepProfile[-1][0][1]-N1

        contigName = "Contig"+ str(startContig/2)
        if startContig %2 == 0:
            contigName = contigName + "_p"
        elif startContig%2 ==1:
            contigName = contigName + "_d"
        
        readName = "Read"+ str(firstRead/2)
        if firstRead %2 == 0:
            readName = readName + "_p"
        elif firstRead%2 ==1:
            readName = readName + "_d"
        
        overlapFirst = dataListCRDic[contigName+";"+readName]
        tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]
        
        f1 = open(folderName + "firstOverlap.fasta", 'w')
        f1.write(">combined\n")
        f1.write(tmpCombine)
        f1.close()
        
        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName,"myFirstOverlap" , repeatTempFilename, "firstOverlap.fasta")
        
        dataList = alignerRobot.extractMumData(folderName, "myFirstOverlap"+"Out")
        
        dataList.sort(key = itemgetter(0))
        maxVal = -1
        maxItm = []
        for eachi in dataList:
            if eachi[5] > maxVal:
                maxVal = eachi[5]
                maxItm = eachi
        
        print maxItm
        if len(maxItm) > 0 :
            repeatStart = maxItm[0]
            contigEnd = maxItm[2]
        else:
            repeatStart = 0
            contigEnd = -1
        # b) format return : prepare the repeat template 
        print "ct*lrepeat", int(repeatStart + ct*lrepeat)
        print "repeatStart", repeatStart
        happyTandemList[contigName]= repeatContentLarge[repeatStart:int(repeatStart + ct*lrepeat)]
        contigsTmp[contigName] = tmpCombine[0:contigEnd]
        print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
        print "len(happyTandemList[contigName])", len(happyTandemList[contigName])
        
    # 7) Combine all the repeat information and do the join
    
    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]
        
        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)
        
        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]
        
        if endContig != leaderContig:
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig
        
    
    leaderAgg = [[] for i in range(len(leaderList))]
    for i in range(len(leaderList)):
        leaderAgg[leaderList[i]].append(i) 
    
    checkingList = [False for i in range(N1)]
    
    fout = open(folderName + "tademResolved.fasta", 'w')
    
    counter = 0
    for eachcontig in contigsTmp:
        id = abunHouseKeeper.parseEdgeNameToID(eachcontig, 'C')
        if checkingList[id/2] == False:
        
            fout.write(">Segkk"+str(counter)+ "\n")
            
            fout.write(contigsTmp[eachcontig])
            counter = counter + 1    
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk/2] = True
    
    fout.close()
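
The copy-count estimate in step 5 is ct = totalBasesMatch / (c * lrepeat): the read bases that align to the repeat template, divided by the bases a single copy would attract at coverage c. A one-off numeric illustration with made-up figures:

c, lrepeat, totalBasesMatch = 30.0, 4000, 360000.0   # hypothetical coverage, repeat length, aligned bases
ct = totalBasesMatch * 1.0 / (c * lrepeat)
print(ct)   # 3.0 -> the repeat is present in roughly three tandem copies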
Example No. 29
def formReadContigStringGraph(folderName,
                              mummerLink,
                              contigFilename,
                              readsetFilename,
                              optTypeFileHeader,
                              graphName,
                              needAlignment=True):
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm: 
        a) Form double reads and contigs                            V
        b) Mummer the data and extract dataList three times         V
        c) Use the subroutine to output a graph                     V
        d) Output the graph to a file phasing_String_graph.graph    V
    '''

    G = []

    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta",
                                contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta",
                                readsetFilename + "_Double.fasta", "reads")

    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta"

    #if needAlignment:
    #    alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    if needAlignment:
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName, [[header, referenceFile, queryFile, ""]],
            houseKeeper.globalParallel)

    lenDicCC = IORobot.obtainLength(folderName,
                                    contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)

    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta"

    lenDicRR = IORobot.obtainLength(folderName,
                                    readsetFilename + "_Double.fasta")

    if not abunHouseKeeper.abunGlobalRRDisable:
        if needAlignment:
            alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                              header)

        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)

    else:
        dataListRR = []

    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    if needAlignment:
        alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                          header)

    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)

    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    
    '''

    addDataToList(dataListCC, G, 0, 0, 'C', 'C')

    addDataToList(dataListRR, G, N1, N1, 'R', 'R')

    addDataToList(dataListCR, G, 0, N1, 'C', 'R')

    Gnew = formExtraEdges(folderName, optTypeFileHeader, contigFilename, G, N1)

    Gnew.saveToFile(folderName, graphName)

    print "len(Gnew.graphNodesList)", len(Gnew.graphNodesList)
Example No. 30
import matplotlib.pyplot as plt
from finisherSCCoreLib import IORobot

lenDic = {}
coverageDic = {}

lenDic = IORobot.obtainLength("/Users/kakitlam/", "abun.fasta")

f = open("/Users/kakitlam/Documents/abundata", 'r')
tmp = f.readline()

while len(tmp) > 0:
	if len(tmp) > 10:
		myitem = tmp[0:-1].split()
		coverageDic[myitem[0]] = float(myitem[1])
	tmp = f.readline()

f.close()

myList = []
baseCt = {}

for eachitem in lenDic:
	myList.append(lenDic[eachitem]*coverageDic[eachitem])
	baseCt[eachitem] = lenDic[eachitem]*coverageDic[eachitem]


for eachitem in lenDic :
	print eachitem,  baseCt[eachitem]

Example No. 31
def generateAbundanceGraph(folderName, mummerLink):

    print "generateAbundanceGraph"

    """
    1. Find your favorite mappers to map read back
        a. MUMmer, Bowtie, bbmap, any that works V 
        b. And then write a short parser to parse the results V 
    """
    numberOfFiles = 20
    workerList = []
    for dummyI in range(1, numberOfFiles + 1):
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)

        """
        "outGapFillRefine"+indexOfMum , "smaller_improvedContig.fasta",  "relatedReads_Double.part-" + indexOfMum + ".fasta",  "fromMumRefine" + indexOfMum
        """
        outputName, referenceName, queryName, specialName = (
            "outAbun" + indexOfMum,
            "improved3.fasta",
            "raw_reads.part-" + indexOfMum + ".fasta",
            "outAbun" + indexOfMum,
        )
        workerList.append([outputName, referenceName, queryName, specialName])

    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, False)
        """
        command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + "out " + folderName + "improved3.fasta "+folderName+"raw_reads.part-" + indexOfMum + ".fasta"
        os.system(command)
    
        command = mummerLink + "show-coords -r " + folderName + "out.delta > " + folderName + "fromMumAbun" + indexOfMum
        os.system(command)
        """

    dataList = []

    for i in range(1, 1 + numberOfFiles):
        if i < 10:
            indexOfMum = "0" + str(i)
        else:
            indexOfMum = str(i)
        dataList = dataList + alignerRobot.extractMumData(folderName, "outAbun" + str(indexOfMum) + "Out")

    """
    2. Calculate count on the abundances 
        a. Aggregate by taking average [put weights on bin along contigs]
        b. Inheritance and a subclass 
    """

    lenDic = IORobot.obtainLength(folderName, "improved3.fasta")
    readLenDic = IORobot.obtainLength(folderName, "raw_reads.fasta")

    myCountDic = {}
    for eachitem in lenDic:
        myCountDic[eachitem] = [0 for i in range(lenDic[eachitem])]

    thres = 30
    lenSum = 0
    extraDataList = []

    print "len(dataList)", len(dataList)

    if not abunHouseKeeper.abunGlobalAvoidrefine:
        myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, True)
        extraDataList = alignerRobot.extractMumData(folderName, "abunMissOut")
    else:
        extraDataList = []

    dataList = dataList + extraDataList
    myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, False)

    with open(folderName + "myCountDic.json", "w") as f:
        json.dump(myCountDic, f)

    return myCountDic
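
The chunk index is padded to two digits ("01" ... "20") so the job names line up with the raw_reads.part-XX.fasta split files. A compact sketch of building that worker list (the file names are the ones used above; the helper itself is illustrative):

def buildWorkerList(numberOfFiles=20, referenceName="improved3.fasta", prefix="outAbun"):
    workerList = []
    for dummyI in range(1, numberOfFiles + 1):
        indexOfMum = "%02d" % dummyI                      # "01", "02", ..., "20"
        queryName = "raw_reads.part-" + indexOfMum + ".fasta"
        workerList.append([prefix + indexOfMum, referenceName, queryName, prefix + indexOfMum])
    return workerList

print(buildWorkerList(2))
# [['outAbun01', 'improved3.fasta', 'raw_reads.part-01.fasta', 'outAbun01'],
#  ['outAbun02', 'improved3.fasta', 'raw_reads.part-02.fasta', 'outAbun02']]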
Example No. 32
def identifyRepeat(folderName, mummerLink,contigFilename,contigReadGraph, repeatFilename, optionToRun  ):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } 
    Algorithm: 
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    '''
    
    # ## (a) reachability test to find partners 
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # G.reportEdge()
    lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta")
    
    adjacencyList = [[] for i in range(len(lenDicCC))]
    
    N1 = len(lenDicCC)
    
    
    # # Debug
    # for i in range(14):
    #    debugGraphPath(i, 2, G, N1)
    # # End Debug
    
    for i in range(len(lenDicCC)):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) 
        print "i, adjacencyList[i] : ", i , adjacencyList[i]
    
    # ## (b) formation of bipartite graph
    if optionToRun == "tandem" :
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase": 
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename)
    
    G2 = abunGraphLib.seqGraphWt(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    clusters = G2.findConnectedComponents()
    
    repeatList = []
    for eachitem in clusters:
        leftList, rightList = [], []
        for eachsubitem in eachitem:
            if eachsubitem % 2 == 0 :
                leftList.append(eachsubitem)
            else:
                rightList.append(eachsubitem)
                
        
        repeatList.append([abunHouseKeeper.getDistinct(leftList), abunHouseKeeper.getDistinct(rightList)])
           
    with open(folderName + repeatFilename, 'w') as outfile:
        json.dump(repeatList, outfile)

    
    json_data = open(folderName + repeatFilename, 'r')
    loadData = json.load(json_data)
    
    
    assert(loadData == repeatList)
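
The bipartite construction above doubles the index space: contig i's outgoing side becomes node 2*i, contig j's incoming side becomes node 2*j + 1, each reachability edge i -> j is added in both directions, and connected components then group contigs that share a repeat junction. A standalone sketch of that grouping, with a plain BFS standing in for seqGraphWt.findConnectedComponents:

from collections import deque

def repeatGroups(newAdjacencyList):
    N1 = len(newAdjacencyList)
    bip = [[] for _ in range(2 * N1)]
    for i in range(N1):
        for j in newAdjacencyList[i]:
            bip[2 * i].append(2 * j + 1)      # out-side of contig i <-> in-side of contig j
            bip[2 * j + 1].append(2 * i)

    seen, repeatList = set(), []
    for s in range(2 * N1):
        if s in seen or not bip[s]:
            continue
        comp, queue = [], deque([s])
        seen.add(s)
        while queue:
            u = queue.popleft()
            comp.append(u)
            for v in bip[u]:
                if v not in seen:
                    seen.add(v)
                    queue.append(v)
        leftList = sorted(u for u in comp if u % 2 == 0)
        rightList = sorted(u for u in comp if u % 2 == 1)
        repeatList.append([leftList, rightList])
    return repeatList

# Hypothetical reachability: contigs 0 and 1 both reach contig 2 -> one shared junction
print(repeatGroups([[2], [2], []]))   # [[[0, 2], [5]]]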
    
Example No. 33
def performPhasing(folderName, mummerLink):
    print "performPhasing"
    '''
    1. Interface from alignmentBridge.py : 
        shortToLongMap = formRelatedMap(f2, noisyReads, currentNode, indelRobot, toProcessList)
        cleaner.cleaning([noisyReads,noisyReads] ,shortToLongMap, toProcessList,indelRobot, "init")
        in1List, in2List, out1List, out2List, commonList, longReadToUse  = cleaner.cleaning([noisyReads, noisyReads],shortToLongMap, toProcessList,indelRobot, "vote")
        extendResult = extender.readExtender(in1List, in2List, out1List, out2List, commonList,indelRobot,longReadToUse, True)
    
    2. Format of input data data : 
        bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList])
    
    3. IO : 
        a) Input :
            repeatSpecification.txt, phasingSeedName_Double.fasta, graph G 
        b) Output :
            improved4.fasta
            
    3. Algorithm: 
        a) reformatNoisyReads 
        b) reformatToProcessList
        c) formShortToLongMapping
    
    '''

    json_data = open(folderName + 'repeatSpecification.txt', 'r')
    loadData = json.load(json_data)
    
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, "phaseStringGraph1")
    
    lenDicRR = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta")
    
    lenDicCC = IORobot.obtainLength(folderName, "improved3_Double.fasta")
    N1 = len(lenDicCC)
    
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    
    loadData = filterReverseComp(loadData, N1)
    
    toPhaseList = []
    
    if True:
        for eachitem in loadData:
            # print eachitem
            flankingList, repeatList, repeatPathway, flankingPathsList = eachitem[0], eachitem[1], eachitem[2], eachitem[3] 
            
            noisyReads, dicToOriginal, dicFromOriginal = reformatNoisyReads(folderName, flankingList, repeatList, N1)
            
            toProcessList = reformatToProcessList(folderName , flankingList, repeatList, dicFromOriginal, N1)
    
            shortToLongMap = formShortToLongMapping(folderName, G, toProcessList, dicFromOriginal, dicToOriginal, lenDicCR, N1)
            
            indelRobot = createIndelRobot(folderName)
            
            cleaner.cleaning([noisyReads, noisyReads] , shortToLongMap, toProcessList, indelRobot, "init")
            in1List, in2List, out1List, out2List, commonList, longReadToUse = cleaner.cleaning([noisyReads, noisyReads], shortToLongMap, toProcessList, indelRobot, "vote")
            extendResult = extender.readExtender(in1List, in2List, out1List, out2List, commonList, indelRobot, longReadToUse, True)
            
            if extendResult != -1:
                print "extendResult: ", extendResult
                toPhaseList.append(eachitem + [extendResult])
            
        with open(folderName + 'toPhaseList.txt', 'w') as outfile:
            json.dump(toPhaseList, outfile)

    json_data = open(folderName + 'toPhaseList.txt', 'r')
    toPhaseList = json.load(json_data)
    
    outputResults(folderName, mummerLink, toPhaseList, N1, G)
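
The examples in this listing repeatedly convert a doubled-graph node index back into a sequence name by hand (Contig<k>_p / _d for even/odd contig nodes, Read<k>_p / _d for read nodes offset by N1). A small helper capturing that convention might look like the sketch below; it is not part of the original module.

def nodeIndexToName(index, N1):
    # Sketch only: mirrors the naming used in resolvingTandem below.
    # Doubled contigs sit at indices [0, N1); doubled reads start at N1.
    if index < N1:
        prefix, local = "Contig", index
    else:
        prefix, local = "Read", index - N1
    strand = "p" if local % 2 == 0 else "d"
    return prefix + str(local / 2) + "_" + strand

# With N1 = 6 doubled contigs: node 5 is Contig2_d, node 6 is Read0_p.
print nodeIndexToName(5, 6), nodeIndexToName(6, 6)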
Ejemplo n.º 34
0
def resolvingTandem(
    folderName, mummerPath, contigReadGraph, contigFilename, readsetFilename, optTypeFileHeader, repeatSpec
):
    print "resolvingTandem"
    """
    Input : repeat info 
    Output : count, join. 
    
    Algorithm: 
    1. Find loops
    2. Form repeat
    3. Form chain of repeat copies back to back
    4. Align reads
    5. Calculate extra bases beyond flanking region
    6. Calculate count
    7. Join the contigs
    """
    # 0 ) Load all the data
    thres = 5

    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDicCC)

    maxDuplicate = 10
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"

    myContigsDic = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")

    header = optTypeFileHeader + "RR"
    dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
    dataListRRDic = {}
    for eachitem in dataListRR:
        if eachitem[2] < thres:
            dataListRRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4]

    header = optTypeFileHeader + "CR"
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())

    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    dataListCRDic = {}
    for eachitem in dataListCR:
        if eachitem[2] < thres:
            dataListCRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4]

    print dataListCRDic

    json_data = open(folderName + repeatSpec, "r")
    loadData = json.load(json_data)

    contigsTmp = IORobot.loadContigsFromFile(folderName, contigFilename + "_Double.fasta")
    readTmp = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")

    happyTandemList = {}

    for eachrepProfile in loadData:
        # 1)
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)

        # 2)
        if isTerminate:
            v = returnPathList[-1]
            i = 0
            tandemPath = []
            # The cycle is the suffix of returnPathList starting at the first
            # occurrence of the terminal node v (which is revisited at the end).
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    i = len(returnPathList)
                i = i + 1

            print returnPathList
            print tandemPath
        # 3) [TODO: fix later; should use the graph directly; known bug in the min computation]

        repeatContent = ""

        # Walk the tandem cycle read by read, appending each read minus its
        # overlap with the next read on the cycle.
        for kk in range(len(tandemPath[0:-1])):
            eachitem = tandemPath[kk] - N1
            nextitem = tandemPath[kk + 1] - N1
            readName = "Read" + str(eachitem / 2) + "_"
            nextReadName = "Read" + str(nextitem / 2) + "_"
            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"

            if nextitem % 2 == 0:
                nextReadName = nextReadName + "p"
            elif nextitem % 2 == 1:
                nextReadName = nextReadName + "d"

            overlap = dataListRRDic[readName + ";" + nextReadName]
            print overlap
            repeatContent = repeatContent + myContigsDic[readName][0:-overlap]

        print "len(repeatContent)", len(repeatContent)

        fout = open(folderName + repeatTempFilename, "w")
        fout.write(">RepeatSegment\n")
        repeatContentLarge = ""

        for i in range(maxDuplicate):
            fout.write(repeatContent)
            repeatContentLarge = repeatContentLarge + repeatContent
        fout.close()

        # 4)
        repeatReadList = eachrepProfile[1]

        myList = []
        for eachitem in repeatReadList:

            readName = "Read" + str((eachitem - N1) / 2) + "_"

            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"
            myList.append(readName)

        IORobot.putListToFileO(folderName, readsetFilename + "_Double.fasta", "toAlignReads", myList)

        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName, mummerFile, repeatTempFilename, "toAlignReads.fasta")

        dataList = alignerRobot.extractMumData(folderName, mummerFile + "Out")

        # 5)
        totalBasesMatch = 0
        lrepeat = len(repeatContent)
        c = findCoverageFromRawData(folderName)

        # print "dataList[0]", dataList[0]
        dataList.sort(key=itemgetter(-1))
        for key, values in groupby(dataList, itemgetter(-1)):
            maxValue = -1
            for eachsub in values:
                if eachsub[5] > maxValue:
                    maxValue = eachsub[5]

            # print key, maxValue
            totalBasesMatch = totalBasesMatch + maxValue

        print c, lrepeat, totalBasesMatch
        ct = totalBasesMatch * 1.0 / (c * lrepeat)
        print "BIG NUMBER of THE DAY: ", ct

        # 6)
        # a) find the starting point
        startContig = eachrepProfile[-1][0][0]
        firstRead = eachrepProfile[-1][0][1] - N1

        contigName = "Contig" + str(startContig / 2)
        if startContig % 2 == 0:
            contigName = contigName + "_p"
        elif startContig % 2 == 1:
            contigName = contigName + "_d"

        readName = "Read" + str(firstRead / 2)
        if firstRead % 2 == 0:
            readName = readName + "_p"
        elif firstRead % 2 == 1:
            readName = readName + "_d"

        overlapFirst = dataListCRDic[contigName + ";" + readName]
        tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]

        f1 = open(folderName + "firstOverlap.fasta", "w")
        f1.write(">combined\n")
        f1.write(tmpCombine)
        f1.close()

        if True:
            alignerRobot.useMummerAlign(
                mummerPath, folderName, "myFirstOverlap", repeatTempFilename, "firstOverlap.fasta"
            )

        dataList = alignerRobot.extractMumData(folderName, "myFirstOverlap" + "Out")

        dataList.sort(key=itemgetter(0))
        maxVal = -1
        maxItm = []
        for eachi in dataList:
            if eachi[5] > maxVal:
                maxVal = eachi[5]
                maxItm = eachi

        print maxItm
        if len(maxItm) > 0:
            repeatStart = maxItm[0]
            contigEnd = maxItm[2]
        else:
            repeatStart = 0
            contigEnd = -1
        # b) format return : prepare the repeat template
        print "ct*lrepeat", int(repeatStart + ct * lrepeat)
        print "repeatStart", repeatStart
        happyTandemList[contigName] = repeatContentLarge[repeatStart : int(repeatStart + ct * lrepeat)]
        contigsTmp[contigName] = tmpCombine[0:contigEnd]
        print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
        print "len(happyTandemList[contigName])", len(happyTandemList[contigName])

    # 7) Combine all the repeat information and do the join

    # leaderList[i] holds the representative ("leader") contig that contig i
    # has been merged into; initially every contig is its own leader.
    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]

        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)

        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]

        if endContig != leaderContig:
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig

    leaderAgg = [[] for i in range(len(leaderList))]
    for i in range(len(leaderList)):
        leaderAgg[leaderList[i]].append(i)

    checkingList = [False for i in range(N1)]

    fout = open(folderName + "tademResolved.fasta", "w")

    counter = 0
    for eachcontig in contigsTmp:
        id = abunHouseKeeper.parseEdgeNameToID(eachcontig, "C")
        if checkingList[id / 2] == False:

            fout.write(">Segkk" + str(counter) + "\n")

            fout.write(contigsTmp[eachcontig])
            counter = counter + 1
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk / 2] = True

    fout.close()
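
Step 5) above reduces to a single formula: the estimated number of tandem copies is the total read bases aligned to the repeat template divided by the expected bases per copy, ct = totalBasesMatch / (c * lrepeat), where c is the raw-read coverage. A tiny numeric sketch with made-up values:

def estimateTandemCopies(totalBasesMatch, coverage, repeatLength):
    # ct = aligned bases from repeat-spanning reads / (coverage * repeat length)
    return totalBasesMatch * 1.0 / (coverage * repeatLength)

# e.g. 150 kb of read bases aligned to a 3 kb repeat at 10x coverage -> ~5 copies
print estimateTandemCopies(150000, 10.0, 3000)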
Ejemplo n.º 35
0
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph,
                   repeatFilename, optionToRun):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } 
    Algorithm: 
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    '''

    # ## (a) reachability test to find partners
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # G.reportEdge()
    lenDicCC = IORobot.obtainLength(folderName,
                                    contigFilename + "_Double.fasta")

    adjacencyList = [[] for i in range(len(lenDicCC))]

    N1 = len(lenDicCC)

    # # Debug
    # for i in range(14):
    #    debugGraphPath(i, 2, G, N1)
    # # End Debug

    for i in range(len(lenDicCC)):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G)
        print "i, adjacencyList[i] : ", i, adjacencyList[i]

    # ## (b) formation of bipartite graph
    if optionToRun == "tandem":
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase":
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName,
                                                   contigFilename)

    G2 = abunGraphLib.seqGraphWt(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    clusters = G2.findConnectedComponents()

    repeatList = []
    for eachitem in clusters:
        leftList, rightList = [], []
        for eachsubitem in eachitem:
            if eachsubitem % 2 == 0:
                leftList.append(eachsubitem)
            else:
                rightList.append(eachsubitem)

        repeatList.append([
            abunHouseKeeper.getDistinct(leftList),
            abunHouseKeeper.getDistinct(rightList)
        ])

    with open(folderName + repeatFilename, 'w') as outfile:
        json.dump(repeatList, outfile)

    json_data = open(folderName + repeatFilename, 'r')
    loadData = json.load(json_data)

    assert (loadData == repeatList)
Ejemplo n.º 36
0
def getAllAssociatedReads(folderName, mummerLink, forFastaName):
    '''
    Input : relatedReads.fasta, raw_reads.fasta 
    Output : all_associated_reads.fasta
    
     Algorithm : 
        a) Get all the associated reads
        b) Loop for N=1 times ==> this corresponds to 4 reads in total to link across the bridge
            i) Align the raws and tmp_seedReads
            ii) Put the new reads into the SeedReads
    '''
    
    header, referenceFile, queryFile = "seedReads", forFastaName + ".fasta" , "raw_reads.fasta"
    
    gapFiller.formRelatedReadsFile(folderName, mummerLink, "improved3")
    
    command = "cp " + folderName + "relatedReads.fasta " + folderName + referenceFile
    os.system(command)
    
    N = abunHouseKeeper.abunGlobalReadSearchDepth
    
    print "N: ", N
    if N > 0:
        for trial in range(N):
            print "trial", trial
            numberOfFiles = houseKeeper.globalParallelFileNum
            
            if True: 
                workerList = []
                
                for dummyI in range(1, numberOfFiles + 1):
                    indexOfMum = ""
                    if dummyI < 10:
                        indexOfMum = "0" + str(dummyI)
                    else:
                        indexOfMum = str(dummyI)
                    
                    outputName, referenceName, queryName, specialName = header + indexOfMum, referenceFile, "raw_reads.part-" + indexOfMum + ".fasta", header + indexOfMum
                    workerList.append([outputName, referenceName, queryName, specialName])
    
                alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,False)
            
            dataList = []
            
            for i in range(1, 1+numberOfFiles): 
                if i < 10:
                    indexOfMum = "0" + str(i)
                else:
                    indexOfMum = str(i)
                dataList = dataList+ alignerRobot.extractMumData(folderName, header+ str(indexOfMum)+"Out")
            
            
            filterList = []
            
            lenDicRR = IORobot.obtainLength(folderName, queryFile)
            
            print "len(dataList)", len(dataList)
            for eachitem in dataList:
                if checkSatisfy(eachitem, lenDicRR):
                    filterList.append(eachitem)
                
            filterList.sort(key=itemgetter(-1))
            newReads = []
            
            for key, items in groupby(filterList, itemgetter(-1)):
                newReads.append(key)
                                        
            
            f = open(folderName + forFastaName + ".txt", 'w')
            
            for eachitem in newReads:
                f.write(eachitem + "\n")
            f.close()
                
            command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + forFastaName + ".txt " + folderName + "raw_reads.fasta > " + folderName + forFastaName + ".fasta"
            os.system(command)
    else:
        os.system("cp " + folderName + "relatedReads.fasta " + folderName + forFastaName + ".fasta")
Ejemplo n.º 37
0
def generateAbundanceGraph(folderName, mummerLink):
    
    
    print "generateAbundanceGraph"
    
    '''
    1. Find your favorite mapper to map the reads back
        a. MUMmer, Bowtie, bbmap, anything that works V
        b. Then write a short parser to parse the results V
    '''
    numberOfFiles = 20
    workerList = []
    for dummyI in range(1, numberOfFiles + 1):
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)
        
        '''
        "outGapFillRefine"+indexOfMum , "smaller_improvedContig.fasta",  "relatedReads_Double.part-" + indexOfMum + ".fasta",  "fromMumRefine" + indexOfMum
        '''
        outputName, referenceName, queryName, specialName= "outAbun"+indexOfMum, "improved3.fasta", "raw_reads.part-"+ indexOfMum + ".fasta",  "outAbun" + indexOfMum
        workerList.append([outputName, referenceName, queryName, specialName])
    
    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,False)
        '''
        command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + "out " + folderName + "improved3.fasta "+folderName+"raw_reads.part-" + indexOfMum + ".fasta"
        os.system(command)
    
        command = mummerLink + "show-coords -r " + folderName + "out.delta > " + folderName + "fromMumAbun" + indexOfMum
        os.system(command)
        '''
        
    dataList = []
    
    for i in range(1, 1+numberOfFiles): 
        if i < 10:
            indexOfMum = "0" + str(i)
        else:
            indexOfMum = str(i)
        dataList = dataList+ alignerRobot.extractMumData(folderName, "outAbun"+ str(indexOfMum)+"Out")
    

    '''
    2. Calculate counts for the abundances
        a. Aggregate by taking an average [put weights on bins along the contigs]
        b. Use inheritance and a subclass
    '''
         
    lenDic = IORobot.obtainLength(folderName, "improved3.fasta")
    readLenDic = IORobot.obtainLength(folderName , "raw_reads.fasta")
    

    myCountDic = {}
    for eachitem in lenDic:
        myCountDic[eachitem] = [0 for i in range(lenDic[eachitem])]

    thres = 30
    lenSum = 0
    extraDataList= []
    
    
    print "len(dataList)", len(dataList)
    
    if not abunHouseKeeper.abunGlobalAvoidrefine:
        myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, True)
        extraDataList = alignerRobot.extractMumData(folderName, "abunMissOut")
    else:
        extraDataList = []

    dataList = dataList + extraDataList
    myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, False)
    
    with open(folderName + 'myCountDic.json', 'w') as f:
        json.dump(myCountDic, f)

    
    return myCountDic
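
evaluateCoverage itself is not part of this listing. As a sketch of the aggregation the docstring describes (a per-base tally along each contig, then an average), assuming show-coords style rows [s1, e1, s2, e2, len1, len2, idy, contigName, readName] with 1-based inclusive contig coordinates:

def averageCoverageSketch(dataList, lenDic):
    # Per-base tally along each contig, then average over the contig length.
    perBase = dict((name, [0] * lenDic[name]) for name in lenDic)
    for row in dataList:
        s1, e1, contigName = row[0], row[1], row[-2]
        if contigName in perBase:
            for pos in range(s1 - 1, min(e1, lenDic[contigName])):
                perBase[contigName][pos] += 1
    return dict((name, sum(perBase[name]) * 1.0 / max(1, lenDic[name]))
                for name in perBase)

# Toy check: one 10 bp contig fully covered by two reads -> average 2.0
print averageCoverageSketch(
    [[1, 10, 1, 10, 10, 10, 100.0, "Segkk0", "ReadA"],
     [1, 10, 1, 10, 10, 10, 100.0, "Segkk0", "ReadB"]],
    {"Segkk0": 10})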
Ejemplo n.º 38
0
import matplotlib.pyplot as plt
from finisherSCCoreLib import IORobot

lenDic = {}
coverageDic = {}

lenDic = IORobot.obtainLength("/Users/kakitlam/", "abun.fasta")

f = open("/Users/kakitlam/Documents/abundata", 'r')
tmp = f.readline()

while len(tmp) > 0:
    if len(tmp) > 10:
        myitem = tmp[0:-1].split()
        coverageDic[myitem[0]] = float(myitem[1])
    tmp = f.readline()

f.close()

myList = []
baseCt = {}

for eachitem in lenDic:
    myList.append(lenDic[eachitem] * coverageDic[eachitem])
    baseCt[eachitem] = lenDic[eachitem] * coverageDic[eachitem]

for eachitem in lenDic:
    print eachitem, baseCt[eachitem]

for eachitem in lenDic:
    print eachitem, lenDic[eachitem]
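
matplotlib.pyplot is imported above but never used. If a plot of the per-contig base counts were wanted, a minimal sketch could look like this (the output path is only an example):

import matplotlib.pyplot as plt

def plotBaseCounts(baseCt, outPng):
    # Bar chart of lenDic[x] * coverageDic[x] per contig, saved to outPng.
    names = sorted(baseCt.keys())
    plt.figure(figsize=(10, 4))
    plt.bar(range(len(names)), [baseCt[n] for n in names])
    plt.xticks(range(len(names)), names, rotation=90)
    plt.ylabel("length * coverage (bases)")
    plt.tight_layout()
    plt.savefig(outPng)

# plotBaseCounts(baseCt, "/Users/kakitlam/abun_base_counts.png")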
Ejemplo n.º 39
0
def formReadContigStringGraph(folderName, mummerLink, contigFilename,
                              readsetFilename, optTypeFileHeader, graphName):
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm: 
        a) Form double reads and contigs                            V
        b) Mummer the data and extract dataList three times         V
        c) Use the subroutine to output a graph                     V
        d) Output the graph to a file phasing_String_graph.graph    V
    '''

    G = []

    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta",
                                contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta",
                                readsetFilename + "_Double.fasta", "reads")

    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta"
    if True:
        alignerRobot.useMummerAlign(mummerLink, folderName, header,
                                    referenceFile, queryFile)

    lenDicCC = IORobot.obtainLength(folderName,
                                    contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)

    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta"

    lenDicRR = IORobot.obtainLength(folderName,
                                    readsetFilename + "_Double.fasta")

    if not abunHouseKeeper.abunGlobalRRDisable:
        if True:
            alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                              header)

        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        for eachitem in dataListRR:
            if eachitem[-2] == "Read164_p" and eachitem[-1] == "Read159_p":
                print "debug", eachitem
            if eachitem[-1] == "Read164_p" and eachitem[-2] == "Read159_p":
                print "debug", eachitem

        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)

    else:
        dataListRR = []

    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    if True:
        alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                          header)
        #alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)

    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)

    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    
    '''

    # print dataListCC[0]
    # print dataListRR[0]
    # print dataListCR[0]

    # for eachitem in dataListCC:
    #    print eachitem
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    # for eachitem in dataListRR[0:10]:
    #    print eachitem , lenDicRR[eachitem[-2]], lenDicRR[eachitem[-1]]

    addDataToList(dataListRR, G, N1, N1, 'R', 'R')

    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # G.reportEdge()
    G.saveToFile(folderName, graphName)

    checkGraphLength(G, N1, lenDicRR)

    # print len(G.graphNodesList[0].listOfPrevNodes), len(G.graphNodesList[0].listOfNextNodes)
    print "len(G.graphNodesList)", len(G.graphNodesList)