def abunSplit(folderName, mummerLink, myCountDic):
    """
    Split repeat interiors using abundance information.

    Input : phaseRepeat.txt, myCountDic, improved3_Double.fasta
    Output : abun.fasta

    Algorithm :
    1. Load the repeat in/out pairings and drop the empty ones.
    2. For each repeat interior, match in-contigs to out-contigs by
       abundance and record the resolved pairs as graph edges.
    3. Condense the graph and read the contigs out to abun.fasta.
    """
    # Fix: close the JSON file handle (was left open in the original).
    with open(folderName + "phaseRepeat.txt", "r") as json_data:
        repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)

    # Two graph nodes per counted contig -- presumably the doubled
    # (two-strand) contig representation; TODO confirm.
    N1 = len(myCountDic) * 2

    G = graphLib.seqGraph(N1)

    for eachitem in repeatPairs:
        inList, outList = eachitem[0], eachitem[1]
        resolvedList = determindMatch(inList, outList, myCountDic)
        addEdges(G, resolvedList)

    G.condense()
    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta", "improved3_Double.fasta")
def replaceFiles(folderName, replacedName):
    commandList = []
    commandList.append("cp " + folderName + "improved3.fasta " + folderName +
                       "improved3_backup.fasta")
    commandList.append("cp " + folderName + "improved3_Double.fasta " +
                       folderName + "improved3_backup.fasta")

    IORobot.writeToFile_Double1(folderName, replacedName[0:-6] + ".fasta",
                                replacedName[0:-6] + "_Double.fasta", "contig")

    commandList.append("cp " + folderName + replacedName + " " + folderName +
                       "improved3.fasta")

    command = "perl -pe 's/>[^\$]*$/\">Segkk\" . $n++ .\"\n\"/ge' " + folderName + "improved3.fasta > " + folderName + "newImproved3.fasta "
    commandList.append(command)

    command = "cp " + folderName + "newImproved3.fasta  " + folderName + "improved3.fasta "
    commandList.append(command)

    commandList.append("cp " + folderName + replacedName[0:-6] +
                       "_Double.fasta " + folderName +
                       "improved3_Double.fasta")

    for eachcommand in commandList:
        print eachcommand
        os.system(eachcommand)
Example #3
0
def abunSplit(folderName, mummerLink, myCountDic):
    '''
    Split repeat interiors using abundance information.

    Input : phaseRepeat.txt, myCountDic, improved3_Double.fasta
    Output : abun.fasta

    Algorithm :
    1. Load the repeat in/out pairings and drop the empty ones.
    2. For each repeat interior, match in-contigs to out-contigs by
       abundance and record the resolved pairs as graph edges.
    3. Condense the graph and read the contigs out to abun.fasta.
    '''
    # Fix: close the JSON file handle (was left open in the original).
    with open(folderName + "phaseRepeat.txt", 'r') as json_data:
        repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)

    # Two graph nodes per counted contig -- presumably the doubled
    # (two-strand) contig representation; TODO confirm.
    N1 = len(myCountDic) * 2

    G = graphLib.seqGraph(N1)

    for eachitem in repeatPairs:
        inList, outList = eachitem[0], eachitem[1]
        resolvedList = determindMatch(inList, outList, myCountDic)
        addEdges(G, resolvedList)

    G.condense()
    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta", "improved3_Double.fasta")
def findNoGoByNoHeads(noGoList, side, folderName):
	noGoListNew = []

	sortedContigList,  sortedReadList, sortedContigDic, sortedReadDic =\
		formSortedDataList(folderName)


	lenDicContig = IORobot.obtainLength(folderName, "mFixed_Double.fasta" )
	lenDicRead = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta")

	for x in noGoList:
		rList = findAttachedReads(x, side, folderName,sortedContigList,sortedContigDic, lenDicContig,lenDicRead)
		cList = findAttachedContigs(rList, side, folderName, sortedReadList, sortedReadDic, lenDicContig,lenDicRead)

		if bestMatchContigOnly == False:
			bestContigIDList = findBreakContigAdv(cList)
		else:
			bestContigIDList = findBreakContig(cList)

		if len(rList) > 0 and len(cList) > 0:
			print "x, side, len(rList), len(cList), len(bestContigIDList)",\
				 abunHouseKeeper.parseIDToName(x,'C',0), side, len(rList), len(cList), len(bestContigIDList)
			print "cList", bestContigIDList
 
		noGoListNew = noGoListNew + bestContigIDList


	return noGoListNew
Example #5
0
def findNoGoByNoHeads(noGoList, side, folderName):
    noGoListNew = []

    sortedContigList,  sortedReadList, sortedContigDic, sortedReadDic =\
     formSortedDataList(folderName)

    lenDicContig = IORobot.obtainLength(folderName, "mFixed_Double.fasta")
    lenDicRead = IORobot.obtainLength(folderName,
                                      "phasingSeedName_Double.fasta")

    for x in noGoList:
        rList = findAttachedReads(x, side, folderName, sortedContigList,
                                  sortedContigDic, lenDicContig, lenDicRead)
        cList = findAttachedContigs(rList, side, folderName, sortedReadList,
                                    sortedReadDic, lenDicContig, lenDicRead)

        if bestMatchContigOnly == False:
            bestContigIDList = findBreakContigAdv(cList)
        else:
            bestContigIDList = findBreakContig(cList)

        if len(rList) > 0 and len(cList) > 0:
            print "x, side, len(rList), len(cList), len(bestContigIDList)",\
              abunHouseKeeper.parseIDToName(x,'C',0), side, len(rList), len(cList), len(bestContigIDList)
            print "cList", bestContigIDList

        noGoListNew = noGoListNew + bestContigIDList

    return noGoListNew
Example #6
0
def abunSplit(folderName, mummerLink, myCountDic, contigReadGraph,
              contigFilename, readsetFilename):
    '''
    Input : repeatSpecification.txt , myCountDic.json, improved3.fasta, raw_reads.fasta
    Output : abunsplit.fasta
    
    Algorithm : 
    
    1. Load data from various sources [various json files]
    
    2. For each repeat interior:
        a) identify the abundances associated with in/out contigs
        b) perform a split and record the split
    
    3. Use split results to generate contigs [may already exist in newPhasing.py ] 
        a) use a graph to capture the split results 
        b) use reads to fill in any gaps 
        c) read out the contigs 
    
    '''

    json_data = open(folderName + "phaseRepeat.txt", 'r')
    repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)

    N1 = len(myCountDic) * 2
    print "N1", N1

    G = graphLib.seqGraph(N1)

    gapContentLookUpList = []

    for eachitem in repeatPairs:
        inList, outList = eachitem[0], eachitem[1]
        resolvedList = determindMatch(inList, outList, myCountDic, folderName,
                                      contigReadGraph, N1)
        print "resolvedList", resolvedList
        gapContentLookUpList += generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph,
            contigFilename, readsetFilename)

        addEdges(G, resolvedList)

    gapContentLookUpDic = {}
    gapContentLookUpList.sort()

    for eachitem in gapContentLookUpList:
        gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [
            eachitem[2], eachitem[3], eachitem[4]
        ]
        print eachitem[2:4], len(eachitem[4])

    # some how change ASplitter here by appending necessary information

    G.condense()
    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abun.fasta",
                                  contigFilename + "_Double.fasta",
                                  gapContentLookUpDic)
Example #7
0
def abunSplit(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):

    """
    Input : repeatSpecification.txt , myCountDic.json, improved3.fasta, raw_reads.fasta
    Output : abunsplit.fasta
    
    Algorithm : 
    
    1. Load data from various sources [various json files]
    
    2. For each repeat interior:
        a) identify the abundances associated with in/out contigs
        b) perform a split and record the split
    
    3. Use split results to generate contigs [may already exist in newPhasing.py ] 
        a) use a graph to capture the split results 
        b) use reads to fill in any gaps 
        c) read out the contigs 
    
    """

    json_data = open(folderName + "phaseRepeat.txt", "r")
    repeatPairs = json.load(json_data)
    repeatPairs = obtainNonEmpty(repeatPairs)

    N1 = len(myCountDic) * 2
    print "N1", N1

    G = graphLib.seqGraph(N1)

    gapContentLookUpList = []

    for eachitem in repeatPairs:
        inList, outList = eachitem[0], eachitem[1]
        resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1)
        print "resolvedList", resolvedList
        gapContentLookUpList += generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph, contigFilename, readsetFilename
        )

        addEdges(G, resolvedList)

    gapContentLookUpDic = {}
    gapContentLookUpList.sort()

    for eachitem in gapContentLookUpList:
        gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]]
        print eachitem[2:4], len(eachitem[4])

    # some how change ASplitter here by appending necessary information

    G.condense()
    IORobot.extractGraphToContigs(
        G, folderName, mummerLink, "abun.fasta", contigFilename + "_Double.fasta", gapContentLookUpDic
    )
Example #8
0
    def condenseEdgeRemove(self, G_ContigRead, folderName, mummerLink, contigFilename):
        print "condenseEdgeRemove"
        thresPass = 100
        thresForStrangeCut = 5000
        ### kkdebug

        toRemoveList = []
        
        for eachnode in self.graphNodesList:
            if len(eachnode.nodeIndexList) > 0:
                if len(eachnode.listOfNextNodes) ==1  :
                    nextNodeIndex = eachnode.listOfNextNodes[0][0]
                    nextNode= self.graphNodesList[nextNodeIndex]
                    if len(nextNode.listOfPrevNodes) == 1 : 
                        currentName = eachnode.nodeIndex
                        nextName =  nextNode.nodeIndex

                        contigReadPaths = findAllPathK(currentName,nextName, G_ContigRead, 5)

                        cName =  abunHouseKeeper.parseIDToName(currentName,'C',0)
                        nName =  abunHouseKeeper.parseIDToName(nextName,'C',0)

                        noGoNext = self.readInJSON(folderName, "noGoNext.json")
                        noGoPrev = self.readInJSON(folderName, "noGoPrev.json")

                        overlap = [-1, -1]
                        ctr = 0 

                        for eachpath in contigReadPaths:
                            if len(eachpath) > 2: 
                                ctr = ctr + 1 
                                
                            elif len(eachpath) == 2:     
                                
                                contigName = cName
                                leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)

                                contigName = nName
                                rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
                                
                                overlap = IORobot.align(leftSeg, rightSeg, folderName, mummerLink)


                        if ctr <= thresPass and  (cName in noGoNext or nName in noGoPrev or overlap[0] > thresForStrangeCut ):
                    
                            self.removeEdge(currentName, nextName)
                            toRemoveList.append([currentName, nextName])


        ### kkdebug
        #with open( "dataFolder/toRemoveList.json", 'w') as f:
        #    json.dump(toRemoveList, f)    

        self.findAdjList()
Example #9
0
    def condenseEdgeRemove(self, G_ContigRead, folderName, mummerLink, contigFilename):
        print "condenseEdgeRemove"
        thresPass = 100
        thresForStrangeCut = 5000
        ### kkdebug

        toRemoveList = []
        
        for eachnode in self.graphNodesList:
            if len(eachnode.nodeIndexList) > 0:
                if len(eachnode.listOfNextNodes) ==1  :
                    nextNodeIndex = eachnode.listOfNextNodes[0][0]
                    nextNode= self.graphNodesList[nextNodeIndex]
                    if len(nextNode.listOfPrevNodes) == 1 : 
                        currentName = eachnode.nodeIndex
                        nextName =  nextNode.nodeIndex

                        contigReadPaths = findAllPathK(currentName,nextName, G_ContigRead, 5)

                        cName =  abunHouseKeeper.parseIDToName(currentName,'C',0)
                        nName =  abunHouseKeeper.parseIDToName(nextName,'C',0)

                        noGoNext = self.readInJSON(folderName, "noGoNext.json")
                        noGoPrev = self.readInJSON(folderName, "noGoPrev.json")

                        overlap = [-1, -1]
                        ctr = 0 

                        for eachpath in contigReadPaths:
                            if len(eachpath) > 2: 
                                ctr = ctr + 1 
                                
                            elif len(eachpath) == 2:     
                                
                                contigName = cName
                                leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)

                                contigName = nName
                                rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
                                
                                overlap = IORobot.align(leftSeg, rightSeg, folderName, mummerLink)


                        if ctr <= thresPass and  (cName in noGoNext or nName in noGoPrev or overlap[0] > thresForStrangeCut ):
                    
                            self.removeEdge(currentName, nextName)
                            toRemoveList.append([currentName, nextName])


        ### kkdebug
        #with open( "dataFolder/toRemoveList.json", 'w') as f:
        #    json.dump(toRemoveList, f)    

        self.findAdjList()
Example #10
0
def readContigForAbunSplit(folderName, mummerLink, contigFilename,
                           readsetFilename, N1, contigReadGraph):

    json_data = open(folderName + "mapDummyToRealDic.json", 'r')
    mapDummyToRealDic = json.load(json_data)

    G = []
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, "xResolvedGraph")

    gapContentLookUpDic = {}

    furtherGapList = []
    for i in range(N1):
        if len(G.graphNodesList[i].nodeIndexList) > 1:
            for j in range(len(G.graphNodesList[i].nodeIndexList) - 1):

                bk, fwd = G.graphNodesList[i].nodeIndexList[
                    j], G.graphNodesList[i].nodeIndexList[j + 1]

                key = str(bk) + "_" + str(fwd)

                if not key in gapContentLookUpDic:
                    furtherGapList.append([bk, fwd])

    with open(folderName + "furtherGapList.json", 'w') as f:
        json.dump(furtherGapList, f)

    furtherGapContentLookUpList = generateGapContentLookup(
        folderName, mummerLink, furtherGapList, contigReadGraph,
        contigFilename, readsetFilename, mapDummyToRealDic)

    for eachitem in furtherGapContentLookUpList:
        gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [
            eachitem[2], eachitem[3], eachitem[4]
        ]
        print eachitem[2:4], len(eachitem[4])

    #segLookUp = IORobot.readContigsFromFile(folderName, "LC_n_Double.fasta")

    print "Final step: really hacking a file"
    os.system("cp " + folderName + contigFilename + "_Double.fasta " +
              folderName + "tmpWithDummy.fasta")
    contigList = IORobot.readContigsFromFile(folderName,
                                             contigFilename + "_Double.fasta")

    IORobot.extractGraphToContigs(G, folderName, mummerLink, "abunPre.fasta",
                                  "tmpWithDummy.fasta", gapContentLookUpDic,
                                  mapDummyToRealDic)

    if True:
        nonRedundantResolver.removeRedundantWithFile(folderName, mummerLink,
                                                     "abunPre", "abunMum",
                                                     "abun")
Example #11
0
def colorNodes(folderName, mummerPath,sourceFilename, contigFilename, readsetFilename):
    print "colorNodes"
    lenDic = IORobot.obtainLength(folderName, sourceFilename+".fasta")
    print lenDic
    thresForShort = 15000
    shortList = []
    longList = []
    for eachitem in lenDic:
        if lenDic[eachitem] > thresForShort:
            longList.append(eachitem)
        else:
            shortList.append(eachitem)
    
    IORobot.putListToFileO(folderName, sourceFilename+".fasta", contigFilename, longList)
    IORobot.putListToFileO(folderName, sourceFilename+".fasta", readsetFilename, shortList)
Example #12
0
def findCoverageFromRawData(folderName):
    contigLenDic  = IORobot.obtainLength(folderName, "contigs.fasta")
    readLenDic = IORobot.obtainLength(folderName, "raw_reads.fasta")

    G = 0 
    NL = 0 
    for eachitem in contigLenDic:
        G = G+ contigLenDic[eachitem]
        
    for eachitem in readLenDic:
        NL = NL+ readLenDic[eachitem]
    
    c = (NL*1.0)/G 
    print c 
    return c 
def colorNodes(folderName, mummerPath, sourceFilename, contigFilename, readsetFilename):
    print "colorNodes"
    lenDic = IORobot.obtainLength(folderName, sourceFilename + ".fasta")
    print lenDic
    thresForShort = 15000
    shortList = []
    longList = []
    for eachitem in lenDic:
        if lenDic[eachitem] > thresForShort:
            longList.append(eachitem)
        else:
            shortList.append(eachitem)

    IORobot.putListToFileO(folderName, sourceFilename + ".fasta", contigFilename, longList)
    IORobot.putListToFileO(folderName, sourceFilename + ".fasta", readsetFilename, shortList)
def findCoverageFromRawData(folderName):
    contigLenDic = IORobot.obtainLength(folderName, "contigs.fasta")
    readLenDic = IORobot.obtainLength(folderName, "raw_reads.fasta")

    G = 0
    NL = 0
    for eachitem in contigLenDic:
        G = G + contigLenDic[eachitem]

    for eachitem in readLenDic:
        NL = NL + readLenDic[eachitem]

    c = (NL * 1.0) / G
    print c
    return c
Example #15
0
def abunSplitAdvResolve(folderName, mummerLink, myCountDic, contigReadGraph,
                        contigFilename, readsetFilename):
    '''
    Advanced abundance-based repeat resolution pipeline.

    Algorithm:
    1) Optionally run the EM read-association step
    2) Graph surgery on the contig-read graph
    3) Bipartite (B) resolution
    4) X resolution
    5) Read the resolved contigs out (with gap filling)

    Each stage logs its edges for debugging.
    '''
    # Idiom fix: truthiness instead of "== True".
    if abunHouseKeeper.abunGlobalRunEM:
        emalgo.generateAssociatedReadDic(folderName)

    lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDic)

    Gnew = graphSurgery(myCountDic, folderName, contigReadGraph, mummerLink,
                        readsetFilename, contigFilename)
    Gnew.logEdges(folderName, "graphsurgery")

    Gnew = BResolution(Gnew, folderName, contigReadGraph, N1, myCountDic,
                       lenDic, mummerLink)
    Gnew.logEdges(folderName, "BResolution")

    XResolution(folderName, contigReadGraph, Gnew, myCountDic, lenDic, N1,
                mummerLink)
    Gnew.logEdges(folderName, "XResolution")

    readContigForAbunSplit(folderName, mummerLink, contigFilename,
                           readsetFilename, N1, contigReadGraph)
Example #16
0
def filterEdge(adjacencyList, folderName, contigFilename):
    lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    thresFoPhase = 2000
    smallList, largeList = [], []
    for eachitem in lenDic:
        id = abunHouseKeeper.parseEdgeNameToID(eachitem, 'C')
        if lenDic[eachitem] < thresFoPhase:
            smallList.append(id)
        else:
            largeList.append(id)

    newAdjacencyList = [[] for i in range(len(adjacencyList))]

    for i in largeList:
        for eachitem in adjacencyList[i]:
            ######## IMPORTANT:
            if eachitem in largeList and eachitem / 2 != i / 2:
                ######## NEED TO REMOVE IN PRODUCTION if True
                newAdjacencyList[i].append(eachitem)

    print "len(smallList)  , len(largeList): ", len(smallList), len(largeList)
    print "lenDic: ", lenDic

    for eachitem in newAdjacencyList:
        print "newAdjacencyList :", eachitem

    return newAdjacencyList
Example #17
0
def filterEdge(adjacencyList, folderName, contigFilename):
    lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    thresFoPhase = 2000
    smallList, largeList = [], []
    for eachitem in lenDic:
        id = abunHouseKeeper.parseEdgeNameToID(eachitem, 'C')
        if lenDic[eachitem] < thresFoPhase:
            smallList.append(id)
        else:
            largeList.append(id)
    
    newAdjacencyList = [[] for i in range(len(adjacencyList))]
    
    for i in largeList:
        for eachitem in adjacencyList[i]:
######## IMPORTANT:
            if  eachitem in largeList and eachitem / 2 != i / 2:
######## NEED TO REMOVE IN PRODUCTION if True
                newAdjacencyList[i].append(eachitem)
    
    
    print "len(smallList)  , len(largeList): ", len(smallList)  , len(largeList)
    print "lenDic: ", lenDic
    
    for eachitem in newAdjacencyList:
        print "newAdjacencyList :", eachitem 
        
    return newAdjacencyList
Example #18
0
def generateGapContentLookup(
    folderName, mummerLink, oldResolvedList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic={}
):
    gapContentLookUpList = []

    contigLenDic = IORobot.obtainLength(folderName, contigFilename + ".fasta")
    N1 = len(contigLenDic) * 2

    resolvedList = []

    print "mapDummyToRealDic", mapDummyToRealDic
    for eachmatchpair in oldResolvedList:
        tmpList = []

        if eachmatchpair[0] >= N1:
            tmpList = tmpList + mapDummyToRealDic[str(eachmatchpair[0] - N1)][1]
        else:
            tmpList.append(eachmatchpair[0])

        if eachmatchpair[-1] >= N1:
            tmpList = tmpList + mapDummyToRealDic[str(eachmatchpair[-1] - N1)][1]
        else:
            tmpList.append(eachmatchpair[-1])

        for ii in range(len(tmpList) - 1):
            resolvedList.append([tmpList[ii], tmpList[ii + 1]])

    gapContentLookUpList = parallelGapLookUp(
        resolvedList, folderName, N1, mummerLink, contigReadGraph, contigFilename, readsetFilename
    )

    return gapContentLookUpList
Example #19
0
def readContigForAbunSplit(folderName, mummerLink, contigFilename, readsetFilename, N1, contigReadGraph):

    json_data = open(folderName + "mapDummyToRealDic.json", "r")
    mapDummyToRealDic = json.load(json_data)

    G = []
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, "xResolvedGraph")

    gapContentLookUpDic = {}

    furtherGapList = []
    for i in range(N1):
        if len(G.graphNodesList[i].nodeIndexList) > 1:
            for j in range(len(G.graphNodesList[i].nodeIndexList) - 1):

                bk, fwd = G.graphNodesList[i].nodeIndexList[j], G.graphNodesList[i].nodeIndexList[j + 1]

                key = str(bk) + "_" + str(fwd)

                if not key in gapContentLookUpDic:
                    furtherGapList.append([bk, fwd])

    with open(folderName + "furtherGapList.json", "w") as f:
        json.dump(furtherGapList, f)

    furtherGapContentLookUpList = generateGapContentLookup(
        folderName, mummerLink, furtherGapList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic
    )

    for eachitem in furtherGapContentLookUpList:
        gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]]
        print eachitem[2:4], len(eachitem[4])

    # segLookUp = IORobot.readContigsFromFile(folderName, "LC_n_Double.fasta")

    print "Final step: really hacking a file"
    os.system("cp " + folderName + contigFilename + "_Double.fasta " + folderName + "tmpWithDummy.fasta")
    contigList = IORobot.readContigsFromFile(folderName, contigFilename + "_Double.fasta")

    IORobot.extractGraphToContigs(
        G, folderName, mummerLink, "abunPre.fasta", "tmpWithDummy.fasta", gapContentLookUpDic, mapDummyToRealDic
    )

    if True:
        nonRedundantResolver.removeRedundantWithFile(folderName, mummerLink, "abunPre", "abunMum", "abun")
def replaceFiles( folderName, replacedName) :
    commandList = []
    commandList.append("cp " + folderName + "improved3.fasta " + folderName + "improved3_backup.fasta")
    commandList.append("cp " + folderName + "improved3_Double.fasta " + folderName + "improved3_backup.fasta")
    
    IORobot.writeToFile_Double1(folderName, replacedName[0:-6]+".fasta", replacedName[0:-6]+"_Double.fasta", "contig")
    
    commandList.append("cp " + folderName + replacedName + " "+folderName + "improved3.fasta")
    
    command = "perl -pe 's/>[^\$]*$/\">Segkk\" . $n++ .\"\n\"/ge' "+folderName+"improved3.fasta > "+folderName+"newImproved3.fasta "
    commandList.append(command)
    
    command = "cp " +folderName+"newImproved3.fasta  "+folderName+"improved3.fasta "
    commandList.append(command)


    commandList.append("cp " + folderName + replacedName[0:-6]+"_Double.fasta " + folderName + "improved3_Double.fasta")
    
    for eachcommand in commandList:
        print eachcommand
        os.system(eachcommand)
Example #21
0
def formPathSeq(folderName, mummerPath, directPathList, indirectPathList, contigFile, readFile):
    '''
    Input : directPathList, indirectPathList, contigFile, readFile
    Output: directPath.fasta, indirectPath.fasta

    Transforms each path list into sequences and writes them out.
    '''
    contigList = IORobot.readContigsFromFile(folderName, contigFile)
    readList = IORobot.readContigsFromFile(folderName, readFile)

    # Same transform-and-write step for both path lists.
    for pathList, outputName in [(directPathList, "directPath.fasta"),
                                 (indirectPathList, "indirectPath.fasta")]:
        seqList = IORobot.pathListToSeqListTransform(pathList, contigList, readList, mummerPath, folderName)
        IORobot.writeSegOut(seqList, folderName, outputName)
Example #22
0
def formPathSeq(folderName, mummerPath, directPathList, indirectPathList, contigFile, readFile):
    '''
    Input : directPathList, indirectPathList, contigFile, readFile
    Output: directPath.fasta, indirectPath.fasta

    Transforms each path list into sequences and writes them out.
    '''
    contigList = IORobot.readContigsFromFile(folderName, contigFile)
    readList = IORobot.readContigsFromFile(folderName, readFile)

    # Same transform-and-write step for both path lists.
    for pathList, outputName in [(directPathList, "directPath.fasta"),
                                 (indirectPathList, "indirectPath.fasta")]:
        seqList = IORobot.pathListToSeqListTransform(pathList, contigList, readList, mummerPath, folderName)
        IORobot.writeSegOut(seqList, folderName, outputName)
Example #23
0
def formExtraEdges(
        folderName="/home/kakitfive/kkdata2/MetaFinisherSC/dataFolderBackup/",
        optTypeFileHeader="phaseString",
        contigFilename="improved3",
        G=[],
        N1=0):

    dataList = alignerRobot.extractMumData(folderName,
                                           optTypeFileHeader + "CR" + "Out")
    dataList.sort(key=itemgetter(-2))
    lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")

    count = 0
    tmpItem = []
    embedContig2ReadDic, read2EmbedContigDic = {}, {}

    for key, items in groupby(dataList, itemgetter(-2)):
        isEmbedded = False
        for eachitem in items:
            #print eachitem
            if eachitem[4] > lenDic[key] - 300:
                isEmbedded = True
                tmpItem = eachitem

        if isEmbedded:
            count = count + 1
            readName = tmpItem[-1]
            embedContig2ReadDic[key] = readName
            read2EmbedContigDic[readName] = key

    print "len(embedContig2ReadDic)", len(embedContig2ReadDic)

    #assert(False)

    for contigName in embedContig2ReadDic:
        readName = embedContig2ReadDic[contigName]

        readIndex, contigIndex = abunHouseKeeper.parseEdgeNameToID(
            readName, 'R'), abunHouseKeeper.parseEdgeNameToID(contigName, 'C')

        for eachprev in G.graphNodesList[readIndex].listOfPrevNodes:
            idNode, wt = eachprev[0], eachprev[1]
            if idNode < N1:
                G.insertEdge(idNode, contigIndex, wt)

        for eachnext in G.graphNodesList[readIndex].listOfNextNodes:
            idNode, wt = eachnext[0], eachnext[1]
            if idNode < N1:
                G.insertEdge(contigIndex, idNode, wt)

    return G
Example #24
0
def decideCut(folderName, mummerPath):
    
    '''
    Input : directPath.fasta, indirectPath.fasta
    Output : toDelete 
    '''
    thres = 50
    
    if True:
        alignerRobot.useMummerAlign(mummerPath, folderName, \
            "indirectvsdirect", "indirectPath.fasta", "directPath.fasta", specialForRaw = False, specialName = "", refinedVersion= True)
    
    dataList =  alignerRobot.extractMumData(folderName , "indirectvsdirectOut")
    lenDic = IORobot.obtainLength(folderName, "directPath.fasta")

    ctr =0 
    ctrindirect = 0 

    dataList.sort(key = itemgetter(-1))

    toDelete = True

    for key, items in groupby(dataList, itemgetter(-1)):
        print "key", key 
        ctr = ctr + 1
        isFound = False
        for eachitem in items:
            if eachitem[2] < thres and eachitem[3] > lenDic[key] - thres:
                isFound = True

        if isFound:
            ctrindirect = ctrindirect + 1


    epsilon = 1.1

    print "ctrindirect, ctr", ctrindirect, ctr

    if ctrindirect*1.0/ctr < (1- epsilon):
        toDelete = False
    else:
        toDelete = True


    return toDelete
Example #25
0
def decideCut(folderName, mummerPath):
    
    '''
    Input : directPath.fasta, indirectPath.fasta
    Output : toDelete 
    '''
    thres = 50
    
    if True:
        alignerRobot.useMummerAlign(mummerPath, folderName, \
            "indirectvsdirect", "indirectPath.fasta", "directPath.fasta", specialForRaw = False, specialName = "", refinedVersion= True)
    
    dataList =  alignerRobot.extractMumData(folderName , "indirectvsdirectOut")
    lenDic = IORobot.obtainLength(folderName, "directPath.fasta")

    ctr =0 
    ctrindirect = 0 

    dataList.sort(key = itemgetter(-1))

    toDelete = True

    for key, items in groupby(dataList, itemgetter(-1)):
        print "key", key 
        ctr = ctr + 1
        isFound = False
        for eachitem in items:
            if eachitem[2] < thres and eachitem[3] > lenDic[key] - thres:
                isFound = True

        if isFound:
            ctrindirect = ctrindirect + 1


    epsilon = 1.1

    print "ctrindirect, ctr", ctrindirect, ctr

    if ctrindirect*1.0/ctr < (1- epsilon):
        toDelete = False
    else:
        toDelete = True


    return toDelete
 def runningTestSet(self ,myFolderName, ctexpected, commandList, matchingContigFile):
     """
     Integration test on RepeatPhaserMain: copy the fixture files from
     myFolderName into a scratch self.testingFolder, run commandList, and
     assert that matchingContigFile ends up with exactly ctexpected
     sequences.  The scratch folder is removed afterwards.
     """
     print "Integration test on RepeatPhaserMain:  " + myFolderName
     self.sourceFolder = myFolderName
     
     # Start from a clean scratch folder.
     os.system("rm -rf "+ self.testingFolder)
     os.system("mkdir " + self.testingFolder)
     
     # Copy the expected input files into the scratch folder.
     for eachitem in self.listOfFiles:
         os.system("cp "+ self.sourceFolder + eachitem + " " +self.testingFolder)
     
     for eachcommand in commandList:
         os.system(eachcommand)
     
     lenDic = IORobot.obtainLength(self.testingFolder,  matchingContigFile)
     
     # The number of sequences produced must match the expectation.
     assert(len(lenDic) == ctexpected)
     os.system("rm -rf "+ self.testingFolder)
Example #27
0
def mapStrangePairs():
	folderName = "Apr10Test/"
	
	json_data = open(folderName + "furtherGapList.json", 'r')
	furtherGapList = json.load(json_data)
	
	segLookUp = IORobot.readContigsFromFile(folderName, "LC_n_Double.fasta")
	
	f = open(folderName + "wrongCondense.fasta", 'w')
	ctr = 0
	for eachitem in furtherGapList:
		beforeI, afterI = eachitem[0], eachitem[1]
		
		f.write(">Segkk"+str(ctr)+"\n")
		f.write(segLookUp[beforeI]+"\n")
		ctr = ctr + 1 
		
		f.write(">Segkk"+str(ctr)+"\n")
		f.write(segLookUp[afterI]+"\n")
		ctr = ctr + 1 
	
	f.close()
	
	if False:
		alignerRobot.useMummerAlign("/usr/bin/", folderName, "wrongCondenseDebug", "reference.fasta", "wrongCondense.fasta")
	
	dataList = alignerRobot.extractMumData(folderName, "wrongCondenseDebugOut")
	
	dataList.sort(key = itemgetter(-1))
	
	mappedDic = {}
	
	for key, items in groupby(dataList, itemgetter(-1)):
		print "key", key
		matchLen = -1
		
		for eachitem in items: 
			if eachitem[-4] > matchLen:
				mappedDic[key]  = eachitem
				matchLen = eachitem[-4]
				
	
	for eachitem in mappedDic:
		print "results : ", eachitem, mappedDic[eachitem]
Example #28
0
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph,
                   repeatFilename, optionToRun):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } 
    Algorithm: 
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    '''

    # ## (a) reachability test to find partners
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # G.reportEdge()
    lenDicCC = IORobot.obtainLength(folderName,
                                    contigFilename + "_Double.fasta")

    adjacencyList = [[] for i in range(len(lenDicCC))]

    N1 = len(lenDicCC)

    # # Debug
    # for i in range(14):
    #    debugGraphPath(i, 2, G, N1)
    # # End Debug

    for i in range(len(lenDicCC)):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G)
        print "i, adjacencyList[i] : ", i, adjacencyList[i]

    # ## (b) formation of bipartite graph
    if optionToRun == "tandem":
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase":
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName,
                                                   contigFilename)

    # cut here

    adjListToRepeatList(newAdjacencyList, folderName, repeatFilename)
Example #29
0
    def runningTestSet(self, myFolderName, ctexpected, commandList,
                       matchingContigFile):

        print "Integration test on RepeatPhaserMain:  " + myFolderName
        self.sourceFolder = myFolderName

        os.system("rm -rf " + self.testingFolder)
        os.system("mkdir " + self.testingFolder)

        for eachitem in self.listOfFiles:
            os.system("cp " + self.sourceFolder + eachitem + " " +
                      self.testingFolder)

        for eachcommand in commandList:
            os.system(eachcommand)

        lenDic = IORobot.obtainLength(self.testingFolder, matchingContigFile)

        assert (len(lenDic) == ctexpected)
        os.system("rm -rf " + self.testingFolder)
Example #30
0
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } 
    Algorithm: 
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    '''
    
    # ## (a) reachability test to find partners 
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # G.reportEdge()
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    
    adjacencyList = [[] for i in range(len(lenDicCC))]
    
    N1 = len(lenDicCC)
    
    
    # # Debug
    # for i in range(14):
    #    debugGraphPath(i, 2, G, N1)
    # # End Debug
    
    for i in range(len(lenDicCC)):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) 
        print "i, adjacencyList[i] : ", i , adjacencyList[i]
    
    # ## (b) formation of bipartite graph
    if optionToRun == "tandem" :
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase": 
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename)
    

    # cut here

    adjListToRepeatList(newAdjacencyList,folderName,repeatFilename )
Example #31
0
def checkPathLength(path, G, N1, folderName):
    
    lenDicRR = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta")
    sumLength = 0
    overlapLength = 0
    for index, i in zip(path, range(len(path))):
        header = "Read" + str((index - N1) / 2) + "_"
        if (index - N1) % 2 == 0:
            header = header + "p"
        else:
            header = header + "d"
        print "lenDicRR[header], ", lenDicRR[header], header 
        print (index - N1) * 2 + 1, (index - N1) * 2 + 2
        sumLength = sumLength + lenDicRR[header]
        
        if i != len(path) - 1:
            for eachnext in G.graphNodesList[index].listOfNextNodes:
                if eachnext[0] == path[i + 1]:
                    overlapLength = overlapLength + eachnext[1]
                    break 
    print sumLength, overlapLength, sumLength - overlapLength
Example #32
0
def checkPathLength(path, G, N1, folderName):
    # Print diagnostic length information for a path through the
    # contig/read string graph: the sum of node (read) lengths, the total
    # overlap between consecutive nodes, and the net spanned length.
    #
    # path       : list of node indices; read node i maps to header
    #              "Read<(i-N1)/2>_p" (even offset) or "_d" (odd offset)
    # G          : seqGraph whose listOfNextNodes entries hold [node, overlap]
    # N1         : number of (doubled) contig nodes preceding the read nodes
    # folderName : folder containing phasingSeedName_Double.fasta

    lenDicRR = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta")
    sumLength = 0
    overlapLength = 0
    for index, i in zip(path, range(len(path))):
        # Recover the read header from the node index by parity.
        header = "Read" + str((index - N1) / 2) + "_"
        if (index - N1) % 2 == 0:
            header = header + "p"
        else:
            header = header + "d"
        print "lenDicRR[header], ", lenDicRR[header], header
        print(index - N1) * 2 + 1, (index - N1) * 2 + 2
        sumLength = sumLength + lenDicRR[header]

        if i != len(path) - 1:
            # Find the overlap weight of the edge to the next node in path.
            for eachnext in G.graphNodesList[index].listOfNextNodes:
                if eachnext[0] == path[i + 1]:
                    overlapLength = overlapLength + eachnext[1]
                    break
    print sumLength, overlapLength, sumLength - overlapLength
Example #33
0
def abunSplitAdvResolve(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    """
    Advanced abundance-based repeat resolution pipeline.

    Steps:
    1) Load the contig/read graph and perform graph surgery
       (transitive reduction, double-pointer removal).
    2) Bipartite resolution using the abundance counts.
    3) X-node resolution.
    4) Read the resolved contigs out of the graph.
    """

    doubledLenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    contigNodeCount = len(doubledLenDic)

    # Surgery, then B-resolution, then X-resolution on the evolving graph.
    resolvedGraph = graphSurgery(myCountDic, folderName, contigReadGraph, mummerLink, readsetFilename, contigFilename)
    resolvedGraph = BResolution(resolvedGraph, folderName, contigReadGraph, contigNodeCount, myCountDic, doubledLenDic)
    XResolution(folderName, contigReadGraph, resolvedGraph, myCountDic, doubledLenDic, contigNodeCount)

    # Emit the final contigs from the resolved graph.
    readContigForAbunSplit(folderName, mummerLink, contigFilename, readsetFilename, contigNodeCount, contigReadGraph)
Example #34
0
def generateGapContentLookup(folderName,
                             mummerLink,
                             oldResolvedList,
                             contigReadGraph,
                             contigFilename,
                             readsetFilename,
                             mapDummyToRealDic={}):
    gapContentLookUpList = []

    contigLenDic = IORobot.obtainLength(folderName, contigFilename + ".fasta")
    N1 = len(contigLenDic) * 2

    resolvedList = []

    print "mapDummyToRealDic", mapDummyToRealDic
    for eachmatchpair in oldResolvedList:
        tmpList = []

        if eachmatchpair[0] >= N1:
            tmpList = tmpList + mapDummyToRealDic[str(eachmatchpair[0] -
                                                      N1)][1]
        else:
            tmpList.append(eachmatchpair[0])

        if eachmatchpair[-1] >= N1:
            tmpList = tmpList + mapDummyToRealDic[str(eachmatchpair[-1] -
                                                      N1)][1]
        else:
            tmpList.append(eachmatchpair[-1])

        for ii in range(len(tmpList) - 1):
            resolvedList.append([tmpList[ii], tmpList[ii + 1]])

    gapContentLookUpList = abunGraphLib.parallelGapLookUp(
        resolvedList, folderName, N1, mummerLink, contigReadGraph,
        contigFilename, readsetFilename)

    return gapContentLookUpList
Example #35
0
def viewLenDic():
	
	folderName = "Apr10Test/"
	json_data = open(folderName + "myCountDic.json", 'r')
	myCountDic = json.load(json_data)
	
	contigLenDic = IORobot.obtainLength(folderName,  "LC_n.fasta")
	
	toPlotListX = []
	toPlotListY = []
	
	for eachitem in contigLenDic:
		toPlotListX.append(myCountDic[eachitem])
		toPlotListY.append(contigLenDic[eachitem])
	
	print toPlotListX, toPlotListY
	
	
	with open(folderName + "toPlotListX.json", 'w') as f:
		json.dump(toPlotListX, f)
		
	
	with open(folderName + "toPlotListY.json", 'w') as f:
		json.dump(toPlotListY, f)
Example #36
0
def test1():
	lenDic = {}
	coverageDic = {}
	
	lenDic = IORobot.obtainLength("/Users/kakitlam/", "abun.fasta")
	
	f = open("/Users/kakitlam/Documents/abundata", 'r')
	tmp = f.readline()
	
	while len(tmp) > 0:
		if len(tmp) > 10:
			myitem = tmp[0:-1].split()
			coverageDic[myitem[0]] = float(myitem[1])
		tmp = f.readline()
	
	f.close()
	
	myList = []
	baseCt = {}
	
	for eachitem in lenDic:
		myList.append(lenDic[eachitem]*coverageDic[eachitem])
		baseCt[eachitem] = lenDic[eachitem]*coverageDic[eachitem]
	
	
	for eachitem in lenDic :
		print eachitem,  baseCt[eachitem]
	
	
	
	for eachitem in lenDic :
		print eachitem, lenDic[eachitem]
	
	
	for eachitem in lenDic :
	        print eachitem, coverageDic[eachitem]
Example #37
0
def outputResults(folderName, mummerLink, toPhaseList, N1, G):
    '''    
    Algorithm :
    a) Write as contigs 
    b) Add back reverse complement 
    c) Create G2 as the readOut part 
    d) Output the contigs by a function call

    '''
    # a) 
    # Concatenate the doubled contigs with the phasing seed reads into one
    # fasta, renumbering reads as Contig<readIndex + N1/2> so contig and
    # read records share a single namespace.
    combinedName = "contigAndRead_Double.fasta"
    os.system("cp " + folderName + "improved3_Double.fasta " + folderName + combinedName)
    
    fout = open(folderName + combinedName, 'a')
    fin = open(folderName + "phasingSeedName_Double.fasta", 'r')

    tmp = fin.readline().rstrip()
    while len(tmp) > 0:
        if tmp[0] != ">":
            fout.write(tmp + "\n")
        else:
            # Header lines look like ">Read<k>_<p|d>"; shift k past the contigs.
            infoArr = tmp[5:].split("_")
            fout.write(">Contig" + str(int(infoArr[0]) + N1 / 2))
            fout.write("_" + infoArr[1] + "\n")
        tmp = fin.readline().rstrip()
        
    fin.close()
    fout.close()

    # b)
    # For each phased item, also add its reverse complement version.
    # eachsub + pow(-1, eachsub) maps an even node 2k to 2k+1 and an odd
    # node 2k+1 to 2k, i.e. to its reverse-complement partner; traversal
    # order is reversed with [-1::-1].
    '''
    [28], [[2, 690, 28], [6, 126, 28], [28, 212, 0], [28, 216, 4]], 1
    
    [2 , 690, 28, 212, 0]
    '''
    
    completePhaseList = []
    for eachitem in toPhaseList:
        repeat = eachitem[-3]
        flanking = eachitem[-2]
        result = eachitem[-1]
        
        
        revrepeat = []
        for eachsub in eachitem[-3][-1::-1]:
            revrepeat.append(eachsub + pow(-1, eachsub))
            
        # In/out flanking paths swap roles (0,1 <-> 2,3) under reversal.
        revflanking = [[] for i in range(4)] 
        
        for j in range(2):
            for eachsub in eachitem[-2][j + 2][-1::-1]:
                revflanking[j].append(eachsub + pow(-1, eachsub))
            for eachsub in eachitem[-2][j][-1::-1]:
                revflanking[j + 2].append(eachsub + pow(-1, eachsub))
            
        revresult = eachitem[-1]
        
        completePhaseList.append([repeat, flanking, result])
        completePhaseList.append([revrepeat, revflanking, revresult])
    
    print "completePhaseList", completePhaseList
    # c) 
    # Build G2: phased paths are threaded through freshly duplicated
    # interior nodes so the two phases stay separate; nameDic maps every
    # G2 node index back to its original node index in G.
    G2 = graphLib.seqGraph(N1)
    nameDic = {}
    for i in range(N1):
        nameDic[i] = i
        
    for eachitem in completePhaseList:
        repeat, flanking, result = eachitem[0] , eachitem[1] , eachitem[2]
        path = [[], []]
        
        # result selects which out-flank pairs with which in-flank.
        if result == 0:
            path[0] = flanking[0][0:-1] + repeat + flanking[2][1:]
            path[1] = flanking[1][0:-1] + repeat + flanking[3][1:]
        else:
            path[0] = flanking[0][0:-1] + repeat + flanking[3][1:]
            path[1] = flanking[1][0:-1] + repeat + flanking[2][1:]
        
        print path[0] , path[1]
        for i  in range(2):
            eachpath = path[i]
            currentNode = G2.graphNodesList[eachpath[0]]
            
            for nextNodeIndex, ctr in zip(eachpath[1:], range(len(eachpath[1:]))):
                if ctr != len(eachpath[1:]) - 1:
                    # Interior node: duplicate it so phases don't merge.
                    myindex = len(G2.graphNodesList)
                    nameDic[myindex] = nextNodeIndex
                    
                    newNode = graphLib.seqGraphNode(myindex)
                    G2.graphNodesList.append(newNode)
                else:
                    # Last node: reuse the original terminal node.
                    newNode = G2.graphNodesList[nextNodeIndex]
                    
                # Carry over the overlap weight of the edge from G.
                wt = 0
                for eachck in G.graphNodesList[nameDic[currentNode.nodeIndex]].listOfNextNodes:
                    if eachck[0] == nextNodeIndex:
                        wt = eachck[1]
                        break
                    
                newNode.listOfPrevNodes.append([currentNode.nodeIndex, wt])
                currentNode.listOfNextNodes.append([newNode.nodeIndex, wt])
                
                currentNode = newNode
                
    # d) Condense linear chains, persist, and read the contigs out.
    graphFileName = "phaseGraphFinal"
    G2.condense()
    G2.saveToFile(folderName, graphFileName)
    
    IORobot.readContigOut(folderName, mummerLink, graphFileName, combinedName, "improved4.fasta", "outOpenListphaing", nameDic)
Example #38
0
def formReadContigStringGraph(folderName,
                              mummerLink,
                              contigFilename,
                              readsetFilename,
                              optTypeFileHeader,
                              graphName,
                              needAlignment=True):
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm: 
        a) Form double reads and contigs                            V
        b) Mummer the data and extract dataList three times         V
        c) Use the subroutine to output a graph                     V
        d) Output the graph to a file phasing_String_graph.graph    V

    needAlignment : when False, skip the MUMmer runs and reuse existing
                    alignment output files.
    '''

    G = []

    # a) Append reverse complements ("_Double" files) for contigs and reads.
    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta",
                                contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta",
                                readsetFilename + "_Double.fasta", "reads")

    # b1) contig-vs-contig (CC) alignment.
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta"

    #if needAlignment:
    #    alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    if needAlignment:
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName, [[header, referenceFile, queryFile, ""]],
            houseKeeper.globalParallel)

    lenDicCC = IORobot.obtainLength(folderName,
                                    contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)

    # b2) read-vs-read (RR) alignment; can be disabled globally.
    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta"

    lenDicRR = IORobot.obtainLength(folderName,
                                    readsetFilename + "_Double.fasta")

    if not abunHouseKeeper.abunGlobalRRDisable:
        if needAlignment:
            alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                              header)

        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)

    else:
        dataListRR = []

    # b3) contig-vs-read (CR) alignment.
    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    if needAlignment:
        alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                          header)

    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)

    # c) Build the graph: contig nodes occupy [0, N1), reads [N1, N1 + N2).
    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    
    '''

    addDataToList(dataListCC, G, 0, 0, 'C', 'C')

    addDataToList(dataListRR, G, N1, N1, 'R', 'R')

    addDataToList(dataListCR, G, 0, N1, 'C', 'R')

    # d) Augment with extra edges, then persist the graph.
    Gnew = formExtraEdges(folderName, optTypeFileHeader, contigFilename, G, N1)

    Gnew.saveToFile(folderName, graphName)

    print "len(Gnew.graphNodesList)", len(Gnew.graphNodesList)
Example #39
0
def singleGapLookUp(eachmatchpair, folderName, N1, mummerLink, contigReadGraph, contigFilename, readsetFilename):
    # Resolve the gap between two matched contigs: find a contig-free read
    # path bridging them in the contig/read graph, then stitch the
    # overlapping reads into the gap-filling sequence.
    #
    # Returns [leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent]
    # where leftEnd is the usable prefix length of the left segment,
    # rightStart is always 0, and middleContent is the bridging sequence.

    print eachmatchpair
    leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent = eachmatchpair[0], eachmatchpair[-1], 0, 0, ""

    succReadsList = abunGraphLib.findPathBtwEnds(folderName, leftCtgIndex, rightCtgIndex, contigReadGraph, N1)

    # NOTE(review): the findPathBtwEnds result above is immediately
    # discarded and recomputed via findAllPathK below -- confirm whether
    # the first call is still needed or is dead code.
    succReadsList = []
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    # Enumerate candidate paths up to depth 5 between the two contigs.
    allPaths = abunGraphLib.findAllPathK(leftCtgIndex, rightCtgIndex, G, 5)
    # shuffle(allPaths)

    print "allPaths", allPaths

    # Keep only paths whose interior nodes are all reads (index >= N1).
    possibleList = []
    for p in allPaths:
        noContig = True
        for pp in p[1:-1]:
            if pp < N1:
                noContig = False
        if noContig == True:
            possibleList.append(p)
    print "possibleList", possibleList

    # Pick the shortest contig-free path.
    minListLen = 1000
    for p in possibleList:
        if len(p) < minListLen:
            succReadsList = p
            minListLen = len(p)

    # Drop the contig endpoints, keeping only the bridging reads.
    if len(succReadsList) > 0:
        succReadsList.pop(0)
        succReadsList.pop(-1)
    else:
        print "interesting item for future study"

    print "succReadsList", succReadsList

    if len(succReadsList) == 0:
        # No bridging reads: the two contigs overlap directly.
        contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, "C", N1)
        leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)

        contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, "C", N1)
        rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)

        overlap = IORobot.alignWithName(
            leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex)
        )

        print "overlap contig : ", overlap

        leftEnd = len(leftSeg) - overlap[0]
        middleContent = ""

    else:
        # Chain: left contig -> bridging reads -> right contig, trimming
        # each pairwise overlap from the left-hand segment.

        contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, "C", N1)
        print contigName
        leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)

        readName = abunHouseKeeper.parseIDToName(succReadsList[0], "R", N1)
        print readName
        rightSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)

        overlap = IORobot.alignWithName(
            leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex)
        )

        print "overlap start read : ", overlap

        leftEnd = len(leftSeg) - overlap[0]

        middleContent = ""

        # Append each read minus its overlap with the following read.
        for i in range(len(succReadsList) - 1):
            readName = abunHouseKeeper.parseIDToName(succReadsList[i], "R", N1)
            leftSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)

            readName = abunHouseKeeper.parseIDToName(succReadsList[i + 1], "R", N1)
            rightSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)

            overlap = IORobot.alignWithName(
                leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex)
            )
            print "overlap middle read : ", overlap
            middleContent = middleContent + leftSeg[0 : len(leftSeg) - overlap[0]]

        # Final read trimmed against the right contig.
        readName = abunHouseKeeper.parseIDToName(succReadsList[-1], "R", N1)
        leftSeg = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)

        contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, "C", N1)
        rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)

        overlap = IORobot.alignWithName(
            leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex)
        )
        print "overlap end read : ", overlap

        middleContent = middleContent + leftSeg[0 : len(leftSeg) - overlap[0]]

    return [leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent]
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName):
    
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm: 
        a) Form double reads and contigs                            V
        b) Mummer the data and extract dataList three times         V
        c) Use the subroutine to output a graph                     V
        d) Output the graph to a file phasing_String_graph.graph    V
    '''

    G = []

    # a) Append reverse complements ("_Double" files) for contigs and reads.
    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads")
    
    
    # b1) contig-vs-contig (CC) alignment.
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta" , contigFilename + "_Double.fasta"
    if True:
        alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)

    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)
    
    # b2) read-vs-read (RR) alignment; can be disabled globally.
    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta" , readsetFilename + "_Double.fasta"
    
    
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")
    
    if not abunHouseKeeper.abunGlobalRRDisable:
        if True:
            alignerSubRoutine(folderName ,referenceFile,  queryFile, mummerLink, header )
    
        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        # NOTE(review): hard-coded debug trace for a specific read pair
        # (Read164_p / Read159_p) -- candidate for removal.
        for eachitem in dataListRR:
            if eachitem[-2] == "Read164_p" and eachitem[-1] == "Read159_p" :    
                print "debug" , eachitem
            if eachitem[-1] == "Read164_p" and eachitem[-2] == "Read159_p" :    
                print "debug" , eachitem
            
        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)

    else:
        dataListRR = []
    
    # b3) contig-vs-read (CR) alignment.
    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta" , readsetFilename + "_Double.fasta"
    if True:
        alignerSubRoutine(folderName ,referenceFile,  queryFile, mummerLink, header )
        #alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
            
    # c) Build the graph: contig nodes occupy [0, N1), reads [N1, N1 + N2).
    numberOfNodes = len(lenDicCR) 
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    
    '''
    
    # print dataListCC[0]
    # print dataListRR[0]
    # print dataListCR[0]
    
    # for eachitem in dataListCC:
    #    print eachitem
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    # for eachitem in dataListRR[0:10]:
    #    print eachitem , lenDicRR[eachitem[-2]], lenDicRR[eachitem[-1]]
    
    addDataToList(dataListRR, G, N1, N1, 'R', 'R')
    
    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # G.reportEdge()
    # d) Persist the graph and report diagnostics.
    G.saveToFile(folderName, graphName)
    
    checkGraphLength(G, N1, lenDicRR)
    
    # print len(G.graphNodesList[0].listOfPrevNodes), len(G.graphNodesList[0].listOfNextNodes)
    print "len(G.graphNodesList)", len(G.graphNodesList)
    
    
    
Example #41
0
def performPhasing(folderName, mummerLink):
    print "performPhasing"
    '''
    1. Interface from alignmentBridge.py : 
        shortToLongMap = formRelatedMap(f2, noisyReads, currentNode, indelRobot, toProcessList)
        cleaner.cleaning([noisyReads,noisyReads] ,shortToLongMap, toProcessList,indelRobot, "init")
        in1List, in2List, out1List, out2List, commonList, longReadToUse  = cleaner.cleaning([noisyReads, noisyReads],shortToLongMap, toProcessList,indelRobot, "vote")
        extendResult = extender.readExtender(in1List, in2List, out1List, out2List, commonList,indelRobot,longReadToUse, True)
    
    2. Format of input data data : 
        bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList])
    
    3. IO : 
        a) Input :
            repeatSpecification.txt, phasingSeedName_Double.fasta, graph G 
        b) Output :
            improved4.fasta
            
    3. Algorithm: 
        a) reformatNoisyReads 
        b) reformatToProcessList
        c) formShortToLongMapping
    
    '''

    # Load the repeat specification and the phasing string graph.
    json_data = open(folderName + 'repeatSpecification.txt', 'r')
    loadData = json.load(json_data)
    
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, "phaseStringGraph1")
    
    lenDicRR = IORobot.obtainLength(folderName, "phasingSeedName_Double.fasta")
    
    lenDicCC = IORobot.obtainLength(folderName, "improved3_Double.fasta")
    N1 = len(lenDicCC)
    
    # Combined length lookup over contigs and reads.
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    
    # Drop reverse-complement duplicates of the repeat records.
    loadData = filterReverseComp(loadData, N1)
    
    toPhaseList = []
    
    if True:
        # For each repeat record, reformat its reads/paths, clean, vote,
        # and attempt an extension; keep only successful (!= -1) results.
        for eachitem in loadData:
            # print eachitem
            flankingList, repeatList, repeatPathway, flankingPathsList = eachitem[0], eachitem[1], eachitem[2], eachitem[3] 
            
            noisyReads, dicToOriginal, dicFromOriginal = reformatNoisyReads(folderName, flankingList, repeatList, N1)
            
            toProcessList = reformatToProcessList(folderName , flankingList, repeatList, dicFromOriginal, N1)
    
            shortToLongMap = formShortToLongMapping(folderName, G, toProcessList, dicFromOriginal, dicToOriginal, lenDicCR, N1)
            
            indelRobot = createIndelRobot(folderName)
            
            cleaner.cleaning([noisyReads, noisyReads] , shortToLongMap, toProcessList, indelRobot, "init")
            in1List, in2List, out1List, out2List, commonList, longReadToUse = cleaner.cleaning([noisyReads, noisyReads], shortToLongMap, toProcessList, indelRobot, "vote")
            extendResult = extender.readExtender(in1List, in2List, out1List, out2List, commonList, indelRobot, longReadToUse, True)
            
            if extendResult != -1:
                print "extendResult: ", extendResult
                toPhaseList.append(eachitem + [extendResult])
            
        with open(folderName + 'toPhaseList.txt', 'w') as outfile:
            json.dump(toPhaseList, outfile)

    # Reload from disk (allows rerunning the output stage standalone).
    json_data = open(folderName + 'toPhaseList.txt', 'r')
    toPhaseList = json.load(json_data)
    
    outputResults(folderName, mummerLink, toPhaseList, N1, G)
def resolvingTandem(
    folderName, mummerPath, contigReadGraph, contigFilename, readsetFilename, optTypeFileHeader, repeatSpec
):
    # Resolve tandem repeats: locate a loop in the contig-read string graph,
    # rebuild the repeat unit from read overlaps, estimate the copy count from
    # read coverage, then join the flanking contigs with the inferred repeat
    # content and write the result to tademResolved.fasta.
    print "resolvingTandem"
    """
    Input : repeat info 
    Output : count, join. 
    
    Algorithm: 
    1. Find loops
    2. Form repeat
    3. Form chain of repeat copies back to back
    4. Align reads
    5. Calculate extra bases beyond flanking region
    6. Calculate count
    7. Join the contigs
    """
    # 0 ) Load all the data
    # Overlaps are indexed only when the alignment's field[2] (query-start
    # coordinate, per the mummer row format used elsewhere in this file) is
    # below thres, i.e. the hit is anchored at the start of the query.
    thres = 5

    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    # N1: number of (doubled) contig entries; read node ids start at N1.
    N1 = len(lenDicCC)

    # maxDuplicate copies of the repeat unit are concatenated into the template.
    maxDuplicate = 10
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"

    myContigsDic = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")

    # Read-to-read overlap lengths (field 4) indexed by "readA;readB".
    header = optTypeFileHeader + "RR"
    dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
    dataListRRDic = {}
    for eachitem in dataListRR:
        if eachitem[2] < thres:
            dataListRRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4]

    # Contig-to-read overlap lengths indexed by "contig;read".
    header = optTypeFileHeader + "CR"
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())

    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    dataListCRDic = {}
    for eachitem in dataListCR:
        if eachitem[2] < thres:
            dataListCRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4]

    print dataListCRDic

    json_data = open(folderName + repeatSpec, "r")
    loadData = json.load(json_data)

    contigsTmp = IORobot.loadContigsFromFile(folderName, contigFilename + "_Double.fasta")
    readTmp = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")

    # contigName -> repeat content to splice in at the join step (step 7).
    happyTandemList = {}

    for eachrepProfile in loadData:
        # 1) Walk the graph from the repeat's start contig until a node repeats.
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)

        # 2) Extract the loop (tandem path): the suffix of the DFS path that
        #    starts at the first occurrence of the terminal node.
        if isTerminate:
            v = returnPathList[-1]
            i = 0
            tandemPath = []
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    i = len(returnPathList)
                i = i + 1

            print returnPathList
            print tandemPath
        # 3) [fix it when have time later ; to just use graph; bug at the min thing]

        repeatContent = ""

        # Stitch one copy of the repeat unit from consecutive reads on the
        # loop, trimming each read by its overlap with the next one.
        for kk in range(len(tandemPath[0:-1])):
            eachitem = tandemPath[kk] - N1
            nextitem = tandemPath[kk + 1] - N1
            readName = "Read" + str(eachitem / 2) + "_"
            nextReadName = "Read" + str(nextitem / 2) + "_"
            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"

            if nextitem % 2 == 0:
                nextReadName = nextReadName + "p"
            elif nextitem % 2 == 1:
                nextReadName = nextReadName + "d"

            overlap = dataListRRDic[readName + ";" + nextReadName]
            print overlap
            repeatContent = repeatContent + myContigsDic[readName][0:-overlap]

        print "len(repeatContent)", len(repeatContent)

        # Write a template made of maxDuplicate back-to-back repeat copies.
        fout = open(folderName + repeatTempFilename, "w")
        fout.write(">RepeatSegment\n")
        repeatContentLarge = ""

        for i in range(maxDuplicate):
            fout.write(repeatContent)
            repeatContentLarge = repeatContentLarge + repeatContent
        fout.close()

        # 4) Align the reads associated with this repeat to the template.
        repeatReadList = eachrepProfile[1]

        myList = []
        for eachitem in repeatReadList:

            readName = "Read" + str((eachitem - N1) / 2) + "_"

            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"
            myList.append(readName)

        IORobot.putListToFileO(folderName, readsetFilename + "_Double.fasta", "toAlignReads", myList)

        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName, mummerFile, repeatTempFilename, "toAlignReads.fasta")

        dataList = alignerRobot.extractMumData(folderName, mummerFile + "Out")

        # 5) Sum each read's best match length (field 5) to estimate the total
        #    number of repeat bases covered by the reads.
        totalBasesMatch = 0
        lrepeat = len(repeatContent)
        c = findCoverageFromRawData(folderName)

        # print "dataList[0]", dataList[0]
        dataList.sort(key=itemgetter(-1))
        for key, values in groupby(dataList, itemgetter(-1)):
            maxValue = -1
            for eachsub in values:
                if eachsub[5] > maxValue:
                    maxValue = eachsub[5]

            # print key, maxValue
            totalBasesMatch = totalBasesMatch + maxValue

        print c, lrepeat, totalBasesMatch
        # ct: estimated number of repeat copies (matched bases normalized by
        # coverage and repeat length).
        ct = totalBasesMatch * 1.0 / (c * lrepeat)
        print "BIG NUMBER of THE DAY: ", ct

        # 6)
        # a) find the starting point
        startContig = eachrepProfile[-1][0][0]
        firstRead = eachrepProfile[-1][0][1] - N1

        contigName = "Contig" + str(startContig / 2)
        if startContig % 2 == 0:
            contigName = contigName + "_p"
        elif startContig % 2 == 1:
            contigName = contigName + "_d"

        readName = "Read" + str(firstRead / 2)
        if firstRead % 2 == 0:
            readName = readName + "_p"
        elif firstRead % 2 == 1:
            readName = readName + "_d"

        overlapFirst = dataListCRDic[contigName + ";" + readName]
        tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]

        f1 = open(folderName + "firstOverlap.fasta", "w")
        f1.write(">combined\n")
        f1.write(tmpCombine)
        f1.close()

        if True:
            alignerRobot.useMummerAlign(
                mummerPath, folderName, "myFirstOverlap", repeatTempFilename, "firstOverlap.fasta"
            )

        dataList = alignerRobot.extractMumData(folderName, "myFirstOverlap" + "Out")

        # The alignment with the longest match (field 5) anchors where the
        # repeat starts within the template and where the contig ends.
        dataList.sort(key=itemgetter(0))
        maxVal = -1
        maxItm = []
        for eachi in dataList:
            if eachi[5] > maxVal:
                maxVal = eachi[5]
                maxItm = eachi

        print maxItm
        if len(maxItm) > 0:
            repeatStart = maxItm[0]
            contigEnd = maxItm[2]
        else:
            repeatStart = 0
            contigEnd = -1
        # b) format return : prepare the repeat template
        print "ct*lrepeat", int(repeatStart + ct * lrepeat)
        print "repeatStart", repeatStart
        happyTandemList[contigName] = repeatContentLarge[repeatStart : int(repeatStart + ct * lrepeat)]
        contigsTmp[contigName] = tmpCombine[0:contigEnd]
        print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
        print "len(happyTandemList[contigName])", len(happyTandemList[contigName])

    # 7) Combine all the repeat information and do the join

    # Leader pointers: chained contigs are merged into their leader's record
    # so each chain is emitted once.
    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]

        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)

        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]

        if endContig != leaderContig:
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig

    leaderAgg = [[] for i in range(len(leaderList))]
    for i in range(len(leaderList)):
        leaderAgg[leaderList[i]].append(i)

    checkingList = [False for i in range(N1)]

    fout = open(folderName + "tademResolved.fasta", "w")

    # Emit each merged group exactly once, marking all members as written.
    counter = 0
    for eachcontig in contigsTmp:
        id = abunHouseKeeper.parseEdgeNameToID(eachcontig, "C")
        if checkingList[id / 2] == False:

            fout.write(">Segkk" + str(counter) + "\n")

            fout.write(contigsTmp[eachcontig])
            counter = counter + 1
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk / 2] = True

    fout.close()
# Example #43
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph,
                   repeatFilename, optionToRun):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } 
    Algorithm: 
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    '''

    # ## (a) reachability test to find partners
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # G.reportEdge()
    lenDicCC = IORobot.obtainLength(folderName,
                                    contigFilename + "_Double.fasta")

    adjacencyList = [[] for i in range(len(lenDicCC))]

    N1 = len(lenDicCC)

    # # Debug
    # for i in range(14):
    #    debugGraphPath(i, 2, G, N1)
    # # End Debug

    for i in range(len(lenDicCC)):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G)
        print "i, adjacencyList[i] : ", i, adjacencyList[i]

    # ## (b) formation of bipartite graph
    if optionToRun == "tandem":
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase":
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName,
                                                   contigFilename)

    G2 = abunGraphLib.seqGraphWt(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    clusters = G2.findConnectedComponents()

    repeatList = []
    for eachitem in clusters:
        leftList, rightList = [], []
        for eachsubitem in eachitem:
            if eachsubitem % 2 == 0:
                leftList.append(eachsubitem)
            else:
                rightList.append(eachsubitem)

        repeatList.append([
            abunHouseKeeper.getDistinct(leftList),
            abunHouseKeeper.getDistinct(rightList)
        ])

    with open(folderName + repeatFilename, 'w') as outfile:
        json.dump(repeatList, outfile)

    json_data = open(folderName + repeatFilename, 'r')
    loadData = json.load(json_data)

    assert (loadData == repeatList)
def formReadContigStringGraph(folderName, mummerLink, contigFilename,
                              readsetFilename, optTypeFileHeader, graphName):
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm: 
        a) Form double reads and contigs                            V
        b) Mummer the data and extract dataList three times         V
        c) Use the subroutine to output a graph                     V
        d) Output the graph to a file phasing_String_graph.graph    V
    '''

    G = []

    # a) Form the "_Double" versions of both contigs and reads
    #    (see IORobot.writeToFile_Double1 for the exact doubling scheme).
    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta",
                                contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta",
                                readsetFilename + "_Double.fasta", "reads")

    # b) Run mummer three times: contig-contig (CC), read-read (RR),
    #    contig-read (CR), filtering each alignment list.
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta"
    if True:
        alignerRobot.useMummerAlign(mummerLink, folderName, header,
                                    referenceFile, queryFile)

    lenDicCC = IORobot.obtainLength(folderName,
                                    contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)

    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta"

    lenDicRR = IORobot.obtainLength(folderName,
                                    readsetFilename + "_Double.fasta")

    # The expensive read-read alignment can be globally disabled via
    # abunHouseKeeper.abunGlobalRRDisable.
    if not abunHouseKeeper.abunGlobalRRDisable:
        if True:
            alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                              header)

        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        # Debug trace for one specific read pair.
        for eachitem in dataListRR:
            if eachitem[-2] == "Read164_p" and eachitem[-1] == "Read159_p":
                print "debug", eachitem
            if eachitem[-1] == "Read164_p" and eachitem[-2] == "Read159_p":
                print "debug", eachitem

        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)

    else:
        dataListRR = []

    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    if True:
        alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                          header)
        #alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)

    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)

    # c) Build the combined graph: contig nodes occupy ids [0, N1),
    #    read nodes ids [N1, N1 + N2).
    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    
    '''

    # print dataListCC[0]
    # print dataListRR[0]
    # print dataListCR[0]

    # for eachitem in dataListCC:
    #    print eachitem
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    # for eachitem in dataListRR[0:10]:
    #    print eachitem , lenDicRR[eachitem[-2]], lenDicRR[eachitem[-1]]

    addDataToList(dataListRR, G, N1, N1, 'R', 'R')

    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # G.reportEdge()
    # d) Persist the graph and sanity-check its edge lengths.
    G.saveToFile(folderName, graphName)

    checkGraphLength(G, N1, lenDicRR)

    # print len(G.graphNodesList[0].listOfPrevNodes), len(G.graphNodesList[0].listOfNextNodes)
    print "len(G.graphNodesList)", len(G.graphNodesList)
# Example #45
def abunSplitWithXResolve(folderName, mummerLink, myCountDic, contigReadGraph, contigFilename, readsetFilename):
    # Split repeats using abundance-based matching (bi-resolve) combined with
    # X-node resolution, then assemble the resolved contigs.  The pipeline
    # stages below are hard-gated by `if False:` flags so individual stages
    # can be toggled/re-run during development; each stage persists its output
    # as JSON for the next one.
    N1 = len(myCountDic) * 2  # two graph nodes per contig in myCountDic
    print "N1", N1

    # Debug
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)

    adj = [[] for i in range(N1)]

    for i in range(N1):
        adj[i] = abunGraphLib.findAllReachable(i, N1, G)

    Gnew = graphLib.seqGraph(N1)

    for i in range(N1):
        for j in adj[i]:
            Gnew.insertEdge(i, j, 1)

    Gnew.reportEdge()
    # End Debug

    # Stage 1 (disabled): compute resolvedList from abundance matching plus
    # X-node resolution; persist resolvedList / mapDummyToRealDic as JSON.
    if False:
        json_data = open(folderName + "phaseRepeat.txt", "r")
        repeatPairs = json.load(json_data)
        repeatPairs = obtainNonEmpty(repeatPairs)

        biResolvedCombineList = []
        for eachitem in repeatPairs:
            inList, outList = eachitem[0], eachitem[1]
            resolvedList = determindMatch(inList, outList, myCountDic, folderName, contigReadGraph, N1)

            biResolvedCombineList += resolvedList

        ### Xnode repeatResolution
        xResolvedList, mapDummyToRealDic = xNodeResolving(folderName, contigReadGraph)

        ### Combine resolution
        resolvedList = xResolvedList + biResolvedCombineList
        resolvedList = abunHouseKeeper.getDistinct(resolvedList)
        print "resolvedList, len(resolvedList),len(xResolvedList), len(biResolvedCombineList) ", resolvedList, len(
            resolvedList
        ), len(xResolvedList), len(biResolvedCombineList)

        with open(folderName + "resolvedList.json", "w") as f:
            json.dump(resolvedList, f)

        with open(folderName + "mapDummyToRealDic.json", "w") as f:
            json.dump(mapDummyToRealDic, f)

    # Stage 2 (disabled): compute gap-filling content between resolved contig
    # pairs and persist it keyed by "start_end".
    if False:
        json_data = open(folderName + "resolvedList.json", "r")
        resolvedList = json.load(json_data)

        json_data = open(folderName + "mapDummyToRealDic.json", "r")
        mapDummyToRealDic = json.load(json_data)

        gapContentLookUpList = []
        gapContentLookUpList = generateGapContentLookup(
            folderName, mummerLink, resolvedList, contigReadGraph, contigFilename, readsetFilename, mapDummyToRealDic
        )
        gapContentLookUpDic = {}
        gapContentLookUpList.sort()

        for eachitem in gapContentLookUpList:
            gapContentLookUpDic[str(eachitem[0]) + "_" + str(eachitem[1])] = [eachitem[2], eachitem[3], eachitem[4]]
            print eachitem[2:4], len(eachitem[4])

        with open(folderName + "gapContentLookUpDic.json", "w") as f:
            json.dump(gapContentLookUpDic, f)

    # Stage 3 (disabled): build the resolved graph (real nodes + dummy nodes),
    # condense it, and save it.
    if False:
        json_data = open(folderName + "resolvedList.json", "r")
        resolvedList = json.load(json_data)

        json_data = open(folderName + "mapDummyToRealDic.json", "r")
        mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(N1 + len(mapDummyToRealDic))
        addEdges(G, resolvedList)
        G.condense()

        G.saveToFile(folderName, "xResolvedGraph")

    # Stage 4 (disabled): append the dummy segments to a copy of the contig
    # file and extract the final contigs from the condensed graph.
    if False:
        json_data = open(folderName + "mapDummyToRealDic.json", "r")
        mapDummyToRealDic = json.load(json_data)

        G = graphLib.seqGraph(0)
        G.loadFromFile(folderName, "xResolvedGraph")

        json_data = open(folderName + "gapContentLookUpDic.json", "r")
        gapContentLookUpDic = json.load(json_data)

        print "Final step: really hacking a file"
        os.system("cp " + folderName + contigFilename + "_Double.fasta " + folderName + "tmpWithDummy.fasta")
        contigList = IORobot.readContigsFromFile(folderName, contigFilename + "_Double.fasta")

        f = open(folderName + "tmpWithDummy.fasta", "a")
        for i in range(len(mapDummyToRealDic)):
            id = mapDummyToRealDic[str(i)]
            f.write(">SegDum" + str(i) + "\n")
            f.write(contigList[id] + "\n")
        f.close()

        IORobot.extractGraphToContigs(
            G, folderName, mummerLink, "abun.fasta", "tmpWithDummy.fasta", gapContentLookUpDic, mapDummyToRealDic
        )
# Example #46
def resolvingTandem(folderName, mummerPath, contigReadGraph,contigFilename, readsetFilename, optTypeFileHeader, repeatSpec):
    # Resolve tandem repeats: locate a loop in the contig-read string graph,
    # rebuild the repeat unit from read overlaps, estimate the copy count from
    # read coverage, then join the flanking contigs and write the result to
    # tademResolved.fasta.  (Duplicate listing of resolvingTandem above.)
    print "resolvingTandem"
    '''
    Input : repeat info 
    Output : count, join. 
    
    Algorithm: 
    1. Find loops
    2. Form repeat
    3. Form chain of repeat copies back to back
    4. Align reads
    5. Calculate extra bases beyond flanking region
    6. Calculate count
    7. Join the contigs
    '''
    # 0 ) Load all the data
    # Overlaps are indexed only when field[2] (query-start coordinate) < thres,
    # i.e. the alignment is anchored at the start of the query sequence.
    thres = 5 
    
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta")
    # N1: number of (doubled) contig entries; read node ids start at N1.
    N1 = len(lenDicCC)

    maxDuplicate = 10
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"
    


    myContigsDic = IORobot.loadContigsFromFile(folderName, readsetFilename+"_Double.fasta")    
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")
    
    # Read-to-read overlap lengths (field 4) indexed by "readA;readB".
    header = optTypeFileHeader + "RR"
    dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
    dataListRRDic = {}
    for eachitem in dataListRR: 
        if eachitem[2] < thres:
            dataListRRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    # Contig-to-read overlap lengths indexed by "contig;read".
    header = optTypeFileHeader + "CR"
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    dataListCRDic = {}
    for eachitem in dataListCR: 
        if eachitem[2] < thres:
            dataListCRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    print dataListCRDic



    json_data = open(folderName + repeatSpec, 'r')
    loadData = json.load(json_data)
    
    contigsTmp = IORobot.loadContigsFromFile(folderName, contigFilename+"_Double.fasta")
    readTmp = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")

    # contigName -> repeat content to splice in at the join step (step 7).
    happyTandemList = {}
    
    
    
    for eachrepProfile in loadData:
        # 1) Walk the graph from the repeat's start contig until a node repeats.
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)
       
        # 2) Extract the loop (tandem path): the suffix of the DFS path that
        #    starts at the first occurrence of the terminal node.
        if isTerminate:
            v = returnPathList[-1]
            i =0 
            tandemPath = []
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    i = len(returnPathList)
                i = i +1
                
            print returnPathList
            print tandemPath
        # 3) [fix it when have time later ; to just use graph; bug at the min thing]
        
        repeatContent = ""
    
        # Stitch one copy of the repeat unit from consecutive reads on the
        # loop, trimming each read by its overlap with the next one.
        for kk in range(len(tandemPath[0:-1])): 
            eachitem = tandemPath[kk]- N1
            nextitem = tandemPath[kk+1] - N1
            readName = "Read" + str(eachitem/2) + "_"
            nextReadName = "Read" + str(nextitem/2) + "_"
            if eachitem %2 ==0 :
                readName = readName + "p"
            elif eachitem %2 ==1:
                readName = readName + "d"
            
            if nextitem %2 ==0 :
                nextReadName = nextReadName + "p"
            elif nextitem %2 ==1:
                nextReadName = nextReadName + "d"
            
            overlap = dataListRRDic[readName + ";" + nextReadName]
            print overlap
            repeatContent = repeatContent +  myContigsDic[readName][0:-overlap]
            
        print "len(repeatContent)", len(repeatContent)
        
        # Write a template made of maxDuplicate back-to-back repeat copies.
        fout = open(folderName + repeatTempFilename, 'w')
        fout.write(">RepeatSegment\n")
        repeatContentLarge = ""
        
        for i in range(maxDuplicate):
            fout.write(repeatContent)
            repeatContentLarge= repeatContentLarge + repeatContent
        fout.close()
        
        # 4) Align the reads associated with this repeat to the template.
        repeatReadList =  eachrepProfile[1]
        
        myList= []
        for eachitem in repeatReadList:
            
            readName = "Read" + str((eachitem- N1)/2) + "_"
    
            if eachitem %2 ==0 :
                readName = readName + "p"
            elif eachitem %2 ==1:
                readName = readName + "d"
            myList.append(readName)
            
        IORobot.putListToFileO(folderName, readsetFilename+"_Double.fasta", "toAlignReads", myList)
        
        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName,mummerFile , repeatTempFilename, "toAlignReads.fasta")
        
        dataList = alignerRobot.extractMumData(folderName, mummerFile+"Out")
        
        
        # 5) Sum each read's best match length (field 5) to estimate the total
        #    number of repeat bases covered by the reads.
        totalBasesMatch = 0
        lrepeat = len(repeatContent)
        c = findCoverageFromRawData(folderName)
        
        
        # print "dataList[0]", dataList[0]
        dataList.sort(key = itemgetter(-1))
        for key, values in  groupby(dataList,itemgetter(-1)):
            maxValue = -1
            for eachsub in values:
                if eachsub[5] > maxValue:
                    maxValue = eachsub[5]
    
            #print key, maxValue
            totalBasesMatch = totalBasesMatch + maxValue
        
    
        print c, lrepeat, totalBasesMatch
        # ct: estimated number of repeat copies (matched bases normalized by
        # coverage and repeat length).
        ct = totalBasesMatch*1.0/(c*lrepeat)
        print "BIG NUMBER of THE DAY: ", ct
    
        # 6) 
        # a) find the starting point 
        startContig = eachrepProfile[-1][0][0]
        firstRead = eachrepProfile[-1][0][1]-N1

        contigName = "Contig"+ str(startContig/2)
        if startContig %2 == 0:
            contigName = contigName + "_p"
        elif startContig%2 ==1:
            contigName = contigName + "_d"
        
        readName = "Read"+ str(firstRead/2)
        if firstRead %2 == 0:
            readName = readName + "_p"
        elif firstRead%2 ==1:
            readName = readName + "_d"
        
        overlapFirst = dataListCRDic[contigName+";"+readName]
        tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]
        
        f1 = open(folderName + "firstOverlap.fasta", 'w')
        f1.write(">combined\n")
        f1.write(tmpCombine)
        f1.close()
        
        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName,"myFirstOverlap" , repeatTempFilename, "firstOverlap.fasta")
        
        dataList = alignerRobot.extractMumData(folderName, "myFirstOverlap"+"Out")
        
        # The alignment with the longest match (field 5) anchors where the
        # repeat starts within the template and where the contig ends.
        dataList.sort(key = itemgetter(0))
        maxVal = -1
        maxItm = []
        for eachi in dataList:
            if eachi[5] > maxVal:
                maxVal = eachi[5]
                maxItm = eachi
        
        print maxItm
        if len(maxItm) > 0 :
            repeatStart = maxItm[0]
            contigEnd = maxItm[2]
        else:
            repeatStart = 0
            contigEnd = -1
        # b) format return : prepare the repeat template 
        print "ct*lrepeat", int(repeatStart + ct*lrepeat)
        print "repeatStart", repeatStart
        happyTandemList[contigName]= repeatContentLarge[repeatStart:int(repeatStart + ct*lrepeat)]
        contigsTmp[contigName] = tmpCombine[0:contigEnd]
        print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
        print "len(happyTandemList[contigName])", len(happyTandemList[contigName])
        
    # 7) Combine all the repeat information and do the join
    
    # Leader pointers: chained contigs are merged into their leader's record
    # so each chain is emitted once.
    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]
        
        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)
        
        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]
        
        if endContig != leaderContig:
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig
        
    
    leaderAgg = [[] for i in range(len(leaderList))]
    for i in range(len(leaderList)):
        leaderAgg[leaderList[i]].append(i) 
    
    checkingList = [False for i in range(N1)]
    
    fout = open(folderName + "tademResolved.fasta", 'w')
    
    # Emit each merged group exactly once, marking all members as written.
    counter = 0
    for eachcontig in contigsTmp:
        id = abunHouseKeeper.parseEdgeNameToID(eachcontig, 'C')
        if checkingList[id/2] == False:
        
            fout.write(">Segkk"+str(counter)+ "\n")
            
            fout.write(contigsTmp[eachcontig])
            counter = counter + 1    
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk/2] = True
    
    fout.close()
def defineRepeatAndFlanking(folderName, mummerLink,contigFilename,contigReadGraph,repeatFilename,repeatSpec ):
    '''
    Input : 
V        a) String graph : G                
V        b) Repeat Pairing : repeatList     
        
    Output : 
V        a) chain of repeat indices (e.g. [S= R1, R33, R45, R24= E]) 
V        b) chain of flanking region indices for in1/2 out1/2 middle (e.g. [C1, R2, R4] )
V        c) in1/2 out1/2 and middle reads per repeat (e.g. [R1, R33, R45, R24])  
        
    Algorithm : 
V        1. Find repeat by graph operations
V        2. Find flanking region by graph operations
V        3. Find associated reads by graph operations
    '''
    
    print "defineRepeatAndFlanking: "


    
    
    # 0. Load previous data
    G = abunGraphLib.seqGraphWt(0)
    G.loadFromFile(folderName, contigReadGraph)
    Grev = abunGraphLib.formReverseGraph(G)
    
    json_data = open(folderName + repeatFilename, 'r')
    repeatList = json.load(json_data)
    
    lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta")
    N1 = len(lenDicCC)
    
    
    print "repeatList: ", repeatList
    print "len(G.graphNodesList)", len(G.graphNodesList)
     
    bigDumpList = []
    
    print "len(repeatList)", len(repeatList) , repeatList
    for r in repeatList:
        rIn, rOut = [], []
        for eachitem in r[0]:
            rIn.append(eachitem / 2)
        for eachitem in r[1]:
            rOut.append(eachitem / 2)
        
        if ( len(rIn) == 2 and len(rOut) == 2) or (len(rIn) == 1 and len(rOut) == 1):
            print rIn, rOut
            if  (len(rIn) == 1 and len(rOut) == 1):
                rIn = [rIn[0], rIn[0]]
                rOut = [rOut[0], rOut[0]]
            
            # 1. Records reachable indices
            kkIn , kkOut = [], []
            for eachkk in rIn:
                kkIn.append(str(eachkk)+"_"+"in")
            
            for eachkk in rOut:
                kkOut.append(str(eachkk)+"_"+"out")
                
            
            abunGraphLib.markReachableIndices(G, Grev, kkIn, kkOut, N1)
            
            # 2. Marks inside nodes
            singleMissList, allPassList = abunGraphLib.markInsideNodes(G, kkIn, kkOut)
            for i in range(4): 
                print "len(singleMissList[i]), len(allPassList)", len(singleMissList[i]), len(allPassList)

            # 3. Finds start/end of repeat
            myStartIndex, myEndIndex = abunGraphLib.markStartEndNodes(G, rIn, rOut, singleMissList, allPassList)
            print myStartIndex, myEndIndex
            
            # 4. Find repeat interior by shortest path joining S/E
            repeatPathway = abunGraphLib.markInterior(G , myStartIndex, myEndIndex, N1)
            print "repeatPathway", repeatPathway
            #checkPathLength(repeatPathway, G, N1, folderName)
            
            # 5. Find flanking region by shortest path search again
            flankingPathsList = abunGraphLib.markFlankingRegion(G, rIn, rOut, myStartIndex, myEndIndex, N1)
            print flankingPathsList
            
            # 6. Find associated reads by graph node query
            flankingList, repeatList = abunGraphLib.markAssociatedReads(G, singleMissList, allPassList)
            
            # ## Experimental
            repeatList = allPassList
            
            # ## End Experimental
            for eachlist in flankingList:
                print len(eachlist), len(repeatList)
            
            bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList])
        

     


    # 7. Format return and move on to the phasing 
    with open(folderName + repeatSpec, 'w') as outfile:
        json.dump(bigDumpList, outfile)
def evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, continueFilter):
    """
    Aggregate MUMmer alignment records into per-contig coverage estimates.

    dataList       : alignment records; fields used here are [4] (match
                     length), [6] (percent identity), [-2] (contig name)
                     and [-1] (read name).
    lenDic         : contig name -> contig length.
    readLenDic     : read name -> read length.
    folderName     : working directory (with trailing slash).
    mummerLink     : path to the MUMmer binaries.
    continueFilter : when True, realign the reads that matched nothing in a
                     refined MUMmer pass (produces the "abunMissOut" file).

    Returns myCountDic : contig name ("SegkkN") -> summed read bases divided
    by contig length, i.e. an average coverage depth.
    """

    myCountDic = {}
    for eachitem in lenDic:
        myCountDic[eachitem] = 0

    # Sort by read name so groupby below sees each read's records contiguously.
    dataList.sort(key=itemgetter(-1))

    # ctkk / ctbase: number of reads assigned and total bases assigned.
    ctkk, ctbase = 0, 0
    # Entries left >= 0 afterwards are reads that matched no contig.
    toAddBackDic = copy.deepcopy(readLenDic)

    for key, items in groupby(dataList, itemgetter(-1)):
        maxMatch = -1
        bestname = ""

        # Each read contributes its full length to the single contig with the
        # best (identity * match-length) score.
        for eachitem in items:
            ct = eachitem[6] / 100.0 * eachitem[4]
            if ct > maxMatch:
                maxMatch = ct
                bestname = eachitem[-2]
        myCountDic[bestname] += readLenDic[key]

        ctkk = ctkk + 1
        ctbase = ctbase + readLenDic[key]
        toAddBackDic[key] = -1

    cttot = 0
    for eachitem in readLenDic:
        cttot = cttot + readLenDic[eachitem]

    # NOTE(review): 4.7e6 looks like a hard-coded ~4.7 Mbp genome size
    # (E. coli-sized) -- confirm before trusting this diagnostic number.
    print "Missed coverage  ", (cttot - ctbase) / (4.7 * pow(10, 6))
    print "percentage miss read", (len(readLenDic) - ctkk) / (1.0 * len(readLenDic))

    # Reads never assigned to any contig; candidates for the refinement pass.
    toAddReadList = []
    for eachitem in toAddBackDic:
        if toAddBackDic[eachitem] >= 0:
            toAddReadList.append(eachitem)

    """
    This part need the most parallelism because it is most intense with -l 10 
    split V, workerList V , combine 
    """

    if continueFilter:
        numberOfFiles = 20

        # Dump the unassigned reads, split them into 20 parts and realign
        # each part against improved3.fasta with the refined settings.
        IORobot.putListToFileO(folderName, "raw_reads.fasta", "selected_raw", toAddReadList)

        bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
        command = (
            bindir
            + "/finisherSCCoreLib/fasta-splitter.pl --n-parts "
            + str(numberOfFiles)
            + " "
            + folderName
            + "selected_raw.fasta"
        )
        os.system(command)

        workerList = []

        for dummyI in range(1, numberOfFiles + 1):
            indexOfMum = ""
            # Zero-pad to two digits to match the splitter's part naming.
            if dummyI < 10:
                indexOfMum = "0" + str(dummyI)
            else:
                indexOfMum = str(dummyI)

            outputName, referenceName, queryName, specialName = (
                "outAbunRefine" + indexOfMum,
                "improved3.fasta",
                "selected_raw.part-" + indexOfMum + ".fasta",
                "abunMissOut" + indexOfMum,
            )
            workerList.append([outputName, referenceName, queryName, specialName])

        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName, workerList, houseKeeper.globalParallel, specialForRaw=True, refinedVersion=True
        )
        alignerRobot.combineMultipleCoorMum(True, mummerLink, folderName, "outAbunRefine", "abunMissOut", numberOfFiles)

    # Contigs are keyed "Segkk0" .. "SegkkN-1"; convert summed bases to depth.
    for i in range(len(myCountDic)):
        eachitem = "Segkk" + str(i)
        print eachitem, myCountDic[eachitem] / (1.0 * lenDic[eachitem])
        myCountDic[eachitem] = myCountDic[eachitem] / (1.0 * lenDic[eachitem])

    return myCountDic
Example #49
0
def identifyRepeat(folderName, mummerLink,contigFilename,contigReadGraph, repeatFilename, optionToRun  ):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } 
    Algorithm: 
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    '''
    
    # ## (a) reachability test to find partners 
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # G.reportEdge()
    lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta")
    
    adjacencyList = [[] for i in range(len(lenDicCC))]
    
    N1 = len(lenDicCC)
    
    
    # # Debug
    # for i in range(14):
    #    debugGraphPath(i, 2, G, N1)
    # # End Debug
    
    for i in range(len(lenDicCC)):
        adjacencyList[i] = abunGraphLib.findAllReachable(i, N1, G) 
        print "i, adjacencyList[i] : ", i , adjacencyList[i]
    
    # ## (b) formation of bipartite graph
    if optionToRun == "tandem" :
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase": 
        newAdjacencyList = abunGraphLib.filterEdge(adjacencyList, folderName, contigFilename)
    
    G2 = abunGraphLib.seqGraphWt(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    clusters = G2.findConnectedComponents()
    
    repeatList = []
    for eachitem in clusters:
        leftList, rightList = [], []
        for eachsubitem in eachitem:
            if eachsubitem % 2 == 0 :
                leftList.append(eachsubitem)
            else:
                rightList.append(eachsubitem)
                
        
        repeatList.append([abunHouseKeeper.getDistinct(leftList), abunHouseKeeper.getDistinct(rightList)])
           
    with open(folderName + repeatFilename, 'w') as outfile:
        json.dump(repeatList, outfile)

    
    json_data = open(folderName + repeatFilename, 'r')
    loadData = json.load(json_data)
    
    
    assert(loadData == repeatList)
    
Example #50
0
def continuousIntegration():
    """
    Ad-hoc integration-test driver used during development.

    The original body held many experiments behind `if False:` guards
    (BFS_revisit probing, formPathSeq smoke tests, decideCut, inspection of
    xResolvedGraph, adaptor-skip edge filtering, ...).  Those branches were
    unreachable dead code and have been removed; recover them from version
    control if an experiment needs to be revived.

    The single live step runs the redundancy remover on the May11TestB
    data set.
    """
    nonRedundantResolver.removeRedundantWithFile("May11TestB/", "/usr/bin/", "abun", "abunDebug", "abunNoEmbed")
def getAllAssociatedReads(folderName, mummerLink,forFastaName):
    '''
    Input : relatedReads.fasta, raw_reads.fasta 
    Output : all_associated_reads.fasta
    
     Algorithm : 
        a) Get all the associated reads
        b) Loop for N=1 times : ==> this correspond 4 reads to link between the bridge in total
            i) Align the raws and tmp_seedReads
            ii) Put the new reads into the SeedReads

    NOTE(review): a near-duplicate of this function appears later in this
    file; this version additionally regenerates relatedReads.fasta via
    gapFiller and takes the partition count from
    houseKeeper.globalParallelFileNum instead of hard-coding 20.
    '''
    
    header, referenceFile, queryFile = "seedReads", forFastaName + ".fasta" , "raw_reads.fasta"
    
    # Rebuild relatedReads.fasta from improved3, then use it as the initial seed set.
    gapFiller.formRelatedReadsFile(folderName, mummerLink, "improved3")
    
    command = "cp " + folderName + "relatedReads.fasta " + folderName + referenceFile
    os.system(command)
    
    # Number of expansion rounds; each round can pull in reads one hop
    # further from the contigs.
    N = abunHouseKeeper.abunGlobalReadSearchDepth
    
    print "N: ", N
    if N >0 :
        for trial in range(N):
            print "trial", trial
            numberOfFiles = houseKeeper.globalParallelFileNum
            
            if True: 
                # One MUMmer job per raw-read partition (parts are named
                # raw_reads.part-01.fasta .. raw_reads.part-NN.fasta).
                workerList = []
                
                for dummyI in range(1, numberOfFiles + 1):
                    indexOfMum = ""
                    if dummyI < 10:
                        indexOfMum = "0" + str(dummyI)
                    else:
                        indexOfMum = str(dummyI)
                    
                    outputName, referenceName, queryName, specialName= header+indexOfMum, referenceFile, "raw_reads.part-"+ indexOfMum + ".fasta",  header + indexOfMum
                    workerList.append([outputName, referenceName, queryName, specialName])
    
                alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,False)
            
            # Concatenate the per-partition alignment records.
            dataList = []
            
            for i in range(1, 1+numberOfFiles): 
                if i < 10:
                    indexOfMum = "0" + str(i)
                else:
                    indexOfMum = str(i)
                dataList = dataList+ alignerRobot.extractMumData(folderName, header+ str(indexOfMum)+"Out")
            
            
            # Keep only alignments passing the admission test.
            filterList = []
            
            lenDicRR = IORobot.obtainLength(folderName, queryFile)
            
            print "len(dataList)", len(dataList)
            for eachitem in dataList:
                if checkSatisfy(eachitem, lenDicRR):
                    filterList.append(eachitem)
                
            # Sort by read name so groupby yields one entry per read.
            filterList.sort(key=itemgetter(-1))
            newReads = []
            
            for key, items in groupby(filterList, itemgetter(-1)):
                newReads.append(key)
                                        
            
            # Write the accepted read names, then extract those reads from
            # raw_reads.fasta into the next-round seed file.
            f = open(folderName + forFastaName + ".txt", 'w')
            
            for eachitem in newReads:
                f.write(eachitem + "\n")
            f.close()
                
            command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + forFastaName + ".txt " + folderName + "raw_reads.fasta > " + folderName + forFastaName + ".fasta"
            os.system(command)
    else:
        # No expansion requested: the related reads ARE the associated reads.
        os.system("cp " + folderName + "relatedReads.fasta " + folderName + forFastaName + ".fasta")
Example #52
0
def generateAbundanceGraph(folderName, mummerLink):
    
    
    print "generateAbundanceGraph"
    
    '''
    1. Find your favorite mappers to map read back
        a. MUMmer, Bowtie, bbmap, any that works V 
        b. And then write a short parser to parse the results V 
    '''
    numberOfFiles = 20
    workerList = []
    for dummyI in range(1, numberOfFiles + 1):
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)
        
        '''
        "outGapFillRefine"+indexOfMum , "smaller_improvedContig.fasta",  "relatedReads_Double.part-" + indexOfMum + ".fasta",  "fromMumRefine" + indexOfMum
        '''
        outputName, referenceName, queryName, specialName= "outAbun"+indexOfMum, "improved3.fasta", "raw_reads.part-"+ indexOfMum + ".fasta",  "outAbun" + indexOfMum
        workerList.append([outputName, referenceName, queryName, specialName])
    
    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,False)
        '''
        command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + "out " + folderName + "improved3.fasta "+folderName+"raw_reads.part-" + indexOfMum + ".fasta"
        os.system(command)
    
        command = mummerLink + "show-coords -r " + folderName + "out.delta > " + folderName + "fromMumAbun" + indexOfMum
        os.system(command)
        '''
        
    dataList = []
    
    for i in range(1, 1+numberOfFiles): 
        if i < 10:
            indexOfMum = "0" + str(i)
        else:
            indexOfMum = str(i)
        dataList = dataList+ alignerRobot.extractMumData(folderName, "outAbun"+ str(indexOfMum)+"Out")
    

    '''
    2. Calculate count on the abundances 
        a. Aggregate by taking average [put weights on bin along contigs]
        b. Inheritance and a subclass 
    '''
         
    lenDic = IORobot.obtainLength(folderName, "improved3.fasta")
    readLenDic = IORobot.obtainLength(folderName , "raw_reads.fasta")
    

    myCountDic = {}
    for eachitem in lenDic:
        myCountDic[eachitem] = [0 for i in range(lenDic[eachitem])]

    thres = 30
    lenSum = 0
    extraDataList= []
    
    
    print "len(dataList)", len(dataList)
    
    if not abunHouseKeeper.abunGlobalAvoidrefine: 
        myCountDic =  evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink,  True)
        extraDataList = alignerRobot.extractMumData(folderName, "abunMissOut" )
    else:
        extraDataList = []
        
    dataList = dataList + extraDataList
    myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink,False)
    
    with open(folderName + 'myCountDic.json', 'w') as f:
        json.dump(myCountDic, f)

    
    return myCountDic
def generateAbundanceGraph(folderName, mummerLink):
    """
    Map raw reads back onto improved3.fasta with MUMmer and derive
    per-contig abundance estimates (returned and saved to myCountDic.json).

    NOTE(review): this is a reformatted duplicate of the identically named
    function defined earlier in this file; at import time this later
    definition is the one that takes effect.
    """

    print "generateAbundanceGraph"

    """
    1. Find your favorite mappers to map read back
        a. MUMmer, Bowtie, bbmap, any that works V 
        b. And then write a short parser to parse the results V 
    """
    # Align the 20 pre-split raw-read partitions against improved3.fasta.
    numberOfFiles = 20
    workerList = []
    for dummyI in range(1, numberOfFiles + 1):
        indexOfMum = ""
        # Zero-pad to two digits to match the splitter's part naming.
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)

        """
        "outGapFillRefine"+indexOfMum , "smaller_improvedContig.fasta",  "relatedReads_Double.part-" + indexOfMum + ".fasta",  "fromMumRefine" + indexOfMum
        """
        outputName, referenceName, queryName, specialName = (
            "outAbun" + indexOfMum,
            "improved3.fasta",
            "raw_reads.part-" + indexOfMum + ".fasta",
            "outAbun" + indexOfMum,
        )
        workerList.append([outputName, referenceName, queryName, specialName])

    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel, False)
        """
        command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + "out " + folderName + "improved3.fasta "+folderName+"raw_reads.part-" + indexOfMum + ".fasta"
        os.system(command)
    
        command = mummerLink + "show-coords -r " + folderName + "out.delta > " + folderName + "fromMumAbun" + indexOfMum
        os.system(command)
        """

    # Concatenate the per-partition alignment records.
    dataList = []

    for i in range(1, 1 + numberOfFiles):
        if i < 10:
            indexOfMum = "0" + str(i)
        else:
            indexOfMum = str(i)
        dataList = dataList + alignerRobot.extractMumData(folderName, "outAbun" + str(indexOfMum) + "Out")

    """
    2. Calculate count on the abundances 
        a. Aggregate by taking average [put weights on bin along contigs]
        b. Inheritance and a subclass 
    """

    lenDic = IORobot.obtainLength(folderName, "improved3.fasta")
    readLenDic = IORobot.obtainLength(folderName, "raw_reads.fasta")

    # Per-base bins; evaluateCoverage below replaces this with scalar coverage.
    myCountDic = {}
    for eachitem in lenDic:
        myCountDic[eachitem] = [0 for i in range(lenDic[eachitem])]

    thres = 30  # unused in this function
    lenSum = 0  # unused in this function
    extraDataList = []

    print "len(dataList)", len(dataList)

    # Optional refinement pass: rescue reads missed by the first alignment
    # ("abunMissOut"), then recompute coverage over all records.
    if not abunHouseKeeper.abunGlobalAvoidrefine:
        myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, True)
        extraDataList = alignerRobot.extractMumData(folderName, "abunMissOut")
    else:
        extraDataList = []

    dataList = dataList + extraDataList
    myCountDic = evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, False)

    with open(folderName + "myCountDic.json", "w") as f:
        json.dump(myCountDic, f)

    return myCountDic
def getAllAssociatedReads(folderName, mummerLink,forFastaName):
    '''
    Input : relatedReads.fasta, raw_reads.fasta 
    Output : all_associated_reads.fasta
    
     Algorithm : 
        a) Get all the associated reads
        b) Loop for N=1 times : ==> this correspond 4 reads to link between the bridge in total
            i) Align the raws and tmp_seedReads
            ii) Put the new reads into the SeedReads
    '''
    
    header, referenceFile, queryFile = "seedReads", forFastaName + ".fasta" , "raw_reads.fasta"
    command = "cp " + folderName + "relatedReads.fasta " + folderName + referenceFile
    os.system(command)
    
    N = abunHouseKeeper.abunGlobalReadSearchDepth
    
    print "N: ", N
    if N >0 :
        for trial in range(N):
            print "trial", trial
            numberOfFiles = 20
            
            if True: 
                workerList = []
                
                for dummyI in range(1, numberOfFiles + 1):
                    indexOfMum = ""
                    if dummyI < 10:
                        indexOfMum = "0" + str(dummyI)
                    else:
                        indexOfMum = str(dummyI)
                    
                    outputName, referenceName, queryName, specialName= header+indexOfMum, referenceFile, "raw_reads.part-"+ indexOfMum + ".fasta",  header + indexOfMum
                    workerList.append([outputName, referenceName, queryName, specialName])
    
                alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,False)
            
            dataList = []
            
            for i in range(1, 1+numberOfFiles): 
                if i < 10:
                    indexOfMum = "0" + str(i)
                else:
                    indexOfMum = str(i)
                dataList = dataList+ alignerRobot.extractMumData(folderName, header+ str(indexOfMum)+"Out")
            
            
            filterList = []
            
            lenDicRR = IORobot.obtainLength(folderName, queryFile)
            
            print "len(dataList)", len(dataList)
            for eachitem in dataList:
                if checkSatisfy(eachitem, lenDicRR):
                    filterList.append(eachitem)
                
            filterList.sort(key=itemgetter(-1))
            newReads = []
            
            for key, items in groupby(filterList, itemgetter(-1)):
                newReads.append(key)
                                        
            
            f = open(folderName + forFastaName + ".txt", 'w')
            
            for eachitem in newReads:
                f.write(eachitem + "\n")
            f.close()
                
            command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + forFastaName + ".txt " + folderName + "raw_reads.fasta > " + folderName + forFastaName + ".fasta"
            os.system(command)
    else:
        os.system("cp " + folderName + "relatedReads.fasta " + folderName + forFastaName + ".fasta")
Example #55
0
import matplotlib.pyplot as plt
from finisherSCCoreLib import IORobot

lenDic = {}
coverageDic = {}

lenDic = IORobot.obtainLength("/Users/kakitlam/", "abun.fasta")

f = open("/Users/kakitlam/Documents/abundata", 'r')
tmp = f.readline()

while len(tmp) > 0:
    if len(tmp) > 10:
        myitem = tmp[0:-1].split()
        coverageDic[myitem[0]] = float(myitem[1])
    tmp = f.readline()

f.close()

myList = []
baseCt = {}

for eachitem in lenDic:
    myList.append(lenDic[eachitem] * coverageDic[eachitem])
    baseCt[eachitem] = lenDic[eachitem] * coverageDic[eachitem]

for eachitem in lenDic:
    print eachitem, baseCt[eachitem]

for eachitem in lenDic:
    print eachitem, lenDic[eachitem]
Example #56
0
def singleGapLookUp(eachmatchpair,folderName, N1,  mummerLink,  contigReadGraph, contigFilename,readsetFilename):
    """
    Resolve the junction between two contigs matched as neighbours.

    eachmatchpair   : sequence whose first and last entries are the graph
                      indices of the left and right contig.
    folderName      : working directory (with trailing slash).
    N1              : number of contig nodes; indices >= N1 are reads.
    mummerLink      : path to the MUMmer binaries.
    contigReadGraph : filename of the contig-read graph to load.
    contigFilename / readsetFilename : basenames of the _Double.fasta files
                      holding the contig and read sequences.

    Returns [leftCtgIndex, rightCtgIndex, leftEnd, rightStart, middleContent]
    where leftEnd is the cut position in the left contig (its length minus
    the measured overlap), rightStart is always 0, and middleContent is the
    filler sequence stitched from the connecting reads ("" if none needed).
    """
    #print eachmatchpair
    leftCtgIndex ,rightCtgIndex, leftEnd, rightStart, middleContent = eachmatchpair[0],eachmatchpair[-1],0,0,""
    
    # BFS through the contig-read graph for a path of reads joining the
    # two contigs; the path includes both contig endpoints.
    succReadsList = []
    G = seqGraphWt(0)
    G.loadFromFile(folderName, contigReadGraph)
    succReadsList = BFS(leftCtgIndex,rightCtgIndex, G, N1)

    # Drop the contig endpoints, keeping only the intermediate reads.
    if len(succReadsList) > 0:
        succReadsList.pop(0)
        succReadsList.pop(-1)
    else:
        # NOTE(review): a failed BFS falls through to the direct-overlap
        # branch below after this warning.
        print "interesting item for future study"

    print "succReadsList" , succReadsList
    
    if len(succReadsList) == 0:
        # No connecting reads: overlap the two contigs directly.
        contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, 'C', N1)
        leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)

        contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, 'C', N1)
        rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
        
        overlap = IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) )
        
        print "overlap contig : ", overlap
        
        # Cut the left contig where the overlap with the right one begins.
        leftEnd = len(leftSeg) - overlap[0]
        middleContent = ""
        
    else:
        # Stitch: left contig -> first read -> ... -> last read -> right contig,
        # trimming each measured overlap as we go.
        contigName = abunHouseKeeper.parseIDToName(leftCtgIndex, 'C', N1)
        print contigName
        leftSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
        
        readName = abunHouseKeeper.parseIDToName(succReadsList[0], 'R', N1)
        print readName
        rightSeg  = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
        
        overlap = IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) )
        
        print "overlap start read : ", overlap
        
        leftEnd = len(leftSeg) - overlap[0]
        
        middleContent = ""
        
        # Append each read minus its overlap with the next read.
        for i in range(len(succReadsList)-1):
            readName = abunHouseKeeper.parseIDToName(succReadsList[i], 'R', N1)
            leftSeg  = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
        
            readName = abunHouseKeeper.parseIDToName(succReadsList[i+1], 'R', N1)
            rightSeg  = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
            
            overlap = IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) )
            print "overlap middle read : ", overlap
            middleContent = middleContent + leftSeg[0:len(leftSeg)-overlap[0]] 
        
        # Final read, trimmed of its overlap with the right contig.
        readName = abunHouseKeeper.parseIDToName(succReadsList[-1], 'R', N1)
        leftSeg  = IORobot.myRead(folderName, readsetFilename + "_Double.fasta", readName)
        
        contigName = abunHouseKeeper.parseIDToName(rightCtgIndex, 'C', N1)
        rightSeg = IORobot.myRead(folderName, contigFilename + "_Double.fasta", contigName)
        
        overlap = IORobot.alignWithName(leftSeg, rightSeg, folderName, mummerLink, str(leftCtgIndex) + "_" + str(rightCtgIndex) )
        print "overlap end read : ", overlap
        
        middleContent = middleContent + leftSeg[0:len(leftSeg)-overlap[0]]

    return [leftCtgIndex ,rightCtgIndex, leftEnd, rightStart, middleContent]