Ejemplo n.º 1
0
def formSeqGraph(folderName, mummerLink):
    """Build, condense and save the sequence graph, then write the contigs.

    Reads the two connection lists (rightConnect.txt and leftConnect.txt)
    through readConnectList, inserts one edge per list entry into a
    graphLib.seqGraph, cleans and condenses it, saves it to
    condensedGraph.txt, reloads that file into a second graph to compare it
    with the in-memory one, and finally calls IORobot.readContigOut to
    produce improved.fasta and openZone.txt.

    Parameters:
        folderName: passed to every file-reading/writing helper called here.
        mummerLink: forwarded unchanged to IORobot.readContigOut.
    """
    print("formSeqGraph")

    print("formSeqGraph: Reading in best successors and predecessors")
    successors = readConnectList(folderName, "rightConnect.txt")
    predecessors = readConnectList(folderName, "leftConnect.txt")

    print("formSeqGraph: Initializing seqGraph")
    G = graphLib.seqGraph(len(successors))

    print("formSeqGraph: Adding edges to seqGraph")
    # Entry k of the right list is (target, weight): edge k -> target.
    for node, (target, weight) in enumerate(successors):
        G.insertEdge(node, target, weight)

    # Entry k of the left list is (source, weight): edge source -> k.
    for node, (source, weight) in enumerate(predecessors):
        G.insertEdge(source, node, weight)

    graphFileName = "condensedGraph.txt"

    G.cleanEdge()
    G.condense()
    print("formSeqGraph: Outputting seqGraph to condensedGraph.txt")
    G.saveToFile(folderName, graphFileName)
    G.checkSelfLoops()
    G.checkCompleteness()

    # Round-trip check: reload what was just saved and compare both graphs.
    reloaded = graphLib.seqGraph(0)
    reloaded.loadFromFile(folderName, graphFileName)
    houseKeeper.compareGraphUnitTest(G, reloaded)

    G.reportDummyUsefulNode()
    G.reportEdge()

    print("formSeqGraph: Outputting improved contigs from seqGraph to improved.fasta")
    IORobot.readContigOut(folderName, mummerLink, graphFileName,
                          "noEmbed_Double.fasta", "improved.fasta",
                          "openZone.txt")
Ejemplo n.º 2
0
def readContigOut(folderName, mummerLink, graphFileName, contigFile,
                  outContigFile, outOpenList):
    """Write the sequences spelled out by a saved seqGraph and their open ends.

    Loads the graph stored in graphFileName, walks every node whose
    nodeIndexList is non-empty and concatenates the contigs it lists,
    trimming each contig but the last by the matching entry of the node's
    overlapList. A contig already consumed by an earlier node is skipped.

    Two files are written (their paths are folderName + file name):
      * outContigFile: one ">Segkk<n>" header line plus one sequence line
        per assembled segment.
      * outOpenList: the number of segments on the first line, followed by
        "Segkk<n>,noprev" / "Segkk<n>,nonext" lines for segments whose node
        is in G.myStartList / G.myEndList.

    Parameters:
        folderName: prefix for the output paths; also given to the loaders.
        mummerLink: accepted for interface compatibility; not used here.
        graphFileName: graph file loaded with seqGraph.loadFromFile.
        contigFile: file handed to loadContigsFromFile.
        outContigFile: name of the sequence output file.
        outOpenList: name of the open-end list output file.
    """
    print("readContigOut")

    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, graphFileName)
    G.findStartEndList()

    myContigsDic = loadContigsFromFile(folderName, contigFile)

    # A graph index maps to contig number index // 2; index % 2 selects the
    # '_p' (0) or '_d' (1) entry of that contig in myContigsDic.
    contigUsed = [False] * (len(G.graphNodesList) // 2)

    seqToPrint = []
    openList = []

    noForRevMismatch = True

    for eachnode in G.graphNodesList:
        nodeIndexList = eachnode.nodeIndexList
        if len(nodeIndexList) == 0:
            continue

        # Consistency check: a node should have either all or none of its
        # contigs consumed already; a mix is reported at the end.
        ckList = [contigUsed[idx // 2] for idx in nodeIndexList]
        if any(ckList) and not all(ckList):
            noForRevMismatch = False

        lastPos = len(nodeIndexList) - 1
        pieces = []
        for i, indexToAdd in enumerate(nodeIndexList):
            readNum = indexToAdd // 2
            if contigUsed[readNum]:
                continue

            suffix = 'p' if indexToAdd % 2 == 0 else 'd'
            contigSeq = myContigsDic['Contig' + str(readNum) + '_' + suffix]

            if i != lastPos:
                overlapLen = eachnode.overlapList[i]
                # seq[0:-0] is seq[0:0], i.e. empty: a zero overlap must keep
                # the whole contig instead of discarding it.
                if overlapLen != 0:
                    contigSeq = contigSeq[0:-overlapLen]

            pieces.append(contigSeq)
            contigUsed[readNum] = True

        tmpSeq = "".join(pieces)

        if len(tmpSeq) > 0:
            segName = 'Segkk' + str(len(seqToPrint))
            if eachnode.nodeIndex in G.myStartList:
                openList.append(segName + ',noprev')
            if eachnode.nodeIndex in G.myEndList:
                openList.append(segName + ',nonext')

            # NOTE(review): leftover debug output for two hard-coded node
            # indices; the first prints 439 although it tests for 444.
            # Output kept as-is -- confirm whether it can be removed.
            if eachnode.nodeIndex == 444:
                print("439 %d" % len(seqToPrint))

            if eachnode.nodeIndex == 67:
                print("67 %d" % len(seqToPrint))

            seqToPrint.append(tmpSeq)

    print("No forward/reverse mismatch ? %s" % noForRevMismatch)

    # 'with' guarantees the handles are closed even if a write fails.
    with open(folderName + outContigFile, 'w') as fImproved:
        for dummyIndex, eachcontig in enumerate(seqToPrint):
            fImproved.write(">Segkk" + str(dummyIndex) + '\n')
            fImproved.write(eachcontig + '\n')

    print("All contigs used?  %s" % all(contigUsed))
    print("NContig %d" % len(seqToPrint))

    with open(folderName + outOpenList, 'w') as f:
        f.write(str(len(seqToPrint)) + '\n')
        for eachitem in openList:
            f.write(str(eachitem) + '\n')
Ejemplo n.º 3
0
def xPhased(folderName , mummerLink):
    """Rebuild the contig graph from read-supported overlaps and write contigs.

    Obtains the contig-to-contig links for "improved2" via
    IORobot.obtainLinkInfo, keeps only the links that are also present in the
    edges returned by loadEdgeFromBlockedReads, resolves the resulting graph
    with G.MBResolve(), saves it to condensedGraphMB.txt and calls
    IORobot.readContigOut to write improved3.fasta and openZoneMB.txt.

    Parameters:
        folderName: passed to every helper that reads or writes files.
        mummerLink: forwarded to IORobot.obtainLinkInfo and readContigOut.
    """
    # ## Repeat resolution  [Proxy for MB]
    # 1. Re-form the contig string graph with ALL connections from contigs only V
    # 2. Log down the reads and associated blocked contigs V
    # 3. Use reads to connect;
    # 4. Transform graph by identifying 1 successor/predecessor case ; Condense(important);
    # 5. Read out contigs

    print("xPhased: Aligning improved2.fasta against itself, outputting to mb*.delta")
    numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "improved2", "mb")

    lenDic = IORobot.obtainLength(folderName, "improved2_Double.fasta")

    # Minimum contig length required on both ends of a link; 0 accepts all.
    confidenLenThres = 0

    print("xPhased: Building seqGraph")
    G = graphLib.seqGraph(numberOfContig)
    extraEdges = loadEdgeFromBlockedReads(folderName)

    def nodeIndexOf(contigName):
        # Names look like "<6-char prefix><number>_<p|d>..."; the graph index
        # is 2 * number, plus 1 for any strand tag other than 'p'.
        fields = contigName[6:].split('_')
        offset = 0 if fields[1] == 'p' else 1
        return int(fields[0]) * 2 + offset

    for wt, myin, myout in dataSet:
        i = nodeIndexOf(myin)
        j = nodeIndexOf(myout)

        # Keep the link only if some read-derived edge joins the same two
        # nodes with enough length on both sides; any() stops at the first
        # supporting edge instead of scanning the rest of the list.
        supported = any(
            [i, j] == [eachedge[0], eachedge[1]]
            and min(eachedge[2], eachedge[3]) >= wt
            and lenDic[myin] >= confidenLenThres
            and lenDic[myout] >= confidenLenThres
            for eachedge in extraEdges)

        if supported:
            G.insertEdge(i, j, wt)

    G.MBResolve()
    G.reportEdge()

    print("xPhased: Saving condensed seqGraph to condensedGraphMB.txt")
    graphFileName = "condensedGraphMB.txt"
    G.saveToFile(folderName, graphFileName)
    contigFile = "improved2_Double.fasta"
    outContigFile = "improved3.fasta"
    outOpenList = "openZoneMB.txt"

    print("xPhased: Outputting improved contigs from condensed seqGraph to improved3.fasta")
    IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList)

    # ## Repeat resolution  [Proxy for phasing step]
    # 6. Find out the repeat region by MSA
    # 7. Find out the location of SNPs and extend across repeat
    # [short cut : use contig creator : your job here is to get data into the correct formats]

    print("xPhased")
Ejemplo n.º 4
0
def readContigOut(folderName,
                  mummerLink,
                  graphFileName,
                  contigFile,
                  outContigFile,
                  outOpenList,
                  nameDic=None):
    """Write the sequences spelled out by a saved seqGraph and their open ends.

    Loads the graph stored in graphFileName and, for every node with a
    non-empty nodeIndexList whose first contig has not been consumed yet,
    concatenates the contigs it lists. Each contig but the last is trimmed
    by the overlap length returned by useAlignToGetLen.

    Two files are written (their paths are folderName + file name):
      * outContigFile: one ">Segkk<n>" header line plus one sequence line
        per assembled segment.
      * outOpenList: the number of segments on the first line, followed by
        "Segkk<n>,noprev" / "Segkk<n>,nonext" lines for segments whose node
        is in G.myStartList / G.myEndList.

    Parameters:
        folderName: prefix for the output paths; also given to the loaders.
        mummerLink: forwarded to useAlignToGetLen.
        graphFileName: graph file loaded with seqGraph.loadFromFile.
        contigFile: file handed to loadContigsFromFile.
        outContigFile: name of the sequence output file.
        outOpenList: name of the open-end list output file.
        nameDic: optional mapping from graph index to the index used to pick
            the contig (value // 2) and its '_p'/'_d' entry (value % 2).
            When empty or omitted the graph index itself is used.
    """
    # None sentinel instead of a shared mutable default; normalised to a
    # dict because the value is also handed on to useAlignToGetLen.
    if nameDic is None:
        nameDic = {}

    print("readContigOut")

    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, graphFileName)
    G.findStartEndList()

    myContigsDic = loadContigsFromFile(folderName, contigFile)

    contigUsed = [False] * (len(G.graphNodesList) // 2)

    seqToPrint = []
    openList = []

    noForRevMismatch = True

    print(len(G.graphNodesList))
    for eachnode in G.graphNodesList:
        nodeIndexList = eachnode.nodeIndexList
        print(nodeIndexList)
        if len(nodeIndexList) == 0:
            continue

        # Consistency check: a node should have either all or none of its
        # contigs consumed already; a mix is reported at the end.
        ckList = [contigUsed[idx // 2] for idx in nodeIndexList]
        if any(ckList) and not all(ckList):
            noForRevMismatch = False

        pieces = []
        firstRead = nodeIndexList[0] // 2
        if not contigUsed[firstRead]:
            # NOTE(review): only the first and last index of the node are
            # marked as used, so the "All contigs used?" report below does
            # not account for the indices in between -- confirm intent.
            contigUsed[firstRead] = True
            contigUsed[nodeIndexList[-1] // 2] = True

            lastPos = len(nodeIndexList) - 1
            for i, indexToAdd in enumerate(nodeIndexList):
                # With a name mapping, the mapped value (not the graph
                # index) decides which contig and which strand to emit.
                if len(nameDic) > 0:
                    sourceIndex = nameDic[indexToAdd]
                else:
                    sourceIndex = indexToAdd
                readNum = sourceIndex // 2
                orientation = sourceIndex % 2

                suffix = 'p' if orientation == 0 else 'd'
                contigKey = 'Contig' + str(readNum) + '_' + suffix

                if i != lastPos:
                    overlapLenOld = eachnode.overlapList[i]

                    # The stored overlap is replaced by one recomputed from
                    # an alignment; the old value is only printed.
                    overlapLen = useAlignToGetLen(eachnode, i, nameDic,
                                                  orientation,
                                                  myContigsDic, readNum,
                                                  folderName, mummerLink)
                    print("%s %s" % (overlapLen, overlapLenOld))

                    contigSeq = myContigsDic[contigKey]
                    # seq[0:-0] is seq[0:0], i.e. empty: a zero overlap must
                    # keep the whole contig instead of discarding it.
                    if overlapLen != 0:
                        contigSeq = contigSeq[0:-overlapLen]
                else:
                    contigSeq = myContigsDic[contigKey]

                pieces.append(contigSeq)

        tmpSeq = "".join(pieces)

        if len(tmpSeq) > 0:
            segName = 'Segkk' + str(len(seqToPrint))
            if eachnode.nodeIndex in G.myStartList:
                openList.append(segName + ',noprev')
            if eachnode.nodeIndex in G.myEndList:
                openList.append(segName + ',nonext')

            seqToPrint.append(tmpSeq)

    print("No forward/reverse mismatch ? %s" % noForRevMismatch)

    # 'with' guarantees the handles are closed even if a write fails.
    with open(folderName + outContigFile, 'w') as fImproved:
        for dummyIndex, eachcontig in enumerate(seqToPrint):
            print(len(eachcontig))
            fImproved.write(">Segkk" + str(dummyIndex) + '\n')
            fImproved.write(eachcontig + '\n')

    print("All contigs used?  %s" % all(contigUsed))
    print("NContig %d" % len(seqToPrint))

    with open(folderName + outOpenList, 'w') as f:
        f.write(str(len(seqToPrint)) + '\n')
        for eachitem in openList:
            f.write(str(eachitem) + '\n')