def removeRedundantRefvsQuery(folderName, mummerLink, fileR, fileQ, outputFileName):

    thres = 10

    if True:
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName, [["redundantRvsQ", fileR, fileQ, ""]], houseKeeper.globalParallel
        )

    dataList = alignerRobot.extractMumData(folderName, "redundantRvsQOut")
    lenDicR = IORobot.obtainLength(folderName, fileR)
    lenDicQ = IORobot.obtainLength(folderName, fileQ)

    isRedundantList = []

    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        l1, l2 = lenDicR[name1], lenDicQ[name2]

        if abs(l2 - match2) < thres:
            isRedundantList.append(name2)

    # print lenDicQ

    nonRedundantList = obtainComplement(lenDicQ, isRedundantList)

    print nonRedundantList
    IORobot.putListToFileO(folderName, fileQ, outputFileName, nonRedundantList)

    os.system("cp " + folderName + "SC_n_tmp.fasta " + folderName + "SC_n.fasta")
def removeEmbedded(folderName, mummerLink):
    print "removeEmbedded"
    thres = 10
    os.system("sed -e 's/|//g' " + folderName + "contigs.fasta  > " + folderName + "contigs2.fasta")

    os.system("cp " + folderName + "contigs2.fasta " + folderName + "contigs.fasta") 

    if not os.path.isfile(folderName + "selfOut"):
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [["self", "contigs.fasta", "contigs.fasta", ""]], houseKeeper.globalParallel )
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName
    
    lenDic = IORobot.obtainLength(folderName, 'contigs.fasta')
    removeList = alignerRobot.extractMumDataAndRemove(folderName,"selfOut",lenDic,thres)
        
    nameList = []
    for eachitem in lenDic:
        nameList.append(eachitem)

    print len(nameList)
    
    for eachitem in removeList:
        if eachitem in nameList:
            nameList.remove(eachitem)
    print len(nameList)
    
    IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
Example #3
def removeRedundantRefvsQuery(folderName, mummerLink, fileR, fileQ,
                              outputFileName):

    thres = 10

    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName,
                                         [["redundantRvsQ", fileR, fileQ, ""]],
                                         houseKeeper.globalParallel)

    dataList = alignerRobot.extractMumData(folderName, "redundantRvsQOut")
    lenDicR = IORobot.obtainLength(folderName, fileR)
    lenDicQ = IORobot.obtainLength(folderName, fileQ)

    isRedundantList = []

    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        l1, l2 = lenDicR[name1], lenDicQ[name2]

        if abs(l2 - match2) < thres:
            isRedundantList.append(name2)

    #print lenDicQ

    nonRedundantList = obtainComplement(lenDicQ, isRedundantList)

    print nonRedundantList
    IORobot.putListToFileO(folderName, fileQ, outputFileName, nonRedundantList)

    os.system("cp " + folderName + "SC_n_tmp.fasta " + folderName +
              "SC_n.fasta")
Example #4
def removeRedundantWithFile(folderName, mummerLink, inputFilename,
                            mummerTmpName, outputFileName):
    thres = 10
    os.system("sed -e 's/|//g' " + folderName + inputFilename + ".fasta  > " +
              folderName + inputFilename + "2.fasta")

    os.system("cp " + folderName + inputFilename + "2.fasta " + folderName +
              inputFilename + ".fasta")

    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [[
            mummerTmpName, inputFilename + ".fasta", inputFilename + ".fasta",
            ""
        ]], houseKeeper.globalParallel)
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    dataList = alignerRobot.extractMumData(folderName, mummerTmpName + "Out")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, inputFilename + '.fasta')

    removeList = []

    shortEmbedClusterDic = {}

    for eachitem in lenDic:
        shortEmbedClusterDic[eachitem] = clusterElem(eachitem)

    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem
                union(shortEmbedClusterDic[name1], shortEmbedClusterDic[name2])

    nameList = obtainComplement(lenDic, removeList)

    returnList = []

    for eachitem in nameList:
        if find(shortEmbedClusterDic[eachitem]).id == eachitem:
            returnList.append(eachitem)

    print "len(nameList), len(returnList)", len(nameList), len(returnList)

    IORobot.putListToFileO(folderName, inputFilename + ".fasta",
                           outputFileName, returnList)
def removeRedundantWithFile(folderName, mummerLink, inputFilename, mummerTmpName, outputFileName):
    thres = 10
    os.system("sed -e 's/|//g' " + folderName + inputFilename + ".fasta  > " + folderName + inputFilename + "2.fasta")

    os.system("cp " + folderName + inputFilename + "2.fasta " + folderName + inputFilename + ".fasta")

    if True:
        alignerRobot.useMummerAlignBatch(
            mummerLink,
            folderName,
            [[mummerTmpName, inputFilename + ".fasta", inputFilename + ".fasta", ""]],
            houseKeeper.globalParallel,
        )
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    dataList = alignerRobot.extractMumData(folderName, mummerTmpName + "Out")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, inputFilename + ".fasta")

    removeList = []

    shortEmbedClusterDic = {}

    for eachitem in lenDic:
        shortEmbedClusterDic[eachitem] = clusterElem(eachitem)

    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem
                union(shortEmbedClusterDic[name1], shortEmbedClusterDic[name2])

    nameList = obtainComplement(lenDic, removeList)

    returnList = []

    for eachitem in nameList:
        if find(shortEmbedClusterDic[eachitem]).id == eachitem:
            returnList.append(eachitem)

    print "len(nameList), len(returnList)", len(nameList), len(returnList)

    IORobot.putListToFileO(folderName, inputFilename + ".fasta", outputFileName, returnList)
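When two sequences contain each other almost entirely, the loop above unions them and the final pass keeps only cluster representatives (find(...).id == name). A minimal union-find sketch of that keep-one-representative idea; clusterElem, union and find are stand-ins for the helpers used above, whose implementations are not shown in this listing:

# Toy disjoint-set: every element starts as its own cluster.
class clusterElem:
    def __init__(self, id):
        self.id = id
        self.parent = self

def find(x):
    while x.parent is not x:
        x = x.parent
    return x

def union(a, b):
    find(a).parent = find(b)

nodes = dict((name, clusterElem(name)) for name in ["SegA", "SegB", "SegC"])
union(nodes["SegA"], nodes["SegB"])                       # SegA and SegB embed each other
print [n for n in nodes if find(nodes[n]).id == n]        # one of SegA/SegB survives, plus SegC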
Example #6
def removeEmbedded(folderName, mummerLink):
    print "removeEmbedded"

    thres = 10

    command = r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "raw_reads.fasta > " + folderName + houseKeeper.globalReadName
    os.system(command)

    command = r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "contigs.fasta > " + folderName + houseKeeper.globalContigName
    os.system(command)

    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [[
            "self", houseKeeper.globalContigName, houseKeeper.globalContigName,
            ""
        ]], houseKeeper.globalParallel)
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    dataList = alignerRobot.extractMumData(folderName, "selfOut")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, houseKeeper.globalContigName)

    removeList = []
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem

    nameList = []
    for eachitem in lenDic:
        nameList.append(eachitem)

    print len(nameList)

    for eachitem in removeList:
        if eachitem in nameList:
            nameList.remove(eachitem)
    print len(nameList)

    IORobot.putListToFileO(folderName, houseKeeper.globalContigName, "noEmbed",
                           nameList)
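The two perl one-liners above only rewrite FASTA headers: each record name is replaced by a running ">SegN", so later stages never have to cope with pipes or spaces in sequence names. A rough Python equivalent of that renaming, shown for illustration only (the pipeline itself runs the perl command as written):

def renameHeaders(inPath, outPath):
    # Replace every '>' header with >Seg1, >Seg2, ...; sequence lines pass through untouched.
    n = 0
    fin, fout = open(inPath, 'r'), open(outPath, 'w')
    for line in fin:
        if line.startswith('>'):
            n += 1
            fout.write(">Seg" + str(n) + "\n")
        else:
            fout.write(line)
    fin.close()
    fout.close()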
def removeEmbedded(folderName, mummerLink):
    print "removeEmbedded"
    thres = 10
    os.system("sed -e 's/|//g' " + folderName + "contigs.fasta  > " +
              folderName + "contigs2.fasta")

    os.system("cp " + folderName + "contigs2.fasta " + folderName +
              "contigs.fasta")

    if True:
        print "removeEmbedded: Aligning contigs.fasta to contigs.fasta, outputs are self*.delta"
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName,
            [["self", "contigs.fasta", "contigs.fasta", ""]],
            houseKeeper.globalParallel)
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    print "removeEmbedded: Extracting MUMmer data from delta files to selfOut"
    dataList = alignerRobot.extractMumData(folderName, "selfOut")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, 'contigs.fasta')

    removeList = []
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem

    nameList = []
    for eachitem in lenDic:
        nameList.append(eachitem)

    print len(nameList)

    for eachitem in removeList:
        if eachitem in nameList:
            nameList.remove(eachitem)
    print len(nameList)

    print "removeEmbedded: Outputting non-contained contigs to noEmbed.fasta"
    IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
def removeEmbedded(folderName, mummerLink):
    print "removeEmbedded"
    
    thres = 10
    
    command= r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "raw_reads.fasta > " + folderName  + houseKeeper.globalReadName
    os.system(command)

    command= r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "contigs.fasta > " + folderName  + houseKeeper.globalContigName
    os.system(command)


    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [["self", houseKeeper.globalContigName, houseKeeper.globalContigName, ""]], houseKeeper.globalParallel)
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName
    
    dataList = alignerRobot.extractMumData(folderName, "selfOut")
    
    dataList = alignerRobot.transformCoor(dataList)
    
    lenDic = IORobot.obtainLength(folderName, houseKeeper.globalContigName)
    
    removeList = []
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        
        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]
            
            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem
                
    
    
    nameList = []
    for eachitem in lenDic:
        nameList.append(eachitem)

    print len(nameList)
    
    for eachitem in removeList:
        if eachitem in nameList:
            nameList.remove(eachitem)
    print len(nameList)
    
    IORobot.putListToFileO(folderName, houseKeeper.globalContigName, "noEmbed", nameList)
    
Example #9
def mainFlow(folderName, mummerLink, pickupname, mapcontigsname):
    print "Go Bears! ! !"

    print "pickupname, mapcontigsname", pickupname, mapcontigsname

    if not pickupname in ["noEmbed.fasta", "improved.fasta", "improved2.fasta"]:
        nonRedundantResolver.removeEmbedded(folderName, mummerLink)

    if not pickupname in ["improved.fasta", "improved2.fasta"]:
        overlapResolver.fetchSuccessor(folderName, mummerLink)
        overlapResolver.formSeqGraph(folderName, mummerLink)

    if not pickupname in ["improved2.fasta"]:
        gapFiller.fillGap(folderName, mummerLink)

    twoRepeatOneBridgeSolver.xPhased(folderName, mummerLink)

    # ECReduction(folderName , mummerLink )
    # compareWithReference(folderName , mummerLink)

    IORobot.fillInMissed(folderName, mummerLink, houseKeeper.globalContigName,
                         "noEmbedtmp.fasta", "noEmbed.fasta")
    IORobot.fillInMissed(folderName, mummerLink, houseKeeper.globalContigName,
                         "improvedtmp.fasta", "improved.fasta")
    IORobot.fillInMissed(folderName, mummerLink, houseKeeper.globalContigName,
                         "improved2tmp.fasta", "improved2.fasta")
    IORobot.fillInMissed(folderName, mummerLink, houseKeeper.globalContigName,
                         "improved3tmp.fasta", "improved3.fasta")

    if mapcontigsname != None:
        houseKeeper.performMapping(folderName, mummerLink, mapcontigsname)

    print "<3 Do cool things that matter <3"
Example #10
def mainFlow(folderName, mummerLink, pickupname, mapcontigsname):
    print "Go Bears! ! !" 
    
    print "pickupname, mapcontigsname", pickupname, mapcontigsname
    
    if not pickupname in ["noEmbed.fasta", "improved.fasta", "improved2.fasta"]:
        nonRedundantResolver.removeEmbedded(folderName , mummerLink)
     
    if not pickupname in ["improved.fasta", "improved2.fasta"]:
        overlapResolver.fetchSuccessor(folderName , mummerLink)
        overlapResolver.formSeqGraph(folderName , mummerLink)
    
    if not pickupname in ["improved2.fasta"]:
        gapFiller.fillGap(folderName , mummerLink)
    
    twoRepeatOneBridgeSolver.xPhased(folderName , mummerLink)
    
    # ECReduction(folderName , mummerLink )
    # compareWithReference(folderName , mummerLink)
    
    IORobot.fillInMissed(folderName, mummerLink, houseKeeper.globalContigName, "noEmbedtmp.fasta", "noEmbed.fasta")
    IORobot.fillInMissed(folderName, mummerLink, houseKeeper.globalContigName, "improvedtmp.fasta", "improved.fasta")
    IORobot.fillInMissed(folderName, mummerLink, houseKeeper.globalContigName, "improved2tmp.fasta", "improved2.fasta")
    IORobot.fillInMissed(folderName, mummerLink, houseKeeper.globalContigName, "improved3tmp.fasta", "improved3.fasta")
    

    if mapcontigsname != None:
        houseKeeper.performMapping(folderName, mummerLink, mapcontigsname)
        
    print "<3 Do cool things that matter <3"
def removeEmbedded(folderName, mummerLink):
    print "removeEmbedded"
    thres = 10
    os.system("sed -e 's/|//g' " + folderName + "contigs.fasta  > " + folderName + "contigs2.fasta")

    os.system("cp " + folderName + "contigs2.fasta " + folderName + "contigs.fasta")

    if True:
        print "removeEmbedded: Aligning contigs.fasta to contigs.fasta, outputs are self*.delta"
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName, [["self", "contigs.fasta", "contigs.fasta", ""]], houseKeeper.globalParallel
        )
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    print "removeEmbedded: Extracting MUMmer data from delta files to selfOut"
    dataList = alignerRobot.extractMumData(folderName, "selfOut")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, "contigs.fasta")

    removeList = []
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem

    nameList = []
    for eachitem in lenDic:
        nameList.append(eachitem)

    print len(nameList)

    for eachitem in removeList:
        if eachitem in nameList:
            nameList.remove(eachitem)
    print len(nameList)

    print "removeEmbedded: Outputting non-contained contigs to noEmbed.fasta"
    IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
def formSeqGraph(folderName, mummerLink):
    print "formSeqGraph" 
    startList, graphNodes = [], []
    
    print "formSeqGraph: Reading in best successors and predecessors"
    rightConnect = readConnectList(folderName, "rightConnect.txt")
    leftConnect = readConnectList(folderName, "leftConnect.txt")
    
    numberOfNodes = len(rightConnect)
    
    print "formSeqGraph: Initializing seqGraph"
    G = graphLib.seqGraph(numberOfNodes)
        
    print "formSeqGraph: Adding edges to seqGraph"
    for eachitem, i  in zip(rightConnect, range(len(rightConnect))):
        index = i
        connector, weight = eachitem
        G.insertEdge(index, connector, weight)
    
    for eachitem, i  in zip(leftConnect, range(len(leftConnect))):
        index = i
        connector, weight = eachitem
        G.insertEdge(connector, index, weight)
    

    G.cleanEdge()
    G.condense()
    print "formSeqGraph: Outputting seqGraph to condensedGraph.txt"
    G.saveToFile(folderName, "condensedGraph.txt")
    G.checkSelfLoops()
    G.checkCompleteness()
    
    G2 = graphLib.seqGraph(0)
    G2.loadFromFile(folderName, "condensedGraph.txt")
    
    houseKeeper.compareGraphUnitTest(G, G2)
    G.reportDummyUsefulNode()
    G.reportEdge()
    
    graphFileName = "condensedGraph.txt"
    contigFile = "noEmbed_Double.fasta"
    outContigFile = "improved.fasta"
    outOpenList = "openZone.txt"
    
    print "formSeqGraph: Outputting improved contigs from seqGraph to improved.fasta"
    IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList)
Example #13
def formSeqGraph(folderName, mummerLink):
    print "formSeqGraph"
    startList, graphNodes = [], []

    print "formSeqGraph: Reading in best successors and predecessors"
    rightConnect = readConnectList(folderName, "rightConnect.txt")
    leftConnect = readConnectList(folderName, "leftConnect.txt")

    numberOfNodes = len(rightConnect)

    print "formSeqGraph: Initializing seqGraph"
    G = graphLib.seqGraph(numberOfNodes)

    print "formSeqGraph: Adding edges to seqGraph"
    for eachitem, i in zip(rightConnect, range(len(rightConnect))):
        index = i
        connector, weight = eachitem
        G.insertEdge(index, connector, weight)

    for eachitem, i in zip(leftConnect, range(len(leftConnect))):
        index = i
        connector, weight = eachitem
        G.insertEdge(connector, index, weight)

    G.cleanEdge()
    G.condense()
    print "formSeqGraph: Outputting seqGraph to condensedGraph.txt"
    G.saveToFile(folderName, "condensedGraph.txt")
    G.checkSelfLoops()
    G.checkCompleteness()

    G2 = graphLib.seqGraph(0)
    G2.loadFromFile(folderName, "condensedGraph.txt")

    houseKeeper.compareGraphUnitTest(G, G2)
    G.reportDummyUsefulNode()
    G.reportEdge()

    graphFileName = "condensedGraph.txt"
    contigFile = "noEmbed_Double.fasta"
    outContigFile = "improved.fasta"
    outOpenList = "openZone.txt"

    print "formSeqGraph: Outputting improved contigs from seqGraph to improved.fasta"
    IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile,
                          outContigFile, outOpenList)
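readConnectList itself is not reproduced in this listing, but fetchSuccessor (further down) writes rightConnect.txt and leftConnect.txt as one "index,connector,overlapLength" triple per line, and formSeqGraph unpacks each returned item as [connector, weight]. A plausible reader under that assumed format:

def readConnectListSketch(folderName, fileName):
    # Assumed line format, matching what fetchSuccessor writes: "index,connector,overlap".
    connectList = []
    f = open(folderName + fileName, 'r')
    for line in f:
        parts = line.rstrip().split(',')
        if len(parts) == 3:
            connectList.append([int(parts[1]), int(parts[2])])    # [connector, overlapLength]
    f.close()
    return connectList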
Example #14
def observeOverlap(folderName):

    dataList = alignerRobot.extractMumData(folderName, "selfOut")
    dataList = alignerRobot.transformCoor(dataList)
    lenDic = IORobot.obtainLength(folderName, 'contigs.fasta')
    matchThres = 10000
    nonMatchThres = 500
    count = 0

    newDataList = []
    for eachitem in dataList:
        name1, name2 = eachitem[-2], eachitem[-1]
        matchLen1, matchLen2 = eachitem[4], eachitem[5]
        start1, end1, start2, end2 = eachitem[0], eachitem[1], eachitem[2], eachitem[3]
        #        if name1!= name2 and min(lenDic[name1] - end1, lenDic[name2] - end2 ) > nonMatchThres \
        #        and min(start1, start2) > nonMatchThres \
        if name1 != name2 and (min(lenDic[name1] - end1, lenDic[name2] - end2) > nonMatchThres
                               or min(start1, start2) > nonMatchThres) \
           and matchLen1 > matchThres:
            print "eachitem ", eachitem, lenDic[name1], lenDic[name2]
            count = count + 1
            newDataList.append(eachitem)

    print "Count: " + str(count)

    blkDic = getBreakPointFromDataList(folderName, newDataList)

    LCList = IORobot.loadContigsFromFile(folderName, "contigs.fasta")

    contigList = []

    for eachcontig in LCList:
        #print eachcontig
        if not eachcontig in blkDic:
            contigList = contigList + [LCList[eachcontig]]
        else:
            contigList = contigList + tmpBreakAcBkPts(LCList[eachcontig],
                                                      blkDic[eachcontig])

    print "len(contigList)", len(contigList)
    IORobot.writeSegOut(contigList, folderName, "breakChains.fasta")
Example #15
 def runningTestSet(self, myFolderName, ctexpected):
     print "Integration test on FinisherSC:  " + myFolderName
     self.sourceFolder = myFolderName
     os.system("mkdir " + self.testingFolder)
     
     for eachitem in self.listOfFiles:
         os.system("cp "+ self.sourceFolder + eachitem + " " +self.testingFolder)
     
     os.system("python finisherSC.py -par 4 "+ self.testingFolder + " "+ self.mummerPath)
     lenDic = IORobot.obtainLength(self.testingFolder, "/improved3.fasta")
     print lenDic
     assert(len(lenDic) == ctexpected)
     os.system("rm -rf "+ self.testingFolder)
Example #17
def getBreakPointFromDataList(folderName, dataList):
    g = 1000
    blkDic = {}
    dataList.sort(key=itemgetter(-2))
    lenDic = IORobot.obtainLength(folderName, "contigs.fasta")

    json_data = open(folderName + "modifiedOutliners.json", 'r')
    breakPtsDic = json.load(json_data)
    sep = 5000

    for key, items in groupby(dataList, itemgetter(-2)):
        contigName = key
        newList = []
        for eachitem in items:
            newList.append([eachitem[0], eachitem[1]])
        newList.sort()

        bktmp = [0]

        if newList[0][0] > g:
            if withinBound(sep, breakPtsDic[contigName], newList[0][0]):
                bktmp.append(newList[0][0])

        #bktmp.append(newList[0][0])
        for i in range(len(newList) - 1):
            if newList[i + 1][0] > newList[i][1] + g:
                if withinBound(sep, breakPtsDic[contigName],
                               newList[i + 1][0]):
                    bktmp.append(newList[i + 1][0])

        bktmp.append(lenDic[contigName])

        blkDic[contigName] = bktmp
        print "contigName: " + contigName
        print "bktmp:", bktmp
        print "breakPtsDic[contigName]", breakPtsDic[contigName]

    return blkDic
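Note that the function sorts dataList by contig name before calling groupby: itertools.groupby only merges adjacent items, so grouping on an unsorted key would silently split one contig's alignments into several groups. A minimal illustration:

from itertools import groupby
from operator import itemgetter

rows = [["ctgA", 1], ["ctgB", 2], ["ctgA", 3]]
print [k for k, g in groupby(rows, itemgetter(0))]     # ['ctgA', 'ctgB', 'ctgA'] -- ctgA split in two
rows.sort(key=itemgetter(0))
print [k for k, g in groupby(rows, itemgetter(0))]     # ['ctgA', 'ctgB']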
Example #18
def formRelatedReadsFile(folderName, mummerLink):    
    # Find associated read and extract into a file associatedReads.fasta
    # Input: contigs.fasta, cleaned_Reads.fasta 
    # Output: relatedReads.fasta

    # ## Extract heads of the contigs
    print ">formRelatedReadsFile"
    
    f = open(folderName + "improved.fasta", 'r')
    f2 = open(folderName + "improvedTrunc.fasta", 'w')
    temp = f.readline()
    tempContig = ""
    thres = 400
    runningIndex = 0
    endThres = 10 
    
    while len(temp) > 0:
        if temp[-1] == '\n':
            temp = temp[0:-1]
        
        
        if temp[0] == '>':

            if len(tempContig) > 0:
                IORobot.writeToFile(f2, runningIndex, tempContig[0:thres])
                runningIndex = runningIndex + 1
                
                IORobot.writeToFile(f2, runningIndex, tempContig[-thres:])
                runningIndex = runningIndex + 1 
                
                                
                IORobot.writeToFile(f2, runningIndex, houseKeeper.reverseComplement(tempContig[0:thres]))
                runningIndex = runningIndex + 1
                
                IORobot.writeToFile(f2, runningIndex, houseKeeper.reverseComplement(tempContig[-thres:]))
                runningIndex = runningIndex + 1
                
                tempContig = ""
        else:
            tempContig = tempContig + temp
        
        temp = f.readline()

    IORobot.writeToFile(f2, runningIndex, tempContig[0:thres])
    runningIndex = runningIndex + 1
    
    IORobot.writeToFile(f2, runningIndex, tempContig[-thres:])
    runningIndex = runningIndex + 1
                  
    IORobot.writeToFile(f2, runningIndex, houseKeeper.reverseComplement(tempContig[0:thres]))
    runningIndex = runningIndex + 1
    
    IORobot.writeToFile(f2, runningIndex, houseKeeper.reverseComplement(tempContig[-thres:]))
    runningIndex = runningIndex + 1
    
    
    f2.close()
    f.close()
    
    # ## Write double stranded reads
    IORobot.writeToFile_Double1(folderName, "improved.fasta", "improved_Double.fasta", "contig")
    # writeToFile_Double1(folderName, "raw_reads.fasta", "raw_reads_Double.fasta","read")
    
    # ## Apply MUMMER on them using cleanedReads against them
    assoiatedReadIndex = []
    nameList = []
    
    numberOfFiles = max(20, houseKeeper.globalParallel)
    
    if True:
        bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
        command = bindir + "/fasta-splitter.pl --n-parts " + str(numberOfFiles) + " " + folderName + houseKeeper.globalReadName
        os.system(command)
    
    
    workerList = []
    
    for dummyI in range(1, numberOfFiles + 1):
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)
            
        outputName, referenceName, queryName, specialName=  "outGapFillRaw"+indexOfMum , "improvedTrunc.fasta", houseKeeper.globalReadName[0:-6] + ".part-" + indexOfMum + ".fasta", "fromMum" + indexOfMum 
        workerList.append([outputName, referenceName, queryName, specialName])
    
    
    
    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,True)
        
        # alignerRobot.useMummerAlign(mummerLink, folderName, "out", "improvedTrunc.fasta", "raw_reads.part-" + indexOfMum + ".fasta", True, "fromMum" + indexOfMum )
        
        '''
        command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + "out " + folderName + "improvedTrunc.fasta raw_reads.part-" + indexOfMum + ".fasta"
        os.system(command)

        command = mummerLink + "show-coords -r " + folderName + "out.delta > " + folderName + "fromMum" + indexOfMum
        os.system(command)
        '''
        

    for dummyI in range(1, numberOfFiles + 1):
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)
        f = open(folderName + "fromMum" + indexOfMum, 'r')
    
        for i in range(6):
            tmp = f.readline()
        
        while len(tmp) > 0:
            infoArr = tmp.split('|')
            myArr = infoArr[-1].split('\t')
            rdGpArr = infoArr[-1].split('\t')
            contigName = rdGpArr[0].rstrip().lstrip()
            readName = rdGpArr[1].rstrip().lstrip()
            
            endSegArr = infoArr[0].split(" ")
            pos = []
            for eachitem in endSegArr:
                if len(eachitem) > 0:
                    pos.append(int(eachitem))
                    
            startPos = pos[0]
            endPos = pos[1]
            if startPos < endThres and endPos > thres - endThres:
                assoiatedReadIndex.append(myArr[1])
                nameList.append([int(contigName.split('_')[1]), readName])
            tmp = f.readline()
        
        f.close()
    
    
    nameList.sort()

    assoiatedReadIndex.sort()
    
    # print "assoiatedReadIndex", assoiatedReadIndex
    
    ckIndex = 0
    f = open(folderName + "associatedNames.txt", 'w')
    oneItem = 0
    keyFound = []
    for key, items in groupby(assoiatedReadIndex):
        
        countItem = 0
        for eachitem in items:
            countItem += 1
            
        if countItem == 1:
            
            oneItem += 1
        else:
            key = key.rstrip()
            if not key in keyFound:
                f.write(key + '\n')
                keyFound.append(key)

        ckIndex += 1
    
    print "ckIndex,oneItem: ", ckIndex, oneItem
    f.close()

    fFilter = open(folderName + "associatedNames.txt", 'r')
    
    fout = open(folderName + "associatedNames2.txt", 'w') 
    
    maxCount = 12000
    mytmpDum = fFilter.readline() 
    i = 0
    while i < maxCount and len(mytmpDum) > 0:
        fout.write(mytmpDum)  
        mytmpDum = fFilter.readline() 
        i = i + 1
        
    fout.close()   
    fFilter.close()

    command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + "associatedNames2.txt " + folderName + houseKeeper.globalReadName +" > " + folderName + "relatedReads.fasta"
    os.system(command)
    
    IORobot.writeToFile_Double1(folderName, "relatedReads.fasta", "relatedReads_Double.fasta", "read")
Example #19
def writeContigReadCombine(blockedSet, dataSet, folderName, rawReadList, numberOfContig, contigList, leftConnect, option, rightConnect, mummerLink):
    # ## repeat aware logging
    # print "myExtraLinkList", myExtraLinkList
    # ## end repeat aware logging

    myExtraLinkList = loggingReadsToRepeat(blockedSet + dataSet, contigList)    
    i = 0
    fOriginal = open(folderName + "improved.fasta", 'r')
    readSet = []
    tmp = fOriginal.readline().rstrip()
    tmpRead = ""
    while len(tmp) > 0:
        if tmp[0] == '>':
            if len(tmpRead) > 0:
                readSet.append(tmpRead)
                tmpRead = ""
        else:
            tmpRead = tmpRead + tmp
            
        tmp = fOriginal.readline().rstrip()
    readSet.append(tmpRead)  
    fOriginal.close()
    
    # ## Put the needed rawReads into the RAM using Dictionary
    
    fAppendRaw = open(folderName + "appendRaw.txt", 'w')
    for eachraw in rawReadList:
        fAppendRaw.write(eachraw)
        fAppendRaw.write('\n')
    fAppendRaw.close()
    
    command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + "appendRaw.txt " + folderName + "relatedReads_Double.fasta > " + folderName + "rawToAppend.fasta"
    os.system(command)
    
    rawRead = {}
    
    fOriginal = open(folderName + "rawToAppend.fasta", 'r')
    tmp = fOriginal.readline().rstrip()
    tmpRead = ""
    tmpName = ""
    while len(tmp) > 0:
        if tmp[0] == '>':
            
            if len(tmpRead) > 0:
                rawRead[tmpName] = tmpRead
                tmpRead = ""
                
            tmpName = tmp[1:]
        else:
            tmpRead = tmpRead + tmp
            
        tmp = fOriginal.readline().rstrip()
        
    rawRead[tmpName] = tmpRead
    # ## End
    
    seqToPrint = []
    contigUsed = [False for i in range(numberOfContig / 2)]
    storedStrand = [[-1, 'n'] for i in range(numberOfContig)]
    
    finalList = []
    for eachContig, i in zip(contigList, range(len(contigList))):
        tmpList = []
        for eachitem in eachContig:

            readNum = eachitem / 2
            if contigUsed[readNum] == False:
                seqToPrint.append(eachitem)
                tmpList.append(eachitem)
                contigUsed[readNum] = True
                # ## mark output strand info
                storedStrand[eachitem] = [len(finalList), 'p']
                
                
        if len(tmpList) > 0:
            finalList.append(tmpList)
    
    
    for kkk in range(len(storedStrand)):
        if storedStrand[kkk][1] == 'n':
            if kkk % 2 == 0:
                storedStrand[kkk][0] = storedStrand[kkk + 1][0]
                storedStrand[kkk][1] = 'd'
            else:
                storedStrand[kkk][0] = storedStrand[kkk - 1][0]
                storedStrand[kkk][1] = 'd'
    
    # ## begin stored output blocked pairs
    blockExtraStored(storedStrand, myExtraLinkList, folderName)
    # ## end output blocked pairs stored
    

    fImproved = open(folderName + "improved2.fasta", 'w')
    
    for eachcontig, dummyIndex in zip(finalList, range(len(finalList))):
        fImproved.write(">Segkk" + str(dummyIndex) + '\n')
        tmpStore = -1997
        tmpStore2 = -1998
        tmpStore3 = -1999
        
        for eachseg, hidum in zip(eachcontig, range(len(eachcontig))):
            readNum = eachseg / 2
            orientation = eachseg % 2            
            newStart = 0 
    
    
            # Begin hack 
            ### old statement 
            x , y , l = tmpStore, leftConnect[eachseg][1], tmpStore2
            ### End old statement
            
            if hidum == 0:
                x , y , l = tmpStore, leftConnect[eachseg][1], tmpStore2
            else:
                
                prevseg = eachcontig[hidum-1]
                
                prevReadNum = prevseg/2
                prevOrient = prevseg %2
                
                
                if prevOrient == 0:
                    leftSeg = readSet[prevReadNum]
                else:
                    leftSeg = houseKeeper.reverseComplement(readSet[prevReadNum])
                    
                rightSeg = tmpStore3
                
                overlapX = IORobot.align(leftSeg, rightSeg, folderName, mummerLink)
                
                leftSeg = tmpStore3
                if orientation == 0:
                    rightSeg = readSet[readNum] 
                else:
                    rightSeg = houseKeeper.reverseComplement(readSet[readNum])
                    
                overlapY = IORobot.align(leftSeg, rightSeg, folderName, mummerLink)
                
                
                print "Before : x, y , l : ",  x, y , l 
                x = overlapX[1]
                y = overlapY[0]
                l = tmpStore2
                print "After : x, y , l : ",  x, y , l 
                 
            # End hack 
            
            extraRead = ""
            if hidum == 0:
                newStart = 0
            else:
                
                if l < x + y:
                    # begin hack 
                    ### old statement 
                    newStart = x + y - l
                    ### end old statement 
                    
                    prevseg = eachcontig[hidum-1]
                
                    prevReadNum = prevseg/2
                    prevOrient = prevseg %2
                    
                    if prevOrient == 0:
                        leftSeg = readSet[prevReadNum]
                    else:
                        leftSeg = houseKeeper.reverseComplement(readSet[prevReadNum])
                    
                        
                    if orientation == 0:
                        rightSeg = readSet[readNum] 
                    else:
                        rightSeg = houseKeeper.reverseComplement(readSet[readNum])
                    
                    print "Before : ", newStart
                    overlapNewStart = IORobot.align(leftSeg, rightSeg, folderName, mummerLink)
                    newStart = overlapNewStart[1]
                    print "After : ", newStart
                    # end hack 
                    
                else:
                    newStart = 0
                    if option == 'polish':
                        print "Missing polish"
                        extraRead = tmpStore3[x:l - y]
                        # extraRead = performPolishing(leftConnect[eachseg][0], eachseg, tmpStore3[x:l-y],  dataSet, folderName)
                    else:
                        extraRead = tmpStore3[x:l - y]
    
            print extraRead[0:10], len(extraRead)
            
            fImproved.write(extraRead)
            
            if orientation == 0:
                fImproved.write(readSet[readNum][newStart:])   

            else:
                fImproved.write(houseKeeper.reverseComplement(readSet[readNum])[newStart:])
            
            if rightConnect[eachseg][1] != -1:
                tmpStore = rightConnect[eachseg][1]
                tmpStore2 = len(rawRead[rightConnect[eachseg][2]])
                tmpStore3 = rawRead[rightConnect[eachseg][2]]
                
        fImproved.write('\n')
        
    fImproved.close()
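The x / y / l arithmetic above decides how two consecutive contigs joined by a single raw read are stitched together: l is the bridging read's length, x is how much of the read overlaps the tail of the left contig, and y how much overlaps the head of the right contig. When x + y > l the contigs themselves overlap and the right contig is trimmed by x + y - l bases; otherwise the uncovered middle of the read fills the gap. A worked example with made-up numbers:

l = 10000        # length of the bridging raw read (hypothetical)
x = 6500         # read bases overlapping the left contig's tail
y = 4200         # read bases overlapping the right contig's head

if l < x + y:
    newStart = x + y - l                        # 700 bases trimmed off the right contig
    print "contigs overlap, trim right contig by", newStart
else:
    print "gap filled with read[%d:%d]" % (x, l - y)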
Example #20
def extractEdgeSet(folderName, mummerLink, option="nopolish"):
    # Tasks: reconstruct the string graph
    
    # Input: relatedReads_Double.fasta, contig_Double.fasta
    # Intermediate files: fromMum_overlap , fromMum_overlap
    # Output: connectivity of eachNode: InList, OutList [critical]
    #         connectivity of eachNode: arrow representation with size [optional]
    
    
    # ## Perform MUMMER alignment
    print ">Extract Edge set"
    contigOnlyLengthDic = IORobot.obtainLength(folderName, "improved.fasta")
    
    # print lengthDic
    lengthDic = IORobot.findContigLength(folderName, "improved")
    
    numberOfContig = len(contigOnlyLengthDic)*2

    K = 400
    thres = 5
    
    
    # ## Apply MUMMER on them using cleanedReads against them
    IORobot.truncateEndOfContigs(folderName, "improved_Double.fasta", "smaller_improvedContig.fasta", 25000, lengthDic)
    dataSet = []
    
    numberOfFiles = max(20, houseKeeper.globalParallel)
    

    if True:
        bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
        command = bindir + "/fasta-splitter.pl --n-parts " + str(numberOfFiles) + " " + folderName + "relatedReads_Double.fasta"
        os.system(command)
        
        
    workerList = [] 
    for dummyI in range(1, numberOfFiles + 1):
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)
        
        outputName, referenceName, queryName, specialName=  "outGapFillRefine"+indexOfMum , "smaller_improvedContig.fasta",  "relatedReads_Double.part-" + indexOfMum + ".fasta",  "fromMumRefine" + indexOfMum
        workerList.append([outputName, referenceName, queryName, specialName])
    
        
        
    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,True)
        
        # alignerRobot.useMummerAlign(mummerLink, folderName, "outRefine", "smaller_improvedContig.fasta", "relatedReads_Double.part-" + indexOfMum + ".fasta", True,  "fromMumRefine" + indexOfMum)
        
    
    for dummyI in range(1, numberOfFiles + 1):
        tmpSet = IORobot.obtainLinkInfoReadContig(dummyI, mummerLink, folderName,thres, lengthDic, K)
        dataSet = dataSet + tmpSet
    
    # ## repeat aware
    usableJunction = loadOpenList(folderName)
    dataSet, blockedSet = filterRepeatEnd(dataSet, usableJunction)
    # ## repeat aware end
    
    dataSet.sort()
    matchPair = formMatchPairFromReadInfo(dataSet)
    
    # Bug fix on repeat detection from reads alone
    matchPair = filterRepeatPair(matchPair)
    # end bug fix
    
    # print matchPair

    bestMatchPair = []
    
    for key, items in groupby(matchPair, itemgetter(0, 1)):
        maxvalue = -1
        maxLenPair = []
        for eachitem in items:
            if eachitem[2] > maxvalue:
                maxvalue = eachitem[2]
                maxLenPair = [eachitem[3], eachitem[4], eachitem[5]]
        bestMatchPair.append([key[0], key[1], maxvalue, maxLenPair[0], maxLenPair[1], maxLenPair[2]])
    
    contigList, leftConnect, rightConnect, rawReadList = formbestpair(bestMatchPair,numberOfContig)
    print "contigList", contigList
    
    writeContigReadCombine(blockedSet, dataSet, folderName, rawReadList, numberOfContig, contigList, leftConnect, option, rightConnect, mummerLink)
def xPhased(folderName, mummerLink):
    # ## Repeat resolution  [Proxy for MB]
    # 1. Re-form the contig string graph with ALL connections from contigs only V
    # 2. Log down the reads and associated blocked contigs V 
    # 3. Use reads to connect;
    # 4. Transform graph by identifying 1 successor/predecessor case ; Condense(important);
    # 5. Read out contigs
    
    numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "improved2", "mb")
    
    lenDic = IORobot.obtainLength(folderName, "improved2_Double.fasta")
    
    confidenLenThres = 0 
    
    G = graphLib.seqGraph(numberOfContig)
    extraEdges = loadEdgeFromBlockedReads(folderName)
    
    for eachitem in dataSet:
        # print eachitem
        wt, myin, myout = eachitem
        myInData = myin[6:].split('_')
        myOutData = myout[6:].split('_')
        
        if myInData[1] == 'p':
            offsetin = 0
        else:
            offsetin = 1
        
        if myOutData[1] == 'p':
            offsetout = 0
        else:
            offsetout = 1
            
        i = int(myInData[0]) * 2 + offsetin
        j = int(myOutData[0]) * 2 + offsetout
        
        ck = False
        
        for eachedge in extraEdges:
            mystart, myend, len1, len2 = eachedge[0], eachedge[1], eachedge[2] , eachedge[3]
            if [i, j] == [mystart, myend] and min(len1, len2) >= wt and lenDic[myin] >= confidenLenThres and lenDic[myout] >= confidenLenThres:
                ck = True
                
        if ck:
            G.insertEdge(i, j, wt)
    
    
    # G.reportEdge()
    G.MBResolve()
    G.reportEdge()
    
    G.saveToFile(folderName, "condensedGraphMB.txt")
    graphFileName = "condensedGraphMB.txt"
    contigFile = "improved2_Double.fasta"
    outContigFile = "improved3.fasta"
    outOpenList = "openZoneMB.txt"
    
    IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList)
    
    # ## Repeat resolution  [Proxy for phasing step]
    # 6. Find out the repeat region by MSA
    # 7. Find out the location of SNPs and extend across repeat 
    # [short cut : use contig creator : your job here is to get data into the correct formats]
    
    
    
    
    print "xPhased"
def fetchSuccessor(folderName, mummerLink):
    
    print "fetchSuccessor"
    left_connect, right_connect = [], [] 
        
    print "Direct greedy"
    print "fetchSuccessor: Aligning non-contained contigs to themselves, output files are greedy*.delta"
    numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "noEmbed", "greedy")
    # [next_item, overlap_length]
    
    leftConnect = [[-1, -1] for i in range(numberOfContig)]
    rightConnect = [[-1, -1] for i in range(numberOfContig)]
    
    dataSet.sort(reverse=True, key=itemgetter(1))
    
    print "fetchSuccessor: Finding best successors"
    for key, items in groupby(dataSet, itemgetter(1)):
        # if key == "Contig217_d":
        #    print "dddd"
        maxVal = -1
        myName = key
        connectorName = "" 
        for eachsubitem in items:
            if eachsubitem[0] > maxVal:
                maxVal = eachsubitem[0]
                connectorName = eachsubitem[2]
        

        prefix = myName.split('_')
        suffix = connectorName.split('_')
        lengthOfOverlap = maxVal
        
        if prefix[1] == 'p':
            prefixContig = int(prefix[0][6:]) * 2 
        else:
            prefixContig = int(prefix[0][6:]) * 2 + 1
        
        if suffix[1] == 'p':
            suffixContig = int(suffix[0][6:]) * 2 
        else:
            suffixContig = int(suffix[0][6:]) * 2 + 1
            
        assert(rightConnect[prefixContig][0] == -1)
        rightConnect[prefixContig][0] = suffixContig
        rightConnect[prefixContig][1] = lengthOfOverlap
        

    dataSet.sort(reverse=True, key=itemgetter(2))
    
    print "fetchSuccessor: Finding best predecessors"
    for key, items in groupby(dataSet, itemgetter(2)):

        maxVal = -1
        myName = key
        connectorName = "" 
        for eachsubitem in items:
            if eachsubitem[0] > maxVal:
                maxVal = eachsubitem[0]
                connectorName = eachsubitem[1]
        

        prefix = connectorName.split('_')
        suffix = myName.split('_')
        lengthOfOverlap = maxVal
        
        if prefix[1] == 'p':
            prefixContig = int(prefix[0][6:]) * 2 
        else:
            prefixContig = int(prefix[0][6:]) * 2 + 1
        
        if suffix[1] == 'p':
            suffixContig = int(suffix[0][6:]) * 2 
        else:
            suffixContig = int(suffix[0][6:]) * 2 + 1
            
        assert(leftConnect[suffixContig][0] == -1)
        leftConnect[suffixContig][0] = prefixContig 
        leftConnect[suffixContig][1] = lengthOfOverlap
    
    
    print "fetchSuccessor: Outputting best successors to rightConnect.txt"
    # ## Write to file: 
    f = open(folderName + 'rightConnect.txt', 'w')
    for eachitem, dummyIndex in zip(rightConnect, range(len(rightConnect))):
        f.write(str(dummyIndex) + ',' + str(eachitem[0]) + ',' + str(eachitem[1]) + '\n')
        
    f.close()
    
    print "fetchSuccessor: Outputting best predecessors to leftConnect.txt"
    f = open(folderName + 'leftConnect.txt', 'w')
    for eachitem, dummyIndex in zip(leftConnect, range(len(leftConnect))):
        f.write(str(dummyIndex) + ',' + str(eachitem[0]) + ',' + str(eachitem[1]) + '\n')
        
    f.close()
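fetchSuccessor encodes each double-stranded contig end as an integer node: "ContigN_p" (forward copy) maps to 2*N and "ContigN_d" (reverse complement) to 2*N + 1, which is what the *2 / *2+1 arithmetic above and the /2, %2 decoding elsewhere in this listing rely on. A tiny sketch of both directions, assuming that naming convention:

def nameToNode(name):
    # "Contig7_p" -> 14, "Contig7_d" -> 15
    prefix, strand = name.split('_')
    index = int(prefix[6:])                     # strip the literal "Contig"
    return index * 2 if strand == 'p' else index * 2 + 1

def nodeToName(node):
    return "Contig" + str(node // 2) + ("_p" if node % 2 == 0 else "_d")

print nameToNode("Contig7_d"), nodeToName(15)   # 15 Contig7_d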
Example #23
def xPhased(folderName, mummerLink):
    # ## Repeat resolution  [Proxy for MB]
    # 1. Re-form the contig string graph with ALL connections from contigs only V
    # 2. Log down the reads and associated blocked contigs V 
    # 3. Use reads to connect;
    # 4. Transform graph by identifying 1 successor/predecessor case ; Condense(important);
    # 5. Read out contigs
    
    print "xPhased: Aligning improved2.fasta against itself, outputting to mb*.delta"
    numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "improved2", "mb")
    
    lenDic = IORobot.obtainLength(folderName, "improved2_Double.fasta")
    
    confidenLenThres = 0 
    
    print "xPhased: Building seqGraph"
    G = graphLib.seqGraph(numberOfContig)
    extraEdges = loadEdgeFromBlockedReads(folderName)
    
    for eachitem in dataSet:
        # print eachitem
        wt, myin, myout = eachitem
        myInData = myin[6:].split('_')
        myOutData = myout[6:].split('_')
        
        if myInData[1] == 'p':
            offsetin = 0
        else:
            offsetin = 1
        
        if myOutData[1] == 'p':
            offsetout = 0
        else:
            offsetout = 1
            
        i = int(myInData[0]) * 2 + offsetin
        j = int(myOutData[0]) * 2 + offsetout
        
        ck = False
        
        for eachedge in extraEdges:
            mystart, myend, len1, len2 = eachedge[0], eachedge[1], eachedge[2] , eachedge[3]
            if [i, j] == [mystart, myend] and min(len1, len2) >= wt and lenDic[myin] >= confidenLenThres and lenDic[myout] >= confidenLenThres:
                ck = True
                
        if ck:
            G.insertEdge(i, j, wt)
    
    
    # G.reportEdge()
    G.MBResolve()
    G.reportEdge()
    
    print "xPhased: Saving condensed seqGraph to condensedGraphMB.txt"
    G.saveToFile(folderName, "condensedGraphMB.txt")
    graphFileName = "condensedGraphMB.txt"
    contigFile = "improved2_Double.fasta"
    outContigFile = "improved3.fasta"
    outOpenList = "openZoneMB.txt"
    
    print "xPhased: Outputting improved contigs from condensed seqGraph to improved3.fasta"
    IORobot.readContigOut(folderName, mummerLink, graphFileName, contigFile, outContigFile, outOpenList)
    
    
    # ## Repeat resolution  [Proxy for phasing step]
    # 6. Find out the repeat region by MSA
    # 7. Find out the location of SNPs and extend across repeat 
    # [short cut : use contig creator : your job here is to get data into the correct formats]
    
    
    
    
    print "xPhased"
Example #24
def fetchSuccessor(folderName, mummerLink):
    
    print "fetchSuccessor"
    left_connect, right_connect = [], [] 
        
    print "Direct greedy"
    numberOfContig, dataSet = IORobot.obtainLinkInfo(folderName, mummerLink, "noEmbed", "greedy")
    # [next_item, overlap_length]
    
    leftConnect = [[-1, -1] for i in range(numberOfContig)]
    rightConnect = [[-1, -1] for i in range(numberOfContig)]
    
    dataSet.sort(reverse=True, key=itemgetter(1))
    
    for key, items in groupby(dataSet, itemgetter(1)):
        # if key == "Contig217_d":
        #    print "dddd"
        maxVal = -1
        myName = key
        connectorName = "" 
        for eachsubitem in items:
            if eachsubitem[0] > maxVal:
                maxVal = eachsubitem[0]
                connectorName = eachsubitem[2]
        

        prefix = myName.split('_')
        suffix = connectorName.split('_')
        lengthOfOverlap = maxVal
        
        if prefix[1] == 'p':
            prefixContig = int(prefix[0][6:]) * 2 
        else:
            prefixContig = int(prefix[0][6:]) * 2 + 1
        
        if suffix[1] == 'p':
            suffixContig = int(suffix[0][6:]) * 2 
        else:
            suffixContig = int(suffix[0][6:]) * 2 + 1
            
        assert(rightConnect[prefixContig][0] == -1)
        rightConnect[prefixContig][0] = suffixContig
        rightConnect[prefixContig][1] = lengthOfOverlap
        

    dataSet.sort(reverse=True, key=itemgetter(2))
    
    for key, items in groupby(dataSet, itemgetter(2)):

        maxVal = -1
        myName = key
        connectorName = "" 
        for eachsubitem in items:
            if eachsubitem[0] > maxVal:
                maxVal = eachsubitem[0]
                connectorName = eachsubitem[1]
        

        prefix = connectorName.split('_')
        suffix = myName.split('_')
        lengthOfOverlap = maxVal
        
        if prefix[1] == 'p':
            prefixContig = int(prefix[0][6:]) * 2 
        else:
            prefixContig = int(prefix[0][6:]) * 2 + 1
        
        if suffix[1] == 'p':
            suffixContig = int(suffix[0][6:]) * 2 
        else:
            suffixContig = int(suffix[0][6:]) * 2 + 1
            
        assert(leftConnect[suffixContig][0] == -1)
        leftConnect[suffixContig][0] = prefixContig 
        leftConnect[suffixContig][1] = lengthOfOverlap
    
    
    # ## Write to file: 
    f = open(folderName + 'rightConnect.txt', 'w')
    for eachitem, dummyIndex in zip(rightConnect, range(len(rightConnect))):
        f.write(str(dummyIndex) + ',' + str(eachitem[0]) + ',' + str(eachitem[1]) + '\n')
        
    f.close()
    
    f = open(folderName + 'leftConnect.txt', 'w')
    for eachitem, dummyIndex in zip(leftConnect, range(len(leftConnect))):
        f.write(str(dummyIndex) + ',' + str(eachitem[0]) + ',' + str(eachitem[1]) + '\n')
        
    f.close()