Ejemplo n.º 1
0
def formReadContigStringGraph(folderName,
                              mummerLink,
                              contigFilename,
                              readsetFilename,
                              optTypeFileHeader,
                              graphName,
                              needAlignment=True):
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm: 
        a) Form double reads and contigs                            V
        b) Mummer the data and extract dataList three times         V
        c) Use the subroutine to output a graph                     V
        d) Output the graph to a file phasing_String_graph.graph    V
    '''

    G = []

    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta",
                                contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta",
                                readsetFilename + "_Double.fasta", "reads")

    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta"

    #if needAlignment:
    #    alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    if needAlignment:
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName, [[header, referenceFile, queryFile, ""]],
            houseKeeper.globalParallel)

    lenDicCC = IORobot.obtainLength(folderName,
                                    contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)

    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta"

    lenDicRR = IORobot.obtainLength(folderName,
                                    readsetFilename + "_Double.fasta")

    if not abunHouseKeeper.abunGlobalRRDisable:
        if needAlignment:
            alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                              header)

        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)

    else:
        dataListRR = []

    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    if needAlignment:
        alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                          header)

    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)

    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    
    '''

    addDataToList(dataListCC, G, 0, 0, 'C', 'C')

    addDataToList(dataListRR, G, N1, N1, 'R', 'R')

    addDataToList(dataListCR, G, 0, N1, 'C', 'R')

    Gnew = formExtraEdges(folderName, optTypeFileHeader, contigFilename, G, N1)

    Gnew.saveToFile(folderName, graphName)

    print "len(Gnew.graphNodesList)", len(Gnew.graphNodesList)
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName):
    
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm: 
        a) Form double reads and contigs                            V
        b) Mummer the data and extract dataList three times         V
        c) Use the subroutine to output a graph                     V
        d) Output the graph to a file phasing_String_graph.graph    V
    '''

    G = []

    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads")
    
    
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta" , contigFilename + "_Double.fasta"
    if True:
        alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)

    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)
    
    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta" , readsetFilename + "_Double.fasta"
    
    
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")
    
    if not abunHouseKeeper.abunGlobalRRDisable:
        if True:
            alignerSubRoutine(folderName ,referenceFile,  queryFile, mummerLink, header )
    
        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        for eachitem in dataListRR:
            if eachitem[-2] == "Read164_p" and eachitem[-1] == "Read159_p" :    
                print "debug" , eachitem
            if eachitem[-1] == "Read164_p" and eachitem[-2] == "Read159_p" :    
                print "debug" , eachitem
            
        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)

    else:
        dataListRR = []
    
    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta" , readsetFilename + "_Double.fasta"
    if True:
        alignerSubRoutine(folderName ,referenceFile,  queryFile, mummerLink, header )
        #alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
            
    numberOfNodes = len(lenDicCR) 
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    
    '''
    
    # print dataListCC[0]
    # print dataListRR[0]
    # print dataListCR[0]
    
    # for eachitem in dataListCC:
    #    print eachitem
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    # for eachitem in dataListRR[0:10]:
    #    print eachitem , lenDicRR[eachitem[-2]], lenDicRR[eachitem[-1]]
    
    addDataToList(dataListRR, G, N1, N1, 'R', 'R')
    
    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # G.reportEdge()
    G.saveToFile(folderName, graphName)
    
    checkGraphLength(G, N1, lenDicRR)
    
    # print len(G.graphNodesList[0].listOfPrevNodes), len(G.graphNodesList[0].listOfNextNodes)
    print "len(G.graphNodesList)", len(G.graphNodesList)
    
    
    
Ejemplo n.º 3
0
def resolvingTandem(folderName, mummerPath, contigReadGraph,contigFilename, readsetFilename, optTypeFileHeader, repeatSpec):
    print "resolvingTandem"
    '''
    Input : repeat info 
    Output : count, join. 
    
    Algorithm: 
    1. Find loops
    2. Form repeat
    3. Form chain of repeat copies back to back
    4. Align reads
    5. Calculate extra bases beyond flanking region
    6. Calculate count
    7. Join the contigs
    '''
    # 0 ) Load all the data
    thres = 5 
    
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta")
    N1 = len(lenDicCC)

    maxDuplicate = 10
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"
    


    myContigsDic = IORobot.loadContigsFromFile(folderName, readsetFilename+"_Double.fasta")    
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")
    
    header = optTypeFileHeader + "RR"
    dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
    dataListRRDic = {}
    for eachitem in dataListRR: 
        if eachitem[2] < thres:
            dataListRRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    header = optTypeFileHeader + "CR"
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    dataListCRDic = {}
    for eachitem in dataListCR: 
        if eachitem[2] < thres:
            dataListCRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    print dataListCRDic



    json_data = open(folderName + repeatSpec, 'r')
    loadData = json.load(json_data)
    
    contigsTmp = IORobot.loadContigsFromFile(folderName, contigFilename+"_Double.fasta")
    readTmp = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")

    happyTandemList = {}
    
    
    
    for eachrepProfile in loadData:
        # 1) 
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)
       
        # 2) 
        if isTerminate:
            v = returnPathList[-1]
            i =0 
            tandemPath = []
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    i = len(returnPathList)
                i = i +1
                
            print returnPathList
            print tandemPath
        # 3) [fix it when have time later ; to just use graph; bug at the min thing]
        
        repeatContent = ""
    
        for kk in range(len(tandemPath[0:-1])): 
            eachitem = tandemPath[kk]- N1
            nextitem = tandemPath[kk+1] - N1
            readName = "Read" + str(eachitem/2) + "_"
            nextReadName = "Read" + str(nextitem/2) + "_"
            if eachitem %2 ==0 :
                readName = readName + "p"
            elif eachitem %2 ==1:
                readName = readName + "d"
            
            if nextitem %2 ==0 :
                nextReadName = nextReadName + "p"
            elif nextitem %2 ==1:
                nextReadName = nextReadName + "d"
            
            overlap = dataListRRDic[readName + ";" + nextReadName]
            print overlap
            repeatContent = repeatContent +  myContigsDic[readName][0:-overlap]
            
        print "len(repeatContent)", len(repeatContent)
        
        fout = open(folderName + repeatTempFilename, 'w')
        fout.write(">RepeatSegment\n")
        repeatContentLarge = ""
        
        for i in range(maxDuplicate):
            fout.write(repeatContent)
            repeatContentLarge= repeatContentLarge + repeatContent
        fout.close()
        
        # 4)
        repeatReadList =  eachrepProfile[1]
        
        myList= []
        for eachitem in repeatReadList:
            
            readName = "Read" + str((eachitem- N1)/2) + "_"
    
            if eachitem %2 ==0 :
                readName = readName + "p"
            elif eachitem %2 ==1:
                readName = readName + "d"
            myList.append(readName)
            
        IORobot.putListToFileO(folderName, readsetFilename+"_Double.fasta", "toAlignReads", myList)
        
        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName,mummerFile , repeatTempFilename, "toAlignReads.fasta")
        
        dataList = alignerRobot.extractMumData(folderName, mummerFile+"Out")
        
        
        # 5)
        totalBasesMatch = 0
        lrepeat = len(repeatContent)
        c = findCoverageFromRawData(folderName)
        
        
        # print "dataList[0]", dataList[0]
        dataList.sort(key = itemgetter(-1))
        for key, values in  groupby(dataList,itemgetter(-1)):
            maxValue = -1
            for eachsub in values:
                if eachsub[5] > maxValue:
                    maxValue = eachsub[5]
    
            #print key, maxValue
            totalBasesMatch = totalBasesMatch + maxValue
        
    
        print c, lrepeat, totalBasesMatch
        ct = totalBasesMatch*1.0/(c*lrepeat)
        print "BIG NUMBER of THE DAY: ", ct
    
        # 6) 
        # a) find the starting point 
        startContig = eachrepProfile[-1][0][0]
        firstRead = eachrepProfile[-1][0][1]-N1

        contigName = "Contig"+ str(startContig/2)
        if startContig %2 == 0:
            contigName = contigName + "_p"
        elif startContig%2 ==1:
            contigName = contigName + "_d"
        
        readName = "Read"+ str(firstRead/2)
        if firstRead %2 == 0:
            readName = readName + "_p"
        elif firstRead%2 ==1:
            readName = readName + "_d"
        
        overlapFirst = dataListCRDic[contigName+";"+readName]
        tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]
        
        f1 = open(folderName + "firstOverlap.fasta", 'w')
        f1.write(">combined\n")
        f1.write(tmpCombine)
        f1.close()
        
        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName,"myFirstOverlap" , repeatTempFilename, "firstOverlap.fasta")
        
        dataList = alignerRobot.extractMumData(folderName, "myFirstOverlap"+"Out")
        
        dataList.sort(key = itemgetter(0))
        maxVal = -1
        maxItm = []
        for eachi in dataList:
            if eachi[5] > maxVal:
                maxVal = eachi[5]
                maxItm = eachi
        
        print maxItm
        if len(maxItm) > 0 :
            repeatStart = maxItm[0]
            contigEnd = maxItm[2]
        else:
            repeatStart = 0
            contigEnd = -1
        # b) format return : prepare the repeat template 
        print "ct*lrepeat", int(repeatStart + ct*lrepeat)
        print "repeatStart", repeatStart
        happyTandemList[contigName]= repeatContentLarge[repeatStart:int(repeatStart + ct*lrepeat)]
        contigsTmp[contigName] = tmpCombine[0:contigEnd]
        print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
        print "len(happyTandemList[contigName])", len(happyTandemList[contigName])
        
    # 7) Combine all the repeat information and do the join
    
    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]
        
        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)
        
        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]
        
        if endContig != leaderContig:
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig
        
    
    leaderAgg = [[] for i in range(len(leaderList))]
    for i in range(len(leaderList)):
        leaderAgg[leaderList[i]].append(i) 
    
    checkingList = [False for i in range(N1)]
    
    fout = open(folderName + "tademResolved.fasta", 'w')
    
    counter = 0
    for eachcontig in contigsTmp:
        id = abunHouseKeeper.parseEdgeNameToID(eachcontig, 'C')
        if checkingList[id/2] == False:
        
            fout.write(">Segkk"+str(counter)+ "\n")
            
            fout.write(contigsTmp[eachcontig])
            counter = counter + 1    
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk/2] = True
    
    fout.close()
def resolvingTandem(
    folderName, mummerPath, contigReadGraph, contigFilename, readsetFilename, optTypeFileHeader, repeatSpec
):
    print "resolvingTandem"
    """
    Input : repeat info 
    Output : count, join. 
    
    Algorithm: 
    1. Find loops
    2. Form repeat
    3. Form chain of repeat copies back to back
    4. Align reads
    5. Calculate extra bases beyond flanking region
    6. Calculate count
    7. Join the contigs
    """
    # 0 ) Load all the data
    thres = 5

    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    N1 = len(lenDicCC)

    maxDuplicate = 10
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"

    myContigsDic = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")

    header = optTypeFileHeader + "RR"
    dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
    dataListRRDic = {}
    for eachitem in dataListRR:
        if eachitem[2] < thres:
            dataListRRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4]

    header = optTypeFileHeader + "CR"
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())

    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    dataListCRDic = {}
    for eachitem in dataListCR:
        if eachitem[2] < thres:
            dataListCRDic[eachitem[-2] + ";" + eachitem[-1]] = eachitem[4]

    print dataListCRDic

    json_data = open(folderName + repeatSpec, "r")
    loadData = json.load(json_data)

    contigsTmp = IORobot.loadContigsFromFile(folderName, contigFilename + "_Double.fasta")
    readTmp = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")

    happyTandemList = {}

    for eachrepProfile in loadData:
        # 1)
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)

        # 2)
        if isTerminate:
            v = returnPathList[-1]
            i = 0
            tandemPath = []
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    i = len(returnPathList)
                i = i + 1

            print returnPathList
            print tandemPath
        # 3) [fix it when have time later ; to just use graph; bug at the min thing]

        repeatContent = ""

        for kk in range(len(tandemPath[0:-1])):
            eachitem = tandemPath[kk] - N1
            nextitem = tandemPath[kk + 1] - N1
            readName = "Read" + str(eachitem / 2) + "_"
            nextReadName = "Read" + str(nextitem / 2) + "_"
            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"

            if nextitem % 2 == 0:
                nextReadName = nextReadName + "p"
            elif nextitem % 2 == 1:
                nextReadName = nextReadName + "d"

            overlap = dataListRRDic[readName + ";" + nextReadName]
            print overlap
            repeatContent = repeatContent + myContigsDic[readName][0:-overlap]

        print "len(repeatContent)", len(repeatContent)

        fout = open(folderName + repeatTempFilename, "w")
        fout.write(">RepeatSegment\n")
        repeatContentLarge = ""

        for i in range(maxDuplicate):
            fout.write(repeatContent)
            repeatContentLarge = repeatContentLarge + repeatContent
        fout.close()

        # 4)
        repeatReadList = eachrepProfile[1]

        myList = []
        for eachitem in repeatReadList:

            readName = "Read" + str((eachitem - N1) / 2) + "_"

            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"
            myList.append(readName)

        IORobot.putListToFileO(folderName, readsetFilename + "_Double.fasta", "toAlignReads", myList)

        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName, mummerFile, repeatTempFilename, "toAlignReads.fasta")

        dataList = alignerRobot.extractMumData(folderName, mummerFile + "Out")

        # 5)
        totalBasesMatch = 0
        lrepeat = len(repeatContent)
        c = findCoverageFromRawData(folderName)

        # print "dataList[0]", dataList[0]
        dataList.sort(key=itemgetter(-1))
        for key, values in groupby(dataList, itemgetter(-1)):
            maxValue = -1
            for eachsub in values:
                if eachsub[5] > maxValue:
                    maxValue = eachsub[5]

            # print key, maxValue
            totalBasesMatch = totalBasesMatch + maxValue

        print c, lrepeat, totalBasesMatch
        ct = totalBasesMatch * 1.0 / (c * lrepeat)
        print "BIG NUMBER of THE DAY: ", ct

        # 6)
        # a) find the starting point
        startContig = eachrepProfile[-1][0][0]
        firstRead = eachrepProfile[-1][0][1] - N1

        contigName = "Contig" + str(startContig / 2)
        if startContig % 2 == 0:
            contigName = contigName + "_p"
        elif startContig % 2 == 1:
            contigName = contigName + "_d"

        readName = "Read" + str(firstRead / 2)
        if firstRead % 2 == 0:
            readName = readName + "_p"
        elif firstRead % 2 == 1:
            readName = readName + "_d"

        overlapFirst = dataListCRDic[contigName + ";" + readName]
        tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]

        f1 = open(folderName + "firstOverlap.fasta", "w")
        f1.write(">combined\n")
        f1.write(tmpCombine)
        f1.close()

        if True:
            alignerRobot.useMummerAlign(
                mummerPath, folderName, "myFirstOverlap", repeatTempFilename, "firstOverlap.fasta"
            )

        dataList = alignerRobot.extractMumData(folderName, "myFirstOverlap" + "Out")

        dataList.sort(key=itemgetter(0))
        maxVal = -1
        maxItm = []
        for eachi in dataList:
            if eachi[5] > maxVal:
                maxVal = eachi[5]
                maxItm = eachi

        print maxItm
        if len(maxItm) > 0:
            repeatStart = maxItm[0]
            contigEnd = maxItm[2]
        else:
            repeatStart = 0
            contigEnd = -1
        # b) format return : prepare the repeat template
        print "ct*lrepeat", int(repeatStart + ct * lrepeat)
        print "repeatStart", repeatStart
        happyTandemList[contigName] = repeatContentLarge[repeatStart : int(repeatStart + ct * lrepeat)]
        contigsTmp[contigName] = tmpCombine[0:contigEnd]
        print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
        print "len(happyTandemList[contigName])", len(happyTandemList[contigName])

    # 7) Combine all the repeat information and do the join

    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]

        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)

        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]

        if endContig != leaderContig:
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig

    leaderAgg = [[] for i in range(len(leaderList))]
    for i in range(len(leaderList)):
        leaderAgg[leaderList[i]].append(i)

    checkingList = [False for i in range(N1)]

    fout = open(folderName + "tademResolved.fasta", "w")

    counter = 0
    for eachcontig in contigsTmp:
        id = abunHouseKeeper.parseEdgeNameToID(eachcontig, "C")
        if checkingList[id / 2] == False:

            fout.write(">Segkk" + str(counter) + "\n")

            fout.write(contigsTmp[eachcontig])
            counter = counter + 1
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk / 2] = True

    fout.close()
def formReadContigStringGraph(folderName, mummerLink, contigFilename,
                              readsetFilename, optTypeFileHeader, graphName):
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm: 
        a) Form double reads and contigs                            V
        b) Mummer the data and extract dataList three times         V
        c) Use the subroutine to output a graph                     V
        d) Output the graph to a file phasing_String_graph.graph    V
    '''

    G = []

    IORobot.writeToFile_Double1(folderName, contigFilename + ".fasta",
                                contigFilename + "_Double.fasta", "contig")
    IORobot.writeToFile_Double1(folderName, readsetFilename + ".fasta",
                                readsetFilename + "_Double.fasta", "reads")

    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta", contigFilename + "_Double.fasta"
    if True:
        alignerRobot.useMummerAlign(mummerLink, folderName, header,
                                    referenceFile, queryFile)

    lenDicCC = IORobot.obtainLength(folderName,
                                    contigFilename + "_Double.fasta")
    dataListCC = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCC = abunHouseKeeper.filterData(dataListCC, lenDicCC)

    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta", readsetFilename + "_Double.fasta"

    lenDicRR = IORobot.obtainLength(folderName,
                                    readsetFilename + "_Double.fasta")

    if not abunHouseKeeper.abunGlobalRRDisable:
        if True:
            alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                              header)

        dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
        dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
        for eachitem in dataListRR:
            if eachitem[-2] == "Read164_p" and eachitem[-1] == "Read159_p":
                print "debug", eachitem
            if eachitem[-1] == "Read164_p" and eachitem[-2] == "Read159_p":
                print "debug", eachitem

        dataListRR = abunHouseKeeper.filterDataIdentical(dataListRR, lenDicRR)

    else:
        dataListRR = []

    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta", readsetFilename + "_Double.fasta"
    if True:
        alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                          header)
        #alignerRobot.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)

    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)

    numberOfNodes = len(lenDicCR)
    G = graphLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    
    '''

    # print dataListCC[0]
    # print dataListRR[0]
    # print dataListCR[0]

    # for eachitem in dataListCC:
    #    print eachitem
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    # for eachitem in dataListRR[0:10]:
    #    print eachitem , lenDicRR[eachitem[-2]], lenDicRR[eachitem[-1]]

    addDataToList(dataListRR, G, N1, N1, 'R', 'R')

    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # G.reportEdge()
    G.saveToFile(folderName, graphName)

    checkGraphLength(G, N1, lenDicRR)

    # print len(G.graphNodesList[0].listOfPrevNodes), len(G.graphNodesList[0].listOfNextNodes)
    print "len(G.graphNodesList)", len(G.graphNodesList)