Example #1
0
def formReverseGraph(G):
    # Build Grev with the same node set as G but every edge reversed.
    nNode = len(G.graphNodesList)
    Grev = commonLib.seqGraph(nNode)
    for i in range(nNode):
        for j in range(nNode):
            # haveInserted is True when G already has the edge i -> j
            haveInserted = commonLib.nameInEdgeList(j, G.graphNodesList[i].listOfNextNodes)
            if haveInserted:
                Grev.insertEdge(j, i, 100)  # reversed edge j -> i; the weight is a placeholder
    return Grev
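
# A minimal standalone sketch of the edge-reversal idea in formReverseGraph,
# using plain adjacency lists instead of commonLib.seqGraph (the list-based
# graph shape and the placeholder weight above are what this mirrors; the
# helper name is hypothetical).
def reverseAdjacency(adjList):
    nNode = len(adjList)
    revList = [[] for _ in range(nNode)]
    for i in range(nNode):
        for j in adjList[i]:
            # edge i -> j in G becomes edge j -> i in the reversed graph
            revList[j].append(i)
    return revList

# e.g. reverseAdjacency([[1, 2], [2], []]) == [[], [0], [0, 1]]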
Example #2
0
def performPhasing(folderName, mummerLink):
    print "performPhasing"
    '''
    1. Interface from alignmentBridge.py : 
        shortToLongMap = formRelatedMap(f2, noisyReads, currentNode, indelRobot, toProcessList)
        cleaner.cleaning([noisyReads,noisyReads] ,shortToLongMap, toProcessList,indelRobot, "init")
        in1List, in2List, out1List, out2List, commonList, longReadToUse  = cleaner.cleaning([noisyReads, noisyReads],shortToLongMap, toProcessList,indelRobot, "vote")
        extendResult = extender.readExtender(in1List, in2List, out1List, out2List, commonList,indelRobot,longReadToUse, True)
    
    2. Format of input data:
        bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList])
    
    3. IO : 
        a) Input :
            repeatSpecification.txt, phasingSeedName_Double.fasta, graph G 
        b) Output :
            improved4.fasta
            
    4. Algorithm:
        a) reformatNoisyReads 
        b) reformatToProcessList
        c) formShortToLongMapping
    
    '''

    json_data = open(folderName + 'repeatSpecification.txt', 'r')
    loadData = json.load(json_data)
    
    G = commonLib.seqGraph(0)
    G.loadFromFile(folderName, "phaseStringGraph1")
    
    lenDicRR = commonLib.obtainLength(folderName, "phasingSeedName_Double.fasta")
    
    lenDicCC = commonLib.obtainLength(folderName, "improved3_Double.fasta")
    N1 = len(lenDicCC)
    
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    
    for eachitem in loadData:
        print eachitem
        flankingList, repeatList, repeatPathway, flankingPathsList = eachitem[0], eachitem[1], eachitem[2], eachitem[3] 
        
        noisyReads, dicToOriginal, dicFromOriginal = reformatNoisyReads(folderName, flankingList, repeatList, N1)
        
        toProcessList = reformatToProcessList(folderName, flankingList, repeatList, dicFromOriginal, N1)

        shortToLongMap = formShortToLongMapping(folderName, G, toProcessList, dicFromOriginal, dicToOriginal, lenDicCR, N1)
        
        indelRobot = createIndelRobot(folderName)
        
        cleaner.cleaning([noisyReads, noisyReads] , shortToLongMap, toProcessList, indelRobot, "init")
        in1List, in2List, out1List, out2List, commonList, longReadToUse = cleaner.cleaning([noisyReads, noisyReads], shortToLongMap, toProcessList, indelRobot, "vote")
        extendResult = extender.readExtender(in1List, in2List, out1List, out2List, commonList, indelRobot, longReadToUse, True)
        
        if extendResult != -1:
            print "extendResult: ", extendResult
            assert False  # deliberately abort when readExtender reports a non -1 result
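
# performPhasing above merges the contig and read length dictionaries with
# dict(lenDicCC.items() + lenDicRR.items()), which relies on Python 2 returning
# lists from .items(); a minimal sketch of an equivalent merge that also works
# with Python 3 dict views is:
def mergeLengthDics(lenDicCC, lenDicRR):
    lenDicCR = dict(lenDicCC)
    lenDicCR.update(lenDicRR)  # read lengths win on (unlikely) key clashes
    return lenDicCR

# e.g. mergeLengthDics({'Contig0_p': 520}, {'Read0_p': 1383})
#      == {'Contig0_p': 520, 'Read0_p': 1383}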
Example #3
0
def connectContigs(toPhase, toRemove, toBR, folderName, mummerLink):
    print "\nConnect Contigs"
    tmpList = []
    delThres = 20000
    lenDic = commonLib.obtainLength(folderName, "improved3_Double.fasta")
    
    for eachitem in toRemove:
        tmpList.append(eachitem/2)
        
    tmpList.sort()
    removeContigIndexList = []
    # groupby over the sorted list visits each contig index exactly once
    for key, items in groupby(tmpList):
        name = "Contig" + str(key) + "_p"
        if lenDic[name] < delThres:
            removeContigIndexList.append(2 * key)
            removeContigIndexList.append(2 * key + 1)
        
    print "removeContigIndexList", removeContigIndexList
    
    
    
    ### toRemove ===> remove both strands when detected
    G = commonLib.seqGraph(len(lenDic))
    
    ### hack! make nodeIndexList empty for removed nodes

    for eachnode in G.graphNodesList:
        if eachnode.nodeIndex in removeContigIndexList:
            eachnode.nodeIndexList = []
    
    # form a graph, .condense, then use readContigOut
    ### add edge 
    for eachedge in toBR:
        i = eachedge[0]/2
        j = eachedge[1]/2
        wt = eachedge[3]+1
        print "i, j, wt", i, j, wt
        G.insertEdge(i, j, wt)
    
    tmpFileName = "xphasebonus"
    G.condense()
    G.saveToFile(folderName, tmpFileName)
    
    commonLib.readContigOut(folderName, mummerLink, tmpFileName, "improved3_Double.fasta", "improved4.fasta", tmpFileName+"Open")
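
# A standalone sketch of the contig-removal bookkeeping in connectContigs:
# node indices come in strand pairs (2*k and 2*k+1 for contig k), so a contig
# flagged on either strand is dropped on both.  The length threshold and the
# "Contig<k>_p" key format follow the code above; the helper name and the toy
# lenDic in the usage line are made up.
from itertools import groupby

def shortContigPairsToRemove(toRemove, lenDic, delThres=20000):
    tmpList = sorted(eachitem // 2 for eachitem in toRemove)  # collapse to contig index
    removeContigIndexList = []
    for key, items in groupby(tmpList):  # deduplicate the sorted indices
        if lenDic["Contig" + str(key) + "_p"] < delThres:
            removeContigIndexList.append(2 * key)      # forward strand node
            removeContigIndexList.append(2 * key + 1)  # reverse strand node
    return removeContigIndexList

# e.g. shortContigPairsToRemove([0, 1, 5], {"Contig0_p": 100, "Contig2_p": 99999})
#      == [0, 1]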
Example #4
0
def defineRepeatAndFlanking(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, repeatSpec):
    '''
    Input : 
V        a) String graph : G                
V        b) Repeat Pairing : repeatList     
        
    Output : 
V        a) chain of repeat indices (e.g. [S= R1, R33, R45, R24= E]) 
V        b) chain of flanking region indices for in1/2 out1/2 middle (e.g. [C1, R2, R4] )
V        c) in1/2 out1/2 and middle reads per repeat (e.g. [R1, R33, R45, R24])  
        
    Algorithm : 
V        1. Find repeat by graph operations
V        2. Find flanking region by graph operations
V        3. Find associated reads by graph operations
    '''
    
    print "defineRepeatAndFlanking: "


    
    
    # 0. Load previous data
    G = commonLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    Grev = formReverseGraph(G)
    
    json_data = open(folderName + repeatFilename, 'r')
    repeatList = json.load(json_data)
    
    lenDicCC = commonLib.obtainLength(folderName, contigFilename+"_Double.fasta")
    N1 = len(lenDicCC)
    
    
    print "repeatList: ", repeatList
    print "len(G.graphNodesList)", len(G.graphNodesList)
     
    bigDumpList = []
    
    print "len(repeatList)", len(repeatList) , repeatList
    for r in repeatList:
        rIn, rOut = [], []
        for eachitem in r[0]:
            rIn.append(eachitem / 2)
        for eachitem in r[1]:
            rOut.append(eachitem / 2)
        
        if (len(rIn) == 2 and len(rOut) == 2) or (len(rIn) == 1 and len(rOut) == 1):
            print rIn, rOut
            if len(rIn) == 1 and len(rOut) == 1:
                # duplicate the single entry so the 2-in/2-out code path below applies
                rIn = [rIn[0], rIn[0]]
                rOut = [rOut[0], rOut[0]]
            
            # 1. Record reachable indices
            kkIn, kkOut = [], []
            for eachkk in rIn:
                kkIn.append(str(eachkk) + "_in")
            
            for eachkk in rOut:
                kkOut.append(str(eachkk) + "_out")
            
            
            markReachableIndices(G, Grev, kkIn, kkOut, N1)
            
            # 2. Marks inside nodes
            singleMissList, allPassList = markInsideNodes(G, kkIn, kkOut)
            for i in range(4): 
                print "len(singleMissList[i]), len(allPassList)", len(singleMissList[i]), len(allPassList)

            # 3. Finds start/end of repeat
            myStartIndex, myEndIndex = markStartEndNodes(G, rIn, rOut, singleMissList, allPassList)
            print myStartIndex, myEndIndex
            
            # 4. Find repeat interior by shortest path joining S/E
            repeatPathway = markInterior(G , myStartIndex, myEndIndex, N1)
            print "repeatPathway", repeatPathway
            #checkPathLength(repeatPathway, G, N1, folderName)
            
            # 5. Find flanking region by shortest path search again
            flankingPathsList = markFlankingRegion(G, rIn, rOut, myStartIndex, myEndIndex, N1)
            print flankingPathsList
            
            # 6. Find associated reads by graph node query
            # Note: this rebinds repeatList locally; the outer "for r in repeatList"
            # loop keeps iterating the original list object loaded from JSON.
            flankingList, repeatList = markAssociatedReads(G, singleMissList, allPassList)
            
            # ## Experimental: override repeatList with allPassList
            repeatList = allPassList
            # ## End Experimental
            for eachlist in flankingList:
                print len(eachlist), len(repeatList)
            
            bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList])

    # 7. Format return and move on to the phasing 
    with open(folderName + repeatSpec, 'w') as outfile:
        json.dump(bigDumpList, outfile)
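
# A minimal sketch of the repeat-pair normalisation done at the top of the loop
# above: doubled node indices from the loaded repeatList are halved back to
# contig indices, and a 1-in/1-out repeat is padded so the downstream
# 2-in/2-out code path applies.  The helper name is hypothetical.
def normaliseRepeatPair(r):
    rIn = [eachitem // 2 for eachitem in r[0]]
    rOut = [eachitem // 2 for eachitem in r[1]]
    if len(rIn) == 1 and len(rOut) == 1:
        rIn, rOut = [rIn[0], rIn[0]], [rOut[0], rOut[0]]
    return rIn, rOut

# e.g. normaliseRepeatPair([[2, 6], [8, 10]]) == ([1, 3], [4, 5])
#      normaliseRepeatPair([[2], [8]]) == ([1, 1], [4, 4])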
Example #5
0
def identifyRepeat(folderName, mummerLink, contigFilename, contigReadGraph, repeatFilename, optionToRun):
    '''
    Input : Graph --- phaseStringGraph1
    Output: repeat pairs { [ (1,2), (3,4) ] , [(5,6),(7,8)] } 
    Algorithm: 
        a) Reachability test on the graph to find the partners
        b) Form Bipartite graph
        c) Find connected component in the bipartite and define as repeat pairs

    '''
    
    # ## (a) reachability test to find partners 
    G = commonLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    # G.reportEdge()
    lenDicCC = commonLib.obtainLength(folderName, contigFilename+"_Double.fasta")
    
    adjacencyList = [[] for i in range(len(lenDicCC))]
    
    N1 = len(lenDicCC)
    
    
    # # Debug
    # for i in range(14):
    #    debugGraphPath(i, 2, G, N1)
    # # End Debug
    
    for i in range(len(lenDicCC)):
        adjacencyList[i] = findAllReachable(i, N1, G) 
        print "i, adjacencyList[i] : ", i , adjacencyList[i]
    
    # ## (b) formation of bipartite graph
    if optionToRun == "tandem":
        newAdjacencyList = adjacencyList
    elif optionToRun == "xphase":
        newAdjacencyList = filterEdge(adjacencyList, folderName, contigFilename)
    else:
        raise ValueError("unknown optionToRun: " + str(optionToRun))
    
    G2 = commonLib.seqGraph(N1 * 2)
    for i in range(N1):
        for j in newAdjacencyList[i]:
            G2.insertEdge(2 * i, 2 * j + 1, 1)
            G2.insertEdge(2 * j + 1, 2 * i, 1)

    clusters = G2.findConnectedComponents()
    
    repeatList = []
    for eachitem in clusters:
        leftList, rightList = [], []
        for eachsubitem in eachitem:
            if eachsubitem % 2 == 0 :
                leftList.append(eachsubitem)
            else:
                rightList.append(eachsubitem)
                
        
        repeatList.append([getDistinct(leftList), getDistinct(rightList)])
           
    with open(folderName + repeatFilename, 'w') as outfile:
        json.dump(repeatList, outfile)

    
    # Sanity check: the dumped JSON round-trips to the same repeat list.
    json_data = open(folderName + repeatFilename, 'r')
    loadData = json.load(json_data)
    
    assert(loadData == repeatList)
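
# A standalone sketch of steps (b)-(c) above: each node i of the adjacency list
# becomes a "left" vertex 2*i, each reachable partner j a "right" vertex 2*j+1,
# and the connected components of that bipartite graph give the repeat pairs.
# Plain dict/set structures stand in for commonLib.seqGraph and its
# findConnectedComponents here; the helper name is hypothetical.
def bipartiteRepeatGroups(adjacencyList):
    neighbours = {}
    for i, partners in enumerate(adjacencyList):
        for j in partners:
            neighbours.setdefault(2 * i, set()).add(2 * j + 1)
            neighbours.setdefault(2 * j + 1, set()).add(2 * i)

    seen, groups = set(), []
    for start in sorted(neighbours):
        if start in seen:
            continue
        stack, component = [start], []
        while stack:
            v = stack.pop()
            if v in seen:
                continue
            seen.add(v)
            component.append(v)
            stack.extend(neighbours[v] - seen)
        leftList = sorted(set(v for v in component if v % 2 == 0))
        rightList = sorted(set(v for v in component if v % 2 == 1))
        groups.append([leftList, rightList])
    return groups

# e.g. bipartiteRepeatGroups([[1], [], []]) == [[[0], [3]]]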
Example #6
0
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName):
    
    '''
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
    Algorithm: 
        a) Form double reads and contigs                            V
        b) Mummer the data and extract dataList three times         V
        c) Use the subroutine to output a graph                     V
        d) Output the graph to a file phasing_String_graph.graph    V
    '''

    commonLib.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig")
    commonLib.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads")
    
    
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta" , contigFilename + "_Double.fasta"
    if True:
        commonLib.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)

    lenDicCC = commonLib.obtainLength(folderName, contigFilename + "_Double.fasta")
    dataListCC = commonLib.extractMumData(folderName, header + "Out")
    dataListCC = filterData(dataListCC, lenDicCC)
    
    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta" , readsetFilename + "_Double.fasta"
    if True:
        commonLib.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    
    lenDicRR = commonLib.obtainLength(folderName, readsetFilename + "_Double.fasta")

    dataListRR = commonLib.extractMumData(folderName, header + "Out")
    dataListRR = filterData(dataListRR, lenDicRR)

    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta" , readsetFilename + "_Double.fasta"
    if True:
        commonLib.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = commonLib.extractMumData(folderName, header + "Out")
    dataListCR = filterData(dataListCR, lenDicCR)
            
    numberOfNodes = len(lenDicCR) 
    G = commonLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    
    '''
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    
    '''
    
    # print dataListCC[0]
    # print dataListRR[0]
    # print dataListCR[0]
    
    # for eachitem in dataListCC:
    #    print eachitem
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    # for eachitem in dataListRR[0:10]:
    #    print eachitem , lenDicRR[eachitem[-2]], lenDicRR[eachitem[-1]]
    
    
    addDataToList(dataListRR, G, N1, N1, 'R', 'R')
    
    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # G.reportEdge()
    G.saveToFile(folderName, graphName)
    
    checkGraphLength(G, N1, lenDicRR)
    
    # print len(G.graphNodesList[0].listOfPrevNodes), len(G.graphNodesList[0].listOfNextNodes)
    print "len(G.graphNodesList)", len(G.graphNodesList)
Example #7
0
def outputResults(folderName, mummerLink, toPhaseList, N1, G):
    '''    
    Algorithm :
    a) Write as contigs 
    b) Add back reverse complement 
    c) Create G2 as the readOut part 
    d) Output the contigs by a function call

    '''
    # a) 
    combinedName = "contigAndRead_Double.fasta"
    os.system("cp " + folderName + "improved3_Double.fasta " + folderName + combinedName)
    
    fout = open(folderName + combinedName, 'a')
    fin = open(folderName + "phasingSeedName_Double.fasta", 'r')

    tmp = fin.readline().rstrip()
    while len(tmp) > 0:
        if tmp[0] != ">":
            fout.write(tmp + "\n")
        else:
            # rename ">Read<k>_<x>" to ">Contig<k + N1/2>_<x>" so the reads
            # continue the contig numbering in the combined file
            infoArr = tmp[5:].split("_")
            fout.write(">Contig" + str(int(infoArr[0]) + N1 / 2))
            fout.write("_" + infoArr[1] + "\n")
        tmp = fin.readline().rstrip()
        
    fin.close()
    fout.close()

    # b)
    '''
    [28], [[2, 690, 28], [6, 126, 28], [28, 212, 0], [28, 216, 4]], 1
    
    [2 , 690, 28, 212, 0]
    '''
    
    completePhaseList = []
    for eachitem in toPhaseList:
        repeat = eachitem[-3]
        flanking = eachitem[-2]
        result = eachitem[-1]
        
        
        revrepeat = []
        for eachsub in eachitem[-3][-1::-1]:
            # x + (-1)**x maps a doubled index to its opposite-strand partner
            revrepeat.append(eachsub + pow(-1, eachsub))
            
        revflanking = [[] for i in range(4)] 
        
        for j in range(2):
            for eachsub in eachitem[-2][j + 2][-1::-1]:
                revflanking[j].append(eachsub + pow(-1, eachsub))
            for eachsub in eachitem[-2][j][-1::-1]:
                revflanking[j + 2].append(eachsub + pow(-1, eachsub))
            
        revresult = eachitem[-1]
        
        completePhaseList.append([repeat, flanking, result])
        completePhaseList.append([revrepeat, revflanking, revresult])
    
    print "completePhaseList", completePhaseList
    # c) 
    G2 = commonLib.seqGraph(N1)
    nameDic = {}
    for i in range(N1):
        nameDic[i] = i
        
    for eachitem in completePhaseList:
        repeat, flanking, result = eachitem[0] , eachitem[1] , eachitem[2]
        path = [[], []]
        
        if result == 0:
            path[0] = flanking[0][0:-1] + repeat + flanking[2][1:]
            path[1] = flanking[1][0:-1] + repeat + flanking[3][1:]
        else:
            path[0] = flanking[0][0:-1] + repeat + flanking[3][1:]
            path[1] = flanking[1][0:-1] + repeat + flanking[2][1:]
        
        print path[0] , path[1]
        for i  in range(2):
            eachpath = path[i]
            currentNode = G2.graphNodesList[eachpath[0]]
            
            for nextNodeIndex, ctr in zip(eachpath[1:], range(len(eachpath[1:]))):
                if ctr != len(eachpath[1:]) - 1:
                    myindex = len(G2.graphNodesList)
                    nameDic[myindex] = nextNodeIndex
                    
                    newNode = commonLib.seqGraphNode(myindex)
                    G2.graphNodesList.append(newNode)
                else:
                    newNode = G2.graphNodesList[nextNodeIndex]
                    
                wt = 0
                for eachck in G.graphNodesList[nameDic[currentNode.nodeIndex]].listOfNextNodes:
                    if eachck[0] == nextNodeIndex:
                        wt = eachck[1]
                        break
                    
                newNode.listOfPrevNodes.append([currentNode.nodeIndex, wt])
                currentNode.listOfNextNodes.append([newNode.nodeIndex, wt])
                
                currentNode = newNode
                
    graphFileName = "phaseGraphFinal"
    G2.condense()
    G2.saveToFile(folderName, graphFileName)
    
    commonLib.readContigOut(folderName, mummerLink, graphFileName, combinedName, "improved4.fasta", "outOpenListphaing", nameDic)
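
# A standalone sketch of the reverse-complement index trick used in b) above:
# node indices come in strand pairs (2k, 2k+1), and x + (-1)**x maps each index
# to its opposite-strand partner, so reversing a path and flipping every node
# yields the reverse-complement path.  The helper name is hypothetical.
def reverseComplementPath(path):
    return [eachsub + pow(-1, eachsub) for eachsub in path[::-1]]

# e.g. reverseComplementPath([2, 690, 28]) == [29, 691, 3]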
Example #8
0
def defineRepeatAndFlanking(folderName, mummerLink):
    '''
    Input : 
V        a) String graph : G                
V        b) Repeat Pairing : repeatList     
        
    Output : 
V        a) chain of repeat indices (e.g. [S= R1, R33, R45, R24= E]) 
V        b) chain of flanking region indices for in1/2 out1/2 middle (e.g. [C1, R2, R4] )
V        c) in1/2 out1/2 and middle reads per repeat (e.g. [R1, R33, R45, R24])  
        
    Algorithm : 
V        1. Find repeat by graph operations
V        2. Find flanking region by graph operations
V        3. Find associated reads by graph operations
    '''
    
    print "defineRepeatAndFlanking: "
    
    # 0. Load previous data
    G = commonLib.seqGraph(0)
    G.loadFromFile(folderName, "phaseStringGraph1")
    Grev = formReverseGraph(G)
    
    json_data = open(folderName + 'phaseRepeat.txt', 'r')
    repeatList = json.load(json_data)
    
    lenDicCC = commonLib.obtainLength(folderName, "improved3_Double.fasta")
    N1 = len(lenDicCC)
    
    
    print "repeatList: ", repeatList
    print "len(G.graphNodesList)", len(G.graphNodesList)
    
    bigDumpList = []
    
    print "len(repeatList)",len(repeatList) , repeatList
    for r in repeatList:
        rIn, rOut = [], []
        for eachitem in r[0]:
            rIn.append(eachitem / 2)
        for eachitem in r[1]:
            rOut.append(eachitem / 2)
        
        if len(rIn) == 2 and len(rOut) == 2:
            print rIn, rOut
        
            # 1. Records reachable indices
            kkIn , kkOut = [],[]
            '''
            for eachnext in G.graphNodesList[4].listOfNextNodes:
                print 4, eachnext
                kkIn.append(eachnext[0])

            for eachprev in G.graphNodesList[6].listOfPrevNodes:
                print 6, eachprev
                kkOut.append(eachprev[0])
                
            print set(kkIn).intersection(set(kkOut))
           
            print  len( G.graphNodesList[0].listOfNextNodes), len( G.graphNodesList[2].listOfNextNodes)
            print  len( G.graphNodesList[1].listOfPrevNodes), len( G.graphNodesList[3].listOfPrevNodes)
            
            print  len( Grev.graphNodesList[0].listOfPrevNodes), len( Grev.graphNodesList[2].listOfPrevNodes)
            print  len( Grev.graphNodesList[1].listOfNextNodes), len( Grev.graphNodesList[3].listOfNextNodes)
           ''' 
            markReachableIndices(G, Grev, rIn, rOut, N1)
            
            # 2. Marks inside nodes
            singleMissList, allPassList = markInsideNodes(G, rIn, rOut)
            for i in range(4): 
                print "len(singleMissList[i]), len(allPassList)", len(singleMissList[i]), len(allPassList)

            # 3. Finds start/end of repeat
            myStartIndex, myEndIndex = markStartEndNodes(G, rIn, rOut, singleMissList, allPassList)
            print myStartIndex, myEndIndex
            
            # 4. Find repeat interior by shortest path joining S/E
            repeatPathway = markInterior(G , myStartIndex, myEndIndex, N1)
            print "repeatPathway", repeatPathway
            checkPathLength(repeatPathway,  G, N1, folderName)
            
            # 5. Find flanking region by shortest path search again
            flankingPathsList = markFlankingRegion(G, rIn, rOut, myStartIndex, myEndIndex, N1)
            print flankingPathsList
            
            # 6. Find associated reads by graph node query
            # Note: this rebinds repeatList locally; the outer "for r in repeatList"
            # loop keeps iterating the original list object loaded from JSON.
            flankingList, repeatList = markAssociatedReads(G, singleMissList, allPassList)
            
            ### Experimental: override repeatList with allPassList
            repeatList = allPassList
            ### End Experimental
            for eachlist in flankingList:
                print len(eachlist), len(repeatList)
            
            bigDumpList.append([flankingList, repeatList, repeatPathway, flankingPathsList])
            
            
    # 7. Format return and move on to the phasing 
    with open(folderName + 'repeatSpecification.txt', 'w') as outfile:
        json.dump(bigDumpList, outfile)
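
# resolvingTandem below rebuilds sequence names from doubled node indices in
# several places ("Contig<k>_p"/"_d", "Read<k>_p"/"_d"); this sketch captures
# that convention in one helper.  It is an assumption about what the unshown
# parseIDToName helper does, not its actual implementation.
def doubledIndexToName(index, prefix="Contig"):
    name = prefix + str(index // 2)
    return name + ("_p" if index % 2 == 0 else "_d")

# e.g. doubledIndexToName(5) == "Contig2_d"
#      doubledIndexToName(4, prefix="Read") == "Read2_p"
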
def resolvingTandem(folderName, mummerPath, contigReadGraph, contigFilename, readsetFilename, optTypeFileHeader, repeatSpec):
    print "resolvingTandem"
    '''
    Input : repeat info 
    Output : count, join. 
    
    Algorithm: 
    1. Find loops
    2. Form repeat
    3. Form chain of repeat copies back to back
    4. Align reads
    5. Calculate extra bases beyond flanking region
    6. Calculate count
    7. Join the contigs
    '''
    # 0 ) Load all the data
    G = commonLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = commonLib.obtainLength(folderName, contigFilename+"_Double.fasta")
    N1 = len(lenDicCC)

    maxDuplicate = 10
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"
    


    myContigsDic = commonLib.loadContigsFromFile(folderName, readsetFilename+"_Double.fasta")    
    lenDicRR = commonLib.obtainLength(folderName, readsetFilename + "_Double.fasta")
    
    header = optTypeFileHeader + "RR"
    dataListRR = commonLib.extractMumData(folderName, header + "Out")
    dataListRR = newPhasing.filterData(dataListRR, lenDicRR)
    dataListRRDic = {}
    for eachitem in dataListRR: 
        if eachitem[1] > eachitem[3]:
            dataListRRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    header = optTypeFileHeader + "CR"
    lenDicCC = commonLib.obtainLength(folderName, contigFilename + "_Double.fasta")
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    
    dataListCR = commonLib.extractMumData(folderName, header + "Out")
    dataListCR = newPhasing.filterData(dataListCR, lenDicCR)
    dataListCRDic = {}
    for eachitem in dataListCR: 
        if eachitem[1] > eachitem[3]:
            dataListCRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    print dataListCRDic



    json_data = open(folderName + repeatSpec, 'r')
    loadData = json.load(json_data)
    
    contigsTmp = commonLib.loadContigsFromFile(folderName, contigFilename+"_Double.fasta")
    readTmp = commonLib.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")

    happyTandemList = {}
    
    
    
    for eachrepProfile in loadData:
        # 1) Find a loop by DFS from the starting contig of this repeat profile
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)
       
        # 2) Extract the cycle: take the suffix of returnPathList starting at
        #    the first occurrence of its last node v
        if isTerminate:
            v = returnPathList[-1]
            i = 0
            tandemPath = []
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    i = len(returnPathList)
                i = i + 1
            
            print returnPathList
            print tandemPath
        # 3) Build the repeat sequence by walking the tandem path
        #    (assumes isTerminate above; TODO: rework to use the graph directly,
        #     known issue around the min computation)
        repeatContent = ""
    
        for kk in range(len(tandemPath[0:-1])):
            eachitem = tandemPath[kk] - N1
            nextitem = tandemPath[kk + 1] - N1
            readName = "Read" + str(eachitem / 2) + "_"
            nextReadName = "Read" + str(nextitem / 2) + "_"
            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"
            
            if nextitem % 2 == 0:
                nextReadName = nextReadName + "p"
            elif nextitem % 2 == 1:
                nextReadName = nextReadName + "d"
            
            overlap = dataListRRDic[readName + ";" + nextReadName]
            print overlap
            repeatContent = repeatContent + myContigsDic[readName][0:-overlap]
            
        print "len(repeatContent)", len(repeatContent)
        
        fout = open(folderName + repeatTempFilename, 'w')
        fout.write(">RepeatSegment\n")
        repeatContentLarge = ""
        
        for i in range(maxDuplicate):
            fout.write(repeatContent)
            repeatContentLarge = repeatContentLarge + repeatContent
        fout.close()
        
        # 4)
        repeatReadList = eachrepProfile[1]
        
        myList = []
        for eachitem in repeatReadList:
            readName = "Read" + str((eachitem - N1) / 2) + "_"
            if eachitem % 2 == 0:
                readName = readName + "p"
            elif eachitem % 2 == 1:
                readName = readName + "d"
            myList.append(readName)
            
        commonLib.putListToFileO(folderName, readsetFilename+"_Double.fasta", "toAlignReads", myList)
        
        if True:
            commonLib.useMummerAlign(mummerPath, folderName,mummerFile , repeatTempFilename, "toAlignReads.fasta")
        
        dataList = commonLib.extractMumData(folderName, mummerFile+"Out")
        
        
        # 5)
        totalBasesMatch = 0
        lrepeat = len(repeatContent)
        c = 50 # Important parameters : FIX needed in production
        
        #lengthDic = commonLib.obtainLength(folderName, readsetFilename+"_Double.fasta")
        
        print "dataList[0]", dataList[0]
        dataList.sort(key = itemgetter(-1))
        for key, values in  groupby(dataList,itemgetter(-1)):
            maxValue = -1
            for eachsub in values:
                if eachsub[5] > maxValue:
                    maxValue = eachsub[5]
    
            #print key, maxValue
            totalBasesMatch = totalBasesMatch + maxValue
        
    
        print c, lrepeat, totalBasesMatch
        ct = totalBasesMatch*1.0/(c*lrepeat)
        print "BIG NUMBER of THE DAY: ", ct
    
        # 6) 
        # a) find the starting point
        startContig = eachrepProfile[-1][0][0]
        firstRead = eachrepProfile[-1][0][1] - N1

        contigName = "Contig" + str(startContig / 2)
        if startContig % 2 == 0:
            contigName = contigName + "_p"
        elif startContig % 2 == 1:
            contigName = contigName + "_d"
        
        readName = "Read" + str(firstRead / 2)
        if firstRead % 2 == 0:
            readName = readName + "_p"
        elif firstRead % 2 == 1:
            readName = readName + "_d"
        
        overlapFirst = dataListCRDic[contigName+";"+readName]
        tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]
        
        f1 = open(folderName + "firstOverlap.fasta", 'w')
        f1.write(">combined\n")
        f1.write(tmpCombine)
        f1.close()
        
        if True:
            commonLib.useMummerAlign(mummerPath, folderName,"myFirstOverlap" , repeatTempFilename, "firstOverlap.fasta")
        
        dataList = commonLib.extractMumData(folderName, "myFirstOverlap"+"Out")
        
        dataList.sort(key = itemgetter(0))
        maxVal = -1
        maxItm = []
        for eachi in dataList:
            if eachi[5] > maxVal:
                maxVal = eachi[5]
                maxItm = eachi
        
        print maxItm
        repeatStart = maxItm[0]
        contigEnd = maxItm[2]
        # b) format return : prepare the repeat template 
        print "ct*lrepeat", int(repeatStart + ct*lrepeat)
        print "repeatStart", repeatStart
        happyTandemList[contigName]= repeatContentLarge[repeatStart:int(repeatStart + ct*lrepeat)]
        contigsTmp[contigName] = tmpCombine[0:contigEnd]
        print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
        print "len(happyTandemList[contigName])", len(happyTandemList[contigName])
        
    # 7) Combine all the repeat information and do the join
    
    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]
        
        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)
        
        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]
        
        if endContig != leaderContig:
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig
        
    
    leaderAgg = [[] for i in range(len(leaderList))]
    for i in range(len(leaderList)):
        leaderAgg[leaderList[i]].append(i) 
    
    checkingList = [False for i in range(N1)]
    
    fout = open(folderName + "tademResolved.fasta", 'w')
    
    counter = 0
    for eachcontig in contigsTmp:
        id = newPhasing.parseEdgeNameToID(eachcontig, 'C')
        if checkingList[id / 2] == False:
            fout.write(">Segkk" + str(counter) + "\n")
            fout.write(contigsTmp[eachcontig])
            counter = counter + 1
            # mark every contig in this leader group as already written out
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk / 2] = True
    
    fout.close()
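
# A minimal sketch of the count estimate in steps 5)-6) of resolvingTandem:
# each read contributes its best alignment length against the repeat template,
# and the summed matched bases are normalised by c * lrepeat to give ct, the
# number the code above multiplies by lrepeat to size the tandem insert.
# c = 50 mirrors the hard-coded constant above; reading it as a coverage-like
# normaliser is an assumption, and the helper name is hypothetical.
from itertools import groupby
from operator import itemgetter

def estimateTandemCount(dataList, lrepeat, c=50):
    dataList = sorted(dataList, key=itemgetter(-1))  # group alignments by read name
    totalBasesMatch = 0
    for key, values in groupby(dataList, itemgetter(-1)):
        totalBasesMatch += max(eachsub[5] for eachsub in values)  # best hit per read
    return totalBasesMatch * 1.0 / (c * lrepeat)

# e.g. two reads, each matching 5000 bases of a 1000-base repeat template, c = 50:
#      estimateTandemCount(
#          [[1, 5000, 1, 5000, 5000, 5000, 99.0, 'RepeatSegment', 'Read0_p'],
#           [1, 5000, 1, 5000, 5000, 5000, 99.0, 'RepeatSegment', 'Read1_p']],
#          1000) == 0.2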