예제 #1
def getAllAssociatedReads(folderName, mummerLink):
    # Builds the list of read names associated with the seed reads and the
    # shell commands that would pull those reads out of raw_reads.fasta.
    #
    # folderName -- prefix prepended to every file name by plain string
    #               concatenation (presumably ends with a path separator;
    #               TODO confirm against callers).
    # mummerLink -- prefix prepended to the nucmer / show-coords executables.
    #
    # NOTE(review): this file uses Python 2 syntax (print statements).
    # NOTE(review): the next seven lines read like docstring text whose
    # triple-quote delimiters were lost; as written they are not valid Python.
    Input : relatedReads.fasta, raw_reads.fasta 
    Output : all_associated_reads.fasta
     Algorithm : 
        a) Get all the associated reads
        b) Loop for N=1 times : ==> this correspond 4 reads to link between the bridge in total
            i) Align the raws and tmp_seedReads
            ii) Put the new reads into the SeedReads
    forFastaName = "phasingSeedName"
    header, referenceFile, queryFile = "seedReads", forFastaName + ".fasta" , "raw_reads.fasta"
    # Shell command copying relatedReads.fasta to the initial seed file.
    # NOTE(review): no statement in view executes this command string.
    command = "cp " + folderName + "relatedReads.fasta " + folderName + referenceFile
    N = 1
    for trial in range(N):
        print "trial", trial
        # The alignment commands sit behind a constant-False guard, so they are
        # never built at run time; the second assignment would also overwrite
        # the first without either being executed in view.
        if False:
            command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + header + " " + folderName + referenceFile + " " + folderName + queryFile
            command = mummerLink + "show-coords -r " + folderName + header + ".delta > " + folderName + header + "Out"
        # Alignment records come from the "seedReadsOut" name; presumably a
        # previously generated show-coords output -- TODO confirm.
        dataList = commonLib.extractMumData(folderName, header + "Out")
        filterList = []
        lenDicRR = commonLib.obtainLength(folderName, queryFile)
        print "len(dataList)", len(dataList)
        for eachitem in dataList:
            # NOTE(review): this `if` has no body in view; a statement that
            # fills filterList appears to be missing.
            if checkSatisfy(eachitem, lenDicRR):
        newReads = []
        # Groups records by their last field.
        # NOTE(review): the loop has no body in view, so newReads stays empty;
        # groupby only merges consecutive equal keys and no sort is visible.
        for key, items in groupby(filterList, itemgetter(-1)):
        # Write one entry of newReads per line to phasingSeedName.txt.
        # NOTE(review): the file handle is never closed in view.
        f = open(folderName + forFastaName + ".txt", 'w')
        for eachitem in newReads:
            f.write(eachitem + "\n")
        # Perl one-liner: loads the lines of the .txt file as a lookup table,
        # then prints only the FASTA records of raw_reads.fasta whose header ID
        # is in that table, redirecting the result to phasingSeedName.fasta.
        # NOTE(review): no statement in view executes this command string.
        command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + forFastaName + ".txt " + folderName + "raw_reads.fasta > " + folderName + forFastaName + ".fasta"
예제 #2
def formReadContigStringGraph(folderName, mummerLink, contigFilename, readsetFilename, optTypeFileHeader, graphName):
    # Aligns contigs against contigs (CC), reads against reads (RR) and contigs
    # against reads (CR), feeds the filtered alignment records into one
    # commonLib.seqGraph and saves that graph under graphName.
    #
    # folderName        -- prefix passed to every commonLib file helper.
    # mummerLink        -- location handed to commonLib.useMummerAlign.
    # contigFilename    -- contig FASTA base name (".fasta" is appended here).
    # readsetFilename   -- read FASTA base name (".fasta" is appended here).
    # optTypeFileHeader -- prefix of the alignment output names; "CC", "RR"
    #                      and "CR" are appended to it.
    # graphName         -- name passed to G.saveToFile.
    #
    # NOTE(review): this file uses Python 2 syntax (print statements, list
    # concatenation of dict.items()).
    # NOTE(review): the next six lines read like docstring text whose
    # triple-quote delimiters were lost; as written they are not valid Python.
    Input : all_associated_reads.fasta, improved3.fasta
    Output : (G) String Graph linking the reads and contigs
        a) Form double reads and contigs                            V
        b) Mummer the data and extract dataList three times         V
        c) Use the subroutine to output a graph                     V
        d) Output the graph to a file phasing_String_graph.graph    V

    # Placeholder only; G is rebound to a seqGraph further down.
    G = []

    # Write the "_Double" variants of the contig and read files.
    commonLib.writeToFile_Double1(folderName, contigFilename + ".fasta", contigFilename + "_Double.fasta", "contig")
    commonLib.writeToFile_Double1(folderName, readsetFilename + ".fasta", readsetFilename + "_Double.fasta", "reads")
    # Pass 1: contig vs contig.
    header, referenceFile, queryFile = optTypeFileHeader + "CC", contigFilename + "_Double.fasta" , contigFilename + "_Double.fasta"
    # Constant-True guard: presumably a manual switch for skipping realignment.
    if True:
        commonLib.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)

    lenDicCC = commonLib.obtainLength(folderName, contigFilename + "_Double.fasta")
    dataListCC = commonLib.extractMumData(folderName, header + "Out")
    dataListCC = filterData(dataListCC, lenDicCC)
    # Pass 2: read vs read.
    header, referenceFile, queryFile = optTypeFileHeader + "RR", readsetFilename + "_Double.fasta" , readsetFilename + "_Double.fasta"
    if True:
        commonLib.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    lenDicRR = commonLib.obtainLength(folderName, readsetFilename + "_Double.fasta")

    dataListRR = commonLib.extractMumData(folderName, header + "Out")
    dataListRR = filterData(dataListRR, lenDicRR)

    # Pass 3: contig vs read.
    header, referenceFile, queryFile = optTypeFileHeader + "CR", contigFilename + "_Double.fasta" , readsetFilename + "_Double.fasta"
    if True:
        commonLib.useMummerAlign(mummerLink, folderName, header, referenceFile, queryFile)
    # Merge both length tables; on a duplicate name the read entry wins because
    # it comes last. `items() + items()` only works on Python 2 lists.
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = commonLib.extractMumData(folderName, header + "Out")
    dataListCR = filterData(dataListCR, lenDicCR)
    # One graph node per distinct sequence name in the merged table.
    numberOfNodes = len(lenDicCR) 
    G = commonLib.seqGraph(numberOfNodes)
    N1, N2 = len(lenDicCC), len(lenDicRR)
    print "N1, N2, numberOfNodes: ", N1, N2, numberOfNodes
    # NOTE(review): the next four lines are sample records left as bare text
    # (probably a stripped docstring); as written they are not valid Python.
    e.g. of dataListCC[0], dataListRR[0], dataListCR[0]
    [1, 520, 2913194, 2913716, 520, 523, 99.05, 'Contig0_d', 'Contig2_d']
    [1, 1383, 1253, 2603, 1383, 1351, 82.39, 'Read0_d', 'Read1705_p']
    [1, 718, 4334, 5074, 718, 741, 91.91, 'Contig0_d', 'Read1018_d']
    # print dataListCC[0]
    # print dataListRR[0]
    # print dataListCR[0]
    # for eachitem in dataListCC:
    #    print eachitem
    # The two integers are passed alongside the 'C'/'R' type tags; presumably
    # they are node-index offsets (contigs from 0, reads from N1) -- TODO
    # confirm against addDataToList.
    addDataToList(dataListCC, G, 0, 0, 'C', 'C')
    # for eachitem in dataListRR[0:10]:
    #    print eachitem , lenDicRR[eachitem[-2]], lenDicRR[eachitem[-1]]
    addDataToList(dataListRR, G, N1, N1, 'R', 'R')
    addDataToList(dataListCR, G, 0, N1, 'C', 'R')
    # G.reportEdge()
    G.saveToFile(folderName, graphName)
    # Called after the graph has been saved; its result is not used here.
    checkGraphLength(G, N1, lenDicRR)
    # print len(G.graphNodesList[0].listOfPrevNodes), len(G.graphNodesList[0].listOfNextNodes)
    print "len(G.graphNodesList)", len(G.graphNodesList)
예제 #3
def defineRegionOfInterest(folderName , mummerLink):
    # Reads the contig self-alignment records ("phasingOut"), clusters contig
    # ends that align on the same side, pairs clusters through opposite-side
    # matches, derives repeat segments and hands three lists (toPhase,
    # toRemove, toBR) to connectContigs.
    #
    # folderName -- prefix passed to the commonLib file helpers.
    # mummerLink -- only forwarded to connectContigs in the visible code.
    #
    # NOTE(review): this file uses Python 2 syntax (print statements, integer
    # `/` division).
    # NOTE(review): several `if` / `for` / `elif` headers below have no body
    # and some `else:` branches appear to be missing, so the function is not
    # valid Python as shown; each spot is marked individually.
    # Form inInfo and outInfo for [ name and endUsed ]
    # Define [terminating loc] for inInfo and outInfo
    print "defineRegionOfInterest"
    # commonLib.writeToFile_Double1(folderName, "improved3.fasta", "improved3_Double.fasta", "contig")
    # commonLib.useMummerAlign(mummerLink, folderName, "phasing", "improved3_Double.fasta", "improved3_Double.fasta")
    # The alignment itself is commented out above, so "phasingOut" must already
    # exist -- presumably produced by an earlier step; TODO confirm.
    dataList = commonLib.extractMumData(folderName, "phasing" + "Out")
    lenDic = commonLib.obtainLength(folderName, "improved3_Double.fasta")
    print "Record length of contigs"
    for eachitem in lenDic:
        print lenDic[eachitem], eachitem
    print "\nPerform alignment and group associated contigs"
    # Convention : 0_p_L, 0_p_R, 0_d_L, 0_d_R 
    # Two cluster slots per contig entry (one per end).
    N = len(lenDic) * 2
    clusterList = []
    for i in range(N):
        # NOTE(review): nothing in view appends to clusterList, so
        # clusterList[i] would raise IndexError; an `else:` also seems to be
        # missing, because the second assignment overwrites the first.
        if i % 2 == 0:
            clusterList[i].terminatingLoc = 0
            clusterList[i].terminatingLoc = lenDic[parseIDToName(i)[0:-2]]
    oppoPairList = []
    for eachitem in dataList:
        # checkSameSideRequirement returns a (terminatingLoc, resultOfCk) pair.
        terminatingLoc, resultOfCk = checkSameSideRequirement(eachitem, lenDic)
        isOppMatch, pair = checkOppositeSideRequirement(eachitem, lenDic)
        if isOppMatch:
            # Record the link from the 'R' end of pair[0] to the 'L' end of
            # pair[1], together with pair[2] and pair[3].
            index1 = parseContigName(pair[0], 'R')
            index2 = parseContigName(pair[1], 'L')
            oppoPairList.append([index1, index2, pair[2], pair[3]])
        if resultOfCk == 'L' or resultOfCk == 'R':
            # Same-side match: merge the two ends into one cluster.
            index1 = parseContigName(eachitem[-2], resultOfCk)
            index2 = parseContigName(eachitem[-1], resultOfCk)
            union(clusterList[index1], clusterList[index2])
            if resultOfCk == 'L':
                # 'L' ends keep the largest terminating location seen so far.
                if clusterList[index1].terminatingLoc < terminatingLoc[0]:
                    clusterList[index1].terminatingLoc = terminatingLoc[0]
                if clusterList[index2].terminatingLoc < terminatingLoc[1]:
                    clusterList[index2].terminatingLoc = terminatingLoc[1]
            elif resultOfCk == 'R':
                # 'R' ends keep the smallest terminating location seen so far.
                if clusterList[index1].terminatingLoc > terminatingLoc[0]:
                    clusterList[index1].terminatingLoc = terminatingLoc[0]
                if clusterList[index2].terminatingLoc > terminatingLoc[1]:
                    clusterList[index2].terminatingLoc = terminatingLoc[1]

    headList = []
    for eachitem in clusterList:
        # NOTE(review): this `if` has no body in view; a statement collecting
        # cluster representatives into headList appears to be missing.
        if find(eachitem) == eachitem:
    for eachitem in headList:
        for eachsub in familyList(eachitem):
            print parseIDToName(eachsub.id), eachsub.terminatingLoc,
    nFamily = len(headList)
    # Define the match of inInfo vs outInfo [matchList]
    # NOTE(review): this loop has only a comment as its body in view;
    # groupby only merges consecutive equal keys and no sort is visible.
    for key, items in groupby(oppoPairList, itemgetter(0, 1)):
        # print parseIDToName(key[0]), parseIDToName(key[1])
    matchList = []
    for eachitem in headList:
        # Only odd ids are considered; presumably these are 'R' ends under the
        # convention comment above -- TODO confirm.
        if eachitem.id % 2 == 1:
            successorIndex = eachitem.findSuccessor()
            if successorIndex != -1:
                matchList.append([eachitem.id, successorIndex])
    repeatList = []
    for eachitem in  matchList:
        if eachitem[0] != -1 and eachitem[1] != -1:
            # Expand both matched clusters into [id, terminatingLoc] lists.
            inList = []
            for eachsubitem in familyList(find(clusterList[eachitem[0]])):
                inList.append([eachsubitem.id, eachsubitem.terminatingLoc])
            outList = []
            for eachsubitem in familyList(find(clusterList[eachitem[1]])):
                outList.append([eachsubitem.id, eachsubitem.terminatingLoc])
            repeatList.append([inList, outList])
    # Filter the embedded contigs
    globalRemoveList = []
    for eachitem in repeatList:
        inList = eachitem[0]
        outList = eachitem[1]
        toRemoveList = []
        for aitem in inList:
            for bitem in outList:
                # Equal id / 2 (Python 2 integer division) on the in side and
                # the out side marks the pair for removal.
                if aitem[0] / 2 == bitem[0] / 2 :
                    toRemoveList.append([aitem, bitem])
                    globalRemoveList.append(aitem[0] / 2)
        for eachsub in toRemoveList:
            # NOTE(review): both `if` headers below have no body in view.
            if eachsub[0] in inList:
            if eachsub[1] in outList:
    print "\nRepeats and in/out contigs"
    for i in range(len(repeatList)):
        print "(repeatList[i][0]),(repeatList[i][1]): ", (repeatList[i][0]), (repeatList[i][1]) 
    print "globalRemoveList: ", globalRemoveList
    print "oppoPairList", oppoPairList
    # Define the repeat 
    contigDic = commonLib.loadContigsFromFile(folderName, "improved3_Double.fasta") 
    newRepeatList = []
    newBRList = []
    print "\nRepeat interior and defining flanking region"
    for eachrepeat in repeatList:
        # ## Get the initial trial
        inReadList = []
        outReadList = []
        # NOTE(review): both loops below have no body in view, so inReadList
        # and outReadList stay empty and the membership tests that follow can
        # never succeed.
        for eachitem in eachrepeat[0]:
        for eachitem in eachrepeat[1]:
        tmpLink = []
        # Keeps the last opposite-side pair whose ends fall in the in/out
        # lists; earlier matches are overwritten.
        for eachoppoPair in oppoPairList:
            if eachoppoPair[0] in inReadList and eachoppoPair[1] in outReadList:
                tmpLink = eachoppoPair
        if len(tmpLink) > 0:
            f1Read, f2Read = tmpLink[0], tmpLink[1]
            # f1 / f2 start at -1 and are replaced by the terminating
            # locations of the matching in/out entries when found.
            f1 , a1, f2, a2 = -1, tmpLink[2], -1 , tmpLink[3]
            for eachitem in eachrepeat[0]:
                if eachitem[0] == f1Read:
                    f1 = eachitem[1]
            for eachitem in eachrepeat[1]:
                if eachitem[0] == f2Read:
                    f2 = eachitem[1]
            print "f1Read, f2Read, f1, a1, f2, a2:\t ", f1Read, f2Read, f1, a1, f2, a2
            f1tilde , f2tilde = f1, f2
            # ## Refine it
            for myrecord in dataList:
                myid = parseContigName(myrecord[-2], 'R')       
                otherid = parseContigName(myrecord[-1], 'R')
                if myid == f1Read and otherid != myid and otherid in inReadList:
                    # NOTE(review): earlier in this function the helper's result
                    # is unpacked as a 2-tuple; a non-empty tuple is always
                    # truthy, so this test cannot fail -- confirm the intent.
                    if checkSameSideRequirement(myrecord, lenDic):
                        myStart = myrecord[0] 
                        if myStart > f1tilde:
                            f1tilde = myStart
            for myrecord in dataList:
                myid = parseContigName(myrecord[-2], 'L')       
                otherid = parseContigName(myrecord[-1], 'L')
                if myid == f2Read and otherid != myid and otherid in outReadList:
                    # NOTE(review): same always-true tuple test as above.
                    if checkSameSideRequirement(myrecord, lenDic):
                        myEnd = myrecord[1] 
                        if myEnd < f2tilde:
                            f2tilde = myEnd
            # ## Output the loc indices and read from the real contig to get the repeat out
            print "f1Read, f2Read, f1tilde, a1, f2tilde, a2:  \t", f1Read, f2Read, f1tilde, a1, f2tilde, a2
            # Drop the last two characters of the name to get the lenDic /
            # contigDic key.
            f1Read_parsed = parseIDToName(f1Read)[0:-2]
            f2Read_parsed = parseIDToName(f2Read)[0:-2]
            print "f1Read_parsed, f2Read_parsed", f1Read_parsed, f2Read_parsed, lenDic[f1Read_parsed], lenDic[f2Read_parsed]
            # NOTE(review): an `else:` seems to be missing; the second
            # assignment overwrites the first, and repeatSegment is undefined
            # when the condition is false.
            if a2 < f2tilde:
                repeatSegment = contigDic[f1Read_parsed][f1tilde:] + contigDic[f2Read_parsed][a2:f2tilde]
                repeatSegment = contigDic[f1Read_parsed][f1tilde:f2tilde - a2]
            print "len(repeatSegment)", len(repeatSegment)
            # ## Put to repeat, remove from repeat, add toBR 
            # NOTE(review): the branch structure from here to the end of the
            # loop looks damaged (empty `if` / `elif` bodies and a probable
            # missing `else:`); read the indentation below with care.
            if a2 >= f2tilde or a1 <= f1tilde:
                newBRList.append([f1Read, f2Read, a1, a2])
                tmpSeg = [[], [], repeatSegment]
                for eachin in eachrepeat[0]:
                    # NOTE(review): no body in view; tmpSeg[0] is never filled.
                    if eachin[0] != f1Read:
                for eachout in eachrepeat[1]:
                    # NOTE(review): no body in view; tmpSeg[1] is never filled.
                    if eachout[0] != f2Read:
                if len(tmpSeg[0]) == 1 and len(tmpSeg[1]) == 1:
                    # TODO
                    inIndex = tmpSeg[0][0][0]
                    outIndex = tmpSeg[1][0][0]
                    found = False
                    # if repeat exists , then fill in the blanks
                    # otherwise, fill 0, 0. 
                    a1New, a2New = -1, 0
                    for  secondrecord in dataList:
                        isOppMatch, pair = checkOppositeSideRequirement(secondrecord, lenDic)
                        if isOppMatch:
                            index1 = parseContigName(pair[0], 'R')
                            index2 = parseContigName(pair[1], 'L')
                            # NOTE(review): found, a1New and a2New are never
                            # updated in view.
                            if index1 == inIndex and index2 == outIndex:
                                oppoPairList.append([index1, index2, pair[2], pair[3]])
                    # NOTE(review): both appends sit under `if not found`; an
                    # `else:` before the second one seems to be missing.
                    if not found:
                        newBRList.append([inIndex, outIndex, -1, 0])
                        newBRList.append([inIndex, outIndex, a1New, a2New])
                # NOTE(review): this `elif` has no body in view.
                elif len(tmpSeg[0]) >= 1 and len(tmpSeg[1]) >= 1:
                if len(eachrepeat[0]) == 1 and len(eachrepeat[1]) == 1:
                    newBRList.append([f1Read, f2Read, a1, a2])
                elif len(eachrepeat[0]) >= 1 and len(eachrepeat[1]) >= 1:
                    newRepeatList.append([ repeatSegment, eachrepeat[0], eachrepeat[1]])
    # Format output 
    # Rmk: if only 1 copy is left, addToBR; if 0 left, remove that repeat    
    toPhase = newRepeatList
    toRemove = globalRemoveList
    toBR = newBRList
    print "Items to be returned to next step:"
    print "toPhase", len(toPhase)
    print "toRemove", len(toRemove)
    print "toBR", len(toBR) , toBR
    # Nothing is returned; the three lists are passed straight to the next step.
    connectContigs(toPhase, toRemove, toBR, folderName, mummerLink)
예제 #4
def resolvingTandem(folderName, mummerPath, contigReadGraph,contigFilename, readsetFilename, optTypeFileHeader, repeatSpec):
    # For every repeat profile loaded from the repeatSpec JSON file: walks the
    # contig/read graph to find a loop, concatenates the reads on that loop
    # into a repeat template, aligns reads against the template to estimate a
    # copy count (ct), and finally joins contigs with the estimated repeat
    # content, writing segments to "tademResolved.fasta".
    #
    # folderName        -- prefix passed to every file helper / open() call.
    # mummerPath        -- location handed to commonLib.useMummerAlign.
    # contigReadGraph   -- name passed to G.loadFromFile.
    # contigFilename    -- contig base name ("_Double.fasta" is appended).
    # readsetFilename   -- read base name ("_Double.fasta" is appended).
    # optTypeFileHeader -- prefix of the "RR" / "CR" alignment outputs.
    # repeatSpec        -- JSON file name, relative to folderName.
    #
    # NOTE(review): this file uses Python 2 syntax (print statements, integer
    # `/` division, list concatenation of dict.items()).
    # NOTE(review): the function may continue past the last visible line.
    print "resolvingTandem"
    # NOTE(review): the next nine lines read like docstring text whose
    # triple-quote delimiters were lost; as written they are not valid Python.
    Input : repeat info 
    Output : count, join. 
    1. Find loops
    2. Form repeat
    3. Form chain of repeat copies back to back
    4. Align reads
    5. Calculate extra bases beyond flanking region
    6. Calculate count
    7. Join the contigs
    # 0 ) Load all the data
    G = commonLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = commonLib.obtainLength(folderName, contigFilename+"_Double.fasta")
    # N1 is subtracted from graph node ids below to obtain read indices.
    N1 = len(lenDicCC)

    # Number of back-to-back copies of the repeat placed in the template.
    maxDuplicate = 10
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"

    myContigsDic = commonLib.loadContigsFromFile(folderName, readsetFilename+"_Double.fasta")    
    lenDicRR = commonLib.obtainLength(folderName, readsetFilename + "_Double.fasta")
    header = optTypeFileHeader + "RR"
    dataListRR = commonLib.extractMumData(folderName, header + "Out")
    dataListRR = newPhasing.filterData(dataListRR, lenDicRR)
    # Lookup "nameA;nameB" -> field 4 of the alignment record, kept only for
    # records whose field 1 exceeds field 3. Field 4 is used as an overlap
    # length further down.
    dataListRRDic = {}
    for eachitem in dataListRR: 
        if eachitem[1] > eachitem[3]:
            dataListRRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    header = optTypeFileHeader + "CR"
    # Recomputes the same contig length table that was loaded above.
    lenDicCC = commonLib.obtainLength(folderName, contigFilename + "_Double.fasta")
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = commonLib.extractMumData(folderName, header + "Out")
    dataListCR = newPhasing.filterData(dataListCR, lenDicCR)
    # Same lookup as dataListRRDic, built from the contig-vs-read records.
    dataListCRDic = {}
    for eachitem in dataListCR: 
        if eachitem[1] > eachitem[3]:
            dataListCRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    print dataListCRDic

    # NOTE(review): the file handle is never closed in view.
    json_data = open(folderName + repeatSpec, 'r')
    loadData = json.load(json_data)
    contigsTmp = commonLib.loadContigsFromFile(folderName, contigFilename+"_Double.fasta")
    # Second load of the read file already held in myContigsDic.
    readTmp = commonLib.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")

    happyTandemList = {}
    for eachrepProfile in loadData:
        # 1) 
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)
        # 2) 
        if isTerminate:
            # The loop is the suffix of the path starting at the first
            # occurrence of its last node.
            v = returnPathList[-1]
            i =0 
            tandemPath = []
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    i = len(returnPathList)
                i = i +1
            print returnPathList
            print tandemPath
        # 3) [fix it when have time later ; to just use graph; bug at the min thing]
        # NOTE(review): tandemPath is assigned only inside the guard above, yet
        # it is used here unconditionally; when isTerminate is false this is a
        # NameError on the first profile or a stale path from the previous one.
        repeatContent = ""
        for kk in range(len(tandemPath[0:-1])): 
            # Graph node id -> read index, then rebuild the "Read<k>_p|d" name
            # (even index -> "p", odd index -> "d").
            eachitem = tandemPath[kk]- N1
            nextitem = tandemPath[kk+1] - N1
            readName = "Read" + str(eachitem/2) + "_"
            nextReadName = "Read" + str(nextitem/2) + "_"
            if eachitem %2 ==0 :
                readName = readName + "p"
            elif eachitem %2 ==1:
                readName = readName + "d"
            if nextitem %2 ==0 :
                nextReadName = nextReadName + "p"
            elif nextitem %2 ==1:
                nextReadName = nextReadName + "d"
            # Raises KeyError when the pair has no entry in dataListRRDic.
            overlap = dataListRRDic[readName + ";" + nextReadName]
            print overlap
            # Append the read minus its overlap with the next read.
            # NOTE(review): an overlap of 0 makes the slice [0:-0] empty, which
            # would drop the whole read -- confirm 0 cannot occur.
            repeatContent = repeatContent +  myContigsDic[readName][0:-overlap]
        print "len(repeatContent)", len(repeatContent)
        # NOTE(review): nothing is written to fout and it is never closed in
        # view, although the template file is used by the alignments below.
        fout = open(folderName + repeatTempFilename, 'w')
        repeatContentLarge = ""
        for i in range(maxDuplicate):
            repeatContentLarge= repeatContentLarge + repeatContent
        # 4)
        repeatReadList =  eachrepProfile[1]
        myList= []
        for eachitem in repeatReadList:
            # NOTE(review): parity is tested on the raw id here but on the
            # N1-shifted id in step 3; the two agree only when N1 is even.
            # readName is also never appended to myList in view.
            readName = "Read" + str((eachitem- N1)/2) + "_"
            if eachitem %2 ==0 :
                readName = readName + "p"
            elif eachitem %2 ==1:
                readName = readName + "d"
        commonLib.putListToFileO(folderName, readsetFilename+"_Double.fasta", "toAlignReads", myList)
        if True:
            commonLib.useMummerAlign(mummerPath, folderName,mummerFile , repeatTempFilename, "toAlignReads.fasta")
        dataList = commonLib.extractMumData(folderName, mummerFile+"Out")
        # 5)
        totalBasesMatch = 0
        lrepeat = len(repeatContent)
        c = 50 # Important parameters : FIX needed in production
        #lengthDic = commonLib.obtainLength(folderName, readsetFilename+"_Double.fasta")
        # NOTE(review): raises IndexError when no alignment record was found.
        print "dataList[0]", dataList[0]
        # Sort by the last field so groupby sees each key as one run, then add
        # up the largest field-5 value of every group.
        dataList.sort(key = itemgetter(-1))
        for key, values in  groupby(dataList,itemgetter(-1)):
            maxValue = -1
            for eachsub in values:
                if eachsub[5] > maxValue:
                    maxValue = eachsub[5]
            #print key, maxValue
            totalBasesMatch = totalBasesMatch + maxValue
        print c, lrepeat, totalBasesMatch
        # Estimated copy count.
        # NOTE(review): ZeroDivisionError when repeatContent is empty.
        ct = totalBasesMatch*1.0/(c*lrepeat)
        print "BIG NUMBER of THE DAY: ", ct
        # 6) 
        # a) find the starting point 
        startContig = eachrepProfile[-1][0][0]
        firstRead = eachrepProfile[-1][0][1]-N1

        # Rebuild "Contig<k>_p|d" and "Read<k>_p|d" names from the indices.
        contigName = "Contig"+ str(startContig/2)
        if startContig %2 == 0:
            contigName = contigName + "_p"
        elif startContig%2 ==1:
            contigName = contigName + "_d"
        readName = "Read"+ str(firstRead/2)
        if firstRead %2 == 0:
            readName = readName + "_p"
        elif firstRead%2 ==1:
            readName = readName + "_d"
        overlapFirst = dataListCRDic[contigName+";"+readName]
        # Contig minus its overlap with the first read, followed by that read.
        tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]
        # NOTE(review): nothing is written to f1 and it is never closed in
        # view, although firstOverlap.fasta is aligned right below.
        f1 = open(folderName + "firstOverlap.fasta", 'w')
        if True:
            commonLib.useMummerAlign(mummerPath, folderName,"myFirstOverlap" , repeatTempFilename, "firstOverlap.fasta")
        dataList = commonLib.extractMumData(folderName, "myFirstOverlap"+"Out")
        dataList.sort(key = itemgetter(0))
        # Pick the record with the largest field 5; the strict comparison keeps
        # the first one (lowest field 0 after the sort) on ties.
        maxVal = -1
        maxItm = []
        for eachi in dataList:
            if eachi[5] > maxVal:
                maxVal = eachi[5]
                maxItm = eachi
        print maxItm
        # NOTE(review): maxItm is still [] when dataList is empty, so the two
        # lookups below would raise IndexError.
        repeatStart = maxItm[0]
        contigEnd = maxItm[2]
        # b) format return : prepare the repeat template 
        print "ct*lrepeat", int(repeatStart + ct*lrepeat)
        print "repeatStart", repeatStart
        # Slice ct copies' worth of bases out of the duplicated template and
        # trim the combined contig at the alignment position.
        happyTandemList[contigName]= repeatContentLarge[repeatStart:int(repeatStart + ct*lrepeat)]
        contigsTmp[contigName] = tmpCombine[0:contigEnd]
        print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
        print "len(happyTandemList[contigName])", len(happyTandemList[contigName])
    # 7) Combine all the repeat information and do the join
    # leaderList[i] starts as i and is redirected to the contig that absorbed i.
    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]
        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)
        # Append the resolved repeat, then the end contig unless the loop
        # closes on the leader itself.
        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]
        if endContig != leaderContig:
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig
    leaderAgg = [[] for i in range(len(leaderList))]
    # NOTE(review): this loop has no body in view, so leaderAgg stays a list
    # of empty lists.
    for i in range(len(leaderList)):
    checkingList = [False for i in range(N1)]
    # NOTE(review): fout is never closed in view; the file name spelling
    # ("tadem") is a runtime string and is left as is.
    fout = open(folderName + "tademResolved.fasta", 'w')
    counter = 0
    for eachcontig in contigsTmp:
        # `id` shadows the builtin of the same name inside this function.
        id = newPhasing.parseEdgeNameToID(eachcontig, 'C')
        if checkingList[id/2] == False:
            # Only the header line is written in view; no statement writing
            # the sequence is visible.
            fout.write(">Segkk"+str(counter)+ "\n")
            counter = counter + 1    
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk/2] = True