def removeEdgeInBatch(G, noGoPrev, noGoNext):
    """Call G.clearIn / G.clearOut for two batches of contig edge names.

    Every name is translated to a node ID with
    abunHouseKeeper.parseEdgeNameToID(name, 'C') before being handed to G.

    Args:
        G: graph object exposing clearIn(nodeId) and clearOut(nodeId)
            (presumably these drop all incoming / outgoing edges of the
            node -- confirm against the graph library).
        noGoPrev: iterable of contig names passed to G.clearIn.
        noGoNext: iterable of contig names passed to G.clearOut.

    Returns:
        The same G object that was passed in.
    """
    def contig_id(edge_name):
        # Resolved lazily so nothing is looked up when both batches are empty.
        return abunHouseKeeper.parseEdgeNameToID(edge_name, 'C')

    # The whole clearIn batch is processed before any clearOut call.
    for blocked_name in noGoPrev:
        G.clearIn(contig_id(blocked_name))
    for blocked_name in noGoNext:
        G.clearOut(contig_id(blocked_name))

    return G
# Example #3
def formExtraEdges(folderName, optTypeFileHeader, contigFilename, G, N1):
    """Connect each read-embedded contig to that read's neighbours below N1.

    Loads the alignment records stored under optTypeFileHeader + "CROut" and
    the sequence lengths of contigFilename + "_Double.fasta".  A record marks
    its contig (record[-2]) as embedded in its read (record[-1]) when
    record[4] is greater than the contig length minus 300 (record[4] is
    presumably the aligned span on the contig -- confirm against
    alignerRobot.extractMumData).  When several records qualify for the same
    contig, the last one in dataList wins.

    For every such (contig, read) pair, each predecessor and successor of the
    read node whose ID is below N1 is linked to the contig node with the same
    weight through G.insertEdge.

    NOTE(review): the parameter list was missing from the reviewed source; it
    is reconstructed from the names the body reads, and the order is an
    assumption -- confirm against callers.

    Args:
        folderName: folder handed to the alignment / FASTA loaders.
        optTypeFileHeader: prefix of the "CR" alignment output name.
        contigFilename: base name of the contig FASTA (without "_Double.fasta").
        G: graph exposing graphNodesList and insertEdge(src, dst, wt).
        N1: exclusive upper bound on neighbour IDs that get linked
            (presumably the number of contig nodes -- confirm).

    Returns:
        The same G object, after the insertEdge calls.
    """
    dataList = alignerRobot.extractMumData(folderName,
                                           optTypeFileHeader + "CR" + "Out")
    lenDic = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")

    # One pass, last qualifying record per contig wins.  This does not rely
    # on dataList being sorted by contig name, which groupby() would need.
    embedContig2ReadDic = {}
    for eachitem in dataList:
        contigName = eachitem[-2]
        if eachitem[4] > lenDic[contigName] - 300:
            embedContig2ReadDic[contigName] = eachitem[-1]

    # Single-argument form prints the same text under Python 2 and 3.
    print("len(embedContig2ReadDic) %d" % len(embedContig2ReadDic))

    for contigName in embedContig2ReadDic:
        readName = embedContig2ReadDic[contigName]

        readIndex = abunHouseKeeper.parseEdgeNameToID(readName, 'R')
        contigIndex = abunHouseKeeper.parseEdgeNameToID(contigName, 'C')
        readNode = G.graphNodesList[readIndex]

        # Predecessors of the read become predecessors of the contig.
        for eachprev in readNode.listOfPrevNodes:
            idNode, wt = eachprev[0], eachprev[1]
            if idNode < N1:
                G.insertEdge(idNode, contigIndex, wt)

        # Successors of the read become successors of the contig.
        for eachnext in readNode.listOfNextNodes:
            idNode, wt = eachnext[0], eachnext[1]
            if idNode < N1:
                G.insertEdge(contigIndex, idNode, wt)

    return G
def addDataToList(dataList, G, startIndex1, startIndex2, type1, type2):
    """Insert one weighted edge into G for every record in dataList.

    For each record the two sequence names (record[-2] and record[-1]) are
    converted to node IDs with abunHouseKeeper.parseEdgeNameToID and shifted
    by startIndex1 / startIndex2.  The edge weight is
    min(record[4], record[5]).

    Edge direction depends on record[0] (presumably the alignment start on
    the first sequence -- confirm against the aligner output format):
      * record[0] < 50  -> edge from the second sequence to the first;
      * otherwise       -> edge from the first sequence to the second.

    NOTE(review): the reviewed source had both assignment pairs under the
    same `if` with no `else`, which left the first pair dead and the node IDs
    unbound or stale for records at or above the threshold.  The two-branch
    form below is the evident intent -- confirm against project history.

    Args:
        dataList: iterable of alignment records (indexable sequences).
        G: graph exposing insertEdge(src, dst, wt).
        startIndex1: offset added to IDs parsed from record[-2].
        startIndex2: offset added to IDs parsed from record[-1].
        type1: name-type tag passed to parseEdgeNameToID for record[-2].
        type2: name-type tag passed to parseEdgeNameToID for record[-1].

    Returns:
        None.  G is updated through its insertEdge method.
    """
    threshold = 50

    for eachitem in dataList:
        wt = min(eachitem[4], eachitem[5])

        firstNode = abunHouseKeeper.parseEdgeNameToID(eachitem[-2],
                                                      type1) + startIndex1
        secondNode = abunHouseKeeper.parseEdgeNameToID(eachitem[-1],
                                                       type2) + startIndex2

        if eachitem[0] < threshold:
            G.insertEdge(secondNode, firstNode, wt)
        else:
            G.insertEdge(firstNode, secondNode, wt)
def resolvingTandem(folderName, mummerPath, contigReadGraph,contigFilename, readsetFilename, optTypeFileHeader, repeatSpec):
    print "resolvingTandem"
    # Input : repeat info
    # Output : count, join.
    # 1. Find loops
    # 2. Form repeat
    # 3. Form chain of repeat copies back to back
    # 4. Align reads
    # 5. Calculate extra bases beyond flanking region
    # 6. Calculate count
    # 7. Join the contigs
    # 0 ) Load all the data
    thres = 5 
    G = graphLib.seqGraph(0)
    G.loadFromFile(folderName, contigReadGraph)
    lenDicCC = IORobot.obtainLength(folderName, contigFilename+"_Double.fasta")
    N1 = len(lenDicCC)

    maxDuplicate = 10
    repeatTempFilename = "tandemRepeatTemplate.fasta"
    mummerFile = "myTandemRepeatTemplate"

    myContigsDic = IORobot.loadContigsFromFile(folderName, readsetFilename+"_Double.fasta")    
    lenDicRR = IORobot.obtainLength(folderName, readsetFilename + "_Double.fasta")
    header = optTypeFileHeader + "RR"
    dataListRR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListRR = abunHouseKeeper.filterData(dataListRR, lenDicRR)
    dataListRRDic = {}
    for eachitem in dataListRR: 
        if eachitem[2] < thres:
            dataListRRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    header = optTypeFileHeader + "CR"
    lenDicCC = IORobot.obtainLength(folderName, contigFilename + "_Double.fasta")
    lenDicCR = dict(lenDicCC.items() + lenDicRR.items())
    dataListCR = alignerRobot.extractMumData(folderName, header + "Out")
    dataListCR = abunHouseKeeper.filterData(dataListCR, lenDicCR)
    dataListCRDic = {}
    for eachitem in dataListCR: 
        if eachitem[2] < thres:
            dataListCRDic[eachitem[-2] +";"+eachitem[-1]] = eachitem[4]

    print dataListCRDic

    json_data = open(folderName + repeatSpec, 'r')
    loadData = json.load(json_data)
    contigsTmp = IORobot.loadContigsFromFile(folderName, contigFilename+"_Double.fasta")
    readTmp = IORobot.loadContigsFromFile(folderName, readsetFilename + "_Double.fasta")

    happyTandemList = {}
    for eachrepProfile in loadData:
        # 1) 
        startContig = eachrepProfile[-1][0][0]
        isTerminate, returnPathList = DFSwithPath(G, G.graphNodesList[startContig], [startContig], N1, False)
        # 2) 
        if isTerminate:
            v = returnPathList[-1]
            i =0 
            tandemPath = []
            while i < len(returnPathList):
                if returnPathList[i] == v:
                    tandemPath = returnPathList[i:]
                    i = len(returnPathList)
                i = i +1
            print returnPathList
            print tandemPath
        # 3) [fix it when have time later ; to just use graph; bug at the min thing]
        repeatContent = ""
        for kk in range(len(tandemPath[0:-1])): 
            eachitem = tandemPath[kk]- N1
            nextitem = tandemPath[kk+1] - N1
            readName = "Read" + str(eachitem/2) + "_"
            nextReadName = "Read" + str(nextitem/2) + "_"
            if eachitem %2 ==0 :
                readName = readName + "p"
            elif eachitem %2 ==1:
                readName = readName + "d"
            if nextitem %2 ==0 :
                nextReadName = nextReadName + "p"
            elif nextitem %2 ==1:
                nextReadName = nextReadName + "d"
            overlap = dataListRRDic[readName + ";" + nextReadName]
            print overlap
            repeatContent = repeatContent +  myContigsDic[readName][0:-overlap]
        print "len(repeatContent)", len(repeatContent)
        fout = open(folderName + repeatTempFilename, 'w')
        repeatContentLarge = ""
        for i in range(maxDuplicate):
            repeatContentLarge= repeatContentLarge + repeatContent
        # 4)
        repeatReadList =  eachrepProfile[1]
        myList= []
        for eachitem in repeatReadList:
            readName = "Read" + str((eachitem- N1)/2) + "_"
            if eachitem %2 ==0 :
                readName = readName + "p"
            elif eachitem %2 ==1:
                readName = readName + "d"
        IORobot.putListToFileO(folderName, readsetFilename+"_Double.fasta", "toAlignReads", myList)
        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName,mummerFile , repeatTempFilename, "toAlignReads.fasta")
        dataList = alignerRobot.extractMumData(folderName, mummerFile+"Out")
        # 5)
        totalBasesMatch = 0
        lrepeat = len(repeatContent)
        c = findCoverageFromRawData(folderName)
        # print "dataList[0]", dataList[0]
        dataList.sort(key = itemgetter(-1))
        for key, values in  groupby(dataList,itemgetter(-1)):
            maxValue = -1
            for eachsub in values:
                if eachsub[5] > maxValue:
                    maxValue = eachsub[5]
            #print key, maxValue
            totalBasesMatch = totalBasesMatch + maxValue
        print c, lrepeat, totalBasesMatch
        ct = totalBasesMatch*1.0/(c*lrepeat)
        print "BIG NUMBER of THE DAY: ", ct
        # 6) 
        # a) find the starting point 
        startContig = eachrepProfile[-1][0][0]
        firstRead = eachrepProfile[-1][0][1]-N1

        contigName = "Contig"+ str(startContig/2)
        if startContig %2 == 0:
            contigName = contigName + "_p"
        elif startContig%2 ==1:
            contigName = contigName + "_d"
        readName = "Read"+ str(firstRead/2)
        if firstRead %2 == 0:
            readName = readName + "_p"
        elif firstRead%2 ==1:
            readName = readName + "_d"
        overlapFirst = dataListCRDic[contigName+";"+readName]
        tmpCombine = contigsTmp[contigName][0:-overlapFirst] + readTmp[readName]
        f1 = open(folderName + "firstOverlap.fasta", 'w')
        if True:
            alignerRobot.useMummerAlign(mummerPath, folderName,"myFirstOverlap" , repeatTempFilename, "firstOverlap.fasta")
        dataList = alignerRobot.extractMumData(folderName, "myFirstOverlap"+"Out")
        dataList.sort(key = itemgetter(0))
        maxVal = -1
        maxItm = []
        for eachi in dataList:
            if eachi[5] > maxVal:
                maxVal = eachi[5]
                maxItm = eachi
        print maxItm
        if len(maxItm) > 0 :
            repeatStart = maxItm[0]
            contigEnd = maxItm[2]
            repeatStart = 0
            contigEnd = -1
        # b) format return : prepare the repeat template 
        print "ct*lrepeat", int(repeatStart + ct*lrepeat)
        print "repeatStart", repeatStart
        happyTandemList[contigName]= repeatContentLarge[repeatStart:int(repeatStart + ct*lrepeat)]
        contigsTmp[contigName] = tmpCombine[0:contigEnd]
        print "len(contigsTmp[contigName])", len(contigsTmp[contigName])
        print "len(happyTandemList[contigName])", len(happyTandemList[contigName])
    # 7) Combine all the repeat information and do the join
    leaderList = [i for i in range(len(contigsTmp))]
    for eachrepProfile in loadData:
        startContig = eachrepProfile[-1][0][0]
        endContig = eachrepProfile[-1][-1][-1]
        leaderContig = leaderList[startContig]
        leaderName = parseIDToName(leaderContig)
        endName = parseIDToName(endContig)
        startName = parseIDToName(startContig)
        contigsTmp[leaderName] = contigsTmp[leaderName] + happyTandemList[startName]
        if endContig != leaderContig:
            contigsTmp[leaderName] = contigsTmp[leaderName] + contigsTmp[endName]
            contigsTmp[endName] = ""
            leaderList[endContig] = leaderContig
    leaderAgg = [[] for i in range(len(leaderList))]
    for i in range(len(leaderList)):
    checkingList = [False for i in range(N1)]
    fout = open(folderName + "tademResolved.fasta", 'w')
    counter = 0
    for eachcontig in contigsTmp:
        id = abunHouseKeeper.parseEdgeNameToID(eachcontig, 'C')
        if checkingList[id/2] == False:
            fout.write(">Segkk"+str(counter)+ "\n")
            counter = counter + 1    
            for eachkk in leaderAgg[leaderList[id]]:
                checkingList[eachkk/2] = True