Example #1
0
def removeRedundantRefvsQuery(folderName, mummerLink, fileR, fileQ,
                              outputFileName):

    thres = 10

    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName,
                                         [["redundantRvsQ", fileR, fileQ, ""]],
                                         houseKeeper.globalParallel)

    dataList = alignerRobot.extractMumData(folderName, "redundantRvsQOut")
    lenDicR = IORobot.obtainLength(folderName, fileR)
    lenDicQ = IORobot.obtainLength(folderName, fileQ)

    isRedundantList = []

    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[
            7], eachitem[8]
        l1, l2 = lenDicR[name1], lenDicQ[name2]

        if abs(l2 - match2) < thres:
            isRedundantList.append(name2)

    #print lenDicQ

    nonRedundantList = obtainComplement(lenDicQ, isRedundantList)

    print nonRedundantList
    IORobot.putListToFileO(folderName, fileQ, outputFileName, nonRedundantList)

    os.system("cp " + folderName + "SC_n_tmp.fasta " + folderName +
              "SC_n.fasta")
def removeEmbedded(folderName , mummerLink):
    print "removeEmbedded"
    thres = 10
    os.system("sed -e 's/|//g' " + folderName + "contigs.fasta  > " + folderName + "contigs2.fasta")

    os.system("cp " + folderName + "contigs2.fasta " + folderName + "contigs.fasta") 

    if not os.path.isfile(folderName + "selfOut"):
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [["self", "contigs.fasta", "contigs.fasta", ""]], houseKeeper.globalParallel )
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName
    
    lenDic = IORobot.obtainLength(folderName, 'contigs.fasta')
    removeList = alignerRobot.extractMumDataAndRemove(folderName,"selfOut",lenDic,thres)
        
    nameList = []
    for eachitem in lenDic:
        nameList.append(eachitem)

    print len(nameList)
    
    for eachitem in removeList:
        if eachitem in nameList:
            nameList.remove(eachitem)
    print len(nameList)
    
    IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
def removeRedundantRefvsQuery(folderName, mummerLink, fileR, fileQ, outputFileName):

    thres = 10

    if True:
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName, [["redundantRvsQ", fileR, fileQ, ""]], houseKeeper.globalParallel
        )

    dataList = alignerRobot.extractMumData(folderName, "redundantRvsQOut")
    lenDicR = IORobot.obtainLength(folderName, fileR)
    lenDicQ = IORobot.obtainLength(folderName, fileQ)

    isRedundantList = []

    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        l1, l2 = lenDicR[name1], lenDicQ[name2]

        if abs(l2 - match2) < thres:
            isRedundantList.append(name2)

    # print lenDicQ

    nonRedundantList = obtainComplement(lenDicQ, isRedundantList)

    print nonRedundantList
    IORobot.putListToFileO(folderName, fileQ, outputFileName, nonRedundantList)

    os.system("cp " + folderName + "SC_n_tmp.fasta " + folderName + "SC_n.fasta")
Example #4
0
def removeRedundantWithFile(folderName, mummerLink, inputFilename,
                            mummerTmpName, outputFileName):
    thres = 10
    os.system("sed -e 's/|//g' " + folderName + inputFilename + ".fasta  > " +
              folderName + inputFilename + "2.fasta")

    os.system("cp " + folderName + inputFilename + "2.fasta " + folderName +
              inputFilename + ".fasta")

    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [[
            mummerTmpName, inputFilename + ".fasta", inputFilename + ".fasta",
            ""
        ]], houseKeeper.globalParallel)
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    dataList = alignerRobot.extractMumData(folderName, mummerTmpName + "Out")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, inputFilename + '.fasta')

    removeList = []

    shortEmbedClusterDic = {}

    for eachitem in lenDic:
        shortEmbedClusterDic[eachitem] = clusterElem(eachitem)

    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[
            7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem
                union(shortEmbedClusterDic[name1], shortEmbedClusterDic[name2])

    nameList = obtainComplement(lenDic, removeList)

    returnList = []

    for eachitem in nameList:
        if find(shortEmbedClusterDic[eachitem]).id == eachitem:
            returnList.append(eachitem)

    print "len(nameList), len(returnList)", len(nameList), len(returnList)

    IORobot.putListToFileO(folderName, inputFilename + ".fasta",
                           outputFileName, returnList)
def removeRedundantWithFile(folderName, mummerLink, inputFilename, mummerTmpName, outputFileName):
    thres = 10
    os.system("sed -e 's/|//g' " + folderName + inputFilename + ".fasta  > " + folderName + inputFilename + "2.fasta")

    os.system("cp " + folderName + inputFilename + "2.fasta " + folderName + inputFilename + ".fasta")

    if True:
        alignerRobot.useMummerAlignBatch(
            mummerLink,
            folderName,
            [[mummerTmpName, inputFilename + ".fasta", inputFilename + ".fasta", ""]],
            houseKeeper.globalParallel,
        )
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    dataList = alignerRobot.extractMumData(folderName, mummerTmpName + "Out")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, inputFilename + ".fasta")

    removeList = []

    shortEmbedClusterDic = {}

    for eachitem in lenDic:
        shortEmbedClusterDic[eachitem] = clusterElem(eachitem)

    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem
                union(shortEmbedClusterDic[name1], shortEmbedClusterDic[name2])

    nameList = obtainComplement(lenDic, removeList)

    returnList = []

    for eachitem in nameList:
        if find(shortEmbedClusterDic[eachitem]).id == eachitem:
            returnList.append(eachitem)

    print "len(nameList), len(returnList)", len(nameList), len(returnList)

    IORobot.putListToFileO(folderName, inputFilename + ".fasta", outputFileName, returnList)
def removeEmbedded(folderName, mummerLink):
    print "removeEmbedded"
    thres = 10
    os.system("sed -e 's/|//g' " + folderName + "contigs.fasta  > " +
              folderName + "contigs2.fasta")

    os.system("cp " + folderName + "contigs2.fasta " + folderName +
              "contigs.fasta")

    if True:
        print "removeEmbedded: Aligning contigs.fasta to contigs.fasta, outputs are self*.delta"
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName,
            [["self", "contigs.fasta", "contigs.fasta", ""]],
            houseKeeper.globalParallel)
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    print "removeEmbedded: Extracting MUMmer data from delta files to selfOut"
    dataList = alignerRobot.extractMumData(folderName, "selfOut")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, 'contigs.fasta')

    removeList = []
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[
            7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem

    nameList = []
    for eachitem in lenDic:
        nameList.append(eachitem)

    print len(nameList)

    for eachitem in removeList:
        if eachitem in nameList:
            nameList.remove(eachitem)
    print len(nameList)

    print "removeEmbedded: Outputting non-contained contigs to noEmbed.fasta"
    IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
Example #7
0
def removeEmbedded(folderName, mummerLink):
    print "removeEmbedded"

    thres = 10

    command = r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "raw_reads.fasta > " + folderName + houseKeeper.globalReadName
    os.system(command)

    command = r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "contigs.fasta > " + folderName + houseKeeper.globalContigName
    os.system(command)

    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [[
            "self", houseKeeper.globalContigName, houseKeeper.globalContigName,
            ""
        ]], houseKeeper.globalParallel)
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    dataList = alignerRobot.extractMumData(folderName, "selfOut")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, houseKeeper.globalContigName)

    removeList = []
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[
            7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem

    nameList = []
    for eachitem in lenDic:
        nameList.append(eachitem)

    print len(nameList)

    for eachitem in removeList:
        if eachitem in nameList:
            nameList.remove(eachitem)
    print len(nameList)

    IORobot.putListToFileO(folderName, houseKeeper.globalContigName, "noEmbed",
                           nameList)
def removeEmbedded(folderName , mummerLink):
    print "removeEmbedded"
    
    thres = 10
    
    command= r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "raw_reads.fasta > " + folderName  + houseKeeper.globalReadName
    os.system(command)

    command= r'''perl -pe 's/>[^\$]*$/">Seg" . ++$n ."\n"/ge' ''' + folderName + "contigs.fasta > " + folderName  + houseKeeper.globalContigName
    os.system(command)


    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [["self", houseKeeper.globalContigName, houseKeeper.globalContigName, ""]], houseKeeper.globalParallel)
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName
    
    dataList = alignerRobot.extractMumData(folderName, "selfOut")
    
    dataList = alignerRobot.transformCoor(dataList)
    
    lenDic = IORobot.obtainLength(folderName, houseKeeper.globalContigName)
    
    removeList = []
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]
        
        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]
            
            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem
                
    
    
    nameList = []
    for eachitem in lenDic:
        nameList.append(eachitem)

    print len(nameList)
    
    for eachitem in removeList:
        if eachitem in nameList:
            nameList.remove(eachitem)
    print len(nameList)
    
    IORobot.putListToFileO(folderName, houseKeeper.globalContigName, "noEmbed", nameList)
    
def removeEmbedded(folderName, mummerLink):
    print "removeEmbedded"
    thres = 10
    os.system("sed -e 's/|//g' " + folderName + "contigs.fasta  > " + folderName + "contigs2.fasta")

    os.system("cp " + folderName + "contigs2.fasta " + folderName + "contigs.fasta")

    if True:
        print "removeEmbedded: Aligning contigs.fasta to contigs.fasta, outputs are self*.delta"
        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName, [["self", "contigs.fasta", "contigs.fasta", ""]], houseKeeper.globalParallel
        )
        # alignerRobot.useMummerAlign(mummerLink, folderName, "self", "contigs.fasta", "contigs.fasta")
        # outputName, referenceName, queryName, specialName

    print "removeEmbedded: Extracting MUMmer data from delta files to selfOut"
    dataList = alignerRobot.extractMumData(folderName, "selfOut")

    dataList = alignerRobot.transformCoor(dataList)

    lenDic = IORobot.obtainLength(folderName, "contigs.fasta")

    removeList = []
    for eachitem in dataList:
        match1, match2, name1, name2 = eachitem[4], eachitem[5], eachitem[7], eachitem[8]

        if name1 != name2:
            l1, l2 = lenDic[name1], lenDic[name2]

            if abs(l1 - match1) < thres and abs(l2 - match2) > thres:
                removeList.append(name1)
            elif abs(l1 - match1) > thres and abs(l2 - match2) < thres:
                removeList.append(name2)
            elif abs(l1 - match1) < thres and abs(l2 - match2) < thres:
                print "Both shortembedd", eachitem

    nameList = []
    for eachitem in lenDic:
        nameList.append(eachitem)

    print len(nameList)

    for eachitem in removeList:
        if eachitem in nameList:
            nameList.remove(eachitem)
    print len(nameList)

    print "removeEmbedded: Outputting non-contained contigs to noEmbed.fasta"
    IORobot.putListToFileO(folderName, "contigs.fasta", "noEmbed", nameList)
Example #10
0
def fillInMissed(folderName, mummerLink, filerefname, filequeryname, fileoutname):
    
    os.system("mv " + folderName + fileoutname + " " + folderName + filequeryname )
    alignerRobot.useMummerAlignBatch(mummerLink, folderName, [[fileoutname+"fillmiss", filerefname, filequeryname, ""]], houseKeeper.globalParallel)
    
    dataList = alignerRobot.extractMumData(folderName, fileoutname+"fillmissOut")

    lenDic = obtainLength(folderName, filerefname)

    ### Check if there is any missing parts 

    # Format of the dataList :  1      765  |    11596    10822  |      765      775  |    84.25  | ref_NC_001133_       scf7180000000702"
    
    dataList.sort(key = itemgetter(-2))
    thres = 100
    extraList = []

    for key, items in groupby(dataList, itemgetter(-2)):
        isFound = False
        for eachitem in items:
            if abs(int(eachitem[4])  - lenDic[key]) < thres:
                isFound = True
                break

        if not isFound:
            extraList.append(key)

    ### Fill in any missing items
    
    referenceDic = loadContigsFromFile(folderName, filerefname)
    queryDic = loadContigsFromFile(folderName, filequeryname)
    
    ctgList = [referenceDic[eachitem] for eachitem in extraList] + [queryDic[eachitem] for eachitem in queryDic]
    writeSegOut(ctgList, folderName, fileoutname)

    print "fileoutname: len(extraList)",fileoutname,  len(extraList), len(ctgList)
Example #11
0
def formRelatedReadsFile(folderName, mummerLink):    
    # Find associated read and extract into a file associatedReads.fasta
    # Input: contigs.fasta, cleaned_Reads.fasta 
    # Output: relatedReads.fasta

    # ## Extract heads of the contigs
    print ">formRelatedReadsFile"
    
    f = open(folderName + "improved.fasta", 'r')
    f2 = open(folderName + "improvedTrunc.fasta", 'w')
    temp = f.readline()
    tempContig = ""
    thres = 400
    runningIndex = 0
    endThres = 10 
    
    while len(temp) > 0:
        if temp[-1] == '\n':
            temp = temp[0:-1]
        
        
        if temp[0] == '>':

            if len(tempContig) > 0:
                IORobot.writeToFile(f2, runningIndex, tempContig[0:thres])
                runningIndex = runningIndex + 1
                
                IORobot.writeToFile(f2, runningIndex, tempContig[-thres:])
                runningIndex = runningIndex + 1 
                
                                
                IORobot.writeToFile(f2, runningIndex, houseKeeper.reverseComplement(tempContig[0:thres]))
                runningIndex = runningIndex + 1
                
                IORobot.writeToFile(f2, runningIndex, houseKeeper.reverseComplement(tempContig[-thres:]))
                runningIndex = runningIndex + 1
                
                tempContig = ""
        else:
            tempContig = tempContig + temp
        
        temp = f.readline()

    IORobot.writeToFile(f2, runningIndex, tempContig[0:thres])
    runningIndex = runningIndex + 1
    
    IORobot.writeToFile(f2, runningIndex, tempContig[-thres:])
    runningIndex = runningIndex + 1
                  
    IORobot.writeToFile(f2, runningIndex, houseKeeper.reverseComplement(tempContig[0:thres]))
    runningIndex = runningIndex + 1
    
    IORobot.writeToFile(f2, runningIndex, houseKeeper.reverseComplement(tempContig[-thres:]))
    runningIndex = runningIndex + 1
    
    
    f2.close()
    f.close()
    
    # ## Write double stranded reads
    IORobot.writeToFile_Double1(folderName, "improved.fasta", "improved_Double.fasta", "contig")
    # writeToFile_Double1(folderName, "raw_reads.fasta", "raw_reads_Double.fasta","read")
    
    # ## Apply MUMMER on them using cleanedReads against them
    assoiatedReadIndex = []
    nameList = []
    
    numberOfFiles = max(20, houseKeeper.globalParallel)
    
    if True:
        bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
        command = bindir + "/fasta-splitter.pl --n-parts " + str(numberOfFiles) + " " + folderName + houseKeeper.globalReadName
        os.system(command)
    
    
    workerList = []
    
    for dummyI in range(1, numberOfFiles + 1):
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)
            
        outputName, referenceName, queryName, specialName=  "outGapFillRaw"+indexOfMum , "improvedTrunc.fasta", houseKeeper.globalReadName[0:-6] + ".part-" + indexOfMum + ".fasta", "fromMum" + indexOfMum 
        workerList.append([outputName, referenceName, queryName, specialName])
    
    
    
    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,True)
        
        # alignerRobot.useMummerAlign(mummerLink, folderName, "out", "improvedTrunc.fasta", "raw_reads.part-" + indexOfMum + ".fasta", True, "fromMum" + indexOfMum )
        
        '''
        command = mummerLink + "nucmer --maxmatch --nosimplify -p " + folderName + "out " + folderName + "improvedTrunc.fasta raw_reads.part-" + indexOfMum + ".fasta"
        os.system(command)

        command = mummerLink + "show-coords -r " + folderName + "out.delta > " + folderName + "fromMum" + indexOfMum
        os.system(command)
        '''
        

    for dummyI in range(1, numberOfFiles + 1):
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)
        f = open(folderName + "fromMum" + indexOfMum, 'r')
    
        for i in range(6):
            tmp = f.readline()
        
        while len(tmp) > 0:
            infoArr = tmp.split('|')
            myArr = infoArr[-1].split('\t')
            rdGpArr = infoArr[-1].split('\t')
            contigName = rdGpArr[0].rstrip().lstrip()
            readName = rdGpArr[1].rstrip().lstrip()
            
            endSegArr = infoArr[0].split(" ")
            pos = []
            for eachitem in endSegArr:
                if len(eachitem) > 0:
                    pos.append(int(eachitem))
                    
            startPos = pos[0]
            endPos = pos[1]
            if startPos < endThres and endPos > thres - endThres:
                assoiatedReadIndex.append(myArr[1])
                nameList.append([int(contigName.split('_')[1]), readName])
            tmp = f.readline()
        
        f.close()
    
    
    nameList.sort()

    assoiatedReadIndex.sort()
    
    # print "assoiatedReadIndex", assoiatedReadIndex
    
    ckIndex = 0
    f = open(folderName + "associatedNames.txt", 'w')
    oneItem = 0
    keyFound = []
    for key, items in groupby(assoiatedReadIndex):
        
        countItem = 0
        for eachitem in items:
            countItem += 1
            
        if countItem == 1:
            
            oneItem += 1
        else:
            key = key.rstrip()
            if not key in keyFound:
                f.write(key + '\n')
                keyFound.append(key)

        ckIndex += 1
    
    print "ckIndex,oneItem: ", ckIndex, oneItem
    f.close()

    fFilter = open(folderName + "associatedNames.txt", 'r')
    
    fout = open(folderName + "associatedNames2.txt", 'w') 
    
    maxCount = 12000
    mytmpDum = fFilter.readline() 
    i = 0
    while i < maxCount and len(mytmpDum) > 0:
        fout.write(mytmpDum)  
        mytmpDum = fFilter.readline() 
        i = i + 1
        
    fout.close()   
    fFilter.close()

    command = "perl -ne 'if(/^>(\S+)/){$c=$i{$1}}$c?print:chomp;$i{$_}=1 if @ARGV' " + folderName + "associatedNames2.txt " + folderName + houseKeeper.globalReadName +" > " + folderName + "relatedReads.fasta"
    os.system(command)
    
    IORobot.writeToFile_Double1(folderName, "relatedReads.fasta", "relatedReads_Double.fasta", "read")
Example #12
0
def extractEdgeSet(folderName, mummerLink, option="nopolish"):
    # Tasks: reconstruct the string  graph
    
    # Input : relatedReads_Double.fasta, conig_Double.fasta
    # Intermediate files: fromMum_overlap , fromMum_overlap
    # Output: connectivity of eachNode: InList, OutList [critical]
    #         connectivity of eachNode: arrow representation with size [optional]
    
    
    # ## Perform MUMMER alignment
    print ">Extract Edge set"
    contigOnlyLengthDic = IORobot.obtainLength(folderName, "improved.fasta")
    
    # print lengthDic
    lengthDic = IORobot.findContigLength(folderName, "improved")
    
    numberOfContig = len(contigOnlyLengthDic)*2

    K = 400
    thres = 5
    
    
    # ## Apply MUMMER on them using cleanedReads against them
    IORobot.truncateEndOfContigs(folderName, "improved_Double.fasta", "smaller_improvedContig.fasta", 25000, lengthDic)
    dataSet = []
    
    numberOfFiles = max(20, houseKeeper.globalParallel)
    

    if True:
        bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
        command = bindir + "/fasta-splitter.pl --n-parts " + str(numberOfFiles) + " " + folderName + "relatedReads_Double.fasta"
        os.system(command)
        
        
    workerList = [] 
    for dummyI in range(1, numberOfFiles + 1):
        indexOfMum = ""
        if dummyI < 10:
            indexOfMum = "0" + str(dummyI)
        else:
            indexOfMum = str(dummyI)
        
        outputName, referenceName, queryName, specialName=  "outGapFillRefine"+indexOfMum , "smaller_improvedContig.fasta",  "relatedReads_Double.part-" + indexOfMum + ".fasta",  "fromMumRefine" + indexOfMum
        workerList.append([outputName, referenceName, queryName, specialName])
    
        
        
    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,True)
        
        # alignerRobot.useMummerAlign(mummerLink, folderName, "outRefine", "smaller_improvedContig.fasta", "relatedReads_Double.part-" + indexOfMum + ".fasta", True,  "fromMumRefine" + indexOfMum)
        
    
    for dummyI in range(1, numberOfFiles + 1):
        tmpSet = IORobot.obtainLinkInfoReadContig(dummyI, mummerLink, folderName,thres, lengthDic, K)
        dataSet = dataSet + tmpSet
    
    # ## repeat aware
    usableJunction = loadOpenList(folderName)
    dataSet, blockedSet = filterRepeatEnd(dataSet, usableJunction)
    # ## repeat aware end
    
    dataSet.sort()
    matchPair = formMatchPairFromReadInfo(dataSet)
    
    # Bug fix on repeat detection from reads alone
    matchPair = filterRepeatPair(matchPair)
    # end bug fix
    
    # print matchPair

    bestMatchPair = []
    
    for key, items in groupby(matchPair, itemgetter(0, 1)):
        maxvalue = -1
        maxLenPair = []
        for eachitem in items:
            if eachitem[2] > maxvalue:
                maxvalue = eachitem[2]
                maxLenPair = [eachitem[3], eachitem[4], eachitem[5]]
        bestMatchPair.append([key[0], key[1], maxvalue, maxLenPair[0], maxLenPair[1], maxLenPair[2]])
    
    contigList, leftConnect, rightConnect, rawReadList = formbestpair(bestMatchPair,numberOfContig)
    print "contigList", contigList
    
    writeContigReadCombine(blockedSet, dataSet, folderName, rawReadList, numberOfContig, contigList, leftConnect, option, rightConnect, mummerLink)
Example #13
0
def obtainLinkInfo(folderName, mummerLink, inputFile, mummerFile):
    thres = 5
    minLen = 400
    # thres = 10
    # minLen = 200

    writeToFile_Double1(folderName, inputFile + ".fasta",
                        inputFile + "_Double.fasta", "contig")

    fmyFile = open(folderName + inputFile + "_Double.fasta", 'r')
    fSmaller = open(folderName + inputFile + "_contigs_Double.fasta", 'w')

    tmp = fmyFile.readline().rstrip()
    maxSize = 50000

    myName = ""
    while len(tmp) > 0:
        if tmp[0] == '>':
            fSmaller.write(tmp + '\n')
            myName = tmp[1:]
        else:
            component = tmp[0:min(len(tmp), maxSize)]
            countComp = len(component)
            fSmaller.write(component)

            component = tmp[max(0, len(tmp) - maxSize):len(tmp)]
            fSmaller.write(component)
            countComp = countComp + len(component)

            print "DebugName", myName, countComp
            fSmaller.write('\n')

        tmp = fmyFile.readline().rstrip()

    fSmaller.close()
    fmyFile.close()

    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [[
            mummerFile, inputFile + "_contigs_Double.fasta",
            inputFile + "_contigs_Double.fasta", ""
        ]], houseKeeper.globalParallel)

        # alignerRobot.useMummerAlign(mummerLink, folderName, mummerFile, inputFile + "_contigs_Double.fasta", inputFile + "_contigs_Double.fasta")

    lengthDic = obtainLength(folderName, inputFile + "_contigs_Double.fasta")

    dataSetRaw = alignerRobot.extractMumData(folderName, mummerFile + "Out")

    # ## Format [ helperStart, helperEnd , readStart, readEnd,matchLen1,matchLen2,percentMatch,helperName,readName]

    dataSet = []

    for eachitem in dataSetRaw:
        helperStart, helperEnd, readStart, readEnd, matchLen1, matchLen2, percentMatch, helperName, readName = eachitem

        detailHelper = helperName.split('_')
        detailRead = readName.split('_')

        if detailHelper[0] != detailRead[0] and helperName != readName and max(
                matchLen1, matchLen2) > minLen and readStart < readEnd and min(
                    helperStart, readStart) < thres and min(
                        lengthDic[helperName] - helperEnd,
                        lengthDic[readName] - readEnd) + 1 < thres:
            conditionForMatch = True
        else:
            conditionForMatch = False

        if conditionForMatch:
            if helperStart < thres:

                dataSet.append((max(matchLen1,
                                    matchLen2), readName, helperName))

    dataSet.sort(reverse=True)

    numberOfContig = len(lengthDic)

    return numberOfContig, dataSet
Example #14
0
def obtainLinkInfo(folderName, mummerLink, inputFile, mummerFile):
    # minLen = 200
    minLen = 400
    
    if houseKeeper.globalRelaxThres == False:
        thres = 5
    elif houseKeeper.globalRelaxThres == True:
        thres = 10
    
    
    writeToFile_Double1(folderName, inputFile + ".fasta", inputFile + "_Double.fasta", "contig")
    
    fmyFile = open(folderName + inputFile + "_Double.fasta", 'r')
    fSmaller = open(folderName + inputFile + "_contigs_Double.fasta", 'w')

    tmp = fmyFile.readline().rstrip()
    maxSize = 50000

    myName = ""
    while len(tmp) > 0:
        if tmp[0] == '>':
            fSmaller.write(tmp + '\n')
            myName = tmp[1:]
        else:
            component = tmp[0:min(len(tmp), maxSize)] 
            countComp = len(component)
            fSmaller.write(component)
            
            component = tmp[max(0, len(tmp) - maxSize):len(tmp)]
            fSmaller.write(component)
            countComp = countComp + len(component)
            

            print "DebugName", myName, countComp
            fSmaller.write('\n')

        tmp = fmyFile.readline().rstrip()

    fSmaller.close()
    fmyFile.close()
    
    if True:
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, [[mummerFile, inputFile + "_contigs_Double.fasta", inputFile + "_contigs_Double.fasta", ""]], houseKeeper.globalParallel )
        
        # alignerRobot.useMummerAlign(mummerLink, folderName, mummerFile, inputFile + "_contigs_Double.fasta", inputFile + "_contigs_Double.fasta")
        
        
    lengthDic = obtainLength(folderName, inputFile + "_contigs_Double.fasta") 
    
    dataSetRaw = alignerRobot.extractMumData(folderName, mummerFile + "Out")
    
    # ## Format [ helperStart, helperEnd , readStart, readEnd,matchLen1,matchLen2,percentMatch,helperName,readName]
    
    
    dataSet = []
    
    for eachitem in dataSetRaw: 
        helperStart, helperEnd , readStart, readEnd, matchLen1, matchLen2, percentMatch, helperName, readName = eachitem 
        
        detailHelper = helperName.split('_')
        detailRead = readName.split('_')
        

        if detailHelper[0] != detailRead[0] and  helperName != readName and max(matchLen1, matchLen2) > minLen and readStart < readEnd  and min(helperStart, readStart) < thres and min(lengthDic[helperName] - helperEnd, lengthDic[readName] - readEnd) + 1 < thres:
            conditionForMatch = True
        else:
            conditionForMatch = False

        if conditionForMatch :
            if helperStart < thres:
                
                dataSet.append((max(matchLen1, matchLen2), readName, helperName))
    
    dataSet.sort(reverse=True)
    
    numberOfContig = len(lengthDic)
    
    return numberOfContig, dataSet