Example #1
0
def alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink,
                      header):
    """Split the query fasta into parts, align every part against the
    reference with MUMmer in parallel, then merge the per-part outputs.
    """
    partCount = houseKeeper.globalParallelFileNum
    scriptDir = os.path.abspath(os.path.dirname(sys.argv[0]))
    splitCmd = (scriptDir + "/finisherSCCoreLib/fasta-splitter.pl --n-parts "
                + str(partCount) + " " + folderName + queryFile)
    os.system(splitCmd)

    # fasta-splitter drops its parts in the CWD; move them into folderName.
    os.system("cp *.fasta " + folderName)
    os.system("rm *.fasta ")

    # Two-digit part index ("01", "02", ...) matching fasta-splitter's
    # part naming scheme.
    workerList = [[header + str(partIdx).zfill(2),
                   referenceFile,
                   queryFile[0:-6] + ".part-" + str(partIdx).zfill(2) + ".fasta",
                   header + str(partIdx).zfill(2)]
                  for partIdx in range(1, partCount + 1)]

    alignerRobot.useMummerAlignBatch(mummerLink,
                                     folderName,
                                     workerList,
                                     houseKeeper.globalParallel,
                                     specialForRaw=False,
                                     refinedVersion=False)
    alignerRobot.combineMultipleCoorMum(True, mummerLink, folderName, header,
                                        header + "Out", partCount)
def alignerSubRoutine(folderName, referenceFile, queryFile, mummerLink, header):
    """Split queryFile into parts, align each part to referenceFile with
    MUMmer in parallel, and merge the per-part coordinate files.

    NOTE(review): this redefinition shadows the routine of the same name
    defined earlier in the file; consider removing one of the two.
    """
    # Use the shared configuration value instead of a hard-coded 20 so the
    # split count stays consistent with the sibling implementation above.
    numberOfFiles = houseKeeper.globalParallelFileNum
    bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
    command = (bindir + "/finisherSCCoreLib/fasta-splitter.pl --n-parts "
               + str(numberOfFiles) + " " + folderName + queryFile)
    os.system(command)

    # fasta-splitter emits its parts into the CWD; relocate them.
    os.system("cp *.fasta " + folderName)
    os.system("rm *.fasta ")

    workerList = []

    for dummyI in range(1, numberOfFiles + 1):
        # Zero-pad to two digits to match fasta-splitter's part naming.
        indexOfMum = str(dummyI).zfill(2)

        outputName = header + indexOfMum
        referenceName = referenceFile
        queryName = queryFile[0:-6] + ".part-" + indexOfMum + ".fasta"
        specialName = header + indexOfMum
        workerList.append([outputName, referenceName, queryName, specialName])

    alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList,
                                     houseKeeper.globalParallel,
                                     specialForRaw=False, refinedVersion=False)
    alignerRobot.combineMultipleCoorMum(True, mummerLink, folderName, header,
                                        header + "Out", numberOfFiles)
def evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink, continueFilter):
    """Assign each read to its best-matching contig and report coverage.

    dataList: list of MUMmer coordinate records, where field [4] is the
        match length, [6] the percent identity, [-2] the contig name and
        [-1] the read name (the list is grouped by read name below).
    lenDic: dict mapping contig name -> contig length.
    readLenDic: dict mapping read name -> read length.
    folderName / mummerLink: working directory and MUMmer location used
        for the optional refinement alignment.
    continueFilter: when true, reads with no alignment are written out and
        re-aligned against improved3.fasta in a refined second pass.

    Returns a dict mapping contig name -> coverage ratio
    (assigned bases / contig length).

    NOTE(review): the final reporting loop assumes contigs are named
    "Segkk0" ... "SegkkN-1" -- confirm against callers.
    """
    myCountDic = {}
    for eachitem in lenDic:
        myCountDic[eachitem] = 0

    # groupby below requires the records sorted by read name (last field).
    dataList.sort(key=itemgetter(-1))

    ctkk, ctbase = 0, 0
    toAddBackDic = copy.deepcopy(readLenDic)

    for key, items in groupby(dataList, itemgetter(-1)):
        # Pick the hit with the largest identity-weighted match length.
        maxMatch = -1
        bestname = ""

        for eachitem in items:
            ct = eachitem[6] / 100.0 * eachitem[4]
            if ct > maxMatch:
                maxMatch = ct
                bestname = eachitem[-2]
        myCountDic[bestname] += readLenDic[key]

        ctkk = ctkk + 1
        ctbase = ctbase + readLenDic[key]
        # Mark this read as matched so it is excluded from the add-back list.
        toAddBackDic[key] = -1

    # Total read bases, for the missed-coverage report.
    cttot = 0
    for eachitem in readLenDic:
        cttot = cttot + readLenDic[eachitem]

    # 4.7e6 divisor is a hard-coded genome size (presumably E. coli) --
    # TODO confirm.
    print "Missed coverage  ", (cttot - ctbase) / (4.7 * pow(10, 6))
    print "percentage miss read", (len(readLenDic) - ctkk) / (1.0 * len(readLenDic))

    # Reads whose entry was never set to -1 had no alignment at all.
    toAddReadList = []
    for eachitem in toAddBackDic:
        if toAddBackDic[eachitem] >= 0:
            toAddReadList.append(eachitem)

    """
    This part need the most parallelism because it is most intense with -l 10 
    split V, workerList V , combine 
    """

    if continueFilter:
        numberOfFiles = 20

        # Write the unmatched reads to selected_raw.fasta for re-alignment.
        IORobot.putListToFileO(folderName, "raw_reads.fasta", "selected_raw", toAddReadList)

        bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
        command = (
            bindir
            + "/finisherSCCoreLib/fasta-splitter.pl --n-parts "
            + str(numberOfFiles)
            + " "
            + folderName
            + "selected_raw.fasta"
        )
        os.system(command)

        workerList = []

        for dummyI in range(1, numberOfFiles + 1):
            # Zero-pad to two digits to match fasta-splitter's part naming.
            indexOfMum = ""
            if dummyI < 10:
                indexOfMum = "0" + str(dummyI)
            else:
                indexOfMum = str(dummyI)

            outputName, referenceName, queryName, specialName = (
                "outAbunRefine" + indexOfMum,
                "improved3.fasta",
                "selected_raw.part-" + indexOfMum + ".fasta",
                "abunMissOut" + indexOfMum,
            )
            workerList.append([outputName, referenceName, queryName, specialName])

        alignerRobot.useMummerAlignBatch(
            mummerLink, folderName, workerList, houseKeeper.globalParallel, specialForRaw=True, refinedVersion=True
        )
        alignerRobot.combineMultipleCoorMum(True, mummerLink, folderName, "outAbunRefine", "abunMissOut", numberOfFiles)

    # Normalize the per-contig base counts to coverage ratios and report.
    for i in range(len(myCountDic)):
        eachitem = "Segkk" + str(i)
        print eachitem, myCountDic[eachitem] / (1.0 * lenDic[eachitem])
        myCountDic[eachitem] = myCountDic[eachitem] / (1.0 * lenDic[eachitem])

    return myCountDic
Example #4
0
def evaluateCoverage(dataList, lenDic, readLenDic, folderName,mummerLink, continueFilter):
    
    myCountDic = {}
    for eachitem in lenDic:
        myCountDic[eachitem] = 0
            
    dataList.sort(key = itemgetter(-1)) 
    
    ctkk, ctbase = 0, 0
    toAddBackDic = copy.deepcopy(readLenDic)
    
    for key, items in groupby(dataList, itemgetter(-1)):
        maxMatch = -1
        bestname = ""
        
        for eachitem in items:
            ct = eachitem[6]/100.0 * eachitem[4]
            if ct > maxMatch:
                maxMatch = ct 
                bestname = eachitem[-2]
        myCountDic[bestname] += readLenDic[key] 
        
        ctkk = ctkk + 1 
        ctbase = ctbase + readLenDic[key]
        toAddBackDic[key] = -1
    
    cttot = 0
    for eachitem in readLenDic:
        cttot = cttot + readLenDic[eachitem]
        
    print "Missed coverage  ", (cttot - ctbase)/(4.7*pow(10, 6))
    print "percentage miss read", (len(readLenDic) - ctkk)/(1.0*len(readLenDic)) 
    
    toAddReadList = []
    for eachitem in toAddBackDic:
        if toAddBackDic[eachitem] >= 0 :
            toAddReadList.append(eachitem)
    
    '''
    This part need the most parallelism because it is most intense with -l 10 
    split V, workerList V , combine 
    '''
    
    if continueFilter:
        numberOfFiles= 20
        
        IORobot.putListToFileO(folderName, "raw_reads.fasta" , "selected_raw", toAddReadList)
        
        bindir =  os.path.abspath(os.path.dirname(sys.argv[0]))   
        command = bindir + "/finisherSCCoreLib/fasta-splitter.pl --n-parts " + str(numberOfFiles) + " " + folderName + "selected_raw.fasta"
        os.system(command)
        
        workerList = []
        
        for dummyI in range(1, numberOfFiles + 1):
            indexOfMum = ""
            if dummyI < 10:
                indexOfMum = "0" + str(dummyI)
            else:
                indexOfMum = str(dummyI)
           
            outputName, referenceName, queryName, specialName= "outAbunRefine"+indexOfMum, "improved3.fasta", "selected_raw.part-"+ indexOfMum + ".fasta",  "abunMissOut" + indexOfMum
            workerList.append([outputName, referenceName, queryName, specialName])
            
        alignerRobot.useMummerAlignBatch(mummerLink, folderName, workerList, houseKeeper.globalParallel ,specialForRaw = True, refinedVersion = True)
        alignerRobot.combineMultipleCoorMum( True, mummerLink, folderName, "outAbunRefine", "abunMissOut", numberOfFiles)
        

        
    for i in range(len(myCountDic)):
        eachitem = "Segkk"+str(i)
        print eachitem , myCountDic[eachitem]/(1.0*lenDic[eachitem])
        myCountDic[eachitem] = myCountDic[eachitem]/(1.0*lenDic[eachitem])
        
    return myCountDic
Example #5
0
def evaluateCoverage(dataList, lenDic, readLenDic, folderName, mummerLink,
                     continueFilter, contigFilename):
    '''
    not sure if that is the right documentation... 

    Input : string_graph_3, improved3.fasta, raw_reads.fasta
    Output : string_graph_4 with weights [need a data structure to store the weight on node]

    Algorithm : 
    1. Find your favorite mappers to map read back
        a. MUMmer, Bowtie, bbmap, any that works V 
        b. And then write a short parser to parse the results V 
    2. Calculate count on the abundances 
        a. Aggregate by taking average [put weights on bin along contigs]
        b. Inheritance and a subclass 
    3. Find your favorite graphical tool to display 
        a. Use a javascript library [halfviz should just work ! put weight on edge ]

    '''
    myCountDic = {}
    for eachitem in lenDic:
        myCountDic[eachitem] = 0

    dataList.sort(key=itemgetter(-1))

    ctkk, ctbase = 0, 0
    toAddBackDic = copy.deepcopy(readLenDic)

    for key, items in groupby(dataList, itemgetter(-1)):
        maxMatch = -1
        bestname = ""

        for eachitem in items:
            ct = eachitem[6] / 100.0 * eachitem[4]
            if ct > maxMatch:
                maxMatch = ct
                bestname = eachitem[-2]
        myCountDic[bestname] += readLenDic[key]

        ctkk = ctkk + 1
        ctbase = ctbase + readLenDic[key]
        toAddBackDic[key] = -1

    cttot = 0
    for eachitem in readLenDic:
        cttot = cttot + readLenDic[eachitem]

    print "Missed coverage  ", (cttot - ctbase) / (4.7 * pow(10, 6))
    print "percentage miss read", (len(readLenDic) - ctkk) / (1.0 *
                                                              len(readLenDic))

    toAddReadList = []
    for eachitem in toAddBackDic:
        if toAddBackDic[eachitem] >= 0:
            toAddReadList.append(eachitem)
    '''
    This part need the most parallelism because it is most intense with -l 10 
    split V, workerList V , combine 
    '''

    if continueFilter:
        numberOfFiles = houseKeeper.globalParallelFileNum

        IORobot.putListToFileO(folderName, "raw_reads.fasta", "selected_raw",
                               toAddReadList)

        bindir = os.path.abspath(os.path.dirname(sys.argv[0]))
        command = bindir + "/finisherSCCoreLib/fasta-splitter.pl --n-parts " + str(
            numberOfFiles) + " " + folderName + "selected_raw.fasta"
        os.system(command)

        workerList = []

        for dummyI in range(1, numberOfFiles + 1):
            indexOfMum = ""
            if dummyI < 10:
                indexOfMum = "0" + str(dummyI)
            else:
                indexOfMum = str(dummyI)

            outputName, referenceName, queryName, specialName = "outAbunRefine" + indexOfMum, contigFilename + ".fasta", "selected_raw.part-" + indexOfMum + ".fasta", "abunMissOut" + indexOfMum
            workerList.append(
                [outputName, referenceName, queryName, specialName])

        alignerRobot.useMummerAlignBatch(mummerLink,
                                         folderName,
                                         workerList,
                                         houseKeeper.globalParallel,
                                         specialForRaw=True,
                                         refinedVersion=True)
        alignerRobot.combineMultipleCoorMum(True, mummerLink, folderName,
                                            "outAbunRefine", "abunMissOut",
                                            numberOfFiles)

    for eachitem in lenDic:
        #eachitem = "Segkk"+str(i)
        print eachitem, myCountDic[eachitem] / (1.0 * lenDic[eachitem])
        myCountDic[eachitem] = myCountDic[eachitem] / (1.0 * lenDic[eachitem])

    return myCountDic