Beispiel #1
0
def evaluateReads(bam, referenceFile, bed, outputFile, mainOutput):
    """Compare each read's simulated origin with where it was actually found.

    Every read delivered by ``SlamSeqBamFile`` is expected to carry its
    simulation info in its name, separated by underscores: field 0 is the
    simulated UTR name and field 2 is the simulated T>C count (parsed as int).
    One tab-separated row per read is written to ``outputFile`` and a summary
    line (percent correct, percent right-UTR/wrong-T>C, percent wrong-UTR,
    total reads) is printed to stdout.

    Args:
        bam: Path of the BAM file to evaluate (also echoed to stdout).
        referenceFile: Passed through to ``SlamSeqBamFile``.
        bed: Passed to ``bedToIntervallTree`` to build the per-chromosome
            interval lookup.
        outputFile: Path of the per-read table to create (overwritten).
        mainOutput: Not used by this function.
    """
    print("Run " + bam)

    # Go through one chr after the other
    testFile = SlamSeqBamFile(bam, referenceFile, None)
    chromosomes = testFile.getChromosomes()
    bedTree = bedToIntervallTree(bed)

    total = 0
    correct = 0
    correcPosWrongTC = 0
    wrongPos = 0

    minBaseQual = 0

    # The context manager guarantees the table is flushed and closed even if
    # a malformed read name raises halfway through.
    with open(outputFile, "w") as outFile:
        print("read.name", "read.chromosome", "read.startRefPos", "sim.utr",
              "read.utr", "sim.tcCount", "read.tcCount",
              sep="\t", file=outFile)

        for chromosome in chromosomes:
            for read in testFile.readsInChromosome(chromosome, minBaseQual):
                total += 1
                simInfo = read.name.split("_")
                utrSim = simInfo[0]
                tcCountSim = int(simInfo[2])

                utrFound = None
                if read.chromosome in bedTree:
                    overlaps = list(
                        bedTree[read.chromosome][read.startRefPos:read.endRefPos])
                    if len(overlaps) > 0:
                        # NOTE(review): if the lookup returns an unordered
                        # collection, the entry picked here is arbitrary when
                        # a read overlaps several intervals - confirm.
                        utrFound = overlaps[0].data

                if utrFound == utrSim:
                    if tcCountSim == read.tcCount:
                        correct += 1
                    else:
                        correcPosWrongTC += 1
                else:
                    wrongPos += 1

                print(read.name, read.chromosome, read.startRefPos, utrSim,
                      utrFound, tcCountSim, read.tcCount,
                      sep="\t", file=outFile)

    if total > 0:
        print(correct * 100.0 / total, correcPosWrongTC * 100.0 / total,
              wrongPos * 100.0 / total, total)
    else:
        # No reads at all: report zero percentages instead of dividing by zero.
        print(0.0, 0.0, 0.0, total)
Beispiel #2
0
def evaluateReads(bam, referenceFile, bed, outputFile, mainOutput):
    """Compare each read's simulated origin with where it was actually found.

    Every read delivered by ``SlamSeqBamFile`` is expected to carry its
    simulation info in its name, separated by underscores: field 0 is the
    simulated UTR name and field 2 is the simulated T>C count (parsed as int).
    One tab-separated row per read is written to ``outputFile`` and the raw
    counters (correct, right-UTR/wrong-T>C, wrong-UTR, total) are printed to
    stdout.

    Args:
        bam: Path of the BAM file to evaluate (also echoed to stdout).
        referenceFile: Passed through to ``SlamSeqBamFile``.
        bed: Passed to ``bedToIntervallTree`` to build the per-chromosome
            interval lookup.
        outputFile: Path of the per-read table to create (overwritten).
        mainOutput: Not used by this function.
    """
    print("Run " + bam)

    # Go through one chr after the other
    testFile = SlamSeqBamFile(bam, referenceFile, None)
    chromosomes = testFile.getChromosomes()
    bedTree = bedToIntervallTree(bed)

    total = 0
    correct = 0
    correcPosWrongTC = 0
    wrongPos = 0

    minBaseQual = 0

    # The context manager guarantees the table is flushed and closed even if
    # a malformed read name raises halfway through.
    with open(outputFile, "w") as outFile:
        print("read.name",
              "read.chromosome",
              "read.startRefPos",
              "sim.utr",
              "read.utr",
              "sim.tcCount",
              "read.tcCount",
              sep="\t",
              file=outFile)

        for chromosome in chromosomes:
            for read in testFile.readsInChromosome(chromosome, minBaseQual):
                total += 1
                simInfo = read.name.split("_")
                utrSim = simInfo[0]
                tcCountSim = int(simInfo[2])

                utrFound = None
                if read.chromosome in bedTree:
                    overlaps = list(
                        bedTree[read.chromosome][read.startRefPos:read.endRefPos])
                    if len(overlaps) > 0:
                        # NOTE(review): if the lookup returns an unordered
                        # collection, the entry picked here is arbitrary when
                        # a read overlaps several intervals - confirm.
                        utrFound = overlaps[0].data

                if utrFound == utrSim:
                    if tcCountSim == read.tcCount:
                        correct += 1
                    else:
                        correcPosWrongTC += 1
                else:
                    wrongPos += 1

                print(read.name,
                      read.chromosome,
                      read.startRefPos,
                      utrSim,
                      utrFound,
                      tcCountSim,
                      read.tcCount,
                      sep="\t",
                      file=outFile)

    # Raw counts (not percentages), so an empty input cannot divide by zero.
    print(correct, correcPosWrongTC, wrongPos, total)
Beispiel #3
0
def multimapUTRRetainment(infile, outfile, bed, minIdentity, NM, log):
    """Filter alignments, keeping multimappers only if they hit a single UTR.

    Alignments are read from ``infile`` in order. Unmapped reads and reads
    failing the identity ("XI" tag) or mismatch ("NM" tag) thresholds are
    dropped. Reads with mapping quality 0 are treated as multimappers: all
    consecutive alignments sharing a query name are buffered per overlapping
    interval name and handed to ``dumpBufferToBam`` only if none of the later
    alignments introduced an interval name that was not already buffered.
    All other reads are written to ``outfile`` directly. A small summary table
    is printed to ``log``.

    Args:
        infile: Iterable of alignments that also offers ``getrname``
            (presumably a pysam alignment file - confirm).
        outfile: Sink with a ``write(read)`` method; also passed to
            ``dumpBufferToBam``.
        bed: Passed to ``bedToIntervallTree`` to build the per-chromosome
            interval lookup.
        minIdentity: Reads whose "XI" tag is below this value are dropped.
        NM: Maximum allowed "NM" tag value; values <= -1 disable the check.
        log: Open text stream receiving the summary table.

    Returns:
        Tuple ``(mappedReads, unmappedReads, filteredReads, mqFiltered,
        idFiltered, nmFiltered, multimapper)``.
    """
    mappedReads = 0
    unmappedReads = 0
    filteredReads = 0

    # mqFiltered is never incremented here; it stays in the returned tuple so
    # the result shape is unchanged for callers.
    mqFiltered = 0
    idFiltered = 0
    nmFiltered = 0

    utrIntervallTreeDict = bedToIntervallTree(bed)

    # Buffers for multimappers
    multimapBuffer = {}
    prevRead = ""
    # If read maps to another than previously recorded UTR -> do not dump reads to file
    dumpBuffer = True
    # This string tracks all multiple alignments
    multimapList = ""

    for read in infile:
        if not read.is_secondary and not read.is_supplementary:
            if read.is_unmapped:
                unmappedReads += 1
            else:
                mappedReads += 1

        # First pass general filters
        if read.is_unmapped:
            continue
        if float(read.get_tag("XI")) < minIdentity:
            idFiltered += 1
            continue
        if NM > -1 and int(read.get_tag("NM")) > NM:
            nmFiltered += 1
            continue

        if read.mapping_quality == 0:
            # A new query name closes the previous multimapper group
            if read.query_name != prevRead and prevRead != "":
                if dumpBuffer and len(multimapBuffer) > 0:
                    dumpBufferToBam(multimapBuffer, multimapList, outfile, infile)
                    filteredReads += 1

                dumpBuffer = True
                multimapList = ""
                multimapBuffer = {}

            # Query interval tree for given chromosome for UTRs
            chrom = infile.getrname(read.reference_id)
            start = read.reference_start
            end = read.reference_end

            # "in" instead of dict.has_key(), which no longer exists in Python 3
            if chrom in utrIntervallTreeDict:
                query = utrIntervallTreeDict[chrom][start:end]
            else:
                query = set()

            if len(query) > 0:
                # First UTR hit is recorded without checks
                if len(multimapBuffer) == 0:
                    for result in query:
                        if result.data not in multimapBuffer:
                            multimapBuffer[result.data] = []
                        multimapBuffer[result.data].append(read)
                # Second UTR hit looks at previous UTR hits -> no dump if hit on different UTR
                else:
                    for result in query:
                        if result.data not in multimapBuffer:
                            multimapBuffer[result.data] = []
                            multimapBuffer[result.data].append(read)
                            dumpBuffer = False
                        else:
                            multimapBuffer[result.data].append(read)

            multimapList = multimapList + chrom + ":" + str(start) + "-" + str(end) + " "

            prevRead = read.query_name
        else:
            # Dump any multimappers before a unique mapper
            if len(multimapBuffer) > 0:
                if dumpBuffer:
                    dumpBufferToBam(multimapBuffer, multimapList, outfile, infile)
                    filteredReads += 1
                multimapBuffer = {}
                dumpBuffer = True
                multimapList = ""

            # Record all unique mappers
            prevRead = read.query_name
            outfile.write(read)
            filteredReads += 1

    # Dump last portion if it was multimapper
    if dumpBuffer and len(multimapBuffer) > 0:
        dumpBufferToBam(multimapBuffer, multimapList, outfile, infile)
        filteredReads += 1

    # Whatever was mapped but neither retained nor removed by ID/NM counts as
    # discarded multimapper.
    multimapper = mappedReads - filteredReads - idFiltered - nmFiltered

    print("Criterion\tFiltered reads", file=log)
    print("MQ < 0\t0", file=log)
    print("ID < " + str(minIdentity) + "\t" + str(idFiltered), file=log)
    print("NM > " + str(NM) + "\t" + str(nmFiltered), file=log)
    print("MM\t" + str(multimapper), file=log)

    return mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered, nmFiltered, multimapper
Beispiel #4
0
def multimapUTRRetainment(infile, outfile, bed, minIdentity, NM, log):
    """Write unique mappers and single-UTR multimappers to ``outfile``.

    Unmapped reads and reads failing the "XI" identity or "NM" mismatch
    threshold are skipped. A read with mapping quality 0 is collected, together
    with the following alignments of the same query name, in a buffer keyed by
    the name of each overlapping interval; the buffer is passed to
    ``dumpBufferToBam`` unless a later alignment of the group added an interval
    name that was not buffered yet. Every other read is written immediately.
    A summary table is printed to ``log``.

    Args:
        infile: Iterable of alignments that also offers ``getrname``
            (presumably a pysam alignment file - confirm).
        outfile: Sink with a ``write(read)`` method; also passed to
            ``dumpBufferToBam``.
        bed: Passed to ``bedToIntervallTree`` to build the interval lookup.
        minIdentity: Lower bound for the "XI" tag.
        NM: Upper bound for the "NM" tag; values <= -1 disable the check.
        log: Open text stream receiving the summary table.

    Returns:
        Tuple ``(mappedReads, unmappedReads, filteredReads, mqFiltered,
        idFiltered, nmFiltered, multimapper)``; ``mqFiltered`` is always 0.
    """
    mappedReads = unmappedReads = filteredReads = 0
    mqFiltered = idFiltered = nmFiltered = 0

    regionTrees = bedToIntervallTree(bed)

    pending = {}            # interval name -> alignments of the current group
    lastName = ""           # query name of the previously accepted read
    writePending = True     # cleared once the group touches a second interval
    alignmentTrail = ""     # "chr:start-end " for every alignment of the group

    for read in infile:
        if not (read.is_secondary or read.is_supplementary):
            if read.is_unmapped:
                unmappedReads += 1
            else:
                mappedReads += 1

        # General filters, applied to every alignment
        if read.is_unmapped:
            continue
        if float(read.get_tag("XI")) < minIdentity:
            idFiltered += 1
            continue
        if NM > -1 and int(read.get_tag("NM")) > NM:
            nmFiltered += 1
            continue

        if read.mapping_quality != 0:
            # Unique mapper: settle any open multimapper group first. The
            # state is only reset when something was actually buffered.
            if pending:
                if writePending:
                    dumpBufferToBam(pending, alignmentTrail, outfile, infile)
                    filteredReads += 1
                pending = {}
                writePending = True
                alignmentTrail = ""

            lastName = read.query_name
            outfile.write(read)
            filteredReads += 1
            continue

        # Multimapper: a changed query name closes the previous group
        if lastName != "" and read.query_name != lastName:
            if writePending and pending:
                dumpBufferToBam(pending, alignmentTrail, outfile, infile)
                filteredReads += 1
            pending = {}
            writePending = True
            alignmentTrail = ""

        chrom = infile.getrname(read.reference_id)
        begin = read.reference_start
        stop = read.reference_end

        if chrom in regionTrees:
            hits = regionTrees[chrom][begin:stop]
        else:
            hits = set()

        # Interval names seen for the first alignment of a group are accepted
        # as-is; a new name on a later alignment vetoes the whole group.
        hadEarlierHits = len(pending) > 0
        for hit in hits:
            if hadEarlierHits and hit.data not in pending:
                writePending = False
            pending.setdefault(hit.data, []).append(read)

        alignmentTrail = "".join(
            (alignmentTrail, chrom, ":", str(begin), "-", str(stop), " "))
        lastName = read.query_name

    # The input may end inside a multimapper group
    if writePending and pending:
        dumpBufferToBam(pending, alignmentTrail, outfile, infile)
        filteredReads += 1

    multimapper = mappedReads - filteredReads - idFiltered - nmFiltered

    print("Criterion\tFiltered reads", file=log)
    print("MQ < 0\t0", file=log)
    print("ID < " + str(minIdentity), str(idFiltered), sep="\t", file=log)
    print("NM > " + str(NM), str(nmFiltered), sep="\t", file=log)
    print("MM", str(multimapper), sep="\t", file=log)

    return (mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered,
            nmFiltered, multimapper)
Beispiel #5
0
def multimapUTRRetainment(infile, outfile, bed, minIdentity, NM, MQ, log):
    """Retain confident mappers plus low-MQ reads confined to one bed entry.

    Unmapped reads and reads failing the "XI" identity or "NM" mismatch
    threshold are skipped. Reads with mapping quality below ``MQ`` are grouped
    by query name and buffered per overlapping bed entry; a group is passed to
    ``dumpBufferToBam`` unless one of its later alignments added an entry name
    that was not buffered yet. Reads with mapping quality >= ``MQ`` are written
    to ``outfile`` directly. A summary table is printed to ``log``.

    Args:
        infile: Iterable of alignments that also offers
            ``get_reference_name`` (per the original comment, a pysam
            alignment file yielding AlignedSegment objects).
        outfile: Sink with a ``write(read)`` method; also passed to
            ``dumpBufferToBam``.
        bed: Passed to ``bedToIntervallTree`` to build the interval lookup.
        minIdentity: Lower bound for the "XI" tag.
        NM: Upper bound for the "NM" tag; values <= -1 disable the check.
        MQ: Mapping-quality threshold separating buffered from direct reads.
        log: Open text stream receiving the summary table.

    Returns:
        Tuple ``(mappedReads, unmappedReads, filteredReads, mqFiltered,
        idFiltered, nmFiltered, multimapper)``; ``mqFiltered`` is always 0.
    """
    mappedReads = 0
    unmappedReads = 0
    filteredReads = 0
    mqFiltered = 0
    idFiltered = 0
    nmFiltered = 0

    # Entries of the bed file, looked up per chromosome name
    entryTrees = bedToIntervallTree(bed)

    def entriesHitBy(chromName, first, last):
        """Return the bed entries overlapping the span, or an empty set."""
        if chromName not in entryTrees:
            return set()
        return entryTrees[chromName][first:last]

    def flush(buffered, trail, allowed):
        """Dump a non-empty, allowed buffer; return the filteredReads delta."""
        if allowed and len(buffered) > 0:
            dumpBufferToBam(buffered, trail, outfile, infile)
            return 1
        return 0

    group = {}          # entry name -> alignments of the current read group
    previousName = ""
    keepGroup = True    # False once the group spans more than its first entries
    trail = ""          # textual list of all alignments of the group

    for read in infile:
        if not (read.is_secondary or read.is_supplementary):
            if read.is_unmapped:
                unmappedReads += 1
            else:
                mappedReads += 1

        # General filters
        if read.is_unmapped:
            continue
        if float(read.get_tag("XI")) < minIdentity:
            idFiltered += 1
            continue
        if NM > -1 and int(read.get_tag("NM")) > NM:
            nmFiltered += 1
            continue

        if read.mapping_quality < MQ:
            # A different query name ends the group collected so far
            if previousName != "" and read.query_name != previousName:
                filteredReads += flush(group, trail, keepGroup)
                group = {}
                keepGroup = True
                trail = ""

            chrom = infile.get_reference_name(read.reference_id)
            start = read.reference_start
            end = read.reference_end

            groupWasEmpty = len(group) == 0
            for entry in entriesHitBy(chrom, start, end):
                label = entry.data
                if label in group:
                    group[label].append(read)
                else:
                    group[label] = [read]
                    # A new entry name is fine for the first alignment of a
                    # group, but disqualifies the group afterwards.
                    if not groupWasEmpty:
                        keepGroup = False

            trail = trail + chrom + ":" + str(start) + "-" + str(end) + " "
            previousName = read.query_name
        else:
            # Mapping quality >= MQ: settle an open group, then keep the read.
            # State is only reset when something was actually buffered.
            if len(group) > 0:
                filteredReads += flush(group, trail, keepGroup)
                group = {}
                keepGroup = True
                trail = ""

            previousName = read.query_name
            outfile.write(read)
            filteredReads += 1

    # The input may end inside a buffered group
    filteredReads += flush(group, trail, keepGroup)

    multimapper = mappedReads - filteredReads - idFiltered - nmFiltered

    print("Criterion\tFiltered reads", file=log)
    print("MQ < 0\t0", file=log)
    print("ID < " + str(minIdentity) + "\t" + str(idFiltered), file=log)
    print("NM > " + str(NM) + "\t" + str(nmFiltered), file=log)
    print("MM\t" + str(multimapper), file=log)

    return (mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered,
            nmFiltered, multimapper)