Esempio n. 1
0
def computeTconversions(ref,
                        bed,
                        snpsFile,
                        bam,
                        maxReadLength,
                        minQual,
                        outputCSV,
                        outputBedgraphPlus,
                        outputBedgraphMinus,
                        conversionThreshold,
                        log,
                        mle=False):
    """Count T>C conversions per BED interval and write the result tables.

    For every interval in ``bed`` the reads returned by
    ``SlamSeqBamFile.readInRegion`` are tallied and one ``SlamSeqInterval``
    line is written to ``outputCSV``. Per-position conversion rates on
    covered T positions (A on the minus strand) are written to the plus and
    minus bedGraph files.

    Args:
        ref: Reference FASTA path (opened with ``pysam.FastaFile``).
        bed: BED file with stranded intervals.
        snpsFile: Passed to ``SNPtools.SNPDictionary``.
        bam: Filtered BAM file produced by ``slamdunk filter``.
        maxReadLength: Forwarded to ``readInRegion``.
        minQual: Forwarded to ``readInRegion``.
        outputCSV: Path of the per-interval count table.
        outputBedgraphPlus: Path of the plus-strand bedGraph output.
        outputBedgraphMinus: Path of the bedGraph output for all other
            strands.
        conversionThreshold: Forwarded to ``readInRegion``.
        log: Writable file object that receives the annotation MD5 warning.
        mle: If true, additionally write a ``_perread`` table and run the
            ``compute_conversion_rate_mle`` plotter script on it.

    Raises:
        RuntimeError: If the BAM version does not match ``__bam_version__``,
            an interval has no strand, or an interval start is negative.
    """
    referenceFile = pysam.FastaFile(ref)
    # Build the lookup once instead of materialising a list per interval.
    referenceNames = set(referenceFile.references)

    sampleInfo = getSampleInfo(bam)

    slamseqInfo = SlamSeqInfo(bam)
    readNumber = slamseqInfo.FilteredReads

    bedMD5 = md5(bed)

    def writeHeader(handle):
        # Shared three-line header of the count table and the per-read table.
        print("#slamdunk v" + __version__,
              __count_version__,
              "sample info:",
              sampleInfo.Name,
              sampleInfo.ID,
              sampleInfo.Type,
              sampleInfo.Time,
              sep="\t",
              file=handle)
        print("#annotation:",
              os.path.basename(bed),
              bedMD5,
              sep="\t",
              file=handle)
        print(SlamSeqInterval.Header, file=handle)

    # (chromosome, position, strand) -> conversion rate in percent.
    # Tuple keys keep contig names that contain ":" intact.
    conversionBedGraph = {}

    fileNameTest = None
    fileTest = None
    fileCSV = None
    try:
        if mle:
            fileNameTest = replaceExtension(outputCSV, ".tsv", "_perread")
            fileTest = open(fileNameTest, 'w')
            writeHeader(fileTest)

        fileCSV = open(outputCSV, 'w')
        writeHeader(fileCSV)

        snps = SNPtools.SNPDictionary(snpsFile)
        snps.read()

        testFile = SlamSeqBamFile(bam, ref, snps)
        if not testFile.bamVersion == __bam_version__:
            raise RuntimeError("Wrong filtered BAM file version detected (" +
                               testFile.bamVersion + "). Expected version " +
                               __bam_version__ +
                               ". Please rerun slamdunk filter.")

        if slamseqInfo.AnnotationMD5 != bedMD5:
            print(
                "Warning: MD5 checksum of annotation (" + bedMD5 +
                ") does not matched MD5 in filtered BAM files (" +
                slamseqInfo.AnnotationMD5 +
                "). Most probably the annotation filed changed after the filtered BAM files were created.",
                file=log)

        # Go through one interval after the other
        for utr in BedIterator(bed):
            Tcontent = 0
            # All-zero records are written when the interval has no reads on
            # its own strand.
            slamSeqUtr = SlamSeqInterval(utr.chromosome, utr.start, utr.stop,
                                         utr.strand, utr.name, Tcontent, 0, 0,
                                         0, 0, 0, 0, 0)
            slamSeqUtrMLE = SlamSeqInterval(utr.chromosome, utr.start,
                                            utr.stop, utr.strand, utr.name,
                                            Tcontent, 0, 0, 0, 0, 0, 0, 0)
            if not utr.hasStrand():
                raise RuntimeError(
                    "Input BED file does not contain stranded intervals.")

            if utr.start < 0:
                # str() is required: concatenating the BED entry object
                # itself raised TypeError instead of this error.
                raise RuntimeError(
                    "Negativ start coordinate found. Please check the following entry in your BED file: "
                    + str(utr))

            if utr.chromosome in referenceNames:
                refSeq = referenceFile.fetch(reference=utr.chromosome,
                                             start=utr.start,
                                             end=utr.stop).upper()
                # The sequence is not reverse-complemented, so Ts of a
                # minus-strand interval show up as As.
                Tcontent = refSeq.count("A" if utr.strand == "-" else "T")
                # NOTE(review): only the plain record gets the T content
                # here; the MLE placeholder keeps 0, as before - confirm.
                slamSeqUtr._Tcontent = Tcontent

            readIterator = testFile.readInRegion(utr.chromosome, utr.start,
                                                 utr.stop, utr.strand,
                                                 maxReadLength, minQual,
                                                 conversionThreshold)

            utrLength = utr.getLength()
            tcCountUtr = [0] * utrLength
            coverageUtr = [0] * utrLength

            tInReads = []
            tcInRead = []

            countFwd = 0
            tcCountFwd = 0
            countRev = 0
            tcCountRev = 0

            multiMapFwd = 0
            multiMapRev = 0

            for read in readIterator:

                # Overwrite any conversions for non-TC reads
                if not read.isTcRead:
                    read.tcCount = 0
                    read.mismatches = []
                    read.conversionRates = 0.0
                    read.tcRate = 0.0

                isReverse = read.direction == ReadDirection.Reverse

                if isReverse:
                    countRev += 1
                    if read.tcCount > 0:
                        tcCountRev += 1
                    if read.isMultimapper:
                        multiMapRev += 1
                else:
                    countFwd += 1
                    if read.tcCount > 0:
                        tcCountFwd += 1
                    if read.isMultimapper:
                        multiMapFwd += 1

                # n = Ts in the read (plus mismatched Ts), k = T>C mismatches
                testN = read.getTcount()
                testk = 0
                for mismatch in read.mismatches:
                    if 0 <= mismatch.referencePosition < utrLength:
                        if mismatch.isT(isReverse):
                            testN += 1
                        if mismatch.isTCMismatch(isReverse):
                            testk += 1
                            tcCountUtr[mismatch.referencePosition] += 1
                tInReads.append(testN)
                tcInRead.append(testk)

                # Clamp the read span to the interval before counting coverage
                for i in range(max(read.startRefPos, 0),
                               min(read.endRefPos, utrLength)):
                    coverageUtr[i] += 1

            if ((utr.strand == "+" and countFwd > 0)
                    or (utr.strand == "-" and countRev > 0)):
                tcRateUtr = [
                    x * 100.0 / y if y > 0 else 0
                    for x, y in zip(tcCountUtr, coverageUtr)
                ]

                if utr.strand == "-":
                    readCount = countRev
                    tcReadCount = tcCountRev
                    multiMapCount = multiMapRev
                else:
                    readCount = countFwd
                    tcReadCount = tcCountFwd
                    multiMapCount = multiMapFwd

                if ((utr.strand == "-" and countFwd > countRev)
                        or (utr.strand == "+" and countRev > countFwd)):
                    # Counters are ints and must be converted before joining.
                    print(
                        "Warning: " + utr.name + " is located on the " +
                        utr.strand +
                        " strand but read counts are higher for the opposite strand (fwd: "
                        + str(countFwd) + ", rev: " + str(countRev) + ")",
                        file=sys.stderr)

                refSeq = readIterator.getRefSeq()

                # Number of reads on T positions and number of reads with
                # T->C conversions on T positions
                coverageOnTs = 0
                conversionsOnTs = 0
                # Strand is "+" or "-" inside this branch.
                targetBase = "A" if utr.strand == "-" else "T"

                for position, coverage in enumerate(coverageUtr):
                    if coverage > 0 and refSeq[position] == targetBase:
                        coverageOnTs += coverage
                        conversionsOnTs += tcCountUtr[position]
                        conversionBedGraph[(utr.chromosome,
                                            utr.start + position,
                                            str(utr.strand))] = tcRateUtr[position]

                # reads per million mapped to the UTR
                readsCPM = 0
                if readNumber > 0:
                    readsCPM = readCount * 1000000.0 / readNumber

                conversionRate = 0
                if coverageOnTs > 0:
                    conversionRate = float(conversionsOnTs) / float(
                        coverageOnTs)

                slamSeqUtr = SlamSeqInterval(utr.chromosome, utr.start,
                                             utr.stop, utr.strand, utr.name,
                                             Tcontent, readsCPM, coverageOnTs,
                                             conversionsOnTs, conversionRate,
                                             readCount, tcReadCount,
                                             multiMapCount)
                # The per-read table stores comma-joined n and k lists in the
                # two read-count columns.
                slamSeqUtrMLE = SlamSeqInterval(
                    utr.chromosome, utr.start, utr.stop, utr.strand, utr.name,
                    Tcontent, readsCPM, coverageOnTs, conversionsOnTs,
                    conversionRate, ",".join(str(x) for x in tInReads),
                    ",".join(str(x) for x in tcInRead), multiMapCount)

            print(slamSeqUtr, file=fileCSV)
            if mle:
                print(slamSeqUtrMLE, file=fileTest)
    finally:
        # Close the tables even when an interval raises.
        if fileCSV is not None:
            fileCSV.close()
        if fileTest is not None:
            fileTest.close()

    with open(outputBedgraphPlus, 'w') as fileBedgraphPlus, \
            open(outputBedgraphMinus, 'w') as fileBedgraphMinus:
        for (chromosome, position, strand), rate in conversionBedGraph.items():
            target = fileBedgraphPlus if strand == "+" else fileBedgraphMinus
            print(chromosome, position, position + 1, rate, file=target)

    if mle:
        fileNameMLE = replaceExtension(outputCSV, ".tsv", "_mle")
        callR(
            getPlotter("compute_conversion_rate_mle") + " -f " + fileNameTest +
            " -r " + "0.024" + " -o " + fileNameMLE + " &> /dev/null")
Esempio n. 2
0
def addTcConversions(bed,
                     readInFile,
                     readOutFile,
                     pulseTimePoint,
                     chaseTimePoint,
                     utrSummaryFile,
                     conversionRate,
                     librarySize,
                     sampleInfo,
                     labeledTranscripts=-1.0):
    """Add T>C conversions to FASTA reads and write them as a sorted BAM.

    Reads are taken from ``readInFile`` and grouped by the UTR name derived
    from each record id; consecutive records with the same name form one
    group. Every group is passed to ``addTcConversionsToReads`` (which
    writes to a temporary SAM file) and summarised in ``utrSummaryFile``.
    The SAM file is then converted and sorted with samtools.

    Args:
        bed: BED file describing the UTRs (parsed with ``parseUtrBedFile``).
        readInFile: FASTA file with the reads.
        readOutFile: Path of the sorted BAM output.
        pulseTimePoint: Forwarded to ``computeConversionRate``.
        chaseTimePoint: Forwarded to ``computeConversionRate``.
        utrSummaryFile: Path of the per-UTR summary table.
        conversionRate: Forwarded to ``addTcConversionsToReads``.
        librarySize: Denominator of the reads-per-million value.
        sampleInfo: Object providing ``Name``, ``ID``, ``Type`` and ``Time``
            for the summary header.
        labeledTranscripts: Forwarded to ``computeConversionRate``.
    """
    # Read utrs from BED file
    utrs = parseUtrBedFile(bed)
    bedMD5 = md5(bed)

    readOutTemp = readOutFile + "_tmp.sam"

    with open(readOutTemp, "w") as readOutSAM, \
            open(utrSummaryFile, "w") as utrSummary:
        print("@HD\tVN:1.0\tSO:unsorted", file=readOutSAM)

        print("#slamdunk v" + __version__,
              __count_version__,
              "sample info:",
              sampleInfo.Name,
              sampleInfo.ID,
              sampleInfo.Type,
              sampleInfo.Time,
              sep="\t",
              file=utrSummary)
        print("#annotation:",
              os.path.basename(bed),
              bedMD5,
              sep="\t",
              file=utrSummary)
        print(SlamSeqInterval.Header, file=utrSummary)

        def processUtr(name, utrReads):
            # Convert one UTR's reads and append its summary line.
            utr = utrs[name]
            readsCPM = len(utrReads) * 1000000.0 / librarySize
            readToConvertPercent = computeConversionRate(
                utr.score, pulseTimePoint, chaseTimePoint, labeledTranscripts)
            readsWithTC, totalTCount, totalTcCount = addTcConversionsToReads(
                utr, utrReads, readToConvertPercent, conversionRate,
                readOutSAM)
            printUtrSummary(utr, len(utrReads), readsWithTC, totalTCount,
                            totalTcCount, utrSummary, readsCPM,
                            readToConvertPercent)

        reads = []
        lastUtrName = None

        with open(readInFile) as fastaHandle:
            for entry in SeqIO.parse(fastaHandle, 'fasta'):
                utrName = getUtrName(entry.id)
                if lastUtrName is not None and utrName != lastUtrName:
                    processUtr(lastUtrName, reads)
                    reads = []
                # The current record always joins the active batch. The
                # earlier code dropped the first read of every UTR after
                # the first one.
                reads.append(entry)
                lastUtrName = utrName

        # Last UTR; skipped for an empty input instead of failing on the
        # None lookup.
        if reads:
            processUtr(lastUtrName, reads)

    readOutTempBAM = readOutFile + "_tmp.bam"
    # Convert to BAM
    run("samtools view -Sb " + readOutTemp + " > " + readOutTempBAM)

    # Sort the reads with samtools into the final output file
    run("samtools sort -o " + readOutFile + " " + readOutTempBAM)
    os.unlink(readOutTemp)
    os.unlink(readOutTempBAM)
Esempio n. 3
0
def Filter(inputBAM, outputBAM, log, bed, MQ=2, minIdentity=0.8, NM=-1, printOnly=False, verbose=True, force=False):
    """Filter alignments of ``inputBAM`` into a sorted, indexed ``outputBAM``.

    Without a BED file, reads are dropped when they are unmapped, have a
    mapping quality below ``MQ``, an ``XI`` tag below ``minIdentity``, or
    (if ``NM`` > -1) an ``NM`` tag above ``NM``. With a BED file the work is
    delegated to ``multimapUTRRetainment``. The read counters are stored in
    the ``DS`` field of the first @RG header entry and a slamdunk @PG entry
    is appended before sorting and indexing.

    Args:
        inputBAM: Path of the BAM file to filter.
        outputBAM: Path of the filtered BAM file.
        log: Writable file object for progress and statistics.
        bed: Optional BED annotation; ``None`` selects default filtering.
        MQ: Minimum mapping quality (default filtering only).
        minIdentity: Minimum value of the ``XI`` tag.
        NM: Maximum value of the ``NM`` tag; -1 disables the check.
        printOnly: If true, the step runs without consulting ``checkStep``.
        verbose: Forwarded to ``bamSort``.
        force: Forwarded to ``checkStep``.
    """
    if not (printOnly or checkStep([inputBAM], [outputBAM], force)):
        print("Skipped filtering for " + inputBAM, file=log)
        return

    mappedReads = 0
    unmappedReads = 0
    filteredReads = 0

    mqFiltered = 0
    idFiltered = 0
    nmFiltered = 0
    multimapper = 0

    infile = pysam.AlignmentFile(inputBAM, "rb")
    outfile = pysam.AlignmentFile(outputBAM, "wb", template=infile)

    try:
        if bed is None:
            # Default filtering without bed
            print("#No bed-file supplied. Running default filtering on " + inputBAM + ".", file=log)

            for read in infile:
                # Only primary alignments contribute to the read counters
                isPrimary = not read.is_secondary and not read.is_supplementary

                if read.is_unmapped:
                    if isPrimary:
                        unmappedReads += 1
                    continue
                if isPrimary:
                    mappedReads += 1

                if read.mapping_quality < MQ:
                    mqFiltered += 1
                    continue
                if float(read.get_tag("XI")) < minIdentity:
                    idFiltered += 1
                    continue
                if NM > -1 and int(read.get_tag("NM")) > NM:
                    nmFiltered += 1
                    continue

                if isPrimary:
                    filteredReads += 1

                outfile.write(read)

            print("Criterion\tFiltered reads", file=log)
            print("MQ < " + str(MQ) + "\t" + str(mqFiltered), file=log)
            print("ID < " + str(minIdentity) + "\t" + str(idFiltered), file=log)
            print("NM > " + str(NM) + "\t" + str(nmFiltered), file=log)
            print("MM\t0", file=log)
        else:
            # Multimap retention strategy filtering when bed is supplied
            random.seed(1)

            print("#Bed-file supplied. Running multimap retention filtering strategy on " + inputBAM + ".", file=log)

            mappedReads, unmappedReads, filteredReads, mqFiltered, idFiltered, nmFiltered, multimapper = multimapUTRRetainment(infile, outfile, bed, minIdentity, NM, log)

        # Always work on a plain dict: newer pysam returns an AlignmentHeader
        # object, which cannot take the @PG update below when the file has
        # no @RG line.
        inFileBamHeader = outfile.header
        if not isinstance(inFileBamHeader, dict):
            inFileBamHeader = inFileBamHeader.to_dict()
    finally:
        infile.close()
        outfile.close()

    # Add number of sequenced and number of mapped reads to the read group description
    # Used for creating summary file
    if 'RG' in inFileBamHeader and len(inFileBamHeader['RG']) > 0:
        slamseqInfo = SlamSeqInfo()
        slamseqInfo.SequencedReads = mappedReads + unmappedReads
        slamseqInfo.MappedReads = mappedReads
        slamseqInfo.FilteredReads = filteredReads
        slamseqInfo.MQFilteredReads = mqFiltered
        slamseqInfo.IdFilteredReads = idFiltered
        slamseqInfo.NmFilteredReads = nmFiltered
        slamseqInfo.MultimapperReads = multimapper

        if bed is not None:
            slamseqInfo.AnnotationName = os.path.basename(bed)
            slamseqInfo.AnnotationMD5 = md5(bed)
        else:
            slamseqInfo.AnnotationName = ""
            slamseqInfo.AnnotationMD5 = ""

        inFileBamHeader['RG'][0]['DS'] = str(slamseqInfo)

    slamDunkPG = { 'ID': 'slamdunk', 'PN': 'slamdunk filter v' + __version__, 'VN': __bam_version__ }
    if 'PG' in inFileBamHeader:
        inFileBamHeader['PG'].append(slamDunkPG)
    else:
        inFileBamHeader['PG'] = [ slamDunkPG ]

    # Sort afterwards
    bamSort(outputBAM, log, inFileBamHeader, verbose)

    pysamIndex(outputBAM)
Esempio n. 4
0
def Filter(inputBAM,
           outputBAM,
           log,
           bed,
           MQ=2,
           minIdentity=0.8,
           NM=-1,
           printOnly=False,
           verbose=True,
           force=False):
    """Filter alignments of ``inputBAM`` into a sorted, indexed ``outputBAM``.

    Without a BED file, reads are dropped when they are unmapped, have a
    mapping quality below ``MQ``, an ``XI`` tag below ``minIdentity``, or
    (if ``NM`` > -1) an ``NM`` tag above ``NM``. With a BED file the work is
    delegated to ``multimapUTRRetainment``. The read counters are stored in
    the ``DS`` field of the first @RG header entry and a slamdunk @PG entry
    is appended before sorting and indexing.

    Args:
        inputBAM: Path of the BAM file to filter.
        outputBAM: Path of the filtered BAM file.
        log: Writable file object for progress and statistics.
        bed: Optional BED annotation; ``None`` selects default filtering.
        MQ: Minimum mapping quality (default filtering only).
        minIdentity: Minimum value of the ``XI`` tag.
        NM: Maximum value of the ``NM`` tag; -1 disables the check.
        printOnly: If true, the step runs without consulting ``checkStep``.
        verbose: Forwarded to ``bamSort``.
        force: Forwarded to ``checkStep``.
    """
    if not (printOnly or checkStep([inputBAM], [outputBAM], force)):
        print("Skipped filtering for " + inputBAM, file=log)
        return

    mappedReads = 0
    unmappedReads = 0
    filteredReads = 0

    mqFiltered = 0
    idFiltered = 0
    nmFiltered = 0
    multimapper = 0

    infile = pysam.AlignmentFile(inputBAM, "rb")
    outfile = pysam.AlignmentFile(outputBAM, "wb", template=infile)

    try:
        if bed is None:
            # Default filtering without bed
            print("#No bed-file supplied. Running default filtering on " +
                  inputBAM + ".",
                  file=log)

            for read in infile:
                # Only primary alignments contribute to the read counters
                isPrimary = (not read.is_secondary
                             and not read.is_supplementary)

                if read.is_unmapped:
                    if isPrimary:
                        unmappedReads += 1
                    continue
                if isPrimary:
                    mappedReads += 1

                if read.mapping_quality < MQ:
                    mqFiltered += 1
                    continue
                if float(read.get_tag("XI")) < minIdentity:
                    idFiltered += 1
                    continue
                if NM > -1 and int(read.get_tag("NM")) > NM:
                    nmFiltered += 1
                    continue

                if isPrimary:
                    filteredReads += 1

                outfile.write(read)

            print("Criterion\tFiltered reads", file=log)
            print("MQ < " + str(MQ) + "\t" + str(mqFiltered), file=log)
            print("ID < " + str(minIdentity) + "\t" + str(idFiltered),
                  file=log)
            print("NM > " + str(NM) + "\t" + str(nmFiltered), file=log)
            print("MM\t0", file=log)
        else:
            # Multimap retention strategy filtering when bed is supplied
            random.seed(1)

            print(
                "#Bed-file supplied. Running multimap retention filtering strategy on "
                + inputBAM + ".",
                file=log)

            (mappedReads, unmappedReads, filteredReads, mqFiltered,
             idFiltered, nmFiltered, multimapper) = multimapUTRRetainment(
                 infile, outfile, bed, minIdentity, NM, log)

        # Always work on a plain dict: newer pysam returns an AlignmentHeader
        # object, which cannot take the @PG update below when the file has
        # no @RG line.
        inFileBamHeader = outfile.header
        if not isinstance(inFileBamHeader, dict):
            inFileBamHeader = inFileBamHeader.to_dict()
    finally:
        infile.close()
        outfile.close()

    # Add number of sequenced and number of mapped reads to the read group description
    # Used for creating summary file
    if 'RG' in inFileBamHeader and len(inFileBamHeader['RG']) > 0:
        slamseqInfo = SlamSeqInfo()
        slamseqInfo.SequencedReads = mappedReads + unmappedReads
        slamseqInfo.MappedReads = mappedReads
        slamseqInfo.FilteredReads = filteredReads
        slamseqInfo.MQFilteredReads = mqFiltered
        slamseqInfo.IdFilteredReads = idFiltered
        slamseqInfo.NmFilteredReads = nmFiltered
        slamseqInfo.MultimapperReads = multimapper

        if bed is not None:
            slamseqInfo.AnnotationName = os.path.basename(bed)
            slamseqInfo.AnnotationMD5 = md5(bed)
        else:
            slamseqInfo.AnnotationName = ""
            slamseqInfo.AnnotationMD5 = ""

        inFileBamHeader['RG'][0]['DS'] = str(slamseqInfo)

    slamDunkPG = {
        'ID': 'slamdunk',
        'PN': 'slamdunk filter v' + __version__,
        'VN': __bam_version__
    }
    if 'PG' in inFileBamHeader:
        inFileBamHeader['PG'].append(slamDunkPG)
    else:
        inFileBamHeader['PG'] = [slamDunkPG]

    # Sort afterwards
    bamSort(outputBAM, log, inFileBamHeader, verbose)

    pysamIndex(outputBAM)
Esempio n. 5
0
def addTcConversions(bed, readInFile, readOutFile, pulseTimePoint, chaseTimePoint, utrSummaryFile, conversionRate, librarySize, sampleInfo, labeledTranscripts = -1.0):
    """Add T>C conversions to FASTA reads and write them as a sorted BAM.

    Reads are taken from ``readInFile`` and grouped by the UTR name derived
    from each record id; consecutive records with the same name form one
    group. Every group is passed to ``addTcConversionsToReads`` (which
    writes to a temporary SAM file) and summarised in ``utrSummaryFile``.
    The SAM file is then converted and sorted with samtools.

    Args:
        bed: BED file describing the UTRs (parsed with ``parseUtrBedFile``).
        readInFile: FASTA file with the reads.
        readOutFile: Path of the sorted BAM output.
        pulseTimePoint: Forwarded to ``computeConversionRate``.
        chaseTimePoint: Forwarded to ``computeConversionRate``.
        utrSummaryFile: Path of the per-UTR summary table.
        conversionRate: Forwarded to ``addTcConversionsToReads``.
        librarySize: Denominator of the reads-per-million value.
        sampleInfo: Object providing ``Name``, ``ID``, ``Type`` and ``Time``
            for the summary header.
        labeledTranscripts: Forwarded to ``computeConversionRate``.
    """
    # Read utrs from BED file
    utrs = parseUtrBedFile(bed)
    bedMD5 = md5(bed)

    readOutTemp = readOutFile + "_tmp.sam"

    with open(readOutTemp, "w") as readOutSAM, open(utrSummaryFile, "w") as utrSummary:
        print("@HD\tVN:1.0\tSO:unsorted", file=readOutSAM)

        print("#slamdunk v" + __version__, __count_version__, "sample info:", sampleInfo.Name, sampleInfo.ID, sampleInfo.Type, sampleInfo.Time, sep="\t", file=utrSummary)
        print("#annotation:", os.path.basename(bed), bedMD5, sep="\t", file=utrSummary)
        print(SlamSeqInterval.Header, file=utrSummary)

        def flushUtr(name, batch):
            # Convert the reads of one UTR and append its summary line.
            utr = utrs[name]
            readsCPM = len(batch) * 1000000.0 / librarySize
            readToConvertPercent = computeConversionRate(utr.score, pulseTimePoint, chaseTimePoint, labeledTranscripts)
            readsWithTC, totalTCount, totalTcCount = addTcConversionsToReads(utr, batch, readToConvertPercent, conversionRate, readOutSAM)
            printUtrSummary(utr, len(batch), readsWithTC, totalTCount, totalTcCount, utrSummary, readsCPM, readToConvertPercent)

        reads = []
        lastUtrName = None

        with open(readInFile) as fastaHandle:
            for entry in SeqIO.parse(fastaHandle, 'fasta'):
                utrName = getUtrName(entry.id)
                if lastUtrName is not None and utrName != lastUtrName:
                    flushUtr(lastUtrName, reads)
                    reads = []
                # The current record always joins the active batch. The
                # earlier code dropped the first read of every UTR after
                # the first one.
                reads.append(entry)
                lastUtrName = utrName

        # Last UTR; skipped for an empty input instead of failing on the
        # None lookup.
        if reads:
            flushUtr(lastUtrName, reads)

    readOutTempBAM = readOutFile + "_tmp.bam"
    # Convert to BAM
    run("samtools view -Sb " + readOutTemp + " > " + readOutTempBAM)

    # Sort the reads with samtools into the final output file
    run("samtools sort -o " + readOutFile + " " + readOutTempBAM)
    os.unlink(readOutTemp)
    os.unlink(readOutTempBAM)
Esempio n. 6
0
def Filter(inputBAM,
           outputBAM,
           log,
           bed,
           MQ=2,
           minIdentity=0.8,
           NM=-1,
           printOnly=False,
           verbose=True,
           force=False,
           paired=False):
    """Filter alignments of ``inputBAM`` into a sorted ``outputBAM``.

    Without a BED file the reads are filtered on mapping quality (``MQ``),
    the ``XI`` identity tag (``minIdentity``) and the ``NM`` tag (``NM``).
    In paired mode a pair is only discarded when both mates fail a
    criterion, and both mates are written together. With a BED file the
    work is delegated to ``multimapUTRRetainment``. The read counters are
    stored in the ``DS`` field of the first @RG header entry and a slamdunk
    @PG entry is appended before sorting.

    Args:
        inputBAM: Path of the BAM file to filter (``~`` is expanded).
        outputBAM: Path of the filtered BAM file (``~`` is expanded).
        log: Writable file object for progress and statistics.
        bed: Optional BED annotation; ``None`` selects default filtering.
        MQ: Minimum mapping quality.
        minIdentity: Minimum value of the ``XI`` tag.
        NM: Maximum value of the ``NM`` tag; -1 disables the check.
        printOnly: If true, the step runs without consulting ``checkStep``.
        verbose: Forwarded to ``bamSort``.
        force: Forwarded to ``checkStep``.
        paired: Treat the input as paired-end; the output is then not
            indexed.
    """
    inputBAM = os.path.expanduser(inputBAM)
    outputBAM = os.path.expanduser(outputBAM)
    if not (printOnly or checkStep([inputBAM], [outputBAM], force)):
        print("Skipped filtering for " + inputBAM, file=log)
        return

    mappedReads = unmappedReads = filteredReads = 0
    mqFiltered = idFiltered = nmFiltered = multimapper = 0

    infile = pysam.AlignmentFile(inputBAM, "rb")
    outfile = pysam.AlignmentFile(outputBAM, "wb", template=infile)
    try:
        if bed is None:
            # Default filtering without bed
            print("#No bed-file supplied. Running default filtering on " +
                  inputBAM + ".",
                  file=log)
            # Mates of the pair currently being assembled (paired mode only)
            read1 = None
            read2 = None
            for read in infile:
                if paired:
                    if (not read.is_paired or read.mate_is_unmapped
                            or read.is_duplicate):
                        unmappedReads += 1
                        continue
                    if read.is_read2:
                        read2 = read
                    else:
                        # Remember the first mate and wait for its partner
                        read1 = read
                        read2 = None
                        continue

                if not read.is_secondary and not read.is_supplementary:
                    if read.is_unmapped:
                        unmappedReads += 1
                        continue
                    mappedReads += 1

                if not paired:
                    if read.mapping_quality < MQ:
                        mqFiltered += 1
                        continue
                    if float(read.get_tag("XI")) < minIdentity:
                        idFiltered += 1
                        continue
                    if -1 < NM < int(read.get_tag("NM")):
                        nmFiltered += 1
                        continue

                    filteredReads += 1
                    outfile.write(read)
                else:
                    if read1 is None or read2 is None:
                        continue
                    if read1.query_name != read2.query_name:
                        continue

                    # A pair is only dropped when both mates fail
                    if (read1.mapping_quality < MQ
                            and read2.mapping_quality < MQ):
                        mqFiltered += 1
                        continue
                    if (float(read1.get_tag("XI")) < minIdentity
                            and float(read2.get_tag("XI")) < minIdentity):
                        idFiltered += 1
                        continue
                    if (-1 < NM < int(read1.get_tag("NM"))
                            and -1 < NM < int(read2.get_tag("NM"))):
                        nmFiltered += 1
                        continue
                    filteredReads += 1
                    outfile.write(read1)
                    outfile.write(read2)

            print("Criterion\tFiltered reads", file=log)
            # Report the real threshold and count; this line used to be the
            # constant "MQ < 0\t0".
            print("MQ < %s\t%s" % (MQ, mqFiltered), file=log)
            print("ID < %s\t%s" % (minIdentity, idFiltered), file=log)
            print("NM > %s\t%s" % (NM, nmFiltered), file=log)
            print("MM\t0", file=log)
        else:
            # Multimap retention strategy filtering when bed is supplied
            print(
                "#Bed-file supplied. Running multimap retention filtering strategy on "
                + inputBAM + ".",
                file=log)
            (mappedReads, unmappedReads, filteredReads, mqFiltered,
             idFiltered, nmFiltered,
             multimapper) = multimapUTRRetainment(infile, outfile, bed,
                                                  minIdentity, NM, MQ, log)

        # Always work on a plain dict: newer pysam returns an AlignmentHeader
        # object, which cannot take the @PG update below when the file has
        # no @RG line.
        inFileBamHeader = outfile.header
        if not isinstance(inFileBamHeader, dict):
            inFileBamHeader = inFileBamHeader.to_dict()
    finally:
        infile.close()
        outfile.close()

    # Add number of sequenced and number of mapped reads to the read group description
    # Used for creating summary file
    if "RG" in inFileBamHeader and len(inFileBamHeader["RG"]) > 0:
        slamseqInfo = SlamSeqInfo()
        slamseqInfo.SequencedReads = mappedReads + unmappedReads
        slamseqInfo.MappedReads = mappedReads
        slamseqInfo.FilteredReads = filteredReads
        slamseqInfo.MQFilteredReads = mqFiltered
        slamseqInfo.IdFilteredReads = idFiltered
        slamseqInfo.NmFilteredReads = nmFiltered
        slamseqInfo.MultimapperReads = multimapper

        if bed:
            slamseqInfo.AnnotationName = os.path.basename(bed)
            slamseqInfo.AnnotationMD5 = md5(bed)
        else:
            slamseqInfo.AnnotationName = ""
            slamseqInfo.AnnotationMD5 = ""

        inFileBamHeader["RG"][0]["DS"] = str(slamseqInfo)

    slamDunkPG = {
        "ID": "slamdunk",
        "PN": "slamdunk filter v" + __version__,
        "VN": __bam_version__
    }
    if "PG" in inFileBamHeader:
        inFileBamHeader["PG"].append(slamDunkPG)
    else:
        inFileBamHeader["PG"] = [slamDunkPG]

    # Sort afterwards
    # NOTE(review): paired=False is hard-coded even for paired input -
    # confirm against bamSort whether paired=paired was intended.
    bamSort(outputBAM, log, inFileBamHeader, paired=False, verbose=verbose)
    if not paired:
        pysamIndex(outputBAM)