Example #1
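# farPairs scans the unique paired reads of an RDS dataset and reports
# mate pairs that map between minDist and maxDist apart on the same
# chromosome, writing a BED track of the pairs plus a table of flagged
# regions connected by at least minCount pairs. The excerpt assumes the
# time module is imported and that readDataset, splitReadWrite and
# doNotProcessChromosome come from the surrounding package.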
def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False,
             cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):

    doCache = False
    if cachePages is not None:
        doCache = True
    else:
        cachePages = 0

    if label is None:
        label = rdsfile

    RDS = readDataset(rdsfile, verbose=True, cache=doCache)
    rdsChromList = RDS.getChromosomes()

    if doVerbose:
        print time.ctime()

    total = 0
    outfile = open(outfilename, "w")
    outbed = open(outbedname, "w")
    outbed.write('track name="%s distal pairs" color=0,255,0\n' % label)

    readlen = RDS.getReadSize()
    flagDict = {}
    for chromosome in rdsChromList:
        if doNotProcessChromosome(chromosome):
            continue

        print chromosome
        uniqDict = RDS.getReadsDict(fullChrom=True,
                                    chrom=chromosome,
                                    noSense=True,
                                    withFlag=True,
                                    withPairID=True,
                                    doUniqs=True,
                                    readIDDict=True)
        if doVerbose:
            print len(uniqDict), time.ctime()

        for readID in uniqDict:
            readList = uniqDict[readID]
            if len(readList) == 2:
                total += 1
                (start1, flag1, pair1) = readList[0]
                (start2, flag2, pair2) = readList[1]

                if flag1 != flag2:
                    dist = abs(start1 - start2)
                    startList = [start1, start2]
                    stopList = [start1 + readlen, start2 + readlen]
                    startList.sort()
                    stopList.sort()
                    if flag1 != "" and flag2 != "" and minDist < dist < maxDist:
                        outputLine = splitReadWrite(chromosome, 2, startList, stopList, "+", readID, "0,255,0", "0,255,0")
                        outbed.write(outputLine)
                        if doVerbose:
                            print flag1, flag2, dist

                        try:
                            flagDict[flag1].append((flag2, start1, start2))
                        except KeyError:
                            flagDict[flag1] = [(flag2, start1, start2)]

                        try:
                            flagDict[flag2].append((flag1, start1, start2))
                        except KeyError:
                            flagDict[flag2] = [(flag1, start1, start2)]

    print "%d connected regions" % len(flagDict)

    for region in flagDict:
        flagDict[region].sort()
        regionConnections = {}
        for (region2, start1, start2) in flagDict[region]:
            try:
                regionConnections[region2] += 1
            except KeyError:
                regionConnections[region2] = 1

        for region2 in regionConnections:
            if regionConnections[region2] >= minCount:
                outfile.write("%s\t%s\t%d\n" % (region, region2, regionConnections[region2]))
                if doVerbose:
                    print "%s\t%s\t%d" % (region, region2, regionConnections[region2])

    outfile.close()
    outbed.close()
    if doVerbose:
        print "finished: ", time.ctime()
Example #2
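# Fragment of an RDS-loading script: fields, uname, chrom and blockCount
# are set earlier in the full script's annotation-parsing loop; this
# excerpt finishes filling geneDict, opens the output RDS, and tunes its
# page cache.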
        sense = fields[2]
        chromstarts = fields[8][:-1].split(',')
        chromstops = fields[9][:-1].split(',')
        exonLengths = []
        totalLength = 0
        for index in range(blockCount):
            chromstarts[index] = int(chromstarts[index])
            chromstops[index] = int(chromstops[index])
            exonLengths.append(chromstops[index] - chromstarts[index])
            totalLength += exonLengths[index]
        geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts,
                           exonLengths)
        mapDict[uname] = []
    genedatafile.close()

rds = readDataset(outdbname, init, dataType, verbose=True)

# bump the page cache if the requested cachePages exceeds the dataset's default
defaultCacheSize = rds.getDefaultCacheSize()
if cachePages > defaultCacheSize:
    if init:
        rds.setDBcache(cachePages, default=True)
    else:
        rds.setDBcache(cachePages)

if not init and doIndex:
    try:
        if rds.hasIndex():
            rds.dropIndex()
    except:
        if verbose:
            print "could not drop index"  # assumed completion of the truncated excerpt
Example #3
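# makeBamFromRds converts an RDS dataset into a BAM file through pysam,
# optionally exporting unique, multi and spliced reads. The excerpt
# assumes sys and pysam are imported and that readDataset, writeBAMEntry,
# processSpliceReads, doNotOutputChromosome and getFastaSequenceDictionary
# come from the surrounding package.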
def makeBamFromRds(rdsfile,
                   outfilename,
                   withUniqs=True,
                   withMulti=True,
                   doSplices=False,
                   doPairs=False,
                   withFlag="",
                   useFlagLike=False,
                   enforceChr=False,
                   allChrom=True,
                   doCache=False,
                   cachePages=100000,
                   chromList=None,
                   fastaFileName=""):

    # a None default avoids sharing one mutable list across calls
    if chromList is None:
        chromList = []

    if not withUniqs and not withMulti and not doSplices:
        print "must be outputting at least one of uniqs, multi, or splices - exiting"
        sys.exit(1)

    print "\nsample:"
    RDS = readDataset(rdsfile, verbose=True, cache=doCache)

    if cachePages > RDS.getDefaultCacheSize():
        RDS.setDBcache(cachePages)

    readlength = RDS.getReadSize()

    if allChrom:
        if withUniqs:
            chromList = RDS.getChromosomes()
        elif withMulti:
            chromList = RDS.getChromosomes(table="multi")
        else:
            chromList = RDS.getChromosomes(table="splices")

        chromList.sort()

    fastaSequenceDict = {}
    if fastaFileName:
        # getFastaSequenceDictionary opens the file itself; no separate handle is needed
        fastaSequenceDict = getFastaSequenceDictionary(fastaFileName)

    referenceSequenceList = []
    chromRemoveList = []
    for chromosome in chromList:
        if doNotOutputChromosome(chromosome, enforceChr):
            chromRemoveList.append(chromosome)
        else:
            chromosomeLength = RDS.getMaxCoordinate(chromosome,
                                                    doUniqs=withUniqs,
                                                    doMulti=withMulti,
                                                    doSplices=doSplices)
            referenceDataDict = {
                "LN": int(chromosomeLength),
                "SN": str(chromosome)
            }
            referenceSequenceList.append(referenceDataDict)

    for chrom in chromRemoveList:
        chromList.remove(chrom)

    header = {"HD": {"VN": "1.0"}}
    if referenceSequenceList:
        header["SQ"] = referenceSequenceList

    outfile = pysam.Samfile(outfilename, "wb", header=header)

    totalWrites = 0
    noncanonicalSplices = 0
    for chrom in chromList:
        index = 0
        print "chromosome %s" % (chrom)
        if withUniqs or withMulti:
            hitDict = RDS.getReadsDict(fullChrom=True,
                                       chrom=chrom,
                                       flag=withFlag,
                                       withWeight=True,
                                       withID=True,
                                       withPairID=doPairs,
                                       doUniqs=withUniqs,
                                       doMulti=withMulti,
                                       readIDDict=False,
                                       flagLike=useFlagLike,
                                       entryDict=True)

            for read in hitDict[chrom]:
                index += writeBAMEntry(outfile, chrom, read, readlength)

        if doSplices:
            numSpliceReadsWritten, noncanonical = processSpliceReads(
                RDS, outfile, chrom, withFlag, useFlagLike, readlength,
                fastaSequenceDict)
            index += numSpliceReadsWritten
            noncanonicalSplices += noncanonical

        print index
        totalWrites += index

    outfile.close()
    print "%d total reads written" % totalWrites
    print "%d non-canonical splices" % noncanonicalSplices
Example #4
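# Fragment of a command-line loader: the -cache option is parsed inside an
# enclosing block (doCache is set there in the full script); the RDS is
# initialized on -init/-initrna and then reopened for the actual load.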
    try:
        cachePages = int(sys.argv[sys.argv.index('-cache') + 1])
    except (IndexError, ValueError):
        pass

datafile = sys.argv[1]
infileList = []
for index in range(2, len(sys.argv)):
    if sys.argv[index][0] == '-':
        break
    infileList.append(sys.argv[index])

print "destination RDS: %s" % datafile

if '-initrna' in sys.argv:
    rds = readDataset(datafile, initialize=True, datasetType='RNA')
elif '-init' in sys.argv:
    rds = readDataset(datafile, initialize=True)

withFlag = ''
if '-flag' in sys.argv:
    withFlag = sys.argv[sys.argv.index('-flag') + 1]
    print "restrict to flag = %s" % withFlag

rds = readDataset(datafile, verbose=True, cache=doCache)

if cachePages > rds.getDefaultCacheSize():
    rds.setDBcache(cachePages)
    cacheVal = cachePages
else:
    cacheVal = rds.getDefaultCacheSize()
Example #5
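# makeRdsFromBam imports a BAM/SAM file into an RDS dataset, buffering
# unique, multi and spliced reads and flushing them to the database every
# insertSize entries. The excerpt assumes sys, string and pysam are
# imported and that readDataset, writeLog, verstring,
# getPairedReadNumberSuffix, isSpliceEntry, getRDSEntry and
# getRDSSpliceEntry come from the surrounding package.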
def makeRdsFromBam(label,
                   samFileName,
                   outDbName,
                   init=True,
                   doIndex=False,
                   useSamFile=False,
                   cachePages=100000,
                   maxMultiReadCount=10,
                   rnaDataType=False,
                   trimReadID=True):

    if useSamFile:
        fileMode = "r"
    else:
        fileMode = "rb"

    try:
        samfile = pysam.Samfile(samFileName, fileMode)
    except ValueError:
        print "samfile index not found"
        sys.exit(1)

    if rnaDataType:
        dataType = "RNA"
    else:
        dataType = "DNA"

    writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))

    rds = readDataset(outDbName, init, dataType, verbose=True)
    if not init and doIndex:
        try:
            if rds.hasIndex():
                rds.dropIndex()
        except:
            pass

    if "sam_mapped" not in rds.getMetadata():
        rds.insertMetadata([("sam_mapped", "True")])

    defaultCacheSize = rds.getDefaultCacheSize()

    if cachePages > defaultCacheSize:
        if init:
            rds.setDBcache(cachePages, default=True)
        else:
            rds.setDBcache(cachePages)

    propertyList = []
    for arg in sys.argv:
        if "::" in arg:
            (pname, pvalue) = arg.strip().split("::")
            propertyList.append((pname, pvalue))

    if len(propertyList) > 0:
        rds.insertMetadata(propertyList)

    countReads = {
        "unmapped": 0,
        "total": 0,
        "unique": 0,
        "multi": 0,
        "multiDiscard": 0,
        "splice": 0
    }

    readsize = 0
    insertSize = 100000

    uniqueInsertList = []
    multiInsertList = []
    spliceInsertList = []

    processedEntryDict = {}
    uniqueReadDict = {}
    multiReadDict = {}
    spliceReadDict = {}

    samFileIterator = samfile.fetch(until_eof=True)

    for read in samFileIterator:
        if read.is_unmapped:
            countReads["unmapped"] += 1
            continue

        if readsize == 0:
            take = (0, 2, 3)  # CIGAR operation (M/match, D/del, N/ref_skip)
            readsize = sum([length for op, length in read.cigar if op in take])
            if init:
                rds.insertMetadata([("readsize", readsize)])

        #Build the read dictionaries
        try:
            readSequence = read.seq
        except KeyError:
            readSequence = ""

        pairReadSuffix = getPairedReadNumberSuffix(read)
        readName = "%s%s%s" % (read.qname, readSequence, pairReadSuffix)
        if trimReadID:
            rdsEntryName = "%s:%s:%d%s" % (label, read.qname,
                                           countReads["total"], pairReadSuffix)
        else:
            rdsEntryName = read.qname

        if readName in processedEntryDict:
            if isSpliceEntry(read.cigar):
                if readName in spliceReadDict:
                    del spliceReadDict[readName]
            else:
                if readName in uniqueReadDict:
                    del uniqueReadDict[readName]

                if readName in multiReadDict:
                    (read, priorCount, rdsEntryName) = multiReadDict[readName]
                    count = priorCount + 1
                    multiReadDict[readName] = (read, count, rdsEntryName)
                else:
                    multiReadDict[readName] = (read, 1, rdsEntryName)
        else:
            processedEntryDict[readName] = ""
            if isSpliceEntry(read.cigar):
                spliceReadDict[readName] = (read, rdsEntryName)
            else:
                uniqueReadDict[readName] = (read, rdsEntryName)

        if countReads["total"] % insertSize == 0:
            for entry in uniqueReadDict.keys():
                (readData, rdsEntryName) = uniqueReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                uniqueInsertList.append(
                    getRDSEntry(readData, rdsEntryName, chrom, readsize))
                countReads["unique"] += 1

            for entry in spliceReadDict.keys():
                (readData, rdsEntryName) = spliceReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                spliceInsertList.append(
                    getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
                countReads["splice"] += 1

            for entry in multiReadDict.keys():
                (readData, count, rdsEntryName) = multiReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                if count > maxMultiReadCount:
                    countReads["multiDiscard"] += 1
                else:
                    multiInsertList.append(
                        getRDSEntry(readData,
                                    rdsEntryName,
                                    chrom,
                                    readsize,
                                    weight=count))
                    countReads["multi"] += 1

            rds.insertUniqs(uniqueInsertList)
            rds.insertMulti(multiInsertList)
            uniqueInsertList = []
            uniqueReadDict = {}
            multiInsertList = []
            multiReadDict = {}
            if dataType == "RNA":
                rds.insertSplices(spliceInsertList)
                spliceInsertList = []
                spliceReadDict = {}

            print ".",
            sys.stdout.flush()
            processedEntryDict = {}

        countReads["total"] += 1

    if len(uniqueReadDict.keys()) > 0:
        for entry in uniqueReadDict.keys():
            (readData, rdsEntryName) = uniqueReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            uniqueInsertList.append(
                getRDSEntry(readData, rdsEntryName, chrom, readsize))
            countReads["unique"] += 1

        rds.insertUniqs(uniqueInsertList)

    if len(multiReadDict.keys()) > 0:
        for entry in multiReadDict.keys():
            (readData, count, rdsEntryName) = multiReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            if count > maxMultiReadCount:
                countReads["multiDiscard"] += 1
            else:
                multiInsertList.append(
                    getRDSEntry(readData,
                                rdsEntryName,
                                chrom,
                                readsize,
                                weight=count))
                countReads["multi"] += 1

        countReads["multi"] += len(multiInsertList)

    if len(spliceReadDict.keys()) > 0 and dataType == "RNA":
        for entry in spliceReadDict.keys():
            (readData, rdsEntryName) = spliceReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            spliceInsertList.append(
                getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
            countReads["splice"] += 1

        rds.insertSplices(spliceInsertList)

    countString = "\n%d unmapped reads discarded" % countReads["unmapped"]
    countString += "\t%d unique reads" % countReads["unique"]
    countString += "\t%d multi reads" % countReads["multi"]
    countString += "\t%d multi reads count > %d discarded" % (
        countReads["multiDiscard"], maxMultiReadCount)
    if dataType == "RNA":
        countString += "\t%d spliced reads" % countReads["splice"]

    print countString.replace("\t", "\n")

    writeLog("%s.log" % outDbName, verstring, countString)

    if doIndex:
        print "building index...."
        if cachePages > defaultCacheSize:
            rds.setDBcache(cachePages)
            rds.buildIndex(cachePages)
        else:
            rds.buildIndex(defaultCacheSize)
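
A minimal invocation sketch for makeRdsFromBam (the label and paths are hypothetical; pysam must be available):

if __name__ == "__main__":
    # build a fresh RNA-type RDS from a BAM file and index it when done
    makeRdsFromBam("mySample", "sample.bam", "sample.rds",
                   init=True, doIndex=True, rnaDataType=True)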
Example #6
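# Fragment of a command-line driver: doCount is set in a branch above this
# excerpt; the remainder parses the -cache and -defaultcache options and
# opens the RDS with the requested cache settings.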
    doCount = False

doCache = False
cachePages = -1
if '-cache' in sys.argv:
    doCache = True
    try:
        cachePages = int(sys.argv[sys.argv.index('-cache') + 1])
    except (IndexError, ValueError):
        pass

datafile = sys.argv[1]
if '-initrna' in sys.argv:
    rds = readDataset(datafile,
                      initialize=True,
                      datasetType='RNA',
                      verbose=True,
                      cache=doCache)
else:
    rds = readDataset(datafile,
                      verbose=True,
                      reportCount=doCount,
                      cache=doCache)

if cachePages > rds.getDefaultCacheSize():
    rds.setDBcache(cachePages)

cacheVal = 0
if '-defaultcache' in sys.argv:
    cacheVal = int(sys.argv[sys.argv.index('-defaultcache') + 1])
    rds.setDBcache(cacheVal, default=True)