import time

# readDataset, doNotProcessChromosome and splitReadWrite are provided
# elsewhere in this package.
def farPairs(rdsfile, outfilename, outbedname, sameChromOnly=False, doVerbose=False,
             cachePages=None, minDist=1000, maxDist=500000, minCount=2, label=None):
    doCache = False
    if cachePages is not None:
        doCache = True
    else:
        cachePages = 0

    if label is None:
        label = rdsfile

    RDS = readDataset(rdsfile, verbose=True, cache=doCache)
    rdsChromList = RDS.getChromosomes()

    if doVerbose:
        print time.ctime()

    total = 0
    outfile = open(outfilename, "w")
    outbed = open(outbedname, "w")
    outbed.write('track name="%s distal pairs" color=0,255,0\n' % label)

    readlen = RDS.getReadSize()
    flagDict = {}
    for chromosome in rdsChromList:
        if doNotProcessChromosome(chromosome):
            continue

        print chromosome
        uniqDict = RDS.getReadsDict(fullChrom=True, chrom=chromosome, noSense=True,
                                    withFlag=True, withPairID=True, doUniqs=True,
                                    readIDDict=True)
        if doVerbose:
            print len(uniqDict), time.ctime()

        for readID in uniqDict:
            readList = uniqDict[readID]
            if len(readList) == 2:
                total += 1
                (start1, flag1, pair1) = readList[0]
                (start2, flag2, pair2) = readList[1]
                # only pairs whose two ends landed in different flagged
                # regions are of interest
                if flag1 != flag2:
                    dist = abs(start1 - start2)
                    startList = [start1, start2]
                    stopList = [start1 + readlen, start2 + readlen]
                    startList.sort()
                    stopList.sort()
                    if flag1 != "" and flag2 != "" and minDist < dist < maxDist:
                        outputLine = splitReadWrite(chromosome, 2, startList, stopList, "+",
                                                    readID, "0,255,0", "0,255,0")
                        outbed.write(outputLine)
                        if doVerbose:
                            print flag1, flag2, dist

                        try:
                            flagDict[flag1].append((flag2, start1, start2))
                        except KeyError:
                            flagDict[flag1] = [(flag2, start1, start2)]

                        try:
                            flagDict[flag2].append((flag1, start1, start2))
                        except KeyError:
                            # bug fix: initialize with flag1 to match the
                            # append above (the original stored flag2 here)
                            flagDict[flag2] = [(flag1, start1, start2)]

    print "%d connected regions" % len(flagDict)

    # tally the connections per region pair and report those seen at
    # least minCount times
    for region in flagDict:
        flagDict[region].sort()
        regionConnections = {}
        for (region2, start1, start2) in flagDict[region]:
            try:
                regionConnections[region2] += 1
            except KeyError:
                regionConnections[region2] = 1

        for region2 in regionConnections:
            if regionConnections[region2] >= minCount:
                outfile.write("%s\t%s\t%d\n" % (region, region2, regionConnections[region2]))
                if doVerbose:
                    print "%s\t%s\t%d" % (region, region2, regionConnections[region2])

    outfile.close()
    outbed.close()
    if doVerbose:
        print "finished: ", time.ctime()
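# Usage sketch (illustrative, not part of the original source): report
# regions connected by at least two distal pairs between 1 kb and 500 kb
# apart. The RDS path, output names and label are hypothetical.
#
#   farPairs("sample.rds", "sample.distal.txt", "sample.distal.bed",
#            minDist=1000, maxDist=500000, minCount=2,
#            label="sample", doVerbose=True)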
    # ...continuing inside the per-gene loop over genedatafile lines;
    # fields, uname, chrom and blockCount are set earlier in the loop.
    # fields[] follows a knownGene-style layout: strand, exon starts/stops.
    sense = fields[2]
    chromstarts = fields[8][:-1].split(',')
    chromstops = fields[9][:-1].split(',')
    exonLengths = []
    totalLength = 0
    for index in range(blockCount):
        chromstarts[index] = int(chromstarts[index])
        chromstops[index] = int(chromstops[index])
        exonLengths.append(chromstops[index] - chromstarts[index])
        totalLength += exonLengths[index]

    geneDict[uname] = (sense, blockCount, totalLength, chrom, chromstarts, exonLengths)
    mapDict[uname] = []

genedatafile.close()

rds = readDataset(outdbname, init, dataType, verbose=True)

# check that our cacheSize is better than the dataset's default cache size
defaultCacheSize = rds.getDefaultCacheSize()
if cachePages > defaultCacheSize:
    if init:
        rds.setDBcache(cachePages, default=True)
    else:
        rds.setDBcache(cachePages)

if not init and doIndex:
    try:
        if rds.hasIndex():
            rds.dropIndex()
    except:
        # assumed completion of the truncated original: warn and move on
        if verbose:
            print "could not drop index"
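# Minimal sketch of the exon-length bookkeeping above for a hypothetical
# two-exon gene (coordinates are illustrative only):
#
#   chromstarts = ['100', '500']  ->  ints [100, 500]
#   chromstops  = ['200', '650']  ->  ints [200, 650]
#   exonLengths = [200 - 100, 650 - 500] = [100, 150]
#   totalLength = 100 + 150 = 250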
import sys
import pysam

# readDataset, getFastaSequenceDictionary, doNotOutputChromosome,
# writeBAMEntry and processSpliceReads are provided elsewhere in this package.
def makeBamFromRds(rdsfile, outfilename, withUniqs=True, withMulti=True, doSplices=False,
                   doPairs=False, withFlag="", useFlagLike=False, enforceChr=False,
                   allChrom=True, doCache=False, cachePages=100000, chromList=None,
                   fastaFileName=""):
    # avoid a mutable default argument
    if chromList is None:
        chromList = []

    if not withUniqs and not withMulti and not doSplices:
        print "must be outputting at least one of uniqs, multi, or splices - exiting"
        sys.exit(1)

    print "\nsample:"
    RDS = readDataset(rdsfile, verbose=True, cache=doCache)
    if cachePages > RDS.getDefaultCacheSize():
        RDS.setDBcache(cachePages)

    readlength = RDS.getReadSize()
    if allChrom:
        if withUniqs:
            chromList = RDS.getChromosomes()
        elif withMulti:
            chromList = RDS.getChromosomes(table="multi")
        else:
            chromList = RDS.getChromosomes(table="splices")

        chromList.sort()

    fastaSequenceDict = {}
    if fastaFileName:
        fastaSequenceDict = getFastaSequenceDictionary(fastaFileName)

    # build the BAM header references from the chromosomes being output
    referenceSequenceList = []
    chromRemoveList = []
    for chromosome in chromList:
        if doNotOutputChromosome(chromosome, enforceChr):
            chromRemoveList.append(chromosome)
        else:
            chromosomeLength = RDS.getMaxCoordinate(chromosome, doUniqs=withUniqs,
                                                    doMulti=withMulti, doSplices=doSplices)
            referenceDataDict = {"LN": int(chromosomeLength), "SN": str(chromosome)}
            referenceSequenceList.append(referenceDataDict)

    for chrom in chromRemoveList:
        chromList.remove(chrom)

    header = {"HD": {"VN": "1.0"}}
    if referenceSequenceList:
        header["SQ"] = referenceSequenceList

    outfile = pysam.Samfile(outfilename, "wb", header=header)

    totalWrites = 0
    noncanonicalSplices = 0
    for chrom in chromList:
        index = 0
        print "chromosome %s" % (chrom)
        if withUniqs or withMulti:
            hitDict = RDS.getReadsDict(fullChrom=True, chrom=chrom, flag=withFlag,
                                       withWeight=True, withID=True, withPairID=doPairs,
                                       doUniqs=withUniqs, doMulti=withMulti,
                                       readIDDict=False, flagLike=useFlagLike,
                                       entryDict=True)
            for read in hitDict[chrom]:
                index += writeBAMEntry(outfile, chrom, read, readlength)

        if doSplices:
            numSpliceReadsWritten, noncanonical = processSpliceReads(RDS, outfile, chrom,
                                                                     withFlag, useFlagLike,
                                                                     readlength,
                                                                     fastaSequenceDict)
            index += numSpliceReadsWritten
            noncanonicalSplices += noncanonical

        print index
        totalWrites += index

    outfile.close()
    print "%d total reads written" % totalWrites
    print "%d non-canonical splices" % noncanonicalSplices
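# Usage sketch (illustrative, not part of the original source): convert an
# RDS dataset to BAM, including spliced reads. File names are hypothetical;
# the FASTA file is only needed to classify splice junctions.
#
#   makeBamFromRds("sample.rds", "sample.bam", doSplices=True,
#                  doCache=True, cachePages=100000,
#                  fastaFileName="genome.fa")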
doCache = False
cachePages = -1
if '-cache' in sys.argv:
    doCache = True
    try:
        cachePages = int(sys.argv[sys.argv.index('-cache') + 1])
    except:
        pass

datafile = sys.argv[1]
infileList = []
for index in range(2, len(sys.argv)):
    if sys.argv[index][0] == '-':
        break

    infileList.append(sys.argv[index])

print "destination RDS: %s" % datafile
if '-initrna' in sys.argv:
    rds = readDataset(datafile, initialize=True, datasetType='RNA')
elif '-init' in sys.argv:
    rds = readDataset(datafile, initialize=True)

withFlag = ''
if '-flag' in sys.argv:
    withFlag = sys.argv[sys.argv.index('-flag') + 1]
    print "restrict to flag = %s" % withFlag

# reopen the dataset for appending now that any initialization is done
rds = readDataset(datafile, verbose=True, cache=doCache)
if cachePages > rds.getDefaultCacheSize():
    rds.setDBcache(cachePages)
    cacheVal = cachePages
else:
    cacheVal = rds.getDefaultCacheSize()
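# Usage sketch (illustrative; the script name is a hypothetical stand-in
# for whichever file this fragment belongs to): combine several source RDS
# files into a freshly initialized destination, with a larger DB cache.
#
#   python combinerds.py destination.rds part1.rds part2.rds -init -cache 250000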
import sys
import string
import pysam

# readDataset, writeLog, verstring, getPairedReadNumberSuffix, isSpliceEntry,
# getRDSEntry and getRDSSpliceEntry are provided elsewhere in this package.
def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useSamFile=False,
                   cachePages=100000, maxMultiReadCount=10, rnaDataType=False,
                   trimReadID=True):
    if useSamFile:
        fileMode = "r"
    else:
        fileMode = "rb"

    try:
        samfile = pysam.Samfile(samFileName, fileMode)
    except ValueError:
        print "samfile index not found"
        sys.exit(1)

    if rnaDataType:
        dataType = "RNA"
    else:
        dataType = "DNA"

    writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))

    rds = readDataset(outDbName, init, dataType, verbose=True)
    if not init and doIndex:
        try:
            if rds.hasIndex():
                rds.dropIndex()
        except:
            pass

    if "sam_mapped" not in rds.getMetadata():
        rds.insertMetadata([("sam_mapped", "True")])

    defaultCacheSize = rds.getDefaultCacheSize()
    if cachePages > defaultCacheSize:
        if init:
            rds.setDBcache(cachePages, default=True)
        else:
            rds.setDBcache(cachePages)

    # record any name::value pairs given on the command line as metadata
    propertyList = []
    for arg in sys.argv:
        if "::" in arg:
            (pname, pvalue) = arg.strip().split("::")
            propertyList.append((pname, pvalue))

    if len(propertyList) > 0:
        rds.insertMetadata(propertyList)

    countReads = {"unmapped": 0,
                  "total": 0,
                  "unique": 0,
                  "multi": 0,
                  "multiDiscard": 0,
                  "splice": 0
    }

    readsize = 0
    insertSize = 100000

    uniqueInsertList = []
    multiInsertList = []
    spliceInsertList = []

    processedEntryDict = {}
    uniqueReadDict = {}
    multiReadDict = {}
    spliceReadDict = {}

    samFileIterator = samfile.fetch(until_eof=True)
    for read in samFileIterator:
        if read.is_unmapped:
            countReads["unmapped"] += 1
            continue

        if readsize == 0:
            # infer the read size from the first mapped read's CIGAR:
            # M (match), D (deletion) and N (ref skip) consume the reference
            take = (0, 2, 3)
            readsize = sum([length for op, length in read.cigar if op in take])
            if init:
                rds.insertMetadata([("readsize", readsize)])

        # build the read dictionaries
        try:
            readSequence = read.seq
        except KeyError:
            readSequence = ""

        pairReadSuffix = getPairedReadNumberSuffix(read)
        readName = "%s%s%s" % (read.qname, readSequence, pairReadSuffix)
        if trimReadID:
            rdsEntryName = "%s:%s:%d%s" % (label, read.qname, countReads["total"],
                                           pairReadSuffix)
        else:
            rdsEntryName = read.qname

        if readName in processedEntryDict:
            # second or later occurrence: the read maps to multiple places
            if isSpliceEntry(read.cigar):
                if readName in spliceReadDict:
                    del spliceReadDict[readName]
            else:
                if readName in uniqueReadDict:
                    del uniqueReadDict[readName]

                if readName in multiReadDict:
                    (read, priorCount, rdsEntryName) = multiReadDict[readName]
                    count = priorCount + 1
                    multiReadDict[readName] = (read, count, rdsEntryName)
                else:
                    multiReadDict[readName] = (read, 1, rdsEntryName)
        else:
            processedEntryDict[readName] = ""
            if isSpliceEntry(read.cigar):
                spliceReadDict[readName] = (read, rdsEntryName)
            else:
                uniqueReadDict[readName] = (read, rdsEntryName)

        if countReads["total"] % insertSize == 0:
            # periodic flush of the accumulated reads into the RDS database
            for entry in uniqueReadDict.keys():
                (readData, rdsEntryName) = uniqueReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
                countReads["unique"] += 1

            for entry in spliceReadDict.keys():
                (readData, rdsEntryName) = spliceReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom,
                                                          readsize))
                countReads["splice"] += 1

            for entry in multiReadDict.keys():
                (readData, count, rdsEntryName) = multiReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                if count > maxMultiReadCount:
                    countReads["multiDiscard"] += 1
                else:
                    multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom,
                                                       readsize, weight=count))
                    countReads["multi"] += 1

            rds.insertUniqs(uniqueInsertList)
            rds.insertMulti(multiInsertList)
            uniqueInsertList = []
            uniqueReadDict = {}
            multiInsertList = []
            multiReadDict = {}
            if dataType == "RNA":
                rds.insertSplices(spliceInsertList)
                spliceInsertList = []
                spliceReadDict = {}

            print ".",
            sys.stdout.flush()
            processedEntryDict = {}

        countReads["total"] += 1

    # final flush of whatever is left in the read dictionaries
    if len(uniqueReadDict.keys()) > 0:
        for entry in uniqueReadDict.keys():
            (readData, rdsEntryName) = uniqueReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
            countReads["unique"] += 1

        rds.insertUniqs(uniqueInsertList)

    if len(multiReadDict.keys()) > 0:
        for entry in multiReadDict.keys():
            (readData, count, rdsEntryName) = multiReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            if count > maxMultiReadCount:
                countReads["multiDiscard"] += 1
            else:
                multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom,
                                                   readsize, weight=count))
                countReads["multi"] += 1

        # bug fix: write the final batch of multireads; the in-loop counter
        # already tallies them, so the original's extra
        # countReads["multi"] += len(multiInsertList) double-counted
        rds.insertMulti(multiInsertList)

    if len(spliceReadDict.keys()) > 0 and dataType == "RNA":
        for entry in spliceReadDict.keys():
            (readData, rdsEntryName) = spliceReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom,
                                                      readsize))
            countReads["splice"] += 1

        rds.insertSplices(spliceInsertList)

    countString = "\n%d unmapped reads discarded" % countReads["unmapped"]
    countString += "\t%d unique reads" % countReads["unique"]
    countString += "\t%d multi reads" % countReads["multi"]
    countString += "\t%d multi reads count > %d discarded" % (countReads["multiDiscard"],
                                                              maxMultiReadCount)
    if dataType == "RNA":
        countString += "\t%d spliced reads" % countReads["splice"]

    print countString.replace("\t", "\n")
    writeLog("%s.log" % outDbName, verstring, countString)

    if doIndex:
        print "building index...."
        if cachePages > defaultCacheSize:
            rds.setDBcache(cachePages)
            rds.buildIndex(cachePages)
        else:
            rds.buildIndex(defaultCacheSize)
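# Usage sketch (illustrative, not part of the original source): load a BAM
# file into a new RNA-space RDS dataset and index it. The label and file
# names are hypothetical placeholders.
#
#   makeRdsFromBam("sample", "sample.bam", "sample.rds",
#                  init=True, doIndex=True, rnaDataType=True)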
doCount = False
doCache = False
cachePages = -1
if '-cache' in sys.argv:
    doCache = True
    try:
        cachePages = int(sys.argv[sys.argv.index('-cache') + 1])
    except:
        pass

datafile = sys.argv[1]
if '-initrna' in sys.argv:
    rds = readDataset(datafile, initialize=True, datasetType='RNA', verbose=True,
                      cache=doCache)
else:
    rds = readDataset(datafile, verbose=True, reportCount=doCount, cache=doCache)

if cachePages > rds.getDefaultCacheSize():
    rds.setDBcache(cachePages)

cacheVal = 0
if '-defaultcache' in sys.argv:
    cacheVal = int(sys.argv[sys.argv.index('-defaultcache') + 1])
    rds.setDBcache(cacheVal, default=True)
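# Usage sketch (illustrative; the script name is a hypothetical stand-in
# for whichever file this fragment belongs to): open a dataset with a large
# session cache and persist a new default cache size.
#
#   python rdsmetadata.py dataset.rds -cache 500000 -defaultcache 200000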