def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False):
    """ Parse an MD-style mismatch tag into a comma-separated list of
        eland-compatible mismatch descriptors: the base from the tag, the
        1-based read position, then the base observed in the query sequence
        (or "N" when no query sequence is supplied).  Deletion entries
        ("^...") advance the position but are not reported.
        Relies on module-level re and string imports and on the
        getComplementNucleotide and writeLog helpers defined elsewhere.
    """
    output = []
    deletionMarker = "^"
    position = 0

    lengths = re.findall(r"\d+", mismatchTag)
    mismatchSequences = re.findall(r"\d+([ACGTN]|\^[ACGTN]+)", mismatchTag)
    for mismatchEntry in range(len(mismatchSequences)):
        mismatch = mismatchSequences[mismatchEntry]
        position = position + int(lengths[mismatchEntry])
        if string.find(mismatch, deletionMarker) == 0:
            continue

        try:
            if querySequence:
                genomicNucleotide = querySequence[position]
            else:
                genomicNucleotide = "N"

            if sense == "-":
                mismatch = getComplementNucleotide(mismatch)
                genomicNucleotide = getComplementNucleotide(genomicNucleotide)

            # for eland-compatibility, positions are 1-based
            elandCompatiblePosition = int(position + 1)
            output.append("%s%d%s" % (mismatch, elandCompatiblePosition, genomicNucleotide))
            position += 1
        except IndexError:
            if logErrors:
                errorMessage = "getMismatch IndexError; tag: %s, seq: %s, pos: %d" % (mismatchTag, querySequence, position)
                writeLog("MakeRdsFromBamError.log", "1.0", errorMessage)

            return ""

    return string.join(output, ",")
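# A quick illustration of the tag parser (hypothetical tag and sequence, not
# part of the original module; run by hand or under doctest):
#
#   >>> getMismatches("10A5^AC6", "GGGGGGGGGGCGGGGGGGGGGG", "+")
#   'A11C'
#
# "10A5^AC6" decodes as: 10 matches, an "A" in the tag where the query holds
# "C" (0-based position 10, reported 1-based as 11), 5 matches, a deletion
# "^AC" that advances the position but is skipped, then 6 matches.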
merging = True
if '-nomerge' in sys.argv:
    merging = False

if '-log' in sys.argv:
    logfilename = sys.argv[sys.argv.index('-log') + 1]

if '-locid' in sys.argv:
    locID = True
    print "using locations as region ID"

if '-norandom' in sys.argv:
    ignoreRandom = True
    print "ignoring 'random' chromosomes"

writeLog(logfilename, versionString, string.join(sys.argv[1:]))

allregionsDict = {}
regionFileList = regionfiles.split(',')
numRegions = len(regionFileList)
chromList = []
for regionID in range(numRegions):
    allregionsDict[regionID] = getMergedRegions(regionFileList[regionID], maxDist=mergeregion,
                                                minHits=-1, fullChrom=True, verbose=True,
                                                chromField=cField, doMerge=merging,
                                                pad=padregion)
    # collect the union of chromosomes across all region files
    for achrom in allregionsDict[regionID]:
        if achrom not in chromList:
            chromList.append(achrom)
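# Sketch of the inputs this loop expects (hypothetical file names): with
#   regionfiles = "ctrl.regions,treat1.regions,treat2.regions"
# getMergedRegions() runs once per file, so allregionsDict gets keys 0, 1
# and 2, each mapping full chromosome names (fullChrom=True) to that file's
# merged regions, and chromList ends up holding every chromosome seen in
# any of the region files.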
print "flipping read sense" flip = True spacer = 2 if '-spacer' in sys.argv: spacer = int(sys.argv[sys.argv.index('-spacer') + 1]) if '-strip' in sys.argv: stripSpace = True readsize = 0 maxBorder = 0 index = 0 insertSize = 100000 writeLog(outdbname + '.log', verstring, string.join(sys.argv[1:])) def decodeMismatches(mString, rsense): complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'} output = [] mismatches = mString.split(',') for mismatch in mismatches: (pos, change) = mismatch.split(':') (genNT, readNT) = change.split('>') if rsense == '-': readNT = complement[readNT] genNT = complement[genNT] # for eland-compatibility, we are 1-based output.append('%s%d%s' % (readNT, int(pos) + 1, genNT))
def makeRdsFromBam(label, samFileName, outDbName, init=True, doIndex=False, useSamFile=False,
                   cachePages=100000, maxMultiReadCount=10, rnaDataType=False, trimReadID=True):

    if useSamFile:
        fileMode = "r"
    else:
        fileMode = "rb"

    try:
        samfile = pysam.Samfile(samFileName, fileMode)
    except ValueError:
        print "samfile index not found"
        sys.exit(1)

    if rnaDataType:
        dataType = "RNA"
    else:
        dataType = "DNA"

    writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))

    rds = readDataset(outDbName, init, dataType, verbose=True)
    if not init and doIndex:
        try:
            if rds.hasIndex():
                rds.dropIndex()
        except:
            pass

    if "sam_mapped" not in rds.getMetadata():
        rds.insertMetadata([("sam_mapped", "True")])

    defaultCacheSize = rds.getDefaultCacheSize()
    if cachePages > defaultCacheSize:
        if init:
            rds.setDBcache(cachePages, default=True)
        else:
            rds.setDBcache(cachePages)

    # pick up any "name::value" properties passed on the command line
    propertyList = []
    for arg in sys.argv:
        if "::" in arg:
            (pname, pvalue) = arg.strip().split("::")
            propertyList.append((pname, pvalue))

    if len(propertyList) > 0:
        rds.insertMetadata(propertyList)

    countReads = {"unmapped": 0,
                  "total": 0,
                  "unique": 0,
                  "multi": 0,
                  "multiDiscard": 0,
                  "splice": 0}

    readsize = 0
    insertSize = 100000

    uniqueInsertList = []
    multiInsertList = []
    spliceInsertList = []

    processedEntryDict = {}
    uniqueReadDict = {}
    multiReadDict = {}
    spliceReadDict = {}

    samFileIterator = samfile.fetch(until_eof=True)
    for read in samFileIterator:
        if read.is_unmapped:
            countReads["unmapped"] += 1
            continue

        if readsize == 0:
            take = (0, 2, 3)  # CIGAR operations M (match), D (deletion), N (ref skip)
            readsize = sum([length for op, length in read.cigar if op in take])
            if init:
                rds.insertMetadata([("readsize", readsize)])

        # build the read dictionaries
        try:
            readSequence = read.seq
        except KeyError:
            readSequence = ""

        pairReadSuffix = getPairedReadNumberSuffix(read)
        readName = "%s%s%s" % (read.qname, readSequence, pairReadSuffix)
        if trimReadID:
            rdsEntryName = "%s:%s:%d%s" % (label, read.qname, countReads["total"], pairReadSuffix)
        else:
            rdsEntryName = read.qname

        if processedEntryDict.has_key(readName):
            if isSpliceEntry(read.cigar):
                if spliceReadDict.has_key(readName):
                    del spliceReadDict[readName]
            else:
                if uniqueReadDict.has_key(readName):
                    del uniqueReadDict[readName]

                if multiReadDict.has_key(readName):
                    (read, priorCount, rdsEntryName) = multiReadDict[readName]
                    count = priorCount + 1
                    multiReadDict[readName] = (read, count, rdsEntryName)
                else:
                    multiReadDict[readName] = (read, 1, rdsEntryName)
        else:
            processedEntryDict[readName] = ""
            if isSpliceEntry(read.cigar):
                spliceReadDict[readName] = (read, rdsEntryName)
            else:
                uniqueReadDict[readName] = (read, rdsEntryName)

        # periodically flush the accumulated reads to the RDS database
        if countReads["total"] % insertSize == 0:
            for entry in uniqueReadDict.keys():
                (readData, rdsEntryName) = uniqueReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
                countReads["unique"] += 1

            for entry in spliceReadDict.keys():
                (readData, rdsEntryName) = spliceReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
                countReads["splice"] += 1

            for entry in multiReadDict.keys():
                (readData, count, rdsEntryName) = multiReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                if count > maxMultiReadCount:
                    countReads["multiDiscard"] += 1
                else:
                    multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
                    countReads["multi"] += 1

            rds.insertUniqs(uniqueInsertList)
            rds.insertMulti(multiInsertList)
            uniqueInsertList = []
            uniqueReadDict = {}
            multiInsertList = []
            multiReadDict = {}
            if dataType == "RNA":
                rds.insertSplices(spliceInsertList)
                spliceInsertList = []
                spliceReadDict = {}

            print ".",
            sys.stdout.flush()
            processedEntryDict = {}

        countReads["total"] += 1

    # insert any reads left over after the final batch
    if len(uniqueReadDict.keys()) > 0:
        for entry in uniqueReadDict.keys():
            (readData, rdsEntryName) = uniqueReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            uniqueInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize))
            countReads["unique"] += 1

        rds.insertUniqs(uniqueInsertList)

    if len(multiReadDict.keys()) > 0:
        for entry in multiReadDict.keys():
            (readData, count, rdsEntryName) = multiReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            if count > maxMultiReadCount:
                countReads["multiDiscard"] += 1
            else:
                multiInsertList.append(getRDSEntry(readData, rdsEntryName, chrom, readsize, weight=count))
                countReads["multi"] += 1

        # flush the remaining multireads; each one was already counted above
        rds.insertMulti(multiInsertList)

    if len(spliceReadDict.keys()) > 0 and dataType == "RNA":
        for entry in spliceReadDict.keys():
            (readData, rdsEntryName) = spliceReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            spliceInsertList.append(getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
            countReads["splice"] += 1

        rds.insertSplices(spliceInsertList)

    countString = "\n%d unmapped reads discarded" % countReads["unmapped"]
    countString += "\t%d unique reads" % countReads["unique"]
    countString += "\t%d multi reads" % countReads["multi"]
    countString += "\t%d multi reads count > %d discarded" % (countReads["multiDiscard"], maxMultiReadCount)
    if dataType == "RNA":
        countString += "\t%d spliced reads" % countReads["splice"]

    print countString.replace("\t", "\n")
    writeLog("%s.log" % outDbName, verstring, countString)

    if doIndex:
        print "building index...."
        if cachePages > defaultCacheSize:
            rds.setDBcache(cachePages)
            rds.buildIndex(cachePages)
        else:
            rds.buildIndex(defaultCacheSize)
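# Example driver call (hypothetical file names; assumes pysam and the RDS
# helpers used above are importable):
#
#   makeRdsFromBam("myExperiment", "aligned.bam", "myExperiment.rds",
#                  init=True, doIndex=True, cachePages=200000,
#                  maxMultiReadCount=10, rnaDataType=True)
#
# Reads accumulate in dictionaries and are flushed to the database every
# insertSize (100000) reads; alignments seen at more than maxMultiReadCount
# positions are discarded rather than stored as multireads.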
goodfile = open(sys.argv[4], 'w')

if '-startField' in sys.argv:
    startField = int(sys.argv[sys.argv.index('-startField') + 1])

if startField < 0:
    startField = 0

cachePages = 500000
if '-cache' in sys.argv:
    cachePages = int(sys.argv[sys.argv.index('-cache') + 1])
    if cachePages < 250000:
        cachePages = 250000

doLog = False
if '-log' in sys.argv:
    logfilename = sys.argv[sys.argv.index('-log') + 1]
    writeLog(logfilename, versionString, string.join(sys.argv[1:]))
    doLog = True

infile = open(filename)
if os.path.isfile(dbfile):
    db = sqlite.connect(dbfile)
    sql = db.cursor()
    sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
    sql.execute("PRAGMA temp_store = MEMORY")
else:
    print "No database - passing through"
    if doLog:
        writeLog(logfilename, versionString, "No database - passing through")

    for line in infile:
        # each input line still carries its newline; strip it so the
        # placeholder columns land on the same line
        outfile.write("%s\tNR\tNR\t0.00\n" % line.rstrip())
        goodfile.write(line)
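# Passthrough sketch (hypothetical input line): with no database on disk, a
# tab-delimited input line such as
#   chr1    10000    10100    region1
# is written to outfile with the placeholder columns appended,
#   chr1    10000    10100    region1    NR    NR    0.00
# while goodfile receives the original line unchanged.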
#
# recordLog.py
# ENRAGE
#
# Created by Ali Mortazavi on 12/14/08.
#
from commoncode import writeLog
import sys

if '-verbose' in sys.argv or len(sys.argv) < 4:
    print '%s: version 1.0' % sys.argv[0]

if len(sys.argv) < 4:
    print 'usage: python %s logFile messenger message [-verbose]' % sys.argv[0]
    sys.exit(1)

writeLog(sys.argv[1], sys.argv[2], sys.argv[3])
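# Example invocation (hypothetical log file and message):
#   python recordLog.py analysis.log makeRdsFromBam "finished importing reads"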