Example #1
0
def getMismatches(mismatchTag, querySequence="", sense="+", logErrors=False):
    """Convert a SAM MD mismatch tag into an eland-style mismatch string.

    mismatchTag   -- MD-tag value, e.g. '2A3' or '2^AC3' (numbers are the
                     matched distances between mismatches; '^' prefixes
                     deleted reference bases)
    querySequence -- the read sequence; 'N' is reported for the read base
                     when it is empty
    sense         -- '+' or '-'; on '-' both bases are complemented
    logErrors     -- when True, an IndexError is logged to
                     MakeRdsFromBamError.log before returning ''

    Returns comma-separated entries of the form '<refNT><1basedPos><readNT>',
    or '' when the tag position runs past the end of querySequence.
    """
    output = []
    deletionMarker = "^"
    position = 0

    lengths = re.findall(r"\d+", mismatchTag)
    mismatchSequences = re.findall(r"\d+([ACGTN]|\^[ACGTN]+)", mismatchTag)

    # every mismatch group is preceded by a length, so zip pairs them safely
    for length, mismatch in zip(lengths, mismatchSequences):
        position += int(length)
        if mismatch.startswith(deletionMarker):
            # deleted bases are absent from the read, so the read-relative
            # position does not advance
            continue

        try:
            if querySequence:
                genomicNucleotide = querySequence[position]
            else:
                genomicNucleotide = "N"

            if sense == "-":
                mismatch = getComplementNucleotide(mismatch)
                genomicNucleotide = getComplementNucleotide(genomicNucleotide)

            # eland positions are 1-based
            elandCompatiblePosition = position + 1
            output.append(
                "%s%d%s" %
                (mismatch, elandCompatiblePosition, genomicNucleotide))
            position += 1
        except IndexError:
            if logErrors:
                errorMessage = "getMismatch IndexError; tag: %s, seq: %s, pos: %d" % (
                    mismatchTag, querySequence, position)
                writeLog("MakeRdsFromBamError.log", "1.0", errorMessage)

            return ""

    return ",".join(output)
Example #2
0
# Command-line options for a region-merging script (fragment: defaults for
# logfilename, locID, ignoreRandom, regionfiles, mergeregion, cField and
# padregion are presumably defined above this view -- confirm).
merging = True
if '-nomerge' in sys.argv:
    merging = False

if '-log' in sys.argv:
    logfilename = sys.argv[sys.argv.index('-log') + 1]

if '-locid' in sys.argv:
    locID = True
    print "using locations as region ID"

if '-norandom' in sys.argv:
    ignoreRandom = True
    print "ignoring 'random' chromosomes"

# NOTE(review): logfilename is only assigned here when '-log' is given; if it
# has no default above this view, this call raises NameError without '-log'.
writeLog(logfilename, versionString, string.join(sys.argv[1:]))

# Load every region file listed in the comma-separated regionfiles string,
# keyed by its position on the command line.
allregionsDict = {}
regionFileList = regionfiles.split(',')
numRegions = len(regionFileList)
chromList = []
for regionID in range(numRegions):
    allregionsDict[regionID] = getMergedRegions(regionFileList[regionID],
                                                maxDist=mergeregion,
                                                minHits=-1,
                                                fullChrom=True,
                                                verbose=True,
                                                chromField=cField,
                                                doMerge=merging,
                                                pad=padregion)
    for achrom in allregionsDict[regionID]:
Example #3
0
    print "flipping read sense"
    flip = True

# Optional spacer override; default 2 (units not visible here -- presumably
# bases between paired tags, confirm against the consumer below this view).
spacer = 2
if '-spacer' in sys.argv:
    spacer = int(sys.argv[sys.argv.index('-spacer') + 1])

if '-strip' in sys.argv:
    stripSpace = True

# Initial values; presumably reassigned while reading the input -- confirm
# below this view.
readsize = 0
maxBorder = 0
index = 0
insertSize = 100000

# record the full command line in the run log
writeLog(outdbname + '.log', verstring, string.join(sys.argv[1:]))


def decodeMismatches(mString, rsense):
    """Convert comma-separated 'pos:genNT>readNT' mismatch entries into
    eland-style '<readNT><1basedPos><genNT>' entries, complementing both
    bases when rsense is '-'.

    NOTE(review): this block appears truncated in the current view -- no
    return statement for `output` is visible; confirm the function tail.
    """
    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C', 'N': 'N'}

    output = []
    mismatches = mString.split(',')
    for mismatch in mismatches:
        (pos, change) = mismatch.split(':')
        (genNT, readNT) = change.split('>')
        if rsense == '-':
            readNT = complement[readNT]
            genNT = complement[genNT]
        # for eland-compatibility, we are 1-based
        output.append('%s%d%s' % (readNT, int(pos) + 1, genNT))
Example #4
0
def makeRdsFromBam(label,
                   samFileName,
                   outDbName,
                   init=True,
                   doIndex=False,
                   useSamFile=False,
                   cachePages=100000,
                   maxMultiReadCount=10,
                   rnaDataType=False,
                   trimReadID=True):

    if useSamFile:
        fileMode = "r"
    else:
        fileMode = "rb"

    try:
        samfile = pysam.Samfile(samFileName, fileMode)
    except ValueError:
        print "samfile index not found"
        sys.exit(1)

    if rnaDataType:
        dataType = "RNA"
    else:
        dataType = "DNA"

    writeLog("%s.log" % outDbName, verstring, string.join(sys.argv[1:]))

    rds = readDataset(outDbName, init, dataType, verbose=True)
    if not init and doIndex:
        try:
            if rds.hasIndex():
                rds.dropIndex()
        except:
            pass

    if "sam_mapped" not in rds.getMetadata():
        rds.insertMetadata([("sam_mapped", "True")])

    defaultCacheSize = rds.getDefaultCacheSize()

    if cachePages > defaultCacheSize:
        if init:
            rds.setDBcache(cachePages, default=True)
        else:
            rds.setDBcache(cachePages)

    propertyList = []
    for arg in sys.argv:
        if "::" in arg:
            (pname, pvalue) = arg.strip().split("::")
            propertyList.append((pname, pvalue))

    if len(propertyList) > 0:
        rds.insertMetadata(propertyList)

    countReads = {
        "unmapped": 0,
        "total": 0,
        "unique": 0,
        "multi": 0,
        "multiDiscard": 0,
        "splice": 0
    }

    readsize = 0
    insertSize = 100000

    uniqueInsertList = []
    multiInsertList = []
    spliceInsertList = []

    processedEntryDict = {}
    uniqueReadDict = {}
    multiReadDict = {}
    spliceReadDict = {}

    samFileIterator = samfile.fetch(until_eof=True)

    for read in samFileIterator:
        if read.is_unmapped:
            countReads["unmapped"] += 1
            continue

        if readsize == 0:
            take = (0, 2, 3)  # CIGAR operation (M/match, D/del, N/ref_skip)
            readsize = sum([length for op, length in read.cigar if op in take])
            if init:
                rds.insertMetadata([("readsize", readsize)])

        #Build the read dictionaries
        try:
            readSequence = read.seq
        except KeyError:
            readSequence = ""

        pairReadSuffix = getPairedReadNumberSuffix(read)
        readName = "%s%s%s" % (read.qname, readSequence, pairReadSuffix)
        if trimReadID:
            rdsEntryName = "%s:%s:%d%s" % (label, read.qname,
                                           countReads["total"], pairReadSuffix)
        else:
            rdsEntryName = read.qname

        if processedEntryDict.has_key(readName):
            if isSpliceEntry(read.cigar):
                if spliceReadDict.has_key(readName):
                    del spliceReadDict[readName]
            else:
                if uniqueReadDict.has_key(readName):
                    del uniqueReadDict[readName]

                if multiReadDict.has_key(readName):
                    (read, priorCount, rdsEntryName) = multiReadDict[readName]
                    count = priorCount + 1
                    multiReadDict[readName] = (read, count, rdsEntryName)
                else:
                    multiReadDict[readName] = (read, 1, rdsEntryName)
        else:
            processedEntryDict[readName] = ""
            if isSpliceEntry(read.cigar):
                spliceReadDict[readName] = (read, rdsEntryName)
            else:
                uniqueReadDict[readName] = (read, rdsEntryName)

        if countReads["total"] % insertSize == 0:
            for entry in uniqueReadDict.keys():
                (readData, rdsEntryName) = uniqueReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                uniqueInsertList.append(
                    getRDSEntry(readData, rdsEntryName, chrom, readsize))
                countReads["unique"] += 1

            for entry in spliceReadDict.keys():
                (readData, rdsEntryName) = spliceReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                spliceInsertList.append(
                    getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
                countReads["splice"] += 1

            for entry in multiReadDict.keys():
                (readData, count, rdsEntryName) = multiReadDict[entry]
                chrom = samfile.getrname(readData.rname)
                if count > maxMultiReadCount:
                    countReads["multiDiscard"] += 1
                else:
                    multiInsertList.append(
                        getRDSEntry(readData,
                                    rdsEntryName,
                                    chrom,
                                    readsize,
                                    weight=count))
                    countReads["multi"] += 1

            rds.insertUniqs(uniqueInsertList)
            rds.insertMulti(multiInsertList)
            uniqueInsertList = []
            uniqueReadDict = {}
            multiInsertList = []
            multiReadDict = {}
            if dataType == "RNA":
                rds.insertSplices(spliceInsertList)
                spliceInsertList = []
                spliceReadDict = {}

            print ".",
            sys.stdout.flush()
            processedEntryDict = {}

        countReads["total"] += 1

    if len(uniqueReadDict.keys()) > 0:
        for entry in uniqueReadDict.keys():
            (readData, rdsEntryName) = uniqueReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            uniqueInsertList.append(
                getRDSEntry(readData, rdsEntryName, chrom, readsize))
            countReads["unique"] += 1

        rds.insertUniqs(uniqueInsertList)

    if len(multiReadDict.keys()) > 0:
        for entry in multiReadDict.keys():
            (readData, count, rdsEntryName) = multiReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            if count > maxMultiReadCount:
                countReads["multiDiscard"] += 1
            else:
                multiInsertList.append(
                    getRDSEntry(readData,
                                rdsEntryName,
                                chrom,
                                readsize,
                                weight=count))
                countReads["multi"] += 1

        countReads["multi"] += len(multiInsertList)

    if len(spliceReadDict.keys()) > 0 and dataType == "RNA":
        for entry in spliceReadDict.keys():
            (readData, rdsEntryName) = spliceReadDict[entry]
            chrom = samfile.getrname(readData.rname)
            spliceInsertList.append(
                getRDSSpliceEntry(readData, rdsEntryName, chrom, readsize))
            countReads["splice"] += 1

        rds.insertSplices(spliceInsertList)

    countString = "\n%d unmapped reads discarded" % countReads["unmapped"]
    countString += "\t%d unique reads" % countReads["unique"]
    countString += "\t%d multi reads" % countReads["multi"]
    countString += "\t%d multi reads count > %d discarded" % (
        countReads["multiDiscard"], maxMultiReadCount)
    if dataType == "RNA":
        countString += "\t%d spliced reads" % countReads["splice"]

    print countString.replace("\t", "\n")

    writeLog("%s.log" % outDbName, verstring, countString)

    if doIndex:
        print "building index...."
        if cachePages > defaultCacheSize:
            rds.setDBcache(cachePages)
            rds.buildIndex(cachePages)
        else:
            rds.buildIndex(defaultCacheSize)
Example #5
0
# Fragment of a command-line annotation/filter script: filename, dbfile,
# outfile and presumably a default for startField are defined above this view.
goodfile = open(sys.argv[4],'w')
if '-startField' in sys.argv:
    startField = int(sys.argv[sys.argv.index('-startField') + 1])
# clamp negative field indices to the first column
# NOTE(review): if startField has no default above this view, this comparison
# raises NameError when '-startField' is not given -- confirm.
if startField < 0:
    startField = 0

# sqlite page-cache size, floored at 250000 pages
cachePages = 500000
if '-cache' in sys.argv:
    cachePages =  int(sys.argv[sys.argv.index('-cache') + 1])
    if cachePages < 250000:
        cachePages = 250000

doLog = False
if '-log' in sys.argv:
    logfilename = sys.argv[sys.argv.index('-log') + 1]
    writeLog(logfilename, versionString, string.join(sys.argv[1:]))
    doLog = True

infile = open(filename)
if os.path.isfile(dbfile):
    # database present: tune sqlite caching before the lookups that
    # presumably follow below this view
    db = sqlite.connect(dbfile)
    sql = db.cursor()
    sql.execute("PRAGMA CACHE_SIZE = %d" % cachePages)
    sql.execute("PRAGMA temp_store = MEMORY")
else:
    # no database: pass the input through with 'NR' placeholder columns
    print "No database - passing through"
    if doLog:
        writeLog(logfilename, versionString, "No database - passing through")    
    for line in infile:
        outfile.write(line + '\tNR\tNR\t0.00\n')
        goodfile.write(line)
Example #6
0
#
#  recordLog.py
#  ENRAGE
#
#  Created by Ali Mortazavi on 12/14/08.
#

from commoncode import writeLog
import sys

if '-verbose' in sys.argv or len(sys.argv) < 4:
    print '%s: version 1.0' % sys.argv[0]

if len(sys.argv) < 4:
    print 'usage: python %s logFile messenger message [-verbose]' % sys.argv[0]
    sys.exit(1)

writeLog(sys.argv[1], sys.argv[2], sys.argv[3])