def main(args):
    """Merge the read-pair and BED files of two input runs into a new output.

    Attributes read from ``args``: ``outDirName``, ``outBaseName``,
    ``inDir1``, ``inDir2``, ``configFileName`` and ``overwrite``.

    The function merges ``<inDir>.readpairs.txt`` and ``<inDir>.reads.bed``
    from both input sub-directories with ``mergeChrPosFiles`` and then writes
    a ``key=value`` config file for the merged output, based on the first
    input's config with the keys ``bamFileName1``, ``bamFileName2``,
    ``merged``, ``outBaseName``, ``outDirName`` and ``readFileName`` set and
    the original ``bamFileName`` key removed.
    """
    outPrefix = args.outDirName + "/" + args.outBaseName
    inPrefix1 = args.outDirName + "/" + args.inDir1
    inPrefix2 = args.outDirName + "/" + args.inDir2

    # debugging info
    logfile = "%s/logs/%d.%s.mergepairs.log" % (outPrefix, os.getpid(), args.outBaseName)
    logging.basicConfig(format='%(asctime)s %(message)s', filename=logfile, level=logging.DEBUG)

    logging.info("\ninDir1=%s\ninDir2=%s\noutBaseName=%s\nconfigFileName=%s"
                 % (args.inDir1, args.inDir2, args.outBaseName, args.configFileName))

    # create output directory
    discordant.prepOutDir(args.outBaseName, args.outDirName, args.overwrite)

    # make sure input sources exist
    peakparser.checkOutDir(args.inDir1, args.outDirName)
    peakparser.checkOutDir(args.inDir2, args.outDirName)

    # make sure config files exist
    configPath1 = inPrefix1 + "/" + args.configFileName
    configPath2 = inPrefix2 + "/" + args.configFileName
    discordant.checkfile(configPath1)
    discordant.checkfile(configPath2)

    # read parameters for both inputs
    configDict1 = peakparser.readConfig(configPath1, args.inDir1, args.outDirName)
    configDict2 = peakparser.readConfig(configPath2, args.inDir2, args.outDirName)

    # NOTE(review): only the first input's insertSize/readLength/eltLenFileName
    # are used here; confirm that both inputs are expected to share them.
    maxDist = int(configDict1['insertSize']) + 2 * int(configDict1['readLength'])
    eltLenDict = discordant.makeEltLenDict(configDict1['eltLenFileName'])

    # merge readfiles
    outReadFileName = outPrefix + "/" + args.outBaseName + ".readpairs.txt"
    readFileName1 = inPrefix1 + "/" + args.inDir1 + ".readpairs.txt"
    readFileName2 = inPrefix2 + "/" + args.inDir2 + ".readpairs.txt"
    logging.info("merging readfiles (%s, %s)" % (readFileName1, readFileName2))
    mergeChrPosFiles(readFileName1, readFileName2, outReadFileName, maxDist, eltLenDict)

    # merge bedfiles
    outBedFileName = outPrefix + "/" + args.outBaseName + ".reads.bed"
    bedFileName1 = inPrefix1 + "/" + args.inDir1 + ".reads.bed"
    bedFileName2 = inPrefix2 + "/" + args.inDir2 + ".reads.bed"
    logging.info("merging bedfiles (%s,%s)" % (bedFileName1, bedFileName2))
    mergeChrPosFiles(bedFileName1, bedFileName2, outBedFileName, maxDist, eltLenDict)

    # write new config file (the first input's config dict is reused and
    # modified in place)
    configPath = outPrefix + "/" + args.configFileName
    configDict = configDict1
    configDict['bamFileName1'] = configDict1['bamFileName']
    configDict['bamFileName2'] = configDict2['bamFileName']
    configDict['merged'] = 'True'
    configDict['outBaseName'] = args.outBaseName
    configDict['outDirName'] = args.outDirName
    configDict['readFileName'] = outReadFileName

    del configDict['bamFileName']

    # 'with' guarantees the handle is closed even if a write fails
    with open(configPath, 'w') as configFile:
        for key, value in configDict.items():
            configFile.write(key + "=" + value + "\n")
Exemple #2
0
def main(args):
    """Group valid insertion calls from the debug file by source and annotate them.

    Attributes read from ``args``: ``outDirName``, ``outBaseName``,
    ``configFileName``, ``annotDir``, ``eltFile``, ``allowOverlap``,
    ``minPeakSize`` and ``printout``.

    Each line of ``<outBaseName>.debug.txt`` is a space-separated list of
    ``key=value`` columns.  A line is used when it contains ``valid=1`` and
    either contains ``olClN=0`` or ``args.allowOverlap`` is true, its ``np``
    value is at least ``args.minPeakSize``, and its ``pME`` value is listed
    in ``args.eltFile`` (one name per line).

    Accepted insertions are split into four lists that are each passed to
    ``annotatePos``: more than one source (germline.tab.txt), a single source
    containing 'CANCER' (canceronly.tab.txt), a single source containing
    'NORMAL' (normalonly.tab.txt), and everything else (other.tab.txt).
    """
    peakparser.checkOutDir(args.outBaseName, args.outDirName)

    outPrefix = args.outDirName + "/" + args.outBaseName
    configPath = outPrefix + "/" + args.configFileName
    discordant.checkfile(configPath)
    configDict = peakparser.readConfig(configPath, args.outBaseName, args.outDirName)

    refGenome = configDict['refGenome']
    checkAnnotDir(refGenome, args.annotDir)

    debugFile = outPrefix + "/" + args.outBaseName + ".debug.txt"
    discordant.checkfile(debugFile)

    # can filter on retroelement subfamilies
    discordant.checkfile(args.eltFile)
    with open(args.eltFile, 'r') as eltHandle:
        allowedElts = set(elt.strip() for elt in eltHandle)

    minPeakSize = int(args.minPeakSize)

    germlineIns = []
    cancerIns = []
    normalIns = []
    otherIns = []

    with open(debugFile, 'r') as debugHandle:
        for line in debugHandle:
            # the patterns are plain literals, so substring tests are
            # equivalent to the regex searches they replace
            if 'valid=1' not in line:
                continue
            if 'olClN=0' not in line and not args.allowOverlap:
                continue

            insdata = {}
            for column in line.rstrip().rsplit(' '):
                # every column must be exactly one key=value pair
                (key, value) = column.rsplit('=')
                insdata[key] = value

            if int(insdata['np']) < minPeakSize or insdata['pME'] not in allowedElts:
                continue

            insSum = insertionSummary()
            insSum.setPos(insdata['pos'])
            insSum.setEltPos(insdata['eltExt'])
            insSum.setBestStrand(insdata['eInv'], insdata['eS'])
            insSum.setSources(insdata['sources'])
            insSum.setClassIndex(insdata['index'])
            insSum.eltFam = insdata['pME']
            insSum.numreads = insdata['np']

            if len(insSum.sources) > 1:
                germlineIns.append(insSum)
            elif 'CANCER' in insSum.sources[0]:
                cancerIns.append(insSum)
            elif 'NORMAL' in insSum.sources[0]:
                normalIns.append(insSum)
            else:
                otherIns.append(insSum)

    outputs = (
        (germlineIns, "germline.tab.txt"),
        (cancerIns, "canceronly.tab.txt"),
        (normalIns, "normalonly.tab.txt"),
        (otherIns, "other.tab.txt"),
    )
    for insList, outName in outputs:
        annotatePos(insList, refGenome, args.annotDir, outPrefix + "/" + outName, args.printout)
def main(args):
    """Group read pairs into peaks per element class and write the outputs.

    Attributes read from ``args``: ``outDirName``, ``outBaseName`` and
    ``configFileName``.  The config must provide ``readFileName``,
    ``tabixFileName``, ``eltFileName`` and ``refGenome``.

    Each line of the read file is split on whitespace into at least 14
    fields (field 13 is the element class).  Consecutive lines of the same
    class that share a peak index (field 12) are added to the same
    ``pairinfo.Peak``; a change of peak index appends the current peak to the
    class's ``PeakBuilder.peaks`` list and starts a new one.  The results are
    handed to ``longOutput`` and ``bedOutput``.
    """
    checkOutDir(args.outBaseName, args.outDirName)
    outPrefix = args.outDirName + "/" + args.outBaseName
    configPath = outPrefix + "/" + args.configFileName
    discordant.checkfile(configPath)
    configDict = readConfig(configPath, args.outBaseName, args.outDirName)

    # debugging info
    logfile = "%s/logs/%d.%s.peakparser.log" % (outPrefix, os.getpid(), args.outBaseName)
    logging.basicConfig(format='%(asctime)s %(message)s', filename=logfile, level=logging.DEBUG)

    # log parameters
    # NOTE(review): the "bamFileName" label is filled with readFileName.
    logging.info("\noutBaseName=%s\nbamFileName=%s\ntabixFileName=%s\neltFileName=%s"
                 % (args.outBaseName, configDict['readFileName'], configDict['tabixFileName'],
                    configDict['eltFileName']))

    discordant.checkfile(configDict['readFileName'])
    discordant.checkfile(configDict['tabixFileName'])
    discordant.checkfile(configDict['eltFileName'])

    tabixFile = pysam.Tabixfile(configDict['tabixFileName'], 'r')
    eltDict = makeEltDict(configDict['eltFileName'])

    # structure for keeping element classes separate
    eltPeakDict = {}
    for eltClass in eltDict.values():
        eltPeakDict[eltClass] = PeakBuilder(eltClass)

    with open(configDict['readFileName']) as readFile:
        for line in readFile:
            fields = line.rsplit()
            try:
                eltClass = fields[13].rstrip()
            except IndexError:
                # Too few columns: report the line and carry on with an
                # empty class, exactly as before (later lookups will then
                # fail unless '' is a known class).
                print("bad line: " + line)
                eltClass = ''
            chrom      = fields[0]
            readPos    = int(fields[1])
            readStrand = fields[2]
            mateElt    = fields[3]
            mateChrom  = fields[4]
            mateStrand = fields[5]
            matePos    = int(fields[6])
            eltStart   = int(fields[7])
            eltEnd     = int(fields[8])
            eltStrand  = fields[9]
            eltFullLen = fields[10]
            genomeName = fields[11]
            peakIndex  = fields[12]

            pair = pairinfo.Pair(chrom, readPos, readStrand, mateElt, mateChrom,
                                 mateStrand, matePos, eltStart, eltEnd, eltStrand,
                                 eltFullLen, genomeName, peakIndex, eltClass)

            builder = eltPeakDict[eltClass]
            if builder.lastPeakNum != peakIndex:
                # peak index changed: store the current peak, start a new one
                builder.peaks.append(builder.peak)
                builder.peak = pairinfo.Peak()
                builder.lastPeakNum = peakIndex
            builder.peak.addpair(pair)

    logging.info("starting long output...")
    longOutput(eltPeakDict, args.outBaseName, args.outDirName, tabixFile, eltDict)

    logging.info("starting bed output...")
    bedOutput(eltPeakDict, args.outBaseName, args.outDirName, configDict['refGenome'], tabixFile, eltDict)
Exemple #4
0
def checkAnnotDir(refGenome,annotDir):
    """Check that the annotation directory for a reference genome is usable.

    Raises IOError when ``annotDir + "/" + refGenome`` does not exist.
    Otherwise passes ``names.txt`` inside that directory to
    ``discordant.checkfile``.  Returns None.
    """
    genomeDir = annotDir + "/" + refGenome
    if not os.path.exists(genomeDir):
        raise IOError("cannot find genome annotation directory for " + refGenome)
    discordant.checkfile(genomeDir + "/names.txt")
Exemple #5
0
def main(args):
    """Merge the read-pair files of all listed samples into one output.

    Attributes read from ``args``: ``outDirName``, ``outBaseName``,
    ``overwrite``, ``configFileName`` and either ``sampleList`` (an iterable
    of sample lines) or, when that attribute is absent, ``sampleListFile``
    (path to a file of sample lines).

    A sample line has four whitespace-separated fields; the second is the
    sample's sub-directory under ``outDirName``.  Lines starting with ``#``
    and blank lines are skipped.

    The merged ``<outBaseName>.readpairs.txt`` is produced by
    ``mergeChrPosFiles`` and a ``key=value`` config file is written, based on
    the last sample's config with ``bamFileName<i>`` / ``sampleName<i>``
    entries for every sample and the single ``bamFileName`` key removed.

    Raises ValueError when the sample list contains no samples.
    """
    # create output directory
    discordant.prepOutDir(args.outBaseName, args.outDirName, args.overwrite)

    try:
        sampleList = args.sampleList
    except AttributeError:
        discordant.checkfile(args.sampleListFile)
        # read everything up front so the handle is closed right away
        with open(args.sampleListFile, 'r') as sampleFile:
            sampleList = sampleFile.readlines()

    readFileNames = []
    bamFileNames  = []
    sampleNames   = []
    insertSizes   = []
    readLengths   = []
    eltLenDict    = None
    lastConfig    = None

    for sampleLine in sampleList:
        if sampleLine.startswith("#") or not sampleLine.strip():
            continue

        # only the sub-directory is used; the unpack still enforces that
        # the line has exactly four fields
        (_sampleBam, sampleSubDir, _refGenome, _groupName) = sampleLine.strip().split()
        peakparser.checkOutDir(sampleSubDir, args.outDirName)
        configPath = args.outDirName + "/" + sampleSubDir + "/" + args.configFileName

        discordant.checkfile(configPath)
        configDict = peakparser.readConfig(configPath, sampleSubDir, args.outDirName)
        eltLenDict = discordant.makeEltLenDict(configDict['eltLenFileName'])
        lastConfig = configDict

        insertSizes.append(int(configDict['insertSize']))
        readLengths.append(int(configDict['readLength']))

        readFileNames.append(args.outDirName + "/" + sampleSubDir + "/" + sampleSubDir + ".readpairs.txt")
        bamFileNames.append(configDict['bamFileName'])
        sampleNames.append(sampleSubDir)

    if not sampleNames:
        # max() on an empty list would raise the same exception type with a
        # much less helpful message
        raise ValueError("no samples found in sample list")

    maxDist = max(insertSizes) + 2 * max(readLengths)

    # merge readfiles
    outPrefix = args.outDirName + "/" + args.outBaseName
    outReadFileName = outPrefix + "/" + args.outBaseName + ".readpairs.txt"
    mergeChrPosFiles(readFileNames, outReadFileName, maxDist, eltLenDict)

    # write new config file (the last sample's config dict is reused and
    # modified in place)
    configPath = outPrefix + "/" + args.configFileName
    configDict = lastConfig

    assert len(bamFileNames) == len(sampleNames)

    for index, bamFileName in enumerate(bamFileNames):
        configDict["bamFileName" + str(index)] = bamFileName

    for index, sampleName in enumerate(sampleNames):
        configDict["sampleName" + str(index)] = sampleName

    configDict['merged'] = 'True'
    configDict['outBaseName'] = args.outBaseName
    configDict['outDirName'] = args.outDirName
    configDict['readFileName'] = outReadFileName

    del configDict['bamFileName']

    # 'with' guarantees the handle is closed even if a write fails
    with open(configPath, 'w') as configFile:
        for key, value in configDict.items():
            configFile.write(key + "=" + value + "\n")