def main(args):
    """Merge the read-pair and BED outputs of two runs into a new output.

    Both input runs (args.inDir1 and args.inDir2) are looked up under
    args.outDirName. Their config files are read, their ``.readpairs.txt``
    and ``.reads.bed`` files are merged with mergeChrPosFiles(), and a
    combined config file is written to
    args.outDirName/args.outBaseName/args.configFileName.

    args must provide: outDirName, outBaseName, inDir1, inDir2,
    configFileName and overwrite.
    """
    outDir = args.outDirName + "/" + args.outBaseName
    inPath1 = args.outDirName + "/" + args.inDir1
    inPath2 = args.outDirName + "/" + args.inDir2

    # debugging info
    # NOTE(review): the log file is opened before prepOutDir() is called;
    # presumably the logs directory already exists at this point -- confirm.
    logfile = outDir + "/logs/%d" % os.getpid() + "." + args.outBaseName + ".mergepairs.log"
    logging.basicConfig(format='%(asctime)s %(message)s', filename=logfile, level=logging.DEBUG)
    logging.info("\ninDir1=%s\ninDir2=%s\noutBaseName=%s\nconfigFileName=%s",
                 args.inDir1, args.inDir2, args.outBaseName, args.configFileName)

    # create output directory
    discordant.prepOutDir(args.outBaseName, args.outDirName, args.overwrite)

    # make sure input sources exist
    peakparser.checkOutDir(args.inDir1, args.outDirName)
    peakparser.checkOutDir(args.inDir2, args.outDirName)

    # make sure config files exist
    configPath1 = inPath1 + "/" + args.configFileName
    configPath2 = inPath2 + "/" + args.configFileName
    discordant.checkfile(configPath1)
    discordant.checkfile(configPath2)

    # read parameters for both inputs
    configDict1 = peakparser.readConfig(configPath1, args.inDir1, args.outDirName)
    configDict2 = peakparser.readConfig(configPath2, args.inDir2, args.outDirName)

    # the merge distance and element lengths are taken from the first input only
    maxDist = int(configDict1['insertSize']) + 2 * int(configDict1['readLength'])
    eltLenDict = discordant.makeEltLenDict(configDict1['eltLenFileName'])

    # merge readfiles
    outReadFileName = outDir + "/" + args.outBaseName + ".readpairs.txt"
    readFileName1 = inPath1 + "/" + args.inDir1 + ".readpairs.txt"
    readFileName2 = inPath2 + "/" + args.inDir2 + ".readpairs.txt"
    logging.info("merging readfiles (%s, %s)", readFileName1, readFileName2)
    mergeChrPosFiles(readFileName1, readFileName2, outReadFileName, maxDist, eltLenDict)

    # merge bedfiles
    outBedFileName = outDir + "/" + args.outBaseName + ".reads.bed"
    bedFileName1 = inPath1 + "/" + args.inDir1 + ".reads.bed"
    bedFileName2 = inPath2 + "/" + args.inDir2 + ".reads.bed"
    logging.info("merging bedfiles (%s,%s)", bedFileName1, bedFileName2)
    mergeChrPosFiles(bedFileName1, bedFileName2, outBedFileName, maxDist, eltLenDict)

    # write new config file: start from the first input's settings (the same
    # dict object is updated in place) and record both BAM files separately
    configPath = outDir + "/" + args.configFileName
    configDict = configDict1
    configDict['bamFileName1'] = configDict1['bamFileName']
    configDict['bamFileName2'] = configDict2['bamFileName']
    configDict['merged'] = 'True'
    configDict['outBaseName'] = args.outBaseName
    configDict['outDirName'] = args.outDirName
    configDict['readFileName'] = outReadFileName
    del configDict['bamFileName']

    # the context manager closes the file even if a write fails
    with open(configPath, 'w') as f:
        for k, v in configDict.items():
            f.write(k + "=" + v + "\n")
def main(args):
    """Classify valid insertion calls by sample source and annotate them.

    Reads the run's config file and its ``<outBaseName>.debug.txt`` file,
    keeps the records marked ``valid=1`` (and ``olClN=0`` unless
    args.allowOverlap is set) that have at least args.minPeakSize reads and
    whose ``pME`` value is listed in args.eltFile, then sorts them into four
    groups which are each passed to annotatePos():

    * germline.tab.txt   -- more than one source
    * canceronly.tab.txt -- single source matching 'CANCER'
    * normalonly.tab.txt -- single source matching 'NORMAL'
    * other.tab.txt      -- any other single source

    args must provide: outDirName, outBaseName, configFileName, annotDir,
    eltFile, allowOverlap, minPeakSize and printout.
    """
    outDir = args.outDirName + "/" + args.outBaseName

    peakparser.checkOutDir(args.outBaseName, args.outDirName)
    configPath = outDir + "/" + args.configFileName
    discordant.checkfile(configPath)
    configDict = peakparser.readConfig(configPath, args.outBaseName, args.outDirName)

    refGenome = configDict['refGenome']
    checkAnnotDir(refGenome, args.annotDir)

    debugFile = outDir + "/" + args.outBaseName + ".debug.txt"
    discordant.checkfile(debugFile)

    # can filter on retroelement subfamilies: one name per line
    discordant.checkfile(args.eltFile)
    with open(args.eltFile, 'r') as f:
        eltNames = set(elt.strip() for elt in f)

    germlineIns = []
    cancerIns = []
    normalIns = []
    otherIns = []

    with open(debugFile, 'r') as f:
        for line in f:
            if 'valid=1' not in line:
                continue
            if not ('olClN=0' in line or args.allowOverlap):
                continue

            # each column of a record is a single-space separated key=value pair
            insdata = {}
            for c in line.rstrip().split(' '):
                (key, value) = c.split('=')
                insdata[key] = value

            if int(insdata['np']) < int(args.minPeakSize):
                continue
            if insdata['pME'] not in eltNames:
                continue

            insSum = insertionSummary()
            insSum.setPos(insdata['pos'])
            insSum.setEltPos(insdata['eltExt'])
            insSum.setBestStrand(insdata['eInv'], insdata['eS'])
            insSum.setSources(insdata['sources'])
            insSum.setClassIndex(insdata['index'])
            insSum.eltFam = insdata['pME']
            insSum.numreads = insdata['np']

            # more than one source is treated as germline; otherwise the
            # single source name decides the group
            if len(insSum.sources) > 1:
                germlineIns.append(insSum)
            elif re.search('CANCER', insSum.sources[0]):
                cancerIns.append(insSum)
            elif re.search('NORMAL', insSum.sources[0]):
                normalIns.append(insSum)
            else:
                otherIns.append(insSum)

    annotatePos(germlineIns, refGenome, args.annotDir, outDir + "/germline.tab.txt", args.printout)
    annotatePos(cancerIns, refGenome, args.annotDir, outDir + "/canceronly.tab.txt", args.printout)
    annotatePos(normalIns, refGenome, args.annotDir, outDir + "/normalonly.tab.txt", args.printout)
    annotatePos(otherIns, refGenome, args.annotDir, outDir + "/other.tab.txt", args.printout)
def main(args):
    """Group read pairs into per-element-class peaks and write the outputs.

    Reads the run's config file, sets up logging, then parses the read-pair
    file named by the config's ``readFileName`` entry. Each line becomes a
    pairinfo.Pair which is added to the current peak of its element class; a
    new peak is started whenever the peak index on the line differs from the
    last one seen for that class. Finally longOutput() and bedOutput() are
    called with the collected peaks.

    Lines with fewer than 14 whitespace-separated fields are reported as
    "bad line" on stdout and skipped.

    args must provide: outDirName, outBaseName and configFileName.
    """
    outDir = args.outDirName + "/" + args.outBaseName

    checkOutDir(args.outBaseName, args.outDirName)
    configPath = outDir + "/" + args.configFileName
    discordant.checkfile(configPath)
    configDict = readConfig(configPath, args.outBaseName, args.outDirName)

    # debugging info
    logfile = outDir + "/logs/%d" % os.getpid() + "." + args.outBaseName + ".peakparser.log"
    logging.basicConfig(format='%(asctime)s %(message)s', filename=logfile, level=logging.DEBUG)

    # log parameters (the second value is the read-pair file, so label it as such)
    logging.info("\noutBaseName=%s\nreadFileName=%s\ntabixFileName=%s\neltFileName=%s",
                 args.outBaseName, configDict['readFileName'],
                 configDict['tabixFileName'], configDict['eltFileName'])

    discordant.checkfile(configDict['readFileName'])
    discordant.checkfile(configDict['tabixFileName'])
    discordant.checkfile(configDict['eltFileName'])

    tabixFile = pysam.Tabixfile(configDict['tabixFileName'], 'r')
    eltDict = makeEltDict(configDict['eltFileName'])

    # structure for keeping element classes separate: one builder per class
    eltPeakDict = {}
    for eltClass in eltDict.values():
        eltPeakDict[eltClass] = PeakBuilder(eltClass)

    with open(configDict['readFileName']) as f:
        for line in f:
            fields = line.split()

            try:
                eltClass = fields[13].rstrip()
            except IndexError:
                # a short line cannot be turned into a Pair; report and skip it
                print("bad line: " + line)
                continue

            chrom = fields[0]
            readPos = int(fields[1])
            readStrand = fields[2]
            mateElt = fields[3]
            mateChrom = fields[4]
            mateStrand = fields[5]
            matePos = int(fields[6])
            eltStart = int(fields[7])
            eltEnd = int(fields[8])
            eltStrand = fields[9]
            eltFullLen = fields[10]
            genomeName = fields[11]
            peakIndex = fields[12]

            pair = pairinfo.Pair(chrom, readPos, readStrand, mateElt, mateChrom,
                                 mateStrand, matePos, eltStart, eltEnd, eltStrand,
                                 eltFullLen, genomeName, peakIndex, eltClass)

            builder = eltPeakDict[eltClass]
            if builder.lastPeakNum != peakIndex:
                # peak index changed: store the peak built so far, start a new one
                builder.peaks.append(builder.peak)
                builder.peak = pairinfo.Peak()
                builder.lastPeakNum = peakIndex
            builder.peak.addpair(pair)

    # NOTE(review): the peak still being built when the file ends stays in
    # builder.peak and is not appended to builder.peaks here; presumably the
    # output functions account for it -- confirm.
    logging.info("starting long output...")
    longOutput(eltPeakDict, args.outBaseName, args.outDirName, tabixFile, eltDict)

    logging.info("starting bed output...")
    bedOutput(eltPeakDict, args.outBaseName, args.outDirName,
              configDict['refGenome'], tabixFile, eltDict)
def checkAnnotDir(refGenome,annotDir):
    """Verify that the annotation directory for refGenome is usable.

    Raises IOError if annotDir/refGenome does not exist, then passes
    annotDir/refGenome/names.txt to discordant.checkfile().
    """
    genomeDir = annotDir + "/" + refGenome
    if not os.path.exists(genomeDir):
        raise IOError("cannot find genome annotation directory for " + refGenome)
    discordant.checkfile(genomeDir + "/names.txt")
def main(args):
    """Merge the read-pair files of a list of samples into one output.

    The samples come from args.sampleList if that attribute exists, otherwise
    from the file args.sampleListFile. Each non-comment entry holds four
    whitespace-separated fields: BAM file, sample sub-directory, reference
    genome and group name. For every sample the config file under
    args.outDirName/<sub-directory> is read; the samples' ``.readpairs.txt``
    files are merged with mergeChrPosFiles() and a combined config file is
    written to args.outDirName/args.outBaseName/args.configFileName.

    Entries starting with '#' and blank entries are ignored.

    Raises ValueError if the sample list contains no samples.

    args must provide: outDirName, outBaseName, configFileName, overwrite and
    either sampleList or sampleListFile.
    """
    outDir = args.outDirName + "/" + args.outBaseName

    # create output directory
    discordant.prepOutDir(args.outBaseName, args.outDirName, args.overwrite)

    try:
        sampleList = args.sampleList
    except AttributeError:
        discordant.checkfile(args.sampleListFile)
        # read everything up front so the file handle is closed promptly
        with open(args.sampleListFile, 'r') as sampleFile:
            sampleList = sampleFile.readlines()

    readFileNames = []
    bamFileNames = []
    sampleNames = []
    insertSizes = []
    readLengths = []
    eltLenDict = None
    lastConfig = None

    for sampleLine in sampleList:
        if sampleLine.startswith("#") or not sampleLine.strip():
            continue

        (sampleBam, sampleSubDir, refGenome, groupName) = sampleLine.strip().split()
        peakparser.checkOutDir(sampleSubDir, args.outDirName)

        sampleDir = args.outDirName + "/" + sampleSubDir
        configPath = sampleDir + "/" + args.configFileName
        discordant.checkfile(configPath)
        configDict = peakparser.readConfig(configPath, sampleSubDir, args.outDirName)

        # the element lengths and base config of the last sample are the ones used
        eltLenDict = discordant.makeEltLenDict(configDict['eltLenFileName'])
        lastConfig = configDict

        insertSizes.append(int(configDict['insertSize']))
        readLengths.append(int(configDict['readLength']))
        readFileNames.append(sampleDir + "/" + sampleSubDir + ".readpairs.txt")
        bamFileNames.append(configDict['bamFileName'])
        sampleNames.append(sampleSubDir)

    if not sampleNames:
        raise ValueError("no samples found in sample list")

    maxDist = max(insertSizes) + 2 * max(readLengths)

    # merge readfiles
    outReadFileName = outDir + "/" + args.outBaseName + ".readpairs.txt"
    mergeChrPosFiles(readFileNames, outReadFileName, maxDist, eltLenDict)

    # write new config file, based on the last sample's settings (that dict
    # is updated in place)
    configPath = outDir + "/" + args.configFileName
    configDict = lastConfig

    for bfnum, bamFileName in enumerate(bamFileNames):
        configDict["bamFileName" + str(bfnum)] = bamFileName

    for snum, sampleName in enumerate(sampleNames):
        configDict["sampleName" + str(snum)] = sampleName

    configDict['merged'] = 'True'
    configDict['outBaseName'] = args.outBaseName
    configDict['outDirName'] = args.outDirName
    configDict['readFileName'] = outReadFileName
    del configDict['bamFileName']

    # the context manager closes the file even if a write fails
    with open(configPath, 'w') as f:
        for k, v in configDict.items():
            f.write(k + "=" + v + "\n")