def main(args):
    """Merge the read-pair and BED files of two existing runs into a new run.

    For the two input sub-directories ``args.inDir1`` and ``args.inDir2``
    (both located under ``args.outDirName``) this reads each config file,
    merges their ``.readpairs.txt`` and ``.reads.bed`` files with
    ``mergeChrPosFiles`` and writes a combined config file into
    ``args.outDirName/args.outBaseName``.

    Attributes read from ``args``: ``outDirName``, ``outBaseName``,
    ``inDir1``, ``inDir2``, ``configFileName`` and ``overwrite``.
    """
    outDir = args.outDirName
    outBase = args.outBaseName
    mergedDir = outDir + "/" + outBase
    inDirPath1 = outDir + "/" + args.inDir1
    inDirPath2 = outDir + "/" + args.inDir2

    # debugging info
    # NOTE(review): the log file is placed inside the output directory, which
    # is only prepared by prepOutDir() further down -- confirm that the logs/
    # directory already exists when logging is configured.
    logfile = "%s/logs/%d.%s.mergepairs.log" % (mergedDir, os.getpid(), outBase)
    logging.basicConfig(format='%(asctime)s %(message)s',
                        filename=logfile,
                        level=logging.DEBUG)
    logging.info("\ninDir1=%s\ninDir2=%s\noutBaseName=%s\nconfigFileName=%s"
                 % (args.inDir1, args.inDir2, outBase, args.configFileName))

    # create output directory
    pickreads.prepOutDir(outBase, outDir, args.overwrite)

    # make sure input sources exist
    peakparser.checkOutDir(args.inDir1, outDir)
    peakparser.checkOutDir(args.inDir2, outDir)

    # make sure config files exist
    configPath1 = inDirPath1 + "/" + args.configFileName
    configPath2 = inDirPath2 + "/" + args.configFileName
    pickreads.checkfile(configPath1)
    pickreads.checkfile(configPath2)

    # read parameters for both inputs
    configDict1 = peakparser.readConfig(configPath1, args.inDir1, outDir)
    configDict2 = peakparser.readConfig(configPath2, args.inDir2, outDir)

    # NOTE(review): only the first input's insertSize/readLength are used;
    # confirm that both inputs are expected to share these settings.
    maxDist = int(configDict1['insertSize']) + 2 * int(configDict1['readLength'])

    # merge readfiles
    outReadFileName = mergedDir + "/" + outBase + ".readpairs.txt"
    readFileName1 = inDirPath1 + "/" + args.inDir1 + ".readpairs.txt"
    readFileName2 = inDirPath2 + "/" + args.inDir2 + ".readpairs.txt"
    logging.info("merging readfiles (%s, %s)" % (readFileName1, readFileName2))
    mergeChrPosFiles(readFileName1, readFileName2, outReadFileName, maxDist)

    # merge bedfiles
    outBedFileName = mergedDir + "/" + outBase + ".reads.bed"
    bedFileName1 = inDirPath1 + "/" + args.inDir1 + ".reads.bed"
    bedFileName2 = inDirPath2 + "/" + args.inDir2 + ".reads.bed"
    logging.info("merging bedfiles (%s,%s)" % (bedFileName1, bedFileName2))
    mergeChrPosFiles(bedFileName1, bedFileName2, outBedFileName, maxDist)

    # write new config file: start from the first input's settings, record
    # both BAM names under numbered keys and drop the single-BAM key
    configPath = mergedDir + "/" + args.configFileName
    configDict = configDict1
    configDict['bamFileName1'] = configDict1['bamFileName']
    configDict['bamFileName2'] = configDict2['bamFileName']
    configDict['merged'] = 'True'
    configDict['outBaseName'] = outBase
    configDict['outDirName'] = outDir
    configDict['readFileName'] = outReadFileName
    del configDict['bamFileName']

    # the context manager closes the file even if a write fails; %-formatting
    # also accepts config values that are not strings
    with open(configPath, 'w') as f:
        for k, v in configDict.items():
            f.write("%s=%s\n" % (k, v))
def main(args):
    """Merge the read-pair files of all samples in a sample list into one run.

    Every non-comment line of ``args.sampleListFile`` must hold three
    whitespace-separated columns; only the second one (the sample's
    sub-directory under ``args.outDirName``) is used here. For each sample
    the config file is read, and afterwards all ``.readpairs.txt`` files are
    merged with ``mergeChrPosFiles`` and a combined config file is written to
    ``args.outDirName/args.outBaseName``.

    Attributes read from ``args``: ``outDirName``, ``outBaseName``,
    ``overwrite``, ``sampleListFile`` and ``configFileName``.

    Raises:
        ValueError: if the sample list holds no sample lines, or a sample
            line does not have exactly three columns.
    """
    # create output directory
    pickreads.prepOutDir(args.outBaseName, args.outDirName, args.overwrite)
    pickreads.checkfile(args.sampleListFile)

    readFileNames = []
    bamFileNames = []
    insertSizes = []
    readLengths = []
    lastConfig = None

    with open(args.sampleListFile, 'r') as sampleList:
        for sampleLine in sampleList:
            # skip comment lines and (e.g. trailing) blank lines
            if sampleLine.startswith("#") or not sampleLine.strip():
                continue

            # unpacking enforces the three-column layout; only the
            # sub-directory column is needed below
            (sampleBam, sampleSubDir, refGenome) = sampleLine.strip().split()

            peakparser.checkOutDir(sampleSubDir, args.outDirName)
            sampleDir = args.outDirName + "/" + sampleSubDir
            configPath = sampleDir + "/" + args.configFileName
            pickreads.checkfile(configPath)

            configDict = peakparser.readConfig(configPath, sampleSubDir, args.outDirName)
            lastConfig = configDict
            insertSizes.append(int(configDict['insertSize']))
            readLengths.append(int(configDict['readLength']))
            readFileNames.append(sampleDir + "/" + sampleSubDir + ".readpairs.txt")
            bamFileNames.append(configDict['bamFileName'])

    if lastConfig is None:
        # without this guard max() below fails with an unhelpful message
        raise ValueError("no samples found in sample list: %s" % args.sampleListFile)

    # use the largest insert size / read length seen across all samples
    maxDist = max(insertSizes) + 2 * max(readLengths)

    # merge readfiles
    mergedDir = args.outDirName + "/" + args.outBaseName
    outReadFileName = mergedDir + "/" + args.outBaseName + ".readpairs.txt"
    mergeChrPosFiles(readFileNames, outReadFileName, maxDist)

    # write new config file: the last sample's settings are the base, every
    # BAM file gets its own numbered key (bamFileName0, bamFileName1, ...)
    configPath = mergedDir + "/" + args.configFileName
    configDict = lastConfig
    for bfnum, bamFileName in enumerate(bamFileNames):
        configDict["bamFileName" + str(bfnum)] = bamFileName
    configDict['merged'] = 'True'
    configDict['outBaseName'] = args.outBaseName
    configDict['outDirName'] = args.outDirName
    configDict['readFileName'] = outReadFileName
    del configDict['bamFileName']

    # the context manager closes the file even if a write fails; %-formatting
    # also accepts config values that are not strings
    with open(configPath, 'w') as f:
        for k, v in configDict.items():
            f.write("%s=%s\n" % (k, v))
def main(args):
    """Group the read pairs of a run into peaks per gene name and emit output.

    Reads the run's config file, opens the exon and ``args.pgTabixFile``
    tabix files, parses the tab-separated read-pair file named by the
    config's ``readFileName`` entry into ``pairinfo.Pair`` objects, collects
    them into peaks on one ``PeakBuilder`` per gene name (a new peak is
    started whenever the peak index column changes for that gene), and
    finally hands everything to ``longOutput``.

    Attributes read from ``args``: ``outBaseName``, ``outDirName``,
    ``configFileName`` and ``pgTabixFile``.
    """
    checkOutDir(args.outBaseName, args.outDirName)
    runDir = args.outDirName + "/" + args.outBaseName
    configPath = runDir + "/" + args.configFileName
    pickreads.checkfile(configPath)
    configDict = readConfig(configPath, args.outBaseName, args.outDirName)

    # debugging info
    logfile = "%s/logs/%d.%s.peakparser.log" % (runDir, os.getpid(), args.outBaseName)
    logging.basicConfig(format='%(asctime)s %(message)s',
                        filename=logfile,
                        level=logging.DEBUG)

    # log parameters (the second value is the read file, so label it as such)
    logging.info("\noutBaseName=%s\nreadFileName=%s\nexonTabixFile=%s"
                 % (args.outBaseName, configDict['readFileName'],
                    configDict['exonTabixFile']))

    pickreads.checkfile(configDict['readFileName'])
    pickreads.checkfile(configDict['exonTabixFile'])
    pickreads.checkfile(args.pgTabixFile)

    # NOTE(review): tabixFile is not used again in this function; it is kept
    # because opening it may act as a validity check -- confirm and remove.
    tabixFile = pysam.Tabixfile(configDict['exonTabixFile'], 'r')
    pgTabix = pysam.Tabixfile(args.pgTabixFile, 'r')

    # contig names of the pg tabix file, stored as dict keys
    pgTabixContigs = dict.fromkeys(pgTabix.contigs, 1)

    # structure for keeping element classes separate
    geneNameDict = {}

    with open(configDict['readFileName']) as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines instead of failing on them

            fields = line.split("\t")
            chrom = fields[0]
            readPos = int(fields[1])
            readStrand = fields[2]
            mateElt = fields[3]
            mateChrom = fields[4]
            mateStrand = fields[5]
            matePos = int(fields[6])
            eltStart = int(fields[7])
            eltEnd = int(fields[8])
            eltStrand = fields[9]
            eltFullLen = int(fields[10])
            genomeName = fields[11]
            peakIndex = fields[12]
            geneName = fields[13].strip()  # last column carries the newline

            if geneName not in geneNameDict:
                geneNameDict[geneName] = PeakBuilder(geneName)
            builder = geneNameDict[geneName]

            pair = pairinfo.Pair(chrom, readPos, readStrand, mateElt,
                                 mateChrom, mateStrand, matePos, eltStart,
                                 eltEnd, eltStrand, eltFullLen, genomeName,
                                 peakIndex, geneName)

            # a changed peak index closes the current peak and opens a new one
            if builder.lastPeakNum != peakIndex:
                builder.peaks.append(builder.peak)
                builder.peak = pairinfo.Peak()
                builder.lastPeakNum = peakIndex
            builder.peak.addpair(pair)

    # the peak still being built for each gene has not been stored yet
    for builder in geneNameDict.values():
        builder.peaks.append(builder.peak)

    logging.info("starting long output...")
    longOutput(geneNameDict, args.outBaseName, args.outDirName, pgTabix,
               pgTabixContigs)
def main(args):
    """Combine two existing runs (``args.inDir1``, ``args.inDir2``) into one.

    Both inputs live under ``args.outDirName``. Their config files are read,
    their ``.readpairs.txt`` and ``.reads.bed`` files are merged pairwise
    with ``mergeChrPosFiles``, and a config file describing the merged run
    is written to ``args.outDirName/args.outBaseName``.

    Attributes read from ``args``: ``outDirName``, ``outBaseName``,
    ``inDir1``, ``inDir2``, ``configFileName`` and ``overwrite``.
    """
    baseName = args.outBaseName
    rootDir = args.outDirName
    targetDir = rootDir + "/" + baseName
    sourceDir1 = rootDir + "/" + args.inDir1
    sourceDir2 = rootDir + "/" + args.inDir2

    # debugging info
    # NOTE(review): logging is set up before prepOutDir() prepares the output
    # directory that holds the log file -- confirm logs/ exists by now.
    logfile = "%s/logs/%d.%s.mergepairs.log" % (targetDir, os.getpid(), baseName)
    logging.basicConfig(format='%(asctime)s %(message)s',
                        filename=logfile,
                        level=logging.DEBUG)
    logging.info(
        "\ninDir1=%s\ninDir2=%s\noutBaseName=%s\nconfigFileName=%s" %
        (args.inDir1, args.inDir2, baseName, args.configFileName))

    # create output directory
    pickreads.prepOutDir(baseName, rootDir, args.overwrite)

    # make sure input sources exist
    peakparser.checkOutDir(args.inDir1, rootDir)
    peakparser.checkOutDir(args.inDir2, rootDir)

    # make sure config files exist
    configPath1 = sourceDir1 + "/" + args.configFileName
    configPath2 = sourceDir2 + "/" + args.configFileName
    pickreads.checkfile(configPath1)
    pickreads.checkfile(configPath2)

    # read parameters for both inputs
    configDict1 = peakparser.readConfig(configPath1, args.inDir1, rootDir)
    configDict2 = peakparser.readConfig(configPath2, args.inDir2, rootDir)

    # NOTE(review): the merge distance is derived from the first input only;
    # confirm the second input is meant to share insertSize/readLength.
    maxDist = int(configDict1['insertSize']) + 2 * int(configDict1['readLength'])

    # merge readfiles
    outReadFileName = targetDir + "/" + baseName + ".readpairs.txt"
    readFileName1 = sourceDir1 + "/" + args.inDir1 + ".readpairs.txt"
    readFileName2 = sourceDir2 + "/" + args.inDir2 + ".readpairs.txt"
    logging.info("merging readfiles (%s, %s)" % (readFileName1, readFileName2))
    mergeChrPosFiles(readFileName1, readFileName2, outReadFileName, maxDist)

    # merge bedfiles
    outBedFileName = targetDir + "/" + baseName + ".reads.bed"
    bedFileName1 = sourceDir1 + "/" + args.inDir1 + ".reads.bed"
    bedFileName2 = sourceDir2 + "/" + args.inDir2 + ".reads.bed"
    logging.info("merging bedfiles (%s,%s)" % (bedFileName1, bedFileName2))
    mergeChrPosFiles(bedFileName1, bedFileName2, outBedFileName, maxDist)

    # write new config file: the first input's settings form the base, both
    # BAM names are kept under numbered keys and the single-BAM key is removed
    configPath = targetDir + "/" + args.configFileName
    configDict = configDict1
    configDict['bamFileName1'] = configDict1['bamFileName']
    configDict['bamFileName2'] = configDict2['bamFileName']
    configDict['merged'] = 'True'
    configDict['outBaseName'] = baseName
    configDict['outDirName'] = rootDir
    configDict['readFileName'] = outReadFileName
    del configDict['bamFileName']

    # "with" guarantees the handle is closed on error; %-formatting keeps
    # non-string config values from raising TypeError
    with open(configPath, 'w') as configFile:
        for key, value in configDict.items():
            configFile.write("%s=%s\n" % (key, value))
def main(args):
    """Build per-gene peaks from a run's read-pair file and write long output.

    Steps, in order: check the run directory and read its config file, set
    up the log file, open the exon tabix file and ``args.pgTabixFile``,
    parse each tab-separated line of the config's ``readFileName`` into a
    ``pairinfo.Pair``, add it to the current peak of that gene's
    ``PeakBuilder`` (starting a new peak when the peak index column
    changes), then call ``longOutput`` with the collected builders.

    Attributes read from ``args``: ``outBaseName``, ``outDirName``,
    ``configFileName`` and ``pgTabixFile``.
    """
    checkOutDir(args.outBaseName, args.outDirName)
    baseDir = args.outDirName + "/" + args.outBaseName
    configPath = baseDir + "/" + args.configFileName
    pickreads.checkfile(configPath)
    configDict = readConfig(configPath, args.outBaseName, args.outDirName)

    readFileName = configDict['readFileName']
    exonTabixFileName = configDict['exonTabixFile']

    # debugging info
    logfile = "%s/logs/%d.%s.peakparser.log" % (baseDir, os.getpid(), args.outBaseName)
    logging.basicConfig(format='%(asctime)s %(message)s',
                        filename=logfile,
                        level=logging.DEBUG)

    # log parameters (label corrected: the value logged is the read file name)
    logging.info("\noutBaseName=%s\nreadFileName=%s\nexonTabixFile=%s"
                 % (args.outBaseName, readFileName, exonTabixFileName))

    pickreads.checkfile(readFileName)
    pickreads.checkfile(exonTabixFileName)
    pickreads.checkfile(args.pgTabixFile)

    # NOTE(review): tabixFile is never referenced below; left in place since
    # opening the file may serve as a sanity check -- confirm before removing.
    tabixFile = pysam.Tabixfile(exonTabixFileName, 'r')
    pgTabix = pysam.Tabixfile(args.pgTabixFile, 'r')

    # contig names of the pg tabix file, stored as dict keys
    pgTabixContigs = {}
    for contig in pgTabix.contigs:
        pgTabixContigs[contig] = 1

    # structure for keeping element classes separate
    geneNameDict = {}

    with open(readFileName) as readFile:
        for line in readFile:
            if not line.strip():
                continue  # blank lines would otherwise raise IndexError

            fields = line.split("\t")
            chrom = fields[0]
            readPos = int(fields[1])
            readStrand = fields[2]
            mateElt = fields[3]
            mateChrom = fields[4]
            mateStrand = fields[5]
            matePos = int(fields[6])
            eltStart = int(fields[7])
            eltEnd = int(fields[8])
            eltStrand = fields[9]
            eltFullLen = int(fields[10])
            genomeName = fields[11]
            peakIndex = fields[12]
            geneName = fields[13].strip()  # strip the line's trailing newline

            if geneName not in geneNameDict:
                geneNameDict[geneName] = PeakBuilder(geneName)
            geneBuilder = geneNameDict[geneName]

            pair = pairinfo.Pair(chrom, readPos, readStrand, mateElt, mateChrom,
                                 mateStrand, matePos, eltStart, eltEnd, eltStrand,
                                 eltFullLen, genomeName, peakIndex, geneName)

            # peak index changed for this gene: store the finished peak and
            # begin a fresh one before adding the pair
            if geneBuilder.lastPeakNum != peakIndex:
                geneBuilder.peaks.append(geneBuilder.peak)
                geneBuilder.peak = pairinfo.Peak()
                geneBuilder.lastPeakNum = peakIndex
            geneBuilder.peak.addpair(pair)

    # store the peak that was still open for each gene when the file ended
    for geneBuilder in geneNameDict.values():
        geneBuilder.peaks.append(geneBuilder.peak)

    logging.info("starting long output...")
    longOutput(geneNameDict, args.outBaseName, args.outDirName, pgTabix, pgTabixContigs)