def main(args):
    """Merge the read-pair and BED files of two input runs into a new run.

    Both inputs (``args.inDir1`` and ``args.inDir2``) and the output
    (``args.outBaseName``) are sub-directories of ``args.outDirName``.  The
    ``.readpairs.txt`` and ``.reads.bed`` files of the inputs are merged with
    ``mergeChrPosFiles`` and a combined config file is written to the output
    directory.

    Attributes read from ``args``: outDirName, outBaseName, inDir1, inDir2,
    configFileName, overwrite.
    """
    outDir = args.outDirName + "/" + args.outBaseName
    inPath1 = args.outDirName + "/" + args.inDir1
    inPath2 = args.outDirName + "/" + args.inDir2

    # debugging info
    logfile = "%s/logs/%d.%s.mergepairs.log" % (outDir, os.getpid(),
                                                args.outBaseName)
    logging.basicConfig(format='%(asctime)s %(message)s', filename=logfile,
                        level=logging.DEBUG)
    logging.info("\ninDir1=%s\ninDir2=%s\noutBaseName=%s\nconfigFileName=%s",
                 args.inDir1, args.inDir2, args.outBaseName,
                 args.configFileName)

    # create output directory
    discordant.prepOutDir(args.outBaseName, args.outDirName, args.overwrite)

    # make sure input sources exist
    peakparser.checkOutDir(args.inDir1, args.outDirName)
    peakparser.checkOutDir(args.inDir2, args.outDirName)

    # make sure config files exist
    configPath1 = inPath1 + "/" + args.configFileName
    configPath2 = inPath2 + "/" + args.configFileName
    discordant.checkfile(configPath1)
    discordant.checkfile(configPath2)

    # read parameters for both inputs
    configDict1 = peakparser.readConfig(configPath1, args.inDir1,
                                        args.outDirName)
    configDict2 = peakparser.readConfig(configPath2, args.inDir2,
                                        args.outDirName)
    # NOTE(review): only the first input's insert size / read length and
    # element-length file are used here -- confirm that is intended.
    maxDist = int(configDict1['insertSize']) + 2 * int(configDict1['readLength'])
    eltLenDict = discordant.makeEltLenDict(configDict1['eltLenFileName'])

    # merge readfiles
    outReadFileName = outDir + "/" + args.outBaseName + ".readpairs.txt"
    readFileName1 = inPath1 + "/" + args.inDir1 + ".readpairs.txt"
    readFileName2 = inPath2 + "/" + args.inDir2 + ".readpairs.txt"
    logging.info("merging readfiles (%s, %s)", readFileName1, readFileName2)
    mergeChrPosFiles(readFileName1, readFileName2, outReadFileName, maxDist,
                     eltLenDict)

    # merge bedfiles
    outBedFileName = outDir + "/" + args.outBaseName + ".reads.bed"
    bedFileName1 = inPath1 + "/" + args.inDir1 + ".reads.bed"
    bedFileName2 = inPath2 + "/" + args.inDir2 + ".reads.bed"
    logging.info("merging bedfiles (%s,%s)", bedFileName1, bedFileName2)
    mergeChrPosFiles(bedFileName1, bedFileName2, outBedFileName, maxDist,
                     eltLenDict)

    # write new config file: start from the first input's config (modified in
    # place) and replace its single bam entry with one entry per input
    configPath = outDir + "/" + args.configFileName
    configDict = configDict1
    configDict['bamFileName1'] = configDict1['bamFileName']
    configDict['bamFileName2'] = configDict2['bamFileName']
    configDict['merged'] = 'True'
    configDict['outBaseName'] = args.outBaseName
    configDict['outDirName'] = args.outDirName
    configDict['readFileName'] = outReadFileName
    del configDict['bamFileName']

    # 'with' guarantees the handle is closed even if a write fails
    with open(configPath, 'w') as configOut:
        for key, value in configDict.items():
            configOut.write(key + "=" + value + "\n")
def main(args):
    """Merge the read-pair files of every sample in a sample list.

    ``args.sampleListFile`` holds one sample per line as three
    whitespace-separated fields (bam file, sample sub-directory, reference
    genome).  Lines starting with ``#`` and blank lines are skipped.  Each
    sample's config is read from its sub-directory of ``args.outDirName``;
    the read-pair files are merged with ``mergeChrPosFiles`` and a combined
    config file is written to the ``args.outBaseName`` sub-directory.

    Attributes read from ``args``: outDirName, outBaseName, sampleListFile,
    configFileName, overwrite.

    Raises:
        ValueError: if a sample line does not have exactly three fields, or
            the sample list contains no samples.
    """
    # create output directory
    pickreads.prepOutDir(args.outBaseName, args.outDirName, args.overwrite)
    pickreads.checkfile(args.sampleListFile)

    readFileNames = []
    bamFileNames = []
    insertSizes = []
    readLengths = []
    lastConfig = None

    # 'with' closes the sample list, which was previously left open
    with open(args.sampleListFile, 'r') as sampleList:
        for sampleLine in sampleList:
            if sampleLine.startswith("#") or not sampleLine.strip():
                continue
            # only the sub-directory is used, but unpacking all three fields
            # still rejects malformed lines
            (_sampleBam, sampleSubDir, _refGenome) = sampleLine.strip().split()
            peakparser.checkOutDir(sampleSubDir, args.outDirName)

            sampleDir = args.outDirName + "/" + sampleSubDir
            configPath = sampleDir + "/" + args.configFileName
            pickreads.checkfile(configPath)
            configDict = peakparser.readConfig(configPath, sampleSubDir,
                                               args.outDirName)
            lastConfig = configDict

            insertSizes.append(int(configDict['insertSize']))
            readLengths.append(int(configDict['readLength']))
            readFileNames.append(sampleDir + "/" + sampleSubDir + ".readpairs.txt")
            bamFileNames.append(configDict['bamFileName'])

    if lastConfig is None:
        raise ValueError("no samples found in " + args.sampleListFile)

    maxDist = max(insertSizes) + 2 * max(readLengths)

    # merge readfiles
    outDir = args.outDirName + "/" + args.outBaseName
    outReadFileName = outDir + "/" + args.outBaseName + ".readpairs.txt"
    mergeChrPosFiles(readFileNames, outReadFileName, maxDist)

    # write new config file, based on the last sample's config with one
    # numbered bam entry per sample
    configPath = outDir + "/" + args.configFileName
    configDict = lastConfig
    for bfnum, bamFileName in enumerate(bamFileNames):
        configDict["bamFileName" + str(bfnum)] = bamFileName
    configDict['merged'] = 'True'
    configDict['outBaseName'] = args.outBaseName
    configDict['outDirName'] = args.outDirName
    configDict['readFileName'] = outReadFileName
    del configDict['bamFileName']

    with open(configPath, 'w') as configOut:
        for key, value in configDict.items():
            configOut.write(key + "=" + value + "\n")
def main(args):
    """Classify valid insertion calls from the debug file and annotate them.

    Lines of ``<outBaseName>.debug.txt`` are space-separated ``key=value``
    columns.  A line is kept when it contains ``valid=1``, contains
    ``olClN=0`` (unless ``args.allowOverlap`` is set), has at least
    ``args.minPeakSize`` reads (``np``) and its ``pME`` value is listed in
    ``args.eltFile``.  Kept calls are split into germline (more than one
    source), cancer-only, normal-only and other, and each group is passed to
    ``annotatePos`` with its own output file.

    Attributes read from ``args``: outDirName, outBaseName, configFileName,
    annotDir, eltFile, allowOverlap, minPeakSize, printout.
    """
    peakparser.checkOutDir(args.outBaseName, args.outDirName)
    outDir = args.outDirName + "/" + args.outBaseName
    configPath = outDir + "/" + args.configFileName
    discordant.checkfile(configPath)
    configDict = peakparser.readConfig(configPath, args.outBaseName,
                                       args.outDirName)
    refGenome = configDict['refGenome']
    checkAnnotDir(refGenome, args.annotDir)

    debugFile = outDir + "/" + args.outBaseName + ".debug.txt"
    discordant.checkfile(debugFile)

    # can filter on retroelement subfamilies
    discordant.checkfile(args.eltFile)
    with open(args.eltFile, 'r') as eltIn:
        eltNames = set(elt.strip() for elt in eltIn)

    minPeakSize = int(args.minPeakSize)

    germlineIns = []
    cancerIns = []
    normalIns = []
    otherIns = []

    with open(debugFile, 'r') as debugIn:
        for line in debugIn:  # loop over lines
            if 'valid=1' not in line:
                continue
            if not ('olClN=0' in line or args.allowOverlap):
                continue

            insdata = {}
            for c in line.rstrip().split(' '):  # loop over columns
                (key, value) = c.split('=')
                insdata[key] = value

            if int(insdata['np']) < minPeakSize or insdata['pME'] not in eltNames:
                continue

            insSum = insertionSummary()
            insSum.setPos(insdata['pos'])
            insSum.setEltPos(insdata['eltExt'])
            insSum.setBestStrand(insdata['eInv'], insdata['eS'])
            insSum.setSources(insdata['sources'])
            insSum.setClassIndex(insdata['index'])
            insSum.eltFam = insdata['pME']
            insSum.numreads = insdata['np']

            # more than one source -> germline; otherwise classify by the
            # name of the single source
            if len(insSum.sources) > 1:
                germlineIns.append(insSum)
            elif 'CANCER' in insSum.sources[0]:
                cancerIns.append(insSum)
            elif 'NORMAL' in insSum.sources[0]:
                normalIns.append(insSum)
            else:
                otherIns.append(insSum)

    annotatePos(germlineIns, refGenome, args.annotDir,
                outDir + "/germline.tab.txt", args.printout)
    annotatePos(cancerIns, refGenome, args.annotDir,
                outDir + "/canceronly.tab.txt", args.printout)
    annotatePos(normalIns, refGenome, args.annotDir,
                outDir + "/normalonly.tab.txt", args.printout)
    annotatePos(otherIns, refGenome, args.annotDir,
                outDir + "/other.tab.txt", args.printout)
def main(args):
    """Classify valid insertion calls from the debug file and annotate them.

    Lines of ``<outBaseName>.debug.txt`` are space-separated ``key=value``
    columns.  A line is kept when it contains ``valid=1``, has at least the
    minimum number of reads (``np``) and an ``nEx`` value greater than 1.
    Kept calls are split into germline (more than one source), cancer-only,
    normal-only and uncategorized, and each group is passed to
    ``annotatePos`` with its own output file.

    Attributes read from ``args``: outDirName, outBaseName, configFileName,
    annotDir, printout, and optionally minPeakSize (minimum ``np``; defaults
    to 8 when absent or None).
    """
    peakparser.checkOutDir(args.outBaseName, args.outDirName)
    outDir = args.outDirName + "/" + args.outBaseName
    configPath = outDir + "/" + args.configFileName
    checkfile(configPath)
    configDict = peakparser.readConfig(configPath, args.outBaseName,
                                       args.outDirName)
    refGenome = configDict['refGenome']
    checkAnnotDir(refGenome, args.annotDir)

    debugFile = outDir + "/" + args.outBaseName + ".debug.txt"
    checkfile(debugFile)

    # the threshold used to be a hard-coded 8; it stays the default so
    # callers that do not define args.minPeakSize are unaffected
    minPeakSize = getattr(args, 'minPeakSize', None)
    minPeakSize = 8 if minPeakSize is None else int(minPeakSize)

    germlineIns = []
    cancerIns = []
    normalIns = []
    uncatIns = []

    # 'with' closes the debug file even if a line fails to parse
    with open(debugFile, 'r') as debugIn:
        for line in debugIn:  # loop over lines
            if 'valid=1' not in line:
                continue

            insdata = {}
            for c in line.rstrip().split(' '):  # loop over columns
                (key, value) = c.split('=')
                insdata[key] = value

            if int(insdata['np']) < minPeakSize or int(insdata['nEx']) <= 1:
                continue

            insSum = insertionSummary()
            insSum.setPos(insdata['pos'])
            insSum.setEltPos(insdata['eltExt'])
            insSum.setBestStrand(insdata['eInv'], insdata['eS'])
            insSum.setSources(insdata['sources'])
            insSum.index = insdata['index']
            insSum.priGene = insdata['pGene']
            insSum.exonStr = insdata['exT']
            insSum.eltFam = insdata['pME']
            insSum.numreads = insdata['np']
            insSum.pseudo = insdata['pgO']

            # more than one source -> germline; otherwise classify by the
            # name of the single source
            if len(insSum.sources) > 1:
                germlineIns.append(insSum)
            elif 'CANCER' in insSum.sources[0]:
                cancerIns.append(insSum)
            elif 'NORMAL' in insSum.sources[0]:
                normalIns.append(insSum)
            else:
                uncatIns.append(insSum)

    annotatePos(germlineIns, refGenome, outDir + "/germline.tab.txt",
                args.annotDir, args.printout)
    annotatePos(cancerIns, refGenome, outDir + "/canceronly.tab.txt",
                args.annotDir, args.printout)
    annotatePos(normalIns, refGenome, outDir + "/normalonly.tab.txt",
                args.annotDir, args.printout)
    annotatePos(uncatIns, refGenome, outDir + "/uncategorized.tab.txt",
                args.annotDir, args.printout)
def main(args):
    """Merge the read-pair and BED files of two input runs into a new run.

    Both inputs (``args.inDir1`` and ``args.inDir2``) and the output
    (``args.outBaseName``) are sub-directories of ``args.outDirName``.  The
    ``.readpairs.txt`` and ``.reads.bed`` files of the inputs are merged with
    ``mergeChrPosFiles`` and a combined config file is written to the output
    directory.

    Attributes read from ``args``: outDirName, outBaseName, inDir1, inDir2,
    configFileName, overwrite.
    """
    outDir = args.outDirName + "/" + args.outBaseName
    inPath1 = args.outDirName + "/" + args.inDir1
    inPath2 = args.outDirName + "/" + args.inDir2

    # debugging info
    logfile = "%s/logs/%d.%s.mergepairs.log" % (outDir, os.getpid(),
                                                args.outBaseName)
    logging.basicConfig(format='%(asctime)s %(message)s', filename=logfile,
                        level=logging.DEBUG)
    logging.info("\ninDir1=%s\ninDir2=%s\noutBaseName=%s\nconfigFileName=%s",
                 args.inDir1, args.inDir2, args.outBaseName,
                 args.configFileName)

    # create output directory
    pickreads.prepOutDir(args.outBaseName, args.outDirName, args.overwrite)

    # make sure input sources exist
    peakparser.checkOutDir(args.inDir1, args.outDirName)
    peakparser.checkOutDir(args.inDir2, args.outDirName)

    # make sure config files exist
    configPath1 = inPath1 + "/" + args.configFileName
    configPath2 = inPath2 + "/" + args.configFileName
    pickreads.checkfile(configPath1)
    pickreads.checkfile(configPath2)

    # read parameters for both inputs
    configDict1 = peakparser.readConfig(configPath1, args.inDir1,
                                        args.outDirName)
    configDict2 = peakparser.readConfig(configPath2, args.inDir2,
                                        args.outDirName)
    # NOTE(review): only the first input's insert size / read length are
    # used here -- confirm that is intended.
    maxDist = int(configDict1['insertSize']) + 2 * int(configDict1['readLength'])

    # merge readfiles
    outReadFileName = outDir + "/" + args.outBaseName + ".readpairs.txt"
    readFileName1 = inPath1 + "/" + args.inDir1 + ".readpairs.txt"
    readFileName2 = inPath2 + "/" + args.inDir2 + ".readpairs.txt"
    logging.info("merging readfiles (%s, %s)", readFileName1, readFileName2)
    mergeChrPosFiles(readFileName1, readFileName2, outReadFileName, maxDist)

    # merge bedfiles
    outBedFileName = outDir + "/" + args.outBaseName + ".reads.bed"
    bedFileName1 = inPath1 + "/" + args.inDir1 + ".reads.bed"
    bedFileName2 = inPath2 + "/" + args.inDir2 + ".reads.bed"
    logging.info("merging bedfiles (%s,%s)", bedFileName1, bedFileName2)
    mergeChrPosFiles(bedFileName1, bedFileName2, outBedFileName, maxDist)

    # write new config file: start from the first input's config (modified in
    # place) and replace its single bam entry with one entry per input
    configPath = outDir + "/" + args.configFileName
    configDict = configDict1
    configDict['bamFileName1'] = configDict1['bamFileName']
    configDict['bamFileName2'] = configDict2['bamFileName']
    configDict['merged'] = 'True'
    configDict['outBaseName'] = args.outBaseName
    configDict['outDirName'] = args.outDirName
    configDict['readFileName'] = outReadFileName
    del configDict['bamFileName']

    # 'with' guarantees the handle is closed even if a write fails
    with open(configPath, 'w') as configOut:
        for key, value in configDict.items():
            configOut.write(key + "=" + value + "\n")
def main(args):
    """Fetch breakpoint clusters for every call in the four call-set files.

    For each line of the cancer-only, normal-only, germline and uncategorized
    call files, the region (columns 0-2) and gene (column 7) are looked up in
    both the cancer and the normal bam file with ``fetchRegion``; the two
    clusters are merged with ``mergeClusters`` and the result is appended to
    the input line in the matching ``*breaks.tab.txt`` output file.

    Which bam is cancer and which is normal is decided from the types that
    ``getTypeFromTCGA`` reports for the configured bam file names; when that
    is not conclusive, bam 1 is treated as normal and bam 2 as cancer.

    Attributes read from ``args``: outDirName, outBaseName, configFileName,
    mrnaFastaFile, refGenomeFile, maxReadLen, zeroChar, minClipQual, usechr.

    Raises:
        NameError: if one bam is typed CANCER and the other is not NORMAL.
    """
    peakparser.checkOutDir(args.outBaseName, args.outDirName)
    outDir = args.outDirName + "/" + args.outBaseName
    configPath = outDir + "/" + args.configFileName
    checkfile(configPath)
    configDict = peakparser.readConfig(configPath, args.outBaseName,
                                       args.outDirName)

    # load hash of TranscriptSeq objects (genename --> multiple transcripts)
    # will need to pass to partialMapTx() later
    sys.stderr.write("loading " + args.mrnaFastaFile + "...\n")
    checkfile(args.mrnaFastaFile)
    txs = fastahash(args.mrnaFastaFile)

    cancerBamFile = ''
    normalBamFile = ''
    refGenomeFile = args.refGenomeFile
    cancerCallsFile = outDir + "/canceronly.tab.txt"
    normalCallsFile = outDir + "/normalonly.tab.txt"
    germCallsFile = outDir + "/germline.tab.txt"
    otherCallsFile = outDir + "/uncategorized.tab.txt"

    # fix if unmerged
    if 'bamFileName1' not in configDict:
        configDict['bamFileName1'] = configDict['bamFileName']
        configDict['bamFileName2'] = configDict['bamFileName']

    bamType1 = getTypeFromTCGA(configDict['bamFileName1'])
    bamType2 = getTypeFromTCGA(configDict['bamFileName2'])

    print("bamfile1=%s bamFile2=%s bamType1=%s bamType2=%s"
          % (configDict['bamFileName1'], configDict['bamFileName2'],
             bamType1, bamType2))

    if bamType1 != bamType2 and bamType1 is not None and bamType2 is not None:
        if bamType1 == 'CANCER':
            if bamType2 != 'NORMAL':
                raise NameError('bam1 is cancer but bam2 is not normal')
            cancerBamFile = configDict['bamFileName1']
            normalBamFile = configDict['bamFileName2']
        elif bamType2 == 'CANCER':
            if bamType1 != 'NORMAL':
                raise NameError('bam2 is cancer but bam1 is not normal')
            cancerBamFile = configDict['bamFileName2']
            normalBamFile = configDict['bamFileName1']
    else:
        print('cannot determine bamfile cancer/normal from filenames in config.txt, defaulting to normal.')
        normalBamFile = configDict['bamFileName1']
        cancerBamFile = configDict['bamFileName2']

    checkfile(cancerBamFile)
    checkfile(normalBamFile)
    checkfile(normalCallsFile)
    checkfile(cancerCallsFile)
    checkfile(germCallsFile)
    checkfile(otherCallsFile)
    checkfile(refGenomeFile)

    maxReadLen = int(args.maxReadLen)
    minClipQual = int(args.minClipQual)

    # (input calls file, output breaks file) for each call set
    callSets = (
        (cancerCallsFile, outDir + "/cancerbreaks.tab.txt"),
        (normalCallsFile, outDir + "/normalbreaks.tab.txt"),
        (germCallsFile, outDir + "/germlinebreaks.tab.txt"),
        (otherCallsFile, outDir + "/uncategorizedbreaks.tab.txt"),
    )

    # pysam handles are tracked so they are closed even if processing fails
    opened = []
    try:
        cancerBam = pysam.Samfile(cancerBamFile, 'rb')  # rb = read, binary
        opened.append(cancerBam)
        normalBam = pysam.Samfile(normalBamFile, 'rb')  # rb = read, binary
        opened.append(normalBam)
        refGenome = pysam.Fastafile(refGenomeFile)
        opened.append(refGenome)

        for callsFileName, breaksFileName in callSets:
            with open(callsFileName, 'r') as calls, \
                    open(breaksFileName, 'w') as breaksOut:
                for line in calls:
                    col = line.strip().split("\t")
                    chrom = col[0]
                    start = int(col[1])
                    end = int(col[2])
                    gene = col[7]

                    cancerCluster = fetchRegion(cancerBam, refGenome,
                                                maxReadLen, chrom, start, end,
                                                gene, args.zeroChar,
                                                minClipQual, args.usechr)
                    cancerCluster.type = 'CANCER'
                    normalCluster = fetchRegion(normalBam, refGenome,
                                                maxReadLen, chrom, start, end,
                                                gene, args.zeroChar,
                                                minClipQual, args.usechr)
                    normalCluster.type = 'NORMAL'

                    mergeCluster = mergeClusters(cancerCluster, normalCluster,
                                                 txs)
                    clusterout = mergeCluster.outstring()
                    infodumpout = mergeCluster.infodump()

                    breaksOut.write(line.strip("\n") + "\t" + clusterout
                                    + "\n" + infodumpout + "\n")
    finally:
        for handle in reversed(opened):
            handle.close()
def main(args):
    """Fetch breakpoint clusters for every call in the four call-set files.

    For each line of the cancer-only, normal-only, germline and other call
    files, the region (columns 0-2) is looked up in both the cancer and the
    normal bam file with ``fetchRegion``; the two clusters are merged with
    ``mergeClusters`` and the result is appended to the input line in the
    matching ``*breaks.tab.txt`` output file.

    Which bam is cancer and which is normal is decided from the types that
    ``getTypeFromTCGA`` reports for the configured bam file names; when that
    is not conclusive, bam 1 is treated as normal and bam 2 as cancer.

    Attributes read from ``args``: outDirName, outBaseName, configFileName,
    refGenomeFile, refFastaDir, maxReadLen, zeroChar, minClipQual, usechr.

    Raises:
        NameError: if one bam is typed CANCER and the other is not NORMAL.
    """
    peakparser.checkOutDir(args.outBaseName, args.outDirName)
    outDir = args.outDirName + "/" + args.outBaseName
    configPath = outDir + "/" + args.configFileName
    checkfile(configPath)
    configDict = peakparser.readConfig(configPath, args.outBaseName,
                                       args.outDirName)

    cancerBamFile = ''
    normalBamFile = ''
    refGenomeFile = args.refGenomeFile
    cancerCallsFile = outDir + "/canceronly.tab.txt"
    normalCallsFile = outDir + "/normalonly.tab.txt"
    germCallsFile = outDir + "/germline.tab.txt"
    otherCallsFile = outDir + "/other.tab.txt"

    # fix if unmerged
    if 'bamFileName1' not in configDict:
        configDict['bamFileName1'] = configDict['bamFileName']
        configDict['bamFileName2'] = configDict['bamFileName']

    bamType1 = getTypeFromTCGA(configDict['bamFileName1'])
    bamType2 = getTypeFromTCGA(configDict['bamFileName2'])

    print("bamfile1=%s bamFile2=%s bamType1=%s bamType2=%s"
          % (configDict['bamFileName1'], configDict['bamFileName2'],
             bamType1, bamType2))

    if bamType1 != bamType2 and bamType1 is not None and bamType2 is not None:
        if bamType1 == 'CANCER':
            if bamType2 != 'NORMAL':
                raise NameError('bam1 is cancer but bam2 is not normal')
            cancerBamFile = configDict['bamFileName1']
            normalBamFile = configDict['bamFileName2']
        elif bamType2 == 'CANCER':
            if bamType1 != 'NORMAL':
                raise NameError('bam2 is cancer but bam1 is not normal')
            cancerBamFile = configDict['bamFileName2']
            normalBamFile = configDict['bamFileName1']
    else:
        print('cannot determine bamfile cancer/normal from filenames in config.txt, defaulting to normal.')
        normalBamFile = configDict['bamFileName1']
        cancerBamFile = configDict['bamFileName2']

    checkfile(cancerBamFile)
    checkfile(normalBamFile)
    checkfile(normalCallsFile)
    checkfile(cancerCallsFile)
    checkfile(germCallsFile)
    checkfile(otherCallsFile)
    checkfile(refGenomeFile)

    maxReadLen = int(args.maxReadLen)
    minClipQual = int(args.minClipQual)

    # (input calls file, output breaks file) for each call set
    callSets = (
        (cancerCallsFile, outDir + "/cancerbreaks.tab.txt"),
        (normalCallsFile, outDir + "/normalbreaks.tab.txt"),
        (germCallsFile, outDir + "/germlinebreaks.tab.txt"),
        (otherCallsFile, outDir + "/otherbreaks.tab.txt"),
    )

    # pysam handles are tracked so they are closed even if processing fails
    opened = []
    try:
        cancerBam = pysam.Samfile(cancerBamFile, 'rb')  # rb = read, binary
        opened.append(cancerBam)
        normalBam = pysam.Samfile(normalBamFile, 'rb')  # rb = read, binary
        opened.append(normalBam)
        refGenome = pysam.Fastafile(refGenomeFile)
        opened.append(refGenome)

        for callsFileName, breaksFileName in callSets:
            with open(callsFileName, 'r') as calls, \
                    open(breaksFileName, 'w') as breaksOut:
                for line in calls:
                    col = line.strip().split("\t")
                    chrom = col[0]
                    start = int(col[1])
                    end = int(col[2])

                    cancerCluster = fetchRegion(cancerBam, refGenome,
                                                maxReadLen, chrom, start, end,
                                                args.zeroChar, minClipQual,
                                                args.usechr)
                    cancerCluster.type = 'CANCER'
                    normalCluster = fetchRegion(normalBam, refGenome,
                                                maxReadLen, chrom, start, end,
                                                args.zeroChar, minClipQual,
                                                args.usechr)
                    normalCluster.type = 'NORMAL'

                    mergeCluster = mergeClusters(cancerCluster, normalCluster,
                                                 args.refFastaDir)
                    clusterout = mergeCluster.outstring()
                    infodumpout = mergeCluster.infodump()

                    breaksOut.write(line.strip("\n") + "\t" + clusterout
                                    + "\n" + infodumpout + "\n")
    finally:
        for handle in reversed(opened):
            handle.close()
def main(args):
    """Fetch and merge breakpoint clusters across all configured bam files.

    Every config key starting with ``bamFileName`` names a bam file; its
    sample name is read from the ``sampleName`` key carrying the same suffix.
    For each line of the cancer-only, normal-only, germline and other call
    files, the region (columns 0-2) is fetched from every bam with
    ``fetchRegion``, the per-sample clusters are folded together with
    ``mergeClusters`` and the result is appended to the input line in the
    matching ``*breaks.tab.txt`` output file.

    Attributes read from ``args``: outDirName, outBaseName, configFileName,
    refGenomeFile, refFastaDir, maxReadLen, zeroChar, minClipQual, usechr.
    """
    peakparser.checkOutDir(args.outBaseName, args.outDirName)
    outDir = args.outDirName + "/" + args.outBaseName
    configPath = outDir + "/" + args.configFileName
    checkfile(configPath)
    configDict = peakparser.readConfig(configPath, args.outBaseName,
                                       args.outDirName)

    refGenomeFile = args.refGenomeFile
    cancerCallsFile = outDir + "/canceronly.tab.txt"
    normalCallsFile = outDir + "/normalonly.tab.txt"
    germCallsFile = outDir + "/germline.tab.txt"
    otherCallsFile = outDir + "/other.tab.txt"

    # pair each bam file with the sample name stored under the same suffix
    bamFiles = []
    sampleNames = []
    for key in configDict.keys():
        if key.startswith('bamFileName'):
            bamFiles.append(configDict[key])
            samplekey = "sampleName" + key.replace('bamFileName', '')
            sampleNames.append(configDict[samplekey])

    print("finding breakpoints from bam files:")
    for bamFile, sampleName in zip(bamFiles, sampleNames):
        print("%s %s" % (bamFile, sampleName))

    for bamFile in bamFiles:
        checkfile(bamFile)
    checkfile(normalCallsFile)
    checkfile(cancerCallsFile)
    checkfile(germCallsFile)
    checkfile(otherCallsFile)
    checkfile(refGenomeFile)

    maxReadLen = int(args.maxReadLen)
    minClipQual = int(args.minClipQual)

    # (input calls file, output breaks file) for each call set
    callSets = (
        (cancerCallsFile, outDir + "/cancerbreaks.tab.txt"),
        (normalCallsFile, outDir + "/normalbreaks.tab.txt"),
        (germCallsFile, outDir + "/germlinebreaks.tab.txt"),
        (otherCallsFile, outDir + "/otherbreaks.tab.txt"),
    )

    # pysam handles are tracked so they are closed even if processing fails
    opened = []
    try:
        bamPysams = []
        for bamFile in bamFiles:
            bamPysam = pysam.Samfile(bamFile, 'rb')
            opened.append(bamPysam)
            bamPysams.append(bamPysam)
        refGenome = pysam.Fastafile(refGenomeFile)
        opened.append(refGenome)

        for callsFileName, breaksFileName in callSets:
            with open(callsFileName, 'r') as calls, \
                    open(breaksFileName, 'w') as breaksOut:
                for line in calls:
                    col = line.strip().split("\t")
                    chrom = col[0]
                    start = int(col[1])
                    end = int(col[2])

                    # start from the first bam, then fold in the others
                    mergedCluster = fetchRegion(bamPysams[0], refGenome,
                                                maxReadLen, chrom, start, end,
                                                args.zeroChar, minClipQual,
                                                args.usechr)
                    mergedCluster.type = sampleNames[0]
                    for bamPysam, sampleName in zip(bamPysams[1:],
                                                    sampleNames[1:]):
                        nextCluster = fetchRegion(bamPysam, refGenome,
                                                  maxReadLen, chrom, start,
                                                  end, args.zeroChar,
                                                  minClipQual, args.usechr)
                        nextCluster.type = sampleName
                        mergedCluster = mergeClusters(mergedCluster,
                                                      nextCluster,
                                                      args.refFastaDir)

                    clusterout = mergedCluster.outstring()
                    infodumpout = mergedCluster.infodump()

                    breaksOut.write(line.strip("\n") + "\t" + clusterout
                                    + "\n" + infodumpout + "\n")
    finally:
        for handle in reversed(opened):
            handle.close()
def main(args):
    """Classify valid insertion calls from the debug file and annotate them.

    Lines of ``<outBaseName>.debug.txt`` are space-separated ``key=value``
    columns.  A line is kept when it contains ``valid=1``, has at least
    ``args.minPeakSize`` reads (``np``) and an ``nEx`` value greater than 1.
    Kept calls are split into germline (more than one source), cancer-only,
    normal-only and uncategorized, and each group is passed to
    ``annotatePos`` with its own output file.

    Attributes read from ``args``: outDirName, outBaseName, configFileName,
    annotDir, minPeakSize, printout.
    """
    peakparser.checkOutDir(args.outBaseName, args.outDirName)
    outDir = args.outDirName + "/" + args.outBaseName
    configPath = outDir + "/" + args.configFileName
    checkfile(configPath)
    configDict = peakparser.readConfig(configPath, args.outBaseName,
                                       args.outDirName)
    refGenome = configDict['refGenome']
    checkAnnotDir(refGenome, args.annotDir)

    debugFile = outDir + "/" + args.outBaseName + ".debug.txt"
    checkfile(debugFile)

    germlineIns = []
    cancerIns = []
    normalIns = []
    uncatIns = []

    # 'with' closes the debug file even if a line fails to parse
    with open(debugFile, 'r') as debugIn:
        for line in debugIn:  # loop over lines
            if 'valid=1' not in line:
                continue

            insdata = {}
            for c in line.rstrip().split(' '):  # loop over columns
                (key, value) = c.split('=')
                insdata[key] = value

            if int(insdata['np']) < int(args.minPeakSize):
                continue
            if int(insdata['nEx']) <= 1:
                continue

            insSum = insertionSummary()
            insSum.setPos(insdata['pos'])
            insSum.setEltPos(insdata['eltExt'])
            insSum.setBestStrand(insdata['eInv'], insdata['eS'])
            insSum.setSources(insdata['sources'])
            insSum.index = insdata['index']
            insSum.priGene = insdata['pGene']
            insSum.exonStr = insdata['exT']
            insSum.eltFam = insdata['pME']
            insSum.numreads = insdata['np']
            insSum.pseudo = insdata['pgO']

            # more than one source -> germline; otherwise classify by the
            # name of the single source
            if len(insSum.sources) > 1:
                germlineIns.append(insSum)
            elif 'CANCER' in insSum.sources[0]:
                cancerIns.append(insSum)
            elif 'NORMAL' in insSum.sources[0]:
                normalIns.append(insSum)
            else:
                uncatIns.append(insSum)

    annotatePos(germlineIns, refGenome, outDir + "/germline.tab.txt",
                args.annotDir, args.printout)
    annotatePos(cancerIns, refGenome, outDir + "/canceronly.tab.txt",
                args.annotDir, args.printout)
    annotatePos(normalIns, refGenome, outDir + "/normalonly.tab.txt",
                args.annotDir, args.printout)
    annotatePos(uncatIns, refGenome, outDir + "/uncategorized.tab.txt",
                args.annotDir, args.printout)
def main(args):
    """Merge the read-pair files of every sample in a sample list.

    Samples come from ``args.sampleList`` (any iterable of lines) when that
    attribute exists, otherwise from the file ``args.sampleListFile``.  Each
    line holds four whitespace-separated fields (bam file, sample
    sub-directory, reference genome, group name).  Lines starting with ``#``
    and blank lines are skipped.  Each sample's config is read from its
    sub-directory of ``args.outDirName``; the read-pair files are merged with
    ``mergeChrPosFiles`` and a combined config file, listing every bam file
    and sample name, is written to the ``args.outBaseName`` sub-directory.

    Attributes read from ``args``: outDirName, outBaseName, configFileName,
    overwrite, and either sampleList or sampleListFile.

    Raises:
        ValueError: if a sample line does not have exactly four fields, or
            the sample list contains no samples.
    """
    # create output directory
    discordant.prepOutDir(args.outBaseName, args.outDirName, args.overwrite)

    # remember a file we open ourselves so it can be closed afterwards
    sampleListHandle = None
    try:
        sampleList = args.sampleList
    except AttributeError:
        discordant.checkfile(args.sampleListFile)
        sampleListHandle = open(args.sampleListFile, 'r')
        sampleList = sampleListHandle

    readFileNames = []
    bamFileNames = []
    sampleNames = []
    insertSizes = []
    readLengths = []
    eltLenDict = None
    lastConfig = None

    try:
        for sampleLine in sampleList:
            if sampleLine.startswith("#") or not sampleLine.strip():
                continue
            # only the sub-directory is used, but unpacking all four fields
            # still rejects malformed lines
            (_sampleBam, sampleSubDir, _refGenome, _groupName) = \
                sampleLine.strip().split()
            peakparser.checkOutDir(sampleSubDir, args.outDirName)

            sampleDir = args.outDirName + "/" + sampleSubDir
            configPath = sampleDir + "/" + args.configFileName
            discordant.checkfile(configPath)
            configDict = peakparser.readConfig(configPath, sampleSubDir,
                                               args.outDirName)
            # rebuilt for every sample; the last sample's table is the one
            # passed to mergeChrPosFiles
            eltLenDict = discordant.makeEltLenDict(configDict['eltLenFileName'])
            lastConfig = configDict

            insertSizes.append(int(configDict['insertSize']))
            readLengths.append(int(configDict['readLength']))
            readFileNames.append(sampleDir + "/" + sampleSubDir + ".readpairs.txt")
            bamFileNames.append(configDict['bamFileName'])
            sampleNames.append(sampleSubDir)
    finally:
        if sampleListHandle is not None:
            sampleListHandle.close()

    if lastConfig is None:
        raise ValueError("no samples found in sample list")

    maxDist = max(insertSizes) + 2 * max(readLengths)

    # merge readfiles
    outDir = args.outDirName + "/" + args.outBaseName
    outReadFileName = outDir + "/" + args.outBaseName + ".readpairs.txt"
    mergeChrPosFiles(readFileNames, outReadFileName, maxDist, eltLenDict)

    # write new config file, based on the last sample's config with one
    # numbered bam / sample-name entry per sample
    configPath = outDir + "/" + args.configFileName
    configDict = lastConfig
    for bfnum, bamFileName in enumerate(bamFileNames):
        configDict["bamFileName" + str(bfnum)] = bamFileName
    for snum, sampleName in enumerate(sampleNames):
        configDict["sampleName" + str(snum)] = sampleName
    configDict['merged'] = 'True'
    configDict['outBaseName'] = args.outBaseName
    configDict['outDirName'] = args.outDirName
    configDict['readFileName'] = outReadFileName
    del configDict['bamFileName']

    with open(configPath, 'w') as configOut:
        for key, value in configDict.items():
            configOut.write(key + "=" + value + "\n")