def mapMergedGFF(dataFile,nameDict,mergedGFFFile,analysisName,outputFolder,maskFile):
    '''
    calls rose on the mergedGFFFile for all datasets

    The first dataset (sorted name order) anchors the ROSE2 run; every other
    dataset -- plus its listed background when the background flag is set --
    is passed through the extra map.  Returns the path to the stitched
    enhancer region map; exits the process if ROSE output never appears.
    '''
    dataDict= pipeline_dfci.loadDataTable(dataFile)
    roseParentFolder = "%srose/" % (outputFolder)
    utils.formatFolder(roseParentFolder,True)
    gffName = mergedGFFFile.split('/')[-1].split('.')[0]
    bashFileName = "%srose/%s_roseCall.sh" % (outputFolder,analysisName)
    #namesList is just the first dataset
    #extrmap will have to have all other datasets + their backgrounds
    namesList = nameDict.keys()
    namesList.sort()
    extraMap = []
    for name in namesList[1:]:
        if nameDict[name]['background']:
            backgroundName = dataDict[name]['background']
            if dataDict.has_key(backgroundName):
                extraMap+=[name,backgroundName]
            else:
                print "ERROR: UNABLE TO FIND LISTED BACKGROUND DATASET %s FOR %s" % (backgroundName,name)
                sys.exit()
        else:
            extraMap+=[name]
    print extraMap

    #first check to see if this has already been done
    mergedRegionMap = "%srose/%s_ROSE/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (outputFolder,namesList[0],gffName)
    print("LOOKING FOR REGION MAP AT %s" % (mergedRegionMap))
    if utils.checkOutput(mergedRegionMap,1,1):
        print("FOUND PREVIOUS REGION MAP")
        return mergedRegionMap

    bashFileName = pipeline_dfci.callRose2(dataFile,'',roseParentFolder,[namesList[0]],extraMap,mergedGFFFile,0,0,bashFileName,mask=maskFile)
    bashCommand = "bash %s" % (bashFileName)
    os.system(bashCommand)
    print "Running enhancer mapping command:\n%s" % (bashCommand)

    #poll every 5 minutes, up to 60 minutes, for the region map
    if utils.checkOutput(mergedRegionMap,5,60):
        return mergedRegionMap
    else:
        print "UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE %s.\nEXITING NOW" % (mergedGFFFile)
        sys.exit()
def mapBams(bamFileList,splitGFFPath,analysisName,mappedFolder):
    '''
    Maps each bam in bamFileList to the split gff with bamliquidator, then
    assembles a signal table: one row per region, one signal column per bam.
    Exits the process if any mapping job fails to produce its matrix.
    Returns the signal table (list of rows).
    '''
    print("MAPPING TO THE FOLLOWING BAMS:")
    for bamFile in bamFileList:
        print(bamFile)
        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, analysisName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, analysisName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (splitGFFPath, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (splitGFFPath, mappedOut1Folder, bamFile)
            print(cmd1)
            os.system(cmd1)
            if utils.checkOutput(mappedOut1File,0.2,5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (splitGFFPath, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (splitGFFPath, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    #now we make a signal table
    #set up the table using the first bam
    if len(bamFileList) > 1:
        #set up the first pass at the table
        signalTable = [['REGION_ID','locusLine'] + [name.split('/')[-1] for name in bamFileList]]
        bamFileName = bamFileList[0].split('/')[-1]
        mappedTable = utils.parseTable('%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, analysisName, bamFileName),'\t')
        for i in range(1,len(mappedTable)):
            signalTable.append(mappedTable[i])
        for bamFile in bamFileList[1:]:
            bamFileName = bamFile.split('/')[-1]
            mappedTable = utils.parseTable('%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, analysisName, bamFileName),'\t')
            #BUGFIX: iterate over the rows of the freshly parsed table.
            #Previously this was range(1,len(mappedTable[i])) -- the width of
            #one row under a stale index -- which appended the wrong number
            #of signal values.
            for i in range(1,len(mappedTable)):
                mapSignal = mappedTable[i][2]
                signalTable[i].append(mapSignal)
    else:
        #single bam: its matrix already is the signal table
        bamFileName = bamFileList[0].split('/')[-1]
        signalTable = utils.parseTable('%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, analysisName, bamFileName),'\t')

    return(signalTable)
def run_bash(bash_path, output_path, maxWait=30):
    '''
    Runs a bash script if its expected output is not already present, then
    waits for the output file to appear.

    bash_path: path of the script to run
    output_path: file whose existence marks completion
    maxWait: minutes to wait for the output (polled once per minute)

    BUGFIX: maxWait was previously ignored; the wait was hard-coded to 30.
    '''
    if not utils.checkOutput(output_path, 0, 0):
        print('running bash script %s' % (bash_path))
        os.system('bash %s' % (bash_path))
        if utils.checkOutput(output_path, 1, maxWait):
            print('run completed, output detected for %s at %s' % (bash_path, output_path))
    else:
        print('found prior output for %s at %s' % (bash_path, output_path))
def makePeakGFFs(peak_path_list):
    '''
    makes a stitched gff for all MYC bound TSS and Distal regions across all
    datasets

    Writes the two gffs under gffFolder (skipping work when both already
    exist) and returns (tss_gff_path, distal_gff_path).
    '''
    #setting the output
    tss_gff_path = '%sHG19_MYC_TSS_REGIONS_-0_+0.gff' % (gffFolder)
    distal_gff_path = '%sHG19_MYC_DISTAL_REGIONS_-0_+0.gff' % (gffFolder)

    #check to see if already done
    if utils.checkOutput(tss_gff_path,0.1,0.1) and utils.checkOutput(distal_gff_path,0.1,0.1):
        print('OUTPUT FOUND AT %s and %s' % (tss_gff_path,distal_gff_path))
        return tss_gff_path,distal_gff_path

    #empty loci lists to hold everything
    tss_loci = []
    distal_loci = []
    for peak_path in peak_path_list:
        print('processing %s' % (peak_path))
        peak_table = utils.parseTable(peak_path,'\t')
        for line in peak_table[1:]:
            peak_locus = utils.Locus(line[1],line[2],line[3],'.')
            #column 5 flags TSS overlap; 0 means the peak is distal
            if int(line[5]) == 0:
                distal_loci.append(peak_locus)
            else:
                tss_loci.append(peak_locus)

    #now combine the loci
    print('stitching loci')
    distal_collection = utils.LocusCollection(distal_loci,50)
    tss_collection = utils.LocusCollection(tss_loci,50)
    stitched_distal_collection = distal_collection.stitchCollection()
    stitched_tss_collection = tss_collection.stitchCollection()

    #now make the gffs
    #BUGFIX: write out the STITCHED collections. Previously the stitched
    #results were computed and then the unstitched collections were
    #converted to gff, so the stitching had no effect.
    distal_gff = utils.locusCollectionToGFF(stitched_distal_collection)
    tss_gff = utils.locusCollectionToGFF(stitched_tss_collection)

    #now write to disk
    utils.unParseTable(distal_gff,distal_gff_path,'\t')
    utils.unParseTable(tss_gff,tss_gff_path,'\t')

    return tss_gff_path,distal_gff_path
def callR_GSEA(class1TablePath, class2TablePath, outputFolder, analysisName, top):
    '''
    function to call the Rscript and to wait until the .cls and .gct files
    are created
    returns the paths

    Writes a one-line bash wrapper around enhancerPromoter_gsea.R, runs it,
    and blocks until the nes table appears; exits the process on timeout.
    '''
    script_path = '%s%s_R_gsea.sh' % (outputFolder, analysisName)
    with open(script_path, 'w') as script:
        script.write('#!/usr/bin/bash\n\n')
        script.write('Rscript %senhancerPromoter_gsea.R %s %s %s %s %s' % (
            pipeline_dir, class1TablePath, class2TablePath, outputFolder, analysisName, top))

    print('writing R plotting command to disk and calling %s' % (script_path))
    os.system('bash %s' % (script_path))

    #now check for the nes output
    nesPath = '%s%s_top_%s_nes.txt' % (outputFolder, analysisName, top)
    if not utils.checkOutput(nesPath, 0.5, 5):
        print('ERROR: UNABLE TO SUCCESFULLY DETECT R SCRIPT OUTPUT AT %s' % (nesPath))
        sys.exit()
    return
def CarRegTwoOptions(_actor, _failed=False):
    '''
    Registers a car via the car executable and verifies both contract state
    and console output.  With _failed=True the car is expected to already be
    registered, so the 'Already registered' message is checked instead.
    '''
    writeDataBase({'key': privKeys[_actor['account']]}, carFile)
    result = runCmd([CarExe, '--reg'], _miner=True)
    if not result[0]:
        print("The command executed too long or there is nothing on output")
        return False
    #the contract must report the car as registered in both scenarios
    try:
        registered = mgmtContract.functions.cars(_actor["account"]).call()
    except ValueError as error:
        printError("Cannot call cars()", [error])
        return False
    if not registered:
        printError("Got 'False' from contract", result[1])
        return False
    expected = ['Already registered'] if _failed else ['Registered successfully']
    return checkOutput(result[1], expected)
def run_macs(dataFile):
    '''
    Runs MACS 1.4.2 peak calling on every non-control dataset (names not
    containing WCE/INPUT) in dataFile, waits for all summit files, then
    formats the macs output into the enriched/wiggle folders.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    namesList = [name for name in dataDict.keys() if name.upper().count('WCE') ==0 and name.upper().count('INPUT') == 0]
    namesList.sort()
    print(namesList)
    pipeline_dfci.callMacs(dataFile,macsFolder,namesList,overwrite=False,pvalue='1e-9')
    os.chdir(projectFolder) # the silly call macs script has to change into the output dir
    #so this takes us back to the project folder

    #to check for completeness, we will try to find all of the peak files
    peak_calling_done = False
    while not peak_calling_done:
        dataDict = pipeline_dfci.loadDataTable(dataFile)
        namesList = [name for name in dataDict.keys() if name.upper().count('WCE') ==0 and name.upper().count('INPUT') == 0]
        for name in namesList:
            peak_path = '%s%s/%s_summits.bed' % (macsFolder,name,name)
            print('searching for %s' % (peak_path))
            #checkOutput blocks up to 180 minutes per dataset; the done flag
            #is set on the first hit and any single timeout aborts the run
            if utils.checkOutput(peak_path,1,180):
                peak_calling_done =True
                print('found %s' % (peak_path))
                continue
            else:
                print('Error: peak calling timed out')
                sys.exit()

    #now format the macs output
    print('formatting macs output')
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    namesList = [name for name in dataDict.keys() if name.upper().count('WCE') ==0 and name.upper().count('INPUT') == 0]
    pipeline_dfci.formatMacsOutput(dataFile,macsFolder,macsEnrichedFolder,wiggleFolder,wigLink ='',useBackground=True)
    print('Finished running Macs 1.4.2')
def ScenterRegTwoOptions(_actor, _failed=False):
    '''
    Registers a service center via the scenter executable and verifies the
    contract state plus console output.  With _failed=True the center is
    expected to already be registered.
    '''
    setAcc(_actor, scenterFile)
    result = runCmd([ScenterExe, '--reg'], _miner=True)
    if not result[0]:
        print("The command executed too long or there is nothing on output")
        return False
    #the contract must report the center as registered either way
    try:
        registered = mgmtContract.functions.serviceCenters(_actor["account"]).call()
    except ValueError as error:
        printError("Cannot call serviceCenters()", [error])
        return False
    if not registered:
        printError("Got 'False' from contract", result[1])
        return False
    expected = ['Already registered'] if _failed else ['Registered successfully']
    return checkOutput(result[1], expected)
def callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergeName,genome,parentFolder):
    '''
    this is the main run function for the script
    all of the work should occur here, but no functions should be defined here

    Merges the two super-enhancer files into one gff, runs ROSE on the merged
    regions, and returns the ENHANCER_TO_GENE output path; exits on failure.
    '''
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (parentFolder,string.upper(genome),mergeName)

    #check to make sure this hasn't been done yet
    roseOutput = "%s%s_ROSE/%s_%s_MERGED_REGIONS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt" % (parentFolder,name1,string.upper(genome),mergeName)
    #EAFP: parseTable raises IOError when the prior output is absent
    try:
        foo = utils.parseTable(roseOutput,'\t')
        print "ROSE OUTPUT ALREADY FOUND HERE %s" % (roseOutput)
        return roseOutput
    except IOError:
        print "MERGING ENHANCER REGIONS FROM %s and %s" % (superFile1,superFile2)
        mergedGFF = mergeCollections(superFile1,superFile2,name1,name2,mergedGFFFile)
        #call rose on the merged shit
        roseBashFile = callRoseMerged(dataFile,mergedGFF,name1,name2,parentFolder)
        print('i can has rose bash file %s' % (roseBashFile))
        #run the bash command
        os.system('bash %s' % (roseBashFile))
        #check for and return output (poll every minute, up to 30 minutes)
        if utils.checkOutput(roseOutput,1,30):
            return roseOutput
        else:
            print "ERROR: ROSE CALL ON MERGED REGIONS FAILED"
            sys.exit()
def callRWaterfall(geneTablePath,outputFolder,analysisName,top):
    '''
    function to call the Rscript and to wait until the .cls and .gct files
    are created
    returns the paths

    Writes a bash wrapper for the waterfall-plotting R script, runs it, and
    waits for the .cls output; exits the process if it never appears.
    '''
    script_path = '%s%s_R_plotting.sh' % (outputFolder,analysisName)
    with open(script_path,'w') as script:
        script.write('#!/usr/bin/bash\n\n')
        script.write('R --no-save %s %s %s %s < %s/enhancerPromoter_waterfall.R' % (
            geneTablePath,outputFolder,analysisName,top,whereAmI))

    print('writing R plotting command to disk and calling %s' %(script_path))
    os.system('bash %s' % (script_path))

    #now check for the .cls output
    clsPath = '%s%s_top_%s.cls' % (outputFolder,analysisName,top)
    if not utils.checkOutput(clsPath,0.5,5):
        print('ERROR: UNABLE TO SUCCESFULLY DETECT R SCRIPT OUTPUT AT %s' % (clsPath))
        sys.exit()
    return
def VendorRegisterNotUniqueAddressFail(_actor, _name, _value):
    '''
    Attempts vendor registration from an address that is already in use and
    expects the CLI to refuse; also verifies the wallet balance is untouched.
    '''
    walletContractAddress = mgmtContract.functions.walletContract().call()
    prev_balance = w3.eth.getBalance(walletContractAddress)
    cmd = [VendorExe, '--reg', _name, str(_value)]
    _ret = runCmdWithActor(cmd, _actor)
    retval = False
    if not _ret[0]:
        print("The command executed too long")
        return retval
    cur_balance = w3.eth.getBalance(walletContractAddress)
    #a refused registration must not move funds into the wallet contract
    if not (cur_balance - prev_balance == 0):
        printError("Wallet balance was changed", _ret[1])
        return retval
    #the deposit query itself must still succeed (its value is not asserted)
    try:
        deposit = mgmtContract.functions.vendorDeposit(_actor['account']).call()
    except ValueError as error:
        printError("Cannot call vendorDeposit()", [error])
        return retval
    retval = checkOutput(_ret[1], ['Failed. The vendor address already used.'])
    return retval
def callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergeName,genome,parentFolder,namesList1,namesList2,useBackground,inputGFF=''):
    '''
    this is the main run function for the script
    all of the work should occur here, but no functions should be defined here

    Merges the two super-enhancer files into one gff (or uses inputGFF),
    runs ROSE on the merged regions -- the first dataset of namesList1
    anchors the output folder -- and returns the stitched enhancer region
    map path, falling back to a suffix search in the ROSE folder.
    '''
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (parentFolder,string.upper(genome),mergeName)

    #check to make sure this hasn't been done yet
    roseOutput = "%s%s_ROSE/%s_%s_MERGED_REGIONS_-0_+0_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (parentFolder,namesList1[0],string.upper(genome),mergeName)
    if utils.checkOutput(roseOutput,.1,.1):
        print "ROSE OUTPUT ALREADY FOUND HERE %s" % (roseOutput)
        return roseOutput
    else:
        print("NO MERGED ROSE OUTPUT FOUND")
        print "MERGING ENHANCER REGIONS FROM %s and %s" % (superFile1,superFile2)
        mergedGFF = mergeCollections(superFile1,superFile2,name1,name2,mergedGFFFile,inputGFF)
        print('just merged gff')
        print(mergedGFF)
        #call rose on the merged regions
        roseBashFile = callRoseMerged(dataFile,mergedGFF,name1,name2,parentFolder,namesList1,namesList2,useBackground)
        print('merged rose bash file %s' % (roseBashFile))
        #run the bash command
        os.system('bash %s' % (roseBashFile))
        #check for and return output (poll every minute, up to 10 minutes)
        if utils.checkOutput(roseOutput,1,10):
            return roseOutput
        else:
            #try finding it w/ a different name
            #this will bug out if nothing is there
            roseFolder = "%s%s_ROSE/" % (parentFolder,namesList1[0])
            roseFileList = [x for x in os.listdir(roseFolder) if x[0] != '.'] #no hidden files
            if len(roseFileList) == 0:
                print "No files found in %s" % (roseFolder)
                sys.exit()
            roseOutput = getFile('_ENHANCER_REGION_MAP.txt',roseFileList,roseFolder)
            return roseOutput
def detectGSEAOutput(analysisName, outputFolder, top): ''' tries to detect the .xls files that show up when GSEA is done running ''' #first figure out the friggin output folder gseaParentFolder = '%sgsea_top_%s_c2/' % (outputFolder, top) for i in range(30): folderList = os.listdir(gseaParentFolder) #print(folderList) candidateFolderList = [ folder for folder in folderList if folder.count('%s_top_%s.Gsea' % (analysisName, top)) == 1 ] if len(candidateFolderList) > 1: print( 'ERROR: MULTIPLE GSEA OUTPUT FOLDERS DETECTED FOR %s WITH TOP %s GENES' % (analysisName, string.upper(str(top)))) sys.exit() elif len(candidateFolderList) == 0: time.sleep(10) elif len(candidateFolderList) == 1: candidateFolder = '%sgsea_top_%s_c2/%s/' % (outputFolder, top, candidateFolderList[0]) print('USING %s AS CANDIDATE GSEA FOLDER' % (candidateFolder)) timeStamp = candidateFolder.split('.')[-1][:-1] print(timeStamp) #now that you have the candidate folder find the friggen xls files #for promoter promoterTablePath = '%sgsea_report_for_PROMOTER_%s.xls' % (candidateFolder, timeStamp) distalTablePath = '%sgsea_report_for_DISTAL_%s.xls' % (candidateFolder, timeStamp) print(promoterTablePath) print(distalTablePath) #now check em if utils.checkOutput(promoterTablePath, 0.5, 30): print('FOUND PROMOTER OUTPUT AT %s' % (promoterTablePath)) if utils.checkOutput(distalTablePath, 0.5, 30): print('FOUND DISTAL OUTPUT AT %s' % (distalTablePath)) return promoterTablePath, distalTablePath else: print('ERROR: UNABLE TO FIND GSEA OUTPUT')
def testVendorGetRegFee():
    '''
    Queries the vendor registration fee via the CLI and checks the printed
    value matches the globally tracked currentRegFee.
    '''
    result = runCmd([VendorExe, '--regfee'])
    if not result[0]:
        print("The command executed too long or there is nothing on output")
        return False
    return checkOutput(result[1], [f"Vendor registration fee: {currentRegFee}"])
def mapMergedGFF(dataFile,nameDict,mergedGFFFile,analysisName,outputFolder):
    '''
    calls rose on the mergedGFFFile for all datasets

    The first dataset in nameDict anchors the ROSE run; every other dataset
    and its background are passed through the extra map.  Returns the path
    to the stitched enhancer region map, exiting the process on failure.
    NOTE(review): namesList is not sorted here, so the anchor dataset
    depends on dict key order -- confirm this is intended.
    '''
    dataDict= pipeline_dfci.loadDataTable(dataFile)
    roseParentFolder = "%srose/" % (outputFolder)
    gffName = mergedGFFFile.split('/')[-1].split('.')[0]
    bashFileName = "%srose/%s_roseCall.sh" % (outputFolder,analysisName)
    #namesList is just the first dataset
    #extrmap will have to have all other datasets + their backgrounds
    namesList = nameDict.keys()
    extraMap = []
    for name in namesList[1:]:
        backgroundName = dataDict[name]['background']
        extraMap+=[name,backgroundName]

    #first check to see if this has already been done
    mergedRegionMap = "%srose/%s_ROSE/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (outputFolder,namesList[0],gffName)
    if utils.checkOutput(mergedRegionMap,1,1):
        return mergedRegionMap

    bashFileName = pipeline_dfci.callRose(dataFile,'',roseParentFolder,[namesList[0]],extraMap,mergedGFFFile,0,0,bashFileName)
    bashCommand = "bash %s" % (bashFileName)
    os.system(bashCommand)
    print "Running enhancer mapping command:\n%s" % (bashCommand)

    #poll every 5 minutes, up to 60 minutes, for the region map
    if utils.checkOutput(mergedRegionMap,5,60):
        return mergedRegionMap
    else:
        print "UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE %s.\nEXITING NOW" % (mergedGFFFile)
        sys.exit()
def VendorGetBatteryFee(_actor):
    '''
    Queries the per-battery production fee via the vendor CLI under the
    given actor and checks it against the tracked currentBatFee.
    '''
    result = runCmdWithActor([VendorExe, '--batfee'], _actor)
    if not result[0]:
        print("The command executed too long or there is nothing on output")
        return False
    return checkOutput(result[1], [f"Production fee per one battery: {currentBatFee}"])
def VendorDepositWrongAccount(_actor):
    '''
    Runs the deposit command from an unregistered account and expects the
    CLI to report that the vendor account is not registered.
    '''
    result = runCmdWithActor([VendorExe, '--deposit'], _actor)
    if not result[0]:
        print("The command executed too long or there is nothing on output")
        return False
    return checkOutput(result[1], ['Vendor account is not registered.'])
def define_enhancer_landscape(mouse_dataFile, analysisName, namesList=[]): ''' define enhancers using h3k27ac in the 3 datasets that look good: CG, SCG, THMYCN_139076 using regular ROSE2 ''' #For SCG baseline #no TSS exclusion and no stitching dataDict = pipeline_dfci.loadDataTable(mouse_dataFile) if len(namesList) == 0: namesList = [ name for name in dataDict.keys() if name.upper().count('H3K27AC') == 1 ] bamFileList = [dataDict[name]['bam'] for name in namesList] bamString = string.join(bamFileList, ',') controlBams = [dataDict[name]['background'] for name in namesList] controlFileList = [dataDict[name]['bam'] for name in controlBams] controlBamString = string.join(controlFileList, ',') bedFileList = [ macsEnrichedFolder + dataDict[name]['enrichedMacs'] for name in namesList ] bedString = string.join(bedFileList, ',') outputFolder = '%s%s/' % (metaRoseFolder, analysisName) bashFileName = '%s%s_meta_rose.sh' % (metaRoseFolder, analysisName) bashFile = open(bashFileName, 'w') bashFile.write('#!/usr/bin/bash\n\n') bashFile.write('cd %s\n' % (pipeline_dir)) metaRoseCmd = 'python %sROSE2_META.py -g mm9 -i %s -r %s -c %s -o %s -n %s' % ( pipeline_dir, bedString, bamString, controlBamString, outputFolder, analysisName) bashFile.write(metaRoseCmd + '\n') bashFile.close() region_map_path = '%s%s/%s_AllEnhancers.table.txt' % ( metaRoseFolder, analysisName, analysisName) #runs only if no output detected if not utils.checkOutput(region_map_path, 0, 0): print(bashFileName) os.system('bash %s' % (bashFileName)) return bashFileName, region_map_path, namesList
def launchEnhancerMapping(dataFile,nameDict,outputFolder,roseFolder,maskFile=''):
    '''
    launches enhancer mapping if needed from enriched region files

    For every dataset missing an enhancerFile, launches ROSE asynchronously,
    then blocks until each AllEnhancers table appears.  Returns nameDict
    with enhancerFile filled in; exits the process on any failure.
    '''
    namesList = nameDict.keys()
    #check to see if everything is good, if so return True and call it a day
    if len([x for x in namesList if len(nameDict[x]['enhancerFile']) > 0]) == len(namesList):
        print "ENHANCER FILE OUTPUT FOUND FOR ALL DATASETS"
        return nameDict

    #if not, have to call rose
    roseOutputFolder = utils.formatFolder(roseFolder,True)
    queueList =[]
    for name in namesList:
        #check to see if we need to call rose
        if nameDict[name]['enhancerFile'] == '':
            #get the enriched file
            enrichedFile = nameDict[name]['enrichedFile']
            #call rose
            print "CALLING ROSE FOR %s" % (name)
            bashFileName = pipeline_dfci.callRose(dataFile,'',roseOutputFolder,[name],[],enrichedFile,mask=maskFile)
            print bashFileName
            #launched in the background; completion is polled below
            os.system('bash %s &' % (bashFileName))
            #add name to queue list
            queueList.append(name)

    #now check for completion of datasets
    for name in queueList:
        #check for the AllEnhancers table (poll every 5 min, up to 60 min)
        enhancerFile = "%s%s_ROSE/%s_peaks_AllEnhancers.table.txt" % (roseOutputFolder,name,name)
        print "CHECKING FOR %s ROSE OUTPUT IN %s" % (name,enhancerFile)
        if utils.checkOutput(enhancerFile,5,60):
            print "FOUND ENHANCER OUTPUT FOR %s" % (name)
            nameDict[name]['enhancerFile'] = enhancerFile
        else:
            print "UNABLE TO FIND ENHANCER OUTPUT FOR %s. QUITTING NOW" % (name)
            sys.exit()

    return nameDict
def testSetupCreateManagementContract(_actor, _fee):
    '''
    Deploys the management contract via the setup CLI, resolves the derived
    battery/erc20/wallet contract addresses, and checks the printed summary.
    Also updates the global mgmtContract handle and fee trackers used by the
    other tests.
    '''
    global mgmtContract, currentBatFee, currentRegFee
    currentBatFee = _fee
    currentRegFee = 1000 * currentBatFee
    _cmd = [SetupExe, '--setup', str(currentBatFee)]
    _ret = runCmdWithActor(_cmd, _actor)
    retval = False
    if not _ret[0]:
        print("The command executed too long")
        return retval
    db = openDataBase()
    if not ('mgmtContract' in db):
        printError('Database was not found or it is in incorrect format', _ret[1])
    else:
        mgmtContractAddress = db['mgmtContract']
        mgmtContract = ManagementContract(w3, mgmtContractAddress)
        try:
            batContractAddress = mgmtContract.functions.batteryManagement().call()
        except ValueError as error:
            printError("Cannot call batteryManagement()", [error])
            return retval
        batContract = BatteryContract(w3, batContractAddress)
        try:
            erc20ContractAddress = batContract.functions.erc20().call()
        except ValueError as error:
            printError("Cannot call erc20()", [error])
            return retval
        try:
            walletContractAddress = mgmtContract.functions.walletContract().call()
        except ValueError as error:
            #BUGFIX: this failure was previously misreported as
            #"Cannot call erc20()" (copy-paste of the block above)
            printError("Cannot call walletContract()", [error])
            return retval
        forma = 'Management contract: {}\nWallet contract: {}\nCurrency contract: {}'
        forma = forma.format(mgmtContractAddress, walletContractAddress, erc20ContractAddress).split('\n')
        retval = checkOutput(_ret[1], forma)
    return retval
def CarGetAccount(_actor):
    '''
    Writes the actor's private key into the car database, asks the car CLI
    for its account, and checks the printed address matches the actor's.
    '''
    writeDataBase({'key': privKeys[_actor['account']]}, carFile)
    result = runCmd([CarExe, "--account"])
    if not result[0]:
        print("The command executed too long or there is nothing on output")
        return False
    return checkOutput(result[1], [_actor['account']])
def callMergeSupers(dataFile, superFile1, superFile2, name1, name2, mergeName, genome, parentFolder): ''' this is the main run function for the script all of the work should occur here, but no functions should be defined here ''' mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % ( parentFolder, string.upper(genome), mergeName) #check to make sure this hasn't been done yet roseOutput = "%s%s_ROSE/%s_%s_MERGED_REGIONS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt" % ( parentFolder, name1, string.upper(genome), mergeName) try: foo = utils.parseTable(roseOutput, '\t') print "ROSE OUTPUT ALREADY FOUND HERE %s" % (roseOutput) return roseOutput except IOError: print "MERGING ENHANCER REGIONS FROM %s and %s" % (superFile1, superFile2) mergedGFF = mergeCollections(superFile1, superFile2, name1, name2, mergedGFFFile) #call rose on the merged shit roseBashFile = callRoseMerged(dataFile, mergedGFF, name1, name2, parentFolder) print('i can has rose bash file %s' % (roseBashFile)) #run the bash command os.system('bash %s' % (roseBashFile)) #check for and return output if utils.checkOutput(roseOutput, 1, 10): return roseOutput else: #try finding it w/ a different name #this will bug out if nothing is there roseFolder = "%s%s_ROSE/" % (parentFolder, name1) roseFileList = [x for x in os.listdir(roseFolder) if x[0] != '.'] #no hidden files if len(roseFileList) == 0: print "No files found in %s" % (roseFolder) sys.exit() enhancerToGeneFile = getFile( '_SuperEnhancers_ENHANCER_TO_GENE.txt', roseFileList, roseFolder)
def CarNewAccount():
    '''
    Creates a new car account via the CLI and checks that the printed
    address is the public key derived from the stored private key.
    '''
    result = runCmd([CarExe, "--new"])
    if not result[0]:
        print("The command executed too long or there is nothing on output")
        return False
    data = openDataBase(carFile)
    if 'key' not in data:
        printError("Incorrect database", result[1])
        return False
    return checkOutput(result[1], [privtopub(data['key'])])
def callMergeSupers(dataFile, superFile1, superFile2, name1, name2, mergeName, genome, parentFolder): """ this is the main run function for the script all of the work should occur here, but no functions should be defined here """ mergedGFFFile = "%s%s_%s_MERGED_REGIONS_-0_+0.gff" % (parentFolder, string.upper(genome), mergeName) # check to make sure this hasn't been done yet roseOutput = "%s%s_ROSE/%s_%s_MERGED_REGIONS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt" % ( parentFolder, name1, string.upper(genome), mergeName, ) try: foo = utils.parseTable(roseOutput, "\t") print "ROSE OUTPUT ALREADY FOUND HERE %s" % (roseOutput) return roseOutput except IOError: print "MERGING ENHANCER REGIONS FROM %s and %s" % (superFile1, superFile2) mergedGFF = mergeCollections(superFile1, superFile2, name1, name2, mergedGFFFile) # call rose on the merged shit roseBashFile = callRoseMerged(dataFile, mergedGFF, name1, name2, parentFolder) print ("i can has rose bash file %s" % (roseBashFile)) # run the bash command os.system("bash %s" % (roseBashFile)) # check for and return output if utils.checkOutput(roseOutput, 1, 10): return roseOutput else: # try finding it w/ a different name # this will bug out if nothing is there roseFolder = "%s%s_ROSE/" % (parentFolder, name1) roseFileList = [x for x in os.listdir(roseFolder) if x[0] != "."] # no hidden files if len(roseFileList) == 0: print "No files found in %s" % (roseFolder) sys.exit() enhancerToGeneFile = getFile("_SuperEnhancers_ENHANCER_TO_GENE.txt", roseFileList, roseFolder)
def VendorDeposit(_actor):
    '''
    Adds to the vendor deposit via the CLI and checks the printed deposit
    matches the value recorded in the management contract.
    '''
    result = runCmdWithActor([VendorExe, '--deposit'], _actor)
    if not result[0]:
        print("The command executed too long or there is nothing on output")
        return False
    #read the authoritative deposit back from the contract
    try:
        cur_deposit = mgmtContract.functions.vendorDeposit(_actor['account']).call()
    except ValueError as error:
        printError("Cannot call vendorDeposit()", [error])
        return False
    expected = ['Deposit: {}'.format(w3.fromWei(cur_deposit, 'ether'))]
    return checkOutput(result[1], expected)
def VendorGetBatteryFeeAfterChange(_setfeeactor, _actor, _newfee):
    '''
    Changes the battery fee via the setup CLI (as _setfeeactor), then
    queries it via the vendor CLI (as _actor) and checks the reported value.
    NOTE(review): the expected value is the global currentBatFee, not
    _newfee -- currentBatFee is resynced elsewhere (SetupSetFeeTwoOptions);
    confirm the intended call ordering keeps these consistent.
    '''
    _cmd = [SetupExe, '--setfee', str(_newfee)]
    _ret = runCmdWithActor(_cmd, _setfeeactor)
    retval = False
    if not _ret[0]:
        print("The command executed too long or there is nothing on output")
        return retval
    _cmd = [VendorExe, '--batfee']
    _ret = runCmdWithActor(_cmd, _actor, True)
    if not _ret[0]:
        print("The command executed too long or there is nothing on output")
        return retval
    return checkOutput(_ret[1], [f"Production fee per one battery: {currentBatFee}"])
def VendorRegisterNewVendorSuccess(_actor, _name, _value):
    '''
    Registers a brand-new vendor and verifies: the wallet contract received
    exactly the deposited ether, the recorded deposit matches, and the CLI
    printed the assigned vendor id.
    '''
    walletContractAddress = mgmtContract.functions.walletContract().call()
    prev_balance = w3.eth.getBalance(walletContractAddress)
    cmd = [VendorExe, '--reg', _name, str(_value)]
    _ret = runCmdWithActor(cmd, _actor)
    retval = False
    if not _ret[0]:
        print("The command executed too long")
        return retval
    cur_balance = w3.eth.getBalance(walletContractAddress)
    wei_value = w3.toWei(_value, 'ether')
    #the wallet must gain exactly the registered value
    if not (cur_balance - prev_balance == wei_value):
        printError("Wallet balance was not changed", _ret[1])
        return retval
    try:
        deposit = mgmtContract.functions.vendorDeposit(_actor['account']).call()
    except ValueError as error:
        printError("Cannot call vendorDeposit()", [error])
        return retval
    if deposit != wei_value:
        printError("Deposit differs", _ret[1])
        return retval
    try:
        vendId = mgmtContract.functions.vendorId(_actor['account']).call()
    except ValueError as error:
        #BUGFIX: this failure was previously misreported as
        #"Cannot call vendorDeposit()" (copy-paste of the block above)
        printError("Cannot call vendorId()", [error])
        return retval
    forma = 'Success.\nVendor ID: {}'.format(str(w3.toHex(vendId))[2:])
    forma = forma.split('\n')
    retval = checkOutput(_ret[1], forma)
    return retval
def callRScript(genome, outputFolder, analysisName, signalTableFile):
    '''
    calls the R script to do clustering and heatmap

    Blocks up to 30 minutes (polling every minute) for the cluster table and
    returns its path; exits the process if it never appears.
    '''
    clusterTable = "%s%s_%s_clusterTable.txt" % (outputFolder, genome, analysisName)

    rCmd = 'R --no-save %s %s %s %s < /ark/home/cl512/pipeline/clusterEnhancer.R' % (genome, outputFolder, analysisName, signalTableFile)
    print("Calling command %s" % rCmd)
    os.system(rCmd)

    print "Checking for cluster table output at %s" % (clusterTable)
    if utils.checkOutput(clusterTable, 1, 30):
        return clusterTable
    else:
        print "ERROR: CLUSTERING TABLE FAILED TO GENERATE"
        sys.exit()
def callRScript(genome,outputFolder,analysisName,signalTableFile): ''' calls the R script to do clustering and heatmap ''' clusterTable = "%s%s_%s_clusterTable.txt" % (outputFolder,genome,analysisName) rCmd = 'R --no-save %s %s %s %s < /ark/home/cl512/pipeline/clusterEnhancer.R' % (genome,outputFolder,analysisName,signalTableFile) print("Calling command %s" % rCmd) os.system(rCmd) print "Checking for cluster table output at %s" % (clusterTable) if utils.checkOutput(clusterTable,1,30): return clusterTable else: print "ERROR: CLUSTERING TABLE FAILED TO GENERATE" sys.exit()
def VendorOwnerOneBattery(_actor, _failed=False):
    '''
    Transfers ownership of the existing battery (exist_batId) to _actor via
    the vendor CLI (run as ownerVendor) and cross-checks the battery
    contract state.  _failed=True expects the transfer to be rejected.
    '''
    global exist_batId, batContract
    _newOwner = _actor['account']
    _batId = exist_batId
    cmd = [VendorExe, '--owner', delHexPrefix(w3.toHex(_batId)), _newOwner]
    _ret = runCmdWithActor(cmd, ownerVendor)
    retval = False
    if not _ret[0]:
        print(_ret[1])
        print("The command executed too long or there is nothing on output")
        return retval
    try:
        vendAddr = batContract.functions.vendorOf(_batId).call()
    except ValueError as error:
        printError("Cannot call vendorOf()", [error])
        return retval
    try:
        ownerAddr = batContract.functions.ownerOf(_batId).call()
    except ValueError as error:
        printError("Cannot call ownerOf()", [error])
        return retval
    forma = []
    if not _failed:
        forma = ['Success']
        #battery must exist (non-zero vendor) and ownership must have moved
        if (vendAddr == '0x' + '0' * 40) or (_newOwner != ownerAddr):
            printError("Incorrect info in Battery management contract", _ret[1])
            return retval
    else:
        forma = ['Failed. Not allowed to change ownership.']
        #battery must exist and ownership must NOT have moved
        if (vendAddr == '0x' + '0' * 40) or (_newOwner == ownerAddr):
            printError("Incorrect info in Battery management contract", _ret[1])
            return retval
    return checkOutput(_ret[1], forma)
def testVendorNewAccount():
    '''
    Creates a new vendor account via the CLI and checks that the node gained
    exactly one account whose address appears in the command output.
    '''
    prev_acc_list = w3.eth.accounts
    ret = runCmd([VendorExe, "--new", "'New_password'"])
    if not ret[0]:
        print("The command executed too long")
        return False
    #symmetric difference isolates the account the command added
    diff = set(prev_acc_list) ^ set(w3.eth.accounts)
    if not diff:
        printError("New account not found on the node", ret[1])
        return False
    return checkOutput(ret[1], [list(diff)[0]])
def ScenterNewAccount():
    '''
    Creates a new service-center account via the CLI and checks that the
    node gained exactly one account whose address was printed.
    '''
    prev_acc_list = w3.eth.accounts
    result = runCmd([ScenterExe, "--new", "'New_password'"])
    if not result[0]:
        print("The command executed too long or there is nothing on output")
        return False
    #symmetric difference isolates the account the command added
    diff = set(prev_acc_list) ^ set(w3.eth.accounts)
    if not diff:
        printError("New account not found on the node", result[1])
        return False
    return checkOutput(result[1], [list(diff)[0]])
def make_probe_to_gene_dict(annotFile, array_1_path, array_2_path):
    '''
    keyed by probe ID w/ gene as value

    Builds a dict mapping array probe IDs to refseq common gene names,
    keeping only probes whose gene appears in the annotation file.  The
    result is cached as a pickle and reloaded on subsequent calls.
    '''
    #see if it already exists
    pickle_path = '%soberthuer_outcome/probe_dict.pkl' % (projectFolder)
    if utils.checkOutput(pickle_path, 0, 0):
        print('loading previously made probe dict at %s' % (pickle_path))
        probe_gene_dict = pickle.load(open(pickle_path, "rb"))
        return probe_gene_dict

    #we want to intersect refseq common names w/ the array
    startDict = utils.makeStartDict(annotFile)
    ref_name_list = utils.uniquify([startDict[refID]['name'] for refID in startDict.keys()])
    #set membership is O(1) per probe; the previous list.count scan was
    #O(len(ref_name_list)) on every array row
    ref_name_set = set(ref_name_list)

    probe_gene_dict = {}
    array_1 = utils.parseTable(array_1_path, '\t')
    array_2 = utils.parseTable(array_2_path, '\t')
    for line in array_1 + array_2:
        #skip malformed/short rows that lack the probe id column
        if len(line) < 5:
            continue
        probe_id = line[4]
        name = line[-1]
        if name in ref_name_set:
            probe_gene_dict[probe_id] = name

    pickle.dump(probe_gene_dict, open(pickle_path, 'wb'))
    return probe_gene_dict
def SetupSetFeeTwoOptions(_actor, _fee, _failed=False):
    '''
    Sets the battery fee via the setup CLI and verifies the contract value;
    resyncs the global fee trackers from the contract.  _failed=True
    expects a permission-denial message instead of the success message.
    '''
    global currentBatFee, currentRegFee
    _cmd = [SetupExe, '--setfee', str(_fee)]
    _ret = runCmdWithActor(_cmd, _actor)
    retval = False
    if not _ret[0]:
        print("The command executed too long")
        return retval
    if (len(_ret[1]) == 0):
        printError("No output")
        return retval
    else:
        newBatFee = w3.toWei(_fee, 'ether')
        try:
            currentBatFeeWei = mgmtContract.functions.batteryFee().call()
        except ValueError as error:
            printError("Cannot call batteryFee()", [error])
            return retval
        #resync globals from the contract's authoritative value
        currentBatFee = w3.fromWei(currentBatFeeWei, 'ether')
        currentRegFee = Decimal.normalize(1000 * currentBatFee)
        if newBatFee != currentBatFeeWei:
            printError("Fee was not set", _ret[1])
            return retval
    if not _failed:
        forma = ['Updated successfully']
    else:
        forma = ['No permissions to change the service fee']
    return checkOutput(_ret[1], forma)
def VendorRegisterInsufficientFundsFail(_name, _value):
    """Attempt vendor registration from a freshly created, unfunded account.

    Expects the registration to fail: the wallet contract balance must be
    unchanged, the vendor deposit must stay zero, and the tool must report
    the insufficient-funds message. Returns True only when all checks pass.
    """
    wallet_address = mgmtContract.functions.walletContract().call()
    balance_before = w3.eth.getBalance(wallet_address)
    password = "******"
    # brand-new account has no funds, so the deposit cannot be paid
    actor = {"account": w3.personal.newAccount(password), "password": password}
    run_result = runCmdWithActor([VendorExe, '--reg', _name, str(_value)], actor)
    if not run_result[0]:
        print("The command executed too long")
        return False
    balance_after = w3.eth.getBalance(wallet_address)
    if balance_after - balance_before != 0:
        printError("Wallet balance was changed", run_result[1])
        return False
    try:
        deposit = mgmtContract.functions.vendorDeposit(actor['account']).call()
    except ValueError as error:
        printError("Cannot call vendorDeposit()", [error])
        return False
    if deposit != 0:
        print(deposit)
        printError("Vendor was registered. Deposit is not equal to zero", run_result[1])
        return False
    return checkOutput(run_result[1], ['Failed. No enough funds to deposit.'])
def main():
    """
    main run function

    Parses command-line options, loads the plotting regions (from a .gff,
    a .bed, or a chr:sense:start-stop string), maps the given bams over the
    regions, writes summary tables, runs the R plotting script, and cleans
    up temporary files unless --save-temp is flagged.

    Fixes relative to the previous revision:
      * temp-folder safety check used `is not "/"` (identity comparison with
        a literal, always True) — now a value comparison `!= "/"`.
      * color recycling used `/` on ints, which yields a float under true
        division and breaks list multiplication — now floor division `//`.
    """
    #usage = "usage: %prog [options] -g [GENOME] -b [SORTED BAMFILE(S)] -i [INPUTFILE] -o [OUTPUTFOLDER]"
    parser = argparse.ArgumentParser(usage='%(prog)s [options]')

    # required flags
    parser.add_argument("-b", "--bam", dest="bam", nargs='*', help="Enter a comma separated list of .bam files to be processed.", required=True)
    parser.add_argument("-i", "--input", dest="input", type=str, help="Enter .gff or genomic region e.g. chr1:+:1-1000.", required=True)
    parser.add_argument("-g", "--genome", dest="genome", type=str, help="specify a genome, HG18,HG19,MM8,MM9,MM10 are currently supported", required=True)

    # output flag
    parser.add_argument("-o", "--output", dest="output", type=str, help="Enter the output folder.", required=True)

    # additional options
    parser.add_argument("--stretch-input", dest="stretch_input", default=None, type=int, help="Stretch the input regions to a minimum length in bp, e.g. 10000 (for 10kb)")
    parser.add_argument("-c", "--color", dest="color", default=None, help="Enter a colon separated list of colors e.g. 255,0,0:255,125,0, default samples the rainbow")
    parser.add_argument("-s", "--sense", dest="sense", default='both', help="Map to '+','-' or 'both' strands. Default maps to both.")
    parser.add_argument("-e", "--extension", dest="extension", default=200, help="Extends reads by n bp. Default value is 200bp")
    parser.add_argument("-r", "--rpm", dest="rpm", action='store_true', default=False, help="Normalizes density to reads per million (rpm) Default is False")
    parser.add_argument("-y", "--yScale", dest="yScale", default="relative", help="Choose either relative or uniform y axis scaling. options = 'relative,uniform' Default is relative scaling")
    parser.add_argument("-n", "--names", dest="names", default=None, help="Enter a comma separated list of names for your bams")
    parser.add_argument("-p", "--plot", dest="plot", default="MULTIPLE", help="Choose either all lines on a single plot or multiple plots. options = 'SINGLE,MULTIPLE,MERGE'")
    parser.add_argument("-t", "--title", dest="title", default='', help="Specify a title for the output plot(s), default will be the coordinate region")

    # DEBUG OPTION TO SAVE TEMP FILES
    parser.add_argument("--scale", dest="scale", default='', help="Enter a comma separated list of scaling factors for your bams. Default is none")
    parser.add_argument("--save-temp", dest="save", action='store_true', default=False, help="If flagged will save temporary files made by bamPlot")
    parser.add_argument("--bed", dest="bed", help="Add a space-delimited list of bed files to plot")
    parser.add_argument("--multi-page", dest="multi", action='store_true', default=False, help="If flagged will create a new pdf for each region")

    args = parser.parse_args()

    print(args)

    if args.bam and args.input and args.genome and args.output:

        # Support a legacy mode where a ',' delimited multiple files
        bamFileList = args.bam
        if len(args.bam) == 1:
            bamFileList = args.bam[0].split(',')

        # Make sure these are actually files & readable (!)
        for filename in bamFileList:
            assert(os.access(filename, os.R_OK))

        # bringing in any beds
        if args.bed:
            bedFileList = args.bed
            if type(bedFileList) == str:
                bedFileList = args.bed.split(',')
            print(bedFileList)
            bedCollection = makeBedCollection(bedFileList)
        else:
            bedCollection = utils.LocusCollection([], 50)

        # Load the input for graphing. One of:
        # - A .gff
        # - A .bed
        # - a specific input region (e.g. chr10:.:93150000-93180000)
        valid_sense_options = {'+', '-', '.'}
        if os.access(args.input, os.R_OK):
            if args.input.endswith('.bed'):
                # Uniquely graph every input of this bed
                parsed_input_bed = utils.parseTable(args.input, '\t')
                gffName = os.path.basename(args.input)  # Graph title
                gff = None
                try:
                    if parsed_input_bed[0][5] in valid_sense_options:
                        # This .bed might have a sense parameter
                        gff = [[e[0], '', args.input, e[1], e[2], '', e[5], '', ''] for e in parsed_input_bed]
                except IndexError:
                    pass
                if gff is None:
                    print("Your bed doesn't have a valid sense parameter. Defaulting to both strands, '.'")
                    # We only take chr/start/stop and ignore everything else.
                    gff = [[e[0], '', args.input, e[1], e[2], '', '.', '', ''] for e in parsed_input_bed]
            else:
                # Default to .gff, since that's the original behavior
                gff = utils.parseTable(args.input, '\t')
                gffName = args.input.split('/')[-1].split('.')[0]
        else:
            # means a coordinate line has been given e.g. chr1:+:1-100
            chromLine = args.input.split(':')
            try:
                chrom = chromLine[0]
                sense = chromLine[1]
            except IndexError:
                print('Invalid input line or inaccessible file. Try: chr1:.:1-5000')
                exit()
            assert(sense in valid_sense_options)
            [start, end] = chromLine[2].split('-')
            if chrom[0:3] != 'chr':
                print('ERROR: UNRECOGNIZED GFF OR CHROMOSOME LINE INPUT')
                exit()
            gffLine = [chrom, '', args.input, start, end, '', sense, '', '']
            gffName = "%s_%s_%s_%s" % (chrom, sense, start, end)
            gff = [gffLine]

        # Consider stretching the regions to a fixed minimum size
        if args.stretch_input:
            print('Stretching inputs to a minimum of: %d bp' % (args.stretch_input))
            minLength = args.stretch_input
            stretchGff = []
            for e in gff:
                difference = int(e[4]) - int(e[3])
                if difference < minLength:
                    pad = int((minLength - difference) / 2)
                    stretchGff.append([e[0], e[1], e[2], int(e[3])-pad, int(e[4])+pad, e[5], e[6], e[7], e[8]])
                else:
                    stretchGff.append(e)
            gff = stretchGff

        # Sanity test the gff object
        assert(all([e[6] in valid_sense_options for e in gff]))  # All strands are sane
        #assert(all([int(e[3]) < int(e[4]) for e in gff]))  # All start/stops are ordered

        # bring in the genome
        genome = args.genome.upper()
        if ['HG18', 'HG19', 'HG19_RIBO','HG38','MM9', 'MM10', 'RN4','RN6'].count(genome) == 0:
            print('ERROR: UNSUPPORTED GENOME TYPE %s. USE HG19,HG18, RN4, MM9, or MM10' % (genome))
            parser.print_help()
            exit()

        # bring in the rest of the options

        # output
        rootFolder = args.output
        if rootFolder[-1] != '/':
            rootFolder += '/'
        try:
            os.listdir(rootFolder)
        except OSError:
            print('ERROR: UNABLE TO FIND OUTPUT DIRECTORY %s' % (rootFolder))
            exit()

        # Get analysis title
        if len(args.title) == 0:
            title = gffName
        else:
            title = args.title

        # make a temp folder
        tempFolder = rootFolder + title + '/'
        print("CREATING TEMP FOLDER %s" % (tempFolder))
        pipeline_dfci.formatFolder(tempFolder, create=True)

        # colors
        if args.color:
            colorList = args.color.split(':')
            colorList = [x.split(',') for x in colorList]
            if len(colorList) < len(bamFileList):
                print('WARNING: FEWER COLORS THAN BAMS SPECIFIED. COLORS WILL BE RECYCLED')
                # recycling the color list; // keeps the multiplier an int under true division
                colorList += colorList * (len(bamFileList) // len(colorList))
                colorList = colorList[0:len(bamFileList)]
        else:
            # cycles through the colors of the rainbow
            colorList = tasteTheRainbow(len(bamFileList))

        # sense
        sense = args.sense

        extension = int(args.extension)

        rpm = args.rpm

        scale = args.scale

        yScale = args.yScale.upper()

        # names
        if args.names:
            names = args.names.split(',')
            if len(names) != len(bamFileList):
                print('ERROR: NUMBER OF NAMES AND NUMBER OF BAMS DO NOT CORRESPOND')
                parser.print_help()
                exit()
        else:
            names = [x.split('/')[-1] for x in bamFileList]

        # plot style
        plotStyle = args.plot.upper()
        if ['SINGLE', 'MULTIPLE','MERGE'].count(plotStyle) == 0:
            print('ERROR: PLOT STYLE %s NOT AN OPTION' % (plotStyle))
            parser.print_help()
            exit()

        # now run!
        summaryTableFileName = makeBamPlotTables(gff, genome, bamFileList, colorList, nBins, sense, extension, rpm, tempFolder, names, title, bedCollection,scale)
        print ("%s is the summary table" % (summaryTableFileName))

        #running the R command to plot
        multi = args.multi
        outFile = "%s%s_plots.pdf" % (rootFolder, title)
        rCmd = callRPlot(summaryTableFileName, outFile, yScale, plotStyle,multi)

        # open a bash file
        bashFileName = "%s%s_Rcmd.sh" % (tempFolder, title)
        bashFile = open(bashFileName, 'w')
        bashFile.write('#!/usr/bin/bash\n')
        bashFile.write(rCmd)
        bashFile.close()
        print("Wrote R command to %s" % (bashFileName))
        os.system("bash %s" % (bashFileName))

        # delete temp files
        if not args.save:
            if utils.checkOutput(outFile, 1, 10):
                # This is super dangerous (!). Add some sanity checks.
                assert(" " not in tempFolder)
                # value comparison — `is not` with a literal was always True
                assert(tempFolder != "/")
                removeCommand = "rm -rf %s" % (tempFolder)
                print(removeCommand)
                os.system(removeCommand)
            else:
                print("ERROR: NO OUTPUT FILE %s DETECTED" % (outFile))

    else:
        parser.print_help()
        sys.exit()
def main():
    '''
    main run function

    Dynamic enhancer analysis: compares ROSE output between two groups of
    datasets, merges enhancer regions, re-runs ROSE on the merged regions,
    computes enhancer deltas via R, ranks differential enhancers, and maps
    genes to them.

    Fixes relative to the previous revision:
      * regionFile2 was read from roseDict1 (copy-paste) — now roseDict2.
      * super/all/region file assignments are hoisted out of the superOnly
        branch so that --all mode (which reads allFile1/allFile2 and later
        superFile1/superFile2) no longer hits a NameError.
    '''
    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] {-r [ROSE_FOLDERS] | -i [INPUT_GFF]} -o [OUTPUT_FOLDER] --group1 [GROUP1_NAMES] --group2 [GROUP2_NAMES] --name1 [GROUP1_NAME] --name2 [GROUP2_NAME]"
    parser = OptionParser(usage = usage)

    #required flags
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None, help = "Enter the genome build (HG18,HG19,MM9,RN4) for the project")
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None, help = "Enter the data file for the project")
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None, help = "Enter the output folder for the project")
    parser.add_option("--group1", dest="group1",nargs = 1, default=None, help = "Enter a comma separated list of dataset names associated with the first group")
    parser.add_option("--group2", dest="group2",nargs = 1, default=None, help = "Enter a comma separated list of dataset names associated with the second group")
    parser.add_option("--name1", dest="name1",nargs = 1, default=None, help = "Enter a name for the first group of datasets")
    parser.add_option("--name2", dest="name2",nargs = 1, default=None, help = "Enter a name for the second group of datasets")

    #the input options
    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None, help = "Enter a comma separated list of meta rose folders")

    #optional input to supercede the meta rose (this is kinda sad but will fix later)
    #should have had this code run clustering from the get go
    parser.add_option("-i","--input", dest="input",nargs = 1, default=None, help = "enter a gff, bed or table of regions to perform dyanmic analysis on")

    #additional options
    parser.add_option("-p","--plot", dest="plot",action = 'store_true', default=False, help = "If flagged, will plot differential regions")
    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False, help = "If flagged, will run analysis for all enhancers and not just supers.")
    parser.add_option("-m","--median", dest="median",action = 'store_true', default=False, help = "If flagged, will use median enhancer scaling")
    parser.add_option("-e","--enhancer-type", dest="enhancer_type",nargs = 1,default='super', help = "specify type of enhancer to analyze: super, stretch, superStretch")
    parser.add_option("--use-background", dest="background",action = 'store_true',default=False, help = "If flagged will use background datasets as in data table")

    (options,args) = parser.parse_args()

    print(options)
    print(args)

    requiredArgs = [options.genome,options.data,options.rose,options.output,options.group1,options.group2,options.name1,options.name2]
    try:
        assert(all(requiredArgs))
    except AssertionError:
        parser.print_help()
        sys.exit()

    #now the main run of the function

    #getting the genome and data file
    genome = options.genome.upper()
    dataFile = options.data

    #getting the rose folders
    roseFolderString = options.rose
    [roseFolder1,roseFolder2] = roseFolderString.split(',')
    parentFolder = utils.formatFolder(options.output,True)

    #getting the analysis names
    name1 = options.name1
    name2 = options.name2
    mergeName = "%s_%s_merged" % (name1,name2)

    #getting the datasets names associated with each group
    namesList1 = options.group1.split(',')
    namesList2 = options.group2.split(',')

    #options for background correction
    useBackground = options.background

    #option for median scaling
    medianScale = options.median

    #option for an overriding set of input regions
    if options.input != None:
        #for now only works w/ gffs
        print('Using %s as a set of predifined input regions' % (options.input))
        inputGFF = options.input
    else:
        inputGFF= ''

    plotBam = options.plot
    if options.all:
        superOnly = False
    else:
        superOnly = True

    if superOnly and plotBam:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (name1,name2,parentFolder))
    if superOnly and not plotBam:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (name1,name2,parentFolder))
    if not superOnly and plotBam:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (name1,name2,parentFolder))
    if not superOnly and not plotBam:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (name1,name2,parentFolder))

    #part 1
    print("PART1: analyzing ROSE output from %s and %s" % (name1,name2))

    #start with the all enhancer tables from the initial rose calls
    roseFolder1 = pipeline_dfci.formatFolder(roseFolder1,False)
    roseFolder2 = pipeline_dfci.formatFolder(roseFolder2,False)

    roseDict1 = makeRoseDict(roseFolder1)
    roseDict2 = makeRoseDict(roseFolder2)

    #choosing the type of enhancer to analyze
    enhancerCallType = options.enhancer_type.lower()
    if superOnly:
        print("ANALYZING ENHANCER TYPE: %s" % (enhancerCallType.upper()))

    # hoisted out of the superOnly branch: the --all path needs allFile1/2
    # and the rank plot below needs superFile1/2 in either mode
    superFile1 = roseDict1[enhancerCallType]
    superFile2 = roseDict2[enhancerCallType]

    allFile1 = roseDict1['AllEnhancer']
    allFile2 = roseDict2['AllEnhancer']

    regionFile1 = roseDict1['RegionMap']
    regionFile2 = roseDict2['RegionMap']  # fixed: previously read roseDict1

    #this is where we can toggle either using meta rose or clustering
    print('\tMERGING ENHANCERS AND CALLING ROSE')
    if superOnly:
        if len(superFile1) == 0:
            print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType,roseFolder1))
            sys.exit()
        if len(superFile2) == 0:
            print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType,roseFolder2))
            sys.exit()
        roseOutput = callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergeName,genome,parentFolder,namesList1,namesList2,useBackground,inputGFF)
    else:
        print('doing it right')
        print(allFile1)
        print(allFile2)
        roseOutput = callMergeSupers(dataFile,allFile1,allFile2,name1,name2,mergeName,genome,parentFolder,namesList1,namesList2,useBackground,inputGFF)

    print('this is rose output')
    print(roseOutput)

    print('\tMERGING ROSE OUTPUT')
    mergedRoseOutput,normRoseOutput = mergeRoseSignal(dataFile,roseOutput,roseDict1,roseDict2,name1,name2,namesList1,namesList2,useBackground,medianScale)

    print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')

    #part2 is the R script
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (parentFolder,genome.upper(),mergeName)
    rcmd = callDeltaRScript(mergedGFFFile,parentFolder,dataFile,name1,name2,allFile1,allFile2,medianScale,namesList1)
    print(rcmd)
    os.system(rcmd)

    time.sleep(5)
    callRoseGeneMapper(mergedGFFFile,genome,parentFolder,namesList1)

    #rank the genes

    #part 3
    #rank the delta
    print("PART 3: assinging ranks to differential enhancers")

    print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')
    gffName = '%s_%s_MERGED_REGIONS_-0_+0' % (genome.upper(),mergeName)
    enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_MERGED_ENHANCER_TO_GENE_100KB.txt" % (parentFolder,namesList1[0],gffName)
    if utils.checkOutput(enhancerToGeneFile):
        rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_MERGED_ENHANCER_TO_GENE_100KB_RANK.txt" % (parentFolder,namesList1[0],gffName)
        assignEnhancerRank(enhancerToGeneFile,allFile1,allFile2,name1,name2,rankOutput)
    else:
        print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
        sys.exit()

    #make the rank plot
    print('MAKING RANK PLOTS')
    if utils.checkOutput(rankOutput):
        print('checking for rank output %s' % (rankOutput))
        rcmd = callRankRScript(rankOutput,name1,name2,superFile1,superFile2)
        print(rcmd)
        os.system(rcmd)
    else:
        print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
        sys.exit()

    print('MAKING REGION SIGNAL PLOTS AND FINDING DIFFERENTIAL REGIONS')
    if utils.checkOutput(normRoseOutput):
        print('checking for %s' % (normRoseOutput))
        rcmd = callRegionPlotRScript(normRoseOutput,name1,name2,namesList1,namesList2)
        print(rcmd)
        os.system(rcmd)
    else:
        print('ERROR: REGION PLOT SCRIPT FAILED TO RUN')
        sys.exit()

    #NOW MAP GENES
    print('mapping genes to differential enhancers')
    statOutput,diffOutput = callRoseGeneMapper_stats(mergedGFFFile,genome,parentFolder,namesList1)

    if utils.checkOutput(statOutput):
        print('checking for gene mapping output %s' % (statOutput))
        print('FINISHED WITH GENE MAPPING')
    else:
        print('GENE MAPPING FAILED')
        sys.exit()

    print('FINISHING OUTPUT')
    finishRankOutput(dataFile,statOutput,diffOutput,genome,parentFolder,mergeName,name1,name2,namesList1,namesList2,1.0,100000,superOnly,plotBam)
def main():
    '''
    main run call

    ROSE2 meta driver: converts one or more input .bed/.gff region files to a
    single stitched GFF, maps rankby/control bams over the stitched regions
    with bamliquidator, averages signal across bams, calls super-enhancers in
    R, and runs the gene mapper on the resulting enhancer tables.

    Fixes relative to the previous revision:
      * the per-input-file extension check tested options.input (the whole
        comma-separated option string) instead of the current inputFile, so
        any file after the first could be misclassified — now uses inputFile.
      * stitchWindow / 1000 in filename construction switched to floor
        division so names stay integral under true division.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)

    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None, help="Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None, help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None, help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None, help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None, help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None, help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='', help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0, help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")
    parser.add_option("--mask", dest="mask", nargs=1, default=None, help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('hi there')
        parser.print_help()
        exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)
    inputList = [inputFile for inputFile in options.input.split(',') if len(inputFile) > 1]

    #converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        if inputFile.split('.')[-1] == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][0:-4]  #strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        elif inputFile.split('.')[-1] == 'gff':  # fixed: previously tested options.input
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder,inputFile.split('/')[-1])
        else:
            print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder,inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)

    # GETTING THE LIST OF BAMFILES TO PROCESS
    #either same number of bams for rankby and control
    #or only 1 control
    #or none!
    #bamlist should be all rankby bams followed by control bams
    bamFileList = []
    if options.control:
        controlBamList = [bam for bam in options.control.split(',') if len(bam) > 0]
        rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) > 0]

        if len(controlBamList) == len(rankbyBamList):
            #case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            #case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList*len(rankbyBamList)
        else:
            print('ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE')
            sys.exit()
    else:
        bamFileList = [bam for bam in options.rankby.split(',') if len(bam) > 0]

    # Stitch parameter
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False

    # GETTING THE GENOME
    genome = string.upper(options.genome)
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome.upper()]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()

    #FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))

    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    #use a simpler unique region naming system
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0],'\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile,'\t')
            gffCollection = utils.gffToLocusCollection(gff,50)
            inputLoci += gffCollection.getLoci()

        inputCollection = utils.LocusCollection(inputLoci,50)
        inputCollection = inputCollection.stitchCollection()  # stitches to produce unique regions
        inputGFF = utils.locusCollectionToGFF(inputCollection)

    formattedGFF = []
    #now number things appropriately
    for i,line in enumerate(inputGFF):
        #use the coordinates to make a new id inputname_chr_sense_start_stop
        chrom = line[0]
        coords = [int(line[3]) ,int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName,str(i+1))  #1 indexing

        newLine = [chrom,lineID,lineID,min(coords),max(coords),'',sense,'',lineID]
        formattedGFF.append(newLine)

    #name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder,string.upper(genome),inputName)
    utils.unParseTable(formattedGFF,masterGFFFile,'\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile,bamChromList)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        # if it's a bed file
        if maskFile.split('.')[-1].upper() == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskFile.split('.')[-1].upper() == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))
    # floor division keeps the KB label integral under true division
    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow // 1000))
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow // 1000))
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow // 1000))
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (gffFolder, inputName, str(stitchWindow // 1000))
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, str(stitchWindow // 1000))
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (gffFolder, inputName, str(stitchWindow // 1000))

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF
    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamFileListUnique = list(bamFileList)
    bamFileListUnique = utils.uniquify(bamFileListUnique)
    #prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File,0.2,5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)

    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1,inputName + '_MERGED_SIGNAL',controlBams=options.control)

    #now try the merging
    print('CALLING AND PLOTTING SUPER-ENHANCERS')
    rankbyName = inputName + '_MERGED_SIGNAL'
    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (pipeline_dir,outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    print('CALLING GENE MAPPING')

    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)
    #for now don't use ranking bam to call top genes
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, superTableFile)
    print(cmd)
    os.system(cmd)

    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, stretchTableFile)
    print(cmd)
    os.system(cmd)

    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, superStretchTableFile)
    os.system(cmd)
def main():
    '''
    Main run call for the ROSE2 super-enhancer pipeline.

    Parses command-line flags, converts/copies the input region file into the
    output folder's gff/ schema, optionally masks regions, stitches regions,
    maps the rankby/control/extra bams onto both the stitched and original
    GFFs with bamliquidator, computes region signal via mapCollection, calls
    the ROSE2_callSuper.R script, and finally runs ROSE2_geneMapper.py on the
    super, stretch, and superstretch tables.  All heavy lifting is done via
    os.system/subprocess side effects; nothing is returned.
    '''
    debug = False

    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-b", "--bams", dest="bams", nargs=1, default=None,
                      help="Enter a comma separated list of additional bam files to map to")
    # NOTE(review): help text below appears copy-pasted from -r; this is the
    # background/control bam, not the rankby bam
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")
    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        # NOTE(review): debug leftover print before usage dump
        print('hi there')
        parser.print_help()
        exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE
    if options.input.split('.')[-1] == 'bed':
        # CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    elif options.input.split('.')[-1] == 'gff':
        # COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))
    else:
        print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
        # COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # GETTING THE LIST OF BAMFILES TO PROCESS
    # rankby bam always first; control (if any) second; extra bams appended
    if options.control:
        bamFileList = [options.rankby, options.control]
    else:
        bamFileList = [options.rankby]

    if options.bams:
        bamFileList += options.bams.split(',')
        bamFileList = utils.uniquify(bamFileList)

    # optional args
    # Stitch parameter: empty string means "let regionStitching pick optimal"
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    # annotation files are resolved relative to the current working directory
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }
    annotFile = genomeDict[genome.upper()]

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFFFile)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        # if it's a bed file
        if maskFile.split('.')[-1].upper() == 'BED':
            # assumes bedToGFF returns the GFF table when no output path is
            # given -- TODO confirm against utils
            maskGFF = utils.bedToGFF(maskFile)
        elif maskFile.split('.')[-1].upper() == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci: drop any locus overlapping the mask
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    # regionStitching also returns the stitch window actually used (relevant
    # when stitchWindow was '' and the optimal parameter was auto-determined)
    stitchedCollection, debugOutput, stitchWindow = regionStitching(inputGFFFile, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    # making sure start/stop ordering are correct
    for i in range(len(stitchedGFF)):
        line = stitchedGFF[i]
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    # NOTE(review): debug leftover prints
    print(stitchWindow)
    print(type(stitchWindow))

    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow / 1000))
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (gffFolder, inputName, str(stitchWindow / 1000))

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    for bamFile in bamFileList:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.gff' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = "python " + bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            output1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE, shell=True)
            output1 = output1.communicate()
            if len(output1[0]) > 0:  # test if mapping worked correctly
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

        # MAPPING TO THE ORIGINAL GFF
        mappedOut2Folder = '%s%s_%s_MAPPED' % (mappedFolder, inputName, bamFileName)
        mappedOut2File = '%s%s_%s_MAPPED/matrix.gff' % (mappedFolder, inputName, bamFileName)
        if utils.checkOutput(mappedOut2File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut2File))
        else:
            cmd2 = "python " + bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (inputGFFFile, mappedOut2Folder, bamFile)
            print(cmd2)

            output2 = subprocess.Popen(cmd2, stdout=subprocess.PIPE, shell=True)
            output2 = output2.communicate()
            if len(output2[0]) > 0:  # test if mapping worked correctly
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (inputGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (inputGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    if options.control:
        rankbyName = options.rankby.split('/')[-1]
        controlName = options.control.split('/')[-1]
        cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (outFolder, outputFile1, inputName, controlName)
    else:
        rankbyName = options.rankby.split('/')[-1]
        controlName = 'NONE'
        cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (outFolder, outputFile1, inputName, controlName)
    print(cmd)
    os.system(cmd)

    # calling the gene mapper
    # fixed delay to give the R script time to write its tables
    time.sleep(20)
    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, superTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, superTableFile)
    os.system(cmd)

    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, stretchTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, stretchTableFile)
    os.system(cmd)

    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, superStretchTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, superStretchTableFile)
    os.system(cmd)
def launchEnhancerMapping(dataFile,nameDict,outputFolder,roseFolder,stitch,tssDistance,enhancerType,maskFile=''): ''' launches enhancer mapping if needed from enriched region files ''' namesList = nameDict.keys() #check to see if everything is good, if so return True and call it a day if len([x for x in namesList if len(nameDict[x]['enhancerFile']) > 0]) == len(namesList): print "ENHANCER FILE OUTPUT FOUND FOR ALL DATASETS" return nameDict #if not, have to call rose roseOutputFolder = utils.formatFolder(roseFolder,True) queueList =[] for name in namesList: #check to see if we need to call rose if nameDict[name]['enhancerFile'] == '': #get the enriched file enrichedFile = nameDict[name]['enrichedFile'] #call rose print "CALLING ROSE FOR %s" % (name) bashFileName = pipeline_dfci.callRose2(dataFile,'',roseOutputFolder,[name],[],enrichedFile,tssDistance,stitch,mask=maskFile) print bashFileName os.system('bash %s &' % (bashFileName)) #add name to queue list queueList.append(name) #define the enhancer type if enhancerType == 'super': enhancerString = 'AllEnhancers.table.txt' if enhancerType == 'stretch': enhancerString = 'AllEnhancers_Length.table.txt' if enhancerType == 'superstretch': enhancerString = 'AllEnhancers_SuperStretch.table.txt' #now check for completion of datasets for name in queueList: #check for the AllEnhancers table enhancerFile = "%s%s_ROSE/%s_peaks_%s" % (roseOutputFolder,name,name,enhancerString) print "CHECKING FOR %s ROSE OUTPUT IN %s" % (name,enhancerFile) if utils.checkOutput(enhancerFile,1,10): print "FOUND ENHANCER OUTPUT FOR %s" % (name) nameDict[name]['enhancerFile'] = enhancerFile else: #try finding it w/ a different name #this will bug out if nothing is there roseFolder = "%s%s_ROSE/" % (roseOutputFolder,name) roseFileList = [x for x in os.listdir(roseFolder) if x[0] != '.'] #no hidden files if len(roseFileList) == 0: print "No files found in %s" % (roseFolder) sys.exit() enhancerFile = getFile(enhancerString,roseFileList,roseFolder) 
nameDict[name]['enhancerFile'] = enhancerFile return nameDict
def main():
    '''
    Main run function for the dynamic enhancer comparison pipeline
    (ROSE-dict variant).

    Compares ROSE output between two datasets: merges their enhancer regions,
    re-calls ROSE on the merged regions, runs the delta R script, maps genes,
    assigns differential ranks, and produces rank plots and final output.
    All work happens via helper calls and os.system side effects.
    '''
    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] -n [DATA_NAMES] -r [ROSE_FOLDERS] -o [OUTPUT_FOLDER]"
    parser = OptionParser(usage = usage)

    #required flags
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
                      help = "Enter the genome build (HG18,HG19,MM9,RN4,RN6) for the project")
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter the data file for the project")
    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a comma separated list of rose folder")
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the output folder for the project")
    parser.add_option("-n","--names", dest="names",nargs = 1, default=None,
                      help = "Enter a comma separated list of names to go with the datasets")

    #additional options
    parser.add_option("-p","--plot", dest="plot",action = 'store_true', default=False,
                      help = "If flagged, will plot differential regions")
    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "If flagged, will run analysis for all enhancers and not just supers.")
    parser.add_option("-m","--median", dest="median",action = 'store_true', default=False,
                      help = "If flagged, will use median enhancer scaling")
    parser.add_option("-e","--enhancer-type", dest="enhancer_type",nargs = 1,default='super',
                      help = "specify type of enhancer to analyze: super, stretch, superStretch")

    (options,args) = parser.parse_args()

    print(options)
    print(args)

    if options.genome and options.data and options.rose and options.output and options.names:
        genome = string.upper(options.genome)
        dataFile = options.data

        #expects exactly two comma-separated rose folders / names
        roseFolderString = options.rose
        [roseFolder1,roseFolder2] = roseFolderString.split(',')
        parentFolder = utils.formatFolder(options.output,True)

        nameString = options.names
        [name1,name2] =nameString.split(',')

        mergeName = "%s_%s_merged" % (name1,name2)

        #option for median scaling
        medianScale = options.median

        plotBam = options.plot
        if options.all:
            superOnly = False
        else:
            superOnly = True

        if superOnly and plotBam:
            print "Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (name1,name2,parentFolder)
        if superOnly and not plotBam:
            print "Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (name1,name2,parentFolder)
        if not superOnly and plotBam:
            print "Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (name1,name2,parentFolder)
        if not superOnly and not plotBam:
            print "Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (name1,name2,parentFolder)

        #part 1
        print "PART1: analyzing ROSE output from %s and %s" % (name1,name2)

        #start with the all enhancer tables from the initial rose calls
        roseFolder1 = pipeline_dfci.formatFolder(roseFolder1,False)
        roseFolder2 = pipeline_dfci.formatFolder(roseFolder2,False)

        roseDict1 = makeRoseDict(roseFolder1)
        roseDict2 = makeRoseDict(roseFolder2)

        #choosing the type of enhancer to analyze
        enhancerCallType = string.lower(options.enhancer_type)
        if superOnly:
            print("ANALYZING ENHANCER TYPE: %s" % (string.upper(enhancerCallType)))

        #both the type-specific and the AllEnhancer tables are needed later
        #(allFile* feed the delta script and ranking in every code path)
        superFile1 = roseDict1[enhancerCallType]
        superFile2 = roseDict2[enhancerCallType]
        allFile1 = roseDict1['AllEnhancer']
        allFile2 = roseDict2['AllEnhancer']

        print('\tMERGING ENHANCERS AND CALLING ROSE')
        if superOnly:
            if len(superFile1) ==0:
                print "ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType,roseFolder1)
                sys.exit()
            if len(superFile2) == 0:
                print "ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType,roseFolder2)
                sys.exit()
            roseOutput = callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergeName,genome,parentFolder)
        else:
            roseOutput = callMergeSupers(dataFile,allFile1,allFile2,name1,name2,mergeName,genome,parentFolder)

        print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')

        #part2 is the R script
        mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (parentFolder,string.upper(genome),mergeName)
        rcmd = callDeltaRScript(mergedGFFFile,parentFolder,dataFile,name1,name2,allFile1,allFile2,medianScale)
        print(rcmd)
        os.system(rcmd)

        #fixed delay to give the R script time to finish writing
        time.sleep(30)
        callRoseGeneMapper(mergedGFFFile,genome,parentFolder,name1)

        #rank the genes

        #part 3
        #rank the delta
        print "PART 3: assinging ranks to differential enhancers"
        print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')

        gffName = '%s_%s_MERGED_REGIONS_-0_+0' % (string.upper(genome),mergeName)
        enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB.txt" % (parentFolder,name1,gffName)
        if utils.checkOutput(enhancerToGeneFile):
            rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB_RANK.txt" % (parentFolder,name1,gffName)
            assignEnhancerRank(enhancerToGeneFile,allFile1,allFile2,name1,name2,rankOutput)
        else:
            print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
            sys.exit()

        #make the rank plot
        print('MAKING RANK PLOTS')
        if utils.checkOutput(rankOutput):
            rcmd = callRankRScript(rankOutput,name1,name2,superFile1,superFile2)
            print(rcmd)
            os.system(rcmd)
        else:
            print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
            sys.exit()

        time.sleep(30)

        print('FINISHING OUTPUT')
        finishRankOutput(dataFile,rankOutput,genome,parentFolder,mergeName,name1,name2,1,100000,superOnly,plotBam)
    else:
        parser.print_help()
        sys.exit()
def main():
    '''
    Main run function for the dynamic enhancer comparison pipeline
    (fixed-filename variant; ROSE merge calls are commented out here).

    Compares ROSE output between two datasets: derives the expected merged
    region / enhancer-to-gene file names, runs the delta R script and gene
    mapper, assigns differential ranks, and produces rank plots and final
    output via os.system side effects.
    '''
    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] -r [ROSE_FOLDERS] -o [OUTPUT_FOLDER]"
    parser = OptionParser(usage = usage)

    #required flags
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
                      help = "Enter the genome build (HG18,HG19,MM9) for the project")
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter the data file for the project")
    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a comma separated list of rose folder")
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the output folder for the project")

    #additional options
    parser.add_option("-n","--names", dest="names",nargs = 1, default=None,
                      help = "Enter a comma separated list of names to go with the datasets")
    parser.add_option("-p","--plot", dest="plot",action = 'store_true', default=False,
                      help = "If flagged, will plot differential regions")
    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "If flagged, will run analysis for all enhancers and not just supers.")

    (options,args) = parser.parse_args()

    print(options)
    print(args)

    if options.genome and options.data and options.rose and options.output:
        genome = string.upper(options.genome)
        dataFile = options.data

        #expects exactly two comma-separated rose folders
        roseFolderString = options.rose
        [roseFolder1,roseFolder2] = roseFolderString.split(',')
        parentFolder = utils.formatFolder(options.output,True)

        if options.names:
            nameString = options.names
            [name1,name2] =nameString.split(',')
        else:
            #derive names from the rose folder basenames
            #NOTE(review): a trailing '/' on the rose folder makes
            #split('/')[-1] an empty string -- verify caller input
            name1 = roseFolder1.split('/')[-1]
            name1 = string.replace(name1,'_ROSE','')
            name2 = roseFolder2.split('/')[-1]
            name2 = string.replace(name2,'_ROSE','')

        mergeName = "%s_%s_merged" % (name1,name2)

        plotBam = options.plot
        if options.all:
            superOnly = False
        else:
            superOnly = True

        if superOnly and plotBam:
            print "Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (name1,name2,parentFolder)
        if superOnly and not plotBam:
            print "Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (name1,name2,parentFolder)
        if not superOnly and plotBam:
            print "Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (name1,name2,parentFolder)
        if not superOnly and not plotBam:
            print "Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (name1,name2,parentFolder)

        #part 1
        print "PART1: analyzing ROSE output from %s and %s" % (name1,name2)

        #start with the all enhancer tables from the initial rose calls
        roseFolder1 = pipeline_dfci.formatFolder(roseFolder1,False)
        roseFolder2 = pipeline_dfci.formatFolder(roseFolder2,False)

        superFile1 = '%s%s_peaks_SuperEnhancers.table.txt' % (roseFolder1,name1)
        superFile2 = '%s%s_peaks_SuperEnhancers.table.txt' % (roseFolder2,name2)
        #NOTE(review): '%s/%s' here yields a double slash when the rose folder
        #already ends in '/' (harmless on POSIX, but inconsistent with the
        #superFile paths above)
        allFile1 = '%s/%s_peaks_AllEnhancers.table.txt' % (roseFolder1,name1)
        allFile2 = '%s/%s_peaks_AllEnhancers.table.txt' % (roseFolder2,name2)

        print('\tMERGING ENHANCERS AND CALLING ROSE')
        if superOnly:
            mergedGFFFile = '%s%s_%s_MERGED_SUPERS_-0_+0.gff' % (parentFolder,string.upper(genome),mergeName)
            #callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergedGFFFile,parentFolder)
        else:
            mergedGFFFile = '%s%s_%s_MERGED_ENHANCERS_-0_+0.gff' % (parentFolder,string.upper(genome),mergeName)
            #callMergeSupers(dataFile,allFile1,allFile2,name1,name2,mergedGFFFile,parentFolder)

        #expected enhancer-to-gene output path of the (external) ROSE call
        if superOnly:
            superOutput = "%s%s_ROSE/%s_%s_MERGED_SUPERS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt" % (parentFolder,name1,string.upper(genome),mergeName)
        else:
            superOutput = "%s%s_ROSE/%s_%s_MERGED_ENHANCERS_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt" % (parentFolder,name1,string.upper(genome),mergeName)

        print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')
        if utils.checkOutput(superOutput):
            #part2 is the R script
            rcmd = callDeltaRScript(mergedGFFFile,parentFolder,name1,name2)
            print(rcmd)
            os.system(rcmd)
            #fixed delay to give the R script time to finish writing
            time.sleep(30)
            callRoseGeneMapper(mergedGFFFile,genome,parentFolder,name1)
        else:
            print('ERROR: ROSE CALL FAILED')
            sys.exit()

        #rank the genes

        #part 3
        #rank the delta
        print "PART 3: assinging ranks to differential enhancers"
        print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')

        if superOnly:
            gffName = '%s_%s_MERGED_SUPERS_-0_+0' % (string.upper(genome),mergeName)
        else:
            gffName = '%s_%s_MERGED_ENHANCERS_-0_+0' % (string.upper(genome),mergeName)

        enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB.txt" % (parentFolder,name1,gffName)
        if utils.checkOutput(enhancerToGeneFile):
            rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB_RANK.txt" % (parentFolder,name1,gffName)
            assignEnhancerRank(enhancerToGeneFile,allFile1,allFile2,name1,name2,rankOutput)
        else:
            print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
            sys.exit()

        #make the rank plot
        print('MAKING RANK PLOTS')
        if utils.checkOutput(rankOutput):
            rcmd = callRankRScript(rankOutput,name1,name2,superFile1,superFile2)
            print(rcmd)
            os.system(rcmd)
        else:
            print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
            sys.exit()

        time.sleep(30)

        print('FINISHING OUTPUT')
        finishRankOutput(dataFile,rankOutput,genome,parentFolder,mergeName,name1,name2,1,100000,superOnly,plotBam)
    else:
        parser.print_help()
        exit()
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq

    rankByBamFile   - bam used to rank enhancers; TSS signal is mapped from it
    controlBamFile  - optional background bam ('' disables control mapping)
    genome          - genome build string, used only in output file naming
    annotFile       - refseq annotation table for start dict / transcript loci
    enhancerFile    - ROSE enhancer table (last two columns are assumed to be
                      enhancer rank and super status -- see int(line[-2/-1]))
    transcribedFile - optional table restricting the gene universe (refseq IDs
                      taken from column 1)
    uniqueGenes     - if True, keep only the top-ranked entry per gene name
    searchWindow    - distance (bp) around an enhancer for "proximal" TSSs
    noFormatTable   - if True, append columns to the raw enhancer table
                      instead of the 9-column formatted layout

    Returns (enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable);
    the first two are rank-sorted unless noFormatTable is set.
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    # internal parameter for debugging
    # NOTE(review): byRefseq is never read in this function
    byRefseq = False

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(utils.makeTSSLocus(geneID, startDict, 0, 0))

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really
    # matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    geneDict = {'overlapping': defaultdict(
        list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # find the damn header
    for line in enhancerTable:
        if line[0][0] == '#':
            continue
        else:
            header = line
            break

    if noFormatTable:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]
    else:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]

        # next by gene
        # NOTE(review): this 3-column header is dead -- it is overwritten by
        # the 6-column header immediately below
        geneToEnhancerTable = [
            ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']]

    # next make the gene to enhancer table
    geneToEnhancerTable = [
        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        # skip comment lines and the header row (header starts with 'R')
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])

        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(
            enhancerLocus, 'both')
        overlappingGenes = []
        for overlapLocus in overlappingLoci:
            overlappingGenes.append(overlapLocus.ID())

        # proximalGenes are transcribed genes where the tss is within 50kb of
        # the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = []
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())

        # distal genes: TSS within 1Mb; used only for closest-gene calling
        distalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = []
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes
        # these checks make sure each gene list is unique.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the 50kb window, but we'll let that slide
        # here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[
                allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        if noFormatTable:

            newEnhancerLine = list(line)
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:9]
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)
        # Now grab all overlapping and proximal genes for the gene ordered
        # table

        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

    # End loop through
    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    # +/- 5kb around each associated gene TSS for signal mapping
    enhancerGeneCollection = utils.makeTranscriptCollection(
        annotFile, 5000, 5000, 500, overallGeneList)

    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName,gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamToGFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    bamName = rankByBamFile.split('/')[-1]
    mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName,gffRootName, bamName)
    mappedRankByFile = "%s%s_%s_%s/matrix.gff" % (enhancerFolder,enhancerName, gffRootName, bamName)
    cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedRankByFolder,rankByBamFile)
    print("Mapping rankby bam %s" % (rankByBamFile))
    print(cmd)
    outputRank = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    outputRank = outputRank.communicate()
    if len(outputRank[0]) > 0:  # test if mapping worked correctly
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
        sys.exit()

    # next on the control bam if it exists
    if len(controlBamFile) > 0:
        controlName = controlBamFile.split('/')[-1]
        mappedControlFolder = "%s%s_%s_%s/" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        mappedControlFile = "%s%s_%s_%s/matrix.gff" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedControlFolder,controlBamFile)
        print("Mapping control bam %s" % (controlBamFile))
        print(cmd)
        outputControl = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
        outputControl = outputControl.communicate()
        if len(outputControl[0]) > 0:  # test if mapping worked correctly
            print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
        else:
            print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
            sys.exit()

    # now get the appropriate output files
    if len(controlBamFile) > 0:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" % (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()

    # use enhancer rank to order
    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = []

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if usedNames.count(geneName) > 0 and uniqueGenes == True:
            continue
        else:
            usedNames.append(geneName)

        proxEnhancers = geneDict['overlapping'][
            refID] + geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = join([str(x) for x in rankDict[refID]], ',')

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        newLine = [geneName, refID, join(
            proxEnhancers, ','), enhancerRanks, superStatus, enhancerSignal]
        geneToEnhancerTable.append(newLine)
    #utils.unParseTable(geneToEnhancerTable,'/grail/projects/newRose/geneMapper/foo.txt','\t')
    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [
            'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:
        geneList = []
        if noFormatTable:
            # last three columns are OVERLAP_GENES, PROXIMAL_GENES, CLOSEST_GENE
            geneList += line[-3].split(',')
            geneList += line[-2].split(',')
        else:
            geneList += line[10].split(',')
            geneList += line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])
        if len(geneList) > 0:
            # pick the candidate gene with the strongest TSS signal
            try:
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                maxGene = geneList[maxIndex]
                maxSig = sigVector[maxIndex]
                if maxSig == 0.0:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
            except ValueError:
                # max() over an empty signal list; fall back sensibly
                if len(geneList) == 1:
                    maxGene = geneList[0]
                    maxSig = 'NONE'
                else:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
        else:
            maxGene = 'NONE'
            maxSig = 'NONE'

        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable
    else:
        # sort the enhancer tables by enhancer rank (second-to-last column)
        enhancerOrder = utils.order([int(line[-2])
                                    for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        sortedTopGeneTable = [enhancerToTopGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i + 1)])
            sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

        return sortedTable, sortedTopGeneTable, geneToEnhancerTable
def main():
    '''
    main run method for enhancer promoter contribution tool

    Parses command line arguments, maps the supplied bams to TSS-proximal and
    TSS-distal portions of the input regions, builds signal/peak/gene tables,
    then calls downstream R plotting and GSEA wrappers.  Relies on module-level
    helpers (loadAnnotFile, splitRegions, mapBams, makeAverageTable,
    makePeakTable, makeGeneTable, callRWaterfall, callGSEA, detectGSEAOutput,
    callR_GSEA) defined elsewhere in this file.
    '''
    parser = argparse.ArgumentParser(usage='%(prog)s [options]')

    # required flags
    parser.add_argument("-b", "--bam", dest="bam", nargs='*',
                        help="Enter a space separated list of .bam files for the main factor", required=True)
    parser.add_argument("-i", "--input", dest="input", type=str,
                        help="Enter .gff or .bed file of regions to analyze", required=True)
    parser.add_argument("-g", "--genome", dest="genome", type=str,
                        help="specify a genome, HG18,HG19,HG38,MM8,MM9,MM10,RN6 are currently supported", required=True)

    # output flag
    parser.add_argument("-o", "--output", dest="output", type=str,
                        help="Enter the output folder.", required=True)

    # additional options flags and optional arguments
    parser.add_argument("-a", "--activity", dest="activity", type=str,
                        help="specify a table where first column represents a list of active refseq genes", required=False)
    parser.add_argument("-c", "--control", dest="control", nargs='*',
                        help="Enter a space separated list of .bam files for background. If flagged, will perform background subtraction", required=False)
    parser.add_argument("-w", "--window", dest="window", type=int,
                        help="Enter a window to define the TSS area +/- the TSS. Default is 1kb", required=False, default=1000)
    parser.add_argument("--other-bams", dest="other", nargs='*',
                        help="enter a space separated list of other bams to map to", required=False)
    parser.add_argument("--name", dest="name", type=str,
                        help="enter a root name for the analysis, otherwise will try to find the name from the input file", required=False)
    parser.add_argument("--top", dest="top", type=int,
                        help="Run the analysis on the top N genes by total signal. Default is 5000", required=False, default=5000)
    parser.add_argument("--tads", dest="tads", type=str,
                        help="Include a .bed of tad regions to restrict enhancer/gene association", required=False, default=None)

    args = parser.parse_args()
    print(args)

    #minimum arguments needed to proceed
    # NOTE(review): all four flags are declared required=True, so argparse
    # already guarantees this condition; the guard is redundant but harmless.
    if args.bam and args.input and args.genome and args.output:

        #=====================================================================================
        #===============================I. PARSING ARGUMENTS==================================
        #=====================================================================================

        print('\n\n#======================================\n#===========I. DATA SUMMARY============\n#======================================\n')

        #top analysis subset
        top = args.top

        #input genome
        genome = args.genome.upper()
        print('PERFORMING ANALYSIS ON %s GENOME BUILD' % (genome))

        #set of bams
        bamFileList = args.bam

        #bring in the input path
        inputPath = args.input

        #try to get the input name or use the name argument
        if args.name:
            analysisName = args.name
        else:
            analysisName = inputPath.split('/')[-1].split('.')[0]
        print('USING %s AS ANALYSIS NAME' % (analysisName))

        #setting up the output folder
        parentFolder = utils.formatFolder(args.output, True)
        outputFolder = utils.formatFolder('%s%s' % (parentFolder, analysisName), True)
        print('WRITING OUTPUT TO %s' % (outputFolder))

        # accept either a .bed (converted on the fly) or a pre-parsed .gff table
        if inputPath.split('.')[-1] == 'bed':
            #type is bed
            print('input in bed format, converting to gff')
            inputGFF = utils.bedToGFF(inputPath)
        else:
            inputGFF = utils.parseTable(inputPath, '\t')

        #the tss window
        window = int(args.window)

        #activity path
        if args.activity:
            activityPath = args.activity
            activityTable = utils.parseTable(activityPath, '\t')

            #try to find the column for refseq id
            # NOTE(review): if no column contains 'NM_'/'NR_', ref_col is never
            # assigned and the list comprehension below raises NameError — TODO
            # confirm upstream always supplies a refseq column.
            for i in range(len(activityTable[0])):
                if str(activityTable[0][i]).count('NM_') > 0 or str(activityTable[0][i]).count('NR_') > 0:
                    ref_col = i
            geneList = [line[ref_col] for line in activityTable]  # this needs to be REFSEQ NM ID
            print('IDENTIFIED %s ACTIVE GENES' % (len(geneList)))
        else:
            geneList = []

        #check if tads are being invoked
        # NOTE(review): use_tads is set but not read again in this function;
        # only tads_path is forwarded (to makePeakTable).
        if args.tads:
            print('LOADING TAD LOCATIONS FROM %s' % (args.tads))
            use_tads = True
            tads_path = args.tads
        else:
            use_tads = False
            tads_path = ''

        print('LOADING ANNOTATION DATA FOR GENOME %s' % (genome))

        #important here to define the window
        startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict = loadAnnotFile(genome, window, geneList, True)
        #print(tssCollection.getOverlap(utils.Locus('chr5',171387630,171388066,'.')))
        #sys.exit()

        print('FILTERING THE INPUT GFF FOR GOOD CHROMOSOMES')
        print(chrom_list)
        # keep only regions on chromosomes present in the annotation
        filtered_gff = [line for line in inputGFF if chrom_list.count(line[0]) > 0]
        print('%s of INITIAL %s REGIONS ARE IN GOOD CHROMOSOMES' % (len(filtered_gff), len(inputGFF)))

        #=====================================================================================
        #================II. IDENTIFYING TSS PROXIMAL AND DISTAL ELEMENTS=====================
        #=====================================================================================

        print('\n\n#======================================\n#==II. MAPPING TO TSS/DISTAL REGIONS===\n#======================================\n')

        #now we need to split the input region
        print('SPLITTING THE INPUT GFF USING A WINDOW OF %s' % (window))
        splitGFF = splitRegions(filtered_gff, tssCollection)
        print(len(filtered_gff))
        print(len(splitGFF))

        splitGFFPath = '%s%s_SPLIT.gff' % (outputFolder, analysisName)
        utils.unParseTable(splitGFF, splitGFFPath, '\t')
        print('WRITING TSS SPLIT GFF OUT TO %s' % (splitGFFPath))

        #now you have to map the bams to the gff
        print('MAPPING TO THE SPLIT GFF')
        mappedFolder = utils.formatFolder('%sbam_mapping' % (outputFolder), True)
        signalTable = mapBams(bamFileList, splitGFFPath, analysisName, mappedFolder)
        signalTablePath = '%s%s_signal_table.txt' % (outputFolder, analysisName)
        utils.unParseTable(signalTable, signalTablePath, '\t')

        # optional background bams are mapped the same way, to a sibling table
        if args.control:
            controlBamFileList = args.control
            controlSignalTable = mapBams(controlBamFileList, splitGFFPath, analysisName, mappedFolder)
            controlSignalTablePath = '%s%s_control_signal_table.txt' % (outputFolder, analysisName)
            utils.unParseTable(controlSignalTable, controlSignalTablePath, '\t')

        #now create the background subtracted summarized average table
        print('CREATING AN AVERAGE SIGNAL TABLE')
        averageTable = makeAverageTable(outputFolder, analysisName, useBackground=args.control)
        averageTablePath = '%s%s_average_table.txt' % (outputFolder, analysisName)
        utils.unParseTable(averageTable, averageTablePath, '\t')

        #now load up all of the cpg and other parameters to make the actual peak table

        #first check if this has already been done
        peakTablePath = '%s%s_PEAK_TABLE.txt' % (outputFolder, analysisName)
        if utils.checkOutput(peakTablePath, 0.1, 0.1):
            print('PEAK TABLE OUTPUT ALREADY EXISTS')
            peakTable = utils.parseTable(peakTablePath, '\t')
        else:
            # NOTE(review): paramDict is not defined in this function —
            # presumably a module-level dict; verify it exists at import time.
            peakTable = makePeakTable(paramDict, splitGFFPath, averageTablePath, startDict, geneList, genomeDirectory, tads_path)
            utils.unParseTable(peakTable, peakTablePath, '\t')

        geneTable = makeGeneTable(peakTable, analysisName)
        geneTablePath = '%s%s_GENE_TABLE.txt' % (outputFolder, analysisName)
        utils.unParseTable(geneTable, geneTablePath, '\t')

        #if mouse, need to convert genes over
        if genome.count('MM') == 1:
            print('CONVERTING MOUSE NAMES TO HUMAN HOMOLOGS FOR GSEA')
            converted_geneTablePath = '%s%s_GENE_TABLE_CONVERTED.txt' % (outputFolder, analysisName)

            converted_geneTable = [geneTable[0]]
            for line in geneTable[1:]:
                # rows with no human homolog are dropped
                converted_name = mouse_convert_dict[line[0]]
                if len(converted_name) > 0:
                    converted_geneTable.append([converted_name] + line[1:])

            utils.unParseTable(converted_geneTable, converted_geneTablePath, '\t')

            # downstream plotting/GSEA consume the converted table
            geneTablePath = converted_geneTablePath
            geneTable = converted_geneTable

        #=====================================================================================
        #===================================III. PLOTTING ====================================
        #=====================================================================================

        print('\n\n#======================================\n#===III. PLOTTING ENHANCER/PROMOTER===\n#======================================\n')

        #if there are fewer genes in the gene table than the top genes, only run on all
        if len(geneTable) < int(top):
            print('WARNING: ONLY %s GENES WITH SIGNAL AT EITHER PROMOTERS OR ENHANCERS. NOT ENOUGH TO RUN ANALYSIS ON TOP %s' % (len(geneTable) - 1, top))
            top = 0
            use_top = False
        else:
            use_top = True

        #now call the R code
        print('CALLING R PLOTTING SCRIPTS')
        callRWaterfall(geneTablePath, outputFolder, analysisName, top)

        #=====================================================================================
        #==================================IV. RUNNING GSEA===================================
        #=====================================================================================

        print('\n\n#======================================\n#============IV. RUNNING GSEA=========\n#======================================\n')

        #now let's call gsea
        print('RUNNING GSEA ON C2')
        callGSEA(outputFolder, analysisName, top, 'enhancer_vs_promoter', use_top)
        callGSEA(outputFolder, analysisName, top, 'total_contribution', use_top)

        if use_top:
            print('DETECTING GSEA OUTPUT FOR TOP %s GENES' % (top))

            #for top by enhancer v promoter metric
            top_promoterTablePath, top_distalTablePath = detectGSEAOutput(analysisName, outputFolder, top, 'enhancer_vs_promoter')
            top_signalTablePath, top_backgroundTablePath = detectGSEAOutput(analysisName, outputFolder, top, 'total_contribution')

            print('MAKING NES PLOTS FOR TOP %s GENES' % (top))
            callR_GSEA(top_promoterTablePath, top_distalTablePath, outputFolder, analysisName + '_enhancer_vs_promoter', top)
            callR_GSEA(top_signalTablePath, top_backgroundTablePath, outputFolder, analysisName + '_total_contribution', top)

        print('DETECTING GSEA OUTPUT FOR ALL GENES')

        #for top
        # NOTE(review): called with 3 args here vs 4 above — confirm
        # detectGSEAOutput has a default for its final parameter.
        all_promoterTablePath, all_distalTablePath = detectGSEAOutput(analysisName, outputFolder, 'all')

        print('MAKING NES PLOTS FOR ALL GENES')
        callR_GSEA(all_promoterTablePath, all_distalTablePath, outputFolder, analysisName, 'all')

        #these files can be parsed to make the NES plot
        #[x for x in fileList if x.count('report_for') == 1and x.count('xls') ==1]

        print('ALL DONE WITH ANALYSIS FOR %s' % (analysisName))
def main():
    '''
    main run function

    Command-line driver: parses bam/gff/genome/output options, resolves colors,
    names and plot style, builds the plot summary table via makeBamPlotTables,
    writes and runs an R command through a bash script, and optionally removes
    the temp folder on success.
    '''
    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -b [SORTED BAMFILE(S)] -i [INPUTFILE] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage=usage)

    # required flags
    parser.add_option("-b", "--bam", dest="bam", nargs=1, default=None,
                      help="Enter a comma separated list of .bam files to be processed.")
    parser.add_option("-i", "--input", dest="input", nargs=1, default=None,
                      help="Enter .gff or genomic region e.g. chr1:+:1-1000.")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="specify a genome, HG18,HG19,MM8,MM9 are currently supported")

    # output flag
    parser.add_option("-o", "--output", dest="output", nargs=1, default=None,
                      help="Enter the output folder.")

    # additional options
    parser.add_option("-c", "--color", dest="color", nargs=1, default=None,
                      help="Enter a colon separated list of colors e.g. 255,0,0:255,125,0, default samples the rainbow")
    parser.add_option("-s", "--sense", dest="sense", nargs=1, default='both',
                      help="Map to '+','-' or 'both' strands. Default maps to both.")
    parser.add_option("-e", "--extension", dest="extension", nargs=1, default=200,
                      help="Extends reads by n bp. Default value is 200bp")
    parser.add_option("-r", "--rpm", dest="rpm", action='store_true', default=False,
                      help="Normalizes density to reads per million (rpm) Default is False")
    parser.add_option("-y", "--yScale", dest="yScale", nargs=1, default="relative",
                      help="Choose either relative or uniform y axis scaling. options = 'relative,uniform' Default is relative scaling")
    parser.add_option("-n", "--names", dest="names", nargs=1, default=None,
                      help="Enter a comma separated list of names for your bams")
    parser.add_option("-p", "--plot", dest="plot", nargs=1, default="MULTIPLE",
                      help="Choose either all lines on a single plot or multiple plots. options = 'SINGLE,MULTIPLE'")
    parser.add_option("-t", "--title", dest="title", nargs=1, default='',
                      help="Specify a title for the output plot(s), default will be the coordinate region")

    # DEBUG OPTION TO SAVE TEMP FILES
    parser.add_option("--save-temp", dest="save", action='store_true', default=False,
                      help="If flagged will save temporary files made by bamPlot")
    parser.add_option("--bed", dest="bed", nargs=1, default=None,
                      help="Add a comma separated list of bam files to plot")

    (options, args) = parser.parse_args()

    print(options)
    print(args)

    if options.bam and options.input and options.genome and options.output:

        # bring in the bams
        bamFileList = options.bam.split(',')

        # bringing in any beds
        if options.bed:
            bedFileList = options.bed.split(',')
            bedCollection = makeBedCollection(bedFileList)
        else:
            # empty collection so downstream code can always query beds
            bedCollection = utils.LocusCollection([], 50)

        # bring in the gff
        try:
            gff = utils.parseTable(options.input, '\t')
            gffName = options.input.split('/')[-1].split('.')[0]
        except IOError:
            # means a coordinate line has been given e.g. chr1:+:1-100
            chromLine = options.input.split(':')
            chrom = chromLine[0]
            sense = chromLine[1]
            [start, end] = chromLine[2].split('-')
            if chrom[0:3] != 'chr':
                print('ERROR: UNRECOGNIZED GFF OR CHROMOSOME LINE INPUT')
                exit()
            # synthesize a one-line gff from the coordinate string
            gffLine = [chrom, '', options.input, start, end, '', sense, '', '']
            gffName = "%s_%s_%s_%s" % (chrom, sense, start, end)
            gff = [gffLine]

        # bring in the genome
        genome = options.genome.upper()
        if ['HG18', 'HG19', 'MM9', 'RN5'].count(genome) == 0:
            print('ERROR: UNSUPPORTED GENOME TYPE %s. USE HG19,HG18, RN5, OR MM9' % (genome))
            parser.print_help()
            exit()

        # bring in the rest of the options

        # output
        rootFolder = options.output
        if rootFolder[-1] != '/':
            rootFolder += '/'
        try:
            # probe that the output directory exists (listdir raises otherwise)
            os.listdir(rootFolder)
        except OSError:
            print('ERROR: UNABLE TO FIND OUTPUT DIRECTORY %s' % (rootFolder))
            exit()

        # Get analysis title
        if len(options.title) == 0:
            title = gffName
        else:
            title = options.title

        # make a temp folder
        tempFolder = rootFolder + title + '/'
        print("CREATING TEMP FOLDER %s" % (tempFolder))
        pipeline_dfci.formatFolder(tempFolder, create=True)

        # colors
        if options.color:
            colorList = options.color.split(':')
            colorList = [x.split(',') for x in colorList]
            if len(colorList) < len(bamFileList):
                print('WARNING: FEWER COLORS THAN BAMS SPECIFIED. COLORS WILL BE RECYCLED')
                # recycling the color list
                # NOTE(review): relies on Python 2 integer division; under
                # Python 3 this would be a float and raise TypeError.
                colorList += colorList * (len(bamFileList) / len(colorList))
                colorList = colorList[0:len(bamFileList)]
        else:
            # cycles through the colors of the rainbow
            colorList = tasteTheRainbow(len(bamFileList))

        # sense
        sense = options.sense

        extension = int(options.extension)

        rpm = options.rpm

        yScale = options.yScale.upper()

        # names
        if options.names:
            names = options.names.split(',')
            if len(names) != len(bamFileList):
                print('ERROR: NUMBER OF NAMES AND NUMBER OF BAMS DO NOT CORRESPOND')
                parser.print_help()
                exit()
        else:
            names = [x.split('/')[-1] for x in bamFileList]

        # plot style
        plotStyle = options.plot.upper()
        if ['SINGLE', 'MULTIPLE'].count(plotStyle) == 0:
            print('ERROR: PLOT STYLE %s NOT AN OPTION' % (plotStyle))
            parser.print_help()
            exit()

        # now run!
        # NOTE(review): nBins is not defined in this function — presumably a
        # module-level constant; confirm it exists before shipping.
        summaryTableFileName = makeBamPlotTables(gff, genome, bamFileList, colorList, nBins, sense, extension, rpm, tempFolder, names, title, bedCollection)
        print ("%s is the summary table" % (summaryTableFileName))

        outFile = "%s%s_plots.pdf" % (rootFolder, title)
        rCmd = callRPlot(summaryTableFileName, outFile, yScale, plotStyle)

        # open a bash file to get shit done
        bashFileName = "%s%s_Rcmd.sh" % (tempFolder, title)
        bashFile = open(bashFileName, 'w')
        bashFile.write('#!/usr/bin/bash\n')
        bashFile.write(rCmd)
        bashFile.close()
        print("Wrote R command to %s" % (bashFileName))
        os.system("bash %s" % (bashFileName))

        # delete temp files unless --save-temp was flagged
        if not options.save:
            if utils.checkOutput(outFile, 1, 10):
                removeCommand = "rm -rf %s" % (tempFolder)
                print(removeCommand)
                os.system(removeCommand)
            else:
                print("ERROR: NO OUTPUT FILE %s DETECTED" % (outFile))

    else:
        parser.print_help()
        sys.exit()
def main():
    '''
    main run function

    Compares a test vs control GeCKO screen bam: computes per-million read
    ratios (MMR), runs samtools idxstats on both bams, builds a fold-change
    table and a RIGER input table, then launches RIGER with the chosen
    scoring method.

    Fixes vs previous revision:
      - exits (instead of continuing on missing data) when idxstats output
        cannot be generated for either bam; the "Found ... IdxStats file"
        message was previously printed even on failure
      - unsupported -g genome now produces a clear error instead of a bare
        KeyError from the genome dictionary
    '''
    from optparse import OptionParser

    usage = "usage: %prog [options] -t [TEST_BAM] -c [CONTROL_BAM] -g [GENOME]"
    parser = OptionParser(usage = usage)

    #required flags
    parser.add_option("-t","--test", dest="test",nargs = 1, default=None,
                      help = "Enter the full path of the test bam")
    parser.add_option("-c","--control", dest="control",nargs = 1, default=None,
                      help = "Enter the full path of the control bam")
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
                      help = "Enter the build for the GeCKO library (currently only supports geckov2)")

    #optional arguments
    parser.add_option("-n","--name",dest="name",nargs =1, default = 0,
                      help = "Comma separated test,control name")
    parser.add_option("-s","--scoring",dest="scoring",nargs =1, default = 'WtSum',
                      help = "Scoring method (KSbyScore,WtSum,SecondBestRank) defulat: WtSum")
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the full path of the output folder. Default is the current working directory")

    (options,args) = parser.parse_args()

    #three required parameters to get started
    if options.test and options.control and options.genome:

        #get the names of the datasets
        if options.name:
            if len(options.name.split(',')) == 2:
                [testName,controlName] = options.name.split(',')
            else:
                print("ERROR: Must provide a comma separated test,control name if using -n flag")
                parser.print_help()
                sys.exit()
        else:
            #try to extract names from file
            #strip extension from filename
            testName = options.test.split('/')[-1].split('.')[0]
            controlName = options.control.split('/')[-1].split('.')[0]

        #names
        print("using %s as name for test dataset" % (testName))
        print("using %s as name for control dataset" % (controlName))

        #get the analysis name
        analysisName = '%s_%s' % (testName,controlName)
        print("using %s as analysis name" % (analysisName))

        #get the scoring method
        scoringMethod = options.scoring
        if ['KSbyScore','WtSum','SecondBestRank'].count(scoringMethod)==0:
            print("ERROR: please specify one of the following scoring methods:('KSbyScore','WtSum','SecondBestRank') or leave blank (default WtSum)")
            parser.print_help()
            sys.exit()

        #set up output folder
        if options.output:
            outputFolder = utils.formatFolder(options.output,True)
        else:
            outputFolder = utils.formatFolder('./%s/' % (analysisName),True)
        print("using %s as an output folder" % (outputFolder))

        #get the right annotation
        genomeDict = {'geckov2':'/grail/genomes/gecko/GeCKOv2/Annotation/Human_GeCKOv2_Library.txt',
                      }

        #load the annotation dictionary
        # fail fast with a readable message on an unsupported build instead of
        # letting the dict lookup raise a bare KeyError
        genomeKey = string.lower(options.genome)
        if genomeKey not in genomeDict:
            print("ERROR: UNSUPPORTED GENOME %s. Currently only geckov2 is supported" % (options.genome))
            parser.print_help()
            sys.exit()
        annotFile = genomeDict[genomeKey]
        print("using %s as the annotation file" % (annotFile))

        #guideDict,geneDict = makeAnnotDict(annotFile)

        #now set up each bam
        testBam = utils.Bam(options.test)
        controlBam = utils.Bam(options.control)

        #get the MMR for each (million mapped reads denominator)
        testMMR = round(float(testBam.getTotalReads())/1000000,4)
        controlMMR = round(float(controlBam.getTotalReads())/1000000,4)
        print("Test dataset: %s has an MMR of %s" % (testName,testMMR))
        print("Control dataset: %s has an MMR of %s" % (controlName,controlMMR))

        #now get the idxstats output
        testIdxFile = '%s%s_idxstats.txt' % (outputFolder,testName)
        testIdxCmd = '%s idxstats %s > %s' % (samtoolsString,options.test,testIdxFile)
        print("Test idxstats command:")
        print(testIdxCmd)
        os.system(testIdxCmd)

        controlIdxFile = '%s%s_idxstats.txt' % (outputFolder,controlName)
        controlIdxCmd = '%s idxstats %s > %s' % (samtoolsString,options.control,controlIdxFile)
        print("Control idxstats command:")
        print(controlIdxCmd)
        os.system(controlIdxCmd)

        print("Checking for output")
        # bail out on failure; previously the code printed the error and then
        # continued (and claimed the file was found)
        if not utils.checkOutput(testIdxFile,0.1,5):
            print("ERROR: UNABLE TO GENERATE IDX OUTPUT FOR %s" % (options.test))
            sys.exit()
        print("Found test IdxStats file")

        if not utils.checkOutput(controlIdxFile,0.1,5):
            print("ERROR: UNABLE TO GENERATE IDX OUTPUT FOR %s" % (options.control))
            sys.exit()
        print("Found control IdxStats file")

        #now make the fold table
        foldTableFile =makeFoldTable(annotFile,analysisName,testName,controlName,testMMR,controlMMR,testIdxFile,controlIdxFile,outputFolder,epsilon = 1)

        print('writing output to %s' % (foldTableFile))

        print("MAING FRIGER TABLE")
        rigerTableFile = makeRigerTable(foldTableFile,output='')
        print('writing FRIGER table to %s' % (rigerTableFile))

        rigerBashFileName = callRiger(rigerTableFile,scoring=scoringMethod,output='',callRiger=True)

    else:
        parser.print_help()
        sys.exit()
def main():
    '''
    Command-line driver for group-vs-group cis-regulatory comparison.

    Parses a data table plus two comma separated dataset groups, runs
    META_ROSE per group, DYNAMIC_ROSE across groups, and finally CRC
    circuitry analysis for the combined set and each group individually.

    Fix vs previous revision: when -a/--activity is omitted, activity_path
    and activity_table are now initialized to empty defaults; previously they
    were left undefined and the run crashed with NameError at the data
    summary printout and in the launchDynamicRose/launchCRC calls.
    '''
    parser = argparse.ArgumentParser(usage='%(prog)s -i DATAFILE -1 GROUP1_NAMES -2 GROUP2_NAMES')

    # required flags
    parser.add_argument("-d", "--data_table", dest="data_table", type=str,
                        help="input a data table with all datasets to be analyzed", required=True)
    parser.add_argument("-1", "--group1", dest="group1", type=str,
                        help="input a comma separated list of all datasets in group1", required=True)
    parser.add_argument("-2", "--group2", dest="group2", type=str,
                        help="input a comma separated list of all datasets in group2", required=True)

    #optional input override
    parser.add_argument("-i", "--input", dest="input", type=str,
                        help="input a gff of regions to analyze", required=False)

    #optional arguments
    parser.add_argument("-n", "--name", dest="name", type=str,
                        help="specify a name for the analysis. Default is drawn from the data table name", required=False)
    parser.add_argument("--group1-name", dest="group1_name", default='GROUP1', type=str,
                        help="Enter a name for group1. Default is 'GROUP1'", required=False)
    parser.add_argument("--group2-name", dest="group2_name", default='GROUP2', type=str,
                        help="Enter a name for group2. Default is 'GROUP2'", required=False)
    parser.add_argument("-a", "--activity", dest="activity", type=str, default='',
                        help="a table with active gene names in the first column", required=False)
    parser.add_argument("-t", "--tss", dest="tss", type=int, default=2500,
                        help="Specify a TSS exclusion distance. Default is 2500", required=False)
    parser.add_argument("-s", "--stitch", dest="stitch", type=int, default=None,
                        help="Specify a stitching distance. Default is auto stitching", required=False)
    parser.add_argument("-o", "--output", dest="output", default='./', type=str,
                        help="Enter the output folder. Default is the current working directory", required=False)
    parser.add_argument("--log", dest="log", default='', type=str,
                        help="Enter a path to log output", required=False)

    args = parser.parse_args()

    #=====================================================================================
    #===============================I. PARSING ARGUMENTS==================================
    #=====================================================================================

    #pulling in the data table
    data_file = os.path.abspath(args.data_table)
    dataDict = pipeline_dfci.loadDataTable(data_file)

    #setting naming conventions: default analysis name comes from the table filename
    if not args.name:
        analysis_name = data_file.split('/')[-1].split('.')[0]
    else:
        analysis_name = args.name

    #getting the optional input gff
    if args.input:
        inputGFF = args.input
    else:
        inputGFF = ''

    #getting group names
    group1_name = args.group1_name
    group2_name = args.group2_name

    #getting group1 (drop empty tokens from e.g. trailing commas)
    group1_string = args.group1
    group1_list = [name for name in string.split(group1_string, ',') if len(name) > 0]

    #getting group2
    group2_string = args.group2
    group2_list = [name for name in string.split(group2_string, ',') if len(name) > 0]

    #checking that all datasets are in the data table
    for name in group1_list + group2_list:
        if name not in dataDict:
            print('ERROR: DATASET %s NOT FOUND IN DATA TABLE %s. EXITING NOW' % (name, data_file))
            sys.exit()

    #loading in the genome object from the data table
    #all datasets must share a single genome build
    genome_list = utils.uniquify([dataDict[name]['genome'] for name in group1_list + group2_list])
    if len(genome_list) > 1:
        print('ERROR: ATTEMPTING TO ANALYZE DATASETS FROM MULTIPLE GENOME BUILDS. EXITING NOW.')
        sys.exit()

    #the load genome function has an assertion test to make sure the genome is supported
    genome = loadGenome(genome_list[0])

    parent_folder = utils.formatFolder(args.output, True)
    output_folder = utils.formatFolder(parent_folder + analysis_name, True)

    #these are the user defined optional arguments
    tss = int(args.tss)
    stitch = args.stitch
    print('stitch')
    print(stitch)

    #list of active genes to constrain analysis
    if len(args.activity) == 0:
        #assumes all genes are active unless told otherwise
        #activity_path,activity_table = getActivity() # fix this function
        print('using all active genes')
        # fix: define defaults so the summary print and the
        # launchDynamicRose/launchCRC calls below do not hit a NameError
        activity_path = ''
        activity_table = []
    else:
        activity_path = args.activity
        activity_table = utils.parseTable(activity_path, '\t')

    print('\n\n#======================================\n#===========I. DATA SUMMARY============\n#======================================\n')

    print('Analyzing datasets described in %s\n' % (data_file))
    print('Name for the analysis: %s\n' % (analysis_name))
    print('Using genome: %s\n' % (genome.name()))

    print('%s datasets: %s\n' % (group1_name, group1_string))
    print('%s datasets: %s\n' % (group2_name, group2_string))

    if len(activity_path) > 0:
        print('Identified %s active genes in the analysis using %s as a list of active genes' % (len(activity_table), activity_path))
    else:
        print('Identified %s active genes in the analysis using aggregate data from %s and %s' % (len(activity_table), group1_name, group2_name))

    print('Writing output to: %s\n' % (output_folder))

    #=====================================================================================
    #======================II. DEFINING CIS-REGULATORY ELEMENTS===========================
    #=====================================================================================

    print('\n\n#======================================\n#=II. MAPPING CIS-REGULATORY ELEMENTS==\n#======================================\n')

    #crc_wrapper will act at the group level and not consider individual datasets
    #since a data table is used as the input, the code will rely heavily on pipeline_dfci
    #embedded tools

    #1. first we need to run meta rose using default parameters and check the output
    #exists for each group

    meta_rose_folder = utils.formatFolder(output_folder + 'meta_rose/', True)

    group1_output = '%s%s/%s_AllEnhancers.table.txt' % (meta_rose_folder, group1_name, group1_name)
    group2_output = '%s%s/%s_AllEnhancers.table.txt' % (meta_rose_folder, group2_name, group2_name)

    #for each output check to see if they exist; if not, launch META_ROSE
    try:
        foo = open(group1_output, 'r')
    except IOError:
        print('No META_ROSE output found for %s. Running META_ROSE now' % (group1_name))
        launchMetaRose(group1_name, group1_list, meta_rose_folder, genome, data_file, stitch, tss)

    try:
        foo = open(group2_output, 'r')
    except IOError:
        print('No META_ROSE output found for %s. Running META_ROSE now' % (group2_name))
        launchMetaRose(group2_name, group2_list, meta_rose_folder, genome, data_file, stitch, tss)

    #now check for completion
    if utils.checkOutput(group1_output, 1, 10):
        print('META_ROSE finished for %s' % (group1_name))
    else:
        print('META_ROSE timed out for %s. EXITING NOW.' % (group1_name))
        sys.exit()

    if utils.checkOutput(group2_output, 1, 10):
        print('META_ROSE finished for %s' % (group2_name))
    else:
        print('META_ROSE timed out for %s. EXITING NOW.' % (group2_name))
        sys.exit()

    #Meta rose does not give all regions that are SE in at least one sample
    #and can be blown out by amplicons etc...
    #sooo we need to run clustering to generate a good input gff
    #ideally we just rewrite dynamic meta to run off of clustering output
    #until we do that let's just overwrite w/ an input gff

    print('Comparing cis-regulatory landscapes of %s and %s' % (group1_name, group2_name))
    dynamic_rose_folder = utils.formatFolder(output_folder + 'dynamic_meta_rose/', True)

    #here we will use the rank table as the primary output
    dynamic_rose_output = '%soutput/%s_%s_%s_merged_MERGED_SUPERS_RANK_TABLE.txt' % (dynamic_rose_folder, genome.name(), group1_name, group2_name)

    try:
        foo = open(dynamic_rose_output, 'r')
    except IOError:
        print('No DYNAMIC_ROSE output found for %s. Running DYNAMIC_ROSE now' % (analysis_name))
        launchDynamicRose(analysis_name, group1_name, group2_name, group1_list, group2_list, meta_rose_folder, dynamic_rose_folder, genome, data_file, activity_path, inputGFF)

    if utils.checkOutput(dynamic_rose_output, 1, 10):
        print('DYNAMIC_ROSE finished for %s' % (analysis_name))
    else:
        print('DYNAMIC_ROSE analysis timed out for %s. EXITING NOW.' % (analysis_name))
        sys.exit()

    #=====================================================================================
    #======================III. IDENTIFYING TF NODES IN NETWORK===========================
    #=====================================================================================

    print('\n\n#======================================\n#===III. RUNNING CIRCUITRY ANALYSIS====\n#======================================\n')

    #now we want to call circuitry on each group... ok to have different subpeaks and motif calls
    #if as a first approximation we weight by the overall enhancer

    crc_folder = utils.formatFolder('%scrc/' % (output_folder), True)

    #for all
    all_crc_folder = utils.formatFolder('%s%s' % (crc_folder, analysis_name), True)
    launchCRC(data_file, genome, dynamic_rose_output, analysis_name, group1_list + group2_list, all_crc_folder, activity_path)

    #for group1
    group1_crc_folder = utils.formatFolder('%s%s' % (crc_folder, group1_name), True)
    launchCRC(data_file, genome, dynamic_rose_output, group1_name, group1_list, group1_crc_folder, activity_path)

    #for group2
    group2_crc_folder = utils.formatFolder('%s%s' % (crc_folder, group2_name), True)
    launchCRC(data_file, genome, dynamic_rose_output, group2_name, group2_list, group2_crc_folder, activity_path)