def main():
    '''
    top level driver for the project

    moves into the project folder, runs pipeline_dfci.summary on the chip and
    atac data tables, then hands the MM1S H3K27AC and MED1 datasets to
    pipeline_dfci.callRose2
    '''

    def print_section(title_line):
        #section header: blank lines, a ruled title, then blank lines again
        rule = '#======================================================================'
        print('\n\n')
        print(rule)
        print(title_line)
        print(rule)
        print('\n\n')

    print('main analysis for project %s' % (projectName))

    print('changing directory to project folder')
    os.chdir(projectFolder)

    print_section(
        '#==================I. LOADING DATA ANNOTATION TABLES==================='
    )

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible

    #for chip data file
    pipeline_dfci.summary(chip_data_file)

    #for atac data file
    pipeline_dfci.summary(atac_data_file)

    print_section(
        '#==========================II. CALLING ROSE2==========================='
    )

    #folder with macs peak output beds
    macs_folder = '%smacsEnriched/' % (projectFolder)

    #create a folder to store ROSE2 output
    rose_folder = utils.formatFolder('%srose/' % (projectFolder), True)

    #calling ROSE2 on H3K27AC and MED1 defined enhancers
    rose_names = ['MM1S_H3K27AC', 'MM1S_MED1']

    rose_options = {
        'extraMap': [],
        'inputFile': '',
        'tss': 2500,
        'stitch': 12500,
        'bashFileName': '%sMM1S_ROSE_CALLS.sh' % (rose_folder),
        'mask': '%sgenomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed' % (projectFolder),
        'useBackground': True,
    }

    pipeline_dfci.callRose2(chip_data_file, macs_folder, rose_folder,
                            rose_names, **rose_options)
def callRoseMerged(dataFile, mergedGFFFile, name1, name2, parentFolder):
    '''
    makes a rose call for the merged supers

    name1 is passed to callRose2 as the only entry of namesList and name2 is
    placed in the extra map, followed by its background when backgrounds are
    in use. both datasets must agree on whether their listed background is
    present in the data table; if only one has it an error is printed and
    the process exits with status 1

    dataFile: path of the data table read by pipeline_dfci.loadDataTable
    mergedGFFFile: passed to callRose2 as its input file
    parentFolder: passed to callRose2 as its parent folder

    returns whatever pipeline_dfci.callRose2 returns
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    backgroundName1 = dataDict[name1]['background']
    backgroundName2 = dataDict[name2]['background']

    #membership via `in` (dict.has_key no longer exists in python 3)
    hasBackground1 = backgroundName1 in dataDict
    hasBackground2 = backgroundName2 in dataDict

    if hasBackground1 != hasBackground2:
        print("ERROR: Only 1 dataset has a background file. This is a very very bad idea")
        #non-zero status so that calling scripts can detect the failure
        sys.exit(1)

    namesList = [name1]
    if hasBackground1:
        extraMap = [name2, backgroundName2]
    else:
        extraMap = [name2]

    return pipeline_dfci.callRose2(dataFile,
                                   '',
                                   parentFolder,
                                   namesList,
                                   extraMap,
                                   mergedGFFFile,
                                   tss=0,
                                   stitch=0)
def callRoseMerged(dataFile,mergedGFFFile,name1,name2,parentFolder):
    '''
    makes a rose call for the merged supers

    callRose2 receives [name1] as namesList and an extra map holding name2
    (plus name2's background when both datasets have their background in the
    data table). rose is called with tss=0 and stitch=0 on mergedGFFFile.

    if exactly one of the two datasets has its background in the data table
    an error is printed and the process exits with status 1

    returns the value returned by pipeline_dfci.callRose2
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    backgroundName1 = dataDict[name1]['background']
    backgroundName2 = dataDict[name2]['background']

    #count how many of the two listed backgrounds are present in the table
    #(`in` is used because dict.has_key was removed in python 3)
    foundBackgrounds = [bg for bg in (backgroundName1,backgroundName2) if bg in dataDict]

    if len(foundBackgrounds) == 1:
        print("ERROR: Only 1 dataset has a background file. This is a very very bad idea")
        #exit non-zero: this is a failure, not a normal termination
        sys.exit(1)

    hasBackground = len(foundBackgrounds) == 2

    namesList = [name1]
    extraMap = [name2]
    if hasBackground:
        extraMap.append(backgroundName2)

    return pipeline_dfci.callRose2(dataFile,'',parentFolder,namesList,extraMap,mergedGFFFile,tss=0,stitch=0)
def callRoseMerged(dataFile,mergedGFFFile,name1,name2,parentFolder,namesList1,namesList2,useBackground=False):
    '''
    makes a rose call for the merged supers

    the first dataset of namesList1 is used as a dummy namesList entry so the
    rose wrapper can run; every dataset of namesList1 and namesList2 (and,
    with useBackground, each of their backgrounds) is loaded into the extra map

    name1,name2: only used to build the bash file name
                 <parentFolder><name1>_<name2>_rose.sh
    useBackground: when True every dataset must list a background that is
                   present in the data table, otherwise an error is printed
                   and the process exits with status 1

    returns whatever pipeline_dfci.callRose2 returns
    '''

    roseBashFile = '%s%s_%s_rose.sh' % (parentFolder,name1,name2)
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #just set the first dataset of namesList1 so the code can run
    #all of the data will be in the extramap
    namesList = [namesList1[0]]

    allNames = namesList1 + namesList2
    extraMap = list(allNames)

    if useBackground:
        #first check that all datasets have a background
        #backgrounds are appended after all of the dataset names
        for name in allNames:
            backgroundName = dataDict[name]['background']
            #`in` instead of dict.has_key, which python 3 no longer provides
            if backgroundName not in dataDict:
                print("ERROR: No background dataset found for %s incompatible with --use-background flag" % (name))
                #non-zero status so callers can tell this was a failure
                sys.exit(1)
            extraMap.append(backgroundName)

    #useBackground=False here: don't want additional background correction
    #from the pipeline wrapper of rose
    return pipeline_dfci.callRose2(dataFile,'',parentFolder,namesList,extraMap,mergedGFFFile,tss=0,stitch=0,bashFileName=roseBashFile,mask='',useBackground=False)
def mapMergedGFF(dataFile,nameDict,mergedGFFFile,analysisName,outputFolder,maskFile):
    '''
    calls rose on the mergedGFFFile for all datasets

    dataFile: data table path read by pipeline_dfci.loadDataTable
    nameDict: dict keyed by dataset name; nameDict[name]['background'] is
              tested for truth to decide whether the background is mapped
    mergedGFFFile: gff handed to callRose2 as the input file
    analysisName: used to name the bash file <outputFolder>rose/<analysisName>_roseCall.sh
    outputFolder: rose output is placed in <outputFolder>rose/
    maskFile: passed to callRose2 as mask

    returns the path of the merged region map. exits with status 1 if a
    listed background is not in the data table or if the region map cannot
    be found after running rose
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    roseParentFolder = "%srose/" % (outputFolder)
    utils.formatFolder(roseParentFolder,True)
    gffName = mergedGFFFile.split('/')[-1].split('.')[0]
    bashFileName = "%srose/%s_roseCall.sh" % (outputFolder,analysisName)

    #namesList is just the first dataset
    #extrmap will have to have all other datasets + their backgrounds
    #sorted() works on python 2 and 3 (keys() followed by .sort() does not)
    namesList = sorted(nameDict)

    extraMap = []
    for name in namesList[1:]:
        if not nameDict[name]['background']:
            extraMap.append(name)
            continue

        backgroundName = dataDict[name]['background']
        if backgroundName not in dataDict:
            print("ERROR: UNABLE TO FIND LISTED BACKGROUND DATASET %s FOR %s" % (backgroundName,name))
            #non-zero status so the failure is visible to the caller
            sys.exit(1)
        extraMap += [name,backgroundName]

    print(extraMap)

    #first check to see if this has already been done
    mergedRegionMap = "%srose/%s_ROSE/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (outputFolder,namesList[0],gffName)
    print("LOOKING FOR REGION MAP AT %s" % (mergedRegionMap))

    if utils.checkOutput(mergedRegionMap,1,1):
        print("FOUND PREVIOUS REGION MAP")
        return mergedRegionMap

    #callRose2's return value is used as the bash file to run
    bashFileName = pipeline_dfci.callRose2(dataFile,'',roseParentFolder,[namesList[0]],extraMap,mergedGFFFile,0,0,bashFileName,mask=maskFile)

    bashCommand = "bash %s" % (bashFileName)
    #announce the command before running it and flush so the message
    #precedes any output written by the script itself
    print("Running enhancer mapping command:\n%s" % (bashCommand))
    sys.stdout.flush()
    os.system(bashCommand)

    if utils.checkOutput(mergedRegionMap,5,60):
        return mergedRegionMap

    print("UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE %s.\nEXITING NOW" % (mergedGFFFile))
    sys.exit(1)
def map_shep_enhancers(shep_on_dataFile):
    '''
    for enhancers in individual systems defined by k27ac

    prints the dataset names found in the data table, then calls
    pipeline_dfci.callRose2 on the three SHEP H3K27AC datasets and
    returns the path given to callRose2 as its bash file name
    '''

    #show what is available in the table
    shep_table = pipeline_dfci.loadDataTable(shep_on_dataFile)
    print(shep_table.keys())

    rose_dir = utils.formatFolder('%senhancer_rose' % (projectFolder),True)
    rose_script = '%senhancer_rose/shep_on_enhancer_rose.sh' %(projectFolder)

    #only these datasets are used to define enhancers
    k27ac_names = [
        'SHEP_0HR_H3K27AC',
        'SHEP_2HR_H3K27AC',
        'SHEP_6HR_H3K27AC',
    ]

    pipeline_dfci.callRose2(shep_on_dataFile,
                            macsEnrichedFolder,
                            rose_dir,
                            k27ac_names,
                            extraMap=[],
                            inputFile='',
                            tss=2500,
                            stitch='',
                            bashFileName=rose_script,
                            mask=maskFile)

    return rose_script
def map_nb_enhancers(nb_all_chip_dataFile):
    '''
    for enhancers in individual systems defined by k27ac

    prints the dataset names found in the data table, then calls
    pipeline_dfci.callRose2 on the four neuroblastoma H3K27AC datasets
    listed below and returns the path given to callRose2 as its bash file name
    '''

    #show what is available in the table
    nb_table = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)
    print(nb_table.keys())

    rose_dir = utils.formatFolder('%senhancer_rose' % (projectFolder), True)
    rose_script = '%senhancer_rose/nb_enhancer_rose.sh' % (projectFolder)

    #only these datasets are used to define enhancers
    k27ac_names = [
        'SHEP21_0HR_H3K27AC_NOSPIKE',
        'BE2C_H3K27AC',
        'KELLY_H3K27AC',
        'NGP_H3K27AC',
    ]

    pipeline_dfci.callRose2(nb_all_chip_dataFile,
                            macsEnrichedFolder,
                            rose_dir,
                            k27ac_names,
                            extraMap=[],
                            inputFile='',
                            tss=2500,
                            stitch='',
                            bashFileName=rose_script,
                            mask=maskFile)

    return rose_script
def mapMergedGFF(dataFile, nameDict, mergedGFFFile, analysisName,
                 outputFolder, maskFile):
    '''
    calls rose on the mergedGFFFile for all datasets

    the alphabetically first dataset in nameDict is passed to callRose2 as
    namesList; every other dataset goes into the extra map, followed by its
    background when nameDict[name]['background'] is set

    if the expected region map is already present it is returned without
    calling rose again. otherwise the bash file returned by callRose2 is run
    and the region map is looked for again

    returns the path of the merged region map. exits with status 1 if a
    listed background is missing from the data table or if the region map
    cannot be found after running rose
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    roseParentFolder = "%srose/" % (outputFolder)
    utils.formatFolder(roseParentFolder, True)
    gffName = mergedGFFFile.split('/')[-1].split('.')[0]
    bashFileName = "%srose/%s_roseCall.sh" % (outputFolder, analysisName)

    #namesList is just the first dataset
    #extrmap will have to have all other datasets + their backgrounds
    #sorted() replaces keys() + .sort(), which fails on python 3
    namesList = sorted(nameDict)

    extraMap = []
    for name in namesList[1:]:
        extraMap.append(name)
        if nameDict[name]['background']:
            backgroundName = dataDict[name]['background']
            #`in` replaces dict.has_key (removed in python 3)
            if backgroundName not in dataDict:
                print("ERROR: UNABLE TO FIND LISTED BACKGROUND DATASET %s FOR %s" %
                      (backgroundName, name))
                #exit non-zero so the failure is not mistaken for success
                sys.exit(1)
            extraMap.append(backgroundName)

    print(extraMap)

    #first check to see if this has already been done
    mergedRegionMap = "%srose/%s_ROSE/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (
        outputFolder, namesList[0], gffName)
    print("LOOKING FOR REGION MAP AT %s" % (mergedRegionMap))

    if utils.checkOutput(mergedRegionMap, 1, 1):
        print("FOUND PREVIOUS REGION MAP")
        return mergedRegionMap

    #callRose2's return value is used as the bash file to run
    bashFileName = pipeline_dfci.callRose2(dataFile,
                                           '',
                                           roseParentFolder, [namesList[0]],
                                           extraMap,
                                           mergedGFFFile,
                                           0,
                                           0,
                                           bashFileName,
                                           mask=maskFile)

    bashCommand = "bash %s" % (bashFileName)
    #log the command before it runs (it used to be reported only after the
    #script had finished); flush so the line precedes the script's own output
    print("Running enhancer mapping command:\n%s" % (bashCommand))
    sys.stdout.flush()
    os.system(bashCommand)

    if not utils.checkOutput(mergedRegionMap, 5, 60):
        print("UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE %s.\nEXITING NOW" %
              (mergedGFFFile))
        sys.exit(1)

    return mergedRegionMap
def launchEnhancerMapping(dataFile,
                          nameDict,
                          outputFolder,
                          roseFolder,
                          stitch,
                          tssDistance,
                          enhancerType,
                          maskFile=''):
    '''
    launches enhancer mapping if needed from enriched region files

    dataFile: data table path passed through to pipeline_dfci.callRose2
    nameDict: dict keyed by dataset name; each value holds an 'enhancerFile'
              entry ('' when rose still has to be run) and an 'enrichedFile'
              entry that is given to rose as its input file
    outputFolder: not used by this function
    roseFolder: folder in which the <name>_ROSE/ output folders are placed
    stitch, tssDistance, maskFile: passed through to callRose2
    enhancerType: 'super', 'stretch' or 'superstretch'; selects which rose
                  table is recorded as the enhancer file

    returns nameDict with 'enhancerFile' filled in for the datasets that were
    run. exits with status 1 on an unrecognized enhancerType or when a rose
    output folder contains no files
    '''

    namesList = list(nameDict.keys())

    #check to see if everything is good, if so return nameDict and call it a day
    if all(len(nameDict[x]['enhancerFile']) > 0 for x in namesList):
        print("ENHANCER FILE OUTPUT FOUND FOR ALL DATASETS")
        return nameDict

    #define the enhancer type
    #checked before any rose job is launched: an unknown type used to surface
    #as a NameError only after the jobs had already been started
    enhancerStringDict = {
        'super': 'AllEnhancers.table.txt',
        'stretch': 'AllEnhancers_Length.table.txt',
        'superstretch': 'AllEnhancers_SuperStretch.table.txt',
    }
    if enhancerType not in enhancerStringDict:
        print("ERROR: UNRECOGNIZED ENHANCER TYPE %s. MUST BE super, stretch, OR superstretch" %
              (enhancerType))
        sys.exit(1)
    enhancerString = enhancerStringDict[enhancerType]

    #if not, have to call rose
    roseOutputFolder = utils.formatFolder(roseFolder, True)

    queueList = []
    for name in namesList:

        #check to see if we need to call rose
        if nameDict[name]['enhancerFile'] == '':

            #get the enriched file
            enrichedFile = nameDict[name]['enrichedFile']

            #call rose
            print("CALLING ROSE FOR %s" % (name))
            bashFileName = pipeline_dfci.callRose2(dataFile,
                                                   '',
                                                   roseOutputFolder, [name], [],
                                                   enrichedFile,
                                                   tssDistance,
                                                   stitch,
                                                   mask=maskFile)
            print(bashFileName)
            #trailing & sends the job to the background so datasets run together
            os.system('bash %s &' % (bashFileName))

            #add name to queue list
            queueList.append(name)

    #now check for completion of datasets
    for name in queueList:

        #check for the AllEnhancers table
        enhancerFile = "%s%s_ROSE/%s_peaks_%s" % (roseOutputFolder, name, name,
                                                  enhancerString)

        print("CHECKING FOR %s ROSE OUTPUT IN %s" % (name, enhancerFile))
        if utils.checkOutput(enhancerFile, 1, 10):
            print("FOUND ENHANCER OUTPUT FOR %s" % (name))
        else:
            #try finding it w/ a different name
            #this will bug out if nothing is there (os.listdir raises when the
            #folder does not exist)
            #kept in its own variable so the roseFolder argument is not clobbered
            datasetRoseFolder = "%s%s_ROSE/" % (roseOutputFolder, name)
            roseFileList = [
                x for x in os.listdir(datasetRoseFolder) if x[0] != '.'
            ]  #no hidden files
            if len(roseFileList) == 0:
                print("No files found in %s" % (datasetRoseFolder))
                #non-zero status so the failure is visible to the caller
                sys.exit(1)

            enhancerFile = getFile(enhancerString, roseFileList,
                                   datasetRoseFolder)

        nameDict[name]['enhancerFile'] = enhancerFile

    return nameDict
def launchEnhancerMapping(dataFile,nameDict,outputFolder,roseFolder,stitch,tssDistance,enhancerType,maskFile=''):
    '''
    launches enhancer mapping if needed from enriched region files

    every dataset in nameDict whose 'enhancerFile' entry is '' gets a rose
    call on its 'enrichedFile'; the jobs are started in the background and
    the expected output table is then looked for, dataset by dataset

    enhancerType must be 'super', 'stretch' or 'superstretch' and selects
    which rose table is recorded as the enhancer file
    outputFolder is not used by this function

    returns nameDict with the 'enhancerFile' entries filled in. exits with
    status 1 on an unrecognized enhancerType or when a rose output folder
    contains no files
    '''

    namesList = list(nameDict.keys())

    #check to see if everything is good, if so return nameDict and call it a day
    missingList = [x for x in namesList if len(nameDict[x]['enhancerFile']) == 0]
    if len(missingList) == 0:
        print("ENHANCER FILE OUTPUT FOUND FOR ALL DATASETS")
        return nameDict

    #define the enhancer type
    #validated before launching anything: previously an unknown type caused a
    #NameError only once the rose jobs were already running
    enhancerStringDict = {'super':'AllEnhancers.table.txt',
                          'stretch':'AllEnhancers_Length.table.txt',
                          'superstretch':'AllEnhancers_SuperStretch.table.txt',
                          }
    enhancerString = enhancerStringDict.get(enhancerType)
    if enhancerString is None:
        print("ERROR: UNRECOGNIZED ENHANCER TYPE %s. MUST BE super, stretch, OR superstretch" % (enhancerType))
        sys.exit(1)

    #if not, have to call rose
    roseOutputFolder = utils.formatFolder(roseFolder,True)

    queueList = []
    for name in namesList:

        #check to see if we need to call rose
        if nameDict[name]['enhancerFile'] != '':
            continue

        #get the enriched file
        enrichedFile = nameDict[name]['enrichedFile']

        #call rose
        print("CALLING ROSE FOR %s" % (name))
        bashFileName = pipeline_dfci.callRose2(dataFile,'',roseOutputFolder,[name],[],enrichedFile,tssDistance,stitch,mask=maskFile)
        print(bashFileName)
        #the & backgrounds the job so that all datasets run at the same time
        os.system('bash %s &' % (bashFileName))

        #add name to queue list
        queueList.append(name)

    #now check for completion of datasets
    for name in queueList:

        #check for the AllEnhancers table
        enhancerFile = "%s%s_ROSE/%s_peaks_%s" % (roseOutputFolder,name,name,enhancerString)

        print("CHECKING FOR %s ROSE OUTPUT IN %s" % (name,enhancerFile))
        if utils.checkOutput(enhancerFile,1,10):
            print("FOUND ENHANCER OUTPUT FOR %s" % (name))
            nameDict[name]['enhancerFile'] = enhancerFile
            continue

        #try finding it w/ a different name
        #this will bug out if nothing is there (os.listdir raises when the
        #folder is missing)
        #separate variable so the roseFolder argument is left untouched
        datasetRoseFolder = "%s%s_ROSE/" % (roseOutputFolder,name)
        roseFileList = [x for x in os.listdir(datasetRoseFolder) if x[0] != '.'] #no hidden files
        if len(roseFileList) == 0:
            print("No files found in %s" % (datasetRoseFolder))
            #exit non-zero so the failure is not reported as success
            sys.exit(1)

        nameDict[name]['enhancerFile'] = getFile(enhancerString,roseFileList,datasetRoseFolder)

    return nameDict
def wrapDRose(dataFile, name1, name2, analysis_name):
    '''
    wraps the delta rose analysis
    that will be done here using rose w/ 0 tss and 0 stitch

    dataFile: data table path passed to callRose2 and to dynamicEnhancer.py
    name1, name2: the two datasets being compared
    analysis_name: used to name the bash files, the dynamic rose output
                   folder and the final gff

    relies on the module level projectFolder, macsEnrichedFolder, maskFile,
    pipeline_dir, genome and gffFolder

    returns the path of the rank table rewritten as a gff. exits with
    status 1 if the dynamic rose rank table cannot be found
    '''

    #first call rose
    parentFolder = utils.formatFolder('%stwist1_rose/' % (projectFolder), True)

    #determine what the eventual output will look like
    enhancer_path_1 = '%s%s_ROSE/%s_peaks_AllEnhancers.table.txt' % (
        parentFolder, name1, name1)
    enhancer_path_2 = '%s%s_ROSE/%s_peaks_AllEnhancers.table.txt' % (
        parentFolder, name2, name2)

    if utils.checkOutput(enhancer_path_1, 0.1, 0.1) and utils.checkOutput(
            enhancer_path_2, 0.1, 0.1):
        print('Found ROSE2 output for %s and %s in %s' %
              (name1, name2, parentFolder))
    else:
        #the dataset names were never substituted into this message before
        print('Running ROSE2 on %s and %s with -t 0 and -s 0 parameters' %
              (name1, name2))
        bashFileName = '%s%s_rose.sh' % (parentFolder, analysis_name)
        pipeline_dfci.callRose2(dataFile, macsEnrichedFolder, parentFolder,
                                [name1, name2], [], '', 0, 0, bashFileName,
                                maskFile, True)
        #NOTE: the rose bash file is not run from here; the original
        #os.system('bash %s' % (bashFileName)) call is disabled

    #next run dynamic rose
    dynamicFolder = utils.formatFolder('%sdynamic_rose/' % (projectFolder),
                                       True)

    rose_folder_1 = '%s%s_ROSE/' % (parentFolder, name1)
    rose_folder_2 = '%s%s_ROSE/' % (parentFolder, name2)

    dynamic_cmd = 'python %sdynamicEnhancer.py -g %s -d %s -n %s,%s -r %s,%s -o %s%s/ -a' % (
        pipeline_dir, genome, dataFile, name1, name2, rose_folder_1,
        rose_folder_2, dynamicFolder, analysis_name)

    bashFileName = '%s%s_dynamic.sh' % (dynamicFolder, analysis_name)
    #with-block makes sure the script is closed even if a write fails
    with open(bashFileName, 'w') as bashFile:
        bashFile.write('#!/usr/bin/bash\n\n')
        bashFile.write('#dynamic rose on twist datasets for %s\n\n' %
                       (analysis_name))
        bashFile.write(dynamic_cmd + '\n\n')

    rank_path = '%s%s/output/%s_%s_%s_merged_MERGED_ENHANCERS_RANK_TABLE.txt' % (
        dynamicFolder, analysis_name, genome.upper(), name1, name2)
    print(rank_path)

    if not utils.checkOutput(rank_path, 0.1, 0.1):
        #only run if you can't find the terminal output
        print('Running dynamic rose from %s' % (bashFileName))
        os.system('bash %s' % (bashFileName))

    if not utils.checkOutput(rank_path, 1, 30):
        print(
            'Error: operation timed out. Cannot find expected dynamic output at %s'
            % (rank_path))
        #non-zero status so the failure is visible to the caller
        sys.exit(1)

    print('Found dynamic rose output at %s' % (rank_path))
    rank_table = utils.parseTable(rank_path, '\t')

    #rewrite every row after the header as a gff line
    #columns 0-3 of the rank table are reused; column 0 fills both the
    #second and the last gff field
    rank_gff = [[line[1], line[0], '', line[2], line[3], '', '.', '', line[0]]
                for line in rank_table[1:]]

    rank_gff_path = '%s%s_%s_RANK.gff' % (gffFolder, genome.upper(),
                                          analysis_name)
    print('writing rank table as a gff to %s' % (rank_gff_path))
    utils.unParseTable(rank_gff, rank_gff_path, '\t')
    return rank_gff_path