def run_macs(dataFile):
    '''
    Runs MACS 1.4.2 peak calling (p-value 1e-9) on every dataset in dataFile
    that is not a control (name contains neither 'WCE' nor 'INPUT'),
    waits for all summit files to appear, then formats the MACS output.

    dataFile: path to a pipeline_dfci data table.
    Side effects: launches MACS, changes cwd back to projectFolder, writes
    formatted output into macsEnrichedFolder / wiggleFolder. Exits the
    process if any peak file fails to appear within the polling timeout.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    # exclude control datasets (whole-cell extract / input) from peak calling
    namesList = [name for name in dataDict.keys() if name.upper().count('WCE') ==0 and name.upper().count('INPUT') == 0]
    namesList.sort()
    print(namesList)
    pipeline_dfci.callMacs(dataFile,macsFolder,namesList,overwrite=False,pvalue='1e-9')
    os.chdir(projectFolder) # the silly call macs script has to change into the output dir
    #so this takes us back to the project folder

    #to check for completeness, we will try to find all of the peak files
    peak_calling_done = False
    while not peak_calling_done:
        dataDict = pipeline_dfci.loadDataTable(dataFile)
        namesList = [name for name in dataDict.keys() if name.upper().count('WCE') ==0 and name.upper().count('INPUT') == 0]
        for name in namesList:
            peak_path = '%s%s/%s_summits.bed' % (macsFolder,name,name)
            print('searching for %s' % (peak_path))
            # checkOutput polls for the file (here: 1s interval, 180 tries);
            # any dataset timing out aborts the whole run below
            if utils.checkOutput(peak_path,1,180):
                peak_calling_done =True
                # NOTE(review): the flag is set as soon as ONE file is found, but the
                # for-loop still checks every dataset (failure exits), so in practice
                # all files must exist before the while-loop terminates. The
                # `continue` here is redundant — it is the last statement of the branch.
                print('found %s' % (peak_path))
                continue
            else:
                print('Error: peak calling timed out')
                sys.exit()
    #now format the macs output
    print('formatting macs output')
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    namesList = [name for name in dataDict.keys() if name.upper().count('WCE') ==0 and name.upper().count('INPUT') == 0]
    pipeline_dfci.formatMacsOutput(dataFile,macsFolder,macsEnrichedFolder,wiggleFolder,wigLink ='',useBackground=True)
    print('Finished running Macs 1.4.2')
def wrapGeneMapper(data_file,names_list=[],launch=True):
    '''
    runs ROSE2 GENE MAPPER on the AllEnhancers table

    data_file: pipeline_dfci data table path.
    names_list: datasets to map; defaults to every dataset whose name
        contains 'H3K27AC' exactly once.
    launch: if True, immediately runs each generated .sh script.
    Side effects: writes one <name>_geneMapper.sh per dataset into its
    ROSE output folder (under <projectFolder>rose_final) and optionally runs it.
    '''
    data_dict = pipeline_dfci.loadDataTable(data_file)
    parent_rose_folder = utils.formatFolder('%srose_final' % (projectFolder),False)
    if len(names_list) ==0:
        names_list=[name for name in data_dict.keys() if name.upper().count('H3K27AC') ==1]

    #find each individual all enhancer table and then call the mapper via an .sh script
    for name in names_list:
        print(name)
        dataset_rose_folder = utils.formatFolder('%s%s_ROSE' %(parent_rose_folder,name),False)
        # assumes the standard ROSE2 naming convention for the AllEnhancers table
        all_enhancer_path = '%s%s_peaks_AllEnhancers.table.txt' % (dataset_rose_folder,name)
        #print(all_enhancer_path)
        mapper_bash_path = '%s%s_geneMapper.sh' % (dataset_rose_folder,name)
        mapper_bash_file = open(mapper_bash_path,'w')
        mapper_bash_file.write('#!/usr/bin/bash\n\n\n\n')
        mapper_bash_file.write('#Running ROSE2 GENE MAPPER ON %s ALL ENHANCERS OUTPUT\n\n' % (name))
        # -f: overwrite; -w 100000: 100kb window for gene assignment
        mapper_cmd = 'python %sROSE2_geneMapper.py -g %s -i %s -f -w 100000' % (pipeline_dir,genome,all_enhancer_path)
        mapper_bash_file.write(mapper_cmd+'\n')
        mapper_bash_file.close()
        print('wrote gene mapper command to %s' % (mapper_bash_path))
        if launch:
            os.system('bash %s' % mapper_bash_path)
def makeBoxPlot(dataFile,set_name,gff_name,names_list=[]):
    '''
    wrapping the boxplot script

    Builds the argument list for r_scripts/4_chiprx_plots.R (signal table,
    ChIP-Rx scale-factor table, dataset names, background names, plot name)
    and runs it via Rscript.

    dataFile: pipeline_dfci data table path.
    set_name: substring used to select datasets when names_list is empty.
    gff_name: region-set name used to locate the precomputed signal table.
    Side effects: shells out to Rscript; assumes the signal table was
    already written by the signal-mapping step.
    '''
    boxplot_script_path = '%sr_scripts/4_chiprx_plots.R' % (projectFolder)
    # hard-coded scale factor table for the SHEP21 ChIP-Rx experiment
    scale_table_path = '%sHG19_SHEP21_CHIPRX_SCALE_FACTORS.txt' % (tableFolder)
    dataDict= pipeline_dfci.loadDataTable(dataFile)
    dataFile_name = dataFile.split('/')[-1].split('.')[0]
    if len(names_list) == 0:
        names_list = [name for name in dataDict.keys() if name.count(set_name) > 0]
        names_list.sort()
    background_list = [ dataDict[name]['background'] for name in names_list]
    names_string = ','.join(names_list)
    background_string = ','.join(background_list)
    # signal table path follows the naming convention of map_regions/makeSignalTable
    signal_table_path = '%sHG19_%s_%s_SIGNAL.txt' % (signalFolder,gff_name,dataFile_name)
    plot_name = '%s_%s' % (gff_name,set_name)
    r_cmd = 'Rscript %s %s %s %s %s %s %s' % (boxplot_script_path,signal_table_path,scale_table_path,names_string,background_string,plot_name,projectFolder)
    print(r_cmd)
    os.system(r_cmd)
def launchCRC(data_file,genome,dynamic_rose_output,group_name,group_list,crc_folder,activity_path):
    '''
    launches CRC analysis on all bams in a group w/ subpeaks
    #how do we get subpeaks piped through?

    data_file: pipeline_dfci data table path.
    genome: genome object — note `.name()` is called on it, so this is an
        object, not a string like in some sibling functions.
    dynamic_rose_output: enhancer input (-e) for CRC3.py.
    group_list: dataset names whose bams are pooled for the analysis.
    activity_path: optional activity table; skipped when empty.
    Side effects: writes <group_name>_crc.sh into crc_folder and runs it.
    '''
    dataDict = pipeline_dfci.loadDataTable(data_file)
    bam_string = ','.join([dataDict[name]['bam'] for name in group_list])

    #set up the crc command
    crc_cmd = 'python CRC3.py -e %s -b %s -g %s -o %s -n %s' % (dynamic_rose_output,bam_string,genome.name(),crc_folder,group_name)
    if len(activity_path) > 0:
        crc_cmd += ' --activity %s' % (activity_path)
    bash_path = '%s%s_crc.sh' % (crc_folder,group_name)
    bash_file = open(bash_path,'w')
    bash_file.write('#!/usr/bin/bash\n\n')
    # cd so the relative 'CRC3.py' in the command resolves
    bash_file.write('cd %s\n\n' % (whereAmI))
    bash_file.write(crc_cmd)
    bash_file.write('\n\n')
    bash_file.close()
    print('Wrote CRC command for %s to %s' % (group_name,bash_path))
    print('Launching CRC')
    os.system('bash %s' % (bash_path))
def callRoseMerged(dataFile,mergedGFFFile,name1,name2,parentFolder,namesList1,namesList2,useBackground=False):
    '''
    makes a rose call for the merged supers

    Ranks on the first dataset of namesList1 only (a dummy anchor); every
    other dataset (and, with useBackground, each dataset's background) is
    carried through the ROSE2 extraMap so it gets quantified at the merged
    regions.

    Returns the path of the bash file produced by pipeline_dfci.callRose2.
    Exits the process if useBackground is set and any dataset lacks a
    background entry in the data table.
    '''
    #use the first column as a dummy, then load everything up into the extra map
    roseBashFile = '%s%s_%s_rose.sh' % (parentFolder,name1,name2)
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #just set the first dataset of namesList1 so the code can run
    #all of the data will be in the extramap
    namesList = [namesList1[0]]
    if useBackground:
        #first check that all datasets have a background
        backgroundList = []
        for name in namesList1 + namesList2:
            backgroundName = dataDict[name]['background']
            # was dataDict.has_key(backgroundName): Python-2-only API, removed in Python 3
            if backgroundName in dataDict:
                backgroundList.append(backgroundName)
            else:
                print("ERROR: No background dataset found for %s incompatible with --use-background flag" % (name))
                sys.exit()
        extraMap = namesList1 + namesList2 + backgroundList
    else:
        extraMap = namesList1 + namesList2

    #don't want additional background correction from the pipeline wrapper of rose
    return pipeline_dfci.callRose2(dataFile,'',parentFolder,namesList,extraMap,mergedGFFFile,tss=0,stitch=0,bashFileName=roseBashFile,mask='',useBackground=False)
def callRoseMerged(dataFile, mergedGFFFile, name1, name2, parentFolder):
    '''
    makes a rose call for the merged supers

    Requires that either BOTH name1 and name2 have background datasets in
    the table, or neither does; a mixed state aborts the run. Ranks on
    name1 and carries name2 (plus its background, when present) through
    the ROSE2 extraMap.

    Returns the result of pipeline_dfci.callRose2.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    backgroundName1 = dataDict[name1]['background']
    backgroundName2 = dataDict[name2]['background']
    # membership tests were dataDict.has_key(...): Python-2-only, removed in Python 3
    if backgroundName1 in dataDict and backgroundName2 in dataDict:
        hasBackground = True
    elif backgroundName1 not in dataDict and backgroundName2 not in dataDict:
        hasBackground = False
    else:
        # one-sided background would bias the comparison
        print("ERROR: Only 1 dataset has a background file. This is a very very bad idea")
        sys.exit()

    if hasBackground:
        namesList = [name1]
        extraMap = [name2, dataDict[name2]['background']]
    else:
        namesList = [name1]
        extraMap = [name2]

    return pipeline_dfci.callRose2(dataFile, '', parentFolder, namesList,
                                   extraMap, mergedGFFFile, tss=0, stitch=0)
def make_nb_active_gene_lists(nb_all_chip_dataFile):
    '''
    Builds active-gene lists for the neuroblastoma (NB) lines from H3K27ac
    promoter signal: one union list across BE2C/KELLY/NGP/SHEP21 and one
    per-line list.

    Side effects: writes TSS GFFs, maps MACS-enriched regions to the
    +/-1kb TSS GFF, and writes gene list files under <projectFolder>geneListFolder.
    '''
    pipeline_dfci.makeGeneGFFs(annotFile,gffFolder,species=genome.upper())
    dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)
    setName = 'NB_TSS_H3K27AC'
    gffList = ['%sHG19_TSS_ALL_-1000_+1000.gff' % (gffFolder)]
    cellTypeList = ['BE2C','KELLY','NGP','SHEP21']
    namesList = [name for name in dataDict.keys() if name.count('H3K27AC') == 1]
    pipeline_dfci.mapEnrichedToGFF(nb_all_chip_dataFile,setName,gffList,cellTypeList,macsEnrichedFolder,mappedEnrichedFolder,True,namesList,useBackground=True)

    #this is for the union
    mappedEnrichedFile = '%sHG19_TSS_ALL_-1000_+1000/HG19_TSS_ALL_-1000_+1000_NB_TSS_H3K27AC.txt' % (mappedEnrichedFolder)
    #this setList variable defines overlap logic for promoters. In this case, it's asking for the union of all datasets
    setList = [['BE2C_H3K27AC'],['KELLY_H3K27AC'],['NGP_H3K27AC'],['SHEP21_0HR_H3K27AC_NOSPIKE']]
    output = '%sgeneListFolder/HG19_NB_H3K27AC_ACTIVE_UNION.txt' % (projectFolder)
    pipeline_dfci.makeGFFListFile(mappedEnrichedFile,setList,output,annotFile)

    #this is for individual NB datasets
    namesList =['BE2C_H3K27AC','KELLY_H3K27AC','NGP_H3K27AC','SHEP21_0HR_H3K27AC_NOSPIKE']
    for name in namesList:
        mappedEnrichedFile = '%sHG19_TSS_ALL_-1000_+1000/HG19_TSS_ALL_-1000_+1000_NB_TSS_H3K27AC.txt' % (mappedEnrichedFolder)
        # a single-dataset setList means "active in this line alone"
        setList = [[name]]
        output = '%sgeneListFolder/HG19_%s_ACTIVE.txt' % (projectFolder,name)
        pipeline_dfci.makeGFFListFile(mappedEnrichedFile,setList,output,annotFile)
def callRoseMerged(dataFile,mergedGFFFile,name1,name2,parentFolder):
    '''
    makes a rose call for the merged supers

    Duplicate of the 5-argument callRoseMerged defined earlier in this
    module (the later definition wins at import time). Requires both or
    neither of name1/name2 to have a background dataset; ranks on name1
    and maps name2 (plus its background when available) via extraMap.

    Returns the result of pipeline_dfci.callRose2.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    backgroundName1 = dataDict[name1]['background']
    backgroundName2 = dataDict[name2]['background']
    # was dataDict.has_key(...): Python-2-only API, removed in Python 3
    if backgroundName1 in dataDict and backgroundName2 in dataDict:
        hasBackground = True
    elif backgroundName1 not in dataDict and backgroundName2 not in dataDict:
        hasBackground = False
    else:
        print("ERROR: Only 1 dataset has a background file. This is a very very bad idea")
        sys.exit()

    if hasBackground:
        namesList = [name1]
        extraMap = [name2,dataDict[name2]['background']]
    else:
        namesList = [name1]
        extraMap = [name2]

    return pipeline_dfci.callRose2(dataFile,'',parentFolder,namesList,extraMap,mergedGFFFile,tss=0,stitch=0)
def map_regions(dataFile,gffList,names_list=[]): ''' making a normalized binding signal table at all regions ''' #since each bam has different read lengths, important to carefully normalize quantification dataDict = pipeline_dfci.loadDataTable(dataFile) dataFile_name = dataFile.split('/')[-1].split('.')[0] if len(names_list) == 0: names_list = dataDict.keys() names_list.sort() for name in names_list: bam = utils.Bam(dataDict[name]['bam']) read_length = bam.getReadLengths()[0] bam_extension = 200-read_length print('For dataset %s using an extension of %s' % (name,bam_extension)) pipeline_dfci.mapBamsBatch(dataFile,gffList,mappedFolder,overWrite =False,namesList = [name],extension=bam_extension,rpm=True) #want a signal table of all datasets to each gff print('Writing signal tables for each gff:') for gffFile in gffList: gffName = gffFile.split('/')[-1].split('.')[0] signal_table_path = '%s%s_%s_SIGNAL.txt' % (signalFolder,gffName,dataFile_name) print(signal_table_path) pipeline_dfci.makeSignalTable(dataFile,gffFile,mappedFolder,namesList = names_list,medianNorm=False,output =signal_table_path)
def wrap_enhancer_promoter(dataFile, input_path, activity_path, analysis_name, names_list=[], useBackground=True): ''' runs enhancer promoter on everybody with the conserved regions and union of active genes ''' #hard coded paths tads_path = '%shESC_domains_hg19.bed' % (bedFolder) #setting the output folder ep_folder = utils.formatFolder('%senhancerPromoter/' % (projectFolder), True) dataDict = pipeline_dfci.loadDataTable(dataFile) if len(names_list) == 0: names_list = [name for name in dataDict.keys()] names_list.sort() bams_list = [dataDict[name]['bam'] for name in names_list] bams_string = ' '.join(bams_list) background_names = [ dataDict[name]['background'] for name in names_list ] background_list = [ dataDict[background_name]['bam'] for background_name in background_names ] background_string = ' '.join(background_list) ep_bash_path = '%s%s_enhancer_promoter.sh' % (ep_folder, analysis_name) ep_bash = open(ep_bash_path, 'w') ep_bash.write('#!/usr/bin/bash\n\n\n') ep_bash.write('#enhancer promoter analysis for %s\n\n' % (analysis_name)) if useBackground: python_cmd = 'python %senhancerPromoter.py -b %s -c %s -g %s -i %s -o %s -a %s --name %s --tads %s --top 2000\n\n' % ( pipeline_dir, bams_string, background_string, genome.upper(), input_path, ep_folder, activity_path, analysis_name, tads_path) ep_bash.write(python_cmd) else: python_cmd = 'python %senhancerPromoter.py -b %s -g %s -i %s -o %s -a %s --name %s --tads %s --top 2000\n\n' % ( pipeline_dir, bams_string, genome.upper(), input_path, ep_folder, activity_path, analysis_name, tads_path) ep_bash.write(python_cmd) ep_bash.close() return (ep_bash_path)
def makeNameDict(dataFile,roseFolder,namesList=[]):
    '''
    for each name, check for the presence of an enriched file or allEnhancer table
    these are the files required for enhancer clustering

    dataFile: pipeline_dfci data table path.
    roseFolder: folder expected to contain <name>_ROSE output dirs; created
        when it does not already exist.
    namesList: datasets to check; defaults to all datasets with a
        background (filters out control WCE datasets).
    Returns {name: {'enrichedFile': path-or-'', 'enhancerFile': path-or-''}}.
    Exits the process when a dataset has neither file.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #draw the parent folder from the dataFile
    parentFolder = utils.getParentFolder(dataFile)

    #check to see if a rose folder exists already
    if utils.formatFolder(roseFolder,False):
        roseExists = True
        roseFolder = utils.formatFolder(roseFolder,False)
    else:
        roseExists = False
        roseFolder = utils.formatFolder(roseFolder,True)

    #check namesList to see if datasets exist
    if len(namesList) == 0:
        namesList = [name for name in dataDict.keys() if dataDict[name]['background'] != 'NONE']
        #this filters out control WCE datasets

    #now check that all of the datasets at a minimum have a rose output OR enriched region file
    nameDict = {}
    for name in namesList:
        # BUGFIX: both keys are pre-initialized; previously 'enhancerFile' was
        # only ever assigned inside the roseExists branch, so the completeness
        # check below raised KeyError whenever no rose folder existed yet.
        nameDict[name] = {'enrichedFile': '', 'enhancerFile': ''}

        #assumes standard folder structure for enriched file
        enrichedFile = "%smacsEnriched/%s" % (parentFolder,dataDict[name]['enrichedMacs'])
        print(enrichedFile)
        try:
            foo = open(enrichedFile,'r')
            foo.close()
            nameDict[name]['enrichedFile'] = enrichedFile
        except IOError:
            nameDict[name]['enrichedFile'] = ''

        #roseOutput looks for standard format rose output
        #need an allEnhancers table and a region table to proceed
        #if the rose folder doesn't exist, don't bother
        if roseExists:
            roseOutputFiles = os.listdir("%s%s_ROSE" % (roseFolder,name))
            allEnhancerFileList = [x for x in roseOutputFiles if x.count("AllEnhancers.table.txt") == 1 and x[0] != '.' ] #no weird hidden or temp files
            if len(allEnhancerFileList) > 0:
                nameDict[name]['enhancerFile'] = "%s%s_ROSE/%s" % (roseFolder,name,allEnhancerFileList[0])
            else:
                nameDict[name]['enhancerFile'] = ''

        if nameDict[name]['enhancerFile'] == '' and nameDict[name]['enrichedFile'] =='':
            # typo fixed: was "ENAHNCER"
            print("INSUFFICIENT DATA TO RUN ENHANCER ANALYSIS ON %s. PLEASE MAKE SURE ROSE OUTPUT OR MACS ENRICHED REGION PEAKS FILE EXISTS" % (name))
            sys.exit()
    return nameDict
def mapMergedGFF(dataFile,nameDict,mergedGFFFile,analysisName,outputFolder,maskFile):
    '''
    calls rose on the mergedGFFFile for all datasets

    Ranks on the first (sorted) dataset; all remaining datasets and their
    backgrounds are quantified through the ROSE2 extraMap. Skips the run
    when a previous region map already exists.

    Returns the path of the 0KB stitched enhancer region map.
    Exits the process if a listed background is missing from the data
    table or if ROSE fails to produce the region map.
    '''
    dataDict= pipeline_dfci.loadDataTable(dataFile)
    roseParentFolder = "%srose/" % (outputFolder)
    utils.formatFolder(roseParentFolder,True)
    gffName = mergedGFFFile.split('/')[-1].split('.')[0]
    bashFileName = "%srose/%s_roseCall.sh" % (outputFolder,analysisName)

    #namesList is just the first dataset
    #extrmap will have to have all other datasets + their backgrounds
    namesList = sorted(nameDict.keys())  # sorted() instead of list.sort(): works on Py2 and Py3 dict views
    extraMap = []
    for name in namesList[1:]:
        if nameDict[name]['background']:
            backgroundName = dataDict[name]['background']
            # was dataDict.has_key(...): Python-2-only API, removed in Python 3
            if backgroundName in dataDict:
                extraMap+=[name,backgroundName]
            else:
                print("ERROR: UNABLE TO FIND LISTED BACKGROUND DATASET %s FOR %s" % (backgroundName,name))
                sys.exit()
        else:
            extraMap+=[name]
    print(extraMap)

    #first check to see if this has already been done
    mergedRegionMap = "%srose/%s_ROSE/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (outputFolder,namesList[0],gffName)
    print("LOOKING FOR REGION MAP AT %s" % (mergedRegionMap))
    if utils.checkOutput(mergedRegionMap,1,1):
        print("FOUND PREVIOUS REGION MAP")
        return mergedRegionMap

    bashFileName = pipeline_dfci.callRose2(dataFile,'',roseParentFolder,[namesList[0]],extraMap,mergedGFFFile,0,0,bashFileName,mask=maskFile)
    bashCommand = "bash %s" % (bashFileName)
    # announce the command before (not after) blocking on it
    print("Running enhancer mapping command:\n%s" % (bashCommand))
    os.system(bashCommand)

    if utils.checkOutput(mergedRegionMap,5,60):
        return mergedRegionMap
    else:
        print("UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE %s.\nEXITING NOW" % (mergedGFFFile))
        sys.exit()
def wrapInvasionBox(data_file, region_prefix, set_name, names_list=[], top=5000, scale_path=''):
    '''
    wrapper for the enhancer invasion boxplots

    Collects the three per-dataset peak tables written by the
    enhancerPromoter step (see 5_nb_enhancer_promoter.py) and passes them
    to r_scripts/7_enhancer_invasion_plots.R, anchoring the analysis on
    names_list[0].

    names_list: must resolve to at least 3 datasets — indices 0, 1 and 2
        are used directly; defaults to all datasets containing 'MYC'.
    scale_path: optional scale-factor table; 'NONE' is passed when empty.
    Side effects: shells out to Rscript.
    '''
    invasion_script = '%sr_scripts/7_enhancer_invasion_plots.R' % (
        projectFolder)

    #set the scale path default
    if len(scale_path) == 0:
        scale_path = 'NONE'

    dataDict = pipeline_dfci.loadDataTable(data_file)
    if len(names_list) == 0:
        names_list = [
            name for name in dataDict.keys() if name.count('MYC') > 0
        ]
        names_list.sort()
    print('running enhancer invasion analysis on:')
    print(names_list)
    print('anchoring analysis on dataset: %s' % (names_list[0]))

    #need to get paths of the three peak tables
    #assumes formatting and naming conventions of the enhancerPromoter folder (see 5_nb_enhancer_promoter.py)
    peak_0_path = '%senhancerPromoter/%s_%s/%s_%s_PEAK_TABLE.txt' % (
        projectFolder, region_prefix, names_list[0], region_prefix,
        names_list[0])
    peak_1_path = '%senhancerPromoter/%s_%s/%s_%s_PEAK_TABLE.txt' % (
        projectFolder, region_prefix, names_list[1], region_prefix,
        names_list[1])
    peak_2_path = '%senhancerPromoter/%s_%s/%s_%s_PEAK_TABLE.txt' % (
        projectFolder, region_prefix, names_list[2], region_prefix,
        names_list[2])

    analysis_name = '%s_%s' % (region_prefix, set_name)
    print(analysis_name)
    sample_string = ','.join(names_list)
    print(sample_string)
    r_cmd = 'Rscript %s %s %s %s %s %s %s %s %s' % (
        invasion_script, peak_0_path, peak_1_path, peak_2_path,
        analysis_name, sample_string, top, projectFolder, scale_path)
    print(r_cmd)
    os.system(r_cmd)
def getMedianSignalEnhancer(enhancerFile, name, dataFile):
    '''
    Return the median of the signal column (index 6) of a ROSE enhancer
    table, skipping the 6 header/comment rows at the top of the file.

    name is accepted for signature compatibility with sibling helpers;
    the data table is loaded but the median comes solely from enhancerFile.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    rows = utils.parseTable(enhancerFile, '\t')
    # ROSE tables carry 6 leading header lines before the data rows
    signal_values = [float(row[6]) for row in rows[6:]]
    return numpy.median(signal_values)
def define_enhancer_landscape(mouse_dataFile, analysisName, namesList=[]): ''' define enhancers using h3k27ac in the 3 datasets that look good: CG, SCG, THMYCN_139076 using regular ROSE2 ''' #For SCG baseline #no TSS exclusion and no stitching dataDict = pipeline_dfci.loadDataTable(mouse_dataFile) if len(namesList) == 0: namesList = [ name for name in dataDict.keys() if name.upper().count('H3K27AC') == 1 ] bamFileList = [dataDict[name]['bam'] for name in namesList] bamString = string.join(bamFileList, ',') controlBams = [dataDict[name]['background'] for name in namesList] controlFileList = [dataDict[name]['bam'] for name in controlBams] controlBamString = string.join(controlFileList, ',') bedFileList = [ macsEnrichedFolder + dataDict[name]['enrichedMacs'] for name in namesList ] bedString = string.join(bedFileList, ',') outputFolder = '%s%s/' % (metaRoseFolder, analysisName) bashFileName = '%s%s_meta_rose.sh' % (metaRoseFolder, analysisName) bashFile = open(bashFileName, 'w') bashFile.write('#!/usr/bin/bash\n\n') bashFile.write('cd %s\n' % (pipeline_dir)) metaRoseCmd = 'python %sROSE2_META.py -g mm9 -i %s -r %s -c %s -o %s -n %s' % ( pipeline_dir, bedString, bamString, controlBamString, outputFolder, analysisName) bashFile.write(metaRoseCmd + '\n') bashFile.close() region_map_path = '%s%s/%s_AllEnhancers.table.txt' % ( metaRoseFolder, analysisName, analysisName) #runs only if no output detected if not utils.checkOutput(region_map_path, 0, 0): print(bashFileName) os.system('bash %s' % (bashFileName)) return bashFileName, region_map_path, namesList
def callRoseMerged(dataFile,mergedGFFFile,name1,name2,parentFolder):
    '''
    Invoke ROSE on a merged super-enhancer GFF: rank on name1 only, and
    quantify name2 together with its background via the extraMap.
    Returns whatever pipeline_dfci.callRose returns.
    '''
    table_dict = pipeline_dfci.loadDataTable(dataFile)
    ranking_datasets = [name1]
    # second dataset plus its background get mapped but not ranked
    mapped_extras = [name2, table_dict[name2]['background']]
    return pipeline_dfci.callRose(dataFile, '', parentFolder,
                                  ranking_datasets, mapped_extras,
                                  mergedGFFFile, tss=0, stitch=0)
def getSignalVector(regionFile, name, dataFile):
    '''
    Return the per-region signal column for dataset `name` from a signal
    table, located by matching the dataset's bam file name against the
    table header. Values are returned as floats, header row excluded.
    '''
    lookup = pipeline_dfci.loadDataTable(dataFile)
    table = utils.parseTable(regionFile, '\t')
    # columns in the signal table are headed by bam file names
    bam_file_name = lookup[name]['bam'].split('/')[-1]
    column = table[0].index(bam_file_name)
    return [float(row[column]) for row in table[1:]]
def getMedianSignalEnhancer(enhancerFile,name,dataFile):
    '''
    Median of column 6 of a ROSE enhancer table (6 header rows skipped).
    Duplicate of the earlier getMedianSignalEnhancer; the data table is
    loaded for signature parity but does not affect the result.
    '''
    _ = pipeline_dfci.loadDataTable(dataFile)
    parsed = utils.parseTable(enhancerFile, '\t')
    # data rows begin after the 6 ROSE header lines; signal is column 6
    values = []
    for row in parsed[6:]:
        values.append(float(row[6]))
    return numpy.median(values)
def getSignalVector(regionFile,name,dataFile):
    '''
    Extract the float signal vector for one dataset from a region signal
    table, selecting the column whose header equals the dataset's bam
    file basename. Duplicate of the earlier getSignalVector.
    '''
    data_table = pipeline_dfci.loadDataTable(dataFile)
    region_rows = utils.parseTable(regionFile, '\t')
    bam_basename = data_table[name]['bam'].split('/')[-1]
    signal_col = region_rows[0].index(bam_basename)
    vector = []
    for data_row in region_rows[1:]:
        vector.append(float(data_row[signal_col]))
    return vector
def getMedianSignal(enhancerFile, name, dataFile):
    """
    returns the median enhancer signal of a file

    When the dataset's background exists in the data table, the signal is
    background-subtracted (column 6 minus column 7); otherwise the raw
    column 6 is used. The 6 ROSE header rows are skipped.
    """
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    enhancerTable = utils.parseTable(enhancerFile, "\t")
    backgroundName = dataDict[name]["background"]
    # was dataDict.has_key(backgroundName): Python-2-only API, removed in Python 3
    if backgroundName in dataDict:
        enhancerVector = [float(line[6]) - float(line[7]) for line in enhancerTable[6:]]
    else:
        enhancerVector = [float(line[6]) for line in enhancerTable[6:]]

    median = numpy.median(enhancerVector)
    return median
def summarizeData(dataFile,output ='',namesList= []):
    '''
    Writes a per-dataset summary table (NAME, TOTAL_READS, MAPPED_READS,
    PEAKS) for the datasets in dataFile.

    Read counts come from the TONY database (fields '67'/'68'), reported
    in millions; peak counts are only computed for H3K27AC/ATAC datasets
    (others get 'NA').
    output: destination path; defaults to <dataFile>_SUMMARY.txt.
    '''
    dataDict=pipeline_dfci.loadDataTable(dataFile)
    if len(namesList) == 0:
        namesList = dataDict.keys()
    if len(output) == 0:
        output = string.replace(dataFile,'.txt','_SUMMARY.txt')
    print('WRITING OUTPUT TO %s' % (output))
    readTable = [['NAME','TOTAL_READS','MAPPED_READS','PEAKS']]
    for name in namesList:
        print('GETTING DATA SUMMARY FOR %s' % (name))
        uniqueID = dataDict[name]['uniqueID']
        # TONY field 67 = mapped reads; reported in millions, 2 decimals
        mappedReads = round(float(pipeline_dfci.getTONYInfo(uniqueID,'67'))/1000000,2)
        # TONY field 68 = total raw reads, formatted as 'count::...'
        totalRaw = pipeline_dfci.getTONYInfo(uniqueID,'68')
        totalRaw = int(totalRaw.split('::')[0])
        totalReads = round(float(totalRaw)/1000000,2)
        #mappedReads = 0
        #totalReads = 0

        #getting the spot score
        #spotFile = '%sspot/%s_%s/%s_hg19.sorted.spot.out' % (projectFolder,uniqueID,name,uniqueID)
        #spotFile = '%sspot/%s_%s/%s_hg19.sorted.spot.out' % (projectFolder,uniqueID,name,uniqueID)
        #spotTable = utils.parseTable(spotFile,'\t')
        #spotScore = spotTable[1][0].split(' ')[-1]

        #get the peak count
        if name.count('H3K27AC') == 1 or name.count('ATAC') ==1:
            peakCollection = utils.importBoundRegion('%s%s' % (macsEnrichedFolder,dataDict[name]['enrichedMacs']),name)
            peakCount = len(peakCollection)
        else:
            peakCount = 'NA'
        newLine = [name,totalReads,mappedReads,peakCount]
        print(newLine)
        readTable.append(newLine)
    utils.unParseTable(readTable,output,'\t')
def define_enhancer_landscape(projectFolder,pipeline_dir,nb_all_chip_dataFile): ''' defines the NB enhancer baseline using H3K27ac chips from NGP, KELLY, BE2C, and SHEP21 enhancers defined using auto optimized stitching of nearby regions w/ a 2.5kb tss exclusion uses the meta rose code and writes out a .sh file for reproducibility ''' #For H3K27AC #with TSS exclusion and auto stitching dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile) analysisName = 'NB_H3K27AC' namesList = [name for name in dataDict.keys() if name.count('H3K27AC') == 1] bamFileList = [dataDict[name]['bam'] for name in namesList] bamString = string.join(bamFileList,',') controlBams = [dataDict[name]['background'] for name in namesList] controlFileList = [dataDict[name]['bam'] for name in controlBams] controlBamString = string.join(controlFileList,',') bedFileList = [macsEnrichedFolder + dataDict[name]['enrichedMacs'] for name in namesList] bedString = string.join(bedFileList,',') roseFolder = '%smeta_rose/' % (projectFolder) roseFolder = utils.formatFolder(roseFolder,True) outputFolder = '%s%s/' % (roseFolder,analysisName) bashFileName = '%s%s_meta_rose.sh' % (roseFolder,analysisName) bashFile = open(bashFileName,'w') bashFile.write('#!/usr/bin/bash\n\n') bashFile.write('cd %s\n' % (pipeline_dir)) metaRoseCmd = 'python %sROSE2_META.py -g hg19 -i %s -r %s -c %s -o %s -n %s -t 2500 --mask %s' % (pipeline_dir,bedString,bamString,controlBamString,outputFolder,analysisName,maskFile) bashFile.write(metaRoseCmd + '\n') bashFile.close() #the 4KB parameter is region_map_path = '%s%s/%s_AllEnhancers.table.txt' % (roseFolder,analysisName,analysisName) return bashFileName,region_map_path,namesList
def getMedianSignal(enhancerFile, name, dataFile):
    '''
    returns the median enhancer signal of a file

    Duplicate of the earlier getMedianSignal. Background-subtracts
    (column 6 - column 7) when the dataset's background exists in the
    data table; otherwise uses raw column 6. Skips the 6 ROSE header rows.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    backgroundName = dataDict[name]['background']
    # was dataDict.has_key(backgroundName): Python-2-only API, removed in Python 3
    if backgroundName in dataDict:
        enhancerVector = [
            float(line[6]) - float(line[7]) for line in enhancerTable[6:]
        ]
    else:
        enhancerVector = [float(line[6]) for line in enhancerTable[6:]]

    median = numpy.median(enhancerVector)
    return median
def wrapRose2Meta(data_file, input_path, parent_folder, active_gene_path='', rank_list=[], control_list=[], analysis_name=''):
    '''
    quick wrapper for Rose2Meta

    Writes (but does not run) a bash script that calls ROSE2_META on the
    pooled rank_list bams vs control_list bams at input_path regions
    (no TSS exclusion, no stitching), followed by ROSE2_geneMapper on the
    resulting AllEnhancers table (gene-list restricted when
    active_gene_path is provided).
    '''
    dataDict = pipeline_dfci.loadDataTable(data_file)

    rank_string = ','.join([dataDict[name]['bam'] for name in rank_list])
    control_string = ','.join(
        [dataDict[name]['bam'] for name in control_list])

    output_folder = utils.formatFolder(
        '%s%s' % (parent_folder, analysis_name), True)
    # -s 0 -t 0: no stitching, no TSS exclusion
    rose2_meta_cmd = '%s %sROSE2_META.py -g %s -i %s -r %s -c %s -n %s -o %s -s 0 -t 0 --mask %s' % (
        py27_path, pipeline_dir, genome, input_path, rank_string,
        control_string, analysis_name, output_folder, blacklist_path)

    all_enhancer_path = '%s%s_AllEnhancers.table.txt' % (output_folder,
                                                         analysis_name)

    if active_gene_path != '':
        rose2_map_cmd = '%s %sROSE2_geneMapper.py -g %s -i %s -l %s' % (
            py27_path, pipeline_dir, genome, all_enhancer_path,
            active_gene_path)
    else:
        rose2_map_cmd = '%s %sROSE2_geneMapper.py -g %s -i %s' % (
            py27_path, pipeline_dir, genome, all_enhancer_path)

    rose_bash_path = '%s%s_rose2_meta.sh' % (parent_folder, analysis_name)
    rose_bash = open(rose_bash_path, 'w')
    # BUGFIX: shebang was '#!/usr/bin/python' — this is a bash script
    # (matches every other generated .sh in this module)
    rose_bash.write('#!/usr/bin/bash\n\n')
    rose_bash.write('#setting up bamliquidator\n')
    rose_bash.write('\n\n#ROSE2_CMD\n')
    rose_bash.write(rose2_meta_cmd + '\n')
    rose_bash.write(rose2_map_cmd + '\n')
    rose_bash.close()
    print('Wrote ROSE2 META CMD to %s' % (rose_bash_path))
def map_shep_enhancers(shep_on_dataFile):
    '''
    for enhancers in individual systems defined by k27ac

    Writes a ROSE2 bash script for the three hard-coded SHEP H3K27AC
    timepoints (0/2/6hr) with 2.5kb TSS exclusion, and returns its path.
    '''
    dataDict = pipeline_dfci.loadDataTable(shep_on_dataFile)
    namesList = dataDict.keys()
    print(namesList)
    parentFolder = '%senhancer_rose' % (projectFolder)
    parentFolder = utils.formatFolder(parentFolder,True)
    bashFileName = '%senhancer_rose/shep_on_enhancer_rose.sh' %(projectFolder)
    # NOTE: the full dataset list above is discarded — only these three
    # timepoints get a ROSE call
    namesList = ['SHEP_0HR_H3K27AC','SHEP_2HR_H3K27AC','SHEP_6HR_H3K27AC']
    pipeline_dfci.callRose2(shep_on_dataFile,macsEnrichedFolder,parentFolder,namesList,[],'',2500,'',bashFileName,maskFile)
    return bashFileName
def map_for_heatmap(mouse_dataFile):
    '''
    to make quantification easier, all bams read lengths extended to 200

    Maps the mouse H3K27Ac/MYCN datasets over the conserved promoter and
    enhancer +/-5kb GFFs at 200 bins, rpm-normalized, using a fixed 125bp
    extension (the data is 75bp reads: 75 + 125 = 200).
    '''
    dataDict = pipeline_dfci.loadDataTable(mouse_dataFile)

    #gff files
    nb_conserved_promoter_gff_5kb_file = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.gff' % (
        gffFolder)
    nb_conserved_enhancer_gff_5kb_file = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.gff' % (
        gffFolder)

    #setting the list of gff's to map
    gffList = [
        nb_conserved_promoter_gff_5kb_file,
        nb_conserved_enhancer_gff_5kb_file,
    ]
    cellTypeList = ['CG', 'SCG', 'THMYCN1', 'THMYCN2', 'THMYCN']
    mapList = [
        'CG_H3K27Ac',
        'SCG_H3K27Ac',
        'THMYCN1_H3K27Ac',
        'THMYCN_139423_H3K27Ac',
        'THMYCN_139076_H3K27Ac',
        'THMYCN2_MYCN',
        'THMYCN_139076_MYCN',
        'THMYCN_139423_MYCN',
    ]

    #for the non spike in
    #note, this data is 75bp reads
    pipeline_dfci.mapBams(mouse_dataFile,
                          cellTypeList,
                          gffList,
                          mappedFolder,
                          nBin=200,
                          overWrite=False,
                          rpm=True,
                          nameList=mapList,
                          extension=125)
def mapMergedGFF(dataFile,nameDict,mergedGFFFile,analysisName,outputFolder,maskFile=None):
    '''
    calls rose on the mergedGFFFile for all datasets

    Older variant (no mask handling, unconditional backgrounds, uses
    callRose rather than callRose2). Ranks on the first dataset in
    nameDict; all others plus their backgrounds go through the extraMap.
    Returns the 0KB stitched enhancer region map path, or exits on failure.

    NOTE(review): nameDict key order is not sorted here, so "first
    dataset" depends on dict ordering — confirm callers don't rely on a
    specific anchor.
    '''
    dataDict= pipeline_dfci.loadDataTable(dataFile)
    roseParentFolder = "%srose/" % (outputFolder)
    gffName = mergedGFFFile.split('/')[-1].split('.')[0]
    bashFileName = "%srose/%s_roseCall.sh" % (outputFolder,analysisName)
    #namesList is just the first dataset
    #extrmap will have to have all other datasets + their backgrounds
    namesList = nameDict.keys()
    extraMap = []
    for name in namesList[1:]:
        backgroundName = dataDict[name]['background']
        extraMap+=[name,backgroundName]

    #first check to see if this has already been done
    mergedRegionMap = "%srose/%s_ROSE/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (outputFolder,namesList[0],gffName)
    if utils.checkOutput(mergedRegionMap,1,1):
        return mergedRegionMap

    bashFileName = pipeline_dfci.callRose(dataFile,'',roseParentFolder,[namesList[0]],extraMap,mergedGFFFile,0,0,bashFileName)
    bashCommand = "bash %s" % (bashFileName)
    os.system(bashCommand)
    print "Running enhancer mapping command:\n%s" % (bashCommand)

    # poll up to 5min intervals x 60 tries for the ROSE output
    if utils.checkOutput(mergedRegionMap,5,60):
        return mergedRegionMap
    else:
        print "UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE %s.\nEXITING NOW" % (mergedGFFFile)
        sys.exit()
def define_myc_landscape(projectFolder,pipeline_dir,shep_on_dataFile): ''' defines the myc baseline in shep on system across the union of all time points uses the meta rose code and writes out a .sh file for reproducibility ''' #For MYC baseline #no TSS exclusion and no stitching dataDict = pipeline_dfci.loadDataTable(shep_on_dataFile) analysisName = 'SHEP_ON_MYC' namesList = [name for name in dataDict.keys() if name.count('MYC') == 1] bamFileList = [dataDict[name]['bam'] for name in namesList] bamString = string.join(bamFileList,',') controlBams = [dataDict[name]['background'] for name in namesList] controlFileList = [dataDict[name]['bam'] for name in controlBams] controlBamString = string.join(controlFileList,',') bedFileList = [macsEnrichedFolder + dataDict[name]['enrichedMacs'] for name in namesList] bedString = string.join(bedFileList,',') roseFolder = '%smeta_rose/' % (projectFolder) roseFolder = utils.formatFolder(roseFolder,True) outputFolder = '%s%s/' % (roseFolder,analysisName) bashFileName = '%s%s_meta_rose.sh' % (roseFolder,analysisName) bashFile = open(bashFileName,'w') bashFile.write('#!/usr/bin/bash\n\n') bashFile.write('cd %s\n' % (pipeline_dir)) metaRoseCmd = 'python %sROSE2_META.py -g hg19 -i %s -r %s -c %s -o %s -n %s -t 0 -s 0 --mask %s' % (pipeline_dir,bedString,bamString,controlBamString,outputFolder,analysisName,maskFile) bashFile.write(metaRoseCmd + '\n') bashFile.close() #this is the expeceted region map output region_map_path = '%s%s/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt' % (roseFolder,analysisName,analysisName) return bashFileName,region_map_path,namesList
def plot_mm_genes(mm1s_dataFile, nb_figure_gff_path, bed_string): ''' plots all varieties and iterations of tracks for shep on data ''' #first establish the plot folder plotFolder = utils.formatFolder('%sMM1S/' % (genePlotFolder), True) plot_prefix = 'HG19_NB_FIGURE_GENES' #we also have to set the extension properly between datasets #go by data file dataDict = pipeline_dfci.loadDataTable(mm1s_dataFile) names_list = dataDict.keys() bam = utils.Bam(dataDict[names_list[0]]['bam']) read_length = bam.getReadLengths()[0] bam_extension = 200 - read_length print('For datasets in %s using an extension of %s' % (mm1s_dataFile, bam_extension)) #first do individuals for plot_group in ['MYC', 'H3K27AC']: plotList = [ name for name in dataDict.keys() if name.count(plot_group) > 0 ] plotName = '%s_MM1S_%s' % (plot_prefix, plot_group) print(plotName) pipeline_dfci.callBatchPlot(mm1s_dataFile, nb_figure_gff_path, plotName, plotFolder, plotList, uniform=True, bed=bed_string, plotType='MULTIPLE', extension=bam_extension, multiPage=False, debug=False, nameString='', rpm=True, rxGenome='')
def launchDynamicRose(analysis_name,group1_name,group2_name,group1_list,group2_list,meta_rose_folder,dynamic_rose_folder,genome,data_file,activity_path,inputGFF):
    '''
    launches meta rose

    Writes and runs a dynamicEnhancer_meta.py command comparing the two
    groups' meta-ROSE outputs (expected at <meta_rose_folder><group_name>).

    genome: genome object — `.name()` is called to get the build string.
    inputGFF: optional region restriction passed as --input when non-empty.
    Side effects: writes <analysis_name>_dynamic_meta.sh and runs it.
    NOTE(review): activity_path is accepted but never used here — confirm
    whether it should be forwarded to the dynamic command.
    '''
    project_folder = utils.formatFolder(os.path.abspath(utils.getParentFolder(data_file)),False)
    dataDict = pipeline_dfci.loadDataTable(data_file)

    genome_build = genome.name()

    meta_rose_output_1 = utils.formatFolder(meta_rose_folder + group1_name,True)
    meta_rose_output_2 = utils.formatFolder(meta_rose_folder + group2_name,True)
    meta_rose_string = ','.join([meta_rose_output_1,meta_rose_output_2])

    #setting the output
    dynamic_rose_folder = utils.formatFolder(dynamic_rose_folder,True)

    group1_string = ','.join(group1_list)
    group2_string = ','.join(group2_list)

    dynamic_cmd = 'python %sdynamicEnhancer_meta.py -g %s -d %s -r %s -o %s --group1 %s --group2 %s --name1 %s --name2 %s -p -m' % (pipeline_dir,genome_build,data_file,meta_rose_string,dynamic_rose_folder,group1_string,group2_string,group1_name,group2_name)

    if len(inputGFF) > 0:
        dynamic_cmd += ' --input %s' % (inputGFF)

    bash_path ='%s%s_dynamic_meta.sh' % (dynamic_rose_folder,analysis_name)
    bash_file = open(bash_path,'w')
    bash_file.write('#!/usr/bin/bash\n\n')
    bash_file.write('cd %s\n\n' % (pipeline_dir))
    bash_file.write(dynamic_cmd)
    bash_file.write('\n\n')
    bash_file.close()

    print('Wrote DYNAMIC_META command for %s to %s' % (analysis_name,bash_path))
    print('Launching DYNAMIC_META_ROSE')
    os.system('bash %s' % (bash_path))
def make_summary_table(data_file_list, output, bed_path=''):
    '''
    Exports a table with name, read length, million mapped reads and number
    of peaks for every dataset in every data table of data_file_list.

    output: path of the tab-delimited summary table to write
    bed_path: if non-empty, only announced here (no copy is performed in
              this function)
    '''
    print('WRITING SUMMARY OUTPUT TO %s' % (output))
    if bed_path != '':
        print('COPYING BEDS TO %s' % (bed_path))

    summary_table = [['NAME', 'READ_LENGTH', 'MAPPED_READS', 'PEAKS']]
    for data_file in data_file_list:
        print('GETTING DATA SUMMARY FOR %s' % (data_file))
        dataDict = pipeline_dfci.loadDataTable(data_file)
        #sorted() keeps this working on python 3 where keys() is a view
        names_list = sorted(dataDict.keys())
        for name in names_list:
            print(name)
            uniqueID = dataDict[name]['uniqueID']
            bam = utils.Bam(dataDict[name]['bam'])
            read_length = bam.getReadLengths()[0]
            #million mapped reads, rounded to 2 decimals
            mmr = round(float(bam.getTotalReads()) / 1000000, 2)
            #get the peak count; datasets without a peak file report 'NA'
            try:
                peak_path = '%s%s' % (macsEnrichedFolder, dataDict[name]['enrichedMacs'])
                peakCollection = utils.importBoundRegion(peak_path, name)
                peakCount = len(peakCollection)
            except IOError:
                peakCount = 'NA'
            newLine = [name, read_length, mmr, peakCount]
            summary_table.append(newLine)
    utils.unParseTable(summary_table, output, '\t')
def map_shep_for_heatmap(shep_on_dataFile):
    '''
    Maps the SHEP bams (both chiprx and regular chip) over the conserved MYCN
    region gffs so downstream quantification is straightforward; every bam is
    effectively extended to a 200bp fragment (75bp reads + 125bp extension).
    '''
    dataDict = pipeline_dfci.loadDataTable(shep_on_dataFile)

    #conserved MYCN region gffs for SHEP and SHEP21:
    #all regions, promoters only, enhancers only
    gff_templates = [
        '%sHG19_SHEP_MYCN_CONSERVED_-5kb_+5kb.gff',
        '%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-5kb_+5kb.gff',
        '%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-5kb_+5kb.gff',
        '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_-5kb_+5kb.gff',
        '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-5kb_+5kb.gff',
        '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-5kb_+5kb.gff',
    ]
    gffList = [template % (gffFolder) for template in gff_templates]

    cellTypeList = ['SHEP']
    mapList = []

    #map everything (non spike-in); 75bp reads + 125bp extension = 200bp
    pipeline_dfci.mapBams(shep_on_dataFile, cellTypeList, gffList, mappedFolder,
                          nBin=200, overWrite=False, rpm=True,
                          nameList=mapList, extension=125)
def launchMetaRose(group_name,group_list,meta_rose_folder,genome,data_file,stitch,tss):
    '''
    Builds and launches a ROSE2_META call for a group of datasets.

    group_name: label for the meta rose run (names the output folder/script)
    group_list: dataset names pooled in this meta call
    genome: genome object exposing name()/hasFeature()/returnFeature()
    stitch: fixed stitch distance, or None for automatic stitching
    tss: tss exclusion window forwarded via -t

    Writes a bash script and runs it via os.system; returns None.
    '''
    project_folder = utils.formatFolder(os.path.abspath(utils.getParentFolder(data_file)),False)
    #quick hack to make sure input beds are in the right place
    macs_folder = '%smacsEnriched/' % (project_folder)
    dataDict = pipeline_dfci.loadDataTable(data_file)

    meta_rose_output = utils.formatFolder(meta_rose_folder + group_name,True)
    genome_build = genome.name()

    input_string = ','.join(['%s%s' % (macs_folder,dataDict[name]['enrichedMacs']) for name in group_list])
    bam_string = ','.join([dataDict[name]['bam'] for name in group_list])

    meta_cmd = 'python %sROSE2_META.py -g %s -i %s -r %s -o %s -n %s -t %s' % (pipeline_dir,genome_build,input_string,bam_string,meta_rose_output,group_name,tss)
    #identity check is the idiomatic None test
    if stitch is not None:
        meta_cmd += ' -s %s' % (stitch)

    #adding a mask if necessary
    if genome.hasFeature('mask'):
        meta_cmd += ' --mask %s' % (genome.returnFeature('mask'))

    bash_path = '%s%s_meta_rose.sh' % (meta_rose_output,group_name)
    #'with' guarantees the script is flushed/closed before we execute it
    with open(bash_path,'w') as bash_file:
        bash_file.write('#!/usr/bin/bash\n\n')
        bash_file.write('cd %s\n\n' % (pipeline_dir))
        bash_file.write(meta_cmd)
        bash_file.write('\n\n')

    print('Wrote META_ROSE command for %s to %s' % (group_name,bash_path))
    print('Launching META_ROSE')
    os.system('bash %s' % (bash_path))
def map_nb_enhancers(nb_all_chip_dataFile):
    '''
    Runs ROSE2 on the H3K27AC datasets of the individual NB systems to define
    enhancers; returns the path of the generated rose bash script.
    '''
    dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)
    print(dataDict.keys())

    parentFolder = utils.formatFolder('%senhancer_rose' % (projectFolder), True)
    bashFileName = '%senhancer_rose/nb_enhancer_rose.sh' % (projectFolder)

    #restrict rose to the four K27AC datasets of the NB systems
    k27ac_datasets = [
        'SHEP21_0HR_H3K27AC_NOSPIKE',
        'BE2C_H3K27AC',
        'KELLY_H3K27AC',
        'NGP_H3K27AC',
    ]

    pipeline_dfci.callRose2(nb_all_chip_dataFile, macsEnrichedFolder,
                            parentFolder, k27ac_datasets, [], '', 2500, '',
                            bashFileName, maskFile)
    return bashFileName
def main():
    '''
    Command line wrapper: clusters enhancers across the datasets of a data
    table.  Parses options, establishes/launches rose output per dataset,
    merges enhancer regions, maps signal, median-corrects it and hands the
    signal table to the clustering R script.
    '''
    from optparse import OptionParser
    usage = "usage: %prog [options] -d [DATA_FILE] -i [INPUT_LIST] -r [ROSE_FOLDER] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage=usage)
    #required flags
    parser.add_option("-d", "--data", dest="data", nargs=1, default=None,
                      help="Enter a data file for datasets to be processed")
    parser.add_option("-o", "--output", dest="output", nargs=1, default=None,
                      help="specify an output folder to write results to")
    #additional options
    parser.add_option("-i", "--input", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of names to analyze. Default will be all datasets")
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Enter a name for the analysis")
    parser.add_option("-r", "--rose", dest="rose", nargs=1, default=None,
                      help="Enter a folder to detect or write rose output")
    parser.add_option("-a", "--all", dest="all", action='store_true', default=False,
                      help="flag to run analysis on ALL enhancers (this is much slower)")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="specify a fixed stitch distance for all datasets, otherwise will compute stitching automatically on each dataset")
    parser.add_option("-e", "--enhancer-type", dest="enhancer_type", nargs=1, default='super',
                      help="specify type of enhancer to analyze: super, stretch, superStretch")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=2500,
                      help="specify a tss exclusion window. default is 2500bp")
    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help='Create a mask set of regions to filter out of analysis. must be .bed or .gff format')

    (options, args) = parser.parse_args()
    print(options)
    print(args)

    if options.data and options.output:  #check to see if minimum arguments are met

        #pull in the datafile and create a datadict
        dataFile = options.data

        #now the output folder
        outputFolder = utils.formatFolder(options.output, True)  #check and create the output folder

        #now the rose folder
        if options.rose:
            roseFolder = options.rose
        else:
            roseFolder = "%srose/" % (outputFolder)

        if options.input:
            namesList = options.input.split(',')
        else:
            namesList = []

        #get the genome from any dataset (assumed uniform across the table)
        dataDict = pipeline_dfci.loadDataTable(dataFile)
        #list() so the keys view is indexable under python 3
        genome = dataDict[list(dataDict.keys())[0]]['genome']

        #check if using only supers
        if options.all:
            superOnly = False
        else:
            superOnly = True

        #get the analysis name
        if options.name:
            analysisName = options.name
        else:
            analysisName = "enhancers"

        #check for a stitching parameter
        if len(str(options.stitch)) > 0:
            stitch = str(options.stitch)
        else:
            stitch = ''

        #check for the tss parameter
        tssDistance = int(options.tss)

        #check enhancer type (str.lower replaces the removed string.lower)
        enhancerType = options.enhancer_type.lower()
        if ['super', 'superstretch', 'stretch'].count(enhancerType) == 0:
            print("ERROR: unsupported enhancer type %s" % (enhancerType))
            sys.exit()

        #see if there's a mask
        if options.mask:
            maskFile = options.mask
        else:
            maskFile = ''

        #=====================================================
        #=================SUMMARIZE INPUTS====================
        #=====================================================

        print("WORKING IN GENOME %s" % (genome))
        print("DRAWING DATA FROM %s AND ROSE FOLDER %s" % (dataFile, roseFolder))
        print("USING %s AS THE OUTPUT FOLDER" % (outputFolder))

        #=====================================================
        #==============ESTABLISH ALL WORKING FILES============
        #=====================================================

        print("\n\n\nESTABLISHING WORKING FILES")
        nameDict = makeNameDict(dataFile, roseFolder, namesList, enhancerType)
        print(nameDict)

        print("STARTING ANALYSIS ON THE FOLLOWING DATASETS:")
        print(nameDict.keys())

        #warn (do not abort) for datasets lacking rose output
        for name in nameDict.keys():
            if len(nameDict[name]['enhancerFile']) == 0:
                print("NO ROSE OUTPUT FOR %s" % (name))
                #sys.exit()

        #=====================================================
        #==============LAUNCH ENHANCER MAPPING================
        #=====================================================

        print("\n\n\nLAUNCHING ENHANCER MAPPING (IF NECESSARY)")
        nameDict = launchEnhancerMapping(dataFile, nameDict, outputFolder, roseFolder, stitch, tssDistance, enhancerType, maskFile)
        print(nameDict)

        #=====================================================
        #====================GET MEDIAN SIGNAL================
        #=====================================================

        print("\n\n\nGETTING MEDIAN ENHANCER SIGNAL FROM EACH SAMPLE")
        medianDict = makeMedianDict(nameDict)
        print(medianDict)

        #=====================================================
        #====================MERGING ENHANCERS================
        #=====================================================

        print("\n\n\nIDENTIFYING CONSENSUS ENHANCER REGIONS")
        mergedGFFFile = "%s%s_%s_-0_+0.gff" % (outputFolder, genome, analysisName)
        mergedGFFFile = mergeCollections(nameDict, analysisName, mergedGFFFile, superOnly)

        #=====================================================
        #===============MAP TO MERGED REGIONS=================
        #=====================================================

        print("\n\n\nMAPPING DATA TO CONSENSUS ENHANCER REGIONS")
        mergedRegionMap = mapMergedGFF(dataFile, nameDict, mergedGFFFile, analysisName, outputFolder, maskFile)

        #=====================================================
        #==============CORRECT FOR MEDIAN SIGNAL==============
        #=====================================================

        print("\n\n\nCREATING ENHANCER SIGNAL TABLE")
        signalTableFile = makeEnhancerSignalTable(nameDict, mergedRegionMap, medianDict, analysisName, genome, outputFolder)

        #=====================================================
        #===============CALL CLUSTERING R SCRIPT==============
        #=====================================================

        print("\n\n\nGENERATING CLUSTERING OUTPUT")
        clusterTableFile = callRScript(genome, outputFolder, analysisName, signalTableFile)
        #output should be
        #png of cluster gram with rows as genes
        #png of cluster gram of samples w/ tree
        #ordered table w/ cluster assignment
        #similarity matrix for samples

        #=====================================================
        #=============GENE MAPPING BY CLUSTER=================
        #=====================================================

        #gene mapper is resolved from this hard coded pipeline dir
        os.chdir('/ark/home/cl512/pipeline/')
        cmd = 'python /ark/home/cl512/pipeline/ROSE2_geneMapper.py -g %s -i %s' % (genome, clusterTableFile)
        os.system(cmd)

        print("FINISHED")

    else:
        parser.print_help()
        sys.exit()
def mapMergedGFF(dataFile, nameDict, mergedGFFFile, analysisName, outputFolder, maskFile):
    '''
    Calls ROSE2 on the merged gff for all datasets and returns the path of
    the resulting 0KB stitched enhancer region map.

    The first dataset (after sorting names) carries the actual rose call;
    every other dataset plus its listed background is mapped through rose's
    extra-map mechanism.  If a previous region map exists it is reused.
    Exits the process if a listed background is missing or the map never
    appears.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    roseParentFolder = "%srose/" % (outputFolder)
    utils.formatFolder(roseParentFolder, True)
    gffName = mergedGFFFile.split('/')[-1].split('.')[0]
    bashFileName = "%srose/%s_roseCall.sh" % (outputFolder, analysisName)
    #namesList[0] hosts the rose call; sorted() replaces the python 2
    #keys()/sort() pattern so this also works on python 3 dict views
    namesList = sorted(nameDict.keys())
    #extraMap holds all other datasets + their backgrounds
    extraMap = []
    for name in namesList[1:]:
        if nameDict[name]['background']:
            backgroundName = dataDict[name]['background']
            #'in' replaces the removed dict.has_key
            if backgroundName in dataDict:
                extraMap += [name, backgroundName]
            else:
                print("ERROR: UNABLE TO FIND LISTED BACKGROUND DATASET %s FOR %s" % (backgroundName, name))
                sys.exit()
        else:
            extraMap += [name]

    print(extraMap)

    #first check to see if this has already been done
    mergedRegionMap = "%srose/%s_ROSE/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (outputFolder, namesList[0], gffName)
    print("LOOKING FOR REGION MAP AT %s" % (mergedRegionMap))
    if utils.checkOutput(mergedRegionMap, 1, 1):
        print("FOUND PREVIOUS REGION MAP")
        return mergedRegionMap

    bashFileName = pipeline_dfci.callRose2(dataFile, '', roseParentFolder, [namesList[0]], extraMap, mergedGFFFile, 0, 0, bashFileName, mask=maskFile)

    bashCommand = "bash %s" % (bashFileName)
    #announce before launching so the log reflects what is about to run
    print("Running enhancer mapping command:\n%s" % (bashCommand))
    os.system(bashCommand)

    if utils.checkOutput(mergedRegionMap, 5, 60):
        return mergedRegionMap
    else:
        print("UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE %s.\nEXITING NOW" % (mergedGFFFile))
        sys.exit()
def finishRankOutput(dataFile, rankOutput, genome, mergeFolder, mergeName, name1, name2, cutOff=1.5, window=100000):
    """
    Cleans up the rank output table:

    - reformats the rank table headers and enhancer IDs
    - makes gffs of gained/lost supers beyond cutOff (plain and +/- window)
    - makes beds of gained/conserved/lost supers
    - makes a gene table annotated with gained/lost status
    - copies the rose plots into the output folder and renders bamPlot figures

    cutOff: absolute delta-rank threshold for calling gained/lost
    window: bp added on each side for the window gffs/plots
    """
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    # making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    # str.upper replaces the removed string.upper
    genome = genome.upper()

    # make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder + "output/", True)

    # bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput, "\t")

    # make a new formatted table
    header = rankEnhancerTable[0]
    header[-4] = "DELTA RANK"
    header[-3] = "IS_SUPER"
    formattedRankTable = [header]

    # the gffs
    gainedGFF = []
    lostGFF = []
    gainedWindowGFF = []
    lostWindowGFF = []

    # the beds
    gainedTrackHeader = (
        'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0'
        % (genome, name2, genome, name2, name1)
    )
    gainedBed = [[gainedTrackHeader]]
    conservedTrackHeader = (
        'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0'
        % (genome, name1, name2, genome, name1, name2)
    )
    conservedBed = [[conservedTrackHeader]]
    lostTrackHeader = (
        'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0'
        % (genome, name1, genome, name1, name2)
    )
    lostBed = [[lostTrackHeader]]

    # the genes
    geneTable = [
        [
            "GENE",
            "ENHANCER_ID",
            "ENHANCER_CHROM",
            "ENHANCER_START",
            "ENHANCER_STOP",
            header[6],
            header[7],
            header[8],
            "STATUS",
        ]
    ]

    for line in rankEnhancerTable[1:]:
        # fixing the enhancer ID
        line[0] = line[0].replace("_lociStitched", "")
        formattedRankTable.append(line)

        # getting the genes from the three gene-name columns
        geneList = []
        geneList += line[9].split(",")
        geneList += line[10].split(",")
        geneList += line[11].split(",")
        geneList = [x for x in geneList if len(x) > 0]
        geneList = utils.uniquify(geneList)
        # str.join replaces the removed string.join
        geneString = ",".join(geneList)

        bedLine = [line[1], line[2], line[3], line[0], line[-4]]

        # for gained
        if float(line[6]) > cutOff:
            gffLine = [line[1], line[0], "", line[2], line[3], "", ".", "", geneString]
            gffWindowLine = [line[1], line[0], "", int(line[2]) - window, int(line[3]) + window, "", ".", "", geneString]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        # for lost
        elif float(line[6]) < (-1 * cutOff):
            gffLine = [line[1], line[0], "", line[2], line[3], "", ".", "", geneString]
            gffWindowLine = [line[1], line[0], "", int(line[2]) - window, int(line[3]) + window, "", ".", "", geneString]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        # for conserved
        else:
            geneStatus = "CONSERVED"
            conservedBed.append(bedLine)

        # now fill in the gene Table
        for gene in geneList:
            geneTableLine = [gene, line[0], line[1], line[2], line[3], line[6], line[7], line[8], geneStatus]
            geneTable.append(geneTableLine)

    # concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    # start writing the output
    # there's the two gffs, the bed, the formatted table, the gene table

    # formatted table
    formattedFilename = "%s%s_%s_MERGED_SUPERS_RANK_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(formattedRankTable, formattedFilename, "\t")

    # gffs (// keeps the KB label an int under python 3)
    gffFolder = pipeline_dfci.formatFolder(outputFolder + "gff/", True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" % (gffFolder, genome, mergeName, name2.upper())
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % (
        gffFolder,
        genome,
        mergeName,
        name2.upper(),
        window // 1000,
        window // 1000,
    )
    gffFilename_lost = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" % (gffFolder, genome, mergeName, name1.upper())
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % (
        gffFolder,
        genome,
        mergeName,
        name1.upper(),
        window // 1000,
        window // 1000,
    )

    utils.unParseTable(gainedGFF, gffFilename_gained, "\t")
    utils.unParseTable(gainedWindowGFF, gffFilenameWindow_gained, "\t")
    utils.unParseTable(lostGFF, gffFilename_lost, "\t")
    utils.unParseTable(lostWindowGFF, gffFilenameWindow_lost, "\t")

    # bed
    bedFilename = "%s%s_%s_MERGED_SUPERS.bed" % (outputFolder, genome, mergeName)
    utils.unParseTable(fullBed, bedFilename, "\t")

    # geneTable
    geneFilename = "%s%s_%s_MERGED_SUPERS_GENE_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(geneTable, geneFilename, "\t")

    # finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_SUPERS_DELTA.pdf" % (mergeFolder, name1, outputFolder, genome, mergeName)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_SUPERS_RANK_PLOT.png" % (
        mergeFolder,
        name1,
        outputFolder,
        genome,
        mergeName,
    )
    os.system(cmd)

    # now execute the bamPlot_turbo.py commands
    bam1 = dataDict[name1]["bam"]
    bam2 = dataDict[name2]["bam"]
    bamString = "%s,%s" % (bam1, bam2)
    nameString = "%s,%s" % (name1, name2)
    colorString = "0,0,0:100,100,100"

    # change dir: bamPlot_turbo.py is resolved from this hard coded pipeline dir
    os.chdir("/ark/home/cl512/pipeline/")

    if len(gainedGFF) > 0:
        # gained command
        plotTitle = "%s_ONLY_SE" % (name2)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome, bamString, gffFilename_gained, outputFolder, nameString, colorString, plotTitle)
        os.system(cmd)

        # gained window command
        plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name2, window // 1000)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome, bamString, gffFilenameWindow_gained, outputFolder, nameString, colorString, plotTitle)
        os.system(cmd)

    if len(lostGFF) > 0:
        # lost command
        plotTitle = "%s_ONLY_SE" % (name1)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome, bamString, gffFilename_lost, outputFolder, nameString, colorString, plotTitle)
        os.system(cmd)

        # lost window command
        plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name1, window // 1000)
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome, bamString, gffFilenameWindow_lost, outputFolder, nameString, colorString, plotTitle)
        os.system(cmd)

    return
import pipeline_dfci import os import time import string # ================================================================================ # ============================GLOBAL PARAMETERS=================================== # ================================================================================ # add locations of files and global parameters in this section dataFile = "/home/clin/projects/131106_seComp/SE_TABLE_FORMATTED.txt" genome = "hg18" dataDict = pipeline_dfci.loadDataTable(dataFile) # ================================================================================ # ===================================CLASSES====================================== # ================================================================================ # user defined classes here # ================================================================================ # =================================FUNCTIONS====================================== # ================================================================================ # write your specific functions here def makeSECollection(enhancerFile, name, top=0):
def main():
    '''
    Command line wrapper: clusters enhancers across the datasets of a data
    table.  Parses options, establishes/launches rose output per dataset,
    merges enhancer regions, maps signal, median-corrects it and hands the
    signal table to the clustering R script.
    '''
    from optparse import OptionParser
    usage = "usage: %prog [options] -d [DATA_FILE] -i [INPUT_LIST] -r [ROSE_FOLDER] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage=usage)
    #required flags
    parser.add_option("-d", "--data", dest="data", nargs=1, default=None,
                      help="Enter a data file for datasets to be processed")
    parser.add_option("-o", "--output", dest="output", nargs=1, default=None,
                      help="specify an output folder to write results to")
    #additional options
    parser.add_option("-i", "--input", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of names to analyze. Default will be all datasets")
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Enter a name for the analysis")
    parser.add_option("-r", "--rose", dest="rose", nargs=1, default=None,
                      help="Enter a folder to detect or write rose output")
    parser.add_option("-a", "--all", dest="all", action='store_true', default=False,
                      help="flag to run analysis on ALL enhancers (this is much slower)")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="specify a fixed stitch distance for all datasets, otherwise will compute stitching automatically on each dataset")
    parser.add_option("-e", "--enhancer-type", dest="enhancer_type", nargs=1, default='super',
                      help="specify type of enhancer to analyze: super, stretch, superStretch")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=2500,
                      help="specify a tss exclusion window. default is 2500bp")
    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help='Create a mask set of regions to filter out of analysis. must be .bed or .gff format')

    (options, args) = parser.parse_args()
    print(options)
    print(args)

    if options.data and options.output:  #check to see if minimum arguments are met

        #pull in the datafile and create a datadict
        dataFile = options.data

        #now the output folder
        outputFolder = utils.formatFolder(options.output, True)  #check and create the output folder

        #now the rose folder
        if options.rose:
            roseFolder = options.rose
        else:
            roseFolder = "%srose/" % (outputFolder)

        if options.input:
            namesList = options.input.split(',')
        else:
            namesList = []

        #get the genome from any dataset (assumed uniform across the table)
        dataDict = pipeline_dfci.loadDataTable(dataFile)
        #list() so the keys view is indexable under python 3
        genome = dataDict[list(dataDict.keys())[0]]['genome']

        #check if using only supers
        if options.all:
            superOnly = False
        else:
            superOnly = True

        #get the analysis name
        if options.name:
            analysisName = options.name
        else:
            analysisName = "enhancers"

        #check for a stitching parameter
        if len(str(options.stitch)) > 0:
            stitch = str(options.stitch)
        else:
            stitch = ''

        #check for the tss parameter
        tssDistance = int(options.tss)

        #check enhancer type (str.lower replaces the removed string.lower)
        enhancerType = options.enhancer_type.lower()
        if ['super', 'superstretch', 'stretch'].count(enhancerType) == 0:
            print("ERROR: unsupported enhancer type %s" % (enhancerType))
            sys.exit()

        #see if there's a mask
        if options.mask:
            maskFile = options.mask
        else:
            maskFile = ''

        #=====================================================
        #=================SUMMARIZE INPUTS====================
        #=====================================================

        print("WORKING IN GENOME %s" % (genome))
        print("DRAWING DATA FROM %s AND ROSE FOLDER %s" % (dataFile, roseFolder))
        print("USING %s AS THE OUTPUT FOLDER" % (outputFolder))

        #=====================================================
        #==============ESTABLISH ALL WORKING FILES============
        #=====================================================

        print("\n\n\nESTABLISHING WORKING FILES")
        nameDict = makeNameDict(dataFile, roseFolder, namesList, enhancerType)
        print(nameDict)

        print("STARTING ANALYSIS ON THE FOLLOWING DATASETS:")
        print(nameDict.keys())

        #warn (do not abort) for datasets lacking rose output
        for name in nameDict.keys():
            if len(nameDict[name]['enhancerFile']) == 0:
                print("NO ROSE OUTPUT FOR %s" % (name))
                #sys.exit()

        #=====================================================
        #==============LAUNCH ENHANCER MAPPING================
        #=====================================================

        print("\n\n\nLAUNCHING ENHANCER MAPPING (IF NECESSARY)")
        nameDict = launchEnhancerMapping(dataFile, nameDict, outputFolder, roseFolder, stitch, tssDistance, enhancerType, maskFile)
        print(nameDict)

        #=====================================================
        #====================GET MEDIAN SIGNAL================
        #=====================================================

        print("\n\n\nGETTING MEDIAN ENHANCER SIGNAL FROM EACH SAMPLE")
        medianDict = makeMedianDict(nameDict)
        print(medianDict)

        #=====================================================
        #====================MERGING ENHANCERS================
        #=====================================================

        print("\n\n\nIDENTIFYING CONSENSUS ENHANCER REGIONS")
        mergedGFFFile = "%s%s_%s_-0_+0.gff" % (outputFolder, genome, analysisName)
        mergedGFFFile = mergeCollections(nameDict, analysisName, mergedGFFFile, superOnly)

        #=====================================================
        #===============MAP TO MERGED REGIONS=================
        #=====================================================

        print("\n\n\nMAPPING DATA TO CONSENSUS ENHANCER REGIONS")
        mergedRegionMap = mapMergedGFF(dataFile, nameDict, mergedGFFFile, analysisName, outputFolder, maskFile)

        #=====================================================
        #==============CORRECT FOR MEDIAN SIGNAL==============
        #=====================================================

        print("\n\n\nCREATING ENHANCER SIGNAL TABLE")
        signalTableFile = makeEnhancerSignalTable(nameDict, mergedRegionMap, medianDict, analysisName, genome, outputFolder)

        #=====================================================
        #===============CALL CLUSTERING R SCRIPT==============
        #=====================================================

        print("\n\n\nGENERATING CLUSTERING OUTPUT")
        clusterTableFile = callRScript(genome, outputFolder, analysisName, signalTableFile)
        #output should be
        #png of cluster gram with rows as genes
        #png of cluster gram of samples w/ tree
        #ordered table w/ cluster assignment
        #similarity matrix for samples

        #=====================================================
        #=============GENE MAPPING BY CLUSTER=================
        #=====================================================

        #gene mapper is resolved from this hard coded pipeline dir
        os.chdir('/ark/home/cl512/pipeline/')
        cmd = 'python /ark/home/cl512/pipeline/ROSE2_geneMapper.py -g %s -i %s' % (genome, clusterTableFile)
        os.system(cmd)

        print("FINISHED")

    else:
        parser.print_help()
        sys.exit()
def finishRankOutput(dataFile,statOutput,diffOutput,genome,mergeFolder,mergeName,name1,name2,namesList1,namesList2,cutOff=1.0,window = 100000,superOnly=True,plotBam=True):
    '''
    Cleans up the rank/stat output tables:

    - reformats the stat and diff tables
    - makes gffs of gained/lost regions beyond cutOff that also pass the
      statistical test (plain and +/- window)
    - makes beds of gained/unchanged/lost regions
    - makes a gene table annotated with gained/lost status
    - copies the rose plots and (optionally) renders bamPlot figures

    statOutput/diffOutput: stat and diff tables produced upstream
    namesList1/namesList2: datasets in each group (bams/colors drawn per group)
    cutOff: absolute threshold on the delta column (line[-8])
    window: bp added on each side for the window gffs/plots
    superOnly: label outputs as SUPERS vs ENHANCERS
    plotBam: if True, launch bamPlot_turbo.py on the gained/lost gffs
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    #str.upper replaces the removed string.upper
    genome = genome.upper()

    #make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder+'output/',True)

    #bring in the old rank table
    rankEnhancerTable = utils.parseTable(statOutput,'\t')

    #make a new formatted table
    header = rankEnhancerTable[0]
    formattedRankTable = [header]

    #the gffs
    gainedGFF = []
    lostGFF = []
    gainedWindowGFF = []
    lostWindowGFF = []

    if superOnly:
        enhancerType = 'SUPERS'
    else:
        enhancerType = 'ENHANCERS'

    #the beds
    if superOnly:
        gainedTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome,name2,genome,name2,name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,genome,name1,name2)
        conservedBed = [[conservedTrackHeader]]
        lostTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome,name1,genome,name1,name2)
        lostBed = [[lostTrackHeader]]
    else:
        gainedTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome,name2,genome,name2,name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s enhancers" description="%s enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,genome,name1,name2)
        conservedBed = [[conservedTrackHeader]]
        lostTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome,name1,genome,name1,name2)
        lostBed = [[lostTrackHeader]]

    #the genes
    geneTable = [['GENE','ENHANCER_ID','ENHANCER_CHROM','ENHANCER_START','ENHANCER_STOP',header[6],header[7],header[8],'STATUS']]

    headerLength = len(rankEnhancerTable[0])
    for line in rankEnhancerTable[1:]:
        #fix line lengths (ragged rows are padded to the header width)
        if len(line) != headerLength:
            line += ['']*(headerLength-len(line))

        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankTable.append(line)

        #getting the genes from the last three columns
        geneList = []
        geneList += line[-1].split(',')
        geneList += line[-2].split(',')
        geneList += line[-3].split(',')
        geneList = [x for x in geneList if len(x) > 0]
        geneList = utils.uniquify(geneList)
        #str.join replaces the removed string.join
        geneString = ','.join(geneList)

        bedLine = [line[1],line[2],line[3],line[0],line[-4]]

        #for gained
        #this applies both the statistical test chosen (default fdr <= 0.05,
        #encoded in line[-4]) and the cutoff on the delta column (line[-8]);
        #the cutoff is hard wired, but we can add an option to change the test
        #stats are done in the R script. FDR norm can kinda suck if no genes
        #are considered diff
        if float(line[-8]) > cutOff and int(line[-4]) == 1:
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        #for lost
        elif float(line[-8]) < (-1 * cutOff) and int(line[-4]) == 1:
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        #for conserved
        else:
            geneStatus = 'UNCHANGED'
            conservedBed.append(bedLine)

        #now fill in the gene Table
        for gene in geneList:
            geneTableLine = [gene,line[0],line[1],line[2],line[3],line[6],line[7],line[8],geneStatus]
            geneTable.append(geneTableLine)

    #concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    #start writing the output
    #there's the two gffs, the bed, the formatted table, the gene table

    #formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankTable,formattedFilename,'\t')

    #formatted diff table
    #possible that no genes are differential
    rankEnhancerDiffTable = utils.parseTable(diffOutput,'\t')

    #make a new formatted table
    header = rankEnhancerDiffTable[0]
    formattedRankDiffTable = [header]
    for line in rankEnhancerDiffTable[1:]:
        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankDiffTable.append(line)

    formattedDiffFilename = "%s%s_%s_MERGED_%s_RANK_DIFF_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankDiffTable,formattedDiffFilename,'\t')

    #gffs (// keeps the KB label an int under python 3)
    gffFolder = pipeline_dfci.formatFolder(outputFolder+'gff/',True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,name2.upper(),enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,name2.upper(),enhancerType,window//1000,window//1000)

    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,name1.upper(),enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,name1.upper(),enhancerType,window//1000,window//1000)

    utils.unParseTable(gainedGFF,gffFilename_gained,'\t')
    utils.unParseTable(gainedWindowGFF,gffFilenameWindow_gained,'\t')
    utils.unParseTable(lostGFF,gffFilename_lost,'\t')
    utils.unParseTable(lostWindowGFF,gffFilenameWindow_lost,'\t')

    #bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(fullBed,bedFilename,'\t')

    #geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(geneTable,geneFilename,'\t')

    #finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*DELTA*.pdf %s%s_%s_MERGED_%s_DELTA.pdf" % (mergeFolder,namesList1[0],outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*REGION_GAINED*.pdf %s%s_%s_MERGED_%s_REGION_GAINED.pdf" % (mergeFolder,namesList1[0],outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*REGION_LOST*.pdf %s%s_%s_MERGED_%s_REGION_LOST.pdf" % (mergeFolder,namesList1[0],outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    #NOTE(review): this copies the *REGION_LOST* pattern into the UNCHANGED
    #pdf name — looks like a copy-paste slip (*REGION_UNCHANGED*?); behavior
    #kept as-is pending confirmation against the rose output naming
    cmd = "cp %s%s_ROSE/*REGION_LOST*.pdf %s%s_%s_MERGED_%s_REGION_UNCHANGED.pdf" % (mergeFolder,namesList1[0],outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % (mergeFolder,namesList1[0],outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    #now execute the bamPlot_turbo.py commands
    if plotBam:
        bamList1 = [dataDict[name]['bam'] for name in namesList1]
        bamList2 = [dataDict[name]['bam'] for name in namesList2]
        bamList = bamList1 + bamList2
        bamString = ','.join(bamList)

        nameList = [name1]*len(namesList1) + [name2]*len(namesList2)
        nameString = ','.join(nameList)

        print(namesList1[0])
        print(namesList2[0])
        print(namesList1)
        print(namesList2)
        print(dataDict[namesList1[0]]['color'])
        #use each group's own color when they differ; fall back to black/grey
        if dataDict[namesList1[0]]['color'] != dataDict[namesList2[0]]['color']:
            colorList = [dataDict[namesList1[0]]['color']]*len(namesList1) + [dataDict[namesList2[0]]['color']]*len(namesList2)
        else:
            colorList = ['0,0,0']*len(namesList1) + ['100,100,100']*len(namesList2)
        colorString = ':'.join(colorList)

        if len(gainedGFF) > 0:
            #gained command
            plotTitle = "%s_ONLY_SE" % (name2)
            cmd = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE' % (pipelineDir,genome,bamString,gffFilename_gained,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

            #gained window command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name2,window//1000)
            cmd = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE' % (pipelineDir,genome,bamString,gffFilenameWindow_gained,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

        if len(lostGFF) > 0:
            #lost command
            plotTitle = "%s_ONLY_SE" % (name1)
            cmd = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE' % (pipelineDir,genome,bamString,gffFilename_lost,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

            #lost window command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name1,window//1000)
            cmd = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE' % (pipelineDir,genome,bamString,gffFilenameWindow_lost,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

    return
def main():
    '''
    Entry point: parse command line arguments and run the two-group
    comparison pipeline (META_ROSE -> DYNAMIC_ROSE -> CRC).

    Relies on module-level helpers defined elsewhere in this file:
    loadGenome, launchMetaRose, launchDynamicRose, launchCRC, plus the
    pipeline_dfci and utils modules. Exits via sys.exit() on bad input
    or on a timed-out pipeline stage.
    '''
    parser = argparse.ArgumentParser(usage='%(prog)s -i DATAFILE -1 GROUP1_NAMES -2 GROUP2_NAMES')

    # required flags
    parser.add_argument("-d", "--data_table", dest="data_table", type=str,
                        help="input a data table with all datasets to be analyzed", required=True)
    parser.add_argument("-1", "--group1", dest="group1", type=str,
                        help="input a comma separated list of all datasets in group1", required=True)
    parser.add_argument("-2", "--group2", dest="group2", type=str,
                        help="input a comma separated list of all datasets in group2", required=True)

    # optional input override
    parser.add_argument("-i", "--input", dest="input", type=str,
                        help="input a gff of regions to analyze", required=False)

    # optional arguments
    parser.add_argument("-n", "--name", dest="name", type=str,
                        help="specify a name for the analysis. Default is drawn from the data table name", required=False)
    parser.add_argument("--group1-name", dest="group1_name", default='GROUP1', type=str,
                        help="Enter a name for group1. Default is 'GROUP1'", required=False)
    parser.add_argument("--group2-name", dest="group2_name", default='GROUP2', type=str,
                        help="Enter a name for group2. Default is 'GROUP2'", required=False)
    parser.add_argument("-a", "--activity", dest="activity", type=str, default='',
                        help="a table with active gene names in the first column", required=False)
    parser.add_argument("-t", "--tss", dest="tss", type=int, default=2500,
                        help="Specify a TSS exclusion distance. Default is 2500", required=False)
    parser.add_argument("-s", "--stitch", dest="stitch", type=int, default=None,
                        help="Specify a stitching distance. Default is auto stitching", required=False)
    parser.add_argument("-o", "--output", dest="output", default='./', type=str,
                        help="Enter the output folder. Default is the current working directory", required=False)
    parser.add_argument("--log", dest="log", default='', type=str,
                        help="Enter a path to log output", required=False)

    # # DEBUG OPTION TO SAVE TEMP FILES
    # parser.add_argument("--scale", dest="scale", default='',
    #                     help="Enter a comma separated list of scaling factors for your bams. Default is none")
    # parser.add_argument("--save-temp", dest="save", action='store_true', default=False,
    #                     help="If flagged will save temporary files made by bamPlot")
    # parser.add_argument("--bed", dest="bed",
    #                     help="Add a space-delimited list of bed files to plot")
    # parser.add_argument("--multi-page", dest="multi", action='store_true', default=False,
    #                     help="If flagged will create a new pdf for each region")

    args = parser.parse_args()

    #=====================================================================================
    #===============================I. PARSING ARGUMENTS==================================
    #=====================================================================================

    # pulling in the data table
    data_file = os.path.abspath(args.data_table)
    dataDict = pipeline_dfci.loadDataTable(data_file)

    # setting naming conventions: default analysis name is the data table basename
    if not args.name:
        analysis_name = data_file.split('/')[-1].split('.')[0]
    else:
        analysis_name = args.name

    # getting the optional input gff
    if args.input:
        inputGFF = args.input
    else:
        inputGFF = ''

    # getting group names
    group1_name = args.group1_name
    group2_name = args.group2_name

    # getting group1; empty tokens from stray commas are dropped
    group1_string = args.group1
    group1_list = [name for name in group1_string.split(',') if len(name) > 0]

    # getting group2
    group2_string = args.group2
    group2_list = [name for name in group2_string.split(',') if len(name) > 0]

    # checking that all datasets are in the data table
    for name in group1_list + group2_list:
        if name not in dataDict:
            print('ERROR: DATASET %s NOT FOUND IN DATA TABLE %s. EXITING NOW' % (name,data_file))
            sys.exit()

    # loading in the genome object from the data table
    # all datasets must come from a single genome build
    genome_list = utils.uniquify([dataDict[name]['genome'] for name in group1_list + group2_list])
    if len(genome_list) > 1:
        print('ERROR: ATTEMPTING TO ANALYZE DATASETS FROM MULTIPLE GENOME BUILDS. EXITING NOW.')
        sys.exit()

    # the load genome function has an assertion test to make sure the genome is supported
    genome = loadGenome(genome_list[0])

    parent_folder = utils.formatFolder(args.output,True)
    output_folder = utils.formatFolder(parent_folder + analysis_name,True)

    # these are the user defined optional arguments
    tss = int(args.tss)
    stitch = args.stitch
    print('stitch')
    print(stitch)

    # list of active genes to constrain analysis
    if len(args.activity) == 0:
        # assumes all genes are active unless told otherwise
        # BUGFIX: activity_path/activity_table were previously left undefined on
        # this branch (getActivity() is still commented out), causing a NameError
        # at the summary print and in launchDynamicRose. Default to empty values.
        #activity_path,activity_table = getActivity() # fix this function
        activity_path = ''
        activity_table = []
        print('using all active genes')
    else:
        activity_path = args.activity
        activity_table = utils.parseTable(activity_path,'\t')

    print('\n\n#======================================\n#===========I. DATA SUMMARY============\n#======================================\n')
    print('Analyzing datasets described in %s\n' % (data_file))
    print('Name for the analysis: %s\n' % (analysis_name))
    print('Using genome: %s\n' % (genome.name()))
    print('%s datasets: %s\n' % (group1_name,group1_string))
    print('%s datasets: %s\n' % (group2_name,group2_string))
    if len(activity_path) > 0:
        print('Identified %s active genes in the analysis using %s as a list of active genes' % (len(activity_table),activity_path))
    else:
        print('Identified %s active genes in the analysis using aggregate data from %s and %s' % (len(activity_table),group1_name,group2_name))
    print('Writing output to: %s\n' % (output_folder))

    #=====================================================================================
    #======================II. DEFINING CIS-REGULATORY ELEMENTS===========================
    #=====================================================================================
    print('\n\n#======================================\n#=II. MAPPING CIS-REGULATORY ELEMENTS==\n#======================================\n')

    # crc_wrapper will act at the group level and not consider individual datasets
    # since a data table is used as the input, the code will rely heavily on
    # pipeline_dfci embedded tools

    # 1. first we need to run meta rose using default parameters and check the
    # output exists for each group
    meta_rose_folder = utils.formatFolder(output_folder + 'meta_rose/',True)

    group1_output = '%s%s/%s_AllEnhancers.table.txt' % (meta_rose_folder,group1_name,group1_name)
    group2_output = '%s%s/%s_AllEnhancers.table.txt' % (meta_rose_folder,group2_name,group2_name)

    # for each output check to see if it exists; if not, launch META_ROSE
    try:
        foo = open(group1_output,'r')
    except IOError:
        print('No META_ROSE output found for %s. Running META_ROSE now' % (group1_name))
        launchMetaRose(group1_name,group1_list,meta_rose_folder,genome,data_file,stitch,tss)

    try:
        foo = open(group2_output,'r')
    except IOError:
        print('No META_ROSE output found for %s. Running META_ROSE now' % (group2_name))
        launchMetaRose(group2_name,group2_list,meta_rose_folder,genome,data_file,stitch,tss)

    # now check for completion (poll every 1 min, up to 10 min)
    if utils.checkOutput(group1_output,1,10):
        print('META_ROSE finished for %s' % (group1_name))
    else:
        print('META_ROSE timed out for %s. EXITING NOW.' % (group1_name))
        sys.exit()

    if utils.checkOutput(group2_output,1,10):
        print('META_ROSE finished for %s' % (group2_name))
    else:
        print('META_ROSE timed out for %s. EXITING NOW.' % (group2_name))
        sys.exit()

    # Meta rose does not give all regions that are SE in at least one sample
    # and can be blown out by amplicons etc...
    # sooo we need to run clustering to generate a good input gff
    # ideally we just rewrite dynamic meta to run off of clustering output
    # until we do that let's just overwrite w/ an input gff
    print('Comparing cis-regulatory landscapes of %s and %s' % (group1_name,group2_name))
    dynamic_rose_folder = utils.formatFolder(output_folder + 'dynamic_meta_rose/',True)

    # here we will use the rank table as the primary output
    dynamic_rose_output = '%soutput/%s_%s_%s_merged_MERGED_SUPERS_RANK_TABLE.txt' % (dynamic_rose_folder,genome.name(),group1_name,group2_name)

    try:
        foo = open(dynamic_rose_output,'r')
    except IOError:
        print('No DYNAMIC_ROSE output found for %s. Running DYNAMIC_ROSE now' % (analysis_name))
        launchDynamicRose(analysis_name,group1_name,group2_name,group1_list,group2_list,meta_rose_folder,dynamic_rose_folder,genome,data_file,activity_path,inputGFF)

    if utils.checkOutput(dynamic_rose_output,1,10):
        # (typo "finsihed" fixed in this message)
        print('DYNAMIC_ROSE finished for %s' % (analysis_name))
    else:
        print('DYNAMIC_ROSE analysis timed out for %s. EXITING NOW.' % (analysis_name))
        sys.exit()

    #=====================================================================================
    #======================III. IDENTIFYING TF NODES IN NETWORK===========================
    #=====================================================================================
    print('\n\n#======================================\n#===III. RUNNING CIRCUITRY ANALYSIS====\n#======================================\n')

    # now we want to call circuitry on each group... ok to have different
    # subpeaks and motif calls if as a first approximation we weight by the
    # overall enhancer
    crc_folder = utils.formatFolder('%scrc/' % (output_folder),True)

    # for all datasets combined
    all_crc_folder = utils.formatFolder('%s%s' % (crc_folder,analysis_name),True)
    launchCRC(data_file,genome,dynamic_rose_output,analysis_name,group1_list+group2_list,all_crc_folder,activity_path)

    # for group1
    group1_crc_folder = utils.formatFolder('%s%s' % (crc_folder,group1_name),True)
    launchCRC(data_file,genome,dynamic_rose_output,group1_name,group1_list,group1_crc_folder,activity_path)

    # for group2
    group2_crc_folder = utils.formatFolder('%s%s' % (crc_folder,group2_name),True)
    launchCRC(data_file,genome,dynamic_rose_output,group2_name,group2_list,group2_crc_folder,activity_path)
def makeNameDict(dataFile,roseFolder,namesList=[],enhancerType='super'):
    '''
    for each name, check for the presence of an enriched file or allEnhancer table
    these are the files required for enhancer clustering

    dataFile     -- path to a pipeline_dfci data table
    roseFolder   -- folder to look for (or create for) ROSE output
    namesList    -- dataset names to check; empty list means all non-control
                    datasets in the table (WCE/INPUT filtered out)
    enhancerType -- 'super', 'stretch', or 'superstretch'; selects which
                    AllEnhancers table filename to look for
    Returns a dict: name -> {'background': bool, 'enrichedFile': str,
    'enhancerFile': str} (paths are '' when the file is absent).
    Calls sys.exit() if a dataset has neither a ROSE table nor a MACS
    enriched-region file.
    NOTE(review): namesList=[] is a mutable default argument, but it is only
    reassigned (never mutated), so it is harmless here.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #draw the parent folder from the dataFile
    parentFolder = utils.getParentFolder(dataFile)
    print "Using %s as the parent folder" % (parentFolder)

    #check to see if a rose folder exists already
    #formatFolder(...,False) returns falsy when the folder is missing
    if utils.formatFolder(roseFolder,False):
        roseExists = True
        roseFolder = utils.formatFolder(roseFolder,False)
    else:
        roseExists = False
        roseFolder = utils.formatFolder(roseFolder,True)

    #check namesList to see if datasets exist
    if len(namesList) == 0:
        namesList = [name for name in dataDict.keys() if string.upper(name).count('WCE') ==0 and string.upper(name).count('INPUT') == 0 ]
        #if no namesList is given, this filters out WCE

    #now check that all of the datasets at a minimum have a rose output OR enriched region file
    nameDict = {}
    for name in namesList:
        nameDict[name] = {}

        #check if each dataset has a background
        backgroundName = dataDict[name]['background']
        if dataDict.has_key(backgroundName):
            nameDict[name]['background'] = True
        else:
            nameDict[name]['background'] = False

        #assumes standard folder structure for enriched file
        enrichedFile = "%smacsEnriched/%s" % (parentFolder,dataDict[name]['enrichedMacs'])
        print "Looking for macs output at %s" % (enrichedFile)

        #EAFP probe: existence check by opening the file
        try:
            foo = open(enrichedFile,'r')
            foo.close()
            nameDict[name]['enrichedFile'] = enrichedFile
        except IOError:
            nameDict[name]['enrichedFile'] = ''

        #roseOutput looks for standard format rose output
        #need an allEnhancers table and a region table to proceed
        #if the rose folder doesn't exist, don't bother
        if roseExists:
            try:
                roseOutputFiles = os.listdir("%s%s_ROSE" % (roseFolder,name))
                if enhancerType == 'super':
                    enhancerString = 'AllEnhancers.table.txt'
                if enhancerType == 'stretch':
                    enhancerString = 'AllEnhancers_Length.table.txt'
                if enhancerType == 'superstretch':
                    enhancerString = 'AllEnhancers_SuperStretch.table.txt'
                allEnhancerFileList = [x for x in roseOutputFiles if x.count(enhancerString) == 1 and x[0] != '.' ] #no weird hidden or temp files
                if len(allEnhancerFileList) > 0:
                    #take the first match when several candidates exist
                    nameDict[name]['enhancerFile'] = "%s%s_ROSE/%s" % (roseFolder,name,allEnhancerFileList[0])
                else:
                    nameDict[name]['enhancerFile'] = ''
            except OSError:
                #per-dataset ROSE folder missing
                nameDict[name]['enhancerFile']=''
        else:
            nameDict[name]['enhancerFile'] = ''

        #bail out entirely if this dataset has no usable input at all
        if nameDict[name]['enhancerFile'] == '' and nameDict[name]['enrichedFile'] =='':
            print "INSUFFICIENT DATA TO RUN ENAHNCER ANALYSIS ON %s. PLEASE MAKE SURE ROSE OUTPUT OR MACS ENRICHED REGION PEAKS FILE EXISTS" % (name)
            print nameDict[name]
            sys.exit()

    return nameDict
def makeNameDict(dataFile, roseFolder, namesList=[], enhancerType='super'):
    '''
    for each name, check for the presence of an enriched file or allEnhancer table
    these are the files required for enhancer clustering

    Returns a dict: name -> {'background': bool, 'enrichedFile': str,
    'enhancerFile': str}; empty string means the file was not found.
    Exits the process if a dataset has neither a ROSE table nor a MACS
    enriched-region file.
    NOTE(review): this is a verbatim duplicate of an earlier makeNameDict
    definition in this file; being defined later, this copy is the one that
    takes effect at runtime. Consider removing one of the two.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #draw the parent folder from the dataFile
    parentFolder = utils.getParentFolder(dataFile)
    print "Using %s as the parent folder" % (parentFolder)

    #check to see if a rose folder exists already
    if utils.formatFolder(roseFolder, False):
        roseExists = True
        roseFolder = utils.formatFolder(roseFolder, False)
    else:
        roseExists = False
        roseFolder = utils.formatFolder(roseFolder, True)

    #check namesList to see if datasets exist
    if len(namesList) == 0:
        namesList = [
            name for name in dataDict.keys()
            if string.upper(name).count('WCE') == 0
            and string.upper(name).count('INPUT') == 0
        ]
        #if no namesList is given, this filters out WCE

    #now check that all of the datasets at a minimum have a rose output OR enriched region file
    nameDict = {}
    for name in namesList:
        nameDict[name] = {}

        #check if each dataset has a background
        backgroundName = dataDict[name]['background']
        if dataDict.has_key(backgroundName):
            nameDict[name]['background'] = True
        else:
            nameDict[name]['background'] = False

        #assumes standard folder structure for enriched file
        enrichedFile = "%smacsEnriched/%s" % (parentFolder, dataDict[name]['enrichedMacs'])
        print "Looking for macs output at %s" % (enrichedFile)

        #existence probe by attempting to open the file
        try:
            foo = open(enrichedFile, 'r')
            foo.close()
            nameDict[name]['enrichedFile'] = enrichedFile
        except IOError:
            nameDict[name]['enrichedFile'] = ''

        #roseOutput looks for standard format rose output
        #need an allEnhancers table and a region table to proceed
        #if the rose folder doesn't exist, don't bother
        if roseExists:
            try:
                roseOutputFiles = os.listdir("%s%s_ROSE" % (roseFolder, name))
                if enhancerType == 'super':
                    enhancerString = 'AllEnhancers.table.txt'
                if enhancerType == 'stretch':
                    enhancerString = 'AllEnhancers_Length.table.txt'
                if enhancerType == 'superstretch':
                    enhancerString = 'AllEnhancers_SuperStretch.table.txt'
                allEnhancerFileList = [
                    x for x in roseOutputFiles
                    if x.count(enhancerString) == 1 and x[0] != '.'
                ]  #no weird hidden or temp files
                if len(allEnhancerFileList) > 0:
                    #first match wins if several candidates exist
                    nameDict[name]['enhancerFile'] = "%s%s_ROSE/%s" % (
                        roseFolder, name, allEnhancerFileList[0])
                else:
                    nameDict[name]['enhancerFile'] = ''
            except OSError:
                #per-dataset ROSE folder missing
                nameDict[name]['enhancerFile'] = ''
        else:
            nameDict[name]['enhancerFile'] = ''

        #hard stop when a dataset has no usable input at all
        if nameDict[name]['enhancerFile'] == '' and nameDict[name][
                'enrichedFile'] == '':
            print "INSUFFICIENT DATA TO RUN ENAHNCER ANALYSIS ON %s. PLEASE MAKE SURE ROSE OUTPUT OR MACS ENRICHED REGION PEAKS FILE EXISTS" % (
                name)
            print nameDict[name]
            sys.exit()

    return nameDict
#pipeline_dfci.makePipelineTable(sampleTableFile,dirPath,bamPath,dataFile) #dataDict = pipeline_dfci.loadDataTable(dataFile) #namesList = dataDict.keys() #print(namesList) #========================================================================== #=======================LOADING DATA ANNOTATION============================ #========================================================================== ##THIS SECTION LOADS A DATA TABLE. MUST BE UNCOMMENTED FOR REST OF CODE TO WORK #LOADING THE DATA TABLE dataDict = pipeline_dfci.loadDataTable(dataFile) print(dataDict.keys()) pipeline_dfci.summary(dataFile) #========================================================================== #==========================CALLING BOWTIE================================== #========================================================================== ##THIS SECTION CALLS BOWTIE ON RAW READ FILES TO GENERATE SORTED AND INDEXED BAMS IN THE BAM FOLDER #namesList = [] <- fill this in if you want to only map a subset of the data. otherwise leave blank ##SET LAUNCH TO False to debug #pipeline_dfci.makeBowtieBashJobs(dataFile,namesList,launch=True)
def mergeRoseSignal(dataFile,roseOutput,roseDict1,roseDict2,name1,name2,namesList1,namesList2,useBackground,medianScale):
    '''
    takes the rose output and merges signal

    dataFile      -- pipeline_dfci data table (used for background lookups)
    roseOutput    -- path to a *_MAP.txt signal table; column layout is assumed
                     to be [6 meta columns][namesList1 signal][namesList2 signal]
                     and, when useBackground, the matching background columns in
                     the same order appended after them — TODO confirm against
                     the ROSE mapper that produced it
    roseDict1/2   -- dicts holding a 'RegionMap' path for each group (only used
                     when medianScale)
    useBackground -- subtract each dataset's background signal (floored at 0)
                     before averaging
    medianScale   -- divide each dataset's column in the norm table by its
                     median region signal
    Writes two sibling files (MAP_MERGED.txt, MAP_NORM.txt) next to roseOutput
    and returns their paths as (output_merged, output_norm).
    '''
    print(roseOutput)
    initialMap = utils.parseTable(roseOutput,'\t')
    print(len(initialMap))

    #derive the two output paths from the input MAP.txt path
    output_merged = string.replace(roseOutput,'MAP.txt','MAP_MERGED.txt')
    output_norm = string.replace(roseOutput,'MAP.txt','MAP_NORM.txt')

    #one column for each signal
    name1Columns = range(0,len(namesList1),1)
    name2Columns = range(len(namesList1),len(namesList1+namesList2),1)
    if useBackground:
        #background columns follow all foreground columns, in the same order
        name1BackgroundColumns = range(len(namesList1 +namesList2),len(namesList1 + namesList2 + namesList1),1)
        name2BackgroundColumns = range(len(namesList1 +namesList2+namesList1),len(namesList1 + namesList2 + namesList1 + namesList2),1)

    #headers: merged table gets one averaged column per group,
    #norm table keeps one column per dataset
    mergedMap = [initialMap[0][0:6] + ['%s_SIGNAL' % (name1),'%s_SIGNAL' % (name2)]]
    normMap = [initialMap[0][0:6] + namesList1 + namesList2]
    for line in initialMap[1:]:
        signalVector = [float(x) for x in line[7:]] #we ignore the 6th column
        if useBackground:
            name1Vector = [signalVector[i] for i in name1Columns]
            name1BackgroundVector = [signalVector[i] for i in name1BackgroundColumns]
            name1NormVector = numpy.subtract(name1Vector,name1BackgroundVector).tolist()
            #now zero out any negatives
            name1NormVector = [max(0,signal) for signal in name1NormVector]
            name1Signal = numpy.mean(name1NormVector)

            name2Vector = [signalVector[i] for i in name2Columns]
            name2BackgroundVector = [signalVector[i] for i in name2BackgroundColumns]
            name2NormVector = numpy.subtract(name2Vector,name2BackgroundVector).tolist()
            #now zero out any negatives
            name2NormVector = [max(0,signal) for signal in name2NormVector]
            name2Signal = numpy.mean(name2NormVector)
        else:
            name1Vector = [signalVector[i] for i in name1Columns]
            name1Signal = numpy.mean(name1Vector)

            name2Vector = [signalVector[i] for i in name2Columns]
            name2Signal = numpy.mean(name2Vector)

        mergeLine = line[0:6] + [name1Signal,name2Signal]
        mergedMap.append(mergeLine)

        #NOTE(review): normMap stores the RAW per-dataset vectors here, not the
        #background-subtracted ones, even when useBackground is True — confirm
        #this is intentional
        normLine = line[0:6] + name1Vector + name2Vector
        normMap.append(normLine)

    if medianScale:
        #now we basically have to do the same thing to the region map for each one
        #this must have the correct name/background relationships as the original rose
        dataDict = pipeline_dfci.loadDataTable(dataFile)
        medianDict = defaultdict(float)
        #can do this for each region map
        regionMap1 = roseDict1['RegionMap']
        regionMap2 = roseDict2['RegionMap']
        print(regionMap1)
        print(regionMap2)
        for name in namesList1:
            signalVector = getSignalVector(regionMap1,name,dataFile)
            if useBackground:
                backgroundName = dataDict[name]['background']
                backgroundVector = getSignalVector(regionMap1,backgroundName,dataFile)
                normVector = numpy.subtract(signalVector,backgroundVector).tolist()
                medianDict[name] = numpy.median(normVector)
            else:
                medianDict[name] = numpy.median(signalVector)

        #for second namesList must use regionMap2
        for name in namesList2:
            signalVector = getSignalVector(regionMap2,name,dataFile)
            if useBackground:
                backgroundName = dataDict[name]['background']
                backgroundVector = getSignalVector(regionMap2,backgroundName,dataFile)
                normVector = numpy.subtract(signalVector,backgroundVector).tolist()
                medianDict[name] = numpy.median(normVector)
            else:
                medianDict[name] = numpy.median(signalVector)

        #so here we only need to adjust the normMap
        #divide every data row in each dataset's column by that dataset's median
        for name in namesList1 + namesList2:
            medianSignal = medianDict[name]
            col = normMap[0].index(name)
            for row in range(1,len(normMap)):
                signal = float(normMap[row][col])
                normMap[row][col] = float(signal)/float(medianSignal)
        print(medianDict)

    utils.unParseTable(mergedMap,output_merged,'\t')
    utils.unParseTable(normMap,output_norm,'\t')
    return output_merged,output_norm
def finishRankOutput(dataFile,rankOutput,genome,mergeFolder,mergeName,name1,name2,cutOff=1.5,window = 100000,superOnly=True,plotBam=True):
    '''
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond
    a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained loss

    dataFile   -- pipeline_dfci data table (used for bam paths when plotBam)
    rankOutput -- path to the rank table produced upstream; column 6 is the
                  delta rank used against cutOff, columns 9-11 hold gene lists
    cutOff     -- |delta rank| threshold separating gained/lost from conserved
    window     -- bp padding applied on each side for the "window" gffs
    superOnly  -- labels outputs as SUPERS vs ENHANCERS
    plotBam    -- if True, shells out to bamPlot_turbo.py for each gff
    Writes tables/gffs/beds into mergeFolder/output/ via side effects and
    returns None.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    genome = string.upper(genome)

    #make the output folder
    outputFolder =pipeline_dfci.formatFolder(mergeFolder+'output/',True)

    #bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput,'\t')

    #make a new formatted table
    header = rankEnhancerTable[0]
    header[-4] = 'DELTA RANK'
    header[-3] = 'IS_SUPER'
    formattedRankTable =[header]

    #the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    if superOnly:
        enhancerType = 'SUPERS'
    else:
        enhancerType = 'ENHANCERS'

    #the beds (UCSC track headers; red=gained, black=conserved, green=lost)
    if superOnly:
        gainedTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome,name2,genome,name2,name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,genome,name1,name2)
        conservedBed = [[conservedTrackHeader]]

        lostTrackHeader = 'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome,name1,genome,name1,name2)
        lostBed = [[lostTrackHeader]]
    else:
        gainedTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome,name2,genome,name2,name1)
        gainedBed = [[gainedTrackHeader]]
        conservedTrackHeader = 'track name="%s %s and %s enhancers" description="%s enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,genome,name1,name2)
        conservedBed = [[conservedTrackHeader]]

        lostTrackHeader = 'track name="%s %s only enhancers" description="%s enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome,name1,genome,name1,name2)
        lostBed = [[lostTrackHeader]]

    #the genes
    geneTable =[['GENE','ENHANCER_ID','ENHANCER_CHROM','ENHANCER_START','ENHANCER_STOP',header[6],header[7],header[8],'STATUS']]

    for line in rankEnhancerTable[1:]:
        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankTable.append(line)

        #getting the genes from the three gene-list columns
        geneList = []
        geneList += line[9].split(',')
        geneList += line[10].split(',')
        geneList += line[11].split(',')
        geneList = [x for x in geneList if len(x) >0]
        geneList = utils.uniquify(geneList)
        geneString = string.join(geneList,',')

        bedLine = [line[1],line[2],line[3],line[0],line[-4]]

        #for gained
        if float(line[6]) > cutOff:
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        #for lost
        elif float(line[6]) < (-1 * cutOff):
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        #for conserved
        else:
            geneStatus = 'CONSERVED'
            conservedBed.append(bedLine)

        #now fill in the gene Table
        for gene in geneList:
            geneTableLine = [gene,line[0],line[1],line[2],line[3],line[6],line[7],line[8],geneStatus]
            geneTable.append(geneTableLine)

    #concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    #start writing the output
    #there's the two gffs, the bed,the formatted table, the gene table

    #formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankTable,formattedFilename,'\t')

    #gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder+'gff/',True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,string.upper(name2),enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,string.upper(name2),enhancerType,window/1000,window/1000)

    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,string.upper(name1),enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,string.upper(name1),enhancerType,window/1000,window/1000)

    utils.unParseTable(gainedGFF,gffFilename_gained,'\t')
    utils.unParseTable(gainedWindowGFF,gffFilenameWindow_gained,'\t')
    utils.unParseTable(lostGFF,gffFilename_lost,'\t')
    utils.unParseTable(lostWindowGFF,gffFilenameWindow_lost,'\t')

    #bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(fullBed,bedFilename,'\t')

    #geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(geneTable,geneFilename,'\t')

    #finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_%s_DELTA.pdf" % (mergeFolder,name1,outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % (mergeFolder,name1,outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    #now execute the bamPlot_turbo.py commands
    if plotBam:
        bam1 = dataDict[name1]['bam']
        bam2 = dataDict[name2]['bam']
        bamString = "%s,%s" % (bam1,bam2)
        nameString = "%s,%s" % (name1,name2)
        colorString = "0,0,0:100,100,100"

        #change dir
        #NOTE(review): chdir is never restored; callers relying on cwd after
        #this function should beware
        os.chdir(pipelineDir)

        if len(gainedGFF) > 0:
            #gained command
            plotTitle = "%s_ONLY_SE" % (name2)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,gffFilename_gained,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

            #gained window command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name2,window/1000)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,gffFilenameWindow_gained,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

        if len(lostGFF) > 0:
            #lost command
            plotTitle = "%s_ONLY_SE" % (name1)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,gffFilename_lost,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

            #lost command
            plotTitle = "%s_ONLY_SE_%sKB_WINDOW" % (name1,window/1000)
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,gffFilenameWindow_lost,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

    return
def main():
    '''
    Entry point for the enhancer clustering script (optparse flavor).

    Requires -d (data table) and -o (output folder); otherwise prints help
    and exits. Drives the pipeline: makeNameDict -> launchEnhancerMapping ->
    makeMedianDict -> mergeCollections -> mapMergedGFF ->
    makeEnhancerSignalTable -> callRScript -> ROSE_geneMapper.
    NOTE(review): this file defines main() more than once; the last
    definition wins at runtime.
    '''
    from optparse import OptionParser
    usage = "usage: %prog [options] -d [DATA_FILE] -n [NAMES_LIST] -r [ROSE_FOLDER] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage = usage)

    #required flags
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter a data file for datasets to be processed")
    parser.add_option("-o","--output",dest="output",nargs =1, default = None,
                      help = "specify an output folder to write results to")

    #additional options
    parser.add_option("-i","--input", dest="input",nargs = 1, default=None,
                      help = "Enter a comma separated list of names to analyze. Default will be all datasets")
    parser.add_option("-n","--name", dest="name",nargs=1,default=None,
                      help = "Enter a name for the analysis")
    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a folder to detect or write rose output")
    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "flag to run analysis on ALL enhancers (this is much slower)")

    (options,args) = parser.parse_args()
    print(options)
    print(args)

    if options.data and options.output:
        #check to see if minimum arguments are met

        #pull in arguments

        #pull in the datafile and create a datadict
        dataFile = options.data

        #now the output folder
        outputFolder = utils.formatFolder(options.output,True) #check and create the output folder

        #now the rose folder; default to a rose/ subfolder of the output
        if options.rose:
            roseFolder = options.rose
        else:
            roseFolder = "%srose/" % (outputFolder)

        #empty namesList means all datasets in the table
        if options.input:
            namesList = options.input.split(',')
        else:
            namesList = []

        #get the genome from the first dataset in the table
        dataDict = pipeline_dfci.loadDataTable(dataFile)
        genome = dataDict[dataDict.keys()[0]]['genome']

        #check if using only supers
        if options.all:
            superOnly = False
        else:
            superOnly = True

        #get the anlysis name
        if options.name:
            analysisName = options.name
        else:
            analysisName = "enhancers"

        #=====================================================
        #=================SUMMARIZE INPUTS====================
        #=====================================================

        print "WORKING IN GENOME %s" % (genome)
        print "DRAWING DATA FROM %s AND ROSE FOLDER %s" % (dataFile,roseFolder)
        print "USING %s AS THE OUTPUT FOLDER" % (outputFolder)
        print "STARTING ANALYSIS ON THE FOLLOWING DATASETS:"
        print namesList

        #=====================================================
        #==============ESTABLISH ALL WORKING FILES============
        #=====================================================

        print "\n\n\nESTABLISHING WORKING FILES"
        nameDict = makeNameDict(dataFile,roseFolder,namesList)
        print nameDict

        #=====================================================
        #==============LAUNCH ENHANCER MAPPING================
        #=====================================================

        print "\n\n\nLAUNCHING ENHANCER MAPPING (IF NECESSARY)"
        nameDict = launchEnhancerMapping(dataFile,nameDict,outputFolder)
        print nameDict

        #=====================================================
        #====================GET MEDIAN SIGNAL================
        #=====================================================

        print "\n\n\nGETTING MEDIAN ENHANCER SIGNAL FROM EACH SAMPLE"
        medianDict = makeMedianDict(nameDict)
        print medianDict

        #=====================================================
        #====================MERGING ENHANCERS================
        #=====================================================

        print "\n\n\nIDENTIFYING CONSENSUS ENHANCER REGIONS"
        mergedGFFFile = "%s%s_%s_-0_+0.gff" % (outputFolder,genome,analysisName)
        mergeCollections(nameDict,analysisName,mergedGFFFile,superOnly)

        #=====================================================
        #===============MAP TO MERGED REGIONS=================
        #=====================================================

        print "\n\n\nMAPPING DATA TO CONSENSUS ENHANCER REGIONS"
        mergedRegionMap = mapMergedGFF(dataFile,nameDict,mergedGFFFile,analysisName,outputFolder)

        #=====================================================
        #==============CORRECT FOR MEDIAN SIGNAL==============
        #=====================================================

        print "\n\n\nCREATING ENHANCER SIGNAL TABLE"
        signalTableFile = makeEnhancerSignalTable(mergedRegionMap,medianDict,analysisName,genome,outputFolder)

        #=====================================================
        #===============CALL CLUSTERING R SCRIPT==============
        #=====================================================

        print "\n\n\nGENERATING CLUSTERING OUTPUT"
        clusterTableFile = callRScript(genome,outputFolder,analysisName,signalTableFile)
        #output should be
        #png of cluster gram with rows as genes
        #png of cluster gram of samples w/ tree
        #ordered table w/ cluster assignment
        #similarity matrix for samples

        #=====================================================
        #=============GENE MAPPING BY CLUSTER=================
        #=====================================================

        #NOTE(review): hard-coded server path to the ROSE install
        os.chdir('/ark/home/cl512/rose/')
        cmd = 'python /ark/home/cl512/rose/ROSE_geneMapper.py -g %s -i %s' % (genome,clusterTableFile)
        os.system(cmd)

        print "FINISHED"

    else:
        parser.print_help()
        sys.exit()