def wrap_meme(analysis_name):
    '''
    wrapper to run meme-chip w/ a pwm

    writes a bash script holding the meme-chip call (with SBATCH header lines)
    and returns the path of that script; the script is not launched here

    NOTE(review): projectFolder, top and fasta_path are not parameters of this
    function -- presumably module-level globals; confirm they are defined
    before this is called
    '''
    meme_path = '/storage/cylin/bin/meme/bin/meme-chip'
    pwm_path = '/storage/cylin/bin/pipeline/crc/annotation/VertebratePWMs.txt'

    meme_folder = utils.formatFolder('%smeme/' % (projectFolder), True)
    output_folder = utils.formatFolder(
        '%s%s' % (meme_folder, analysis_name), True)
    meme_bash_path = '%s%s_%s_meme.sh' % (meme_folder, analysis_name, top)

    # build the command before touching the file so that a failure here does
    # not leave a truncated script behind
    meme_cmd = '%s -meme-nmotifs 5 -spamo-skip -oc %s -db %s %s' % (
        meme_path, output_folder, pwm_path, fasta_path)

    # the with block closes the handle even if one of the writes fails
    with open(meme_bash_path, 'w') as meme_bash:
        meme_bash.write('#!/usr/bin/bash\n')
        meme_bash.write('#SBATCH -n 32\n')
        meme_bash.write('#SBATCH -p short\n')
        meme_bash.write(meme_cmd)

    return meme_bash_path
def makeRoseDict(roseFolder):
    """
    analyzes a rose folder to try to find all of the various necessary files
    creates a dictionary with their full paths

    keys of the returned dict: AllEnhancer, super, stretch, superstretch,
    EnhancerToGene, RegionMap, bed
    each value is whatever getFile() returns for the matching file name

    prints a message and exits with status 1 if the folder does not exist
    or holds no non-hidden files
    """
    # formatFolder(..., False) is falsy when the folder is missing;
    # call it once and reuse the result
    formattedFolder = utils.formatFolder(roseFolder, False)
    if not formattedFolder:
        print("Folder %s does not exist" % (roseFolder))
        sys.exit(1)  # non-zero so calling shell scripts can see the failure
    roseFolder = formattedFolder

    # no hidden files
    roseFileList = [x for x in os.listdir(roseFolder) if not x.startswith(".")]
    if not roseFileList:
        print("No files found in %s" % (roseFolder))
        sys.exit(1)

    # (dictionary key, string that must appear in the file name)
    # looked up sequentially, in this order
    fileStrings = (
        ("AllEnhancer", "AllEnhancers.table.txt"),
        ("super", "SuperEnhancers.table.txt"),
        ("stretch", "_StretchEnhancers.table.txt"),
        ("superstretch", "SuperStretchEnhancers.table.txt"),
        ("EnhancerToGene", "_SuperEnhancers_ENHANCER_TO_GENE"),
        ("RegionMap", "REGION_MAP"),
        ("bed", "Enhancers_withSuper.bed"),
    )

    roseDict = {}
    for key, fileString in fileStrings:
        roseDict[key] = getFile(fileString, roseFileList, roseFolder)

    return roseDict
def makeRoseDict(roseFolder):
    '''
    analyzes a rose folder to try to find all of the various necessary files
    creates a dictionary with their full paths

    keys of the returned dict: AllEnhancer, super, stretch, superstretch,
    EnhancerToGene, RegionMap, bed
    each value is whatever getFile() returns for the matching file name

    prints a message and exits with status 1 if the folder does not exist
    or holds no non-hidden files
    '''
    #formatFolder(...,False) is falsy when the folder is missing
    #call it once and reuse the result
    formattedFolder = utils.formatFolder(roseFolder,False)
    if not formattedFolder:
        print("Folder %s does not exist" % (roseFolder))
        sys.exit(1) #non-zero so calling shell scripts can see the failure
    roseFolder = formattedFolder

    #no hidden files
    roseFileList = [x for x in os.listdir(roseFolder) if not x.startswith('.')]
    if not roseFileList:
        print("No files found in %s" % (roseFolder))
        sys.exit(1)

    #(dictionary key, string that must appear in the file name)
    #looked up sequentially, in this order
    fileStrings = (
        ('AllEnhancer','AllEnhancers.table.txt'),
        ('super','SuperEnhancers.table.txt'),
        ('stretch','_StretchEnhancers.table.txt'),
        ('superstretch','SuperStretchEnhancers.table.txt'),
        ('EnhancerToGene','_SuperEnhancers_ENHANCER_TO_GENE'),
        ('RegionMap','REGION_MAP'),
        ('bed','Enhancers_withSuper.bed'),
    )

    roseDict = {}
    for key,fileString in fileStrings:
        roseDict[key] = getFile(fileString,roseFileList,roseFolder)

    return roseDict
def makeRoseDict(roseFolder):
    '''
    analyzes a rose folder to try to find all of the various necessary files
    creates a dictionary with their full paths

    keys of the returned dict: AllEnhancer, SuperEnhancer, EnhancerToGene,
    RegionMap, bed
    each value is whatever getFile() returns for the matching file name

    prints a message and exits with status 1 if the folder does not exist
    or holds no non-hidden files
    '''
    #formatFolder(...,False) is falsy when the folder is missing
    #call it once and reuse the result
    formattedFolder = utils.formatFolder(roseFolder,False)
    if not formattedFolder:
        print("Folder %s does not exist" % (roseFolder))
        sys.exit(1) #non-zero so calling shell scripts can see the failure
    roseFolder = formattedFolder

    #no hidden files
    roseFileList = [x for x in os.listdir(roseFolder) if not x.startswith('.')]
    if not roseFileList:
        print("No files found in %s" % (roseFolder))
        sys.exit(1)

    #there are 5 files that we're interested in
    #(dictionary key, string that must appear in the file name)
    #looked up sequentially, in this order
    fileStrings = (
        ('AllEnhancer','AllEnhancers.table.txt'),
        ('SuperEnhancer','SuperEnhancers.table.txt'),
        ('EnhancerToGene','ENHANCER_TO_GENE'),
        ('RegionMap','REGION_MAP'),
        ('bed','Enhancers_withSuper.bed'),
    )

    roseDict = {}
    for key,fileString in fileStrings:
        roseDict[key] = getFile(fileString,roseFileList,roseFolder)

    return roseDict
def makeHeatmap(names_list, gff_list, plot_name, plot_color):
    '''
    wrapper for the heatmap and meta R script

    for every gff in gff_list, builds one Rscript call to
    5_chiprx_heatmaps.R, prints it and runs it through os.system

    names_list: dataset names; used to build the mapped gff paths and passed
                to the script as a comma separated string
    gff_list:   gff paths; only the base name (text before the first '.')
                is used
    plot_name, plot_color: passed to the R script unchanged
    '''
    meta_heat_script = '%sr_scripts/5_chiprx_heatmaps.R' % (hg19_projectFolder)
    scale_table_path = '%stables/HG19_SHEP21_CHIPRX_SCALE_FACTORS.txt' % (
        hg19_projectFolder)

    # the returned paths are not used below; the calls are kept as they were
    utils.formatFolder('%sfigures/' % (projectFolder), True)
    utils.formatFolder('%sfigures/5_chiprx_heatmaps/' % (projectFolder), True)

    names_string = ','.join(names_list)

    for gff_path in gff_list:
        gffName = gff_path.split('/')[-1].split('.')[0]

        # one mapped gff per dataset: <mappedFolder><gffName>/<gffName>_<name>.gff
        mapped_string = ','.join([
            '%s%s/%s_%s.gff' % (mappedFolder, gffName, gffName, dataset_name)
            for dataset_name in names_list
        ])

        r_args = (meta_heat_script, mapped_string, scale_table_path,
                  names_string, plot_color, gffName, plot_name, 'TRUE',
                  projectFolder)
        r_cmd = 'Rscript %s %s %s %s %s %s %s %s %s' % r_args

        print(r_cmd)
        os.system(r_cmd)
def wrapGeneMapper(data_file,names_list=[],launch=True):
    '''
    runs ROSE2 GENE MAPPER on the AllEnhancers table

    data_file:  data table loaded through pipeline_dfci.loadDataTable
    names_list: datasets to process; when empty, every dataset whose upper
                cased name contains 'H3K27AC' exactly once is used
    launch:     if True each written .sh script is run right away with bash

    for each dataset a <name>_geneMapper.sh script is written into
    <projectFolder>rose_final/<name>_ROSE/

    NOTE(review): both folders are resolved with formatFolder(...,False);
    if that returns a falsy value for a missing folder the paths built below
    would be malformed -- confirm the folders always exist at this point
    '''

    data_dict = pipeline_dfci.loadDataTable(data_file)
    parent_rose_folder = utils.formatFolder('%srose_final' % (projectFolder),False)

    #the default list is only rebound here, never mutated
    if not names_list:
        names_list = [name for name in data_dict.keys() if name.upper().count('H3K27AC') == 1]

    #find each individual all enhancer table and then call the mapper via an .sh script
    for name in names_list:
        print(name)
        dataset_rose_folder = utils.formatFolder('%s%s_ROSE' % (parent_rose_folder,name),False)
        all_enhancer_path = '%s%s_peaks_AllEnhancers.table.txt' % (dataset_rose_folder,name)

        mapper_bash_path = '%s%s_geneMapper.sh' % (dataset_rose_folder,name)
        mapper_cmd = 'python %sROSE2_geneMapper.py -g %s -i %s -f -w 100000' % (pipeline_dir,genome,all_enhancer_path)

        #the with block closes the script even if a write fails
        with open(mapper_bash_path,'w') as mapper_bash_file:
            mapper_bash_file.write('#!/usr/bin/bash\n\n\n\n')
            mapper_bash_file.write('#Running ROSE2 GENE MAPPER ON %s ALL ENHANCERS OUTPUT\n\n' % (name))
            mapper_bash_file.write(mapper_cmd+'\n')

        print('wrote gene mapper command to %s' % (mapper_bash_path))
        if launch:
            os.system('bash %s' % mapper_bash_path)
def makeNameDict(dataFile,roseFolder,namesList=[]):
    '''
    for each name, check for the presence of an enriched file or allEnhancer table
    these are the files required for enhancer clustering

    returns a dict keyed by dataset name; every entry has
        'enrichedFile': path of the macs enriched file, or '' if it cannot be opened
        'enhancerFile': path of the AllEnhancers table, or '' if none was found

    exits with status 1 if a dataset has neither file
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #draw the parent folder from the dataFile
    parentFolder = utils.getParentFolder(dataFile)

    #check to see if a rose folder exists already
    formattedRoseFolder = utils.formatFolder(roseFolder,False)
    if formattedRoseFolder:
        roseExists = True
        roseFolder = formattedRoseFolder
    else:
        roseExists = False
        roseFolder = utils.formatFolder(roseFolder,True)

    #check namesList to see if datasets exist
    #the default list is only rebound here, never mutated
    if len(namesList) == 0:
        #this filters out control WCE datasets
        namesList = [name for name in dataDict.keys() if dataDict[name]['background'] != 'NONE']

    #now check that all of the datasets at a minimum have a rose output OR enriched region file
    nameDict = {}
    for name in namesList:

        #assumes standard folder structure for enriched file
        enrichedFile = "%smacsEnriched/%s" % (parentFolder,dataDict[name]['enrichedMacs'])
        print(enrichedFile)

        #the file only counts if it can actually be opened for reading
        try:
            with open(enrichedFile,'r'):
                pass
        except IOError:
            enrichedFile = ''

        #roseOutput looks for standard format rose output
        #always start from '' so the key exists even when there is no rose folder
        #(previously this key could be missing and the check below raised KeyError)
        enhancerFile = ''
        if roseExists:
            datasetRoseFolder = "%s%s_ROSE" % (roseFolder,name)
            try:
                roseOutputFiles = os.listdir(datasetRoseFolder)
            except OSError:
                #no rose output for this particular dataset; fall back on the enriched file
                roseOutputFiles = []

            #no weird hidden or temp files
            allEnhancerFileList = [x for x in roseOutputFiles if x.count("AllEnhancers.table.txt") == 1 and x[0] != '.']
            if len(allEnhancerFileList) > 0:
                enhancerFile = "%s/%s" % (datasetRoseFolder,allEnhancerFileList[0])

        nameDict[name] = {'enrichedFile': enrichedFile,'enhancerFile': enhancerFile}

        if enhancerFile == '' and enrichedFile == '':
            print("INSUFFICIENT DATA TO RUN ENAHNCER ANALYSIS ON %s. PLEASE MAKE SURE ROSE OUTPUT OR MACS ENRICHED REGION PEAKS FILE EXISTS" % (name))
            sys.exit(1) #non-zero so calling shell scripts can see the failure

    return nameDict
def mapMergedGFF(dataFile,nameDict,mergedGFFFile,analysisName,outputFolder,maskFile):
    '''
    calls rose on the mergedGFFFile for all datasets

    the first dataset (names sorted) is handed to callRose2 as the dataset
    to run on; all remaining datasets, each followed by its background when
    it has one, are passed as the extra map list

    returns the path of the stitched region map
    returns early if that region map already exists
    exits with status 1 if a listed background is not in the data table or
    if the region map does not show up after the rose call

    NOTE(review): reads nameDict[name]['background'] -- confirm the nameDict
    passed in actually carries a 'background' key
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    roseParentFolder = "%srose/" % (outputFolder)
    utils.formatFolder(roseParentFolder,True)
    gffName = mergedGFFFile.split('/')[-1].split('.')[0]
    bashFileName = "%srose/%s_roseCall.sh" % (outputFolder,analysisName)

    #namesList[0] is the dataset rose is called on
    #extraMap will have to have all other datasets + their backgrounds
    namesList = sorted(nameDict.keys())

    extraMap = []
    for name in namesList[1:]:
        if nameDict[name]['background']:
            backgroundName = dataDict[name]['background']
            if backgroundName not in dataDict:
                print("ERROR: UNABLE TO FIND LISTED BACKGROUND DATASET %s FOR %s" % (backgroundName,name))
                sys.exit(1)
            extraMap += [name,backgroundName]
        else:
            extraMap += [name]

    print(extraMap)

    #first check to see if this has already been done
    mergedRegionMap = "%srose/%s_ROSE/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (outputFolder,namesList[0],gffName)
    print("LOOKING FOR REGION MAP AT %s" % (mergedRegionMap))

    if utils.checkOutput(mergedRegionMap,1,1):
        print("FOUND PREVIOUS REGION MAP")
        return mergedRegionMap

    bashFileName = pipeline_dfci.callRose2(dataFile,'',roseParentFolder,[namesList[0]],extraMap,mergedGFFFile,0,0,bashFileName,mask=maskFile)

    bashCommand = "bash %s" % (bashFileName)
    #announce the command before launching it, os.system blocks until it is done
    print("Running enhancer mapping command:\n%s" % (bashCommand))
    os.system(bashCommand)

    if utils.checkOutput(mergedRegionMap,5,60):
        return mergedRegionMap

    print("UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE %s.\nEXITING NOW" % (mergedGFFFile))
    sys.exit(1) #non-zero so calling shell scripts can see the failure
def findMotifs(canidateGenes, projectFolder, projectName, motifConvertFile,
               motifDatabaseFile):
    '''
    runs FIMO for every candidate gene that has at least one motif

    builds a background model with fasta-get-markov from
    <projectFolder><projectName>_SUBPEAKS.fa, then scans the same fasta with
    fimo, redirecting the output to <projectFolder>FIMO/fimo.txt

    motifConvertFile is a tab separated table; column 0 is used as the motif
    id and column 1 as the gene name

    returns the fimo command string that was run
    '''
    # dictionary of motif ids keyed on gene name
    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = {}
    for line in motifDatabase:
        motifDatabaseDict.setdefault(line[1], []).append(line[0])

    print('GENERATING TF NETWORK')

    # select the TF candidates that have motifs
    # dict membership replaces the scan over a list of all motif names
    canidateMotifs = sorted(
        gene for gene in canidateGenes if gene in motifDatabaseDict)

    print('Number of annotated canidate TFs that have motifs: ' + str(
        len(canidateMotifs)))

    subpeakFasta = projectFolder + projectName + '_SUBPEAKS.fa'
    bgFile = projectFolder + projectName + '_bg.meme'

    bgCmd = 'fasta-get-markov -m 1 < ' + subpeakFasta + ' > ' + bgFile
    subprocess.call(bgCmd, shell=True)

    utils.formatFolder(projectFolder + 'FIMO/', True)

    # collect the pieces of the fimo call and join them with single spaces
    fimoParts = ['fimo']
    for TF in canidateMotifs:
        print(TF)
        for motifId in motifDatabaseDict[TF]:
            fimoParts.append("--motif '%s'" % (str(motifId)))

    fimoParts += [
        '-verbosity 1',
        '-text',
        '-oc ' + projectFolder + 'FIMO',
        '--bgfile ' + bgFile,
        motifDatabaseFile,
        subpeakFasta,
        '> ' + projectFolder + 'FIMO/fimo.txt',
    ]
    fimoCmd = ' '.join(fimoParts)

    # subprocess.call waits for fimo to finish; its return code is not checked
    subprocess.call(fimoCmd, shell=True)

    return fimoCmd
def process_shep_rna_drop_rep(shep21_rna_dataFile, gtfFile):
    '''
    hands the SHEP21 RNA-Seq groups to makeCuffTable using the supplied gtf
    (per the original note: the hg19 ucsc refseq genes_ercc.gtf, which has
    the spike included)

    the 0HR group is listed without rep2; every other time point has 3 reps

    returns the path of the bash file passed to makeCuffTable
    '''
    analysisName = 'SHEP21'

    cufflinksFolder = utils.formatFolder(
        '%sshep21_cufflinks_no_rep2/' % (rnaFolder), True)
    bashFileName = '%sshep21_rna_seq_cuff_no_rep2.sh' % (cufflinksFolder)

    # 0HR is spelled out because its rep2 is left out
    groupList = [['SHEP21_0HR_rep1', 'SHEP21_0HR_rep3']]
    later_timepoints = ['2HR', '4HR', '6HR', '8HR', '16HR', '24HR']
    groupList += [
        ['SHEP21_%s_rep%s' % (timepoint, rep) for rep in (1, 2, 3)]
        for timepoint in later_timepoints
    ]

    makeCuffTable(shep21_rna_dataFile, analysisName, gtfFile, cufflinksFolder,
                  groupList, bashFileName)

    return bashFileName
def main():
    '''
    entry point: moves into the project folder, summarizes the chip and atac
    data tables and sets up the ROSE2 calls for the MM1S H3K27AC and MED1
    datasets
    '''
    rule = '#======================================================================'

    def print_section(title_line):
        # blank space, a three line banner, blank space
        print('\n\n')
        print(rule)
        print(title_line)
        print(rule)
        print('\n\n')

    print('main analysis for project %s' % (projectName))

    print('changing directory to project folder')
    os.chdir(projectFolder)

    print_section(
        '#==================I. LOADING DATA ANNOTATION TABLES===================')

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible

    #for chip data file
    pipeline_dfci.summary(chip_data_file)

    #for atac data file
    pipeline_dfci.summary(atac_data_file)

    print_section(
        '#==========================II. CALLING ROSE2===========================')

    #folder with macs peak output beds
    macsEnrichedFolder = '%smacsEnriched/' % (projectFolder)

    #folder to store ROSE2 output
    parentFolder = utils.formatFolder('%srose/' % (projectFolder), True)

    #calling ROSE2 on H3K27AC and MED1 defined enhancers
    namesList = ['MM1S_H3K27AC', 'MM1S_MED1']

    bash_file = '%sMM1S_ROSE_CALLS.sh' % (parentFolder)
    mask_file = '%sgenomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed' % (
        projectFolder)

    pipeline_dfci.callRose2(
        chip_data_file,
        macsEnrichedFolder,
        parentFolder,
        namesList,
        extraMap=[],
        inputFile='',
        tss=2500,
        stitch=12500,
        bashFileName=bash_file,
        mask=mask_file,
        useBackground=True)
def main():
    '''
    entry point: moves into the project folder, summarizes the three RNA-Seq
    data tables, makes the line plot figure folder and runs wrap_be2c_jq1()
    '''
    rule = '#======================================================================'

    def print_section(title_line):
        # blank space, a three line banner, blank space
        print('\n\n')
        print(rule)
        print(title_line)
        print(rule)
        print('\n\n')

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    print_section(
        '#======================I, LOADING DATA ANNOTATION======================')

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible

    #for RNA-Seq
    for rna_dataFile in (shep21_rna_dataFile, be2c_rna_drug_dataFile,
                         be2c_rna_twist_dataFile):
        pipeline_dfci.summary(rna_dataFile)

    print_section(
        '#===================II, RUNNING LINE PLOT SCRIPTS======================')

    #make the folder to store output figures
    utils.formatFolder('%sfigures/6_rna_line_plots/' % (projectFolder), True)

    #we have 3 RNA-Seq datasets
    #first is shep21 at the mycn conserved regions w/ the replicate dropped
    #and at shep21 defined regions
    #the shep21 wrapper (wrap_shep21) is currently not called here
    wrap_be2c_jq1()
def wrap_enhancer_promoter(dataFile,
                           input_path,
                           activity_path,
                           analysis_name,
                           names_list=[],
                           useBackground=True):
    '''
    runs enhancer promoter on everybody
    with the conserved regions and union of active genes

    writes a bash script with the enhancerPromoter.py call into
    <projectFolder>enhancerPromoter/ and returns the script path;
    the script is not launched here

    names_list:    datasets to use; when empty, all datasets in the data
                   table are used in sorted order. an explicit list is used
                   in the order given and is not modified
    useBackground: when True the background bam of every dataset is passed
                   with -c; when False backgrounds are not looked up at all,
                   so datasets without a background entry in the table work
    '''

    #hard coded paths
    tads_path = '%shESC_domains_hg19.bed' % (bedFolder)

    #setting the output folder
    ep_folder = utils.formatFolder('%senhancerPromoter/' % (projectFolder),
                                   True)

    dataDict = pipeline_dfci.loadDataTable(dataFile)
    if len(names_list) == 0:
        names_list = sorted(dataDict.keys())

    bams_string = ' '.join([dataDict[name]['bam'] for name in names_list])

    cmd_parts = [
        'python %senhancerPromoter.py' % (pipeline_dir),
        '-b %s' % (bams_string),
    ]

    if useBackground:
        #only resolve backgrounds when they are wanted
        background_list = [
            dataDict[dataDict[name]['background']]['bam']
            for name in names_list
        ]
        cmd_parts.append('-c %s' % (' '.join(background_list)))

    cmd_parts += [
        '-g %s' % (genome.upper()),
        '-i %s' % (input_path),
        '-o %s' % (ep_folder),
        '-a %s' % (activity_path),
        '--name %s' % (analysis_name),
        '--tads %s' % (tads_path),
        '--top 2000',
    ]
    python_cmd = ' '.join(cmd_parts) + '\n\n'

    ep_bash_path = '%s%s_enhancer_promoter.sh' % (ep_folder, analysis_name)

    #the with block closes the script even if a write fails
    with open(ep_bash_path, 'w') as ep_bash:
        ep_bash.write('#!/usr/bin/bash\n\n\n')
        ep_bash.write('#enhancer promoter analysis for %s\n\n' %
                      (analysis_name))
        ep_bash.write(python_cmd)

    return (ep_bash_path)
def buildGraph(projectFolder, projectName, motifConvertFile, refseqToNameDict,
               canidateGenes):
    '''
    import the FIMO output once it's finished
    build the networkX directed graph

    nodes are the candidate genes; for every row of
    <projectFolder>FIMO/fimo.txt (first row skipped) an edge is added from
    the gene owning the motif to the gene owning the hit region

    as a side product one bed file per source gene is written to
    <projectFolder>motifBED/ with the motif locations

    returns the graph
    '''
    # motif id (column 0) -> gene name (column 1)
    motifToGene = dict(
        (line[0], line[1])
        for line in utils.parseTable(motifConvertFile, '\t'))

    fimoTable = utils.parseTable(projectFolder + 'FIMO/fimo.txt', '\t')

    graph = nx.DiGraph(name=projectName)
    graph.add_nodes_from(canidateGenes)

    # source gene -> list of (region[1], start, stop) tuples
    motifDict = defaultdict(list)

    # columns used below: 0 = motif id, 2 = region string, 3 and 4 = offsets
    for row in fimoTable[1:]:
        source = motifToGene[row[0]]

        # the region string is '|' separated; field 0 is looked up in
        # refseqToNameDict, field 2 is added to the offsets
        region = row[2].split('|')
        target = refseqToNameDict[region[0]]

        graph.add_edge(source, target)

        regionStart = int(region[2])
        motifDict[source].append(
            (region[1], regionStart + int(row[3]), regionStart + int(row[4])))

    utils.formatFolder(projectFolder + 'motifBED/', True)

    for gene, locations in motifDict.items():
        if not locations:
            continue
        bed = [[chrom, start, stop] for (chrom, start, stop) in locations]
        filename = projectFolder + 'motifBED/' + gene + '_' + projectName + '_motifs.bed'
        utils.unParseTable(bed, filename, '\t')

    return graph
def getFile(fileString,fileList,parentFolder):
    '''
    returns full path of file from fileList containing the fileString

    fileString:   text that has to occur exactly once in the file name
    fileList:     candidate file names (no folder part)
    parentFolder: folder that is prepended to the matching name

    prints an error and exits with status 1 if the folder does not exist,
    if no file matches, or if multiple files match
    '''
    #formatFolder(...,False) is falsy when the folder is missing
    #call it once and reuse the result
    formattedFolder = utils.formatFolder(parentFolder,False)
    if not formattedFolder:
        print("ERROR: Folder %s does not exist" % (parentFolder))
        sys.exit(1) #non-zero so calling shell scripts can see the failure
    parentFolder = formattedFolder

    #a name holding fileString more than once is not counted as a match
    matchFiles = [fileName for fileName in fileList if fileName.count(fileString) == 1]

    if len(matchFiles) == 0:
        print("ERROR: No files found in %s with %s in title" % (parentFolder,fileString))
        sys.exit(1)
    if len(matchFiles) > 1:
        print("ERROR: Multiple files found in %s with %s in title" % (parentFolder,fileString))
        sys.exit(1)

    return "%s%s" % (parentFolder,matchFiles[0])
def getFile(fileString,fileList,parentFolder):
    '''
    returns full path of file from fileList containing the fileString

    fileString:   text that has to occur exactly once in the file name
    fileList:     candidate file names (no folder part)
    parentFolder: folder that is prepended to the matching name

    returns '' (after printing a warning) if no file matches
    prints an error and exits with status 1 if the folder does not exist
    or if multiple files match
    '''
    #formatFolder(...,False) is falsy when the folder is missing
    #call it once and reuse the result
    formattedFolder = utils.formatFolder(parentFolder,False)
    if not formattedFolder:
        print("ERROR: Folder %s does not exist" % (parentFolder))
        sys.exit(1) #non-zero so calling shell scripts can see the failure
    parentFolder = formattedFolder

    #a name holding fileString more than once is not counted as a match
    matchFiles = [fileName for fileName in fileList if fileName.count(fileString) == 1]

    if len(matchFiles) == 0:
        #a missing file is tolerated here, the caller gets an empty path
        print("WARNING: No files found in %s with %s in title" % (parentFolder,fileString))
        return ''
    if len(matchFiles) > 1:
        print("ERROR: Multiple files found in %s with %s in title" % (parentFolder,fileString))
        sys.exit(1)

    return "%s%s" % (parentFolder,matchFiles[0])
def callGSEA(gseaPath, gmxPath, outputFolder,analysisName,top):
    '''
    runs C2 GSEA

    writes <outputFolder><analysisName>_GSEA_cmd.sh and runs it with bash

    the script always holds a GSEA call for the 'all' gct/cls pair and,
    when top is not 'all', a second call for the top N pair
    each call is preceded by an rm -rf of earlier reports with the same label

    gseaPath: path of the gsea jar, e.g. /usr/local/bin/gsea/gsea2-2.2.2.jar
    gmxPath:  path of the gene set file, e.g. the C2 set
              /grail/annotations/gsea/c2.all.v5.1.symbols.gmt
    top:      'all' or the N used in the *_top_N.gct / .cls file names
    '''

    #one template for both calls; only the paths and the report label change
    gseaCmdTemplate = (
        'java -Xmx4000m -cp %s xtools.gsea.Gsea -res %s'
        ' -cls %s#PROMOTER_versus_DISTAL -gmx %s'
        ' -collapse false -mode Max_probe -norm meandiv -nperm 1000'
        ' -permute gene_set -rnd_type no_balance -scoring_scheme weighted'
        ' -rpt_label %s -metric Diff_of_Classes -sort real -order descending'
        ' -include_only_symbols true -make_sets true -median false -num 100'
        ' -plot_top_x 20 -rnd_seed timestamp -save_rnd_lists false'
        ' -set_max 500 -set_min 15 -zip_report false -out %s -gui false'
    )

    #'all' is always run, the top N set only if it is a different one
    topLabels = ['all']
    if top != 'all':
        topLabels.append(top)

    gseaBashFilePath = '%s%s_GSEA_cmd.sh' % (outputFolder,analysisName)

    #the with block closes the script even if a write fails
    with open(gseaBashFilePath,'w') as gseaBashFile:

        gseaBashFile.write('#!/usr/bin/bash\n\n')
        gseaBashFile.write('#COMMAND LINE GSEA CALLS FOR %s\n\n' % (analysisName))

        for topLabel in topLabels:
            gctPath = '%s%s_top_%s.gct' % (outputFolder,analysisName,topLabel)
            clsPath = '%s%s_top_%s.cls' % (outputFolder,analysisName,topLabel)
            gseaOutputFolder = utils.formatFolder('%sgsea_top_%s_c2' % (outputFolder,topLabel),True)
            rptLabel = '%s_top_%s' % (analysisName,topLabel)

            #clear out reports of earlier runs with the same label
            gseaBashFile.write('rm -rf %s/%s.Gsea* \n' % (gseaOutputFolder, rptLabel))

            gseaCmd = gseaCmdTemplate % (gseaPath,gctPath,clsPath,gmxPath,rptLabel,gseaOutputFolder)
            gseaBashFile.write(gseaCmd)
            gseaBashFile.write('\n')

    os.system('bash %s' % (gseaBashFilePath))
def launchDynamicRose(analysis_name,group1_name,group2_name,group1_list,group2_list,meta_rose_folder,dynamic_rose_folder,genome,data_file,activity_path,inputGFF):
    '''
    launches dynamic meta rose

    writes <dynamic_rose_folder><analysis_name>_dynamic_meta.sh holding a
    dynamicEnhancer_meta.py call that compares the two groups, then runs it
    with bash

    group1_name, group2_name: names of the two meta rose runs; their output
                              folders are <meta_rose_folder><group name>
    group1_list, group2_list: dataset names, passed as comma separated strings
    genome:                   object with a name() method giving the build
    inputGFF:                 optional; added as --input when not empty
    activity_path:            not used in this function
    '''

    #return value unused; call kept so an unloadable data table still
    #fails here rather than inside the launched script
    pipeline_dfci.loadDataTable(data_file)

    genome_build = genome.name()

    meta_rose_string = ','.join([
        utils.formatFolder(meta_rose_folder + group1_name,True),
        utils.formatFolder(meta_rose_folder + group2_name,True),
    ])

    #setting the output
    dynamic_rose_folder = utils.formatFolder(dynamic_rose_folder,True)

    group1_string = ','.join(group1_list)
    group2_string = ','.join(group2_list)

    dynamic_cmd = 'python %sdynamicEnhancer_meta.py -g %s -d %s -r %s -o %s --group1 %s --group2 %s --name1 %s --name2 %s -p -m' % (pipeline_dir,genome_build,data_file,meta_rose_string,dynamic_rose_folder,group1_string,group2_string,group1_name,group2_name)

    if len(inputGFF) > 0:
        dynamic_cmd += ' --input %s' % (inputGFF)

    bash_path = '%s%s_dynamic_meta.sh' % (dynamic_rose_folder,analysis_name)

    #the with block closes the script even if a write fails
    with open(bash_path,'w') as bash_file:
        bash_file.write('#!/usr/bin/bash\n\n')
        bash_file.write('cd %s\n\n' % (pipeline_dir))
        bash_file.write(dynamic_cmd)
        bash_file.write('\n\n')

    print('Wrote DYNAMIC_META command for %s to %s' % (analysis_name,bash_path))
    print('Launching DYNAMIC_META_ROSE')
    os.system('bash %s' % (bash_path))
def findMotifs(subpeakFasta, bg_path, candidate_tf_list, projectFolder,
               analysis_name, motifConvertFile, motifDatabaseFile):
    '''
    runs FIMO on the subpeak fasta for all motifs of the candidate TFs

    motifConvertFile is a tab separated table; column 0 is used as the motif
    id and column 1 as the TF name

    the fimo command is printed, saved to <projectFolder>FIMO/<analysis_name>_fimo.sh
    and run directly; the call waits until fimo is done

    candidate_tf_list is sorted in place
    raises KeyError if a candidate TF has no entry in the motif table

    returns the path of the fimo output,
    <projectFolder>FIMO/<fasta base name>_fimo.txt
    '''
    fimoFolder = utils.formatFolder(projectFolder + 'FIMO/', True)
    subpeak_name = subpeakFasta.split('/')[-1].split('.')[0]
    output = '%s%s_fimo.txt' % (fimoFolder, subpeak_name)

    # dict keyed by TF, each TF can have multiple motifs
    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = {}
    for line in motifDatabase:
        motifDatabaseDict.setdefault(line[1], []).append(line[0])

    candidate_tf_list.sort()
    print(candidate_tf_list)

    # now make a list of all motifs
    motif_list = []
    for tf in candidate_tf_list:
        motif_list += motifDatabaseDict[tf]
    motif_list = utils.uniquify(motif_list)

    fimoCmd = 'fimo'
    for motif in motif_list:
        fimoCmd += ' --motif ' + "'%s'" % (str(motif))

    # add ' --thresh 1e-5' here if you want to increase stringency
    fimoCmd += ' -verbosity 1'
    fimoCmd += ' -text'
    fimoCmd += ' -oc ' + projectFolder + 'FIMO'
    fimoCmd += ' --bgfile %s' % (bg_path)
    fimoCmd += ' ' + motifDatabaseFile + ' '
    fimoCmd += subpeakFasta
    fimoCmd += ' > ' + output
    print(fimoCmd)

    # keep a copy of the command next to the output
    # the with block closes the script even if a write fails
    fimo_bash_path = '%s%s_fimo.sh' % (fimoFolder, analysis_name)
    with open(fimo_bash_path, 'w') as fimo_bash:
        fimo_bash.write('#!/usr/bin/bash\n\n')
        fimo_bash.write(fimoCmd)

    # will wait that fimo is done to go on; the return code is not checked
    subprocess.call(fimoCmd, shell=True)

    return output
def findMotifs(subpeakFasta,bg_path,candidate_tf_list, projectFolder, analysis_name, motifConvertFile, motifDatabaseFile):
    '''
    runs FIMO on the subpeak fasta for all motifs of the candidate TFs

    motifConvertFile is a tab separated table; column 0 is used as the motif
    id and column 1 as the TF name

    the fimo command is printed, saved to <projectFolder>FIMO/<analysis_name>_fimo.sh
    and run directly; the call waits until fimo is done

    candidate_tf_list is sorted in place
    raises KeyError if a candidate TF has no entry in the motif table

    returns the path of the fimo output,
    <projectFolder>FIMO/<fasta base name>_fimo.txt
    '''
    fimoFolder = utils.formatFolder(projectFolder + 'FIMO/', True)
    subpeak_name = subpeakFasta.split('/')[-1].split('.')[0]
    output = '%s%s_fimo.txt' % (fimoFolder,subpeak_name)

    #dict keyed by TF, each TF can have multiple motifs
    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = {}
    for line in motifDatabase:
        motifDatabaseDict.setdefault(line[1], []).append(line[0])

    candidate_tf_list.sort()
    print(candidate_tf_list)

    #now make a list of all motifs
    motif_list = []
    for tf in candidate_tf_list:
        motif_list += motifDatabaseDict[tf]
    motif_list = utils.uniquify(motif_list)

    fimoCmd = 'fimo'
    for motif in motif_list:
        fimoCmd += ' --motif ' + "'%s'" % (str(motif))

    #add ' --thresh 1e-5' here if you want to increase stringency
    fimoCmd += ' -verbosity 1'
    fimoCmd += ' -text'
    fimoCmd += ' -oc ' + projectFolder + 'FIMO'
    fimoCmd += ' --bgfile %s' % (bg_path)
    fimoCmd += ' ' + motifDatabaseFile + ' '
    fimoCmd += subpeakFasta
    fimoCmd += ' > '+ output
    print(fimoCmd)

    #keep a copy of the command next to the output
    #the with block closes the script even if a write fails
    fimo_bash_path = '%s%s_fimo.sh' % (fimoFolder,analysis_name)
    with open(fimo_bash_path,'w') as fimo_bash:
        fimo_bash.write('#!/usr/bin/bash\n\n')
        fimo_bash.write(fimoCmd)

    #will wait that fimo is done to go on; the return code is not checked
    subprocess.call(fimoCmd, shell=True)

    return output
def launchEnhancerMapping(dataFile,nameDict,outputFolder,roseFolder,maskFile=''):
    '''
    launches enhancer mapping if needed from enriched region files

    nameDict: dict keyed by dataset name; each entry needs an 'enhancerFile'
              and an 'enrichedFile' value ('' meaning not available)

    for every dataset without an enhancer file, rose is called on its
    enriched file (launched in the background), then the AllEnhancers table
    of each launched dataset is checked for via utils.checkOutput

    returns nameDict with the 'enhancerFile' entries filled in
    exits with status 1 if an expected rose output is not found

    outputFolder is not used in this function
    '''

    namesList = nameDict.keys()

    #check to see if everything is good, if so return and call it a day
    if all(len(nameDict[name]['enhancerFile']) > 0 for name in namesList):
        print("ENHANCER FILE OUTPUT FOUND FOR ALL DATASETS")
        return nameDict

    #if not, have to call rose
    roseOutputFolder = utils.formatFolder(roseFolder,True)

    queueList = []
    for name in namesList:

        #nothing to do for datasets that already have rose output
        if nameDict[name]['enhancerFile'] != '':
            continue

        #get the enriched file
        enrichedFile = nameDict[name]['enrichedFile']

        #call rose
        print("CALLING ROSE FOR %s" % (name))
        bashFileName = pipeline_dfci.callRose(dataFile,'',roseOutputFolder,[name],[],enrichedFile,mask=maskFile)
        print(bashFileName)

        #the trailing & sends the job to the background so all datasets run side by side
        os.system('bash %s &' % (bashFileName))

        #add name to queue list
        queueList.append(name)

    #now check for completion of datasets
    for name in queueList:

        #check for the AllEnhancers table
        enhancerFile = "%s%s_ROSE/%s_peaks_AllEnhancers.table.txt" % (roseOutputFolder,name,name)

        print("CHECKING FOR %s ROSE OUTPUT IN %s" % (name,enhancerFile))
        if not utils.checkOutput(enhancerFile,5,60):
            print("UNABLE TO FIND ENHANCER OUTPUT FOR %s. QUITTING NOW" % (name))
            sys.exit(1) #non-zero so calling shell scripts can see the failure

        print("FOUND ENHANCER OUTPUT FOR %s" % (name))
        nameDict[name]['enhancerFile'] = enhancerFile

    return nameDict
def launchMetaRose(group_name,group_list,meta_rose_folder,genome,data_file,stitch,tss):
    '''
    launches meta rose

    writes <meta_rose_folder><group_name>/<group_name>_meta_rose.sh holding
    a ROSE2_META.py call for the datasets in group_list, then runs it with bash

    genome: object providing name(), hasFeature() and returnFeature();
            a --mask argument is added when it has a 'mask' feature
    stitch: stitching distance passed with -s; None leaves -s out
    tss:    value passed with -t
    '''

    project_folder = utils.formatFolder(os.path.abspath(utils.getParentFolder(data_file)),False)
    #quick hack to make sure input beds are in the right place
    macs_folder = '%smacsEnriched/' % (project_folder)

    dataDict = pipeline_dfci.loadDataTable(data_file)

    meta_rose_output = utils.formatFolder(meta_rose_folder + group_name,True)

    genome_build = genome.name()

    input_string = ','.join(['%s%s' % (macs_folder,dataDict[name]['enrichedMacs']) for name in group_list])
    bam_string = ','.join([dataDict[name]['bam'] for name in group_list])

    meta_cmd = 'python %sROSE2_META.py -g %s -i %s -r %s -o %s -n %s -t %s' % (pipeline_dir,genome_build,input_string,bam_string,meta_rose_output,group_name,tss)

    #identity test, so a stitch of 0 is still passed on
    if stitch is not None:
        meta_cmd += ' -s %s' % (stitch)

    #adding a mask if necessary
    if genome.hasFeature('mask'):
        meta_cmd += ' --mask %s' % (genome.returnFeature('mask'))

    bash_path = '%s%s_meta_rose.sh' % (meta_rose_output,group_name)

    #the with block closes the script even if a write fails
    with open(bash_path,'w') as bash_file:
        bash_file.write('#!/usr/bin/bash\n\n')
        bash_file.write('cd %s\n\n' % (pipeline_dir))
        bash_file.write(meta_cmd)
        bash_file.write('\n\n')

    print('Wrote META_ROSE command for %s to %s' % (group_name,bash_path))
    print('Launching META_ROSE')
    os.system('bash %s' % (bash_path))
def define_enhancer_landscape(projectFolder,pipeline_dir,nb_all_chip_dataFile):
    '''
    defines the NB enhancer baseline using H3K27ac chips from NGP, KELLY, BE2C, and SHEP21
    enhancers defined using auto optimized stitching of nearby regions
    w/ a 2.5kb tss exclusion
    uses the meta rose code and writes out a .sh file for reproducibility

    every dataset whose name contains 'H3K27AC' exactly once is used
    the script is only written, not run

    returns (path of the .sh file, path where the AllEnhancers table is
    expected, list of dataset names used)

    NOTE(review): macsEnrichedFolder and maskFile are not parameters --
    presumably module-level globals; confirm they are defined
    '''

    #For H3K27AC
    #with TSS exclusion and auto stitching

    dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)
    analysisName = 'NB_H3K27AC'
    namesList = [name for name in dataDict.keys() if name.count('H3K27AC') == 1]

    bamString = ','.join([dataDict[name]['bam'] for name in namesList])

    #bams of the background dataset listed for each chip dataset
    backgroundNames = [dataDict[name]['background'] for name in namesList]
    controlBamString = ','.join([dataDict[name]['bam'] for name in backgroundNames])

    bedString = ','.join([macsEnrichedFolder + dataDict[name]['enrichedMacs'] for name in namesList])

    roseFolder = utils.formatFolder('%smeta_rose/' % (projectFolder),True)
    outputFolder = '%s%s/' % (roseFolder,analysisName)

    metaRoseCmd = 'python %sROSE2_META.py -g hg19 -i %s -r %s -c %s -o %s -n %s -t 2500 --mask %s' % (pipeline_dir,bedString,bamString,controlBamString,outputFolder,analysisName,maskFile)

    bashFileName = '%s%s_meta_rose.sh' % (roseFolder,analysisName)

    #the with block closes the script even if a write fails
    with open(bashFileName,'w') as bashFile:
        bashFile.write('#!/usr/bin/bash\n\n')
        bashFile.write('cd %s\n' % (pipeline_dir))
        bashFile.write(metaRoseCmd + '\n')

    #path is only built here, it is not checked for existence
    region_map_path = '%s%s/%s_AllEnhancers.table.txt' % (roseFolder,analysisName,analysisName)
    return bashFileName,region_map_path,namesList
def map_shep_enhancers(shep_on_dataFile):
    '''
    for enhancers in individual systems defined by k27ac

    sets up the ROSE2 call for the 0HR, 2HR and 6HR SHEP H3K27AC datasets
    through pipeline_dfci.callRose2 and returns the bash file path that was
    handed to it

    NOTE(review): projectFolder, macsEnrichedFolder and maskFile are not
    parameters -- presumably module-level globals; confirm they are defined
    '''

    #the names in the table are only printed, the datasets used are fixed below
    dataDict = pipeline_dfci.loadDataTable(shep_on_dataFile)
    print(dataDict.keys())

    parentFolder = utils.formatFolder('%senhancer_rose' % (projectFolder),True)
    bashFileName = '%senhancer_rose/shep_on_enhancer_rose.sh' % (projectFolder)

    k27acNames = ['SHEP_0HR_H3K27AC','SHEP_2HR_H3K27AC','SHEP_6HR_H3K27AC']

    pipeline_dfci.callRose2(
        shep_on_dataFile,
        macsEnrichedFolder,
        parentFolder,
        k27acNames,
        [],
        '',
        2500,
        '',
        bashFileName,
        maskFile)

    return bashFileName
def wrapRose2Meta(data_file,
                  input_path,
                  parent_folder,
                  active_gene_path='',
                  rank_list=[],
                  control_list=[],
                  analysis_name=''):
    '''
    quick wrapper for Rose2Meta

    writes <parent_folder><analysis_name>_rose2_meta.sh containing a
    ROSE2_META.py call followed by a ROSE2_geneMapper.py call on the
    AllEnhancers table the first call is expected to produce
    nothing is executed here and nothing is returned

    data_file: data table loaded through pipeline_dfci.loadDataTable
    input_path: passed as -i to ROSE2_META.py
    parent_folder: the output folder <parent_folder><analysis_name> is created under it
    active_gene_path: optional, passed as -l to ROSE2_geneMapper.py when not ''
    rank_list: dataset names whose bams are passed as -r
    control_list: dataset names whose bams are passed as -c
    analysis_name: passed as -n, also names the output folder and the script

    relies on the module level py27_path, pipeline_dir, genome and blacklist_path
    the list defaults are only read, never mutated
    '''
    dataDict = pipeline_dfci.loadDataTable(data_file)

    rank_string = ','.join([dataDict[name]['bam'] for name in rank_list])
    control_string = ','.join(
        [dataDict[name]['bam'] for name in control_list])

    output_folder = utils.formatFolder(
        '%s%s' % (parent_folder, analysis_name), True)

    rose2_meta_cmd = '%s %sROSE2_META.py -g %s -i %s -r %s -c %s -n %s -o %s -s 0 -t 0 --mask %s' % (
        py27_path, pipeline_dir, genome, input_path, rank_string,
        control_string, analysis_name, output_folder, blacklist_path)

    all_enhancer_path = '%s%s_AllEnhancers.table.txt' % (output_folder,
                                                         analysis_name)

    # the gene list flag is only appended when a path was supplied
    rose2_map_cmd = '%s %sROSE2_geneMapper.py -g %s -i %s' % (
        py27_path, pipeline_dir, genome, all_enhancer_path)
    if active_gene_path != '':
        rose2_map_cmd += ' -l %s' % (active_gene_path)

    rose_bash_path = '%s%s_rose2_meta.sh' % (parent_folder, analysis_name)

    # the with block closes the script even if one of the writes fails
    with open(rose_bash_path, 'w') as rose_bash:
        # the file holds shell command lines, so it needs a shell shebang
        # (it used to be written with a python one)
        rose_bash.write('#!/usr/bin/bash\n\n')
        rose_bash.write('#setting up bamliquidator\n')
        rose_bash.write('\n\n#ROSE2_CMD\n')
        rose_bash.write(rose2_meta_cmd + '\n')
        rose_bash.write(rose2_map_cmd + '\n')

    print('Wrote ROSE2 META CMD to %s' % (rose_bash_path))
def process_be2c_drug_rna(be2c_rna_drug_dataFile, gtfFile):
    '''
    sets up the cufflinks table run for the BE2C drug RNA-seq samples

    the replicate groups are hard coded below, the 4HR group only lists
    replicates 2 and 3
    relies on the module level rnaFolder

    returns the path of the bash script handed to makeCuffTable
    '''
    # (sample name prefix, replicate ids) for each group, in group order
    replicate_layout = [
        ('BE2C_DMSO_B', (1, 2, 3)),
        ('BE2C_JQ1_4HR_', (2, 3)),
        ('BE2C_JQ1_8HR_', (1, 2, 3)),
        ('BE2C_JQ1_24HR_', (1, 2, 3)),
    ]
    sample_groups = [['%s%s' % (prefix, rep) for rep in reps]
                     for prefix, reps in replicate_layout]

    cuff_folder = utils.formatFolder(
        '%sbe2c_drug_cufflinks/' % (rnaFolder), True)
    script_path = '%sbe2c_drug_rna_seq_cuff.sh' % (cuff_folder)

    makeCuffTable(be2c_rna_drug_dataFile, 'BE2C_DRUG', gtfFile, cuff_folder,
                  sample_groups, script_path)

    return script_path
def define_myc_landscape(projectFolder,pipeline_dir,shep_on_dataFile):

    '''
    defines the myc baseline in shep on system across the union of all time points
    uses the meta rose code and writes out a .sh file for reproducibility

    projectFolder: project root, the script is written to <projectFolder>meta_rose/
    pipeline_dir: folder holding ROSE2_META.py, the script cd's there first
    shep_on_dataFile: data table loaded through pipeline_dfci.loadDataTable

    relies on the module level macsEnrichedFolder and maskFile

    returns (bashFileName, region_map_path, namesList)
    nothing is run here: region_map_path is only the path where the region
    map is expected once the bash script has been executed
    '''

    #For MYC baseline
    #no TSS exclusion and no stitching

    dataDict = pipeline_dfci.loadDataTable(shep_on_dataFile)
    analysisName = 'SHEP_ON_MYC'

    #only datasets whose name contains MYC exactly once
    namesList = [name for name in dataDict.keys() if name.count('MYC') == 1]

    #str.join works on python 2 and 3, string.join is gone in python 3
    bamString = ','.join([dataDict[name]['bam'] for name in namesList])

    #each dataset names its background dataset, whose bam is the control
    controlNames = [dataDict[name]['background'] for name in namesList]
    controlBamString = ','.join([dataDict[name]['bam'] for name in controlNames])

    bedString = ','.join([macsEnrichedFolder + dataDict[name]['enrichedMacs'] for name in namesList])

    roseFolder = utils.formatFolder('%smeta_rose/' % (projectFolder),True)

    outputFolder = '%s%s/' % (roseFolder,analysisName)
    bashFileName = '%s%s_meta_rose.sh' % (roseFolder,analysisName)

    metaRoseCmd = 'python %sROSE2_META.py -g hg19 -i %s -r %s -c %s -o %s -n %s -t 0 -s 0 --mask %s' % (pipeline_dir,bedString,bamString,controlBamString,outputFolder,analysisName,maskFile)

    #the with block closes the script even if one of the writes fails
    with open(bashFileName,'w') as bashFile:
        bashFile.write('#!/usr/bin/bash\n\n')
        bashFile.write('cd %s\n' % (pipeline_dir))
        bashFile.write(metaRoseCmd + '\n')

    #this is the expected region map output
    region_map_path = '%s%s/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt' % (roseFolder,analysisName,analysisName)
    return bashFileName,region_map_path,namesList
def plot_mm_genes(mm1s_dataFile, nb_figure_gff_path, bed_string):
    '''
    plots the MYC and H3K27AC tracks of the datasets in mm1s_dataFile

    mm1s_dataFile: data table loaded through pipeline_dfci.loadDataTable
    nb_figure_gff_path: gff of regions, handed to pipeline_dfci.callBatchPlot
    bed_string: passed through as the bed argument of callBatchPlot

    plots go to <genePlotFolder>MM1S/ (genePlotFolder is a module level name)
    returns nothing
    '''

    #first establish the plot folder
    plotFolder = utils.formatFolder('%sMM1S/' % (genePlotFolder), True)
    plot_prefix = 'HG19_NB_FIGURE_GENES'

    #we also have to set the extension properly between datasets
    #go by data file
    dataDict = pipeline_dfci.loadDataTable(mm1s_dataFile)

    #list() so that taking the first key also works with python 3 dict views
    names_list = list(dataDict.keys())

    #the read length of the first dataset sets the extension for all of them
    bam = utils.Bam(dataDict[names_list[0]]['bam'])
    read_length = bam.getReadLengths()[0]
    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' %
          (mm1s_dataFile, bam_extension))

    #first do individuals
    for plot_group in ['MYC', 'H3K27AC']:
        plotList = [
            name for name in dataDict.keys() if name.count(plot_group) > 0
        ]
        plotName = '%s_MM1S_%s' % (plot_prefix, plot_group)
        print(plotName)
        pipeline_dfci.callBatchPlot(mm1s_dataFile,
                                    nb_figure_gff_path,
                                    plotName,
                                    plotFolder,
                                    plotList,
                                    uniform=True,
                                    bed=bed_string,
                                    plotType='MULTIPLE',
                                    extension=bam_extension,
                                    multiPage=False,
                                    debug=False,
                                    nameString='',
                                    rpm=True,
                                    rxGenome='')
def process_be2c_twist_rna(be2c_rna_twist_dataFile, gtfFile):
    '''
    sets up the cufflinks table run for the BE2C shTWIST RNA-seq time course

    one group per time point, each made of replicates 1 to 3
    relies on the module level rnaFolder

    returns the path of the bash script handed to makeCuffTable
    '''
    time_points = ['nodox', '3HR', '6HR', '12HR', '24HR', '48HR']
    sample_groups = [['BE2C_shT_%s_rep%s' % (time_point, rep)
                      for rep in (1, 2, 3)]
                     for time_point in time_points]

    cuff_folder = utils.formatFolder(
        '%sbe2c_twist_cufflinks/' % (rnaFolder), True)
    script_path = '%sbe2c_twist_rna_seq_cuff.sh' % (cuff_folder)

    makeCuffTable(be2c_rna_twist_dataFile, 'BE2C_TWIST', gtfFile,
                  cuff_folder, sample_groups, script_path)

    return script_path
def map_nb_enhancers(nb_all_chip_dataFile):
    '''
    writes the rose2 bash script for the individual NB H3K27AC datasets

    the datasets present in the data table are printed for reference, but the
    call itself is restricted to the four hard coded datasets below
    relies on the module level projectFolder, macsEnrichedFolder and maskFile

    returns the path of the bash script handed to pipeline_dfci.callRose2
    '''
    # show what the data table contains
    print(pipeline_dfci.loadDataTable(nb_all_chip_dataFile).keys())

    rose_parent_folder = utils.formatFolder(
        '%senhancer_rose' % (projectFolder), True)
    script_path = '%senhancer_rose/nb_enhancer_rose.sh' % (projectFolder)

    # only these datasets get mapped
    nb_names = [
        'SHEP21_0HR_H3K27AC_NOSPIKE',
        'BE2C_H3K27AC',
        'KELLY_H3K27AC',
        'NGP_H3K27AC',
    ]

    pipeline_dfci.callRose2(
        nb_all_chip_dataFile,
        macsEnrichedFolder,
        rose_parent_folder,
        nb_names,
        [],
        '',
        2500,
        '',
        script_path,
        maskFile,
    )

    return script_path
def wrap_dynamic_meta(mouse_dataFile, meta_rose_1, meta_rose_2, output_folder,
                      group1_names, group2_names, name_1, name_2):
    '''
    wraps the dynamic meta enhancer analysis

    writes <output_folder><name_1>_<name_2>_dynamic.sh holding one srun call
    to dynamicEnhancer_meta.py with the genome fixed to MM9
    nothing is executed here and nothing is returned, the command and the
    script path are printed

    mouse_dataFile: passed as -d
    meta_rose_1, meta_rose_2: passed together as -r, comma separated
    output_folder: created if needed through utils.formatFolder, passed as -o
    group1_names, group2_names: lists of names, comma joined for --group1/--group2
    name_1, name_2: passed as --name1/--name2 and used in the script name

    relies on the module level projectFolder and pipeline_dir
    '''
    output_folder = utils.formatFolder(output_folder, True)

    group1_string = ','.join(group1_names)
    group2_string = ','.join(group2_names)

    bash_path = '%s%s_%s_dynamic.sh' % (output_folder, name_1, name_2)

    cmd = 'srun --mem 16000 python %sdynamicEnhancer_meta.py -g MM9 -d %s -r %s,%s -o %s --group1 %s --group2 %s --name1 %s --name2 %s' % (
        pipeline_dir, mouse_dataFile, meta_rose_1, meta_rose_2, output_folder,
        group1_string, group2_string, name_1, name_2)

    # the with block closes the script even if one of the writes fails
    with open(bash_path, 'w') as bash_file:
        bash_file.write('#!/usr/bin/bash\n\n\n')
        bash_file.write('cd %s\n\n' % (projectFolder))
        print(cmd)
        bash_file.write(cmd)

    print(bash_path)
def main():
    '''
    main run call

    maps the enhancers of a ROSE table to genes and writes the result tables

    -i and -g are required. With -r the rank-based mapping
    (mapEnhancerToGeneTop) is used and three tables are written, without it
    the traditional mapping (mapEnhancerToGene) writes two. A non default
    window adds a _<window in kb>KB suffix to every output name.
    '''
    import sys
    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_ENHANCER_FILE]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a ROSE ranked enhancer or super-enhancer file")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter the bam used to rank enhancers")
    parser.add_option("-c", "--control", dest="control", nargs=1, default='',
                      help="Enter a background bam for background correction")
    parser.add_option("-l", "--list", dest="geneList", nargs=1, default=None,
                      help="Enter a gene list to filter through")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder. Default will be same folder as input file")
    parser.add_option("-w", "--window", dest="window", nargs=1, default=50000,
                      help="Enter a search distance for genes. Default is 50,000bp")
    parser.add_option("-f", "--format", dest="formatTable", action="store_true", default=False,
                      help="If flagged, maintains original formatting of input table")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    # -r is optional: demanding it here made the traditional mapping
    # branch below unreachable
    if not options.input or not options.genome:
        parser.print_help()
        sys.exit()
    print(options)

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    # annotation files are looked up relative to the current working directory
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }
    try:
        annotFile = genomeDict[genome.upper()]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit(1)

    # GETTING THE INPUT
    enhancerFile = options.input
    window = int(options.window)

    # making the out folder if it doesn't exist
    if options.out:
        outFolder = utils.formatFolder(options.out, True)
    else:
        # default to the folder of the input file; a bare file name means
        # the current folder, not the filesystem root
        parentParts = enhancerFile.split('/')[0:-1]
        if parentParts:
            outFolder = '/'.join(parentParts) + '/'
        else:
            outFolder = './'

    # GETTING BAM INFO
    rankByBamFile = options.rankby
    controlBamFile = options.control

    # CHECK FORMATTING FLAG
    noFormatTable = bool(options.formatTable)

    # GETTING THE TRANSCRIBED LIST
    if options.geneList:
        transcribedFile = options.geneList
    else:
        transcribedFile = ''

    if options.rankby:
        enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable = mapEnhancerToGeneTop(
            rankByBamFile, controlBamFile, genome, annotFile, enhancerFile,
            transcribedFile, True, window, noFormatTable)
        outputList = [
            ('ENHANCER_TO_GENE', enhancerToGeneTable),
            ('ENHANCER_TO_TOP_GENE', enhancerToTopGeneTable),
            ('GENE_TO_ENHANCER', geneToEnhancerTable),
        ]
    else:
        # do traditional mapping
        enhancerToGeneTable, geneToEnhancerTable = mapEnhancerToGene(
            annotFile, enhancerFile, transcribedFile, True, window,
            noFormatTable)
        outputList = [
            ('ENHANCER_TO_GENE', enhancerToGeneTable),
            ('GENE_TO_ENHANCER', geneToEnhancerTable),
        ]

    # Writing enhancer output
    enhancerFileName = enhancerFile.split('/')[-1].split('.')[0]

    # only a non default window is recorded in the file names
    # // keeps the kb value an integer on python 2 and 3
    if window != 50000:
        windowSuffix = '_%sKB' % (window // 1000)
    else:
        windowSuffix = ''

    for tableLabel, table in outputList:
        outPath = '%s%s_%s%s.txt' % (outFolder, enhancerFileName, tableLabel,
                                     windowSuffix)
        print("writing output to %s" % (outPath))
        utils.unParseTable(table, outPath, '\t')
def collapseFimo(fimo_output,gene_to_enhancer_dict,candidate_tf_list,output_folder,analysis_name,motifConvertFile):

    '''
    collapses motifs from fimo
    for each source node (TF) and each target node (gene enhancer regions), collapse motif instances
    then spit out a ginormous set of beds and a single crazy collapsed bed

    fimo_output: tab delimited fimo table, the first row is skipped as header
    gene_to_enhancer_dict: not used by this function
    candidate_tf_list: TF names kept as source nodes
    output_folder: gets <analysis_name>_all_motifs.bed plus one bed per TF
        under motif_beds/
    analysis_name: used in the bed track lines and the combined bed name
    motifConvertFile: tab delimited table, motif name in column 0 and
        gene name in column 1

    returns edgeDict, where edgeDict[tf][target] is the list of stitched loci
    a fimo table without any motif row yields header only beds
    '''

    #first build up the motif name conversion database
    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = defaultdict(list)
    # The reverse of the other dict, from motif name to gene name
    # a motif can go to multiple genes
    for line in motifDatabase:
        motifDatabaseDict[line[0]].append(line[1])

    #make the folder to store motif beds
    utils.formatFolder('%smotif_beds/' % (output_folder),True)

    edgeDict = {}
    #first layer are source nodes
    for tf in candidate_tf_list:
        edgeDict[tf] = defaultdict(list)

    #set membership instead of list.count for every motif row
    candidate_tf_set = set(candidate_tf_list)

    #next layer are target nodes which are derived from the fimo output
    fimoTable = utils.parseTable(fimo_output,'\t')
    print(fimo_output)

    motif_rows = fimoTable[1:]
    if not motif_rows:
        #nothing but a header (or nothing at all): no column to sniff
        print('WARNING: NO MOTIF INSTANCES FOUND IN %s' % (fimo_output))
    else:
        #fimo puts the region in column index 1 or 2, the region field is the one with '|'
        if motif_rows[0][1].count('|') > 0:
            region_index = 1
        else:
            region_index = 2
        print('USING COLUMN %s OF FIMO OUTPUT FOR REGION' % (region_index))

        #the motif start/stop columns sit right after the region column
        start_index = region_index + 1
        stop_index = region_index + 2

        for line in motif_rows:
            #motifId -> candidate TFs only
            source_tfs = [source for source in motifDatabaseDict.get(line[0],())
                          if source in candidate_tf_set]
            if not source_tfs:
                continue

            #region field reads target|<locus chrom argument>|<coordinate offset>
            region = line[region_index].split('|')
            target = region[0]
            region_start = int(region[2])
            motif_start = region_start + int(line[start_index])
            motif_stop = region_start + int(line[stop_index])

            #what's missing here is the enhancer id of the target locus
            for source in source_tfs:
                #every candidate has an edgeDict entry, so no KeyError can occur here
                edgeDict[source][target].append(utils.Locus(region[1],motif_start,motif_stop,'.'))

    #now we actually want to collapse this down in a meaningful way
    #overlapping motifs count as a single binding site. This way a TF with tons of motifs
    #that finds the same site over and over again doesn't get over counted
    all_bed = []
    all_bed_path = '%s%s_all_motifs.bed' % (output_folder,analysis_name)
    for tf in candidate_tf_list:
        print(tf)
        #copy of the keys, the values get replaced while looping
        target_nodes = list(edgeDict[tf].keys())
        bed_header = ['track name = "%s" description="%s motifs in %s"' % (tf,tf,analysis_name)]
        all_bed.append(bed_header)
        target_bed = [bed_header]
        target_bed_path = '%smotif_beds/%s_motifs.bed' % (output_folder,tf)
        for target in target_nodes:
            edgeCollection = utils.LocusCollection(edgeDict[tf][target],50)
            edgeCollection = edgeCollection.stitchCollection()
            edgeLoci = edgeCollection.getLoci()
            edgeDict[tf][target] = edgeLoci
            for locus in edgeLoci:
                bed_line = [locus.chr(),locus.start(),locus.end(),target,'','+']
                target_bed.append(bed_line)
                all_bed.append(bed_line)

        utils.unParseTable(target_bed,target_bed_path,'\t')

    #now the loci are all stitched up
    utils.unParseTable(all_bed,all_bed_path,'\t')
    return edgeDict
import subprocess import string from collections import defaultdict #================================================================== #=========================GLOBAL=================================== #================================================================== # Get the script's full local path whereAmI = os.path.dirname(os.path.realpath(__file__)) print(whereAmI) # Get the script folder codeFolder = utils.formatFolder(whereAmI,False) print('RUNNING ROSE2_META.py FROM %s' % (whereAmI)) #samtools must be installed samtoolsPath = 'samtools' #bamliquidator must be installed bamliquidator_path = 'bamliquidator_batch' #================================================================== #=====================HELPER FUNCTIONS============================= #================================================================== def getBamChromList(bamFileList):
def main():
    """
    main run function

    splits a sam file aligned against a combined reference + spike in genome
    through split_sam, then optionally moves the resulting bams to --dest

    -i, -g and -s are all needed, otherwise the help is printed and the
    script exits
    """

    #usage = "usage: %prog [options] -g [GENOME] -b [SORTED BAMFILE(S)] -i [INPUTFILE] -o [OUTPUTFOLDER]"
    parser = argparse.ArgumentParser(
        usage='%(prog)s -i SAMFILE -g REF_GENOME -s SPIKE_GENOME')

    # required flags
    parser.add_argument("-i", "--input", dest="inputSam", type=str,
                        help="Enter a sam file", required=False)
    parser.add_argument("-g", "--genome", dest="genome", type=str,
                        help="specify the main reference genome", required=False)
    parser.add_argument("-s", "--spike", dest="spike", type=str,
                        help="specify the spike in genome", required=False)
    parser.add_argument(
        "-d", "--dest", dest="dest", type=str,
        help="specify an optional destination for the final bams to move to",
        required=False)

    args = parser.parse_args()
    print(args)

    if not (args.inputSam and args.genome and args.spike):
        parser.print_help()
        sys.exit()

    print(
        'FORMATTING %s FOR CHIP_RX USING REFERENCE GENOME %s and SPIKE_IN GENOME %s'
        % (args.inputSam, args.genome, args.spike))
    samPath = args.inputSam

    # the last 4 characters are treated as the extension below, so the path
    # has to end in .sam, containing it somewhere is not enough
    if not samPath.upper().endswith('.SAM'):
        print('ERROR, file must end in .sam or .SAM')
        sys.exit()

    #get the headers
    genome_string = ('%s_%s' % (args.genome, args.spike)).upper()

    genomeDict = {
        'RN6_DM6': [
            '/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index_dm6/rn6_header.SAM',
            '/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index_dm6/dm6_header.SAM'
        ],
    }

    if genome_string not in genomeDict:
        print('ERROR: UNSUPPORTED GENOME COMBINATION %s' % (genome_string))
        sys.exit()

    print('IDENTIFIED HEADERS FOR GENOME COMBINATION %s' % (genome_string))
    header1 = genomeDict[genome_string][0]
    header2 = genomeDict[genome_string][1]
    suffix = ('_%s' % (args.spike)).lower()

    # put the genome name in front of the extension; only the trailing
    # extension is touched, a '.sam' earlier in the path is left alone
    samRoot = samPath[:-4]
    samExtension = samPath[-4:]
    outfile1 = '%s.%s%s' % (samRoot, args.genome, samExtension)
    outfile2 = '%s.%s%s' % (samRoot, args.spike, samExtension)
    split_sam(suffix, samPath, header1, header2, outfile1, outfile2)

    #move stuff to destination folder
    if args.dest:
        # formatFolder with False does not create the folder, elsewhere in
        # this code a false result is treated as folder does not exist
        bamFolder = utils.formatFolder(args.dest, False)
        if not bamFolder:
            print('ERROR: DESTINATION FOLDER %s DOES NOT EXIST, BAMS NOT MOVED' % (args.dest))
            sys.exit(1)
        samFolder = utils.getParentFolder(samPath)
        # shell glob on purpose: moves the bams and their index files
        mv_cmd = 'mv %s*bam* %s' % (samFolder, bamFolder)
        print('MOVING BAMS FROM %s TO %s' % (samFolder, bamFolder))
        os.system(mv_cmd)
def main():

    '''
    command line entry point of the enhancer clustering workflow

    needs a data file (-d) and an output folder (-o), otherwise the help is
    printed and the script exits. The steps run in order: establish working
    files, launch enhancer mapping, get median signal, merge enhancers, map
    to the merged regions, build the signal table, call the clustering R
    script, then run ROSE_geneMapper.py on the cluster table.
    '''

    from optparse import OptionParser

    #-i takes the names list, -n is the analysis name
    usage = "usage: %prog [options] -d [DATA_FILE] -i [NAMES_LIST] -r [ROSE_FOLDER] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter a data file for datasets to be processed")
    parser.add_option("-o","--output",dest="output",nargs =1, default = None,
                      help = "specify an output folder to write results to")

    #additional options
    parser.add_option("-i","--input", dest="input",nargs = 1, default=None,
                      help = "Enter a comma separated list of names to analyze. Default will be all datasets")
    parser.add_option("-n","--name", dest="name",nargs=1,default=None,
                      help = "Enter a name for the analysis")
    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a folder to detect or write rose output")
    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "flag to run analysis on ALL enhancers (this is much slower)")

    (options,args) = parser.parse_args()

    print(options)
    print(args)

    #check to see if minimum arguments are met
    if not (options.data and options.output):
        parser.print_help()
        sys.exit()

    #pull in arguments

    #pull in the datafile and create a datadict
    dataFile = options.data

    #now the output folder
    outputFolder = utils.formatFolder(options.output,True) #check and create the output folder

    #now the rose folder
    if options.rose:
        roseFolder = options.rose
    else:
        roseFolder = "%srose/" % (outputFolder)

    if options.input:
        namesList = options.input.split(',')
    else:
        namesList = []

    #get the genome from the first dataset of the data file
    #list() so that this also works with python 3 dict views
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    genome = dataDict[list(dataDict.keys())[0]]['genome']

    #check if using only supers
    superOnly = not options.all

    #get the analysis name
    if options.name:
        analysisName = options.name
    else:
        analysisName = "enhancers"

    #=====================================================
    #=================SUMMARIZE INPUTS====================
    #=====================================================

    print("WORKING IN GENOME %s" % (genome))
    print("DRAWING DATA FROM %s AND ROSE FOLDER %s" % (dataFile,roseFolder))
    print("USING %s AS THE OUTPUT FOLDER" % (outputFolder))
    print("STARTING ANALYSIS ON THE FOLLOWING DATASETS:")
    print(namesList)

    #=====================================================
    #==============ESTABLISH ALL WORKING FILES============
    #=====================================================

    print("\n\n\nESTABLISHING WORKING FILES")
    nameDict = makeNameDict(dataFile,roseFolder,namesList)
    print(nameDict)

    #=====================================================
    #==============LAUNCH ENHANCER MAPPING================
    #=====================================================

    print("\n\n\nLAUNCHING ENHANCER MAPPING (IF NECESSARY)")
    nameDict = launchEnhancerMapping(dataFile,nameDict,outputFolder)
    print(nameDict)

    #=====================================================
    #====================GET MEDIAN SIGNAL================
    #=====================================================

    print("\n\n\nGETTING MEDIAN ENHANCER SIGNAL FROM EACH SAMPLE")
    medianDict = makeMedianDict(nameDict)
    print(medianDict)

    #=====================================================
    #====================MERGING ENHANCERS================
    #=====================================================

    print("\n\n\nIDENTIFYING CONSENSUS ENHANCER REGIONS")
    mergedGFFFile = "%s%s_%s_-0_+0.gff" % (outputFolder,genome,analysisName)
    mergeCollections(nameDict,analysisName,mergedGFFFile,superOnly)

    #=====================================================
    #===============MAP TO MERGED REGIONS=================
    #=====================================================

    print("\n\n\nMAPPING DATA TO CONSENSUS ENHANCER REGIONS")
    mergedRegionMap = mapMergedGFF(dataFile,nameDict,mergedGFFFile,analysisName,outputFolder)

    #=====================================================
    #==============CORRECT FOR MEDIAN SIGNAL==============
    #=====================================================

    print("\n\n\nCREATING ENHANCER SIGNAL TABLE")
    signalTableFile = makeEnhancerSignalTable(mergedRegionMap,medianDict,analysisName,genome,outputFolder)

    #=====================================================
    #===============CALL CLUSTERING R SCRIPT==============
    #=====================================================

    print("\n\n\nGENERATING CLUSTERING OUTPUT")
    clusterTableFile = callRScript(genome,outputFolder,analysisName,signalTableFile)
    #output should be
    #png of cluster gram with rows as genes
    #png of cluster gram of samples w/ tree
    #ordered table w/ cluster assignment
    #similarity matrix for samples

    #=====================================================
    #=============GENE MAPPING BY CLUSTER=================
    #=====================================================

    #NOTE: the rose install location is hard coded here
    os.chdir('/ark/home/cl512/rose/')
    cmd = 'python /ark/home/cl512/rose/ROSE_geneMapper.py -g %s -i %s' % (genome,clusterTableFile)
    os.system(cmd)

    print("FINISHED")
def main():

    '''
    main run function

    writes <output folder><unique id>_bwt2.sh, a bash script that maps a
    fastq with bowtie2 and turns the result into a sorted, indexed bam
    nothing is executed here, only the script is written

    -f, -g, -u and -o are all needed, otherwise the help is printed and the
    script exits
    '''

    import sys
    from optparse import OptionParser

    usage = "usage: %prog [options] -f [FASTQFILE] -g [GENOME] -u [UNIQUEID] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-f","--fastq", dest="fastq",nargs = 1, default=None,
                      help = "Enter the full path of a fastq file to be mapped")
    parser.add_option("-g","--genome",dest="genome",nargs =1, default = None,
                      help = "specify a genome, options are hg19,hg18, mm9 or geckov2 right now")
    parser.add_option("-u","--unique",dest="unique",nargs =1, default = None,
                      help = "specify a uniqueID")
    parser.add_option("-o","--output",dest="output",nargs =1, default = None,
                      help = "Specify an output folder")

    #optional arguments
    parser.add_option("--param",dest="paramString",nargs =1, default = '',
                      help = "A string of bowtie parameters")
    parser.add_option("--link-folder",dest="linkFolder",nargs =1, default = None,
                      help = "Specify a folder to symlink the bam")
    parser.add_option("-p","--paired",dest="paired",action='store_true',default = False,
                      help = "Flag for paired end data")
    parser.add_option("-S","--sam",dest="sam",action='store_true',default = False,
                      help = "Flag to save sam")
    parser.add_option("-q","--qc",dest="qc",action='store_true',default = False,
                      help = "Flag to run fastqc")

    (options,args) = parser.parse_args()

    if not options.fastq or not options.genome or not options.unique or not options.output:
        parser.print_help()
        sys.exit()

    #retrieve the arguments
    fastqFile = options.fastq
    genome = options.genome.lower()
    uniqueID = options.unique

    #make the output folder
    outputFolder = utils.formatFolder(options.output,True)

    #retrieve optional arguments
    paramString = options.paramString
    if options.linkFolder:
        linkFolder = options.linkFolder
    else:
        linkFolder =''
    pairedEnd = options.paired

    #get the bowtie index
    bowtieDict = {
        'mm9':'/raider/index/mm9/Bowtie2Index/genome',
        'hg19':'/raider/index/hg19/Bowtie2Index/genome',
        'hg18':'/grail/genomes/Homo_sapiens/human_gp_mar_06_no_random/bowtie/hg18',
        'geckov2':'/grail/genomes/gecko/GeCKOv2/Sequence/Bowtie2Index/gecko',
        'ribo':'/raider/temp/rDNA/hg19_45S_index/genome',
        'hg19_ribo':'/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index_ribo/genome',
        }

    #genome is already lower case
    if genome not in bowtieDict:
        print('ERROR: UNSUPPORTED GENOME %s, OPTIONS ARE %s' % (genome,','.join(sorted(bowtieDict.keys()))))
        sys.exit(1)
    bowtieIndex = bowtieDict[genome]

    #get the temp string
    tempString = '_%s' % str(random.randint(1,10000))

    fileNameDict = makeFileNameDict(fastqFile,genome,tempString,tempParentFolder,outputFolder,linkFolder,uniqueID,pairedEnd)

    #collect the commands in the order they have to run
    cmdList = []

    #make temp directory
    cmdList.append('mkdir %s' % (fileNameDict['tempFolder']))

    #extract fastq
    cmdList.append(extractFastqCmd(fileNameDict,pairedEnd))

    #call fastqc
    if options.qc:
        cmdList.append(runFastQC(fastqcString,fileNameDict,pairedEnd))

    #call bowtie
    cmdList.append(bowtieCmd(bowtieString,paramString,bowtieIndex,fileNameDict,pairedEnd))

    #remove temp fastq
    cmdList.append(removeTempFastqCmd(fileNameDict,pairedEnd))

    #generate a bam
    cmdList.append(generateTempBamCmd(samtoolsString,fileNameDict))

    #change into the temp directory
    cmdList.append(changeTempDir(fileNameDict))

    #sort the bam
    cmdList.append(sortBamCmd(samtoolsString,fileNameDict))

    #index
    cmdList.append(indexBamCmd(samtoolsString,fileNameDict))

    #either move the sam or remove it
    if options.sam:
        cmdList.append(mvSamCmd(fileNameDict))
    else:
        cmdList.append(rmSamCmd(fileNameDict))

    #mv bams
    cmdList.append(mvBamCmd(fileNameDict))

    #link bams
    if options.linkFolder:
        cmdList.append(linkBamCmd(fileNameDict))

    #cleanup
    cmdList.append(rmTempFiles(fileNameDict))

    #write the bashfile, the with block closes it even if a write fails
    bashFileName = "%s%s_bwt2.sh" % (outputFolder,uniqueID)
    with open(bashFileName,'w') as bashFile:
        #shebang
        bashFile.write('#!/usr/bin/bash\n')
        for cmd in cmdList:
            bashFile.write(cmd+'\n')

    print("Wrote mapping command to %s" % (bashFileName))
def main():
    '''
    main run call

    Command line driver that:
      1. converts every input .bed/.gff into a GFF inside <out>/gff/ and
         merges them into a single, renumbered master GFF
      2. filters the master GFF by the chroms found in the bams and
         optionally drops loci overlapping a mask file
      3. stitches the regions together via regionStitching
      4. maps every unique bam to the stitched GFF with bamliquidator
      5. builds the region map, collapses it to a merged signal table and
         calls the R super-enhancer script followed by the ROSE2 gene mapper
         for the super, stretch and super-stretch tables

    Exits through sys.exit() when required flags are missing, the genome is
    unsupported, the control bam count is inconsistent, the mask file has an
    unknown extension, or a bam fails to map.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")
    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        sys.exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)
    inputList = [inputFile for inputFile in options.input.split(',') if len(inputFile) > 1]

    # converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        # the extension is taken from the current file, not from the whole
        # comma separated option string
        extension = inputFile.split('.')[-1]
        if extension == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][0:-4]  # strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        else:
            if extension != 'gff':
                print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)

    if not inputGFFList:
        # nothing survived the filter above; fail here rather than on an
        # IndexError further down
        print('ERROR: NO USABLE INPUT FILES FOUND IN %s' % (options.input))
        sys.exit()

    # GETTING THE LIST OF BAMFILES TO PROCESS
    # either same number of bams for rankby and control
    # or only 1 control
    # or none!
    # bamlist should be all rankby bams followed by control bams
    rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) > 0]
    if options.control:
        controlBamList = [bam for bam in options.control.split(',') if len(bam) > 0]
        if len(controlBamList) == len(rankbyBamList):
            # case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            # case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList * len(rankbyBamList)
        else:
            print('ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE')
            sys.exit()
    else:
        bamFileList = rankbyBamList

    # Stitch parameter ('' means regionStitching receives no fixed window)
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE GENOME
    genome = options.genome.upper()
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()

    # FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))

    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    # use a simpler unique region naming system
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0], '\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile, '\t')
            gffCollection = utils.gffToLocusCollection(gff, 50)
            inputLoci += gffCollection.getLoci()

        inputCollection = utils.LocusCollection(inputLoci, 50)
        inputCollection = inputCollection.stitchCollection()  # stitches to produce unique regions
        inputGFF = utils.locusCollectionToGFF(inputCollection)

    formattedGFF = []
    # now number things appropriately
    for i, line in enumerate(inputGFF):
        # every region gets the id <inputName>_<1-based index>
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName, str(i + 1))  # 1 indexing

        newLine = [chrom, lineID, lineID, min(coords), max(coords), '', sense, '', lineID]
        formattedGFF.append(newLine)

    # name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder, genome, inputName)
    utils.unParseTable(formattedGFF, masterGFFFile, '\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    # GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile, bamChromList)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        # if it's a bed file
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci: keep only loci with no mask overlap
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci
                        if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))
    # the stitched name is shared by the gff, the debug file and the region
    # map, so it is built once; the division is kept exactly as before
    stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000))
    if removeTSS:
        stitchedGFFName += '_TSS_DISTAL'
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1))

    # MAPPING TO THE STITCHED GFF
    bamFileListUnique = utils.uniquify(list(bamFileList))  # prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s/matrix.txt' % (mappedOut1Folder)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList,
                  mappedFolder, outputFile1, refName=stitchedGFFName)

    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1, inputName + '_MERGED_SIGNAL', controlBams=options.control)

    # now try the merging
    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (pipeline_dir, outFolder, metaOutputFile, inputName, controlName)
    print(cmd)
    os.system(cmd)

    # calling the gene mapper on each enhancer table
    # for now don't use ranking bam to call top genes
    print('CALLING GENE MAPPING')
    tableFileList = [
        "%s_SuperEnhancers.table.txt" % (inputName),
        "%s_StretchEnhancers.table.txt" % (inputName),
        "%s_SuperStretchEnhancers.table.txt" % (inputName),
    ]
    for tableFile in tableFileList:
        cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir, genome, outFolder, tableFile)
        print(cmd)
        os.system(cmd)
def main():
    '''
    main run method for enhancer promoter contribution tool

    Steps, in order:
      I.   parse arguments, load the input regions and the annotation
      II.  split the regions against the TSS collection, map the bams,
           build the average, peak and gene tables (converting mouse gene
           names when the genome string contains 'MM')
      III. call the R waterfall plotting
      IV.  call GSEA and the NES plotting for the top N and for all genes

    Argument problems (missing required values, an activity table with no
    recognisable refseq column) are reported through parser.error, which
    prints the usage and exits.
    '''
    parser = argparse.ArgumentParser(usage='%(prog)s [options]')

    # required flags
    parser.add_argument(
        "-b", "--bam", dest="bam", nargs='*',
        help="Enter a space separated list of .bam files for the main factor",
        required=True)
    parser.add_argument(
        "-i", "--input", dest="input", type=str,
        help="Enter .gff or .bed file of regions to analyze",
        required=True)
    parser.add_argument(
        "-g", "--genome", dest="genome", type=str,
        help="specify a genome, HG18,HG19,HG38,MM8,MM9,MM10,RN6 are currently supported",
        required=True)

    # output flag
    parser.add_argument(
        "-o", "--output", dest="output", type=str,
        help="Enter the output folder.",
        required=True)

    # additional options flags and optional arguments
    parser.add_argument(
        "-a", "--activity", dest="activity", type=str,
        help="specify a table where first column represents a list of active refseq genes",
        required=False)
    parser.add_argument(
        "-c", "--control", dest="control", nargs='*',
        help="Enter a space separated list of .bam files for background. If flagged, will perform background subtraction",
        required=False)
    parser.add_argument(
        "-w", "--window", dest="window", type=int,
        help="Enter a window to define the TSS area +/- the TSS. Default is 1kb",
        required=False, default=1000)
    parser.add_argument(
        "--other-bams", dest="other", nargs='*',
        help="enter a space separated list of other bams to map to",
        required=False)
    parser.add_argument(
        "--name", dest="name", type=str,
        help="enter a root name for the analysis, otherwise will try to find the name from the input file",
        required=False)
    parser.add_argument(
        "--top", dest="top", type=int,
        help="Run the analysis on the top N genes by total signal. Default is 5000",
        required=False, default=5000)
    parser.add_argument(
        "--tads", dest="tads", type=str,
        help="Include a .bed of tad regions to restrict enhancer/gene association",
        required=False, default=None)

    args = parser.parse_args()

    print(args)

    # minimum arguments needed to proceed
    # required=True does not catch "-b" followed by no files (nargs='*'
    # yields an empty list), which used to make the whole run a silent no-op
    if not (args.bam and args.input and args.genome and args.output):
        parser.error('-b/--bam, -i/--input, -g/--genome and -o/--output all need a non-empty value')

    #=====================================================================================
    #===============================I. PARSING ARGUMENTS==================================
    #=====================================================================================

    print(
        '\n\n#======================================\n#===========I. DATA SUMMARY============\n#======================================\n'
    )

    # top analysis subset
    top = args.top

    # input genome
    genome = args.genome.upper()
    print('PERFORMING ANALYSIS ON %s GENOME BUILD' % (genome))

    # set of bams
    bamFileList = args.bam

    # bring in the input path
    inputPath = args.input

    # try to get the input name or use the name argument
    if args.name:
        analysisName = args.name
    else:
        analysisName = inputPath.split('/')[-1].split('.')[0]

    print('USING %s AS ANALYSIS NAME' % (analysisName))

    # setting up the output folder
    parentFolder = utils.formatFolder(args.output, True)
    outputFolder = utils.formatFolder('%s%s' % (parentFolder, analysisName), True)

    print('WRITING OUTPUT TO %s' % (outputFolder))

    if inputPath.split('.')[-1] == 'bed':
        # type is bed
        print('input in bed format, converting to gff')
        inputGFF = utils.bedToGFF(inputPath)
    else:
        inputGFF = utils.parseTable(inputPath, '\t')

    # the tss window
    window = int(args.window)

    # activity path
    if args.activity:
        activityPath = args.activity
        activityTable = utils.parseTable(activityPath, '\t')
        if not activityTable:
            parser.error('activity table %s is empty' % (activityPath))

        # try to find the column for refseq id by scanning the first row;
        # when several cells match, the last one wins (unchanged behavior)
        ref_col = None
        for i, cell in enumerate(activityTable[0]):
            cellText = str(cell)
            if cellText.count('NM_') > 0 or cellText.count('NR_') > 0:
                ref_col = i
        if ref_col is None:
            parser.error('no NM_/NR_ refseq id found in the first row of activity table %s' % (activityPath))

        geneList = [line[ref_col] for line in activityTable]  # this needs to be REFSEQ NM ID
        print('IDENTIFIED %s ACTIVE GENES' % (len(geneList)))
    else:
        geneList = []

    # check if tads are being invoked
    if args.tads:
        print('LOADING TAD LOCATIONS FROM %s' % (args.tads))
        tads_path = args.tads
    else:
        tads_path = ''

    print('LOADING ANNOTATION DATA FOR GENOME %s' % (genome))

    # important here to define the window
    startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict = loadAnnotFile(
        genome, window, geneList, True)

    print('FILTERING THE INPUT GFF FOR GOOD CHROMOSOMES')

    print(chrom_list)
    filtered_gff = [line for line in inputGFF if chrom_list.count(line[0]) > 0]

    print('%s of INITIAL %s REGIONS ARE IN GOOD CHROMOSOMES' % (len(filtered_gff), len(inputGFF)))

    #=====================================================================================
    #================II. IDENTIFYING TSS PROXIMAL AND DISTAL ELEMENTS=====================
    #=====================================================================================

    print(
        '\n\n#======================================\n#==II. MAPPING TO TSS/DISTAL REGIONS===\n#======================================\n'
    )

    # now we need to split the input region
    print('SPLITTING THE INPUT GFF USING A WINDOW OF %s' % (window))
    splitGFF = splitRegions(filtered_gff, tssCollection)
    print(len(filtered_gff))
    print(len(splitGFF))

    splitGFFPath = '%s%s_SPLIT.gff' % (outputFolder, analysisName)
    utils.unParseTable(splitGFF, splitGFFPath, '\t')
    print('WRITING TSS SPLIT GFF OUT TO %s' % (splitGFFPath))

    # now you have to map the bams to the gff
    print('MAPPING TO THE SPLIT GFF')
    mappedFolder = utils.formatFolder('%sbam_mapping' % (outputFolder), True)

    signalTable = mapBams(bamFileList, splitGFFPath, analysisName, mappedFolder)
    signalTablePath = '%s%s_signal_table.txt' % (outputFolder, analysisName)
    utils.unParseTable(signalTable, signalTablePath, '\t')

    if args.control:
        controlBamFileList = args.control
        controlSignalTable = mapBams(controlBamFileList, splitGFFPath, analysisName, mappedFolder)
        controlSignalTablePath = '%s%s_control_signal_table.txt' % (outputFolder, analysisName)
        utils.unParseTable(controlSignalTable, controlSignalTablePath, '\t')

    # now create the background subtracted summarized average table
    print('CREATING AN AVERAGE SIGNAL TABLE')
    averageTable = makeAverageTable(outputFolder, analysisName, useBackground=args.control)
    averageTablePath = '%s%s_average_table.txt' % (outputFolder, analysisName)
    utils.unParseTable(averageTable, averageTablePath, '\t')

    # now load up all of the cpg and other parameters to make the actual peak table

    # first check if this has already been done
    peakTablePath = '%s%s_PEAK_TABLE.txt' % (outputFolder, analysisName)
    if utils.checkOutput(peakTablePath, 0.1, 0.1):
        print('PEAK TABLE OUTPUT ALREADY EXISTS')
        peakTable = utils.parseTable(peakTablePath, '\t')
    else:
        # NOTE(review): paramDict is not bound anywhere in this function;
        # presumably a module-level name -- confirm before relying on it
        peakTable = makePeakTable(paramDict, splitGFFPath, averageTablePath,
                                  startDict, geneList, genomeDirectory, tads_path)
        utils.unParseTable(peakTable, peakTablePath, '\t')

    geneTable = makeGeneTable(peakTable, analysisName)

    geneTablePath = '%s%s_GENE_TABLE.txt' % (outputFolder, analysisName)
    utils.unParseTable(geneTable, geneTablePath, '\t')

    # if mouse, need to convert genes over
    if genome.count('MM') == 1:
        print('CONVERTING MOUSE NAMES TO HUMAN HOMOLOGS FOR GSEA')
        converted_geneTablePath = '%s%s_GENE_TABLE_CONVERTED.txt' % (outputFolder, analysisName)

        # header row is kept; genes with an empty conversion are dropped
        converted_geneTable = [geneTable[0]]
        for line in geneTable[1:]:
            converted_name = mouse_convert_dict[line[0]]
            if len(converted_name) > 0:
                converted_geneTable.append([converted_name] + line[1:])

        utils.unParseTable(converted_geneTable, converted_geneTablePath, '\t')

        geneTablePath = converted_geneTablePath
        geneTable = converted_geneTable

    #=====================================================================================
    #===================================III. PLOTTING ====================================
    #=====================================================================================

    print(
        '\n\n#======================================\n#===III. PLOTTING ENHANCER/PROMOTER===\n#======================================\n'
    )

    # if there are fewer genes in the gene table than the top genes, only run on all
    if len(geneTable) < int(top):
        print(
            'WARNING: ONLY %s GENES WITH SIGNAL AT EITHER PROMOTERS OR ENHANCERS. NOT ENOUGH TO RUN ANALYSIS ON TOP %s'
            % (len(geneTable) - 1, top))
        top = 0
        use_top = False
    else:
        use_top = True

    # now call the R code
    print('CALLING R PLOTTING SCRIPTS')
    callRWaterfall(geneTablePath, outputFolder, analysisName, top)

    #=====================================================================================
    #==================================IV. RUNNING GSEA===================================
    #=====================================================================================

    print(
        '\n\n#======================================\n#============IV. RUNNING GSEA=========\n#======================================\n'
    )

    # now let's call gsea
    print('RUNNING GSEA ON C2')
    callGSEA(outputFolder, analysisName, top, 'enhancer_vs_promoter', use_top)
    callGSEA(outputFolder, analysisName, top, 'total_contribution', use_top)

    if use_top:
        print('DETECTING GSEA OUTPUT FOR TOP %s GENES' % (top))
        # for top by enhancer v promoter metric
        top_promoterTablePath, top_distalTablePath = detectGSEAOutput(
            analysisName, outputFolder, top, 'enhancer_vs_promoter')
        top_signalTablePath, top_backgroundTablePath = detectGSEAOutput(
            analysisName, outputFolder, top, 'total_contribution')

        print('MAKING NES PLOTS FOR TOP %s GENES' % (top))
        callR_GSEA(top_promoterTablePath, top_distalTablePath, outputFolder,
                   analysisName + '_enhancer_vs_promoter', top)
        callR_GSEA(top_signalTablePath, top_backgroundTablePath, outputFolder,
                   analysisName + '_total_contribution', top)

    print('DETECTING GSEA OUTPUT FOR ALL GENES')
    # for all genes
    all_promoterTablePath, all_distalTablePath = detectGSEAOutput(analysisName, outputFolder, 'all')

    print('MAKING NES PLOTS FOR ALL GENES')
    callR_GSEA(all_promoterTablePath, all_distalTablePath, outputFolder, analysisName, 'all')

    print('ALL DONE WITH ANALYSIS FOR %s' % (analysisName))
def makeNameDict(dataFile, roseFolder, namesList=None, enhancerType='super'):
    '''
    for each name, check for the presence of an enriched file or allEnhancer table
    these are the files required for enhancer clustering

    dataFile: path of the data table loaded with pipeline_dfci.loadDataTable
    roseFolder: folder searched for <name>_ROSE/ output; created if missing
    namesList: dataset names to check; when empty or None every dataset whose
               upper-cased name contains neither 'WCE' nor 'INPUT' is used
    enhancerType: 'super', 'stretch' or 'superstretch'; selects which
                  AllEnhancers table name is searched for

    returns a dict name -> {'background': bool, 'enrichedFile': path or '',
    'enhancerFile': path or ''}

    exits via sys.exit() when a dataset has neither file; raises ValueError
    for an unknown enhancerType once a rose output folder is being searched
    '''
    # table name looked for inside each <name>_ROSE folder
    enhancerStringDict = {
        'super': 'AllEnhancers.table.txt',
        'stretch': 'AllEnhancers_Length.table.txt',
        'superstretch': 'AllEnhancers_SuperStretch.table.txt',
    }

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    # draw the parent folder from the dataFile
    parentFolder = utils.getParentFolder(dataFile)
    print("Using %s as the parent folder" % (parentFolder))

    # check to see if a rose folder exists already
    if utils.formatFolder(roseFolder, False):
        roseExists = True
        roseFolder = utils.formatFolder(roseFolder, False)
    else:
        roseExists = False
        roseFolder = utils.formatFolder(roseFolder, True)

    # check namesList to see if datasets exist
    if not namesList:
        # if no namesList is given, this filters out WCE and INPUT datasets
        namesList = [name for name in dataDict
                     if name.upper().count('WCE') == 0 and name.upper().count('INPUT') == 0]

    # now check that all of the datasets at a minimum have a rose output OR enriched region file
    nameDict = {}
    for name in namesList:

        nameDict[name] = {}

        # check if each dataset has a background
        backgroundName = dataDict[name]['background']
        nameDict[name]['background'] = backgroundName in dataDict

        # assumes standard folder structure for enriched file
        enrichedFile = "%smacsEnriched/%s" % (parentFolder, dataDict[name]['enrichedMacs'])

        print("Looking for macs output at %s" % (enrichedFile))

        # opening the file is the existence/readability probe
        try:
            with open(enrichedFile, 'r'):
                pass
            nameDict[name]['enrichedFile'] = enrichedFile
        except IOError:
            nameDict[name]['enrichedFile'] = ''

        # roseOutput looks for standard format rose output
        # need an allEnhancers table and a region table to proceed
        # if the rose folder doesn't exist, don't bother
        nameDict[name]['enhancerFile'] = ''
        if roseExists:
            try:
                roseOutputFiles = os.listdir("%s%s_ROSE" % (roseFolder, name))
            except OSError:
                roseOutputFiles = None

            if roseOutputFiles is not None:
                if enhancerType not in enhancerStringDict:
                    raise ValueError(
                        "unknown enhancerType %r: expected one of 'super', 'stretch', 'superstretch'"
                        % (enhancerType,))
                enhancerString = enhancerStringDict[enhancerType]

                # no weird hidden or temp files
                allEnhancerFileList = [x for x in roseOutputFiles
                                       if x.count(enhancerString) == 1 and x[0] != '.']
                if len(allEnhancerFileList) > 0:
                    nameDict[name]['enhancerFile'] = "%s%s_ROSE/%s" % (
                        roseFolder, name, allEnhancerFileList[0])

        if nameDict[name]['enhancerFile'] == '' and nameDict[name]['enrichedFile'] == '':
            print("INSUFFICIENT DATA TO RUN ENAHNCER ANALYSIS ON %s. PLEASE MAKE SURE ROSE OUTPUT OR MACS ENRICHED REGION PEAKS FILE EXISTS" % (name))
            print(nameDict[name])
            sys.exit()
    return nameDict
def mapMergedGFF(dataFile, nameDict, mergedGFFFile, analysisName, outputFolder, maskFile):
    '''
    calls rose on the mergedGFFFile for all datasets

    The alphabetically first dataset in nameDict is used as the rose
    dataset; every other dataset (plus its background when nameDict flags
    one) is passed along as an extra map.

    returns the path of the merged region map, reusing it if it already
    exists; exits via sys.exit() when nameDict is empty, a listed background
    is missing from the data table, or the region map is not produced
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    roseParentFolder = "%srose/" % (outputFolder)
    utils.formatFolder(roseParentFolder, True)
    gffName = mergedGFFFile.split('/')[-1].split('.')[0]
    bashFileName = "%srose/%s_roseCall.sh" % (outputFolder, analysisName)

    # namesList is just the first dataset
    # extraMap will have to have all other datasets + their backgrounds
    namesList = sorted(nameDict)
    if not namesList:
        print("ERROR: NO DATASETS PROVIDED FOR ENHANCER MAPPING ON %s" % (mergedGFFFile))
        sys.exit()

    extraMap = []
    for name in namesList[1:]:

        if nameDict[name]['background']:
            backgroundName = dataDict[name]['background']
            if backgroundName in dataDict:
                extraMap += [name, backgroundName]
            else:
                print("ERROR: UNABLE TO FIND LISTED BACKGROUND DATASET %s FOR %s" % (backgroundName, name))
                sys.exit()
        else:
            extraMap += [name]

    print(extraMap)

    # first check to see if this has already been done
    mergedRegionMap = "%srose/%s_ROSE/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (
        outputFolder, namesList[0], gffName)
    print("LOOKING FOR REGION MAP AT %s" % (mergedRegionMap))

    if utils.checkOutput(mergedRegionMap, 1, 1):
        print("FOUND PREVIOUS REGION MAP")
        return mergedRegionMap

    bashFileName = pipeline_dfci.callRose2(
        dataFile, '', roseParentFolder, [namesList[0]], extraMap,
        mergedGFFFile, 0, 0, bashFileName, mask=maskFile)

    bashCommand = "bash %s" % (bashFileName)
    # announce the command before it runs; os.system only returns once the
    # script has finished
    print("Running enhancer mapping command:\n%s" % (bashCommand))
    os.system(bashCommand)

    if utils.checkOutput(mergedRegionMap, 5, 60):
        return mergedRegionMap
    else:
        print("UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE %s.\nEXITING NOW" % (mergedGFFFile))
        sys.exit()
def launchEnhancerMapping(dataFile, nameDict, outputFolder, roseFolder, stitch, tssDistance, enhancerType, maskFile=''):
    '''
    launches enhancer mapping if needed from enriched region files

    For every dataset in nameDict with an empty 'enhancerFile', a rose call
    is written with pipeline_dfci.callRose2 and started in the background;
    the function then waits (utils.checkOutput) for each AllEnhancers table
    and stores its path in nameDict[name]['enhancerFile'].

    enhancerType must be 'super', 'stretch' or 'superstretch'; anything else
    raises ValueError before any rose job is launched.

    returns nameDict (updated in place)
    '''
    # table name produced by rose for each enhancer type
    enhancerStringDict = {
        'super': 'AllEnhancers.table.txt',
        'stretch': 'AllEnhancers_Length.table.txt',
        'superstretch': 'AllEnhancers_SuperStretch.table.txt',
    }

    namesList = nameDict.keys()

    # check to see if everything is good, if so return True and call it a day
    if all(len(nameDict[name]['enhancerFile']) > 0 for name in namesList):
        print("ENHANCER FILE OUTPUT FOUND FOR ALL DATASETS")
        return nameDict

    # fail before launching any background job instead of after
    if enhancerType not in enhancerStringDict:
        raise ValueError(
            "unknown enhancerType %r: expected one of 'super', 'stretch', 'superstretch'"
            % (enhancerType,))
    enhancerString = enhancerStringDict[enhancerType]

    # if not, have to call rose
    roseOutputFolder = utils.formatFolder(roseFolder, True)

    queueList = []
    for name in namesList:

        # check to see if we need to call rose
        if nameDict[name]['enhancerFile'] == '':

            # get the enriched file
            enrichedFile = nameDict[name]['enrichedFile']
            # call rose
            print("CALLING ROSE FOR %s" % (name))
            bashFileName = pipeline_dfci.callRose2(
                dataFile, '', roseOutputFolder, [name], [], enrichedFile,
                tssDistance, stitch, mask=maskFile)
            print(bashFileName)
            # trailing '&' sends the job to the background so the datasets
            # are processed concurrently
            os.system('bash %s &' % (bashFileName))
            # add name to queue list
            queueList.append(name)

    # now check for completion of datasets
    for name in queueList:

        # check for the AllEnhancers table
        enhancerFile = "%s%s_ROSE/%s_peaks_%s" % (roseOutputFolder, name, name, enhancerString)

        print("CHECKING FOR %s ROSE OUTPUT IN %s" % (name, enhancerFile))
        if utils.checkOutput(enhancerFile, 1, 10):

            print("FOUND ENHANCER OUTPUT FOR %s" % (name))
            nameDict[name]['enhancerFile'] = enhancerFile
        else:

            # try finding it w/ a different name inside the dataset's folder
            # (a separate local is used so the roseFolder argument is not
            # overwritten)
            datasetRoseFolder = "%s%s_ROSE/" % (roseOutputFolder, name)
            try:
                roseFileList = [x for x in os.listdir(datasetRoseFolder) if x[0] != '.']  # no hidden files
            except OSError:
                # folder was never created by the rose call
                roseFileList = []
            if len(roseFileList) == 0:
                print("No files found in %s" % (datasetRoseFolder))
                sys.exit()
            enhancerFile = getFile(enhancerString, roseFileList, datasetRoseFolder)
            nameDict[name]['enhancerFile'] = enhancerFile

    return nameDict
def main():
    '''
    main run function

    Dynamic enhancer analysis between two groups of datasets:
      1. locate the rose tables for both groups (makeRoseDict)
      2. merge the enhancers of both groups and call rose on the merge
         (callMergeSupers), then merge the rose signal (mergeRoseSignal)
      3. run the delta R script and the rose gene mapper
      4. rank the differential enhancers, make the rank and region plots
      5. map genes with stats and write the final output (finishRankOutput)

    Exits via sys.exit() when required flags are missing, -r does not hold
    exactly two folders, the enhancer type is unknown, or any stage fails
    to produce its expected output file.
    '''
    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] {-r [ROSE_FOLDERS] | -i [INPUT_GFF]} -o [OUTPUT_FOLDER] --group1 [GROUP1_NAMES] --group2 [GROUP2_NAMES] --name1 [GROUP1_NAME] --name2 [GROUP2_NAME]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (HG18,HG19,MM9,RN4) for the project")
    parser.add_option("-d", "--data", dest="data", nargs=1, default=None,
                      help="Enter the data file for the project")
    parser.add_option("-o", "--output", dest="output", nargs=1, default=None,
                      help="Enter the output folder for the project")
    parser.add_option("--group1", dest="group1", nargs=1, default=None,
                      help="Enter a comma separated list of dataset names associated with the first group")
    parser.add_option("--group2", dest="group2", nargs=1, default=None,
                      help="Enter a comma separated list of dataset names associated with the second group")
    parser.add_option("--name1", dest="name1", nargs=1, default=None,
                      help="Enter a name for the first group of datasets")
    parser.add_option("--name2", dest="name2", nargs=1, default=None,
                      help="Enter a name for the second group of datasets")

    # the input options
    parser.add_option("-r", "--rose", dest="rose", nargs=1, default=None,
                      help="Enter a comma separated list of meta rose folders")

    # optional input to supercede the meta rose (this is kinda sad but will fix later)
    # should have had this code run clustering from the get go
    parser.add_option("-i", "--input", dest="input", nargs=1, default=None,
                      help="enter a gff, bed or table of regions to perform dyanmic analysis on")

    # additional options
    parser.add_option("-p", "--plot", dest="plot", action='store_true', default=False,
                      help="If flagged, will plot differential regions")
    parser.add_option("-a", "--all", dest="all", action='store_true', default=False,
                      help="If flagged, will run analysis for all enhancers and not just supers.")
    parser.add_option("-m", "--median", dest="median", action='store_true', default=False,
                      help="If flagged, will use median enhancer scaling")
    parser.add_option("-e", "--enhancer-type", dest="enhancer_type", nargs=1, default='super',
                      help="specify type of enhancer to analyze: super, stretch, superStretch")
    parser.add_option("--use-background", dest="background", action='store_true', default=False,
                      help="If flagged will use background datasets as in data table")

    (options, args) = parser.parse_args()

    print(options)
    print(args)

    requiredArgs = [options.genome, options.data, options.rose, options.output,
                    options.group1, options.group2, options.name1, options.name2]

    # plain test instead of assert: asserts are stripped under python -O
    if not all(requiredArgs):
        parser.print_help()
        sys.exit()

    # now the main run of the function

    # getting the genome and data file
    genome = options.genome.upper()
    dataFile = options.data

    # getting the rose folders
    roseFolderList = options.rose.split(',')
    if len(roseFolderList) != 2:
        print('ERROR: -r/--rose NEEDS EXACTLY TWO COMMA SEPARATED ROSE FOLDERS, GOT %s' % (len(roseFolderList)))
        parser.print_help()
        sys.exit()
    roseFolder1, roseFolder2 = roseFolderList
    parentFolder = utils.formatFolder(options.output, True)

    # getting the analysis names
    name1 = options.name1
    name2 = options.name2
    mergeName = "%s_%s_merged" % (name1, name2)

    # getting the datasets names associated with each group
    namesList1 = options.group1.split(',')
    namesList2 = options.group2.split(',')

    # options for background correction
    useBackground = options.background

    # option for median scaling
    medianScale = options.median

    # option for an overriding set of input regions
    if options.input is not None:
        # for now only works w/ gffs
        print('Using %s as a set of predifined input regions' % (options.input))
        inputGFF = options.input
    else:
        inputGFF = ''

    plotBam = options.plot
    superOnly = not options.all

    if superOnly and plotBam:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (name1, name2, parentFolder))
    elif superOnly:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (name1, name2, parentFolder))
    elif plotBam:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (name1, name2, parentFolder))
    else:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (name1, name2, parentFolder))

    # part 1
    print("PART1: analyzing ROSE output from %s and %s" % (name1, name2))
    # start with the all enhancer tables from the initial rose calls

    roseFolder1 = pipeline_dfci.formatFolder(roseFolder1, False)
    roseFolder2 = pipeline_dfci.formatFolder(roseFolder2, False)

    roseDict1 = makeRoseDict(roseFolder1)
    roseDict2 = makeRoseDict(roseFolder2)

    # choosing the type of enhancer to analyze
    enhancerCallType = options.enhancer_type.lower()
    if enhancerCallType not in ('super', 'stretch', 'superstretch'):
        print('ERROR: UNKNOWN ENHANCER TYPE %s. USE super, stretch OR superStretch' % (options.enhancer_type))
        sys.exit()
    if superOnly:
        print("ANALYZING ENHANCER TYPE: %s" % (enhancerCallType.upper()))

    # bound on every path: the rank plot below needs them even with --all
    superFile1 = roseDict1[enhancerCallType]
    superFile2 = roseDict2[enhancerCallType]

    allFile1 = roseDict1['AllEnhancer']
    allFile2 = roseDict2['AllEnhancer']

    # region maps of each group (not used further down in this function)
    regionFile1 = roseDict1['RegionMap']
    regionFile2 = roseDict2['RegionMap']

    # this is where we can toggle either using meta rose or clustering
    print('\tMERGING ENHANCERS AND CALLING ROSE')
    if superOnly:
        if len(superFile1) == 0:
            print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType, roseFolder1))
            sys.exit()
        if len(superFile2) == 0:
            print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType, roseFolder2))
            sys.exit()
        roseOutput = callMergeSupers(dataFile, superFile1, superFile2, name1, name2, mergeName,
                                     genome, parentFolder, namesList1, namesList2, useBackground, inputGFF)

    else:
        print('doing it right')
        print(allFile1)
        print(allFile2)

        roseOutput = callMergeSupers(dataFile, allFile1, allFile2, name1, name2, mergeName,
                                     genome, parentFolder, namesList1, namesList2, useBackground, inputGFF)
    print('this is rose output')
    print(roseOutput)
    print('\tMERGING ROSE OUTPUT')

    mergedRoseOutput, normRoseOutput = mergeRoseSignal(
        dataFile, roseOutput, roseDict1, roseDict2, name1, name2,
        namesList1, namesList2, useBackground, medianScale)

    print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')

    # part2 is the R script
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (parentFolder, genome, mergeName)
    rcmd = callDeltaRScript(mergedGFFFile, parentFolder, dataFile, name1, name2,
                            allFile1, allFile2, medianScale, namesList1)
    print(rcmd)
    os.system(rcmd)

    time.sleep(5)
    callRoseGeneMapper(mergedGFFFile, genome, parentFolder, namesList1)

    # rank the genes

    # part 3
    # rank the delta
    print("PART 3: assinging ranks to differential enhancers")
    print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')

    gffName = '%s_%s_MERGED_REGIONS_-0_+0' % (genome, mergeName)
    enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_MERGED_ENHANCER_TO_GENE_100KB.txt" % (
        parentFolder, namesList1[0], gffName)
    if utils.checkOutput(enhancerToGeneFile):
        rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_MERGED_ENHANCER_TO_GENE_100KB_RANK.txt" % (
            parentFolder, namesList1[0], gffName)
        assignEnhancerRank(enhancerToGeneFile, allFile1, allFile2, name1, name2, rankOutput)
    else:
        print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
        sys.exit()

    # make the rank plot
    print('MAKING RANK PLOTS')
    if utils.checkOutput(rankOutput):
        print('checking for rank output %s' % (rankOutput))
        rcmd = callRankRScript(rankOutput, name1, name2, superFile1, superFile2)
        print(rcmd)
        os.system(rcmd)
    else:
        print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
        sys.exit()

    print('MAKING REGION SIGNAL PLOTS AND FINDING DIFFERENTIAL REGIONS')
    if utils.checkOutput(normRoseOutput):
        print('checking for %s' % (normRoseOutput))
        rcmd = callRegionPlotRScript(normRoseOutput, name1, name2, namesList1, namesList2)
        print(rcmd)
        os.system(rcmd)
    else:
        print('ERROR: REGION PLOT SCRIPT FAILED TO RUN')
        sys.exit()

    # NOW MAP GENES
    print('mapping genes to differential enhancers')
    statOutput, diffOutput = callRoseGeneMapper_stats(mergedGFFFile, genome, parentFolder, namesList1)

    if utils.checkOutput(statOutput):
        print('checking for gene mapping output %s' % (statOutput))
        print('FINISHED WITH GENE MAPPING')
    else:
        print('GENE MAPPING FAILED')
        sys.exit()

    print('FINISHING OUTPUT')

    finishRankOutput(dataFile, statOutput, diffOutput, genome, parentFolder, mergeName,
                     name1, name2, namesList1, namesList2, 1.0, 100000, superOnly, plotBam)
def main():
    '''
    Command-line entry point for the group-level circuitry workflow.

    Parses the command line with argparse, loads the data table and runs
    three stages in order:

      I.   argument parsing, sanity checks and a printed data summary
      II.  META_ROSE on each group, then DYNAMIC_ROSE on the pair of groups
      III. launchCRC on the combined datasets and on each group separately

    An external stage is only launched when its expected output file cannot
    be opened; utils.checkOutput is then used to wait for that output and
    the script exits if it does not appear.

    Exits through sys.exit() when a dataset name is missing from the data
    table, when no datasets were selected, when the selected datasets span
    more than one genome build, or when a stage times out.
    '''
    # the data table is passed with -d (the old usage string advertised -i)
    parser = argparse.ArgumentParser(usage='%(prog)s -d DATAFILE -1 GROUP1_NAMES -2 GROUP2_NAMES')

    # required flags
    parser.add_argument("-d", "--data_table", dest="data_table", type=str,
                        help="input a data table with all datasets to be analyzed", required=True)
    parser.add_argument("-1", "--group1", dest="group1", type=str,
                        help="input a comma separated list of all datasets in group1", required=True)
    parser.add_argument("-2", "--group2", dest="group2", type=str,
                        help="input a comma separated list of all datasets in group2", required=True)

    # optional input override
    parser.add_argument("-i", "--input", dest="input", type=str,
                        help="input a gff of regions to analyze", required=False)

    # optional arguments
    parser.add_argument("-n", "--name", dest="name", type=str,
                        help="specify a name for the analysis. Default is drawn from the data table name", required=False)
    parser.add_argument("--group1-name", dest="group1_name", default='GROUP1', type=str,
                        help="Enter a name for group1. Default is 'GROUP1'", required=False)
    parser.add_argument("--group2-name", dest="group2_name", default='GROUP2', type=str,
                        help="Enter a name for group2. Default is 'GROUP2'", required=False)
    parser.add_argument("-a", "--activity", dest="activity", type=str, default='',
                        help="a table with active gene names in the first column", required=False)
    parser.add_argument("-t", "--tss", dest="tss", type=int, default=2500,
                        help="Specify a TSS exclusion distance. Default is 2500", required=False)
    parser.add_argument("-s", "--stitch", dest="stitch", type=int, default=None,
                        help="Specify a stitching distance. Default is auto stitching", required=False)
    parser.add_argument("-o", "--output", dest="output", default='./', type=str,
                        help="Enter the output folder. Default is the current working directory", required=False)
    parser.add_argument("--log", dest="log", default='', type=str,
                        help="Enter a path to log output", required=False)

    args = parser.parse_args()

    #=====================================================================================
    #===============================I. PARSING ARGUMENTS==================================
    #=====================================================================================

    # pulling in the data table
    data_file = os.path.abspath(args.data_table)
    dataDict = pipeline_dfci.loadDataTable(data_file)

    # the analysis name defaults to the data table file name without extension
    if not args.name:
        analysis_name = data_file.split('/')[-1].split('.')[0]
    else:
        analysis_name = args.name

    # optional gff override; an empty string means "no override"
    inputGFF = args.input if args.input else ''

    group1_name = args.group1_name
    group2_name = args.group2_name

    # comma separated dataset names, empty entries (e.g. trailing commas) dropped
    group1_string = args.group1
    group1_list = [name for name in group1_string.split(',') if len(name) > 0]
    group2_string = args.group2
    group2_list = [name for name in group2_string.split(',') if len(name) > 0]

    # checking that all datasets are in the data table
    for name in group1_list + group2_list:
        if name not in dataDict:
            print('ERROR: DATASET %s NOT FOUND IN DATA TABLE %s. EXITING NOW' % (name,data_file))
            sys.exit()

    # all datasets must come from a single genome build
    genome_list = utils.uniquify([dataDict[name]['genome'] for name in group1_list + group2_list])
    if len(genome_list) == 0:
        # both groups were empty after dropping blank names; nothing to analyze
        print('ERROR: NO DATASETS SPECIFIED IN EITHER GROUP. EXITING NOW.')
        sys.exit()
    if len(genome_list) > 1:
        print('ERROR: ATTEMPTING TO ANALYZE DATASETS FROM MULTIPLE GENOME BUILDS. EXITING NOW.')
        sys.exit()

    # the load genome function has an assertion test to make sure the genome is supported
    genome = loadGenome(genome_list[0])

    parent_folder = utils.formatFolder(args.output,True)
    output_folder = utils.formatFolder(parent_folder + analysis_name,True)

    # these are the user defined optional arguments
    tss = int(args.tss)
    stitch = args.stitch
    print('stitch')
    print(stitch)

    # list of active genes to constrain analysis
    if len(args.activity) == 0:
        # assumes all genes are active unless told otherwise
        # TODO: derive activity from the data (getActivity() is not usable yet).
        # Until then pass an empty path/table downstream so the names are
        # always bound; previously this branch left them undefined and the
        # summary below raised a NameError.
        print('using all active genes')
        activity_path = ''
        activity_table = []
    else:
        activity_path = args.activity
        activity_table = utils.parseTable(activity_path,'\t')

    print('\n\n#======================================\n#===========I. DATA SUMMARY============\n#======================================\n')

    print('Analyzing datasets described in %s\n' % (data_file))
    print('Name for the analysis: %s\n' % (analysis_name))
    print('Using genome: %s\n' % (genome.name()))
    print('%s datasets: %s\n' % (group1_name,group1_string))
    print('%s datasets: %s\n' % (group2_name,group2_string))

    if len(activity_path) > 0:
        print('Identified %s active genes in the analysis using %s as a list of active genes' % (len(activity_table),activity_path))
    else:
        print('No activity table provided. Assuming all genes are active')
    print('Writing output to: %s\n' % (output_folder))

    #=====================================================================================
    #======================II. DEFINING CIS-REGULATORY ELEMENTS===========================
    #=====================================================================================

    print('\n\n#======================================\n#=II. MAPPING CIS-REGULATORY ELEMENTS==\n#======================================\n')

    # crc_wrapper will act at the group level and not consider individual datasets
    # since a data table is used as the input, the code will rely heavily on pipeline_dfci
    # embedded tools

    # 1. first we need to run meta rose using default parameters and check the output
    # exists for each group
    meta_rose_folder = utils.formatFolder(output_folder + 'meta_rose/',True)

    group1_output = '%s%s/%s_AllEnhancers.table.txt' % (meta_rose_folder,group1_name,group1_name)
    group2_output = '%s%s/%s_AllEnhancers.table.txt' % (meta_rose_folder,group2_name,group2_name)

    # launch META_ROSE for every group whose output cannot be opened;
    # the probe handle is closed immediately so it does not leak
    for group_name, group_list, group_output in [(group1_name, group1_list, group1_output),
                                                 (group2_name, group2_list, group2_output)]:
        try:
            open(group_output,'r').close()
        except IOError:
            print('No META_ROSE output found for %s. Running META_ROSE now' % (group_name))
            launchMetaRose(group_name,group_list,meta_rose_folder,genome,data_file,stitch,tss)

    # now check for completion (both groups are launched before either is waited on)
    for group_name, group_output in [(group1_name, group1_output), (group2_name, group2_output)]:
        if utils.checkOutput(group_output,1,10):
            print('META_ROSE finished for %s' % (group_name))
        else:
            print('META_ROSE timed out for %s. EXITING NOW.' % (group_name))
            sys.exit()

    # Meta rose does not give all regions that are SE in at least one sample
    # and can be blown out by amplicons etc...
    # sooo we need to run clustering to generate a good input gff
    # ideally we just rewrite dynamic meta to run off of clustering output
    # until we do that let's just overwrite w/ an input gff

    print('Comparing cis-regulatory landscapes of %s and %s' % (group1_name,group2_name))
    dynamic_rose_folder = utils.formatFolder(output_folder + 'dynamic_meta_rose/',True)

    # here we will use the rank table as the primary output
    dynamic_rose_output = '%soutput/%s_%s_%s_merged_MERGED_SUPERS_RANK_TABLE.txt' % (dynamic_rose_folder,genome.name(),group1_name,group2_name)

    try:
        open(dynamic_rose_output,'r').close()
    except IOError:
        print('No DYNAMIC_ROSE output found for %s. Running DYNAMIC_ROSE now' % (analysis_name))
        launchDynamicRose(analysis_name,group1_name,group2_name,group1_list,group2_list,meta_rose_folder,dynamic_rose_folder,genome,data_file,activity_path,inputGFF)

    if utils.checkOutput(dynamic_rose_output,1,10):
        print('DYNAMIC_ROSE finsihed for %s' % (analysis_name))
    else:
        print('DYNAMIC_ROSE analysis timed out for %s. EXITING NOW.' % (analysis_name))
        sys.exit()

    #=====================================================================================
    #======================III. IDENTIFYING TF NODES IN NETWORK===========================
    #=====================================================================================

    print('\n\n#======================================\n#===III. RUNNING CIRCUITRY ANALYSIS====\n#======================================\n')

    # now we want to call circuitry on each group... ok to have different subpeaks and motif calls
    # if as a first approximation we weight by the overall enhancer
    crc_folder = utils.formatFolder('%scrc/' % (output_folder),True)

    # one CRC run for all datasets together, then one per group
    crc_runs = [
        (analysis_name, group1_list + group2_list),
        (group1_name, group1_list),
        (group2_name, group2_list),
    ]
    for crc_name, crc_datasets in crc_runs:
        crc_out_folder = utils.formatFolder('%s%s' % (crc_folder,crc_name),True)
        launchCRC(data_file,genome,dynamic_rose_output,crc_name,crc_datasets,crc_out_folder,activity_path)
def extractGuideFastq(fastqFile,outputFolder='',gzip=False):
    '''
    Extract candidate guide RNA sequences from a fastq file.

    For every 4-line fastq record the read is searched for the module-level
    cutSeq. When it is found, the read and its quality string are trimmed to
    the guideLength bases starting cutOffset bases after the start of the
    match, and the trimmed record is written to
    <outputFolder><fastq root>.gecko.fastq. Records without the cut sequence
    are dropped. Progress and summary counts are printed to stdout.

    fastqFile:    path to the input fastq (opened through utils.open)
    outputFolder: folder for the output; defaults to the folder of fastqFile
    gzip:         if True, launch gzip on the output in the background

    Returns the path of the guide fastq ('.gz' appended when gzip is True).
    '''
    from contextlib import closing

    # get the full absolute path for the fastq File
    fastqFile = os.path.abspath(fastqFile)

    ticker = 0  # records processed
    found = 0   # records containing the cut sequence

    # closing() guarantees the input handle is released even on error
    with closing(utils.open(fastqFile,'r')) as fastq:

        # get the fastq name and root
        if len(outputFolder) == 0:
            outputFolder = utils.getParentFolder(fastqFile)

        # makes sure the output folder exists; keep the returned path so the
        # folder is joined to the file name correctly even when the caller
        # passed it without a trailing slash
        outputFolder = utils.formatFolder(outputFolder,True)

        # grab the name info from the fastq
        fastqName = fastqFile.split('/')[-1]
        fastqRoot = fastqName.replace('.fastq','').replace('.gz','')

        # guideFastqFile output
        guideFastqFile = '%s%s.gecko.fastq' % (outputFolder,fastqRoot)

        with closing(utils.open(guideFastqFile,'w')) as guideFastq:

            print('processing %s' % (fastqName))
            print('million reads processed:')

            while True:

                if ticker % 1000000 == 0:
                    print(ticker // 1000000)

                # load the next 4-line record; a missing or partial record ends the loop
                fastqLines = []
                try:
                    for _ in range(4):
                        fastqLines.append(next(fastq))
                except StopIteration:
                    break

                ticker += 1

                # see if you can find a cut site
                seq = fastqLines[1].rstrip()
                cutPosition = seq.find(cutSeq)
                if cutPosition == -1:
                    continue
                found += 1

                guideStart = cutPosition + cutOffset
                guideStop = guideStart + guideLength

                # trim the stripped sequence and quality strings so a guide
                # window that reaches the end of the read cannot carry the
                # original newline along and emit a blank line
                qual = fastqLines[3].rstrip()
                fastqLines[1] = seq[guideStart:guideStop] + '\n'
                fastqLines[3] = qual[guideStart:guideStop] + '\n'

                for line in fastqLines:
                    guideFastq.write(line)

    print('SUMMARY STATISTICS')
    print(ticker)
    print(found)
    # an empty input has no records; report a zero hit rate instead of dividing by zero
    if ticker > 0:
        print(float(found)/float(ticker))
    else:
        print(0.0)

    # gzip the fastq
    # NOTE: gzip runs in the background ('&'), so the returned .gz path may
    # not exist yet when this function returns
    if gzip:
        os.system('gzip %s &' % (guideFastqFile))
        guideFastqFile += '.gz'

    return guideFastqFile
def main():
    """
    Command-line entry point for splitting a ChIP-Rx SAM file.

    Takes a SAM file aligned against a combined reference + spike-in genome,
    looks up the two SAM headers registered for that genome combination and
    calls split_sam to write one SAM per genome next to the input
    (<root>.<genome>.sam and <root>.<spike>.sam). With -d, everything
    matching *bam* in the input folder is then moved to the destination.

    Prints the help and exits when -i, -g or -s is missing. Exits with an
    error message when the input does not end in .sam/.SAM, when the genome
    combination is not registered, or when the destination folder does not
    exist.
    """
    parser = argparse.ArgumentParser(usage='%(prog)s -i SAMFILE -g REF_GENOME -s SPIKE_GENOME')

    # required flags (enforced below so that a bare call prints the help)
    parser.add_argument("-i", "--input", dest="inputSam", type=str,
                        help="Enter a sam file", required=False)
    parser.add_argument("-g", "--genome", dest="genome", type=str,
                        help="specify the main reference genome", required=False)
    parser.add_argument("-s", "--spike", dest="spike", type=str,
                        help="specify the spike in genome", required=False)

    parser.add_argument("-d", "--dest", dest="dest", type=str,
                        help="specify an optional destination for the final bams to move to", required=False)

    args = parser.parse_args()

    print(args)

    if not (args.inputSam and args.genome and args.spike):
        parser.print_help()
        sys.exit()

    print('FORMATTING %s FOR CHIP_RX USING REFERENCE GENOME %s and SPIKE_IN GENOME %s' % (args.inputSam,args.genome,args.spike))
    samPath = args.inputSam

    # the output names are built by cutting off the last 4 characters, so the
    # extension really has to be the suffix (not merely appear in the path)
    if not samPath.upper().endswith('.SAM'):
        print('ERROR, file must end in .sam or .SAM')
        sys.exit()

    # get the headers
    genome_string = ('%s_%s' % (args.genome,args.spike)).upper()

    genomeDict = {'RN6_DM6':['/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index_dm6/rn6_header.SAM','/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index_dm6/dm6_header.SAM'],
                  }

    if genome_string not in genomeDict:
        print('ERROR: UNSUPPORTED GENOME COMBINATION %s' % (genome_string))
        sys.exit()

    print('IDENTIFIED HEADERS FOR GENOME COMBINATION %s' %(genome_string))

    # resolve the destination before the split so a bad -d fails fast;
    # utils.formatFolder(path, False) gives a falsy value for a missing folder
    bamFolder = None
    if args.dest:
        bamFolder = utils.formatFolder(args.dest,False)
        if not bamFolder:
            print('ERROR: DESTINATION FOLDER %s DOES NOT EXIST' % (args.dest))
            sys.exit()

    header1, header2 = genomeDict[genome_string][0], genomeDict[genome_string][1]
    suffix = ('_%s' % (args.spike)).lower()

    # insert the genome name in front of the extension only; a global string
    # replace would also rewrite '.sam' occurring elsewhere in the path
    samRoot, samExt = samPath[:-4], samPath[-4:]
    outfile1 = '%s.%s%s' % (samRoot,args.genome,samExt)
    outfile2 = '%s.%s%s' % (samRoot,args.spike,samExt)

    split_sam(suffix,samPath,header1,header2,outfile1,outfile2)

    # move stuff to destination folder
    if bamFolder:
        samFolder = utils.getParentFolder(samPath)

        mv_cmd = 'mv %s*bam* %s' % (samFolder,bamFolder)
        print('MOVING BAMS FROM %s TO %s' % (samFolder,bamFolder))
        os.system(mv_cmd)
import subprocess
import string

from collections import defaultdict

#==================================================================
#=========================GLOBAL===================================
#==================================================================

# Module-level setup: everything below runs (and prints) at import time.

# Get the script's full local path
# (realpath resolves symlinks, so this is the folder of the real file)
whereAmI = os.path.dirname(os.path.realpath(__file__))
print(whereAmI)

# Get the script folder
# NOTE(review): formatFolder(path, False) appears to return the folder path
# without creating it, and a falsy value if it is missing - confirm in utils
pipeline_dir = utils.formatFolder(whereAmI,False)

print('RUNNING ROSE2_META.py FROM %s' % (whereAmI))

#samtools must be installed
# bare command name, not an absolute path - presumably resolved by the shell
samtoolsPath = 'samtools'

#bamliquidator must be installed
# bare script name, not an absolute path - presumably resolved by the shell
bamliquidator_path = 'bamliquidator_batch.py'

#==================================================================
#=====================HELPER FUNCTIONS=============================
#==================================================================

def getBamChromList(bamFileList):
def _mapBamToRegions(bamliquidator_path, gffFile, gffName, mappedFolder, bamFile):
    '''
    Map one bam onto one gff with bamliquidator unless the output already exists.

    The output goes to <mappedFolder><gffName>_<bam file name>_MAPPED/ and is
    considered present when utils.checkOutput finds matrix.gff inside it.
    Exits the script when bamliquidator prints nothing to stdout, which is
    treated as a failed mapping.
    '''
    bamFileName = bamFile.split('/')[-1]
    mappedOutFolder = '%s%s_%s_MAPPED' % (mappedFolder, gffName, bamFileName)
    mappedOutFile = '%s/matrix.gff' % (mappedOutFolder)

    if utils.checkOutput(mappedOutFile, 0.2, 0.2):
        print("FOUND %s MAPPING DATA FOR BAM: %s" % (gffFile, mappedOutFile))
        return

    cmd = "python " + bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (gffFile, mappedOutFolder, bamFile)
    print(cmd)

    output = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True).communicate()
    if len(output[0]) > 0:  # test if mapping worked correctly
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (gffFile, bamFileName))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (gffFile, bamFileName))
        sys.exit()


def main():
    '''
    Command-line entry point for the ROSE2 enhancer stitching workflow.

    Steps, in order:
      1. parse options (-i, -r, -o and -g are required) and validate the genome
      2. create the output folders and copy/convert the input regions to gff
      3. load the regions, optionally dropping loci that overlap a mask
      4. stitch regions with regionStitching and write the stitched gff
      5. map every bam to the stitched and the original gff (bamliquidator)
      6. build the region map with mapCollection
      7. call ROSE2_callSuper.R, then ROSE2_geneMapper.py on the super,
         stretch and super-stretch enhancer tables

    The annotation files and the R / gene mapper scripts are addressed
    relative to the current working directory.

    Prints the help and exits when a required option is missing; exits with
    status 1 on an unsupported genome build.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)

    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-b", "--bams", dest="bams", nargs=1, default=None,
                      help="Enter a comma separated list of additional bam files to map to")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        sys.exit()

    # GETTING THE GENOME AND THE CORRECT ANNOT FILE
    # validated before any folder is created so that an unsupported build
    # fails immediately with a readable message instead of a KeyError later
    genome = options.genome
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }
    if genome.upper() not in genomeDict:
        print('ERROR: UNSUPPORTED GENOME %s. SUPPORTED GENOMES ARE: %s' % (genome, ','.join(sorted(genomeDict))))
        sys.exit(1)

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE
    inputExtension = options.input.split('.')[-1]
    if inputExtension == 'bed':
        # CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    else:
        if inputExtension != 'gff':
            print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
        # COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # GETTING THE LIST OF BAMFILES TO PROCESS
    if options.control:
        bamFileList = [options.rankby, options.control]
    else:
        bamFileList = [options.rankby]

    if options.bams:
        bamFileList += options.bams.split(',')
        bamFileList = utils.uniquify(bamFileList)

    # optional args

    # Stitch parameter; '' is passed through to regionStitching when no
    # distance was given
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    print('USING %s AS THE GENOME' % genome)
    annotFile = genomeDict[genome.upper()]

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFFFile)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()

        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci: keep only loci with no mask overlap
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(inputGFFFile, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    # making sure start/stop ordering are correct
    for line in stitchedGFF:
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitchWindow)
    print(type(stitchWindow))

    # the stitch distance is encoded in KB in every derived file name;
    # TSS-excluded runs get an extra _TSS_DISTAL tag
    stitchKB = str(stitchWindow / 1000)
    if removeTSS:
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, stitchKB)
    else:
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchKB)
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1))

    # Try to use the bamliquidator_batch.py script on cluster, otherwise fall
    # back to a copy in the current working directory, otherwise fail.
    # NOTE: os.path.isfile does not search $PATH, only the given location.
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    for bamFile in bamFileList:
        # MAPPING TO THE STITCHED GFF
        _mapBamToRegions(bamliquidator_path, stitchedGFFFile, stitchedGFFName, mappedFolder, bamFile)
        # MAPPING TO THE ORIGINAL GFF
        _mapBamToRegions(bamliquidator_path, inputGFFFile, inputName, mappedFolder, bamFile)

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    # the R script receives the control bam name, or 'NONE' without a control
    if options.control:
        controlName = options.control.split('/')[-1]
    else:
        controlName = 'NONE'
    cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (outFolder, outputFile1, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    # fixed pause after the R call before its output tables are used
    time.sleep(20)
    enhancerTableSuffixes = [
        '_SuperEnhancers.table.txt',
        '_StretchEnhancers.table.txt',
        '_SuperStretchEnhancers.table.txt',
    ]
    for tableSuffix in enhancerTableSuffixes:
        enhancerTableFile = '%s%s' % (inputName, tableSuffix)
        if options.control:
            cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, enhancerTableFile)
        else:
            cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, enhancerTableFile)
        os.system(cmd)
def main():
    '''
    Command-line entry point for the enhancer clustering workflow.

    Requires a data file (-d) and an output folder (-o). After summarizing
    the inputs it runs, in order: makeNameDict, launchEnhancerMapping,
    makeMedianDict, mergeCollections, mapMergedGFF, makeEnhancerSignalTable
    and callRScript, and finally calls ROSE2_geneMapper.py on the cluster
    table.

    The genome is read from one entry of the data table (whichever key the
    table lists first).

    Prints the help and exits when -d or -o is missing; exits with an error
    on an empty data table or an unsupported enhancer type.
    '''
    from optparse import OptionParser

    usage = "usage: %prog [options] -d [DATA_FILE] -i [INPUT_LIST] -r [ROSE_FOLDER] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage = usage)

    #required flags
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter a data file for datasets to be processed")
    parser.add_option("-o","--output",dest="output",nargs =1, default = None,
                      help = "specify an output folder to write results to")

    #additional options
    parser.add_option("-i","--input", dest="input",nargs = 1, default=None,
                      help = "Enter a comma separated list of names to analyze. Default will be all datasets")
    parser.add_option("-n","--name", dest="name",nargs=1,default=None,
                      help = "Enter a name for the analysis")
    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a folder to detect or write rose output")
    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "flag to run analysis on ALL enhancers (this is much slower)")
    parser.add_option("-s","--stitch", dest="stitch",nargs = 1, default='',
                      help = "specify a fixed stitch distance for all datasets, otherwise will compute stitching automatically on each dataset")
    parser.add_option("-e","--enhancer-type", dest="enhancer_type",nargs = 1,default='super',
                      help = "specify type of enhancer to analyze: super, stretch, superStretch")
    parser.add_option("-t","--tss", dest="tss",nargs = 1, default=2500,
                      help = "specify a tss exclusion window. default is 2500bp")
    parser.add_option("--mask",dest="mask",nargs=1,default=None,
                      help = 'Create a mask set of regions to filter out of analysis. must be .bed or .gff format')

    (options,args) = parser.parse_args()

    print(options)
    print(args)

    #check to see if minimum arguments are met
    if not (options.data and options.output):
        parser.print_help()
        sys.exit()

    #pull in arguments

    #pull in the datafile and create a datadict
    dataFile = options.data

    #now the output folder
    outputFolder = utils.formatFolder(options.output,True) #check and create the output folder

    #now the rose folder
    if options.rose:
        roseFolder = options.rose
    else:
        roseFolder = "%srose/" % (outputFolder)

    #an empty list leaves the dataset selection to makeNameDict
    if options.input:
        namesList = options.input.split(',')
    else:
        namesList = []

    #get the genome
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    if len(dataDict) == 0:
        print("ERROR: NO DATASETS FOUND IN DATA FILE %s" % (dataFile))
        sys.exit(1)
    #list() keeps the first-key lookup valid on both python 2 and 3
    genome = dataDict[list(dataDict.keys())[0]]['genome']

    #check if using only supers
    superOnly = not options.all

    #get the anlysis name
    if options.name:
        analysisName = options.name
    else:
        analysisName = "enhancers"

    #check for a stitching parameter ('' means no fixed stitch distance)
    stitch = str(options.stitch)

    #check for the tss parameter
    tssDistance = int(options.tss)

    #check enhancer type
    enhancerType = options.enhancer_type.lower()
    if enhancerType not in ['super','superstretch','stretch']:
        print("ERROR: unsupported enhancer type %s" % (enhancerType))
        sys.exit()

    #see if there's a mask
    if options.mask:
        maskFile = options.mask
    else:
        maskFile = ''

    #=====================================================
    #=================SUMMARIZE INPUTS====================
    #=====================================================

    print("WORKING IN GENOME %s" % (genome))
    print("DRAWING DATA FROM %s AND ROSE FOLDER %s" % (dataFile,roseFolder))
    print("USING %s AS THE OUTPUT FOLDER" % (outputFolder))

    #=====================================================
    #==============ESTABLISH ALL WORKING FILES============
    #=====================================================

    print("\n\n\nESTABLISHING WORKING FILES")
    nameDict = makeNameDict(dataFile,roseFolder,namesList,enhancerType)

    print(nameDict)

    print("STARTING ANALYSIS ON THE FOLLOWING DATASETS:")
    print(nameDict.keys())

    #datasets without rose output are only reported here, not treated as fatal
    for name in nameDict.keys():
        if len(nameDict[name]['enhancerFile']) == 0:
            print("NO ROSE OUTPUT FOR %s" % (name))

    #=====================================================
    #==============LAUNCH ENHANCER MAPPING================
    #=====================================================

    print("\n\n\nLAUNCHING ENHANCER MAPPING (IF NECESSARY)")
    nameDict = launchEnhancerMapping(dataFile,nameDict,outputFolder,roseFolder,stitch,tssDistance,enhancerType,maskFile)
    print(nameDict)

    #=====================================================
    #====================GET MEDIAN SIGNAL================
    #=====================================================

    print("\n\n\nGETTING MEDIAN ENHANCER SIGNAL FROM EACH SAMPLE")
    medianDict = makeMedianDict(nameDict)

    print(medianDict)

    #=====================================================
    #====================MERGING ENHANCERS================
    #=====================================================

    print("\n\n\nIDENTIFYING CONSENSUS ENHANCER REGIONS")

    mergedGFFFile = "%s%s_%s_-0_+0.gff" % (outputFolder,genome,analysisName)
    mergedGFFFile = mergeCollections(nameDict,analysisName,mergedGFFFile,superOnly)

    #=====================================================
    #===============MAP TO MERGED REGIONS=================
    #=====================================================

    print("\n\n\nMAPPING DATA TO CONSENSUS ENHANCER REGIONS")
    mergedRegionMap = mapMergedGFF(dataFile,nameDict,mergedGFFFile,analysisName,outputFolder,maskFile)

    #=====================================================
    #==============CORRECT FOR MEDIAN SIGNAL==============
    #=====================================================

    print("\n\n\nCREATING ENHANCER SIGNAL TABLE")
    signalTableFile = makeEnhancerSignalTable(nameDict,mergedRegionMap,medianDict,analysisName,genome,outputFolder)

    #=====================================================
    #===============CALL CLUSTERING R SCRIPT==============
    #=====================================================

    print("\n\n\nGENERATING CLUSTERING OUTPUT")
    clusterTableFile = callRScript(genome,outputFolder,analysisName,signalTableFile)
    #output should be
    #png of cluster gram with rows as genes
    #png of cluster gram of samples w/ tree
    #ordered table w/ cluster assignment
    #similarity matrix for samples

    #=====================================================
    #=============GENE MAPPING BY CLUSTER=================
    #=====================================================

    #NOTE: hard-coded pipeline location; the gene mapper is run from there
    os.chdir('/ark/home/cl512/pipeline/')
    cmd = 'python /ark/home/cl512/pipeline/ROSE2_geneMapper.py -g %s -i %s' % (genome,clusterTableFile)
    os.system(cmd)

    print("FINISHED")
def main():
    '''
    main run function

    Command line entry point for the dynamic enhancer analysis of two
    datasets. After parsing the options it:
      1. builds a rose file dictionary for each of the two rose folders
      2. calls callMergeSupers on either the selected enhancer tables
         (default) or the AllEnhancer tables (-a)
      3. runs the command returned by callDeltaRScript and then
         callRoseGeneMapper
      4. assigns enhancer ranks, runs the command returned by
         callRankRScript and calls finishRankOutput

    -g, -d, -r, -o and -n are all required; if any is missing the help text
    is printed and the program exits. -r and -n must each contain exactly two
    comma separated values and -e must be super, stretch or superstretch
    (case insensitive); otherwise the option parser reports the problem and
    exits with status 2 before anything is written to disk.
    '''
    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] -n [DATA_NAMES] -r [ROSE_FOLDERS] -o [OUTPUT_FOLDER]"
    parser = OptionParser(usage=usage)

    #required flags
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (HG18,HG19,MM9,RN4,RN6) for the project")
    parser.add_option("-d", "--data", dest="data", nargs=1, default=None,
                      help="Enter the data file for the project")
    parser.add_option("-r", "--rose", dest="rose", nargs=1, default=None,
                      help="Enter a comma separated list of rose folder")
    parser.add_option("-o", "--output", dest="output", nargs=1, default=None,
                      help="Enter the output folder for the project")
    parser.add_option("-n", "--names", dest="names", nargs=1, default=None,
                      help="Enter a comma separated list of names to go with the datasets")

    #additional options
    parser.add_option("-p", "--plot", dest="plot", action='store_true', default=False,
                      help="If flagged, will plot differential regions")
    parser.add_option("-a", "--all", dest="all", action='store_true', default=False,
                      help="If flagged, will run analysis for all enhancers and not just supers.")
    parser.add_option("-m", "--median", dest="median", action='store_true', default=False,
                      help="If flagged, will use median enhancer scaling")
    parser.add_option("-e", "--enhancer-type", dest="enhancer_type", nargs=1, default='super',
                      help="specify type of enhancer to analyze: super, stretch, superStretch")

    (options, args) = parser.parse_args()

    print(options)
    print(args)

    if not (options.genome and options.data and options.rose
            and options.output and options.names):
        parser.print_help()
        sys.exit()

    genome = options.genome.upper()
    dataFile = options.data

    #validate everything that can be checked from the command line alone
    #before the output folder is created or any analysis is started
    roseFolderList = options.rose.split(',')
    if len(roseFolderList) != 2:
        parser.error('-r/--rose requires exactly two comma separated rose folders, got %s'
                     % (len(roseFolderList)))
    nameList = options.names.split(',')
    if len(nameList) != 2:
        parser.error('-n/--names requires exactly two comma separated names, got %s'
                     % (len(nameList)))
    roseFolder1, roseFolder2 = roseFolderList
    name1, name2 = nameList

    #choosing the type of enhancer to analyze
    #the lower cased type is used directly as a key into the rose dictionaries
    enhancerCallType = options.enhancer_type.lower()
    if enhancerCallType not in ('super', 'stretch', 'superstretch'):
        parser.error('unsupported enhancer type %s (use super, stretch or superStretch)'
                     % (options.enhancer_type))

    parentFolder = utils.formatFolder(options.output, True)
    mergeName = "%s_%s_merged" % (name1, name2)

    #option for median scaling
    medianScale = options.median
    plotBam = options.plot
    superOnly = not options.all

    if superOnly and plotBam:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (name1, name2, parentFolder))
    elif superOnly:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (name1, name2, parentFolder))
    elif plotBam:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (name1, name2, parentFolder))
    else:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (name1, name2, parentFolder))

    #part 1
    print("PART1: analyzing ROSE output from %s and %s" % (name1, name2))

    #start with the all enhancer tables from the initial rose calls
    roseFolder1 = pipeline_dfci.formatFolder(roseFolder1, False)
    roseFolder2 = pipeline_dfci.formatFolder(roseFolder2, False)
    roseDict1 = makeRoseDict(roseFolder1)
    roseDict2 = makeRoseDict(roseFolder2)

    if superOnly:
        print("ANALYZING ENHANCER TYPE: %s" % (enhancerCallType.upper()))

    #the typed enhancer tables are needed for the rank plots even with -a
    superFile1 = roseDict1[enhancerCallType]
    superFile2 = roseDict2[enhancerCallType]
    allFile1 = roseDict1['AllEnhancer']
    allFile2 = roseDict2['AllEnhancer']

    print('\tMERGING ENHANCERS AND CALLING ROSE')
    if superOnly:
        if not superFile1:
            print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType, roseFolder1))
            sys.exit()
        if not superFile2:
            print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType, roseFolder2))
            sys.exit()
        callMergeSupers(dataFile, superFile1, superFile2, name1, name2, mergeName, genome, parentFolder)
    else:
        callMergeSupers(dataFile, allFile1, allFile2, name1, name2, mergeName, genome, parentFolder)

    print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')

    #part2 is the R script
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (parentFolder, genome, mergeName)
    rcmd = callDeltaRScript(mergedGFFFile, parentFolder, dataFile, name1, name2, allFile1, allFile2, medianScale)
    print(rcmd)
    os.system(rcmd)

    # NOTE(review): fixed 30 second pause; presumably gives the R output time
    # to appear on disk before it is read - confirm
    time.sleep(30)
    callRoseGeneMapper(mergedGFFFile, genome, parentFolder, name1)

    #rank the genes

    #part 3
    #rank the delta
    print("PART 3: assinging ranks to differential enhancers")
    print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')

    gffName = '%s_%s_MERGED_REGIONS_-0_+0' % (genome, mergeName)
    enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB.txt" % (parentFolder, name1, gffName)
    if not utils.checkOutput(enhancerToGeneFile):
        print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
        sys.exit()
    rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB_RANK.txt" % (parentFolder, name1, gffName)
    assignEnhancerRank(enhancerToGeneFile, allFile1, allFile2, name1, name2, rankOutput)

    #make the rank plot
    print('MAKING RANK PLOTS')
    if not utils.checkOutput(rankOutput):
        print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
        sys.exit()
    rcmd = callRankRScript(rankOutput, name1, name2, superFile1, superFile2)
    print(rcmd)
    os.system(rcmd)

    time.sleep(30)

    print('FINISHING OUTPUT')
    finishRankOutput(dataFile, rankOutput, genome, parentFolder, mergeName, name1, name2, 1, 100000, superOnly, plotBam)
def main():
    '''
    main run function for the enhancer clustering analysis

    Parses the command line, then runs the pipeline steps in order:
    makeNameDict, launchEnhancerMapping, makeMedianDict, mergeCollections,
    mapMergedGFF, makeEnhancerSignalTable, callRScript and finally the
    ROSE2 gene mapper on the cluster table.

    -d and -o are required; without them the help text is printed and the
    program exits. The enhancer type and the tss window are validated before
    the output folder is created, so a bad value does not leave a partial
    output folder behind.
    '''
    from optparse import OptionParser

    usage = "usage: %prog [options] -d [DATA_FILE] -i [INPUT_LIST] -r [ROSE_FOLDER] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage=usage)

    #required flags
    parser.add_option("-d", "--data", dest="data", nargs=1, default=None,
                      help="Enter a data file for datasets to be processed")
    parser.add_option("-o", "--output", dest="output", nargs=1, default=None,
                      help="specify an output folder to write results to")

    #additional options
    parser.add_option("-i", "--input", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of names to analyze. Default will be all datasets")
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Enter a name for the analysis")
    parser.add_option("-r", "--rose", dest="rose", nargs=1, default=None,
                      help="Enter a folder to detect or write rose output")
    parser.add_option("-a", "--all", dest="all", action='store_true', default=False,
                      help="flag to run analysis on ALL enhancers (this is much slower)")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="specify a fixed stitch distance for all datasets, otherwise will compute stitching automatically on each dataset")
    parser.add_option("-e", "--enhancer-type", dest="enhancer_type", nargs=1, default='super',
                      help="specify type of enhancer to analyze: super, stretch, superStretch")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=2500,
                      help="specify a tss exclusion window. default is 2500bp")
    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help='Create a mask set of regions to filter out of analysis. must be .bed or .gff format')

    (options, args) = parser.parse_args()

    print(options)
    print(args)

    #check to see if minimum arguments are met
    if not (options.data and options.output):
        parser.print_help()
        sys.exit()

    #check enhancer type before touching the file system
    enhancerType = options.enhancer_type.lower()
    if enhancerType not in ('super', 'superstretch', 'stretch'):
        print("ERROR: unsupported enhancer type %s" % (enhancerType))
        sys.exit()

    #check for the tss parameter
    try:
        tssDistance = int(options.tss)
    except ValueError:
        parser.error('-t/--tss must be an integer, got %s' % (options.tss))

    #pull in the datafile
    dataFile = options.data

    #check and create the output folder
    outputFolder = utils.formatFolder(options.output, True)

    #now the rose folder; defaults to a rose/ folder inside the output folder
    if options.rose:
        roseFolder = options.rose
    else:
        roseFolder = "%srose/" % (outputFolder)

    if options.input:
        namesList = options.input.split(',')
    else:
        namesList = []

    #get the genome from the first dataset in the data table
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    if not dataDict:
        print("ERROR: no datasets found in data file %s" % (dataFile))
        sys.exit(1)
    genome = dataDict[list(dataDict.keys())[0]]['genome']

    #check if using only supers
    superOnly = not options.all

    #get the analysis name
    if options.name:
        analysisName = options.name
    else:
        analysisName = "enhancers"

    #stitching parameter; an empty string means no fixed stitch was given
    stitch = str(options.stitch)

    #see if there's a mask
    if options.mask:
        maskFile = options.mask
    else:
        maskFile = ''

    #=====================================================
    #=================SUMMARIZE INPUTS====================
    #=====================================================

    print("WORKING IN GENOME %s" % (genome))
    print("DRAWING DATA FROM %s AND ROSE FOLDER %s" % (dataFile, roseFolder))
    print("USING %s AS THE OUTPUT FOLDER" % (outputFolder))

    #=====================================================
    #==============ESTABLISH ALL WORKING FILES============
    #=====================================================

    print("\n\n\nESTABLISHING WORKING FILES")
    nameDict = makeNameDict(dataFile, roseFolder, namesList, enhancerType)

    print(nameDict)

    print("STARTING ANALYSIS ON THE FOLLOWING DATASETS:")
    print(list(nameDict.keys()))

    for name in nameDict.keys():
        #missing rose output is only reported here, not treated as fatal
        if len(nameDict[name]['enhancerFile']) == 0:
            print("NO ROSE OUTPUT FOR %s" % (name))

    #=====================================================
    #==============LAUNCH ENHANCER MAPPING================
    #=====================================================

    print("\n\n\nLAUNCHING ENHANCER MAPPING (IF NECESSARY)")
    nameDict = launchEnhancerMapping(dataFile, nameDict, outputFolder, roseFolder,
                                     stitch, tssDistance, enhancerType, maskFile)
    print(nameDict)

    #=====================================================
    #====================GET MEDIAN SIGNAL================
    #=====================================================

    print("\n\n\nGETTING MEDIAN ENHANCER SIGNAL FROM EACH SAMPLE")
    medianDict = makeMedianDict(nameDict)

    print(medianDict)

    #=====================================================
    #====================MERGING ENHANCERS================
    #=====================================================

    print("\n\n\nIDENTIFYING CONSENSUS ENHANCER REGIONS")

    mergedGFFFile = "%s%s_%s_-0_+0.gff" % (outputFolder, genome, analysisName)
    mergedGFFFile = mergeCollections(nameDict, analysisName, mergedGFFFile, superOnly)

    #=====================================================
    #===============MAP TO MERGED REGIONS=================
    #=====================================================

    print("\n\n\nMAPPING DATA TO CONSENSUS ENHANCER REGIONS")
    mergedRegionMap = mapMergedGFF(dataFile, nameDict, mergedGFFFile, analysisName,
                                   outputFolder, maskFile)

    #=====================================================
    #==============CORRECT FOR MEDIAN SIGNAL==============
    #=====================================================

    print("\n\n\nCREATING ENHANCER SIGNAL TABLE")
    signalTableFile = makeEnhancerSignalTable(nameDict, mergedRegionMap, medianDict,
                                              analysisName, genome, outputFolder)

    #=====================================================
    #===============CALL CLUSTERING R SCRIPT==============
    #=====================================================

    print("\n\n\nGENERATING CLUSTERING OUTPUT")
    clusterTableFile = callRScript(genome, outputFolder, analysisName, signalTableFile)
    #output should be
    #png of cluster gram with rows as genes
    #png of cluster gram of samples w/ tree
    #ordered table w/ cluster assignment
    #similarity matrix for samples

    #=====================================================
    #=============GENE MAPPING BY CLUSTER=================
    #=====================================================

    # NOTE(review): the pipeline location is hard coded to one user's home
    # directory; this step only works where that path exists
    os.chdir('/ark/home/cl512/pipeline/')
    cmd = 'python /ark/home/cl512/pipeline/ROSE2_geneMapper.py -g %s -i %s' % (genome, clusterTableFile)
    os.system(cmd)

    print("FINISHED")
def main():
    '''
    main run function

    Writes a bash script (<output folder><unique id>_bwt2.sh) containing the
    commands to map one fastq file with bowtie2: slurm header lines, temp
    folder creation, fastq extraction, optional fastqc, the bowtie call, bam
    generation, sorting and indexing, sam removal or retention, moving the
    bams and optional linking. The script is only written, not run.

    -f, -g, -u and -o are required; without them the help text is printed and
    the program exits. A genome with no entry in the bowtie index table is
    reported and the program exits with status 1 before the output folder is
    created.
    '''
    import sys
    from optparse import OptionParser

    usage = "usage: %prog [options] -f [FASTQFILE] -g [GENOME] -u [UNIQUEID] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage=usage)

    #required flags
    parser.add_option("-f", "--fastq", dest="fastq", nargs=1, default=None,
                      help="Enter the full path of a fastq file to be mapped")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="specify a genome, options are hg19,hg18, mm9 or geckov2 right now")
    parser.add_option("-u", "--unique", dest="unique", nargs=1, default=None,
                      help="specify a uniqueID")
    parser.add_option("-o", "--output", dest="output", nargs=1, default=None,
                      help="Specify an output folder")

    #optional arguments
    # NOTE(review): the default here is the integer 1 although the help text
    # describes a string of parameters - confirm what bowtieCmd expects
    parser.add_option("--param", dest="paramString", nargs=1, default=1,
                      help="A string of bowtie parameters")
    parser.add_option("--link-folder", dest="linkFolder", nargs=1, default=None,
                      help="Specify a folder to symlink the bam")
    parser.add_option("-p", "--paired", dest="paired", action='store_true', default=False,
                      help="Flag for paired end data")
    parser.add_option("-S", "--sam", dest="sam", action='store_true', default=False,
                      help="Flag to save sam")
    parser.add_option("-q", "--qc", dest="qc", action='store_true', default=False,
                      help="Flag to run fastqc")

    (options, args) = parser.parse_args()

    if not options.fastq or not options.genome or not options.unique or not options.output:
        parser.print_help()
        sys.exit()

    #retrieve the arguments
    fastqFile = options.fastq
    genome = options.genome.lower()
    uniqueID = options.unique

    #get the bowtie index
    bowtieDict = {
        'hg19': '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index/genome',
        'hg38': '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/genome',
        'mm10': '/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index/genome',
        'mm9': '/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm9/Sequence/Bowtie2Index/genome',
        'rn6': '/storage/cylin/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index/genome',
        'mm10_dm6': '/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index_dm6/genome.mm10.dm6',
        'hg19_dm6': '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index_dm6/genome.hg19.dm6',
        'hg19_ercc': '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index_ercc/human.hg19_ercc',
    }

    #fail with a readable message instead of a KeyError, and do it before
    #the output folder is created
    if genome not in bowtieDict:
        print('ERROR: unsupported genome %s. Supported genomes are: %s'
              % (genome, ', '.join(sorted(bowtieDict))))
        sys.exit(1)
    bowtieIndex = bowtieDict[genome]

    #make the output folder
    outputFolder = utils.formatFolder(options.output, True)

    #retrieve optional arguments
    paramString = options.paramString
    if options.linkFolder:
        linkFolder = options.linkFolder
    else:
        linkFolder = ''
    pairedEnd = options.paired

    #get the temp string
    tempString = '_%s' % str(random.randint(1, 10000))

    fileNameDict = makeFileNameDict(fastqFile, genome, tempString, tempParentFolder,
                                    outputFolder, linkFolder, uniqueID, pairedEnd)

    #timestamp used in the slurm log file names
    ts = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d_%Hh%Mm%Ss')

    #open the bashfile to write to
    bashFileName = "%s%s_bwt2.sh" % (outputFolder, uniqueID)

    #the with block closes the script even if a command builder raises
    with open(bashFileName, 'w') as bashFile:

        def writeCmd(cmd):
            #every command goes on its own line of the script
            bashFile.write(cmd + '\n')

        #shebang
        bashFile.write('#!/usr/bin/bash\n')

        #sbatch header; %j is appended after formatting so that it is not
        #treated as a python format specifier
        writeCmd('#SBATCH --output=/storage/cylin/grail/slurm_out/bwt2_%s_%s' % (uniqueID, timestamp) + '_%j.out # Standard output and error log')
        writeCmd('#SBATCH -e /storage/cylin/grail/slurm_out/bwt2_%s_%s' % (uniqueID, timestamp) + '_%j.err # Standard output and error log')

        writeCmd('pwd; hostname; date')
        bashFile.write('\n\n\n')

        #make temp directory
        writeCmd('mkdir %s' % (fileNameDict['tempFolder']))

        #extract fastq
        writeCmd(extractFastqCmd(fileNameDict, pairedEnd))

        #call fastqc
        if options.qc:
            writeCmd(runFastQC(fastqcString, fileNameDict, pairedEnd))

        #call bowtie
        print('THIS IS THE PARAM STRING')
        print(paramString)
        writeCmd(bowtieCmd(bowtieString, paramString, bowtieIndex, fileNameDict, pairedEnd))

        #remove temp fastq
        writeCmd(removeTempFastqCmd(fileNameDict, pairedEnd))

        #generate a bam
        writeCmd(generateTempBamCmd(samtoolsString, fileNameDict))

        #change into the temp directory
        writeCmd(changeTempDir(fileNameDict))

        #sort the bam
        writeCmd(sortBamCmd(samtoolsString, fileNameDict))

        #index
        writeCmd(indexBamCmd(samtoolsString, fileNameDict))

        #either move the sam (-S) or remove it
        if options.sam:
            writeCmd(mvSamCmd(fileNameDict))
        else:
            writeCmd(rmSamCmd(fileNameDict))

        #mv bams
        writeCmd(mvBamCmd(fileNameDict))

        #link bams
        if options.linkFolder:
            writeCmd(linkBamCmd(fileNameDict))

        #cleanup of the temp files is currently not written to the script

    print("Wrote mapping command to %s" % (bashFileName))
def callGSEA(outputFolder,
             analysisName,
             top,
             analysis_type='enhancer_vs_promoter',
             use_top=True):
    '''
    runs C2 GSEA

    Writes <outputFolder><analysisName>_GSEA<suffix>_cmd.sh containing one
    GSEA command line for the "top_all" gct/cls pair and, when use_top is
    true, a second one for the "top_<top>" pair, then runs the script with
    bash.

    outputFolder: folder prefix used for the gct/cls inputs, the bash script
        and the gsea output folders (used as a plain string prefix)
    analysisName: name used in the input file names and report labels
    top: label of the "top N" gct/cls files, only used when use_top is true
    analysis_type: 'enhancer_vs_promoter' or 'total_contribution'; selects the
        file name suffix and the class comparison appended to the cls path
    use_top: if false only the "top_all" command is written

    An unsupported analysis_type prints the supported types and exits the
    program. Returns None.
    '''

    #figure out the suffix for gct and cls files
    #each entry is [file name suffix, class comparison appended to the cls path]
    analysis_dict = {
        'enhancer_vs_promoter': ['', '#PROMOTER_versus_DISTAL'],
        'total_contribution': ['_total_contrib', '#SIGNAL_versus_BACKGROUND'],
    }

    if analysis_type not in analysis_dict:
        print('Error: please use one of the following supported analysis types')
        print(list(analysis_dict.keys()))
        sys.exit()

    suffix, clsComparison = analysis_dict[analysis_type]

    gseaPath = '/storage/cylin/home/cl6/gsea2-3.0_beta_2.jar'
    gmxPath = '/storage/cylin/grail/annotations/gsea/c2.all.v5.1.symbols.gmt'  #C2 set

    #one template for both the "all" and the "top N" call
    gseaCmdTemplate = 'java -Xmx4000m -cp %s xtools.gsea.Gsea -res %s -cls %s%s -gmx %s -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute gene_set -rnd_type no_balance -scoring_scheme weighted -rpt_label %s -metric Diff_of_Classes -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 20 -rnd_seed timestamp -save_rnd_lists false -set_max 500 -set_min 15 -zip_report false -out %s -gui false'

    #always run on all, optionally also on the top N
    topLabels = ['all']
    if use_top:
        topLabels.append(top)

    gseaBashFilePath = '%s%s_GSEA%s_cmd.sh' % (outputFolder, analysisName, suffix)

    #the with block guarantees the script is flushed and closed before it runs
    with open(gseaBashFilePath, 'w') as gseaBashFile:
        gseaBashFile.write('#!/usr/bin/bash\n\n')
        gseaBashFile.write('#COMMAND LINE GSEA CALLS FOR %s USING %s COMPARISON\n\n' %
                           (analysisName, analysis_type.upper()))

        for topLabel in topLabels:
            gctPath = '%s%s_top_%s%s.gct' % (outputFolder, analysisName, topLabel, suffix)
            clsPath = '%s%s_top_%s%s.cls' % (outputFolder, analysisName, topLabel, suffix)
            gseaOutputFolder = utils.formatFolder(
                '%sgsea_top_%s_c2%s' % (outputFolder, topLabel, suffix), True)
            rptLabel = '%s_top_%s%s' % (analysisName, topLabel, suffix)

            gseaCmd = gseaCmdTemplate % (gseaPath, gctPath, clsPath, clsComparison,
                                         gmxPath, rptLabel, gseaOutputFolder)
            gseaBashFile.write(gseaCmd)
            gseaBashFile.write('\n')

    os.system('bash %s' % (gseaBashFilePath))
def makeNameDict(dataFile,roseFolder,namesList=[],enhancerType='super'):
    '''
    for each name, check for the presence of an enriched file or allEnhancer table
    these are the files required for enhancer clustering

    dataFile: path of the data table, loaded with pipeline_dfci.loadDataTable
    roseFolder: folder holding one <name>_ROSE folder per dataset; it is
        created if it does not exist
    namesList: dataset names to use; when empty every dataset whose upper
        cased name contains neither WCE nor INPUT is used (the default list
        is never modified)
    enhancerType: 'super', 'stretch' or 'superstretch'; selects which rose
        enhancer table is looked for

    Returns a dictionary keyed by dataset name. Each value holds
    'background' (True if the dataset's background is in the data table),
    'enrichedFile' and 'enhancerFile' (full paths, or '' when not found).

    Exits the program if the enhancer type is unsupported, if a name is not
    in the data table, or if a dataset has neither file.
    '''

    #map each supported enhancer type to the table name to look for
    #resolving this first means an unsupported type is reported clearly
    #instead of surfacing later as an unbound variable
    enhancerStringDict = {
        'super': 'AllEnhancers.table.txt',
        'stretch': 'AllEnhancers_Length.table.txt',
        'superstretch': 'AllEnhancers_SuperStretch.table.txt',
    }
    if enhancerType not in enhancerStringDict:
        print("ERROR: unsupported enhancer type %s" % (enhancerType))
        sys.exit(1)
    enhancerString = enhancerStringDict[enhancerType]

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #draw the parent folder from the dataFile
    parentFolder = utils.getParentFolder(dataFile)
    print("Using %s as the parent folder" % (parentFolder))

    #check to see if a rose folder exists already, otherwise create it
    existingRoseFolder = utils.formatFolder(roseFolder,False)
    if existingRoseFolder:
        roseExists = True
        roseFolder = existingRoseFolder
    else:
        roseExists = False
        roseFolder = utils.formatFolder(roseFolder,True)

    #if no namesList is given, use every dataset and filter out WCE and INPUT
    if len(namesList) == 0:
        namesList = [name for name in dataDict.keys()
                     if 'WCE' not in name.upper() and 'INPUT' not in name.upper()]

    #now check that all of the datasets at a minimum have a rose output OR enriched region file
    nameDict = {}
    for name in namesList:

        #a requested name has to be present in the data table
        if name not in dataDict:
            print("ERROR: dataset %s not found in data file %s" % (name,dataFile))
            sys.exit(1)

        nameDict[name] = {}

        #check if each dataset has a background
        backgroundName = dataDict[name]['background']
        nameDict[name]['background'] = backgroundName in dataDict

        #assumes standard folder structure for enriched file
        enrichedFile = "%smacsEnriched/%s" % (parentFolder,dataDict[name]['enrichedMacs'])

        print("Looking for macs output at %s" % (enrichedFile))

        #the file counts as present only if it can actually be opened
        try:
            with open(enrichedFile,'r'):
                pass
            nameDict[name]['enrichedFile'] = enrichedFile
        except IOError:
            nameDict[name]['enrichedFile'] = ''

        #roseOutput looks for standard format rose output
        #need an allEnhancers table and a region table to proceed
        #if the rose folder doesn't exist, don't bother
        nameDict[name]['enhancerFile'] = ''
        if roseExists:
            try:
                roseOutputFiles = os.listdir("%s%s_ROSE" % (roseFolder,name))
            except OSError:
                #no rose folder for this dataset
                roseOutputFiles = []

            #no weird hidden or temp files
            allEnhancerFileList = [x for x in roseOutputFiles
                                   if x.count(enhancerString) == 1 and not x.startswith('.')]
            if len(allEnhancerFileList) > 0:
                nameDict[name]['enhancerFile'] = "%s%s_ROSE/%s" % (roseFolder,name,allEnhancerFileList[0])

        if nameDict[name]['enhancerFile'] == '' and nameDict[name]['enrichedFile'] == '':
            print("INSUFFICIENT DATA TO RUN ENAHNCER ANALYSIS ON %s. PLEASE MAKE SURE ROSE OUTPUT OR MACS ENRICHED REGION PEAKS FILE EXISTS" % (name))
            print(nameDict[name])
            sys.exit()

    return nameDict
def main():
    '''
    main run function

    Command line entry point for the dynamic enhancer analysis of two
    datasets using previously generated ROSE output. After parsing the
    options it checks that the merged ENHANCER_TO_GENE output exists, runs
    the command returned by callDeltaRScript and callRoseGeneMapper, assigns
    enhancer ranks, runs the command returned by callRankRScript and calls
    finishRankOutput.

    -g, -d, -r and -o are required; without them the help text is printed
    and the program exits. -r (and -n when given) must contain exactly two
    comma separated values, otherwise the option parser reports the problem
    and exits with status 2. Without -n the dataset names are derived from
    the last component of each rose folder with '_ROSE' removed.
    '''
    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] -r [ROSE_FOLDERS] -o [OUTPUT_FOLDER]"
    parser = OptionParser(usage=usage)

    #required flags
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (HG18,HG19,MM9) for the project")
    parser.add_option("-d", "--data", dest="data", nargs=1, default=None,
                      help="Enter the data file for the project")
    parser.add_option("-r", "--rose", dest="rose", nargs=1, default=None,
                      help="Enter a comma separated list of rose folder")
    parser.add_option("-o", "--output", dest="output", nargs=1, default=None,
                      help="Enter the output folder for the project")

    #additional options
    parser.add_option("-n", "--names", dest="names", nargs=1, default=None,
                      help="Enter a comma separated list of names to go with the datasets")
    parser.add_option("-p", "--plot", dest="plot", action='store_true', default=False,
                      help="If flagged, will plot differential regions")
    parser.add_option("-a", "--all", dest="all", action='store_true', default=False,
                      help="If flagged, will run analysis for all enhancers and not just supers.")

    (options, args) = parser.parse_args()

    print(options)
    print(args)

    if not (options.genome and options.data and options.rose and options.output):
        parser.print_help()
        sys.exit()

    genome = options.genome.upper()
    dataFile = options.data

    #validate the pairs before the output folder is created
    roseFolderList = options.rose.split(',')
    if len(roseFolderList) != 2:
        parser.error('-r/--rose requires exactly two comma separated rose folders, got %s'
                     % (len(roseFolderList)))
    roseFolder1, roseFolder2 = roseFolderList

    if options.names:
        nameList = options.names.split(',')
        if len(nameList) != 2:
            parser.error('-n/--names requires exactly two comma separated names, got %s'
                         % (len(nameList)))
        name1, name2 = nameList
    else:
        #derive the names from the folder names; trailing slashes are
        #stripped first so that 'x_ROSE/' gives 'x' and not an empty name
        name1 = roseFolder1.rstrip('/').split('/')[-1].replace('_ROSE', '')
        name2 = roseFolder2.rstrip('/').split('/')[-1].replace('_ROSE', '')

    parentFolder = utils.formatFolder(options.output, True)
    mergeName = "%s_%s_merged" % (name1, name2)

    plotBam = options.plot
    superOnly = not options.all

    if superOnly and plotBam:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (name1, name2, parentFolder))
    elif superOnly:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (name1, name2, parentFolder))
    elif plotBam:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (name1, name2, parentFolder))
    else:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (name1, name2, parentFolder))

    #part 1
    print("PART1: analyzing ROSE output from %s and %s" % (name1, name2))

    #start with the all enhancer tables from the initial rose calls
    roseFolder1 = pipeline_dfci.formatFolder(roseFolder1, False)
    roseFolder2 = pipeline_dfci.formatFolder(roseFolder2, False)
    superFile1 = '%s%s_peaks_SuperEnhancers.table.txt' % (roseFolder1, name1)
    superFile2 = '%s%s_peaks_SuperEnhancers.table.txt' % (roseFolder2, name2)

    # NOTE(review): these paths add a '/' that the super enhancer paths above
    # do not; presumably harmless if formatFolder returns a trailing slash - confirm
    allFile1 = '%s/%s_peaks_AllEnhancers.table.txt' % (roseFolder1, name1)
    allFile2 = '%s/%s_peaks_AllEnhancers.table.txt' % (roseFolder2, name2)

    print('\tMERGING ENHANCERS AND CALLING ROSE')

    #the merge call itself is disabled in this version, so the merged rose
    #output checked below has to exist already
    if superOnly:
        mergeType = 'SUPERS'
    else:
        mergeType = 'ENHANCERS'

    mergedGFFFile = '%s%s_%s_MERGED_%s_-0_+0.gff' % (parentFolder, genome, mergeName, mergeType)
    superOutput = "%s%s_ROSE/%s_%s_MERGED_%s_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt" % (parentFolder, name1, genome, mergeName, mergeType)

    print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')
    if not utils.checkOutput(superOutput):
        print('ERROR: ROSE CALL FAILED')
        sys.exit()

    #part2 is the R script
    rcmd = callDeltaRScript(mergedGFFFile, parentFolder, name1, name2)
    print(rcmd)
    os.system(rcmd)

    # NOTE(review): fixed 30 second pause; presumably gives the R output time
    # to appear on disk before it is read - confirm
    time.sleep(30)
    callRoseGeneMapper(mergedGFFFile, genome, parentFolder, name1)

    #rank the genes

    #part 3
    #rank the delta
    print("PART 3: assinging ranks to differential enhancers")
    print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')

    gffName = '%s_%s_MERGED_%s_-0_+0' % (genome, mergeName, mergeType)
    enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB.txt" % (parentFolder, name1, gffName)
    if not utils.checkOutput(enhancerToGeneFile):
        print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
        sys.exit()
    rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB_RANK.txt" % (parentFolder, name1, gffName)
    assignEnhancerRank(enhancerToGeneFile, allFile1, allFile2, name1, name2, rankOutput)

    #make the rank plot
    print('MAKING RANK PLOTS')
    if not utils.checkOutput(rankOutput):
        print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
        sys.exit()
    rcmd = callRankRScript(rankOutput, name1, name2, superFile1, superFile2)
    print(rcmd)
    os.system(rcmd)

    time.sleep(30)

    print('FINISHING OUTPUT')
    finishRankOutput(dataFile, rankOutput, genome, parentFolder, mergeName, name1, name2, 1, 100000, superOnly, plotBam)
def main():
    '''
    main run function for the CRC analysis

    Parses the command line and then runs the analysis in five stages:
      I.   argument parsing and data summary
      II.  mapping genes and enhancers (geneToEnhancerDict), writing the
           gene, enhancer and summary tables to the output folder
      III. finding valleys from the bams, or filtering the provided subpeaks,
           and writing the subpeak fasta
      IV.  motif finding (makeMotifBackground, findMotifs, collapseFimo)
      V.   building the network (buildGraph, formatNetworkOutput)

    -e, -g, -o and -n are required (enforced by argparse). Either -b or -s
    has to be provided as well; otherwise an error is printed and the
    program exits. The program always ends through sys.exit().
    '''
    import argparse

    #argparse adds the "usage: " prefix itself and fills in %(prog)s
    parser = argparse.ArgumentParser(
        usage="%(prog)s [options] -e [ENHANCER_FILE] -b [BAM_FILE] -g [GENOME] -o [OUTPUTFOLDER] -n [NAME]")

    #required flags
    parser.add_argument("-e", "--enhancer_file", dest="enhancers", default=None, type=str,
                        help="Provide a ROSE generated enhancer table (_AllEnhancers.table.txt)", required=True)
    parser.add_argument("-g", "--genome", dest="genome", default=None, type=str,
                        help="Provide the build of the genome to be used for the analysis. Currently supports HG19, HG18 and MM9", required=True)
    parser.add_argument("-o", "--output", dest="output", default=None, type=str,
                        help="Enter an output folder", required=True)
    parser.add_argument("-n", "--name", dest="name", default=None, type=str,
                        help="Provide a name for the job", required=True)

    #you either need bams for valleys or subpeaks
    parser.add_argument("-b", "--bam", dest="bam", default=None, type=str,
                        help="Enter a comma separated list of bams of valley finding", required=False)
    parser.add_argument("-s", "--subpeaks", dest="subpeaks", default=None, type=str,
                        help="Enter a BED file of regions to search for motifs", required=False)

    #additional options
    # NOTE(review): -B, -N, --motifs and -t are parsed but not read anywhere
    # in this function
    parser.add_argument("-a", "--activity", dest="activity", default=None, type=str,
                        help="A table with active gene names in the first column", required=False)
    parser.add_argument("-l", "--extension-length", dest="extension", default=100, type=int,
                        help="Enter the length to extend subpeak regions for motif finding. default is 100", required=False)
    parser.add_argument("-B", "--background", dest="background", default=None, type=str,
                        help="Provide a background BAM file", required=False)
    #the destination of the -N option is kept distinct from that of the -E option
    parser.add_argument("-N", "--number", dest="number", default=1, type=int,
                        help="Enter the number of non overlapping motifs in a region required to assign a binding event. Default=1", required=False)
    parser.add_argument("--motifs", dest="motifs", default=False, type=str,
                        help="Enter additional PWM file for the analysis", required=False)
    parser.add_argument("-t", "--tfs", dest="tfs", default=None, type=str,
                        help="Enter additional TFs (comma separated) to be used in the bindinf analysis", required=False)
    parser.add_argument("--config", dest="config", default='', type=str,
                        help="Enter genome configuration file to overwrite default paths", required=False)

    args = parser.parse_args()

    #=====================================================================================
    #===============================I. PARSING ARGUMENTS==================================
    #=====================================================================================

    print(args)

    #an empty value counts as missing, and the check runs before the genome
    #is loaded so that a bad command line fails immediately
    if not args.bam and not args.subpeaks:
        print('ERROR: Must provide either bams for valley finding or subpeaks as a .bed')
        sys.exit()

    ###
    # Define all global file names
    ###
    genome = loadGenome(args.genome, args.config)

    motifDatabaseFile = genome.returnFeature('motif_database')
    motifConvertFile = genome.returnFeature('motif_convert')

    # User input files
    enhancer_file = args.enhancers

    #set the subpeak file
    if args.subpeaks:
        subpeakFile = args.subpeaks
    else:
        subpeakFile = None

    #will need to fix bams down the line to take in multiple bams
    if args.bam:
        bamFileList = [bam_path for bam_path in args.bam.split(',') if len(bam_path) > 0]
        print(bamFileList)
    else:
        bamFileList = []

    #output folder and analysis name
    print(args.output)
    output_folder = utils.formatFolder(args.output, True)
    analysis_name = args.name

    #optional arguments
    #activity path
    activity_path = args.activity

    #motif extension
    constExtension = args.extension

    print('\n\n#======================================\n#===========I. DATA SUMMARY============\n#======================================\n')

    print('Analyzing TF connectivity for %s' % (analysis_name))
    print('Writing output to %s' % (output_folder))
    if subpeakFile:
        print('Using %s to define subpeaks for motif finding' % (subpeakFile))
    else:
        print('Identifying valleys from .bam files')
    print('Using %s to define active genes' % (activity_path))

    #=====================================================================================
    #=======================II. IDENTIFYING CANDIDATE TFS AND NODES=======================
    #=====================================================================================

    print('\n\n#======================================\n#===II. MAPPING GENES AND ENHANCERS====\n#======================================\n')

    (geneTable, geneTFTable, enhancerTable, enhancerTFTable, geneSummaryTable,
     candidate_tf_list, gene_to_enhancer_dict) = geneToEnhancerDict(genome, enhancer_file, activity_path)

    #write these guys to disk
    gene_out = '%s%s_GENE_TABLE.txt' % (output_folder, analysis_name)
    gene_tf_out = '%s%s_GENE_TF_TABLE.txt' % (output_folder, analysis_name)

    enhancer_out = '%s%s_ENHANCER_TABLE.txt' % (output_folder, analysis_name)
    enhancer_tf_out = '%s%s_ENHANCER_TF_TABLE.txt' % (output_folder, analysis_name)

    summary_out = '%s%s_GENE_SUMMARY.txt' % (output_folder, analysis_name)

    utils.unParseTable(enhancerTable, enhancer_out, '\t')
    utils.unParseTable(enhancerTFTable, enhancer_tf_out, '\t')

    utils.unParseTable(geneTable, gene_out, '\t')
    utils.unParseTable(geneTFTable, gene_tf_out, '\t')

    utils.unParseTable(geneSummaryTable, summary_out, '\t')

    print('Identified %s genes w/ proximal cis-regulatory elements' % (len(gene_to_enhancer_dict)))
    print('Identified %s candidate TFs' % (len(candidate_tf_list)))
    print(candidate_tf_list)

    #=====================================================================================
    #==========================III. FINDING VALLEYS/SUBPEAKS==============================
    #=====================================================================================

    print('\n\n#======================================\n#=====III. FINDING VALLEYS/SUBPEAKS====\n#======================================\n')

    #so here we would need to find valleys everywhere
    if subpeakFile is None:
        print('finding valleys')
        #note: the tf_bed_path is for networks, all is for out degree finding
        all_bed_path = findValleys(gene_to_enhancer_dict, bamFileList, analysis_name, output_folder, cutoff=0.2)
    else:
        print('Using subpeaks from %s' % (subpeakFile))
        all_bed_path = filterSubpeaks(subpeakFile, gene_to_enhancer_dict, analysis_name, output_folder)

    #first make the subpeak bed and subpeak fasta for the tfs
    all_sub_bed, all_fasta = generateSubpeakFASTA(gene_to_enhancer_dict, all_bed_path, genome,
                                                  analysis_name, output_folder, constExtension)
    if subpeakFile is None:
        #this is the case where we did valleys
        #only reason you would need to output the sub bed
        all_sub_out = '%s%s_all_subpeak.bed' % (output_folder, analysis_name)
        utils.unParseTable(all_sub_bed, all_sub_out, '\t')

    #writing the all subpeak fasta out to disk
    all_fasta_out = '%s%s_all_subpeak.fasta' % (output_folder, analysis_name)
    utils.unParseTable(all_fasta, all_fasta_out, '')

    #=====================================================================================
    #=================================IV. FINDING MOTIFS==================================
    #=====================================================================================

    print('\n\n#======================================\n#======IV. RUNNING MOTIF FINDING=======\n#======================================\n')

    #first make background
    bg_path = makeMotifBackground(all_fasta_out, output_folder, analysis_name)

    #find motifs for all regions
    fimo_out = findMotifs(all_fasta_out, bg_path, candidate_tf_list, output_folder,
                          analysis_name, motifConvertFile, motifDatabaseFile)

    edgeDict = collapseFimo(fimo_out, gene_to_enhancer_dict, candidate_tf_list,
                            output_folder, analysis_name, motifConvertFile)

    #=====================================================================================
    #============================V. RUNNING NETWORK ANALYSIS==============================
    #=====================================================================================

    print('\n\n#======================================\n#========V. BUILDING NETWORK===========\n#======================================\n')
    print('building graph and edge table')
    graph = buildGraph(edgeDict, gene_to_enhancer_dict, output_folder, analysis_name, cutoff=1)

    formatNetworkOutput(graph, output_folder, analysis_name, candidate_tf_list)

    print('FINISHED RUNNING CRC FOR %s' % (analysis_name))

    sys.exit()
def main():

    '''
    main run function

    Parses the command line and scores a GeCKO screen from a test and a
    control bam:
      1. resolves dataset names, scoring method, annotation file and
         output folder
      2. reads the total read count of each bam (reported in millions)
      3. runs samtools idxstats on each bam and waits for the output
      4. builds the fold table and the RIGER table, then hands off to
         callRiger

    Prints the option help and exits if any of -t, -c or -g is missing,
    if -n is not a comma separated pair, or if the scoring method is
    unknown. Exits with status 1 if the genome build is not supported or
    if an idxstats file could not be generated.
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -t [TEST_BAM] -c [CONTROL_BAM] -g [GENOME]"
    parser = OptionParser(usage = usage)

    #required flags
    parser.add_option("-t","--test", dest="test",nargs = 1, default=None,
                      help = "Enter the full path of the test bam")
    parser.add_option("-c","--control", dest="control",nargs = 1, default=None,
                      help = "Enter the full path of the control bam")
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
                      help = "Enter the build for the GeCKO library (currently only supports geckov2)")

    #optional arguments
    parser.add_option("-n","--name",dest="name",nargs =1, default = 0,
                      help = "Comma separated test,control name")
    parser.add_option("-s","--scoring",dest="scoring",nargs =1, default = 'WtSum',
                      help = "Scoring method (KSbyScore,WtSum,SecondBestRank) defulat: WtSum")
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the full path of the output folder. Default is the current working directory")

    (options,args) = parser.parse_args()

    #three required parameters to get started
    if not (options.test and options.control and options.genome):
        parser.print_help()
        sys.exit()

    #get the names of the datasets
    if options.name:
        nameList = options.name.split(',')
        if len(nameList) != 2:
            print("ERROR: Must provide a comma separated test,control name if using -n flag")
            parser.print_help()
            sys.exit()
        testName,controlName = nameList
    else:
        #try to extract names from file
        #strip extension from filename
        testName = options.test.split('/')[-1].split('.')[0]
        controlName = options.control.split('/')[-1].split('.')[0]

    #names
    print("using %s as name for test dataset" % (testName))
    print("using %s as name for control dataset" % (controlName))

    #get the analysis name
    analysisName = '%s_%s' % (testName,controlName)
    print("using %s as analysis name" % (analysisName))

    #get the scoring method
    scoringMethod = options.scoring
    if scoringMethod not in ('KSbyScore','WtSum','SecondBestRank'):
        print("ERROR: please specify one of the following scoring methods:('KSbyScore','WtSum','SecondBestRank') or leave blank (default WtSum)")
        parser.print_help()
        sys.exit()

    #get the right annotation
    #validated before any folder is created so that a bad -g value fails
    #cleanly instead of raising a KeyError half way through the run
    genomeDict = {'geckov2':'/grail/genomes/gecko/GeCKOv2/Annotation/Human_GeCKOv2_Library.txt',
                  }
    genomeKey = options.genome.lower()
    if genomeKey not in genomeDict:
        print("ERROR: unsupported genome build %s. Supported builds: %s" % (options.genome,','.join(sorted(genomeDict))))
        parser.print_help()
        sys.exit(1)

    #set up output folder
    if options.output:
        outputFolder = utils.formatFolder(options.output,True)
    else:
        outputFolder = utils.formatFolder('./%s/' % (analysisName),True)

    print("using %s as an output folder" % (outputFolder))

    #load the annotation dictionary
    annotFile = genomeDict[genomeKey]
    print("using %s as the annotation file" % (annotFile))

    #guideDict,geneDict = makeAnnotDict(annotFile)

    #now set up each bam
    testBam = utils.Bam(options.test)
    controlBam = utils.Bam(options.control)

    #get the MMR for each (total reads divided by one million)
    testMMR = round(float(testBam.getTotalReads())/1000000,4)
    controlMMR = round(float(controlBam.getTotalReads())/1000000,4)

    print("Test dataset: %s has an MMR of %s" % (testName,testMMR))
    print("Control dataset: %s has an MMR of %s" % (controlName,controlMMR))

    #now get the idxstats output
    #NOTE: the bam and output paths are pasted unquoted into a shell command
    #so paths containing spaces or shell metacharacters will not work
    testIdxFile = '%s%s_idxstats.txt' % (outputFolder,testName)
    testIdxCmd = '%s idxstats %s > %s' % (samtoolsString,options.test,testIdxFile)
    print("Test idxstats command:")
    print(testIdxCmd)
    os.system(testIdxCmd)

    controlIdxFile = '%s%s_idxstats.txt' % (outputFolder,controlName)
    controlIdxCmd = '%s idxstats %s > %s' % (samtoolsString,options.control,controlIdxFile)
    print("Control idxstats command:")
    print(controlIdxCmd)
    os.system(controlIdxCmd)

    print("Checking for output")
    #stop here if either idxstats file is missing, the fold table needs both
    if not utils.checkOutput(testIdxFile,0.1,5):
        print("ERROR: UNABLE TO GENERATE IDX OUTPUT FOR %s" % (options.test))
        sys.exit(1)
    print("Found test IdxStats file")

    if not utils.checkOutput(controlIdxFile,0.1,5):
        print("ERROR: UNABLE TO GENERATE IDX OUTPUT FOR %s" % (options.control))
        sys.exit(1)
    print("Found control IdxStats file")

    #now make the fold table
    foldTableFile =makeFoldTable(annotFile,analysisName,testName,controlName,testMMR,controlMMR,testIdxFile,controlIdxFile,outputFolder,epsilon = 1)

    print('writing output to %s' % (foldTableFile))

    print("MAING FRIGER TABLE")
    rigerTableFile = makeRigerTable(foldTableFile,output='')

    print('writing FRIGER table to %s' % (rigerTableFile))

    #return value (bash file name) is not used here
    callRiger(rigerTableFile,scoring=scoringMethod,output='',callRiger=True)
import subprocess

#==========================================================================
#============================PARAMETERS====================================
#==========================================================================

#project level settings, edit these for a new project
projectName = 'slam_seq'
genome ='hg38'
annotFile = '%s/annotation/%s_refseq.ucsc' % (pipeline_dir,genome)

#project folders
projectFolder = '/storage/cylin/grail/projects/%s' % (projectName) #PATH TO YOUR PROJECT FOLDER
#presumably creates the folder if missing and returns the normalized path
#with a trailing slash (the sub folder names below rely on it) - confirm in utils
projectFolder = utils.formatFolder(projectFolder,True)

#standard folder names
#these are only path strings, nothing is created on disk here
gffFolder ='%sgff/' % (projectFolder)
macsFolder = '%smacsFolder/' % (projectFolder)
macsEnrichedFolder = '%smacsEnriched/' % (projectFolder)
mappedEnrichedFolder = '%smappedEnriched/' % (projectFolder)
mappedFolder = '%smappedFolder/' % (projectFolder)
wiggleFolder = '%swiggles/' % (projectFolder)
metaFolder = '%smeta/' % (projectFolder)
metaRoseFolder = '%smeta_rose/' % (projectFolder)
roseFolder = '%srose/' % (projectFolder)
fastaFolder = '%sfasta/' % (projectFolder)
figuresFolder = '%sfigures/' % (projectFolder)
geneListFolder = '%sgeneListFolder/' % (projectFolder)
#NOTE: bedFolder used to be assigned twice in this section ('bed/' first,
#then 'beds/'); the first value was always overwritten before any use, so
#only the effective 'beds/' assignment is kept
bedFolder = '%sbeds/' % (projectFolder)