def main():
    """Check the ChIP and ATAC data tables, then call ROSE2 for MM1S.

    Changes into ``projectFolder``, runs ``pipeline_dfci.summary`` on both
    data tables and hands the H3K27AC and MED1 datasets to
    ``pipeline_dfci.callRose2``.
    """
    rule = '#======================================================================'

    def banner(title):
        # one print per line: padding, rule, title, rule, padding
        for text in ('\n\n', rule, title, rule, '\n\n'):
            print(text)

    print('main analysis for project %s' % (projectName))
    print('changing directory to project folder')
    os.chdir(projectFolder)

    banner('#==================I. LOADING DATA ANNOTATION TABLES===================')

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible (chip table first, then atac table)
    for table_path in (chip_data_file, atac_data_file):
        pipeline_dfci.summary(table_path)

    banner('#==========================II. CALLING ROSE2===========================')

    # folder with macs peak output beds
    peaks_dir = '%smacsEnriched/' % (projectFolder)
    # create a folder to store ROSE2 output
    rose_dir = utils.formatFolder('%srose/' % (projectFolder), True)
    # calling ROSE2 on H3K27AC and MED1 defined enhancers
    rose_names = ['MM1S_H3K27AC', 'MM1S_MED1']

    rose_options = dict(
        extraMap=[],
        inputFile='',
        tss=2500,
        stitch=12500,
        bashFileName='%sMM1S_ROSE_CALLS.sh' % (rose_dir),
        mask='%sgenomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed' % (projectFolder),
        useBackground=True,
    )
    pipeline_dfci.callRose2(chip_data_file, peaks_dir, rose_dir, rose_names,
                            **rose_options)
def main():
    """Enter the MYCN project folder and run the summary check on the mouse data table."""
    print('main analysis for MYCN project')
    print('changing directory to project folder')
    os.chdir(projectFolder)

    header_lines = (
        '\n\n',
        '#======================================================================',
        '#======================I, LOADING DATA ANNOTATION======================',
        '#======================================================================',
        '\n\n',
    )
    for text in header_lines:
        print(text)

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible
    # for ChIP-Seq
    pipeline_dfci.summary(mouse_dataFile)
def main():
    """Enter the project folder and run the summary check on ``data_file``."""
    print('main analysis for project %s' % (projectName))
    print('changing directory to project folder')
    os.chdir(projectFolder)

    header_lines = (
        '\n\n',
        '#======================================================================',
        '#======================I. LOADING DATA ANNOTATION======================',
        '#======================================================================',
        '\n\n',
    )
    for text in header_lines:
        print(text)

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible
    # for data file
    pipeline_dfci.summary(data_file)
def main():
    """Check the three RNA-Seq data tables and run the BE2C JQ1 line plots.

    The SHEP21 wrapper call is present but disabled in this script.
    """
    rule = '#======================================================================'

    def banner(title):
        # one print per line: padding, rule, title, rule, padding
        for text in ('\n\n', rule, title, rule, '\n\n'):
            print(text)

    print('main analysis for MYCN project')
    print('changing directory to project folder')
    os.chdir(projectFolder)

    banner('#======================I, LOADING DATA ANNOTATION======================')

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible
    # for RNA-Seq
    rna_tables = (
        shep21_rna_dataFile,
        be2c_rna_drug_dataFile,
        be2c_rna_twist_dataFile,
    )
    for table_path in rna_tables:
        pipeline_dfci.summary(table_path)

    banner('#===================II, RUNNING LINE PLOT SCRIPTS======================')

    # make the folder to store output figures
    utils.formatFolder('%sfigures/6_rna_line_plots/' % (projectFolder), True)

    # we have 3 RNA-Seq datasets
    # first is shep21 at the mycn conserved regions w/ the replicate dropped
    # and at shep21 defined regions
    #wrap_shep21()
    wrap_be2c_jq1()
def main():
    """Check the three RNA-Seq data tables and process the BE2C drug RNA-Seq bams.

    The two SHEP21 processing calls are present but disabled in this script.
    """
    rule = '#======================================================================'

    def banner(title):
        # one print per line: padding, rule, title, rule, padding
        for text in ('\n\n', rule, title, rule, '\n\n'):
            print(text)

    print('main analysis for MYCN project')
    print('changing directory to project folder')
    os.chdir(projectFolder)

    banner('#======================I, LOADING DATA ANNOTATION======================')

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible
    # for RNA-Seq
    rna_tables = (
        shep21_rna_dataFile,
        be2c_rna_drug_dataFile,
        be2c_rna_twist_dataFile,
    )
    for table_path in rna_tables:
        pipeline_dfci.summary(table_path)

    banner('#====================II. PROCESSING RNA_SEQ BAMS=======================')

    #shep21_bash_path = process_shep_rna(shep21_rna_dataFile,gtfFile)
    #shep21_drop_rep_bash_path = process_shep_rna_drop_rep(shep21_rna_dataFile,gtfFile)
    # the returned value is kept under its original name but not used further here
    be2c_drug_bash_path = process_be2c_drug_rna(be2c_rna_drug_dataFile,
                                                gtfFile)
def main():
    """Mouse (mm9) MYCN analysis driver.

    Only two stages execute: the data table summary (I) and the MYCN region
    GFF / overlap table stage (VI). Every other stage prints its banner and
    keeps its code commented out so it can be re-enabled by hand.
    """
    rule = '#======================================================================'

    def banner(title):
        # one print per line: padding, rule, title, rule, padding
        for text in ('\n\n', rule, title, rule, '\n\n'):
            print(text)

    print('main analysis for MYCN project')
    print('changing directory to project folder')
    os.chdir(projectFolder)

    banner('#======================I, LOADING DATA ANNOTATION======================')

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible
    # for ChIP-Seq
    pipeline_dfci.summary(mouse_dataFile)

    banner('#==========================II. CALLING MACS============================')

    # running peak finding using macs 1.4.2 on all chip datasets
    # this usually takes ~2-3 hours on a reasonably fast machine
    # a 3 hour time out on this entire operation is set
    # if peak calling takes longer than 3 hours, simply run the script again after completion
    #run_macs(mouse_dataFile)

    banner('#=================II. DEFINING ACTIVE GENES IN MOUSE===================')

    # here we will identify active promoters in various contexts as those with
    # an H3K27AC peak in the +/- 1kb tss region
    # UCSC refseq annotations are used for all genes
    #make_active_gene_lists(mouse_dataFile)

    banner('#==================III. CALLING ROSE TO MAP ENHANCERS==================')

    # #for SCG_H3K27AC
    # analysisName = 'SCG_H3K27AC'
    # namesList = ['SCG_H3K27Ac']
    # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList)

    # #for CG_H3K27AC
    # analysisName = 'CG_H3K27AC'
    # namesList = ['CG_H3K27Ac']
    # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList)

    # #for GANGLIA_H3K27AC
    # analysisName = 'GANGLIA_H3K27AC'
    # namesList = ['CG_H3K27Ac','SCG_H3K27Ac']
    # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList)

    # #for THMYCN
    # analysisName = 'THMYCN_H3K27AC'
    # namesList = ['THMYCN_139076_H3K27Ac','THMYCN_139423_H3K27Ac','THMYCN1_H3K27Ac']
    # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList)

    banner('#=================IV. LIFTING OVER NB CONSERVED REGIONS================')

    # #liftover a pair of gffs
    # #first convert to bed
    # nb_promoter_gff_path = '%sgff/HG19_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.gff' % (hg19_projectFolder)
    # nb_enhancer_gff_path = '%sgff/HG19_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.gff' % (hg19_projectFolder)
    # nb_promoter_bed_path ='%sbeds/HG19_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.bed' % (hg19_projectFolder)
    # nb_enhancer_bed_path ='%sbeds/HG19_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.bed' % (hg19_projectFolder)
    # nb_promoter_gff = utils.parseTable(nb_promoter_gff_path,'\t')
    # nb_enhancer_gff = utils.parseTable(nb_enhancer_gff_path,'\t')
    # utils.gffToBed(nb_promoter_gff,nb_promoter_bed_path)
    # utils.gffToBed(nb_enhancer_gff,nb_enhancer_bed_path)
    # print('converted NB conserved gffs to beds at %s and %s' % (nb_promoter_bed_path,nb_enhancer_bed_path))

    # #note, now you have to liftover manually to create beds
    # mm9_promoter_bed_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.bed' % (bedFolder)
    # mm9_enhancer_bed_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.bed' % (bedFolder)
    # mm9_promoter_gff_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.gff' % (gffFolder)
    # mm9_enhancer_gff_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.gff' % (gffFolder)
    # utils.bedToGFF(mm9_promoter_bed_path,mm9_promoter_gff_path)
    # utils.bedToGFF(mm9_enhancer_bed_path,mm9_enhancer_gff_path)
    # print('writing mm9 nb mycn sites to %s and %s' % (mm9_promoter_gff_path,mm9_enhancer_gff_path))

    banner('#======================V. MAPPING ENRICHED TO GFFS=====================')

    # setName = 'THMYCN'
    # gffList = [mm9_promoter_gff_path,mm9_enhancer_gff_path]
    # cellTypeList = ['THMYCN1','THMYCN2','THMYCN','CG','SCG']
    # mapList = ['CG_H3K27Ac',
    #            'SCG_H3K27Ac',
    #            'THMYCN1_H3K27Ac',
    #            'THMYCN_139423_H3K27Ac',
    #            'THMYCN_139076_H3K27Ac',
    #            ]
    # #pipeline_dfci.mapEnrichedToGFF(mouse_dataFile,setName,gffList,cellTypeList,macsEnrichedFolder,mappedEnrichedFolder,macs=True,namesList=mapList,useBackground=True)

    # #summarize info for venn diagrams for each
    # promoter_mapped_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000/MM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_THMYCN.txt' % (mappedEnrichedFolder)
    # promoter_venn_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_VENN.txt' % (tableFolder)
    # summarizeVenn(promoter_mapped_path,group_list = ['CG','THMYCN'],output=promoter_venn_path)

    # enhancer_mapped_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000/MM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_THMYCN.txt' % (mappedEnrichedFolder)
    # enhancer_venn_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_VENN.txt' % (tableFolder)
    # summarizeVenn(enhancer_mapped_path,group_list = ['CG','THMYCN'],output=enhancer_venn_path)

    banner('#=====================VI. MAKING MYCN REGIONS GFF======================')

    data_table = pipeline_dfci.loadDataTable(mouse_dataFile)
    mycn_datasets = (
        'THMYCN2_MYCN',
        'THMYCN_139076_MYCN',
        'THMYCN_139423_MYCN',
    )

    # pool the loci from every MYCN dataset's enriched-region file
    pooled_loci = []
    for dataset in mycn_datasets:
        enriched_path = '%s%s' % (macsEnrichedFolder,
                                  data_table[dataset]['enrichedMacs'])
        bound = utils.importBoundRegion(enriched_path, dataset)
        pooled_loci += bound.getLoci()

    pooled_collection = utils.LocusCollection(pooled_loci, 50)
    # NOTE(review): the return value is discarded here, exactly as in the
    # original. Confirm stitchCollection() stitches in place; if it returns a
    # new collection, the GFF below is built from unstitched loci.
    pooled_collection.stitchCollection()

    mycn_gff = utils.locusCollectionToGFF(pooled_collection)
    utils.unParseTable(mycn_gff,
                       '%sMM9_THMYCN_MYCN_-0_+0.gff' % (gffFolder),
                       '\t')

    # make collections
    promoters = utils.gffToLocusCollection(
        '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.gff' % (gffFolder))
    enhancers = utils.gffToLocusCollection(
        '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.gff' % (gffFolder))

    # make the overlap table: each region is counted once, with enhancer
    # overlap taking priority over promoter overlap
    n_promoter = 0
    n_enhancer = 0
    n_none = 0
    for row in mycn_gff:
        # search window is the region extended by 10kb on each side
        window = utils.Locus(row[0], int(row[3]) - 10000,
                             int(row[4]) + 10000, '.')
        if enhancers.getOverlap(window, 'both'):
            n_enhancer += 1
        elif promoters.getOverlap(window, 'both'):
            n_promoter += 1
        else:
            n_none += 1

    overlap_table = [
        ['PROMOTER', 'ENHANCER', 'NONE'],
        [n_promoter, n_enhancer, n_none],
    ]
    utils.unParseTable(overlap_table,
                       '%sMM9_THMYCN_OVERLAP.txt' % (tableFolder),
                       '\t')

    banner('#=====================VI. MAPPING GFFS FOR HEATMAP=====================')

    #map_for_heatmap(mouse_dataFile)

    banner('#=====================VII. AVERAGING MAPPED SIGNAL=====================')

    # set_list = ['GANGLIA_H3K27AC','THMYCN_H3K27AC','THMYCN_MYCN']
    # set_names = [
    #     ['CG_H3K27Ac','SCG_H3K27Ac'],
    #     ['THMYCN1_H3K27Ac','THMYCN_139423_H3K27Ac','THMYCN_139076_H3K27Ac'],
    #     ['THMYCN2_MYCN','THMYCN_139076_MYCN','THMYCN_139423_MYCN']
    #     ]
    # for i in range(len(set_list)):
    #     setName = set_list[i]
    #     names_list =set_names[i]
    #     print(setName)
    #     print(names_list)
    #     #for promoters
    #     mapped_list = ['%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000/MM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_%s.gff' % (mappedFolder,name) for name in names_list]
    #     output_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000/MM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_%s.gff' % (mappedFolder,setName)
    #     print(output_path)
    #     averagingMappedSignal(mapped_list,output_path,setName)
    #     #for enhancers
    #     mapped_list = ['%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000/MM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_%s.gff' % (mappedFolder,name) for name in names_list]
    #     output_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000/MM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_%s.gff' % (mappedFolder,setName)
    #     print(output_path)
    #     averagingMappedSignal(mapped_list,output_path,setName)

    banner('#=====================VIII. MAKING HEATMAPS/METAS======================')
def main():
    """Build the mm9 figure-region GFF and plot the mouse data over it.

    Converts the hg19 figure-gene GFF to a bed, reads the manually lifted
    mm9 bed back in as a GFF, appends a handful of hand-entered regions,
    writes the combined GFF and passes it to ``plot_mouse_genes``.
    """
    rule = '#======================================================================'

    def banner(title):
        # one print per line: padding, rule, title, rule, padding
        for text in ('\n\n', rule, title, rule, '\n\n'):
            print(text)

    print('main analysis for MYCN project')
    print('changing directory to project folder')
    os.chdir(projectFolder)

    banner('#======================I, LOADING DATA ANNOTATION======================')

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible
    # for ChIP-Seq
    pipeline_dfci.summary(mouse_dataFile)

    banner('#============II. MAKING A BED OUT OF HG19 FIGURE REGIONS===============')

    human_gff = utils.parseTable(
        '%sgff/HG19_NB_FIGURE_GENES.gff' % (hg19_projectFolder), '\t')
    print(human_gff)

    human_bed = utils.gffToBed(human_gff)
    print(human_bed)
    utils.unParseTable(
        human_bed,
        '%sbeds/HG19_NB_FIGURE_GENES.bed' % (hg19_projectFolder),
        '\t')

    # need to manually lift this over to mm9
    # https://genome.ucsc.edu/cgi-bin/hgLiftOver
    lifted_bed_path = '%sMM9_NB_FIGURE_GENES_LIFTOVER.bed' % (bedFolder)
    mm9_gff_path = '%sMM9_NB_FIGURE_GENES_LIFTOVER.gff' % (gffFolder)
    lifted_gff = utils.bedToGFF(lifted_bed_path)

    # now add some additional manual regions
    # each record is (chrom, name, start, stop, strand); the name fills
    # columns 2, 3 and 9 of the 9-column GFF line
    manual_regions = (
        ('chr12', 'TWIST1_ENHANCER', 34639818, 34656263, '-'),
        ('chr11', 'NPM1_PROMOTER_2', 33049820, 33065883, '+'),
        ('chr6', 'GATA2_ENHANCER', 88135802, 88159867, '+'),
        ('chr7', 'PHOX2A', 108964211, 108974610, '+'),
        ('chr15', 'LET7B', 85497440, 85538754, '+'),
        ('chr10', 'LIN28B', 45161233, 45217227, '-'),
    )
    extra_lines = [
        [chrom, label, label, start, stop, '', strand, '', label]
        for chrom, label, start, stop, strand in manual_regions
    ]

    utils.unParseTable(lifted_gff + extra_lines, mm9_gff_path, '\t')

    banner('#=======================III. PLOTTING DATA IN MOUSE====================')

    # plot mouse regions
    plot_mouse_genes(mouse_dataFile, mm9_gff_path)
def main():
    """Run MACS and ROSE on the human liver H3K27ac ChIP datasets.

    After the data table check, the H3K27ac (non-WCE) datasets are selected
    from the table, passed to ``pipeline_dfci.run_macs``, and then to
    ``define_enhancer_landscape``. The resulting bash script is executed
    only when the region map output is not already present.
    """
    rule = '#======================================================================'

    def banner(title):
        # one print per line: padding, rule, title, rule, padding
        for text in ('\n\n', rule, title, rule, '\n\n'):
            print(text)

    def select_k27ac():
        # reload the table and keep names containing '27ac' exactly once
        # and with no 'WCE' (case-insensitive) in them
        table = pipeline_dfci.loadDataTable(chip_data_file)
        picked = []
        for name in table.keys():
            if name.count('27ac') != 1:
                continue
            if name.upper().count('WCE') != 0:
                continue
            picked.append(name)
        return picked

    print('main analysis for project %s' % (projectName))
    print('changing directory to project folder')
    os.chdir(projectFolder)

    banner('#======================I. LOADING DATA ANNOTATION======================')

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible
    # for data file
    pipeline_dfci.summary(chip_data_file)

    banner('#==========================II. RUNNING MACS============================')

    #chip_list= [name for name in data_dict.keys() if name.upper().count('WCE') == 0]
    #print(chip_list)
    k27ac_for_macs = select_k27ac()
    pipeline_dfci.run_macs(chip_data_file, projectFolder, macsFolder,
                           macsEnrichedFolder, wiggleFolder, True,
                           k27ac_for_macs)

    banner('#===================III. CALL ROSE INDIVIDUALLY========================')

    analysis_name = 'HUMAN_LIVER_H3K27AC'
    # return value is only needed by the disabled callRose2 line below
    parentFolder = utils.formatFolder('%s%s' % (roseFolder, analysis_name),
                                      True)

    # the table is loaded a second time here, as in the original flow
    k27ac_for_rose = select_k27ac()

    #pipeline_dfci.callRose2(chip_data_file,macsEnrichedFolder,parentFolder,namesList=k27ac_list,extraMap = [],inputFile='',tss=2500,stitch='',bashFileName ='',mask=maskFile,useBackground=True,py27_path =py27_path)

    # run rose2 wrapper for both
    rose_outputs = define_enhancer_landscape(projectFolder, pipeline_dir,
                                             chip_data_file, analysis_name,
                                             k27ac_for_rose)
    enhancer_bashFileName, enhancer_region_map_path, names_list = rose_outputs
    print(enhancer_bashFileName, enhancer_region_map_path, names_list)

    # runs only if no output detected
    if utils.checkOutput(enhancer_region_map_path, 0, 0):
        return
    print(enhancer_bashFileName)
    os.system('bash %s' % (enhancer_bashFileName))
#namesList = dataDict.keys()
#print(namesList)

#==========================================================================
#=======================LOADING DATA ANNOTATION============================
#==========================================================================

# This section loads the data table. It must stay enabled (uncommented)
# for the rest of the script to work.

# load the table, show the dataset names it holds, then print its summary
dataDict = pipeline_dfci.loadDataTable(dataFile)
print(dataDict.keys())
pipeline_dfci.summary(dataFile)

#==========================================================================
#==========================CALLING BOWTIE==================================
#==========================================================================

# This section calls bowtie on raw read files to generate sorted and
# indexed bams in the bam folder.

#namesList = [] <- fill this in if you want to only map a subset of the data. otherwise leave blank

# set launch to False to debug
#pipeline_dfci.makeBowtieBashJobs(dataFile,namesList,launch=True)

#==========================================================================
#=============================CALL MACS====================================
#==========================================================================
def main():
    """Check the data tables, verify the CRC inputs exist, then call CRC for MM1S.

    The enhancer table, activity table, subpeak bed and config file are each
    checked with ``utils.checkOutput``. If any of them is missing, the script
    reports it and stops with a non-zero exit status, so that a calling
    shell or scheduler sees the failure. The original exited with status 0.
    """
    rule = '#======================================================================'

    def banner(title):
        # one print per line: padding, rule, title, rule, padding
        for text in ('\n\n', rule, title, rule, '\n\n'):
            print(text)

    print('main analysis for project %s' % (projectName))
    print('changing directory to project folder')
    os.chdir(projectFolder)

    banner('#==================I. LOADING DATA ANNOTATION TABLES===================')

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible (chip table first, then atac table)
    for table_path in (chip_data_file, atac_data_file):
        pipeline_dfci.summary(table_path)

    banner('#==========================III. CALLING CRC3===========================')

    # running circuitry on the consensus system
    # creates a sbatch bash script
    crc_folder = '%scrc/' % (projectFolder)
    analysis_name = 'MM1S'
    enhancer_path = '%srose/MM1S_H3K27AC_ROSE/MM1S_H3K27AC_peaks_SuperEnhancers_ENHANCER_TO_GENE.txt' % (projectFolder)
    subpeak_path = '%smacsEnriched/MM1S_ATAC.bt2.srt.rmdup.macs14_peaks.bed' % (projectFolder)
    activity_path = '%stables/MM1S_EXPRESSION_ACTIVITY.txt' % (projectFolder)
    config_path = '%scrc_config.txt' % (whereAmI)

    # extra args
    args = '--config %s' % (config_path)

    print('ESTABLISHING INPUT FILES')
    for file_path in [enhancer_path, activity_path, subpeak_path, config_path]:
        if not utils.checkOutput(file_path, 0.1, 0.1):
            print('UNABLE TO FIND %s' % (file_path))
            # a missing required input is a failure: exit non-zero
            sys.exit(1)
        print('FOUND %s' % (file_path))

    pipeline_dfci.call_crc(analysis_name, enhancer_path, subpeak_path,
                           activity_path, genome, crc_folder, args,
                           py27_path='')
def main():
    """Check the data tables, write the CRC genome config, and verify dependencies.

    Writes ``crc_config.txt`` under ``whereAmI`` with the genome directory
    and mask file for ``genome``, then confirms that ``bamliquidator``,
    ``bamliquidator_batch.py`` and ``fimo`` can each be found on the PATH.

    Raises:
        ValueError: if any of the three required executables is not found.
    """
    rule = '#======================================================================'

    def banner(title):
        # one print per line: padding, rule, title, rule, padding
        for text in ('\n\n', rule, title, rule, '\n\n'):
            print(text)

    print('main analysis for project %s' % (projectName))
    print('changing directory to project folder')
    os.chdir(projectFolder)

    banner('#==================I. LOADING DATA ANNOTATION TABLES===================')

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible (chip table first, then atac table)
    for table_path in (chip_data_file, atac_data_file):
        pipeline_dfci.summary(table_path)

    banner('#=====================II. CONFIGURING GENOME BUILD=====================')

    genome_directory = '%sgenomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/' % (projectFolder)
    mask_file = '%sgenomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed' % (projectFolder)

    # one BUILD:FIELD:PATH entry per line, each line a single column
    config_table = [
        ['BUILD:FIELD:PATH'],
        ['%s:%s:%s' % (genome, 'genome_directory', genome_directory)],
        ['%s:%s:%s' % (genome, 'mask_file', mask_file)],
    ]
    config_path = '%scrc_config.txt' % (whereAmI)
    utils.unParseTable(config_table, config_path, '\t')
    print('writing genome configuration to %s' % (config_path))

    banner('#==================III. DETECTING DEPENDENCIES=========================')

    # distutils is missing on newer interpreters (removed in Python 3.12);
    # fall back to shutil.which there. Both return None when nothing is found.
    try:
        from distutils.spawn import find_executable
    except ImportError:
        from shutil import which as find_executable

    # Try to find bamliquidator, bamliquidator_batch.py, and fimo.
    # Each tool is checked against its own lookup result; the original
    # re-tested the bamliquidator result for all three, so a missing
    # bamliquidator_batch.py or fimo was never reported.
    for tool in ('bamliquidator', 'bamliquidator_batch.py', 'fimo'):
        if find_executable(tool) is None:
            raise ValueError('%s not found in path' % (tool))
        print('found %s' % (tool))
def main():
    """NIBR keratinocyte ATAC-seq driver: table check and clustering script.

    Only the data table check (I) and the clustering stage execute; the
    mapping, riesling, bam formatting and peak calling stages keep their
    code commented out. The clustering stage writes a bash script that
    invokes ``clusterEnhancer.py`` on every dataset in the riesling data
    table; the script is written but not launched here.
    """
    rule = '#======================================================================'

    def banner(title):
        # one print per line: padding, rule, title, rule, padding
        for text in ('\n\n', rule, title, rule, '\n\n'):
            print(text)

    print('main analysis for NIBR keratinocyte project atac-seq analysis')
    print('changing directory to project folder')
    os.chdir(projectFolder)

    banner('#=======================I. CHECKING DATA TABLES========================')

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible
    pipeline_dfci.summary(atac_dataFile)

    banner('#=========================II. MAPPING FASTQS===========================')

    # #for atac need no unaligned and no discordant
    # #atac_params = '--end-to-end --sensitive --no-unal --no-discordant --mm --met-stderr --time'
    # #pipeline_dfci.makeBowtieBashJobsSlurm(atac_dataFile,namesList = [],launch=True,overwrite=False,pCount=16,paramString=atac_params)

    banner('#======================III. RUNNING RIESLING===========================')

    # #riesling is an ATAC-seq pipeline jointly developed by our lab and the Gordon lab at WUSTL
    # #it sanitizes the bams removing duplicate and mitochondrial reads
    # riesling_dir = utils.formatFolder('%sriesling/' % (projectFolder),True)
    # input_dir = utils.formatFolder('/storage/cylin/grail/bam/hg19/NIBR_YvsO/ATACseq_YvsO/')
    # output_dir = utils.formatFolder('/storage/cylin/grail/bam/hg19/NIBR_YvsO/ATACseq_YvsO/riesling/',True)
    # analysis_name = 'NIBR_ATAC_NEW'
    # riesling_bash_path = '%s%s_riesling.sh' % (riesling_dir,analysis_name)
    # riesling_bash = open(riesling_bash_path,'w')
    # riesling_bash.write('#!/usr/bin/bash\n\n')
    # #now write the sbatch headers
    # riesling_bash.write('#SBATCH -n 32\n#SBATCH --mem=512000\n')
    # riesling_bash.write('#SBATCH -o %s%s_reisling_slurm_%%j.out\n' % (riesling_dir,analysis_name))
    # riesling_bash.write('#SBATCH -e %s%s_reisling_slurm_%%j.err\n' % (riesling_dir,analysis_name))
    # riesling_bash.write('pwd; hostname; date\n\n')
    # riesling_bash.write('cd /storage/cylin/bin/riesling-pipeline/\n')
    # riesling_bash.write('%s 2-sanitize-bam.py -i %s -o %s -g %s -v\n' % (py27_path,input_dir, output_dir,genome))
    # riesling_bash.close()
    # print('writing riesling bam commands to %s' % (riesling_bash_path))

    banner('#=====================IV. FORMATTING RIESLING BAMS=====================')

    # atac_table = utils.parseTable(atac_dataFile,'\t')
    # #now fix the path to the right bam
    # for i in range(1,len(atac_table)):
    #     atac_table[i][0] = atac_table[i][0] + 'riesling/'
    # atac_dataFile_riesling = atac_dataFile.replace('.txt','_riesling.txt')
    # new_table = utils.unParseTable(atac_table,atac_dataFile_riesling,'\t')
    # #now we need to index all of the bams
    # dataDict = pipeline_dfci.loadDataTable(atac_dataFile_riesling)
    # names_list = dataDict.keys()
    # bam_directory = dataDict[names_list[0]]['folder']
    # bam_file_list = ['%s%s' % (bam_directory, x) for x in os.listdir(bam_directory) if x.split('.')[-1] == 'bam']
    # print(bam_file_list)
    # for bam_path in bam_file_list:
    #     index_cmd = 'samtools index %s' % (bam_path)
    #     print(index_cmd)
    #     os.system(index_cmd)

    banner('#========================V. RUNNING PEAK CALLING=======================')

    atac_dataFile_riesling = atac_dataFile.replace('.txt', '_riesling.txt')
    #run_macs(atac_dataFile_riesling,False)

    banner('#========================V. RUNNING CLUSTERING ========================')

    dataDict = pipeline_dfci.loadDataTable(atac_dataFile_riesling)
    # sorted() returns a list on both Python 2 and 3; calling .sort() on
    # the result of .keys() fails on Python 3, where keys() is a view
    atac_list = sorted(dataDict.keys())
    print(atac_list)

    analysis_name = 'keratinocyte_atac'
    cluster_folder = utils.formatFolder('%sclustering' % (projectFolder), True)
    cluster_rose_folder = utils.formatFolder(
        '%sclustering_rose' % (projectFolder), True)
    output_folder = utils.formatFolder(
        '%s%s_clustering' % (cluster_folder, analysis_name), True)

    names_string = ','.join(atac_list)
    cluster_cmd = '%s %sclusterEnhancer.py -d %s -i %s -r %s -o %s -e super -t 0 -n %s --mask %s' % (
        py27_path, pipeline_dir, atac_dataFile_riesling, names_string,
        cluster_rose_folder, output_folder, analysis_name, maskFile)

    cluster_bash_path = '%s%s_clustering.sh' % (cluster_folder, analysis_name)
    # context manager closes the script even if a write raises
    with open(cluster_bash_path, 'w') as cluster_bash:
        cluster_bash.write('#!/usr/bin/bash\n\n\n')
        cluster_bash.write('#SBATCH --mem=64000\n\n\n')
        cluster_bash.write(cluster_cmd + '\n\n')
def main():
    """SHEP MYCN-on driver: data table check and box plots.

    Only the data table check (I) and the box plot stage (VII) execute; the
    enhancer, MYC landscape, GFF and mapping stages keep their calls
    commented out. Box plots are made for the MYCN and H3K27AC time courses
    at the conserved enhancer regions first, then at the promoter regions.
    """
    rule = '#======================================================================'

    def banner(title):
        # one print per line: padding, rule, title, rule, padding
        for text in ('\n\n', rule, title, rule, '\n\n'):
            print(text)

    print('main analysis for MYCN project')
    print('changing directory to project folder')
    os.chdir(projectFolder)

    banner('#======================I, LOADING DATA ANNOTATION======================')

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible
    # for ChIP-Seq
    pipeline_dfci.summary(shep_on_dataFile)

    banner('#=========================II. MAP ENHANCERS============================')

    # #for enhancers
    # enhancer_bashFileName,enhancer_region_map_path,namesList = define_enhancer_landscape(projectFolder,pipeline_dir,shep_on_dataFile)
    # print(enhancer_bashFileName)
    # #runs only if no output detected
    # if not utils.checkOutput(enhancer_region_map_path,0,0):
    #     print(enhancer_bashFileName)
    #     os.system('bash %s' % (enhancer_bashFileName))

    # #in individual systems
    # bash_path = map_shep_enhancers(shep_on_dataFile)
    # #os.system('bash %s' % (bash_path))

    banner('#=======================III. MAP MYC LANDSCAPE=========================')

    # #for mycn
    # myc_bashFileName,myc_region_map_path,namesList = define_myc_landscape(projectFolder,pipeline_dir,shep_on_dataFile)
    # if not utils.checkOutput(myc_region_map_path,0,0):
    #     print(myc_bashFileName)
    #     os.system('bash %s' % (myc_bashFileName))

    banner('#===================IV. MAKING +/- 5KB MYCN GFFs=======================')

    #make_shep_on_mycn_landscape(shep_on_dataFile)

    banner('#================V. MAPPING MYCN GFFs FOR METAS AND HEATMAP============')

    # mapping at shep on defined regions and same regions from 3_shep21_chiprx_heatmap
    # those regions are defined in shep21 data
    #map_shep_for_heatmap(shep_on_dataFile)

    banner('#==================VI. MAPPING MYCN GFFs FOR BOX PLOT==================')

    # mapping @ a 1 bin scale for the shep21 conserved mycn regions
    gff_templates = (
        '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_-1kb_+1kb.gff',
        '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-1kb_+1kb.gff',
        '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-1kb_+1kb.gff',
        '%sHG19_SHEP_MYCN_CONSERVED_-1kb_+1kb.gff',
        '%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-1kb_+1kb.gff',
        '%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-1kb_+1kb.gff',
    )
    # only consumed by the disabled map_regions call below
    gffList = [template % (gffFolder) for template in gff_templates]
    #map_regions(shep_on_dataFile,gffList,names_list=[])

    banner('#=====================VII. MAKING BOX PLOTS============================')

    time_courses = (
        ('SHEP_MYCN',
         ('SHEP_0HR_MYCN', 'SHEP_2HR_MYCN', 'SHEP_6HR_MYCN')),
        ('SHEP_H3K27AC',
         ('SHEP_0HR_H3K27AC', 'SHEP_2HR_H3K27AC', 'SHEP_6HR_H3K27AC')),
    )
    region_sets = (
        'SHEP_MYCN_CONSERVED_ENHANCER_-1kb_+1kb',
        'SHEP_MYCN_CONSERVED_PROMOTER_-1kb_+1kb',
    )
    # same call order as before: enhancer regions (MYCN, then H3K27AC),
    # followed by promoter regions (MYCN, then H3K27AC)
    for gff_name in region_sets:
        for set_name, samples in time_courses:
            # hand each call its own fresh list of dataset names
            makeBoxPlot(shep_on_dataFile, set_name, gff_name, list(samples))

    banner('#===============VII. MAKING HEATMAPS AND METAS ========================')
def main():
    """Drive the ATAC analysis: summarize the data table, then map bams at TSS regions.

    Reads module-level configuration (projectName, projectFolder,
    atac_data_file, gffFolder, mappedFolder) and delegates the actual work
    to pipeline_dfci.
    """

    def _print_section(title_line):
        # One banner = blank gap, rule, title, rule, blank gap.
        rule = '#======================================================================'
        print('\n\n')
        print(rule)
        print(title_line)
        print(rule)
        print('\n\n')

    print('main analysis for project %s' % (projectName))
    print('changing directory to project folder')
    os.chdir(projectFolder)

    _print_section(
        '#======================I, LOADING DATA ANNOTATION======================'
    )

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible
    #for data file
    pipeline_dfci.summary(atac_data_file)

    #assumes macs has already been run and formatted
    # run_macs(chip_data_file)
    # sys.exit()

    _print_section(
        '#======================MAKING GEO TABLES==============================='
    )

    # These three values are only consumed by the disabled makeGEOTable call
    # below; they are kept so that call can be re-enabled as-is.
    geoName = 'rasmc_atac'
    outputFolder = '/storage/cylin/grail/projects/rasmc_all/rasmc_geo/%s_geo/' % (
        geoName)
    namesList = []
    # makeGEOTable(atac_data_file,wiggleFolder,macsFolder,namesList,geoName,outputFolder)

    #==========================================================================
    #====================MAP BAMS BATCH========================================
    #==========================================================================
    print('Mapping chiprx bams to peaks')

    dataFile = atac_data_file
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    # every dataset in the table gets mapped
    names = dataDict.keys()

    # Disabled: per-dataset mapping against each dataset's own enriched peaks.
    # for name in names:
    #     if len(dataDict[name]['enrichedMacs'])>4:
    #         peak_name=dataDict[name]['enrichedMacs']
    #         peak_path='%s%s' % (macsEnrichedFolder,peak_name)
    #         gff_path='%s%s.gff' % (gffFolder,peak_name.split('.bed')[0])
    #         utils.bedToGFF(peak_path,output=gff_path)
    #         gffList=[gff_path]
    #         namesL=[name]
    #         pipeline_dfci.mapBamsBatch(dataFile, gffList,mappedFolder,overWrite=False,namesList=namesL,extension=0,rpm=False)

    # Active path: map all datasets against the single TSS +/-300 gff.
    gffList = ['%sRN6_TSS_ALL_-300_+300.gff' % (gffFolder)]
    pipeline_dfci.mapBamsBatch(
        dataFile,
        gffList,
        mappedFolder,
        overWrite=False,
        namesList=names,
        extension=0,
        rpm=True,
    )
def main():
    """Drive the chordoma CH22 ChIP analysis.

    Steps that are currently live: summarize the ChIP data table, run MACS 1.4
    to make wiggles, build the top T regions and wrap MEME on them, and set up
    the paths for the RNA/chromatin gene tables. Most other stages are kept as
    commented-out code so they can be re-enabled.

    Reads module-level configuration (projectName, projectFolder,
    chip_data_file, wiggleFolder, bedFolder, signalFolder) and uses
    pipeline_dfci, utils, make_T_top_regions and wrap_meme from the enclosing
    module.
    """

    def _print_section(title_line):
        # One banner = blank gap, rule, title, rule, blank gap.
        rule = '#======================================================================'
        print('\n\n')
        print(rule)
        print(title_line)
        print(rule)
        print('\n\n')

    print('main analysis for project %s' % (projectName))
    print('changing directory to project folder')
    os.chdir(projectFolder)

    _print_section(
        '#======================I. LOADING DATA ANNOTATION======================'
    )

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible
    #for data file
    pipeline_dfci.summary(chip_data_file)

    #use macs1.4 to make wiggles
    macs14Folder = utils.formatFolder('%smacs14/' % (projectFolder), True)
    macs14EnrichedFolder = utils.formatFolder(
        '%smacs14Enriched/' % (projectFolder), True)
    pipeline_dfci.run_macs(chip_data_file,
                           projectFolder,
                           macs14Folder,
                           macs14EnrichedFolder,
                           wiggleFolder,
                           useBackground=True)

    _print_section(
        '#=======================II. DEFINING ENHANCERS========================='
    )

    #running ROSE2_meta on the chordoma k27ac
    chip_data_dict = pipeline_dfci.loadDataTable(chip_data_file)
    # for name in chip_data_dict.keys():
    #     print(name)
    #     print(chip_data_dict[name]['enrichedMacs'])

    # #run rose2 wrapper for both
    # bashFileName,region_map_path,names_list = define_enhancer_landscape(projectFolder,pipeline_dir,chip_data_file,analysis_name = 'CH22_H3K27AC')
    # print(bashFileName,region_map_path,names_list)

    # #runs only if no output detected
    # if not utils.checkOutput(enhancer_region_map_path,0,0):
    #     print(enhancer_bashFileName)
    #     os.system('bash %s' % (enhancer_bashFileName))

    # #=========
    # #=========
    # #=========
    # #sanity check debug
    # #run rose2 meta for one dataset as a test w/ stitch at 500 just for a control
    # bashFileName,region_map_path,names_list = define_enhancer_landscape(projectFolder,pipeline_dir,chip_data_file,analysis_name = 'CH22_H3K27AC_1_TEST',names_list = ['CH22_H3K27AC_1'],stitch = '500')
    # print(bashFileName,region_map_path,names_list)

    # #running regular rose2
    # rose2_parent_folder = utils.formatFolder('%srose2' % (projectFolder),True)
    # rose2_bash = pipeline_dfci.callRose2(chip_data_file,macsEnrichedFolder,rose2_parent_folder,namesList=['CH22_H3K27AC_1'],extraMap = [],inputFile='',tss=2500,stitch='500',bashFileName ='',mask=maskFile,useBackground=True,py27_path =py27_path)
    # print(rose2_bash)

    # #ok, ROSE2 META and ROSE2 still produce same result when run on single dataset
    # #whew
    # #=========
    # #=========
    # #=========

    _print_section(
        '#=======================III. DEFINE T LANDSCAPE========================'
    )

    # #since this is an HA chip we need to remove HA background
    # #get the pos regions (T)
    # data_dict = pipeline_dfci.loadDataTable(chip_data_file)
    # t_list = ['%s%s' % (macsEnrichedFolder,data_dict[name]['enrichedMacs']) for name in data_dict.keys() if name.count('dTag_T') == 1 and name.count('WCE') == 0]
    # print(t_list)

    # #get the negative_control list from IRF2 project
    # ha_ctl_list = ['%sHK_CTL_HA_1_peaks.bed' % (macsEnrichedFolder),'%sHK_CTL_HA_2_peaks.bed' % (macsEnrichedFolder)]

    # t_bed_path_intersect = '%sCH22_T_INTERSECT.bed' % (bedFolder)
    # #merge_regions(pos_list = t_list,neg_list = ha_ctl_list,analysis_name = 'CH22_T_INTERSECT',output_path=t_bed_path_intersect,merge_type = 'INTERSECT')

    # t_bed_path_union = '%sCH22_T_UNION.bed' % (bedFolder)
    # #merge_regions(pos_list = t_list,neg_list = ha_ctl_list,analysis_name = 'CH22_T_UNION',output_path=t_bed_path_union,merge_type = 'UNION')

    # #for k27ac
    # h3k27ac_list = ['%s%s' % (macsEnrichedFolder,data_dict[name]['enrichedMacs']) for name in data_dict.keys() if name.count('H3K27AC') == 1 and name.count('WCE') == 0]
    # h3k27ac_bed_path_union = '%sCH22_H3K27AC_UNION.bed' % (bedFolder)
    # merge_regions(pos_list = h3k27ac_list,neg_list = [],analysis_name = 'CH22_H3K27AC_UNION',output_path=h3k27ac_bed_path_union,merge_type = 'UNION')

    _print_section(
        '#=========================IV. T MOTIF FINDING=========================='
    )

    #use the T union and then calculate signal
    data_dict = pipeline_dfci.loadDataTable(chip_data_file)
    map_list = [name for name in data_dict if name.count('dTag_T') == 1]
    print(map_list)
    # gffList is only consumed by the disabled map_regions call below
    gffList = ['%sCH22_T_UNION.bed' % (bedFolder)]
    #signal_table_list = pipeline_dfci.map_regions(chip_data_file,gffList,mappedFolder,signalFolder,names_list=map_list,medianNorm=False,output='',extendReadsTo=200)

    #column order = CH22_dTag_T_MUT_HA CH22_dTag_T_MUT_WCE CH22_dTag_T_WT_HA CH22_dTag_T_WT_WCE
    signal_table_path = '%sCH22_T_UNION_CH22_CHIP_TABLE_SIGNAL.txt' % (
        signalFolder)
    top = 1000
    fasta_path = make_T_top_regions(signal_table_path, top)

    #now run meme
    analysis_name = 'HG19_CH22_T_UNION_TOP'
    meme_bash_path = wrap_meme(analysis_name)

    _print_section(
        '#===================IV. MAKE HEATMAPS OF T LANDSCAPE==================='
    )

    #use the union for T
    t_union_path = '%sbeds/CH22_T_UNION.bed' % (projectFolder)

    _print_section(
        '#=======================IV. DEFINE ACTIVE GENES========================'
    )

    # #make the relevant gffs
    # pipeline_dfci.makeGeneGFFs(annotFile,gffFolder,species='HG19')

    # # #Making a list of all active genes
    # tss_gff = utils.parseTable('%sHG19_TSS_ALL_-1000_+1000.gff' % (gffFolder),'\t')
    # start_dict = utils.makeStartDict(annotFile)

    # all_gene_table = []
    # ticker = 1
    # for line in tss_gff:
    #     new_line = [ticker,line[1],start_dict[line[1]]['name']]
    #     all_gene_table.append(new_line)
    #     ticker+=1
    # utils.unParseTable(all_gene_table,'%sHG19_UCSC_REFSEQ_ALL.txt' % (geneListFolder),'\t')
    # sys.exit()

    # setName = 'CH22_H3K27AC'
    # cellTypeList = ['CH22']
    # map_list = ['CH22_H3K27AC_1','CH22_H3K27AC_2']
    # gffList = ['%sHG19_TSS_ALL_-1000_+1000.gff' % (gffFolder)]
    # pipeline_dfci.mapEnrichedToGFF(chip_data_file,setName,gffList,cellTypeList,macsEnrichedFolder,mappedEnrichedFolder,macs=True,namesList=map_list,useBackground=True)

    # setList = [['CH22_H3K27AC_1'],['CH22_H3K27AC_2']] #bound by either
    # output = '%sHG19_CHORDOMA_CH22_H3K27AC_ACTIVE.txt' % (geneListFolder)
    # mappedEnrichedFile = '%sHG19_TSS_ALL_-1000_+1000/HG19_TSS_ALL_-1000_+1000_CH22_H3K27AC.txt' % (mappedEnrichedFolder)
    # pipeline_dfci.makeGFFListFile(mappedEnrichedFile,setList,output,annotFile)

    _print_section(
        '#====================V. RUNNING ENHANCER PROMOTER======================'
    )

    # data_dict = pipeline_dfci.loadDataTable(chip_data_file)
    # #need to run enhancer promoter code on both k27ac and T

    # #for T at active genes
    # activity_path = '%sHG19_CHORDOMA_CH22_H3K27AC_ACTIVE.txt' % (geneListFolder)
    # input_path = '%sCH22_T_UNION.bed' % (bedFolder)
    # analysis_name = 'CH22_T_UNION'
    # t_list = [name for name in data_dict.keys() if name.count('dTag_T') == 1 and name.count('WCE') == 0]
    # #wrap_enhancer_promoter(chip_data_file,input_path,activity_path,analysis_name,names_list = t_list,useBackground=True)

    # #for T at all genes
    # activity_path = '%sHG19_UCSC_REFSEQ_ALL.txt' % (geneListFolder)
    # input_path = '%sCH22_T_UNION.bed' % (bedFolder)
    # analysis_name = 'CH22_T_UNION_ALL_GENES'
    # t_list = [name for name in data_dict.keys() if name.count('dTag_T') == 1 and name.count('WCE') == 0]
    # #wrap_enhancer_promoter(chip_data_file,input_path,activity_path,analysis_name,names_list = t_list,useBackground=True)

    # #for H3K27AC at active genes
    # # activity_path = '%sHG19_CHORDOMA_CH22_H3K27AC_ACTIVE.txt' % (geneListFolder)
    # # input_path = '%sCH22_H3K27AC_UNION.bed' % (bedFolder)
    # # analysis_name = 'CH22_H3K27C_UNION'
    # # h3k27ac_list = [name for name in data_dict.keys() if name.count('H3K27AC') == 1 and name.count('WCE') == 0]
    # # wrap_enhancer_promoter(chip_data_file,input_path,activity_path,analysis_name,names_list = h3k27ac_list,useBackground=True)

    # #for H3K27AC at all genes
    # activity_path = '%sHG19_UCSC_REFSEQ_ALL.txt' % (geneListFolder)
    # input_path = '%sCH22_H3K27AC_UNION.bed' % (bedFolder)
    # analysis_name = 'CH22_H3K27C_UNION_ALL_GENES'
    # h3k27ac_list = [name for name in data_dict.keys() if name.count('H3K27AC') == 1 and name.count('WCE') == 0]
    # wrap_enhancer_promoter(chip_data_file,input_path,activity_path,analysis_name,names_list = h3k27ac_list,useBackground=True)

    _print_section(
        '#=================VI. LINKING CHROMATIN TO EXPRESSION=================='
    )

    #a gene counts if it is expressed above cut in at least one sample
    #may need to collapse NM IDs per genes
    #first check that expression table doesn't have duplicates

    def merge_rna(exp_path, exp_cutoff=1, output_path=''):
        '''
        Combine expression data with gene level H3K27AC and T data.

        exp_path: tab-delimited expression table; the first row is treated as
            a header and column 0 of each later row as the gene identifier.
        exp_cutoff: a gene is kept only if its maximum expression value across
            the remaining columns is strictly greater than this value.
        output_path: where the merged tab-delimited table is written.

        Genes without an entry in the T or H3K27AC gene tables get
        [0.0, 0.0] for that mark. Output rows are in sorted gene order.
        '''
        exp_table = utils.parseTable(exp_path, '\t')
        exp_dict = defaultdict(list)
        for line in exp_table[1:]:
            #here's where we can filter for an expression cutoff
            exp_line = [float(x) for x in line[1:]]
            if max(exp_line) > exp_cutoff:
                exp_dict[line[0]] = exp_line

        #now figure out genes w/ T binding
        t_gene_path = '%senhancerPromoter/CH22_T_UNION_ALL_GENES/CH22_T_UNION_ALL_GENES_GENE_TABLE.txt' % (
            projectFolder)
        t_table = utils.parseTable(t_gene_path, '\t')
        t_dict = defaultdict(list)
        for line in t_table[1:]:
            t_dict[line[0]] = [float(x) for x in line[1:]]

        #now figure out genes w/ H3K27AC binding
        h3k27ac_gene_path = '%senhancerPromoter/CH22_H3K27C_UNION_ALL_GENES/CH22_H3K27C_UNION_ALL_GENES_GENE_TABLE.txt' % (
            projectFolder)
        h3k27ac_table = utils.parseTable(h3k27ac_gene_path, '\t')
        h3k27ac_dict = defaultdict(list)
        for line in h3k27ac_table[1:]:
            h3k27ac_dict[line[0]] = [float(x) for x in line[1:]]

        #now set up the output
        # NOTE(review): the header appends the whole first row of the
        # expression table while data rows drop column 0 -- presumably the
        # header row carries no label for the gene column; confirm.
        gene_table = []
        gene_table_header = [
            'GENE', 'T_PROMOTER', 'T_DISTAL', 'H3K27AC_PROMOTER',
            'H3K27AC_DISTAL'
        ] + exp_table[0]
        gene_table.append(gene_table_header)

        #anchor analysis on genes w/ detectable expr
        # sorted() instead of keys().sort(): dict views have no .sort() on
        # Python 3, and sorted() gives the same order on Python 2.
        exp_gene_list = sorted(exp_dict)
        for gene in exp_gene_list:
            # membership tests (not indexing) so the defaultdicts are not
            # populated with empty lists for missing genes
            if gene in t_dict:
                t_line = t_dict[gene]
            else:
                t_line = [0.0, 0.0]

            if gene in h3k27ac_dict:
                h3k27ac_line = h3k27ac_dict[gene]
            else:
                h3k27ac_line = [0.0, 0.0]

            new_line = [gene] + t_line + h3k27ac_line + exp_dict[gene]
            gene_table.append(new_line)

        utils.unParseTable(gene_table, output_path, '\t')

    #for norm data
    rna_project_folder = '/storage/cylin/grail/projects/chordoma_ch22_rna/'
    #this table is just in alphabetical order
    exp_path = '%s190612_rna_seq/cuffnorm_output/cuffnorm_all_fpkm_exprs_norm.txt' % (
        rna_project_folder)
    exp_cutoff = 1
    output_path = '%stables/HG19_CHORDOMA_CH22_GENE_TABLE_NORM.txt' % (
        projectFolder)
    #merge_rna(exp_path,exp_cutoff,output_path)

    #for raw data
    exp_path = '%s190612_rna_seq/cuffnorm_output/cuffnorm_all_fpkm_exprs_raw.txt' % (
        rna_project_folder)
    output_path = '%stables/HG19_CHORDOMA_CH22_GENE_TABLE_RAW.txt' % (
        projectFolder)
def main():
    """Drive the MYCN enhancer-promoter analysis.

    Live steps: summarize the NB ChIP data table, then run addLengths on the
    gene/peak tables of two finished enhancerPromoter analyses. The
    enhancer-promoter runs themselves are kept as commented-out code.

    Reads module-level configuration (projectFolder, nb_all_chip_dataFile)
    and uses pipeline_dfci and addLengths from the enclosing module.
    """

    def _print_section(title_line):
        # One banner = blank gap, rule, title, rule, blank gap.
        rule = '#======================================================================'
        print('\n\n')
        print(rule)
        print(title_line)
        print(rule)
        print('\n\n')

    print('main analysis for MYCN project')
    print('changing directory to project folder')
    os.chdir(projectFolder)

    _print_section(
        '#======================I, LOADING DATA ANNOTATION======================'
    )

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible
    #for ChIP-Seq
    pipeline_dfci.summary(nb_all_chip_dataFile)

    _print_section(
        '#===================II. ENHANCER PROMOTER FOR ALL NB==================='
    )

    # input_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    # activity_path = '%sHG19_NB_H3K27AC_ACTIVE_UNION.txt' % (geneListFolder)
    # analysis_name = 'NB_MYCN_CONSERVED'
    # nb_enhancer_promoter_bash = wrap_enhancer_promoter(nb_all_chip_dataFile,input_path,activity_path,analysis_name)
    # os.system('bash %s' % (nb_enhancer_promoter_bash))

    _print_section(
        '#===============III. ENHANCER PROMOTER IN SHEP21 SYSTEM================'
    )

    # #for SHEP21 nospike
    # mycn_list = ['SHEP21_0HR_MYCN_NOSPIKE','SHEP21_2HR_MYCN_NOSPIKE','SHEP21_24HR_MYCN_NOSPIKE']
    # for mycn_name in mycn_list:
    #     input_path = '%sSHEP21_0HR_MYCN_NOSPIKE_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_SHEP21_0HR_H3K27AC_NOSPIKE_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'SHEP21_0HR_MYCN_NOSPIKE_REGIONS_%s' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(shep21_dataFile,input_path,activity_path,analysis_name,names_list = [mycn_name])
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    # #for SHEP21 chiprx
    # mycn_list = ['SHEP21_0HR_MYCN_RX','SHEP21_2HR_MYCN_RX','SHEP21_24HR_MYCN_RX']
    # for mycn_name in mycn_list:
    #     input_path = '%sSHEP21_0HR_MYCN_NOSPIKE_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_SHEP21_0HR_H3K27AC_NOSPIKE_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'SHEP21_0HR_MYCN_NOSPIKE_REGIONS_%s' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(shep21_chiprx_dataFile,input_path,activity_path,analysis_name,names_list = [mycn_name])
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    # #for SHEP21 nospike
    # mycn_list = ['SHEP21_0HR_MYCN_NOSPIKE','SHEP21_2HR_MYCN_NOSPIKE','SHEP21_24HR_MYCN_NOSPIKE']
    # for mycn_name in mycn_list:
    #     input_path = '%sSHEP21_0HR_MYCN_NOSPIKE_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_SHEP21_0HR_H3K27AC_NOSPIKE_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'SHEP21_0HR_MYCN_NOSPIKE_REGIONS_NO_WCE_%s' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(shep21_dataFile,input_path,activity_path,analysis_name,names_list = [mycn_name],useBackground=False)
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    # #for SHEP21 chiprx
    # mycn_list = ['SHEP21_0HR_MYCN_RX','SHEP21_2HR_MYCN_RX','SHEP21_24HR_MYCN_RX']
    # for mycn_name in mycn_list:
    #     input_path = '%sSHEP21_0HR_MYCN_NOSPIKE_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_SHEP21_0HR_H3K27AC_NOSPIKE_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'SHEP21_0HR_MYCN_NOSPIKE_REGIONS_NO_WCE_%s' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(shep21_chiprx_dataFile,input_path,activity_path,analysis_name,names_list = [mycn_name],useBackground=False)
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    # #for nb conserved regions
    # #for SHEP21 nospike
    # mycn_list = ['SHEP21_0HR_MYCN_NOSPIKE','SHEP21_2HR_MYCN_NOSPIKE','SHEP21_24HR_MYCN_NOSPIKE']
    # for mycn_name in mycn_list:
    #     input_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    #     activity_path = '%sHG19_NB_H3K27AC_ACTIVE_UNION.txt' % (geneListFolder)
    #     analysis_name = 'NB_MYCN_CONSERVED_%s' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(shep21_dataFile,input_path,activity_path,analysis_name,names_list = [mycn_name])
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    # #for SHEP21 chiprx
    # mycn_list = ['SHEP21_0HR_MYCN_RX','SHEP21_2HR_MYCN_RX','SHEP21_24HR_MYCN_RX']
    # for mycn_name in mycn_list:
    #     input_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    #     activity_path = '%sHG19_NB_H3K27AC_ACTIVE_UNION.txt' % (geneListFolder)
    #     analysis_name = 'NB_MYCN_CONSERVED_%s' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(shep21_chiprx_dataFile,input_path,activity_path,analysis_name,names_list = [mycn_name])
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    _print_section(
        '#===============IV. ENHANCER PROMOTER IN SHEP ON SYSTEM================'
    )

    # #for SHEP21 on
    # mycn_list = ['SHEP_0HR_MYCN','SHEP_2HR_MYCN','SHEP_6HR_MYCN']
    # for mycn_name in mycn_list:
    #     input_path = '%sSHEP21_0HR_MYCN_NOSPIKE_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_SHEP21_0HR_H3K27AC_NOSPIKE_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'SHEP21_0HR_MYCN_NOSPIKE_REGIONS_%s' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(shep_on_dataFile,input_path,activity_path,analysis_name,names_list = [mycn_name])
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    #for SHEP21 on no background
    # mycn_list = ['SHEP_0HR_MYCN','SHEP_2HR_MYCN','SHEP_6HR_MYCN']
    # for mycn_name in mycn_list:
    #     input_path = '%sSHEP21_0HR_MYCN_NOSPIKE_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_SHEP21_0HR_H3K27AC_NOSPIKE_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'SHEP21_0HR_MYCN_NOSPIKE_REGIONS_NO_WCE_%s' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(shep_on_dataFile,input_path,activity_path,analysis_name,names_list = [mycn_name],useBackground=False)
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    # #for SHEP21 on @ NB conserved regions
    # mycn_list = ['SHEP_0HR_MYCN','SHEP_2HR_MYCN','SHEP_6HR_MYCN']
    # for mycn_name in mycn_list:
    #     input_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    #     activity_path = '%sHG19_SHEP21_0HR_H3K27AC_NOSPIKE_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'NB_MYCN_CONSERVED_%s' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(shep_on_dataFile,input_path,activity_path,analysis_name,names_list = [mycn_name])
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    _print_section(
        '#================V. ENHANCER PROMOTER IN INDIVIDUAL NB================='
    )

    # #for BE2C, KELLY, NGP
    # mycn_list = ['BE2C','KELLY','NGP']
    # for mycn_name in mycn_list:
    #     input_path = '%s%s_MYCN_peaks.bed' % (macsEnrichedFolder,mycn_name)
    #     activity_path = '%sHG19_%s_H3K27AC_ACTIVE.txt' % (geneListFolder,mycn_name)
    #     analysis_name = '%s_MYCN' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(nb_all_chip_dataFile,input_path,activity_path,analysis_name,names_list = ['%s_MYCN' % (mycn_name)])
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    _print_section(
        '#============VI. ENHANCER PROMOTER ANALYSIS IN OTHER CANCERS==========='
    )

    # #for p493-6, mm1s, h2171, h128, and u87
    # #for p493-6
    # myc_list = ['P493-6_T0_MYC','P493-6_T1_MYC','P493-6_T24_MYC']
    # for myc_name in myc_list:
    #     input_path = '%sP493-6_T24_MYC_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_P493-6_T24_H3K27AC_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'P493-6_T24_MYC_REGIONS_%s' % (myc_name)
    #     enhancer_promoter_bash = wrap_enhancer_promoter(p4936_young_dataFile,input_path,activity_path,analysis_name,names_list = [myc_name])
    #     os.system('bash %s' % (enhancer_promoter_bash))

    # #for sclc
    # myc_list = ['H128_MYC','H2171_MYC']
    # for myc_name in myc_list:
    #     input_path = '%sH2171_MYC_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_H2171_H3K27AC_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'H2171_MYC_REGIONS_%s' % (myc_name)
    #     enhancer_promoter_bash = wrap_enhancer_promoter(sclc_dataFile,input_path,activity_path,analysis_name,names_list = [myc_name])
    #     os.system('bash %s' % (enhancer_promoter_bash))

    # #for MM
    # myc_list = ['MM1S_MYC_DMSO']
    # for myc_name in myc_list:
    #     input_path = '%sMM1S_MYC_DMSO_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_MM1S_H3K27AC_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'MM1S_MYC_REGIONS_%s' % (myc_name)
    #     enhancer_promoter_bash = wrap_enhancer_promoter(mm1s_dataFile,input_path,activity_path,analysis_name,names_list = [myc_name])
    #     os.system('bash %s' % (enhancer_promoter_bash))

    # #for u87
    # myc_list = ['U87_MYC']
    # for myc_name in myc_list:
    #     input_path = '%sU87_MYC_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_U87_H3K27AC_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'U87_MYC_REGIONS_%s' % (myc_name)
    #     enhancer_promoter_bash = wrap_enhancer_promoter(u87_dataFile,input_path,activity_path,analysis_name,names_list = [myc_name])
    #     os.system('bash %s' % (enhancer_promoter_bash))

    _print_section(
        '#========VII. ENHANCER PROMOTER ANALYSIS FOR OTHER MARKS IN BE2C========'
    )

    # #names_list = ['BE2C_BRD4','BE2C_H3K27AC','BE2C_TWIST','BE2C_RNA_POL2']
    # names_list = ['BE2C_H3K27AC']
    # names_list = ['BE2C_BRD4','BE2C_TWIST','BE2C_RNA_POL2','BE2C_H3K27ME3','BE2C_H3K4ME3']
    # for name in names_list:
    #     input_path = '%s%s_peaks.bed' % (macsEnrichedFolder,name)
    #     activity_path = '%sHG19_BE2C_H3K27AC_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = '%s_REGIONS' % (name)
    #     enhancer_promoter_bash = wrap_enhancer_promoter(be2c_dataFile,input_path,activity_path,analysis_name,names_list = [name])
    #     os.system('bash %s' % (enhancer_promoter_bash))

    _print_section(
        '#====================VIII. MAKING GENE TABLE W/ LENGTH================='
    )

    # (gene table template, peak table template) for each finished analysis;
    # first entry is nb conserved, second is shep21
    table_templates = [
        (
            '%senhancerPromoter/NB_MYCN_CONSERVED/NB_MYCN_CONSERVED_GENE_TABLE.txt',
            '%senhancerPromoter/NB_MYCN_CONSERVED/NB_MYCN_CONSERVED_PEAK_TABLE.txt',
        ),
        (
            '%senhancerPromoter/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP21_0HR_MYCN_NOSPIKE/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP21_0HR_MYCN_NOSPIKE_GENE_TABLE.txt',
            '%senhancerPromoter/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP21_0HR_MYCN_NOSPIKE/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP21_0HR_MYCN_NOSPIKE_PEAK_TABLE.txt',
        ),
    ]
    for gene_template, peak_template in table_templates:
        gene_table_path = gene_template % (projectFolder)
        peak_table_path = peak_template % (projectFolder)
        gene_path = addLengths(gene_table_path, peak_table_path)
def main():
    """Check the SHEP21 ChIP-Rx data table and draw the RNA Pol II box plots.

    Steps, in order: change into ``projectFolder``; summarize
    ``shep21_chiprx_dataFile``; call ``utils.formatFolder`` for the heatmap
    and box plot figure folders; call ``makeBoxPlot`` for the ``POL2_RX`` and
    ``POL2_NOSPIKE`` sets against four conserved MYCN region GFF names.
    Stages II-V and every heatmap run are switched off; their settings are
    kept as comments so they can be re-enabled.
    """
    rule = '#======================================================================'

    def announce(title_line):
        # Emits the same five prints that precede every stage of the workflow.
        print('\n\n')
        print(rule)
        print(title_line)
        print(rule)
        print('\n\n')

    print('main analysis for MYCN project')
    print('changing directory to project folder')
    os.chdir(projectFolder)

    announce('#======================I, LOADING DATA ANNOTATION======================')
    # Sanity check of the ChIP-Rx data table: makes sure both bam and .bai
    # files are accessible.
    pipeline_dfci.summary(shep21_chiprx_dataFile)

    announce('#========II. DEFINING ACTIVE GENES AND ENHANCERS IN SHEP21=============')
    # Disabled:
    # make_shep21_active()
    # bash_path = map_nb_enhancers(nb_all_chip_dataFile)
    # os.system('bash %s' % (bash_path))

    announce('#==================III. MAKING +/- 5KB MYCN GFFs=======================')
    # Disabled:
    # make_shep21_mycn_landscape(nb_all_chip_dataFile)

    announce('#===============IV. MAPPING MYCN GFFs FOR METAS AND HEATMAP============')
    # Disabled (with and without spike in):
    # map_shep21_for_heatmap(shep21_chiprx_dataFile,shep21_dataFile)

    announce('#==================V. MAPPING MYCN GFFs FOR BOX PLOT===================')
    # Disabled: mapping @ a 1 bin scale for the shep21 conserved mycn regions.
    # gffList = ['%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_-0_+0.gff' % (gffFolder),
    #            '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_-1kb_+1kb.gff' % (gffFolder),
    #            '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-1kb_+1kb.gff' % (gffFolder),
    #            '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-1kb_+1kb.gff' % (gffFolder),
    #            ]
    # gffList = ['%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-0_+0.gff' % (gffFolder),
    #            '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-0_+0.gff' % (gffFolder),
    #            ]
    # map_regions(shep21_chiprx_dataFile,gffList,names_list=[])
    # map_regions(shep21_dataFile,gffList,names_list=[])

    announce('#===============VI. MAKING HEATMAPS AND METAS =========================')
    # set the output folder
    utils.formatFolder('%sfigures/5_chiprx_heatmaps/' % projectFolder, True)
    # Disabled heatmap runs. Each one was
    #     makeHeatmap(names_list,gff_list,plot_name,plot_color)
    # with
    #     gff_list = ['%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-5kb_+5kb.gff' % (gffFolder),
    #                 '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-5kb_+5kb.gff' % (gffFolder),
    #                 ]
    # and names_list = ['SHEP21_0HR_<X>','SHEP21_2HR_<X>','SHEP21_24HR_<X>']:
    #     plot_name                   <X>               plot_color
    #     'SHEP21_MYCN_RX'            MYCN_RX           'red'
    #     'SHEP21_MYCN_NOSPIKE'       MYCN_NOSPIKE      'red'
    #     'SHEP21_H3K27AC_RX'         H3K27AC_RX        'blue'
    #     'SHEP21_H3K27AC_NOSPIKE'    H3K27AC_NOSPIKE   'blue'
    #     'SHEP21_CTCF_RX'            CTCF_RX           'black'
    #     'SHEP21_POL2_RX'            POL2_RX           'black'
    #     'SHEP21_POL2_NOSPIKE'       POL2_NOSPIKE      'black'
    #     'SHEP21_H3K4ME3_RX'         H3K4ME3_RX        'green'

    announce('#=====================VII. MAKING BOXPLOTS ============================')
    utils.formatFolder('%sfigures/4_chiprx_plots/' % projectFolder, True)
    # Disabled box plot sets. Each one called
    #     makeBoxPlot(<data file>,set_name,gff_name,names_list)
    # once per gff name listed in conserved_gff_names below, with
    # names_list = ['SHEP21_0HR_<X>','SHEP21_2HR_<X>','SHEP21_24HR_<X>']:
    #     set_name             <data file>               <X>
    #     'MYCN_CHIPRX'        shep21_chiprx_dataFile    MYCN_RX
    #     'MYCN_NOSPIKE'       shep21_dataFile           MYCN_NOSPIKE
    #     'H3K27AC_CHIPRX'     shep21_chiprx_dataFile    H3K27AC_RX
    #     'H3K27AC_NOSPIKE'    shep21_dataFile           H3K27AC_NOSPIKE
    #     'CTCF_CHIPRX'        shep21_chiprx_dataFile    CTCF_RX

    # Region sets every box plot is drawn for, in the order they are run.
    conserved_gff_names = (
        'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-1kb_+1kb',
        'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-0_+0',
        'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-1kb_+1kb',
        'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-0_+0',
    )

    # RNA Pol II, ChIP-Rx samples. A fresh names list is built for each call.
    for region_name in conserved_gff_names:
        makeBoxPlot(
            shep21_chiprx_dataFile,
            'POL2_RX',
            region_name,
            ['SHEP21_0HR_POL2_RX', 'SHEP21_2HR_POL2_RX', 'SHEP21_24HR_POL2_RX'],
        )

    # RNA Pol II, NOSPIKE samples from the regular SHEP21 data table.
    for region_name in conserved_gff_names:
        makeBoxPlot(
            shep21_dataFile,
            'POL2_NOSPIKE',
            region_name,
            [
                'SHEP21_0HR_POL2_NOSPIKE',
                'SHEP21_2HR_POL2_NOSPIKE',
                'SHEP21_24HR_POL2_NOSPIKE',
            ],
        )
def main():
    """Print the data-table summary for the ChIP-seq, RNA-seq, ATAC-seq and ChIP-Rx tables.

    Changes into ``projectFolder`` first, then runs
    ``pipeline_dfci.summary`` on every table in ``chip_data_list`` and
    ``rna_data_list``, on ``atac_dataFile`` and on
    ``shep21_chiprx_dataFile``, printing a banner before each group.
    """
    rule = '#======================================================================'

    def announce(title_line):
        # Emits the same five prints that precede every stage of the workflow.
        print('\n\n')
        print(rule)
        print(title_line)
        print(rule)
        print('\n\n')

    print('main analysis for MYCN project')
    print('changing directory to project folder')
    os.chdir(projectFolder)

    # Each stage sanity checks its data tables and makes sure both bam and
    # .bai files are accessible.

    announce('#======================I. CHECKING CHIP-SEQ DATA=======================')
    for chip_table in chip_data_list:
        pipeline_dfci.summary(chip_table)

    announce('#======================II. CHECKING RNA-SEQ DATA=======================')
    for rna_table in rna_data_list:
        pipeline_dfci.summary(rna_table)

    announce('#====================III. CHECKING ATAC-SEQ DATA=======================')
    pipeline_dfci.summary(atac_dataFile)

    announce('#======================IV. CHECKING CHIPRX DATA========================')
    pipeline_dfci.summary(shep21_chiprx_dataFile)
#namesList = dataDict.keys()
#print(namesList)

#==========================================================================
#=======================LOADING DATA ANNOTATION============================
#==========================================================================

# This section loads a data table. It must stay uncommented for the rest of
# the code to work.

# Load the data table, print its keys, then print the table summary.
dataDict = pipeline_dfci.loadDataTable(dataFile)
print(dataDict.keys())
pipeline_dfci.summary(dataFile)

#==========================================================================
#==========================CALLING BOWTIE==================================
#==========================================================================

# This section calls bowtie on raw read files to generate sorted and indexed
# bams in the bam folder.
#namesList = []  <- fill this in if you want to only map a subset of the data. otherwise leave blank

# Set launch to False to debug.
#pipeline_dfci.makeBowtieBashJobs(dataFile,namesList,launch=True)

#==========================================================================
#=============================CALL MACS====================================
def main():
    """Summarize the SHEP data tables and draw the MYCN invasion box plots.

    Changes into ``projectFolder``, runs ``pipeline_dfci.summary`` on the
    three SHEP data tables, then calls ``wrapInvasionBox`` four times on the
    same region prefix: SHEP-on, SHEP21 NOSPIKE, SHEP21 ChIP-Rx with an empty
    scale path, and SHEP21 ChIP-Rx with the scale-factor table.
    """
    rule = '#======================================================================'

    def announce(title_line):
        # Emits the same five prints that precede every stage of the workflow.
        print('\n\n')
        print(rule)
        print(title_line)
        print(rule)
        print('\n\n')

    print('main analysis for MYCN project')
    print('changing directory to project folder')
    os.chdir(projectFolder)

    announce('#======================I, LOADING DATA ANNOTATION======================')
    # Sanity check of each ChIP-Seq data table used below: makes sure both
    # bam and .bai files are accessible.
    pipeline_dfci.summary(shep_on_dataFile)
    pipeline_dfci.summary(shep21_dataFile)
    pipeline_dfci.summary(shep21_chiprx_dataFile)

    announce('#=========================II. MAKE BOXPLOTS============================')
    # Box plots are wrapped once per analysis set. Every set shares the same
    # region prefix, which is used to find the peak tables; the set name is
    # the de facto title for the datasets.
    mycn_regions = 'SHEP21_0HR_MYCN_NOSPIKE_REGIONS_NO_WCE'

    # SHEP-on table, names_list left empty, no scale table.
    wrapInvasionBox(
        shep_on_dataFile,
        mycn_regions,
        'SHEP_MYCN',
        names_list=[],
        top=5000,
        scale_path='',
    )

    # SHEP21 NOSPIKE MYCN samples, no scale table.
    wrapInvasionBox(
        shep21_dataFile,
        mycn_regions,
        'SHEP_MYCN_NOSPIKE',
        names_list=[
            'SHEP21_0HR_MYCN_NOSPIKE',
            'SHEP21_2HR_MYCN_NOSPIKE',
            'SHEP21_24HR_MYCN_NOSPIKE',
        ],
        top=5000,
        scale_path='',
    )

    # SHEP21 ChIP-Rx MYCN samples, deliberately without the scale table.
    wrapInvasionBox(
        shep21_chiprx_dataFile,
        mycn_regions,
        'SHEP_MYCN_RX_NO_SCALE',
        names_list=['SHEP21_0HR_MYCN_RX', 'SHEP21_2HR_MYCN_RX', 'SHEP21_24HR_MYCN_RX'],
        top=5000,
        scale_path='',
    )

    # Same ChIP-Rx samples, this time with the scale-factor table.
    chiprx_scale_path = '%sHG19_SHEP21_CHIPRX_SCALE_FACTORS.txt' % (tableFolder)
    wrapInvasionBox(
        shep21_chiprx_dataFile,
        mycn_regions,
        'SHEP_MYCN_RX',
        names_list=['SHEP21_0HR_MYCN_RX', 'SHEP21_2HR_MYCN_RX', 'SHEP21_24HR_MYCN_RX'],
        top=5000,
        scale_path=chiprx_scale_path,
    )
def main():
    """Build the BE2C NES table and draw its heatmap.

    Changes into ``projectFolder``, summarizes ``nb_all_chip_dataFile``,
    sets up the ``nes_tables/`` folder through ``utils.formatFolder``, calls
    ``makeNESTable`` on the two BE2C NES files and then ``wrapHeatmap`` on
    the resulting table. The MYC-high, SHEP21 NOSPIKE and SHEP-on runs and
    stage IV are switched off; stage V only prints its banner here.
    """
    rule = '#======================================================================'

    def announce(title_line):
        # Emits the same five prints that precede every stage of the workflow.
        print('\n\n')
        print(rule)
        print(title_line)
        print(rule)
        print('\n\n')

    print('main analysis for MYCN project')
    print('changing directory to project folder')
    os.chdir(projectFolder)

    announce('#======================I, LOADING DATA ANNOTATION======================')
    # Sanity check of the ChIP-Seq data table: makes sure both bam and .bai
    # files are accessible.
    pipeline_dfci.summary(nb_all_chip_dataFile)

    announce('#========================II. MAKING NES TABLE==========================')
    # at a given fdr cutoff, grab the NES pathways
    nes_folder = utils.formatFolder('%snes_tables/' % (projectFolder), True)

    # Disabled NES tables. Each one was
    #     makeNESTable(nes_path_list,names_list,output_path)
    # where every entry of nes_path_list is
    #     '%senhancerPromoter/<A>/<A>_top_5000_nes.txt' % (projectFolder)
    #
    # for top 5k regions, output_path = '%sMYC_HIGH_NES.txt' % (nes_folder)
    #     <A>: NB_MYCN_CONSERVED
    #          H2171_MYC_REGIONS_H2171_MYC
    #          MM1S_MYC_REGIONS_MM1S_MYC_DMSO
    #          P493-6_T24_MYC_REGIONS_P493-6_T24_MYC
    #          U87_MYC_REGIONS_U87_MYC
    #     names_list = ['NB_MYCN_CONSERVED','H2171','MM1S','P493-6_T24','U87']
    #
    # for shep21 nospike shutdown system,
    # output_path = '%sSHEP21_MYCN_NOSPIKE_NES.txt' % (nes_folder)
    #     <A>: SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP21_0HR_MYCN_NOSPIKE
    #          SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP21_2HR_MYCN_NOSPIKE
    #          SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP21_24HR_MYCN_NOSPIKE
    #     names_list = ['SHEP21_0HR_MYCN_NOSPIKE','SHEP21_2HR_MYCN_NOSPIKE','SHEP21_24HR_MYCN_NOSPIKE']
    #
    # for shep on induction system, output_path = '%sSHEP_ON_NES.txt' % (nes_folder)
    #     <A>: SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP_0HR_MYCN
    #          SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP_2HR_MYCN
    #          SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP_6HR_MYCN
    #     names_list = ['SHEP_0HR_MYCN','SHEP_2HR_MYCN','SHEP_6HR_MYCN']

    # for BE2C comparisons
    be2c_nes_paths = [
        '%senhancerPromoter/BE2C_MYCN/BE2C_MYCN_top_5000_nes.txt' % (projectFolder),
        '%senhancerPromoter/BE2C_H3K27AC_REGIONS/BE2C_H3K27AC_REGIONS_top_5000_nes.txt' % (projectFolder),
    ]
    # Only two NES files are supplied, so only two names are passed. The
    # five-name list that used to be assigned first
    # (['BE2C_RNA_POL2','BE2C_MYCN','BE2C_H3K27AC','BE2C_BRD4','BE2C_TWIST'])
    # was overwritten before use and has been dropped.
    be2c_names = ['BE2C_MYCN', 'BE2C_H3K27AC']
    be2c_table_path = '%sBE2C_NES.txt' % (nes_folder)
    makeNESTable(be2c_nes_paths, be2c_names, be2c_table_path)

    announce('#========================III. CALLING HEATMAP==========================')
    # Disabled heatmaps:
    # wrapHeatmap('%sMYC_HIGH_NES.txt' % (nes_folder),0.01,2)              #for high myc
    # wrapHeatmap('%sSHEP21_MYCN_NOSPIKE_NES.txt' % (nes_folder),0.1,2)    #for shep21 nospike
    # wrapHeatmap('%sSHEP_ON_NES.txt' % (nes_folder),0.1,2)                #for shep on

    # for be2c: the heatmap reads the table written in stage II.
    wrapHeatmap(be2c_table_path, 0.1, 1.5)

    announce('#=====================IV. MAKING TSS DISTAL GFFS=======================')
    # Disabled: we want the peak list to cover NB_MYCN_CONSERVED, P4936,
    # MM1S, H2171, U87 (top 5k regions). Every entry of peak_path_list was
    #     '%senhancerPromoter/<A>/<A>_PEAK_TABLE.txt' % (projectFolder)
    # for <A> in: NB_MYCN_CONSERVED
    #             H2171_MYC_REGIONS_H2171_MYC
    #             MM1S_MYC_REGIONS_MM1S_MYC_DMSO
    #             P493-6_T24_MYC_REGIONS_P493-6_T24_MYC
    #             U87_MYC_REGIONS_U87_MYC
    # tss_gff_path,distal_gff_path = makePeakGFFs(peak_path_list)

    announce('#=====================V. MAPPING MYC TO REGIONS========================')
def main():
    """Summarize the RNA-seq data table and build the GEO submission table.

    Changes into ``projectFolder``, runs ``pipeline_dfci.summary`` on
    ``rna_data_file`` and calls ``makeGEORNATable`` for the ``rasmc_rna``
    set. The HISAT2 alignment and cufflinks stages are switched off and only
    print their banners.
    """
    short_rule = '#======================================================================'
    long_rule = '#=========================================================================='

    def announce(title_line, rule):
        # Emits the same five prints that precede every stage of the workflow.
        print('\n\n')
        print(rule)
        print(title_line)
        print(rule)
        print('\n\n')

    print('rna analysis for project %s' % (projectName))
    print('changing directory to project folder')
    os.chdir(projectFolder)

    announce(
        '#======================I, LOADING DATA ANNOTATION======================',
        short_rule,
    )
    # Sanity check of the RNA data table: makes sure both bam and .bai files
    # are accessible.
    pipeline_dfci.summary(rna_data_file)

    announce(
        '#=======================II, ALIGNING WITH HISAT2===========================',
        long_rule,
    )
    # Disabled:
    #pipeline_dfci.mapHisat(dataFile,namesList=[],useSRA=False,pCount=16,Launch=True)

    announce(
        '#=======================III, RUNNING RNA-SEQ ANALYSIS======================',
        long_rule,
    )
    # Disabled:
    #analysisName = 'rasmc_rna'
    #gtfFile = '/storage/cylin/grail/genomes/ERCC_Technical_Data/rn6_ercc.gtf'
    #cufflinksFolder = utils.formatFolder('%scufflinks' % (projectFolder),True)
    #bashFileName = '%s%s_cufflinks.sh' % (cufflinksFolder,analysisName)
    #groupList = [['RASMC_RNA_0H_A','RASMC_RNA_0H_B'],['RASMC_RNA_PDGF_2H_B','RASMC_RNA_PDGF_2H_C','RASMC_RNA_PDGF_2H_D'],['RASMC_RNA_PDGF_JQ1_2H_E','RASMC_RNA_PDGF_JQ1_2H_G','RASMC_RNA_PDGF_JQ1_2H_H'],['RASMC_RNA_PDGF_24H_A','RASMC_RNA_PDGF_24H_B','RASMC_RNA_PDGF_24H_D'],['RASMC_RNA_PDGF_JQ1_24H_E','RASMC_RNA_PDGF_JQ1_24H_F','RASMC_RNA_PDGF_JQ1_24H_H']]
    #print(groupList)
    #pipeline_dfci.makeCuffTableSlurm(rna_data_file,analysisName,gtfFile,cufflinksFolder,groupList,bashFileName)
    # flag useERCC to true

    announce(
        '#=======================IV, MAKE GEO TABLE=================================',
        long_rule,
    )
    geo_set = 'rasmc_rna'
    geo_folder = '/storage/cylin/grail/projects/rasmc_all/rasmc_geo/%s_geo/' % (geo_set)
    # The names list is passed empty.
    makeGEORNATable(rna_data_file, [], geo_set, geo_folder)
def main():
    """Build the figure GFF and E-box beds, then run every gene-plotting function.

    Changes into ``projectFolder``, summarizes the ChIP data tables, builds
    the figure GFF with ``make_nb_gff`` and the two bed paths with
    ``makeEboxBeds``, joins those paths with a comma and passes the GFF path
    and the joined string to each ``plot_*_genes`` function in turn.
    """
    rule = '#======================================================================'

    def announce(title_line):
        # Emits the same five prints that precede every stage of the workflow.
        print('\n\n')
        print(rule)
        print(title_line)
        print(rule)
        print('\n\n')

    print('main analysis for MYCN project')
    print('changing directory to project folder')
    os.chdir(projectFolder)

    announce('#======================I, LOADING DATA ANNOTATION======================')
    # Sanity check of the ChIP-Seq data tables: makes sure both bam and .bai
    # files are accessible.
    pipeline_dfci.summary(nb_all_chip_dataFile)
    for chip_table in chip_data_list:
        pipeline_dfci.summary(chip_table)

    announce('#========================II. MAKING FIGURE GFF=========================')
    figure_gff = make_nb_gff()

    # make the associated beds for plotting
    conserved_gff = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    canonical_bed, noncanonical_bed = makeEboxBeds(conserved_gff, name='')
    beds = ','.join((canonical_bed, noncanonical_bed))
    print(beds)

    announce('#=====================III. CALLING PLOTTING FUNCTIONS==================')
    # for the shep21 no spike system
    plot_shep21_genes(figure_gff, beds)

    # for the shep21 chiprx system
    chiprx_scale_path = '%sHG19_SHEP21_CHIPRX_SCALE_FACTORS.txt' % (tableFolder)
    plot_shep21_chiprx_genes(shep21_chiprx_dataFile, chiprx_scale_path, figure_gff, beds)

    # for the shep on system
    plot_shep_on_genes(shep_on_dataFile, figure_gff, beds)

    # for the pan NB metas
    plot_nb_all_genes(nb_all_chip_dataFile, figure_gff, beds)

    # for be2c only
    plot_be2c_genes(be2c_dataFile, figure_gff, beds)

    # The remaining tables are summarized right before they are plotted.
    # for atac
    pipeline_dfci.summary(atac_dataFile)
    plot_nb_atac_genes(atac_dataFile, figure_gff, beds)

    # for p493-6
    pipeline_dfci.summary(p4936_young_dataFile)
    plot_p4936_genes(p4936_young_dataFile, figure_gff, beds)

    # for mm1s
    pipeline_dfci.summary(mm1s_dataFile)
    plot_mm_genes(mm1s_dataFile, figure_gff, beds)
def main():
    """Run dynamic ROSE on TWIST and map samples to the TWIST/MYCN intersection.

    Changes into ``projectFolder``, summarizes ``shep21_dataFile``, runs
    ``wrapDRose`` on the 0HR and 24HR_B TWIST datasets, selects dataset names
    from the SHEP21 table, then builds a GFF of stitched 0HR TWIST + MYCN
    peak loci that overlap both peak sets and calls ``map_regions`` on it.
    """
    rule = '#======================================================================'

    def announce(title_line):
        # Emits the same five prints that precede every stage of the workflow.
        print('\n\n')
        print(rule)
        print(title_line)
        print(rule)
        print('\n\n')

    print('main analysis for MYCN project')
    print('changing directory to project folder')
    os.chdir(projectFolder)

    announce('#======================I, LOADING DATA ANNOTATION======================')
    # Sanity check of the ChIP-Seq data table: makes sure both bam and .bai
    # files are accessible.
    pipeline_dfci.summary(shep21_dataFile)

    announce('#================II. RUNNING DIFFERENTIAL ROSE ANALYSIS================')
    # use the dynamic rose tools to first map twist1 binding sites and then
    # quantify
    # rank_gff_path is only consumed by the disabled map_regions call below;
    # the wrapDRose call itself is still run.
    rank_gff_path = wrapDRose(
        shep21_dataFile, 'SHEP21_0HR_TWIST', 'SHEP21_24HR_B_TWIST', 'SHEP21_TWIST1'
    )

    announce('#=================III. MAPPING MYCN DATA TO RANK GFF===================')
    # for shep21 nospike
    sample_table = pipeline_dfci.loadDataTable(shep21_dataFile)
    # NOTE(review): `and` binds tighter than `or`, so the rep2 exclusion only
    # applies to TWIST names; MYCN and INPUT names are kept even when they
    # contain 'rep2'. The parentheses spell out that existing behavior —
    # confirm it is what was intended.
    names_list = [
        name
        for name in sample_table.keys()
        if name.count('MYCN') == 1
        or name.count('INPUT') == 1
        or (name.count('TWIST') == 1 and name.count('rep2') == 0)
    ]
    print(names_list)

    twist_peak_path = '%smacsEnriched/SHEP21_0HR_TWIST_peaks.bed' % (projectFolder)
    mycn_peak_path = '%smacsEnriched/SHEP21_0HR_MYCN_NOSPIKE_peaks.bed' % (projectFolder)

    # Disabled mapping runs (these were the only users of the first two
    # gffList values):
    #map_regions(shep21_dataFile,[rank_gff_path],names_list)
    #map_regions(shep21_dataFile,[twist_peak_path],names_list)

    # make a gff of twist and mycn sites at 0hr
    twist_peaks = utils.importBoundRegion(twist_peak_path, 'SHEP21_0HR_TWIST')
    mycn_peaks = utils.importBoundRegion(mycn_peak_path, 'SHEP21_0HR_MYCN_NOSPIKE')

    # Pool both peak sets and stitch them into merged loci.
    pooled = utils.LocusCollection(twist_peaks.getLoci() + mycn_peaks.getLoci(), 50)
    stitched_loci = pooled.stitchCollection().getLoci()

    # Keep only stitched loci overlapped by at least one TWIST peak and at
    # least one MYCN peak.
    shared_loci = [
        locus
        for locus in stitched_loci
        if len(twist_peaks.getOverlap(locus, 'both')) > 0
        and len(mycn_peaks.getOverlap(locus, 'both')) > 0
    ]

    shared_gff = utils.locusCollectionToGFF(utils.LocusCollection(shared_loci, 50))
    shared_gff_path = '%sHG19_SHEP21_0HR_TWIST_MYCN_INTERSECTION_-0_+0.gff' % (gffFolder)
    utils.unParseTable(shared_gff, shared_gff_path, '\t')

    map_regions(shep21_dataFile, [shared_gff_path], names_list)
def main():
    """Summarize every sequencing data table and write the combined table.

    Runs the pipeline summary on the ChIP-Seq, RNA-Seq, ATAC-Seq and IRF2
    ChIPmentation data tables, then calls make_summary_table to write
    tables/HG19_HPEK_SEQ_TABLE.txt under the project folder.
    """
    bar = '#======================================================================'

    def announce(title):
        # Print one section banner: blank lines, rule, title, rule, blank lines.
        print('\n\n')
        print(bar)
        print(title)
        print(bar)
        print('\n\n')

    print('main analysis for MYCN project')
    print('changing directory to project folder')
    os.chdir(projectFolder)

    announce(
        '#======================I. CHECKING CHIP-SEQ DATA======================='
    )
    # Sanity check of the ChIP-Seq data table (bam and .bai accessibility).
    pipeline_dfci.summary(chip_dataFile)

    announce(
        '#======================II. CHECKING RNA-SEQ DATA======================='
    )
    # Sanity check of the RNA-Seq data table (bam and .bai accessibility).
    pipeline_dfci.summary(rna_dataFile)

    # If no processed expression is present, the cuffquant/cuffnorm RNA-seq
    # pipeline would run here; that step is currently disabled.
    # NOTE(review): formatFolder is called with True, presumably so the
    # cufflinks folder gets created -- confirm against utils.formatFolder.
    cufflinksFolder = utils.formatFolder('%scufflinks' % (projectFolder), True)
    analysis_name = 'NIBR_YvsO'
    #groupList = [['Y_BC10_Y1','Y_BC11_Y2','Y_BC16_Y3'],['O_BC18_O1','O_BC25_O2','O_BC27_O3']]
    #bashFileName = '%s%s_rna_cufflinks.sh' % (cufflinksFolder,analysis_name)
    #pipeline_dfci.makeCuffTable(rna_dataFile,analysis_name,gtfFile,cufflinksFolder,groupList,bashFileName)

    announce(
        '#====================III. CHECKING ATAC-SEQ DATA======================='
    )
    # Sanity check of the ATAC-Seq data table (bam and .bai accessibility).
    pipeline_dfci.summary(atac_dataFile)

    announce(
        '#====================IV. CHECKING IRF2 CHIPMENTATION==================='
    )
    pipeline_dfci.summary(irf2_dataFile)

    announce(
        '#====================V. SUMMARIZING ALL DATA==========================='
    )
    summary_table_path = '%stables/HG19_HPEK_SEQ_TABLE.txt' % (projectFolder)
    make_summary_table(data_file_list, summary_table_path)
def main():
    """Drive the neuroblastoma MYCN analysis.

    Only three steps are currently active: summarizing each ChIP data table,
    resolving the path of the conserved MYCN stats table, and running the
    vector comparison R script on that table. All other steps are retained
    below as disabled (commented-out) code.
    """
    bar = '#======================================================================'

    def announce(title):
        # Print one section banner: blank lines, rule, title, rule, blank lines.
        print('\n\n')
        print(bar)
        print(title)
        print(bar)
        print('\n\n')

    print('main analysis for MYCN project')
    print('changing directory to project folder')
    os.chdir(projectFolder)

    announce(
        '#======================I, LOADING DATA ANNOTATION======================'
    )
    # Sanity check of every ChIP-Seq data table (bam and .bai accessibility).
    for chip_table in chip_data_list:
        pipeline_dfci.summary(chip_table)

    announce(
        '#==========================II. CALLING MACS============================'
    )
    # Peak finding with macs 1.4.2 on all chip datasets (disabled).
    # This usually takes ~2-3 hours on a reasonably fast machine; a 3 hour
    # time out is set on the entire operation. If peak calling takes longer,
    # simply run the script again after completion.
    # for dataFile in chip_data_list:
    #     run_macs(dataFile)

    announce(
        '#===================III. DEFINING ACTIVE GENES IN NB==================='
    )
    # Disabled: identify active promoters in various contexts as those with
    # an H3K27AC peak in the +/- 1kb tss region. UCSC refseq annotations are
    # used for all genes.
    # #make_nb_active_gene_lists(nb_all_chip_dataFile)
    # make_active_gene_lists(mm1s_dataFile,p4936_young_dataFile,sclc_dataFile,shep_on_dataFile,u87_dataFile)

    announce(
        '#===============IV. DEFINING NB MYCN AND H3K27AC LANDSCAPE============='
    )
    # #for enhancers
    # enhancer_bashFileName,enhancer_region_map_path,namesList = define_enhancer_landscape(projectFolder,pipeline_dir,nb_all_chip_dataFile)
    # #runs only if no output detected
    # if not utils.checkOutput(enhancer_region_map_path,0,0):
    #     print(enhancer_bashFileName)
    #     os.system('bash %s' % (enhancer_bashFileName))
    # #for mycn
    # mycn_bashFileName,mycn_region_map_path,namesList = define_mycn_landscape(projectFolder,pipeline_dir,nb_all_chip_dataFile)
    # if not utils.checkOutput(mycn_region_map_path,0,0):
    #     print(mycn_bashFileName)
    #     os.system('bash %s' % (mycn_bashFileName))
    # #now we need to call the R script that creates the rank plots
    # if utils.checkOutput(mycn_region_map_path,1,30): #set a wait time for 30 minutes
    #     print('Found NB_MYCN meta_rose landscape and running rank plot R code')
    #     conserved_rank_path = '%smeta_rose/NB_MYCN/NB_MYCN_0KB_STITCHED_ENHANCER_REGION_RANK_CONSERVED.txt' % (projectFolder)
    #     if utils.checkOutput(conserved_rank_path,0,0):
    #         print('Identified NB rank conserved regions: %s' % (conserved_rank_path))
    #     else:
    #         print('Defining NB rank conserved regions')
    #         name_string = ','.join(namesList) #provides the dataset names used
    #         rank_script_path = '%sr_scripts/1_nb_mycn_rank.R' % (projectFolder)
    #         r_cmd = 'Rscript %s %s %s %s' % (rank_script_path,mycn_region_map_path,name_string,projectFolder)
    #         print(r_cmd)
    #         os.system(r_cmd)

    announce(
        '#==========V. MAPPING MYCN AND H3K27AC TO MYCN REGIONS================='
    )
    # #here we will first make a gff of conserved NB MYCN regions
    # #and then map MYCN and H3K27ac signal
    # print('Making a gff and bed of conserved NB MYCN regions:')
    # mycn_gff_path,mycn_flank_gff_path = make_mycn_regions(conserved_rank_path)
    # print('Mapping MYCN and H3K27AC signal')
    # gffList = [mycn_gff_path,mycn_flank_gff_path]
    #gffList = ['%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder),'%sHG19_NB_MYCN_CONSERVED_-500_+500.gff' % (gffFolder)]
    #pipeline_dfci.map_regions(nb_all_chip_dataFile,gffList,mappedFolder,signalFolder)

    announce(
        '#==================VI. CREATING NB MYCN STATS TABLE===================='
    )
    # mycn_table_path = '%stables/HG19_NB_MYCN_CONSERVED_STATS_TABLE.txt' % (projectFolder)
    # if utils.checkOutput(mycn_table_path,0,0):
    #     print('Identified MYCN table %s' % (mycn_table_path))
    # else:
    #     print('Making MYCN stats table')
    #     mycn_table_path = make_mycn_stats_table(nb_all_chip_dataFile,mycn_table_path)

    # The table itself is not regenerated here; only its path is resolved for
    # the plotting step below.
    mycn_table_path = '%stables/HG19_NB_MYCN_CONSERVED_STATS_TABLE.txt' % (
        projectFolder)
    #mycn_table_path = make_mycn_stats_table(nb_all_chip_dataFile,mycn_table_path)

    announce(
        '#=================VII. MAKING VECTOR COMPARISON PLOTS=================='
    )
    plot_script = '%sr_scripts/2_nb_mycn_vector_plots.R' % (projectFolder)
    r_cmd = 'Rscript %s %s %s' % (plot_script, mycn_table_path, projectFolder)
    print(r_cmd)
    os.system(r_cmd)

    announce(
        '#==================VIII. RANKING EBOXES IN MYCN PEAKS=================='
    )
    # mycn_gff_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    # ebox_rank_path = rank_eboxes(nb_all_chip_dataFile,mycn_gff_path,macsFolder,genomeDirectory,window = 100)
    # print(ebox_rank_path)
    # #now make the heatmap
    # ebox_heatmap_script_path = '%sr_scripts/3_nb_ebox_heatmap.R' % (projectFolder)
    # r_cmd = 'Rscript %s %s %s' % (ebox_heatmap_script_path,ebox_rank_path,projectFolder)
    # print(r_cmd)
    # os.system(r_cmd)

    announce(
        '#====================IX. MAPPING BE2C DATASETS TO TSS=================='
    )
def main():
    """Check the ChIP-Seq, RNA-Seq and ATAC-Seq tables and run cufflinks.

    After summarizing the RNA-Seq data table, builds the cufflinks bash
    script with pipeline_dfci.makeCuffTable, runs it with bash and waits for
    it to finish. If the script exits with a non-zero status, an error is
    printed and the process exits with status 1.
    """
    print('main analysis for MYCN project')
    print('changing directory to project folder')
    os.chdir(projectFolder)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#======================I. CHECKING CHIP-SEQ DATA======================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    # Sanity check of the ChIP-Seq data table (bam and .bai accessibility).
    pipeline_dfci.summary(chip_dataFile)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#======================II. CHECKING RNA-SEQ DATA======================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    # Sanity check of the RNA-Seq data table (bam and .bai accessibility).
    pipeline_dfci.summary(rna_dataFile)

    # Build and run the cuffquant/cuffnorm RNA-seq pipeline script.
    cufflinksFolder = utils.formatFolder('%scufflinks' % (projectFolder), True)
    analysis_name = 'NIBR_YvsO'
    groupList = [['Y_BC10_Y1', 'Y_BC11_Y2', 'Y_BC16_Y3'],
                 ['O_BC18_O1', 'O_BC25_O2', 'O_BC27_O3']]
    bashFileName = '%s%s_rna_cufflinks.sh' % (cufflinksFolder, analysis_name)
    pipeline_dfci.makeCuffTable(rna_dataFile, analysis_name, gtfFile,
                                cufflinksFolder, groupList, bashFileName)

    call_bashFileName = 'bash %s' % bashFileName
    proc = subprocess.Popen(call_bashFileName, shell=True)

    # Block until cufflinks has finished.
    proc.wait()

    # A non-zero return code means the bash script failed: report and abort.
    # (The original line lacked the '%' operator and used a print statement,
    # so it raised instead of printing the message.)
    if proc.returncode:
        print('running %s failed' % (call_bashFileName))
        sys.exit(1)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#====================III. CHECKING ATAC-SEQ DATA======================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    # Sanity check of the ATAC-Seq data table (bam and .bai accessibility).
    pipeline_dfci.summary(atac_dataFile)
def main():
    """Drive the IRF2 keratinocyte ChIP analysis.

    Active steps: summarize the ChIP data table and plot the figure 2 gene
    regions for the two HK_DOX_HA replicates. The nested helpers
    (merge_regions, wrapRose2Meta, wrap_enhancer_promoter) are defined but
    their calls are currently disabled; the disabled steps are retained as
    commented-out code.
    """
    bar = '#======================================================================'

    def announce(title):
        # Print one section banner: blank lines, rule, title, rule, blank lines.
        print('\n\n')
        print(bar)
        print(title)
        print(bar)
        print('\n\n')

    print('main analysis for project %s' % (projectName))
    print('changing directory to project folder')
    os.chdir(projectFolder)

    announce(
        '#======================I. LOADING DATA ANNOTATION======================'
    )
    # Sanity check of the ChIP data table (bam and .bai accessibility).
    pipeline_dfci.summary(chip_data_file)

    announce(
        '#===========================II. CALLING MACS==========================='
    )
    #pipeline_dfci.run_macs(chip_data_file,projectFolder,macsFolder,macsEnrichedFolder,wiggleFolder,useBackground=True)

    announce(
        '#=======================III. MERGING IRF2 REGIONS======================'
    )

    # create a set of regions representing the intersect of peaks
    # filter out anything that overlaps a peak in the HA ctl
    def merge_regions():
        '''
        merges ha peaks to identify all overlapping peaks
        filters out anything overlapping the HA controls

        Writes the surviving regions as a GFF into gffFolder.
        '''
        hk_dox_ha_1 = utils.importBoundRegion(
            '%sHK_DOX_HA_1_peaks.bed' % (macsEnrichedFolder), 'HK_DOX_HA_1')
        hk_dox_ha_2 = utils.importBoundRegion(
            '%sHK_DOX_HA_2_peaks.bed' % (macsEnrichedFolder), 'HK_DOX_HA_2')
        hk_dox_loci = hk_dox_ha_1.getLoci() + hk_dox_ha_2.getLoci()

        # control datasets
        hk_ctl_ha_1 = utils.importBoundRegion(
            '%sHK_CTL_HA_1_peaks.bed' % (macsEnrichedFolder), 'HK_CTL_HA_1')
        hk_ctl_ha_2 = utils.importBoundRegion(
            '%sHK_CTL_HA_2_peaks.bed' % (macsEnrichedFolder), 'HK_CTL_HA_2')
        hk_ctl_loci = hk_ctl_ha_1.getLoci() + hk_ctl_ha_2.getLoci()
        hk_ctl_lc = utils.LocusCollection(hk_ctl_loci)

        print(len(hk_dox_loci))
        stitched_lc = utils.LocusCollection(hk_dox_loci).stitchCollection()
        print(len(stitched_lc))

        # Keep stitched loci seen in both DOX replicates and in no control.
        filtered_loci = []
        for locus in stitched_lc.getLoci():
            in_both_replicates = (len(hk_dox_ha_1.getOverlap(locus)) > 0
                                  and len(hk_dox_ha_2.getOverlap(locus)) > 0)
            if in_both_replicates and len(hk_ctl_lc.getOverlap(locus)) == 0:
                filtered_loci.append(locus)
        print(len(filtered_loci))

        filtered_lc = utils.LocusCollection(filtered_loci)
        gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_0_+0.gff' % (
            gffFolder)
        filtered_gff = utils.locusCollectionToGFF(filtered_lc)
        utils.unParseTable(filtered_gff, gff_path, '\t')

    #merge_regions()

    announce(
        '#======================IV. IDENTIFY ATAC OVERLAP REGIONS==============='
    )
    # atac_bed_path = '%sHG19_combined_atac_-0_+0.bed' % (bedFolder)# all combined atac regions
    # atac_collection = utils.importBoundRegion(atac_bed_path,'HG19_combined_atac')
    # print(len(atac_collection))
    # #now filter the irf2 gff
    # irf2_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_0_+0.gff' % (gffFolder)
    # irf2_collection = utils.gffToLocusCollection(irf2_gff_path)
    # irf2_loci = irf2_collection.getLoci()
    # irf2_atac_loci = [locus for locus in irf2_loci if atac_collection.getOverlap(locus)]
    # print(len(irf2_atac_loci))
    # irf2_atac_collection=utils.LocusCollection(irf2_atac_loci)
    # irf2_atac_gff = utils.locusCollectionToGFF(irf2_atac_collection)
    # irf2_atac_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_ATAC_0_+0.gff' % (gffFolder)
    # utils.unParseTable(irf2_atac_gff,irf2_atac_gff_path,'\t')
    # # overlap with TSS
    # tss_gff_path = '%sHG19_TSS_ALL_-1000_+1000.gff' % (gffFolder)
    # tss_gff = utils.parseTable(tss_gff_path,'\t')
    # tss_collection = utils.gffToLocusCollection(tss_gff)
    # print('tss overlap w/ IRF2 atac peaks')
    # print(len([locus for locus in irf2_atac_loci if tss_collection.getOverlap(locus)]))
    # print(len(irf2_atac_loci))
    # #overlap w/ k27ac
    # k27ac_gff_path = '%sHG19_keratinocyte_combined_all_-0_+0.gff' % (gffFolder)
    # k27ac_gff = utils.parseTable(k27ac_gff_path,'\t')
    # k27ac_collection = utils.gffToLocusCollection(k27ac_gff)
    # print('k27ac overlap w/ IRF2 atac peaks')
    # print(len([locus for locus in irf2_atac_loci if k27ac_collection.getOverlap(locus)]))
    # print(len(irf2_atac_loci))

    announce(
        '#========================V. CALLING ROSE2 META========================='
    )

    def wrapRose2Meta(data_file,
                      input_path,
                      parent_folder,
                      active_gene_path='',
                      rank_list=None,
                      control_list=None,
                      analysis_name=''):
        '''
        quick wrapper for Rose2Meta

        Writes a bash script containing the ROSE2_META command followed by
        the ROSE2_geneMapper command; the script is written, not executed.
        rank_list / control_list are dataset names looked up in the data
        table for their bam paths. active_gene_path, when non-empty, is
        passed to the gene mapper with -l.
        '''
        # None instead of a shared mutable [] default.
        rank_list = [] if rank_list is None else rank_list
        control_list = [] if control_list is None else control_list

        dataDict = pipeline_dfci.loadDataTable(data_file)
        rank_string = ','.join([dataDict[name]['bam'] for name in rank_list])
        control_string = ','.join(
            [dataDict[name]['bam'] for name in control_list])

        output_folder = utils.formatFolder(
            '%s%s' % (parent_folder, analysis_name), True)

        # NOTE(review): blacklist_path is not assigned anywhere in the active
        # code of this function or of main (only in the disabled block
        # below); it must exist at module level or be restored before this
        # helper is called.
        rose2_meta_cmd = '%s %sROSE2_META.py -g %s -i %s -r %s -c %s -n %s -o %s -s 0 -t 0 --mask %s' % (
            py27_path, pipeline_dir, genome, input_path, rank_string,
            control_string, analysis_name, output_folder, blacklist_path)

        all_enhancer_path = '%s%s_AllEnhancers.table.txt' % (output_folder,
                                                             analysis_name)

        if active_gene_path != '':
            rose2_map_cmd = '%s %sROSE2_geneMapper.py -g %s -i %s -l %s' % (
                py27_path, pipeline_dir, genome, all_enhancer_path,
                active_gene_path)
        else:
            rose2_map_cmd = '%s %sROSE2_geneMapper.py -g %s -i %s' % (
                py27_path, pipeline_dir, genome, all_enhancer_path)

        rose_bash_path = '%s%s_rose2_meta.sh' % (parent_folder, analysis_name)
        # Context manager so the handle is closed even if a write fails.
        with open(rose_bash_path, 'w') as rose_bash:
            # The file holds shell commands, so it gets a bash shebang (the
            # original wrote a python shebang into this .sh file).
            rose_bash.write('#!/usr/bin/bash\n\n')
            rose_bash.write('#setting up bamliquidator\n')
            rose_bash.write('\n\n#ROSE2_CMD\n')
            rose_bash.write(rose2_meta_cmd + '\n')
            rose_bash.write(rose2_map_cmd + '\n')

        print('Wrote ROSE2 META CMD to %s' % (rose_bash_path))

    #use ROSE2 w/ -t 0 and -s 0 to quantify background subtracted AUC at all peaks
    # parent_folder = utils.formatFolder('%smeta_rose/' % (projectFolder),True)
    # blacklist_path = '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed'
    # active_gene_path = '%sgeneListFolder/HG19_KERATINOCYTE_ACTIVE.txt' % (projectFolder)
    # #creating bam lists
    # rank_list = ['HK_DOX_HA_1','HK_DOX_HA_2']
    # control_list =['HK_DOX_WCE_1','HK_DOX_WCE_2']
    # #for all IRF2 HA
    # irf2_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_0_+0.gff' % (gffFolder)
    # analysis_name = 'IRF2_HA'
    # wrapRose2Meta(chip_data_file,irf2_gff_path,parent_folder,active_gene_path,rank_list,control_list,analysis_name)
    # irf2_atac_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_ATAC_0_+0.gff' % (gffFolder)
    # analysis_name = 'IRF2_HA_ATAC'
    # wrapRose2Meta(chip_data_file,irf2_atac_gff_path,parent_folder,active_gene_path,rank_list,control_list,analysis_name)

    announce(
        '#================VI. OVERLAPPING IRF2 W/ MOTIF PREDICTIONS============='
    )
    # #load up peaks
    # #irf2_atac_peaks
    # irf2_atac_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_ATAC_0_+0.gff' % (gffFolder)
    # irf2_atac_gff = utils.parseTable(irf2_atac_gff_path,'\t')
    # irf2_atac_loci = utils.gffToLocusCollection(irf2_atac_gff).getLoci()
    # irf2_atac_collection = utils.LocusCollection(irf2_atac_loci)
    # print(len(irf2_atac_loci))
    # irf2_edge_path = '%scrc_atac/keratinocyte_combined_all/keratinocyte_combined_all_EDGE_TABLE_signal_filtered_IRF2_EDGES.txt' % (projectFolder)
    # irf2_edge_table = utils.parseTable(irf2_edge_path,'\t')
    # print(len(irf2_edge_table))
    # irf2_confirmed_edges = []
    # irf2_edge_loci = []
    # for line in irf2_edge_table[1:]:
    #     chrom = line[1].split('(')[0]
    #     coords = [int(x) for x in line[1].split(':')[-1].split('-')]
    #     locus = utils.Locus(chrom,coords[0]-00,coords[1]+00,'.',line[0])
    #     if len(irf2_atac_collection.getOverlap(locus)) > 0:
    #         irf2_confirmed_edges.append(line)
    #     irf2_edge_loci.append(locus)
    # print(len(irf2_confirmed_edges))
    # irf2_confirmed_edge_path = '%scrc_atac/keratinocyte_combined_all/keratinocyte_combined_all_EDGE_TABLE_signal_filtered_IRF2_EDGES_CONFIRMED.txt' % (projectFolder)
    # utils.unParseTable(irf2_confirmed_edges,irf2_confirmed_edge_path,'\t')
    # irf2_edge_collection = utils.LocusCollection(irf2_edge_loci)
    # print(len(irf2_edge_collection))
    # overlap_count = 0
    # for locus in irf2_atac_loci:
    #     search_locus = utils.makeSearchLocus(locus,0,0)
    #     if len(irf2_edge_collection.getOverlap(search_locus)) >0:
    #         overlap_count+=1
    # print(overlap_count)

    announce(
        '#=================VII. RUNNING ENHANCER PROMOTER ON IRF2==============='
    )

    def wrap_enhancer_promoter(dataFile,
                               input_path,
                               activity_path,
                               analysis_name,
                               names_list=None,
                               useBackground=True):
        '''
        runs enhancer promoter on everybody with the conserved regions and union of active genes

        Writes a bash script with the enhancerPromoter.py command into the
        project's enhancerPromoter/ folder and returns the script path; the
        script is written, not executed. When names_list is empty or None,
        every dataset in the data table is used. Dataset names are processed
        in sorted order.
        '''
        # hard coded paths
        tads_path = '%shESC_domains_hg19.bed' % (bedFolder)

        # setting the output folder
        ep_folder = utils.formatFolder('%senhancerPromoter/' % (projectFolder),
                                       True)

        dataDict = pipeline_dfci.loadDataTable(dataFile)

        # sorted() builds a new list, so a caller-supplied list is no longer
        # reordered in place as a side effect.
        if names_list:
            names_list = sorted(names_list)
        else:
            names_list = sorted(dataDict.keys())

        bams_string = ' '.join([dataDict[name]['bam'] for name in names_list])

        if useBackground:
            # Background bams are resolved only when they are actually used,
            # so datasets without a usable background entry do not break the
            # no-background mode.
            background_names = [
                dataDict[name]['background'] for name in names_list
            ]
            background_string = ' '.join([
                dataDict[background_name]['bam']
                for background_name in background_names
            ])
            python_cmd = 'python %senhancerPromoter.py -b %s -c %s -g %s -i %s -o %s -a %s --name %s --tads %s --top 2000\n\n' % (
                pipeline_dir, bams_string, background_string, genome.upper(),
                input_path, ep_folder, activity_path, analysis_name, tads_path)
        else:
            python_cmd = 'python %senhancerPromoter.py -b %s -g %s -i %s -o %s -a %s --name %s --tads %s --top 2000\n\n' % (
                pipeline_dir, bams_string, genome.upper(), input_path,
                ep_folder, activity_path, analysis_name, tads_path)

        ep_bash_path = '%s%s_enhancer_promoter.sh' % (ep_folder, analysis_name)
        # Context manager so the handle is closed even if a write fails.
        with open(ep_bash_path, 'w') as ep_bash:
            ep_bash.write('#!/usr/bin/bash\n\n\n')
            ep_bash.write('#enhancer promoter analysis for %s\n\n' %
                          (analysis_name))
            ep_bash.write(python_cmd)

        return (ep_bash_path)

    # # blacklist_path = '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed'
    # active_gene_path = '%sgeneListFolder/HG19_KERATINOCYTE_ACTIVE.txt' % (projectFolder)
    # irf2_atac_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_ATAC_0_+0.gff' % (gffFolder)
    # analysis_name = 'IRF2_HA_ATAC'
    # bam_list = ['HK_DOX_HA_1','HK_DOX_HA_2']
    # wrap_enhancer_promoter(chip_data_file,irf2_atac_gff_path,active_gene_path,analysis_name,bam_list,useBackground=True)

    announce(
        '#==============VIII. FORMATTING THE HORRIFYING EXPRESSION TABLE========'
    )
    # exp_path = '%sirf2_kd_rna_seq/single_counts_filtered_counts.txt' % (projectFolder)
    # sample_key_path = '%sirf2_kd_rna_seq/sample_key.txt' % (projectFolder)
    # sample_table = utils.parseTable(sample_key_path,'\t')
    # sample_list = [line[0] for line in sample_table[1:]]
    # print(sample_list)
    # exp_table = utils.parseTable(exp_path,'\t')
    # #for each gene make a dictionary
    # exp_dict = {}
    # #first fill out the dictionary by gene name
    # for line in exp_table[1:]:
    #     gene_name = line[3].replace('"','')
    #     exp_dict[gene_name] = {}
    # print(len(exp_dict.keys()))
    # for line in exp_table[1:]:
    #     gene_name = line[3].replace('"','')
    #     sample_ID = line[4].replace('"','')
    #     counts = line[2]
    #     exp_dict[gene_name][sample_ID] = counts
    # #make the formatted expression table
    # header = ['GENE_NAME'] + sample_list
    # exp_table_formatted = [header]
    # gene_list = exp_dict.keys()
    # gene_list.sort()
    # for gene in gene_list:
    #     exp_line = [gene] + [exp_dict[gene][sample_ID] for sample_ID in sample_list]
    #     exp_table_formatted.append(exp_line)
    # exp_table_formatted_path = '%sirf2_kd_rna_seq/irf2_expression_formatted.txt' % (projectFolder)
    # utils.unParseTable(exp_table_formatted,exp_table_formatted_path,'\t')
    # #with the exp dict we can make a nicer version of the gene table
    # gene_table_path = '%senhancerPromoter/IRF2_HA_ATAC/IRF2_HA_ATAC_GENE_TABLE.txt' % (projectFolder)
    # gene_table_formatted_path = '%senhancerPromoter/IRF2_HA_ATAC/IRF2_HA_ATAC_GENE_TABLE_FORMATTED.txt' % (projectFolder)
    # gene_table = utils.parseTable(gene_table_path,'\t')
    # gene_table_formatted = [gene_table[0] + ['IRF2_TOTAL_SIGNAL'] + header+ ['OLD_IRF2_KD_MEAN','OLD_CTL_MEAN','OLD_IRF2_VS_CTL','YOUNG_IRF2_KD_MEAN','YOUNG_CTL_MEAN','YOUNG_IRF2_VS_CTL']]
    # for line in gene_table[1:]:
    #     if float(line[1]) == 0.0 and float(line[2]) == 0.0:
    #         continue
    #     if exp_dict.has_key(line[0]) == False:
    #         continue
    #     gene = line[0]
    #     #where conditions are met
    #     old_kd_mean = numpy.mean([float(exp_dict[gene][x]) for x in ['IRF2_KD_OLD_1', 'IRF2_KD_OLD_2', 'IRF2_KD_OLD_3']])
    #     old_ctl_mean = numpy.mean([float(exp_dict[gene][x]) for x in ['CT_CRISPR_OLD_1', 'CT_CRISPR_OLD_2', 'CT_CRISPR_OLD_3']])
    #     old_fold = numpy.log2(old_kd_mean/old_ctl_mean)
    #     young_kd_mean = numpy.mean([float(exp_dict[gene][x]) for x in ['IRF2_KD_YOUNG_1', 'IRF2_KD_YOUNG_2', 'IRF2_KD_YOUNG_3']])
    #     young_ctl_mean = numpy.mean([float(exp_dict[gene][x]) for x in ['CT_CRISPR_YOUNG_1', 'CT_CRISPR_YOUNG_2', 'CT_CRISPR_YOUNG_3']])
    #     young_fold = numpy.log2(young_kd_mean/young_ctl_mean)
    #     exp_line = [gene] + [exp_dict[gene][sample_ID] for sample_ID in sample_list] + [round(x,4) for x in [old_kd_mean,old_ctl_mean,old_fold,young_kd_mean,young_ctl_mean,young_fold]]
    #     gene_table_formatted.append(line+[sum([float(x) for x in line[1:3]])] + exp_line)
    # utils.unParseTable(gene_table_formatted,gene_table_formatted_path,'\t')

    announce(
        '#=================IX. ANNOTATING IRF2 KD CLUSTERGRAM==================='
    )
    #this little bit of python code is on the dropbox... need to move over

    announce(
        '#=======================X. PLOTTING FIGURE REGIONS ===================='
    )
    figure_gff_path = '%sHG19_KERATINOCYTE_FIGURE_2_GENES.gff' % (gffFolder)
    plotName = 'IRF2_FIGURE_2_GENES'
    outputFolder = utils.formatFolder('%sgene_plot/IRF2/' % (projectFolder),
                                      True)
    pipeline_dfci.callBatchPlot(chip_data_file,
                                figure_gff_path,
                                plotName,
                                outputFolder,
                                namesList=['HK_DOX_HA_1', 'HK_DOX_HA_2'],
                                uniform=True,
                                bed='',
                                plotType='MULTIPLE',
                                extension=200,
                                multiPage=False,
                                debug=False,
                                nameString='',
                                rpm=True,
                                rxGenome='',
                                scaleFactorString='')
def main():
    """Summarize the ChIP data tables and map POL2 signal for SHEP21 ChIP-Rx.

    Signal is mapped to the TSS (-300/+300) and gene body (+300/+3000) GFFs
    for the SHEP21 ChIP-Rx POL2 and input datasets. The equivalent mapping
    for the no-spike datasets is currently disabled.
    """
    bar = '#======================================================================'

    def announce(title):
        # Print one section banner: blank lines, rule, title, rule, blank lines.
        print('\n\n')
        print(bar)
        print(title)
        print(bar)
        print('\n\n')

    print('main analysis for MYCN project')
    print('changing directory to project folder')
    os.chdir(projectFolder)

    announce(
        '#======================I. CHECKING CHIP-SEQ DATA======================='
    )
    # Sanity check of every ChIP-Seq data table (bam and .bai accessibility).
    for chip_table in chip_data_list:
        pipeline_dfci.summary(chip_table)

    announce(
        '#===================II. MAKING POL2 SIGNAL TABLES======================'
    )
    region_gffs = [
        '%sHG19_TSS_ALL_-300_+300.gff' % (gffFolder),
        '%sHG19_BODY_ALL_+300_+3000.gff' % (gffFolder),
    ]

    # Datasets for the no-spike mapping, which is currently disabled.
    nospike_names = [
        'SHEP21_0HR_POL2_NOSPIKE_R2',
        'SHEP21_2HR_POL2_NOSPIKE',
        'SHEP21_24HR_POL2_NOSPIKE',
        'SHEP21_0HR_INPUT_NOSPIKE',
        'SHEP21_2HR_INPUT_NOSPIKE_rep2',
        'SHEP21_24HR_INPUT_NOSPIKE_rep2',
    ]
    #shep21_nospike_pol2_signal_path = pipeline_dfci.map_regions(shep21_dataFile,gffList,mappedFolder,signalFolder,names_list,medianNorm=False,output='')

    # now for shep21 chiprx
    chiprx_names = [
        'SHEP21_0HR_POL2_RX',
        'SHEP21_2HR_POL2_RX',
        'SHEP21_24HR_POL2_RX',
        'SHEP21_0HR_INPUT_RX_1',
        'SHEP21_2HR_INPUT_RX_1',
        'SHEP21_24HR_INPUT_RX_1',
    ]
    shep21_chiprx_pol2_signal_path = pipeline_dfci.map_regions(
        shep21_chiprx_dataFile,
        region_gffs,
        mappedFolder,
        signalFolder,
        chiprx_names,
        medianNorm=False,
        output='')

    announce(
        '#====================III. CHECKING ATAC-SEQ DATA======================='
    )

    announce(
        '#======================IV. CHECKING CHIPRX DATA========================'
    )
def main():
    """Summarize the ch22 and umchor1 RNA-Seq data tables.

    The bam/bai symlink repair step is retained as disabled code; the
    cuffnorm section currently only prints its banner.
    """
    bar = '#======================================================================'

    def announce(title):
        # Print one section banner: blank lines, rule, title, rule, blank lines.
        print('\n\n')
        print(bar)
        print(title)
        print(bar)
        print('\n\n')

    print('main analysis for project %s' % (projectName))
    print('changing directory to project folder')
    os.chdir(projectFolder)

    announce(
        '#=======================I. FIXING LINKS FOR BAMS======================='
    )
    # bam_folder = '/storage/cylin/grail/projects/chordoma_rna/190612_rna_seq/bams/'
    # def symlink_bai(bam_folder):
    #     '''
    #     resolves the symlinks of the bams to also symlink the bais
    #     '''
    #     bam_file_list = ['%s%s' % (bam_folder,fh) for fh in os.listdir(bam_folder) if fh.count('bam') > 0]
    #     print(bam_file_list)
    #     for bam_path in bam_file_list:
    #         #print(bam_path)
    #         #print(os.path.realpath(bam_path))
    #         bam_origin = os.path.realpath(bam_path)
    #         sym_origin = bam_origin.replace('.bam','.bam.bai')
    #         sym_dest = bam_path.replace('.bam','.bam.bai')
    #         #print(sym_origin)
    #         #print(sym_dest)
    #         sym_cmd ='ln -s %s %s' % (sym_origin,sym_dest)
    #         os.system(sym_cmd)
    # symlink_bai(bam_folder)

    announce(
        '#=====================II. LOADING DATA ANNOTATION======================'
    )
    # Sanity check of each RNA data table (bam and .bai accessibility):
    # first the ch22 table, then the umchor1 table.
    for rna_table in (ch22_rna_data_file, umchor1_rna_data_file):
        pipeline_dfci.summary(rna_table)

    announce(
        '#======================III. RUNNING CUFFNORM==========================='
    )