def plot_mm_genes(mm1s_dataFile, nb_figure_gff_path, bed_string):
    '''
    Plots MYC and H3K27AC tracks for the MM1S datasets at the NB figure genes.

    mm1s_dataFile: path to a data table loadable by pipeline_dfci.loadDataTable
    nb_figure_gff_path: gff of the regions to plot
    bed_string: bed track string passed straight through to callBatchPlot
    '''
    #first establish the plot folder
    plotFolder = utils.formatFolder('%sMM1S/' % (genePlotFolder), True)
    plot_prefix = 'HG19_NB_FIGURE_GENES'

    #we also have to set the extension properly between datasets
    #go by data file
    dataDict = pipeline_dfci.loadDataTable(mm1s_dataFile)
    #list() so indexing below also works on python3 dict views
    names_list = list(dataDict.keys())

    #derive the fragment extension from the read length of the first bam
    #(assumes every dataset in the table shares a read length -- TODO confirm)
    bam = utils.Bam(dataDict[names_list[0]]['bam'])
    read_length = bam.getReadLengths()[0]
    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' %
          (mm1s_dataFile, bam_extension))

    #first do individuals: one multi-track plot per factor
    for plot_group in ['MYC', 'H3K27AC']:
        #substring match, so 'MYC' also picks up e.g. MYCN-named datasets
        plotList = [name for name in dataDict.keys() if plot_group in name]
        plotName = '%s_MM1S_%s' % (plot_prefix, plot_group)
        print(plotName)
        pipeline_dfci.callBatchPlot(mm1s_dataFile,
                                    nb_figure_gff_path,
                                    plotName,
                                    plotFolder,
                                    plotList,
                                    uniform=True,
                                    bed=bed_string,
                                    plotType='MULTIPLE',
                                    extension=bam_extension,
                                    multiPage=False,
                                    debug=False,
                                    nameString='',
                                    rpm=True,
                                    rxGenome='')
def plot_nb_atac_genes(atac_dataFile, nb_figure_gff_path, bed_string):
    '''
    Plots ATAC tracks for the neuroblastoma lines at the NB figure genes:
    individual tracks first, then merged metas (relative and uniform scaling).

    atac_dataFile: path to a data table loadable by pipeline_dfci.loadDataTable
    nb_figure_gff_path: gff of the regions to plot
    bed_string: bed track string passed straight through to callBatchPlot
    '''
    #first establish the plot folder
    plotFolder = utils.formatFolder('%sNB_ATAC/' % (genePlotFolder), True)
    plot_prefix = 'HG19_NB_FIGURE_GENES'

    #we also have to set the extension properly between datasets
    #go by data file
    dataDict = pipeline_dfci.loadDataTable(atac_dataFile)
    #list() so indexing below also works on python3 dict views
    names_list = list(dataDict.keys())

    #derive the fragment extension from the read length of the first bam
    #(assumes every dataset in the table shares a read length -- TODO confirm)
    bam = utils.Bam(dataDict[names_list[0]]['bam'])
    read_length = bam.getReadLengths()[0]
    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' %
          (atac_dataFile, bam_extension))

    #first do individuals, excluding the MM1S control line
    for plot_group in ['ATAC']:
        plotList = [
            name for name in dataDict.keys()
            if plot_group in name and 'MM1S' not in name
        ]
        plotName = '%s_NB_%s' % (plot_prefix, plot_group)
        print(plotName)
        pipeline_dfci.callBatchPlot(atac_dataFile,
                                    nb_figure_gff_path,
                                    plotName,
                                    plotFolder,
                                    plotList,
                                    uniform=True,
                                    bed=bed_string,
                                    plotType='MULTIPLE',
                                    extension=bam_extension,
                                    multiPage=False,
                                    debug=False,
                                    nameString='',
                                    rpm=True,
                                    rxGenome='')

    #now as metas: merge the four NB lines into one track
    plotList = [
        'BE2C_ATAC_rep1',
        'KELLY_ATAC',
        'NGP_ATAC',
        'SHEP21_ATAC',
    ]
    groupString = 'ATAC,ATAC,ATAC,ATAC'

    plotName = '%s_NB_ATAC_META_RELATIVE' % (plot_prefix)
    pipeline_dfci.callBatchPlot(atac_dataFile,
                                nb_figure_gff_path,
                                plotName,
                                plotFolder,
                                plotList,
                                uniform=False,
                                bed=bed_string,
                                plotType='MERGE',
                                extension=bam_extension,
                                multiPage=False,
                                debug=False,
                                nameString=groupString,
                                rpm=True,
                                rxGenome='')

    plotName = '%s_NB_ATAC_META_UNIFORM' % (plot_prefix)
    pipeline_dfci.callBatchPlot(atac_dataFile,
                                nb_figure_gff_path,
                                plotName,
                                plotFolder,
                                plotList,
                                uniform=True,
                                bed=bed_string,
                                plotType='MERGE',
                                extension=bam_extension,
                                multiPage=False,
                                debug=False,
                                nameString=groupString,
                                rpm=True,
                                rxGenome='')
def plot_be2c_genes(be2c_dataFile, nb_figure_gff_path, bed_string):
    '''
    Plots BE2C tracks at the NB figure genes: all non-TWIST/non-INPUT datasets
    with both relative and uniform scaling, then TWIST on its own with a
    fixed extension.

    be2c_dataFile: path to a data table loadable by pipeline_dfci.loadDataTable
    nb_figure_gff_path: gff of the regions to plot
    bed_string: bed track string passed straight through to callBatchPlot
    '''
    #first establish the plot folder
    plotFolder = utils.formatFolder('%sBE2C/' % (genePlotFolder), True)
    plot_prefix = 'HG19_NB_FIGURE_GENES'

    #we also have to set the extension properly between datasets
    #go by data file
    dataDict = pipeline_dfci.loadDataTable(be2c_dataFile)
    #list() so indexing below also works on python3 dict views
    names_list = list(dataDict.keys())

    #derive the fragment extension from the read length of the first bam
    #(assumes every dataset in the table shares a read length -- TODO confirm)
    bam = utils.Bam(dataDict[names_list[0]]['bam'])
    read_length = bam.getReadLengths()[0]
    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' %
          (be2c_dataFile, bam_extension))

    #first do individuals except for twist using relative scaling
    plotList = [
        name for name in dataDict.keys()
        if 'TWIST' not in name and 'INPUT' not in name
    ]

    plotName = '%s_BE2C_RELATIVE' % (plot_prefix)
    print(plotName)
    pipeline_dfci.callBatchPlot(be2c_dataFile,
                                nb_figure_gff_path,
                                plotName,
                                plotFolder,
                                plotList,
                                uniform=False,
                                bed=bed_string,
                                plotType='MULTIPLE',
                                extension=bam_extension,
                                multiPage=False,
                                debug=False,
                                nameString='',
                                rpm=True,
                                rxGenome='')

    plotName = '%s_BE2C_UNIFORM' % (plot_prefix)
    print(plotName)
    pipeline_dfci.callBatchPlot(be2c_dataFile,
                                nb_figure_gff_path,
                                plotName,
                                plotFolder,
                                plotList,
                                uniform=True,
                                bed=bed_string,
                                plotType='MULTIPLE',
                                extension=bam_extension,
                                multiPage=False,
                                debug=False,
                                nameString='',
                                rpm=True,
                                rxGenome='')

    #now for twist -- plotted alone with its own hard-coded extension
    plotList = ['BE2C_TWIST']
    twist_extension = 125
    plotName = '%s_BE2C_TWIST' % (plot_prefix)
    print(plotName)
    pipeline_dfci.callBatchPlot(be2c_dataFile,
                                nb_figure_gff_path,
                                plotName,
                                plotFolder,
                                plotList,
                                uniform=False,
                                bed=bed_string,
                                plotType='MULTIPLE',
                                extension=twist_extension,
                                multiPage=False,
                                debug=False,
                                nameString='',
                                rpm=True,
                                rxGenome='')
def plot_shep21_chiprx_genes(shep21_chiprx_dataFile, scale_path,
                             nb_figure_gff_path, bed_string):
    '''
    plots all varieties and iterations of tracks for shep21 chiprx data
    with both spikey normey and without spikey normey

    Emits three plot variants per factor: raw (no rpm, no scaling),
    rpm-normalized (no scaling), and spike-in scaled (no rpm, per-dataset
    multiplicative scale factors read from scale_path).
    '''
    #we want a multiplicative scale factor for the data and to not have rpm on
    #scale table format: header row, then name in column 0, factor in column 2
    scale_table = utils.parseTable(scale_path, '\t')
    scale_dict = {}
    for line in scale_table[1:]:
        scale_dict[line[0]] = line[2]

    #first establish the plot folder (one per normalization variant)
    plotFolder_scaled = utils.formatFolder(
        '%sSHEP21_CHIPRX_SCALED/' % (genePlotFolder), True)
    plotFolder_rpm = utils.formatFolder(
        '%sSHEP21_CHIPRX_RPM_NOSCALE/' % (genePlotFolder), True)
    plotFolder_raw = utils.formatFolder(
        '%sSHEP21_CHIPRX_RAW_NOSCALE/' % (genePlotFolder), True)
    plot_prefix = 'HG19_NB_FIGURE_GENES'

    #we also have to set the extension properly between datasets

    #go by data file
    #for shep21_dataFile
    dataDict = pipeline_dfci.loadDataTable(shep21_chiprx_dataFile)
    #NOTE(review): dict.keys() is indexed below -- python2 only; would need
    #list() under python3
    names_list = dataDict.keys()

    #initial check for consistency of read lengths
    # for name in names_list:
    #     bam = utils.Bam(dataDict[name]['bam'])
    #     read_length = bam.getReadLengths()[0]
    #     bam_extension = 200-read_length
    #     print('For dataset %s in %s using an extension of %s' % (name,shep21_chiprx_dataFile,bam_extension))

    #extension derived from the first bam's read length (assumes all datasets
    #share a read length -- TODO confirm)
    bam = utils.Bam(dataDict[names_list[0]]['bam'])
    read_length = bam.getReadLengths()[0]
    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' %
          (shep21_chiprx_dataFile, bam_extension))

    #for shep21 we want meta of k27ac, pol2, mycn, and twist
    #individual of k27ac, pol2, mycn, and twist

    #first do individuals rpm scaled
    for plot_group in ['MYCN', 'H3K4ME3', 'H3K27AC', 'POL2', 'CTCF']:
        plotList = [
            name for name in dataDict.keys() if name.count(plot_group) > 0
        ]
        #invert the table's factors: callBatchPlot multiplies by these
        scaleList = [
            round(1 / float(scale_dict[name]), 4) for name in plotList
        ]
        scaleList = [str(x) for x in scaleList]
        plot_scale_string = ','.join(scaleList)

        #first raw no scaling
        plotName = '%s_SHEP21_%s_RX_RAW_NOSCALE' % (plot_prefix, plot_group)
        print(plotName)
        pipeline_dfci.callBatchPlot(shep21_chiprx_dataFile,
                                    nb_figure_gff_path,
                                    plotName,
                                    plotFolder_raw,
                                    plotList,
                                    uniform=True,
                                    bed=bed_string,
                                    plotType='MULTIPLE',
                                    extension=bam_extension,
                                    multiPage=False,
                                    debug=False,
                                    nameString='',
                                    rpm=False,
                                    rxGenome='')

        #first rpm no scaling
        plotName = '%s_SHEP21_%s_RX_RPM_NOSCALE' % (plot_prefix, plot_group)
        print(plotName)
        pipeline_dfci.callBatchPlot(shep21_chiprx_dataFile,
                                    nb_figure_gff_path,
                                    plotName,
                                    plotFolder_rpm,
                                    plotList,
                                    uniform=True,
                                    bed=bed_string,
                                    plotType='MULTIPLE',
                                    extension=bam_extension,
                                    multiPage=False,
                                    debug=False,
                                    nameString='',
                                    rpm=True,
                                    rxGenome='')

        #next w/ scaling
        plotName = '%s_SHEP21_%s_RX_SCALED' % (plot_prefix, plot_group)
        print(plotName)
        pipeline_dfci.callBatchPlot(shep21_chiprx_dataFile,
                                    nb_figure_gff_path,
                                    plotName,
                                    plotFolder_scaled,
                                    plotList,
                                    uniform=True,
                                    bed=bed_string,
                                    plotType='MULTIPLE',
                                    extension=bam_extension,
                                    multiPage=False,
                                    debug=False,
                                    nameString='',
                                    rpm=False,
                                    rxGenome='',
                                    scaleFactorString=plot_scale_string)

    #now as metas
    #NOTE(review): this plotList names NOSPIKE datasets and the calls below
    #use the module-global shep21_dataFile, NOT the shep21_chiprx_dataFile
    #parameter -- also the plot names duplicate those in plot_shep21_genes.
    #Looks copy-pasted; confirm whether this meta section belongs here.
    plotList = [
        'SHEP21_0HR_MYCN_NOSPIKE',
        'SHEP21_2HR_MYCN_NOSPIKE',
        'SHEP21_24HR_MYCN_NOSPIKE',
        'SHEP21_0HR_H3K27AC_NOSPIKE',
        'SHEP21_2HR_H3K27AC_NOSPIKE',
        'SHEP21_24HR_H3K27AC_NOSPIKE',
        'SHEP21_0HR_TWIST',
        'SHEP21_2HR_TWIST',
        'SHEP21_24HR_B_TWIST',
        'SHEP21_0HR_POL2_NOSPIKE_R2',
        'SHEP21_2HR_POL2_NOSPIKE',
        'SHEP21_24HR_POL2_NOSPIKE',
    ]
    groupString = 'MYCN,MYCN,MYCN,H3K27AC,H3K27AC,H3K27AC,TWIST,TWIST,TWIST,POL2,POL2,POL2'

    plotName = '%s_SHEP21_NOSPIKE_META_RELATIVE' % (plot_prefix)
    pipeline_dfci.callBatchPlot(shep21_dataFile,
                                nb_figure_gff_path,
                                plotName,
                                plotFolder_rpm,
                                plotList,
                                uniform=False,
                                bed=bed_string,
                                plotType='MERGE',
                                extension=bam_extension,
                                multiPage=False,
                                debug=False,
                                nameString=groupString,
                                rpm=True,
                                rxGenome='')

    plotName = '%s_SHEP21_NOSPIKE_META_UNIFORM' % (plot_prefix)
    pipeline_dfci.callBatchPlot(shep21_dataFile,
                                nb_figure_gff_path,
                                plotName,
                                plotFolder_rpm,
                                plotList,
                                uniform=True,
                                bed=bed_string,
                                plotType='MERGE',
                                extension=bam_extension,
                                multiPage=False,
                                debug=False,
                                nameString=groupString,
                                rpm=True,
                                rxGenome='')
def plot_shep21_genes(nb_figure_gff_path, bed_string):
    '''
    Plots the SHEP21 no-spike-in tracks at the NB figure genes: individual
    plots per factor (MYCN, TWIST, H3K27AC, POL2), then merged time-course
    metas with relative and uniform scaling.

    Relies on the module-global shep21_dataFile for the data table.

    nb_figure_gff_path: gff of the regions to plot
    bed_string: bed track string passed straight through to callBatchPlot
    '''
    #first establish the plot folder
    plotFolder = utils.formatFolder('%sSHEP21_NOSPIKE/' % (genePlotFolder),
                                    True)
    plot_prefix = 'HG19_NB_FIGURE_GENES'

    #we also have to set the extension properly between datasets
    #go by data file (module-global shep21_dataFile)
    dataDict = pipeline_dfci.loadDataTable(shep21_dataFile)
    #list() so indexing below also works on python3 dict views
    names_list = list(dataDict.keys())

    #derive the fragment extension from the read length of the first bam
    #(assumes every dataset in the table shares a read length -- TODO confirm)
    bam = utils.Bam(dataDict[names_list[0]]['bam'])
    read_length = bam.getReadLengths()[0]
    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' %
          (shep21_dataFile, bam_extension))

    #for shep21 we want meta of k27ac, pol2, mycn, and twist
    #individual of k27ac, pol2, mycn, and twist

    #first do individuals
    for plot_group in ['MYCN', 'TWIST', 'H3K27AC', 'POL2']:
        plotList = [name for name in dataDict.keys() if plot_group in name]
        plotName = '%s_SHEP21_%s_NOSPIKE' % (plot_prefix, plot_group)
        print(plotName)
        pipeline_dfci.callBatchPlot(shep21_dataFile,
                                    nb_figure_gff_path,
                                    plotName,
                                    plotFolder,
                                    plotList,
                                    uniform=True,
                                    bed=bed_string,
                                    plotType='MULTIPLE',
                                    extension=bam_extension,
                                    multiPage=False,
                                    debug=False,
                                    nameString='',
                                    rpm=True,
                                    rxGenome='')

    #now as metas: 0/2/24hr time course per factor merged into one track
    plotList = [
        'SHEP21_0HR_MYCN_NOSPIKE',
        'SHEP21_2HR_MYCN_NOSPIKE',
        'SHEP21_24HR_MYCN_NOSPIKE',
        'SHEP21_0HR_H3K27AC_NOSPIKE',
        'SHEP21_2HR_H3K27AC_NOSPIKE',
        'SHEP21_24HR_H3K27AC_NOSPIKE',
        'SHEP21_0HR_TWIST',
        'SHEP21_2HR_TWIST',
        'SHEP21_24HR_B_TWIST',
        'SHEP21_0HR_POL2_NOSPIKE_R2',
        'SHEP21_2HR_POL2_NOSPIKE',
        'SHEP21_24HR_POL2_NOSPIKE',
    ]
    groupString = 'MYCN,MYCN,MYCN,H3K27AC,H3K27AC,H3K27AC,TWIST,TWIST,TWIST,POL2,POL2,POL2'

    plotName = '%s_SHEP21_NOSPIKE_META_RELATIVE' % (plot_prefix)
    pipeline_dfci.callBatchPlot(shep21_dataFile,
                                nb_figure_gff_path,
                                plotName,
                                plotFolder,
                                plotList,
                                uniform=False,
                                bed=bed_string,
                                plotType='MERGE',
                                extension=bam_extension,
                                multiPage=False,
                                debug=False,
                                nameString=groupString,
                                rpm=True,
                                rxGenome='')

    plotName = '%s_SHEP21_NOSPIKE_META_UNIFORM' % (plot_prefix)
    pipeline_dfci.callBatchPlot(shep21_dataFile,
                                nb_figure_gff_path,
                                plotName,
                                plotFolder,
                                plotList,
                                uniform=True,
                                bed=bed_string,
                                plotType='MERGE',
                                extension=bam_extension,
                                multiPage=False,
                                debug=False,
                                nameString=groupString,
                                rpm=True,
                                rxGenome='')
def main():
    '''
    Driver for the IRF2 keratinocyte ChIP analysis.

    Sections I-X below are mostly run-once steps; many are left commented out
    after being run. Only section I (data table sanity check) and section X
    (figure-region plotting) are live here. Relies on module-level globals:
    projectName, projectFolder, chip_data_file, macsEnrichedFolder, gffFolder,
    bedFolder, genome, py27_path, pipeline_dir, blacklist_path.
    '''
    print('main analysis for project %s' % (projectName))
    print('changing directory to project folder')
    os.chdir(projectFolder)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#======================I. LOADING DATA ANNOTATION======================'
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible

    #for data file
    pipeline_dfci.summary(chip_data_file)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#===========================II. CALLING MACS==========================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #pipeline_dfci.run_macs(chip_data_file,projectFolder,macsFolder,macsEnrichedFolder,wiggleFolder,useBackground=True)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#=======================III. MERGING IRF2 REGIONS======================'
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #create a set of regions representing the intersect of peaks
    #filter out anything that overlaps a peak in the HA ctl
    def merge_regions():
        '''
        merges ha peaks to identify all overlapping peaks
        filters out anything overlapping the HA controls

        Writes the filtered merged regions as a gff into gffFolder.
        '''
        #dox-treated HA ChIP peak sets
        hk_dox_ha_1 = utils.importBoundRegion(
            '%sHK_DOX_HA_1_peaks.bed' % (macsEnrichedFolder), 'HK_DOX_HA_1')
        hk_dox_ha_2 = utils.importBoundRegion(
            '%sHK_DOX_HA_2_peaks.bed' % (macsEnrichedFolder), 'HK_DOX_HA_2')
        hk_dox_loci = hk_dox_ha_1.getLoci() + hk_dox_ha_2.getLoci()

        #control datasets
        hk_ctl_ha_1 = utils.importBoundRegion(
            '%sHK_CTL_HA_1_peaks.bed' % (macsEnrichedFolder), 'HK_CTL_HA_1')
        hk_ctl_ha_2 = utils.importBoundRegion(
            '%sHK_CTL_HA_2_peaks.bed' % (macsEnrichedFolder), 'HK_CTL_HA_2')
        hk_ctl_loci = hk_ctl_ha_1.getLoci() + hk_ctl_ha_2.getLoci()
        hk_ctl_lc = utils.LocusCollection(hk_ctl_loci)

        print(len(hk_dox_loci))
        #stitch overlapping dox peaks into merged regions
        stitched_lc = utils.LocusCollection(hk_dox_loci).stitchCollection()
        print(len(stitched_lc))
        filtered_loci = []
        #keep only regions supported by BOTH dox replicates and absent from
        #the controls
        for locus in stitched_lc.getLoci():
            if len(hk_dox_ha_1.getOverlap(locus)) > 0 and len(
                    hk_dox_ha_2.getOverlap(locus)) > 0:
                if len(hk_ctl_lc.getOverlap(locus)) == 0:
                    filtered_loci.append(locus)

        print(len(filtered_loci))

        filtered_lc = utils.LocusCollection(filtered_loci)
        gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_0_+0.gff' % (
            gffFolder)
        filtered_gff = utils.locusCollectionToGFF(filtered_lc)

        utils.unParseTable(filtered_gff, gff_path, '\t')

    #merge_regions()

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#======================IV. IDENTIFY ATAC OVERLAP REGIONS==============='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    # atac_bed_path = '%sHG19_combined_atac_-0_+0.bed' % (bedFolder)# all combined atac regions
    # atac_collection = utils.importBoundRegion(atac_bed_path,'HG19_combined_atac')
    # print(len(atac_collection))

    # #now filter the irf2 gff
    # irf2_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_0_+0.gff' % (gffFolder)
    # irf2_collection = utils.gffToLocusCollection(irf2_gff_path)
    # irf2_loci = irf2_collection.getLoci()

    # irf2_atac_loci = [locus for locus in irf2_loci if atac_collection.getOverlap(locus)]
    # print(len(irf2_atac_loci))
    # irf2_atac_collection=utils.LocusCollection(irf2_atac_loci)
    # irf2_atac_gff = utils.locusCollectionToGFF(irf2_atac_collection)
    # irf2_atac_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_ATAC_0_+0.gff' % (gffFolder)
    # utils.unParseTable(irf2_atac_gff,irf2_atac_gff_path,'\t')

    # # overlap with TSS
    # tss_gff_path = '%sHG19_TSS_ALL_-1000_+1000.gff' % (gffFolder)
    # tss_gff = utils.parseTable(tss_gff_path,'\t')
    # tss_collection = utils.gffToLocusCollection(tss_gff)
    # print('tss overlap w/ IRF2 atac peaks')
    # print(len([locus for locus in irf2_atac_loci if tss_collection.getOverlap(locus)]))
    # print(len(irf2_atac_loci))

    # #overlap w/ k27ac
    # k27ac_gff_path = '%sHG19_keratinocyte_combined_all_-0_+0.gff' % (gffFolder)
    # k27ac_gff = utils.parseTable(k27ac_gff_path,'\t')
    # k27ac_collection = utils.gffToLocusCollection(k27ac_gff)
    # print('k27ac overlap w/ IRF2 atac peaks')
    # print(len([locus for locus in irf2_atac_loci if k27ac_collection.getOverlap(locus)]))
    # print(len(irf2_atac_loci))

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#========================V. CALLING ROSE2 META========================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    def wrapRose2Meta(data_file,
                      input_path,
                      parent_folder,
                      active_gene_path='',
                      rank_list=[],
                      control_list=[],
                      analysis_name=''):
        '''
        quick wrapper for Rose2Meta

        Writes (does not execute) a shell script containing the ROSE2_META
        command and a follow-up ROSE2_geneMapper command. Uses the module
        globals py27_path, pipeline_dir, genome and blacklist_path.
        NOTE(review): rank_list/control_list are mutable defaults -- safe
        here because they are only read, but fragile.
        '''
        dataDict = pipeline_dfci.loadDataTable(data_file)
        rank_string = ','.join([dataDict[name]['bam'] for name in rank_list])
        control_string = ','.join(
            [dataDict[name]['bam'] for name in control_list])

        output_folder = utils.formatFolder(
            '%s%s' % (parent_folder, analysis_name), True)
        #-s 0 -t 0 disables stitching/tss exclusion: quantifies background
        #subtracted AUC at the input regions as-is
        rose2_meta_cmd = '%s %sROSE2_META.py -g %s -i %s -r %s -c %s -n %s -o %s -s 0 -t 0 --mask %s' % (
            py27_path, pipeline_dir, genome, input_path, rank_string,
            control_string, analysis_name, output_folder, blacklist_path)

        all_enhancer_path = '%s%s_AllEnhancers.table.txt' % (output_folder,
                                                             analysis_name)

        #gene mapper only gets the active-gene list if one was supplied
        if active_gene_path != '':
            rose2_map_cmd = '%s %sROSE2_geneMapper.py -g %s -i %s -l %s' % (
                py27_path, pipeline_dir, genome, all_enhancer_path,
                active_gene_path)
        else:
            rose2_map_cmd = '%s %sROSE2_geneMapper.py -g %s -i %s' % (
                py27_path, pipeline_dir, genome, all_enhancer_path)

        rose_bash_path = '%s%s_rose2_meta.sh' % (parent_folder, analysis_name)
        rose_bash = open(rose_bash_path, 'w')
        #NOTE(review): python shebang in a .sh script -- presumably harmless
        #since the script is run via bash elsewhere, but looks wrong; confirm
        rose_bash.write('#!/usr/bin/python\n\n')
        rose_bash.write('#setting up bamliquidator\n')
        rose_bash.write('\n\n#ROSE2_CMD\n')
        rose_bash.write(rose2_meta_cmd + '\n')
        rose_bash.write(rose2_map_cmd + '\n')
        rose_bash.close()
        print('Wrote ROSE2 META CMD to %s' % (rose_bash_path))

    #use ROSE2 w/ -t 0 and -s 0 to quantify background subtracted AUC at all peaks
    # parent_folder = utils.formatFolder('%smeta_rose/' % (projectFolder),True)
    # blacklist_path = '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed'
    # active_gene_path = '%sgeneListFolder/HG19_KERATINOCYTE_ACTIVE.txt' % (projectFolder)

    # #creating bam lists
    # rank_list = ['HK_DOX_HA_1','HK_DOX_HA_2']
    # control_list =['HK_DOX_WCE_1','HK_DOX_WCE_2']

    # #for all IRF2 HA
    # irf2_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_0_+0.gff' % (gffFolder)
    # analysis_name = 'IRF2_HA'
    # wrapRose2Meta(chip_data_file,irf2_gff_path,parent_folder,active_gene_path,rank_list,control_list,analysis_name)

    # irf2_atac_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_ATAC_0_+0.gff' % (gffFolder)
    # analysis_name = 'IRF2_HA_ATAC'
    # wrapRose2Meta(chip_data_file,irf2_atac_gff_path,parent_folder,active_gene_path,rank_list,control_list,analysis_name)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#================VI. OVERLAPPING IRF2 W/ MOTIF PREDICTIONS============='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    # #load up peaks
    # #irf2_atac_peaks
    # irf2_atac_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_ATAC_0_+0.gff' % (gffFolder)
    # irf2_atac_gff = utils.parseTable(irf2_atac_gff_path,'\t')
    # irf2_atac_loci = utils.gffToLocusCollection(irf2_atac_gff).getLoci()
    # irf2_atac_collection = utils.LocusCollection(irf2_atac_loci)
    # print(len(irf2_atac_loci))

    # irf2_edge_path = '%scrc_atac/keratinocyte_combined_all/keratinocyte_combined_all_EDGE_TABLE_signal_filtered_IRF2_EDGES.txt' % (projectFolder)
    # irf2_edge_table = utils.parseTable(irf2_edge_path,'\t')
    # print(len(irf2_edge_table))
    # irf2_confirmed_edges = []
    # irf2_edge_loci = []
    # for line in irf2_edge_table[1:]:
    #     chrom = line[1].split('(')[0]
    #     coords = [int(x) for x in line[1].split(':')[-1].split('-')]
    #     locus = utils.Locus(chrom,coords[0]-00,coords[1]+00,'.',line[0])
    #     if len(irf2_atac_collection.getOverlap(locus)) > 0:
    #         irf2_confirmed_edges.append(line)
    #         irf2_edge_loci.append(locus)
    # print(len(irf2_confirmed_edges))
    # irf2_confirmed_edge_path = '%scrc_atac/keratinocyte_combined_all/keratinocyte_combined_all_EDGE_TABLE_signal_filtered_IRF2_EDGES_CONFIRMED.txt' % (projectFolder)
    # utils.unParseTable(irf2_confirmed_edges,irf2_confirmed_edge_path,'\t')

    # irf2_edge_collection = utils.LocusCollection(irf2_edge_loci)
    # print(len(irf2_edge_collection))
    # overlap_count = 0
    # for locus in irf2_atac_loci:
    #     search_locus = utils.makeSearchLocus(locus,0,0)
    #     if len(irf2_edge_collection.getOverlap(search_locus)) >0:
    #         overlap_count+=1
    # print(overlap_count)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#=================VII. RUNNING ENHANCER PROMOTER ON IRF2==============='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    def wrap_enhancer_promoter(dataFile,
                               input_path,
                               activity_path,
                               analysis_name,
                               names_list=[],
                               useBackground=True):
        '''
        runs enhancer promoter on everybody with the conserved regions
        and union of active genes

        Writes (does not execute) an enhancerPromoter.py command into a bash
        script under <projectFolder>/enhancerPromoter/ and returns the script
        path. NOTE(review): names_list is a mutable default -- only read or
        rebound locally, so safe here.
        '''
        #hard coded paths
        tads_path = '%shESC_domains_hg19.bed' % (bedFolder)

        #setting the output folder
        ep_folder = utils.formatFolder('%senhancerPromoter/' % (projectFolder),
                                       True)

        dataDict = pipeline_dfci.loadDataTable(dataFile)
        #default to every dataset in the table, sorted for stable output
        if len(names_list) == 0:
            names_list = [name for name in dataDict.keys()]
            names_list.sort()

        bams_list = [dataDict[name]['bam'] for name in names_list]
        bams_string = ' '.join(bams_list)

        background_names = [
            dataDict[name]['background'] for name in names_list
        ]
        background_list = [
            dataDict[background_name]['bam']
            for background_name in background_names
        ]
        background_string = ' '.join(background_list)

        ep_bash_path = '%s%s_enhancer_promoter.sh' % (ep_folder, analysis_name)
        ep_bash = open(ep_bash_path, 'w')

        ep_bash.write('#!/usr/bin/bash\n\n\n')

        ep_bash.write('#enhancer promoter analysis for %s\n\n' %
                      (analysis_name))

        #-c (background/control bams) is only included when requested
        if useBackground:
            python_cmd = 'python %senhancerPromoter.py -b %s -c %s -g %s -i %s -o %s -a %s --name %s --tads %s --top 2000\n\n' % (
                pipeline_dir, bams_string, background_string, genome.upper(),
                input_path, ep_folder, activity_path, analysis_name, tads_path)
            ep_bash.write(python_cmd)
        else:
            python_cmd = 'python %senhancerPromoter.py -b %s -g %s -i %s -o %s -a %s --name %s --tads %s --top 2000\n\n' % (
                pipeline_dir, bams_string, genome.upper(), input_path,
                ep_folder, activity_path, analysis_name, tads_path)
            ep_bash.write(python_cmd)

        ep_bash.close()

        return (ep_bash_path)

    # # blacklist_path = '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed'
    # active_gene_path = '%sgeneListFolder/HG19_KERATINOCYTE_ACTIVE.txt' % (projectFolder)
    # irf2_atac_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_ATAC_0_+0.gff' % (gffFolder)
    # analysis_name = 'IRF2_HA_ATAC'
    # bam_list = ['HK_DOX_HA_1','HK_DOX_HA_2']
    # wrap_enhancer_promoter(chip_data_file,irf2_atac_gff_path,active_gene_path,analysis_name,bam_list,useBackground=True)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#==============VIII. FORMATTING THE HORRIFYING EXPRESSION TABLE========'
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    # exp_path = '%sirf2_kd_rna_seq/single_counts_filtered_counts.txt' % (projectFolder)
    # sample_key_path = '%sirf2_kd_rna_seq/sample_key.txt' % (projectFolder)
    # sample_table = utils.parseTable(sample_key_path,'\t')
    # sample_list = [line[0] for line in sample_table[1:]]
    # print(sample_list)
    # exp_table = utils.parseTable(exp_path,'\t')

    # #for each gene make a dictionary
    # exp_dict = {}
    # #first fill out the dictionary by gene name
    # for line in exp_table[1:]:
    #     gene_name = line[3].replace('"','')
    #     exp_dict[gene_name] = {}
    # print(len(exp_dict.keys()))

    # for line in exp_table[1:]:
    #     gene_name = line[3].replace('"','')
    #     sample_ID = line[4].replace('"','')
    #     counts = line[2]
    #     exp_dict[gene_name][sample_ID] = counts

    # #make the formatted expression table
    # header = ['GENE_NAME'] + sample_list
    # exp_table_formatted = [header]
    # gene_list = exp_dict.keys()
    # gene_list.sort()
    # for gene in gene_list:
    #     exp_line = [gene] + [exp_dict[gene][sample_ID] for sample_ID in sample_list]
    #     exp_table_formatted.append(exp_line)

    # exp_table_formatted_path = '%sirf2_kd_rna_seq/irf2_expression_formatted.txt' % (projectFolder)
    # utils.unParseTable(exp_table_formatted,exp_table_formatted_path,'\t')

    # #with the exp dict we can make a nicer version of the gene table
    # gene_table_path = '%senhancerPromoter/IRF2_HA_ATAC/IRF2_HA_ATAC_GENE_TABLE.txt' % (projectFolder)
    # gene_table_formatted_path = '%senhancerPromoter/IRF2_HA_ATAC/IRF2_HA_ATAC_GENE_TABLE_FORMATTED.txt' % (projectFolder)
    # gene_table = utils.parseTable(gene_table_path,'\t')
    # gene_table_formatted = [gene_table[0] + ['IRF2_TOTAL_SIGNAL'] + header+ ['OLD_IRF2_KD_MEAN','OLD_CTL_MEAN','OLD_IRF2_VS_CTL','YOUNG_IRF2_KD_MEAN','YOUNG_CTL_MEAN','YOUNG_IRF2_VS_CTL']]
    # for line in gene_table[1:]:
    #     if float(line[1]) == 0.0 and float(line[2]) == 0.0:
    #         continue
    #     if exp_dict.has_key(line[0]) == False:
    #         continue
    #     gene = line[0]
    #     #where conditions are met
    #     old_kd_mean = numpy.mean([float(exp_dict[gene][x]) for x in ['IRF2_KD_OLD_1', 'IRF2_KD_OLD_2', 'IRF2_KD_OLD_3']])
    #     old_ctl_mean = numpy.mean([float(exp_dict[gene][x]) for x in ['CT_CRISPR_OLD_1', 'CT_CRISPR_OLD_2', 'CT_CRISPR_OLD_3']])
    #     old_fold = numpy.log2(old_kd_mean/old_ctl_mean)
    #     young_kd_mean = numpy.mean([float(exp_dict[gene][x]) for x in ['IRF2_KD_YOUNG_1', 'IRF2_KD_YOUNG_2', 'IRF2_KD_YOUNG_3']])
    #     young_ctl_mean = numpy.mean([float(exp_dict[gene][x]) for x in ['CT_CRISPR_YOUNG_1', 'CT_CRISPR_YOUNG_2', 'CT_CRISPR_YOUNG_3']])
    #     young_fold = numpy.log2(young_kd_mean/young_ctl_mean)
    #     exp_line = [gene] + [exp_dict[gene][sample_ID] for sample_ID in sample_list] + [round(x,4) for x in [old_kd_mean,old_ctl_mean,old_fold,young_kd_mean,young_ctl_mean,young_fold]]
    #     gene_table_formatted.append(line+[sum([float(x) for x in line[1:3]])] + exp_line)

    # utils.unParseTable(gene_table_formatted,gene_table_formatted_path,'\t')

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#=================IX. ANNOTATING IRF2 KD CLUSTERGRAM==================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #this little bit of python code is on the dropbox... need to move over

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#=======================X. PLOTTING FIGURE REGIONS ===================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #live section: plot the figure 2 gene regions for the two HA datasets
    figure_gff_path = '%sHG19_KERATINOCYTE_FIGURE_2_GENES.gff' % (gffFolder)
    plotName = 'IRF2_FIGURE_2_GENES'
    outputFolder = utils.formatFolder('%sgene_plot/IRF2/' % (projectFolder),
                                      True)
    pipeline_dfci.callBatchPlot(chip_data_file,
                                figure_gff_path,
                                plotName,
                                outputFolder,
                                namesList=['HK_DOX_HA_1', 'HK_DOX_HA_2'],
                                uniform=True,
                                bed='',
                                plotType='MULTIPLE',
                                extension=200,
                                multiPage=False,
                                debug=False,
                                nameString='',
                                rpm=True,
                                rxGenome='',
                                scaleFactorString='')
def plot_mouse_genes(mouse_dataFile, mouse_figure_gff_path):
    '''
    Plots THMYCN mouse tracks at the lifted-over (mm9) NB figure gene regions:
    individual MYCN and H3K27AC plots, then merged metas with relative and
    uniform scaling.

    mouse_dataFile: path to a data table loadable by pipeline_dfci.loadDataTable
    mouse_figure_gff_path: gff of the lifted-over regions to plot
    '''
    #first establish the plot folder
    plotFolder = utils.formatFolder('%sTHMYCN/' % (genePlotFolder), True)
    plot_prefix = 'MM9_NB_FIGURE_GENES_LIFTOVER'

    #we also have to set the extension properly between datasets
    #go by data file
    dataDict = pipeline_dfci.loadDataTable(mouse_dataFile)
    #list() so indexing below also works on python3 dict views
    names_list = list(dataDict.keys())

    #derive the fragment extension from the read length of the first bam
    #(assumes every dataset in the table shares a read length -- TODO confirm)
    bam = utils.Bam(dataDict[names_list[0]]['bam'])
    read_length = bam.getReadLengths()[0]
    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' %
          (mouse_dataFile, bam_extension))

    #first do individuals
    #leading underscore on '_MYCN' avoids matching e.g. THMYCN in line names;
    #match is done case-insensitively via upper()
    for plot_group in ['_MYCN', 'H3K27AC']:
        plotList = [
            name for name in dataDict.keys() if plot_group in name.upper()
        ]
        print(plotList)
        if plot_group == '_MYCN':
            plotName = '%s_THMYCN%s' % (plot_prefix, plot_group)
        else:
            plotName = '%s_THMYCN_%s' % (plot_prefix, plot_group)
        print(plotName)
        pipeline_dfci.callBatchPlot(mouse_dataFile,
                                    mouse_figure_gff_path,
                                    plotName,
                                    plotFolder,
                                    plotList,
                                    uniform=True,
                                    bed='',
                                    plotType='MULTIPLE',
                                    extension=bam_extension,
                                    multiPage=False,
                                    debug=False,
                                    nameString='',
                                    rpm=True,
                                    rxGenome='')

    #now as metas
    #we only have 3 good k27ac and 3 good mycn datasets
    plotList = [
        'CG_H3K27Ac',
        'SCG_H3K27Ac',
        'THMYCN1_H3K27Ac',
        'THMYCN_139423_H3K27Ac',
        'THMYCN_139076_H3K27Ac',
        'THMYCN2_MYCN',
        'THMYCN_139076_MYCN',
        'THMYCN_139423_MYCN',
    ]
    #NOTE(review): first two group labels are 'CG_' and 'SCG' rather than
    #'H3K27AC' -- keeps those tissues as separate meta groups; confirm intent
    groupString = 'CG_,SCG,H3K27AC,H3K27AC,H3K27AC,MYCN,MYCN,MYCN'

    plotName = '%s_THMYCN_META_RELATIVE' % (plot_prefix)
    pipeline_dfci.callBatchPlot(mouse_dataFile,
                                mouse_figure_gff_path,
                                plotName,
                                plotFolder,
                                plotList,
                                uniform=False,
                                bed='',
                                plotType='MERGE',
                                extension=bam_extension,
                                multiPage=False,
                                debug=False,
                                nameString=groupString,
                                rpm=True,
                                rxGenome='')

    plotName = '%s_THMYCN_META_UNIFORM' % (plot_prefix)
    pipeline_dfci.callBatchPlot(mouse_dataFile,
                                mouse_figure_gff_path,
                                plotName,
                                plotFolder,
                                plotList,
                                uniform=True,
                                bed='',
                                plotType='MERGE',
                                extension=bam_extension,
                                multiPage=False,
                                debug=False,
                                nameString=groupString,
                                rpm=True,
                                rxGenome='')