def calculatePromoterActivity(annotationFile, bamFile, projectName, projectFolder, refseqToNameDict, background=False):
    '''
    calculates the level of acetylation at each TF promoter

    Builds one TSS locus per gene of the annotation (utils.makeTSSLocus
    called with 2500/2500 windows), writes the loci to
    <projectFolder><projectName>_TSS.gff and runs bamliquidator_batch on
    that GFF for bamFile, with the output directed to
    <projectFolder>bamliquidator.

    annotationFile: path of the annotation handed to utils.makeStartDict
    bamFile: path of the ChIP bam that gets mapped
    projectName: prefix of the TSS GFF file name
    projectFolder: output folder; it is concatenated as-is, so it has to
        end with a path separator
    refseqToNameDict: not used by this function
    background: optional background bam path; it is only wrapped in
        utils.Bam and is not mapped here

    Returns None. The results are the GFF written through
    utils.unParseTable and whatever bamliquidator_batch writes.
    '''
    print('GENERATING AN ACTIVITY TABLE USING CHIP DATA')

    # NOTE(review): the results of the next three calls are never read in
    # this function. The calls are kept because utils is not visible from
    # here and they may validate their inputs -- confirm and drop if not.
    annotTable = utils.parseTable(annotationFile, '\t')
    bam = utils.Bam(bamFile)
    if background:
        background = utils.Bam(background)

    # one locus per annotated gene, collected and dumped as a GFF
    startDict = utils.makeStartDict(annotationFile)
    tssLoci = [utils.makeTSSLocus(gene, startDict, 2500, 2500) for gene in startDict]
    tssCollection = utils.LocusCollection(tssLoci, 50)

    gff = utils.locusCollectionToGFF(tssCollection)
    outputname = projectFolder + projectName + '_TSS.gff'
    utils.unParseTable(gff, outputname, '\t')

    # map the bam onto the TSS regions; the command goes through the shell,
    # so the paths must not contain spaces or shell metacharacters
    mappingCmd = 'bamliquidator_batch'
    mappingCmd += ' -r ' + outputname
    mappingCmd += ' -o ' + projectFolder + 'bamliquidator'
    mappingCmd += ' -m -e 200 '
    mappingCmd += bamFile

    subprocess.call(mappingCmd, shell=True)
    print(mappingCmd)
def calculatePromoterActivity(annotationFile, bamFile, projectName, projectFolder, refseqToNameDict):
    '''
    calculates the level of H3K27ac at each promoter from a H3K27ac bam file

    Builds one TSS locus per gene of the annotation (utils.makeTSSLocus
    called with 1000/1000 windows), writes the loci to
    <projectFolder><projectName>_TSS.gff and runs ./bamToGFF.py on that
    GFF for bamFile, with the output written to <projectFolder>matrix.gff.

    annotationFile: path of the annotation handed to utils.makeStartDict
    bamFile: path of the H3K27ac bam that gets mapped
    projectName: prefix of the TSS GFF file name
    projectFolder: output folder; it is concatenated as-is, so it has to
        end with a path separator
    refseqToNameDict: not used by this function

    Returns None. The results are the files written on disk.
    '''
    print('IDENTIFY EXPRESSED GENES')

    # NOTE(review): the results of the next two calls are never read in
    # this function. The calls are kept because utils is not visible from
    # here and they may validate their inputs -- confirm and drop if not.
    annotTable = utils.parseTable(annotationFile, '\t')
    bam = utils.Bam(bamFile)

    # one locus per annotated gene, collected and dumped as a GFF
    startDict = utils.makeStartDict(annotationFile)
    tssLoci = [utils.makeTSSLocus(gene, startDict, 1000, 1000) for gene in startDict]
    tssCollection = utils.LocusCollection(tssLoci, 50)

    gff = utils.locusCollectionToGFF(tssCollection)
    outputname = projectFolder + projectName + '_TSS.gff'
    utils.unParseTable(gff, outputname, '\t')

    # run bamToGFF.py to quantify signal at each TSS +/- 1kb
    # (relative script path: the current directory must contain bamToGFF.py)
    mappingCmd = 'python ./bamToGFF.py'
    mappingCmd += ' -r '
    mappingCmd += ' -d '
    mappingCmd += ' -o ' + projectFolder + 'matrix.gff'
    mappingCmd += ' -m 1 -f 0 -e 200 '
    mappingCmd += ' -i ' + outputname
    mappingCmd += ' -b ' + bamFile

    call(mappingCmd, shell=True)
    print(mappingCmd)
def make_probe_to_gene_dict(annotFile, array_1_path, array_2_path):
    '''
    keyed by probe ID w/ gene as value

    The dictionary is cached as a pickle at
    <projectFolder>oberthuer_outcome/probe_dict.pkl (projectFolder is a
    module-level global). When utils.checkOutput reports that the pickle
    exists it is loaded and returned without reading the array tables.

    annotFile: path of the annotation handed to utils.makeStartDict
    array_1_path, array_2_path: tab-delimited array annotation tables;
        rows with fewer than 5 columns are skipped, column 5 (index 4) is
        read as the probe ID and the last column as the gene name

    Returns a dict mapping probe ID -> gene name, restricted to gene names
    that occur as a 'name' entry of the start dict.
    '''
    #see if it already exists
    pickle_path = '%soberthuer_outcome/probe_dict.pkl' % (projectFolder)
    if utils.checkOutput(pickle_path, 0, 0):
        print('loading previously made probe dict at %s' % (pickle_path))
        # NOTE(review): unpickling runs whatever the file encodes; this is
        # only safe because the cache is written by this same function.
        with open(pickle_path, 'rb') as pickle_file:
            return pickle.load(pickle_file)

    #we want to intersect refseq common names w/ the array
    startDict = utils.makeStartDict(annotFile)
    # a set gives constant-time membership tests for the per-probe lookup
    # (assumes the names are hashable strings -- TODO confirm)
    ref_names = set(startDict[refID]['name'] for refID in startDict)

    probe_gene_dict = {}
    array_1 = utils.parseTable(array_1_path, '\t')
    array_2 = utils.parseTable(array_2_path, '\t')

    for line in array_1 + array_2:
        if len(line) < 5:
            continue
        probe_id = line[4]
        name = line[-1]
        if name in ref_names:
            probe_gene_dict[probe_id] = name

    # close the handle explicitly so the cache is fully flushed to disk
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(probe_gene_dict, pickle_file)
    return probe_gene_dict
def findCanidateTFs(annotationFile, enhancerLoci, expressedNM, expressionDictNM, bamFile, TFlist, refseqToNameDict, projectFolder, projectName, promoter):
    '''
    Assign each Super-Enhancer to the closest active TSS to its center
    Return a dictionary keyed by TF that points to a list of loci

    Assignment rules, applied per enhancer in this order:
      1. every expressed gene whose TSS locus overlaps the enhancer
      2. otherwise the gene with the highest expressionDictNM value among
         those found with a 100000/100000 search locus
      3. otherwise the gene whose start is nearest to the enhancer center
         among those found with a 1000000/1000000 search locus
    Only genes present in TFlist are recorded.

    annotationFile: path of the annotation handed to utils.makeStartDict
    enhancerLoci: iterable of enhancer loci (must offer chr/start/end/ID)
    expressedNM: gene IDs used to build the TSS collection
    expressionDictNM: gene ID -> activity value
    bamFile, refseqToNameDict: not used by this function
    TFlist: container of gene IDs that are eligible for assignment
    projectFolder, projectName: concatenated to form the path of the
        <projectName>_ENHANCER_ASSIGNMENT.txt table
    promoter: when truthy, a locus spanning start -/+ 2000 is appended for
        each assigned TF unless one of its enhancers already overlaps it

    Returns the defaultdict(list) of gene ID -> list of loci and writes the
    assignment table through utils.unParseTable.
    '''
    print('FINDING CANIDATE TFs')

    enhancerAssignment = []
    TFtoEnhancerDict = defaultdict(list)

    startDict = utils.makeStartDict(annotationFile)
    tssLoci = [utils.makeTSSLocus(gene, startDict, 1000, 1000) for gene in expressedNM]
    tssCollection = utils.LocusCollection(tssLoci, 50)

    # Loop through enhancers
    for enhancer in enhancerLoci:

        # If the enhancer overlaps a TSS, save it
        overlappingLoci = tssCollection.getOverlap(enhancer, 'both')
        overlappingGenes = utils.uniquify([locus.ID() for locus in overlappingLoci])

        # Find all gene TSS within 100 kb
        proximalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer, 100000, 100000), 'both')
        proximalGenes = utils.uniquify([locus.ID() for locus in proximalLoci])

        if overlappingGenes:
            # If a TSS overlaps an enhancer, assign them together
            assignedGenes = [gene for gene in overlappingGenes if gene in TFlist]

        elif proximalGenes:
            # Otherwise, assign the enhancer to the most active gene in 100 kb
            highestGene = ''
            highestActivity = 0
            for gene in proximalGenes:
                if expressionDictNM[gene] > highestActivity:
                    highestActivity = expressionDictNM[gene]
                    highestGene = gene
            # record the enhancer under highestGene itself; the loop
            # variable is just the last proximal gene that was inspected
            assignedGenes = [highestGene] if highestGene in TFlist else []

        else:
            # If no genes are within 100 kb, find the closest active gene
            distalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer, 1000000, 1000000), 'both')
            distalGenes = [locus.ID() for locus in distalLoci]

            # floor division keeps the center an integer on every interpreter
            enhancerCenter = (int(enhancer.start()) + int(enhancer.end())) // 2
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in distalGenes]

            closestGene = ''
            if distList:
                closestGene = distalGenes[distList.index(min(distList))]
            assignedGenes = [closestGene] if closestGene and closestGene in TFlist else []

        for gene in assignedGenes:
            TFtoEnhancerDict[gene].append(enhancer)
            enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()])

    # Add promoter if it's not contained in the super
    if promoter:
        # snapshot the keys: the loop appends to the dict's values
        for gene in list(TFtoEnhancerDict.keys()):
            tssStart = int(startDict[gene]['start'][0])
            promoterLocus = utils.Locus(startDict[gene]['chr'], tssStart - 2000, tssStart + 2000, startDict[gene]['sense'])
            if not any(promoterLocus.overlaps(enhancer) for enhancer in TFtoEnhancerDict[gene]):
                TFtoEnhancerDict[gene].append(promoterLocus)

    seAssignmentFile = projectFolder + projectName + '_ENHANCER_ASSIGNMENT.txt'
    utils.unParseTable(enhancerAssignment, seAssignmentFile, '\t')

    return TFtoEnhancerDict
def make_shep_on_mycn_landscape(shep_on_dataFile):
    '''
    finds mycn peaks in shep21 that are conserved in nb and segregates them
    into promoter or enhancer

    Reads the SHEP MYCN peak bed, keeps the peaks that overlap the NB
    conserved MYCN collection and labels each kept peak as promoter (it
    overlaps an active TSS locus; takes precedence), enhancer (it overlaps
    the SHEP enhancer collection) or other. Nine GFFs are written: all,
    promoter and enhancer peaks, each as the peak itself, as peak center
    -/+ 5000 and as the peak flanked by 1000 on both sides.

    Relies on the module-level globals macsEnrichedFolder, gffFolder,
    projectFolder, geneListFolder and annotFile.

    shep_on_dataFile: path of a data table handed to
        pipeline_dfci.loadDataTable

    Returns None. The results are the GFF files and the printed counts.
    '''
    # NOTE(review): the loaded table is never read below. The call is kept
    # because pipeline_dfci is not visible from here -- confirm and drop.
    dataDict = pipeline_dfci.loadDataTable(shep_on_dataFile)

    print('LOADING SHEP ON MYCN SITES')
    #load all of the shep_on sites
    shep_on_bed_path = '%sSHEP_6HR_MYCN_peaks.bed' % (macsEnrichedFolder)
    shep_on_bed = utils.parseTable(shep_on_bed_path, '\t')
    shep_on_gff = utils.bedToGFF(shep_on_bed)

    #now get the conserved NB MYCN regions
    nb_conserved_mycn_gff_file = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    nb_conserved_mycn_collection = utils.gffToLocusCollection(nb_conserved_mycn_gff_file)

    print('LOADING SHEP ACTIVE ENHANCERS')
    #make a collection of enhancers
    shep_enhancer_file = '%smeta_rose/SHEP_ON_H3K27AC/SHEP_ON_H3K27AC_AllEnhancers.table.txt' % (projectFolder)
    shep_enhancer_collection = utils.makeSECollection(shep_enhancer_file, 'SHEP_H3K27AC')

    #now get the active promoters
    print('LOADING SHEP ACTIVE PROMOTERS')
    startDict = utils.makeStartDict(annotFile)
    shep_transcribed_file = '%sHG19_SHEP_ON_H3K27AC_ACTIVE.txt' % (geneListFolder)
    shep_transcribed_table = utils.parseTable(shep_transcribed_file, '\t')
    transcribedList = [line[1] for line in shep_transcribed_table]
    tssLoci = [utils.makeTSSLocus(refID, startDict, 1000, 1000) for refID in transcribedList]
    shep_tss_collection = utils.LocusCollection(tssLoci, 50)

    #now initialize the 9 gffs we will need
    shep_mycn_gff = []
    shep_mycn_gff_5kb = []
    shep_mycn_gff_1kb = []

    shep_mycn_promoter_gff = []
    shep_mycn_promoter_gff_1kb = []
    shep_mycn_promoter_gff_5kb = []

    shep_mycn_enhancer_gff = []
    shep_mycn_enhancer_gff_1kb = []
    shep_mycn_enhancer_gff_5kb = []

    #and their respective file names
    shep_mycn_gff_file = '%sHG19_SHEP_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    shep_mycn_gff_5kb_file = '%sHG19_SHEP_MYCN_CONSERVED_-5kb_+5kb.gff' % (gffFolder)
    shep_mycn_gff_1kb_file = '%sHG19_SHEP_MYCN_CONSERVED_-1kb_+1kb.gff' % (gffFolder)

    shep_mycn_promoter_gff_file = '%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-0_+0.gff' % (gffFolder)
    shep_mycn_promoter_gff_5kb_file = '%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-5kb_+5kb.gff' % (gffFolder)
    shep_mycn_promoter_gff_1kb_file = '%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-1kb_+1kb.gff' % (gffFolder)

    shep_mycn_enhancer_gff_file = '%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-0_+0.gff' % (gffFolder)
    shep_mycn_enhancer_gff_5kb_file = '%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-5kb_+5kb.gff' % (gffFolder)
    shep_mycn_enhancer_gff_1kb_file = '%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-1kb_+1kb.gff' % (gffFolder)

    print('ITERATING THROUGH SHEP MYCN PEAKS')

    ticker = 0
    enhancer = 0
    promoter = 0
    other = 0
    for line in shep_on_gff:
        # progress report every 1000 peaks
        if ticker % 1000 == 0:
            print(ticker)
        ticker += 1
        peakID = '%s_%s' % ('SHEP_MYCN', str(ticker))

        lineLocus = utils.Locus(line[0], line[3], line[4], '.', peakID)

        # peaks that are not conserved in NB are dropped and not counted
        if not nb_conserved_mycn_collection.getOverlap(lineLocus):
            continue

        gffLine = [line[0], peakID, peakID, line[3], line[4], '', '.', '', peakID]
        # floor division keeps the coordinates integral on every interpreter
        peakCenter = (int(line[3]) + int(line[4])) // 2
        gffLine_5kb = [line[0], peakID, peakID, peakCenter - 5000, peakCenter + 5000, '', '.', '', peakID]
        #the 1kb is not a center +/- but a flank
        gffLine_1kb = [line[0], peakID, peakID, int(line[3]) - 1000, int(line[4]) + 1000, '', '.', '', peakID]

        shep_mycn_gff.append(gffLine)
        shep_mycn_gff_5kb.append(gffLine_5kb)
        shep_mycn_gff_1kb.append(gffLine_1kb)

        #tss overlap should take precedence over enhancer overlap
        if shep_tss_collection.getOverlap(lineLocus, 'both'):
            shep_mycn_promoter_gff.append(gffLine)
            shep_mycn_promoter_gff_5kb.append(gffLine_5kb)
            shep_mycn_promoter_gff_1kb.append(gffLine_1kb)
            promoter += 1
        #now check for enhancer overlap
        elif shep_enhancer_collection.getOverlap(lineLocus, 'both'):
            shep_mycn_enhancer_gff.append(gffLine)
            shep_mycn_enhancer_gff_5kb.append(gffLine_5kb)
            shep_mycn_enhancer_gff_1kb.append(gffLine_1kb)
            enhancer += 1
        else:
            other += 1

    # the total counts every input peak, the three classes only conserved ones
    print('Of %s shep on mycn peaks' % (len(shep_on_gff)))
    print('%s are promoter' % (promoter))
    print('%s are enhancer' % (enhancer))
    print('%s are other' % (other))

    #now write out the gffs
    utils.unParseTable(shep_mycn_gff, shep_mycn_gff_file, '\t')
    utils.unParseTable(shep_mycn_gff_5kb, shep_mycn_gff_5kb_file, '\t')
    utils.unParseTable(shep_mycn_gff_1kb, shep_mycn_gff_1kb_file, '\t')

    utils.unParseTable(shep_mycn_promoter_gff, shep_mycn_promoter_gff_file, '\t')
    utils.unParseTable(shep_mycn_promoter_gff_5kb, shep_mycn_promoter_gff_5kb_file, '\t')
    utils.unParseTable(shep_mycn_promoter_gff_1kb, shep_mycn_promoter_gff_1kb_file, '\t')

    utils.unParseTable(shep_mycn_enhancer_gff, shep_mycn_enhancer_gff_file, '\t')
    utils.unParseTable(shep_mycn_enhancer_gff_5kb, shep_mycn_enhancer_gff_5kb_file, '\t')
    utils.unParseTable(shep_mycn_enhancer_gff_1kb, shep_mycn_enhancer_gff_1kb_file, '\t')
def main():
    '''
    main run call

    Command-line driver for the multi-input run: parses the options,
    converts or copies every input region file into <out>/gff/, merges the
    regions into one master GFF, stitches them with regionStitching, maps
    every bam onto the stitched GFF with bamliquidator, builds the region
    map with mapCollection, merges the signal with collapseRegionMap and
    finally shells out to ROSE2_callSuper.R and ROSE2_geneMapper.py.

    Relies on the module-level names pipeline_dir and bamliquidator_path
    and on the sibling functions getBamChromList, filterGFF,
    checkRefCollection, regionStitching, mapCollection and
    collapseRegionMap.

    Exits with a non-zero status when required flags are missing, when the
    control bam list has the wrong length, when the genome or the mask
    format is not supported, or when a bam could not be mapped.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")
    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        sys.exit(1)

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)
    inputList = [inputFile for inputFile in options.input.split(',') if len(inputFile) > 1]

    #converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        # the extension of THIS file decides the handling, not the
        # extension of the whole comma separated option string
        extension = inputFile.split('.')[-1]
        if extension == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][0:-4]  #strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        else:
            if extension != 'gff':
                print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)

    # GETTING THE LIST OF BAMFILES TO PROCESS
    #either same number of bams for rankby and control
    #or only 1 control
    #or none!
    #bamlist should be all rankby bams followed by control bams
    rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) > 0]
    if options.control:
        controlBamList = [bam for bam in options.control.split(',') if len(bam) > 0]

        if len(controlBamList) == len(rankbyBamList):
            #case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            #case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList * len(rankbyBamList)
        else:
            print('ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE')
            sys.exit(1)
    else:
        bamFileList = rankbyBamList

    # Stitch parameter ('' is forwarded to regionStitching as-is)
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE GENOME
    # str.upper works on every interpreter; string.upper is Python 2 only
    genome = options.genome.upper()
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit(1)

    #FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))

    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    #use a simpler unique region naming system
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0], '\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile, '\t')
            gffCollection = utils.gffToLocusCollection(gff, 50)
            inputLoci += gffCollection.getLoci()

        inputCollection = utils.LocusCollection(inputLoci, 50)
        inputCollection = inputCollection.stitchCollection()  # stitches to produce unique regions
        inputGFF = utils.locusCollectionToGFF(inputCollection)

    #now number things appropriately
    formattedGFF = []
    for i, line in enumerate(inputGFF):
        #new ids are inputName_N with 1 indexing; start/stop get ordered
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]
        lineID = '%s_%s' % (inputName, str(i + 1))
        formattedGFF.append([chrom, lineID, lineID, min(coords), max(coords), '', sense, '', lineID])

    #name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder, genome, inputName)
    utils.unParseTable(formattedGFF, masterGFFFile, '\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))

    # MAKING THE START DICT
    # NOTE(review): startDict is not read again in this function
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile, bamChromList)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        # if it's a bed file
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit(1)
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci
                        if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))
    # NOTE(review): with an int window this is integer division on
    # Python 2 but true division on Python 3, which would put a float into
    # the file names -- confirm the target interpreter before changing it.
    stitchKB = str(stitchWindow / 1000)
    if not removeTSS:
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchKB)
    else:
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, stitchKB)
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1))

    # MAPPING TO THE STITCHED GFF
    bamFileListUnique = utils.uniquify(list(bamFileList))  #prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s/matrix.txt' % (mappedOut1Folder)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            # an earlier run already produced the matrix, reuse it
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit(1)

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList,
                  mappedFolder, outputFile1, refName=stitchedGFFName)

    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1, inputName + '_MERGED_SIGNAL', controlBams=options.control)

    #now try the merging
    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (
        pipeline_dir, outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    print('CALLING GENE MAPPING')

    #for now don't use ranking bam to call top genes
    for tableKind in ('SuperEnhancers', 'StretchEnhancers', 'SuperStretchEnhancers'):
        tableFile = "%s_%s.table.txt" % (inputName, tableKind)
        cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir, genome, outFolder, tableFile)
        print(cmd)
        os.system(cmd)
def main():
    '''
    main run call

    Command-line driver for the single-input run: parses the options,
    converts (.bed) or copies (.gff) the input region file into <out>/gff/,
    stitches the regions with regionStitching, maps every bam onto the
    stitched GFF with bamliquidator_batch.py, builds the region map with
    mapCollection and finally shells out to ROSE2_callSuper.R and to three
    background ROSE2_geneMapper.py runs.

    The annotation files and the R / gene mapper scripts are addressed
    relative to the current working directory. Relies on the sibling
    functions getBamChromList, filterGFF, checkRefCollection,
    regionStitching and mapCollection.

    Exits with a non-zero status when required flags are missing, when the
    genome or the mask format is not supported, or when a bam could not be
    mapped.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-b", "--bams", dest="bams", nargs=1, default=None,
                      help="Enter a comma separated list of additional bam files to map to")
    # the help text used to be a copy of the -r text
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a control bamfile")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")
    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        sys.exit(1)

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE
    inputExtension = options.input.split('.')[-1]
    if inputExtension == 'bed':
        # CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    else:
        if inputExtension != 'gff':
            print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
        # COPY THE INPUT GFF TO THE GFF FOLDER
        # (the run keeps reading the original path, the copy is a record)
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # GETTING THE LIST OF BAMFILES TO PROCESS
    if options.control:
        bamFileList = [options.rankby, options.control]
    else:
        bamFileList = [options.rankby]

    if options.bams:
        # deliberately not uniquified here: the same control bam may be
        # listed repeatedly; duplicates are removed only for the mapping
        bamFileList += options.bams.split(',')

    # optional args

    # Stitch parameter ('' is forwarded to regionStitching as-is)
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (cwd),
        'RN6': '%s/annotation/rn6_refseq.ucsc' % (cwd),
    }

    # report an unknown build instead of dying with a bare KeyError
    try:
        annotFile = genomeDict[genome.upper()]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit(1)

    # MAKING THE START DICT
    # NOTE(review): startDict is not read again in this function
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(inputGFFFile, bamChromList)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        # if it's a bed file
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit(1)
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci
                        if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)
    # making sure start/stop ordering are correct
    for line in stitchedGFF:
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitchWindow)
    print(type(stitchWindow))
    # NOTE(review): with an int window this is integer division on
    # Python 2 but true division on Python 3, which would put a float into
    # the file names -- confirm the target interpreter before changing it.
    stitchKB = str(stitchWindow / 1000)
    if not removeTSS:
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchKB)
    else:
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, stitchKB)
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1))

    # MAPPING TO THE STITCHED GFF
    # the mapper is looked up on the PATH
    bamliquidator_path = 'bamliquidator_batch.py'

    bamFileListUnique = utils.uniquify(list(bamFileList))  #prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s/matrix.txt' % (mappedOut1Folder)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            # an earlier run already produced the matrix, reuse it
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit(1)

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList,
                  mappedFolder, outputFile1, refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    # the R script gets the control bam's file name, or 'NONE' without one
    if options.control:
        controlName = options.control.split('/')[-1]
    else:
        controlName = 'NONE'
    cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (
        outFolder, outputFile1, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    time.sleep(20)

    # the optional control bam is spliced in between the -r and -i flags
    if options.control:
        controlArg = '-c %s ' % (options.control)
    else:
        controlArg = ''

    # each gene mapper run is sent to the background with a trailing '&'
    for tableKind in ('SuperEnhancers', 'StretchEnhancers', 'SuperStretchEnhancers'):
        tableFile = "%s_%s.table.txt" % (inputName, tableKind)
        cmd = "python ROSE2_geneMapper.py -g %s -r %s %s-i %s%s &" % (
            genome, options.rankby, controlArg, outFolder, tableFile)
        os.system(cmd)
def main():
    '''
    main run call

    Parses the command line, converts/copies every input region file into
    the output gff folder, merges them into one master GFF, optionally
    masks regions, stitches the regions, maps every bam to the stitched
    GFF with bamliquidator, collapses the per-bam signal into a merged
    signal table, then shells out to ROSE2_callSuper.R and
    ROSE2_geneMapper.py.

    Exits through sys.exit() when required flags are missing, when the
    control/rankby bam counts are inconsistent, when the genome build is
    unsupported, when the mask file has an unknown extension, or when a
    bam fails to map.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)

    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")
    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        sys.exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)
    inputList = [inputFile for inputFile in options.input.split(',') if len(inputFile) > 1]

    # converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        extension = inputFile.split('.')[-1]
        if extension == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][0:-4]  # strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        else:
            # The extension of THIS file decides whether to warn; testing the
            # whole comma separated option string (as before) only looked at
            # the extension of the last file in the list.
            if extension != 'gff':
                print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)

    # GETTING THE LIST OF BAMFILES TO PROCESS
    # either same number of bams for rankby and control
    # or only 1 control
    # or none!
    # bamlist should be all rankby bams followed by control bams
    rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) > 0]
    if options.control:
        controlBamList = [bam for bam in options.control.split(',') if len(bam) > 0]
        if len(controlBamList) == len(rankbyBamList):
            # case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            # case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList * len(rankbyBamList)
        else:
            print('ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE')
            sys.exit()
    else:
        bamFileList = rankbyBamList

    # Stitch parameter: '' is passed through to regionStitching unchanged
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE GENOME
    genome = options.genome.upper()
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()

    # FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))

    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    # use a simpler unique region naming system
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0], '\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile, '\t')
            gffCollection = utils.gffToLocusCollection(gff, 50)
            inputLoci += gffCollection.getLoci()

        inputCollection = utils.LocusCollection(inputLoci, 50)
        inputCollection = inputCollection.stitchCollection()  # stitches to produce unique regions
        inputGFF = utils.locusCollectionToGFF(inputCollection)

    # now number things appropriately
    formattedGFF = []
    for i, line in enumerate(inputGFF):
        # new id is inputName_<1-based index>
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]
        lineID = '%s_%s' % (inputName, str(i + 1))  # 1 indexing
        newLine = [chrom, lineID, lineID, min(coords), max(coords), '', sense, '', lineID]
        formattedGFF.append(newLine)

    # name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder, genome.upper(), inputName)
    utils.unParseTable(formattedGFF, masterGFFFile, '\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    # GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile, bamChromList)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci: keep only loci with no mask overlap
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci
                        if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" %
              (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))
    # the stitch window (in kb) is embedded in every stitched file name
    stitchKB = str(stitchWindow / 1000)
    if not removeTSS:
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchKB)
    else:
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, stitchKB)
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1))

    # MAPPING TO THE STITCHED GFF
    # NOTE(review): bamliquidator_path is not assigned in this function;
    # it is presumably a module-level setting - confirm.
    bamFileListUnique = utils.uniquify(list(bamFileList))  # prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            # reuse an existing mapping instead of re-running bamliquidator
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList,
                  mappedFolder, outputFile1, refName=stitchedGFFName)

    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1, inputName + '_MERGED_SIGNAL', controlBams=options.control)

    # now try the merging
    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    # the merged signal table is passed to R with no control column name
    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (
        pipeline_dir, outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    print('CALLING GENE MAPPING')

    # for now don't use ranking bam to call top genes
    for tableKind in ['SuperEnhancers', 'StretchEnhancers', 'SuperStretchEnhancers']:
        tableFile = "%s_%s.table.txt" % (inputName, tableKind)
        cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir, genome, outFolder, tableFile)
        print(cmd)
        os.system(cmd)
def mapEnhancerToGene(annotFile,enhancerFile,transcribedFile='',uniqueGenes=True,searchWindow =50000,noFormatTable = False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only.
    Otherwise, gives for each refseq

    annotFile: annotation file handed to utils.makeStartDict and
        utils.makeTranscriptCollection
    enhancerFile: tab delimited enhancer table; rows whose first field
        starts with '#' or 'R' are not treated as enhancers, and the last
        two columns of every enhancer row must be integers (they are used
        as rank and super status)
    transcribedFile: optional tab delimited file whose second column holds
        the gene IDs to consider; when empty every key of the start dict
        is used
    uniqueGenes: when true only the first (best ranked) refseq ID of each
        gene name is written to the gene table
    searchWindow: flank in bp added on both sides of an enhancer when
        looking for proximal TSSs
    noFormatTable: when true the enhancer rows are passed through in full
        and the enhancer table is returned unsorted; otherwise only the
        first 9 and last 2 columns are kept and the rows are ordered by
        the second to last column

    returns (enhancerToGeneTable, geneToEnhancerTable), each a list of rows
    with the header first
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerTable = utils.parseTable(enhancerFile,'\t')

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile,'\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(annotFile,0,0,500,transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = [utils.makeTSSLocus(geneID,startDict,0,0) for geneID in transcribedGenes]

    #this turns the tssLoci list into a LocusCollection
    #50 is the internal parameter for LocusCollection and doesn't really matter
    tssCollection = utils.LocusCollection(tssLoci,50)

    geneDict = {'overlapping':defaultdict(list),'proximal':defaultdict(list)}

    #dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    #list of all genes that appear in this analysis
    overallGeneList = []

    #the header is the first line that is not a '#' comment. Hard coded row
    #indices (0 and 5) only work for one specific number of comment lines.
    #An all-comment table falls back to its first line.
    header = next((row for row in enhancerTable if row[0][0] != '#'), enhancerTable[0])

    #set up the output tables
    #first by enhancer
    geneColumns = ['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE']
    if noFormatTable:
        enhancerToGeneTable = [header + geneColumns]
    else:
        enhancerToGeneTable = [header[0:9] + geneColumns + header[-2:]]

    #next make the gene to enhancer table
    geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS','ENHANCER_RANKS','IS_SUPER']]

    def namesString(refIDList):
        #comma separated unique gene names for a list of refseq IDs
        return ','.join(utils.uniquify([startDict[x]['name'] for x in refIDList]))

    for line in enhancerTable:
        #skip comment lines and the header line
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1],line[2],line[3])
        enhancerLocus = utils.Locus(line[1],line[2],line[3],'.',line[0])

        #overlapping genes are transcribed genes whose transcript is directly in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(enhancerLocus,'both')
        overlappingGenes = utils.uniquify([locus.ID() for locus in overlappingLoci])

        #proximalGenes are transcribed genes where the tss is within searchWindow of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus,searchWindow,searchWindow),'both')
        proximalGenes = utils.uniquify([locus.ID() for locus in proximalLoci])

        #distal genes have a tss within 1mb; they only take part in the closest gene search
        distalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus,1000000,1000000),'both')
        distalGenes = utils.uniquify([locus.ID() for locus in distalLoci])

        #candidate order matters: ties in distance go to the earliest entry
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes

        #a gene is reported as overlapping or proximal, never both.
        #technically it is possible for a gene to be overlapping, but not proximal since the
        #gene could be longer than the search window, but we'll let that slide here
        proximalGenes = [refID for refID in proximalGenes if refID not in overlappingGenes]

        #Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            #get enhancerCenter (floor division keeps integer coordinates)
            enhancerCenter = (int(line[2]) + int(line[3]))//2

            #get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes]
            #get the ID and convert to name
            closestGene = startDict[allEnhancerGenes[distList.index(min(distList))]]['name']

        #NOW WRITE THE ROW FOR THE ENHANCER TABLE
        geneFields = [namesString(overlappingGenes),namesString(proximalGenes),closestGene]
        if noFormatTable:
            newEnhancerLine = list(line) + geneFields
        else:
            newEnhancerLine = line[0:9] + geneFields + line[-2:]
        enhancerToGeneTable.append(newEnhancerLine)

        #Now grab all overlapping and proximal genes for the gene ordered table
        enhancerRank = int(line[-2])
        enhancerIsSuper = int(line[-1])
        for category,refIDList in (('overlapping',overlappingGenes),('proximal',proximalGenes)):
            overallGeneList += refIDList
            for refID in refIDList:
                geneDict[category][refID].append(enhancerString)
                rankDict[refID].append(enhancerRank)
                superDict[refID].append(enhancerIsSuper)

    #End loop through

    #Make table by gene
    overallGeneList = utils.uniquify(overallGeneList)

    #use enhancer rank to order
    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = set()
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if uniqueGenes and geneName in usedNames:
            continue
        usedNames.add(geneName)

        proxEnhancers = geneDict['overlapping'][refID]+geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = ','.join([str(x) for x in rankDict[refID]])

        newLine = [geneName,refID,','.join(proxEnhancers),enhancerRanks,superStatus]
        geneToEnhancerTable.append(newLine)

    #resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable,geneToEnhancerTable

    enhancerOrder = utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]])
    sortedTable = [enhancerToGeneTable[0]]
    for i in enhancerOrder:
        sortedTable.append(enhancerToGeneTable[(i+1)])

    return sortedTable,geneToEnhancerTable
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only.
    Otherwise, gives for each refseq

    In addition to the enhancer and gene tables this also builds an
    "enhancer to top gene" table: the +/- 5kb gene regions of every
    enhancer associated gene are written to a gff, mapped with
    bamliquidator_batch on the rankBy bam (and on the control bam when
    controlBamFile is a non empty string), and for each enhancer the
    associated gene name with the highest value in the dict returned by
    makeSignalDict is reported as TOP_GENE.

    Side effects: runs `samtools idxstats` and bamliquidator_batch through
    the shell, writes a gff next to enhancerFile, and calls sys.exit()
    when utils.checkOutput does not report the mapped matrix.txt files.

    returns (enhancerToGeneTable, enhancerToTopGeneTable,
    geneToEnhancerTable); unless noFormatTable is set the first two are
    re-ordered by the integer in the second to last column of the
    enhancer rows
    '''
    startDict = utils.makeStartDict(annotFile)
    # analysis name = enhancer file name up to its first '.'
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    # internal parameter for debugging
    # NOTE(review): not read anywhere in this function
    byRefseq = False

    # gene IDs come from column 2 of transcribedFile when one is given,
    # otherwise every key of the start dict is used
    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(utils.makeTSSLocus(geneID, startDict, 0, 0))

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really
    # matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    # enhancer strings per refseq ID, split by how the gene was associated
    geneDict = {'overlapping': defaultdict(
        list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # the header is the first line whose first field does not start with '#'
    # NOTE(review): header stays unbound if every line is a comment
    for line in enhancerTable:
        if line[0][0] == '#':
            continue
        else:
            header = line
            break

    if noFormatTable:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]

    else:
        # set up the output tables
        # first by enhancer: only the first 9 and last 2 input columns are kept
        enhancerToGeneTable = [
            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]

        # next by gene
        # NOTE(review): overwritten unconditionally by the assignment below
        geneToEnhancerTable = [
            ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']]

    # next make the gene to enhancer table
    geneToEnhancerTable = [
        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        # skip comment lines and any line starting with 'R'
        # (presumably the REGION_ID header row - confirm)
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])

        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(
            enhancerLocus, 'both')
        overlappingGenes = []
        for overlapLocus in overlappingLoci:
            overlappingGenes.append(overlapLocus.ID())

        # proximalGenes are transcribed genes where the tss is within
        # searchWindow (50kb by default) of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = []
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())

        # distal genes: tss within a fixed 1,000,000 bp flank
        distalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = []
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        # built BEFORE the cross-list de-duplication below, so a gene can
        # appear more than once; only used for the closest gene search
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes
        # these checks make sure each gene list is unique.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the 50kb window, but we'll let that slide
        # here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[
                allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        if noFormatTable:

            newEnhancerLine = list(line)
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:9]
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)
        # Now grab all overlapping and proximal genes for the gene ordered
        # table; the last two columns of the row are recorded as rank and
        # super status for each associated gene

        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

    # End loop through
    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    #get the chromLists from the various bams here
    # first tab-separated field of each idxstats line; the last two entries
    # of the newline split are dropped (presumably the '*' row and the
    # trailing empty string - confirm)
    cmd = 'samtools idxstats %s' % (rankByBamFile)
    idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True)
    idxStats= idxStats.communicate()
    bamChromList = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]]

    if len(controlBamFile) > 0:
        cmd = 'samtools idxstats %s' % (controlBamFile)
        idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True)
        idxStats= idxStats.communicate()
        bamChromListControl = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]]
        # keep only chroms present in both bams
        bamChromList = [chrom for chrom in bamChromList if bamChromListControl.count(chrom) != 0]

    #now make sure no genes have a bad chrom
    overallGeneList = [gene for gene in overallGeneList if bamChromList.count(startDict[gene]['chr']) != 0]

    #now make an enhancer collection of all transcripts
    enhancerGeneCollection = utils.makeTranscriptCollection(
        annotFile, 5000, 5000, 500, overallGeneList)

    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file, next to the enhancer table
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName,gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamToGFF

    # bamliquidator_batch is looked up on the shell PATH
    bamliquidator_path = 'bamliquidator_batch'

    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    bamName = rankByBamFile.split('/')[-1]
    mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName,gffRootName, bamName)
    mappedRankByFile = "%s%s_%s_%s/matrix.txt" % (enhancerFolder,enhancerName, gffRootName, bamName)
    cmd = bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedRankByFolder,rankByBamFile)
    print("Mapping rankby bam %s" % (rankByBamFile))
    print(cmd)
    os.system(cmd)

    #check for completion
    if utils.checkOutput(mappedRankByFile,0.2,5):
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
        sys.exit()

    # next on the control bam if it exists
    if len(controlBamFile) > 0:
        controlName = controlBamFile.split('/')[-1]
        mappedControlFolder = "%s%s_%s_%s/" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        mappedControlFile = "%s%s_%s_%s/matrix.txt" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        cmd = bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedControlFolder,controlBamFile)
        print("Mapping control bam %s" % (controlBamFile))
        print(cmd)
        os.system(cmd)

        #check for completion
        if utils.checkOutput(mappedControlFile,0.2,5):
            print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
        else:
            print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
            sys.exit()

    # now get the appropriate output files
    # signalDict is indexed by refseq ID below; its values come from
    # makeSignalDict, which is defined outside this function
    if len(controlBamFile) > 0:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" %
              (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()

    # use enhancer rank to order
    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = []

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        # with uniqueGenes only the first refseq ID seen for a gene name
        # gets a row (and contributes a signal value)
        if usedNames.count(geneName) > 0 and uniqueGenes == True:
            continue
        else:
            usedNames.append(geneName)

        proxEnhancers = geneDict['overlapping'][
            refID] + geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = join([str(x) for x in rankDict[refID]], ',')

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        newLine = [geneName, refID, join(
            proxEnhancers, ','), enhancerRanks, superStatus, enhancerSignal]
        geneToEnhancerTable.append(newLine)

    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        # TOP_GENE / TSS_SIGNAL are inserted before the last two columns
        enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [
            'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:

        # candidate genes = overlapping + proximal gene name columns
        geneList = []
        if noFormatTable:
            geneList += line[-3].split(',')
            geneList += line[-2].split(',')

        else:
            geneList += line[10].split(',')
            geneList += line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])

        if len(geneList) > 0:
            try:
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                maxGene = geneList[maxIndex]
                maxSig = sigVector[maxIndex]
                # a top signal of exactly 0.0 is reported as no top gene
                if maxSig == 0.0:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
            except ValueError:
                # max() of an empty list: at least one gene name has no
                # recorded signal
                if len(geneList) == 1:
                    maxGene = geneList[0]
                    maxSig = 'NONE'
                else:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
        else:
            maxGene = 'NONE'
            maxSig = 'NONE'
        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable
    else:
        # both enhancer level tables share row order, so one ordering
        # (by the second to last column) is applied to both
        enhancerOrder = utils.order([int(line[-2])
                                     for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        sortedTopGeneTable = [enhancerToTopGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i + 1)])
            sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

        return sortedTable, sortedTopGeneTable, geneToEnhancerTable
def findCanidateTFs(genome, enhancer_gff, expressedNM, expressionDictNM, bamFile, TFlist, refseqToNameDict, projectFolder, projectName, promoter):
    '''
    Assign each Super-Enhancer to the closest active TSS to its center
    Return a dictionary keyed by TF that points to a list of loci

    genome: object whose returnFeature('annot_file') and
        returnFeature('tf_file') give the annotation and TF table paths
    enhancer_gff: enhancer regions, handed to utils.gffToLocusCollection
    expressionDictNM: activity value per refseq ID, indexed for every
        proximal gene that is considered
    TFlist: container of refseq IDs that count as TFs
    projectFolder, projectName: prefix of the *_ENHANCER_ASSIGNMENT.txt
        table written by this function
    promoter: when true, a +/- 2kb promoter locus is added for every
        assigned TF whose promoter does not overlap one of its enhancers
    expressedNM, bamFile, refseqToNameDict: accepted for interface
        compatibility, not used here

    Assignment rule per enhancer: every TF whose TSS overlaps the enhancer;
    otherwise the most active gene with a TSS within 100kb (if it is a TF);
    otherwise the gene with the closest TSS within 1mb (if it is a TF).
    '''
    #loading in the enhancer gff regions
    enhancer_collection = utils.gffToLocusCollection(enhancer_gff)
    enhancer_loci = enhancer_collection.getLoci()

    #loading in the genome and TF info
    annot_file = genome.returnFeature('annot_file')
    startDict = utils.makeStartDict(annot_file)

    tf_table = utils.parseTable(genome.returnFeature('tf_file'),'\t')
    refID_list = [line[0] for line in tf_table] #creates a list of all NM IDs for TFs

    #make a collection of all TF TSSs
    #this is a precise 1 coordinate TSS locus
    tssLoci = [utils.makeTSSLocus(refID,startDict,0,0) for refID in refID_list]
    tssCollection = utils.LocusCollection(tssLoci,50)

    #enhancer level and gene level summaries
    #NOTE(review): these two are filled in but not yet written or returned
    enhancerTable = [['ENHANCER_ID','CHROM','START','STOP','GENE_LIST']]
    gene_to_enhancer_dict = defaultdict(list)

    #the outputs of this function
    TFtoEnhancerDict = defaultdict(list)
    enhancerAssignment = []

    def assign(gene, enhancer):
        #record one TF <-> enhancer assignment in both outputs
        TFtoEnhancerDict[gene].append(enhancer)
        enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()])

    # Loop through enhancers
    #all gene names stored by refID
    for enhancer in enhancer_loci:

        # If the enhancer overlaps a TSS, save it
        overlapping_loci = tssCollection.getOverlap(enhancer, 'both')
        overlappingGenes = utils.uniquify([locus.ID() for locus in overlapping_loci])

        # Find all gene TSS within 100 kb
        proximal_loci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,100000,100000),'both')
        proximalGenes = utils.uniquify([locus.ID() for locus in proximal_loci])

        # If no genes are within 100 kb, find the closest gene within 1 million bp
        closestGene = ''
        if len(overlappingGenes) == 0 and len(proximalGenes) == 0:
            distal_loci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,1000000,1000000),'both')
            distalGenes = [locus.ID() for locus in distal_loci]

            enhancerCenter = (int(enhancer.start()) + int(enhancer.end())) // 2
            distance_list = [abs(enhancerCenter - startDict[geneID]['start'][0])
                             for geneID in distalGenes]
            if len(distance_list) > 0:
                closestGene = distalGenes[distance_list.index(min(distance_list))]

        #now we have all potential gene cases
        all_refIDs = overlappingGenes + proximalGenes + [closestGene]

        #now we get all names and refIDs
        all_refIDs = utils.uniquify([refID for refID in all_refIDs if len(refID) > 0])
        all_names = utils.uniquify([startDict[refID]['name'] for refID in all_refIDs])

        #first do enhancer level assignment
        names_string = ','.join(all_names)
        enhancerTable.append([enhancer.ID(),enhancer.chr(),enhancer.start(),enhancer.end(),names_string])

        #now do gene level assignment
        #an enhancer can be assigned to multiple genes
        for refID in all_refIDs:
            gene_to_enhancer_dict[refID].append(enhancer.ID())

        #a gene is either overlapping or proximal, never both
        proximalGenes = [refID for refID in proximalGenes if refID not in overlappingGenes]

        # If a TSS overlaps an enhancer, assign them together
        if overlappingGenes:
            for gene in overlappingGenes:
                if gene in TFlist:
                    assign(gene, enhancer)

        # Otherwise, assign the enhancer to the most active gene in 100 kb
        elif proximalGenes:
            highestGene = ''
            highestActivity = 0
            for gene in proximalGenes:
                if expressionDictNM[gene] > highestActivity:
                    highestActivity = expressionDictNM[gene]
                    highestGene = gene
            #assign to the most active gene itself, not to whichever gene
            #the loop above happened to visit last
            if highestGene in TFlist:
                assign(highestGene, enhancer)

        # Otherwise fall back to the closest gene within 1 million bp
        elif closestGene:
            if closestGene in TFlist:
                assign(closestGene, enhancer)

    # Add promoter if it's not contained in the super
    if promoter:
        for gene in list(TFtoEnhancerDict.keys()):
            #named promoterLocus so the promoter flag is not overwritten
            promoterLocus = utils.Locus(startDict[gene]['chr'],
                                        int(startDict[gene]['start'][0]) - 2000,
                                        int(startDict[gene]['start'][0]) + 2000,
                                        startDict[gene]['sense'])
            overlapBool = False
            for enhancer in TFtoEnhancerDict[gene]:
                if promoterLocus.overlaps(enhancer):
                    overlapBool = True
            if not overlapBool:
                TFtoEnhancerDict[gene].append(promoterLocus)

    seAssignmentFile = projectFolder + projectName + '_ENHANCER_ASSIGNMENT.txt'
    utils.unParseTable(enhancerAssignment, seAssignmentFile, '\t')

    return TFtoEnhancerDict
def binPeakTable(peak_table_path,activity_path,binSize = 1000000,output = ''):

    '''
    calculates the promoter/enhancer AUC signal across half-overlapping bins

    peak_table_path: tab-delimited peak table with one header row.
        columns read (0-based): 1 = chrom, 2 = start, 3 = stop,
        4 and 9 = multiplied together to give the region signal,
        5 = flag (1 -> signal counted as promoter, otherwise as enhancer)
    activity_path: tab-delimited table whose column 1 holds the gene IDs
        handed to utils.makeTSSLocus
    binSize: bin width in bp. a new bin starts every binSize/2 bp
    output: path of the output table. if empty, it is derived from
        peak_table_path by replacing '.txt' with 'bin_table.txt'

    relies on the module level annotFile and on a hard wired hg19
    chrom sizes file

    returns the output table (header row first); the same table is
    written to output
    '''

    if len(output) == 0:
        #derive the default from the input path. (the table itself has not
        #been loaded at this point, so it cannot be used here)
        output = peak_table_path.replace('.txt','bin_table.txt')

    binSize = int(binSize)
    stepSize = binSize//2

    activityTable = utils.parseTable(activity_path,'\t')
    startDict = utils.makeStartDict(annotFile)

    print('making tss collection for active genes')
    tssLoci = [utils.makeTSSLocus(line[1],startDict,0,0) for line in activityTable]
    tssCollection = utils.LocusCollection(tssLoci,50)

    #hard wired for hg19
    chrom_path = '/ark/home/cl512/pipeline/annotation/hg19.chrom.sizes'
    chrom_table = utils.parseTable(chrom_path,'\t')
    chromDict = {}
    for line in chrom_table:
        chromDict[line[0]] = int(line[1])

    chromList = ['chr'+str(i) for i in range(1,23)] + ['chrX','chrY'] #set the hg19 chroms

    #need to seed the dicts: chrom -> bin index -> summed signal
    promoterDict = {}
    enhancerDict = {}
    for chrom in chromList:
        promoterDict[chrom] = defaultdict(float)
        enhancerDict[chrom] = defaultdict(float)

    #now as we iterate through the peak table
    peak_table = utils.parseTable(peak_table_path,'\t')
    print('filling in enhancer dict')
    for line in peak_table[1:]:
        chrom = line[1]
        if chrom not in promoterDict:
            #only the chromosomes in chromList are reported, so regions on
            #any other contig are skipped instead of raising a KeyError
            continue
        signal = float(line[9])*int(line[4])

        #for approximation use the center coordinate to assign bin
        #every region should be in 2 bins: bin i spans
        #i*stepSize + 1 to i*stepSize + binSize, so a center in step k
        #is covered by bin k and by the bin that started one step earlier
        center = (int(line[2]) + int(line[3]))//2
        first_bin = center//stepSize
        second_bin = first_bin - 1

        if int(line[5]) == 1:
            binDict = promoterDict[chrom]
        else:
            binDict = enhancerDict[chrom]
        binDict[first_bin] +=signal
        binDict[second_bin] +=signal

    #now load up the new peak table
    outTable = [['BIN','CHROM','START','STOP','TSS_COUNT','PROMOTER','ENHANCER']]
    print('making out table')
    for chrom in chromList:
        print(chrom)
        chromLength = chromDict[chrom]
        for i in range(chromLength//stepSize):
            bin_start = i*stepSize + 1
            bin_stop = i*stepSize + binSize
            bin_locus = utils.Locus(chrom,bin_start,bin_stop,'.')
            overlapTSSCount = len(tssCollection.getOverlap(bin_locus,'both'))
            bin_id = '%s_%s' % (chrom,str(i+1))
            promoterSignal = promoterDict[chrom][i]
            enhancerSignal = enhancerDict[chrom][i]
            newLine = [bin_id,chrom,bin_start,bin_stop,overlapTSSCount,promoterSignal,enhancerSignal]
            outTable.append(newLine)

    utils.unParseTable(outTable,output,'\t')
    return outTable
# ##### import sys sys.path.append('/ark/home/af661/src/utils/') import utils from collections import defaultdict from string import upper import numpy as np from math import log # Annotation file for hg19 annotationFile = '/ark/home/cl512/pipeline/annotation/hg19_refseq.ucsc' startDict = utils.makeStartDict(annotationFile) print 'making TSS loci' tssLoci = [] counter = 0 for gene in startDict: counter += 1 if counter % 1000 == 0: print counter tssLoci.append(utils.makeTSSLocus(gene, startDict, 100000, 100000)) # proximal = within 100kb tssCollection = utils.LocusCollection(tssLoci, 200) print 'converting gene names' refseqToNameDict = {} annotTable = utils.parseTable(annotationFile, '\t')
def make_shep21_mycn_landscape(nb_all_chip_dataFile):
    '''
    takes the shep21 mycn peaks that overlap a conserved nb mycn region and
    writes them out as gffs: all conserved peaks, the subset overlapping an
    active tss (promoter) and the subset overlapping an enhancer.
    each set is written as the peak itself (-0/+0), as peak center +/- 5kb
    and as the peak flanked by 1kb on each side (9 gffs in total)

    nb_all_chip_dataFile: data table loaded with pipeline_dfci.loadDataTable
    relies on the module level folders (macsEnrichedFolder, gffFolder,
    projectFolder, geneListFolder) and annotFile
    '''

    #first get the shep21 regions
    print('LOADING SHEP21 MYCN SITES')
    dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)
    mycn_bed_path = '%s%s' % (
        macsEnrichedFolder,
        dataDict['SHEP21_0HR_MYCN_NOSPIKE']['enrichedMacs'])
    mycn_bed = utils.parseTable(mycn_bed_path, '\t')

    #now get the conserved NB MYCN regions
    conserved_gff_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    conserved_collection = utils.gffToLocusCollection(conserved_gff_path)

    #make a collection of enhancers
    print('LOADING SHEP21 ACTIVE ENHANCERS')
    enhancer_path = '%senhancer_rose/SHEP21_0HR_H3K27AC_NOSPIKE_ROSE/SHEP21_0HR_H3K27AC_NOSPIKE_peaks_AllEnhancers.table.txt' % (
        projectFolder)
    enhancer_collection = utils.makeSECollection(
        enhancer_path, 'SHEP21_0HR_H3K27AC_NOSPIKE')

    #now get the active promoters as +/- 1kb tss loci
    print('LOADING SHEP21 ACTIVE PROMOTERS')
    startDict = utils.makeStartDict(annotFile)
    transcribed_path = '%sHG19_SHEP21_H3K27AC_TRANSCRIBED.txt' % (
        geneListFolder)
    transcribed_table = utils.parseTable(transcribed_path, '\t')
    transcribed_ids = [row[1] for row in transcribed_table]
    tss_loci = [utils.makeTSSLocus(refID, startDict, 1000, 1000)
                for refID in transcribed_ids]
    tss_collection = utils.LocusCollection(tss_loci, 50)

    #the 9 gffs, grouped by peak set
    #inside each group the order is always: -0/+0, 5kb, 1kb
    all_gffs = ([], [], [])
    promoter_gffs = ([], [], [])
    enhancer_gffs = ([], [], [])

    #and their respective file names, in the same order
    all_paths = (
        '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_-0_+0.gff' % (gffFolder),
        '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_-5kb_+5kb.gff' % (gffFolder),
        '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_-1kb_+1kb.gff' % (gffFolder),
    )
    promoter_paths = (
        '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-0_+0.gff' % (gffFolder),
        '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-5kb_+5kb.gff' % (gffFolder),
        '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-1kb_+1kb.gff' % (gffFolder),
    )
    enhancer_paths = (
        '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-0_+0.gff' % (gffFolder),
        '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-5kb_+5kb.gff' % (gffFolder),
        '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-1kb_+1kb.gff' % (gffFolder),
    )

    print('ITERATING THROUGH SHEP21 MYCN PEAKS')
    for peak_index, row in enumerate(mycn_bed):
        if peak_index % 1000 == 0:
            print(peak_index)

        #peak ids are numbered from 1
        peakID = '%s_%s' % ('SHEP21_0HR_MYCN_NOSPIKE', str(peak_index + 1))
        chrom, start, stop = row[0], row[1], row[2]
        peak_locus = utils.Locus(chrom, start, stop, '.', peakID)

        #only peaks overlapping a conserved nb mycn region are kept
        if not conserved_collection.getOverlap(peak_locus):
            continue

        peakCenter = (int(start) + int(stop)) / 2
        gff_rows = (
            [chrom, peakID, peakID, start, stop, '', '.', '', peakID],
            [chrom, peakID, peakID, peakCenter - 5000, peakCenter + 5000,
             '', '.', '', peakID],
            #the 1kb is not a center +/- but a flank
            [chrom, peakID, peakID, int(start) - 1000, int(stop) + 1000,
             '', '.', '', peakID],
        )
        for gff, gff_row in zip(all_gffs, gff_rows):
            gff.append(gff_row)

        #tss overlap should take precedence over enhancer overlap
        if tss_collection.getOverlap(peak_locus, 'both'):
            subset_gffs = promoter_gffs
        elif enhancer_collection.getOverlap(peak_locus, 'both'):
            subset_gffs = enhancer_gffs
        else:
            continue
        for gff, gff_row in zip(subset_gffs, gff_rows):
            gff.append(gff_row)

    #now write out the gffs
    output_groups = (
        (all_gffs, all_paths),
        (promoter_gffs, promoter_paths),
        (enhancer_gffs, enhancer_paths),
    )
    for gffs, paths in output_groups:
        for gff, gff_path in zip(gffs, paths):
            utils.unParseTable(gff, gff_path, '\t')
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers and picks a top gene per enhancer from the
    signal mapped at each gene. if uniqueGenes, reduces to gene name only.
    Otherwise, gives for each refseq

    rankByBamFile: bam mapped with bamliquidator_batch.py at the genes
    controlBamFile: control bam, mapped the same way. '' for no control
    genome: genome name, only used to name the intermediate gff
    annotFile: annotation file given to utils.makeStartDict
    enhancerFile: tab-delimited enhancer table. lines starting with '#' are
        comments, the first other line is the header. columns 1-3 are
        chrom/start/stop and the last two columns are read as int rank and
        int super status
    transcribedFile: optional table with refseq IDs in column 1. if empty,
        every gene in the annotation is used
    searchWindow: distance (bp) around an enhancer searched for proximal TSSs
    noFormatTable: if True the input columns are kept as they are and the
        tables are returned unsorted

    returns (enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable)
    unless noFormatTable is set, the first two are sorted by enhancer rank

    exits with status 1 if the bam mapping fails or gives no output.
    raises ValueError if bamliquidator_batch.py cannot be found or the
    enhancer table has no header row
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = [utils.makeTSSLocus(geneID, startDict, 0, 0)
               for geneID in transcribedGenes]

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really
    # matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    geneDict = {'overlapping': defaultdict(list),
                'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # the header is the first line that is not a comment
    header = None
    for line in enhancerTable:
        if line[0][0] != '#':
            header = line
            break
    if header is None:
        raise ValueError('no header row found in %s' % (enhancerFile))

    # set up the output tables
    # first by enhancer
    geneColumns = ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']
    if noFormatTable:
        enhancerToGeneTable = [header + geneColumns]
    else:
        enhancerToGeneTable = [header[0:9] + geneColumns + header[-2:]]

    # next make the gene to enhancer table
    geneToEnhancerTable = [
        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        # skip comments and the header (first character 'R')
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])
        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(
            enhancerLocus, 'both')
        overlappingGenes = [locus.ID() for locus in overlappingLoci]

        # proximalGenes are transcribed genes where the tss is within
        # searchWindow of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = [locus.ID() for locus in proximalLoci]

        # distalGenes: same search with a 1Mb window
        distalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = [locus.ID() for locus in distalLoci]

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes

        # these checks make sure each gene list is unique.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the 50kb window, but we'll let that slide
        # here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3])) // 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[
                allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        overlapNames = ','.join(
            utils.uniquify([startDict[x]['name'] for x in overlappingGenes]))
        proximalNames = ','.join(
            utils.uniquify([startDict[x]['name'] for x in proximalGenes]))
        if noFormatTable:
            newEnhancerLine = list(line) + [overlapNames, proximalNames, closestGene]
        else:
            newEnhancerLine = line[0:9] + [overlapNames, proximalNames, closestGene] + line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)

        # Now grab all overlapping and proximal genes for the gene ordered
        # table
        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

    # End loop through
    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    enhancerGeneCollection = utils.makeTranscriptCollection(
        annotFile, 5000, 5000, 500, overallGeneList)

    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName,gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamToGFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local, otherwise fail.
    # NOTE(review): the fallback is checked with os.path.isfile, i.e. relative
    # to the working directory and not searched on PATH - confirm this is
    # what is wanted
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    bamName = rankByBamFile.split('/')[-1]
    mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName,gffRootName, bamName)
    mappedRankByFile = "%s%s_%s_%s/matrix.gff" % (enhancerFolder,enhancerName, gffRootName, bamName)
    _mapBamAtEnhancerGenes(bamliquidator_path, enhancerGeneGFFFile,
                           mappedRankByFolder, rankByBamFile, 'rankby')

    # next on the control bam if it exists
    if len(controlBamFile) > 0:
        controlName = controlBamFile.split('/')[-1]
        mappedControlFolder = "%s%s_%s_%s/" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        mappedControlFile = "%s%s_%s_%s/matrix.gff" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        _mapBamAtEnhancerGenes(bamliquidator_path, enhancerGeneGFFFile,
                               mappedControlFolder, controlBamFile, 'control')

    # now get the appropriate output files
    if len(controlBamFile) > 0:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" %
              (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit(1)
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit(1)

    # use enhancer rank to order
    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    # names already written to the gene table
    usedNames = set()

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        # with uniqueGenes only the best ranked refseq of a gene is kept
        if uniqueGenes and geneName in usedNames:
            continue
        usedNames.add(geneName)

        proxEnhancers = geneDict['overlapping'][
            refID] + geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = ','.join([str(x) for x in rankDict[refID]])

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        newLine = [geneName, refID, ','.join(proxEnhancers), enhancerRanks, superStatus, enhancerSignal]
        geneToEnhancerTable.append(newLine)

    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [
            'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:

        geneList = []
        if noFormatTable:
            # overlapping and proximal gene columns
            geneList += line[-3].split(',')
            geneList += line[-2].split(',')
        else:
            # NOTE(review): in the formatted table columns 9/10/11 are
            # OVERLAP/PROXIMAL/CLOSEST, so this reads the proximal and
            # closest columns while the branch above reads overlapping and
            # proximal. left as it was - confirm which one is intended
            geneList += line[10].split(',')
            geneList += line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])

        if len(geneList) > 0:
            try:
                # max() of an empty list raises ValueError for a gene name
                # that never made it into the gene table
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                maxGene = geneList[maxIndex]
                maxSig = sigVector[maxIndex]
                if maxSig == 0.0:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
            except ValueError:
                if len(geneList) == 1:
                    maxGene = geneList[0]
                    maxSig = 'NONE'
                else:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
        else:
            maxGene = 'NONE'
            maxSig = 'NONE'

        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable
    else:
        enhancerOrder = utils.order([int(line[-2])
                                     for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        sortedTopGeneTable = [enhancerToTopGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i + 1)])
            sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

        return sortedTable, sortedTopGeneTable, geneToEnhancerTable


def _mapBamAtEnhancerGenes(bamliquidator_path, gffFile, outputFolder, bamFile, label):
    '''
    runs bamliquidator_batch.py for one bam over the regions in gffFile
    label ('rankby' or 'control') only appears in the progress message
    exits with status 1 if the command writes nothing to stdout
    '''
    cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (gffFile, outputFolder, bamFile)
    print("Mapping %s bam %s" % (label, bamFile))
    print(cmd)

    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    stdout = process.communicate()[0]
    if len(stdout) > 0:  # test if mapping worked correctly
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (gffFile, bamFile))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (gffFile, bamFile))
        # non zero status so that a calling script can see the failure
        sys.exit(1)
def findCanidateTFs(annotationFile, superLoci, expressedNM, TFlist, refseqToNameDict, projectFolder, projectName): ''' find all TFs within 1Mb of the super-enhancer center that are considered expressed return a dictionary keyed by TF that points to a list of super-enhancer loci ''' print 'FINDING CANIDATE TFs' startDict = utils.makeStartDict(annotationFile) # Find the location of the TSS of all transcripts (NMid) considered expressed tssLoci = [] for geneID in expressedNM: tssLoci.append(utils.makeTSSLocus(geneID,startDict,0,0)) tssCollection = utils.LocusCollection(tssLoci,50) # Assign all transcripts (NMid) that are TFs to a super-enhancer if it is the closest gene seAssignment = [] seAssignmentGene = [] TFandSuperDict = {} for superEnh in superLoci: seCenter = (superEnh.start() + superEnh.end()) / 2 # Find all transcripts whose TSS occur within 1Mb of the SE center searchLocus = utils.Locus(superEnh.chr(), superEnh.start()-1000000, superEnh.end()+1000000, '.') allEnhancerLoci = tssCollection.getOverlap(searchLocus) allEnhancerGenes = [locus.ID() for locus in allEnhancerLoci] # Find the transcript that is closest to the center if allEnhancerGenes: distList = [abs(seCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes] closestGene = allEnhancerGenes[distList.index(min(distList))] else: closestGene = '' seAssignment.append([superEnh.chr(), superEnh.start(), superEnh.end(), closestGene]) # Select the transcript if it is a TF, and allow for a TF to have multiple SEs if closestGene in TFlist and closestGene not in TFandSuperDict.keys(): TFandSuperDict[closestGene] = [superEnh] elif closestGene in TFlist and closestGene in TFandSuperDict.keys(): TFandSuperDict[closestGene].append(superEnh) # Convert the selected TF NMids to gene names if closestGene != '': geneName = refseqToNameDict[closestGene] seAssignmentGene.append([superEnh.chr(), superEnh.start(), superEnh.end(), geneName]) # Output the list of SE-assigned transcripts (NMids) seAssignmentFile = 
projectFolder + projectName + '_SE_ASSIGNMENT_TRANSCRIPT.txt' utils.unParseTable(seAssignment, seAssignmentFile, '\t') # Output the list of SE-assigned genes seAssignmentGeneFile = projectFolder + projectName + '_SE_ASSIGNMENT_GENE.txt' utils.unParseTable(seAssignmentGene, seAssignmentGeneFile, '\t') print 'Number of canidate TFs:', len(TFandSuperDict) return TFandSuperDict
def regionStitching(referenceCollection, name, outFolder, stitchWindow, tssWindow, annotFile, removeTSS=True):
    '''
    stitches the loci of referenceCollection together.
    with removeTSS, loci contained in a +/- tssWindow region around a TSS
    are removed from referenceCollection first, and any stitched locus that
    overlaps the TSS (+/- 50bp) of more than 2 gene names is replaced by
    the loci it was stitched from.
    an empty string as stitchWindow means the window is picked by
    optimizeStitching.
    returns the resulting collection, the debug rows and the stitch window
    '''
    print('PERFORMING REGION STITCHING')
    debugOutput = []

    if removeTSS:
        print('REMOVING TSS FROM REGIONS USING AN EXCLUSION WINDOW OF %sBP' % (tssWindow))
        startDict = utils.makeStartDict(annotFile)

        # exclusion zones: +/- tssWindow around each TSS in the annotation
        # (50 is the internal parameter for LocusCollection)
        exclusionCollection = utils.LocusCollection(
            [utils.makeTSSLocus(geneID, startDict, tssWindow, tssWindow)
             for geneID in startDict],
            50)

        # drop every region that sits entirely inside an exclusion zone
        containedCount = 0
        for boundLocus in referenceCollection.getLoci():
            if not tssCollectionHas(exclusionCollection, boundLocus):
                continue
            referenceCollection.remove(boundLocus)
            debugOutput.append([boundLocus.__str__(), boundLocus.ID(), 'CONTAINED'])
            containedCount += 1

        print('REMOVED %s LOCI BECAUSE THEY WERE CONTAINED BY A TSS' % (containedCount))

    # no window given: work one out on a copy of the collection
    if stitchWindow == '':
        print('DETERMINING OPTIMUM STITCHING PARAMTER')
        optCollection = copy.deepcopy(referenceCollection)
        stitchWindow = optimizeStitching(optCollection, name, outFolder, stepSize=500)
    print('USING A STITCHING PARAMETER OF %s' % stitchWindow)
    stitchedCollection = referenceCollection.stitchCollection(stitchWindow, 'both')

    if not removeTSS:
        return stitchedCollection, debugOutput, stitchWindow

    # TSS +/- 50bp, used to count the genes under each stitched region
    narrowTSSCollection = utils.LocusCollection(
        [utils.makeTSSLocus(geneID, startDict, 50, 50) for geneID in startDict],
        50)

    keptLoci = []
    splitCount = 0
    restoredCount = 0
    for stitchedLocus in stitchedCollection.getLoci():
        tssHits = narrowTSSCollection.getOverlap(stitchedLocus, 'both')
        geneNames = utils.uniquify(
            [startDict[tssHit.ID()]['name'] for tssHit in tssHits])

        if len(geneNames) <= 2:
            keptLoci.append(stitchedLocus)
            continue

        # too many genes: fall back to the loci that were stitched together
        sourceLoci = referenceCollection.getOverlap(stitchedLocus, 'both')
        restoredCount += len(sourceLoci)
        keptLoci.extend(sourceLoci)
        debugOutput.append([stitchedLocus.__str__(), stitchedLocus.ID(), 'MULTIPLE_TSS'])
        splitCount += 1

    print('REMOVED %s STITCHED LOCI BECAUSE THEY OVERLAPPED MULTIPLE TSSs' % (splitCount))
    print('ADDED BACK %s ORIGINAL LOCI' % (restoredCount))

    fixedCollection = utils.LocusCollection(keptLoci, 50)
    return fixedCollection, debugOutput, stitchWindow


def tssCollectionHas(exclusionCollection, locus):
    '''True if at least one locus of exclusionCollection contains locus'''
    return len(exclusionCollection.getContainers(locus, 'both')) > 0
def make_mycn_stats_table(nb_all_chip_dataFile,outFile):

    '''
    making a table of conserved mycn peaks w/ some additional stats
    mycn and h3k27ac signal is avg. background normalized across 4 samples
    active tss defined as the union of all H3K27ac occupied promoters in NB
    active enhancers defined as the union of all H3K27ac sites outside of promoters

    nb_all_chip_dataFile: data table loaded with pipeline_dfci.loadDataTable
    outFile: path the table is written to

    relies on the module level folders (signalFolder, projectFolder,
    geneListFolder), annotFile and genomeDirectory

    returns outFile
    exits with status 1 if a signal table has duplicated identifiers in
    column 1 or an enhancer line is too short
    '''

    dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)

    print('SETTING UP OUTPUT TABLE')
    outTable = [['PEAK_ID','CHROM','START','STOP','LENGTH','ACTIVE_TSS_OVERLAP','ENHANCER_OVERLAP','CPG_ISLAND_OVERLAP','CPG_ISLAND_FRACTION','GC_FREQ','MYCN_RANK','AVG_MYCN_SIGNAL','AVG_H3K27AC_SIGNAL','CANON_EBOX_COUNT','NONCANON_EBOX_COUNT','TOTAL_EBOX_COUNT','CANON_EXP','NON_CANON_EXP','GABPA_COUNT','GABPA_EXP','GATA_COUNT','GATA_EXP']]

    dinuc = nmers(2,['A','T','G','C'])

    #input files
    mycnSignalFile = '%sHG19_NB_MYCN_CONSERVED_-0_+0_NB_ALL_SIGNAL.txt' % (signalFolder)
    h3k27acSignalFile = '%sHG19_NB_MYCN_CONSERVED_-500_+500_NB_ALL_SIGNAL.txt' % (signalFolder)
    mycnRankFile = '%smeta_rose/NB_MYCN/NB_MYCN_0KB_STITCHED_ENHANCER_REGION_RANK_CONSERVED.txt' % (projectFolder)
    activeGeneFile = '%sHG19_NB_H3K27AC_ACTIVE_UNION.txt' % (geneListFolder)

    #note, this is the ucsc hg19 cpg islands extended file
    #to download and format run ./beds/download_cpg.sh
    cpgFile = '%sbeds/hg19_cpg_islands.bed' % (projectFolder)
    enhancerFile = '%smeta_rose/NB_H3K27AC/NB_H3K27AC_AllEnhancers.table.txt' % (projectFolder)

    print('LOADING MYCN BINDING DATA')
    mycnSignalTable = utils.parseTable(mycnSignalFile,'\t')

    #making a signal dictionary for MYCN binding
    names_list = ['BE2C_MYCN','KELLY_MYCN','NGP_MYCN','SHEP21_0HR_MYCN_NOSPIKE']
    mycn_sig_dict = _meanBackgroundSubtractedSignal(mycnSignalTable,mycnSignalFile,names_list,dataDict)

    print('LOADING MYCN RANK DATA')
    mycnRankTable = utils.parseTable(mycnRankFile,'\t')

    print('LOADING H3K27AC BINDING DATA')
    h3k27acSignalTable = utils.parseTable(h3k27acSignalFile,'\t')

    #making a signal dictionary for background subtracted H3K27ac binding
    names_list = ['BE2C_H3K27AC','KELLY_H3K27AC','NGP_H3K27AC','SHEP21_0HR_H3K27AC_NOSPIKE']
    h3k27ac_sig_dict = _meanBackgroundSubtractedSignal(h3k27acSignalTable,h3k27acSignalFile,names_list,dataDict)

    #making the cpg collection
    print('LOADING CPGS ISLANDS')
    cpgBed = utils.parseTable(cpgFile,'\t')
    cpgLoci = []
    for line in cpgBed:
        cpgLoci.append(utils.Locus(line[0],line[1],line[2],'.',line[-1]))
    cpgCollection = utils.LocusCollection(cpgLoci,50)

    #next make the tss collection of active promoters
    print('LOADING ACTIVE PROMOTERS')
    startDict = utils.makeStartDict(annotFile)
    activeTable = utils.parseTable(activeGeneFile,'\t')
    tss_1kb_loci = []
    for line in activeTable:
        tss_1kb_loci.append(utils.makeTSSLocus(line[1],startDict,1000,1000))
    tss_1kb_collection = utils.LocusCollection(tss_1kb_loci,50)

    #enhancer file
    print("LOADING ACTIVE ENHANCERS")
    enhancerTable = utils.parseTable(enhancerFile,'\t')
    print('STARTING WITH THE FOLLOWING NUMBER OF ENHANCERS IN NB')
    print(len(enhancerTable) - 6)
    enhancerLoci = []
    for line in enhancerTable:
        #skip comment lines and the header (first character 'R')
        if line[0][0] != '#' and line[0][0] != 'R':
            try:
                lineLocus = utils.Locus(line[1],int(line[2]),int(line[3]),'.',line[0])
                enhancerLoci.append(lineLocus)
            except IndexError:
                print(line)
                sys.exit(1)
    enhancerCollection = utils.LocusCollection(enhancerLoci,50)

    print('CLASSIFYING MYCN PEAKS')
    ticker = 0
    for i in range(1,len(mycnSignalTable)):
        if ticker%100 == 0:
            print(ticker)
        ticker +=1

        line = mycnSignalTable[i]
        mycn_signal = round(mycn_sig_dict[line[0]],4)
        h3k27ac_signal = round(h3k27ac_sig_dict[line[0]],4)

        #column 1 is a locus string: chrom before '(' and start-stop after ':'
        peakID = line[0]
        locusString = line[1]
        chrom = locusString.split('(')[0]
        [start,stop] = [int(x) for x in line[1].split(':')[-1].split('-')]
        lineLocus = utils.Locus(chrom,start,stop,'.',peakID)

        tssOverlap = 0
        if tss_1kb_collection.getOverlap(lineLocus,'both'):
            tssOverlap = 1

        #enhancer overlap only counts for peaks that are not at an active tss
        enhancerOverlap = 0
        if enhancerCollection.getOverlap(lineLocus,'both') and tssOverlap == 0:
            enhancerOverlap = 1

        overlappingCpGLoci = cpgCollection.getOverlap(lineLocus,'both')
        cpgIslandOverlap = 0
        if overlappingCpGLoci:
            cpgIslandOverlap = 1

        #now do fractional cpgOverlap
        overlappingBases = 0
        for locus in overlappingCpGLoci:
            cpgStart = max(locus.start(),lineLocus.start())
            cpgEnd = min(locus.end(),lineLocus.end())
            overlappingBases += (cpgEnd-cpgStart)
        overlapFraction = round(float(overlappingBases)/lineLocus.len(),2)

        #now get the seq
        lineSeq = utils.fetchSeq(genomeDirectory,chrom,start,stop,True).upper()
        gcFreq = round(float(lineSeq.count('GC') + lineSeq.count('CG'))/len(lineSeq),2)

        #frequency of each dinucleotide in this peak
        #str.count does not count overlapping matches, so runs such as AAA
        #contribute a single AA
        dinuc_dict = {}
        for nmer in dinuc:
            dinuc_dict[nmer] = float(lineSeq.count(nmer))/len(lineSeq)

        #assumes mycnRankTable has the same row order as mycnSignalTable - TODO confirm
        mycnRankLine = mycnRankTable[i]
        mycnRank = numpy.mean([float(x) for x in mycnRankLine[6:]])

        canonMatchList = re.findall('CACGTG',lineSeq)
        canon_count = len(canonMatchList)

        eboxMatchList = re.findall('CA..TG',lineSeq)
        ebox_count = len(eboxMatchList)

        non_canon_count = ebox_count-canon_count

        #get the expected values
        canon_exp = dinuc_dict['CA']*dinuc_dict['CG']*dinuc_dict['TG']*(len(lineSeq) - 5)
        canon_exp = round(canon_exp,2)
        notCG = 1- dinuc_dict['CG']
        non_exp = dinuc_dict['CA']*notCG*dinuc_dict['TG']*(len(lineSeq) - 5)
        non_exp = round(non_exp,2)

        #for gata and GABPA
        gabpaMatchList = re.findall('CGGAAG',lineSeq) + re.findall('CTTCCG',lineSeq)
        gabpa_count = len(gabpaMatchList)

        gabpa_exp_f = dinuc_dict['CG'] * dinuc_dict['GA'] * dinuc_dict['AG']*(len(lineSeq) - 5)
        gabpa_exp_r = dinuc_dict['CT'] * dinuc_dict['TC'] * dinuc_dict['CG']*(len(lineSeq) - 5)

        gabpa_exp = round(gabpa_exp_f,2) + round(gabpa_exp_r,2)

        gataMatchList = re.findall('GATAA',lineSeq) + re.findall('TTATC',lineSeq)
        gata_count = len(gataMatchList)

        #NOTE(review): an_freq/cn_freq are 1 minus the summed frequency of the
        #dinucleotides starting with A/C, and the 5mer expectation uses
        #len - 5 like the 6mers. both kept as they were - confirm intent
        an_freq = 1 - dinuc_dict['AA'] - dinuc_dict['AT'] - dinuc_dict['AG'] -dinuc_dict['AC']
        cn_freq = 1 - dinuc_dict['CA'] - dinuc_dict['CT'] - dinuc_dict['CG'] -dinuc_dict['CC']
        gata_exp_f = dinuc_dict['GA'] * dinuc_dict['TA'] * an_freq*(len(lineSeq) - 5)
        gata_exp_r = dinuc_dict['TT'] * dinuc_dict['AT'] * cn_freq*(len(lineSeq) - 5)
        gata_exp = round(gata_exp_f,2) + round(gata_exp_r,2)

        newLine = [peakID,chrom,start,stop,lineLocus.len(),tssOverlap,enhancerOverlap,cpgIslandOverlap,overlapFraction,gcFreq,mycnRank,mycn_signal,h3k27ac_signal,canon_count,non_canon_count,ebox_count,canon_exp,non_exp,gabpa_count,gabpa_exp,gata_count,gata_exp]
        outTable.append(newLine)

    utils.unParseTable(outTable,outFile,'\t')

    return outFile


def _meanBackgroundSubtractedSignal(signalTable,signalFile,names_list,dataDict):

    '''
    for every row of signalTable (header in row 0, identifier in column 0)
    averages chip minus background signal over the datasets in names_list
    the background column of a dataset is looked up via dataDict[name]['background']
    signalFile is only used in the error message
    returns a dictionary keyed by row identifier
    '''

    background_list = [dataDict[name]['background'] for name in names_list]
    header = signalTable[0]
    chip_columns = [header.index(name) for name in names_list]
    background_columns = [header.index(background_name) for background_name in background_list]

    #this only works if the first column are unique identifiers
    if len(signalTable) != len(utils.uniquify([line[0] for line in signalTable])):
        print('Error: Column 1 of %s must contain unique identifiers.' % (signalFile))
        sys.exit(1)

    sig_dict = {}
    for line in signalTable[1:]:
        line_sig = [float(line[chip_col]) - float(line[background_col])
                    for chip_col,background_col in zip(chip_columns,background_columns)]
        sig_dict[line[0]] = numpy.mean(line_sig)

    return sig_dict
def loadAnnotFile(genome, window, geneList=None, skip_cache=False):
    """
    Load the annotation for a genome and build a startDict and a TSS
    LocusCollection for a set of refseq IDs.

    20170213, added by Quanhu Sheng: also return validGenes.

    Args:
        genome: genome build name, matched case-insensitively against the
            keys of genomeDict below. An unknown build raises KeyError.
        window: value handed to utils.makeTSSLocus as both window arguments.
        geneList: refseq IDs to build TSS loci for. Defaults to an empty
            list, in which case no TSS loci are created.
        skip_cache: when True, the on-disk cache is neither read nor written.

    Returns:
        (startDict, tssCollection, validGenes), where validGenes holds the
        entries of geneList (in order) that are present in startDict.
    """
    if geneList is None:
        geneList = []

    genomeDict = {
        'HG18': 'annotation/hg18_refseq.ucsc',
        'MM9': 'annotation/mm9_refseq.ucsc',
        'MM10': 'annotation/mm10_refseq.ucsc',
        'HG19': 'annotation/hg19_refseq.ucsc',
        'HG19_RIBO': 'annotation/hg19_refseq.ucsc',
        'RN4': 'annotation/rn4_refseq.ucsc',
        'RN6': 'annotation/rn6_refseq.ucsc',
    }

    annotFile = whereAmI + '/' + genomeDict[genome.upper()]

    if not skip_cache:
        # The cache key must cover everything the result depends on: the
        # annotation file (location and content), the window and the gene
        # list. Keying on the annotation alone would hand back a TSS
        # collection built for a different window or gene set.
        annotPathHash = zlib.crc32(annotFile) & 0xFFFFFFFF
        with open(annotFile, "rb") as annot_fh:
            annotFileHash = zlib.crc32(annot_fh.read()) & 0xFFFFFFFF
        geneListHash = zlib.crc32(','.join([str(gene) for gene in geneList])) & 0xFFFFFFFF

        cache_file_name = "%s.%s.%s.%s.%s.cache" % (
            genome, annotPathHash, annotFileHash, window, geneListHash)
        cache_file_path = '%s/%s' % (tempfile.gettempdir(), cache_file_name)

        if os.path.isfile(cache_file_path):
            # Cache exists! Load it!
            # NOTE(review): this unpickles a file from the shared temp
            # directory; pickle is not safe against files written by
            # untrusted users.
            try:
                print('\tLoading genome data from cache.')
                with open(cache_file_path, 'rb') as cache_fh:
                    cached_data = cPickle.load(cache_fh)
                # Only accept the layout this function returns, so callers
                # always get a 3-tuple.
                if isinstance(cached_data, tuple) and len(cached_data) == 3:
                    print('\tCache loaded.')
                    return cached_data
                print('\tWARNING: Cache has an unexpected layout. Ignoring.')
            except (IOError, EOFError, cPickle.UnpicklingError):
                # Pickle corrupt or truncated? Fall through and rebuild it.
                print('\tWARNING: Cache corrupt or unreadable. Ignoring.')
        else:
            print('\tNo cache exists: Loading annotation (slow).')

    # We're still here, so either caching was disabled, or no usable cache exists
    startDict = utils.makeStartDict(annotFile, geneList)
    tssLoci = []
    validGenes = []
    for gene in geneList:
        if gene in startDict:
            tssLoci.append(utils.makeTSSLocus(gene, startDict, window, window))
            validGenes.append(gene)
        else:
            print('\tWARNING: gene %s not in annotation database. Ignoring.' % gene)

    tssCollection = utils.LocusCollection(tssLoci, 50)

    if not skip_cache:
        print('Writing cache for the first time.')
        with open(cache_file_path, 'wb') as cache_fh:
            # Store exactly what is returned so a later cache hit has the
            # same shape as a fresh computation.
            cPickle.dump((startDict, tssCollection, validGenes), cache_fh, cPickle.HIGHEST_PROTOCOL)

    return startDict, tssCollection, validGenes
def main():
    '''
    main run call

    Parses the command line, stitches the input regions, maps every bam file
    to the stitched and to the original regions with bamliquidator_batch.py,
    writes the enhancer region map, then runs ROSE2_callSuper.R and
    ROSE2_geneMapper.py on the resulting tables.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-b", "--bams", dest="bams", nargs=1, default=None,
                      help="Enter a comma separated list of additional bam files to map to")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="bamfile to use as the control")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")
    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        sys.exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE
    inputExtension = options.input.split('.')[-1]
    if inputExtension == 'bed':
        # CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    else:
        if inputExtension != 'gff':
            print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
        # COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # GETTING THE LIST OF BAMFILES TO PROCESS
    bamFileList = [options.rankby]
    if options.control:
        bamFileList.append(options.control)
    if options.bams:
        bamFileList += options.bams.split(',')
        bamFileList = utils.uniquify(bamFileList)

    # optional args

    # Stitch parameter: '' means "determine the optimal value later"
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    # annotation files are looked up relative to the current working directory
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }

    annotFile = genomeDict[genome.upper()]

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFFFile)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        # if it's a bed file
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            # non-zero status so calling pipelines can detect the failure
            sys.exit(1)
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci
                        if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    # NOTE(review): the GFF path is passed here, not the (possibly masked)
    # referenceCollection built above -- confirm this matches the first
    # parameter expected by regionStitching.
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        inputGFFFile, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)
    # making sure start/stop ordering are correct
    for line in stitchedGFF:
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    if debug:
        print(stitchWindow)
        print(type(stitchWindow))

    # output names carry the stitching distance in KB and, when TSS regions
    # were excluded, a _TSS_DISTAL suffix
    if removeTSS:
        stitchSuffix = '_TSS_DISTAL'
    else:
        stitchSuffix = ''
    stitchedGFFName = '%s_%sKB_STITCHED%s' % (inputName, str(stitchWindow / 1000), stitchSuffix)
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1))

    # Try to use the bamliquidator_batch.py script on the cluster, otherwise
    # fall back to a copy in the current directory, otherwise fail.
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    # each bam is mapped first to the stitched GFF, then to the original GFF
    regionSets = [
        (stitchedGFFName, stitchedGFFFile),
        (inputName, inputGFFFile),
    ]
    for bamFile in bamFileList:
        bamFileName = bamFile.split('/')[-1]

        for regionName, regionGFFFile in regionSets:
            mappedOutFolder = '%s%s_%s_MAPPED' % (mappedFolder, regionName, bamFileName)
            mappedOutFile = '%s/matrix.gff' % (mappedOutFolder)
            if utils.checkOutput(mappedOutFile, 0.2, 0.2):
                print("FOUND %s MAPPING DATA FOR BAM: %s" % (regionGFFFile, mappedOutFile))
                continue

            mapCmd = "python %s --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                bamliquidator_path, regionGFFFile, mappedOutFolder, bamFile)
            print(mapCmd)

            mapProcess = subprocess.Popen(mapCmd, stdout=subprocess.PIPE, shell=True)
            mapOutput = mapProcess.communicate()
            # mapping is considered successful when it wrote anything to stdout
            if len(mapOutput[0]) > 0:
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (regionGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (regionGFFFile, bamFileName))
                sys.exit(1)

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList,
                  mappedFolder, outputFile1, refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    if options.control:
        controlName = options.control.split('/')[-1]
    else:
        controlName = 'NONE'
    cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (outFolder, outputFile1, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    time.sleep(20)
    for tableKind in ['SuperEnhancers', 'StretchEnhancers', 'SuperStretchEnhancers']:
        tableFile = "%s_%s.table.txt" % (inputName, tableKind)
        if options.control:
            cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (
                genome, options.rankby, options.control, outFolder, tableFile)
        else:
            cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (
                genome, options.rankby, outFolder, tableFile)
        os.system(cmd)
def regionStitching(referenceCollection, name, outFolder, stitchWindow, tssWindow, annotFile, removeTSS=True):
    """
    Stitch the loci of referenceCollection together.

    When removeTSS is set, loci contained in a +/- tssWindow region around
    any TSS in annotFile are first removed from referenceCollection (the
    collection passed in is modified). After stitching, any stitched locus
    overlapping more than two distinct gene names is replaced by the original
    loci it overlaps. An empty-string stitchWindow triggers optimizeStitching
    on a deep copy of the collection.

    Returns (collection, debugOutput, stitchWindow); debugOutput rows are
    [locus string, locus ID, reason].
    """
    print('PERFORMING REGION STITCHING')
    debugOutput = []

    # drop every bound region that sits inside a TSS exclusion zone
    if removeTSS:
        print('REMOVING TSS FROM REGIONS USING AN EXCLUSION WINDOW OF %sBP' % (tssWindow))

        startDict = utils.makeStartDict(annotFile)

        # one locus of +/- tssWindow per annotated gene
        exclusionLoci = [
            utils.makeTSSLocus(geneID, startDict, tssWindow, tssWindow)
            for geneID in startDict.keys()
        ]
        # 50 is the internal parameter for LocusCollection and doesn't really matter
        exclusionCollection = utils.LocusCollection(exclusionLoci, 50)

        containedCount = 0
        for boundLocus in referenceCollection.getLoci():
            if len(exclusionCollection.getContainers(boundLocus, 'both')) == 0:
                continue
            # fully contained by an exclusion zone: remove and record it
            referenceCollection.remove(boundLocus)
            debugOutput.append([boundLocus.__str__(), boundLocus.ID(), 'CONTAINED'])
            containedCount += 1

        print('REMOVED %s LOCI BECAUSE THEY WERE CONTAINED BY A TSS' % (containedCount))

    # an empty stitch window means the caller wants it determined here
    if stitchWindow == '':
        print('DETERMINING OPTIMUM STITCHING PARAMTER')
        optCollection = copy.deepcopy(referenceCollection)
        stitchWindow = optimizeStitching(optCollection, name, outFolder, stepSize=500)
    print('USING A STITCHING PARAMETER OF %s' % stitchWindow)
    stitchedCollection = referenceCollection.stitchCollection(stitchWindow, 'both')

    if not removeTSS:
        return stitchedCollection, debugOutput, stitchWindow

    # replace stitched regions spanning several genes with the original loci
    narrowTssLoci = [
        utils.makeTSSLocus(geneID, startDict, 50, 50)
        for geneID in startDict.keys()
    ]
    narrowTssCollection = utils.LocusCollection(narrowTssLoci, 50)

    keptLoci = []
    splitCount = 0
    restoredCount = 0
    for stitchedLocus in stitchedCollection.getLoci():
        tssHits = narrowTssCollection.getOverlap(stitchedLocus, 'both')
        geneNames = []
        for tssHit in tssHits:
            geneNames.append(startDict[tssHit.ID()]['name'])
        geneNames = utils.uniquify(geneNames)

        if len(geneNames) <= 2:
            keptLoci.append(stitchedLocus)
            continue

        # too many distinct genes: fall back to the unstitched loci
        originalLoci = referenceCollection.getOverlap(stitchedLocus, 'both')
        restoredCount += len(originalLoci)
        keptLoci += originalLoci
        debugOutput.append([stitchedLocus.__str__(), stitchedLocus.ID(), 'MULTIPLE_TSS'])
        splitCount += 1

    print('REMOVED %s STITCHED LOCI BECAUSE THEY OVERLAPPED MULTIPLE TSSs' % (splitCount))
    print('ADDED BACK %s ORIGINAL LOCI' % (restoredCount))
    fixedCollection = utils.LocusCollection(keptLoci, 50)
    return fixedCollection, debugOutput, stitchWindow
#================================================================================ #===================================CLASSES====================================== #================================================================================ #user defined classes here #================================================================================ #=================================FUNCTIONS====================================== #================================================================================ #write your specific functions here annotFile = '/storage/goodell/home/jmreyes/pipeline/annotation/%s_refseq.ucsc' % ( genome) startDict = utils.makeStartDict(annotFile) startLoci = [] #for TR, -30, +300 and genebody +0 for gene in startDict.keys(): geneChrom = startDict[gene]['chr'] geneStart = startDict[gene]['start'] geneEnd = startDict[gene]['end'] geneSense = startDict[gene]['sense'] # newLocus = [geneChrom, gene, '', geneStart] newLocus = utils.makeTSSLocus(gene, startDict, 0, 0) startLoci.append([ newLocus.chr(),
def loadAnnotFile(genome, window, geneList=None, skip_cache=False):
    """
    Load the annotation for a genome and build a startDict and a TSS
    LocusCollection for a set of refseq IDs.

    Args:
        genome: genome build name, matched case-insensitively. It must be a
            key of both genomeDict and genomeDirectoryDict below, otherwise
            a KeyError is raised.
        window: value handed to utils.makeTSSLocus as both window arguments.
        geneList: refseq IDs to build TSS loci for. When empty (the
            default), every gene in startDict is used.
        skip_cache: when True, the on-disk cache is neither read nor written.

    Returns:
        (startDict, tssCollection, genomeDirectory, chrom_list,
        mouse_convert_dict). chrom_list holds the unique file-name prefixes
        (text before the first '.') found in genomeDirectory;
        mouse_convert_dict maps column 4 to column 0 of
        annotation/HMD_HumanPhenotype.rpt.
    """
    if geneList is None:
        geneList = []

    genomeDict = {
        'HG18': 'annotation/hg18_refseq.ucsc',
        'MM9': 'annotation/mm9_refseq.ucsc',
        'MM10': 'annotation/mm10_refseq.ucsc',
        'HG19': 'annotation/hg19_refseq.ucsc',
        'HG19_RIBO': 'annotation/hg19_refseq.ucsc',
        'RN4': 'annotation/rn4_refseq.ucsc',
        'RN6': 'annotation/rn6_refseq.ucsc',
        'HG38': 'annotation/hg38_refseq.ucsc',
    }

    genomeDirectoryDict = {
        'HG19': '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/',
        'RN6': '/storage/cylin/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Chromosomes/',
        'MM9': '/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm9/Sequence/Chromosomes/',
        'MM10': '/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm10/Sequence/Chromosomes/',
        'HG38': '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg38/Sequence/Chromosomes/',
    }

    # making a dictionary for mouse to human conversion
    mouse_convert_file = '%s/annotation/HMD_HumanPhenotype.rpt' % (whereAmI)
    mouse_convert_dict = defaultdict(str)
    mouse_convert_table = utils.parseTable(mouse_convert_file, '\t')
    for line in mouse_convert_table:
        mouse_convert_dict[line[4]] = line[0]

    genomeDirectory = genomeDirectoryDict[genome.upper()]

    # making a chrom_list that is a list of all chroms with sequence
    chrom_list = utils.uniquify([
        name.split('.')[0] for name in os.listdir(genomeDirectory)
        if len(name) > 0
    ])

    annotFile = whereAmI + '/' + genomeDict[genome.upper()]

    if not skip_cache:
        # The cache key must cover everything the cached objects depend on:
        # the annotation file (location and content), the window and the
        # gene list. Keying on the annotation alone would hand back a TSS
        # collection built for a different window or gene set.
        annotPathHash = zlib.crc32(annotFile) & 0xFFFFFFFF
        with open(annotFile, "rb") as annot_fh:
            annotFileHash = zlib.crc32(annot_fh.read()) & 0xFFFFFFFF
        geneListHash = zlib.crc32(','.join([str(gene) for gene in geneList])) & 0xFFFFFFFF

        cache_file_name = "%s.%s.%s.%s.%s.cache" % (
            genome, annotPathHash, annotFileHash, window, geneListHash)
        cache_file_path = '%s/%s' % (tempfile.gettempdir(), cache_file_name)

        if os.path.isfile(cache_file_path):
            # Cache exists! Load it!
            # NOTE(review): this unpickles a file from the shared temp
            # directory; pickle is not safe against files written by
            # untrusted users.
            try:
                print('\tLoading genome data from cache.')
                with open(cache_file_path, 'rb') as cache_fh:
                    cached_data = cPickle.load(cache_fh)
                if isinstance(cached_data, tuple) and len(cached_data) == 2:
                    print('\tCache loaded.')
                    # Only the slow-to-build objects are cached; the other
                    # return values were recomputed above, so a cache hit
                    # returns the same 5-tuple as a fresh computation.
                    startDict, tssCollection = cached_data
                    return startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict
                print('\tWARNING: Cache has an unexpected layout. Ignoring.')
            except (IOError, EOFError, cPickle.UnpicklingError):
                # Pickle corrupt or truncated? Fall through and rebuild it.
                print('\tWARNING: Cache corrupt or unreadable. Ignoring.')
        else:
            print('\tNo cache exists: Loading annotation (slow).')

    # We're still here, so either caching was disabled, or no usable cache exists
    startDict = utils.makeStartDict(annotFile, geneList)
    if not geneList:
        # no explicit gene list: use every annotated gene
        geneList = list(startDict.keys())
    tssLoci = [utils.makeTSSLocus(gene, startDict, window, window) for gene in geneList]

    tssCollection = utils.LocusCollection(tssLoci, 50)

    if not skip_cache:
        print('Writing cache for the first time.')
        with open(cache_file_path, 'wb') as cache_fh:
            cPickle.dump((startDict, tssCollection), cache_fh, cPickle.HIGHEST_PROTOCOL)

    return startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict
def main():
    """
    Write one bash script per WGBS call file that runs methylPlot.py on
    1000-line chunks of the sorted significant-gene TSS BED file.

    For every entry of wgbsList the sorted BED is split with `split -l 1000`
    into <projectFolder>temp/<wgbsName>/bed/, one backgrounded methylPlot.py
    call is generated per chunk, and the calls (plus a commented-out `cat`
    line that would merge the per-chunk outputs) are written to a dated
    folder under <projectFolder>scripts/. The scripts are only written, not
    executed.
    """
    #get WGBS files
    projectFolder = '/storage/goodell/projects/jmreyes/amish_ayala/'
    wgbsList = [
        'dc5_mutant_BS.txt',
        'dc3_mutant_BS.txt',
        'dc15_WT_BS.txt',
        'dc16_WT_BS.txt',
    ]

    #import our genes of interest
    #the table is parsed with '\r' as separator, so its first row holds the
    #tab-delimited records
    sigGenesFile = utils.parseTable(
        projectFolder + 'tables/Amish_significant.txt', '\r')
    sigTable = [x.split('\t') for x in sigGenesFile[0]]
    sigGenes = [x[0] for x in sigTable]

    #make start dict containing all TSS start sites
    startDict = utils.makeStartDict(annotFile)

    #converter from gene name to refseq ID
    revDict = {}
    for refID in startDict:
        revDict[startDict[refID]['name']] = refID

    #get out subset of genes as +/- window loci around the TSS
    sigLoci = []
    window = 500
    for gene in sigGenes:
        #dict membership is O(1); genes missing from the annotation are skipped
        if gene not in revDict:
            continue
        refSeq = revDict[gene]
        geneChr = startDict[refSeq]['chr']
        geneStart = startDict[refSeq]['start']
        geneSense = startDict[refSeq]['sense']
        sigLoci.append([
            geneChr, geneStart[0] - window, geneStart[0] + window,
            geneSense, gene + ':' + refSeq
        ])

    #sigLoci is not written by default; the unsorted bed can be regenerated with:
    # utils.unParseTable(sigLoci, projectFolder+'bed/Amish_sigTSS_-500_+500.bed', '\t')

    sortedBed = projectFolder + 'bed/Amish_sigTSS_-500_+500.sorted.bed'

    binNumber = 200

    #one timestamp shared by every output name of this run
    now = datetime.datetime.fromtimestamp(time.time())
    timestamp = now.strftime('%Y%m%d_%Hh%Mm%Ss')
    dateFolder = projectFolder + 'scripts/' + now.strftime('%Y%m%d') + '/'
    utils.formatFolder(dateFolder, True)

    bedList = [sortedBed]

    #every methylPlot call is sent to the background
    sepMark = '&'

    for wgbsCalls in wgbsList:
        methylPlotBash = [['#!/usr/bin/bash']]
        catBash = []

        wgbsName = wgbsCalls.split('.')[0]
        outDir = projectFolder + 'temp/' + wgbsName + '/'
        outBed = projectFolder + 'temp/' + wgbsName + '/bed/'
        utils.formatFolder(outDir, True)
        utils.formatFolder(outBed, True)

        for bed in bedList:
            bedName = bed.split('.bed')[0].split('/')[-1]
            splitCmd = 'split -l 1000 %s %s' % (bed, outBed + bedName)
            os.system(splitCmd)

            bedSplitList = [x for x in os.listdir(outBed) if bedName in x]
            catBedList = []
            for bedChunk in bedSplitList:
                outName = wgbsName + '_' + bedChunk
                methylCall = 'python /storage/goodell/home/jmreyes/xwing/methylPlot.py -i %s -b %s -o %s -n %s %s' % (
                    projectFolder + 'wgbs/' + wgbsCalls, outBed + bedChunk,
                    outDir + outName, binNumber, sepMark)
                methylPlotBash.append([methylCall])
                catBedList.append(outDir + outName)

            #the merge step is emitted commented out ('#cat ...')
            catOut = projectFolder + 'mapped/' + wgbsName + '_' + bedName + '_' + timestamp + '_avgMethyl.txt'
            catCmd = '#cat %s > %s' % (' '.join(sorted(catBedList)), catOut)
            catBash.append([catCmd])

        outputBash = methylPlotBash + catBash
        utils.unParseTable(
            outputBash,
            dateFolder + wgbsName + '_TSS_mapping_' + timestamp + '.sh', '\t')