def checkRefCollection(referenceCollection):
    '''
    makes sure the names of all loci in the reference collection are unique
    '''
    namesList = [locus.ID() for locus in referenceCollection.getLoci()]
    if len(namesList) != len(ROSE_utils.uniquify(namesList)):
        print("ERROR: REGIONS HAVE NON-UNIQUE IDENTIFIERS")
        print("THE SECOND COLUMN OF THE INPUT .GFF OR THE FOURTH COLUMN OF THE INPUT .BED MUST HAVE A UNIQUE IDENTIFIER FOR EACH REGION")
        sys.exit()
    else:
        print("REFERENCE COLLECTION PASSES QC")
        return
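#----------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline): the QC above only checks that
# region identifiers are unique. The same idea with plain builtins, assuming a
# hypothetical list of (chrom, start, end, ID) tuples, looks like this.
from collections import Counter

def find_duplicate_ids(regions):
    '''Return the region IDs that occur more than once.'''
    counts = Counter(regionID for (chrom, start, end, regionID) in regions)
    return sorted([regionID for regionID, n in counts.items() if n > 1])

# example: the second column of a .gff (or fourth column of a .bed) must be unique
_example = [('chr1', 100, 200, 'peak_1'), ('chr1', 500, 700, 'peak_2'),
            ('chr2', 100, 300, 'peak_1')]
assert find_duplicate_ids(_example) == ['peak_1']
#----------------------------------------------------------------------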
def mapBamToGFF(bamFile,gff,sense = 'both',extension = 200,floor = 0,rpm = False,matrix = None): #def mapBamToGFF(bamFile,gff,sense = 'both',unique = 0,extension = 200,floor = 0,density = False,rpm = False,binSize = 25,clusterGram = None,matrix = None,raw = False,includeJxnReads = False): '''maps reads from a bam to a gff''' floor = int(floor) #USING BAM CLASS bam = ROSE_utils.Bam(bamFile) #new GFF to write to newGFF = [] #millionMappedReads if rpm: MMR= round(float(bam.getTotalReads('mapped'))/1000000,4) else: MMR = 1 print('using a MMR value of %s' % (MMR)) senseTrans = str.maketrans('-+.','+-+') if ROSE_utils.checkChrStatus(bamFile) == 1: print("has chr") hasChrFlag = 1 #sys.exit(); else: print("does not have chr") hasChrFlag = 0 #sys.exit() if type(gff) == str: gff = ROSE_utils.parseTable(gff,'\t') #setting up a maxtrix table newGFF.append(['GENE_ID','locusLine'] + ['bin_'+str(n)+'_'+bamFile.split('/')[-1] for n in range(1,int(matrix)+1,1)]) #getting and processing reads for gff lines ticker = 0 print('Number lines processed') for line in gff: line = line[0:9] if ticker%100 == 0: print(ticker) ticker+=1 if not hasChrFlag: line[0] = re.sub(r"chr",r"",line[0]) gffLocus = ROSE_utils.Locus(line[0],int(line[3]),int(line[4]),line[6],line[1]) #print(line[0]) #sys.exit() searchLocus = ROSE_utils.makeSearchLocus(gffLocus,int(extension),int(extension)) reads = bam.getReadsLocus(searchLocus,'both',False,'none') #now extend the reads and make a list of extended reads extendedReads = [] for locus in reads: if locus.sense() == '+' or locus.sense() == '.': locus = ROSE_utils.Locus(locus.chr(),locus.start(),locus.end()+extension,locus.sense(), locus.ID()) if locus.sense() == '-': locus = ROSE_utils.Locus(locus.chr(),locus.start()-extension,locus.end(),locus.sense(),locus.ID()) extendedReads.append(locus) if gffLocus.sense() == '+' or gffLocus.sense == '.': senseReads = filter(lambda x:x.sense() == '+' or x.sense() == '.',extendedReads) antiReads = filter(lambda x:x.sense() == '-',extendedReads) else: senseReads = filter(lambda x:x.sense() == '-' or x.sense() == '.',extendedReads) antiReads = filter(lambda x:x.sense() == '+',extendedReads) senseHash = defaultdict(int) antiHash = defaultdict(int) #filling in the readHashes if sense == '+' or sense == 'both' or sense =='.': for read in senseReads: for x in range(read.start(),read.end()+1,1): senseHash[x]+=1 if sense == '-' or sense == 'both' or sense == '.': #print('foo') for read in antiReads: for x in range(read.start(),read.end()+1,1): antiHash[x]+=1 #now apply flooring and filtering for coordinates keys = ROSE_utils.uniquify(senseHash.keys()+antiHash.keys()) if floor > 0: keys = filter(lambda x: (senseHash[x]+antiHash[x]) > floor,keys) #coordinate filtering keys = filter(lambda x: gffLocus.start() < x < gffLocus.end(),keys) #setting up the output table clusterLine = [gffLocus.ID(),gffLocus.__str__()] #getting the binsize binSize = (gffLocus.len()-1)/int(matrix) nBins = int(matrix) if binSize == 0: clusterLine+=['NA']*int(matrix) newGFF.append(clusterLine) continue n=0 if gffLocus.sense() == '+' or gffLocus.sense() =='.' 
or gffLocus.sense() == 'both': i = gffLocus.start() while n <nBins: n+=1 binKeys = filter(lambda x: i < x < i+binSize,keys) binDen = float(sum([senseHash[x]+antiHash[x] for x in binKeys]))/binSize clusterLine+=[round(binDen/MMR,4)] i = i+binSize else: i = gffLocus.end() while n < nBins: n+=1 binKeys = filter(lambda x: i-binSize < x < i,keys) binDen = float(sum([senseHash[x]+antiHash[x] for x in binKeys]))/binSize clusterLine+=[round(binDen/MMR,4)] i = i-binSize newGFF.append(clusterLine) return newGFF #===================================================================== #============================MAIN METHOD============================== #===================================================================== def main(): from optparse import OptionParser usage = "usage: %prog [options] -b [SORTED BAMFILE] -i [INPUTFILE] -o [OUTPUTFILE]" parser = OptionParser(usage = usage) #required flags parser.add_option("-b","--bam", dest="bam",nargs = 1, default=None, help = "Enter .bam file to be processed.") parser.add_option("-i","--input", dest="input",nargs = 1, default=None, help = "Enter .gff or ENRICHED REGION file to be processed.") #output flag parser.add_option("-o","--output", dest="output",nargs = 1, default=None, help = "Enter the output filename.") #additional options parser.add_option("-s","--sense", dest="sense",nargs = 1, default='both', help = "Map to '+','-' or 'both' strands. Default maps to both.") parser.add_option("-f","--floor", dest="floor",nargs =1, default=0, help = "Sets a read floor threshold necessary to count towards density") parser.add_option("-e","--extension", dest="extension",nargs = 1, default=200, help = "Extends reads by n bp. Default value is 200bp") parser.add_option("-r","--rpm", dest="rpm",action = 'store_true', default=False, help = "Normalizes density to reads per million (rpm)") parser.add_option("-m","--matrix", dest="matrix",nargs = 1, default=None, help = "Outputs a variable bin sized matrix. User must specify number of bins.") (options,args) = parser.parse_args() print(options) print(args) if options.bam: bamFile = options.bam fullPath = os.path.abspath(bamFile) bamName = fullPath.split('/')[-1].split('.')[0] pathFolder = '/'.join(fullPath.split('/')[0:-1]) fileList = os.listdir(pathFolder) hasBai = False for fileName in fileList: if fileName.count(bamName) == 1 and fileName.count('.bai') == 1: hasBai = True if not hasBai: print('ERROR: no associated .bai file found with bam. Must use a sorted bam with accompanying index file') parser.print_help() exit() if options.sense: if ['+','-','.','both'].count(options.sense) == 0: print('ERROR: sense flag must be followed by +,-,.,both') parser.print_help() exit() if options.matrix: try: int(options.matrix) except: print('ERROR: User must specify an integer bin number for matrix (try 50)') parser.print_help() exit() if options.input and options.bam: inputFile = options.input gffFile = inputFile bamFile = options.bam if options.output == None: output = os.getcwd() + inputFile.split('/')[-1]+'.mapped' else: output = options.output if options.matrix: print('mapping to GFF and making a matrix with fixed bin number') newGFF = mapBamToGFF(bamFile,gffFile,options.sense,int(options.extension),options.floor,options.rpm,options.matrix) ROSE_utils.unParseTable(newGFF,output,'\t') else: parser.print_help() if __name__ == "__main__": main()
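#----------------------------------------------------------------------
# Minimal sketch of the binning math used in mapBamToGFF above, with no
# dependency on ROSE_utils or a bam reader: given per-base read coverage over a
# region, split the region into nBins equal bins and report the mean coverage
# per bin, scaled by a million-mapped-reads (MMR) factor. The names and toy
# values here are illustrative, not the pipeline's API.
def binned_density(coverage, start, end, nBins, MMR=1.0):
    '''coverage: dict of genomic position -> read count inside [start, end].'''
    binSize = (end - start - 1) / nBins
    if binSize == 0:
        return ['NA'] * nBins
    densities = []
    i = start
    for _ in range(nBins):
        binKeys = [x for x in coverage if i < x < i + binSize]
        binDen = float(sum(coverage[x] for x in binKeys)) / binSize
        densities.append(round(binDen / MMR, 4))
        i += binSize
    return densities

# toy example: coverage of 1 read over positions 1000-1009, split into two bins
_cov = {p: 1 for p in range(1000, 1010)}
print(binned_density(_cov, 1000, 1200, 2))
#----------------------------------------------------------------------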
def mapEnhancerToGene(annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, byRefseq=False): ''' maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq ''' print("Herp") startDict = ROSE_utils.makeStartDict(annotFile) print("Derp") enhancerTable = ROSE_utils.parseTable(enhancerFile, '\t') if len(transcribedFile) > 0: transcribedTable = ROSE_utils.parseTable(transcribedFile, '\t') transcribedGenes = [line[1] for line in transcribedTable] else: transcribedGenes = list(startDict.keys()) print('MAKING TRANSCRIPT COLLECTION') transcribedCollection = ROSE_utils.makeTranscriptCollection( annotFile, 0, 0, 500, transcribedGenes) print('MAKING TSS COLLECTION') tssLoci = [] for geneID in transcribedGenes: tssLoci.append(ROSE_utils.makeTSSLocus(geneID, startDict, 0, 0)) #this turns the tssLoci list into a LocusCollection #50 is the internal parameter for LocusCollection and doesn't really matter tssCollection = ROSE_utils.LocusCollection(tssLoci, 50) geneDict = { 'overlapping': defaultdict(list), 'proximal': defaultdict(list) } #list of all genes that appear in this analysis overallGeneList = [] #set up the output tables #first by enhancer enhancerToGeneTable = [ enhancerTable[5][0:6] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + enhancerTable[5][-2:] ] #next by gene geneToEnhancerTable = [['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']] for line in enhancerTable[6:]: enhancerString = '%s:%s-%s' % (line[1], line[2], line[3]) enhancerLocus = ROSE_utils.Locus(line[1], line[2], line[3], '.', line[0]) #overlapping genes are transcribed genes whose transcript is directly in the stitchedLocus overlappingLoci = transcribedCollection.getOverlap( enhancerLocus, 'both') overlappingGenes = [] for overlapLocus in overlappingLoci: overlappingGenes.append(overlapLocus.ID()) #proximalGenes are transcribed genes where the tss is within 50kb of the boundary of the stitched loci proximalLoci = tssCollection.getOverlap( ROSE_utils.makeSearchLocus(enhancerLocus, 50000, 50000), 'both') proximalGenes = [] for proxLocus in proximalLoci: proximalGenes.append(proxLocus.ID()) distalLoci = tssCollection.getOverlap( ROSE_utils.makeSearchLocus(enhancerLocus, 50000000, 50000000), 'both') distalGenes = [] for proxLocus in distalLoci: distalGenes.append(proxLocus.ID()) overlappingGenes = ROSE_utils.uniquify(overlappingGenes) proximalGenes = ROSE_utils.uniquify(proximalGenes) distalGenes = ROSE_utils.uniquify(distalGenes) allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes #these checks make sure each gene list is unique. 
#technically it is possible for a gene to be overlapping, but not proximal since the #gene could be longer than the 50kb window, but we'll let that slide here for refID in overlappingGenes: if proximalGenes.count(refID) == 1: proximalGenes.remove(refID) for refID in proximalGenes: if distalGenes.count(refID) == 1: distalGenes.remove(refID) #Now find the closest gene if len(allEnhancerGenes) == 0: closestGene = '' else: #get enhancerCenter enhancerCenter = (int(line[2]) + int(line[3])) / 2 #get absolute distance to enhancer center distList = [ abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes ] #get the ID and convert to name #print enhancerCenter - startDict[geneID]['start'][0] #print distList.index(min(distList)) #print min(distList) #print len(distList) #print len(allEnhancerGenes[distList.index(min(distList))]) #print line #print len(startDict[allEnhancerGenes[distList.index(min(distList))]]) closestGene = startDict[allEnhancerGenes[distList.index( min(distList))]]['name'] #NOW WRITE THE ROW FOR THE ENHANCER TABLE newEnhancerLine = line[0:6] if byRefseq: newEnhancerLine.append(','.join( ROSE_utils.uniquify([x for x in overlappingGenes]))) newEnhancerLine.append(','.join( ROSE_utils.uniquify([x for x in proximalGenes]))) #print newEnhancerLine #print len(allEnhancerGenes) #print distList closestGene = allEnhancerGenes[distList.index(min(distList))] newEnhancerLine.append(closestGene) else: newEnhancerLine.append(','.join( ROSE_utils.uniquify( [startDict[x]['name'] for x in overlappingGenes]))) newEnhancerLine.append(','.join( ROSE_utils.uniquify( [startDict[x]['name'] for x in proximalGenes]))) closestGene = startDict[allEnhancerGenes[distList.index( min(distList))]]['name'] newEnhancerLine.append(closestGene) newEnhancerLine += line[-2:] enhancerToGeneTable.append(newEnhancerLine) #Now grab all overlapping and proximal genes for the gene ordered table overallGeneList += overlappingGenes for refID in overlappingGenes: geneDict['overlapping'][refID].append(enhancerString) overallGeneList += proximalGenes for refID in proximalGenes: geneDict['proximal'][refID].append(enhancerString) #End loop through #Make table by gene overallGeneList = ROSE_utils.uniquify(overallGeneList) nameOrder = ROSE_utils.order( [startDict[x]['name'] for x in overallGeneList]) usedNames = [] for i in nameOrder: refID = overallGeneList[i] geneName = startDict[refID]['name'] if usedNames.count(geneName) > 0 and uniqueGenes == True: continue else: usedNames.append(geneName) proxEnhancers = geneDict['proximal'][refID] + geneDict['overlapping'][ refID] newLine = [geneName, refID, ','.join(proxEnhancers)] geneToEnhancerTable.append(newLine) #re-sort enhancerToGeneTable enhancerOrder = ROSE_utils.order( [int(line[-2]) for line in enhancerToGeneTable[1:]]) sortedTable = [enhancerToGeneTable[0]] for i in enhancerOrder: sortedTable.append(enhancerToGeneTable[(i + 1)]) return sortedTable, geneToEnhancerTable
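#----------------------------------------------------------------------
# Sketch of the gene-assignment logic in mapEnhancerToGene above, stripped of
# the ROSE_utils collections: genes whose transcript overlaps the enhancer are
# 'overlapping', genes whose TSS falls within a window of the enhancer are
# 'proximal', and the closest gene is picked by TSS distance to the enhancer
# center. The tuples and the 50 kb window are illustrative assumptions.
def assign_genes(enhancer, genes, window=50000):
    '''enhancer: (start, end); genes: dict name -> (txStart, txEnd, tss).'''
    eStart, eEnd = enhancer
    center = (eStart + eEnd) / 2
    overlapping = [g for g, (s, e, tss) in genes.items() if s < eEnd and e > eStart]
    proximal = [g for g, (s, e, tss) in genes.items()
                if (eStart - window) < tss < (eEnd + window) and g not in overlapping]
    candidates = overlapping + proximal
    closest = min(candidates, key=lambda g: abs(center - genes[g][2])) if candidates else ''
    return overlapping, proximal, closest

# GENE_A overlaps the enhancer; GENE_B's TSS is within 50 kb, so it is proximal
_genes = {'GENE_A': (900, 1500, 900), 'GENE_B': (40000, 50000, 40000)}
print(assign_genes((1000, 2000), _genes))
#----------------------------------------------------------------------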
def main():
    '''
    main run call
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_ENHANCER_FILE]"
    parser = OptionParser(usage=usage)
    #required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a ROSE ranked enhancer or super-enhancer file")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19,HG38)")

    #optional flags
    parser.add_option("-l", "--list", dest="geneList", nargs=1, default=None,
                      help="Enter a gene list to filter through")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder. Default will be same folder as input file")
    parser.add_option("-r", "--refseq", dest="refseq", action='store_true', default=False,
                      help="If flagged will write output by refseq ID and not common name")

    #RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.genome:
        parser.print_help()
        exit()

    #GETTING THE INPUT
    enhancerFile = options.input

    #making the out folder if it doesn't exist
    if options.out:
        outFolder = ROSE_utils.formatFolder(options.out, True)
    else:
        outFolder = '/'.join(enhancerFile.split('/')[0:-1]) + '/'

    #GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    #GETTING THE CORRECT ANNOT FILE
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'HG38': '%s/annotation/hg38_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }

    annotFile = genomeDict[genome.upper()]

    #GETTING THE TRANSCRIBED LIST
    if options.geneList:
        transcribedFile = options.geneList
    else:
        transcribedFile = ''

    enhancerToGeneTable, geneToEnhancerTable = mapEnhancerToGene(
        annotFile, enhancerFile, uniqueGenes=True, byRefseq=options.refseq,
        transcribedFile=transcribedFile)

    #Writing enhancer output
    enhancerFileName = enhancerFile.split('/')[-1].split('.')[0]

    #writing the enhancer table
    out1 = '%s%s_ENHANCER_TO_GENE.txt' % (outFolder, enhancerFileName)
    ROSE_utils.unParseTable(enhancerToGeneTable, out1, '\t')

    #writing the gene table
    out2 = '%s%s_GENE_TO_ENHANCER.txt' % (outFolder, enhancerFileName)
    ROSE_utils.unParseTable(geneToEnhancerTable, out2, '\t')
def regionStitching(inputGFF,stitchWindow,tssWindow,annotFile,removeTSS=True): print('PERFORMING REGION STITCHING') #first have to turn bound region file into a locus collection #need to make sure this names correctly... each region should have a unique name boundCollection = ROSE_utils.gffToLocusCollection(inputGFF) debugOutput = [] #filter out all bound regions that overlap the TSS of an ACTIVE GENE if removeTSS: #first make a locus collection of TSS startDict = ROSE_utils.makeStartDict(annotFile) #now makeTSS loci for active genes removeTicker=0 #this loop makes a locus centered around +/- tssWindow of transcribed genes #then adds it to the list tssLoci tssLoci = [] for geneID in startDict.keys(): tssLoci.append(ROSE_utils.makeTSSLocus(geneID,startDict,tssWindow,tssWindow)) #this turns the tssLoci list into a LocusCollection #50 is the internal parameter for LocusCollection and doesn't really matter tssCollection = ROSE_utils.LocusCollection(tssLoci,50) #gives all the loci in boundCollection boundLoci = boundCollection.getLoci() #this loop will check if each bound region is contained by the TSS exclusion zone #this will drop out a lot of the promoter only regions that are tiny #typical exclusion window is around 2kb for locus in boundLoci: if len(tssCollection.getContainers(locus,'both'))>0: #if true, the bound locus overlaps an active gene boundCollection.remove(locus) debugOutput.append([locus.__str__(),locus.ID(),'CONTAINED']) removeTicker+=1 print('REMOVED %s LOCI BECAUSE THEY WERE CONTAINED BY A TSS' % (removeTicker)) #boundCollection is now all enriched region loci that don't overlap an active TSS stitchedCollection = boundCollection.stitchCollection(stitchWindow,'both') if removeTSS: #now replace any stitched region that overlap 2 distinct genes #with the original loci that were there fixedLoci = [] tssLoci = [] for geneID in startDict.keys(): tssLoci.append(ROSE_utils.makeTSSLocus(geneID,startDict,50,50)) #this turns the tssLoci list into a LocusCollection #50 is the internal parameter for LocusCollection and doesn't really matter tssCollection = ROSE_utils.LocusCollection(tssLoci,50) removeTicker = 0 originalTicker = 0 for stitchedLocus in stitchedCollection.getLoci(): overlappingTSSLoci = tssCollection.getOverlap(stitchedLocus,'both') tssNames = [startDict[tssLocus.ID()]['name'] for tssLocus in overlappingTSSLoci] tssNames = ROSE_utils.uniquify(tssNames) if len(tssNames) > 2: #stitchedCollection.remove(stitchedLocus) originalLoci = boundCollection.getOverlap(stitchedLocus,'both') originalTicker+=len(originalLoci) fixedLoci+=originalLoci debugOutput.append([stitchedLocus.__str__(),stitchedLocus.ID(),'MULTIPLE_TSS']) removeTicker+=1 else: fixedLoci.append(stitchedLocus) print('REMOVED %s STITCHED LOCI BECAUSE THEY OVERLAPPED MULTIPLE TSSs' % (removeTicker)) print('ADDED BACK %s ORIGINAL LOCI' % (originalTicker)) fixedCollection = ROSE_utils.LocusCollection(fixedLoci,50) return fixedCollection,debugOutput else: return stitchedCollection,debugOutput
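#----------------------------------------------------------------------
# Minimal sketch of what "stitching" means in regionStitching above, without
# the LocusCollection machinery: enriched regions on one chromosome whose gap
# is at most stitchWindow are merged into a single stitched region. Purely
# illustrative; the real stitchCollection also tracks IDs and strands.
def stitch(regions, stitchWindow=12500):
    '''regions: list of (start, end) on one chromosome; returns merged list.'''
    merged = []
    for start, end in sorted(regions):
        if merged and start - merged[-1][1] <= stitchWindow:
            merged[-1][1] = max(merged[-1][1], end)   # extend the last stitched region
        else:
            merged.append([start, end])               # start a new stitched region
    return [tuple(r) for r in merged]

# two constituents 5 kb apart get stitched; the third is > 12.5 kb away
print(stitch([(1000, 2000), (7000, 9000), (30000, 31000)]))
# -> [(1000, 9000), (30000, 31000)]
#----------------------------------------------------------------------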
def main(): ''' main run call ''' debug = False from optparse import OptionParser usage = "usage: %prog [options] -g [GENOME] -i [INPUT_ENHANCER_FILE]" parser = OptionParser(usage = usage) #required flags parser.add_option("-i","--i", dest="input",nargs = 1, default=None, help = "Enter a ROSE ranked enhancer or super-enhancer file") parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None, help = "Enter the genome build (MM9,MM8,HG18,HG19)") #optional flags parser.add_option("-l","--list", dest="geneList",nargs = 1, default=None, help = "Enter a gene list to filter through") parser.add_option("-o","--out", dest="out",nargs = 1, default=None, help = "Enter an output folder. Default will be same folder as input file") parser.add_option("-w","--window", dest="window",nargs = 1, default=50000, help = "Enter a search distance for genes. Default is 50,000bp") parser.add_option("-f","--format", dest="formatTable",action= "store_true", default=False, help = "If flagged, maintains original formatting of input table") #RETRIEVING FLAGS (options,args) = parser.parse_args() if not options.input or not options.genome: parser.print_help() exit() #GETTING THE INPUT enhancerFile = options.input window = int(options.window) #making the out folder if it doesn't exist if options.out: outFolder = ROSE_utils.formatFolder(options.out,True) else: outFolder = join(enhancerFile.split('/')[0:-1],'/') + '/' #GETTING THE GENOME genome = options.genome print('USING %s AS THE GENOME' % genome) #CHECK FORMATTING FLAG if options.formatTable: noFormatTable =True else: noFormatTable = False #GETTING THE CORRECT ANNOT FILE cwd = os.getcwd() genomeDict = { 'HG18':'%s/annotation/hg18_refseq.ucsc' % (cwd), 'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd), 'HG19':'%s/annotation/hg19_refseq.ucsc' % (cwd), 'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd), 'MM10':'%s/annotation/mm10_refseq.ucsc' % (cwd), } annotFile = genomeDict[upper(genome)] #GETTING THE TRANSCRIBED LIST if options.geneList: transcribedFile = options.geneList else: transcribedFile = '' enhancerToGeneTable,geneToEnhancerTable = mapEnhancerToGene(annotFile,enhancerFile,transcribedFile,True,window,noFormatTable) #Writing enhancer output enhancerFileName = enhancerFile.split('/')[-1].split('.')[0] if window != 50000: #writing the enhancer table out1 = '%s%s_ENHANCER_TO_GENE_%sKB.txt' % (outFolder,enhancerFileName,window/1000) ROSE_utils.unParseTable(enhancerToGeneTable,out1,'\t') #writing the gene table out2 = '%s%s_GENE_TO_ENHANCER_%sKB.txt' % (outFolder,enhancerFileName,window/1000) ROSE_utils.unParseTable(geneToEnhancerTable,out2,'\t') else: #writing the enhancer table out1 = '%s%s_ENHANCER_TO_GENE.txt' % (outFolder,enhancerFileName) ROSE_utils.unParseTable(enhancerToGeneTable,out1,'\t') #writing the gene table out2 = '%s%s_GENE_TO_ENHANCER.txt' % (outFolder,enhancerFileName) ROSE_utils.unParseTable(geneToEnhancerTable,out2,'\t')
def main(): from optparse import OptionParser usage = "usage: %prog [options] -b [SORTED BAMFILE] -i [INPUTFILE] -o [OUTPUTFILE]" parser = OptionParser(usage=usage) #required flags parser.add_option("-b", "--bam", dest="bam", nargs=1, default=None, help="Enter .bam file to be processed.") parser.add_option( "-i", "--input", dest="input", nargs=1, default=None, help="Enter .gff or ENRICHED REGION file to be processed.") #output flag parser.add_option("-o", "--output", dest="output", nargs=1, default=None, help="Enter the output filename.") #additional options parser.add_option( "-s", "--sense", dest="sense", nargs=1, default='both', help="Map to '+','-' or 'both' strands. Default maps to both.") parser.add_option( "-f", "--floor", dest="floor", nargs=1, default=0, help="Sets a read floor threshold necessary to count towards density") parser.add_option("-e", "--extension", dest="extension", nargs=1, default=200, help="Extends reads by n bp. Default value is 200bp") parser.add_option("-r", "--rpm", dest="rpm", action='store_true', default=False, help="Normalizes density to reads per million (rpm)") parser.add_option( "-m", "--matrix", dest="matrix", nargs=1, default=None, help= "Outputs a variable bin sized matrix. User must specify number of bins." ) (options, args) = parser.parse_args() print(options) print(args) if options.bam: bamFile = options.bam fullPath = os.path.abspath(bamFile) bamName = fullPath.split('/')[-1].split('.')[0] pathFolder = '/'.join(fullPath.split('/')[0:-1]) fileList = os.listdir(pathFolder) hasBai = False for fileName in fileList: if fileName.count(bamName) == 1 and fileName.count('.bai') == 1: hasBai = True if not hasBai: print( 'ERROR: no associated .bai file found with bam. Must use a sorted bam with accompanying index file' ) parser.print_help() exit() if options.sense: if ['+', '-', '.', 'both'].count(options.sense) == 0: print('ERROR: sense flag must be followed by +,-,.,both') parser.print_help() exit() if options.matrix: try: int(options.matrix) except: print( 'ERROR: User must specify an integer bin number for matrix (try 50)' ) parser.print_help() exit() if options.input and options.bam: inputFile = options.input gffFile = inputFile bamFile = options.bam if options.output == None: output = os.getcwd() + inputFile.split('/')[-1] + '.mapped' else: output = options.output if options.matrix: print('mapping to GFF and making a matrix with fixed bin number') newGFF = mapBamToGFF(bamFile, gffFile, options.sense, int(options.extension), options.floor, options.rpm, options.matrix) ROSE_utils.unParseTable(newGFF, output, '\t') else: parser.print_help()
def main(): ''' main run call ''' debug = False from optparse import OptionParser usage = "usage: %prog [options] -g [GENOME] -i [INPUT_ENHANCER_FILE]" parser = OptionParser(usage=usage) #required flags parser.add_option( "-i", "--i", dest="input", nargs=1, default=None, help="Enter a ROSE ranked enhancer or super-enhancer file") parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None, help="Enter the genome build (MM9,MM8,HG18,HG19)") #optional flags parser.add_option("-l", "--list", dest="geneList", nargs=1, default=None, help="Enter a gene list to filter through") parser.add_option( "-o", "--out", dest="out", nargs=1, default=None, help="Enter an output folder. Default will be same folder as input file" ) parser.add_option( "-w", "--window", dest="window", nargs=1, default=50000, help="Enter a search distance for genes. Default is 50,000bp") parser.add_option( "-f", "--format", dest="formatTable", action="store_true", default=False, help="If flagged, maintains original formatting of input table") #RETRIEVING FLAGS (options, args) = parser.parse_args() if not options.input or not options.genome: parser.print_help() exit() #GETTING THE INPUT enhancerFile = options.input window = int(options.window) #making the out folder if it doesn't exist if options.out: outFolder = ROSE_utils.formatFolder(options.out, True) else: outFolder = join(enhancerFile.split('/')[0:-1], '/') + '/' #GETTING THE GENOME genome = options.genome print('USING %s AS THE GENOME' % genome) #CHECK FORMATTING FLAG if options.formatTable: noFormatTable = True else: noFormatTable = False #GETTING THE CORRECT ANNOT FILE cwd = os.getcwd() genomeDict = { 'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd), 'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd), 'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd), 'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd), 'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd), } annotFile = genomeDict[upper(genome)] #GETTING THE TRANSCRIBED LIST if options.geneList: transcribedFile = options.geneList else: transcribedFile = '' enhancerToGeneTable, geneToEnhancerTable = mapEnhancerToGene( annotFile, enhancerFile, transcribedFile, True, window, noFormatTable) #Writing enhancer output enhancerFileName = enhancerFile.split('/')[-1].split('.')[0] if window != 50000: #writing the enhancer table out1 = '%s%s_ENHANCER_TO_GENE_%sKB.txt' % (outFolder, enhancerFileName, window / 1000) ROSE_utils.unParseTable(enhancerToGeneTable, out1, '\t') #writing the gene table out2 = '%s%s_GENE_TO_ENHANCER_%sKB.txt' % (outFolder, enhancerFileName, window / 1000) ROSE_utils.unParseTable(geneToEnhancerTable, out2, '\t') else: #writing the enhancer table out1 = '%s%s_ENHANCER_TO_GENE.txt' % (outFolder, enhancerFileName) ROSE_utils.unParseTable(enhancerToGeneTable, out1, '\t') #writing the gene table out2 = '%s%s_GENE_TO_ENHANCER.txt' % (outFolder, enhancerFileName) ROSE_utils.unParseTable(geneToEnhancerTable, out2, '\t')
def main(): ''' main run call ''' debug = False from optparse import OptionParser usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]" parser = OptionParser(usage=usage) #required flags parser.add_option( "-i", "--i", dest="input", nargs=1, default=None, help="Enter a .gff or .bed file of binding sites used to make enhancers" ) parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None, help="bamfile to rank enhancer by") parser.add_option("-o", "--out", dest="out", nargs=1, default=None, help="Enter an output folder") parser.add_option( "-g", "--genome", dest="genome", nargs=1, default=None, help="Enter the genome build (MM9,MM8,HG18,HG19,MM10,HG38)") #optional flags parser.add_option( "-b", "--bams", dest="bams", nargs=1, default=None, help="Enter a comma separated list of additional bam files to map to") parser.add_option("-c", "--control", dest="control", nargs=1, default=None, help="bamfile to rank enhancer by") parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default=12500, help="Enter a max linking distance for stitching") parser.add_option( "-t", "--tss", dest="tss", nargs=1, default=0, help="Enter a distance from TSS to exclude. 0 = no TSS exclusion") #RETRIEVING FLAGS (options, args) = parser.parse_args() if not options.input or not options.rankby or not options.out or not options.genome: print('hi there') parser.print_help() exit() #making the out folder if it doesn't exist outFolder = ROSE_utils.formatFolder(options.out, True) #figuring out folder schema gffFolder = ROSE_utils.formatFolder(outFolder + 'gff/', True) mappedFolder = ROSE_utils.formatFolder(outFolder + 'mappedGFF/', True) #GETTING INPUT FILE if options.input.split('.')[-1] == 'bed': #CONVERTING A BED TO GFF inputGFFName = options.input.split('/')[-1][0:-4] inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName) ROSE_utils.bedToGFF(options.input, inputGFFFile) elif options.input.split('.')[-1] == 'gff': #COPY THE INPUT GFF TO THE GFF FOLDER inputGFFFile = options.input os.system('cp %s %s' % (inputGFFFile, gffFolder)) else: print( 'WARNING: INPUT FILE DOES NOT END IN .gff or .bed. 
ASSUMING .gff FILE FORMAT' ) #COPY THE INPUT GFF TO THE GFF FOLDER inputGFFFile = options.input os.system('cp %s %s' % (inputGFFFile, gffFolder)) #GETTING THE LIST OF BAMFILES TO PROCESS if options.control: bamFileList = [options.rankby, options.control] else: bamFileList = [options.rankby] if options.bams: bamFileList += options.bams.split(',') bamFileLIst = ROSE_utils.uniquify(bamFileList) #optional args #Stitch parameter stitchWindow = int(options.stitch) #tss options tssWindow = int(options.tss) if tssWindow != 0: removeTSS = True else: removeTSS = False #GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS print('USING %s AS THE INPUT GFF' % (inputGFFFile)) inputName = inputGFFFile.split('/')[-1].split('.')[0] #GETTING THE GENOME genome = options.genome print('USING %s AS THE GENOME' % genome) #GETTING THE CORRECT ANNOT FILE cwd = os.getcwd() genomeDict = { 'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd), 'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd), 'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd), 'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd), 'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd), 'HG38': '%s/annotation/hg38_refseq.ucsc' % (cwd), } annotFile = genomeDict[upper(genome)] #MAKING THE START DICT print('MAKING START DICT') startDict = ROSE_utils.makeStartDict(annotFile) #LOADING IN THE BOUND REGION REFERENCE COLLECTION print('LOADING IN GFF REGIONS') referenceCollection = ROSE_utils.gffToLocusCollection(inputGFFFile) #NOW STITCH REGIONS print('STITCHING REGIONS TOGETHER') stitchedCollection, debugOutput = regionStitching(inputGFFFile, stitchWindow, tssWindow, annotFile, removeTSS) #NOW MAKE A STITCHED COLLECTION GFF print('MAKING GFF FROM STITCHED COLLECTION') stitchedGFF = ROSE_utils.locusCollectionToGFF(stitchedCollection) if not removeTSS: stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, stitchWindow / 1000) stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchWindow / 1000) debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, stitchWindow / 1000) else: stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % ( gffFolder, inputName, stitchWindow / 1000) stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, stitchWindow / 1000) debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % ( gffFolder, inputName, stitchWindow / 1000) #WRITING DEBUG OUTPUT TO DISK if debug: print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile)) ROSE_utils.unParseTable(debugOutput, debugOutFile, '\t') #WRITE THE GFF TO DISK print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile)) ROSE_utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t') #SETTING UP THE OVERALL OUTPUT FILE outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt' print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1)) #MAPPING TO THE NON STITCHED (ORIGINAL GFF) #MAPPING TO THE STITCHED GFF # bin for bam mapping nBin = 1 #IMPORTANT #CHANGE cmd1 and cmd2 TO PARALLELIZE OUTPUT FOR BATCH SUBMISSION #e.g. if using LSF cmd1 = "bsub python bamToGFF.py -f 1 -e 200 -r -m %s -b %s -i %s -o %s" % (nBin,bamFile,stitchedGFFFile,mappedOut1) for bamFile in bamFileList: bamFileName = bamFile.split('/')[-1] #MAPPING TO THE STITCHED GFF mappedOut1 = '%s%s_%s_MAPPED.gff' % (mappedFolder, stitchedGFFName, bamFileName) #WILL TRY TO RUN AS A BACKGROUND PROCESS. 
BATCH SUBMIT THIS LINE TO IMPROVE SPEED cmd1 = "python ROSE_bamToGFF_turbo.py -e 200 -r -m %s -b %s -i %s -o %s &" % ( nBin, bamFile, stitchedGFFFile, mappedOut1) print(cmd1) os.system(cmd1) #MAPPING TO THE ORIGINAL GFF mappedOut2 = '%s%s_%s_MAPPED.gff' % (mappedFolder, inputName, bamFileName) #WILL TRY TO RUN AS A BACKGROUND PROCESS. BATCH SUBMIT THIS LINE TO IMPROVE SPEED cmd2 = "python ROSE_bamToGFF_turbo.py 1 -e 200 -r -m %s -b %s -i %s -o %s &" % ( nBin, bamFile, inputGFFFile, mappedOut2) print(cmd2) os.system(cmd2) print('PAUSING TO MAP') time.sleep(10) #CHECK FOR MAPPING OUTPUT outputDone = False ticker = 0 print('WAITING FOR MAPPING TO COMPLETE. ELAPSED TIME (MIN):') while not outputDone: ''' check every 1 minutes for completed output ''' outputDone = True if ticker % 6 == 0: print(ticker * 5) ticker += 1 #CHANGE THIS PARAMETER TO ALLOW MORE TIME TO MAP if ticker == 120: print( 'ERROR: OPERATION TIME OUT. MAPPING OUTPUT NOT DETECTED AFTER 2 HOURS' ) exit() break for bamFile in bamFileList: #GET THE MAPPED OUTPUT NAMES HERE FROM MAPPING OF EACH BAMFILE bamFileName = bamFile.split('/')[-1] mappedOut1 = '%s%s_%s_MAPPED.gff' % (mappedFolder, stitchedGFFName, bamFileName) try: mapFile = open(mappedOut1, 'r') mapFile.close() except IOError: outputDone = False mappedOut2 = '%s%s_%s_MAPPED.gff' % (mappedFolder, inputName, bamFileName) try: mapFile = open(mappedOut2, 'r') mapFile.close() except IOError: outputDone = False if outputDone == True: break time.sleep(60) print('MAPPING TOOK %s MINUTES' % (ticker)) print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS') #CALCULATE DENSITY BY REGION mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName) time.sleep(10) print('CALLING AND PLOTTING SUPER-ENHANCERS') if options.control: rankbyName = options.rankby.split('/')[-1] controlName = options.control.split('/')[-1] cmd = 'R --no-save %s %s %s %s < ROSE_callSuper.R' % ( outFolder, outputFile1, inputName, controlName) else: rankbyName = options.rankby.split('/')[-1] controlName = 'NONE' cmd = 'R --no-save %s %s %s %s < ROSE_callSuper.R' % ( outFolder, outputFile1, inputName, controlName) print(cmd) os.system(cmd) #calling the gene mapper time.sleep(20) superTableFile = "%s_SuperEnhancers.table.txt" % (inputName) cmd = "python ROSE_geneMapper.py -g %s -i %s%s" % (genome, outFolder, superTableFile) os.system(cmd)
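#----------------------------------------------------------------------
# The main() above launches the mapping jobs in the background with os.system
# and then polls for their output files. A compact version of that wait loop,
# assuming a list of expected output paths, could look like the sketch below
# (os.path.exists instead of try/open; the interval and timeout are illustrative).
import os
import time

def wait_for_outputs(expectedFiles, checkInterval=60, maxChecks=120):
    '''Block until every expected file exists, or raise after maxChecks polls.'''
    for ticker in range(maxChecks):
        if all(os.path.exists(path) for path in expectedFiles):
            return ticker * checkInterval / 60.0     # elapsed minutes
        if ticker % 6 == 0:
            print('WAITING FOR MAPPING TO COMPLETE. ELAPSED TIME (MIN): %s' % (ticker * checkInterval / 60))
        time.sleep(checkInterval)
    raise RuntimeError('ERROR: OPERATION TIME OUT. MAPPING OUTPUT NOT DETECTED')
#----------------------------------------------------------------------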
def mapBamToGFF(bamFile,gff,sense = 'both',extension = 200,floor = 0,rpm = False,matrix = None): #def mapBamToGFF(bamFile,gff,sense = 'both',unique = 0,extension = 200,floor = 0,density = False,rpm = False,binSize = 25,clusterGram = None,matrix = None,raw = False,includeJxnReads = False): '''maps reads from a bam to a gff''' floor = int(floor) #USING BAM CLASS bam = ROSE_utils.Bam(bamFile) #new GFF to write to newGFF = [] #millionMappedReads if rpm: MMR= round(float(bam.getTotalReads('mapped'))/1000000,4) else: MMR = 1 print('using a MMR value of %s' % (MMR)) senseTrans = maketrans('-+.','+-+') if ROSE_utils.checkChrStatus(bamFile) == 1: print "has chr" hasChrFlag = 1 #sys.exit(); else: print "does not have chr" hasChrFlag = 0 #sys.exit() if type(gff) == str: gff = ROSE_utils.parseTable(gff,'\t') #setting up a maxtrix table newGFF.append(['GENE_ID','locusLine'] + ['bin_'+str(n)+'_'+bamFile.split('/')[-1] for n in range(1,int(matrix)+1,1)]) #getting and processing reads for gff lines ticker = 0 print('Number lines processed') for line in gff: line = line[0:9] if ticker%100 == 0: print ticker ticker+=1 if not hasChrFlag: line[0] = re.sub(r"chr",r"",line[0]) gffLocus = ROSE_utils.Locus(line[0],float(line[3]),float(line[4]),line[6],line[1]) #print line[0] #sys.exit() searchLocus = ROSE_utils.makeSearchLocus(gffLocus,int(extension),int(extension)) reads = bam.getReadsLocus(searchLocus,'both',False,'none') #now extend the reads and make a list of extended reads extendedReads = [] for locus in reads: if locus.sense() == '+' or locus.sense() == '.': locus = ROSE_utils.Locus(locus.chr(),locus.start(),locus.end()+extension,locus.sense(), locus.ID()) if locus.sense() == '-': locus = ROSE_utils.Locus(locus.chr(),locus.start()-extension,locus.end(),locus.sense(),locus.ID()) extendedReads.append(locus) if gffLocus.sense() == '+' or gffLocus.sense == '.': senseReads = filter(lambda x:x.sense() == '+' or x.sense() == '.',extendedReads) antiReads = filter(lambda x:x.sense() == '-',extendedReads) else: senseReads = filter(lambda x:x.sense() == '-' or x.sense() == '.',extendedReads) antiReads = filter(lambda x:x.sense() == '+',extendedReads) senseHash = defaultdict(int) antiHash = defaultdict(int) #filling in the readHashes if sense == '+' or sense == 'both' or sense =='.': for read in senseReads: for x in range(read.start(),read.end()+1,1): senseHash[x]+=1 if sense == '-' or sense == 'both' or sense == '.': #print('foo') for read in antiReads: for x in range(read.start(),read.end()+1,1): antiHash[x]+=1 #now apply flooring and filtering for coordinates keys = ROSE_utils.uniquify(senseHash.keys()+antiHash.keys()) if floor > 0: keys = filter(lambda x: (senseHash[x]+antiHash[x]) > floor,keys) #coordinate filtering keys = filter(lambda x: gffLocus.start() < x < gffLocus.end(),keys) #setting up the output table clusterLine = [gffLocus.ID(),gffLocus.__str__()] #getting the binsize binSize = (gffLocus.len()-1)/int(matrix) nBins = int(matrix) if binSize == 0: clusterLine+=['NA']*int(matrix) newGFF.append(clusterLine) continue n=0 if gffLocus.sense() == '+' or gffLocus.sense() =='.' 
or gffLocus.sense() == 'both': i = gffLocus.start() while n <nBins: n+=1 binKeys = filter(lambda x: i < x < i+binSize,keys) binDen = float(sum([senseHash[x]+antiHash[x] for x in binKeys]))/binSize clusterLine+=[round(binDen/MMR,4)] i = i+binSize else: i = gffLocus.end() while n < nBins: n+=1 binKeys = filter(lambda x: i-binSize < x < i,keys) binDen = float(sum([senseHash[x]+antiHash[x] for x in binKeys]))/binSize clusterLine+=[round(binDen/MMR,4)] i = i-binSize newGFF.append(clusterLine) return newGFF
def main(): from optparse import OptionParser usage = "usage: %prog [options] -b [SORTED BAMFILE] -i [INPUTFILE] -o [OUTPUTFILE]" parser = OptionParser(usage = usage) #required flags parser.add_option("-b","--bam", dest="bam",nargs = 1, default=None, help = "Enter .bam file to be processed.") parser.add_option("-i","--input", dest="input",nargs = 1, default=None, help = "Enter .gff or ENRICHED REGION file to be processed.") #output flag parser.add_option("-o","--output", dest="output",nargs = 1, default=None, help = "Enter the output filename.") #additional options parser.add_option("-s","--sense", dest="sense",nargs = 1, default='both', help = "Map to '+','-' or 'both' strands. Default maps to both.") parser.add_option("-f","--floor", dest="floor",nargs =1, default=0, help = "Sets a read floor threshold necessary to count towards density") parser.add_option("-e","--extension", dest="extension",nargs = 1, default=200, help = "Extends reads by n bp. Default value is 200bp") parser.add_option("-r","--rpm", dest="rpm",action = 'store_true', default=False, help = "Normalizes density to reads per million (rpm)") parser.add_option("-m","--matrix", dest="matrix",nargs = 1, default=None, help = "Outputs a variable bin sized matrix. User must specify number of bins.") (options,args) = parser.parse_args() print(options) print(args) if options.bam: bamFile = options.bam fullPath = os.path.abspath(bamFile) bamName = fullPath.split('/')[-1].split('.')[0] pathFolder = join(fullPath.split('/')[0:-1],'/') fileList = os.listdir(pathFolder) hasBai = False for fileName in fileList: if fileName.count(bamName) == 1 and fileName.count('.bai') == 1: hasBai = True if not hasBai: print('ERROR: no associated .bai file found with bam. Must use a sorted bam with accompanying index file') parser.print_help() exit() if options.sense: if ['+','-','.','both'].count(options.sense) == 0: print('ERROR: sense flag must be followed by +,-,.,both') parser.print_help() exit() if options.matrix: try: int(options.matrix) except: print('ERROR: User must specify an integer bin number for matrix (try 50)') parser.print_help() exit() if options.input and options.bam: inputFile = options.input gffFile = inputFile bamFile = options.bam if options.output == None: output = os.getcwd() + inputFile.split('/')[-1]+'.mapped' else: output = options.output if options.matrix: print('mapping to GFF and making a matrix with fixed bin number') newGFF = mapBamToGFF(bamFile,gffFile,options.sense,int(options.extension),options.floor,options.rpm,options.matrix) ROSE_utils.unParseTable(newGFF,output,'\t') else: parser.print_help()
def mapEnhancerToGene(annotFile,enhancerFile,transcribedFile='',uniqueGenes=True,searchWindow =50000,noFormatTable = False): ''' maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq ''' startDict = ROSE_utils.makeStartDict(annotFile) enhancerTable = ROSE_utils.parseTable(enhancerFile,'\t') #internal parameter for debugging byRefseq = False if len(transcribedFile) > 0: transcribedTable = ROSE_utils.parseTable(transcribedFile,'\t') transcribedGenes = [line[1] for line in transcribedTable] else: transcribedGenes = startDict.keys() print('MAKING TRANSCRIPT COLLECTION') transcribedCollection = ROSE_utils.makeTranscriptCollection(annotFile,0,0,500,transcribedGenes) print('MAKING TSS COLLECTION') tssLoci = [] for geneID in transcribedGenes: tssLoci.append(ROSE_utils.makeTSSLocus(geneID,startDict,0,0)) #this turns the tssLoci list into a LocusCollection #50 is the internal parameter for LocusCollection and doesn't really matter tssCollection = ROSE_utils.LocusCollection(tssLoci,50) geneDict = {'overlapping':defaultdict(list),'proximal':defaultdict(list)} #dictionaries to hold ranks and superstatus of gene nearby enhancers rankDict = defaultdict(list) superDict= defaultdict(list) #list of all genes that appear in this analysis overallGeneList = [] if noFormatTable: #set up the output tables #first by enhancer enhancerToGeneTable = [enhancerTable[0]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE']] else: #set up the output tables #first by enhancer enhancerToGeneTable = [enhancerTable[0][0:9]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE'] + enhancerTable[5][-2:]] #next by gene geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS']] #next make the gene to enhancer table geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS','ENHANCER_RANKS','IS_SUPER']] for line in enhancerTable: if line[0][0] =='#' or line[0][0] == 'R': continue enhancerString = '%s:%s-%s' % (line[1],line[2],line[3]) enhancerLocus = ROSE_utils.Locus(line[1],line[2],line[3],'.',line[0]) #overlapping genes are transcribed genes whose transcript is directly in the stitchedLocus overlappingLoci = transcribedCollection.getOverlap(enhancerLocus,'both') overlappingGenes =[] for overlapLocus in overlappingLoci: overlappingGenes.append(overlapLocus.ID()) #proximalGenes are transcribed genes where the tss is within 50kb of the boundary of the stitched loci proximalLoci = tssCollection.getOverlap(ROSE_utils.makeSearchLocus(enhancerLocus,searchWindow,searchWindow),'both') proximalGenes =[] for proxLocus in proximalLoci: proximalGenes.append(proxLocus.ID()) distalLoci = tssCollection.getOverlap(ROSE_utils.makeSearchLocus(enhancerLocus,1000000,1000000),'both') distalGenes =[] for proxLocus in distalLoci: distalGenes.append(proxLocus.ID()) overlappingGenes = ROSE_utils.uniquify(overlappingGenes) proximalGenes = ROSE_utils.uniquify(proximalGenes) distalGenes = ROSE_utils.uniquify(distalGenes) allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes #these checks make sure each gene list is unique. 
#technically it is possible for a gene to be overlapping, but not proximal since the #gene could be longer than the 50kb window, but we'll let that slide here for refID in overlappingGenes: if proximalGenes.count(refID) == 1: proximalGenes.remove(refID) for refID in proximalGenes: if distalGenes.count(refID) == 1: distalGenes.remove(refID) #Now find the closest gene if len(allEnhancerGenes) == 0: closestGene = '' else: #get enhancerCenter enhancerCenter = (int(line[2]) + int(line[3]))/2 #get absolute distance to enhancer center distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes] #get the ID and convert to name closestGene = startDict[allEnhancerGenes[distList.index(min(distList))]]['name'] #NOW WRITE THE ROW FOR THE ENHANCER TABLE if noFormatTable: newEnhancerLine = list(line) newEnhancerLine.append(join(ROSE_utils.uniquify([startDict[x]['name'] for x in overlappingGenes]),',')) newEnhancerLine.append(join(ROSE_utils.uniquify([startDict[x]['name'] for x in proximalGenes]),',')) newEnhancerLine.append(closestGene) else: newEnhancerLine = line[0:9] newEnhancerLine.append(join(ROSE_utils.uniquify([startDict[x]['name'] for x in overlappingGenes]),',')) newEnhancerLine.append(join(ROSE_utils.uniquify([startDict[x]['name'] for x in proximalGenes]),',')) newEnhancerLine.append(closestGene) newEnhancerLine += line[-2:] enhancerToGeneTable.append(newEnhancerLine) #Now grab all overlapping and proximal genes for the gene ordered table overallGeneList +=overlappingGenes for refID in overlappingGenes: geneDict['overlapping'][refID].append(enhancerString) rankDict[refID].append(int(line[-2])) superDict[refID].append(int(line[-1])) overallGeneList+=proximalGenes for refID in proximalGenes: geneDict['proximal'][refID].append(enhancerString) rankDict[refID].append(int(line[-2])) superDict[refID].append(int(line[-1])) #End loop through #Make table by gene overallGeneList = ROSE_utils.uniquify(overallGeneList) #use enhancer rank to order rankOrder = ROSE_utils.order([min(rankDict[x]) for x in overallGeneList]) usedNames = [] for i in rankOrder: refID = overallGeneList[i] geneName = startDict[refID]['name'] if usedNames.count(geneName) > 0 and uniqueGenes == True: continue else: usedNames.append(geneName) proxEnhancers = geneDict['overlapping'][refID]+geneDict['proximal'][refID] superStatus = max(superDict[refID]) enhancerRanks = join([str(x) for x in rankDict[refID]],',') newLine = [geneName,refID,join(proxEnhancers,','),enhancerRanks,superStatus] geneToEnhancerTable.append(newLine) #resort enhancerToGeneTable if noFormatTable: return enhancerToGeneTable,geneToEnhancerTable else: enhancerOrder = ROSE_utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]]) sortedTable = [enhancerToGeneTable[0]] for i in enhancerOrder: sortedTable.append(enhancerToGeneTable[(i+1)]) return sortedTable,geneToEnhancerTable
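#----------------------------------------------------------------------
# Sketch of the gene-ordering step above: each gene collects the ranks of its
# nearby enhancers, genes are sorted by their best (smallest) rank, and a gene
# is flagged as super-associated if any of those enhancers is a super-enhancer.
# The dict literals are illustrative stand-ins for rankDict/superDict.
def order_genes(rankDict, superDict):
    '''Return [(gene, bestRank, isSuper)] sorted by best enhancer rank.'''
    rows = [(gene, min(ranks), max(superDict[gene])) for gene, ranks in rankDict.items()]
    return sorted(rows, key=lambda row: row[1])

print(order_genes({'MYC': [3, 40], 'GAPDH': [812]},
                  {'MYC': [1, 0], 'GAPDH': [0]}))
# -> [('MYC', 3, 1), ('GAPDH', 812, 0)]
#----------------------------------------------------------------------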
def mapCollection(stitchedCollection,referenceCollection,bamFileList,mappedFolder,output,refName): ''' makes a table of factor density in a stitched locus and ranks table by number of loci stitched together ''' print('FORMATTING TABLE') loci = stitchedCollection.getLoci() locusTable = [['REGION_ID','CHROM','START','STOP','NUM_LOCI','CONSTITUENT_SIZE']] lociLenList = [] #strip out any that are in chrY for locus in list(loci): if locus.chr() == 'chrY': loci.remove(locus) for locus in loci: #numLociList.append(int(stitchLocus.ID().split('_')[1])) lociLenList.append(locus.len()) #numOrder = order(numLociList,decreasing=True) lenOrder = ROSE_utils.order(lociLenList,decreasing=True) ticker = 0 for i in lenOrder: ticker+=1 if ticker%1000 ==0: print(ticker) locus = loci[i] #First get the size of the enriched regions within the stitched locus refEnrichSize = 0 refOverlappingLoci = referenceCollection.getOverlap(locus,'both') for refLocus in refOverlappingLoci: refEnrichSize+=refLocus.len() try: stitchCount = int(locus.ID().split('_')[0]) except ValueError: stitchCount = 1 locusTable.append([locus.ID(),locus.chr(),locus.start(),locus.end(),stitchCount,refEnrichSize]) print('GETTING MAPPED DATA') for bamFile in bamFileList: bamFileName = bamFile.split('/')[-1] print('GETTING MAPPING DATA FOR %s' % bamFile) #assumes standard convention for naming enriched region gffs #opening up the mapped GFF print('OPENING %s%s_%s_MAPPED.gff' % (mappedFolder,refName,bamFileName)) mappedGFF =ROSE_utils.parseTable('%s%s_%s_MAPPED.gff' % (mappedFolder,refName,bamFileName),'\t') signalDict = defaultdict(float) print('MAKING SIGNAL DICT FOR %s' % (bamFile)) mappedLoci = [] for line in mappedGFF[1:]: chrom = line[1].split('(')[0] start = int(line[1].split(':')[-1].split('-')[0]) end = int(line[1].split(':')[-1].split('-')[1]) mappedLoci.append(ROSE_utils.Locus(chrom,start,end,'.',line[0])) try: signalDict[line[0]] = float(line[2])*(abs(end-start)) except ValueError: print('WARNING NO SIGNAL FOR LINE:') print(line) continue mappedCollection = ROSE_utils.LocusCollection(mappedLoci,500) locusTable[0].append(bamFileName) for i in range(1,len(locusTable)): signal=0.0 line = locusTable[i] lineLocus = ROSE_utils.Locus(line[1],line[2],line[3],'.') overlappingRegions = mappedCollection.getOverlap(lineLocus,sense='both') for region in overlappingRegions: signal+= signalDict[region.ID()] locusTable[i].append(signal) ROSE_utils.unParseTable(locusTable,output,'\t')
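#----------------------------------------------------------------------
# Sketch of the signal bookkeeping in mapCollection above: bamToGFF reports a
# per-bp read density for each mapped region, so total signal is density times
# region length, and a stitched region's signal is the sum over the mapped
# regions it overlaps. Plain tuples stand in for the Locus objects.
def region_signal(stitched, mappedRegions):
    '''stitched: (start, end); mappedRegions: list of (start, end, density).'''
    signal = 0.0
    for start, end, density in mappedRegions:
        if start < stitched[1] and end > stitched[0]:   # overlaps the stitched region
            signal += density * abs(end - start)        # density (reads/bp) * length
    return signal

# one 1 kb constituent at 0.5 reads/bp and one region outside the stitched locus
print(region_signal((1000, 20000), [(2000, 3000, 0.5), (50000, 51000, 2.0)]))  # -> 500.0
#----------------------------------------------------------------------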
def main(): ''' main run call ''' debug = False from optparse import OptionParser usage = "usage: %prog [options] -g [INPUT_GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]" parser = OptionParser(usage = usage) #required flags parser.add_option("-i","--i", dest="input",nargs = 1, default=None, help = "Enter a .gff or .bed file of binding sites used to make enhancers") parser.add_option("-r","--rankby", dest="rankby",nargs = 1, default=None, help = "bamfile to rank enhancer by") parser.add_option("-o","--out", dest="out",nargs = 1, default=None, help = "Enter an output folder") parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None, help = "Reference genome file: example- hg18_refseq.ucsc") #optional flags parser.add_option("-b","--bams", dest="bams",nargs = 1, default=None, help = "Enter a comma separated list of additional bam files to map to") parser.add_option("-c","--control", dest="control",nargs = 1, default=None, help = "bamfile to rank enhancer by") parser.add_option("-s","--stitch", dest="stitch",nargs = 1, default=12500, help = "Enter a max linking distance for stitching") parser.add_option("-t","--tss", dest="tss",nargs = 1, default=0, help = "Enter a distance from TSS to exclude. 0 = no TSS exclusion") #RETRIEVING FLAGS (options,args) = parser.parse_args() if not options.input or not options.rankby or not options.out or not options.genome: print('hi there') parser.print_help() exit() #making the out folder if it doesn't exist outFolder = ROSE_utils.formatFolder(options.out,True) #figuring out folder schema gffFolder = ROSE_utils.formatFolder(outFolder+'gff/',True) mappedFolder = ROSE_utils.formatFolder(outFolder+ 'mappedGFF/',True) #GETTING INPUT FILE if options.input.split('.')[-1] == 'bed': #CONVERTING A BED TO GFF inputGFFName = options.input.split('/')[-1][0:-4] inputGFFFile = '%s%s.gff' % (gffFolder,inputGFFName) ROSE_utils.bedToGFF(options.input,inputGFFFile) elif options.input.split('.')[-1] =='gff': #COPY THE INPUT GFF TO THE GFF FOLDER inputGFFFile = options.input os.system('cp %s %s' % (inputGFFFile,gffFolder)) else: print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. 
ASSUMING .gff FILE FORMAT') #COPY THE INPUT GFF TO THE GFF FOLDER inputGFFFile = options.input os.system('cp %s %s' % (inputGFFFile,gffFolder)) #GETTING THE LIST OF BAMFILES TO PROCESS if options.control: bamFileList = [options.rankby,options.control] else: bamFileList = [options.rankby] if options.bams: bamFileList += options.bams.split(',') bamFileLIst = ROSE_utils.uniquify(bamFileList) #optional args #Stitch parameter stitchWindow = int(options.stitch) #tss options tssWindow = int(options.tss) if tssWindow != 0: removeTSS = True else: removeTSS = False #GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS print('USING %s AS THE INPUT GFF' % (inputGFFFile)) inputName = inputGFFFile.split('/')[-1].split('.')[0] #GETTING THE GENOME genome = options.genome print('USING %s AS THE GENOME' % genome) #GETTING THE CORRECT ANNOT FILE cwd = os.getcwd() ## genomeDict = { ## 'HG18':'%s/annotation/hg18_refseq.ucsc' % (cwd), ## 'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd), ## 'HG19':'%s/annotation/hg19_refseq.ucsc' % (cwd), ## 'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd), ## 'MM10':'%s/annotation/mm10_refseq.ucsc' % (cwd), ## } annotFile = genome ## annotFile = genomeDict[upper(genome)] #MAKING THE START DICT print('MAKING START DICT') startDict = ROSE_utils.makeStartDict(annotFile) #LOADING IN THE BOUND REGION REFERENCE COLLECTION print('LOADING IN GFF REGIONS') referenceCollection = ROSE_utils.gffToLocusCollection(inputGFFFile) #CHECKING INPUT REGIONS FOR FORMATTING print('CHECKING INPUT TO MAKE SURE EACH REGION HAS A UNIQUE IDENTIFIER') checkRefCollection(referenceCollection) #makes sure that all input regions have a unique ID #NOW STITCH REGIONS print('STITCHING REGIONS TOGETHER') stitchedCollection,debugOutput = regionStitching(inputGFFFile,stitchWindow,tssWindow,annotFile,removeTSS) #NOW MAKE A STITCHED COLLECTION GFF print('MAKING GFF FROM STITCHED COLLECTION') stitchedGFF=ROSE_utils.locusCollectionToGFF(stitchedCollection) if not removeTSS: stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder,inputName,stitchWindow/1000) stitchedGFFName = '%s_%sKB_STITCHED' % (inputName,stitchWindow/1000) debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder,inputName,stitchWindow/1000) else: stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (gffFolder,inputName,stitchWindow/1000) stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName,stitchWindow/1000) debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (gffFolder,inputName,stitchWindow/1000) #WRITING DEBUG OUTPUT TO DISK if debug: print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile)) ROSE_utils.unParseTable(debugOutput,debugOutFile,'\t') #WRITE THE GFF TO DISK print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile)) ROSE_utils.unParseTable(stitchedGFF,stitchedGFFFile,'\t') #SETTING UP THE OVERALL OUTPUT FILE outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt' print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1)) #MAPPING TO THE NON STITCHED (ORIGINAL GFF) #MAPPING TO THE STITCHED GFF # bin for bam mapping nBin =1 #IMPORTANT #CHANGE cmd1 and cmd2 TO PARALLELIZE OUTPUT FOR BATCH SUBMISSION #e.g. if using LSF cmd1 = "bsub python bamToGFF.py -f 1 -e 200 -r -m %s -b %s -i %s -o %s" % (nBin,bamFile,stitchedGFFFile,mappedOut1) for bamFile in bamFileList: bamFileName = bamFile.split('/')[-1] #MAPPING TO THE STITCHED GFF mappedOut1 ='%s%s_%s_MAPPED.gff' % (mappedFolder,stitchedGFFName,bamFileName) #WILL TRY TO RUN AS A BACKGROUND PROCESS. 
BATCH SUBMIT THIS LINE TO IMPROVE SPEED cmd1 = "python /usr/local/'bin'/ROSE_bamToGFF.py -f 1 -e 200 -r -m %s -b %s -i %s -o %s &" % (nBin,bamFile,stitchedGFFFile,mappedOut1) print(cmd1) os.system(cmd1) #MAPPING TO THE ORIGINAL GFF mappedOut2 ='%s%s_%s_MAPPED.gff' % (mappedFolder,inputName,bamFileName) #WILL TRY TO RUN AS A BACKGROUND PROCESS. BATCH SUBMIT THIS LINE TO IMPROVE SPEED cmd2 = "python /usr/local/'bin'/ROSE_bamToGFF.py -f 1 -e 200 -r -m %s -b %s -i %s -o %s &" % (nBin,bamFile,inputGFFFile,mappedOut2) print(cmd2) os.system(cmd2) print('PAUSING TO MAP') time.sleep(10) #CHECK FOR MAPPING OUTPUT outputDone = False ticker = 0 print('WAITING FOR MAPPING TO COMPLETE. ELAPSED TIME (MIN):') while not outputDone: ''' check every 5 minutes for completed output ''' outputDone = True if ticker%6 == 0: print(ticker*5) ticker +=1 #CHANGE THIS PARAMETER TO ALLOW MORE TIME TO MAP if ticker == 144: print('ERROR: OPERATION TIME OUT. MAPPING OUTPUT NOT DETECTED') exit() break for bamFile in bamFileList: #GET THE MAPPED OUTPUT NAMES HERE FROM MAPPING OF EACH BAMFILE bamFileName = bamFile.split('/')[-1] mappedOut1 ='%s%s_%s_MAPPED.gff' % (mappedFolder,stitchedGFFName,bamFileName) try: mapFile = open(mappedOut1,'r') mapFile.close() except IOError: outputDone = False mappedOut2 ='%s%s_%s_MAPPED.gff' % (mappedFolder,inputName,bamFileName) try: mapFile = open(mappedOut2,'r') mapFile.close() except IOError: outputDone = False if outputDone == True: break time.sleep(300) print('MAPPING TOOK %s MINUTES' % (ticker*5)) print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS') #CALCULATE DENSITY BY REGION mapCollection(stitchedCollection,referenceCollection,bamFileList,mappedFolder,outputFile1,refName = stitchedGFFName) time.sleep(10) print('CALLING AND PLOTTING SUPER-ENHANCERS') if options.control: rankbyName = options.rankby.split('/')[-1] controlName = options.control.split('/')[-1] cmd = 'R --no-save %s %s %s %s < /usr/local/bin/ROSE_callSuper.R' % (outFolder,outputFile1,inputName,controlName) else: rankbyName = options.rankby.split('/')[-1] controlName = 'NONE' cmd = 'R --no-save %s %s %s %s < /usr/local/bin/ROSE_callSuper.R' % (outFolder,outputFile1,inputName,controlName) print(cmd) os.system(cmd)
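#----------------------------------------------------------------------
# Design note: the mains above shell out with os.system(cmd + ' &') and then
# poll the filesystem for the mapped output. One alternative (a sketch, not how
# ROSE does it) is to launch the per-bam mapping commands with subprocess and
# wait on the process handles directly, which removes the sleep/poll loop.
# The command strings in the usage comment are placeholders.
import shlex
import subprocess

def run_in_parallel(commands):
    '''Start each shell command, then block until all of them have exited.'''
    procs = [subprocess.Popen(shlex.split(cmd)) for cmd in commands]
    return [p.wait() for p in procs]              # list of exit codes

# e.g. run_in_parallel(["python ROSE_bamToGFF.py -f 1 -e 200 -r -m 1 -b x.bam -i y.gff -o out1.gff",
#                       "python ROSE_bamToGFF.py -f 1 -e 200 -r -m 1 -b z.bam -i y.gff -o out2.gff"])
#----------------------------------------------------------------------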
def main():
    '''
    main run call
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    #required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="bamfile to rank enhancers by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    #optional flags
    parser.add_option("-b", "--bams", dest="bams", nargs=1, default=None,
                      help="Enter a comma separated list of additional bam files to map to")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="control bamfile used for background correction")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default=12500,
                      help="Enter a max linking distance for stitching")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    #RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('ERROR: MISSING REQUIRED FLAGS (-i, -r, -o, -g)')
        parser.print_help()
        exit()

    #making the out folder if it doesn't exist
    outFolder = ROSE_utils.formatFolder(options.out, True)

    #figuring out folder schema
    gffFolder = ROSE_utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = ROSE_utils.formatFolder(outFolder + 'mappedGFF/', True)

    #GETTING INPUT FILE
    if options.input.split('.')[-1] == 'bed':
        #CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        ROSE_utils.bedToGFF(options.input, inputGFFFile)
    elif options.input.split('.')[-1] == 'gff':
        #COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))
    else:
        print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
        #COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))
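# Example invocation (illustrative only): the script file name ROSE_main.py and
# all file paths below are hypothetical; the flags are the ones registered with
# OptionParser in main() above.
#
#   python ROSE_main.py -g HG19 -i peaks.gff -r H3K27ac.bam -c input_control.bam \
#       -o rose_out/ -s 12500 -t 2500
#
# -s (stitching distance, default 12500 bp) and -t (TSS exclusion window,
# default 0 = no exclusion) are optional.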
def mapBamToGFF(bamFile, gff, sense='both', extension=200, floor=0, rpm=False, matrix=None):
    #def mapBamToGFF(bamFile,gff,sense = 'both',unique = 0,extension = 200,floor = 0,density = False,rpm = False,binSize = 25,clusterGram = None,matrix = None,raw = False,includeJxnReads = False):
    '''maps reads from a bam to a gff'''
    floor = int(floor)

    #USING BAM CLASS
    bam = ROSE_utils.Bam(bamFile)

    #new GFF to write to
    newGFF = []

    #millionMappedReads
    if rpm:
        MMR = round(float(bam.getTotalReads('mapped')) / 1000000, 4)
    else:
        MMR = 1
    print('using a MMR value of %s' % (MMR))

    #senseTrans = maketrans('-+.','+-+')  #deprecated

    if ROSE_utils.checkChrStatus(bamFile) == 1:
        print("has chr")
        hasChrFlag = 1
        #sys.exit()
    else:
        print("does not have chr")
        hasChrFlag = 0
        #sys.exit()

    if type(gff) == str:
        gff = ROSE_utils.parseTable(gff, '\t')

    #setting up a matrix table
    newGFF.append(['GENE_ID', 'locusLine'] +
                  ['bin_' + str(n) + '_' + bamFile.split('/')[-1]
                   for n in range(1, int(matrix) + 1, 1)])

    #getting and processing reads for gff lines
    ticker = 0
    print('Number lines processed')
    for line in gff:
        line = line[0:9]
        if ticker % 100 == 0:
            print(ticker)
        ticker += 1

        if not hasChrFlag:
            line[0] = re.sub(r"chr", r"", line[0])
        gffLocus = ROSE_utils.Locus(line[0], int(line[3]), int(line[4]), line[6], line[1])
        #print(line[0])
        #sys.exit()
        searchLocus = ROSE_utils.makeSearchLocus(gffLocus, int(extension), int(extension))

        reads = bam.getReadsLocus(searchLocus, 'both', False, 'none')

        #now extend the reads and make a list of extended reads
        extendedReads = []
        for locus in reads:
            if locus.sense() == '+' or locus.sense() == '.':
                locus = ROSE_utils.Locus(locus.chr(), locus.start(), locus.end() + extension, locus.sense(), locus.ID())
            if locus.sense() == '-':
                locus = ROSE_utils.Locus(locus.chr(), locus.start() - extension, locus.end(), locus.sense(), locus.ID())
            extendedReads.append(locus)

        if gffLocus.sense() == '+' or gffLocus.sense() == '.':
            senseReads = [x for x in extendedReads if x.sense() == '+' or x.sense() == '.']
            antiReads = [x for x in extendedReads if x.sense() == '-']
        else:
            senseReads = [x for x in extendedReads if x.sense() == '-' or x.sense() == '.']
            antiReads = [x for x in extendedReads if x.sense() == '+']

        senseHash = defaultdict(int)
        antiHash = defaultdict(int)

        #filling in the readHashes
        if sense == '+' or sense == 'both' or sense == '.':
            for read in senseReads:
                for x in range(read.start(), read.end() + 1, 1):
                    senseHash[x] += 1
        if sense == '-' or sense == 'both' or sense == '.':
            for read in antiReads:
                for x in range(read.start(), read.end() + 1, 1):
                    antiHash[x] += 1

        #now apply flooring and filtering for coordinates
        keys = ROSE_utils.uniquify(list(senseHash.keys()) + list(antiHash.keys()))
        if floor > 0:
            keys = [x for x in keys if (senseHash[x] + antiHash[x]) > floor]
        #coordinate filtering
        keys = [x for x in keys if gffLocus.start() < x < gffLocus.end()]

        #setting up the output table
        clusterLine = [gffLocus.ID(), gffLocus.__str__()]

        #getting the binsize
        binSize = (gffLocus.len() - 1) // int(matrix)
        nBins = int(matrix)
        if binSize == 0:
            clusterLine += ['NA'] * int(matrix)
            newGFF.append(clusterLine)
            continue
        n = 0
        if gffLocus.sense() == '+' or gffLocus.sense() == '.' or gffLocus.sense() == 'both':
            i = gffLocus.start()
            while n < nBins:
                n += 1
                binKeys = [x for x in keys if i < x < i + binSize]
                binDen = float(sum([senseHash[x] + antiHash[x] for x in binKeys])) / binSize
                clusterLine += [round(binDen / MMR, 4)]
                i = i + binSize
        else:
            i = gffLocus.end()
            while n < nBins:
                n += 1
                binKeys = [x for x in keys if i - binSize < x < i]
                binDen = float(sum([senseHash[x] + antiHash[x] for x in binKeys])) / binSize
                clusterLine += [round(binDen / MMR, 4)]
                i = i - binSize
        newGFF.append(clusterLine)

    return newGFF
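# Hedged usage sketch (this wrapper is not part of the original module):
# mapBamToGFF() returns the binned read-density matrix as a list of rows, and
# ROSE_utils.unParseTable (used elsewhere in this pipeline) can write such a
# table to disk.  File names are placeholders.
def writeMappedGFF(bamFile, gffFile, outFile, nBin=1):
    '''map a bam onto a gff with 200bp read extension and rpm normalisation,
    then write the resulting density matrix as a tab-delimited table'''
    mappedGFF = mapBamToGFF(bamFile, gffFile, sense='both', extension=200,
                            floor=0, rpm=True, matrix=nBin)
    ROSE_utils.unParseTable(mappedGFF, outFile, '\t')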
def mapEnhancerToGene(annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only.
    Otherwise, gives a row for each refseq
    '''
    startDict = ROSE_utils.makeStartDict(annotFile)
    enhancerTable = ROSE_utils.parseTable(enhancerFile, '\t')

    # internal parameter for debugging
    byRefseq = False

    if len(transcribedFile) > 0:
        transcribedTable = ROSE_utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = list(startDict.keys())

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = ROSE_utils.makeTranscriptCollection(annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(ROSE_utils.makeTSSLocus(geneID, startDict, 0, 0))

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really matter
    tssCollection = ROSE_utils.LocusCollection(tssLoci, 50)

    geneDict = {'overlapping': defaultdict(list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of genes near enhancers
    rankDict = defaultdict(list)
    # superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    if noFormatTable:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [['REGION_ID', 'OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]
    else:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [['CHROM', 'START', 'END', 'REGION_ID', 'OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE', 'enhancerRank']]

    # next make the gene to enhancer table
    geneToEnhancerTable = [['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS']]

    for line in enhancerTable:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[0], line[1], line[2])
        enhancerLocus = ROSE_utils.Locus(line[0], line[1], line[2], '.', line[3])

        # overlapping genes are transcribed genes whose transcript is directly in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(enhancerLocus, 'both')
        overlappingGenes = []
        for overlapLocus in overlappingLoci:
            overlappingGenes.append(overlapLocus.ID())

        # proximalGenes are transcribed genes whose tss is within 50kb of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(ROSE_utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = []
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())

        distalLoci = tssCollection.getOverlap(ROSE_utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = []
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

        overlappingGenes = ROSE_utils.uniquify(overlappingGenes)
        proximalGenes = ROSE_utils.uniquify(proximalGenes)
        distalGenes = ROSE_utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes

        # these checks make sure each gene list is unique.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the 50kb window, but we'll let that slide here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            enhancerCenter = (int(line[1]) + int(line[2])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        if noFormatTable:
            newEnhancerLine = list(line)
            newEnhancerLine.append(','.join(ROSE_utils.uniquify([startDict[x]['name'] for x in overlappingGenes])))
            newEnhancerLine.append(','.join(ROSE_utils.uniquify([startDict[x]['name'] for x in proximalGenes])))
            newEnhancerLine.append(closestGene)
        else:
            newEnhancerLine = line[0:4]
            newEnhancerLine.append(','.join(ROSE_utils.uniquify([startDict[x]['name'] for x in overlappingGenes])))
            newEnhancerLine.append(','.join(ROSE_utils.uniquify([startDict[x]['name'] for x in proximalGenes])))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[4:5]

        enhancerToGeneTable.append(newEnhancerLine)

        # Now grab all overlapping and proximal genes for the gene ordered table
        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            if ',' in line[4]:
                rankDict[refID].append(int(line[4].split(',')[0]))
            else:
                rankDict[refID].append(int(line[4]))
            # superDict[refID].append(int(line[-1]))

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            if ',' in line[4]:
                rankDict[refID].append(int(line[4].split(',')[0]))
            else:
                rankDict[refID].append(int(line[4]))
            # superDict[refID].append(int(line[-1]))

    # End loop through enhancers

    # Make table by gene
    overallGeneList = ROSE_utils.uniquify(overallGeneList)

    # use enhancer rank to order genes
    rankOrder = ROSE_utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = []
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if usedNames.count(geneName) > 0 and uniqueGenes == True:
            continue
        else:
            usedNames.append(geneName)

        proxEnhancers = geneDict['overlapping'][refID] + geneDict['proximal'][refID]
        # superStatus = max(superDict[refID])
        enhancerRanks = ','.join([str(x) for x in rankDict[refID]])

        newLine = [geneName, refID, ','.join(proxEnhancers), enhancerRanks]
        geneToEnhancerTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, geneToEnhancerTable
    else:
        # order every data row (skip only the header) by enhancer rank
        tmp = []
        for line in enhancerToGeneTable[1:]:
            if ',' in line[-1]:
                tmp.append(int(line[-1].split(',')[0]))
            else:
                tmp.append(int(line[-1]))
        enhancerOrder = ROSE_utils.order(tmp)

        sortedTable = [enhancerToGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i + 1)])

        return sortedTable, geneToEnhancerTable
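# Hedged usage sketch (this wrapper does not exist in the original code):
# mapEnhancerToGene() returns the enhancer-to-gene and gene-to-enhancer tables;
# a caller would typically write both out with ROSE_utils.unParseTable.
# Output file names below are placeholders.
def writeEnhancerGeneMaps(annotFile, enhancerFile, outFolder):
    '''map enhancers to genes and write both result tables to outFolder'''
    enhancerToGeneTable, geneToEnhancerTable = mapEnhancerToGene(
        annotFile, enhancerFile, uniqueGenes=True, searchWindow=50000)
    ROSE_utils.unParseTable(enhancerToGeneTable, outFolder + 'ENHANCER_TO_GENE.txt', '\t')
    ROSE_utils.unParseTable(geneToEnhancerTable, outFolder + 'GENE_TO_ENHANCER.txt', '\t')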