def splitRegions(inputGFF, tssCollection):
    '''
    Splits every gff region at the boundaries of the TSS loci it overlaps.

    For each line a locus is built from columns 0, 3 and 4 and looked up in
    tssCollection.

    - No overlap: column 7 of the line is set to 0 (the input line itself is
      modified) and the line is passed through unchanged otherwise.
    - Overlap: the distinct coordinates of the region and of all overlapping
      TSS loci are sorted and each pair of neighbouring coordinates defines a
      candidate piece. Pieces with stop - start < 50, or pieces that do not
      overlap the original region, are dropped. Every kept piece becomes a
      new 9 column line whose ID is '<regionID>_<n>' and whose column 7 is 1
      if the piece overlaps one of the TSS loci and 0 otherwise.

    Returns the list of resulting gff lines.
    '''
    result = []
    for gffLine in inputGFF:
        chromosome, parentID = gffLine[0], gffLine[1]
        parentLocus = utils.Locus(gffLine[0], gffLine[3], gffLine[4], '.')
        tssHits = tssCollection.getOverlap(parentLocus)

        # regions without any tss overlap are kept whole and flagged with 0
        if len(tssHits) == 0:
            gffLine[7] = 0
            result.append(gffLine)
            continue

        # only the overlapping tss loci are needed to flag the pieces
        hitCollection = utils.LocusCollection(tssHits, 50)

        # gather every boundary coordinate of the region and its tss hits
        boundaries = parentLocus.coords()
        for hit in tssHits:
            boundaries += hit.coords()
        boundaries = utils.uniquify(boundaries)
        boundaries.sort()
        # the last coordinate is pushed out by one so that the final piece
        # ends on the original last coordinate after the -1 below
        boundaries[-1] += 1

        pieceNumber = 1
        for idx in range(len(boundaries) - 1):
            left = int(boundaries[idx])
            right = int(boundaries[idx + 1]) - 1

            # this eliminates really tiny regions
            if (right - left) < 50:
                continue

            piece = utils.Locus(chromosome, left + 1, right, '.')
            # pieces lying outside of the original region are not reported
            if not parentLocus.overlaps(piece):
                continue

            pieceID = '%s_%s' % (parentID, pieceNumber)
            hasTSS = 1 if hitCollection.getOverlap(piece) else 0
            result.append([chromosome, pieceID, pieceID, left, right, '', '.', hasTSS, pieceID])
            pieceNumber += 1

    return result
def makeBedCollection(bedFileList):
    '''
    Builds a single LocusCollection out of every usable row of the bed files.

    A row is used when it has at least three columns, its first column
    starts with 'chr' and columns 1 and 2 parse as integers. The smaller of
    the two coordinates becomes the start, the larger one the end. Each locus
    gets the bed file name (last path component, text before the first '.')
    as its ID.
    '''
    allLoci = []
    print("MAKING BED COLLECTION FOR:")

    for bedPath in bedFileList:
        sourceName = bedPath.split('/')[-1].split('.')[0]
        print(sourceName)

        for row in utils.parseTable(bedPath, '\t'):
            # skip short rows and anything not starting with a chromosome
            if len(row) < 3 or row[0][0:3] != 'chr':
                continue
            try:
                first, second = int(row[1]), int(row[2])
                allLoci.append(
                    utils.Locus(row[0], min(first, second), max(first, second), '.', sourceName))
            except ValueError:
                # rows that raise a ValueError (e.g. non integer coordinates)
                # are silently left out
                pass

    print("IDENTIFIED %s BED REGIONS" % (len(allLoci)))
    return utils.LocusCollection(allLoci, 50)
def findValleys(TFtoEnhancerDict, bamFile, projectName, projectFolder, cutoff = 0.2):
    '''
    Finds valleys in the enhancer regions of every gene and writes them to a bed file.

    TFtoEnhancerDict maps a gene to a list of region loci. Every region is
    scored with scoreValley(); each score above cutoff marks a 10 unit wide
    window (window i starts at region.start() + i*10). The windows of a gene
    are merged with stitchValleys() and written as chrom/start/end rows to
    <projectFolder><projectName>_valleys.bed.

    Returns the path of the bed file that was written.
    '''
    # single argument print() behaves the same under python 2 and python 3
    print('IDENTIFYING VALLEYS IN SUPER ENHANCERS')

    valleyBED = []
    valleyDict = {}

    for gene in TFtoEnhancerDict:
        print(gene)

        # collect every above-cutoff window over all regions of this gene
        geneValleys = []
        for region in TFtoEnhancerDict[gene]:
            scoreArray = scoreValley(region, bamFile, projectName, projectFolder)
            for index, score in enumerate(scoreArray):
                if score > cutoff:
                    geneValleys.append(
                        utils.Locus(region.chr(),
                                    region.start() + index*10,
                                    region.start() + (index+1)*10,
                                    '.'))

        # merge the windows of this gene before writing them out
        stitchedValleys = stitchValleys(geneValleys)
        for valley in stitchedValleys:
            valleyBED.append([valley.chr(), valley.start(), valley.end()])
        valleyDict[gene] = stitchedValleys

    bedfilename = projectFolder + projectName + '_valleys.bed'
    utils.unParseTable(valleyBED, bedfilename, '\t')
    print(bedfilename)

    return bedfilename
def main(projectFolder='/storage/goodell/projects/jmreyes/amish_ayala/'):
    '''
    Compares WT and Mut canyons and writes out the overlapping pairs that
    differ in length.

    Reads <projectFolder>bed/canyon_WT_sizeSelected.bed and
    <projectFolder>bed/canyon_Mut_sizeSelected.bed. For every WT canyon and
    every Mut canyon overlapping it, one row is produced:
    [wt chrom, wt start, wt end, wt length, mut chrom, mut start, mut end, mut length]
    where length is end - start.

    Rows where the Mut canyon is longer go to tables/MUT_canyons_expanded.txt,
    rows where the WT canyon is longer go to tables/WT_canyons_expanded.txt.
    Pairs of equal length are written to neither table.

    projectFolder: folder holding the bed/ and tables/ sub folders, must end
    with a '/'. Defaults to the original project location.
    '''
    bedFolder = projectFolder + 'bed/'
    wtCanyonBed = bedFolder + 'canyon_WT_sizeSelected.bed'
    mutCanyonBed = bedFolder + 'canyon_Mut_sizeSelected.bed'

    def loadCanyons(bedPath, prefix):
        # one locus per bed row, ID is '<prefix>_<chrom>:<start>-<end>'
        return utils.LocusCollection([
            utils.Locus(x[0], x[1], x[2], '.',
                        prefix + '_' + str(x[0]) + ':' + str(x[1]) + '-' + str(x[2]))
            for x in utils.parseTable(bedPath, '\t')
        ])

    wtCanyonLocusCollection = loadCanyons(wtCanyonBed, 'wt')
    mutCanyonLocusCollection = loadCanyons(mutCanyonBed, 'mut')

    wtExpansion = []
    mutExpansion = []

    for locus in wtCanyonLocusCollection.getLoci():
        wtLength = locus.end() - locus.start()
        for overlap in mutCanyonLocusCollection.getOverlap(locus, 'both'):
            mutLength = overlap.end() - overlap.start()
            newLine = [locus.chr(), locus.start(), locus.end(), wtLength,
                       overlap.chr(), overlap.start(), overlap.end(), mutLength]
            if mutLength > wtLength:
                mutExpansion.append(newLine)
            elif wtLength > mutLength:
                wtExpansion.append(newLine)

    # single argument print() behaves the same under python 2 and python 3
    print(len(mutExpansion))
    print(len(wtExpansion))

    utils.unParseTable(mutExpansion, projectFolder + 'tables/MUT_canyons_expanded.txt', '\t')
    utils.unParseTable(wtExpansion, projectFolder + 'tables/WT_canyons_expanded.txt', '\t')
def assignEnhancerRank(enhancerToGeneFile, enhancerFile1, enhancerFile2, name1, name2, rankOutput=''):
    '''
    Adds the best overlapping enhancer rank from two enhancer tables to every
    row of the enhancerToGene table.

    The header row gets two extra columns '<name1>_rank' and '<name2>_rank'.
    Every other row is turned into a locus (columns 1-3, ID from column 0);
    its rank for a dataset is the smallest 'rank' among the overlapping
    enhancers of that dataset, or the size of the dataset's enhancer
    collection if nothing overlaps (i.e. dead last).

    If rankOutput is empty the updated table is returned, otherwise the table
    is written to rankOutput and nothing is returned.
    '''
    table = utils.parseTable(enhancerToGeneFile, '\t')

    collectionA = makeSECollection(enhancerFile1, name1, False)
    collectionB = makeSECollection(enhancerFile2, name2, False)
    rankLookupA = makeSEDict(enhancerFile1, name1, False)
    rankLookupB = makeSEDict(enhancerFile2, name2, False)

    def bestRank(locus, collection, rankLookup):
        # lowest rank among overlapping enhancers, collection size if none
        hits = collection.getOverlap(locus, 'both')
        if len(hits) == 0:
            return len(collection)
        return min([rankLookup[hit.ID()]['rank'] for hit in hits])

    # extend the header, then every data row in place
    table[0] += ['%s_rank' % name1, '%s_rank' % name2]
    for row in table[1:]:
        rowLocus = utils.Locus(row[1], row[2], row[3], '.', row[0])
        rankA = bestRank(rowLocus, collectionA, rankLookupA)
        rankB = bestRank(rowLocus, collectionB, rankLookupB)
        row.extend([rankA, rankB])

    if len(rankOutput) != 0:
        utils.unParseTable(table, rankOutput, '\t')
        return None
    return table
def generateSubpeakFASTA(TFtoEnhancerDict, subpeaks, genomeDirectory, projectName, projectFolder, constExtension):
    '''
    From a bed file of constituents, writes a bed and a fasta file of the
    constituents contained within the candidate super enhancers.

    TFtoEnhancerDict maps a gene to a list of region loci. For every region
    the overlapping subpeaks are extended by constExtension on both sides
    (utils.makeSearchLocus) and stitched together.

    Outputs:
    - <projectFolder><projectName>_subpeaks.bed : a track line followed by
      chrom/start/end of every stitched subpeak
    - <projectFolder><projectName>_SUBPEAKS.fa : for every stitched subpeak a
      '>gene|chrom|start|end' title line followed by the upper cased sequence
      fetched for start+1 .. end+1

    Nothing is returned.
    '''
    subpeakDict = {}
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]

    subpeakTable = utils.parseTable(subpeaks, '\t')
    subpeakLoci = [utils.Locus(l[0], int(l[1]), int(l[2]), '.') for l in subpeakTable]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    for gene in TFtoEnhancerDict:
        subpeakDict[gene] = []
        for region in TFtoEnhancerDict[gene]:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [utils.makeSearchLocus(x, constExtension, constExtension)
                                for x in overlaps]

            # extending can make neighbouring subpeaks touch, so stitch them
            overlapCollectionTemp = utils.LocusCollection(extendedOverlaps, 50)
            overlapCollection = overlapCollectionTemp.stitchCollection()
            for overlap in overlapCollection.getLoci():
                subpeakBED.append([overlap.chr(), overlap.start(), overlap.end()])
                subpeakDict[gene].append(overlap)

    bedfilename = projectFolder + projectName + '_subpeaks.bed'
    utils.unParseTable(subpeakBED, bedfilename, '\t')

    fasta = []
    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:
            fastaTitle = gene + '|' + subpeak.chr() + '|' + str(subpeak.start()) + '|' + str(subpeak.end())
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(),
                                       int(subpeak.start() + 1), int(subpeak.end() + 1))
            fasta.append('>' + fastaTitle)
            # the str method is used instead of the python 2 only string.upper
            fasta.append(fastaLine.upper())

    outname = projectFolder + projectName + '_SUBPEAKS.fa'
    utils.unParseTable(fasta, outname, '')
def mapGFFLineToBed(gffLine, outFolder, nBins, bedCollection, header=''):
    '''
    Writes, for one gff line, the tables of rectangles and names needed to
    draw the bed regions overlapping it.

    Coordinates are expressed as distance from the reference point of the
    gff region (column 4 for '-' strand lines, column 3 otherwise) scaled by
    nBins / region length. Every bed source (locus ID) gets its own row,
    two units below the previous one, in sorted ID order.

    Outputs, both tab separated and both starting with an all-zero/empty
    first row:
    - <outFolder><name>_bedDiagramTemp.txt : [start, bottom, stop, top]
    - <outFolder><name>_bedNameTemp.txt    : [name, 0, row position]
    where <name> is header if given, else 'chrom_sense_start_end'.
    '''
    if len(header) != 0:
        gffString = header
    else:
        gffString = '%s_%s_%s_%s' % (gffLine[0], gffLine[6], gffLine[3], gffLine[4])

    regionStart = int(gffLine[3])
    regionEnd = int(gffLine[4])
    gffLocus = utils.Locus(gffLine[0], regionStart, regionEnd, gffLine[6], gffLine[1])
    scaleFactor = float(nBins) / gffLocus.len()

    overlapLoci = bedCollection.getOverlap(gffLocus, sense='both')
    print("IDENTIFIED %s OVERLAPPING BED LOCI FOR REGION %s" % (len(overlapLoci), gffLine))

    # since beds come from multiple sources, each source name gets its own
    # vertical offset
    bedNames = sorted(utils.uniquify([locus.ID() for locus in overlapLoci]))
    offsetDict = dict((bedName, 2 * rank) for rank, bedName in enumerate(bedNames))

    # distances are measured from the 5' end of the region
    refPoint = regionEnd if gffLine[6] == '-' else regionStart

    nameTable = [['', 0, 0]]
    nameTable.extend([bedName, 0, 0.0 - offsetDict[bedName]] for bedName in bedNames)

    diagramTable = [[0, 0, 0, 0]]
    for bedLocus in overlapLoci:
        offset = offsetDict[bedLocus.ID()]
        start, stop = [abs(coord - refPoint) * scaleFactor for coord in bedLocus.coords()]
        diagramTable.append([start, -0.5 - offset, stop, 0.5 - offset])

    utils.unParseTable(diagramTable, outFolder + gffString + '_bedDiagramTemp.txt', '\t')
    utils.unParseTable(nameTable, outFolder + gffString + '_bedNameTemp.txt', '\t')
def findValleys(gene_to_enhancer_dict, bamFileList, projectName, projectFolder, cutoff=0.2):
    '''
    Finds valleys in the enhancer regions of every gene and writes them to a bed file.

    gene_to_enhancer_dict maps a gene to a list of region loci. Genes are
    processed in sorted order. Every region is scored with scoreValley()
    using all bams in bamFileList; each score above cutoff marks a 10 unit
    wide window (window i starts at region.start() + i * 10). The windows of
    a gene are merged with stitchValleys() and written as chrom/start/end
    rows to <projectFolder><projectName>_all_valleys.bed.

    Returns the path of the bed file that was written.
    '''
    all_valley_bed = []
    valleyDict = {}

    # start w/ a bamFileList and make a list of bam type objects
    bam_list = [utils.Bam(bam_path) for bam_path in bamFileList]
    max_read_length = max([bam.getReadLengths()[0] for bam in bam_list])

    # sorted() works on python 2 and python 3, dict.keys() followed by
    # .sort() only works on python 2
    gene_list = sorted(gene_to_enhancer_dict)

    ticker = 0
    print('number of regions processed:')
    for gene in gene_list:
        gene_valleys = []
        for region in gene_to_enhancer_dict[gene]:
            # progress report every 100 regions
            if ticker % 100 == 0:
                print(ticker)
            ticker += 1

            scoreArray = scoreValley(region, bam_list, max_read_length, projectName, projectFolder)
            for index, score in enumerate(scoreArray):
                if score > cutoff:
                    gene_valleys.append(
                        utils.Locus(region.chr(),
                                    region.start() + index * 10,
                                    region.start() + (index + 1) * 10,
                                    '.'))

        # merge the windows of this gene before writing them out
        stitchedValleys = stitchValleys(gene_valleys)
        for valley in stitchedValleys:
            all_valley_bed.append([valley.chr(), valley.start(), valley.end()])
        valleyDict[gene] = stitchedValleys

    all_bed_path = projectFolder + projectName + '_all_valleys.bed'
    utils.unParseTable(all_valley_bed, all_bed_path, '\t')

    return all_bed_path
def createSuperLoci(superTable, Enumber='super'):
    '''
    Takes a parsed ROSE SuperEnhancer table and returns a list of loci.

    The first six rows of superTable are skipped.
    - Enumber == 'super': every remaining row whose last column is '1'
      is turned into a locus.
    - otherwise: the first int(Enumber) remaining rows are turned into loci.

    Each locus is built from columns 1-3 (chrom, start, end), gets column 0
    as its ID and float(column 6) - float(column 7) as its score.
    '''
    # single argument print() behaves the same under python 2 and python 3
    print('CREATING SUPER-ENHANCER LOCUS COLLECTION')

    if Enumber == 'super':
        selected = [line for line in superTable[6:] if line[-1] == '1']
    else:
        selected = superTable[6:6 + int(Enumber)]

    return [
        utils.Locus(line[1], line[2], line[3], '.', line[0],
                    (float(line[6]) - float(line[7])))
        for line in selected
    ]
def makePeakGFFs(peak_path_list):
    '''
    Makes a stitched gff for all MYC bound TSS and distal regions across all
    datasets.

    Every table in peak_path_list is read (first row skipped). A row becomes
    a locus from columns 1-3; it counts as distal when int(column 5) is 0 and
    as TSS otherwise. The TSS loci and the distal loci are each stitched and
    written as gffs into gffFolder (a module level name).

    If both output files already exist (utils.checkOutput) nothing is
    recomputed.

    Returns (tss_gff_path, distal_gff_path).
    '''
    # setting the output
    tss_gff_path = '%sHG19_MYC_TSS_REGIONS_-0_+0.gff' % (gffFolder)
    distal_gff_path = '%sHG19_MYC_DISTAL_REGIONS_-0_+0.gff' % (gffFolder)

    # check to see if already done
    if utils.checkOutput(tss_gff_path, 0.1, 0.1) and utils.checkOutput(distal_gff_path, 0.1, 0.1):
        print('OUTPUT FOUND AT %s and %s' % (tss_gff_path, distal_gff_path))
        return tss_gff_path, distal_gff_path

    # empty loci lists to hold everything
    tss_loci = []
    distal_loci = []

    for peak_path in peak_path_list:
        print('processing %s' % (peak_path))
        peak_table = utils.parseTable(peak_path, '\t')
        for line in peak_table[1:]:
            peak_locus = utils.Locus(line[1], line[2], line[3], '.')
            if int(line[5]) == 0:
                distal_loci.append(peak_locus)
            else:
                tss_loci.append(peak_locus)

    # now combine the loci
    print('stitching loci')
    distal_collection = utils.LocusCollection(distal_loci, 50)
    tss_collection = utils.LocusCollection(tss_loci, 50)

    stitched_distal_collection = distal_collection.stitchCollection()
    stitched_tss_collection = tss_collection.stitchCollection()

    # the gffs have to be made from the stitched collections, otherwise the
    # stitching above has no effect on the output
    distal_gff = utils.locusCollectionToGFF(stitched_distal_collection)
    tss_gff = utils.locusCollectionToGFF(stitched_tss_collection)

    # now write to disk
    utils.unParseTable(distal_gff, distal_gff_path, '\t')
    utils.unParseTable(tss_gff, tss_gff_path, '\t')

    return tss_gff_path, distal_gff_path
def make_mycn_regions(conserved_rank_path):
    '''
    Takes the conserved NB MYCN region table and writes a bed and a gff of
    the regions, plus a bed and a gff of the regions flanked by 500 on each
    side.

    The first row of the table is skipped. Column 0 is used as region ID,
    columns 1-3 as chrom, start and end. Rows whose end is smaller than
    their start are reported on stdout but still written.

    Files go to gffFolder and bedFolder (module level names).

    Returns (gff path, flanked gff path).
    '''
    rank_rows = utils.parseTable(conserved_rank_path, '\t')[1:]

    core_gff, flank_gff = [], []
    core_bed, flank_bed = [], []

    for row in rank_rows:
        region_id, chrom, start, end = row[0], row[1], row[2], row[3]

        # NOTE(review): this locus is built but never used below, presumably
        # a leftover; confirm before removing the call
        utils.Locus(chrom, start, end, '.')

        if int(end) < int(start):
            print('uh oh')
            print(row)

        core_gff.append([chrom, region_id, '', start, end, '', '.', '', region_id])
        core_bed.append([chrom, start, end, region_id])

        flank_start = int(start) - 500
        flank_end = int(end) + 500
        flank_gff.append([chrom, region_id, '', flank_start, flank_end, '', '.', '', region_id])
        flank_bed.append([chrom, flank_start, flank_end, region_id])

    mycn_gff_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    mycn_flank_gff_path = '%sHG19_NB_MYCN_CONSERVED_-500_+500.gff' % (gffFolder)
    mycn_bed_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.bed' % (bedFolder)
    mycn_flank_bed_path = '%sHG19_NB_MYCN_CONSERVED_-500_+500.bed' % (bedFolder)

    # writing to disk
    outputs = [
        (core_gff, mycn_gff_path),
        (flank_gff, mycn_flank_gff_path),
        (core_bed, mycn_bed_path),
        (flank_bed, mycn_flank_bed_path),
    ]
    for table, path in outputs:
        utils.unParseTable(table, path, '\t')
    for table, path in outputs:
        print(path)

    return mycn_gff_path, mycn_flank_gff_path
def mapBamToGFFLine(bamFile, MMR, name, gffLine, color, nBins, sense='both', extension=200):
    '''
    Maps reads from a bam to one gff line and returns the binned density.

    bamFile   : path of the bam, its last path component is reported in the output
    MMR       : scaling denominator applied to every bin
    name      : label copied into the output line
    gffLine   : gff line, columns 0, 1, 3, 4 and 6 are used
    color     : list that is appended after the four leading output columns
    nBins     : number of bins; expected to be an integer (it is used with
                floor division and passed to bamliquidator) -- TODO confirm
                against callers
    sense     : '+' uses the strand of the gff line, '-' the opposite strand,
                anything else both strands
    extension : passed through to bamliquidator

    Returns [bam name, region ID, name, region string] + color + densities.
    The densities are the bamliquidator values divided by bin size and MMR,
    rounded to 4 digits and reversed for '-' strand regions. If the region
    is too short to give bins of at least size 1 the densities are 'NA'.

    The bamliquidator executable is taken from the module level name
    bamliquidatorString.
    '''
    print('using a MMR/scaling denominator value of %s' % (MMR))

    line = gffLine[0:9]
    gffLocus = utils.Locus(line[0], int(line[3]), int(line[4]), line[6], line[1])

    # setting up the output clusterline
    bamName = bamFile.split('/')[-1]
    clusterLine = [bamName, gffLocus.ID(), name, str(gffLocus)] + color

    # floor division keeps the bin size an integer on python 3 as well, so
    # that the too-short check below keeps working
    binSize = gffLocus.len() // nBins

    # some regions will be too short to get info on
    # we just kick these back and abandon them
    if binSize == 0:
        clusterLine += ['NA'] * int(nBins)
        return clusterLine

    # flippy flip if sense is negative
    # a plain dict replaces string.maketrans/string.translate, which only
    # exist on python 2
    strandFlip = {'-': '+', '+': '-', '.': '+'}
    if sense == '-':
        bamSense = ''.join(strandFlip.get(ch, ch) for ch in gffLocus.sense())
    elif sense == '+':
        bamSense = gffLocus.sense()
    else:
        bamSense = '.'

    # using the bamLiquidator to get the readstring
    bamCommand = "%s %s %s %s %s %s %s %s" % (
        bamliquidatorString, bamFile, gffLocus.chr(), gffLocus.start(),
        gffLocus.end(), bamSense, nBins, extension)
    getReads = subprocess.Popen(bamCommand,
                                stdin=subprocess.PIPE,
                                stderr=subprocess.PIPE,
                                stdout=subprocess.PIPE,
                                shell=True)
    rawOutput = getReads.communicate()[0]
    # on python 3 the pipe gives bytes, on python 2 it already is a str
    if not isinstance(rawOutput, str):
        rawOutput = rawOutput.decode()
    denList = rawOutput.split('\n')[:-1]

    # flip the denList if the actual gff region is -
    if gffLocus.sense() == '-':
        denList = denList[::-1]

    # converting from units of total bp of read sequence per bin to rpm/bp
    denList = [round(float(x) / binSize / MMR, 4) for x in denList]

    clusterLine += denList
    return clusterLine
def generateSubpeakFASTA(TFandSuperDict, subpeaks, genomeDirectory, projectName, projectFolder, motifExtension):
    '''
    Takes as input a bed file of constituents and writes a bed and a fasta
    file of the merged, extended super-enhancer constituents.

    TFandSuperDict maps a gene to a list of region loci. For every region
    the overlapping subpeaks are extended by motifExtension on both sides
    (utils.makeSearchLocus) and stitched together.

    Outputs:
    - <projectFolder><projectName>_subpeaks.bed : a track line followed by
      chrom/start/end of every stitched subpeak
    - <projectFolder><projectName>_SUBPEAKS.fa : for every stitched subpeak a
      '>gene|chrom|start|end' title line followed by the upper cased sequence
      fetched for start+1 .. end+1

    Nothing is returned.
    '''
    # single argument print() behaves the same under python 2 and python 3
    print('MAKE FASTA')

    subpeakDict = {}
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]

    subpeakTable = utils.parseTable(subpeaks, '\t')
    subpeakLoci = [utils.Locus(l[0], int(l[1]), int(l[2]), '.') for l in subpeakTable]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    for gene in TFandSuperDict:
        subpeakDict[gene] = []
        for region in TFandSuperDict[gene]:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [utils.makeSearchLocus(x, motifExtension, motifExtension)
                                for x in overlaps]

            # extending can make neighbouring subpeaks touch, so stitch them
            overlapCollectionTemp = utils.LocusCollection(extendedOverlaps, 50)
            overlapCollection = overlapCollectionTemp.stitchCollection()
            for overlap in overlapCollection.getLoci():
                subpeakBED.append([overlap.chr(), overlap.start(), overlap.end()])
                subpeakDict[gene].append(overlap)

    bedfilename = projectFolder + projectName + '_subpeaks.bed'
    utils.unParseTable(subpeakBED, bedfilename, '\t')

    fasta = []
    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:
            fastaTitle = gene + '|' + subpeak.chr() + '|' + str(subpeak.start()) + '|' + str(subpeak.end())
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(),
                                       int(subpeak.start()+1), int(subpeak.end()+1))
            fasta.append('>' + fastaTitle)
            # the str method is used instead of the python 2 only string.upper
            fasta.append(fastaLine.upper())

    # Output the fasta file of extended SE constituents
    outname = projectFolder + projectName + '_SUBPEAKS.fa'
    utils.unParseTable(fasta, outname, '')
def makeSECollection(enhancerFile, name, superOnly=True):
    '''
    Returns a locus collection built from an enhancer table.

    Rows whose first column starts with '#' or 'R' are skipped. Every other
    row becomes a locus from columns 1-3 with the ID '<name>_<column 0>'.

    If superOnly is set, reading stops at the first row whose last column
    is 0, so only the rows before it end up in the collection.
    '''
    loci = []
    for row in utils.parseTable(enhancerFile, '\t'):
        # comment and header rows
        if row[0][0] in ('#', 'R'):
            continue
        # first non-super row ends the table when only supers are wanted
        if superOnly and int(row[-1]) == 0:
            break
        loci.append(utils.Locus(row[1], row[2], row[3], '.', name + '_' + row[0]))

    return utils.LocusCollection(loci, 50)
def makeSECollection(enhancerFile, name, top=0):
    '''
    Returns a locus collection built from an enhancer table.

    Rows whose first column starts with '#' or 'R' are skipped. Every other
    row becomes a locus from columns 1-3 with the ID '<name>_<column 0>'.

    top gives the number of rows to take; reading stops once that many
    rows were added. With the default of 0 all rows are taken.
    '''
    loci = []
    taken = 0
    for row in utils.parseTable(enhancerFile, '\t'):
        # comment and header rows
        if row[0][0] in ('#', 'R'):
            continue
        loci.append(utils.Locus(row[1], row[2], row[3], '.', name + '_' + row[0]))
        taken += 1
        if taken == top:
            break

    return utils.LocusCollection(loci, 50)
def mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, output, refName):
    '''
    Makes a table of signal per stitched locus, ordered by locus length
    (longest first), and writes it to output.

    Columns: REGION_ID, CHROM, START, STOP, NUM_LOCI, CONSTITUENT_SIZE and
    one signal column per bam in bamFileList.

    - loci on chrY are left out
    - NUM_LOCI is the integer before the first '_' of the locus ID, or 1 if
      that is not an integer
    - CONSTITUENT_SIZE is the summed length of the reference loci that
      overlap the stitched locus
    - the signal of a bam is read from
      <mappedFolder><refName>_<bam file name>_MAPPED/matrix.txt (first row
      skipped): each row contributes column 2 times the absolute region
      width parsed from column 1, and a stitched locus gets the sum over
      all mapped regions overlapping it
    '''
    print('FORMATTING TABLE')
    loci = stitchedCollection.getLoci()

    locusTable = [['REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE']]

    # strip out any that are in chrY
    for candidate in list(loci):
        if candidate.chr() == 'chrY':
            loci.remove(candidate)

    lenOrder = utils.order([locus.len() for locus in loci], decreasing=True)

    for ticker, position in enumerate(lenOrder, 1):
        if ticker % 1000 == 0:
            print(ticker)

        locus = loci[position]

        # First get the size of the enriched regions within the stitched locus
        refEnrichSize = 0
        for refLocus in referenceCollection.getOverlap(locus, 'both'):
            refEnrichSize += refLocus.len()

        try:
            stitchCount = int(locus.ID().split('_')[0])
        except ValueError:
            stitchCount = 1

        coords = [int(x) for x in locus.coords()]
        locusTable.append([locus.ID(), locus.chr(), min(coords), max(coords),
                           stitchCount, refEnrichSize])

    print('GETTING MAPPED DATA')
    print("USING A BAMFILE LIST:")
    print(bamFileList)

    for bamFile in bamFileList:
        bamFileName = bamFile.split('/')[-1]
        print('GETTING MAPPING DATA FOR %s' % bamFile)

        # assumes standard convention for naming enriched region gffs
        matrixPath = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, refName, bamFileName)
        print('OPENING %s' % matrixPath)
        mappedGFF = utils.parseTable(matrixPath, '\t')

        signalDict = defaultdict(float)
        print('MAKING SIGNAL DICT FOR %s' % (bamFile))

        mappedLoci = []
        for row in mappedGFF[1:]:
            # column 1 is parsed as 'chrom(...):start-end'
            chrom = row[1].split('(')[0]
            span = row[1].split(':')[-1].split('-')
            start = int(span[0])
            end = int(span[1])
            mappedLoci.append(utils.Locus(chrom, start, end, '.', row[0]))
            try:
                signalDict[row[0]] = float(row[2]) * (abs(end - start))
            except ValueError:
                # regions without a usable signal value keep the default of 0.0
                print('WARNING NO SIGNAL FOR LINE:')
                print(row)

        mappedCollection = utils.LocusCollection(mappedLoci, 500)

        locusTable[0].append(bamFileName)
        for tableRow in locusTable[1:]:
            rowLocus = utils.Locus(tableRow[1], tableRow[2], tableRow[3], '.')
            signal = 0.0
            for region in mappedCollection.getOverlap(rowLocus, sense='both'):
                signal += signalDict[region.ID()]
            tableRow.append(signal)

    utils.unParseTable(locusTable, output, '\t')
def mapEnhancerToGene(annotFile,enhancerFile,transcribedFile='',uniqueGenes=True,searchWindow =50000,noFormatTable = False):
    '''
    Maps genes to enhancers and returns (enhancerToGeneTable, geneToEnhancerTable).

    annotFile       : annotation file used to build the start dict and the
                      transcript collection
    enhancerFile    : enhancer table; rows whose first column starts with
                      '#' or 'R' are skipped, columns 1-3 are chrom, start
                      and end, the last two columns are read as rank and
                      super status
    transcribedFile : optional table whose column 1 lists the transcribed
                      gene IDs; if empty, all genes of the annotation are used
    uniqueGenes     : if True, each gene name appears only once in the gene
                      table (the entry of its best ranked refseq is kept)
    searchWindow    : distance around the enhancer in which a TSS counts as
                      proximal
    noFormatTable   : if True, enhancer rows are copied completely and the
                      enhancer table is returned in input order; otherwise
                      only the first nine columns plus the last two are kept
                      and the table is sorted by rank

    For every enhancer three gene lists are collected: overlapping
    (transcript overlaps the enhancer), proximal (TSS within searchWindow,
    not already overlapping) and distal (TSS within 1000000). The closest
    gene is the one among all three lists whose start is nearest to the
    enhancer center.
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerTable = utils.parseTable(enhancerFile,'\t')

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile,'\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = list(startDict.keys())

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(annotFile,0,0,500,transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = [utils.makeTSSLocus(geneID,startDict,0,0) for geneID in transcribedGenes]

    #this turns the tssLoci list into a LocusCollection
    #50 is the internal parameter for LocusCollection and doesn't really matter
    tssCollection = utils.LocusCollection(tssLoci,50)

    geneDict = {'overlapping':defaultdict(list),'proximal':defaultdict(list)}

    #dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    #list of all genes that appear in this analysis
    overallGeneList = []

    #set up the output tables
    #first by enhancer
    geneColumns = ['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE']
    if noFormatTable:
        enhancerToGeneTable = [enhancerTable[0] + geneColumns]
    else:
        enhancerToGeneTable = [enhancerTable[0][0:9] + geneColumns + enhancerTable[5][-2:]]

    #next by gene
    geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS','ENHANCER_RANKS','IS_SUPER']]

    def geneNameString(refIDs):
        #comma separated unique gene names for a list of refseq IDs
        #str.join is used instead of the python 2 only string.join
        return ','.join(utils.uniquify([startDict[x]['name'] for x in refIDs]))

    for line in enhancerTable:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1],line[2],line[3])
        enhancerLocus = utils.Locus(line[1],line[2],line[3],'.',line[0])

        #overlapping genes are transcribed genes whose transcript is directly in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(enhancerLocus,'both')
        overlappingGenes = [locus.ID() for locus in overlappingLoci]

        #proximalGenes are transcribed genes where the tss is within searchWindow of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus,searchWindow,searchWindow),'both')
        proximalGenes = [locus.ID() for locus in proximalLoci]

        distalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus,1000000,1000000),'both')
        distalGenes = [locus.ID() for locus in distalLoci]

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)

        #the candidates for the closest gene, in overlapping/proximal/distal order
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes

        #a gene that overlaps is not reported as proximal as well
        #technically it is possible for a gene to be overlapping, but not proximal since the
        #gene could be longer than the search window, but we'll let that slide here
        overlappingSet = set(overlappingGenes)
        proximalGenes = [refID for refID in proximalGenes if refID not in overlappingSet]

        #Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            #get enhancerCenter
            #floor division keeps the center an integer on python 3 as well
            enhancerCenter = (int(line[2]) + int(line[3])) // 2

            #get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes]
            #get the ID and convert to name
            closestGene = startDict[allEnhancerGenes[distList.index(min(distList))]]['name']

        #NOW WRITE THE ROW FOR THE ENHANCER TABLE
        if noFormatTable:
            newEnhancerLine = list(line)
        else:
            newEnhancerLine = line[0:9]
        newEnhancerLine.append(geneNameString(overlappingGenes))
        newEnhancerLine.append(geneNameString(proximalGenes))
        newEnhancerLine.append(closestGene)
        if not noFormatTable:
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)

        #Now grab all overlapping and proximal genes for the gene ordered table
        enhancerRank = int(line[-2])
        enhancerSuper = int(line[-1])

        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(enhancerRank)
            superDict[refID].append(enhancerSuper)

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(enhancerRank)
            superDict[refID].append(enhancerSuper)

    #End loop through

    #Make table by gene
    overallGeneList = utils.uniquify(overallGeneList)

    #use enhancer rank to order
    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = set()
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if geneName in usedNames and uniqueGenes == True:
            continue
        usedNames.add(geneName)

        proxEnhancers = geneDict['overlapping'][refID] + geneDict['proximal'][refID]
        superStatus = max(superDict[refID])
        enhancerRanks = ','.join([str(x) for x in rankDict[refID]])

        geneToEnhancerTable.append([geneName,refID,','.join(proxEnhancers),enhancerRanks,superStatus])

    #resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable,geneToEnhancerTable

    enhancerOrder = utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]])
    sortedTable = [enhancerToGeneTable[0]]
    for i in enhancerOrder:
        sortedTable.append(enhancerToGeneTable[(i+1)])

    return sortedTable,geneToEnhancerTable
def mapBamToGFF(bamFile, gff, sense='.', extension=200, rpm=False, clusterGram=None, matrix=None):
    '''
    Maps reads from a bam to every line of a gff and returns a table of
    binned densities.

    bamFile     : path of the bam
    gff         : gff table or path of a tab separated gff file; columns
                  0, 1, 3, 4 and 6 of each line are used
    sense       : '+' uses the strand of the gff line, '-' the opposite
                  strand, anything else both strands
    extension   : passed through to bamliquidator
    rpm         : if True, densities are divided by millions of mapped reads
    clusterGram : fixed bin size; the number of bins depends on the region
    matrix      : fixed number of bins; the bin size depends on the region.
                  If both clusterGram and matrix are given, matrix decides
                  the binning of the data rows.

    Returns a list of rows. It starts with one header row per mode that was
    requested, followed by one row per gff line:
    [region ID, region string] + densities. Densities are the bamliquidator
    values divided by bin size and MMR, rounded to 4 digits and reversed for
    '-' strand regions. Regions too short for a bin size of at least 1 get
    'NA' for every bin.

    Raises ValueError if bamliquidator is found neither at
    /usr/bin/bamliquidator nor at ./bamliquidator, or if a gff line has to
    be processed while neither clusterGram nor matrix is given.
    '''
    #creating a new gff to output
    newGFF = []

    #reading in the bam
    bam = utils.Bam(bamFile)

    #getting RPM normalization
    if rpm:
        MMR = round(float(bam.getTotalReads('mapped')) / 1000000, 4)
    else:
        MMR = 1

    print('using a MMR value of %s' % (MMR))

    #creating a sense trans
    #a plain dict replaces string.maketrans/string.translate, which only
    #exist on python 2
    strandFlip = {'-': '+', '+': '-', '.': '+'}

    #reading in the gff
    if isinstance(gff, str):
        gff = utils.parseTable(gff, '\t')

    bamName = bamFile.split('/')[-1]

    #setting up a clustergram table
    if clusterGram:
        binSize = int(clusterGram)
        binSizeList = []
        #now go through each line of the gff and make sure they're all the same length
        for line in gff:
            gffLocus = utils.Locus(line[0], int(line[3]), int(line[4]), line[6], line[1])
            #floor division keeps the bin counts integers on python 3 as well
            binSizeList.append(gffLocus.len() // binSize)
        binSizeList = utils.uniquify(binSizeList)
        if len(binSizeList) > 1:
            print('WARNING: lines in gff are of different length. Output clustergram will have variable row length')
        newGFF.append(['GENE_ID', 'locusLine'] +
                      [str(x * binSize) + '_' + bamName
                       for x in range(1, max(binSizeList) + 1, 1)])

    #setting up a maxtrix table
    if matrix:
        newGFF.append(['GENE_ID', 'locusLine'] +
                      ['bin_' + str(n) + '_' + bamName
                       for n in range(1, int(matrix) + 1, 1)])
        nBin = int(matrix)

    # Try to use the bamliquidator script on cluster, otherwise, failover to
    # a copy in the current directory, otherwise fail.
    bamliquidatorString = '/usr/bin/bamliquidator'
    if not os.path.isfile(bamliquidatorString):
        bamliquidatorString = './bamliquidator'
        if not os.path.isfile(bamliquidatorString):
            raise ValueError('bamliquidator not found in path')

    #getting and processing reads for gff lines
    ticker = 0
    print('Number lines processed')
    for line in gff:
        line = line[0:9]
        if ticker % 100 == 0:
            print(ticker)
        ticker += 1

        gffLocus = utils.Locus(line[0], int(line[3]), int(line[4]), line[6], line[1])

        #get the nBin and binSize
        if not clusterGram and not matrix:
            raise ValueError('either clusterGram or matrix has to be given to bin the reads')
        if clusterGram:
            nBin = gffLocus.len() // int(clusterGram)
            binSize = int(clusterGram)
        if matrix:
            nBin = int(matrix)
            binSize = gffLocus.len() // nBin

        #some regions will be too short to get info on
        if binSize == 0:
            clusterLine = [gffLocus.ID(), str(gffLocus)] + ['NA'] * nBin
            newGFF.append(clusterLine)
            continue

        #flippy flip if sense is negative
        if sense == '-':
            bamSense = ''.join(strandFlip.get(ch, ch) for ch in gffLocus.sense())
        elif sense == '+':
            bamSense = gffLocus.sense()
        else:
            bamSense = '.'

        #using the bamLiquidator to get the readstring
        bamCommand = "%s %s %s %s %s %s %s %s" % (
            bamliquidatorString, bamFile, line[0], gffLocus.start(),
            gffLocus.end(), bamSense, nBin, extension)
        getReads = subprocess.Popen(bamCommand,
                                    stdin=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    shell=True)
        readString, stderr = getReads.communicate()

        #on python 3 the pipes give bytes, on python 2 they already are str
        if not isinstance(readString, str):
            readString = readString.decode()
        if not isinstance(stderr, str):
            stderr = stderr.decode()

        if stderr:
            print("STDERR out: %s" % (stderr))
        denList = readString.split('\n')[:-1]

        #flip the denList if the actual gff region is -
        if gffLocus.sense() == '-':
            denList = denList[::-1]

        #converting from units of total bp of read sequence per bin to rpm/bp
        denList = [round(float(x) / binSize / MMR, 4) for x in denList]

        clusterLine = [gffLocus.ID(), str(gffLocus)] + denList
        newGFF.append(clusterLine)

    return newGFF
def findCanidateTFs(annotationFile, superLoci, expressedNM, TFlist, refseqToNameDict, projectFolder, projectName):
    '''
    assigns every super-enhancer to the expressed transcript whose TSS lies
    closest to the enhancer center, looking at all TSSs that fall between
    1Mb upstream of the enhancer start and 1Mb downstream of its end

    writes two tab-delimited tables into projectFolder:
        <projectName>_SE_ASSIGNMENT_TRANSCRIPT.txt (chrom, start, end, transcript ID)
        <projectName>_SE_ASSIGNMENT_GENE.txt (chrom, start, end, gene name)

    returns a dictionary keyed by the assigned transcripts that are in TFlist,
    each pointing to the list of super-enhancer loci assigned to it
    '''

    print('FINDING CANIDATE TFs')

    startDict = utils.makeStartDict(annotationFile)

    # one zero-width TSS locus per expressed transcript (NMid)
    expressedTSS = [utils.makeTSSLocus(nmID, startDict, 0, 0) for nmID in expressedNM]
    tssCollection = utils.LocusCollection(expressedTSS, 50)

    transcriptRows = []
    geneRows = []
    TFandSuperDict = {}

    for enhancer in superLoci:

        midpoint = (enhancer.start() + enhancer.end()) / 2

        # every expressed TSS inside the enhancer widened by 1Mb on each side
        window = utils.Locus(enhancer.chr(), enhancer.start()-1000000, enhancer.end()+1000000, '.')
        nearbyIDs = [tss.ID() for tss in tssCollection.getOverlap(window)]

        # the first transcript at minimal distance wins; '' means nothing nearby
        closestGene = ''
        if nearbyIDs:
            distances = [abs(midpoint - startDict[nmID]['start'][0]) for nmID in nearbyIDs]
            closestGene = nearbyIDs[distances.index(min(distances))]

        transcriptRows.append([enhancer.chr(), enhancer.start(), enhancer.end(), closestGene])

        # a TF may collect several super-enhancers
        if closestGene in TFlist:
            TFandSuperDict.setdefault(closestGene, []).append(enhancer)

        # the gene level table only lists enhancers that got a transcript
        if closestGene != '':
            geneRows.append([enhancer.chr(), enhancer.start(), enhancer.end(), refseqToNameDict[closestGene]])

    # transcript (NMid) level assignment
    transcriptFile = projectFolder + projectName + '_SE_ASSIGNMENT_TRANSCRIPT.txt'
    utils.unParseTable(transcriptRows, transcriptFile, '\t')

    # gene name level assignment
    geneFile = projectFolder + projectName + '_SE_ASSIGNMENT_GENE.txt'
    utils.unParseTable(geneRows, geneFile, '\t')

    print(' '.join(['Number of canidate TFs:', str(len(TFandSuperDict))]))

    return TFandSuperDict
def rank_eboxes(nb_all_chip_dataFile,mycn_gff_path,macsFolder,genomeDirectory,window = 100):

    '''
    uses the conserved MYCN sites and ranks eboxes within them by average
    background subtracted signal
    searches 100bp (window variable) from mycn summits

    nb_all_chip_dataFile: data table handed to pipeline_dfci
    mycn_gff_path: gff of the conserved mycn regions
    macsFolder: folder holding <name>/<name>_summits.bed for each dataset
    genomeDirectory: handed to utils.fetchSeq to pull region sequences
    window: bp added on either side of each summit

    relies on the module level gffFolder and tableFolder for its outputs
    returns the path of the ebox ranking table that was written
    '''

    window = int(window)

    #bring in the conserved mycn region
    print('making gff of nb mycn summits')

    nb_mycn_gff = utils.parseTable(mycn_gff_path,'\t')
    nb_mycn_collection = utils.gffToLocusCollection(nb_mycn_gff,50)

    dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)
    names_list = sorted([name for name in dataDict.keys() if name.count('MYCN') == 1])

    #first makes a gff of all summits +/- window for all nb mycn datasets,
    #keeping only the summits that overlap a conserved region
    summit_loci = []
    for name in names_list:
        summit_bed_path = '%s%s/%s_summits.bed' % (macsFolder,name,name)
        summit_bed = utils.parseTable(summit_bed_path,'\t')
        for line in summit_bed:
            summit_locus = utils.Locus(line[0],int(line[1])-window,int(line[2])+window,'.',line[3])
            if len(nb_mycn_collection.getOverlap(summit_locus)) > 0:
                summit_loci.append(summit_locus)

    summit_collection = utils.LocusCollection(summit_loci,50)
    summit_merged_collection = summit_collection.stitchCollection()

    summit_gff = utils.locusCollectionToGFF(summit_merged_collection)
    summit_gff_path = '%sHG19_NB_MYCN_SUMMITS_-%s_+%s.gff' % (gffFolder,window,window)
    utils.unParseTable(summit_gff,summit_gff_path,'\t')

    #maps chip-seq signal to the gff
    print('mapping to nb mycn summits and making signal dict')
    gffList = [summit_gff_path]
    summit_signal_path = pipeline_dfci.map_regions(nb_all_chip_dataFile,gffList)

    mycnSignalTable = utils.parseTable(summit_signal_path,'\t')

    #making a signal dictionary for MYCN binding
    #NOTE: from here on only these four datasets are used
    names_list = ['BE2C_MYCN','KELLY_MYCN','NGP_MYCN','SHEP21_0HR_MYCN_NOSPIKE']
    background_list = [dataDict[name]['background'] for name in names_list]
    header = mycnSignalTable[0]
    chip_columns = [header.index(name) for name in names_list]
    background_columns = [header.index(background_name) for background_name in background_list]

    mycn_sig_dict = {}
    for line in mycnSignalTable[1:]:
        #background subtracted signal for each dataset
        line_sig = [float(line[chip_col]) - float(line[background_col])
                    for chip_col,background_col in zip(chip_columns,background_columns)]
        region_id = line[1]
        #the region string ends in :start-end
        coords = [int(x) for x in line[1].split(':')[-1].split('-')]
        line_length = coords[1]-coords[0]
        mycn_sig_dict[region_id] = numpy.mean(line_sig)*line_length

    #now for each region find the eboxes and then add up the signal
    print('making ebox ranking')
    ebox_list = ['CACGTG','CAGTTG','CAAGTG','CAGGTG','CAATTG','CAAATG','CATCTG','CAGCTG','CATGTG','CATATG']
    eboxDict = {}
    for ebox in ebox_list:
        eboxDict[ebox] = []

    ticker = 0
    for line in summit_gff:
        if ticker % 1000 == 0:
            print(ticker)
        ticker+=1

        chrom = line[0]
        start = int(line[3])
        end = int(line[4])
        region_id = '%s(%s):%s-%s' % (line[0],line[6],line[3],line[4])
        signal = mycn_sig_dict[region_id]

        sequenceLine = utils.fetchSeq(genomeDirectory,chrom,start,end,True)

        for match in re.finditer('CA..TG',sequenceLine.upper()):
            motif = match.group()
            if motif in eboxDict:
                eboxDict[motif].append(signal)
                continue
            #each remaining ACGT hexamer is the reverse complement of a listed ebox
            flipped = utils.revComp(motif)
            if flipped in eboxDict:
                eboxDict[flipped].append(signal)
            #else: the hit contains a non ACGT character (e.g. N) and is
            #skipped; looking it up used to raise a KeyError

    eboxTable = []
    eboxTableOrdered = [['EBOX','OCCURENCES','AVG_HEIGHT']]
    for ebox in eboxDict.keys():
        newLine = [ebox,len(eboxDict[ebox]),numpy.mean(eboxDict[ebox])]
        eboxTable.append(newLine)

    #rows are ordered by the average signal column, highest first
    occurenceOrder = utils.order([line[2] for line in eboxTable],decreasing=True)
    for x in occurenceOrder:
        eboxTableOrdered.append(eboxTable[x])
    print(eboxTableOrdered)

    ebox_outfile = '%sHG19_NB_MYCN_CONSERVED_SUMMITS_-%s_+%s_EBOX_RANK.txt' % (tableFolder,window,window)
    utils.unParseTable(eboxTableOrdered,ebox_outfile,'\t')
    return ebox_outfile
def collapseFimo(fimo_output, gene_to_enhancer_dict, candidate_tf_list, output_folder, analysis_name, motifConvertFile):
    '''
    collapses motifs from fimo
    for each source node (TF) and each target node (gene enhancer regions),
    overlapping motif instances are stitched into single loci

    writes one bed per TF into <output_folder>motif_beds/ and a combined
    <output_folder><analysis_name>_all_motifs.bed

    fimo_output: tab-delimited fimo table whose first row is a header
    gene_to_enhancer_dict: not used by this function
    candidate_tf_list: TF names that act as source nodes
    motifConvertFile: two column table of motif name and gene name

    returns edgeDict where edgeDict[tf][target] is the list of stitched loci
    '''

    #first build up the motif name conversion database
    #from motif name to gene name; a motif can go to multiple genes
    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = defaultdict(list)
    for line in motifDatabase:
        motifDatabaseDict[line[0]].append(line[1])

    #make the folder to store motif beds
    utils.formatFolder('%smotif_beds/' % (output_folder), True)

    #first layer are source nodes, next layer are target nodes which are
    #derived from the fimo output
    edgeDict = {}
    for tf in candidate_tf_list:
        edgeDict[tf] = defaultdict(list)

    #set for constant time membership tests in the per-hit loop
    candidate_tfs = set(candidate_tf_list)

    fimoTable = utils.parseTable(fimo_output, '\t')
    print(fimo_output)

    #fimo sometimes puts the region in either the first or second column
    #a table without any hit (header only, or empty) used to raise an
    #IndexError here; it now falls through to empty beds
    if len(fimoTable) > 1 and fimoTable[1][1].count('|') > 0:
        region_index = 1
    else:
        region_index = 2
    print('USING COLUMN %s OF FIMO OUTPUT FOR REGION' % (region_index))

    #the motif offsets sit in the two columns right after the region column
    if region_index == 2:
        start_col, stop_col = 3, 4
    else:
        start_col, stop_col = 2, 3

    for line in fimoTable[1:]:
        source_tfs = motifDatabaseDict[line[0]]  #motifId
        for source in source_tfs:
            if source not in candidate_tfs:
                continue
            #region is target|chrom|start; offsets are added to the region start
            region = line[region_index].split('|')
            target = region[0]
            region_start = int(region[2])
            target_locus = utils.Locus(region[1],
                                       region_start + int(line[start_col]),
                                       region_start + int(line[stop_col]), '.')
            #what's missing here is the enhancer id of the target locus
            #every candidate tf has an entry and the inner layer is a
            #defaultdict, so this lookup cannot miss
            edgeDict[source][target].append(target_locus)

    #now we actually want to collapse this down in a meaningful way
    #overlapping motifs count as a single binding site. This way a TF with
    #tons of motifs that finds the same site over and over again doesn't get
    #over counted
    all_bed = []
    all_bed_path = '%s%s_all_motifs.bed' % (output_folder, analysis_name)
    for tf in candidate_tf_list:
        print(tf)
        bed_header = [
            'track name = "%s" description="%s motifs in %s"' %
            (tf, tf, analysis_name)
        ]
        all_bed.append(bed_header)
        target_bed = [bed_header]
        target_bed_path = '%smotif_beds/%s_motifs.bed' % (output_folder, tf)
        #list() because the values are replaced while walking the keys
        for target in list(edgeDict[tf].keys()):
            edgeCollection = utils.LocusCollection(edgeDict[tf][target], 50)
            edgeLoci = edgeCollection.stitchCollection().getLoci()
            edgeDict[tf][target] = edgeLoci
            for locus in edgeLoci:
                bed_line = [
                    locus.chr(),
                    locus.start(),
                    locus.end(), target, '', '+'
                ]
                target_bed.append(bed_line)
                all_bed.append(bed_line)

        utils.unParseTable(target_bed, target_bed_path, '\t')

    #now the loci are all stitched up
    utils.unParseTable(all_bed, all_bed_path, '\t')
    return edgeDict
def make_shep_on_mycn_landscape(shep_on_dataFile):

    '''
    finds mycn peaks in shep21 that are conserved in nb and segregates them
    into promoter or enhancer

    for the conserved peaks, the promoter subset and the enhancer subset,
    three gffs each are written to gffFolder: the peak itself, the peak
    center +/- 5kb and the peak flanked by 1kb on each side

    relies on the module level macsEnrichedFolder, gffFolder, projectFolder,
    annotFile and geneListFolder
    '''

    #the table is loaded as before although nothing below reads it
    dataDict = pipeline_dfci.loadDataTable(shep_on_dataFile)

    print('LOADING SHEP ON MYCN SITES')
    #load all of the shep_on sites
    shep_on_bed_path = '%sSHEP_6HR_MYCN_peaks.bed' % (macsEnrichedFolder)
    shep_on_bed = utils.parseTable(shep_on_bed_path,'\t')
    shep_on_gff = utils.bedToGFF(shep_on_bed)

    #now get the conserved NB MYCN regions
    nb_conserved_mycn_gff_file = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    nb_conserved_mycn_collection = utils.gffToLocusCollection(nb_conserved_mycn_gff_file)

    print('LOADING SHEP ACTIVE ENHANCERS')
    #make a collection of enhancers
    shep_enhancer_file = '%smeta_rose/SHEP_ON_H3K27AC/SHEP_ON_H3K27AC_AllEnhancers.table.txt' % (projectFolder)
    shep_enhancer_collection = utils.makeSECollection(shep_enhancer_file,'SHEP_H3K27AC')

    #now get the active promoters
    print('LOADING SHEP ACTIVE PROMOTERS')
    startDict = utils.makeStartDict(annotFile)
    shep_transcribed_file = '%sHG19_SHEP_ON_H3K27AC_ACTIVE.txt' % (geneListFolder)
    shep_transcribed_table = utils.parseTable(shep_transcribed_file,'\t')
    transcribedList = [line[1] for line in shep_transcribed_table]
    #tss loci are the tss +/- 1kb
    tssLoci = [utils.makeTSSLocus(refID,startDict,1000,1000) for refID in transcribedList]
    shep_tss_collection = utils.LocusCollection(tssLoci,50)

    #the 9 gffs we will need with their file names, in the order written
    gff_files = [
        ('all','%sHG19_SHEP_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)),
        ('all_5kb','%sHG19_SHEP_MYCN_CONSERVED_-5kb_+5kb.gff' % (gffFolder)),
        ('all_1kb','%sHG19_SHEP_MYCN_CONSERVED_-1kb_+1kb.gff' % (gffFolder)),
        ('promoter','%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-0_+0.gff' % (gffFolder)),
        ('promoter_5kb','%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-5kb_+5kb.gff' % (gffFolder)),
        ('promoter_1kb','%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-1kb_+1kb.gff' % (gffFolder)),
        ('enhancer','%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-0_+0.gff' % (gffFolder)),
        ('enhancer_5kb','%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-5kb_+5kb.gff' % (gffFolder)),
        ('enhancer_1kb','%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-1kb_+1kb.gff' % (gffFolder)),
    ]
    gffs = dict((key,[]) for key,_ in gff_files)

    def add_peak(category,gff_lines):
        #files the plain, 5kb and 1kb versions of a peak under one category
        for suffix,gff_line in zip(('','_5kb','_1kb'),gff_lines):
            gffs[category + suffix].append(gff_line)

    print('ITERATING THROUGH SHEP MYCN PEAKS')

    ticker = 0
    enhancer = 0
    promoter = 0
    other = 0
    for line in shep_on_gff:
        if ticker % 1000 == 0:
            print(ticker)
        ticker+=1
        #peak IDs therefore start at SHEP_MYCN_1
        peakID = '%s_%s' % ('SHEP_MYCN',str(ticker))

        lineLocus = utils.Locus(line[0],line[3],line[4],'.',peakID)

        #only peaks overlapping a conserved nb region are kept
        if not nb_conserved_mycn_collection.getOverlap(lineLocus):
            continue

        gffLine = [line[0],peakID,peakID,line[3],line[4],'','.','',peakID]
        #floor division keeps the coordinates integral
        peakCenter = (int(line[3]) + int(line[4]))//2
        gffLine_5kb = [line[0],peakID,peakID,peakCenter - 5000,peakCenter + 5000,'','.','',peakID]
        #the 1kb is not a center +/- but a flank
        gffLine_1kb = [line[0],peakID,peakID,int(line[3]) - 1000,int(line[4]) + 1000,'','.','',peakID]
        peak_lines = [gffLine,gffLine_5kb,gffLine_1kb]

        add_peak('all',peak_lines)

        #tss overlap should take precedence over enhancer overlap
        if shep_tss_collection.getOverlap(lineLocus,'both'):
            add_peak('promoter',peak_lines)
            promoter+=1
        #now check for enhancer overlap
        elif shep_enhancer_collection.getOverlap(lineLocus,'both'):
            add_peak('enhancer',peak_lines)
            enhancer+=1
        else:
            other+=1

    #the total counts every peak, the three classes only the conserved ones
    print('Of %s shep on mycn peaks' % (len(shep_on_gff)))
    print('%s are promoter' % (promoter))
    print('%s are enhancer' % (enhancer))
    print('%s are other' % (other))

    #now write out the gffs
    for key,gff_path in gff_files:
        utils.unParseTable(gffs[key],gff_path,'\t')
# First, load in the node TFs, ATAC peaks and super enhancer regions we'll consider for this analysis # From networks already constructed from CRC2.py node_file = '/crusader/projects/cll/final/network/lines/zinba/' + projectName + '/' + projectName + '_NODELIST.txt' node_table = utils.parseTable(node_file, '\t') nodelist = [x[0] for x in node_table] print nodelist super_enhancer_file = '/crusader/projects/cll/final/rose/' + projectName + '_H3K27ac/' + projectName + '_H3K27ac_peaks_SuperEnhancers.table.txt' se_table = utils.parseTable(super_enhancer_file, '\t') subpeak_file = '/crusader/projects/cll/final/zinba/lines/MEC1_ATAC/MEC1_ATAC.peaks.bed' subpeak_table = utils.parseTable(subpeak_file, '\t') subpeak_loci = [] for line in subpeak_table: subpeak_loci.append(utils.Locus(line[0], line[1], line[2], '.')) subpeak_collection = utils.LocusCollection(subpeak_loci, 100) subpeak_dict = {} # key is enhancer ID, points to a list of loci # assign subpeak Loci to each super enhancer fasta = [] se_namelist = [] for line in se_table[6:]: se_id = line[0] se_namelist.append(se_id) subpeak_dict[se_id] = [] se_locus = utils.Locus(line[1], line[2], line[3], '.') overlaps = subpeak_collection.getOverlap(se_locus)
def main():
    '''
    overlaps the MUT vs WT hypo DMRs with the TBRS and AML DMR sets

    reads the DMR beds from <projectFolder>bed/, prints the number of hyper
    and hypo MUT vs WT DMRs and writes one row per overlap
    (hypo DMR ID, chrom, start, end, overlapping DMR ID) to
    <projectFolder>tables/DMRsvsTBRS_all_overlaps.txt and
    <projectFolder>tables/DMRsvsAML_all_overlaps.txt
    '''

    projectFolder = '/storage/goodell/home/jmreyes/projects/amish_ayala/'

    def regionID(prefix, chrom, start, end):
        # locus ID of the form <prefix><chrom>:<start>-<end>
        return prefix + str(chrom) + ':' + str(start) + '-' + str(end)

    def taggedLoci(table, hypoPrefix, hyperPrefix):
        # loci for rows holding a 'hypo' or 'hyper' field; 'chr' is put in
        # front of the chromosome and rows with neither tag are skipped
        loci = []
        for line in table:
            chrom = 'chr' + line[0]
            start = line[1]
            end = line[2]
            if 'hypo' in line:
                prefix = hypoPrefix
            elif 'hyper' in line:
                prefix = hyperPrefix
            else:
                continue
            loci.append(utils.Locus(chrom, start, end, '.', regionID(prefix, chrom, start, end)))
        return loci

    def plainLoci(table, prefix):
        # one locus per row, chromosome name used as given
        return [
            utils.Locus(line[0], line[1], line[2], '.', regionID(prefix, line[0], line[1], line[2]))
            for line in table
        ]

    def overlapRows(queryLoci, collection):
        # one output row per (query locus, overlapping locus) pair
        rows = []
        for locus in queryLoci:
            for overlapLocus in collection.getOverlap(locus, 'both'):
                rows.append([
                    locus.ID(),
                    overlapLocus.chr(),
                    overlapLocus.start(),
                    overlapLocus.end(),
                    overlapLocus.ID()
                ])
        return rows

    #gather up DMR tables
    #ayala MUT vs WT
    mutWT_hypo = utils.parseTable(projectFolder + 'bed/hypoDMRsWT.vs.Mut.bed', '\t')
    mutWT_hyper = utils.parseTable(projectFolder + 'bed/hyperDMRsWT.vs.Mut.bed', '\t')
    #the control table is still loaded although nothing below reads it
    mutWT_control = utils.parseTable(projectFolder + 'bed/Control_nonDMRsWT.vs.Mut.bed', '\t')

    #ley all
    tbrs_all = utils.parseTable(projectFolder + 'bed/TBRS_DMRs.bed', '\t')
    aml_all = utils.parseTable(projectFolder + 'bed/AML_DMRs.bed', '\t')

    tbrs_all_loci = taggedLoci(tbrs_all, 'tbrs_all_hypo_', 'tbrs_all_hyper_')
    aml_all_loci = taggedLoci(aml_all, 'aml_all_hypo_', 'aml_all_hyper_')

    mutWT_hypo_loci = plainLoci(mutWT_hypo, 'hypo_')
    mutWT_hyper_loci = plainLoci(mutWT_hyper, 'hyper_')

    print(len(mutWT_hyper_loci))
    print(len(mutWT_hypo_loci))

    mutWT_hypo_LC = utils.LocusCollection(mutWT_hypo_loci)
    tbrs_all_LC = utils.LocusCollection(tbrs_all_loci)
    aml_all_LC = utils.LocusCollection(aml_all_loci)

    #only the hypo DMRs are intersected with the two reference sets
    hypoLoci = mutWT_hypo_LC.getLoci()
    tbrs_all_overlap = overlapRows(hypoLoci, tbrs_all_LC)
    aml_all_overlap = overlapRows(hypoLoci, aml_all_LC)

    utils.unParseTable(tbrs_all_overlap, projectFolder + 'tables/DMRsvsTBRS_all_overlaps.txt', '\t')
    utils.unParseTable(aml_all_overlap, projectFolder + 'tables/DMRsvsAML_all_overlaps.txt', '\t')
def mapGFFLineToAnnot(gffLine, outFolder, nBins, geneDict, txCollection, sense='both', header=''):
    '''
    for one gff line writes the two temp tables used to draw the gene diagram:
    <outFolder><name>_diagramTemp.txt with one rectangle per row
    (start, bottom, stop, top) and <outFolder><name>_nameTemp.txt with one
    gene label per row (name, position, height)

    <name> is header when given, otherwise chrom_sense_start_end of the line
    positions are distances from the region reference point, scaled so that
    the whole region spans nBins
    '''
    # NOTE(review): the sense argument is never read; the overlap query below
    # always uses 'both' - confirm whether that is intended.

    if len(header) != 0:
        gffString = header
    else:
        gffString = '%s_%s_%s_%s' % (gffLine[0], gffLine[6], gffLine[3], gffLine[4])

    # both tables start with a zero filled row
    diagramTable = [[0, 0, 0, 0]]
    nameTable = [['', 0, 0]]

    gffLocus = utils.Locus(gffLine[0], int(gffLine[3]), int(gffLine[4]), gffLine[6], gffLine[1])

    # bins per bp
    scaleFactor = float(nBins) / gffLocus.len()
    # plotting buffer for diagrams: the bp covered by 20 bins
    plotBuffer = int(gffLocus.len() / float(nBins) * 20)

    geneList = [txLocus.ID() for txLocus in txCollection.getOverlap(gffLocus, sense='both')]

    # distances are measured from the end for - regions, from the start otherwise
    refPoint = int(gffLine[4]) if gffLine[6] == '-' else int(gffLine[3])

    def scaledCoords(locus):
        # distance of each coordinate from the reference point, in bins
        return [abs(coord - refPoint) * scaleFactor for coord in locus.coords()]

    # buffered footprints of the genes drawn so far
    offsetCollection = utils.LocusCollection([], 500)

    for geneID in geneList:
        gene = geneDict[geneID]
        commonName = gene.commonName()
        print(commonName)

        # fall back to the ID when the common name is at most one character
        name = commonName if len(commonName) > 1 else geneID

        # each earlier gene overlapping this one pushes it 4 units further down
        offset = 4 * len(offsetCollection.getOverlap(gene.txLocus()))
        offsetCollection.append(
            utils.makeSearchLocus(gene.txLocus(), plotBuffer, plotBuffer))

        # write the name of the gene down at its transcription start
        if gene.sense() == '+':
            labelCoord = gene.txLocus().start()
        else:
            labelCoord = gene.txLocus().end()
        nameTable.append([name, abs(labelCoord - refPoint) * scaleFactor, -2 - offset])

        # draw a line across the entire txLocus
        start, stop = scaledCoords(gene.txLocus())
        diagramTable.append([start, -0.01 - offset, stop, 0.01 - offset])

        # now draw thin boxes for all txExons
        for txExon in gene.txExons():
            start, stop = scaledCoords(txExon)
            diagramTable.append([start, -0.5 - offset, stop, 0.5 - offset])

        # now draw fatty boxes for the coding exons if any
        for cdExon in gene.cdExons():
            start, stop = scaledCoords(cdExon)
            diagramTable.append([start, -1 - offset, stop, 1 - offset])

    utils.unParseTable(diagramTable, outFolder + gffString + '_diagramTemp.txt', '\t')
    utils.unParseTable(nameTable, outFolder + gffString + '_nameTemp.txt', '\t')
def geneToEnhancerDict(genome, enhancer_file, activity_path):
    '''
    links enhancers to genes using the gene assignment columns
    (CLOSEST_GENE, PROXIMAL_GENES, OVERLAP_GENES) already present in the
    enhancer table and flags the genes that are TFs with a motif

    genome: object whose returnFeature gives the 'tf_file' and
        'motif_convert' table paths
    enhancer_file: enhancer table; the first row is the header and the
        first four columns are id, chrom, start, stop
    activity_path: optional gene activity table ('' to skip); when given,
        only genes listed in it are kept

    returns geneTable, geneTFTable, enhancerTable, enhancerTFTable,
    geneSummaryTable, candidate_tf_list, gene_to_enhancer_dict
    where gene_to_enhancer_dict is keyed by gene name and points to the
    list of enhancer loci assigned to that gene

    raises ValueError if no gene name column can be found in the activity table
    '''
    print('Identifying enhancers and target genes from %s' % (enhancer_file))
    #should this do gene assignment????
    #for now assume gene assignment has been done
    #can later toggle to do gene assignment

    #first load the TF lists
    tf_table = utils.parseTable(genome.returnFeature('tf_file'), '\t')

    motif_table = utils.parseTable(genome.returnFeature('motif_convert'), '\t')
    #this gives all tfs that have a motif
    #sets give constant time membership tests inside the loops below
    motif_tfs = set(utils.uniquify([line[1] for line in motif_table]))

    #intersect w/ the activity table
    active_genes = set()
    if len(activity_path) > 0:
        activity_table = utils.parseTable(activity_path, '\t')
        #figure out the right column for actual gene names
        #(the first one in the first row that does not start with NM or NR)
        for i in range(len(activity_table[0])):
            if activity_table[0][i][0:2] != 'NM' and activity_table[0][i][0:2] != 'NR':  #assumes refseq
                gene_col = i
                break
        else:
            #this used to surface as a NameError on gene_col
            raise ValueError('no gene name column found in %s' % (activity_path))
        print('using column %s of %s gene activity table for common names' % (gene_col + 1, activity_path))

        active_genes = set([line[gene_col].upper() for line in activity_table])
        tf_list_name = utils.uniquify([
            line[1] for line in tf_table
            if line[1] in active_genes and line[1] in motif_tfs
        ])
    else:
        tf_list_name = [line[1] for line in tf_table if line[1] in motif_tfs]
    tf_names = set(tf_list_name)

    print('Identified %s TFs from %s that have motifs' % (len(tf_list_name), genome.returnFeature('tf_file')))

    #keyed by gene with loci objects in the list
    gene_to_enhancer_dict = defaultdict(list)

    #assuming id,chrom,start,stop w/ gene names in the last 3 columns per standard ROSE output
    enhancer_table = utils.parseTable(enhancer_file, '\t')
    print('Analyzing %s cis-regulatory regions' % (len(enhancer_table)))

    #now let's make the enhancer table by region and then by gene
    enhancerTable = [['ENHANCER_ID', 'CHROM', 'START', 'STOP', 'GENE_LIST']]
    enhancerTFTable = [['ENHANCER_ID', 'CHROM', 'START', 'STOP', 'GENE_LIST']]
    geneTable = [['GENE', 'TF', 'CHROM', 'START', 'STOP', 'ENHANCER_ID']]
    geneTFTable = [['GENE', 'CHROM', 'START', 'STOP', 'ENHANCER_ID']]
    geneSummaryTable = [['GENE', 'TF', 'ENHANCER_LIST']]

    #will need to track which ones are TFs
    candidate_tf_list = []

    #find the columns for gene assignment
    header = enhancer_table[0]
    header_length = len(header)
    closest_index = header.index('CLOSEST_GENE')
    proximal_index = header.index('PROXIMAL_GENES')
    overlap_index = header.index('OVERLAP_GENES')

    for line in enhancer_table[1:]:
        #don't bother trying to figure out lines w/o target genes
        if len(line) != header_length:
            continue
        enhancer_locus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        all_gene_list = (line[closest_index].split(',')
                         + line[proximal_index].split(',')
                         + line[overlap_index].split(','))
        #an empty column splits to [''], which used to register a gene
        #named ''; blank names are dropped here
        all_gene_list = [gene.upper() for gene in all_gene_list if gene]

        #gets a unique list of all genes, restricted to active ones if requested
        if len(activity_path) > 0:
            all_gene_list = utils.uniquify(
                [gene for gene in all_gene_list if gene in active_genes])
        else:
            all_gene_list = utils.uniquify(all_gene_list)
        candidate_gene_list = utils.uniquify(
            [gene for gene in all_gene_list if gene in tf_names])

        for gene in all_gene_list:
            gene_to_enhancer_dict[gene].append(enhancer_locus)

        #joining an empty list gives '' for enhancers without genes
        enhancerTable.append(line[0:4] + [','.join(all_gene_list)])

        if len(candidate_gene_list) > 0:
            enhancerTFTable.append(line[0:4] + [','.join(candidate_gene_list)])

    #now iterate through each gene and list the enhancers
    gene_list = list(gene_to_enhancer_dict.keys())
    print(gene_list)
    gene_list.sort()
    for gene in gene_list:
        if gene in tf_names:
            tf_status = 1
            candidate_tf_list.append(gene)
        else:
            tf_status = 0
        enhancer_loci = gene_to_enhancer_dict[gene]
        enhancerString = ','.join([enhancer.ID() for enhancer in enhancer_loci])
        geneSummaryTable.append([gene, tf_status, enhancerString])
        for enhancer in enhancer_loci:
            geneTable.append([
                gene, tf_status,
                enhancer.chr(),
                enhancer.start(),
                enhancer.end(),
                enhancer.ID()
            ])
            #the TF table carries the same rows without the status column
            if tf_status == 1:
                geneTFTable.append([
                    gene,
                    enhancer.chr(),
                    enhancer.start(),
                    enhancer.end(),
                    enhancer.ID()
                ])

    return geneTable, geneTFTable, enhancerTable, enhancerTFTable, geneSummaryTable, candidate_tf_list, gene_to_enhancer_dict
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False): ''' maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq ''' startDict = utils.makeStartDict(annotFile) enhancerName = enhancerFile.split('/')[-1].split('.')[0] enhancerTable = utils.parseTable(enhancerFile, '\t') # internal parameter for debugging byRefseq = False if len(transcribedFile) > 0: transcribedTable = utils.parseTable(transcribedFile, '\t') transcribedGenes = [line[1] for line in transcribedTable] else: transcribedGenes = startDict.keys() print('MAKING TRANSCRIPT COLLECTION') transcribedCollection = utils.makeTranscriptCollection( annotFile, 0, 0, 500, transcribedGenes) print('MAKING TSS COLLECTION') tssLoci = [] for geneID in transcribedGenes: tssLoci.append(utils.makeTSSLocus(geneID, startDict, 0, 0)) # this turns the tssLoci list into a LocusCollection # 50 is the internal parameter for LocusCollection and doesn't really # matter tssCollection = utils.LocusCollection(tssLoci, 50) geneDict = {'overlapping': defaultdict( list), 'proximal': defaultdict(list)} # dictionaries to hold ranks and superstatus of gene nearby enhancers rankDict = defaultdict(list) superDict = defaultdict(list) # list of all genes that appear in this analysis overallGeneList = [] # find the damn header for line in enhancerTable: if line[0][0] == '#': continue else: header = line break if noFormatTable: # set up the output tables # first by enhancer enhancerToGeneTable = [ header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']] else: # set up the output tables # first by enhancer enhancerToGeneTable = [ header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]] # next by gene geneToEnhancerTable = [ ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']] # next make the gene to enhancer table geneToEnhancerTable = [ ['GENE_NAME', 'REFSEQ_ID', 
'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']] for line in enhancerTable: if line[0][0] == '#' or line[0][0] == 'R': continue enhancerString = '%s:%s-%s' % (line[1], line[2], line[3]) enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0]) # overlapping genes are transcribed genes whose transcript is directly # in the stitchedLocus overlappingLoci = transcribedCollection.getOverlap( enhancerLocus, 'both') overlappingGenes = [] for overlapLocus in overlappingLoci: overlappingGenes.append(overlapLocus.ID()) # proximalGenes are transcribed genes where the tss is within 50kb of # the boundary of the stitched loci proximalLoci = tssCollection.getOverlap( utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both') proximalGenes = [] for proxLocus in proximalLoci: proximalGenes.append(proxLocus.ID()) distalLoci = tssCollection.getOverlap( utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both') distalGenes = [] for proxLocus in distalLoci: distalGenes.append(proxLocus.ID()) overlappingGenes = utils.uniquify(overlappingGenes) proximalGenes = utils.uniquify(proximalGenes) distalGenes = utils.uniquify(distalGenes) allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes # these checks make sure each gene list is unique. 
# technically it is possible for a gene to be overlapping, but not proximal since the # gene could be longer than the 50kb window, but we'll let that slide # here for refID in overlappingGenes: if proximalGenes.count(refID) == 1: proximalGenes.remove(refID) for refID in proximalGenes: if distalGenes.count(refID) == 1: distalGenes.remove(refID) # Now find the closest gene if len(allEnhancerGenes) == 0: closestGene = '' else: # get enhancerCenter enhancerCenter = (int(line[2]) + int(line[3])) / 2 # get absolute distance to enhancer center distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes] # get the ID and convert to name closestGene = startDict[ allEnhancerGenes[distList.index(min(distList))]]['name'] # NOW WRITE THE ROW FOR THE ENHANCER TABLE if noFormatTable: newEnhancerLine = list(line) newEnhancerLine.append( join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ',')) newEnhancerLine.append( join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ',')) newEnhancerLine.append(closestGene) else: newEnhancerLine = line[0:9] newEnhancerLine.append( join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ',')) newEnhancerLine.append( join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ',')) newEnhancerLine.append(closestGene) newEnhancerLine += line[-2:] enhancerToGeneTable.append(newEnhancerLine) # Now grab all overlapping and proximal genes for the gene ordered # table overallGeneList += overlappingGenes for refID in overlappingGenes: geneDict['overlapping'][refID].append(enhancerString) rankDict[refID].append(int(line[-2])) superDict[refID].append(int(line[-1])) overallGeneList += proximalGenes for refID in proximalGenes: geneDict['proximal'][refID].append(enhancerString) rankDict[refID].append(int(line[-2])) superDict[refID].append(int(line[-1])) # End loop through # Make table by gene print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION') overallGeneList = 
utils.uniquify(overallGeneList) #get the chromLists from the various bams here cmd = 'samtools idxstats %s' % (rankByBamFile) idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True) idxStats= idxStats.communicate() bamChromList = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]] if len(controlBamFile) > 0: cmd = 'samtools idxstats %s' % (controlBamFile) idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True) idxStats= idxStats.communicate() bamChromListControl = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]] bamChromList = [chrom for chrom in bamChromList if bamChromListControl.count(chrom) != 0] #now make sure no genes have a bad chrom overallGeneList = [gene for gene in overallGeneList if bamChromList.count(startDict[gene]['chr']) != 0] #now make an enhancer collection of all transcripts enhancerGeneCollection = utils.makeTranscriptCollection( annotFile, 5000, 5000, 500, overallGeneList) enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection) # dump the gff to file enhancerFolder = utils.getParentFolder(enhancerFile) gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome) enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName,gffRootName) utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t') # now we need to run bamToGFF # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail. bamliquidator_path = 'bamliquidator_batch' print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS') # map density at genes in the +/- 5kb tss region # first on the rankBy bam bamName = rankByBamFile.split('/')[-1] mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName,gffRootName, bamName) mappedRankByFile = "%s%s_%s_%s/matrix.txt" % (enhancerFolder,enhancerName, gffRootName, bamName) cmd = bamliquidator_path + ' --sense . 
-e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedRankByFolder,rankByBamFile) print("Mapping rankby bam %s" % (rankByBamFile)) print(cmd) os.system(cmd) #check for completion if utils.checkOutput(mappedRankByFile,0.2,5): print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile)) else: print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile)) sys.exit() # next on the control bam if it exists if len(controlBamFile) > 0: controlName = controlBamFile.split('/')[-1] mappedControlFolder = "%s%s_%s_%s/" % ( enhancerFolder, enhancerName,gffRootName, controlName) mappedControlFile = "%s%s_%s_%s/matrix.txt" % ( enhancerFolder, enhancerName,gffRootName, controlName) cmd = bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedControlFolder,controlBamFile) print("Mapping control bam %s" % (controlBamFile)) print(cmd) os.system(cmd) #check for completion if utils.checkOutput(mappedControlFile,0.2,5): print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile)) else: print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile)) sys.exit() # now get the appropriate output files if len(controlBamFile) > 0: print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" % (mappedRankByFile, mappedControlFile)) if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1): print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES') signalDict = makeSignalDict(mappedRankByFile, mappedControlFile) else: print("NO MAPPING OUTPUT DETECTED") sys.exit() else: print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile)) if utils.checkOutput(mappedRankByFile, 1, 30): print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES') signalDict = makeSignalDict(mappedRankByFile) else: print("NO MAPPING OUTPUT DETECTED") sys.exit() # use enhancer rank to order rankOrder = utils.order([min(rankDict[x]) 
for x in overallGeneList]) usedNames = [] # make a new dict to hold TSS signal by max per geneName geneNameSigDict = defaultdict(list) print('MAKING GENE TABLE') for i in rankOrder: refID = overallGeneList[i] geneName = startDict[refID]['name'] if usedNames.count(geneName) > 0 and uniqueGenes == True: continue else: usedNames.append(geneName) proxEnhancers = geneDict['overlapping'][ refID] + geneDict['proximal'][refID] superStatus = max(superDict[refID]) enhancerRanks = join([str(x) for x in rankDict[refID]], ',') enhancerSignal = signalDict[refID] geneNameSigDict[geneName].append(enhancerSignal) newLine = [geneName, refID, join( proxEnhancers, ','), enhancerRanks, superStatus, enhancerSignal] geneToEnhancerTable.append(newLine) #utils.unParseTable(geneToEnhancerTable,'/grail/projects/newRose/geneMapper/foo.txt','\t') print('MAKING ENHANCER TO TOP GENE TABLE') if noFormatTable: enhancerToTopGeneTable = [ enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']] else: enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [ 'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]] for line in enhancerToGeneTable[1:]: geneList = [] if noFormatTable: geneList += line[-3].split(',') geneList += line[-2].split(',') else: geneList += line[10].split(',') geneList += line[11].split(',') geneList = utils.uniquify([x for x in geneList if len(x) > 0]) if len(geneList) > 0: try: sigVector = [max(geneNameSigDict[x]) for x in geneList] maxIndex = sigVector.index(max(sigVector)) maxGene = geneList[maxIndex] maxSig = sigVector[maxIndex] if maxSig == 0.0: maxGene = 'NONE' maxSig = 'NONE' except ValueError: if len(geneList) == 1: maxGene = geneList[0] maxSig = 'NONE' else: maxGene = 'NONE' maxSig = 'NONE' else: maxGene = 'NONE' maxSig = 'NONE' if noFormatTable: newLine = line + [maxGene, maxSig] else: newLine = line[0:12] + [maxGene, maxSig] + line[-2:] enhancerToTopGeneTable.append(newLine) # resort enhancerToGeneTable if noFormatTable: return enhancerToGeneTable, 
enhancerToTopGeneTable, geneToEnhancerTable else: enhancerOrder = utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]]) sortedTable = [enhancerToGeneTable[0]] sortedTopGeneTable = [enhancerToTopGeneTable[0]] for i in enhancerOrder: sortedTable.append(enhancerToGeneTable[(i + 1)]) sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)]) return sortedTable, sortedTopGeneTable, geneToEnhancerTable
def _load_background_subtracted_signal(signalFile, names_list, dataDict):
    '''
    loads a signal table and averages the background subtracted signal per row

    signalFile: path to a tab delimited table; row 0 is used as the header and
                column 0 of every row as the row identifier
    names_list: ChIP dataset names; each name and the dataset given by
                dataDict[name]['background'] must be a column of the header
    dataDict:   data table dictionary; only dataDict[name]['background'] is read

    returns (signalTable, sigDict): the parsed table (header row included) and
    a dict mapping each row identifier to the mean of (chip - background)
    taken across names_list

    prints an error and calls sys.exit() when column 0 is not unique
    '''
    signalTable = utils.parseTable(signalFile, '\t')

    background_list = [dataDict[name]['background'] for name in names_list]
    header = signalTable[0]
    chip_columns = [header.index(name) for name in names_list]
    background_columns = [header.index(background_name) for background_name in background_list]

    #this only works if the first column are unique identifiers
    rowIDs = [line[0] for line in signalTable]
    if len(signalTable) != len(utils.uniquify(rowIDs)):
        #the original message had no %s placeholder, so formatting it raised
        #a TypeError instead of reporting the offending file
        print('Error: Column 1 of %s must contain unique identifiers.' % (signalFile))
        sys.exit()

    sigDict = {}
    for line in signalTable[1:]:
        line_sig = [float(line[chip_col]) - float(line[background_col])
                    for chip_col, background_col in zip(chip_columns, background_columns)]
        sigDict[line[0]] = numpy.mean(line_sig)

    return signalTable, sigDict


def make_mycn_stats_table(nb_all_chip_dataFile,outFile):
    '''
    making a table of conserved mycn peaks w/ some additional stats
    mycn and h3k27ac signal is avg. background normalized across 4 samples
    active tss defined as the union of all H3K27ac occupied promoters in NB
    active enhancers defined as the union of all H3K27ac sites outside of promoters

    nb_all_chip_dataFile: data table path handed to pipeline_dfci.loadDataTable
    outFile: path the tab delimited output table is written to

    input file locations are built from the module level names signalFolder,
    projectFolder and geneListFolder; annotFile and genomeDirectory are also
    read from module scope

    returns outFile

    calls sys.exit() if a signal table has non unique identifiers in column 1
    or if an enhancer table row is too short to hold a locus
    '''
    dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)

    print('SETTING UP OUTPUT TABLE')
    outTable = [['PEAK_ID','CHROM','START','STOP','LENGTH','ACTIVE_TSS_OVERLAP','ENHANCER_OVERLAP','CPG_ISLAND_OVERLAP','CPG_ISLAND_FRACTION','GC_FREQ','MYCN_RANK','AVG_MYCN_SIGNAL','AVG_H3K27AC_SIGNAL','CANON_EBOX_COUNT','NONCANON_EBOX_COUNT','TOTAL_EBOX_COUNT','CANON_EXP','NON_CANON_EXP','GABPA_COUNT','GABPA_EXP','GATA_COUNT','GATA_EXP']]

    #assumes nmers(2,...) yields 2 letter strings such as 'CA' since
    #dinuc_dict is indexed with those keys further down
    dinuc = nmers(2,['A','T','G','C'])

    #input files
    mycnSignalFile = '%sHG19_NB_MYCN_CONSERVED_-0_+0_NB_ALL_SIGNAL.txt' % (signalFolder)
    h3k27acSignalFile = '%sHG19_NB_MYCN_CONSERVED_-500_+500_NB_ALL_SIGNAL.txt' % (signalFolder)
    mycnRankFile = '%smeta_rose/NB_MYCN/NB_MYCN_0KB_STITCHED_ENHANCER_REGION_RANK_CONSERVED.txt' % (projectFolder)
    activeGeneFile = '%sHG19_NB_H3K27AC_ACTIVE_UNION.txt' % (geneListFolder)
    #note, this is the ucsc hg19 cpg islands extended file
    #to download and format run ./beds/download_cpg.sh
    cpgFile = '%sbeds/hg19_cpg_islands.bed' % (projectFolder)
    enhancerFile = '%smeta_rose/NB_H3K27AC/NB_H3K27AC_AllEnhancers.table.txt' % (projectFolder)

    print('LOADING MYCN BINDING DATA')
    #making a signal dictionary for MYCN binding
    mycn_names = ['BE2C_MYCN','KELLY_MYCN','NGP_MYCN','SHEP21_0HR_MYCN_NOSPIKE']
    mycnSignalTable, mycn_sig_dict = _load_background_subtracted_signal(mycnSignalFile, mycn_names, dataDict)

    print('LOADING MYCN RANK DATA')
    mycnRankTable = utils.parseTable(mycnRankFile,'\t')

    print('LOADING H3K27AC BINDING DATA')
    #making a signal dictionary for background subtracted H3K27ac binding
    h3k27ac_names = ['BE2C_H3K27AC','KELLY_H3K27AC','NGP_H3K27AC','SHEP21_0HR_H3K27AC_NOSPIKE']
    h3k27acSignalTable, h3k27ac_sig_dict = _load_background_subtracted_signal(h3k27acSignalFile, h3k27ac_names, dataDict)

    #making the cpg collection
    print('LOADING CPGS ISLANDS')
    cpgBed = utils.parseTable(cpgFile,'\t')
    cpgLoci = [utils.Locus(line[0],line[1],line[2],'.',line[-1]) for line in cpgBed]
    cpgCollection = utils.LocusCollection(cpgLoci,50)

    #next make the tss collection of active promoters
    print('LOADING ACTIVE PROMOTERS')
    startDict = utils.makeStartDict(annotFile)
    activeTable = utils.parseTable(activeGeneFile,'\t')
    tss_1kb_loci = [utils.makeTSSLocus(line[1],startDict,1000,1000) for line in activeTable]
    tss_1kb_collection = utils.LocusCollection(tss_1kb_loci,50)

    #enhancer file
    print("LOADING ACTIVE ENHANCERS")
    enhancerTable = utils.parseTable(enhancerFile,'\t')
    print('STARTING WITH THE FOLLOWING NUMBER OF ENHANCERS IN NB')
    #NOTE(review): the 6 presumably accounts for the comment/header rows of
    #the enhancer table that are skipped below - confirm
    print(len(enhancerTable) - 6)
    enhancerLoci = []
    for line in enhancerTable:
        #rows starting with '#' or 'R' are skipped and never become loci
        if line[0][0] != '#' and line[0][0] != 'R':
            try:
                lineLocus = utils.Locus(line[1],int(line[2]),int(line[3]),'.',line[0])
                enhancerLoci.append(lineLocus)
            except IndexError:
                print(line)
                sys.exit()
    enhancerCollection = utils.LocusCollection(enhancerLoci,50)

    print('CLASSIFYING MYCN PEAKS')
    ticker = 0
    #i is used on both mycnSignalTable and mycnRankTable, so the two tables
    #have to list the same peaks in the same row order
    for i in range(1,len(mycnSignalTable)):
        if ticker%100 == 0:
            print(ticker)
        ticker +=1

        line = mycnSignalTable[i]
        peakID = line[0]
        mycn_signal = round(mycn_sig_dict[peakID],4)
        h3k27ac_signal = round(h3k27ac_sig_dict[peakID],4)

        #column 2 is parsed as chrom(...):start-stop
        locusString = line[1]
        chrom = locusString.split('(')[0]
        [start,stop] = [int(x) for x in locusString.split(':')[-1].split('-')]
        lineLocus = utils.Locus(chrom,start,stop,'.',peakID)

        tssOverlap = 0
        if tss_1kb_collection.getOverlap(lineLocus,'both'):
            tssOverlap = 1

        #a peak only counts as enhancer bound when it is not at an active tss
        enhancerOverlap = 0
        if enhancerCollection.getOverlap(lineLocus,'both') and tssOverlap == 0:
            enhancerOverlap = 1

        #one query serves both the cpg flag and the fractional overlap
        overlappingCpGLoci = cpgCollection.getOverlap(lineLocus,'both')
        cpgIslandOverlap = 0
        if overlappingCpGLoci:
            cpgIslandOverlap = 1

        #now do fractional cpgOverlap
        overlappingBases = 0
        for locus in overlappingCpGLoci:
            cpgStart = max(locus.start(),lineLocus.start())
            cpgEnd = min(locus.end(),lineLocus.end())
            overlappingBases += (cpgEnd-cpgStart)
        overlapFraction = round(float(overlappingBases)/lineLocus.len(),2)

        #now get the seq
        #str.upper() behaves like the python 2 only string.upper()
        lineSeq = utils.fetchSeq(genomeDirectory,chrom,start,stop,True).upper()
        seqLength = len(lineSeq)
        gcFreq = round(float(lineSeq.count('GC') + lineSeq.count('CG'))/seqLength,2)

        #frequency of every dinucleotide in this peak
        #the original counted 'GC' for every key, which made all expected
        #values below depend on the GC frequency only
        dinuc_dict = {}
        for nmer in dinuc:
            dinuc_dict[nmer] = float(lineSeq.count(nmer))/seqLength

        mycnRankLine = mycnRankTable[i]
        mycnRank = numpy.mean([float(x) for x in mycnRankLine[6:]])

        canon_count = len(re.findall('CACGTG',lineSeq))
        ebox_count = len(re.findall('CA..TG',lineSeq))
        non_canon_count = ebox_count-canon_count

        #get the expected values
        #each expectation multiplies dinucleotide frequencies by (length - 5)
        canon_exp = dinuc_dict['CA']*dinuc_dict['CG']*dinuc_dict['TG']*(seqLength - 5)
        canon_exp = round(canon_exp,2)

        notCG = 1- dinuc_dict['CG']
        non_exp = dinuc_dict['CA']*notCG*dinuc_dict['TG']*(seqLength - 5)
        non_exp = round(non_exp,2)

        #for gata and GABPA
        #both strands of each motif are counted
        gabpa_count = len(re.findall('CGGAAG',lineSeq) + re.findall('CTTCCG',lineSeq))
        gabpa_exp_f = dinuc_dict['CG'] * dinuc_dict['GA'] * dinuc_dict['AG']*(seqLength - 5)
        gabpa_exp_r = dinuc_dict['CT'] * dinuc_dict['TC'] * dinuc_dict['CG']*(seqLength - 5)
        gabpa_exp = round(gabpa_exp_f,2) + round(gabpa_exp_r,2)

        gata_count = len(re.findall('GATAA',lineSeq) + re.findall('TTATC',lineSeq))
        an_freq = 1 - dinuc_dict['AA'] - dinuc_dict['AT'] - dinuc_dict['AG'] -dinuc_dict['AC']
        cn_freq = 1 - dinuc_dict['CA'] - dinuc_dict['CT'] - dinuc_dict['CG'] -dinuc_dict['CC']
        gata_exp_f = dinuc_dict['GA'] * dinuc_dict['TA'] * an_freq*(seqLength - 5)
        gata_exp_r = dinuc_dict['TT'] * dinuc_dict['AT'] * cn_freq*(seqLength - 5)
        gata_exp = round(gata_exp_f,2) + round(gata_exp_r,2)

        newLine = [peakID,chrom,start,stop,lineLocus.len(),tssOverlap,enhancerOverlap,cpgIslandOverlap,overlapFraction,gcFreq,mycnRank,mycn_signal,h3k27ac_signal,canon_count,non_canon_count,ebox_count,canon_exp,non_exp,gabpa_count,gabpa_exp,gata_count,gata_exp]
        outTable.append(newLine)

    utils.unParseTable(outTable,outFile,'\t')
    return outFile
def makePeakTable(paramDict, splitGFFPath, averageTablePath, startDict, geneList, genomeDirectory, tads_path=''):
    '''
    makes the final peak table with ebox info

    paramDict:        only paramDict['cpgPath'] is read (path to a cpg island bed)
    splitGFFPath:     path to the split gff; columns 1,2,4,5 and 8 of each row are
                      used as chrom, region ID, start, stop and tss status
    averageTablePath: path to the signal table; row i + 1 is paired with gff
                      row i and every column from the 3rd on is added as signal
    startDict:        start dictionary keyed by gene ID, used for tss loci and names
    geneList:         gene IDs to build tss loci for; if empty all keys of
                      startDict are used
    genomeDirectory:  passed to utils.fetchSeq to pull the region sequence
    tads_path:        optional path to a tad file; when given, overlapping and
                      proximal genes are limited to genes sharing a tad with the
                      region whenever the region's tads contain any gene

    returns the peak table as a list of rows with the header as first row

    calls sys.exit() if no sequence can be fetched for a region
    '''
    peakTable = [[
        'REGION_ID', 'CHROM', 'START', 'STOP', 'LENGTH', 'TSS', 'CPG',
        'CPG_FRACTION', 'GC_FREQ', 'SIGNAL', 'CANON_EBOX_COUNT',
        'NON_CANON_EBOX_COUNT', 'TOTAL_EBOX_COUNT', 'OVERLAPPING_GENES',
        'PROXIMAL_GENES'
    ]]

    print('LOADING PEAK REGIONS')
    peakGFF = utils.parseTable(splitGFFPath, '\t')

    print('LOADING BINDING DATA')
    signalTable = utils.parseTable(averageTablePath, '\t')

    print('LOADING CPGS ISLANDS')
    cpgBed = utils.parseTable(paramDict['cpgPath'], '\t')
    cpgLoci = [utils.Locus(line[0], line[1], line[2], '.', line[-1]) for line in cpgBed]
    cpgCollection = utils.LocusCollection(cpgLoci, 50)

    print("MAKING TSS COLLECTIONS")
    if len(geneList) == 0:
        geneList = startDict.keys()

    tss_1kb_loci = []
    tss_50kb_loci = []
    for refID in geneList:
        tss_1kb_loci.append(utils.makeTSSLocus(refID, startDict, 1000, 1000))
        tss_50kb_loci.append(utils.makeTSSLocus(refID, startDict, 50000, 50000))

    #make a 1kb flanking and 50kb flanking collection
    tss_1kb_collection = utils.LocusCollection(tss_1kb_loci, 50)
    tss_50kb_collection = utils.LocusCollection(tss_50kb_loci, 50)

    use_tads = len(tads_path) > 0
    if use_tads:
        print('LOADING TADS FROM %s' % (tads_path))
        tad_collection = utils.importBoundRegion(tads_path, 'tad')

        #building a tad dict keyed by tad ID w/ genes in that tad provided
        tad_dict = defaultdict(list)
        for tss_locus in tss_1kb_loci:
            for tad_locus in tad_collection.getOverlap(tss_locus, 'both'):
                tad_dict[tad_locus.ID()].append(tss_locus.ID())

    print('CLASSIFYING PEAKS')
    ticker = 0
    #number of regions whose tads hold no gene (only counted when tads are used)
    no_tad_count = 0
    for i in range(len(peakGFF)):
        if ticker % 1000 == 0:
            print(ticker)
        ticker += 1

        #getting the particulars of the region
        gffLine = peakGFF[i]
        peakID = gffLine[1]
        chrom = gffLine[0]
        start = int(gffLine[3])
        stop = int(gffLine[4])
        lineLocus = utils.Locus(chrom, start, stop, '.', peakID)

        #getting the mapped signal
        #the i + 1 offset skips the first row of the signal table
        signalLine = signalTable[(i + 1)]
        signalVector = [float(x) for x in signalLine[2:]]

        #setting up the new line
        newLine = [peakID, chrom, start, stop, lineLocus.len()]

        #get the tss status from the gff itself (we are able to do this nicely from the split gff code earlier
        newLine.append(gffLine[7])

        #check cpg status
        #one query serves both the cpg flag and the fractional overlap
        overlappingCpGLoci = cpgCollection.getOverlap(lineLocus, 'both')
        if overlappingCpGLoci:
            newLine.append(1)
        else:
            newLine.append(0)

        #now do fractional cpgOverlap
        overlappingBases = 0
        for locus in overlappingCpGLoci:
            cpgStart = max(locus.start(), lineLocus.start())
            cpgEnd = min(locus.end(), lineLocus.end())
            overlappingBases += (cpgEnd - cpgStart)
        overlapFraction = float(overlappingBases) / lineLocus.len()
        newLine.append(round(overlapFraction, 2))

        #now get the seq
        #str.upper() behaves like the python 2 only string.upper()
        lineSeq = utils.fetchSeq(genomeDirectory, chrom, start, stop, True).upper()
        if len(lineSeq) == 0:
            #dump everything known about the region before giving up
            print('UH OH')
            print(lineSeq)
            print(gffLine)
            print(i)
            print(chrom)
            print(start)
            print(stop)
            sys.exit()

        gcFreq = float(lineSeq.count('GC') + lineSeq.count('CG')) / len(lineSeq)
        newLine.append(gcFreq)

        #this is where we add the ChIP-Seq signal
        newLine += signalVector

        eboxMatchList = re.findall('CA..TG', lineSeq)
        if len(eboxMatchList) == 0:
            newLine += [0] * 3
        else:
            totalCount = len(eboxMatchList)
            canonCount = eboxMatchList.count('CACGTG')
            otherCount = totalCount - canonCount
            newLine += [canonCount, otherCount, totalCount]

        #now find the overlapping and proximal genes
        #here each overlapping gene the tss 1kb locus overlaps the peak
        tad_genes = []
        if use_tads:
            for tad_locus in tad_collection.getOverlap(lineLocus, 'both'):
                tad_genes += tad_dict[tad_locus.ID()]
            if len(tad_genes) == 0:
                no_tad_count += 1

        overlappingLoci = tss_1kb_collection.getOverlap(lineLocus, 'both')
        proximalLoci = tss_50kb_collection.getOverlap(lineLocus, 'both')
        if len(tad_genes) > 0:
            #only keep genes that share a tad with the region
            tad_gene_set = set(tad_genes)
            overlappingLoci = [locus for locus in overlappingLoci if locus.ID() in tad_gene_set]
            proximalLoci = [locus for locus in proximalLoci if locus.ID() in tad_gene_set]

        overlappingGenes = [startDict[locus.ID()]['name'] for locus in overlappingLoci]
        proximalGenes = [startDict[locus.ID()]['name'] for locus in proximalLoci]

        overlappingGenes = utils.uniquify(overlappingGenes)

        #here the tss 50kb locus overlaps the peak
        #overlap takes priority over proximal
        proximalGenes = [gene for gene in proximalGenes if gene not in overlappingGenes]
        proximalGenes = utils.uniquify(proximalGenes)

        #','.join behaves like the python 2 only string.join
        overlappingString = ','.join(overlappingGenes)
        proximalString = ','.join(proximalGenes)

        newLine += [overlappingString, proximalString]
        peakTable.append(newLine)

    #the original reported no_tad_count (regions WITHOUT tad genes) as the
    #assigned number and counted the header row as a region
    regionCount = len(peakGFF)
    if use_tads:
        tadAssignedCount = regionCount - no_tad_count
    else:
        tadAssignedCount = 0
    print('Out of %s regions, %s were assigned to at least 1 tad' % (regionCount, tadAssignedCount))

    return peakTable
def findCanidateTFs(annotationFile, enhancerLoci, expressedNM, expressionDictNM, bamFile, TFlist, refseqToNameDict, projectFolder, projectName, promoter):
    '''
    Assign each Super-Enhancer to the closest active TSS to its center
    Return a dictionary keyed by TF that points to a list of loci

    annotationFile:   annotation file handed to utils.makeStartDict
    enhancerLoci:     enhancer loci to assign
    expressedNM:      IDs of expressed genes; only their TSS are considered
    expressionDictNM: expression value per gene ID, used to pick the most
                      active gene among the proximal ones
    TFlist:           gene IDs that count as TFs; other genes are never recorded
    projectFolder, projectName: concatenated to build the path of the
                      _ENHANCER_ASSIGNMENT.txt table that gets written
    promoter:         if true, a +/- 2kb locus around the TSS of every assigned
                      TF is added unless it already overlaps one of its enhancers
    bamFile, refseqToNameDict: not used by this function

    Assignment priority for each enhancer:
      1. every TF whose +/- 1kb TSS locus overlaps the enhancer
      2. else the most expressed gene with a TSS locus within 100 kb
      3. else the gene within 1 Mb whose start is closest to the enhancer center
    '''

    #single argument print call works the same under python 2 and 3
    print('FINDING CANIDATE TFs')

    enhancerAssignment = []
    TFtoEnhancerDict = defaultdict(list)

    startDict = utils.makeStartDict(annotationFile)

    tssLoci = [utils.makeTSSLocus(gene,startDict,1000,1000) for gene in expressedNM]
    tssCollection = utils.LocusCollection(tssLoci,50)

    # Loop through enhancers
    for enhancer in enhancerLoci:

        # If the enhancer overlaps a TSS, save it
        overlappingLoci = tssCollection.getOverlap(enhancer, 'both')
        overlappingGenes = [overlapLocus.ID() for overlapLocus in overlappingLoci]

        # Find all gene TSS within 100 kb
        proximalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,100000,100000),'both')
        proximalGenes = [proxLocus.ID() for proxLocus in proximalLoci]

        # If no genes are within 100 kb, find the closest active gene
        closestGene = ''
        if len(overlappingGenes) == 0 and len(proximalGenes) == 0:
            distalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,1000000,1000000),'both')
            distalGenes = [distalLocus.ID() for distalLocus in distalLoci]

            enhancerCenter = (int(enhancer.start()) + int(enhancer.end())) / 2
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in distalGenes]
            if distList:
                closestGene = distalGenes[distList.index(min(distList))]

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        # overlapping takes priority, so drop those genes from the proximal list
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        # Collect the genes this enhancer gets assigned to
        assignedGenes = []
        if overlappingGenes:
            # If a TSS overlaps an enhancer, assign them together
            assignedGenes = [gene for gene in overlappingGenes if gene in TFlist]
        elif proximalGenes:
            # Otherwise, assign the enhancer to the most active gene in 100 kb
            highestGene = ''
            highestActivity = 0
            for gene in proximalGenes:
                if expressionDictNM[gene] > highestActivity:
                    highestActivity = expressionDictNM[gene]
                    highestGene = gene
            # The original recorded the enhancer under the loop variable
            # (the last proximal gene) instead of the most active gene
            if highestGene in TFlist:
                assignedGenes = [highestGene]
        elif closestGene:
            if closestGene in TFlist:
                assignedGenes = [closestGene]

        for gene in assignedGenes:
            TFtoEnhancerDict[gene].append(enhancer)
            enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()])

    # Add promoter is it's not contained in the super
    if promoter:
        # list() snapshots the keys; a separate local name keeps the
        # promoter flag parameter from being overwritten by a locus
        for gene in list(TFtoEnhancerDict.keys()):
            tssStart = int(startDict[gene]['start'][0])
            promoterLocus = utils.Locus(startDict[gene]['chr'], tssStart - 2000, tssStart + 2000, startDict[gene]['sense'])
            overlapBool = False
            for enhancer in TFtoEnhancerDict[gene]:
                if promoterLocus.overlaps(enhancer):
                    overlapBool = True
            if not overlapBool:
                TFtoEnhancerDict[gene].append(promoterLocus)

    seAssignmentFile = projectFolder + projectName + '_ENHANCER_ASSIGNMENT.txt'
    utils.unParseTable(enhancerAssignment, seAssignmentFile, '\t')

    return TFtoEnhancerDict