def main():
    '''
    writes a bash script that, for the upstream and downstream flanking beds
    of the 'Mut' sample, sorts each bed and then runs bedtools closest against
    the hg19 gene GTF

    the script is only written to disk (projectFolder/scripts/), it is not
    executed here. all paths are hard coded for this project
    '''
    genome = 'hg19'
    sampleName = 'Mut'

    projectFolder = '/storage/goodell/projects/jmreyes/amish_ayala/'
    bedFolder = projectFolder + 'bed/'
    scriptFolder = projectFolder + 'scripts/'

    geneGTF = '/storage/goodell/home/jmreyes/grail/genomes/Homo_sapiens/UCSC/%s/Annotation/Genes/genes.gtf' % (
        genome)

    #bed file names are relative to bedFolder
    bedList = [
        'HG19_' + sampleName + '_1000extend_upstreamFlanking.bed',
        'HG19_' + sampleName + '_1000extend_downstreamFlanking.bed',
    ]

    cmdOut = scriptFolder + sampleName + '_geneIntersect.sh'

    #one command per row, written out as a single column table
    cmdBash = [['#!/usr/bin/bash']]
    for bed in bedList:
        bedName = bed.split('/')[-1].split('.bed')[0]
        sortedOut = bedFolder + bedName + '.sorted.bed'
        intersectOut = bedFolder + bedName + '_geneIntersect.bed'

        #the bed is coordinate sorted (chrom, then numeric start) before
        #it is handed to bedtools closest
        sortBedCmd = 'sort -k1,1 -k2,2n %s > %s' % (bedFolder + bed, sortedOut)
        cmdBash.append([sortBedCmd])

        intersectCmd = 'bedtools closest -d -a %s -b %s > %s' % (
            sortedOut, geneGTF, intersectOut)
        cmdBash.append([intersectCmd])

    utils.unParseTable(cmdBash, cmdOut, '\t')
def makeFoldTable(annotFile,analysisName,testName,controlName,testMMR,controlMMR,testIdxFile,controlIdxFile,outputFolder,epsilon = 1):
    '''
    builds a per guide log2 ratio table (test vs. control) and writes it to
    outputFolder + analysisName + '_log2Ratio.txt'

    output columns are GUIDE_ID, GENE, LOG2_RATIO and the two normalized
    values (named testName and controlName, rounded to 4 places)
    each normalized value is column 3 of the idx file divided by the MMR,
    plus epsilon

    rows are written in the order of the test idx file, no ranking is done
    row i of the control idx file is paired with row i of the test idx file
    (assumes both files list the guides in the same order -- TODO confirm)

    returns the path of the written table
    '''

    guideDict,geneDict = makeAnnotDict(annotFile)

    testIdx = utils.parseTable(testIdxFile,'\t')
    controlIdx = utils.parseTable(controlIdxFile,'\t')

    foldTable = [['GUIDE_ID','GENE','LOG2_RATIO',testName,controlName]]

    for row,testLine in enumerate(testIdx):
        guideID = testLine[0]
        gene = guideDict[guideID]

        #normalize by MMR, then add the pseudocount
        testValue = float(testLine[2])/testMMR + epsilon
        controlValue = float(controlIdx[row][2])/controlMMR + epsilon

        foldTable.append([
            guideID,
            gene,
            numpy.log2(testValue/controlValue),
            round(testValue,4),
            round(controlValue,4),
        ])

    outputFile = '%s%s_log2Ratio.txt' % (outputFolder,analysisName)
    utils.unParseTable(foldTable,outputFile,'\t')
    return outputFile
def buildGraph(edgeDict, gene_to_enhancer_dict, output_folder, analysis_name, cutoff=1):
    '''
    from the collapsed edge dictionary, build a directed target graph and
    write a table of every motif edge

    edgeDict is keyed source -> target -> list of edge loci. every key of
    edgeDict becomes a node. an edge source -> target is added to the graph
    only if the target is itself a node (a key of edgeDict) and at least
    cutoff loci support it. default cutoff is 1

    the edge table (output_folder + analysis_name + '_EDGE_TABLE.txt') gets
    one row per edge locus regardless of cutoff, with the IDs of the target
    regions (from gene_to_enhancer_dict) it overlaps and a TF_INTERACTION
    flag of 1/0 for whether the target is a node

    returns the networkx DiGraph
    '''

    node_list = sorted(edgeDict.keys())
    #set gives constant time "is this target a node" checks in the inner loop
    node_set = set(node_list)

    #this is only edges between TFs
    graph = nx.DiGraph(name=analysis_name)
    graph.add_nodes_from(node_list)

    #this stores ALL edges identified by motifs
    edge_table = [[
        'SOURCE', 'TARGET', 'CHROM', 'START', 'STOP', 'REGION_ID',
        'TF_INTERACTION'
    ]]
    edge_output = '%s%s_EDGE_TABLE.txt' % (output_folder, analysis_name)

    for source in node_list:
        print(source)
        for target in sorted(edgeDict[source].keys()):

            #collection of the target's regions, used to find which of them
            #each edge locus overlaps
            target_regions = gene_to_enhancer_dict[target]
            target_collection = utils.LocusCollection(target_regions, 50)

            #get the edges hitting that target
            edgeLoci = edgeDict[source][target]

            target_is_node = target in node_set
            tf_interaction = 1 if target_is_node else 0

            #only add to the graph if this is a TF/TF interaction
            if target_is_node and len(edgeLoci) >= cutoff:
                graph.add_edge(source, target)

            #now for each edge, add to the table
            for edgeLocus in edgeLoci:
                regionString = ','.join([
                    locus.ID()
                    for locus in target_collection.getOverlap(edgeLocus)
                ])
                edge_table.append([
                    source, target,
                    edgeLocus.chr(),
                    edgeLocus.start(),
                    edgeLocus.end(), regionString, tf_interaction
                ])

    utils.unParseTable(edge_table, edge_output, '\t')
    return graph
def findValleys(TFtoEnhancerDict, bamFile, projectName, projectFolder, cutoff = 0.2):
    '''
    takes in the super dict (gene -> list of enhancer regions) and scores
    every region with scoreValley

    each score above cutoff becomes a 10bp locus (score index i covers
    region start + 10*i to region start + 10*(i+1)). the loci of a gene are
    stitched with stitchValleys and written as chrom/start/end rows to
    projectFolder + projectName + '_valleys.bed'

    returns the path of that bed file
    '''

    print('IDENTIFYING VALLEYS IN SUPER ENHANCERS')

    bedRows = []

    for gene in TFtoEnhancerDict.keys():

        print(gene)
        geneValleys = []

        for region in TFtoEnhancerDict[gene]:

            scores = scoreValley(region, bamFile, projectName, projectFolder)

            for binIndex,binScore in enumerate(scores):
                if binScore > cutoff:
                    binStart = region.start() + binIndex*10
                    binEnd = region.start() + (binIndex+1)*10
                    geneValleys.append(utils.Locus(region.chr(), binStart, binEnd, '.'))

        #merge the per bin loci of this gene before writing them out
        for stitched in stitchValleys(geneValleys):
            bedRows.append([stitched.chr(), stitched.start(), stitched.end()])

    bedfilename = projectFolder + projectName + '_valleys.bed'
    utils.unParseTable(bedRows, bedfilename, '\t')
    print(bedfilename)

    return bedfilename
def makeEnhancerSignalTable(mergedRegionMap,medianDict,analysisName,genome,outputFolder):
    '''
    makes a table where each row is an enhancer and each column is the
    background subtracted signal of one dataset divided by that dataset's
    median (medianDict[name])

    the region map is expected to hold 6 annotation columns followed by an
    (enhancer, control) column pair per dataset. the i-th key of medianDict
    is paired with the i-th column pair
    (NOTE(review): this relies on the dict key order matching the column
    order of the region map -- confirm)

    negative background subtracted signal is set to 0 before dividing
    returns the path of the written table
    '''

    #load in the region map
    regionMap = utils.parseTable(mergedRegionMap,'\t')
    namesList = list(medianDict.keys())

    header = ['REGION_ID','CHROM','START','STOP','NUM_LOCI','CONSTITUENT_SIZE'] + namesList
    signalTable = [header]

    for line in regionMap[1:]:

        newLine = line[0:6]

        for position,name in enumerate(namesList):
            #columns come in (enhancer, control) pairs starting at column 6
            enhancerIndex = 6 + 2*position
            controlIndex = enhancerIndex + 1

            corrected = float(line[enhancerIndex]) - float(line[controlIndex])
            if corrected < 0:
                corrected = 0

            newLine.append(corrected/medianDict[name])

        signalTable.append(newLine)

    outputFile = "%s%s_%s_signalTable.txt" % (outputFolder,genome,analysisName)
    print("WRITING MEDIAN NORMALIZED SIGNAL TABLE TO %s" % (outputFile))
    utils.unParseTable(signalTable,outputFile,'\t')

    return outputFile
def formatOutput(TFtoEnhancerDict, refseqToNameDict, projectName, projectFolder):
    '''
    takes in the dict mapping TFs (refseq IDs) to all proximal supers and
    writes projectFolder + projectName + '_CANIDATE_TF_AND_SUPER_TABLE.txt'

    the table lists each candidate TF with the coordinates, ID and score of
    the super enhancers around it. a (gene name, chrom, start, end)
    combination is only written once, so several refseq IDs of the same gene
    do not repeat the same enhancer

    always returns 1
    '''

    output = [['TF_refseq', 'TF_name', 'chr', 'start', 'stop', 'SuperID', 'Super_Load']]

    #set of (name, chrom, start, end) already written; a set keeps the
    #duplicate check constant time instead of rescanning a growing list
    seen = set()

    for gene in TFtoEnhancerDict.keys():
        for superEnh in TFtoEnhancerDict[gene]:

            geneName = refseqToNameDict[gene]
            check = (geneName, superEnh.chr(), superEnh.start(), superEnh.end())
            if check in seen:
                continue
            seen.add(check)

            output.append([
                gene,
                geneName,
                superEnh.chr(),
                superEnh.start(),
                superEnh.end(),
                superEnh.ID(),
                superEnh.score(),
            ])

    outputname = projectFolder + projectName + '_CANIDATE_TF_AND_SUPER_TABLE.txt'

    utils.unParseTable(output, outputname, '\t')

    return 1
def _cut_region_lists(order_rows):
    '''returns, for each order row, the list of region IDs in its comma separated column 5'''
    return [row[4].split(',') for row in order_rows]


def _rows_for_regions(regions_table, cut_regions):
    '''returns the header of regions_table plus every row whose first field is one of the cut region IDs'''
    wanted = set()
    for region_ids in cut_regions:
        wanted.update(region_ids)
    return [regions_table[0]] + [line for line in regions_table if line[0] in wanted]


def cut_1000(order_table,regions_table,geneDict, outpath_top='',outpath_bottom=''):
    '''
    writes the region rows belonging to the first 1000 and to the last 1000
    data rows of order_table

    order_table has a header row; column 5 of each data row holds a comma
    separated list of region IDs. regions_table has a header row and the
    region ID in its first column
    for each of the two cuts, the header of regions_table plus every matching
    row is written to outpath_top / outpath_bottom

    the header of order_table is never part of either cut, also when the
    table has fewer than 1000 data rows (both cuts then cover all data rows)
    geneDict is not used
    '''

    #drop the header once so that neither slice can pick it up
    order_rows = order_table[1:]

    top_cut_regions = _cut_region_lists(order_rows[:1000])
    print(top_cut_regions[1:10])
    top_table = _rows_for_regions(regions_table, top_cut_regions)
    utils.unParseTable(top_table,outpath_top,'\t')

    bottom_cut_regions = _cut_region_lists(order_rows[-1000:])
    print(bottom_cut_regions[1:10])
    bottom_table = _rows_for_regions(regions_table, bottom_cut_regions)
    utils.unParseTable(bottom_table,outpath_bottom,'\t')
def collapseRegionMap(regionMapFile, name='', controlBams=False):
    '''
    takes a regionMap file and collapses all signal columns (column 7 on)
    into a single column holding their mean

    with controlBams set, the signal columns are treated as two halves:
    the first half are the rank-by datasets and the second half their
    controls. each dataset has its control subtracted (floored at 0) before
    the mean is taken
    without controls the plain mean of all signal columns is used

    the header of the new column is name, or 'MERGED_SIGNAL' if name is empty
    the output path is regionMapFile with 'REGION' replaced by 'META'
    (NOTE(review): if the path does not contain 'REGION' the input file is
    overwritten)

    returns the output path
    '''

    regionMap = utils.parseTable(regionMapFile, '\t')

    if not name:
        name = 'MERGED_SIGNAL'

    for n, line in enumerate(regionMap):

        if n == 0:
            #new header
            regionMap[n] = line[0:6] + [name]
            continue

        signalLine = [float(x) for x in line[6:]]

        if controlBams:
            #floor division keeps the split point an integer under any
            #division semantics
            half = len(signalLine) // 2
            #min signal is 0
            metaVector = [
                max(0, signal - control)
                for signal, control in zip(signalLine[:half], signalLine[half:])
            ]
            metaSignal = numpy.mean(metaVector)
        else:
            metaSignal = numpy.mean(signalLine)

        regionMap[n] = list(line[0:6]) + [metaSignal]

    outputFile = regionMapFile.replace('REGION', 'META')
    utils.unParseTable(regionMap, outputFile, '\t')
    return outputFile
def filterSubpeaks(subpeakFile,gene_to_enhancer_dict, analysis_name,output_folder):
    '''
    imports the initial subpeaks, stitches them and writes the stitched loci
    to output_folder + analysis_name + '_all_subpeak.bed'
    (columns: chrom, start, end, '.', locus ID)

    gene_to_enhancer_dict is not used
    returns the path of the bed file
    '''

    print(subpeakFile)

    #import and stitch in one go
    collectionName = '%s_subpeak' % (analysis_name)
    stitchedCollection = utils.importBoundRegion(subpeakFile,collectionName).stitchCollection()

    all_sub_bed = [
        [locus.chr(),locus.start(),locus.end(),'.',locus.ID()]
        for locus in stitchedCollection.getLoci()
    ]

    all_bed_path = output_folder + analysis_name + '_all_subpeak.bed'
    utils.unParseTable(all_sub_bed, all_bed_path, '\t')

    return all_bed_path
def createExpressionDict(annotationFile, projectFolder, projectName, refseqToNameDict, expCutoff,expressionFile=''):
    '''
    input: an activity table with a header row and the refseq ID in the
    first column. the value is read from the third column, or from the
    second column when a row has no third column
    if no expressionFile is given, projectFolder + 'bamliquidator/matrix.txt'
    is used

    the highest value is kept per refseq ID and per gene name. the cutoff is
    the expCutoff percentile of the per gene values, and every refseq ID
    whose value is above it counts as expressed

    writes projectName + '_EXPRESSED_GENES.txt' and '_EXPRESSED_NM.txt' into
    projectFolder
    output: the list of expressed refseq IDs and the dictionary keyed by
    refseq that points to activity

    annotationFile is not used
    '''

    print('CREATING EXPRESSION DICTIONARY')

    expressionFilename = expressionFile or (projectFolder + 'bamliquidator/matrix.txt')
    expressionTable = utils.parseTable(expressionFilename, '\t')

    expressionDictNM = {}
    expressionDictGene = {}

    for line in expressionTable[1:]:
        trid = line[0]
        geneName = refseqToNameDict[trid]

        try:
            exp = float(line[2])
        except IndexError:
            exp = float(line[1])

        # per NMid: first value seen, replaced only by a strictly higher one
        if trid not in expressionDictNM or exp > expressionDictNM[trid]:
            expressionDictNM[trid] = exp

        # per gene: same rule
        if geneName not in expressionDictGene or exp > expressionDictGene[geneName]:
            expressionDictGene[geneName] = exp

    cutoff = numpy.percentile(list(expressionDictGene.values()), expCutoff)
    print('Expression cutoff: ' + str(cutoff))

    passingNM = [nmid for nmid in expressionDictNM if float(expressionDictNM[nmid]) > cutoff]

    expressedGenes = utils.uniquify([refseqToNameDict[nmid] for nmid in passingNM])
    Genefilename = projectFolder + projectName + '_EXPRESSED_GENES.txt'
    utils.unParseTable(expressedGenes, Genefilename, '')

    expressedNM = utils.uniquify(passingNM)
    NMfilename = projectFolder + projectName + '_EXPRESSED_NM.txt'
    utils.unParseTable(expressedNM, NMfilename, '')

    return expressedNM, expressionDictNM
def makeEnhancerSignalTable(nameDict,mergedRegionMap,medianDict,analysisName,genome,outputFolder):
    '''
    makes a table where each row is an enhancer and each column is the
    background corrected signal of one dataset divided by that dataset's
    median (medianDict[name])

    datasets are taken in sorted name order. after the 6 annotation columns
    of the region map, a dataset with nameDict[name]['background'] == True
    occupies two columns (signal, control) and the control is subtracted;
    any other dataset occupies one column
    negative signal is set to 0 before dividing by the median

    if a row is too short for a (signal, control) pair, the row and the
    offending indexes are printed and the program exits with status 1

    returns the path of the written table
    '''

    #load in the region map
    regionMap = utils.parseTable(mergedRegionMap,'\t')
    namesList = sorted(nameDict.keys())
    signalTable = [['REGION_ID','CHROM','START','STOP','NUM_LOCI','CONSTITUENT_SIZE'] + namesList]

    print("len of %s for namesList" % (len(namesList)))
    print(namesList)

    #the column layout is the same for every row, so work it out once
    #each entry is (name, signal column, control column or None)
    columnPlan = []
    nextColumn = 6 #start w/ the first column w/ data
    for name in namesList:
        #explicit comparison kept on purpose: only a real True (or 1) counts
        if nameDict[name]['background'] == True:
            columnPlan.append((name,nextColumn,nextColumn + 1))
            nextColumn += 2
        else:
            columnPlan.append((name,nextColumn,None))
            nextColumn += 1

    for line in regionMap[1:]:

        newLine = line[0:6]

        for name,enhancerIndex,controlIndex in columnPlan:

            if controlIndex is None:
                enhancerSignal = float(line[enhancerIndex])
            else:
                try:
                    enhancerSignal = float(line[enhancerIndex]) - float(line[controlIndex])
                except IndexError:
                    print(line)
                    print(len(line))
                    print(enhancerIndex)
                    print(controlIndex)
                    #non zero status so that calling pipelines see the failure
                    sys.exit(1)

            if enhancerSignal < 0:
                enhancerSignal = 0
            newLine.append(enhancerSignal/medianDict[name])

        signalTable.append(newLine)

    outputFile = "%s%s_%s_signalTable.txt" % (outputFolder,genome,analysisName)
    print("WRITING MEDIAN NORMALIZED SIGNAL TABLE TO %s" % (outputFile))
    utils.unParseTable(signalTable,outputFile,'\t')

    return outputFile
def makeEnhancerSignalTable(nameDict, mergedRegionMap, medianDict,
                            analysisName, genome, outputFolder):
    '''
    makes a table where each row is an enhancer and each column is the
    background corrected signal of one dataset divided by its median

    the region map holds 6 annotation columns followed by the signal columns
    of the datasets in sorted name order: two columns (signal, control) for
    a dataset whose nameDict[name]['background'] == True, one column
    otherwise. control is subtracted where present, negative values are set
    to 0, and the result is divided by medianDict[name]

    a row that is too short for a (signal, control) pair is printed together
    with the indexes that failed and the program exits with status 1

    returns the path of the written table
    '''

    #load in the region map
    regionMap = utils.parseTable(mergedRegionMap, '\t')
    namesList = sorted(nameDict.keys())
    signalTable = [[
        'REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE'
    ] + namesList]

    print("len of %s for namesList" % (len(namesList)))
    print(namesList)

    for line in regionMap[1:]:

        newLine = line[0:6]

        #datasets are consumed left to right; column is the next unread one
        column = 6
        for name in namesList:

            #explicit comparison kept on purpose: only a real True (or 1) counts
            hasBackground = nameDict[name]['background'] == True

            enhancerIndex = column
            column += 1

            if hasBackground:
                controlIndex = column
                column += 1
                try:
                    enhancerSignal = float(line[enhancerIndex]) - float(
                        line[controlIndex])
                except IndexError:
                    print(line)
                    print(len(line))
                    print(enhancerIndex)
                    print(controlIndex)
                    #exit with a failure status, a bare sys.exit() reports success
                    sys.exit(1)
            else:
                enhancerSignal = float(line[enhancerIndex])

            if enhancerSignal < 0:
                enhancerSignal = 0
            newLine.append(enhancerSignal / medianDict[name])

        signalTable.append(newLine)

    outputFile = "%s%s_%s_signalTable.txt" % (outputFolder, genome,
                                              analysisName)
    print("WRITING MEDIAN NORMALIZED SIGNAL TABLE TO %s" % (outputFile))
    utils.unParseTable(signalTable, outputFile, '\t')

    return outputFile
def summarizeVenn(mapped_path, group_list=['CG', 'THMYCN'], output=''):
    '''
    summarizes occupancy across groups to make a venn diagram

    a column of the mapped table belongs to a group if its header contains
    the group string. for every row the value of a group is the max over
    the group's columns (read as ints)
    every 0/1 combination across the groups is then counted over all rows

    if output is given the count table is written there (nothing is
    returned), otherwise the count table is returned
    '''

    mapped_table = utils.parseTable(mapped_path, '\t')
    header = mapped_table[0]

    #column indexes per group, in group_list order
    group_cols = [
        [header.index(name) for name in header if name.count(group) > 0]
        for group in group_list
    ]
    print(group_cols)

    group_table = [['GFF_LINE', 'ID'] + group_list]
    for line in mapped_table[1:]:
        #one value per group
        group_vector = [
            max([int(line[x]) for x in group_cols[g]])
            for g in range(len(group_list))
        ]
        group_table.append(line[0:2] + group_vector)

    print(group_table[0:5])

    #grow all 0/1 vectors one position at a time
    #each existing vector is extended with a 1 first, then with a 0
    binary_combinations = [[0], [1]]
    for _ in range(len(group_list) - 1):
        grown = []
        for prefix in binary_combinations:
            print(prefix)
            grown.append(list(prefix) + [1])
            grown.append(list(prefix) + [0])
        binary_combinations = grown

    print(binary_combinations)

    count_table = [group_list + ['count']]
    for combo in binary_combinations:
        matching = [row for row in group_table[1:] if row[2:] == combo]
        count_table.append(combo + [len(matching)])

    print(count_table)

    if len(output) > 0:
        utils.unParseTable(count_table, output, '\t')
    else:
        return count_table
def mergeCollections(nameDict, analysisName, output='', superOnly=True):
    '''
    pools the enhancer loci of every dataset in nameDict (read from
    nameDict[name]['enhancerFile'] via makeSECollection), stitches them into
    consensus regions and names them 'merged_<analysisName>_<n>' with n
    counting up from 1 in decreasing size order

    the regions are laid out as 9 column gff rows. with no output path the
    rows are returned, otherwise they are written to output and the path is
    returned
    '''

    allLoci = []
    for name in nameDict.keys():

        seCollection = makeSECollection(nameDict[name]['enhancerFile'], name,
                                        superOnly)
        if superOnly:
            regionType = 'SUPERENHANCERS'
        else:
            regionType = 'ENHANCERS'
        print("DATASET: %s HAS %s %s" % (name, len(seCollection), regionType))

        allLoci.extend(seCollection.getLoci())

    print(len(allLoci))

    #stitch the pooled loci together
    stitchedLoci = utils.LocusCollection(allLoci,
                                         50).stitchCollection().getLoci()
    print("IDENTIFIED %s CONSENSUS ENHANCER REGIONS" % (len(stitchedLoci)))

    #largest region first, IDs follow that order
    sizeOrder = utils.order([locus.len() for locus in stitchedLoci],
                            decreasing=True)
    orderedLoci = [stitchedLoci[i] for i in sizeOrder]
    for rank, locus in enumerate(orderedLoci, 1):
        locus._ID = 'merged_%s_%s' % (analysisName, str(rank))

    mergedGFF = [[
        locus.chr(),
        locus.ID(), '',
        locus.start(),
        locus.end(), '',
        locus.sense(), '',
        locus.ID()
    ] for locus in orderedLoci]

    if len(output) > 0:
        print("writing merged gff to %s" % (output))
        utils.unParseTable(mergedGFF, output, '\t')
        return output

    return mergedGFF
def filterPeaks(tabixFolder,mycTablePath,outputPath,repeatList = []):
    '''
    for every peak, works out what fraction of it is covered by each repeat
    class by querying the tabix indexed repeat gffs
    (tabixFolder + 'hg19_<class>_category_sorted.gff.gz')

    if repeatList is empty, the 3 repeat classes LINE, LTR, Simple_repeat
    are used

    the peak table has a header row; peaks whose ID starts with 'P' are
    skipped. columns used are ID, chrom, start, stop, length
    outputs a table in the format of
    [PEAK_ID,CHROM,START,STOP,LENGTH] + one fraction per repeat class,
    fractions are rounded to 4 places
    '''

    #the default list is only read, never modified
    if len(repeatList) == 0:
        repeatList = ['LINE','LTR','Simple_repeat']

    repeatTable = [['PEAK_ID','CHROM','START','STOP','LENGTH'] + repeatList]

    mycTable = utils.parseTable(mycTablePath,'\t')
    ticker = 0
    for line in mycTable[1:]:

        if line[0][0] == 'P':
            continue
        if ticker % 100 == 0:
            print(ticker)
        ticker += 1

        peak_ID = line[0]
        chrom = line[1]
        start = int(line[2])
        stop = int(line[3])
        length = line[4]
        locusString = '%s:%s-%s' % (chrom,start,stop)

        repeatFractions = []
        for repeatClass in repeatList:

            tabixGFF = '%shg19_%s_category_sorted.gff.gz' % (tabixFolder,repeatClass)
            tabixCmd = 'tabix %s %s' % (tabixGFF,locusString)

            tabix = subprocess.Popen(tabixCmd,stdin = subprocess.PIPE,stderr = subprocess.PIPE,stdout = subprocess.PIPE,shell = True)
            #communicate drains both pipes and waits for tabix to finish, so
            #the process is reaped and cannot block on a full stderr pipe
            tabixOut = tabix.communicate()[0]

            #tabix hands back the overlapping gff lines
            tabixLines = [x.rstrip().split('\t') for x in tabixOut.splitlines()]

            overlapFraction = 0.0
            for gffLine in tabixLines:
                #clip the repeat to the peak before measuring it
                overlapStart = max(start,int(gffLine[3]))
                overlapStop = min(stop,int(gffLine[4]))
                overlapLength = overlapStop - overlapStart
                overlapFraction += float(overlapLength)/float(length)

            repeatFractions.append(round(overlapFraction,4))

        repeatTable.append([peak_ID,chrom,start,stop,length] + repeatFractions)

    utils.unParseTable(repeatTable,outputPath,'\t')
def assignEnhancerRank(enhancerToGeneFile,
                       enhancerFile1,
                       enhancerFile2,
                       name1,
                       name2,
                       rankOutput=''):
    '''
    for every region in the enhancerToGene table, looks up the overlapping
    enhancers in each of the two enhancer files and appends the smallest
    'rank' value found among them (two new columns, <name1>_rank and
    <name2>_rank)

    a region with no overlap in a file gets the number of enhancers in that
    file as its rank, i.e. it is placed dead last

    with no rankOutput the updated table is returned, otherwise it is
    written to rankOutput and nothing is returned
    '''

    enhancerToGene = utils.parseTable(enhancerToGeneFile, '\t')

    enhancerCollection1 = makeSECollection(enhancerFile1, name1, False)
    enhancerCollection2 = makeSECollection(enhancerFile2, name2, False)

    enhancerDict1 = makeSEDict(enhancerFile1, name1, False)
    enhancerDict2 = makeSEDict(enhancerFile2, name2, False)

    def bestRank(collection, rankDict, locus):
        #smallest rank among the overlapping enhancers, or last place
        overlapping = collection.getOverlap(locus, 'both')
        if len(overlapping) == 0:
            return len(collection)
        return min([rankDict[x.ID()]['rank'] for x in overlapping])

    #the table is updated in place, header first
    enhancerToGene[0] += ['%s_rank' % name1, '%s_rank' % name2]

    for line in enhancerToGene[1:]:

        locusLine = utils.Locus(line[1], line[2], line[3], '.', line[0])

        enhancer1Rank = bestRank(enhancerCollection1, enhancerDict1,
                                 locusLine)
        enhancer2Rank = bestRank(enhancerCollection2, enhancerDict2,
                                 locusLine)

        line += [enhancer1Rank, enhancer2Rank]

    if len(rankOutput) == 0:
        return enhancerToGene

    utils.unParseTable(enhancerToGene, rankOutput, '\t')
def mapGFFLineToBed(gffLine, outFolder, nBins, bedCollection, header=''):
    '''
    for one gff line, writes the rectangles to draw for every bed locus that
    overlaps it, plus a table of the bed names

    x coordinates are distances from the reference point of the region (its
    end for '-' strand lines, its start otherwise) scaled so that the whole
    region spans nBins. every distinct bed ID gets its own row, 2 units
    below the previous one in sorted ID order

    writes <outFolder><gffString>_bedDiagramTemp.txt and
    <outFolder><gffString>_bedNameTemp.txt where gffString is header, or
    chrom_strand_start_end of the gff line when no header is given
    '''

    gffString = header
    if len(header) == 0:
        gffString = '%s_%s_%s_%s' % (gffLine[0], gffLine[6], gffLine[3], gffLine[4])

    gffLocus = utils.Locus(gffLine[0], int(gffLine[3]), int(gffLine[4]), gffLine[6], gffLine[1])
    scaleFactor = float(nBins) / gffLocus.len()

    overlapLoci = bedCollection.getOverlap(gffLocus, sense='both')
    print("IDENTIFIED %s OVERLAPPING BED LOCI FOR REGION %s" % (len(overlapLoci), gffLine))

    # beds come from multiple sources, so each ID name is drawn at its own offset
    bedNamesList = sorted(utils.uniquify([locus.ID() for locus in overlapLoci]))
    offsetDict = dict((name, 2 * row) for row, name in enumerate(bedNamesList))

    # distances are measured from the strand aware start of the region
    refPoint = int(gffLine[4]) if gffLine[6] == '-' else int(gffLine[3])

    # both tables start with an all zero first row
    nameTable = [['', 0, 0]]
    for name in bedNamesList:
        nameTable.append([name, 0, 0.0 - offsetDict[name]])

    diagramTable = [[0, 0, 0, 0]]
    for bedLocus in overlapLoci:
        offset = offsetDict[bedLocus.ID()]
        start, stop = [abs(x - refPoint) * scaleFactor for x in bedLocus.coords()]
        diagramTable.append([start, -0.5 - offset, stop, 0.5 - offset])

    utils.unParseTable(diagramTable, outFolder + gffString + '_bedDiagramTemp.txt', '\t')
    utils.unParseTable(nameTable, outFolder + gffString + '_bedNameTemp.txt', '\t')
def generateSubpeakFASTA(TFtoEnhancerDict, subpeaks, genomeDirectory, projectName, projectFolder, constExtension):
    '''
    from a BED file of constituents generate a FASTA for the constituents
    contained within the candidate supers

    for every region of every gene in TFtoEnhancerDict, the overlapping
    subpeaks are extended by constExtension on both sides and stitched
    the stitched loci are written to projectFolder + projectName +
    '_subpeaks.bed' (with a track line) and their upper cased sequence to
    projectFolder + projectName + '_SUBPEAKS.fa'

    fasta titles are gene|chrom|start|end
    '''

    subpeakDict = {}
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]
    subpeakTable = utils.parseTable(subpeaks, '\t')

    subpeakLoci = [utils.Locus(l[0], int(l[1]), int(l[2]), '.') for l in subpeakTable]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    for gene in TFtoEnhancerDict.keys():
        subpeakDict[gene] = []
        for region in TFtoEnhancerDict[gene]:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [utils.makeSearchLocus(x, constExtension, constExtension) for x in overlaps]

            #stitching merges extended subpeaks that now run into each other
            overlapCollection = utils.LocusCollection(extendedOverlaps, 50).stitchCollection()
            for overlap in overlapCollection.getLoci():
                subpeakBED.append([overlap.chr(), overlap.start(), overlap.end()])
                subpeakDict[gene].append(overlap)

    bedfilename = projectFolder + projectName + '_subpeaks.bed'
    utils.unParseTable(subpeakBED, bedfilename, '\t')

    fasta = []
    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:

            fastaTitle = gene + '|' + subpeak.chr() + '|' + str(subpeak.start()) + '|' + str(subpeak.end())
            #sequence is fetched with both coordinates shifted by +1
            #(NOTE(review): presumably a 0 to 1 based conversion -- confirm)
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(), int(subpeak.start() + 1),
                                       int(subpeak.end() + 1))

            fasta.append('>' + fastaTitle)
            #string method instead of a bare upper(), which is not a builtin
            fasta.append(fastaLine.upper())

    outname = projectFolder + projectName + '_SUBPEAKS.fa'
    utils.unParseTable(fasta, outname, '')
def findValleys(gene_to_enhancer_dict,
                bamFileList,
                projectName,
                projectFolder,
                cutoff=0.2):
    '''
    takes in the gene -> enhancer regions dict and a list of bam paths and
    finds the valleys of every region

    each region is scored with scoreValley; every score above cutoff becomes
    a 10bp locus (score index i covers region start + 10*i to
    region start + 10*(i+1)). the loci of a gene are stitched with
    stitchValleys and collected, genes are processed in sorted order

    writes all stitched valleys (chrom, start, end) to
    projectFolder + projectName + '_all_valleys.bed' and returns that path
    '''

    #one bam object per path, plus the longest read length across them
    bam_list = [utils.Bam(bam_path) for bam_path in bamFileList]
    max_read_length = max([bam.getReadLengths()[0] for bam in bam_list])

    all_valley_bed = []
    n_regions = 0

    print('number of regions processed:')
    for gene in sorted(gene_to_enhancer_dict.keys()):

        gene_valleys = []

        for region in gene_to_enhancer_dict[gene]:

            #progress report every 100 regions
            if n_regions % 100 == 0:
                print(n_regions)
            n_regions += 1

            scoreArray = scoreValley(region, bam_list, max_read_length,
                                     projectName, projectFolder)

            region_valleys = [
                utils.Locus(region.chr(),
                            region.start() + index * 10,
                            region.start() + (index + 1) * 10, '.')
                for index, score in enumerate(scoreArray) if score > cutoff
            ]
            gene_valleys.extend(region_valleys)

        for valley in stitchValleys(gene_valleys):
            all_valley_bed.append(
                [valley.chr(), valley.start(), valley.end()])

    all_bed_path = projectFolder + projectName + '_all_valleys.bed'
    utils.unParseTable(all_valley_bed, all_bed_path, '\t')

    return all_bed_path
def getTonyInfo(uniqueIDList, colList):
    '''
    pass this a uniqueID List and a list of columns

    queries the database through /ark/tony/admin/getDB_Data.pl and writes
    the result to /grail/projects/masterBamTable.txt with the fixed header
    GENOME, SOURCE, CELL_TYPE, NAME, BAMFILE (genome is upper cased)

    the first line of the query output is treated as a header and skipped.
    for the remaining lines the first field is the unique ID and the next 5
    fields are stored under the fixed header names, in that order
    (NOTE(review): so colList presumably has to request exactly these 5
    columns in this order -- confirm)
    rows sharing a unique ID collapse to the last one

    raises IndexError if the query returns no output at all
    '''

    uniqueIDString = ','.join(uniqueIDList)
    columnString = ','.join([str(x) for x in colList])

    cmd = "perl /ark/tony/admin/getDB_Data.pl -i %s -c %s -o TAB" % (
        uniqueIDString, columnString)

    sqlOut = subprocess.Popen(cmd,
                              stdin=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              shell=True)
    sqlText, sqlErr = sqlOut.communicate()

    sqlTable = [x.split('\t') for x in sqlText.split('\n') if len(x) > 0]

    #an empty result means the query failed; stop before the master table
    #is touched and pass on what the script wrote to stderr
    if len(sqlTable) == 0:
        raise IndexError('getDB_Data.pl returned no rows: %s' % (sqlErr))

    header = ['GENOME', 'SOURCE', 'CELL_TYPE', 'NAME', 'BAMFILE']

    tonyDict = {}
    for line in sqlTable[1:]:
        uniqueID = line[0]
        tonyDict[uniqueID] = {}
        for i in range(len(header)):
            tonyDict[uniqueID][header[i]] = line[(i + 1)]

    newTable = [header]
    for key in tonyDict.keys():
        entry = tonyDict[key]
        newTable.append([
            str.upper(entry['GENOME']),
            entry['SOURCE'],
            entry['CELL_TYPE'],
            entry['NAME'],
            entry['BAMFILE'],
        ])

    utils.unParseTable(newTable, '/grail/projects/masterBamTable.txt', '\t')
def averagingMappedSignal(mapped_list, output_path, setName):
    '''
    averages signal across a set of mapped gffs and writes the new output

    the first table in mapped_list defines the rows and columns. its rows
    with no signal columns (2 fields or fewer) are left out of the output
    every output cell is the average of that cell across all tables,
    rounded to 4 places and floored at 0
    signal columns are renamed bin_<n>_<setName>

    if a table is missing a cell, a warning is printed and the average is
    taken over the tables that do have it (0 if none has it)

    returns output_path
    '''

    #create a list containing all of the tables
    table_list = [utils.parseTable(mapped_path, '\t') for mapped_path in mapped_list]
    reference_table = table_list[0]
    n_fields = len(reference_table[0])

    #first set up the output header
    output_header = ['GENE_ID', 'locusLine']
    for n in range(n_fields - 2):
        output_header.append('bin_%s_%s' % (n + 1, setName))
    output_table = [output_header]

    #each kept row is built in one go, so gene ID/locus line and signal can
    #never end up on different rows
    for i in range(1, len(reference_table)):

        line = reference_table[i]
        if len(line) <= 2:
            continue

        new_line = line[0:2]

        for j in range(2, n_fields):

            #start empty for every cell so a failed lookup can never reuse
            #the values of the previous column
            signal_vector = []
            for table_index, table in enumerate(table_list):
                try:
                    signal_vector.append(float(table[i][j]))
                except IndexError:
                    print('WARNING: no signal at row %s column %s in %s' %
                          (i, j, mapped_list[table_index]))

            if len(signal_vector) == 0:
                signal = 0
            else:
                signal = max(round(numpy.average(signal_vector), 4), 0)
            new_line.append(signal)

        output_table.append(new_line)

    print(len(reference_table))
    print(len(output_table))
    utils.unParseTable(output_table, output_path, '\t')
    return output_path
def fix_table_s1(ob_s1_path):
    '''
    fixes formatting of table s1

    reads the file, and if it comes back as a single line, splits that line
    on carriage returns. every line is stripped of trailing whitespace and
    split on tabs, and the table is written back over the original file

    returns the path of the (rewritten) file
    '''

    #the handle is closed before the same path is written again
    with open(ob_s1_path, 'r') as s1:
        lines = s1.readlines()

    #a file with '\r' only line endings reads as one long line
    if len(lines) == 1:
        lines = lines[0].split('\r')

    s1_table = [line.rstrip().split('\t') for line in lines]

    utils.unParseTable(s1_table, ob_s1_path, '\t')
    return ob_s1_path
def assignEnhancerRank(enhancerToGeneFile,enhancerFile1,enhancerFile2,name1,name2,rankOutput=''):
    '''
    adds two rank columns (<name1>_rank, <name2>_rank) to the enhancerToGene
    table

    for each row a locus is built from columns 2-4 (ID from column 1) and
    the enhancers of each file that overlap it are collected. the value
    written is the minimum 'rank' among them. if nothing overlaps, the
    number of enhancers in that file is used, which puts the row dead last

    returns the updated table when rankOutput is empty, otherwise writes the
    table to rankOutput and returns nothing
    '''

    enhancerToGene = utils.parseTable(enhancerToGeneFile,'\t')

    enhancerCollection1 = makeSECollection(enhancerFile1,name1,False)
    enhancerCollection2 = makeSECollection(enhancerFile2,name2,False)

    enhancerDict1 = makeSEDict(enhancerFile1,name1,False)
    enhancerDict2 = makeSEDict(enhancerFile2,name2,False)

    #collection and rank lookup of each dataset, in output column order
    datasets = [
        (enhancerCollection1,enhancerDict1),
        (enhancerCollection2,enhancerDict2),
    ]

    #we're going to update the enhancerToGeneTable
    enhancerToGene[0] += ['%s_rank' % name1,'%s_rank' % name2]

    for rowIndex in range(1,len(enhancerToGene)):

        row = enhancerToGene[rowIndex]
        locusLine = utils.Locus(row[1],row[2],row[3],'.',row[0])

        ranks = []
        for collection,rankDict in datasets:
            hits = collection.getOverlap(locusLine,'both')
            if len(hits) > 0:
                ranks.append(min([rankDict[hit.ID()]['rank'] for hit in hits]))
            else:
                #no overlapping enhancer, rank it last
                ranks.append(len(collection))

        enhancerToGene[rowIndex] += ranks

    if len(rankOutput) > 0:
        utils.unParseTable(enhancerToGene,rankOutput,'\t')
    else:
        return enhancerToGene
def mergeCollections(nameDict,analysisName,output='',superOnly=True):
    '''
    merges the enhancer collections of all datasets in nameDict

    loci of every dataset (nameDict[name]['enhancerFile']) are pooled and
    stitched into consensus regions. regions are ordered by decreasing size
    and renamed merged_<analysisName>_<position>, position starting at 1

    returns the regions as gff rows if output is empty, otherwise writes the
    gff to output and returns output
    '''

    pooledLoci = []
    for name in nameDict.keys():
        collection = makeSECollection(nameDict[name]['enhancerFile'],name,superOnly)
        kind = 'SUPERENHANCERS' if superOnly else 'ENHANCERS'
        print("DATASET: %s HAS %s %s" % (name,len(collection),kind))
        pooledLoci += collection.getLoci()

    print(len(pooledLoci))

    mergedCollection = utils.LocusCollection(pooledLoci,50)

    #stitch the collection together
    stitchedLoci = mergedCollection.stitchCollection().getLoci()
    print("IDENTIFIED %s CONSENSUS ENHANCER REGIONS" % (len(stitchedLoci)))

    #sort by size and provide a unique ID
    sizeOrder = utils.order([locus.len() for locus in stitchedLoci],decreasing=True)

    mergedGFF = []
    for position,index in enumerate(sizeOrder):
        locus = stitchedLoci[index]
        locus._ID = 'merged_%s_%s' % (analysisName,str(position+1))
        mergedGFF.append([locus.chr(),locus.ID(),'',locus.start(),locus.end(),'',locus.sense(),'',locus.ID()])

    if len(output) == 0:
        return mergedGFF

    print("writing merged gff to %s" % (output))
    utils.unParseTable(mergedGFF,output,'\t')
    return output
def makePeakGFFs(peak_path_list):
    '''
    makes a stitched gff for all MYC bound TSS and Distal regions across
    all datasets

    every peak table has a header row. a peak (chrom, start, stop in columns
    2-4) counts as distal if column 6 is 0, as TSS otherwise
    peaks of all tables are pooled per class, stitched, and written to
    gffFolder as HG19_MYC_TSS_REGIONS_-0_+0.gff and
    HG19_MYC_DISTAL_REGIONS_-0_+0.gff (gffFolder is a module level name)

    if utils.checkOutput finds both gffs already, nothing is recomputed

    returns the tss gff path and the distal gff path
    '''

    #setting the output
    tss_gff_path = '%sHG19_MYC_TSS_REGIONS_-0_+0.gff' % (gffFolder)
    distal_gff_path = '%sHG19_MYC_DISTAL_REGIONS_-0_+0.gff' % (gffFolder)

    #check to see if already done
    if utils.checkOutput(tss_gff_path,0.1,0.1) and utils.checkOutput(distal_gff_path,0.1,0.1):
        print('OUTPUT FOUND AT %s and %s' % (tss_gff_path,distal_gff_path))
        return tss_gff_path,distal_gff_path

    #empty loci lists to hold everything
    tss_loci = []
    distal_loci = []

    for peak_path in peak_path_list:
        print('processing %s' % (peak_path))

        peak_table = utils.parseTable(peak_path,'\t')

        for line in peak_table[1:]:
            peak_locus = utils.Locus(line[1],line[2],line[3],'.')
            if int(line[5]) == 0:
                distal_loci.append(peak_locus)
            else:
                tss_loci.append(peak_locus)

    #now combine the loci
    print('stitching loci')
    stitched_distal_collection = utils.LocusCollection(distal_loci,50).stitchCollection()
    stitched_tss_collection = utils.LocusCollection(tss_loci,50).stitchCollection()

    #now make the gffs from the stitched collections, so that overlapping
    #peaks from different datasets end up as one region
    distal_gff = utils.locusCollectionToGFF(stitched_distal_collection)
    tss_gff = utils.locusCollectionToGFF(stitched_tss_collection)

    #now write to disk
    utils.unParseTable(distal_gff,distal_gff_path,'\t')
    utils.unParseTable(tss_gff,tss_gff_path,'\t')

    return tss_gff_path,distal_gff_path
def addLengths(gene_table_path, peak_table_path):
    '''
    adds a TSS_LENGTH and a DISTAL_LENGTH column to a gene table using the
    peak table

    only peak rows with 14 or 15 fields are used. genes are read from the
    comma separated last field (and the second to last one for 15 field
    rows). column 5 of the peak row is added to the gene's tss total if
    column 6 is 1, to its distal total otherwise
    genes without any peak get 0 for both

    the output path is gene_table_path with 'GENE_TABLE' replaced by
    'GENE_TABLE_LENGTH'; the path is returned
    '''

    output_path = gene_table_path.replace('GENE_TABLE', 'GENE_TABLE_LENGTH')
    print(output_path)

    tss_dict = defaultdict(int)
    distal_dict = defaultdict(int)

    for peak_line in utils.parseTable(peak_table_path, '\t')[1:]:

        #get the genes
        n_fields = len(peak_line)
        if n_fields == 15:
            raw_genes = peak_line[-1].split(',') + peak_line[-2].split(',')
        elif n_fields == 14:
            raw_genes = peak_line[-1].split(',')
        else:
            continue

        gene_list = utils.uniquify([gene for gene in raw_genes if len(gene) > 0])

        for gene in gene_list:
            if int(peak_line[5]) == 1:
                tss_dict[gene] += int(peak_line[4])
            else:
                distal_dict[gene] += int(peak_line[4])

    #now fill out the gene table
    gene_table = utils.parseTable(gene_table_path, '\t')

    output_table = [gene_table[0] + ['TSS_LENGTH', 'DISTAL_LENGTH']]
    for gene_line in gene_table[1:]:
        gene = gene_line[0]
        output_table.append(gene_line + [tss_dict[gene], distal_dict[gene]])

    utils.unParseTable(output_table, output_path, '\t')

    return output_path
def getTonyInfo(uniqueIDList, colList):
    '''
    pass this a uniqueID List and a list of columns

    runs getDB_Data.pl for the ids and columns, reads its tab delimited output
    and writes a table with the columns GENOME, SOURCE, CELL_TYPE, NAME, BAMFILE
    to /grail/projects/masterBamTable.txt

    the first output row is treated as a header and skipped. on every other row
    field 0 is the uniqueID and fields 1-5 are stored under the five fixed
    column names in that order, so colList is expected to request five columns
    in that order (NOTE(review): confirm against callers)

    raises IndexError if the script returns no rows at all, same as before
    returns nothing
    '''
    uniqueIDString = ','.join(uniqueIDList)
    columnString = ','.join([str(x) for x in colList])

    #argv list instead of a shell string so ids and columns are never shell parsed
    cmd = ['perl', '/ark/tony/admin/getDB_Data.pl',
           '-i', uniqueIDString,
           '-c', columnString,
           '-o', 'TAB']

    #universal_newlines makes communicate() return text on python 2 and 3
    sqlOut = subprocess.Popen(cmd,
                              stdin=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              universal_newlines=True)
    sqlText = sqlOut.communicate()[0]

    sqlTable = [x.split('\t') for x in sqlText.split('\n') if len(x) > 0]
    if len(sqlTable) == 0:
        #fail before the master table on disk gets overwritten with nothing
        raise IndexError('getDB_Data.pl returned no rows for ids: %s' % (uniqueIDString))

    #the header reported by the script is not used, the output columns are fixed
    header = ['GENOME', 'SOURCE', 'CELL_TYPE', 'NAME', 'BAMFILE']

    #keyed by uniqueID so a repeated id keeps only its last row
    tonyDict = {}
    for line in sqlTable[1:]:
        uniqueID = line[0]
        tonyDict[uniqueID] = {}
        for i in range(len(header)):
            tonyDict[uniqueID][header[i]] = line[(i + 1)]

    newTable = [header]
    for key in tonyDict.keys():
        entry = tonyDict[key]
        newTable.append([entry['GENOME'].upper(),
                         entry['SOURCE'],
                         entry['CELL_TYPE'],
                         entry['NAME'],
                         entry['BAMFILE']])

    utils.unParseTable(newTable, '/grail/projects/masterBamTable.txt', '\t')
def findValleys(gene_to_enhancer_dict, bamFileList, projectName, projectFolder, cutoff=0.2):
    '''
    takes in the super dict (gene -> list of enhancer region loci)
    scores every region with scoreValley and keeps the bins scoring above cutoff
    as valley loci, which are then stitched per gene with stitchValleys

    writes all stitched valleys of all genes to one bed file
    (projectFolder + projectName + '_all_valleys.bed') and returns that path.
    the per gene valley loci are not returned

    raises ValueError if bamFileList is empty (max of an empty list)
    '''
    all_valley_bed = []

    #start w/ a bamFileList and make a list of bam type objects
    bam_list = [utils.Bam(bam_path) for bam_path in bamFileList]
    max_read_length = max([bam.getReadLengths()[0] for bam in bam_list])

    #sorted() instead of keys().sort(), which breaks on python 3
    gene_list = sorted(gene_to_enhancer_dict.keys())

    ticker = 0
    print('number of regions processed:')
    for gene in gene_list:
        gene_valleys = []
        for region in gene_to_enhancer_dict[gene]:
            if ticker % 100 == 0:
                print(ticker)
            ticker += 1

            scoreArray = scoreValley(region, bam_list, max_read_length, projectName, projectFolder)
            for index, score in enumerate(scoreArray):
                if score > cutoff:
                    #each score is turned into a 10 bp window offset from the region start
                    #presumably scoreValley scores 10 bp bins -- TODO confirm
                    gene_valleys.append(utils.Locus(region.chr(),
                                                    region.start() + index * 10,
                                                    region.start() + (index + 1) * 10,
                                                    '.'))

        for valley in stitchValleys(gene_valleys):
            all_valley_bed.append([valley.chr(), valley.start(), valley.end()])

    all_bed_path = projectFolder + projectName + '_all_valleys.bed'
    utils.unParseTable(all_valley_bed, all_bed_path, '\t')

    return all_bed_path
def mapGFFLineToBed(gffLine, outFolder, nBins, bedCollection, header=''):
    '''
    for one gff line, writes the rectangles and labels needed to draw the bed
    regions that overlap it

    two tab delimited files are written into outFolder, both prefixed with
    header (or with chrom_sense_start_stop of the gff line when header is empty):
      <prefix>_bedDiagramTemp.txt  one [x0, y0, x1, y1] row per overlapping bed locus
      <prefix>_bedNameTemp.txt     one [name, 0, y] row per distinct bed ID
    both files start with an all zero / empty placeholder row
    x values are distances from the region's reference edge scaled to nBins
    '''
    if len(header) != 0:
        prefix = header
    else:
        prefix = '%s_%s_%s_%s' % (gffLine[0], gffLine[6], gffLine[3], gffLine[4])

    region = utils.Locus(gffLine[0], int(gffLine[3]), int(gffLine[4]), gffLine[6], gffLine[1])
    bins_per_bp = float(nBins) / region.len()

    hits = bedCollection.getOverlap(region, sense='both')
    print("IDENTIFIED %s OVERLAPPING BED LOCI FOR REGION %s" % (len(hits), gffLine))

    # beds come from multiple sources, so every distinct ID gets its own row,
    # two units below the previous one, in sorted ID order
    track_names = utils.uniquify([locus.ID() for locus in hits])
    track_names.sort()
    row_offset = {}
    for rank, track in enumerate(track_names):
        row_offset[track] = 2 * rank

    # distances are measured from the stop on the minus strand, else from the start
    anchor = int(gffLine[4]) if gffLine[6] == '-' else int(gffLine[3])

    label_rows = [['', 0, 0]]
    label_rows.extend([track, 0, 0.0 - row_offset[track]] for track in track_names)

    box_rows = [[0, 0, 0, 0]]
    for bed_locus in hits:
        shift = row_offset[bed_locus.ID()]
        [left, right] = [abs(x - anchor) * bins_per_bp for x in bed_locus.coords()]
        box_rows.append([left, -0.5 - shift, right, 0.5 - shift])

    utils.unParseTable(box_rows, outFolder + prefix + '_bedDiagramTemp.txt', '\t')
    utils.unParseTable(label_rows, outFolder + prefix + '_bedNameTemp.txt', '\t')
def summarizeData(dataFile, output='', namesList=None):
    '''
    writes a read and peak summary table for the datasets in a data table

    dataFile: data table loaded with pipeline_dfci.loadDataTable
    output: path of the summary table. when empty, dataFile with .txt replaced
        by _SUMMARY.txt is used
    namesList: dataset names to summarize. when empty or None all datasets in
        the data table are used

    columns written are NAME, TOTAL_READS, MAPPED_READS, PEAKS. both read
    counts are in millions rounded to 2 places. PEAKS is only counted for names
    containing H3K27AC or ATAC exactly once (from the enrichedMacs file inside
    the module level macsEnrichedFolder), all other names get NA
    returns nothing
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #None default avoids a shared mutable default. an explicit [] still means "all"
    if not namesList:
        namesList = dataDict.keys()

    if len(output) == 0:
        #str.replace works on python 2 and 3, string.replace was python 2 only
        output = dataFile.replace('.txt', '_SUMMARY.txt')

    print('WRITING OUTPUT TO %s' % (output))
    readTable = [['NAME', 'TOTAL_READS', 'MAPPED_READS', 'PEAKS']]

    for name in namesList:
        print('GETTING DATA SUMMARY FOR %s' % (name))
        uniqueID = dataDict[name]['uniqueID']

        #column 67 is used as the mapped read count, column 68 as the total
        mappedReads = round(float(pipeline_dfci.getTONYInfo(uniqueID, '67')) / 1000000, 2)

        #only the part before the first '::' of column 68 is the number
        totalRaw = pipeline_dfci.getTONYInfo(uniqueID, '68')
        totalRaw = int(totalRaw.split('::')[0])
        totalReads = round(float(totalRaw) / 1000000, 2)

        #get the peak count
        if name.count('H3K27AC') == 1 or name.count('ATAC') == 1:
            peakCollection = utils.importBoundRegion(
                '%s%s' % (macsEnrichedFolder, dataDict[name]['enrichedMacs']), name)
            peakCount = len(peakCollection)
        else:
            peakCount = 'NA'

        newLine = [name, totalReads, mappedReads, peakCount]
        print(newLine)
        readTable.append(newLine)

    utils.unParseTable(readTable, output, '\t')
def buildGraph(edgeDict, gene_to_enhancer_dict, output_folder, analysis_name, cutoff=1):
    '''
    from the collapsed edge dictionary, build a target graph
    require at least n motifs to constitute an edge where n is set by cutoff. default is 1

    edgeDict: source TF -> target -> list of motif loci
    gene_to_enhancer_dict: target -> list of region loci, used to report which
        regions each motif locus overlaps

    the graph only gets edges whose target is itself a source in edgeDict
    every motif locus, TF target or not, goes into
    <output_folder><analysis_name>_EDGE_TABLE.txt with TF_INTERACTION 1 or 0
    returns the networkx DiGraph
    '''
    #sorted() instead of keys().sort(), which breaks on python 3
    node_list = sorted(edgeDict.keys())
    #set for the membership checks done once per source/target pair
    node_set = set(node_list)

    #this is only edges between TFs
    graph = nx.DiGraph(name=analysis_name)
    graph.add_nodes_from(node_list)

    #this stores ALL edges identified by motifs
    edge_table = [['SOURCE', 'TARGET', 'CHROM', 'START', 'STOP', 'REGION_ID', 'TF_INTERACTION']]
    edge_output = '%s%s_EDGE_TABLE.txt' % (output_folder, analysis_name)

    for source in node_list:
        print(source)
        for target in sorted(edgeDict[source].keys()):
            #now we need to see which target regions this guy overlaps
            target_regions = gene_to_enhancer_dict[target]
            target_collection = utils.LocusCollection(target_regions, 50)

            #get the edges hitting that target
            edgeLoci = edgeDict[source][target]

            target_is_tf = target in node_set
            tf_interaction = 1 if target_is_tf else 0

            #only add to the graph if this is a TF/TF interaction
            if len(edgeLoci) >= cutoff and target_is_tf:
                graph.add_edge(source, target)

            #now for each edge, add to the table
            for edgeLocus in edgeLoci:
                regionString = ','.join([locus.ID() for locus in target_collection.getOverlap(edgeLocus)])
                edge_table.append([source, target,
                                   edgeLocus.chr(), edgeLocus.start(), edgeLocus.end(),
                                   regionString, tf_interaction])

    utils.unParseTable(edge_table, edge_output, '\t')
    return graph
def buildGraph(projectFolder, projectName, motifConvertFile, refseqToNameDict, canidateGenes):
    '''
    import the FIMO output once it's finished
    build the networkX directed graph

    motifConvertFile: tab delimited table, column 0 motif id, column 1 gene name
    reads <projectFolder>FIMO/fimo.txt (header skipped). on each row column 0 is
    the motif id and column 2 is a region string of the form id|chrom|start
    whose id is looked up in refseqToNameDict to get the target gene. columns 3
    and 4 are the motif offsets within the region

    every row adds the edge motif gene -> target gene. the genomic motif
    positions are written per source gene to
    <projectFolder>motifBED/<gene>_<projectName>_motifs.bed
    returns the graph. raises KeyError for a motif id or region id missing
    from the lookup tables
    '''
    #from motif name to gene name
    motifDatabaseDict = {}
    for line in utils.parseTable(motifConvertFile, '\t'):
        motifDatabaseDict[line[0]] = line[1]

    fimoTable = utils.parseTable(projectFolder + 'FIMO/fimo.txt', '\t')

    graph = nx.DiGraph(name=projectName)
    graph.add_nodes_from(canidateGenes)

    motifDict = defaultdict(list)
    for line in fimoTable[1:]:
        source = motifDatabaseDict[line[0]]  # motifId
        region = line[2].split('|')
        target = refseqToNameDict[region[0]]  # gene name corresponding to the NMid
        graph.add_edge(source, target)

        #shift the motif offsets by the region start to get genomic coordinates
        region_start = int(region[2])
        motifDict[source].append((region[1],
                                  region_start + int(line[3]),
                                  region_start + int(line[4])))

    utils.formatFolder(projectFolder + 'motifBED/', True)
    for gene, locations in motifDict.items():
        bed = [[chrom, start, stop] for (chrom, start, stop) in locations]
        filename = projectFolder + 'motifBED/' + gene + '_' + projectName + '_motifs.bed'
        utils.unParseTable(bed, filename, '\t')

    return graph
def mergeCollections(superFile1, superFile2, name1, name2, output=''):
    '''
    merges them collections

    loads both files with makeSECollection, pools their loci and stitches them.
    a stitched locus that overlaps loci from both inputs is renamed
    CONSERVED_<n> (n counts up from 1). any other stitched locus keeps its ID
    minus the first two characters
    (presumably the prefix added by stitchCollection -- TODO confirm)

    returns the gff table when output is empty
    otherwise writes the gff to output and returns the output path
    '''
    conSuperCollection = makeSECollection(superFile1, name1)
    tnfSuperCollection = makeSECollection(superFile2, name2)

    #now merge them
    mergedLoci = conSuperCollection.getLoci() + tnfSuperCollection.getLoci()
    mergedCollection = utils.LocusCollection(mergedLoci, 50)

    #stitch the collection together
    stitchedLoci = mergedCollection.stitchCollection().getLoci()

    #loci that are in both get renamed with a new unique identifier
    renamedLoci = []
    ticker = 1
    for locus in stitchedLoci:
        inFirst = len(conSuperCollection.getOverlap(locus)) > 0
        inSecond = len(tnfSuperCollection.getOverlap(locus)) > 0
        if inFirst and inSecond:
            locus._ID = 'CONSERVED_%s' % (str(ticker))
            ticker += 1
        else:
            locus._ID = locus.ID()[2:]
        renamedLoci.append(locus)

    #now we turn this into a gff and write it out
    gff = utils.locusCollectionToGFF(utils.LocusCollection(renamedLoci, 50))

    if len(output) == 0:
        return gff

    #print as a function so this also runs on python 3
    print("writing merged gff to %s" % (output))
    utils.unParseTable(gff, output, '\t')
    return output
def makeFoldTable(annotFile, analysisName, testName, controlName, testMMR,
                  controlMMR, testIdxFile, controlIdxFile, outputFolder, epsilon=1):
    '''
    makes the fold table and writes to disk
    first column is guideID, second column is gene name, third is the log2
    ratio, then the normalized test and control counts rounded to 4 places

    rows come out in the order of the test idx file. the control idx file is
    read by row position, so both files need the same guides in the same order
    normalized count = column 2 of the idx row / MMR + epsilon

    writes <outputFolder><analysisName>_log2Ratio.txt and returns that path
    '''
    #only the guide -> gene lookup is needed from the annotation
    guide_to_gene, _ = makeAnnotDict(annotFile)

    test_rows = utils.parseTable(testIdxFile, '\t')
    control_rows = utils.parseTable(controlIdxFile, '\t')

    fold_rows = [['GUIDE_ID', 'GENE', 'LOG2_RATIO', testName, controlName]]

    for row_number, test_row in enumerate(test_rows):
        guide = test_row[0]
        gene = guide_to_gene[guide]

        #divide the count by the MMR then add epsilon
        test_norm = float(test_row[2]) / testMMR + epsilon
        control_norm = float(control_rows[row_number][2]) / controlMMR + epsilon

        fold_rows.append([guide,
                          gene,
                          numpy.log2(test_norm / control_norm),
                          round(test_norm, 4),
                          round(control_norm, 4)])

    fold_path = '%s%s_log2Ratio.txt' % (outputFolder, analysisName)
    utils.unParseTable(fold_rows, fold_path, '\t')
    return fold_path
def main():
    '''
    compares the size selected WT and Mut canyon beds and writes the
    overlapping WT/Mut pairs in which one canyon is longer than the other

    every output row is
    [wt chrom, wt start, wt end, wt length, mut chrom, mut start, mut end, mut length]
    pairs with a longer mut canyon go to tables/MUT_canyons_expanded.txt
    pairs with a longer wt canyon go to tables/WT_canyons_expanded.txt
    pairs of equal length are in neither table
    the number of rows of each table is printed (mut first)
    '''
    projectFolder = '/storage/goodell/projects/jmreyes/amish_ayala/'
    bedFolder = projectFolder + 'bed/'

    def loadCanyons(bedPath, prefix):
        #one locus per bed row with an ID of the form <prefix>chrom:start-end
        return utils.LocusCollection([
            utils.Locus(x[0], x[1], x[2], '.',
                        prefix + str(x[0]) + ':' + str(x[1]) + '-' + str(x[2]))
            for x in utils.parseTable(bedPath, '\t')
        ])

    wtCanyonLocusCollection = loadCanyons(bedFolder + 'canyon_WT_sizeSelected.bed', 'wt_')
    mutCanyonLocusCollection = loadCanyons(bedFolder + 'canyon_Mut_sizeSelected.bed', 'mut_')

    wtExpansion = []
    mutExpansion = []

    #the counters and unique lists that used to be collected here, and the
    #second pass over the mut canyons, never fed any output and were removed
    for locus in wtCanyonLocusCollection.getLoci():
        wtLength = locus.end() - locus.start()
        for overlap in mutCanyonLocusCollection.getOverlap(locus, 'both'):
            mutLength = overlap.end() - overlap.start()
            newLine = [locus.chr(), locus.start(), locus.end(), wtLength,
                       overlap.chr(), overlap.start(), overlap.end(), mutLength]
            if mutLength > wtLength:
                mutExpansion.append(newLine)
            elif wtLength > mutLength:
                wtExpansion.append(newLine)

    #print as a function so this also runs on python 3
    print(len(mutExpansion))
    print(len(wtExpansion))

    utils.unParseTable(mutExpansion, projectFolder + 'tables/MUT_canyons_expanded.txt', '\t')
    utils.unParseTable(wtExpansion, projectFolder + 'tables/WT_canyons_expanded.txt', '\t')
def generateSubpeakFASTA(TFandSuperDict, subpeaks, genomeDirectory, projectName, projectFolder, motifExtension):
    '''
    takes as input a BED file of constituents
    outputs a FASTA file of merged extended super-enhancer constituents and associated formatted name

    TFandSuperDict: gene -> list of region loci
    subpeaks: path to a tab delimited bed (chrom, start, stop) of constituents

    for every region of every gene, the overlapping subpeaks are extended by
    motifExtension on both sides and stitched. the results are written to
    <projectFolder><projectName>_subpeaks.bed (with a track line) and their
    upper cased sequences to <projectFolder><projectName>_SUBPEAKS.fa with
    titles of the form gene|chrom|start|end
    returns nothing
    '''
    #print as a function so this also runs on python 3
    print('MAKE FASTA')

    subpeakDict = {}
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]
    subpeakTable = utils.parseTable(subpeaks, '\t')

    subpeakLoci = [utils.Locus(l[0], int(l[1]), int(l[2]), '.') for l in subpeakTable]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    for gene in TFandSuperDict.keys():
        subpeakDict[gene] = []
        for region in TFandSuperDict[gene]:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [utils.makeSearchLocus(x, motifExtension, motifExtension) for x in overlaps]

            overlapCollectionTemp = utils.LocusCollection(extendedOverlaps, 50)
            overlapCollection = overlapCollectionTemp.stitchCollection()
            for overlap in overlapCollection.getLoci():
                subpeakBED.append([overlap.chr(), overlap.start(), overlap.end()])
                subpeakDict[gene].append(overlap)

    bedfilename = projectFolder + projectName + '_subpeaks.bed'
    utils.unParseTable(subpeakBED, bedfilename, '\t')

    fasta = []
    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:
            fastaTitle = gene + '|' + subpeak.chr() + '|' + str(subpeak.start()) + '|' + str(subpeak.end())
            #both coordinates are shifted by one before fetching
            #NOTE(review): confirm the end + 1 against utils.fetchSeq's convention
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(),
                                       int(subpeak.start() + 1), int(subpeak.end() + 1))

            fasta.append('>' + fastaTitle)
            #str method instead of a bare upper(), which needs python 2's string module
            fasta.append(fastaLine.upper())

    # Output the fasta file of extended SE constituents
    #every entry of fasta is a plain string, so it is written with an empty separator
    outname = projectFolder + projectName + '_SUBPEAKS.fa'
    utils.unParseTable(fasta, outname, '')
def getExpanded(locusTable, expansion, status, output):
    '''
    filters a table of overlapping WT/Mut locus pairs with expansionStat and
    writes the rows it returns to output

    locusTable: tab delimited table, columns 0-3 are the wt locus and columns
        4-7 the mut locus
    expansion: threshold handed on to expansionStat
    status: 'WT' calls expansionStat(wt, mut), 'MUT' calls expansionStat(mut, wt)
        any other value keeps no rows
    output: path of the table to write

    prints the number of kept rows. returns nothing
    '''
    loci = utils.parseTable(locusTable, '\t')
    expandedList = []

    for line in loci:
        wtLocus = line[0:4]
        mutLocus = line[4:8]

        if status == 'WT':
            testLocus, refLocus = wtLocus, mutLocus
        elif status == 'MUT':
            testLocus, refLocus = mutLocus, wtLocus
        else:
            continue

        #the caller's expansion is passed through. it used to be ignored in favor of a fixed 0.1
        newLine = expansionStat(testLocus, refLocus, expansion=expansion)
        if len(newLine) > 0:
            expandedList.append(newLine)

    #one formatted string keeps the python 2 output and also runs on python 3
    print('%s  expanded loci in  %s' % (len(expandedList), status))
    utils.unParseTable(expandedList, output, '\t')
def makeRigerTable(foldTableFile, output=''):
    '''
    converts a fold table into a table for RIGER (FRIGER) analysis

    foldTableFile: tab delimited table with a header row, column 0 construct id,
        column 1 gene, column 2 score
    output: path to write. when empty, foldTableFile with _log2Ratio.txt
        replaced by _friger.txt is used

    constructs are written in decreasing score order with the columns
    Construct, GeneSymbol, NormalizedScore, Construct Rank, HairpinWeight
    genes with only one construct are left out and do not use up a rank
    the weight is always 1. returns the output path
    '''
    #need a table of this format
    rigerTable = [[
        'Construct', 'GeneSymbol', 'NormalizedScore', 'Construct Rank',
        'HairpinWeight'
    ]]
    #set weight to 1 for now

    foldTable = utils.parseTable(foldTableFile, '\t')
    constructOrder = utils.order([float(line[2]) for line in foldTable[1:]],
                                 decreasing=True)

    #make geneCountDict
    print("making gene count dictionary")
    geneCountDict = defaultdict(int)
    for line in foldTable[1:]:
        geneCountDict[line[1]] += 1

    print("iterating through constructs")
    constructRank = 1
    for i in constructOrder:
        rowIndex = i + 1  # accounts for the header
        geneName = foldTable[rowIndex][1]
        if geneCountDict[geneName] == 1:
            print(
                "Gene %s only has one guide RNA. Excluding from FRIGER analysis"
                % (geneName))
            continue

        rigerTable.append(foldTable[rowIndex][0:3] + [constructRank, 1])
        constructRank += 1

    if len(output) == 0:
        #str.replace works on python 2 and 3, string.replace was python 2 only
        output = foldTableFile.replace('_log2Ratio.txt', '_friger.txt')

    utils.unParseTable(rigerTable, output, '\t')
    return output
def main():
    '''
    makes two flanking beds around the edges of the size selected WT canyons

    for every canyon whose start is greater than extension:
      upstream row   = [chrom, start - extension, start + extension, <name>_5Flank]
      downstream row = [chrom, end - extension, end + extension, <name>_3Flank]
    where <name> is WT_<chrom>(.):<start>-<end>
    canyons closer than extension to position 0 are skipped

    writes HG19_WT_1000extend_upstreamFlanking.bed and
    HG19_WT_1000extend_downstreamFlanking.bed into the bed folder
    '''
    projectFolder = '/storage/goodell/projects/jmreyes/amish_ayala/'
    bedFolder = projectFolder + 'bed/'
    canyonBed = bedFolder + 'canyon_WT_sizeSelected.bed'

    extension = 1000
    sampleName = 'WT'

    upstreamEdgeBed = []
    downstreamEdgeBed = []

    outputUp = 'HG19_' + sampleName + '_' + str(extension) + 'extend_upstreamFlanking.bed'
    outputDown = 'HG19_' + sampleName + '_' + str(extension) + 'extend_downstreamFlanking.bed'

    for line in utils.parseTable(canyonBed, '\t'):
        #print as a function so this also runs on python 3
        print(line)

        chrom = line[0]
        start = int(line[1])
        end = int(line[2])

        #a window around the start would reach below 0, skip this canyon
        if start <= extension:
            continue

        canyon_name = sampleName + '_' + str(chrom) + '(.):' + str(start) + '-' + str(end)

        #each flank is a 2 * extension window centered on the canyon edge
        upstreamEdgeBed.append([chrom, start - extension, start + extension,
                                canyon_name + '_5Flank'])
        downstreamEdgeBed.append([chrom, end - extension, end + extension,
                                  canyon_name + '_3Flank'])

    utils.unParseTable(upstreamEdgeBed, bedFolder + outputUp, '\t')
    utils.unParseTable(downstreamEdgeBed, bedFolder + outputDown, '\t')
def calculatePromoterActivity(annotationFile, bamFile, projectName, projectFolder,
                              refseqToNameDict, background=False):
    '''
    calculates the level of acetylation at each TF promoter

    builds a +/- 2500 bp TSS locus for every gene in the annotation, writes
    them to <projectFolder><projectName>_TSS.gff and runs bamliquidator_batch
    on that gff with bamFile (-m -e 200), sending its output to
    <projectFolder>bamliquidator

    refseqToNameDict and background are accepted but not used by the mapping
    returns nothing. the exit status of bamliquidator_batch is not checked
    '''
    #print as a function so this also runs on python 3
    print('GENERATING AN ACTIVITY TABLE USING CHIP DATA')

    startDict = utils.makeStartDict(annotationFile)

    tssLoci = [utils.makeTSSLocus(gene, startDict, 2500, 2500) for gene in startDict]
    tssCollection = utils.LocusCollection(tssLoci, 50)

    gff = utils.locusCollectionToGFF(tssCollection)

    outputname = projectFolder + projectName + '_TSS.gff'
    utils.unParseTable(gff, outputname, '\t')

    #argv list instead of a shell string so paths with spaces or shell
    #characters reach bamliquidator_batch unchanged
    mappingCmd = ['bamliquidator_batch',
                  '-r', outputname,
                  '-o', projectFolder + 'bamliquidator',
                  '-m',
                  '-e', '200',
                  bamFile]

    subprocess.call(mappingCmd)
    print(' '.join(mappingCmd))
def makeRigerTable(foldTableFile, output=''):
    '''
    converts a fold table into a table for RIGER (FRIGER) analysis

    foldTableFile: tab delimited table with a header row, column 0 construct id,
        column 1 gene, column 2 score
    output: path to write. when empty, foldTableFile with _log2Ratio.txt
        replaced by _friger.txt is used

    constructs are written in decreasing score order with the columns
    Construct, GeneSymbol, NormalizedScore, Construct Rank, HairpinWeight
    genes with only one construct are left out and do not use up a rank
    the weight is always 1. returns the output path
    '''
    #need a table of this format
    rigerTable = [['Construct', 'GeneSymbol', 'NormalizedScore', 'Construct Rank', 'HairpinWeight']]
    #set weight to 1 for now

    foldTable = utils.parseTable(foldTableFile, '\t')
    dataRows = foldTable[1:]

    constructOrder = utils.order([float(line[2]) for line in dataRows], decreasing=True)

    #make geneCountDict
    print("making gene count dictionary")
    geneCountDict = defaultdict(int)
    for line in dataRows:
        geneCountDict[line[1]] += 1

    print("iterating through constructs")
    constructRank = 1
    for i in constructOrder:
        #i indexes the data rows, so no header offset is needed here
        row = dataRows[i]
        geneName = row[1]
        if geneCountDict[geneName] == 1:
            print("Gene %s only has one guide RNA. Excluding from FRIGER analysis" % (geneName))
            continue

        rigerTable.append(row[0:3] + [constructRank, 1])
        constructRank += 1

    if len(output) == 0:
        #str.replace works on python 2 and 3, string.replace was python 2 only
        output = foldTableFile.replace('_log2Ratio.txt', '_friger.txt')

    utils.unParseTable(rigerTable, output, '\t')
    return output
def collapseRegionMap(regionMapFile, name='', controlBams=False):
    '''
    takes a regionMap file and collapses signal into a single column
    needs to take into account whether or not controls were used

    regionMapFile: tab delimited table with a header row. columns 0-5 are kept
        as they are, columns 6 onward are signal columns
    name: header of the collapsed column, MERGED_SIGNAL when empty
    controlBams: when true the first half of the signal columns are treated as
        samples and the second half as their controls, paired by position.
        the collapsed value is the mean of max(0, sample - control)
        when false it is the mean of all signal columns

    writes to regionMapFile with REGION replaced by META in the path
    returns the output path
    '''
    regionMap = utils.parseTable(regionMapFile, '\t')

    if len(name) == 0:
        name = 'MERGED_SIGNAL'

    for n, line in enumerate(regionMap):
        if n == 0:
            #new header
            regionMap[n] = line[0:6] + [name]
            continue

        signalLine = [float(x) for x in line[6:]]
        if controlBams:
            #floor division keeps the index an int on python 3 as well
            half = len(signalLine) // 2
            #min signal is 0
            metaVector = [max(0, sample - control)
                          for sample, control in zip(signalLine[:half], signalLine[half:])]
            metaSignal = numpy.mean(metaVector)
        else:
            metaSignal = numpy.mean(signalLine)

        regionMap[n] = list(line[0:6]) + [metaSignal]

    #str.replace works on python 2 and 3, string.replace was python 2 only
    outputFile = regionMapFile.replace('REGION', 'META')
    utils.unParseTable(regionMap, outputFile, '\t')
    return outputFile
def main():
    '''
    command line entry point: maps a sorted bam to the regions of a gff (or an
    enriched region file, which is converted to a gff first) and writes the
    resulting table

    requires -b and -i and exactly one of -c (fixed bin width clustergram) or
    -m (fixed bin number matrix). without -o the output goes to
    <input file name>.mapped in the current working directory
    invalid option combinations print an error plus the help text and exit
    '''
    from optparse import OptionParser

    usage = "usage: %prog [options] -b [SORTED BAMFILE] -i [INPUTFILE] -o [OUTPUTFILE]"
    parser = OptionParser(usage=usage)

    #required flags
    parser.add_option("-b", "--bam", dest="bam", nargs=1, default=None,
                      help="Enter .bam file to be processed.")
    parser.add_option("-i", "--input", dest="input", nargs=1, default=None,
                      help="Enter .gff or ENRICHED REGION file to be processed.")

    #output flag
    parser.add_option("-o", "--output", dest="output", nargs=1, default=None,
                      help="Enter the output filename.")

    #additional options
    parser.add_option("-s", "--sense", dest="sense", nargs=1, default='.',
                      help="Map to '+','-' or 'both' strands. Default maps to both.")
    parser.add_option("-e", "--extension", dest="extension", nargs=1, default=200,
                      help="Extends reads by n bp. Default value is 200bp")
    parser.add_option("-r", "--rpm", dest="rpm", action='store_true', default=False,
                      help="Normalizes density to reads per million (rpm)")
    parser.add_option("-c", "--cluster", dest="cluster", nargs=1, default=None,
                      help="Outputs a fixed bin size clustergram. user must specify bin size.")
    parser.add_option("-m", "--matrix", dest="matrix", nargs=1, default=None,
                      help="Outputs a variable bin sized matrix. User must specify number of bins.")

    (options, args) = parser.parse_args()

    print(options)
    print(args)

    def bail(message):
        #every usage error is reported the same way: message, help text, exit
        print(message)
        parser.print_help()
        exit()

    if options.sense:
        if ['+', '-', '.', 'both'].count(options.sense) == 0:
            bail('ERROR: sense flag must be followed by +,-,.,both')

    if options.cluster and options.matrix:
        bail('ERROR: Cannot specify both matrix and clustergram flags.')

    if options.matrix:
        try:
            int(options.matrix)
        except ValueError:
            bail('ERROR: User must specify an integer bin number for matrix (try 50)')

    if options.cluster:
        try:
            int(options.cluster)
        except ValueError:
            bail('ERROR: User must specify an integer bin size for clustergram (try 25)')

    if not (options.input and options.bam):
        parser.print_help()
        return

    if not (options.cluster or options.matrix):
        #without this the write below failed with a NameError on newGFF
        bail('ERROR: Must specify either the clustergram (-c) or the matrix (-m) flag.')

    inputFile = options.input
    if inputFile.split('.')[-1] != 'gff':
        print('converting file to a .gff')
        gffFile = convertEnrichedRegionsToGFF(inputFile)
    else:
        gffFile = inputFile

    bamFile = options.bam

    if options.output is None:
        #os.path.join puts the separator between the cwd and the file name.
        #plain concatenation glued the name onto the last directory component
        output = os.path.join(os.getcwd(), inputFile.split('/')[-1] + '.mapped')
    else:
        output = options.output

    if options.cluster:
        print('mapping to GFF and making clustergram with fixed bin width')
        newGFF = mapBamToGFF(bamFile, gffFile, options.sense, int(options.extension),
                             options.rpm, int(options.cluster), None)
    else:
        print('mapping to GFF and making a matrix with fixed bin number')
        newGFF = mapBamToGFF(bamFile, gffFile, options.sense, int(options.extension),
                             options.rpm, None, int(options.matrix))

    print('bamToGFF_turbo writing output to: %s' % (output))

    # Hackjob to make subdirectories for ROSE integration
    #best effort: the folder usually exists already, and a real problem will
    #surface in the write right below
    try:
        os.mkdir(os.path.dirname(output))
    except OSError:
        pass

    utils.unParseTable(newGFF, output, '\t')
def makeBamPlotTables(gff, genome, bamFileList, colorList, nBins, sense, extension,
                      rpm, outFolder, names, title, bedCollection):
    '''
    makes a plot table for each line of the gff mapped against all the bams in the bamList

    gff: a gff table or the path to one
    names and colorList are read by position alongside bamFileList
    rpm: when true each bam is normalized by its million mapped reads, taken
        from the third column of samtools idxstats

    for every gff line the gene diagram, bed diagram and plot tables are
    written into outFolder with the prefix line_<n>_<chrom>_<id>_<sense>_<start>_<stop>
    a summary table listing those files per region is written to
    <outFolder><title>_summary.txt and its path is returned
    '''
    # load in the gff
    if isinstance(gff, str):
        gff = utils.parseTable(gff, '\t')

    # load in the annotation
    print('loading in annotation for %s' % (genome))
    geneDict, txCollection = loadAnnotFile(genome)

    # make an MMR dict so MMRs are only computed once
    print('Getting information about read depth in bams')
    mmrDict = {}
    for bamFile in bamFileList:
        # millionMappedReads
        # argv list so bam paths are not shell parsed. universal_newlines makes
        # the output text on python 2 and 3
        idxPipe = subprocess.Popen(['samtools', 'idxstats', bamFile],
                                   stdin=subprocess.PIPE,
                                   stderr=subprocess.PIPE,
                                   stdout=subprocess.PIPE,
                                   universal_newlines=True)
        idxStats = idxPipe.communicate()[0].split('\n')
        idxStats = [line.split('\t') for line in idxStats]
        # the last element is what follows the final newline, not a data row
        rawCount = sum([int(line[2]) for line in idxStats[:-1]])

        if rpm:
            MMR = round(float(rawCount) / 1000000, 4)
        else:
            MMR = 1
        mmrDict[bamFile] = MMR

    ticker = 1
    # go line by line in the gff
    summaryTable = [['DIAGRAM_TABLE', 'NAME_TABLE', 'BED_DIAGRAM_TABLE', 'BED_NAME_TABLE',
                     'PLOT_TABLE', 'CHROM', 'ID', 'SENSE', 'START', 'END']]
    for gffLine in gff:
        gffString = 'line_%s_%s_%s_%s_%s_%s' % (ticker, gffLine[0], gffLine[1],
                                                gffLine[6], gffLine[3], gffLine[4])
        ticker += 1

        print('writing the gene diagram table for region %s' % (gffLine[1]))
        mapGFFLineToAnnot(gffLine, outFolder, nBins, geneDict, txCollection,
                          sense='both', header=gffString)
        mapGFFLineToBed(gffLine, outFolder, nBins, bedCollection, header=gffString)

        outTable = [['BAM', 'GENE_ID', 'NAME', 'LOCUSLINE', 'COLOR1', 'COLOR2', 'COLOR3'] +
                    ['bin_' + str(n) for n in range(1, int(nBins) + 1, 1)]]

        for i, bamFile in enumerate(bamFileList):
            name = names[i]
            color = colorList[i]
            print('getting data for location %s in dataset %s' % (gffLine[1], bamFile))
            mmr = mmrDict[bamFile]
            outTable.append(mapBamToGFFLine(bamFile, mmr, name, gffLine, color,
                                            nBins, sense, extension))

        # get the gene name
        # "in" instead of has_key, which is gone in python 3
        if gffLine[1] in geneDict:
            geneName = geneDict[gffLine[1]].commonName()
        else:
            geneName = gffLine[1]

        diagramTable = outFolder + gffString + '_diagramTemp.txt'
        plotTable = outFolder + gffString + '_plotTemp.txt'
        nameTable = outFolder + gffString + '_nameTemp.txt'
        bedNameTable = outFolder + gffString + '_bedNameTemp.txt'
        bedDiagramTable = outFolder + gffString + '_bedDiagramTemp.txt'

        utils.unParseTable(outTable, plotTable, '\t')

        summaryTable.append([diagramTable, nameTable, bedDiagramTable, bedNameTable,
                             plotTable, gffLine[0], geneName, gffLine[6],
                             gffLine[3], gffLine[4]])

    summaryTableFileName = "%s%s_summary.txt" % (outFolder, title)
    utils.unParseTable(summaryTable, summaryTableFileName, '\t')
    return summaryTableFileName
def mapGFFLineToAnnot(gffLine, outFolder, nBins, geneDict, txCollection, sense='both', header=''):
    '''
    for one gff line, writes the rectangles and labels needed to draw the
    transcripts that overlap it

    two tab delimited files are written into outFolder, both prefixed with
    header (or with chrom_sense_start_stop of the gff line when header is empty):
      <prefix>_diagramTemp.txt  [x0, y0, x1, y1] rows: a thin line per transcript,
                                a medium box per tx exon, a tall box per coding exon
      <prefix>_nameTemp.txt     [name, x, y] rows, one per transcript
    both files start with an all zero / empty placeholder row
    x values are distances from the region's reference edge scaled to nBins

    the sense argument is accepted but the overlap lookup always uses 'both'
    '''
    if len(header) != 0:
        prefix = header
    else:
        prefix = '%s_%s_%s_%s' % (gffLine[0], gffLine[6], gffLine[3], gffLine[4])

    box_rows = [[0, 0, 0, 0]]
    label_rows = [['', 0, 0]]

    region = utils.Locus(gffLine[0], int(gffLine[3]), int(gffLine[4]), gffLine[6], gffLine[1])
    bins_per_bp = float(nBins) / region.len()
    # padding, in bp, put around a transcript when checking for crowding
    padding = int(region.len() / float(nBins) * 20)

    gene_ids = [locus.ID() for locus in txCollection.getOverlap(region, sense='both')]

    # distances are measured from the stop on the minus strand, else from the start
    anchor = int(gffLine[4]) if gffLine[6] == '-' else int(gffLine[3])

    def scaled(coords):
        # genomic coordinates -> plot x positions
        return [abs(x - anchor) * bins_per_bp for x in coords]

    # transcripts placed so far. each new one drops 4 units for every
    # already placed (padded) transcript it overlaps
    placed = utils.LocusCollection([], 500)
    for gene_id in gene_ids:
        gene = geneDict[gene_id]
        print(gene.commonName())

        label = gene.commonName() if len(gene.commonName()) > 1 else gene_id

        drop = 4 * len(placed.getOverlap(gene.txLocus()))
        placed.append(utils.makeSearchLocus(gene.txLocus(), padding, padding))

        # the label goes at the transcript's 5' end
        if gene.sense() == '+':
            label_x = gene.txLocus().start()
        else:
            label_x = gene.txLocus().end()
        label_x = abs(label_x - anchor) * bins_per_bp
        label_rows.append([label, label_x, -2 - drop])

        # a line across the entire txLocus
        [left, right] = scaled(gene.txLocus().coords())
        box_rows.append([left, -0.01 - drop, right, 0.01 - drop])

        # thin boxes for all txExons
        for tx_exon in gene.txExons():
            [left, right] = scaled(tx_exon.coords())
            box_rows.append([left, -0.5 - drop, right, 0.5 - drop])

        # fat boxes for the coding exons if any
        for cd_exon in gene.cdExons():
            [left, right] = scaled(cd_exon.coords())
            box_rows.append([left, -1 - drop, right, 1 - drop])

    utils.unParseTable(box_rows, outFolder + prefix + '_diagramTemp.txt', '\t')
    utils.unParseTable(label_rows, outFolder + prefix + '_nameTemp.txt', '\t')
def findCanidateTFs(genome, enhancer_gff, expressedNM, expressionDictNM, bamFile, TFlist,
                    refseqToNameDict, projectFolder, projectName, promoter):
    '''
    Assign each Super-Enhancer to the closest active TSS to its center
    Return a dictionary keyed by TF that points to a list of loci

    enhancers are read from enhancer_gff. TSSs are built for every refID in the
    genome's tf_file. each enhancer is assigned by the first rule that applies:
      1. every TSS it overlaps
      2. otherwise the TSS within 100 kb with the highest value in expressionDictNM
      3. otherwise the TSS within 1 Mb closest to the enhancer center
    an assignment is only recorded when the gene is in TFlist

    when promoter is true, a +/- 2000 bp promoter locus is added for every
    assigned gene whose promoter does not overlap one of its enhancers

    writes <projectFolder><projectName>_ENHANCER_ASSIGNMENT.txt and returns the
    gene -> loci dictionary
    expressedNM, bamFile and refseqToNameDict are accepted but not used

    NOTE(review): this function was left half way through a refactor and raised
    NameError on its first enhancer (distalGenes, overlappingGenes,
    proximalGenes, enhancer_table, tf_list, TFtoEnhancerDict,
    enhancerAssignment and closestGene were never defined). the names are
    reconciled here without changing the visible assignment rules. the
    enhancer level table and gene_to_enhancer_dict are still built but not yet
    written or returned. please confirm the intended final outputs
    '''
    #loading in the enhancer gff regions
    enhancer_collection = utils.gffToLocusCollection(enhancer_gff)
    enhancer_loci = enhancer_collection.getLoci()

    #loading in the genome and TF info
    annot_file = genome.returnFeature('annot_file')
    startDict = utils.makeStartDict(annot_file)

    tf_table = utils.parseTable(genome.returnFeature('tf_file'), '\t')
    refID_list = [line[0] for line in tf_table]  # all NM IDs for TFs

    #make a collection of all TF TSSs
    #this is a precise 1 coordinate TSS locus
    tssLoci = [utils.makeTSSLocus(refID, startDict, 0, 0) for refID in refID_list]
    tssCollection = utils.LocusCollection(tssLoci, 50)

    enhancerTable = [['ENHANCER_ID', 'CHROM', 'START', 'STOP', 'GENE_LIST']]
    gene_to_enhancer_dict = defaultdict(list)

    #the two outputs of this function
    TFtoEnhancerDict = defaultdict(list)
    enhancerAssignment = []

    # Loop through enhancers
    #all gene names stored by refID
    for enhancer in enhancer_loci:
        # If the enhancer overlaps a TSS, save it
        overlapping_loci = tssCollection.getOverlap(enhancer, 'both')
        overlapping_refIDs = [locus.ID() for locus in overlapping_loci]

        # Find all gene TSS within 100 kb
        proximal_loci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancer, 100000, 100000), 'both')
        proximal_refIDs = [locus.ID() for locus in proximal_loci]

        # If no genes are within 100 kb, find the closest active gene within 1 million bp
        closest_refID = []
        if len(overlapping_refIDs) == 0 and len(proximal_refIDs) == 0:
            distal_loci = tssCollection.getOverlap(
                utils.makeSearchLocus(enhancer, 1000000, 1000000), 'both')
            distal_refIDs = [locus.ID() for locus in distal_loci]

            #floor division keeps the python 2 integer center on python 3
            enhancerCenter = (int(enhancer.start()) + int(enhancer.end())) // 2
            distance_list = [abs(enhancerCenter - startDict[geneID]['start'][0])
                             for geneID in distal_refIDs]
            if len(distance_list) > 0:
                closest_refID = [distal_refIDs[distance_list.index(min(distance_list))]]

        #now we have all potential gene cases
        all_refIDs = overlapping_refIDs + proximal_refIDs + closest_refID

        #now we get all names and refIDs
        all_refIDs = utils.uniquify([refID for refID in all_refIDs if len(refID) > 0])
        all_names = utils.uniquify([startDict[refID]['name'] for refID in all_refIDs])

        #first do enhancer level assignment
        names_string = ','.join(all_names)
        enhancerTable.append([enhancer.ID(), enhancer.chr(), enhancer.start(),
                              enhancer.end(), names_string])

        #now do gene level assignment
        #an enhancer can be assigned to multiple genes
        for refID in all_refIDs:
            gene_to_enhancer_dict[refID].append(enhancer.ID())

        #TF level assignment. proximal genes exclude the overlapping ones
        overlappingGenes = utils.uniquify(overlapping_refIDs)
        proximalGenes = [refID for refID in utils.uniquify(proximal_refIDs)
                         if refID not in overlappingGenes]
        closestGene = closest_refID[0] if closest_refID else ''

        assigned_genes = []
        if overlappingGenes:
            # If a TSS overlaps an enhancer, assign them together
            assigned_genes = list(overlappingGenes)
        elif proximalGenes:
            # Otherwise, assign the enhancer to the most active gene in 100 kb
            #NOTE(review): raises KeyError if a TF refID has no entry in expressionDictNM
            highestGene = ''
            highestActivity = 0
            for gene in proximalGenes:
                if expressionDictNM[gene] > highestActivity:
                    highestActivity = expressionDictNM[gene]
                    highestGene = gene
            #record the most active gene. the leftover loop variable used to be recorded here
            assigned_genes = [highestGene]
        elif closestGene:
            assigned_genes = [closestGene]

        for gene in assigned_genes:
            if gene in TFlist:
                TFtoEnhancerDict[gene].append(enhancer)
                enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(),
                                           enhancer.end(), enhancer.ID()])

    # Add promoter if it's not contained in the super
    if promoter:
        for gene in list(TFtoEnhancerDict.keys()):
            #separate name so the promoter flag is not overwritten
            promoter_locus = utils.Locus(startDict[gene]['chr'],
                                         int(startDict[gene]['start'][0]) - 2000,
                                         int(startDict[gene]['start'][0]) + 2000,
                                         startDict[gene]['sense'])
            overlapBool = False
            for enhancer in TFtoEnhancerDict[gene]:
                if promoter_locus.overlaps(enhancer):
                    overlapBool = True
            if not overlapBool:
                TFtoEnhancerDict[gene].append(promoter_locus)

    seAssignmentFile = projectFolder + projectName + '_ENHANCER_ASSIGNMENT.txt'
    utils.unParseTable(enhancerAssignment, seAssignmentFile, '\t')

    return TFtoEnhancerDict
def collapseFimo(fimo_output,gene_to_enhancer_dict,candidate_tf_list,output_folder,analysis_name,motifConvertFile):

    '''
    collapses motifs from fimo
    for each source node (TF) and each target node (gene enhancer regions), collapse motif instances
    then spit out a ginormous set of beds and a single crazy collapsed bed

    fimo_output: path to the tab delimited fimo result table (first row is treated as a header)
    gene_to_enhancer_dict: not used by this function, kept so existing callers keep working
    candidate_tf_list: TF names used as source nodes, hits that map to any other TF are skipped
    output_folder: folder prefix the bed paths are built from by plain concatenation
    analysis_name: prefix of the combined '<analysis_name>_all_motifs.bed' file
    motifConvertFile: tab delimited table with the motif name in column 0 and the TF name in column 1

    returns edgeDict where edgeDict[tf][target] is the list of stitched motif loci
    found for that tf inside that target region
    '''

    #first build up the motif name conversion database
    #from motif name to gene name, a motif can go to multiple genes
    motifDatabaseDict = defaultdict(list)
    for line in utils.parseTable(motifConvertFile, '\t'):
        motifDatabaseDict[line[0]].append(line[1])

    #make the folder to store motif beds
    utils.formatFolder('%smotif_beds/' % (output_folder),True)

    #first layer are source nodes
    #next layer are target nodes which are derived from the fimo output
    edgeDict = {}
    for tf in candidate_tf_list:
        edgeDict[tf] = defaultdict(list)

    #a set gives constant time membership tests inside the loop over fimo hits
    candidate_tf_set = set(candidate_tf_list)

    fimoTable = utils.parseTable(fimo_output,'\t')
    print(fimo_output)

    #everything after the header row is a motif hit
    motif_hits = fimoTable[1:]

    #fimo sometimes puts the region in either the first or second column
    region_index = 2
    if len(motif_hits) > 0:
        if motif_hits[0][1].count('|') > 0:
            region_index = 1
        print('USING COLUMN %s OF FIMO OUTPUT FOR REGION' % (region_index))
    else:
        #a header only table used to die with an IndexError while probing the first hit
        #now the (empty) beds are still written and an empty network is returned
        print('WARNING: NO MOTIF HITS FOUND IN %s' % (fimo_output))

    #the two columns right after the region column hold the motif offsets inside the region
    start_index = region_index + 1
    stop_index = region_index + 2

    for line in motif_hits:
        source_tfs = motifDatabaseDict[line[0]]   #motifId
        for source in source_tfs:
            if source not in candidate_tf_set:
                continue

            #region looks like target|chrom|start, offsets are relative to that start
            region = line[region_index].split('|')
            target = region[0]
            region_start = int(region[2])
            target_locus = utils.Locus(region[1],region_start + int(line[start_index]), region_start + int(line[stop_index]),'.')

            #what's missing here is the enhancer id of the target locus
            #source is a candidate tf and the inner dict is a defaultdict so this cannot raise KeyError
            edgeDict[source][target].append(target_locus)

    #now we actually want to collapse this down in a meaningful way
    #overlapping motifs count as a single binding site. This way a TF with tons of motifs
    #that finds the same site over and over again doesn't get over counted
    all_bed = []
    all_bed_path = '%s%s_all_motifs.bed' % (output_folder,analysis_name)
    for tf in candidate_tf_list:
        print(tf)
        bed_header = ['track name = "%s" description="%s motifs in %s"' % (tf,tf,analysis_name)]
        all_bed.append(bed_header)
        target_bed = [bed_header]
        target_bed_path = '%smotif_beds/%s_motifs.bed' % (output_folder,tf)

        #snapshot the keys because the values are replaced while looping
        for target in list(edgeDict[tf].keys()):
            edgeCollection = utils.LocusCollection(edgeDict[tf][target],50)
            edgeCollection = edgeCollection.stitchCollection()
            edgeLoci = edgeCollection.getLoci()
            edgeDict[tf][target] = edgeLoci
            for locus in edgeLoci:
                bed_line = [locus.chr(),locus.start(),locus.end(),target,'','+']
                target_bed.append(bed_line)
                all_bed.append(bed_line)

        utils.unParseTable(target_bed,target_bed_path,'\t')

    #now the loci are all stitched up
    utils.unParseTable(all_bed,all_bed_path,'\t')
    return edgeDict
def main():
    '''
    main run call

    parses the command line, maps the enhancers of the input table to genes
    and writes the resulting tables next to the input file or into --out

    with --rankby the mapping goes through mapEnhancerToGeneTop and three tables
    are written (enhancer to gene, enhancer to top gene, gene to enhancer)
    without --rankby the traditional mapEnhancerToGene mapping is used and two
    tables are written (enhancer to gene, gene to enhancer)
    '''
    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_ENHANCER_FILE]"
    parser = OptionParser(usage=usage)

    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a ROSE ranked enhancer or super-enhancer file")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter the bam used to rank enhancers")
    parser.add_option("-c", "--control", dest="control", nargs=1, default='',
                      help="Enter a background bam for background correction")
    parser.add_option("-l", "--list", dest="geneList", nargs=1, default=None,
                      help="Enter a gene list to filter through")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder. Default will be same folder as input file")
    parser.add_option(
        "-w", "--window", dest="window", nargs=1, default=50000,
        help="Enter a search distance for genes. Default is 50,000bp")
    parser.add_option(
        "-f", "--format", dest="formatTable", action="store_true", default=False,
        help="If flagged, maintains original formatting of input table")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    # only -i and -g are required. --rankby is optional: requiring it here
    # made the traditional mapping branch below unreachable
    if not options.input or not options.genome:
        parser.print_help()
        exit()
    print(options)

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    # annotation files are looked up relative to the current working directory
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }
    annotFile = genomeDict[genome.upper()]

    # GETTING THE INPUT
    enhancerFile = options.input
    # the command line hands the window over as a string
    window = int(options.window)

    # making the out folder if it doesn't exist
    if options.out:
        outFolder = utils.formatFolder(options.out, True)
    else:
        # default to the folder of the input file. A bare file name has no
        # folder part and used to resolve to '/', use the current folder instead
        inputFolder = os.path.dirname(enhancerFile)
        if not inputFolder:
            outFolder = './'
        elif inputFolder.endswith('/'):
            outFolder = inputFolder
        else:
            outFolder = inputFolder + '/'

    # GETTING BAM INFO
    rankByBamFile = options.rankby
    controlBamFile = options.control

    # CHECK FORMATTING FLAG
    noFormatTable = bool(options.formatTable)

    # GETTING THE TRANSCRIBED LIST
    if options.geneList:
        transcribedFile = options.geneList
    else:
        transcribedFile = ''

    # all output names share this prefix. A non default window is appended in
    # KB, floor division keeps the value an integer on python 2 and python 3
    enhancerFileName = enhancerFile.split('/')[-1].split('.')[0]
    outPrefix = '%s%s' % (outFolder, enhancerFileName)
    if window != 50000:
        windowSuffix = '_%sKB' % (window // 1000)
    else:
        windowSuffix = ''

    if options.rankby:
        enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable = mapEnhancerToGeneTop(
            rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile, True, window, noFormatTable)

        # writing the enhancer table
        out1 = '%s_ENHANCER_TO_GENE%s.txt' % (outPrefix, windowSuffix)
        if windowSuffix:
            print("writing output to %s" % (out1))
        utils.unParseTable(enhancerToGeneTable, out1, '\t')

        # writing enhancer top gene table
        out2 = '%s_ENHANCER_TO_TOP_GENE%s.txt' % (outPrefix, windowSuffix)
        utils.unParseTable(enhancerToTopGeneTable, out2, '\t')

        # writing the gene table
        out3 = '%s_GENE_TO_ENHANCER%s.txt' % (outPrefix, windowSuffix)
        utils.unParseTable(geneToEnhancerTable, out3, '\t')

    else:
        # do traditional mapping
        enhancerToGeneTable, geneToEnhancerTable = mapEnhancerToGene(
            annotFile, enhancerFile, transcribedFile, True, window, noFormatTable)

        # writing the enhancer table
        out1 = '%s_ENHANCER_TO_GENE%s.txt' % (outPrefix, windowSuffix)
        utils.unParseTable(enhancerToGeneTable, out1, '\t')

        # writing the gene table
        out2 = '%s_GENE_TO_ENHANCER%s.txt' % (outPrefix, windowSuffix)
        utils.unParseTable(geneToEnhancerTable, out2, '\t')
def mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, output, refName):
    '''
    builds one table row per stitched locus (chrY excluded), ordered by
    decreasing locus length, then adds one signal column per bam and writes
    the table to output as tab delimited text

    the signal of a row is the sum over all mapped regions overlapping the
    locus of (column 2 of the mapped matrix) * (region length)
    '''
    print('FORMATTING TABLE')
    loci = stitchedCollection.getLoci()

    # strip out any that are in chrY
    # iterate over a snapshot because the list is edited in place
    for candidate in tuple(loci):
        if candidate.chr() == 'chrY':
            loci.remove(candidate)

    lengths = [keptLocus.len() for keptLocus in loci]
    lenOrder = utils.order(lengths, decreasing=True)

    locusTable = [['REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE']]
    for ticker, position in enumerate(lenOrder, 1):
        # progress report every 1000 loci
        if ticker % 1000 == 0:
            print(ticker)
        locus = loci[position]

        # First get the size of the enriched regions within the stitched locus
        refEnrichSize = sum(refLocus.len() for refLocus in referenceCollection.getOverlap(locus, 'both'))

        # the leading field of the ID is used as the stitch count when it is numeric
        idPrefix = locus.ID().split('_')[0]
        try:
            stitchCount = int(idPrefix)
        except ValueError:
            stitchCount = 1

        coords = [int(coord) for coord in locus.coords()]
        row = [locus.ID(), locus.chr(), min(coords), max(coords), stitchCount, refEnrichSize]
        locusTable.append(row)

    print('GETTING MAPPED DATA')
    print("USING A BAMFILE LIST:")
    print(bamFileList)

    for bamFile in bamFileList:
        bamFileName = bamFile.split('/')[-1]
        print('GETTING MAPPING DATA FOR %s' % bamFile)

        # assumes standard convention for naming enriched region gffs
        # opening up the mapped GFF
        matrixPath = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, refName, bamFileName)
        print('OPENING %s' % (matrixPath))
        mappedGFF = utils.parseTable(matrixPath, '\t')

        signalDict = defaultdict(float)
        print('MAKING SIGNAL DICT FOR %s' % (bamFile))

        mappedLoci = []
        for gffLine in mappedGFF[1:]:
            regionID = gffLine[0]
            # column 1 looks like chrom(...):start-end
            chrom = gffLine[1].split('(')[0]
            bounds = gffLine[1].split(':')[-1].split('-')
            start = int(bounds[0])
            end = int(bounds[1])
            mappedLoci.append(utils.Locus(chrom, start, end, '.', regionID))
            try:
                signalDict[regionID] = float(gffLine[2]) * (abs(end - start))
            except ValueError:
                # the region stays in the collection and contributes 0.0 signal
                print('WARNING NO SIGNAL FOR LINE:')
                print(gffLine)

        mappedCollection = utils.LocusCollection(mappedLoci, 500)

        # one new column per bam, named after the bam file
        locusTable[0].append(bamFileName)
        for tableRow in locusTable[1:]:
            rowLocus = utils.Locus(tableRow[1], tableRow[2], tableRow[3], '.')
            signal = 0.0
            for region in mappedCollection.getOverlap(rowLocus, sense='both'):
                signal += signalDict[region.ID()]
            tableRow.append(signal)

    utils.unParseTable(locusTable, output, '\t')
def optimizeStitching(locusCollection, name, outFolder, stepSize=500):
    '''
    takes a locus collection and starts writing out stitching stats at step sized intervals

    for every stitch window from 0 to 15000 bp (in steps of stepSize) the
    collection is stitched and region/constituent statistics are collected.
    the table is written to '<outFolder><name>_stitch_params.tmp' and handed to
    ROSE2_stitchOpt.R, the third line of the R output is returned as an int

    raises ValueError if stepSize is not positive
    exits the program if the R output does not contain a usable parameter
    '''
    if stepSize <= 0:
        # a non positive step would never reach maxStitch and loop forever
        raise ValueError('stepSize must be a positive number, got %s' % (stepSize))

    maxStitch = 15000  # set a hard wired match stitching parameter

    stitchTable = [['STEP', 'NUM_REGIONS', 'TOTAL_CONSTIT', 'TOTAL_REGION', 'MEAN_CONSTIT', 'MEDIAN_CONSTIT',
                    'MEAN_REGION', 'MEDIAN_REGION', 'MEAN_STITCH_FRACTION', 'MEDIAN_STITCH_FRACTION']]

    # first consolidate the collection
    locusCollection = locusCollection.stitchCollection(stitchWindow=0)
    total_constit = sum([locus.len() for locus in locusCollection.getLoci()])

    step = 0
    while step <= maxStitch:
        print("Getting stitch stats for %s (bp)" % (step))
        stitchCollection = locusCollection.stitchCollection(stitchWindow=step)
        num_regions = len(stitchCollection)
        stitchLoci = stitchCollection.getLoci()
        regionLengths = [locus.len() for locus in stitchLoci]
        total_region = sum(regionLengths)

        # total length of the original constituents inside each stitched region
        constitLengths = []
        for stitchLocus in stitchLoci:
            constitLoci = locusCollection.getOverlap(stitchLocus)
            constitLengths.append(sum([constitLocus.len() for constitLocus in constitLoci]))

        meanConstit = round(numpy.mean(constitLengths), 2)
        medianConstit = round(numpy.median(constitLengths), 2)

        meanRegion = round(numpy.mean(regionLengths), 2)
        medianRegion = round(numpy.median(regionLengths), 2)

        # fraction of each stitched region that is covered by constituents
        stitchFractions = [float(constitLength) / float(regionLength)
                           for constitLength, regionLength in zip(constitLengths, regionLengths)]
        meanStitchFraction = round(numpy.mean(stitchFractions), 2)
        medianStitchFraction = round(numpy.median(stitchFractions), 2)

        newLine = [step, num_regions, total_constit, total_region, meanConstit, medianConstit,
                   meanRegion, medianRegion, meanStitchFraction, medianStitchFraction]
        stitchTable.append(newLine)

        step += stepSize

    # write the stitch table to disk
    stitchParamFile = '%s%s_stitch_params.tmp' % (outFolder, name)
    utils.unParseTable(stitchTable, stitchParamFile, '\t')

    # call the rscript
    rCmd = 'Rscript ./ROSE2_stitchOpt.R %s %s %s' % (stitchParamFile, outFolder, name)
    print(rCmd)

    # get back the stitch parameter
    # universal_newlines makes stdout text on python 3 as well, without it
    # communicate() returns bytes and the split on '\n' below fails
    rOutput = subprocess.Popen(rCmd, stdout=subprocess.PIPE, shell=True, universal_newlines=True)
    rOutputTest = rOutput.communicate()
    print(rOutputTest)

    # the parameter is expected on the third line of the R output
    rLines = rOutputTest[0].split('\n')
    try:
        stitchParam = int(rLines[2])
    except (IndexError, ValueError):
        # IndexError covers an R run that printed fewer than three lines
        print("INVALID STITCHING PARAMETER. STITCHING OPTIMIZATION FAILED")
        sys.exit(1)

    # delete? the table
    # os.system('rm -f %s' % (stitchParamFile))
    return stitchParam
def formatNetworkOutput(graph, output_folder, analysis_name, candidate_tf_list):
    '''
    takes the networkx graph
    returns all figures, tables, etc

    files written, all named <output_folder><analysis_name> plus a suffix:
    .ntx (pickled dict of lists), _NODELIST.txt, _ADJ_LIST.txt, _EDGE_LIST.txt,
    _DEGREE_TABLE.txt, _SELF_LOOPS.txt, _CLIQUES_ALL.txt,
    _CLIQUE_SCORES_DEGREE.txt, _ENRICHED_CLIQUE_FACTORS.txt, _CLIQUE_SCORES_VSA.txt

    candidate_tf_list is not used here, it is kept so existing callers keep working
    exits the program for networkx versions other than 1.x and 2.x
    '''
    # the node and adjacency accessors differ between networkx 1.x and 2.x
    nxMajorVersion = nx.__version__[0]
    if nxMajorVersion not in ('1', '2'):
        print('ERROR: UNSUPPORTED VERSION OF NETWORKX MODULE')
        sys.exit()

    # output the network as a .ntx dictionary of lists
    # binary mode is what pickle expects and the with block closes the file
    networkFilename = output_folder + analysis_name + '.ntx'
    networkDictOfLists = nx.to_dict_of_lists(graph)
    with open(networkFilename, 'wb') as networkFile:
        pickle.dump(networkDictOfLists, networkFile)

    # output the adjacency list and nodelist
    nodeFile = output_folder + analysis_name + '_NODELIST.txt'
    if nxMajorVersion == '1':
        nodeList = [[n] for n in graph.nodes_iter()]
    else:
        nodeList = [[n] for n in graph.nodes()]
    utils.unParseTable(nodeList, nodeFile, '\t')

    adjFile = output_folder + analysis_name + '_ADJ_LIST.txt'
    if nxMajorVersion == '1':
        adjList = graph.adjacency_list()
    else:
        adjList = [n[1].keys() for n in graph.adjacency()]
    utils.unParseTable(adjList, adjFile, '\t')

    # one row per edge, nodeList and adjList are walked in parallel
    edgesTable = [['From', 'To']]
    for i, gene in enumerate(nodeList):
        for j in adjList[i]:
            edgesTable.append([gene[0], j])

    edgeFile = output_folder + analysis_name + '_EDGE_LIST.txt'
    utils.unParseTable(edgesTable, edgeFile, '\t')

    # Make the degree table
    degTable = [['Tf', 'In_Degree', 'Out_Degree', 'Total_Connections']]
    degFile = output_folder + analysis_name + '_DEGREE_TABLE.txt'

    # look the degree containers up once instead of once per node
    inDegreeDict = graph.in_degree()
    outDegreeDict = graph.out_degree()
    totalDegreeDict = graph.degree()

    #shouldn't we output the table for the TFs that have motifs only ? for canidateMotifs in graph.nodes()....
    for node in graph.nodes():
        degTable.append([node, inDegreeDict[node], outDegreeDict[node], totalDegreeDict[node]])

    utils.unParseTable(degTable, degFile, '\t')

    print('DEFINING THE CORE REGULATORY CIRCUIT')

    autoreg = graph.selfloop_edges()
    selfLoops = [x for x, y in autoreg]
    selfLoopFile = output_folder + analysis_name + '_SELF_LOOPS.txt'
    utils.unParseTable(selfLoops, selfLoopFile, '')

    #recover bidirectional edges
    pairs = []
    for n in selfLoops:
        for m in selfLoops:
            if n != m and graph.has_edge(n, m) and graph.has_edge(m, n):
                pairs.append([n, m])

    unDirGraph = nx.from_edgelist(pairs)
    cliqueList = list(find_cliques_recursive(unDirGraph))

    utils.unParseTable(cliqueList, output_folder + analysis_name + '_CLIQUES_ALL.txt', '\t')

    # rank cliques by the mean out degree of their members
    # only cliques with more than two members and a positive score are kept
    cliqueRanking = []
    for c in cliqueList:
        score = 0
        for gene in c:
            score += outDegreeDict[gene]
        # NOTE: under python 2 this is an integer (floor) division of an int sum
        score = score/len(c)
        if score > 0 and len(c) > 2:
            cliqueRanking.append((c, score))

    sortCliqueRanking = sorted(cliqueRanking, reverse=True, key=lambda x:x[1])
    cliqueFile = output_folder + analysis_name + '_CLIQUE_SCORES_DEGREE.txt'
    utils.unParseTable(sortCliqueRanking, cliqueFile, '\t')

    # count in how many ranked cliques every self looping factor occurs
    factorEnrichmentDict = {}
    for factor in selfLoops:
        factorEnrichmentDict[factor] = 0
    for pair in cliqueRanking:
        for factor in pair[0]:
            factorEnrichmentDict[factor] += 1

    # with no ranked clique the fraction is undefined, report 0.0 instead of
    # dying with a ZeroDivisionError
    numRankedCliques = float(len(cliqueRanking))
    factorRankingTable = []
    for factor in selfLoops:
        if numRankedCliques > 0:
            enrichment = factorEnrichmentDict[factor]/numRankedCliques
        else:
            enrichment = 0.0
        factorRankingTable.append([factor, enrichment])

    factorRankingFile = output_folder + analysis_name + '_ENRICHED_CLIQUE_FACTORS.txt'
    utils.unParseTable(factorRankingTable, factorRankingFile, '\t')

    # Begin VSA scoring

    # Initiate the graph
    G = nx.Graph()

    #fill up the graph with the self loops as nodes and the bidirectional edges
    G.add_nodes_from(selfLoops)
    G.add_edges_from(pairs)

    #find all the cliques
    cliqueList = list(find_cliques_recursive(G))

    print('Number of cliques:')
    print(len(cliqueList))

    #count the occurences of the TFs accross the loops
    dicoTFinloopsCounts = {}
    for clique in cliqueList:
        for TF in clique:
            dicoTFinloopsCounts[TF] = dicoTFinloopsCounts.get(TF, 0) + 1

    #calculate a score by loop
    #the score is the mean over the members of the number of cliques a member is part of
    cliqueRanking = []
    for clique in cliqueList:
        cliqueScore = 0
        for TF in clique:
            cliqueScore = (float(cliqueScore) + (float(dicoTFinloopsCounts[TF])))
        cliqueRanking.append((clique, cliqueScore/len(clique), len(clique)))

    sortCliqueRanking = sorted(cliqueRanking, reverse=True, key=lambda x:x[1])

    cliqueFile = output_folder + analysis_name + '_CLIQUE_SCORES_VSA.txt'
    utils.unParseTable(sortCliqueRanking, cliqueFile, '\t')

    print('Top CRC:')
    if sortCliqueRanking:
        print(sortCliqueRanking[0])
    else:
        # no self loops means no cliques at all, indexing [0] would raise here
        print('NO CLIQUES FOUND')
def main():
    '''
    main run call for the CRC analysis

    parses the command line and runs the five stages in order:
    I. argument parsing, II. gene/enhancer mapping, III. valley or subpeak
    definition, IV. motif finding, V. network building and output

    either -b (bams for valley finding) or -s (subpeak bed) has to be given,
    otherwise an error is printed and the program exits with status 1
    the function always ends by calling sys.exit()
    '''
    import argparse
    parser = argparse.ArgumentParser(usage="usage: prog [options] -e [ENHANCER_FILE] -b [BAM_FILE] -g [GENOME] -o [OUTPUTFOLDER] -n [NAME]" )

    #required flags
    parser.add_argument("-e","--enhancer_file", dest="enhancers", default=None,type=str,
                        help = "Provide a ROSE generated enhancer table (_AllEnhancers.table.txt)",required=True)
    parser.add_argument("-g","--genome",dest="genome", default = None,type=str,
                        help = "Provide the build of the genome to be used for the analysis. Currently supports HG19, HG18 and MM9",required=True)
    parser.add_argument("-o","--output",dest="output", default = None,type=str,
                        help = "Enter an output folder",required=True)
    parser.add_argument("-n","--name",dest="name", default = None,type=str,
                        help = "Provide a name for the job",required=True)

    #you either need bams for valleys or subpeaks
    parser.add_argument("-b","--bam",dest="bam", default = None,type=str,
                        help = "Enter a comma separated list of bams of valley finding",required=False)
    parser.add_argument("-s","--subpeaks", dest="subpeaks",default=None,type=str,
                        help = "Enter a BED file of regions to search for motifs",required=False)

    #additional options
    parser.add_argument("-a","--activity",dest="activity", default = None,type=str,
                        help = "A table with active gene names in the first column",required=False)
    parser.add_argument("-l","--extension-length", dest="extension", default=100,type=int,
                        help = "Enter the length to extend subpeak regions for motif finding. default is 100",required=False)
    parser.add_argument("-B","--background", dest="background", default=None,type=str,
                        help = "Provide a background BAM file",required=False)
    parser.add_argument("-N", "--number", dest="number", default=1,type=int,
                        help = "Enter the number of non overlapping motifs in a region required to assign a binding event. Default=1",required=False)
    #I have modified the destination of -N option so that it is different from the destination of -E option
    parser.add_argument("--motifs", dest="motifs", default=False,type=str,
                        help = "Enter additional PWM file for the analysis",required=False)
    parser.add_argument("-t","--tfs", dest="tfs",default=None,type=str,
                        help = "Enter additional TFs (comma separated) to be used in the bindinf analysis",required=False)
    parser.add_argument("--config", dest="config",default='',type=str,
                        help = "Enter genome configuration file to overwrite default paths",required=False)

    args = parser.parse_args()

    #=====================================================================================
    #===============================I. PARSING ARGUMENTS==================================
    #=====================================================================================

    ###
    # Define all global file names
    ###
    print(args)
    genome = loadGenome(args.genome,args.config)

    motifDatabaseFile = genome.returnFeature('motif_database')
    motifConvertFile = genome.returnFeature('motif_convert')

    # User input files
    enhancer_file = args.enhancers

    #set the subpeak file, an empty value counts as not given
    subpeakFile = args.subpeaks or None

    #will need to fix bams down the line to take in multiple bams
    if args.bam:
        bamFileList = [bam_path for bam_path in args.bam.split(',') if len(bam_path) >0]
        print(bamFileList)
    else:
        bamFileList = []

    #validate the normalized inputs so that values such as -b "," or -s "" are caught too
    #this is a usage error, so leave with a non zero status
    if len(bamFileList) == 0 and subpeakFile is None:
        print('ERROR: Must provide either bams for valley finding or subpeaks as a .bed')
        sys.exit(1)

    #NOTE: -B/--background, -N/--number, --motifs and -t/--tfs are parsed but not used in this function

    #output folder and analysis name
    print(args.output)
    output_folder = utils.formatFolder(args.output,True)
    analysis_name = args.name

    #optional arguments
    #activity path
    activity_path = args.activity

    #motif extension
    constExtension = args.extension

    print('\n\n#======================================\n#===========I. DATA SUMMARY============\n#======================================\n')

    print('Analyzing TF connectivity for %s' % (analysis_name))
    print('Writing output to %s' % (output_folder))
    if subpeakFile:
        print('Using %s to define subpeaks for motif finding' % (subpeakFile))
    else:
        print('Identifying valleys from .bam files')
    print('Using %s to define active genes' % (activity_path))

    #=====================================================================================
    #=======================II. IDENTIFYING CANDIDATE TFS AND NODES=======================
    #=====================================================================================

    print('\n\n#======================================\n#===II. MAPPING GENES AND ENHANCERS====\n#======================================\n')

    geneTable,geneTFTable,enhancerTable,enhancerTFTable,geneSummaryTable,candidate_tf_list,gene_to_enhancer_dict= geneToEnhancerDict(genome, enhancer_file, activity_path)

    #write these guys to disk
    gene_out = '%s%s_GENE_TABLE.txt' % (output_folder,analysis_name)
    gene_tf_out = '%s%s_GENE_TF_TABLE.txt' % (output_folder,analysis_name)

    enhancer_out = '%s%s_ENHANCER_TABLE.txt' % (output_folder,analysis_name)
    enhancer_tf_out = '%s%s_ENHANCER_TF_TABLE.txt' % (output_folder,analysis_name)

    summary_out= '%s%s_GENE_SUMMARY.txt' % (output_folder,analysis_name)

    utils.unParseTable(enhancerTable,enhancer_out,'\t')
    utils.unParseTable(enhancerTFTable,enhancer_tf_out,'\t')

    utils.unParseTable(geneTable,gene_out,'\t')
    utils.unParseTable(geneTFTable,gene_tf_out,'\t')

    utils.unParseTable(geneSummaryTable,summary_out,'\t')

    print('Identified %s genes w/ proximal cis-regulatory elements' % (len(gene_to_enhancer_dict)))

    print('Identified %s candidate TFs' % (len(candidate_tf_list)))
    print(candidate_tf_list)

    #=====================================================================================
    #==========================III. FINDING VALLEYS/SUBPEAKS==============================
    #=====================================================================================

    print('\n\n#======================================\n#=====III. FINDING VALLEYS/SUBPEAKS====\n#======================================\n')

    #so here we would need to find valleys everywhere
    if subpeakFile is None:
        print('finding valleys')
        #note: the tf_bed_path is for networks, all is for out degree finding
        all_bed_path = findValleys(gene_to_enhancer_dict, bamFileList, analysis_name, output_folder, cutoff = 0.2)
    else:
        print('Using subpeaks from %s' % (subpeakFile))
        all_bed_path = filterSubpeaks(subpeakFile,gene_to_enhancer_dict,analysis_name,output_folder)

    #first make the subpeak bed and subpeak fasta for the tfs
    all_sub_bed,all_fasta = generateSubpeakFASTA(gene_to_enhancer_dict, all_bed_path, genome, analysis_name,output_folder, constExtension)

    if subpeakFile is None:
        #this is the case where we did valleys
        #only reason you would need to output the sub bed
        all_sub_out = '%s%s_all_subpeak.bed' % (output_folder,analysis_name)
        utils.unParseTable(all_sub_bed,all_sub_out,'\t')

    #writing the all subpeak fasta out to disk
    all_fasta_out = '%s%s_all_subpeak.fasta' % (output_folder,analysis_name)
    utils.unParseTable(all_fasta,all_fasta_out,'')

    #=====================================================================================
    #=================================IV. FINDING MOTIFS==================================
    #=====================================================================================

    print('\n\n#======================================\n#======IV. RUNNING MOTIF FINDING=======\n#======================================\n')

    #first make background
    bg_path = makeMotifBackground(all_fasta_out,output_folder,analysis_name)

    #find motifs for all regions
    fimo_out = findMotifs(all_fasta_out,bg_path,candidate_tf_list, output_folder, analysis_name, motifConvertFile, motifDatabaseFile)

    edgeDict = collapseFimo(fimo_out,gene_to_enhancer_dict,candidate_tf_list,output_folder,analysis_name,motifConvertFile)

    #=====================================================================================
    #============================V. RUNNING NETWORK ANALYSIS==============================
    #=====================================================================================

    print('\n\n#======================================\n#========V. BUILDING NETWORK===========\n#======================================\n')

    print('building graph and edge table')
    graph = buildGraph(edgeDict,gene_to_enhancer_dict,output_folder, analysis_name,cutoff=1)

    formatNetworkOutput(graph, output_folder, analysis_name, candidate_tf_list)

    print('FINISHED RUNNING CRC FOR %s' % (analysis_name))

    sys.exit()
import utils
from sys import argv

# Rewrites the bed file given as first command line argument so that on every
# row column 1 holds the smaller and column 2 the larger of the two coordinates.
# Row order and all remaining columns are left untouched.
filename = argv[1]
# the last three characters of the input name are replaced by 'sorted.bed'
outname = filename[:-3] + 'sorted.bed'

out = []
for row in utils.parseTable(filename, '\t'):
    low, high = sorted((int(row[1]), int(row[2])))
    out.append([row[0], low, high] + row[3:])

utils.unParseTable(out, outname, '\t')
def finishRankOutput(dataFile, rankOutput, genome, mergeFolder, mergeName, name1, name2, cutOff=1.5, window=100000):
    """
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond
    a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained loss

    a row is gained (status name2) when column 6 is above cutOff, lost (status
    name1) when it is below -cutOff and CONSERVED otherwise
    everything is written below '<mergeFolder>output/', afterwards the ROSE
    plots are copied there and bamPlot_turbo.py is run for the gained and lost
    regions

    cutOff and window may be given as strings, they are converted here
    returns None
    """
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    # making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    # str methods replace the python 2 only string.upper / string.join helpers
    genome = genome.upper()
    # floor division keeps the KB label an integer on python 2 and python 3
    windowKB = window // 1000

    # make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder + "output/", True)

    # bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput, "\t")

    # make a new formatted table
    header = rankEnhancerTable[0]
    header[-4] = "DELTA RANK"
    header[-3] = "IS_SUPER"
    formattedRankTable = [header]

    # the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    # the beds
    gainedTrackHeader = (
        'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0'
        % (genome, name2, genome, name2, name1)
    )
    gainedBed = [[gainedTrackHeader]]
    conservedTrackHeader = (
        'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0'
        % (genome, name1, name2, genome, name1, name2)
    )
    conservedBed = [[conservedTrackHeader]]

    lostTrackHeader = (
        'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0'
        % (genome, name1, genome, name1, name2)
    )
    lostBed = [[lostTrackHeader]]

    # the genes
    geneTable = [
        [
            "GENE",
            "ENHANCER_ID",
            "ENHANCER_CHROM",
            "ENHANCER_START",
            "ENHANCER_STOP",
            header[6],
            header[7],
            header[8],
            "STATUS",
        ]
    ]

    for line in rankEnhancerTable[1:]:
        # fixing the enhancer ID
        line[0] = line[0].replace("_lociStitched", "")
        formattedRankTable.append(line)

        # getting the genes from the three gene columns
        geneList = []
        for geneColumn in (9, 10, 11):
            geneList += line[geneColumn].split(",")
        geneList = [x for x in geneList if len(x) > 0]
        geneList = utils.uniquify(geneList)
        geneString = ",".join(geneList)

        bedLine = [line[1], line[2], line[3], line[0], line[-4]]

        # pick the output containers for this row
        delta = float(line[6])
        if delta > cutOff:
            # for gained
            geneStatus = name2
            statusGFF, statusWindowGFF, statusBed = gainedGFF, gainedWindowGFF, gainedBed
        elif delta < (-1 * cutOff):
            # for lost
            geneStatus = name1
            statusGFF, statusWindowGFF, statusBed = lostGFF, lostWindowGFF, lostBed
        else:
            # for conserved, these only go to the bed
            geneStatus = "CONSERVED"
            statusGFF, statusWindowGFF, statusBed = None, None, conservedBed

        if statusGFF is not None:
            gffLine = [line[1], line[0], "", line[2], line[3], "", ".", "", geneString]
            gffWindowLine = [
                line[1],
                line[0],
                "",
                int(line[2]) - window,
                int(line[3]) + window,
                "",
                ".",
                "",
                geneString,
            ]
            statusGFF.append(gffLine)
            statusWindowGFF.append(gffWindowLine)
        statusBed.append(bedLine)

        # now fill in the gene Table
        for gene in geneList:
            geneTableLine = [gene, line[0], line[1], line[2], line[3], line[6], line[7], line[8], geneStatus]
            geneTable.append(geneTableLine)

    # concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    # start writing the output
    # there's the two gffs, the bed,the formatted table, the gene table

    # formatted table
    formattedFilename = "%s%s_%s_MERGED_SUPERS_RANK_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(formattedRankTable, formattedFilename, "\t")

    # gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder + "gff/", True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" % (gffFolder, genome, mergeName, name2.upper())
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % (
        gffFolder,
        genome,
        mergeName,
        name2.upper(),
        windowKB,
        windowKB,
    )

    gffFilename_lost = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" % (gffFolder, genome, mergeName, name1.upper())
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % (
        gffFolder,
        genome,
        mergeName,
        name1.upper(),
        windowKB,
        windowKB,
    )

    utils.unParseTable(gainedGFF, gffFilename_gained, "\t")
    utils.unParseTable(gainedWindowGFF, gffFilenameWindow_gained, "\t")

    utils.unParseTable(lostGFF, gffFilename_lost, "\t")
    utils.unParseTable(lostWindowGFF, gffFilenameWindow_lost, "\t")

    # bed
    bedFilename = "%s%s_%s_MERGED_SUPERS.bed" % (outputFolder, genome, mergeName)
    utils.unParseTable(fullBed, bedFilename, "\t")

    # geneTable
    geneFilename = "%s%s_%s_MERGED_SUPERS_GENE_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(geneTable, geneFilename, "\t")

    # finally, move all of the plots to the output folder
    # the wildcards are expanded by the shell that os.system starts
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_SUPERS_DELTA.pdf" % (mergeFolder, name1, outputFolder, genome, mergeName)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_SUPERS_RANK_PLOT.png" % (
        mergeFolder,
        name1,
        outputFolder,
        genome,
        mergeName,
    )
    os.system(cmd)

    # now execute the bamPlot_turbo.py commands
    bam1 = dataDict[name1]["bam"]
    bam2 = dataDict[name2]["bam"]
    bamString = "%s,%s" % (bam1, bam2)
    nameString = "%s,%s" % (name1, name2)
    colorString = "0,0,0:100,100,100"

    def runBamPlot(gffFilename, plotTitle):
        # runs bamPlot_turbo.py on one gff, all four plots share every other argument
        plotCmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome,
            bamString,
            gffFilename,
            outputFolder,
            nameString,
            colorString,
            plotTitle,
        )
        os.system(plotCmd)

    # change dir
    # NOTE: the pipeline location is hard wired, bamPlot_turbo.py is run from there
    os.chdir("/ark/home/cl512/pipeline/")

    if len(gainedGFF) > 0:
        # gained command
        runBamPlot(gffFilename_gained, "%s_ONLY_SE" % (name2))
        # gained window command
        runBamPlot(gffFilenameWindow_gained, "%s_ONLY_SE_%sKB_WINDOW" % (name2, windowKB))

    if len(lostGFF) > 0:
        # lost command
        runBamPlot(gffFilename_lost, "%s_ONLY_SE" % (name1))
        # lost window command
        runBamPlot(gffFilenameWindow_lost, "%s_ONLY_SE_%sKB_WINDOW" % (name1, windowKB))

    return
def finishRankOutput(dataFile,statOutput,diffOutput,genome,mergeFolder,mergeName,name1,name2,namesList1,namesList2,cutOff=1.0,window = 100000,superOnly=True,plotBam=True):

    '''
    cleans up the rank output tables and writes the final outputs of the merged analysis

    reads the stat table (statOutput) and the diff table (diffOutput), strips the
    '_lociStitched' suffix from the region ids and writes under mergeFolder + 'output/':
        - the formatted rank table and the formatted rank diff table
        - gffs of the gained (name2 only) and lost (name1 only) regions,
          both as is and extended by +/- window
        - a bed with a gained, a conserved, and a lost track
        - a gene table with one row per gene/region pair and its status
    then copies the plots found in the <namesList1[0]>_ROSE folder into the output
    folder and, if plotBam is set, runs bamPlot_turbo.py on the gained/lost gffs

    a region is called gained when column -8 of its row is > cutOff and column -4 == 1,
    lost when column -8 is < -cutOff and column -4 == 1, otherwise it is 'UNCHANGED'
    (presumably column -8 is the fold change and column -4 the significance call
    coming out of the R script -- TODO confirm against the R output)

    dataFile: data table loaded with pipeline_dfci.loadDataTable; provides the
              'bam' and 'color' entries for every name in namesList1/namesList2
    name1,name2: labels of the two groups, namesList1,namesList2: their datasets
    cutOff,window: coerced to float and int respectively
    superOnly: only affects labels ('SUPERS' vs 'ENHANCERS') in file names and tracks

    returns None. all results are files on disk
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    #str.upper() behaves like string.upper() but also exists in python 3
    genome = genome.upper()
    #floor division keeps the KB label an integer on python 2 and python 3
    windowKB = window // 1000

    #make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder+'output/',True)

    #bring in the old rank table
    rankEnhancerTable = utils.parseTable(statOutput,'\t')

    #make a new formatted table
    header = rankEnhancerTable[0]
    headerLength = len(header)
    formattedRankTable = [header]

    #the gffs
    gainedGFF = []
    lostGFF = []
    gainedWindowGFF = []
    lostWindowGFF = []

    #labels that differ between a supers only and an all enhancers run
    if superOnly:
        enhancerType = 'SUPERS'
        shortLabel = 'SEs'
        longLabel = 'super enhancers'
    else:
        enhancerType = 'ENHANCERS'
        shortLabel = 'enhancers'
        longLabel = 'enhancers'

    #the beds
    #each bed starts with its track header line
    onlyTrackTemplate = 'track name="%s %s only %s" description="%s %s that are found only in %s vs %s" itemRGB=On color=%s'
    bothTrackTemplate = 'track name="%s %s and %s %s" description="%s %s that are found in both %s vs %s" itemRGB=On color=0,0,0'

    gainedTrackHeader = onlyTrackTemplate % (genome,name2,shortLabel,genome,longLabel,name2,name1,'255,0,0')
    gainedBed = [[gainedTrackHeader]]
    conservedTrackHeader = bothTrackTemplate % (genome,name1,name2,shortLabel,genome,longLabel,name1,name2)
    conservedBed = [[conservedTrackHeader]]
    lostTrackHeader = onlyTrackTemplate % (genome,name1,shortLabel,genome,longLabel,name1,name2,'0,255,0')
    lostBed = [[lostTrackHeader]]

    #the genes
    geneTable = [['GENE','ENHANCER_ID','ENHANCER_CHROM','ENHANCER_START','ENHANCER_STOP',header[6],header[7],header[8],'STATUS']]

    for line in rankEnhancerTable[1:]:
        #fix line lengths
        #short rows are padded with empty strings so negative indexing lines up with the header
        if len(line) != headerLength:
            line += ['']*(headerLength-len(line))

        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankTable.append(line)

        #getting the genes from the last three columns
        geneList = []
        for geneField in (line[-1],line[-2],line[-3]):
            geneList += geneField.split(',')
        geneList = [x for x in geneList if len(x) > 0]
        geneList = utils.uniquify(geneList)
        geneString = ','.join(geneList)

        bedLine = [line[1],line[2],line[3],line[0],line[-4]]

        #this applies both the statistical test chosen (default fdr <= 0.05) and the cutoff
        #the cutoff is hard wired, but we can add an option to change the test
        #stats are done in the R script. FDR norm can kinda suck if no genes are considered diff
        #column -4 is only parsed once the cutoff is passed, so rows that fail
        #the cutoff never need a parseable value there
        delta = float(line[-8])
        if delta > cutOff and int(line[-4]) == 1:
            #for gained
            geneStatus = name2
            gffTable,windowTable,bedTable = gainedGFF,gainedWindowGFF,gainedBed
        elif delta < (-1 * cutOff) and int(line[-4]) == 1:
            #for lost
            geneStatus = name1
            gffTable,windowTable,bedTable = lostGFF,lostWindowGFF,lostBed
        else:
            #for conserved
            geneStatus = 'UNCHANGED'
            gffTable,windowTable,bedTable = None,None,conservedBed

        if gffTable is not None:
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            gffTable.append(gffLine)
            windowTable.append(gffWindowLine)
        bedTable.append(bedLine)

        #now fill in the gene Table
        for gene in geneList:
            geneTableLine = [gene,line[0],line[1],line[2],line[3],line[6],line[7],line[8],geneStatus]
            geneTable.append(geneTableLine)

    #concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    #start writing the output
    #there's the two gffs, the bed,the formatted table, the gene table

    #formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankTable,formattedFilename,'\t')

    #formatted diff table
    #possible that no genes are differential
    rankEnhancerDiffTable = utils.parseTable(diffOutput,'\t')
    formattedRankDiffTable = [rankEnhancerDiffTable[0]]
    for line in rankEnhancerDiffTable[1:]:
        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankDiffTable.append(line)

    formattedDiffFilename = "%s%s_%s_MERGED_%s_RANK_DIFF_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankDiffTable,formattedDiffFilename,'\t')

    #gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder+'gff/',True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,name2.upper(),enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,name2.upper(),enhancerType,windowKB,windowKB)

    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,name1.upper(),enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,name1.upper(),enhancerType,windowKB,windowKB)

    utils.unParseTable(gainedGFF,gffFilename_gained,'\t')
    utils.unParseTable(gainedWindowGFF,gffFilenameWindow_gained,'\t')

    utils.unParseTable(lostGFF,gffFilename_lost,'\t')
    utils.unParseTable(lostWindowGFF,gffFilenameWindow_lost,'\t')

    #bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(fullBed,bedFilename,'\t')

    #geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(geneTable,geneFilename,'\t')

    #finally, move all of the plots to the output folder
    #(source glob inside the ROSE folder, suffix of the copy in the output folder)
    #NOTE(review): the UNCHANGED copy used to be taken from the *REGION_LOST* glob, which
    #wrote the lost plot under the UNCHANGED name. assumes the plotting step names its
    #unchanged plot *REGION_UNCHANGED*.pdf like the gained/lost ones -- confirm
    plotCopyList = [
        ('*DELTA*.pdf','DELTA.pdf'),
        ('*REGION_GAINED*.pdf','REGION_GAINED.pdf'),
        ('*REGION_LOST*.pdf','REGION_LOST.pdf'),
        ('*REGION_UNCHANGED*.pdf','REGION_UNCHANGED.pdf'),
        ('*RANK_PLOT.png','RANK_PLOT.png'),
        ]
    for plotGlob,plotSuffix in plotCopyList:
        cmd = "cp %s%s_ROSE/%s %s%s_%s_MERGED_%s_%s" % (mergeFolder,namesList1[0],plotGlob,outputFolder,genome,mergeName,enhancerType,plotSuffix)
        os.system(cmd)

    #now execute the bamPlot_turbo.py commands
    if plotBam:
        bamList1 = [dataDict[name]['bam'] for name in namesList1]
        bamList2 = [dataDict[name]['bam'] for name in namesList2]
        bamString = ','.join(bamList1 + bamList2)

        #every bam is labeled with the name of its group
        nameList = [name1]*len(namesList1) + [name2]*len(namesList2)
        nameString = ','.join(nameList)
        print(namesList1[0])
        print(namesList2[0])

        print(namesList1)
        print(namesList2)
        print(dataDict[namesList1[0]]['color'])

        #use the data table colors when the two groups differ, otherwise black vs grey
        color1 = dataDict[namesList1[0]]['color']
        color2 = dataDict[namesList2[0]]['color']
        if color1 != color2:
            colorList = [color1]*len(namesList1) + [color2]*len(namesList2)
        else:
            colorList = ['0,0,0']*len(namesList1) + ['100,100,100']*len(namesList2)
        colorString = ':'.join(colorList)

        #(gff to plot, plot title) in the order gained, gained window, lost, lost window
        #empty gffs are not plotted
        plotJobs = []
        if len(gainedGFF) > 0:
            plotJobs.append((gffFilename_gained,"%s_ONLY_SE" % (name2)))
            plotJobs.append((gffFilenameWindow_gained,"%s_ONLY_SE_%sKB_WINDOW" % (name2,windowKB)))
        if len(lostGFF) > 0:
            plotJobs.append((gffFilename_lost,"%s_ONLY_SE" % (name1)))
            plotJobs.append((gffFilenameWindow_lost,"%s_ONLY_SE_%sKB_WINDOW" % (name1,windowKB)))

        #pipelineDir is not defined in this function; it has to exist at module level
        for gffFile,plotTitle in plotJobs:
            cmd = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE' % (pipelineDir,genome,bamString,gffFile,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

    return
def finishRankOutput(dataFile,rankOutput,genome,mergeFolder,mergeName,name1,name2,cutOff=1.5,window = 100000,superOnly=True,plotBam=True):

    '''
    cleans up the rank output table and writes the final outputs of the merged analysis

    reads the rank table (rankOutput), renames two header columns, strips the
    '_lociStitched' suffix from the region ids and writes under mergeFolder + 'output/':
        - the formatted rank table
        - gffs of the gained (name2 only) and lost (name1 only) regions,
          both as is and extended by +/- window
        - a bed with a gained, a conserved, and a lost track
        - a gene table with one row per gene/region pair and its status
    then copies the plots found in the <name1>_ROSE folder into the output folder
    and, if plotBam is set, runs bamPlot_turbo.py on the gained/lost gffs

    a region is called gained when column 6 of its row is > cutOff, lost when it is
    < -cutOff, otherwise 'CONSERVED'. genes are read from columns 9, 10 and 11

    dataFile: data table loaded with pipeline_dfci.loadDataTable; provides the
              'bam' entry for name1 and name2
    cutOff,window: coerced to float and int respectively
    superOnly: only affects labels ('SUPERS' vs 'ENHANCERS') in file names and tracks

    returns None. all results are files on disk
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    #str.upper() behaves like string.upper() but also exists in python 3
    genome = genome.upper()
    #floor division keeps the KB label an integer on python 2 and python 3
    windowKB = window // 1000

    #make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder+'output/',True)

    #bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput,'\t')

    #make a new formatted table
    #the header row is relabeled in place
    header = rankEnhancerTable[0]
    header[-4] = 'DELTA RANK'
    header[-3] = 'IS_SUPER'
    formattedRankTable = [header]

    #the gffs
    gainedGFF = []
    lostGFF = []
    gainedWindowGFF = []
    lostWindowGFF = []

    #labels that differ between a supers only and an all enhancers run
    if superOnly:
        enhancerType = 'SUPERS'
        shortLabel = 'SEs'
        longLabel = 'super enhancers'
    else:
        enhancerType = 'ENHANCERS'
        shortLabel = 'enhancers'
        longLabel = 'enhancers'

    #the beds
    #each bed starts with its track header line
    onlyTrackTemplate = 'track name="%s %s only %s" description="%s %s that are found only in %s vs %s" itemRGB=On color=%s'
    bothTrackTemplate = 'track name="%s %s and %s %s" description="%s %s that are found in both %s vs %s" itemRGB=On color=0,0,0'

    gainedTrackHeader = onlyTrackTemplate % (genome,name2,shortLabel,genome,longLabel,name2,name1,'255,0,0')
    gainedBed = [[gainedTrackHeader]]
    conservedTrackHeader = bothTrackTemplate % (genome,name1,name2,shortLabel,genome,longLabel,name1,name2)
    conservedBed = [[conservedTrackHeader]]
    lostTrackHeader = onlyTrackTemplate % (genome,name1,shortLabel,genome,longLabel,name1,name2,'0,255,0')
    lostBed = [[lostTrackHeader]]

    #the genes
    geneTable = [['GENE','ENHANCER_ID','ENHANCER_CHROM','ENHANCER_START','ENHANCER_STOP',header[6],header[7],header[8],'STATUS']]

    for line in rankEnhancerTable[1:]:
        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankTable.append(line)

        #getting the genes
        geneList = []
        for geneField in (line[9],line[10],line[11]):
            geneList += geneField.split(',')
        geneList = [x for x in geneList if len(x) > 0]
        geneList = utils.uniquify(geneList)
        geneString = ','.join(geneList)

        bedLine = [line[1],line[2],line[3],line[0],line[-4]]

        delta = float(line[6])
        if delta > cutOff:
            #for gained
            geneStatus = name2
            gffTable,windowTable,bedTable = gainedGFF,gainedWindowGFF,gainedBed
        elif delta < (-1 * cutOff):
            #for lost
            geneStatus = name1
            gffTable,windowTable,bedTable = lostGFF,lostWindowGFF,lostBed
        else:
            #for conserved
            geneStatus = 'CONSERVED'
            gffTable,windowTable,bedTable = None,None,conservedBed

        #coordinates are only parsed for the gained/lost rows that end up in a gff
        if gffTable is not None:
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            gffTable.append(gffLine)
            windowTable.append(gffWindowLine)
        bedTable.append(bedLine)

        #now fill in the gene Table
        for gene in geneList:
            geneTableLine = [gene,line[0],line[1],line[2],line[3],line[6],line[7],line[8],geneStatus]
            geneTable.append(geneTableLine)

    #concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    #start writing the output
    #there's the two gffs, the bed,the formatted table, the gene table

    #formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankTable,formattedFilename,'\t')

    #gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder+'gff/',True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,name2.upper(),enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,name2.upper(),enhancerType,windowKB,windowKB)

    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,name1.upper(),enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,name1.upper(),enhancerType,windowKB,windowKB)

    utils.unParseTable(gainedGFF,gffFilename_gained,'\t')
    utils.unParseTable(gainedWindowGFF,gffFilenameWindow_gained,'\t')

    utils.unParseTable(lostGFF,gffFilename_lost,'\t')
    utils.unParseTable(lostWindowGFF,gffFilenameWindow_lost,'\t')

    #bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(fullBed,bedFilename,'\t')

    #geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(geneTable,geneFilename,'\t')

    #finally, move all of the plots to the output folder
    #NOTE(review): cp with a glob and a single file destination only works when the
    #glob matches exactly one file -- confirm the ROSE folder holds a single pdf
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_%s_DELTA.pdf" % (mergeFolder,name1,outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % (mergeFolder,name1,outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    #now execute the bamPlot_turbo.py commands
    if plotBam:
        bam1 = dataDict[name1]['bam']
        bam2 = dataDict[name2]['bam']
        bamString = "%s,%s" % (bam1,bam2)
        nameString = "%s,%s" % (name1,name2)
        colorString = "0,0,0:100,100,100"

        #change dir
        #bamPlot_turbo.py is called by its bare name below, so the working directory
        #has to be pipelineDir (not defined in this function; has to exist at module level)
        os.chdir(pipelineDir)

        #(gff to plot, plot title) in the order gained, gained window, lost, lost window
        #empty gffs are not plotted
        plotJobs = []
        if len(gainedGFF) > 0:
            plotJobs.append((gffFilename_gained,"%s_ONLY_SE" % (name2)))
            plotJobs.append((gffFilenameWindow_gained,"%s_ONLY_SE_%sKB_WINDOW" % (name2,windowKB)))
        if len(lostGFF) > 0:
            plotJobs.append((gffFilename_lost,"%s_ONLY_SE" % (name1)))
            plotJobs.append((gffFilenameWindow_lost,"%s_ONLY_SE_%sKB_WINDOW" % (name1,windowKB)))

        for gffFile,plotTitle in plotJobs:
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,gffFile,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)

    return
def main():
    '''
    main run call

    parses the command line, turns the input .bed/.gff files into one master gff,
    filters it to the chroms found in the bams, optionally removes masked regions,
    stitches the regions, maps every bam to the stitched gff with bamliquidator,
    and finally calls ROSE2_callSuper.R and ROSE2_geneMapper.py on the result

    relies on names that are not defined in this function and have to exist at
    module level: pipeline_dir, bamliquidator_path, utils, os, sys and the helpers
    getBamChromList, filterGFF, checkRefCollection, regionStitching,
    mapCollection and collapseRegionMap

    exits with status 1 when required flags are missing or a step fails
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        # missing required flags is a usage error, so report it through the exit status
        sys.exit(1)

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)
    inputList = [inputFile for inputFile in options.input.split(',') if len(inputFile) > 1]
    if len(inputList) == 0:
        # without this check the analysis name lookup below fails on an empty list
        print('ERROR: NO INPUT FILES FOUND IN %s' % (options.input))
        sys.exit(1)

    #converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        # the extension of this file decides the conversion, not the extension of
        # the whole comma separated option string
        inputExtension = inputFile.split('.')[-1]
        if inputExtension == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][0:-4] #strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        else:
            if inputExtension != 'gff':
                print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)

    # GETTING THE LIST OF BAMFILES TO PROCESS
    #either same number of bams for rankby and control
    #or only 1 control
    #or none!
    #bamlist should be all rankby bams followed by control bams
    rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) > 0]
    if options.control:
        controlBamList = [bam for bam in options.control.split(',') if len(bam) > 0]
        if len(controlBamList) == len(rankbyBamList):
            #case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            #case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList*len(rankbyBamList)
        else:
            print('ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE')
            sys.exit(1)
    else:
        bamFileList = rankbyBamList

    # Stitch parameter
    # '' is passed on to regionStitching as is; the help text says the stitching
    # parameter is then determined automatically
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE GENOME
    # str.upper() behaves like string.upper() but also exists in python 3
    genome = options.genome.upper()
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome.upper()]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit(1)

    #FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))

    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    #use a simpler unique region naming system
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0], '\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile, '\t')
            gffCollection = utils.gffToLocusCollection(gff, 50)
            inputLoci += gffCollection.getLoci()

        inputCollection = utils.LocusCollection(inputLoci, 50)
        inputCollection = inputCollection.stitchCollection() # stitches to produce unique regions
        inputGFF = utils.locusCollectionToGFF(inputCollection)

    formattedGFF = []
    #now number things appropriately
    for i, line in enumerate(inputGFF):
        #regions are renamed <inputName>_<n> with n counting from 1
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName, str(i+1)) #1 indexing

        newLine = [chrom, lineID, lineID, min(coords), max(coords), '', sense, '', lineID]
        formattedGFF.append(newLine)

    #name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder, genome.upper(), inputName)
    utils.unParseTable(formattedGFF, masterGFFFile, '\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))

    # MAKING THE START DICT
    print('MAKING START DICT')
    # NOTE(review): startDict is not used anywhere below in this function;
    # the call is kept so the run behaves as before
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile, bamChromList)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        # if it's a bed file
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit(1)
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        # a locus is dropped as soon as it overlaps any mask region
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))

    # NOTE(review): '/' is kept on purpose. the type regionStitching returns for
    # stitchWindow is not visible here, so switching to '//' could change the label.
    # on python 3 an int window gives a label such as 12.5KB -- confirm what is wanted
    stitchKB = str(stitchWindow / 1000)
    if not removeTSS:
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchKB)
    else:
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, stitchKB)
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    # MAPPING TO THE STITCHED GFF
    bamFileListUnique = utils.uniquify(list(bamFileList)) #prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s/matrix.txt' % (mappedOut1Folder)
        # existing mapping output is reused instead of being recomputed
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit(1)

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)

    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    rankbyName = inputName + '_MERGED_SIGNAL'
    metaOutputFile = collapseRegionMap(outputFile1, rankbyName, controlBams=options.control)

    #now try the merging

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (pipeline_dir, outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    print('CALLING GENE MAPPING')

    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)

    #for now don't use ranking bam to call top genes
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir, genome, outFolder, superTableFile)
    print(cmd)
    os.system(cmd)

    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)

    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir, genome, outFolder, stretchTableFile)
    print(cmd)
    os.system(cmd)

    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)

    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir, genome, outFolder, superStretchTableFile)
    os.system(cmd)
def _runBamliquidator(bamliquidatorPath, gffFile, outFolder, bamFile, label):
    '''
    runs bamliquidator_batch.py for one bam against one gff.
    label is only used in the progress message ('rankby' or 'control').
    exits the interpreter with status 1 if the command writes nothing to stdout,
    which is the success test this module uses for bamliquidator
    '''
    cmd = 'python ' + bamliquidatorPath + \
        ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (gffFile, outFolder, bamFile)
    print("Mapping %s bam %s" % (label, bamFile))
    print(cmd)

    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    stdoutData = process.communicate()[0]
    if len(stdoutData) > 0:  # test if mapping worked correctly
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (gffFile, bamFile))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (gffFile, bamFile))
        # non-zero status so calling shells/pipelines can see the failure
        sys.exit(1)


def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers and picks a "top" gene per enhancer by TSS signal.
    if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq

    rankByBamFile: bam mapped at the +/- 5kb TSS regions of enhancer associated genes
    controlBamFile: optional control bam; empty string (or None) means no control
    genome: genome build name, only used to name the TSS gff that is written
    annotFile: annotation file handed to utils.makeStartDict / makeTranscriptCollection
    enhancerFile: tab delimited enhancer table; lines starting with '#' are comments,
                  the first other line is the header, and the last two columns of
                  each row are read as integer rank and super status
    transcribedFile: optional table whose second column lists transcribed gene ids;
                     when empty every gene in the annotation is used
    searchWindow: distance (bp) around the enhancer used to call proximal TSSs
    noFormatTable: if True keep every input column and do not re-sort the output

    returns (enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable);
    when noFormatTable is False the first two are sorted by enhancer rank.

    side effects: writes a TSS gff next to enhancerFile and runs bamliquidator on it.
    exits with status 1 if mapping fails or mapped output cannot be found
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    if transcribedFile:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        # materialize so the ids can be iterated more than once on python 3
        transcribedGenes = list(startDict.keys())

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = [utils.makeTSSLocus(geneID, startDict, 0, 0) for geneID in transcribedGenes]

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really
    # matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    geneDict = {'overlapping': defaultdict(list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # find the header: first line that is not a '#' comment
    header = None
    for line in enhancerTable:
        if line[0][0] != '#':
            header = line
            break
    if header is None:
        raise ValueError('no header line found in enhancer table %s' % (enhancerFile))

    geneColumns = ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']
    # set up the output tables
    # first by enhancer
    if noFormatTable:
        enhancerToGeneTable = [header + geneColumns]
    else:
        enhancerToGeneTable = [header[0:9] + geneColumns + header[-2:]]

    # next make the gene to enhancer table
    geneToEnhancerTable = [
        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        # skip comment lines and the header (first field starts with 'R')
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])
        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(enhancerLocus, 'both')
        overlappingGenes = [locus.ID() for locus in overlappingLoci]

        # proximalGenes are transcribed genes where the tss is within
        # searchWindow of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = [locus.ID() for locus in proximalLoci]

        # distal genes: tss within 1mb, only used to find the closest gene
        distalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = [locus.ID() for locus in distalLoci]

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        # built before de-duplication below, so a gene may appear more than once;
        # that is harmless for the closest-gene search
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes

        # make sure a gene is not reported as both overlapping and proximal.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the search window, but we'll let that slide
        overlappingSet = set(overlappingGenes)
        proximalGenes = [refID for refID in proximalGenes if refID not in overlappingSet]

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[
                allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        overlapNames = ','.join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]))
        proximalNames = ','.join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]))
        if noFormatTable:
            newEnhancerLine = list(line) + [overlapNames, proximalNames, closestGene]
        else:
            newEnhancerLine = line[0:9] + [overlapNames, proximalNames, closestGene] + line[-2:]
        enhancerToGeneTable.append(newEnhancerLine)

        # Now grab all overlapping and proximal genes for the gene ordered
        # table
        enhancerRank = int(line[-2])
        enhancerIsSuper = int(line[-1])

        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(enhancerRank)
            superDict[refID].append(enhancerIsSuper)

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(enhancerRank)
            superDict[refID].append(enhancerIsSuper)

    # End loop through
    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    enhancerGeneCollection = utils.makeTranscriptCollection(
        annotFile, 5000, 5000, 500, overallGeneList)

    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName, gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamToGFF

    # Try to use the bamliquidator_batch.py script on cluster, otherwise fall back
    # to a copy in the current working directory, otherwise fail.
    # NOTE(review): the fallback is tested with os.path.isfile on a bare file name,
    # so it only finds the script in the working directory, not on $PATH.
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    bamName = rankByBamFile.split('/')[-1]
    mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName, gffRootName, bamName)
    mappedRankByFile = "%s%s_%s_%s/matrix.gff" % (enhancerFolder, enhancerName, gffRootName, bamName)
    _runBamliquidator(bamliquidator_path, enhancerGeneGFFFile,
                      mappedRankByFolder, rankByBamFile, 'rankby')

    # next on the control bam if it exists
    hasControl = bool(controlBamFile)
    if hasControl:
        controlName = controlBamFile.split('/')[-1]
        mappedControlFolder = "%s%s_%s_%s/" % (
            enhancerFolder, enhancerName, gffRootName, controlName)
        mappedControlFile = "%s%s_%s_%s/matrix.gff" % (
            enhancerFolder, enhancerName, gffRootName, controlName)
        _runBamliquidator(bamliquidator_path, enhancerGeneGFFFile,
                          mappedControlFolder, controlBamFile, 'control')

    # now get the appropriate output files
    if hasControl:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" %
              (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit(1)
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit(1)

    # use enhancer rank to order
    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    # gene names already written; a set keeps the membership test cheap
    usedNames = set()

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        # with uniqueGenes only the best ranked transcript of each gene is kept
        if uniqueGenes and geneName in usedNames:
            continue
        usedNames.add(geneName)

        proxEnhancers = geneDict['overlapping'][refID] + geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = ','.join([str(x) for x in rankDict[refID]])

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        newLine = [geneName, refID, ','.join(proxEnhancers),
                   enhancerRanks, superStatus, enhancerSignal]
        geneToEnhancerTable.append(newLine)

    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [
            'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:

        geneList = []
        if noFormatTable:
            geneList += line[-3].split(',')
            geneList += line[-2].split(',')
        else:
            # NOTE(review): when the input row has at least 9 columns, indices 10
            # and 11 are PROXIMAL_GENES and CLOSEST_GENE, whereas the noFormatTable
            # branch reads OVERLAP_GENES and PROXIMAL_GENES. Left unchanged because
            # the intended candidate set is unclear - confirm.
            geneList += line[10].split(',')
            geneList += line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])
        if len(geneList) > 0:
            try:
                # max() raises ValueError for a gene with no recorded TSS signal
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                maxGene = geneList[maxIndex]
                maxSig = sigVector[maxIndex]
                if maxSig == 0.0:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
            except ValueError:
                if len(geneList) == 1:
                    maxGene = geneList[0]
                    maxSig = 'NONE'
                else:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
        else:
            maxGene = 'NONE'
            maxSig = 'NONE'

        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable

    enhancerOrder = utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]])
    sortedTable = [enhancerToGeneTable[0]]
    sortedTopGeneTable = [enhancerToTopGeneTable[0]]
    for i in enhancerOrder:
        # i + 1 skips the header row
        sortedTable.append(enhancerToGeneTable[(i + 1)])
        sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

    return sortedTable, sortedTopGeneTable, geneToEnhancerTable
def _mapGFFIfMissing(bamliquidator_path, gffFile, mappedOutFolder, mappedOutFile, bamFile):
    '''
    maps bamFile onto gffFile with bamliquidator_batch.py unless utils.checkOutput
    already finds mappedOutFile.
    exits the interpreter with status 1 if the command writes nothing to stdout,
    which is the success test this module uses for bamliquidator
    '''
    bamFileName = bamFile.split('/')[-1]

    if utils.checkOutput(mappedOutFile, 0.2, 0.2):
        print("FOUND %s MAPPING DATA FOR BAM: %s" % (gffFile, mappedOutFile))
        return

    cmd = "python " + bamliquidator_path + \
        " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (gffFile, mappedOutFolder, bamFile)
    print(cmd)

    process = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    stdoutData = process.communicate()[0]
    if len(stdoutData) > 0:  # test if mapping worked correctly
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (gffFile, bamFileName))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (gffFile, bamFileName))
        # non-zero status so calling shells/pipelines can see the failure
        sys.exit(1)


def main():
    '''
    main run call

    parses the command line, stitches the input regions, maps every bam to the
    stitched and original gffs with bamliquidator, then calls ROSE2_callSuper.R
    and ROSE2_geneMapper.py on the results.
    exits with status 1 on missing required flags, an unsupported genome,
    a bad mask file, or a failed mapping
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM8,MM9,MM10,HG18,HG19)")

    # optional flags
    parser.add_option("-b", "--bams", dest="bams", nargs=1, default=None,
                      help="Enter a comma separated list of additional bam files to map to")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="bamfile to use as control")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")
    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        sys.exit(1)

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE
    inputExtension = options.input.split('.')[-1]
    if inputExtension == 'bed':
        # CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    else:
        if inputExtension != 'gff':
            print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
        # COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # GETTING THE LIST OF BAMFILES TO PROCESS
    bamFileList = [options.rankby]
    if options.control:
        bamFileList.append(options.control)
    if options.bams:
        bamFileList += options.bams.split(',')
    bamFileList = utils.uniquify(bamFileList)

    # optional args

    # Stitch parameter: '' is passed through to regionStitching, which returns
    # the stitch window that was actually used
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    # annotation files are looked up relative to the current working directory
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }

    genomeKey = genome.upper()
    if genomeKey not in genomeDict:
        print('ERROR: UNSUPPORTED GENOME %s. SUPPORTED GENOMES ARE: %s' %
              (genome, ','.join(sorted(genomeDict))))
        sys.exit(1)
    annotFile = genomeDict[genomeKey]

    # MAKING THE START DICT
    # NOTE(review): startDict is not read again in main; the call is kept so the
    # annotation file is still parsed up front - confirm whether it is needed.
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFFFile)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    # NOTE(review): the mask only filters referenceCollection; regionStitching
    # below is still given the unmasked inputGFFFile - confirm this is intended.
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()

        # if it's a bed file
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit(1)
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci
                        if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" %
              (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        inputGFFFile, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)
    # making sure start/stop ordering are correct
    for line in stitchedGFF:
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3], line[4] = stop, start

    # stitch window in kb as it appears in the output names
    stitchKB = str(stitchWindow / 1000)
    if not removeTSS:
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchKB)
    else:
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, stitchKB)
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1))

    # Try to use the bamliquidator_batch.py script on cluster, otherwise fall back
    # to a copy in the current working directory, otherwise fail.
    # NOTE(review): the fallback is tested with os.path.isfile on a bare file name,
    # so it only finds the script in the working directory, not on $PATH.
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    for bamFile in bamFileList:
        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.gff' % (mappedFolder, stitchedGFFName, bamFileName)
        _mapGFFIfMissing(bamliquidator_path, stitchedGFFFile,
                         mappedOut1Folder, mappedOut1File, bamFile)

        # MAPPING TO THE ORIGINAL GFF
        mappedOut2Folder = '%s%s_%s_MAPPED' % (mappedFolder, inputName, bamFileName)
        mappedOut2File = '%s%s_%s_MAPPED/matrix.gff' % (mappedFolder, inputName, bamFileName)
        _mapGFFIfMissing(bamliquidator_path, inputGFFFile,
                         mappedOut2Folder, mappedOut2File, bamFile)

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList,
                  mappedFolder, outputFile1, refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    # the R script receives the control bam name, or 'NONE' when there is no control
    if options.control:
        controlName = options.control.split('/')[-1]
    else:
        controlName = 'NONE'
    cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (
        outFolder, outputFile1, inputName, controlName)
    print(cmd)
    os.system(cmd)

    # calling the gene mapper
    # NOTE(review): fixed pause before reading the R outputs; presumably allows
    # slow file systems to catch up - confirm it is still required.
    time.sleep(20)

    # one gene mapper run per enhancer table written by the R script
    for tableType in ('SuperEnhancers', 'StretchEnhancers', 'SuperStretchEnhancers'):
        tableFile = "%s_%s.table.txt" % (inputName, tableType)
        if options.control:
            cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (
                genome, options.rankby, options.control, outFolder, tableFile)
        else:
            cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (
                genome, options.rankby, outFolder, tableFile)
        os.system(cmd)