def checkRefCollection(referenceCollection):

    '''
    makes sure the names of all loci in the reference collection are unique
    '''

    namesList = [locus.ID() for locus in referenceCollection.getLoci()]
    
    if len(namesList) != len(ROSE_utils.uniquify(namesList)):
        print("ERROR: REGIONS HAVE NON-UNIQUE IDENTIFIERS")
        print("THE SECOND COLUMN OF THE INPUT .GFF OR THE FOURTH COLUMN OF THE INPUT .BED MUST HAVE A UNIQUE IDENTIFIER FOR EACH REGION")
        sys.exit()
    else:
        print("REFERENCE COLLECTION PASSES QC")
        return
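
# Illustrative sketch only (not part of ROSE): the same uniqueness QC expressed on a
# plain list of IDs, which also reports *which* identifiers are duplicated.
# 'regionIDs' is a hypothetical stand-in for [locus.ID() for locus in referenceCollection.getLoci()].
from collections import Counter

def findDuplicateIDs(regionIDs):
    '''returns the region IDs that occur more than once'''
    counts = Counter(regionIDs)
    return [regionID for regionID, n in counts.items() if n > 1]

# example: findDuplicateIDs(['peak_1', 'peak_2', 'peak_2']) -> ['peak_2']
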
# Example 3
def mapBamToGFF(bamFile,gff,sense = 'both',extension = 200,floor = 0,rpm = False,matrix = None):

#def mapBamToGFF(bamFile,gff,sense = 'both',unique = 0,extension = 200,floor = 0,density = False,rpm = False,binSize = 25,clusterGram = None,matrix = None,raw = False,includeJxnReads = False):
	'''maps reads from a bam to a gff'''
	floor = int(floor)

	#USING BAM CLASS
	bam = ROSE_utils.Bam(bamFile)


	#new GFF to write to
	newGFF = []
	#millionMappedReads


	if rpm:
		MMR= round(float(bam.getTotalReads('mapped'))/1000000,4)
	else:
		MMR = 1

	print('using a MMR value of %s' % (MMR))

	senseTrans = str.maketrans('-+.','+-+')

	if ROSE_utils.checkChrStatus(bamFile) == 1:
		print("has chr")
		hasChrFlag = 1
		#sys.exit();
	else:
		print("does not have chr")
		hasChrFlag = 0
		#sys.exit()

	if type(gff) == str:
		gff = ROSE_utils.parseTable(gff,'\t')

	#setting up a matrix table

	newGFF.append(['GENE_ID','locusLine'] + ['bin_'+str(n)+'_'+bamFile.split('/')[-1] for n in range(1,int(matrix)+1,1)])

	#getting and processing reads for gff lines
	ticker = 0
	print('Number lines processed')
	for line in gff:
		line = line[0:9]
		if ticker%100 == 0:
			print(ticker)
		ticker+=1
		if not hasChrFlag:
			line[0] = re.sub(r"chr",r"",line[0])
		gffLocus = ROSE_utils.Locus(line[0],int(line[3]),int(line[4]),line[6],line[1])
		#print(line[0])
		#sys.exit()
		searchLocus = ROSE_utils.makeSearchLocus(gffLocus,int(extension),int(extension))

		reads = bam.getReadsLocus(searchLocus,'both',False,'none')
		#now extend the reads and make a list of extended reads
		extendedReads = []
		for locus in reads:
			if locus.sense() == '+' or locus.sense() == '.':
				locus = ROSE_utils.Locus(locus.chr(),locus.start(),locus.end()+extension,locus.sense(), locus.ID())
			if locus.sense() == '-':
				locus = ROSE_utils.Locus(locus.chr(),locus.start()-extension,locus.end(),locus.sense(),locus.ID())
			extendedReads.append(locus)
		if gffLocus.sense() == '+' or gffLocus.sense() == '.':
			senseReads = filter(lambda x:x.sense() == '+' or x.sense() == '.',extendedReads)
			antiReads = filter(lambda x:x.sense() == '-',extendedReads)
		else:
			senseReads = filter(lambda x:x.sense() == '-' or x.sense() == '.',extendedReads)
			antiReads = filter(lambda x:x.sense() == '+',extendedReads)

		senseHash = defaultdict(int)
		antiHash = defaultdict(int)

		#filling in the readHashes
		if sense == '+' or sense == 'both' or sense =='.':
			for read in senseReads:
				for x in range(read.start(),read.end()+1,1):
					senseHash[x]+=1
		if sense == '-' or sense == 'both' or sense == '.':
			#print('foo')
			for read in antiReads:
				for x in range(read.start(),read.end()+1,1):
					antiHash[x]+=1

		#now apply flooring and filtering for coordinates
		#Python 3: keys() views don't concatenate and filter() returns a one-shot iterator,
		#so build explicit lists that can be re-used across the bins below
		keys = ROSE_utils.uniquify(list(senseHash.keys()) + list(antiHash.keys()))
		if floor > 0:
			keys = list(filter(lambda x: (senseHash[x]+antiHash[x]) > floor,keys))
		#coordinate filtering
		keys = list(filter(lambda x: gffLocus.start() < x < gffLocus.end(),keys))


		#setting up the output table
		clusterLine = [gffLocus.ID(),gffLocus.__str__()]

		#getting the binsize
		binSize = (gffLocus.len()-1)//int(matrix)
		nBins = int(matrix)
		if binSize == 0:
			clusterLine+=['NA']*int(matrix)
			newGFF.append(clusterLine)
			continue
		n=0
		if gffLocus.sense() == '+' or gffLocus.sense() =='.' or gffLocus.sense() == 'both':
			i = gffLocus.start()

			while n <nBins:
				n+=1
				binKeys = filter(lambda x: i < x < i+binSize,keys)
				binDen = float(sum([senseHash[x]+antiHash[x] for x in binKeys]))/binSize
				clusterLine+=[round(binDen/MMR,4)]
				i = i+binSize
		else:
			i = gffLocus.end()
			while n < nBins:
				n+=1
				binKeys = filter(lambda x: i-binSize < x < i,keys)
				binDen = float(sum([senseHash[x]+antiHash[x] for x in binKeys]))/binSize
				clusterLine+=[round(binDen/MMR,4)]
				i = i-binSize
		newGFF.append(clusterLine)


	return newGFF
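
# Illustrative sketch only (not ROSE_utils): the fixed-bin density calculation at the core of
# mapBamToGFF, reduced to plain integers. 'coverage' is a hypothetical dict mapping genomic
# position -> read count (what senseHash/antiHash hold combined); the region [start, end) is
# split into nBins equal bins and each bin reports its mean per-bp coverage.
def binnedDensity(coverage, start, end, nBins):
    '''returns a list of per-bin mean read densities for the region [start, end)'''
    binSize = (end - start) // nBins
    if binSize == 0:
        return ['NA'] * nBins
    densities = []
    for b in range(nBins):
        binStart = start + b * binSize
        binTotal = sum(coverage.get(x, 0) for x in range(binStart, binStart + binSize))
        densities.append(round(float(binTotal) / binSize, 4))
    return densities

# example: binnedDensity({10: 2, 11: 2, 12: 1}, 10, 20, 2) -> [1.0, 0.0]
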



#=====================================================================
#============================MAIN METHOD==============================
#=====================================================================


def main():
	from optparse import OptionParser
	usage = "usage: %prog [options] -b [SORTED BAMFILE] -i [INPUTFILE] -o [OUTPUTFILE]"
	parser = OptionParser(usage = usage)
	#required flags
	parser.add_option("-b","--bam", dest="bam",nargs = 1, default=None,
						help = "Enter .bam file to be processed.")
	parser.add_option("-i","--input", dest="input",nargs = 1, default=None,
						help = "Enter .gff or ENRICHED REGION file to be processed.")
	#output flag
	parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
						help = "Enter the output filename.")
	#additional options
	parser.add_option("-s","--sense", dest="sense",nargs = 1, default='both',
						help = "Map to '+','-' or 'both' strands. Default maps to both.")


	parser.add_option("-f","--floor", dest="floor",nargs =1, default=0,
						help = "Sets a read floor threshold necessary to count towards density")
	parser.add_option("-e","--extension", dest="extension",nargs = 1, default=200,
						help = "Extends reads by n bp. Default value is 200bp")
	parser.add_option("-r","--rpm", dest="rpm",action = 'store_true', default=False,
						help = "Normalizes density to reads per million (rpm)")


	parser.add_option("-m","--matrix", dest="matrix",nargs = 1, default=None,
						help = "Outputs a variable bin sized matrix. User must specify number of bins.")

	(options,args) = parser.parse_args()

	print(options)
	print(args)

	if options.bam:
		bamFile = options.bam
		fullPath = os.path.abspath(bamFile)
		bamName = fullPath.split('/')[-1].split('.')[0]
		pathFolder = '/'.join(fullPath.split('/')[0:-1])
		fileList = os.listdir(pathFolder)
		hasBai = False
		for fileName in fileList:
			if fileName.count(bamName) == 1 and fileName.count('.bai') == 1:
				hasBai = True

		if not hasBai:
			print('ERROR: no associated .bai file found with bam. Must use a sorted bam with accompanying index file')
			parser.print_help()
			exit()

	if options.sense:
		if ['+','-','.','both'].count(options.sense) == 0:
			print('ERROR: sense flag must be followed by +,-,.,both')
			parser.print_help()
			exit()


	if options.matrix:
		try:
			int(options.matrix)
		except:
			print('ERROR: User must specify an integer bin number for matrix (try 50)')
			parser.print_help()
			exit()




	if options.input and options.bam:
		inputFile = options.input
		gffFile = inputFile

		bamFile = options.bam

		if options.output == None:
			output = os.getcwd() + '/' + inputFile.split('/')[-1]+'.mapped'
		else:
			output = options.output
		if options.matrix:
			print('mapping to GFF and making a matrix with fixed bin number')

			newGFF = mapBamToGFF(bamFile,gffFile,options.sense,int(options.extension),options.floor,options.rpm,options.matrix)


		ROSE_utils.unParseTable(newGFF,output,'\t')
	else:
		parser.print_help()


if __name__ == "__main__":
	main()
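
# Example invocation (illustrative; the script and file names are placeholders, the flags are
# the ones defined by the parser above):
#   python ROSE_bamToGFF.py -b sorted_reads.bam -i regions.gff -o regions.mapped.gff -m 50 -r
# -b expects a sorted .bam with an accompanying .bai index, -m sets the number of bins per
# region, and -r normalizes densities to reads per million.
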
# Example 4
def mapEnhancerToGene(annotFile,
                      enhancerFile,
                      transcribedFile='',
                      uniqueGenes=True,
                      byRefseq=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq
    '''
    print("Herp")
    startDict = ROSE_utils.makeStartDict(annotFile)
    print("Derp")
    enhancerTable = ROSE_utils.parseTable(enhancerFile, '\t')

    if len(transcribedFile) > 0:
        transcribedTable = ROSE_utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = list(startDict.keys())

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = ROSE_utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(ROSE_utils.makeTSSLocus(geneID, startDict, 0, 0))

    #this turns the tssLoci list into a LocusCollection
    #50 is the internal parameter for LocusCollection and doesn't really matter
    tssCollection = ROSE_utils.LocusCollection(tssLoci, 50)

    geneDict = {
        'overlapping': defaultdict(list),
        'proximal': defaultdict(list)
    }
    #list of all genes that appear in this analysis
    overallGeneList = []

    #set up the output tables
    #first by enhancer
    enhancerToGeneTable = [
        enhancerTable[5][0:6] +
        ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] +
        enhancerTable[5][-2:]
    ]

    #next by gene
    geneToEnhancerTable = [['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']]

    for line in enhancerTable[6:]:

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])

        enhancerLocus = ROSE_utils.Locus(line[1], line[2], line[3], '.',
                                         line[0])

        #overlapping genes are transcribed genes whose transcript is directly in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(
            enhancerLocus, 'both')
        overlappingGenes = []
        for overlapLocus in overlappingLoci:
            overlappingGenes.append(overlapLocus.ID())

        #proximalGenes are transcribed genes where the tss is within 50kb of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            ROSE_utils.makeSearchLocus(enhancerLocus, 50000, 50000), 'both')
        proximalGenes = []
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())

        distalLoci = tssCollection.getOverlap(
            ROSE_utils.makeSearchLocus(enhancerLocus, 50000000, 50000000),
            'both')
        distalGenes = []
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

        overlappingGenes = ROSE_utils.uniquify(overlappingGenes)
        proximalGenes = ROSE_utils.uniquify(proximalGenes)
        distalGenes = ROSE_utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes
        #these checks make sure each gene list is unique.
        #technically it is possible for a gene to be overlapping, but not proximal since the
        #gene could be longer than the 50kb window, but we'll let that slide here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)

        #Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            #get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            #get absolute distance to enhancer center
            distList = [
                abs(enhancerCenter - startDict[geneID]['start'][0])
                for geneID in allEnhancerGenes
            ]
            #get the ID and convert to name
            #print enhancerCenter - startDict[geneID]['start'][0]
            #print distList.index(min(distList))
            #print min(distList)
            #print len(distList)
            #print len(allEnhancerGenes[distList.index(min(distList))])
            #print line
            #print len(startDict[allEnhancerGenes[distList.index(min(distList))]])
            closestGene = startDict[allEnhancerGenes[distList.index(
                min(distList))]]['name']

        #NOW WRITE THE ROW FOR THE ENHANCER TABLE
        newEnhancerLine = line[0:6]
        if byRefseq:
            newEnhancerLine.append(','.join(
                ROSE_utils.uniquify([x for x in overlappingGenes])))
            newEnhancerLine.append(','.join(
                ROSE_utils.uniquify([x for x in proximalGenes])))
            #print newEnhancerLine
            #print len(allEnhancerGenes)
            #print distList
            closestGene = allEnhancerGenes[distList.index(min(distList))]
            newEnhancerLine.append(closestGene)
        else:
            newEnhancerLine.append(','.join(
                ROSE_utils.uniquify(
                    [startDict[x]['name'] for x in overlappingGenes])))
            newEnhancerLine.append(','.join(
                ROSE_utils.uniquify(
                    [startDict[x]['name'] for x in proximalGenes])))
            closestGene = startDict[allEnhancerGenes[distList.index(
                min(distList))]]['name']
            newEnhancerLine.append(closestGene)

        newEnhancerLine += line[-2:]
        enhancerToGeneTable.append(newEnhancerLine)
        #Now grab all overlapping and proximal genes for the gene ordered table

        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)

    #End loop through

    #Make table by gene
    overallGeneList = ROSE_utils.uniquify(overallGeneList)

    nameOrder = ROSE_utils.order(
        [startDict[x]['name'] for x in overallGeneList])

    usedNames = []
    for i in nameOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if usedNames.count(geneName) > 0 and uniqueGenes == True:

            continue
        else:
            usedNames.append(geneName)

        proxEnhancers = geneDict['proximal'][refID] + geneDict['overlapping'][
            refID]

        newLine = [geneName, refID, ','.join(proxEnhancers)]
        geneToEnhancerTable.append(newLine)

    #re-sort enhancerToGeneTable

    enhancerOrder = ROSE_utils.order(
        [int(line[-2]) for line in enhancerToGeneTable[1:]])
    sortedTable = [enhancerToGeneTable[0]]
    for i in enhancerOrder:
        sortedTable.append(enhancerToGeneTable[(i + 1)])

    return sortedTable, geneToEnhancerTable
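
# Illustrative sketch only (not ROSE_utils): the closest-gene selection used above, reduced to
# plain data. 'tssByGene' is a hypothetical dict mapping refseq ID -> TSS coordinate; the
# closest gene is the one whose TSS lies nearest the enhancer center.
def closestGeneToEnhancer(enhancerStart, enhancerEnd, tssByGene):
    '''returns the gene ID whose TSS is closest to the enhancer center, or None if empty'''
    if not tssByGene:
        return None
    center = (enhancerStart + enhancerEnd) // 2
    return min(tssByGene, key=lambda geneID: abs(center - tssByGene[geneID]))

# example: closestGeneToEnhancer(1000, 3000, {'NM_001': 1500, 'NM_002': 9000}) -> 'NM_001'
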
# Example 5
def main():
    '''
    main run call
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_ENHANCER_FILE]"
    parser = OptionParser(usage=usage)
    #required flags
    parser.add_option(
        "-i",
        "--i",
        dest="input",
        nargs=1,
        default=None,
        help="Enter a ROSE ranked enhancer or super-enhancer file")
    parser.add_option("-g",
                      "--genome",
                      dest="genome",
                      nargs=1,
                      default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19,HG38)")

    #optional flags
    parser.add_option("-l",
                      "--list",
                      dest="geneList",
                      nargs=1,
                      default=None,
                      help="Enter a gene list to filter through")
    parser.add_option(
        "-o",
        "--out",
        dest="out",
        nargs=1,
        default=None,
        help="Enter an output folder. Default will be same folder as input file"
    )
    parser.add_option(
        "-r",
        "--refseq",
        dest="refseq",
        action='store_true',
        default=False,
        help="If flagged will write output by refseq ID and not common name")

    #RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.genome:

        parser.print_help()
        exit()

    #GETTING THE INPUT
    enhancerFile = options.input

    #making the out folder if it doesn't exist
    if options.out:
        outFolder = ROSE_utils.formatFolder(options.out, True)
    else:
        outFolder = '/'.join(enhancerFile.split('/')[0:-1]) + '/'

    #GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    #GETTING THE CORRECT ANNOT FILE
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'HG38': '%s/annotation/hg38_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }

    annotFile = genomeDict[genome.upper()]

    #GETTING THE TRANSCRIBED LIST
    if options.geneList:

        transcribedFile = options.geneList
    else:
        transcribedFile = ''

    enhancerToGeneTable, geneToEnhancerTable = mapEnhancerToGene(
        annotFile,
        enhancerFile,
        uniqueGenes=True,
        byRefseq=options.refseq,
        transcribedFile=transcribedFile)

    #Writing enhancer output
    enhancerFileName = enhancerFile.split('/')[-1].split('.')[0]

    #writing the enhancer table
    out1 = '%s%s_ENHANCER_TO_GENE.txt' % (outFolder, enhancerFileName)
    ROSE_utils.unParseTable(enhancerToGeneTable, out1, '\t')

    #writing the gene table
    out2 = '%s%s_GENE_TO_ENHANCER.txt' % (outFolder, enhancerFileName)
    ROSE_utils.unParseTable(geneToEnhancerTable, out2, '\t')
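
# Example invocation (illustrative; file names are placeholders, the flags are the ones defined
# by the parser above):
#   python ROSE_geneMapper.py -g HG19 -i rose_out_SuperEnhancers.table.txt -o mapped/
# Writes <input>_ENHANCER_TO_GENE.txt and <input>_GENE_TO_ENHANCER.txt into the output folder.
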
# Example 6
def regionStitching(inputGFF,stitchWindow,tssWindow,annotFile,removeTSS=True):
	print('PERFORMING REGION STITCHING')
	#first have to turn bound region file into a locus collection

	#need to make sure this names correctly... each region should have a unique name
	boundCollection = ROSE_utils.gffToLocusCollection(inputGFF)

	debugOutput = []
	#filter out all bound regions that overlap the TSS of an ACTIVE GENE
	if removeTSS:
		#first make a locus collection of TSS
		startDict = ROSE_utils.makeStartDict(annotFile)

		#now makeTSS loci for active genes
		removeTicker=0
		#this loop makes a locus centered around +/- tssWindow of transcribed genes
		#then adds it to the list tssLoci
		tssLoci = []
		for geneID in startDict.keys():
			tssLoci.append(ROSE_utils.makeTSSLocus(geneID,startDict,tssWindow,tssWindow))


		#this turns the tssLoci list into a LocusCollection
		#50 is the internal parameter for LocusCollection and doesn't really matter
		tssCollection = ROSE_utils.LocusCollection(tssLoci,50)

		#gives all the loci in boundCollection
		boundLoci = boundCollection.getLoci()

		#this loop will check if each bound region is contained by the TSS exclusion zone
		#this will drop out a lot of the promoter only regions that are tiny
		#typical exclusion window is around 2kb
		for locus in boundLoci:
			if len(tssCollection.getContainers(locus,'both'))>0:

				#if true, the bound locus overlaps an active gene
				boundCollection.remove(locus)
				debugOutput.append([locus.__str__(),locus.ID(),'CONTAINED'])
				removeTicker+=1
		print('REMOVED %s LOCI BECAUSE THEY WERE CONTAINED BY A TSS' % (removeTicker))

	#boundCollection is now all enriched region loci that don't overlap an active TSS
	stitchedCollection = boundCollection.stitchCollection(stitchWindow,'both')

	if removeTSS:
		#now replace any stitched region that overlap 2 distinct genes
		#with the original loci that were there
		fixedLoci = []
		tssLoci = []
		for geneID in startDict.keys():
			tssLoci.append(ROSE_utils.makeTSSLocus(geneID,startDict,50,50))


		#this turns the tssLoci list into a LocusCollection
		#50 is the internal parameter for LocusCollection and doesn't really matter
		tssCollection = ROSE_utils.LocusCollection(tssLoci,50)
		removeTicker = 0
		originalTicker = 0
		for stitchedLocus in stitchedCollection.getLoci():
			overlappingTSSLoci = tssCollection.getOverlap(stitchedLocus,'both')
			tssNames = [startDict[tssLocus.ID()]['name'] for tssLocus in overlappingTSSLoci]
			tssNames = ROSE_utils.uniquify(tssNames)
			if len(tssNames) > 2:

				#stitchedCollection.remove(stitchedLocus)
				originalLoci = boundCollection.getOverlap(stitchedLocus,'both')
				originalTicker+=len(originalLoci)
				fixedLoci+=originalLoci
				debugOutput.append([stitchedLocus.__str__(),stitchedLocus.ID(),'MULTIPLE_TSS'])
				removeTicker+=1
			else:
				fixedLoci.append(stitchedLocus)

		print('REMOVED %s STITCHED LOCI BECAUSE THEY OVERLAPPED MULTIPLE TSSs' % (removeTicker))
		print('ADDED BACK %s ORIGINAL LOCI' % (originalTicker))
		fixedCollection = ROSE_utils.LocusCollection(fixedLoci,50)
		return fixedCollection,debugOutput
	else:
		return stitchedCollection,debugOutput
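
# Illustrative sketch only (not ROSE_utils.LocusCollection.stitchCollection): the stitching idea
# used above, on plain (start, end) tuples from a single chromosome. Regions separated by at
# most stitchWindow bp are merged into one region.
def stitchRegions(regions, stitchWindow):
    '''merges (start, end) tuples whose gap is <= stitchWindow; returns the merged list'''
    stitched = []
    for start, end in sorted(regions):
        if stitched and start - stitched[-1][1] <= stitchWindow:
            #close enough to the previous region: extend it
            stitched[-1] = (stitched[-1][0], max(stitched[-1][1], end))
        else:
            stitched.append((start, end))
    return stitched

# example: stitchRegions([(100, 200), (250, 300), (20000, 20100)], 12500)
#          -> [(100, 300), (20000, 20100)]
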
def main():
    '''
    main run call
    '''
    debug = False


    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_ENHANCER_FILE]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-i","--i", dest="input",nargs = 1, default=None,
                      help = "Enter a ROSE ranked enhancer or super-enhancer file")
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
                      help = "Enter the genome build (MM9,MM8,HG18,HG19)")

    #optional flags
    parser.add_option("-l","--list", dest="geneList",nargs = 1, default=None,
                      help = "Enter a gene list to filter through")
    parser.add_option("-o","--out", dest="out",nargs = 1, default=None,
                      help = "Enter an output folder. Default will be same folder as input file")
    parser.add_option("-w","--window", dest="window",nargs = 1, default=50000,
                      help = "Enter a search distance for genes. Default is 50,000bp")
    parser.add_option("-f","--format", dest="formatTable",action= "store_true", default=False,
                      help = "If flagged, maintains original formatting of input table")

    #RETRIEVING FLAGS
    (options,args) = parser.parse_args()


    if not options.input or not options.genome:

        parser.print_help()
        exit()

    #GETTING THE INPUT
    enhancerFile = options.input
    window = int(options.window)

    #making the out folder if it doesn't exist
    if options.out:
        outFolder = ROSE_utils.formatFolder(options.out,True)
    else:
        outFolder = '/'.join(enhancerFile.split('/')[0:-1]) + '/'


    #GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    #CHECK FORMATTING FLAG
    if options.formatTable:
        noFormatTable =True
    else:
        noFormatTable = False

    #GETTING THE CORRECT ANNOT FILE
    cwd = os.getcwd()
    genomeDict = {
        'HG18':'%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19':'%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10':'%s/annotation/mm10_refseq.ucsc' % (cwd),
        }

    annotFile = genomeDict[genome.upper()]

    #GETTING THE TRANSCRIBED LIST
    if options.geneList:

        transcribedFile = options.geneList
    else:
        transcribedFile = ''

    enhancerToGeneTable,geneToEnhancerTable = mapEnhancerToGene(annotFile,enhancerFile,transcribedFile,True,window,noFormatTable)

    #Writing enhancer output
    enhancerFileName = enhancerFile.split('/')[-1].split('.')[0]

    if window != 50000:
        #writing the enhancer table
        out1 = '%s%s_ENHANCER_TO_GENE_%sKB.txt' % (outFolder,enhancerFileName,window//1000)
        ROSE_utils.unParseTable(enhancerToGeneTable,out1,'\t')

        #writing the gene table
        out2 = '%s%s_GENE_TO_ENHANCER_%sKB.txt' % (outFolder,enhancerFileName,window//1000)
        ROSE_utils.unParseTable(geneToEnhancerTable,out2,'\t')
    else:
        #writing the enhancer table
        out1 = '%s%s_ENHANCER_TO_GENE.txt' % (outFolder,enhancerFileName)
        ROSE_utils.unParseTable(enhancerToGeneTable,out1,'\t')

        #writing the gene table
        out2 = '%s%s_GENE_TO_ENHANCER.txt' % (outFolder,enhancerFileName)
        ROSE_utils.unParseTable(geneToEnhancerTable,out2,'\t')
# Example 8
def main():
    from optparse import OptionParser
    usage = "usage: %prog [options] -b [SORTED BAMFILE] -i [INPUTFILE] -o [OUTPUTFILE]"
    parser = OptionParser(usage=usage)
    #required flags
    parser.add_option("-b",
                      "--bam",
                      dest="bam",
                      nargs=1,
                      default=None,
                      help="Enter .bam file to be processed.")
    parser.add_option(
        "-i",
        "--input",
        dest="input",
        nargs=1,
        default=None,
        help="Enter .gff or ENRICHED REGION file to be processed.")
    #output flag
    parser.add_option("-o",
                      "--output",
                      dest="output",
                      nargs=1,
                      default=None,
                      help="Enter the output filename.")
    #additional options
    parser.add_option(
        "-s",
        "--sense",
        dest="sense",
        nargs=1,
        default='both',
        help="Map to '+','-' or 'both' strands. Default maps to both.")

    parser.add_option(
        "-f",
        "--floor",
        dest="floor",
        nargs=1,
        default=0,
        help="Sets a read floor threshold necessary to count towards density")
    parser.add_option("-e",
                      "--extension",
                      dest="extension",
                      nargs=1,
                      default=200,
                      help="Extends reads by n bp. Default value is 200bp")
    parser.add_option("-r",
                      "--rpm",
                      dest="rpm",
                      action='store_true',
                      default=False,
                      help="Normalizes density to reads per million (rpm)")

    parser.add_option(
        "-m",
        "--matrix",
        dest="matrix",
        nargs=1,
        default=None,
        help=
        "Outputs a variable bin sized matrix. User must specify number of bins."
    )

    (options, args) = parser.parse_args()

    print(options)
    print(args)

    if options.bam:
        bamFile = options.bam
        fullPath = os.path.abspath(bamFile)
        bamName = fullPath.split('/')[-1].split('.')[0]
        pathFolder = '/'.join(fullPath.split('/')[0:-1])
        fileList = os.listdir(pathFolder)
        hasBai = False
        for fileName in fileList:
            if fileName.count(bamName) == 1 and fileName.count('.bai') == 1:
                hasBai = True

        if not hasBai:
            print(
                'ERROR: no associated .bai file found with bam. Must use a sorted bam with accompanying index file'
            )
            parser.print_help()
            exit()

    if options.sense:
        if ['+', '-', '.', 'both'].count(options.sense) == 0:
            print('ERROR: sense flag must be followed by +,-,.,both')
            parser.print_help()
            exit()

    if options.matrix:
        try:
            int(options.matrix)
        except:
            print(
                'ERROR: User must specify an integer bin number for matrix (try 50)'
            )
            parser.print_help()
            exit()

    if options.input and options.bam:
        inputFile = options.input
        gffFile = inputFile

        bamFile = options.bam

        if options.output == None:
            output = os.getcwd() + '/' + inputFile.split('/')[-1] + '.mapped'
        else:
            output = options.output
        if options.matrix:
            print('mapping to GFF and making a matrix with fixed bin number')

            newGFF = mapBamToGFF(bamFile, gffFile, options.sense,
                                 int(options.extension), options.floor,
                                 options.rpm, options.matrix)

        ROSE_utils.unParseTable(newGFF, output, '\t')
    else:
        parser.print_help()
def main():
    '''
    main run call
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_ENHANCER_FILE]"
    parser = OptionParser(usage=usage)
    #required flags
    parser.add_option(
        "-i",
        "--i",
        dest="input",
        nargs=1,
        default=None,
        help="Enter a ROSE ranked enhancer or super-enhancer file")
    parser.add_option("-g",
                      "--genome",
                      dest="genome",
                      nargs=1,
                      default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    #optional flags
    parser.add_option("-l",
                      "--list",
                      dest="geneList",
                      nargs=1,
                      default=None,
                      help="Enter a gene list to filter through")
    parser.add_option(
        "-o",
        "--out",
        dest="out",
        nargs=1,
        default=None,
        help="Enter an output folder. Default will be same folder as input file"
    )
    parser.add_option(
        "-w",
        "--window",
        dest="window",
        nargs=1,
        default=50000,
        help="Enter a search distance for genes. Default is 50,000bp")
    parser.add_option(
        "-f",
        "--format",
        dest="formatTable",
        action="store_true",
        default=False,
        help="If flagged, maintains original formatting of input table")

    #RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.genome:

        parser.print_help()
        exit()

    #GETTING THE INPUT
    enhancerFile = options.input
    window = int(options.window)

    #making the out folder if it doesn't exist
    if options.out:
        outFolder = ROSE_utils.formatFolder(options.out, True)
    else:
        outFolder = '/'.join(enhancerFile.split('/')[0:-1]) + '/'

    #GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    #CHECK FORMATTING FLAG
    if options.formatTable:
        noFormatTable = True
    else:
        noFormatTable = False

    #GETTING THE CORRECT ANNOT FILE
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }

    annotFile = genomeDict[genome.upper()]

    #GETTING THE TRANSCRIBED LIST
    if options.geneList:

        transcribedFile = options.geneList
    else:
        transcribedFile = ''

    enhancerToGeneTable, geneToEnhancerTable = mapEnhancerToGene(
        annotFile, enhancerFile, transcribedFile, True, window, noFormatTable)

    #Writing enhancer output
    enhancerFileName = enhancerFile.split('/')[-1].split('.')[0]

    if window != 50000:
        #writing the enhancer table
        out1 = '%s%s_ENHANCER_TO_GENE_%sKB.txt' % (outFolder, enhancerFileName,
                                                   window // 1000)
        ROSE_utils.unParseTable(enhancerToGeneTable, out1, '\t')

        #writing the gene table
        out2 = '%s%s_GENE_TO_ENHANCER_%sKB.txt' % (outFolder, enhancerFileName,
                                                   window // 1000)
        ROSE_utils.unParseTable(geneToEnhancerTable, out2, '\t')
    else:
        #writing the enhancer table
        out1 = '%s%s_ENHANCER_TO_GENE.txt' % (outFolder, enhancerFileName)
        ROSE_utils.unParseTable(enhancerToGeneTable, out1, '\t')

        #writing the gene table
        out2 = '%s%s_GENE_TO_ENHANCER.txt' % (outFolder, enhancerFileName)
        ROSE_utils.unParseTable(geneToEnhancerTable, out2, '\t')
# Example 10
def main():
    '''
    main run call
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    #required flags
    parser.add_option(
        "-i",
        "--i",
        dest="input",
        nargs=1,
        default=None,
        help="Enter a .gff or .bed file of binding sites used to make enhancers"
    )
    parser.add_option("-r",
                      "--rankby",
                      dest="rankby",
                      nargs=1,
                      default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o",
                      "--out",
                      dest="out",
                      nargs=1,
                      default=None,
                      help="Enter an output folder")
    parser.add_option(
        "-g",
        "--genome",
        dest="genome",
        nargs=1,
        default=None,
        help="Enter the genome build (MM9,MM8,HG18,HG19,MM10,HG38)")

    #optional flags
    parser.add_option(
        "-b",
        "--bams",
        dest="bams",
        nargs=1,
        default=None,
        help="Enter a comma separated list of additional bam files to map to")
    parser.add_option("-c",
                      "--control",
                      dest="control",
                      nargs=1,
                      default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-s",
                      "--stitch",
                      dest="stitch",
                      nargs=1,
                      default=12500,
                      help="Enter a max linking distance for stitching")
    parser.add_option(
        "-t",
        "--tss",
        dest="tss",
        nargs=1,
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    #RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('hi there')
        parser.print_help()
        exit()

    #making the out folder if it doesn't exist
    outFolder = ROSE_utils.formatFolder(options.out, True)

    #figuring out folder schema
    gffFolder = ROSE_utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = ROSE_utils.formatFolder(outFolder + 'mappedGFF/', True)

    #GETTING INPUT FILE
    if options.input.split('.')[-1] == 'bed':
        #CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        ROSE_utils.bedToGFF(options.input, inputGFFFile)
    elif options.input.split('.')[-1] == 'gff':
        #COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    else:
        print(
            'WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT'
        )
        #COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    #GETTING THE LIST OF BAMFILES TO PROCESS
    if options.control:
        bamFileList = [options.rankby, options.control]

    else:
        bamFileList = [options.rankby]

    if options.bams:
        bamFileList += options.bams.split(',')
        bamFileList = ROSE_utils.uniquify(bamFileList)
    #optional args

    #Stitch parameter
    stitchWindow = int(options.stitch)

    #tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False

    #GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    #GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    #GETTING THE CORRECT ANNOT FILE
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
        'HG38': '%s/annotation/hg38_refseq.ucsc' % (cwd),
    }

    annotFile = genomeDict[genome.upper()]

    #MAKING THE START DICT
    print('MAKING START DICT')
    startDict = ROSE_utils.makeStartDict(annotFile)

    #LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = ROSE_utils.gffToLocusCollection(inputGFFFile)

    #NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput = regionStitching(inputGFFFile,
                                                      stitchWindow, tssWindow,
                                                      annotFile, removeTSS)

    #NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = ROSE_utils.locusCollectionToGFF(stitchedCollection)

    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName,
                                                      stitchWindow // 1000)
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchWindow // 1000)
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName,
                                                     stitchWindow // 1000)
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (
            gffFolder, inputName, stitchWindow // 1000)
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName,
                                                           stitchWindow // 1000)
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (
            gffFolder, inputName, stitchWindow // 1000)

    #WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        ROSE_utils.unParseTable(debugOutput, debugOutFile, '\t')

    #WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    ROSE_utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    #SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'

    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    #MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    #MAPPING TO THE STITCHED GFF

    # bin for bam mapping
    nBin = 1

    #IMPORTANT
    #CHANGE cmd1 and cmd2 TO PARALLELIZE OUTPUT FOR BATCH SUBMISSION
    #e.g. if using LSF cmd1 = "bsub python bamToGFF.py -f 1 -e 200 -r -m %s -b %s -i %s -o %s" % (nBin,bamFile,stitchedGFFFile,mappedOut1)

    for bamFile in bamFileList:

        bamFileName = bamFile.split('/')[-1]

        #MAPPING TO THE STITCHED GFF
        mappedOut1 = '%s%s_%s_MAPPED.gff' % (mappedFolder, stitchedGFFName,
                                             bamFileName)
        #WILL TRY TO RUN AS A BACKGROUND PROCESS. BATCH SUBMIT THIS LINE TO IMPROVE SPEED
        cmd1 = "python ROSE_bamToGFF_turbo.py -e 200 -r -m %s -b %s -i %s -o %s &" % (
            nBin, bamFile, stitchedGFFFile, mappedOut1)
        print(cmd1)
        os.system(cmd1)

        #MAPPING TO THE ORIGINAL GFF
        mappedOut2 = '%s%s_%s_MAPPED.gff' % (mappedFolder, inputName,
                                             bamFileName)
        #WILL TRY TO RUN AS A BACKGROUND PROCESS. BATCH SUBMIT THIS LINE TO IMPROVE SPEED
        cmd2 = "python ROSE_bamToGFF_turbo.py 1 -e 200 -r -m %s -b %s -i %s -o %s &" % (
            nBin, bamFile, inputGFFFile, mappedOut2)
        print(cmd2)
        os.system(cmd2)

    print('PAUSING TO MAP')
    time.sleep(10)

    #CHECK FOR MAPPING OUTPUT
    outputDone = False
    ticker = 0
    print('WAITING FOR MAPPING TO COMPLETE. ELAPSED TIME (MIN):')
    while not outputDone:
        '''
        check every 1 minutes for completed output
        '''
        outputDone = True
        if ticker % 6 == 0:
            print(ticker * 5)
        ticker += 1
        #CHANGE THIS PARAMETER TO ALLOW MORE TIME TO MAP
        if ticker == 120:
            print(
                'ERROR: OPERATION TIME OUT. MAPPING OUTPUT NOT DETECTED AFTER 2 HOURS'
            )
            exit()
            break
        for bamFile in bamFileList:

            #GET THE MAPPED OUTPUT NAMES HERE FROM MAPPING OF EACH BAMFILE
            bamFileName = bamFile.split('/')[-1]
            mappedOut1 = '%s%s_%s_MAPPED.gff' % (mappedFolder, stitchedGFFName,
                                                 bamFileName)

            try:
                mapFile = open(mappedOut1, 'r')
                mapFile.close()
            except IOError:
                outputDone = False

            mappedOut2 = '%s%s_%s_MAPPED.gff' % (mappedFolder, inputName,
                                                 bamFileName)

            try:
                mapFile = open(mappedOut2, 'r')
                mapFile.close()
            except IOError:
                outputDone = False
        if outputDone == True:
            break
        time.sleep(60)
    print('MAPPING TOOK %s MINUTES' % (ticker))

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    #CALCULATE DENSITY BY REGION
    mapCollection(stitchedCollection,
                  referenceCollection,
                  bamFileList,
                  mappedFolder,
                  outputFile1,
                  refName=stitchedGFFName)

    time.sleep(10)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    if options.control:

        rankbyName = options.rankby.split('/')[-1]
        controlName = options.control.split('/')[-1]
        cmd = 'R --no-save %s %s %s %s < ROSE_callSuper.R' % (
            outFolder, outputFile1, inputName, controlName)

    else:
        rankbyName = options.rankby.split('/')[-1]
        controlName = 'NONE'
        cmd = 'R --no-save %s %s %s %s < ROSE_callSuper.R' % (
            outFolder, outputFile1, inputName, controlName)
    print(cmd)
    os.system(cmd)

    #calling the gene mapper
    time.sleep(20)
    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)
    cmd = "python ROSE_geneMapper.py -g %s -i %s%s" % (genome, outFolder,
                                                       superTableFile)
    os.system(cmd)
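
# Example invocation (illustrative; the script name ROSE_main.py and file names are assumptions,
# the flags are the ones defined by the parser above):
#   python ROSE_main.py -g HG19 -i peaks.gff -r ranking.bam -o rose_out/ -s 12500 -t 2500
# -s sets the maximum stitching distance and -t the TSS exclusion window (0 disables TSS exclusion).
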
def mapBamToGFF(bamFile,gff,sense = 'both',extension = 200,floor = 0,rpm = False,matrix = None):

#def mapBamToGFF(bamFile,gff,sense = 'both',unique = 0,extension = 200,floor = 0,density = False,rpm = False,binSize = 25,clusterGram = None,matrix = None,raw = False,includeJxnReads = False):
    '''maps reads from a bam to a gff'''
    floor = int(floor)
    
    #USING BAM CLASS
    bam = ROSE_utils.Bam(bamFile)


    #new GFF to write to
    newGFF = []
    #millionMappedReads


    if rpm:    
        MMR= round(float(bam.getTotalReads('mapped'))/1000000,4)
    else:
        MMR = 1

    print('using a MMR value of %s' % (MMR))
    
    senseTrans = maketrans('-+.','+-+')

    if ROSE_utils.checkChrStatus(bamFile) == 1:
      print "has chr"
      hasChrFlag = 1
      #sys.exit();
    else:
      print "does not have chr"
      hasChrFlag = 0
      #sys.exit()
      
    if type(gff) == str:
        gff = ROSE_utils.parseTable(gff,'\t')
        
    #setting up a matrix table

    newGFF.append(['GENE_ID','locusLine'] + ['bin_'+str(n)+'_'+bamFile.split('/')[-1] for n in range(1,int(matrix)+1,1)])        

    #getting and processing reads for gff lines
    ticker = 0
    print('Number lines processed')
    for line in gff:
        line = line[0:9]
        if ticker%100 == 0:
            print ticker
        ticker+=1
        if not hasChrFlag:
            line[0] = re.sub(r"chr",r"",line[0])
        gffLocus = ROSE_utils.Locus(line[0],float(line[3]),float(line[4]),line[6],line[1])
        #print line[0]
        #sys.exit()
        searchLocus = ROSE_utils.makeSearchLocus(gffLocus,int(extension),int(extension))
        
        reads = bam.getReadsLocus(searchLocus,'both',False,'none')
        #now extend the reads and make a list of extended reads
        extendedReads = []
        for locus in reads:
            if locus.sense() == '+' or locus.sense() == '.':
                locus = ROSE_utils.Locus(locus.chr(),locus.start(),locus.end()+extension,locus.sense(), locus.ID())
            if locus.sense() == '-':
                locus = ROSE_utils.Locus(locus.chr(),locus.start()-extension,locus.end(),locus.sense(),locus.ID())
            extendedReads.append(locus)
        if gffLocus.sense() == '+' or gffLocus.sense() == '.':
            senseReads = filter(lambda x:x.sense() == '+' or x.sense() == '.',extendedReads)
            antiReads = filter(lambda x:x.sense() == '-',extendedReads)
        else:
            senseReads = filter(lambda x:x.sense() == '-' or x.sense() == '.',extendedReads)
            antiReads = filter(lambda x:x.sense() == '+',extendedReads)

        senseHash = defaultdict(int)
        antiHash = defaultdict(int)

        #filling in the readHashes             
        if sense == '+' or sense == 'both' or sense =='.':
            for read in senseReads:
                for x in range(read.start(),read.end()+1,1):
                    senseHash[x]+=1
        if sense == '-' or sense == 'both' or sense == '.':
            #print('foo')
            for read in antiReads:
                for x in range(read.start(),read.end()+1,1):
                    antiHash[x]+=1

        #now apply flooring and filtering for coordinates
        keys = ROSE_utils.uniquify(senseHash.keys()+antiHash.keys())
        if floor > 0:

            keys = filter(lambda x: (senseHash[x]+antiHash[x]) > floor,keys)
        #coordinate filtering
        keys = filter(lambda x: gffLocus.start() < x < gffLocus.end(),keys)


        #setting up the output table
        clusterLine = [gffLocus.ID(),gffLocus.__str__()]

        #getting the binsize
        binSize = (gffLocus.len()-1)/int(matrix)
        nBins = int(matrix)
        if binSize == 0:
            clusterLine+=['NA']*int(matrix)
            newGFF.append(clusterLine)
            continue
        n=0
        if gffLocus.sense() == '+' or gffLocus.sense() =='.' or gffLocus.sense() == 'both':
            i = gffLocus.start()

            while n <nBins:
                n+=1
                binKeys = filter(lambda x: i < x < i+binSize,keys)
                binDen = float(sum([senseHash[x]+antiHash[x] for x in binKeys]))/binSize
                clusterLine+=[round(binDen/MMR,4)]
                i = i+binSize
        else:
            i = gffLocus.end()
            while n < nBins:
                n+=1
                binKeys = filter(lambda x: i-binSize < x < i,keys)
                binDen = float(sum([senseHash[x]+antiHash[x] for x in binKeys]))/binSize
                clusterLine+=[round(binDen/MMR,4)]
                i = i-binSize
        newGFF.append(clusterLine)
        
            
    return newGFF
def main():
    from optparse import OptionParser
    usage = "usage: %prog [options] -b [SORTED BAMFILE] -i [INPUTFILE] -o [OUTPUTFILE]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-b","--bam", dest="bam",nargs = 1, default=None,
                      help = "Enter .bam file to be processed.")
    parser.add_option("-i","--input", dest="input",nargs = 1, default=None,
                      help = "Enter .gff or ENRICHED REGION file to be processed.")
    #output flag
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the output filename.")
    #additional options
    parser.add_option("-s","--sense", dest="sense",nargs = 1, default='both',
                      help = "Map to '+','-' or 'both' strands. Default maps to both.")


    parser.add_option("-f","--floor", dest="floor",nargs =1, default=0,
                      help = "Sets a read floor threshold necessary to count towards density")    
    parser.add_option("-e","--extension", dest="extension",nargs = 1, default=200,
                      help = "Extends reads by n bp. Default value is 200bp")
    parser.add_option("-r","--rpm", dest="rpm",action = 'store_true', default=False,
                      help = "Normalizes density to reads per million (rpm)")


    parser.add_option("-m","--matrix", dest="matrix",nargs = 1, default=None,
                      help = "Outputs a variable bin sized matrix. User must specify number of bins.")

    (options,args) = parser.parse_args()

    print(options)
    print(args)

    if options.bam:
        bamFile = options.bam
        fullPath = os.path.abspath(bamFile)
        bamName = fullPath.split('/')[-1].split('.')[0]
        pathFolder = '/'.join(fullPath.split('/')[0:-1])
        fileList = os.listdir(pathFolder)
        hasBai = False
        for fileName in fileList:
            if fileName.count(bamName) == 1 and fileName.count('.bai') == 1:
                hasBai = True

        if not hasBai:
            print('ERROR: no associated .bai file found with bam. Must use a sorted bam with accompanying index file')
            parser.print_help()
            exit()
   
    if options.sense:
        if ['+','-','.','both'].count(options.sense) == 0:
            print('ERROR: sense flag must be followed by +,-,.,both')
            parser.print_help()
            exit()


    if options.matrix:
        try:
            int(options.matrix)
        except:
            print('ERROR: User must specify an integer bin number for matrix (try 50)')
            parser.print_help()
            exit()
            

    
    
    if options.input and options.bam:
        inputFile = options.input
        gffFile = inputFile

        bamFile = options.bam
        
        if options.output == None:
            output = os.getcwd() + '/' + inputFile.split('/')[-1]+'.mapped'
        else:
            output = options.output
        if options.matrix:
            print('mapping to GFF and making a matrix with fixed bin number')

            newGFF = mapBamToGFF(bamFile,gffFile,options.sense,int(options.extension),options.floor,options.rpm,options.matrix)

            
        ROSE_utils.unParseTable(newGFF,output,'\t')
    else:
        parser.print_help()
def mapEnhancerToGene(annotFile,enhancerFile,transcribedFile='',uniqueGenes=True,searchWindow =50000,noFormatTable = False):
    
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq
    '''
    startDict = ROSE_utils.makeStartDict(annotFile)
    enhancerTable = ROSE_utils.parseTable(enhancerFile,'\t')

    #internal parameter for debugging
    byRefseq = False


    if len(transcribedFile) > 0:
        transcribedTable = ROSE_utils.parseTable(transcribedFile,'\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = ROSE_utils.makeTranscriptCollection(annotFile,0,0,500,transcribedGenes)


    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(ROSE_utils.makeTSSLocus(geneID,startDict,0,0))


    #this turns the tssLoci list into a LocusCollection
    #50 is the internal parameter for LocusCollection and doesn't really matter
    tssCollection = ROSE_utils.LocusCollection(tssLoci,50)

    

    geneDict = {'overlapping':defaultdict(list),'proximal':defaultdict(list)}

    #dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict= defaultdict(list)

    #list of all genes that appear in this analysis
    overallGeneList = []

    if noFormatTable:
        #set up the output tables
        #first by enhancer
        enhancerToGeneTable = [enhancerTable[0]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE']]

        
    else:
        #set up the output tables
        #first by enhancer
        enhancerToGeneTable = [enhancerTable[0][0:9]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE'] + enhancerTable[5][-2:]]

        #next by gene
        geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS']]

    #next make the gene to enhancer table
    geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS','ENHANCER_RANKS','IS_SUPER']]

        


    for line in enhancerTable:
        if line[0][0] =='#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1],line[2],line[3])
        
        enhancerLocus = ROSE_utils.Locus(line[1],line[2],line[3],'.',line[0])

        #overlapping genes are transcribed genes whose transcript is directly in the stitchedLocus         
        overlappingLoci = transcribedCollection.getOverlap(enhancerLocus,'both')           
        overlappingGenes =[]
        for overlapLocus in overlappingLoci:                
            overlappingGenes.append(overlapLocus.ID())

        #proximalGenes are transcribed genes where the tss is within 50kb of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(ROSE_utils.makeSearchLocus(enhancerLocus,searchWindow,searchWindow),'both')           
        proximalGenes =[]
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())


        distalLoci = tssCollection.getOverlap(ROSE_utils.makeSearchLocus(enhancerLocus,1000000,1000000),'both')           
        distalGenes =[]
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

            
            
        overlappingGenes = ROSE_utils.uniquify(overlappingGenes)
        proximalGenes = ROSE_utils.uniquify(proximalGenes)
        distalGenes = ROSE_utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes
        #these checks make sure each gene list is unique.
        #technically it is possible for a gene to be overlapping, but not proximal since the
        #gene could be longer than the 50kb window, but we'll let that slide here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)


        #Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            #get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3]))/2

            #get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes]
            #get the ID and convert to name
            closestGene = startDict[allEnhancerGenes[distList.index(min(distList))]]['name']

        #NOW WRITE THE ROW FOR THE ENHANCER TABLE
        if noFormatTable:

            newEnhancerLine = list(line)
            newEnhancerLine.append(','.join(ROSE_utils.uniquify([startDict[x]['name'] for x in overlappingGenes])))
            newEnhancerLine.append(','.join(ROSE_utils.uniquify([startDict[x]['name'] for x in proximalGenes])))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:9]
            newEnhancerLine.append(','.join(ROSE_utils.uniquify([startDict[x]['name'] for x in overlappingGenes])))
            newEnhancerLine.append(','.join(ROSE_utils.uniquify([startDict[x]['name'] for x in proximalGenes])))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)
        #Now grab all overlapping and proximal genes for the gene ordered table

        overallGeneList +=overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))
            
        overallGeneList+=proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))



    #End loop through
    
    #Make table by gene
    overallGeneList = ROSE_utils.uniquify(overallGeneList)  

    #use enhancer rank to order
    rankOrder = ROSE_utils.order([min(rankDict[x]) for x in overallGeneList])
        
    usedNames = []
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if uniqueGenes and geneName in usedNames:
            continue
        usedNames.append(geneName)
        
        proxEnhancers = geneDict['overlapping'][refID]+geneDict['proximal'][refID]
        
        superStatus = max(superDict[refID])
        enhancerRanks = ','.join([str(x) for x in rankDict[refID]])
    
        newLine = [geneName,refID,','.join(proxEnhancers),enhancerRanks,superStatus]
        geneToEnhancerTable.append(newLine)

    #resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable,geneToEnhancerTable
    else:
        enhancerOrder = ROSE_utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i+1)])

        return sortedTable,geneToEnhancerTable
def mapCollection(stitchedCollection,referenceCollection,bamFileList,mappedFolder,output,refName):


    '''
    makes a table of factor density in a stitched locus and ranks table by number of loci stitched together
    '''

    
    print('FORMATTING TABLE')
    loci = stitchedCollection.getLoci()

    locusTable = [['REGION_ID','CHROM','START','STOP','NUM_LOCI','CONSTITUENT_SIZE']]

    lociLenList = []

    #strip out any that are in chrY
    for locus in list(loci):
        if locus.chr() == 'chrY':
            loci.remove(locus)
    
    for locus in loci:
        #numLociList.append(int(stitchLocus.ID().split('_')[1]))
        lociLenList.append(locus.len())
        #numOrder = order(numLociList,decreasing=True)
    lenOrder = ROSE_utils.order(lociLenList,decreasing=True)
    ticker = 0
    for i in lenOrder:
        ticker+=1
        if ticker%1000 ==0:
            print(ticker)
        locus = loci[i]

        #First get the size of the enriched regions within the stitched locus
        refEnrichSize = 0
        refOverlappingLoci = referenceCollection.getOverlap(locus,'both')
        for refLocus in refOverlappingLoci:
            refEnrichSize+=refLocus.len()

        try:
            stitchCount = int(locus.ID().split('_')[0])
        except ValueError:
            stitchCount = 1
        
        locusTable.append([locus.ID(),locus.chr(),locus.start(),locus.end(),stitchCount,refEnrichSize])
        
            

    print('GETTING MAPPED DATA')
    for bamFile in bamFileList:
        
        bamFileName = bamFile.split('/')[-1]

        print('GETTING MAPPING DATA FOR  %s' % bamFile)
        #assumes standard convention for naming enriched region gffs
        
        #opening up the mapped GFF
        print('OPENING %s%s_%s_MAPPED.gff' % (mappedFolder,refName,bamFileName))

        mappedGFF =ROSE_utils.parseTable('%s%s_%s_MAPPED.gff' % (mappedFolder,refName,bamFileName),'\t')        

        signalDict = defaultdict(float)
        print('MAKING SIGNAL DICT FOR %s' % (bamFile))
        mappedLoci = []
        for line in mappedGFF[1:]:

            chrom = line[1].split('(')[0]
            start = int(line[1].split(':')[-1].split('-')[0])
            end = int(line[1].split(':')[-1].split('-')[1])
            mappedLoci.append(ROSE_utils.Locus(chrom,start,end,'.',line[0]))
            try:
                signalDict[line[0]] = float(line[2])*(abs(end-start))
            except ValueError:
                print('WARNING NO SIGNAL FOR LINE:')
                print(line)
                continue
                
                
        
        mappedCollection = ROSE_utils.LocusCollection(mappedLoci,500)
        locusTable[0].append(bamFileName)

        for i in range(1,len(locusTable)):
            signal=0.0
            line = locusTable[i]
            lineLocus = ROSE_utils.Locus(line[1],line[2],line[3],'.')
            overlappingRegions = mappedCollection.getOverlap(lineLocus,sense='both')
            for region in overlappingRegions:
                signal+= signalDict[region.ID()]
            locusTable[i].append(signal)

    ROSE_utils.unParseTable(locusTable,output,'\t')
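
The signal dictionary built above assumes each data row of a *_MAPPED.gff file carries three columns: the region ID, a locus string of the form chrom(sense):start-end, and the read density reported for that region. A minimal sketch with a hypothetical row (values are made up, not part of the pipeline) shows how the coordinates and the total signal are recovered:

# hypothetical *_MAPPED.gff row: [REGION_ID, locusLine, density]
exampleRow = ['stitched_peak_1', 'chr1(.):1000-1500', '2.5']

chrom = exampleRow[1].split('(')[0]                       # 'chr1'
start = int(exampleRow[1].split(':')[-1].split('-')[0])   # 1000
end = int(exampleRow[1].split(':')[-1].split('-')[1])     # 1500
signal = float(exampleRow[2]) * abs(end - start)          # 2.5 density * 500 bp = 1250.0
print(chrom, start, end, signal)
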
Beispiel #15
0
def main():
    '''
    main run call
    '''
    debug = False


    from optparse import OptionParser
    usage = "usage: %prog [options] -g [INPUT_GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-i","--i", dest="input",nargs = 1, default=None,
                      help = "Enter a .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r","--rankby", dest="rankby",nargs = 1, default=None,
                      help = "bamfile to rank enhancer by")
    parser.add_option("-o","--out", dest="out",nargs = 1, default=None,
                      help = "Enter an output folder")
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
                      help = "Reference genome file: example- hg18_refseq.ucsc")
    
    #optional flags
    parser.add_option("-b","--bams", dest="bams",nargs = 1, default=None,
                      help = "Enter a comma separated list of additional bam files to map to")
    parser.add_option("-c","--control", dest="control",nargs = 1, default=None,
                      help = "bamfile to rank enhancer by")
    parser.add_option("-s","--stitch", dest="stitch",nargs = 1, default=12500,
                      help = "Enter a max linking distance for stitching")
    parser.add_option("-t","--tss", dest="tss",nargs = 1, default=0,
                      help = "Enter a distance from TSS to exclude. 0 = no TSS exclusion")




    #RETRIEVING FLAGS
    (options,args) = parser.parse_args()


    if not options.input or not options.rankby or not options.out or not options.genome:
        print('ERROR: MISSING REQUIRED FLAGS. SEE USAGE BELOW')
        parser.print_help()
        exit()

    #making the out folder if it doesn't exist
    outFolder = ROSE_utils.formatFolder(options.out,True)

    
    #figuring out folder schema
    gffFolder = ROSE_utils.formatFolder(outFolder+'gff/',True)
    mappedFolder = ROSE_utils.formatFolder(outFolder+ 'mappedGFF/',True)


    #GETTING INPUT FILE
    if options.input.split('.')[-1] == 'bed':
        #CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]
        inputGFFFile = '%s%s.gff' % (gffFolder,inputGFFName)
        ROSE_utils.bedToGFF(options.input,inputGFFFile)
    elif options.input.split('.')[-1] =='gff':
        #COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile,gffFolder))        

    else:
        print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
        #COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile,gffFolder))        



    #GETTING THE LIST OF BAMFILES TO PROCESS
    if options.control:        
        bamFileList = [options.rankby,options.control]

    else:
        bamFileList = [options.rankby]

    if options.bams:
        bamFileList += options.bams.split(',')
        bamFileList = ROSE_utils.uniquify(bamFileList)
    #optional args

    #Stitch parameter
    stitchWindow = int(options.stitch)
    
    #tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False

    #GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]


    #GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)
    

    #GETTING THE CORRECT ANNOT FILE
    cwd = os.getcwd()
##    genomeDict = {
##        'HG18':'%s/annotation/hg18_refseq.ucsc' % (cwd),
##        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
##        'HG19':'%s/annotation/hg19_refseq.ucsc' % (cwd),
##        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
##        'MM10':'%s/annotation/mm10_refseq.ucsc' % (cwd),
##        }
    
    annotFile = genome

##    annotFile = genomeDict[upper(genome)]

    #MAKING THE START DICT
    print('MAKING START DICT')
    startDict = ROSE_utils.makeStartDict(annotFile)


    #LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = ROSE_utils.gffToLocusCollection(inputGFFFile)

    #CHECKING INPUT REGIONS FOR FORMATTING
    print('CHECKING INPUT TO MAKE SURE EACH REGION HAS A UNIQUE IDENTIFIER')
    checkRefCollection(referenceCollection) #makes sure that all input regions have a unique ID

    #NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection,debugOutput = regionStitching(inputGFFFile,stitchWindow,tssWindow,annotFile,removeTSS)

    
    #NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF=ROSE_utils.locusCollectionToGFF(stitchedCollection)
    
    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder,inputName,stitchWindow//1000)
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName,stitchWindow//1000)
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder,inputName,stitchWindow//1000)
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (gffFolder,inputName,stitchWindow//1000)
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName,stitchWindow//1000)
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (gffFolder,inputName,stitchWindow//1000)

    #WRITING DEBUG OUTPUT TO DISK
        
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        ROSE_utils.unParseTable(debugOutput,debugOutFile,'\t')

    #WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    ROSE_utils.unParseTable(stitchedGFF,stitchedGFFFile,'\t')



    #SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'

    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))
    
    #MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    #MAPPING TO THE STITCHED GFF


    # bin for bam mapping
    nBin =1

    #IMPORTANT
    #CHANGE cmd1 and cmd2 TO PARALLELIZE OUTPUT FOR BATCH SUBMISSION
    #e.g. if using LSF cmd1 = "bsub python bamToGFF.py -f 1 -e 200 -r -m %s -b %s -i %s -o %s" % (nBin,bamFile,stitchedGFFFile,mappedOut1)

    for bamFile in bamFileList:

        bamFileName = bamFile.split('/')[-1]

        #MAPPING TO THE STITCHED GFF
        mappedOut1 ='%s%s_%s_MAPPED.gff' % (mappedFolder,stitchedGFFName,bamFileName)
        #WILL TRY TO RUN AS A BACKGROUND PROCESS. BATCH SUBMIT THIS LINE TO IMPROVE SPEED
        cmd1 = "python /usr/local/'bin'/ROSE_bamToGFF.py -f 1 -e 200 -r -m %s -b %s -i %s -o %s &" % (nBin,bamFile,stitchedGFFFile,mappedOut1)
        print(cmd1)
        os.system(cmd1)

        #MAPPING TO THE ORIGINAL GFF
        mappedOut2 ='%s%s_%s_MAPPED.gff' % (mappedFolder,inputName,bamFileName)
        #WILL TRY TO RUN AS A BACKGROUND PROCESS. BATCH SUBMIT THIS LINE TO IMPROVE SPEED
        cmd2 = "python /usr/local/'bin'/ROSE_bamToGFF.py -f 1 -e 200 -r -m %s -b %s -i %s -o %s &" % (nBin,bamFile,inputGFFFile,mappedOut2)
        print(cmd2)
        os.system(cmd2)
        

    
    print('PAUSING TO MAP')
    time.sleep(10)

    #CHECK FOR MAPPING OUTPUT
    outputDone = False
    ticker = 0
    print('WAITING FOR MAPPING TO COMPLETE. ELAPSED TIME (MIN):')
    while not outputDone:

        '''
        check every 5 minutes for completed output
        '''
        outputDone = True
        if ticker%6 == 0:
            print(ticker*5)
        ticker +=1
        #CHANGE THIS PARAMETER TO ALLOW MORE TIME TO MAP
        if ticker == 144:
            print('ERROR: OPERATION TIME OUT. MAPPING OUTPUT NOT DETECTED')
            exit()
            break
        for bamFile in bamFileList:
            
            #GET THE MAPPED OUTPUT NAMES HERE FROM MAPPING OF EACH BAMFILE
            bamFileName = bamFile.split('/')[-1]
            mappedOut1 ='%s%s_%s_MAPPED.gff' % (mappedFolder,stitchedGFFName,bamFileName)

            try:
                 mapFile = open(mappedOut1,'r')
                 mapFile.close()
            except IOError:
                outputDone = False

            mappedOut2 ='%s%s_%s_MAPPED.gff' % (mappedFolder,inputName,bamFileName)
            
            try:
                mapFile = open(mappedOut2,'r')
                mapFile.close()
            except IOError:
                outputDone = False
        if outputDone == True:
            break
        time.sleep(300)
    print('MAPPING TOOK %s MINUTES' % (ticker*5))

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    #CALCULATE DENSITY BY REGION
    mapCollection(stitchedCollection,referenceCollection,bamFileList,mappedFolder,outputFile1,refName = stitchedGFFName)


    time.sleep(10)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')


    if options.control:

        rankbyName = options.rankby.split('/')[-1]
        controlName = options.control.split('/')[-1]
        cmd = 'R --no-save %s %s %s %s < /usr/local/bin/ROSE_callSuper.R' % (outFolder,outputFile1,inputName,controlName)

    else:
        rankbyName = options.rankby.split('/')[-1]
        controlName = 'NONE'
        cmd = 'R --no-save %s %s %s %s < /usr/local/bin/ROSE_callSuper.R' % (outFolder,outputFile1,inputName,controlName)
    print(cmd)
    os.system(cmd)
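
Given the option parser above, a typical invocation of this example (script and file names are illustrative assumptions, not taken from the original) would be: python ROSE_main.py -g annotation/hg19_refseq.ucsc -i peaks.gff -r H3K27ac.bam -c WCE.bam -o rose_out/ -s 12500 -t 2500. The rankby and control bams are mapped against both the stitched and the original regions, and ROSE_callSuper.R is then run on the resulting enhancer region map.
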
Beispiel #17
0
def mapCollection(stitchedCollection,referenceCollection,bamFileList,mappedFolder,output,refName):


	'''
	makes a table of factor density in a stitched locus and ranks table by number of loci stitched together
	'''


	print('FORMATTING TABLE')
	loci = stitchedCollection.getLoci()

	locusTable = [['REGION_ID','CHROM','START','STOP','NUM_LOCI','CONSTITUENT_SIZE']]

	lociLenList = []

	#strip out any that are in chrY
	for locus in list(loci):
		if locus.chr() == 'chrY':
			loci.remove(locus)

	for locus in loci:
		#numLociList.append(int(stitchLocus.ID().split('_')[1]))
		lociLenList.append(locus.len())
		#numOrder = order(numLociList,decreasing=True)
	lenOrder = ROSE_utils.order(lociLenList,decreasing=True)
	ticker = 0
	for i in lenOrder:
		ticker+=1
		if ticker%1000 ==0:
			print(ticker)
		locus = loci[i]

		#First get the size of the enriched regions within the stitched locus
		refEnrichSize = 0
		refOverlappingLoci = referenceCollection.getOverlap(locus,'both')
		for refLocus in refOverlappingLoci:
			refEnrichSize+=refLocus.len()

		try:
			stitchCount = int(locus.ID().split('_')[0])
		except ValueError:
			stitchCount = 1

		locusTable.append([locus.ID(),locus.chr(),locus.start(),locus.end(),stitchCount,refEnrichSize])



	print('GETTING MAPPED DATA')
	for bamFile in bamFileList:

		bamFileName = bamFile.split('/')[-1]

		print('GETTING MAPPING DATA FOR  %s' % bamFile)
		#assumes standard convention for naming enriched region gffs

		#opening up the mapped GFF
		print('OPENING %s%s_%s_MAPPED.gff' % (mappedFolder,refName,bamFileName))

		mappedGFF =ROSE_utils.parseTable('%s%s_%s_MAPPED.gff' % (mappedFolder,refName,bamFileName),'\t')

		signalDict = defaultdict(float)
		print('MAKING SIGNAL DICT FOR %s' % (bamFile))
		mappedLoci = []
		for line in mappedGFF[1:]:

			chrom = line[1].split('(')[0]
			start = int(line[1].split(':')[-1].split('-')[0])
			end = int(line[1].split(':')[-1].split('-')[1])
			mappedLoci.append(ROSE_utils.Locus(chrom,start,end,'.',line[0]))
			try:
				signalDict[line[0]] = float(line[2])*(abs(end-start))
			except ValueError:
				print('WARNING NO SIGNAL FOR LINE:')
				print(line)
				continue



		mappedCollection = ROSE_utils.LocusCollection(mappedLoci,500)
		locusTable[0].append(bamFileName)

		for i in range(1,len(locusTable)):
			signal=0.0
			line = locusTable[i]
			lineLocus = ROSE_utils.Locus(line[1],line[2],line[3],'.')
			overlappingRegions = mappedCollection.getOverlap(lineLocus,sense='both')
			for region in overlappingRegions:
				signal+= signalDict[region.ID()]
			locusTable[i].append(signal)

	ROSE_utils.unParseTable(locusTable,output,'\t')
def regionStitching(inputGFF,stitchWindow,tssWindow,annotFile,removeTSS=True):
    print('PERFORMING REGION STITCHING')
    #first have to turn bound region file into a locus collection

    #need to make sure this names correctly... each region should have a unique name
    boundCollection = ROSE_utils.gffToLocusCollection(inputGFF)

    debugOutput = []
    #filter out all bound regions that overlap the TSS of an ACTIVE GENE
    if removeTSS:
        #first make a locus collection of TSS
        startDict = ROSE_utils.makeStartDict(annotFile)

        #now makeTSS loci for active genes
        removeTicker=0
        #this loop makes a locus centered around +/- tssWindow of transcribed genes
        #then adds it to the list tssLoci
        tssLoci = []
        for geneID in startDict.keys():
            tssLoci.append(ROSE_utils.makeTSSLocus(geneID,startDict,tssWindow,tssWindow))


        #this turns the tssLoci list into a LocusCollection
        #50 is the internal parameter for LocusCollection and doesn't really matter
        tssCollection = ROSE_utils.LocusCollection(tssLoci,50)

        #gives all the loci in boundCollection
        boundLoci = boundCollection.getLoci()

        #this loop will check if each bound region is contained by the TSS exclusion zone
        #this will drop out a lot of the promoter only regions that are tiny
        #typical exclusion window is around 2kb
        for locus in boundLoci:
            if len(tssCollection.getContainers(locus,'both'))>0:
                
                #if true, the bound locus overlaps an active gene
                boundCollection.remove(locus)
                debugOutput.append([locus.__str__(),locus.ID(),'CONTAINED'])
                removeTicker+=1
        print('REMOVED %s LOCI BECAUSE THEY WERE CONTAINED BY A TSS' % (removeTicker))

    #boundCollection is now all enriched region loci that don't overlap an active TSS
    stitchedCollection = boundCollection.stitchCollection(stitchWindow,'both')

    if removeTSS:
        #now replace any stitched region that overlap 2 distinct genes
        #with the original loci that were there
        fixedLoci = []
        tssLoci = []
        for geneID in startDict.keys():
            tssLoci.append(ROSE_utils.makeTSSLocus(geneID,startDict,50,50))


        #this turns the tssLoci list into a LocusCollection
        #50 is the internal parameter for LocusCollection and doesn't really matter
        tssCollection = ROSE_utils.LocusCollection(tssLoci,50)
        removeTicker = 0
        originalTicker = 0
        for stitchedLocus in stitchedCollection.getLoci():
            overlappingTSSLoci = tssCollection.getOverlap(stitchedLocus,'both')
            tssNames = [startDict[tssLocus.ID()]['name'] for tssLocus in overlappingTSSLoci]
            tssNames = ROSE_utils.uniquify(tssNames)
            if len(tssNames) > 2:
            
                #stitchedCollection.remove(stitchedLocus)
                originalLoci = boundCollection.getOverlap(stitchedLocus,'both')
                originalTicker+=len(originalLoci)
                fixedLoci+=originalLoci
                debugOutput.append([stitchedLocus.__str__(),stitchedLocus.ID(),'MULTIPLE_TSS'])
                removeTicker+=1
            else:
                fixedLoci.append(stitchedLocus)

        print('REMOVED %s STITCHED LOCI BECAUSE THEY OVERLAPPED MULTIPLE TSSs' % (removeTicker))
        print('ADDED BACK %s ORIGINAL LOCI' % (originalTicker))
        fixedCollection = ROSE_utils.LocusCollection(fixedLoci,50)
        return fixedCollection,debugOutput
    else:
        return stitchedCollection,debugOutput
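
A minimal sketch of the TSS-exclusion test above (toy coordinates, assuming ROSE_utils behaves as it is used in this function): a bound region is dropped only when it is fully contained by the +/- tssWindow locus around an active TSS.

import ROSE_utils  # assumed importable, as in the examples above

tssLocus = ROSE_utils.Locus('chr1', 99000, 101000, '.', 'NM_000001')   # TSS at 100kb with a 1kb window on each side
tssCollection = ROSE_utils.LocusCollection([tssLocus], 50)

promoterPeak = ROSE_utils.Locus('chr1', 99500, 100500, '.', 'peak_1')  # fully inside the TSS locus
distalPeak = ROSE_utils.Locus('chr1', 150000, 151000, '.', 'peak_2')   # far from any TSS

for peak in [promoterPeak, distalPeak]:
    if len(tssCollection.getContainers(peak, 'both')) > 0:
        print('%s would be removed (contained by a TSS locus)' % peak.ID())
    else:
        print('%s would be kept for stitching' % peak.ID())
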
Beispiel #19
0
def main():
	'''
	main run call
	'''
	debug = False


	from optparse import OptionParser
	usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
	parser = OptionParser(usage = usage)
	#required flags
	parser.add_option("-i","--i", dest="input",nargs = 1, default=None,
						help = "Enter a .gff or .bed file of binding sites used to make enhancers")
	parser.add_option("-r","--rankby", dest="rankby",nargs = 1, default=None,
						help = "bamfile to rank enhancer by")
	parser.add_option("-o","--out", dest="out",nargs = 1, default=None,
						help = "Enter an output folder")
	parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
						help = "Enter the genome build (MM9,MM8,HG18,HG19)")

	#optional flags
	parser.add_option("-b","--bams", dest="bams",nargs = 1, default=None,
						help = "Enter a comma separated list of additional bam files to map to")
	parser.add_option("-c","--control", dest="control",nargs = 1, default=None,
						help = "bamfile to rank enhancer by")
	parser.add_option("-s","--stitch", dest="stitch",nargs = 1, default=12500,
						help = "Enter a max linking distance for stitching")
	parser.add_option("-t","--tss", dest="tss",nargs = 1, default=0,
						help = "Enter a distance from TSS to exclude. 0 = no TSS exclusion")




	#RETRIEVING FLAGS
	(options,args) = parser.parse_args()


	if not options.input or not options.rankby or not options.out or not options.genome:
		print('ERROR: MISSING REQUIRED FLAGS. SEE USAGE BELOW')
		parser.print_help()
		exit()

	#making the out folder if it doesn't exist
	outFolder = ROSE_utils.formatFolder(options.out,True)


	#figuring out folder schema
	gffFolder = ROSE_utils.formatFolder(outFolder+'gff/',True)
	mappedFolder = ROSE_utils.formatFolder(outFolder+ 'mappedGFF/',True)


	#GETTING INPUT FILE
	if options.input.split('.')[-1] == 'bed':
		#CONVERTING A BED TO GFF
		inputGFFName = options.input.split('/')[-1][0:-4]
		inputGFFFile = '%s%s.gff' % (gffFolder,inputGFFName)
		ROSE_utils.bedToGFF(options.input,inputGFFFile)
	elif options.input.split('.')[-1] =='gff':
		#COPY THE INPUT GFF TO THE GFF FOLDER
		inputGFFFile = options.input
		os.system('cp %s %s' % (inputGFFFile,gffFolder))

	else:
		print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
		#COPY THE INPUT GFF TO THE GFF FOLDER
		inputGFFFile = options.input
		os.system('cp %s %s' % (inputGFFFile,gffFolder))
Beispiel #20
0
def mapBamToGFF(bamFile,
                gff,
                sense='both',
                extension=200,
                floor=0,
                rpm=False,
                matrix=None):

    #def mapBamToGFF(bamFile,gff,sense = 'both',unique = 0,extension = 200,floor = 0,density = False,rpm = False,binSize = 25,clusterGram = None,matrix = None,raw = False,includeJxnReads = False):
    '''maps reads from a bam to a gff'''
    floor = int(floor)

    #USING BAM CLASS
    bam = ROSE_utils.Bam(bamFile)

    #new GFF to write to
    newGFF = []
    #millionMappedReads

    if rpm:
        MMR = round(float(bam.getTotalReads('mapped')) / 1000000, 4)
    else:
        MMR = 1

    print(('using a MMR value of %s' % (MMR)))

    #senseTrans = maketrans('-+.','+-+') #deprecated

    if ROSE_utils.checkChrStatus(bamFile) == 1:
        print("has chr")
        hasChrFlag = 1
        #sys.exit();
    else:
        print("does not have chr")
        hasChrFlag = 0
        #sys.exit()

    if type(gff) == str:
        gff = ROSE_utils.parseTable(gff, '\t')

    #setting up a maxtrix table

    newGFF.append(['GENE_ID', 'locusLine'] + [
        'bin_' + str(n) + '_' + bamFile.split('/')[-1]
        for n in range(1,
                       int(matrix) + 1, 1)
    ])

    #getting and processing reads for gff lines
    ticker = 0
    print('Number lines processed')
    for line in gff:
        line = line[0:9]
        if ticker % 100 == 0:
            print(ticker)
        ticker += 1
        if not hasChrFlag:
            line[0] = re.sub(r"chr", r"", line[0])
        gffLocus = ROSE_utils.Locus(line[0], int(line[3]), int(line[4]),
                                    line[6], line[1])
        #print line[0]
        #sys.exit()
        searchLocus = ROSE_utils.makeSearchLocus(gffLocus, int(extension),
                                                 int(extension))

        reads = bam.getReadsLocus(searchLocus, 'both', False, 'none')
        #now extend the reads and make a list of extended reads
        extendedReads = []
        for locus in reads:
            if locus.sense() == '+' or locus.sense() == '.':
                locus = ROSE_utils.Locus(locus.chr(), locus.start(),
                                         locus.end() + extension,
                                         locus.sense(), locus.ID())
            if locus.sense() == '-':
                locus = ROSE_utils.Locus(locus.chr(),
                                         locus.start() - extension,
                                         locus.end(), locus.sense(),
                                         locus.ID())
            extendedReads.append(locus)
        if gffLocus.sense() == '+' or gffLocus.sense() == '.':
            senseReads = [
                x for x in extendedReads
                if x.sense() == '+' or x.sense() == '.'
            ]
            antiReads = [x for x in extendedReads if x.sense() == '-']
        else:
            senseReads = [
                x for x in extendedReads
                if x.sense() == '-' or x.sense() == '.'
            ]
            antiReads = [x for x in extendedReads if x.sense() == '+']

        senseHash = defaultdict(int)
        antiHash = defaultdict(int)

        #filling in the readHashes
        if sense == '+' or sense == 'both' or sense == '.':
            for read in senseReads:
                for x in range(read.start(), read.end() + 1, 1):
                    senseHash[x] += 1
        if sense == '-' or sense == 'both' or sense == '.':
            #print('foo')
            for read in antiReads:
                for x in range(read.start(), read.end() + 1, 1):
                    antiHash[x] += 1

        #now apply flooring and filtering for coordinates
        keys = ROSE_utils.uniquify(
            list(senseHash.keys()) + list(antiHash.keys()))
        if floor > 0:

            keys = [x for x in keys if (senseHash[x] + antiHash[x]) > floor]
        #coordinate filtering
        keys = [x for x in keys if gffLocus.start() < x < gffLocus.end()]

        #setting up the output table
        clusterLine = [gffLocus.ID(), gffLocus.__str__()]

        #getting the binsize
        binSize = (gffLocus.len() - 1) // int(matrix)
        nBins = int(matrix)
        if binSize == 0:
            clusterLine += ['NA'] * int(matrix)
            newGFF.append(clusterLine)
            continue
        n = 0
        if gffLocus.sense() in ('+', '.', 'both'):
            i = gffLocus.start()

            while n < nBins:
                n += 1
                binKeys = [x for x in keys if i < x < i + binSize]
                binDen = float(
                    sum([senseHash[x] + antiHash[x]
                         for x in binKeys])) / binSize
                clusterLine += [round(binDen / MMR, 4)]
                i = i + binSize
        else:
            i = gffLocus.end()
            while n < nBins:
                n += 1
                binKeys = [x for x in keys if i - binSize < x < i]
                binDen = float(
                    sum([senseHash[x] + antiHash[x]
                         for x in binKeys])) / binSize
                clusterLine += [round(binDen / MMR, 4)]
                i = i - binSize
        newGFF.append(clusterLine)

    return newGFF
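
A minimal sketch of the binning arithmetic above, with made-up numbers: a 2501 bp locus split into matrix = 10 bins gives a binSize of (2501 - 1) // 10 = 250 bp, and each bin reports its summed read coverage divided by binSize and by the million-mapped-reads factor.

locusLen = 2501
matrix = 10
binSize = (locusLen - 1) // matrix   # 250 bp per bin
coverageInBin = 500                  # hypothetical summed read coverage (read-bp) in one bin
MMR = 25.0                           # hypothetical library of 25 million mapped reads
binDen = float(coverageInBin) / binSize
print(round(binDen / MMR, 4))        # 0.08 reads per bp per million mapped reads
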
Beispiel #21
0
def mapEnhancerToGene(annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000,
                      noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq
    '''
    startDict = ROSE_utils.makeStartDict(annotFile)
    enhancerTable = ROSE_utils.parseTable(enhancerFile, '\t')

    # internal parameter for debugging
    byRefseq = False

    if len(transcribedFile) > 0:
        transcribedTable = ROSE_utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = ROSE_utils.makeTranscriptCollection(annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(ROSE_utils.makeTSSLocus(geneID, startDict, 0, 0))

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really matter
    tssCollection = ROSE_utils.LocusCollection(tssLoci, 50)

    geneDict = {'overlapping': defaultdict(list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    # superDict= defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    if noFormatTable:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [['REGION_ID', 'OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]


    else:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [['CHROM','START','END','REGION_ID','OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE','enhancerRank']]

        # next by gene
        geneToEnhancerTable = [['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']]

    # next make the gene to enhancer table
    geneToEnhancerTable = [['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS']]

    for line in enhancerTable:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[0], line[1], line[2])

        enhancerLocus = ROSE_utils.Locus(line[0], line[1], line[2], '.', line[3])

        # overlapping genes are transcribed genes whose transcript is directly in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(enhancerLocus, 'both')
        overlappingGenes = []
        for overlapLocus in overlappingLoci:
            overlappingGenes.append(overlapLocus.ID())

        # proximalGenes are transcribed genes where the tss is within 50kb of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(ROSE_utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow),'both')
        proximalGenes = []
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())

        distalLoci = tssCollection.getOverlap(ROSE_utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = []
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

        overlappingGenes = ROSE_utils.uniquify(overlappingGenes)
        proximalGenes = ROSE_utils.uniquify(proximalGenes)
        distalGenes = ROSE_utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes
        # these checks make sure each gene list is unique.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the 50kb window, but we'll let that slide here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            enhancerCenter = (int(line[1]) + int(line[2])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        if noFormatTable:

            newEnhancerLine = list(line)
            newEnhancerLine.append(','.join(ROSE_utils.uniquify([startDict[x]['name'] for x in overlappingGenes])))
            newEnhancerLine.append(','.join(ROSE_utils.uniquify([startDict[x]['name'] for x in proximalGenes])))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:4]
            newEnhancerLine.append(','.join(ROSE_utils.uniquify([startDict[x]['name'] for x in overlappingGenes])))
            newEnhancerLine.append(','.join(ROSE_utils.uniquify([startDict[x]['name'] for x in proximalGenes])))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[4:5]

        enhancerToGeneTable.append(newEnhancerLine)
        # Now grab all overlapping and proximal genes for the gene ordered table

        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            if ',' in line[4]:
                rankDict[refID].append(int(line[4].split(',')[0]))
            else:
                rankDict[refID].append(int(line[4]))
                # superDict[refID].append(int(line[-1]))

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            if ',' in line[4]:
                rankDict[refID].append(int(line[4].split(',')[0]))
            else:
                rankDict[refID].append(int(line[4]))
                # superDict[refID].append(int(line[-1]))

    # End loop through

    # Make table by gene
    overallGeneList = ROSE_utils.uniquify(overallGeneList)

    # use enhancer rank to order
    rankOrder = ROSE_utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = []
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if uniqueGenes and geneName in usedNames:
            continue
        usedNames.append(geneName)

        proxEnhancers = geneDict['overlapping'][refID] + geneDict['proximal'][refID]

        # superStatus = max(superDict[refID])
        enhancerRanks = ','.join([str(x) for x in rankDict[refID]])

        newLine = [geneName, refID, ','.join(proxEnhancers), enhancerRanks]
        geneToEnhancerTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, geneToEnhancerTable
    else:
        tmp = []
        for line in enhancerToGeneTable[1:]:
            if ',' in line[-1]:
                tmp.append(int(line[-1].split(',')[0]))
            else:
                tmp.append(int(line[-1]))
        enhancerOrder = ROSE_utils.order(tmp)
        sortedTable = [enhancerToGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i + 1)])

        return sortedTable, geneToEnhancerTable
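
A minimal usage sketch for this variant (file names are hypothetical; ROSE_utils.unParseTable is the same tab-delimited table writer used elsewhere in these examples):

annotFile = 'annotation/hg19_refseq.ucsc'
enhancerFile = 'rose_out/enhancer_regions.txt'   # hypothetical table whose data rows are CHROM, START, END, REGION_ID, RANK

enhancerTable, geneTable = mapEnhancerToGene(annotFile, enhancerFile,
                                             uniqueGenes=True,
                                             searchWindow=50000,
                                             noFormatTable=False)

ROSE_utils.unParseTable(enhancerTable, 'rose_out/ENHANCER_TO_GENE.txt', '\t')
ROSE_utils.unParseTable(geneTable, 'rose_out/GENE_TO_ENHANCER.txt', '\t')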