def main():
    '''
    writes a bash script that sorts the two flanking bed files of the
    hard-coded sample and intersects each with the genome's genes.gtf

    nothing is executed here: the commands are only written, one per line,
    to <projectFolder>scripts/<sampleName>_geneIntersect.sh
    '''

    genome = 'hg19'
    sampleName = 'Mut'

    projectFolder = '/storage/goodell/projects/jmreyes/amish_ayala/'
    bedFolder = projectFolder + 'bed/'
    scripts = projectFolder + 'scripts/'
    # kept from the original setup; not referenced by the commands below
    canyonBed = bedFolder + 'canyon_Mut_sizeSelected.bed'

    geneGTF = '/storage/goodell/home/jmreyes/grail/genomes/Homo_sapiens/UCSC/%s/Annotation/Genes/genes.gtf' % (
        genome)

    # the two inputs differ only in the flanking direction
    flankingBeds = [
        'HG19_%s_1000extend_%sFlanking.bed' % (sampleName, side)
        for side in ('upstream', 'downstream')
    ]

    # every script line is a one-element row so unParseTable can write it
    scriptLines = [['#!/usr/bin/bash']]
    for bedIn in flankingBeds:
        bedName = bedIn.split('/')[-1].split('.bed')[0]
        sortedOut = bedFolder + bedName + '.sorted.bed'
        intersectOut = bedFolder + bedName + '_geneIntersect.bed'

        # sort by chromosome, then numerically by start
        scriptLines.append(
            ['sort -k1,1 -k2,2n %s > %s' % (bedFolder + bedIn, sortedOut)])

        # -d makes bedtools closest report the distance to the hit
        scriptLines.append([
            'bedtools closest -d -a %s -b %s > %s' % (sortedOut, geneGTF,
                                                      intersectOut)
        ])

    scriptPath = scripts + sampleName + '_geneIntersect.sh'
    utils.unParseTable(scriptLines, scriptPath, '\t')
def makeFoldTable(annotFile,analysisName,testName,controlName,testMMR,controlMMR,testIdxFile,controlIdxFile,outputFolder,epsilon = 1):

    '''
    makes the per-guide fold table and writes it to
    <outputFolder><analysisName>_log2Ratio.txt, returning that path

    columns are guideID, gene name, log2(test/control), then the two
    normalized counts (rounded to 4 places) under testName and controlName
    each normalized count is (raw count / MMR) + epsilon
    rows are written in the order of the test index file (no sorting is done)
    row i of the control index is paired with row i of the test index
    '''

    # only the guide -> gene mapping is needed from the annotation
    guideDict,_geneDict = makeAnnotDict(annotFile)

    testRows = utils.parseTable(testIdxFile,'\t')
    controlRows = utils.parseTable(controlIdxFile,'\t')

    foldTable = [['GUIDE_ID','GENE','LOG2_RATIO',testName,controlName]]
    for rowNumber,testRow in enumerate(testRows):

        guideID = testRow[0]
        gene = guideDict[guideID]

        # raw counts sit in the third column of each index table
        testCount = float(testRow[2])/testMMR + epsilon
        controlCount = float(controlRows[rowNumber][2])/controlMMR + epsilon

        foldTable.append([
            guideID,
            gene,
            numpy.log2(testCount/controlCount),
            round(testCount,4),
            round(controlCount,4),
        ])

    outputFile = '%s%s_log2Ratio.txt' % (outputFolder,analysisName)
    utils.unParseTable(foldTable,outputFile,'\t')
    return outputFile
Beispiel #3
0
def buildGraph(edgeDict,
               gene_to_enhancer_dict,
               output_folder,
               analysis_name,
               cutoff=1):
    '''
    from the collapsed edge dictionary, build a target graph
    require at least n motifs to constitute an edge where n is set by cutoff.
    default is 1

    edgeDict is keyed source -> target -> list of edge loci
    graph edges are only added between nodes that are both keys of edgeDict
    every edge locus (TF target or not) is written to
    <output_folder><analysis_name>_EDGE_TABLE.txt
    returns the networkx DiGraph
    '''

    # sorted() instead of keys().sort() so this also runs where keys() is a view
    node_list = sorted(edgeDict.keys())
    # set copy gives constant-time "is this target a TF node" checks
    node_set = set(node_list)

    #this is only edges between TFs
    graph = nx.DiGraph(name=analysis_name)
    graph.add_nodes_from(node_list)

    #this stores ALL edges identified by motifs
    edge_table = [[
        'SOURCE', 'TARGET', 'CHROM', 'START', 'STOP', 'REGION_ID',
        'TF_INTERACTION'
    ]]
    edge_output = '%s%s_EDGE_TABLE.txt' % (output_folder, analysis_name)

    for source in node_list:
        print(source)
        for target in sorted(edgeDict[source].keys()):

            #now we need to see which target regions this guy overlaps
            target_regions = gene_to_enhancer_dict[target]
            target_collection = utils.LocusCollection(target_regions, 50)

            #get the edges hitting that target
            edgeLoci = edgeDict[source][target]

            target_is_tf = target in node_set
            tf_interaction = 1 if target_is_tf else 0

            #only add to the graph if this is a TF/TF interaction
            if target_is_tf and len(edgeLoci) >= cutoff:
                graph.add_edge(source, target)

            #now for each edge, add to the table
            for edgeLocus in edgeLoci:
                #comma separated IDs of the target regions this edge overlaps
                regionString = ','.join([
                    locus.ID()
                    for locus in target_collection.getOverlap(edgeLocus)
                ])
                edge_table.append([
                    source, target,
                    edgeLocus.chr(),
                    edgeLocus.start(),
                    edgeLocus.end(), regionString, tf_interaction
                ])

    utils.unParseTable(edge_table, edge_output, '\t')
    return graph
Beispiel #4
0
def findValleys(TFtoEnhancerDict, bamFile, projectName, projectFolder, cutoff = 0.2):
    '''
    takes in the super dict (gene -> list of enhancer regions)
    scores every region with scoreValley and keeps each 10 unit window whose
    score is above cutoff, then stitches the windows of a gene together
    writes the stitched valleys of all genes to
    <projectFolder><projectName>_valleys.bed and returns that path
    '''

    print('IDENTIFYING VALLEYS IN SUPER ENHANCERS')

    valleyBED = []

    for gene in TFtoEnhancerDict.keys():
        print(gene)

        geneValleys = []
        for region in TFtoEnhancerDict[gene]:
            scoreArray = scoreValley(region, bamFile, projectName, projectFolder)
            for index,score in enumerate(scoreArray):
                if score > cutoff:
                    # score index i covers [start + 10*i, start + 10*(i+1))
                    valley = utils.Locus(region.chr(), region.start() + index*10,
                                         region.start() + (index+1)*10, '.')
                    geneValleys.append(valley)

        # adjacent windows are merged once per gene, after all regions are scored
        for valley in stitchValleys(geneValleys):
            valleyBED.append([valley.chr(), valley.start(), valley.end()])

    bedfilename = projectFolder + projectName + '_valleys.bed'
    utils.unParseTable(valleyBED, bedfilename, '\t')
    print(bedfilename)

    return bedfilename
Beispiel #5
0
def makeEnhancerSignalTable(mergedRegionMap,medianDict,analysisName,genome,outputFolder):

    '''
    makes a table where each row is an enhancer and each column is the
    background corrected signal of one dataset divided by that dataset's median
    (no log transform is applied here)

    the region map is read with signal/control column pairs starting at
    column index 6, one pair per key of medianDict
    negative background-subtracted signal is floored at 0
    writes <outputFolder><genome>_<analysisName>_signalTable.txt and returns the path
    '''

    #load in the region map
    regionMap = utils.parseTable(mergedRegionMap,'\t')

    # materialize the keys so they can be concatenated onto the header list
    # NOTE(review): column pairs are matched to names in dict key order -
    # confirm that this order matches the column order of the region map
    namesList = list(medianDict.keys())
    signalTable = [['REGION_ID','CHROM','START','STOP','NUM_LOCI','CONSTITUENT_SIZE'] + namesList]

    for line in regionMap[1:]:

        newLine = line[0:6]
        for i,name in enumerate(namesList):
            enhancerIndex = (i*2) + 6
            controlIndex = (i*2) + 7
            enhancerSignal = float(line[enhancerIndex]) - float(line[controlIndex])
            if enhancerSignal < 0:
                enhancerSignal = 0
            newLine.append(enhancerSignal/medianDict[name])

        signalTable.append(newLine)

    outputFile = "%s%s_%s_signalTable.txt" % (outputFolder,genome,analysisName)
    print("WRITING MEDIAN NORMALIZED SIGNAL TABLE TO %s" % (outputFile))
    utils.unParseTable(signalTable,outputFile,'\t')
    return outputFile
Beispiel #6
0
def formatOutput(TFtoEnhancerDict, refseqToNameDict, projectName, projectFolder):

    '''
    takes in the dict mapping TFs to all proximal supers
    writes a table that lists each candidate TF together with the
    coordinates, ID and score of the super enhancers around it

    a (TF name, chr, start, end) combination is only written once, so
    several refseq IDs of the same gene do not repeat the same enhancer
    output goes to <projectFolder><projectName>_CANIDATE_TF_AND_SUPER_TABLE.txt
    always returns 1
    '''

    output = [['TF_refseq', 'TF_name', 'chr', 'start', 'stop', 'SuperID', 'Super_Load' ]]

    # set membership keeps the duplicate check constant-time per enhancer
    used = set()

    for gene in TFtoEnhancerDict.keys():
        geneName = refseqToNameDict[gene]
        for superEnh in TFtoEnhancerDict[gene]:

            check = (geneName, superEnh.chr(), superEnh.start(), superEnh.end())
            if check in used:
                continue
            used.add(check)

            output.append([
                gene,
                geneName,
                superEnh.chr(),
                superEnh.start(),
                superEnh.end(),
                superEnh.ID(),
                superEnh.score(),
            ])

    # file name spelling is kept as is so existing consumers still find it
    outputname = projectFolder + projectName + '_CANIDATE_TF_AND_SUPER_TABLE.txt'

    utils.unParseTable(output, outputname, '\t')

    return 1
Beispiel #7
0
def _rows_for_cut(order_rows, regions_table):
    '''
    returns the header of regions_table followed by every row of
    regions_table whose first column is named in the comma separated
    fifth column of any row in order_rows
    '''
    cut_regions = [row[4].split(',') for row in order_rows]
    print(cut_regions[1:10])

    # flatten once so each region row is a single set lookup
    wanted = set()
    for region_ids in cut_regions:
        wanted.update(region_ids)

    selected = [regions_table[0]]
    selected.extend(line for line in regions_table if line[0] in wanted)
    return selected


def cut_1000(order_table,regions_table,geneDict, outpath_top='',outpath_bottom=''):
    '''
    writes the region rows belonging to the first 1000 data rows of
    order_table (header row skipped) to outpath_top and those belonging to
    the last 1000 rows of order_table to outpath_bottom

    region IDs are read from column 5 of order_table (comma separated) and
    matched against column 1 of regions_table
    geneDict is accepted for interface compatibility and is not used
    '''

    top_table = _rows_for_cut(order_table[1:1001], regions_table)
    utils.unParseTable(top_table,outpath_top,'\t')

    # a negative slice start picks the same rows as len(order_table)-1000
    bottom_table = _rows_for_cut(order_table[-1000:], regions_table)
    utils.unParseTable(bottom_table,outpath_bottom,'\t')
Beispiel #8
0
def collapseRegionMap(regionMapFile, name='', controlBams=False):
    '''
    takes a regionMap file and collapses signal into a single column

    the first six columns of every row are kept and all remaining signal
    columns are replaced by one mean value (header: name, or MERGED_SIGNAL
    when name is empty)
    with controlBams the signal columns are read as first half = rankby,
    second half = control; each rankby column has its control subtracted
    (floored at 0) before averaging
    the output path is regionMapFile with 'REGION' replaced by 'META'
    returns the output path
    '''

    regionMap = utils.parseTable(regionMapFile, '\t')

    for n, line in enumerate(regionMap):

        if n == 0:
            #new header
            if len(name) == 0:
                name = 'MERGED_SIGNAL'
            regionMap[n] = line[0:6] + [name]
            continue

        signalLine = [float(x) for x in line[6:]]
        if controlBams:
            # floor division keeps the split point an int on every python version
            half = len(signalLine) // 2
            #min signal is 0
            metaVector = [
                max(0, signal - control)
                for signal, control in zip(signalLine[:half],
                                           signalLine[half:])
            ]
            metaSignal = numpy.mean(metaVector)
        else:
            metaSignal = numpy.mean(signalLine)
        regionMap[n] = list(line[0:6]) + [metaSignal]

    # NOTE(review): if the file name does not contain 'REGION' the output
    # path equals the input path and the input is overwritten
    outputFile = regionMapFile.replace('REGION', 'META')
    utils.unParseTable(regionMap, outputFile, '\t')
    return outputFile
Beispiel #9
0
def filterSubpeaks(subpeakFile,gene_to_enhancer_dict, analysis_name,output_folder):
    '''
    imports the initial subpeaks, stitches them and writes every stitched
    locus as a bed line to <output_folder><analysis_name>_all_subpeak.bed
    returns the path of that bed file
    gene_to_enhancer_dict is accepted but not used here
    '''

    print(subpeakFile)

    # import and stitch in one go; only the stitched loci are needed
    stitchedCollection = utils.importBoundRegion(
        subpeakFile,'%s_subpeak' % (analysis_name)).stitchCollection()

    # bed columns: chrom, start, end, placeholder name, locus ID
    all_sub_bed = [
        [locus.chr(),locus.start(),locus.end(),'.',locus.ID()]
        for locus in stitchedCollection.getLoci()
    ]

    all_bed_path = output_folder + analysis_name + '_all_subpeak.bed'
    utils.unParseTable(all_sub_bed, all_bed_path, '\t')

    return all_bed_path
Beispiel #10
0
def createExpressionDict(annotationFile, projectFolder, projectName, refseqToNameDict, expCutoff,expressionFile=''):
    '''
    input: an activity table with refseq in first column and expression or promoter
    acetylation in the third column (second column when a row has no third)
    when expressionFile is empty, <projectFolder>bamliquidator/matrix.txt is read

    the cutoff is the expCutoff-th percentile of the per-gene maxima; every
    refseq above it is reported as expressed
    writes <projectName>_EXPRESSED_GENES.txt and <projectName>_EXPRESSED_NM.txt
    into projectFolder

    output: (list of expressed refseq IDs, dictionary keyed by refseq that
    points to activity)
    annotationFile is accepted but not used here
    '''

    print('CREATING EXPRESSION DICTIONARY')

    if not expressionFile:
        expressionFilename = projectFolder + 'bamliquidator/matrix.txt'
    else:
        expressionFilename = expressionFile

    expressionTable = utils.parseTable(expressionFilename, '\t')

    expressionDictNM = {}
    expressionDictGene = {}

    for line in expressionTable[1:]:
        trid = line[0]
        geneName = refseqToNameDict[trid]
        try:
            exp = float(line[2])
        except IndexError:
            # two column tables carry the value in the second column
            exp = float(line[1])

        # Save the expression value of each NMid in a dict, keep higher value if multiple
        if trid not in expressionDictNM or exp > expressionDictNM[trid]:
            expressionDictNM[trid] = exp

        # Save the value of the expression if it's the highest for that gene
        if geneName not in expressionDictGene or exp > expressionDictGene[geneName]:
            expressionDictGene[geneName] = exp

    # numpy needs a real sequence, not a dict view
    cutoff = numpy.percentile(list(expressionDictGene.values()), expCutoff)
    print('Expression cutoff: ' + str(cutoff))

    expressedGenes = []
    expressedNM = []

    for nmid in expressionDictNM:
        if float(expressionDictNM[nmid]) > cutoff:
            expressedGenes.append(refseqToNameDict[nmid])
            expressedNM.append(nmid)

    expressedGenes = utils.uniquify(expressedGenes)
    Genefilename = projectFolder + projectName + '_EXPRESSED_GENES.txt'
    utils.unParseTable(expressedGenes, Genefilename, '')

    expressedNM = utils.uniquify(expressedNM)
    NMfilename = projectFolder + projectName + '_EXPRESSED_NM.txt'
    utils.unParseTable(expressedNM, NMfilename, '')

    return expressedNM, expressionDictNM
Beispiel #11
0
def makeEnhancerSignalTable(nameDict,mergedRegionMap,medianDict,analysisName,genome,outputFolder):

    '''
    makes a table where each row is an enhancer and each column is the
    background corrected signal of one dataset divided by that dataset's median
    (no log transform is applied here)

    datasets are taken in sorted name order; starting at column index 6 a
    dataset with nameDict[name]['background'] == True consumes two columns
    (signal, control), any other dataset consumes one
    negative signal is floored at 0 before dividing by medianDict[name]
    writes <outputFolder><genome>_<analysisName>_signalTable.txt and returns the path
    exits the interpreter with status 1 if a row is too short for a
    signal/control pair
    '''

    #load in the region map
    regionMap = utils.parseTable(mergedRegionMap,'\t')
    # sorted() instead of keys().sort() so this also runs where keys() is a view
    namesList = sorted(nameDict.keys())
    signalTable = [['REGION_ID','CHROM','START','STOP','NUM_LOCI','CONSTITUENT_SIZE'] + namesList]

    print("len of %s for namesList" % (len(namesList)))
    print(namesList)
    for line in regionMap[1:]:

        newLine = line[0:6]

        #a little tricky here to add datasets sequentially
        i = 6 #start w/ the first column w/ data
        for name in namesList:

            if nameDict[name]['background'] == True:
                enhancerIndex = i
                controlIndex = i + 1
                i += 2
                try:
                    enhancerSignal = float(line[enhancerIndex]) - float(line[controlIndex])
                except IndexError:
                    # dump the offending row, then stop with a failing exit
                    # status so a calling pipeline does not treat this as success
                    print(line)
                    print(len(line))
                    print(enhancerIndex)
                    print(controlIndex)
                    sys.exit(1)

            else:
                enhancerIndex = i
                i += 1
                enhancerSignal = float(line[enhancerIndex])

            if enhancerSignal < 0:
                enhancerSignal = 0
            newLine.append(enhancerSignal/medianDict[name])

        signalTable.append(newLine)

    outputFile = "%s%s_%s_signalTable.txt" % (outputFolder,genome,analysisName)
    print("WRITING MEDIAN NORMALIZED SIGNAL TABLE TO %s" % (outputFile))
    utils.unParseTable(signalTable,outputFile,'\t')
    return outputFile
Beispiel #12
0
def makeEnhancerSignalTable(nameDict, mergedRegionMap, medianDict,
                            analysisName, genome, outputFolder):
    '''
    makes a table where each row is an enhancer and each column is the
    background corrected signal of one dataset divided by that dataset's
    median (no log transform is applied here)

    datasets are walked in sorted name order across the region map columns,
    starting at column index 6: a dataset whose
    nameDict[name]['background'] == True takes a signal and a control
    column, any other dataset takes a single signal column
    signal below 0 is set to 0 before the median division
    writes <outputFolder><genome>_<analysisName>_signalTable.txt and
    returns that path
    a row too short for a signal/control pair is printed and the
    interpreter exits with status 1
    '''

    #load in the region map
    regionMap = utils.parseTable(mergedRegionMap, '\t')
    # sorted() also works where keys() is a view without a sort method
    namesList = sorted(nameDict.keys())
    signalTable = [[
        'REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE'
    ] + namesList]

    print("len of %s for namesList" % (len(namesList)))
    print(namesList)
    for line in regionMap[1:]:

        newLine = line[0:6]

        #datasets are added sequentially, so track the next unread column
        nextColumn = 6  #start w/ the first column w/ data
        for name in namesList:

            enhancerIndex = nextColumn
            if nameDict[name]['background'] == True:
                controlIndex = nextColumn + 1
                nextColumn += 2
                try:
                    enhancerSignal = float(line[enhancerIndex]) - float(
                        line[controlIndex])
                except IndexError:
                    # report the malformed row and fail with a non-zero
                    # status instead of exiting as if the run had succeeded
                    print(line)
                    print(len(line))
                    print(enhancerIndex)
                    print(controlIndex)
                    sys.exit(1)
            else:
                nextColumn += 1
                enhancerSignal = float(line[enhancerIndex])

            if enhancerSignal < 0:
                enhancerSignal = 0
            newLine.append(enhancerSignal / medianDict[name])

        signalTable.append(newLine)

    outputFile = "%s%s_%s_signalTable.txt" % (outputFolder, genome,
                                              analysisName)
    print("WRITING MEDIAN NORMALIZED SIGNAL TABLE TO %s" % (outputFile))
    utils.unParseTable(signalTable, outputFile, '\t')
    return outputFile
Beispiel #13
0
def summarizeVenn(mapped_path, group_list=None, output=''):
    '''
    summarizes binary occupancy across group to make a venn diagram

    a header column of the mapped table belongs to a group when the group
    string occurs in the column name; a row's value for a group is the max
    of its int values over that group's columns
    every 0/1 combination across the groups is then counted

    group_list defaults to ['CG', 'THMYCN']
    if output is given the count table is written there and nothing is
    returned, otherwise the count table is returned
    '''

    # fresh list per call instead of a shared mutable default
    if group_list is None:
        group_list = ['CG', 'THMYCN']
    else:
        group_list = list(group_list)

    group_table = [['GFF_LINE', 'ID'] + group_list]

    mapped_table = utils.parseTable(mapped_path, '\t')
    header = mapped_table[0]

    group_cols = []
    for group in group_list:
        group_names = [name for name in header if group in name]
        group_cols.append([header.index(name) for name in group_names])

    print(group_cols)
    for line in mapped_table[1:]:
        #a 1/0 vector to hold mapping by group
        binary_vector = [max([int(line[x]) for x in cols]) for cols in group_cols]
        group_table.append(line[0:2] + binary_vector)

    print(group_table[0:5])

    #now add up the stats
    #permute all possible binary combinations given the vector length
    #each pass extends every existing combination by a 1 and by a 0
    binary_combinations = [[0], [1]]
    for _ in range(len(group_list) - 1):
        new_combinations = []
        for x in binary_combinations:
            print(x)
            new_combinations.append(list(x) + [1])
            new_combinations.append(list(x) + [0])
        # swap in the longer combinations only after the full pass
        binary_combinations = new_combinations

    print(binary_combinations)
    count_table = [group_list + ['count']]
    for combo in binary_combinations:
        count = sum(1 for line in group_table[1:] if line[2:] == combo)
        count_table.append(combo + [count])
    print(count_table)

    if len(output) > 0:
        utils.unParseTable(count_table, output, '\t')
    else:
        return count_table
Beispiel #14
0
def mergeCollections(nameDict, analysisName, output='', superOnly=True):
    '''
    merges the enhancer collections of every dataset in nameDict

    the loci of each dataset's enhancerFile are pooled, stitched together
    and renamed merged_<analysisName>_<n>, where n follows the order
    returned by utils.order on the locus sizes (decreasing=True)
    returns the merged gff rows when output is empty, otherwise writes
    them to output and returns output
    '''

    allLoci = []
    for name in nameDict.keys():

        seCollection = makeSECollection(nameDict[name]['enhancerFile'], name,
                                        superOnly)
        if superOnly:
            print("DATASET: %s HAS %s SUPERENHANCERS" % (name,
                                                         len(seCollection)))
        else:
            print("DATASET: %s HAS %s ENHANCERS" % (name, len(seCollection)))
        allLoci += seCollection.getLoci()

    print(len(allLoci))

    mergedCollection = utils.LocusCollection(allLoci, 50)

    #stitch the collection together
    stitchedLoci = mergedCollection.stitchCollection().getLoci()
    print("IDENTIFIED %s CONSENSUS ENHANCER REGIONS" % (len(stitchedLoci)))

    #sort by size and provide a unique ID
    sizeOrder = utils.order([locus.len() for locus in stitchedLoci],
                            decreasing=True)
    orderedLoci = [stitchedLoci[i] for i in sizeOrder]

    # IDs are 1-based ranks in that order
    for rank, locus in enumerate(orderedLoci, 1):
        locus._ID = 'merged_%s_%s' % (analysisName, str(rank))

    # gff rows carry the locus ID both as source and as attribute column
    mergedGFF = [[
        locus.chr(),
        locus.ID(), '',
        locus.start(),
        locus.end(), '',
        locus.sense(), '',
        locus.ID()
    ] for locus in orderedLoci]

    if len(output) == 0:
        return mergedGFF

    print("writing merged gff to %s" % (output))
    utils.unParseTable(mergedGFF, output, '\t')
    return output
Beispiel #15
0
def filterPeaks(tabixFolder,mycTablePath,outputPath,repeatList = None):

    '''
    for every peak, queries one tabix indexed gff per repeat class and
    reports how much of the peak is covered by that class
    with no repeatList the 3 repeat classes LINE, LTR, Simple_repeat are used
    outputs a table in the format of
    [PEAK_ID,CHROM, START,STOP,LENGTH, LINE, LTR, Simple_repeat]
    each repeat value is the sum over the returned records of
    (overlap with the peak / LENGTH), rounded to 4 places
    rows of the input table whose ID starts with 'P' are skipped
    '''

    if not repeatList:
        repeatList = ['LINE','LTR','Simple_repeat']

    repeatTable = [['PEAK_ID','CHROM','START','STOP','LENGTH'] + repeatList]

    mycTable = utils.parseTable(mycTablePath,'\t')
    ticker =0
    for line in mycTable[1:]:
        if line[0][0] =='P':
            continue

        # progress report every 100 processed peaks
        if ticker % 100 == 0:
            print(ticker)
        ticker +=1
        peak_ID = line[0]
        chrom = line[1]
        start = int(line[2])
        stop = int(line[3])
        length = line[4]
        locusString = '%s:%s-%s' % (chrom,start,stop)

        repeatFractions = []
        for repeatClass in repeatList:
            tabixGFF = '%shg19_%s_category_sorted.gff.gz' % (tabixFolder,repeatClass)

            tabixCmd = 'tabix %s %s' % (tabixGFF,locusString)

            tabix = subprocess.Popen(tabixCmd,stdin = subprocess.PIPE,stderr = subprocess.PIPE,stdout = subprocess.PIPE,shell = True,universal_newlines = True)

            # communicate() drains both pipes and waits for the child, so a
            # chatty stderr cannot block the read and no process is left behind
            tabixOut = tabix.communicate()[0]

            #i think you get back essentially gff lines
            tabixLines = [x.rstrip().split('\t') for x in tabixOut.splitlines() if len(x.strip()) > 0]

            overlapFraction = 0.0
            for gffLine in tabixLines:
                # clip the repeat record to the peak before measuring it
                overlapStart = max(start,int(gffLine[3]))
                overlapStop = min(stop,int(gffLine[4]))
                overlapLength = overlapStop - overlapStart
                overlapFraction += float(overlapLength)/float(length)
            repeatFractions.append(round(overlapFraction,4))

        newLine = [peak_ID,chrom,start,stop,length] + repeatFractions
        repeatTable.append(newLine)

    utils.unParseTable(repeatTable,outputPath,'\t')
Beispiel #16
0
def assignEnhancerRank(enhancerToGeneFile,
                       enhancerFile1,
                       enhancerFile2,
                       name1,
                       name2,
                       rankOutput=''):
    '''
    for every row of the enhancerToGene table, appends the best (lowest)
    rank among the overlapping enhancers of each of the two enhancer files
    a row without any overlap gets the size of that enhancer collection,
    i.e. it is ranked dead last
    returns the updated table when rankOutput is empty, otherwise writes
    it to rankOutput and returns nothing
    '''

    enhancerToGene = utils.parseTable(enhancerToGeneFile, '\t')

    enhancerCollection1 = makeSECollection(enhancerFile1, name1, False)
    enhancerCollection2 = makeSECollection(enhancerFile2, name2, False)

    enhancerDict1 = makeSEDict(enhancerFile1, name1, False)
    enhancerDict2 = makeSEDict(enhancerFile2, name2, False)

    # each dataset is a (collection to search, ID -> info dict) pair
    datasets = [
        (enhancerCollection1, enhancerDict1),
        (enhancerCollection2, enhancerDict2),
    ]

    #the enhancerToGene table is updated in place, header first
    enhancerToGene[0] += ['%s_rank' % name1, '%s_rank' % name2]

    for row in enhancerToGene[1:]:

        rowLocus = utils.Locus(row[1], row[2], row[3], '.', row[0])

        rowRanks = []
        for collection, infoDict in datasets:
            overlapping = collection.getOverlap(rowLocus, 'both')
            if len(overlapping) == 0:
                #no enhancer here: rank it last
                rowRanks.append(len(collection))
            else:
                rowRanks.append(
                    min([infoDict[x.ID()]['rank'] for x in overlapping]))

        row.extend(rowRanks)

    if len(rankOutput) == 0:
        return enhancerToGene
    else:
        utils.unParseTable(enhancerToGene, rankOutput, '\t')
Beispiel #17
0
def mapGFFLineToBed(gffLine, outFolder, nBins, bedCollection, header=''):
    '''
    for one gff line, writes the rectangles to draw for every overlapping
    bed locus plus a table of the bed names and their vertical offsets

    output files are <outFolder><label>_bedDiagramTemp.txt and
    <outFolder><label>_bedNameTemp.txt, where label is header or, when
    header is empty, chrom_sense_start_end of the gff line
    '''

    if len(header) != 0:
        gffString = header
    else:
        gffString = '%s_%s_%s_%s' % (gffLine[0], gffLine[6], gffLine[3],
                                     gffLine[4])

    gffLocus = utils.Locus(gffLine[0], int(gffLine[3]), int(gffLine[4]),
                           gffLine[6], gffLine[1])

    # converts genomic distance into bin units
    scaleFactor = float(nBins) / gffLocus.len()

    overlapLoci = bedCollection.getOverlap(gffLocus, sense='both')
    print("IDENTIFIED %s OVERLAPPING BED LOCI FOR REGION %s" %
          (len(overlapLoci), gffLine))

    # beds come from multiple sources, so each distinct ID gets its own
    # row, spaced two units apart, in sorted name order
    bedNamesList = utils.uniquify([locus.ID() for locus in overlapLoci])
    bedNamesList.sort()
    offsetDict = dict(
        (bedName, 2 * position)
        for position, bedName in enumerate(bedNamesList))

    # distances are measured from the end for '-' sense, else from the start
    refPoint = int(gffLine[4]) if gffLine[6] == '-' else int(gffLine[3])

    # first row of each table is a placeholder row
    nameTable = [['', 0, 0]]
    nameTable.extend(
        [bedName, 0, 0.0 - offsetDict[bedName]] for bedName in bedNamesList)

    diagramTable = [[0, 0, 0, 0]]
    for bedLocus in overlapLoci:
        rowOffset = offsetDict[bedLocus.ID()]
        start, stop = [
            abs(coord - refPoint) * scaleFactor
            for coord in bedLocus.coords()
        ]
        # rectangle of height 1 centred on the row's offset
        diagramTable.append([start, -0.5 - rowOffset, stop, 0.5 - rowOffset])

    utils.unParseTable(diagramTable,
                       outFolder + gffString + '_bedDiagramTemp.txt', '\t')
    utils.unParseTable(nameTable, outFolder + gffString + '_bedNameTemp.txt',
                       '\t')
Beispiel #18
0
def generateSubpeakFASTA(TFtoEnhancerDict, subpeaks, genomeDirectory,
                         projectName, projectFolder, constExtension):
    '''
    from a BED file of constituents
    generate a FASTA for the constituents contained within the candidate supers

    for every gene, the subpeaks overlapping its regions are extended by
    constExtension on both sides and stitched together
    the stitched subpeaks are written to <projectFolder><projectName>_subpeaks.bed
    (with a track line) and their upper-cased sequences to
    <projectFolder><projectName>_SUBPEAKS.fa
    fasta titles are gene|chrom|start|end
    '''

    subpeakDict = {}
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]
    subpeakTable = utils.parseTable(subpeaks, '\t')

    subpeakLoci = [
        utils.Locus(l[0], int(l[1]), int(l[2]), '.') for l in subpeakTable
    ]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    for gene in TFtoEnhancerDict.keys():
        subpeakDict[gene] = []
        for region in TFtoEnhancerDict[gene]:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [
                utils.makeSearchLocus(x, constExtension, constExtension)
                for x in overlaps
            ]

            # stitching merges extended subpeaks that now run into each other
            overlapCollection = utils.LocusCollection(
                extendedOverlaps, 50).stitchCollection()
            for overlap in overlapCollection.getLoci():
                subpeakBED.append(
                    [overlap.chr(),
                     overlap.start(),
                     overlap.end()])
                subpeakDict[gene].append(overlap)

    bedfilename = projectFolder + projectName + '_subpeaks.bed'
    utils.unParseTable(subpeakBED, bedfilename, '\t')

    fasta = []

    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:

            fastaTitle = gene + '|' + subpeak.chr() + '|' + str(
                subpeak.start()) + '|' + str(subpeak.end())
            # sequence is fetched with both coordinates shifted by +1
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(),
                                       int(subpeak.start() + 1),
                                       int(subpeak.end() + 1))

            fasta.append('>' + fastaTitle)
            # string method instead of the bare upper() helper, which only
            # exists when imported from the python 2 string module
            fasta.append(fastaLine.upper())

    outname = projectFolder + projectName + '_SUBPEAKS.fa'

    utils.unParseTable(fasta, outname, '')
Beispiel #19
0
def findValleys(gene_to_enhancer_dict,
                bamFileList,
                projectName,
                projectFolder,
                cutoff=0.2):
    '''
    takes in the super dict (gene -> list of enhancer regions)
    scores every region with scoreValley across all bams and keeps each
    10 unit window whose score is above cutoff, then stitches the windows
    of a gene together
    writes the stitched valleys of all genes to
    <projectFolder><projectName>_all_valleys.bed and returns that path
    '''

    all_valley_bed = []

    #start w/ a bamFileList and make a list of bam type objects
    bam_list = [utils.Bam(bam_path) for bam_path in bamFileList]
    max_read_length = max([bam.getReadLengths()[0] for bam in bam_list])

    # sorted() instead of keys().sort() so this also runs where keys() is a view
    gene_list = sorted(gene_to_enhancer_dict.keys())
    ticker = 0
    print('number of regions processed:')
    for gene in gene_list:

        gene_valleys = []

        for region in gene_to_enhancer_dict[gene]:
            if ticker % 100 == 0:
                print(ticker)
            ticker += 1
            scoreArray = scoreValley(region, bam_list, max_read_length,
                                     projectName, projectFolder)
            for index, score in enumerate(scoreArray):
                if score > cutoff:
                    # score index i covers [start + 10*i, start + 10*(i+1))
                    valley = utils.Locus(region.chr(),
                                         region.start() + index * 10,
                                         region.start() + (index + 1) * 10,
                                         '.')
                    gene_valleys.append(valley)

        # adjacent windows are merged once per gene, after all regions are scored
        for valley in stitchValleys(gene_valleys):
            all_valley_bed.append([valley.chr(), valley.start(), valley.end()])

    all_bed_path = projectFolder + projectName + '_all_valleys.bed'
    utils.unParseTable(all_valley_bed, all_bed_path, '\t')

    return all_bed_path
Beispiel #20
0
def getTonyInfo(uniqueIDList, colList):
    '''
    pass this a uniqueID List and a list of columns

    runs getDB_Data.pl for those IDs and columns and rewrites
    /grail/projects/masterBamTable.txt from its tab separated output
    the first output line is treated as a header and skipped; in every
    other line the first field is the unique ID and the next five fields
    are stored as GENOME, SOURCE, CELL_TYPE, NAME, BAMFILE (GENOME is
    upper-cased on output)
    raises IndexError if the command returns no lines at all, so the master
    table is not overwritten with an empty one
    '''

    uniqueIDString = ','.join(uniqueIDList)

    columnString = ','.join([str(x) for x in colList])

    cmd = "perl /ark/tony/admin/getDB_Data.pl -i %s -c %s -o TAB" % (
        uniqueIDString, columnString)

    # universal_newlines makes the captured output text on every python version
    sqlOut = subprocess.Popen(cmd,
                              stdin=subprocess.PIPE,
                              stderr=subprocess.PIPE,
                              stdout=subprocess.PIPE,
                              shell=True,
                              universal_newlines=True)

    sqlText = sqlOut.communicate()[0]

    sqlTable = [x.split('\t') for x in sqlText.split('\n') if len(x) > 0]

    # same exception type the header lookup used to raise on empty output
    if len(sqlTable) == 0:
        raise IndexError('getDB_Data.pl returned no output for: %s' % (cmd))

    # the output columns are labelled with this fixed header, whatever the
    # header line of the query output says
    header = ['GENOME', 'SOURCE', 'CELL_TYPE', 'NAME', 'BAMFILE']

    tonyDict = {}
    for line in sqlTable[1:]:
        uniqueID = line[0]
        tonyDict[uniqueID] = {}
        for i, column in enumerate(header):
            tonyDict[uniqueID][column] = line[(i + 1)]

    newTable = [header]
    for key in tonyDict.keys():
        entry = tonyDict[key]
        newTable.append([
            entry['GENOME'].upper(),
            entry['SOURCE'],
            entry['CELL_TYPE'],
            entry['NAME'],
            entry['BAMFILE'],
        ])

    utils.unParseTable(newTable, '/grail/projects/masterBamTable.txt', '\t')
Beispiel #21
0
def averagingMappedSignal(mapped_list, output_path, setName):
    '''
    averages signal across a set of mapped gffs and writes the new output

    mapped_list: paths of tab delimited mapped signal tables, read with utils.parseTable.
                 the first table provides the header width and the first two
                 columns (gene ID and locus line) of every output row
    output_path: where the averaged table is written
    setName: suffix used for the bin_N_setName column names

    rows of the first table with 2 or fewer fields carry no signal and are left out.
    each cell is the average over all tables at that row/column, rounded to
    4 places and floored at 0. if any table lacks that cell, diagnostics are
    printed and the average is taken over an empty vector (nan)

    returns output_path
    '''

    #create a list containing all of the tables
    table_list = [utils.parseTable(path, '\t') for path in mapped_list]
    reference = table_list[0]
    nFields = len(reference[0])

    #first set up the output header
    output_header = ['GENE_ID', 'locusLine']
    output_header += ['bin_%s_%s' % (n + 1, setName) for n in range(nFields - 2)]
    output_table = [output_header]

    #one pass over the rows so that the row filter and the signal columns can never get out of step
    for i in range(1, len(reference)):
        line = reference[i]
        if len(line) <= 2:
            continue

        new_line = list(line[0:2])
        for j in range(2, nFields):
            #reset for every column so a failed lookup cannot reuse the previous column's values
            signal_vector = []
            try:
                signal_vector = [float(table[i][j]) for table in table_list]
            except IndexError:
                print(i, j)
                print(reference[i])
                #only show the second table's row if it actually exists
                if len(table_list) > 1 and len(table_list[1]) > i:
                    print(table_list[1][i])

            signal = max(round(numpy.average(signal_vector), 4), 0)
            new_line.append(signal)

        output_table.append(new_line)

    print(len(reference))
    print(len(output_table))
    utils.unParseTable(output_table, output_path, '\t')
    return output_path
Beispiel #22
0
def fix_table_s1(ob_s1_path):
    '''
    fixes formatting of table s1

    if the whole file reads back as a single line, that line is split on
    carriage returns, every piece is stripped of trailing whitespace and split
    on tabs, and the table is written back over ob_s1_path with utils.unParseTable.
    files that read as zero or several lines are left untouched

    returns ob_s1_path
    '''

    #close the handle before the file is potentially rewritten in place
    with open(ob_s1_path, 'r') as s1:
        lines = s1.readlines()

    if len(lines) == 1:
        s1_table = [line.rstrip().split('\t') for line in lines[0].split('\r')]
        utils.unParseTable(s1_table, ob_s1_path, '\t')

    return ob_s1_path
Beispiel #23
0
def assignEnhancerRank(enhancerToGeneFile,enhancerFile1,enhancerFile2,name1,name2,rankOutput=''):

    '''
    for all genes in the enhancerToGene Table, assigns the highest overlapping ranked enhancer in the other tables

    enhancerToGeneFile: tab delimited table with a header row. columns 1,2,3 of each
                        row are handed to utils.Locus together with column 0 as the ID
    enhancerFile1/2, name1/2: passed to makeSECollection and makeSEDict (superOnly flag False)
    rankOutput: optional output path

    two columns (name1_rank, name2_rank) are appended to the table. a row gets the
    smallest 'rank' among its overlapping enhancers, or the size of the
    collection when nothing overlaps (i.e. dead last)

    returns the updated table if rankOutput is empty, otherwise writes the table
    to rankOutput and returns rankOutput
    '''

    enhancerToGene = utils.parseTable(enhancerToGeneFile,'\t')

    enhancerCollection1 = makeSECollection(enhancerFile1,name1,False)
    enhancerCollection2 = makeSECollection(enhancerFile2,name2,False)

    enhancerDict1 = makeSEDict(enhancerFile1,name1,False)
    enhancerDict2 = makeSEDict(enhancerFile2,name2,False)

    def bestRank(locus,enhancerCollection,enhancerDict):
        #smallest rank among overlapping enhancers, collection size if there are none
        overlapLoci = enhancerCollection.getOverlap(locus,'both')
        if len(overlapLoci) == 0:
            return len(enhancerCollection)
        return min([enhancerDict[x.ID()]['rank'] for x in overlapLoci])

    #we're going to update the enhancerToGeneTable
    enhancerToGene[0] += ['%s_rank' % name1,'%s_rank' % name2]

    for i in range(1,len(enhancerToGene)):

        line = enhancerToGene[i]
        locusLine = utils.Locus(line[1],line[2],line[3],'.',line[0])

        enhancer1Rank = bestRank(locusLine,enhancerCollection1,enhancerDict1)
        enhancer2Rank = bestRank(locusLine,enhancerCollection2,enhancerDict2)
        enhancerToGene[i]+=[enhancer1Rank,enhancer2Rank]

    if len(rankOutput) == 0:
        return enhancerToGene

    utils.unParseTable(enhancerToGene,rankOutput,'\t')
    #hand the path back like the other table writers do
    return rankOutput
Beispiel #24
0
def mergeCollections(nameDict,analysisName,output='',superOnly=True):

    '''
    pools the enhancer loci of every dataset in nameDict, stitches them into
    consensus regions and names the regions merged_ANALYSISNAME_N by decreasing size

    nameDict: dataset name -> dict holding an 'enhancerFile' entry
    superOnly: forwarded to makeSECollection

    returns the gff lines if output is empty, otherwise writes them to output
    and returns output
    '''

    allLoci = []
    for name in nameDict.keys():

        seCollection = makeSECollection(nameDict[name]['enhancerFile'],name,superOnly)
        report = "DATASET: %s HAS %s SUPERENHANCERS" if superOnly else "DATASET: %s HAS %s ENHANCERS"
        print(report % (name,len(seCollection)))
        allLoci.extend(seCollection.getLoci())

    print(len(allLoci))

    #pool everything into one collection and stitch it together
    stitchedLoci = utils.LocusCollection(allLoci,50).stitchCollection().getLoci()
    print("IDENTIFIED %s CONSENSUS ENHANCER REGIONS" % (len(stitchedLoci)))

    #largest region first, the position in that order becomes part of the ID
    sizeOrder = utils.order([locus.len() for locus in stitchedLoci],decreasing=True)

    mergedGFF = []
    for position,lociIndex in enumerate(sizeOrder):
        locus = stitchedLoci[lociIndex]
        locus._ID = 'merged_%s_%s' % (analysisName,str(position+1))
        mergedGFF.append([locus.chr(),locus.ID(),'',locus.start(),locus.end(),'',locus.sense(),'',locus.ID()])

    if len(output) == 0:
        return mergedGFF

    print("writing merged gff to %s" % (output))
    utils.unParseTable(mergedGFF,output,'\t')
    return output
Beispiel #25
0
def makePeakGFFs(peak_path_list):

    '''
    makes a stitched gff for all MYC bound TSS and Distal regions across all datasets

    peak_path_list: paths of tab delimited peak tables with a header row.
                    columns 1,2,3 of each row are handed to utils.Locus and
                    column 5 decides the class: 0 goes to distal, anything else to tss

    output paths are built from the module level gffFolder. if utils.checkOutput
    accepts both paths nothing is recomputed

    returns tss_gff_path,distal_gff_path
    '''

    #setting the output
    tss_gff_path = '%sHG19_MYC_TSS_REGIONS_-0_+0.gff' % (gffFolder)
    distal_gff_path = '%sHG19_MYC_DISTAL_REGIONS_-0_+0.gff' % (gffFolder)

    #check to see if already done
    if utils.checkOutput(tss_gff_path,0.1,0.1) and utils.checkOutput(distal_gff_path,0.1,0.1):
        print('OUTPUT FOUND AT %s and %s' % (tss_gff_path,distal_gff_path))
        return tss_gff_path,distal_gff_path

    #empty loci lists to hold everything
    tss_loci = []
    distal_loci = []

    for peak_path in peak_path_list:
        print('processing %s' % (peak_path))

        peak_table = utils.parseTable(peak_path,'\t')

        for line in peak_table[1:]:
            peak_locus = utils.Locus(line[1],line[2],line[3],'.')
            if int(line[5]) == 0:
                distal_loci.append(peak_locus)
            else:
                tss_loci.append(peak_locus)

    #now combine the loci
    print('stitching loci')
    distal_collection = utils.LocusCollection(distal_loci,50)
    tss_collection = utils.LocusCollection(tss_loci,50)

    stitched_distal_collection = distal_collection.stitchCollection()
    stitched_tss_collection = tss_collection.stitchCollection()

    #now make the gffs from the stitched collections so overlapping peaks from different datasets are merged
    distal_gff = utils.locusCollectionToGFF(stitched_distal_collection)
    tss_gff = utils.locusCollectionToGFF(stitched_tss_collection)

    #now write to disk
    utils.unParseTable(distal_gff,distal_gff_path,'\t')
    utils.unParseTable(tss_gff,tss_gff_path,'\t')

    return tss_gff_path,distal_gff_path
Beispiel #26
0
def addLengths(gene_table_path, peak_table_path):
    '''
    appends TSS_LENGTH and DISTAL_LENGTH columns to a gene table

    for every peak row, column 4 is added to the tss total of each gene named
    on that row when column 5 is 1, and to the distal total otherwise.
    genes are read from the last column of 14 field rows and from the last
    two columns of 15 field rows; rows of any other width are ignored

    the output path is gene_table_path with GENE_TABLE replaced by
    GENE_TABLE_LENGTH. returns the output path
    '''

    output_path = gene_table_path.replace('GENE_TABLE', 'GENE_TABLE_LENGTH')
    print(output_path)

    tss_dict = defaultdict(int)
    distal_dict = defaultdict(int)

    for peak in utils.parseTable(peak_table_path, '\t')[1:]:
        #collect the comma separated gene names for this peak
        nFields = len(peak)
        if nFields == 15:
            candidates = peak[-1].split(',') + peak[-2].split(',')
        elif nFields == 14:
            candidates = peak[-1].split(',')
        else:
            continue

        named_genes = utils.uniquify([name for name in candidates if len(name) > 0])

        for gene in named_genes:
            length_dict = tss_dict if int(peak[5]) == 1 else distal_dict
            length_dict[gene] += int(peak[4])

    #now fill out the gene table, genes without peaks get 0 from the defaultdicts
    gene_table = utils.parseTable(gene_table_path, '\t')

    output_table = [gene_table[0] + ['TSS_LENGTH', 'DISTAL_LENGTH']]
    output_table += [
        row + [tss_dict[row[0]], distal_dict[row[0]]] for row in gene_table[1:]
    ]

    utils.unParseTable(output_table, output_path, '\t')

    return output_path
Beispiel #27
0
def getTonyInfo(uniqueIDList,colList):

    '''
    queries the TONY database through getDB_Data.pl and writes the result
    to /grail/projects/masterBamTable.txt

    uniqueIDList: list of unique ID strings, passed comma separated to -i
    colList: list of column identifiers (stringified), passed comma separated to -c

    the first line of the query output is skipped as a header. in every other
    line the first field is used as the uniqueID and the next five fields are
    stored as GENOME, SOURCE, CELL_TYPE, NAME, BAMFILE
    NOTE(review): this assumes colList requests exactly those five columns in
    that order -- confirm against callers

    returns None. if the query produces no output only the header row is written
    '''

    uniqueIDString = ','.join(uniqueIDList)
    columnString = ','.join([str(x) for x in colList])

    cmd = "perl /ark/tony/admin/getDB_Data.pl -i %s -c %s -o TAB" % (uniqueIDString,columnString)

    #NOTE(review): the command goes through the shell, so the IDs and columns must be trusted
    #universal_newlines makes communicate() hand back text instead of bytes
    sqlOut = subprocess.Popen(cmd,stdin = subprocess.PIPE,stderr = subprocess.PIPE,stdout = subprocess.PIPE,shell = True,universal_newlines = True)

    #communicate returns (stdout, stderr), only stdout is parsed
    sqlText = sqlOut.communicate()[0]

    #drop empty lines, then split each remaining line into its fields
    sqlTable = [x.split('\t') for x in sqlText.split('\n') if len(x) > 0]

    #the output columns are fixed; the header line of the query is not interpreted
    header = ['GENOME', 'SOURCE', 'CELL_TYPE', 'NAME', 'BAMFILE']

    #keyed by uniqueID, so a repeated ID keeps only its last line
    tonyDict = {}
    for line in sqlTable[1:]:
        uniqueID = line[0]
        tonyDict[uniqueID] = {}
        for i,colName in enumerate(header):
            tonyDict[uniqueID][colName] = line[(i+1)]

    newTable = [header]
    for key in tonyDict.keys():
        info = tonyDict[key]
        #genome names are upper cased, everything else is passed through
        newTable.append([info['GENOME'].upper(),info['SOURCE'],info['CELL_TYPE'],info['NAME'],info['BAMFILE']])

    utils.unParseTable(newTable, '/grail/projects/masterBamTable.txt', '\t')
Beispiel #28
0
def findValleys(gene_to_enhancer_dict, bamFileList, projectName, projectFolder, cutoff = 0.2):
    '''
    takes in the super dict (gene -> list of enhancer region loci)

    every region is scored with scoreValley. each position of the returned score
    array that is above cutoff becomes a valley locus spanning
    region.start() + index*10 to region.start() + (index+1)*10.
    the valleys of a gene are then merged with stitchValleys

    writes all stitched valleys (chrom, start, end) to
    projectFolder + projectName + '_all_valleys.bed'
    returns the path of that bed file. genes are processed in sorted order
    '''

    all_valley_bed = []
    #per gene valley loci, only used inside this function
    valleyDict = {}

    #start w/ a bamFileList and make a list of bam type objects
    bam_list = [utils.Bam(bam_path) for bam_path in bamFileList]
    max_read_length = max([bam.getReadLengths()[0] for bam in bam_list])

    #sorted() works on key views as well as on key lists
    gene_list = sorted(gene_to_enhancer_dict.keys())
    ticker = 0
    print('number of regions processed:')
    for gene in gene_list:

        valleyDict[gene] = []

        for region in gene_to_enhancer_dict[gene]:
            if ticker %100 == 0:
                print(ticker)
            ticker+=1
            scoreArray = scoreValley(region, bam_list,max_read_length,projectName, projectFolder)
            for index,score in enumerate(scoreArray):
                if score > cutoff:
                    valley = utils.Locus(region.chr(), region.start() + index*10,
                                         region.start() + (index+1)*10, '.')
                    valleyDict[gene].append(valley)

        #replace the raw valleys by the stitched ones once per gene
        stitchedValleys = stitchValleys(valleyDict[gene])
        valleyDict[gene] = stitchedValleys
        for valley in stitchedValleys:
            all_valley_bed.append([valley.chr(), valley.start(), valley.end()])

    all_bed_path = projectFolder + projectName + '_all_valleys.bed'
    utils.unParseTable(all_valley_bed, all_bed_path, '\t')

    return all_bed_path
Beispiel #29
0
def mapGFFLineToBed(gffLine, outFolder, nBins, bedCollection, header=''):
    '''
    writes the rectangle coordinates and the track names needed to draw the
    bed loci that overlap one gff line

    gffLine: gff row; fields 0, 3, 4, 6 and 1 are used as chrom, start, end, sense and ID
    outFolder: folder prefix for the two temp tables
    nBins: number of bins the region is scaled to
    bedCollection: locus collection queried for overlaps on both strands
    header: optional file name stem, defaults to chrom_sense_start_end

    writes <stem>_bedDiagramTemp.txt and <stem>_bedNameTemp.txt, returns None
    '''

    gffString = header
    if len(header) == 0:
        gffString = '%s_%s_%s_%s' % (gffLine[0], gffLine[6], gffLine[3], gffLine[4])

    gffLocus = utils.Locus(gffLine[0], int(gffLine[3]), int(gffLine[4]), gffLine[6], gffLine[1])
    # converts genomic distance into bin units
    scaleFactor = float(nBins) / gffLocus.len()

    overlapLoci = bedCollection.getOverlap(gffLocus, sense='both')
    print("IDENTIFIED %s OVERLAPPING BED LOCI FOR REGION %s" % (len(overlapLoci),gffLine))

    # beds come from multiple sources, so every distinct ID gets its own row, 2 units apart
    bedNamesList = sorted(utils.uniquify([locus.ID() for locus in overlapLoci]))
    offsetDict = dict((bedName, 2 * rank) for rank, bedName in enumerate(bedNamesList))

    # distances are measured from the start of the region in its own orientation
    refPoint = int(gffLine[4]) if gffLine[6] == '-' else int(gffLine[3])

    # first row of each table is a placeholder line
    nameTable = [['', 0, 0]]
    nameTable += [[bedName, 0, 0.0 - offsetDict[bedName]] for bedName in bedNamesList]

    diagramTable = [[0, 0, 0, 0]]
    for bedLocus in overlapLoci:
        rowOffset = offsetDict[bedLocus.ID()]
        start, stop = [abs(coord - refPoint) * scaleFactor for coord in bedLocus.coords()]
        diagramTable.append([start, -0.5 - rowOffset, stop, 0.5 - rowOffset])

    utils.unParseTable(diagramTable, outFolder + gffString + '_bedDiagramTemp.txt', '\t')
    utils.unParseTable(nameTable, outFolder + gffString + '_bedNameTemp.txt', '\t')
Beispiel #30
0
def summarizeData(dataFile,output ='',namesList= []):

    '''
    writes a read and peak count summary for the datasets of a data table

    dataFile: data table loaded with pipeline_dfci.loadDataTable
    output: output path, defaults to dataFile with .txt replaced by _SUMMARY.txt
    namesList: dataset names to summarize, defaults to every name in the data table

    total and mapped reads are looked up with pipeline_dfci.getTONYInfo
    (columns '68' and '67') and reported in millions, rounded to 2 places.
    peaks are only counted for names containing H3K27AC or ATAC exactly once,
    using the enrichedMacs file under the module level macsEnrichedFolder;
    all other datasets get NA
    '''

    dataDict=pipeline_dfci.loadDataTable(dataFile)

    if len(namesList) == 0:
        namesList = dataDict.keys()

    if len(output) == 0:
        output = dataFile.replace('.txt','_SUMMARY.txt')

    print('WRITING OUTPUT TO %s' % (output))
    readTable = [['NAME','TOTAL_READS','MAPPED_READS','PEAKS']]

    for name in namesList:
        print('GETTING DATA SUMMARY FOR %s' % (name))

        uniqueID = dataDict[name]['uniqueID']

        #read counts come back as strings, the raw total carries a '::' separated suffix
        mappedReads = round(float(pipeline_dfci.getTONYInfo(uniqueID,'67'))/1000000,2)
        totalRaw = int(pipeline_dfci.getTONYInfo(uniqueID,'68').split('::')[0])
        totalReads = round(float(totalRaw)/1000000,2)

        #get the peak count where there is an enriched region file to count
        peakCount = 'NA'
        if name.count('H3K27AC') == 1 or name.count('ATAC') ==1:
            enrichedPath = '%s%s' % (macsEnrichedFolder,dataDict[name]['enrichedMacs'])
            peakCount = len(utils.importBoundRegion(enrichedPath,name))

        newLine = [name,totalReads,mappedReads,peakCount]
        print(newLine)
        readTable.append(newLine)

    utils.unParseTable(readTable,output,'\t')
Beispiel #31
0
def buildGraph(edgeDict,gene_to_enhancer_dict,output_folder, analysis_name,cutoff=1):
    '''
    from the collapsed edge dictionary, build a target graph
    require at least n motifs to constitute an edge where n is set by cutoff.
    default is 1

    edgeDict: source -> target -> list of edge loci
    gene_to_enhancer_dict: target -> list of region loci, used to name the
                           regions each edge locus overlaps

    every edge locus is written to <output_folder><analysis_name>_EDGE_TABLE.txt;
    TF_INTERACTION is 1 when the target is itself a source in edgeDict.
    only those TF to TF edges with at least cutoff loci are added to the graph

    returns the networkx DiGraph
    '''

    node_list = sorted(edgeDict.keys())
    #set for constant time "is this target a TF" checks
    node_set = set(node_list)

    #this is only edges between TFs
    graph = nx.DiGraph(name=analysis_name)
    graph.add_nodes_from(node_list)

    #this stores ALL edges identified by motifs
    edge_table = [['SOURCE','TARGET','CHROM','START','STOP','REGION_ID','TF_INTERACTION']]
    edge_output = '%s%s_EDGE_TABLE.txt' % (output_folder,analysis_name)

    for source in node_list:
        print(source)
        for target in sorted(edgeDict[source].keys()):

            #now we need to see which target regions this guy overlaps
            target_regions = gene_to_enhancer_dict[target]
            target_collection = utils.LocusCollection(target_regions,50)

            #get the edges hitting that target
            edgeLoci = edgeDict[source][target]
            is_tf_target = target in node_set
            tf_interaction = 1 if is_tf_target else 0

            #only add to the graph if this is a TF/TF interaction
            if len(edgeLoci) >= cutoff and is_tf_target:
                graph.add_edge(source,target)

            #now for each edge, add to the table
            for edgeLocus in edgeLoci:
                regionString = ','.join([locus.ID() for locus in target_collection.getOverlap(edgeLocus)])
                edgeLine = [source,target,edgeLocus.chr(),edgeLocus.start(),edgeLocus.end(),regionString,tf_interaction]
                edge_table.append(edgeLine)

    utils.unParseTable(edge_table,edge_output,'\t')
    return graph
Beispiel #32
0
def buildGraph(projectFolder, projectName, motifConvertFile, refseqToNameDict,
               canidateGenes):
    '''
    import the FIMO output once it's finished
    build the networkX directed graph

    motifConvertFile: tab delimited table, column 0 is the motif ID and column 1
                      the name it is converted to
    refseqToNameDict: maps the first '|' separated piece of FIMO column 2 to a gene name
    canidateGenes: nodes the graph starts out with

    reads projectFolder + 'FIMO/fimo.txt' (first row skipped as header).
    for every hit an edge source -> target is added, and the hit position
    (region start plus FIMO columns 3 and 4) is written to a per source bed file
    in projectFolder + 'motifBED/'

    returns the networkx DiGraph
    '''

    # from motif ID to the name used for the source node
    motifDatabaseDict = {}
    for line in utils.parseTable(motifConvertFile, '\t'):
        motifDatabaseDict[line[0]] = line[1]

    fimoFile = projectFolder + 'FIMO/fimo.txt'
    fimoTable = utils.parseTable(fimoFile, '\t')

    graph = nx.DiGraph(name=projectName)
    graph.add_nodes_from(canidateGenes)

    motifDict = defaultdict(list)
    for line in fimoTable[1:]:

        source = motifDatabaseDict[line[0]]  #motifId
        # the region field is split on '|': piece 0 is looked up in refseqToNameDict,
        # piece 1 becomes the bed chrom and piece 2 is the offset added to the hit positions
        region = line[2].split('|')
        target = refseqToNameDict[region[0]]
        graph.add_edge(source, target)

        regionStart = int(region[2])
        motifDict[source].append((region[1], regionStart + int(line[3]),
                                  regionStart + int(line[4])))

    utils.formatFolder(projectFolder + 'motifBED/', True)
    for gene in motifDict.keys():
        if motifDict[gene]:
            bed = [[loc[0], loc[1], loc[2]] for loc in motifDict[gene]]

            filename = projectFolder + 'motifBED/' + gene + '_' + projectName + '_motifs.bed'
            utils.unParseTable(bed, filename, '\t')

    return graph
Beispiel #33
0
def mergeCollections(superFile1,superFile2,name1,name2,output=''):

    '''
    pools the loci of two enhancer collections and stitches them together

    stitched loci that overlap both input collections are renamed
    CONSERVED_1, CONSERVED_2, ... all other loci keep their ID minus its
    first two characters

    returns the gff lines if output is empty, otherwise writes them to output
    and returns output
    '''

    conSuperCollection = makeSECollection(superFile1,name1)
    tnfSuperCollection = makeSECollection(superFile2,name2)

    #pool both sets of loci and stitch the pooled collection
    pooledLoci = conSuperCollection.getLoci() + tnfSuperCollection.getLoci()
    stitchedLoci = utils.LocusCollection(pooledLoci,50).stitchCollection().getLoci()

    renamedLoci = []
    nConserved = 0
    for locus in stitchedLoci:

        inFirst = len(conSuperCollection.getOverlap(locus)) > 0
        if inFirst and len(tnfSuperCollection.getOverlap(locus)):
            #present in both collections, gets a new unique identifier
            nConserved += 1
            locus._ID = 'CONSERVED_%s' % (str(nConserved))
        else:
            locus._ID = locus.ID()[2:]
        renamedLoci.append(locus)

    #now we turn this into a gff and write it out
    gff = utils.locusCollectionToGFF(utils.LocusCollection(renamedLoci,50))

    if len(output) == 0:
        return gff

    print("writing merged gff to %s" % (output))
    utils.unParseTable(gff,output,'\t')
    return output
Beispiel #34
0
def makeFoldTable(annotFile,
                  analysisName,
                  testName,
                  controlName,
                  testMMR,
                  controlMMR,
                  testIdxFile,
                  controlIdxFile,
                  outputFolder,
                  epsilon=1):
    '''
    makes the fold table and writes it to <outputFolder><analysisName>_log2Ratio.txt

    columns are guide ID, gene name, log2 ratio, test value, control value.
    for each row the count in column 2 is divided by the MMR and epsilon is
    added before the log2 ratio of test over control is taken.
    rows are written in the order of the test idx file, and the control idx
    file is read at the same row index
    NOTE(review): nothing checks that both idx files list the same guide on
    the same row -- confirm they are produced in matching order

    returns the output path
    '''

    #only the guide -> gene mapping is needed here
    guideDict, geneDict = makeAnnotDict(annotFile)

    testIdx = utils.parseTable(testIdxFile, '\t')
    controlIdx = utils.parseTable(controlIdxFile, '\t')

    outTable = [['GUIDE_ID', 'GENE', 'LOG2_RATIO', testName, controlName]]
    for i, testRow in enumerate(testIdx):

        guideID = testRow[0]
        gene = guideDict[guideID]

        #normalized counts, offset by epsilon to keep the ratio defined
        testCount = float(testRow[2]) / testMMR + epsilon
        controlCount = float(controlIdx[i][2]) / controlMMR + epsilon

        outTable.append([
            guideID, gene,
            numpy.log2(testCount / controlCount),
            round(testCount, 4),
            round(controlCount, 4)
        ])

    outputFile = '%s%s_log2Ratio.txt' % (outputFolder, analysisName)
    utils.unParseTable(outTable, outputFile, '\t')
    return outputFile
Beispiel #35
0
def mergeCollections(superFile1,superFile2,name1,name2,output=''):

    '''
    pools the loci of two enhancer collections and stitches them together

    stitched loci that overlap both input collections are renamed
    CONSERVED_1, CONSERVED_2, ... all other loci keep their ID minus its
    first two characters

    returns the gff lines if output is empty, otherwise writes them to output
    and returns output
    '''

    conSuperCollection = makeSECollection(superFile1,name1)
    tnfSuperCollection = makeSECollection(superFile2,name2)

    #pool both sets of loci and stitch the pooled collection
    pooledLoci = conSuperCollection.getLoci() + tnfSuperCollection.getLoci()
    stitchedLoci = utils.LocusCollection(pooledLoci,50).stitchCollection().getLoci()

    renamedLoci = []
    nConserved = 0
    for locus in stitchedLoci:

        inFirst = len(conSuperCollection.getOverlap(locus)) > 0
        if inFirst and len(tnfSuperCollection.getOverlap(locus)):
            #present in both collections, gets a new unique identifier
            nConserved += 1
            locus._ID = 'CONSERVED_%s' % (str(nConserved))
        else:
            locus._ID = locus.ID()[2:]
        renamedLoci.append(locus)

    #now we turn this into a gff and write it out
    gff = utils.locusCollectionToGFF(utils.LocusCollection(renamedLoci,50))

    if len(output) == 0:
        return gff

    print("writing merged gff to %s" % (output))
    utils.unParseTable(gff,output,'\t')
    return output
def main():

    '''
    compares the WT and Mut size selected canyon beds

    every overlapping WT/Mut canyon pair is written as
    [wt chrom, start, end, length, mut chrom, start, end, length]
    to tables/MUT_canyons_expanded.txt when the Mut canyon is longer and to
    tables/WT_canyons_expanded.txt when the WT canyon is longer.
    pairs of equal length are written to neither table
    '''

    projectFolder = '/storage/goodell/projects/jmreyes/amish_ayala/'
    bedFolder = projectFolder+'bed/'
    wtCanyonBed = bedFolder+'canyon_WT_sizeSelected.bed'
    mutCanyonBed = bedFolder+'canyon_Mut_sizeSelected.bed'

    def loadCanyons(bedPath, prefix):
        #one locus per bed row, named <prefix>chrom:start-end
        loci = []
        for x in utils.parseTable(bedPath, '\t'):
            locusID = prefix+str(x[0])+':'+str(x[1])+'-'+str(x[2])
            loci.append(utils.Locus(x[0], x[1], x[2], '.', locusID))
        return utils.LocusCollection(loci)

    wtCanyonLocusCollection = loadCanyons(wtCanyonBed, 'wt_')
    mutCanyonLocusCollection = loadCanyons(mutCanyonBed, 'mut_')
    overlappingCanyons = []

    wtExpansion = []
    mutExpansion = []

    #the unique lists and the counters are tallied but not written anywhere
    wtUnique = []
    mutUnique = []
    overlapCounter = 0
    mutOverlap = 0

    for locus in wtCanyonLocusCollection.getLoci():
        wtMutOverlap = mutCanyonLocusCollection.getOverlap(locus, 'both')
        if len(wtMutOverlap) == 0:
            wtUnique.append(locus)
            continue

        overlapCounter += 1
        for overlap in wtMutOverlap:
            wtLength = locus.end()-locus.start()
            mutLength = overlap.end()-overlap.start()
            newLine = [locus.chr(), locus.start(), locus.end(), wtLength, overlap.chr(), overlap.start(), overlap.end(), mutLength]
            if mutLength > wtLength:
                mutExpansion.append(newLine)
            elif wtLength > mutLength:
                wtExpansion.append(newLine)

    for locus in mutCanyonLocusCollection.getLoci():
        if len(wtCanyonLocusCollection.getOverlap(locus, 'both')) > 0:
            mutOverlap += 1
        else:
            mutUnique.append(locus)

    print(len(mutExpansion))
    print(len(wtExpansion))
    utils.unParseTable(mutExpansion, projectFolder+'tables/MUT_canyons_expanded.txt', '\t')
    utils.unParseTable(wtExpansion, projectFolder+'tables/WT_canyons_expanded.txt', '\t')
Beispiel #37
0
def generateSubpeakFASTA(TFandSuperDict, subpeaks, genomeDirectory, projectName, projectFolder, motifExtension):
    '''
    takes as input a BED file of constituents
    outputs a FASTA file of merged extended super-enhancer constituents and associated formatted name

    TFandSuperDict: gene -> list of region loci
    subpeaks: path of a tab delimited bed, columns 0,1,2 are chrom, start, end
    motifExtension: passed twice to utils.makeSearchLocus for every subpeak that
                    overlaps a region, before the extended subpeaks are stitched

    writes projectFolder + projectName + '_subpeaks.bed' (with a track line) and
    projectFolder + projectName + '_SUBPEAKS.fa'. fasta titles are
    gene|chrom|start|end and sequences are upper cased. returns None
    '''

    print('MAKE FASTA')

    subpeakDict = {}
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]
    subpeakTable = utils.parseTable(subpeaks, '\t')

    subpeakLoci = [utils.Locus(l[0], int(l[1]), int(l[2]), '.') for l in subpeakTable]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    for gene in TFandSuperDict.keys():
        subpeakDict[gene] = []
        for region in TFandSuperDict[gene]:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [utils.makeSearchLocus(x, motifExtension, motifExtension) for x in overlaps]

            # stitch so that extended subpeaks that now touch are reported once
            overlapCollection = utils.LocusCollection(extendedOverlaps, 50).stitchCollection()
            for overlap in overlapCollection.getLoci():
                subpeakBED.append([overlap.chr(), overlap.start(), overlap.end()])
                subpeakDict[gene].append(overlap)

    bedfilename = projectFolder + projectName + '_subpeaks.bed'
    utils.unParseTable(subpeakBED, bedfilename, '\t')

    fasta = []

    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:

            fastaTitle = gene + '|'  + subpeak.chr() + '|' + str(subpeak.start()) + '|' + str(subpeak.end())
            # NOTE(review): both coordinates are shifted by one before fetching -- confirm the convention utils.fetchSeq expects
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(), int(subpeak.start()+1), int(subpeak.end()+1))

            fasta.append('>' + fastaTitle)
            # str method instead of the bare upper() that needs "from string import upper"
            fasta.append(fastaLine.upper())

    # Output the fasta file of extended SE constituents
    outname = projectFolder + projectName + '_SUBPEAKS.fa'

    utils.unParseTable(fasta, outname, '')
def getExpanded(locusTable, expansion, status, output):

    '''
    filters a table of paired WT/Mut loci down to the expanded ones

    locusTable: tab delimited table, columns 0-3 describe the WT locus and
                columns 4-7 the Mut locus of each pair
    expansion: threshold handed to expansionStat
    status: 'WT' tests the WT locus against the Mut locus, 'MUT' the other way
            around. any other value keeps nothing
    output: path the kept lines are written to

    a pair is kept when expansionStat returns a non empty line for it
    '''

    loci = utils.parseTable(locusTable, '\t')
    expandedList = []
    for line in loci:
        wtLocus = line[0:4]
        mutLocus = line[4:8]

        if status == 'WT':
            newLine = expansionStat(wtLocus, mutLocus, expansion=expansion)
        elif status == 'MUT':
            newLine = expansionStat(mutLocus, wtLocus, expansion=expansion)
        else:
            continue

        if len(newLine) > 0:
            expandedList.append(newLine)

    print('%s  expanded loci in  %s' % (len(expandedList), status))
    utils.unParseTable(expandedList, output, '\t')
Beispiel #39
0
def makeRigerTable(foldTableFile, output=''):
    '''
    converts a fold table into a riger style construct table

    foldTableFile: tab delimited table with a header row; column 1 is the gene
                   and column 2 the score the constructs are ordered by (decreasing)
    output: output path, defaults to foldTableFile with _log2Ratio.txt
            replaced by _friger.txt

    genes that appear on only one row are reported and left out.
    ranks count only the rows that are kept and the hairpin weight is always 1

    returns the output path
    '''

    foldTable = utils.parseTable(foldTableFile, '\t')
    dataRows = foldTable[1:]

    constructOrder = utils.order([float(row[2]) for row in dataRows],
                                 decreasing=True)

    #number of constructs per gene
    print("making gene count dictionary")
    geneCountDict = defaultdict(int)
    for row in dataRows:
        geneCountDict[row[1]] += 1

    rigerTable = [[
        'Construct', 'GeneSymbol', 'NormalizedScore', 'Construct Rank',
        'HairpinWeight'
    ]]

    print("iterating through constructs")
    for i in constructOrder:
        construct = foldTable[i + 1]  # shifted by one for the header
        geneName = construct[1]
        if geneCountDict[geneName] == 1:
            print(
                "Gene %s only has one guide RNA. Excluding from FRIGER analysis"
                % (geneName))
            continue

        #the header occupies slot 0, so the table length is the next rank
        rigerTable.append(construct[0:3] + [len(rigerTable), 1])

    if len(output) == 0:
        output = foldTableFile.replace('_log2Ratio.txt', '_friger.txt')

    utils.unParseTable(rigerTable, output, '\t')

    return output
Beispiel #40
0
def main():

    '''
    writes the flanking windows of the WT size selected canyons

    for every canyon whose start is larger than the extension, a window of
    +/- extension around the canyon start (5Flank) and around the canyon end
    (3Flank) is written to the upstream and downstream bed respectively.
    canyons that start within the extension are dropped
    '''

    projectFolder = '/storage/goodell/projects/jmreyes/amish_ayala/'
    bedFolder = projectFolder + 'bed/'
    canyonBed = bedFolder + 'canyon_WT_sizeSelected.bed'

    extension = 1000
    sampleName = 'WT'

    namePrefix = 'HG19_' + sampleName + '_' + str(extension)
    outputUp = namePrefix + 'extend_upstreamFlanking.bed'
    outputDown = namePrefix + 'extend_downstreamFlanking.bed'

    upstreamEdgeBed = []
    downstreamEdgeBed = []
    for line in utils.parseTable(canyonBed, '\t'):
        print(line)
        chrom = line[0]
        start = int(line[1])
        end = int(line[2])
        canyon_name = sampleName + '_' + str(chrom) + '(.):' + str(
            start) + '-' + str(end)

        #too close to the chromosome start to fit a full window
        if start <= extension:
            continue

        upstreamEdgeBed.append(
            [chrom, start - extension, start + extension, canyon_name + '_5Flank'])
        downstreamEdgeBed.append(
            [chrom, end - extension, end + extension, canyon_name + '_3Flank'])

    utils.unParseTable(upstreamEdgeBed, bedFolder + outputUp, '\t')
    utils.unParseTable(downstreamEdgeBed, bedFolder + outputDown, '\t')
Beispiel #41
0
def calculatePromoterActivity(annotationFile,
                              bamFile,
                              projectName,
                              projectFolder,
                              refseqToNameDict,
                              background=False):
    '''
    maps bam signal onto the TSS regions of an annotation

    builds one TSS locus per gene of the start dict (utils.makeTSSLocus with
    2500, 2500), writes them to projectFolder + projectName + '_TSS.gff' and
    runs bamliquidator_batch on that gff with output folder
    projectFolder + 'bamliquidator'

    NOTE(review): refseqToNameDict is not used, and the bam objects opened for
    bamFile and background are not used after construction. nothing is returned
    '''

    print('GENERATING AN ACTIVITY TABLE USING CHIP DATA')

    #loaded but not used further below
    annotTable = utils.parseTable(annotationFile, '\t')
    output = []
    counter = 0

    bam = utils.Bam(bamFile)
    if background:
        background = utils.Bam(background)

    startDict = utils.makeStartDict(annotationFile)

    tssLoci = [utils.makeTSSLocus(gene, startDict, 2500, 2500) for gene in startDict]
    gff = utils.locusCollectionToGFF(utils.LocusCollection(tssLoci, 50))

    outputname = projectFolder + projectName + '_TSS.gff'
    utils.unParseTable(gff, outputname, '\t')

    #assemble the bamliquidator call in one go
    mappingCmd = ('bamliquidator_batch' + ' -r ' + outputname + ' -o ' +
                  projectFolder + 'bamliquidator' + ' -m -e 200 ' + bamFile)

    subprocess.call(mappingCmd, shell=True)

    print(mappingCmd)
Beispiel #42
0
def makeRigerTable(foldTableFile,output=''):

    '''
    converts a fold table into a riger style construct table

    foldTableFile: tab delimited table with a header row; column 1 is the gene
                   and column 2 the score the constructs are ordered by (decreasing)
    output: output path, defaults to foldTableFile with _log2Ratio.txt
            replaced by _friger.txt

    genes that appear on only one row are reported and left out.
    ranks count only the rows that are kept and the hairpin weight is always 1

    returns the output path
    '''

    foldTable = utils.parseTable(foldTableFile,'\t')
    dataRows = foldTable[1:]

    constructOrder = utils.order([float(row[2]) for row in dataRows],decreasing=True)

    #number of constructs per gene
    print("making gene count dictionary")
    geneCountDict = defaultdict(int)
    for row in dataRows:
        geneCountDict[row[1]] +=1

    rigerTable = [['Construct','GeneSymbol','NormalizedScore','Construct Rank','HairpinWeight']]

    print("iterating through constructs")
    for i in constructOrder:
        construct = foldTable[i+1] # shifted by one for the header
        geneName = construct[1]
        if geneCountDict[geneName] == 1:
            print("Gene %s only has one guide RNA. Excluding from FRIGER analysis" % (geneName))
            continue

        #the header occupies slot 0, so the table length is the next rank
        rigerTable.append(construct[0:3] + [len(rigerTable),1])

    if len(output) == 0:
        output = foldTableFile.replace('_log2Ratio.txt','_friger.txt')

    utils.unParseTable(rigerTable,output,'\t')

    return output
Beispiel #43
0
def collapseRegionMap(regionMapFile,name='',controlBams=False):

    '''
    takes a regionMap file and collapses signal into a single column
    needs to take into account whether or not controls were used

    regionMapFile: tab delimited table with a header row. the first 6 columns
                   are kept as is, every column after that is parsed as a float
    name: header of the collapsed signal column, defaults to 'MERGED_SIGNAL'
    controlBams: if true the signal columns are treated as two halves, the
                 first half being the rankby signal and the second half the
                 matching control signal. each control is subtracted from its
                 rankby column (floored at 0) before averaging

    writes the collapsed table next to the input, with 'REGION' replaced by
    'META' in the path, and returns that path
    '''

    regionMap = utils.parseTable(regionMapFile,'\t')

    if len(name) == 0:
        name = 'MERGED_SIGNAL'

    for n,line in enumerate(regionMap):

        if n == 0:
            #new header
            regionMap[n] = line[0:6] + [name]
            continue

        signalLine = [float(x) for x in line[6:]]
        if controlBams:
            #integer division so the split point is an int on python 2 and 3
            half = len(signalLine) // 2
            #min signal is 0
            metaVector = [max(0,rankby - control) for rankby,control in zip(signalLine[:half],signalLine[half:])]
            metaSignal = numpy.mean(metaVector)
        else:
            metaSignal = numpy.mean(signalLine)
        regionMap[n] = list(line[0:6]) + [metaSignal]

    #str.replace works on python 2 and 3, string.replace is python 2 only
    outputFile = regionMapFile.replace('REGION','META')
    utils.unParseTable(regionMap,outputFile,'\t')
    return outputFile
Beispiel #44
0
def main():
    '''
    command line entry point
    maps a bam to the regions of a .gff (or an enriched region file that is
    first converted to a .gff) and writes the mapped table to the output file

    exactly one of -c (fixed bin size clustergram) or -m (fixed bin number
    matrix) has to be given, the mapping itself is done by mapBamToGFF
    '''
    from optparse import OptionParser
    usage = "usage: %prog [options] -b [SORTED BAMFILE] -i [INPUTFILE] -o [OUTPUTFILE]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-b","--bam", dest="bam",nargs = 1, default=None,
                      help = "Enter .bam file to be processed.")
    parser.add_option("-i","--input", dest="input",nargs = 1, default=None,
                      help = "Enter .gff or ENRICHED REGION file to be processed.")
    #output flag
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the output filename.")
    #additional options
    parser.add_option("-s","--sense", dest="sense",nargs = 1, default='.',
                      help = "Map to '+','-' or 'both' strands. Default maps to both.")
    parser.add_option("-e","--extension", dest="extension",nargs = 1, default=200,
                      help = "Extends reads by n bp. Default value is 200bp")
    parser.add_option("-r","--rpm", dest="rpm",action = 'store_true', default=False,
                      help = "Normalizes density to reads per million (rpm)")
    parser.add_option("-c","--cluster", dest="cluster",nargs = 1, default=None,
                      help = "Outputs a fixed bin size clustergram. user must specify bin size.")
    parser.add_option("-m","--matrix", dest="matrix",nargs = 1, default=None,
                      help = "Outputs a variable bin sized matrix. User must specify number of bins.")
    (options,args) = parser.parse_args()

    print(options)
    print(args)

    def usageError(message):
        #every bad flag is reported the same way: message, help, stop
        print(message)
        parser.print_help()
        exit()

    if options.sense:
        if options.sense not in ['+','-','.','both']:
            usageError('ERROR: sense flag must be followed by +,-,.,both')

    if options.cluster and options.matrix:
        usageError('ERROR: Cannot specify both matrix and clustergram flags.')

    if options.matrix:
        try:
            int(options.matrix)
        except (TypeError,ValueError):
            usageError('ERROR: User must specify an integer bin number for matrix (try 50)')

    if options.cluster:
        try:
            int(options.cluster)
        except (TypeError,ValueError):
            usageError('ERROR: User must specify an integer bin size for clustergram (try 25)')

    if not (options.input and options.bam):
        parser.print_help()
        return

    #without one of these there is nothing to map and nothing to write
    #check before any file conversion is done
    if not options.cluster and not options.matrix:
        usageError('ERROR: User must specify either the clustergram (-c) or the matrix (-m) flag.')

    inputFile = options.input
    if inputFile.split('.')[-1] != 'gff':
        print('converting file to a .gff')
        gffFile = convertEnrichedRegionsToGFF(inputFile)
    else:
        gffFile = inputFile

    bamFile = options.bam

    if options.output is None:
        #default output sits in the working directory
        #joined so a path separator ends up between folder and file name
        output = os.path.join(os.getcwd(),inputFile.split('/')[-1]+'.mapped')
    else:
        output = options.output

    if options.cluster:
        print('mapping to GFF and making clustergram with fixed bin width')
        newGFF = mapBamToGFF(bamFile,gffFile,options.sense,int(options.extension),options.rpm,int(options.cluster),None)
    else:
        print('mapping to GFF and making a matrix with fixed bin number')
        newGFF = mapBamToGFF(bamFile,gffFile,options.sense,int(options.extension),options.rpm,None,int(options.matrix))

    print('bamToGFF_turbo writing output to: %s' % (output))
    # Hackjob to make subdirectories for ROSE integration
    # best effort on purpose: the folder usually exists already
    try:
        os.mkdir(os.path.dirname(output))
    except OSError:
        pass
    utils.unParseTable(newGFF,output,'\t')
Beispiel #45
0
def makeBamPlotTables(gff, genome, bamFileList, colorList, nBins, sense, extension, rpm, outFolder, names, title, bedCollection):
    '''
    makes a plot table for each line of the gff mapped against all the bams in the bamList

    gff: either a path to a tab delimited gff or an already parsed gff table
    genome: passed to loadAnnotFile to get the gene dictionary and transcript collection
    bamFileList, names, colorList: parallel lists, entry i of each describes dataset i
    nBins, sense, extension: passed through to mapBamToGFFLine
    rpm: if true each bam is given its million mapped reads value, taken from
         the third column of 'samtools idxstats'. otherwise the value is 1
    outFolder, title: prefix of every file written here
    bedCollection: passed through to mapGFFLineToBed

    for every gff line the diagram, name and bed tables are written by
    mapGFFLineToAnnot / mapGFFLineToBed and a '_plotTemp.txt' table is written here
    returns the path of the summary table that lists all of these files
    '''

    # load in the gff
    if isinstance(gff, str):
        gff = utils.parseTable(gff, '\t')

    # load in the annotation
    print('loading in annotation for %s' % (genome))
    geneDict, txCollection = loadAnnotFile(genome)

    # make an MMR dict so MMRs are only computed once
    print('Getting information about read depth in bams')
    mmrDict = {}
    for bamFile in bamFileList:
        if not rpm:
            # no normalization requested, no need to ask samtools for read counts
            mmrDict[bamFile] = 1
            continue

        # millionMappedReads
        idxCmd = 'samtools idxstats %s' % (bamFile)
        # universal_newlines makes communicate() return text on python 3 as well
        idxPipe = subprocess.Popen(idxCmd, stdin=subprocess.PIPE, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True, universal_newlines=True)
        idxStats = idxPipe.communicate()[0].split('\n')
        idxStats = [line.split('\t') for line in idxStats]

        # the trailing empty line has no count column and is skipped
        rawCount = sum([int(line[2]) for line in idxStats if len(line) > 2])
        mmrDict[bamFile] = round(float(rawCount) / 1000000, 4)

    # go line by line in the gff
    summaryTable = [['DIAGRAM_TABLE', 'NAME_TABLE', 'BED_DIAGRAM_TABLE', 'BED_NAME_TABLE', 'PLOT_TABLE', 'CHROM', 'ID', 'SENSE', 'START', 'END']]
    binHeader = ['bin_' + str(n) for n in range(1, int(nBins) + 1, 1)]
    for ticker, gffLine in enumerate(gff, 1):
        gffString = 'line_%s_%s_%s_%s_%s_%s' % (ticker, gffLine[0], gffLine[1], gffLine[6], gffLine[3], gffLine[4])
        print('writing the gene diagram table for region %s' % (gffLine[1]))
        mapGFFLineToAnnot(gffLine, outFolder, nBins, geneDict, txCollection, sense='both', header=gffString)
        mapGFFLineToBed(gffLine, outFolder, nBins, bedCollection, header=gffString)

        outTable = [['BAM', 'GENE_ID', 'NAME', 'LOCUSLINE', 'COLOR1', 'COLOR2', 'COLOR3'] + binHeader]

        for i, bamFile in enumerate(bamFileList):
            name = names[i]
            color = colorList[i]
            print('getting data for location %s in dataset %s' % (gffLine[1], bamFile))
            mmr = mmrDict[bamFile]
            outTable.append(mapBamToGFFLine(bamFile, mmr, name, gffLine, color, nBins, sense, extension))

        # get the gene name
        # 'in' instead of has_key, which no longer exists on python 3
        if gffLine[1] in geneDict:
            geneName = geneDict[gffLine[1]].commonName()
        else:
            geneName = gffLine[1]

        diagramTable = outFolder + gffString + '_diagramTemp.txt'
        plotTable = outFolder + gffString + '_plotTemp.txt'
        nameTable = outFolder + gffString + '_nameTemp.txt'
        bedNameTable = outFolder + gffString + '_bedNameTemp.txt'
        bedDiagramTable = outFolder + gffString + '_bedDiagramTemp.txt'
        utils.unParseTable(outTable, plotTable, '\t')
        summaryTable.append([diagramTable, nameTable, bedDiagramTable, bedNameTable, plotTable, gffLine[0], geneName, gffLine[6], gffLine[3], gffLine[4]])

    summaryTableFileName = "%s%s_summary.txt" % (outFolder, title)
    utils.unParseTable(summaryTable, summaryTableFileName, '\t')
    return summaryTableFileName
Beispiel #46
0
def mapGFFLineToAnnot(gffLine, outFolder, nBins, geneDict, txCollection, sense='both', header=''):
    '''
    writes the rectangles and gene names needed to draw the gene diagram of one gff line

    gffLine: one gff row, columns 0, 1, 3, 4 and 6 are read as
             chrom, ID, start, end and sense
    outFolder, header: the tables are written to
                       outFolder + header + '_diagramTemp.txt' / '_nameTemp.txt'.
                       with an empty header a prefix is built from the gff line
    nBins: number of bins the region is scaled to
    geneDict: gene objects keyed by the IDs of the loci in txCollection
    txCollection: collection that is queried for transcripts overlapping the region
    sense: accepted but not used, overlaps are always collected with sense='both'
    '''

    # caller supplied prefix wins, otherwise derive one from the line
    if len(header) != 0:
        gffString = header
    else:
        gffString = '%s_%s_%s_%s' % (gffLine[0], gffLine[6], gffLine[3], gffLine[4])

    gffLocus = utils.Locus(gffLine[0], int(gffLine[3]), int(gffLine[4]), gffLine[6], gffLine[1])
    regionLength = gffLocus.len()

    # genomic distance -> bin units
    scaleFactor = float(nBins) / regionLength
    # plotting buffer for diagrams
    plotBuffer = int(regionLength / float(nBins) * 20)

    # everything is drawn relative to the side of the region picked by its sense
    refPoint = int(gffLine[4]) if gffLine[6] == '-' else int(gffLine[3])

    def toPlotUnits(coords):
        # distance of every coordinate from the reference point, in bin units
        return [abs(x - refPoint) * scaleFactor for x in coords]

    # both tables start with a placeholder row
    diagramTable = [[0, 0, 0, 0]]
    nameTable = [['', 0, 0]]

    geneList = [locus.ID() for locus in txCollection.getOverlap(gffLocus, sense='both')]

    # genes already placed, padded by the plot buffer, used to stack later genes lower
    offsetCollection = utils.LocusCollection([], 500)
    for geneID in geneList:

        gene = geneDict[geneID]

        print(gene.commonName())
        name = gene.commonName() if len(gene.commonName()) > 1 else geneID

        # each overlapping gene that was drawn before pushes this one down by 4
        offset = 4 * len(offsetCollection.getOverlap(gene.txLocus()))
        offsetCollection.append(utils.makeSearchLocus(gene.txLocus(), plotBuffer, plotBuffer))

        # the label goes at the start of the gene, which depends on its sense
        if gene.sense() == '+':
            labelAnchor = gene.txLocus().start()
        else:
            labelAnchor = gene.txLocus().end()
        nameTable.append([name, abs(labelAnchor - refPoint) * scaleFactor, -2 - offset])

        # a thin line across the entire txLocus
        [left, right] = toPlotUnits(gene.txLocus().coords())
        diagramTable.append([left, -0.01 - offset, right, 0.01 - offset])

        # thin boxes for all txExons
        for txExon in gene.txExons():
            [left, right] = toPlotUnits(txExon.coords())
            diagramTable.append([left, -0.5 - offset, right, 0.5 - offset])

        # fat boxes for the coding exons if any
        for cdExon in gene.cdExons():
            [left, right] = toPlotUnits(cdExon.coords())
            diagramTable.append([left, -1 - offset, right, 1 - offset])

    utils.unParseTable(diagramTable, outFolder + gffString + '_diagramTemp.txt', '\t')
    utils.unParseTable(nameTable, outFolder + gffString + '_nameTemp.txt', '\t')
Beispiel #47
0
def findCanidateTFs(genome, enhancer_gff, expressedNM, expressionDictNM,
                    bamFile, TFlist, refseqToNameDict, projectFolder, projectName, promoter):
    '''
    Assign each enhancer to TF genes and write the enhancer assignment table

    genome: object providing returnFeature('annot_file') and returnFeature('tf_file')
    enhancer_gff: gff of enhancer regions, loaded with utils.gffToLocusCollection
    expressionDictNM: activity keyed by gene ID, used to pick the most active
                      gene when several TSSs are within 100 kb
    TFlist: gene IDs that are allowed to receive an enhancer
    projectFolder, projectName: prefix of the '_ENHANCER_ASSIGNMENT.txt' output
    promoter: if true, a +/- 2000 bp promoter locus is added for every assigned
              gene whose promoter does not overlap one of its enhancers
    expressedNM, bamFile, refseqToNameDict: kept in the signature, not used here

    for each enhancer the candidates are, in order of priority
      1. all genes whose TSS overlaps the enhancer
      2. the most active gene with a TSS within 100 kb
      3. the gene with the closest TSS within 1 million bp
    a candidate is only assigned if it is in TFlist

    Return a dictionary keyed by TF that points to a list of loci
    '''

    #loading in the enhancer gff regions
    enhancer_collection = utils.gffToLocusCollection(enhancer_gff)
    enhancer_loci = enhancer_collection.getLoci()

    #loading in the genome and TF info
    annot_file = genome.returnFeature('annot_file')
    startDict = utils.makeStartDict(annot_file)

    tf_table = utils.parseTable(genome.returnFeature('tf_file'),'\t')
    refID_list = [line[0] for line in tf_table] #creates a list of all NM IDs for TFs

    #make a collection of all TF TSSs
    #this is a precise 1 coordinate TSS locus
    tssLoci = [utils.makeTSSLocus(refID,startDict,0,0) for refID in refID_list]
    tssCollection = utils.LocusCollection(tssLoci,50)

    #enhancer level table followed by a gene level table
    #NOTE(review): both are filled in below but are not written or returned yet
    enhancerTable = [['ENHANCER_ID','CHROM','START','STOP','GENE_LIST']]
    gene_to_enhancer_dict = defaultdict(list)

    #results of the TF assignment, these were never initialized before
    enhancerAssignment = []
    TFtoEnhancerDict = defaultdict(list)

    # Loop through enhancers
    #all gene names stored by refID
    for enhancer in enhancer_loci:

        # If the enhancer overlaps a TSS, save it
        overlapping_loci = tssCollection.getOverlap(enhancer, 'both')
        overlappingGenes = [locus.ID() for locus in overlapping_loci]

        # Find all gene TSS within 100 kb
        proximal_loci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,100000,100000),'both')
        proximalGenes = [locus.ID() for locus in proximal_loci]

        # If no genes are within 100 kb, find the closest gene within 1 million bp
        closestGene = ''
        if len(overlappingGenes) == 0 and len(proximalGenes) == 0:

            distal_loci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,1000000,1000000),'both')
            distalGenes = [locus.ID() for locus in distal_loci]

            #integer division keeps the center an int on python 2 and 3
            enhancerCenter = (int(enhancer.start()) + int(enhancer.end())) // 2
            distance_list = [abs(enhancerCenter - startDict[geneID]['start'][0])
                             for geneID in distalGenes]
            if len(distance_list) > 0:
                closestGene = distalGenes[distance_list.index(min(distance_list))]

        #now we have all potential gene cases
        all_refIDs = overlappingGenes + proximalGenes + [closestGene]

        #now we get all names and refIDs, the empty closestGene placeholder is dropped
        all_refIDs = utils.uniquify([refID for refID in all_refIDs if len(refID) > 0])
        all_names = utils.uniquify([startDict[refID]['name'] for refID in all_refIDs])

        #first do enhancer level assignment
        names_string = ','.join(all_names)
        enhancerTable.append([enhancer.ID(),enhancer.chr(),enhancer.start(),enhancer.end(),names_string])

        #now do gene level assignment
        #an enhancer can be assigned to multiple genes
        for refID in all_refIDs:
            gene_to_enhancer_dict[refID].append(enhancer.ID())

        #a gene that overlaps the enhancer is not also counted as proximal
        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        # If a TSS overlaps an enhancer, assign them together
        if overlappingGenes:
            assignedGenes = [gene for gene in overlappingGenes if gene in TFlist]

        # Otherwise, assign the enhancer to the most active gene in 100 kb
        elif proximalGenes:
            highestGene = ''
            highestActivity = 0
            for gene in proximalGenes:
                if expressionDictNM[gene] > highestActivity:
                    highestActivity = expressionDictNM[gene]
                    highestGene = gene
            #assign to the most active gene itself, not to whatever gene
            #happened to be last in the loop
            if highestGene in TFlist:
                assignedGenes = [highestGene]
            else:
                assignedGenes = []

        # Otherwise fall back to the closest gene, if there is one
        elif closestGene and closestGene in TFlist:
            assignedGenes = [closestGene]

        else:
            assignedGenes = []

        for gene in assignedGenes:
            TFtoEnhancerDict[gene].append(enhancer)
            enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()])

    # Add the promoter if it's not contained in one of the gene's enhancers
    if promoter:
        for gene in list(TFtoEnhancerDict.keys()):
            tss = int(startDict[gene]['start'][0])
            promoterLocus = utils.Locus(startDict[gene]['chr'], tss - 2000,
                                        tss + 2000, startDict[gene]['sense'])
            overlapBool = False
            for enhancer in TFtoEnhancerDict[gene]:
                if promoterLocus.overlaps(enhancer):
                    overlapBool = True
            if not overlapBool:
                TFtoEnhancerDict[gene].append(promoterLocus)

    seAssignmentFile = projectFolder + projectName + '_ENHANCER_ASSIGNMENT.txt'
    utils.unParseTable(enhancerAssignment, seAssignmentFile, '\t')

    return TFtoEnhancerDict
Beispiel #48
0
def collapseFimo(fimo_output,gene_to_enhancer_dict,candidate_tf_list,output_folder,analysis_name,motifConvertFile):

    '''
    collapses motifs from fimo
    for each source node (TF) and each target node (gene enhancer regions), collapse motif instances
    then spit out a ginormous set of beds and a single crazy collapsed bed

    fimo_output: tab delimited fimo table with a header row. the region column
                 holds 'target|chrom|start' and the motif coordinates in the
                 two columns after it are offsets added to that start
    gene_to_enhancer_dict: kept in the signature, not used here
    candidate_tf_list: TFs that are allowed as source nodes
    output_folder, analysis_name: the per TF beds go to output_folder/motif_beds/
                 and the combined bed to output_folder + analysis_name + '_all_motifs.bed'
    motifConvertFile: tab delimited table of motif name (column 0) to gene name (column 1)

    returns a dictionary keyed by TF, then by target, pointing to the list of
    stitched motif loci
    '''

    #first build up the motif name conversion database

    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = defaultdict(list)
    # The reverse of the other dict, from motif name to gene name
    # a motif can go to multiple genes
    for line in motifDatabase:
        motifDatabaseDict[line[0]].append(line[1])

    #make the folder to store motif beds
    utils.formatFolder('%smotif_beds/' % (output_folder),True)

    edgeDict = {}
    #first layer are source nodes
    for tf in candidate_tf_list:
        edgeDict[tf] = defaultdict(list) #next layer are target nodes which are derived from the fimo output

    fimoTable = utils.parseTable(fimo_output,'\t')
    print(fimo_output)

    #fimo sometimes puts the region in either the first or second column
    #a table without any hit has no data line to look at, the index is then
    #never used and the default is kept
    if len(fimoTable) > 1 and fimoTable[1][1].count('|') > 0:
        region_index = 1
    else:
        region_index = 2
    print('USING COLUMN %s OF FIMO OUTPUT FOR REGION' % (region_index))

    #the motif start and stop offsets are in the two columns after the region
    start_index = region_index + 1
    stop_index = region_index + 2

    #set lookup so every fimo line is not a scan through the whole TF list
    candidate_tf_set = set(candidate_tf_list)

    for line in fimoTable[1:]:
        source_tfs = motifDatabaseDict[line[0]]   #motifId
        for source in source_tfs:
            if source not in candidate_tf_set:
                continue
            region = line[region_index].split('|')

            target = region[0]
            region_start = int(region[2])
            target_locus = utils.Locus(region[1],region_start + int(line[start_index]), region_start + int(line[stop_index]),'.')
            #what's missing here is the enhancer id of the target locus
            #source is a candidate TF, so edgeDict[source] always exists
            edgeDict[source][target].append(target_locus)

    #now we actually want to collapse this down in a meaningful way
    #overlapping motifs count as a single binding site. This way a TF with tons of motifs
    #that finds the same site over and over again doesn't get over counted
    all_bed = []
    all_bed_path = '%s%s_all_motifs.bed' % (output_folder,analysis_name)
    for tf in candidate_tf_list:
        print(tf)
        #snapshot of the keys, the values are replaced inside the loop
        target_nodes = list(edgeDict[tf].keys())
        bed_header = ['track name = "%s" description="%s motifs in %s"' % (tf,tf,analysis_name)]
        all_bed.append(bed_header)
        target_bed = [bed_header]
        target_bed_path = '%smotif_beds/%s_motifs.bed' % (output_folder,tf)
        for target in target_nodes:
            edgeCollection = utils.LocusCollection(edgeDict[tf][target],50)
            edgeCollection = edgeCollection.stitchCollection()
            edgeLoci = edgeCollection.getLoci()
            edgeDict[tf][target] = edgeLoci
            for locus in edgeLoci:
                bed_line = [locus.chr(),locus.start(),locus.end(),target,'','+']
                target_bed.append(bed_line)
                all_bed.append(bed_line)

        utils.unParseTable(target_bed,target_bed_path,'\t')

    #now the loci are all stitched up
    utils.unParseTable(all_bed,all_bed_path,'\t')
    return edgeDict
def main():
    '''
    main run call

    maps the enhancers of a ROSE ranked enhancer file to genes and writes the
    enhancer to gene and gene to enhancer tables
    with a rankby bam (-r) the mapping is done by mapEnhancerToGeneTop and an
    additional enhancer to top gene table is written, without it the
    traditional mapEnhancerToGene mapping is used
    a search window other than the 50,000bp default is added to the output names
    '''

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_ENHANCER_FILE]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a ROSE ranked enhancer or super-enhancer file")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter the bam used to rank enhancers")
    parser.add_option("-c", "--control", dest="control", nargs=1, default='',
                      help="Enter a background bam for background correction")

    parser.add_option("-l", "--list", dest="geneList", nargs=1, default=None,
                      help="Enter a gene list to filter through")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder. Default will be same folder as input file")
    parser.add_option(
        "-w", "--window", dest="window", nargs=1, default=50000,
        help="Enter a search distance for genes. Default is 50,000bp")
    parser.add_option(
        "-f", "--format", dest="formatTable", action="store_true", default=False,
        help="If flagged, maintains original formatting of input table")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    # rankby is an optional flag, requiring it here made the traditional
    # mapping branch below unreachable
    if not options.input or not options.genome:

        parser.print_help()
        exit()

    print(options)

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }

    if genome.upper() not in genomeDict:
        # explain the problem instead of dying with a bare KeyError
        print('ERROR: UNSUPPORTED GENOME TYPE %s' % (genome))
        parser.print_help()
        exit()
    annotFile = genomeDict[genome.upper()]

    # GETTING THE INPUT
    enhancerFile = options.input
    window = int(options.window)

    # making the out folder if it doesn't exist
    if options.out:
        outFolder = utils.formatFolder(options.out, True)
    elif '/' in enhancerFile:
        # same folder as the input file
        outFolder = '/'.join(enhancerFile.split('/')[0:-1]) + '/'
    else:
        # a bare file name lives in the working directory, not in '/'
        outFolder = './'

    # GETTING BAM INFO
    rankByBamFile = options.rankby
    controlBamFile = options.control

    # CHECK FORMATTING FLAG
    noFormatTable = bool(options.formatTable)

    # GETTING THE TRANSCRIBED LIST
    if options.geneList:
        transcribedFile = options.geneList
    else:
        transcribedFile = ''

    # output names only carry the window when it is not the default
    # integer division keeps the KB value an int on python 2 and 3
    if window != 50000:
        windowSuffix = '_%sKB' % (window // 1000)
    else:
        windowSuffix = ''
    enhancerFileName = enhancerFile.split('/')[-1].split('.')[0]
    outPrefix = '%s%s' % (outFolder, enhancerFileName)

    if options.rankby:
        enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable = mapEnhancerToGeneTop(
            rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile, True, window, noFormatTable)
        outputTables = [
            ('ENHANCER_TO_GENE', enhancerToGeneTable),
            ('ENHANCER_TO_TOP_GENE', enhancerToTopGeneTable),
            ('GENE_TO_ENHANCER', geneToEnhancerTable),
        ]
    else:
        #do traditional mapping
        enhancerToGeneTable, geneToEnhancerTable = mapEnhancerToGene(
            annotFile, enhancerFile, transcribedFile, True, window, noFormatTable)
        outputTables = [
            ('ENHANCER_TO_GENE', enhancerToGeneTable),
            ('GENE_TO_ENHANCER', geneToEnhancerTable),
        ]

    # Writing enhancer output
    for tableName, table in outputTables:
        outPath = '%s_%s%s.txt' % (outPrefix, tableName, windowSuffix)
        print("writing output to %s" % (outPath))
        utils.unParseTable(table, outPath, '\t')
Beispiel #50
0
def mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, output, refName):
    '''
    makes a table of factor density in a stitched locus, one row per stitched
    locus and one signal column per bam, and writes it to output

    stitchedCollection: collection of stitched loci, chrY loci are dropped
    referenceCollection: collection of the constituent regions, the summed
                         length of those overlapping a stitched locus goes
                         into CONSTITUENT_SIZE
    bamFileList: bams that were mapped beforehand, for each one the table
                 mappedFolder + refName + '_' + bam file name + '_MAPPED/matrix.txt'
                 is read
    output: path of the table to write

    rows follow the order given by utils.order(..., decreasing=True) on locus length
    '''

    print('FORMATTING TABLE')
    locusTable = [['REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE']]

    loci = stitchedCollection.getLoci()

    # strip out any that are in chrY
    for chrYLocus in [locus for locus in loci if locus.chr() == 'chrY']:
        loci.remove(chrYLocus)

    lenOrder = utils.order([locus.len() for locus in loci], decreasing=True)
    for ticker, i in enumerate(lenOrder, 1):
        # progress report
        if ticker % 1000 == 0:
            print(ticker)
        locus = loci[i]

        # First get the size of the enriched regions within the stitched locus
        refEnrichSize = sum([refLocus.len() for refLocus in referenceCollection.getOverlap(locus, 'both')])

        # the stitched ID starts with the number of loci, fall back to 1 if it does not
        regionID = locus.ID()
        try:
            stitchCount = int(regionID.split('_')[0])
        except ValueError:
            stitchCount = 1

        coords = [int(x) for x in locus.coords()]
        locusTable.append([regionID, locus.chr(), min(coords), max(coords), stitchCount, refEnrichSize])

    print('GETTING MAPPED DATA')
    print("USING A BAMFILE LIST:")
    print(bamFileList)
    for bamFile in bamFileList:

        bamFileName = bamFile.split('/')[-1]

        print('GETTING MAPPING DATA FOR  %s' % bamFile)
        # assumes standard convention for naming enriched region gffs
        matrixPath = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, refName, bamFileName)

        # opening up the mapped GFF
        print('OPENING %s' % (matrixPath))
        mappedGFF = utils.parseTable(matrixPath, '\t')

        print('MAKING SIGNAL DICT FOR %s' % (bamFile))
        signalDict = defaultdict(float)
        mappedLoci = []
        for mappedLine in mappedGFF[1:]:
            regionID = mappedLine[0]
            # column 1 is read as chrom(...):start-end
            chrom = mappedLine[1].split('(')[0]
            coordFields = mappedLine[1].split(':')[-1].split('-')
            start = int(coordFields[0])
            end = int(coordFields[1])
            mappedLoci.append(utils.Locus(chrom, start, end, '.', regionID))
            # column 2 is multiplied by the region length to get the region signal
            try:
                signalDict[regionID] = float(mappedLine[2]) * (abs(end - start))
            except ValueError:
                print('WARNING NO SIGNAL FOR LINE:')
                print(mappedLine)

        mappedCollection = utils.LocusCollection(mappedLoci, 500)
        locusTable[0].append(bamFileName)

        # add up the signal of every mapped region overlapping each table row
        for row in locusTable[1:]:
            rowLocus = utils.Locus(row[1], row[2], row[3], '.')
            signal = 0.0
            for region in mappedCollection.getOverlap(rowLocus, sense='both'):
                signal += signalDict[region.ID()]
            row.append(signal)

    utils.unParseTable(locusTable, output, '\t')
Beispiel #51
0
def optimizeStitching(locusCollection, name, outFolder, stepSize=500):
    '''
    takes a locus collection and starts writing out stitching stats at step sized intervals

    locusCollection: collection to stitch, it is first consolidated with a
                     stitch window of 0
    name, outFolder: the stats go to outFolder + name + '_stitch_params.tmp'
                     and both are passed on to the R script
    stepSize: increment of the stitch window, which runs from 0 up to 15000

    the stats table is handed to ./ROSE2_stitchOpt.R and the third line the
    script prints is taken as the stitching parameter
    returns the stitching parameter as an int, exits if it can not be read
    '''
    maxStitch = 15000  # set a hard wired match stitching parameter

    stitchTable = [['STEP', 'NUM_REGIONS', 'TOTAL_CONSTIT', 'TOTAL_REGION', 'MEAN_CONSTIT', 'MEDIAN_CONSTIT', 'MEAN_REGION', 'MEDIAN_REGION', 'MEAN_STITCH_FRACTION', 'MEDIAN_STITCH_FRACTION']]
    # first consolidate the collection
    locusCollection = locusCollection.stitchCollection(stitchWindow=0)
    total_constit = sum([locus.len() for locus in locusCollection.getLoci()])
    step = 0
    while step <= maxStitch:

        print("Getting stitch stats for %s (bp)" % (step))
        stitchCollection = locusCollection.stitchCollection(stitchWindow=step)
        num_regions = len(stitchCollection)
        stitchLoci = stitchCollection.getLoci()
        regionLengths = [locus.len() for locus in stitchLoci]
        total_region = sum(regionLengths)

        # summed length of the consolidated loci inside each stitched region
        constitLengths = []
        for stitchLocus in stitchLoci:
            constitLoci = locusCollection.getOverlap(stitchLocus)
            constitLengths.append(sum([constitLocus.len() for constitLocus in constitLoci]))

        meanConstit = round(numpy.mean(constitLengths), 2)
        medianConstit = round(numpy.median(constitLengths), 2)

        meanRegion = round(numpy.mean(regionLengths), 2)
        medianRegion = round(numpy.median(regionLengths), 2)

        # fraction of each stitched region that is covered by its constituents
        stitchFractions = [float(constitLength) / float(regionLength) for constitLength, regionLength in zip(constitLengths, regionLengths)]
        meanStitchFraction = round(numpy.mean(stitchFractions), 2)
        medianStitchFraction = round(numpy.median(stitchFractions), 2)

        newLine = [step, num_regions, total_constit, total_region, meanConstit, medianConstit, meanRegion, medianRegion, meanStitchFraction, medianStitchFraction]

        stitchTable.append(newLine)

        step += stepSize

    # write the stitch table to disk
    stitchParamFile = '%s%s_stitch_params.tmp' % (outFolder, name)
    utils.unParseTable(stitchTable, stitchParamFile, '\t')
    # call the rscript
    rCmd = 'Rscript ./ROSE2_stitchOpt.R %s %s %s' % (stitchParamFile, outFolder, name)
    print(rCmd)
    # get back the stitch parameter
    # universal_newlines makes communicate() return text on python 3 as well
    rOutput = subprocess.Popen(rCmd, stdout=subprocess.PIPE, shell=True, universal_newlines=True)
    rOutputTest = rOutput.communicate()

    print(rOutputTest)

    # a failed script may print fewer than three lines, which has to end in
    # the same failure message as an unreadable number
    try:
        stitchParam = int(rOutputTest[0].split('\n')[2])
    except (IndexError, ValueError):
        print("INVALID STITCHING PARAMETER. STITCHING OPTIMIZATION FAILED")
        sys.exit()

    # delete? the table
    # os.system('rm -f %s' % (stitchParamFile))
    return stitchParam
Beispiel #52
0
def formatNetworkOutput(graph, output_folder, analysis_name, candidate_tf_list):
    '''
    takes the networkx graph
    returns all figures, tables, etc

    graph must be a directed networkx graph (in_degree/out_degree are used).
    Every output path is output_folder + analysis_name + a suffix:
        .ntx                          pickled dict-of-lists of the graph
        _NODELIST.txt                 one node per row
        _ADJ_LIST.txt                 neighbours of each node, same order as the node list
        _EDGE_LIST.txt                From/To table
        _DEGREE_TABLE.txt             in, out and total degree per node
        _SELF_LOOPS.txt               nodes carrying a self loop
        _CLIQUES_ALL.txt              cliques among mutually connected self-loop nodes
        _CLIQUE_SCORES_DEGREE.txt     cliques ranked by mean out degree
        _ENRICHED_CLIQUE_FACTORS.txt  fraction of ranked cliques containing each factor
        _CLIQUE_SCORES_VSA.txt        cliques ranked by mean clique membership count

    candidate_tf_list is accepted for interface compatibility but is not used.
    Nothing is returned. An unsupported networkx major version prints an
    error and exits with a non-zero status.
    '''

    prefix = output_folder + analysis_name

    # output the network as a .ntx dictionary of lists
    # binary mode + context manager: the handle is always closed and the
    # pickle stream is not subject to newline translation
    with open(prefix + '.ntx', 'wb') as networkFile:
        pickle.dump(nx.to_dict_of_lists(graph), networkFile)

    # networkx 1.x and 2.x expose node iteration and adjacency differently
    nxMajor = nx.__version__[0]
    if nxMajor == '1':
        nodeList = [[n] for n in graph.nodes_iter()]
        adjList = graph.adjacency_list()
    elif nxMajor == '2':
        nodeList = [[n] for n in graph.nodes()]
        # materialise the key views so every row is a plain list
        adjList = [list(n[1].keys()) for n in graph.adjacency()]
    else:
        print('ERROR: UNSUPPORTED VERSION OF NETWORKX MODULE')
        sys.exit(1)

    # output the adjacency list and nodelist
    utils.unParseTable(nodeList, prefix + '_NODELIST.txt', '\t')
    utils.unParseTable(adjList, prefix + '_ADJ_LIST.txt', '\t')

    # nodeList and adjList are built from the same graph in the same order,
    # so pairing them row by row gives each node's outgoing edges
    edgesTable = [['From', 'To']]
    for node, neighbors in zip(nodeList, adjList):
        for target in neighbors:
            edgesTable.append([node[0], target])

    utils.unParseTable(edgesTable, prefix + '_EDGE_LIST.txt', '\t')

    # Make the degree table
    # the degree lookups are fetched once instead of once per node
    inDegreeDict = graph.in_degree()
    outDegreeDict = graph.out_degree()
    degreeDict = graph.degree()

    degTable = [['Tf', 'In_Degree', 'Out_Degree', 'Total_Connections']]
    # NOTE(review): the original author asked whether this table should be
    # restricted to TFs that have motifs; it still covers every node.
    for node in graph.nodes():
        degTable.append([node, inDegreeDict[node], outDegreeDict[node], degreeDict[node]])

    utils.unParseTable(degTable, prefix + '_DEGREE_TABLE.txt', '\t')

    print('DEFINING THE CORE REGULATORY CIRCUIT')

    selfLoops = [x for x, y in graph.selfloop_edges()]
    utils.unParseTable(selfLoops, prefix + '_SELF_LOOPS.txt', '')

    # recover bidirectional edges between self-looping nodes
    # (each mutual pair is recorded in both orders)
    pairs = []
    for n in selfLoops:
        for m in selfLoops:
            if n != m and graph.has_edge(n, m) and graph.has_edge(m, n):
                pairs.append([n, m])

    cliqueList = list(find_cliques_recursive(nx.from_edgelist(pairs)))
    utils.unParseTable(cliqueList, prefix + '_CLIQUES_ALL.txt', '\t')

    # rank cliques of more than two members by their mean out degree
    # NOTE(review): with integer degrees this division truncates on Python 2
    # and is a true division on Python 3; left unchanged.
    cliqueRanking = []
    for c in cliqueList:
        score = sum(outDegreeDict[gene] for gene in c) / len(c)
        if score > 0 and len(c) > 2:
            cliqueRanking.append((c, score))

    sortCliqueRanking = sorted(cliqueRanking, reverse=True, key=lambda x: x[1])
    utils.unParseTable(sortCliqueRanking, prefix + '_CLIQUE_SCORES_DEGREE.txt', '\t')

    # fraction of the ranked cliques that each self-looping factor belongs to
    factorEnrichmentDict = dict((factor, 0) for factor in selfLoops)
    for c, _score in cliqueRanking:
        for factor in c:
            factorEnrichmentDict[factor] += 1

    nRankedCliques = float(len(cliqueRanking))
    factorRankingTable = []
    for factor in selfLoops:
        # with no ranked cliques every count is 0; report 0.0 instead of
        # dividing by zero
        if nRankedCliques:
            enrichment = factorEnrichmentDict[factor] / nRankedCliques
        else:
            enrichment = 0.0
        factorRankingTable.append([factor, enrichment])

    utils.unParseTable(factorRankingTable, prefix + '_ENRICHED_CLIQUE_FACTORS.txt', '\t')

    # Begin VSA scoring

    # undirected graph holding every self-looping node plus the mutual edges;
    # unlike the graph above, unconnected self-loop nodes are present here
    G = nx.Graph()
    G.add_nodes_from(selfLoops)
    G.add_edges_from(pairs)

    # find all the cliques
    cliqueList = list(find_cliques_recursive(G))

    print('Number of cliques:')
    print(len(cliqueList))

    # count the occurrences of the TFs across the cliques
    dicoTFinloopsCounts = {}
    for clique in cliqueList:
        for TF in clique:
            dicoTFinloopsCounts[TF] = dicoTFinloopsCounts.get(TF, 0) + 1

    # score each clique by the mean membership count of its TFs
    # one row per clique: the row is appended after the whole clique has been
    # summed, not once per TF with a partial sum
    cliqueRanking = []
    for clique in cliqueList:
        if not clique:
            continue
        cliqueScore = sum(float(dicoTFinloopsCounts[TF]) for TF in clique)
        cliqueRanking.append((clique, cliqueScore / len(clique), len(clique)))

    sortCliqueRanking = sorted(cliqueRanking, reverse=True, key=lambda x: x[1])
    utils.unParseTable(sortCliqueRanking, prefix + '_CLIQUE_SCORES_VSA.txt', '\t')

    print('Top CRC:')
    if sortCliqueRanking:
        print(sortCliqueRanking[0])
    else:
        # no self-looping nodes means no cliques at all
        print('NO CLIQUES FOUND')
Beispiel #53
0
def main():
    '''
    Command-line entry point for the CRC analysis.

    Parses the command line, then runs the five stages in order:
        I.   argument handling and data summary
        II.  mapping genes to enhancers and picking candidate TFs
        III. valley finding from bams, or filtering of user supplied subpeaks
        IV.  motif background, motif finding and collapsing of motif hits
        V.   building the graph and writing the network output

    Either bams (-b) or a subpeak bed (-s) must be supplied; otherwise an
    error is printed and the process exits with status 1.
    The -B, -N, --motifs and -t options are parsed but not consumed anywhere
    in this function.
    Always terminates the interpreter through sys.exit() when finished.
    '''

    import argparse

    parser = argparse.ArgumentParser(
        usage="usage: prog [options] -e [ENHANCER_FILE] -b [BAM_FILE] -g [GENOME] -o [OUTPUTFOLDER] -n [NAME]")

    # required flags
    parser.add_argument("-e", "--enhancer_file", dest="enhancers", default=None, type=str,
                        help="Provide a ROSE generated enhancer table (_AllEnhancers.table.txt)", required=True)
    parser.add_argument("-g", "--genome", dest="genome", default=None, type=str,
                        help="Provide the build of the genome to be used for the analysis. Currently supports HG19, HG18 and MM9", required=True)
    parser.add_argument("-o", "--output", dest="output", default=None, type=str,
                        help="Enter an output folder", required=True)
    parser.add_argument("-n", "--name", dest="name", default=None, type=str,
                        help="Provide a name for the job", required=True)

    # you either need bams for valleys or subpeaks
    parser.add_argument("-b", "--bam", dest="bam", default=None, type=str,
                        help="Enter a comma separated list of bams of valley finding", required=False)
    parser.add_argument("-s", "--subpeaks", dest="subpeaks", default=None, type=str,
                        help="Enter a BED file of regions to search for motifs", required=False)

    # additional options
    parser.add_argument("-a", "--activity", dest="activity", default=None, type=str,
                        help="A table with active gene names in the first column", required=False)
    parser.add_argument("-l", "--extension-length", dest="extension", default=100, type=int,
                        help="Enter the length to extend subpeak regions for motif finding. default is 100", required=False)
    parser.add_argument("-B", "--background", dest="background", default=None, type=str,
                        help="Provide a background BAM file", required=False)
    # -N has its own destination so it cannot clash with another option
    parser.add_argument("-N", "--number", dest="number", default=1, type=int,
                        help="Enter the number of non overlapping motifs in a region required to assign a binding event. Default=1", required=False)
    parser.add_argument("--motifs", dest="motifs", default=False, type=str,
                        help="Enter additional PWM file for the analysis", required=False)
    parser.add_argument("-t", "--tfs", dest="tfs", default=None, type=str,
                        help="Enter additional TFs (comma separated) to be used in the binding analysis", required=False)
    parser.add_argument("--config", dest="config", default='', type=str,
                        help="Enter genome configuration file to overwrite default paths", required=False)

    args = parser.parse_args()

    #=====================================================================================
    #===============================I. PARSING ARGUMENTS==================================
    #=====================================================================================

    ###
    # Define all global file names
    ###
    print(args)
    genome = loadGenome(args.genome, args.config)

    motifDatabaseFile = genome.returnFeature('motif_database')
    motifConvertFile = genome.returnFeature('motif_convert')

    # User input files
    enhancer_file = args.enhancers

    # set the subpeak file; an empty value counts as "not given"
    subpeakFile = args.subpeaks or None

    # will need to fix bams down the line to take in multiple bams
    if args.bam:
        bamFileList = [bam_path for bam_path in args.bam.split(',') if len(bam_path) > 0]
        print(bamFileList)
    else:
        bamFileList = []

    # validate on the derived values so that an empty -b/-s argument cannot
    # slip through and start valley finding with no bams at all
    if subpeakFile is None and not bamFileList:
        print('ERROR: Must provide either bams for valley finding or subpeaks as a .bed')
        # non-zero status so calling scripts can detect the failure
        sys.exit(1)

    # output folder and analysis name
    print(args.output)
    output_folder = utils.formatFolder(args.output, True)
    analysis_name = args.name

    # optional arguments
    # activity path
    activity_path = args.activity

    # motif extension
    constExtension = args.extension

    print('\n\n#======================================\n#===========I. DATA SUMMARY============\n#======================================\n')

    print('Analyzing TF connectivity for %s' % (analysis_name))
    print('Writing output to %s' % (output_folder))
    if subpeakFile:
        print('Using %s to define subpeaks for motif finding' % (subpeakFile))
    else:
        print('Identifying valleys from .bam files')
    print('Using %s to define active genes' % (activity_path))

    #=====================================================================================
    #=======================II. IDENTIFYING CANDIDATE TFS AND NODES=======================
    #=====================================================================================

    print('\n\n#======================================\n#===II. MAPPING GENES AND ENHANCERS====\n#======================================\n')

    (geneTable, geneTFTable, enhancerTable, enhancerTFTable,
     geneSummaryTable, candidate_tf_list, gene_to_enhancer_dict) = geneToEnhancerDict(genome, enhancer_file, activity_path)

    # write these guys to disk
    gene_out = '%s%s_GENE_TABLE.txt' % (output_folder, analysis_name)
    gene_tf_out = '%s%s_GENE_TF_TABLE.txt' % (output_folder, analysis_name)

    enhancer_out = '%s%s_ENHANCER_TABLE.txt' % (output_folder, analysis_name)
    enhancer_tf_out = '%s%s_ENHANCER_TF_TABLE.txt' % (output_folder, analysis_name)

    summary_out = '%s%s_GENE_SUMMARY.txt' % (output_folder, analysis_name)

    utils.unParseTable(enhancerTable, enhancer_out, '\t')
    utils.unParseTable(enhancerTFTable, enhancer_tf_out, '\t')

    utils.unParseTable(geneTable, gene_out, '\t')
    utils.unParseTable(geneTFTable, gene_tf_out, '\t')

    utils.unParseTable(geneSummaryTable, summary_out, '\t')

    print('Identified %s genes w/ proximal cis-regulatory elements' % (len(gene_to_enhancer_dict)))

    print('Identified %s candidate TFs' % (len(candidate_tf_list)))
    print(candidate_tf_list)

    #=====================================================================================
    #==========================III. FINDING VALLEYS/SUBPEAKS==============================
    #=====================================================================================

    print('\n\n#======================================\n#=====III. FINDING VALLEYS/SUBPEAKS====\n#======================================\n')

    # so here we would need to find valleys everywhere
    if subpeakFile is None:
        print('finding valleys')
        # note: the tf_bed_path is for networks, all is for out degree finding
        all_bed_path = findValleys(gene_to_enhancer_dict, bamFileList, analysis_name, output_folder, cutoff=0.2)
    else:
        print('Using subpeaks from %s' % (subpeakFile))
        all_bed_path = filterSubpeaks(subpeakFile, gene_to_enhancer_dict, analysis_name, output_folder)

    # first make the subpeak bed and subpeak fasta for the tfs
    all_sub_bed, all_fasta = generateSubpeakFASTA(gene_to_enhancer_dict, all_bed_path, genome, analysis_name, output_folder, constExtension)
    if subpeakFile is None:
        # this is the case where we did valleys
        # only reason you would need to output the sub bed
        all_sub_out = '%s%s_all_subpeak.bed' % (output_folder, analysis_name)
        utils.unParseTable(all_sub_bed, all_sub_out, '\t')

    # writing the all subpeak fasta out to disk
    all_fasta_out = '%s%s_all_subpeak.fasta' % (output_folder, analysis_name)
    utils.unParseTable(all_fasta, all_fasta_out, '')

    #=====================================================================================
    #=================================IV. FINDING MOTIFS==================================
    #=====================================================================================

    print('\n\n#======================================\n#======IV. RUNNING MOTIF FINDING=======\n#======================================\n')

    # first make background
    bg_path = makeMotifBackground(all_fasta_out, output_folder, analysis_name)

    # find motifs for all regions
    fimo_out = findMotifs(all_fasta_out, bg_path, candidate_tf_list, output_folder, analysis_name, motifConvertFile, motifDatabaseFile)

    edgeDict = collapseFimo(fimo_out, gene_to_enhancer_dict, candidate_tf_list, output_folder, analysis_name, motifConvertFile)

    #=====================================================================================
    #============================V. RUNNING NETWORK ANALYSIS==============================
    #=====================================================================================

    print('\n\n#======================================\n#========V. BUILDING NETWORK===========\n#======================================\n')

    print('building graph and edge table')
    graph = buildGraph(edgeDict, gene_to_enhancer_dict, output_folder, analysis_name, cutoff=1)

    formatNetworkOutput(graph, output_folder, analysis_name, candidate_tf_list)

    print('FINISHED RUNNING CRC FOR %s' % (analysis_name))

    sys.exit()
import utils
from sys import argv


# Script: make sure the second column is never larger than the third on any
# row of a table.
# Input path  : first command-line argument, parsed with a tab separator.
# Output path : input path with its last three characters dropped and
#               'sorted.bed' appended.
filename = argv[1]
outname = '%ssorted.bed' % (filename[:-3])

bedfile = utils.parseTable(filename, '\t')

# column 0 is kept, columns 1 and 2 are converted to int and put in ascending
# order, every remaining column is carried over unchanged
out = [
    [row[0]] + sorted((int(row[1]), int(row[2]))) + row[3:]
    for row in bedfile
]

utils.unParseTable(out, outname, '\t')
    
Beispiel #55
0
def finishRankOutput(dataFile, rankOutput, genome, mergeFolder, mergeName, name1, name2, cutOff=1.5, window=100000):

    """
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond
    a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained loss

    dataFile     data table path, loaded with pipeline_dfci.loadDataTable;
                 the 'bam' entries of name1 and name2 are read from it
    rankOutput   tab separated rank table. As read by this function: column 0
                 is the enhancer id, columns 1-3 chrom/start/stop, column 6 is
                 compared against cutOff, columns 9-11 hold comma separated
                 gene lists and the fourth column from the end goes into the bed
    genome       genome build name, upper-cased before use
    mergeFolder  folder holding the <name1>_ROSE output; results are written
                 to mergeFolder + 'output/'
    mergeName    label used in every output file name
    name1/name2 dataset names; regions above +cutOff are labelled name2
                 (gained), regions below -cutOff are labelled name1 (lost),
                 everything else is CONSERVED
    cutOff       threshold applied to column 6 (coerced to float)
    window       number of bp added on each side for the window gffs
                 (coerced to int)

    Returns None. Shell commands are run through os.system and their exit
    status is not checked.
    """
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    # making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    # str methods instead of the Python-2-only string module functions
    genome = genome.upper()
    # floor division keeps names such as "-100KB" from turning into
    # "-100.0KB" when `/` is a true division
    windowKB = window // 1000

    # make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder + "output/", True)

    # bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput, "\t")

    # make a new formatted table
    header = rankEnhancerTable[0]
    header[-4] = "DELTA RANK"
    header[-3] = "IS_SUPER"
    formattedRankTable = [header]

    # the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    # the beds
    gainedTrackHeader = (
        'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=255,0,0'
        % (genome, name2, genome, name2, name1)
    )
    gainedBed = [[gainedTrackHeader]]
    conservedTrackHeader = (
        'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0'
        % (genome, name1, name2, genome, name1, name2)
    )
    conservedBed = [[conservedTrackHeader]]

    lostTrackHeader = (
        'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" itemRGB=On color=0,255,0'
        % (genome, name1, genome, name1, name2)
    )
    lostBed = [[lostTrackHeader]]

    # the genes
    geneTable = [
        [
            "GENE",
            "ENHANCER_ID",
            "ENHANCER_CHROM",
            "ENHANCER_START",
            "ENHANCER_STOP",
            header[6],
            header[7],
            header[8],
            "STATUS",
        ]
    ]

    for line in rankEnhancerTable[1:]:
        # fixing the enhancer ID
        line[0] = line[0].replace("_lociStitched", "")
        formattedRankTable.append(line)

        # getting the genes
        geneList = []
        for geneField in (line[9], line[10], line[11]):
            geneList += geneField.split(",")
        geneList = [x for x in geneList if len(x) > 0]
        geneList = utils.uniquify(geneList)
        geneString = ",".join(geneList)

        bedLine = [line[1], line[2], line[3], line[0], line[-4]]

        # decide which group the region belongs to
        delta = float(line[6])
        if delta > cutOff:
            # for gained
            geneStatus = name2
            gffRows, gffWindowRows, bedRows = gainedGFF, gainedWindowGFF, gainedBed
        elif delta < (-1 * cutOff):
            # for lost
            geneStatus = name1
            gffRows, gffWindowRows, bedRows = lostGFF, lostWindowGFF, lostBed
        else:
            # for conserved: bed entry only, no gff rows
            geneStatus = "CONSERVED"
            gffRows, gffWindowRows, bedRows = None, None, conservedBed

        if gffRows is not None:
            gffLine = [line[1], line[0], "", line[2], line[3], "", ".", "", geneString]
            gffWindowLine = [
                line[1],
                line[0],
                "",
                int(line[2]) - window,
                int(line[3]) + window,
                "",
                ".",
                "",
                geneString,
            ]
            gffRows.append(gffLine)
            gffWindowRows.append(gffWindowLine)
        bedRows.append(bedLine)

        # now fill in the gene Table
        for gene in geneList:
            geneTableLine = [gene, line[0], line[1], line[2], line[3], line[6], line[7], line[8], geneStatus]
            geneTable.append(geneTableLine)

    # concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    # start writing the output
    # there's the two gffs, the bed,the formatted table, the gene table

    # formatted table
    formattedFilename = "%s%s_%s_MERGED_SUPERS_RANK_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(formattedRankTable, formattedFilename, "\t")

    # gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder + "gff/", True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" % (gffFolder, genome, mergeName, name2.upper())
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % (
        gffFolder,
        genome,
        mergeName,
        name2.upper(),
        windowKB,
        windowKB,
    )

    gffFilename_lost = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff" % (gffFolder, genome, mergeName, name1.upper())
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff" % (
        gffFolder,
        genome,
        mergeName,
        name1.upper(),
        windowKB,
        windowKB,
    )

    utils.unParseTable(gainedGFF, gffFilename_gained, "\t")
    utils.unParseTable(gainedWindowGFF, gffFilenameWindow_gained, "\t")

    utils.unParseTable(lostGFF, gffFilename_lost, "\t")
    utils.unParseTable(lostWindowGFF, gffFilenameWindow_lost, "\t")

    # bed
    bedFilename = "%s%s_%s_MERGED_SUPERS.bed" % (outputFolder, genome, mergeName)
    utils.unParseTable(fullBed, bedFilename, "\t")

    # geneTable
    geneFilename = "%s%s_%s_MERGED_SUPERS_GENE_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(geneTable, geneFilename, "\t")

    # finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_SUPERS_DELTA.pdf" % (mergeFolder, name1, outputFolder, genome, mergeName)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_SUPERS_RANK_PLOT.png" % (
        mergeFolder,
        name1,
        outputFolder,
        genome,
        mergeName,
    )
    os.system(cmd)

    # now execute the bamPlot_turbo.py commands
    bam1 = dataDict[name1]["bam"]
    bam2 = dataDict[name2]["bam"]
    bamString = "%s,%s" % (bam1, bam2)
    nameString = "%s,%s" % (name1, name2)
    colorString = "0,0,0:100,100,100"

    # change dir
    # NOTE(review): hard-coded location of bamPlot_turbo.py; this changes the
    # working directory of the whole process
    os.chdir("/ark/home/cl512/pipeline/")

    plotCmdTemplate = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE"

    # gained first, then lost; a group with no regions is not plotted
    plotGroups = [
        (gainedGFF, name2, gffFilename_gained, gffFilenameWindow_gained),
        (lostGFF, name1, gffFilename_lost, gffFilenameWindow_lost),
    ]
    for gffRows, onlyName, gffPath, gffWindowPath in plotGroups:
        if len(gffRows) == 0:
            continue
        # plain regions first, then the window-extended regions
        plotInputs = [
            ("%s_ONLY_SE" % (onlyName), gffPath),
            ("%s_ONLY_SE_%sKB_WINDOW" % (onlyName, windowKB), gffWindowPath),
        ]
        for plotTitle, gffInput in plotInputs:
            cmd = plotCmdTemplate % (
                genome,
                bamString,
                gffInput,
                outputFolder,
                nameString,
                colorString,
                plotTitle,
            )
            os.system(cmd)

    return
def finishRankOutput(dataFile,statOutput,diffOutput,genome,mergeFolder,mergeName,name1,name2,namesList1,namesList2,cutOff=1.0,window = 100000,superOnly=True,plotBam=True):

    '''
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond
    a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained loss

    dataFile      data table path, loaded with pipeline_dfci.loadDataTable;
                  'bam' and 'color' entries are read from it when plotting
    statOutput    tab separated table of all regions. As read here: column 0
                  is the enhancer id, columns 1-3 chrom/start/stop, the last
                  three columns hold comma separated gene lists, the eighth
                  column from the end is compared against cutOff and the
                  fourth from the end must equal 1 for a region to count as
                  gained or lost (presumably the significance flag written by
                  the R step - confirm against that script)
    diffOutput    tab separated table of differential regions; only its
                  enhancer ids are cleaned before it is re-written
    genome        genome build name, upper-cased before use
    mergeFolder   folder holding <namesList1[0]>_ROSE; results are written to
                  mergeFolder + 'output/'
    mergeName     label used in every output file name
    name1/name2  group labels; gained regions are labelled name2, lost
                  regions name1, everything else UNCHANGED
    namesList1/2  dataset names belonging to each group
    cutOff        threshold (coerced to float)
    window        bp added on each side for the window gffs (coerced to int)
    superOnly     selects SUPERS or ENHANCERS wording in names and tracks
    plotBam       when true, bamPlot_turbo.py is run for the gained and lost
                  gffs; this relies on the module level pipelineDir

    Returns None. Shell commands are run through os.system and their exit
    status is not checked.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    #making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    #str methods instead of the Python-2-only string module functions
    genome = genome.upper()
    #floor division keeps "-100KB" from becoming "-100.0KB" under true division
    windowKB = window // 1000

    #make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder+'output/',True)

    #bring in the old rank table
    rankEnhancerTable = utils.parseTable(statOutput,'\t')

    #make a new formatted table
    header = rankEnhancerTable[0]
    formattedRankTable = [header]

    #the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    #wording used in file names (enhancerType) and in the bed track headers
    if superOnly:
        enhancerType = 'SUPERS'
        trackLabel = 'SEs'
        trackNoun = 'super enhancers'
    else:
        enhancerType = 'ENHANCERS'
        trackLabel = 'enhancers'
        trackNoun = 'enhancers'

    #the beds
    gainedTrackHeader = 'track name="%s %s only %s" description="%s %s that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome,name2,trackLabel,genome,trackNoun,name2,name1)
    gainedBed = [[gainedTrackHeader]]
    conservedTrackHeader = 'track name="%s %s and %s %s" description="%s %s that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,trackLabel,genome,trackNoun,name1,name2)
    conservedBed = [[conservedTrackHeader]]
    lostTrackHeader = 'track name="%s %s only %s" description="%s %s that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome,name1,trackLabel,genome,trackNoun,name1,name2)
    lostBed = [[lostTrackHeader]]

    #the genes
    geneTable = [['GENE','ENHANCER_ID','ENHANCER_CHROM','ENHANCER_START','ENHANCER_STOP',header[6],header[7],header[8],'STATUS']]
    headerLength = len(header)
    for line in rankEnhancerTable[1:]:
        #fix line lengths: short rows are padded with empty fields
        if len(line) != headerLength:
            line += ['']*(headerLength-len(line))

        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankTable.append(line)

        #getting the genes, last column first
        geneList = []
        for geneField in (line[-1],line[-2],line[-3]):
            geneList += geneField.split(',')
        geneList = [x for x in geneList if len(x) > 0]
        geneList = utils.uniquify(geneList)
        geneString = ','.join(geneList)

        bedLine = [line[1],line[2],line[3],line[0],line[-4]]

        #this applies both the statistical test chosen (default fdr <= 0.05) and the cutoff
        #the cutoff is hard wired, but we can add an option to change the test
        #stats are done in the R script. FDR norm can kinda suck if no genes are considered diff
        #the flag column is only parsed once the cutoff test has passed, so
        #padded/blank flags on unchanged rows never reach int()
        delta = float(line[-8])
        if delta > cutOff and int(line[-4]) == 1:
            #for gained
            geneStatus = name2
            gffRows,gffWindowRows,bedRows = gainedGFF,gainedWindowGFF,gainedBed
        elif delta < (-1 * cutOff) and int(line[-4]) == 1:
            #for lost
            geneStatus = name1
            gffRows,gffWindowRows,bedRows = lostGFF,lostWindowGFF,lostBed
        else:
            #for conserved: bed entry only, no gff rows
            geneStatus = 'UNCHANGED'
            gffRows,gffWindowRows,bedRows = None,None,conservedBed

        if gffRows is not None:
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            gffRows.append(gffLine)
            gffWindowRows.append(gffWindowLine)
        bedRows.append(bedLine)

        #now fill in the gene Table
        for gene in geneList:
            geneTableLine = [gene,line[0],line[1],line[2],line[3],line[6],line[7],line[8],geneStatus]
            geneTable.append(geneTableLine)

    #concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    #start writing the output
    #there's the two gffs, the bed,the formatted table, the gene table

    #formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankTable,formattedFilename,'\t')

    #formatted diff table
    #possible that no genes are differential
    rankEnhancerDiffTable = utils.parseTable(diffOutput,'\t')

    #make a new formatted table
    formattedRankDiffTable = [rankEnhancerDiffTable[0]]
    for line in rankEnhancerDiffTable[1:]:
        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankDiffTable.append(line)

    formattedDiffFilename = "%s%s_%s_MERGED_%s_RANK_DIFF_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankDiffTable,formattedDiffFilename,'\t')

    #gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder+'gff/',True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,name2.upper(),enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,name2.upper(),enhancerType,windowKB,windowKB)

    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,name1.upper(),enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,name1.upper(),enhancerType,windowKB,windowKB)

    utils.unParseTable(gainedGFF,gffFilename_gained,'\t')
    utils.unParseTable(gainedWindowGFF,gffFilenameWindow_gained,'\t')

    utils.unParseTable(lostGFF,gffFilename_lost,'\t')
    utils.unParseTable(lostWindowGFF,gffFilenameWindow_lost,'\t')

    #bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(fullBed,bedFilename,'\t')

    #geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(geneTable,geneFilename,'\t')

    #finally, move all of the plots to the output folder
    #NOTE(review): the UNCHANGED plot used to be copied from *REGION_LOST*,
    #which produced a mislabelled duplicate of the LOST plot; this assumes the
    #R step writes a *REGION_UNCHANGED* pdf - confirm
    for plotTag in ('DELTA','REGION_GAINED','REGION_LOST','REGION_UNCHANGED'):
        cmd = "cp %s%s_ROSE/*%s*.pdf %s%s_%s_MERGED_%s_%s.pdf" % (mergeFolder,namesList1[0],plotTag,outputFolder,genome,mergeName,enhancerType,plotTag)
        os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % (mergeFolder,namesList1[0],outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    #now execute the bamPlot_turbo.py commands
    if plotBam:

        bamList1 = [dataDict[name]['bam'] for name in namesList1]
        bamList2 = [dataDict[name]['bam'] for name in namesList2]
        bamString = ','.join(bamList1 + bamList2)

        #every dataset is plotted under its group label
        nameList = [name1]*len(namesList1) + [name2]*len(namesList2)
        nameString = ','.join(nameList)
        print(namesList1[0])
        print(namesList2[0])

        print(namesList1)
        print(namesList2)
        print(dataDict[namesList1[0]]['color'])

        #use the colors of the first dataset of each group when they differ,
        #otherwise fall back to black vs grey so the groups stay distinguishable
        color1 = dataDict[namesList1[0]]['color']
        color2 = dataDict[namesList2[0]]['color']
        if color1 != color2:
            colorList = [color1]*len(namesList1) + [color2]*len(namesList2)
        else:
            colorList = ['0,0,0']*len(namesList1) + ['100,100,100']*len(namesList2)
        colorString = ':'.join(colorList)

        plotCmdTemplate = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE'

        #gained first, then lost; a group with no regions is not plotted
        plotGroups = [
            (gainedGFF,name2,gffFilename_gained,gffFilenameWindow_gained),
            (lostGFF,name1,gffFilename_lost,gffFilenameWindow_lost),
        ]
        for gffRows,onlyName,gffPath,gffWindowPath in plotGroups:
            if len(gffRows) == 0:
                continue
            #plain regions first, then the window-extended regions
            plotInputs = [
                ("%s_ONLY_SE" % (onlyName),gffPath),
                ("%s_ONLY_SE_%sKB_WINDOW" % (onlyName,windowKB),gffWindowPath),
            ]
            for plotTitle,gffInput in plotInputs:
                cmd = plotCmdTemplate % (pipelineDir,genome,bamString,gffInput,outputFolder,nameString,colorString,plotTitle)
                os.system(cmd)

    return
Beispiel #57
0
def finishRankOutput(dataFile,rankOutput,genome,mergeFolder,mergeName,name1,name2,cutOff=1.5,window = 100000,superOnly=True,plotBam=True):

    '''
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond
    a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained loss

    dataFile: data table loaded through pipeline_dfci.loadDataTable; only the
        'bam' entries of name1 and name2 are read (and only when plotBam is set)
    rankOutput: tab delimited rank table. Column 0 is the enhancer id, columns
        1-3 are chrom/start/stop, column 6 is the value compared against cutOff
        and columns 9-11 hold comma separated gene lists
    genome: genome build string, upper-cased before use in file names
    mergeFolder: folder holding the merge analysis; output goes to mergeFolder/output/
    mergeName: analysis name used in every output file name
    name1,name2: dataset names. Regions with column 6 > cutOff are labelled
        with name2 (gained), regions with column 6 < -cutOff with name1 (lost),
        everything else is 'CONSERVED'
    cutOff: threshold applied to column 6 (coerced to float)
    window: flank in bp added on both sides for the windowed gffs (coerced to int)
    superOnly: selects the 'SUPERS' vs 'ENHANCERS' naming of tracks and files
    plotBam: if True, runs bamPlot_turbo.py on the gained/lost gffs

    writes the formatted rank table, four gffs, a merged bed and a gene table,
    copies the ROSE plots of name1 into the output folder and returns None.
    relies on the module level pipelineDir when plotBam is set.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    #making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    genome = genome.upper()

    #window is an int here, so floor division gives the same whole KB label
    #no matter whether '/' means integer or true division for the interpreter
    windowKB = window // 1000

    #make the output folder
    outputFolder =pipeline_dfci.formatFolder(mergeFolder+'output/',True)

    #bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput,'\t')

    #make a new formatted table
    header = rankEnhancerTable[0]
    header[-4] = 'DELTA RANK'
    header[-3] = 'IS_SUPER'
    formattedRankTable =[header]

    #the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    #labels that differ between the super-only and the all-enhancer run
    if superOnly:
        enhancerType = 'SUPERS'
        shortLabel = 'SEs'
        longLabel = 'super enhancers'
    else:
        enhancerType = 'ENHANCERS'
        shortLabel = 'enhancers'
        longLabel = 'enhancers'

    #the beds
    #each bed starts with its own track line: gained is red, conserved black, lost green
    gainedTrackHeader = 'track name="%s %s only %s" description="%s %s that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome,name2,shortLabel,genome,longLabel,name2,name1)
    gainedBed = [[gainedTrackHeader]]
    conservedTrackHeader = 'track name="%s %s and %s %s" description="%s %s that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,shortLabel,genome,longLabel,name1,name2)
    conservedBed = [[conservedTrackHeader]]
    lostTrackHeader = 'track name="%s %s only %s" description="%s %s that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome,name1,shortLabel,genome,longLabel,name1,name2)
    lostBed = [[lostTrackHeader]]

    #the genes
    geneTable =[['GENE','ENHANCER_ID','ENHANCER_CHROM','ENHANCER_START','ENHANCER_STOP',header[6],header[7],header[8],'STATUS']]

    for line in rankEnhancerTable[1:]:
        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankTable.append(line)

        #getting the genes
        geneList = []
        geneList += line[9].split(',')
        geneList += line[10].split(',')
        geneList += line[11].split(',')
        geneList = [x for x in geneList if len(x) >0]
        geneList = utils.uniquify(geneList)
        geneString = ','.join(geneList)

        bedLine = [line[1],line[2],line[3],line[0],line[-4]]

        #for gained
        if float(line[6]) > cutOff:
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        #for lost
        elif float(line[6]) < (-1 * cutOff):
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        #for conserved
        else:
            geneStatus = 'CONSERVED'
            conservedBed.append(bedLine)

        #now fill in the gene Table
        for gene in geneList:
            geneTableLine = [gene,line[0],line[1],line[2],line[3],line[6],line[7],line[8],geneStatus]
            geneTable.append(geneTableLine)

    #concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    #start writing the output
    #there's the two gffs, the bed,the formatted table, the gene table

    #formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankTable,formattedFilename,'\t')

    #gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder+'gff/',True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,name2.upper(),enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,name2.upper(),enhancerType,windowKB,windowKB)

    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,name1.upper(),enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,name1.upper(),enhancerType,windowKB,windowKB)

    utils.unParseTable(gainedGFF,gffFilename_gained,'\t')
    utils.unParseTable(gainedWindowGFF,gffFilenameWindow_gained,'\t')

    utils.unParseTable(lostGFF,gffFilename_lost,'\t')
    utils.unParseTable(lostWindowGFF,gffFilenameWindow_lost,'\t')

    #bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(fullBed,bedFilename,'\t')

    #geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(geneTable,geneFilename,'\t')

    #finally, move all of the plots to the output folder
    #NOTE(review): the destination is a single file name, so this presumably
    #expects exactly one match for each glob - confirm
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_%s_DELTA.pdf" % (mergeFolder,name1,outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % (mergeFolder,name1,outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    #now execute the bamPlot_turbo.py commands
    if plotBam:
        bam1 = dataDict[name1]['bam']
        bam2 = dataDict[name2]['bam']
        bamString = "%s,%s" % (bam1,bam2)
        nameString = "%s,%s" % (name1,name2)
        colorString = "0,0,0:100,100,100"

        #change dir
        #bamPlot_turbo.py is invoked by bare name below, so it has to be run
        #from the pipeline folder. this changes the working directory for the
        #rest of the process
        os.chdir(pipelineDir)

        #collect (gff, plot title) pairs in the order the plots are made:
        #gained, gained window, lost, lost window
        plotJobs = []
        if len(gainedGFF) > 0:
            #gained command
            plotJobs.append((gffFilename_gained,"%s_ONLY_SE" % (name2)))
            #gained window command
            plotJobs.append((gffFilenameWindow_gained,"%s_ONLY_SE_%sKB_WINDOW" % (name2,windowKB)))

        if len(lostGFF) > 0:
            #lost command
            plotJobs.append((gffFilename_lost,"%s_ONLY_SE" % (name1)))
            #lost window command
            plotJobs.append((gffFilenameWindow_lost,"%s_ONLY_SE_%sKB_WINDOW" % (name1,windowKB)))

        for plotGFF,plotTitle in plotJobs:
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,plotGFF,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)


    return
Beispiel #58
0
def main():
    '''
    main run call

    parses the command line, converts the input region files to gffs, stitches
    the regions, maps every unique bam to the stitched gff with bamliquidator,
    collapses the signal across bams and then calls the R super enhancer
    script and the gene mapper on the super, stretch and superstretch tables.

    relies on the module level pipeline_dir and bamliquidator_path.
    exits the process when required flags are missing, when the control bam
    count is inconsistent, when the genome is unsupported, when the mask is
    not a .bed/.gff or when a bam fails to map.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('hi there')
        parser.print_help()
        # sys.exit is used everywhere else in this function; the bare exit()
        # helper is only installed by the site module
        sys.exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)

    inputList = [inputFile for inputFile in  options.input.split(',') if len(inputFile) > 1]

    #converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        # the extension has to be read from the individual file. testing the
        # whole comma separated -i string would only ever see the extension
        # of the last file in the list
        extension = inputFile.split('.')[-1]
        if extension == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][0:-4] #strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        else:
            if extension != 'gff':
                print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder,inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)


    # GETTING THE LIST OF BAMFILES TO PROCESS
    #either same number of bams for rankby and control
    #or only 1 control #or none!
    #bamlist should be all rankby bams followed by control bams


    bamFileList = []
    if options.control:
        controlBamList = [bam for bam in options.control.split(',') if len(bam) >0]
        rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) >0]

        if len(controlBamList) == len(rankbyBamList):
            #case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            #case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList*len(rankbyBamList)
        else:
            print('ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE')
            sys.exit()
    else:
        bamFileList = [bam for bam in options.rankby.split(',') if len(bam) > 0]


    # Stitch parameter
    # an empty string is handed on to regionStitching unchanged
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0


    # GETTING THE GENOME
    genome = options.genome.upper()
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE

    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome.upper()]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()


    #FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))


    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    #use a simpler unique region naming system
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0],'\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile,'\t')
            gffCollection = utils.gffToLocusCollection(gff,50)
            inputLoci += gffCollection.getLoci()


        inputCollection = utils.LocusCollection(inputLoci,50)
        inputCollection = inputCollection.stitchCollection() # stitches to produce unique regions

        inputGFF = utils.locusCollectionToGFF(inputCollection)

    formattedGFF = []
    #now number things appropriately
    for i,line in enumerate(inputGFF):

        #every region gets the id inputName_N where N is its 1 based position
        chrom = line[0]
        coords = [int(line[3]) ,int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName,str(i+1)) #1 indexing

        newLine = [chrom,lineID,lineID,min(coords),max(coords),'',sense,'',lineID]
        formattedGFF.append(newLine)

    #name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder,genome.upper(),inputName)
    utils.unParseTable(formattedGFF,masterGFFFile,'\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))


    # MAKING THE START DICT
    print('MAKING START DICT')
    # NOTE(review): startDict is not read again in this function - confirm
    # whether the call is still needed
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile,bamChromList)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)


    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        # if it's a bed file
        if maskFile.split('.')[-1].upper() == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskFile.split('.')[-1].upper() == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))

    # NOTE(review): with true division an int stitchWindow would produce a
    # label such as '12.5' here - confirm the interpreter this is run under
    stitchKB = str(stitchWindow / 1000)
    if not removeTSS:
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchKB)
    else:
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, stitchKB)
    # the gff and the debug file share the stitched name and live in the gff folder
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))


    # MAPPING TO THE STITCHED GFF

    #prevent redundant mapping
    #a shared control bam shows up once per rankby bam in bamFileList
    bamFileListUnique = list(bamFileList)
    bamFileListUnique = utils.uniquify(bamFileListUnique)
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, stitchedGFFName, bamFileName)
        # skip the mapping when a previous run already left its output behind
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File,0.2,5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)


    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1,inputName + '_MERGED_SIGNAL',controlBams=options.control)

    #now try the merging

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    # the R script is always handed 'NONE' as its control argument here
    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (pipeline_dir,outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)


    # calling the gene mapper
    print('CALLING GENE MAPPING')

    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)

    #for now don't use ranking bam to call top genes
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, superTableFile)
    print(cmd)
    os.system(cmd)


    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)

    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, stretchTableFile)
    print(cmd)
    os.system(cmd)


    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)

    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, superStretchTableFile)
    os.system(cmd)
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq

    rankByBamFile: bam mapped at the +/- 5kb gene regions to get a signal per gene
    controlBamFile: optional control bam, an empty string means no control
    genome: genome build string, only used in the name of the written gff
    annotFile: annotation file handed to utils.makeStartDict and
        utils.makeTranscriptCollection
    enhancerFile: tab delimited enhancer table. lines whose first field starts
        with '#' or 'R' are skipped as comment/header lines, the last two
        columns are read as integer rank and super status
    transcribedFile: optional table whose second column lists the gene ids to
        consider. by default every key of the start dict is used
    uniqueGenes: if True only the first row per gene name is kept in the gene table
    searchWindow: window handed to utils.makeSearchLocus for the proximal gene search
    noFormatTable: if True the enhancer rows are kept whole and are not re-sorted

    returns the enhancer to gene table, the enhancer to top gene table and the
    gene to enhancer table. writes a gff of the enhancer associated genes next
    to enhancerFile and runs bamliquidator_batch.py on it. exits the process
    if the mapping fails and raises ValueError if bamliquidator_batch.py is
    not found.
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(utils.makeTSSLocus(geneID, startDict, 0, 0))

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really
    # matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    geneDict = {'overlapping': defaultdict(
        list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # the header is the first line that is not a '#' comment
    for line in enhancerTable:
        if line[0][0] == '#':
            continue
        else:
            header = line
            break

    # set up the output tables
    # first by enhancer
    if noFormatTable:
        enhancerToGeneTable = [
            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]
    else:
        # the gene columns go in after the first 9 columns and the last two
        # input columns stay last
        enhancerToGeneTable = [
            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]

    # next make the gene to enhancer table
    geneToEnhancerTable = [
        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])

        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(
            enhancerLocus, 'both')
        overlappingGenes = [overlapLocus.ID() for overlapLocus in overlappingLoci]

        # proximalGenes are transcribed genes where the tss is within
        # searchWindow of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = [proxLocus.ID() for proxLocus in proximalLoci]

        # distalGenes use the same search with a fixed 1mb window
        distalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = [distLocus.ID() for distLocus in distalLoci]

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        # built before the lists are made disjoint, so a gene can be in here
        # more than once. that does not change which gene is closest
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes
        # these checks make sure each gene list is unique.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the 50kb window, but we'll let that slide
        # here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[
                allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        overlapNames = ','.join(
            utils.uniquify([startDict[x]['name'] for x in overlappingGenes]))
        proximalNames = ','.join(
            utils.uniquify([startDict[x]['name'] for x in proximalGenes]))

        if noFormatTable:
            newEnhancerLine = list(line)
        else:
            newEnhancerLine = line[0:9]
        newEnhancerLine += [overlapNames, proximalNames, closestGene]
        if not noFormatTable:
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)
        # Now grab all overlapping and proximal genes for the gene ordered
        # table

        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

    # End loop through
    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    enhancerGeneCollection = utils.makeTranscriptCollection(
        annotFile, 5000, 5000, 500, overallGeneList)

    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName,gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamToGFF

    # use the cluster copy of bamliquidator_batch.py if it is there, otherwise
    # fall back to a copy in the current working directory, otherwise fail
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    bamName = rankByBamFile.split('/')[-1]
    mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName,gffRootName, bamName)
    mappedRankByFile = "%s%s_%s_%s/matrix.gff" % (enhancerFolder,enhancerName, gffRootName, bamName)
    cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedRankByFolder,rankByBamFile)
    print("Mapping rankby bam %s" % (rankByBamFile))
    print(cmd)

    outputRank = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    outputRank = outputRank.communicate()
    # anything written to stdout is taken as the sign of a successful run
    if len(outputRank[0]) > 0:  # test if mapping worked correctly
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
        sys.exit()

    # next on the control bam if it exists
    if len(controlBamFile) > 0:
        controlName = controlBamFile.split('/')[-1]
        mappedControlFolder = "%s%s_%s_%s/" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        mappedControlFile = "%s%s_%s_%s/matrix.gff" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedControlFolder,controlBamFile)
        print("Mapping control bam %s" % (controlBamFile))
        print(cmd)
        outputControl = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
        outputControl = outputControl.communicate()
        if len(outputControl[0]) > 0:  # test if mapping worked correctly
            print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
        else:
            print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
            sys.exit()

    # now get the appropriate output files
    if len(controlBamFile) > 0:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" %
              (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()

    # use enhancer rank to order

    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    # gene names that already have a row. a set keeps the membership test
    # cheap for long gene lists
    usedNames = set()

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if geneName in usedNames and uniqueGenes == True:
            continue
        usedNames.add(geneName)

        proxEnhancers = geneDict['overlapping'][
            refID] + geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = ','.join([str(x) for x in rankDict[refID]])

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        newLine = [geneName, refID, ','.join(proxEnhancers),
                   enhancerRanks, superStatus, enhancerSignal]
        geneToEnhancerTable.append(newLine)
    #utils.unParseTable(geneToEnhancerTable,'/grail/projects/newRose/geneMapper/foo.txt','\t')
    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [
            'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:

        # pull the overlapping and proximal gene name columns back out
        geneList = []
        if noFormatTable:
            geneList += line[-3].split(',')
            geneList += line[-2].split(',')

        else:
            geneList += line[10].split(',')
            geneList += line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])
        if len(geneList) > 0:
            try:
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                maxGene = geneList[maxIndex]
                maxSig = sigVector[maxIndex]
                if maxSig == 0.0:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
            except ValueError:
                # max() of an empty list, i.e. a gene name that has no
                # recorded signal
                if len(geneList) == 1:
                    maxGene = geneList[0]
                    maxSig = 'NONE'
                else:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
        else:
            maxGene = 'NONE'
            maxSig = 'NONE'
        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable
    else:
        # both enhancer tables are ordered on the second to last column
        enhancerOrder = utils.order([int(line[-2])
                                    for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        sortedTopGeneTable = [enhancerToTopGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i + 1)])
            sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

        return sortedTable, sortedTopGeneTable, geneToEnhancerTable
Beispiel #60
0
def main():
    '''
    main run call

    Command line entry point. Parses the flags, prepares the output folders,
    stitches the input regions, maps every bam to the stitched and to the
    original regions with bamliquidator_batch.py, then shells out to
    ROSE2_callSuper.R and to ROSE2_geneMapper.py (once per enhancer table).

    Exits with status 1 (SystemExit) when a required flag is missing, when
    the mask is neither a .gff nor a .bed file, or when a bam mapping
    produces no output. Raises KeyError for an unsupported genome build and
    ValueError when bamliquidator_batch.py cannot be located.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM8,MM9,MM10,HG18,HG19)")

    # optional flags
    parser.add_option("-b", "--bams", dest="bams", nargs=1, default=None,
                      help="Enter a comma separated list of additional bam files to map to")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a control bamfile")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        # sys.exit rather than the site-provided exit(); non-zero because
        # the invocation was unusable
        sys.exit(1)

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE
    inputExtension = options.input.split('.')[-1]
    if inputExtension == 'bed':
        # CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    else:
        if inputExtension != 'gff':
            print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
        # COPY THE INPUT GFF TO THE GFF FOLDER
        # (the original path, not the copy, is what gets used below)
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # GETTING THE LIST OF BAMFILES TO PROCESS
    bamFileList = [options.rankby]
    if options.control:
        bamFileList.append(options.control)

    if options.bams:
        bamFileList += options.bams.split(',')
        bamFileList = utils.uniquify(bamFileList)

    # optional args

    # Stitch parameter: '' is handed to regionStitching unchanged
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    # annotation files are looked up relative to the current working directory
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }

    genomeKey = genome.upper()
    if genomeKey not in genomeDict:
        # still a KeyError, but one that tells the user what went wrong
        raise KeyError('UNSUPPORTED GENOME %s. SUPPORTED GENOMES: %s' % (
            genome, ', '.join(sorted(genomeDict))))
    annotFile = genomeDict[genomeKey]

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFFFile)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        # if it's a bed file
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit(1)
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    # NOTE(review): stitching is given inputGFFFile, so the masked
    # referenceCollection only reaches mapCollection below - confirm intended
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(inputGFFFile, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)
    # making sure start/stop ordering are correct
    for line in stitchedGFF:
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3], line[4] = stop, start

    if debug:
        print(stitchWindow)
        print(type(stitchWindow))

    # NOTE(review): '/' is kept as written; under Python 3 this is true
    # division and the label becomes e.g. '12.5' - confirm before changing
    stitchLabel = str(stitchWindow / 1000)
    if removeTSS:
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, stitchLabel)
    else:
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchLabel)
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    # Try to use the bamliquidator_batch.py script on cluster, otherwise
    # fall back to a copy in the current working directory, otherwise fail.
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    # each bam is mapped to the stitched gff first, then to the original gff
    regionsToMap = [
        (stitchedGFFFile, stitchedGFFName),
        (inputGFFFile, inputName),
    ]

    for bamFile in bamFileList:

        bamFileName = bamFile.split('/')[-1]

        for gffFile, regionName in regionsToMap:
            mappedOutFolder = '%s%s_%s_MAPPED' % (mappedFolder, regionName, bamFileName)
            mappedOutFile = '%s/matrix.gff' % (mappedOutFolder)
            if utils.checkOutput(mappedOutFile, 0.2, 0.2):
                # existing mapping found: report the gff it belongs to
                print("FOUND %s MAPPING DATA FOR BAM: %s" % (gffFile, mappedOutFile))
                continue

            mapCmd = "python %s --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                bamliquidator_path, gffFile, mappedOutFolder, bamFile)
            print(mapCmd)

            mapProcess = subprocess.Popen(mapCmd, stdout=subprocess.PIPE, shell=True)
            mapOutput = mapProcess.communicate()
            # an empty stdout is treated as a failed mapping
            if len(mapOutput[0]) > 0:
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (gffFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (gffFile, bamFileName))
                sys.exit(1)

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    # the R script receives the control's file name, or 'NONE' without one
    if options.control:
        controlName = options.control.split('/')[-1]
    else:
        controlName = 'NONE'
    cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (outFolder, outputFile1, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    # presumably gives the R output time to land on disk - TODO confirm
    time.sleep(20)

    # one gene mapper run per enhancer table, in this order
    tableSuffixList = [
        '_SuperEnhancers.table.txt',
        '_StretchEnhancers.table.txt',
        '_SuperStretchEnhancers.table.txt',
    ]
    for tableSuffix in tableSuffixList:
        tableFile = inputName + tableSuffix
        if options.control:
            cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, tableFile)
        else:
            cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, tableFile)
        os.system(cmd)