Beispiel #1
0
def splitRegions(inputGFF,tssCollection):


    #if even a single coordinate is shared with the +/-1kb 
    splitGFF = []
    for line in inputGFF:

        chrom = line[0]
        regionID = line[1]
        lineLocus = utils.Locus(line[0],line[3],line[4],'.')

        overlappingLoci = tssCollection.getOverlap(lineLocus)
        if len(overlappingLoci) > 0: #case where a tss Overlap
            #identify the parts of the line locus that are contained
            localTSSCollection = utils.LocusCollection(overlappingLoci,50)
            overlappingCoords = lineLocus.coords()
            for tssLocus in overlappingLoci:
                overlappingCoords += tssLocus.coords()

            overlappingCoords = utils.uniquify(overlappingCoords)
            overlappingCoords.sort()

            #you need to hack and slash add 1 to the last coordinate of the overlappingCoords
            overlappingCoords[-1] +=1

            i = 0
            regionTicker = 1
            while i < (len(overlappingCoords)-1):
                start = int(overlappingCoords[i])
                stop = int(overlappingCoords[(i+1)])-1
                if (stop - start) < 50: #this eliminates really tiny regions
                    i+=1
                    continue
                splitLocus = utils.Locus(chrom,start+1,stop,'.')


                if lineLocus.overlaps(splitLocus): #has to be a mycn site
                    newID = '%s_%s' % (regionID,regionTicker)
                    tssStatus = 0
                    if localTSSCollection.getOverlap(splitLocus):
                        tssStatus = 1
                    splitGFFLine = [chrom,newID,newID,start,stop,'','.',tssStatus,newID]

                    splitGFF.append(splitGFFLine)
                    regionTicker+=1
                i+=1
        else:
            line[7] = 0
            splitGFF.append(line)




    return splitGFF
Beispiel #2
0
def makeBedCollection(bedFileList):
    '''
    takes in a list of bedFiles and makes a single huge collection
    each locus has as its ID the name of the bed file
    '''

    bedLoci = []
    print("MAKING BED COLLECTION FOR:")
    for bedFile in bedFileList:

        bedName = bedFile.split('/')[-1].split('.')[0]
        print(bedName)
        bed = utils.parseTable(bedFile, '\t')
        for line in bed:
            if len(line) >= 3:
                #check that line[0]
                if line[0][0:3] == 'chr':
                    try:
                        coords = [int(line[1]), int(line[2])]
                        bedLocus = utils.Locus(line[0], min(coords),
                                               max(coords), '.', bedName)
                        bedLoci.append(bedLocus)

                    except ValueError:
                        pass

        print("IDENTIFIED %s BED REGIONS" % (len(bedLoci)))

    return utils.LocusCollection(bedLoci, 50)
Beispiel #3
0
def findValleys(TFtoEnhancerDict, bamFile, projectName, projectFolder, cutoff = 0.2):
    '''
    takes in the super dict
    returns a dictionary of refseqs with all valley loci that are associated
    '''

    print 'IDENTIFYING VALLEYS IN SUPER ENHANCERS'

    valleyBED = []
    valleyDict = {}

    for gene in TFtoEnhancerDict.keys():
        valleyDict[gene] = []
        print gene
        for region in TFtoEnhancerDict[gene]:
            scoreArray = scoreValley(region, bamFile, projectName, projectFolder)
            for index,score in enumerate(scoreArray):
                if score > cutoff:
                    valley = utils.Locus(region.chr(), region.start() + index*10,
                                         region.start() + (index+1)*10, '.')
                    valleyDict[gene].append(valley)

        stitchedValleys = stitchValleys(valleyDict[gene])
        for valley in stitchedValleys:
            valleyBED.append([valley.chr(), valley.start(), valley.end()])
            valleyDict[gene] = stitchedValleys

    bedfilename = projectFolder + projectName + '_valleys.bed'
    utils.unParseTable(valleyBED, bedfilename, '\t')
    print bedfilename

    return bedfilename
def main():

    projectFolder = '/storage/goodell/projects/jmreyes/amish_ayala/'
    bedFolder = projectFolder+'bed/'
    wtCanyonBed = bedFolder+'canyon_WT_sizeSelected.bed'
    mutCanyonBed = bedFolder+'canyon_Mut_sizeSelected.bed'
    
    wtCanyonLocusCollection = utils.LocusCollection([utils.Locus(x[0], x[1], x[2], '.', 'wt_'+str(x[0])+':'+str(x[1])+'-'+str(x[2])) for x in utils.parseTable(wtCanyonBed, '\t')])
    mutCanyonLocusCollection = utils.LocusCollection([utils.Locus(x[0], x[1], x[2], '.', 'mut_'+str(x[0])+':'+str(x[1])+'-'+str(x[2])) for x in utils.parseTable(mutCanyonBed, '\t')])
    overlappingCanyons = []

    wtExpansion = []
    mutExpansion = []

    wtUnique = []
    mutUnique = []
    overlapCounter = 0
    mutOverlap = 0
    for locus in wtCanyonLocusCollection.getLoci():
        wtMutOverlap = mutCanyonLocusCollection.getOverlap(locus, 'both')
        if len(wtMutOverlap) > 0:
            overlapCounter += 1
            for overlap in wtMutOverlap:
                newLine = [locus.chr(), locus.start(), locus.end(), locus.end()-locus.start(), overlap.chr(), overlap.start(), overlap.end(), overlap.end()-overlap.start()]
                wtLength = locus.end()-locus.start()
                mutLength = overlap.end()-overlap.start()
                if mutLength > wtLength:
                    mutExpansion.append(newLine)
                elif wtLength > mutLength:
                    wtExpansion.append(newLine)
        else:
            wtUnique.append(locus)

    for locus in mutCanyonLocusCollection.getLoci():
        mutWTOverlap = wtCanyonLocusCollection.getOverlap(locus, 'both')
        if len(mutWTOverlap) > 0:
            mutOverlap += 1
        else:
            mutUnique.append(locus)


    print len(mutExpansion)
    print len(wtExpansion)
    utils.unParseTable(mutExpansion, projectFolder+'tables/MUT_canyons_expanded.txt', '\t')
    utils.unParseTable(wtExpansion, projectFolder+'tables/WT_canyons_expanded.txt', '\t')
Beispiel #5
0
def assignEnhancerRank(enhancerToGeneFile,
                       enhancerFile1,
                       enhancerFile2,
                       name1,
                       name2,
                       rankOutput=''):
    '''
    for all genes in the enhancerToGene Table, assigns the highest overlapping ranked enhancer in the other tables
    '''

    enhancerToGene = utils.parseTable(enhancerToGeneFile, '\t')

    enhancerCollection1 = makeSECollection(enhancerFile1, name1, False)
    enhancerCollection2 = makeSECollection(enhancerFile2, name2, False)

    enhancerDict1 = makeSEDict(enhancerFile1, name1, False)
    enhancerDict2 = makeSEDict(enhancerFile2, name2, False)

    #we're going to update the enhancerToGeneTable

    enhancerToGene[0] += ['%s_rank' % name1, '%s_rank' % name2]

    for i in range(1, len(enhancerToGene)):

        line = enhancerToGene[i]

        locusLine = utils.Locus(line[1], line[2], line[3], '.', line[0])

        #if the enhancer doesn't exist, its ranking is dead last on the enhancer list

        enhancer1Overlap = enhancerCollection1.getOverlap(locusLine, 'both')
        if len(enhancer1Overlap) == 0:
            enhancer1Rank = len(enhancerCollection1)
        else:

            rankList1 = [
                enhancerDict1[x.ID()]['rank'] for x in enhancer1Overlap
            ]
            enhancer1Rank = min(rankList1)

        enhancer2Overlap = enhancerCollection2.getOverlap(locusLine, 'both')
        if len(enhancer2Overlap) == 0:
            enhancer2Rank = len(enhancerCollection2)
        else:

            rankList2 = [
                enhancerDict2[x.ID()]['rank'] for x in enhancer2Overlap
            ]
            enhancer2Rank = min(rankList2)
        enhancerToGene[i] += [enhancer1Rank, enhancer2Rank]

    if len(rankOutput) == 0:
        return enhancerToGene
    else:
        utils.unParseTable(enhancerToGene, rankOutput, '\t')
Beispiel #6
0
def generateSubpeakFASTA(TFtoEnhancerDict, subpeaks, genomeDirectory,
                         projectName, projectFolder, constExtension):
    '''
    from a BED file of constituents
    generate a FASTA for the consituients contained within the canidate supers
    '''

    subpeakDict = {}
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]
    subpeakTable = utils.parseTable(subpeaks, '\t')

    subpeakLoci = [
        utils.Locus(l[0], int(l[1]), int(l[2]), '.') for l in subpeakTable
    ]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    for gene in TFtoEnhancerDict.keys():
        subpeakDict[gene] = []
        for region in TFtoEnhancerDict[gene]:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [
                utils.makeSearchLocus(x, constExtension, constExtension)
                for x in overlaps
            ]

            overlapCollectionTemp = utils.LocusCollection(extendedOverlaps, 50)
            overlapCollection = overlapCollectionTemp.stitchCollection()
            for overlap in overlapCollection.getLoci():
                subpeakBED.append(
                    [overlap.chr(),
                     overlap.start(),
                     overlap.end()])
                subpeakDict[gene].append(overlap)

    bedfilename = projectFolder + projectName + '_subpeaks.bed'
    utils.unParseTable(subpeakBED, bedfilename, '\t')

    fasta = []

    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:

            fastaTitle = gene + '|' + subpeak.chr() + '|' + str(
                subpeak.start()) + '|' + str(subpeak.end())
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(),
                                       int(subpeak.start() + 1),
                                       int(subpeak.end() + 1))

            fasta.append('>' + fastaTitle)
            fasta.append(upper(fastaLine))

    outname = projectFolder + projectName + '_SUBPEAKS.fa'

    utils.unParseTable(fasta, outname, '')
Beispiel #7
0
def mapGFFLineToBed(gffLine, outFolder, nBins, bedCollection, header=''):
    '''
    for every line produces a file with all of the rectangles to draw
    '''

    if len(header) == 0:
        gffString = '%s_%s_%s_%s' % (gffLine[0], gffLine[6], gffLine[3],
                                     gffLine[4])
    else:
        gffString = header
    diagramTable = [[0, 0, 0, 0]]
    nameTable = [['', 0, 0]]
    gffLocus = utils.Locus(gffLine[0], int(gffLine[3]), int(gffLine[4]),
                           gffLine[6], gffLine[1])

    scaleFactor = float(nBins) / gffLocus.len()
    # plotting buffer for diagrams
    # plotBuffer = int(gffLocus.len() / float(nBins) * 20) # UNUSED (?)

    overlapLoci = bedCollection.getOverlap(gffLocus, sense='both')
    print("IDENTIFIED %s OVERLAPPING BED LOCI FOR REGION %s" %
          (len(overlapLoci), gffLine))

    # since beds come from multiple sources, we want to figure out how to offset them
    offsetDict = {}  # this will store each ID name
    bedNamesList = utils.uniquify([locus.ID() for locus in overlapLoci])
    bedNamesList.sort()
    for i in range(len(bedNamesList)):
        offsetDict[bedNamesList[
            i]] = 2 * i  # offsets different categories of bed regions

    if gffLine[6] == '-':
        refPoint = int(gffLine[4])
    else:
        refPoint = int(gffLine[3])

    # fill out the name table
    for name in bedNamesList:
        offset = offsetDict[name]
        nameTable.append([name, 0, 0.0 - offset])

    for bedLocus in overlapLoci:

        offset = offsetDict[bedLocus.ID()]

        [start,
         stop] = [abs(x - refPoint) * scaleFactor for x in bedLocus.coords()]

        diagramTable.append([start, -0.5 - offset, stop, 0.5 - offset])

    utils.unParseTable(diagramTable,
                       outFolder + gffString + '_bedDiagramTemp.txt', '\t')
    utils.unParseTable(nameTable, outFolder + gffString + '_bedNameTemp.txt',
                       '\t')
Beispiel #8
0
def findValleys(gene_to_enhancer_dict,
                bamFileList,
                projectName,
                projectFolder,
                cutoff=0.2):
    '''
    takes in the super dict
    returns a dictionary of refseqs with all valley loci that are associated
    returns 2 kinds of bed files...
    1 = all 
    '''

    #first make the bamDict

    all_valley_bed = []
    valleyDict = {}

    #start w/ a bamFileList and make a list of bam type objects
    bam_list = [utils.Bam(bam_path) for bam_path in bamFileList]
    max_read_length = max([bam.getReadLengths()[0] for bam in bam_list])

    gene_list = gene_to_enhancer_dict.keys()
    gene_list.sort()
    ticker = 0
    print('number of regions processed:')
    for gene in gene_list:

        valleyDict[gene] = []

        for region in gene_to_enhancer_dict[gene]:
            if ticker % 100 == 0:
                print(ticker)
            ticker += 1
            scoreArray = scoreValley(region, bam_list, max_read_length,
                                     projectName, projectFolder)
            for index, score in enumerate(scoreArray):
                if score > cutoff:
                    valley = utils.Locus(region.chr(),
                                         region.start() + index * 10,
                                         region.start() + (index + 1) * 10,
                                         '.')
                    valleyDict[gene].append(valley)

        stitchedValleys = stitchValleys(valleyDict[gene])
        for valley in stitchedValleys:
            all_valley_bed.append([valley.chr(), valley.start(), valley.end()])
            valleyDict[gene] = stitchedValleys

    all_bed_path = projectFolder + projectName + '_all_valleys.bed'
    utils.unParseTable(all_valley_bed, all_bed_path, '\t')

    return all_bed_path
Beispiel #9
0
def createSuperLoci(superTable, Enumber='super'):
    '''
    takes as input a ROSE SuperEnhancer table 
    output a table of loci for SuperEnhancers
    '''

    print 'CREATING SUPER-ENHANCER LOCUS COLLECTION'

    output = []

    if Enumber == 'super':
        for line in superTable[6:]:
            if line[-1] == '1':
                locus = utils.Locus(line[1], line[2], line[3], '.', line[0], (float(line[6])-float(line[7])))
                output.append(locus)
    else:
        end = 6+int(Enumber)
        for line in superTable[6:end]:
            locus = utils.Locus(line[1], line[2], line[3], '.', line[0], (float(line[6])-float(line[7])))
            output.append(locus)

    return output
Beispiel #10
0
def makePeakGFFs(peak_path_list):

    '''
    makes a stitched gff for all MYC bound TSS and Distal regions across all datasets
    '''

    #setting the output
    tss_gff_path = '%sHG19_MYC_TSS_REGIONS_-0_+0.gff' % (gffFolder)
    distal_gff_path = '%sHG19_MYC_DISTAL_REGIONS_-0_+0.gff' % (gffFolder)

    #check to see if already done
    if utils.checkOutput(tss_gff_path,0.1,0.1) and utils.checkOutput(distal_gff_path,0.1,0.1):
        print('OUTPUT FOUND AT %s and %s' % (tss_gff_path,distal_gff_path))
        return tss_gff_path,distal_gff_path

    #emtpy loci lists to hold everything
    tss_loci = []
    distal_loci = []

    
    for peak_path in peak_path_list:
        print('processing %s' % (peak_path))

        peak_table=  utils.parseTable(peak_path,'\t')

        for line in peak_table[1:]:
            peak_locus = utils.Locus(line[1],line[2],line[3],'.')
            if int(line[5]) == 0:
                distal_loci.append(peak_locus)
            else:
                tss_loci.append(peak_locus)

    #now combind the loci
    print('stitching loci')
    distal_collection = utils.LocusCollection(distal_loci,50)
    tss_collection = utils.LocusCollection(tss_loci,50)

    stitched_distal_collection = distal_collection.stitchCollection()
    stitched_tss_collection = tss_collection.stitchCollection()

    #now make the gffs
    distal_gff= utils.locusCollectionToGFF(distal_collection)
    tss_gff= utils.locusCollectionToGFF(tss_collection)

    #now write to disk
    utils.unParseTable(distal_gff,distal_gff_path,'\t')
    utils.unParseTable(tss_gff,tss_gff_path,'\t')
    
    return tss_gff_path,distal_gff_path
Beispiel #11
0
def make_mycn_regions(conserved_rank_path):

    '''
    takes conserved NB MYCN regions 
    then creates a bed and gff of regions
    '''

    conserved_rank_table = utils.parseTable(conserved_rank_path,'\t')
    mycn_gff = []
    mycn_flank_gff = []
    mycn_bed = []
    mycn_flank_bed = []

    for line in conserved_rank_table[1:]:
        locus_line = utils.Locus(line[1],line[2],line[3],'.')
        
        if int(line[3]) < int(line[2]):
            print('uh oh')
            print(line)
        gff_line = [line[1],line[0],'',line[2],line[3],'','.','',line[0]]
        bed_line = [line[1],line[2],line[3],line[0]]
        mycn_gff.append(gff_line)
        mycn_bed.append(bed_line)

        gff_flank_line = [line[1],line[0],'',int(line[2])-500,int(line[3])+500,'','.','',line[0]]
        bed_flank_line = [line[1],int(line[2])-500,int(line[3])+500,line[0]]
        mycn_flank_gff.append(gff_flank_line)
        mycn_flank_bed.append(bed_flank_line)
        
    mycn_gff_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    mycn_flank_gff_path = '%sHG19_NB_MYCN_CONSERVED_-500_+500.gff' % (gffFolder)

    mycn_bed_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.bed' % (bedFolder)
    mycn_flank_bed_path = '%sHG19_NB_MYCN_CONSERVED_-500_+500.bed' % (bedFolder)

    #writing to disk
    utils.unParseTable(mycn_gff,mycn_gff_path,'\t')
    utils.unParseTable(mycn_flank_gff,mycn_flank_gff_path,'\t')

    utils.unParseTable(mycn_bed,mycn_bed_path,'\t')
    utils.unParseTable(mycn_flank_bed,mycn_flank_bed_path,'\t')

    print(mycn_gff_path)
    print(mycn_flank_gff_path)
    print(mycn_bed_path)
    print(mycn_flank_bed_path)
    return mycn_gff_path,mycn_flank_gff_path
Beispiel #12
0
def mapBamToGFFLine(bamFile, MMR, name, gffLine, color, nBins, sense='both', extension=200):
    '''maps reads from a bam to a gff'''

    print('using a MMR/scaling denominator value of %s' % (MMR))

    line = gffLine[0:9]
    gffLocus = utils.Locus(line[0], int(line[3]), int(line[4]), line[6], line[1])

    # setting up the output clusterline
    colorLine = color
    bamName = bamFile.split('/')[-1]
    clusterLine = [bamName, gffLocus.ID(), name, gffLocus.__str__()] + colorLine

    binSize = gffLocus.len() / nBins
    # some regions will be too short to get info on
    # we just kick these back and abandon them
    if binSize == 0:
        clusterLine += ['NA'] * int(nBins)
        return clusterLine

    # flippy flip if sense is negative
    senseTrans = string.maketrans('-+.', '+-+')
    if sense == '-':
        bamSense = string.translate(gffLocus.sense(), senseTrans)
    elif sense == '+':
        bamSense = gffLocus.sense()
    else:
        bamSense = '.'
    # using the bamLiquidator to get the readstring
    # print('using nBin of %s' % nBin)

    bamCommand = "%s %s %s %s %s %s %s %s" % (bamliquidatorString, bamFile, gffLocus.chr(), gffLocus.start(), gffLocus.end(), bamSense, nBins, extension)
    # print(bamCommand)
    getReads = subprocess.Popen(bamCommand, stdin=subprocess.PIPE, stderr=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
    readString = getReads.communicate()
    denList = readString[0].split('\n')[:-1]

    # flip the denList if the actual gff region is -
    if gffLocus.sense() == '-':
        denList = denList[::-1]

    # converting from units of total bp of read sequence per bin to rpm/bp
    denList = [round(float(x) / binSize / MMR, 4) for x in denList]

    clusterLine += denList

    return clusterLine
Beispiel #13
0
def generateSubpeakFASTA(TFandSuperDict, subpeaks, genomeDirectory, projectName, projectFolder, motifExtension):
    '''
    takes as input a BED file of constituents
    outputs a FASTA  file of merged extended super-enhancer consituents and associated formated name
    '''

    print 'MAKE FASTA'

    subpeakDict = {}
    subpeakBED = [['track name=' + projectName + ' color=204,0,204']]
    subpeakTable = utils.parseTable(subpeaks, '\t')

    subpeakLoci = [utils.Locus(l[0], int(l[1]), int(l[2]), '.') for l in subpeakTable]
    subpeakCollection = utils.LocusCollection(subpeakLoci, 50)

    for gene in TFandSuperDict.keys():
        subpeakDict[gene] = []
        for region in TFandSuperDict[gene]:
            overlaps = subpeakCollection.getOverlap(region)
            extendedOverlaps = [utils.makeSearchLocus(x, motifExtension, motifExtension) for x in overlaps]

            overlapCollectionTemp = utils.LocusCollection(extendedOverlaps, 50)
            overlapCollection = overlapCollectionTemp.stitchCollection()
            for overlap in overlapCollection.getLoci():
                subpeakBED.append([overlap.chr(), overlap.start(), overlap.end()])
                subpeakDict[gene].append(overlap)

    bedfilename = projectFolder + projectName + '_subpeaks.bed'
    utils.unParseTable(subpeakBED, bedfilename, '\t')

    fasta = []

    for gene in subpeakDict:
        for subpeak in subpeakDict[gene]:

            fastaTitle = gene + '|'  + subpeak.chr() + '|' + str(subpeak.start()) + '|' + str(subpeak.end())
            fastaLine = utils.fetchSeq(genomeDirectory, subpeak.chr(), int(subpeak.start()+1), int(subpeak.end()+1))

            fasta.append('>' + fastaTitle)
            fasta.append(upper(fastaLine))

    # Output the fasta file of extended SE constituents
    outname = projectFolder + projectName + '_SUBPEAKS.fa'

    utils.unParseTable(fasta, outname, '')
Beispiel #14
0
def makeSECollection(enhancerFile, name, superOnly=True):
    '''
    returns a locus collection from a super table
    top gives the number of rows
    '''
    enhancerTable = utils.parseTable(enhancerFile, '\t')
    enhancerLoci = []

    for line in enhancerTable:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue
        else:

            if superOnly and int(line[-1]) == 0:
                break
            enhancerLoci.append(
                utils.Locus(line[1], line[2], line[3], '.',
                            name + '_' + line[0]))

    return utils.LocusCollection(enhancerLoci, 50)
Beispiel #15
0
def makeSECollection(enhancerFile,name,top=0):
    '''
    returns a locus collection from a super table
    top gives the number of rows
    '''
    enhancerTable = utils.parseTable(enhancerFile,'\t')
    superLoci = []

    ticker = 0
    for line in enhancerTable:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue
        else:
            ticker+=1

            superLoci.append(utils.Locus(line[1],line[2],line[3],'.',name+'_'+line[0]))

            if ticker == top:
                break
    return utils.LocusCollection(superLoci,50)
Beispiel #16
0
def mapCollection(stitchedCollection, referenceCollection, bamFileList,
                  mappedFolder, output, refName):
    '''
    makes a table of factor density in a stitched locus and ranks table by number of loci stitched together
    '''

    print('FORMATTING TABLE')
    loci = stitchedCollection.getLoci()

    locusTable = [[
        'REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE'
    ]]

    lociLenList = []

    # strip out any that are in chrY
    for locus in list(loci):
        if locus.chr() == 'chrY':
            loci.remove(locus)

    for locus in loci:
        # numLociList.append(int(stitchLocus.ID().split('_')[1]))
        lociLenList.append(locus.len())
        # numOrder = order(numLociList,decreasing=True)
    lenOrder = utils.order(lociLenList, decreasing=True)
    ticker = 0
    for i in lenOrder:
        ticker += 1
        if ticker % 1000 == 0:
            print(ticker)
        locus = loci[i]

        # First get the size of the enriched regions within the stitched locus
        refEnrichSize = 0
        refOverlappingLoci = referenceCollection.getOverlap(locus, 'both')
        for refLocus in refOverlappingLoci:
            refEnrichSize += refLocus.len()

        try:
            stitchCount = int(locus.ID().split('_')[0])
        except ValueError:
            stitchCount = 1
        coords = [int(x) for x in locus.coords()]

        locusTable.append([
            locus.ID(),
            locus.chr(),
            min(coords),
            max(coords), stitchCount, refEnrichSize
        ])

    print('GETTING MAPPED DATA')
    print("USING A BAMFILE LIST:")
    print(bamFileList)
    for bamFile in bamFileList:

        bamFileName = bamFile.split('/')[-1]

        print('GETTING MAPPING DATA FOR  %s' % bamFile)
        # assumes standard convention for naming enriched region gffs

        # opening up the mapped GFF
        print('OPENING %s%s_%s_MAPPED/matrix.txt' %
              (mappedFolder, refName, bamFileName))

        mappedGFF = utils.parseTable(
            '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, refName, bamFileName),
            '\t')

        signalDict = defaultdict(float)
        print('MAKING SIGNAL DICT FOR %s' % (bamFile))
        mappedLoci = []
        for line in mappedGFF[1:]:

            chrom = line[1].split('(')[0]
            start = int(line[1].split(':')[-1].split('-')[0])
            end = int(line[1].split(':')[-1].split('-')[1])
            mappedLoci.append(utils.Locus(chrom, start, end, '.', line[0]))
            try:
                signalDict[line[0]] = float(line[2]) * (abs(end - start))
            except ValueError:
                print('WARNING NO SIGNAL FOR LINE:')
                print(line)
                continue

        mappedCollection = utils.LocusCollection(mappedLoci, 500)
        locusTable[0].append(bamFileName)

        for i in range(1, len(locusTable)):
            signal = 0.0
            line = locusTable[i]
            lineLocus = utils.Locus(line[1], line[2], line[3], '.')
            overlappingRegions = mappedCollection.getOverlap(lineLocus,
                                                             sense='both')
            for region in overlappingRegions:
                signal += signalDict[region.ID()]
            locusTable[i].append(signal)

    utils.unParseTable(locusTable, output, '\t')
Beispiel #17
0
def mapEnhancerToGene(annotFile,enhancerFile,transcribedFile='',uniqueGenes=True,searchWindow =50000,noFormatTable = False):
    
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerTable = utils.parseTable(enhancerFile,'\t')

    #internal parameter for debugging
    byRefseq = False


    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile,'\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(annotFile,0,0,500,transcribedGenes)


    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(utils.makeTSSLocus(geneID,startDict,0,0))


    #this turns the tssLoci list into a LocusCollection
    #50 is the internal parameter for LocusCollection and doesn't really matter
    tssCollection = utils.LocusCollection(tssLoci,50)

    

    geneDict = {'overlapping':defaultdict(list),'proximal':defaultdict(list)}

    #dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict= defaultdict(list)

    #list of all genes that appear in this analysis
    overallGeneList = []

    if noFormatTable:
        #set up the output tables
        #first by enhancer
        enhancerToGeneTable = [enhancerTable[0]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE']]

        
    else:
        #set up the output tables
        #first by enhancer
        enhancerToGeneTable = [enhancerTable[0][0:9]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE'] + enhancerTable[5][-2:]]

        #next by gene
        geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS']]

    #next make the gene to enhancer table
    geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS','ENHANCER_RANKS','IS_SUPER']]

        


    for line in enhancerTable:
        if line[0][0] =='#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1],line[2],line[3])
        
        enhancerLocus = utils.Locus(line[1],line[2],line[3],'.',line[0])

        #overlapping genes are transcribed genes whose transcript is directly in the stitchedLocus         
        overlappingLoci = transcribedCollection.getOverlap(enhancerLocus,'both')           
        overlappingGenes =[]
        for overlapLocus in overlappingLoci:                
            overlappingGenes.append(overlapLocus.ID())

        #proximalGenes are transcribed genes where the tss is within 50kb of the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus,searchWindow,searchWindow),'both')           
        proximalGenes =[]
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())


        distalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus,1000000,1000000),'both')           
        distalGenes =[]
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

            
            
        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes
        #these checks make sure each gene list is unique.
        #technically it is possible for a gene to be overlapping, but not proximal since the
        #gene could be longer than the 50kb window, but we'll let that slide here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)


        #Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            #get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3]))/2

            #get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes]
            #get the ID and convert to name
            closestGene = startDict[allEnhancerGenes[distList.index(min(distList))]]['name']

        #NOW WRITE THE ROW FOR THE ENHANCER TABLE
        if noFormatTable:

            newEnhancerLine = list(line)
            newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]),','))
            newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]),','))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:9]
            newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]),','))
            newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]),','))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)
        #Now grab all overlapping and proximal genes for the gene ordered table

        overallGeneList +=overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))
            
        overallGeneList+=proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))



    #End loop through
    
    #Make table by gene
    overallGeneList = utils.uniquify(overallGeneList)  

    #use enhancer rank to order
    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])
        
    usedNames = []
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if usedNames.count(geneName) > 0 and uniqueGenes == True:

            continue
        else:
            usedNames.append(geneName)
        
        proxEnhancers = geneDict['overlapping'][refID]+geneDict['proximal'][refID]
        
        superStatus = max(superDict[refID])
        enhancerRanks = join([str(x) for x in rankDict[refID]],',')
    
        newLine = [geneName,refID,join(proxEnhancers,','),enhancerRanks,superStatus]
        geneToEnhancerTable.append(newLine)

    #resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable,geneToEnhancerTable
    else:
        enhancerOrder = utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i+1)])

        return sortedTable,geneToEnhancerTable
Beispiel #18
0
def mapBamToGFF(bamFile,
                gff,
                sense='.',
                extension=200,
                rpm=False,
                clusterGram=None,
                matrix=None):
    '''maps reads from a bam to a gff'''

    #creating a new gff to output
    newGFF = []
    #reading in the bam
    bam = utils.Bam(bamFile)

    #getting RPM normalization
    if rpm:
        MMR = round(float(bam.getTotalReads('mapped')) / 1000000, 4)
    else:
        MMR = 1

    print('using a MMR value of %s' % (MMR))

    #creating a sense trans
    senseTrans = string.maketrans('-+.', '+-+')

    #reading in the gff
    if type(gff) == str:
        gff = utils.parseTable(gff, '\t')

    #setting up a clustergram table
    if clusterGram:
        binSize = int(clusterGram)
        binSizeList = []
        #now go through each line of the gff and make sure they're all the same length
        for i in range(0, len(gff), 1):
            line = gff[i]
            gffLocus = utils.Locus(line[0], int(line[3]), int(line[4]),
                                   line[6], line[1])
            binSizeList.append(gffLocus.len() / binSize)
        binSizeList = utils.uniquify(binSizeList)
        if len(binSizeList) > 1:
            print(
                'WARNING: lines in gff are of different length. Output clustergram will have variable row length'
            )
        newGFF.append(['GENE_ID', 'locusLine'] + [
            str(x * binSize) + '_' + bamFile.split('/')[-1]
            for x in range(1,
                           max(binSizeList) + 1, 1)
        ])

    #setting up a maxtrix table
    if matrix:
        newGFF.append(['GENE_ID', 'locusLine'] + [
            'bin_' + str(n) + '_' + bamFile.split('/')[-1]
            for n in range(1,
                           int(matrix) + 1, 1)
        ])
        nBin = int(matrix)

    # Try to use the bamliquidatior script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamliquidatorString = '/usr/bin/bamliquidator'
    if not os.path.isfile(bamliquidatorString):
        bamliquidatorString = './bamliquidator'
        if not os.path.isfile(bamliquidatorString):
            raise ValueError('bamliquidator not found in path')

    #getting and processing reads for gff lines
    ticker = 0
    print('Number lines processed')
    for line in gff:
        line = line[0:9]
        if ticker % 100 == 0:
            print(ticker)
        ticker += 1
        gffLocus = utils.Locus(line[0], int(line[3]), int(line[4]), line[6],
                               line[1])

        #get the nBin and binSize
        if clusterGram:
            nBin = gffLocus.len() / int(clusterGram)
            binSize = int(clusterGram)
        if matrix:
            nBin = int(matrix)
            binSize = gffLocus.len() / nBin
            #some regions will be too short to get info on
            if binSize == 0:
                clusterLine = [gffLocus.ID(),
                               gffLocus.__str__()] + ['NA'] * nBin
                newGFF.append(clusterLine)
                continue

        #flippy flip if sense is negative
        if sense == '-':
            bamSense = string.translate(gffLocus.sense(), senseTrans)
        elif sense == '+':
            bamSense = gffLocus.sense()
        else:
            bamSense = '.'
        #using the bamLiquidator to get the readstring
        #print('using nBin of %s' % nBin)
        bamCommand = "%s %s %s %s %s %s %s %s" % (
            bamliquidatorString, bamFile, line[0], gffLocus.start(),
            gffLocus.end(), bamSense, nBin, extension)
        #print(bamCommand)
        getReads = subprocess.Popen(bamCommand,
                                    stdin=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    stdout=subprocess.PIPE,
                                    shell=True)
        readString, stderr = getReads.communicate()
        if stderr:
            print("STDERR out: %s" % (stderr))
        denList = readString.split('\n')[:-1]
        #print("denlist is: %s" % denList)
        #flip the denList if the actual gff region is -
        if gffLocus.sense() == '-':
            denList = denList[::-1]

        #converting from units of total bp of read sequence per bin to rpm/bp

        denList = [round(float(x) / binSize / MMR, 4) for x in denList]

        #if the gff region is - strand, flip the

        clusterLine = [gffLocus.ID(), gffLocus.__str__()] + denList
        newGFF.append(clusterLine)

    return newGFF
Beispiel #19
0
def findCanidateTFs(annotationFile, superLoci, expressedNM, TFlist, refseqToNameDict, projectFolder, projectName):
    '''
    find all TFs within 1Mb of the super-enhancer center that are considered expressed 
    return a dictionary keyed by TF that points to a list of super-enhancer loci
    '''

    print 'FINDING CANIDATE TFs'

    startDict = utils.makeStartDict(annotationFile)

    # Find the location of the TSS of all transcripts (NMid) considered expressed
    tssLoci = []
    for geneID in expressedNM:
        tssLoci.append(utils.makeTSSLocus(geneID,startDict,0,0))
    tssCollection = utils.LocusCollection(tssLoci,50)

    # Assign all transcripts (NMid) that are TFs to a super-enhancer if it is the closest gene
    seAssignment = []
    seAssignmentGene = []
    TFandSuperDict = {}

    for superEnh in superLoci:

        seCenter = (superEnh.start() + superEnh.end()) / 2 

        # Find all transcripts whose TSS occur within 1Mb of the SE center
        searchLocus = utils.Locus(superEnh.chr(), superEnh.start()-1000000, superEnh.end()+1000000, '.')
        allEnhancerLoci = tssCollection.getOverlap(searchLocus)
        allEnhancerGenes = [locus.ID() for locus in allEnhancerLoci]

        # Find the transcript that is closest to the center
        if allEnhancerGenes:
            distList = [abs(seCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes]
            closestGene = allEnhancerGenes[distList.index(min(distList))]
        else:
            closestGene = ''

        seAssignment.append([superEnh.chr(), superEnh.start(), superEnh.end(), closestGene])

        # Select the transcript if it is a TF, and allow for a TF to have multiple SEs
        if closestGene in TFlist and closestGene not in TFandSuperDict.keys():
            TFandSuperDict[closestGene] = [superEnh]
        elif closestGene in TFlist and closestGene in TFandSuperDict.keys():
            TFandSuperDict[closestGene].append(superEnh)

        # Convert the selected TF NMids to gene names
        if closestGene != '':
            geneName = refseqToNameDict[closestGene]
            seAssignmentGene.append([superEnh.chr(), superEnh.start(), superEnh.end(), geneName])

    # Output the list of SE-assigned transcripts (NMids)
    seAssignmentFile = projectFolder + projectName + '_SE_ASSIGNMENT_TRANSCRIPT.txt'
    utils.unParseTable(seAssignment, seAssignmentFile, '\t')

    # Output the list of SE-assigned genes
    seAssignmentGeneFile = projectFolder + projectName + '_SE_ASSIGNMENT_GENE.txt'
    utils.unParseTable(seAssignmentGene, seAssignmentGeneFile, '\t')

    print 'Number of canidate TFs:', len(TFandSuperDict)

    return TFandSuperDict
Beispiel #20
0
def rank_eboxes(nb_all_chip_dataFile,mycn_gff_path,macsFolder,genomeDirectory,window = 100):

    '''
    uses the  conserved MYCN sites and ranks eboxes within them
    by average background subtracted signal
    searches 100bp (window variable)  from mycn summits
    '''
    
    window = int(window)

    #bring in the conserved mycn region
    print('making gff of nb mycn summits')
    nb_mycn_gff = utils.parseTable(mycn_gff_path,'\t')

    nb_mycn_collection = utils.gffToLocusCollection(nb_mycn_gff,50)

    dataDict =pipeline_dfci.loadDataTable(nb_all_chip_dataFile)
    names_list = [name for name in dataDict.keys() if name.count('MYCN') == 1]
    names_list.sort()

    summit_loci = []
    #first makes a gff of all summits +/- 100bp for all nb mycn datasets
    for name in names_list:
        summit_bed_path = '%s%s/%s_summits.bed' % (macsFolder,name,name)
        summit_bed = utils.parseTable(summit_bed_path,'\t')
        for line in summit_bed:
            summit_locus = utils.Locus(line[0],int(line[1])-window,int(line[2])+window,'.',line[3])
            if len(nb_mycn_collection.getOverlap(summit_locus)) > 0:
                summit_loci.append(summit_locus)

    summit_collection =utils.LocusCollection(summit_loci,50)
    summit_merged_collection = summit_collection.stitchCollection()
    
    summit_gff = utils.locusCollectionToGFF(summit_merged_collection)
    summit_gff_path = '%sHG19_NB_MYCN_SUMMITS_-%s_+%s.gff' % (gffFolder,window,window)
    utils.unParseTable(summit_gff,summit_gff_path,'\t')

    #this is borrowed from above and maps chip-seq signal to the gff
    print('mapping to nb mycn summits and making signal dict')
    gffList = [summit_gff_path]
    summit_signal_path = pipeline_dfci.map_regions(nb_all_chip_dataFile,gffList)


    mycnSignalTable = utils.parseTable(summit_signal_path,'\t')

    #making a signal dictionary for MYCN binding
    names_list = ['BE2C_MYCN','KELLY_MYCN','NGP_MYCN','SHEP21_0HR_MYCN_NOSPIKE']
    background_list = [dataDict[name]['background'] for name in names_list]
    header = mycnSignalTable[0]
    chip_columns = [header.index(name) for name in names_list]
    background_columns = [header.index(background_name) for background_name in background_list]
    
    mycn_sig_dict = {}
    for line in mycnSignalTable[1:]:
        line_sig = []
        for i in range(len(names_list)):
            line_sig.append(float(line[chip_columns[i]]) - float(line[background_columns[i]]))
        region_id = line[1]
        coords = [int(x) for x in line[1].split(':')[-1].split('-')]
        line_length = coords[1]-coords[0]
        mycn_sig_dict[region_id] = numpy.mean(line_sig)*line_length

    #now for each region find the eboxes and then add up the signal
    print('making ebox ranking')
    ebox_list = ['CACGTG','CAGTTG','CAAGTG','CAGGTG','CAATTG','CAAATG','CATCTG','CAGCTG','CATGTG','CATATG']
    eboxDict = {}
    for ebox in ebox_list:
        eboxDict[ebox] = []
    ticker = 0
    for line in summit_gff:
        if ticker % 1000 == 0:
            print(ticker)
        ticker+=1

        chrom = line[0]
        sense = '.'

        start = int(line[3])
        end = int(line[4])
        region_id = '%s(%s):%s-%s' % (line[0],line[6],line[3],line[4])
        signal = mycn_sig_dict[region_id]

        sequenceLine = utils.fetchSeq(genomeDirectory,chrom,start,end,True)
        
        motifVector = []
        matches = re.finditer('CA..TG',str.upper(sequenceLine))
        if matches:
            for match in matches:
                motifVector.append(match.group())
        
        #count only 1 of each motif type per line
        #motifVector = utils.uniquify(motifVector)
        for motif in motifVector:
            if ebox_list.count(motif) > 0:
                eboxDict[motif].append(signal)
            else:
                eboxDict[utils.revComp(motif)].append(signal)


    eboxTable =[]
    eboxTableOrdered =[['EBOX','OCCURENCES','AVG_HEIGHT']]
    for ebox in eboxDict.keys():
        newLine = [ebox,len(eboxDict[ebox]),numpy.mean(eboxDict[ebox])]
        eboxTable.append(newLine)


    occurenceOrder = utils.order([line[2] for line in eboxTable],decreasing=True)
    
    for x in occurenceOrder:
        eboxTableOrdered.append(eboxTable[x])
    print(eboxTableOrdered)
    ebox_outfile = '%sHG19_NB_MYCN_CONSERVED_SUMMITS_-%s_+%s_EBOX_RANK.txt' % (tableFolder,window,window)
    utils.unParseTable(eboxTableOrdered,ebox_outfile,'\t')
    return ebox_outfile
Beispiel #21
0
def collapseFimo(fimo_output, gene_to_enhancer_dict, candidate_tf_list,
                 output_folder, analysis_name, motifConvertFile):
    '''
    collapses motifs from fimo
    for each source node (TF) and each target node (gene enhancer regions), collapse motif instances
    then spit out a ginormous set of beds and a single crazy collapsed bed
    '''

    #first build up the motif name conversion database

    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = defaultdict(list)
    # The reverse of the other dict, from motif name to gene name
    # a motif can go to multiple genes
    for line in motifDatabase:
        motifDatabaseDict[line[0]].append(line[1])

    #make the folder to store motif beds
    utils.formatFolder('%smotif_beds/' % (output_folder), True)

    edgeDict = {}
    #first layer are source nodes
    for tf in candidate_tf_list:
        edgeDict[tf] = defaultdict(
            list
        )  #next layer are target nodes which are derived from the fimo output

    fimoTable = utils.parseTable(fimo_output, '\t')
    print(fimo_output)

    #fimo sometimes puts the region in either the first or second column
    fimo_line = fimoTable[1]
    if fimo_line[1].count('|') > 0:
        region_index = 1
    else:
        region_index = 2
    print('USING COLUMN %s OF FIMO OUTPUT FOR REGION' % (region_index))

    for line in fimoTable[1:]:
        source_tfs = motifDatabaseDict[line[0]]  #motifId
        for source in source_tfs:
            if candidate_tf_list.count(source) == 0:
                continue
            region = line[region_index].split('|')

            target = region[0]
            if region_index == 2:
                target_locus = utils.Locus(region[1],
                                           int(region[2]) + int(line[3]),
                                           int(region[2]) + int(line[4]), '.')
            else:
                target_locus = utils.Locus(region[1],
                                           int(region[2]) + int(line[2]),
                                           int(region[2]) + int(line[3]), '.')
            #what's missing here is the enhancer id of the target locus
            try:
                edgeDict[source][target].append(target_locus)
            except KeyError:
                print('this motif is not in the network')
                print(line)
                sys.exit()

    #now we actually want to collapse this down in a meaningful way
    #overlapping motifs count as a single binding site. This way a TF with tons of motifs
    #that finds the same site over and over again doesn't get over counted
    all_bed = []
    all_bed_path = '%s%s_all_motifs.bed' % (output_folder, analysis_name)
    for tf in candidate_tf_list:
        print(tf)
        target_nodes = edgeDict[tf].keys()
        bed_header = [
            'track name = "%s" description="%s motifs in %s"' %
            (tf, tf, analysis_name)
        ]
        all_bed.append(bed_header)
        target_bed = [bed_header]
        target_bed_path = '%smotif_beds/%s_motifs.bed' % (output_folder, tf)
        for target in target_nodes:
            edgeCollection = utils.LocusCollection(edgeDict[tf][target], 50)
            edgeCollection = edgeCollection.stitchCollection()
            edgeLoci = edgeCollection.getLoci()
            edgeDict[tf][target] = edgeLoci
            for locus in edgeLoci:
                bed_line = [
                    locus.chr(),
                    locus.start(),
                    locus.end(), target, '', '+'
                ]
                target_bed.append(bed_line)
                all_bed.append(bed_line)

        utils.unParseTable(target_bed, target_bed_path, '\t')

    #now the loci are all stitched up
    utils.unParseTable(all_bed, all_bed_path, '\t')
    return edgeDict
Beispiel #22
0
def make_shep_on_mycn_landscape(shep_on_dataFile):

    '''
    finds mycn peaks in shep21 that are conserved in nb and segregates them into promoter or enhancer
    '''
    dataDict = pipeline_dfci.loadDataTable(shep_on_dataFile)


    print('LOADING SHEP ON MYCN SITES')
    #load all of the shep_on sites
    # shep_on_gff_path = '%smeta_rose/SHEP_ON_MYC/gff/HG19_SHEP_ON_MYC_ALL_-0_+0.gff' % (projectFolder)
    # shep_on_gff = utils.parseTable(shep_on_gff_path,'\t')

    shep_on_bed_path = '%sSHEP_6HR_MYCN_peaks.bed' % (macsEnrichedFolder)
    shep_on_bed = utils.parseTable(shep_on_bed_path,'\t')
    shep_on_gff = utils.bedToGFF(shep_on_bed)
    
    #now get the conserved NB MYCN regions
    nb_conserved_mycn_gff_file = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    nb_conserved_mycn_collection = utils.gffToLocusCollection(nb_conserved_mycn_gff_file)

    print('LOADING SHEP ACTIVE ENHANCERS') 
    #make a collection of enhancers
    shep_enhancer_file = '%smeta_rose/SHEP_ON_H3K27AC/SHEP_ON_H3K27AC_AllEnhancers.table.txt' % (projectFolder)
    shep_enhancer_collection = utils.makeSECollection(shep_enhancer_file,'SHEP_H3K27AC')

    #now get the active promoters
    print('LOADING SHEP ACTIVE PROMOTERS')
    startDict = utils.makeStartDict(annotFile)
    shep_transcribed_file = '%sHG19_SHEP_ON_H3K27AC_ACTIVE.txt' % (geneListFolder)
    shep_transcribed_table = utils.parseTable(shep_transcribed_file,'\t')
    transcribedList = [line[1] for line in shep_transcribed_table]
    tssLoci = []
    for refID in transcribedList:
        tssLoci.append(utils.makeTSSLocus(refID,startDict,1000,1000))

    shep_tss_collection = utils.LocusCollection(tssLoci,50)

    #now initialize the 6 gffs we will need
    shep_mycn_gff = [] 
    shep_mycn_gff_5kb = []
    shep_mycn_gff_1kb = []

    shep_mycn_promoter_gff = []
    shep_mycn_promoter_gff_1kb = []
    shep_mycn_promoter_gff_5kb = []

    shep_mycn_enhancer_gff = []
    shep_mycn_enhancer_gff_1kb = []
    shep_mycn_enhancer_gff_5kb = []

    #and their respective file names
    shep_mycn_gff_file = '%sHG19_SHEP_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    shep_mycn_gff_5kb_file = '%sHG19_SHEP_MYCN_CONSERVED_-5kb_+5kb.gff' % (gffFolder)
    shep_mycn_gff_1kb_file = '%sHG19_SHEP_MYCN_CONSERVED_-1kb_+1kb.gff' % (gffFolder)

    shep_mycn_promoter_gff_file = '%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-0_+0.gff' % (gffFolder)
    shep_mycn_promoter_gff_5kb_file = '%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-5kb_+5kb.gff' % (gffFolder)
    shep_mycn_promoter_gff_1kb_file = '%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-1kb_+1kb.gff' % (gffFolder)

    shep_mycn_enhancer_gff_file = '%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-0_+0.gff' % (gffFolder)
    shep_mycn_enhancer_gff_5kb_file = '%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-5kb_+5kb.gff' % (gffFolder)
    shep_mycn_enhancer_gff_1kb_file = '%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-1kb_+1kb.gff' % (gffFolder)

    print('ITERATING THROUGH SHEP MYCN PEAKS')

    ticker = 0
    enhancer = 0
    promoter = 0 

    other = 0
    for line in shep_on_gff:
        if ticker % 1000 == 0:
            print ticker
        ticker+=1
        peakID = '%s_%s' % ('SHEP_MYCN',str(ticker))

        lineLocus = utils.Locus(line[0],line[3],line[4],'.',peakID)

        if nb_conserved_mycn_collection.getOverlap(lineLocus):
            gffLine = [line[0],peakID,peakID,line[3],line[4],'','.','',peakID]
            peakCenter = (int(line[3]) + int(line[4]))/2
            gffLine_5kb = [line[0],peakID,peakID,peakCenter - 5000,peakCenter + 5000,'','.','',peakID]
            #the 1kb is not a center +/- but a flank
            gffLine_1kb = [line[0],peakID,peakID,int(line[3]) - 1000,int(line[4]) + 1000,'','.','',peakID]

            shep_mycn_gff.append(gffLine)
            shep_mycn_gff_5kb.append(gffLine_5kb)
            shep_mycn_gff_1kb.append(gffLine_1kb)

            #tss overlap should take precedence over enhancer overlap
            if shep_tss_collection.getOverlap(lineLocus,'both'):
                shep_mycn_promoter_gff.append(gffLine)
                shep_mycn_promoter_gff_5kb.append(gffLine_5kb)
                shep_mycn_promoter_gff_1kb.append(gffLine_1kb)
                promoter+=1
            #now check for enhancer overlap
            elif shep_enhancer_collection.getOverlap(lineLocus,'both'):
                shep_mycn_enhancer_gff.append(gffLine)
                shep_mycn_enhancer_gff_5kb.append(gffLine_5kb)
                shep_mycn_enhancer_gff_1kb.append(gffLine_1kb)
                enhancer+=1
            else:
                other+=1
    
    print('Of %s shep on mycn peaks' % (len(shep_on_gff)))
    print('%s are promoter' % (promoter))
    print('%s are enhancer' % (enhancer))
    print('%s are other' % (other))
    #now write out the gffs
    utils.unParseTable(shep_mycn_gff,shep_mycn_gff_file,'\t')
    utils.unParseTable(shep_mycn_gff_5kb,shep_mycn_gff_5kb_file,'\t')
    utils.unParseTable(shep_mycn_gff_1kb,shep_mycn_gff_1kb_file,'\t')

    utils.unParseTable(shep_mycn_promoter_gff,shep_mycn_promoter_gff_file,'\t')
    utils.unParseTable(shep_mycn_promoter_gff_5kb,shep_mycn_promoter_gff_5kb_file,'\t')
    utils.unParseTable(shep_mycn_promoter_gff_1kb,shep_mycn_promoter_gff_1kb_file,'\t')

    utils.unParseTable(shep_mycn_enhancer_gff,shep_mycn_enhancer_gff_file,'\t')
    utils.unParseTable(shep_mycn_enhancer_gff_5kb,shep_mycn_enhancer_gff_5kb_file,'\t')
    utils.unParseTable(shep_mycn_enhancer_gff_1kb,shep_mycn_enhancer_gff_1kb_file,'\t')
# First, load in the node TFs, ATAC peaks and super enhancer regions we'll consider for this analysis
# From networks already constructed from CRC2.py

node_file = '/crusader/projects/cll/final/network/lines/zinba/' + projectName + '/' + projectName + '_NODELIST.txt'
node_table = utils.parseTable(node_file, '\t')
nodelist = [x[0] for x in node_table]
print nodelist
super_enhancer_file = '/crusader/projects/cll/final/rose/' + projectName + '_H3K27ac/' + projectName + '_H3K27ac_peaks_SuperEnhancers.table.txt'

se_table = utils.parseTable(super_enhancer_file, '\t')

subpeak_file = '/crusader/projects/cll/final/zinba/lines/MEC1_ATAC/MEC1_ATAC.peaks.bed'
subpeak_table = utils.parseTable(subpeak_file, '\t')
subpeak_loci = []
for line in subpeak_table:
    subpeak_loci.append(utils.Locus(line[0], line[1], line[2], '.'))
subpeak_collection = utils.LocusCollection(subpeak_loci, 100)
subpeak_dict = {}  # key is enhancer ID, points to a list of loci

# assign subpeak Loci to each super enhancer
fasta = []
se_namelist = []
for line in se_table[6:]:

    se_id = line[0]
    se_namelist.append(se_id)
    subpeak_dict[se_id] = []

    se_locus = utils.Locus(line[1], line[2], line[3], '.')
    overlaps = subpeak_collection.getOverlap(se_locus)
def main():

    projectFolder = '/storage/goodell/home/jmreyes/projects/amish_ayala/'

    #gather up DMR tables
    #ayala MUT vs WT
    mutWT_hypo = utils.parseTable(projectFolder + 'bed/hypoDMRsWT.vs.Mut.bed',
                                  '\t')
    mutWT_hyper = utils.parseTable(
        projectFolder + 'bed/hyperDMRsWT.vs.Mut.bed', '\t')

    mutWT_control = utils.parseTable(
        projectFolder + 'bed/Control_nonDMRsWT.vs.Mut.bed', '\t')

    #ley all
    tbrs_all = utils.parseTable(projectFolder + 'bed/TBRS_DMRs.bed', '\t')
    aml_all = utils.parseTable(projectFolder + 'bed/AML_DMRs.bed', '\t')

    tbrs_hypo = []
    tbrs_hyper = []

    aml_hypo = []
    aml_hyper = []

    tbrs_all_loci = []
    aml_all_loci = []

    for line in tbrs_all:
        chrom = 'chr' + line[0]
        start = line[1]
        end = line[2]
        if 'hypo' in line:
            tbrs_all_loci.append(
                utils.Locus(
                    chrom, start, end, '.', 'tbrs_all_hypo_' + str(chrom) +
                    ':' + str(start) + '-' + str(end)))
        elif 'hyper' in line:
            tbrs_all_loci.append(
                utils.Locus(
                    chrom, start, end, '.', 'tbrs_all_hyper_' + str(chrom) +
                    ':' + str(start) + '-' + str(end)))

    for line in aml_all:
        chrom = 'chr' + line[0]
        start = line[1]
        end = line[2]
        if 'hypo' in line:
            aml_all_loci.append(
                utils.Locus(
                    chrom, start, end, '.', 'aml_all_hypo_' + str(chrom) +
                    ':' + str(start) + '-' + str(end)))
        elif 'hyper' in line:
            aml_all_loci.append(
                utils.Locus(
                    chrom, start, end, '.', 'aml_all_hyper_' + str(chrom) +
                    ':' + str(start) + '-' + str(end)))

    mutWT_hypo_loci = []

    for line in mutWT_hypo:
        chrom = line[0]
        start = line[1]
        end = line[2]
        sense = '.'
        locusID = 'hypo_' + str(chrom) + ':' + str(start) + '-' + str(end)
        new_line = utils.Locus(chrom, start, end, '.', locusID)
        mutWT_hypo_loci.append(new_line)

    mutWT_hyper_loci = []

    for line in mutWT_hyper:
        chrom = line[0]
        start = line[1]
        end = line[2]
        sense = '.'
        locusID = 'hyper_' + str(chrom) + ':' + str(start) + '-' + str(end)
        new_line = utils.Locus(chrom, start, end, '.', locusID)
        mutWT_hyper_loci.append(new_line)

    print len(mutWT_hyper_loci)
    print len(mutWT_hypo_loci)

    mutWT_all_loci = mutWT_hyper_loci + mutWT_hypo_loci
    mutWT_hypo_LC = utils.LocusCollection(mutWT_hypo_loci)

    tbrs_all_LC = utils.LocusCollection(tbrs_all_loci)
    aml_all_LC = utils.LocusCollection(aml_all_loci)

    tbrs_all_overlap = []
    aml_all_overlap = []

    for locus in mutWT_hypo_LC.getLoci():

        tbrs_overlap = tbrs_all_LC.getOverlap(locus, 'both')
        if len(tbrs_overlap) > 0:
            for overlapLocus in tbrs_overlap:
                overlapChrom = overlapLocus.chr()
                overlapStart = overlapLocus.start()
                overlapEnd = overlapLocus.end()

                tbrs_all_overlap.append([
                    locus.ID(), overlapChrom, overlapStart, overlapEnd,
                    overlapLocus.ID()
                ])

        aml_overlap = aml_all_LC.getOverlap(locus, 'both')
        if len(aml_overlap) > 0:
            for overlapLocus in aml_overlap:
                overlapChrom = overlapLocus.chr()
                overlapStart = overlapLocus.start()
                overlapEnd = overlapLocus.end()

                aml_all_overlap.append([
                    locus.ID(), overlapChrom, overlapStart, overlapEnd,
                    overlapLocus.ID()
                ])

    utils.unParseTable(tbrs_all_overlap,
                       projectFolder + 'tables/DMRsvsTBRS_all_overlaps.txt',
                       '\t')
    utils.unParseTable(aml_all_overlap,
                       projectFolder + 'tables/DMRsvsAML_all_overlaps.txt',
                       '\t')
Beispiel #25
0
def mapGFFLineToAnnot(gffLine,
                      outFolder,
                      nBins,
                      geneDict,
                      txCollection,
                      sense='both',
                      header=''):
    '''
    for every line produces a file with all of the rectangles to draw
    '''

    if len(header) == 0:
        gffString = '%s_%s_%s_%s' % (gffLine[0], gffLine[6], gffLine[3],
                                     gffLine[4])
    else:
        gffString = header
    diagramTable = [[0, 0, 0, 0]]
    nameTable = [['', 0, 0]]
    gffLocus = utils.Locus(gffLine[0], int(gffLine[3]), int(gffLine[4]),
                           gffLine[6], gffLine[1])

    scaleFactor = float(nBins) / gffLocus.len()
    # plotting buffer for diagrams
    plotBuffer = int(gffLocus.len() / float(nBins) * 20)

    overlapLoci = txCollection.getOverlap(gffLocus, sense='both')
    geneList = [locus.ID() for locus in overlapLoci]

    if gffLine[6] == '-':
        refPoint = int(gffLine[4])
    else:
        refPoint = int(gffLine[3])
    offsetCollection = utils.LocusCollection([], 500)
    for geneID in geneList:

        gene = geneDict[geneID]

        print(gene.commonName())
        if len(gene.commonName()) > 1:
            name = gene.commonName()
        else:
            name = geneID
        offset = 4 * len(offsetCollection.getOverlap(gene.txLocus()))
        offsetCollection.append(
            utils.makeSearchLocus(gene.txLocus(), plotBuffer, plotBuffer))
        # write the name of the gene down
        if gene.sense() == '+':
            geneStart = gene.txLocus().start()
        else:
            geneStart = gene.txLocus().end()
        geneStart = abs(geneStart - refPoint) * scaleFactor
        nameTable.append([name, geneStart, -2 - offset])
        # draw a line across the entire txLocus

        [start, stop] = [
            abs(x - refPoint) * scaleFactor for x in gene.txLocus().coords()
        ]
        diagramTable.append([start, -0.01 - offset, stop, 0.01 - offset])

        # now draw thin boxes for all txExons
        if len(gene.txExons()) > 0:
            for txExon in gene.txExons():

                [start, stop] = [
                    abs(x - refPoint) * scaleFactor for x in txExon.coords()
                ]

                diagramTable.append([start, -0.5 - offset, stop, 0.5 - offset])

        # now draw fatty boxes for the coding exons if any
        if len(gene.cdExons()) > 0:
            for cdExon in gene.cdExons():

                [start, stop] = [
                    abs(x - refPoint) * scaleFactor for x in cdExon.coords()
                ]

                diagramTable.append([start, -1 - offset, stop, 1 - offset])

    utils.unParseTable(diagramTable,
                       outFolder + gffString + '_diagramTemp.txt', '\t')
    utils.unParseTable(nameTable, outFolder + gffString + '_nameTemp.txt',
                       '\t')
Beispiel #26
0
def geneToEnhancerDict(genome, enhancer_file, activity_path):
    '''                                                           
    Assign each Super-Enhancer to the closest active TSS to its center
    Return a dictionary keyed by TF that points to a list of loci 
    '''
    print('Identifying enhancers and target genes from %s' % (enhancer_file))
    #should this do gene assignment????
    #for now assume gene assignment has been done
    #can later toggle to do gene assignment

    #first load the TF lists

    tf_table = utils.parseTable(genome.returnFeature('tf_file'), '\t')

    motif_table = utils.parseTable(genome.returnFeature('motif_convert'), '\t')
    #this gives all tfs that have a motif
    motif_tfs = utils.uniquify([line[1] for line in motif_table])

    #intersect w/ the activity table
    if len(activity_path) > 0:
        activity_table = utils.parseTable(activity_path, '\t')
        #figure out the right column for actual gene names (basically not NM or NR and not a numeral)
        for i in range(len(activity_table[0])):
            # try:
            #     foo = int(activity_table[0][i])
            # except ValueError: # case where it is not an integer
            if activity_table[0][i][0:2] != 'NM' and activity_table[0][i][
                    0:2] != 'NR':  #assumes refseq
                gene_col = i
                break
        print('using column %s of %s gene activity table for common names' %
              (gene_col + 1, activity_path))

        active_gene_list = [
            string.upper(line[gene_col]) for line in activity_table
        ]

        tf_list_refseq = [
            line[0] for line in tf_table if active_gene_list.count(line[1]) > 0
            and motif_tfs.count(line[1]) > 0
        ]
        tf_list_name = utils.uniquify([
            line[1] for line in tf_table if active_gene_list.count(line[1]) > 0
            and motif_tfs.count(line[1]) > 0
        ])
    else:
        tf_list_refseq = [
            line[0] for line in tf_table if motif_tfs.count(line[1]) > 0
        ]
        tf_list_name = [
            line[1] for line in tf_table if motif_tfs.count(line[1]) > 0
        ]

    print('Identified %s TFs from %s that have motifs' %
          (len(tf_list_name), genome.returnFeature('tf_file')))

    #keyed by gene with loci objects in the list
    gene_to_enhancer_dict = defaultdict(list)
    enhancer_to_gene_dict = defaultdict(list)

    #assuming id,chrom,start,stop w/ gene names in the last 3 columns per standard ROSE output
    enhancer_table = utils.parseTable(enhancer_file, '\t')
    print('Analyzing %s cis-regulatory regions' % (len(enhancer_table)))

    #now let's make the enhancer table by region and then by gene
    enhancerTable = [['ENHANCER_ID', 'CHROM', 'START', 'STOP', 'GENE_LIST']]
    enhancerTFTable = [['ENHANCER_ID', 'CHROM', 'START', 'STOP', 'GENE_LIST']]
    geneTable = [['GENE', 'TF', 'CHROM', 'START', 'STOP', 'ENHANCER_ID']]
    geneTFTable = [['GENE', 'CHROM', 'START', 'STOP', 'ENHANCER_ID']]
    geneSummaryTable = [['GENE', 'TF', 'ENHANCER_LIST']]

    #will need to track which ones are TFs
    candidate_tf_list = []
    #find the columns for gene assignment
    header = enhancer_table[0]
    header_length = len(enhancer_table[0])
    closest_index = header.index('CLOSEST_GENE')
    proximal_index = header.index('PROXIMAL_GENES')
    overlap_index = header.index('OVERLAP_GENES')
    for line in enhancer_table[1:]:
        if len(
                line
        ) != header_length:  #don't bother trying to figure out lines w/o target genes
            continue
        enhancer_locus = utils.Locus(line[1], line[2], line[3], '.', line[0])
        closest_gene_list = line[closest_index].split(',')
        proximal_gene_list = line[proximal_index].split(',')
        overlap_gene_list = line[overlap_index].split(',')
        all_gene_list = closest_gene_list + proximal_gene_list + overlap_gene_list
        all_gene_list = [string.upper(gene) for gene in all_gene_list]

        #print(all_gene_list)

        #print(activity_path)
        #print(active_gene_list)
        #gets a unique list of all tfs

        if len(activity_path) > 0:
            all_gene_list = utils.uniquify([
                gene for gene in all_gene_list
                if active_gene_list.count(gene) > 0
            ])
        else:
            all_gene_list = utils.uniquify(all_gene_list)
        candidate_gene_list = utils.uniquify(
            [gene for gene in all_gene_list if tf_list_name.count(gene) > 0])
        if len(all_gene_list) > 0:
            for gene in all_gene_list:

                gene_to_enhancer_dict[gene].append(enhancer_locus)
                enhancer_to_gene_dict[enhancer_locus].append(gene)
            newLine = line[0:4] + [','.join(all_gene_list)]
        else:
            newLine = line[0:4] + ['']
        enhancerTable.append(newLine)

        if len(candidate_gene_list) > 0:
            tfLine = line[0:4] + [','.join(candidate_gene_list)]
            enhancerTFTable.append(tfLine)

    #now iterate through each gene and list the enhancers
    gene_list = gene_to_enhancer_dict.keys()
    print(gene_list)
    gene_list.sort()
    for gene in gene_list:
        if tf_list_name.count(gene) > 0:
            tf_status = 1
            candidate_tf_list.append(gene)
        else:
            tf_status = 0
        enhancer_loci = gene_to_enhancer_dict[gene]
        enhancerString = ','.join(
            [enhancer.ID() for enhancer in enhancer_loci])
        geneSummaryTable.append([gene, tf_status, enhancerString])
        for enhancer in enhancer_loci:
            newLine = [
                gene, tf_status,
                enhancer.chr(),
                enhancer.start(),
                enhancer.end(),
                enhancer.ID()
            ]
            geneTable.append(newLine)
            if tf_status == 1:
                newLine = [
                    gene,
                    enhancer.chr(),
                    enhancer.start(),
                    enhancer.end(),
                    enhancer.ID()
                ]
                geneTFTable.append(newLine)

    return geneTable, geneTFTable, enhancerTable, enhancerTFTable, geneSummaryTable, candidate_tf_list, gene_to_enhancer_dict
Beispiel #27
0
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    # internal parameter for debugging
    byRefseq = False

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(utils.makeTSSLocus(geneID, startDict, 0, 0))

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really
    # matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    geneDict = {'overlapping': defaultdict(
        list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # find the damn header
    for line in enhancerTable:
        if line[0][0] == '#':
            continue
        else:
            header = line
            break

    if noFormatTable:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]

    else:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]

        # next by gene
        geneToEnhancerTable = [
            ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']]

    # next make the gene to enhancer table
    geneToEnhancerTable = [
        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])

        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(
            enhancerLocus, 'both')
        overlappingGenes = []
        for overlapLocus in overlappingLoci:
            overlappingGenes.append(overlapLocus.ID())

        # proximalGenes are transcribed genes where the tss is within 50kb of
        # the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = []
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())

        distalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = []
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes
        # these checks make sure each gene list is unique.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the 50kb window, but we'll let that slide
        # here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[
                allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        if noFormatTable:

            newEnhancerLine = list(line)
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:9]
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)
        # Now grab all overlapping and proximal genes for the gene ordered
        # table

        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

    # End loop through
    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    #get the chromLists from the various bams here
    cmd = 'samtools idxstats %s' % (rankByBamFile)
    idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True)
    idxStats= idxStats.communicate()
    bamChromList = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]]
    
    if len(controlBamFile) > 0:
        cmd = 'samtools idxstats %s' % (controlBamFile)
        idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True)
        idxStats= idxStats.communicate()
        bamChromListControl = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]]
        bamChromList = [chrom for chrom in bamChromList if bamChromListControl.count(chrom) != 0]



    #now make sure no genes have a bad chrom 
    overallGeneList = [gene for gene in overallGeneList if bamChromList.count(startDict[gene]['chr']) != 0]

    
    #now make an enhancer collection of all transcripts    
    enhancerGeneCollection = utils.makeTranscriptCollection(
        annotFile, 5000, 5000, 500, overallGeneList)

    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName,gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamToGFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamliquidator_path = 'bamliquidator_batch'


    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    bamName = rankByBamFile.split('/')[-1]
    mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName,gffRootName, bamName)
    mappedRankByFile = "%s%s_%s_%s/matrix.txt" % (enhancerFolder,enhancerName, gffRootName, bamName)
    cmd = bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedRankByFolder,rankByBamFile)
    print("Mapping rankby bam %s" % (rankByBamFile))
    print(cmd)
    os.system(cmd)

    #check for completion
    if utils.checkOutput(mappedRankByFile,0.2,5):
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
        sys.exit()

    # next on the control bam if it exists
    if len(controlBamFile) > 0:
        controlName = controlBamFile.split('/')[-1]
        mappedControlFolder = "%s%s_%s_%s/" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        mappedControlFile = "%s%s_%s_%s/matrix.txt" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        cmd = bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedControlFolder,controlBamFile)
        print("Mapping control bam %s" % (controlBamFile))
        print(cmd)
        os.system(cmd)

        #check for completion
        if utils.checkOutput(mappedControlFile,0.2,5):
            print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
        else:
            print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
            sys.exit()

    # now get the appropriate output files
    if len(controlBamFile) > 0:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" %
              (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()

    # use enhancer rank to order

    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = []

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if usedNames.count(geneName) > 0 and uniqueGenes == True:
            continue
        else:
            usedNames.append(geneName)

        proxEnhancers = geneDict['overlapping'][
            refID] + geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = join([str(x) for x in rankDict[refID]], ',')

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        newLine = [geneName, refID, join(
            proxEnhancers, ','), enhancerRanks, superStatus, enhancerSignal]
        geneToEnhancerTable.append(newLine)
    #utils.unParseTable(geneToEnhancerTable,'/grail/projects/newRose/geneMapper/foo.txt','\t')
    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [
            'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:

        geneList = []
        if noFormatTable:
            geneList += line[-3].split(',')
            geneList += line[-2].split(',')

        else:
            geneList += line[10].split(',')
            geneList += line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])
        if len(geneList) > 0:
            try:
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                maxGene = geneList[maxIndex]
                maxSig = sigVector[maxIndex]
                if maxSig == 0.0:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
            except ValueError:
                if len(geneList) == 1:
                    maxGene = geneList[0]
                    maxSig = 'NONE'    
                else:
                    maxGene = 'NONE'
                    maxSig = 'NONE'    
        else:
            maxGene = 'NONE'
            maxSig = 'NONE'
        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable
    else:
        enhancerOrder = utils.order([int(line[-2])
                                    for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        sortedTopGeneTable = [enhancerToTopGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i + 1)])
            sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

        return sortedTable, sortedTopGeneTable, geneToEnhancerTable
Beispiel #28
0
def make_mycn_stats_table(nb_all_chip_dataFile,outFile):

    '''
    making a table of conserved mycn peaks w/ some additional stats
    mycn and h3k27ac signal is avg. background normalized across 4 samples
    active tss defined as the union of all H3K27ac occupied promoters in NB
    active enhancers defined as the union of all H3K27ac sites outside of promoters
    '''
    dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)

    print('SETTING UP OUTPUT TABLE')
    outTable = [['PEAK_ID','CHROM','START','STOP','LENGTH','ACTIVE_TSS_OVERLAP','ENHANCER_OVERLAP','CPG_ISLAND_OVERLAP','CPG_ISLAND_FRACTION','GC_FREQ','MYCN_RANK','AVG_MYCN_SIGNAL','AVG_H3K27AC_SIGNAL','CANON_EBOX_COUNT','NONCANON_EBOX_COUNT','TOTAL_EBOX_COUNT','CANON_EXP','NON_CANON_EXP','GABPA_COUNT','GABPA_EXP','GATA_COUNT','GATA_EXP']]

    dinuc = nmers(2,['A','T','G','C'])

    #input files
    mycnSignalFile = '%sHG19_NB_MYCN_CONSERVED_-0_+0_NB_ALL_SIGNAL.txt' % (signalFolder)
    h3k27acSignalFile = '%sHG19_NB_MYCN_CONSERVED_-500_+500_NB_ALL_SIGNAL.txt' % (signalFolder)
    mycnRankFile = '%smeta_rose/NB_MYCN/NB_MYCN_0KB_STITCHED_ENHANCER_REGION_RANK_CONSERVED.txt' % (projectFolder)
    activeGeneFile = '%sHG19_NB_H3K27AC_ACTIVE_UNION.txt' % (geneListFolder)
    #note, this is the ucsc hg19 cpg islands extended file
    #to download and format run ./beds/download_cpg.sh
    cpgFile = '%sbeds/hg19_cpg_islands.bed' % (projectFolder)
    enhancerFile = '%smeta_rose/NB_H3K27AC/NB_H3K27AC_AllEnhancers.table.txt' % (projectFolder)

    print('LOADING MYCN BINDING DATA')
    mycnSignalTable = utils.parseTable(mycnSignalFile,'\t')

    #making a signal dictionary for MYCN binding
    names_list = ['BE2C_MYCN','KELLY_MYCN','NGP_MYCN','SHEP21_0HR_MYCN_NOSPIKE']
    background_list = [dataDict[name]['background'] for name in names_list]
    header = mycnSignalTable[0]
    chip_columns = [header.index(name) for name in names_list]
    background_columns = [header.index(background_name) for background_name in background_list]
    
    mycn_sig_dict = {}
    #this only works if the first column are unique identifiers
    if len(mycnSignalTable) != len(utils.uniquify([line[0] for line in mycnSignalTable])):
        print('Error: Column 1 of must contain unique identifiers.' % (mycnSignalFile))
        sys.exit()
    for line in mycnSignalTable[1:]:
        line_sig = []
        for i in range(len(names_list)):
            line_sig.append(float(line[chip_columns[i]]) - float(line[background_columns[i]]))
        mycn_sig_dict[line[0]] = numpy.mean(line_sig)


    
    print('LOADING MYCN RANK DATA')
    mycnRankTable = utils.parseTable(mycnRankFile,'\t')

    print('LOADING H3K27AC BINDING DATA')
    h3k27acSignalTable = utils.parseTable(h3k27acSignalFile,'\t')
    #making a signal dictionary for background subtracted H3K27ac binding
    names_list = ['BE2C_H3K27AC','KELLY_H3K27AC','NGP_H3K27AC','SHEP21_0HR_H3K27AC_NOSPIKE']
    background_list = [dataDict[name]['background'] for name in names_list]
    header = h3k27acSignalTable[0]
    chip_columns = [header.index(name) for name in names_list]
    background_columns = [header.index(background_name) for background_name in background_list]
    
    h3k27ac_sig_dict = {}
    #this only works if the first column are unique identifiers
    if len(h3k27acSignalTable) != len(utils.uniquify([line[0] for line in h3k27acSignalTable])):
        print('Error: Column 1 of must contain unique identifiers.' % (h3k27acSignalFile))
        sys.exit()
    for line in h3k27acSignalTable[1:]:
        line_sig = []
        for i in range(len(names_list)):
            line_sig.append(float(line[chip_columns[i]]) - float(line[background_columns[i]]))
        h3k27ac_sig_dict[line[0]] = numpy.mean(line_sig)



    #making the cpg collection
    print('LOADING CPGS ISLANDS')
    cpgBed = utils.parseTable(cpgFile,'\t')
    cpgLoci = []
    for line in cpgBed:
        cpgLoci.append(utils.Locus(line[0],line[1],line[2],'.',line[-1]))
    cpgCollection = utils.LocusCollection(cpgLoci,50)
        
    #next make the tss collection of active promoters
    print('LOADING ACTIVE PROMOTERS')
    startDict = utils.makeStartDict(annotFile)
    activeTable = utils.parseTable(activeGeneFile,'\t')
    tss_1kb_loci = []
    for line in activeTable:
        tss_1kb_loci.append(utils.makeTSSLocus(line[1],startDict,1000,1000))
    tss_1kb_collection = utils.LocusCollection(tss_1kb_loci,50)


    #enhancer file
    print("LOADING ACTIVE ENHANCERS")
    enhancerTable = utils.parseTable(enhancerFile,'\t')
    print('STARTING WITH THE FOLLOWING NUMBER OF ENHANCERS IN NB')
    print(len(enhancerTable) - 6)
    enhancerLoci = []
    for line in enhancerTable:
        if line[0][0] != '#' and line[0][0] != 'R':
            try:
                lineLocus = utils.Locus(line[1],int(line[2]),int(line[3]),'.',line[0])
                enhancerLoci.append(lineLocus)
            except IndexError:
                print(line)
                sys.exit()
    enhancerCollection = utils.LocusCollection(enhancerLoci,50)

    print('CLASSIFYING MYCN PEAKS')
    ticker = 0
    for i in range(1,len(mycnSignalTable)):
        if ticker%100 == 0:
            print(ticker)
        ticker +=1

        line = mycnSignalTable[i]        

        mycn_signal = round(mycn_sig_dict[line[0]],4)
        h3k27ac_signal = round(h3k27ac_sig_dict[line[0]],4)
        
        peakID = line[0]
        locusString = line[1]
        chrom = locusString.split('(')[0]
        [start,stop] = [int(x) for x in line[1].split(':')[-1].split('-')]
        lineLocus = utils.Locus(chrom,start,stop,'.',peakID)
        
        tssOverlap = 0
        if tss_1kb_collection.getOverlap(lineLocus,'both'):
            tssOverlap = 1

        enhancerOverlap = 0
        if enhancerCollection.getOverlap(lineLocus,'both') and tssOverlap == 0:
            enhancerOverlap = 1

        cpgIslandOverlap = 0
        if cpgCollection.getOverlap(lineLocus,'both'):
            cpgIslandOverlap = 1

        #now do fractional cpgOverlap
        overlappingCpGLoci = cpgCollection.getOverlap(lineLocus,'both')
        overlappingBases = 0
        for locus in overlappingCpGLoci:
            cpgStart = max(locus.start(),lineLocus.start())
            cpgEnd = min(locus.end(),lineLocus.end())
            overlappingBases += (cpgEnd-cpgStart)
        overlapFraction = round(float(overlappingBases)/lineLocus.len(),2)
        
        #now get the seq
        lineSeq = string.upper(utils.fetchSeq(genomeDirectory,chrom,start,stop,True))
        gcFreq = round(float(lineSeq.count('GC') + lineSeq.count('CG'))/len(lineSeq),2)
            
        dinuc_dict = {}
        for nmer in dinuc:
            dinuc_dict[nmer] = float(lineSeq.count('GC'))/len(lineSeq)

        
        mycnRankLine = mycnRankTable[i]
        mycnRank = numpy.mean([float(x) for x in mycnRankLine[6:]])

        canonMatchList = re.findall('CACGTG',lineSeq)
        canon_count = len(canonMatchList)

        eboxMatchList = re.findall('CA..TG',lineSeq)
        ebox_count = len(eboxMatchList)

        non_canon_count = ebox_count-canon_count

        #get the expected values
        canon_exp = dinuc_dict['CA']*dinuc_dict['CG']*dinuc_dict['TG']*(len(lineSeq) - 5)
        canon_exp = round(canon_exp,2)
        notCG = 1- dinuc_dict['CG']
        non_exp = dinuc_dict['CA']*notCG*dinuc_dict['TG']*(len(lineSeq) - 5)
        non_exp = round(non_exp,2)



        #for gata and GABPA
        gabpaMatchList = re.findall('CGGAAG',lineSeq) + re.findall('CTTCCG',lineSeq)
        gabpa_count = len(gabpaMatchList)

        gabpa_exp_f = dinuc_dict['CG'] * dinuc_dict['GA'] * dinuc_dict['AG']*(len(lineSeq) - 5)
        gabpa_exp_r = dinuc_dict['CT'] * dinuc_dict['TC'] * dinuc_dict['CG']*(len(lineSeq) - 5)
        
        gabpa_exp = round(gabpa_exp_f,2) + round(gabpa_exp_r,2)

        gataMatchList = re.findall('GATAA',lineSeq) + re.findall('TTATC',lineSeq)
        gata_count = len(gataMatchList)

        an_freq = 1 - dinuc_dict['AA'] - dinuc_dict['AT'] - dinuc_dict['AG'] -dinuc_dict['AC']
        cn_freq = 1 - dinuc_dict['CA'] - dinuc_dict['CT'] - dinuc_dict['CG'] -dinuc_dict['CC']
        gata_exp_f = dinuc_dict['GA'] * dinuc_dict['TA'] * an_freq*(len(lineSeq) - 5)
        gata_exp_r = dinuc_dict['TT'] * dinuc_dict['AT'] * cn_freq*(len(lineSeq) - 5)
        gata_exp = round(gata_exp_f,2) + round(gata_exp_r,2)

        
        

        newLine = [peakID,chrom,start,stop,lineLocus.len(),tssOverlap,enhancerOverlap,cpgIslandOverlap,overlapFraction,gcFreq,mycnRank,mycn_signal,h3k27ac_signal,canon_count,non_canon_count,ebox_count,canon_exp,non_exp,gabpa_count,gabpa_exp,gata_count,gata_exp]
        outTable.append(newLine)

    utils.unParseTable(outTable,outFile,'\t')
    
    return outFile
Beispiel #29
0
def makePeakTable(paramDict,
                  splitGFFPath,
                  averageTablePath,
                  startDict,
                  geneList,
                  genomeDirectory,
                  tads_path=''):
    '''
    makes the final peak table with ebox info
    '''

    peakTable = [[
        'REGION_ID', 'CHROM', 'START', 'STOP', 'LENGTH', 'TSS', 'CPG',
        'CPG_FRACTION', 'GC_FREQ', 'SIGNAL', 'CANON_EBOX_COUNT',
        'NON_CANON_EBOX_COUNT', 'TOTAL_EBOX_COUNT', 'OVERLAPPING_GENES',
        'PROXIMAL_GENES'
    ]]

    print('LOADING PEAK REGIONS')
    peakGFF = utils.parseTable(splitGFFPath, '\t')

    print('LOADING BINDING DATA')
    signalTable = utils.parseTable(averageTablePath, '\t')

    print('LOADING CPGS ISLANDS')
    cpgBed = utils.parseTable(paramDict['cpgPath'], '\t')
    cpgLoci = []
    for line in cpgBed:
        cpgLoci.append(utils.Locus(line[0], line[1], line[2], '.', line[-1]))
    cpgCollection = utils.LocusCollection(cpgLoci, 50)

    print("MAKING TSS COLLECTIONS")
    if len(geneList) == 0:
        geneList = startDict.keys()

    tss_1kb_loci = []
    tss_50kb_loci = []
    for refID in geneList:
        tss_1kb_loci.append(utils.makeTSSLocus(refID, startDict, 1000, 1000))
        tss_50kb_loci.append(utils.makeTSSLocus(refID, startDict, 50000,
                                                50000))

    #make a 1kb flanking and 50kb flanking collection
    tss_1kb_collection = utils.LocusCollection(tss_1kb_loci, 50)
    tss_50kb_collection = utils.LocusCollection(tss_50kb_loci, 50)

    if len(tads_path) > 0:
        print('LOADING TADS FROM %s' % (tads_path))
        tad_collection = utils.importBoundRegion(tads_path, 'tad')
        use_tads = True

        #building a tad dict keyed by tad ID w/ genes in that tad provided
        tad_dict = defaultdict(list)
        for tss_locus in tss_1kb_loci:
            overlapping_tads = tad_collection.getOverlap(tss_locus, 'both')
            for tad_locus in overlapping_tads:
                tad_dict[tad_locus.ID()].append(tss_locus.ID())

    else:
        use_tads = False

    print('CLASSIFYING PEAKS')
    ticker = 0

    no_tad_count = 0
    for i in range(len(peakGFF)):
        if ticker % 1000 == 0:
            print(ticker)
        ticker += 1

        #getting the particulars of the region
        gffLine = peakGFF[i]
        peakID = gffLine[1]
        chrom = gffLine[0]
        start = int(gffLine[3])
        stop = int(gffLine[4])
        lineLocus = utils.Locus(chrom, start, stop, '.', peakID)

        #getting the mapped signal
        signalLine = signalTable[(i + 1)]
        signalVector = [float(x) for x in signalLine[2:]]

        #setting up the new line
        newLine = [peakID, chrom, start, stop, lineLocus.len()]

        #get the tss status from the gff itself (we are able to do this nicely from the split gff code earlier
        newLine.append(gffLine[7])

        #check cpg status
        if cpgCollection.getOverlap(lineLocus, 'both'):
            newLine.append(1)
        else:
            newLine.append(0)

        #now do fractional cpgOverlap
        overlappingCpGLoci = cpgCollection.getOverlap(lineLocus, 'both')
        overlappingBases = 0
        for locus in overlappingCpGLoci:
            cpgStart = max(locus.start(), lineLocus.start())
            cpgEnd = min(locus.end(), lineLocus.end())
            overlappingBases += (cpgEnd - cpgStart)
        overlapFraction = float(overlappingBases) / lineLocus.len()

        newLine.append(round(overlapFraction, 2))

        #now get the seq
        lineSeq = string.upper(
            utils.fetchSeq(genomeDirectory, chrom, start, stop, True))
        if len(lineSeq) == 0:
            print('UH OH')
            print(lineSeq)
            print(gffLine)
            print(i)
            print(chrom)
            print(start)
            print(stop)
            sys.exit()

        gcFreq = float(lineSeq.count('GC') +
                       lineSeq.count('CG')) / len(lineSeq)
        newLine.append(gcFreq)

        #this is where we add the ChIP-Seq signal
        newLine += signalVector

        eboxMatchList = re.findall('CA..TG', lineSeq)
        if len(eboxMatchList) == 0:
            newLine += [0] * 3
        else:
            totalCount = len(eboxMatchList)
            canonCount = eboxMatchList.count('CACGTG')
            otherCount = totalCount - canonCount
            newLine += [canonCount, otherCount, totalCount]

        #now find the overlapping and proximal genes
        #here each overlapping gene the tss 1kb locus overlaps the peak

        if use_tads:

            tad_loci = tad_collection.getOverlap(lineLocus, 'both')

            tad_id_list = [tad_locus.ID() for tad_locus in tad_loci]
            tad_genes = []
            for tad_id in tad_id_list:
                tad_genes += tad_dict[tad_id]
            if len(tad_genes) == 0:
                #print('no tad for this region')
                #print(gffLine)
                no_tad_count += 1
        else:
            tad_genes = []

        if len(tad_genes) > 0:
            overlappingGenes = [
                startDict[locus.ID()]['name']
                for locus in tss_1kb_collection.getOverlap(lineLocus, 'both')
                if tad_genes.count(locus.ID()) > 0
            ]
            proximalGenes = [
                startDict[locus.ID()]['name']
                for locus in tss_50kb_collection.getOverlap(lineLocus, 'both')
                if tad_genes.count(locus.ID()) > 0
            ]
            # print('linked peak to tad genes')
            # print([startDict[x]['name'] for x in tad_genes])
            # print(tad_id_list)
            # print(gffLine)
            # print(overlappingGenes)
            # print(proximalGenes)
        else:
            overlappingGenes = [
                startDict[locus.ID()]['name']
                for locus in tss_1kb_collection.getOverlap(lineLocus, 'both')
            ]
            proximalGenes = [
                startDict[locus.ID()]['name']
                for locus in tss_50kb_collection.getOverlap(lineLocus, 'both')
            ]

        overlappingGenes = utils.uniquify(overlappingGenes)
        #here the tss 50kb locus overlaps the peak
        #overlap takes priority over proximal
        proximalGenes = [
            gene for gene in proximalGenes if overlappingGenes.count(gene) == 0
        ]
        proximalGenes = utils.uniquify(proximalGenes)

        overlappingString = string.join(overlappingGenes, ',')
        proximalString = string.join(proximalGenes, ',')

        newLine += [overlappingString, proximalString]

        peakTable.append(newLine)

    print('Out of %s regions, %s were assigned to at least 1 tad' %
          (len(peakTable), no_tad_count))
    return peakTable
Beispiel #30
0
def findCanidateTFs(annotationFile, enhancerLoci, expressedNM, expressionDictNM,
                    bamFile, TFlist, refseqToNameDict, projectFolder, projectName, promoter):
    '''                                                           
    Assign each Super-Enhancer to the closest active TSS to its center
    Return a dictionary keyed by TF that points to a list of loci
    '''

    print 'FINDING CANIDATE TFs'

    enhancerAssignment = []
    TFtoEnhancerDict = defaultdict(list)

    startDict = utils.makeStartDict(annotationFile)    

    tssLoci = []
    for gene in expressedNM:
        tssLoci.append(utils.makeTSSLocus(gene,startDict,1000,1000))
    tssCollection = utils.LocusCollection(tssLoci,50)    


    # Loop through enhancers
    for enhancer in enhancerLoci:
        

        # If the enhancer overlaps a TSS, save it
        overlappingLoci = tssCollection.getOverlap(enhancer, 'both')
        overlappingGenes =[]
        for overlapLocus in overlappingLoci:
            overlappingGenes.append(overlapLocus.ID())

        # Find all gene TSS within 100 kb
        proximalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,100000,100000),'both')
        proximalGenes =[]
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())
        
        # If no genes are within 100 kb, find the closest active gene
        closestGene = ''
        if len(overlappingGenes) == 0 and len(proximalGenes) == 0:
        
            distalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancer,1000000,1000000),'both')
            distalGenes =[]
            for distalLocus in distalLoci:
                distalGenes.append(distalLocus.ID())

            enhancerCenter = (int(enhancer.start()) + int(enhancer.end())) / 2
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in distalGenes]
            if distList:
                closestGene = distalGenes[distList.index(min(distList))]


        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)
 

        # If a TSS overlaps an enhancer, assign them together
        if overlappingGenes:
            for gene in overlappingGenes:
                if gene in TFlist:
                    TFtoEnhancerDict[gene].append(enhancer)
                    enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()])
                
        # Otherwise, assign the enhancer to the most active gene in 100 kb
        elif not overlappingGenes and proximalGenes:
            highestGene = ''
            highestActivity = 0
            for gene in proximalGenes:
                if expressionDictNM[gene] > highestActivity:
                    highestActivity = expressionDictNM[gene]
                    highestGene = gene
            if highestGene in TFlist:
                TFtoEnhancerDict[gene].append(enhancer)
                enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()])
            
        elif not overlappingGenes and not proximalGenes and closestGene:
            if closestGene in TFlist:
                gene = closestGene
                TFtoEnhancerDict[gene].append(enhancer)
                enhancerAssignment.append([gene, enhancer.chr(), enhancer.start(), enhancer.end(), enhancer.ID()])

    # Add promoter is it's not contained in the super
    if promoter:
        for gene in TFtoEnhancerDict.keys():
            promoter = utils.Locus(startDict[gene]['chr'], int(startDict[gene]['start'][0]) - 2000, 
                                   int(startDict[gene]['start'][0]) + 2000, startDict[gene]['sense'])
            overlapBool = False
            for enhancer in TFtoEnhancerDict[gene]:
                if promoter.overlaps(enhancer):
                    overlapBool = True
            if not overlapBool:
                TFtoEnhancerDict[gene].append(promoter)

    seAssignmentFile = projectFolder + projectName + '_ENHANCER_ASSIGNMENT.txt'
    utils.unParseTable(enhancerAssignment, seAssignmentFile, '\t')

    return TFtoEnhancerDict