Beispiel #1
0
def makePeakGFFs(peak_path_list):

    '''
    makes a stitched gff for all MYC bound TSS and Distal regions across all datasets
    '''

    #setting the output
    tss_gff_path = '%sHG19_MYC_TSS_REGIONS_-0_+0.gff' % (gffFolder)
    distal_gff_path = '%sHG19_MYC_DISTAL_REGIONS_-0_+0.gff' % (gffFolder)

    #check to see if already done
    if utils.checkOutput(tss_gff_path,0.1,0.1) and utils.checkOutput(distal_gff_path,0.1,0.1):
        print('OUTPUT FOUND AT %s and %s' % (tss_gff_path,distal_gff_path))
        return tss_gff_path,distal_gff_path

    #emtpy loci lists to hold everything
    tss_loci = []
    distal_loci = []

    
    for peak_path in peak_path_list:
        print('processing %s' % (peak_path))

        peak_table=  utils.parseTable(peak_path,'\t')

        for line in peak_table[1:]:
            peak_locus = utils.Locus(line[1],line[2],line[3],'.')
            if int(line[5]) == 0:
                distal_loci.append(peak_locus)
            else:
                tss_loci.append(peak_locus)

    #now combind the loci
    print('stitching loci')
    distal_collection = utils.LocusCollection(distal_loci,50)
    tss_collection = utils.LocusCollection(tss_loci,50)

    stitched_distal_collection = distal_collection.stitchCollection()
    stitched_tss_collection = tss_collection.stitchCollection()

    #now make the gffs
    distal_gff= utils.locusCollectionToGFF(distal_collection)
    tss_gff= utils.locusCollectionToGFF(tss_collection)

    #now write to disk
    utils.unParseTable(distal_gff,distal_gff_path,'\t')
    utils.unParseTable(tss_gff,tss_gff_path,'\t')
    
    return tss_gff_path,distal_gff_path
Beispiel #2
0
def mergeCollections(superFile1,superFile2,name1,name2,output=''):

    '''
    merges them collections
    '''

    conSuperCollection = makeSECollection(superFile1,name1)

    tnfSuperCollection = makeSECollection(superFile2,name2)


    #now merge them
    mergedLoci = conSuperCollection.getLoci() + tnfSuperCollection.getLoci()

    mergedCollection = utils.LocusCollection(mergedLoci,50)

    #stitch the collection together
    stitchedCollection = mergedCollection.stitchCollection()

    stitchedLoci = stitchedCollection.getLoci()
    
    #loci that are in both get renamed with a new unique identifier

    renamedLoci =[]
    ticker = 1
    for locus in stitchedLoci:

        if len(conSuperCollection.getOverlap(locus)) > 0 and len(tnfSuperCollection.getOverlap(locus)):

            newID = 'CONSERVED_%s' % (str(ticker))
            ticker +=1
            locus._ID = newID
        else:
            locus._ID = locus.ID()[2:]
        renamedLoci.append(locus)

    #now we turn this into a gff and write it out
    gff = utils.locusCollectionToGFF(utils.LocusCollection(renamedLoci,50))

    if len(output) == 0:
        return gff
    else:
        print "writing merged gff to %s" % (output)
        utils.unParseTable(gff,output,'\t')
        return output
Beispiel #3
0
def mergeCollections(superFile1,superFile2,name1,name2,output=''):

    '''
    merges them collections
    '''

    conSuperCollection = makeSECollection(superFile1,name1)

    tnfSuperCollection = makeSECollection(superFile2,name2)


    #now merge them
    mergedLoci = conSuperCollection.getLoci() + tnfSuperCollection.getLoci()

    mergedCollection = utils.LocusCollection(mergedLoci,50)

    #stitch the collection together
    stitchedCollection = mergedCollection.stitchCollection()

    stitchedLoci = stitchedCollection.getLoci()
    
    #loci that are in both get renamed with a new unique identifier

    renamedLoci =[]
    ticker = 1
    for locus in stitchedLoci:

        if len(conSuperCollection.getOverlap(locus)) > 0 and len(tnfSuperCollection.getOverlap(locus)):

            newID = 'CONSERVED_%s' % (str(ticker))
            ticker +=1
            locus._ID = newID
        else:
            locus._ID = locus.ID()[2:]
        renamedLoci.append(locus)

    #now we turn this into a gff and write it out
    gff = utils.locusCollectionToGFF(utils.LocusCollection(renamedLoci,50))

    if len(output) == 0:
        return gff
    else:
        print "writing merged gff to %s" % (output)
        utils.unParseTable(gff,output,'\t')
        return output
Beispiel #4
0
def calculatePromoterActivity(annotationFile,
                              bamFile,
                              projectName,
                              projectFolder,
                              refseqToNameDict,
                              background=False):
    '''
    calculates the level of acetylation at each TF promoter
    '''

    print 'GENERATING AN ACTIVITY TABLE USING CHIP DATA'

    annotTable = utils.parseTable(annotationFile, '\t')
    output = []
    counter = 0

    bam = utils.Bam(bamFile)

    if background:
        background = utils.Bam(background)

    startDict = utils.makeStartDict(annotationFile)

    tssLoci = []
    for gene in startDict:
        tssLoci.append(utils.makeTSSLocus(gene, startDict, 2500, 2500))
    tssCollection = utils.LocusCollection(tssLoci, 50)

    gff = utils.locusCollectionToGFF(tssCollection)

    outputname = projectFolder + projectName + '_TSS.gff'
    utils.unParseTable(gff, outputname, '\t')

    mappingCmd = 'bamliquidator_batch'
    mappingCmd += ' -r ' + outputname
    mappingCmd += ' -o ' + projectFolder + 'bamliquidator'
    mappingCmd += ' -m -e 200 '
    mappingCmd += bamFile

    subprocess.call(mappingCmd, shell=True)

    print mappingCmd
Beispiel #5
0
def calculatePromoterActivity(annotationFile, bamFile, projectName, projectFolder, refseqToNameDict):
    '''
    calculates the level of H3K27ac at each promoter from a H3K27ac bam file
    '''

    print 'IDENTIFY EXPRESSED GENES'

    annotTable = utils.parseTable(annotationFile, '\t')
    output = []
    counter = 0

    bam = utils.Bam(bamFile)

    startDict = utils.makeStartDict(annotationFile)

    tssLoci = []
    for gene in startDict:
        tssLoci.append(utils.makeTSSLocus(gene,startDict,1000,1000))
    tssCollection = utils.LocusCollection(tssLoci,50)

    gff = utils.locusCollectionToGFF(tssCollection)


    outputname = projectFolder + projectName + '_TSS.gff'
    utils.unParseTable(gff, outputname, '\t')

    # run bamToGFF.py to quantify signal at each TSS +/- 1kb

    mappingCmd = 'python ./bamToGFF.py'
    mappingCmd += ' -r '
    mappingCmd += ' -d '
    mappingCmd += ' -o ' + projectFolder + 'matrix.gff'
    mappingCmd += ' -m 1 -f 0 -e 200 '
    mappingCmd += ' -i ' + projectFolder + projectName + '_TSS.gff'
    mappingCmd += ' -b ' + bamFile

    call(mappingCmd, shell=True)

    print  mappingCmd
Beispiel #6
0
    def merge_regions():
        '''
        merges ha peaks to identify all overlapping peaks
        filters out anything overlapping the HA controls
        '''
        hk_dox_ha_1 = utils.importBoundRegion(
            '%sHK_DOX_HA_1_peaks.bed' % (macsEnrichedFolder), 'HK_DOX_HA_1')
        hk_dox_ha_2 = utils.importBoundRegion(
            '%sHK_DOX_HA_2_peaks.bed' % (macsEnrichedFolder), 'HK_DOX_HA_2')

        hk_dox_loci = hk_dox_ha_1.getLoci() + hk_dox_ha_2.getLoci()

        #control datasets
        hk_ctl_ha_1 = utils.importBoundRegion(
            '%sHK_CTL_HA_1_peaks.bed' % (macsEnrichedFolder), 'HK_CTL_HA_1')
        hk_ctl_ha_2 = utils.importBoundRegion(
            '%sHK_CTL_HA_2_peaks.bed' % (macsEnrichedFolder), 'HK_CTL_HA_2')

        hk_ctl_loci = hk_ctl_ha_1.getLoci() + hk_ctl_ha_2.getLoci()
        hk_ctl_lc = utils.LocusCollection(hk_ctl_loci)

        print(len(hk_dox_loci))
        stitched_lc = utils.LocusCollection(hk_dox_loci).stitchCollection()
        print(len(stitched_lc))
        filtered_loci = []
        for locus in stitched_lc.getLoci():
            if len(hk_dox_ha_1.getOverlap(locus)) > 0 and len(
                    hk_dox_ha_2.getOverlap(locus)) > 0:
                if len(hk_ctl_lc.getOverlap(locus)) == 0:
                    filtered_loci.append(locus)

        print(len(filtered_loci))
        filtered_lc = utils.LocusCollection(filtered_loci)
        gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_0_+0.gff' % (
            gffFolder)
        filtered_gff = utils.locusCollectionToGFF(filtered_lc)
        utils.unParseTable(filtered_gff, gff_path, '\t')
Beispiel #7
0
def main():
    '''
    main run call
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option(
        "-i",
        "--i",
        dest="input",
        nargs=1,
        default=None,
        help=
        "Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers"
    )
    parser.add_option("-r",
                      "--rankby",
                      dest="rankby",
                      nargs=1,
                      default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o",
                      "--out",
                      dest="out",
                      nargs=1,
                      default=None,
                      help="Enter an output folder")
    parser.add_option("-g",
                      "--genome",
                      dest="genome",
                      nargs=1,
                      default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option(
        "-n",
        "--name",
        dest="name",
        nargs=1,
        default=None,
        help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option(
        "-c",
        "--control",
        dest="control",
        nargs=1,
        default=None,
        help=
        "Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam"
    )
    parser.add_option(
        "-s",
        "--stitch",
        dest="stitch",
        nargs=1,
        default='',
        help=
        "Enter a max linking distance for stitching. Default will determine optimal stitching parameter"
    )
    parser.add_option(
        "-t",
        "--tss",
        dest="tss",
        nargs=1,
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option(
        "--mask",
        dest="mask",
        nargs=1,
        default=None,
        help=
        "Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions"
    )

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('hi there')
        parser.print_help()
        exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)

    inputList = [
        inputFile for inputFile in options.input.split(',')
        if len(inputFile) > 1
    ]

    #converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        if inputFile.split('.')[-1] == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][
                0:-4]  #strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        elif options.input.split('.')[-1] == 'gff':
            # COPY THE INPUT GFF TO THE GFF FOLDER

            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1])

        else:
            print(
                'WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT'
            )
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)

    # GETTING THE LIST OF BAMFILES TO PROCESS
    #either same number of bams for rankby and control
    #or only 1 control #or none!
    #bamlist should be all rankby bams followed by control bams

    bamFileList = []
    if options.control:
        controlBamList = [
            bam for bam in options.control.split(',') if len(bam) > 0
        ]
        rankbyBamList = [
            bam for bam in options.rankby.split(',') if len(bam) > 0
        ]

        if len(controlBamList) == len(rankbyBamList):
            #case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            #case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList * len(rankbyBamList)
        else:
            print(
                'ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE'
            )
            sys.exit()
    else:
        bamFileList = [
            bam for bam in options.rankby.split(',') if len(bam) > 0
        ]

    # Stitch parameter
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False

    # GETTING THE GENOME
    genome = string.upper(options.genome)
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE

    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome.upper()]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()

    #FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))

    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    #use a simpler unique region naming system
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0], '\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile, '\t')
            gffCollection = utils.gffToLocusCollection(gff, 50)
            inputLoci += gffCollection.getLoci()

        inputCollection = utils.LocusCollection(inputLoci, 50)
        inputCollection = inputCollection.stitchCollection(
        )  # stitches to produce unique regions

        inputGFF = utils.locusCollectionToGFF(inputCollection)

    formattedGFF = []
    #now number things appropriately
    for i, line in enumerate(inputGFF):

        #use the coordinates to make a new id inputname_chr_sense_start_stop
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName, str(i + 1))  #1 indexing

        newLine = [
            chrom, lineID, lineID,
            min(coords),
            max(coords), '', sense, '', lineID
        ]
        formattedGFF.append(newLine)

    #name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder, string.upper(genome),
                                               inputName)
    utils.unParseTable(formattedGFF, masterGFFFile, '\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile, bamChromList)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        # if it's a bed file
        if maskFile.split('.')[-1].upper() == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskFile.split('.')[-1].upper() == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [
            locus for locus in referenceLoci
            if len(maskCollection.getOverlap(locus, 'both')) == 0
        ]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" %
              (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        referenceCollection, inputName, outFolder, stitchWindow, tssWindow,
        annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))
    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName,
                                                      str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName,
                                                str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName,
                                                     str(stitchWindow / 1000))
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (
            gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (
            inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (
            gffFolder, inputName, str(stitchWindow / 1000))

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.

    bamFileListUnique = list(bamFileList)
    bamFileListUnique = utils.uniquify(bamFileListUnique)
    #prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName,
                                               bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (
            mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" %
                  (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" %
                      (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" %
                      (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection,
                  referenceCollection,
                  bamFileList,
                  mappedFolder,
                  outputFile1,
                  refName=stitchedGFFName)

    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1,
                                       inputName + '_MERGED_SIGNAL',
                                       controlBams=options.control)

    #now try the merging

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    rankbyName = inputName + '_MERGED_SIGNAL'
    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (
        pipeline_dir, outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    print('CALLING GENE MAPPING')

    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)

    #for now don't use ranking bam to call top genes
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (
        pipeline_dir, genome, outFolder, superTableFile)
    print(cmd)
    os.system(cmd)

    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)

    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (
        pipeline_dir, genome, outFolder, stretchTableFile)
    print(cmd)
    os.system(cmd)

    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)

    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (
        pipeline_dir, genome, outFolder, superStretchTableFile)
    os.system(cmd)
Beispiel #8
0
def main():
    '''
    main run call
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option(
        "-i",
        "--i",
        dest="input",
        nargs=1,
        default=None,
        help="Enter a .gff or .bed file of binding sites used to make enhancers"
    )
    parser.add_option("-r",
                      "--rankby",
                      dest="rankby",
                      nargs=1,
                      default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o",
                      "--out",
                      dest="out",
                      nargs=1,
                      default=None,
                      help="Enter an output folder")
    parser.add_option("-g",
                      "--genome",
                      dest="genome",
                      nargs=1,
                      default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option(
        "-b",
        "--bams",
        dest="bams",
        nargs=1,
        default=None,
        help="Enter a comma separated list of additional bam files to map to")
    parser.add_option("-c",
                      "--control",
                      dest="control",
                      nargs=1,
                      default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option(
        "-s",
        "--stitch",
        dest="stitch",
        nargs=1,
        default='',
        help=
        "Enter a max linking distance for stitching. Default will determine optimal stitching parameter"
    )
    parser.add_option(
        "-t",
        "--tss",
        dest="tss",
        nargs=1,
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option(
        "--mask",
        dest="mask",
        nargs=1,
        default=None,
        help=
        "Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions"
    )

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('hi there')
        parser.print_help()
        exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE
    if options.input.split('.')[-1] == 'bed':
        # CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    elif options.input.split('.')[-1] == 'gff':
        # COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    else:
        print(
            'WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT'
        )
        # COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # GETTING THE LIST OF BAMFILES TO PROCESS
    if options.control:
        bamFileList = [options.rankby, options.control]

    else:
        bamFileList = [options.rankby]

    if options.bams:
        bamFileList += options.bams.split(',')
        #bamFileList = utils.uniquify(bamFileList) # makes sad when you have the same control bam over and over again
    # optional args

    # Stitch parameter
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (cwd),
        'RN6': '%s/annotation/rn6_refseq.ucsc' % (cwd),
    }

    annotFile = genomeDict[genome.upper()]

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(inputGFFFile, bamChromList)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        # if it's a bed file
        if maskFile.split('.')[-1].upper() == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskFile.split('.')[-1].upper() == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [
            locus for locus in referenceLoci
            if len(maskCollection.getOverlap(locus, 'both')) == 0
        ]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" %
              (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        referenceCollection, inputName, outFolder, stitchWindow, tssWindow,
        annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)
    # making sure start/stop ordering are correct
    for i in range(len(stitchedGFF)):

        line = stitchedGFF[i]
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitchWindow)
    print(type(stitchWindow))
    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName,
                                                      str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName,
                                                str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName,
                                                     str(stitchWindow / 1000))
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (
            gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (
            inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (
            gffFolder, inputName, str(stitchWindow / 1000))

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamliquidator_path = 'bamliquidator_batch.py'

    bamFileListUnique = list(bamFileList)
    bamFileListUnique = utils.uniquify(bamFileListUnique)
    #prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName,
                                               bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (
            mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" %
                  (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" %
                      (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" %
                      (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection,
                  referenceCollection,
                  bamFileList,
                  mappedFolder,
                  outputFile1,
                  refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    if options.control:

        rankbyName = options.rankby.split('/')[-1]
        controlName = options.control.split('/')[-1]
        cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (
            outFolder, outputFile1, inputName, controlName)

    else:
        rankbyName = options.rankby.split('/')[-1]
        controlName = 'NONE'
        cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (
            outFolder, outputFile1, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    time.sleep(20)
    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % (
            genome, options.rankby, options.control, outFolder, superTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % (
            genome, options.rankby, outFolder, superTableFile)
    os.system(cmd)

    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % (
            genome, options.rankby, options.control, outFolder,
            stretchTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % (
            genome, options.rankby, outFolder, stretchTableFile)
    os.system(cmd)

    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % (
            genome, options.rankby, options.control, outFolder,
            superStretchTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % (
            genome, options.rankby, outFolder, superStretchTableFile)
    os.system(cmd)
Beispiel #9
0
def main():
    '''
    main run call
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('hi there')
        parser.print_help()
        exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)

    inputList = [inputFile for inputFile in  options.input.split(',') if len(inputFile) > 1]

    #converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        if inputFile.split('.')[-1] == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][0:-4] #strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        elif options.input.split('.')[-1] == 'gff':
            # COPY THE INPUT GFF TO THE GFF FOLDER

            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder,inputFile.split('/')[-1])

        else:
            print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder,inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)
                                    

    # GETTING THE LIST OF BAMFILES TO PROCESS
    #either same number of bams for rankby and control 
    #or only 1 control #or none!
    #bamlist should be all rankby bams followed by control bams

    
    bamFileList = []
    if options.control:
        controlBamList = [bam for bam in options.control.split(',') if len(bam) >0]
        rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) >0]

        if len(controlBamList) == len(rankbyBamList):
            #case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            #case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList*len(rankbyBamList)
        else:
            print('ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE')
            sys.exit()
    else:
        bamFileList = [bam for bam in options.rankby.split(',') if len(bam) > 0]




    # Stitch parameter
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False


    # GETTING THE GENOME
    genome = string.upper(options.genome)
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE

    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome.upper()]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()


    #FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))


    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    #use a simpler unique region naming system 
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0],'\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile,'\t')
            gffCollection = utils.gffToLocusCollection(gff,50)
            inputLoci += gffCollection.getLoci()


        inputCollection = utils.LocusCollection(inputLoci,50)
        inputCollection = inputCollection.stitchCollection() # stitches to produce unique regions

        inputGFF = utils.locusCollectionToGFF(inputCollection)

    formattedGFF = []
    #now number things appropriately
    for i,line in enumerate(inputGFF):
        
        #use the coordinates to make a new id inputname_chr_sense_start_stop
        chrom = line[0]
        coords = [int(line[3]) ,int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName,str(i+1)) #1 indexing
        
        newLine = [chrom,lineID,lineID,min(coords),max(coords),'',sense,'',lineID]
        formattedGFF.append(newLine)
        
    #name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder,string.upper(genome),inputName)
    utils.unParseTable(formattedGFF,masterGFFFile,'\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))


    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile,bamChromList)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)
        

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        # if it's a bed file
        if maskFile.split('.')[-1].upper() == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskFile.split('.')[-1].upper() == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))
    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow / 1000))
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (gffFolder, inputName, str(stitchWindow / 1000))

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))



    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.



    bamFileListUnique = list(bamFileList)
    bamFileListUnique = utils.uniquify(bamFileListUnique)
    #prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File,0.2,5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)


    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1,inputName + '_MERGED_SIGNAL',controlBams=options.control)

    #now try the merging

    print('CALLING AND PLOTTING SUPER-ENHANCERS')



    rankbyName = inputName + '_MERGED_SIGNAL'
    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (pipeline_dir,outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)
    

    # calling the gene mapper
    print('CALLING GENE MAPPING')

    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)

    #for now don't use ranking bam to call top genes
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, superTableFile)
    print(cmd)
    os.system(cmd)


    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)
 
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, stretchTableFile)
    print(cmd)
    os.system(cmd)


    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)

    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, superStretchTableFile)
    os.system(cmd)
Beispiel #10
0
def main():
    '''
    main run call
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-b", "--bams", dest="bams", nargs=1, default=None,
                      help="Enter a comma separated list of additional bam files to map to")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('hi there')
        parser.print_help()
        exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE
    if options.input.split('.')[-1] == 'bed':
        # CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    elif options.input.split('.')[-1] == 'gff':
        # COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    else:
        print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
        # COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # GETTING THE LIST OF BAMFILES TO PROCESS
    if options.control:
        bamFileList = [options.rankby, options.control]

    else:
        bamFileList = [options.rankby]

    if options.bams:
        bamFileList += options.bams.split(',')
        bamFileList = utils.uniquify(bamFileList)
    # optional args

    # Stitch parameter
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }

    annotFile = genomeDict[genome.upper()]

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFFFile)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        # if it's a bed file
        if maskFile.split('.')[-1].upper() == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskFile.split('.')[-1].upper() == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(inputGFFFile, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)
    # making sure start/stop ordering are correct
    for i in range(len(stitchedGFF)):

        line = stitchedGFF[i]
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitchWindow)
    print(type(stitchWindow))
    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow / 1000))
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (gffFolder, inputName, str(stitchWindow / 1000))

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    for bamFile in bamFileList:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.gff' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = "python " + bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            output1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE, shell=True)
            output1 = output1.communicate()
            if len(output1[0]) > 0:  # test if mapping worked correctly
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

        # MAPPING TO THE ORIGINAL GFF
        mappedOut2Folder = '%s%s_%s_MAPPED' % (mappedFolder, inputName, bamFileName)
        mappedOut2File = '%s%s_%s_MAPPED/matrix.gff' % (mappedFolder, inputName, bamFileName)
        if utils.checkOutput(mappedOut2File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut2File))
        else:
            cmd2 = "python " + bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (inputGFFFile, mappedOut2Folder, bamFile)
            print(cmd2)

            output2 = subprocess.Popen(cmd2, stdout=subprocess.PIPE, shell=True)
            output2 = output2.communicate()
            if len(output2[0]) > 0:  # test if mapping worked correctly
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (inputGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (inputGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    if options.control:

        rankbyName = options.rankby.split('/')[-1]
        controlName = options.control.split('/')[-1]
        cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (outFolder, outputFile1, inputName, controlName)

    else:
        rankbyName = options.rankby.split('/')[-1]
        controlName = 'NONE'
        cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (outFolder, outputFile1, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    time.sleep(20)
    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, superTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, superTableFile)
    os.system(cmd)


    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, stretchTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, stretchTableFile)
    os.system(cmd)


    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, superStretchTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, superStretchTableFile)
    os.system(cmd)
Beispiel #11
0
def rank_eboxes(nb_all_chip_dataFile,mycn_gff_path,macsFolder,genomeDirectory,window = 100):

    '''
    uses the  conserved MYCN sites and ranks eboxes within them
    by average background subtracted signal
    searches 100bp (window variable)  from mycn summits
    '''
    
    window = int(window)

    #bring in the conserved mycn region
    print('making gff of nb mycn summits')
    nb_mycn_gff = utils.parseTable(mycn_gff_path,'\t')

    nb_mycn_collection = utils.gffToLocusCollection(nb_mycn_gff,50)

    dataDict =pipeline_dfci.loadDataTable(nb_all_chip_dataFile)
    names_list = [name for name in dataDict.keys() if name.count('MYCN') == 1]
    names_list.sort()

    summit_loci = []
    #first makes a gff of all summits +/- 100bp for all nb mycn datasets
    for name in names_list:
        summit_bed_path = '%s%s/%s_summits.bed' % (macsFolder,name,name)
        summit_bed = utils.parseTable(summit_bed_path,'\t')
        for line in summit_bed:
            summit_locus = utils.Locus(line[0],int(line[1])-window,int(line[2])+window,'.',line[3])
            if len(nb_mycn_collection.getOverlap(summit_locus)) > 0:
                summit_loci.append(summit_locus)

    summit_collection =utils.LocusCollection(summit_loci,50)
    summit_merged_collection = summit_collection.stitchCollection()
    
    summit_gff = utils.locusCollectionToGFF(summit_merged_collection)
    summit_gff_path = '%sHG19_NB_MYCN_SUMMITS_-%s_+%s.gff' % (gffFolder,window,window)
    utils.unParseTable(summit_gff,summit_gff_path,'\t')

    #this is borrowed from above and maps chip-seq signal to the gff
    print('mapping to nb mycn summits and making signal dict')
    gffList = [summit_gff_path]
    summit_signal_path = pipeline_dfci.map_regions(nb_all_chip_dataFile,gffList)


    mycnSignalTable = utils.parseTable(summit_signal_path,'\t')

    #making a signal dictionary for MYCN binding
    names_list = ['BE2C_MYCN','KELLY_MYCN','NGP_MYCN','SHEP21_0HR_MYCN_NOSPIKE']
    background_list = [dataDict[name]['background'] for name in names_list]
    header = mycnSignalTable[0]
    chip_columns = [header.index(name) for name in names_list]
    background_columns = [header.index(background_name) for background_name in background_list]
    
    mycn_sig_dict = {}
    for line in mycnSignalTable[1:]:
        line_sig = []
        for i in range(len(names_list)):
            line_sig.append(float(line[chip_columns[i]]) - float(line[background_columns[i]]))
        region_id = line[1]
        coords = [int(x) for x in line[1].split(':')[-1].split('-')]
        line_length = coords[1]-coords[0]
        mycn_sig_dict[region_id] = numpy.mean(line_sig)*line_length

    #now for each region find the eboxes and then add up the signal
    print('making ebox ranking')
    ebox_list = ['CACGTG','CAGTTG','CAAGTG','CAGGTG','CAATTG','CAAATG','CATCTG','CAGCTG','CATGTG','CATATG']
    eboxDict = {}
    for ebox in ebox_list:
        eboxDict[ebox] = []
    ticker = 0
    for line in summit_gff:
        if ticker % 1000 == 0:
            print(ticker)
        ticker+=1

        chrom = line[0]
        sense = '.'

        start = int(line[3])
        end = int(line[4])
        region_id = '%s(%s):%s-%s' % (line[0],line[6],line[3],line[4])
        signal = mycn_sig_dict[region_id]

        sequenceLine = utils.fetchSeq(genomeDirectory,chrom,start,end,True)
        
        motifVector = []
        matches = re.finditer('CA..TG',str.upper(sequenceLine))
        if matches:
            for match in matches:
                motifVector.append(match.group())
        
        #count only 1 of each motif type per line
        #motifVector = utils.uniquify(motifVector)
        for motif in motifVector:
            if ebox_list.count(motif) > 0:
                eboxDict[motif].append(signal)
            else:
                eboxDict[utils.revComp(motif)].append(signal)


    eboxTable =[]
    eboxTableOrdered =[['EBOX','OCCURENCES','AVG_HEIGHT']]
    for ebox in eboxDict.keys():
        newLine = [ebox,len(eboxDict[ebox]),numpy.mean(eboxDict[ebox])]
        eboxTable.append(newLine)


    occurenceOrder = utils.order([line[2] for line in eboxTable],decreasing=True)
    
    for x in occurenceOrder:
        eboxTableOrdered.append(eboxTable[x])
    print(eboxTableOrdered)
    ebox_outfile = '%sHG19_NB_MYCN_CONSERVED_SUMMITS_-%s_+%s_EBOX_RANK.txt' % (tableFolder,window,window)
    utils.unParseTable(eboxTableOrdered,ebox_outfile,'\t')
    return ebox_outfile
Beispiel #12
0
def main():

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#======================I, LOADING DATA ANNOTATION======================'
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible

    #for ChIP-Seq
    #these are the datasets we will use
    pipeline_dfci.summary(shep21_dataFile)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#================II. RUNNING DIFFERENTIAL ROSE ANALYSIS================'
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #use the dynamic rose tools to first map twist1 binding sites
    #and then quantify

    name1 = 'SHEP21_0HR_TWIST'
    name2 = 'SHEP21_24HR_B_TWIST'
    analysis_name = 'SHEP21_TWIST1'
    rank_gff_path = wrapDRose(shep21_dataFile, name1, name2, analysis_name)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#=================III. MAPPING MYCN DATA TO RANK GFF==================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #for shep21 nospike
    gffList = [rank_gff_path]
    dataDict = pipeline_dfci.loadDataTable(shep21_dataFile)
    names_list = [
        name for name in dataDict.keys()
        if name.count('MYCN') == 1 or name.count('INPUT') == 1
        or name.count('TWIST') == 1 and name.count('rep2') == 0
    ]
    print(names_list)
    #map_regions(shep21_dataFile,gffList,names_list)

    gffList = ['%smacsEnriched/SHEP21_0HR_TWIST_peaks.bed' % (projectFolder)]
    #map_regions(shep21_dataFile,gffList,names_list)

    #make a gff of twist and mycn sites at 0hr
    twist_collection = utils.importBoundRegion(
        '%smacsEnriched/SHEP21_0HR_TWIST_peaks.bed' % (projectFolder),
        'SHEP21_0HR_TWIST')

    mycn_collection = utils.importBoundRegion(
        '%smacsEnriched/SHEP21_0HR_MYCN_NOSPIKE_peaks.bed' % (projectFolder),
        'SHEP21_0HR_MYCN_NOSPIKE')

    all_loci = twist_collection.getLoci() + mycn_collection.getLoci()
    all_collection = utils.LocusCollection(all_loci, 50)
    stitched_collection = all_collection.stitchCollection()

    stitched_loci = stitched_collection.getLoci()

    overlap_loci = []
    for locus in stitched_loci:
        if len(twist_collection.getOverlap(locus, 'both')) > 0 and len(
                mycn_collection.getOverlap(locus, 'both')) > 0:
            overlap_loci.append(locus)

    overlap_collection = utils.LocusCollection(overlap_loci, 50)
    overlap_gff = utils.locusCollectionToGFF(overlap_collection)
    overlap_gff_path = '%sHG19_SHEP21_0HR_TWIST_MYCN_INTERSECTION_-0_+0.gff' % (
        gffFolder)
    utils.unParseTable(overlap_gff, overlap_gff_path, '\t')

    gffList = [overlap_gff_path]
    map_regions(shep21_dataFile, gffList, names_list)
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    # internal parameter for debugging
    byRefseq = False

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(utils.makeTSSLocus(geneID, startDict, 0, 0))

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really
    # matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    geneDict = {'overlapping': defaultdict(
        list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # find the damn header
    for line in enhancerTable:
        if line[0][0] == '#':
            continue
        else:
            header = line
            break

    if noFormatTable:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]

    else:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]

        # next by gene
        geneToEnhancerTable = [
            ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']]

    # next make the gene to enhancer table
    geneToEnhancerTable = [
        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])

        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(
            enhancerLocus, 'both')
        overlappingGenes = []
        for overlapLocus in overlappingLoci:
            overlappingGenes.append(overlapLocus.ID())

        # proximalGenes are transcribed genes where the tss is within 50kb of
        # the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = []
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())

        distalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = []
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes
        # these checks make sure each gene list is unique.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the 50kb window, but we'll let that slide
        # here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[
                allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        if noFormatTable:

            newEnhancerLine = list(line)
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:9]
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)
        # Now grab all overlapping and proximal genes for the gene ordered
        # table

        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

    # End loop through
    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    enhancerGeneCollection = utils.makeTranscriptCollection(
        annotFile, 5000, 5000, 500, overallGeneList)

    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName,gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamToGFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    bamName = rankByBamFile.split('/')[-1]
    mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName,gffRootName, bamName)
    mappedRankByFile = "%s%s_%s_%s/matrix.gff" % (enhancerFolder,enhancerName, gffRootName, bamName)
    cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedRankByFolder,rankByBamFile)
    print("Mapping rankby bam %s" % (rankByBamFile))
    print(cmd)

    outputRank = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
    outputRank = outputRank.communicate()
    if len(outputRank[0]) > 0:  # test if mapping worked correctly
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
        sys.exit()

    # next on the control bam if it exists
    if len(controlBamFile) > 0:
        controlName = controlBamFile.split('/')[-1]
        mappedControlFolder = "%s%s_%s_%s/" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        mappedControlFile = "%s%s_%s_%s/matrix.gff" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        cmd = 'python ' + bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedControlFolder,controlBamFile)
        print("Mapping control bam %s" % (controlBamFile))
        print(cmd)
        outputControl = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True)
        outputControl = outputControl.communicate()
        if len(outputControl[0]) > 0:  # test if mapping worked correctly
            print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
        else:
            print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
            sys.exit()

    # now get the appropriate output files
    if len(controlBamFile) > 0:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" %
              (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()

    # use enhancer rank to order

    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = []

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if usedNames.count(geneName) > 0 and uniqueGenes == True:
            continue
        else:
            usedNames.append(geneName)

        proxEnhancers = geneDict['overlapping'][
            refID] + geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = join([str(x) for x in rankDict[refID]], ',')

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        newLine = [geneName, refID, join(
            proxEnhancers, ','), enhancerRanks, superStatus, enhancerSignal]
        geneToEnhancerTable.append(newLine)
    #utils.unParseTable(geneToEnhancerTable,'/grail/projects/newRose/geneMapper/foo.txt','\t')
    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [
            'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:

        geneList = []
        if noFormatTable:
            geneList += line[-3].split(',')
            geneList += line[-2].split(',')

        else:
            geneList += line[10].split(',')
            geneList += line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])
        if len(geneList) > 0:
            try:
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                maxGene = geneList[maxIndex]
                maxSig = sigVector[maxIndex]
                if maxSig == 0.0:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
            except ValueError:
                if len(geneList) == 1:
                    maxGene = geneList[0]
                    maxSig = 'NONE'    
                else:
                    maxGene = 'NONE'
                    maxSig = 'NONE'    
        else:
            maxGene = 'NONE'
            maxSig = 'NONE'
        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable
    else:
        enhancerOrder = utils.order([int(line[-2])
                                    for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        sortedTopGeneTable = [enhancerToTopGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i + 1)])
            sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

        return sortedTable, sortedTopGeneTable, geneToEnhancerTable
Beispiel #14
0
def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False):
    '''
    maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq
    '''
    startDict = utils.makeStartDict(annotFile)
    enhancerName = enhancerFile.split('/')[-1].split('.')[0]
    enhancerTable = utils.parseTable(enhancerFile, '\t')

    # internal parameter for debugging
    byRefseq = False

    if len(transcribedFile) > 0:
        transcribedTable = utils.parseTable(transcribedFile, '\t')
        transcribedGenes = [line[1] for line in transcribedTable]
    else:
        transcribedGenes = startDict.keys()

    print('MAKING TRANSCRIPT COLLECTION')
    transcribedCollection = utils.makeTranscriptCollection(
        annotFile, 0, 0, 500, transcribedGenes)

    print('MAKING TSS COLLECTION')
    tssLoci = []
    for geneID in transcribedGenes:
        tssLoci.append(utils.makeTSSLocus(geneID, startDict, 0, 0))

    # this turns the tssLoci list into a LocusCollection
    # 50 is the internal parameter for LocusCollection and doesn't really
    # matter
    tssCollection = utils.LocusCollection(tssLoci, 50)

    geneDict = {'overlapping': defaultdict(
        list), 'proximal': defaultdict(list)}

    # dictionaries to hold ranks and superstatus of gene nearby enhancers
    rankDict = defaultdict(list)
    superDict = defaultdict(list)

    # list of all genes that appear in this analysis
    overallGeneList = []

    # find the damn header
    for line in enhancerTable:
        if line[0][0] == '#':
            continue
        else:
            header = line
            break

    if noFormatTable:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']]

    else:
        # set up the output tables
        # first by enhancer
        enhancerToGeneTable = [
            header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]]

        # next by gene
        geneToEnhancerTable = [
            ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']]

    # next make the gene to enhancer table
    geneToEnhancerTable = [
        ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']]

    for line in enhancerTable:
        if line[0][0] == '#' or line[0][0] == 'R':
            continue

        enhancerString = '%s:%s-%s' % (line[1], line[2], line[3])

        enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0])

        # overlapping genes are transcribed genes whose transcript is directly
        # in the stitchedLocus
        overlappingLoci = transcribedCollection.getOverlap(
            enhancerLocus, 'both')
        overlappingGenes = []
        for overlapLocus in overlappingLoci:
            overlappingGenes.append(overlapLocus.ID())

        # proximalGenes are transcribed genes where the tss is within 50kb of
        # the boundary of the stitched loci
        proximalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both')
        proximalGenes = []
        for proxLocus in proximalLoci:
            proximalGenes.append(proxLocus.ID())

        distalLoci = tssCollection.getOverlap(
            utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both')
        distalGenes = []
        for proxLocus in distalLoci:
            distalGenes.append(proxLocus.ID())

        overlappingGenes = utils.uniquify(overlappingGenes)
        proximalGenes = utils.uniquify(proximalGenes)
        distalGenes = utils.uniquify(distalGenes)
        allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes
        # these checks make sure each gene list is unique.
        # technically it is possible for a gene to be overlapping, but not proximal since the
        # gene could be longer than the 50kb window, but we'll let that slide
        # here
        for refID in overlappingGenes:
            if proximalGenes.count(refID) == 1:
                proximalGenes.remove(refID)

        for refID in proximalGenes:
            if distalGenes.count(refID) == 1:
                distalGenes.remove(refID)

        # Now find the closest gene
        if len(allEnhancerGenes) == 0:
            closestGene = ''
        else:
            # get enhancerCenter
            enhancerCenter = (int(line[2]) + int(line[3])) / 2

            # get absolute distance to enhancer center
            distList = [abs(enhancerCenter - startDict[geneID]['start'][0])
                        for geneID in allEnhancerGenes]
            # get the ID and convert to name
            closestGene = startDict[
                allEnhancerGenes[distList.index(min(distList))]]['name']

        # NOW WRITE THE ROW FOR THE ENHANCER TABLE
        if noFormatTable:

            newEnhancerLine = list(line)
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)

        else:
            newEnhancerLine = line[0:9]
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ','))
            newEnhancerLine.append(
                join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ','))
            newEnhancerLine.append(closestGene)
            newEnhancerLine += line[-2:]

        enhancerToGeneTable.append(newEnhancerLine)
        # Now grab all overlapping and proximal genes for the gene ordered
        # table

        overallGeneList += overlappingGenes
        for refID in overlappingGenes:
            geneDict['overlapping'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

        overallGeneList += proximalGenes
        for refID in proximalGenes:
            geneDict['proximal'][refID].append(enhancerString)
            rankDict[refID].append(int(line[-2]))
            superDict[refID].append(int(line[-1]))

    # End loop through
    # Make table by gene
    print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION')
    overallGeneList = utils.uniquify(overallGeneList)

    #get the chromLists from the various bams here
    cmd = 'samtools idxstats %s' % (rankByBamFile)
    idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True)
    idxStats= idxStats.communicate()
    bamChromList = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]]
    
    if len(controlBamFile) > 0:
        cmd = 'samtools idxstats %s' % (controlBamFile)
        idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True)
        idxStats= idxStats.communicate()
        bamChromListControl = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]]
        bamChromList = [chrom for chrom in bamChromList if bamChromListControl.count(chrom) != 0]



    #now make sure no genes have a bad chrom 
    overallGeneList = [gene for gene in overallGeneList if bamChromList.count(startDict[gene]['chr']) != 0]

    
    #now make an enhancer collection of all transcripts    
    enhancerGeneCollection = utils.makeTranscriptCollection(
        annotFile, 5000, 5000, 500, overallGeneList)

    enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection)

    # dump the gff to file
    enhancerFolder = utils.getParentFolder(enhancerFile)
    gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome)
    enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName,gffRootName)
    utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t')

    # now we need to run bamToGFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamliquidator_path = 'bamliquidator_batch'


    print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS')
    # map density at genes in the +/- 5kb tss region
    # first on the rankBy bam
    bamName = rankByBamFile.split('/')[-1]
    mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName,gffRootName, bamName)
    mappedRankByFile = "%s%s_%s_%s/matrix.txt" % (enhancerFolder,enhancerName, gffRootName, bamName)
    cmd = bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedRankByFolder,rankByBamFile)
    print("Mapping rankby bam %s" % (rankByBamFile))
    print(cmd)
    os.system(cmd)

    #check for completion
    if utils.checkOutput(mappedRankByFile,0.2,5):
        print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
    else:
        print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile))
        sys.exit()

    # next on the control bam if it exists
    if len(controlBamFile) > 0:
        controlName = controlBamFile.split('/')[-1]
        mappedControlFolder = "%s%s_%s_%s/" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        mappedControlFile = "%s%s_%s_%s/matrix.txt" % (
            enhancerFolder, enhancerName,gffRootName, controlName)
        cmd = bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedControlFolder,controlBamFile)
        print("Mapping control bam %s" % (controlBamFile))
        print(cmd)
        os.system(cmd)

        #check for completion
        if utils.checkOutput(mappedControlFile,0.2,5):
            print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
        else:
            print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile))
            sys.exit()

    # now get the appropriate output files
    if len(controlBamFile) > 0:
        print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" %
              (mappedRankByFile, mappedControlFile))
        if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile, mappedControlFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()
    else:
        print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile))
        if utils.checkOutput(mappedRankByFile, 1, 30):
            print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES')
            signalDict = makeSignalDict(mappedRankByFile)
        else:
            print("NO MAPPING OUTPUT DETECTED")
            sys.exit()

    # use enhancer rank to order

    rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList])

    usedNames = []

    # make a new dict to hold TSS signal by max per geneName
    geneNameSigDict = defaultdict(list)
    print('MAKING GENE TABLE')
    for i in rankOrder:
        refID = overallGeneList[i]
        geneName = startDict[refID]['name']
        if usedNames.count(geneName) > 0 and uniqueGenes == True:
            continue
        else:
            usedNames.append(geneName)

        proxEnhancers = geneDict['overlapping'][
            refID] + geneDict['proximal'][refID]

        superStatus = max(superDict[refID])
        enhancerRanks = join([str(x) for x in rankDict[refID]], ',')

        enhancerSignal = signalDict[refID]
        geneNameSigDict[geneName].append(enhancerSignal)

        newLine = [geneName, refID, join(
            proxEnhancers, ','), enhancerRanks, superStatus, enhancerSignal]
        geneToEnhancerTable.append(newLine)
    #utils.unParseTable(geneToEnhancerTable,'/grail/projects/newRose/geneMapper/foo.txt','\t')
    print('MAKING ENHANCER TO TOP GENE TABLE')

    if noFormatTable:
        enhancerToTopGeneTable = [
            enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']]
    else:
        enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [
            'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]]

    for line in enhancerToGeneTable[1:]:

        geneList = []
        if noFormatTable:
            geneList += line[-3].split(',')
            geneList += line[-2].split(',')

        else:
            geneList += line[10].split(',')
            geneList += line[11].split(',')

        geneList = utils.uniquify([x for x in geneList if len(x) > 0])
        if len(geneList) > 0:
            try:
                sigVector = [max(geneNameSigDict[x]) for x in geneList]
                maxIndex = sigVector.index(max(sigVector))
                maxGene = geneList[maxIndex]
                maxSig = sigVector[maxIndex]
                if maxSig == 0.0:
                    maxGene = 'NONE'
                    maxSig = 'NONE'
            except ValueError:
                if len(geneList) == 1:
                    maxGene = geneList[0]
                    maxSig = 'NONE'    
                else:
                    maxGene = 'NONE'
                    maxSig = 'NONE'    
        else:
            maxGene = 'NONE'
            maxSig = 'NONE'
        if noFormatTable:
            newLine = line + [maxGene, maxSig]
        else:
            newLine = line[0:12] + [maxGene, maxSig] + line[-2:]
        enhancerToTopGeneTable.append(newLine)

    # resort enhancerToGeneTable
    if noFormatTable:
        return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable
    else:
        enhancerOrder = utils.order([int(line[-2])
                                    for line in enhancerToGeneTable[1:]])
        sortedTable = [enhancerToGeneTable[0]]
        sortedTopGeneTable = [enhancerToTopGeneTable[0]]
        for i in enhancerOrder:
            sortedTable.append(enhancerToGeneTable[(i + 1)])
            sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)])

        return sortedTable, sortedTopGeneTable, geneToEnhancerTable
Beispiel #15
0
def main():

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#======================I, LOADING DATA ANNOTATION======================'
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible

    #for ChIP-Seq
    pipeline_dfci.summary(mouse_dataFile)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#==========================II. CALLING MACS============================'
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #running peak finding using macs 1.4.2 on all chip datasets
    #this usually takes ~2-3 hours on a reasonably fast machine
    #a 3 hour time out on this entire operation is set
    #if peak calling takes longer than 3 hours, simply run the script again after completion
    #run_macs(mouse_dataFile)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#=================II. DEFINING ACTIVE GENES IN MOUSE==================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #here we will identify active promoters in various contexts as those with
    #an H3K27AC peak in the +/- 1kb tss region
    #UCSC refseq annotations are used for all genes

    #make_active_gene_lists(mouse_dataFile)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#==================III. CALLING ROSE TO MAP ENHANCERS=================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    # #for SCG_H3K27AC
    # analysisName = 'SCG_H3K27AC'
    # namesList = ['SCG_H3K27Ac']
    # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList)

    # #for CG_H3K27AC
    # analysisName = 'CG_H3K27AC'
    # namesList = ['CG_H3K27Ac']
    # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList)

    # #for GANGLIA_H3K27AC
    # analysisName = 'GANGLIA_H3K27AC'
    # namesList = ['CG_H3K27Ac','SCG_H3K27Ac']
    # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList)

    # #for THMYCN
    # analysisName = 'THMYCN_H3K27AC'
    # namesList = ['THMYCN_139076_H3K27Ac','THMYCN_139423_H3K27Ac','THMYCN1_H3K27Ac']
    # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#=================IV. LIFTING OVER NB CONSERVED REGIONS================'
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    # #liftover a pair of gffs
    # #first convert to bed
    # nb_promoter_gff_path = '%sgff/HG19_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.gff' % (hg19_projectFolder)
    # nb_enhancer_gff_path = '%sgff/HG19_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.gff' % (hg19_projectFolder)

    # nb_promoter_bed_path ='%sbeds/HG19_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.bed' % (hg19_projectFolder)
    # nb_enhancer_bed_path ='%sbeds/HG19_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.bed' % (hg19_projectFolder)

    # nb_promoter_gff = utils.parseTable(nb_promoter_gff_path,'\t')
    # nb_enhancer_gff = utils.parseTable(nb_enhancer_gff_path,'\t')

    # utils.gffToBed(nb_promoter_gff,nb_promoter_bed_path)
    # utils.gffToBed(nb_enhancer_gff,nb_enhancer_bed_path)

    # print('converted NB conserved gffs to beds at %s and %s' % (nb_promoter_bed_path,nb_enhancer_bed_path))

    # #note, now you have to liftover manually to create beds
    # mm9_promoter_bed_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.bed' % (bedFolder)
    # mm9_enhancer_bed_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.bed' % (bedFolder)

    # mm9_promoter_gff_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.gff' % (gffFolder)
    # mm9_enhancer_gff_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.gff' % (gffFolder)

    # utils.bedToGFF(mm9_promoter_bed_path,mm9_promoter_gff_path)
    # utils.bedToGFF(mm9_enhancer_bed_path,mm9_enhancer_gff_path)

    # print('writing mm9 nb mycn sites to %s and %s' % (mm9_promoter_gff_path,mm9_enhancer_gff_path))

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#======================V. MAPPING ENRICHED TO GFFS====================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    # setName = 'THMYCN'
    # gffList = [mm9_promoter_gff_path,mm9_enhancer_gff_path]
    # cellTypeList = ['THMYCN1','THMYCN2','THMYCN','CG','SCG']
    # mapList = ['CG_H3K27Ac',
    #             'SCG_H3K27Ac',
    #             'THMYCN1_H3K27Ac',
    #             'THMYCN_139423_H3K27Ac',
    #             'THMYCN_139076_H3K27Ac',
    #             ]

    # #pipeline_dfci.mapEnrichedToGFF(mouse_dataFile,setName,gffList,cellTypeList,macsEnrichedFolder,mappedEnrichedFolder,macs=True,namesList=mapList,useBackground=True)

    # #summarize info for venn diagrams for each

    # promoter_mapped_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000/MM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_THMYCN.txt' % (mappedEnrichedFolder)
    # promoter_venn_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_VENN.txt' % (tableFolder)
    # summarizeVenn(promoter_mapped_path,group_list = ['CG','THMYCN'],output=promoter_venn_path)

    # enhancer_mapped_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000/MM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_THMYCN.txt' % (mappedEnrichedFolder)
    # enhancer_venn_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_VENN.txt' % (tableFolder)
    # summarizeVenn(enhancer_mapped_path,group_list = ['CG','THMYCN'],output=enhancer_venn_path)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#=====================VI. MAKING MYCN REGIONS GFF======================'
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    dataDict = pipeline_dfci.loadDataTable(mouse_dataFile)
    names_list = [
        'THMYCN2_MYCN',
        'THMYCN_139076_MYCN',
        'THMYCN_139423_MYCN',
    ]

    mycn_loci = []
    for name in names_list:
        mycn_collection = utils.importBoundRegion(
            '%s%s' % (macsEnrichedFolder, dataDict[name]['enrichedMacs']),
            name)
        mycn_loci += mycn_collection.getLoci()

    mycn_collection = utils.LocusCollection(mycn_loci, 50)
    mycn_collection.stitchCollection()
    mycn_gff = utils.locusCollectionToGFF(mycn_collection)
    mycn_gff_path = '%sMM9_THMYCN_MYCN_-0_+0.gff' % (gffFolder)
    utils.unParseTable(mycn_gff, mycn_gff_path, '\t')

    #make collections
    promoter_collection = utils.gffToLocusCollection(
        '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.gff' % (gffFolder))
    enhancer_collection = utils.gffToLocusCollection(
        '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.gff' % (gffFolder))
    #make the overlap table
    overlap_table = [['PROMOTER', 'ENHANCER', 'NONE']]
    promoter_count = 0
    enhancer_count = 0
    none_count = 0
    for line in mycn_gff:
        locus = utils.Locus(line[0],
                            int(line[3]) - 10000,
                            int(line[4]) + 10000, '.')
        if enhancer_collection.getOverlap(locus, 'both'):
            enhancer_count += 1
            continue

        if promoter_collection.getOverlap(locus, 'both'):
            promoter_count += 1
        else:
            none_count += 1

    overlap_table.append([promoter_count, enhancer_count, none_count])
    overlap_table_path = '%sMM9_THMYCN_OVERLAP.txt' % (tableFolder)
    utils.unParseTable(overlap_table, overlap_table_path, '\t')

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#=====================VI. MAPPING GFFS FOR HEATMAP====================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #map_for_heatmap(mouse_dataFile)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#=====================VII. AVERAGING MAPPED SIGNAL====================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    # set_list = ['GANGLIA_H3K27AC','THMYCN_H3K27AC','THMYCN_MYCN']
    # set_names = [
    #     ['CG_H3K27Ac','SCG_H3K27Ac'],
    #     ['THMYCN1_H3K27Ac','THMYCN_139423_H3K27Ac','THMYCN_139076_H3K27Ac'],
    #     ['THMYCN2_MYCN','THMYCN_139076_MYCN','THMYCN_139423_MYCN']
    # ]
    # for i in range(len(set_list)):
    #     setName = set_list[i]
    #     names_list =set_names[i]
    #     print(setName)
    #     print(names_list)
    #     #for promoters
    #     mapped_list = ['%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000/MM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_%s.gff' % (mappedFolder,name) for name in names_list]
    #     output_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000/MM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_%s.gff' % (mappedFolder,setName)
    #     print(output_path)
    #     averagingMappedSignal(mapped_list,output_path,setName)

    #     #for enhancers
    #     mapped_list = ['%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000/MM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_%s.gff' % (mappedFolder,name) for name in names_list]
    #     output_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000/MM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_%s.gff' % (mappedFolder,setName)
    #     print(output_path)
    #     averagingMappedSignal(mapped_list,output_path,setName)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#=====================VIII. MAKING HEATMAPS/METAS======================'
    )
    print(
        '#======================================================================'
    )
    print('\n\n')
Beispiel #16
0
def mergeCollections(enhancerFile1,enhancerFile2,name1,name2,output='',inputGFF=''):

    '''
    merges them collections
    '''
    print(enhancerFile1)
    print(enhancerFile2)
    name1Collection = makeSECollection(enhancerFile1,name1)

    name2Collection = makeSECollection(enhancerFile2,name2)
    print(len(name1Collection))
    print(len(name2Collection))
    print('weeeeeee')

    
    if len(inputGFF) == 0:
        #now merge them
        mergedLoci = name1Collection.getLoci() + name2Collection.getLoci()

        mergedCollection = utils.LocusCollection(mergedLoci,50)

        #stitch the collection together
        stitchedCollection = mergedCollection.stitchCollection()

        stitchedLoci = stitchedCollection.getLoci()
    else:
        locusCollection = utils.gffToLocusCollection(inputGFF)
        stitchedCollection = locusCollection.stitchCollection()
        stitchedLoci = stitchedCollection.getLoci()

    #rename loci by presence in group1 or group2

    renamedLoci =[]
    conserved_ticker = 1
    name1_ticker = 1
    name2_ticker = 1
    for locus in stitchedLoci:

        if len(name1Collection.getOverlap(locus)) > 0 and len(name1Collection.getOverlap(locus)) > 0:

            newID = 'CONSERVED_%s' % (str(conserved_ticker))
            conserved_ticker +=1

        elif len(name1Collection.getOverlap(locus)) > 0 and len(name1Collection.getOverlap(locus)) == 0:
            newID = '%s_%s' % (name1,str(name1_ticker))
            name1_ticker +=1
        else:
            newID = '%s_%s' % (name2,str(name2_ticker))
            name2_ticker +=1
            
        locus._ID = newID
        renamedLoci.append(locus)

    #now we turn this into a gff and write it out
    gff = utils.locusCollectionToGFF(utils.LocusCollection(renamedLoci,50))

    if len(output) == 0:
        return gff
    else:
        print "writing merged gff to %s" % (output)
        utils.unParseTable(gff,output,'\t')
        return output