Example #1
0
def main():
    '''
    main run call
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option(
        "-i",
        "--i",
        dest="input",
        nargs=1,
        default=None,
        help=
        "Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers"
    )
    parser.add_option("-r",
                      "--rankby",
                      dest="rankby",
                      nargs=1,
                      default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o",
                      "--out",
                      dest="out",
                      nargs=1,
                      default=None,
                      help="Enter an output folder")
    parser.add_option("-g",
                      "--genome",
                      dest="genome",
                      nargs=1,
                      default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option(
        "-n",
        "--name",
        dest="name",
        nargs=1,
        default=None,
        help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option(
        "-c",
        "--control",
        dest="control",
        nargs=1,
        default=None,
        help=
        "Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam"
    )
    parser.add_option(
        "-s",
        "--stitch",
        dest="stitch",
        nargs=1,
        default='',
        help=
        "Enter a max linking distance for stitching. Default will determine optimal stitching parameter"
    )
    parser.add_option(
        "-t",
        "--tss",
        dest="tss",
        nargs=1,
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option(
        "--mask",
        dest="mask",
        nargs=1,
        default=None,
        help=
        "Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions"
    )

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('hi there')
        parser.print_help()
        exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)

    inputList = [
        inputFile for inputFile in options.input.split(',')
        if len(inputFile) > 1
    ]

    #converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        if inputFile.split('.')[-1] == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][
                0:-4]  #strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        elif options.input.split('.')[-1] == 'gff':
            # COPY THE INPUT GFF TO THE GFF FOLDER

            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1])

        else:
            print(
                'WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT'
            )
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)

    # GETTING THE LIST OF BAMFILES TO PROCESS
    #either same number of bams for rankby and control
    #or only 1 control #or none!
    #bamlist should be all rankby bams followed by control bams

    bamFileList = []
    if options.control:
        controlBamList = [
            bam for bam in options.control.split(',') if len(bam) > 0
        ]
        rankbyBamList = [
            bam for bam in options.rankby.split(',') if len(bam) > 0
        ]

        if len(controlBamList) == len(rankbyBamList):
            #case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            #case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList * len(rankbyBamList)
        else:
            print(
                'ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE'
            )
            sys.exit()
    else:
        bamFileList = [
            bam for bam in options.rankby.split(',') if len(bam) > 0
        ]

    # Stitch parameter
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False

    # GETTING THE GENOME
    genome = string.upper(options.genome)
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE

    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome.upper()]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()

    #FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))

    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    #use a simpler unique region naming system
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0], '\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile, '\t')
            gffCollection = utils.gffToLocusCollection(gff, 50)
            inputLoci += gffCollection.getLoci()

        inputCollection = utils.LocusCollection(inputLoci, 50)
        inputCollection = inputCollection.stitchCollection(
        )  # stitches to produce unique regions

        inputGFF = utils.locusCollectionToGFF(inputCollection)

    formattedGFF = []
    #now number things appropriately
    for i, line in enumerate(inputGFF):

        #use the coordinates to make a new id inputname_chr_sense_start_stop
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName, str(i + 1))  #1 indexing

        newLine = [
            chrom, lineID, lineID,
            min(coords),
            max(coords), '', sense, '', lineID
        ]
        formattedGFF.append(newLine)

    #name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder, string.upper(genome),
                                               inputName)
    utils.unParseTable(formattedGFF, masterGFFFile, '\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile, bamChromList)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        # if it's a bed file
        if maskFile.split('.')[-1].upper() == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskFile.split('.')[-1].upper() == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [
            locus for locus in referenceLoci
            if len(maskCollection.getOverlap(locus, 'both')) == 0
        ]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" %
              (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        referenceCollection, inputName, outFolder, stitchWindow, tssWindow,
        annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))
    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName,
                                                      str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName,
                                                str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName,
                                                     str(stitchWindow / 1000))
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (
            gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (
            inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (
            gffFolder, inputName, str(stitchWindow / 1000))

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.

    bamFileListUnique = list(bamFileList)
    bamFileListUnique = utils.uniquify(bamFileListUnique)
    #prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName,
                                               bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (
            mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" %
                  (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" %
                      (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" %
                      (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection,
                  referenceCollection,
                  bamFileList,
                  mappedFolder,
                  outputFile1,
                  refName=stitchedGFFName)

    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1,
                                       inputName + '_MERGED_SIGNAL',
                                       controlBams=options.control)

    #now try the merging

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    rankbyName = inputName + '_MERGED_SIGNAL'
    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (
        pipeline_dir, outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    print('CALLING GENE MAPPING')

    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)

    #for now don't use ranking bam to call top genes
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (
        pipeline_dir, genome, outFolder, superTableFile)
    print(cmd)
    os.system(cmd)

    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)

    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (
        pipeline_dir, genome, outFolder, stretchTableFile)
    print(cmd)
    os.system(cmd)

    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)

    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (
        pipeline_dir, genome, outFolder, superStretchTableFile)
    os.system(cmd)
Example #2
0
def main():
    '''
    main run method for enhancer promoter contribution tool
    '''

    parser = argparse.ArgumentParser(usage='%(prog)s [options]')

    # required flags
    parser.add_argument(
        "-b",
        "--bam",
        dest="bam",
        nargs='*',
        help="Enter a space separated list of .bam files for the main factor",
        required=True)
    parser.add_argument("-i",
                        "--input",
                        dest="input",
                        type=str,
                        help="Enter .gff or .bed file of regions to analyze",
                        required=True)
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        type=str,
        help=
        "specify a genome, HG18,HG19,HG38,MM8,MM9,MM10,RN6 are currently supported",
        required=True)

    # output flag
    parser.add_argument("-o",
                        "--output",
                        dest="output",
                        type=str,
                        help="Enter the output folder.",
                        required=True)

    # additional options flags and optional arguments
    parser.add_argument(
        "-a",
        "--activity",
        dest="activity",
        type=str,
        help=
        "specify a table where first column represents a list of active refseq genes",
        required=False)

    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        nargs='*',
        help=
        "Enter a space separated list of .bam files for background. If flagged, will perform background subtraction",
        required=False)
    parser.add_argument(
        "-w",
        "--window",
        dest="window",
        type=int,
        help=
        "Enter a window to define the TSS area +/- the TSS. Default is 1kb",
        required=False,
        default=1000)
    parser.add_argument(
        "--other-bams",
        dest="other",
        nargs='*',
        help="enter a space separated list of other bams to map to",
        required=False)

    parser.add_argument(
        "--name",
        dest="name",
        type=str,
        help=
        "enter a root name for the analysis, otherwise will try to find the name from the input file",
        required=False)

    parser.add_argument(
        "--top",
        dest="top",
        type=int,
        help=
        "Run the analysis on the top N genes by total signal. Default is 5000",
        required=False,
        default=5000)
    parser.add_argument(
        "--tads",
        dest="tads",
        type=str,
        help=
        "Include a .bed of tad regions to restrict enhancer/gene association",
        required=False,
        default=None)

    args = parser.parse_args()

    print(args)

    #minimum arguments needed to proceed
    if args.bam and args.input and args.genome and args.output:

        #=====================================================================================
        #===============================I. PARSING ARGUMENTS==================================
        #=====================================================================================

        print(
            '\n\n#======================================\n#===========I. DATA SUMMARY============\n#======================================\n'
        )

        #top analysis subset
        top = args.top

        #input genome
        genome = args.genome.upper()
        print('PERFORMING ANALYSIS ON %s GENOME BUILD' % (genome))

        #set of bams
        bamFileList = args.bam

        #bring in the input path
        inputPath = args.input

        #try to get the input name or use the name argument
        if args.name:
            analysisName = args.name
        else:
            analysisName = inputPath.split('/')[-1].split('.')[0]

        print('USING %s AS ANALYSIS NAME' % (analysisName))
        #setting up the output folder
        parentFolder = utils.formatFolder(args.output, True)
        outputFolder = utils.formatFolder(
            '%s%s' % (parentFolder, analysisName), True)

        print('WRITING OUTPUT TO %s' % (outputFolder))

        if inputPath.split('.')[-1] == 'bed':
            #type is bed
            print('input in bed format, converting to gff')
            inputGFF = utils.bedToGFF(inputPath)
        else:
            inputGFF = utils.parseTable(inputPath, '\t')

        #the tss window
        window = int(args.window)

        #activity path
        if args.activity:
            activityPath = args.activity
            activityTable = utils.parseTable(activityPath, '\t')

            #try to find the column for refseq id
            for i in range(len(activityTable[0])):
                if str(activityTable[0][i]).count('NM_') > 0 or str(
                        activityTable[0][i]).count('NR_') > 0:
                    ref_col = i

            geneList = [line[ref_col] for line in activityTable
                        ]  # this needs to be REFSEQ NM ID
            print('IDENTIFIED %s ACTIVE GENES' % (len(geneList)))

        else:
            geneList = []

        #check if tads are being invoked
        if args.tads:
            print('LOADING TAD LOCATIONS FROM %s' % (args.tads))
            use_tads = True
            tads_path = args.tads
        else:
            use_tads = False
            tads_path = ''

        print('LOADING ANNOTATION DATA FOR GENOME %s' % (genome))

        #important here to define the window
        startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict = loadAnnotFile(
            genome, window, geneList, True)
        #print(tssCollection.getOverlap(utils.Locus('chr5',171387630,171388066,'.')))
        #sys.exit()

        print('FILTERING THE INPUT GFF FOR GOOD CHROMOSOMES')

        print(chrom_list)
        filtered_gff = [
            line for line in inputGFF if chrom_list.count(line[0]) > 0
        ]

        print('%s of INITIAL %s REGIONS ARE IN GOOD CHROMOSOMES' %
              (len(filtered_gff), len(inputGFF)))

        #=====================================================================================
        #================II. IDENTIFYING TSS PROXIMAL AND DISTAL ELEMENTS=====================
        #=====================================================================================

        print(
            '\n\n#======================================\n#==II. MAPPING TO TSS/DISTAL REGIONS===\n#======================================\n'
        )

        #now we need to split the input region
        print('SPLITTING THE INPUT GFF USING A WINDOW OF %s' % (window))
        splitGFF = splitRegions(filtered_gff, tssCollection)
        print(len(filtered_gff))
        print(len(splitGFF))

        splitGFFPath = '%s%s_SPLIT.gff' % (outputFolder, analysisName)
        utils.unParseTable(splitGFF, splitGFFPath, '\t')
        print('WRITING TSS SPLIT GFF OUT TO %s' % (splitGFFPath))

        #now you have to map the bams to the gff
        print('MAPPING TO THE SPLIT GFF')
        mappedFolder = utils.formatFolder('%sbam_mapping' % (outputFolder),
                                          True)

        signalTable = mapBams(bamFileList, splitGFFPath, analysisName,
                              mappedFolder)
        signalTablePath = '%s%s_signal_table.txt' % (outputFolder,
                                                     analysisName)
        utils.unParseTable(signalTable, signalTablePath, '\t')

        if args.control:
            controlBamFileList = args.control
            controlSignalTable = mapBams(controlBamFileList, splitGFFPath,
                                         analysisName, mappedFolder)
            controlSignalTablePath = '%s%s_control_signal_table.txt' % (
                outputFolder, analysisName)
            utils.unParseTable(controlSignalTable, controlSignalTablePath,
                               '\t')

        #now create the background subtracted summarized average table

        print('CREATING AN AVERAGE SIGNAL TABLE')
        averageTable = makeAverageTable(outputFolder,
                                        analysisName,
                                        useBackground=args.control)
        averageTablePath = '%s%s_average_table.txt' % (outputFolder,
                                                       analysisName)
        utils.unParseTable(averageTable, averageTablePath, '\t')

        #now load up all of the cpg and other parameters to make the actual peak table

        #first check if this has already been done
        peakTablePath = '%s%s_PEAK_TABLE.txt' % (outputFolder, analysisName)
        if utils.checkOutput(peakTablePath, 0.1, 0.1):
            print('PEAK TABLE OUTPUT ALREADY EXISTS')
            peakTable = utils.parseTable(peakTablePath, '\t')
        else:
            peakTable = makePeakTable(paramDict, splitGFFPath,
                                      averageTablePath, startDict, geneList,
                                      genomeDirectory, tads_path)
            utils.unParseTable(peakTable, peakTablePath, '\t')

        geneTable = makeGeneTable(peakTable, analysisName)

        geneTablePath = '%s%s_GENE_TABLE.txt' % (outputFolder, analysisName)
        utils.unParseTable(geneTable, geneTablePath, '\t')

        #if mouse, need to convert genes over
        if genome.count('MM') == 1:
            print('CONVERTING MOUSE NAMES TO HUMAN HOMOLOGS FOR GSEA')
            converted_geneTablePath = '%s%s_GENE_TABLE_CONVERTED.txt' % (
                outputFolder, analysisName)

            converted_geneTable = [geneTable[0]]
            for line in geneTable[1:]:
                converted_name = mouse_convert_dict[line[0]]
                if len(converted_name) > 0:
                    converted_geneTable.append([converted_name] + line[1:])

                    utils.unParseTable(converted_geneTable,
                                       converted_geneTablePath, '\t')

            geneTablePath = converted_geneTablePath
            geneTable = converted_geneTable

        #=====================================================================================
        #===================================III. PLOTTING ====================================
        #=====================================================================================

        print(
            '\n\n#======================================\n#===III. PLOTTING ENHANCER/PROMOTER===\n#======================================\n'
        )

        #if there are fewer genes in the gene table than the top genes, only run on all
        if len(geneTable) < int(top):
            print(
                'WARNING: ONLY %s GENES WITH SIGNAL AT EITHER PROMOTERS OR ENHANCERS. NOT ENOUGH TO RUN ANALYSIS ON TOP %s'
                % (len(geneTable) - 1, top))
            top = 0
            use_top = False
        else:
            use_top = True

        #now call the R code
        print('CALLING R PLOTTING SCRIPTS')
        callRWaterfall(geneTablePath, outputFolder, analysisName, top)

        #=====================================================================================
        #==================================IV. RUNNING GSEA===================================
        #=====================================================================================

        print(
            '\n\n#======================================\n#============IV. RUNNING GSEA=========\n#======================================\n'
        )

        #now let's call gsea
        print('RUNNING GSEA ON C2')
        callGSEA(outputFolder, analysisName, top, 'enhancer_vs_promoter',
                 use_top)
        callGSEA(outputFolder, analysisName, top, 'total_contribution',
                 use_top)

        if use_top:
            print('DETECTING GSEA OUTPUT FOR TOP %s GENES' % (top))
            #for top by enhancer v promoter metric
            top_promoterTablePath, top_distalTablePath = detectGSEAOutput(
                analysisName, outputFolder, top, 'enhancer_vs_promoter')
            top_signalTablePath, top_backgroundTablePath = detectGSEAOutput(
                analysisName, outputFolder, top, 'total_contribution')

            print('MAKING NES PLOTS FOR TOP %s GENES' % (top))
            callR_GSEA(top_promoterTablePath, top_distalTablePath,
                       outputFolder, analysisName + '_enhancer_vs_promoter',
                       top)
            callR_GSEA(top_signalTablePath, top_backgroundTablePath,
                       outputFolder, analysisName + '_total_contribution', top)

        print('DETECTING GSEA OUTPUT FOR ALL GENES')
        #for top
        all_promoterTablePath, all_distalTablePath = detectGSEAOutput(
            analysisName, outputFolder, 'all')

        print('MAKING NES PLOTS FOR ALL GENES')
        callR_GSEA(all_promoterTablePath, all_distalTablePath, outputFolder,
                   analysisName, 'all')

        #these files can be parsed to make the NES plot

        #[x for x in fileList if x.count('report_for') == 1and x.count('xls') ==1]
        print('ALL DONE WITH ANALYSIS FOR %s' % (analysisName))
Example #3
0
def main():

    '''
    main run method for enhancer promoter contribution tool
    '''

    parser = argparse.ArgumentParser(usage='%(prog)s [options]')

    # required flags
    parser.add_argument("-b", "--bam", dest="bam", nargs='*',
                        help="Enter a space separated list of .bam files for the main factor", required=True)
    parser.add_argument("-i", "--input", dest="input", type=str,
                        help="Enter .gff or .bed file of regions to analyze", required=True)
    parser.add_argument("-g", "--genome", dest="genome", type=str,
                        help="specify a genome, HG18,HG19,MM8,MM9,MM10,RN6 are currently supported", required=True)
    # output flag
    parser.add_argument("-o", "--output", dest="output", type=str,
                        help="Enter the output folder.", required=True)


    # additional options flags and optional arguments
    parser.add_argument("-a", "--activity", dest="activity", type=str,
                        help="specify a table where first column represents a list of active refseq genes", required=False)

    parser.add_argument("-c", "--control", dest="control", nargs='*',
                        help="Enter a space separated list of .bam files for background. If flagged, will perform background subtraction", required=False)
    parser.add_argument("-w", "--window", dest="window",type=int,
                        help="Enter a window to define the TSS area +/- the TSS. Default is 1kb", required=False, default=1000)
    parser.add_argument("--other-bams", dest="other", nargs='*',
                        help="enter a space separated list of other bams to map to", required=False)

    parser.add_argument("--name", dest="name", type=str,
                        help="enter a root name for the analysis, otherwise will try to find the name from the input file", required=False)


    parser.add_argument("--top", dest="top", type=int,
                        help="Run the analysis on the top N genes by total signal. Default is 5000", required=False,default=5000)
    parser.add_argument("--tads", dest="tads", type=str,
                        help="Include a .bed of tad regions to restrict enhancer/gene association", required=False,default=None)

    #add by Quanhu Sheng
    parser.add_argument("--genomeDirectory", dest="genomeDirectory", type=str,
                        help="Enter the folder contains chromosome sequence in fasta format", required=True)
    #gseaPath = '/usr/local/bin/gsea/gsea2-2.2.2.jar'
    #gmxPath = '/grail/annotations/gsea/c2.all.v5.1.symbols.gmt' #C2 set
    parser.add_argument("--gseaPath", dest="gseaPath", type=str, help="Enter GSEA jar file location", required=True)
    parser.add_argument("--gmxPath", dest="gmxPath", type=str, help="Enter GSEA gmt file location, such as c2.all.v5.1.symbols.gmt", required=True)
    parser.add_argument("--cpgPath", dest="cpgPath", type=str, help="Enter cpg coordinates in bed format", required=True)

    args = parser.parse_args()

    print(args)

    #minimum arguments needed to proceed
    if args.bam and args.input and args.genome and args.genomeDirectory and args.output and args.gseaPath and args.gmxPath and args.cpgPath:
        #top analysis subset
        top = args.top

        #input genome
        genome = args.genome
        print('PERFORMING ANALYSIS ON %s GENOME BUILD' % (genome))
        
        #set of bams
        bamFileList = args.bam

        #bring in the input path
        inputPath = args.input

        #try to get the input name or use the name argument
        if args.name:
            analysisName = args.name
        else:
            analysisName = inputPath.split('/')[-1].split('.')[0]

        print('USING %s AS ANALYSIS NAME' % (analysisName))
        #setting up the output folder
        parentFolder = utils.formatFolder(args.output,True)
        outputFolder = utils.formatFolder('%s%s' % (parentFolder,analysisName),True)

        print('WRITING OUTPUT TO %s' % (outputFolder))


        if inputPath.split('.')[-1] == 'bed':
            #type is bed
            inputGFF = utils.bedToGFF(inputPath)
        else:
            inputGFF = utils.parseTable(inputPath,'\t')
        
        #the tss window
        window = int(args.window)

        #activity path
        if args.activity:
            activityPath = args.activity
            activityTable = utils.parseTable(activityPath,'\t')
            
            #try to find the column for refseq id
            for i in range(len(activityTable[0])):
                if str(activityTable[0][i]).count('NM_') > 0 or str(activityTable[0][i]).count('NR_') >0:
                    ref_col = i

            geneList = [line[ref_col] for line in activityTable] # this needs to be REFSEQ NM ID
            print('IDENTIFIED %s ACTIVE GENES' % (len(geneList)))
        else:
            geneList = []

        #check if tads are being invoked
        if args.tads:
            print('LOADING TAD LOCATIONS FROM %s' % (args.tads))
            tads_path = args.tads
        else:
            tads_path = ''

        print('LOADING ANNOTATION DATA FOR GENOME %s' % (genome))
        
        genomeDirectory=args.genomeDirectory
        
        #important here to define the window
        startDict,tssCollection,geneList = loadAnnotFile(genome,window,geneList,True)
        print('IDENTIFIED %s valid ACTIVE GENES' % (len(geneList)))
        print(len(startDict))

        #now we need to split the input region 
        print('SPLITTING THE INPUT GFF USING A WINDOW OF %s' % (window))
        splitGFF = splitRegions(inputGFF,tssCollection)
        print(len(inputGFF))
        print(len(splitGFF))

        splitGFFPath = '%s%s_SPLIT.gff' % (outputFolder,analysisName)
        utils.unParseTable(splitGFF,splitGFFPath,'\t')
        print('WRITING TSS SPLIT GFF OUT TO %s' % (splitGFFPath))

        #now you have to map the bams to the gff
        print('MAPPING TO THE SPLIT GFF')
        mappedFolder = utils.formatFolder('%sbam_mapping' % (outputFolder),True)
        
        signalTable = mapBams(bamFileList,splitGFFPath,analysisName,mappedFolder)
        signalTablePath = '%s%s_signal_table.txt' % (outputFolder,analysisName)
        utils.unParseTable(signalTable,signalTablePath,'\t')

        if args.control:
            controlBamFileList = args.control
            controlSignalTable = mapBams(controlBamFileList,splitGFFPath,analysisName,mappedFolder)
            controlSignalTablePath = '%s%s_control_signal_table.txt' % (outputFolder,analysisName)
            utils.unParseTable(controlSignalTable,controlSignalTablePath,'\t')

        #now create the background subtracted summarized average table
        
        print('CREATING AN AVERAGE SIGNAL TABLE')
        averageTable = makeAverageTable(outputFolder,analysisName,useBackground = args.control)
        averageTablePath = '%s%s_average_table.txt' % (outputFolder,analysisName)
        utils.unParseTable(averageTable,averageTablePath,'\t')


        #now load up all of the cpg and other parameters to make the actual peak table

        #first check if this has already been done
        peakTablePath = '%s%s_PEAK_TABLE.txt' % (outputFolder,analysisName)
        if utils.checkOutput(peakTablePath,0.1,0.1):
            print('PEAK TABLE OUTPUT ALREADY EXISTS')
            peakTable = utils.parseTable(peakTablePath,'\t')
        else:
            peakTable = makePeakTable(args.cpgPath,splitGFFPath,averageTablePath,startDict,geneList,genomeDirectory,tads_path)        
            utils.unParseTable(peakTable,peakTablePath,'\t')

        geneTable = makeGeneTable(peakTable,analysisName)        

        geneTablePath = '%s%s_GENE_TABLE.txt' % (outputFolder,analysisName)
        utils.unParseTable(geneTable,geneTablePath,'\t')
        
        if(top > len(geneTable)):
          top = 'all'
        
        #now call the R code
        print('CALLING R PLOTTING SCRIPTS')
        callRWaterfall(geneTablePath,outputFolder,analysisName,top)

        #now let's call gsea
        print('RUNNING GSEA ON C2')
        callGSEA(args.gseaPath, args.gmxPath, outputFolder,analysisName,top)
        
        if top != 'all':
          print('DETECTING GSEA OUTPUT FOR TOP %s GENES' % (top))
          top_promoterTablePath,top_distalTablePath = detectGSEAOutput(analysisName,outputFolder,top)

          print('MAKING NES PLOTS FOR TOP %s GENES' % (top))
          callR_GSEA(top_promoterTablePath,top_distalTablePath,outputFolder,analysisName,top)

        print('DETECTING GSEA OUTPUT FOR ALL GENES')
        top_promoterTablePath,top_distalTablePath = detectGSEAOutput(analysisName,outputFolder,'all')

        print('MAKING NES PLOTS FOR ALL GENES')
        callR_GSEA(top_promoterTablePath,top_distalTablePath,outputFolder,analysisName,'all')


        #these files can be parsed to make the NES plot


        #gsea_report_for_DISTAL_1459192369220.xls
        #[x for x in fileList if x.count('report_for') == 1and x.count('xls') ==1]
        print('ALL DONE WITH ANALYSIS FOR %s' % (analysisName))
Example #4
0
def main():
    '''
    main run call
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option(
        "-i",
        "--i",
        dest="input",
        nargs=1,
        default=None,
        help="Enter a .gff or .bed file of binding sites used to make enhancers"
    )
    parser.add_option("-r",
                      "--rankby",
                      dest="rankby",
                      nargs=1,
                      default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o",
                      "--out",
                      dest="out",
                      nargs=1,
                      default=None,
                      help="Enter an output folder")
    parser.add_option("-g",
                      "--genome",
                      dest="genome",
                      nargs=1,
                      default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option(
        "-b",
        "--bams",
        dest="bams",
        nargs=1,
        default=None,
        help="Enter a comma separated list of additional bam files to map to")
    parser.add_option("-c",
                      "--control",
                      dest="control",
                      nargs=1,
                      default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option(
        "-s",
        "--stitch",
        dest="stitch",
        nargs=1,
        default='',
        help=
        "Enter a max linking distance for stitching. Default will determine optimal stitching parameter"
    )
    parser.add_option(
        "-t",
        "--tss",
        dest="tss",
        nargs=1,
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option(
        "--mask",
        dest="mask",
        nargs=1,
        default=None,
        help=
        "Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions"
    )

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('hi there')
        parser.print_help()
        exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE
    if options.input.split('.')[-1] == 'bed':
        # CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    elif options.input.split('.')[-1] == 'gff':
        # COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    else:
        print(
            'WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT'
        )
        # COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # GETTING THE LIST OF BAMFILES TO PROCESS
    if options.control:
        bamFileList = [options.rankby, options.control]

    else:
        bamFileList = [options.rankby]

    if options.bams:
        bamFileList += options.bams.split(',')
        #bamFileList = utils.uniquify(bamFileList) # makes sad when you have the same control bam over and over again
    # optional args

    # Stitch parameter
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (cwd),
        'RN6': '%s/annotation/rn6_refseq.ucsc' % (cwd),
    }

    annotFile = genomeDict[genome.upper()]

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(inputGFFFile, bamChromList)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        # if it's a bed file
        if maskFile.split('.')[-1].upper() == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskFile.split('.')[-1].upper() == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [
            locus for locus in referenceLoci
            if len(maskCollection.getOverlap(locus, 'both')) == 0
        ]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" %
              (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        referenceCollection, inputName, outFolder, stitchWindow, tssWindow,
        annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)
    # making sure start/stop ordering are correct
    for i in range(len(stitchedGFF)):

        line = stitchedGFF[i]
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitchWindow)
    print(type(stitchWindow))
    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName,
                                                      str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName,
                                                str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName,
                                                     str(stitchWindow / 1000))
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (
            gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (
            inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (
            gffFolder, inputName, str(stitchWindow / 1000))

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamliquidator_path = 'bamliquidator_batch.py'

    bamFileListUnique = list(bamFileList)
    bamFileListUnique = utils.uniquify(bamFileListUnique)
    #prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName,
                                               bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (
            mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" %
                  (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" %
                      (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" %
                      (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection,
                  referenceCollection,
                  bamFileList,
                  mappedFolder,
                  outputFile1,
                  refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    if options.control:

        rankbyName = options.rankby.split('/')[-1]
        controlName = options.control.split('/')[-1]
        cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (
            outFolder, outputFile1, inputName, controlName)

    else:
        rankbyName = options.rankby.split('/')[-1]
        controlName = 'NONE'
        cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (
            outFolder, outputFile1, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    time.sleep(20)
    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % (
            genome, options.rankby, options.control, outFolder, superTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % (
            genome, options.rankby, outFolder, superTableFile)
    os.system(cmd)

    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % (
            genome, options.rankby, options.control, outFolder,
            stretchTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % (
            genome, options.rankby, outFolder, stretchTableFile)
    os.system(cmd)

    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % (
            genome, options.rankby, options.control, outFolder,
            superStretchTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % (
            genome, options.rankby, outFolder, superStretchTableFile)
    os.system(cmd)
Example #5
0
def main():
    '''
    main run call
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('hi there')
        parser.print_help()
        exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)

    inputList = [inputFile for inputFile in  options.input.split(',') if len(inputFile) > 1]

    #converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        if inputFile.split('.')[-1] == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][0:-4] #strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        elif options.input.split('.')[-1] == 'gff':
            # COPY THE INPUT GFF TO THE GFF FOLDER

            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder,inputFile.split('/')[-1])

        else:
            print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder,inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)
                                    

    # GETTING THE LIST OF BAMFILES TO PROCESS
    #either same number of bams for rankby and control 
    #or only 1 control #or none!
    #bamlist should be all rankby bams followed by control bams

    
    bamFileList = []
    if options.control:
        controlBamList = [bam for bam in options.control.split(',') if len(bam) >0]
        rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) >0]

        if len(controlBamList) == len(rankbyBamList):
            #case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            #case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList*len(rankbyBamList)
        else:
            print('ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE')
            sys.exit()
    else:
        bamFileList = [bam for bam in options.rankby.split(',') if len(bam) > 0]




    # Stitch parameter
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False


    # GETTING THE GENOME
    genome = string.upper(options.genome)
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE

    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome.upper()]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()


    #FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))


    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    #use a simpler unique region naming system 
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0],'\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile,'\t')
            gffCollection = utils.gffToLocusCollection(gff,50)
            inputLoci += gffCollection.getLoci()


        inputCollection = utils.LocusCollection(inputLoci,50)
        inputCollection = inputCollection.stitchCollection() # stitches to produce unique regions

        inputGFF = utils.locusCollectionToGFF(inputCollection)

    formattedGFF = []
    #now number things appropriately
    for i,line in enumerate(inputGFF):
        
        #use the coordinates to make a new id inputname_chr_sense_start_stop
        chrom = line[0]
        coords = [int(line[3]) ,int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName,str(i+1)) #1 indexing
        
        newLine = [chrom,lineID,lineID,min(coords),max(coords),'',sense,'',lineID]
        formattedGFF.append(newLine)
        
    #name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder,string.upper(genome),inputName)
    utils.unParseTable(formattedGFF,masterGFFFile,'\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))


    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile,bamChromList)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)
        

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        # if it's a bed file
        if maskFile.split('.')[-1].upper() == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskFile.split('.')[-1].upper() == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))
    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow / 1000))
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (gffFolder, inputName, str(stitchWindow / 1000))

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))



    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.



    bamFileListUnique = list(bamFileList)
    bamFileListUnique = utils.uniquify(bamFileListUnique)
    #prevent redundant mapping
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File,0.2,5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)


    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1,inputName + '_MERGED_SIGNAL',controlBams=options.control)

    #now try the merging

    print('CALLING AND PLOTTING SUPER-ENHANCERS')



    rankbyName = inputName + '_MERGED_SIGNAL'
    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (pipeline_dir,outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)
    

    # calling the gene mapper
    print('CALLING GENE MAPPING')

    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)

    #for now don't use ranking bam to call top genes
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, superTableFile)
    print(cmd)
    os.system(cmd)


    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)
 
    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, stretchTableFile)
    print(cmd)
    os.system(cmd)


    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)

    cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, superStretchTableFile)
    os.system(cmd)
Example #6
0
def make_shep_on_mycn_landscape(shep_on_dataFile):

    '''
    finds mycn peaks in shep21 that are conserved in nb and segregates them into promoter or enhancer
    '''
    dataDict = pipeline_dfci.loadDataTable(shep_on_dataFile)


    print('LOADING SHEP ON MYCN SITES')
    #load all of the shep_on sites
    # shep_on_gff_path = '%smeta_rose/SHEP_ON_MYC/gff/HG19_SHEP_ON_MYC_ALL_-0_+0.gff' % (projectFolder)
    # shep_on_gff = utils.parseTable(shep_on_gff_path,'\t')

    shep_on_bed_path = '%sSHEP_6HR_MYCN_peaks.bed' % (macsEnrichedFolder)
    shep_on_bed = utils.parseTable(shep_on_bed_path,'\t')
    shep_on_gff = utils.bedToGFF(shep_on_bed)
    
    #now get the conserved NB MYCN regions
    nb_conserved_mycn_gff_file = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    nb_conserved_mycn_collection = utils.gffToLocusCollection(nb_conserved_mycn_gff_file)

    print('LOADING SHEP ACTIVE ENHANCERS') 
    #make a collection of enhancers
    shep_enhancer_file = '%smeta_rose/SHEP_ON_H3K27AC/SHEP_ON_H3K27AC_AllEnhancers.table.txt' % (projectFolder)
    shep_enhancer_collection = utils.makeSECollection(shep_enhancer_file,'SHEP_H3K27AC')

    #now get the active promoters
    print('LOADING SHEP ACTIVE PROMOTERS')
    startDict = utils.makeStartDict(annotFile)
    shep_transcribed_file = '%sHG19_SHEP_ON_H3K27AC_ACTIVE.txt' % (geneListFolder)
    shep_transcribed_table = utils.parseTable(shep_transcribed_file,'\t')
    transcribedList = [line[1] for line in shep_transcribed_table]
    tssLoci = []
    for refID in transcribedList:
        tssLoci.append(utils.makeTSSLocus(refID,startDict,1000,1000))

    shep_tss_collection = utils.LocusCollection(tssLoci,50)

    #now initialize the 6 gffs we will need
    shep_mycn_gff = [] 
    shep_mycn_gff_5kb = []
    shep_mycn_gff_1kb = []

    shep_mycn_promoter_gff = []
    shep_mycn_promoter_gff_1kb = []
    shep_mycn_promoter_gff_5kb = []

    shep_mycn_enhancer_gff = []
    shep_mycn_enhancer_gff_1kb = []
    shep_mycn_enhancer_gff_5kb = []

    #and their respective file names
    shep_mycn_gff_file = '%sHG19_SHEP_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    shep_mycn_gff_5kb_file = '%sHG19_SHEP_MYCN_CONSERVED_-5kb_+5kb.gff' % (gffFolder)
    shep_mycn_gff_1kb_file = '%sHG19_SHEP_MYCN_CONSERVED_-1kb_+1kb.gff' % (gffFolder)

    shep_mycn_promoter_gff_file = '%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-0_+0.gff' % (gffFolder)
    shep_mycn_promoter_gff_5kb_file = '%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-5kb_+5kb.gff' % (gffFolder)
    shep_mycn_promoter_gff_1kb_file = '%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-1kb_+1kb.gff' % (gffFolder)

    shep_mycn_enhancer_gff_file = '%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-0_+0.gff' % (gffFolder)
    shep_mycn_enhancer_gff_5kb_file = '%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-5kb_+5kb.gff' % (gffFolder)
    shep_mycn_enhancer_gff_1kb_file = '%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-1kb_+1kb.gff' % (gffFolder)

    print('ITERATING THROUGH SHEP MYCN PEAKS')

    ticker = 0
    enhancer = 0
    promoter = 0 

    other = 0
    for line in shep_on_gff:
        if ticker % 1000 == 0:
            print ticker
        ticker+=1
        peakID = '%s_%s' % ('SHEP_MYCN',str(ticker))

        lineLocus = utils.Locus(line[0],line[3],line[4],'.',peakID)

        if nb_conserved_mycn_collection.getOverlap(lineLocus):
            gffLine = [line[0],peakID,peakID,line[3],line[4],'','.','',peakID]
            peakCenter = (int(line[3]) + int(line[4]))/2
            gffLine_5kb = [line[0],peakID,peakID,peakCenter - 5000,peakCenter + 5000,'','.','',peakID]
            #the 1kb is not a center +/- but a flank
            gffLine_1kb = [line[0],peakID,peakID,int(line[3]) - 1000,int(line[4]) + 1000,'','.','',peakID]

            shep_mycn_gff.append(gffLine)
            shep_mycn_gff_5kb.append(gffLine_5kb)
            shep_mycn_gff_1kb.append(gffLine_1kb)

            #tss overlap should take precedence over enhancer overlap
            if shep_tss_collection.getOverlap(lineLocus,'both'):
                shep_mycn_promoter_gff.append(gffLine)
                shep_mycn_promoter_gff_5kb.append(gffLine_5kb)
                shep_mycn_promoter_gff_1kb.append(gffLine_1kb)
                promoter+=1
            #now check for enhancer overlap
            elif shep_enhancer_collection.getOverlap(lineLocus,'both'):
                shep_mycn_enhancer_gff.append(gffLine)
                shep_mycn_enhancer_gff_5kb.append(gffLine_5kb)
                shep_mycn_enhancer_gff_1kb.append(gffLine_1kb)
                enhancer+=1
            else:
                other+=1
    
    print('Of %s shep on mycn peaks' % (len(shep_on_gff)))
    print('%s are promoter' % (promoter))
    print('%s are enhancer' % (enhancer))
    print('%s are other' % (other))
    #now write out the gffs
    utils.unParseTable(shep_mycn_gff,shep_mycn_gff_file,'\t')
    utils.unParseTable(shep_mycn_gff_5kb,shep_mycn_gff_5kb_file,'\t')
    utils.unParseTable(shep_mycn_gff_1kb,shep_mycn_gff_1kb_file,'\t')

    utils.unParseTable(shep_mycn_promoter_gff,shep_mycn_promoter_gff_file,'\t')
    utils.unParseTable(shep_mycn_promoter_gff_5kb,shep_mycn_promoter_gff_5kb_file,'\t')
    utils.unParseTable(shep_mycn_promoter_gff_1kb,shep_mycn_promoter_gff_1kb_file,'\t')

    utils.unParseTable(shep_mycn_enhancer_gff,shep_mycn_enhancer_gff_file,'\t')
    utils.unParseTable(shep_mycn_enhancer_gff_5kb,shep_mycn_enhancer_gff_5kb_file,'\t')
    utils.unParseTable(shep_mycn_enhancer_gff_1kb,shep_mycn_enhancer_gff_1kb_file,'\t')
    new_table = []
    for line in bed_table[1:]:
        chrom = line[0]
        start = int(line[1]) - 50
        stop = int(line[2]) + 50
        edge = line[3]
        strand = line[4]
        new_line = [chrom, start, stop, edge, strand]
        new_table.append(new_line)
    tmp_path = '%s%s' % (temp_dir, bed)
    if len(new_table) > 0:
        utils.unParseTable(new_table, tmp_path, '\t')
        gff = bed.split('.')[0] + '.gff'
        print(gff)
        gff_path = '%s%s' % (temp_dir, gff)
        utils.bedToGFF(tmp_path, gff_path)

        genome_directory = genome_dir_dict[genome]

        print('gffToFasta Tool running on ' + gff_path + ' for ' + genome)
        fasta = utils.gffToFasta(genome,
                                 genome_directory,
                                 gff_path,
                                 UCSC=True,
                                 useID=False)

        print('Creating density table')
        table = []
        header = ['DENSITY', 'POSITIONS', 'POS_COUNT', 'SUBPEAK_LENGTH']
        table.append(header)
Example #8
0
def main():
    '''
    main run call
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-b", "--bams", dest="bams", nargs=1, default=None,
                      help="Enter a comma separated list of additional bam files to map to")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        print('hi there')
        parser.print_help()
        exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE
    if options.input.split('.')[-1] == 'bed':
        # CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    elif options.input.split('.')[-1] == 'gff':
        # COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    else:
        print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
        # COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # GETTING THE LIST OF BAMFILES TO PROCESS
    if options.control:
        bamFileList = [options.rankby, options.control]

    else:
        bamFileList = [options.rankby]

    if options.bams:
        bamFileList += options.bams.split(',')
        bamFileList = utils.uniquify(bamFileList)
    # optional args

    # Stitch parameter
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    if tssWindow != 0:
        removeTSS = True
    else:
        removeTSS = False

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }

    annotFile = genomeDict[genome.upper()]

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFFFile)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        # if it's a bed file
        if maskFile.split('.')[-1].upper() == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskFile.split('.')[-1].upper() == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(inputGFFFile, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)
    # making sure start/stop ordering are correct
    for i in range(len(stitchedGFF)):

        line = stitchedGFF[i]
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitchWindow)
    print(type(stitchWindow))
    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow / 1000))
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (gffFolder, inputName, str(stitchWindow / 1000))

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    for bamFile in bamFileList:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.gff' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = "python " + bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            output1 = subprocess.Popen(cmd1, stdout=subprocess.PIPE, shell=True)
            output1 = output1.communicate()
            if len(output1[0]) > 0:  # test if mapping worked correctly
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

        # MAPPING TO THE ORIGINAL GFF
        mappedOut2Folder = '%s%s_%s_MAPPED' % (mappedFolder, inputName, bamFileName)
        mappedOut2File = '%s%s_%s_MAPPED/matrix.gff' % (mappedFolder, inputName, bamFileName)
        if utils.checkOutput(mappedOut2File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut2File))
        else:
            cmd2 = "python " + bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (inputGFFFile, mappedOut2Folder, bamFile)
            print(cmd2)

            output2 = subprocess.Popen(cmd2, stdout=subprocess.PIPE, shell=True)
            output2 = output2.communicate()
            if len(output2[0]) > 0:  # test if mapping worked correctly
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (inputGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (inputGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    if options.control:

        rankbyName = options.rankby.split('/')[-1]
        controlName = options.control.split('/')[-1]
        cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (outFolder, outputFile1, inputName, controlName)

    else:
        rankbyName = options.rankby.split('/')[-1]
        controlName = 'NONE'
        cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (outFolder, outputFile1, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    time.sleep(20)
    superTableFile = "%s_SuperEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, superTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, superTableFile)
    os.system(cmd)


    stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, stretchTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, stretchTableFile)
    os.system(cmd)


    superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName)
    if options.control:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, superStretchTableFile)
    else:
        cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, superStretchTableFile)
    os.system(cmd)
Example #9
0
def main():

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#======================I, LOADING DATA ANNOTATION======================'
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible

    #for ChIP-Seq
    pipeline_dfci.summary(mouse_dataFile)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#============II. MAKING A BED OUT OF HG19 FIGURE REGIONS==============='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    hg19_gff_path = '%sgff/HG19_NB_FIGURE_GENES.gff' % (hg19_projectFolder)

    hg19_gff = utils.parseTable(hg19_gff_path, '\t')
    print(hg19_gff)

    hg19_bed = utils.gffToBed(hg19_gff)
    print(hg19_bed)
    hg19_bed_path = '%sbeds/HG19_NB_FIGURE_GENES.bed' % (hg19_projectFolder)
    utils.unParseTable(hg19_bed, hg19_bed_path, '\t')
    #need to manually lift this over to mm9
    #https://genome.ucsc.edu/cgi-bin/hgLiftOver

    mm9_bed_path = '%sMM9_NB_FIGURE_GENES_LIFTOVER.bed' % (bedFolder)
    mm9_gff_path = '%sMM9_NB_FIGURE_GENES_LIFTOVER.gff' % (gffFolder)
    mm9_gff = utils.bedToGFF(mm9_bed_path)

    #now add some additional manual regions

    added_gff_regions = [
        [
            'chr12', 'TWIST1_ENHANCER', 'TWIST1_ENHANCER', 34639818, 34656263,
            '', '-', '', 'TWIST1_ENHANCER'
        ],
        [
            'chr11', 'NPM1_PROMOTER_2', 'NPM1_PROMOTER_2', 33049820, 33065883,
            '', '+', '', 'NPM1_PROMOTER_2'
        ],
        [
            'chr6', 'GATA2_ENHANCER', 'GATA2_ENHANCER', 88135802, 88159867, '',
            '+', '', 'GATA2_ENHANCER'
        ],
        [
            'chr7', 'PHOX2A', 'PHOX2A', 108964211, 108974610, '', '+', '',
            'PHOX2A'
        ],
        [
            'chr15',
            'LET7B',
            'LET7B',
            85497440,
            85538754,
            '',
            '+',
            '',
            'LET7B',
        ],
        [
            'chr10', 'LIN28B', 'LIN28B', 45161233, 45217227, '', '-', '',
            'LIN28B'
        ],
    ]

    mm9_gff_full = mm9_gff + added_gff_regions

    utils.unParseTable(mm9_gff_full, mm9_gff_path, '\t')

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#=======================III. PLOTTING DATA IN MOUSE===================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #plot mouse regions
    plot_mouse_genes(mouse_dataFile, mm9_gff_path)