Python bedToGFF Examples

Programming Language: Python

Namespace/Package Name: utils

Method/Function: bedToGFF

Examples at hotexamples.com: 9

Python bedToGFF - 9 examples found. These are the top-rated real-world Python examples of utils.bedToGFF, extracted from open source projects. You can rate the examples to help us improve their quality.

Example #1

Show file

def main():
    '''
    main run call

    Command-line driver. In order, it:
      1. parses the -i/-r/-o/-g flags (required) and the optional flags,
      2. creates the output folder plus its gff/ and mappedGFF/ subfolders,
      3. converts each .bed input to a GFF (or copies a GFF) into gff/,
      4. assembles the list of bams (rankby bams followed by control bams),
      5. writes a renumbered master GFF of the input regions,
      6. filters regions by the chroms found in the bams, optionally masks
         them, and stitches them with regionStitching,
      7. maps every unique bam onto the stitched GFF with bamliquidator,
      8. builds the region signal map, collapses it, and shells out to the
         ROSE2 R and gene-mapper scripts.

    Exits via sys.exit() when required flags are missing, the control bam
    count is inconsistent, the genome is unsupported, the mask file has an
    unexpected extension, or a bam fails to map.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)

    # required flags
    parser.add_option(
        "-i",
        "--i",
        dest="input",
        nargs=1,
        default=None,
        help=
        "Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers"
    )
    parser.add_option("-r",
                      "--rankby",
                      dest="rankby",
                      nargs=1,
                      default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o",
                      "--out",
                      dest="out",
                      nargs=1,
                      default=None,
                      help="Enter an output folder")
    parser.add_option("-g",
                      "--genome",
                      dest="genome",
                      nargs=1,
                      default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option(
        "-n",
        "--name",
        dest="name",
        nargs=1,
        default=None,
        help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option(
        "-c",
        "--control",
        dest="control",
        nargs=1,
        default=None,
        help=
        "Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam"
    )
    parser.add_option(
        "-s",
        "--stitch",
        dest="stitch",
        nargs=1,
        default='',
        help=
        "Enter a max linking distance for stitching. Default will determine optimal stitching parameter"
    )
    parser.add_option(
        "-t",
        "--tss",
        dest="tss",
        nargs=1,
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option(
        "--mask",
        dest="mask",
        nargs=1,
        default=None,
        help=
        "Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions"
    )

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    # all four required flags must be supplied; otherwise show usage and stop
    if not (options.input and options.rankby and options.out
            and options.genome):
        parser.print_help()
        sys.exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)
    inputList = [
        inputFile for inputFile in options.input.split(',')
        if len(inputFile) > 1
    ]

    # converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        inputBaseName = inputFile.split('/')[-1]
        # the extension is taken from THIS file, not from the whole
        # comma separated -i string, so mixed lists are classified correctly
        inputExtension = inputFile.split('.')[-1]

        if inputExtension == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputBaseName[0:-4]  # strips the trailing '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        else:
            if inputExtension != 'gff':
                print(
                    'WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT'
                )
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder, inputBaseName)

        inputGFFList.append(inputGFFFile)

    # GETTING THE LIST OF BAMFILES TO PROCESS
    # either same number of bams for rankby and control
    # or only 1 control, or none!
    # bamlist should be all rankby bams followed by control bams
    rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) > 0]
    if options.control:
        controlBamList = [
            bam for bam in options.control.split(',') if len(bam) > 0
        ]

        if len(controlBamList) == len(rankbyBamList):
            # case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            # case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList * len(rankbyBamList)
        else:
            print(
                'ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE'
            )
            sys.exit()
    else:
        bamFileList = rankbyBamList

    # Stitch parameter: '' is passed through to regionStitching unchanged
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options: any non-zero window turns TSS exclusion on
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE GENOME
    # str.upper() is used instead of string.upper(), which Python 3 removed
    genome = options.genome.upper()
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()

    # FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        # fall back to the first input file's name up to its first '.'
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))

    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    # use a simpler unique region naming system
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0], '\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile, '\t')
            gffCollection = utils.gffToLocusCollection(gff, 50)
            inputLoci += gffCollection.getLoci()

        inputCollection = utils.LocusCollection(inputLoci, 50)
        # stitches to produce unique regions
        inputCollection = inputCollection.stitchCollection()

        inputGFF = utils.locusCollectionToGFF(inputCollection)

    # now number things appropriately: every region gets the id
    # <inputName>_<1-based row index>
    formattedGFF = []
    for i, line in enumerate(inputGFF):
        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName, str(i + 1))  # 1 indexing

        newLine = [
            chrom, lineID, lineID,
            min(coords),
            max(coords), '', sense, '', lineID
        ]
        formattedGFF.append(newLine)

    # name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder, genome, inputName)
    utils.unParseTable(formattedGFF, masterGFFFile, '\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    # GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile, bamChromList)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci: keep only loci with no mask overlap
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [
            locus for locus in referenceLoci
            if len(maskCollection.getOverlap(locus, 'both')) == 0
        ]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" %
              (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        referenceCollection, inputName, outFolder, stitchWindow, tssWindow,
        annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))

    # One shared root name drives the .gff, .debug and output map file names.
    # NOTE(review): '/' is true division on Python 3, so the KB label may
    # render as a float there -- confirm the intended interpreter.
    stitchedGFFName = '%s_%sKB_STITCHED' % (inputName,
                                            str(stitchWindow / 1000))
    if removeTSS:
        stitchedGFFName += '_TSS_DISTAL'
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    # MAPPING TO THE STITCHED GFF
    # the same control bam may appear several times in bamFileList,
    # so uniquify to prevent redundant mapping
    bamFileListUnique = utils.uniquify(list(bamFileList))
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName,
                                               bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (
            mappedFolder, stitchedGFFName, bamFileName)

        # skip bams whose mapping output is already present
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" %
                  (stitchedGFFFile, mappedOut1File))
            continue

        cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
            stitchedGFFFile, mappedOut1Folder, bamFile)
        print(cmd1)

        os.system(cmd1)
        if utils.checkOutput(mappedOut1File, 0.2, 5):
            print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" %
                  (stitchedGFFFile, bamFileName))
        else:
            print("ERROR: FAILED TO MAP %s FROM BAM: %s" %
                  (stitchedGFFFile, bamFileName))
            sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection,
                  referenceCollection,
                  bamFileList,
                  mappedFolder,
                  outputFile1,
                  refName=stitchedGFFName)

    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1,
                                       inputName + '_MERGED_SIGNAL',
                                       controlBams=options.control)

    # now try the merging
    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    controlName = 'NONE'
    # NOTE(review): this path is built as '%sROSE2_callSuper.R' (no slash)
    # while genomeDict uses '%s/annotation/...'; confirm whether
    # pipeline_dir carries a trailing slash.
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (
        pipeline_dir, outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper on each enhancer table in turn
    # for now don't use ranking bam to call top genes
    print('CALLING GENE MAPPING')
    for tableKind in ('SuperEnhancers', 'StretchEnhancers',
                      'SuperStretchEnhancers'):
        tableFile = "%s_%s.table.txt" % (inputName, tableKind)
        cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (
            pipeline_dir, genome, outFolder, tableFile)
        # every command is echoed before it runs
        print(cmd)
        os.system(cmd)

Example #2

Show file

File: enhancerPromoter.py Project: mufrdrk/pipeline

def main():
    '''
    main run method for enhancer promoter contribution tool

    Parses the command line, loads the input regions (converting .bed to GFF
    when needed), splits them against the TSS collection, maps the bams (and
    optional control bams) to the split GFF, builds the signal, average, peak
    and gene tables, then calls the R plotting and GSEA helpers.

    Raises:
        ValueError: if --activity is given but no column of the first row of
            the activity table contains an 'NM_' or 'NR_' identifier.
    '''

    parser = argparse.ArgumentParser(usage='%(prog)s [options]')

    # required flags
    parser.add_argument(
        "-b",
        "--bam",
        dest="bam",
        nargs='*',
        help="Enter a space separated list of .bam files for the main factor",
        required=True)
    parser.add_argument("-i",
                        "--input",
                        dest="input",
                        type=str,
                        help="Enter .gff or .bed file of regions to analyze",
                        required=True)
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        type=str,
        help=
        "specify a genome, HG18,HG19,HG38,MM8,MM9,MM10,RN6 are currently supported",
        required=True)

    # output flag
    parser.add_argument("-o",
                        "--output",
                        dest="output",
                        type=str,
                        help="Enter the output folder.",
                        required=True)

    # additional options flags and optional arguments
    parser.add_argument(
        "-a",
        "--activity",
        dest="activity",
        type=str,
        help=
        "specify a table where first column represents a list of active refseq genes",
        required=False)

    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        nargs='*',
        help=
        "Enter a space separated list of .bam files for background. If flagged, will perform background subtraction",
        required=False)
    parser.add_argument(
        "-w",
        "--window",
        dest="window",
        type=int,
        help=
        "Enter a window to define the TSS area +/- the TSS. Default is 1kb",
        required=False,
        default=1000)
    parser.add_argument(
        "--other-bams",
        dest="other",
        nargs='*',
        help="enter a space separated list of other bams to map to",
        required=False)

    parser.add_argument(
        "--name",
        dest="name",
        type=str,
        help=
        "enter a root name for the analysis, otherwise will try to find the name from the input file",
        required=False)

    parser.add_argument(
        "--top",
        dest="top",
        type=int,
        help=
        "Run the analysis on the top N genes by total signal. Default is 5000",
        required=False,
        default=5000)
    parser.add_argument(
        "--tads",
        dest="tads",
        type=str,
        help=
        "Include a .bed of tad regions to restrict enhancer/gene association",
        required=False,
        default=None)

    args = parser.parse_args()

    print(args)

    # minimum arguments needed to proceed; argparse already enforces presence,
    # but '-b' with zero bams yields an empty list, which is caught here
    if not (args.bam and args.input and args.genome and args.output):
        parser.print_help()
        return

    #=====================================================================================
    #===============================I. PARSING ARGUMENTS==================================
    #=====================================================================================

    print(
        '\n\n#======================================\n#===========I. DATA SUMMARY============\n#======================================\n'
    )

    # top analysis subset (argparse already coerced it to int)
    top = args.top

    # input genome
    genome = args.genome.upper()
    print('PERFORMING ANALYSIS ON %s GENOME BUILD' % (genome))

    # set of bams
    bamFileList = args.bam

    # bring in the input path
    inputPath = args.input

    # try to get the input name or use the name argument
    if args.name:
        analysisName = args.name
    else:
        analysisName = inputPath.split('/')[-1].split('.')[0]

    print('USING %s AS ANALYSIS NAME' % (analysisName))
    # setting up the output folder
    parentFolder = utils.formatFolder(args.output, True)
    outputFolder = utils.formatFolder('%s%s' % (parentFolder, analysisName),
                                      True)

    print('WRITING OUTPUT TO %s' % (outputFolder))

    if inputPath.split('.')[-1] == 'bed':
        # type is bed
        print('input in bed format, converting to gff')
        inputGFF = utils.bedToGFF(inputPath)
    else:
        inputGFF = utils.parseTable(inputPath, '\t')

    # the tss window (argparse already coerced it to int)
    window = args.window

    # activity path
    if args.activity:
        activityPath = args.activity
        activityTable = utils.parseTable(activityPath, '\t')

        # try to find the column for refseq id by scanning the first row;
        # when several cells match, the last one wins
        ref_col = None
        for i, cell in enumerate(activityTable[0]):
            cellText = str(cell)
            if cellText.count('NM_') > 0 or cellText.count('NR_') > 0:
                ref_col = i

        # fail with a clear message instead of an unbound-variable error
        if ref_col is None:
            raise ValueError(
                'ERROR: NO REFSEQ ID (NM_ or NR_) COLUMN FOUND IN ACTIVITY TABLE %s'
                % (activityPath))

        # this needs to be REFSEQ NM ID
        geneList = [line[ref_col] for line in activityTable]
        print('IDENTIFIED %s ACTIVE GENES' % (len(geneList)))

    else:
        geneList = []

    # check if tads are being invoked
    if args.tads:
        print('LOADING TAD LOCATIONS FROM %s' % (args.tads))
        tads_path = args.tads
    else:
        tads_path = ''

    print('LOADING ANNOTATION DATA FOR GENOME %s' % (genome))

    # important here to define the window
    startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict = loadAnnotFile(
        genome, window, geneList, True)

    print('FILTERING THE INPUT GFF FOR GOOD CHROMOSOMES')

    print(chrom_list)
    filtered_gff = [line for line in inputGFF if line[0] in chrom_list]

    print('%s of INITIAL %s REGIONS ARE IN GOOD CHROMOSOMES' %
          (len(filtered_gff), len(inputGFF)))

    #=====================================================================================
    #================II. IDENTIFYING TSS PROXIMAL AND DISTAL ELEMENTS=====================
    #=====================================================================================

    print(
        '\n\n#======================================\n#==II. MAPPING TO TSS/DISTAL REGIONS===\n#======================================\n'
    )

    # now we need to split the input region
    print('SPLITTING THE INPUT GFF USING A WINDOW OF %s' % (window))
    splitGFF = splitRegions(filtered_gff, tssCollection)
    print(len(filtered_gff))
    print(len(splitGFF))

    splitGFFPath = '%s%s_SPLIT.gff' % (outputFolder, analysisName)
    utils.unParseTable(splitGFF, splitGFFPath, '\t')
    print('WRITING TSS SPLIT GFF OUT TO %s' % (splitGFFPath))

    # now you have to map the bams to the gff
    print('MAPPING TO THE SPLIT GFF')
    mappedFolder = utils.formatFolder('%sbam_mapping' % (outputFolder), True)

    signalTable = mapBams(bamFileList, splitGFFPath, analysisName,
                          mappedFolder)
    signalTablePath = '%s%s_signal_table.txt' % (outputFolder, analysisName)
    utils.unParseTable(signalTable, signalTablePath, '\t')

    if args.control:
        controlBamFileList = args.control
        controlSignalTable = mapBams(controlBamFileList, splitGFFPath,
                                     analysisName, mappedFolder)
        controlSignalTablePath = '%s%s_control_signal_table.txt' % (
            outputFolder, analysisName)
        utils.unParseTable(controlSignalTable, controlSignalTablePath, '\t')

    # now create the background subtracted summarized average table
    print('CREATING AN AVERAGE SIGNAL TABLE')
    averageTable = makeAverageTable(outputFolder,
                                    analysisName,
                                    useBackground=args.control)
    averageTablePath = '%s%s_average_table.txt' % (outputFolder, analysisName)
    utils.unParseTable(averageTable, averageTablePath, '\t')

    # now load up all of the cpg and other parameters to make the actual peak table

    # first check if this has already been done
    peakTablePath = '%s%s_PEAK_TABLE.txt' % (outputFolder, analysisName)
    if utils.checkOutput(peakTablePath, 0.1, 0.1):
        print('PEAK TABLE OUTPUT ALREADY EXISTS')
        peakTable = utils.parseTable(peakTablePath, '\t')
    else:
        # NOTE(review): paramDict is not defined inside this function;
        # presumably a module-level name -- verify before relying on it.
        peakTable = makePeakTable(paramDict, splitGFFPath, averageTablePath,
                                  startDict, geneList, genomeDirectory,
                                  tads_path)
        utils.unParseTable(peakTable, peakTablePath, '\t')

    geneTable = makeGeneTable(peakTable, analysisName)

    geneTablePath = '%s%s_GENE_TABLE.txt' % (outputFolder, analysisName)
    utils.unParseTable(geneTable, geneTablePath, '\t')

    # if mouse, need to convert genes over
    if genome.count('MM') == 1:
        print('CONVERTING MOUSE NAMES TO HUMAN HOMOLOGS FOR GSEA')
        converted_geneTablePath = '%s%s_GENE_TABLE_CONVERTED.txt' % (
            outputFolder, analysisName)

        # keep the header row, then every gene that has a non-empty homolog
        converted_geneTable = [geneTable[0]]
        for line in geneTable[1:]:
            converted_name = mouse_convert_dict[line[0]]
            if len(converted_name) > 0:
                converted_geneTable.append([converted_name] + line[1:])

        # write once, after the loop: the file always exists (even when no
        # gene converted) and is not rewritten for every converted gene
        utils.unParseTable(converted_geneTable, converted_geneTablePath, '\t')

        geneTablePath = converted_geneTablePath
        geneTable = converted_geneTable

    #=====================================================================================
    #===================================III. PLOTTING ====================================
    #=====================================================================================

    print(
        '\n\n#======================================\n#===III. PLOTTING ENHANCER/PROMOTER===\n#======================================\n'
    )

    # if there are fewer genes in the gene table than the top genes, only run on all
    if len(geneTable) < top:
        print(
            'WARNING: ONLY %s GENES WITH SIGNAL AT EITHER PROMOTERS OR ENHANCERS. NOT ENOUGH TO RUN ANALYSIS ON TOP %s'
            % (len(geneTable) - 1, top))
        top = 0
        use_top = False
    else:
        use_top = True

    # now call the R code
    print('CALLING R PLOTTING SCRIPTS')
    callRWaterfall(geneTablePath, outputFolder, analysisName, top)

    #=====================================================================================
    #==================================IV. RUNNING GSEA===================================
    #=====================================================================================

    print(
        '\n\n#======================================\n#============IV. RUNNING GSEA=========\n#======================================\n'
    )

    # now let's call gsea
    print('RUNNING GSEA ON C2')
    callGSEA(outputFolder, analysisName, top, 'enhancer_vs_promoter', use_top)
    callGSEA(outputFolder, analysisName, top, 'total_contribution', use_top)

    if use_top:
        print('DETECTING GSEA OUTPUT FOR TOP %s GENES' % (top))
        # for top by enhancer v promoter metric
        top_promoterTablePath, top_distalTablePath = detectGSEAOutput(
            analysisName, outputFolder, top, 'enhancer_vs_promoter')
        top_signalTablePath, top_backgroundTablePath = detectGSEAOutput(
            analysisName, outputFolder, top, 'total_contribution')

        print('MAKING NES PLOTS FOR TOP %s GENES' % (top))
        callR_GSEA(top_promoterTablePath, top_distalTablePath, outputFolder,
                   analysisName + '_enhancer_vs_promoter', top)
        callR_GSEA(top_signalTablePath, top_backgroundTablePath, outputFolder,
                   analysisName + '_total_contribution', top)

    print('DETECTING GSEA OUTPUT FOR ALL GENES')
    # for all genes
    all_promoterTablePath, all_distalTablePath = detectGSEAOutput(
        analysisName, outputFolder, 'all')

    print('MAKING NES PLOTS FOR ALL GENES')
    callR_GSEA(all_promoterTablePath, all_distalTablePath, outputFolder,
               analysisName, 'all')

    # these files can be parsed to make the NES plot
    print('ALL DONE WITH ANALYSIS FOR %s' % (analysisName))

Example #3

Show file

File: enhancerPromoter.py Project: shengqh/pipeline

def main():

    '''
    main run method for enhancer promoter contribution tool

    Parses the command line, splits the input regions against the TSS
    collection, maps the bams to the split regions, writes the signal,
    average, peak and gene tables into the output folder, and finally calls
    the R waterfall plotting and GSEA steps.

    Every flag declared with required=True must be supplied; argparse exits
    with a usage message otherwise.
    '''

    parser = argparse.ArgumentParser(usage='%(prog)s [options]')

    # required flags
    parser.add_argument("-b", "--bam", dest="bam", nargs='*',
                        help="Enter a space separated list of .bam files for the main factor", required=True)
    parser.add_argument("-i", "--input", dest="input", type=str,
                        help="Enter .gff or .bed file of regions to analyze", required=True)
    parser.add_argument("-g", "--genome", dest="genome", type=str,
                        help="specify a genome, HG18,HG19,MM8,MM9,MM10,RN6 are currently supported", required=True)
    # output flag
    parser.add_argument("-o", "--output", dest="output", type=str,
                        help="Enter the output folder.", required=True)


    # additional options flags and optional arguments
    parser.add_argument("-a", "--activity", dest="activity", type=str,
                        help="specify a table where first column represents a list of active refseq genes", required=False)

    parser.add_argument("-c", "--control", dest="control", nargs='*',
                        help="Enter a space separated list of .bam files for background. If flagged, will perform background subtraction", required=False)
    parser.add_argument("-w", "--window", dest="window",type=int,
                        help="Enter a window to define the TSS area +/- the TSS. Default is 1kb", required=False, default=1000)
    parser.add_argument("--other-bams", dest="other", nargs='*',
                        help="enter a space separated list of other bams to map to", required=False)

    parser.add_argument("--name", dest="name", type=str,
                        help="enter a root name for the analysis, otherwise will try to find the name from the input file", required=False)


    parser.add_argument("--top", dest="top", type=int,
                        help="Run the analysis on the top N genes by total signal. Default is 5000", required=False,default=5000)
    parser.add_argument("--tads", dest="tads", type=str,
                        help="Include a .bed of tad regions to restrict enhancer/gene association", required=False,default=None)

    #add by Quanhu Sheng
    parser.add_argument("--genomeDirectory", dest="genomeDirectory", type=str,
                        help="Enter the folder contains chromosome sequence in fasta format", required=True)
    #gseaPath = '/usr/local/bin/gsea/gsea2-2.2.2.jar'
    #gmxPath = '/grail/annotations/gsea/c2.all.v5.1.symbols.gmt' #C2 set
    parser.add_argument("--gseaPath", dest="gseaPath", type=str, help="Enter GSEA jar file location", required=True)
    parser.add_argument("--gmxPath", dest="gmxPath", type=str, help="Enter GSEA gmt file location, such as c2.all.v5.1.symbols.gmt", required=True)
    parser.add_argument("--cpgPath", dest="cpgPath", type=str, help="Enter cpg coordinates in bed format", required=True)

    args = parser.parse_args()

    print(args)

    #minimum arguments needed to proceed
    #(also guards against flags that were given with an empty value, e.g. -b with no files)
    if args.bam and args.input and args.genome and args.genomeDirectory and args.output and args.gseaPath and args.gmxPath and args.cpgPath:
        #top analysis subset
        top = args.top

        #input genome
        genome = args.genome
        print('PERFORMING ANALYSIS ON %s GENOME BUILD' % (genome))

        #set of bams
        bamFileList = args.bam

        #bring in the input path
        inputPath = args.input

        #try to get the input name or use the name argument
        if args.name:
            analysisName = args.name
        else:
            #file name without folder and without anything after the first '.'
            analysisName = inputPath.split('/')[-1].split('.')[0]

        print('USING %s AS ANALYSIS NAME' % (analysisName))
        #setting up the output folder
        parentFolder = utils.formatFolder(args.output,True)
        outputFolder = utils.formatFolder('%s%s' % (parentFolder,analysisName),True)

        print('WRITING OUTPUT TO %s' % (outputFolder))


        if inputPath.split('.')[-1] == 'bed':
            #type is bed
            inputGFF = utils.bedToGFF(inputPath)
        else:
            #anything else is read as a tab delimited gff table
            inputGFF = utils.parseTable(inputPath,'\t')

        #the tss window
        window = int(args.window)

        #activity path
        if args.activity:
            activityPath = args.activity
            activityTable = utils.parseTable(activityPath,'\t')

            #try to find the column for refseq id by scanning the first row
            #if several columns match, the last matching column is used
            ref_col = None
            for i,cell in enumerate(activityTable[0]):
                if str(cell).count('NM_') > 0 or str(cell).count('NR_') > 0:
                    ref_col = i

            if ref_col is None:
                #no NM_/NR_ id in the first row. previously this left ref_col
                #unbound and crashed below; fall back to the first column,
                #which is where the --activity help says the ids live
                print('WARNING: NO REFSEQ ID COLUMN FOUND IN %s. USING THE FIRST COLUMN' % (activityPath))
                ref_col = 0

            geneList = [line[ref_col] for line in activityTable] # this needs to be REFSEQ NM ID
            print('IDENTIFIED %s ACTIVE GENES' % (len(geneList)))
        else:
            geneList = []

        #check if tads are being invoked
        if args.tads:
            print('LOADING TAD LOCATIONS FROM %s' % (args.tads))
            tads_path = args.tads
        else:
            tads_path = ''

        print('LOADING ANNOTATION DATA FOR GENOME %s' % (genome))

        genomeDirectory=args.genomeDirectory

        #important here to define the window
        #geneList is replaced by the list returned from loadAnnotFile
        startDict,tssCollection,geneList = loadAnnotFile(genome,window,geneList,True)
        print('IDENTIFIED %s valid ACTIVE GENES' % (len(geneList)))
        print(len(startDict))

        #now we need to split the input region
        print('SPLITTING THE INPUT GFF USING A WINDOW OF %s' % (window))
        splitGFF = splitRegions(inputGFF,tssCollection)
        print(len(inputGFF))
        print(len(splitGFF))

        splitGFFPath = '%s%s_SPLIT.gff' % (outputFolder,analysisName)
        utils.unParseTable(splitGFF,splitGFFPath,'\t')
        print('WRITING TSS SPLIT GFF OUT TO %s' % (splitGFFPath))

        #now you have to map the bams to the gff
        print('MAPPING TO THE SPLIT GFF')
        mappedFolder = utils.formatFolder('%sbam_mapping' % (outputFolder),True)

        signalTable = mapBams(bamFileList,splitGFFPath,analysisName,mappedFolder)
        signalTablePath = '%s%s_signal_table.txt' % (outputFolder,analysisName)
        utils.unParseTable(signalTable,signalTablePath,'\t')

        if args.control:
            controlBamFileList = args.control
            controlSignalTable = mapBams(controlBamFileList,splitGFFPath,analysisName,mappedFolder)
            controlSignalTablePath = '%s%s_control_signal_table.txt' % (outputFolder,analysisName)
            utils.unParseTable(controlSignalTable,controlSignalTablePath,'\t')

        #now create the background subtracted summarized average table

        print('CREATING AN AVERAGE SIGNAL TABLE')
        #useBackground receives the control bam list itself (None/empty when no control was given)
        averageTable = makeAverageTable(outputFolder,analysisName,useBackground = args.control)
        averageTablePath = '%s%s_average_table.txt' % (outputFolder,analysisName)
        utils.unParseTable(averageTable,averageTablePath,'\t')


        #now load up all of the cpg and other parameters to make the actual peak table

        #first check if this has already been done
        peakTablePath = '%s%s_PEAK_TABLE.txt' % (outputFolder,analysisName)
        if utils.checkOutput(peakTablePath,0.1,0.1):
            print('PEAK TABLE OUTPUT ALREADY EXISTS')
            peakTable = utils.parseTable(peakTablePath,'\t')
        else:
            peakTable = makePeakTable(args.cpgPath,splitGFFPath,averageTablePath,startDict,geneList,genomeDirectory,tads_path)
            utils.unParseTable(peakTable,peakTablePath,'\t')

        geneTable = makeGeneTable(peakTable,analysisName)

        geneTablePath = '%s%s_GENE_TABLE.txt' % (outputFolder,analysisName)
        utils.unParseTable(geneTable,geneTablePath,'\t')

        #asking for more genes than the table holds is the same as asking for all of them
        if top > len(geneTable):
            top = 'all'

        #now call the R code
        print('CALLING R PLOTTING SCRIPTS')
        callRWaterfall(geneTablePath,outputFolder,analysisName,top)

        #now let's call gsea
        print('RUNNING GSEA ON C2')
        callGSEA(args.gseaPath, args.gmxPath, outputFolder,analysisName,top)

        if top != 'all':
            print('DETECTING GSEA OUTPUT FOR TOP %s GENES' % (top))
            top_promoterTablePath,top_distalTablePath = detectGSEAOutput(analysisName,outputFolder,top)

            print('MAKING NES PLOTS FOR TOP %s GENES' % (top))
            callR_GSEA(top_promoterTablePath,top_distalTablePath,outputFolder,analysisName,top)

        print('DETECTING GSEA OUTPUT FOR ALL GENES')
        all_promoterTablePath,all_distalTablePath = detectGSEAOutput(analysisName,outputFolder,'all')

        print('MAKING NES PLOTS FOR ALL GENES')
        callR_GSEA(all_promoterTablePath,all_distalTablePath,outputFolder,analysisName,'all')


        #these files can be parsed to make the NES plot


        #gsea_report_for_DISTAL_1459192369220.xls
        #[x for x in fileList if x.count('report_for') == 1and x.count('xls') ==1]
        print('ALL DONE WITH ANALYSIS FOR %s' % (analysisName))

Example #4

Show file

def main():
    '''
    main run call

    Parses the command line, converts/copies the input regions into the gff
    folder, optionally masks them, stitches them together, maps every bam to
    the stitched regions with bamliquidator, writes the enhancer region map,
    and then launches the R super-enhancer calling script and the gene mapper.

    Exits after printing the help when a required flag is missing, when the
    genome build is not in the supported table, when the mask file is neither
    .bed nor .gff, or when a bam fails to map.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option(
        "-i",
        "--i",
        dest="input",
        nargs=1,
        default=None,
        help="Enter a .gff or .bed file of binding sites used to make enhancers"
    )
    parser.add_option("-r",
                      "--rankby",
                      dest="rankby",
                      nargs=1,
                      default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o",
                      "--out",
                      dest="out",
                      nargs=1,
                      default=None,
                      help="Enter an output folder")
    parser.add_option("-g",
                      "--genome",
                      dest="genome",
                      nargs=1,
                      default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option(
        "-b",
        "--bams",
        dest="bams",
        nargs=1,
        default=None,
        help="Enter a comma separated list of additional bam files to map to")
    parser.add_option("-c",
                      "--control",
                      dest="control",
                      nargs=1,
                      default=None,
                      help="Enter a control bamfile for the rankby bam")
    parser.add_option(
        "-s",
        "--stitch",
        dest="stitch",
        nargs=1,
        default='',
        help=
        "Enter a max linking distance for stitching. Default will determine optimal stitching parameter"
    )
    parser.add_option(
        "-t",
        "--tss",
        dest="tss",
        nargs=1,
        default=0,
        help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option(
        "--mask",
        dest="mask",
        nargs=1,
        default=None,
        help=
        "Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions"
    )

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        sys.exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE
    inputExtension = options.input.split('.')[-1]
    if inputExtension == 'bed':
        # CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]  # strips the trailing '.bed'
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    else:
        if inputExtension != 'gff':
            print(
                'WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT'
            )
        # COPY THE INPUT GFF TO THE GFF FOLDER
        # the original path (not the copy) is what gets used downstream
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # GETTING THE LIST OF BAMFILES TO PROCESS
    # order is rankby, then control (if any), then any additional bams
    bamFileList = [options.rankby]
    if options.control:
        bamFileList.append(options.control)

    if options.bams:
        bamFileList += options.bams.split(',')
        #bamFileList = utils.uniquify(bamFileList) # makes sad when you have the same control bam over and over again
    # optional args

    # Stitch parameter
    # '' is passed through to regionStitching, which returns the window actually used
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    # annotation files are looked up relative to the current working directory
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (cwd),
        'RN6': '%s/annotation/rn6_refseq.ucsc' % (cwd),
    }

    try:
        annotFile = genomeDict[genome.upper()]
    except KeyError:
        # report an unknown build instead of dying with a raw KeyError traceback
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()

    # MAKING THE START DICT
    # NOTE: startDict is not used again in this function
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(inputGFFFile, bamChromList)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        # if it's a bed file
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        # keep only the loci that have no overlap with the mask
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [
            locus for locus in referenceLoci
            if len(maskCollection.getOverlap(locus, 'both')) == 0
        ]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" %
              (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(
        referenceCollection, inputName, outFolder, stitchWindow, tssWindow,
        annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)
    # making sure start/stop ordering are correct
    # lines are edited in place
    for line in stitchedGFF:
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitchWindow)
    print(type(stitchWindow))

    # the stitched name carries the stitch window in kb and, when TSS regions
    # were excluded, a _TSS_DISTAL suffix; gff and debug files share that name
    stitchedGFFName = '%s_%sKB_STITCHED' % (inputName,
                                            str(stitchWindow / 1000))
    if removeTSS:
        stitchedGFFName += '_TSS_DISTAL'
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.
    bamliquidator_path = 'bamliquidator_batch.py'

    #prevent redundant mapping
    bamFileListUnique = utils.uniquify(list(bamFileList))
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName,
                                               bamFileName)
        mappedOut1File = '%s/matrix.txt' % (mappedOut1Folder)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            # an existing matrix is reused rather than remapped
            print("FOUND %s MAPPING DATA FOR BAM: %s" %
                  (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (
                stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" %
                      (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" %
                      (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    # note: the full (non-uniquified) bam list is passed here
    mapCollection(stitchedCollection,
                  referenceCollection,
                  bamFileList,
                  mappedFolder,
                  outputFile1,
                  refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    # the R script gets the control bam file name, or the literal NONE
    if options.control:
        controlName = options.control.split('/')[-1]
    else:
        controlName = 'NONE'
    cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (
        outFolder, outputFile1, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    time.sleep(20)
    # one background gene mapper run per enhancer table written by the R step
    for tableType in ('SuperEnhancers', 'StretchEnhancers',
                      'SuperStretchEnhancers'):
        tableFile = "%s_%s.table.txt" % (inputName, tableType)
        if options.control:
            cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % (
                genome, options.rankby, options.control, outFolder, tableFile)
        else:
            cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % (
                genome, options.rankby, outFolder, tableFile)
        os.system(cmd)

Example #5

Show file

File: ROSE2_META.py Project: linlabcode/pipeline

def main():
    '''
    main run call

    Meta variant of the ROSE2 run: -i, -r and -c each take a comma separated
    list. All input region files are converted/copied into the gff folder,
    merged into one master gff with renumbered region ids, optionally masked,
    stitched, and mapped against every bam. The per-bam region map is then
    collapsed into a merged signal table before the R super-enhancer calling
    script and the gene mapper are run.

    Exits after printing the help when a required flag is missing, when the
    number of control bams is neither one nor equal to the number of rankby
    bams, when the genome build is unsupported, when the mask file is neither
    .bed nor .gff, or when a bam fails to map.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        sys.exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)

    inputList = [inputFile for inputFile in  options.input.split(',') if len(inputFile) > 1]

    #converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        #the extension has to come from this file, not from the whole
        #comma separated option string
        inputExtension = inputFile.split('.')[-1]
        if inputExtension == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][0:-4] #strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        else:
            if inputExtension != 'gff':
                print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder,inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)


    # GETTING THE LIST OF BAMFILES TO PROCESS
    #either same number of bams for rankby and control
    #or only 1 control #or none!
    #bamlist should be all rankby bams followed by control bams


    rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) > 0]
    if options.control:
        controlBamList = [bam for bam in options.control.split(',') if len(bam) > 0]

        if len(controlBamList) == len(rankbyBamList):
            #case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            #case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList*len(rankbyBamList)
        else:
            print('ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE')
            sys.exit()
    else:
        bamFileList = rankbyBamList


    # Stitch parameter
    # '' is passed through to regionStitching, which returns the window actually used
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0


    # GETTING THE GENOME
    # str.upper() rather than string.upper(), which only exists on python 2
    genome = options.genome.upper()
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE

    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()


    #FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        #guess from the first input file: name without folder or extension
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))


    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    #use a simpler unique region naming system
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0],'\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile,'\t')
            gffCollection = utils.gffToLocusCollection(gff,50)
            inputLoci += gffCollection.getLoci()


        inputCollection = utils.LocusCollection(inputLoci,50)
        inputCollection = inputCollection.stitchCollection() # stitches to produce unique regions

        inputGFF = utils.locusCollectionToGFF(inputCollection)

    formattedGFF = []
    #now number things appropriately
    for i,line in enumerate(inputGFF):

        #regions are renamed inputname_N in file order
        chrom = line[0]
        coords = [int(line[3]) ,int(line[4])]
        sense = line[6]

        lineID = '%s_%s' % (inputName,str(i+1)) #1 indexing

        #min/max guarantees start <= stop in the master gff
        newLine = [chrom,lineID,lineID,min(coords),max(coords),'',sense,'',lineID]
        formattedGFF.append(newLine)

    #name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder,genome.upper(),inputName)
    utils.unParseTable(formattedGFF,masterGFFFile,'\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))


    # MAKING THE START DICT
    # NOTE: startDict is not used again in this function
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    #GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    #LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile,bamChromList)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)


    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        # if it's a bed file
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        # keep only the loci that have no overlap with the mask
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    print(stitchWindow)
    print(type(stitchWindow))
    if not removeTSS:
        stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow / 1000))
    else:
        stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (gffFolder, inputName, str(stitchWindow / 1000))
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, str(stitchWindow / 1000))
        debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (gffFolder, inputName, str(stitchWindow / 1000))

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))



    # MAPPING TO THE NON STITCHED (ORIGINAL GFF)
    # MAPPING TO THE STITCHED GFF

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail.



    #prevent redundant mapping
    bamFileListUnique = utils.uniquify(list(bamFileList))
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF
        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, stitchedGFFName, bamFileName)
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            # an existing matrix is reused rather than remapped
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File,0.2,5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    # note: the full (non-uniquified) bam list is passed here
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)


    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1,inputName + '_MERGED_SIGNAL',controlBams=options.control)

    #now try the merging

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    #the R script is always called with NONE as the control name and is
    #given the collapsed table (metaOutputFile) rather than the per-bam region map
    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (pipeline_dir,outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)


    # calling the gene mapper
    print('CALLING GENE MAPPING')

    #for now don't use ranking bam to call top genes
    #one gene mapper run per enhancer table, executed one after another
    for tableType in ('SuperEnhancers', 'StretchEnhancers', 'SuperStretchEnhancers'):
        tableFile = "%s_%s.table.txt" % (inputName, tableType)
        cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir,genome, outFolder, tableFile)
        print(cmd)
        os.system(cmd)

Example #6

Show file

def make_shep_on_mycn_landscape(shep_on_dataFile):

    '''
    finds mycn peaks in shep21 that are conserved in nb and segregates them into promoter or enhancer

    Every SHEP MYCN peak that overlaps the conserved NB MYCN collection is
    written to three gffs: the peak itself, a window of 5kb either side of the
    peak center, and the peak extended by a 1kb flank on each end. Conserved
    peaks that overlap an active TSS locus are also written to the promoter
    gffs; otherwise, peaks that overlap a SHEP H3K27AC enhancer are written to
    the enhancer gffs. TSS overlap takes precedence over enhancer overlap.

    shep_on_dataFile: path handed to pipeline_dfci.loadDataTable

    Relies on the module level macsEnrichedFolder, gffFolder, projectFolder,
    geneListFolder and annotFile. Returns nothing; the nine gffs are written
    into gffFolder.
    '''
    # the parsed table is not used below; the call is kept so that a bad data
    # table still fails at this point exactly as it did before
    pipeline_dfci.loadDataTable(shep_on_dataFile)

    print('LOADING SHEP ON MYCN SITES')
    # load all of the shep_on sites from the peak bed
    shep_on_bed_path = '%sSHEP_6HR_MYCN_peaks.bed' % (macsEnrichedFolder)
    shep_on_bed = utils.parseTable(shep_on_bed_path, '\t')
    shep_on_gff = utils.bedToGFF(shep_on_bed)

    # now get the conserved NB MYCN regions
    nb_conserved_mycn_gff_file = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    nb_conserved_mycn_collection = utils.gffToLocusCollection(nb_conserved_mycn_gff_file)

    print('LOADING SHEP ACTIVE ENHANCERS')
    # make a collection of enhancers
    shep_enhancer_file = '%smeta_rose/SHEP_ON_H3K27AC/SHEP_ON_H3K27AC_AllEnhancers.table.txt' % (projectFolder)
    shep_enhancer_collection = utils.makeSECollection(shep_enhancer_file, 'SHEP_H3K27AC')

    # now get the active promoters
    print('LOADING SHEP ACTIVE PROMOTERS')
    startDict = utils.makeStartDict(annotFile)
    shep_transcribed_file = '%sHG19_SHEP_ON_H3K27AC_ACTIVE.txt' % (geneListFolder)
    shep_transcribed_table = utils.parseTable(shep_transcribed_file, '\t')
    # column 1 of the transcribed table holds the ID handed to makeTSSLocus
    transcribedList = [line[1] for line in shep_transcribed_table]
    tssLoci = [utils.makeTSSLocus(refID, startDict, 1000, 1000) for refID in transcribedList]
    shep_tss_collection = utils.LocusCollection(tssLoci, 50)

    # now initialize the 9 gffs we will need (3 peak sets x 3 window sizes)
    shep_mycn_gff = []
    shep_mycn_gff_5kb = []
    shep_mycn_gff_1kb = []

    shep_mycn_promoter_gff = []
    shep_mycn_promoter_gff_1kb = []
    shep_mycn_promoter_gff_5kb = []

    shep_mycn_enhancer_gff = []
    shep_mycn_enhancer_gff_1kb = []
    shep_mycn_enhancer_gff_5kb = []

    # and their respective file names
    shep_mycn_gff_file = '%sHG19_SHEP_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    shep_mycn_gff_5kb_file = '%sHG19_SHEP_MYCN_CONSERVED_-5kb_+5kb.gff' % (gffFolder)
    shep_mycn_gff_1kb_file = '%sHG19_SHEP_MYCN_CONSERVED_-1kb_+1kb.gff' % (gffFolder)

    shep_mycn_promoter_gff_file = '%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-0_+0.gff' % (gffFolder)
    shep_mycn_promoter_gff_5kb_file = '%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-5kb_+5kb.gff' % (gffFolder)
    shep_mycn_promoter_gff_1kb_file = '%sHG19_SHEP_MYCN_CONSERVED_PROMOTER_-1kb_+1kb.gff' % (gffFolder)

    shep_mycn_enhancer_gff_file = '%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-0_+0.gff' % (gffFolder)
    shep_mycn_enhancer_gff_5kb_file = '%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-5kb_+5kb.gff' % (gffFolder)
    shep_mycn_enhancer_gff_1kb_file = '%sHG19_SHEP_MYCN_CONSERVED_ENHANCER_-1kb_+1kb.gff' % (gffFolder)

    print('ITERATING THROUGH SHEP MYCN PEAKS')

    ticker = 0
    enhancer = 0
    promoter = 0
    other = 0
    for line in shep_on_gff:
        # progress report every 1000 peaks; print() form works on python 2 and 3
        if ticker % 1000 == 0:
            print(ticker)
        ticker += 1
        peakID = '%s_%s' % ('SHEP_MYCN', str(ticker))

        lineLocus = utils.Locus(line[0], line[3], line[4], '.', peakID)

        # peaks that are not conserved in NB are dropped and not counted
        if not nb_conserved_mycn_collection.getOverlap(lineLocus):
            continue

        peakStart = int(line[3])
        peakStop = int(line[4])
        # floor division keeps the window coordinates integral on python 3 too
        peakCenter = (peakStart + peakStop) // 2

        gffLine = [line[0], peakID, peakID, line[3], line[4], '', '.', '', peakID]
        gffLine_5kb = [line[0], peakID, peakID, peakCenter - 5000, peakCenter + 5000, '', '.', '', peakID]
        # the 1kb is not a center +/- but a flank
        gffLine_1kb = [line[0], peakID, peakID, peakStart - 1000, peakStop + 1000, '', '.', '', peakID]

        shep_mycn_gff.append(gffLine)
        shep_mycn_gff_5kb.append(gffLine_5kb)
        shep_mycn_gff_1kb.append(gffLine_1kb)

        # tss overlap should take precedence over enhancer overlap
        if shep_tss_collection.getOverlap(lineLocus, 'both'):
            shep_mycn_promoter_gff.append(gffLine)
            shep_mycn_promoter_gff_5kb.append(gffLine_5kb)
            shep_mycn_promoter_gff_1kb.append(gffLine_1kb)
            promoter += 1
        # now check for enhancer overlap
        elif shep_enhancer_collection.getOverlap(lineLocus, 'both'):
            shep_mycn_enhancer_gff.append(gffLine)
            shep_mycn_enhancer_gff_5kb.append(gffLine_5kb)
            shep_mycn_enhancer_gff_1kb.append(gffLine_1kb)
            enhancer += 1
        else:
            other += 1

    print('Of %s shep on mycn peaks' % (len(shep_on_gff)))
    print('%s are promoter' % (promoter))
    print('%s are enhancer' % (enhancer))
    print('%s are other' % (other))

    # now write out the gffs
    utils.unParseTable(shep_mycn_gff, shep_mycn_gff_file, '\t')
    utils.unParseTable(shep_mycn_gff_5kb, shep_mycn_gff_5kb_file, '\t')
    utils.unParseTable(shep_mycn_gff_1kb, shep_mycn_gff_1kb_file, '\t')

    utils.unParseTable(shep_mycn_promoter_gff, shep_mycn_promoter_gff_file, '\t')
    utils.unParseTable(shep_mycn_promoter_gff_5kb, shep_mycn_promoter_gff_5kb_file, '\t')
    utils.unParseTable(shep_mycn_promoter_gff_1kb, shep_mycn_promoter_gff_1kb_file, '\t')

    utils.unParseTable(shep_mycn_enhancer_gff, shep_mycn_enhancer_gff_file, '\t')
    utils.unParseTable(shep_mycn_enhancer_gff_5kb, shep_mycn_enhancer_gff_5kb_file, '\t')
    utils.unParseTable(shep_mycn_enhancer_gff_1kb, shep_mycn_enhancer_gff_1kb_file, '\t')

Example #7

Show file

File: carg_box_seq_freq.py Project: linlabbcm/RASMC_Phenotypic_Switching

    new_table = []
    for line in bed_table[1:]:
        chrom = line[0]
        start = int(line[1]) - 50
        stop = int(line[2]) + 50
        edge = line[3]
        strand = line[4]
        new_line = [chrom, start, stop, edge, strand]
        new_table.append(new_line)
    tmp_path = '%s%s' % (temp_dir, bed)
    if len(new_table) > 0:
        utils.unParseTable(new_table, tmp_path, '\t')
        gff = bed.split('.')[0] + '.gff'
        print(gff)
        gff_path = '%s%s' % (temp_dir, gff)
        utils.bedToGFF(tmp_path, gff_path)

        genome_directory = genome_dir_dict[genome]

        print('gffToFasta Tool running on ' + gff_path + ' for ' + genome)
        fasta = utils.gffToFasta(genome,
                                 genome_directory,
                                 gff_path,
                                 UCSC=True,
                                 useID=False)

        print('Creating density table')
        table = []
        header = ['DENSITY', 'POSITIONS', 'POS_COUNT', 'SUBPEAK_LENGTH']
        table.append(header)

Example #8

Show file

File: ROSE2_main.py Project: afederation/pipeline

def main():
    '''
    main run call

    Parses the command line, converts/copies the input regions into the gff
    folder, optionally masks them, stitches them, maps every bam onto both the
    stitched and the original gff with bamliquidator_batch.py, builds the
    enhancer region map, and finally shells out to ROSE2_callSuper.R and
    ROSE2_geneMapper.py.

    Exits through sys.exit() when required flags are missing, and with a
    non-zero status when the mask file type is unsupported or a bam fails to
    map.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-b", "--bams", dest="bams", nargs=1, default=None,
                      help="Enter a comma separated list of additional bam files to map to")
    # the help text used to be a copy of the --rankby one
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="bamfile to use as the control for the rankby bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        # sys.exit rather than the interactive-only exit() helper
        sys.exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE
    inputExtension = options.input.split('.')[-1]
    if inputExtension == 'bed':
        # CONVERTING A BED TO GFF
        inputGFFName = options.input.split('/')[-1][0:-4]
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    else:
        if inputExtension != 'gff':
            print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
        # COPY THE INPUT GFF TO THE GFF FOLDER
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # GETTING THE LIST OF BAMFILES TO PROCESS
    bamFileList = [options.rankby]
    if options.control:
        bamFileList.append(options.control)

    if options.bams:
        bamFileList += options.bams.split(',')
        bamFileList = utils.uniquify(bamFileList)

    # optional args

    # Stitch parameter: '' means regionStitching is left to choose the window
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS
    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    # annotation files are looked up relative to the current working directory
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }

    annotFile = genomeDict[genome.upper()]

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFFFile)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        # if it's a bed file
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            # non-zero status so callers can tell the run failed
            sys.exit(1)
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(inputGFFFile, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)
    # making sure start/stop ordering are correct
    for line in stitchedGFF:
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3], line[4] = stop, start

    print(stitchWindow)
    print(type(stitchWindow))

    # every stitched output shares one name stem; only the extension differs
    stitchTag = '%sKB_STITCHED' % (str(stitchWindow / 1000))
    if removeTSS:
        stitchTag += '_TSS_DISTAL'
    stitchedGFFName = '%s_%s' % (inputName, stitchTag)
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local, otherwise fail.
    # NOTE(review): the fallback is checked with os.path.isfile, i.e. relative
    # to the working directory, not searched for on $PATH
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    for bamFile in bamFileList:

        bamFileName = bamFile.split('/')[-1]

        # MAPPING TO THE STITCHED GFF, THEN TO THE NON STITCHED (ORIGINAL) GFF
        mappingTargets = (
            (stitchedGFFFile, stitchedGFFName),
            (inputGFFFile, inputName),
        )
        for targetGFFFile, targetName in mappingTargets:
            mappedOutFolder = '%s%s_%s_MAPPED' % (mappedFolder, targetName, bamFileName)
            mappedOutFile = '%s/matrix.gff' % (mappedOutFolder)

            # reuse an existing mapping instead of running the liquidator again
            if utils.checkOutput(mappedOutFile, 0.2, 0.2):
                # report the gff that was actually mapped (the original gff
                # branch used to name the stitched gff here)
                print("FOUND %s MAPPING DATA FOR BAM: %s" % (targetGFFFile, mappedOutFile))
                continue

            mapCmd = "python " + bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (targetGFFFile, mappedOutFolder, bamFile)
            print(mapCmd)

            mapProcess = subprocess.Popen(mapCmd, stdout=subprocess.PIPE, shell=True)
            mapStdout = mapProcess.communicate()[0]
            if len(mapStdout) > 0:  # test if mapping worked correctly
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (targetGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (targetGFFFile, bamFileName))
                # non-zero status so callers can tell the run failed
                sys.exit(1)

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    # the R script receives the control bam file name, or 'NONE' without one
    if options.control:
        controlName = options.control.split('/')[-1]
    else:
        controlName = 'NONE'
    cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (outFolder, outputFile1, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    time.sleep(20)
    # one gene mapper run per table the R script is expected to have written
    for tableKind in ('SuperEnhancers', 'StretchEnhancers', 'SuperStretchEnhancers'):
        tableFile = "%s_%s.table.txt" % (inputName, tableKind)
        if options.control:
            cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, tableFile)
        else:
            cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, tableFile)
        os.system(cmd)

Example #9

Show file

def main():
    '''
    main analysis for the MYCN project

    Changes into projectFolder, summarizes the mouse data table, converts the
    HG19 figure gene gff to a bed (to be lifted over to mm9 by hand), builds
    the mm9 figure gff from the lifted-over bed plus a handful of manually
    added regions, and plots the mouse data over that gff.

    Relies on module level names: projectFolder, hg19_projectFolder,
    bedFolder, gffFolder and mouse_dataFile.
    '''

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#======================I, LOADING DATA ANNOTATION======================'
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible

    #for ChIP-Seq
    pipeline_dfci.summary(mouse_dataFile)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#============II. MAKING A BED OUT OF HG19 FIGURE REGIONS==============='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    hg19_gff_path = '%sgff/HG19_NB_FIGURE_GENES.gff' % (hg19_projectFolder)

    #load the hg19 figure gff as a table and echo it for inspection
    hg19_gff = utils.parseTable(hg19_gff_path, '\t')
    print(hg19_gff)

    #convert to bed, echo it, and write it next to the hg19 project
    hg19_bed = utils.gffToBed(hg19_gff)
    print(hg19_bed)
    hg19_bed_path = '%sbeds/HG19_NB_FIGURE_GENES.bed' % (hg19_projectFolder)
    utils.unParseTable(hg19_bed, hg19_bed_path, '\t')
    #need to manually lift this over to mm9
    #https://genome.ucsc.edu/cgi-bin/hgLiftOver

    #the lifted-over bed is expected to already exist at mm9_bed_path
    mm9_bed_path = '%sMM9_NB_FIGURE_GENES_LIFTOVER.bed' % (bedFolder)
    mm9_gff_path = '%sMM9_NB_FIGURE_GENES_LIFTOVER.gff' % (gffFolder)
    #NOTE(review): bedToGFF is called with only the bed path and its result is
    #concatenated with a list below, so it presumably returns the gff as a
    #list of rows -- confirm against utils
    mm9_gff = utils.bedToGFF(mm9_bed_path)

    #now add some additional manual regions

    #each row has 9 fields, the same layout as the gff rows it is appended to;
    #presumably mm9 coordinates since they join the mm9 gff -- TODO confirm
    added_gff_regions = [
        [
            'chr12', 'TWIST1_ENHANCER', 'TWIST1_ENHANCER', 34639818, 34656263,
            '', '-', '', 'TWIST1_ENHANCER'
        ],
        [
            'chr11', 'NPM1_PROMOTER_2', 'NPM1_PROMOTER_2', 33049820, 33065883,
            '', '+', '', 'NPM1_PROMOTER_2'
        ],
        [
            'chr6', 'GATA2_ENHANCER', 'GATA2_ENHANCER', 88135802, 88159867, '',
            '+', '', 'GATA2_ENHANCER'
        ],
        [
            'chr7', 'PHOX2A', 'PHOX2A', 108964211, 108974610, '', '+', '',
            'PHOX2A'
        ],
        [
            'chr15',
            'LET7B',
            'LET7B',
            85497440,
            85538754,
            '',
            '+',
            '',
            'LET7B',
        ],
        [
            'chr10', 'LIN28B', 'LIN28B', 45161233, 45217227, '', '-', '',
            'LIN28B'
        ],
    ]

    #lifted-over regions first, manual regions appended after them
    mm9_gff_full = mm9_gff + added_gff_regions

    utils.unParseTable(mm9_gff_full, mm9_gff_path, '\t')

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#=======================III. PLOTTING DATA IN MOUSE===================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #plot mouse regions
    plot_mouse_genes(mouse_dataFile, mm9_gff_path)