Beispiel #1
0
def main():

    '''
    main run function

    Parses the command line, works out the two dataset names and the merged
    analysis name, and then drives the dynamic enhancer workflow:
      part 1: locate the ROSE tables for both datasets and the merged gff
      part 2: run the delta R script and the ROSE gene mapper
      part 3: assign enhancer ranks, make the rank plot and finish the output

    The -g, -d, -r and -o flags are all required; if any is missing the usage
    text is printed and the process exits. Each later stage is gated on
    utils.checkOutput for the file the previous stage should have produced and
    exits with an error message when that check fails.
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] -r [ROSE_FOLDERS] -o [OUTPUT_FOLDER]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
                      help = "Enter the genome build (HG18,HG19,MM9) for the project")
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter the data file for the project")
    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a comma separated list of rose folder")
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the output folder for the project")

    #additional options
    parser.add_option("-n","--names", dest="names",nargs = 1, default=None,
                      help = "Enter a comma separated list of names to go with the datasets")
    parser.add_option("-p","--plot", dest="plot",action = 'store_true', default=False,
                      help = "If flagged, will plot differential regions")
    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "If flagged, will run analysis for all enhancers and not just supers.")

    (options,args) = parser.parse_args()

    print(options)
    print(args)

    #all four required flags must be supplied, otherwise show usage and quit
    if not (options.genome and options.data and options.rose and options.output):
        parser.print_help()
        sys.exit()

    genome = options.genome.upper()
    dataFile = options.data

    #exactly two comma separated rose folders are expected
    roseFolder1,roseFolder2 = options.rose.split(',')
    parentFolder = utils.formatFolder(options.output,True)

    if options.names:
        name1,name2 = options.names.split(',')
    else:
        #derive the names from the folder names
        #strip trailing slashes first so that 'NAME_ROSE/' does not yield an empty name
        name1 = roseFolder1.rstrip('/').split('/')[-1].replace('_ROSE','')
        name2 = roseFolder2.rstrip('/').split('/')[-1].replace('_ROSE','')

    mergeName = "%s_%s_merged" % (name1,name2)

    plotBam = options.plot
    superOnly = not options.all

    #the same label is used in every merged file name below
    if superOnly:
        enhancerType = 'SUPERS'
    else:
        enhancerType = 'ENHANCERS'

    if superOnly and plotBam:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (name1,name2,parentFolder))
    elif superOnly:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (name1,name2,parentFolder))
    elif plotBam:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (name1,name2,parentFolder))
    else:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (name1,name2,parentFolder))

    #part 1
    print("PART1: analyzing ROSE output from %s and %s" % (name1,name2))
    #start with the all enhancer tables from the initial rose calls
    roseFolder1 = pipeline_dfci.formatFolder(roseFolder1,False)
    roseFolder2 = pipeline_dfci.formatFolder(roseFolder2,False)
    superFile1 = '%s%s_peaks_SuperEnhancers.table.txt' % (roseFolder1,name1)
    superFile2 = '%s%s_peaks_SuperEnhancers.table.txt' % (roseFolder2,name2)

    #NOTE(review): these paths add an extra '/' compared with superFile1/2 above -
    #presumably harmless if formatFolder returns a trailing slash; confirm
    allFile1 = '%s/%s_peaks_AllEnhancers.table.txt' % (roseFolder1,name1)
    allFile2 = '%s/%s_peaks_AllEnhancers.table.txt' % (roseFolder2,name2)

    print('\tMERGING ENHANCERS AND CALLING ROSE')
    mergedGFFFile = '%s%s_%s_MERGED_%s_-0_+0.gff' % (parentFolder,genome,mergeName,enhancerType)
    #the merge call itself is disabled in this version of the script:
    #callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergedGFFFile,parentFolder)  (supers only)
    #callMergeSupers(dataFile,allFile1,allFile2,name1,name2,mergedGFFFile,parentFolder)      (all enhancers)

    superOutput = "%s%s_ROSE/%s_%s_MERGED_%s_-0_+0_SuperEnhancers_ENHANCER_TO_GENE.txt" % (parentFolder,name1,genome,mergeName,enhancerType)

    print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')
    if not utils.checkOutput(superOutput):
        print('ERROR: ROSE CALL FAILED')
        sys.exit()

    #part2 is the R script
    rcmd = callDeltaRScript(mergedGFFFile,parentFolder,name1,name2)
    print(rcmd)
    os.system(rcmd)
    time.sleep(30)
    callRoseGeneMapper(mergedGFFFile,genome,parentFolder,name1)

    #part 3
    #rank the delta
    print("PART 3: assinging ranks to differential enhancers")
    print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')
    gffName = '%s_%s_MERGED_%s_-0_+0' % (genome,mergeName,enhancerType)
    enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB.txt" % (parentFolder,name1,gffName)
    if not utils.checkOutput(enhancerToGeneFile):
        print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
        sys.exit()

    rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB_RANK.txt" % (parentFolder,name1,gffName)
    assignEnhancerRank(enhancerToGeneFile,allFile1,allFile2,name1,name2,rankOutput)

    #make the rank plot
    print('MAKING RANK PLOTS')
    if not utils.checkOutput(rankOutput):
        print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
        sys.exit()

    rcmd = callRankRScript(rankOutput,name1,name2,superFile1,superFile2)
    print(rcmd)
    os.system(rcmd)

    time.sleep(30)

    print('FINISHING OUTPUT')
    finishRankOutput(dataFile,rankOutput,genome,parentFolder,mergeName,name1,name2,1,100000,superOnly,plotBam)
Beispiel #2
0
# name2 = 'MAC_H4K12AC_1H_MINUS'


# main(dataFile,genome,mergeFolder,roseFolder1,roseFolder2,name1,name2,mergeName,True,True)



#=====================================================
#===================MYC NB LEVELS=====================
#=====================================================
#MYC vs MYCN in BE(2)C
#name of this merged analysis; also interpolated into mergeFolder below
mergeName = 'BE2C_MYC_SUPERS'
#genome build string for this comparison
genome ='hg18'
#path to the data table of the neuroblastoma project
dataFile = '/ark/home/cl512/projects/neuroblastoma/NEURO_TABLE.txt'
#per-analysis folder, one sub folder per mergeName
mergeFolder = '/ark/home/cl512/projects/neuroblastoma/MYC_analysis/dynamicEnhancer/%s/' % (mergeName)
#NOTE(review): the second argument (True) presumably asks formatFolder to create the folder if missing - confirm
mergeFolder = pipeline_dfci.formatFolder(mergeFolder,True)
#the two rose output folders that are being compared
roseFolder1 = '/ark/home/cl512/projects/neuroblastoma/MYC_analysis/be2c_mycn_rose/'
roseFolder2 = '/ark/home/cl512/projects/neuroblastoma/MYC_analysis/be2c_myc_rose/'

#dataset names; presumably name1 goes with roseFolder1 and name2 with roseFolder2 - confirm against the data table
name1 = 'BE2C_MYCN'
name2 = 'BE2C_MYC'


#main(dataFile,genome,mergeFolder,roseFolder1,roseFolder2,name1,name2,mergeName,True,True)




# #=====================================================
# #=======================786-O_SUPERS==================
# #=====================================================
Beispiel #3
0
def finishRankOutput(dataFile,rankOutput,genome,mergeFolder,mergeName,name1,name2,cutOff=1.5,window = 100000,superOnly=True,plotBam=True):

    '''
    cleans up the rank output table
    makes a gff of all of the gained/lost supers beyond
    a certain cutoff w/ a window
    makes a list of gained genes and lost genes
    makes a bed of gained loss

    dataFile:    path passed to pipeline_dfci.loadDataTable; the 'bam' entries
                 for name1 and name2 are read from it when plotBam is set
    rankOutput:  tab delimited rank table read with utils.parseTable
    genome:      genome build, upper cased before use in file names
    mergeFolder: analysis folder; output goes to mergeFolder + 'output/'
    mergeName:   label used in all output file names
    name1/name2: dataset names; rows whose column 6 value is above cutOff are
                 labelled with name2 (gained), rows below -cutOff with name1 (lost)
    cutOff:      threshold applied to column 6 of the rank table
    window:      number of bp added on each side for the windowed gffs
    superOnly:   selects the SUPERS vs ENHANCERS wording in names and bed headers
    plotBam:     if set, runs bamPlot_turbo.py on the gained/lost gffs

    Writes the formatted rank table, four gffs, one bed and a gene table, and
    copies the pdf/png plots from the <name1>_ROSE folder. Returns None.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    #making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    genome = genome.upper()
    #floor division keeps the KB label an integer (100, not 100.0) on python 2 and 3
    windowKB = window // 1000

    #make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder+'output/',True)

    #bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput,'\t')

    #make a new formatted table
    #note: this renames two columns of the parsed header row in place
    header = rankEnhancerTable[0]
    header[-4] = 'DELTA RANK'
    header[-3] = 'IS_SUPER'
    formattedRankTable =[header]

    #the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    #wording that differs between the super-only and the all-enhancer run
    if superOnly:
        enhancerType = 'SUPERS'
        trackLabel = 'SEs'
        trackDescription = 'super enhancers'
    else:
        enhancerType = 'ENHANCERS'
        trackLabel = 'enhancers'
        trackDescription = 'enhancers'

    #the beds, each one starts with its own track header line
    gainedTrackHeader = 'track name="%s %s only %s" description="%s %s that are found only in %s vs %s" itemRGB=On color=255,0,0' % (genome,name2,trackLabel,genome,trackDescription,name2,name1)
    gainedBed = [[gainedTrackHeader]]

    conservedTrackHeader = 'track name="%s %s and %s %s" description="%s %s that are found in both %s vs %s" itemRGB=On color=0,0,0' % (genome,name1,name2,trackLabel,genome,trackDescription,name1,name2)
    conservedBed = [[conservedTrackHeader]]

    lostTrackHeader = 'track name="%s %s only %s" description="%s %s that are found only in %s vs %s" itemRGB=On color=0,255,0' % (genome,name1,trackLabel,genome,trackDescription,name1,name2)
    lostBed = [[lostTrackHeader]]

    #the genes
    geneTable =[['GENE','ENHANCER_ID','ENHANCER_CHROM','ENHANCER_START','ENHANCER_STOP',header[6],header[7],header[8],'STATUS']]

    for line in rankEnhancerTable[1:]:
        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankTable.append(line)

        #getting the genes from the three comma separated gene columns
        geneList = []
        for geneColumn in (line[9],line[10],line[11]):
            geneList += geneColumn.split(',')
        geneList = [x for x in geneList if len(x) >0]
        geneList = utils.uniquify(geneList)
        geneString = ','.join(geneList)

        bedLine = [line[1],line[2],line[3],line[0],line[-4]]

        delta = float(line[6])

        #for gained
        if delta > cutOff:
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            gainedGFF.append(gffLine)
            gainedWindowGFF.append(gffWindowLine)
            geneStatus = name2
            gainedBed.append(bedLine)
        #for lost
        elif delta < (-1 * cutOff):
            gffLine = [line[1],line[0],'',line[2],line[3],'','.','',geneString]
            gffWindowLine = [line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString]
            lostGFF.append(gffLine)
            lostWindowGFF.append(gffWindowLine)
            geneStatus = name1
            lostBed.append(bedLine)
        #for conserved
        else:
            geneStatus = 'CONSERVED'
            conservedBed.append(bedLine)

        #now fill in the gene Table
        for gene in geneList:
            geneTableLine = [gene,line[0],line[1],line[2],line[3],line[6],line[7],line[8],geneStatus]
            geneTable.append(geneTableLine)

    #concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    #start writing the output
    #there's the two gffs, the bed,the formatted table, the gene table

    #formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(formattedRankTable,formattedFilename,'\t')

    #gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder+'gff/',True)
    gffFilename_gained = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,name2.upper(),enhancerType)
    gffFilenameWindow_gained = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,name2.upper(),enhancerType,windowKB,windowKB)

    gffFilename_lost = "%s%s_%s_%s_ONLY_%s_-0_+0.gff" % (gffFolder,genome,mergeName,name1.upper(),enhancerType)
    gffFilenameWindow_lost = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff" % (gffFolder,genome,mergeName,name1.upper(),enhancerType,windowKB,windowKB)

    utils.unParseTable(gainedGFF,gffFilename_gained,'\t')
    utils.unParseTable(gainedWindowGFF,gffFilenameWindow_gained,'\t')

    utils.unParseTable(lostGFF,gffFilename_lost,'\t')
    utils.unParseTable(lostWindowGFF,gffFilenameWindow_lost,'\t')

    #bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(fullBed,bedFilename,'\t')

    #geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (outputFolder,genome,mergeName,enhancerType)
    utils.unParseTable(geneTable,geneFilename,'\t')

    #finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_%s_DELTA.pdf" % (mergeFolder,name1,outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % (mergeFolder,name1,outputFolder,genome,mergeName,enhancerType)
    os.system(cmd)

    #now execute the bamPlot_turbo.py commands
    if plotBam:
        bam1 = dataDict[name1]['bam']
        bam2 = dataDict[name2]['bam']
        bamString = "%s,%s" % (bam1,bam2)
        nameString = "%s,%s" % (name1,name2)
        colorString = "0,0,0:100,100,100"

        #change dir
        #pipelineDir is not defined in this function - presumably a module level setting
        os.chdir(pipelineDir)

        #collect (gff, title) pairs in the order gained, gained window, lost, lost window
        #the titles say SE even for an all-enhancer run, exactly as before
        plotJobs = []
        if len(gainedGFF) > 0:
            plotJobs.append((gffFilename_gained,"%s_ONLY_SE" % (name2)))
            plotJobs.append((gffFilenameWindow_gained,"%s_ONLY_SE_%sKB_WINDOW" % (name2,windowKB)))

        if len(lostGFF) > 0:
            plotJobs.append((gffFilename_lost,"%s_ONLY_SE" % (name1)))
            plotJobs.append((gffFilenameWindow_lost,"%s_ONLY_SE_%sKB_WINDOW" % (name1,windowKB)))

        for gffFilename,plotTitle in plotJobs:
            cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (genome,bamString,gffFilename,outputFolder,nameString,colorString,plotTitle)
            os.system(cmd)


    return
Beispiel #4
0
def main():
    """
    main run function

    Parses the command line, validates the inputs, builds the per-region
    density tables with makeBamPlotTables, writes the R plotting command
    returned by callRPlot to a bash file and runs it, and finally removes the
    temporary folder unless --save-temp was given.

    The region input (-i) can be a readable .bed file, any other readable
    file (parsed as a .gff), or a coordinate string such as chr1:+:1-1000.

    Exits (SystemExit) after printing a message on an unsupported genome, a
    missing output folder, a malformed coordinate string, a name/bam count
    mismatch or an unknown plot style. Unreadable bam files and invalid strand
    values trip an assertion.
    """

    #usage = "usage: %prog [options] -g [GENOME] -b [SORTED BAMFILE(S)] -i [INPUTFILE] -o [OUTPUTFOLDER]"
    parser = argparse.ArgumentParser(usage='%(prog)s [options]')

    # required flags
    parser.add_argument(
        "-b",
        "--bam",
        dest="bam",
        nargs='*',
        help="Enter a comma separated list of .bam files to be processed.",
        required=True)
    parser.add_argument(
        "-i",
        "--input",
        dest="input",
        type=str,
        help="Enter .gff or genomic region e.g. chr1:+:1-1000.",
        required=True)
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        type=str,
        help="specify a genome, HG18,HG19,MM8,MM9,MM10 are currently supported",
        required=True)

    # output flag
    parser.add_argument("-o",
                        "--output",
                        dest="output",
                        type=str,
                        help="Enter the output folder.",
                        required=True)
    # additional options
    parser.add_argument(
        "--stretch-input",
        dest="stretch_input",
        default=None,
        type=int,
        help=
        "Stretch the input regions to a minimum length in bp, e.g. 10000 (for 10kb)"
    )
    parser.add_argument(
        "-c",
        "--color",
        dest="color",
        default=None,
        help=
        "Enter a colon separated list of colors e.g. 255,0,0:255,125,0, default samples the rainbow"
    )
    parser.add_argument(
        "-s",
        "--sense",
        dest="sense",
        default='both',
        help="Map to '+','-' or 'both' strands. Default maps to both.")
    parser.add_argument("-e",
                        "--extension",
                        dest="extension",
                        default=200,
                        help="Extends reads by n bp. Default value is 200bp")
    parser.add_argument(
        "-r",
        "--rpm",
        dest="rpm",
        action='store_true',
        default=False,
        help="Normalizes density to reads per million (rpm) Default is False")
    parser.add_argument(
        "-y",
        "--yScale",
        dest="yScale",
        default="relative",
        help=
        "Choose either relative or uniform y axis scaling. options = 'relative,uniform' Default is relative scaling"
    )
    parser.add_argument(
        "-n",
        "--names",
        dest="names",
        default=None,
        help="Enter a comma separated list of names for your bams")
    parser.add_argument(
        "-p",
        "--plot",
        dest="plot",
        default="MULTIPLE",
        help=
        "Choose either all lines on a single plot or multiple plots. options = 'SINGLE,MULTIPLE,MERGE'"
    )
    parser.add_argument(
        "-t",
        "--title",
        dest="title",
        default='',
        help=
        "Specify a title for the output plot(s), default will be the coordinate region"
    )

    # DEBUG OPTION TO SAVE TEMP FILES
    parser.add_argument(
        "--scale",
        dest="scale",
        default='',
        help=
        "Enter a comma separated list of scaling factors for your bams. Default is none"
    )
    parser.add_argument(
        "--save-temp",
        dest="save",
        action='store_true',
        default=False,
        help="If flagged will save temporary files made by bamPlot")
    parser.add_argument("--bed",
                        dest="bed",
                        help="Add a space-delimited list of bed files to plot")
    parser.add_argument(
        "--multi-page",
        dest="multi",
        action='store_true',
        default=False,
        help="If flagged will create a new pdf for each region")

    args = parser.parse_args()

    print(args)

    # argparse already enforces the required flags; this additionally rejects
    # empty values such as "-b" given with no files
    if not (args.bam and args.input and args.genome and args.output):
        parser.print_help()
        sys.exit()

    # Support a legacy mode where a ',' delimited multiple files
    bamFileList = args.bam
    if len(args.bam) == 1:
        bamFileList = args.bam[0].split(',')

    # Make sure these are actually files & readable (!)
    for filename in bamFileList:
        assert (os.access(filename, os.R_OK))

    # bringing in any beds
    if args.bed:
        bedFileList = args.bed
        if isinstance(bedFileList, str):
            # NOTE(review): the help text says space-delimited, but the value
            # is split on ',' - confirm which one is intended
            bedFileList = args.bed.split(',')
        print(bedFileList)
        bedCollection = makeBedCollection(bedFileList)
    else:
        bedCollection = utils.LocusCollection([], 50)

    # Load the input for graphing. One of:
    # - A .gff
    # - A .bed
    # - a specific input region (e.g. chr10:.:93150000-93180000)

    valid_sense_options = {'+', '-', '.'}
    if os.access(args.input, os.R_OK):
        if args.input.endswith('.bed'):
            # Uniquely graph every input of this bed
            parsed_input_bed = utils.parseTable(args.input, '\t')
            gffName = os.path.basename(args.input)  # Graph title
            gff = None
            try:
                if parsed_input_bed[0][5] in valid_sense_options:
                    # This .bed might have a sense parameter
                    gff = [[
                        e[0], '', args.input, e[1], e[2], '', e[5], '', ''
                    ] for e in parsed_input_bed]
            except IndexError:
                # fewer than six columns (or an empty file): no strand column
                pass

            if gff is None:
                print(
                    "Your bed doesn't have a valid senese parameter. Defaulting to both strands, '.'"
                )
                # We only take chr/start/stop and ignore everything else.
                gff = [[e[0], '', args.input, e[1], e[2], '', '.', '', '']
                       for e in parsed_input_bed]
        else:
            # Default to .gff, since that's the original behavior
            gff = utils.parseTable(args.input, '\t')
            gffName = args.input.split('/')[-1].split('.')[0]
    else:
        # means a coordinate line has been given e.g. chr1:+:1-100
        chromLine = args.input.split(':')
        try:
            chrom = chromLine[0]
            sense = chromLine[1]
            # a missing coordinate field or a range without exactly one '-'
            # is reported like any other malformed input line
            [start, end] = chromLine[2].split('-')
        except (IndexError, ValueError):
            print(
                'Invalid input line or inaccessible file. Try: chr1:.:1-5000'
            )
            sys.exit()
        assert (sense in valid_sense_options)
        if chrom[0:3] != 'chr':
            print('ERROR: UNRECOGNIZED GFF OR CHROMOSOME LINE INPUT')
            sys.exit()
        gffLine = [chrom, '', args.input, start, end, '', sense, '', '']
        gffName = "%s_%s_%s_%s" % (chrom, sense, start, end)
        gff = [gffLine]

    # Consider stretching the regions to a fixed minimum size
    if args.stretch_input:
        print('Stretching inputs to a minimum of: %d bp' %
              (args.stretch_input))
        minLength = args.stretch_input
        stretchGff = []
        for e in gff:
            difference = int(e[4]) - int(e[3])
            if difference < minLength:
                # pad both sides equally; an odd shortfall leaves the region
                # one bp short of minLength
                pad = int((minLength - difference) / 2)
                stretchGff.append([
                    e[0], e[1], e[2],
                    int(e[3]) - pad,
                    int(e[4]) + pad, e[5], e[6], e[7], e[8]
                ])
            else:
                stretchGff.append(e)

        gff = stretchGff

    # Sanity test the gff object
    assert (all([e[6] in valid_sense_options
                 for e in gff]))  # All strands are sane
    #assert(all([int(e[3]) < int(e[4]) for e in gff]))  # All start/stops are ordered

    # bring in the genome
    genome = args.genome.upper()
    if genome not in ('HG18', 'HG19', 'HG19_RIBO', 'MM9', 'MM10', 'RN4'):
        print(
            'ERROR: UNSUPPORTED GENOME TYPE %s. USE HG19,HG18, RN4, MM9, or MM10'
            % (genome))
        parser.print_help()
        sys.exit()

    # bring in the rest of the options

    # output
    rootFolder = args.output
    if rootFolder[-1] != '/':
        rootFolder += '/'
    try:
        # listing the folder doubles as an existence/readability check
        os.listdir(rootFolder)
    except OSError:
        print('ERROR: UNABLE TO FIND OUTPUT DIRECTORY %s' % (rootFolder))
        sys.exit()

    # Get analysis title
    if len(args.title) == 0:
        title = gffName
    else:
        title = args.title

    # make a temp folder
    tempFolder = rootFolder + title + '/'
    print("CREATING TEMP FOLDER %s" % (tempFolder))
    pipeline_dfci.formatFolder(tempFolder, create=True)

    # colors
    if args.color:
        colorList = args.color.split(':')
        colorList = [x.split(',') for x in colorList]
        if len(colorList) < len(bamFileList):
            print(
                'WARNING: FEWER COLORS THAN BAMS SPECIFIED. COLORS WILL BE RECYCLED'
            )
            # recycling the color list; floor division because a list can
            # only be repeated an integer number of times
            colorList += colorList * (len(bamFileList) // len(colorList))
            colorList = colorList[0:len(bamFileList)]

    else:
        # cycles through the colors of the rainbow
        colorList = tasteTheRainbow(len(bamFileList))

    # sense
    sense = args.sense

    extension = int(args.extension)

    rpm = args.rpm

    scale = args.scale

    yScale = args.yScale.upper()

    # names
    if args.names:
        names = args.names.split(',')

        if len(names) != len(bamFileList):
            print(
                'ERROR: NUMBER OF NAMES AND NUMBER OF BAMS DO NOT CORRESPOND'
            )
            parser.print_help()
            sys.exit()
    else:
        names = [x.split('/')[-1] for x in bamFileList]

    # plot style
    plotStyle = args.plot.upper()
    if plotStyle not in ('SINGLE', 'MULTIPLE', 'MERGE'):
        print('ERROR: PLOT STYLE %s NOT AN OPTION' % (plotStyle))
        parser.print_help()
        sys.exit()

    # now run!
    # nBins is not defined in this function - presumably a module level setting
    summaryTableFileName = makeBamPlotTables(gff, genome, bamFileList,
                                             colorList, nBins, sense,
                                             extension, rpm, tempFolder,
                                             names, title, bedCollection,
                                             scale)
    print("%s is the summary table" % (summaryTableFileName))

    #running the R command to plot
    multi = args.multi
    outFile = "%s%s_plots.pdf" % (rootFolder, title)
    rCmd = callRPlot(summaryTableFileName, outFile, yScale, plotStyle,
                     multi)

    # write the R command to a bash file; the with block closes the file
    # even if a write fails
    bashFileName = "%s%s_Rcmd.sh" % (tempFolder, title)
    with open(bashFileName, 'w') as bashFile:
        bashFile.write('#!/usr/bin/bash\n')
        bashFile.write(rCmd)
    print("Wrote R command to %s" % (bashFileName))
    os.system("bash %s" % (bashFileName))

    # delete temp files
    if not args.save:
        if utils.checkOutput(outFile, 1, 10):
            # This is super dangerous (!). Add some sanity checks.
            assert (" " not in tempFolder)
            # compare by value: identity against a string literal is not a
            # reliable way to detect "/"
            assert (tempFolder != "/")
            removeCommand = "rm -rf %s" % (tempFolder)
            print(removeCommand)
            os.system(removeCommand)
        else:
            print("ERROR: NO OUTPUT FILE %s DETECTED" % (outFile))
Beispiel #5
0
#mask Files
#NOTE(review): projectFolder is not assigned above this line in the visible code - presumably defined earlier in the script; confirm
maskFile = '%smasks/hg19_encode_blacklist.bed' % (projectFolder)

#genomeDirectory
genomeDirectory = '/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/'

#making folders
#NOTE(review): none of these folder variables are assigned above this point in the visible code - presumably defined earlier; confirm
folderList = [
    gffFolder, macsFolder, macsEnrichedFolder, mappedEnrichedFolder,
    mappedFolder, wiggleFolder, metaFolder, metaRoseFolder, fastaFolder,
    figureCodeFolder, figuresFolder, geneListFolder, bedFolder, signalFolder,
    tableFolder
]

#every folder goes through formatFolder with True as the second argument
#(presumably "create if missing" - confirm against pipeline_dfci)
for folder in folderList:
    pipeline_dfci.formatFolder(folder, True)

#==========================================================================
#============================LIST OF DATAFILES=============================
#==========================================================================

#this project will utilize multiple datatables
#data tables are organized largely by type/system
#some data tables overlap for ease of analysis

#ATAC-Seq
atac_dataFile = '%sdata_tables/ATAC_TABLE.txt' % (projectFolder)

#ChIP-Seq
be2c_dataFile = '%sdata_tables/BE2C_TABLE.txt' % (projectFolder)
mm1s_dataFile = '%sdata_tables/MM1S_TABLE.txt' % (projectFolder)
#root folder of the project; every standard folder below is a sub folder of it
#projectName is not assigned in the visible code - presumably set earlier in the script
projectFolder = '/grail/projects/%s/' % (projectName) #PATH TO YOUR PROJECT FOLDER

#standard folder names
gffFolder ='%sgff/' % (projectFolder)
macsFolder = '%smacsFolder/' % (projectFolder)
macsEnrichedFolder = '%smacsEnriched/' % (projectFolder)
mappedEnrichedFolder = '%smappedEnriched/' % (projectFolder)
mappedFolder = '%smappedFolder/' % (projectFolder)
wiggleFolder = '%swiggles/' % (projectFolder)
metaFolder = '%smeta/' % (projectFolder)

#making folders
folderList = [gffFolder,macsFolder,macsEnrichedFolder,mappedEnrichedFolder,mappedFolder,wiggleFolder,metaFolder]

#every folder goes through formatFolder with True as the second argument
#(presumably "create if missing" - confirm against pipeline_dfci)
for folder in folderList:
    pipeline_dfci.formatFolder(folder,True)



#==========================================================================
#========================FORMATTING SAMPLE TABLE===========================
#==========================================================================

##THIS SECTION CREATES A DATA TABLE FROM A WHITEHEAD ANNOTATION SPREADSHEET

##give full path
##sampleTableFile = 'YOUR_WIGTC_ANNOTATION.xls' #<- the .xls file in the seq data folder provided by WI

#dirpath = ''  <- provide full path of folder containing raw seq files
##e.g. /ark/home/jr246/raw/130925_..../QualityScore/
Beispiel #7
0
def finishRankOutput(dataFile, rankOutput, genome, mergeFolder, mergeName, name1, name2, cutOff=1.5, window=100000):
    """
    Clean up the rank output table and write all final output files.

    Reads the tab-delimited rank table ``rankOutput``, classifies every row by
    the value in column index 6 and writes, under ``<mergeFolder>output/``:

      * the re-headed rank table,
      * four GFFs (gained / lost regions, each with and without a flanking
        window) under ``gff/``,
      * one bed file holding the gained, conserved and lost tracks,
      * a gene table with one row per (gene, region) pair.

    It then copies the ROSE plots into the output folder and runs
    ``bamPlot_turbo.py`` on every non-empty gained/lost GFF.

    Args:
        dataFile: path handed to ``pipeline_dfci.loadDataTable``; the loaded
            table must have a ``'bam'`` entry for ``name1`` and ``name2``.
        rankOutput: path of the tab-delimited rank table to clean up.
        genome: genome build name; upper-cased before use in file names.
        mergeFolder: folder holding ``<name1>_ROSE/``; concatenated directly
            with sub-paths, so it is expected to end with ``/``.
        mergeName: name fragment used in every output file name.
        name1: first dataset name; rows below ``-cutOff`` are labelled with it.
        name2: second dataset name; rows above ``cutOff`` are labelled with it.
        cutOff: threshold applied to column 6 (coerced to float).
        window: flank in bp added to both sides for the windowed GFFs
            (coerced to int).

    Returns:
        None.
    """
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    # making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    # str methods instead of the string-module functions removed in Python 3
    genome = genome.upper()
    name1Upper = name1.upper()
    name2Upper = name2.upper()
    # floor division keeps file names/titles at e.g. "100KB" rather than
    # "100.0KB" when run under true division
    windowKB = window // 1000

    # make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder + "output/", True)

    # bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput, "\t")

    # make a new formatted table (the header row is renamed in place)
    header = rankEnhancerTable[0]
    header[-4] = "DELTA RANK"
    header[-3] = "IS_SUPER"
    formattedRankTable = [header]

    # the gffs
    gainedGFF = []
    lostGFF = []
    gainedWindowGFF = []
    lostWindowGFF = []

    # the beds; gained and lost share one header template and differ only in
    # which dataset comes first and in the track color
    onlyTrackHeader = (
        'track name="%s %s only SEs" description="%s super enhancers that are found only in %s vs %s" '
        'itemRGB=On color=%s'
    )
    gainedBed = [[onlyTrackHeader % (genome, name2, genome, name2, name1, "255,0,0")]]
    conservedTrackHeader = (
        'track name="%s %s and %s SEs" description="%s super enhancers that are found in both %s vs %s" itemRGB=On color=0,0,0'
        % (genome, name1, name2, genome, name1, name2)
    )
    conservedBed = [[conservedTrackHeader]]
    lostBed = [[onlyTrackHeader % (genome, name1, genome, name1, name2, "0,255,0")]]

    # the genes
    geneTable = [
        [
            "GENE",
            "ENHANCER_ID",
            "ENHANCER_CHROM",
            "ENHANCER_START",
            "ENHANCER_STOP",
            header[6],
            header[7],
            header[8],
            "STATUS",
        ]
    ]

    for line in rankEnhancerTable[1:]:
        # fixing the enhancer ID
        line[0] = line[0].replace("_lociStitched", "")
        formattedRankTable.append(line)

        # getting the genes: columns 9-11 each hold a comma separated list
        geneList = []
        for column in (9, 10, 11):
            geneList += line[column].split(",")
        geneList = utils.uniquify([gene for gene in geneList if gene])
        geneString = ",".join(geneList)

        bedLine = [line[1], line[2], line[3], line[0], line[-4]]

        # classify the region by the value in column 6
        delta = float(line[6])
        if delta > cutOff:
            geneStatus = name2
            targets = (gainedGFF, gainedWindowGFF, gainedBed)
        elif delta < -cutOff:
            geneStatus = name1
            targets = (lostGFF, lostWindowGFF, lostBed)
        else:
            geneStatus = "CONSERVED"
            targets = None

        if targets is None:
            conservedBed.append(bedLine)
        else:
            gffLine = [line[1], line[0], "", line[2], line[3], "", ".", "", geneString]
            gffWindowLine = [
                line[1],
                line[0],
                "",
                int(line[2]) - window,
                int(line[3]) + window,
                "",
                ".",
                "",
                geneString,
            ]
            gffList, windowGFFList, bedList = targets
            gffList.append(gffLine)
            windowGFFList.append(gffWindowLine)
            bedList.append(bedLine)

        # now fill in the gene Table
        for gene in geneList:
            geneTable.append([gene, line[0], line[1], line[2], line[3], line[6], line[7], line[8], geneStatus])

    # concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    # start writing the output
    # there's the two gffs, the bed,the formatted table, the gene table

    # formatted table
    formattedFilename = "%s%s_%s_MERGED_SUPERS_RANK_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(formattedRankTable, formattedFilename, "\t")

    # gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder + "gff/", True)
    exactTemplate = "%s%s_%s_%s_ONLY_SUPERS_-0_+0.gff"
    windowTemplate = "%s%s_%s_%s_ONLY_SUPERS_-%sKB_+%sKB.gff"

    gffFilename_gained = exactTemplate % (gffFolder, genome, mergeName, name2Upper)
    gffFilenameWindow_gained = windowTemplate % (gffFolder, genome, mergeName, name2Upper, windowKB, windowKB)

    gffFilename_lost = exactTemplate % (gffFolder, genome, mergeName, name1Upper)
    gffFilenameWindow_lost = windowTemplate % (gffFolder, genome, mergeName, name1Upper, windowKB, windowKB)

    utils.unParseTable(gainedGFF, gffFilename_gained, "\t")
    utils.unParseTable(gainedWindowGFF, gffFilenameWindow_gained, "\t")

    utils.unParseTable(lostGFF, gffFilename_lost, "\t")
    utils.unParseTable(lostWindowGFF, gffFilenameWindow_lost, "\t")

    # bed
    bedFilename = "%s%s_%s_MERGED_SUPERS.bed" % (outputFolder, genome, mergeName)
    utils.unParseTable(fullBed, bedFilename, "\t")

    # geneTable
    geneFilename = "%s%s_%s_MERGED_SUPERS_GENE_TABLE.txt" % (outputFolder, genome, mergeName)
    utils.unParseTable(geneTable, geneFilename, "\t")

    # finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_SUPERS_DELTA.pdf" % (mergeFolder, name1, outputFolder, genome, mergeName)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_SUPERS_RANK_PLOT.png" % (
        mergeFolder,
        name1,
        outputFolder,
        genome,
        mergeName,
    )
    os.system(cmd)

    # now execute the bamPlot_turbo.py commands
    bam1 = dataDict[name1]["bam"]
    bam2 = dataDict[name2]["bam"]
    bamString = "%s,%s" % (bam1, bam2)
    nameString = "%s,%s" % (name1, name2)
    colorString = "0,0,0:100,100,100"

    # change dir: bamPlot_turbo.py is invoked by relative name below
    # NOTE(review): hard-coded, user-specific pipeline location
    os.chdir("/ark/home/cl512/pipeline/")

    # collect (gff, title) pairs in the order they must run:
    # gained, gained window, lost, lost window; empty GFFs are skipped
    plotJobs = []
    if gainedGFF:
        plotJobs.append((gffFilename_gained, "%s_ONLY_SE" % (name2)))
        plotJobs.append((gffFilenameWindow_gained, "%s_ONLY_SE_%sKB_WINDOW" % (name2, windowKB)))
    if lostGFF:
        plotJobs.append((gffFilename_lost, "%s_ONLY_SE" % (name1)))
        plotJobs.append((gffFilenameWindow_lost, "%s_ONLY_SE_%sKB_WINDOW" % (name1, windowKB)))

    for gffFilename, plotTitle in plotJobs:
        cmd = "python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE" % (
            genome,
            bamString,
            gffFilename,
            outputFolder,
            nameString,
            colorString,
            plotTitle,
        )
        os.system(cmd)
Beispiel #8
0
def finishRankOutput(dataFile,
                     rankOutput,
                     genome,
                     mergeFolder,
                     mergeName,
                     name1,
                     name2,
                     cutOff=1.5,
                     window=100000,
                     superOnly=True,
                     plotBam=True):
    '''
    Clean up the rank output table and write all final output files.

    Reads the tab-delimited rank table ``rankOutput``, classifies every row by
    the value in column index 6 and writes, under ``<mergeFolder>output/``:
    the re-headed rank table, four GFFs (gained / lost regions, each with and
    without a flanking window) under ``gff/``, one bed file holding the
    gained, conserved and lost tracks, and a gene table with one row per
    (gene, region) pair. The ROSE plots are copied into the output folder and,
    when ``plotBam`` is set, ``bamPlot_turbo.py`` is run on every non-empty
    gained/lost GFF.

    Args:
        dataFile: path handed to ``pipeline_dfci.loadDataTable``; when
            plotting, the table must have a ``'bam'`` entry for both names.
        rankOutput: path of the tab-delimited rank table to clean up.
        genome: genome build name; upper-cased before use in file names.
        mergeFolder: folder holding ``<name1>_ROSE/``; concatenated directly
            with sub-paths, so it is expected to end with ``/``.
        mergeName: name fragment used in every output file name.
        name1: first dataset name; rows below ``-cutOff`` are labelled with it.
        name2: second dataset name; rows above ``cutOff`` are labelled with it.
        cutOff: threshold applied to column 6 (coerced to float).
        window: flank in bp added to both sides for the windowed GFFs
            (coerced to int).
        superOnly: selects the ``SUPERS`` vs ``ENHANCERS`` wording used in
            file names and bed track headers.
        plotBam: when false, the bamPlot_turbo.py step is skipped entirely.

    Returns:
        None.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    #making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    #str methods instead of the string-module functions removed in Python 3
    genome = genome.upper()
    name1Upper = name1.upper()
    name2Upper = name2.upper()
    #floor division keeps file names/titles at e.g. "100KB" rather than
    #"100.0KB" when run under true division
    windowKB = window // 1000

    #make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder + 'output/', True)

    #bring in the old rank table
    rankEnhancerTable = utils.parseTable(rankOutput, '\t')

    #make a new formatted table (the header row is renamed in place)
    header = rankEnhancerTable[0]
    header[-4] = 'DELTA RANK'
    header[-3] = 'IS_SUPER'
    formattedRankTable = [header]

    #the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    #wording that depends on whether only supers are analyzed
    if superOnly:
        enhancerType = 'SUPERS'
        shortLabel = 'SEs'
        longLabel = 'super enhancers'
    else:
        enhancerType = 'ENHANCERS'
        shortLabel = 'enhancers'
        longLabel = 'enhancers'

    #the beds; gained and lost share one header template and differ only in
    #which dataset comes first and in the track color
    onlyTrackHeader = 'track name="%s %s only %s" description="%s %s that are found only in %s vs %s" itemRGB=On color=%s'
    gainedTrackHeader = onlyTrackHeader % (genome, name2, shortLabel, genome,
                                           longLabel, name2, name1,
                                           '255,0,0')
    gainedBed = [[gainedTrackHeader]]
    conservedTrackHeader = 'track name="%s %s and %s %s" description="%s %s that are found in both %s vs %s" itemRGB=On color=0,0,0' % (
        genome, name1, name2, shortLabel, genome, longLabel, name1, name2)
    conservedBed = [[conservedTrackHeader]]
    lostTrackHeader = onlyTrackHeader % (genome, name1, shortLabel, genome,
                                         longLabel, name1, name2, '0,255,0')
    lostBed = [[lostTrackHeader]]

    #the genes
    geneTable = [[
        'GENE', 'ENHANCER_ID', 'ENHANCER_CHROM', 'ENHANCER_START',
        'ENHANCER_STOP', header[6], header[7], header[8], 'STATUS'
    ]]

    for line in rankEnhancerTable[1:]:
        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched', '')
        formattedRankTable.append(line)

        #getting the genes: columns 9-11 each hold a comma separated list
        geneList = []
        for column in (9, 10, 11):
            geneList += line[column].split(',')
        geneList = utils.uniquify([gene for gene in geneList if gene])
        geneString = ','.join(geneList)

        bedLine = [line[1], line[2], line[3], line[0], line[-4]]

        #classify the region by the value in column 6
        delta = float(line[6])
        if delta > cutOff:
            geneStatus = name2
            targets = (gainedGFF, gainedWindowGFF, gainedBed)
        elif delta < -cutOff:
            geneStatus = name1
            targets = (lostGFF, lostWindowGFF, lostBed)
        else:
            geneStatus = 'CONSERVED'
            targets = None

        if targets is None:
            conservedBed.append(bedLine)
        else:
            gffLine = [
                line[1], line[0], '', line[2], line[3], '', '.', '', geneString
            ]
            gffWindowLine = [
                line[1], line[0], '',
                int(line[2]) - window,
                int(line[3]) + window, '', '.', '', geneString
            ]
            gffList, windowGFFList, bedList = targets
            gffList.append(gffLine)
            windowGFFList.append(gffWindowLine)
            bedList.append(bedLine)

        #now fill in the gene Table
        for gene in geneList:
            geneTable.append([
                gene, line[0], line[1], line[2], line[3], line[6], line[7],
                line[8], geneStatus
            ])

    #concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    #start writing the output
    #there's the two gffs, the bed,the formatted table, the gene table

    #formatted table
    formattedFilename = "%s%s_%s_MERGED_%s_RANK_TABLE.txt" % (
        outputFolder, genome, mergeName, enhancerType)
    utils.unParseTable(formattedRankTable, formattedFilename, '\t')

    #gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder + 'gff/', True)
    exactTemplate = "%s%s_%s_%s_ONLY_%s_-0_+0.gff"
    windowTemplate = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff"

    gffFilename_gained = exactTemplate % (gffFolder, genome, mergeName,
                                          name2Upper, enhancerType)
    gffFilenameWindow_gained = windowTemplate % (gffFolder, genome, mergeName,
                                                 name2Upper, enhancerType,
                                                 windowKB, windowKB)

    gffFilename_lost = exactTemplate % (gffFolder, genome, mergeName,
                                        name1Upper, enhancerType)
    gffFilenameWindow_lost = windowTemplate % (gffFolder, genome, mergeName,
                                               name1Upper, enhancerType,
                                               windowKB, windowKB)

    utils.unParseTable(gainedGFF, gffFilename_gained, '\t')
    utils.unParseTable(gainedWindowGFF, gffFilenameWindow_gained, '\t')

    utils.unParseTable(lostGFF, gffFilename_lost, '\t')
    utils.unParseTable(lostWindowGFF, gffFilenameWindow_lost, '\t')

    #bed
    bedFilename = "%s%s_%s_MERGED_%s.bed" % (outputFolder, genome, mergeName,
                                             enhancerType)
    utils.unParseTable(fullBed, bedFilename, '\t')

    #geneTable
    geneFilename = "%s%s_%s_MERGED_%s_GENE_TABLE.txt" % (
        outputFolder, genome, mergeName, enhancerType)
    utils.unParseTable(geneTable, geneFilename, '\t')

    #finally, move all of the plots to the output folder
    cmd = "cp %s%s_ROSE/*.pdf %s%s_%s_MERGED_%s_DELTA.pdf" % (
        mergeFolder, name1, outputFolder, genome, mergeName, enhancerType)
    os.system(cmd)

    cmd = "cp %s%s_ROSE/*RANK_PLOT.png %s%s_%s_MERGED_%s_RANK_PLOT.png" % (
        mergeFolder, name1, outputFolder, genome, mergeName, enhancerType)
    os.system(cmd)

    #now execute the bamPlot_turbo.py commands
    if not plotBam:
        return

    bam1 = dataDict[name1]['bam']
    bam2 = dataDict[name2]['bam']
    bamString = "%s,%s" % (bam1, bam2)
    nameString = "%s,%s" % (name1, name2)
    colorString = "0,0,0:100,100,100"

    #change dir: bamPlot_turbo.py is invoked by relative name below.
    #pipelineDir is not defined in this function; it is read from module scope
    os.chdir(pipelineDir)

    #collect (gff, title) pairs in the order they must run:
    #gained, gained window, lost, lost window; empty GFFs are skipped
    plotJobs = []
    if gainedGFF:
        plotJobs.append((gffFilename_gained, "%s_ONLY_SE" % (name2)))
        plotJobs.append((gffFilenameWindow_gained,
                         "%s_ONLY_SE_%sKB_WINDOW" % (name2, windowKB)))
    if lostGFF:
        plotJobs.append((gffFilename_lost, "%s_ONLY_SE" % (name1)))
        plotJobs.append((gffFilenameWindow_lost,
                         "%s_ONLY_SE_%sKB_WINDOW" % (name1, windowKB)))

    for gffFilename, plotTitle in plotJobs:
        cmd = 'python bamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MULTIPLE' % (
            genome, bamString, gffFilename, outputFolder, nameString,
            colorString, plotTitle)
        os.system(cmd)

    return
Beispiel #9
0
def main():
    '''
    main run function

    Parses the command line and runs the dynamic enhancer analysis for two
    datasets:

      1. locate the ROSE output files of both datasets (makeRoseDict) and
         call ROSE on the merged regions (callMergeSupers),
      2. run the delta R script and the ROSE gene mapper,
      3. assign ranks to the merged regions, make the rank plots and write
         the final output (finishRankOutput).

    The -g, -d, -r, -o and -n options are all required; if any is missing the
    help text is printed and the program exits. -r and -n must each contain
    exactly two comma separated values (a ValueError is raised otherwise).
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] -n [DATA_NAMES] -r [ROSE_FOLDERS] -o [OUTPUT_FOLDER]"
    parser = OptionParser(usage=usage)
    #required flags
    parser.add_option(
        "-g", "--genome", dest="genome", nargs=1, default=None,
        help="Enter the genome build (HG18,HG19,MM9,RN4,RN6) for the project")
    parser.add_option(
        "-d", "--data", dest="data", nargs=1, default=None,
        help="Enter the data file for the project")
    parser.add_option(
        "-r", "--rose", dest="rose", nargs=1, default=None,
        help="Enter a comma separated list of rose folder")
    parser.add_option(
        "-o", "--output", dest="output", nargs=1, default=None,
        help="Enter the output folder for the project")
    parser.add_option(
        "-n", "--names", dest="names", nargs=1, default=None,
        help="Enter a comma separated list of names to go with the datasets")

    #additional options
    parser.add_option(
        "-p", "--plot", dest="plot", action='store_true', default=False,
        help="If flagged, will plot differential regions")
    parser.add_option(
        "-a", "--all", dest="all", action='store_true', default=False,
        help="If flagged, will run analysis for all enhancers and not just supers.")
    parser.add_option(
        "-m", "--median", dest="median", action='store_true', default=False,
        help="If flagged, will use median enhancer scaling")
    parser.add_option(
        "-e", "--enhancer-type", dest="enhancer_type", nargs=1, default='super',
        help="specify type of enhancer to analyze: super, stretch, superStretch")

    (options, args) = parser.parse_args()

    print(options)
    print(args)

    #bail out early when any required option is missing
    if not (options.genome and options.data and options.rose
            and options.output and options.names):
        parser.print_help()
        sys.exit()

    #str methods instead of the string-module functions removed in Python 3
    genome = options.genome.upper()
    dataFile = options.data

    roseFolder1, roseFolder2 = options.rose.split(',')
    parentFolder = utils.formatFolder(options.output, True)

    name1, name2 = options.names.split(',')

    mergeName = "%s_%s_merged" % (name1, name2)

    #option for median scaling
    medianScale = options.median

    plotBam = options.plot
    superOnly = not options.all

    #every print below passes a single pre-formatted string, so the output is
    #the same whether print is a statement (Python 2) or a function
    if superOnly and plotBam:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (
            name1, name2, parentFolder))
    elif superOnly:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (
            name1, name2, parentFolder))
    elif plotBam:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (
            name1, name2, parentFolder))
    else:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (
            name1, name2, parentFolder))

    #part 1
    print("PART1: analyzing ROSE output from %s and %s" % (name1, name2))
    #start with the all enhancer tables from the initial rose calls

    roseFolder1 = pipeline_dfci.formatFolder(roseFolder1, False)
    roseFolder2 = pipeline_dfci.formatFolder(roseFolder2, False)

    roseDict1 = makeRoseDict(roseFolder1)
    roseDict2 = makeRoseDict(roseFolder2)

    #choosing the type of enhancer to analyze
    #NOTE(review): the type is lower-cased before it is used as a roseDict
    #key, so 'superStretch' is looked up as 'superstretch' -- confirm that
    #makeRoseDict uses lower-case keys
    enhancerCallType = options.enhancer_type.lower()
    if superOnly:
        print("ANALYZING ENHANCER TYPE: %s" % (enhancerCallType.upper()))
    superFile1 = roseDict1[enhancerCallType]
    superFile2 = roseDict2[enhancerCallType]

    allFile1 = roseDict1['AllEnhancer']
    allFile2 = roseDict2['AllEnhancer']

    print('\tMERGING ENHANCERS AND CALLING ROSE')
    if superOnly:
        #both datasets must provide a file of the requested enhancer type
        for superFile, roseFolder in ((superFile1, roseFolder1),
                                      (superFile2, roseFolder2)):
            if len(superFile) == 0:
                print("ERROR: UNABLE TO FIND %s FILES IN %s" % (
                    enhancerCallType, roseFolder))
                sys.exit()
        roseOutput = callMergeSupers(dataFile, superFile1, superFile2,
                                     name1, name2, mergeName, genome,
                                     parentFolder)
    else:
        roseOutput = callMergeSupers(dataFile, allFile1, allFile2, name1,
                                     name2, mergeName, genome,
                                     parentFolder)

    print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')

    #part2 is the R script
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (
        parentFolder, genome.upper(), mergeName)
    rcmd = callDeltaRScript(mergedGFFFile, parentFolder, dataFile, name1,
                            name2, allFile1, allFile2, medianScale)
    print(rcmd)
    os.system(rcmd)

    #fixed pause before the gene mapper is started on the R script's output
    time.sleep(30)
    callRoseGeneMapper(mergedGFFFile, genome, parentFolder, name1)

    #rank the genes

    #part 3
    #rank the delta
    print("PART 3: assinging ranks to differential enhancers")
    print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')

    gffName = '%s_%s_MERGED_REGIONS_-0_+0' % (genome.upper(), mergeName)
    enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB.txt" % (
        parentFolder, name1, gffName)
    if utils.checkOutput(enhancerToGeneFile):
        rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB_RANK.txt" % (
            parentFolder, name1, gffName)
        assignEnhancerRank(enhancerToGeneFile, allFile1, allFile2, name1,
                           name2, rankOutput)
    else:
        print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
        sys.exit()

    #make the rank plot
    print('MAKING RANK PLOTS')
    if utils.checkOutput(rankOutput):
        rcmd = callRankRScript(rankOutput, name1, name2, superFile1,
                               superFile2)
        print(rcmd)
        os.system(rcmd)
    else:
        print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
        sys.exit()

    time.sleep(30)

    print('FINISHING OUTPUT')
    finishRankOutput(dataFile, rankOutput, genome, parentFolder, mergeName,
                     name1, name2, 1, 100000, superOnly, plotBam)
Beispiel #10
0
def main():

    '''
    main run function

    Parses the command line, builds the plotting tables for the requested
    regions with makeBamPlotTables, writes the R command returned by
    callRPlot into a bash file inside the temp folder and runs it.

    The -b, -i, -g and -o options are all required; if any is missing the
    help text is printed and the program exits. -i is either the path of a
    .gff file or a single region of the form chr1:+:1-1000.
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -b [SORTED BAMFILE(S)] -i [INPUTFILE] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-b","--bam", dest="bam",nargs = 1, default=None,
                      help = "Enter a comma separated list of .bam files to be processed.")
    parser.add_option("-i","--input", dest="input",nargs = 1, default=None,
                      help = "Enter .gff or genomic region e.g. chr1:+:1-1000.")
    parser.add_option("-g","--genome",dest="genome",nargs =1, default = None,
                      help = "specify a genome, options are hg18 or mm9 right now")

    #output flag
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the output folder.")
    #additional options
    parser.add_option("-c","--color", dest="color",nargs = 1, default=None,
                      help = "Enter a colon separated list of colors e.g. 255,0,0:255,125,0, default samples the rainbow")
    parser.add_option("-s","--sense", dest="sense",nargs = 1, default='both',
                      help = "Map to '+','-' or 'both' strands. Default maps to both.")
    parser.add_option("-e","--extension", dest="extension",nargs = 1, default=200,
                      help = "Extends reads by n bp. Default value is 200bp")
    #the help text used to claim "Default is True" although default=False
    parser.add_option("-r","--rpm", dest="rpm",action = 'store_true', default=False,
                      help = "Normalizes density to reads per million (rpm) Default is False")
    parser.add_option("-y","--yScale",dest="yScale",nargs =1, default = "relative",
                      help = "Choose either relative or uniform y axis scaling. options = 'relative,uniform' Default is relative scaling")
    parser.add_option("-n","--names",dest="names",nargs =1, default = None,
                      help = "Enter a comma separated list of names for your bams")
    parser.add_option("-p","--plot",dest="plot",nargs =1, default = "multiple",
                      help = "Choose either all lines on a single plot or multiple plots. options = 'single,multiple'")
    parser.add_option("-t","--title",dest ="title",nargs=1,default = '',
                      help = "Specify a title for the output plot(s), default will be the coordinate region")

    (options,args) = parser.parse_args()

    print(options)
    print(args)

    #bail out early when any required option is missing
    if not (options.bam and options.input and options.genome and options.output):
        parser.print_help()
        exit()

    #bring in the bams
    bamFileList = options.bam.split(',')

    #bring in the gff
    try:
        gff = parseTable(options.input,'\t')
        gffName = options.input.split('/')[-1].split('.')[0]
    except IOError:
        #means a coordinate line has been given e.g. chr1:+:1-100
        chromLine = options.input.split(':')

        chrom = chromLine[0]
        regionSense = chromLine[1]
        start,end = chromLine[2].split('-')
        if not chrom.startswith('chr'):
            print('ERROR: UNRECOGNIZED GFF OR CHROMOSOME LINE INPUT')
            exit()
        #wrap the single region in a one-line gff
        gffLine = [chrom,'',options.input,start,end,'',regionSense,'','']
        gffName = "%s_%s_%s_%s" % (chrom,regionSense,start,end)
        gff = [gffLine]

    #bring in the genome
    #str method instead of a bare upper() that had to come from the string module
    genome = options.genome.upper()
    if genome not in ['HG18','HG19','MM9','RN5']:
        print('ERROR: UNSUPPORTED GENOME TYPE %s. USE HG18, RN5, OR MM9' % (genome))
        parser.print_help()
        exit()

    #bring in the rest of the options

    #output
    rootFolder = options.output
    if not rootFolder.endswith('/'):
        rootFolder+='/'
    try:
        #only used to check that the folder exists and can be listed
        os.listdir(rootFolder)
    except OSError:
        #'%S' is not a valid format character and raised ValueError here
        #instead of printing the message
        print('ERROR: UNABLE TO FIND OUTPUT DIRECTORY %s' % (rootFolder))
        exit()

    #Get analysis title
    if options.title:
        title = options.title
    else:
        title = gffName

    #make a temp folder
    tempFolder = rootFolder + title + '/'
    print("CREATING TEMP FOLDER %s" % (tempFolder))
    pipeline_dfci.formatFolder(tempFolder,create=True)

    #colors
    if options.color:
        colorList = [x.split(',') for x in options.color.split(':')]
        if len(colorList) < len(bamFileList):
            print('WARNING: FEWER COLORS THAN BAMS SPECIFIED. COLORS WILL BE RECYCLED')
            #recycling the color list; floor division keeps the repeat
            #count an int under true division
            colorList += colorList*(len(bamFileList)//len(colorList))
            colorList = colorList[0:len(bamFileList)]
    else:
        #cycles through the colors of the rainbow
        colorList = tasteTheRainbow(len(bamFileList))

    #sense
    sense = options.sense

    extension = int(options.extension)

    rpm = options.rpm

    yScale = options.yScale.upper()

    #names
    if options.names:
        names = options.names.split(',')

        if len(names) != len(bamFileList):
            print('ERROR: NUMBER OF NAMES AND NUMBER OF BAMS DO NOT CORRESPOND')
            parser.print_help()
            exit()
    else:
        #default to the bam file names
        names = [x.split('/')[-1] for x in bamFileList]

    #plot style
    plotStyle = options.plot.upper()
    if plotStyle not in ['SINGLE','MULTIPLE']:
        print('ERROR: PLOT STYLE %s NOT AN OPTION' % (plotStyle))
        parser.print_help()
        exit()

    #now run!
    #NOTE(review): nBins is not defined in this function; it must exist at
    #module scope for this call to work -- confirm
    summaryTableFileName = makeBamPlotTables(gff,genome,bamFileList,colorList,nBins,sense,extension,rpm,tempFolder,names,title)
    print ("%s is the summary table" % (summaryTableFileName))

    outFile = "%s%s_plots.pdf" % (rootFolder,title)
    rCmd = callRPlot(summaryTableFileName,outFile,yScale,plotStyle)

    #write the R command to a bash file and run it; the with-block closes
    #the file even if the write fails
    bashFileName = "%s%s_Rcmd.sh" % (tempFolder,title)
    with open(bashFileName,'w') as bashFile:
        bashFile.write(rCmd)
    print("Wrote R command to %s" % (bashFileName))
    os.system("bash %s" % (bashFileName))
Beispiel #11
0
def main():
    '''
    main run function

    Command line entry point for a two sample dynamic enhancer analysis.
    The -g, -d, -r, -o and -n flags are all required; if any is missing
    the help text is printed and the program exits.  -r and -n must each
    hold exactly two comma separated entries.

    Steps, in order:
      1. merge the enhancer calls of the two rose folders (callMergeSupers)
      2. run the delta R script and the rose gene mapper
      3. rank the merged enhancers (assignEnhancerRank) and plot the ranks
      4. write the final output (finishRankOutput)
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] -n [DATA_NAMES] -r [ROSE_FOLDERS] -o [OUTPUT_FOLDER]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
                      help = "Enter the genome build (HG18,HG19,MM9,RN4,RN6) for the project")
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter the data file for the project")
    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a comma separated list of rose folder")
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the output folder for the project")
    parser.add_option("-n","--names", dest="names",nargs = 1, default=None,
                      help = "Enter a comma separated list of names to go with the datasets")


    #additional options
    parser.add_option("-p","--plot", dest="plot",action = 'store_true', default=False,
                      help = "If flagged, will plot differential regions")
    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "If flagged, will run analysis for all enhancers and not just supers.")
    parser.add_option("-m","--median", dest="median",action = 'store_true', default=False,
                      help = "If flagged, will use median enhancer scaling")
    parser.add_option("-e","--enhancer-type", dest="enhancer_type",nargs = 1,default='super',
                      help = "specify type of enhancer to analyze: super, stretch, superStretch")

    (options,args) = parser.parse_args()

    print(options)
    print(args)

    if options.genome and options.data and options.rose and options.output and options.names:
        #str methods are used instead of the string module functions,
        #which only exist in python 2
        genome = options.genome.upper()
        dataFile = options.data

        #-r and -n must each be a pair; report a usage error instead of
        #failing with an unpacking ValueError
        roseFolderList = options.rose.split(',')
        if len(roseFolderList) != 2:
            parser.error('-r/--rose requires exactly two comma separated rose folders')
        nameList = options.names.split(',')
        if len(nameList) != 2:
            parser.error('-n/--names requires exactly two comma separated names')

        [roseFolder1,roseFolder2] = roseFolderList
        parentFolder = utils.formatFolder(options.output,True)

        [name1,name2] = nameList

        mergeName = "%s_%s_merged" % (name1,name2)

        #option for median scaling
        medianScale = options.median

        plotBam = options.plot
        #-a switches from super enhancers only to all enhancers
        superOnly = not options.all

        if superOnly and plotBam:
            print("Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (name1,name2,parentFolder))
        if superOnly and not plotBam:
            print("Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (name1,name2,parentFolder))
        if not superOnly and plotBam:
            print("Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (name1,name2,parentFolder))
        if not superOnly and not plotBam:
            print("Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (name1,name2,parentFolder))

        #part 1
        print("PART1: analyzing ROSE output from %s and %s" % (name1,name2))
        #start with the all enhancer tables from the initial rose calls

        roseFolder1 = pipeline_dfci.formatFolder(roseFolder1,False)
        roseFolder2 = pipeline_dfci.formatFolder(roseFolder2,False)

        roseDict1 = makeRoseDict(roseFolder1)
        roseDict2 = makeRoseDict(roseFolder2)


        #choosing the type of enhancer to analyze
        enhancerCallType = options.enhancer_type.lower()
        if superOnly:
            print("ANALYZING ENHANCER TYPE: %s" % (enhancerCallType.upper()))
        superFile1 = roseDict1[enhancerCallType]
        superFile2 = roseDict2[enhancerCallType]

        allFile1 = roseDict1['AllEnhancer']
        allFile2 = roseDict2['AllEnhancer']

        print('\tMERGING ENHANCERS AND CALLING ROSE')
        if superOnly:
            #an empty entry means no file of that type was found
            if len(superFile1) ==0:
                print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType,roseFolder1))
                sys.exit()
            if len(superFile2) == 0:
                print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType,roseFolder2))
                sys.exit()
            roseOutput = callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergeName,genome,parentFolder)

        else:

            roseOutput = callMergeSupers(dataFile,allFile1,allFile2,name1,name2,mergeName,genome,parentFolder)



        print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')

        #part2 is the R script
        mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (parentFolder,genome.upper(),mergeName)
        rcmd = callDeltaRScript(mergedGFFFile,parentFolder,dataFile,name1,name2,allFile1,allFile2,medianScale)
        print(rcmd)
        os.system(rcmd)

        time.sleep(30)
        callRoseGeneMapper(mergedGFFFile,genome,parentFolder,name1)

        #rank the genes


        #part 3
        #rank the delta
        print("PART 3: assinging ranks to differential enhancers")
        print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')

        gffName = '%s_%s_MERGED_REGIONS_-0_+0' % (genome.upper(),mergeName)
        enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB.txt" % (parentFolder,name1,gffName)
        if utils.checkOutput(enhancerToGeneFile):
            rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB_RANK.txt" % (parentFolder,name1,gffName)
            assignEnhancerRank(enhancerToGeneFile,allFile1,allFile2,name1,name2,rankOutput)
        else:
            print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
            sys.exit()

        #make the rank plot
        print('MAKING RANK PLOTS')
        if utils.checkOutput(rankOutput):
            rcmd = callRankRScript(rankOutput,name1,name2,superFile1,superFile2)
            print(rcmd)
            os.system(rcmd)
        else:
            print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
            sys.exit()

        time.sleep(30)

        print('FINISHING OUTPUT')
        finishRankOutput(dataFile,rankOutput,genome,parentFolder,mergeName,name1,name2,1,100000,superOnly,plotBam)
    else:
        parser.print_help()
        sys.exit()
Beispiel #12
0
def makeGEORNATable(dataFile, namesList, geoName, outputFolder=''):
    '''
    makes a geo table and a bash script to format everything

    dataFile     - data table, loaded with pipeline_dfci.loadDataTable
    namesList    - dataset names to include; an empty list means every
                   name in the data table.  The list is not modified.
    geoName      - prefix of the meta table and bash script, and name of
                   the sub folder the bash script changes into
    outputFolder - folder that receives <geoName>_meta.xls and
                   <geoName>_bash.sh

    Writes the tab delimited meta table and a bash script that copies or
    untars each fastq into <name>.fastq.gz and appends its md5sum to
    md5sum.txt.  Returns nothing.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    if len(namesList) == 0:
        namesList = dataDict.keys()

    #sorted() builds a new list, so the caller's list is not reordered
    #and a dict view without a .sort() method is accepted as well
    namesList = sorted(namesList)

    #set up bash script and output folder
    outputFolder = pipeline_dfci.formatFolder(outputFolder, True)
    bashFileName = '%s%s_bash.sh' % (outputFolder, geoName)

    geoTable = [['SAMPLE_NAME', 'TITLE', 'CELL_TYPE', 'RAW_FILE', 'BARCODE']]

    for name in namesList:

        #the unique id doubles as the sample name
        uniqueID = dataDict[name]['uniqueID']
        #the cell type is whatever precedes the first underscore
        cell_type = name.split('_')[0]
        raw_file = "%s.fastq.gz" % (name)

        try:
            barcode = pipeline_dfci.getTONYInfo(uniqueID, 38)
        except IndexError:
            #no barcode could be retrieved for this id
            barcode = ''

        geoTable.append([uniqueID, name, cell_type, raw_file, barcode])

    utils.unParseTable(geoTable, "%s%s_meta.xls" % (outputFolder, geoName),
                       '\t')

    #now make the folder to hold everything and the relevant bash script
    if len(outputFolder) == 0:
        outputFolder = './%s/' % (geoName)

    else:
        outputFolder = outputFolder + geoName + '/'

    pipeline_dfci.formatFolder(outputFolder, True)

    #now make the bash file
    #the with block closes the script even if a lookup below raises
    with open(bashFileName, 'w') as bashFile:
        bashFile.write('#!/usr/bin/bash\n')
        bashFile.write('cd %s\n' % (outputFolder))
        bashFile.write('\n\n')

        #write the untar command
        for name in namesList:
            fastqFile = dataDict[name]['fastq']
            if len(fastqFile) == 0:
                print("NO FASTQ FILE FOR %s" % (name))
                continue
            if fastqFile.count('tar.gz') > 0:
                #for files generated by whitehead that have tar header
                tarCmd = 'tar --strip-components 5 --to-stdout -xzvf %s | gzip > %s.fastq.gz\n' % (
                    fastqFile, name)
            else:
                tarCmd = 'cp %s %s.fastq.gz\n' % (fastqFile, name)
            bashFile.write(tarCmd)

        bashFile.write('\n\n\n')

        #start a fresh md5sum file
        bashFile.write('\n\n\n')
        bashFile.write("echo '' > md5sum.txt\n")

        #write md5sums for the fastqs
        for name in namesList:
            md5Cmd = 'md5sum %s.fastq.gz >> md5sum.txt\n' % (name)
            bashFile.write(md5Cmd)

        #the big tar command, written commented out
        tarCmd = '#tar -cvzf %s.tar.gz %s\n' % (geoName, outputFolder)
        bashFile.write(tarCmd)
Beispiel #13
0
def main():
    """
    main run function

    Command line entry point for plotting bam read density.  Parses the
    arguments, builds the list of regions to plot (from a .bed, a .gff or
    a single chr:sense:start-end coordinate string), writes the summary
    table with makeBamPlotTables, writes the R command returned by
    callRPlot into a bash script and runs it.  Unless --save-temp is
    given the temp folder is removed once the output pdf is detected.

    Invalid input (unreadable bam, bad strand, malformed coordinates,
    unsupported genome, missing output folder, bad plot style) prints an
    error and exits.
    """

    #usage = "usage: %prog [options] -g [GENOME] -b [SORTED BAMFILE(S)] -i [INPUTFILE] -o [OUTPUTFOLDER]"
    parser = argparse.ArgumentParser(usage='%(prog)s [options]')

    # required flags
    parser.add_argument("-b", "--bam", dest="bam", nargs='*',
                        help="Enter a comma separated list of .bam files to be processed.", required=True)
    parser.add_argument("-i", "--input", dest="input", type=str,
                        help="Enter .gff or genomic region e.g. chr1:+:1-1000.", required=True)
    parser.add_argument("-g", "--genome", dest="genome", type=str,
                        help="specify a genome, HG18,HG19,MM8,MM9,MM10 are currently supported", required=True)

    # output flag
    parser.add_argument("-o", "--output", dest="output", type=str,
                        help="Enter the output folder.", required=True)
    # additional options
    parser.add_argument("--stretch-input", dest="stretch_input", default=None, type=int,
                        help="Stretch the input regions to a minimum length in bp, e.g. 10000 (for 10kb)")
    parser.add_argument("-c", "--color", dest="color", default=None,
                        help="Enter a colon separated list of colors e.g. 255,0,0:255,125,0, default samples the rainbow")
    parser.add_argument("-s", "--sense", dest="sense", default='both',
                        help="Map to '+','-' or 'both' strands. Default maps to both.")
    parser.add_argument("-e", "--extension", dest="extension", default=200,
                        help="Extends reads by n bp. Default value is 200bp")
    parser.add_argument("-r", "--rpm", dest="rpm", action='store_true', default=False,
                        help="Normalizes density to reads per million (rpm) Default is False")
    parser.add_argument("-y", "--yScale", dest="yScale", default="relative",
                        help="Choose either relative or uniform y axis scaling. options = 'relative,uniform' Default is relative scaling")
    parser.add_argument("-n", "--names", dest="names", default=None,
                        help="Enter a comma separated list of names for your bams")
    parser.add_argument("-p", "--plot", dest="plot", default="MULTIPLE",
                        help="Choose either all lines on a single plot or multiple plots. options = 'SINGLE,MULTIPLE,MERGE'")
    parser.add_argument("-t", "--title", dest="title", default='',
                        help="Specify a title for the output plot(s), default will be the coordinate region")

    # DEBUG OPTION TO SAVE TEMP FILES
    parser.add_argument("--scale", dest="scale", default='',
                        help="Enter a comma separated list of scaling factors for your bams. Default is none")
    parser.add_argument("--save-temp", dest="save", action='store_true', default=False,
                        help="If flagged will save temporary files made by bamPlot")
    # the value is split on ',' below, so the help advertises commas
    parser.add_argument("--bed", dest="bed",
                        help="Add a comma separated list of bed files to plot")
    parser.add_argument("--multi-page", dest="multi", action='store_true', default=False,
                        help="If flagged will create a new pdf for each region")

    args = parser.parse_args()

    print(args)

    if args.bam and args.input and args.genome and args.output:

        # Support a legacy mode where a ',' delimited multiple files
        bamFileList = args.bam
        if len(args.bam) == 1:
            bamFileList = args.bam[0].split(',')

        # Make sure these are actually files & readable (!)
        # Explicit checks are used instead of assert so that they still
        # run under "python -O"; exit status stays non-zero as before.
        for filename in bamFileList:
            if not os.access(filename, os.R_OK):
                print('ERROR: UNABLE TO READ BAM FILE %s' % (filename))
                sys.exit(1)

        # bringing in any beds
        if args.bed:
            bedFileList = args.bed
            if isinstance(bedFileList, str):
                bedFileList = args.bed.split(',')
            print(bedFileList)
            bedCollection = makeBedCollection(bedFileList)
        else:
            bedCollection = utils.LocusCollection([], 50)

        # Load the input for graphing. One of:
        # - A .gff
        # - A .bed
        # - a specific input region (e.g. chr10:.:93150000-93180000)

        valid_sense_options = {'+', '-', '.'}
        if os.access(args.input, os.R_OK):
            if args.input.endswith('.bed'):
                # Uniquely graph every input of this bed
                parsed_input_bed = utils.parseTable(args.input, '\t')
                gffName = os.path.basename(args.input)  # Graph title
                gff = None
                try:
                    if parsed_input_bed[0][5] in valid_sense_options:
                        # This .bed might have a sense parameter
                        gff = [[e[0], '', args.input, e[1], e[2], '', e[5], '', ''] for e in parsed_input_bed]
                except IndexError:
                    pass

                if gff is None:
                    print("Your bed doesn't have a valid sense parameter. Defaulting to both strands, '.'")
                    # We only take chr/start/stop and ignore everything else.
                    gff = [[e[0], '', args.input, e[1], e[2], '', '.', '', ''] for e in parsed_input_bed]
            else:
                # Default to .gff, since that's the original behavior
                gff = utils.parseTable(args.input, '\t')
                gffName = args.input.split('/')[-1].split('.')[0]
        else:
            # means a coordinate line has been given e.g. chr1:+:1-100
            chromLine = args.input.split(':')
            try:
                chrom = chromLine[0]
                sense = chromLine[1]
                # a missing third field or a range without exactly one
                # '-' is reported the same way as a missing strand
                [start, end] = chromLine[2].split('-')
            except (IndexError, ValueError):
                print('Invalid input line or inaccessible file. Try: chr1:.:1-5000')
                sys.exit()
            if sense not in valid_sense_options:
                print('ERROR: INVALID STRAND %s. USE +, - OR .' % (sense))
                sys.exit(1)
            if chrom[0:3] != 'chr':
                print('ERROR: UNRECOGNIZED GFF OR CHROMOSOME LINE INPUT')
                sys.exit()
            gffLine = [chrom, '', args.input, start, end, '', sense, '', '']
            gffName = "%s_%s_%s_%s" % (chrom, sense, start, end)
            gff = [gffLine]

        # Consider stretching the regions to a fixed minimum size
        if args.stretch_input:
            print('Stretching inputs to a minimum of: %d bp' % (args.stretch_input))
            minLength = args.stretch_input
            stretchGff = []
            for e in gff:
                difference = int(e[4]) - int(e[3])
                if difference < minLength:
                    # pad both ends by half of the missing length
                    pad = int((minLength - difference) / 2)
                    stretchGff.append([e[0], e[1], e[2], int(e[3])-pad, int(e[4])+pad, e[5], e[6], e[7], e[8]])
                else:
                    stretchGff.append(e)

            gff = stretchGff

        # Sanity test the gff object: all strands are sane
        if not all([e[6] in valid_sense_options for e in gff]):
            print('ERROR: INPUT REGIONS CONTAIN AN INVALID STRAND. USE +, - OR .')
            sys.exit(1)
        #assert(all([int(e[3]) < int(e[4]) for e in gff]))  # All start/stops are ordered

        # bring in the genome
        genome = args.genome.upper()
        if ['HG18', 'HG19', 'HG19_RIBO','HG38','MM9', 'MM10', 'RN4','RN6'].count(genome) == 0:
            print('ERROR: UNSUPPORTED GENOME TYPE %s. USE HG19,HG18, RN4, MM9, or MM10' % (genome))
            parser.print_help()
            sys.exit()

        # bring in the rest of the options

        # output
        rootFolder = args.output
        if rootFolder[-1] != '/':
            rootFolder += '/'
        try:
            os.listdir(rootFolder)
        except OSError:
            print('ERROR: UNABLE TO FIND OUTPUT DIRECTORY %s' % (rootFolder))
            sys.exit()

        # Get analysis title
        if len(args.title) == 0:
            title = gffName
        else:
            title = args.title

        # make a temp folder
        tempFolder = rootFolder + title + '/'
        print("CREATING TEMP FOLDER %s" % (tempFolder))
        pipeline_dfci.formatFolder(tempFolder, create=True)

        # colors
        if args.color:
            colorList = args.color.split(':')
            colorList = [x.split(',') for x in colorList]
            if len(colorList) < len(bamFileList):
                print('WARNING: FEWER COLORS THAN BAMS SPECIFIED. COLORS WILL BE RECYCLED')
                # recycling the color list; floor division keeps the
                # repeat count an int on python 3 as well
                colorList += colorList * (len(bamFileList) // len(colorList))
                colorList = colorList[0:len(bamFileList)]

        else:
            # cycles through the colors of the rainbow
            colorList = tasteTheRainbow(len(bamFileList))

        # sense
        sense = args.sense

        extension = int(args.extension)

        rpm = args.rpm

        scale = args.scale

        yScale = args.yScale.upper()

        # names
        if args.names:
            names = args.names.split(',')

            if len(names) != len(bamFileList):
                print('ERROR: NUMBER OF NAMES AND NUMBER OF BAMS DO NOT CORRESPOND')
                parser.print_help()
                sys.exit()
        else:
            names = [x.split('/')[-1] for x in bamFileList]

        # plot style
        plotStyle = args.plot.upper()
        if ['SINGLE', 'MULTIPLE','MERGE'].count(plotStyle) == 0:
            print('ERROR: PLOT STYLE %s NOT AN OPTION' % (plotStyle))
            parser.print_help()
            sys.exit()

        # now run!
        summaryTableFileName = makeBamPlotTables(gff, genome, bamFileList, colorList, nBins, sense, extension, rpm, tempFolder, names, title, bedCollection,scale)
        print ("%s is the summary table" % (summaryTableFileName))

        #running the R command to plot
        multi = args.multi
        outFile = "%s%s_plots.pdf" % (rootFolder, title)
        rCmd = callRPlot(summaryTableFileName, outFile, yScale, plotStyle,multi)

        # open a bash file
        bashFileName = "%s%s_Rcmd.sh" % (tempFolder, title)
        with open(bashFileName, 'w') as bashFile:
            bashFile.write('#!/usr/bin/bash\n')
            bashFile.write(rCmd)
        print("Wrote R command to %s" % (bashFileName))
        os.system("bash %s" % (bashFileName))

        # delete temp files
        if not args.save:
            if utils.checkOutput(outFile, 1, 10):
                # This is super dangerous (!). The guards are real checks
                # rather than asserts so "python -O" cannot strip them,
                # and the root test compares by value, not identity.
                if " " in tempFolder or tempFolder == "/":
                    print('ERROR: REFUSING TO DELETE TEMP FOLDER %s' % (tempFolder))
                    sys.exit(1)
                removeCommand = "rm -rf %s" % (tempFolder)
                print(removeCommand)
                os.system(removeCommand)
            else:
                print("ERROR: NO OUTPUT FILE %s DETECTED" % (outFile))

    else:
        parser.print_help()
        sys.exit()
def makeGEOTable(dataFile,
                 wiggleFolder,
                 macsFolder,
                 namesList,
                 geoName,
                 outputFolder=''):
    '''
    makes a geo table and a bash script to format everything

    dataFile     - data table, loaded with pipeline_dfci.loadDataTable
    wiggleFolder - folder holding <name>_treat_afterfiting_all.wig.gz
    macsFolder   - folder holding the per sample MACS output with the
                   control wiggles
    namesList    - dataset names to include; an empty list means every
                   name in the data table.  The list is not modified.
    geoName      - prefix of the meta table and bash script, and name of
                   the sub folder the bash script changes into
    outputFolder - folder that receives <geoName>_meta.xls and
                   <geoName>_bash.sh

    Writes the tab delimited meta table and a bash script that copies
    each fastq and wiggle into place and appends their md5sums to
    md5sum.txt.  Returns nothing.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #first make a reverse wce dict
    #built in the incoming order: when several datasets share a
    #background the last one seen is the one kept

    backgroundDict = {}
    if len(namesList) == 0:
        namesList = dataDict.keys()
    for name in namesList:

        background = dataDict[name]['background']
        backgroundDict[background] = name

    outputFolder = pipeline_dfci.formatFolder(outputFolder, True)
    bashFileName = '%s%s_bash.sh' % (outputFolder, geoName)

    geoTable = [[
        'SAMPLE_NAME', 'TITLE', 'CELL_TYPE', 'PROCESSED_FILE', 'RAW_FILE',
        'BARCODE'
    ]]

    #sorted() builds a new list, so the caller's list is not reordered
    #and a dict view without a .sort() method is accepted as well
    namesList = sorted(namesList)

    for name in namesList:

        print(name)

        #the unique id doubles as the sample name
        uniqueID = dataDict[name]['uniqueID']
        #the cell type is whatever precedes the first underscore
        cell_type = name.split('_')[0]
        processed_file = "%s.wig.gz" % (name)
        raw_file = "%s.fastq.gz" % (name)

        try:
            barcode = pipeline_dfci.getTONYInfo(uniqueID, 38)
        except IndexError:
            #no barcode could be retrieved for this id
            barcode = ''
        newLine = [
            uniqueID, name, cell_type, processed_file, raw_file, barcode
        ]
        print(newLine)
        geoTable.append(newLine)

    utils.unParseTable(geoTable, "%s%s_meta.xls" % (outputFolder, geoName),
                       '\t')

    #now make the folder to hold everything and the relevant bash script
    if len(outputFolder) == 0:
        outputFolder = './%s/' % (geoName)

    else:
        outputFolder = outputFolder + geoName + '/'

    pipeline_dfci.formatFolder(outputFolder, True)

    wiggleFolder = pipeline_dfci.formatFolder(wiggleFolder, False)
    macsFolder = pipeline_dfci.formatFolder(macsFolder, False)

    #now make the bash file
    #the with block closes the script even if a lookup below raises
    with open(bashFileName, 'w') as bashFile:
        bashFile.write('#!/usr/bin/bash\n')
        bashFile.write('cd %s\n' % (outputFolder))
        bashFile.write('\n')

        #write the fastq cp command
        for name in namesList:
            fastqFile = dataDict[name]['fastq']
            if len(fastqFile) == 0:
                print("NO FASTQ FILE FOR %s" % (name))
                continue
            tarCmd = 'cp %s %s.fastq.gz\n' % (fastqFile, name)
            bashFile.write(tarCmd)

        bashFile.write('\n\n\n')
        #write the wiggle cp command
        for name in namesList:
            if name.count('WCE') == 1 or name.count('INPUT') == 1:
                #control datasets take the control wiggle from the MACS
                #folder of a dataset that lists them as background
                refName = backgroundDict[name]
                controlWiggleFile = '%s%s/%s_MACS_wiggle/control/%s_control_afterfiting_all.wig.gz' % (
                    macsFolder, refName, refName, refName)
                wigCmd = "cp '%s' %s.wig.gz\n" % (controlWiggleFile, name)
            else:
                wigCmd = "cp '%s%s_treat_afterfiting_all.wig.gz' %s.wig.gz\n" % (
                    wiggleFolder, name, name)
            bashFile.write(wigCmd)

        #write the md5sums for the wiggles
        bashFile.write('\n\n\n')
        bashFile.write("echo '' > md5sum.txt\n")
        for name in namesList:
            md5Cmd = 'md5sum %s.wig.gz >> md5sum.txt\n' % (name)
            bashFile.write(md5Cmd)

        #write md5sums for the fastqs
        for name in namesList:
            md5Cmd = 'md5sum %s.fastq.gz >> md5sum.txt\n' % (name)
            bashFile.write(md5Cmd)

        #the big tar command, written commented out
        tarCmd = '#tar -cvzf %s.tar.gz %s\n' % (geoName, outputFolder)
        bashFile.write(tarCmd)
Beispiel #15
0
def main():
    '''
    main run function

    Command line entry point for a dynamic enhancer analysis between two
    groups of datasets.  -g, -d, -r, -o, --group1, --group2, --name1 and
    --name2 are all required; if any is missing the help text is printed
    and the program exits.  -r must hold exactly two comma separated
    rose folders.

    Steps, in order:
      1. merge the enhancer calls of the two rose folders (callMergeSupers)
         and merge the rose signal (mergeRoseSignal)
      2. run the delta R script and the rose gene mapper
      3. rank the merged enhancers and make rank and region plots
      4. map genes to differential regions (callRoseGeneMapper_stats)
      5. write the final output (finishRankOutput)
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] {-r [ROSE_FOLDERS] | -i [INPUT_GFF]} -o [OUTPUT_FOLDER] --group1 [GROUP1_NAMES] --group2 [GROUP2_NAMES] --name1 [GROUP1_NAME] --name2 [GROUP2_NAME]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
                      help = "Enter the genome build (HG18,HG19,MM9,RN4) for the project")
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter the data file for the project")
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the output folder for the project")
    parser.add_option("--group1", dest="group1",nargs = 1, default=None,
                      help = "Enter a comma separated list of dataset names associated with the first group")
    parser.add_option("--group2", dest="group2",nargs = 1, default=None,
                      help = "Enter a comma separated list of dataset names associated with the second group")
    parser.add_option("--name1", dest="name1",nargs = 1, default=None,
                      help = "Enter a name for the first group of datasets")
    parser.add_option("--name2", dest="name2",nargs = 1, default=None,
                      help = "Enter a name for the second group of datasets")

    #the input options
    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a comma separated list of meta rose folders")

    #optional input to supersede the meta rose (this is kinda sad but will fix later)
    #should have had this code run clustering from the get go
    parser.add_option("-i","--input", dest="input",nargs = 1, default=None,
                      help = "enter a gff, bed or table of regions to perform dyanmic analysis on")




    #additional options
    parser.add_option("-p","--plot", dest="plot",action = 'store_true', default=False,
                      help = "If flagged, will plot differential regions")
    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "If flagged, will run analysis for all enhancers and not just supers.")
    parser.add_option("-m","--median", dest="median",action = 'store_true', default=False,
                      help = "If flagged, will use median enhancer scaling")
    parser.add_option("-e","--enhancer-type", dest="enhancer_type",nargs = 1,default='super',
                      help = "specify type of enhancer to analyze: super, stretch, superStretch")
    parser.add_option("--use-background", dest="background",action = 'store_true',default=False,
                      help = "If flagged will use background datasets as in data table")

    (options,args) = parser.parse_args()

    print(options)
    print(args)


    requiredArgs = [options.genome,options.data,options.rose,options.output,options.group1,options.group2,options.name1,options.name2]

    #a plain test instead of assert, which "python -O" would strip
    if not all(requiredArgs):
        parser.print_help()
        sys.exit()

    #now the main run of the function

    #getting the genome and data file
    #str methods are used instead of the string module functions,
    #which only exist in python 2
    genome = options.genome.upper()
    dataFile = options.data

    #getting the rose folders
    #-r must be a pair; report a usage error instead of failing with
    #an unpacking ValueError
    roseFolderList = options.rose.split(',')
    if len(roseFolderList) != 2:
        parser.error('-r/--rose requires exactly two comma separated rose folders')
    [roseFolder1,roseFolder2] = roseFolderList
    parentFolder = utils.formatFolder(options.output,True)

    #getting the analysis names
    name1 = options.name1
    name2 = options.name2
    mergeName = "%s_%s_merged" % (name1,name2)

    #getting the datasets names associated with each group
    namesList1 = options.group1.split(',')
    namesList2 = options.group2.split(',')

    #options for background correction
    useBackground = options.background

    #option for median scaling
    medianScale = options.median

    #option for an overriding set of input regions
    if options.input is not None:
        #for now only works w/ gffs
        print('Using %s as a set of predifined input regions' % (options.input))
        inputGFF = options.input
    else:
        inputGFF= ''


    plotBam = options.plot
    #-a switches from super enhancers only to all enhancers
    superOnly = not options.all

    if superOnly and plotBam:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (name1,name2,parentFolder))
    if superOnly and not plotBam:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (name1,name2,parentFolder))
    if not superOnly and plotBam:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (name1,name2,parentFolder))
    if not superOnly and not plotBam:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (name1,name2,parentFolder))

    #part 1
    print("PART1: analyzing ROSE output from %s and %s" % (name1,name2))
    #start with the all enhancer tables from the initial rose calls

    roseFolder1 = pipeline_dfci.formatFolder(roseFolder1,False)
    roseFolder2 = pipeline_dfci.formatFolder(roseFolder2,False)

    roseDict1 = makeRoseDict(roseFolder1)
    roseDict2 = makeRoseDict(roseFolder2)

    #choosing the type of enhancer to analyze
    enhancerCallType = options.enhancer_type.lower()
    if superOnly:
        print("ANALYZING ENHANCER TYPE: %s" % (enhancerCallType.upper()))
    superFile1 = roseDict1[enhancerCallType]
    superFile2 = roseDict2[enhancerCallType]

    allFile1 = roseDict1['AllEnhancer']
    allFile2 = roseDict2['AllEnhancer']

    #the second region map now comes from the second rose folder (it was
    #read from roseDict1 twice)
    #NOTE(review): neither region file is used further down in this function
    regionFile1 = roseDict1['RegionMap']
    regionFile2 = roseDict2['RegionMap']

    #this is where we can toggle either using meta rose or clustering
    print('\tMERGING ENHANCERS AND CALLING ROSE')
    if superOnly:
        #an empty entry means no file of that type was found
        if len(superFile1) ==0:
            print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType,roseFolder1))
            sys.exit()
        if len(superFile2) == 0:
            print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType,roseFolder2))
            sys.exit()
        roseOutput = callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergeName,genome,parentFolder,namesList1,namesList2,useBackground,inputGFF)

    else:
        print('doing it right')
        print(allFile1)
        print(allFile2)

        roseOutput = callMergeSupers(dataFile,allFile1,allFile2,name1,name2,mergeName,genome,parentFolder,namesList1,namesList2,useBackground,inputGFF)
        print('this is rose output')
        print(roseOutput)
    print('\tMERGING ROSE OUTPUT')

    mergedRoseOutput,normRoseOutput = mergeRoseSignal(dataFile,roseOutput,roseDict1,roseDict2,name1,name2,namesList1,namesList2,useBackground,medianScale)



    print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')

    #part2 is the R script
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (parentFolder,genome.upper(),mergeName)
    rcmd = callDeltaRScript(mergedGFFFile,parentFolder,dataFile,name1,name2,allFile1,allFile2,medianScale,namesList1)
    print(rcmd)
    os.system(rcmd)

    time.sleep(5)
    callRoseGeneMapper(mergedGFFFile,genome,parentFolder,namesList1)

    #rank the genes


    #part 3
    #rank the delta
    print("PART 3: assinging ranks to differential enhancers")
    print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')

    gffName = '%s_%s_MERGED_REGIONS_-0_+0' % (genome.upper(),mergeName)
    enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_MERGED_ENHANCER_TO_GENE_100KB.txt" % (parentFolder,namesList1[0],gffName)
    if utils.checkOutput(enhancerToGeneFile):
        rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_MERGED_ENHANCER_TO_GENE_100KB_RANK.txt" % (parentFolder,namesList1[0],gffName)
        assignEnhancerRank(enhancerToGeneFile,allFile1,allFile2,name1,name2,rankOutput)
    else:
        print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
        sys.exit()

    #make the rank plot
    print('MAKING RANK PLOTS')
    if utils.checkOutput(rankOutput):
        print('checking for rank output %s' % (rankOutput))
        rcmd = callRankRScript(rankOutput,name1,name2,superFile1,superFile2)
        print(rcmd)
        os.system(rcmd)
    else:
        print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
        sys.exit()

    print('MAKING REGION SIGNAL PLOTS AND FINDING DIFFERENTIAL REGIONS')
    if utils.checkOutput(normRoseOutput):
        print('checking for %s' % (normRoseOutput))
        rcmd = callRegionPlotRScript(normRoseOutput,name1,name2,namesList1,namesList2)
        print(rcmd)
        os.system(rcmd)
    else:
        print('ERROR: REGION PLOT SCRIPT FAILED TO RUN')
        sys.exit()

    #NOW MAP GENES
    print('mapping genes to differential enhancers')
    statOutput,diffOutput = callRoseGeneMapper_stats(mergedGFFFile,genome,parentFolder,namesList1)



    if utils.checkOutput(statOutput):
        print('checking for gene mapping output %s' % (statOutput))
        print('FINISHED WITH GENE MAPPING')
    else:
        print('GENE MAPPING FAILED')
        sys.exit()

    print('FINISHING OUTPUT')

    finishRankOutput(dataFile,statOutput,diffOutput,genome,parentFolder,mergeName,name1,name2,namesList1,namesList2,1.0,100000,superOnly,plotBam)
Beispiel #16
0
def finishRankOutput(dataFile,statOutput,diffOutput,genome,mergeFolder,mergeName,name1,name2,namesList1,namesList2,cutOff=1.0,window = 100000,superOnly=True,plotBam=True):

    '''
    cleans up the rank output table and writes the final outputs into
    <mergeFolder>output/

    outputs written (in this order):
      - the formatted rank table (enhancer IDs stripped of '_lociStitched')
      - the formatted differential rank table (same ID cleanup)
      - gffs of gained/lost regions, plain and padded by +/- window
      - a bed with gained, conserved and lost regions (one track each)
      - a gene table with one row per (gene, enhancer) pair and its status
      - copies of the ROSE plots (via cp)
      - optionally, bamPlot_turbo.py plots for gained/lost regions

    parameters:
      dataFile    data table path, loaded with pipeline_dfci.loadDataTable
      statOutput  tab delimited rank/stat table with a header row
      diffOutput  tab delimited table of differential regions w/ header row
      genome      genome build string, upper-cased before use
      mergeFolder parent folder containing the <name>_ROSE folder
      mergeName   name used in all output file names
      name1,name2 names of the two conditions; regions above the cutoff are
                  labelled name2 (gained), below -cutoff name1 (lost)
      namesList1,namesList2  dataset names for each condition; must be keys
                  of the loaded data table when plotBam is set
      cutOff      threshold applied to the column 8th from the end
      window      padding in bp for the windowed gffs
      superOnly   only affects labels ('SUPERS'/'SEs' vs 'ENHANCERS')
      plotBam     if true, runs bamPlot_turbo.py on the gained/lost gffs

    returns None

    NOTE: relies on the module level names utils, pipeline_dfci, os and
    pipelineDir being defined elsewhere in this file
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    #making sure window and cutoff are int/float
    cutOff = float(cutOff)
    window = int(window)
    #floor division so names read e.g. 100KB regardless of division mode
    windowKB = window // 1000
    genome = genome.upper()

    #make the output folder
    outputFolder = pipeline_dfci.formatFolder(mergeFolder+'output/',True)

    #bring in the old rank table
    rankEnhancerTable = utils.parseTable(statOutput,'\t')

    #make a new formatted table
    rankHeader = rankEnhancerTable[0]
    headerLength = len(rankHeader)
    formattedRankTable = [rankHeader]

    #labels that depend on whether we only look at supers
    if superOnly:
        enhancerType = 'SUPERS'
        trackLabel = 'SEs'
        regionLabel = 'super enhancers'
    else:
        enhancerType = 'ENHANCERS'
        trackLabel = 'enhancers'
        regionLabel = 'enhancers'

    #the gffs
    gainedGFF = []
    lostGFF = []

    gainedWindowGFF = []
    lostWindowGFF = []

    #the beds, each starts with its own track header line
    onlyTrackTemplate = 'track name="%s %s only %s" description="%s %s that are found only in %s vs %s" itemRGB=On color=%s'
    bothTrackTemplate = 'track name="%s %s and %s %s" description="%s %s that are found in both %s vs %s" itemRGB=On color=0,0,0'

    gainedBed = [[onlyTrackTemplate % (genome,name2,trackLabel,genome,regionLabel,name2,name1,'255,0,0')]]
    conservedBed = [[bothTrackTemplate % (genome,name1,name2,trackLabel,genome,regionLabel,name1,name2)]]
    lostBed = [[onlyTrackTemplate % (genome,name1,trackLabel,genome,regionLabel,name1,name2,'0,255,0')]]

    #the genes
    geneTable = [['GENE','ENHANCER_ID','ENHANCER_CHROM','ENHANCER_START','ENHANCER_STOP',rankHeader[6],rankHeader[7],rankHeader[8],'STATUS']]

    for line in rankEnhancerTable[1:]:
        #fix line lengths: pad short rows with empty strings
        if len(line) < headerLength:
            line += ['']*(headerLength-len(line))

        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankTable.append(line)

        #getting the genes from the last three columns (comma separated)
        geneList = []
        for geneColumn in (line[-1],line[-2],line[-3]):
            geneList += geneColumn.split(',')
        geneList = [x for x in geneList if len(x) > 0]
        geneList = utils.uniquify(geneList)
        geneString = ','.join(geneList)

        bedLine = [line[1],line[2],line[3],line[0],line[-4]]

        #this applies both the statistical test chosen (default fdr <= 0.05) and the cutoff
        #the cutoff is hard wired, but we can add an option to change the test
        #stats are done in the R script. FDR norm can kinda suck if no genes are considered diff
        #the significance flag is only parsed once the cutoff is passed
        delta = float(line[-8])
        if delta > cutOff and int(line[-4]) == 1:
            #for gained
            geneStatus = name2
            statusGFF,statusWindowGFF,statusBed = gainedGFF,gainedWindowGFF,gainedBed
        elif delta < (-1 * cutOff) and int(line[-4]) == 1:
            #for lost
            geneStatus = name1
            statusGFF,statusWindowGFF,statusBed = lostGFF,lostWindowGFF,lostBed
        else:
            #for conserved
            geneStatus = 'UNCHANGED'
            statusGFF,statusWindowGFF,statusBed = None,None,conservedBed

        #only gained/lost regions get gff lines
        if statusGFF is not None:
            statusGFF.append([line[1],line[0],'',line[2],line[3],'','.','',geneString])
            statusWindowGFF.append([line[1],line[0],'',int(line[2])-window,int(line[3])+window,'','.','',geneString])
        statusBed.append(bedLine)

        #now fill in the gene Table
        for gene in geneList:
            geneTable.append([gene,line[0],line[1],line[2],line[3],line[6],line[7],line[8],geneStatus])

    #concat the bed
    fullBed = gainedBed + conservedBed + lostBed

    #start writing the output
    #there's the two gffs, the bed,the formatted table, the gene table

    #shared prefix of everything written directly into the output folder
    outputPrefix = "%s%s_%s_MERGED_%s" % (outputFolder,genome,mergeName,enhancerType)

    #formatted table
    formattedFilename = "%s_RANK_TABLE.txt" % (outputPrefix)
    utils.unParseTable(formattedRankTable,formattedFilename,'\t')

    #formatted diff table
    #possible that no genes are differential
    rankEnhancerDiffTable = utils.parseTable(diffOutput,'\t')

    #make a new formatted table
    formattedRankDiffTable = [rankEnhancerDiffTable[0]]

    for line in rankEnhancerDiffTable[1:]:
        #fixing the enhancer ID
        line[0] = line[0].replace('_lociStitched','')
        formattedRankDiffTable.append(line)

    formattedDiffFilename = "%s_RANK_DIFF_TABLE.txt" % (outputPrefix)
    utils.unParseTable(formattedRankDiffTable,formattedDiffFilename,'\t')

    #gffs
    gffFolder = pipeline_dfci.formatFolder(outputFolder+'gff/',True)
    gffTemplate = "%s%s_%s_%s_ONLY_%s_-0_+0.gff"
    gffWindowTemplate = "%s%s_%s_%s_ONLY_%s_-%sKB_+%sKB.gff"

    gffFilename_gained = gffTemplate % (gffFolder,genome,mergeName,name2.upper(),enhancerType)
    gffFilenameWindow_gained = gffWindowTemplate % (gffFolder,genome,mergeName,name2.upper(),enhancerType,windowKB,windowKB)

    gffFilename_lost = gffTemplate % (gffFolder,genome,mergeName,name1.upper(),enhancerType)
    gffFilenameWindow_lost = gffWindowTemplate % (gffFolder,genome,mergeName,name1.upper(),enhancerType,windowKB,windowKB)

    utils.unParseTable(gainedGFF,gffFilename_gained,'\t')
    utils.unParseTable(gainedWindowGFF,gffFilenameWindow_gained,'\t')

    utils.unParseTable(lostGFF,gffFilename_lost,'\t')
    utils.unParseTable(lostWindowGFF,gffFilenameWindow_lost,'\t')

    #bed
    bedFilename = "%s.bed" % (outputPrefix)
    utils.unParseTable(fullBed,bedFilename,'\t')

    #geneTable
    geneFilename = "%s_GENE_TABLE.txt" % (outputPrefix)
    utils.unParseTable(geneTable,geneFilename,'\t')

    #finally, move all of the plots to the output folder
    #each entry is (glob inside the ROSE folder, suffix of the copied file)
    #NOTE(review): the unchanged plot used to be copied from the
    #*REGION_LOST* glob, which just duplicated the lost plot. this assumes
    #the R script names its unchanged plot *REGION_UNCHANGED*.pdf - confirm
    roseFolder = "%s%s_ROSE/" % (mergeFolder,namesList1[0])
    plotCopyList = [
        ('*DELTA*.pdf','_DELTA.pdf'),
        ('*REGION_GAINED*.pdf','_REGION_GAINED.pdf'),
        ('*REGION_LOST*.pdf','_REGION_LOST.pdf'),
        ('*REGION_UNCHANGED*.pdf','_REGION_UNCHANGED.pdf'),
        ('*RANK_PLOT.png','_RANK_PLOT.png'),
        ]
    for plotGlob,plotSuffix in plotCopyList:
        cmd = "cp %s%s %s%s" % (roseFolder,plotGlob,outputPrefix,plotSuffix)
        os.system(cmd)

    #now execute the bamPlot_turbo.py commands
    if not plotBam:
        return

    bamList = [dataDict[name]['bam'] for name in namesList1] + [dataDict[name]['bam'] for name in namesList2]
    bamString = ','.join(bamList)

    #every dataset is labelled with the name of its condition
    nameList = [name1]*len(namesList1) + [name2]*len(namesList2)
    nameString = ','.join(nameList)
    print(namesList1[0])
    print(namesList2[0])

    print(namesList1)
    print(namesList2)

    #use the colors of the first dataset of each condition
    #fall back to black/grey if the two conditions would look the same
    color1 = dataDict[namesList1[0]]['color']
    color2 = dataDict[namesList2[0]]['color']
    print(color1)
    if color1 != color2:
        colorList = [color1]*len(namesList1) + [color2]*len(namesList2)
    else:
        colorList = ['0,0,0']*len(namesList1) + ['100,100,100']*len(namesList2)
    colorString = ':'.join(colorList)

    #collect (gff, plot title) pairs, skipping conditions without regions
    plotJobList = []
    if len(gainedGFF) > 0:
        plotJobList.append((gffFilename_gained,"%s_ONLY_SE" % (name2)))
        plotJobList.append((gffFilenameWindow_gained,"%s_ONLY_SE_%sKB_WINDOW" % (name2,windowKB)))
    if len(lostGFF) > 0:
        plotJobList.append((gffFilename_lost,"%s_ONLY_SE" % (name1)))
        plotJobList.append((gffFilenameWindow_lost,"%s_ONLY_SE_%sKB_WINDOW" % (name1,windowKB)))

    for gffFilename,plotTitle in plotJobList:
        cmd = 'python %sbamPlot_turbo.py -g %s -b %s -i %s -o %s -n %s -c %s -t %s -r -y UNIFORM -p MERGE' % (pipelineDir,genome,bamString,gffFilename,outputFolder,nameString,colorString,plotTitle)
        os.system(cmd)

    return