Example #1
0
def plot_mm_genes(mm1s_dataFile, nb_figure_gff_path, bed_string):
    '''
    plots the individual MYC and H3K27AC tracks for the datasets in mm1s_dataFile

    mm1s_dataFile: data table handed to pipeline_dfci.loadDataTable and callBatchPlot
    nb_figure_gff_path: gff of regions to plot, passed through to callBatchPlot
    bed_string: passed through to callBatchPlot as its bed argument

    plots are written to the MM1S/ folder under the module level genePlotFolder
    returns nothing
    '''

    #first establish the plot folder
    plotFolder = utils.formatFolder('%sMM1S/' % (genePlotFolder), True)
    plot_prefix = 'HG19_NB_FIGURE_GENES'

    #go by data file
    dataDict = pipeline_dfci.loadDataTable(mm1s_dataFile)
    #keys() returns a view on python 3 that cannot be indexed, so make a real list
    names_list = list(dataDict.keys())

    #the extension is worked out from the first dataset only and applied to all of them
    bam = utils.Bam(dataDict[names_list[0]]['bam'])
    read_length = bam.getReadLengths()[0]
    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' %
          (mm1s_dataFile, bam_extension))

    #one multi track plot per mark, selecting datasets by substring match on the name
    for plot_group in ['MYC', 'H3K27AC']:
        plotList = [name for name in names_list if plot_group in name]
        plotName = '%s_MM1S_%s' % (plot_prefix, plot_group)
        print(plotName)
        pipeline_dfci.callBatchPlot(mm1s_dataFile,
                                    nb_figure_gff_path,
                                    plotName,
                                    plotFolder,
                                    plotList,
                                    uniform=True,
                                    bed=bed_string,
                                    plotType='MULTIPLE',
                                    extension=bam_extension,
                                    multiPage=False,
                                    debug=False,
                                    nameString='',
                                    rpm=True,
                                    rxGenome='')
Example #2
0
def plot_nb_atac_genes(atac_dataFile, nb_figure_gff_path, bed_string):
    '''
    plots the ATAC tracks for the datasets in atac_dataFile, first as individual
    tracks and then as merged meta tracks with relative and uniform scaling

    atac_dataFile: data table handed to pipeline_dfci.loadDataTable and callBatchPlot
    nb_figure_gff_path: gff of regions to plot, passed through to callBatchPlot
    bed_string: passed through to callBatchPlot as its bed argument

    plots are written to the NB_ATAC/ folder under the module level genePlotFolder
    returns nothing
    '''

    #first establish the plot folder
    plotFolder = utils.formatFolder('%sNB_ATAC/' % (genePlotFolder), True)
    plot_prefix = 'HG19_NB_FIGURE_GENES'

    #go by data file
    dataDict = pipeline_dfci.loadDataTable(atac_dataFile)
    #keys() returns a view on python 3 that cannot be indexed, so make a real list
    names_list = list(dataDict.keys())
    print(names_list)

    #the extension is worked out from the first dataset only and applied to all of them
    #report the bam that is actually used for the read length
    first_bam_path = dataDict[names_list[0]]['bam']
    print(first_bam_path)
    bam = utils.Bam(first_bam_path)
    read_length = bam.getReadLengths()[0]

    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' %
          (atac_dataFile, bam_extension))

    #first do individuals, leaving out anything with MM1S in the name
    for plot_group in ['ATAC']:
        plotList = [
            name for name in names_list
            if plot_group in name and 'MM1S' not in name
        ]
        plotName = '%s_NB_%s' % (plot_prefix, plot_group)
        print(plotName)
        pipeline_dfci.callBatchPlot(atac_dataFile,
                                    nb_figure_gff_path,
                                    plotName,
                                    plotFolder,
                                    plotList,
                                    uniform=True,
                                    bed=bed_string,
                                    plotType='MULTIPLE',
                                    extension=bam_extension,
                                    multiPage=False,
                                    debug=False,
                                    nameString='',
                                    rpm=True,
                                    rxGenome='')

    #now as metas
    plotList = [
        'BE2C_ATAC_rep1',
        'KELLY_ATAC',
        'NGP_ATAC',
        'SHEP21_ATAC',
    ]
    #one group label per entry of plotList, in the same order
    groupString = 'ATAC,ATAC,ATAC,ATAC'

    #same merged plot twice, only the y axis scaling mode differs
    for scale_label, uniform_scale in [('RELATIVE', False), ('UNIFORM', True)]:
        plotName = '%s_NB_ATAC_META_%s' % (plot_prefix, scale_label)
        pipeline_dfci.callBatchPlot(atac_dataFile,
                                    nb_figure_gff_path,
                                    plotName,
                                    plotFolder,
                                    plotList,
                                    uniform=uniform_scale,
                                    bed=bed_string,
                                    plotType='MERGE',
                                    extension=bam_extension,
                                    multiPage=False,
                                    debug=False,
                                    nameString=groupString,
                                    rpm=True,
                                    rxGenome='')
Example #3
0
def plot_be2c_genes(be2c_dataFile, nb_figure_gff_path, bed_string):
    '''
    plots the tracks for the datasets in be2c_dataFile
    everything except TWIST and INPUT is plotted with relative and then uniform
    scaling, BE2C_TWIST is plotted on its own with a fixed extension

    be2c_dataFile: data table handed to pipeline_dfci.loadDataTable and callBatchPlot
    nb_figure_gff_path: gff of regions to plot, passed through to callBatchPlot
    bed_string: passed through to callBatchPlot as its bed argument

    plots are written to the BE2C/ folder under the module level genePlotFolder
    returns nothing
    '''

    #first establish the plot folder
    plotFolder = utils.formatFolder('%sBE2C/' % (genePlotFolder), True)
    plot_prefix = 'HG19_NB_FIGURE_GENES'

    #go by data file
    dataDict = pipeline_dfci.loadDataTable(be2c_dataFile)
    #keys() returns a view on python 3 that cannot be indexed, so make a real list
    names_list = list(dataDict.keys())
    print(names_list)

    #the extension is worked out from the first dataset only and applied to all of them
    first_bam_path = dataDict[names_list[0]]['bam']
    print(first_bam_path)
    bam = utils.Bam(first_bam_path)
    read_length = bam.getReadLengths()[0]
    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' %
          (be2c_dataFile, bam_extension))

    #first do individuals except for twist and input
    plotList = [
        name for name in names_list
        if 'TWIST' not in name and 'INPUT' not in name
    ]
    #same plot twice, only the y axis scaling mode differs
    for scale_label, uniform_scale in [('RELATIVE', False), ('UNIFORM', True)]:
        plotName = '%s_BE2C_%s' % (plot_prefix, scale_label)
        print(plotName)
        pipeline_dfci.callBatchPlot(be2c_dataFile,
                                    nb_figure_gff_path,
                                    plotName,
                                    plotFolder,
                                    plotList,
                                    uniform=uniform_scale,
                                    bed=bed_string,
                                    plotType='MULTIPLE',
                                    extension=bam_extension,
                                    multiPage=False,
                                    debug=False,
                                    nameString='',
                                    rpm=True,
                                    rxGenome='')

    #now for twist, which gets a hard coded extension instead of the computed one
    plotList = ['BE2C_TWIST']
    twist_extension = 125
    plotName = '%s_BE2C_TWIST' % (plot_prefix)
    print(plotName)
    pipeline_dfci.callBatchPlot(be2c_dataFile,
                                nb_figure_gff_path,
                                plotName,
                                plotFolder,
                                plotList,
                                uniform=False,
                                bed=bed_string,
                                plotType='MULTIPLE',
                                extension=twist_extension,
                                multiPage=False,
                                debug=False,
                                nameString='',
                                rpm=True,
                                rxGenome='')
Example #4
0
def plot_shep21_chiprx_genes(shep21_chiprx_dataFile, scale_path,
                             nb_figure_gff_path, bed_string):
    '''
    plots the shep21 chiprx tracks three ways for each mark:
    raw with no scaling, rpm with no scaling, and raw multiplied by a per
    dataset scale factor. then plots the merged NOSPIKE meta tracks

    shep21_chiprx_dataFile: data table handed to pipeline_dfci.loadDataTable and callBatchPlot
    scale_path: tab delimited table with a header row, dataset name in the first
                column and the scale value in the third column
    nb_figure_gff_path: gff of regions to plot, passed through to callBatchPlot
    bed_string: passed through to callBatchPlot as its bed argument

    plots are written to three SHEP21_CHIPRX_* folders under the module level genePlotFolder
    raises KeyError if a plotted dataset has no row in the scale table
    returns nothing
    '''

    #we want a multiplicative scale factor for the data and to not have rpm on
    scale_table = utils.parseTable(scale_path, '\t')
    #skip the header row and key the scale value by dataset name
    scale_dict = {}
    for line in scale_table[1:]:
        scale_dict[line[0]] = line[2]

    #first establish the plot folders
    plotFolder_scaled = utils.formatFolder(
        '%sSHEP21_CHIPRX_SCALED/' % (genePlotFolder), True)
    plotFolder_rpm = utils.formatFolder(
        '%sSHEP21_CHIPRX_RPM_NOSCALE/' % (genePlotFolder), True)
    plotFolder_raw = utils.formatFolder(
        '%sSHEP21_CHIPRX_RAW_NOSCALE/' % (genePlotFolder), True)
    plot_prefix = 'HG19_NB_FIGURE_GENES'

    #go by data file
    dataDict = pipeline_dfci.loadDataTable(shep21_chiprx_dataFile)
    #keys() returns a view on python 3 that cannot be indexed, so make a real list
    names_list = list(dataDict.keys())

    #the extension is worked out from the first dataset only and applied to all of them
    bam = utils.Bam(dataDict[names_list[0]]['bam'])
    read_length = bam.getReadLengths()[0]
    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' %
          (shep21_chiprx_dataFile, bam_extension))

    #first do individuals
    for plot_group in ['MYCN', 'H3K4ME3', 'H3K27AC', 'POL2', 'CTCF']:
        plotList = [name for name in names_list if plot_group in name]

        #the factor handed to the plotter is the inverse of the table value, rounded to 4 places
        plot_scale_string = ','.join(
            [str(round(1 / float(scale_dict[name]), 4)) for name in plotList])

        #(name suffix, output folder, rpm flag, extra keyword arguments)
        plot_variants = [
            ('RX_RAW_NOSCALE', plotFolder_raw, False, {}),
            ('RX_RPM_NOSCALE', plotFolder_rpm, True, {}),
            ('RX_SCALED', plotFolder_scaled, False,
             {'scaleFactorString': plot_scale_string}),
        ]
        for suffix, variant_folder, use_rpm, extra_args in plot_variants:
            plotName = '%s_SHEP21_%s_%s' % (plot_prefix, plot_group, suffix)
            print(plotName)
            pipeline_dfci.callBatchPlot(shep21_chiprx_dataFile,
                                        nb_figure_gff_path,
                                        plotName,
                                        variant_folder,
                                        plotList,
                                        uniform=True,
                                        bed=bed_string,
                                        plotType='MULTIPLE',
                                        extension=bam_extension,
                                        multiPage=False,
                                        debug=False,
                                        nameString='',
                                        rpm=use_rpm,
                                        rxGenome='',
                                        **extra_args)

    #now as metas
    #NOTE(review): this section reads the module level shep21_dataFile and the
    #NOSPIKE dataset names rather than shep21_chiprx_dataFile, while reusing the
    #extension computed from the chiprx bams. it looks copied from
    #plot_shep21_genes - confirm this is intended before changing it
    plotList = [
        'SHEP21_0HR_MYCN_NOSPIKE',
        'SHEP21_2HR_MYCN_NOSPIKE',
        'SHEP21_24HR_MYCN_NOSPIKE',
        'SHEP21_0HR_H3K27AC_NOSPIKE',
        'SHEP21_2HR_H3K27AC_NOSPIKE',
        'SHEP21_24HR_H3K27AC_NOSPIKE',
        'SHEP21_0HR_TWIST',
        'SHEP21_2HR_TWIST',
        'SHEP21_24HR_B_TWIST',
        'SHEP21_0HR_POL2_NOSPIKE_R2',
        'SHEP21_2HR_POL2_NOSPIKE',
        'SHEP21_24HR_POL2_NOSPIKE',
    ]
    #one group label per entry of plotList, in the same order
    groupString = 'MYCN,MYCN,MYCN,H3K27AC,H3K27AC,H3K27AC,TWIST,TWIST,TWIST,POL2,POL2,POL2'

    #same merged plot twice, only the y axis scaling mode differs
    for scale_label, uniform_scale in [('RELATIVE', False), ('UNIFORM', True)]:
        plotName = '%s_SHEP21_NOSPIKE_META_%s' % (plot_prefix, scale_label)
        pipeline_dfci.callBatchPlot(shep21_dataFile,
                                    nb_figure_gff_path,
                                    plotName,
                                    plotFolder_rpm,
                                    plotList,
                                    uniform=uniform_scale,
                                    bed=bed_string,
                                    plotType='MERGE',
                                    extension=bam_extension,
                                    multiPage=False,
                                    debug=False,
                                    nameString=groupString,
                                    rpm=True,
                                    rxGenome='')
Example #5
0
def plot_shep21_genes(nb_figure_gff_path, bed_string):
    '''
    plots the shep21 nospike tracks, first one multi track plot per mark and
    then merged meta tracks with relative and uniform scaling

    nb_figure_gff_path: gff of regions to plot, passed through to callBatchPlot
    bed_string: passed through to callBatchPlot as its bed argument

    the data table is the module level shep21_dataFile, it is not a parameter
    plots are written to the SHEP21_NOSPIKE/ folder under the module level genePlotFolder
    returns nothing
    '''

    #first establish the plot folder
    plotFolder = utils.formatFolder('%sSHEP21_NOSPIKE/' % (genePlotFolder),
                                    True)
    plot_prefix = 'HG19_NB_FIGURE_GENES'

    #go by data file
    dataDict = pipeline_dfci.loadDataTable(shep21_dataFile)
    #keys() returns a view on python 3 that cannot be indexed, so make a real list
    names_list = list(dataDict.keys())

    #the extension is worked out from the first dataset only and applied to all of them
    bam = utils.Bam(dataDict[names_list[0]]['bam'])
    read_length = bam.getReadLengths()[0]
    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' %
          (shep21_dataFile, bam_extension))

    #first do individuals, selecting datasets by substring match on the name
    for plot_group in ['MYCN', 'TWIST', 'H3K27AC', 'POL2']:
        plotList = [name for name in names_list if plot_group in name]
        plotName = '%s_SHEP21_%s_NOSPIKE' % (plot_prefix, plot_group)
        print(plotName)
        pipeline_dfci.callBatchPlot(shep21_dataFile,
                                    nb_figure_gff_path,
                                    plotName,
                                    plotFolder,
                                    plotList,
                                    uniform=True,
                                    bed=bed_string,
                                    plotType='MULTIPLE',
                                    extension=bam_extension,
                                    multiPage=False,
                                    debug=False,
                                    nameString='',
                                    rpm=True,
                                    rxGenome='')

    #now as metas
    plotList = [
        'SHEP21_0HR_MYCN_NOSPIKE',
        'SHEP21_2HR_MYCN_NOSPIKE',
        'SHEP21_24HR_MYCN_NOSPIKE',
        'SHEP21_0HR_H3K27AC_NOSPIKE',
        'SHEP21_2HR_H3K27AC_NOSPIKE',
        'SHEP21_24HR_H3K27AC_NOSPIKE',
        'SHEP21_0HR_TWIST',
        'SHEP21_2HR_TWIST',
        'SHEP21_24HR_B_TWIST',
        'SHEP21_0HR_POL2_NOSPIKE_R2',
        'SHEP21_2HR_POL2_NOSPIKE',
        'SHEP21_24HR_POL2_NOSPIKE',
    ]
    #one group label per entry of plotList, in the same order
    groupString = 'MYCN,MYCN,MYCN,H3K27AC,H3K27AC,H3K27AC,TWIST,TWIST,TWIST,POL2,POL2,POL2'

    #same merged plot twice, only the y axis scaling mode differs
    for scale_label, uniform_scale in [('RELATIVE', False), ('UNIFORM', True)]:
        plotName = '%s_SHEP21_NOSPIKE_META_%s' % (plot_prefix, scale_label)
        pipeline_dfci.callBatchPlot(shep21_dataFile,
                                    nb_figure_gff_path,
                                    plotName,
                                    plotFolder,
                                    plotList,
                                    uniform=uniform_scale,
                                    bed=bed_string,
                                    plotType='MERGE',
                                    extension=bam_extension,
                                    multiPage=False,
                                    debug=False,
                                    nameString=groupString,
                                    rpm=True,
                                    rxGenome='')
Example #6
0
def main():

    print('main analysis for project %s' % (projectName))

    print('changing directory to project folder')
    os.chdir(projectFolder)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#======================I. LOADING DATA ANNOTATION======================'
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible

    #for data file
    pipeline_dfci.summary(chip_data_file)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#===========================II. CALLING MACS==========================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #pipeline_dfci.run_macs(chip_data_file,projectFolder,macsFolder,macsEnrichedFolder,wiggleFolder,useBackground=True)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#=======================III. MERGING IRF2 REGIONS======================'
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #create a set of regions representing the intersect of peaks
    #filter out anything that overlaps a peak in the HA ctl

    def merge_regions():
        '''
        builds the conserved, control filtered HA peak set and writes it as a gff

        the peaks of both HK_DOX_HA replicates are pooled and stitched. a stitched
        region is kept only if it overlaps a peak in each dox replicate and
        overlaps no peak from either HK_CTL_HA replicate
        prints the pooled, stitched and kept region counts along the way
        '''
        #dox replicates
        dox_rep1 = utils.importBoundRegion(
            '%sHK_DOX_HA_1_peaks.bed' % (macsEnrichedFolder), 'HK_DOX_HA_1')
        dox_rep2 = utils.importBoundRegion(
            '%sHK_DOX_HA_2_peaks.bed' % (macsEnrichedFolder), 'HK_DOX_HA_2')
        pooled_dox_loci = dox_rep1.getLoci() + dox_rep2.getLoci()

        #control replicates, pooled into one collection for the exclusion test
        ctl_rep1 = utils.importBoundRegion(
            '%sHK_CTL_HA_1_peaks.bed' % (macsEnrichedFolder), 'HK_CTL_HA_1')
        ctl_rep2 = utils.importBoundRegion(
            '%sHK_CTL_HA_2_peaks.bed' % (macsEnrichedFolder), 'HK_CTL_HA_2')
        ctl_collection = utils.LocusCollection(
            ctl_rep1.getLoci() + ctl_rep2.getLoci())

        print(len(pooled_dox_loci))
        stitched_collection = utils.LocusCollection(
            pooled_dox_loci).stitchCollection()
        print(len(stitched_collection))

        #keep regions seen in both dox replicates and absent from the controls
        kept_loci = [
            locus for locus in stitched_collection.getLoci()
            if len(dox_rep1.getOverlap(locus)) > 0
            and len(dox_rep2.getOverlap(locus)) > 0
            and len(ctl_collection.getOverlap(locus)) == 0
        ]
        print(len(kept_loci))

        kept_collection = utils.LocusCollection(kept_loci)
        out_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_0_+0.gff' % (
            gffFolder)
        utils.unParseTable(utils.locusCollectionToGFF(kept_collection),
                           out_gff_path, '\t')

    #merge_regions()

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#======================IV. IDENTIFY ATAC OVERLAP REGIONS==============='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    # atac_bed_path = '%sHG19_combined_atac_-0_+0.bed' % (bedFolder)# all combined atac regions

    # atac_collection = utils.importBoundRegion(atac_bed_path,'HG19_combined_atac')
    # print(len(atac_collection))

    # #now filter the irf2 gff
    # irf2_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_0_+0.gff' % (gffFolder)

    # irf2_collection = utils.gffToLocusCollection(irf2_gff_path)
    # irf2_loci = irf2_collection.getLoci()

    # irf2_atac_loci = [locus for locus in irf2_loci if atac_collection.getOverlap(locus)]
    # print(len(irf2_atac_loci))
    # irf2_atac_collection=utils.LocusCollection(irf2_atac_loci)

    # irf2_atac_gff = utils.locusCollectionToGFF(irf2_atac_collection)
    # irf2_atac_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_ATAC_0_+0.gff' % (gffFolder)
    # utils.unParseTable(irf2_atac_gff,irf2_atac_gff_path,'\t')

    # # overlap with TSS
    # tss_gff_path = '%sHG19_TSS_ALL_-1000_+1000.gff' % (gffFolder)
    # tss_gff = utils.parseTable(tss_gff_path,'\t')
    # tss_collection = utils.gffToLocusCollection(tss_gff)

    # print('tss overlap w/ IRF2  atac peaks')
    # print(len([locus for locus in irf2_atac_loci if tss_collection.getOverlap(locus)]))
    # print(len(irf2_atac_loci))

    # #overlap w/ k27ac
    # k27ac_gff_path = '%sHG19_keratinocyte_combined_all_-0_+0.gff' % (gffFolder)
    # k27ac_gff = utils.parseTable(k27ac_gff_path,'\t')
    # k27ac_collection = utils.gffToLocusCollection(k27ac_gff)

    # print('k27ac overlap w/ IRF2  atac peaks')
    # print(len([locus for locus in irf2_atac_loci if k27ac_collection.getOverlap(locus)]))
    # print(len(irf2_atac_loci))

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#========================V. CALLING ROSE2 META========================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    def wrapRose2Meta(data_file,
                      input_path,
                      parent_folder,
                      active_gene_path='',
                      rank_list=[],
                      control_list=[],
                      analysis_name=''):
        '''
        quick wrapper for Rose2Meta
        writes a bash script holding the ROSE2_META.py command followed by the
        ROSE2_geneMapper.py command. the script is only written, not run

        data_file: data table handed to pipeline_dfci.loadDataTable
        input_path: passed to ROSE2_META.py as -i
        parent_folder: the script is written here and the output folder is made under it
        active_gene_path: if not empty, passed to ROSE2_geneMapper.py as -l
        rank_list: dataset names whose bams are joined with commas for -r
        control_list: dataset names whose bams are joined with commas for -c
        analysis_name: used for -n, the output folder name and the script name

        relies on py27_path, pipeline_dir, genome and blacklist_path from the enclosing scope
        returns nothing
        '''
        dataDict = pipeline_dfci.loadDataTable(data_file)
        rank_string = ','.join([dataDict[name]['bam'] for name in rank_list])
        control_string = ','.join(
            [dataDict[name]['bam'] for name in control_list])

        output_folder = utils.formatFolder(
            '%s%s' % (parent_folder, analysis_name), True)
        rose2_meta_cmd = '%s %sROSE2_META.py -g %s -i %s -r %s -c %s -n %s -o %s -s 0 -t 0 --mask %s' % (
            py27_path, pipeline_dir, genome, input_path, rank_string,
            control_string, analysis_name, output_folder, blacklist_path)

        #the gene mapper is pointed at the enhancer table path built from the output folder and analysis name
        all_enhancer_path = '%s%s_AllEnhancers.table.txt' % (output_folder,
                                                             analysis_name)

        if active_gene_path != '':
            rose2_map_cmd = '%s %sROSE2_geneMapper.py -g %s -i %s -l %s' % (
                py27_path, pipeline_dir, genome, all_enhancer_path,
                active_gene_path)
        else:
            rose2_map_cmd = '%s %sROSE2_geneMapper.py -g %s -i %s' % (
                py27_path, pipeline_dir, genome, all_enhancer_path)

        rose_bash_path = '%s%s_rose2_meta.sh' % (parent_folder, analysis_name)
        #the with block closes the file even if a write fails
        with open(rose_bash_path, 'w') as rose_bash:
            #the file holds shell commands, so it needs a bash shebang and not a python one
            rose_bash.write('#!/usr/bin/bash\n\n')
            rose_bash.write('#setting up bamliquidator\n')

            rose_bash.write('\n\n#ROSE2_CMD\n')
            rose_bash.write(rose2_meta_cmd + '\n')
            rose_bash.write(rose2_map_cmd + '\n')

        print('Wrote ROSE2 META CMD to %s' % (rose_bash_path))

    #use ROSE2 w/ -t 0 and -s 0 to quantify background subtracted AUC at all peaks

    # parent_folder = utils.formatFolder('%smeta_rose/' % (projectFolder),True)

    # blacklist_path = '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed'
    # active_gene_path = '%sgeneListFolder/HG19_KERATINOCYTE_ACTIVE.txt' % (projectFolder)
    # #creating bam lists

    # rank_list = ['HK_DOX_HA_1','HK_DOX_HA_2']
    # control_list =['HK_DOX_WCE_1','HK_DOX_WCE_2']

    # #for all IRF2 HA
    # irf2_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_0_+0.gff' % (gffFolder)
    # analysis_name = 'IRF2_HA'
    # wrapRose2Meta(chip_data_file,irf2_gff_path,parent_folder,active_gene_path,rank_list,control_list,analysis_name)

    # irf2_atac_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_ATAC_0_+0.gff' % (gffFolder)
    # analysis_name = 'IRF2_HA_ATAC'
    # wrapRose2Meta(chip_data_file,irf2_atac_gff_path,parent_folder,active_gene_path,rank_list,control_list,analysis_name)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#================VI. OVERLAPPING IRF2 W/ MOTIF PREDICTIONS============='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    # #load up peaks
    # #irf2_atac_peaks
    # irf2_atac_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_ATAC_0_+0.gff' % (gffFolder)
    # irf2_atac_gff = utils.parseTable(irf2_atac_gff_path,'\t')
    # irf2_atac_loci = utils.gffToLocusCollection(irf2_atac_gff).getLoci()
    # irf2_atac_collection = utils.LocusCollection(irf2_atac_loci)
    # print(len(irf2_atac_loci))

    # irf2_edge_path = '%scrc_atac/keratinocyte_combined_all/keratinocyte_combined_all_EDGE_TABLE_signal_filtered_IRF2_EDGES.txt' % (projectFolder)

    # irf2_edge_table = utils.parseTable(irf2_edge_path,'\t')
    # print(len(irf2_edge_table))

    # irf2_confirmed_edges = []
    # irf2_edge_loci = []
    # for line in irf2_edge_table[1:]:
    #     chrom = line[1].split('(')[0]
    #     coords = [int(x) for x in line[1].split(':')[-1].split('-')]
    #     locus = utils.Locus(chrom,coords[0]-00,coords[1]+00,'.',line[0])
    #     if len(irf2_atac_collection.getOverlap(locus)) > 0:
    #         irf2_confirmed_edges.append(line)
    #     irf2_edge_loci.append(locus)
    # print(len(irf2_confirmed_edges))

    # irf2_confirmed_edge_path = '%scrc_atac/keratinocyte_combined_all/keratinocyte_combined_all_EDGE_TABLE_signal_filtered_IRF2_EDGES_CONFIRMED.txt' % (projectFolder)
    # utils.unParseTable(irf2_confirmed_edges,irf2_confirmed_edge_path,'\t')

    # irf2_edge_collection = utils.LocusCollection(irf2_edge_loci)
    # print(len(irf2_edge_collection))

    # overlap_count = 0
    # for locus in irf2_atac_loci:
    #     search_locus = utils.makeSearchLocus(locus,0,0)
    #     if len(irf2_edge_collection.getOverlap(search_locus)) >0:
    #         overlap_count+=1
    # print(overlap_count)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#=================VII. RUNNING ENHANCER PROMOTER ON IRF2==============='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    def wrap_enhancer_promoter(dataFile,
                               input_path,
                               activity_path,
                               analysis_name,
                               names_list=[],
                               useBackground=True):
        '''
        writes a bash script holding one enhancerPromoter.py command for the
        given datasets. the script is only written, not run

        dataFile: data table handed to pipeline_dfci.loadDataTable
        input_path: passed to enhancerPromoter.py as -i
        activity_path: passed to enhancerPromoter.py as -a
        analysis_name: used for --name and for the script file name
        names_list: dataset names to include, if empty every dataset in the
                    table is used in sorted order
        useBackground: if True the background bam of each dataset is passed as -c

        relies on bedFolder, projectFolder, pipeline_dir and genome from the enclosing scope
        returns the path of the script that was written
        '''

        #hard coded paths
        tads_path = '%shESC_domains_hg19.bed' % (bedFolder)

        #setting the output folder
        ep_folder = utils.formatFolder('%senhancerPromoter/' % (projectFolder),
                                       True)

        dataDict = pipeline_dfci.loadDataTable(dataFile)
        #rebinding to a fresh sorted list keeps the caller's list and the default untouched
        if not names_list:
            names_list = sorted(dataDict.keys())

        bams_string = ' '.join([dataDict[name]['bam'] for name in names_list])

        #build the whole command before opening the file so a failed lookup
        #does not leave a half written script behind
        if useBackground:
            #only resolve backgrounds when they are used, so datasets without
            #one do not fail on a lookup whose result is never needed
            background_string = ' '.join([
                dataDict[dataDict[name]['background']]['bam']
                for name in names_list
            ])
            python_cmd = 'python %senhancerPromoter.py -b %s -c %s -g %s -i %s -o %s -a %s --name %s --tads %s --top 2000\n\n' % (
                pipeline_dir, bams_string, background_string, genome.upper(),
                input_path, ep_folder, activity_path, analysis_name, tads_path)
        else:
            python_cmd = 'python %senhancerPromoter.py -b %s -g %s -i %s -o %s -a %s --name %s --tads %s --top 2000\n\n' % (
                pipeline_dir, bams_string, genome.upper(), input_path,
                ep_folder, activity_path, analysis_name, tads_path)

        ep_bash_path = '%s%s_enhancer_promoter.sh' % (ep_folder, analysis_name)
        #the with block closes the file even if a write fails
        with open(ep_bash_path, 'w') as ep_bash:
            ep_bash.write('#!/usr/bin/bash\n\n\n')
            ep_bash.write('#enhancer promoter analysis for %s\n\n' %
                          (analysis_name))
            ep_bash.write(python_cmd)

        return (ep_bash_path)

    # # blacklist_path = '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed'
    # active_gene_path = '%sgeneListFolder/HG19_KERATINOCYTE_ACTIVE.txt' % (projectFolder)
    # irf2_atac_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_ATAC_0_+0.gff' % (gffFolder)
    # analysis_name = 'IRF2_HA_ATAC'
    # bam_list = ['HK_DOX_HA_1','HK_DOX_HA_2']
    # wrap_enhancer_promoter(chip_data_file,irf2_atac_gff_path,active_gene_path,analysis_name,bam_list,useBackground=True)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#==============VIII. FORMATTING THE HORRIFYING EXPRESSION TABLE========'
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    # exp_path = '%sirf2_kd_rna_seq/single_counts_filtered_counts.txt' % (projectFolder)
    # sample_key_path = '%sirf2_kd_rna_seq/sample_key.txt' % (projectFolder)

    # sample_table = utils.parseTable(sample_key_path,'\t')
    # sample_list = [line[0] for line in sample_table[1:]]
    # print(sample_list)
    # exp_table = utils.parseTable(exp_path,'\t')

    # #for each gene make a dictionary
    # exp_dict = {}

    # #first fill out the dictionary by gene name
    # for line in exp_table[1:]:
    #     gene_name = line[3].replace('"','')
    #     exp_dict[gene_name] = {}

    # print(len(exp_dict.keys()))

    # for line in exp_table[1:]:
    #     gene_name = line[3].replace('"','')
    #     sample_ID = line[4].replace('"','')
    #     counts = line[2]
    #     exp_dict[gene_name][sample_ID] = counts

    # #make the formatted expression table
    # header = ['GENE_NAME'] + sample_list
    # exp_table_formatted = [header]
    # gene_list = exp_dict.keys()
    # gene_list.sort()
    # for gene in gene_list:
    #     exp_line = [gene] + [exp_dict[gene][sample_ID] for sample_ID in sample_list]
    #     exp_table_formatted.append(exp_line)

    # exp_table_formatted_path = '%sirf2_kd_rna_seq/irf2_expression_formatted.txt' % (projectFolder)
    # utils.unParseTable(exp_table_formatted,exp_table_formatted_path,'\t')

    # #with the exp dict we can make a nicer version of the gene table
    # gene_table_path = '%senhancerPromoter/IRF2_HA_ATAC/IRF2_HA_ATAC_GENE_TABLE.txt' % (projectFolder)
    # gene_table_formatted_path = '%senhancerPromoter/IRF2_HA_ATAC/IRF2_HA_ATAC_GENE_TABLE_FORMATTED.txt' % (projectFolder)

    # gene_table = utils.parseTable(gene_table_path,'\t')
    # gene_table_formatted = [gene_table[0] + ['IRF2_TOTAL_SIGNAL'] + header+ ['OLD_IRF2_KD_MEAN','OLD_CTL_MEAN','OLD_IRF2_VS_CTL','YOUNG_IRF2_KD_MEAN','YOUNG_CTL_MEAN','YOUNG_IRF2_VS_CTL']]
    # for line in gene_table[1:]:
    #     if float(line[1]) == 0.0 and float(line[2]) == 0.0:
    #         continue
    #     if exp_dict.has_key(line[0]) == False:
    #         continue
    #     gene = line[0]
    #     #where conditions are met
    #     old_kd_mean = numpy.mean([float(exp_dict[gene][x]) for x in ['IRF2_KD_OLD_1', 'IRF2_KD_OLD_2', 'IRF2_KD_OLD_3']])
    #     old_ctl_mean = numpy.mean([float(exp_dict[gene][x]) for x in ['CT_CRISPR_OLD_1', 'CT_CRISPR_OLD_2', 'CT_CRISPR_OLD_3']])
    #     old_fold = numpy.log2(old_kd_mean/old_ctl_mean)

    #     young_kd_mean = numpy.mean([float(exp_dict[gene][x]) for x in ['IRF2_KD_YOUNG_1', 'IRF2_KD_YOUNG_2', 'IRF2_KD_YOUNG_3']])
    #     young_ctl_mean = numpy.mean([float(exp_dict[gene][x]) for x in ['CT_CRISPR_YOUNG_1', 'CT_CRISPR_YOUNG_2', 'CT_CRISPR_YOUNG_3']])
    #     young_fold = numpy.log2(young_kd_mean/young_ctl_mean)

    #     exp_line = [gene] + [exp_dict[gene][sample_ID] for sample_ID in sample_list] + [round(x,4) for x in [old_kd_mean,old_ctl_mean,old_fold,young_kd_mean,young_ctl_mean,young_fold]]
    #     gene_table_formatted.append(line+[sum([float(x) for x in line[1:3]])] + exp_line)

    # utils.unParseTable(gene_table_formatted,gene_table_formatted_path,'\t')

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#=================IX. ANNOTATING IRF2 KD CLUSTERGRAM==================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #TODO: the python code for this step currently lives on the dropbox and still needs to be moved over

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#=======================X. PLOTTING FIGURE REGIONS ===================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    figure_gff_path = '%sHG19_KERATINOCYTE_FIGURE_2_GENES.gff' % (gffFolder)
    plotName = 'IRF2_FIGURE_2_GENES'
    outputFolder = utils.formatFolder('%sgene_plot/IRF2/' % (projectFolder),
                                      True)
    pipeline_dfci.callBatchPlot(chip_data_file,
                                figure_gff_path,
                                plotName,
                                outputFolder,
                                namesList=['HK_DOX_HA_1', 'HK_DOX_HA_2'],
                                uniform=True,
                                bed='',
                                plotType='MULTIPLE',
                                extension=200,
                                multiPage=False,
                                debug=False,
                                nameString='',
                                rpm=True,
                                rxGenome='',
                                scaleFactorString='')
Example #7
0
def plot_mouse_genes(mouse_dataFile, mouse_figure_gff_path):
    '''
    plots all varieties and iterations of tracks @ lifted over mouse regions

    mouse_dataFile: path of the data table, loaded w/ pipeline_dfci.loadDataTable
    mouse_figure_gff_path: gff of regions handed to pipeline_dfci.callBatchPlot

    makes one MULTIPLE plot per factor group (_MYCN, H3K27AC) and two MERGE
    (meta) plots, one relative and one uniform. all output goes to the THMYCN/
    folder under genePlotFolder. nothing is returned
    '''

    #first establish the plot folder
    plotFolder = utils.formatFolder('%sTHMYCN/' % (genePlotFolder), True)
    plot_prefix = 'MM9_NB_FIGURE_GENES_LIFTOVER'

    #we also have to set the extension properly between datasets

    #go by data file
    dataDict = pipeline_dfci.loadDataTable(mouse_dataFile)
    #keys() is a non indexable view in python 3 so it is turned into a list
    names_list = list(dataDict.keys())
    #initial check for consistency of read lengths
    # for name in names_list:
    #     bam = utils.Bam(dataDict[name]['bam'])
    #     read_length = bam.getReadLengths()[0]
    #     bam_extension = 200-read_length
    #     print('For dataset %s in %s using an extension of %s' % (name,mouse_dataFile,bam_extension))
    # sys.exit()

    #the first dataset sets the extension for every dataset in the table
    bam = utils.Bam(dataDict[names_list[0]]['bam'])
    read_length = bam.getReadLengths()[0]
    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' %
          (mouse_dataFile, bam_extension))

    #keyword arguments that are identical for every callBatchPlot call below
    shared_kwargs = {
        'bed': '',
        'extension': bam_extension,
        'multiPage': False,
        'debug': False,
        'rpm': True,
        'rxGenome': '',
    }

    #first do individuals
    for plot_group in ['_MYCN', 'H3K27AC']:
        plotList = [
            name for name in names_list
            if name.upper().count(plot_group) > 0
        ]
        print(plotList)
        #the _MYCN group already carries its own leading underscore
        if plot_group == '_MYCN':
            plotName = '%s_THMYCN%s' % (plot_prefix, plot_group)
        else:
            plotName = '%s_THMYCN_%s' % (plot_prefix, plot_group)

        print(plotName)
        pipeline_dfci.callBatchPlot(mouse_dataFile,
                                    mouse_figure_gff_path,
                                    plotName,
                                    plotFolder,
                                    plotList,
                                    uniform=True,
                                    plotType='MULTIPLE',
                                    nameString='',
                                    **shared_kwargs)

    #now as metas
    #we only have 3 good k27ac and 3 good mycn datasets
    plotList = [
        'CG_H3K27Ac',
        'SCG_H3K27Ac',
        'THMYCN1_H3K27Ac',
        'THMYCN_139423_H3K27Ac',
        'THMYCN_139076_H3K27Ac',
        'THMYCN2_MYCN',
        'THMYCN_139076_MYCN',
        'THMYCN_139423_MYCN',
    ]
    #one group label per entry of plotList, in the same order
    groupString = 'CG_,SCG,H3K27AC,H3K27AC,H3K27AC,MYCN,MYCN,MYCN'

    #same datasets plotted twice, differing only in the uniform flag
    for meta_suffix, meta_uniform in [('RELATIVE', False), ('UNIFORM', True)]:
        plotName = '%s_THMYCN_META_%s' % (plot_prefix, meta_suffix)
        pipeline_dfci.callBatchPlot(mouse_dataFile,
                                    mouse_figure_gff_path,
                                    plotName,
                                    plotFolder,
                                    plotList,
                                    uniform=meta_uniform,
                                    plotType='MERGE',
                                    nameString=groupString,
                                    **shared_kwargs)