Ejemplo n.º 1
0
def main():
    """Summarize the ChIP and ATAC data tables, then call ROSE2 for MM1S.

    Changes the working directory to ``projectFolder``, runs
    ``pipeline_dfci.summary`` on ``chip_data_file`` and ``atac_data_file``,
    and passes the MM1S_H3K27AC and MM1S_MED1 datasets to
    ``pipeline_dfci.callRose2``.
    """

    def banner(title_line):
        # section header: blank padding, rule, title, rule, blank padding
        rule = '#======================================================================'
        for text in ('\n\n', rule, title_line, rule, '\n\n'):
            print(text)

    print('main analysis for project %s' % projectName)

    print('changing directory to project folder')
    os.chdir(projectFolder)

    banner('#==================I. LOADING DATA ANNOTATION TABLES===================')

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible.

    # chip data table
    pipeline_dfci.summary(chip_data_file)

    # atac data table
    pipeline_dfci.summary(atac_data_file)

    banner('#==========================II. CALLING ROSE2===========================')

    # folder with macs peak output beds
    peaks_dir = '%smacsEnriched/' % projectFolder

    # create a folder to store ROSE2 output
    rose_dir = utils.formatFolder('%srose/' % projectFolder, True)

    # calling ROSE2 on H3K27AC and MED1 defined enhancers
    targets = ['MM1S_H3K27AC', 'MM1S_MED1']

    rose_options = {
        'extraMap': [],
        'inputFile': '',
        'tss': 2500,
        'stitch': 12500,
        'bashFileName': '%sMM1S_ROSE_CALLS.sh' % rose_dir,
        'mask': '%sgenomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed'
        % projectFolder,
        'useBackground': True,
    }
    pipeline_dfci.callRose2(chip_data_file, peaks_dir, rose_dir, targets,
                            **rose_options)
Ejemplo n.º 2
0
def main():
    """Enter the project folder and summarize the mouse ChIP-Seq data table."""

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    # section header: blank padding, rule, title, rule, blank padding
    rule = '#======================================================================'
    title = '#======================I, LOADING DATA ANNOTATION======================'
    for text in ('\n\n', rule, title, rule, '\n\n'):
        print(text)

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible.

    # ChIP-Seq table
    pipeline_dfci.summary(mouse_dataFile)
Ejemplo n.º 3
0
def main():
    """Enter the project folder and summarize the project data table."""

    print('main analysis for project %s' % projectName)

    print('changing directory to project folder')
    os.chdir(projectFolder)

    # section header: blank padding, rule, title, rule, blank padding
    rule = '#======================================================================'
    title = '#======================I. LOADING DATA ANNOTATION======================'
    for text in ('\n\n', rule, title, rule, '\n\n'):
        print(text)

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible.

    # data table
    pipeline_dfci.summary(data_file)
Ejemplo n.º 4
0
def main():
    """Summarize the three RNA-Seq data tables and run the BE2C JQ1 line plots.

    Changes the working directory to ``projectFolder``, summarizes each
    RNA-Seq data table, creates the figure output folder and calls
    ``wrap_be2c_jq1``.
    """

    def banner(title_line):
        # section header: blank padding, rule, title, rule, blank padding
        rule = '#======================================================================'
        for text in ('\n\n', rule, title_line, rule, '\n\n'):
            print(text)

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    banner('#======================I, LOADING DATA ANNOTATION======================')

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible.

    # RNA-Seq tables
    pipeline_dfci.summary(shep21_rna_dataFile)
    pipeline_dfci.summary(be2c_rna_drug_dataFile)
    pipeline_dfci.summary(be2c_rna_twist_dataFile)

    banner('#===================II, RUNNING LINE PLOT SCRIPTS======================')

    # make the folder to store output figures
    figure_dir = '%sfigures/6_rna_line_plots/' % projectFolder
    utils.formatFolder(figure_dir, True)

    # we have 3 RNA-Seq datasets

    # first is shep21 at the mycn conserved regions w/ the replicate dropped
    # and at shep21 defined regions (currently disabled)
    #wrap_shep21()
    wrap_be2c_jq1()
Ejemplo n.º 5
0
def main():
    """Summarize the three RNA-Seq data tables and process the BE2C drug bams.

    Changes the working directory to ``projectFolder``, summarizes each
    RNA-Seq data table and calls ``process_be2c_drug_rna`` with the BE2C
    drug data table and ``gtfFile``.
    """

    def banner(title_line):
        # section header: blank padding, rule, title, rule, blank padding
        rule = '#======================================================================'
        for text in ('\n\n', rule, title_line, rule, '\n\n'):
            print(text)

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    banner('#======================I, LOADING DATA ANNOTATION======================')

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible.

    # RNA-Seq tables
    pipeline_dfci.summary(shep21_rna_dataFile)
    pipeline_dfci.summary(be2c_rna_drug_dataFile)
    pipeline_dfci.summary(be2c_rna_twist_dataFile)

    banner('#====================II. PROCESSING RNA_SEQ BAMS=======================')

    # the two shep21 runs are currently disabled
    #shep21_bash_path = process_shep_rna(shep21_rna_dataFile,gtfFile)

    #shep21_drop_rep_bash_path = process_shep_rna_drop_rep(shep21_rna_dataFile,gtfFile)

    # the returned value is kept in a local but not used further in this function
    be2c_drug_bash_path = process_be2c_drug_rna(
        be2c_rna_drug_dataFile, gtfFile)
Ejemplo n.º 6
0
def main():
    """Run the currently enabled steps of the mouse MYCN ChIP-Seq analysis.

    Apart from printing section banners, the live code summarizes
    ``mouse_dataFile``, pools the MACS enriched regions of three MYCN
    datasets into one gff, and writes a table counting how many of those
    regions (extended by 10 kb on each side) overlap the conserved enhancer
    set, the conserved promoter set, or neither.  The other sections keep
    their earlier calls as commented-out code.
    """

    def banner(title_line):
        # section header: blank padding, rule, title, rule, blank padding
        rule = '#======================================================================'
        for text in ('\n\n', rule, title_line, rule, '\n\n'):
            print(text)

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    banner('#======================I, LOADING DATA ANNOTATION======================')

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible.

    # ChIP-Seq table
    pipeline_dfci.summary(mouse_dataFile)

    banner('#==========================II. CALLING MACS============================')

    #running peak finding using macs 1.4.2 on all chip datasets
    #this usually takes ~2-3 hours on a reasonably fast machine
    #a 3 hour time out on this entire operation is set
    #if peak calling takes longer than 3 hours, simply run the script again after completion
    #run_macs(mouse_dataFile)

    banner('#=================II. DEFINING ACTIVE GENES IN MOUSE===================')

    #here we will identify active promoters in various contexts as those with
    #an H3K27AC peak in the +/- 1kb tss region
    #UCSC refseq annotations are used for all genes

    #make_active_gene_lists(mouse_dataFile)

    banner('#==================III. CALLING ROSE TO MAP ENHANCERS==================')

    # #for SCG_H3K27AC
    # analysisName = 'SCG_H3K27AC'
    # namesList = ['SCG_H3K27Ac']
    # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList)

    # #for CG_H3K27AC
    # analysisName = 'CG_H3K27AC'
    # namesList = ['CG_H3K27Ac']
    # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList)

    # #for GANGLIA_H3K27AC
    # analysisName = 'GANGLIA_H3K27AC'
    # namesList = ['CG_H3K27Ac','SCG_H3K27Ac']
    # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList)

    # #for THMYCN
    # analysisName = 'THMYCN_H3K27AC'
    # namesList = ['THMYCN_139076_H3K27Ac','THMYCN_139423_H3K27Ac','THMYCN1_H3K27Ac']
    # bashFileName,region_map_path,namesList=define_enhancer_landscape(mouse_dataFile,analysisName,namesList)

    banner('#=================IV. LIFTING OVER NB CONSERVED REGIONS================')

    # #liftover a pair of gffs
    # #first convert to bed
    # nb_promoter_gff_path = '%sgff/HG19_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.gff' % (hg19_projectFolder)
    # nb_enhancer_gff_path = '%sgff/HG19_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.gff' % (hg19_projectFolder)

    # nb_promoter_bed_path ='%sbeds/HG19_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.bed' % (hg19_projectFolder)
    # nb_enhancer_bed_path ='%sbeds/HG19_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.bed' % (hg19_projectFolder)

    # nb_promoter_gff = utils.parseTable(nb_promoter_gff_path,'\t')
    # nb_enhancer_gff = utils.parseTable(nb_enhancer_gff_path,'\t')

    # utils.gffToBed(nb_promoter_gff,nb_promoter_bed_path)
    # utils.gffToBed(nb_enhancer_gff,nb_enhancer_bed_path)

    # print('converted NB conserved gffs to beds at %s and %s' % (nb_promoter_bed_path,nb_enhancer_bed_path))

    # #note, now you have to liftover manually to create beds
    # mm9_promoter_bed_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.bed' % (bedFolder)
    # mm9_enhancer_bed_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.bed' % (bedFolder)

    # mm9_promoter_gff_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.gff' % (gffFolder)
    # mm9_enhancer_gff_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.gff' % (gffFolder)

    # utils.bedToGFF(mm9_promoter_bed_path,mm9_promoter_gff_path)
    # utils.bedToGFF(mm9_enhancer_bed_path,mm9_enhancer_gff_path)

    # print('writing mm9 nb mycn sites to %s and %s' % (mm9_promoter_gff_path,mm9_enhancer_gff_path))

    banner('#======================V. MAPPING ENRICHED TO GFFS=====================')

    # setName = 'THMYCN'
    # gffList = [mm9_promoter_gff_path,mm9_enhancer_gff_path]
    # cellTypeList = ['THMYCN1','THMYCN2','THMYCN','CG','SCG']
    # mapList = ['CG_H3K27Ac',
    #             'SCG_H3K27Ac',
    #             'THMYCN1_H3K27Ac',
    #             'THMYCN_139423_H3K27Ac',
    #             'THMYCN_139076_H3K27Ac',
    #             ]

    # #pipeline_dfci.mapEnrichedToGFF(mouse_dataFile,setName,gffList,cellTypeList,macsEnrichedFolder,mappedEnrichedFolder,macs=True,namesList=mapList,useBackground=True)

    # #summarize info for venn diagrams for each

    # promoter_mapped_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000/MM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_THMYCN.txt' % (mappedEnrichedFolder)
    # promoter_venn_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_VENN.txt' % (tableFolder)
    # summarizeVenn(promoter_mapped_path,group_list = ['CG','THMYCN'],output=promoter_venn_path)

    # enhancer_mapped_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000/MM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_THMYCN.txt' % (mappedEnrichedFolder)
    # enhancer_venn_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_VENN.txt' % (tableFolder)
    # summarizeVenn(enhancer_mapped_path,group_list = ['CG','THMYCN'],output=enhancer_venn_path)

    banner('#=====================VI. MAKING MYCN REGIONS GFF======================')

    data_table = pipeline_dfci.loadDataTable(mouse_dataFile)
    mycn_datasets = (
        'THMYCN2_MYCN',
        'THMYCN_139076_MYCN',
        'THMYCN_139423_MYCN',
    )

    # pool the enriched regions of every MYCN dataset into one list of loci
    pooled_loci = []
    for dataset in mycn_datasets:
        peak_path = '%s%s' % (macsEnrichedFolder,
                              data_table[dataset]['enrichedMacs'])
        pooled_loci.extend(
            utils.importBoundRegion(peak_path, dataset).getLoci())

    mycn_regions = utils.LocusCollection(pooled_loci, 50)
    # NOTE(review): the return value of stitchCollection() is discarded. If
    # it returns a new collection instead of stitching in place, the gff
    # below is built from the unstitched loci -- confirm against utils.
    mycn_regions.stitchCollection()
    mycn_gff = utils.locusCollectionToGFF(mycn_regions)
    utils.unParseTable(mycn_gff,
                       '%sMM9_THMYCN_MYCN_-0_+0.gff' % gffFolder, '\t')

    # make collections
    promoter_set = utils.gffToLocusCollection(
        '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000.gff' % gffFolder)
    enhancer_set = utils.gffToLocusCollection(
        '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000.gff' % gffFolder)

    # make the overlap table; an enhancer overlap takes precedence over a
    # promoter overlap, and the promoter set is only queried when no
    # enhancer overlap was found
    header = ['PROMOTER', 'ENHANCER', 'NONE']
    tally = {label: 0 for label in header}
    for row in mycn_gff:
        window = utils.Locus(row[0],
                             int(row[3]) - 10000,
                             int(row[4]) + 10000, '.')
        if enhancer_set.getOverlap(window, 'both'):
            tally['ENHANCER'] += 1
        elif promoter_set.getOverlap(window, 'both'):
            tally['PROMOTER'] += 1
        else:
            tally['NONE'] += 1

    overlap_rows = [header, [tally[label] for label in header]]
    utils.unParseTable(overlap_rows,
                       '%sMM9_THMYCN_OVERLAP.txt' % tableFolder, '\t')

    banner('#=====================VI. MAPPING GFFS FOR HEATMAP=====================')

    #map_for_heatmap(mouse_dataFile)

    banner('#=====================VII. AVERAGING MAPPED SIGNAL=====================')

    # set_list = ['GANGLIA_H3K27AC','THMYCN_H3K27AC','THMYCN_MYCN']
    # set_names = [
    #     ['CG_H3K27Ac','SCG_H3K27Ac'],
    #     ['THMYCN1_H3K27Ac','THMYCN_139423_H3K27Ac','THMYCN_139076_H3K27Ac'],
    #     ['THMYCN2_MYCN','THMYCN_139076_MYCN','THMYCN_139423_MYCN']
    # ]
    # for i in range(len(set_list)):
    #     setName = set_list[i]
    #     names_list =set_names[i]
    #     print(setName)
    #     print(names_list)
    #     #for promoters
    #     mapped_list = ['%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000/MM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_%s.gff' % (mappedFolder,name) for name in names_list]
    #     output_path = '%sMM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000/MM9_NB_MYCN_CONSERVED_PROMOTER_-5000_+5000_%s.gff' % (mappedFolder,setName)
    #     print(output_path)
    #     averagingMappedSignal(mapped_list,output_path,setName)

    #     #for enhancers
    #     mapped_list = ['%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000/MM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_%s.gff' % (mappedFolder,name) for name in names_list]
    #     output_path = '%sMM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000/MM9_NB_MYCN_CONSERVED_ENHANCER_-5000_+5000_%s.gff' % (mappedFolder,setName)
    #     print(output_path)
    #     averagingMappedSignal(mapped_list,output_path,setName)

    banner('#=====================VIII. MAKING HEATMAPS/METAS======================')
Ejemplo n.º 7
0
def main():
    """Build the mm9 figure-region gff and plot the mouse data over it.

    Converts the HG19 figure-gene gff to a bed file, reads the lifted-over
    mm9 bed back in as gff rows, appends a set of manually defined regions,
    writes the combined gff and passes it to ``plot_mouse_genes``.
    """

    def banner(title_line):
        # section header: blank padding, rule, title, rule, blank padding
        rule = '#======================================================================'
        for text in ('\n\n', rule, title_line, rule, '\n\n'):
            print(text)

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    banner('#======================I, LOADING DATA ANNOTATION======================')

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible.

    # ChIP-Seq table
    pipeline_dfci.summary(mouse_dataFile)

    banner('#============II. MAKING A BED OUT OF HG19 FIGURE REGIONS===============')

    human_gff = utils.parseTable(
        '%sgff/HG19_NB_FIGURE_GENES.gff' % hg19_projectFolder, '\t')
    print(human_gff)

    human_bed = utils.gffToBed(human_gff)
    print(human_bed)
    utils.unParseTable(
        human_bed, '%sbeds/HG19_NB_FIGURE_GENES.bed' % hg19_projectFolder,
        '\t')
    # need to manually lift this over to mm9
    # https://genome.ucsc.edu/cgi-bin/hgLiftOver

    lifted_bed_path = '%sMM9_NB_FIGURE_GENES_LIFTOVER.bed' % bedFolder
    mm9_gff_path = '%sMM9_NB_FIGURE_GENES_LIFTOVER.gff' % gffFolder
    lifted_gff = utils.bedToGFF(lifted_bed_path)

    # now add some additional manual regions, given as
    # (chrom, name, start, end, strand) and expanded to 9-column gff rows
    manual_regions = (
        ('chr12', 'TWIST1_ENHANCER', 34639818, 34656263, '-'),
        ('chr11', 'NPM1_PROMOTER_2', 33049820, 33065883, '+'),
        ('chr6', 'GATA2_ENHANCER', 88135802, 88159867, '+'),
        ('chr7', 'PHOX2A', 108964211, 108974610, '+'),
        ('chr15', 'LET7B', 85497440, 85538754, '+'),
        ('chr10', 'LIN28B', 45161233, 45217227, '-'),
    )
    manual_rows = [
        [chrom, label, label, start, end, '', strand, '', label]
        for chrom, label, start, end, strand in manual_regions
    ]

    utils.unParseTable(lifted_gff + manual_rows, mm9_gff_path, '\t')

    banner('#=======================III. PLOTTING DATA IN MOUSE====================')

    # plot mouse regions
    plot_mouse_genes(mouse_dataFile, mm9_gff_path)
Ejemplo n.º 8
0
def main():
    """Run MACS on the H3K27ac datasets and map their enhancer landscape.

    Changes the working directory to ``projectFolder``, summarizes
    ``chip_data_file``, runs ``pipeline_dfci.run_macs`` on the selected
    datasets, then calls ``define_enhancer_landscape`` and executes the
    returned bash script unless ``utils.checkOutput`` finds the region map.
    """

    def banner(title_line):
        # section header: blank padding, rule, title, rule, blank padding
        rule = '#======================================================================'
        for text in ('\n\n', rule, title_line, rule, '\n\n'):
            print(text)

    def k27ac_names(table):
        # names containing '27ac' exactly once and no 'WCE' (any case)
        return [
            name for name in table.keys()
            if name.count('27ac') == 1 and 'WCE' not in name.upper()
        ]

    print('main analysis for project %s' % projectName)

    print('changing directory to project folder')
    os.chdir(projectFolder)

    banner('#======================I. LOADING DATA ANNOTATION======================')

    # This section sanity checks each data table and makes sure both bam
    # and .bai files are accessible.

    # data table
    pipeline_dfci.summary(chip_data_file)

    banner('#==========================II. RUNNING MACS============================')

    macs_targets = k27ac_names(pipeline_dfci.loadDataTable(chip_data_file))

    #chip_list= [name for name in data_dict.keys() if name.upper().count('WCE') == 0]
    #print(chip_list)

    pipeline_dfci.run_macs(chip_data_file, projectFolder, macsFolder,
                           macsEnrichedFolder, wiggleFolder, True,
                           macs_targets)

    banner('#===================III. CALL ROSE INDIVIDUALLY========================')

    analysis_name = 'HUMAN_LIVER_H3K27AC'
    # the formatted path is kept in a local but not used further here
    parentFolder = utils.formatFolder('%s%s' % (roseFolder, analysis_name),
                                      True)

    # the table is loaded again after MACS, as in the section above
    rose_targets = k27ac_names(pipeline_dfci.loadDataTable(chip_data_file))

    #pipeline_dfci.callRose2(chip_data_file,macsEnrichedFolder,parentFolder,namesList=k27ac_list,extraMap = [],inputFile='',tss=2500,stitch='',bashFileName ='',mask=maskFile,useBackground=True,py27_path =py27_path)

    # run rose2 wrapper for both
    rose_script, region_map_path, mapped_names = define_enhancer_landscape(
        projectFolder, pipeline_dir, chip_data_file, analysis_name,
        rose_targets)
    print(rose_script, region_map_path, mapped_names)

    # runs only if no output detected
    if utils.checkOutput(region_map_path, 0, 0):
        return
    print(rose_script)
    os.system('bash %s' % rose_script)
Ejemplo n.º 9
0
# Disabled: list the dataset names. dataDict is only assigned further down,
# so these lines cannot simply be uncommented at this position.
#namesList = dataDict.keys()

#print(namesList)

#==========================================================================
#=======================LOADING DATA ANNOTATION============================
#==========================================================================

##THIS SECTION LOADS A DATA TABLE.  MUST BE UNCOMMENTED FOR REST OF CODE TO WORK

#LOADING THE DATA TABLE
# dataDict is bound at module level; the names it holds are printed below.
dataDict = pipeline_dfci.loadDataTable(dataFile)
print(dataDict.keys())

# Report on the data table itself (the details are in pipeline_dfci.summary).
pipeline_dfci.summary(dataFile)

#==========================================================================
#==========================CALLING BOWTIE==================================
#==========================================================================

##THIS SECTION CALLS BOWTIE ON RAW READ FILES TO GENERATE SORTED AND INDEXED BAMS IN THE BAM FOLDER

#namesList = []  <- fill this in if you want to only map a subset of the data. otherwise leave blank

##SET LAUNCH TO False to debug
#pipeline_dfci.makeBowtieBashJobs(dataFile,namesList,launch=True)

#==========================================================================
#=============================CALL MACS====================================
#==========================================================================
Ejemplo n.º 10
0
def main():
    """Summarize the data tables and launch the CRC analysis for MM1S.

    Changes the working directory to ``projectFolder``, summarizes
    ``chip_data_file`` and ``atac_data_file``, checks the four CRC input
    files with ``utils.checkOutput`` and then calls
    ``pipeline_dfci.call_crc``.

    Raises:
        SystemExit: with exit status 1 as soon as one of the input files
            cannot be found, so the calling shell or scheduler sees the run
            as failed.  (A bare ``sys.exit()`` would exit with status 0.)
    """

    print('main analysis for project %s' % (projectName))

    print('changing directory to project folder')
    os.chdir(projectFolder)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#==================I. LOADING DATA ANNOTATION TABLES==================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible

    #for chip data file
    pipeline_dfci.summary(chip_data_file)

    #for atac data file
    pipeline_dfci.summary(atac_data_file)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#==========================III. CALLING CRC3==========================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #running circuitry on the consensus system
    #creates a sbatch bash script
    crc_folder = '%scrc/' % (projectFolder)

    analysis_name = 'MM1S'
    enhancer_path = '%srose/MM1S_H3K27AC_ROSE/MM1S_H3K27AC_peaks_SuperEnhancers_ENHANCER_TO_GENE.txt' % (
        projectFolder)
    subpeak_path = '%smacsEnriched/MM1S_ATAC.bt2.srt.rmdup.macs14_peaks.bed' % (
        projectFolder)
    activity_path = '%stables/MM1S_EXPRESSION_ACTIVITY.txt' % (projectFolder)
    config_path = '%scrc_config.txt' % (whereAmI)
    #extra args

    args = '--config %s' % (config_path)

    print('ESTABLISHING INPUT FILES')
    for file_path in [enhancer_path, activity_path, subpeak_path, config_path]:
        if utils.checkOutput(file_path, 0.1, 0.1):
            print('FOUND %s' % (file_path))
        else:
            print('UNABLE TO FIND %s' % (file_path))
            # non-zero status: a missing input is a failure, not a clean exit
            sys.exit(1)

    pipeline_dfci.call_crc(analysis_name,
                           enhancer_path,
                           subpeak_path,
                           activity_path,
                           genome,
                           crc_folder,
                           args,
                           py27_path='')
def main():
    """Summarize the data tables, write the CRC genome config, check tools.

    Changes the working directory to ``projectFolder``, summarizes
    ``chip_data_file`` and ``atac_data_file``, writes a tab-delimited
    ``crc_config.txt`` under ``whereAmI`` holding the genome directory and
    mask file for ``genome``, and finally verifies that the external
    executables are on the search path.

    Raises:
        ValueError: if ``bamliquidator``, ``bamliquidator_batch.py`` or
            ``fimo`` cannot be found on the search path.
    """

    print('main analysis for project %s' % (projectName))

    print('changing directory to project folder')
    os.chdir(projectFolder)

    print('\n\n')
    print('#======================================================================')
    print('#==================I. LOADING DATA ANNOTATION TABLES===================')
    print('#======================================================================')
    print('\n\n')

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible

    #for chip data file
    pipeline_dfci.summary(chip_data_file)

    #for atac data file
    pipeline_dfci.summary(atac_data_file)

    print('\n\n')
    print('#======================================================================')
    print('#=====================II. CONFIGURING GENOME BUILD=====================')
    print('#======================================================================')
    print('\n\n')

    genome_directory = '%sgenomes/Homo_sapiens/UCSC/hg19/Sequence/Chromosomes/' % (projectFolder)
    mask_file = '%sgenomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed' % (projectFolder)

    # one BUILD:FIELD:PATH entry per row, written as a single-column table
    config_table = [['BUILD:FIELD:PATH'],
                    ['%s:%s:%s' % (genome, 'genome_directory', genome_directory)],
                    ['%s:%s:%s' % (genome, 'mask_file', mask_file)],
                    ]
    config_path = '%scrc_config.txt' % (whereAmI)

    utils.unParseTable(config_table, config_path, '\t')
    print('writing genome configuration to %s' % (config_path))

    print('\n\n')
    print('#======================================================================')
    print('#==================III. DETECTING DEPENDENCIES=========================')
    print('#======================================================================')
    print('\n\n')

    # distutils was removed in Python 3.12; prefer shutil.which and fall
    # back to distutils only where shutil.which does not exist
    try:
        from shutil import which as find_executable
    except ImportError:
        from distutils.spawn import find_executable

    # Try to find bamliquidator, bamliquidator_batch.py, and fimo.
    # Each tool is tested against its own lookup result; the earlier code
    # re-tested the bamliquidator result for all three, so a missing
    # bamliquidator_batch.py or fimo was reported as found.
    for tool in ('bamliquidator', 'bamliquidator_batch.py', 'fimo'):
        if find_executable(tool) is None:
            raise ValueError('%s not found in path' % (tool))
        print('found %s' % (tool))
def main():
    """Drive the NIBR keratinocyte ATAC-seq analysis.

    Live steps, in order:
      1. change the working directory to ``projectFolder``;
      2. run ``pipeline_dfci.summary`` on ``atac_dataFile``;
      3. load the ``*_riesling.txt`` variant of the data table and write a
         bash script that calls ``clusterEnhancer.py`` on every dataset in it.

    The clustering script is only written to disk; nothing in this function
    launches it. The mapping, riesling, bam-formatting and MACS steps are kept
    below as commented-out records of work that is not re-run from here.

    Relies on module-level names defined elsewhere in the file:
    ``projectFolder``, ``atac_dataFile``, ``py27_path``, ``pipeline_dir``,
    ``maskFile``, ``utils`` and ``pipeline_dfci``.
    """

    print('main analysis for NIBR keratinocyte project atac-seq analysis')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    print('\n\n')
    print('#======================================================================')
    print('#=======================I. CHECKING DATA TABLES========================')
    print('#======================================================================')
    print('\n\n')

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible

    pipeline_dfci.summary(atac_dataFile)

    print('\n\n')
    print('#======================================================================')
    print('#=========================II. MAPPING FASTQS===========================')
    print('#======================================================================')
    print('\n\n')

    # #for atac need no unaligned and no discordant

    # #atac_params =  '--end-to-end --sensitive --no-unal --no-discordant --mm --met-stderr --time'
    # #pipeline_dfci.makeBowtieBashJobsSlurm(atac_dataFile,namesList = [],launch=True,overwrite=False,pCount=16,paramString=atac_params)

    print('\n\n')
    print('#======================================================================')
    print('#======================III. RUNNING RIESLING===========================')
    print('#======================================================================')
    print('\n\n')

    # #riesling is an ATAC-seq pipeline jointly developed by our lab and the Gordon lab at WUSTL

    # #it sanitizes the bams removing duplicate and mitochondrial reads

    # riesling_dir = utils.formatFolder('%sriesling/' % (projectFolder),True)
    # input_dir = utils.formatFolder('/storage/cylin/grail/bam/hg19/NIBR_YvsO/ATACseq_YvsO/')
    # output_dir = utils.formatFolder('/storage/cylin/grail/bam/hg19/NIBR_YvsO/ATACseq_YvsO/riesling/',True)

    # analysis_name = 'NIBR_ATAC_NEW'

    # riesling_bash_path = '%s%s_riesling.sh' % (riesling_dir,analysis_name)

    # riesling_bash = open(riesling_bash_path,'w')

    # riesling_bash.write('#!/usr/bin/bash\n\n')

    # #now write the sbatch headers
    # riesling_bash.write('#SBATCH -n 32\n#SBATCH --mem=512000\n')
    # riesling_bash.write('#SBATCH -o %s%s_reisling_slurm_%%j.out\n' % (riesling_dir,analysis_name))
    # riesling_bash.write('#SBATCH -e %s%s_reisling_slurm_%%j.err\n' % (riesling_dir,analysis_name))
    # riesling_bash.write('pwd; hostname; date\n\n')

    # riesling_bash.write('cd /storage/cylin/bin/riesling-pipeline/\n')
    # riesling_bash.write('%s 2-sanitize-bam.py -i %s -o %s -g %s -v\n' % (py27_path,input_dir, output_dir,genome))
    # riesling_bash.close()

    # print('writing riesling bam commands to %s' % (riesling_bash_path))

    print('\n\n')
    print('#======================================================================')
    print('#=====================IV. FORMATTING RIESLING BAMS=====================')
    print('#======================================================================')
    print('\n\n')

    # atac_table = utils.parseTable(atac_dataFile,'\t')
    # #now fix the path to the right bam
    # for i in range(1,len(atac_table)):
    #     atac_table[i][0] = atac_table[i][0] + 'riesling/'

    # atac_dataFile_riesling = atac_dataFile.replace('.txt','_riesling.txt')
    # new_table = utils.unParseTable(atac_table,atac_dataFile_riesling,'\t')

    # #now we need to index all of the bams
    # dataDict = pipeline_dfci.loadDataTable(atac_dataFile_riesling)

    # names_list = dataDict.keys()
    # bam_directory = dataDict[names_list[0]]['folder']
    # bam_file_list = ['%s%s' % (bam_directory, x) for x in os.listdir(bam_directory) if x.split('.')[-1] == 'bam']
    # print(bam_file_list)
    # for bam_path in bam_file_list:
    #     index_cmd = 'samtools index %s' % (bam_path)
    #     print(index_cmd)
    #     os.system(index_cmd)

    print('\n\n')
    print('#======================================================================')
    print('#========================V. RUNNING PEAK CALLING=======================')
    print('#======================================================================')
    print('\n\n')

    # The riesling data table is expected to exist already (it is produced by
    # the commented-out section IV above); only its path is derived here.
    atac_dataFile_riesling = atac_dataFile.replace('.txt', '_riesling.txt')
    #run_macs(atac_dataFile_riesling,False)

    print('\n\n')
    print('#======================================================================')
    print('#========================V. RUNNING CLUSTERING ========================')
    print('#======================================================================')
    print('\n\n')

    dataDict = pipeline_dfci.loadDataTable(atac_dataFile_riesling)
    # sorted() instead of keys() followed by list.sort(): on Python 3 keys()
    # returns a view object that has no sort() method.
    atac_list = sorted(dataDict.keys())
    print(atac_list)

    analysis_name = 'keratinocyte_atac'
    # presumably the True flag asks formatFolder to create missing folders --
    # TODO confirm against utils.formatFolder
    cluster_folder = utils.formatFolder('%sclustering' % (projectFolder), True)
    cluster_rose_folder = utils.formatFolder(
        '%sclustering_rose' % (projectFolder), True)
    output_folder = utils.formatFolder(
        '%s%s_clustering' % (cluster_folder, analysis_name), True)
    names_string = ','.join(atac_list)
    cluster_bash_path = '%s%s_clustering.sh' % (cluster_folder, analysis_name)

    # Build the command before opening the file so a failure here cannot
    # leave a half-written script behind.
    cluster_cmd = '%s %sclusterEnhancer.py -d %s -i %s -r %s -o %s -e super -t 0 -n %s --mask %s' % (
        py27_path, pipeline_dir, atac_dataFile_riesling, names_string,
        cluster_rose_folder, output_folder, analysis_name, maskFile)

    # Context manager guarantees the script is flushed and closed even if a
    # write fails.
    with open(cluster_bash_path, 'w') as cluster_bash:
        cluster_bash.write('#!/usr/bin/bash\n\n\n')
        cluster_bash.write('#SBATCH --mem=64000\n\n\n')
        cluster_bash.write(cluster_cmd + '\n\n')
Ejemplo n.º 13
0
def main():
    """Run the MYCN SHEP ON analysis: table check, then the box plots.

    Only the data-table summary and the four ``makeBoxPlot`` calls are live;
    every other stage is preserved as a commented-out record between the
    section banners. Depends on module-level ``projectFolder``,
    ``shep_on_dataFile``, ``gffFolder``, ``pipeline_dfci`` and ``makeBoxPlot``.
    """

    rule = '#======================================================================'

    def banner(title_line):
        # Print one section header: blank padding, rule, title, rule, padding.
        print('\n\n')
        print(rule)
        print(title_line)
        print(rule)
        print('\n\n')

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    banner('#======================I, LOADING DATA ANNOTATION======================')

    # Sanity check of the ChIP-Seq data table (bam and .bai accessibility).
    pipeline_dfci.summary(shep_on_dataFile)

    banner('#=========================II. MAP ENHANCERS============================')

    # #for enhancers
    # enhancer_bashFileName,enhancer_region_map_path,namesList = define_enhancer_landscape(projectFolder,pipeline_dir,shep_on_dataFile)
    # print(enhancer_bashFileName)
    # #runs only if no output detected
    # if not utils.checkOutput(enhancer_region_map_path,0,0):
    #     print(enhancer_bashFileName)
    #     os.system('bash %s' % (enhancer_bashFileName))

    # #in individual systems
    # bash_path = map_shep_enhancers(shep_on_dataFile)
    # #os.system('bash %s' % (bash_path))

    banner('#=======================III. MAP MYC LANDSCAPE=========================')

    # #for mycn
    # myc_bashFileName,myc_region_map_path,namesList = define_myc_landscape(projectFolder,pipeline_dir,shep_on_dataFile)

    # if not utils.checkOutput(myc_region_map_path,0,0):
    #     print(myc_bashFileName)
    #     os.system('bash %s' % (myc_bashFileName))

    banner('#===================IV. MAKING +/- 5KB MYCN GFFs=======================')

    #make_shep_on_mycn_landscape(shep_on_dataFile)

    banner('#================V. MAPPING MYCN GFFs FOR METAS AND HEATMAP============')

    #mapping at shep on defined regions and same regions from 3_shep21_chiprx_heatmap
    #those regions are defined in shep21 data
    #map_shep_for_heatmap(shep_on_dataFile)

    banner('#==================VI. MAPPING MYCN GFFs FOR BOX PLOT==================')

    #mapping @ a 1 bin scale for the shep21 conserved mycn regions

    # Only consumed by the disabled map_regions call below.
    gff_file_names = [
        'HG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_-1kb_+1kb.gff',
        'HG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-1kb_+1kb.gff',
        'HG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-1kb_+1kb.gff',
        'HG19_SHEP_MYCN_CONSERVED_-1kb_+1kb.gff',
        'HG19_SHEP_MYCN_CONSERVED_PROMOTER_-1kb_+1kb.gff',
        'HG19_SHEP_MYCN_CONSERVED_ENHANCER_-1kb_+1kb.gff',
    ]
    gffList = [gffFolder + file_name for file_name in gff_file_names]

    #map_regions(shep_on_dataFile,gffList,names_list=[])

    banner('#=====================VII. MAKING BOX PLOTS============================')

    # (set name, gff name, dataset names) for each plot, in execution order:
    # enhancer regions first, then promoter regions; MYCN before H3K27AC.
    box_plot_jobs = [
        ('SHEP_MYCN',
         'SHEP_MYCN_CONSERVED_ENHANCER_-1kb_+1kb',
         ['SHEP_0HR_MYCN', 'SHEP_2HR_MYCN', 'SHEP_6HR_MYCN']),
        ('SHEP_H3K27AC',
         'SHEP_MYCN_CONSERVED_ENHANCER_-1kb_+1kb',
         ['SHEP_0HR_H3K27AC', 'SHEP_2HR_H3K27AC', 'SHEP_6HR_H3K27AC']),
        ('SHEP_MYCN',
         'SHEP_MYCN_CONSERVED_PROMOTER_-1kb_+1kb',
         ['SHEP_0HR_MYCN', 'SHEP_2HR_MYCN', 'SHEP_6HR_MYCN']),
        ('SHEP_H3K27AC',
         'SHEP_MYCN_CONSERVED_PROMOTER_-1kb_+1kb',
         ['SHEP_0HR_H3K27AC', 'SHEP_2HR_H3K27AC', 'SHEP_6HR_H3K27AC']),
    ]
    for set_name, gff_name, names_list in box_plot_jobs:
        makeBoxPlot(shep_on_dataFile, set_name, gff_name, names_list)

    banner('#===============VII. MAKING HEATMAPS AND METAS ========================')
def main():
    """Check the ATAC data table and map every dataset to the RN6 TSS gff.

    Live steps: change into ``projectFolder``, run ``pipeline_dfci.summary``
    on ``atac_data_file``, then call ``pipeline_dfci.mapBamsBatch`` for all
    dataset names in that table against ``RN6_TSS_ALL_-300_+300.gff``.
    The GEO-table step and the per-dataset peak mapping are kept as
    commented-out records. Depends on module-level ``projectName``,
    ``projectFolder``, ``atac_data_file``, ``gffFolder`` and ``mappedFolder``.
    """

    rule = '#======================================================================'

    def section(title_line):
        # Emit a padded, ruled section header on stdout.
        for text in ('\n\n', rule, title_line, rule, '\n\n'):
            print(text)

    print('main analysis for project %s' % (projectName))

    print('changing directory to project folder')
    os.chdir(projectFolder)

    section('#======================I, LOADING DATA ANNOTATION======================')

    # Sanity check: the data table plus bam and .bai accessibility.
    pipeline_dfci.summary(atac_data_file)

    #assumes macs has already been run and formatted
    #    run_macs(chip_data_file)

    #    sys.exit()

    section('#======================MAKING GEO TABLES===============================')

    # Settings for the disabled makeGEOTable call below; unused while it stays
    # commented out.
    geo_name = 'rasmc_atac'
    geo_output_folder = '/storage/cylin/grail/projects/rasmc_all/rasmc_geo/%s_geo/' % (
        geo_name)
    geo_names = []

    #    makeGEOTable(atac_data_file,wiggleFolder,macsFolder,geo_names,geo_name,geo_output_folder)

    #==========================================================================
    #====================MAP BAMS BATCH========================================
    #==========================================================================
    print('Mapping chiprx bams to peaks')
    sample_table = pipeline_dfci.loadDataTable(atac_data_file)
    sample_names = sample_table.keys()
    #    for name in sample_names:
    #        if len(sample_table[name]['enrichedMacs'])>4:
    #            peak_name=sample_table[name]['enrichedMacs']
    #            peak_path='%s%s' % (macsEnrichedFolder,peak_name)
    #            gff_path='%s%s.gff' % (gffFolder,peak_name.split('.bed')[0])
    #            utils.bedToGFF(peak_path,output=gff_path)
    #            gffList=[gff_path]
    #            namesL=[name]
    #            pipeline_dfci.mapBamsBatch(atac_data_file, gffList,mappedFolder,overWrite=False,namesList=namesL,extension=0,rpm=False)

    # Map every dataset in the table to the single TSS window gff.
    tss_gff = '%sRN6_TSS_ALL_-300_+300.gff' % (gffFolder)
    pipeline_dfci.mapBamsBatch(
        atac_data_file, [tss_gff], mappedFolder,
        overWrite=False, namesList=sample_names, extension=0, rpm=True)
def main():
    """Drive the chordoma CH22 ChIP-seq analysis.

    Live steps, in order:
      1. change into ``projectFolder`` and run ``pipeline_dfci.summary`` on
         ``chip_data_file``;
      2. run ``pipeline_dfci.run_macs`` into the ``macs14`` folders (used to
         make wiggles);
      3. collect the dataset names containing ``dTag_T`` and hand the
         precomputed T-union signal table to ``make_T_top_regions`` and
         ``wrap_meme`` for motif finding.

    The enhancer, landscape, active-gene and enhancer-promoter stages are kept
    as commented-out records. ``merge_rna`` is defined here but its call is
    commented out, so the expression paths computed at the end are unused.

    Relies on module-level names defined elsewhere in the file, including
    ``projectName``, ``projectFolder``, ``chip_data_file``, ``wiggleFolder``,
    ``bedFolder``, ``signalFolder``, ``make_T_top_regions`` and ``wrap_meme``.
    """

    print('main analysis for project %s' % (projectName))

    print('changing directory to project folder')
    os.chdir(projectFolder)

    print('\n\n')
    print('#======================================================================')
    print('#======================I. LOADING DATA ANNOTATION======================')
    print('#======================================================================')
    print('\n\n')

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible

    #for data file
    pipeline_dfci.summary(chip_data_file)

    #use macs1.4 to make wiggles
    macs14Folder = utils.formatFolder('%smacs14/' % (projectFolder), True)
    macs14EnrichedFolder = utils.formatFolder(
        '%smacs14Enriched/' % (projectFolder), True)
    pipeline_dfci.run_macs(chip_data_file,
                           projectFolder,
                           macs14Folder,
                           macs14EnrichedFolder,
                           wiggleFolder,
                           useBackground=True)

    print('\n\n')
    print('#======================================================================')
    print('#=======================II. DEFINING ENHANCERS=========================')
    print('#======================================================================')
    print('\n\n')

    #running ROSE2_meta on the chordoma k27ac

    # Only read by the commented-out debug loop below.
    chip_data_dict = pipeline_dfci.loadDataTable(chip_data_file)
    # for name in chip_data_dict.keys():
    #     print(name)
    #     print(chip_data_dict[name]['enrichedMacs'])

    # #run rose2 wrapper for both
    # bashFileName,region_map_path,names_list = define_enhancer_landscape(projectFolder,pipeline_dir,chip_data_file,analysis_name = 'CH22_H3K27AC')
    # print(bashFileName,region_map_path,names_list)

    # #runs only if no output detected
    # if not utils.checkOutput(enhancer_region_map_path,0,0):
    #     print(enhancer_bashFileName)
    #     os.system('bash %s' % (enhancer_bashFileName))

    # #=========
    # #=========
    # #=========

    # #sanity check debug

    # #run rose2 meta for one dataset as a test w/ stitch at 500 just for a control
    # bashFileName,region_map_path,names_list = define_enhancer_landscape(projectFolder,pipeline_dir,chip_data_file,analysis_name = 'CH22_H3K27AC_1_TEST',names_list = ['CH22_H3K27AC_1'],stitch = '500')
    # print(bashFileName,region_map_path,names_list)

    # #running regular rose2
    # rose2_parent_folder = utils.formatFolder('%srose2' % (projectFolder),True)
    # rose2_bash = pipeline_dfci.callRose2(chip_data_file,macsEnrichedFolder,rose2_parent_folder,namesList=['CH22_H3K27AC_1'],extraMap = [],inputFile='',tss=2500,stitch='500',bashFileName ='',mask=maskFile,useBackground=True,py27_path =py27_path)

    # print(rose2_bash)

    # #ok, ROSE2 META and ROSE2 still produce same result when run on single dataset
    # #whew

    # #=========
    # #=========
    # #=========

    print('\n\n')
    print('#======================================================================')
    print('#=======================III. DEFINE T LANDSCAPE========================')
    print('#======================================================================')
    print('\n\n')

    # #since this is an HA chip we need to remove HA background

    # #get the pos regions (T)
    # data_dict = pipeline_dfci.loadDataTable(chip_data_file)
    # t_list = ['%s%s' % (macsEnrichedFolder,data_dict[name]['enrichedMacs']) for name in data_dict.keys() if name.count('dTag_T') == 1 and name.count('WCE') == 0]
    # print(t_list)

    # #get the negative_control list from IRF2 project
    # ha_ctl_list = ['%sHK_CTL_HA_1_peaks.bed' % (macsEnrichedFolder),'%sHK_CTL_HA_2_peaks.bed' % (macsEnrichedFolder)]

    # t_bed_path_intersect = '%sCH22_T_INTERSECT.bed' % (bedFolder)
    # #merge_regions(pos_list = t_list,neg_list = ha_ctl_list,analysis_name = 'CH22_T_INTERSECT',output_path=t_bed_path_intersect,merge_type = 'INTERSECT')

    # t_bed_path_union = '%sCH22_T_UNION.bed' % (bedFolder)
    # #merge_regions(pos_list = t_list,neg_list = ha_ctl_list,analysis_name = 'CH22_T_UNION',output_path=t_bed_path_union,merge_type = 'UNION')

    # #for k27ac

    # h3k27ac_list = ['%s%s' % (macsEnrichedFolder,data_dict[name]['enrichedMacs']) for name in data_dict.keys() if name.count('H3K27AC') == 1 and name.count('WCE') == 0]
    # h3k27ac_bed_path_union = '%sCH22_H3K27AC_UNION.bed' % (bedFolder)
    # merge_regions(pos_list = h3k27ac_list,neg_list = [],analysis_name = 'CH22_H3K27AC_UNION',output_path=h3k27ac_bed_path_union,merge_type = 'UNION')

    print('\n\n')
    print('#======================================================================')
    print('#=========================IV. T MOTIF FINDING==========================')
    print('#======================================================================')
    print('\n\n')

    #use the T union and then calculate signal

    data_dict = pipeline_dfci.loadDataTable(chip_data_file)
    map_list = [name for name in data_dict.keys() if name.count('dTag_T') == 1]
    print(map_list)
    # map_list and gffList feed only the disabled map_regions call below; the
    # signal table it would produce is expected to exist on disk already.
    gffList = ['%sCH22_T_UNION.bed' % (bedFolder)]
    #signal_table_list = pipeline_dfci.map_regions(chip_data_file,gffList,mappedFolder,signalFolder,names_list=map_list,medianNorm=False,output='',extendReadsTo=200)

    #column order =  CH22_dTag_T_MUT_HA      CH22_dTag_T_MUT_WCE     CH22_dTag_T_WT_HA       CH22_dTag_T_WT_WCE

    signal_table_path = '%sCH22_T_UNION_CH22_CHIP_TABLE_SIGNAL.txt' % (
        signalFolder)

    top = 1000
    fasta_path = make_T_top_regions(signal_table_path, top)

    #now run meme
    analysis_name = 'HG19_CH22_T_UNION_TOP'
    meme_bash_path = wrap_meme(analysis_name)

    print('\n\n')
    print('#======================================================================')
    print('#===================IV. MAKE HEATMAPS OF T LANDSCAPE===================')
    print('#======================================================================')
    print('\n\n')

    #use the union for T
    t_union_path = '%sbeds/CH22_T_UNION.bed' % (projectFolder)

    print('\n\n')
    print('#======================================================================')
    print('#=======================IV. DEFINE ACTIVE GENES========================')
    print('#======================================================================')
    print('\n\n')

    # #make the relevant gffs
    # pipeline_dfci.makeGeneGFFs(annotFile,gffFolder,species='HG19')

    # # #Making a list of all active genes
    # tss_gff = utils.parseTable('%sHG19_TSS_ALL_-1000_+1000.gff' % (gffFolder),'\t')
    # start_dict = utils.makeStartDict(annotFile)

    # all_gene_table = []
    # ticker = 1
    # for line in tss_gff:
    #     new_line = [ticker,line[1],start_dict[line[1]]['name']]
    #     all_gene_table.append(new_line)
    #     ticker+=1

    # utils.unParseTable(all_gene_table,'%sHG19_UCSC_REFSEQ_ALL.txt' % (geneListFolder),'\t')
    # sys.exit()

    # setName = 'CH22_H3K27AC'
    # cellTypeList = ['CH22']
    # map_list = ['CH22_H3K27AC_1','CH22_H3K27AC_2']
    # gffList = ['%sHG19_TSS_ALL_-1000_+1000.gff' % (gffFolder)]
    # pipeline_dfci.mapEnrichedToGFF(chip_data_file,setName,gffList,cellTypeList,macsEnrichedFolder,mappedEnrichedFolder,macs=True,namesList=map_list,useBackground=True)

    # setList = [['CH22_H3K27AC_1'],['CH22_H3K27AC_2']] #bound by either
    # output = '%sHG19_CHORDOMA_CH22_H3K27AC_ACTIVE.txt' % (geneListFolder)
    # mappedEnrichedFile = '%sHG19_TSS_ALL_-1000_+1000/HG19_TSS_ALL_-1000_+1000_CH22_H3K27AC.txt' % (mappedEnrichedFolder)
    # pipeline_dfci.makeGFFListFile(mappedEnrichedFile,setList,output,annotFile)

    print('\n\n')
    print('#======================================================================')
    print('#====================V. RUNNING ENHANCER PROMOTER======================')
    print('#======================================================================')
    print('\n\n')

    # data_dict = pipeline_dfci.loadDataTable(chip_data_file)

    # #need to run enhancer promoter code on both k27ac and T

    # #for T at active genes
    # activity_path = '%sHG19_CHORDOMA_CH22_H3K27AC_ACTIVE.txt' % (geneListFolder)
    # input_path = '%sCH22_T_UNION.bed' % (bedFolder)
    # analysis_name = 'CH22_T_UNION'
    # t_list = [name for name in data_dict.keys() if name.count('dTag_T') == 1 and name.count('WCE') == 0]

    # #wrap_enhancer_promoter(chip_data_file,input_path,activity_path,analysis_name,names_list = t_list,useBackground=True)

    # #for T at all genes
    # activity_path = '%sHG19_UCSC_REFSEQ_ALL.txt' % (geneListFolder)
    # input_path = '%sCH22_T_UNION.bed' % (bedFolder)
    # analysis_name = 'CH22_T_UNION_ALL_GENES'
    # t_list = [name for name in data_dict.keys() if name.count('dTag_T') == 1 and name.count('WCE') == 0]

    # #wrap_enhancer_promoter(chip_data_file,input_path,activity_path,analysis_name,names_list = t_list,useBackground=True)

    # #for H3K27AC at active genes
    # # activity_path = '%sHG19_CHORDOMA_CH22_H3K27AC_ACTIVE.txt' % (geneListFolder)

    # # input_path = '%sCH22_H3K27AC_UNION.bed' % (bedFolder)
    # # analysis_name = 'CH22_H3K27C_UNION'
    # # h3k27ac_list = [name for name in data_dict.keys() if name.count('H3K27AC') == 1 and name.count('WCE') == 0]

    # # wrap_enhancer_promoter(chip_data_file,input_path,activity_path,analysis_name,names_list = h3k27ac_list,useBackground=True)

    # #for H3K27AC at all genes
    # activity_path = '%sHG19_UCSC_REFSEQ_ALL.txt' % (geneListFolder)

    # input_path = '%sCH22_H3K27AC_UNION.bed' % (bedFolder)
    # analysis_name = 'CH22_H3K27C_UNION_ALL_GENES'
    # h3k27ac_list = [name for name in data_dict.keys() if name.count('H3K27AC') == 1 and name.count('WCE') == 0]

    # wrap_enhancer_promoter(chip_data_file,input_path,activity_path,analysis_name,names_list = h3k27ac_list,useBackground=True)

    print('\n\n')
    print('#======================================================================')
    print('#=================VI. LINKING CHROMATIN TO EXPRESSION==================')
    print('#======================================================================')
    print('\n\n')

    #a gene counts if it is expressed above cut in at least one sample
    #may need to collapse NM IDs per genes

    #first check that expression table doesn't have duplicates

    def merge_rna(exp_path, exp_cutoff=1, output_path=''):
        '''
        Combine expression data with the gene-level T and H3K27AC tables.

        exp_path: tab-delimited expression table; the first row is treated as
            a header, the first column as the gene identifier and every other
            column as a float expression value.
        exp_cutoff: a gene is kept only if its highest expression value is
            strictly greater than this cutoff.
        output_path: where the merged tab-delimited table is written.

        Genes absent from a T or H3K27AC gene table get [0.0, 0.0] for that
        table. Output rows are ordered by sorted gene identifier.
        '''

        def load_gene_values(table_path):
            # Map gene id (column 0) to the remaining columns as floats,
            # skipping the header row.
            value_dict = defaultdict(list)
            for line in utils.parseTable(table_path, '\t')[1:]:
                value_dict[line[0]] = [float(x) for x in line[1:]]
            return value_dict

        exp_table = utils.parseTable(exp_path, '\t')

        exp_dict = defaultdict(list)
        for line in exp_table[1:]:
            #here's where we can filter for an expression cutoff
            exp_line = [float(x) for x in line[1:]]
            if max(exp_line) > exp_cutoff:
                exp_dict[line[0]] = exp_line

        #now figure out genes w/ T binding
        t_gene_path = '%senhancerPromoter/CH22_T_UNION_ALL_GENES/CH22_T_UNION_ALL_GENES_GENE_TABLE.txt' % (
            projectFolder)
        t_dict = load_gene_values(t_gene_path)

        #now figure out genes w/ H3K27AC binding
        h3k27ac_gene_path = '%senhancerPromoter/CH22_H3K27C_UNION_ALL_GENES/CH22_H3K27C_UNION_ALL_GENES_GENE_TABLE.txt' % (
            projectFolder)
        h3k27ac_dict = load_gene_values(h3k27ac_gene_path)

        #now set up the output
        # NOTE(review): the header appends all of exp_table[0] while each row
        # appends only the value columns (line[1:]); if the expression header
        # includes a label for the gene column the header is one field wider
        # than the rows -- confirm against the cuffnorm table layout.
        gene_table_header = [
            'GENE', 'T_PROMOTER', 'T_DISTAL', 'H3K27AC_PROMOTER',
            'H3K27AC_DISTAL'
        ] + exp_table[0]
        gene_table = [gene_table_header]

        #anchor analysis on genes w/ detectable expr
        # sorted() instead of keys() followed by list.sort(): on Python 3
        # keys() returns a view object that has no sort() method.
        for gene in sorted(exp_dict):
            # .get() does not trigger the defaultdict factory, so missing
            # genes are not inserted into the lookup tables.
            t_line = t_dict.get(gene, [0.0, 0.0])
            h3k27ac_line = h3k27ac_dict.get(gene, [0.0, 0.0])
            gene_table.append([gene] + t_line + h3k27ac_line + exp_dict[gene])

        utils.unParseTable(gene_table, output_path, '\t')

    #for norm data
    rna_project_folder = '/storage/cylin/grail/projects/chordoma_ch22_rna/'

    #this table is just in alphabetical order
    exp_path = '%s190612_rna_seq/cuffnorm_output/cuffnorm_all_fpkm_exprs_norm.txt' % (
        rna_project_folder)

    exp_cutoff = 1
    output_path = '%stables/HG19_CHORDOMA_CH22_GENE_TABLE_NORM.txt' % (
        projectFolder)
    #merge_rna(exp_path,exp_cutoff,output_path)

    #for raw data
    exp_path = '%s190612_rna_seq/cuffnorm_output/cuffnorm_all_fpkm_exprs_raw.txt' % (
        rna_project_folder)
    output_path = '%stables/HG19_CHORDOMA_CH22_GENE_TABLE_RAW.txt' % (
        projectFolder)
Ejemplo n.º 16
0
def main():
    """Drive the enhancer/promoter steps of the MYCN project.

    Changes into ``projectFolder``, runs ``pipeline_dfci.summary`` on the
    neuroblastoma ChIP-Seq data table, prints a banner for every analysis
    section (most sections are currently switched off and kept below as
    commented-out code), and finally calls ``addLengths`` on the gene/peak
    tables of two enhancerPromoter analyses.
    """

    def section(title_line):
        """Print ``title_line`` framed by two rule lines and blank padding."""
        rule = '#======================================================================'
        print('\n\n')
        print(rule)
        print(title_line)
        print(rule)
        print('\n\n')

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    section(
        '#======================I, LOADING DATA ANNOTATION======================'
    )

    # Per the original author's note, summary() sanity checks the data table
    # and makes sure both bam and .bai files are accessible.

    # ChIP-Seq data table for all NB lines
    pipeline_dfci.summary(nb_all_chip_dataFile)

    section(
        '#===================II. ENHANCER PROMOTER FOR ALL NB==================='
    )

    # input_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    # activity_path = '%sHG19_NB_H3K27AC_ACTIVE_UNION.txt' % (geneListFolder)
    # analysis_name = 'NB_MYCN_CONSERVED'
    # nb_enhancer_promoter_bash = wrap_enhancer_promoter(nb_all_chip_dataFile,input_path,activity_path,analysis_name)
    # os.system('bash %s' % (nb_enhancer_promoter_bash))

    section(
        '#===============III. ENHANCER PROMOTER IN SHEP21 SYSTEM================'
    )

    # #for SHEP21 nospike
    # mycn_list = ['SHEP21_0HR_MYCN_NOSPIKE','SHEP21_2HR_MYCN_NOSPIKE','SHEP21_24HR_MYCN_NOSPIKE']
    # for mycn_name in mycn_list:
    #     input_path = '%sSHEP21_0HR_MYCN_NOSPIKE_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_SHEP21_0HR_H3K27AC_NOSPIKE_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'SHEP21_0HR_MYCN_NOSPIKE_REGIONS_%s' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(shep21_dataFile,input_path,activity_path,analysis_name,names_list = [mycn_name])
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    # #for SHEP21 chiprx
    # mycn_list = ['SHEP21_0HR_MYCN_RX','SHEP21_2HR_MYCN_RX','SHEP21_24HR_MYCN_RX']
    # for mycn_name in mycn_list:
    #     input_path = '%sSHEP21_0HR_MYCN_NOSPIKE_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_SHEP21_0HR_H3K27AC_NOSPIKE_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'SHEP21_0HR_MYCN_NOSPIKE_REGIONS_%s' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(shep21_chiprx_dataFile,input_path,activity_path,analysis_name,names_list = [mycn_name])
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    # #for SHEP21 nospike
    # mycn_list = ['SHEP21_0HR_MYCN_NOSPIKE','SHEP21_2HR_MYCN_NOSPIKE','SHEP21_24HR_MYCN_NOSPIKE']
    # for mycn_name in mycn_list:
    #     input_path = '%sSHEP21_0HR_MYCN_NOSPIKE_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_SHEP21_0HR_H3K27AC_NOSPIKE_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'SHEP21_0HR_MYCN_NOSPIKE_REGIONS_NO_WCE_%s' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(shep21_dataFile,input_path,activity_path,analysis_name,names_list = [mycn_name],useBackground=False)
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    # #for SHEP21 chiprx
    # mycn_list = ['SHEP21_0HR_MYCN_RX','SHEP21_2HR_MYCN_RX','SHEP21_24HR_MYCN_RX']
    # for mycn_name in mycn_list:
    #     input_path = '%sSHEP21_0HR_MYCN_NOSPIKE_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_SHEP21_0HR_H3K27AC_NOSPIKE_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'SHEP21_0HR_MYCN_NOSPIKE_REGIONS_NO_WCE_%s' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(shep21_chiprx_dataFile,input_path,activity_path,analysis_name,names_list = [mycn_name],useBackground=False)
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    # #for nb conserved regions
    # #for SHEP21 nospike
    # mycn_list = ['SHEP21_0HR_MYCN_NOSPIKE','SHEP21_2HR_MYCN_NOSPIKE','SHEP21_24HR_MYCN_NOSPIKE']
    # for mycn_name in mycn_list:

    #     input_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    #     activity_path = '%sHG19_NB_H3K27AC_ACTIVE_UNION.txt' % (geneListFolder)
    #     analysis_name = 'NB_MYCN_CONSERVED_%s' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(shep21_dataFile,input_path,activity_path,analysis_name,names_list = [mycn_name])
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    # #for SHEP21 chiprx
    # mycn_list = ['SHEP21_0HR_MYCN_RX','SHEP21_2HR_MYCN_RX','SHEP21_24HR_MYCN_RX']
    # for mycn_name in mycn_list:

    #     input_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    #     activity_path = '%sHG19_NB_H3K27AC_ACTIVE_UNION.txt' % (geneListFolder)
    #     analysis_name = 'NB_MYCN_CONSERVED_%s' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(shep21_chiprx_dataFile,input_path,activity_path,analysis_name,names_list = [mycn_name])
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    section(
        '#===============IV. ENHANCER PROMOTER IN SHEP ON SYSTEM================'
    )

    # #for SHEP21 on
    # mycn_list = ['SHEP_0HR_MYCN','SHEP_2HR_MYCN','SHEP_6HR_MYCN']
    # for mycn_name in mycn_list:
    #     input_path = '%sSHEP21_0HR_MYCN_NOSPIKE_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_SHEP21_0HR_H3K27AC_NOSPIKE_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'SHEP21_0HR_MYCN_NOSPIKE_REGIONS_%s' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(shep_on_dataFile,input_path,activity_path,analysis_name,names_list = [mycn_name])
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    #for SHEP21 on no background
    # mycn_list = ['SHEP_0HR_MYCN','SHEP_2HR_MYCN','SHEP_6HR_MYCN']
    # for mycn_name in mycn_list:
    #     input_path = '%sSHEP21_0HR_MYCN_NOSPIKE_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_SHEP21_0HR_H3K27AC_NOSPIKE_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'SHEP21_0HR_MYCN_NOSPIKE_REGIONS_NO_WCE_%s' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(shep_on_dataFile,input_path,activity_path,analysis_name,names_list = [mycn_name],useBackground=False)
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    # #for SHEP21 on @ NB conserved regions
    # mycn_list = ['SHEP_0HR_MYCN','SHEP_2HR_MYCN','SHEP_6HR_MYCN']
    # for mycn_name in mycn_list:
    #     input_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    #     activity_path = '%sHG19_SHEP21_0HR_H3K27AC_NOSPIKE_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'NB_MYCN_CONSERVED_%s' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(shep_on_dataFile,input_path,activity_path,analysis_name,names_list = [mycn_name])
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    section(
        '#================V. ENHANCER PROMOTER IN INDIVIDUAL NB================='
    )

    # #for BE2C, KELLY, NGP
    # mycn_list = ['BE2C','KELLY','NGP']
    # for mycn_name in mycn_list:
    #     input_path = '%s%s_MYCN_peaks.bed' % (macsEnrichedFolder,mycn_name)
    #     activity_path = '%sHG19_%s_H3K27AC_ACTIVE.txt' % (geneListFolder,mycn_name)
    #     analysis_name = '%s_MYCN' % (mycn_name)
    #     nb_enhancer_promoter_bash = wrap_enhancer_promoter(nb_all_chip_dataFile,input_path,activity_path,analysis_name,names_list = ['%s_MYCN' % (mycn_name)])
    #     os.system('bash %s' % (nb_enhancer_promoter_bash))

    section(
        '#============VI. ENHANCER PROMOTER ANALYSIS IN OTHER CANCERS==========='
    )

    # #for p493-6, mm1s, h2171, h128, and u87

    # #for p493-6
    # myc_list = ['P493-6_T0_MYC','P493-6_T1_MYC','P493-6_T24_MYC']
    # for myc_name in myc_list:
    #     input_path = '%sP493-6_T24_MYC_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_P493-6_T24_H3K27AC_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'P493-6_T24_MYC_REGIONS_%s' % (myc_name)
    #     enhancer_promoter_bash = wrap_enhancer_promoter(p4936_young_dataFile,input_path,activity_path,analysis_name,names_list = [myc_name])
    #     os.system('bash %s' % (enhancer_promoter_bash))

    # #for sclc
    # myc_list = ['H128_MYC','H2171_MYC']
    # for myc_name in myc_list:
    #     input_path = '%sH2171_MYC_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_H2171_H3K27AC_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'H2171_MYC_REGIONS_%s' % (myc_name)
    #     enhancer_promoter_bash = wrap_enhancer_promoter(sclc_dataFile,input_path,activity_path,analysis_name,names_list = [myc_name])
    #     os.system('bash %s' % (enhancer_promoter_bash))

    # #for MM
    # myc_list = ['MM1S_MYC_DMSO']
    # for myc_name in myc_list:
    #     input_path = '%sMM1S_MYC_DMSO_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_MM1S_H3K27AC_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'MM1S_MYC_REGIONS_%s' % (myc_name)
    #     enhancer_promoter_bash = wrap_enhancer_promoter(mm1s_dataFile,input_path,activity_path,analysis_name,names_list = [myc_name])
    #     os.system('bash %s' % (enhancer_promoter_bash))

    # #for u87
    # myc_list = ['U87_MYC']
    # for myc_name in myc_list:
    #     input_path = '%sU87_MYC_peaks.bed' % (macsEnrichedFolder)
    #     activity_path = '%sHG19_U87_H3K27AC_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = 'U87_MYC_REGIONS_%s' % (myc_name)
    #     enhancer_promoter_bash = wrap_enhancer_promoter(u87_dataFile,input_path,activity_path,analysis_name,names_list = [myc_name])
    #     os.system('bash %s' % (enhancer_promoter_bash))

    section(
        '#========VII. ENHANCER PROMOTER ANALYSIS FOR OTHER MARKS IN BE2C========'
    )

    # #names_list = ['BE2C_BRD4','BE2C_H3K27AC','BE2C_TWIST','BE2C_RNA_POL2']
    # names_list = ['BE2C_H3K27AC']
    # names_list = ['BE2C_BRD4','BE2C_TWIST','BE2C_RNA_POL2','BE2C_H3K27ME3','BE2C_H3K4ME3']

    # for name in names_list:
    #     input_path = '%s%s_peaks.bed' % (macsEnrichedFolder,name)
    #     activity_path = '%sHG19_BE2C_H3K27AC_ACTIVE.txt' % (geneListFolder)
    #     analysis_name = '%s_REGIONS' % (name)
    #     enhancer_promoter_bash = wrap_enhancer_promoter(be2c_dataFile,input_path,activity_path,analysis_name,names_list = [name])
    #     os.system('bash %s' % (enhancer_promoter_bash))

    section(
        '#====================VIII. MAKING GENE TABLE W/ LENGTH================='
    )

    # One (gene table, peak table) pair of path templates per analysis; each
    # template is completed with projectFolder right before it is used.
    length_table_templates = [
        # nb conserved regions
        ('%senhancerPromoter/NB_MYCN_CONSERVED/NB_MYCN_CONSERVED_GENE_TABLE.txt',
         '%senhancerPromoter/NB_MYCN_CONSERVED/NB_MYCN_CONSERVED_PEAK_TABLE.txt'),
        # shep21
        ('%senhancerPromoter/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP21_0HR_MYCN_NOSPIKE/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP21_0HR_MYCN_NOSPIKE_GENE_TABLE.txt',
         '%senhancerPromoter/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP21_0HR_MYCN_NOSPIKE/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP21_0HR_MYCN_NOSPIKE_PEAK_TABLE.txt'),
    ]

    for gene_template, peak_template in length_table_templates:
        gene_table_path = gene_template % (projectFolder)
        peak_table_path = peak_template % (projectFolder)
        # the value returned by addLengths is not used by this script
        addLengths(gene_table_path, peak_table_path)
Ejemplo n.º 17
0
def main():
    """Drive the SHEP21 ChIP-Rx heatmap/boxplot steps of the MYCN project.

    Changes into ``projectFolder``, runs ``pipeline_dfci.summary`` on the
    SHEP21 ChIP-Rx data table, prints a banner for every analysis section
    (most sections are currently switched off and kept below as
    commented-out code), sets up the two figure folders via
    ``utils.formatFolder`` and calls ``makeBoxPlot`` for the RNA Pol II
    datasets (spike-in and no-spike) over four conserved MYCN region sets.
    """

    def section(title_line):
        """Print ``title_line`` framed by two rule lines and blank padding."""
        rule = '#======================================================================'
        print('\n\n')
        print(rule)
        print(title_line)
        print(rule)
        print('\n\n')

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    section(
        '#======================I, LOADING DATA ANNOTATION======================'
    )

    # Per the original author's note, summary() sanity checks the data table
    # and makes sure both bam and .bai files are accessible.

    # ChIP-Seq (ChIP-Rx) data table
    pipeline_dfci.summary(shep21_chiprx_dataFile)

    section(
        '#========II. DEFINING ACTIVE GENES AND ENHANCERS IN SHEP21============='
    )

    #make_shep21_active()

    #bash_path = map_nb_enhancers(nb_all_chip_dataFile)
    #os.system('bash %s' % (bash_path))

    section(
        '#==================III. MAKING +/- 5KB MYCN GFFs======================='
    )
    #make_shep21_mycn_landscape(nb_all_chip_dataFile)

    section(
        '#===============IV. MAPPING MYCN GFFs FOR METAS AND HEATMAP============'
    )

    #with and without spike in
    #map_shep21_for_heatmap(shep21_chiprx_dataFile,shep21_dataFile)

    section(
        '#==================V. MAPPING MYCN GFFs FOR BOX PLOT==================='
    )

    #mapping @ a 1 bin scale for the shep21 conserved mycn regions

    # gffList = ['%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_-0_+0.gff' % (gffFolder),
    #           '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_-1kb_+1kb.gff' % (gffFolder),
    #            '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-1kb_+1kb.gff' % (gffFolder),
    #            '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-1kb_+1kb.gff' % (gffFolder),
    # ]

    # gffList = ['%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-0_+0.gff' % (gffFolder),
    #           '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-0_+0.gff' % (gffFolder),
    #     ]

    # map_regions(shep21_chiprx_dataFile,gffList,names_list=[])
    # map_regions(shep21_dataFile,gffList,names_list=[])

    section(
        '#===============VI. MAKING HEATMAPS AND METAS ========================='
    )

    # set the output folder for the heatmaps
    # NOTE(review): the True flag presumably asks formatFolder to create the
    # folder when it is missing -- confirm against utils.formatFolder
    heatmap_folder = '%sfigures/5_chiprx_heatmaps/' % projectFolder
    utils.formatFolder(heatmap_folder, True)

    # #==========================================
    # #for shep21 mycn chiprx
    # plot_name = 'SHEP21_MYCN_RX'
    # names_list = ['SHEP21_0HR_MYCN_RX','SHEP21_2HR_MYCN_RX','SHEP21_24HR_MYCN_RX']
    # gff_list = ['%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-5kb_+5kb.gff' % (gffFolder),
    #             '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-5kb_+5kb.gff' % (gffFolder),
    #             ]
    # plot_color = 'red'
    # makeHeatmap(names_list,gff_list,plot_name,plot_color)

    # #for shep21 mycn regular chip
    # plot_name = 'SHEP21_MYCN_NOSPIKE'
    # names_list = ['SHEP21_0HR_MYCN_NOSPIKE','SHEP21_2HR_MYCN_NOSPIKE','SHEP21_24HR_MYCN_NOSPIKE']
    # gff_list = ['%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-5kb_+5kb.gff' % (gffFolder),
    #             '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-5kb_+5kb.gff' % (gffFolder),
    #             ]
    # plot_color = 'red'
    # makeHeatmap(names_list,gff_list,plot_name,plot_color)

    # #==========================================
    # #for shep21 h3k27ac chiprx
    # plot_name = 'SHEP21_H3K27AC_RX'
    # names_list = ['SHEP21_0HR_H3K27AC_RX','SHEP21_2HR_H3K27AC_RX','SHEP21_24HR_H3K27AC_RX']
    # gff_list = ['%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-5kb_+5kb.gff' % (gffFolder),
    #             '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-5kb_+5kb.gff' % (gffFolder),
    #             ]
    # plot_color = 'blue'
    # makeHeatmap(names_list,gff_list,plot_name,plot_color)

    # #for shep21 mycn regular chip
    # plot_name = 'SHEP21_H3K27AC_NOSPIKE'
    # names_list = ['SHEP21_0HR_H3K27AC_NOSPIKE','SHEP21_2HR_H3K27AC_NOSPIKE','SHEP21_24HR_H3K27AC_NOSPIKE']
    # gff_list = ['%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-5kb_+5kb.gff' % (gffFolder),
    #             '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-5kb_+5kb.gff' % (gffFolder),
    #             ]
    # plot_color = 'blue'
    # makeHeatmap(names_list,gff_list,plot_name,plot_color)

    # #==========================================
    # #for shep21 CTCF chiprx
    # plot_name = 'SHEP21_CTCF_RX'
    # names_list = ['SHEP21_0HR_CTCF_RX','SHEP21_2HR_CTCF_RX','SHEP21_24HR_CTCF_RX']
    # gff_list = ['%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-5kb_+5kb.gff' % (gffFolder),
    #             '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-5kb_+5kb.gff' % (gffFolder),
    #             ]
    # plot_color = 'black'
    # makeHeatmap(names_list,gff_list,plot_name,plot_color)

    # #==========================================
    # #for shep21 RNA Pol II chiprx
    # plot_name = 'SHEP21_POL2_RX'
    # names_list = ['SHEP21_0HR_POL2_RX','SHEP21_2HR_POL2_RX','SHEP21_24HR_POL2_RX']
    # gff_list = ['%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-5kb_+5kb.gff' % (gffFolder),
    #             '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-5kb_+5kb.gff' % (gffFolder),
    #             ]
    # plot_color = 'black'
    # makeHeatmap(names_list,gff_list,plot_name,plot_color)

    # #==========================================
    # #for shep21 RNA Pol II NOSPIKE
    # plot_name = 'SHEP21_POL2_NOSPIKE'
    # names_list = ['SHEP21_0HR_POL2_NOSPIKE','SHEP21_2HR_POL2_NOSPIKE','SHEP21_24HR_POL2_NOSPIKE']
    # gff_list = ['%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-5kb_+5kb.gff' % (gffFolder),
    #             '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-5kb_+5kb.gff' % (gffFolder),
    #             ]
    # plot_color = 'black'
    # makeHeatmap(names_list,gff_list,plot_name,plot_color)

    # #==========================================
    # #for shep21 H3K4ME3 chiprx
    # plot_name = 'SHEP21_H3K4ME3_RX'
    # names_list = ['SHEP21_0HR_H3K4ME3_RX','SHEP21_2HR_H3K4ME3_RX','SHEP21_24HR_H3K4ME3_RX']
    # gff_list = ['%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-5kb_+5kb.gff' % (gffFolder),
    #             '%sHG19_SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-5kb_+5kb.gff' % (gffFolder),
    #             ]
    # plot_color = 'green'
    # makeHeatmap(names_list,gff_list,plot_name,plot_color)

    section(
        '#=====================VII. MAKING BOXPLOTS ============================'
    )
    boxplot_folder = '%sfigures/4_chiprx_plots/' % projectFolder
    utils.formatFolder(boxplot_folder, True)

    # #=============================================================================
    # #for nb mycn chiprx
    # set_name = 'MYCN_CHIPRX'
    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-1kb_+1kb'
    # names_list = ['SHEP21_0HR_MYCN_RX','SHEP21_2HR_MYCN_RX','SHEP21_24HR_MYCN_RX']
    # makeBoxPlot(shep21_chiprx_dataFile,set_name,gff_name,names_list)

    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-0_+0'
    # names_list = ['SHEP21_0HR_MYCN_RX','SHEP21_2HR_MYCN_RX','SHEP21_24HR_MYCN_RX']
    # makeBoxPlot(shep21_chiprx_dataFile,set_name,gff_name,names_list)

    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-1kb_+1kb'
    # names_list = ['SHEP21_0HR_MYCN_RX','SHEP21_2HR_MYCN_RX','SHEP21_24HR_MYCN_RX']
    # makeBoxPlot(shep21_chiprx_dataFile,set_name,gff_name,names_list)

    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-0_+0'
    # names_list = ['SHEP21_0HR_MYCN_RX','SHEP21_2HR_MYCN_RX','SHEP21_24HR_MYCN_RX']
    # makeBoxPlot(shep21_chiprx_dataFile,set_name,gff_name,names_list)

    # #=============================================================================
    # #for nb mycn chip no spike
    # set_name = 'MYCN_NOSPIKE'
    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-1kb_+1kb'
    # names_list = ['SHEP21_0HR_MYCN_NOSPIKE','SHEP21_2HR_MYCN_NOSPIKE','SHEP21_24HR_MYCN_NOSPIKE']
    # makeBoxPlot(shep21_dataFile,set_name,gff_name,names_list)

    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-0_+0'
    # names_list = ['SHEP21_0HR_MYCN_NOSPIKE','SHEP21_2HR_MYCN_NOSPIKE','SHEP21_24HR_MYCN_NOSPIKE']
    # makeBoxPlot(shep21_dataFile,set_name,gff_name,names_list)

    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-1kb_+1kb'
    # names_list = ['SHEP21_0HR_MYCN_NOSPIKE','SHEP21_2HR_MYCN_NOSPIKE','SHEP21_24HR_MYCN_NOSPIKE']
    # makeBoxPlot(shep21_dataFile,set_name,gff_name,names_list)

    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-0_+0'
    # names_list = ['SHEP21_0HR_MYCN_NOSPIKE','SHEP21_2HR_MYCN_NOSPIKE','SHEP21_24HR_MYCN_NOSPIKE']
    # makeBoxPlot(shep21_dataFile,set_name,gff_name,names_list)

    # #=============================================================================
    # #for nb h3k27ac chiprx
    # set_name = 'H3K27AC_CHIPRX'
    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-1kb_+1kb'
    # names_list = ['SHEP21_0HR_H3K27AC_RX','SHEP21_2HR_H3K27AC_RX','SHEP21_24HR_H3K27AC_RX']
    # makeBoxPlot(shep21_chiprx_dataFile,set_name,gff_name,names_list)

    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-0_+0'
    # names_list = ['SHEP21_0HR_H3K27AC_RX','SHEP21_2HR_H3K27AC_RX','SHEP21_24HR_H3K27AC_RX']
    # makeBoxPlot(shep21_chiprx_dataFile,set_name,gff_name,names_list)

    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-1kb_+1kb'
    # names_list = ['SHEP21_0HR_H3K27AC_RX','SHEP21_2HR_H3K27AC_RX','SHEP21_24HR_H3K27AC_RX']
    # makeBoxPlot(shep21_chiprx_dataFile,set_name,gff_name,names_list)

    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-0_+0'
    # names_list = ['SHEP21_0HR_H3K27AC_RX','SHEP21_2HR_H3K27AC_RX','SHEP21_24HR_H3K27AC_RX']
    # makeBoxPlot(shep21_chiprx_dataFile,set_name,gff_name,names_list)

    # #=============================================================================
    # #for nb H3K27ac chip no spike
    # set_name = 'H3K27AC_NOSPIKE'
    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-1kb_+1kb'
    # names_list = ['SHEP21_0HR_H3K27AC_NOSPIKE','SHEP21_2HR_H3K27AC_NOSPIKE','SHEP21_24HR_H3K27AC_NOSPIKE']
    # makeBoxPlot(shep21_dataFile,set_name,gff_name,names_list)

    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-0_+0'
    # names_list = ['SHEP21_0HR_H3K27AC_NOSPIKE','SHEP21_2HR_H3K27AC_NOSPIKE','SHEP21_24HR_H3K27AC_NOSPIKE']
    # makeBoxPlot(shep21_dataFile,set_name,gff_name,names_list)

    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-1kb_+1kb'
    # names_list = ['SHEP21_0HR_H3K27AC_NOSPIKE','SHEP21_2HR_H3K27AC_NOSPIKE','SHEP21_24HR_H3K27AC_NOSPIKE']
    # makeBoxPlot(shep21_dataFile,set_name,gff_name,names_list)

    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-0_+0'
    # names_list = ['SHEP21_0HR_H3K27AC_NOSPIKE','SHEP21_2HR_H3K27AC_NOSPIKE','SHEP21_24HR_H3K27AC_NOSPIKE']
    # makeBoxPlot(shep21_dataFile,set_name,gff_name,names_list)

    # #=============================================================================
    # #for nb ctcf chiprx
    # set_name = 'CTCF_CHIPRX'
    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-1kb_+1kb'
    # names_list = ['SHEP21_0HR_CTCF_RX','SHEP21_2HR_CTCF_RX','SHEP21_24HR_CTCF_RX']
    # makeBoxPlot(shep21_chiprx_dataFile,set_name,gff_name,names_list)

    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-0_+0'
    # names_list = ['SHEP21_0HR_CTCF_RX','SHEP21_2HR_CTCF_RX','SHEP21_24HR_CTCF_RX']
    # makeBoxPlot(shep21_chiprx_dataFile,set_name,gff_name,names_list)

    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-1kb_+1kb'
    # names_list = ['SHEP21_0HR_CTCF_RX','SHEP21_2HR_CTCF_RX','SHEP21_24HR_CTCF_RX']
    # makeBoxPlot(shep21_chiprx_dataFile,set_name,gff_name,names_list)

    # gff_name = 'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-0_+0'
    # names_list = ['SHEP21_0HR_CTCF_RX','SHEP21_2HR_CTCF_RX','SHEP21_24HR_CTCF_RX']
    # makeBoxPlot(shep21_chiprx_dataFile,set_name,gff_name,names_list)

    # Region sets shared by both RNA Pol II box plot series, in plotting order.
    conserved_region_names = (
        'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-1kb_+1kb',
        'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_ENHANCER_-0_+0',
        'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-1kb_+1kb',
        'SHEP21_0HR_MYCN_NOSPIKE_CONSERVED_PROMOTER_-0_+0',
    )

    #=============================================================================
    # RNA Pol II, ChIP-Rx data table
    for region_name in conserved_region_names:
        # a new sample list is built for every call, as in the unrolled form
        pol2_rx_samples = [
            'SHEP21_0HR_POL2_RX', 'SHEP21_2HR_POL2_RX', 'SHEP21_24HR_POL2_RX'
        ]
        makeBoxPlot(shep21_chiprx_dataFile, 'POL2_RX', region_name,
                    pol2_rx_samples)

    #=============================================================================
    # RNA Pol II, no-spike data table
    for region_name in conserved_region_names:
        pol2_nospike_samples = [
            'SHEP21_0HR_POL2_NOSPIKE', 'SHEP21_2HR_POL2_NOSPIKE',
            'SHEP21_24HR_POL2_NOSPIKE'
        ]
        makeBoxPlot(shep21_dataFile, 'POL2_NOSPIKE', region_name,
                    pol2_nospike_samples)
Ejemplo n.º 18
0
def main():
    """Run ``pipeline_dfci.summary`` on every data table of the MYCN project.

    Changes into ``projectFolder`` and then, under one printed banner per
    assay, summarizes each ChIP-Seq table in ``chip_data_list``, each RNA-Seq
    table in ``rna_data_list``, the ATAC-Seq table and the SHEP21 ChIP-Rx
    table.
    """

    def section(title_line):
        """Print ``title_line`` framed by two rule lines and blank padding."""
        rule = '#======================================================================'
        print('\n\n')
        print(rule)
        print(title_line)
        print(rule)
        print('\n\n')

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    # Per the original author's notes, each summary() call below sanity checks
    # a data table and makes sure both bam and .bai files are accessible.

    section(
        '#======================I. CHECKING CHIP-SEQ DATA======================='
    )

    # every ChIP-Seq data table
    for chip_table in chip_data_list:
        pipeline_dfci.summary(chip_table)

    section(
        '#======================II. CHECKING RNA-SEQ DATA======================='
    )

    # every RNA-Seq data table
    for rna_table in rna_data_list:
        pipeline_dfci.summary(rna_table)

    section(
        '#====================III. CHECKING ATAC-SEQ DATA======================='
    )

    # the single ATAC-Seq data table
    pipeline_dfci.summary(atac_dataFile)

    section(
        '#======================IV. CHECKING CHIPRX DATA========================'
    )

    # the SHEP21 ChIP-Rx data table
    pipeline_dfci.summary(shep21_chiprx_dataFile)
Ejemplo n.º 19
0
# Optional: derive the list of dataset names from the loaded data table.
# (dataDict is only defined further down, so this can only be enabled after
# the load below has run.)
#namesList = dataDict.keys()

#print(namesList)

#==========================================================================
#=======================LOADING DATA ANNOTATION============================
#==========================================================================

## This section loads the data table. It must stay uncommented for the rest
## of the code to work.


# Load the data table into dataDict and print its keys.
dataDict = pipeline_dfci.loadDataTable(dataFile)
print(dataDict.keys())

# NOTE(review): summary() presumably reports on / sanity checks the entries
# of the data table -- confirm against pipeline_dfci.
pipeline_dfci.summary(dataFile)

#==========================================================================
#==========================CALLING BOWTIE==================================
#==========================================================================

## This section calls bowtie on raw read files to generate sorted and
## indexed bams in the bam folder.


#namesList = []  <- fill this in if you want to only map a subset of the data. otherwise leave blank

## Set launch to False to debug.
#pipeline_dfci.makeBowtieBashJobs(dataFile,namesList,launch=True)

#==========================================================================
#=============================CALL MACS====================================
Ejemplo n.º 20
0
def main():
    """Drive the MYCN invasion boxplot analysis.

    Changes into ``projectFolder``, runs ``pipeline_dfci.summary`` on the
    three SHEP data tables and then calls ``wrapInvasionBox`` four times,
    once per dataset configuration, always against the same region prefix.
    """

    def _section(title):
        # Each section header is a title framed by a rule and blank padding.
        rule = '#======================================================================'
        for text in ('\n\n', rule, title, rule, '\n\n'):
            print(text)

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    _section(
        '#======================I, LOADING DATA ANNOTATION======================'
    )

    # Summary check of every ChIP-Seq data table used below.
    pipeline_dfci.summary(shep_on_dataFile)
    pipeline_dfci.summary(shep21_dataFile)
    pipeline_dfci.summary(shep21_chiprx_dataFile)

    _section(
        '#=========================II. MAKE BOXPLOTS============================'
    )

    # Every boxplot set shares the same peak-table prefix and region count.
    peak_prefix = 'SHEP21_0HR_MYCN_NOSPIKE_REGIONS_NO_WCE'
    region_count = 5000

    nospike_names = (
        'SHEP21_0HR_MYCN_NOSPIKE',
        'SHEP21_2HR_MYCN_NOSPIKE',
        'SHEP21_24HR_MYCN_NOSPIKE',
    )
    chiprx_names = (
        'SHEP21_0HR_MYCN_RX',
        'SHEP21_2HR_MYCN_RX',
        'SHEP21_24HR_MYCN_RX',
    )

    # SHEP ON system: all datasets, no scale table.
    wrapInvasionBox(shep_on_dataFile,
                    peak_prefix,
                    'SHEP_MYCN',
                    names_list=[],
                    top=region_count,
                    scale_path='')

    # SHEP21 without spike-in. A fresh list is handed to every call so no
    # list object is shared between invocations.
    wrapInvasionBox(shep21_dataFile,
                    peak_prefix,
                    'SHEP_MYCN_NOSPIKE',
                    names_list=list(nospike_names),
                    top=region_count,
                    scale_path='')

    # SHEP21 ChIP-Rx without a scale table.
    wrapInvasionBox(shep21_chiprx_dataFile,
                    peak_prefix,
                    'SHEP_MYCN_RX_NO_SCALE',
                    names_list=list(chiprx_names),
                    top=region_count,
                    scale_path='')

    # SHEP21 ChIP-Rx again, this time with the scale factor table.
    wrapInvasionBox(shep21_chiprx_dataFile,
                    peak_prefix,
                    'SHEP_MYCN_RX',
                    names_list=list(chiprx_names),
                    top=region_count,
                    scale_path='%sHG19_SHEP21_CHIPRX_SCALE_FACTORS.txt' %
                    (tableFolder))
Ejemplo n.º 21
0
def main():
    """Build the BE2C NES table and call the heatmap wrapper on it.

    Changes into ``projectFolder``, runs ``pipeline_dfci.summary`` on the
    ChIP-Seq data table, writes the BE2C NES table via ``makeNESTable`` and
    passes it to ``wrapHeatmap``. The remaining stages are kept as
    commented-out toggles and only print their section banners.
    """

    def _section(title):
        # Each section header is a title framed by a rule and blank padding.
        rule = '#======================================================================'
        for text in ('\n\n', rule, title, rule, '\n\n'):
            print(text)

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    _section('#======================I, LOADING DATA ANNOTATION======================')

    # Summary check of the ChIP-Seq data table.
    pipeline_dfci.summary(nb_all_chip_dataFile)

    _section('#========================II. MAKING NES TABLE==========================')

    # #at a given fdr cutoff, grab the NES pathways

    nes_folder = utils.formatFolder('%snes_tables/' % (projectFolder),True)

    # #for top 5k regions
    # nes_path_list = ['%senhancerPromoter/NB_MYCN_CONSERVED/NB_MYCN_CONSERVED_top_5000_nes.txt' % (projectFolder),
    #                  '%senhancerPromoter/H2171_MYC_REGIONS_H2171_MYC/H2171_MYC_REGIONS_H2171_MYC_top_5000_nes.txt' % (projectFolder),
    #                  '%senhancerPromoter/MM1S_MYC_REGIONS_MM1S_MYC_DMSO/MM1S_MYC_REGIONS_MM1S_MYC_DMSO_top_5000_nes.txt' % (projectFolder),
    #                  '%senhancerPromoter/P493-6_T24_MYC_REGIONS_P493-6_T24_MYC/P493-6_T24_MYC_REGIONS_P493-6_T24_MYC_top_5000_nes.txt' % (projectFolder),
    #                  '%senhancerPromoter/U87_MYC_REGIONS_U87_MYC/U87_MYC_REGIONS_U87_MYC_top_5000_nes.txt' % (projectFolder),
    #                  ]

    # names_list = ['NB_MYCN_CONSERVED','H2171','MM1S','P493-6_T24','U87']
    # output_path = '%sMYC_HIGH_NES.txt' % (nes_folder)
    # makeNESTable(nes_path_list,names_list,output_path)


    # #for shep21 nospike shutdown system
    # nes_path_list = ['%senhancerPromoter/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP21_0HR_MYCN_NOSPIKE/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP21_0HR_MYCN_NOSPIKE_top_5000_nes.txt' % (projectFolder),
    #                  '%senhancerPromoter/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP21_2HR_MYCN_NOSPIKE/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP21_2HR_MYCN_NOSPIKE_top_5000_nes.txt' % (projectFolder),
    #                  '%senhancerPromoter/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP21_24HR_MYCN_NOSPIKE/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP21_24HR_MYCN_NOSPIKE_top_5000_nes.txt' % (projectFolder),
    #                  ]

    # names_list = ['SHEP21_0HR_MYCN_NOSPIKE','SHEP21_2HR_MYCN_NOSPIKE','SHEP21_24HR_MYCN_NOSPIKE']
    # output_path = '%sSHEP21_MYCN_NOSPIKE_NES.txt' % (nes_folder)
    # makeNESTable(nes_path_list,names_list,output_path)


    # #for shep on induction system
    # nes_path_list = ['%senhancerPromoter/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP_0HR_MYCN/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP_0HR_MYCN_top_5000_nes.txt' % (projectFolder),
    #                  '%senhancerPromoter/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP_2HR_MYCN/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP_2HR_MYCN_top_5000_nes.txt' % (projectFolder),
    #                  '%senhancerPromoter/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP_6HR_MYCN/SHEP21_0HR_MYCN_NOSPIKE_REGIONS_SHEP_6HR_MYCN_top_5000_nes.txt' % (projectFolder),
    #                  ]

    # names_list = ['SHEP_0HR_MYCN','SHEP_2HR_MYCN','SHEP_6HR_MYCN']
    # output_path = '%sSHEP_ON_NES.txt' % (nes_folder)
    # makeNESTable(nes_path_list,names_list,output_path)


    #for BE2C comparisons
    nes_path_list = [
        '%senhancerPromoter/BE2C_MYCN/BE2C_MYCN_top_5000_nes.txt' % (projectFolder),
        '%senhancerPromoter/BE2C_H3K27AC_REGIONS/BE2C_H3K27AC_REGIONS_top_5000_nes.txt' % (projectFolder),
    ]

    # The list below was previously assigned and then immediately overwritten.
    # It is kept as a comment to record the wider set of names; only the two
    # names matching nes_path_list are actually used.
    # names_list = ['BE2C_RNA_POL2','BE2C_MYCN','BE2C_H3K27AC','BE2C_BRD4','BE2C_TWIST']
    names_list = ['BE2C_MYCN','BE2C_H3K27AC']
    output_path = '%sBE2C_NES.txt' % (nes_folder)
    makeNESTable(nes_path_list,names_list,output_path)

    _section('#========================III. CALLING HEATMAP==========================')

    # #for high myc
    # nes_path = '%sMYC_HIGH_NES.txt' % (nes_folder)
    # wrapHeatmap(nes_path,0.01,2)



    # #for shep21 nospike
    # nes_path = '%sSHEP21_MYCN_NOSPIKE_NES.txt' % (nes_folder)
    # wrapHeatmap(nes_path,0.1,2)

    # #for shep on
    # nes_path = '%sSHEP_ON_NES.txt' % (nes_folder)
    # wrapHeatmap(nes_path,0.1,2)


    #for be2c
    nes_path = '%sBE2C_NES.txt' % (nes_folder)
    wrapHeatmap(nes_path,0.1,1.5)

    _section('#=====================IV. MAKING TSS DISTAL GFFS=======================')

    # #we want the peak list to cover NB_MYCN_CONSERVED, P4936, MM1S, H2171,U87

    # #for top 5k regions
    # peak_path_list = ['%senhancerPromoter/NB_MYCN_CONSERVED/NB_MYCN_CONSERVED_PEAK_TABLE.txt' % (projectFolder),
    #                  '%senhancerPromoter/H2171_MYC_REGIONS_H2171_MYC/H2171_MYC_REGIONS_H2171_MYC_PEAK_TABLE.txt' % (projectFolder),
    #                  '%senhancerPromoter/MM1S_MYC_REGIONS_MM1S_MYC_DMSO/MM1S_MYC_REGIONS_MM1S_MYC_DMSO_PEAK_TABLE.txt' % (projectFolder),
    #                  '%senhancerPromoter/P493-6_T24_MYC_REGIONS_P493-6_T24_MYC/P493-6_T24_MYC_REGIONS_P493-6_T24_MYC_PEAK_TABLE.txt' % (projectFolder),
    #                  '%senhancerPromoter/U87_MYC_REGIONS_U87_MYC/U87_MYC_REGIONS_U87_MYC_PEAK_TABLE.txt' % (projectFolder),
    #                  ]


    # tss_gff_path,distal_gff_path = makePeakGFFs(peak_path_list)

    _section('#=====================V. MAPPING MYC TO REGIONS========================')
Ejemplo n.º 22
0
def main():
    """Drive the RNA analysis and build the GEO table.

    Changes into ``projectFolder``, runs ``pipeline_dfci.summary`` on the RNA
    data table and calls ``makeGEORNATable``. The alignment and cufflinks
    stages are kept as commented-out toggles and only print their banners.
    """

    short_rule = '#======================================================================'
    long_rule = '#=========================================================================='

    def _section(title, rule):
        # Each section header is a title framed by a rule and blank padding.
        for text in ('\n\n', rule, title, rule, '\n\n'):
            print(text)

    print('rna analysis for project %s' % (projectName))

    print('changing directory to project folder')
    os.chdir(projectFolder)

    _section(
        '#======================I, LOADING DATA ANNOTATION======================',
        short_rule)

    # Summary check of the RNA data table.
    pipeline_dfci.summary(rna_data_file)

    _section(
        '#=======================II, ALIGNING WITH HISAT2===========================',
        long_rule)

    #pipeline_dfci.mapHisat(dataFile,namesList=[],useSRA=False,pCount=16,Launch=True)

    _section(
        '#=======================III, RUNNING RNA-SEQ ANALYSIS======================',
        long_rule)

    #analysisName = 'rasmc_rna'

    #gtfFile = '/storage/cylin/grail/genomes/ERCC_Technical_Data/rn6_ercc.gtf'
    #cufflinksFolder = utils.formatFolder('%scufflinks' % (projectFolder),True)
    #bashFileName = '%s%s_cufflinks.sh' % (cufflinksFolder,analysisName)

    #groupList = [['RASMC_RNA_0H_A','RASMC_RNA_0H_B'],['RASMC_RNA_PDGF_2H_B','RASMC_RNA_PDGF_2H_C','RASMC_RNA_PDGF_2H_D'],['RASMC_RNA_PDGF_JQ1_2H_E','RASMC_RNA_PDGF_JQ1_2H_G','RASMC_RNA_PDGF_JQ1_2H_H'],['RASMC_RNA_PDGF_24H_A','RASMC_RNA_PDGF_24H_B','RASMC_RNA_PDGF_24H_D'],['RASMC_RNA_PDGF_JQ1_24H_E','RASMC_RNA_PDGF_JQ1_24H_F','RASMC_RNA_PDGF_JQ1_24H_H']]

    #print(groupList)
    #pipeline_dfci.makeCuffTableSlurm(rna_data_file,analysisName,gtfFile,cufflinksFolder,groupList,bashFileName)

    # #flag useERCC to true

    _section(
        '#=======================IV, MAKE GEO TABLE=================================',
        long_rule)

    # An empty names list is passed through unchanged to makeGEORNATable.
    geo_label = 'rasmc_rna'
    geo_dir = '/storage/cylin/grail/projects/rasmc_all/rasmc_geo/%s_geo/' % (
        geo_label)
    makeGEORNATable(rna_data_file, [], geo_label, geo_dir)
Ejemplo n.º 23
0
def main():
    """Build the figure GFF and e-box beds, then call every plotting wrapper.

    Changes into ``projectFolder``, runs ``pipeline_dfci.summary`` on the
    ChIP-Seq data tables, creates the figure GFF via ``make_nb_gff`` and the
    bed pair via ``makeEboxBeds``, and passes both to each ``plot_*`` helper.
    """

    def _section(title):
        # Each section header is a title framed by a rule and blank padding.
        rule = '#======================================================================'
        for text in ('\n\n', rule, title, rule, '\n\n'):
            print(text)

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    _section(
        '#======================I, LOADING DATA ANNOTATION======================'
    )

    # Summary check of the pan-NB table first, then every table in the list.
    pipeline_dfci.summary(nb_all_chip_dataFile)
    for chip_table in chip_data_list:
        pipeline_dfci.summary(chip_table)

    _section(
        '#========================II. MAKING FIGURE GFF========================='
    )

    nb_figure_gff_path = make_nb_gff()

    # The e-box beds are made from the conserved MYCN region gff and are
    # passed to the plotting helpers as one comma-joined string.
    conserved_gff = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    canon_bed, noncanon_bed = makeEboxBeds(conserved_gff, name='')

    bed_string = ','.join((canon_bed, noncanon_bed))
    print(bed_string)

    _section(
        '#=====================III. CALLING PLOTTING FUNCTIONS=================='
    )

    # shep21 no spike system
    plot_shep21_genes(nb_figure_gff_path, bed_string)

    # shep21 chiprx system, which additionally needs the scale factor table
    chiprx_scale_table = '%sHG19_SHEP21_CHIPRX_SCALE_FACTORS.txt' % (
        tableFolder)
    plot_shep21_chiprx_genes(shep21_chiprx_dataFile, chiprx_scale_table,
                             nb_figure_gff_path, bed_string)

    # shep on system
    plot_shep_on_genes(shep_on_dataFile, nb_figure_gff_path, bed_string)

    # pan NB metas
    plot_nb_all_genes(nb_all_chip_dataFile, nb_figure_gff_path, bed_string)

    # be2c only
    plot_be2c_genes(be2c_dataFile, nb_figure_gff_path, bed_string)

    # atac: this table gets its summary check right before plotting
    pipeline_dfci.summary(atac_dataFile)
    plot_nb_atac_genes(atac_dataFile, nb_figure_gff_path, bed_string)

    # p493-6
    pipeline_dfci.summary(p4936_young_dataFile)
    plot_p4936_genes(p4936_young_dataFile, nb_figure_gff_path, bed_string)

    # mm1s
    pipeline_dfci.summary(mm1s_dataFile)
    plot_mm_genes(mm1s_dataFile, nb_figure_gff_path, bed_string)
Ejemplo n.º 24
0
def main():
    """Run the differential ROSE analysis and map data to TWIST/MYCN regions.

    Changes into ``projectFolder``, runs ``pipeline_dfci.summary`` on the
    SHEP21 data table and calls ``wrapDRose`` on the two TWIST datasets. It
    then writes a GFF of the stitched regions that overlap both the TWIST and
    the MYCN 0HR peak collections and calls ``map_regions`` on that GFF.
    """

    def _section(title):
        # Each section header is a title framed by a rule and blank padding.
        rule = '#======================================================================'
        for text in ('\n\n', rule, title, rule, '\n\n'):
            print(text)

    def _is_selected(dataset):
        # Same truth table as the original one-line condition, with Python's
        # precedence spelled out: `and` binds tighter than `or`, so the rep2
        # exclusion applies to the TWIST clause only.
        # NOTE(review): if rep2 was meant to be excluded for MYCN and INPUT
        # datasets as well, this condition needs parentheses -- confirm.
        if dataset.count('MYCN') == 1 or dataset.count('INPUT') == 1:
            return True
        return dataset.count('TWIST') == 1 and dataset.count('rep2') == 0

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    _section(
        '#======================I, LOADING DATA ANNOTATION======================'
    )

    # Summary check of the ChIP-Seq data table used throughout.
    pipeline_dfci.summary(shep21_dataFile)

    _section(
        '#================II. RUNNING DIFFERENTIAL ROSE ANALYSIS================'
    )

    # Use the dynamic rose tools to first map twist1 binding sites and then
    # quantify them.
    rank_gff_path = wrapDRose(shep21_dataFile, 'SHEP21_0HR_TWIST',
                              'SHEP21_24HR_B_TWIST', 'SHEP21_TWIST1')

    _section(
        '#=================III. MAPPING MYCN DATA TO RANK GFF==================='
    )

    # for shep21 nospike
    gffList = [rank_gff_path]
    dataDict = pipeline_dfci.loadDataTable(shep21_dataFile)
    names_list = [
        dataset for dataset in dataDict.keys() if _is_selected(dataset)
    ]
    print(names_list)
    #map_regions(shep21_dataFile,gffList,names_list)

    twist_bed_path = '%smacsEnriched/SHEP21_0HR_TWIST_peaks.bed' % (
        projectFolder)
    gffList = [twist_bed_path]
    #map_regions(shep21_dataFile,gffList,names_list)

    # Make a gff of twist and mycn sites at 0hr.
    twist_collection = utils.importBoundRegion(
        '%smacsEnriched/SHEP21_0HR_TWIST_peaks.bed' % (projectFolder),
        'SHEP21_0HR_TWIST')

    mycn_collection = utils.importBoundRegion(
        '%smacsEnriched/SHEP21_0HR_MYCN_NOSPIKE_peaks.bed' % (projectFolder),
        'SHEP21_0HR_MYCN_NOSPIKE')

    # Pool the loci of both collections and stitch them.
    pooled_loci = twist_collection.getLoci() + mycn_collection.getLoci()
    stitched_loci = utils.LocusCollection(
        pooled_loci, 50).stitchCollection().getLoci()

    # Keep only stitched loci that overlap at least one locus in each of the
    # two collections.
    shared_loci = [
        locus for locus in stitched_loci
        if len(twist_collection.getOverlap(locus, 'both')) > 0
        and len(mycn_collection.getOverlap(locus, 'both')) > 0
    ]

    shared_gff = utils.locusCollectionToGFF(
        utils.LocusCollection(shared_loci, 50))
    shared_gff_path = '%sHG19_SHEP21_0HR_TWIST_MYCN_INTERSECTION_-0_+0.gff' % (
        gffFolder)
    utils.unParseTable(shared_gff, shared_gff_path, '\t')

    gffList = [shared_gff_path]
    map_regions(shep21_dataFile, gffList, names_list)
Ejemplo n.º 25
0
def main():
    """Check every data table and write the combined sequencing table.

    Changes into ``projectFolder``, runs ``pipeline_dfci.summary`` on the
    ChIP-Seq, RNA-Seq, ATAC-Seq and IRF2 data tables, formats the cufflinks
    folder and calls ``make_summary_table`` on ``data_file_list``.
    """

    def _section(title):
        # Each section header is a title framed by a rule and blank padding.
        rule = '#======================================================================'
        for text in ('\n\n', rule, title, rule, '\n\n'):
            print(text)

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    _section(
        '#======================I. CHECKING CHIP-SEQ DATA======================='
    )

    # Summary check of the ChIP-Seq data table.
    pipeline_dfci.summary(chip_dataFile)

    _section(
        '#======================II. CHECKING RNA-SEQ DATA======================='
    )

    # Summary check of the RNA-Seq data table.
    pipeline_dfci.summary(rna_dataFile)

    # The cuffquant/cuffnorm stage is disabled below. The folder is still
    # formatted and the analysis name kept, because the commented-out lines
    # refer to both.
    cufflinksFolder = utils.formatFolder('%scufflinks' % (projectFolder), True)
    analysis_name = 'NIBR_YvsO'
    #groupList = [['Y_BC10_Y1','Y_BC11_Y2','Y_BC16_Y3'],['O_BC18_O1','O_BC25_O2','O_BC27_O3']]
    #bashFileName = '%s%s_rna_cufflinks.sh' % (cufflinksFolder,analysis_name)
    #pipeline_dfci.makeCuffTable(rna_dataFile,analysis_name,gtfFile,cufflinksFolder,groupList,bashFileName)

    _section(
        '#====================III. CHECKING ATAC-SEQ DATA======================='
    )

    # Summary check of the ATAC-Seq data table.
    pipeline_dfci.summary(atac_dataFile)

    _section(
        '#====================IV. CHECKING IRF2 CHIPMENTATION==================='
    )

    # Summary check of the IRF2 data table.
    pipeline_dfci.summary(irf2_dataFile)

    _section(
        '#====================V. SUMMARIZING ALL DATA==========================='
    )

    seq_table_path = '%stables/HG19_HPEK_SEQ_TABLE.txt' % (projectFolder)
    make_summary_table(data_file_list, seq_table_path)
Ejemplo n.º 26
0
def main():
    """Check the ChIP-Seq tables and run the MYCN vector comparison R script.

    Changes into ``projectFolder``, runs ``pipeline_dfci.summary`` on every
    table in ``chip_data_list`` and runs ``2_nb_mycn_vector_plots.R`` through
    ``os.system``. All other stages are kept as commented-out toggles and
    only print their section banners.
    """

    def _section(title):
        # Each section header is a title framed by a rule and blank padding.
        rule = '#======================================================================'
        for text in ('\n\n', rule, title, rule, '\n\n'):
            print(text)

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    _section('#======================I, LOADING DATA ANNOTATION======================')

    # Summary check of every ChIP-Seq data table.
    for chip_table in chip_data_list:
        pipeline_dfci.summary(chip_table)

    _section('#==========================II. CALLING MACS============================')

    #running peak finding using macs 1.4.2 on all chip datasets
    #this usually takes ~2-3 hours on a reasonably fast machine
    #a 3 hour time out on this entire operation is set
    #if peak calling takes longer than 3 hours, simply run the script again after completion


    # for dataFile in chip_data_list:

    #     run_macs(dataFile)

    _section('#===================III. DEFINING ACTIVE GENES IN NB===================')

    # #here we will identify active promoters in various contexts as those with
    # #an H3K27AC peak in the +/- 1kb tss region
    # #UCSC refseq annotations are used for all genes
    # #make_nb_active_gene_lists(nb_all_chip_dataFile)

    # make_active_gene_lists(mm1s_dataFile,p4936_young_dataFile,sclc_dataFile,shep_on_dataFile,u87_dataFile)

    _section('#===============IV. DEFINING NB MYCN AND H3K27AC LANDSCAPE=============')

    # #for enhancers
    # enhancer_bashFileName,enhancer_region_map_path,namesList = define_enhancer_landscape(projectFolder,pipeline_dir,nb_all_chip_dataFile)

    # #runs only if no output detected
    # if not utils.checkOutput(enhancer_region_map_path,0,0):
    #     print(enhancer_bashFileName)
    #     os.system('bash %s' % (enhancer_bashFileName))


    # #for mycn
    # mycn_bashFileName,mycn_region_map_path,namesList = define_mycn_landscape(projectFolder,pipeline_dir,nb_all_chip_dataFile)

    # if not utils.checkOutput(mycn_region_map_path,0,0):
    #     print(mycn_bashFileName)
    #     os.system('bash %s' % (mycn_bashFileName))

    # #now we need to call the R script that creates the rank plots
    # if utils.checkOutput(mycn_region_map_path,1,30): #set a wait time for 30 minutes
    #     print('Found NB_MYCN meta_rose landscape and running rank plot R code')

    #     conserved_rank_path = '%smeta_rose/NB_MYCN/NB_MYCN_0KB_STITCHED_ENHANCER_REGION_RANK_CONSERVED.txt' % (projectFolder)
    #     if utils.checkOutput(conserved_rank_path,0,0):
    #         print('Identified NB rank conserved regions: %s' % (conserved_rank_path))
    #     else:
    #         print('Defining NB rank conserved regions')
    #         name_string = ','.join(namesList) #provides the dataset names used
    #         rank_script_path = '%sr_scripts/1_nb_mycn_rank.R' % (projectFolder)
    #         r_cmd = 'Rscript %s %s %s %s' % (rank_script_path,mycn_region_map_path,name_string,projectFolder)
    #         print(r_cmd)
    #         os.system(r_cmd)

    _section('#==========V. MAPPING MYCN AND H3K27AC TO MYCN REGIONS=================')

    # #here we will first make a gff of conserved NB MYCN regions
    # #and then map MYCN and H3K27ac signal

    # print('Making a gff and bed of conserved NB MYCN regions:')

    # mycn_gff_path,mycn_flank_gff_path = make_mycn_regions(conserved_rank_path)

    # print('Mapping MYCN and H3K27AC signal')
    # gffList = [mycn_gff_path,mycn_flank_gff_path]
    #gffList = ['%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder),'%sHG19_NB_MYCN_CONSERVED_-500_+500.gff' % (gffFolder)]
    #pipeline_dfci.map_regions(nb_all_chip_dataFile,gffList,mappedFolder,signalFolder)

    _section('#==================VI. CREATING NB MYCN STATS TABLE====================')

    # mycn_table_path = '%stables/HG19_NB_MYCN_CONSERVED_STATS_TABLE.txt' % (projectFolder)
    # if utils.checkOutput(mycn_table_path,0,0):
    #     print('Identified MYCN table %s' % (mycn_table_path))
    # else:
    #     print('Making MYCN stats table')
    #     mycn_table_path = make_mycn_stats_table(nb_all_chip_dataFile,mycn_table_path)

    # The stats table is not rebuilt here; only its path is set for the R
    # script below. The name is kept because the commented-out stage uses it.
    mycn_table_path = '%stables/HG19_NB_MYCN_CONSERVED_STATS_TABLE.txt' % (projectFolder)
    #mycn_table_path = make_mycn_stats_table(nb_all_chip_dataFile,mycn_table_path)

    _section('#=================VII. MAKING VECTOR COMPARISON PLOTS==================')

    # Assemble the Rscript command line, echo it, then run it through the
    # shell.
    vector_script = '%sr_scripts/2_nb_mycn_vector_plots.R' % (projectFolder)
    rscript_call = 'Rscript %s %s %s' % (vector_script,mycn_table_path,projectFolder)
    print(rscript_call)
    os.system(rscript_call)

    _section('#==================VIII. RANKING EBOXES IN MYCN PEAKS==================')

    # mycn_gff_path = '%sHG19_NB_MYCN_CONSERVED_-0_+0.gff' % (gffFolder)
    # ebox_rank_path = rank_eboxes(nb_all_chip_dataFile,mycn_gff_path,macsFolder,genomeDirectory,window = 100)

    # print(ebox_rank_path)

    # #now make the heatmap
    # ebox_heatmap_script_path = '%sr_scripts/3_nb_ebox_heatmap.R' % (projectFolder)
    # r_cmd = 'Rscript %s %s %s' % (ebox_heatmap_script_path,ebox_rank_path,projectFolder)
    # print(r_cmd)
    # os.system(r_cmd)

    _section('#====================IX. MAPPING BE2C DATASETS TO TSS==================')
def main():
    """Check the data tables and run the cufflinks bash script.

    Changes into ``projectFolder``, runs ``pipeline_dfci.summary`` on the
    ChIP-Seq, RNA-Seq and ATAC-Seq data tables, and in between calls
    ``pipeline_dfci.makeCuffTable`` and runs the resulting bash file.

    Exits the interpreter with status 1 if the bash script returns a non-zero
    exit code.
    """

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#======================I. CHECKING CHIP-SEQ DATA======================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    # Summary check of the ChIP-Seq data table.
    pipeline_dfci.summary(chip_dataFile)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#======================II. CHECKING RNA-SEQ DATA======================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    # Summary check of the RNA-Seq data table.
    pipeline_dfci.summary(rna_dataFile)

    # Set up the cuffquant/cuffnorm/RNA-seq pipeline for the two sample
    # groups; makeCuffTable is handed the bash file name that is run below.
    cufflinksFolder = utils.formatFolder('%scufflinks' % (projectFolder), True)
    analysis_name = 'NIBR_YvsO'
    groupList = [['Y_BC10_Y1', 'Y_BC11_Y2', 'Y_BC16_Y3'],
                 ['O_BC18_O1', 'O_BC25_O2', 'O_BC27_O3']]
    bashFileName = '%s%s_rna_cufflinks.sh' % (cufflinksFolder, analysis_name)
    pipeline_dfci.makeCuffTable(rna_dataFile, analysis_name, gtfFile,
                                cufflinksFolder, groupList, bashFileName)

    call_bashFileName = 'bash %s' % bashFileName
    proc = subprocess.Popen(call_bashFileName, shell=True)

    # Block until cufflinks has finished.
    proc.wait()

    # Any non-zero return code counts as failure. The message previously used
    # a Python 2 print statement without the `%` operator, which is a
    # SyntaxError on Python 3 and a TypeError on Python 2.
    if proc.returncode:
        print('running %s failed' % (call_bashFileName))
        sys.exit(1)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#====================III. CHECKING ATAC-SEQ DATA======================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    # Summary check of the ATAC-Seq data table.
    pipeline_dfci.summary(atac_dataFile)
Ejemplo n.º 28
0
def main():
    '''
    runs the currently enabled steps of the IRF2 analysis

    active steps: change into projectFolder, sanity check the ChIP data table
    with pipeline_dfci.summary, and call pipeline_dfci.callBatchPlot on the
    figure 2 gene regions. every other section only prints its banner: its
    code is commented out, or it is a local helper (merge_regions,
    wrapRose2Meta, wrap_enhancer_promoter) whose call is commented out.

    relies on module-level settings defined elsewhere in the file
    (projectName, projectFolder, chip_data_file, gffFolder, ...)
    '''

    rule = '#======================================================================'

    def print_section(title):
        '''
        prints a section banner: the title line framed by two rule lines
        with blank-line padding before and after
        '''
        print('\n\n')
        print(rule)
        print(title)
        print(rule)
        print('\n\n')

    print('main analysis for project %s' % (projectName))

    print('changing directory to project folder')
    os.chdir(projectFolder)

    print_section(
        '#======================I. LOADING DATA ANNOTATION======================'
    )

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible

    #for data file
    pipeline_dfci.summary(chip_data_file)

    print_section(
        '#===========================II. CALLING MACS==========================='
    )

    #pipeline_dfci.run_macs(chip_data_file,projectFolder,macsFolder,macsEnrichedFolder,wiggleFolder,useBackground=True)

    print_section(
        '#=======================III. MERGING IRF2 REGIONS======================'
    )

    #create a set of regions representing the intersect of peaks
    #filter out anything that overlaps a peak in the HA ctl

    def merge_regions():
        '''
        merges ha peaks to identify all overlapping peaks
        filters out anything overlapping the HA controls

        a stitched region is kept only if it overlaps a peak in BOTH dox
        replicates and no peak in either control replicate. the surviving
        regions are written as a gff into gffFolder
        '''
        hk_dox_ha_1 = utils.importBoundRegion(
            '%sHK_DOX_HA_1_peaks.bed' % (macsEnrichedFolder), 'HK_DOX_HA_1')
        hk_dox_ha_2 = utils.importBoundRegion(
            '%sHK_DOX_HA_2_peaks.bed' % (macsEnrichedFolder), 'HK_DOX_HA_2')

        hk_dox_loci = hk_dox_ha_1.getLoci() + hk_dox_ha_2.getLoci()

        #control datasets
        hk_ctl_ha_1 = utils.importBoundRegion(
            '%sHK_CTL_HA_1_peaks.bed' % (macsEnrichedFolder), 'HK_CTL_HA_1')
        hk_ctl_ha_2 = utils.importBoundRegion(
            '%sHK_CTL_HA_2_peaks.bed' % (macsEnrichedFolder), 'HK_CTL_HA_2')

        hk_ctl_loci = hk_ctl_ha_1.getLoci() + hk_ctl_ha_2.getLoci()
        hk_ctl_lc = utils.LocusCollection(hk_ctl_loci)

        #progress counts: raw dox peaks, stitched regions, regions kept
        print(len(hk_dox_loci))
        stitched_lc = utils.LocusCollection(hk_dox_loci).stitchCollection()
        print(len(stitched_lc))
        filtered_loci = []
        for locus in stitched_lc.getLoci():
            in_both_replicates = (len(hk_dox_ha_1.getOverlap(locus)) > 0
                                  and len(hk_dox_ha_2.getOverlap(locus)) > 0)
            if in_both_replicates and len(hk_ctl_lc.getOverlap(locus)) == 0:
                filtered_loci.append(locus)

        print(len(filtered_loci))
        filtered_lc = utils.LocusCollection(filtered_loci)
        gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_0_+0.gff' % (
            gffFolder)
        filtered_gff = utils.locusCollectionToGFF(filtered_lc)
        utils.unParseTable(filtered_gff, gff_path, '\t')

    #merge_regions()

    print_section(
        '#======================IV. IDENTIFY ATAC OVERLAP REGIONS==============='
    )

    # atac_bed_path = '%sHG19_combined_atac_-0_+0.bed' % (bedFolder)# all combined atac regions

    # atac_collection = utils.importBoundRegion(atac_bed_path,'HG19_combined_atac')
    # print(len(atac_collection))

    # #now filter the irf2 gff
    # irf2_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_0_+0.gff' % (gffFolder)

    # irf2_collection = utils.gffToLocusCollection(irf2_gff_path)
    # irf2_loci = irf2_collection.getLoci()

    # irf2_atac_loci = [locus for locus in irf2_loci if atac_collection.getOverlap(locus)]
    # print(len(irf2_atac_loci))
    # irf2_atac_collection=utils.LocusCollection(irf2_atac_loci)

    # irf2_atac_gff = utils.locusCollectionToGFF(irf2_atac_collection)
    # irf2_atac_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_ATAC_0_+0.gff' % (gffFolder)
    # utils.unParseTable(irf2_atac_gff,irf2_atac_gff_path,'\t')

    # # overlap with TSS
    # tss_gff_path = '%sHG19_TSS_ALL_-1000_+1000.gff' % (gffFolder)
    # tss_gff = utils.parseTable(tss_gff_path,'\t')
    # tss_collection = utils.gffToLocusCollection(tss_gff)

    # print('tss overlap w/ IRF2  atac peaks')
    # print(len([locus for locus in irf2_atac_loci if tss_collection.getOverlap(locus)]))
    # print(len(irf2_atac_loci))

    # #overlap w/ k27ac
    # k27ac_gff_path = '%sHG19_keratinocyte_combined_all_-0_+0.gff' % (gffFolder)
    # k27ac_gff = utils.parseTable(k27ac_gff_path,'\t')
    # k27ac_collection = utils.gffToLocusCollection(k27ac_gff)

    # print('k27ac overlap w/ IRF2  atac peaks')
    # print(len([locus for locus in irf2_atac_loci if k27ac_collection.getOverlap(locus)]))
    # print(len(irf2_atac_loci))

    print_section(
        '#========================V. CALLING ROSE2 META========================='
    )

    def wrapRose2Meta(data_file,
                      input_path,
                      parent_folder,
                      active_gene_path='',
                      rank_list=None,
                      control_list=None,
                      analysis_name=''):
        '''
        quick wrapper for Rose2Meta

        writes (does not run) a bash script holding a ROSE2_META.py call
        followed by a ROSE2_geneMapper.py call on the resulting
        AllEnhancers table

        data_file: data table loaded with pipeline_dfci.loadDataTable
        input_path: region file handed to ROSE2_META.py via -i
        parent_folder: the script is written here and the output folder
            parent_folder + analysis_name is created here
        active_gene_path: if not empty, handed to the gene mapper via -l
        rank_list: dataset names whose bams are joined into the -r argument
        control_list: dataset names whose bams are joined into the -c argument
        analysis_name: used for the -n argument and for file naming
        '''
        #None stands in for "no names" so no list object is shared between calls
        rank_list = [] if rank_list is None else rank_list
        control_list = [] if control_list is None else control_list

        dataDict = pipeline_dfci.loadDataTable(data_file)
        rank_string = ','.join([dataDict[name]['bam'] for name in rank_list])
        control_string = ','.join(
            [dataDict[name]['bam'] for name in control_list])

        output_folder = utils.formatFolder(
            '%s%s' % (parent_folder, analysis_name), True)
        #NOTE(review): blacklist_path is not a parameter and its only assignment
        #in this function is commented out below, so it has to exist at module
        #level when this helper is called - confirm before re-enabling the calls
        rose2_meta_cmd = '%s %sROSE2_META.py -g %s -i %s -r %s -c %s -n %s -o %s -s 0 -t 0 --mask %s' % (
            py27_path, pipeline_dir, genome, input_path, rank_string,
            control_string, analysis_name, output_folder, blacklist_path)

        all_enhancer_path = '%s%s_AllEnhancers.table.txt' % (output_folder,
                                                             analysis_name)

        if active_gene_path != '':
            rose2_map_cmd = '%s %sROSE2_geneMapper.py -g %s -i %s -l %s' % (
                py27_path, pipeline_dir, genome, all_enhancer_path,
                active_gene_path)
        else:
            rose2_map_cmd = '%s %sROSE2_geneMapper.py -g %s -i %s' % (
                py27_path, pipeline_dir, genome, all_enhancer_path)

        rose_bash_path = '%s%s_rose2_meta.sh' % (parent_folder, analysis_name)
        #with-block closes the script even if one of the writes fails
        with open(rose_bash_path, 'w') as rose_bash:
            #the file holds shell command lines, so it gets a bash shebang
            #(same one wrap_enhancer_promoter writes), not a python one
            rose_bash.write('#!/usr/bin/bash\n\n')
            rose_bash.write('#setting up bamliquidator\n')

            rose_bash.write('\n\n#ROSE2_CMD\n')
            rose_bash.write(rose2_meta_cmd + '\n')
            rose_bash.write(rose2_map_cmd + '\n')

        print('Wrote ROSE2 META CMD to %s' % (rose_bash_path))

    #use ROSE2 w/ -t 0 and -s 0 to quantify background subtracted AUC at all peaks

    # parent_folder = utils.formatFolder('%smeta_rose/' % (projectFolder),True)

    # blacklist_path = '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed'
    # active_gene_path = '%sgeneListFolder/HG19_KERATINOCYTE_ACTIVE.txt' % (projectFolder)
    # #creating bam lists

    # rank_list = ['HK_DOX_HA_1','HK_DOX_HA_2']
    # control_list =['HK_DOX_WCE_1','HK_DOX_WCE_2']

    # #for all IRF2 HA
    # irf2_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_0_+0.gff' % (gffFolder)
    # analysis_name = 'IRF2_HA'
    # wrapRose2Meta(chip_data_file,irf2_gff_path,parent_folder,active_gene_path,rank_list,control_list,analysis_name)

    # irf2_atac_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_ATAC_0_+0.gff' % (gffFolder)
    # analysis_name = 'IRF2_HA_ATAC'
    # wrapRose2Meta(chip_data_file,irf2_atac_gff_path,parent_folder,active_gene_path,rank_list,control_list,analysis_name)

    print_section(
        '#================VI. OVERLAPPING IRF2 W/ MOTIF PREDICTIONS============='
    )

    # #load up peaks
    # #irf2_atac_peaks
    # irf2_atac_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_ATAC_0_+0.gff' % (gffFolder)
    # irf2_atac_gff = utils.parseTable(irf2_atac_gff_path,'\t')
    # irf2_atac_loci = utils.gffToLocusCollection(irf2_atac_gff).getLoci()
    # irf2_atac_collection = utils.LocusCollection(irf2_atac_loci)
    # print(len(irf2_atac_loci))

    # irf2_edge_path = '%scrc_atac/keratinocyte_combined_all/keratinocyte_combined_all_EDGE_TABLE_signal_filtered_IRF2_EDGES.txt' % (projectFolder)

    # irf2_edge_table = utils.parseTable(irf2_edge_path,'\t')
    # print(len(irf2_edge_table))

    # irf2_confirmed_edges = []
    # irf2_edge_loci = []
    # for line in irf2_edge_table[1:]:
    #     chrom = line[1].split('(')[0]
    #     coords = [int(x) for x in line[1].split(':')[-1].split('-')]
    #     locus = utils.Locus(chrom,coords[0]-00,coords[1]+00,'.',line[0])
    #     if len(irf2_atac_collection.getOverlap(locus)) > 0:
    #         irf2_confirmed_edges.append(line)
    #     irf2_edge_loci.append(locus)
    # print(len(irf2_confirmed_edges))

    # irf2_confirmed_edge_path = '%scrc_atac/keratinocyte_combined_all/keratinocyte_combined_all_EDGE_TABLE_signal_filtered_IRF2_EDGES_CONFIRMED.txt' % (projectFolder)
    # utils.unParseTable(irf2_confirmed_edges,irf2_confirmed_edge_path,'\t')

    # irf2_edge_collection = utils.LocusCollection(irf2_edge_loci)
    # print(len(irf2_edge_collection))

    # overlap_count = 0
    # for locus in irf2_atac_loci:
    #     search_locus = utils.makeSearchLocus(locus,0,0)
    #     if len(irf2_edge_collection.getOverlap(search_locus)) >0:
    #         overlap_count+=1
    # print(overlap_count)

    print_section(
        '#=================VII. RUNNING ENHANCER PROMOTER ON IRF2==============='
    )

    def wrap_enhancer_promoter(dataFile,
                               input_path,
                               activity_path,
                               analysis_name,
                               names_list=None,
                               useBackground=True):
        '''
        runs enhancer promoter on everybody with the conserved regions and union of active genes

        writes (does not run) a bash script with one enhancerPromoter.py
        call into the enhancerPromoter/ folder of the project and returns
        the path of that script

        dataFile: data table loaded with pipeline_dfci.loadDataTable
        input_path: region file handed to enhancerPromoter.py via -i
        activity_path: handed to enhancerPromoter.py via -a
        analysis_name: used for --name and for naming the script
        names_list: datasets whose bams go into -b. if empty or None, all
            datasets of the table are used in sorted order
        useBackground: if True, the bams of each dataset's 'background'
            entry are added via -c
        '''

        #hard coded paths
        tads_path = '%shESC_domains_hg19.bed' % (bedFolder)

        #setting the output folder
        ep_folder = utils.formatFolder('%senhancerPromoter/' % (projectFolder),
                                       True)

        dataDict = pipeline_dfci.loadDataTable(dataFile)
        if not names_list:
            names_list = sorted(dataDict.keys())

        bams_list = [dataDict[name]['bam'] for name in names_list]
        bams_string = ' '.join(bams_list)

        if useBackground:
            #backgrounds are resolved only when they are used, so a dataset
            #whose background name is not in the table cannot break the
            #useBackground=False case with a KeyError
            background_names = [
                dataDict[name]['background'] for name in names_list
            ]
            background_list = [
                dataDict[background_name]['bam']
                for background_name in background_names
            ]
            background_string = ' '.join(background_list)

            python_cmd = 'python %senhancerPromoter.py -b %s -c %s -g %s -i %s -o %s -a %s --name %s --tads %s --top 2000\n\n' % (
                pipeline_dir, bams_string, background_string, genome.upper(),
                input_path, ep_folder, activity_path, analysis_name, tads_path)
        else:
            python_cmd = 'python %senhancerPromoter.py -b %s -g %s -i %s -o %s -a %s --name %s --tads %s --top 2000\n\n' % (
                pipeline_dir, bams_string, genome.upper(), input_path,
                ep_folder, activity_path, analysis_name, tads_path)

        ep_bash_path = '%s%s_enhancer_promoter.sh' % (ep_folder, analysis_name)
        #with-block closes the script even if one of the writes fails
        with open(ep_bash_path, 'w') as ep_bash:
            ep_bash.write('#!/usr/bin/bash\n\n\n')

            ep_bash.write('#enhancer promoter analysis for %s\n\n' %
                          (analysis_name))

            ep_bash.write(python_cmd)

        return (ep_bash_path)

    # # blacklist_path = '/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed'
    # active_gene_path = '%sgeneListFolder/HG19_KERATINOCYTE_ACTIVE.txt' % (projectFolder)
    # irf2_atac_gff_path = '%sHG19_IRF2_HA_MERGED_FILTERED_CONSERVED_ATAC_0_+0.gff' % (gffFolder)
    # analysis_name = 'IRF2_HA_ATAC'
    # bam_list = ['HK_DOX_HA_1','HK_DOX_HA_2']
    # wrap_enhancer_promoter(chip_data_file,irf2_atac_gff_path,active_gene_path,analysis_name,bam_list,useBackground=True)

    print_section(
        '#==============VIII. FORMATTING THE HORRIFYING EXPRESSION TABLE========'
    )

    # exp_path = '%sirf2_kd_rna_seq/single_counts_filtered_counts.txt' % (projectFolder)
    # sample_key_path = '%sirf2_kd_rna_seq/sample_key.txt' % (projectFolder)

    # sample_table = utils.parseTable(sample_key_path,'\t')
    # sample_list = [line[0] for line in sample_table[1:]]
    # print(sample_list)
    # exp_table = utils.parseTable(exp_path,'\t')

    # #for each gene make a dictionary
    # exp_dict = {}

    # #first fill out the dictionary by gene name
    # for line in exp_table[1:]:
    #     gene_name = line[3].replace('"','')
    #     exp_dict[gene_name] = {}

    # print(len(exp_dict.keys()))

    # for line in exp_table[1:]:
    #     gene_name = line[3].replace('"','')
    #     sample_ID = line[4].replace('"','')
    #     counts = line[2]
    #     exp_dict[gene_name][sample_ID] = counts

    # #make the formatted expression table
    # header = ['GENE_NAME'] + sample_list
    # exp_table_formatted = [header]
    # gene_list = exp_dict.keys()
    # gene_list.sort()
    # for gene in gene_list:
    #     exp_line = [gene] + [exp_dict[gene][sample_ID] for sample_ID in sample_list]
    #     exp_table_formatted.append(exp_line)

    # exp_table_formatted_path = '%sirf2_kd_rna_seq/irf2_expression_formatted.txt' % (projectFolder)
    # utils.unParseTable(exp_table_formatted,exp_table_formatted_path,'\t')

    # #with the exp dict we can make a nicer version of the gene table
    # gene_table_path = '%senhancerPromoter/IRF2_HA_ATAC/IRF2_HA_ATAC_GENE_TABLE.txt' % (projectFolder)
    # gene_table_formatted_path = '%senhancerPromoter/IRF2_HA_ATAC/IRF2_HA_ATAC_GENE_TABLE_FORMATTED.txt' % (projectFolder)

    # gene_table = utils.parseTable(gene_table_path,'\t')
    # gene_table_formatted = [gene_table[0] + ['IRF2_TOTAL_SIGNAL'] + header+ ['OLD_IRF2_KD_MEAN','OLD_CTL_MEAN','OLD_IRF2_VS_CTL','YOUNG_IRF2_KD_MEAN','YOUNG_CTL_MEAN','YOUNG_IRF2_VS_CTL']]
    # for line in gene_table[1:]:
    #     if float(line[1]) == 0.0 and float(line[2]) == 0.0:
    #         continue
    #     if exp_dict.has_key(line[0]) == False:
    #         continue
    #     gene = line[0]
    #     #where conditions are met
    #     old_kd_mean = numpy.mean([float(exp_dict[gene][x]) for x in ['IRF2_KD_OLD_1', 'IRF2_KD_OLD_2', 'IRF2_KD_OLD_3']])
    #     old_ctl_mean = numpy.mean([float(exp_dict[gene][x]) for x in ['CT_CRISPR_OLD_1', 'CT_CRISPR_OLD_2', 'CT_CRISPR_OLD_3']])
    #     old_fold = numpy.log2(old_kd_mean/old_ctl_mean)

    #     young_kd_mean = numpy.mean([float(exp_dict[gene][x]) for x in ['IRF2_KD_YOUNG_1', 'IRF2_KD_YOUNG_2', 'IRF2_KD_YOUNG_3']])
    #     young_ctl_mean = numpy.mean([float(exp_dict[gene][x]) for x in ['CT_CRISPR_YOUNG_1', 'CT_CRISPR_YOUNG_2', 'CT_CRISPR_YOUNG_3']])
    #     young_fold = numpy.log2(young_kd_mean/young_ctl_mean)

    #     exp_line = [gene] + [exp_dict[gene][sample_ID] for sample_ID in sample_list] + [round(x,4) for x in [old_kd_mean,old_ctl_mean,old_fold,young_kd_mean,young_ctl_mean,young_fold]]
    #     gene_table_formatted.append(line+[sum([float(x) for x in line[1:3]])] + exp_line)

    # utils.unParseTable(gene_table_formatted,gene_table_formatted_path,'\t')

    print_section(
        '#=================IX. ANNOTATING IRF2 KD CLUSTERGRAM==================='
    )

    #this little bit of python code is on the dropbox... need to move over

    print_section(
        '#=======================X. PLOTTING FIGURE REGIONS ===================='
    )

    figure_gff_path = '%sHG19_KERATINOCYTE_FIGURE_2_GENES.gff' % (gffFolder)
    plotName = 'IRF2_FIGURE_2_GENES'
    outputFolder = utils.formatFolder('%sgene_plot/IRF2/' % (projectFolder),
                                      True)
    pipeline_dfci.callBatchPlot(chip_data_file,
                                figure_gff_path,
                                plotName,
                                outputFolder,
                                namesList=['HK_DOX_HA_1', 'HK_DOX_HA_2'],
                                uniform=True,
                                bed='',
                                plotType='MULTIPLE',
                                extension=200,
                                multiPage=False,
                                debug=False,
                                nameString='',
                                rpm=True,
                                rxGenome='',
                                scaleFactorString='')
Ejemplo n.º 29
0
def main():
    '''
    runs the currently enabled steps of the MYCN analysis

    changes into projectFolder, runs pipeline_dfci.summary on every table
    in chip_data_list and calls pipeline_dfci.map_regions for the SHEP21
    chiprx POL2 datasets. sections III and IV only print their banner
    '''

    rule = '#======================================================================'

    def announce(title):
        '''
        prints a section banner: the title line framed by two rule lines
        with blank-line padding before and after
        '''
        print('\n\n')
        print(rule)
        print(title)
        print(rule)
        print('\n\n')

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    announce(
        '#======================I. CHECKING CHIP-SEQ DATA======================='
    )

    #sanity check every ChIP-Seq data table
    for chip_table in chip_data_list:
        pipeline_dfci.summary(chip_table)

    announce(
        '#===================II. MAKING POL2 SIGNAL TABLES======================'
    )

    #the two region sets that get mapped
    tss_gff = '%sHG19_TSS_ALL_-300_+300.gff' % (gffFolder)
    body_gff = '%sHG19_BODY_ALL_+300_+3000.gff' % (gffFolder)
    gffList = [tss_gff, body_gff]

    #datasets for the no-spike mapping, which is currently disabled
    nospike_names = [
        'SHEP21_0HR_POL2_NOSPIKE_R2',
        'SHEP21_2HR_POL2_NOSPIKE',
        'SHEP21_24HR_POL2_NOSPIKE',
        'SHEP21_0HR_INPUT_NOSPIKE',
        'SHEP21_2HR_INPUT_NOSPIKE_rep2',
        'SHEP21_24HR_INPUT_NOSPIKE_rep2',
    ]

    #shep21_nospike_pol2_signal_path = pipeline_dfci.map_regions(shep21_dataFile,gffList,mappedFolder,signalFolder,nospike_names,medianNorm=False,output='')

    #now for shep21 chiprx
    chiprx_names = [
        'SHEP21_0HR_POL2_RX',
        'SHEP21_2HR_POL2_RX',
        'SHEP21_24HR_POL2_RX',
        'SHEP21_0HR_INPUT_RX_1',
        'SHEP21_2HR_INPUT_RX_1',
        'SHEP21_24HR_INPUT_RX_1',
    ]

    #the returned value is stored but not used further in this function
    pol2_signal_path = pipeline_dfci.map_regions(shep21_chiprx_dataFile,
                                                 gffList,
                                                 mappedFolder,
                                                 signalFolder,
                                                 chiprx_names,
                                                 medianNorm=False,
                                                 output='')

    #no work is wired up yet for the remaining two sections
    announce(
        '#====================III. CHECKING ATAC-SEQ DATA======================='
    )

    announce(
        '#======================IV. CHECKING CHIPRX DATA========================'
    )
def main():

    print('main analysis for project %s' % (projectName))

    print('changing directory to project folder')
    os.chdir(projectFolder)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#=======================I. FIXING LINKS FOR BAMS======================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    # bam_folder = '/storage/cylin/grail/projects/chordoma_rna/190612_rna_seq/bams/'

    # def symlink_bai(bam_folder):

    #     '''
    #     resolves the symlinks of the bams to also symlink the bais
    #     '''
    #     bam_file_list = ['%s%s' % (bam_folder,fh) for fh in os.listdir(bam_folder) if fh.count('bam') > 0]
    #     print(bam_file_list)
    #     for bam_path in bam_file_list:

    #         #print(bam_path)
    #         #print(os.path.realpath(bam_path))
    #         bam_origin = os.path.realpath(bam_path)

    #         sym_origin = bam_origin.replace('.bam','.bam.bai')
    #         sym_dest = bam_path.replace('.bam','.bam.bai')
    #         #print(sym_origin)
    #         #print(sym_dest)

    #         sym_cmd ='ln -s %s %s' % (sym_origin,sym_dest)
    #         os.system(sym_cmd)

    # symlink_bai(bam_folder)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#=====================II. LOADING DATA ANNOTATION======================'
    )
    print(
        '#======================================================================'
    )
    print('\n\n')

    #This section sanity checks each data table and makes sure both bam and .bai files are accessible

    #for ch22 data file
    pipeline_dfci.summary(ch22_rna_data_file)

    #for umchor1
    pipeline_dfci.summary(umchor1_rna_data_file)

    print('\n\n')
    print(
        '#======================================================================'
    )
    print(
        '#======================III. RUNNING CUFFNORM==========================='
    )
    print(
        '#======================================================================'
    )
    print('\n\n')