コード例 #1
0
def wrap_meme(analysis_name):
    '''
    Write a bash script that runs meme-chip against the vertebrate PWM file.

    The script is created under <projectFolder>meme/ and carries SBATCH
    header lines; it is only written here, not executed.

    Args:
        analysis_name: label used for the meme-chip output folder and as part
            of the script file name.

    Returns:
        Path of the bash script that was written.

    NOTE(review): `projectFolder`, `top` and `fasta_path` are not parameters;
    they have to exist at module level when this is called - confirm.
    '''
    meme_folder = utils.formatFolder('%smeme/' % (projectFolder), True)
    output_folder = utils.formatFolder('%s%s' % (meme_folder, analysis_name),
                                       True)

    meme_bash_path = '%s%s_%s_meme.sh' % (meme_folder, analysis_name, top)
    meme_path = '/storage/cylin/bin/meme/bin/meme-chip'
    pwm_path = '/storage/cylin/bin/pipeline/crc/annotation/VertebratePWMs.txt'

    # build the command before touching the file so a failure here does not
    # leave an empty script behind
    meme_cmd = '%s -meme-nmotifs 5 -spamo-skip -oc %s -db %s %s' % (
        meme_path, output_folder, pwm_path, fasta_path)

    # the context manager closes the script even if a write fails
    with open(meme_bash_path, 'w') as meme_bash:
        meme_bash.write('#!/usr/bin/bash\n')
        meme_bash.write('#SBATCH -n 32\n')
        meme_bash.write('#SBATCH -p short\n')
        meme_bash.write(meme_cmd)

    return meme_bash_path
コード例 #2
0
ファイル: dynamicEnhancer.py プロジェクト: lg72cu/pipeline
def makeRoseDict(roseFolder):

    """
    Locate the ROSE output files of interest inside a ROSE folder.

    Prints a message and exits when the folder does not exist or holds no
    visible files. Each file is resolved through getFile using a fragment of
    its name.

    Returns a dictionary mapping a short label to the path getFile returned.
    """
    if not utils.formatFolder(roseFolder, False):
        print("Folder %s does not exist" % (roseFolder))
        sys.exit()
    roseFolder = utils.formatFolder(roseFolder, False)

    # ignore hidden files
    visibleFiles = []
    for fileName in os.listdir(roseFolder):
        if fileName[0] != ".":
            visibleFiles.append(fileName)

    if not visibleFiles:
        print("No files found in %s" % (roseFolder))
        sys.exit()

    # (label, file name fragment) pairs, resolved one after another in this order
    wantedFiles = [
        ("AllEnhancer", "AllEnhancers.table.txt"),
        ("super", "SuperEnhancers.table.txt"),
        ("stretch", "_StretchEnhancers.table.txt"),
        ("superstretch", "SuperStretchEnhancers.table.txt"),
        ("EnhancerToGene", "_SuperEnhancers_ENHANCER_TO_GENE"),
        ("RegionMap", "REGION_MAP"),
        ("bed", "Enhancers_withSuper.bed"),
    ]

    roseDict = {}
    for label, fragment in wantedFiles:
        roseDict[label] = getFile(fragment, visibleFiles, roseFolder)

    return roseDict
コード例 #3
0
def makeRoseDict(roseFolder):

    '''
    Collect the paths of the ROSE output files found in roseFolder.

    Exits after printing a message when the folder does not exist or when it
    contains no visible files. Otherwise every file of interest is looked up
    through getFile and stored under a short key.
    '''
    if not utils.formatFolder(roseFolder,False):
        print("Folder %s does not exist" % (roseFolder))
        sys.exit()

    roseFolder = utils.formatFolder(roseFolder,False)

    #hidden files are left out of the search
    candidates = [entry for entry in os.listdir(roseFolder) if entry[0] != '.']
    if not candidates:
        print("No files found in %s" % (roseFolder))
        sys.exit()

    #(key, file name fragment) pairs; getFile is called in exactly this order
    targets = (
        ('AllEnhancer','AllEnhancers.table.txt'),
        ('super','SuperEnhancers.table.txt'),
        ('stretch','_StretchEnhancers.table.txt'),
        ('superstretch','SuperStretchEnhancers.table.txt'),
        ('EnhancerToGene','_SuperEnhancers_ENHANCER_TO_GENE'),
        ('RegionMap','REGION_MAP'),
        ('bed','Enhancers_withSuper.bed'),
    )

    return dict([(key,getFile(fragment,candidates,roseFolder)) for key,fragment in targets])
コード例 #4
0
def makeRoseDict(roseFolder):

    '''
    Build a dictionary of the five ROSE output files found in roseFolder.

    Prints a message and exits if the folder does not exist or has no visible
    files. The paths come from getFile, called once per file.
    '''
    if not utils.formatFolder(roseFolder,False):
        print("Folder %s does not exist" % (roseFolder))
        sys.exit()

    roseFolder = utils.formatFolder(roseFolder,False)

    #keep everything except hidden files
    roseFileList = []
    for entry in os.listdir(roseFolder):
        if entry[0] == '.':
            continue
        roseFileList.append(entry)

    if not roseFileList:
        print("No files found in %s" % (roseFolder))
        sys.exit()

    #dictionary keys and the file name fragment searched for each of them
    keys = ['AllEnhancer','SuperEnhancer','EnhancerToGene','RegionMap','bed']
    fragments = ['AllEnhancers.table.txt','SuperEnhancers.table.txt','ENHANCER_TO_GENE','REGION_MAP','Enhancers_withSuper.bed']

    roseDict = {}
    for key,fragment in zip(keys,fragments):
        roseDict[key] = getFile(fragment,roseFileList,roseFolder)

    return roseDict
コード例 #5
0
def makeHeatmap(names_list, gff_list, plot_name, plot_color):
    '''
    Run the 5_chiprx_heatmaps.R script once for every gff in gff_list.

    For each gff the mapped gff of every dataset in names_list is passed to
    the script as one comma separated argument, together with the scale
    factor table, the dataset names, the plot color, the gff name, the plot
    name, the literal 'TRUE' and the project folder. Each command is printed
    and then run through os.system.
    '''
    r_script = '%sr_scripts/5_chiprx_heatmaps.R' % (hg19_projectFolder)
    scale_factors = '%stables/HG19_SHEP21_CHIPRX_SCALE_FACTORS.txt' % (
        hg19_projectFolder)

    # both the figures folder and its heatmap subfolder are requested in turn
    utils.formatFolder('%sfigures/' % (projectFolder), True)
    utils.formatFolder('%sfigures/5_chiprx_heatmaps/' % (projectFolder), True)

    joined_names = ','.join(names_list)

    for gff_path in gff_list:
        # file name without directories and without anything after the first dot
        gff_label = gff_path.split('/')[-1].split('.')[0]

        mapped_paths = []
        for dataset in names_list:
            mapped_paths.append('%s%s/%s_%s.gff' % (mappedFolder, gff_label,
                                                    gff_label, dataset))

        r_cmd = 'Rscript %s %s %s %s %s %s %s %s %s' % (
            r_script, ','.join(mapped_paths), scale_factors, joined_names,
            plot_color, gff_label, plot_name, 'TRUE', projectFolder)
        print(r_cmd)
        os.system(r_cmd)
コード例 #6
0
def wrapGeneMapper(data_file,names_list=[],launch=True):

    '''
    Write (and optionally run) a ROSE2 gene mapper script per dataset.

    For every name, a bash script calling ROSE2_geneMapper.py on that
    dataset's <name>_peaks_AllEnhancers.table.txt is written into the
    dataset's <name>_ROSE folder under <projectFolder>rose_final.

    Args:
        data_file: path of the data table loaded with pipeline_dfci.
        names_list: dataset names to process; when empty, every dataset whose
            upper-cased name contains 'H3K27AC' exactly once is used.
        launch: when true, each script is run with bash right after it is
            written.

    NOTE(review): `projectFolder`, `pipeline_dir` and `genome` are read from
    module scope - confirm they are defined before this is called.
    '''

    data_dict = pipeline_dfci.loadDataTable(data_file)
    parent_rose_folder = utils.formatFolder('%srose_final' % (projectFolder),False)

    #names_list is only rebound below, never mutated, so the shared default list is harmless
    if len(names_list) == 0:
        names_list = [name for name in data_dict.keys() if name.upper().count('H3K27AC') == 1]

    #find each individual all enhancer table and then call the mapper via an .sh script
    for name in names_list:
        print(name)
        dataset_rose_folder = utils.formatFolder('%s%s_ROSE' % (parent_rose_folder,name),False)
        all_enhancer_path = '%s%s_peaks_AllEnhancers.table.txt' % (dataset_rose_folder,name)

        mapper_bash_path = '%s%s_geneMapper.sh' % (dataset_rose_folder,name)
        mapper_cmd = 'python %sROSE2_geneMapper.py -g %s -i %s -f -w 100000' % (pipeline_dir,genome,all_enhancer_path)

        #the context manager closes the script even if a write fails
        with open(mapper_bash_path,'w') as mapper_bash_file:
            mapper_bash_file.write('#!/usr/bin/bash\n\n\n\n')
            mapper_bash_file.write('#Running ROSE2 GENE MAPPER ON %s ALL ENHANCERS OUTPUT\n\n' % (name))
            mapper_bash_file.write(mapper_cmd+'\n')

        print('wrote gene mapper command to %s' % (mapper_bash_path))
        if launch:
            os.system('bash %s' % mapper_bash_path)
コード例 #7
0
def makeNameDict(dataFile,roseFolder,namesList=[]):

    '''
    Record, for each dataset, the files available for enhancer clustering.

    For every name the result holds two entries:
        'enrichedFile': path of the macs enriched file under
            <parent folder>macsEnriched/, or '' if it cannot be opened
        'enhancerFile': path of the AllEnhancers table inside
            <roseFolder><name>_ROSE, or '' if none is found

    Args:
        dataFile: path of the data table loaded with pipeline_dfci.
        roseFolder: folder holding the per-dataset ROSE output; it is created
            through utils.formatFolder when it does not exist yet.
        namesList: dataset names to check; when empty, every dataset whose
            'background' entry is not 'NONE' is used.

    Returns:
        The dictionary described above, keyed by dataset name.

    Prints a message and exits if a dataset has neither file.
    '''

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #draw the parent folder from the dataFile
    parentFolder = utils.getParentFolder(dataFile)

    #check to see if a rose folder exists already
    if utils.formatFolder(roseFolder,False):
        roseExists = True
        roseFolder = utils.formatFolder(roseFolder,False)
    else:
        roseExists = False
        roseFolder = utils.formatFolder(roseFolder,True)

    #with no names given, use everything that has a background
    #this filters out control WCE datasets
    if len(namesList) == 0:
        namesList = [name for name in dataDict.keys() if dataDict[name]['background'] != 'NONE']

    #now check that all of the datasets at a minimum have a rose output OR enriched region file
    nameDict = {}
    for name in namesList:

        #start both entries at '' so the final check never hits a missing key,
        #even when the rose folder did not exist
        nameDict[name] = {'enrichedFile': '', 'enhancerFile': ''}

        #assumes standard folder structure for enriched file
        enrichedFile = "%smacsEnriched/%s" % (parentFolder,dataDict[name]['enrichedMacs'])
        print(enrichedFile)
        try:
            #only checking that the file can be opened for reading
            open(enrichedFile,'r').close()
        except IOError:
            enrichedFile = ''
        nameDict[name]['enrichedFile'] = enrichedFile

        #need an allEnhancers table to count as rose output
        #if the rose folder doesn't exist, don't bother
        if roseExists:
            datasetRoseFolder = "%s%s_ROSE" % (roseFolder,name)
            try:
                roseOutputFiles = os.listdir(datasetRoseFolder)
            except OSError:
                #no rose output folder for this dataset: treat as no rose output
                roseOutputFiles = []

            #no weird hidden or temp files
            allEnhancerFileList = [x for x in roseOutputFiles if x.count("AllEnhancers.table.txt") == 1 and x[0] != '.']
            if len(allEnhancerFileList) > 0:
                nameDict[name]['enhancerFile'] = "%s%s_ROSE/%s" % (roseFolder,name,allEnhancerFileList[0])

        if nameDict[name]['enhancerFile'] == '' and nameDict[name]['enrichedFile'] == '':
            print("INSUFFICIENT DATA TO RUN ENAHNCER ANALYSIS ON %s. PLEASE MAKE SURE ROSE OUTPUT OR MACS ENRICHED REGION PEAKS FILE EXISTS" % (name))
            sys.exit()
    return nameDict
コード例 #8
0
def mapMergedGFF(dataFile,nameDict,mergedGFFFile,analysisName,outputFolder,maskFile):

    '''
    Map all datasets in nameDict onto the merged gff with one ROSE2 call.

    The alphabetically first dataset is the one ROSE2 is called on; every
    other dataset, followed by its background when it has one, goes into the
    extra map list. If the stitched region map already exists it is returned
    without running anything. Otherwise the ROSE2 bash script is run and the
    region map path is returned once utils.checkOutput finds it; the program
    exits if it does not.
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    roseParentFolder = "%srose/" % (outputFolder)
    utils.formatFolder(roseParentFolder,True)
    gffName = mergedGFFFile.split('/')[-1].split('.')[0]
    bashFileName = "%srose/%s_roseCall.sh" % (outputFolder,analysisName)

    namesList = sorted(nameDict.keys())

    #everything after the first dataset goes into the extra map, with backgrounds
    extraMap = []
    for name in namesList[1:]:
        if not nameDict[name]['background']:
            extraMap.append(name)
            continue

        backgroundName = dataDict[name]['background']
        if backgroundName not in dataDict:
            print("ERROR: UNABLE TO FIND LISTED BACKGROUND DATASET %s FOR %s" % (backgroundName,name))
            sys.exit()
        extraMap.extend([name,backgroundName])

    print(extraMap)

    #nothing to do if an earlier run already produced the region map
    mergedRegionMap = "%srose/%s_ROSE/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (outputFolder,namesList[0],gffName)
    print("LOOKING FOR REGION MAP AT %s" % (mergedRegionMap))

    if utils.checkOutput(mergedRegionMap,1,1):
        print("FOUND PREVIOUS REGION MAP")
        return mergedRegionMap

    bashFileName = pipeline_dfci.callRose2(dataFile,'',roseParentFolder,[namesList[0]],extraMap,mergedGFFFile,0,0,bashFileName,mask=maskFile)

    bashCommand = "bash %s" % (bashFileName)
    os.system(bashCommand)
    print("Running enhancer mapping command:\n%s" % (bashCommand))

    if not utils.checkOutput(mergedRegionMap,5,60):
        print("UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE %s.\nEXITING NOW" % (mergedGFFFile))
        sys.exit()

    return mergedRegionMap
コード例 #9
0
ファイル: CRC2.py プロジェクト: melnuesch/CLL_TFnetworks_2018
def findMotifs(canidateGenes, projectFolder, projectName, motifConvertFile,
               motifDatabaseFile):
    '''
    Run FIMO on the project's subpeak fasta for every candidate TF motif.

    A background model is first written to <projectFolder><projectName>_bg.meme
    with fasta-get-markov. FIMO is then run on
    <projectFolder><projectName>_SUBPEAKS.fa with one --motif option per motif
    of each candidate gene, and its output is redirected to
    <projectFolder>FIMO/fimo.txt. Both commands run through the shell and
    block until they finish.

    Args:
        canidateGenes: gene names to consider; only those present in the
            second column of motifConvertFile are used.
        projectFolder: output folder, used as a plain string prefix.
        projectName: prefix of the fasta and background file names.
        motifConvertFile: tab separated table; column 0 is used as the motif
            name and column 1 as the gene name.
        motifDatabaseFile: motif file handed to FIMO.

    Returns:
        The FIMO command line that was run.
    '''

    # gene name -> list of motif names, in file order
    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = {}
    for line in motifDatabase:
        motifDatabaseDict.setdefault(line[1], []).append(line[0])
    print('GENERATING TF NETWORK')

    # keep the candidates that have motifs; a dict lookup replaces the
    # per-gene scan of a list of names
    canidateMotifs = sorted(
        gene for gene in canidateGenes if gene in motifDatabaseDict)
    print('Number of annotated canidate TFs that have motifs: ' + str(
        len(canidateMotifs)))

    subpeakFasta = projectFolder + projectName + '_SUBPEAKS.fa'
    bgFile = projectFolder + projectName + '_bg.meme'

    bgCmd = 'fasta-get-markov -m 1 < ' + subpeakFasta + ' > ' + bgFile
    subprocess.call(bgCmd, shell=True)

    utils.formatFolder(projectFolder + 'FIMO/', True)

    # collect the pieces of the command and join them once at the end
    fimoParts = ['fimo']
    for TF in canidateMotifs:
        print(TF)
        for motif in motifDatabaseDict[TF]:
            fimoParts.append("--motif '%s'" % (str(motif)))

    fimoParts += [
        '-verbosity 1',
        '-text',
        '-oc ' + projectFolder + 'FIMO',
        '--bgfile ' + bgFile,
        motifDatabaseFile,
        subpeakFasta,
        '> ' + projectFolder + 'FIMO/fimo.txt',
    ]
    fimoCmd = ' '.join(fimoParts)
    print(fimoCmd)

    # subprocess.call waits for fimo to finish before going on
    subprocess.call(fimoCmd, shell=True)

    return fimoCmd
コード例 #10
0
def process_shep_rna_drop_rep(shep21_rna_dataFile, gtfFile):
    '''
    Set up the SHEP21 cufflinks run that leaves out replicate 2 at 0HR.

    Builds one replicate group per time point (0, 2, 4, 6, 8, 16 and 24HR),
    with rep1-rep3 everywhere except 0HR, which only has rep1 and rep3, and
    hands them to makeCuffTable together with the gtf. The output goes to
    <rnaFolder>shep21_cufflinks_no_rep2/.

    Returns the bash file name that was passed to makeCuffTable.
    '''
    cufflinksFolder = utils.formatFolder(
        '%sshep21_cufflinks_no_rep2/' % (rnaFolder), True)

    # one group of replicate names per time point
    groupList = []
    for hours in (0, 2, 4, 6, 8, 16, 24):
        replicates = (1, 3) if hours == 0 else (1, 2, 3)
        groupList.append(
            ['SHEP21_%sHR_rep%s' % (hours, rep) for rep in replicates])

    bashFileName = '%sshep21_rna_seq_cuff_no_rep2.sh' % (cufflinksFolder)
    makeCuffTable(shep21_rna_dataFile, 'SHEP21', gtfFile, cufflinksFolder,
                  groupList, bashFileName)

    return bashFileName
コード例 #11
0
def main():
    '''
    Main analysis: summarize the data tables, then set up the ROSE2 calls.

    Changes into the project folder, runs pipeline_dfci.summary on the chip
    and atac data tables and calls pipeline_dfci.callRose2 for the MM1S
    H3K27AC and MED1 datasets.
    '''

    def print_banner(title_line):
        # section header: blank lines, rule, title, rule, blank lines
        rule = '#======================================================================'
        print('\n\n')
        print(rule)
        print(title_line)
        print(rule)
        print('\n\n')

    print('main analysis for project %s' % (projectName))

    print('changing directory to project folder')
    os.chdir(projectFolder)

    print_banner(
        '#==================I. LOADING DATA ANNOTATION TABLES==================='
    )

    # summary is run on each data table as a sanity check

    # chip data table
    pipeline_dfci.summary(chip_data_file)

    # atac data table
    pipeline_dfci.summary(atac_data_file)

    print_banner(
        '#==========================II. CALLING ROSE2==========================='
    )

    # folder with macs peak output beds
    macsEnrichedFolder = '%smacsEnriched/' % (projectFolder)
    # folder that stores the ROSE2 output
    parentFolder = utils.formatFolder('%srose/' % (projectFolder), True)
    # ROSE2 is called on H3K27AC and MED1 defined enhancers
    namesList = ['MM1S_H3K27AC', 'MM1S_MED1']
    bash_file = '%sMM1S_ROSE_CALLS.sh' % (parentFolder)
    mask_file = '%sgenomes/Homo_sapiens/UCSC/hg19/Annotation/Masks/hg19_encode_blacklist.bed' % (
        projectFolder)

    rose_options = {
        'extraMap': [],
        'inputFile': '',
        'tss': 2500,
        'stitch': 12500,
        'bashFileName': bash_file,
        'mask': mask_file,
        'useBackground': True,
    }
    pipeline_dfci.callRose2(chip_data_file, macsEnrichedFolder, parentFolder,
                            namesList, **rose_options)
コード例 #12
0
def main():
    '''
    Main analysis for the MYCN project RNA line plots.

    Changes into the project folder, runs pipeline_dfci.summary on the three
    RNA-Seq data tables, requests the output figure folder and runs
    wrap_be2c_jq1. The shep21 wrapper (wrap_shep21) is not called here.
    '''
    rule = '#======================================================================'

    print('main analysis for MYCN project')

    print('changing directory to project folder')
    os.chdir(projectFolder)

    for banner_line in (
            '\n\n',
            rule,
            '#======================I, LOADING DATA ANNOTATION======================',
            rule,
            '\n\n',
    ):
        print(banner_line)

    # summary is run on each RNA-Seq data table as a sanity check
    pipeline_dfci.summary(shep21_rna_dataFile)
    pipeline_dfci.summary(be2c_rna_drug_dataFile)
    pipeline_dfci.summary(be2c_rna_twist_dataFile)

    for banner_line in (
            '\n\n',
            rule,
            '#===================II, RUNNING LINE PLOT SCRIPTS======================',
            rule,
            '\n\n',
    ):
        print(banner_line)

    # folder that stores the output figures
    utils.formatFolder('%sfigures/6_rna_line_plots/' % (projectFolder), True)

    # only the be2c jq1 wrapper is run
    wrap_be2c_jq1()
コード例 #13
0
    def wrap_enhancer_promoter(dataFile,
                               input_path,
                               activity_path,
                               analysis_name,
                               names_list=[],
                               useBackground=True):
        '''
        Write a bash script that calls enhancerPromoter.py on a set of bams.

        The script is written to <projectFolder>enhancerPromoter/ and is not
        executed here.

        Args:
            dataFile: path of the data table loaded with pipeline_dfci.
            input_path: value passed to enhancerPromoter.py as -i.
            activity_path: value passed to enhancerPromoter.py as -a.
            analysis_name: value passed as --name and used in the script
                file name.
            names_list: datasets whose bams are used; when empty, all
                datasets of the data table in sorted order.
            useBackground: when true, the bam of each dataset's background
                is passed with -c.

        Returns:
            Path of the bash script that was written.

        NOTE(review): `bedFolder`, `projectFolder`, `pipeline_dir` and
        `genome` come from an enclosing scope - confirm they are defined.
        '''

        #hard coded paths
        tads_path = '%shESC_domains_hg19.bed' % (bedFolder)

        #setting the output folder
        ep_folder = utils.formatFolder('%senhancerPromoter/' % (projectFolder),
                                       True)

        dataDict = pipeline_dfci.loadDataTable(dataFile)
        #names_list is only rebound, never mutated, so the shared default is harmless
        if len(names_list) == 0:
            names_list = sorted(dataDict.keys())

        bams_string = ' '.join([dataDict[name]['bam'] for name in names_list])

        if useBackground:
            #backgrounds are only resolved when they are actually used, so
            #datasets without a usable background no longer break the
            #useBackground=False case
            background_string = ' '.join([
                dataDict[dataDict[name]['background']]['bam']
                for name in names_list
            ])
            python_cmd = 'python %senhancerPromoter.py -b %s -c %s -g %s -i %s -o %s -a %s --name %s --tads %s --top 2000\n\n' % (
                pipeline_dir, bams_string, background_string, genome.upper(),
                input_path, ep_folder, activity_path, analysis_name, tads_path)
        else:
            python_cmd = 'python %senhancerPromoter.py -b %s -g %s -i %s -o %s -a %s --name %s --tads %s --top 2000\n\n' % (
                pipeline_dir, bams_string, genome.upper(), input_path,
                ep_folder, activity_path, analysis_name, tads_path)

        ep_bash_path = '%s%s_enhancer_promoter.sh' % (ep_folder, analysis_name)

        #the context manager closes the script even if a write fails
        with open(ep_bash_path, 'w') as ep_bash:
            ep_bash.write('#!/usr/bin/bash\n\n\n')
            ep_bash.write('#enhancer promoter analysis for %s\n\n' %
                          (analysis_name))
            ep_bash.write(python_cmd)

        return (ep_bash_path)
コード例 #14
0
ファイル: CRC2.py プロジェクト: melnuesch/CLL_TFnetworks_2018
def buildGraph(projectFolder, projectName, motifConvertFile, refseqToNameDict,
               canidateGenes):
    '''
    Build the directed TF graph from the FIMO output of a project.

    Reads <projectFolder>FIMO/fimo.txt and, for every row after the first,
    adds an edge from the gene owning the motif (column 0, translated through
    motifConvertFile) to the gene of the hit region (first '|' separated field
    of column 2, translated through refseqToNameDict). The hit locations of
    each source gene are also written to
    <projectFolder>motifBED/<gene>_<projectName>_motifs.bed.

    Args:
        projectFolder: project folder, used as a plain string prefix.
        projectName: name given to the graph and used in the bed file names.
        motifConvertFile: tab separated table; column 0 is the motif name and
            column 1 the gene name.
        refseqToNameDict: maps the first field of a region name to a gene name.
        canidateGenes: genes added to the graph as nodes up front.

    Returns:
        The networkx DiGraph.

    NOTE(review): region names are presumably 'id|chrom|start' and FIMO
    columns 3 and 4 offsets relative to that start - confirm against the
    code that writes the subpeak fasta.
    '''

    # motif name -> gene name (the reverse of the dict used to run FIMO)
    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = {}
    for line in motifDatabase:
        motifDatabaseDict[line[0]] = line[1]

    fimoFile = projectFolder + 'FIMO/fimo.txt'
    fimoTable = utils.parseTable(fimoFile, '\t')

    graph = nx.DiGraph(name=projectName)
    graph.add_nodes_from(canidateGenes)

    motifDict = defaultdict(list)
    # the first row is skipped (presumably a header line)
    for line in fimoTable[1:]:
        source = motifDatabaseDict[line[0]]
        region = line[2].split('|')
        target = refseqToNameDict[region[0]]
        graph.add_edge(source, target)

        # shift the hit coordinates by the third field of the region name
        regionOffset = int(region[2])
        motifDict[source].append((region[1], regionOffset + int(line[3]),
                                  regionOffset + int(line[4])))

    utils.formatFolder(projectFolder + 'motifBED/', True)
    # every key in motifDict has at least one location, so no emptiness check
    for gene, locations in motifDict.items():
        bed = [list(loc) for loc in locations]
        filename = projectFolder + 'motifBED/' + gene + '_' + projectName + '_motifs.bed'
        utils.unParseTable(bed, filename, '\t')

    return graph
コード例 #15
0
def getFile(fileString,fileList,parentFolder):
    '''
    Return the full path of the one file in fileList containing fileString.

    A file counts as a match when fileString occurs in its name exactly once.
    Prints an error and exits when the folder does not exist, when no file
    matches or when several files match.
    '''
    if not utils.formatFolder(parentFolder,False):
        print("ERROR: Folder %s does not exist" % (parentFolder))
        sys.exit()
    parentFolder = utils.formatFolder(parentFolder,False)

    matchFiles = []
    for candidate in fileList:
        if candidate.count(fileString) == 1:
            matchFiles.append(candidate)

    #anything other than exactly one match is fatal
    numMatches = len(matchFiles)
    if numMatches != 1:
        if numMatches == 0:
            print("ERROR: No files found in %s with %s in title" % (parentFolder,fileString))
        else:
            print("ERROR: Multiple files found in %s with %s in title" % (parentFolder,fileString))
        sys.exit()

    return "%s%s" % (parentFolder,matchFiles[0])
コード例 #16
0
def getFile(fileString,fileList,parentFolder):
    '''
    Return the full path of the one file in fileList containing fileString.

    A file counts as a match when fileString occurs in its name exactly once.
    Returns '' after printing a warning when nothing matches. Prints an error
    and exits when the folder does not exist or when several files match.
    '''
    if not utils.formatFolder(parentFolder,False):
        print("ERROR: Folder %s does not exist" % (parentFolder))
        sys.exit()
    parentFolder = utils.formatFolder(parentFolder,False)

    hits = [candidate for candidate in fileList if candidate.count(fileString) == 1]

    #ambiguous match is fatal
    if len(hits) > 1:
        print("ERROR: Multiple files found in %s with %s in title" % (parentFolder,fileString))
        sys.exit()

    #a missing file is only a warning
    if not hits:
        print("WARNING: No files found in %s with %s in title" % (parentFolder,fileString))
        return ''

    return "%s%s" % (parentFolder,hits[0])
コード例 #17
0
ファイル: enhancerPromoter.py プロジェクト: shengqh/pipeline
def callGSEA(gseaPath, gmxPath, outputFolder,analysisName,top):

    '''
    Write and run a bash script with the command line GSEA calls.

    One GSEA call (PROMOTER versus DISTAL classes) is always written for the
    '<analysisName>_top_all' gct/cls pair. When top is not 'all', a second
    call is written for the '<analysisName>_top_<top>' pair. Each call is
    preceded by an rm -rf of earlier reports with the same label, and the
    finished script is run with bash.

    Args:
        gseaPath: path of the gsea jar, passed to java -cp.
        gmxPath: gene set file, passed as -gmx.
        outputFolder: folder holding the gct/cls files; the script and the
            gsea output folders are placed under it (used as a string prefix).
        analysisName: prefix of the gct/cls files and of the report labels.
        top: 'all', or the N of the additional top N analysis.
    '''
    gseaBashFilePath = '%s%s_GSEA_cmd.sh' % (outputFolder,analysisName)

    #always run on all; add the top N run when one was requested
    topLabels = ['all']
    if top != 'all':
        topLabels.append(top)

    #arguments in order: jar, gct, cls, gmx, report label, output folder
    gseaTemplate = 'java -Xmx4000m -cp %s xtools.gsea.Gsea -res %s -cls %s#PROMOTER_versus_DISTAL -gmx %s -collapse false -mode Max_probe -norm meandiv -nperm 1000 -permute gene_set -rnd_type no_balance -scoring_scheme weighted -rpt_label %s -metric Diff_of_Classes -sort real -order descending -include_only_symbols true -make_sets true -median false -num 100 -plot_top_x 20 -rnd_seed timestamp -save_rnd_lists false -set_max 500 -set_min 15 -zip_report false -out %s -gui false'

    #the context manager closes the script even if a write fails
    with open(gseaBashFilePath,'w') as gseaBashFile:

        gseaBashFile.write('#!/usr/bin/bash\n\n')
        gseaBashFile.write('#COMMAND LINE GSEA CALLS FOR %s\n\n' % (analysisName))

        for label in topLabels:
            gctPath = '%s%s_top_%s.gct' % (outputFolder,analysisName,label)
            clsPath = '%s%s_top_%s.cls' % (outputFolder,analysisName,label)
            gseaOutputFolder = utils.formatFolder('%sgsea_top_%s_c2' % (outputFolder,label),True)
            rptLabel = '%s_top_%s' % (analysisName,label)

            #clear reports left over from earlier runs with the same label
            gseaBashFile.write('rm -rf %s/%s.Gsea* \n' % (gseaOutputFolder, rptLabel))
            gseaBashFile.write(gseaTemplate % (gseaPath,gctPath,clsPath,gmxPath,rptLabel,gseaOutputFolder))
            gseaBashFile.write('\n')

    os.system('bash %s' % (gseaBashFilePath))
コード例 #18
0
ファイル: CRC_wrapper.py プロジェクト: sridhar0605/pipeline
def launchDynamicRose(analysis_name,group1_name,group2_name,group1_list,group2_list,meta_rose_folder,dynamic_rose_folder,genome,data_file,activity_path,inputGFF):

    '''
    Write and launch the dynamicEnhancer_meta.py call comparing two groups.

    A bash script named <analysis_name>_dynamic_meta.sh is written into
    dynamic_rose_folder. It changes into pipeline_dir and runs
    dynamicEnhancer_meta.py on the two meta rose output folders
    (meta_rose_folder + group name). The script is then run with bash.

    Args:
        analysis_name: used in the script file name and the log messages.
        group1_name, group2_name: group names, passed as --name1/--name2 and
            appended to meta_rose_folder to find each group's output.
        group1_list, group2_list: dataset names, joined with commas for
            --group1/--group2.
        meta_rose_folder: prefix of the per-group meta rose folders.
        dynamic_rose_folder: output folder, passed as -o.
        genome: object whose name() gives the genome build passed as -g.
        data_file: data table path, passed as -d.
        activity_path: not used by this function.
        inputGFF: optional gff; when non-empty it is passed as --input.

    NOTE(review): `pipeline_dir` is read from module scope - confirm.
    '''

    #results unused; calls kept in case callers rely on them failing early
    #on a bad data_file - TODO confirm and drop
    utils.formatFolder(os.path.abspath(utils.getParentFolder(data_file)),False)
    pipeline_dfci.loadDataTable(data_file)

    genome_build = genome.name()

    meta_rose_output_1 = utils.formatFolder(meta_rose_folder + group1_name,True)
    meta_rose_output_2 = utils.formatFolder(meta_rose_folder + group2_name,True)

    meta_rose_string = ','.join([meta_rose_output_1,meta_rose_output_2])

    #setting the output
    dynamic_rose_folder = utils.formatFolder(dynamic_rose_folder,True)

    group1_string = ','.join(group1_list)
    group2_string = ','.join(group2_list)

    dynamic_cmd = 'python %sdynamicEnhancer_meta.py -g %s -d %s -r %s -o %s --group1 %s --group2 %s --name1 %s --name2 %s -p -m' % (pipeline_dir,genome_build,data_file,meta_rose_string,dynamic_rose_folder,group1_string,group2_string,group1_name,group2_name)

    #truthiness also covers None, which len() would reject
    if inputGFF:
        dynamic_cmd += ' --input %s' % (inputGFF)

    bash_path = '%s%s_dynamic_meta.sh' % (dynamic_rose_folder,analysis_name)

    #the context manager closes the script even if a write fails
    with open(bash_path,'w') as bash_file:
        bash_file.write('#!/usr/bin/bash\n\n')
        bash_file.write('cd %s\n\n' % (pipeline_dir))
        bash_file.write(dynamic_cmd)
        bash_file.write('\n\n')

    print('Wrote DYNAMIC_META command for %s to %s' % (analysis_name,bash_path))
    print('Launching DYNAMIC_META_ROSE')
    os.system('bash %s' % (bash_path))
コード例 #19
0
ファイル: CRC3.py プロジェクト: mufrdrk/pipeline
def findMotifs(subpeakFasta, bg_path, candidate_tf_list, projectFolder,
               analysis_name, motifConvertFile, motifDatabaseFile):
    '''
    Run FIMO on a subpeak fasta for the motifs of every candidate TF.

    The FIMO command is written to <projectFolder>FIMO/<analysis_name>_fimo.sh
    for reference and then run directly through the shell; the call blocks
    until FIMO is done.

    Args:
        subpeakFasta: fasta file scanned by FIMO; its base name (up to the
            first dot) names the output file.
        bg_path: background file passed as --bgfile.
        candidate_tf_list: TF names; sorted in place. Every name must appear
            in column 1 of motifConvertFile, otherwise KeyError is raised.
        projectFolder: project folder, used as a plain string prefix.
        analysis_name: used in the bash script file name.
        motifConvertFile: tab separated table; column 0 is the motif name and
            column 1 the TF name.
        motifDatabaseFile: motif file handed to FIMO.

    Returns:
        Path of the text file FIMO's output is redirected to.
    '''
    fimoFolder = utils.formatFolder(projectFolder + 'FIMO/', True)
    subpeak_name = subpeakFasta.split('/')[-1].split('.')[0]
    output = '%s%s_fimo.txt' % (fimoFolder, subpeak_name)

    # TF name -> list of its motif names, in file order
    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = {}
    for line in motifDatabase:
        motifDatabaseDict.setdefault(line[1], []).append(line[0])

    # NOTE: this sorts the caller's list in place; kept because callers may
    # rely on it
    candidate_tf_list.sort()

    print(candidate_tf_list)

    #now make a list of all motifs
    motif_list = []
    for tf in candidate_tf_list:
        motif_list += motifDatabaseDict[tf]

    motif_list = utils.uniquify(motif_list)

    # collect the pieces of the command and join them once at the end
    fimo_parts = ['fimo']
    for motif in motif_list:
        fimo_parts.append("--motif '%s'" % (str(motif)))

    #add '--thresh 1e-5' here if you want to increase stringency
    fimo_parts += [
        '-verbosity 1',
        '-text',
        '-oc ' + projectFolder + 'FIMO',
        '--bgfile %s' % (bg_path),
        motifDatabaseFile,
        subpeakFasta,
        '> ' + output,
    ]
    fimoCmd = ' '.join(fimo_parts)
    print(fimoCmd)

    # the context manager closes the script even if the write fails
    fimo_bash_path = '%s%s_fimo.sh' % (fimoFolder, analysis_name)
    with open(fimo_bash_path, 'w') as fimo_bash:
        fimo_bash.write('#!/usr/bin/bash\n\n')
        fimo_bash.write(fimoCmd)

    # subprocess.call waits for fimo to finish before going on
    subprocess.call(fimoCmd, shell=True)

    return output
コード例 #20
0
ファイル: CRC3.py プロジェクト: linlabcode/pipeline
def findMotifs(subpeakFasta,bg_path,candidate_tf_list, projectFolder, analysis_name, motifConvertFile, motifDatabaseFile):
    '''
    Run FIMO on a subpeak fasta for the motifs of every candidate TF.

    The FIMO command is written to <projectFolder>FIMO/<analysis_name>_fimo.sh
    for reference and then run directly through the shell; the call blocks
    until FIMO is done.

    Args:
        subpeakFasta: fasta file scanned by FIMO; its base name (up to the
            first dot) names the output file.
        bg_path: background file passed as --bgfile.
        candidate_tf_list: TF names; sorted in place. Every name must appear
            in column 1 of motifConvertFile, otherwise KeyError is raised.
        projectFolder: project folder, used as a plain string prefix.
        analysis_name: used in the bash script file name.
        motifConvertFile: tab separated table; column 0 is the motif name and
            column 1 the TF name.
        motifDatabaseFile: motif file handed to FIMO.

    Returns:
        Path of the text file FIMO's output is redirected to.
    '''
    fimoFolder = utils.formatFolder(projectFolder + 'FIMO/', True)
    subpeak_name = subpeakFasta.split('/')[-1].split('.')[0]
    output = '%s%s_fimo.txt'  % (fimoFolder,subpeak_name)

    #TF name -> list of its motif names, in file order
    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = {}
    for line in motifDatabase:
        motifDatabaseDict.setdefault(line[1], []).append(line[0])

    #NOTE: this sorts the caller's list in place; kept because callers may rely on it
    candidate_tf_list.sort()

    print(candidate_tf_list)

    #now make a list of all motifs
    motif_list = []
    for tf in candidate_tf_list:
        motif_list += motifDatabaseDict[tf]

    motif_list = utils.uniquify(motif_list)

    #collect the pieces of the command and join them once at the end
    fimo_parts = ['fimo']
    for motif in motif_list:
        fimo_parts.append("--motif '%s'" % (str(motif)))

    #add '--thresh 1e-5' here if you want to increase stringency
    fimo_parts += [
        '-verbosity 1',
        '-text',
        '-oc ' + projectFolder + 'FIMO',
        '--bgfile %s' % (bg_path),
        motifDatabaseFile,
        subpeakFasta,
        '> ' + output,
    ]
    fimoCmd = ' '.join(fimo_parts)
    print(fimoCmd)

    #the context manager closes the script even if the write fails
    fimo_bash_path = '%s%s_fimo.sh' % (fimoFolder,analysis_name)
    with open(fimo_bash_path,'w') as fimo_bash:
        fimo_bash.write('#!/usr/bin/bash\n\n')
        fimo_bash.write(fimoCmd)

    #subprocess.call waits for fimo to finish before going on
    subprocess.call(fimoCmd, shell=True)

    return output
コード例 #21
0
def launchEnhancerMapping(dataFile,nameDict,outputFolder,roseFolder,maskFile=''):

    '''
    launches enhancer mapping if needed from enriched region files

    dataFile: data table path handed on to pipeline_dfci.callRose
    nameDict: dataset name -> dict holding at least 'enhancerFile' and 'enrichedFile';
              an empty 'enhancerFile' marks a dataset that still needs rose
    outputFolder: accepted for interface compatibility, not used here
    roseFolder: folder the rose output is written under
    maskFile: optional mask handed on to callRose

    returns nameDict with 'enhancerFile' filled in for every dataset
    exits with status 1 if an expected rose output never shows up
    '''

    namesList = list(nameDict.keys())

    #check to see if everything is good, if so return and call it a day
    if all(len(nameDict[name]['enhancerFile']) > 0 for name in namesList):
        print("ENHANCER FILE OUTPUT FOUND FOR ALL DATASETS")
        return nameDict

    #if not, have to call rose
    roseOutputFolder = utils.formatFolder(roseFolder,True)

    queueList = []
    for name in namesList:

        #only datasets without an enhancer file need rose
        if nameDict[name]['enhancerFile'] != '':
            continue

        #get the enriched file
        enrichedFile = nameDict[name]['enrichedFile']
        #call rose
        print("CALLING ROSE FOR %s" % (name))
        bashFileName = pipeline_dfci.callRose(dataFile,'',roseOutputFolder,[name],[],enrichedFile,mask=maskFile)
        print(bashFileName)
        #the trailing & backgrounds the script so this loop does not wait for it
        os.system('bash %s &' % (bashFileName))
        #add name to queue list
        queueList.append(name)

    #now check for completion of datasets
    for name in queueList:

        #check for the AllEnhancers table
        enhancerFile = "%s%s_ROSE/%s_peaks_AllEnhancers.table.txt" % (roseOutputFolder,name,name)

        print("CHECKING FOR %s ROSE OUTPUT IN %s" % (name,enhancerFile))
        #NOTE(review): checkOutput presumably polls for the file using the 5/60 arguments - confirm in utils
        if not utils.checkOutput(enhancerFile,5,60):
            print("UNABLE TO FIND ENHANCER OUTPUT FOR %s. QUITTING NOW" % (name))
            #non zero status so calling scripts can see the failure
            sys.exit(1)

        print("FOUND ENHANCER OUTPUT FOR %s" % (name))
        nameDict[name]['enhancerFile'] = enhancerFile

    return nameDict
コード例 #22
0
ファイル: CRC_wrapper.py プロジェクト: sridhar0605/pipeline
def launchMetaRose(group_name,group_list,meta_rose_folder,genome,data_file,stitch,tss):

    '''
    launches meta rose

    writes a bash script holding the ROSE2_META.py call for one group of
    datasets and then runs it with bash, blocking until it returns

    group_name: analysis name; also names the output subfolder and the script
    group_list: dataset names from data_file that make up the group
    meta_rose_folder: parent folder; group_name is appended to it directly
    genome: object providing name(), hasFeature() and returnFeature()
    data_file: data table path; its parent folder is taken as the project folder
    stitch: value for -s, or None to leave the flag off
    tss: value for -t
    '''

    project_folder = utils.formatFolder(os.path.abspath(utils.getParentFolder(data_file)),False)
    macs_folder = '%smacsEnriched/' % (project_folder) #quick hack to make sure input beds are in the right place
    dataDict = pipeline_dfci.loadDataTable(data_file)

    meta_rose_output = utils.formatFolder(meta_rose_folder + group_name,True)

    genome_build = genome.name()

    input_string = ','.join(['%s%s' % (macs_folder,dataDict[name]['enrichedMacs']) for name in group_list])
    bam_string = ','.join([dataDict[name]['bam'] for name in group_list])

    meta_cmd = 'python %sROSE2_META.py -g %s -i %s -r %s -o %s -n %s -t %s' % (pipeline_dir,genome_build,input_string,bam_string,meta_rose_output,group_name,tss)
    if stitch is not None:
        meta_cmd += ' -s %s' % (stitch)

    #adding a mask if necessary
    if genome.hasFeature('mask'):
        meta_cmd += ' --mask %s' % (genome.returnFeature('mask'))

    bash_path ='%s%s_meta_rose.sh' % (meta_rose_output,group_name)
    #the with block closes the script even if a write fails
    with open(bash_path,'w') as bash_file:
        bash_file.write('#!/usr/bin/bash\n\n')
        bash_file.write('cd %s\n\n' % (pipeline_dir))
        bash_file.write(meta_cmd)
        bash_file.write('\n\n')

    print('Wrote META_ROSE command for %s to %s' % (group_name,bash_path))
    print('Launching META_ROSE')
    os.system('bash %s' % (bash_path))
コード例 #23
0
def define_enhancer_landscape(projectFolder,pipeline_dir,nb_all_chip_dataFile):

    '''
    defines the NB enhancer baseline from the H3K27AC datasets of the data table
    (original note: H3K27ac chips from NGP, KELLY, BE2C, and SHEP21)

    datasets are picked by name: every entry whose name contains 'H3K27AC'
    exactly once. the meta rose call uses a 2.5kb tss exclusion (-t 2500) and
    passes no -s, presumably leaving stitching to ROSE2_META's default
    only writes out a .sh file for reproducibility, nothing is run here

    relies on the module level macsEnrichedFolder and maskFile

    returns (path of the .sh file, expected AllEnhancers table path, dataset names used)
    '''

    #For H3K27AC
    #with TSS exclusion and auto stitching

    dataDict = pipeline_dfci.loadDataTable(nb_all_chip_dataFile)
    analysisName = 'NB_H3K27AC'
    namesList = [name for name in dataDict if name.count('H3K27AC') == 1]

    #comma separated lists in matching dataset order
    bamString = ','.join([dataDict[name]['bam'] for name in namesList])

    #each dataset names its control in 'background'; look up that control's bam
    controlBams = [dataDict[name]['background'] for name in namesList]
    controlBamString = ','.join([dataDict[name]['bam'] for name in controlBams])

    bedString = ','.join([macsEnrichedFolder + dataDict[name]['enrichedMacs'] for name in namesList])

    roseFolder = '%smeta_rose/' % (projectFolder)
    roseFolder = utils.formatFolder(roseFolder,True)

    outputFolder = '%s%s/' % (roseFolder,analysisName)
    bashFileName = '%s%s_meta_rose.sh' % (roseFolder,analysisName)

    metaRoseCmd = 'python %sROSE2_META.py -g hg19 -i %s -r %s -c %s -o %s -n %s -t 2500 --mask %s' % (pipeline_dir,bedString,bamString,controlBamString,outputFolder,analysisName,maskFile)

    #the with block closes the script even if a write fails
    with open(bashFileName,'w') as bashFile:
        bashFile.write('#!/usr/bin/bash\n\n')
        bashFile.write('cd %s\n' % (pipeline_dir))
        bashFile.write(metaRoseCmd + '\n')

    #despite the variable name this is the AllEnhancers table expected in the output folder
    region_map_path = '%s%s/%s_AllEnhancers.table.txt' % (roseFolder,analysisName,analysisName)
    return bashFileName,region_map_path,namesList
コード例 #24
0
def map_shep_enhancers(shep_on_dataFile):
    '''
    sets up per dataset enhancer mapping (rose2) for the SHEP H3K27AC time points

    every dataset name in the table is printed, but only the 0HR, 2HR and 6HR
    H3K27AC datasets are handed to pipeline_dfci.callRose2
    relies on the module level projectFolder, macsEnrichedFolder and maskFile
    returns the bash file path that was passed to callRose2
    '''
    #echo everything the data table contains
    print(pipeline_dfci.loadDataTable(shep_on_dataFile).keys())

    rose_parent = utils.formatFolder('%senhancer_rose' % (projectFolder),True)
    rose_script = '%senhancer_rose/shep_on_enhancer_rose.sh' %(projectFolder)

    k27ac_names = ['SHEP_%s_H3K27AC' % (timepoint) for timepoint in ('0HR','2HR','6HR')]

    pipeline_dfci.callRose2(
        shep_on_dataFile,
        macsEnrichedFolder,
        rose_parent,
        k27ac_names,
        [],
        '',
        2500,
        '',
        rose_script,
        maskFile,
    )

    return rose_script
コード例 #25
0
    def wrapRose2Meta(data_file,
                      input_path,
                      parent_folder,
                      active_gene_path='',
                      rank_list=[],
                      control_list=[],
                      analysis_name=''):
        '''
        quick wrapper for Rose2Meta

        writes a bash script holding a ROSE2_META.py call (-s 0 -t 0) followed
        by a ROSE2_geneMapper.py call on the expected AllEnhancers table.
        nothing is executed here.

        data_file: data table used to look up the bam of each dataset name
        input_path: passed to ROSE2_META.py as -i
        parent_folder: analysis_name is appended to it directly for both the
                       output folder and the script path
        active_gene_path: optional gene list, passed to the gene mapper as -l
        rank_list / control_list: dataset names for -r / -c (only read, never
                                  modified, so the list defaults are harmless)
        analysis_name: names the output folder, the tables and the script

        py27_path, pipeline_dir, genome and blacklist_path come from the
        enclosing scope
        '''
        dataDict = pipeline_dfci.loadDataTable(data_file)
        rank_string = ','.join([dataDict[name]['bam'] for name in rank_list])
        control_string = ','.join(
            [dataDict[name]['bam'] for name in control_list])

        output_folder = utils.formatFolder(
            '%s%s' % (parent_folder, analysis_name), True)
        rose2_meta_cmd = '%s %sROSE2_META.py -g %s -i %s -r %s -c %s -n %s -o %s -s 0 -t 0 --mask %s' % (
            py27_path, pipeline_dir, genome, input_path, rank_string,
            control_string, analysis_name, output_folder, blacklist_path)

        all_enhancer_path = '%s%s_AllEnhancers.table.txt' % (output_folder,
                                                             analysis_name)

        rose2_map_cmd = '%s %sROSE2_geneMapper.py -g %s -i %s' % (
            py27_path, pipeline_dir, genome, all_enhancer_path)
        if active_gene_path != '':
            # restrict the mapping to the supplied gene list
            rose2_map_cmd += ' -l %s' % (active_gene_path)

        rose_bash_path = '%s%s_rose2_meta.sh' % (parent_folder, analysis_name)
        # the with block closes the script even if a write fails
        with open(rose_bash_path, 'w') as rose_bash:
            # the file holds shell commands, so it needs a bash shebang
            rose_bash.write('#!/usr/bin/bash\n\n')
            rose_bash.write('#setting up bamliquidator\n')

            rose_bash.write('\n\n#ROSE2_CMD\n')
            rose_bash.write(rose2_meta_cmd + '\n')
            rose_bash.write(rose2_map_cmd + '\n')

        print('Wrote ROSE2 META CMD to %s' % (rose_bash_path))
コード例 #26
0
def process_be2c_drug_rna(be2c_rna_drug_dataFile, gtfFile):
    '''
    hands the BE2C DMSO / JQ1 replicate groups to makeCuffTable

    output goes under <rnaFolder>be2c_drug_cufflinks/ (rnaFolder is module level)
    returns the bash file path that was passed to makeCuffTable
    '''
    cuff_dir = utils.formatFolder(
        '%sbe2c_drug_cufflinks/' % (rnaFolder), True)
    script_path = '%sbe2c_drug_rna_seq_cuff.sh' % (cuff_dir)

    # DMSO controls first, then one group per JQ1 time point
    replicate_groups = [['BE2C_DMSO_B%s' % (rep) for rep in (1, 2, 3)]]
    # the 4HR group only lists replicates 2 and 3
    jq1_layout = (('4HR', (2, 3)), ('8HR', (1, 2, 3)), ('24HR', (1, 2, 3)))
    for timepoint, reps in jq1_layout:
        replicate_groups.append(
            ['BE2C_JQ1_%s_%s' % (timepoint, rep) for rep in reps])

    makeCuffTable(be2c_rna_drug_dataFile, 'BE2C_DRUG', gtfFile,
                  cuff_dir, replicate_groups, script_path)

    return script_path
コード例 #27
0
def define_myc_landscape(projectFolder,pipeline_dir,shep_on_dataFile):

    '''
    defines the myc baseline in shep on system across the union of all time points

    datasets are picked by name: every entry of the data table whose name
    contains 'MYC' exactly once. the meta rose call uses no TSS exclusion and
    no stitching (-t 0 -s 0)
    only writes out a .sh file for reproducibility, nothing is run here

    relies on the module level macsEnrichedFolder and maskFile

    returns (path of the .sh file, expected region map path, dataset names used)
    '''

    #For MYC baseline
    #no TSS exclusion and no stitching

    dataDict = pipeline_dfci.loadDataTable(shep_on_dataFile)
    analysisName = 'SHEP_ON_MYC'
    namesList = [name for name in dataDict if name.count('MYC') == 1]

    #comma separated lists in matching dataset order
    bamString = ','.join([dataDict[name]['bam'] for name in namesList])

    #each dataset names its control in 'background'; look up that control's bam
    controlBams = [dataDict[name]['background'] for name in namesList]
    controlBamString = ','.join([dataDict[name]['bam'] for name in controlBams])

    bedString = ','.join([macsEnrichedFolder + dataDict[name]['enrichedMacs'] for name in namesList])

    roseFolder = '%smeta_rose/' % (projectFolder)
    roseFolder = utils.formatFolder(roseFolder,True)

    outputFolder = '%s%s/' % (roseFolder,analysisName)
    bashFileName = '%s%s_meta_rose.sh' % (roseFolder,analysisName)

    metaRoseCmd = 'python %sROSE2_META.py -g hg19 -i %s -r %s -c %s -o %s -n %s -t 0 -s 0 --mask %s' % (pipeline_dir,bedString,bamString,controlBamString,outputFolder,analysisName,maskFile)

    #the with block closes the script even if a write fails
    with open(bashFileName,'w') as bashFile:
        bashFile.write('#!/usr/bin/bash\n\n')
        bashFile.write('cd %s\n' % (pipeline_dir))
        bashFile.write(metaRoseCmd + '\n')

    #this is the expected region map output
    region_map_path = '%s%s/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt' % (roseFolder,analysisName,analysisName)
    return bashFileName,region_map_path,namesList
コード例 #28
0
def plot_mm_genes(mm1s_dataFile, nb_figure_gff_path, bed_string):
    '''
    batch plots the MYC and H3K27AC datasets of the MM1S data table

    one pipeline_dfci.callBatchPlot call is made per group; a dataset belongs
    to a group when the group string occurs in its name. plots go to
    <genePlotFolder>MM1S/ (genePlotFolder is module level)

    mm1s_dataFile: data table holding the datasets to plot
    nb_figure_gff_path: gff of regions, handed to callBatchPlot
    bed_string: handed to callBatchPlot as bed
    '''

    #first establish the plot folder
    plotFolder = utils.formatFolder('%sMM1S/' % (genePlotFolder), True)
    plot_prefix = 'HG19_NB_FIGURE_GENES'

    #we also have to set the extension properly between datasets

    #go by data file
    dataDict = pipeline_dfci.loadDataTable(mm1s_dataFile)
    #materialized so the first name can be indexed on python 3 as well
    names_list = list(dataDict.keys())

    #the first dataset's read length sets the extension for the whole table:
    #reads are extended so that read length + extension = 200
    bam = utils.Bam(dataDict[names_list[0]]['bam'])
    read_length = bam.getReadLengths()[0]
    bam_extension = 200 - read_length
    print('For datasets in %s using an extension of %s' %
          (mm1s_dataFile, bam_extension))

    #first do individuals
    for plot_group in ['MYC', 'H3K27AC']:
        plotList = [name for name in names_list if plot_group in name]
        plotName = '%s_MM1S_%s' % (plot_prefix, plot_group)
        print(plotName)
        pipeline_dfci.callBatchPlot(mm1s_dataFile,
                                    nb_figure_gff_path,
                                    plotName,
                                    plotFolder,
                                    plotList,
                                    uniform=True,
                                    bed=bed_string,
                                    plotType='MULTIPLE',
                                    extension=bam_extension,
                                    multiPage=False,
                                    debug=False,
                                    nameString='',
                                    rpm=True,
                                    rxGenome='')
コード例 #29
0
def process_be2c_twist_rna(be2c_rna_twist_dataFile, gtfFile):
    '''
    hands the BE2C shT time course replicate groups to makeCuffTable

    output goes under <rnaFolder>be2c_twist_cufflinks/ (rnaFolder is module level)
    returns the bash file path that was passed to makeCuffTable
    '''
    cuff_dir = utils.formatFolder(
        '%sbe2c_twist_cufflinks/' % (rnaFolder), True)
    script_path = '%sbe2c_twist_rna_seq_cuff.sh' % (cuff_dir)

    # one group per condition, three replicates each
    conditions = ('nodox', '3HR', '6HR', '12HR', '24HR', '48HR')
    replicate_groups = [
        ['BE2C_shT_%s_rep%s' % (condition, rep) for rep in (1, 2, 3)]
        for condition in conditions
    ]

    makeCuffTable(be2c_rna_twist_dataFile, 'BE2C_TWIST', gtfFile,
                  cuff_dir, replicate_groups, script_path)

    return script_path
コード例 #30
0
def map_nb_enhancers(nb_all_chip_dataFile):
    '''
    sets up per dataset enhancer mapping (rose2) for four NB H3K27AC datasets

    every dataset name in the table is printed, but only the four names listed
    below are handed to pipeline_dfci.callRose2
    relies on the module level projectFolder, macsEnrichedFolder and maskFile
    returns the bash file path that was passed to callRose2
    '''
    # echo everything the data table contains
    print(pipeline_dfci.loadDataTable(nb_all_chip_dataFile).keys())

    rose_parent = utils.formatFolder('%senhancer_rose' % (projectFolder), True)
    rose_script = '%senhancer_rose/nb_enhancer_rose.sh' % (projectFolder)

    k27ac_names = ['SHEP21_0HR_H3K27AC_NOSPIKE']
    k27ac_names += ['%s_H3K27AC' % (line) for line in ('BE2C', 'KELLY', 'NGP')]

    pipeline_dfci.callRose2(
        nb_all_chip_dataFile,
        macsEnrichedFolder,
        rose_parent,
        k27ac_names,
        [],
        '',
        2500,
        '',
        rose_script,
        maskFile,
    )

    return rose_script
コード例 #31
0
def wrap_dynamic_meta(mouse_dataFile, meta_rose_1, meta_rose_2, output_folder,
                      group1_names, group2_names, name_1, name_2):
    '''
    writes the bash script for a dynamic meta enhancer comparison of two groups

    the script cds into the module level projectFolder and calls
    dynamicEnhancer_meta.py (genome MM9) through srun. nothing is executed
    here; the command and the script path are printed.
    '''

    output_folder = utils.formatFolder(output_folder, True)
    joined_names = (','.join(group1_names), ','.join(group2_names))

    script_path = '%s%s_%s_dynamic.sh' % (output_folder, name_1, name_2)
    script = open(script_path, 'w')

    # header: shebang, then move into the project folder
    script.write('#!/usr/bin/bash\n\n\n')
    script.write('cd %s\n\n' % (projectFolder))

    template = ('srun --mem 16000 python %sdynamicEnhancer_meta.py -g MM9'
                ' -d %s -r %s,%s -o %s --group1 %s --group2 %s'
                ' --name1 %s --name2 %s')
    cmd = template % (pipeline_dir, mouse_dataFile, meta_rose_1, meta_rose_2,
                      output_folder, joined_names[0], joined_names[1],
                      name_1, name_2)
    print(cmd)

    script.write(cmd)
    script.close()

    print(script_path)
コード例 #32
0
def main():
    '''
    main run call

    parses the command line, maps the enhancers of the input table to genes
    and writes the resulting tables to the output folder.

    -i and -g are required. with -r the ranked mapping (mapEnhancerToGeneTop)
    is used and three tables are written; without it the traditional mapping
    (mapEnhancerToGene) writes two. a window other than 50000 is tagged in the
    output names as _<window/1000>KB.

    the annotation files are looked up under <cwd>/annotation/, so the script
    has to be started from the folder that holds them
    raises KeyError for a genome that is not in the annotation table
    '''

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_ENHANCER_FILE]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a ROSE ranked enhancer or super-enhancer file")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter the bam used to rank enhancers")
    parser.add_option("-c", "--control", dest="control", nargs=1, default='',
                      help="Enter a background bam for background correction")

    parser.add_option("-l", "--list", dest="geneList", nargs=1, default=None,
                      help="Enter a gene list to filter through")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder. Default will be same folder as input file")
    parser.add_option(
        "-w", "--window", dest="window", nargs=1, default=50000,
        help="Enter a search distance for genes. Default is 50,000bp")
    parser.add_option(
        "-f", "--format", dest="formatTable", action="store_true", default=False,
        help="If flagged, maintains original formatting of input table")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    # -r is an optional flag: requiring it here would make the traditional
    # mapping branch below unreachable
    if not options.input or not options.genome:

        parser.print_help()
        exit()

    print(options)

    # GETTING THE GENOME
    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # GETTING THE CORRECT ANNOT FILE
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }

    annotFile = genomeDict[genome.upper()]

    # GETTING THE INPUT
    enhancerFile = options.input
    window = int(options.window)

    # making the out folder if it doesn't exist
    if options.out:
        outFolder = utils.formatFolder(options.out, True)
    else:
        # default to the folder holding the input table; a bare file name
        # means the current folder (and not the filesystem root)
        parentFolder = os.path.dirname(enhancerFile)
        if parentFolder:
            outFolder = parentFolder.rstrip('/') + '/'
        else:
            outFolder = './'

    # GETTING BAM INFO
    rankByBamFile = options.rankby
    controlBamFile = options.control

    # CHECK FORMATTING FLAG
    noFormatTable = bool(options.formatTable)

    # GETTING THE TRANSCRIBED LIST
    transcribedFile = options.geneList if options.geneList else ''

    if options.rankby:
        enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable = mapEnhancerToGeneTop(
            rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile, True, window, noFormatTable)
        outputTables = [
            ('ENHANCER_TO_GENE', enhancerToGeneTable),
            ('ENHANCER_TO_TOP_GENE', enhancerToTopGeneTable),
            ('GENE_TO_ENHANCER', geneToEnhancerTable),
        ]
    else:
        # do traditional mapping
        enhancerToGeneTable, geneToEnhancerTable = mapEnhancerToGene(
            annotFile, enhancerFile, transcribedFile, True, window, noFormatTable)
        outputTables = [
            ('ENHANCER_TO_GENE', enhancerToGeneTable),
            ('GENE_TO_ENHANCER', geneToEnhancerTable),
        ]

    # Writing enhancer output
    enhancerFileName = enhancerFile.split('/')[-1].split('.')[0]

    # only a non default window shows up in the file names; floor division
    # keeps the tag an integer on python 3 as well
    if window != 50000:
        windowTag = '_%sKB' % (window // 1000)
    else:
        windowTag = ''

    for tableName, table in outputTables:
        outPath = '%s%s_%s%s.txt' % (
            outFolder, enhancerFileName, tableName, windowTag)
        print("writing output to %s" % (outPath))
        utils.unParseTable(table, outPath, '\t')
コード例 #33
0
ファイル: CRC3.py プロジェクト: linlabcode/pipeline
def collapseFimo(fimo_output,gene_to_enhancer_dict,candidate_tf_list,output_folder,analysis_name,motifConvertFile):

    '''
    collapses motifs from fimo
    for each source node (TF) and each target node (gene enhancer regions), collapse motif instances
    then spit out a ginormous set of beds and a single crazy collapsed bed

    fimo_output: tab delimited fimo text output with a header row
    gene_to_enhancer_dict: accepted for interface compatibility, not used here
    candidate_tf_list: TF names that may act as source nodes
    output_folder: receives <analysis_name>_all_motifs.bed and motif_beds/<tf>_motifs.bed
    analysis_name: used in the track descriptions and the combined bed name
    motifConvertFile: tab delimited table, column 0 = motif name, column 1 = TF name

    returns edgeDict: TF -> target -> list of stitched motif loci
    a fimo table without any hit rows yields header-only beds and empty targets
    '''

    #first build up the motif name conversion database

    motifDatabase = utils.parseTable(motifConvertFile, '\t')
    motifDatabaseDict = defaultdict(list)
    # The reverse of the other dict, from motif name to gene name
    # a motif can go to multiple genes
    for line in motifDatabase:
        motifDatabaseDict[line[0]].append(line[1])

    #make the folder to store motif beds
    utils.formatFolder('%smotif_beds/' % (output_folder),True)

    #constant time membership test instead of list.count per fimo row
    candidate_tf_set = set(candidate_tf_list)

    edgeDict = {}
    #first layer are source nodes
    for tf in candidate_tf_list:
        edgeDict[tf] = defaultdict(list) #next layer are target nodes which are derived from the fimo output

    fimoTable = utils.parseTable(fimo_output,'\t')
    print(fimo_output)

    #fimo sometimes puts the region in either the first or second column
    #only peek at the first hit row when there is one
    if len(fimoTable) > 1 and fimoTable[1][1].count('|') > 0:
        region_index = 1
    else:
        region_index = 2
    print('USING COLUMN %s OF FIMO OUTPUT FOR REGION' % (region_index))

    #the motif start/stop columns directly follow the region column
    start_index = region_index + 1
    stop_index = region_index + 2

    for line in fimoTable[1:]:
        #.get avoids adding unknown motif ids to the defaultdict
        source_tfs = [tf for tf in motifDatabaseDict.get(line[0],[]) if tf in candidate_tf_set]
        if not source_tfs:
            continue

        #region is target|chrom|offset; motif coordinates are relative to the offset
        region = line[region_index].split('|')
        target = region[0]
        region_start = int(region[2])

        for source in source_tfs:
            target_locus = utils.Locus(region[1],region_start + int(line[start_index]),region_start + int(line[stop_index]),'.')
            #what's missing here is the enhancer id of the target locus
            #every candidate tf has an entry in edgeDict, so this cannot raise KeyError
            edgeDict[source][target].append(target_locus)

    #now we actually want to collapse this down in a meaningful way
    #overlapping motifs count as a single binding site. This way a TF with tons of motifs
    #that finds the same site over and over again doesn't get over counted
    all_bed = []
    all_bed_path = '%s%s_all_motifs.bed' % (output_folder,analysis_name)
    for tf in candidate_tf_list:
        print(tf)
        target_nodes = list(edgeDict[tf].keys())
        bed_header = ['track name = "%s" description="%s motifs in %s"' % (tf,tf,analysis_name)]
        all_bed.append(bed_header)
        target_bed = [bed_header]
        target_bed_path = '%smotif_beds/%s_motifs.bed' % (output_folder,tf)
        for target in target_nodes:
            edgeCollection = utils.LocusCollection(edgeDict[tf][target],50)
            edgeCollection = edgeCollection.stitchCollection()
            edgeLoci = edgeCollection.getLoci()
            #replace the raw motif hits with the stitched loci
            edgeDict[tf][target] = edgeLoci
            for locus in edgeLoci:
                bed_line = [locus.chr(),locus.start(),locus.end(),target,'','+']
                target_bed.append(bed_line)
                all_bed.append(bed_line)

        utils.unParseTable(target_bed,target_bed_path,'\t')

    #now the loci are all stitched up
    utils.unParseTable(all_bed,all_bed_path,'\t')
    return edgeDict
コード例 #34
0
ファイル: ROSE2_META.py プロジェクト: BoulderLabs/pipeline
import subprocess
import string

from collections import defaultdict


#==================================================================
#=========================GLOBAL===================================
#==================================================================


# Directory holding this script, with symlinks resolved by realpath.
# Everything in this section runs at import time, including the prints.
whereAmI = os.path.dirname(os.path.realpath(__file__))
print(whereAmI)
# Script folder as returned by utils.formatFolder
# NOTE(review): presumably the same path normalized with a trailing slash;
# the False argument presumably means "do not create" - confirm in utils
codeFolder = utils.formatFolder(whereAmI,False)

print('RUNNING ROSE2_META.py FROM %s' % (whereAmI))

# samtools must be installed
# bare command name, presumably resolved through PATH when invoked
samtoolsPath = 'samtools'

# bamliquidator must be installed
# bare command name, presumably resolved through PATH when invoked
bamliquidator_path = 'bamliquidator_batch'

#==================================================================
#=====================HELPER FUNCTIONS=============================
#==================================================================

def getBamChromList(bamFileList):
コード例 #35
0
def main():
    """
    main run function

    splits a sam file aligned against a combined reference + spike-in genome
    by calling split_sam with the header pair registered for that genome
    combination, and optionally moves the resulting bams to a destination folder

    -i, -g and -s are all needed; help is printed and the script exits when
    one is missing. exits with status 1 when the input does not end in .sam
    or the genome combination is not registered
    """

    parser = argparse.ArgumentParser(
        usage='%(prog)s -i SAMFILE -g REF_GENOME -s SPIKE_GENOME')

    # required flags (checked by hand below so that the full help can be printed)
    parser.add_argument("-i",
                        "--input",
                        dest="inputSam",
                        type=str,
                        help="Enter a sam file",
                        required=False)
    parser.add_argument("-g",
                        "--genome",
                        dest="genome",
                        type=str,
                        help="specify the main reference genome",
                        required=False)
    parser.add_argument("-s",
                        "--spike",
                        dest="spike",
                        type=str,
                        help="specify the spike in  genome",
                        required=False)

    parser.add_argument(
        "-d",
        "--dest",
        dest="dest",
        type=str,
        help="specify an optional destination for the final bams to move to",
        required=False)

    args = parser.parse_args()

    print(args)

    if not (args.inputSam and args.genome and args.spike):
        parser.print_help()
        sys.exit()

    print(
        'FORMATTING %s FOR CHIP_RX USING REFERENCE GENOME %s and SPIKE_IN GENOME %s'
        % (args.inputSam, args.genome, args.spike))
    samPath = args.inputSam

    # the extension is sliced off by position below, so it really has to be
    # the last four characters
    if not samPath.upper().endswith('.SAM'):
        print('ERROR, file must end in .sam or .SAM')
        sys.exit(1)

    #get the headers
    genome_string = ('%s_%s' % (args.genome, args.spike)).upper()

    # genome combination -> [reference header, spike-in header]
    genomeDict = {
        'RN6_DM6': [
            '/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index_dm6/rn6_header.SAM',
            '/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index_dm6/dm6_header.SAM'
        ],
    }

    if genome_string not in genomeDict:
        print('ERROR: UNSUPPORTED GENOME COMBINATION %s' % (genome_string))
        sys.exit(1)

    print('IDENTIFIED HEADERS FOR GENOME COMBINATION %s' % (genome_string))

    header1, header2 = genomeDict[genome_string]
    suffix = ('_%s' % (args.spike)).lower()

    # insert the genome name in front of the extension; only the trailing
    # extension is touched, an earlier '.sam' in the path stays as it is
    stem, extension = samPath[:-4], samPath[-4:]
    outfile1 = '%s.%s%s' % (stem, args.genome, extension)
    outfile2 = '%s.%s%s' % (stem, args.spike, extension)
    split_sam(suffix, samPath, header1, header2, outfile1, outfile2)

    #move stuff to destination folder
    if args.dest:
        bamFolder = utils.formatFolder(args.dest, False)

        samFolder = utils.getParentFolder(samPath)

        # moves everything matching *bam* in the sam's folder
        mv_cmd = 'mv %s*bam* %s' % (samFolder, bamFolder)
        print('MOVING BAMS FROM %s TO %s' % (samFolder, bamFolder))
        os.system(mv_cmd)
コード例 #36
0
def main():

    '''
    main run function for the enhancer clustering analysis

    -d (data file) and -o (output folder) are required; help is printed and
    the script exits when either is missing. the steps then run in order:
    build the name dict, launch enhancer mapping where needed, get the median
    signal per sample, merge enhancers into consensus regions, map data to
    them, build the signal table, call the clustering R script and finally
    run ROSE_geneMapper.py on the cluster table

    NOTE(review): the gene mapping step uses the hard coded /ark/home/cl512/rose/ install
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -d [DATA_FILE] -i [NAMES_LIST] -n [ANALYSIS_NAME] -r [ROSE_FOLDER] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter a data file for datasets to be processed")
    parser.add_option("-o","--output",dest="output",nargs =1, default = None,
                      help = "specify an output folder to write results to")

    #additional options
    parser.add_option("-i","--input", dest="input",nargs = 1, default=None,
                      help = "Enter a comma separated list of names to analyze. Default will be all datasets")

    parser.add_option("-n","--name", dest="name",nargs=1,default=None,
                      help = "Enter a name for the analysis")

    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a folder to detect or write rose output")

    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "flag to run analysis on ALL enhancers (this is much slower)")

    (options,args) = parser.parse_args()

    print(options)
    print(args)

    #check to see if minimum arguments are met
    if not (options.data and options.output):
        parser.print_help()
        sys.exit()

    #pull in arguments

    #pull in the datafile and create a datadict
    dataFile = options.data

    #now the output folder
    outputFolder = utils.formatFolder(options.output,True) #check and create the output folder
    #now the rose folder
    if options.rose:
        roseFolder = options.rose
    else:
        roseFolder = "%srose/" % (outputFolder)

    #an empty list is handed on when no names were given
    if options.input:
        namesList = options.input.split(',')
    else:
        namesList = []

    #get the genome from the first dataset of the table
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    genome = dataDict[list(dataDict.keys())[0]]['genome']

    #check if using only supers
    superOnly = not options.all

    #get the analysis name
    if options.name:
        analysisName = options.name
    else:
        analysisName = "enhancers"

    #=====================================================
    #=================SUMMARIZE INPUTS====================
    #=====================================================

    print("WORKING IN GENOME %s" % (genome))
    print("DRAWING DATA FROM %s AND ROSE FOLDER %s" % (dataFile,roseFolder))
    print("USING %s AS THE OUTPUT FOLDER" % (outputFolder))
    print("STARTING ANALYSIS ON THE FOLLOWING DATASETS:")
    print(namesList)

    #=====================================================
    #==============ESTABLISH ALL WORKING FILES============
    #=====================================================

    print("\n\n\nESTABLISHING WORKING FILES")
    nameDict = makeNameDict(dataFile,roseFolder,namesList)

    print(nameDict)

    #=====================================================
    #==============LAUNCH ENHANCER MAPPING================
    #=====================================================

    print("\n\n\nLAUNCHING ENHANCER MAPPING (IF NECESSARY)")
    nameDict = launchEnhancerMapping(dataFile,nameDict,outputFolder)
    print(nameDict)

    #=====================================================
    #====================GET MEDIAN SIGNAL================
    #=====================================================

    print("\n\n\nGETTING MEDIAN ENHANCER SIGNAL FROM EACH SAMPLE")
    medianDict = makeMedianDict(nameDict)

    print(medianDict)

    #=====================================================
    #====================MERGING ENHANCERS================
    #=====================================================

    print("\n\n\nIDENTIFYING CONSENSUS ENHANCER REGIONS")
    mergedGFFFile = "%s%s_%s_-0_+0.gff" % (outputFolder,genome,analysisName)
    mergeCollections(nameDict,analysisName,mergedGFFFile,superOnly)

    #=====================================================
    #===============MAP TO MERGED REGIONS=================
    #=====================================================

    print("\n\n\nMAPPING DATA TO CONSENSUS ENHANCER REGIONS")
    mergedRegionMap = mapMergedGFF(dataFile,nameDict,mergedGFFFile,analysisName,outputFolder)

    #=====================================================
    #==============CORRECT FOR MEDIAN SIGNAL==============
    #=====================================================

    print("\n\n\nCREATING ENHANCER SIGNAL TABLE")
    signalTableFile = makeEnhancerSignalTable(mergedRegionMap,medianDict,analysisName,genome,outputFolder)

    #=====================================================
    #===============CALL CLUSTERING R SCRIPT==============
    #=====================================================

    print("\n\n\nGENERATING CLUSTERING OUTPUT")
    clusterTableFile = callRScript(genome,outputFolder,analysisName,signalTableFile)
    #output should be
    #png of cluster gram with rows as genes
    #png of cluster gram of samples w/ tree
    #ordered table w/ cluster assignment
    #similarity matrix for samples

    #=====================================================
    #=============GENE MAPPING BY CLUSTER=================
    #=====================================================

    #the gene mapper is run from inside its own folder
    os.chdir('/ark/home/cl512/rose/')
    cmd = 'python /ark/home/cl512/rose/ROSE_geneMapper.py -g %s -i %s' % (genome,clusterTableFile)
    os.system(cmd)

    print("FINISHED")
コード例 #37
0
ファイル: callBowtie2.py プロジェクト: BoulderLabs/pipeline
def main():

    '''
    main run function

    parses the command line and writes a bash script named
    <OUTPUTFOLDER><UNIQUEID>_bwt2.sh holding, in order, the commands that
    extract the fastq, optionally run fastqc, run bowtie2, build/sort/index
    the bam, move the results and clean up the temp folder.
    this function only writes the script, it does not run it.

    prints the help text and exits if any of -f, -g, -u, -o is missing.
    prints an error and exits with status 1 if the genome has no entry in
    the bowtie2 index table.
    '''

    # local imports, in keeping with the function-scope OptionParser import
    import sys
    from optparse import OptionParser

    usage = "usage: %prog [options] -f [FASTQFILE] -g [GENOME] -u [UNIQUEID] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-f","--fastq", dest="fastq",nargs = 1, default=None,
                      help = "Enter the full path of a fastq file to be mapped")
    parser.add_option("-g","--genome",dest="genome",nargs =1, default = None,
                      help = "specify a genome, options are hg19,hg18, mm9 or geckov2 right now")
    parser.add_option("-u","--unique",dest="unique",nargs =1, default = None,
                      help = "specify a uniqueID")
    parser.add_option("-o","--output",dest="output",nargs =1, default = None,
                      help = "Specify an output folder")

    #optional arguments
    parser.add_option("--param",dest="paramString",nargs =1, default = '',
                      help = "A string of bowtie parameters")
    parser.add_option("--link-folder",dest="linkFolder",nargs =1, default = None,
                      help = "Specify a folder to symlink the bam")
    parser.add_option("-p","--paired",dest="paired",action='store_true',default = False,
                      help = "Flag for paired end data")
    parser.add_option("-S","--sam",dest="sam",action='store_true',default = False,
                      help = "Flag to save sam")
    parser.add_option("-q","--qc",dest="qc",action='store_true',default = False,
                      help = "Flag to run fastqc")

    (options,args) = parser.parse_args()

    #all four required flags must be present
    if not options.fastq or not options.genome or not options.unique or not options.output:
        parser.print_help()
        sys.exit()

    #retrieve the arguments
    fastqFile = options.fastq
    genome = options.genome.lower()
    uniqueID = options.unique

    #make the output folder
    outputFolder = utils.formatFolder(options.output,True)

    #retrieve optional arguments
    paramString = options.paramString
    #downstream helpers receive an empty string when no link folder is given
    linkFolder = options.linkFolder or ''
    pairedEnd = options.paired

    #get the bowtie index
    bowtieDict = {
        'mm9':'/raider/index/mm9/Bowtie2Index/genome',
        'hg19':'/raider/index/hg19/Bowtie2Index/genome',
        'hg18':'/grail/genomes/Homo_sapiens/human_gp_mar_06_no_random/bowtie/hg18',
        'geckov2':'/grail/genomes/gecko/GeCKOv2/Sequence/Bowtie2Index/gecko',
        'ribo':'/raider/temp/rDNA/hg19_45S_index/genome',
        'hg19_ribo':'/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index_ribo/genome',
        }

    #fail with a readable message instead of a bare KeyError traceback
    if genome not in bowtieDict:
        print("ERROR: UNSUPPORTED GENOME TYPE %s. SUPPORTED GENOMES ARE: %s" % (genome,', '.join(sorted(bowtieDict.keys()))))
        sys.exit(1)
    bowtieIndex = bowtieDict[genome]

    #get the temp string
    #random suffix handed to makeFileNameDict along with the temp parent folder
    tempString = '_%s' % str(random.randint(1,10000))

    fileNameDict = makeFileNameDict(fastqFile,genome,tempString,tempParentFolder,outputFolder,linkFolder,uniqueID,pairedEnd)

    #open the bashfile to write to
    #the with block guarantees the handle is closed even if a command builder raises
    bashFileName = "%s%s_bwt2.sh" % (outputFolder,uniqueID)
    with open(bashFileName,'w') as bashFile:

        #shebang
        bashFile.write('#!/usr/bin/bash\n')

        #make temp directory
        cmd = 'mkdir %s' % (fileNameDict['tempFolder'])
        bashFile.write(cmd+'\n')

        #extract fastq
        cmd = extractFastqCmd(fileNameDict,pairedEnd)
        bashFile.write(cmd+'\n')

        #call fastqc
        if options.qc:
            cmd = runFastQC(fastqcString,fileNameDict,pairedEnd)
            bashFile.write(cmd+'\n')

        #call bowtie
        cmd = bowtieCmd(bowtieString,paramString,bowtieIndex,fileNameDict,pairedEnd)
        bashFile.write(cmd+'\n')

        #remove temp fastq
        cmd = removeTempFastqCmd(fileNameDict,pairedEnd)
        bashFile.write(cmd+'\n')

        #generate a bam
        cmd = generateTempBamCmd(samtoolsString,fileNameDict)
        bashFile.write(cmd+'\n')

        #change into the temp directory
        cmd = changeTempDir(fileNameDict)
        bashFile.write(cmd+'\n')

        #sort the bam
        cmd = sortBamCmd(samtoolsString,fileNameDict)
        bashFile.write(cmd+'\n')

        #index
        cmd = indexBamCmd(samtoolsString,fileNameDict)
        bashFile.write(cmd+'\n')

        #either keep (move) the sam or remove it
        if options.sam:
            cmd = mvSamCmd(fileNameDict)
        else:
            cmd = rmSamCmd(fileNameDict)
        bashFile.write(cmd+'\n')

        #mv bams
        cmd = mvBamCmd(fileNameDict)
        bashFile.write(cmd+'\n')

        #link bams
        if options.linkFolder:
            cmd = linkBamCmd(fileNameDict)
            bashFile.write(cmd+'\n')

        #cleanup
        cmd = rmTempFiles(fileNameDict)
        bashFile.write(cmd+'\n')

    print("Wrote mapping command to %s" % (bashFileName))
コード例 #38
0
ファイル: ROSE2_META.py プロジェクト: linlabcode/pipeline
def main():
    '''
    main run call

    parses the command line, converts/copies the input region files into
    <out>/gff/, merges them into one master gff, optionally masks regions,
    stitches the regions, maps every bam to the stitched gff with
    bamliquidator, builds the region signal map, and finally shells out to
    the ROSE2 R script and the ROSE2 gene mapper.

    prints the help text and exits when any of -i, -r, -o, -g is missing.
    exits on an unsupported genome, a bad control bam count, a mask that is
    neither .gff nor .bed, or a failed bam mapping.
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a comma separated list of .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="Enter a comma separated list of bams to rank by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-n", "--name", dest="name", nargs=1, default=None,
                      help="Provide a name for the analysis otherwise ROSE will guess")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="Enter a comma separated list of control bams. Can either provide a single control bam for all rankby bams, or provide a control bam for each individual bam")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions")

    # RETRIEVING FLAGS
    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        sys.exit()

    # making the out folder if it doesn't exist
    outFolder = utils.formatFolder(options.out, True)

    # figuring out folder schema
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # GETTING INPUT FILE(s)
    inputList = [inputFile for inputFile in options.input.split(',') if len(inputFile) > 1]

    # converting all input files into GFFs and moving into the GFF folder
    inputGFFList = []
    for inputFile in inputList:
        # decide per file; the whole comma separated option string must not
        # be used here or .gff files in a mixed list get a spurious warning
        extension = inputFile.split('.')[-1]
        if extension == 'bed':
            # CONVERTING A BED TO GFF
            inputGFFName = inputFile.split('/')[-1][0:-4]  # strips the last 4 characters i.e. '.bed'
            inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
            utils.bedToGFF(inputFile, inputGFFFile)
        else:
            if extension != 'gff':
                print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
            # COPY THE INPUT GFF TO THE GFF FOLDER
            os.system('cp %s %s' % (inputFile, gffFolder))
            inputGFFFile = '%s%s' % (gffFolder, inputFile.split('/')[-1])
        inputGFFList.append(inputGFFFile)

    # GETTING THE LIST OF BAMFILES TO PROCESS
    # either same number of bams for rankby and control
    # or only 1 control #or none!
    # bamlist should be all rankby bams followed by control bams
    rankbyBamList = [bam for bam in options.rankby.split(',') if len(bam) > 0]
    if options.control:
        controlBamList = [bam for bam in options.control.split(',') if len(bam) > 0]

        if len(controlBamList) == len(rankbyBamList):
            # case where an equal number of backgrounds are given
            bamFileList = rankbyBamList + controlBamList
        elif len(controlBamList) == 1:
            # case where a universal background is applied
            bamFileList = rankbyBamList + controlBamList * len(rankbyBamList)
        else:
            print('ERROR: EITHER PROVIDE A SINGLE CONTROL BAM FOR ALL SAMPLES, OR ONE CONTROL BAM FOR EACH SAMPLE')
            sys.exit()
    else:
        bamFileList = rankbyBamList

    # Stitch parameter
    # an empty string is passed through to regionStitching when -s is not given
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # tss options
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    # GETTING THE GENOME
    genome = options.genome.upper()
    print('USING %s AS THE GENOME' % (genome))

    # GETTING THE CORRECT ANNOT FILE
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (pipeline_dir),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (pipeline_dir),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (pipeline_dir),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (pipeline_dir),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (pipeline_dir),
        'RN4': '%s/annotation/rn4_refseq.ucsc' % (pipeline_dir),
    }

    try:
        annotFile = genomeDict[genome]
    except KeyError:
        print('ERROR: UNSUPPORTED GENOMES TYPE %s' % (genome))
        sys.exit()

    # FINDING THE ANALYSIS NAME
    if options.name:
        inputName = options.name
    else:
        # fall back to the first input file name up to its first '.'
        inputName = inputGFFList[0].split('/')[-1].split('.')[0]
    print('USING %s AS THE ANALYSIS NAME' % (inputName))

    print('FORMATTING INPUT REGIONS')
    # MAKING THE RAW INPUT FILE FROM THE INPUT GFFs
    # use a simpler unique region naming system
    if len(inputGFFList) == 1:
        inputGFF = utils.parseTable(inputGFFList[0], '\t')
    else:
        inputLoci = []
        for gffFile in inputGFFList:
            print('\tprocessing %s' % (gffFile))
            gff = utils.parseTable(gffFile, '\t')
            gffCollection = utils.gffToLocusCollection(gff, 50)
            inputLoci += gffCollection.getLoci()

        inputCollection = utils.LocusCollection(inputLoci, 50)
        inputCollection = inputCollection.stitchCollection()  # stitches to produce unique regions

        inputGFF = utils.locusCollectionToGFF(inputCollection)

    formattedGFF = []
    # now number things appropriately
    for i, line in enumerate(inputGFF):

        chrom = line[0]
        coords = [int(line[3]), int(line[4])]
        sense = line[6]

        # region ids are <inputName>_<n>, counting from 1
        lineID = '%s_%s' % (inputName, str(i + 1))

        newLine = [chrom, lineID, lineID, min(coords), max(coords), '', sense, '', lineID]
        formattedGFF.append(newLine)

    # name of the master input gff file
    masterGFFFile = '%s%s_%s_ALL_-0_+0.gff' % (gffFolder, genome.upper(), inputName)
    utils.unParseTable(formattedGFF, masterGFFFile, '\t')

    print('USING %s AS THE INPUT GFF' % (masterGFFFile))

    # MAKING THE START DICT
    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    # GET CHROMS FOUND IN THE BAMS
    print('GETTING CHROMS IN BAMFILES')
    bamChromList = getBamChromList(bamFileList)
    print("USING THE FOLLOWING CHROMS")
    print(bamChromList)

    # LOADING IN THE GFF AND FILTERING BY CHROM
    print('LOADING AND FILTERING THE GFF')
    inputGFF = filterGFF(masterGFFFile, bamChromList)
    # LOADING IN THE BOUND REGION REFERENCE COLLECTION
    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFF)

    print('CHECKING REFERENCE COLLECTION:')
    checkRefCollection(referenceCollection)

    # MASKING REFERENCE COLLECTION
    # see if there's a mask
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        # now mask the reference loci: keep only loci with no mask overlap
        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    # NOW STITCH REGIONS
    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    # NOW MAKE A STITCHED COLLECTION GFF
    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)

    # reports the stitch window returned by regionStitching and its type
    print(stitchWindow)
    print(type(stitchWindow))

    # the gff, its name and the debug file all share one root name
    stitchKB = str(stitchWindow / 1000)
    if not removeTSS:
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchKB)
    else:
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, stitchKB)
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    # WRITING DEBUG OUTPUT TO DISK
    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    # WRITE THE GFF TO DISK
    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    # SETTING UP THE OVERALL OUTPUT FILE
    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    # MAPPING TO THE STITCHED GFF
    # bamliquidator_path is defined outside this function

    # prevent redundant mapping when one control bam is repeated
    bamFileListUnique = utils.uniquify(list(bamFileList))
    print("MAPPING TO THE FOLLOWING BAMS:")
    print(bamFileListUnique)
    for bamFile in bamFileListUnique:

        bamFileName = bamFile.split('/')[-1]

        mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName)
        mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, stitchedGFFName, bamFileName)
        # skip the mapping when its output is already present
        if utils.checkOutput(mappedOut1File, 0.2, 0.2):
            print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File))
        else:
            cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile)
            print(cmd1)

            os.system(cmd1)
            if utils.checkOutput(mappedOut1File, 0.2, 5):
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # CALCULATE DENSITY BY REGION
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)

    print('FINDING AVERAGE SIGNAL AMONGST BAMS')
    metaOutputFile = collapseRegionMap(outputFile1, inputName + '_MERGED_SIGNAL', controlBams=options.control)

    # now try the merging
    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    controlName = 'NONE'
    cmd = 'Rscript %sROSE2_callSuper.R %s %s %s %s' % (pipeline_dir, outFolder, metaOutputFile, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # calling the gene mapper
    print('CALLING GENE MAPPING')

    # for now don't use ranking bam to call top genes
    # the same mapper command is run on each of the three enhancer tables
    tableSuffixList = [
        '_SuperEnhancers.table.txt',
        '_StretchEnhancers.table.txt',
        '_SuperStretchEnhancers.table.txt',
    ]
    for tableSuffix in tableSuffixList:
        tableFile = "%s%s" % (inputName, tableSuffix)
        cmd = "python %sROSE2_geneMapper.py -g %s -i %s%s -f" % (pipeline_dir, genome, outFolder, tableFile)
        print(cmd)
        os.system(cmd)
コード例 #39
0
ファイル: enhancerPromoter.py プロジェクト: mufrdrk/pipeline
def main():
    '''
    main run method for enhancer promoter contribution tool

    parses the command line, loads the input regions and annotation,
    splits the regions against the TSS collection, maps the bams to the
    split gff, builds the signal/average/peak/gene tables in the output
    folder, then calls the R plotting and GSEA helpers.

    raises ValueError if an activity table is given but no column of its
    first row holds a refseq id (NM_ or NR_).
    '''

    parser = argparse.ArgumentParser(usage='%(prog)s [options]')

    # required flags
    parser.add_argument(
        "-b",
        "--bam",
        dest="bam",
        nargs='*',
        help="Enter a space separated list of .bam files for the main factor",
        required=True)
    parser.add_argument("-i",
                        "--input",
                        dest="input",
                        type=str,
                        help="Enter .gff or .bed file of regions to analyze",
                        required=True)
    parser.add_argument(
        "-g",
        "--genome",
        dest="genome",
        type=str,
        help=
        "specify a genome, HG18,HG19,HG38,MM8,MM9,MM10,RN6 are currently supported",
        required=True)

    # output flag
    parser.add_argument("-o",
                        "--output",
                        dest="output",
                        type=str,
                        help="Enter the output folder.",
                        required=True)

    # additional options flags and optional arguments
    parser.add_argument(
        "-a",
        "--activity",
        dest="activity",
        type=str,
        help=
        "specify a table where first column represents a list of active refseq genes",
        required=False)

    parser.add_argument(
        "-c",
        "--control",
        dest="control",
        nargs='*',
        help=
        "Enter a space separated list of .bam files for background. If flagged, will perform background subtraction",
        required=False)
    parser.add_argument(
        "-w",
        "--window",
        dest="window",
        type=int,
        help=
        "Enter a window to define the TSS area +/- the TSS. Default is 1kb",
        required=False,
        default=1000)
    parser.add_argument(
        "--other-bams",
        dest="other",
        nargs='*',
        help="enter a space separated list of other bams to map to",
        required=False)

    parser.add_argument(
        "--name",
        dest="name",
        type=str,
        help=
        "enter a root name for the analysis, otherwise will try to find the name from the input file",
        required=False)

    parser.add_argument(
        "--top",
        dest="top",
        type=int,
        help=
        "Run the analysis on the top N genes by total signal. Default is 5000",
        required=False,
        default=5000)
    parser.add_argument(
        "--tads",
        dest="tads",
        type=str,
        help=
        "Include a .bed of tad regions to restrict enhancer/gene association",
        required=False,
        default=None)

    args = parser.parse_args()

    print(args)

    #minimum arguments needed to proceed
    #(-b is nargs='*', so it can be present but empty)
    if args.bam and args.input and args.genome and args.output:

        #=====================================================================================
        #===============================I. PARSING ARGUMENTS==================================
        #=====================================================================================

        print(
            '\n\n#======================================\n#===========I. DATA SUMMARY============\n#======================================\n'
        )

        #top analysis subset
        top = args.top

        #input genome
        genome = args.genome.upper()
        print('PERFORMING ANALYSIS ON %s GENOME BUILD' % (genome))

        #set of bams
        bamFileList = args.bam

        #bring in the input path
        inputPath = args.input

        #try to get the input name or use the name argument
        if args.name:
            analysisName = args.name
        else:
            #file name up to its first '.'
            analysisName = inputPath.split('/')[-1].split('.')[0]

        print('USING %s AS ANALYSIS NAME' % (analysisName))
        #setting up the output folder
        parentFolder = utils.formatFolder(args.output, True)
        outputFolder = utils.formatFolder(
            '%s%s' % (parentFolder, analysisName), True)

        print('WRITING OUTPUT TO %s' % (outputFolder))

        if inputPath.split('.')[-1] == 'bed':
            #type is bed
            print('input in bed format, converting to gff')
            inputGFF = utils.bedToGFF(inputPath)
        else:
            inputGFF = utils.parseTable(inputPath, '\t')

        #the tss window
        window = int(args.window)

        #activity path
        if args.activity:
            activityPath = args.activity
            activityTable = utils.parseTable(activityPath, '\t')

            #try to find the column for refseq id by inspecting the first row
            #no break: as before, the last matching column wins
            ref_col = None
            for i, value in enumerate(activityTable[0]):
                if 'NM_' in str(value) or 'NR_' in str(value):
                    ref_col = i

            #fail with a readable message rather than an unbound ref_col
            if ref_col is None:
                raise ValueError(
                    'NO REFSEQ ID (NM_ or NR_) FOUND IN THE FIRST ROW OF ACTIVITY TABLE %s'
                    % (activityPath))

            geneList = [line[ref_col] for line in activityTable
                        ]  # this needs to be REFSEQ NM ID
            print('IDENTIFIED %s ACTIVE GENES' % (len(geneList)))

        else:
            geneList = []

        #check if tads are being invoked
        if args.tads:
            print('LOADING TAD LOCATIONS FROM %s' % (args.tads))
            tads_path = args.tads
        else:
            tads_path = ''

        print('LOADING ANNOTATION DATA FOR GENOME %s' % (genome))

        #important here to define the window
        startDict, tssCollection, genomeDirectory, chrom_list, mouse_convert_dict = loadAnnotFile(
            genome, window, geneList, True)

        print('FILTERING THE INPUT GFF FOR GOOD CHROMOSOMES')

        print(chrom_list)
        filtered_gff = [
            line for line in inputGFF if chrom_list.count(line[0]) > 0
        ]

        print('%s of INITIAL %s REGIONS ARE IN GOOD CHROMOSOMES' %
              (len(filtered_gff), len(inputGFF)))

        #=====================================================================================
        #================II. IDENTIFYING TSS PROXIMAL AND DISTAL ELEMENTS=====================
        #=====================================================================================

        print(
            '\n\n#======================================\n#==II. MAPPING TO TSS/DISTAL REGIONS===\n#======================================\n'
        )

        #now we need to split the input region
        print('SPLITTING THE INPUT GFF USING A WINDOW OF %s' % (window))
        splitGFF = splitRegions(filtered_gff, tssCollection)
        print(len(filtered_gff))
        print(len(splitGFF))

        splitGFFPath = '%s%s_SPLIT.gff' % (outputFolder, analysisName)
        utils.unParseTable(splitGFF, splitGFFPath, '\t')
        print('WRITING TSS SPLIT GFF OUT TO %s' % (splitGFFPath))

        #now you have to map the bams to the gff
        print('MAPPING TO THE SPLIT GFF')
        mappedFolder = utils.formatFolder('%sbam_mapping' % (outputFolder),
                                          True)

        signalTable = mapBams(bamFileList, splitGFFPath, analysisName,
                              mappedFolder)
        signalTablePath = '%s%s_signal_table.txt' % (outputFolder,
                                                     analysisName)
        utils.unParseTable(signalTable, signalTablePath, '\t')

        if args.control:
            controlBamFileList = args.control
            controlSignalTable = mapBams(controlBamFileList, splitGFFPath,
                                         analysisName, mappedFolder)
            controlSignalTablePath = '%s%s_control_signal_table.txt' % (
                outputFolder, analysisName)
            utils.unParseTable(controlSignalTable, controlSignalTablePath,
                               '\t')

        #now create the background subtracted summarized average table

        print('CREATING AN AVERAGE SIGNAL TABLE')
        averageTable = makeAverageTable(outputFolder,
                                        analysisName,
                                        useBackground=args.control)
        averageTablePath = '%s%s_average_table.txt' % (outputFolder,
                                                       analysisName)
        utils.unParseTable(averageTable, averageTablePath, '\t')

        #now load up all of the cpg and other parameters to make the actual peak table

        #first check if this has already been done
        peakTablePath = '%s%s_PEAK_TABLE.txt' % (outputFolder, analysisName)
        if utils.checkOutput(peakTablePath, 0.1, 0.1):
            print('PEAK TABLE OUTPUT ALREADY EXISTS')
            peakTable = utils.parseTable(peakTablePath, '\t')
        else:
            #paramDict is not defined in this function; it comes from outside
            peakTable = makePeakTable(paramDict, splitGFFPath,
                                      averageTablePath, startDict, geneList,
                                      genomeDirectory, tads_path)
            utils.unParseTable(peakTable, peakTablePath, '\t')

        geneTable = makeGeneTable(peakTable, analysisName)

        geneTablePath = '%s%s_GENE_TABLE.txt' % (outputFolder, analysisName)
        utils.unParseTable(geneTable, geneTablePath, '\t')

        #if mouse, need to convert genes over
        if genome.count('MM') == 1:
            print('CONVERTING MOUSE NAMES TO HUMAN HOMOLOGS FOR GSEA')
            converted_geneTablePath = '%s%s_GENE_TABLE_CONVERTED.txt' % (
                outputFolder, analysisName)

            #keep the header, then only rows whose gene has a converted name
            converted_geneTable = [geneTable[0]]
            for line in geneTable[1:]:
                converted_name = mouse_convert_dict[line[0]]
                if len(converted_name) > 0:
                    converted_geneTable.append([converted_name] + line[1:])

            #write once after the loop so the file always exists and is not
            #rewritten for every converted gene
            utils.unParseTable(converted_geneTable, converted_geneTablePath,
                               '\t')

            geneTablePath = converted_geneTablePath
            geneTable = converted_geneTable

        #=====================================================================================
        #===================================III. PLOTTING ====================================
        #=====================================================================================

        print(
            '\n\n#======================================\n#===III. PLOTTING ENHANCER/PROMOTER===\n#======================================\n'
        )

        #if there are fewer genes in the gene table than the top genes, only run on all
        if len(geneTable) < int(top):
            print(
                'WARNING: ONLY %s GENES WITH SIGNAL AT EITHER PROMOTERS OR ENHANCERS. NOT ENOUGH TO RUN ANALYSIS ON TOP %s'
                % (len(geneTable) - 1, top))
            top = 0
            use_top = False
        else:
            use_top = True

        #now call the R code
        print('CALLING R PLOTTING SCRIPTS')
        callRWaterfall(geneTablePath, outputFolder, analysisName, top)

        #=====================================================================================
        #==================================IV. RUNNING GSEA===================================
        #=====================================================================================

        print(
            '\n\n#======================================\n#============IV. RUNNING GSEA=========\n#======================================\n'
        )

        #now let's call gsea
        print('RUNNING GSEA ON C2')
        callGSEA(outputFolder, analysisName, top, 'enhancer_vs_promoter',
                 use_top)
        callGSEA(outputFolder, analysisName, top, 'total_contribution',
                 use_top)

        if use_top:
            print('DETECTING GSEA OUTPUT FOR TOP %s GENES' % (top))
            #for top by enhancer v promoter metric
            top_promoterTablePath, top_distalTablePath = detectGSEAOutput(
                analysisName, outputFolder, top, 'enhancer_vs_promoter')
            top_signalTablePath, top_backgroundTablePath = detectGSEAOutput(
                analysisName, outputFolder, top, 'total_contribution')

            print('MAKING NES PLOTS FOR TOP %s GENES' % (top))
            callR_GSEA(top_promoterTablePath, top_distalTablePath,
                       outputFolder, analysisName + '_enhancer_vs_promoter',
                       top)
            callR_GSEA(top_signalTablePath, top_backgroundTablePath,
                       outputFolder, analysisName + '_total_contribution', top)

        print('DETECTING GSEA OUTPUT FOR ALL GENES')
        #for all genes
        all_promoterTablePath, all_distalTablePath = detectGSEAOutput(
            analysisName, outputFolder, 'all')

        print('MAKING NES PLOTS FOR ALL GENES')
        callR_GSEA(all_promoterTablePath, all_distalTablePath, outputFolder,
                   analysisName, 'all')

        #these files can be parsed to make the NES plot

        print('ALL DONE WITH ANALYSIS FOR %s' % (analysisName))
コード例 #40
0
def makeNameDict(dataFile, roseFolder, namesList=None, enhancerType='super'):
    '''
    for each name, check for the presence of an enriched file or allEnhancer table
    these are the files required for enhancer clustering

    dataFile: path to the data table, loaded w/ pipeline_dfci.loadDataTable
    roseFolder: folder expected to hold one [NAME]_ROSE folder per dataset
                it is created if it does not exist yet
    namesList: dataset names to check. if empty or None, every dataset in the
               data table whose name does not contain WCE or INPUT is used
    enhancerType: 'super', 'stretch' or 'superstretch'. picks which rose
                  table is searched for

    returns a dictionary keyed by dataset name. each entry holds
        'background': True if the listed background is itself in the data table
        'enrichedFile': path to the macs enriched file, '' if it cannot be opened
        'enhancerFile': path to the rose enhancer table, '' if none is found

    exits if a dataset has neither an enriched file nor an enhancer file
    raises ValueError if rose output is searched w/ an unknown enhancerType
    '''

    #file name fragment identifying the rose table for each enhancer type
    enhancerStringDict = {
        'super': 'AllEnhancers.table.txt',
        'stretch': 'AllEnhancers_Length.table.txt',
        'superstretch': 'AllEnhancers_SuperStretch.table.txt',
    }

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #draw the parent folder from the dataFile
    parentFolder = utils.getParentFolder(dataFile)
    print("Using %s as the parent folder" % (parentFolder))

    #check to see if a rose folder exists already
    #formatFolder w/ False gives back something falsy for a missing folder
    existingRoseFolder = utils.formatFolder(roseFolder, False)
    if existingRoseFolder:
        roseExists = True
        roseFolder = existingRoseFolder
    else:
        roseExists = False
        roseFolder = utils.formatFolder(roseFolder, True)

    #if no namesList is given, use every dataset that is not a WCE or INPUT
    if not namesList:
        namesList = [
            name for name in dataDict.keys()
            if name.upper().count('WCE') == 0
            and name.upper().count('INPUT') == 0
        ]

    #now check that all of the datasets at a minimum have a rose output OR enriched region file

    nameDict = {}
    for name in namesList:

        nameDict[name] = {}

        #check if each dataset has a background
        backgroundName = dataDict[name]['background']
        nameDict[name]['background'] = backgroundName in dataDict

        #assumes standard folder structure for enriched file
        enrichedFile = "%smacsEnriched/%s" % (parentFolder,
                                              dataDict[name]['enrichedMacs'])

        print("Looking for macs output at %s" % (enrichedFile))

        #the file only counts if it can actually be opened for reading
        try:
            with open(enrichedFile, 'r'):
                pass
            nameDict[name]['enrichedFile'] = enrichedFile
        except IOError:
            nameDict[name]['enrichedFile'] = ''

        #roseOutput looks for standard format rose output
        #if the rose folder doesn't exist, don't bother
        nameDict[name]['enhancerFile'] = ''
        if roseExists:
            try:
                roseOutputFiles = os.listdir("%s%s_ROSE" % (roseFolder, name))
            except OSError:
                #no rose folder for this dataset
                roseOutputFiles = None

            if roseOutputFiles is not None:
                if enhancerType not in enhancerStringDict:
                    #previously surfaced as a NameError on an unset variable
                    raise ValueError(
                        "unknown enhancerType %s. expected one of %s" %
                        (enhancerType, sorted(enhancerStringDict.keys())))
                enhancerString = enhancerStringDict[enhancerType]

                allEnhancerFileList = [
                    x for x in roseOutputFiles
                    if x.count(enhancerString) == 1 and x[0] != '.'
                ]  #no weird hidden or temp files
                if len(allEnhancerFileList) > 0:
                    nameDict[name]['enhancerFile'] = "%s%s_ROSE/%s" % (
                        roseFolder, name, allEnhancerFileList[0])

        if nameDict[name]['enhancerFile'] == '' and nameDict[name][
                'enrichedFile'] == '':
            print("INSUFFICIENT DATA TO RUN ENAHNCER ANALYSIS ON %s. PLEASE MAKE SURE ROSE OUTPUT OR MACS ENRICHED REGION PEAKS FILE EXISTS" % (
                name))
            print(nameDict[name])
            sys.exit()
    return nameDict
コード例 #41
0
def mapMergedGFF(dataFile, nameDict, mergedGFFFile, analysisName, outputFolder,
                 maskFile):
    '''
    calls rose on the mergedGFFFile for all datasets

    dataFile: path to the data table, loaded w/ pipeline_dfci.loadDataTable
    nameDict: dictionary keyed by dataset name. each entry needs a
              'background' flag
    mergedGFFFile: path to the gff of merged regions to map
    analysisName: used to name the bash file holding the rose call
    outputFolder: parent folder. rose output goes into [outputFolder]rose/
    maskFile: passed through to callRose2 as mask

    the alphabetically first dataset is the primary rose dataset. all other
    datasets (and their backgrounds) are mapped as extra datasets

    returns the path of the stitched enhancer region map
    exits if a flagged background is missing from the data table or if the
    region map never shows up
    '''
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    roseParentFolder = "%srose/" % (outputFolder)
    utils.formatFolder(roseParentFolder, True)
    gffName = mergedGFFFile.split('/')[-1].split('.')[0]
    bashFileName = "%srose/%s_roseCall.sh" % (outputFolder, analysisName)

    #namesList[0] is the primary dataset
    #extraMap will have to have all other datasets + their backgrounds
    namesList = sorted(nameDict.keys())
    extraMap = []
    for name in namesList[1:]:

        if not nameDict[name]['background']:
            extraMap += [name]
            continue

        backgroundName = dataDict[name]['background']
        if backgroundName not in dataDict:
            print("ERROR: UNABLE TO FIND LISTED BACKGROUND DATASET %s FOR %s" % (
                backgroundName, name))
            sys.exit()
        extraMap += [name, backgroundName]

    print(extraMap)

    #first check to see if this has already been done
    mergedRegionMap = "%srose/%s_ROSE/%s_0KB_STITCHED_ENHANCER_REGION_MAP.txt" % (
        outputFolder, namesList[0], gffName)
    print("LOOKING FOR REGION MAP AT %s" % (mergedRegionMap))

    if utils.checkOutput(mergedRegionMap, 1, 1):
        print("FOUND PREVIOUS REGION MAP")

        return mergedRegionMap

    #callRose2 hands back the bash file it wrote
    bashFileName = pipeline_dfci.callRose2(dataFile,
                                           '',
                                           roseParentFolder, [namesList[0]],
                                           extraMap,
                                           mergedGFFFile,
                                           0,
                                           0,
                                           bashFileName,
                                           mask=maskFile)

    bashCommand = "bash %s" % (bashFileName)
    #announce the command before the blocking call so the log reads in order
    print("Running enhancer mapping command:\n%s" % (bashCommand))
    os.system(bashCommand)

    if utils.checkOutput(mergedRegionMap, 5, 60):
        return mergedRegionMap

    print("UNABLE TO CALL ROSE ENHANCER MAPPING ON CONSENSUS ENHANCER FILE %s.\nEXITING NOW" % (
        mergedGFFFile))
    sys.exit()
0
def launchEnhancerMapping(dataFile,nameDict,outputFolder,roseFolder,stitch,tssDistance,enhancerType,maskFile=''):

    '''
    launches enhancer mapping if needed from enriched region files

    dataFile: path to the data table handed to callRose2
    nameDict: dictionary keyed by dataset name. each entry needs an
              'enhancerFile' and an 'enrichedFile' path ('' if missing)
    outputFolder: not used by this function
    roseFolder: folder that receives one [NAME]_ROSE folder per dataset
    stitch, tssDistance, maskFile: passed through to callRose2
    enhancerType: 'super', 'stretch' or 'superstretch'. picks which rose
                  table is waited for

    returns nameDict w/ 'enhancerFile' filled in for every dataset
    raises ValueError for an unknown enhancerType when rose has to be called
    '''

    #file name fragment identifying the rose table for each enhancer type
    enhancerStringDict = {
        'super': 'AllEnhancers.table.txt',
        'stretch': 'AllEnhancers_Length.table.txt',
        'superstretch': 'AllEnhancers_SuperStretch.table.txt',
    }

    namesList = nameDict.keys()

    #check to see if everything is good, if so return and call it a day
    if all(len(nameDict[x]['enhancerFile']) > 0 for x in namesList):
        print("ENHANCER FILE OUTPUT FOUND FOR ALL DATASETS")
        return nameDict

    #if not, have to call rose
    #check the enhancer type before any background job gets launched
    #an unknown type used to fail w/ a NameError only after rose was running
    if enhancerType not in enhancerStringDict:
        raise ValueError("unknown enhancerType %s. expected one of %s" % (enhancerType,sorted(enhancerStringDict.keys())))
    enhancerString = enhancerStringDict[enhancerType]

    roseOutputFolder = utils.formatFolder(roseFolder,True)

    queueList = []
    for name in namesList:

        #only datasets w/o an enhancer file need a rose call
        if nameDict[name]['enhancerFile'] != '':
            continue

        #get the enriched file
        enrichedFile = nameDict[name]['enrichedFile']
        #call rose
        print("CALLING ROSE FOR %s" % (name))
        bashFileName = pipeline_dfci.callRose2(dataFile,'',roseOutputFolder,[name],[],enrichedFile,tssDistance,stitch,mask=maskFile)
        print(bashFileName)
        #run in the background so all datasets map in parallel
        os.system('bash %s &' % (bashFileName))
        #add name to queue list
        queueList.append(name)

    #now check for completion of datasets
    for name in queueList:

        #check for the enhancer table under the default rose naming
        enhancerFile = "%s%s_ROSE/%s_peaks_%s" % (roseOutputFolder,name,name,enhancerString)

        print("CHECKING FOR %s ROSE OUTPUT IN %s" % (name,enhancerFile))
        if utils.checkOutput(enhancerFile,1,10):

            print("FOUND ENHANCER OUTPUT FOR %s" % (name))
            nameDict[name]['enhancerFile'] = enhancerFile
            continue

        #try finding it w/ a different name
        #os.listdir raises OSError if the dataset rose folder is not there
        datasetRoseFolder = "%s%s_ROSE/" % (roseOutputFolder,name)
        roseFileList = [x for x in os.listdir(datasetRoseFolder) if x[0] != '.'] #no hidden files
        if len(roseFileList) == 0:
            print("No files found in %s" % (datasetRoseFolder))
            sys.exit()
        nameDict[name]['enhancerFile'] = getFile(enhancerString,roseFileList,datasetRoseFolder)

    return nameDict
コード例 #43
0
def main():
    '''
    main run function for the dynamic enhancer analysis

    parses the command line, merges the enhancers of two groups of datasets,
    maps signal to the merged regions w/ rose, then runs the delta, rank and
    region plot R scripts and maps genes to the differential regions

    exits after printing help if a required option is missing
    exits w/ an error message whenever an expected intermediate output
    file is not found
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] {-r [ROSE_FOLDERS] | -i [INPUT_GFF]} -o [OUTPUT_FOLDER] --group1 [GROUP1_NAMES] --group2 [GROUP2_NAMES] --name1 [GROUP1_NAME] --name2 [GROUP2_NAME]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
                      help = "Enter the genome build (HG18,HG19,MM9,RN4) for the project")
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter the data file for the project")
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the output folder for the project")
    parser.add_option("--group1", dest="group1",nargs = 1, default=None,
                      help = "Enter a comma separated list of dataset names associated with the first group")
    parser.add_option("--group2", dest="group2",nargs = 1, default=None,
                      help = "Enter a comma separated list of dataset names associated with the second group")
    parser.add_option("--name1", dest="name1",nargs = 1, default=None,
                      help = "Enter a name for the first group of datasets")
    parser.add_option("--name2", dest="name2",nargs = 1, default=None,
                      help = "Enter a name for the second group of datasets")

    #the input options
    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a comma separated list of meta rose folders")

    #optional input to supercede the meta rose (this is kinda sad but will fix later)
    #should have had this code run clustering from the get go
    parser.add_option("-i","--input", dest="input",nargs = 1, default=None,
                      help = "enter a gff, bed or table of regions to perform dyanmic analysis on")

    #additional options
    parser.add_option("-p","--plot", dest="plot",action = 'store_true', default=False,
                      help = "If flagged, will plot differential regions")
    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "If flagged, will run analysis for all enhancers and not just supers.")
    parser.add_option("-m","--median", dest="median",action = 'store_true', default=False,
                      help = "If flagged, will use median enhancer scaling")
    parser.add_option("-e","--enhancer-type", dest="enhancer_type",nargs = 1,default='super',
                      help = "specify type of enhancer to analyze: super, stretch, superStretch")
    parser.add_option("--use-background", dest="background",action = 'store_true',default=False,
                      help = "If flagged will use background datasets as in data table")

    (options,args) = parser.parse_args()

    print(options)
    print(args)

    requiredArgs = [options.genome,options.data,options.rose,options.output,options.group1,options.group2,options.name1,options.name2]

    #explicit test instead of assert, which is skipped when python runs w/ -O
    if not all(requiredArgs):
        parser.print_help()
        sys.exit()

    #now the main run of the function

    #getting the genome and data file
    genome = options.genome.upper()
    dataFile = options.data

    #getting the rose folders
    #exactly two folders are needed, one per group
    roseFolderList = options.rose.split(',')
    if len(roseFolderList) != 2:
        print('ERROR: -r/--rose REQUIRES EXACTLY TWO COMMA SEPARATED ROSE FOLDERS')
        parser.print_help()
        sys.exit()
    roseFolder1,roseFolder2 = roseFolderList
    parentFolder = utils.formatFolder(options.output,True)

    #getting the analysis names
    name1 = options.name1
    name2 = options.name2
    mergeName = "%s_%s_merged" % (name1,name2)

    #getting the datasets names associated with each group
    namesList1 = options.group1.split(',')
    namesList2 = options.group2.split(',')

    #options for background correction
    useBackground = options.background

    #option for median scaling
    medianScale = options.median

    #option for an overriding set of input regions
    if options.input is not None:
        #for now only works w/ gffs
        print('Using %s as a set of predifined input regions' % (options.input))
        inputGFF = options.input
    else:
        inputGFF = ''

    plotBam = options.plot
    superOnly = not options.all

    if superOnly and plotBam:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (name1,name2,parentFolder))
    if superOnly and not plotBam:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (name1,name2,parentFolder))
    if not superOnly and plotBam:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (name1,name2,parentFolder))
    if not superOnly and not plotBam:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (name1,name2,parentFolder))

    #part 1
    print("PART1: analyzing ROSE output from %s and %s" % (name1,name2))
    #start with the all enhancer tables from the initial rose calls

    roseFolder1 = pipeline_dfci.formatFolder(roseFolder1,False)
    roseFolder2 = pipeline_dfci.formatFolder(roseFolder2,False)

    roseDict1 = makeRoseDict(roseFolder1)
    roseDict2 = makeRoseDict(roseFolder2)

    #choosing the type of enhancer to analyze
    enhancerCallType = options.enhancer_type.lower()
    if superOnly:
        print("ANALYZING ENHANCER TYPE: %s" % (enhancerCallType.upper()))
    superFile1 = roseDict1[enhancerCallType]
    superFile2 = roseDict2[enhancerCallType]

    allFile1 = roseDict1['AllEnhancer']
    allFile2 = roseDict2['AllEnhancer']

    #region maps are looked up per group but not used further in this function
    #the second one used to be read from roseDict1 by mistake
    regionFile1 = roseDict1['RegionMap']
    regionFile2 = roseDict2['RegionMap']

    #this is where we can toggle either using meta rose or clustering
    print('\tMERGING ENHANCERS AND CALLING ROSE')
    if superOnly:
        if len(superFile1) == 0:
            print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType,roseFolder1))
            sys.exit()
        if len(superFile2) == 0:
            print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType,roseFolder2))
            sys.exit()
        roseOutput = callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergeName,genome,parentFolder,namesList1,namesList2,useBackground,inputGFF)

    else:
        print('doing it right')
        print(allFile1)
        print(allFile2)

        roseOutput = callMergeSupers(dataFile,allFile1,allFile2,name1,name2,mergeName,genome,parentFolder,namesList1,namesList2,useBackground,inputGFF)
        print('this is rose output')
        print(roseOutput)
    print('\tMERGING ROSE OUTPUT')

    mergedRoseOutput,normRoseOutput = mergeRoseSignal(dataFile,roseOutput,roseDict1,roseDict2,name1,name2,namesList1,namesList2,useBackground,medianScale)

    print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')

    #part2 is the R script
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (parentFolder,genome.upper(),mergeName)
    rcmd = callDeltaRScript(mergedGFFFile,parentFolder,dataFile,name1,name2,allFile1,allFile2,medianScale,namesList1)
    print(rcmd)
    os.system(rcmd)

    time.sleep(5)
    callRoseGeneMapper(mergedGFFFile,genome,parentFolder,namesList1)

    #part 3
    #rank the delta
    print("PART 3: assinging ranks to differential enhancers")
    print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')

    gffName = '%s_%s_MERGED_REGIONS_-0_+0' % (genome.upper(),mergeName)
    enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_MERGED_ENHANCER_TO_GENE_100KB.txt" % (parentFolder,namesList1[0],gffName)
    if utils.checkOutput(enhancerToGeneFile):
        rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_MERGED_ENHANCER_TO_GENE_100KB_RANK.txt" % (parentFolder,namesList1[0],gffName)
        assignEnhancerRank(enhancerToGeneFile,allFile1,allFile2,name1,name2,rankOutput)
    else:
        print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
        sys.exit()

    #make the rank plot
    print('MAKING RANK PLOTS')
    if utils.checkOutput(rankOutput):
        print('checking for rank output %s' % (rankOutput))
        rcmd = callRankRScript(rankOutput,name1,name2,superFile1,superFile2)
        print(rcmd)
        os.system(rcmd)
    else:
        print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
        sys.exit()

    print('MAKING REGION SIGNAL PLOTS AND FINDING DIFFERENTIAL REGIONS')
    if utils.checkOutput(normRoseOutput):
        print('checking for %s' % (normRoseOutput))
        rcmd = callRegionPlotRScript(normRoseOutput,name1,name2,namesList1,namesList2)
        print(rcmd)
        os.system(rcmd)
    else:
        print('ERROR: REGION PLOT SCRIPT FAILED TO RUN')
        sys.exit()

    #NOW MAP GENES
    print('mapping genes to differential enhancers')
    statOutput,diffOutput = callRoseGeneMapper_stats(mergedGFFFile,genome,parentFolder,namesList1)

    if utils.checkOutput(statOutput):
        print('checking for gene mapping output %s' % (statOutput))
        print('FINISHED WITH GENE MAPPING')
    else:
        print('GENE MAPPING FAILED')
        sys.exit()

    print('FINISHING OUTPUT')

    finishRankOutput(dataFile,statOutput,diffOutput,genome,parentFolder,mergeName,name1,name2,namesList1,namesList2,1.0,100000,superOnly,plotBam)
コード例 #44
0
ファイル: CRC_wrapper.py プロジェクト: linlabcode/pipeline
def main():

    '''
    main run function for the crc wrapper, using argparse

    for two groups of datasets from one data table:
        I.   parses arguments and prints a summary
        II.  runs meta rose on each group if its output is missing, then
             runs dynamic rose to compare the two groups
        III. launches circuitry analysis for all datasets, group1 and group2

    exits if a dataset is missing from the data table, if the datasets come
    from more than one genome build, or if an expected output never shows up
    '''

    parser = argparse.ArgumentParser(usage='%(prog)s -d DATAFILE -1 GROUP1_NAMES -2 GROUP2_NAMES')

    # required flags
    parser.add_argument("-d", "--data_table", dest="data_table", type=str,
                      help="input a data table with all datasets to be analyzed", required=True)
    parser.add_argument("-1", "--group1", dest="group1", type=str,
                      help="input a comma separated list of all datasets in group1", required=True)
    parser.add_argument("-2", "--group2", dest="group2", type=str,
                      help="input a comma separated list of all datasets in group2", required=True)

    #optional input override
    parser.add_argument("-i", "--input", dest="input", type=str,
                        help="input a gff of regions to analyze", required=False)

    #optional arguments
    parser.add_argument("-n", "--name", dest="name", type=str,
                      help="specify a name for the analysis. Default is drawn from the data table name", required=False)

    parser.add_argument("--group1-name", dest="group1_name", default='GROUP1',type=str,
                      help="Enter a name for group1.  Default is 'GROUP1'", required=False)
    parser.add_argument("--group2-name", dest="group2_name", default='GROUP2',type=str,
                      help="Enter a name for group2.  Default is 'GROUP2'", required=False)

    parser.add_argument("-a", "--activity", dest="activity", type=str,default='',
                      help="a table with active gene names in the first column", required=False)
    parser.add_argument("-t", "--tss", dest="tss", type=int,default=2500,
                      help="Specify a TSS exclusion distance. Default is 2500", required=False)
    parser.add_argument("-s", "--stitch", dest="stitch", type=int,default=None,
                      help="Specify a stitching distance. Default is auto stitching", required=False)

    parser.add_argument("-o", "--output", dest="output", default='./',type=str,
                      help="Enter the output folder. Default is the current working directory", required=False)

    parser.add_argument("--log", dest="log", default='',type=str,
                      help="Enter a path to log output", required=False)

    #     # DEBUG OPTION TO SAVE TEMP FILES
    #     parser.add_argument("--scale", dest="scale", default='',
    #                       help="Enter a comma separated list of scaling factors for your bams. Default is none")
    #     parser.add_argument("--save-temp", dest="save", action='store_true', default=False,
    #                       help="If flagged will save temporary files made by bamPlot")
    #     parser.add_argument("--bed", dest="bed",
    #                       help="Add a space-delimited list of bed files to plot")
    #     parser.add_argument("--multi-page", dest="multi", action='store_true', default=False,
    #                       help="If flagged will create a new pdf for each region")

    args = parser.parse_args()

    #now we can begin to parse the arguments

    #=====================================================================================
    #===============================I. PARSING ARGUMENTS==================================
    #=====================================================================================
    #pulling in the data table
    data_file = os.path.abspath(args.data_table)
    dataDict = pipeline_dfci.loadDataTable(data_file)

    #setting naming conventions
    if not args.name:
        analysis_name = data_file.split('/')[-1].split('.')[0]
    else:
        analysis_name = args.name

    #getting the optional input gff
    if args.input:
        inputGFF = args.input
    else:
        inputGFF = ''

    #getting group names
    group1_name = args.group1_name
    group2_name = args.group2_name

    #getting group1, dropping empty names left by stray commas
    group1_string = args.group1
    group1_list = [name for name in group1_string.split(',') if len(name) > 0]

    #getting group2
    group2_string = args.group2
    group2_list = [name for name in group2_string.split(',') if len(name) > 0]

    #checking that all datasets are in the data table
    for name in group1_list + group2_list:
        if name not in dataDict:
            print('ERROR: DATASET %s NOT FOUND IN DATA TABLE %s. EXITING NOW' % (name,data_file))
            sys.exit()

    #loading in the genome object from the data table
    genome_list = utils.uniquify([dataDict[name]['genome'] for name in group1_list + group2_list])
    if len(genome_list) > 1:
        print('ERROR: ATTEMPTING TO ANALYZE DATASETS FROM MULTIPLE GENOME BUILDS. EXITING NOW.')
        sys.exit()

    #the load genome function has an assertion test to make sure the genome is supported
    genome = loadGenome(genome_list[0])

    parent_folder = utils.formatFolder(args.output,True)
    output_folder = utils.formatFolder(parent_folder + analysis_name,True)

    #these are the user defined optional arguments
    tss = int(args.tss)

    stitch = args.stitch
    print('stitch')
    print(stitch)

    #list of active genes to constrain analysis
    if len(args.activity) == 0:
        #assumes all genes are active unless told otherwise
        #activity_path,activity_table = getActivity() # fix this function
        print('using all active genes')
        #both names are read further down, so they have to exist on this
        #path too. they used to be unset here, which crashed the summary
        #NOTE(review): '' is passed on to launchDynamicRose and launchCRC as
        #the activity path - confirm they treat it as no activity table
        activity_path = ''
        activity_table = []
    else:
        activity_path = args.activity
        activity_table = utils.parseTable(activity_path,'\t')

    print('\n\n#======================================\n#===========I. DATA SUMMARY============\n#======================================\n')

    print('Analyzing datasets described in %s\n' % (data_file))

    print('Name for the analysis: %s\n' % (analysis_name))
    print('Using genome: %s\n' % (genome.name()))

    print('%s datasets: %s\n' % (group1_name,group1_string))
    print('%s datasets: %s\n' % (group2_name,group2_string))

    if len(activity_path) > 0:
        print('Identified %s active genes in the analysis using %s as a list of active genes' % (len(activity_table),activity_path))
    else:
        #no aggregate activity table is computed, so do not report a count
        print('No activity table provided. Assuming all genes are active')
    print('Writing output to: %s\n' % (output_folder))

    #=====================================================================================
    #======================II. DEFINING CIS-REGULATORY ELEMENTS===========================
    #=====================================================================================

    print('\n\n#======================================\n#=II. MAPPING CIS-REGULATORY ELEMENTS==\n#======================================\n')

    #crc_wrapper will act at the group level and not consider individual datasets
    #since a data table is used as the input, the code will rely heavily on pipeline_dfci
    #embedded tools

    #1. first we need to run meta rose using default parameters and check the output
    #exists for each group

    meta_rose_folder = utils.formatFolder(output_folder + 'meta_rose/',True)

    group1_output = '%s%s/%s_AllEnhancers.table.txt' % (meta_rose_folder,group1_name,group1_name)

    group2_output = '%s%s/%s_AllEnhancers.table.txt' % (meta_rose_folder,group2_name,group2_name)
    #print(group1_output)
    #print(group2_output)

    #for each output check to see if they exist
    #if not launch
    #the with block closes the probe handle again right away

    try:
        with open(group1_output,'r'):
            pass
    except IOError:
        print('No META_ROSE output found for %s. Running META_ROSE now' % (group1_name))
        launchMetaRose(group1_name,group1_list,meta_rose_folder,genome,data_file,stitch,tss)

    try:
        with open(group2_output,'r'):
            pass
    except IOError:
        print('No META_ROSE output found for %s. Running META_ROSE now' % (group2_name))
        launchMetaRose(group2_name,group2_list,meta_rose_folder,genome,data_file,stitch,tss)

    #now check for completion
    if utils.checkOutput(group1_output,1,10):
        print('META_ROSE finished for %s' % (group1_name))
    else:
        print('META_ROSE timed out for %s. EXITING NOW.' % (group1_name))
        sys.exit()

    if utils.checkOutput(group2_output,1,10):
        print('META_ROSE finished for %s' % (group2_name))
    else:
        print('META_ROSE timed out for %s. EXITING NOW.' % (group2_name))
        sys.exit()

    #Meta rose does not give all regions that are SE in at least one sample
    #and can be blown out by amplicons etc...
    #sooo we need to run clustering to generate a good input gff
    #ideally we just rewrite dynamic meta to run off of clustering output
    #until we do that let's just overwrite w/ an input gff

    print('Comparing cis-regulatory landscapes of %s and %s' % (group1_name,group2_name))
    dynamic_rose_folder = utils.formatFolder(output_folder + 'dynamic_meta_rose/',True)

    #here we will use the rank table as the primary output
    dynamic_rose_output = '%soutput/%s_%s_%s_merged_MERGED_SUPERS_RANK_TABLE.txt' % (dynamic_rose_folder,genome.name(),group1_name,group2_name)

    try:
        with open(dynamic_rose_output,'r'):
            pass
    except IOError:
        print('No DYNAMIC_ROSE output found for %s. Running DYNAMIC_ROSE now' % (analysis_name))
        launchDynamicRose(analysis_name,group1_name,group2_name,group1_list,group2_list,meta_rose_folder,dynamic_rose_folder,genome,data_file,activity_path,inputGFF)

    if utils.checkOutput(dynamic_rose_output,1,10):
        print('DYNAMIC_ROSE finsihed for %s' % (analysis_name))
    else:
        print('DYNAMIC_ROSE analysis timed out for %s. EXITING NOW.' % (analysis_name))
        sys.exit()

    #=====================================================================================
    #======================III. IDENTIFYING TF NODES IN NETWORK===========================
    #=====================================================================================

    print('\n\n#======================================\n#===III. RUNNING CIRCUITRY ANALYSIS====\n#======================================\n')

    #now we want to call circuitry on each group... ok to have different subpeaks and motif calls
    #if as a first approximation we weight by the overall enhancer

    crc_folder = utils.formatFolder('%scrc/' % (output_folder),True)

    #for all
    all_crc_folder = utils.formatFolder('%s%s' % (crc_folder,analysis_name),True)
    launchCRC(data_file,genome,dynamic_rose_output,analysis_name,group1_list+group2_list,all_crc_folder,activity_path)

    #for group1
    group1_crc_folder = utils.formatFolder('%s%s' % (crc_folder,group1_name),True)
    launchCRC(data_file,genome,dynamic_rose_output,group1_name,group1_list,group1_crc_folder,activity_path)

    #for group2
    group2_crc_folder = utils.formatFolder('%s%s' % (crc_folder,group2_name),True)
    launchCRC(data_file,genome,dynamic_rose_output,group2_name,group2_list,group2_crc_folder,activity_path)
コード例 #45
0
ファイル: extractGuides.py プロジェクト: BoulderLabs/pipeline
def extractGuideFastq(fastqFile,outputFolder='',gzip=False):

    '''
    takes a fastq and extracts candidate guide RNAs

    reads the fastq four lines at a time. for every read whose sequence
    contains cutSeq, the sequence and quality lines are trimmed to the
    guideLength bases starting cutOffset after the start of cutSeq and the
    record is written to [outputFolder][fastqRoot].gecko.fastq
    cutSeq, cutOffset and guideLength are not defined in this function and
    have to be set elsewhere in the file

    fastqFile: path to the input fastq
    outputFolder: where the guide fastq is written. defaults to the folder
                  of the input fastq
    gzip: if True, gzips the guide fastq in the background

    returns the path of the guide fastq (w/ .gz appended if gzip is set)
    '''

    #get the full absolute path for the fastq File
    fastqFile = os.path.abspath(fastqFile)

    #get the fastq name and root
    if len(outputFolder) == 0:
        outputFolder = utils.getParentFolder(fastqFile)

    #makes sure the output folder exists
    #keep the returned path since it is used as a prefix for the output file
    #other callers in this file append file names to formatFolder output
    outputFolder = utils.formatFolder(outputFolder,True)

    #grab the name info from the fastq
    fastqName = fastqFile.split('/')[-1]
    fastqRoot = fastqName.replace('.fastq','').replace('.gz','')

    #guideFastqFile output
    guideFastqFile = '%s%s.gecko.fastq' % (outputFolder,fastqRoot)

    print('processing %s' % (fastqName))
    print('million reads processed:')
    ticker = 0
    found = 0

    #try/finally so both handles get closed even if parsing fails
    fastq = utils.open(fastqFile,'r')
    try:
        guideFastq = utils.open(guideFastqFile,'w')
        try:
            while True:

                if ticker%1000000 == 0:
                    print(ticker//1000000)

                #now load the fastq lines
                #a trailing partial record is dropped
                fastqLines = []
                try:
                    for i in range(4):
                        fastqLines.append(next(fastq))
                except StopIteration:
                    break

                ticker+=1

                #see if you can find a cut site
                seq = fastqLines[1].rstrip()
                try:
                    cutPosition = seq.index(cutSeq)
                except ValueError:
                    continue
                found+=1

                guideStart = cutPosition + cutOffset
                guideStop = guideStart + guideLength

                #pulling out the guide seq and matching qualities in the fastqLines
                fastqLines[1] = fastqLines[1][guideStart:guideStop] + '\n'
                fastqLines[3] = fastqLines[3][guideStart:guideStop] + '\n'

                for line in fastqLines:
                    guideFastq.write(line)
        finally:
            guideFastq.close()
    finally:
        fastq.close()

    print('SUMMARY STATISTICS')
    print(ticker)
    print(found)
    #an empty fastq has no reads to take a fraction of
    if ticker > 0:
        print(float(found)/float(ticker))
    else:
        print(0.0)

    #gzip the fastq
    #runs in the background, so the .gz may not exist yet when this returns
    if gzip:
        os.system('gzip %s &' % (guideFastqFile))
        guideFastqFile += '.gz'

    return guideFastqFile
コード例 #46
0
def main():
    """
    Split a ChIP-Rx SAM file into a reference-genome and a spike-in part.

    Command-line flags:
        -i/--input   path to the SAM file; must end in .sam (any case)
        -g/--genome  name of the main reference genome
        -s/--spike   name of the spike-in genome
        -d/--dest    optional folder to move the resulting bam files to

    The header files registered for the genome/spike combination are looked
    up in genomeDict and handed to split_sam() together with two output
    paths derived from the input path (foo.sam -> foo.GENOME.sam and
    foo.SPIKE.sam).  Prints the usage text and exits when -i, -g or -s is
    missing; prints an error and exits for a wrong extension or an
    unsupported genome combination.
    """
    parser = argparse.ArgumentParser(usage='%(prog)s -i SAMFILE -g REF_GENOME -s SPIKE_GENOME')

    # -i, -g and -s are all needed for a run; they stay optional at the
    # argparse level so that a bare invocation prints the help text below
    parser.add_argument("-i", "--input", dest="inputSam", type=str,
                        help="Enter a sam file", required=False)
    parser.add_argument("-g", "--genome", dest="genome", type=str,
                        help="specify the main reference genome", required=False)
    parser.add_argument("-s", "--spike", dest="spike", type=str,
                        help="specify the spike in  genome", required=False)

    parser.add_argument("-d", "--dest", dest="dest", type=str,
                        help="specify an optional destination for the final bams to move to", required=False)

    args = parser.parse_args()

    print(args)

    if not (args.inputSam and args.genome and args.spike):
        parser.print_help()
        sys.exit()

    print('FORMATTING %s FOR CHIP_RX USING REFERENCE GENOME %s and SPIKE_IN GENOME %s' % (args.inputSam,args.genome,args.spike))
    samPath = args.inputSam

    # the extension is sliced off by position below, so the path really has
    # to END in .sam -- a '.sam' somewhere in the middle is not enough
    if not samPath.upper().endswith('.SAM'):
        print('ERROR, file must end in .sam or .SAM')
        sys.exit()

    # header files are registered per REFERENCE_SPIKE combination
    genome_string = ('%s_%s' % (args.genome,args.spike)).upper()

    genomeDict = {'RN6_DM6':['/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index_dm6/rn6_header.SAM','/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index_dm6/dm6_header.SAM'],
                  }

    if genome_string not in genomeDict:
        print('ERROR: UNSUPPORTED GENOME COMBINATION %s' % (genome_string))
        sys.exit()
    print('IDENTIFIED HEADERS FOR GENOME COMBINATION %s' %(genome_string))

    header1, header2 = genomeDict[genome_string]
    suffix = ('_%s' % (args.spike)).lower()

    # insert the genome name in front of the extension; built by slicing so
    # that a '.sam' occurring earlier in the path (e.g. in a directory name)
    # is left untouched
    samRoot, samExtension = samPath[:-4], samPath[-4:]
    outfile1 = '%s.%s%s' % (samRoot, args.genome, samExtension)
    outfile2 = '%s.%s%s' % (samRoot, args.spike, samExtension)
    split_sam(suffix,samPath,header1,header2,outfile1,outfile2)

    # optionally move everything matching *bam* next to the sam file
    if args.dest:
        bamFolder = utils.formatFolder(args.dest,False)

        samFolder = utils.getParentFolder(samPath)

        mv_cmd = 'mv %s*bam* %s' % (samFolder,bamFolder)
        print('MOVING BAMS FROM %s TO %s' % (samFolder,bamFolder))
        os.system(mv_cmd)
コード例 #47
0
ファイル: ROSE2_META.py プロジェクト: linlabcode/pipeline
import subprocess
import string

from collections import defaultdict


#==================================================================
#=========================GLOBAL===================================
#==================================================================


# Directory that contains this script (os.path.realpath resolves symlinks
# first, so this is the real location rather than that of a link to it)
whereAmI = os.path.dirname(os.path.realpath(__file__))
print(whereAmI)
# Same directory passed through utils.formatFolder
# NOTE(review): the False flag presumably means "do not create" -- confirm in utils
pipeline_dir = utils.formatFolder(whereAmI,False)

print('RUNNING ROSE2_META.py FROM %s' % (whereAmI))

# Bare command name: samtools must be installed and on the PATH
samtoolsPath = 'samtools'

# Bare script name: bamliquidator must be installed and on the PATH
bamliquidator_path = 'bamliquidator_batch.py'

#==================================================================
#=====================HELPER FUNCTIONS=============================
#==================================================================

def getBamChromList(bamFileList):
コード例 #48
0
ファイル: ROSE2_main.py プロジェクト: afederation/pipeline
def main():
    '''
    Command-line entry point for ROSE2.

    Steps, in order:
      1. parse the flags (-g, -i, -r and -o are required; usage is printed
         and the script exits if one is missing)
      2. copy or convert the input regions into <out>/gff/
      3. optionally drop reference regions overlapping a --mask file
      4. stitch the regions with regionStitching() and write the stitched gff
      5. map every bam to the stitched and to the original gff with
         bamliquidator_batch.py (skipped when the output already exists)
      6. build the enhancer region map with mapCollection()
      7. run ROSE2_callSuper.R, then ROSE2_geneMapper.py on the super,
         stretch and super-stretch enhancer tables
    '''
    debug = False

    from optparse import OptionParser
    usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]"
    parser = OptionParser(usage=usage)
    # required flags
    parser.add_option("-i", "--i", dest="input", nargs=1, default=None,
                      help="Enter a .gff or .bed file of binding sites used to make enhancers")
    parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-o", "--out", dest="out", nargs=1, default=None,
                      help="Enter an output folder")
    parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None,
                      help="Enter the genome build (MM9,MM8,HG18,HG19)")

    # optional flags
    parser.add_option("-b", "--bams", dest="bams", nargs=1, default=None,
                      help="Enter a comma separated list of additional bam files to map to")
    parser.add_option("-c", "--control", dest="control", nargs=1, default=None,
                      help="bamfile to rank enhancer by")
    parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='',
                      help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter")
    parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0,
                      help="Enter a distance from TSS to exclude. 0 = no TSS exclusion")

    parser.add_option("--mask", dest="mask", nargs=1, default=None,
                      help="Mask a set of regions from analysis.  Provide a .bed or .gff of masking regions")

    (options, args) = parser.parse_args()

    if not options.input or not options.rankby or not options.out or not options.genome:
        parser.print_help()
        # sys.exit rather than the interactive-only exit() helper
        sys.exit()

    # output folder layout (folders are created if they do not exist)
    outFolder = utils.formatFolder(options.out, True)
    gffFolder = utils.formatFolder(outFolder + 'gff/', True)
    mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True)

    # input regions: a .bed is converted into the gff folder, anything else
    # is treated as a gff and copied there
    inputExtension = options.input.split('.')[-1]
    if inputExtension == 'bed':
        inputGFFName = options.input.split('/')[-1][0:-4]
        inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName)
        utils.bedToGFF(options.input, inputGFFFile)
    else:
        if inputExtension != 'gff':
            print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT')
        inputGFFFile = options.input
        os.system('cp %s %s' % (inputGFFFile, gffFolder))

    # bams to process: rankby first, then control, then any extras
    bamFileList = [options.rankby]
    if options.control:
        bamFileList.append(options.control)
    if options.bams:
        bamFileList += options.bams.split(',')
        bamFileList = utils.uniquify(bamFileList)

    # '' asks regionStitching to determine the stitching distance itself
    if options.stitch == '':
        stitchWindow = ''
    else:
        stitchWindow = int(options.stitch)

    # a non-zero TSS window switches TSS exclusion on
    tssWindow = int(options.tss)
    removeTSS = tssWindow != 0

    print('USING %s AS THE INPUT GFF' % (inputGFFFile))
    inputName = inputGFFFile.split('/')[-1].split('.')[0]

    genome = options.genome
    print('USING %s AS THE GENOME' % genome)

    # annotation files are resolved relative to the current working directory
    cwd = os.getcwd()
    genomeDict = {
        'HG18': '%s/annotation/hg18_refseq.ucsc' % (cwd),
        'MM9': '%s/annotation/mm9_refseq.ucsc' % (cwd),
        'HG19': '%s/annotation/hg19_refseq.ucsc' % (cwd),
        'MM8': '%s/annotation/mm8_refseq.ucsc' % (cwd),
        'MM10': '%s/annotation/mm10_refseq.ucsc' % (cwd),
    }

    annotFile = genomeDict[genome.upper()]

    print('MAKING START DICT')
    startDict = utils.makeStartDict(annotFile)

    print('LOADING IN GFF REGIONS')
    referenceCollection = utils.gffToLocusCollection(inputGFFFile)

    # optional masking: drop every reference locus that overlaps a mask region
    if options.mask:
        maskFile = options.mask
        maskExtension = maskFile.split('.')[-1].upper()
        if maskExtension == 'BED':
            maskGFF = utils.bedToGFF(maskFile)
        elif maskExtension == 'GFF':
            maskGFF = utils.parseTable(maskFile, '\t')
        else:
            print("MASK MUST BE A .gff or .bed FILE")
            sys.exit()
        maskCollection = utils.gffToLocusCollection(maskGFF)

        referenceLoci = referenceCollection.getLoci()
        filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0]
        print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile))
        referenceCollection = utils.LocusCollection(filteredLoci, 50)

    print('STITCHING REGIONS TOGETHER')
    stitchedCollection, debugOutput, stitchWindow = regionStitching(inputGFFFile, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS)

    print('MAKING GFF FROM STITCHED COLLECTION')
    stitchedGFF = utils.locusCollectionToGFF(stitchedCollection)
    # make sure start <= stop on every line
    for line in stitchedGFF:
        start = int(line[3])
        stop = int(line[4])
        if start > stop:
            line[3] = stop
            line[4] = start

    print(stitchWindow)
    print(type(stitchWindow))

    # floor division keeps the whole-KB label (e.g. '12KB', '0KB') on
    # Python 3 as well, where '/' would produce '12.5KB' or '0.0KB'
    stitchKB = str(stitchWindow // 1000)
    if removeTSS:
        stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, stitchKB)
    else:
        stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, stitchKB)
    stitchedGFFFile = '%s%s.gff' % (gffFolder, stitchedGFFName)
    debugOutFile = '%s%s.debug' % (gffFolder, stitchedGFFName)

    if debug:
        print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile))
        utils.unParseTable(debugOutput, debugOutFile, '\t')

    print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile))
    utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t')

    outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt'
    print('OUTPUT WILL BE WRITTEN TO  %s' % (outputFile1))

    # Prefer the cluster copy of bamliquidator_batch.py, otherwise fall back
    # to a copy in the current working directory.  The script is launched as
    # "python <path>", so it has to be an actual file rather than a command
    # found through $PATH.
    bamliquidator_path = '/ark/home/jdm/pipeline/bamliquidator_batch.py'
    if not os.path.isfile(bamliquidator_path):
        bamliquidator_path = 'bamliquidator_batch.py'
        if not os.path.isfile(bamliquidator_path):
            raise ValueError('bamliquidator_batch.py not found in path')

    # every bam is mapped twice: to the stitched gff and to the original gff
    regionSets = [(stitchedGFFName, stitchedGFFFile), (inputName, inputGFFFile)]
    for bamFile in bamFileList:

        bamFileName = bamFile.split('/')[-1]

        for regionName, regionFile in regionSets:
            mappedOutFolder = '%s%s_%s_MAPPED' % (mappedFolder, regionName, bamFileName)
            mappedOutFile = '%s/matrix.gff' % (mappedOutFolder)

            # reuse an existing mapping instead of recomputing it
            if utils.checkOutput(mappedOutFile, 0.2, 0.2):
                print("FOUND %s MAPPING DATA FOR BAM: %s" % (regionFile, mappedOutFile))
                continue

            mapCmd = "python " + bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (regionFile, mappedOutFolder, bamFile)
            print(mapCmd)

            mapProcess = subprocess.Popen(mapCmd, stdout=subprocess.PIPE, shell=True)
            mapStdout = mapProcess.communicate()[0]
            # any stdout output is taken as the sign that mapping worked
            if len(mapStdout) > 0:
                print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (regionFile, bamFileName))
            else:
                print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (regionFile, bamFileName))
                sys.exit()

    print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS')
    # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR
    mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName)

    print('CALLING AND PLOTTING SUPER-ENHANCERS')

    # the R script receives the control bam's file name, or 'NONE'
    if options.control:
        controlName = options.control.split('/')[-1]
    else:
        controlName = 'NONE'
    cmd = 'R --no-save %s %s %s %s < ROSE2_callSuper.R' % (outFolder, outputFile1, inputName, controlName)
    print(cmd)

    os.system(cmd)

    # NOTE(review): presumably gives the R outputs time to land on disk
    # before the gene mapper reads them -- confirm before changing
    time.sleep(20)

    # run the gene mapper once per enhancer table written by the R script
    for tableType in ('SuperEnhancers', 'StretchEnhancers', 'SuperStretchEnhancers'):
        tableFile = "%s_%s.table.txt" % (inputName, tableType)
        if options.control:
            cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s" % (genome, options.rankby, options.control, outFolder, tableFile)
        else:
            cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s" % (genome, options.rankby, outFolder, tableFile)
        os.system(cmd)
コード例 #49
0
def main():
    """
    Command-line entry point for the enhancer clustering analysis.

    Requires -d (data file) and -o (output folder); prints the usage text
    and exits otherwise.  The run then proceeds through these steps:
      1. makeNameDict()            - locate the working files per dataset
      2. launchEnhancerMapping()   - run enhancer mapping where needed
      3. makeMedianDict()          - median enhancer signal per sample
      4. mergeCollections()        - consensus enhancer regions (gff)
      5. mapMergedGFF()            - map the data to the consensus regions
      6. makeEnhancerSignalTable() - build the enhancer signal table
      7. callRScript()             - clustering output
      8. ROSE2_geneMapper.py on the cluster table

    All messages use the print() call form so the function parses on
    Python 2 and Python 3 alike.
    """
    from optparse import OptionParser

    usage = "usage: %prog [options] -d [DATA_FILE] -i [INPUT_LIST] -r [ROSE_FOLDER] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter a data file for datasets to be processed")
    parser.add_option("-o","--output",dest="output",nargs =1, default = None,
                      help = "specify an output folder to write results to")

    #additional options
    parser.add_option("-i","--input", dest="input",nargs = 1, default=None,
                      help = "Enter a comma separated list of names to analyze. Default will be all datasets")

    parser.add_option("-n","--name", dest="name",nargs=1,default=None,
                      help = "Enter a name for the analysis")

    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a folder to detect or write rose output")

    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "flag to run analysis on ALL enhancers (this is much slower)")
    parser.add_option("-s","--stitch", dest="stitch",nargs = 1, default='',
                      help = "specify a fixed stitch distance for all datasets, otherwise will compute stitching automatically on each dataset")
    parser.add_option("-e","--enhancer-type", dest="enhancer_type",nargs = 1,default='super',
                      help = "specify type of enhancer to analyze: super, stretch, superStretch")

    parser.add_option("-t","--tss", dest="tss",nargs = 1, default=2500,
                      help = "specify a tss exclusion window. default is 2500bp")

    parser.add_option("--mask",dest="mask",nargs=1,default=None,
                      help = 'Create a mask set of regions to filter out of analysis. must be .bed or .gff format')

    (options,args) = parser.parse_args()

    print(options)
    print(args)

    #the data file and the output folder are the minimum needed to run
    if not (options.data and options.output):
        parser.print_help()
        sys.exit()

    dataFile = options.data

    #check and create the output folder
    outputFolder = utils.formatFolder(options.output,True)

    #the rose folder defaults to a rose/ folder inside the output folder
    if options.rose:
        roseFolder = options.rose
    else:
        roseFolder = "%srose/" % (outputFolder)

    #an empty names list means all datasets
    if options.input:
        namesList = options.input.split(',')
    else:
        namesList = []

    #the genome is taken from the first dataset of the data table
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    genome = dataDict[list(dataDict.keys())[0]]['genome']

    #restrict to supers unless -a was given
    superOnly = not options.all

    analysisName = options.name or "enhancers"

    #'' (the default) means no fixed stitching distance
    stitch = str(options.stitch)

    tssDistance = int(options.tss)

    #check enhancer type
    enhancerType = options.enhancer_type.lower()
    if enhancerType not in ['super','superstretch','stretch']:
        print("ERROR: unsupported enhancer type %s" % (enhancerType))
        sys.exit()

    #'' means no mask
    maskFile = options.mask or ''

    #=====================================================
    #=================SUMMARIZE INPUTS====================
    #=====================================================

    print("WORKING IN GENOME %s" % (genome))
    print("DRAWING DATA FROM %s AND ROSE FOLDER %s" % (dataFile,roseFolder))
    print("USING %s AS THE OUTPUT FOLDER" % (outputFolder))

    #=====================================================
    #==============ESTABLISH ALL WORKING FILES============
    #=====================================================

    print("\n\n\nESTABLISHING WORKING FILES")
    nameDict = makeNameDict(dataFile,roseFolder,namesList,enhancerType)

    print(nameDict)

    print("STARTING ANALYSIS ON THE FOLLOWING DATASETS:")
    print(list(nameDict.keys()))

    #flag the datasets that have no enhancer file yet
    for name in nameDict.keys():
        if len(nameDict[name]['enhancerFile']) == 0:
            print("NO ROSE OUTPUT FOR %s" % (name))

    #=====================================================
    #==============LAUNCH ENHANCER MAPPING================
    #=====================================================

    print("\n\n\nLAUNCHING ENHANCER MAPPING (IF NECESSARY)")
    nameDict = launchEnhancerMapping(dataFile,nameDict,outputFolder,roseFolder,stitch,tssDistance,enhancerType,maskFile)
    print(nameDict)

    #=====================================================
    #====================GET MEDIAN SIGNAL================
    #=====================================================

    print("\n\n\nGETTING MEDIAN ENHANCER SIGNAL FROM EACH SAMPLE")
    medianDict = makeMedianDict(nameDict)

    print(medianDict)

    #=====================================================
    #====================MERGING ENHANCERS================
    #=====================================================

    print("\n\n\nIDENTIFYING CONSENSUS ENHANCER REGIONS")

    mergedGFFFile = "%s%s_%s_-0_+0.gff" % (outputFolder,genome,analysisName)
    mergedGFFFile = mergeCollections(nameDict,analysisName,mergedGFFFile,superOnly)

    #=====================================================
    #===============MAP TO MERGED REGIONS=================
    #=====================================================

    print("\n\n\nMAPPING DATA TO CONSENSUS ENHANCER REGIONS")
    mergedRegionMap = mapMergedGFF(dataFile,nameDict,mergedGFFFile,analysisName,outputFolder,maskFile)

    #=====================================================
    #==============CORRECT FOR MEDIAN SIGNAL==============
    #=====================================================

    print("\n\n\nCREATING ENHANCER SIGNAL TABLE")
    signalTableFile = makeEnhancerSignalTable(nameDict,mergedRegionMap,medianDict,analysisName,genome,outputFolder)

    #=====================================================
    #===============CALL CLUSTERING R SCRIPT==============
    #=====================================================

    print("\n\n\nGENERATING CLUSTERING OUTPUT")
    clusterTableFile = callRScript(genome,outputFolder,analysisName,signalTableFile)
    #output should be
    #png of cluster gram with rows as genes
    #png of cluster gram of samples w/ tree
    #ordered table w/ cluster assignment
    #similarity matrix for samples

    #=====================================================
    #=============GENE MAPPING BY CLUSTER=================
    #=====================================================

    #NOTE(review): the pipeline location is hard coded to one user's home
    #folder -- confirm it matches the install before running elsewhere
    os.chdir('/ark/home/cl512/pipeline/')
    cmd = 'python /ark/home/cl512/pipeline/ROSE2_geneMapper.py -g %s -i %s' % (genome,clusterTableFile)
    os.system(cmd)

    print("FINISHED")
コード例 #50
0
def main():
    '''
    Command-line entry point for the dynamic enhancer analysis of two datasets.

    Requires -g, -d, -r, -o and -n; prints the usage text and exits if one
    of them is missing.  -r and -n must each hold exactly two comma
    separated values (rose folders / dataset names), otherwise the
    unpacking raises ValueError.

    Steps:
      1. read both rose folders with makeRoseDict() and merge the chosen
         enhancer tables through callMergeSupers()
      2. run the delta R script and the rose gene mapper
      3. rank the differential enhancers, plot the ranks and finish the
         output with finishRankOutput()

    All messages use the print() call form so the function parses on
    Python 2 and Python 3 alike.
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] -n [DATA_NAMES] -r [ROSE_FOLDERS] -o [OUTPUT_FOLDER]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
                      help = "Enter the genome build (HG18,HG19,MM9,RN4,RN6) for the project")
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter the data file for the project")
    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a comma separated list of rose folder")
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the output folder for the project")
    parser.add_option("-n","--names", dest="names",nargs = 1, default=None,
                      help = "Enter a comma separated list of names to go with the datasets")

    #additional options
    parser.add_option("-p","--plot", dest="plot",action = 'store_true', default=False,
                      help = "If flagged, will plot differential regions")
    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "If flagged, will run analysis for all enhancers and not just supers.")
    parser.add_option("-m","--median", dest="median",action = 'store_true', default=False,
                      help = "If flagged, will use median enhancer scaling")
    parser.add_option("-e","--enhancer-type", dest="enhancer_type",nargs = 1,default='super',
                      help = "specify type of enhancer to analyze: super, stretch, superStretch")

    (options,args) = parser.parse_args()

    print(options)
    print(args)

    if not (options.genome and options.data and options.rose and options.output and options.names):
        parser.print_help()
        sys.exit()

    genome = options.genome.upper()
    dataFile = options.data

    #exactly two rose folders and two names are expected
    roseFolder1,roseFolder2 = options.rose.split(',')
    parentFolder = utils.formatFolder(options.output,True)

    name1,name2 = options.names.split(',')

    mergeName = "%s_%s_merged" % (name1,name2)

    #option for median scaling
    medianScale = options.median

    plotBam = options.plot
    #restrict to supers unless -a was given
    superOnly = not options.all

    #announce which of the four run modes was selected
    if superOnly:
        if plotBam:
            print("Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (name1,name2,parentFolder))
        else:
            print("Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (name1,name2,parentFolder))
    else:
        if plotBam:
            print("Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (name1,name2,parentFolder))
        else:
            print("Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (name1,name2,parentFolder))

    #part 1
    print("PART1: analyzing ROSE output from %s and %s" % (name1,name2))
    #start with the all enhancer tables from the initial rose calls

    roseFolder1 = pipeline_dfci.formatFolder(roseFolder1,False)
    roseFolder2 = pipeline_dfci.formatFolder(roseFolder2,False)

    roseDict1 = makeRoseDict(roseFolder1)
    roseDict2 = makeRoseDict(roseFolder2)

    #choosing the type of enhancer to analyze; the lower-cased type is used
    #directly as the key into the rose dictionaries
    enhancerCallType = options.enhancer_type.lower()
    if superOnly:
        print("ANALYZING ENHANCER TYPE: %s" % (enhancerCallType.upper()))
    superFile1 = roseDict1[enhancerCallType]
    superFile2 = roseDict2[enhancerCallType]

    allFile1 = roseDict1['AllEnhancer']
    allFile2 = roseDict2['AllEnhancer']

    print('\tMERGING ENHANCERS AND CALLING ROSE')
    if superOnly:
        #an empty path means the table was not found in the rose folder
        if len(superFile1) == 0:
            print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType,roseFolder1))
            sys.exit()
        if len(superFile2) == 0:
            print("ERROR: UNABLE TO FIND %s FILES IN %s" % (enhancerCallType,roseFolder2))
            sys.exit()
        roseOutput = callMergeSupers(dataFile,superFile1,superFile2,name1,name2,mergeName,genome,parentFolder)
    else:
        roseOutput = callMergeSupers(dataFile,allFile1,allFile2,name1,name2,mergeName,genome,parentFolder)

    print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')

    #part2 is the R script
    mergedGFFFile = '%s%s_%s_MERGED_REGIONS_-0_+0.gff' % (parentFolder,genome.upper(),mergeName)
    rcmd = callDeltaRScript(mergedGFFFile,parentFolder,dataFile,name1,name2,allFile1,allFile2,medianScale)
    print(rcmd)
    os.system(rcmd)

    #NOTE(review): presumably waits for the R output to land on disk -- confirm
    time.sleep(30)
    callRoseGeneMapper(mergedGFFFile,genome,parentFolder,name1)

    #part 3
    #rank the delta
    print("PART 3: assinging ranks to differential enhancers")
    print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')

    gffName = '%s_%s_MERGED_REGIONS_-0_+0' % (genome.upper(),mergeName)
    enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB.txt" % (parentFolder,name1,gffName)
    #the enhancer to gene table only exists if both part 2 steps succeeded
    if not utils.checkOutput(enhancerToGeneFile):
        print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
        sys.exit()
    rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB_RANK.txt" % (parentFolder,name1,gffName)
    assignEnhancerRank(enhancerToGeneFile,allFile1,allFile2,name1,name2,rankOutput)

    #make the rank plot
    print('MAKING RANK PLOTS')
    if not utils.checkOutput(rankOutput):
        print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
        sys.exit()
    rcmd = callRankRScript(rankOutput,name1,name2,superFile1,superFile2)
    print(rcmd)
    os.system(rcmd)

    time.sleep(30)

    print('FINISHING OUTPUT')
    finishRankOutput(dataFile,rankOutput,genome,parentFolder,mergeName,name1,name2,1,100000,superOnly,plotBam)
コード例 #51
0
def main():
    '''
    command line entry point for the enhancer clustering analysis

    parses the options, finds or launches enhancer mapping for each dataset,
    merges the enhancers into consensus regions, maps the data to them,
    writes the enhancer signal table, calls the clustering R script and
    finally runs the ROSE2 gene mapper on the resulting cluster table

    prints the help and exits if the data file or output folder is missing
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -d [DATA_FILE] -i [INPUT_LIST] -r [ROSE_FOLDER] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage=usage)

    #each entry is (flags, keyword arguments); they are registered in this order
    optionSpecs = [
        #required flags
        (("-d", "--data"),
         dict(dest="data", nargs=1, default=None,
              help="Enter a data file for datasets to be processed")),
        (("-o", "--output"),
         dict(dest="output", nargs=1, default=None,
              help="specify an output folder to write results to")),
        #additional options
        (("-i", "--input"),
         dict(dest="input", nargs=1, default=None,
              help="Enter a comma separated list of names to analyze. Default will be all datasets")),
        (("-n", "--name"),
         dict(dest="name", nargs=1, default=None,
              help="Enter a name for the analysis")),
        (("-r", "--rose"),
         dict(dest="rose", nargs=1, default=None,
              help="Enter a folder to detect or write rose output")),
        (("-a", "--all"),
         dict(dest="all", action='store_true', default=False,
              help="flag to run analysis on ALL enhancers (this is much slower)")),
        (("-s", "--stitch"),
         dict(dest="stitch", nargs=1, default='',
              help="specify a fixed stitch distance for all datasets, otherwise will compute stitching automatically on each dataset")),
        (("-e", "--enhancer-type"),
         dict(dest="enhancer_type", nargs=1, default='super',
              help="specify type of enhancer to analyze: super, stretch, superStretch")),
        (("-t", "--tss"),
         dict(dest="tss", nargs=1, default=2500,
              help="specify a tss exclusion window. default is 2500bp")),
        (("--mask",),
         dict(dest="mask", nargs=1, default=None,
              help='Create a mask set of regions to filter out of analysis. must be .bed or .gff format')),
    ]
    for flags, keywords in optionSpecs:
        parser.add_option(*flags, **keywords)

    (options, args) = parser.parse_args()

    print(options)
    print(args)

    #the data file and the output folder are the minimum arguments
    if not (options.data and options.output):
        parser.print_help()
        sys.exit()

    #=====================================================
    #===================PULL IN ARGUMENTS=================
    #=====================================================

    dataFile = options.data

    #check and create the output folder
    outputFolder = utils.formatFolder(options.output, True)

    #without -r the rose folder lives inside the output folder
    roseFolder = options.rose if options.rose else "%srose/" % (outputFolder)

    #an empty list is handed on when no input names are given
    namesList = options.input.split(',') if options.input else []

    #the genome is taken from the first entry of the data table
    dataDict = pipeline_dfci.loadDataTable(dataFile)
    genome = dataDict[dataDict.keys()[0]]['genome']

    #the -a flag switches from supers only to all enhancers
    superOnly = not options.all

    analysisName = options.name if options.name else "enhancers"

    #fixed stitch distance as a string, '' when the option was not given
    stitch = str(options.stitch)

    tssDistance = int(options.tss)

    enhancerType = options.enhancer_type.lower()
    if enhancerType not in ['super', 'superstretch', 'stretch']:
        print("ERROR: unsupported enhancer type %s" % (enhancerType))
        sys.exit()

    #'' stands for no mask
    maskFile = options.mask if options.mask else ''

    #=====================================================
    #=================SUMMARIZE INPUTS====================
    #=====================================================

    print("WORKING IN GENOME %s" % (genome))
    print("DRAWING DATA FROM %s AND ROSE FOLDER %s" % (dataFile, roseFolder))
    print("USING %s AS THE OUTPUT FOLDER" % (outputFolder))

    #=====================================================
    #==============ESTABLISH ALL WORKING FILES============
    #=====================================================

    print("\n\n\nESTABLISHING WORKING FILES")
    nameDict = makeNameDict(dataFile, roseFolder, namesList, enhancerType)

    print(nameDict)

    print("STARTING ANALYSIS ON THE FOLLOWING DATASETS:")
    print(nameDict.keys())

    #report the datasets that have no rose enhancer table yet
    for name in nameDict.keys():
        if len(nameDict[name]['enhancerFile']) == 0:
            print("NO ROSE OUTPUT FOR %s" % (name))

    #=====================================================
    #==============LAUNCH ENHANCER MAPPING================
    #=====================================================

    print("\n\n\nLAUNCHING ENHANCER MAPPING (IF NECESSARY)")
    nameDict = launchEnhancerMapping(dataFile, nameDict, outputFolder,
                                     roseFolder, stitch, tssDistance,
                                     enhancerType, maskFile)
    print(nameDict)

    #=====================================================
    #====================GET MEDIAN SIGNAL================
    #=====================================================

    print("\n\n\nGETTING MEDIAN ENHANCER SIGNAL FROM EACH SAMPLE")
    medianDict = makeMedianDict(nameDict)

    print(medianDict)

    #=====================================================
    #====================MERGING ENHANCERS================
    #=====================================================

    print("\n\n\nIDENTIFYING CONSENSUS ENHANCER REGIONS")

    mergedGFFFile = "%s%s_%s_-0_+0.gff" % (outputFolder, genome, analysisName)
    mergedGFFFile = mergeCollections(nameDict, analysisName, mergedGFFFile,
                                     superOnly)

    #=====================================================
    #===============MAP TO MERGED REGIONS=================
    #=====================================================

    print("\n\n\nMAPPING DATA TO CONSENSUS ENHANCER REGIONS")
    mergedRegionMap = mapMergedGFF(dataFile, nameDict, mergedGFFFile,
                                   analysisName, outputFolder, maskFile)

    #=====================================================
    #==============CORRECT FOR MEDIAN SIGNAL==============
    #=====================================================

    print("\n\n\nCREATING ENHANCER SIGNAL TABLE")
    signalTableFile = makeEnhancerSignalTable(nameDict, mergedRegionMap,
                                              medianDict, analysisName,
                                              genome, outputFolder)

    #=====================================================
    #===============CALL CLUSTERING R SCRIPT==============
    #=====================================================

    #expected output of the R script:
    #png of cluster gram with rows as genes
    #png of cluster gram of samples w/ tree
    #ordered table w/ cluster assignment
    #similarity matrix for samples
    print("\n\n\nGENERATING CLUSTERING OUTPUT")
    clusterTableFile = callRScript(genome, outputFolder, analysisName,
                                   signalTableFile)

    #=====================================================
    #=============GENE MAPPING BY CLUSTER=================
    #=====================================================

    #the gene mapper is run from inside the pipeline folder
    os.chdir('/ark/home/cl512/pipeline/')
    cmd = 'python /ark/home/cl512/pipeline/ROSE2_geneMapper.py -g %s -i %s' % (genome, clusterTableFile)
    os.system(cmd)

    print("FINISHED")
コード例 #52
0
def main():

    '''
    main run function

    parses the command line and writes a bash script with slurm headers to
    [OUTPUTFOLDER][UNIQUEID]_bwt2.sh. the script holds the commands to
    extract the fastq, optionally run fastqc, call bowtie, build, sort and
    index the bam and move (optionally symlink) the results.
    the script is only written here, it is not submitted or executed

    prints the help and exits if a required flag is missing
    exits through the option parser (status 2) if the genome has no bowtie
    index configured
    '''

    from optparse import OptionParser

    #bowtie index for every supported genome
    #defined before the parser so the -g help can list what is really available
    bowtieDict = {

        'hg19':'/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index/genome',
        'hg38':'/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/genome',
        'mm10':'/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index/genome',
        'mm9':'/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm9/Sequence/Bowtie2Index/genome',
        'rn6':'/storage/cylin/grail/genomes/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index/genome',
        'mm10_dm6':'/storage/cylin/grail/genomes/Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index_dm6/genome.mm10.dm6',
        'hg19_dm6':'/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index_dm6/genome.hg19.dm6',
        'hg19_ercc':'/storage/cylin/grail/genomes/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index_ercc/human.hg19_ercc'
        }
    supportedGenomes = ', '.join(sorted(bowtieDict.keys()))

    usage = "usage: %prog [options] -f [FASTQFILE] -g [GENOME] -u [UNIQUEID] -o [OUTPUTFOLDER]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-f","--fastq", dest="fastq",nargs = 1, default=None,
                      help = "Enter the full path of a fastq file to be mapped")
    parser.add_option("-g","--genome",dest="genome",nargs =1, default = None,
                      help = "specify a genome, options are: %s" % (supportedGenomes))
    parser.add_option("-u","--unique",dest="unique",nargs =1, default = None,
                      help = "specify a uniqueID")
    parser.add_option("-o","--output",dest="output",nargs =1, default = None,
                      help = "Specify an output folder")


    #optional arguments
    #NOTE(review): the default is the integer 1 although the help describes a
    #string - confirm what bowtieCmd expects before changing it
    parser.add_option("--param",dest="paramString",nargs =1, default = 1,
                      help = "A string of bowtie parameters")
    parser.add_option("--link-folder",dest="linkFolder",nargs =1, default = None,
                      help = "Specify a folder to symlink the bam")
    parser.add_option("-p","--paired",dest="paired",action='store_true',default = False,
                      help = "Flag for paired end data")
    parser.add_option("-S","--sam",dest="sam",action='store_true',default = False,
                      help = "Flag to save sam")
    parser.add_option("-q","--qc",dest="qc",action='store_true',default = False,
                      help = "Flag to run fastqc")



    (options,args) = parser.parse_args()

    if not options.fastq or not options.genome or not options.unique or not options.output:
        parser.print_help()
        #sys.exit instead of the site builtin exit(), which is not always available
        sys.exit()

    #retrieve the arguments
    fastqFile = options.fastq
    genome = options.genome.lower()
    uniqueID = options.unique

    #reject an unknown genome before anything is created on disk
    #this used to surface as a bare KeyError after the output folder was made
    if genome not in bowtieDict:
        parser.error('unsupported genome %s. options are: %s' % (genome,supportedGenomes))
    bowtieIndex = bowtieDict[genome]

    #make the output folder
    outputFolder = utils.formatFolder(options.output,True)

    #retrieve optional arguments
    paramString = options.paramString
    if options.linkFolder:
        linkFolder = options.linkFolder
    else:
        linkFolder =''
    pairedEnd = options.paired

    #get the temp string
    tempString = '_%s' % str(random.randint(1,10000))

    fileNameDict = makeFileNameDict(fastqFile,genome,tempString,tempParentFolder,outputFolder,linkFolder,uniqueID,pairedEnd)

    #timestamp used in the slurm log names
    ts = time.time()
    timestamp = datetime.datetime.fromtimestamp(ts).strftime('%Y%m%d_%Hh%Mm%Ss')

    #open the bashfile to write to
    #the with block closes the file even if one of the command builders raises
    bashFileName = "%s%s_bwt2.sh" % (outputFolder,uniqueID)
    with open(bashFileName,'w') as bashFile:

        #shebang
        bashFile.write('#!/usr/bin/bash\n')

        #sbatch output and error logs
        cmd = '#SBATCH --output=/storage/cylin/grail/slurm_out/bwt2_%s_%s' % (uniqueID,timestamp) + '_%j.out # Standard output and error log'
        bashFile.write(cmd+'\n')
        cmd = '#SBATCH -e /storage/cylin/grail/slurm_out/bwt2_%s_%s' % (uniqueID,timestamp) + '_%j.err # Standard output and error log'
        bashFile.write(cmd+'\n')

        cmd = 'pwd; hostname; date'
        bashFile.write(cmd+'\n')
        bashFile.write('\n\n\n')

        #make temp directory
        cmd = 'mkdir %s' % (fileNameDict['tempFolder'])
        bashFile.write(cmd+'\n')

        #extract fastq
        cmd = extractFastqCmd(fileNameDict,pairedEnd)
        bashFile.write(cmd+'\n')

        #call fastqc
        if options.qc:
            cmd = runFastQC(fastqcString,fileNameDict,pairedEnd)
            bashFile.write(cmd+'\n')

        #call bowtie
        print('THIS IS THE PARAM STRING')
        print(paramString)
        cmd = bowtieCmd(bowtieString,paramString,bowtieIndex,fileNameDict,pairedEnd)
        bashFile.write(cmd+'\n')

        #remove temp fastq
        cmd = removeTempFastqCmd(fileNameDict,pairedEnd)
        bashFile.write(cmd+'\n')

        #generate a bam
        cmd = generateTempBamCmd(samtoolsString,fileNameDict)
        bashFile.write(cmd+'\n')

        #change into the temp directory
        cmd = changeTempDir(fileNameDict)
        bashFile.write(cmd+'\n')

        #sort the bam
        cmd = sortBamCmd(samtoolsString,fileNameDict)
        bashFile.write(cmd+'\n')

        #index
        cmd = indexBamCmd(samtoolsString,fileNameDict)
        bashFile.write(cmd+'\n')

        #either keep (move) the sam or remove it
        if options.sam:
            cmd = mvSamCmd(fileNameDict)
        else:
            cmd = rmSamCmd(fileNameDict)
        bashFile.write(cmd+'\n')

        #mv bams
        cmd = mvBamCmd(fileNameDict)
        bashFile.write(cmd+'\n')

        #link bams
        if options.linkFolder:
            cmd = linkBamCmd(fileNameDict)
            bashFile.write(cmd+'\n')

    print("Wrote mapping command to %s" % (bashFileName))
コード例 #53
0
ファイル: enhancerPromoter.py プロジェクト: mufrdrk/pipeline
def callGSEA(outputFolder,
             analysisName,
             top,
             analysis_type='enhancer_vs_promoter',
             use_top=True):
    '''
    runs C2 GSEA

    writes a bash script of command line GSEA calls to
    [outputFolder][analysisName]_GSEA[suffix]_cmd.sh and runs it with bash.
    one call is always written for the top_all gct/cls pair and, when use_top
    is set, a second one for the top_[top] pair

    outputFolder: used as a raw string prefix for the gct/cls paths, the
    script path and the gsea output folders
    analysisName: prefix of the gct/cls file names and of the report labels
    top: value substituted into the top_[top] file and folder names
    analysis_type: 'enhancer_vs_promoter' or 'total_contribution', selects
    the file suffix and the class comparison appended to the cls path
    use_top: if false only the top_all call is written

    prints the supported analysis types and exits if analysis_type is not
    one of them
    '''

    #figure out the suffix for gct and cls files
    analysis_dict = {
        'enhancer_vs_promoter': ['', '#PROMOTER_versus_DISTAL'],
        'total_contribution': ['_total_contrib', '#SIGNAL_versus_BACKGROUND'],
    }

    if analysis_type not in analysis_dict:
        print(
            'Error: please use one of the following supported analysis types')
        print(analysis_dict.keys())
        sys.exit()

    suffix, comparison = analysis_dict[analysis_type]

    gseaPath = '/storage/cylin/home/cl6/gsea2-3.0_beta_2.jar'
    gmxPath = '/storage/cylin/grail/annotations/gsea/c2.all.v5.1.symbols.gmt'  #C2 set

    #one template shared by every call so the gsea settings cannot drift apart
    #fields: jar, gct, cls, class comparison, gmx, report label, output folder
    gseaCmdTemplate = (
        'java -Xmx4000m -cp %s xtools.gsea.Gsea -res %s -cls %s%s -gmx %s '
        '-collapse false -mode Max_probe -norm meandiv -nperm 1000 '
        '-permute gene_set -rnd_type no_balance -scoring_scheme weighted '
        '-rpt_label %s -metric Diff_of_Classes -sort real -order descending '
        '-include_only_symbols true -make_sets true -median false -num 100 '
        '-plot_top_x 20 -rnd_seed timestamp -save_rnd_lists false '
        '-set_max 500 -set_min 15 -zip_report false -out %s -gui false')

    #'all' is always run, the top N set only on request
    topLabels = ['all']
    if use_top:
        topLabels.append(top)

    gseaBashFilePath = '%s%s_GSEA%s_cmd.sh' % (outputFolder, analysisName,
                                               suffix)

    #the with block closes the script even if a folder cannot be created
    with open(gseaBashFilePath, 'w') as gseaBashFile:

        gseaBashFile.write('#!/usr/bin/bash\n\n')

        gseaBashFile.write(
            '#COMMAND LINE GSEA CALLS FOR %s USING %s COMPARISON\n\n' %
            (analysisName, analysis_type.upper()))

        for topLabel in topLabels:
            gctPath = '%s%s_top_%s%s.gct' % (outputFolder, analysisName,
                                             topLabel, suffix)
            clsPath = '%s%s_top_%s%s.cls' % (outputFolder, analysisName,
                                             topLabel, suffix)
            gseaOutputFolder = utils.formatFolder(
                '%sgsea_top_%s_c2%s' % (outputFolder, topLabel, suffix), True)
            rptLabel = '%s_top_%s%s' % (analysisName, topLabel, suffix)

            gseaCmd = gseaCmdTemplate % (gseaPath, gctPath, clsPath,
                                         comparison, gmxPath, rptLabel,
                                         gseaOutputFolder)

            gseaBashFile.write(gseaCmd)
            gseaBashFile.write('\n')

    #the script must be closed (flushed) before bash reads it
    os.system('bash %s' % (gseaBashFilePath))
コード例 #54
0
def makeNameDict(dataFile,roseFolder,namesList=[],enhancerType='super'):

    '''
    for each name, check for the presence of an enriched file or allEnhancer table
    these are the files required for enhancer clustering

    dataFile: data table, loaded with pipeline_dfci.loadDataTable. the macs
    enriched files are looked for in macsEnriched/ under its parent folder
    roseFolder: folder searched for [name]_ROSE output, created if missing
    namesList: dataset names to check. if empty, every dataset in the data
    table whose name does not contain WCE or INPUT is used (the default list
    is never modified)
    enhancerType: super, stretch or superstretch (case insensitive), selects
    which enhancer table is looked for in the rose output

    returns a dict keyed by name. each value holds
    'background': True if the dataset's background is in the data table
    'enrichedFile': path of the macs enriched file, '' if it cannot be opened
    'enhancerFile': path of the rose enhancer table, '' if none was found

    exits if the enhancer type is not supported or if a dataset has neither
    an enriched file nor an enhancer table
    '''

    #file name fragment of the enhancer table for each supported type
    enhancerStringDict = {
        'super': 'AllEnhancers.table.txt',
        'stretch': 'AllEnhancers_Length.table.txt',
        'superstretch': 'AllEnhancers_SuperStretch.table.txt',
        }

    #validate up front. an unsupported (or mixed case) type used to leave
    #enhancerString undefined and crash in the middle of the loop below
    enhancerType = enhancerType.lower()
    if enhancerType not in enhancerStringDict:
        print("ERROR: unsupported enhancer type %s" % (enhancerType))
        sys.exit(1)
    enhancerString = enhancerStringDict[enhancerType]

    dataDict = pipeline_dfci.loadDataTable(dataFile)

    #draw the parent folder from the dataFile
    parentFolder = utils.getParentFolder(dataFile)
    print("Using %s as the parent folder" % (parentFolder))

    #check to see if a rose folder exists already, otherwise create it
    existingRoseFolder = utils.formatFolder(roseFolder,False)
    if existingRoseFolder:
        roseExists = True
        roseFolder = existingRoseFolder
    else:
        roseExists = False
        roseFolder = utils.formatFolder(roseFolder,True)

    #if no namesList is given, use every dataset except WCE and INPUT
    if not namesList:
        namesList = [name for name in dataDict.keys()
                     if 'WCE' not in name.upper() and 'INPUT' not in name.upper()]

    #now check that all of the datasets at a minimum have a rose output OR enriched region file

    nameDict = {}
    for name in namesList:

        nameDict[name] = {}

        #check if each dataset has a background
        backgroundName = dataDict[name]['background']
        nameDict[name]['background'] = backgroundName in dataDict

        #assumes standard folder structure for enriched file
        enrichedFile = "%smacsEnriched/%s" % (parentFolder,dataDict[name]['enrichedMacs'])

        print("Looking for macs output at %s" % (enrichedFile))

        #the file counts as present only if it can actually be opened
        try:
            with open(enrichedFile,'r'):
                pass
            nameDict[name]['enrichedFile'] = enrichedFile
        except IOError:
            nameDict[name]['enrichedFile'] = ''

        #roseOutput looks for standard format rose output
        #if the rose folder didn't exist before this call, don't bother
        nameDict[name]['enhancerFile'] = ''
        if roseExists:
            try:
                roseOutputFiles = os.listdir("%s%s_ROSE" % (roseFolder,name))
            except OSError:
                #no rose output folder for this dataset
                roseOutputFiles = []

            #no weird hidden or temp files
            allEnhancerFileList = [x for x in roseOutputFiles if x.count(enhancerString) == 1 and x[0] != '.']
            if len(allEnhancerFileList) > 0:
                nameDict[name]['enhancerFile'] = "%s%s_ROSE/%s" % (roseFolder,name,allEnhancerFileList[0])

        if nameDict[name]['enhancerFile'] == '' and nameDict[name]['enrichedFile'] =='':
            print("INSUFFICIENT DATA TO RUN ENAHNCER ANALYSIS ON %s. PLEASE MAKE SURE ROSE OUTPUT OR MACS ENRICHED REGION PEAKS FILE EXISTS" % (name))
            print(nameDict[name])
            sys.exit()
    return nameDict
コード例 #55
0
def main():



    '''
    main run function

    command line entry point for the dynamic enhancer analysis of two
    datasets. builds the paths of the rose enhancer tables and of the merged
    region output, then
    part 1/2: checks for the merged rose output, runs the delta R script and
    the rose gene mapper
    part 3: assigns ranks to the differential enhancers, runs the rank plot
    R script and finishes the rank output

    the merge and rose call themselves are not made here, their output is
    expected to exist already

    prints the help and exits if a required flag is missing
    exits through the option parser (status 2) if -r or -n do not hold
    exactly two comma separated entries or a rose folder is missing
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -g [GENOME] -d [DATAFILE] -r [ROSE_FOLDERS] -o [OUTPUT_FOLDER]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
                      help = "Enter the genome build (HG18,HG19,MM9) for the project")
    parser.add_option("-d","--data", dest="data",nargs = 1, default=None,
                      help = "Enter the data file for the project")
    parser.add_option("-r","--rose", dest="rose",nargs = 1, default=None,
                      help = "Enter a comma separated list of rose folder")
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the output folder for the project")

    #additional options
    parser.add_option("-n","--names", dest="names",nargs = 1, default=None,
                      help = "Enter a comma separated list of names to go with the datasets")
    parser.add_option("-p","--plot", dest="plot",action = 'store_true', default=False,
                      help = "If flagged, will plot differential regions")
    parser.add_option("-a","--all", dest="all",action = 'store_true', default=False,
                      help = "If flagged, will run analysis for all enhancers and not just supers.")

    (options,args) = parser.parse_args()

    print(options)
    print(args)

    if not (options.genome and options.data and options.rose and options.output):
        parser.print_help()
        #sys.exit instead of the site builtin exit(), which is not always available
        sys.exit()

    genome = options.genome.upper()
    dataFile = options.data

    #exactly two rose folders are compared
    #this used to fail with a bare ValueError from the unpacking
    roseFolderList = options.rose.split(',')
    if len(roseFolderList) != 2:
        parser.error('-r requires exactly two comma separated rose folders')
    roseFolder1,roseFolder2 = roseFolderList

    if options.names:
        nameList = options.names.split(',')
        if len(nameList) != 2:
            parser.error('-n requires exactly two comma separated names')
        name1,name2 = nameList
    else:
        #derive the names from the folder names
        #trailing slashes are stripped first, otherwise the last path
        #component is empty and the derived name would be ''
        name1 = roseFolder1.rstrip('/').split('/')[-1].replace('_ROSE','')
        name2 = roseFolder2.rstrip('/').split('/')[-1].replace('_ROSE','')

    parentFolder = utils.formatFolder(options.output,True)

    mergeName = "%s_%s_merged" % (name1,name2)

    plotBam = options.plot
    superOnly = not options.all

    if superOnly and plotBam:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and plotting output to %s" % (name1,name2,parentFolder))
    elif superOnly:
        print("Running dynamic enhancer analysis on all super enhancers in %s and %s and writing output to %s" % (name1,name2,parentFolder))
    elif plotBam:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and plotting output to %s. WARNING: Plotting all differential enhancers could take a while" % (name1,name2,parentFolder))
    else:
        print("Running dynamic enhancer analysis on all enhancers in %s and %s and writing output to %s." % (name1,name2,parentFolder))

    #part 1
    print("PART1: analyzing ROSE output from %s and %s" % (name1,name2))
    #start with the all enhancer tables from the initial rose calls
    #NOTE(review): assumes pipeline_dfci.formatFolder returns a false value for
    #a missing folder, the way utils.formatFolder is used elsewhere in this
    #file - confirm. if it always returns a path this check never triggers
    formattedRoseFolders = []
    for roseFolder in (roseFolder1,roseFolder2):
        formattedFolder = pipeline_dfci.formatFolder(roseFolder,False)
        if not formattedFolder:
            parser.error('rose folder %s does not exist' % (roseFolder))
        formattedRoseFolders.append(formattedFolder)
    roseFolder1,roseFolder2 = formattedRoseFolders

    superFile1 = '%s%s_peaks_SuperEnhancers.table.txt' % (roseFolder1,name1)
    superFile2 = '%s%s_peaks_SuperEnhancers.table.txt' % (roseFolder2,name2)

    allFile1 = '%s/%s_peaks_AllEnhancers.table.txt' % (roseFolder1,name1)
    allFile2 = '%s/%s_peaks_AllEnhancers.table.txt' % (roseFolder2,name2)

    print('\tMERGING ENHANCERS AND CALLING ROSE')
    #the merged regions are named after what was merged
    if superOnly:
        gffName = '%s_%s_MERGED_SUPERS_-0_+0' % (genome,mergeName)
    else:
        gffName = '%s_%s_MERGED_ENHANCERS_-0_+0' % (genome,mergeName)
    mergedGFFFile = '%s%s.gff' % (parentFolder,gffName)

    #output of the rose call on the merged regions, must exist to go on
    superOutput = "%s%s_ROSE/%s_SuperEnhancers_ENHANCER_TO_GENE.txt" % (parentFolder,name1,gffName)

    print('\tCALCULATING ENHANCER DELTA AND MAKING PLOTS')
    if not utils.checkOutput(superOutput):
        print('ERROR: ROSE CALL FAILED')
        sys.exit()

    #part2 is the R script
    rcmd = callDeltaRScript(mergedGFFFile,parentFolder,name1,name2)
    print(rcmd)
    os.system(rcmd)
    #fixed pause before the next step reads the R output
    time.sleep(30)
    callRoseGeneMapper(mergedGFFFile,genome,parentFolder,name1)

    #part 3
    #rank the delta
    print("PART 3: assinging ranks to differential enhancers")
    print('\tASSIGNING SUPER RANK TO MERGED ENHANCERS')
    enhancerToGeneFile = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB.txt" % (parentFolder,name1,gffName)
    if not utils.checkOutput(enhancerToGeneFile):
        print('ERROR: DELTA SCRIPT OR ROSE GENE MAPPER FAILED TO RUN')
        sys.exit()
    rankOutput = "%s%s_ROSE/%s_0KB_STITCHED_ENHANCER_DELTA_ENHANCER_TO_GENE_100KB_RANK.txt" % (parentFolder,name1,gffName)
    assignEnhancerRank(enhancerToGeneFile,allFile1,allFile2,name1,name2,rankOutput)

    #make the rank plot
    print('MAKING RANK PLOTS')
    if not utils.checkOutput(rankOutput):
        print('ERROR: RANK PLOT SCRIPT FAILED TO RUN')
        sys.exit()
    rcmd = callRankRScript(rankOutput,name1,name2,superFile1,superFile2)
    print(rcmd)
    os.system(rcmd)

    time.sleep(30)

    print('FINISHING OUTPUT')
    finishRankOutput(dataFile,rankOutput,genome,parentFolder,mergeName,name1,name2,1,100000,superOnly,plotBam)
コード例 #56
0
ファイル: CRC3.py プロジェクト: linlabcode/pipeline
def main():
    """
    Command line entry point for the CRC analysis.

    Parses the command line, then runs the five stages announced in the
    progress banners: data summary, gene/enhancer mapping, valley or subpeak
    definition, motif finding and network building. Every table is written
    into the output folder using the analysis name as file prefix.

    Exit behaviour:
        - argparse exits with its own error status when a required flag
          (-e, -g, -o, -n) is missing
        - exits with status 1 when neither -b nor -s is supplied
        - calls sys.exit() once the analysis has finished
    """

    import argparse
    parser = argparse.ArgumentParser(usage="usage: prog [options] -e [ENHANCER_FILE] -b [BAM_FILE] -g [GENOME] -o [OUTPUTFOLDER] -n [NAME]" )

    #required flags
    parser.add_argument("-e","--enhancer_file", dest="enhancers", default=None,type=str,
                        help = "Provide a ROSE generated enhancer table (_AllEnhancers.table.txt)",required=True)
    parser.add_argument("-g","--genome",dest="genome", default = None,type=str,
                        help = "Provide the build of the genome to be used for the analysis. Currently supports HG19, HG18 and MM9",required=True)
    parser.add_argument("-o","--output",dest="output", default = None,type=str,
                        help = "Enter an output folder",required=True)
    parser.add_argument("-n","--name",dest="name", default = None,type=str,
                        help = "Provide a name for the job",required=True)

    #you either need bams for valleys or subpeaks
    parser.add_argument("-b","--bam",dest="bam", default = None,type=str,
                        help = "Enter a comma separated list of bams of valley finding",required=False)
    parser.add_argument("-s","--subpeaks", dest="subpeaks",default=None,type=str,
                        help = "Enter a BED file of regions to search for motifs",required=False)

    #additional options
    #NOTE: -B, -N, --motifs and -t are parsed but are not consumed anywhere in this function
    parser.add_argument("-a","--activity",dest="activity", default = None,type=str,
                        help = "A table with active gene names in the first column",required=False)
    parser.add_argument("-l","--extension-length", dest="extension", default=100,type=int,
                        help = "Enter the length to extend subpeak regions for motif finding. default is 100",required=False)
    parser.add_argument("-B","--background", dest="background", default=None,type=str,
                        help = "Provide a background BAM file",required=False)
    #the -N destination is kept distinct from the destinations of the other options
    parser.add_argument("-N", "--number", dest="number", default=1,type=int,
                        help = "Enter the number of non overlapping motifs in a region required to assign a binding event. Default=1",required=False)
    parser.add_argument("--motifs", dest="motifs", default=False,type=str,
                        help = "Enter additional PWM file for the analysis",required=False)
    parser.add_argument("-t","--tfs", dest="tfs",default=None,type=str,
                        help = "Enter additional TFs (comma separated) to be used in the bindinf analysis",required=False)
    parser.add_argument("--config", dest="config",default='',type=str,
                        help = "Enter genome configuration file to overwrite default paths",required=False)

    args = parser.parse_args()

    #=====================================================================================
    #===============================I. PARSING ARGUMENTS==================================
    #=====================================================================================

    print(args)

    #validate before loading the genome so that a bad invocation fails fast
    #and reports a non-zero exit status to the calling shell
    if args.bam is None and args.subpeaks is None:
        print('ERROR: Must provide either bams for valley finding or subpeaks as a .bed')
        sys.exit(1)

    ###
    # Define all global file names
    ###
    genome = loadGenome(args.genome,args.config)

    motifDatabaseFile = genome.returnFeature('motif_database')
    motifConvertFile = genome.returnFeature('motif_convert')

    # User input files
    enhancer_file = args.enhancers

    #set the subpeak file (an empty string counts as not provided)
    subpeakFile = args.subpeaks if args.subpeaks else None

    #will need to fix bams down the line to take in multiple bams
    if args.bam:
        #drop empty entries produced by stray commas
        bamFileList = [bam_path for bam_path in args.bam.split(',') if bam_path]
        print(bamFileList)
    else:
        bamFileList = []

    #output folder and analysis name
    print(args.output)
    output_folder = utils.formatFolder(args.output,True)
    analysis_name = args.name

    #optional arguments
    #activity path
    activity_path = args.activity

    #motif extension
    constExtension = args.extension

    print('\n\n#======================================\n#===========I. DATA SUMMARY============\n#======================================\n')

    print('Analyzing TF connectivity for %s' % (analysis_name))
    print('Writing output to %s' % (output_folder))
    if subpeakFile:
        print('Using %s to define subpeaks for motif finding' % (subpeakFile))
    else:
        print('Identifying valleys from .bam files')
    print('Using %s to define active genes' % (activity_path))

    #=====================================================================================
    #=======================II. IDENTIFYING CANDIDATE TFS AND NODES=======================
    #=====================================================================================

    print('\n\n#======================================\n#===II. MAPPING GENES AND ENHANCERS====\n#======================================\n')

    geneTable,geneTFTable,enhancerTable,enhancerTFTable,geneSummaryTable,candidate_tf_list,gene_to_enhancer_dict= geneToEnhancerDict(genome, enhancer_file, activity_path)

    #write these guys to disk
    gene_out = '%s%s_GENE_TABLE.txt' % (output_folder,analysis_name)
    gene_tf_out = '%s%s_GENE_TF_TABLE.txt' % (output_folder,analysis_name)

    enhancer_out = '%s%s_ENHANCER_TABLE.txt' % (output_folder,analysis_name)
    enhancer_tf_out = '%s%s_ENHANCER_TF_TABLE.txt' % (output_folder,analysis_name)

    summary_out= '%s%s_GENE_SUMMARY.txt' % (output_folder,analysis_name)

    utils.unParseTable(enhancerTable,enhancer_out,'\t')
    utils.unParseTable(enhancerTFTable,enhancer_tf_out,'\t')

    utils.unParseTable(geneTable,gene_out,'\t')
    utils.unParseTable(geneTFTable,gene_tf_out,'\t')

    utils.unParseTable(geneSummaryTable,summary_out,'\t')

    print('Identified %s genes w/ proximal cis-regulatory elements' % (len(gene_to_enhancer_dict)))

    print('Identified %s candidate TFs' % (len(candidate_tf_list)))
    print(candidate_tf_list)

    #=====================================================================================
    #==========================III. FINDING VALLEYS/SUBPEAKS==============================
    #=====================================================================================

    print('\n\n#======================================\n#=====III. FINDING VALLEYS/SUBPEAKS====\n#======================================\n')

    #so here we would need to find valleys everywhere
    if subpeakFile is None:
        print('finding valleys')
        #note: the tf_bed_path is for networks, all is for out degree finding
        all_bed_path = findValleys(gene_to_enhancer_dict, bamFileList, analysis_name, output_folder, cutoff = 0.2)
    else:
        print('Using subpeaks from %s' % (subpeakFile))
        all_bed_path = filterSubpeaks(subpeakFile,gene_to_enhancer_dict,analysis_name,output_folder)

    #first make the subpeak bed and subpeak fasta for the tfs
    all_sub_bed,all_fasta = generateSubpeakFASTA(gene_to_enhancer_dict, all_bed_path, genome, analysis_name,output_folder, constExtension)
    if subpeakFile is None:
        #this is the case where we did valleys #only reason you would need to output the sub bed
        all_sub_out = '%s%s_all_subpeak.bed' % (output_folder,analysis_name)
        utils.unParseTable(all_sub_bed,all_sub_out,'\t')

    #writing the all subpeak fasta out to disk
    all_fasta_out = '%s%s_all_subpeak.fasta' % (output_folder,analysis_name)
    utils.unParseTable(all_fasta,all_fasta_out,'')

    #=====================================================================================
    #=================================IV. FINDING MOTIFS==================================
    #=====================================================================================

    print('\n\n#======================================\n#======IV. RUNNING MOTIF FINDING=======\n#======================================\n')

    #first make background
    bg_path = makeMotifBackground(all_fasta_out,output_folder,analysis_name)

    #find motifs for all regions
    fimo_out = findMotifs(all_fasta_out,bg_path,candidate_tf_list, output_folder, analysis_name, motifConvertFile, motifDatabaseFile)

    edgeDict = collapseFimo(fimo_out,gene_to_enhancer_dict,candidate_tf_list,output_folder,analysis_name,motifConvertFile)

    #=====================================================================================
    #============================V. RUNNING NETWORK ANALYSIS==============================
    #=====================================================================================

    print('\n\n#======================================\n#========V. BUILDING NETWORK===========\n#======================================\n')

    print('building graph and edge table')
    graph = buildGraph(edgeDict,gene_to_enhancer_dict,output_folder, analysis_name,cutoff=1)

    formatNetworkOutput(graph, output_folder, analysis_name, candidate_tf_list)

    print('FINISHED RUNNING CRC FOR %s' % (analysis_name))

    sys.exit()
コード例 #57
0
def main():


    '''
    main run function

    Parses the command line, derives dataset names, writes samtools idxstats
    output for the test and control bams, builds the fold table and the
    RIGER table, and finally calls RIGER with the chosen scoring method.

    Exits with status 1 (after printing help where appropriate) when:
        - any of -t, -c, -g is missing
        - -n is given but is not a comma separated test,control pair
        - the scoring method is not one of KSbyScore, WtSum, SecondBestRank
        - the genome build has no entry in the annotation dictionary
        - an idxstats output file is not found by utils.checkOutput
    '''

    from optparse import OptionParser

    usage = "usage: %prog [options] -t [TEST_BAM] -c [CONTROL_BAM] -g [GENOME]"
    parser = OptionParser(usage = usage)
    #required flags
    parser.add_option("-t","--test", dest="test",nargs = 1, default=None,
                      help = "Enter the full path of the test bam")
    parser.add_option("-c","--control", dest="control",nargs = 1, default=None,
                      help = "Enter the full path of the control bam")
    parser.add_option("-g","--genome", dest="genome",nargs = 1, default=None,
                      help = "Enter the build for the GeCKO library (currently only supports geckov2)")


    #optional arguments
    parser.add_option("-n","--name",dest="name",nargs =1, default = 0,
                      help = "Comma separated test,control name")
    parser.add_option("-s","--scoring",dest="scoring",nargs =1, default = 'WtSum',
                      help = "Scoring method (KSbyScore,WtSum,SecondBestRank) defulat: WtSum")
    parser.add_option("-o","--output", dest="output",nargs = 1, default=None,
                      help = "Enter the full path of the output folder. Default is the current working directory")


    (options,args) = parser.parse_args()

    #three required parameters to get started
    if not (options.test and options.control and options.genome):
        parser.print_help()
        sys.exit(1)

    #get the names of the datasets
    if options.name:
        nameList = options.name.split(',')
        if len(nameList) != 2:
            print("ERROR: Must provide a comma separated test,control name if using -n flag")
            parser.print_help()
            sys.exit(1)
        testName,controlName = nameList
    else:
        #try to extract names from file
        #strip extension from filename
        testName = options.test.split('/')[-1].split('.')[0]
        controlName = options.control.split('/')[-1].split('.')[0]

    #names
    print("using %s as name for test dataset" % (testName))
    print("using %s as name for control dataset" % (controlName))

    #get the analysis name
    analysisName = '%s_%s' % (testName,controlName)
    print("using %s as analysis name" % (analysisName))

    #get the scoring method
    scoringMethod = options.scoring
    if scoringMethod not in ('KSbyScore','WtSum','SecondBestRank'):
        print("ERROR: please specify one of the following scoring methods:('KSbyScore','WtSum','SecondBestRank') or leave blank (default WtSum)")
        parser.print_help()
        sys.exit(1)

    #get the right annotation
    genomeDict = {'geckov2':'/grail/genomes/gecko/GeCKOv2/Annotation/Human_GeCKOv2_Library.txt',
                  }

    #validate the genome before any folder or file is created so that an
    #unsupported build gives a readable error instead of a KeyError traceback
    genomeKey = options.genome.lower()
    if genomeKey not in genomeDict:
        print("ERROR: unsupported genome %s. Supported builds: %s" % (options.genome,','.join(sorted(genomeDict))))
        parser.print_help()
        sys.exit(1)

    #set up output folder
    if options.output:
        outputFolder = utils.formatFolder(options.output,True)
    else:
        outputFolder = utils.formatFolder('./%s/' % (analysisName),True)

    print("using %s as an output folder" % (outputFolder))

    #load the annotation dictionary
    annotFile = genomeDict[genomeKey]
    print("using %s as the annotation file" % (annotFile))

    #guideDict,geneDict = makeAnnotDict(annotFile)

    #now set up each bam
    testBam = utils.Bam(options.test)
    controlBam = utils.Bam(options.control)

    #get the MMR for each (total reads divided by one million)
    testMMR = round(float(testBam.getTotalReads())/1000000,4)
    controlMMR = round(float(controlBam.getTotalReads())/1000000,4)

    print("Test dataset: %s has an MMR of %s" % (testName,testMMR))
    print("Control dataset: %s has an MMR of %s" % (controlName,controlMMR))

    #now get the idxstats output
    #NOTE(review): the bam paths are interpolated into a shell command unquoted,
    #so paths with spaces or shell metacharacters will break these calls
    testIdxFile = '%s%s_idxstats.txt' % (outputFolder,testName)
    testIdxCmd = '%s idxstats %s > %s' % (samtoolsString,options.test,testIdxFile)
    print("Test idxstats command:")
    print(testIdxCmd)
    os.system(testIdxCmd)

    controlIdxFile = '%s%s_idxstats.txt' % (outputFolder,controlName)
    controlIdxCmd = '%s idxstats %s > %s' % (samtoolsString,options.control,controlIdxFile)
    print("Control idxstats command:")
    print(controlIdxCmd)
    os.system(controlIdxCmd)

    #stop on a missing idxstats file: the fold table below is built from both
    #files, and continuing would report them as found when they are not
    print("Checking for output")
    if not utils.checkOutput(testIdxFile,0.1,5):
        print("ERROR: UNABLE TO GENERATE IDX OUTPUT FOR %s" % (options.test))
        sys.exit(1)
    print("Found test IdxStats file")
    if not utils.checkOutput(controlIdxFile,0.1,5):
        print("ERROR: UNABLE TO GENERATE IDX OUTPUT FOR %s" % (options.control))
        sys.exit(1)
    print("Found control IdxStats file")

    #now make the fold table

    foldTableFile =makeFoldTable(annotFile,analysisName,testName,controlName,testMMR,controlMMR,testIdxFile,controlIdxFile,outputFolder,epsilon = 1)

    print('writing output to %s' % (foldTableFile))

    print("MAING FRIGER TABLE")
    rigerTableFile = makeRigerTable(foldTableFile,output='')
    print('writing FRIGER table to %s' % (rigerTableFile))

    #the return value is not used by this function
    callRiger(rigerTableFile,scoring=scoringMethod,output='',callRiger=True)
コード例 #58
0
import subprocess
#==========================================================================
#============================PARAMETERS====================================
#==========================================================================



#name of the project; used below to build the project folder path
projectName = 'slam_seq'
genome ='hg38'
#pipeline_dir is not defined in this section -- presumably set earlier in the script; TODO confirm
annotFile = '%s/annotation/%s_refseq.ucsc' % (pipeline_dir,genome)

#project folders
projectFolder = '/storage/cylin/grail/projects/%s' % (projectName) #PATH TO YOUR PROJECT FOLDER


#the sub folder templates below append directly to projectFolder, so they rely on
#formatFolder returning a path that ends in '/' -- TODO confirm against utils
projectFolder = utils.formatFolder(projectFolder,True)
#standard folder names
gffFolder ='%sgff/' % (projectFolder)
macsFolder = '%smacsFolder/' % (projectFolder)
macsEnrichedFolder = '%smacsEnriched/' % (projectFolder)
mappedEnrichedFolder = '%smappedEnriched/' % (projectFolder)
mappedFolder = '%smappedFolder/' % (projectFolder)
wiggleFolder = '%swiggles/' % (projectFolder)
metaFolder = '%smeta/' % (projectFolder)
metaRoseFolder = '%smeta_rose/' % (projectFolder)
roseFolder = '%srose/' % (projectFolder)
fastaFolder = '%sfasta/' % (projectFolder)
figuresFolder = '%sfigures/' % (projectFolder)
geneListFolder = '%sgeneListFolder/' % (projectFolder)
#bedFolder used to be assigned twice ('bed/' and then 'beds/'); the first value was
#overwritten before any use, so only the effective assignment is kept
bedFolder = '%sbeds/' % (projectFolder)