Ejemplo n.º 1
0
def CompleteWorkflow(InputFile, EventAnnot, rho_cutoff, strategy, seq, gsp,
                     forceBroadClusters, turn):
    """ This function is used perform a single-iteration of the OncoSplice workflow (called from main),
    including the unsupervised splicing analysis (splice-ICGS) and signature depletion """

    ### Filter the EventAnnotation PSI file with non-depleted events from the prior round
    FilteredEventAnnot = filterEventAnnotation.FilterFile(
        InputFile, EventAnnot, turn)

    try:
        print "Running splice-ICGS for feature selection - Round" + str(turn)

        ### Reset the below variables which can be altered in prior rounds
        gsp.setGeneSelection('')
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect([])
        if forceBroadClusters == True:
            ### Find Broad clusters with at least 25% of all samples
            originalSamplesDiffering = gsp.SamplesDiffering()
            gsp.setSamplesDiffering(int(SampleNumber * 0.25))

        print 'Number varying samples to identify:', gsp.SamplesDiffering()

        graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species,
                                                         'exons',
                                                         InputFile,
                                                         mlp,
                                                         exp_threshold=0,
                                                         rpkm_threshold=0,
                                                         parameters=gsp)
        if forceBroadClusters == True:
            gsp.setSamplesDiffering(originalSamplesDiffering)

        Guidefile = graphic_links3[-1][-1]
        Guidefile = Guidefile[:-4] + '.txt'

        print "Running block identification for rank analyses - Round" + str(
            turn)
        ### Parameters are fixed as they are distinct
        RNASeq_blockIdentification.correlateClusteredGenesParameters(
            Guidefile,
            rho_cutoff=0.4,
            hits_cutoff=4,
            hits_to_report=50,
            ReDefinedClusterBlocks=True,
            filter=True)
        Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt'
        NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block,
                                                 InputFile, turn)

    except Exception:
        print 'UNKNOWN ERROR!!!!! Setting Rank=0'
        #print traceback.format_exc()
        Rank = 0

    if Rank > 1:
        ### ADJUST THE RANKS - MUST UPDATE!!!!
        if turn == 1:
            if force_broad_round1:
                #Rank=2
                Rank = Rank
            else:
                if Rank > 2:
                    Rank = 30
        else:
            if Rank > 2:
                Rank = 30
        if seq == "bulk":
            use_adjusted_p = True
        else:
            use_adjusted_p = False

        print "Running NMF analyses for dimension reduction using " + str(
            Rank) + " ranks - Round" + str(turn)
        NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis(
            NMFinput, Rank, turn, strategy)
        print "Running Metadata Analyses for finding differential splicing events"
        rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis(
            'Hs', FilteredEventAnnot, Metadata, 'PSI', 0.1, use_adjusted_p,
            0.05, Annotation)
        counter = 1
        Guidedir = rootdir + CovariateQuery
        PSIdir = rootdir + 'ExpressionProfiles'
        global upd_guides
        upd_guides = []
        name = []
        group = []
        grplst = []
        for filename in os.listdir(Guidedir):
            if filename.startswith("PSI."):
                Guidefile = os.path.join(Guidedir, filename)
                psi = string.replace(filename, "PSI.", "")
                PSIfile = os.path.join(PSIdir, psi)
                omitcluster = FindTopUniqueEvents(Guidefile, psi, Guidedir)

                if omitcluster == 0:
                    group.append(counter)
                    name.append(psi)
                    counter += 1
        if counter > 2:
            dire = export.findParentDir(InputFile)
            output_dir = dire + 'OncoInputs'
            if os.path.exists(output_dir) == False:
                export.createExportFolder(output_dir)

            output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt'
            ExpandSampleClusters.filterRows(InputFile,
                                            output_file,
                                            filterDB=upd_guides,
                                            logData=False)
            header = ExpandSampleClusters.header_file(output_file)
            print "Running SVM prediction for improved subtypes - Round" + str(
                turn)
            train = ExpandSampleClusters.TrainDataGeneration(
                output_file, BinarizedOutput, name)
            grplst.append(group)
            ExpandSampleClusters.Classify(header, train, output_file, grplst,
                                          name, turn)
            header = Correlationdepletion.header_file(NMFResult)

            output_file = output_dir + '/DepletionInput-Round' + str(
                turn) + ".txt"
            sampleIndexSelection.filterFile(InputFile, output_file, header)
            print "Running Correlation Depletion - Round" + str(turn)
            commonkeys, count = Correlationdepletion.FindCorrelations(
                NMFResult, output_file, name)
            Depleted = Correlationdepletion.DepleteSplicingevents(
                commonkeys, output_file, count, InputFile)
            InputFile = Depleted

            flag = True
        else:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
    else:
        if Rank == 1:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
        else:
            flag = False

    return flag, InputFile, FilteredEventAnnot
Ejemplo n.º 2
0
def CompleteWorkflow(full_PSI_InputFile,EventAnnot,rho_cutoff,strategy,seq,gsp,forceBroadClusters,AnalysisRound):
    """ This function is used perform a single-iteration of the OncoSplice workflow (called from main),
    including the unsupervised splicing analysis (splice-ICGS) and signature depletion """
    
    ### Filter the EventAnnotation PSI file with non-depleted events from the prior round
    filtered_EventAnnot_dir=filterEventAnnotation.FilterFile(full_PSI_InputFile,EventAnnot,AnalysisRound)
    
    try:
        print "Running splice-ICGS for feature selection - Round"+str(AnalysisRound)

        ### Reset the below variables which can be altered in prior rounds
        gsp.setGeneSelection('')
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect([])
        species = gsp.Species()
        if forceBroadClusters == True:
            ### Find Broad clusters with at least 25% of all samples
            originalSamplesDiffering = gsp.SamplesDiffering()
            gsp.setSamplesDiffering(int(SampleNumber*0.25))
            
        print 'Number varying samples to identify:',gsp.SamplesDiffering()
        
        graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species, 'exons', full_PSI_InputFile,mlp,exp_threshold=0, rpkm_threshold=0, parameters=gsp)
        if forceBroadClusters == True:
            gsp.setSamplesDiffering(originalSamplesDiffering)

        dPSI_results_fn=graphic_links3[-1][-1]
        dPSI_results_fn=dPSI_results_fn[:-4]+'.txt'
       
        print "Running block identification for k analyses - Round"+str(AnalysisRound)
        ### Parameters are fixed as they are distinct 
        RNASeq_blockIdentification.correlateClusteredGenesParameters(dPSI_results_fn,rho_cutoff=0.4,hits_cutoff=4,hits_to_report=50,ReDefinedClusterBlocks=True,filter=True) 
        dPSI_results_fn_block=dPSI_results_fn[:-4]+'-BlockIDs.txt'
        NMFinput, k = NMF_Analysis.FilterFile(dPSI_results_fn,dPSI_results_fn_block,full_PSI_InputFile,AnalysisRound)

    except Exception:
        print 'UNKNOWN ERROR!!!!! Setting k=0' 
        print traceback.format_exc()
        k=0
    
    print "Round =", AnalysisRound,'and k =', k
    if AnalysisRound == 1:
        if force_broad_round1:
            k = 2
        else:
            NMFinput,k = NMF_Analysis.FilterFile(dPSI_results_fn,dPSI_results_fn,full_PSI_InputFile,AnalysisRound) ### Just use the Guide 3 file alone
    if k < 2:
        NMFinput,k = NMF_Analysis.FilterFile(dPSI_results_fn,dPSI_results_fn,full_PSI_InputFile,AnalysisRound) ### Just use the Guide 3 file alone
        #k = 2
        
    print "Round =", AnalysisRound,'and k =', k
    if k>1:
        ### ADJUST THE k - MUST UPDATE!!!!
        if AnalysisRound == 1:
            if k < 2:
                k = 30
        else:
            if k > 2:
                k = 30
        print "Round =", AnalysisRound,'and k =', k
        
        try:
            flag,full_PSI_InputFile = performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k,AnalysisRound, strategy)
        except:
            print traceback.format_exc()
            k+=1
            print 'Adjusted k =',k
            try:
                flag,full_PSI_InputFile = performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k, AnalysisRound, strategy)
                print traceback.format_exc()
            except:
                k = 30
                print 'Adjusted k = 30'
                try:
                    flag,full_PSI_InputFile = performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k,AnalysisRound, strategy)
                    print traceback.format_exc()
                except:
                    flag = True
                    pass ### will force k-means below
    
    if k<2:
        if k==1:
            try:
                print "Running K-means analyses instead of NMF - Round"+str(AnalysisRound)
                header=[]
                header=Kmeans.header_file(dPSI_results_fn_block)
                Kmeans.KmeansAnalysis(dPSI_results_fn_block,header,full_PSI_InputFile,AnalysisRound)
                if AnalysisRound == 1:
                    flag=True
                else:
                    flag=False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                AnalysisRound = True
        else:
            flag=False
     
    return flag,full_PSI_InputFile,filtered_EventAnnot_dir
Ejemplo n.º 3
0
def CompleteWorkflow(InputFile, EventAnnot, turn, rho_cutoff, strategy, seq):

    species = "Hs"
    row_method = 'hopach'
    column_method = 'hopach'
    row_metric = 'correlation'
    column_metric = 'euclidean'
    color_gradient = 'yellow_black_blue'
    contrast = 3
    vendor = "RNASeq"
    GeneSelection = ''
    PathwaySelection = ''
    GeneSetSelection = 'None Selected'
    excludeCellCycle = False
    #rho_cutoff = 0.4
    restrictBy = 'protein_coding'
    featurestoEvaluate = 'Genes'
    ExpressionCutoff = 0
    CountsCutoff = 0
    FoldDiff = 1.2
    SamplesDiffering = 4
    JustShowTheseIDs = ''
    removeOutliers = False
    PathwaySelection = []
    array_type = "RNASeq"
    #rho_cutoff=0.4
    gsp = UI.GeneSelectionParameters(species, array_type, vendor)
    gsp.setGeneSet(GeneSetSelection)
    gsp.setPathwaySelect(PathwaySelection)
    gsp.setGeneSelection(GeneSelection)
    gsp.setJustShowTheseIDs(JustShowTheseIDs)
    gsp.setNormalize('median')
    gsp.setSampleDiscoveryParameters(ExpressionCutoff, CountsCutoff, FoldDiff,
                                     SamplesDiffering, removeOutliers,
                                     featurestoEvaluate, restrictBy,
                                     excludeCellCycle, column_metric,
                                     column_method, rho_cutoff)
    #Run splice ICGS
    """import UI
        species='Mm'; platform = "3'array"; vendor = 'Ensembl'
        gsp = UI.GeneSelectionParameters(species,platform,vendor)
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect('')
        gsp.setGeneSelection('')
        gsp.setJustShowTheseIDs('')
        gsp.setNormalize('median')
        gsp.setSampleDiscoveryParameters(0,0,1.5,3,
        False,'PSI','protein_coding',False,'cosine','hopach',0.35)"""

    FilteredEventAnnot = filterEventAnnotation.FilterFile(
        InputFile, EventAnnot, turn)

    try:
        print "Running splice-ICGS for feature selection - Round" + str(turn)
        #except Exception:Rank=0
        graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species,
                                                         'exons',
                                                         InputFile,
                                                         mlp,
                                                         exp_threshold=0,
                                                         rpkm_threshold=0,
                                                         parameters=gsp)

        Guidefile = graphic_links3[-1][-1]
        Guidefile = Guidefile[:-4] + '.txt'

        print "Running block identification for rank analyses - Round" + str(
            turn)
        RNASeq_blockIdentification.correlateClusteredGenesParameters(
            Guidefile,
            rho_cutoff=0.4,
            hits_cutoff=4,
            hits_to_report=50,
            ReDefinedClusterBlocks=True,
            filter=True)
        Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt'
        NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block,
                                                 InputFile, turn)
    except Exception:
        print 'UNKNOWN ERROR!!!!!'
        print traceback.format_exc()
        Rank = 0

    if Rank > 1:
        print 'Current turn:', turn, 'k =',
        if turn == 1:
            Rank = 2
        elif Rank > 2:
            Rank = 30
        else:
            Rank = 2
        if seq == "bulk":
            use_adjusted_p = True
        else:
            use_adjusted_p = False
        print Rank
        print "Running NMF analyses for dimension reduction using " + str(
            Rank) + " ranks - Round" + str(turn)
        NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis(
            NMFinput, Rank, turn, strategy)
        print "Running Metadata Analyses for finding differential splicing events"
        rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis(
            'Hs', FilteredEventAnnot, Metadata, 'PSI', 0.1, use_adjusted_p,
            0.05, Annotation)
        counter = 1
        Guidedir = rootdir + CovariateQuery
        PSIdir = rootdir + 'ExpressionProfiles'
        global upd_guides
        upd_guides = []
        name = []
        group = []
        grplst = []
        for filename in os.listdir(Guidedir):
            if filename.startswith("PSI."):
                Guidefile = os.path.join(Guidedir, filename)
                psi = string.replace(filename, "PSI.", "")
                PSIfile = os.path.join(PSIdir, psi)
                omitcluster = FindTopUniqueEvents(Guidefile, psi, Guidedir)

                if omitcluster == 0:
                    group.append(counter)
                    name.append(psi)
                    counter += 1
        if counter > 2:
            dire = export.findParentDir(InputFile)
            output_dir = dire + 'OncoInputs'
            if os.path.exists(output_dir) == False:
                export.createExportFolder(output_dir)

            output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt'
            ExpandSampleClusters.filterRows(InputFile,
                                            output_file,
                                            filterDB=upd_guides,
                                            logData=False)
            header = ExpandSampleClusters.header_file(output_file)
            print "Running SVM prediction for improved subtypes - Round" + str(
                turn)
            train = ExpandSampleClusters.TrainDataGeneration(
                output_file, BinarizedOutput, name)
            grplst.append(group)
            ExpandSampleClusters.Classify(header, train, output_file, grplst,
                                          name, turn)
            header = Correlationdepletion.header_file(NMFResult)

            output_file = output_dir + '/DepletionInput-Round' + str(
                turn) + ".txt"
            sampleIndexSelection.filterFile(InputFile, output_file, header)
            print "Running Correlation Depletion - Round" + str(turn)
            commonkeys, count = Correlationdepletion.FindCorrelations(
                NMFResult, output_file, name)
            Depleted = Correlationdepletion.DepleteSplicingevents(
                commonkeys, output_file, count, InputFile)
            InputFile = Depleted

            flag = True
        else:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False

    else:
        if Rank == 1:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(
                    turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)

                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
        else:
            flag = False

    return flag, InputFile, FilteredEventAnnot