def CompleteWorkflow(InputFile, EventAnnot, rho_cutoff, strategy, seq, gsp, forceBroadClusters, turn): """ This function is used perform a single-iteration of the OncoSplice workflow (called from main), including the unsupervised splicing analysis (splice-ICGS) and signature depletion """ ### Filter the EventAnnotation PSI file with non-depleted events from the prior round FilteredEventAnnot = filterEventAnnotation.FilterFile( InputFile, EventAnnot, turn) try: print "Running splice-ICGS for feature selection - Round" + str(turn) ### Reset the below variables which can be altered in prior rounds gsp.setGeneSelection('') gsp.setGeneSet('None Selected') gsp.setPathwaySelect([]) if forceBroadClusters == True: ### Find Broad clusters with at least 25% of all samples originalSamplesDiffering = gsp.SamplesDiffering() gsp.setSamplesDiffering(int(SampleNumber * 0.25)) print 'Number varying samples to identify:', gsp.SamplesDiffering() graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species, 'exons', InputFile, mlp, exp_threshold=0, rpkm_threshold=0, parameters=gsp) if forceBroadClusters == True: gsp.setSamplesDiffering(originalSamplesDiffering) Guidefile = graphic_links3[-1][-1] Guidefile = Guidefile[:-4] + '.txt' print "Running block identification for rank analyses - Round" + str( turn) ### Parameters are fixed as they are distinct RNASeq_blockIdentification.correlateClusteredGenesParameters( Guidefile, rho_cutoff=0.4, hits_cutoff=4, hits_to_report=50, ReDefinedClusterBlocks=True, filter=True) Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt' NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block, InputFile, turn) except Exception: print 'UNKNOWN ERROR!!!!! Setting Rank=0' #print traceback.format_exc() Rank = 0 if Rank > 1: ### ADJUST THE RANKS - MUST UPDATE!!!! if turn == 1: if force_broad_round1: #Rank=2 Rank = Rank else: if Rank > 2: Rank = 30 else: if Rank > 2: Rank = 30 if seq == "bulk": use_adjusted_p = True else: use_adjusted_p = False print "Running NMF analyses for dimension reduction using " + str( Rank) + " ranks - Round" + str(turn) NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis( NMFinput, Rank, turn, strategy) print "Running Metadata Analyses for finding differential splicing events" rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis( 'Hs', FilteredEventAnnot, Metadata, 'PSI', 0.1, use_adjusted_p, 0.05, Annotation) counter = 1 Guidedir = rootdir + CovariateQuery PSIdir = rootdir + 'ExpressionProfiles' global upd_guides upd_guides = [] name = [] group = [] grplst = [] for filename in os.listdir(Guidedir): if filename.startswith("PSI."): Guidefile = os.path.join(Guidedir, filename) psi = string.replace(filename, "PSI.", "") PSIfile = os.path.join(PSIdir, psi) omitcluster = FindTopUniqueEvents(Guidefile, psi, Guidedir) if omitcluster == 0: group.append(counter) name.append(psi) counter += 1 if counter > 2: dire = export.findParentDir(InputFile) output_dir = dire + 'OncoInputs' if os.path.exists(output_dir) == False: export.createExportFolder(output_dir) output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt' ExpandSampleClusters.filterRows(InputFile, output_file, filterDB=upd_guides, logData=False) header = ExpandSampleClusters.header_file(output_file) print "Running SVM prediction for improved subtypes - Round" + str( turn) train = ExpandSampleClusters.TrainDataGeneration( output_file, BinarizedOutput, name) grplst.append(group) ExpandSampleClusters.Classify(header, train, output_file, grplst, name, turn) header = Correlationdepletion.header_file(NMFResult) output_file = output_dir + '/DepletionInput-Round' + str( turn) + ".txt" sampleIndexSelection.filterFile(InputFile, output_file, header) print "Running Correlation Depletion - Round" + str(turn) commonkeys, count = Correlationdepletion.FindCorrelations( NMFResult, output_file, name) Depleted = Correlationdepletion.DepleteSplicingevents( commonkeys, output_file, count, InputFile) InputFile = Depleted flag = True else: try: print "Running K-means analyses instead of NMF - Round" + str( turn) header = [] header = Kmeans.header_file(Guidefile_block) Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn) flag = False except Exception: print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!' print traceback.format_exc() flag = False else: if Rank == 1: try: print "Running K-means analyses instead of NMF - Round" + str( turn) header = [] header = Kmeans.header_file(Guidefile_block) Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn) flag = False except Exception: print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!' print traceback.format_exc() flag = False else: flag = False return flag, InputFile, FilteredEventAnnot
def CompleteWorkflow(full_PSI_InputFile,EventAnnot,rho_cutoff,strategy,seq,gsp,forceBroadClusters,AnalysisRound): """ This function is used perform a single-iteration of the OncoSplice workflow (called from main), including the unsupervised splicing analysis (splice-ICGS) and signature depletion """ ### Filter the EventAnnotation PSI file with non-depleted events from the prior round filtered_EventAnnot_dir=filterEventAnnotation.FilterFile(full_PSI_InputFile,EventAnnot,AnalysisRound) try: print "Running splice-ICGS for feature selection - Round"+str(AnalysisRound) ### Reset the below variables which can be altered in prior rounds gsp.setGeneSelection('') gsp.setGeneSet('None Selected') gsp.setPathwaySelect([]) species = gsp.Species() if forceBroadClusters == True: ### Find Broad clusters with at least 25% of all samples originalSamplesDiffering = gsp.SamplesDiffering() gsp.setSamplesDiffering(int(SampleNumber*0.25)) print 'Number varying samples to identify:',gsp.SamplesDiffering() graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species, 'exons', full_PSI_InputFile,mlp,exp_threshold=0, rpkm_threshold=0, parameters=gsp) if forceBroadClusters == True: gsp.setSamplesDiffering(originalSamplesDiffering) dPSI_results_fn=graphic_links3[-1][-1] dPSI_results_fn=dPSI_results_fn[:-4]+'.txt' print "Running block identification for k analyses - Round"+str(AnalysisRound) ### Parameters are fixed as they are distinct RNASeq_blockIdentification.correlateClusteredGenesParameters(dPSI_results_fn,rho_cutoff=0.4,hits_cutoff=4,hits_to_report=50,ReDefinedClusterBlocks=True,filter=True) dPSI_results_fn_block=dPSI_results_fn[:-4]+'-BlockIDs.txt' NMFinput, k = NMF_Analysis.FilterFile(dPSI_results_fn,dPSI_results_fn_block,full_PSI_InputFile,AnalysisRound) except Exception: print 'UNKNOWN ERROR!!!!! Setting k=0' print traceback.format_exc() k=0 print "Round =", AnalysisRound,'and k =', k if AnalysisRound == 1: if force_broad_round1: k = 2 else: NMFinput,k = NMF_Analysis.FilterFile(dPSI_results_fn,dPSI_results_fn,full_PSI_InputFile,AnalysisRound) ### Just use the Guide 3 file alone if k < 2: NMFinput,k = NMF_Analysis.FilterFile(dPSI_results_fn,dPSI_results_fn,full_PSI_InputFile,AnalysisRound) ### Just use the Guide 3 file alone #k = 2 print "Round =", AnalysisRound,'and k =', k if k>1: ### ADJUST THE k - MUST UPDATE!!!! if AnalysisRound == 1: if k < 2: k = 30 else: if k > 2: k = 30 print "Round =", AnalysisRound,'and k =', k try: flag,full_PSI_InputFile = performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k,AnalysisRound, strategy) except: print traceback.format_exc() k+=1 print 'Adjusted k =',k try: flag,full_PSI_InputFile = performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k, AnalysisRound, strategy) print traceback.format_exc() except: k = 30 print 'Adjusted k = 30' try: flag,full_PSI_InputFile = performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k,AnalysisRound, strategy) print traceback.format_exc() except: flag = True pass ### will force k-means below if k<2: if k==1: try: print "Running K-means analyses instead of NMF - Round"+str(AnalysisRound) header=[] header=Kmeans.header_file(dPSI_results_fn_block) Kmeans.KmeansAnalysis(dPSI_results_fn_block,header,full_PSI_InputFile,AnalysisRound) if AnalysisRound == 1: flag=True else: flag=False except Exception: print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!' print traceback.format_exc() AnalysisRound = True else: flag=False return flag,full_PSI_InputFile,filtered_EventAnnot_dir
def CompleteWorkflow(InputFile, EventAnnot, turn, rho_cutoff, strategy, seq): species = "Hs" row_method = 'hopach' column_method = 'hopach' row_metric = 'correlation' column_metric = 'euclidean' color_gradient = 'yellow_black_blue' contrast = 3 vendor = "RNASeq" GeneSelection = '' PathwaySelection = '' GeneSetSelection = 'None Selected' excludeCellCycle = False #rho_cutoff = 0.4 restrictBy = 'protein_coding' featurestoEvaluate = 'Genes' ExpressionCutoff = 0 CountsCutoff = 0 FoldDiff = 1.2 SamplesDiffering = 4 JustShowTheseIDs = '' removeOutliers = False PathwaySelection = [] array_type = "RNASeq" #rho_cutoff=0.4 gsp = UI.GeneSelectionParameters(species, array_type, vendor) gsp.setGeneSet(GeneSetSelection) gsp.setPathwaySelect(PathwaySelection) gsp.setGeneSelection(GeneSelection) gsp.setJustShowTheseIDs(JustShowTheseIDs) gsp.setNormalize('median') gsp.setSampleDiscoveryParameters(ExpressionCutoff, CountsCutoff, FoldDiff, SamplesDiffering, removeOutliers, featurestoEvaluate, restrictBy, excludeCellCycle, column_metric, column_method, rho_cutoff) #Run splice ICGS """import UI species='Mm'; platform = "3'array"; vendor = 'Ensembl' gsp = UI.GeneSelectionParameters(species,platform,vendor) gsp.setGeneSet('None Selected') gsp.setPathwaySelect('') gsp.setGeneSelection('') gsp.setJustShowTheseIDs('') gsp.setNormalize('median') gsp.setSampleDiscoveryParameters(0,0,1.5,3, False,'PSI','protein_coding',False,'cosine','hopach',0.35)""" FilteredEventAnnot = filterEventAnnotation.FilterFile( InputFile, EventAnnot, turn) try: print "Running splice-ICGS for feature selection - Round" + str(turn) #except Exception:Rank=0 graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species, 'exons', InputFile, mlp, exp_threshold=0, rpkm_threshold=0, parameters=gsp) Guidefile = graphic_links3[-1][-1] Guidefile = Guidefile[:-4] + '.txt' print "Running block identification for rank analyses - Round" + str( turn) RNASeq_blockIdentification.correlateClusteredGenesParameters( Guidefile, rho_cutoff=0.4, hits_cutoff=4, hits_to_report=50, ReDefinedClusterBlocks=True, filter=True) Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt' NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block, InputFile, turn) except Exception: print 'UNKNOWN ERROR!!!!!' print traceback.format_exc() Rank = 0 if Rank > 1: print 'Current turn:', turn, 'k =', if turn == 1: Rank = 2 elif Rank > 2: Rank = 30 else: Rank = 2 if seq == "bulk": use_adjusted_p = True else: use_adjusted_p = False print Rank print "Running NMF analyses for dimension reduction using " + str( Rank) + " ranks - Round" + str(turn) NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis( NMFinput, Rank, turn, strategy) print "Running Metadata Analyses for finding differential splicing events" rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis( 'Hs', FilteredEventAnnot, Metadata, 'PSI', 0.1, use_adjusted_p, 0.05, Annotation) counter = 1 Guidedir = rootdir + CovariateQuery PSIdir = rootdir + 'ExpressionProfiles' global upd_guides upd_guides = [] name = [] group = [] grplst = [] for filename in os.listdir(Guidedir): if filename.startswith("PSI."): Guidefile = os.path.join(Guidedir, filename) psi = string.replace(filename, "PSI.", "") PSIfile = os.path.join(PSIdir, psi) omitcluster = FindTopUniqueEvents(Guidefile, psi, Guidedir) if omitcluster == 0: group.append(counter) name.append(psi) counter += 1 if counter > 2: dire = export.findParentDir(InputFile) output_dir = dire + 'OncoInputs' if os.path.exists(output_dir) == False: export.createExportFolder(output_dir) output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt' ExpandSampleClusters.filterRows(InputFile, output_file, filterDB=upd_guides, logData=False) header = ExpandSampleClusters.header_file(output_file) print "Running SVM prediction for improved subtypes - Round" + str( turn) train = ExpandSampleClusters.TrainDataGeneration( output_file, BinarizedOutput, name) grplst.append(group) ExpandSampleClusters.Classify(header, train, output_file, grplst, name, turn) header = Correlationdepletion.header_file(NMFResult) output_file = output_dir + '/DepletionInput-Round' + str( turn) + ".txt" sampleIndexSelection.filterFile(InputFile, output_file, header) print "Running Correlation Depletion - Round" + str(turn) commonkeys, count = Correlationdepletion.FindCorrelations( NMFResult, output_file, name) Depleted = Correlationdepletion.DepleteSplicingevents( commonkeys, output_file, count, InputFile) InputFile = Depleted flag = True else: try: print "Running K-means analyses instead of NMF - Round" + str( turn) header = [] header = Kmeans.header_file(Guidefile_block) Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn) flag = False except Exception: print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!' print traceback.format_exc() flag = False else: if Rank == 1: try: print "Running K-means analyses instead of NMF - Round" + str( turn) header = [] header = Kmeans.header_file(Guidefile_block) Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn) flag = False except Exception: print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!' print traceback.format_exc() flag = False else: flag = False return flag, InputFile, FilteredEventAnnot