def performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k, AnalysisRound, strategy):
    """ Run NMF and determine the number of valid clusters based on the magnitude of detected differential splicing

    Arguments:
        species -- species code forwarded to metaDataAnalysis.remoteAnalysis
        NMFinput -- input handed to NMF_Analysis.NMFAnalysis
        full_PSI_InputFile -- path of the PSI file that is filtered for the SVM and depletion steps
        filtered_EventAnnot_dir -- forwarded to metaDataAnalysis.remoteAnalysis
        k -- rank passed to NMF_Analysis.NMFAnalysis
        AnalysisRound -- round number, used in progress messages and output file names
        strategy -- forwarded to NMF_Analysis.NMFAnalysis

    Returns (flag, full_PSI_InputFile): flag is always True here (K-means is never run by this
    function) and full_PSI_InputFile is the value returned by Correlationdepletion.DepleteSplicingevents.

    Side effects: rebinds the module-level upd_guides to a new empty list before the differential
    splicing results are scanned, and writes the SVM and depletion inputs into an 'OncoInputs' folder
    next to full_PSI_InputFile.
    """
    use_adjusted_p = True

    print("Running NMF analyses for dimension reduction using " + str(k) + " k - Round" + str(AnalysisRound))
    ### Original author's note: this is where we get the correct version
    NMFResult, BinarizedOutput, metaData, Annotation = NMF_Analysis.NMFAnalysis(NMFinput, k, AnalysisRound, strategy)

    print("Running metaData Analyses for finding differential splicing events")
    rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis(species, filtered_EventAnnot_dir, metaData, 'PSI', 0.1, use_adjusted_p, 0.05, Annotation)
    dPSI_results_dir = rootdir + CovariateQuery

    ### upd_guides is module-level state; it is passed below as filterDB to ExpandSampleClusters.filterRows
    ### (presumably populated by FindTopUniqueEvents - TODO confirm)
    global upd_guides
    upd_guides = []
    name = []   ### comparison names (file name minus the "PSI." text) that were not omitted
    group = []  ### 1-based group number for each retained comparison
    for filename in os.listdir(dPSI_results_dir):
        if not filename.startswith("PSI."):
            continue
        dPSI_results_fn = os.path.join(dPSI_results_dir, filename)
        dPSI_comparison_alt_name = filename.replace("PSI.", "")
        omitcluster = FindTopUniqueEvents(dPSI_results_fn, dPSI_comparison_alt_name, dPSI_results_dir)
        if omitcluster == 0:
            ### Hence, clustering succeeded and did not fail in this dPSI comparison
            group.append(len(name) + 1)
            name.append(dPSI_comparison_alt_name)

    ### Report the number of retained comparisons (the prior message printed a counter that
    ### started at 1 and therefore overstated the count by one)
    print(str(len(name)) + ' robust splicing subtypes identified in round ' + str(AnalysisRound))

    ### NMF is forced regardless of how many subtypes were found (the prior cluster-count
    ### threshold was relaxed to "always"), so the steps below are unconditional
    dire = export.findParentDir(full_PSI_InputFile)
    output_dir = dire + 'OncoInputs'
    if not os.path.exists(output_dir):
        export.createExportFolder(output_dir)

    output_file = output_dir + '/SVMInput-Round' + str(AnalysisRound) + '.txt'
    ExpandSampleClusters.filterRows(full_PSI_InputFile, output_file, filterDB=upd_guides, logData=False)
    header = ExpandSampleClusters.header_file(output_file)
    print("Running SVM prediction for improved subtypes - Round" + str(AnalysisRound))
    train = ExpandSampleClusters.TrainDataGeneration(output_file, BinarizedOutput, name)
    grplst = [group]
    ### Original author's note: this is where we write the wrong version
    ExpandSampleClusters.Classify(header, train, output_file, grplst, name, AnalysisRound)

    header = Correlationdepletion.header_file(NMFResult)
    output_file = output_dir + '/DepletionInput-Round' + str(AnalysisRound) + ".txt"
    sampleIndexSelection.filterFile(full_PSI_InputFile, output_file, header)
    print("Running Correlation Depletion - Round" + str(AnalysisRound))
    commonkeys, count = Correlationdepletion.FindCorrelations(NMFResult, output_file, name)
    Depleted = Correlationdepletion.DepleteSplicingevents(commonkeys, output_file, count, full_PSI_InputFile)
    full_PSI_InputFile = Depleted

    ### Indicates that K-means was not run - hence, another round of splice-ICGS should be performed
    flag = True
    return flag, full_PSI_InputFile
# Interior fragment of a larger routine whose 'def' line is not visible here.
# NOTE(review): the nesting below was reconstructed from a whitespace-collapsed
# source - confirm against the original file before relying on it.
# Prepares the PSI expression input and the Splice-ICGS parameters, then runs a
# single workflow round when mode == "single".
if EnrichmentOnly==False:
    print 'PSI input files:',EventAnnot
    print 'Using a rho-cutoff of:',rho_cutoff
    if filters==True: ### Filter based on a default percentage of samples with detected PSI values
        # With filterStatus=True the call returns two values and EventAnnot is rebound
        EventAnnot,SampleNumber=filterPSIValues(EventAnnot,percentCutoff=percentCutoff,filterStatus=True)
    else:
        # With filterStatus=False only SampleNumber is taken from the call
        SampleNumber=filterPSIValues(EventAnnot,percentCutoff=percentCutoff,filterStatus=False)
    # 'dire' is concatenated directly, so it presumably already ends with a path separator - TODO confirm
    output_dir = dire+'ExpressionInput'
    export.createExportFolder(output_dir)
    full_PSI_InputFile=output_dir+"/exp.input.txt"
    header=header_list(EventAnnot)
    # Write EventAnnot out as the workflow input file using the header obtained above
    sampleIndexSelection.filterFile(EventAnnot,full_PSI_InputFile,header,FirstCol=False)
    ### Set Splice-ICGS defaults
    # 'platform' is passed for both the second and third constructor arguments
    gsp = UI.GeneSelectionParameters(species,platform,platform)
    gsp.setNormalize('median')
    gsp.setGeneSelection('')
    gsp.setGeneSet('None Selected')
    gsp.setPathwaySelect([])
    gsp.setJustShowTheseIDs('')
    gsp.setSampleDiscoveryParameters(ExpressionCutoff,CountsCutoff,FoldDiff,SamplesDiffering,removeOutliers,
        featurestoEvaluate,restrictBy,excludeCellCycle,column_metric,column_method,rho_cutoff)
    AnalysisRound=1
    if mode == "single":
        """ Perform a single round of Splice-ICGS (RNASeq.py module) """
        # CompleteWorkflow returns a status flag plus the updated input and annotation files
        flag,full_PSI_InputFile,EventAnnot=CompleteWorkflow(full_PSI_InputFile,EventAnnot,rho_cutoff,strategy,seq,gsp,forceBroadClusters,AnalysisRound)
if __name__ == '__main__':
    # Command-line entry point.
    # Usage: --Guidefile <path> --PSIfile <path>
    # Filters the PSI file using the header read from the guide file, finds
    # correlated events and depletes them.
    import getopt

    ################ Command-line arguments ################
    if len(sys.argv[1:]) <= 1:
        ### Indicates that there are insufficient number of command-line arguments
        print("Warning! Insufficient command line flags supplied.")
        sys.exit()

    Guidefile = None
    PSIfile = None
    try:
        options, remainder = getopt.getopt(sys.argv[1:], '', ['Guidefile=', 'PSIfile='])
    except getopt.GetoptError as error:
        ### getopt raises for unknown flags or missing values before any option is
        ### returned, so report the problem here rather than with a raw traceback
        print("Warning! Command-line argument error: %s. Exiting..." % error)
        sys.exit()

    for opt, arg in options:
        if opt == '--Guidefile':
            Guidefile = arg
        elif opt == '--PSIfile':
            PSIfile = arg
        else:
            ### Defensive only: getopt does not return options outside the list above
            print("Warning! Command-line argument: %s not recognized. Exiting..." % opt)
            sys.exit()

    if Guidefile is None or PSIfile is None:
        ### Both inputs are required by the steps below; fail with a clear message
        ### rather than a NameError
        print("Warning! Both --Guidefile and --PSIfile must be supplied. Exiting...")
        sys.exit()

    header = header_file(Guidefile)
    ### Replace the last four characters of the PSI file name (assumed to be a
    ### three-letter extension such as ".txt") with "-filtered.txt"
    output_file = PSIfile[:-4] + "-filtered.txt"
    sampleIndexSelection.filterFile(PSIfile, output_file, header)
    commonkeys, count = FindCorrelations(Guidefile, output_file)
    DepleteSplicingevents(commonkeys, output_file, count)
def CompleteWorkflow(InputFile, EventAnnot, rho_cutoff, strategy, seq, gsp, forceBroadClusters, turn):
    """ This function is used perform a single-iteration of the OncoSplice workflow (called from main),
    including the unsupervised splicing analysis (splice-ICGS) and signature depletion

    Arguments:
        InputFile -- PSI input file for this round
        EventAnnot -- event annotation file, filtered against InputFile before the analysis
        rho_cutoff -- accepted for interface compatibility; the block-identification step below
                      uses its own fixed cutoff
        strategy -- forwarded to NMF_Analysis.NMFAnalysis
        seq -- "bulk" enables adjusted p-values for the metadata analysis; anything else disables them
        gsp -- gene-selection parameter object; reset at the start of the round
        forceBroadClusters -- when True, SamplesDiffering is temporarily set to 25% of SampleNumber
        turn -- round number, used in progress messages and output file names

    Also reads these names, which are not defined in this function: species, mlp, SampleNumber and
    force_broad_round1.

    Returns (flag, InputFile, FilteredEventAnnot). flag is True only when NMF produced at least two
    retained clusters and correlation depletion was run, in which case InputFile is the depleted
    file; otherwise flag is False and InputFile is returned unchanged.
    """
    ### Filter the EventAnnotation PSI file with non-depleted events from the prior round
    FilteredEventAnnot = filterEventAnnotation.FilterFile(InputFile, EventAnnot, turn)

    broadClusters = (forceBroadClusters == True)
    try:
        print("Running splice-ICGS for feature selection - Round" + str(turn))
        ### Reset the below variables which can be altered in prior rounds
        gsp.setGeneSelection('')
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect([])
        if broadClusters:
            originalSamplesDiffering = gsp.SamplesDiffering()
        try:
            if broadClusters:
                ### Find Broad clusters with at least 25% of all samples
                gsp.setSamplesDiffering(int(SampleNumber * 0.25))
                print('Number varying samples to identify: ' + str(gsp.SamplesDiffering()))
            graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species, 'exons', InputFile, mlp,
                                                             exp_threshold=0, rpkm_threshold=0,
                                                             parameters=gsp)
        finally:
            ### Always undo the temporary override so a failed round cannot leak the
            ### broad-cluster setting into the caller's gsp object
            if broadClusters:
                gsp.setSamplesDiffering(originalSamplesDiffering)

        ### The last entry of the last result is the guide file; swap its extension for .txt
        Guidefile = graphic_links3[-1][-1]
        Guidefile = Guidefile[:-4] + '.txt'

        print("Running block identification for rank analyses - Round" + str(turn))
        ### Parameters are fixed as they are distinct
        RNASeq_blockIdentification.correlateClusteredGenesParameters(
            Guidefile, rho_cutoff=0.4, hits_cutoff=4, hits_to_report=50,
            ReDefinedClusterBlocks=True, filter=True)
        Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt'
        NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block, InputFile, turn)
    except Exception:
        ### Best-effort: any failure above means no rank could be determined for this round
        print('UNKNOWN ERROR!!!!! Setting Rank=0')
        print(traceback.format_exc())
        Rank = 0

    flag = False
    if Rank > 1:
        ### ADJUST THE RANKS - MUST UPDATE!!!!
        ### Ranks above 2 are replaced by 30, except in round 1 when force_broad_round1 is set
        if not (turn == 1 and force_broad_round1) and Rank > 2:
            Rank = 30
        use_adjusted_p = (seq == "bulk")

        print("Running NMF analyses for dimension reduction using " + str(Rank) + " ranks - Round" + str(turn))
        NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis(NMFinput, Rank, turn, strategy)

        print("Running Metadata Analyses for finding differential splicing events")
        ### NOTE(review): the species code is hard-coded to 'Hs' here although the splice-ICGS
        ### call above uses the species variable - confirm this is intended
        rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis(
            'Hs', FilteredEventAnnot, Metadata, 'PSI', 0.1, use_adjusted_p, 0.05, Annotation)
        Guidedir = rootdir + CovariateQuery

        ### upd_guides is module-level state; it is passed below as filterDB to
        ### ExpandSampleClusters.filterRows (presumably populated by FindTopUniqueEvents - TODO confirm)
        global upd_guides
        upd_guides = []
        name = []   ### comparison names (file name minus the "PSI." text) that were not omitted
        group = []  ### 1-based group number for each retained comparison
        for filename in os.listdir(Guidedir):
            if not filename.startswith("PSI."):
                continue
            psi = filename.replace("PSI.", "")
            omitcluster = FindTopUniqueEvents(os.path.join(Guidedir, filename), psi, Guidedir)
            if omitcluster == 0:
                group.append(len(name) + 1)
                name.append(psi)

        if len(name) >= 2:
            ### At least two retained clusters: refine with SVM, then deplete the signature
            dire = export.findParentDir(InputFile)
            output_dir = dire + 'OncoInputs'
            if not os.path.exists(output_dir):
                export.createExportFolder(output_dir)

            output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt'
            ExpandSampleClusters.filterRows(InputFile, output_file, filterDB=upd_guides, logData=False)
            header = ExpandSampleClusters.header_file(output_file)
            print("Running SVM prediction for improved subtypes - Round" + str(turn))
            train = ExpandSampleClusters.TrainDataGeneration(output_file, BinarizedOutput, name)
            grplst = [group]
            ExpandSampleClusters.Classify(header, train, output_file, grplst, name, turn)

            header = Correlationdepletion.header_file(NMFResult)
            output_file = output_dir + '/DepletionInput-Round' + str(turn) + ".txt"
            sampleIndexSelection.filterFile(InputFile, output_file, header)
            print("Running Correlation Depletion - Round" + str(turn))
            commonkeys, count = Correlationdepletion.FindCorrelations(NMFResult, output_file, name)
            Depleted = Correlationdepletion.DepleteSplicingevents(commonkeys, output_file, count, InputFile)
            InputFile = Depleted
            flag = True
        else:
            _runKmeansFallback(Guidefile_block, InputFile, turn)
    elif Rank == 1:
        _runKmeansFallback(Guidefile_block, InputFile, turn)

    return flag, InputFile, FilteredEventAnnot


def _runKmeansFallback(Guidefile_block, InputFile, turn):
    """ Run K-means on the block-ID guide file in place of NMF; failures are reported, never raised """
    try:
        print("Running K-means analyses instead of NMF - Round" + str(turn))
        header = Kmeans.header_file(Guidefile_block)
        Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
    except Exception:
        print('WARNING!!!!! DID NOT RUN K-MEANS!!!!!')
        print(traceback.format_exc())
def CompleteWorkflow(InputFile, EventAnnot, turn, rho_cutoff, strategy, seq):
    """ Perform a single iteration of the workflow: splice-ICGS feature selection, block
    identification, NMF, differential splicing analysis, SVM refinement and signature depletion

    Arguments:
        InputFile -- PSI input file for this round
        EventAnnot -- event annotation file, filtered against InputFile before the analysis
        turn -- round number, used in progress messages and output file names
        rho_cutoff -- correlation cutoff stored in the splice-ICGS parameters built here (the
                      block-identification step uses its own fixed cutoff)
        strategy -- forwarded to NMF_Analysis.NMFAnalysis
        seq -- "bulk" enables adjusted p-values for the metadata analysis; anything else disables them

    Also reads the name mlp, which is not defined in this function.

    Returns (flag, InputFile, FilteredEventAnnot). flag is True only when NMF produced at least two
    retained clusters and correlation depletion was run, in which case InputFile is the depleted
    file; otherwise flag is False and InputFile is returned unchanged.
    """
    ### Fixed splice-ICGS settings for this workflow
    species = "Hs"
    array_type = "RNASeq"
    vendor = "RNASeq"
    column_method = 'hopach'
    column_metric = 'euclidean'
    GeneSelection = ''
    GeneSetSelection = 'None Selected'
    PathwaySelection = []
    JustShowTheseIDs = ''
    excludeCellCycle = False
    restrictBy = 'protein_coding'
    featurestoEvaluate = 'Genes'
    ExpressionCutoff = 0
    CountsCutoff = 0
    FoldDiff = 1.2
    SamplesDiffering = 4
    removeOutliers = False

    gsp = UI.GeneSelectionParameters(species, array_type, vendor)
    gsp.setGeneSet(GeneSetSelection)
    gsp.setPathwaySelect(PathwaySelection)
    gsp.setGeneSelection(GeneSelection)
    gsp.setJustShowTheseIDs(JustShowTheseIDs)
    gsp.setNormalize('median')
    gsp.setSampleDiscoveryParameters(ExpressionCutoff, CountsCutoff, FoldDiff, SamplesDiffering,
                                     removeOutliers, featurestoEvaluate, restrictBy, excludeCellCycle,
                                     column_metric, column_method, rho_cutoff)

    FilteredEventAnnot = filterEventAnnotation.FilterFile(InputFile, EventAnnot, turn)

    try:
        print("Running splice-ICGS for feature selection - Round" + str(turn))
        graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species, 'exons', InputFile, mlp,
                                                         exp_threshold=0, rpkm_threshold=0,
                                                         parameters=gsp)
        ### The last entry of the last result is the guide file; swap its extension for .txt
        Guidefile = graphic_links3[-1][-1]
        Guidefile = Guidefile[:-4] + '.txt'

        print("Running block identification for rank analyses - Round" + str(turn))
        RNASeq_blockIdentification.correlateClusteredGenesParameters(
            Guidefile, rho_cutoff=0.4, hits_cutoff=4, hits_to_report=50,
            ReDefinedClusterBlocks=True, filter=True)
        Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt'
        NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block, InputFile, turn)
    except Exception:
        ### Best-effort: any failure above means no rank could be determined for this round
        print('UNKNOWN ERROR!!!!!')
        print(traceback.format_exc())
        Rank = 0

    flag = False
    if Rank > 1:
        ### Round 1 always uses k=2; later rounds use k=30 when more than two ranks were found, else k=2
        if turn != 1 and Rank > 2:
            Rank = 30
        else:
            Rank = 2
        use_adjusted_p = (seq == "bulk")
        print('Current turn: ' + str(turn) + ' k = ' + str(Rank))

        print("Running NMF analyses for dimension reduction using " + str(Rank) + " ranks - Round" + str(turn))
        NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis(NMFinput, Rank, turn, strategy)

        print("Running Metadata Analyses for finding differential splicing events")
        rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis(
            species, FilteredEventAnnot, Metadata, 'PSI', 0.1, use_adjusted_p, 0.05, Annotation)
        Guidedir = rootdir + CovariateQuery

        ### upd_guides is module-level state; it is passed below as filterDB to
        ### ExpandSampleClusters.filterRows (presumably populated by FindTopUniqueEvents - TODO confirm)
        global upd_guides
        upd_guides = []
        name = []   ### comparison names (file name minus the "PSI." text) that were not omitted
        group = []  ### 1-based group number for each retained comparison
        for filename in os.listdir(Guidedir):
            if not filename.startswith("PSI."):
                continue
            psi = filename.replace("PSI.", "")
            omitcluster = FindTopUniqueEvents(os.path.join(Guidedir, filename), psi, Guidedir)
            if omitcluster == 0:
                group.append(len(name) + 1)
                name.append(psi)

        if len(name) >= 2:
            ### At least two retained clusters: refine with SVM, then deplete the signature
            dire = export.findParentDir(InputFile)
            output_dir = dire + 'OncoInputs'
            if not os.path.exists(output_dir):
                export.createExportFolder(output_dir)

            output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt'
            ExpandSampleClusters.filterRows(InputFile, output_file, filterDB=upd_guides, logData=False)
            header = ExpandSampleClusters.header_file(output_file)
            print("Running SVM prediction for improved subtypes - Round" + str(turn))
            train = ExpandSampleClusters.TrainDataGeneration(output_file, BinarizedOutput, name)
            grplst = [group]
            ExpandSampleClusters.Classify(header, train, output_file, grplst, name, turn)

            header = Correlationdepletion.header_file(NMFResult)
            output_file = output_dir + '/DepletionInput-Round' + str(turn) + ".txt"
            sampleIndexSelection.filterFile(InputFile, output_file, header)
            print("Running Correlation Depletion - Round" + str(turn))
            commonkeys, count = Correlationdepletion.FindCorrelations(NMFResult, output_file, name)
            Depleted = Correlationdepletion.DepleteSplicingevents(commonkeys, output_file, count, InputFile)
            InputFile = Depleted
            flag = True
        else:
            _runKmeansFallback(Guidefile_block, InputFile, turn)
    elif Rank == 1:
        _runKmeansFallback(Guidefile_block, InputFile, turn)

    return flag, InputFile, FilteredEventAnnot


def _runKmeansFallback(Guidefile_block, InputFile, turn):
    """ Run K-means on the block-ID guide file in place of NMF; failures are reported, never raised """
    try:
        print("Running K-means analyses instead of NMF - Round" + str(turn))
        header = Kmeans.header_file(Guidefile_block)
        Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
    except Exception:
        print('WARNING!!!!! DID NOT RUN K-MEANS!!!!!')
        print(traceback.format_exc())