def FilterFile(Guidefile, PSI, turn=0):
    """Filter the PSI file down to the IDs listed in a guide file.

    The first column of every data row in ``Guidefile`` is collected. One
    header row is skipped, or two when the path contains 'Clustering'.
    The collected IDs are handed to ``filterRows`` as ``filterDB``. When
    ``turn`` is 1 the PSI file is copied unfiltered instead.

    Returns the path of the file that was written, located under
    ``SubtypeAnalyses-Results/round<turn>/``.
    """
    # Clustering outputs carry one extra header row.
    header_rows = 2 if 'Clustering' in Guidefile else 1

    guide_ids = []
    # The context manager closes the handle; the original left it open.
    with open(Guidefile, 'rU') as guide_handle:
        for row_index, line in enumerate(guide_handle):
            if row_index < header_rows:
                continue
            guide_ids.append(line.rstrip('\r\n').split('\t')[0])

    dire = export.findParentDir(export.findParentDir(Guidefile)[:-1])
    output_dir = dire + 'SubtypeAnalyses-Results'
    if not os.path.exists(output_dir):
        export.createExportFolder(output_dir)

    round_dir = output_dir + '/round' + str(turn)
    output_file = round_dir + '/' + export.findFilename(PSI)[:-4] + '-filtered.txt'
    try:
        os.mkdir(round_dir)
    except OSError:
        pass  ### already exists

    if turn == 1:
        ### No need to filter this file
        shutil.copyfile(PSI, output_file)
    else:
        filterRows(PSI, output_file, filterDB=guide_ids)
    return output_file
def FilterFile(Guidefile, Guidefile_block, PSI, turn):
    """Build the NMF input for one round and count the guide blocks.

    ``Guidefile_block`` is scanned to count how many times the integer in
    its second column changes from one data row to the next; that count is
    returned as the rank. ``Guidefile`` supplies the row IDs (first column)
    used to filter ``PSI`` through ``filterRows``.

    One header row is skipped in both files, or two when ``Guidefile``
    contains 'Clustering'. Blocks are only counted for 'Clustering' guide
    files; for any other guide file the rank is 0.

    Returns ``(output_file, rank_Count)``.
    """
    clustered = 'Clustering' in Guidefile
    header_rows = 2 if clustered else 1

    # Initialised unconditionally: the original only assigned these in the
    # 'Clustering' branch, so every other guide file ended in a
    # NameError/UnboundLocalError.
    rank_Count = 0
    prev = 0  # a leading block id of 0 is therefore not counted

    print(Guidefile_block)
    with open(Guidefile_block, 'rU') as block_handle:
        for row_index, line in enumerate(block_handle):
            if row_index < header_rows or not clustered:
                continue
            block_id = int(line.rstrip('\r\n').split('\t')[1])
            if block_id != prev:
                rank_Count += 1
                prev = block_id

    print(Guidefile)
    guide_ids = []
    with open(Guidefile, 'rU') as guide_handle:
        for row_index, line in enumerate(guide_handle):
            if row_index < header_rows:
                continue
            guide_ids.append(line.rstrip('\r\n').split('\t')[0])

    dire = export.findParentDir(PSI)
    output_dir = dire + 'OncoInputs'
    if not os.path.exists(output_dir):
        export.createExportFolder(output_dir)

    output_file = output_dir + '/NMFInput-Round' + str(turn) + '.txt'
    filterRows(PSI, output_file, filterDB=guide_ids)
    return output_file, rank_Count
def performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k, AnalysisRound, strategy):
    """ Run NMF and determine the number of valid clusters based on the
    magnitude of detected differential splicing.

    After NMF and the metadata analysis, every ``PSI.*`` result file is
    passed to ``FindTopUniqueEvents``; comparisons for which it returns 0
    are kept as subtypes. The kept subtypes drive the SVM step and the
    correlation-depletion step.

    Returns ``(flag, full_PSI_InputFile)`` where ``flag`` is always True
    and ``full_PSI_InputFile`` is the value returned by
    ``Correlationdepletion.DepleteSplicingevents``.
    """
    use_adjusted_p = True

    print("Running NMF analyses for dimension reduction using " + str(k) + " k - Round" + str(AnalysisRound))
    ### Original author's note: this is where we get the correct version
    NMFResult, BinarizedOutput, metaData, Annotation = NMF_Analysis.NMFAnalysis(
        NMFinput, k, AnalysisRound, strategy)

    print("Running metaData Analyses for finding differential splicing events")
    rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis(
        species, filtered_EventAnnot_dir, metaData, 'PSI', 0.1,
        use_adjusted_p, 0.05, Annotation)
    dPSI_results_dir = rootdir + CovariateQuery

    # Module-level list, reset here and later passed to filterRows as
    # filterDB. NOTE(review): presumably filled by FindTopUniqueEvents -
    # confirm against that function.
    global upd_guides
    upd_guides = []

    name = []
    group = []
    for filename in os.listdir(dPSI_results_dir):
        if not filename.startswith("PSI."):
            continue
        dPSI_results_fn = os.path.join(dPSI_results_dir, filename)
        dPSI_comparison_alt_name = filename.replace("PSI.", "")
        omitcluster = FindTopUniqueEvents(dPSI_results_fn, dPSI_comparison_alt_name, dPSI_results_dir)
        if omitcluster == 0:
            ### Hence, clustering succeeded and did not fail in this dPSI comparison
            group.append(len(group) + 1)  # 1-based group labels
            name.append(dPSI_comparison_alt_name)

    # The original printed a counter that started at 1 and therefore
    # overstated the number of subtypes by one.
    print("%s robust splicing subtypes identified in round %s" % (len(name), AnalysisRound))

    # The original guarded everything below with `counter>0`
    # ("changed to 0 to force NMF"); that counter started at 1, so the
    # guard was always true and the steps always ran.
    dire = export.findParentDir(full_PSI_InputFile)
    output_dir = dire + 'OncoInputs'
    if not os.path.exists(output_dir):
        export.createExportFolder(output_dir)

    output_file = output_dir + '/SVMInput-Round' + str(AnalysisRound) + '.txt'
    ExpandSampleClusters.filterRows(full_PSI_InputFile, output_file, filterDB=upd_guides, logData=False)
    header = ExpandSampleClusters.header_file(output_file)

    print("Running SVM prediction for improved subtypes - Round" + str(AnalysisRound))
    train = ExpandSampleClusters.TrainDataGeneration(output_file, BinarizedOutput, name)
    ### Original author's note: this is where we write the wrong version
    ExpandSampleClusters.Classify(header, train, output_file, [group], name, AnalysisRound)

    header = Correlationdepletion.header_file(NMFResult)
    output_file = output_dir + '/DepletionInput-Round' + str(AnalysisRound) + ".txt"
    sampleIndexSelection.filterFile(full_PSI_InputFile, output_file, header)

    print("Running Correlation Depletion - Round" + str(AnalysisRound))
    commonkeys, count = Correlationdepletion.FindCorrelations(NMFResult, output_file, name)
    Depleted = Correlationdepletion.DepleteSplicingevents(commonkeys, output_file, count, full_PSI_InputFile)
    full_PSI_InputFile = Depleted

    ### Indicates that K-means was not run - hence, another round of splice-ICGS should be performed
    flag = True
    return flag, full_PSI_InputFile
print "Subtype discovery stringency:",strategy dire = export.findParentDir(EventAnnot) if EnrichmentOnly==False: print 'PSI input files:',EventAnnot print 'Using a rho-cutoff of:',rho_cutoff if filters==True: ### Filter based on a default percentage of samples with detected PSI values EventAnnot,SampleNumber=filterPSIValues(EventAnnot,percentCutoff=percentCutoff,filterStatus=True) else: SampleNumber=filterPSIValues(EventAnnot,percentCutoff=percentCutoff,filterStatus=False) output_dir = dire+'ExpressionInput' export.createExportFolder(output_dir) full_PSI_InputFile=output_dir+"/exp.input.txt" header=header_list(EventAnnot) sampleIndexSelection.filterFile(EventAnnot,full_PSI_InputFile,header,FirstCol=False) ### Set Splice-ICGS defaults gsp = UI.GeneSelectionParameters(species,platform,platform) gsp.setNormalize('median') gsp.setGeneSelection('') gsp.setGeneSet('None Selected') gsp.setPathwaySelect([]) gsp.setJustShowTheseIDs('') gsp.setSampleDiscoveryParameters(ExpressionCutoff,CountsCutoff,FoldDiff,SamplesDiffering,removeOutliers, featurestoEvaluate,restrictBy,excludeCellCycle,column_metric,column_method,rho_cutoff) AnalysisRound=1
def CompleteWorkflow(InputFile, EventAnnot, rho_cutoff, strategy, seq, gsp, forceBroadClusters, turn):
    """ This function is used perform a single-iteration of the OncoSplice
    workflow (called from main), including the unsupervised splicing
    analysis (splice-ICGS) and signature depletion.

    ``rho_cutoff`` is accepted but not used here: block identification is
    called with fixed parameters. The function also reads the module-level
    names ``SampleNumber``, ``species``, ``mlp`` and ``force_broad_round1``.

    Returns ``(flag, InputFile, FilteredEventAnnot)``. ``flag`` is True only
    when NMF/SVM/depletion ran, in which case ``InputFile`` is the depleted
    file; otherwise ``flag`` is False and ``InputFile`` is unchanged.
    """
    ### Filter the EventAnnotation PSI file with non-depleted events from the prior round
    FilteredEventAnnot = filterEventAnnotation.FilterFile(InputFile, EventAnnot, turn)

    try:
        print("Running splice-ICGS for feature selection - Round" + str(turn))

        ### Reset the below variables which can be altered in prior rounds
        gsp.setGeneSelection('')
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect([])

        restore_samples_differing = False
        if forceBroadClusters == True:
            ### Find Broad clusters with at least 25% of all samples
            originalSamplesDiffering = gsp.SamplesDiffering()
            gsp.setSamplesDiffering(int(SampleNumber * 0.25))
            restore_samples_differing = True

        print('Number varying samples to identify: ' + str(gsp.SamplesDiffering()))
        try:
            graphic_links3 = RNASeq.singleCellRNASeqWorkflow(
                species, 'exons', InputFile, mlp,
                exp_threshold=0, rpkm_threshold=0, parameters=gsp)
        finally:
            # Restore even when splice-ICGS fails; the original left the
            # temporary value on the shared gsp object in that case.
            if restore_samples_differing:
                gsp.setSamplesDiffering(originalSamplesDiffering)

        Guidefile = graphic_links3[-1][-1]
        Guidefile = Guidefile[:-4] + '.txt'

        print("Running block identification for rank analyses - Round" + str(turn))
        ### Parameters are fixed as they are distinct
        RNASeq_blockIdentification.correlateClusteredGenesParameters(
            Guidefile, rho_cutoff=0.4, hits_cutoff=4, hits_to_report=50,
            ReDefinedClusterBlocks=True, filter=True)
        Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt'
        NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block, InputFile, turn)
    except Exception:
        print('UNKNOWN ERROR!!!!! Setting Rank=0')
        #print traceback.format_exc()
        Rank = 0

    flag = False
    run_kmeans = False  # both K-means fallbacks share the code at the bottom

    if Rank > 1:
        ### ADJUST THE RANKS - MUST UPDATE!!!!
        # Ranks above 2 are replaced by 30, except in round 1 when
        # force_broad_round1 is set, which keeps the detected rank.
        if not (turn == 1 and force_broad_round1):
            if Rank > 2:
                Rank = 30

        use_adjusted_p = (seq == "bulk")

        print("Running NMF analyses for dimension reduction using " + str(Rank) + " ranks - Round" + str(turn))
        NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis(
            NMFinput, Rank, turn, strategy)

        print("Running Metadata Analyses for finding differential splicing events")
        rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis(
            'Hs', FilteredEventAnnot, Metadata, 'PSI', 0.1,
            use_adjusted_p, 0.05, Annotation)
        Guidedir = rootdir + CovariateQuery

        # Module-level list, reset here and later passed to filterRows as
        # filterDB. NOTE(review): presumably filled by FindTopUniqueEvents
        # - confirm against that function.
        global upd_guides
        upd_guides = []

        name = []
        group = []
        for filename in os.listdir(Guidedir):
            if not filename.startswith("PSI."):
                continue
            comparison_file = os.path.join(Guidedir, filename)
            psi = filename.replace("PSI.", "")
            omitcluster = FindTopUniqueEvents(comparison_file, psi, Guidedir)
            if omitcluster == 0:
                group.append(len(group) + 1)  # 1-based group labels
                name.append(psi)

        # Same condition as the original `counter > 2` (counter started at 1).
        if len(name) >= 2:
            dire = export.findParentDir(InputFile)
            output_dir = dire + 'OncoInputs'
            if not os.path.exists(output_dir):
                export.createExportFolder(output_dir)

            output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt'
            ExpandSampleClusters.filterRows(InputFile, output_file, filterDB=upd_guides, logData=False)
            header = ExpandSampleClusters.header_file(output_file)

            print("Running SVM prediction for improved subtypes - Round" + str(turn))
            train = ExpandSampleClusters.TrainDataGeneration(output_file, BinarizedOutput, name)
            ExpandSampleClusters.Classify(header, train, output_file, [group], name, turn)

            header = Correlationdepletion.header_file(NMFResult)
            output_file = output_dir + '/DepletionInput-Round' + str(turn) + ".txt"
            sampleIndexSelection.filterFile(InputFile, output_file, header)

            print("Running Correlation Depletion - Round" + str(turn))
            commonkeys, count = Correlationdepletion.FindCorrelations(NMFResult, output_file, name)
            Depleted = Correlationdepletion.DepleteSplicingevents(commonkeys, output_file, count, InputFile)
            InputFile = Depleted
            flag = True
        else:
            run_kmeans = True
    elif Rank == 1:
        run_kmeans = True

    if run_kmeans:
        try:
            print("Running K-means analyses instead of NMF - Round" + str(turn))
            header = Kmeans.header_file(Guidefile_block)
            Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
        except Exception:
            print('WARNING!!!!! DID NOT RUN K-MEANS!!!!!')
            print(traceback.format_exc())

    return flag, InputFile, FilteredEventAnnot
def Classify(filename, Mutlabels=None, dire="", flag=True):
    """Write a column-ordered, transposed copy of a table and cluster it.

    ``filename`` is a tab-delimited table with one header row and a row ID
    in the first column. Columns are ranked by their sum (descending when
    ``flag`` is true, ascending otherwise), the rows are stably sorted by
    each column in that rank order, and the result is written transposed:
    the first output line is ``uid`` followed by the row IDs, and every
    following line is one input column.

    ``Mutlabels`` optionally maps a column header to a replacement label.
    When ``dire`` is given the output goes to ``<dire>Results/``, named
    Consolidated-Increasing.txt or Consolidated-Decreasing.txt depending on
    ``flag``; otherwise it goes next to the input as ``*-ordered.txt``.

    The written file is finally passed to ``clustering.runHCexplicit``.
    Nothing is returned.
    """
    # None instead of a shared mutable default dict.
    if Mutlabels is None:
        Mutlabels = {}

    if dire != "":
        output_dir = dire + 'Results'
        export.createExportFolder(output_dir)
        if flag:
            output_file = output_dir + "/Consolidated-Increasing" + ".txt"
        else:
            output_file = output_dir + "/Consolidated-Decreasing" + ".txt"
    else:
        output_file = filename[:-4] + "-ordered.txt"

    orderdict = OrderedDict()   # row id -> [row id, value, value, ...]
    countdict = OrderedDict()   # column index -> numeric values seen
    header = None
    with open(filename, 'rU') as input_handle:
        for line in input_handle:
            q = line.rstrip('\r\n').split('\t')
            if header is None:
                header = q
                continue
            if q[0] == "":
                continue
            # A repeated row id restarts that row, but its values still
            # accumulate in the column sums (as in the original).
            orderdict[q[0]] = [q[0]]
            for i in range(1, len(q)):
                try:
                    value = float(q[i])
                except Exception:
                    continue  # non-numeric cells are skipped
                orderdict[q[0]].append(value)
                countdict.setdefault(i, []).append(value)

    countlst = [sum(countdict[i]) for i in countdict]
    # B[r] is the 0-based position of the column with the r-th ranked sum.
    # The original inverted this permutation and then searched it with
    # list.index for every column; that lookup always equals B[r].
    B = sorted(range(len(countlst)), key=lambda x: countlst[x], reverse=flag)
    column_order = [position + 1 for position in B]  # +1 skips the id cell

    rows = list(orderdict.values())
    for column in column_order:
        rows = sorted(rows, key=itemgetter(column))  # successive stable sorts

    # Transpose exactly as the original did (array -> zip -> array), so
    # the values end up as the same string array.
    table = np.array(list(zip(*np.array(rows))))
    Z = np.array([table[0, :]] + [table[column, :] for column in column_order])
    n_rows, n_cols = Z.shape

    with open(output_file, 'w') as export_object:
        export_object.write("uid")
        for i in range(n_cols):
            export_object.write("\t" + Z[0][i])
        export_object.write("\n")
        for ij in range(1, n_rows):
            label = header[column_order[ij - 1]]
            if label in Mutlabels:
                export_object.write(Mutlabels[label])
            else:
                export_object.write(label)
            for jq in range(n_cols):
                export_object.write("\t" + str(Z[ij][jq]))
            export_object.write("\n")

    graphic_links = []
    row_method = None
    column_method = None
    column_metric = 'cosine'
    row_metric = 'cosine'
    color_gradient = 'yellow_black_blue'
    transpose = False
    graphic_links = clustering.runHCexplicit(
        output_file, graphic_links, row_method, row_metric, column_method,
        column_metric, color_gradient, transpose, display=False,
        Normalize=False)
# Sub-folders whose homerResults.html files are all parsed the same way.
_DENOVO_RESULT_DIRS = ("output1", "output2", "output3", "output4", "output5")


def _read_denovo_html(html_path, name):
    """Return ``[name;motif, p-value]`` pairs from one homerResults.html."""
    denolink = "file://" + str(html_path)
    html = urllib2.urlopen(denolink).read()
    soup = BeautifulSoup(html)
    cell_tag = re.compile('t[dh]')
    hits = []
    for table in soup.find_all('table'):
        for row in table.find_all('tr'):
            col = list(map(cell_text, row.find_all(cell_tag)))
            if col[2] == "P-value":
                continue  # header row of the table
            motname = col[7].split("(")[0]
            hits.append([name + ";" + motname, float(col[2])])
    return hits


def parseResultfolders(motifdir, GEdir, SFlist):
    """Merge motif, CLIP-seq and expression p-values into one table.

    For every sub-folder of ``motifdir`` (skipping names containing '._' or
    'Events') the following entries are read when present:

    * ``finalResults.tab``: column 12 is stored under the key built from
      the first three columns, tool ``Clipseq``.
    * ``output_TF_strand/knownResults.txt``: the 'P-value' column is stored
      per 'Motif Name', tool ``Cisbp_Actual``. A file that fails to parse
      is skipped, keeping whatever was stored before the failure.
    * ``output1`` .. ``output5`` ``/homerResults.html``: de-novo motifs.
      Per gene (the ':'-separated parts after the first in the motif name)
      only the hit with the lowest p-value across all folders is kept,
      tool ``Cisbp_denovo``.

    Files in ``GEdir`` whose name contains 'GE' (but not '._GE') contribute
    the 'adjp' column for every 'Symbol' listed in the first column of
    ``SFlist``, tool ``GE``.

    The merged table is written to ``MotifResults/Motifresults.txt`` under
    the parent of ``motifdir``, and that path is returned.
    """
    with open(SFlist, 'rU') as sf_handle:
        sfs = set(lin.rstrip('\r\n').split('\t')[0] for lin in sf_handle)

    mappingdict = defaultdict(list)
    allden = []
    for filename in os.listdir(motifdir):
        name = filename
        mapping = []
        if "._" not in filename and "Events" not in filename:
            fol = os.path.join(motifdir, filename)
            if os.path.isdir(fol):
                for filename3 in os.listdir(fol):
                    entry = os.path.join(fol, filename3)
                    if filename3 == "finalResults.tab":
                        with open(entry, 'rU') as clip_handle:
                            for lin in clip_handle:
                                q1 = lin.rstrip('\r\n').split('\t')
                                clipnam = q1[0] + ":" + q1[1] + ":" + q1[2]
                                mappingdict[name, clipnam, "Clipseq"] = q1[11]
                    elif filename3 == "output_TF_strand":
                        for filename4 in os.listdir(entry):
                            if filename4 != "knownResults.txt":
                                continue
                            try:
                                with open(os.path.join(entry, filename4), 'rU') as known_handle:
                                    motif = pval = None
                                    for line in known_handle:
                                        q1 = line.rstrip('\r\n').split('\t')
                                        if motif is None:
                                            # The first line names the columns.
                                            motif = q1.index('Motif Name')
                                            pval = q1.index('P-value')
                                            continue
                                        mappingdict[name, q1[motif], "Cisbp_Actual"] = q1[pval]
                            except Exception:
                                # Best effort, as in the original.
                                continue
                    elif filename3 in _DENOVO_RESULT_DIRS:
                        for filename4 in os.listdir(entry):
                            if filename4 == "homerResults.html":
                                mapping.extend(_read_denovo_html(os.path.join(entry, filename4), name))

        # Lowest p-value first, ties broken by name; same order as the
        # original's two consecutive stable sorts.
        mapping.sort(key=lambda x: (x[1], x[0]))
        seen = set()
        for label, pvalue in mapping:
            if label in seen:
                continue
            seen.add(label)
            mot = label.split(";")[1]
            genes = mot.split(":")[1:]
            allden.append([filename, mot, genes, pvalue])

    # Keep, per gene, the de-novo hit with the smallest p-value.
    final = {}
    for folder, mot, genes, pvalue in allden:
        for gene in genes:
            if gene not in final or pvalue < final[gene][1]:
                final[gene] = [folder, pvalue, mot]
    for gene in final:
        de = final[gene]
        mappingdict[de[0], de[2], "Cisbp_denovo"] = str(de[1])

    for filename in os.listdir(GEdir):
        if "GE" in filename and "._GE" not in filename:
            InputFile = os.path.join(GEdir, filename)
            name = filename.replace("GE.", "").replace("_vs_Others.txt", "")
            with open(InputFile, 'rU') as ge_handle:
                symbol = adjp = None
                for line in ge_handle:
                    q1 = line.rstrip('\r\n').split('\t')
                    if symbol is None:
                        # The first line names the columns.
                        symbol = q1.index('Symbol')
                        adjp = q1.index('adjp')
                        continue
                    if q1[symbol] in sfs:
                        mappingdict[name, q1[symbol], "GE"] = q1[adjp]

    dire = export.findParentDir(motifdir)
    output_dir = dire + 'MotifResults'
    export.createExportFolder(output_dir)
    output = output_dir + "/Motifresults.txt"
    with open(output, "w") as output1:
        #output1.write("signature"+"\t"+"gene"+"\t"+"tool"+"\t"+"p-value"+"\n")
        for name, gene, key in mappingdict:
            output1.write(name + "\t" + gene + "\t" + key + "\t" + mappingdict[name, gene, key] + "\n")
    return output
def Classify(header, Xobs, output_file, grplst, name, turn):
    """Train a LinearSVC on ``Xobs`` and score the samples in ``output_file``.

    ``output_file`` is a tab-delimited matrix with one header row and a row
    ID in the first column. Cells that do not parse as floats are replaced
    by the median of that row's numeric cells. The matrix is transposed so
    each file column becomes one observation to score.

    ``grplst`` holds the group labels (first list is used), ``name`` the
    group names and ``header`` the file's header row (sample names from
    index 1 on).

    Two files are written under ``SVMOutputs/``:
    ``round<turn>SVC_decision_func.txt`` with the raw decision values and
    ``round<turn>SVC_Results.txt`` with 0/1 calls. With more than two
    groups a call is 1 when the decision value is > 0; with two or fewer
    groups the calls are 1/0 above 0.5, 0/1 below -0.5 and 0/0 in between.
    The results file is then handed to ``Orderedheatmap.Classify``.
    """
    Y = []
    with open(output_file, 'rU') as matrix_handle:
        for row_index, line in enumerate(matrix_handle):
            if row_index == 0:
                continue  # header row
            cells = line.rstrip('\r\n').split('\t')[1:]
            parsed = []
            for cell in cells:
                try:
                    parsed.append(float(cell))
                except Exception:
                    parsed.append(None)
            # Median over the numeric cells only; used to fill the gaps.
            me = np.median([value for value in parsed if value is not None])
            Y.append([float(me) if value is None else value for value in parsed])

    # Double transpose, kept from the original.
    Xobs = np.array(list(zip(*Xobs)))
    Xobs = np.array(list(zip(*Xobs)))
    X = np.array(list(zip(*grplst)))
    Y = np.array(list(zip(*Y)))
    labels = X[:, 0]
    n_groups = len(labels)

    dire = export.findParentDir(export.findParentDir(export.findParentDir(output_file)[:-1])[:-1])
    output_dir = dire + 'SVMOutputs'
    if not os.path.exists(output_dir):
        export.createExportFolder(output_dir)
    exportnam1 = output_dir + '/round' + str(turn) + 'SVC_decision_func.txt'
    exportnam2 = output_dir + '/round' + str(turn) + 'SVC_Results.txt'

    regr = LinearSVC()
    regr.fit(Xobs, labels)
    # NOTE(review): the model is fitted a second time, as in the original.
    # Presumably each fit draws from the global NumPy RNG, so dropping one
    # could shift results of seeded runs - confirm before removing.
    prob_ = regr.fit(Xobs, labels).decision_function(Y)

    # Both handles are closed here; the original never closed the
    # decision-function file.
    with open(exportnam1, "w") as export_class1, open(exportnam2, "w") as export_class2:
        if n_groups > 2:
            export_class1.write("uid")
            export_class2.write("uid")
            for ni in name:
                sub = ni.split("_")[0]
                export_class1.write("\t" + "R" + str(turn) + "-" + sub)
                export_class2.write("\t" + "R" + str(turn) + "-" + sub)
            export_class1.write("\n")
            export_class2.write("\n")
            for iq in range(0, len(header) - 1):
                export_class1.write(header[iq + 1])
                export_class2.write(header[iq + 1])
                for jq in range(0, n_groups):
                    export_class1.write("\t" + str(prob_[iq][jq]))
                    if prob_[iq][jq] > 0:
                        export_class2.write("\t" + str(1))
                    else:
                        export_class2.write("\t" + str(0))
                export_class1.write("\n")
                export_class2.write("\n")
        else:
            export_class1.write("uid" + "\t")
            export_class2.write("uid" + "\t")
            export_class1.write("group")
            export_class2.write("R" + str(turn) + "-V1" + "\t" + "R" + str(turn) + "-V2")
            export_class1.write("\n")
            export_class2.write("\n")
            for iq in range(0, len(header) - 1):
                export_class1.write(header[iq + 1])
                export_class2.write(header[iq + 1])
                export_class1.write("\t" + str(prob_[iq]))
                if prob_[iq] > 0.5:
                    export_class2.write("\t" + str(1) + "\t" + str(0))
                elif prob_[iq] < -0.5:
                    export_class2.write("\t" + str(0) + "\t" + str(1))
                else:
                    export_class2.write("\t" + str(0) + "\t" + str(0))
                export_class1.write("\n")
                export_class2.write("\n")

    Orderedheatmap.Classify(exportnam2)
def FilterGuideGeneFile(Guidefile, Guidefile_block, expressionInputFile, iteration, platform, uniqueIDs, symbolIDs):
    """ Filters the original input expression file for Guide3 genes/events.
    Needed Since NMF only can deal with positive values [Guide3 has
    negative values].

    ``Guidefile_block`` is scanned to count how often the integer in its
    second column changes between consecutive data rows (only for guide
    files whose path contains 'Clustering'). ``Guidefile`` supplies the row
    IDs: an ID found in ``uniqueIDs`` is used as is, an ID found only in
    ``symbolIDs`` is translated through it, and any other ID is dropped.
    For platforms other than "PSI" the rank is further adjusted from the
    guide file itself (see the inline comments).

    Returns ``(output_file, rank_Count)`` where ``output_file`` is
    ``NMF-SVM/NMFInput-Round<iteration>.txt`` under the root directory.
    """
    root_dir = export.findParentDir(expressionInputFile)[:-1]
    if 'ExpressionInput' in root_dir:
        root_dir = export.findParentDir(root_dir)

    clustered = 'Clustering' in Guidefile
    count = 1 if clustered else 0

    # Initialised unconditionally: the original only assigned these in the
    # 'Clustering' branch, so other guide files could end in a
    # NameError/UnboundLocalError.
    rank_Count = 0
    prev = 0  # a leading block id of 0 is therefore not counted

    with open(Guidefile_block, 'rU') as block_handle:
        for row_index, line in enumerate(block_handle):
            if row_index <= count or not clustered:
                continue
            block_id = int(line.rstrip('\r\n').split('\t')[1])
            if block_id != prev:
                rank_Count += 1
                prev = block_id

    val = []
    head = 0
    with open(Guidefile, 'rU') as guide_handle:
        for line in guide_handle:
            q = line.rstrip('\r\n').split('\t')
            if head > count:
                uid = q[0]
                if uid not in uniqueIDs:
                    if uid in symbolIDs:
                        uid = symbolIDs[uid]
                        val.append(uid)
                    else:
                        # Unknown ID: skipped without advancing `head`,
                        # exactly as the original did.
                        continue
                # Translated IDs are thus appended twice, as before.
                val.append(uid)
                if platform != "PSI" and head == 2:
                    rank_Count = rank_Count + int(q[1])
                    print(rank_Count)
                head = head + 1
            else:
                head += 1
                if platform != "PSI" and q[0] == "column_clusters-flat":
                    # The last field of this header row replaces the rank.
                    rank_Count = int(q[-1])

    output_dir = root_dir + '/NMF-SVM'
    if not os.path.exists(output_dir):
        export.createExportFolder(output_dir)

    output_file = output_dir + '/NMFInput-Round' + str(iteration) + '.txt'
    filterRows(expressionInputFile, output_file, filterDB=val)
    return output_file, rank_Count
def CompleteWorkflow(InputFile, EventAnnot, turn, rho_cutoff, strategy, seq):
    """Run one round of the splice-ICGS / NMF / SVM / depletion workflow.

    A ``UI.GeneSelectionParameters`` object is configured with fixed
    defaults plus ``rho_cutoff``, splice-ICGS is run on ``InputFile`` and
    the resulting guide file determines the rank:

    * rank > 1: NMF is run with rank 2 in round 1, otherwise 30 when the
      detected rank exceeds 2 and 2 if not. If at least two comparisons
      pass ``FindTopUniqueEvents``, SVM classification and correlation
      depletion follow; otherwise K-means is run instead.
    * rank == 1: K-means is run.
    * rank == 0 (also used when splice-ICGS fails): nothing more is done.

    The module-level name ``mlp`` is read as well.

    Returns ``(flag, InputFile, FilteredEventAnnot)``; ``flag`` is True only
    when depletion ran, in which case ``InputFile`` is the depleted file.
    """
    species = "Hs"
    column_method = 'hopach'
    column_metric = 'euclidean'
    vendor = "RNASeq"
    array_type = "RNASeq"
    GeneSelection = ''
    GeneSetSelection = 'None Selected'
    PathwaySelection = []
    JustShowTheseIDs = ''
    excludeCellCycle = False
    restrictBy = 'protein_coding'
    featurestoEvaluate = 'Genes'
    ExpressionCutoff = 0
    CountsCutoff = 0
    FoldDiff = 1.2
    SamplesDiffering = 4
    removeOutliers = False

    gsp = UI.GeneSelectionParameters(species, array_type, vendor)
    gsp.setGeneSet(GeneSetSelection)
    gsp.setPathwaySelect(PathwaySelection)
    gsp.setGeneSelection(GeneSelection)
    gsp.setJustShowTheseIDs(JustShowTheseIDs)
    gsp.setNormalize('median')
    gsp.setSampleDiscoveryParameters(
        ExpressionCutoff, CountsCutoff, FoldDiff, SamplesDiffering,
        removeOutliers, featurestoEvaluate, restrictBy, excludeCellCycle,
        column_metric, column_method, rho_cutoff)

    FilteredEventAnnot = filterEventAnnotation.FilterFile(InputFile, EventAnnot, turn)

    try:
        print("Running splice-ICGS for feature selection - Round" + str(turn))
        graphic_links3 = RNASeq.singleCellRNASeqWorkflow(
            species, 'exons', InputFile, mlp,
            exp_threshold=0, rpkm_threshold=0, parameters=gsp)
        Guidefile = graphic_links3[-1][-1]
        Guidefile = Guidefile[:-4] + '.txt'

        print("Running block identification for rank analyses - Round" + str(turn))
        RNASeq_blockIdentification.correlateClusteredGenesParameters(
            Guidefile, rho_cutoff=0.4, hits_cutoff=4, hits_to_report=50,
            ReDefinedClusterBlocks=True, filter=True)
        Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt'
        NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block, InputFile, turn)
    except Exception:
        print('UNKNOWN ERROR!!!!!')
        print(traceback.format_exc())
        Rank = 0

    flag = False
    run_kmeans = False  # both K-means fallbacks share the code at the bottom

    if Rank > 1:
        if turn == 1:
            Rank = 2
        elif Rank > 2:
            Rank = 30
        else:
            Rank = 2
        use_adjusted_p = (seq == "bulk")

        # One line, same text as the original's two chained prints.
        print("Current turn: %s k = %s" % (turn, Rank))
        print("Running NMF analyses for dimension reduction using " + str(Rank) + " ranks - Round" + str(turn))
        NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis(
            NMFinput, Rank, turn, strategy)

        print("Running Metadata Analyses for finding differential splicing events")
        rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis(
            'Hs', FilteredEventAnnot, Metadata, 'PSI', 0.1,
            use_adjusted_p, 0.05, Annotation)
        Guidedir = rootdir + CovariateQuery

        # Module-level list, reset here and later passed to filterRows as
        # filterDB. NOTE(review): presumably filled by FindTopUniqueEvents
        # - confirm against that function.
        global upd_guides
        upd_guides = []

        name = []
        group = []
        for filename in os.listdir(Guidedir):
            if not filename.startswith("PSI."):
                continue
            comparison_file = os.path.join(Guidedir, filename)
            psi = filename.replace("PSI.", "")
            omitcluster = FindTopUniqueEvents(comparison_file, psi, Guidedir)
            if omitcluster == 0:
                group.append(len(group) + 1)  # 1-based group labels
                name.append(psi)

        # Same condition as the original `counter > 2` (counter started at 1).
        if len(name) >= 2:
            dire = export.findParentDir(InputFile)
            output_dir = dire + 'OncoInputs'
            if not os.path.exists(output_dir):
                export.createExportFolder(output_dir)

            output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt'
            ExpandSampleClusters.filterRows(InputFile, output_file, filterDB=upd_guides, logData=False)
            header = ExpandSampleClusters.header_file(output_file)

            print("Running SVM prediction for improved subtypes - Round" + str(turn))
            train = ExpandSampleClusters.TrainDataGeneration(output_file, BinarizedOutput, name)
            ExpandSampleClusters.Classify(header, train, output_file, [group], name, turn)

            header = Correlationdepletion.header_file(NMFResult)
            output_file = output_dir + '/DepletionInput-Round' + str(turn) + ".txt"
            sampleIndexSelection.filterFile(InputFile, output_file, header)

            print("Running Correlation Depletion - Round" + str(turn))
            commonkeys, count = Correlationdepletion.FindCorrelations(NMFResult, output_file, name)
            Depleted = Correlationdepletion.DepleteSplicingevents(commonkeys, output_file, count, InputFile)
            InputFile = Depleted
            flag = True
        else:
            run_kmeans = True
    elif Rank == 1:
        run_kmeans = True

    if run_kmeans:
        try:
            print("Running K-means analyses instead of NMF - Round" + str(turn))
            header = Kmeans.header_file(Guidefile_block)
            Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
        except Exception:
            print('WARNING!!!!! DID NOT RUN K-MEANS!!!!!')
            print(traceback.format_exc())

    return flag, InputFile, FilteredEventAnnot
def Enrichment(Inputfile, mutdict, mutfile, Expand, header):
    """Score every mutation against every sample cluster and export results.

    Clusters are read from ``Inputfile``. With ``Expand == "yes"`` the file
    is a 0/1 matrix (header row skipped) and a sample joins the cluster of
    each column holding "1"; otherwise each line assigns the sample in
    column 1 to the cluster named in column 3.

    For each mutation in ``mutdict`` (mutation -> samples) and each cluster
    the overlap is scored with ``Zscore`` and
    ``mappfinder.FishersExactTest``; any failure falls back to z = 0 and
    p = 1. ``mappfinder.adjustPermuteStats`` then updates the adjusted
    p-values per mutation. ``header`` supplies the full sample list.

    Two files are written under ``MutationEnrichment/``:
    ``Enrichment_Results.txt`` (every result) and
    ``Enrichment_tophits.txt`` (the first result per cluster, plus the
    first one passing the sensitivity/specificity thresholds).
    ``mutfile`` is not used.

    Returns a dict mapping cluster name -> mutation for clusters that have
    a hit passing the thresholds.
    """
    import mappfinder

    dire = export.findParentDir(Inputfile)
    output_dir = dire + 'MutationEnrichment'
    export.createExportFolder(output_dir)
    results_path = output_dir + '/Enrichment_Results.txt'
    tophits_path = output_dir + '/Enrichment_tophits.txt'

    group = defaultdict(list)
    if Expand == "yes":
        header2 = header_file(Inputfile, Expand="yes")
        with open(Inputfile, 'rU') as cluster_handle:
            for row_index, line in enumerate(cluster_handle):
                if row_index == 0:
                    continue  # header row
                q = line.rstrip('\r\n').split('\t')
                for i in range(1, len(q)):
                    if q[i] == str(1):
                        group[header2[i - 1]].append(q[0])
    else:
        with open(Inputfile, 'rU') as cluster_handle:
            for line in cluster_handle:
                q = line.rstrip('\r\n').split('\t')
                group[q[2]].append(q[0])

    N = float(len(header))
    total_Scores = {}
    for kiy in mutdict:
        if kiy == "MDP":
            print(mutdict[kiy])
        mutated = set(mutdict[kiy])  # built once per mutation
        R = float(len(mutated))
        for key2 in group:
            r = float(len(set(group[key2]) & mutated))  # mutated samples in cluster
            n = float(len(group[key2]))
            if r == 0 or R == 1.0:
                print("%s %s %s %s %s %s" % (kiy, key2, r, n, R, N))
                pval = float(1)
                z = float(0)
                null_z = 0.000
            else:
                try:
                    z = Zscore(r, n, N, R)
                except Exception:
                    z = 0.0000
                ### Calculate a Z-score assuming zero matching entries
                try:
                    null_z = Zscore(0, n, N, R)
                except Exception:
                    null_z = 0.000
                try:
                    pval = mappfinder.FishersExactTest(r, n, R, N)
                except Exception:
                    pval = 1.0
            zsd = mappfinder.ZScoreData(key2, r, R, z, null_z, n)
            zsd.SetP(pval)
            ### Necessary format for the permutation function
            total_Scores.setdefault(kiy, {})[key2] = zsd

    sorted_results = []
    mutlabels = {}
    for kiy in total_Scores:
        signature_db = total_Scores[kiy]
        ### Updates the adjusted p-value instances
        mappfinder.adjustPermuteStats(signature_db)
        for signature in signature_db:
            zsd = signature_db[signature]
            results = [kiy, signature, zsd.Changed(), zsd.Measured(), zsd.InPathway(),
                       str(float(zsd.PercentChanged()) / 100.0),
                       str(float(float(zsd.Changed()) / float(zsd.InPathway()))),
                       zsd.ZScore(), zsd.PermuteP(), zsd.AdjP()]
            sorted_results.append([signature, float(zsd.PermuteP()), results])
    sorted_results.sort()  ### Sort by signature, then p-value

    # Both files are closed on exit; the original never closed the
    # top-hits file.
    with open(results_path, "w") as export_enrich, open(tophits_path, "w") as export_hit:
        export_enrich.write("Mutations" + "\t" + "Cluster" + "\t" + "r" + "\t" + "R" + "\t" + "n" + "\t" + "Sensitivity" + "\t" + "Specificity" + "\t" + "z-score" + "\t" + "Fisher exact test" + "\t" + "adjp value" + "\n")
        prev = ""
        flag = False
        for (sig, p, values) in sorted_results:
            row = '\t'.join(values) + '\n'
            if sig != prev:
                # First (lowest p-value) result of each cluster.
                flag = True
                export_hit.write(row)
            if flag:
                if (float(values[5]) >= 0.5 and float(values[6]) >= 0.5) or float(values[5]) >= 0.6:
                    mutlabels[values[1]] = values[0]
                    flag = False
                    export_hit.write(row)
            export_enrich.write(row)
            prev = sig
        if len(sorted_results) == 0:
            # NOTE(review): the original put an undefined name
            # (`splicing_factor`) in the first field and would raise
            # NameError here; 'NONE' is used as the placeholder - confirm.
            export_enrich.write('\t'.join(['NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE']) + '\n')

    return mutlabels
def KmeansAnalysis(filename, header, InputFile, turn):
    """Split the samples into two K-means clusters and export the labels.

    ``filename`` is a tab-delimited file with two header rows; column 2 is
    a block ID and the values from column 3 on are floats. Rows are grouped
    by block ID. For each block the rows are transposed so every file
    column becomes one observation, which is clustered with
    ``KMeans(n_clusters=2, random_state=0)``.

    The result is written to ``SVMOutputs/R<turn>Kmeans_result.txt`` with
    one line per sample (names taken from ``header[2:]``). The cluster with
    fewer members is labelled 1 and the other 0; cluster 1 gets the label
    on a tie. Each block rewrites the same file, so the file ends up
    holding the result of the last block processed. Nothing is returned.
    """
    X = defaultdict(list)
    with open(filename, 'rU') as block_handle:
        for row_index, line in enumerate(block_handle):
            if row_index < 2:
                continue  # two header rows
            q = line.rstrip('\r\n').split('\t')
            X[q[1]].append([float(cell) for cell in q[2:]])

    for key in X:
        print(key)
        X[key] = np.array(X[key])
        print(X[key].shape)

        dire = export.findParentDir(export.findParentDir(InputFile)[:-1])
        output_dir = dire + 'SVMOutputs'
        if not os.path.exists(output_dir):
            export.createExportFolder(output_dir)
        exportname = output_dir + '/R' + str(turn) + 'Kmeans_result.txt'
        #exportname=filename[:-4]+key+'.txt'

        mat = np.array(list(zip(*X[key])))  # samples as rows
        print(mat.shape)
        kmeans = KMeans(n_clusters=2, random_state=0).fit(mat)
        y = kmeans.labels_.tolist()
        cent_1 = y.count(0)
        cent_2 = y.count(1)
        print("%s %s" % (cent_1, cent_2))

        # The smaller cluster is reported as "1".
        minority = 0 if cent_1 < cent_2 else 1
        group = 'R' + str(turn) + '_Kmeans'
        # Closed on exit; the original never closed this file.
        with open(exportname, "w") as export_results:
            export_results.write("uid" + "\t" + group + "\n")
            for offset, label in enumerate(y):
                call = "1" if label == minority else "0"
                export_results.write(header[offset + 2] + "\t" + call + "\n")
def Classify(header,Xobs,output_file,grplst,name,turn,platform,output_dir,root_dir):
    """Fit a LinearSVC on Xobs/grplst and assign each column of output_file to a cluster.

    Parameters:
        header:      sequence of identifiers; header[1:] are used as the row ids of
                     every export and are paired, in order, with the predictions.
        Xobs:        training observations passed to LinearSVC.fit (round-tripped
                     through zip() twice, so the orientation is unchanged).
        output_file: tab-delimited matrix to classify. Its first line is skipped
                     and column 0 of each row is ignored.
        grplst:      transposed below; column 0 of the result is used as the
                     training labels.
        name:        cluster names; a predicted label i is mapped to name[int(i)-1].
        turn:        round identifier embedded in file names and column headers.
        platform:    "PSI" selects the PSI-specific result file and layout.
        output_dir:  '/SVMOutputs' is appended and the folder is created if missing.
        root_dir:    '/ICGS-NMF' is appended and the folder is created if missing.

    Files written:
        <output_dir>/SVMOutputs/round<turn>SVC_decision_func.txt  decision values
        <output_dir>/SVMOutputs/round<turn>SVC_Results_max.txt    0/1 membership
        <output_dir>/SVMOutputs/round<turn>SVC_Results.txt        (platform == "PSI")
        <root_dir>/ICGS-NMF/FinalGroups.txt                       (any other platform)

    Finally Orderedheatmap.Classify is called on the SVC_Results file for "PSI"
    and on the SVC_Results_max file otherwise. Returns None.
    """
    count=0
    start=1
    Y=[]
    head=0
    ### Parse the matrix to classify (the first line is treated as a header and skipped)
    for line in open(output_file,'rU').xreadlines():
        if head >count:
            val=[]
            counter2=0
            val2=[]
            me=0.0
            line=line.rstrip('\r\n')
            q= string.split(line,'\t')
            ### First pass: keep only the entries of this row that parse as floats
            for i in range(start,len(q)):
                try:
                    val2.append(float(q[i]))
                except Exception:
                    continue
            me=np.median(val2)
            ### Second pass: entries that do not parse are replaced by the row median
            for i in range(start,len(q)):
                try:
                    val.append(float(q[i]))
                except Exception:
                    val.append(float(me))
            #if q[1]==prev:
            Y.append(val)
        else:
            head+=1
            continue
    ### Transposing twice leaves Xobs in its original orientation, as a numpy array
    Xobs=zip(*Xobs)
    Xobs=np.array(Xobs)
    Xobs=zip(*Xobs)
    Xobs=np.array(Xobs)
    ### Column 0 of the transposed group list supplies the training labels
    X=grplst
    X=zip(*X)
    X=np.array(X)
    #print X
    ### Transpose so that each column of the parsed file becomes one observation to predict
    Y=zip(*Y)
    Y=np.array(Y)
    #np.savetxt("/Volumes/MyPassport/Users/saljh8/Desktop/dataAnalysis/SalomonisLab/Leucegene/July-2017/PSI/ExpressionProfiles/DataPlots/complete_KNN.txt",q)
    #if platform=="PSI":
    #else:
    output_dir = output_dir+'/SVMOutputs'
    output_dir2 = root_dir+'/ICGS-NMF'
    if os.path.exists(output_dir)==False:
        export.createExportFolder(output_dir)
    if os.path.exists(output_dir2)==False:
        export.createExportFolder(output_dir2)
    #exportnam=output_dir+'/round'+str(turn)+'SVC_test_50cor.txt'
    #export_class=open(exportnam,"w")
    exportnam1=output_dir+'/round'+str(turn)+'SVC_decision_func.txt'
    export_class1=open(exportnam1,"w")
    if platform=="PSI":
        exportnam2=output_dir+'/round'+str(turn)+'SVC_Results.txt'
        export_class2=open(exportnam2,"w")
    else:
        exportnam2=output_dir2+'/FinalGroups.txt'
        export_class2=open(exportnam2,"w")
    exportnam3=output_dir+'/round'+str(turn)+'SVC_Results_max.txt'
    export_class3=open(exportnam3,"w")
    #export_class2.write("uid"+"\t"+"group"+"\t"+"class"+"\n")
    regr = LinearSVC()
    regr.fit(Xobs,X[:,0])
    q=regr.predict(Y)
    #print q
    count=1
    ordersamp={}
    order=[]
    ### Record, per identifier, the predicted cluster name and the label as a string
    for i in q:
        ### gr is only referenced by the commented-out writes below
        gr=string.split(name[int(i)-1],"_")[0]
        gr=gr.replace("V","")
        #export_class2.write(header[count]+"\t"+str(i)+"\t"+name[int(i)-1]+"\n")
        # export_class2.write(header[count]+"\t"+str(i)+"\t"+gr+"\n")
        ordersamp[header[count]]=[name[int(i)-1],str(i)]
        count+=1
    #print len(X[:,0])
    if len(X[:,0])>2:
        ### More than two training labels: prob_ is indexed as [observation][cluster] below
        prob_=regr.fit(Xobs,X[:,0]).decision_function(Y)
        #k=list(prob_)
        export_class1.write("uid")
        #export_class2.write("uid")
        export_class3.write("uid")
        for ni in name:
            export_class1.write("\t"+"R"+str(turn)+"-"+ni)
            #export_class2.write("\t"+"R"+str(turn)+"-"+ni)
            export_class3.write("\t"+"R"+str(turn)+"-"+ni)
        export_class1.write("\n")
        #export_class2.write("\n")
        export_class3.write("\n")
        #print prob_
        for iq in range(0,len(header)-1):
            export_class1.write(header[iq+1])
            #export_class2.write(header[iq+1])
            export_class3.write(header[iq+1])
            for jq in range(0,len(name)):
                export_class1.write("\t"+str(prob_[iq][jq]))
                #print prob_[iq][jq],'\t',max(prob_[iq,:])
                ### Only the best-scoring cluster of an observation can be flagged 1
                if prob_[iq][jq]==max(prob_[iq,:]):
                    #print ordersamp[header[iq+1]],name[jq]
                    if ordersamp[header[iq+1]][0]==name[jq]:
                        if max(prob_[iq,:])>0: ### Increase this value to increase SVM alignment specificity
                            class_assignment = 1
                            order.append([header[iq+1],name[jq],prob_[iq][jq],ordersamp[header[iq+1]][1]])
                        else:
                            class_assignment = 0 ### The best match is poor, hence, the cell will be excluded from final results!!!
                    ### NOTE(review): if the predicted cluster differs from name[jq], class_assignment
                    ### keeps its value from an earlier iteration (or is unbound on first use) - confirm intent
                    export_class3.write("\t"+str(class_assignment))
                else:
                    export_class3.write("\t"+str(0))
            export_class1.write("\n")
            #export_class2.write("\n")
            export_class3.write("\n")
        export_class1.close()
        export_class3.close()
    else:
        ### Two (or fewer) training labels: prob_ is indexed as a single value per observation
        if platform=="PSI":
            prob_=regr.fit(Xobs,X[:,0]).decision_function(Y)
            #k=list(prob_)
            export_class1.write("uid"+"\t")
            export_class2.write("uid"+"\t")
            export_class1.write("group")
            export_class2.write("round"+str(turn)+"-V1"+"\t"+"round"+str(turn)+"-V2"+"\n")
            #for ni in name:
            #   export_class1.write("\t"+ni)
            #   export_class2.write("\t"+ni)
            export_class1.write("\n")
            ### NOTE(review): the header line above already ends with "\n", so this adds an empty line
            export_class2.write("\n")
            #print prob_
            #export_class1.write(header[1])
            #export_class2.write(header[1])
            for iq in range(0,len(header)-1):
                export_class1.write(header[iq+1])
                export_class2.write(header[iq+1])
                #for jq in range(0,len(X[:,0])):
                export_class1.write("\t"+str(prob_[iq]))
                ### Scores within [-0.5, 0.5] are assigned to neither of the two columns
                if prob_[iq]>0.5:
                    export_class2.write("\t"+str(1)+"\t"+str(0))
                else:
                    if prob_[iq]<-0.5:
                        export_class2.write("\t"+str(0)+"\t"+str(1))
                    else:
                        export_class2.write("\t"+str(0)+"\t"+str(0))
                export_class1.write("\n")
                export_class2.write("\n")
        else:
            prob_=regr.fit(Xobs,X[:,0]).decision_function(Y)
            #k=list(prob_)
            export_class1.write("uid")
            #export_class2.write("uid")
            export_class3.write("uid")
            for ni in name:
                export_class1.write("\t"+"R"+str(turn)+"-"+ni)
                #export_class2.write("\t"+"R"+str(turn)+"-"+ni)
                export_class3.write("\t"+"R"+str(turn)+"-"+ni)
            export_class1.write("\n")
            #export_class2.write("\n")
            export_class3.write("\n")
            #print prob_
            for iq in range(0,len(header)-1):
                export_class1.write(header[iq+1])
                #export_class2.write(header[iq+1])
                export_class3.write(header[iq+1])
                # for jq in range(0,len(name)):
                export_class1.write("\t"+str(prob_[iq]))
                if prob_[iq]>0.0:
                    #print ordersamp[header[iq+1]],name[jq]
                    ### NOTE(review): jq is never assigned in this branch (its loop is commented out),
                    ### so this comparison raises NameError unless jq exists as a global - confirm
                    if ordersamp[header[iq+1]][0]==name[jq]:
                        order.append([header[iq+1],name[jq],prob_[iq],ordersamp[header[iq+1]][1]])
                    export_class3.write("\t"+str(1))
                else:
                    export_class3.write("\t"+str(0))
                export_class1.write("\n")
                #export_class2.write("\n")
                export_class3.write("\n")
        export_class1.close()
        export_class3.close()
    ### Two stable sorts: rows end up grouped by cluster name, best score first within each cluster
    order = sorted(order, key = operator.itemgetter(2),reverse=True)
    order = sorted(order, key = operator.itemgetter(1))
    for i in range(len(order)):
        #export_class2.write(order[i][0]+"\t"+order[i][3]+"\t"+order[i][1]+"\n")
        ### Third column: text of the cluster name before the first "_" with every "V" removed
        gr=string.split(order[i][1],"_")[0]
        gr=gr.replace("V","")
        #export_class2.write(header[count]+"\t"+str(i)+"\t"+name[int(i)-1]+"\n")
        export_class2.write(order[i][0]+"\t"+order[i][3]+"\t"+gr+"\n")
    export_class2.close()
    if platform=="PSI":
        Orderedheatmap.Classify(exportnam2)
    else:
        Orderedheatmap.Classify(exportnam3)
def Enrichment(Inputfile,mutdict,mutfile,metaDataMatrixFormat,header): import collections import mappfinder X=defaultdict(list) prev="" head=0 group=defaultdict(list) enrichdict=defaultdict(float) mut=export.findFilename(mutfile) dire=export.findParentDir(Inputfile) output_dir = dire+'MutationEnrichment' print output_dir export.createExportFolder(output_dir) number_of_samples = 0 ### All enrichment results exportnam=output_dir+'/Enrichment_Results.txt' export_enrich=open(exportnam,"w") ### Selected Enrichment results based on p-value, sensitivity and specificity for association with cluster names exportnam=output_dir+'/Enrichment_tophits.txt' export_hit=open(exportnam,"w") header = "Mutations"+"\t"+"Cluster"+"\t"+"r"+"\t"+"R"+"\t"+"n"+"\t"+"Sensitivity"+"\t"+"Specificity"+"\t"+"z-score"+"\t"+"Fisher exact test"+"\t"+"adjp value"+"\n" export_enrich.write(header) export_hit.write(header) header2=returnSamplesInMetaData(Inputfile,metaDataMatrixFormat=True) print header2 for line in open(Inputfile,'rU').xreadlines(): if head > 0: number_of_samples+=1 line=line.rstrip('\r\n') q = string.split(line,'\t') for i in range(1,len(q)): if q[i]==str(1): #group[q[0]].append(header2[i-1]) group[header2[i-1]].append(q[0]) ### [Cluster] = [full_sample_ID] else: head+=1 continue print 'Number of patient samples in dataset =',number_of_samples total_Scores={} for kiy in mutdict: if kiy =="MDP": print mutdict[kiy] groupdict={} remaining=[] remaining=list(set(header) - set(mutdict[kiy])) groupdict[1]=mutdict[kiy] groupdict[2]=remaining #export_enrich1.write(kiy) for key2 in group: r=float(len(list(set(group[key2])))-len(list(set(group[key2]) - set(mutdict[kiy])))) n=float(len(group[key2])) R=float(len(set(mutdict[kiy]))) N=float(number_of_samples) if r==0 or key2=="1" or R==1.0: #print kiy,key2,r,n,R,N pval=float(1) z=float(0) null_z = 0.000 zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n) zsd.SetP(pval) else: try: z = Zscore(r,n,N,R) except: z=0 ### Calculate a Z-score assuming zero 
matching entries try: null_z = Zscore(0,n,N,R) except Exception: null_z = 0.000 try: pval = mappfinder.FishersExactTest(r,n,R,N) zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n) zsd.SetP(pval) except Exception: pval=1.0 zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n) zsd.SetP(pval) #pass if kiy in total_Scores: signature_db = total_Scores[kiy] signature_db[key2]=zsd ### Necessary format for the permutation function else: signature_db={key2:zsd} total_Scores[kiy] = signature_db sorted_results=[] mutlabels={} for kiy in total_Scores: signature_db = total_Scores[kiy] ### Updates the adjusted p-value instances mappfinder.adjustPermuteStats(signature_db) for signature in signature_db: zsd = signature_db[signature] results = [kiy,signature,zsd.Changed(),zsd.Measured(),zsd.InPathway(),str(float(zsd.PercentChanged())/100.0),str(float(float(zsd.Changed())/float(zsd.InPathway()))), zsd.ZScore(), zsd.PermuteP(), zsd.AdjP()] #string.join(zsd.AssociatedIDs(),'|') sorted_results.append([signature,-1*float(zsd.ZScore()),results]) sorted_results.sort() ### Sort z-score prev="" for (sig,p,values) in sorted_results: if sig!=prev: flag=True export_hit.write(string.join(values,'\t')+'\n') if flag: ### Update the cluster label to include the top enriched term meeting, sensitivity and specificity cutoffs #print values[5],values[6],values[6],values[2]; sys.exit() if (float(values[5])>=0.2 and float(values[6])>=0.2 and float(values[7])>=1.95 and float(values[2])>=2): clusterID = values[1] topEnrichedTerm=values[0] mutlabels[clusterID]=clusterID+' ('+topEnrichedTerm+')' flag=False export_hit.write(string.join(values,'\t')+'\n') export_enrich.write(string.join(values,'\t')+'\n') prev=sig if len(sorted_results)==0: export_enrich.write(string.join([splicing_factor,'NONE','NONE','NONE','NONE','NONE','NONE'],'\t')+'\n') export_enrich.close() return mutlabels