def probesetSummarize(exp_file_location_db, analyze_metaprobesets, probeset_type, species, root):
    """Run Affymetrix Power Tools (APT) ``apt-probeset-summarize`` over the CEL
    files described by each dataset in ``exp_file_location_db``, copying the RMA
    summary into the dataset's expression file and (when the MAS5 detection step
    succeeds) its detection p-values into the stats file.

    NOTE(review): this chunk shows only the CDF (3' array) branch of the
    function; the PGF/CLF branch that normally follows appears elsewhere in
    this file as detached fragments — confirm against the full source.
    Indentation below was reconstructed from a collapsed one-line dump; the
    tokens are unchanged, but the nesting should be verified against the
    original file.
    """
    for dataset in exp_file_location_db:  ### Instance of the Class ExpressionFileLocationData
        fl = exp_file_location_db[dataset]
        # Locations of the APT binaries and the Affymetrix library files for this dataset
        apt_dir = fl.APTLocation()
        array_type = fl.ArrayType()
        pgf_file = fl.InputCDFFile()
        clf_file = fl.CLFFile()
        bgp_file = fl.BGPFile()
        xhyb_remove = fl.XHybRemoval()
        cel_dir = fl.CELFileDir() + '/cel_files.txt'  # APT reads CEL paths from this listing file
        expression_file = fl.ExpFile()
        stats_file = fl.StatsFile()
        output_dir = fl.OutputDir() + '/APT-output'
        cache_dir = output_dir + '/apt-probeset-summarize-cache'
        architecture = fl.Architecture()  ### May over-ride the real architecture if a failure occurs
        get_probe_level_results = 'yes'
        if get_probe_level_results == 'yes':
            export_features = 'yes'
        if xhyb_remove == 'yes' and (array_type == 'gene' or array_type == 'junction'):
            xhyb_remove = 'no'  ### This is set when the user mistakenly selects exon array, initially
        if analyze_metaprobesets == 'yes':
            export_features = 'true'
            metaprobeset_file = filepath('AltDatabase/' + species + '/' + array_type + '/' + species + '_' + array_type + '_' + probeset_type + '.mps')
            count = verifyFileLength(metaprobeset_file)
            if count < 2:
                # .mps file is missing or empty - build it for this database version
                from build_scripts import ExonArray
                ExonArray.exportMetaProbesets(array_type, species)  ### Export metaprobesets for this build
        import subprocess
        import platform
        print 'Processor architecture set =', architecture, platform.machine()
        # Select the bundled APT binary that matches the current OS/architecture
        if '/bin' in apt_dir:
            apt_file = apt_dir + '/apt-probeset-summarize'  ### if the user selects an APT directory
        elif os.name == 'nt':
            if '32bit' in architecture:
                apt_file = apt_dir + '/PC/32bit/apt-probeset-summarize'
                plat = 'Windows'
            elif '64bit' in architecture:
                apt_file = apt_dir + '/PC/64bit/apt-probeset-summarize'
                plat = 'Windows'
        elif 'darwin' in sys.platform:
            apt_file = apt_dir + '/Mac/apt-probeset-summarize'
            plat = 'MacOSX'
        elif 'linux' in sys.platform:
            if '32bit' in platform.architecture():
                apt_file = apt_dir + '/Linux/32bit/apt-probeset-summarize'
                plat = 'linux32bit'
            elif '64bit' in platform.architecture():
                apt_file = apt_dir + '/Linux/64bit/apt-probeset-summarize'
                plat = 'linux64bit'
        apt_file = filepath(apt_file)
        apt_extract_file = string.replace(apt_file, 'probeset-summarize', 'cel-extract')
        #print 'AltAnalyze has choosen APT for',plat
        print "Beginning probeset summarization of input CEL files with Affymetrix Power Tools (APT)..."
        if 'cdf' in pgf_file or 'CDF' in pgf_file:  # 3' array supplied as a CDF library file
            if xhyb_remove == 'yes' and array_type == 'AltMouse':
                kill_list_dir = osfilepath('AltDatabase/' + species + '/AltMouse/' + species + '_probes_to_remove.txt')
            else:
                kill_list_dir = osfilepath('AltDatabase/affymetrix/APT/probes_to_remove.txt')
            try:
                ### Below code attempts to calculate probe-level summarys and absent/present p-values
                ### for 3'arrays (may fail for arrays with missing missmatch probes - AltMouse)
                cdf_file = pgf_file
                algorithm = 'rma'
                retcode = subprocess.call([
                    apt_file, "-d", cdf_file, "--kill-list", kill_list_dir,
                    "-a", algorithm, "-o", output_dir, "--cel-files", cel_dir,
                    "-a", "pm-mm,mas5-detect.calls=1.pairs=1"])
                try:
                    extract_retcode = subprocess.call([
                        apt_extract_file, "-d", cdf_file, "--pm-with-mm-only",
                        "-o", output_dir + '/probe.summary.txt',
                        "--cel-files", cel_dir, "-a"])  ### "quant-norm,pm-gcbg", "--report-background" -requires a BGP file
                except Exception, e:
                    #print traceback.format_exc()
                    retcode = False  ### On some system there is a no file found error, even when the analysis completes correctly
                if retcode:  # non-zero exit code from APT indicates failure
                    status = 'failed'
                else:
                    status = 'run'
                    summary_exp_file = output_dir + '/' + algorithm + '.summary.txt'
                    export.customFileCopy(summary_exp_file, expression_file)  ### Removes the # containing lines
                    #shutil.copyfile(summary_exp_file, expression_file)
                    os.remove(summary_exp_file)
                    summary_stats_file = output_dir + '/pm-mm.mas5-detect.summary.txt'
                    try:
                        shutil.copyfile(summary_stats_file, stats_file)
                    except Exception:
                        None  ### Occurs if dabg export failed
                    os.remove(summary_stats_file)
            except Exception:
                #print traceback.format_exc()
                # Fallback: re-run plain RMA without the MAS5 detection step
                # (e.g. arrays lacking mismatch probes, such as AltMouse)
                try:
                    cdf_file = pgf_file
                    algorithm = 'rma'
                    pval = 'dabg'
                    retcode = subprocess.call([
                        apt_file, "-d", cdf_file, "--kill-list", kill_list_dir,
                        "-a", algorithm, "-o", output_dir, "--cel-files", cel_dir])  # "-a", pval,
                    if retcode:
                        status = 'failed'
                    else:
                        status = 'run'
                        summary_exp_file = output_dir + '/' + algorithm + '.summary.txt'
                        export.customFileCopy(summary_exp_file, expression_file)  ### Removes the # containing lines
                        #shutil.copyfile(summary_exp_file, expression_file)
                        os.remove(summary_exp_file)
                except NameError:
                    status = 'failed'
# NOTE(review): orphaned fragment — this line resumes mid-way through an
# apt-probeset-summarize argument list whose opening call is not visible in
# this chunk (it presumably belongs to the PGF/CLF branch of
# probesetSummarize — TODO confirm against the full source). `bad_exit` is an
# undefined name, apparently used to deliberately raise NameError and abort
# the branch. The line also ends mid-statement (`string.replace(` is left
# unfinished). Kept byte-identical pending recovery of the missing context.
"-b", bgp_file, "--kill-list", kill_list_dir, "-m", metaprobeset_file, "-a", algorithm, "-o", output_dir, "--cel-files", cel_dir, "--feat-details", export_features ] ) ### Exclude DABG p-value - known issue for Glue junction array else: bad_exit if retcode: status = 'failed' else: status = 'run' summary_exp_file = output_dir + '/' + algorithm + '.summary.txt' #if analyze_metaprobesets == 'yes': annotateMetaProbesetGenes(summary_exp_file, expression_file, metaprobeset_file, species) export.customFileCopy( summary_exp_file, expression_file) ### Removes the # containing lines #shutil.copyfile(summary_exp_file, expression_file) os.remove(summary_exp_file) summary_exp_file = output_dir + '/' + pval + '.summary.txt' #if analyze_metaprobesets == 'yes': annotateMetaProbesetGenes(summary_exp_file, stats_file, metaprobeset_file, species) try: shutil.copyfile(summary_exp_file, stats_file) os.remove(summary_exp_file) except Exception: print traceback.format_exc() null = [] ### Occurs if dabg export failed if analyze_metaprobesets == 'yes': residual_destination_file = string.replace(
def probesetSummarize(exp_file_location_db,analyze_metaprobesets,probeset_type,species,root):
    """Summarize CEL files to probeset expression values via APT
    ``apt-probeset-summarize`` (RMA, plus MAS5 detection calls when possible).

    NOTE(review): this is a second, collapsed copy of the same function that
    appears at the top of this chunk — the two differ only in the ExonArray
    import (`import ExonArray` here vs `from build_scripts import ExonArray`
    above); likely two snapshots of the same file concatenated together.
    Only the CDF (3' array) branch is visible. Indentation reconstructed from
    a one-line dump with tokens unchanged; verify nesting against the
    original file.
    """
    for dataset in exp_file_location_db:  ### Instance of the Class ExpressionFileLocationData
        fl = exp_file_location_db[dataset]
        # APT binary location plus the Affymetrix library files for this dataset
        apt_dir = fl.APTLocation()
        array_type = fl.ArrayType()
        pgf_file = fl.InputCDFFile()
        clf_file = fl.CLFFile()
        bgp_file = fl.BGPFile()
        xhyb_remove = fl.XHybRemoval()
        cel_dir = fl.CELFileDir() + '/cel_files.txt'  # text file listing the CEL paths for APT
        expression_file = fl.ExpFile()
        stats_file = fl.StatsFile()
        output_dir = fl.OutputDir() + '/APT-output'
        cache_dir = output_dir + '/apt-probeset-summarize-cache'
        architecture = fl.Architecture()  ### May over-ride the real architecture if a failure occurs
        get_probe_level_results = 'yes'
        if get_probe_level_results == 'yes':
            export_features = 'yes'
        if xhyb_remove == 'yes' and (array_type == 'gene' or array_type == 'junction'):
            xhyb_remove = 'no'  ### This is set when the user mistakenly selects exon array, initially
        if analyze_metaprobesets == 'yes':
            export_features = 'true'
            metaprobeset_file = filepath('AltDatabase/'+species+'/'+array_type+'/'+species+'_'+array_type+'_'+probeset_type+'.mps')
            count = verifyFileLength(metaprobeset_file)
            if count < 2:
                # .mps metaprobeset file missing/empty - export it for this build
                import ExonArray
                ExonArray.exportMetaProbesets(array_type, species)  ### Export metaprobesets for this build
        import subprocess
        import platform
        print 'Processor architecture set =', architecture, platform.machine()
        # Choose the bundled APT executable for the detected OS/architecture
        if '/bin' in apt_dir:
            apt_file = apt_dir + '/apt-probeset-summarize'  ### if the user selects an APT directory
        elif os.name == 'nt':
            if '32bit' in architecture:
                apt_file = apt_dir + '/PC/32bit/apt-probeset-summarize'
                plat = 'Windows'
            elif '64bit' in architecture:
                apt_file = apt_dir + '/PC/64bit/apt-probeset-summarize'
                plat = 'Windows'
        elif 'darwin' in sys.platform:
            apt_file = apt_dir + '/Mac/apt-probeset-summarize'
            plat = 'MacOSX'
        elif 'linux' in sys.platform:
            if '32bit' in platform.architecture():
                apt_file = apt_dir + '/Linux/32bit/apt-probeset-summarize'
                plat = 'linux32bit'
            elif '64bit' in platform.architecture():
                apt_file = apt_dir + '/Linux/64bit/apt-probeset-summarize'
                plat = 'linux64bit'
        apt_file = filepath(apt_file)
        apt_extract_file = string.replace(apt_file, 'probeset-summarize', 'cel-extract')
        #print 'AltAnalyze has choosen APT for',plat
        print "Beginning probeset summarization of input CEL files with Affymetrix Power Tools (APT)..."
        if 'cdf' in pgf_file or 'CDF' in pgf_file:  # 3' array supplied as a CDF library file
            if xhyb_remove == 'yes' and array_type == 'AltMouse':
                kill_list_dir = osfilepath('AltDatabase/'+species+'/AltMouse/'+species+'_probes_to_remove.txt')
            else:
                kill_list_dir = osfilepath('AltDatabase/affymetrix/APT/probes_to_remove.txt')
            try:
                ### Below code attempts to calculate probe-level summarys and absent/present p-values
                ### for 3'arrays (may fail for arrays with missing missmatch probes - AltMouse)
                cdf_file = pgf_file
                algorithm = 'rma'
                retcode = subprocess.call([
                    apt_file, "-d", cdf_file, "--kill-list", kill_list_dir,
                    "-a", algorithm, "-o", output_dir, "--cel-files", cel_dir,
                    "-a", "pm-mm,mas5-detect.calls=1.pairs=1"])
                try:
                    extract_retcode = subprocess.call([
                        apt_extract_file, "-d", cdf_file, "--pm-with-mm-only",
                        "-o", output_dir+'/probe.summary.txt',
                        "--cel-files", cel_dir, "-a"])  ### "quant-norm,pm-gcbg", "--report-background" -requires a BGP file
                except Exception,e:
                    #print traceback.format_exc()
                    retcode = False  ### On some system there is a no file found error, even when the analysis completes correctly
                if retcode:  # non-zero APT exit status
                    status = 'failed'
                else:
                    status = 'run'
                    summary_exp_file = output_dir+'/'+algorithm+'.summary.txt'
                    export.customFileCopy(summary_exp_file, expression_file)  ### Removes the # containing lines
                    #shutil.copyfile(summary_exp_file, expression_file)
                    os.remove(summary_exp_file)
                    summary_stats_file = output_dir+'/pm-mm.mas5-detect.summary.txt'
                    try:
                        shutil.copyfile(summary_stats_file, stats_file)
                    except Exception:
                        None  ### Occurs if dabg export failed
                    os.remove(summary_stats_file)
            except Exception:
                #print traceback.format_exc()
                # Fallback: plain RMA without the MAS5 detection step
                try:
                    cdf_file = pgf_file
                    algorithm = 'rma'
                    pval = 'dabg'
                    retcode = subprocess.call([
                        apt_file, "-d", cdf_file, "--kill-list", kill_list_dir,
                        "-a", algorithm, "-o", output_dir, "--cel-files", cel_dir])  # "-a", pval,
                    if retcode:
                        status = 'failed'
                    else:
                        status = 'run'
                        summary_exp_file = output_dir+'/'+algorithm+'.summary.txt'
                        export.customFileCopy(summary_exp_file, expression_file)  ### Removes the # containing lines
                        #shutil.copyfile(summary_exp_file, expression_file)
                        os.remove(summary_exp_file)
                except NameError:
                    status = 'failed'
def NMFAnalysis(expressionInputFile,NMFinputDir,Rank,platform,iteration=0,strategy="conservative"):
    """Factor the expression matrix stored in ``NMFinputDir`` with sparse NMF
    (``nimfa.Snmf``) at rank ``Rank`` and export the basis matrix plus several
    binarized sample-to-component assignment files under
    ``<root_dir>/NMF-SVM/``.

    Returns the four export paths
    (version, binary, metadata, annotation file).

    NOTE(review): indentation below was reconstructed from a collapsed
    one-line dump; tokens are unchanged but nesting should be verified
    against the original file. The function holds many open file handles
    (export_res .. export_res7); not all of them are explicitly closed on
    every path — presumably relied on process exit / GC. TODO confirm.
    """
    # Walk up from the input file to the project root (strip ExpressionInput/
    # and NMF-SVM/ path components when present)
    root_dir = export.findParentDir(NMFinputDir)[:-1]
    if 'ExpressionInput' in root_dir:
        root_dir = export.findParentDir(root_dir)
    if 'NMF-SVM' in root_dir:
        root_dir = export.findParentDir(root_dir)
    export.findFilename(NMFinputDir)
    X=[]          # expression matrix rows (one per feature line of the input)
    header=[]     # first (header) row of the input file
    head=0
    # Output files: continuous basis values, binarized assignments, and a
    # transposed binary file, all tagged with the round/iteration and rank
    exportnam=root_dir+'/NMF-SVM/NMF/round'+str(iteration)+'NMFsnmf_versionr'+str(Rank)+'.txt'
    export_res=export.ExportFile(exportnam)
    exportnam_bin=root_dir+'/NMF-SVM/NMF/round'+str(iteration)+'NMFsnmf_binary'+str(Rank)+'.txt'
    export_res1=export.ExportFile(exportnam_bin)
    exportnam_bint=root_dir+'/NMF-SVM/NMF/round'+str(iteration)+'NMFsnmf_binary_t_'+str(Rank)+'.txt'
    export_res5=export.ExportFile(exportnam_bint)
    # Copy the input expression file into place for a MarkerFinder run and
    # open matching groups./comps. files
    MF_input = root_dir+'/NMF-SVM/ExpressionInput/exp.NMF-MarkerFinder.txt'
    export.customFileCopy(expressionInputFile,root_dir+'/NMF-SVM/ExpressionInput/exp.NMF-MarkerFinder.txt')
    export_res4=open(string.replace(MF_input,'exp.','groups.'),"w")
    export_res7=open(string.replace(MF_input,'exp.','comps.'),"w")
    exportnam2=root_dir+'/NMF-SVM/SubtypeAnalyses/round'+str(iteration)+'Metadata'+str(Rank)+'.txt'
    export_res2=export.ExportFile(exportnam2)
    exportnam3=root_dir+'/NMF-SVM/SubtypeAnalyses/round'+str(iteration)+'Annotation'+str(Rank)+'.txt'
    export_res3=export.ExportFile(exportnam3)
    #if 'Clustering' in NMFinputDir:
    #    count=1
    #    start=2
    #else:
    count=0
    start=1
    #print Rank
    # Parse the tab-delimited input: first line is the header, remaining
    # lines are numeric rows; non-numeric cells are replaced by the row median
    for line in open(NMFinputDir,'rU').xreadlines():
        line=line.rstrip('\r\n')
        q= string.split(line,'\t')
        if head >count:
            val=[]
            val2=[]
            me=0.0
            for i in range(start,len(q)):
                try: val2.append(float(q[i]))
                except Exception: continue
            me=np.median(val2)  # row median used to impute missing cells
            for i in range(start,len(q)):
                try: val.append(float(q[i]))
                except Exception: val.append(float(me))
            #if q[1]==prev:
            X.append(val)
        else:
            # Header row: echo it into the basis/binary exports
            export_res1.write(line)
            export_res.write(line)
            export_res1.write("\n")
            #export_res4.write(line)
            #export_res4.write("\n")
            export_res.write("\n")
            header=q
        head+=1
        continue
    group=defaultdict(list)
    sh=[]
    X=np.array(X)
    #print X.shape
    mat=[]
    #mat=X
    mat=zip(*X)  # transpose: samples become rows for the factorization
    mat=np.array(mat)
    #print mat.shape
    #model = NMF(n_components=15, init='random', random_state=0)
    #W = model.fit_transform(mat)
    # Sparse NMF via nimfa; W = basis (samples x rank), H = coefficients
    nmf = nimfa.Snmf(mat,seed="nndsvd", rank=int(Rank), max_iter=20,n_run=1,track_factor=False,theta=0.95)
    nmf_fit = nmf()
    W = nmf_fit.basis()
    W=np.array(W)
    #np.savetxt("basismatrix2.txt",W,delimiter="\t")
    H=nmf_fit.coef()
    H=np.array(H)
    #np.savetxt("coefficientmatrix2.txt",H,delimiter="\t")
    #print W.shape
    sh=W.shape
    export_res3.write("uid\tUID\tUID\n")
    # par: number of standard deviations above the mean used by the
    # thresholding fallback in the PSI branch below
    if int(Rank)==2:
        par=1
    else:
        par=2
    #for i in range(sh[1]):
    #    val=W[:,i]
    #    me=np.mean(val)
    #    st=np.std(val)
    #    export_res2.write(header[i+1])
    #    for j in range(sh[0]):
    #        if float(W[i][j])>=float(me+(par*st)):
    #            export_res2.write("\t"+str(1))
    #        else:
    #            export_res2.write("\t"+str(0))
    #    export_res2.write("\n")
    if platform != 'PSI':
        # Hard assignment: each sample is assigned to the single component
        # with its maximal basis weight (ties broken by first occurrence via
        # the `flag` latch)
        sh=W.shape
        Z=[]
        export_res5.write("uid")
        export_res2.write("uid")
        for i in range(sh[1]):
            export_res5.write("\t"+'V'+str(i))
            export_res2.write("\t"+'V'+str(i))
            export_res3.write('V'+str(i)+"\t"+"Covariate"+"\t"+str(1)+"\n")
        export_res5.write("\n")
        export_res2.write("\n")
        export_res3.write("\n")
        for i in range(sh[0]):
            new_val=[]
            val=W[i,:]
            export_res2.write(header[i+1])
            export_res5.write(header[i+1])
            export_res4.write(header[i+1])
            flag=True
            for j in range(sh[1]):
                if W[i][j]==max(val) and flag:
                    export_res5.write("\t"+str(1))
                    export_res2.write("\t"+str(1))
                    new_val.append(1)
                    export_res4.write("\t"+str(j+1)+"\t"+'V'+str(j))
                    flag=False
                else:
                    export_res5.write("\t"+str(0))
                    export_res2.write("\t"+str(0))
                    new_val.append(0)
            Z.append(new_val)
            export_res5.write("\n")
            export_res2.write("\n")
            export_res4.write("\n")
        # Transpose W and Z so components become rows for the V-labelled export
        W=zip(*W)
        W=np.array(W)
        sh=W.shape
        Z=zip(*Z)
        Z=np.array(Z)
        for i in range(sh[0]):
            export_res.write('V'+str(i))
            export_res1.write('V'+str(i))
            for j in range(sh[1]):
                export_res.write("\t"+str(W[i][j]))
                export_res1.write("\t"+str(Z[i][j]))
            export_res.write("\n")
            export_res1.write("\n")
        export_res.close()
        export_res1.close()
        export_res2.close()
        export_res5.close()
        Orderedheatmap.Classify(exportnam_bint)
        return exportnam,exportnam_bin,exportnam2,exportnam3
    else:
        # PSI branch: soft/thresholded assignments per component, followed by
        # removal of components that are largely subsumed by a bigger one
        W=zip(*W)
        W=np.array(W)
        sh=W.shape
        Z=[]
        for i in range(sh[0]):
            new_val=[]
            val=W[i,:]
            # If too many or too few samples exceed 0.10, fall back to a
            # mean + par*std threshold instead of the fixed 0.1 cutoff
            num=sum(i > 0.10 for i in val)
            if num >40 or num <3:
                compstd=True
            else:
                compstd=False
            me=np.mean(val)
            st=np.std(val)
            #print 'V'+str(i)
            export_res.write('V'+str(i))
            export_res1.write('V'+str(i))
            for j in range(sh[1]):
                if compstd:
                    if float(W[i][j])>=float(me+(par*st)):
                        export_res1.write("\t"+str(1))
                        new_val.append(1)
                    else:
                        export_res1.write("\t"+str(0))
                        new_val.append(0)
                else:
                    if float(W[i][j])>0.1:
                        export_res1.write("\t"+str(1))
                        new_val.append(1)
                    else:
                        export_res1.write("\t"+str(0))
                        new_val.append(0)
                export_res.write("\t"+str(W[i][j]))
            Z.append(new_val)
            export_res.write("\n")
            export_res1.write("\n")
        #Z=zip(*Z)
        Z=np.array(Z)
        sh=Z.shape
        Z_new=[]
        val1=[]
        Z1=[]
        dellst=[]
        export_res2.write("uid")
        export_res5.write("uid")
        # Drop component i when another component j covers >50% of i's
        # members AND has more members overall (i is redundant/subsumed)
        for i in range(sh[0]):
            indices=[]
            val1=Z[i,:]
            sum1=sum(val1)
            flag=False
            indices=[index for index, value in enumerate(val1) if value == 1]
            for j in range(sh[0]):
                val2=[]
                if i!=j:
                    val2=Z[j,:]
                    sum2=sum([val2[x] for x in indices])  # overlap with component i
                    summ2=sum(val2)
                    try:
                        if float(sum2)/float(sum1)>0.5:
                            if summ2>sum1:
                                flag=True
                                #print str(i)
                    except Exception:
                        continue  # sum1 == 0 -> division by zero; keep component
            if flag==False:
                Z1.append(val1)
                export_res2.write("\t"+'V'+str(i))
                export_res5.write("\t"+'V'+str(i))
                export_res3.write('V'+str(i)+"\t"+"Covariate"+"\t"+str(1)+"\n")
        export_res2.write("\n")
        export_res5.write("\n")
        Z1=np.array(Z1)
        Z=Z1
        Z=zip(*Z)  # back to samples-as-rows
        Z=np.array(Z)
        sh=Z.shape
        for i in range(sh[0]):
            val1=Z[i,:]
            #print sum(val1)
            #if sum(val)>2:
            if sum(val1)>2:
                # "liberal" alternative: zero out samples assigned to >2 components
                val=[0 if x==1 else x for x in val1]
            else:
                val=val1
            me=np.mean(val)
            st=np.std(val)
            export_res2.write(header[i+1])
            export_res5.write(header[i+1])
            for j in range(sh[1]):
                if strategy=="conservative":
                    export_res2.write("\t"+str(val1[j]))
                    export_res5.write("\t"+str(val1[j]))
                else:
                    export_res2.write("\t"+str(val[j]))
                    export_res5.write("\t"+str(val[j]))
            export_res2.write("\n")
            export_res5.write("\n")
            Z_new.append(val)
        Z_new=zip(*Z_new)
        Z_new=np.array(Z_new)
        sh=Z_new.shape
        export_res5.close()
        Orderedheatmap.Classify(exportnam_bint)
        # NOTE(review): both strategy arms return the same tuple; the split is
        # presumably a placeholder for divergent behavior — confirm intent
        if strategy=="conservative":
            return exportnam,exportnam_bin,exportnam2,exportnam3
        else:
            return exportnam,exportnam_bin,exportnam2,exportnam3
# NOTE(review): orphaned mid-function fragment — continuation of the
# PGF/CLF branch of probesetSummarize (the enclosing `def`, the assignments
# of summary_exp_file/apt_file/etc., and the branch that opened here are not
# visible in this chunk). It retries apt-probeset-summarize without DABG
# p-values when APTDebugger reports a fatal library-file error; `bad_exit` is
# an undefined name, apparently used to deliberately raise NameError and
# abort. The fragment also ends mid-branch (the analyze_metaprobesets
# residuals handling is cut off). Kept byte-identical pending recovery of the
# missing context.
try: os.remove(summary_exp_file) except Exception: null=[] ### Occurs if dabg export failed fatal_error = APTDebugger(output_dir) if len(fatal_error)>0: print fatal_error print 'Skipping DABG p-value calculation to resolve (Bad library files -> contact Affymetrix support)' retcode = subprocess.call([ apt_file, "-p", pgf_file, "-c", clf_file, "-b", bgp_file, "--kill-list", kill_list_dir, "-m", metaprobeset_file, "-a", algorithm, "-o", output_dir, "--cel-files", cel_dir, "--feat-details", export_features]) ### Exclude DABG p-value - known issue for Glue junction array else: bad_exit if retcode: status = 'failed' else: status = 'run' summary_exp_file = output_dir+'/'+algorithm+'.summary.txt' #if analyze_metaprobesets == 'yes': annotateMetaProbesetGenes(summary_exp_file, expression_file, metaprobeset_file, species) export.customFileCopy(summary_exp_file, expression_file) ### Removes the # containing lines #shutil.copyfile(summary_exp_file, expression_file) os.remove(summary_exp_file) summary_exp_file = output_dir+'/'+pval+'.summary.txt' #if analyze_metaprobesets == 'yes': annotateMetaProbesetGenes(summary_exp_file, stats_file, metaprobeset_file, species) try: shutil.copyfile(summary_exp_file, stats_file) os.remove(summary_exp_file) except Exception: print traceback.format_exc() null=[] ### Occurs if dabg export failed if analyze_metaprobesets == 'yes': residual_destination_file = string.replace(expression_file,'exp.','residuals.') residual_exp_file = output_dir+'/'+algorithm+'.residuals.txt'