def agilentSummarize(exp_file_location_db):
    print 'Agilent array import started'
    global red_channel_db
    global green_channel_db
    red_channel_db = {}
    green_channel_db = {}

    for dataset in exp_file_location_db:
        ### Instance of the Class ExpressionFileLocationData
        fl = exp_file_location_db[dataset]
        output_dir = fl.OutputDir()
        array_dir = fl.CELFileDir()
        group_dir = fl.GroupsFile()  ### provides the list of array_files
        channel_to_extract = fl.ChannelToExtract()
        expression_file = fl.ExpFile()
        array_group_list = UI.importArrayGroupsSimple(group_dir, [])[0]
        normalization_method = fl.NormMatrix()
        arrays = map(lambda agd: agd.Array(), array_group_list)  ### Pull the array names out of this list of objects

        dir_list = unique.read_directory(array_dir)
        count = 0
        for array in dir_list:
            if array in arrays:  ### Important since other text files may exist in that directory
                count += 1
                filename = array_dir + '/' + array
                importAgilentExpressionValues(filename, array, channel_to_extract)
                if count == 50:
                    print ''  ### For progress printing
                    count = 0

        if len(green_channel_db) > 0:
            filename = output_dir + '/' + 'gProcessed/gProcessed-' + dataset + '-raw.txt'
            exportExpressionData(filename, green_channel_db)
            if 'quantile' in normalization_method:
                print '\nPerforming quantile normalization on the green channel...'
                green_channel_db = RNASeq.quantileNormalizationSimple(green_channel_db)
                filename = output_dir + '/' + 'gProcessed/gProcessed-' + dataset + '-quantile.txt'
                exportExpressionData(filename, green_channel_db)
            final_exp_db = green_channel_db

        if len(red_channel_db) > 0:
            filename = output_dir + '/' + 'rProcessed/rProcessed-' + dataset + '-raw.txt'
            exportExpressionData(filename, red_channel_db)
            if 'quantile' in normalization_method:
                print '\nPerforming quantile normalization on the red channel...'
                red_channel_db = RNASeq.quantileNormalizationSimple(red_channel_db)
                filename = output_dir + '/' + 'rProcessed/rProcessed-' + dataset + '-quantile.txt'
                exportExpressionData(filename, red_channel_db)
            final_exp_db = red_channel_db

        if len(red_channel_db) > 0 and len(green_channel_db) > 0:
            if channel_to_extract == 'green/red ratio':
                final_exp_db = calculateRatios(green_channel_db, red_channel_db)
            elif channel_to_extract == 'red/green ratio':
                final_exp_db = calculateRatios(red_channel_db, green_channel_db)

        exportExpressionData(expression_file, final_exp_db)
        print 'Exported expression input file to:', expression_file
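### Illustrative sketch (not from the AltAnalyze source): one plausible implementation of the
### calculateRatios() helper referenced above. It assumes each channel database maps a probe ID
### to a list of per-array intensities in the same array order; the helper name suffix and the
### log2 transform are assumptions, not the project's confirmed behavior.
def calculateRatiosSketch(numerator_db, denominator_db):
    import math
    ratio_db = {}
    for probe in numerator_db:
        if probe in denominator_db:
            ratios = []
            for num, den in zip(numerator_db[probe], denominator_db[probe]):
                try: ratios.append(round(math.log(float(num) / float(den), 2), 4))  ### log2(channel ratio)
                except Exception: ratios.append(0)  ### zero or non-numeric intensity
            ratio_db[probe] = ratios
    return ratio_db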
def summarizeExpressionData(filename, qc_type):
    start_time = time.time()
    fn = filepath(filename)
    matrix = []
    row_header = []
    import RNASeq
    platform = RNASeq.checkExpressionFileFormat(fn, "3'array")
    x = 0
    if '/' in filename:
        dataset_name = string.split(filename, '/')[-1][:-4]
    else:
        dataset_name = string.split(filename, '\\')[-1][:-4]

    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, '\t')
        if data[0] == '#':
            x = 0
        elif x == 0:
            group_db, column_header, qc_db = assignGroupColors(t[1:], qc_type)
            x = 1
        else:
            if ' ' not in t and '' not in t:  ### Occurs for rows with missing data
                if qc_type == 'distribution':
                    #values = map(lambda x: round(float(x), 1), t[1:])  ### report value to one decimal place
                    values = map(lambda x: float(x), t[1:])
                    i = 0
                    for r in values:
                        if r != 0:
                            if 'counts' in dataset_name or platform == 'RNASeq':
                                r = round(math.log(r, 2), 1)
                            else:
                                r = round(r, 1)
                            try:
                                qc_db[column_header[i]][r] += 1  ### count this rounded expression value once for this filename
                            except Exception:
                                qc_db[column_header[i]][r] = 1
                        i += 1
                if qc_type == 'feature' or qc_type == 'totals':
                    if 'counts' in dataset_name:
                        feature_id = string.split(t[0], '=')[0]
                        if '-' in feature_id:
                            feature = 'junction'
                        elif ':I' in feature_id:
                            feature = 'intron'
                        elif ':E' in feature_id:
                            feature = 'exon'
                        values = map(lambda x: float(x), t[1:])
                        i = 0
                        for r in values:
                            if r != 0:
                                if qc_type == 'feature':
                                    r = round(math.log(r, 2), 1)
                                try:
                                    qc_db[column_header[i]][feature].append(r)  ### add all expression values
                                except Exception:
                                    qc_db[column_header[i]][feature] = [r]
                            i += 1
            x += 1

    time_diff = str(round(time.time() - start_time, 1))
    print 'Dataset import in %s seconds' % time_diff
    return qc_db, dataset_name
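### Note on the expected qc_db layout (inferred from how it is populated above, not a documented
### contract): for qc_type == 'distribution' each sample header maps to a histogram of rounded
### values, while 'feature'/'totals' map each sample to lists of values keyed by feature class.
###     qc_db = {'Sample1': {3.2: 15, 3.3: 9, ...}}                       # 'distribution'
###     qc_db = {'Sample1': {'exon': [8.1, 7.4], 'intron': [2.0], ...}}   # 'feature'/'totals'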
def normalizeDataset(filename, output=None, normalization='quantile', platform="3'array"):
    """ Perform Quantile Normalization on an input expression dataset """

    if output == None:
        output = filename
    moved_exp_dir = export.findParentDir(filename) + 'Non-Normalized/' + export.findFilename(filename)
    try:
        export.copyFile(filename, moved_exp_dir)
        print 'Moved original expression file to:'
        print '\t' + moved_exp_dir
    except Exception:
        pass

    if normalization == 'Quantile' or normalization == 'quantile':
        print "Importing data..."
        sample_expression_db = importExpressionValues(filename)
        print "Performing quantile normalization..."
        sample_expression_db = RNASeq.quantileNormalizationSimple(sample_expression_db)
        exportExpressionData(output, sample_expression_db)
    elif normalization == 'group':
        performGroupNormalization(moved_exp_dir, filename, platform)
    print 'Exported expression input file to:', output
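### Hypothetical usage (the file path below is illustrative only): quantile-normalize an existing
### expression file in place; the untouched original is first copied to a 'Non-Normalized' folder.
# normalizeDataset('ExpressionInput/exp.MyDataset.txt', normalization='quantile', platform="3'array")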
def normalizeDataset(filename, output=None):
    """ Perform Quantile Normalization on an input expression dataset """

    print "Importing data..."
    sample_expression_db = importExpressionValues(filename)
    print "Performing quantile normalization..."
    sample_expression_db = RNASeq.quantileNormalizationSimple(sample_expression_db)

    if output == None:
        output = filename
        moved_exp_dir = export.findParentDir(filename) + 'Non-Quantile/' + export.findFilename(filename)
        try:
            export.copyFile(filename, moved_exp_dir)
            print 'Moved original expression file to:'
            print '\t' + moved_exp_dir
        except Exception:
            pass

    exportExpressionData(output, sample_expression_db)
    print 'Exported expression input file to:', output
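### Illustrative sketch (not the AltAnalyze implementation): the quantile-normalization step that
### both normalizeDataset() variants delegate to RNASeq.quantileNormalizationSimple() can be
### approximated as below, assuming the input maps sample names to equal-length value lists.
### Ties are broken arbitrarily rather than averaged, which a production implementation may refine.
def quantileNormalizeSketch(sample_expression_db):
    samples = list(sample_expression_db)
    n = len(sample_expression_db[samples[0]])
    ### Rank-order the values within each sample, then average across samples at each rank
    sorted_columns = [sorted(sample_expression_db[s]) for s in samples]
    reference = [sum(col[i] for col in sorted_columns) / float(len(samples)) for i in range(n)]
    normalized = {}
    for s in samples:
        order = sorted(range(n), key=lambda i: sample_expression_db[s][i])  ### indices by ascending value
        new_values = [0] * n
        for rank, idx in enumerate(order):
            new_values[idx] = reference[rank]  ### substitute the reference value at the same rank
        normalized[s] = new_values
    return normalized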
def CompleteWorkflow(full_PSI_InputFile, EventAnnot, rho_cutoff, strategy, seq, gsp, forceBroadClusters, AnalysisRound):
    """ This function performs a single iteration of the OncoSplice workflow (called from main),
    including the unsupervised splicing analysis (splice-ICGS) and signature depletion """

    ### Filter the EventAnnotation PSI file with non-depleted events from the prior round
    filtered_EventAnnot_dir = filterEventAnnotation.FilterFile(full_PSI_InputFile, EventAnnot, AnalysisRound)

    try:
        print "Running splice-ICGS for feature selection - Round" + str(AnalysisRound)

        ### Reset the below variables which can be altered in prior rounds
        gsp.setGeneSelection('')
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect([])
        species = gsp.Species()
        if forceBroadClusters == True:
            ### Find Broad clusters with at least 25% of all samples
            originalSamplesDiffering = gsp.SamplesDiffering()
            gsp.setSamplesDiffering(int(SampleNumber * 0.25))
        print 'Number of varying samples to identify:', gsp.SamplesDiffering()

        graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species, 'exons', full_PSI_InputFile, mlp,
                                                         exp_threshold=0, rpkm_threshold=0, parameters=gsp)
        if forceBroadClusters == True:
            gsp.setSamplesDiffering(originalSamplesDiffering)

        dPSI_results_fn = graphic_links3[-1][-1]
        dPSI_results_fn = dPSI_results_fn[:-4] + '.txt'

        print "Running block identification for k analyses - Round" + str(AnalysisRound)
        ### Parameters are fixed as they are distinct
        RNASeq_blockIdentification.correlateClusteredGenesParameters(dPSI_results_fn, rho_cutoff=0.4, hits_cutoff=4,
                                                                     hits_to_report=50, ReDefinedClusterBlocks=True, filter=True)
        dPSI_results_fn_block = dPSI_results_fn[:-4] + '-BlockIDs.txt'
        NMFinput, k = NMF_Analysis.FilterFile(dPSI_results_fn, dPSI_results_fn_block, full_PSI_InputFile, AnalysisRound)
    except Exception:
        print 'UNKNOWN ERROR!!!!! Setting k=0'
        print traceback.format_exc()
        k = 0

    print "Round =", AnalysisRound, 'and k =', k
    if AnalysisRound == 1:
        if force_broad_round1:
            k = 2
        else:
            NMFinput, k = NMF_Analysis.FilterFile(dPSI_results_fn, dPSI_results_fn, full_PSI_InputFile, AnalysisRound)  ### Just use the Guide 3 file alone
    if k < 2:
        NMFinput, k = NMF_Analysis.FilterFile(dPSI_results_fn, dPSI_results_fn, full_PSI_InputFile, AnalysisRound)  ### Just use the Guide 3 file alone
        #k = 2
    print "Round =", AnalysisRound, 'and k =', k

    if k > 1:
        ### ADJUST THE k - MUST UPDATE!!!!
        if AnalysisRound == 1:
            if k < 2:
                k = 30
        else:
            if k > 2:
                k = 30
        print "Round =", AnalysisRound, 'and k =', k
        try:
            flag, full_PSI_InputFile = performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k, AnalysisRound, strategy)
        except:
            print traceback.format_exc()
            k += 1
            print 'Adjusted k =', k
            try:
                flag, full_PSI_InputFile = performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k, AnalysisRound, strategy)
                print traceback.format_exc()
            except:
                k = 30
                print 'Adjusted k = 30'
                try:
                    flag, full_PSI_InputFile = performNMF(species, NMFinput, full_PSI_InputFile, filtered_EventAnnot_dir, k, AnalysisRound, strategy)
                    print traceback.format_exc()
                except:
                    flag = True
                    pass  ### will force k-means below

    if k < 2:
        if k == 1:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(AnalysisRound)
                header = []
                header = Kmeans.header_file(dPSI_results_fn_block)
                Kmeans.KmeansAnalysis(dPSI_results_fn_block, header, full_PSI_InputFile, AnalysisRound)
                if AnalysisRound == 1:
                    flag = True
                else:
                    flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                AnalysisRound = True
        else:
            flag = False

    return flag, full_PSI_InputFile, filtered_EventAnnot_dir
def executeParameters(species, array_type, force, genomic_build, update_uniprot, update_ensembl, update_probeset_to_ensembl,
                      update_domain, update_miRs, update_all, update_miR_seq, ensembl_version):
    if '|' in array_type:
        array_type, specific_array_type = string.split(array_type, '|')  ### To distinguish between array sub-types, like the HJAY and hGlue
    else:
        specific_array_type = array_type
    if update_all == 'yes':
        update_uniprot = 'yes'; update_ensembl = 'yes'; update_probeset_to_ensembl = 'yes'; update_domain = 'yes'; update_miRs = 'yes'

    if update_ensembl == 'yes':
        from build_scripts import EnsemblSQL; reload(EnsemblSQL)

        """ Used to grab all essential Ensembl annotations previously obtained via BioMart"""
        configType = 'Advanced'; analysisType = 'AltAnalyzeDBs'; externalDBName = ''
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species, configType, analysisType, externalDBName, ensembl_version, force)

        """ Used to grab Ensembl-to-External gene associations"""
        configType = 'Basic'; analysisType = 'ExternalOnly'; externalDBName = 'Uniprot/SWISSPROT'
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species, configType, analysisType, externalDBName, ensembl_version, force)

        """ Used to grab Ensembl full gene sequence plus promoter and 3'UTRs """
        if array_type == 'AltMouse' or array_type == 'junction' or array_type == 'RNASeq':
            EnsemblSQL.getFullGeneSequences(ensembl_version, species)

    if update_uniprot == 'yes':
        ### Might need to delete the existing versions of downloaded databases or force download
        buildUniProtFunctAnnotations(species, force)

    if update_probeset_to_ensembl == 'yes':
        if species == 'Mm' and array_type == 'AltMouse':
            buildAltMouseExonAnnotations(species, array_type, force, genomic_build)
        elif array_type == 'junction':
            buildJunctionExonAnnotations(species, array_type, specific_array_type, force, genomic_build)
        elif array_type == 'RNASeq':
            import RNASeq; test_status = 'no'; data_type = 'mRNA'
            RNASeq.getEnsemblAssociations(species, data_type, test_status, force)
        else:
            buildExonArrayExonAnnotations(species, array_type, force)

    if update_domain == 'yes':
        if array_type == 'RNASeq':
            only_rely_on_coordinate_mapping = True  ### This will provide more accurate results as many junctions have missing sequences
        else:
            only_rely_on_coordinate_mapping = False

        from build_scripts import FeatureAlignment
        from build_scripts import JunctionArray
        from build_scripts import mRNASeqAlign
        from build_scripts import IdentifyAltIsoforms

        ### Get UCSC associations for all Ensembl linked genes (download databases if necessary)
        if species == 'Mm' and array_type == 'AltMouse':
            mRNA_Type = 'mrna'; run_from_scratch = 'yes'
            export_all_associations = 'yes'  ### YES only for protein prediction analysis
            buildUCSCAnnoationFiles(species, mRNA_Type, export_all_associations, run_from_scratch, force)

        if (species == 'Mm' and array_type == 'AltMouse'):
            """Imports and re-exports array-Ensembl annotations"""
            null = JunctionArray.importArrayAnnotations(species, array_type); null = {}

        if (species == 'Mm' and array_type == 'AltMouse') or array_type == 'junction' or array_type == 'RNASeq':
            if only_rely_on_coordinate_mapping == False:
                """Performs probeset sequence alignment to Ensembl and UCSC transcripts.
                To do: Need to setup download if files missing"""
                analysis_type = 'reciprocal'
                mRNASeqAlign.alignProbesetsToTranscripts(species, array_type, analysis_type, force)

            run_seqcomp = 'no'
            if only_rely_on_coordinate_mapping == False:
                IdentifyAltIsoforms.runProgram(species, array_type, 'null', force, run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species, array_type, 'null')

            if array_type == 'junction' or array_type == 'RNASeq':
                if only_rely_on_coordinate_mapping == False:
                    ### For junction probeset sequences from mRNASeqAlign(), find and assess alternative proteins - export to the folder 'junction'
                    mRNASeqAlign.alignProbesetsToTranscripts(species, array_type, 'single', force)
                    IdentifyAltIsoforms.runProgram(species, array_type, 'junction', force, run_seqcomp)
                    FeatureAlignment.findDomainsByGenomeCoordinates(species, array_type, 'junction')
                    ### For exon probesets (and junction exons) align and assess alternative proteins - export to the folder 'exon'
                    IdentifyAltIsoforms.runProgram(species, array_type, 'exon', force, run_seqcomp)
                    FeatureAlignment.findDomainsByGenomeCoordinates(species, array_type, 'exon')  # not needed

                """ Repeat above with CoordinateBasedMatching = True """
                ### Perform coordinate based junction mapping to transcripts (requires certain sequence files built in IdentifyAltIsoforms)
                analysis_type = 'reciprocal'
                mRNASeqAlign.alignProbesetsToTranscripts(species, array_type, analysis_type, force, CoordinateBasedMatching=True)
                IdentifyAltIsoforms.runProgram(species, array_type, 'null', force, run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species, array_type, 'null')
                mRNASeqAlign.alignProbesetsToTranscripts(species, array_type, 'single', force, CoordinateBasedMatching=True)
                IdentifyAltIsoforms.runProgram(species, array_type, 'junction', force, run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species, array_type, 'junction')
                IdentifyAltIsoforms.runProgram(species, array_type, 'exon', force, run_seqcomp)
                if array_type == 'RNASeq':
                    JunctionArray.combineExonJunctionAnnotations(species, array_type)

    if update_miRs == 'yes':
        if update_miR_seq == 'yes':
            from build_scripts import MatchMiRTargetPredictions; only_add_sequence_to_previous_results = 'no'
            MatchMiRTargetPredictions.runProgram(species, force, only_add_sequence_to_previous_results)

        if array_type == 'exon' or array_type == 'gene':
            from build_scripts import ExonSeqModule
            stringency = 'strict'; process_microRNA_predictions = 'yes'; mir_source = 'multiple'
            ExonSeqModule.runProgram(species, array_type, process_microRNA_predictions, mir_source, stringency)
            stringency = 'lax'
            ExonSeqModule.runProgram(species, array_type, process_microRNA_predictions, mir_source, stringency)
            ExonArray.exportMetaProbesets(array_type, species)  ### Export metaprobesets for this build
        else:
            from build_scripts import JunctionSeqModule
            stringency = 'strict'; mir_source = 'multiple'
            JunctionSeqModule.runProgram(species, array_type, mir_source, stringency, force)
            stringency = 'lax'
            JunctionSeqModule.runProgram(species, array_type, mir_source, stringency, force)

    if array_type == 'junction':
        try:
            from build_scripts import JunctionArray; from build_scripts import JunctionArrayEnsemblRules
            JunctionArray.filterForCriticalExons(species, array_type)
            JunctionArray.overRideExonEntriesWithJunctions(species, array_type)
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species, array_type)
            ExonArray.exportMetaProbesets(array_type, species)  ### Export metaprobesets for this build
        except IOError:
            print 'No built junction files to analyze'; sys.exit()

    if array_type == 'RNASeq' and (species == 'Hs' or species == 'Mm' or species == 'Rn'):
        from build_scripts import JunctionArray; from build_scripts import JunctionArrayEnsemblRules
        try:
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species, array_type)
        except IOError:
            print 'No Ensembl_exons.txt file to analyze'; sys.exit()

    try:
        filename = 'AltDatabase/' + species + '/SequenceData/miRBS-combined_gene-targets.txt'; ef = filepath(filename)
        er = string.replace(ef, species + '/SequenceData/miRBS-combined_gene-targets.txt', 'ensembl/' + species + '/' + species + '_microRNA-Ensembl.txt')
        import shutil; shutil.copyfile(ef, er)
    except Exception:
        null = []

    if array_type != 'RNASeq':
        ### Get the probeset-probe relationships from online - needed for FIRMA analysis
        filename = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_probeset-probes.txt'
        if array_type == 'junction' and 'lue' in specific_array_type:
            server_folder = 'junction/hGlue'
            verifyFile(filename, server_folder)  ### Will force download if missing
            verifyFile('AltDatabase/' + species + '/' + array_type + '/platform.txt', server_folder)  ### Will force download if missing
        elif array_type != 'AltMouse':
            verifyFile(filename, array_type)  ### Will force download if missing
        if (array_type == 'exon' or array_type == 'AltMouse') and species != 'Rn':
            try:
                ### Available for select exon-arrays and AltMouse
                probeset_to_remove_file = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_probes_to_remove.txt'
                verifyFile(probeset_to_remove_file, array_type)
            except Exception:
                null = []
def CompleteWorkflow(InputFile, EventAnnot, rho_cutoff, strategy, seq, gsp, forceBroadClusters, turn):
    """ This function performs a single iteration of the OncoSplice workflow (called from main),
    including the unsupervised splicing analysis (splice-ICGS) and signature depletion """

    ### Filter the EventAnnotation PSI file with non-depleted events from the prior round
    FilteredEventAnnot = filterEventAnnotation.FilterFile(InputFile, EventAnnot, turn)

    try:
        print "Running splice-ICGS for feature selection - Round" + str(turn)

        ### Reset the below variables which can be altered in prior rounds
        gsp.setGeneSelection('')
        gsp.setGeneSet('None Selected')
        gsp.setPathwaySelect([])
        if forceBroadClusters == True:
            ### Find Broad clusters with at least 25% of all samples
            originalSamplesDiffering = gsp.SamplesDiffering()
            gsp.setSamplesDiffering(int(SampleNumber * 0.25))
        print 'Number of varying samples to identify:', gsp.SamplesDiffering()

        graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species, 'exons', InputFile, mlp,
                                                         exp_threshold=0, rpkm_threshold=0, parameters=gsp)
        if forceBroadClusters == True:
            gsp.setSamplesDiffering(originalSamplesDiffering)

        Guidefile = graphic_links3[-1][-1]
        Guidefile = Guidefile[:-4] + '.txt'

        print "Running block identification for rank analyses - Round" + str(turn)
        ### Parameters are fixed as they are distinct
        RNASeq_blockIdentification.correlateClusteredGenesParameters(Guidefile, rho_cutoff=0.4, hits_cutoff=4,
                                                                     hits_to_report=50, ReDefinedClusterBlocks=True, filter=True)
        Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt'
        NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block, InputFile, turn)
    except Exception:
        print 'UNKNOWN ERROR!!!!! Setting Rank=0'
        #print traceback.format_exc()
        Rank = 0

    if Rank > 1:
        ### ADJUST THE RANKS - MUST UPDATE!!!!
        if turn == 1:
            if force_broad_round1:
                #Rank=2
                Rank = Rank
            else:
                if Rank > 2:
                    Rank = 30
        else:
            if Rank > 2:
                Rank = 30
        if seq == "bulk":
            use_adjusted_p = True
        else:
            use_adjusted_p = False

        print "Running NMF analyses for dimension reduction using " + str(Rank) + " ranks - Round" + str(turn)
        NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis(NMFinput, Rank, turn, strategy)
        print "Running Metadata Analyses for finding differential splicing events"
        rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis('Hs', FilteredEventAnnot, Metadata, 'PSI', 0.1, use_adjusted_p, 0.05, Annotation)
        counter = 1
        Guidedir = rootdir + CovariateQuery
        PSIdir = rootdir + 'ExpressionProfiles'
        global upd_guides
        upd_guides = []
        name = []
        group = []
        grplst = []
        for filename in os.listdir(Guidedir):
            if filename.startswith("PSI."):
                Guidefile = os.path.join(Guidedir, filename)
                psi = string.replace(filename, "PSI.", "")
                PSIfile = os.path.join(PSIdir, psi)
                omitcluster = FindTopUniqueEvents(Guidefile, psi, Guidedir)
                if omitcluster == 0:
                    group.append(counter)
                    name.append(psi)
                    counter += 1
        if counter > 2:
            dire = export.findParentDir(InputFile)
            output_dir = dire + 'OncoInputs'
            if os.path.exists(output_dir) == False:
                export.createExportFolder(output_dir)

            output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt'
            ExpandSampleClusters.filterRows(InputFile, output_file, filterDB=upd_guides, logData=False)
            header = ExpandSampleClusters.header_file(output_file)
            print "Running SVM prediction for improved subtypes - Round" + str(turn)
            train = ExpandSampleClusters.TrainDataGeneration(output_file, BinarizedOutput, name)
            grplst.append(group)
            ExpandSampleClusters.Classify(header, train, output_file, grplst, name, turn)
            header = Correlationdepletion.header_file(NMFResult)

            output_file = output_dir + '/DepletionInput-Round' + str(turn) + ".txt"
            sampleIndexSelection.filterFile(InputFile, output_file, header)
            print "Running Correlation Depletion - Round" + str(turn)
            commonkeys, count = Correlationdepletion.FindCorrelations(NMFResult, output_file, name)
            Depleted = Correlationdepletion.DepleteSplicingevents(commonkeys, output_file, count, InputFile)
            InputFile = Depleted
            flag = True
        else:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
    else:
        if Rank == 1:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
        else:
            flag = False

    return flag, InputFile, FilteredEventAnnot
def executeParameters(species, array_type, force, genomic_build, update_uniprot, update_ensembl, update_probeset_to_ensembl,
                      update_domain, update_miRs, update_all, update_miR_seq, ensembl_version):
    if '|' in array_type:
        array_type, specific_array_type = string.split(array_type, '|')  ### To distinguish between array sub-types, like the HJAY and hGlue
    else:
        specific_array_type = array_type
    if update_all == 'yes':
        update_uniprot = 'yes'; update_ensembl = 'yes'; update_probeset_to_ensembl = 'yes'; update_domain = 'yes'; update_miRs = 'yes'

    if update_ensembl == 'yes':
        import EnsemblSQL; reload(EnsemblSQL)

        """ Used to grab all essential Ensembl annotations previously obtained via BioMart"""
        configType = 'Advanced'; analysisType = 'AltAnalyzeDBs'; externalDBName = ''
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species, configType, analysisType, externalDBName, ensembl_version, force)

        """ Used to grab Ensembl-to-External gene associations"""
        configType = 'Basic'; analysisType = 'ExternalOnly'; externalDBName = 'Uniprot/SWISSPROT'
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species, configType, analysisType, externalDBName, ensembl_version, force)

        """ Used to grab Ensembl full gene sequence plus promoter and 3'UTRs """
        if array_type == 'AltMouse' or array_type == 'junction' or array_type == 'RNASeq':
            EnsemblSQL.getFullGeneSequences(ensembl_version, species)

    if update_uniprot == 'yes':
        ### Might need to delete the existing versions of downloaded databases or force download
        buildUniProtFunctAnnotations(species, force)

    if update_probeset_to_ensembl == 'yes':
        if species == 'Mm' and array_type == 'AltMouse':
            buildAltMouseExonAnnotations(species, array_type, force, genomic_build)
        elif array_type == 'junction':
            buildJunctionExonAnnotations(species, array_type, specific_array_type, force, genomic_build)
        elif array_type == 'RNASeq':
            import RNASeq; test_status = 'no'; data_type = 'mRNA'
            RNASeq.getEnsemblAssociations(species, data_type, test_status, force)
        else:
            buildExonArrayExonAnnotations(species, array_type, force)

    if update_domain == 'yes':
        ### Get UCSC associations for all Ensembl linked genes (download databases if necessary)
        if species == 'Mm' and array_type == 'AltMouse':
            mRNA_Type = 'mrna'; run_from_scratch = 'yes'
            export_all_associations = 'yes'  ### YES only for protein prediction analysis
            buildUCSCAnnoationFiles(species, mRNA_Type, export_all_associations, run_from_scratch, force)

        if (species == 'Mm' and array_type == 'AltMouse'):
            """Imports and re-exports array-Ensembl annotations"""
            import JunctionArray
            null = JunctionArray.importArrayAnnotations(species, array_type); null = {}

        if (species == 'Mm' and array_type == 'AltMouse') or array_type == 'junction' or array_type == 'RNASeq':
            """Performs probeset sequence alignment to Ensembl and UCSC transcripts.
            To do: Need to setup download if files missing"""
            import mRNASeqAlign; analysis_type = 'reciprocal'
            mRNASeqAlign.alignProbesetsToTranscripts(species, array_type, analysis_type, force)
            import IdentifyAltIsoforms; run_seqcomp = 'no'
            IdentifyAltIsoforms.runProgram(species, array_type, 'null', force, run_seqcomp)
            import FeatureAlignment; import JunctionArray
            FeatureAlignment.findDomainsByGenomeCoordinates(species, array_type, 'null')

            if array_type == 'junction' or array_type == 'RNASeq':
                ### For junction probeset sequences from mRNASeqAlign(), find and assess alternative proteins - export to the folder 'junction'
                mRNASeqAlign.alignProbesetsToTranscripts(species, array_type, 'single', force)
                IdentifyAltIsoforms.runProgram(species, array_type, 'junction', force, run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species, array_type, 'junction')
                ### For exon probesets (and junction exons) align and assess alternative proteins - export to the folder 'exon'
                IdentifyAltIsoforms.runProgram(species, array_type, 'exon', force, run_seqcomp)
                # FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'exon') # not needed

                """ Repeat above with CoordinateBasedMatching = True """
                ### Perform coordinate based junction mapping to transcripts (requires certain sequence files built in IdentifyAltIsoforms)
                analysis_type = 'reciprocal'
                mRNASeqAlign.alignProbesetsToTranscripts(species, array_type, analysis_type, force, CoordinateBasedMatching=True)
                IdentifyAltIsoforms.runProgram(species, array_type, 'null', force, run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species, array_type, 'null')
                mRNASeqAlign.alignProbesetsToTranscripts(species, array_type, 'single', force, CoordinateBasedMatching=True)
                IdentifyAltIsoforms.runProgram(species, array_type, 'junction', force, run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species, array_type, 'junction')
                IdentifyAltIsoforms.runProgram(species, array_type, 'exon', force, run_seqcomp)
                if array_type == 'RNASeq':
                    JunctionArray.combineExonJunctionAnnotations(species, array_type)

    if update_miRs == 'yes':
        if update_miR_seq == 'yes':
            import MatchMiRTargetPredictions; only_add_sequence_to_previous_results = 'no'
            MatchMiRTargetPredictions.runProgram(species, force, only_add_sequence_to_previous_results)

        if array_type == 'exon' or array_type == 'gene':
            import ExonSeqModule
            stringency = 'strict'; process_microRNA_predictions = 'yes'; mir_source = 'multiple'
            ExonSeqModule.runProgram(species, array_type, process_microRNA_predictions, mir_source, stringency)
            stringency = 'lax'
            ExonSeqModule.runProgram(species, array_type, process_microRNA_predictions, mir_source, stringency)
            ExonArray.exportMetaProbesets(array_type, species)  ### Export metaprobesets for this build
        else:
            import JunctionSeqModule
            stringency = 'strict'; mir_source = 'multiple'
            JunctionSeqModule.runProgram(species, array_type, mir_source, stringency, force)
            stringency = 'lax'
            JunctionSeqModule.runProgram(species, array_type, mir_source, stringency, force)

    if array_type == 'junction':
        try:
            import JunctionArray; import JunctionArrayEnsemblRules
            JunctionArray.filterForCriticalExons(species, array_type)
            JunctionArray.overRideExonEntriesWithJunctions(species, array_type)
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species, array_type)
            ExonArray.exportMetaProbesets(array_type, species)  ### Export metaprobesets for this build
        except IOError:
            print 'No built junction files to analyze'; sys.exit()

    if array_type == 'RNASeq' and (species == 'Hs' or species == 'Mm' or species == 'Rn'):
        import JunctionArray; import JunctionArrayEnsemblRules
        try:
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species, array_type)
        except IOError:
            print 'No Ensembl_exons.txt file to analyze'; sys.exit()

    try:
        filename = 'AltDatabase/' + species + '/SequenceData/miRBS-combined_gene-targets.txt'; ef = filepath(filename)
        er = string.replace(ef, species + '/SequenceData/miRBS-combined_gene-targets.txt', 'ensembl/' + species + '/' + species + '_microRNA-Ensembl.txt')
        import shutil; shutil.copyfile(ef, er)
    except Exception:
        null = []

    if array_type != 'RNASeq':
        ### Get the probeset-probe relationships from online - needed for FIRMA analysis
        filename = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_probeset-probes.txt'
        if array_type == 'junction' and 'lue' in specific_array_type:
            server_folder = 'junction/hGlue'
            verifyFile(filename, server_folder)  ### Will force download if missing
            verifyFile('AltDatabase/' + species + '/' + array_type + '/platform.txt', server_folder)  ### Will force download if missing
        elif array_type != 'AltMouse':
            verifyFile(filename, array_type)  ### Will force download if missing
        if (array_type == 'exon' or array_type == 'AltMouse') and species != 'Rn':
            try:
                ### Available for select exon-arrays and AltMouse
                probeset_to_remove_file = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_probes_to_remove.txt'
                verifyFile(probeset_to_remove_file, array_type)
            except Exception:
                null = []
def CompleteWorkflow(InputFile, EventAnnot, turn, rho_cutoff, strategy, seq):
    species = "Hs"
    row_method = 'hopach'
    column_method = 'hopach'
    row_metric = 'correlation'
    column_metric = 'euclidean'
    color_gradient = 'yellow_black_blue'
    contrast = 3
    vendor = "RNASeq"
    GeneSelection = ''
    PathwaySelection = ''
    GeneSetSelection = 'None Selected'
    excludeCellCycle = False
    #rho_cutoff = 0.4
    restrictBy = 'protein_coding'
    featurestoEvaluate = 'Genes'
    ExpressionCutoff = 0
    CountsCutoff = 0
    FoldDiff = 1.2
    SamplesDiffering = 4
    JustShowTheseIDs = ''
    removeOutliers = False
    PathwaySelection = []
    array_type = "RNASeq"
    #rho_cutoff=0.4
    gsp = UI.GeneSelectionParameters(species, array_type, vendor)
    gsp.setGeneSet(GeneSetSelection)
    gsp.setPathwaySelect(PathwaySelection)
    gsp.setGeneSelection(GeneSelection)
    gsp.setJustShowTheseIDs(JustShowTheseIDs)
    gsp.setNormalize('median')
    gsp.setSampleDiscoveryParameters(ExpressionCutoff, CountsCutoff, FoldDiff, SamplesDiffering,
                                     removeOutliers, featurestoEvaluate, restrictBy, excludeCellCycle,
                                     column_metric, column_method, rho_cutoff)

    #Run splice ICGS
    """import UI
    species='Mm'; platform = "3'array"; vendor = 'Ensembl'
    gsp = UI.GeneSelectionParameters(species,platform,vendor)
    gsp.setGeneSet('None Selected')
    gsp.setPathwaySelect('')
    gsp.setGeneSelection('')
    gsp.setJustShowTheseIDs('')
    gsp.setNormalize('median')
    gsp.setSampleDiscoveryParameters(0,0,1.5,3, False,'PSI','protein_coding',False,'cosine','hopach',0.35)"""

    FilteredEventAnnot = filterEventAnnotation.FilterFile(InputFile, EventAnnot, turn)

    try:
        print "Running splice-ICGS for feature selection - Round" + str(turn)

        #except Exception: Rank=0
        graphic_links3 = RNASeq.singleCellRNASeqWorkflow(species, 'exons', InputFile, mlp,
                                                         exp_threshold=0, rpkm_threshold=0, parameters=gsp)

        Guidefile = graphic_links3[-1][-1]
        Guidefile = Guidefile[:-4] + '.txt'

        print "Running block identification for rank analyses - Round" + str(turn)
        RNASeq_blockIdentification.correlateClusteredGenesParameters(Guidefile, rho_cutoff=0.4, hits_cutoff=4,
                                                                     hits_to_report=50, ReDefinedClusterBlocks=True, filter=True)
        Guidefile_block = Guidefile[:-4] + '-BlockIDs.txt'
        NMFinput, Rank = NMF_Analysis.FilterFile(Guidefile, Guidefile_block, InputFile, turn)
    except Exception:
        print 'UNKNOWN ERROR!!!!!'
        print traceback.format_exc()
        Rank = 0

    if Rank > 1:
        print 'Current turn:', turn, 'k =',
        if turn == 1:
            Rank = 2
        elif Rank > 2:
            Rank = 30
        else:
            Rank = 2
        if seq == "bulk":
            use_adjusted_p = True
        else:
            use_adjusted_p = False
        print Rank

        print "Running NMF analyses for dimension reduction using " + str(Rank) + " ranks - Round" + str(turn)
        NMFResult, BinarizedOutput, Metadata, Annotation = NMF_Analysis.NMFAnalysis(NMFinput, Rank, turn, strategy)
        print "Running Metadata Analyses for finding differential splicing events"
        rootdir, CovariateQuery = metaDataAnalysis.remoteAnalysis('Hs', FilteredEventAnnot, Metadata, 'PSI', 0.1, use_adjusted_p, 0.05, Annotation)
        counter = 1
        Guidedir = rootdir + CovariateQuery
        PSIdir = rootdir + 'ExpressionProfiles'
        global upd_guides
        upd_guides = []
        name = []
        group = []
        grplst = []
        for filename in os.listdir(Guidedir):
            if filename.startswith("PSI."):
                Guidefile = os.path.join(Guidedir, filename)
                psi = string.replace(filename, "PSI.", "")
                PSIfile = os.path.join(PSIdir, psi)
                omitcluster = FindTopUniqueEvents(Guidefile, psi, Guidedir)
                if omitcluster == 0:
                    group.append(counter)
                    name.append(psi)
                    counter += 1
        if counter > 2:
            dire = export.findParentDir(InputFile)
            output_dir = dire + 'OncoInputs'
            if os.path.exists(output_dir) == False:
                export.createExportFolder(output_dir)

            output_file = output_dir + '/SVMInput-Round' + str(turn) + '.txt'
            ExpandSampleClusters.filterRows(InputFile, output_file, filterDB=upd_guides, logData=False)
            header = ExpandSampleClusters.header_file(output_file)
            print "Running SVM prediction for improved subtypes - Round" + str(turn)
            train = ExpandSampleClusters.TrainDataGeneration(output_file, BinarizedOutput, name)
            grplst.append(group)
            ExpandSampleClusters.Classify(header, train, output_file, grplst, name, turn)
            header = Correlationdepletion.header_file(NMFResult)

            output_file = output_dir + '/DepletionInput-Round' + str(turn) + ".txt"
            sampleIndexSelection.filterFile(InputFile, output_file, header)
            print "Running Correlation Depletion - Round" + str(turn)
            commonkeys, count = Correlationdepletion.FindCorrelations(NMFResult, output_file, name)
            Depleted = Correlationdepletion.DepleteSplicingevents(commonkeys, output_file, count, InputFile)
            InputFile = Depleted
            flag = True
        else:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
    else:
        if Rank == 1:
            try:
                print "Running K-means analyses instead of NMF - Round" + str(turn)
                header = []
                header = Kmeans.header_file(Guidefile_block)
                Kmeans.KmeansAnalysis(Guidefile_block, header, InputFile, turn)
                flag = False
            except Exception:
                print 'WARNING!!!!! DID NOT RUN K-MEANS!!!!!'
                print traceback.format_exc()
                flag = False
        else:
            flag = False

    return flag, InputFile, FilteredEventAnnot
def generateConstitutiveExpression(exp_dbase, constitutive_gene_db, probeset_gene_db, pre_filtered_db, array_names, filename):
    """Generate Steady-State expression values for each gene for analysis in the main module of this package"""
    steady_state_db = {}; k = 0; l = 0
    remove_nonexpressed_genes = 'no'  ### By default set to 'no'

    ### 1st Pass: Identify probesets for steady-state calculation
    for gene in probeset_gene_db:
        if avg_all_probes_for_steady_state == 'yes':
            average_all_probesets[gene] = probeset_gene_db[gene]  ### These are all exon aligning (not intron) probesets
        else:
            if gene not in constitutive_gene_db:
                average_all_probesets[gene] = probeset_gene_db[gene]
            else:
                constitutive_probeset_list = constitutive_gene_db[gene]
                constitutive_filtered = []
                ### Added this extra code to eliminate constitutive probesets not in exp_dbase (gene level filters are more efficient when dealing with this many probesets)
                for probeset in constitutive_probeset_list:
                    if probeset in probeset_gene_db[gene]:
                        constitutive_filtered.append(probeset)
                if len(constitutive_filtered) > 0:
                    average_all_probesets[gene] = constitutive_filtered
                else:
                    average_all_probesets[gene] = probeset_gene_db[gene]

    ### 2nd Pass: Remove probesets that have no detected expression (keep all if none are expressed)
    if excludeLowExpressionExons:
        non_expressed_genes = {}  ### keep track of these for internal QC
        for gene in average_all_probesets:
            gene_probe_list = []; x = 0
            for probeset in average_all_probesets[gene]:
                if probeset in pre_filtered_db:
                    gene_probe_list.append(probeset); x += 1
            ### If no constitutive and there are probes with detected expression: replace entry
            if x > 0:
                average_all_probesets[gene] = gene_probe_list
            elif remove_nonexpressed_genes == 'yes':
                non_expressed_genes[gene] = []

    if remove_nonexpressed_genes == 'yes':
        for gene in non_expressed_genes:
            del average_all_probesets[gene]

    ### 3rd Pass: Make sure the probesets are present in the input set (this is not typical unless a user is loading a pre-filtered probeset expression dataset)
    for gene in average_all_probesets:
        v = 0
        for probeset in average_all_probesets[gene]:
            try: null = exp_dbase[probeset]; v += 1
            except KeyError: null = []  ### occurs if the expression probeset list is missing some of these probesets
        if v == 0:  ### Therefore, no probesets were found that were previously predicted to be best constitutive
            try: average_all_probesets[gene] = probeset_gene_db[gene]  ### expand the average_all_probesets to include any exon linked to the gene
            except KeyError: print gene, probeset, len(probeset_gene_db), len(average_all_probesets); kill

    for probeset in exp_dbase:
        array_count = len(exp_dbase[probeset]); break
    try: null = array_count
    except Exception:
        print 'WARNING...CRITICAL ERROR. Make sure the correct array type is selected and that all input expression files are indeed present (array_count ERROR).'; forceError

    ### Calculate avg expression for each array for each probeset (using constitutive values)
    gene_count_db = {}
    for gene in average_all_probesets:
        x = 0  ### For each array, average all probeset expression values
        gene_sum = 0
        probeset_list = average_all_probesets[gene]  #; k += len(average_all_probesets[gene])
        if array_type != 'RNASeq':  ### Just retain the list of probesets for RNA-seq
            while x < array_count:
                exp_list = []  ### average all exp values for constitutive probesets for each array
                for probeset in probeset_list:
                    try:
                        exp_val = exp_dbase[probeset][x]
                        exp_list.append(exp_val)
                    except KeyError: null = []  ### occurs if the expression probeset list is missing some of these probesets
                try:
                    if len(exp_list) == 0:
                        for probeset in probeset_list:
                            try:
                                exp_val = exp_dbase[probeset][x]
                                exp_list.append(exp_val)
                            except KeyError: null = []  ### occurs if the expression probeset list is missing some of these probesets
                    avg_const_exp = statistics.avg(exp_list)
                    ### Add only one avg-expression value for each array, this loop
                    try: steady_state_db[gene].append(avg_const_exp)
                    except KeyError: steady_state_db[gene] = [avg_const_exp]
                except ZeroDivisionError: null = []  ### Occurs when processing a truncated dataset (for testing usually) - no values for the gene should be included
                x += 1

    l = len(probeset_gene_db) - len(steady_state_db)
    steady_state_export = filename[0:-4] + '-steady-state.txt'
    steady_state_export = string.replace(steady_state_export, 'counts.', 'exp.')
    fn = filepath(steady_state_export); data = open(fn, 'w'); title = 'Gene_ID'

    if array_type == 'RNASeq':
        import RNASeq
        steady_state_db, pre_filtered_db = RNASeq.calculateGeneLevelStatistics(steady_state_export, species, average_all_probesets,
                                                                               normalize_feature_exp, array_names, UserOptions,
                                                                               excludeLowExp=excludeLowExpressionExons)
        ### This "pre_filtered_db" replaces the above since the RNASeq module performs the exon and junction-level filtering, not ExonArray (RPKM and count based)
        ### Use pre_filtered_db to exclude non-expressed features for multi-group alternative exon analysis
        removeNonExpressedProbesets(pre_filtered_db, full_dataset_export_dir)
        reload(RNASeq)

    for array in array_names:
        title = title + '\t' + array
    data.write(title + '\n')
    for gene in steady_state_db:
        ss_vals = gene
        for exp_val in steady_state_db[gene]:
            ss_vals = ss_vals + '\t' + str(exp_val)
        data.write(ss_vals + '\n')
    data.close()

    exp_dbase = {}; steady_state_db = {}; pre_filtered_db = {}
    #print k, "probesets were not found in the expression file, that could be used for the constitutive expression calculation"
    #print l, "genes were also not included that did not have such expression data"
    print "Steady-state data exported to", steady_state_export
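### Illustrative sketch (not part of ExonArray): the core of the per-array averaging loop above,
### reduced to a standalone helper. Assumes exp_dbase maps probesets to equal-length value lists
### and gene_to_probesets maps each gene to its selected (constitutive) probesets.
def steadyStateAverageSketch(exp_dbase, gene_to_probesets, array_count):
    steady_state = {}
    for gene, probesets in gene_to_probesets.items():
        averages = []
        for x in range(array_count):
            values = [exp_dbase[p][x] for p in probesets if p in exp_dbase]
            if len(values) > 0:
                averages.append(sum(values) / float(len(values)))  ### mean of constitutive probesets for array x
        if len(averages) == array_count:
            steady_state[gene] = averages  ### one steady-state value per array
    return steady_state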