def exportProbesetDomainMappings(species,array_type,indirect_mapping,probeset_domain_match_db):
    if (array_type == 'junction' or array_type == 'RNASeq') and data_type != 'null':
        export_file = "AltDatabase/"+species+"/"+array_type+"/"+data_type+"/"+species+"_Ensembl_"+indirect_mapping+"domain_aligning_probesets.txt"
    else:
        export_file = "AltDatabase/"+species+"/"+array_type+"/"+species+"_Ensembl_"+indirect_mapping+"domain_aligning_probesets.txt"
    data = export.createExportFile(export_file,"AltDatabase/"+species+"/"+array_type)
    data.write('Probeset\tInterPro-Description\n')
    for probeset in probeset_domain_match_db:
        domain_info_list = probeset_domain_match_db[probeset]
        for ipd in domain_info_list:
            data.write(probeset+'\t'+ipd+'\n')
    data.close()
    print "Direct probeset to domain associations exported to:",export_file
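### Usage sketch (illustrative only; the probeset IDs, species code and InterPro
### strings below are hypothetical). probeset_domain_match_db maps each probeset
### ID to a list of InterPro description strings, and one output row is written
### per probeset/description pair:
#
# probeset_domain_match_db = {
#     '2398773': ['IPR000719-Protein kinase domain'],
#     '2398774': ['IPR000719-Protein kinase domain','IPR011009-Kinase-like domain']}
# exportProbesetDomainMappings('Hs','exon','indirect_',probeset_domain_match_db)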
def outputSummaryResults(summary_results_db,name,analysis_method,root_dir):
    #summary_results_db[dataset_name] = udI,udI-up_diff,ddI,ddI-down_diff,udI_mx,udI_mx-mx_diff,up_dI_genes,down_gene,annotation_list
    annotation_db = {}
    for dataset in summary_results_db:
        for entry in summary_results_db[dataset][-1]:
            annotation = entry[0]; count = entry[1]
            if 'AA:' not in annotation:
                try: annotation_db[annotation].append((dataset,count))
                except KeyError: annotation_db[annotation] = [(dataset,count)]
    annotation_ls = []
    for annotation in annotation_db: annotation_ls.append(annotation)
    annotation_ls.sort()
    annotation_db2={}
    for annotation in annotation_ls:
        for dataset in summary_results_db:
            y=0
            for entry in summary_results_db[dataset][-1]:
                annotation2 = entry[0]; count = entry[1]
                if annotation2 == annotation: y=1; new_count = count
            if y == 1:
                try: annotation_db2[dataset].append((annotation,new_count))
                except KeyError: annotation_db2[dataset] = [(annotation,new_count)]
            else:
                try: annotation_db2[dataset].append((annotation,0))
                except KeyError: annotation_db2[dataset] = [(annotation,0)]
    summary_output = root_dir+'AltResults/AlternativeOutput/'+analysis_method+'-summary-results'+name+'.txt'
    #fn=filepath(summary_output) ### unused: createExportFile below resolves the path
    data = export.createExportFile(summary_output,'AltResults/AlternativeOutput')
    if analysis_method == 'splicing-index' or analysis_method == 'FIRMA':
        event_type1 = 'inclusion-events'; event_type2 = 'exclusion-events'; event_type3 = 'alternative-exons'
    else:
        event_type1 = 'inclusion-events'; event_type2 = 'exclusion-events'; event_type3 = 'mutually-exclusive-events'
    title = 'Dataset-name' +'\t'+ event_type1 +'\t'+ event_type2 +'\t'+ event_type3 +'\t'+ 'up-deltaI-genes' +'\t'+ 'down-deltaI-genes' +'\t'+ 'total-'+analysis_method+'-genes'
    title = title +'\t'+ 'upregulated_genes' +'\t'+ 'downregulated_genes' +'\t'+ analysis_method+'-genes-differentially-exp' +'\t'+ 'RNA_processing/binding-factors-upregulated' +'\t'+ 'RNA_processing/binding-factors-downregulated' +'\t'+ analysis_method+'_RNA_processing/binding-factors'
    title = title +'\t'+ 'avg-downregulated-peptide-length' +'\t'+ 'std-downregulated-peptide-length' +'\t'+ 'avg-upregulated-peptide-length' +'\t'+ 'std-upregulated-peptide-length' +'\t'+ 'ttest-peptide-length' +'\t'+ 'median-peptide-length-fold-change'
    for entry in annotation_ls: title = title +'\t'+ entry
    data.write(title+'\n')
    for dataset in summary_results_db:
        values = dataset
        for entry in summary_results_db[dataset][0:-1]: values = values +'\t'+ str(entry)
        if dataset in annotation_db2:
            for entry in annotation_db2[dataset]: values = values +'\t'+ str(entry[1])
        data.write(values+'\n')
    data.close()
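### Illustrative restatement of the annotation padding above as a standalone
### helper (hypothetical, not part of AltAnalyze): every dataset receives an
### (annotation,count) pair for every annotation, defaulting to 0 when absent,
### so the summary table gets a complete, aligned set of annotation columns.
def padAnnotationCounts(annotation_ls,dataset_counts):
    ### dataset_counts: dataset -> {annotation: count}
    padded={}
    for dataset in dataset_counts:
        counts = dataset_counts[dataset]
        padded[dataset] = [(annotation,counts.get(annotation,0)) for annotation in annotation_ls]
    return padded

#padAnnotationCounts(['cassette-exon'],{'exp1':{'cassette-exon':12},'exp2':{}})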
def exportData(gene_db,data_type,species):
    export_file = 'AltDatabase/ensembl/SubGeneViewer/'+species+'/Xport_sgv_'+data_type+'.csv'
    if data_type == 'feature': title = 'gene'+'\t'+'symbol'+'\t'+'sgv_feature'+'\n'
    if data_type == 'structure': title = 'gene'+'\t'+'symbol'+'\t'+'sgv_structure'+'\n'
    if data_type == 'splice': title = 'gene'+'\t'+'symbol'+'\t'+'sgv_splice'+'\n'
    data = export.createExportFile(export_file,'AltDatabase/ensembl/SubGeneViewer/'+species)
    #fn=filepath(export_file); data = open(fn,'w')
    data.write(title)
    for gene in gene_db:
        try:
            symbol = gene_symbol_db[gene]
            value_str_list = gene_db[gene]
            value_str = string.join(value_str_list,',')
            values = string.join([gene,symbol,value_str],'\t')+'\n'; data.write(values)
        except KeyError: null = [] ### gene has no symbol annotation; skip it
    data.close()
    print "exported to",export_file
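### Usage sketch (hypothetical IDs and annotation strings): gene_db maps Ensembl
### gene IDs to lists of SubGeneViewer annotation strings (comma-joined on
### export); the module-level gene_symbol_db supplies the symbol, and genes
### without a symbol are silently skipped by the KeyError handler above.
#
# gene_symbol_db = {'ENSG00000141510':'TP53'}
# exportData({'ENSG00000141510':['E1.1-feature','E2.1-feature']},'feature','Hs')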
def __init__(self,url,dir,file_type): """Copy the contents of a file from a given URL to a local file.""" filename = url.split('/')[-1] if len(file_type) == 2: filename, file_type = file_type ### Added this feature for when a file has an invalid filename output_filepath_object = export.createExportFile(dir+filename,dir[:-1]) output_filepath = filepath(dir+filename) print "Downloading the following file:",filename,' ', self.original_increment = 10 self.increment = 0 import urllib from urllib import urlretrieve try: try: webfile, msg = urlretrieve(url,output_filepath,reporthook=self.reporthookFunction) except IOError: if 'Binary' in traceback.format_exc(): #IOError: [Errno ftp error] 200 Switching to Binary mode. ### https://bugs.python.org/issue1067702 - some machines the socket doesn't close and causes an error - reload to close the socket reload(urllib) webfile, msg = urlretrieve(url,output_filepath,reporthook=self.reporthookFunction) reload(urllib) except: print 'Unknown URL error encountered...'; forceURLError print '' print "\nFile downloaded to:",output_filepath if '.zip' in filename: try: decompressZipStackOverflow(filename,dir); status = 'completed' except Exception: status = unzipFiles(filename,dir) if status == 'failed': print 'Zip Extraction failed' self.gz_filepath = filepath(output_filepath); self.status = 'remove' print "zip file extracted..." elif '.gz' in filename: self.gz_filepath = output_filepath if len(file_type)==0: extension = '.gz' else: extension = 'gz' decompressed_filepath = string.replace(self.gz_filepath,extension,file_type) ### Below code can be too memory intensive #file_size = os.path.getsize(output_filepath) #megabtyes = file_size/1000000.00 #if megabtyes>5000: force_error ### force_error is an undefined variable which causes an exception import gzip; content = gzip.GzipFile(self.gz_filepath, 'rb') data = open(decompressed_filepath,'wb') #print "\nExtracting downloaded file:",self.gz_filepath import shutil; shutil.copyfileobj(content,data) self.status = 'remove' else: self.gz_filepath = ''; self.status = 'NA'
def parse_input_data(filename,data_type):
    ### relies on module-level globals (array_type, species, root_dir, p, filter_method, thresholds and the *_status_db dictionaries)
    fn=filepath(filename); first_line = 1; array_group_name_db = {}; z=0; array_group_db = {}; output_file = []
    #print "Reading",filename
    secondary_data_type = export.getParentDir(filename) ### e.g., expression or counts
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line); t = string.split(data,'\t'); probeset = t[0]; z+=1
        if first_line == 1:
            first_line = 0 #makes this value null for the next loop of actual array data
            ###Below occurs if the data is raw as opposed to precomputed
            if data_type == 'export':
                if array_type == 'exon': folder = 'ExonArray'+'/'+species+'/'
                elif array_type == 'gene': folder = 'GeneArray'+'/'+species+'/'
                elif array_type == 'junction': folder = 'JunctionArray'+'/'+species+'/'
                elif array_type == 'RNASeq': folder = 'RNASeq'+'/'+species+'/'
                else: folder = array_type+'/'
                parent_path = root_dir+'AltExpression/'+folder
                if array_type == 'RNASeq':
                    output_file = altanalzye_input[0:-4] + '.ExpCutoff-' + str(original_exp_threshold) +'_'+ filter_method+'.txt'
                else:
                    output_file = altanalzye_input[0:-4] + '.p' + str(int(100*p)) +'_'+ filter_method+'.txt'
                output_file_dir = parent_path+output_file
                print "...Exporting",output_file_dir
                export_data = export.createExportFile(output_file_dir,root_dir+'AltExpression/'+folder)
                fn=filepath(output_file_dir); export_data = open(fn,'w'); export_data.write(line)
            if ':' in t[1]:
                array_group_list = []; x=0 ###gives us an original index value for each entry in the group
                for entry in t[1:]:
                    array_group,array_name = string.split(entry,':')
                    try:
                        array_group_db[array_group].append(x)
                        array_group_name_db[array_group].append(array_name)
                    except KeyError:
                        array_group_db[array_group] = [x]
                        array_group_name_db[array_group] = [array_name]
                        ### below only occurs with a new group addition
                        array_group_list.append(array_group) #use this to generate comparisons in the below linked function
                    x += 1
                #print '##### array_group_list',array_group_list
        elif len(probeset)>0 and data_type != 'export':
            ###Use the index values from above to assign each expression value to a new database
            temp_group_array={}; array_index_list = [] ###Use this list for permutation analysis
            for group in array_group_db:
                #array_index_list.append(array_group_db[group])
                group_values = []
                for array_index in array_group_db[group]:
                    try: exp_val = float(t[array_index+1])
                    except IndexError: print t, z,'\n',array_index,'\n',group, probeset;kill
                    group_values.append(exp_val)
                avg_stat = statistics.avg(group_values)
                if data_type == 'expression':
                    ###If non-log array data
                    if exp_data_format == 'non-log':
                        ### This works better for RNASeq, as opposed to log transforming and then filtering, which is more stringent and differs from the filtering in ExonArray().
                        if array_type == 'RNASeq':
                            if normalization_method == 'RPKM' and secondary_data_type == 'expression':
                                if ':I' in probeset: k=1 ### Don't require an RPKM threshold for intron IDs (these will likely never meet this unless small or fully retained and highly expressed)
                                elif ':' not in probeset:
                                    if avg_stat>=gene_rpkm_threshold: k=1
                                    else: k=0
                                elif avg_stat>=exon_rpkm_threshold: k=1
                                elif '-' in probeset: k=1 ### Don't consider RPKM for junctions, just counts
                                else: k=0
                                #if 'ENSMUSG00000045991:E2.2' in probeset: print [probeset, normalization_method, secondary_data_type, gene_rpkm_threshold, avg_stat, k]
                            else: ### Otherwise, we are looking at count data
                                if '-' in probeset: ### junction meeting minimum read-count number
                                    if avg_stat>=junction_exp_threshold: k=1 ### junction_exp_threshold is the same as nonlog_exp_threshold
                                    else: k=0
                                elif ':' not in probeset:
                                    if avg_stat>=gene_exp_threshold: k=1
                                    else: k=0
                                else: ### exon or intron meeting minimum read-count number
                                    if avg_stat>=exon_exp_threshold: k=1
                                    else: k=0
                                #if 'ENSMUSG00000045991:E2.2' in probeset: print [probeset, normalization_method, secondary_data_type, exon_exp_threshold, junction_exp_threshold, avg_stat, k]
                        else:
                            if avg_stat>=nonlog_exp_threshold: k=1
                            else: k=0
                    elif avg_stat>=log_expression_threshold: k=1
                    else: k=0
                    if normalization_method == 'RPKM' and secondary_data_type == 'expression':
                        ### Treat as a DABG-like p-value
                        try: pvalue_status_db[probeset].append(k)
                        except KeyError: pvalue_status_db[probeset] = [k]
                    else:
                        try: expression_status_db[probeset].append(k)
                        except KeyError: expression_status_db[probeset] = [k]
                    #if probeset == '3209315': print [group],k,len(group_values),array_group_list
                if data_type == 'p-value':
                    if avg_stat<=p: k=1
                    else: k=0
                    #if 'G7216513_a_at' in probeset: print k, avg_stat
                    try: pvalue_status_db[probeset].append(k)
                    except KeyError: pvalue_status_db[probeset] = [k]
        elif data_type == 'export':
            if exp_data_format == 'non-log':
                ### This code was added in version 1.16 in conjunction with a switch from log status to
                ### non-log in AltAnalyze to prevent "Process AltAnalyze Filtered" associated errors
                exp_values = t[1:]; exp_values_log2=[]
                for exp_val in exp_values:
                    exp_values_log2.append(str(math.log(float(exp_val),2))) ### exp_val+=1 was removed in 2.0.5
                line = string.join([probeset]+exp_values_log2,'\t')+'\n'
            try: null = export_db[probeset]; export_data.write(line)
            except KeyError: null = [] ### occurs if not a probeset to include in the filtered results export file
    if data_type == 'export': export_data.close()
    return output_file
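### Illustrative restatement of the RNA-Seq RPKM filter above as a pure function
### (the thresholds are hypothetical defaults, not AltAnalyze's; the branch order
### mirrors the original): introns are exempt, gene-level IDs use the gene cutoff,
### and junctions falling below the exon cutoff are still kept since they are
### judged on read counts instead.
def passesRPKMFilter(feature_id,avg_rpkm,gene_rpkm_threshold=1.0,exon_rpkm_threshold=0.5):
    if ':I' in feature_id: return 1 ### intron ID: no RPKM requirement
    elif ':' not in feature_id: ### gene-level ID
        if avg_rpkm>=gene_rpkm_threshold: return 1
        else: return 0
    elif avg_rpkm>=exon_rpkm_threshold: return 1 ### exon (or junction) meeting the exon cutoff
    elif '-' in feature_id: return 1 ### junction: evaluated on counts instead of RPKM
    else: return 0

#passesRPKMFilter('ENSMUSG00000045991:E2.2',0.8) ### -> 1 with these default cutoffs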
def exportTransitResults(array_group_list,array_raw_group_values,array_group_db,avg_const_exp_db,adj_fold_dbase,exon_db,dataset_name,apt_dir):
    """Export processed raw expression values (e.g. add global fudge factor or eliminate probe sets based on filters) to txt files for analysis with MiDAS"""
    #array_group_list contains group names in order of analysis
    #array_raw_group_values contains expression values for the x number of groups in above list
    #array_group_db key is the group name and values are the list of array names
    #avg_const_exp_db contains the average expression values for all arrays for all constitutive probesets, with gene as the key
    ordered_array_header_list=[]
    for group in array_group_list: ###contains the correct order for each group
        for array_id in array_group_db[group]:
            ordered_array_header_list.append(str(array_id))
    ordered_exp_val_db = {} ###new dictionary containing all expression values together, but organized based on group
    probeset_affygene_db = {} ###lists all altsplice probesets and corresponding affygenes
    for probeset in array_raw_group_values:
        try:
            include_probeset = 'yes' ###Examines user input parameters for inclusion of probeset types in the analysis
            if include_probeset == 'yes':
                if probeset in adj_fold_dbase: ###indicates that this probeset is analyzed for splicing (e.g. has a constitutive probeset)
                    for group_val_list in array_raw_group_values[probeset]:
                        non_log_group_exp_vals = statistics.log_fold_conversion(group_val_list)
                        for val in non_log_group_exp_vals:
                            try: ordered_exp_val_db[probeset].append(str(val))
                            except KeyError: ordered_exp_val_db[probeset] = [str(val)]
                    affygene = exon_db[probeset].GeneID()
                    try: probeset_affygene_db[affygene].append(probeset)
                    except KeyError: probeset_affygene_db[affygene] = [probeset]
        except KeyError:
            ###Indicates that the expression dataset file was not filtered for whether annotations exist in the exon annotation file
            ###In that case, just ignore the entry
            null = ''
    gene_count = 0
    ordered_gene_val_db={}
    for affygene in avg_const_exp_db: ###now, add all constitutive gene-level expression values (only per analyzed gene)
        if affygene in probeset_affygene_db: ###ensures we only include gene data where there are altsplice-examined probesets
            non_log_ordered_exp_const_val = statistics.log_fold_conversion(avg_const_exp_db[affygene])
            gene_count+=1
            for val in non_log_ordered_exp_const_val:
                try: ordered_gene_val_db[affygene].append(str(val))
                except KeyError: ordered_gene_val_db[affygene] = [str(val)]
    convert_probesets_to_numbers={}
    convert_affygene_to_numbers={}; array_type = 'junction'
    probeset_affygene_number_db={}; x=0; y=0
    for affygene in probeset_affygene_db:
        x+=1; y = x ###each affygene has a unique number, distinct from other affygenes and probesets; probesets count up from each affygene
        x_copy = x
        example_gene_probeset = probeset_affygene_db[affygene][0]
        #if exon_db[example_gene_probeset].ArrayType() == 'exon': x_copy = exon_db[example_gene_probeset].SecondaryGeneID()
        if x_copy not in exon_db: convert_affygene_to_numbers[affygene] = str(x_copy)
        else: print affygene, x_copy,'new numeric for MIDAS already exists as a probeset ID number'; kill
        for probeset in probeset_affygene_db[affygene]:
            y = y+1; y_copy = y
            if exon_db[probeset].ArrayType() == 'exon':
                y_copy = probeset ### Only appropriate when the probeset ID is a number
                array_type = 'exon'
            convert_probesets_to_numbers[probeset] = str(y_copy)
            try: probeset_affygene_number_db[str(x_copy)].append(str(y_copy))
            except KeyError: probeset_affygene_number_db[str(x_copy)] = [str(y_copy)]
        x=y

    metafile = 'AltResults/MIDAS/meta-'+dataset_name[0:-1]+'.txt'
    data1 = export.createExportFile(metafile,'AltResults/MIDAS')
    title = 'probeset_id\ttranscript_cluster_id\tprobeset_list\tprobe_count\n'
    data1.write(title)
    for affygene in probeset_affygene_number_db:
        probeset_list = probeset_affygene_number_db[affygene]; probe_number = str(len(probeset_list)*6)
        probeset_list = [string.join(probeset_list,' ')]
        probeset_list.append(affygene); probeset_list.append(affygene)
        probeset_list.reverse(); probeset_list.append(probe_number)
        probeset_list = string.join(probeset_list,'\t'); probeset_list = probeset_list+'\n'
        data1.write(probeset_list)
    data1.close()

    junction_exp_file = 'AltResults/MIDAS/'+array_type+'-exp-'+dataset_name[0:-1]+'.txt'
    fn2=filepath(junction_exp_file)
    data2 = open(fn2,'w')
    ordered_array_header_list.reverse(); ordered_array_header_list.append('probeset_id'); ordered_array_header_list.reverse()
    title = string.join(ordered_array_header_list,'\t')
    data2.write(title+'\n')
    for probeset in ordered_exp_val_db:
        probeset_number = convert_probesets_to_numbers[probeset]
        exp_values = ordered_exp_val_db[probeset]; exp_values.reverse(); exp_values.append(probeset_number); exp_values.reverse()
        exp_values = string.join(exp_values,'\t'); exp_values = exp_values+'\n'
        data2.write(exp_values)
    data2.close()

    gene_exp_file = 'AltResults/MIDAS/gene-exp-'+dataset_name[0:-1]+'.txt'
    fn3=filepath(gene_exp_file)
    data3 = open(fn3,'w')
    title = string.join(ordered_array_header_list,'\t')
    data3.write(title+'\n')
    for affygene in ordered_gene_val_db:
        try: affygene_number = convert_affygene_to_numbers[affygene]
        except KeyError: print len(convert_affygene_to_numbers), len(ordered_gene_val_db); kill
        exp_values = ordered_gene_val_db[affygene]; exp_values.reverse(); exp_values.append(affygene_number); exp_values.reverse()
        exp_values = string.join(exp_values,'\t'); exp_values = exp_values+'\n'
        data3.write(exp_values)
    data3.close()

    exportMiDASArrayNames(array_group_list,array_group_db,dataset_name,'new')

    coversionfile = 'AltResults/MIDAS/probeset-conversion-'+dataset_name[0:-1]+'.txt'
    fn5=filepath(coversionfile)
    data5 = open(fn5,'w')
    title = 'probeset\tprobeset_number\n'; data5.write(title)
    for probeset in convert_probesets_to_numbers:
        probeset_number = convert_probesets_to_numbers[probeset]
        values = probeset+'\t'+probeset_number+'\n'
        data5.write(values)
    data5.close()

    """ ### This code is obsolete... used before AltAnalyze could connect to APT directly.
    commands = 'AltResults/MIDAS/commands-'+dataset_name[0:-1]+'.txt'
    data = export.createExportFile(commands,'AltResults/MIDAS')
    path = filepath('AltResults/MIDAS'); path = string.replace(path,'\\','/'); path = 'cd '+path+'\n\n'
    metafile = 'meta-'+dataset_name[0:-1]+'.txt'
    junction_exp_file = array_type+'-exp-'+dataset_name[0:-1]+'.txt'
    gene_exp_file = 'gene-exp-'+dataset_name[0:-1]+'.txt'
    celfiles = 'celfiles-'+dataset_name[0:-1]+'.txt'
    command_line = 'apt-midas -c '+celfiles+' -g '+gene_exp_file+' -e '+junction_exp_file+' -m '+metafile+' -o '+dataset_name[0:-1]+'-output'
    data.write(path); data.write(command_line); data.close()
    """

    status = runMiDAS(apt_dir,array_type,dataset_name,array_group_list,array_group_db)
    return status
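### Illustrative meta-file row written above (tab-separated; numbers hypothetical):
### a gene assigned number 1 with probesets numbered 2, 3 and 4 yields
###     probeset_id  transcript_cluster_id  probeset_list  probe_count
###     1            1                      2 3 4          18
### where probe_count assumes 6 probes per probeset (3*6=18), matching the
### len(probeset_list)*6 calculation in the loop.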
def __init__(self,url,dir,file_type):
    """Copy the contents of a file from a given URL to a local file."""
    try: self.suppress = suppress_printouts
    except Exception: self.suppress = 'no'
    filename = url.split('/')[-1]; self.status = ''
    #print [url, dir, file_type]
    #dir = unique.filepath(dir) ### Can screw up directory structures
    if file_type is None: file_type = ''
    if len(file_type) == 2: filename, file_type = file_type ### Added this feature for when a file has an invalid filename
    output_filepath_object = export.createExportFile(dir+filename,dir[:-1])
    output_filepath = filepath(dir+filename); self.output_filepath = output_filepath
    if self.suppress == 'no': print "Downloading the following file:",filename,' ',
    self.original_increment = 5
    self.increment = 0
    import urllib
    from urllib import urlretrieve
    #if 'gene.txt.gz' in url: print [self.reporthookFunction];sys.exit()
    try:
        try: webfile, msg = urlretrieve(url,output_filepath,reporthook=self.reporthookFunction)
        except IOError:
            if 'Binary' in traceback.format_exc(): #IOError: [Errno ftp error] 200 Switching to Binary mode.
                ### https://bugs.python.org/issue1067702 - on some machines the socket doesn't close, which causes an error - reload urllib to close the socket
                reload(urllib)
                webfile, msg = urlretrieve(url,output_filepath,reporthook=self.reporthookFunction)
                reload(urllib)
    except:
        print 'Unknown URL error encountered...'; forceURLError ### undefined name: deliberately raises a NameError to halt execution
    if self.suppress == 'no': print ''
    self.testFile()
    if self.suppress == 'no': print self.status
    if 'Internet' not in self.status:
        if '.zip' in filename:
            if self.suppress == 'no': print "Extracting zip file...",
            try: decompressZipStackOverflow(filename,dir); status = 'completed'
            except Exception:
                #print 'Native unzip not present...trying python unzip methods...'
                status = unzipFiles(filename,dir)
                if status == 'failed': print 'zip extraction failed!'
            self.gz_filepath = filepath(output_filepath); self.status = 'remove'
            if self.suppress == 'no': print "zip file extracted"
        elif '.gz' in filename:
            self.gz_filepath = output_filepath
            if len(file_type)==0: extension = '.gz'
            else: extension = 'gz'
            decompressed_filepath = string.replace(self.gz_filepath,extension,file_type)
            ### Below code can be too memory intensive
            #file_size = os.path.getsize(output_filepath)
            #megabytes = file_size/1000000.00
            #if megabytes>5000: force_error ### force_error is an undefined variable which causes an exception
            import gzip; content = gzip.GzipFile(self.gz_filepath, 'rb')
            data = open(decompressed_filepath,'wb')
            #print "\nExtracting downloaded file:",self.gz_filepath
            import shutil; shutil.copyfileobj(content,data)
            # http://pythonicprose.blogspot.com/2009/10/python-extract-or-unzip-tar-file.html
            os.chdir(filepath(dir))
            if '.tar' in decompressed_filepath:
                import tarfile
                tfile = tarfile.open(decompressed_filepath)
                tfile.extractall()
                tfile.close()
                tar_dir = string.replace(decompressed_filepath,'.tar','')
            self.status = 'remove'
        else: self.gz_filepath = ''; self.status = 'remove'
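### Hypothetical usage sketch (the enclosing class is not shown in this excerpt;
### 'Downloader' is a placeholder name). An empty file_type keeps the remote
### filename and strips '.gz' for the decompressed copy; a 2-tuple
### (filename,extension) overrides an invalid remote filename.
#
# d = Downloader('http://example.org/data/Hs_Ensembl.txt.gz','AltDatabase/Hs/','')
# if d.status == 'remove': os.remove(d.gz_filepath) ### discard the archive once extracted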
def exportGroupedComparisonProbesetData(filename,probeset_db,data_type,array_names,array_linker_db,perform_alt_analysis):
    """This function organizes the raw expression data into sorted groups, exports the organized data for all conditions
    and comparisons, and calculates which probesets have groups that meet the user-defined dabg and expression thresholds."""
    #comparison_filename_list=[]
    #if perform_alt_analysis != 'expression': ### User Option (removed in version 2.0 since the option prevented proper filtering)
    comparison_filename_list=[]
    probeset_dbase={}; exp_dbase={}; constitutive_gene_db={}; probeset_gene_db={} ### reset databases to conserve memory
    global expr_group_list; global comp_group_list; global expr_group_db
    if data_type == 'residuals':
        expr_group_dir = string.replace(filename,'residuals.','groups.')
        comp_group_dir = string.replace(filename,'residuals.','comps.')
    elif data_type == 'expression':
        expr_group_dir = string.replace(filename,'exp.','groups.')
        comp_group_dir = string.replace(filename,'exp.','comps.')
        if 'counts.' in filename:
            expr_group_dir = string.replace(expr_group_dir,'counts.','groups.')
            comp_group_dir = string.replace(comp_group_dir,'counts.','comps.')
            data_type = 'counts'
    elif data_type == 'dabg':
        expr_group_dir = string.replace(filename,'stats.','groups.')
        comp_group_dir = string.replace(filename,'stats.','comps.')
    comp_group_list, comp_group_list2 = ExpressionBuilder.importComparisonGroups(comp_group_dir)
    expr_group_list,expr_group_db = ExpressionBuilder.importArrayGroups(expr_group_dir,array_linker_db)
    print "Reorganizing expression data into comparison groups for export to down-stream splicing analysis software"
    ###Do this only for the header data
    group_count,raw_data_comp_headers = reorder_arrays.reorderArrayHeaders(array_names,expr_group_list,comp_group_list,array_linker_db)
    ###Export the header info and store the export write data for reorder_arrays
    global comparision_export_db; comparision_export_db={}; array_type_name = 'Exon'
    if array_type == 'junction': array_type_name = 'Junction'
    elif array_type == 'RNASeq': array_type_name = 'RNASeq'
    if data_type != 'residuals': AltAnalzye_input_dir = root_dir+"AltExpression/pre-filtered/"+data_type+'/'
    else: AltAnalzye_input_dir = root_dir+"AltExpression/FIRMA/residuals/"+array_type+'/'+species+'/' ### These files do not need to be filtered until AltAnalyze.py
    for comparison in comp_group_list2: ###loop through the list of comparisons
        group1 = comparison[0]; group2 = comparison[1]
        group1_name = expr_group_db[group1]; group2_name = expr_group_db[group2]
        comparison_filename = species+'_'+array_type_name+'_'+group1_name+'_vs_'+group2_name+'.txt'
        new_file = AltAnalzye_input_dir + comparison_filename; comparison_filename_list.append(comparison_filename)
        data = export.createExportFile(new_file,AltAnalzye_input_dir[:-1])
        try: array_names = raw_data_comp_headers[comparison]
        except KeyError: print raw_data_comp_headers;kill
        title = ['UID']+array_names; title = string.join(title,'\t')+'\n'; data.write(title)
        comparision_export_db[comparison] = data ###store the export file write data so we can write after organizing
    #print filename, normalize_feature_exp
    biotypes = importExonProbesetData(filename,probeset_db,'reorderFilterAndExportAll')
    if normalize_feature_exp == 'RPKM': ### Add the gene-level RPKM data (this is in addition to the counts. file)
        exp_gene_db={}
        for i in probeset_db: exp_gene_db[probeset_db[i][0]]=[]
        filename = string.replace(filename,'.txt','-steady-state.txt')
        #print filename, normalize_feature_exp, 'here'
        importExonProbesetData(filename,exp_gene_db,'reorderFilterAndExportAll')
    for comparison in comparision_export_db:
        data = comparision_export_db[comparison]; data.close()
    print "Pairwise comparisons for AltAnalyze exported..."
    try: fulldataset_export_object.close()
    except Exception: null=[]
    return comparison_filename_list, biotypes
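### Illustrative comparison filename produced above (names hypothetical):
### species='Hs', array_type='RNASeq', and groups 'cancer' vs 'normal' give
### 'Hs_RNASeq_cancer_vs_normal.txt', written under
### root_dir+'AltExpression/pre-filtered/<data_type>/' (or the FIRMA residuals
### directory when data_type == 'residuals').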