def exportCorrelationResults(exp_input):
    input_file = export.findFilename(exp_input)
    if '.txt' in exp_output_file:
        corr_output_file = string.replace(exp_output_file, 'DATASET', 'LineageCorrelations')
    else:
        ### Occurs when processing a non-standard AltAnalyze file
        corr_output_file = exp_output_file + '/' + input_file
    corr_output_file = string.replace(corr_output_file, '.txt', '-' + coding_type + '-' + compendiumPlatform + '.txt')
    if analysis_type == 'AltExon':
        corr_output_file = string.replace(corr_output_file, coding_type, 'AltExon')
    filename = export.findFilename(corr_output_file)
    score_data = export.ExportFile(corr_output_file)
    zscore_output_dir = None  ### Only assigned below when use_scipy is True (prevents a NameError at the return)
    if use_scipy:
        zscore_output_dir = string.replace(corr_output_file, '.txt', '-zscores.txt')
        probability_data = export.ExportFile(zscore_output_dir)
        #adjustPValues()
        replacePearsonPvalueWithZscore()
    ### Make title row
    headers = ['Sample_name']
    for tissue in tissue_comparison_scores:
        for (r, p, sample) in tissue_comparison_scores[tissue]:
            headers.append(sample)
        break
    title_row = string.join(headers, '\t') + '\n'
    score_data.write(title_row)
    if use_scipy:
        probability_data.write(title_row)
    ### Export correlation data
    tissue_scores = {}
    tissue_probabilities = {}
    tissue_score_list = []  ### store and rank tissues according to max(score)
    for tissue in tissue_comparison_scores:
        scores = []
        probabilities = []
        for (r, p, sample) in tissue_comparison_scores[tissue]:
            scores.append(r)
            probabilities.append(p)
        tissue_score_list.append((max(scores), tissue))
        tissue_scores[tissue] = string.join(map(str, [tissue] + scores), '\t') + '\n'  ### export line
        if use_scipy:
            tissue_probabilities[tissue] = string.join(map(str, [tissue] + probabilities), '\t') + '\n'
    tissue_score_list.sort()
    tissue_score_list.reverse()
    for (score, tissue) in tissue_score_list:
        score_data.write(tissue_scores[tissue])
        if use_scipy:
            probability_data.write(tissue_probabilities[tissue])
    score_data.close()
    if use_scipy:
        probability_data.close()
    print filename, 'exported...'
    return zscore_output_dir
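### Usage sketch: exportCorrelationResults reads several module-level globals
### rather than taking them as arguments. The names below match the code above;
### the values are illustrative assumptions only, not taken from the source.
#
# tissue_comparison_scores['Heart'] = [(0.92, 1e-6, 'Sample1'), (0.45, 0.03, 'Sample2')]  ### (Pearson r, p-value, sample)
# exp_output_file = '/data/ExpressionOutput/DATASET-example.txt'
# coding_type = 'protein_coding'; compendiumPlatform = 'exon'; analysis_type = 'geneLevel'
# use_scipy = True  ### also triggers the -zscores.txt export
# zscore_file = exportCorrelationResults('/data/ExpressionInput/exp.example.txt')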
def runPyCombat(fl):
    """ This method was added specifically for AltAnalyze version 2.0.8 (not in the original GitHub code) """
    print 'Running Combat...',
    expr_input_dir = fl.ExpFile()
    pheno_dir = formatPhenoFile(fl)
    moved_exp_dir = export.findParentDir(expr_input_dir) + 'Non-Combat/' + export.findFilename(expr_input_dir)
    try:
        export.copyFile(expr_input_dir, moved_exp_dir)
        print 'Moved original expression file to:'
        print '\t' + moved_exp_dir
        ### now overwrite the original, excluding the commented rows
        export.cleanFile(expr_input_dir, removeExtra='#')  ### remove comments from the original file
    except Exception:
        pass
    pheno = pa.read_table(pheno_dir, index_col=0)
    dat = pa.read_table(expr_input_dir, index_col=0)
    mod = patsy.dmatrix("group", pheno, return_type="dataframe")
    t = time.time()
    #print dat, pheno.batch, mod;sys.exit()
    ebat = combat(dat, pheno.batch, mod, 0)
    print "...Combat completed in %.2f seconds" % (time.time() - t)
    print 'Original expression file over-written with batch effect removal results...'
    ebat.to_csv(expr_input_dir, sep="\t")
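### The pa.read_table/patsy calls above imply a tab-delimited pheno file with at
### least 'batch' and 'group' columns, indexed by sample name. A minimal example
### layout (illustrative only, not from the source):
#
#   sample      batch   group
#   Sample1     1       control
#   Sample2     1       treated
#   Sample3     2       control
#   Sample4     2       treated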
def FilterFile(Guidefile, PSI, turn=0):
    if 'Clustering' in Guidefile:
        count = 1
    else:
        count = 0
    val = []
    head = 0
    for line in open(Guidefile, 'rU').xreadlines():
        if head > count:
            line = line.rstrip('\r\n')
            q = string.split(line, '\t')
            val.append(q[0])
        else:
            head += 1
            continue
    dire = export.findParentDir(export.findParentDir(Guidefile)[:-1])
    output_dir = dire + 'SubtypeAnalyses-Results'
    if os.path.exists(output_dir) == False:
        export.createExportFolder(output_dir)
    #output_file = output_dir+'/round'+str(turn)+'/'+export.findFilename(PSI)+'-filtered.txt'
    output_file = output_dir + '/round' + str(turn) + '/' + export.findFilename(PSI)[:-4] + '-filtered.txt'
    try:
        os.mkdir(output_dir + '/round' + str(turn))
    except:
        pass  ### already exists
    if turn == 1:
        ### No need to filter this file
        shutil.copyfile(PSI, output_file)
    else:
        filterRows(PSI, output_file, filterDB=val)
    return output_file
def downloadCurrentVersion(filename, secondary_dir, file_type):
    import UI
    file_location_defaults = UI.importDefaultFileLocations()
    ud = file_location_defaults['url']  ### Get the location of the download site from Config/default-files.csv
    url_dir = ud.Location()  ### Only one entry
    dir = export.findParentDir(filename)
    dir = string.replace(dir, 'hGlue', '')  ### Used since the hGlue data is in a sub-directory
    filename = export.findFilename(filename)
    url = url_dir + secondary_dir + '/' + filename
    file, status = download(url, dir, file_type)
    continue_analysis = 'yes'
    if 'Internet' in status and 'nnot' not in filename:  ### Exclude for Affymetrix annotation files
        print_out = "File:\n" + url + "\ncould not be found on the server or an internet connection is unavailable."
        if len(sys.argv) < 2:
            try:
                UI.WarningWindow(print_out, 'WARNING!!!')
                continue_analysis = 'no'
            except Exception:
                print 'cannot be downloaded'
                force_error  ### undefined name - deliberately raises a NameError to halt execution
        else:
            print 'cannot be downloaded'
            force_error  ### undefined name - deliberately raises a NameError to halt execution
    elif status == 'remove' and ('.zip' in file or '.tar' in file or '.gz' in file):
        try:
            os.remove(file)  ### Not sure why this works now and not before
        except Exception:
            status = status
    return continue_analysis
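### The download() helper called above is defined elsewhere in the project; its
### contract here is that it returns (local file path, status string). A minimal
### sketch of that contract using Python 2's urllib, purely as an assumption -
### the real implementation also handles decompression based on file_type:
def downloadSketch(url, output_dir, file_type):
    import urllib
    output_file = output_dir + url.split('/')[-1]  ### output_dir is assumed to end with '/'
    try:
        urllib.urlretrieve(url, output_file)
        status = 'downloaded'
    except Exception:
        status = 'Internet connection could not be established'
    return output_file, status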
def normalizeDataset(filename, output=None, normalization='quantile', platform="3'array"):
    """ Perform Quantile Normalization on an input expression dataset """
    if output == None:
        output = filename
    moved_exp_dir = export.findParentDir(filename) + 'Non-Normalized/' + export.findFilename(filename)
    try:
        export.copyFile(filename, moved_exp_dir)
        print 'Moved original expression file to:'
        print '\t' + moved_exp_dir
    except Exception:
        pass
    if normalization == 'Quantile' or normalization == 'quantile':
        print "Importing data..."
        sample_expression_db = importExpressionValues(filename)
        print "Performing quantile normalization..."
        sample_expression_db = RNASeq.quantileNormalizationSimple(sample_expression_db)
        exportExpressionData(output, sample_expression_db)
    elif normalization == 'group':
        performGroupNormalization(moved_exp_dir, filename, platform)
    print 'Exported expression input file to:', output
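### RNASeq.quantileNormalizationSimple is imported from the AltAnalyze package.
### For reference, quantile normalization itself reduces to: rank each sample's
### values, then replace each value by the mean across samples at that rank.
### A minimal numpy sketch of the technique (not the package's implementation;
### ties are handled naively):
def quantileNormalizeSketch(matrix):
    import numpy as np
    data = np.asarray(matrix, dtype=float)                   ### genes x samples
    ranks = np.argsort(np.argsort(data, axis=0), axis=0)     ### per-sample rank of each gene
    mean_of_sorted = np.sort(data, axis=0).mean(axis=1)      ### mean reference distribution
    return mean_of_sorted[ranks]                             ### substitute rank-matched means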
def importExonIDTranslations(array_type, species, translate_to_genearray):
    gene_translation_db = {}; gene_translation_db2 = {}
    if targetPlatform == 'gene' and translate_to_genearray == 'no':
        ### Get gene array to exon array probeset associations
        gene_translation_db = importExonIDTranslations('gene', species, 'yes')
        for geneid in gene_translation_db:
            exonid = gene_translation_db[geneid]
            gene_translation_db2[exonid] = geneid
            #print exonid, geneid
        translation_db = gene_translation_db2
    else:
        filename = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_' + array_type + '-exon_probesets.txt'
        ### Import exon array to target platform translations (built for DomainGraph visualization)
        fn = filepath(filename); x = 0; translation_db = {}
        print 'Importing the translation file', export.findFilename(fn)
        for line in open(fn, 'rU').xreadlines():
            data = cleanUpLine(line)
            t = string.split(data, '\t')
            if x == 0:
                x = 1
            else:
                platform_id, exon_id = t
                if targetPlatform == 'gene' and translate_to_genearray == 'no':
                    try:
                        translation_db[platform_id] = gene_translation_db[exon_id]  ### return RNA-Seq to gene array probeset ID
                        #print platform_id, exon_id, gene_translation_db[exon_id];sys.exit()
                    except Exception:
                        null = []
                else:
                    translation_db[platform_id] = exon_id
    del gene_translation_db; del gene_translation_db2
    return translation_db
def covertAffyFormatToBED(filename, ConversionDB=None):
    print 'processing:', filename
    parent = export.findParentDir(filename)
    if ConversionDB == None:
        output_file = 'simple_chr.bed'
    else:
        output_file = export.findFilename(filename)
        output_file = string.replace(output_file, 'mm9', 'mm10')
    export_obj = export.ExportFile(parent + '/' + output_file)
    fn = filepath(filename); entry_count = 0; readfiles = False
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        if data[0] == '#':
            readfiles = False
        elif readfiles == False:
            readfiles = True
            if ConversionDB != None:
                export_obj.write(line)  ### Write header
        else:
            try:
                t = string.split(data[1:-1], '","')
                probeset_id, chr, strand, start, stop = t[:5]
                int(start)  ### raises for non-coordinate rows, which the except below skips
                if ConversionDB == None:
                    if 'chr' in chr:
                        export_obj.write(chr + '\t' + start + '\t' + stop + '\t' + probeset_id + '\n')
                else:
                    chr, start, stop = ConversionDB[probeset_id]
                    t = [probeset_id, chr, strand, start, stop] + t[5:]
                    values = '"' + string.join(t, '","') + '"\n'
                    export_obj.write(values)
                entry_count += 1
            except Exception:
                pass
    export_obj.close()
    print entry_count, 'entries saved to:', parent + '/' + output_file
def downloadCurrentVersion(filename, secondary_dir, file_type):
    import UI
    file_location_defaults = UI.importDefaultFileLocations()
    uds = file_location_defaults['url']  ### Get the location of the download site from Config/default-files.csv
    for ud in uds:
        url_dir = ud.Location()  ### Only one entry
    dir = export.findParentDir(filename)
    filename = export.findFilename(filename)
    url = url_dir + secondary_dir + '/' + filename
    file, status = download(url, dir, file_type)
    continue_analysis = 'yes'
    if 'Internet' in status:
        print_out = "File:\n" + url + "\ncould not be found on the server or an internet connection is unavailable."
        try:
            UI.WarningWindow(print_out, 'WARNING!!!')
            continue_analysis = 'no'
        except Exception:
            print url
            print 'cannot be downloaded'
            die  ### undefined name - deliberately raises a NameError to halt execution
    elif status == 'remove':
        try:
            os.remove(file)  ### Not sure why this works now and not before
        except Exception:
            status = status
    return continue_analysis
def importTissueSpecificProfiles(species):
    if analysis_type == 'AltExon':
        filename = 'AltDatabase/ensembl/' + species + '/' + species + '_' + targetPlatform + '_tissue-specific_AltExon_protein_coding.txt'
    else:
        filename = 'AltDatabase/ensembl/' + species + '/' + species + '_' + targetPlatform + '_tissue-specific_' + coding_type + '.txt'
    if customMarkerFile != False and customMarkerFile != None:
        if len(customMarkerFile) > 0:
            filename = customMarkerFile
    #filename = 'AltDatabase/ensembl/'+species+'/random.txt'
    #print 'Target platform used for analysis:',species, targetPlatform, coding_type
    if value_type == 'calls':
        filename = string.replace(filename, '.txt', '_stats.txt')
    fn = filepath(filename); x = 0
    tissues_added = {}
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, '\t')
        if x == 0:
            print 'Importing the tissue compendium database:', export.findFilename(filename)
            headers = t; x = 1; index = 0
            for i in headers:
                if 'UID' == i:
                    ens_index = index; uid_index = index
                if analysis_type == 'AltExon':
                    ens_index = ens_index  ### Assigned above when analyzing probesets
                elif 'Ensembl' in i:
                    ens_index = index
                if 'marker-in' in i:
                    tissue_index = index + 1; marker_in = index
                index += 1
            try:
                for i in t[tissue_index:]:
                    tissues.append(i)
            except Exception:
                for i in t[1:]:
                    tissues.append(i)
            if keyed_by == 'primaryID':
                try:
                    ens_index = uid_index
                except Exception:
                    pass
        else:
            try:
                gene = t[0]
                tissue_exp = map(float, t[1:])
                tissue_specific_db[gene] = x, tissue_exp  ### Use this to only grab relevant gene expression profiles from the input dataset
            except Exception:
                try:
                    gene = string.split(t[ens_index], '|')[0]  ### Only consider the first listed gene - this gene is the best option based on ExpressionBuilder rankings
                except Exception:
                    pass
                #if 'Pluripotent Stem Cells' in t[marker_in] or 'Heart' in t[marker_in]:
                #if t[marker_in] not in tissues_added: ### Only add the first instance of a gene for that tissue - used more for testing to quickly run the analysis
                tissue_exp = map(float, t[tissue_index:])
                if value_type == 'calls':
                    tissue_exp = produceDetectionCalls(tissue_exp, platform)  ### 0 or 1 calls
                tissue_specific_db[gene] = x, tissue_exp  ### Use this to only grab relevant gene expression profiles from the input dataset
                tissues_added[t[marker_in]] = []
            x += 1
    print len(tissue_specific_db), 'genes in the tissue compendium database'
    if correlate_to_tissue_specific == 'yes':
        try:
            importTissueCorrelations(filename)
        except Exception:
            null = []
def visualizePathwayAssociations(filename, species, mod_type, wpid, imageExport=True):
    ### Log any potential problems
    log_file = filepath('webservice.log')
    log_report = open(log_file, 'w')
    if wpid == None:
        force_invalid_pathway  ### undefined name - deliberately raises a NameError when no WikiPathways ID is supplied
    global mod
    global species_code
    global graphic_link
    graphic_link = {}
    mod = mod_type
    species_code = species
    root_dir = export.findParentDir(filename)
    criterion_name = export.findFilename(filename)[:-4]
    log_report.write('Filename: %s and WPID %s\n' % (filename, wpid))
    if 'GO-Elite/input' in root_dir:
        root_dir = string.replace(root_dir, 'GO-Elite/input', 'WikiPathways')
    else:
        root_dir += 'WikiPathways/'
    analysis_type = 'Genes'
    id_db, column_headers = importDataSimple(filename, 'GO-Elite')
    log_report.write('GO-Elite input ID file imported successfully\n')
    log_report.write('%d IDs imported\n' % len(id_db))
    pathway_db = {}
    pathway_db[wpid] = PathwayData(None)  ### only need to analyze one object (the method allows for analysis of any number)
    pathway_db = getPathwayAs(pathway_db, species_code, mod)
    log_report.write('Pathway data imported from GPML files obtained from webservice\n')
    id_color_db = getHexadecimalColorRanges(id_db, analysis_type)  ### example "id_db" is key:gene, value:fold
    graphID_db = getGraphIDAssociations(id_color_db, pathway_db, 'MOD')
    if imageExport != 'png':
        file_type = 'pdf'  ### svg, pdf, png
        getColoredPathway(root_dir, graphID_db, file_type, '-' + criterion_name, WPID=wpid)
    if imageExport != 'pdf':
        file_type = 'png'  ### svg, pdf, png
        getColoredPathway(root_dir, graphID_db, file_type, '-' + criterion_name, WPID=wpid)
    log_report.write('Pathways colored and image data returned. Exiting webservice.\n')
    log_report.close()
    return graphic_link
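### Usage sketch (hypothetical arguments): colors the WikiPathways diagram for a
### GO-Elite input criterion file and returns {description: image path}. The
### path and WPID below are illustrative, not from the source.
# graphic_link = visualizePathwayAssociations('/data/GO-Elite/input/GE.criterion1.txt', 'Hs', 'Ensembl', 'WP254')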
def normalizeDataset(filename, output=None):
    """ Perform Quantile Normalization on an input expression dataset """
    print "Importing data..."
    sample_expression_db = importExpressionValues(filename)
    print "Performing quantile normalization..."
    sample_expression_db = RNASeq.quantileNormalizationSimple(sample_expression_db)
    if output == None:
        output = filename
    moved_exp_dir = export.findParentDir(filename) + "Non-Quantile/" + export.findFilename(filename)
    try:
        export.copyFile(filename, moved_exp_dir)
        print "Moved original expression file to:"
        print "\t" + moved_exp_dir
    except Exception:
        pass
    exportExpressionData(output, sample_expression_db)
    print "Exported expression input file to:", output
def readFPKMs(path):
    if ".gz" in path:
        f = gzip.open(path, "rb")
    else:
        f = open(path, "rU")
    file_content = f.read()
    fpkm_data = string.split(file_content, "\n")
    sample = export.findFilename(path)
    if "fpkm_tracking" in sample:
        sample = string.split(sample, ".fpkm_tracking")[0]
        sample = string.replace(sample, ".sorted.genes", "")
    fpkm_db = {}
    transcript_db = {}
    firstLine = True
    row_count = 0
    for line in fpkm_data:
        data = cleanUpLine(line)
        t = string.split(data, "\t")
        if firstLine:
            try:
                track_i = t.index("tracking_id")
                gene_i = t.index("gene_id")
                fpkm_i = t.index("FPKM")
            except Exception:
                ### No header present - fall back to the default Cufflinks column order
                fpkm_i = 9
                gene_i = 3
                row_count = 1  ### treat this first line as data
            firstLine = False
        if firstLine == False and row_count > 0:
            if len(t) > 1:
                geneID = t[gene_i]
                transcriptID = t[gene_i]  ### keyed by gene_id (track_i is parsed above but not used)
                fpkm = t[fpkm_i]
                fpkm_db[transcriptID] = float(fpkm)
                transcript_db[transcriptID] = geneID
        row_count += 1
    sample_FPKM_db[sample] = fpkm_db
    return sample_FPKM_db, transcript_db
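### readFPKMs expects Cufflinks *.fpkm_tracking output. The header-based lookup
### above matches the standard column layout (tracking_id first, gene_id fourth,
### FPKM tenth), which is also what the headerless fallback indices 3 and 9
### assume. Illustrative first rows (values hypothetical):
#
#   tracking_id      class_code  nearest_ref_id  gene_id          gene_short_name  tss_id  locus        length  coverage  FPKM  ...
#   ENSG00000000003  -           -               ENSG00000000003  TSPAN6           -       X:1-1000     -       -         7.25  ...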
def readFPKMs(path):
    f = gzip.open(path, 'rb')
    file_content = f.read()
    fpkm_data = string.split(file_content, '\n')
    sample = export.findFilename(path)
    fpkm_db = {}
    transcript_db = {}
    firstLine = True
    for line in fpkm_data:
        data = cleanUpLine(line)
        t = string.split(data, '\t')
        if firstLine:
            track_i = t.index('tracking_id')
            gene_i = t.index('gene_id')
            fpkm_i = t.index('FPKM')
            firstLine = False
        else:
            if len(t) > 1:  ### guard against the trailing empty line produced by the final '\n'
                geneID = t[gene_i]
                transcriptID = t[gene_i]
                fpkm = t[fpkm_i]
                fpkm_db[transcriptID] = float(fpkm)
                transcript_db[transcriptID] = geneID
    sample_FPKM_db[sample] = fpkm_db
    return sample_FPKM_db, transcript_db
def Enrichment(Inputfile, mutdict, mutfile, Expand, header):
    import collections
    import mappfinder
    X = defaultdict(list)
    prev = ""
    head = 0
    group = defaultdict(list)
    enrichdict = defaultdict(float)
    mut = export.findFilename(mutfile)
    dire = export.findParentDir(Inputfile)
    output_dir = dire + 'MutationEnrichment'
    export.createExportFolder(output_dir)
    exportnam = output_dir + '/Enrichment_Results.txt'
    export_enrich = open(exportnam, "w")
    exportnam = output_dir + '/Enrichment_tophits.txt'
    export_hit = open(exportnam, "w")
    export_enrich.write("Mutations\tCluster\tr\tR\tn\tSensitivity\tSpecificity\tz-score\tFisher exact test\tadjp value\n")
    if Expand == "yes":
        header2 = header_file(Inputfile, Expand="yes")
        for line in open(Inputfile, 'rU').xreadlines():
            if head > 0:
                line = line.rstrip('\r\n')
                q = string.split(line, '\t')
                for i in range(1, len(q)):
                    if q[i] == str(1):
                        #group[q[0]].append(header2[i-1])
                        group[header2[i - 1]].append(q[0])
            else:
                head += 1
                continue
    else:
        for line in open(Inputfile, 'rU').xreadlines():
            line = line.rstrip('\r\n')
            line = string.split(line, '\t')
            #for i in range(1,len(line)):
            group[line[2]].append(line[0])
    total_Scores = {}
    for kiy in mutdict:
        if kiy == "MDP":
            print mutdict[kiy]
        groupdict = {}
        remaining = list(set(header) - set(mutdict[kiy]))
        groupdict[1] = mutdict[kiy]
        groupdict[2] = remaining
        #export_enrich1.write(kiy)
        for key2 in group:
            r = float(len(list(set(group[key2]))) - len(list(set(group[key2]) - set(mutdict[kiy]))))
            n = float(len(group[key2]))
            R = float(len(set(mutdict[kiy])))
            N = float(len(header))
            if r == 0 or R == 1.0:
                print kiy, key2, r, n, R, N
                pval = float(1)
                z = float(0)
                null_z = 0.000
                zsd = mappfinder.ZScoreData(key2, r, R, z, null_z, n)
                zsd.SetP(pval)
            else:
                try:
                    z = Zscore(r, n, N, R)
                except Exception:
                    z = 0.0000
                ### Calculate a Z-score assuming zero matching entries
                try:
                    null_z = Zscore(0, n, N, R)
                except Exception:
                    null_z = 0.000
                try:
                    pval = mappfinder.FishersExactTest(r, n, R, N)
                    zsd = mappfinder.ZScoreData(key2, r, R, z, null_z, n)
                    zsd.SetP(pval)
                except Exception:
                    pval = 1.0
                    zsd = mappfinder.ZScoreData(key2, r, R, z, null_z, n)
                    zsd.SetP(pval)
            if kiy in total_Scores:
                signature_db = total_Scores[kiy]
                signature_db[key2] = zsd  ### Necessary format for the permutation function
            else:
                signature_db = {key2: zsd}
                total_Scores[kiy] = signature_db
    sorted_results = []
    mutlabels = {}
    for kiy in total_Scores:
        signature_db = total_Scores[kiy]
        ### Updates the adjusted p-value instances
        mappfinder.adjustPermuteStats(signature_db)
        for signature in signature_db:
            zsd = signature_db[signature]
            results = [kiy, signature, zsd.Changed(), zsd.Measured(), zsd.InPathway(),
                       str(float(zsd.PercentChanged()) / 100.0),
                       str(float(float(zsd.Changed()) / float(zsd.InPathway()))),
                       zsd.ZScore(), zsd.PermuteP(), zsd.AdjP()]  #string.join(zsd.AssociatedIDs(),'|')
            sorted_results.append([signature, float(zsd.PermuteP()), results])
    sorted_results.sort()  ### Sort by p-value
    prev = ""
    for (sig, p, values) in sorted_results:
        if sig != prev:
            flag = True
            export_hit.write(string.join(values, '\t') + '\n')
        if flag:
            if (float(values[5]) >= 0.5 and float(values[6]) >= 0.5) or float(values[5]) >= 0.6:
                mutlabels[values[1]] = values[0]
                flag = False
                export_hit.write(string.join(values, '\t') + '\n')
        export_enrich.write(string.join(values, '\t') + '\n')
        prev = sig
    if len(sorted_results) == 0:
        ### NOTE: splicing_factor is assumed to be a module-level global here
        export_enrich.write(string.join([splicing_factor, 'NONE', 'NONE', 'NONE', 'NONE', 'NONE', 'NONE'], '\t') + '\n')
    export_enrich.close()
    #print mutlabels
    return mutlabels
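### Zscore() is defined elsewhere in this module. MAPPFinder-style enrichment
### z-scores are conventionally derived from the hypergeometric mean and
### variance, as sketched below (r = hits in the cluster, n = cluster size,
### N = total samples, R = total mutated samples). Treat this as a reference
### formula and an assumption, not necessarily the exact local implementation:
def ZscoreSketch(r, n, N, R):
    import math
    expected = n * (R / N)
    variance = n * (R / N) * (1.0 - R / N) * (1.0 - (n - 1.0) / (N - 1.0))
    if variance <= 0:
        return 0.0
    return (r - expected) / math.sqrt(variance)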
def Enrichment(Guidefile, mutdict, mutfile, Expand, header):
    X = defaultdict(list)
    prev = ""
    head = 0
    group = defaultdict(list)
    mut = export.findFilename(mutfile)
    exportnam = Guidefile[:-4] + mut[:-4] + 'enrichment.txt'
    export_enrich = open(exportnam, "w")
    export_enrich.write("Mutations\tCluster\tPvalue\tr\tR\tn\tz-score\tSensitivity\tSpecificity\n")
    if Expand == "yes":
        header2 = header_file(Guidefile)
        for line in open(Guidefile, 'rU').xreadlines():
            if head > 0:
                line = line.rstrip('\r\n')
                q = string.split(line, '\t')
                for i in range(1, len(q)):
                    if q[i] == str(1):
                        group[q[0]].append(header2[i - 1])
            else:
                head += 1
                continue
    else:
        for line in open(Guidefile, 'rU').xreadlines():
            line = line.rstrip('\r\n')
            line = string.split(line, '\t')
            #for i in range(1,len(line)):
            group[line[2]].append(line[0])
    for kiy in mutdict:
        groupdict = {}
        remaining = list(set(header) - set(mutdict[kiy]))
        groupdict[1] = mutdict[kiy]
        groupdict[2] = remaining
        for key2 in group:
            r = float(len(group[key2]) - len(list(set(group[key2]) - set(mutdict[kiy]))))
            n = float(len(group[key2]))
            R = float(len(set(mutdict[kiy])))
            N = float(len(header))
            #print kiy,key2,r,n,R,N
            if r == 0:
                pval = float(1)
                z = float(0)
            else:
                try:
                    pval, z = FishersExactTest(r, n, R, N)
                    export_enrich.write(str(kiy) + "\t" + str(key2) + "\t" + str(pval) + "\t" + str(r) + "\t" + str(R) + "\t" + str(n) + "\t" + str(z) + "\t" + str(float(r) / float(R)) + "\t" + str(float(r) / float(n)) + "\n")
                except Exception:
                    print r, n, R, N
    export_enrich.close()  ### ensure results are flushed to disk (the original never closed this handle)
def remoteGene(gene, Species, root_dir, comparison_file):
    global Transcript_Annotations_File
    global ExonRegion_File
    global Selected_Gene
    global Prt_Trans_File
    global Prt_Regions_File
    global Prt_Boundaries_File
    global SplicingIndex_File
    global UniPrt_Regions_File
    global microRNA_File
    global domainAnnotation_db
    global platform
    global species

    Selected_Gene = str(gene)
    species = Species
    comparison_name = string.split(export.findFilename(comparison_file), '.')[0]
    ExonRegion_File = unique.filepath("AltDatabase/ensembl/" + species + "/" + species + "_Ensembl_exon.txt")
    Transcript_Annotations_File = unique.filepath("AltDatabase/ensembl/" + species + "/" + species + "_Ensembl_transcript-annotations.txt")
    Prt_Trans_File = searchDirectory("AltDatabase/ensembl/" + species + "/", 'Ensembl_Protein')
    Prt_Regions_File = searchDirectory("AltDatabase/ensembl/" + species + "/", 'ProteinFeatures')
    Prt_Boundaries_File = searchDirectory("AltDatabase/ensembl/" + species + "/", 'ProteinCoordinates')
    UniPrt_Regions_File = searchDirectory("AltDatabase/uniprot/" + species + "/", 'FeatureCoordinate')
    SplicingIndex_File = searchDirectory(root_dir + '/AltResults/ProcessedSpliceData/', 'splicing-index', secondary=comparison_name)
    platform = getPlatform(SplicingIndex_File)
    microRNA_File = searchDirectory("AltDatabase/" + species + "/" + platform, 'microRNAs_multiple')
    #print(SplicingIndex_File)

    total_val = ProteinCentricIsoformView(Selected_Gene)
    junctions = total_val[0]
    p_boundaries = total_val[1]
    p_domains = total_val[2]
    transcript_db = total_val[3]
    exon_db = total_val[4]
    splice_db = total_val[5]
    microRNA_db = total_val[6]
    domainAnnotation_db = total_val[7]

    #for i in exon_db: print("THE", i, exon_db[i], "\n")
    #for i in microRNA_db:
    #    for q in microRNA_db[i]: print("microRNA", q.ExonBlock(), q.Description(), q.BP(), "\n")
    #for i in exon_db["ENST00000349238"]: print(i[2].EnsemblRegion())

    ### Assign one of six rotating colors to each unique protein domain
    domain_color_list = []
    for i in p_domains:
        ploy = p_domains[i]
        for a in ploy:
            domain_color_list.append(a[1])
    domain_color_list = list(set(domain_color_list))
    color_cycle = [[0.8, 0.6, 0.1], [0.1, 0.6, 0.8], [0.6, 0.1, 0.8],
                   [0.95, 0.6, 0.3], [0.3, 0.6, 0.95], [0.6, 0.3, 0.95]]
    domain_color_key = {}
    FLAG = 0
    for item in domain_color_list:
        domain_color_key[item] = color_cycle[FLAG % 6]
        FLAG += 1
    #for i in domain_color_key: print(i, domain_color_key[i], "\n")

    Y = 100
    Transcript_to_Y = {}
    for transcript in transcript_db:
        Transcript_to_Y[transcript] = Y
        Y = Y + 300
    import traceback

    def onpick(event):
        #ind = event.ind
        print(event.artist.get_label())
        #for i in domainAnnotation_db: print(i, len(domainAnnotation_db)); break

    fig = pylab.figure()
    ylim = Y + 200
    currentAxis = pylab.gca()
    #ax = pylab.axes()
    ax = fig.add_subplot(111)
    X_Pos_List = []
    CoordsBank = []

    for transcript in transcript_db:
        try:
            Junc_List = junctions[transcript]
            y_pos = Transcript_to_Y[transcript]
            Gene_List = exon_db[transcript]
            color_flag = 1
            for entry in Gene_List:
                G_start = entry[0][0]
                G_end = entry[0][1]
                Exon_Object = entry[2]
                try:
                    LabelClass = splice_db[Exon_Object.EnsemblRegion()]
                    ExonName = Exon_Object.EnsemblExon()
                    RegCall = LabelClass.RegCall()
                    SplicingIndex = LabelClass.SplicingIndex()
                    PVal = LabelClass.PVal()
                    Midas = LabelClass.Midas()
                    Label = "\n" + "Exon: " + str(ExonName) + "\n" + "RegCall: " + str(RegCall) + "\n" + "Splicing Index: " + str(SplicingIndex) + "\n" + "P-Value: " + str(PVal) + "\n" + "Midas Value: " + str(Midas) + "\n"
                    Label = string.replace(Label, "\n", " ")
                    if RegCall == "UC":
                        color_choice = "Grey"
                    else:
                        S_Int = float(SplicingIndex)
                        if S_Int > 0:
                            #color_choice = (0.7, 0.7, 0.99)
                            color_choice = 'blue'
                        if S_Int < 0:
                            #color_choice = (0.8, 0.4, 0.4)
                            color_choice = 'red'
                except:
                    #print(traceback.format_exc());sys.exit()
                    Label = ""
                    color_choice = "Grey"
                #print("Start", G_start, "end", G_end, "Region", entry[2].EnsemblRegion())
                currentAxis.add_patch(Rectangle((G_start, y_pos), (G_end - G_start), 50, color=color_choice, label=(entry[2].EnsemblRegion() + Label), picker=True))
                y_end = y_pos + 50
                try:
                    CoordsBank.append((G_start, G_end, y_pos, y_end, 'Exon: ' + entry[2].EnsemblRegion() + ' ' + 'SI: ' + str(SplicingIndex)[:4] + ' Pval: ' + str(Midas)[:4]))
                except Exception:
                    CoordsBank.append((G_start, G_end, y_pos, y_end, 'Exon: ' + entry[2].EnsemblRegion()))
                #print(entry[2].EnsemblRegion(), y_pos, y_end)
                color_flag = color_flag + 1
                if entry[2].EnsemblRegion() in microRNA_db:
                    microRNA_object = microRNA_db[entry[2].EnsemblRegion()]
                    mr_label = "MICRORNA MATCHES" + "\n"
                    for class_object in microRNA_object:
                        mr_exonname = class_object.ExonBlock()
                        mr_desc = class_object.Description() + " " + class_object.Algorithms()
                        #print(mr_desc)
                        mr_label = mr_label + mr_desc + "\n"
                    currentAxis.add_patch(Rectangle((G_start, (y_pos - 75)), (G_end - G_start), 40, color="Green", label=mr_label, picker=True))
                    y_start = y_pos - 75
                    y_end = y_pos - 35
                    CoordsBank.append((G_start, G_end, y_start, y_end, mr_desc))
            for entry in Junc_List:
                junctionID = entry[-1]
                try:
                    LabelClass = splice_db[entry[2]]
                    RegCall = LabelClass.RegCall()
                    SplicingIndex = LabelClass.SplicingIndex()
                    PVal = LabelClass.PVal()
                    Midas = LabelClass.Midas()
                    Label = "\n" + "RegCall: " + str(RegCall) + "\n" + "Splicing Index: " + str(SplicingIndex) + "\n" + "P-Value: " + str(PVal) + "\n" + "Midas Value: " + str(Midas) + "\n"
                    if float(SplicingIndex) > 0:
                        color_junc = "blue"
                    if float(SplicingIndex) < 0:
                        color_junc = "red"
                    if RegCall == "UC":
                        color_junc = "grey"
                except:
                    Label = ""
                    color_junc = "grey"
                currentAxis.add_patch(Rectangle((entry[0], y_pos), (entry[1] - entry[0]), 50, color="White", label=(str(entry[2]) + Label), picker=True))
                ax.arrow(entry[0], (y_pos + 50), 8, 40, label=(str(entry[2]) + Label), color=color_junc, picker=True)
                ax.arrow((entry[0] + 8), (y_pos + 90), 11, -40, label=(str(entry[2]) + Label), color=color_junc, picker=True)
                y_start = y_pos
                y_end = y_pos + 30
                #print(junctionID, y_start, y_end)
                CoordsBank.append((G_start, G_end, y_start, y_end, junctionID))
            try:
                P_Bound_List = p_boundaries[transcript]
                E_Start = P_Bound_List[-2]
                E_End = P_Bound_List[-1]
                P_Start = P_Bound_List[1]
                P_End = P_Bound_List[2]
                #print("Boundaries: ", P_Start, P_End)
                X_Pos_List.append(int(E_End))
                #currentAxis.add_patch(Rectangle((E_Start, y_pos), E_End, 50, color="Blue"))
                try:
                    currentAxis.add_patch(Rectangle((P_Start, (y_pos + 120)), (P_End - P_Start), 10))
                except:
                    pass
                p_label_list = ["DEF"]
                #CoordsBank.append((P_Start, P_End, y_pos, P_End - P_Start, transcript)) ### Added by NS - needs work
                try:
                    P_Domain_List = p_domains[transcript]
                except Exception:
                    P_Domain_List = []
                for entry in P_Domain_List:
                    #print("Domain", entry)
                    color_domain_choice = domain_color_key[entry[1]]
                    domain_annotation = domainAnnotation_db[entry[1]]
                    #domain_annotation = string.replace(domain_annotation,'REGION-','')
                    p_label = (str(entry[0]) + " " + str(domain_annotation))
                    #print(entry[0], entry[2], entry[3], P_Start, P_End, domain_annotation)
                    if p_label in p_label_list:
                        continue  ### skip domains already drawn for this transcript
                    p_label_list.append(p_label)
                    currentAxis.add_patch(Rectangle((entry[2], y_pos + 100), (entry[3] - entry[2]), 50, color=color_domain_choice, label=p_label, picker=True))
                    y_start = y_pos + 100
                    y_end = y_pos + 150
                    CoordsBank.append((entry[2], entry[3], y_start, y_end, p_label))
            except Exception:
                #print(traceback.format_exc())
                pass
        except:
            #print(traceback.format_exc())
            pass

    pylab.ylim([0.0, ylim])
    try:
        max_x = max(X_Pos_List)
    except:
        max_x = 5000
    try:
        pylab.xlim([0.0, max_x])
    except:
        pylab.xlim([0.0, 3000])
    fig.canvas.mpl_connect('pick_event', onpick)

    def format_coord(x, y):
        for m in CoordsBank:
            if x >= m[0] and x <= m[1] and y >= m[2] and y <= m[3]:
                return m[4]
        return " "

    ax.format_coord = format_coord
    #datacursor(hover=True, formatter='{label}'.format, bbox=dict(fc='yellow', alpha=1), arrowprops=None)
    pylab.show()
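### Usage sketch (hypothetical paths): remoteGene() renders the protein-centric
### isoform view interactively; clicking a patch prints its label via the
### pick_event hook above, and hovering shows the CoordsBank annotation in the
### toolbar via format_coord.
# remoteGene('ENSG00000005339', 'Hs', '/data/MyAnalysis', '/data/MyAnalysis/Cancer_vs_Control.txt')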
def getPlatform(filename):
    prefix = string.split(export.findFilename(filename), '.')[0]
    array_type = string.split(prefix, '_')[1]
    if array_type != 'RNASeq':
        array_type = string.lower(array_type)
    return array_type
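### e.g., getPlatform('Hs_RNASeq_top_alt_junctions-splicing-index.txt') returns
### 'RNASeq', while 'Hs_Exon_...' style filenames come back lower-cased as 'exon'
### (filenames here are illustrative).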
def parseJunctionEntries(bam_dir, multi=False, Species=None):
    global bam_file
    global splicesite_db
    global IndicatedSpecies
    IndicatedSpecies = Species
    bam_file = bam_dir
    try:
        splicesite_db, chromosomes_found = retreiveAllKnownSpliceSites()
    except Exception:
        splicesite_db = {}; chromosomes_found = {}
    start = time.time()
    try:
        import collections
        junction_db = collections.OrderedDict()
    except Exception:
        try:
            import ordereddict
            junction_db = ordereddict.OrderedDict()
        except Exception:
            junction_db = {}
    original_junction_db = copy.deepcopy(junction_db)

    bamf = pysam.Samfile(bam_dir, "rb")
    ### Is there an indexed .bai for the BAM? Check.
    try:
        for entry in bamf.fetch():
            codes = map(lambda x: x[0], entry.cigar)
            break
    except Exception:
        ### Make BAM Index
        if multi == False:
            print 'Building BAM index file for', bam_dir
        bam_dir = str(bam_dir)
        ### On Windows, this indexing step will fail if the __init__ pysam file line 51 is not set to - catch_stdout = False
        pysam.index(bam_dir)
        bamf = pysam.Samfile(bam_dir, "rb")

    chromosome = False
    chromosomes = {}
    count = 0
    jid = 1
    prior_jc_start = 0
    l1 = None; l2 = None
    o = open(string.replace(bam_dir, '.bam', '__junction.bed'), "w")
    o.write('track name=junctions description="TopHat junctions"\n')
    export_isoform_models = False
    if export_isoform_models:
        io = open(string.replace(bam_dir, '.bam', '__isoforms.txt'), "w")
        isoform_junctions = copy.deepcopy(junction_db)
    outlier_start = 0; outlier_end = 0; read_count = 0; c = 0
    for entry in bamf.fetch():
        try:
            cigarstring = entry.cigarstring
        except Exception:
            codes = map(lambda x: x[0], entry.cigar)
            if 3 in codes:
                cigarstring = 'N'
            else:
                cigarstring = None
        if cigarstring != None:
            if 'N' in cigarstring:  ### Hence a junction
                """
                if entry.cigar[0][1]<60 and entry.cigar[0][1]>20:
                    if count<310:
                        a1 = entry.seq[entry.cigar[0][1]-5:entry.cigar[0][1]]
                        a2 = entry.seq[entry.cigar[0][1]:entry.cigar[0][1]+6]
                        if l1==a1 and l2==a2: continue
                        else:
                            print entry.opt('XS'), a1,a2, entry.seq
                            l1 = a1; l2 = a2
                    else: sys.exit()
                """
                if prior_jc_start == 0:
                    pass
                elif (entry.pos - prior_jc_start) > 5000 or bamf.getrname(entry.rname) != chromosome:
                    ### New chr or far from prior reads
                    writeJunctionBedFile(junction_db, jid, o)
                    #writeIsoformFile(isoform_junctions, io)
                    junction_db = copy.deepcopy(original_junction_db)  ### Re-set this object
                    jid += 1
                chromosome = bamf.getrname(entry.rname)
                chromosomes[chromosome] = []  ### keep track
                X = entry.pos
                Y = entry.pos + entry.alen
                prior_jc_start = X
                """
                if entry.is_reverse:
                    strand = '-' ### This is the strand the seq aligns to but not necessarily the REAL strand the mRNA aligns to (see XS below)
                else: strand = '+'
                """
                try:
                    tophat_strand = entry.opt('XS')  ### TopHat knows which sequences are likely real splice sites so it assigns a real strand to the read
                except Exception:
                    #if multi == False: print 'No TopHat strand information';sys.exit()
                    tophat_strand = None
                coordinates, up_to_intron_dist = getSpliceSites(entry.cigar, X)
                for (five_prime_ss, three_prime_ss) in coordinates:
                    jc = five_prime_ss, three_prime_ss
                    #print X, Y, jc, entry.cigarstring, entry.cigar
                    try:
                        junction_db[chromosome, jc, tophat_strand].append([X, Y, up_to_intron_dist])
                    except Exception:
                        junction_db[chromosome, jc, tophat_strand] = [[X, Y, up_to_intron_dist]]
                if export_isoform_models:
                    try:
                        mate = bamf.mate(entry)  #https://groups.google.com/forum/#!topic/pysam-user-group/9HM6nx_f2CI
                        if 'N' in mate.cigarstring:
                            mate_coordinates, mate_up_to_intron_dist = getSpliceSites(mate.cigar, mate.pos)
                        else:
                            mate_coordinates = []
                    except Exception:
                        mate_coordinates = []
                    #print coordinates, mate_coordinates
                    junctions = map(lambda x: tuple(x), coordinates)
                    if len(mate_coordinates) > 0:
                        try:
                            isoform_junctions[chromosome, tuple(junctions), tophat_strand].append(mate_coordinates)
                        except Exception:
                            isoform_junctions[chromosome, tuple(junctions), tophat_strand] = [mate_coordinates]
                    else:
                        if (chromosome, tuple(junctions), tophat_strand) not in isoform_junctions:
                            isoform_junctions[chromosome, tuple(junctions), tophat_strand] = []
                count += 1
    writeJunctionBedFile(junction_db, jid, o)  ### One last read-out
    if multi == False:
        print time.time() - start, 'seconds required to parse the BAM file'
    o.close()
    bamf.close()

    missing_chromosomes = []
    for chr in chromosomes_found:
        if chr not in chromosomes:
            chr = string.replace(chr, 'chr', '')
            if chr not in chromosomes_found:
                if chr != 'M' and chr != 'MT':
                    missing_chromosomes.append(chr)
    #missing_chromosomes = ['A','B','C','D']
    try:
        bam_file = export.findFilename(bam_file)
    except Exception:
        pass
    return bam_file, missing_chromosomes
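### getSpliceSites() is defined elsewhere in this module. Reconstructing junction
### coordinates from pysam cigar tuples follows the standard operation encoding
### (0=M, 1=I, 2=D, 3=N, 4=S), where each N gap spans an intron. A sketch under
### that assumption - exact off-by-one conventions differ between tools, so the
### +1 below is illustrative rather than the module's guaranteed convention:
def getSpliceSitesSketch(cigar_tuples, read_start):
    coordinates = []
    pos = read_start
    first_intron_dist = None
    for code, length in cigar_tuples:
        if code in (0, 2):   ### M and D consume the reference
            pos += length
        elif code == 3:      ### N: intron - record the flanking splice sites
            if first_intron_dist is None:
                first_intron_dist = pos - read_start  ### aligned bases before the first intron
            coordinates.append((pos, pos + length + 1))
            pos += length
        ### I (1) and S (4) consume the read only, not the reference
    return coordinates, first_intron_dist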
def filepath(filename, force=None):
    altDatabaseCheck = True
    #dir = os.path.dirname(dirfile.__file__)  ### directory file is input as a variable under the main
    dir = application_path
    """
    if os.path.isfile(filename):
        fn = filename
        return fn
    elif os.path.isfile(dir+'/'+filename):
        fn = filename
        return fn
    #"""
    """ If a local file without the full path (e.g., Config/options.txt). Checks in the software directory. """
    import export
    parent_dir = export.findParentDir(filename)
    actual_file = export.findFilename(filename)
    try:
        #if os.path.exists(dir+'/'+parent_dir):
        dir_list = os.listdir(dir + '/' + parent_dir)
        fn = dir + '/' + parent_dir + '/' + actual_file
        if '.txt' in fn or '.log' in fn:
            return fn
    except:
        pass
    if filename == '':
        ### Windows will actually recognize '' as the AltAnalyze root in certain situations but not others
        fn = dir
    elif ':' in filename:
        fn = filename
    else:
        try:
            try:
                dir_list = os.listdir(dir + '/' + filename)
                fn = dir + '/' + filename
            except:
                dir_list = os.listdir(filename)
                fn = filename  ### test to see if the path can be found (then it is the full path)
        except Exception:
            fn = os.path.join(dir, filename)
    fileExists = os.path.isfile(fn)
    #print 'filename:', filename, fileExists
    """ When AltAnalyze is installed through pypi - AltDatabase and possibly Config live in the user directory """
    if 'Config' in fn:
        if fileExists == False and force != 'application-path' and ignoreHome == False:
            fn = os.path.join(userHomeDir, filename)
    if 'AltDatabase' in fn:
        getCurrentGeneDatabaseVersion()
        fn = correctGeneDatabaseDir(fn)
        altanalyze_dir = string.split(fn, 'AltDatabase')[0] + 'AltDatabase'
        ### Check the AltDatabase dir not the fn, since the fn may not exist yet
        fileExists = os.path.isfile(altanalyze_dir)
        try:
            dir_list = os.listdir(altanalyze_dir)
            fileExists = True
        except Exception:
            pass
        #print 2, [fn], fileExists
        if fileExists == False and ignoreHome == False:
            fn = os.path.join(userHomeDir, filename)
            fn = correctGeneDatabaseDir(fn)
            altDatabaseCheck = False
    if '/Volumes/' in filename and altDatabaseCheck:
        filenames = string.split(filename, '/Volumes/')
        fn = '/Volumes/' + filenames[-1]
    for py2app_dir in py2app_dirs:
        fn = string.replace(fn, py2app_dir, '')
    if (('Databases' in fn) or ('AltDatabase' in fn)) and altDatabaseCheck:
        getCurrentGeneDatabaseVersion()
        fn = correctGeneDatabaseDir(fn)
    fn = string.replace(fn, '.txt.txt', '.txt')
    fn = string.replace(fn, '//', '/')
    fn = string.replace(fn, '//', '/')  ### If /// present
    return fn
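### Resolution-order summary of the function above: filepath() effectively tries
### 1) <application_path>/<filename>, then 2) the user home directory (for pypi
### installs, Config and AltDatabase paths only), applying the gene-database
### version correction to any AltDatabase/Databases path along the way. For
### example, filepath('Config/options.txt') returns the packaged copy when it
### exists, otherwise the copy under the user home directory.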
def associateQueryGenesWithInteractions(query_db, query_interactions, dir_file):
    suffix = ''
    if dir_file != None:
        if len(dir_file) != 0:
            suffix = '-' + intNameShort + '_' + export.findFilename(dir_file)[:-4]
    if len(suffix) == 0:
        try:
            suffix = '_' + FileName
        except Exception:
            pass
    file_name = 'AltAnalyze-network' + suffix

    query_interactions_unique = {}
    interacting_genes = {}
    connections = 1
    primary = 0
    secondary = 0
    terciary = 0
    for ensemblGene in query_db:
        if ensemblGene in interaction_db:
            for interacting_ensembl in interaction_db[ensemblGene]:
                if interacting_ensembl not in blackList:
                    ### Only allow direct interactions found in query
                    if interacting_ensembl in query_db:
                        try: query_interactions[ensemblGene].append(interacting_ensembl)
                        except KeyError: query_interactions[ensemblGene] = [interacting_ensembl]
                        try: query_interactions[interacting_ensembl].append(ensemblGene)
                        except KeyError: query_interactions[interacting_ensembl] = [ensemblGene]
                        primary += 1
                    if degrees == 2 or degrees == 'indirect':
                        try: interacting_genes[interacting_ensembl].append(ensemblGene)
                        except KeyError: interacting_genes[interacting_ensembl] = [ensemblGene]
                    elif degrees == 'allInteracting' or degrees == 'all possible':
                        try: query_interactions[ensemblGene].append(interacting_ensembl)
                        except KeyError: query_interactions[ensemblGene] = [interacting_ensembl]
                    if interacting_ensembl in secondaryQueryIDs:  ### IDs in the expression file
                        secondary += 1  ### When indirect degrees selected, no additional power added by this (only for direct or shortest path)
                        try: query_interactions[ensemblGene].append(interacting_ensembl)
                        except KeyError: query_interactions[ensemblGene] = [interacting_ensembl]
        if ensemblGene in second_degree_obligatory:
            for interacting_ensembl in second_degree_obligatory[ensemblGene]:
                try: interacting_genes[interacting_ensembl].append(ensemblGene)
                except KeyError: interacting_genes[interacting_ensembl] = [ensemblGene]

    ### Include indirect interactions to secondaryQueryIDs from the expression file
    if degrees == 2 or degrees == 'indirect':
        for ensemblGene in secondaryQueryIDs:
            if ensemblGene in interaction_db:
                for interacting_ensembl in interaction_db[ensemblGene]:
                    if interacting_ensembl not in blackList:
                        try:
                            interacting_genes[interacting_ensembl].append(ensemblGene)
                            terciary += 1  #; print interacting_ensembl
                        except KeyError:
                            pass  ### Only increase the interacting_genes count if the interacting partner is present from the primary query list
    #print primary, secondary, terciary

    ### Report the number of unique interacting genes
    for interacting_ensembl in interacting_genes:
        if len(interacting_genes[interacting_ensembl]) == 1:
            interacting_genes[interacting_ensembl] = 1
        else:
            unique_interactions = unique.unique(interacting_genes[interacting_ensembl])
            interacting_genes[interacting_ensembl] = len(unique_interactions)

    query_indirect_interactions = {}; indirect_interacting_gene_list = []; interacting_gene_list = []; added = []
    if degrees == 'shortestPath' or degrees == 'shortest path':
        ### Typically identifying the single smallest path(s) between two nodes
        query_indirect_interactions, indirect_interacting_gene_list, interacting_gene_list = evaluateShortestPath(query_db, interaction_db, 10)
    else:
        if degrees == 2 or degrees == 'indirect' or len(secondDegreeObligatoryCategories) > 0:
            for ensembl in interacting_genes:
                if interacting_genes[ensembl] > connections:
                    if ensembl in interaction_db:  ### Only nodes removed due to promiscuity will not be found
                        for interacting_ensembl in interaction_db[ensembl]:
                            if interacting_ensembl in query_db or interacting_ensembl in secondaryQueryIDs:
                                try: query_indirect_interactions[interacting_ensembl].append(ensembl)
                                except KeyError: query_indirect_interactions[interacting_ensembl] = [ensembl]
                        ### Record the highest linked nodes
                        indirect_interacting_gene_list.append((interacting_genes[ensembl], ensembl))
        if len(obligatory_interactions) > 0:  ### Include always
            all_reported_genes = combineDBs(query_interactions, query_indirect_interactions)  ### combines DBs and returns a unique list of genes
            for ensemblGene in all_reported_genes:  ### This only includes genes in the original input list
                if ensemblGene in obligatory_interactions:
                    for interacting_ensembl in obligatory_interactions[ensemblGene]:
                        #symbol = ensembl_symbol_db[ensemblGene]
                        try: query_interactions[ensemblGene].append(interacting_ensembl)
                        except KeyError: query_interactions[ensemblGene] = [interacting_ensembl]

    z = dict(query_interactions.items() + query_indirect_interactions.items())
    interaction_restricted_db = {}
    for ensembl in z:
        interacting_nodes = z[ensembl]
        for node in interacting_nodes:
            if ensembl in interaction_restricted_db:
                db = interaction_restricted_db[ensembl]
                db[node] = 1
            else:
                interaction_restricted_db[ensembl] = {node: 1}
            if node in interaction_restricted_db:
                db = interaction_restricted_db[node]
                db[ensembl] = 1
            else:
                interaction_restricted_db[node] = {ensembl: 1}

    if degrees == 2 or degrees == 'indirect':  ### get rid of non-specific interactions
        query_indirect_interactions, indirect_interacting_gene_list, interacting_gene_list = evaluateShortestPath(query_db, interaction_restricted_db, 4)

    ### Record the highest linked nodes
    for ensembl in query_interactions:
        linked_nodes = len(unique.unique(query_interactions[ensembl]))
        interacting_gene_list.append((linked_nodes, ensembl))
    interacting_gene_list.sort(); interacting_gene_list.reverse()
    indirect_interacting_gene_list.sort(); indirect_interacting_gene_list.reverse()

    print "Length of query_interactions:", len(query_interactions)
    query_interactions_unique = []
    for gene1 in query_interactions:
        for gene2 in query_interactions[gene1]:
            temp = []; temp.append(gene2); temp.append(gene1)  #; temp.sort()
            if gene1 == gene2: interaction_type = 'self'
            else: interaction_type = 'distinct'
            temp.append(interaction_type); temp.reverse()
            query_interactions_unique.append(temp)
    for gene1 in query_indirect_interactions:
        for gene2 in query_indirect_interactions[gene1]:
            temp = []; temp.append(gene2); temp.append(gene1)  #; temp.sort()
            if gene1 == gene2: interaction_type = 'self'
            else: interaction_type = 'indirect'
            temp.append(interaction_type); temp.reverse()
            query_interactions_unique.append(temp)
    query_interactions_unique = unique.unique(query_interactions_unique)
    query_interactions_unique.sort()

    ### Write out nodes linked to many other nodes
    new_file = outputDir + '/networks/' + file_name + '-interactions_' + str(degrees) + '_degrees_summary.txt'
    data = export.ExportFile(new_file)
    for (linked_nodes, ensembl) in interacting_gene_list:
        try: symbol = query_db[ensembl]
        except KeyError: symbol = ensembl_symbol_db[ensembl]
        data.write(str(linked_nodes) + '\t' + ensembl + '\t' + symbol + '\t' + 'direct' + '\n')
    for (linked_nodes, ensembl) in indirect_interacting_gene_list:
        try:
            symbol = query_db[ensembl]
        except KeyError:
            try: symbol = ensembl_symbol_db[ensembl]
            except KeyError: symbol = ensembl
        if 'HMDB' in symbol:
            try: symbol = hmdb_symbol_db[ensembl]
            except Exception: pass
        data.write(str(linked_nodes) + '\t' + ensembl + '\t' + symbol + '\t' + 'indirect' + '\n')
    data.close()
    regulated_gene_db = query_db
    sif_export, symbol_pair_unique = exportInteractionData(file_name, query_interactions_unique, regulated_gene_db)
    return sif_export, symbol_pair_unique
def compareImportedTables(file_list, outputDir, importDir=False, considerNumericDirection=False, display=True):
    ### added for AltAnalyze
    print 'Creating Venn Diagram from input files...'
    import UI
    import export
    file_id_db = {}
    file_list2 = []
    for file in file_list:
        x = 0
        if '.txt' in file:
            if importDir != False:  ### When all files in a directory are analyzed
                fn = UI.filepath(import_dir + '/' + file)
            else:
                fn = file
            file = export.findFilename(fn)  ### Only report the actual filename
            file_list2.append(file)
            for line in open(fn, 'rU').xreadlines():
                if x == 0:
                    data_type = examineFields(line)
                    x += 1
                else:
                    data = UI.cleanUpLine(line)
                    t = string.split(data, '\t')
                    uid = t[0]
                    valid = True
                    if data_type != 'first':
                        if data_type == 'comparison':
                            score = float(string.split(t[6], '|')[0])
                            if 'yes' not in t[5]:
                                valid = False  ### not replicated independently
                        if data_type == 'reciprocal':
                            uid = t[8] + '-' + t[10]
                            score = float(t[1])
                        if data_type == 'single':
                            uid = t[6]
                            score = float(t[1])
                    else:
                        try:
                            score = float(t[1])  #t[2]
                        except Exception:
                            score = None
                    if score != None and considerNumericDirection:  ### change the UID so that it only matches if the same direction
                        if score > 0:
                            uid += '+'  ### encode the ID with a positive sign
                        else:
                            uid += '-'  ### encode the ID with a negative sign
                    #if score>0:
                    if valid:
                        try:
                            file_id_db[file].append(uid)
                        except Exception:
                            file_id_db[file] = [uid]
    id_lists = []
    new_file_list = []
    for file in file_list2:  ### Use the sorted names
        if file in file_id_db:
            uids = file_id_db[file]
            id_lists.append(uids)
            new_file_list.append(file)
            #print file, len(new_file_list), len(uids)
    if len(file_id_db):
        if len(new_file_list) == 2 or len(new_file_list) == 3:
            SimpleMatplotVenn(new_file_list, id_lists, outputDir=outputDir, display=False)  ### display both below
        venn(id_lists, new_file_list, fill="number", show_names=False, outputDir=outputDir, show_plot=display)
def importInteractionDatabases(interactionDirs): """ Import multiple interaction format file types (designated by the user) """ exclude=[] for file in interactionDirs: status = verifyFile(file) if status == 'not found': exclude.append(file) for i in exclude: interactionDirs.remove(i) for fn in interactionDirs: #loop through each file in the directory to output results x=0; imported=0; stored=0 file = export.findFilename(fn) print "Parsing interactions from:",file for line in open(fn,'rU').xreadlines(): data = cleanUpLine(line) t = string.split(data,'\t') if x==0: x=1 #elif 'PAZAR' in data or 'Amadeus' in data:x+=0 else: obligatory = False imported+=1 proceed = True source='' interaction_type = 'interaction' try: symbol1,interaction_type,symbol2,ensembl1,ensembl2,source = t ens_ls1=[ensembl1]; ens_ls2=[ensembl2] if 'HMDB' in ensembl1: ensembl1 = string.replace(ensembl1,' ','') ### HMDB ID sometimes preceded by ' ' symbol_hmdb_db[symbol1]=ensembl1 hmdb_symbol_db[ensembl1] = symbol1 interaction_type = 'Metabolic' if 'HMDB' in ensembl2: ensembl2 = string.replace(ensembl2,' ','') ### HMDB ID sometimes preceded by ' ' symbol_hmdb_db[symbol2]=ensembl2 hmdb_symbol_db[ensembl2] = symbol2 interaction_type = 'Metabolic' except Exception: try: ensembl1,ensembl2,symbol1,symbol2,interaction_type=t if ensembl1 == '': try: ens_ls1 = symbol_ensembl_db[symbol1] ens_ls2 = symbol_ensembl_db[symbol2] except Exception: None except Exception: proceed = False if proceed: ### If the interaction data conformed to one of the two above types (typically two valid interacting gene IDs) if (len(ens_ls1)>0 and len(ens_ls2)>0): secondary_proceed = True stored+=1 for ensembl1 in ens_ls1: for ensembl2 in ens_ls2: """ if (ensembl1,ensembl2) == ('ENSG00000111704','ENSG00000152284'): print t;sys.exit() if (ensembl1,ensembl2) == ('ENSG00000152284','ENSG00000111704'): print t;sys.exit() """ if 'WikiPathways' in file or 'KEGG' in file: if ensembl2 != ensembl1: if (ensembl2,ensembl1) in interaction_annotation_dbase: del interaction_annotation_dbase[(ensembl2,ensembl1)] ### Exclude redundant entries with fewer interaction details (e.g., arrow direction in BioGRID) - overwrite with the opposite gene arrangement below if (ensembl1,ensembl2) in interaction_annotation_dbase: if interaction_annotation_dbase[(ensembl1,ensembl2)].InteractionType() !='physical': secondary_proceed = False ### Don't overwrite a more informative annotation like transcriptional regulation or microRNA targeting if 'DrugBank' in fn: source = 'DrugBank' interaction_type = 'drugInteraction' obligatory=True ensembl1, ensembl2 = ensembl2, ensembl1 ### switch the order of these (drugs reported as the first ID and the gene as the second) if secondary_proceed: z = InteractionInformation(ensembl1,ensembl2,source,interaction_type) interaction_annotation_dbase[ensembl1,ensembl2] = z #z = InteractionInformation(ensembl2,ensembl1,source,interaction_type) #interaction_annotation_dbase[ensembl2,ensembl1] = z try: interaction_db[ensembl1][ensembl2]=1 except KeyError: db = {ensembl2:1}; interaction_db[ensembl1] = db ### weight of 1 (weights currently not supported) try: interaction_db[ensembl2][ensembl1]=1 except KeyError: db = {ensembl1:1}; interaction_db[ensembl2] = db ### weight of 1 (weights currently not supported) if obligatory and source in obligatoryList: ### Include these in the final pathway if linked to any input node (e.g., miRNAs, drugs) try: obligatory_interactions[ensembl1][ensembl2]=1 except KeyError: db = {ensembl2:1}; obligatory_interactions[ensembl1] = db ### weight of 1 (weights currently not supported)
elif source in secondDegreeObligatoryCategories: try: second_degree_obligatory[ensembl1][ensembl2]=1 except KeyError: db = {ensembl2:1}; second_degree_obligatory[ensembl1] = db ### weight of 1 (weights currently not supported) else: proceed = False try: ID1, null, ID2 = t proceed = True except Exception: try: ID1, ID2 = t proceed = True except Exception: None if proceed: if 'microRNATargets' in fn: if 'mir' in ID2: prefix = 'MIR' else: prefix = 'LET' ID2=prefix+string.split(ID2,'-')[2] ### Ensembl naming convention source = 'microRNATargets' interaction_type = 'microRNAInteraction' obligatory=True try: ID_ls1 = symbol_ensembl_db[ID1] except Exception: ID_ls1 = [ID1] try: ID_ls2 = symbol_ensembl_db[ID2] except Exception: ID_ls2 = [ID2] """if 'microRNATargets' in fn: if '*' not in ID2: print ID_ls2;sys.exit()""" addInteractions = True for ID1 in ID_ls1: for ID2 in ID_ls2: z = InteractionInformation(ID2,ID1,source,interaction_type) interaction_annotation_dbase[ID2,ID1] = z ### This is the appropriate interaction direction try: interaction_db[ID1][ID2]=1 except KeyError: db = {ID2:1}; interaction_db[ID1] = db ### weight of 1 (weights currently not supported) try: interaction_db[ID2][ID1]=1 except KeyError: db = {ID1:1}; interaction_db[ID2] = db ### weight of 1 (weights currently not supported) if source in secondDegreeObligatoryCategories: try: second_degree_obligatory[ID1][ID2]=1 except KeyError: db = {ID2:1}; second_degree_obligatory[ID1] = db ### weight of 1 (weights currently not supported) elif obligatory and source in obligatoryList: ### Include these in the final pathway if linked to any input node (e.g., miRNAs, drugs) try: obligatory_interactions[ID1][ID2]=1 except KeyError: db = {ID2:1}; obligatory_interactions[ID1] = db ### weight of 1 (weights currently not supported) ### Evaluate the most promiscuous interactors (e.g., UBC) remove_list=[] for ID in interaction_db: if len(interaction_db[ID])>2000: remove_list.append(ID) #print len(interaction_db[ID]),ensembl_symbol_db[ID] for ID in remove_list: #print 'removing', ID del interaction_db[ID] blackList[ID] = [] print 'Imported interactions:',len(interaction_annotation_dbase)
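### A minimal sketch (not part of AltAnalyze) of the symmetric adjacency-dict pattern that
### importInteractionDatabases builds above: every edge is stored in both directions with a
### placeholder weight of 1; dict.setdefault() is the idiomatic equivalent of the
### try/except KeyError insertion used in the function. 'demo_db' is an illustrative name.
def add_edge(db, id1, id2):
    db.setdefault(id1, {})[id2] = 1 ### weight of 1 (weights currently not supported)
    db.setdefault(id2, {})[id1] = 1

demo_db = {}
add_edge(demo_db, 'ENSG00000111704', 'ENSG00000152284')
print demo_db['ENSG00000111704'] ### {'ENSG00000152284': 1}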
def buildInteractions(species,Degrees,inputType,inputDir,outputdir,interactionDirs,Genes=None, geneSetType=None,PathwayFilter=None,OntologyID=None,directory=None,expressionFile=None, obligatorySet=None,secondarySet=None,IncludeExpIDs=False): global degrees global outputDir global inputDataType global obligatoryList ### Add these if connected to anything global secondaryQueryIDs global secondDegreeObligatoryCategories ### Add if common to anything in the input - indicates systems to apply this to global symbol_hmdb_db; symbol_hmdb_db={}; global hmdb_symbol_db; hmdb_symbol_db={} ### Create an annotation database for HMDB IDs global FileName global intNameShort secondaryQueryIDs = {} degrees = Degrees outputDir = outputdir inputDataType = inputType obligatoryList = obligatorySet secondDegreeObligatoryCategories=[] intNameShort='' if obligatoryList == None: obligatoryList=[] if expressionFile == None: expressionFile = inputDir ### If it doesn't contain expression values, view as yellow nodes if secondarySet != None and (degrees==1 or degrees=='direct'): ### If degrees == 2, this is redundant ### This currently adds a lot of predictions - either make this more stringent or exclude it for now secondDegreeObligatoryCategories = secondarySet if PathwayFilter != None: if len(PathwayFilter)==1: FileName = PathwayFilter[0] if isinstance(PathwayFilter, tuple) or isinstance(PathwayFilter, list): FileName = string.join(list(PathwayFilter),' ') FileName = string.replace(FileName,':','-') else: FileName = PathwayFilter if len(FileName)>40: FileName = FileName[:40] elif OntologyID != None: FileName = OntologyID elif Genes != None: FileName = Genes ### Import Ensembl-Symbol annotations getEnsemblGeneData('AltDatabase/ensembl/'+species+'/'+species+'_Ensembl-annotations.txt') if len(interactionDirs[0]) == 1: interactionDirs = [interactionDirs] ### a single file path was passed as a string rather than a list ### Import interaction databases indicated in interactionDirs for i in interactionDirs: print i i = export.findFilename(i) i=string.split(i,'-')[1] intNameShort+=i[0] importInteractionDatabases(interactionDirs) getHMDBData(species) ### overwrite the symbol annotation from any HMDB that comes from a WikiPathways or KEGG pathway that we also include (for consistent official annotation) input_IDs = getGeneIDs(Genes) try: if isinstance(PathwayFilter, tuple): for pathway in PathwayFilter: IDs = gene_associations.simpleGenePathwayImport(species,geneSetType,pathway,OntologyID,directory) for id in IDs: input_IDs[id]=None else: input_IDs = gene_associations.simpleGenePathwayImport(species,geneSetType,PathwayFilter,OntologyID,directory) except Exception: None if expressionFile == None or len(expressionFile)==0: expressionFile = exportSelectedIDs(input_IDs) ### create an expression file elif IncludeExpIDs: ### Prioritize selection of IDs for interactions WITH the primary query set (not among expression input IDs) secondaryQueryIDs = importqueryResults(species,expressionFile,{})[0] input_IDs,query_interactions,dir_file = importqueryResults(species,inputDir,input_IDs) sif_file,symbol_pair_unique = associateQueryGenesWithInteractions(input_IDs,query_interactions,dir_file) output_filename = exportGraphImage(species,sif_file,expressionFile) return output_filename
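### Hypothetical invocation sketch for buildInteractions; the interaction file paths, input
### file and query gene list below are illustrative assumptions, not files guaranteed to
### ship with this module. This would build a direct (1-degree) network around two query genes.
interactionDirs = ['AltDatabase/goelite/Hs/gene-interactions/Ensembl-WikiPathways.txt', ### assumed path
                   'AltDatabase/goelite/Hs/gene-interactions/Ensembl-BioGRID.txt'] ### assumed path
output_png = buildInteractions('Hs', 1, 'IDs', 'input/genes.txt', 'output/',
                               interactionDirs, Genes='TP53 MDM2')
print output_png ### path of the exported network image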
def importGeneExpressionValues(filename, tissue_specific_db, translation_db): ### Import gene-level expression raw values fn = filepath(filename) x = 0 genes_added = {} gene_expression_db = {} dataset_name = export.findFilename(filename) print 'importing:', dataset_name for line in open(fn, 'rU').xreadlines(): data = cleanUpLine(line) t = string.split(data, '\t') if x == 0: if '#' not in data: for i in t[1:]: sample_headers.append(i) x = 1 else: gene = t[0] #if '-' not in gene and ':E' in gene: print gene;sys.exit() if analysis_type == 'AltExon': try: ens_gene, exon = string.split(gene, '-')[:2] except Exception: exon = gene gene = exon if keyed_by == 'translation': ### alternative value is 'primaryID' """if gene == 'ENSMUSG00000025915-E19.3': for i in translation_db: print [i], len(translation_db); break print gene, [translation_db[gene]];sys.exit()""" try: gene = translation_db[gene] ### Ensembl annotations except Exception: pass if gene in tissue_specific_db: index, tissue_exp = tissue_specific_db[gene] try: genes_added[gene] += 1 except Exception: genes_added[gene] = 1 proceed = True try: exp_vals = map(float, t[1:]) if platform == 'RNASeq': #if max(exp_vals)<3: proceed=False exp_vals = map(lambda x: math.log(x + 1, 2), exp_vals) if value_type == 'calls': ### Hence, this is a DABG or RNA-Seq expression exp_vals = produceDetectionCalls(exp_vals, targetPlatform) ### 0 or 1 calls if proceed: gene_expression_db[gene] = [index, exp_vals] except Exception: print 'Formatting error encountered in:', dataset_name; forceError ### undefined name - deliberately halts execution with a NameError print len(gene_expression_db), 'matching genes in the dataset and tissue compendium database' for gene in genes_added: if genes_added[gene] > 1: del gene_expression_db[gene] ### delete entries that are present in the input set multiple times (not trustworthy) else: expession_subset.append(gene_expression_db[gene]) ### These contain the rank order and expression #print len(expession_subset);sys.exit() expession_subset.sort() ### This order now matches that of tissue_specific_db gene_expression_db = []
def NMFAnalysis(expressionInputFile,NMFinputDir,Rank,platform,iteration=0,strategy="conservative"): root_dir = export.findParentDir(NMFinputDir)[:-1] if 'ExpressionInput' in root_dir: root_dir = export.findParentDir(root_dir) if 'NMF-SVM' in root_dir: root_dir = export.findParentDir(root_dir) export.findFilename(NMFinputDir) X=[] header=[] head=0 exportnam=root_dir+'/NMF-SVM/NMF/round'+str(iteration)+'NMFsnmf_versionr'+str(Rank)+'.txt' export_res=export.ExportFile(exportnam) exportnam_bin=root_dir+'/NMF-SVM/NMF/round'+str(iteration)+'NMFsnmf_binary'+str(Rank)+'.txt' export_res1=export.ExportFile(exportnam_bin) exportnam_bint=root_dir+'/NMF-SVM/NMF/round'+str(iteration)+'NMFsnmf_binary_t_'+str(Rank)+'.txt' export_res5=export.ExportFile(exportnam_bint) MF_input = root_dir+'/NMF-SVM/ExpressionInput/exp.NMF-MarkerFinder.txt' export.customFileCopy(expressionInputFile,root_dir+'/NMF-SVM/ExpressionInput/exp.NMF-MarkerFinder.txt') export_res4=open(string.replace(MF_input,'exp.','groups.'),"w") export_res7=open(string.replace(MF_input,'exp.','comps.'),"w") exportnam2=root_dir+'/NMF-SVM/SubtypeAnalyses/round'+str(iteration)+'Metadata'+str(Rank)+'.txt' export_res2=export.ExportFile(exportnam2) exportnam3=root_dir+'/NMF-SVM/SubtypeAnalyses/round'+str(iteration)+'Annotation'+str(Rank)+'.txt' export_res3=export.ExportFile(exportnam3) #if 'Clustering' in NMFinputDir: # count=1 # start=2 #else: count=0 start=1 #print Rank for line in open(NMFinputDir,'rU').xreadlines(): line=line.rstrip('\r\n') q= string.split(line,'\t') if head >count: val=[] val2=[] me=0.0 for i in range(start,len(q)): try: val2.append(float(q[i])) except Exception: continue me=np.median(val2) for i in range(start,len(q)): try: val.append(float(q[i])) except Exception: val.append(float(me)) #if q[1]==prev: X.append(val) else: export_res1.write(line) export_res.write(line) export_res1.write("\n") #export_res4.write(line) #export_res4.write("\n") export_res.write("\n") header=q head+=1 continue group=defaultdict(list) sh=[] X=np.array(X) #print X.shape mat=[] #mat=X mat=zip(*X) mat=np.array(mat) #print mat.shape #model = NMF(n_components=15, init='random', random_state=0) #W = model.fit_transform(mat) nmf = nimfa.Snmf(mat,seed="nndsvd", rank=int(Rank), max_iter=20,n_run=1,track_factor=False,theta=0.95) nmf_fit = nmf() W = nmf_fit.basis() W=np.array(W) #np.savetxt("basismatrix2.txt",W,delimiter="\t") H=nmf_fit.coef() H=np.array(H) # np.savetxt("coefficientmatrix2.txt",H,delimiter="\t") #print W.shape sh=W.shape export_res3.write("uid\tUID\tUID\n") if int(Rank)==2: par=1 else: par=2 #for i in range(sh[1]): # val=W[:,i] # me=np.mean(val) # st=np.std(val) # export_res2.write(header[i+1]) # for j in range(sh[0]): # if float(W[i][j])>=float(me+(par*st)): # # export_res2.write("\t"+str(1)) # else: # export_res2.write("\t"+str(0)) # # export_res2.write("\n") if platform != 'PSI': sh=W.shape Z=[] export_res5.write("uid") export_res2.write("uid") for i in range(sh[1]): export_res5.write("\t"+'V'+str(i)) export_res2.write("\t"+'V'+str(i)) export_res3.write('V'+str(i)+"\t"+"Covariate"+"\t"+str(1)+"\n") export_res5.write("\n") export_res2.write("\n") export_res3.write("\n") for i in range(sh[0]): new_val=[] val=W[i,:] export_res2.write(header[i+1]) export_res5.write(header[i+1]) export_res4.write(header[i+1]) flag=True for j in range(sh[1]): if W[i][j]==max(val) and flag: export_res5.write("\t"+str(1)) export_res2.write("\t"+str(1)) new_val.append(1) export_res4.write("\t"+str(j+1)+"\t"+'V'+str(j)) flag=False else: export_res5.write("\t"+str(0)) 
export_res2.write("\t"+str(0)) new_val.append(0) Z.append(new_val) export_res5.write("\n") export_res2.write("\n") export_res4.write("\n") W=zip(*W) W=np.array(W) sh=W.shape Z=zip(*Z) Z=np.array(Z) for i in range(sh[0]): export_res.write('V'+str(i)) export_res1.write('V'+str(i)) for j in range(sh[1]): export_res.write("\t"+str(W[i][j])) export_res1.write("\t"+str(Z[i][j])) export_res.write("\n") export_res1.write("\n") export_res.close() export_res1.close() export_res2.close() export_res5.close() Orderedheatmap.Classify(exportnam_bint) return exportnam,exportnam_bin,exportnam2,exportnam3 else: W=zip(*W) W=np.array(W) sh=W.shape Z=[] for i in range(sh[0]): new_val=[] val=W[i,:] num=sum(v > 0.10 for v in val) ### number of samples with a loading above 0.10 if num >40 or num <3: compstd=True else: compstd=False me=np.mean(val) st=np.std(val) #print 'V'+str(i) export_res.write('V'+str(i)) export_res1.write('V'+str(i)) for j in range(sh[1]): if compstd: if float(W[i][j])>=float(me+(par*st)): export_res1.write("\t"+str(1)) new_val.append(1) else: export_res1.write("\t"+str(0)) new_val.append(0) else: if float(W[i][j])>0.1: export_res1.write("\t"+str(1)) new_val.append(1) else: export_res1.write("\t"+str(0)) new_val.append(0) export_res.write("\t"+str(W[i][j])) Z.append(new_val) export_res.write("\n") export_res1.write("\n") Z=np.array(Z) sh=Z.shape Z_new=[] Z1=[] export_res2.write("uid") export_res5.write("uid") for i in range(sh[0]): val1=Z[i,:] sum1=sum(val1) flag=False indices=[index for index, value in enumerate(val1) if value == 1] for j in range(sh[0]): val2=[] if i!=j: val2=Z[j,:] sum2=sum([val2[x] for x in indices]) summ2=sum(val2) try: if float(sum2)/float(sum1)>0.5: if summ2>sum1: flag=True #print str(i) except Exception: continue if flag==False: Z1.append(val1) export_res2.write("\t"+'V'+str(i)) export_res5.write("\t"+'V'+str(i)) export_res3.write('V'+str(i)+"\t"+"Covariate"+"\t"+str(1)+"\n") export_res2.write("\n") export_res5.write("\n") Z1=np.array(Z1) Z=Z1 Z=zip(*Z) Z=np.array(Z) sh=Z.shape for i in range(sh[0]): val1=Z[i,:] #print sum(val1) if sum(val1)>2: val=[0 if x==1 else x for x in val1] else: val=val1 export_res2.write(header[i+1]) export_res5.write(header[i+1]) for j in range(sh[1]): if strategy=="conservative": export_res2.write("\t"+str(val1[j])) export_res5.write("\t"+str(val1[j])) else: export_res2.write("\t"+str(val[j])) export_res5.write("\t"+str(val[j])) export_res2.write("\n") export_res5.write("\n") Z_new.append(val) Z_new=zip(*Z_new) Z_new=np.array(Z_new) sh=Z_new.shape export_res.close() export_res1.close() export_res2.close() export_res5.close() Orderedheatmap.Classify(exportnam_bint) return exportnam,exportnam_bin,exportnam2,exportnam3 ### the same four file paths are returned for either strategy
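### Standalone sketch of the core factorization step in NMFAnalysis, assuming the nimfa and
### numpy packages are installed; the random non-negative matrix stands in for the
### samples-x-genes matrix 'mat' assembled above, with an illustrative rank of 3.
import numpy as np
import nimfa

V = np.abs(np.random.rand(20, 50)) ### SNMF requires non-negative input
snmf = nimfa.Snmf(V, seed="nndsvd", rank=3, max_iter=20, n_run=1, track_factor=False)
fit = snmf()
W = np.array(fit.basis()) ### 20 samples x rank 3
H = np.array(fit.coef()) ### rank 3 x 50 genes
print W.shape, H.shape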
def importGeneExpressionValues(filename,tissue_specific_db,translation_db,useLog=False,previouslyRun=False,species=None): ### Import gene-level expression raw values fn=filepath(filename); x=0; genes_added={}; gene_expression_db={} dataset_name = export.findFilename(filename) max_val=0 print 'importing:',dataset_name try: import gene_associations, OBO_import gene_to_symbol = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol')) symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol) except Exception: symbol_to_gene={} for line in open(fn,'rU').xreadlines(): data = cleanUpLine(line) t = string.split(data,'\t') if x==0: if '#' not in data: for i in t[1:]: sample_headers.append(i) x=1 else: gene = t[0] try: gene = string.split(t[0],'|')[0] except Exception: pass #if '-' not in gene and ':E' in gene: print gene;sys.exit() if analysis_type == 'AltExon': try: ens_gene,exon = string.split(gene,'-')[:2] except Exception: exon = gene gene = exon if keyed_by == 'translation': ### alternative value is 'primaryID' """if gene == 'ENSMUSG00000025915-E19.3': for i in translation_db: print [i], len(translation_db); break print gene, [translation_db[gene]];sys.exit()""" try: gene = translation_db[gene] ### Ensembl annotations except Exception: pass try: gene = symbol_to_gene[gene][0] ### If RNASeq is the selected platform and Symbol is the uid except Exception: pass if gene in tissue_specific_db: index,tissue_exp=tissue_specific_db[gene] try: genes_added[gene]+=1 except Exception: genes_added[gene]=1 proceed=True try: exp_vals = t[1:] if '' in exp_vals: ### If missing values present (PSI values) exp_vals = ['0.000101' if i=='' else i for i in exp_vals] useLog = False exp_vals = map(float, exp_vals) if platform == 'RNASeq': if max(exp_vals)>max_val: max_val = max(exp_vals) #if max(exp_vals)<3: proceed=False if useLog==False: exp_vals = map(lambda x: math.log(x+1,2),exp_vals) if value_type == 'calls': ### Hence, this is a DABG or RNA-Seq expression exp_vals = produceDetectionCalls(exp_vals,targetPlatform) ### 0 or 1 calls if proceed: gene_expression_db[gene] = [index,exp_vals] except Exception: print 'Non-numeric values detected:' x = 5 print t[:x] while x < len(t): print t[x:x+5] x+=5 print 'Formatting error encountered in:',dataset_name; forceError ### undefined name - deliberately halts execution with a NameError """else: for gene in tissue_specific_db: if 'Ndufa9:ENSMUSG00000000399:I2.1-E3.1' in gene: print gene, 'dog';sys.exit() print gene;kill""" print len(gene_expression_db), 'matching genes in the dataset and tissue compendium database' for gene in genes_added: if genes_added[gene]>1: del gene_expression_db[gene] ### delete entries that are present in the input set multiple times (not trustworthy) else: expession_subset.append(gene_expression_db[gene]) ### These contain the rank order and expression #print len(expession_subset);sys.exit() expession_subset.sort() ### This order now matches that of tissue_specific_db gene_expression_db=[] if max_val<20 and platform == 'RNASeq' and previouslyRun==False: ### Only allow this to happen once importGeneExpressionValues(filename,tissue_specific_db,translation_db,useLog=True,previouslyRun=True,species=species)
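### Minimal sketch of the value clean-up performed above: blank PSI cells are replaced with
### the tiny sentinel 0.000101 (mirroring the constant in the function), then RNA-Seq values
### are log2(x+1) transformed when useLog is False. The input list is made up.
import math
exp_vals = ['5.2', '', '0.8']
exp_vals = ['0.000101' if i == '' else i for i in exp_vals]
exp_vals = map(float, exp_vals)
exp_vals = map(lambda x: math.log(x + 1, 2), exp_vals)
print exp_vals ### ~[2.632, 0.000146, 0.848]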
def importInteractionDatabases(interactionDirs): """ Import multiple interaction format file types (designated by the user) """ exclude=[] for file in interactionDirs: status = verifyFile(file) if status == 'not found': exclude.append(file) for i in exclude: interactionDirs.remove(i) for fn in interactionDirs: #loop through each file in the directory to output results x=0; imported=0; stored=0 file = export.findFilename(fn) print "Parsing interactions from:",file for line in open(fn,'rU').xreadlines(): data = string.split(line,'\n')[0] ### strip the trailing newline without failing on the final line t = string.split(data,'\t') if x==0: x=1 #elif 'PAZAR' in data or 'Amadeus' in data:x+=0 else: obligatory = False imported+=1 proceed = True source='' interaction_type = 'interaction' try: symbol1,interaction_type,symbol2,ensembl1,ensembl2,source = t ens_ls1=[ensembl1]; ens_ls2=[ensembl2] if 'HMDB' in ensembl1: ensembl1 = string.replace(ensembl1,' ','') ### HMDB ID sometimes preceded by ' ' symbol_hmdb_db[symbol1]=ensembl1 hmdb_symbol_db[ensembl1] = symbol1 interaction_type = 'Metabolic' if 'HMDB' in ensembl2: ensembl2 = string.replace(ensembl2,' ','') ### HMDB ID sometimes preceded by ' ' symbol_hmdb_db[symbol2]=ensembl2 hmdb_symbol_db[ensembl2] = symbol2 interaction_type = 'Metabolic' except Exception: try: ensembl1,ensembl2,symbol1,symbol2,interaction_type=t if ensembl1 == '': try: ens_ls1 = symbol_ensembl_db[symbol1] ens_ls2 = symbol_ensembl_db[symbol2] except Exception: None except Exception: proceed = False if proceed: ### If the interaction data conformed to one of the two above types (typically two valid interacting gene IDs) if (len(ens_ls1)>0 and len(ens_ls2)>0): secondary_proceed = True stored+=1 for ensembl1 in ens_ls1: for ensembl2 in ens_ls2: """ if (ensembl1,ensembl2) == ('ENSG00000111704','ENSG00000152284'): print t;sys.exit() if (ensembl1,ensembl2) == ('ENSG00000152284','ENSG00000111704'): print t;sys.exit() """ if 'WikiPathways' in file or 'KEGG' in file: if ensembl2 != ensembl1: if (ensembl2,ensembl1) in interaction_annotation_dbase: del interaction_annotation_dbase[(ensembl2,ensembl1)] ### Exclude redundant entries with fewer interaction details (e.g., arrow direction in BioGRID) - overwrite with the opposite gene arrangement below if (ensembl1,ensembl2) in interaction_annotation_dbase: if interaction_annotation_dbase[(ensembl1,ensembl2)].InteractionType() !='physical': secondary_proceed = False ### Don't overwrite a more informative annotation like transcriptional regulation or microRNA targeting if 'DrugBank' in fn: source = 'DrugBank' interaction_type = 'drugInteraction' obligatory=True ensembl1, ensembl2 = ensembl2, ensembl1 ### switch the order of these (drugs reported as the first ID and the gene as the second) if secondary_proceed: z = InteractionInformation(ensembl1,ensembl2,source,interaction_type) interaction_annotation_dbase[ensembl1,ensembl2] = z #z = InteractionInformation(ensembl2,ensembl1,source,interaction_type) #interaction_annotation_dbase[ensembl2,ensembl1] = z try: interaction_db[ensembl1][ensembl2]=1 except KeyError: db = {ensembl2:1}; interaction_db[ensembl1] = db ### weight of 1 (weights currently not supported) try: interaction_db[ensembl2][ensembl1]=1 except KeyError: db = {ensembl1:1}; interaction_db[ensembl2] = db ### weight of 1 (weights currently not supported) if obligatory and source in obligatoryList: ### Include these in the final pathway if linked to any input node (e.g., miRNAs, drugs) try: obligatory_interactions[ensembl1][ensembl2]=1 except KeyError: db = {ensembl2:1}; obligatory_interactions[ensembl1] = db ### weight of 1 (weights currently not supported)
elif source in secondDegreeObligatoryCategories: try: second_degree_obligatory[ensembl1][ensembl2]=1 except KeyError: db = {ensembl2:1}; second_degree_obligatory[ensembl1] = db ### weight of 1 (weights currently not supported) else: proceed = False try: ID1, null, ID2 = t proceed = True except Exception: try: ID1, ID2 = t proceed = True except Exception: None if proceed: if 'microRNATargets' in fn: if 'mir' in ID2: prefix = 'MIR' else: prefix = 'LET' ID2=prefix+string.split(ID2,'-')[2] ### Ensembl naming convention source = 'microRNATargets' interaction_type = 'microRNAInteraction' obligatory=True try: ID_ls1 = symbol_ensembl_db[ID1] except Exception: ID_ls1 = [ID1] try: ID_ls2 = symbol_ensembl_db[ID2] except Exception: ID_ls2 = [ID2] """if 'microRNATargets' in fn: if '*' not in ID2: print ID_ls2;sys.exit()""" addInteractions = True for ID1 in ID_ls1: for ID2 in ID_ls2: z = InteractionInformation(ID2,ID1,source,interaction_type) interaction_annotation_dbase[ID2,ID1] = z ### This is the appropriate interaction direction try: interaction_db[ID1][ID2]=1 except KeyError: db = {ID2:1}; interaction_db[ID1] = db ### weight of 1 (weights currently not supported) try: interaction_db[ID2][ID1]=1 except KeyError: db = {ID1:1}; interaction_db[ID2] = db ### weight of 1 (weights currently not supported) if source in secondDegreeObligatoryCategories: try: second_degree_obligatory[ID1][ID2]=1 except KeyError: db = {ID2:1}; second_degree_obligatory[ID1] = db ### weight of 1 (weights currently not supported) elif obligatory and source in obligatoryList: ### Include these in the final pathway if linked to any input node (e.g., miRNAs, drugs) try: obligatory_interactions[ID1][ID2]=1 except KeyError: db = {ID2:1}; obligatory_interactions[ID1] = db ### weight of 1 (weights currently not supported) ### Evaluate the most promiscuous interactors (e.g., UBC) remove_list=[] for ID in interaction_db: if len(interaction_db[ID])>2000: remove_list.append(ID) #print len(interaction_db[ID]),ensembl_symbol_db[ID] for ID in remove_list: #print 'removing', ID del interaction_db[ID] blackList[ID] = [] print 'Imported interactions:',len(interaction_annotation_dbase)
def latteralMerge(files_to_merge,original_filename,outputPath = None): """ Merging files can be dangerous if there are duplicate IDs (e.g., gene symbols). To overcome issues with redundant gene IDs that are improperly matched (one row with zeros and the other with values), this function determines if a lateral merge is more appropriate. The lateral merge: 1) checks whether the IDs are identical and in the same order between the two or more datasets and 2) merges the matrices without looking at the gene IDs. Note: this function attempts to be memory efficient and should be updated in the future to merge blocks of row IDs sequentially.""" files_to_merge_revised = [] for filename in files_to_merge: ### If a sparse matrix - rename and convert to a flat file if '.h5' in filename or '.mtx' in filename: from import_scripts import ChromiumProcessing import export file = export.findFilename(filename) export_name = file[:-4]+'-filt' if file == 'filtered_feature_bc_matrix.h5' or file == 'raw_feature_bc_matrix.h5' or file =='filtered_gene_bc_matrix.h5' or file == 'raw_gene_bc_matrix.h5': export_name = export.findParentDir(filename) export_name = export.findFilename(export_name[:-1]) elif file == 'matrix.mtx.gz' or file == 'matrix.mtx': parent = export.findParentDir(filename) export_name = export.findParentDir(parent) export_name = export.findFilename(export_name[:-1]) else: export_name = string.replace(file,'.mtx.gz','') export_name = string.replace(export_name,'.mtx','') export_name = string.replace(export_name,'.h5','') export_name = string.replace(export_name,'_matrix','') filename = ChromiumProcessing.import10XSparseMatrix(filename,'species',export_name) files_to_merge_revised.append(filename) files_to_merge = files_to_merge_revised print 'Files to merge:',files_to_merge includeFilenames = True file_uids = {} for filename in files_to_merge: firstRow=True fn=filepath(filename); x=0 if '/' in filename: file = string.split(filename,'/')[-1][:-4] else: file = string.split(filename,'\\')[-1][:-4] for line in open(fn,'rU').xreadlines(): data = cleanUpLine(line) if '\t' in data: t = string.split(data,'\t') elif ',' in data: t = string.split(data,',') else: t = string.split(data,'\t') if firstRow: firstRow = False else: uid = t[0] try: file_uids[file].append(uid) except: file_uids[file] = [uid] perfectMatch = True for file1 in file_uids: uids1 = file_uids[file1] for file2 in file_uids: uids2 = file_uids[file2] if uids1 != uids2: print file1,file2 perfectMatch = False if perfectMatch: print 'All ordered IDs match in the files ... performing lateral merge instead of key ID merge to prevent multi-matches...'
firstRow=True increment = 5000 low = 1 high = 5000 added = 1 eo = open(output_dir+'/MergedFiles.txt','w') import collections def exportMergedRows(low,high): uid_values=collections.OrderedDict() for filename in files_to_merge: fn=filepath(filename); x=0; file_uids = {} if '/' in filename: file = string.split(filename,'/')[-1][:-4] else: file = string.split(filename,'\\')[-1][:-4] firstRow=True row_count = 0 uids=[] ### Over-ride this for each file for line in open(fn,'rU').xreadlines(): row_count+=1 if row_count<=high and row_count>=low: data = cleanUpLine(line) if '\t' in data: t = string.split(data,'\t') elif ',' in data: t = string.split(data,',') else: t = string.split(data,'\t') if firstRow and low==1: file = string.replace(file,'_matrix_CPTT','') if includeFilenames: header = [s + "."+file for s in t[1:]] ### add filename suffix else: header = t[1:] try: uid_values[row_count]+=header except: uid_values[row_count]=header uids.append('UID') firstRow=False else: uid = t[0] try: uid_values[row_count] += t[1:] except: uid_values[row_count] = t[1:] uids.append(uid) i=0 for index in uid_values: uid = uids[i] eo.write(string.join([uid]+uid_values[index],'\t')+'\n') i+=1 print 'completed',low,high uid_list = file_uids[file] while (len(uid_list)+increment)>high: exportMergedRows(low,high) high+=increment low+=increment eo.close() return True else: print 'Different identifier order in the input files encountered...' return False
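### Toy illustration of the perfect-match test that gates the lateral merge above: column-wise
### concatenation is only safe when every file reports exactly the same IDs in exactly the
### same order. The file names and IDs here are made up.
demo_uids = {'fileA': ['TP53', 'MDM2'], 'fileB': ['TP53', 'MDM2']}
uid_lists = list(demo_uids.values())
perfectMatch = all(uids == uid_lists[0] for uids in uid_lists)
print perfectMatch ### True -> latteralMerge concatenates columns block-by-block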
def remoteGene(gene, Species, root_dir, comparison_file): global Transcript_Annotations_File global ExonRegion_File global Selected_Gene global Prt_Trans_File global Prt_Regions_File global Prt_Boundaries_File global SplicingIndex_File global UniPrt_Regions_File global microRNA_File global domainAnnotation_db global platform global species Selected_Gene = str(gene) species = Species comparison_name = string.split(export.findFilename(comparison_file), '.')[0] ExonRegion_File = unique.filepath("AltDatabase/ensembl/" + species + "/" + species + "_Ensembl_exon.txt") Transcript_Annotations_File = unique.filepath("AltDatabase/ensembl/" + species + "/" + species + "_Ensembl_transcript-annotations.txt") Prt_Trans_File = searchDirectory("AltDatabase/ensembl/" + species + "/", 'Ensembl_Protein') Prt_Regions_File = searchDirectory("AltDatabase/ensembl/" + species + "/", 'ProteinFeatures') Prt_Boundaries_File = searchDirectory("AltDatabase/ensembl/" + species + "/", 'ProteinCoordinates') UniPrt_Regions_File = searchDirectory("AltDatabase/uniprot/" + species + "/", 'FeatureCoordinate') SplicingIndex_File = searchDirectory(root_dir + '/AltResults/ProcessedSpliceData/', 'splicing-index', secondary=comparison_name) platform = getPlatform(SplicingIndex_File) microRNA_File = searchDirectory("AltDatabase/" + species + "/" + platform, 'microRNAs_multiple') #print(SplicingIndex_File) total_val = ProteinCentricIsoformView(Selected_Gene) junctions = total_val[0] p_boundaries = total_val[1] p_domains = total_val[2] transcript_db = total_val[3] exon_db = total_val[4] splice_db = total_val[5] microRNA_db = total_val[6] domainAnnotation_db = total_val[7] #for i in exon_db: # print("THE", i, exon_db[i], "\n") #for i in microRNA_db: # m_test = microRNA_db[i] # print(len(m_test)) # for q in m_test: # print("microRNA", q.ExonBlock(), q.Description(), q.BP(), "\n") #for i in exon_db["ENST00000349238"]: # print(i[2].EnsemblRegion()) domain_color_list = [] for i in p_domains: ploy = p_domains[i] for a in ploy: domain_color_list.append(a[1]) domain_color_list = list(set(domain_color_list)) domain_color_key = {} color_palette = [[0.8, 0.6, 0.1], [0.1, 0.6, 0.8], [0.6, 0.1, 0.8], [0.95, 0.6, 0.3], [0.3, 0.6, 0.95], [0.6, 0.3, 0.95]] for index, item in enumerate(domain_color_list): domain_color_key[item] = color_palette[index % len(color_palette)] ### cycle through the six domain colors #for i in domain_color_key: #print(i, domain_color_key[i], "\n") Y = 100 Transcript_to_Y = {} for transcript in transcript_db: Transcript_to_Y[transcript] = Y Y = Y + 300 import traceback def onpick(event): #ind = event.ind print(event.artist.get_label()) #for i in domainAnnotation_db: print(i,len(domainAnnotation_db));break fig = pylab.figure() ylim = Y + 200 currentAxis = pylab.gca() #ax = pylab.axes() ax = fig.add_subplot(111) X_Pos_List = [] CoordsBank = [] for transcript in transcript_db: try: Junc_List = junctions[transcript] y_pos = Transcript_to_Y[transcript] Gene_List = exon_db[transcript] color_flag = 1 for entry in Gene_List: G_start = entry[0][0] G_end = entry[0][1] Exon_Object = entry[2] try: LabelClass = splice_db[Exon_Object.EnsemblRegion()] ExonName = 
Exon_Object.EnsemblExon() RegCall = LabelClass.RegCall() SplicingIndex = LabelClass.SplicingIndex() PVal = LabelClass.PVal() Midas = LabelClass.Midas() Label = "\n" + "Exon: " + str( ExonName) + "\n" + "RegCall: " + str( RegCall) + "\n" + "Splicing Index: " + str( SplicingIndex) + "\n" + "P-Value: " + str( PVal) + "\n" + "Midas Value: " + str( Midas) + "\n" Label = string.replace(Label, "\n", " ") if (RegCall == "UC"): color_choice = "Grey" else: S_Int = float(SplicingIndex) if (S_Int > 0): #color_choice = (0.7, 0.7, 0.99) color_choice = 'blue' if (S_Int < 0): #color_choice = (0.8, 0.4, 0.4) color_choice = 'red' except: #print(traceback.format_exc());sys.exit() Label = "" color_choice = "Grey" #print("Start", G_start, "end", G_end, "Region", entry[2].EnsemblRegion()) if ((color_flag % 2) == 0): currentAxis.add_patch( Rectangle((G_start, y_pos), (G_end - G_start), 50, color=color_choice, label=(entry[2].EnsemblRegion() + Label), picker=True)) y_end = y_pos + 50 try: CoordsBank.append( (G_start, G_end, y_pos, y_end, 'Exon: ' + entry[2].EnsemblRegion() + ' ' + 'SI: ' + str(SplicingIndex)[:4] + ' Pval: ' + str(Midas)[:4])) except Exception: CoordsBank.append( (G_start, G_end, y_pos, y_end, 'Exon: ' + entry[2].EnsemblRegion())) #print(entry[2].EnsemblRegion(),y_pos,y_end) if ((color_flag % 2) != 0): currentAxis.add_patch( Rectangle((G_start, y_pos), (G_end - G_start), 50, color=color_choice, label=(entry[2].EnsemblRegion() + Label), picker=True)) y_end = y_pos + 50 try: CoordsBank.append( (G_start, G_end, y_pos, y_end, 'Exon: ' + entry[2].EnsemblRegion() + ' ' + 'SI: ' + str(SplicingIndex)[:4] + ' p-value: ' + str(Midas)[:4])) except Exception: CoordsBank.append( (G_start, G_end, y_pos, y_end, 'Exon: ' + entry[2].EnsemblRegion())) #print(entry[2].EnsemblRegion(),y_pos,y_end) color_flag = color_flag + 1 if (entry[2].EnsemblRegion() in microRNA_db): microRNA_object = microRNA_db[entry[2].EnsemblRegion()] mr_label = "MICRORNA MATCHES" + "\n" for class_object in microRNA_object: mr_exonname = class_object.ExonBlock() mr_desc = class_object.Description( ) + " " + class_object.Algorithms() #print(mr_desc) mr_label = mr_label + mr_desc + "\n" currentAxis.add_patch( Rectangle((G_start, (y_pos - 75)), (G_end - G_start), 40, color="Green", label=(mr_label), picker=True)) y_start = y_pos - 75 y_end = y_pos - 35 CoordsBank.append( (G_start, G_end, y_start, y_end, mr_desc)) for entry in Junc_List: junctionID = entry[-1] try: LabelClass = splice_db[entry[2]] RegCall = LabelClass.RegCall() SplicingIndex = LabelClass.SplicingIndex() PVal = LabelClass.PVal() Midas = LabelClass.Midas() Label = "\n" + "RegCall: " + str( RegCall) + "\n" + "Splicing Index: " + str( SplicingIndex) + "\n" + "P-Value: " + str( PVal) + "\n" + "Midas Value: " + str( Midas) + "\n" if (float(SplicingIndex) > 0): color_junc = "blue" if (float(SplicingIndex) < 0): color_junc = "red" if (RegCall == "UC"): color_junc = "grey" except: Label = "" color_junc = "grey" currentAxis.add_patch( Rectangle((entry[0], y_pos), (entry[1] - entry[0]), 50, color="White", label=(str(entry[2]) + Label), picker=True)) ax.arrow(entry[0], (y_pos + 50), 8, 40, label=(str(entry[2]) + Label), color=color_junc, picker=True) ax.arrow((entry[0] + 8), (y_pos + 90), 11, -40, label=(str(entry[2]) + Label), color=color_junc, picker=True) y_start = y_pos y_end = y_pos + 30 #print(junctionID,y_start,y_end) CoordsBank.append((G_start, G_end, y_start, y_end, junctionID)) try: P_Bound_List = p_boundaries[transcript] E_Start = P_Bound_List[-2] E_End = P_Bound_List[-1] P_Start = 
P_Bound_List[1] P_End = P_Bound_List[2] #print("Boundaries: ", P_Start, P_End) X_Pos_List.append(int(E_End)) #currentAxis.add_patch(Rectangle((E_Start, y_pos), E_End, 50, color = "Blue")) try: currentAxis.add_patch( Rectangle((P_Start, (y_pos + 120)), (P_End - P_Start), 10)) except: pass p_label_list = ["DEF"] #CoordsBank.append((P_Start, P_End, y_pos, P_End - P_Start, transcript)) ### Added by NS - needs work try: P_Domain_List = p_domains[transcript] except Exception: P_Domain_List = [] for entry in P_Domain_List: #print("Domain", entry) color_domain_choice = domain_color_key[entry[1]] domain_annotation = domainAnnotation_db[entry[1]] #domain_annotation = string.replace(domain_annotation,'REGION-','') p_label = (str(entry[0]) + " " + str(domain_annotation)) #print(entry[0], entry[2], entry[3], P_Start, P_End, domain_annotation, ) Repeat_Flag = 0 for i in p_label_list: if (p_label == i): Repeat_Flag = 1 if (Repeat_Flag == 1): continue p_label_list.append(p_label) currentAxis.add_patch( Rectangle((entry[2], y_pos + 100), (entry[3] - entry[2]), 50, color=color_domain_choice, label=p_label, picker=True)) y_start = y_pos + 100 y_end = y_pos + 150 CoordsBank.append( (entry[2], entry[3], y_start, y_end, p_label)) except Exception: pass #print(traceback.format_exc()) except: #print(traceback.format_exc()) pass pylab.ylim([0.0, ylim]) try: max_x = max(X_Pos_List) except: max_x = 5000 try: pylab.xlim([0.0, max_x]) except: pylab.xlim([0.0, 3000]) fig.canvas.mpl_connect('pick_event', onpick) def format_coord(x, y): for m in CoordsBank: if (x >= m[0] and x <= m[1] and y >= m[2] and y <= m[3]): string_display = m[4] return string_display string_display = " " return string_display ax.format_coord = format_coord #datacursor(hover=True, formatter='{label}'.format, bbox=dict(fc='yellow', alpha=1), arrowprops=None) pylab.show()
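### Minimal matplotlib sketch (assumes matplotlib is installed) of the picker/onpick pattern
### remoteGene uses for clickable exon rectangles: patches are created with picker=True and a
### label, and a 'pick_event' handler echoes that label. The coordinates and label are made up.
import pylab
from matplotlib.patches import Rectangle

fig = pylab.figure()
ax = fig.add_subplot(111)
ax.add_patch(Rectangle((10, 10), 40, 5, color='blue', label='Exon: E2.1', picker=True))
def onpick(event):
    print(event.artist.get_label()) ### prints 'Exon: E2.1' when the rectangle is clicked
fig.canvas.mpl_connect('pick_event', onpick)
pylab.xlim([0, 100]); pylab.ylim([0, 30])
pylab.show()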
def getPlatform(filename): prefix = string.split(export.findFilename(filename), '.')[0] array_type = string.split(prefix, '_')[1] if array_type != 'RNASeq': array_type = string.lower(array_type) return array_type
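### Worked example of the filename convention getPlatform expects: the platform is the second
### '_'-delimited token of the filename's prefix (before the first '.'). The path below is a
### hypothetical splicing-index results file following that convention.
print getPlatform('AltResults/ProcessedSpliceData/Hs_RNASeq_top_alt_junctions-splicing-index.txt') ### -> 'RNASeq'; any other platform token is lower-cased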
def parseJunctionEntries(bam_dir, multi=False, Species=None, ReferenceDir=None): global bam_file global splicesite_db global IndicatedSpecies global ExonReference IndicatedSpecies = Species ExonReference = ReferenceDir bam_file = bam_dir try: splicesite_db, chromosomes_found, gene_coord_db = retreiveAllKnownSpliceSites( ) except Exception: print traceback.format_exc() splicesite_db = {} chromosomes_found = {} start = time.time() try: import collections junction_db = collections.OrderedDict() except Exception: try: import ordereddict junction_db = ordereddict.OrderedDict() except Exception: junction_db = {} original_junction_db = copy.deepcopy(junction_db) bamf = pysam.Samfile(bam_dir, "rb") ### Is there an indexed .bai for the BAM? Check. try: for entry in bamf.fetch(): codes = map(lambda x: x[0], entry.cigar) break except Exception: ### Make BAM Index if multi == False: print 'Building BAM index file for', bam_dir bam_dir = str(bam_dir) #On Windows, this indexing step will fail if the __init__ pysam file line 51 is not set to - catch_stdout = False pysam.index(bam_dir) bamf = pysam.Samfile(bam_dir, "rb") chromosome = False chromosomes = {} bam_reads = 0 count = 0 jid = 1 prior_jc_start = 0 l1 = None l2 = None o = open(string.replace(bam_dir, '.bam', '__junction.bed'), "w") o.write('track name=junctions description="TopHat junctions"\n') export_isoform_models = False if export_isoform_models: io = open(string.replace(bam_dir, '.bam', '__isoforms.txt'), "w") isoform_junctions = copy.deepcopy(junction_db) outlier_start = 0 outlier_end = 0 read_count = 0 c = 0 for entry in bamf.fetch(): bam_reads += 1 try: cigarstring = entry.cigarstring except Exception: codes = map(lambda x: x[0], entry.cigar) if 3 in codes: cigarstring = 'N' else: cigarstring = None if cigarstring != None: if 'N' in cigarstring: ### Hence a junction if prior_jc_start == 0: pass elif (entry.pos - prior_jc_start) > 5000 or bamf.getrname( entry.rname ) != chromosome: ### New chr or far from prior reads writeJunctionBedFile(junction_db, jid, o) #writeIsoformFile(isoform_junctions,io) junction_db = copy.deepcopy( original_junction_db) ### Re-set this object jid += 1 chromosome = bamf.getrname(entry.rname) chromosomes[chromosome] = [] ### keep track X = entry.pos #if entry.query_name == 'SRR791044.33673569': #print chromosome, entry.pos, entry.reference_length, entry.alen, entry.query_name Y = entry.pos + entry.alen prior_jc_start = X try: tophat_strand = entry.opt( 'XS' ) ### TopHat knows which sequences are likely real splice sites so it assigns a real strand to the read except Exception: #if multi == False: print 'No TopHat strand information';sys.exit() tophat_strand = None coordinates, up_to_intron_dist = getSpliceSites(entry.cigar, X) #if count > 100: sys.exit() #print entry.query_name,X, Y, entry.cigarstring, entry.cigar, tophat_strand for (five_prime_ss, three_prime_ss) in coordinates: jc = five_prime_ss, three_prime_ss #print X, Y, jc, entry.cigarstring, entry.cigar try: junction_db[chromosome, jc, tophat_strand].append( [X, Y, up_to_intron_dist]) except Exception: junction_db[chromosome, jc, tophat_strand] = [[ X, Y, up_to_intron_dist ]] if export_isoform_models: try: mate = bamf.mate( entry ) #https://groups.google.com/forum/#!topic/pysam-user-group/9HM6nx_f2CI if 'N' in mate.cigarstring: mate_coordinates, mate_up_to_intron_dist = getSpliceSites( mate.cigar, mate.pos) else: mate_coordinates = [] except Exception: mate_coordinates = [] #print coordinates,mate_coordinates junctions = map(lambda x: tuple(x), coordinates) 
if len(mate_coordinates) > 0: try: isoform_junctions[chromosome, tuple(junctions), tophat_strand].append( mate_coordinates) except Exception: isoform_junctions[chromosome, tuple(junctions), tophat_strand] = [ mate_coordinates ] else: if (chromosome, tuple(junctions), tophat_strand) not in isoform_junctions: isoform_junctions[chromosome, tuple(junctions), tophat_strand] = [] count += 1 writeJunctionBedFile(junction_db, jid, o) ### One last read-out if multi == False: print bam_reads, count, time.time( ) - start, 'seconds required to parse the BAM file' o.close() bamf.close() missing_chromosomes = [] for chr in chromosomes_found: if chr not in chromosomes: chr = string.replace(chr, 'chr', '') if chr not in chromosomes_found: if chr != 'M' and chr != 'MT': missing_chromosomes.append(chr) #missing_chromosomes = ['A','B','C','D'] try: bam_file = export.findFilename(bam_file) except Exception: pass return bam_file, missing_chromosomes
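### Hypothetical call sketch for the function above: parse junction-spanning reads from a BAM
### (pysam required; an index is built automatically if missing). 'sample.bam' is a placeholder
### path; the function writes sample__junction.bed next to the BAM and returns any chromosomes
### missing from the known splice-site database.
bam_file, missing_chromosomes = parseJunctionEntries('sample.bam', multi=False, Species='Hs')
print bam_file, missing_chromosomes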
def importGeneExpressionValues(filename,tissue_specific_db,translation_db,useLog=False,previouslyRun=False,species=None): ### Import gene-level expression raw values fn=filepath(filename); x=0; genes_added={}; gene_expression_db={} dataset_name = export.findFilename(filename) max_val=0 print 'importing:',dataset_name try: import gene_associations, OBO_import gene_to_symbol = gene_associations.getGeneToUid(species,('hide','Ensembl-Symbol')) symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol) except Exception: symbol_to_gene={} for line in open(fn,'rU').xreadlines(): data = cleanUpLine(line) t = string.split(data,'\t') if x==0: if '#' not in data: for i in t[1:]: sample_headers.append(i) x=1 else: gene = t[0] #if '-' not in gene and ':E' in gene: print gene;sys.exit() if analysis_type == 'AltExon': try: ens_gene,exon = string.split(gene,'-')[:2] except Exception: exon = gene gene = exon if keyed_by == 'translation': ### alternative value is 'primaryID' """if gene == 'ENSMUSG00000025915-E19.3': for i in translation_db: print [i], len(translation_db); break print gene, [translation_db[gene]];sys.exit()""" try: gene = translation_db[gene] ### Ensembl annotations except Exception: pass try: gene = symbol_to_gene[gene][0] ### If RNASeq is the selected platform and Symbol is the uid except Exception: pass if gene in tissue_specific_db: index,tissue_exp=tissue_specific_db[gene] try: genes_added[gene]+=1 except Exception: genes_added[gene]=1 proceed=True try: exp_vals = map(float, t[1:]) if platform == 'RNASeq': if max(exp_vals)>max_val: max_val = max(exp_vals) #if max(exp_vals)<3: proceed=False if useLog==False: exp_vals = map(lambda x: math.log(x+1,2),exp_vals) if value_type == 'calls': ### Hence, this is a DABG or RNA-Seq expression exp_vals = produceDetectionCalls(exp_vals,targetPlatform) ### 0 or 1 calls if proceed: gene_expression_db[gene] = [index,exp_vals] except Exception: print 'Non-numeric values detected:' x = 5 print t[:x] while x < len(t): print t[x:x+5] x+=5 print 'Formatting error encountered in:',dataset_name; forceError ### undefined name - deliberately halts execution with a NameError print len(gene_expression_db), 'matching genes in the dataset and tissue compendium database' for gene in genes_added: if genes_added[gene]>1: del gene_expression_db[gene] ### delete entries that are present in the input set multiple times (not trustworthy) else: expession_subset.append(gene_expression_db[gene]) ### These contain the rank order and expression #print len(expession_subset);sys.exit() expession_subset.sort() ### This order now matches that of tissue_specific_db gene_expression_db=[] if max_val<20 and platform == 'RNASeq' and previouslyRun==False: ### Only allow this to happen once importGeneExpressionValues(filename,tissue_specific_db,translation_db,useLog=True,previouslyRun=True,species=species)
def parseJunctionEntries(bam_dir,multi=False): global bam_file global splicesite_db bam_file = bam_dir try: splicesite_db,chromosomes_found = retreiveAllKnownSpliceSites() except Exception: splicesite_db={}; chromosomes_found={} start = time.time() try: import collections; junction_db=collections.OrderedDict() except Exception: try: import ordereddict; junction_db = ordereddict.OrderedDict() except Exception: junction_db={} original_junction_db = copy.deepcopy(junction_db) bamf = pysam.Samfile(bam_dir, "rb" ) chromosome = False chromosomes={} count=0 jid = 1 prior_jc_start=0 l1 = None; l2=None o = open (string.replace(bam_dir,'.bam','__junction.bed'),"w") o.write('track name=junctions description="TopHat junctions"\n') outlier_start = 0; outlier_end = 0; read_count = 0 for entry in bamf.fetch(): #chromosome = bamf.getrname( entry.rname ) codes = map(lambda x: x[0],entry.cigar) try: cigarstring = entry.cigarstring except Exception: if 3 in codes: cigarstring = 'N' else: cigarstring = None if cigarstring != None: if 'N' in cigarstring: ### Hence a junction if entry.cigar[0][1]<60 and entry.cigar[0][1]>20: """ if count<310: a1 = entry.seq[entry.cigar[0][1]-5:entry.cigar[0][1]] a2 = entry.seq[entry.cigar[0][1]:entry.cigar[0][1]+6] if l1==a1 and l2==a2: continue else: print entry.opt('XS'), a1,a2, entry.seq l1 = a1; l2 = a2 else: sys.exit()""" if prior_jc_start == 0: pass elif (entry.pos-prior_jc_start) > 5000 or bamf.getrname( entry.rname ) != chromosome: ### New chr or far from prior reads writeJunctionBedFile(junction_db,jid,o) junction_db = copy.deepcopy(original_junction_db) ### Re-set this object jid+=1 chromosome = bamf.getrname( entry.rname ) chromosomes[chromosome]=[] ### keep track X=entry.pos Y=entry.pos+entry.alen prior_jc_start = X if entry.is_reverse: strand = '-' ### This is the strand the seq aligns to but not necessarily the REAL strand the mRNA aligns to (see XS below) else: strand = '+' try: tophat_strand = entry.opt('XS') ### TopHat knows which sequences are likely real splice sites so it assigns a real strand to the read except Exception: #if multi == False: print 'No TopHat strand information';sys.exit() tophat_strand = None coordinates,up_to_intron_dist = getSpliceSites(entry.cigar,X) for (five_prime_ss,three_prime_ss) in coordinates: jc = five_prime_ss,three_prime_ss #print X, Y, jc, entry.cigarstring, entry.cigar try: junction_db[chromosome,jc,tophat_strand].append([X,Y,up_to_intron_dist]) except Exception: junction_db[chromosome,jc,tophat_strand] = [[X,Y,up_to_intron_dist]] count+=1 writeJunctionBedFile(junction_db,jid,o) ### One last read-out if multi == False: print time.time()-start, 'seconds required to parse the BAM file' o.close() bamf.close() missing_chromosomes=[] for chr in chromosomes_found: if chr not in chromosomes: chr = string.replace(chr,'chr','') if chr not in chromosomes_found: if chr != 'M' and chr != 'MT': missing_chromosomes.append(chr) #missing_chromosomes = ['A','B','C','D'] try: bam_file = export.findFilename(bam_file) except Exception: pass return bam_file, missing_chromosomes
def Enrichment(Inputfile,mutdict,mutfile,metaDataMatrixFormat,header): import mappfinder prev="" head=0 group=defaultdict(list) mut=export.findFilename(mutfile) dire=export.findParentDir(Inputfile) output_dir = dire+'MutationEnrichment' print output_dir export.createExportFolder(output_dir) number_of_samples = 0 ### All enrichment results exportnam=output_dir+'/Enrichment_Results.txt' export_enrich=open(exportnam,"w") ### Selected enrichment results based on p-value, sensitivity and specificity for association with cluster names exportnam=output_dir+'/Enrichment_tophits.txt' export_hit=open(exportnam,"w") title_row = "Mutations"+"\t"+"Cluster"+"\t"+"r"+"\t"+"R"+"\t"+"n"+"\t"+"Sensitivity"+"\t"+"Specificity"+"\t"+"z-score"+"\t"+"Fisher exact test"+"\t"+"adjp value"+"\n" ### title row for both output files (the 'header' argument holds the sample IDs and is used below) export_enrich.write(title_row) export_hit.write(title_row) header2=returnSamplesInMetaData(Inputfile,metaDataMatrixFormat=True) print header2 for line in open(Inputfile,'rU').xreadlines(): if head > 0: number_of_samples+=1 line=line.rstrip('\r\n') q = string.split(line,'\t') for i in range(1,len(q)): if q[i]==str(1): #group[q[0]].append(header2[i-1]) group[header2[i-1]].append(q[0]) ### [Cluster] = [full_sample_ID] else: head+=1 continue print 'Number of patient samples in dataset =',number_of_samples total_Scores={} for kiy in mutdict: if kiy =="MDP": print mutdict[kiy] groupdict={} remaining=[] remaining=list(set(header) - set(mutdict[kiy])) groupdict[1]=mutdict[kiy] groupdict[2]=remaining #export_enrich1.write(kiy) for key2 in group: r=float(len(list(set(group[key2])))-len(list(set(group[key2]) - set(mutdict[kiy])))) n=float(len(group[key2])) R=float(len(set(mutdict[kiy]))) N=float(number_of_samples) if r==0 or key2=="1" or R==1.0: #print kiy,key2,r,n,R,N pval=float(1) z=float(0) null_z = 0.000 zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n) zsd.SetP(pval) else: try: z = Zscore(r,n,N,R) except: z=0 ### Calculate a z-score assuming zero matching entries try: null_z = Zscore(0,n,N,R) except Exception: null_z = 0.000 try: pval = mappfinder.FishersExactTest(r,n,R,N) zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n) zsd.SetP(pval) except Exception: pval=1.0 zsd = mappfinder.ZScoreData(key2,r,R,z,null_z,n) zsd.SetP(pval) if kiy in total_Scores: signature_db = total_Scores[kiy] signature_db[key2]=zsd ### Necessary format for the permutation function else: signature_db={key2:zsd} total_Scores[kiy] = signature_db sorted_results=[] mutlabels={} for kiy in total_Scores: signature_db = total_Scores[kiy] ### Updates the adjusted p-value instances mappfinder.adjustPermuteStats(signature_db) for signature in signature_db: zsd = signature_db[signature] results = [kiy,signature,zsd.Changed(),zsd.Measured(),zsd.InPathway(),str(float(zsd.PercentChanged())/100.0),str(float(float(zsd.Changed())/float(zsd.InPathway()))), zsd.ZScore(), zsd.PermuteP(), zsd.AdjP()] #string.join(zsd.AssociatedIDs(),'|') sorted_results.append([signature,-1*float(zsd.ZScore()),results]) sorted_results.sort() ### Sort by the negated z-score (largest z-score first) prev="" for (sig,p,values) in sorted_results: if sig!=prev: flag=True export_hit.write(string.join(values,'\t')+'\n') if flag: ### Update the cluster label to include the top enriched term meeting the sensitivity and specificity cutoffs #print values[5],values[6],values[6],values[2]; sys.exit() if (float(values[5])>=0.2 and float(values[6])>=0.2 and float(values[7])>=1.95 and float(values[2])>=2): clusterID = values[1] topEnrichedTerm=values[0] mutlabels[clusterID]=clusterID+' ('+topEnrichedTerm+')' flag=False export_hit.write(string.join(values,'\t')+'\n') export_enrich.write(string.join(values,'\t')+'\n') prev=sig if len(sorted_results)==0: export_enrich.write(string.join([mut,'NONE','NONE','NONE','NONE','NONE','NONE'],'\t')+'\n') export_enrich.close() export_hit.close() return mutlabels
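### Sketch of the enrichment statistic Enrichment() leans on: a MAPPFinder-style z-score for
### observing r mutated samples in a cluster of n, given R mutated samples among N total.
### This mirrors what the external Zscore() helper presumably computes (an assumption here);
### the function and the numbers below are illustrative only.
import math
def enrichment_zscore(r, n, N, R):
    expected = n * (R / N) ### expected mutated samples in the cluster
    variance = n * (R / N) * (1 - R / N) * (1 - (n - 1) / (N - 1)) ### hypergeometric variance
    return (r - expected) / math.sqrt(variance)

print enrichment_zscore(8.0, 20.0, 100.0, 25.0) ### ~1.72 for this made-up cluster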