def justConvertFilenames(species, outputdir):
    import gene_associations
    gene_to_symbol = gene_associations.getGeneToUid(species, ('hide', 'Ensembl-Symbol'))
    import OBO_import
    symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)
    for filename in os.listdir(outputdir):
        if '.pdf' in filename or '.png' in filename:
            fn = string.replace(filename, '.pdf', '')
            fn = string.replace(fn, '.png', '')
            newname = string.split(fn, '__')
            if newname[0] in gene_to_symbol:
                new_filename = str(filename)
                if '__' in filename:
                    new_filename = string.split(filename, '__')[1]
                elif '\\' in filename:
                    new_filename = string.split(filename, '\\')[1]
                elif '/' in filename:
                    new_filename = string.split(filename, '/')[1]
                nnname = gene_to_symbol[newname[0]][0] + '-SashimiPlot_' + new_filename
                try:
                    os.rename(os.path.join(outputdir, filename), os.path.join(outputdir, nnname))
                except Exception:
                    pass
            else:
                continue

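### Several functions in this module rely on OBO_import.swapKeyValues() to invert a
### gene -> [symbol] dictionary into symbol -> [gene]. The OBO_import source is not
### included in this excerpt; the function below is a minimal stand-in sketch of the
### assumed behavior (a one-to-many mapping inverted into another one-to-many mapping),
### not the actual implementation.
def swapKeyValuesSketch(db):
    swapped = {}
    for key in db:
        for value in db[key]:
            ### accumulate every key that maps to this value
            try: swapped[value].append(key)
            except KeyError: swapped[value] = [key]
    return swapped

### Example: {'ENSG00000141510': ['TP53']} becomes {'TP53': ['ENSG00000141510']}
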
def filterRows(input_file, output_file, filterDB=None, logData=False):
    """ Write out only the rows of input_file whose gene symbol (or ID) is present in
    filterDB, preserving the order in which the IDs appear in filterDB """
    orderlst = {}
    counter = []
    export_object = open(output_file, 'w')
    firstLine = True
    Flag = 0
    species = "Hs"
    import OBO_import
    import ExpressionBuilder
    gene_to_symbol_db = ExpressionBuilder.importGeneAnnotations(species)
    symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol_db)
    for line in open(input_file, 'rU').xreadlines():
        flag1 = 0
        data = cleanUpLine(line)
        values = string.split(data, '\t')
        if firstLine:
            firstLine = False
            if Flag == 0:
                export_object.write(line)  ### retain the header row
        else:
            try:
                symbolID = gene_to_symbol_db[values[0]][0]
            except Exception:
                symbolID = values[0]
            if symbolID in filterDB:
                ### record this line at every position the symbol occupies in filterDB
                counter = [index for index, value in enumerate(filterDB) if value == symbolID]
                for it in range(0, len(counter)):
                    orderlst[counter[it]] = line
    try:
        for i in range(0, len(orderlst)):
            export_object.write(orderlst[i])
    except Exception:
        print i, filterDB[i]  ### an ID in filterDB had no matching row in the input file
    export_object.close()
    print 'Filtered rows printed to:', output_file

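### Minimal usage sketch for filterRows(). The paths and symbols below are hypothetical
### placeholders; filterDB is an ordered list of gene symbols (or Ensembl IDs) whose rows
### are copied, in that order, from the input expression file to the output file.
#query_symbols = ['TP53', 'GAPDH', 'SOX2']
#filterRows('ExpressionInput/exp.sample-data.txt',
#           'ExpressionOutput/filtered.sample-data.txt',
#           filterDB=query_symbols)
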
def sashmi_plot_list(bamdir, eventsToVisualizeFilename, PSIFilename, events=None):
    import gene_associations
    gene_to_symbol = gene_associations.getGeneToUid(species, ('hide', 'Ensembl-Symbol'))
    import OBO_import
    symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)
    if events == None:
        splicing_events, expandedSearch = importSplicingEventsToVisualize(eventsToVisualizeFilename)
    else:
        ### Replace any ":" from the input events
        #for i in range(len(events)): events[i] = string.replace(events[i],':','__')
        expandedSearch = True
        for i in range(len(events)):
            gene = string.split(events[i], '__')[0]
            if gene in gene_to_symbol:
                symbol = gene_to_symbol[gene][0]
            elif 'ENS' not in gene or 'G0000' in gene:
                if gene in symbol_to_gene:
                    ensID = symbol_to_gene[gene][0]
                    symbol = gene
                    events[i] = ensID ### translate this ID to an Ensembl gene ID for proper SashimiPlot lookup
        splicing_events = events ### optionally get from supplied variable

    if len(splicing_events) == 0:
        print eventsToVisualizeFilename
        forceNoCompatibleEventsInFile ### intentional NameError to halt when no compatible events were found

    print 'Exporting plots',

    ### Determine Groups for Coloring
    groups_file = 'None'
    dir_list = unique.read_directory(root_dir + '/ExpressionInput')
    for file in dir_list:
        if 'groups.' in file:
            groups_file = root_dir + '/ExpressionInput/' + file

    if groups_file != None:
        try:
            import ExpressionBuilder
            sample_group_db = ExpressionBuilder.simplerGroupImport(groups_file)
            groups = []
            for sample in sample_group_db:
                if sample_group_db[sample] not in groups:
                    groups.append(sample_group_db[sample]) ### create an ordered list of unique groups
        except Exception:
            groups = ['None']
            #print traceback.format_exc()
            pass

    processed_events = formatAndSubmitSplicingEventsToSashimiPlot(PSIFilename, bamdir, splicing_events, sample_group_db, groups, False)
    mopup_events = getMopUpEvents(splicing_events, processed_events)

    ### Do the same for supplied gene queries or junctions that didn't map above using the gene expression values as a guide
    #print len(splicing_events),len(processed_events),len(mopup_events)
    processed_events = formatAndSubmitSplicingEventsToSashimiPlot(steady_state_exp_file, bamdir, mopup_events, sample_group_db, groups, expandedSearch)
    if len(processed_events) > 0:
        mopup_events = getMopUpEvents(mopup_events, processed_events)
        processed_events = formatAndSubmitSplicingEventsToSashimiPlot(PSIFilename, bamdir, mopup_events, sample_group_db, groups, True)
    return gene_to_symbol

def importOntologyAnnotations(species_code, ontology_type):
    try:
        system_codes, source_types, mod_types = GO_Elite.getSourceData()
        verified_nested = OBO_import.verifyNestedFileCreation(species_code, mod_types, ontology_type)
        if verified_nested == 'no':
            force_error
        ontology_annotations = OBO_import.importPreviousOntologyAnnotations(ontology_type)
    except Exception:
        try:
            ### Occurs when the annotation file isn't built yet - if so try to build
            OBO_import.buildNestedOntologyAssociations(species_code, mod_types, ontology_type)
            ontology_annotations = OBO_import.importPreviousOntologyAnnotations(ontology_type)
        except Exception:
            ontology_annotations = None
    return ontology_annotations

def importDataSimple(filename, input_type, MOD=None, Species=None):
    id_db = {}
    fn = filepath(filename)
    x = 0
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, '\t')
        if data[0] == '#' and x == 0:
            x = 0
        elif x == 0:
            column_headers = t[1:]
            if input_type != 'LineageProfiler':
                try:
                    column_headers = t[2] ### exclude the ID, system code and p-value column headers
                except Exception:
                    column_headers = 'NA'
            x = 1
        else:
            if x == 1 and input_type != 'LineageProfiler':
                ### get system conversions
                system_code = t[1]
                import GO_Elite
                import OBO_import
                system_codes, source_types, mod_types = GO_Elite.getSourceData()
                source_data = system_codes[system_code]
                try:
                    Mod = mod ### global established in upstream functions
                    speciescode = species_code
                except Exception:
                    Mod = MOD
                    speciescode = Species
                if source_data == Mod:
                    source_is_mod = True
                else:
                    source_is_mod = False
                mod_source = Mod + '-' + source_data + '.txt'
                gene_to_source_id = gene_associations.getGeneToUid(speciescode, ('hide', mod_source))
                source_to_gene = OBO_import.swapKeyValues(gene_to_source_id)
            if input_type != 'LineageProfiler':
                if source_is_mod == True:
                    try:
                        id_db[t[0]] = float(t[2])
                    except Exception:
                        id_db[t[0]] = 'NA'
                elif t[0] in source_to_gene:
                    mod_ids = source_to_gene[t[0]]
                    for mod_id in mod_ids:
                        try:
                            value = t[2]
                        except Exception:
                            value = 'NA'
                        if value == '+':
                            value = 1
                        elif value == '-':
                            value = -1
                        try:
                            id_db[mod_id] = float(value) ### If multiple Ensembl IDs in dataset, only record the last associated fold change
                        except Exception:
                            id_db[mod_id] = 'NA'
                        break
            else:
                id_db[t[0]] = map(float, t[1:]) ### Applies to LineageProfiler
            x += 1
    #print len(id_db),column_headers
    return id_db, column_headers

def exportSymbolRelationships(pathway_to_symbol, selected_species, pathway_type, type):
    if selected_species != None: ### Restrict to selected species only
        current_species_dirs = selected_species
    else:
        current_species_dirs = unique.read_directory('/' + database_dir)

    for species in current_species_dirs:
        if '.' not in species:
            ens_dir = database_dir + '/' + species + '/gene-' + type + '/Ensembl-' + pathway_type + '.txt'
            ens_data = export.ExportFile(ens_dir)
            if 'mapp' in type:
                ens_data.write('GeneID\tSystem\tGeneSet\n')
            else:
                ens_data.write('GeneID\tGeneSet\n')
            try:
                ens_to_entrez = gene_associations.getGeneToUid(species, ('hide', 'Ensembl-EntrezGene'))
            except Exception:
                ens_to_entrez = {}
            if len(ens_to_entrez) > 0:
                entrez_dir = database_dir + '/' + species + '/gene-' + type + '/EntrezGene-' + pathway_type + '.txt'
                entrez_data = export.ExportFile(entrez_dir)
                if 'mapp' in type:
                    entrez_data.write('GeneID\tSystem\tGeneSet\n')
                else:
                    entrez_data.write('GeneID\tGeneSet\n')
            #print 'Exporting '+pathway_type+' databases for:',species
            try:
                gene_to_source_id = gene_associations.getGeneToUid(species, ('hide', 'Ensembl-Symbol'))
            except Exception:
                gene_to_source_id = {}
            source_to_gene = OBO_import.swapKeyValues(gene_to_source_id)
            source_to_gene = lowerSymbolDB(source_to_gene)
            for pathway in pathway_to_symbol:
                for symbol in pathway_to_symbol[pathway]:
                    try:
                        genes = source_to_gene[symbol]
                        for gene in genes:
                            if 'mapp' in type:
                                ens_data.write(gene + '\tEn\t' + pathway + '\n')
                            else:
                                ens_data.write(gene + '\t' + pathway + '\n')
                            if gene in ens_to_entrez:
                                for entrez in ens_to_entrez[gene]:
                                    if 'mapp' in type:
                                        entrez_data.write(entrez + '\tL\t' + pathway + '\n')
                                    else:
                                        entrez_data.write(entrez + '\t' + pathway + '\n')
                    except Exception:
                        null = []
            ens_data.close()
            try:
                entrez_data.close()
            except Exception:
                null = []

def reformatPolyAdenylationCoordinates(species, force):
    """ PolyA annotations are currently only available from UCSC for human, but flat file
    annotations from 2003-2006 are available for multiple species. Convert these to BED format"""
    version = {}
    version['Rn'] = '2003(rn3)'
    version['Dr'] = '2003(zv4)'
    version['Gg'] = '2004(galGal2)'
    version['Hs'] = '2006(hg8)'
    version['Mm'] = '2004(mm5)'

    print 'Exporting polyADB_2 coordinates as BED for', species
    ### Obtain the necessary database files
    url = 'http://altanalyze.org/archiveDBs/all/polyAsite.txt'
    output_dir = 'AltDatabase/ucsc/' + species + '/'
    if force == 'yes':
        filename, status = update.download(url, output_dir, '')
    else:
        filename = output_dir + 'polyAsite.txt'

    ### Import the refseq to Ensembl information
    import gene_associations
    import OBO_import
    import EnsemblImport
    import export
    try:
        ens_unigene = gene_associations.getGeneToUid(species, 'Ensembl-UniGene')
        print len(ens_unigene), 'Ensembl-UniGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_unigene)
        use_entrez = 'no'
    except Exception:
        ens_entrez = gene_associations.getGeneToUid(species, 'Ensembl-EntrezGene')
        print len(ens_entrez), 'Ensembl-EntrezGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_entrez)
        use_entrez = 'yes'
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species, 'RNASeq', 'key_by_array')

    export_bedfile = output_dir + species + '_polyADB_2_predictions.bed'
    print 'exporting', export_bedfile
    export_data = export.ExportFile(export_bedfile)
    header = '#' + species + '\t' + 'polyADB_2' + '\t' + version[species] + '\n'
    export_data.write(header)

    fn = filepath(filename)
    x = 0
    not_found = {}
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        if x == 0:
            x = 1
        else:
            siteid, llid, chr, sitenum, position, supporting_EST, cleavage = string.split(data, '\t')
            if chr == 'chrM':
                chr = 'chrMT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if chr == 'M':
                chr = 'MT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if species in siteid:
                if 'NA' not in chr:
                    chr = 'chr' + chr
                strand = '+'
                geneid = siteid
                pos_start = str(int(position) - 1)
                pos_end = position
                if use_entrez == 'no':
                    external_geneid = string.join(string.split(siteid, '.')[:2], '.')
                else:
                    external_geneid = llid
                if external_geneid in external_ensembl:
                    ens_geneid = external_ensembl[external_geneid][0]
                    geneid += '-' + ens_geneid
                    chr, strand, start, end = gene_location_db[ens_geneid]
                else:
                    not_found[external_geneid] = []
                    bed_format = string.join([chr, pos_start, pos_end, geneid, '0', '-'], '\t') + '\n' ### We don't know the strand, so write out both strands
                    export_data.write(bed_format)
                bed_format = string.join([chr, pos_start, pos_end, geneid, '0', strand], '\t') + '\n'
                export_data.write(bed_format)
    export_data.close()

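### reformatPolyAdenylationCoordinates() converts each one-based polyA site position into
### a zero-based, half-open BED interval (start = position - 1, end = position). Below is a
### minimal self-contained illustration of that conversion; the helper name and the values
### in the commented example are hypothetical.
def polyASiteToBED(chr, position, geneid, strand):
    pos_start = str(int(position) - 1)  ### BED starts are zero-based
    pos_end = str(position)             ### BED ends are exclusive
    return '\t'.join([chr, pos_start, pos_end, geneid, '0', strand]) + '\n'

#print polyASiteToBED('chr17', 43125364, 'Hs.12345.1.2', '+'),
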
def performOntologyORA(ontology_dir):
    """ Perform over-representation analysis (ORA) on any provided Ontology """
    start_time = time.time()
    ontology_type = getResourceType(ontology_dir)

    ######### Import Gene-to-Nested-Ontology #########
    gene_to_ontology = gene_associations.importGeneToOntologyData(species_code, mod, 'nested', ontology_type)
    ontology_to_gene = OBO_import.swapKeyValues(gene_to_ontology)
    if len(gene_to_ontology) == 0:
        return 0, None
    else:
        ######### Calculate primary z-scores for GO terms
        ontology_to_mod_genes = getGenesInPathway(input_gene_list, gene_to_ontology) ### For summary gene reporting
        ontology_input_gene_count, Rg, input_linked_ontology = countGenesInPathway(input_gene_list, gene_to_ontology, 'yes')
        ontology_denominator_gene_count, Ng, denom_linked_ontology = countGenesInPathway(denominator_gene_list, gene_to_ontology, 'yes')
        #print Ng,"unique genes, linked to GO and in dataset and", Rg, "unique GO linked genes matching criterion."
        calculateZScores(ontology_input_gene_count, ontology_denominator_gene_count, Ng, Rg, ontology_to_gene, 'Ontology')

        if use_FET == 'no':
            ###Beginning Ontology Permutation Analysis
            try:
                original_increment = int(permutations / 10)
                increment = original_increment
            except Exception:
                null = None
            x = 0
            permute_ontology_inputs = []
            if permutations != 0:
                print '*',
            for permute_input_list in permute_inputs: ### http://docs.python.org/library/multiprocessing.html
                if x == increment:
                    increment += original_increment
                    print '*',
                x += 1
                permute_ontology_input_gene_count, null, null = countGenesInPathway(permute_input_list, gene_to_ontology, 'no')
                permute_input_list = []
                permute_ontology_inputs.append(permute_ontology_input_gene_count)
            #if permutations !=0: print 'Gene Ontology finished'
            calculatePermuteZScores(permute_ontology_inputs, ontology_denominator_gene_count, Ng, Rg)
        calculatePermuteStats(original_ontology_z_score_data)
        adjustPermuteStats(original_ontology_z_score_data)
        go_headers = formatHeaders(gene_file, input_count, input_linked_ontology, denom_count, denom_linked_ontology, Rg, Ng, 'Ontology', OBO_date)
        exportPathwayData(original_ontology_z_score_data, gene_file, go_headers, ontology_type, 'Ontology')
        ### Export all gene associations (added in version 1.21)
        exportPathwayToGeneAssociations(ontology_to_mod_genes, mod, gene_file, gene_annotations, ontology_type, 'Ontology')
        end_time = time.time()
        time_diff = formatTime(start_time, end_time)
        print "Initial results for %s calculated in %s seconds" % (ontology_type, time_diff)
        permute_ontology_inputs = []
        return 1, ontology_to_mod_genes

def performGeneSetORA(geneset_dir):
    """ Perform over-representation analysis (ORA) on any provided Gene Set """
    start_time = time.time()
    geneset_type = getResourceType(geneset_dir)
    #permuted_z_scores={}; original_mapp_z_score_data={}
    if geneset_type == 'Pathways':
        geneset_type = 'WikiPathways'

    ### Since MAPP tables can be provided by the user, allow the file to be missing
    if geneset_dir == 'UserSuppliedAssociations':
        gene_to_mapp = gene_associations.importGeneCustomData(species_code, system_codes, custom_sets_folder, mod)
        geneset_type = geneset_dir
    else:
        try:
            gene_to_mapp = gene_associations.importGeneMAPPData(species_code, geneset_dir)
        except Exception:
            gene_to_mapp = {}
    mapp_to_gene = OBO_import.swapKeyValues(gene_to_mapp)
    if len(gene_to_mapp) == 0:
        return 0, None
    else:
        ###Calculate primary z-scores for GeneSets
        mapp_to_mod_genes = getGenesInPathway(input_gene_list, gene_to_mapp) ### For summary reporting
        mapp_input_gene_count, Rm, input_linked_mapp = countGenesInPathway(input_gene_list, gene_to_mapp, 'yes')
        mapp_denominator_gene_count, Nm, denom_linked_mapp = countGenesInPathway(denominator_gene_list, gene_to_mapp, 'yes')
        #print Nm,"unique genes, linked to GeneSets and in dataset and", Rm, "unique GeneSets\n linked genes matching criterion."
        calculateZScores(mapp_input_gene_count, mapp_denominator_gene_count, Nm, Rm, mapp_to_gene, 'MAPP')

        if use_FET == 'no':
            permute_mapp_inputs = []
            ###Begin GeneSets Permutation Analysis
            try:
                original_increment = int(permutations / 10)
                increment = original_increment
            except Exception:
                null = None
            x = 0
            if permutations != 0:
                print '*',
            for permute_input_list in permute_inputs:
                if x == increment:
                    increment += original_increment
                    print '*',
                x += 1
                permute_mapp_input_gene_count, null, null = countGenesInPathway(permute_input_list, gene_to_mapp, 'no')
                permute_mapp_inputs.append(permute_mapp_input_gene_count)
            calculatePermuteZScores(permute_mapp_inputs, mapp_denominator_gene_count, Nm, Rm)
        calculatePermuteStats(original_mapp_z_score_data)
        adjustPermuteStats(original_mapp_z_score_data)
        mapp_headers = formatHeaders(gene_file, input_count, input_linked_mapp, denom_count, denom_linked_mapp, Rm, Nm, 'MAPP', OBO_date)
        exportPathwayData(original_mapp_z_score_data, gene_file, mapp_headers, geneset_type, 'local')
        ### Export all gene associations (added in version 1.21)
        exportPathwayToGeneAssociations(mapp_to_mod_genes, mod, gene_file, gene_annotations, geneset_type, 'local')
        end_time = time.time()
        time_diff = formatTime(start_time, end_time)
        print "Initial results for %s calculated in %s seconds" % (geneset_type, time_diff)
        permute_mapp_inputs = []
        return 1, mapp_to_mod_genes

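### calculateZScores() is called by the two ORA functions above but is not part of this
### excerpt. The sketch below computes the classic MAPPFinder-style over-representation
### z-score (a normal approximation to the hypergeometric distribution); the actual
### GO-Elite implementation may differ in detail. Here r = input genes in the pathway,
### n = denominator genes in the pathway, R = all pathway-linked input genes, and
### N = all pathway-linked denominator genes.
import math

def pathwayZScoreSketch(r, n, R, N):
    expected = n * (float(R) / N)
    variance = n * (float(R) / N) * (1.0 - float(R) / N) * (1.0 - float(n - 1) / (N - 1))
    if variance <= 0:
        return 0.0
    return (r - expected) / math.sqrt(variance)

#print pathwayZScoreSketch(12, 40, 300, 12000)  ### hypothetical counts
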
def importGeneExpressionValues(filename, tissue_specific_db, translation_db, useLog=False, previouslyRun=False, species=None):
    ### Import gene-level expression raw values
    fn = filepath(filename)
    x = 0
    genes_added = {}
    gene_expression_db = {}
    dataset_name = export.findFilename(filename)
    max_val = 0
    print 'importing:', dataset_name

    try:
        import gene_associations, OBO_import
        gene_to_symbol = gene_associations.getGeneToUid(species, ('hide', 'Ensembl-Symbol'))
        symbol_to_gene = OBO_import.swapKeyValues(gene_to_symbol)
    except Exception:
        symbol_to_gene = {}

    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data, '\t')
        if x == 0:
            if '#' not in data:
                for i in t[1:]:
                    sample_headers.append(i)
                x = 1
        else:
            gene = t[0]
            try:
                gene = string.split(t[0], '|')[0]
            except Exception:
                pass
            #if '-' not in gene and ':E' in gene: print gene;sys.exit()
            if analysis_type == 'AltExon':
                try:
                    ens_gene, exon = string.split(gene, '-')[:2]
                except Exception:
                    exon = gene
                gene = exon
            if keyed_by == 'translation': ### alternative value is 'primaryID'
                """if gene == 'ENSMUSG00000025915-E19.3':
                    for i in translation_db: print [i], len(translation_db); break
                    print gene, [translation_db[gene]];sys.exit()"""
                try:
                    gene = translation_db[gene] ### Ensembl annotations
                except Exception:
                    pass
            try:
                gene = symbol_to_gene[gene][0] ### If RNASeq is the selected platform and Symbol is the uid
            except Exception:
                pass
            if gene in tissue_specific_db:
                index, tissue_exp = tissue_specific_db[gene]
                try:
                    genes_added[gene] += 1
                except Exception:
                    genes_added[gene] = 1
                proceed = True
                try:
                    exp_vals = t[1:]
                    if '' in exp_vals: ### If missing values present (PSI values)
                        exp_vals = ['0.000101' if i == '' else i for i in exp_vals]
                        useLog = False
                    exp_vals = map(float, exp_vals)
                    if platform == 'RNASeq':
                        if max(exp_vals) > max_val:
                            max_val = max(exp_vals)
                        #if max(exp_vals)<3: proceed=False
                        if useLog == False:
                            exp_vals = map(lambda x: math.log(x + 1, 2), exp_vals)
                    if value_type == 'calls': ### Hence, this is a DABG or RNA-Seq expression
                        exp_vals = produceDetectionCalls(exp_vals, targetPlatform) ### 0 or 1 calls
                    if proceed:
                        gene_expression_db[gene] = [index, exp_vals]
                except Exception:
                    print 'Non-numeric values detected:'
                    x = 5
                    print t[:x]
                    while x < len(t): ### dump the remaining columns in blocks of five for inspection
                        print t[x:x + 5]
                        x += 5
                    print 'Formatting error encountered in:', dataset_name
                    forceError ### intentional NameError to halt on malformed input
            """else:
                for gene in tissue_specific_db:
                    if 'Ndufa9:ENSMUSG00000000399:I2.1-E3.1' in gene: print gene, 'dog';sys.exit()
                print gene;kill"""

    print len(gene_expression_db), 'matching genes in the dataset and tissue compendium database'

    for gene in genes_added:
        if genes_added[gene] > 1:
            del gene_expression_db[gene] ### delete entries that are present in the input set multiple times (not trustworthy)
        else:
            expession_subset.append(gene_expression_db[gene]) ### These contain the rank order and expression
    #print len(expession_subset);sys.exit()
    expession_subset.sort() ### This order now matches that of
    gene_expression_db = []
    if max_val < 20 and platform == 'RNASeq' and previouslyRun == False: ### Only allow to happen once
        importGeneExpressionValues(filename, tissue_specific_db, translation_db, useLog=True, previouslyRun=True, species=species)

def generateMAPPFinderScores(species_title, species_id, source, mod_db, system_Codes, permute, resources_to_analyze, file_dirs, parent_root):
    global mappfinder_output_dir
    global custom_sets_folder
    global root
    root = parent_root
    global mapp_to_mod_genes
    global ontology_to_mod_genes
    global system_codes
    system_codes = system_Codes
    criterion_input_folder, criterion_denom_folder, output_dir, custom_sets_folder = file_dirs
    previous_denominator_file_dir = ''
    ontology_to_mod_genes = {}
    mapp_to_mod_genes = {}
    global test
    test = 'no'
    program_type, database_dir = unique.whatProgramIsThis()
    if resources_to_analyze == 'Gene Ontology':
        resources_to_analyze = 'GeneOntology'
    if len(output_dir) == 0:
        mappfinder_output_dir = 'input/MAPPFinder'
    else:
        mappfinder_output_dir = output_dir + '/GO-Elite_results/CompleteResults/ORA'
    global source_data
    source_data = source
    global mod
    mod = mod_db
    global species_code
    species_code = species_id
    global species_name
    species_name = species_title
    global gene_to_mapp
    global permutations
    permutations = permute
    global eliminate_redundant_genes
    eliminate_redundant_genes = 'yes'
    global permuted_z_scores
    global ontology_annotations
    global original_ontology_z_score_data
    global original_mapp_z_score_data
    global input_gene_list
    global denominator_gene_list
    global gene_file
    global denom_file_status
    global input_count
    global denom_count
    global gene_annotations
    global source_to_gene
    global use_FET

    if permutations == "FisherExactTest":
        use_FET = 'yes' ### Use Fisher's Exact test instead of permutation-based p-values
        permutations = 0
    else:
        use_FET = 'no'

    start_time = time.time()
    gene_annotations = gene_associations.importGeneData(species_code, mod)
    OBO_date = importVersionData('OBO/')

    if len(criterion_input_folder) == 0:
        import_dir = '/input/GenesToQuery/' + species_code
        import_dir_alt = import_dir[1:]
    else:
        import_dir = criterion_input_folder
        import_dir_alt = criterion_input_folder
    m = GrabFiles()
    m.setdirectory(import_dir)
    try:
        dir_list = readDirText(import_dir) #send a sub_directory to a function to identify all files in a directory
    except Exception:
        print_out = 'Warning! Input directory location is not a valid folder. Exiting GO-Elite.'
        ForceCriticalError(print_out)
    try:
        denom_dir_list = readDirText(criterion_denom_folder)
    except Exception:
        print_out = 'Warning! Denominator directory location is not a valid folder. Exiting GO-Elite.'
        ForceCriticalError(print_out)
    if len(dir_list) == 0:
        error_message = 'No files with the extension ".txt" found in the input directory.'
        ForceCriticalError(error_message)
    if len(denom_dir_list) == 0:
        error_message = 'No files with the extension ".txt" found in the denominator directory.'
        ForceCriticalError(error_message)

    inputs_analyzed = 0
    for mappfinder_input in dir_list: #loop through each file in the directory
        permuted_z_scores = {}
        original_ontology_z_score_data = {}
        original_mapp_z_score_data = {}
        print 'Performing over-representation analysis (ORA) on', mappfinder_input
        gene_file_dir, gene_file = m.searchdirectory(mappfinder_input)
        ###Import Input gene/source-id lists
        input_gene_list, source_data_input, error_message = gene_associations.importUIDsForMAPPFinderQuery(import_dir_alt + '/' + gene_file, system_codes, 'no')
        input_count = len(input_gene_list)
        if 'WARNING!!!' in error_message: ### Warn the user about SwissProt issues when importing the denominator
            ForceCriticalError(error_message)
        if len(criterion_denom_folder) == 0:
            denom_folder = '/input/GenesToQuery/' + species_code + '/DenominatorGenes'
        else:
            denom_folder = criterion_denom_folder
        error_warning = "\nThe directory\n" + '[' + denom_folder + ']' + "\nwas not found. Please create the directory\nand place an appropriate denominator file\nor files in it."
        try:
            denominator_file_dir = identifyGeneFiles(denom_folder, gene_file) ###input is in input\Genes, denominator in
            denominator_file = string.split(denominator_file_dir, '/')[-1]
            print 'Using:', denominator_file, 'for the denominator.'
        except Exception:
            print_out = "WARNING: No denominator file included in\nthe Denominator directory.\nTo proceed, place all denominator\nIDs in a file in that directory."
            ForceCriticalError(print_out)
        if denominator_file_dir == previous_denominator_file_dir:
            denom_file_status = 'old'
        else:
            denom_file_status = 'new'
        if denom_file_status == 'new':
            previous_denominator_file_dir = denominator_file_dir
            denominator_gene_list, source_data_denom, error_message = gene_associations.importUIDsForMAPPFinderQuery(denominator_file_dir, system_codes, 'no')
            denom_count = len(denominator_gene_list)
            if 'SwissProt' in error_message and 'WARNING!!!' not in error_message:
                if len(input_gene_list) == 0:
                    error_message += '\nNo valid input IDs found. Exiting GO-Elite.'
                    try:
                        UI.WarningWindow(error_message, 'Warning!!! Identifier Error') ### Only warn, don't force an exit (if SwissProt full IDs are present)
                    except Exception:
                        None
                    sys.exit()
                else:
                    try:
                        UI.WarningWindow(error_message, 'Warning!!! Identifier Error') ### Only warn, don't force an exit (if SwissProt full IDs are present)
                    except Exception:
                        None
            elif len(error_message) > 0:
                ForceCriticalError(error_message)
            if len(denominator_gene_list) == len(input_gene_list):
                print_out = 'Input and Denominator lists have identical counts.\nPlease load a proper denominator set (containing\nthe input list with all assayed gene IDs) before proceeding.'
                ForceCriticalError(print_out)
            original_denominator_gene_list = []
            for id in denominator_gene_list:
                original_denominator_gene_list.append(id) ###need this to be a valid list not dictionary for permutation analysis
        if len(source_data_input) > 0:
            source_data = source_data_input ###over-ride source_data if a source was identified from the input file
        if source_data != mod:
            if denom_file_status == 'new':
                mod_source = mod + '-' + source_data + '.txt'
                #checkDenominatorMatchesInput(input_gene_list,denominator_gene_list,gene_file) ###This is checked for the source IDs not associated MOD IDs
                try:
                    gene_to_source_id = gene_associations.getGeneToUid(species_code, mod_source)
                    print mod_source, 'imported'
                except Exception:
                    try:
                        if mod == 'EntrezGene':
                            mod = 'Ensembl'
                        else:
                            mod = 'EntrezGene'
                        print 'The primary system (MOD) has been switched from', mod_db, 'to', mod, '\n(' + mod_db, 'not supported for the %s ID system).' % source_data
                        mod_source = mod + '-' + source_data + '.txt'
                        gene_to_source_id = gene_associations.getGeneToUid(species_code, mod_source)
                    except Exception:
                        print_out = "WARNING: The primary gene ID system '" + mod + "'\ndoes not support relationships with '" + source_data + "'.\nRe-run using a supported primary ID system."
                        ForceCriticalError(print_out)
                source_to_gene = OBO_import.swapKeyValues(gene_to_source_id)
                denominator_gene_list = associateInputSourceWithGene(source_to_gene, denominator_gene_list)
                ### Introduced the below method in version 1.21 to improve permutation speed (no longer need to search all source IDs)
                ### Only includes source ID to gene relationships represented in the denominator file (needed for Affymetrix)
                source_to_gene = OBO_import.swapKeyValues(denominator_gene_list)
            ###Replace input lists with corresponding MOD IDs
            input_gene_list = associateInputSourceWithGene(source_to_gene, input_gene_list)
            checkDenominatorMatchesInput(input_gene_list, denominator_gene_list, gene_file) ###This is for only the associated MOD IDs
        gd = GrabFiles()
        gd.setdirectory('/' + database_dir + '/' + species_code + '/gene-mapp')
        available_genesets = reorganizeResourceList(gd.getAllFiles(mod))
        od = GrabFiles()
        od.setdirectory('/' + database_dir + '/' + species_code + '/gene-go')
        available_ontologies = reorganizeResourceList(od.getAllFiles(mod))
        input_gene_count = len(input_gene_list) ###Count number of genes associated with source input IDs
        if len(input_gene_list) == 0 or len(denominator_gene_list) == 0:
            if len(input_gene_list) == 0:
                print_out = 'WARNING!!!! None of the input IDs provided map to genes for ' + mappfinder_input + '. Check to make sure the selected species is correct.'
                print_out += '\nSelected species: ' + species_name
                print_out += '\nInput ID system: ' + str(source_data_input)
                print_out += '\nPrimary ID system (MOD): ' + str(mod)
                ForceCriticalError(print_out)
            if len(denominator_gene_list) == 0:
                print_out = 'WARNING!!!! None of the denominator IDs provided map to genes for ' + denominator_file_dir + '. Check to make sure the selected species is correct.'
                print_out += '\nSelected species: ' + species_name
                print_out += '\nDenominator ID system: ' + str(source)
                print_out += '\nPrimary ID system (MOD): ' + str(mod)
                ForceCriticalError(print_out)
        elif len(available_ontologies) == 0 and len(available_genesets) == 0:
            print_out = 'WARNING!!!! No Ontology or GeneSets appear to be available for this species. Please supply and re-analyze.'
            ForceCriticalError(print_out)
        else:
            """ Perform permutation analysis and ORA on available GeneSets or Ontologies"""
            inputs_analyzed += 1
            global permute_inputs
            permute_inputs = []
            if permutations != 0 or use_FET == 'no':
                buildPermutationDatabase(original_denominator_gene_list, input_count)
            run_status = 0
            ### Analyzed ontologies
            if len(available_ontologies) > 0:
                print ' Analyzing input ID list with available ontologies'
                for ontology_dir in available_ontologies:
                    ontology_type = getResourceType(ontology_dir)
                    permuted_z_scores = {}
                    original_ontology_z_score_data = {}
                    #print ontology_type, resources_to_analyze
                    if resources_to_analyze == ontology_type or resources_to_analyze == 'all':
                        ontology_annotations = importOntologyAnnotations(species_code, ontology_type)
                        if ontology_annotations != None: ### Occurs when the files are named or formatted correctly
                            status, ontology_to_mod_genes = performOntologyORA(ontology_dir)
                            run_status += status
            ### Analyzed gene-sets
            if len(available_genesets) > 0:
                print ' Analyzing input ID list with available gene-sets'
                for geneset_dir in available_genesets:
                    geneset_type = getResourceType(geneset_dir)
                    permuted_z_scores = {}
                    original_mapp_z_score_data = {}
                    if resources_to_analyze == geneset_type or resources_to_analyze == 'all':
                        status, mapp_to_mod_genes = performGeneSetORA(geneset_dir)
                        run_status += status
            if len(custom_sets_folder) > 0: ### Hence - Analyze User Supplied GeneSets
                permuted_z_scores = {}
                original_mapp_z_score_data = {}
                run_status += performGeneSetORA('UserSuppliedAssociations')[0]
            permute_inputs = []
            permute_mapp_inputs = []
            ontology_input_gene_count = []
            mapp_input_gene_count = []
            if run_status == 0: ### Returns the number of successfully analyzed gene-set databases
                program_type, database_dir = unique.whatProgramIsThis()
                print_out = "Warning!!! Either the MOD you have selected: " + mod + "\nis missing the appropriate relationship files necessary to run GO-Elite\nor you have selected an invalid resource to analyze. Either replace\nthe missing MOD files in " + database_dir + '/' + species_code + ' sub-directories or\nselect a different MOD at run-time.'
                ForceCriticalError(print_out)
    end_time = time.time()
    time_diff = formatTime(start_time, end_time)
    print 'ORA analyses finished in %s seconds' % time_diff
    return ontology_to_mod_genes, mapp_to_mod_genes ###Return the MOD genes associated with each GO term and MAPP
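
### Minimal call sketch for generateMAPPFinderScores(), assembled from how the arguments
### are unpacked above. All paths are hypothetical placeholders, the source system
### ('Symbol') is only an example, and system_codes / parent_root would normally be
### supplied by the GO-Elite user interface (e.g. via GO_Elite.getSourceData()).
#file_dirs = ('input/GenesToQuery/Hs',                   ### criterion_input_folder (hypothetical)
#             'input/GenesToQuery/Hs/DenominatorGenes',  ### criterion_denom_folder (hypothetical)
#             'output',                                  ### output_dir (hypothetical)
#             '')                                        ### custom_sets_folder (none)
#ontology_genes, geneset_genes = generateMAPPFinderScores(
#    'Homo sapiens', 'Hs', 'Symbol', 'Ensembl', system_codes,
#    'FisherExactTest', 'all', file_dirs, parent_root)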