def getAnnotations(Species,array_type,reannotate_exon_seq,force): """Annotate Affymetrix exon array data using files Ensembl data (sync'ed to genome release).""" global species; species = Species; global test; global test_cluster test = 'no'; test_cluster = ['TC0701360']; data_type = 'mRNA' global ensembl_exon_db; global ensembl_exon_db; global exon_clusters; global exon_region_db ensembl_exon_db,ensembl_annot_db,exon_clusters,intron_clusters,exon_region_db,intron_retention_db,ucsc_splicing_annot_db,ens_transcript_db = EnsemblImport.getEnsemblAssociations(species,data_type,test) ensembl_probeset_db = importCriticalExonLocations(species,array_type,ensembl_exon_db,force) ###Get Pre-computed genomic locations for critical exons ensembl_probeset_db = ExonArrayEnsemblRules.annotateExons(ensembl_probeset_db,exon_clusters,ensembl_exon_db,exon_region_db,intron_retention_db,intron_clusters,ucsc_splicing_annot_db); constitutive_gene_db={} ExonArrayEnsemblRules.exportEnsemblLinkedProbesets(array_type,ensembl_probeset_db,species) print "\nCritical exon data exported coordinates, exon associations and splicing annotations exported..." ### Change filenames to reflect junction array type export_filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_probesets.txt'; ef=filepath(export_filename) export_replacement = string.replace(export_filename,'_probe','_'+array_type+'_probe') er=filepath(export_replacement); shutil.copyfile(ef,er); os.remove(ef) ### Copy file to a new name ### Export full exon seqeunce for probesets/critical exons to replace the original incomplete sequence (used for miRNA analyses) if reannotate_exon_seq == 'yes': JunctionArray.reAnnotateCriticalExonSequences(species,array_type)
def buildExonArrayExonAnnotations(species, array_type, force):
    """Build the UCSC annotation files, then rebuild the Ensembl exon annotations.

    ExonArrayEnsemblRules is imported and reloaded before its getAnnotations
    is run from scratch with the default constitutive source. The databases it
    returns are not used here, and nothing is returned to the caller.
    """
    ### Get UCSC associations (download databases if necessary)
    ucsc_settings = {
        'mRNA_Type': 'mrna',
        'export_all_associations': 'no',  ### YES only for protein prediction analysis
        'run_from_scratch': 'yes',
    }
    buildUCSCAnnoationFiles(species,
                            ucsc_settings['mRNA_Type'],
                            ucsc_settings['export_all_associations'],
                            ucsc_settings['run_from_scratch'],
                            force)

    import ExonArrayEnsemblRules
    reload(ExonArrayEnsemblRules)

    ### Build the databases and return the variables (not used here)
    source_biotype = 'gene' if array_type == 'gene' else 'mRNA'
    results = ExonArrayEnsemblRules.getAnnotations('yes', 'default', source_biotype, species)
    probeset_db, annotate_db, constitutive_gene_db, splicing_analysis_db = results
def buildExonArrayExonAnnotations(species, array_type, force):
    """Regenerate the UCSC and Ensembl annotation databases for an exon-style array.

    First builds the UCSC annotation files for 'mrna' (without exporting all
    associations), then reloads ExonArrayEnsemblRules and runs its
    getAnnotations from scratch. The source biotype is 'gene' for gene arrays
    and 'mRNA' otherwise. The returned databases are discarded.
    """
    ### Get UCSC associations (download databases if necessary).
    ### Positional values: mRNA type, export_all_associations ('yes' only for
    ### protein prediction analysis), run_from_scratch.
    buildUCSCAnnoationFiles(species, 'mrna', 'no', 'yes', force)

    import ExonArrayEnsemblRules
    reload(ExonArrayEnsemblRules)

    if array_type == 'gene':
        biotype = 'gene'
    else:
        biotype = 'mRNA'

    ### Build the databases and return the variables (not used here)
    (probeset_db, annotate_db,
     constitutive_gene_db, splicing_analysis_db) = ExonArrayEnsemblRules.getAnnotations(
        'yes', 'default', biotype, species)
def grabExonIntronPromoterSequences(species,array_type,data_type,output_types):
    """Export the requested sequence types for every file in the sequence_input folder.

    species, array_type -- used to build the AltDatabase file locations.
    data_type -- 'probeset' (input read from
        AltResults/AlternativeOutput/<array_type>/sequence_input) or 'gene'
        (input read from ExpressionOutput/<array_type>/sequence_input).
    output_types -- a single output type name, or 'all'. For probeset data
        'all' expands to alt-promoter, promoter, exon, adjacent-exons and
        adjacent-introns; otherwise the value is used as the only output type.

    For each input file and each output type one file is written to the
    matching sequence_output folder by exportExonIntronPromoterSequences.
    Nothing is returned.
    """
    ### output_types could be adjacent intron sequences, adjacent exon sequences, targets exon sequence or promoter
    if data_type == 'probeset': sequence_input_dir = '/AltResults/AlternativeOutput/'+array_type+'/sequence_input'
    if data_type == 'gene': sequence_input_dir = '/ExpressionOutput/'+array_type+'/sequence_input'
    ### NOTE(review): any other data_type leaves sequence_input_dir unassigned, so the
    ### next line fails with UnboundLocalError (unchanged from the original behaviour)

    dir_list = read_directory(sequence_input_dir)

    ### Normalise the requested output types ONCE. Doing this inside the per-file
    ### loop re-wrapped the list on every pass ([[...]] for the second file), which
    ### made the filename concatenation below fail for every file after the first.
    if output_types == 'all' and data_type == 'probeset':
        output_type_list = ['alt-promoter','promoter','exon','adjacent-exons','adjacent-introns']
    else:
        output_type_list = [output_types]

    ### Locations that do not depend on the input file
    sequence_output_dir = string.replace(sequence_input_dir,'_input','_output')
    export_exon_filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_probesets.txt'
    ensembl_dir = 'AltDatabase/ensembl/'+species+'/'
    gene_seq_filename = ensembl_dir+species+'_gene-seq-2000_flank'
    analysis_type = 'get_sequence'

    for input_file in dir_list:
        ### Paths are stored with a leading '/', which is stripped before use
        filedir = sequence_input_dir[1:]+'/'+input_file
        filter_db = inputResultFiles(filedir,data_type)
        ensembl_probeset_db = ExonArrayEnsemblRules.reimportEnsemblProbesetsForSeqExtraction(export_exon_filename,data_type,filter_db)
        ensembl_probeset_db = EnsemblImport.import_sequence_data(gene_seq_filename,ensembl_probeset_db,species,analysis_type)

        for output_type in output_type_list:
            ### input_file[:-4] drops the last four characters (presumably the file extension)
            filename = sequence_output_dir[1:]+'/'+input_file[:-4]+'-'+data_type+'-'+output_type+'.txt'
            exportExonIntronPromoterSequences(filename, ensembl_probeset_db,data_type,output_type)
def displayExpressionGraph(species, Platform, exp_file, gene, transpose, display=True, showIntrons=False, analysisType='plot'): ### Get gene annotations (users can provide an Ensembl or symbol) print 'Importing exon-level expression data for visualization (be patient)...' import ExonAnalyze_module global platform platform = Platform if platform != 'AltMouse': gene_annotation_file = "AltDatabase/ensembl/" + species + "/" + species + "_Ensembl-annotations.txt" else: gene_annotation_file = "AltDatabase/" + species + "/" + platform + "/" + platform + "_gene_annotations.txt" genes = [] gene = string.replace(gene, '|', ',') gene = string.replace(gene, ' ', ',') if ',' in gene: genes += string.split(gene, ',') else: genes.append(gene) gene_db = {} for gene in genes: try: if 'ENS' in gene: try: annotate_db ### If variable is defined except Exception: annotate_db = ExonAnalyze_module.import_annotations( gene_annotation_file, platform, keyBySymbol=False) ### Make an SQLite call gene_symbol = annotate_db[gene].Symbol() else: try: annotate_db ### If variable is defined except Exception: annotate_db = ExonAnalyze_module.import_annotations( gene_annotation_file, platform, keyBySymbol=True) gene_symbol = gene gene = annotate_db[gene].GeneID() gene_db[gene] = gene_symbol except Exception: print gene, 'not in database' if len(gene_db) == 0: force_no_gene_found_error if 'AltResults' in exp_file: root_dir = string.split(exp_file, 'AltResults')[0] + 'ExonPlots/' else: root_dir = string.split(exp_file, 'ExpressionInput')[0] + 'ExonPlots/' import ExonAnalyze_module if platform == 'RNASeq': datatype = 'exons' else: datatype = 'probesets' export_exon_filename = 'AltDatabase/' + species + '/' + platform + '/' + species + '_Ensembl_' + datatype + '.txt' if verifyFileLength(export_exon_filename) == 0: rootdir = string.replace(root_dir, 'ExonPlots/', '') export_exon_filename = rootdir + '/' + export_exon_filename import ExonArrayEnsemblRules ensembl_exon_db = 
ExonArrayEnsemblRules.reimportEnsemblProbesetsForSeqExtraction( export_exon_filename, 'gene-probesets', gene_db) ### Make an SQLite call filter_db = {} for gene in ensembl_exon_db: ensembl_exon_db[gene].sort() for (index, ed, id) in ensembl_exon_db[gene]: filter_db[id] = [] try: os.mkdir(root_dir) except Exception: None ### dir exists print 'Image results being saved to the folder "ExonPlots" in the AltAnalyze results directory.' importTableEntries(exp_file, filter_db, ensembl_exon_db, gene_db, root_dir, transpose, display, showIntrons, analysisType=analysisType) ### Make an SQLite call
def displayExpressionGraph(species,Platform,exp_file,gene,transpose,display=True,showIntrons=False,analysisType='plot'): ### Get gene annotations (users can provide an Ensembl or symbol) print 'Importing exon-level expression data for visualization (be patient)...' import ExonAnalyze_module global platform platform = Platform if platform != 'AltMouse': gene_annotation_file = "AltDatabase/ensembl/"+species+"/"+species+"_Ensembl-annotations.txt" else: gene_annotation_file = "AltDatabase/"+species+"/"+platform+"/"+platform+"_gene_annotations.txt" genes=[] gene=string.replace(gene,'|',',') gene=string.replace(gene,' ',',') if ',' in gene: genes += string.split(gene,',') else: genes.append(gene) gene_db={} for gene in genes: try: if 'ENS' in gene: try: annotate_db ### If variable is defined except Exception: annotate_db = ExonAnalyze_module.import_annotations(gene_annotation_file,platform,keyBySymbol=False) ### Make an SQLite call gene_symbol = annotate_db[gene].Symbol() else: try: annotate_db ### If variable is defined except Exception: annotate_db = ExonAnalyze_module.import_annotations(gene_annotation_file,platform,keyBySymbol=True) gene_symbol = gene gene = annotate_db[gene].GeneID() gene_db[gene]=gene_symbol except Exception: #if len(gene)>0: print gene, 'not in database' pass if len(gene_db)==0: force_no_gene_found_error if 'AltResults' in exp_file: root_dir = string.split(exp_file,'AltResults')[0]+'ExonPlots/' else: root_dir = string.split(exp_file,'ExpressionInput')[0]+'ExonPlots/' import ExonAnalyze_module if platform == 'RNASeq': datatype = 'exons' else: datatype = 'probesets' export_exon_filename = 'AltDatabase/'+species+'/'+platform+'/'+species+'_Ensembl_'+datatype+'.txt' if verifyFileLength(export_exon_filename) == 0: rootdir = string.replace(root_dir,'ExonPlots/','') export_exon_filename = rootdir+'/'+export_exon_filename import ExonArrayEnsemblRules ensembl_exon_db = 
ExonArrayEnsemblRules.reimportEnsemblProbesetsForSeqExtraction(export_exon_filename,'gene-probesets',gene_db) ### Make an SQLite call filter_db = {} for gene in ensembl_exon_db: ensembl_exon_db[gene].sort() for (index,ed,id) in ensembl_exon_db[gene]: filter_db[id] = [] try: os.mkdir(root_dir) except Exception: None ### dir exists print 'Image results being saved to the folder "ExonPlots" in the AltAnalyze results directory.' importTableEntries(exp_file,filter_db,ensembl_exon_db,gene_db,root_dir,transpose,display,showIntrons,analysisType=analysisType) ### Make an SQLite call
def getJunctionComparisonsFromExport(species,array_type):
    """Export the junction comparison table <species>_junction_comps.txt for array_type.

    Reads the alternative junctions from importEnsemblUCSCAltJunctions and the
    Ensembl-linked junction probesets for the array, groups the probesets by
    the exon regions they touch, and writes one tab-delimited row per
    exclusion/inclusion pair to
    AltDatabase/<species>/<array_type>/<species>_junction_comps.txt.
    Progress and counts are printed; nothing is returned.
    """
    type = 'standard'
    gene_junction_db = importEnsemblUCSCAltJunctions(species,type)

    ### Retrieve probesets with exon-junctions associated - these are critical exons
    filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_'+array_type+'_probesets.txt'
    gene_probeset_db = ExonArrayEnsemblRules.reimportEnsemblProbesetsForSeqExtraction(filename,'junctions',{})
    ### left/right: probeset (minus its last two characters) -> region IDs seen for
    ### the '|5' / '|3' probeset entries; gene_exon_db: (gene, region) -> probesets
    ### that carry neither marker
    left={}; right={}; gene_db={}; gene_exon_db={}; nonjunction_aligning={}
    for gene in gene_probeset_db:
        for (probe_data,ed) in gene_probeset_db[gene]:
            probeset, strand, probeset_start, probeset_stop = probe_data
            region_id = string.replace(ed.RegionNumber(),'-','.')
            original_region_id = region_id
            region_ids = string.split(region_id,'|')
            ### probeset[:-2] drops the last two characters (presumably the '|5' / '|3' suffix)
            gene_db[probeset[:-2]]=gene
            #ed.AssociatedSplicingJunctions()
            r_starts=string.split(ed.ExonStart(),'|'); r_stops=string.split(ed.ExonStop(),'|')
            for region_id in region_ids:
                if '|5' in probeset:
                    try: left[probeset[:-2]].append(region_id)
                    except Exception: left[probeset[:-2]]=[region_id]
                    if strand == '+': ### If the junction probesets DO NOT align to the region coordinates, then the probeset maps to a junction outside the database
                        if probeset_stop not in r_stops: nonjunction_aligning[probeset[:-2]] = original_region_id+'_'+probeset_stop,'left'
                    ### NOTE(review): this elif is taken to pair with the strand test above
                    ### (i.e. it is the non-'+' strand case) - confirm against the original layout
                    elif probeset_start not in r_starts: nonjunction_aligning[probeset[:-2]] = original_region_id+'_'+probeset_start,'left'
                elif '|3' in probeset:
                    try: right[probeset[:-2]].append(region_id)
                    except Exception: right[probeset[:-2]]=[region_id]
                    if strand == '+':
                        if probeset_start not in r_starts: nonjunction_aligning[probeset[:-2]] = original_region_id+'_'+probeset_start,'right'
                    ### NOTE(review): as above, taken to be the non-'+' strand case - confirm
                    elif probeset_stop not in r_stops: nonjunction_aligning[probeset[:-2]] = original_region_id+'_'+probeset_stop,'right'
                else:
                    ### 'killer' is not defined in this function; presumably printing it is a
                    ### deliberate way to abort with a NameError on an unexpected region ID
                    if '_' in region_id: print killer
                    try: gene_exon_db[gene,region_id].append(probeset)
                    except Exception: gene_exon_db[gene,region_id] = [probeset]
    print 'len(nonjunction_aligning)',len(nonjunction_aligning)
    gene_exon_db = eliminateRedundant(gene_exon_db)
    junction_db={} ### Get the exon-region IDs for an exon-junction
    for probeset in left:
        gene = gene_db[probeset]
        if probeset in right:
            ### Every left-region/right-region combination of the probeset names a junction
            for region1 in left[probeset]:
                for region2 in right[probeset]:
                    junction = region1+'-'+region2
                    try: junction_db[gene,junction].append(probeset)
                    except Exception: junction_db[gene,junction] = [probeset]

    probeset_junction_export = 'AltDatabase/' + species + '/'+array_type+'/'+ species + '_junction_comps.txt'
    ### NOTE(review): the file is opened here and closed only at the end, so it stays
    ### open if anything below raises
    fn=filepath(probeset_junction_export); data = open(fn,'w')
    print "Exporting",probeset_junction_export
    title = 'gene'+'\t'+'critical_exon'+'\t'+'exclusion_junction_region'+'\t'+'inclusion_junction_region'+'\t'+'exclusion_probeset'+'\t'+'inclusion_probeset'+'\t'+'data_source'+'\n'
    data.write(title); temp_list=[]
    for (gene,critical_exon,incl_junction,excl_junction) in gene_junction_db:
        ### Rows are only produced when both junctions have probesets
        if (gene,incl_junction) in junction_db:
            incl_junction_probesets = junction_db[gene,incl_junction]
            if (gene,excl_junction) in junction_db:
                excl_junction_probesets = junction_db[gene,excl_junction]
                for incl_junction_probeset in incl_junction_probesets:
                    for excl_junction_probeset in excl_junction_probesets:
                        ### Exclusion junction versus each probeset of the critical exon;
                        ### a missing (gene, critical_exon) key is ignored
                        try:
                            for incl_exon_probeset in gene_exon_db[gene,critical_exon]:
                                if incl_junction_probeset in nonjunction_aligning or excl_junction_probeset in nonjunction_aligning: null=[]
                                else: ### Ensure the probeset DOES map to the annotated junctions
                                    temp_list.append(string.join([gene,critical_exon,excl_junction,critical_exon,excl_junction_probeset,incl_exon_probeset,'AltAnalyze'],'\t')+'\n')
                        except Exception: null=[]
                        ### Junctions whose probeset does not align to the region coordinates are
                        ### renamed. NOTE(review): the renamed incl_junction/excl_junction values
                        ### persist into the following iterations of these loops - confirm intended
                        if incl_junction_probeset in nonjunction_aligning:
                            new_region_id, side = nonjunction_aligning[incl_junction_probeset]
                            incl_junction = renameJunction(incl_junction,side,new_region_id)
                        if excl_junction_probeset in nonjunction_aligning:
                            new_region_id, side = nonjunction_aligning[excl_junction_probeset]
                            excl_junction = renameJunction(excl_junction,side,new_region_id)
                        if excl_junction_probeset!=incl_junction_probeset:
                            temp_list.append(string.join([gene,critical_exon,excl_junction,incl_junction,excl_junction_probeset,incl_junction_probeset,'AltAnalyze'],'\t')+'\n')
    ### Duplicate rows are removed before writing
    temp_list = unique.unique(temp_list)
    for i in temp_list: data.write(i)
    data.close()
    print 'Number of compared junctions exported', len(temp_list)
def getAnnotations(fl,Array_type,p_threshold,e_threshold,data_source,manufacturer,constitutive_source,Species,avg_all_for_ss,filter_by_DABG,perform_alt_analysis,expression_data_format):
    """Import probeset annotations, then export and filter the grouped expression data.

    fl -- options object; ExpFile(), StatsFile() and RootDir() are read, and
        FeatureNormalization(), excludeLowExpressionExons() and
        useJunctionsForGeneExpression() are used when available.
    Array_type -- compared against 'gene', 'junction', 'AltMouse' and 'RNASeq'.
    p_threshold -- converted to float and stored as the global dabg_p_threshold.
    e_threshold -- expression threshold; log2-transformed when
        expression_data_format is 'log'.
    data_source -- not used in this function.
    manufacturer, constitutive_source, Species -- select and parameterise the
        annotation import.
    avg_all_for_ss, filter_by_DABG, expression_data_format -- copied to module
        globals.
    perform_alt_analysis -- passed on to the export/filter helpers.

    Many settings are published as module-level globals (see the `global`
    statements). Returns (probeset_gene_db, annotate_db, comparison_filename_list).
    """
    global species; species = Species; global average_all_probesets; average_all_probesets={}
    global avg_all_probes_for_steady_state; avg_all_probes_for_steady_state = avg_all_for_ss; global filter_by_dabg; filter_by_dabg = filter_by_DABG
    global dabg_p_threshold; dabg_p_threshold = float(p_threshold); global root_dir; global biotypes; global normalize_feature_exp
    global expression_threshold; global exp_data_format; exp_data_format = expression_data_format; global UserOptions; UserOptions = fl
    global full_dataset_export_dir; global excludeLowExpressionExons

    ### The following string literal is disabled code, kept as found
    """
    try: exon_exp_threshold = fl.ExonExpThreshold()
    except Exception: exon_exp_threshold = 0
    try: exon_rpkm_threshold = fl.ExonRPKMThreshold()
    except Exception: exon_rpkm_threshold = 0
    try: gene_rpkm_threshold = fl.RPKMThreshold()
    except Exception: gene_rpkm_threshold = 0
    try: gene_exp_threshold = fl.GeneExpThreshold()
    except Exception: gene_exp_threshold = 0
    """

    ### The input expression data can be log or non-log. If non-log, transform to log in FilterDABG prior to the alternative exon analysis - v.1.16
    if expression_data_format == 'log':
        try: expression_threshold = math.log(float(e_threshold),2)
        except Exception: expression_threshold = 0 ### Applies to RNASeq datasets
    else:
        expression_threshold = float(e_threshold)

    process_from_scratch = 'no' ###internal variables used while testing
    global dabg_summary; global expression_summary; dabg_summary={};expression_summary={}
    global fulldataset_export_object; global array_type; array_type = Array_type
    global exp_analysis_type; exp_analysis_type = 'expression'
    global stats_input_dir
    expr_input_dir = fl.ExpFile(); stats_input_dir = fl.StatsFile(); root_dir = fl.RootDir()
    ### Optional settings: fall back to a default when fl does not provide them
    try: normalize_feature_exp = fl.FeatureNormalization()
    except Exception: normalize_feature_exp = 'NA'
    try: excludeLowExpressionExons = fl.excludeLowExpressionExons()
    except Exception: excludeLowExpressionExons = True
    try:
        useJunctionsForGeneExpression = fl.useJunctionsForGeneExpression()
        if useJunctionsForGeneExpression:
            print 'Using known junction only to estimate gene expression!!!'
    except Exception: useJunctionsForGeneExpression = False

    source_biotype = 'mRNA'
    if array_type == 'gene': source_biotype = 'gene'
    elif array_type == 'junction': source_biotype = 'junction'
    ###Get annotations using Affymetrix as a trusted source or via links to Ensembl
    if array_type == 'AltMouse':
        probeset_db,constitutive_gene_db = ExpressionBuilder.importAltMerge('full'); annotate_db={}
        source_biotype = 'AltMouse'
    elif manufacturer == 'Affymetrix' or array_type == 'RNASeq':
        ### For RNASeq the biotype argument becomes a (array_type, root_dir) tuple
        if array_type == 'RNASeq':
            source_biotype = array_type, root_dir
        probeset_db,annotate_db,constitutive_gene_db,splicing_analysis_db = ExonArrayEnsemblRules.getAnnotations(process_from_scratch,constitutive_source,source_biotype,species)
    ### NOTE(review): if neither branch runs, probeset_db/annotate_db are never
    ### assigned and the code below fails; in the AltMouse branch
    ### splicing_analysis_db is not assigned either

    ### Get all file locations and get array headers
    #print len(splicing_analysis_db),"genes included in the splicing annotation database (constitutive only containing)"
    stats_file_status = verifyFile(stats_input_dir)
    array_linker_db,array_names = importExonProbesetData(expr_input_dir,{},'arraynames')
    input_dir_split = string.split(expr_input_dir,'/')
    ### The export location is built for 'ExonArray' and then renamed per array type
    full_dataset_export_dir = root_dir+'AltExpression/FullDatasets/ExonArray/'+species+'/'+string.replace(input_dir_split[-1],'exp.','')
    if array_type == 'gene': full_dataset_export_dir = string.replace(full_dataset_export_dir,'ExonArray','GeneArray')
    if array_type == 'junction': full_dataset_export_dir = string.replace(full_dataset_export_dir,'ExonArray','JunctionArray')
    if array_type == 'AltMouse': full_dataset_export_dir = string.replace(full_dataset_export_dir,'ExonArray','AltMouse')
    if array_type == 'RNASeq': full_dataset_export_dir = string.replace(full_dataset_export_dir,'ExonArray','RNASeq')
    try: fulldataset_export_object = export.ExportFile(full_dataset_export_dir)
    except Exception:
        print 'AltAnalyze is having trouble creating the directory:\n',full_dataset_export_dir
        ### force_exception is not defined in this function; presumably evaluating it
        ### is a deliberate way to abort with a NameError after the message is printed
        print 'Report this issue to the AltAnalyze help desk or create this directory manually (Error Code X1).'; force_exception

    ### Organize arrays according to groups and export all probeset data and any pairwise comparisons
    data_type = 'expression'
    if array_type == 'RNASeq':
        expr_input_dir = string.replace(expr_input_dir,'exp.','counts.') ### Filter based on the counts file and then replace values with the normalized as the last step
    comparison_filename_list,biotypes = exportGroupedComparisonProbesetData(expr_input_dir,probeset_db,data_type,array_names,array_linker_db,perform_alt_analysis)
    ### When junctions drive gene expression and both biotypes exist, drop 'exon'
    if useJunctionsForGeneExpression:
        if 'junction' in biotypes:
            if 'exon' in biotypes: del biotypes['exon']
    if filter_by_dabg == 'yes' and stats_file_status == 'found':
        data_type = 'dabg'
        exportGroupedComparisonProbesetData(stats_input_dir,probeset_db,data_type,array_names,array_linker_db,perform_alt_analysis)

    ###Filter expression data based on DABG and annotation filtered probesets (will work without DABG filtering as well) - won't work for RNA-Seq (execute function later)
    filtered_exon_db = removeNonExpressedProbesets(probeset_db,full_dataset_export_dir)
    filterExpressionData(expr_input_dir,filtered_exon_db,constitutive_gene_db,probeset_db,'expression',array_names,perform_alt_analysis)
    constitutive_gene_db={}; probeset_gene_db = makeGeneLevelAnnotations(probeset_db)

    ### NOTE(review): the extent of this RNASeq-only section is reconstructed - confirm
    ### that all five statements belong to the if
    if array_type == 'RNASeq':
        fulldataset_export_object = export.ExportFile(full_dataset_export_dir)
        data_type = 'expression' ### Repeat with counts and then with exp. to add gene-level estimates to both
        exportGroupedComparisonProbesetData(expr_input_dir,probeset_db,data_type,array_names,array_linker_db,perform_alt_analysis)
        fulldataset_export_object = export.ExportFile(full_dataset_export_dir)
        expr_input_dir = string.replace(expr_input_dir,'counts.','exp.')
        exportGroupedComparisonProbesetData(expr_input_dir,probeset_db,data_type,array_names,array_linker_db,perform_alt_analysis)

    ### Best-effort memory cleanup; any failure here is ignored
    try: clearObjectsFromMemory(average_all_probesets); clearObjectsFromMemory(expression_summary); clearObjectsFromMemory(splicing_analysis_db)
    except Exception: null=[]
    filtered_exon_db=[]; probeset_db={}; average_all_probesets={}; expression_summary={}; splicing_analysis_db={}
    #filtered_exp_db,group_count,ranked_array_headers = filterExpressionData(expr_input_dir,filtered_exon_db,constitutive_gene_db,probeset_db)
    #filtered_gene_db = permformFtests(filtered_exp_db,group_count,probeset_db)

    ### The following string literal is disabled debugging code, kept as found
    """
    pre_filtered_db=[]
    print 'global vars'
    returnLargeGlobalVars()
    print 'local vars'
    all = [var for var in locals() if (var[:2], var[-2:]) != ("__", "__")]
    for var in all:
        try:
            if len(locals()[var])>500: print var, len(locals()[var])
        except Exception: null=[]
    """
    return probeset_gene_db, annotate_db, comparison_filename_list
grabExonIntronPromoterSequences(Species,Array_type,Data_type,Output_types) sys.exit() #""" avg_all_for_ss = 'yes' import_dir = '/AltDatabase/'+Species+ '/exon' expr_file_dir = 'ExpressionInput\exp.HEK-confluency.plier.txt' dagb_p = 0.001 f_cutoff = 2.297 exons_to_grab = "core" x = 'Affymetrix' y = 'Ensembl' z = 'default' data_source = y constitutive_source = z filename = expr_file_dir; p = dagb_p getAnnotations(expr_file_dir,dagb_p,exons_to_grab,data_source,constitutive_source,Species) global species; species = Species process_from_scratch = 'no' ###Get annotations using Affymetrix as a trusted source or via links to Ensembl if data_source == 'Affymetrix': annotation_dbases = ExonArrayAffyRules.getAnnotations(exons_to_grab,constitutive_source,process_from_scratch) probe_association_db,constitutive_gene_db,exon_location_db, trans_annotation_db, trans_annot_extended = annotation_dbases else: probeset_db,annotate_db,constitutive_gene_db,splicing_analysis_db = ExonArrayEnsemblRules.getAnnotations(process_from_scratch,constitutive_source,species,avg_all_for_ss) filterExpressionData(filename,filtered_exon_db,constitutive_gene_db,probeset_db,data_type) #filtered_gene_db = permformFtests(filtered_exp_db,group_count,probeset_db)