def getAnnotations(Species, array_type, reannotate_exon_seq, force): """Annotate Affymetrix exon array data using files Ensembl data (sync'ed to genome release).""" global species species = Species global test global test_cluster test = 'no' test_cluster = ['TC0701360'] data_type = 'mRNA' global ensembl_exon_db global ensembl_exon_db global exon_clusters global exon_region_db ensembl_exon_db, ensembl_annot_db, exon_clusters, intron_clusters, exon_region_db, intron_retention_db, ucsc_splicing_annot_db, ens_transcript_db = EnsemblImport.getEnsemblAssociations( species, data_type, test) ensembl_probeset_db = importCriticalExonLocations( species, array_type, ensembl_exon_db, force) ###Get Pre-computed genomic locations for critical exons ensembl_probeset_db = ExonArrayEnsemblRules.annotateExons( ensembl_probeset_db, exon_clusters, ensembl_exon_db, exon_region_db, intron_retention_db, intron_clusters, ucsc_splicing_annot_db) constitutive_gene_db = {} ExonArrayEnsemblRules.exportEnsemblLinkedProbesets(array_type, ensembl_probeset_db, species) print "\nCritical exon data exported coordinates, exon associations and splicing annotations exported..." ### Change filenames to reflect junction array type export_filename = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_probesets.txt' ef = filepath(export_filename) export_replacement = string.replace(export_filename, '_probe', '_' + array_type + '_probe') er = filepath(export_replacement) shutil.copyfile(ef, er) os.remove(ef) ### Copy file to a new name ### Export full exon seqeunce for probesets/critical exons to replace the original incomplete sequence (used for miRNA analyses) if reannotate_exon_seq == 'yes': JunctionArray.reAnnotateCriticalExonSequences(species, array_type)
def buildExonArrayExonAnnotations(species, array_type, force):
    """Build the UCSC annotation files, then the Ensembl probeset annotations.

    species -- species code handed to both build steps.
    array_type -- only inspected to choose the source biotype: 'gene' selects
        'gene', any other value selects 'mRNA'.
    force -- passed through unchanged to ``buildUCSCAnnoationFiles``.

    Returns None; the databases returned by
    ``ExonArrayEnsemblRules.getAnnotations`` are not used here.
    """
    ### Get UCSC associations (download databases if necessary)
    ucsc_mrna_type = 'mrna'
    rebuild_ucsc = 'yes'
    export_everything = 'no'  ### YES only for protein prediction analysis
    buildUCSCAnnoationFiles(
        species, ucsc_mrna_type, export_everything, rebuild_ucsc, force)

    from build_scripts import ExonArrayEnsemblRules
    reload(ExonArrayEnsemblRules)

    if array_type == 'gene':
        biotype = 'gene'
    else:
        biotype = 'mRNA'

    ### Build the databases and return the variables (not used here)
    built_dbs = ExonArrayEnsemblRules.getAnnotations(
        'yes', 'default', biotype, species)
    probeset_db, annotate_db, constitutive_gene_db, splicing_analysis_db = built_dbs
def displayExpressionGraph(species, Platform, exp_file, gene, transpose, display=True, showIntrons=False, analysisType='plot'): ### Get gene annotations (users can provide an Ensembl or symbol) print 'Importing exon-level expression data for visualization (be patient)...' from build_scripts import ExonAnalyze_module global platform platform = Platform if platform != 'AltMouse': gene_annotation_file = "AltDatabase/ensembl/" + species + "/" + species + "_Ensembl-annotations.txt" else: gene_annotation_file = "AltDatabase/" + species + "/" + platform + "/" + platform + "_gene_annotations.txt" genes = [] gene = string.replace(gene, '|', ',') gene = string.replace(gene, ' ', ',') if ',' in gene: genes += string.split(gene, ',') else: genes.append(gene) gene_db = {} for gene in genes: try: if 'ENS' in gene: try: annotate_db ### If variable is defined except Exception: annotate_db = ExonAnalyze_module.import_annotations( gene_annotation_file, platform, keyBySymbol=False) ### Make an SQLite call gene_symbol = annotate_db[gene].Symbol() else: try: annotate_db ### If variable is defined except Exception: annotate_db = ExonAnalyze_module.import_annotations( gene_annotation_file, platform, keyBySymbol=True) gene_symbol = gene gene = annotate_db[gene].GeneID() gene_db[gene] = gene_symbol except Exception: #if len(gene)>0: print gene, 'not in database' pass if len(gene_db) == 0: force_no_gene_found_error if 'AltResults' in exp_file: root_dir = string.split(exp_file, 'AltResults')[0] + 'ExonPlots/' else: root_dir = string.split(exp_file, 'ExpressionInput')[0] + 'ExonPlots/' from build_scripts import ExonAnalyze_module if platform == 'RNASeq': datatype = 'exons' else: datatype = 'probesets' export_exon_filename = 'AltDatabase/' + species + '/' + platform + '/' + species + '_Ensembl_' + datatype + '.txt' if verifyFileLength(export_exon_filename) == 0: rootdir = string.replace(root_dir, 'ExonPlots/', '') export_exon_filename = rootdir + '/' + export_exon_filename from build_scripts 
import ExonArrayEnsemblRules ensembl_exon_db = ExonArrayEnsemblRules.reimportEnsemblProbesetsForSeqExtraction( export_exon_filename, 'gene-probesets', gene_db) ### Make an SQLite call filter_db = {} for gene in ensembl_exon_db: ensembl_exon_db[gene].sort() for (index, ed, id) in ensembl_exon_db[gene]: filter_db[id] = [] try: os.mkdir(root_dir) except Exception: None ### dir exists print 'Image results being saved to the folder "ExonPlots" in the AltAnalyze results directory.' importTableEntries(exp_file, filter_db, ensembl_exon_db, gene_db, root_dir, transpose, display, showIntrons, analysisType=analysisType) ### Make an SQLite call
def getJunctionComparisonsFromExport(species, array_type): type = 'standard' gene_junction_db = importEnsemblUCSCAltJunctions(species, type) ### Retrieve probesets with exon-junctions associated - these are critical exons filename = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_' + array_type + '_probesets.txt' gene_probeset_db = ExonArrayEnsemblRules.reimportEnsemblProbesetsForSeqExtraction( filename, 'junctions', {}) left = {} right = {} gene_db = {} gene_exon_db = {} nonjunction_aligning = {} for gene in gene_probeset_db: for (probe_data, ed) in gene_probeset_db[gene]: probeset, strand, probeset_start, probeset_stop = probe_data region_id = string.replace(ed.RegionNumber(), '-', '.') original_region_id = region_id region_ids = string.split(region_id, '|') gene_db[probeset[:-2]] = gene #ed.AssociatedSplicingJunctions() r_starts = string.split(ed.ExonStart(), '|') r_stops = string.split(ed.ExonStop(), '|') for region_id in region_ids: if '|5' in probeset: try: left[probeset[:-2]].append(region_id) except Exception: left[probeset[:-2]] = [region_id] if strand == '+': ### If the junction probesets DO NOT align to the region coordinates, then the probeset maps to a junction outside the database if probeset_stop not in r_stops: nonjunction_aligning[ probeset[: -2]] = original_region_id + '_' + probeset_stop, 'left' elif probeset_start not in r_starts: nonjunction_aligning[ probeset[: -2]] = original_region_id + '_' + probeset_start, 'left' elif '|3' in probeset: try: right[probeset[:-2]].append(region_id) except Exception: right[probeset[:-2]] = [region_id] if strand == '+': if probeset_start not in r_starts: nonjunction_aligning[ probeset[: -2]] = original_region_id + '_' + probeset_start, 'right' elif probeset_stop not in r_stops: nonjunction_aligning[ probeset[: -2]] = original_region_id + '_' + probeset_stop, 'right' else: if '_' in region_id: print killer try: gene_exon_db[gene, region_id].append(probeset) except Exception: 
gene_exon_db[gene, region_id] = [probeset] print 'len(nonjunction_aligning)', len(nonjunction_aligning) gene_exon_db = eliminateRedundant(gene_exon_db) junction_db = {} ### Get the exon-region IDs for an exon-junction for probeset in left: gene = gene_db[probeset] if probeset in right: for region1 in left[probeset]: for region2 in right[probeset]: junction = region1 + '-' + region2 try: junction_db[gene, junction].append(probeset) except Exception: junction_db[gene, junction] = [probeset] probeset_junction_export = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_junction_comps.txt' fn = filepath(probeset_junction_export) data = open(fn, 'w') print "Exporting", probeset_junction_export title = 'gene' + '\t' + 'critical_exon' + '\t' + 'exclusion_junction_region' + '\t' + 'inclusion_junction_region' + '\t' + 'exclusion_probeset' + '\t' + 'inclusion_probeset' + '\t' + 'data_source' + '\n' data.write(title) temp_list = [] for (gene, critical_exon, incl_junction, excl_junction) in gene_junction_db: if (gene, incl_junction) in junction_db: incl_junction_probesets = junction_db[gene, incl_junction] if (gene, excl_junction) in junction_db: excl_junction_probesets = junction_db[gene, excl_junction] for incl_junction_probeset in incl_junction_probesets: for excl_junction_probeset in excl_junction_probesets: try: for incl_exon_probeset in gene_exon_db[ gene, critical_exon]: if incl_junction_probeset in nonjunction_aligning or excl_junction_probeset in nonjunction_aligning: null = [] else: ### Ensure the probeset DOES map to the annotated junctions temp_list.append( string.join([ gene, critical_exon, excl_junction, critical_exon, excl_junction_probeset, incl_exon_probeset, 'AltAnalyze' ], '\t') + '\n') except Exception: null = [] if incl_junction_probeset in nonjunction_aligning: new_region_id, side = nonjunction_aligning[ incl_junction_probeset] incl_junction = renameJunction( incl_junction, side, new_region_id) if excl_junction_probeset in 
nonjunction_aligning: new_region_id, side = nonjunction_aligning[ excl_junction_probeset] excl_junction = renameJunction( excl_junction, side, new_region_id) if excl_junction_probeset != incl_junction_probeset: temp_list.append( string.join([ gene, critical_exon, excl_junction, incl_junction, excl_junction_probeset, incl_junction_probeset, 'AltAnalyze' ], '\t') + '\n') temp_list = unique.unique(temp_list) for i in temp_list: data.write(i) data.close() print 'Number of compared junctions exported', len(temp_list)