def geneCoordinates(species,symbols):
    """Collect Ensembl genomic coordinates for a list of gene symbols.

    Each symbol is translated to an Ensembl gene ID through
    EnsemblImport.reimportEnsemblAnnotations(species, symbolKey=True) and the
    ID is then looked up in the table returned by
    EnsemblImport.getEnsemblGeneLocations(species, 'RNASeq', 'key_by_array').
    Symbols missing from the annotation table, and genes whose chromosome
    name is longer than six characters, are reported on stdout and skipped.

    Args:
        species: species code handed to the EnsemblImport loaders.
        symbols: iterable of gene symbols to look up.

    Returns:
        A list of [chr, strand, start, end, symbol] lists, one per retained
        gene, in the order the symbols were supplied.

    Raises:
        KeyError: if a resolved Ensembl gene ID has no entry in the gene
            location table.
    """
    from build_scripts import EnsemblImport

    ensembl_annotation_db = EnsemblImport.reimportEnsemblAnnotations(species, symbolKey=True)
    genes = []
    for symbol in symbols:
        if symbol in ensembl_annotation_db:
            ens_geneid = ensembl_annotation_db[symbol]
            genes.append((ens_geneid, symbol))
        else:
            print('%s %s' % (symbol, 'not found'))

    ### Get gene genomic locations
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species, 'RNASeq', 'key_by_array')
    search_locations = []
    for (gene, symbol) in genes:
        chr, strand, start, end = gene_location_db[gene]
        if len(chr) > 6:
            ### Over-long chromosome names are reported and excluded from the search
            print('%s %s %s' % (symbol, 'bad chromosomal reference:', chr))
        else:
            search_locations.append([chr, strand, start, end, symbol])

    ### Previously the assembled list was dropped when the function ended
    return search_locations
def alignProbesetsToTranscripts(species, array_type, Analysis_type, Force, CoordinateBasedMatching=False):
    """Match exon or junction probeset sequences to Ensembl and UCSC mRNA transcripts.

    Publishes Force, Analysis_type and CoordinateBasedMatching as the
    module-level globals ``force``, ``analysis_type`` and
    ``coordinateBasedMatching``, loads the probeset data for the requested
    platform, matches it against transcripts and, for junction-type data,
    re-analyses the resulting 'mRNA_alignments' files per junction pair.

    Args:
        species: species code used to build the AltDatabase paths.
        array_type: 'AltMouse', 'junction' or 'RNASeq' (handled as junction
            data) or 'exon'.
        Analysis_type: 'reciprocal' or 'single'; only consulted for
            junction-type data.
        Force: stored in the global ``force``; not read again in this function.
        CoordinateBasedMatching: when equal to True and array_type is
            'RNASeq', junction IDs are matched to transcript exon IDs instead
            of importing transcript sequences.

    Raises:
        ValueError: if array_type is not one of the supported values, or if
            Analysis_type is not 'reciprocal'/'single' for junction-type data.
            (These cases previously failed with an UnboundLocalError.)
    """
    global force
    force = Force
    global analysis_type
    analysis_type = Analysis_type
    global coordinateBasedMatching
    coordinateBasedMatching = CoordinateBasedMatching

    if array_type == 'AltMouse' or array_type == 'junction' or array_type == 'RNASeq':
        data_type = 'junctions'
        biotype = 'gene'  ### Indicates whether to store information at the level of genes or probesets
        start_time = time.time()
        if analysis_type == 'reciprocal':
            probeset_seq_db, pairwise_probeset_combinations = importJunctionAnnotationDatabaseAndSequence(
                species, array_type, biotype)
        elif analysis_type == 'single':
            probeset_seq_db, pairwise_probeset_combinations = importAllJunctionSequences(
                species, array_type)
        else:
            raise ValueError(
                "Unsupported Analysis_type %r for array_type %r (expected 'reciprocal' or 'single')"
                % (analysis_type, array_type))
        time_diff = int(time.time() - start_time)
        print("Analyses finished in %d seconds" % time_diff)
    elif array_type == 'exon':
        data_type = 'exon'
        probeset_annotations_file = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_probesets.txt'
        ###Import probe-level associations
        exon_db = importSplicingAnnotationDatabase(probeset_annotations_file)
        start_time = time.time()
        probeset_seq_db = importProbesetSequences(exon_db, species)
        time_diff = int(time.time() - start_time)
        print("Analyses finished in %d seconds" % time_diff)
    else:
        raise ValueError(
            "Unsupported array_type %r (expected 'AltMouse', 'junction', 'RNASeq' or 'exon')"
            % (array_type,))

    ### Match probesets to mRNAs
    from build_scripts import EnsemblImport
    ### '== True' is kept on purpose: only True (or 1) enables coordinate matching, not any truthy value
    if coordinateBasedMatching == True and array_type == 'RNASeq':
        EnsemblImport.exportTranscriptExonIDAssociations(species)
        ### no sequences in probeset_seq_db, just junctionIDs
        matchTranscriptExonIDsToJunctionIDs(species, array_type, probeset_seq_db)
    else:
        importEnsemblTranscriptSequence(species, array_type, probeset_seq_db)
        try:
            mRNASeqAlign.importUCSCTranscriptSequences(species, array_type, probeset_seq_db)
        except Exception:
            ### Deliberate best-effort: if the species is not supported by UCSC the UCSC file is
            ### not written, but the other mRNA_alignments files should be available
            pass

    probeset_seq_db = {}  ### Re-set db

    ### Import results if junction array to make comparisons valid for junction-pairs rather than a single probeset
    if data_type == 'junctions':
        ### Re-import matches from above and export matching and non-matching transcripts for each probeset to a new file
        import_dir = '/AltDatabase/' + species + '/SequenceData/output'
        g = GrabFiles()
        g.setdirectory(import_dir)
        align_files = g.searchdirectory('mRNA_alignments')
        reAnalyzeRNAProbesetMatches(align_files, species, array_type, pairwise_probeset_combinations)
def getAnnotations(Species, array_type, reannotate_exon_seq, force):
    """Annotate array probesets against Ensembl data and export the result.

    Loads the Ensembl associations for the species, annotates the critical
    exon locations with ExonArrayEnsemblRules, exports the Ensembl-linked
    probesets and then renames the exported file so that its name carries
    the array type. When reannotate_exon_seq is 'yes' the critical exon
    sequences are re-annotated through JunctionArray afterwards.

    Side effects: sets the module globals species, test, test_cluster,
    ensembl_exon_db, exon_clusters and exon_region_db.
    """
    global species, test, test_cluster
    global ensembl_exon_db, exon_clusters, exon_region_db

    species = Species
    test = 'no'
    test_cluster = ['TC0701360']
    data_type = 'mRNA'

    (ensembl_exon_db, ensembl_annot_db, exon_clusters, intron_clusters,
     exon_region_db, intron_retention_db, ucsc_splicing_annot_db,
     ens_transcript_db) = EnsemblImport.getEnsemblAssociations(species, data_type, test)

    ###Get Pre-computed genomic locations for critical exons
    ensembl_probeset_db = importCriticalExonLocations(
        species, array_type, ensembl_exon_db, force)
    ensembl_probeset_db = ExonArrayEnsemblRules.annotateExons(
        ensembl_probeset_db, exon_clusters, ensembl_exon_db, exon_region_db,
        intron_retention_db, intron_clusters, ucsc_splicing_annot_db)
    ExonArrayEnsemblRules.exportEnsemblLinkedProbesets(array_type, ensembl_probeset_db, species)
    print("\nCritical exon data exported coordinates, exon associations and splicing annotations exported...")

    ### Change filenames to reflect junction array type
    probeset_dir = 'AltDatabase/' + species + '/' + array_type + '/'
    export_filename = probeset_dir + species + '_Ensembl_probesets.txt'
    export_replacement = export_filename.replace('_probe', '_' + array_type + '_probe')
    source_path = filepath(export_filename)
    target_path = filepath(export_replacement)
    ### Copy file to a new name, then drop the original
    shutil.copyfile(source_path, target_path)
    os.remove(source_path)

    ### Export full exon seqeunce for probesets/critical exons to replace the original
    ### incomplete sequence (used for miRNA analyses)
    if reannotate_exon_seq == 'yes':
        JunctionArray.reAnnotateCriticalExonSequences(species, array_type)
def importAndReformatEnsemblJunctionAnnotations(species, array_type, nonconstitutive_junctions):
    """Rewrite the array-specific Ensembl probeset file with merged junction rows.

    Reads 'AltDatabase/{species}/{array_type}/{species}_Ensembl_{array_type}_probesets.txt'
    and writes 'AltDatabase/{species}/{array_type}/{species}_Ensembl_probesets.txt'.
    The header line is copied through. Rows whose probeset name contains
    '|5' or '|3' are treated as the left/right halves of a junction: they are
    held back and, when both halves exist and lie in different exon blocks,
    merged into a single exported row. All other rows are exported directly
    with the 'cluster:' prefix removed from the probeset ID.

    Args:
        species: species code used to build the file paths.
        array_type: array type used to build the file paths.
        nonconstitutive_junctions: container supporting ``in``; junctions
            found in it are marked constitutive 'no'.

    Returns:
        (ensembl_exon_db, exon_gene_db) where ensembl_exon_db maps probeset
        to its EnsemblImport.ExonStructureData object and exon_gene_db maps
        (ensembl_gene_id, regionid) to the probeset for non-junction rows.

    Raises:
        ValueError: if a data row does not have exactly 17 tab-separated
            fields, or a probeset name does not contain exactly one ':'.
    """
    filename = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_' + array_type + '_probesets.txt'
    export_filepath = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_probesets.txt'
    efn = filepath(export_filepath)
    export_data = open(efn, 'w')
    ### try/finally guarantees both handles are released even if a row fails to parse
    try:
        fn = filepath(filename)
        ensembl_exon_db = {}
        left = {}
        right = {}
        exon_gene_db = {}
        nonjunction_aligning = {}
        import_data = open(fn, 'rU')
        try:
            header_written = False
            for line in import_data:
                data = cleanUpLine(line)
                if not header_written:
                    header_written = True
                    export_data.write(data + '\n')
                    continue
                t = data.split('\t')
                (probeset, exon_id, ensembl_gene_id, transcript_cluster_id, chr, strand,
                 probeset_start, probeset_stop, affy_class, constitutitive_probeset,
                 ens_exon_ids, exon_annotations, regionid, r_start, r_stop,
                 splice_event, splice_junctions) = t
                if len(regionid) < 1:
                    ### No region ID supplied: fall back to the exon ID (also in the exported row)
                    regionid = exon_id
                    t[12] = exon_id
                ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
                if chr == 'chrM':
                    chr = 'chrMT'
                if chr == 'M':
                    chr = 'MT'
                tc, probeset = probeset.split(':')
                regionid = regionid.replace('-', '.')
                original_region_id = regionid
                r_starts = r_start.split('|')
                r_stops = r_stop.split('|')
                ed = EnsemblImport.ExonStructureData(
                    ensembl_gene_id, chr, strand, probeset_start, probeset_stop,
                    constitutitive_probeset, ens_exon_ids, [])
                ed.reSetExonID(regionid)
                if '|5' in probeset:
                    left[probeset[:-2]] = ed, t
                    ### If the junction probesets DO NOT align to the region coordinates, then
                    ### the probeset maps to a junction outside the database
                    if strand == '+':
                        if probeset_stop not in r_stops:
                            nonjunction_aligning[probeset[:-2]] = original_region_id + '_' + probeset_stop, 'left'
                    elif probeset_start not in r_starts:
                        nonjunction_aligning[probeset[:-2]] = original_region_id + '_' + probeset_start, 'left'
                elif '|3' in probeset:
                    right[probeset[:-2]] = ed, t
                    if strand == '+':
                        if probeset_start not in r_starts:
                            nonjunction_aligning[probeset[:-2]] = original_region_id + '_' + probeset_start, 'right'
                    elif probeset_stop not in r_stops:
                        nonjunction_aligning[probeset[:-2]] = original_region_id + '_' + probeset_stop, 'right'
                else:
                    ### Non-junction row: export immediately under the prefix-free probeset ID
                    t[0] = probeset
                    ensembl_exon_db[probeset] = ed
                    export_data.write('\t'.join(t) + '\n')
                    regionids = regionid.split('|')
                    for regionid in regionids:
                        exon_gene_db[ensembl_gene_id, regionid] = probeset
        finally:
            import_data.close()

        for probeset in left:
            if probeset in right:
                l, pl = left[probeset]
                r, pr = right[probeset]
                if l.Constitutive() != r.Constitutive():
                    l.setConstitutive('no')  ### used to determine if a junciton is alternative or constitutive
                if probeset in nonconstitutive_junctions:
                    l.setConstitutive('no')
                l.setJunctionCoordinates(l.ExonStart(), l.ExonStop(), r.ExonStart(), r.ExonStop())
                ens_exon_idsl = pl[10]
                ens_exon_idsr = pr[10]
                exon_idl = pl[1]
                exon_idr = pr[1]
                regionidl = pl[12]
                regionidr = pr[12]
                splice_junctionsl = pl[-1]
                splice_junctionsr = pr[-1]
                exon_idl = exon_idl.replace('-', '.')
                exon_idr = exon_idr.replace('-', '.')
                regionidl_block = regionidl.split('-')[0]
                regionidr_block = regionidr.split('-')[0]

                if regionidl_block != regionidr_block:
                    ### Otherwise, the junction is probing a single exon block and thus is not informative
                    regionidl = regionidl.replace('-', '.')
                    regionidr = regionidr.replace('-', '.')
                    exon_id = exon_idl + '-' + exon_idr
                    regionid = regionidl + '-' + regionidr

                    if probeset in nonjunction_aligning:
                        new_region_id, side = nonjunction_aligning[probeset]
                        regionid = renameJunction(regionid, side, new_region_id)

                    l.reSetExonID(regionid)
                    ensembl_exon_db[probeset] = l

                    splice_junctionsl += splice_junctionsr
                    ens_exon_idsl = ens_exon_idsl.split('|')
                    ens_exon_idsr = ens_exon_idsr.split('|')
                    ens_exon_ids = '|'.join(unique.unique(ens_exon_idsl + ens_exon_idsr))
                    pl[10] = ens_exon_ids
                    pl[12] = regionid
                    pl[1] = exon_id
                    pl[-1] = splice_junctionsl
                    pl[13] = l.ExonStart() + '|' + l.ExonStop()
                    pl[14] = r.ExonStart() + '|' + r.ExonStop()
                    strand = pl[5]
                    ### juncstion splice-sites
                    if strand == '+':
                        pl[6] = l.ExonStop()
                        pl[7] = r.ExonStart()
                    else:
                        pl[6] = l.ExonStart()
                        pl[7] = r.ExonStop()

                    pl[0] = probeset
                    pl[9] = l.Constitutive()

                    pl = '\t'.join(pl) + '\n'
                    export_data.write(pl)
    finally:
        export_data.close()
    return ensembl_exon_db, exon_gene_db
def findSpeciesInUniProtFiles(force):
    """Record which UniProt taxonomic-division file(s) contain each species.

    Parses every file in the 'uniprot_temp' directory, collecting for each
    two-word species name the (taxid, ftp URL) pairs of the files it occurs
    in, and writes one 'species<TAB>taxid<TAB>url' line per species to
    'Config/uniprot-species-file.txt'. Species names come from the 'OS'
    lines, unless the 'OX' taxonomy ID is present in the GO-Elite species
    table, in which case that table's name is used.

    Args:
        force: when 'yes', the '_sprot_' files are first downloaded from
            ftp.expasy.org into 'uniprot_temp/'.
    """
    import AltAnalyze
    ###Get species annotations from the GO-Elite config
    species_annot_db = AltAnalyze.importGOEliteSpeciesInfo()
    tax_db = {}
    for species_full in species_annot_db:
        taxid = species_annot_db[species_full].TaxID()
        tax_db[taxid] = species_full

    if force == 'yes':
        ### Should only need to be run if UniProt changes it's species to file associations or
        ### new species supported by Ensembl
        import export
        import update
        filesearch = '_sprot_'
        all_swissprot = update.getFTPData(
            'ftp.expasy.org',
            '/databases/uniprot/current_release/knowledgebase/taxonomic_divisions',
            filesearch)
        for ftp_file in all_swissprot:
            gz_filepath, status = update.download(ftp_file, 'uniprot_temp/', '')
            if status == 'not-removed':
                try:
                    os.remove(gz_filepath)  ### Not sure why this works now and not before
                except OSError:
                    pass  ### Best-effort clean-up of the downloaded archive

    species_uniprot_db = {}
    ### Reset the parser state: taxid/species_full were last bound by the tax_db loop above and
    ### must not leak into the first parsed record (they would be unbound for an empty table)
    taxid = ''
    species_full = ''
    for filename in read_directory('/uniprot_temp'):
        fn = filepath('uniprot_temp/' + filename)
        uniprot_file = open(fn, 'r')
        try:
            for line in uniprot_file:
                data = cleanUpLine(line)
                line_code = data[0:2]
                if line_code == 'OX':
                    taxid = data.split('=')[1][:-1]
                    if taxid in tax_db:
                        species_full = tax_db[taxid]
                elif line_code == 'OS':
                    species = data[5:]
                    species = species.split(' ')[:2]
                    species_full = ' '.join(species)
                elif data[0:1] == '/':
                    ### Slicing (rather than data[0]) keeps empty lines from raising IndexError
                    url = 'ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/' + filename
                    ss = species_full.split(' ')
                    if len(ss) == 2:
                        ### Species name is in the two-word 'Genus species' format
                        species_uniprot_db.setdefault(species_full, []).append(
                            (taxid, 'ftp://' + url + '.gz'))
                    taxid = ''
                    species_full = ''
        finally:
            uniprot_file.close()

    from build_scripts import EnsemblImport
    species_uniprot_db = EnsemblImport.eliminate_redundant_dict_values(species_uniprot_db)

    ### Export all species to UniProt file relationships so this function needs to only be run once
    import export
    up = export.ExportFile('Config/uniprot-species-file.txt')
    try:
        for species_full in species_uniprot_db:
            values = species_uniprot_db[species_full]
            if len(values) > 1:
                for (taxid, url) in values:
                    if taxid in tax_db and species_full == tax_db[taxid]:
                        print('%s %s' % ('ambiguity resolved:', species_full))
                        break
                ### When nothing matches, the last (taxid, url) pair is exported, as before
            else:
                (taxid, url) = values[0]
            up.write('\t'.join([species_full, taxid, url]) + '\n')
    finally:
        up.close()
def reformatPolyAdenylationCoordinates(species, force):
    """Convert the polyADB_2 flat-file polyA site annotations to BED format.

    PolyA annotations are currently only available from UCSC for human, but
    flat file annotations from 2003-2006 are available for multiple species.
    Rows of 'polyAsite.txt' whose site ID contains the species code are
    written to 'AltDatabase/ucsc/{species}/{species}_polyADB_2_predictions.bed'.
    When the site's gene maps to an Ensembl gene, the Ensembl chromosome and
    strand are used and the Ensembl ID is appended to the BED name; otherwise
    the strand is unknown and the site is written once per strand.

    Args:
        species: species code; must be one of 'Rn', 'Dr', 'Gg', 'Hs', 'Mm'.
        force: when 'yes', 'polyAsite.txt' is downloaded first; otherwise the
            copy already in the output directory is used.

    Raises:
        KeyError: if species has no entry in the version table, or a mapped
            Ensembl gene has no entry in the gene location table.
    """
    version = {
        'Rn': '2003(rn3)',
        'Dr': '2003(zv4)',
        'Gg': '2004(galGal2)',
        'Hs': '2006(hg8)',
        'Mm': '2004(mm5)',
    }

    print('%s %s' % ('Exporting polyADB_2 coordinates as BED for', species))
    ### Obtain the necessary database files
    url = 'http://altanalyze.org/archiveDBs/all/polyAsite.txt'
    output_dir = 'AltDatabase/ucsc/' + species + '/'
    if force == 'yes':
        filename, status = update.download(url, output_dir, '')
    else:
        filename = output_dir + 'polyAsite.txt'

    ### Import the refseq to Ensembl information
    import gene_associations
    from import_scripts import OBO_import
    from build_scripts import EnsemblImport
    import export
    try:
        ens_unigene = gene_associations.getGeneToUid(species, 'Ensembl-UniGene')
        print('%s %s' % (len(ens_unigene), 'Ensembl-UniGene entries imported'))
        external_ensembl = OBO_import.swapKeyValues(ens_unigene)
        use_entrez = 'no'
    except Exception:
        ### Fall back to EntrezGene when the UniGene relationships cannot be loaded
        ens_entrez = gene_associations.getGeneToUid(species, 'Ensembl-EntrezGene')
        print('%s %s' % (len(ens_entrez), 'Ensembl-EntrezGene entries imported'))
        external_ensembl = OBO_import.swapKeyValues(ens_entrez)
        use_entrez = 'yes'
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species, 'RNASeq', 'key_by_array')

    export_bedfile = output_dir + species + '_polyADB_2_predictions.bed'
    print('%s %s' % ('exporting', export_bedfile))
    export_data = export.ExportFile(export_bedfile)
    ### try/finally guarantees both handles are released even if a lookup or row parse fails
    try:
        header = '#' + species + '\t' + 'polyADB_2' + '\t' + version[species] + '\n'
        export_data.write(header)

        fn = filepath(filename)
        import_data = open(fn, 'rU')
        try:
            header_skipped = False
            for line in import_data:
                data = cleanUpLine(line)
                if not header_skipped:
                    header_skipped = True
                    continue
                siteid, llid, chr, sitenum, position, supporting_EST, cleavage = data.split('\t')
                ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
                if chr == 'chrM':
                    chr = 'chrMT'
                if chr == 'M':
                    chr = 'MT'
                if species not in siteid:
                    continue
                if 'NA' not in chr:
                    chr = 'chr' + chr
                strand = '+'
                geneid = siteid
                pos_start = str(int(position) - 1)
                pos_end = position
                if use_entrez == 'no':
                    external_geneid = '.'.join(siteid.split('.')[:2])
                else:
                    external_geneid = llid
                if external_geneid in external_ensembl:
                    ens_geneid = external_ensembl[external_geneid][0]
                    geneid += '-' + ens_geneid
                    chr, strand, _gene_start, _gene_end = gene_location_db[ens_geneid]
                else:
                    ### We don't know the strand, so write out both strands
                    ### ('-' here, the default '+' below)
                    bed_format = '\t'.join([chr, pos_start, pos_end, geneid, '0', '-']) + '\n'
                    export_data.write(bed_format)
                bed_format = '\t'.join([chr, pos_start, pos_end, geneid, '0', strand]) + '\n'
                export_data.write(bed_format)
        finally:
            import_data.close()
    finally:
        export_data.close()