def geneCoordinates(species, symbols):
    """Collect Ensembl genomic coordinates for a list of gene symbols.

    Each symbol is translated to an Ensembl gene ID with
    EnsemblImport.reimportEnsemblAnnotations(species, symbolKey=True) and the
    gene's (chr, strand, start, end) entry is then read from
    EnsemblImport.getEnsemblGeneLocations(species, 'RNASeq', 'key_by_array').

    Symbols that have no annotation entry are reported and skipped rather
    than aborting the whole lookup with a KeyError. Genes whose chromosome
    name is longer than six characters are reported and left out of the
    collected locations.

    Args:
        species: species code handed to the EnsemblImport loaders.
        symbols: iterable of gene symbols to look up.

    NOTE(review): search_locations is built but never returned by the code
    visible here - confirm whether callers expect it back.
    """
    import EnsemblImport

    ensembl_annotation_db = EnsemblImport.reimportEnsemblAnnotations(species, symbolKey=True)
    genes = []
    for symbol in symbols:
        if symbol in ensembl_annotation_db:
            genes.append((ensembl_annotation_db[symbol], symbol))
        else:
            # An unknown symbol used to raise KeyError on the first miss
            print('%s not found' % (symbol,))

    ### Get gene genomic locations
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species, 'RNASeq', 'key_by_array')
    search_locations = []
    for (gene, symbol) in genes:
        chrom, strand, start, end = gene_location_db[gene]
        if symbol == 'SRSF10':
            # NOTE(review): hard-coded coordinates applied for every species;
            # presumably a workaround for a bad Ensembl location - confirm it
            # is still needed.
            chrom = 'chr1'
            strand = '-'
            start = '24295573'
            end = '24306953'
        if len(chrom) > 6:
            print('%s bad chromosomal reference: %s' % (symbol, chrom))
        else:
            search_locations.append([chrom, strand, start, end, symbol])
def exportChromosomeStrandCoordinates(species):
    """Export a sorted, tab-delimited table of gene chromosome locations.

    Loads the Ensembl gene locations, transcript biotype annotations and
    gene annotations for ``species`` and writes one row per gene
    (gene, symbol, chr, strand, start, end, biotype) to
    'GenMAPPDBs/' + species + '/chr_gene_locations.txt'. Only genes whose
    chromosome name is shorter than seven characters are written; rows are
    ordered by chromosome, strand, start and end. Summary counts are printed
    while the table is built.
    """
    import EnsemblImport
    locations = EnsemblImport.getEnsemblGeneLocations(species, 'RNASeq', 'key_by_array')
    import ExpressionBuilder
    biotypes = ExpressionBuilder.importTranscriptBiotypeAnnotations(species)

    export_path = 'GenMAPPDBs/' + species + '/chr_gene_locations.txt'
    out = export.ExportFile(export_path)

    import ExonAnalyze_module
    annotation_file = "AltDatabase/ensembl/" + species + "/" + species + "_Ensembl-annotations.txt"
    annotations = ExonAnalyze_module.import_annotations(annotation_file, 'RNASeq')
    print('Annotations for %s genes imported' % len(locations))

    rows = []
    coding_total = 0
    for gene_id in locations:
        chrom, strand, start, end = locations[gene_id]
        # Genes without a biotype record are labelled 'NA'
        biotype = 'NA'
        if gene_id in biotypes:
            biotype = biotypes[gene_id][-1]
            if biotype == 'protein_coding':
                coding_total += 1
        if len(chrom) < 7:
            # Integer coordinates so the sort below orders positions numerically
            rows.append([chrom, strand, int(start), int(end), gene_id, biotype])

    print('%s genes for typical chromosomes present' % len(rows))
    print('%s protein coding genes present' % coding_total)

    for chrom, strand, start, end, gene_id, biotype in sorted(rows):
        try:
            symbol = annotations[gene_id].Symbol()
        except Exception:
            # Best effort: genes without a usable annotation get an empty symbol
            symbol = ''
        fields = [gene_id, symbol, chrom, strand, str(start), str(end), biotype]
        out.write('\t'.join(fields) + '\n')
    out.close()
    print('%s chromosome locations exported to:\n%s' % (species, export_path))
def geneCoordinates(species, symbols):
    """Gather Ensembl genomic coordinates for the requested gene symbols.

    Symbols are mapped to Ensembl gene IDs through
    EnsemblImport.reimportEnsemblAnnotations(species, symbolKey=True);
    symbols missing from that mapping are reported and skipped. The location
    of every matched gene is read from
    EnsemblImport.getEnsemblGeneLocations(species, 'RNASeq', 'key_by_array'),
    and genes whose chromosome name is longer than six characters are
    reported instead of being collected.

    Args:
        species: species code handed to the EnsemblImport loaders.
        symbols: iterable of gene symbols to look up.

    NOTE(review): the collected list is not returned by the code visible
    here - confirm against callers.
    """
    import EnsemblImport

    symbol_to_gene = EnsemblImport.reimportEnsemblAnnotations(species, symbolKey=True)
    matched = []
    for symbol in symbols:
        if symbol not in symbol_to_gene:
            print('%s not found' % (symbol,))
            continue
        matched.append((symbol_to_gene[symbol], symbol))

    ### Get gene genomic locations
    locations = EnsemblImport.getEnsemblGeneLocations(species, 'RNASeq', 'key_by_array')
    search_locations = []
    for gene_id, symbol in matched:
        chrom, strand, start, end = locations[gene_id]
        if len(chrom) <= 6:
            search_locations.append([chrom, strand, start, end, symbol])
        else:
            print('%s bad chromosomal reference: %s' % (symbol, chrom))
def exportChromosomeStrandCoordinates(species):
    """Write gene, symbol, chromosome, strand, start, end and biotype per gene.

    The output file is 'GenMAPPDBs/' + species + '/chr_gene_locations.txt'.
    Gene locations come from EnsemblImport.getEnsemblGeneLocations, biotypes
    from ExpressionBuilder.importTranscriptBiotypeAnnotations and symbols
    from ExonAnalyze_module.import_annotations. Genes on chromosomes whose
    name has seven or more characters are omitted, and the remaining rows are
    written in sorted order. Counts of imported genes, retained genes and
    protein coding genes are printed.
    """
    import EnsemblImport
    gene_locations = EnsemblImport.getEnsemblGeneLocations(species, 'RNASeq', 'key_by_array')
    import ExpressionBuilder
    gene_biotypes = ExpressionBuilder.importTranscriptBiotypeAnnotations(species)
    export_path = 'GenMAPPDBs/' + species + '/chr_gene_locations.txt'
    writer = export.ExportFile(export_path)
    import ExonAnalyze_module
    gene_annotation_file = "AltDatabase/ensembl/" + species + "/" + species + "_Ensembl-annotations.txt"
    gene_annotations = ExonAnalyze_module.import_annotations(gene_annotation_file, 'RNASeq')

    def symbol_for(gene):
        # Best effort lookup: any failure yields an empty symbol
        try:
            return gene_annotations[gene].Symbol()
        except Exception:
            return ''

    print('Annotations for %s genes imported' % len(gene_locations))

    records = []
    protein_coding_count = 0
    for gene in gene_locations:
        chrom, strand, start, end = gene_locations[gene]
        biotype = gene_biotypes[gene][-1] if gene in gene_biotypes else 'NA'
        if biotype == 'protein_coding':
            protein_coding_count += 1
        if len(chrom) >= 7:
            # Skip atypical chromosome names
            continue
        records.append([chrom, strand, int(start), int(end), gene, biotype])

    print('%s genes for typical chromosomes present' % len(records))
    print('%s protein coding genes present' % protein_coding_count)

    records.sort()
    for record in records:
        chrom, strand, start, end, gene, biotype = record
        line = '\t'.join([gene, symbol_for(gene), chrom, strand, str(start), str(end), biotype])
        writer.write(line + '\n')
    writer.close()
    print('%s chromosome locations exported to:\n%s' % (species, export_path))
def _polyadb_chromosome(chrom):
    """Return the Ensembl-style, 'chr'-prefixed form of a polyA_DB chromosome name."""
    ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
    if chrom == 'chrM':
        chrom = 'chrMT'
    if chrom == 'M':
        chrom = 'MT'
    # Add the prefix only when it is missing: prepending unconditionally
    # turned an already prefixed value such as 'chrMT' into 'chrchrMT'.
    if 'NA' not in chrom and not chrom.startswith('chr'):
        chrom = 'chr' + chrom
    return chrom


def reformatPolyAdenylationCoordinates(species, force):
    """Convert the polyA_DB 2 flat file into a BED file for one species.

    PolyA annotations are currently only available from UCSC for human, but
    flat file annotations from 2003-2006 are available for multiple species.
    Rows of polyAsite.txt whose site ID contains ``species`` are written to
    'AltDatabase/ucsc/' + species + '/' + species + '_polyADB_2_predictions.bed'.
    When the site's UniGene ID (or EntrezGene ID if the UniGene table cannot
    be loaded) maps to an Ensembl gene, the gene ID is appended to the BED
    name and the gene's chromosome and strand replace the values from the
    flat file.

    Args:
        species: species code; must be one of 'Rn', 'Dr', 'Gg', 'Hs', 'Mm'.
        force: 'yes' downloads polyAsite.txt through update.download first;
            any other value reads the copy already in the output directory.

    Raises:
        KeyError: if ``species`` has no polyA_DB version entry, or a mapped
            Ensembl gene is missing from the gene location table.
        ValueError: if a data row does not have exactly seven tab-separated
            fields or its position is not an integer.
    """
    version = {
        'Rn': '2003(rn3)',
        'Dr': '2003(zv4)',
        'Gg': '2004(galGal2)',
        'Hs': '2006(hg8)',
        'Mm': '2004(mm5)',
    }

    print('Exporting polyADB_2 coordinates as BED for %s' % (species,))
    ### Obtain the necessary database files
    url = 'http://altanalyze.org/archiveDBs/all/polyAsite.txt'
    output_dir = 'AltDatabase/ucsc/' + species + '/'
    if force == 'yes':
        filename, _status = update.download(url, output_dir, '')
    else:
        filename = output_dir + 'polyAsite.txt'

    ### Import the external gene ID to Ensembl information
    import gene_associations
    import OBO_import
    import EnsemblImport
    import export
    try:
        ens_unigene = gene_associations.getGeneToUid(species, 'Ensembl-UniGene')
        print('%s Ensembl-UniGene entries imported' % len(ens_unigene))
        external_ensembl = OBO_import.swapKeyValues(ens_unigene)
        use_entrez = 'no'
    except Exception:
        # Fall back to EntrezGene when the UniGene relationships are unavailable
        ens_entrez = gene_associations.getGeneToUid(species, 'Ensembl-EntrezGene')
        print('%s Ensembl-EntrezGene entries imported' % len(ens_entrez))
        external_ensembl = OBO_import.swapKeyValues(ens_entrez)
        use_entrez = 'yes'
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species, 'RNASeq', 'key_by_array')

    export_bedfile = output_dir + species + '_polyADB_2_predictions.bed'
    print('exporting %s' % (export_bedfile,))
    export_data = export.ExportFile(export_bedfile)
    try:
        header = '#' + species + '\t' + 'polyADB_2' + '\t' + version[species] + '\n'
        export_data.write(header)

        # 'rU' keeps the original universal-newline reading; the with-block
        # closes the input file, which was previously left to the garbage collector.
        with open(filepath(filename), 'rU') as site_file:
            for line_number, line in enumerate(site_file):
                if line_number == 0:
                    continue  # first line is the column header
                data = cleanUpLine(line)
                (siteid, llid, chrom, _sitenum, position,
                 _supporting_est, _cleavage) = data.split('\t')
                if species not in siteid:
                    continue
                chrom = _polyadb_chromosome(chrom)
                strand = '+'
                geneid = siteid
                # BED start is zero-based, so it is one less than the site position
                pos_start = str(int(position) - 1)
                pos_end = position
                if use_entrez == 'no':
                    # The first two dot-separated parts of the site ID are
                    # used as the UniGene key
                    external_geneid = '.'.join(siteid.split('.')[:2])
                else:
                    external_geneid = llid
                if external_geneid in external_ensembl:
                    ens_geneid = external_ensembl[external_geneid][0]
                    geneid += '-' + ens_geneid
                    chrom, strand = gene_location_db[ens_geneid][:2]
                ### We don't know the strand, so write out both strands
                # NOTE(review): when the mapped gene is on '-', both rows carry
                # '-' and no '+' row is written - confirm this is intended.
                for bed_strand in ('-', strand):
                    export_data.write(
                        '\t'.join([chrom, pos_start, pos_end, geneid, '0', bed_strand]) + '\n')
    finally:
        # Close the export handle even if a row fails to parse
        export_data.close()
def reformatPolyAdenylationCoordinates(species, force):
    """Export polyA_DB 2 cleavage sites for ``species`` in BED format.

    PolyA annotations are currently only available from UCSC for human, but
    flat file annotations from 2003-2006 are available for multiple species.
    This reads polyAsite.txt (downloading it first when ``force`` is 'yes'),
    keeps the rows whose site ID contains ``species`` and writes two BED rows
    per site to
    'AltDatabase/ucsc/' + species + '/' + species + '_polyADB_2_predictions.bed'.
    Sites whose UniGene ID (EntrezGene ID when the UniGene table cannot be
    loaded) maps to an Ensembl gene take that gene's chromosome and strand
    and get the Ensembl ID appended to their name.

    Args:
        species: species code; one of 'Rn', 'Dr', 'Gg', 'Hs', 'Mm'.
        force: 'yes' to download the flat file through update.download,
            anything else to use the copy in the output directory.

    Raises:
        KeyError: if ``species`` is not in the version table, or a mapped
            Ensembl gene has no entry in the gene location table.
        ValueError: if a data row does not split into exactly seven
            tab-separated fields or its position is not an integer.
    """
    version = {}
    version['Rn'] = '2003(rn3)'
    version['Dr'] = '2003(zv4)'
    version['Gg'] = '2004(galGal2)'
    version['Hs'] = '2006(hg8)'
    version['Mm'] = '2004(mm5)'

    print('Exporting polyADB_2 coordinates as BED for %s' % (species,))
    ### Obtain the necessary database files
    url = 'http://altanalyze.org/archiveDBs/all/polyAsite.txt'
    output_dir = 'AltDatabase/ucsc/' + species + '/'
    if force == 'yes':
        filename = update.download(url, output_dir, '')[0]
    else:
        filename = output_dir + 'polyAsite.txt'

    ### Import the external gene ID to Ensembl information
    import gene_associations
    import OBO_import
    import EnsemblImport
    import export
    try:
        ens_unigene = gene_associations.getGeneToUid(species, 'Ensembl-UniGene')
        print('%s Ensembl-UniGene entries imported' % len(ens_unigene))
        external_ensembl = OBO_import.swapKeyValues(ens_unigene)
        use_entrez = 'no'
    except Exception:
        # UniGene relationships unavailable: use EntrezGene IDs instead
        ens_entrez = gene_associations.getGeneToUid(species, 'Ensembl-EntrezGene')
        print('%s Ensembl-EntrezGene entries imported' % len(ens_entrez))
        external_ensembl = OBO_import.swapKeyValues(ens_entrez)
        use_entrez = 'yes'
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(
        species, 'RNASeq', 'key_by_array')

    export_bedfile = output_dir + species + '_polyADB_2_predictions.bed'
    print('exporting %s' % (export_bedfile,))
    export_data = export.ExportFile(export_bedfile)
    try:
        header = '#' + species + '\t' + 'polyADB_2' + '\t' + version[species] + '\n'
        export_data.write(header)

        fn = filepath(filename)
        # The with-block closes the input file; it used to be opened inline
        # in the for statement and never explicitly closed.
        with open(fn, 'rU') as handle:
            header_seen = False
            for line in handle:
                data = cleanUpLine(line)
                if not header_seen:
                    header_seen = True  # skip the column header row
                    continue
                fields = data.split('\t')
                siteid, llid, chrom, sitenum, position, supporting_EST, cleavage = fields
                ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
                if chrom == 'chrM':
                    chrom = 'chrMT'
                if chrom == 'M':
                    chrom = 'MT'
                if species in siteid:
                    # Only add the prefix when absent; the unconditional
                    # prepend produced names such as 'chrchrMT'.
                    if 'NA' not in chrom and not chrom.startswith('chr'):
                        chrom = 'chr' + chrom
                    strand = '+'
                    geneid = siteid
                    # BED start is zero-based, one less than the site position
                    pos_start = str(int(position) - 1)
                    pos_end = position
                    if use_entrez == 'no':
                        # UniGene key: first two dot-separated parts of the site ID
                        external_geneid = '.'.join(siteid.split('.')[:2])
                    else:
                        external_geneid = llid
                    if external_geneid in external_ensembl:
                        ens_geneid = external_ensembl[external_geneid][0]
                        geneid += '-' + ens_geneid
                        gene_location = gene_location_db[ens_geneid]
                        chrom = gene_location[0]
                        strand = gene_location[1]
                    ### We don't know the strand, so write out both strands
                    # NOTE(review): a mapped gene on '-' yields two identical
                    # '-' rows and no '+' row - confirm this is intended.
                    bed_format = '\t'.join(
                        [chrom, pos_start, pos_end, geneid, '0', '-']) + '\n'
                    export_data.write(bed_format)
                    bed_format = '\t'.join(
                        [chrom, pos_start, pos_end, geneid, '0', strand]) + '\n'
                    export_data.write(bed_format)
    finally:
        # Make sure the BED file is closed even when a row cannot be parsed
        export_data.close()