def findSpeciesInUniProtFiles(force):
    """Download all UniProt annotation files and grab all species names, TaxIDs and corresponding URLs.

    Scans every file found by read_directory('/uniprot_temp') for OS (species name), OX (TaxID) and
    '/'-prefixed (entry terminator) lines and writes one tab-delimited "species, TaxID, URL" row per
    species to Config/uniprot-species-file.txt.

    force -- when 'yes', the '_sprot_' taxonomic division files are first downloaded from ftp.expasy.org
             into uniprot_temp/ (only needed when UniProt changes its species-to-file associations or
             when new species become supported by Ensembl).
    """
    import AltAnalyze
    ### Get species annotations from the GO-Elite config (TaxID -> full species name)
    species_annot_db = AltAnalyze.importGOEliteSpeciesInfo()
    tax_db = {}
    for annotated_species in species_annot_db:
        tax_db[species_annot_db[annotated_species].TaxID()] = annotated_species

    if force == 'yes':
        import export
        import update
        filesearch = '_sprot_'
        all_swissprot = update.getFTPData('ftp.expasy.org','/databases/uniprot/current_release/knowledgebase/taxonomic_divisions',filesearch)
        for ftp_file in all_swissprot:
            gz_filepath, status = update.download(ftp_file,'uniprot_temp/','')
            if status == 'not-removed':
                ### Best-effort clean-up of the compressed download; a failure here is deliberately ignored
                try: os.remove(gz_filepath)
                except OSError: pass

    species_uniprot_db = {}
    ### Start with an empty entry so that a terminator line seen before any OS/OX line is ignored
    taxid = ''; species_full = ''
    for filename in read_directory('/uniprot_temp'):
        fn = filepath('uniprot_temp/'+filename)
        url = 'ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/'+filename+'.gz'
        with open(fn,'r') as uniprot_file: ### closed deterministically, even if parsing fails
            for line in uniprot_file:
                data = cleanUpLine(line)
                line_code = data[0:2]
                if line_code == 'OX':
                    taxid = data.split('=')[1][:-1] ### text after '=' minus its final character
                    if taxid in tax_db:
                        ### Prefer the GO-Elite species name when the TaxID is known
                        species_full = tax_db[taxid]
                elif line_code == 'OS':
                    ### Keep only the first two words of the species description
                    species_full = ' '.join(data[5:].split(' ')[:2])
                elif data.startswith('/'): ### entry terminator (safe for empty lines, unlike data[0])
                    if len(species_full.split(' ')) == 2: ### Species name is in the format Homo sapiens
                        species_uniprot_db.setdefault(species_full,[]).append((taxid,url))
                    taxid = ''; species_full = ''

    import EnsemblImport
    species_uniprot_db = EnsemblImport.eliminate_redundant_dict_values(species_uniprot_db)

    ### Export all species to UniProt file relationships so this function needs to only be run once
    import export
    up = export.ExportFile('Config/uniprot-species-file.txt')
    try:
        for species_full in species_uniprot_db:
            values = species_uniprot_db[species_full]
            if len(values) > 1:
                ### Several (TaxID, URL) candidates: use the one whose TaxID maps back to this exact
                ### species name in the GO-Elite config. If none does, the last candidate is used.
                for (taxid,url) in values:
                    if taxid in tax_db and species_full == tax_db[taxid]:
                        print('ambiguity resolved: %s' % species_full)
                        break
            else:
                (taxid,url) = values[0]
            up.write('\t'.join([species_full,taxid,url])+'\n')
    finally:
        up.close()
def findSpeciesInUniProtFiles(force):
    """Download all UniProt annotation files and grab all species names, TaxIDs and corresponding URLs.

    Scans every file found by read_directory('/uniprot_temp') for OS (species name), OX (TaxID) and
    '/'-prefixed (entry terminator) lines and writes one tab-delimited "species, TaxID, URL" row per
    species to Config/uniprot-species-file.txt.

    force -- when 'yes', the '_sprot_' taxonomic division files are first downloaded from ftp.expasy.org
             into uniprot_temp/ (only needed when UniProt changes its species-to-file associations or
             when new species become supported by Ensembl).
    """
    import AltAnalyze
    ### Get species annotations from the GO-Elite config (TaxID -> full species name)
    species_annot_db = AltAnalyze.importGOEliteSpeciesInfo()
    tax_db = {}
    for annotated_species in species_annot_db:
        tax_db[species_annot_db[annotated_species].TaxID()] = annotated_species

    if force == 'yes':
        import export
        import update
        filesearch = '_sprot_'
        all_swissprot = update.getFTPData('ftp.expasy.org','/databases/uniprot/current_release/knowledgebase/taxonomic_divisions',filesearch)
        for ftp_file in all_swissprot:
            gz_filepath, status = update.download(ftp_file,'uniprot_temp/','')
            if status == 'not-removed':
                ### Best-effort clean-up of the compressed download; a failure here is deliberately ignored
                try: os.remove(gz_filepath)
                except OSError: pass

    species_uniprot_db = {}
    ### Start with an empty entry so that a terminator line seen before any OS/OX line is ignored
    taxid = ''; species_full = ''
    for filename in read_directory('/uniprot_temp'):
        fn = filepath('uniprot_temp/'+filename)
        url = 'ftp://ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/'+filename+'.gz'
        with open(fn,'r') as uniprot_file: ### closed deterministically, even if parsing fails
            for line in uniprot_file:
                data = cleanUpLine(line)
                line_code = data[0:2]
                if line_code == 'OX':
                    taxid = data.split('=')[1][:-1] ### text after '=' minus its final character
                    if taxid in tax_db:
                        ### Prefer the GO-Elite species name when the TaxID is known
                        species_full = tax_db[taxid]
                elif line_code == 'OS':
                    ### Keep only the first two words of the species description
                    species_full = ' '.join(data[5:].split(' ')[:2])
                elif data.startswith('/'): ### entry terminator (safe for empty lines, unlike data[0])
                    if len(species_full.split(' ')) == 2: ### Species name is in the format Homo sapiens
                        species_uniprot_db.setdefault(species_full,[]).append((taxid,url))
                    taxid = ''; species_full = ''

    import EnsemblImport
    species_uniprot_db = EnsemblImport.eliminate_redundant_dict_values(species_uniprot_db)

    ### Export all species to UniProt file relationships so this function needs to only be run once
    import export
    up = export.ExportFile('Config/uniprot-species-file.txt')
    try:
        for species_full in species_uniprot_db:
            values = species_uniprot_db[species_full]
            if len(values) > 1:
                ### Several (TaxID, URL) candidates: use the one whose TaxID maps back to this exact
                ### species name in the GO-Elite config. If none does, the last candidate is used.
                for (taxid,url) in values:
                    if taxid in tax_db and species_full == tax_db[taxid]:
                        print('ambiguity resolved: %s' % species_full)
                        break
            else:
                (taxid,url) = values[0]
            up.write('\t'.join([species_full,taxid,url])+'\n')
    finally:
        up.close()
def getUniProtURLsForAllSupportedSpecies():
    """Import all UniProt supported species and URLs and register them as default 'UniProt' file locations.

    Reads the tab-delimited "species, TaxID, URL" rows of Config/uniprot-species-file.txt, matches them to
    the species in the GO-Elite config (first by full species name, then by TaxID), groups the matching
    species codes per URL and exports the updated defaults through UI.exportDefaultFileLocations.
    """
    species_uniprot_db = {}
    fn = filepath('Config/uniprot-species-file.txt')
    with open(fn,'r') as species_file: ### closed deterministically, even if parsing fails
        for line in species_file:
            data = cleanUpLine(line)
            if not data: continue ### tolerate blank lines instead of failing on the unpack below
            species_full, taxid, url = data.split('\t')
            if 'Homo sapiens' not in species_full: ### There's a separate file for us humans (so egotistical!!!)
                species_uniprot_db[species_full] = taxid, url

    import AltAnalyze
    ### Get species annotations from the GO-Elite config
    species_annot_db = AltAnalyze.importGOEliteSpeciesInfo()

    ### Export all urls for currently supported species
    import UI
    file_location_defaults = UI.importDefaultFileLocations()

    location_db = {} ### URL -> list of species codes served by that file
    species_added = set()
    ### First pass: species whose full name is present in the UniProt species file
    for species_full in species_annot_db:
        if species_full in species_uniprot_db:
            taxid, url = species_uniprot_db[species_full]
            species_code = species_annot_db[species_full].SpeciesCode()
            location_db.setdefault(url,[]).append(species_code)
            species_added.add(species_full)

    ### Second pass: species not matched by name are matched by TaxID instead
    for species_full in species_annot_db:
        if species_full in species_added: continue
        taxid = species_annot_db[species_full].TaxID()
        species_code = species_annot_db[species_full].SpeciesCode()
        for species_name in species_uniprot_db:
            tax, url = species_uniprot_db[species_name]
            if tax == taxid:
                ### The URL may not have been registered by the first pass, so create its list on demand
                location_db.setdefault(url,[]).append(species_code)
                print(species_code)

    for url in location_db:
        species = '|'.join(location_db[url])
        fl = UI.FileLocationData('ftp', url, species)
        try: file_location_defaults['UniProt'].append(fl)
        except KeyError: file_location_defaults['UniProt'] = [fl]
    UI.exportDefaultFileLocations(file_location_defaults)
def getUniProtURLsForAllSupportedSpecies():
    """Import all UniProt supported species and URLs and register them as default 'UniProt' file locations.

    Reads the tab-delimited "species, TaxID, URL" rows of Config/uniprot-species-file.txt, matches them to
    the species in the GO-Elite config (first by full species name, then by TaxID), groups the matching
    species codes per URL and exports the updated defaults through UI.exportDefaultFileLocations.
    """
    species_uniprot_db = {}
    fn = filepath('Config/uniprot-species-file.txt')
    with open(fn,'r') as species_file: ### closed deterministically, even if parsing fails
        for line in species_file:
            data = cleanUpLine(line)
            if not data: continue ### tolerate blank lines instead of failing on the unpack below
            species_full, taxid, url = data.split('\t')
            if 'Homo sapiens' not in species_full: ### There's a separate file for us humans (so egotistical!!!)
                species_uniprot_db[species_full] = taxid, url

    import AltAnalyze
    ### Get species annotations from the GO-Elite config
    species_annot_db = AltAnalyze.importGOEliteSpeciesInfo()

    ### Export all urls for currently supported species
    import UI
    file_location_defaults = UI.importDefaultFileLocations()

    location_db = {} ### URL -> list of species codes served by that file
    species_added = set()
    ### First pass: species whose full name is present in the UniProt species file
    for species_full in species_annot_db:
        if species_full in species_uniprot_db:
            taxid, url = species_uniprot_db[species_full]
            species_code = species_annot_db[species_full].SpeciesCode()
            location_db.setdefault(url,[]).append(species_code)
            species_added.add(species_full)

    ### Second pass: species not matched by name are matched by TaxID instead
    for species_full in species_annot_db:
        if species_full in species_added: continue
        taxid = species_annot_db[species_full].TaxID()
        species_code = species_annot_db[species_full].SpeciesCode()
        for species_name in species_uniprot_db:
            tax, url = species_uniprot_db[species_name]
            if tax == taxid:
                ### The URL may not have been registered by the first pass, so create its list on demand
                location_db.setdefault(url,[]).append(species_code)
                print(species_code)

    for url in location_db:
        species = '|'.join(location_db[url])
        fl = UI.FileLocationData('ftp', url, species)
        try: file_location_defaults['UniProt'].append(fl)
        except KeyError: file_location_defaults['UniProt'] = [fl]
    UI.exportDefaultFileLocations(file_location_defaults)
def TargetScanImport(parse_sequences,force):
    """The TargetScan data is currently extracted from a cross-species conserved family file. This file only contains
    gene symbol, microRNA name and 3'UTR seed locations.

    Reads the module-level names species, symbol_ensembl_current, combined_results and microRNA_target_db, and
    rebinds the module-level list l to the unique gene symbols that had no entry in symbol_ensembl_current.

    parse_sequences -- 'yes': append the extracted target sequence to keys already present in combined_results;
                       otherwise: store a MicroRNATargetData object per miRNA in microRNA_target_db.
                       Missing TargetScan files are only downloaded when this is 'no'.
    force -- not used (the download decision is based on verifyExternalDownload).
    """
    global l

    ### Built-in taxonomy IDs and miRNA prefixes. Any other species uses the human prefix and has no built-in
    ### taxonomy ID (None instead of an unbound name), so it matches nothing unless the GO-Elite config supplies one.
    species_defaults = {'Mm': ('10090','mmu-'), 'Hs': ('9606','hsa-'), 'Rn': ('10116','rno-')}
    tax, prefix = species_defaults.get(species,(None,'hsa-'))

    import AltAnalyze
    ### Get taxid annotations from the GO-Elite config (overrides the built-in value when the species is listed)
    species_annot_db = AltAnalyze.importGOEliteSpeciesInfo()
    for species_full in species_annot_db:
        if species == species_annot_db[species_full].SpeciesCode():
            tax = species_annot_db[species_full].TaxID()

    ### See if the files are already there
    verifyTSG, target_scan_target_file = verifyExternalDownload('TargetScanGenes')
    verifyTSS, target_scan_sequence_file = verifyExternalDownload('TargetScanSequences')

    if (verifyTSG == 'no' or verifyTSS == 'no') and parse_sequences == 'no': ### used to be - if force == 'yes'
        ### Then download the latest annotations and sequences
        target_scan_target_file = downloadFile('TargetScanGenes')
        target_scan_sequence_file = downloadFile('TargetScanSequences')

    ### Cross-species TargetScan file with UTR sequences for all genes with reported targets in the conserved family file
    ### Although this file includes valid sequence data that appears to match up to the target file, the target file
    ### appears to only list the seed sequence location (UTR start and stop) and not the full binding sequence and thus
    ### is not amenable to probe set alignment.
    print('parsing %s' % target_scan_sequence_file)
    target_scan_gene_utr_seq = {}
    with open(filepath(target_scan_sequence_file),'rU') as sequence_file:
        header_skipped = False
        for line in sequence_file:
            t = cleanUpLine(line).split('\t')
            if not header_skipped:
                header_skipped = True
                continue
            symbol = t[2].upper(); tax_id = t[3]; utr_seq = t[4]
            if tax_id == tax:
                ### Remove alignment gaps and convert the RNA alphabet to DNA
                utr_seq_no_gaps = utr_seq.replace('-','').replace('U','T')
                if symbol in symbol_ensembl_current and len(utr_seq_no_gaps)>0:
                    target_scan_gene_utr_seq[symbol] = utr_seq_no_gaps
    print('UTR sequence for %s TargetScan genes stored in memory.' % len(target_scan_gene_utr_seq))

    mir_sequences = []; count = 0
    print('parsing %s' % target_scan_target_file)
    #verifyFile(target_scan_target_file,species) ### Makes sure file is local and if not downloads.
    k = []; l = [] ### gene symbols with (k) and without (l) a current Ensembl match
    with open(filepath(target_scan_target_file),'rU') as target_file:
        header_parsed = False
        for line in target_file:
            data = cleanUpLine(line)
            if not header_parsed:
                header_parsed = True
                ### Locate the needed columns from the lower-cased header. The first matching keyword decides
                ### the role of a column, and a later column with the same role replaces an earlier one.
                for i, value in enumerate(data.lower().split('\t')):
                    if 'mir' in value: mir_col = i
                    elif 'gene id' in value: pass ### recognised but not used
                    elif 'gene symbol' in value: symbol_col = i
                    elif 'transcript' in value: pass ### recognised but not used
                    elif 'species id' in value: tax_col = i
                    elif 'utr start' in value: start_col = i
                    elif 'utr end' in value: end_col = i
                continue

            t = data.split('\t')
            mir = t[mir_col]; gene_symbol = t[symbol_col].upper(); taxid = t[tax_col]
            utr_start = int(t[start_col]); utr_end = int(t[end_col])
            ### Old format
            #mir = t[0]; gene_symbol = string.upper(t[1]); taxid = t[2]; utr_start = t[3]; utr_end = t[4]
            if '/' in mir:
                ### A miRNA family such as 'a/b/c' is expanded to 'miR-b', 'miR-c' and finally 'a'
                mirs = mir.split('/')
                mir_list = ['miR-'+mirid for mirid in mirs[1:]]
                mir_list.append(mirs[0])
            else: mir_list = [mir]

            if taxid != tax: continue ### only rows of the requested species

            if gene_symbol in symbol_ensembl_current:
                ensembl_geneids = symbol_ensembl_current[gene_symbol]; k.append(gene_symbol)
            else:
                ensembl_geneids = []; l.append(gene_symbol)

            if gene_symbol in target_scan_gene_utr_seq:
                ### TargetScan provides the core, while processed miRs are typically 22nt - seems to approximate other databases better
                adj_start = max(utr_start-15,0)
                mir_sequences = target_scan_gene_utr_seq[gene_symbol][adj_start:utr_end+1]
            else: mir_sequences = []

            ### Already multiple geneids associated with each symbol so don't need to worry about redundancy
            for ensembl_geneid in ensembl_geneids:
                for mir_name in mir_list:
                    full_mir = prefix+mir_name
                    if parse_sequences == 'yes':
                        ### Only extend miRNA-gene pairs that are already present
                        if (full_mir,ensembl_geneid) in combined_results:
                            combined_results[(full_mir,ensembl_geneid)].append(mir_sequences); count+=1
                    else:
                        y = MicroRNATargetData(ensembl_geneid,gene_symbol,mir_sequences,full_mir,'TargetScan')
                        count+=1
                        try: microRNA_target_db[full_mir].append(y)
                        except KeyError: microRNA_target_db[full_mir] = [y]

    k = unique.unique(k); l = unique.unique(l)
    print('ensembls-found: %s , not found: %s' % (len(k),len(l)))
    print(l[:10])
    print('%s miRNA-target relationships added for TargetScan' % count)
def pictarImport(parse_sequences,type,added):
    """Annotations originally from the file: ng1536-S3.xls, posted as supplementary data at:
    http://www.nature.com/ng/journal/v37/n5/suppinfo/ng1536_S1.html. The file being parsed here has been pre-matched
    to Ensembl IDs using the ExonModule of LinkEST, for human.

    Reads the module-level names species, symbol_ensembl, combined_results and microRNA_target_db.

    parse_sequences -- 'yes': append the upper-cased target sequence to keys already present in combined_results;
                       otherwise: store a MicroRNATargetData object per miRNA in microRNA_target_db.
    type -- 'pre-computed' selects the three-column human file (Ensembl gene, miRNA, sequence) when species is 'Hs'.
    added -- dictionary keyed by (ensembl_geneid, mir) of relationships that were already imported; it is updated
             in place and returned.
    """
    mir_sequences = []
    annotated_file = 'AltDatabase/miRBS/'+'Mm'+'/'+'pictar-target-annotated.txt'
    if species == 'Mm': filename = 'AltDatabase/miRBS/'+species+'/'+'pictar-target-annotated.txt'; tax = '10090'
    else: filename = annotated_file; tax = '10116'

    if species == 'Hs':
        tax = '9606'
        if type == 'pre-computed': filename = 'AltDatabase/miRBS/'+species+'/'+'pictar-conserved-targets-2005.txt'
        else: filename = annotated_file

    import AltAnalyze
    ### Get taxid annotations from the GO-Elite config
    ### (tax is looked up as before, but it is not used further in this function)
    species_annot_db = AltAnalyze.importGOEliteSpeciesInfo()
    for species_full in species_annot_db:
        if species == species_annot_db[species_full].SpeciesCode():
            tax = species_annot_db[species_full].TaxID()

    print('parsing %s' % filename); count = 0
    print('len(symbol_ensembl) %s' % len(symbol_ensembl))
    verifyFile(filename,species) ### Makes sure file is local and if not downloads.

    ### Non-human annotations are derived from the hsa-named entries; rename the prefix for mouse and rat
    mir_prefix_by_species = {'Mm': 'mmu', 'Rn': 'rno'}

    with open(filepath(filename),'rU') as pictar_file: ### closed deterministically, even if parsing fails
        for line in pictar_file: ### every line is a data row - no header line is skipped
            t = cleanUpLine(line).split('\t')
            if species == 'Hs' and type == 'pre-computed':
                ensembl_geneid, mir, mir_sequences = t; ensembl_geneids = [ensembl_geneid]
            else:
                ### The human gene symbol is in the third column, the symbol used for other species in the fourth
                if species == 'Hs': symbol = t[2].upper()
                else: symbol = t[3].upper()
                mir = t[6]; mir_sequences = t[11]
                if species in mir_prefix_by_species:
                    mir = mir.replace('hsa',mir_prefix_by_species[species])
                if symbol in symbol_ensembl and len(symbol)>0: ensembl_geneids = symbol_ensembl[symbol]
                else: ensembl_geneids = [''] ### placeholder that is filtered out by the length check below

            for ensembl_geneid in ensembl_geneids:
                if len(ensembl_geneid)>1 and (ensembl_geneid,mir) not in added:
                    if parse_sequences == 'yes':
                        ### Only extend miRNA-gene pairs that are already present
                        if (mir,ensembl_geneid) in combined_results:
                            combined_results[(mir,ensembl_geneid)].append(mir_sequences.upper()); count+=1
                    else:
                        ### NOTE(review): mir and mir_sequences are passed in the opposite order to the
                        ### TargetScanImport call - confirm against the MicroRNATargetData constructor
                        y = MicroRNATargetData(ensembl_geneid,'',mir,mir_sequences,'pictar'); count+=1
                        try: microRNA_target_db[mir].append(y)
                        except KeyError: microRNA_target_db[mir] = [y]
                    added[(ensembl_geneid,mir)] = []
    print('%s miRNA-target relationships added for PicTar' % count)
    return added