def findSpeciesInUniProtFiles(force):
    ### Download all UniProt annotation files and grab all species names, TaxIDs and corresponding URLs
    
    import AltAnalyze
    ###Get species annotations from the GO-Elite config
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo(); tax_db={}
    for species_full in species_annot_db:
        taxid=species_annot_db[species_full].TaxID()
        tax_db[taxid]=species_full

    if force == 'yes':
        ### Should only need to be run if UniProt changes it's species to file associations or new species supported by Ensembl
        import export; import update
        filesearch = '_sprot_'
        all_swissprot = update.getFTPData('ftp.expasy.org','/databases/uniprot/current_release/knowledgebase/taxonomic_divisions',filesearch)
        for file in all_swissprot:
            gz_filepath, status = update.download(file,'uniprot_temp/','')        
            if status == 'not-removed':
                try: os.remove(gz_filepath) ### Not sure why this works now and not before
                except OSError: status = status
            
    species_uniprot_db={}; altanalyze_species_uniprot_db={}
    dir=read_directory('/uniprot_temp')
    for filename in dir:    
        fn=filepath('uniprot_temp/'+filename)
        for line in open(fn,'r').xreadlines():
            data = cleanUpLine(line)
            if data[0:2] == 'OX':
                taxid = string.split(data,'=')[1][:-1]
                if taxid in tax_db:
                    species_full = tax_db[taxid]
            elif data[0:2] == 'OS':
                species = data[5:]
                species = string.split(species,' ')[:2]
                species_full = string.join(species,' ')
            elif data[0] == '/':
                url = 'ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/'+filename
                ss = string.split(species_full,' ')
                if len(ss)==2: ### Species name is in the format H**o sapiens - and '(' not in species_full and ')' not in species_full and '/' not in species_full
                    try: species_uniprot_db[species_full].append((taxid,'ftp://'+url+'.gz'))
                    except KeyError: species_uniprot_db[species_full] = [(taxid,'ftp://'+url+'.gz')]
                taxid = ''; species_full = ''
    import EnsemblImport
    species_uniprot_db = EnsemblImport.eliminate_redundant_dict_values(species_uniprot_db)
    ### Export all species to UniProt file relationships so this function needs to only be run once
    import export         
    up = export.ExportFile('Config/uniprot-species-file.txt')
    for species_full in species_uniprot_db:
        values = species_uniprot_db[species_full]
        if len(values)>1:
            found = 'no'
            for (taxid,url) in values:
                if taxid in tax_db:
                    if species_full == tax_db[taxid]: found='yes'; print 'ambiguity resolved:',species_full; break
                if found == 'yes': break
        else: (taxid,url) = values[0]
        up.write(string.join([species_full,taxid,url],'\t')+'\n')
    up.close()
# Example #2
# 0
def findSpeciesInUniProtFiles(force):
    ### Download all UniProt annotation files and grab all species names, TaxIDs and corresponding URLs
    
    import AltAnalyze
    ###Get species annotations from the GO-Elite config
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo(); tax_db={}
    for species_full in species_annot_db:
        taxid=species_annot_db[species_full].TaxID()
        tax_db[taxid]=species_full

    if force == 'yes':
        ### Should only need to be run if UniProt changes it's species to file associations or new species supported by Ensembl
        import export; import update
        filesearch = '_sprot_'
        all_swissprot = update.getFTPData('ftp.expasy.org','/databases/uniprot/current_release/knowledgebase/taxonomic_divisions',filesearch)
        for file in all_swissprot:
            gz_filepath, status = update.download(file,'uniprot_temp/','')        
            if status == 'not-removed':
                try: os.remove(gz_filepath) ### Not sure why this works now and not before
                except OSError: status = status
            
    species_uniprot_db={}; altanalyze_species_uniprot_db={}
    dir=read_directory('/uniprot_temp')
    for filename in dir:    
        fn=filepath('uniprot_temp/'+filename)
        for line in open(fn,'r').xreadlines():
            data = cleanUpLine(line)
            if data[0:2] == 'OX':
                taxid = string.split(data,'=')[1][:-1]
                if taxid in tax_db:
                    species_full = tax_db[taxid]
            elif data[0:2] == 'OS':
                species = data[5:]
                species = string.split(species,' ')[:2]
                species_full = string.join(species,' ')
            elif data[0] == '/':
                url = 'ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/'+filename
                ss = string.split(species_full,' ')
                if len(ss)==2: ### Species name is in the format H**o sapiens - and '(' not in species_full and ')' not in species_full and '/' not in species_full
                    try: species_uniprot_db[species_full].append((taxid,'ftp://'+url+'.gz'))
                    except KeyError: species_uniprot_db[species_full] = [(taxid,'ftp://'+url+'.gz')]
                taxid = ''; species_full = ''
    import EnsemblImport
    species_uniprot_db = EnsemblImport.eliminate_redundant_dict_values(species_uniprot_db)
    ### Export all species to UniProt file relationships so this function needs to only be run once
    import export         
    up = export.ExportFile('Config/uniprot-species-file.txt')
    for species_full in species_uniprot_db:
        values = species_uniprot_db[species_full]
        if len(values)>1:
            found = 'no'
            for (taxid,url) in values:
                if taxid in tax_db:
                    if species_full == tax_db[taxid]: found='yes'; print 'ambiguity resolved:',species_full; break
                if found == 'yes': break
        else: (taxid,url) = values[0]
        up.write(string.join([species_full,taxid,url],'\t')+'\n')
    up.close()
def getUniProtURLsForAllSupportedSpecies():
    ### Import all UniProt supproted species and URLs
    species_uniprot_db = {}
    fn = filepath('Config/uniprot-species-file.txt')
    for line in open(fn, 'r').xreadlines():
        data = cleanUpLine(line)
        species_full, taxid, url = string.split(data, '\t')
        if 'H**o sapiens' not in species_full:  ### There's a separate file for us humans (so egotistical!!!)
            species_uniprot_db[species_full] = taxid, url

    import AltAnalyze
    ###Get species annotations from the GO-Elite config
    species_annot_db = AltAnalyze.importGOEliteSpeciesInfo()

    ### Export all urls for currently supported species
    import UI
    file_location_defaults = UI.importDefaultFileLocations()

    location_db = {}
    species_added = []
    for species_full in species_annot_db:
        if species_full in species_uniprot_db:
            taxid, url = species_uniprot_db[species_full]
            species_code = species_annot_db[species_full].SpeciesCode()
            try:
                location_db[url].append(species_code)
            except Exception:
                location_db[url] = [species_code]
            species_added.append(species_full)

    for species_full in species_annot_db:
        taxid = species_annot_db[species_full].TaxID()
        species_code = species_annot_db[species_full].SpeciesCode()
        if species_full not in species_added:
            for species_name in species_uniprot_db:
                tax, url = species_uniprot_db[species_name]
                if tax == taxid:
                    location_db[url].append(species_code)
                    print species_code

    for url in location_db:
        species = string.join(location_db[url], '|')
        fl = UI.FileLocationData('ftp', url, species)
        try:
            file_location_defaults['UniProt'].append(fl)
        except KeyError:
            file_location_defaults['UniProt'] = [fl]
    UI.exportDefaultFileLocations(file_location_defaults)
def getUniProtURLsForAllSupportedSpecies():
    ### Import all UniProt supproted species and URLs
    species_uniprot_db={}
    fn=filepath('Config/uniprot-species-file.txt')
    for line in open(fn,'r').xreadlines():
        data = cleanUpLine(line)
        species_full,taxid,url = string.split(data,'\t')
        if 'H**o sapiens' not in species_full: ### There's a separate file for us humans (so egotistical!!!)
            species_uniprot_db[species_full] = taxid,url
        
    import AltAnalyze
    ###Get species annotations from the GO-Elite config
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo()
        
    ### Export all urls for currently supported species
    import UI
    file_location_defaults = UI.importDefaultFileLocations()
    
    location_db={}; species_added=[]
    for species_full in species_annot_db:
        if species_full in species_uniprot_db:
            taxid,url = species_uniprot_db[species_full]
            species_code = species_annot_db[species_full].SpeciesCode()
            try: location_db[url].append(species_code)
            except Exception: location_db[url] = [species_code]
            species_added.append(species_full)
            
    for species_full in species_annot_db:
        taxid = species_annot_db[species_full].TaxID()
        species_code = species_annot_db[species_full].SpeciesCode()
        if species_full not in species_added:
            for species_name in species_uniprot_db:
                tax,url = species_uniprot_db[species_name]
                if tax == taxid:
                    location_db[url].append(species_code)
                    print species_code
                
    for url in location_db:
        species = string.join(location_db[url],'|')
        fl = UI.FileLocationData('ftp', url, species)
        try: file_location_defaults['UniProt'].append(fl)
        except KeyError: file_location_defaults['UniProt'] = [fl]
    UI.exportDefaultFileLocations(file_location_defaults)
def TargetScanImport(parse_sequences,force):
    """The TargetScan data is currently extracted from a cross-species conserved family file. This file only contains
    gene symbol, microRNA name and 3'UTR seed locations."""
    if species == 'Mm': tax = '10090'; prefix = 'mmu-'
    elif species == 'Hs': tax = '9606'; prefix = 'hsa-'
    elif species == 'Rn': tax = '10116'; prefix = 'rno-'
    else: prefix = 'hsa-'

    import AltAnalyze
    ###Get taxid annotations from the GO-Elite config
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo(); tax_db={}
    for species_full in species_annot_db:
        if species==species_annot_db[species_full].SpeciesCode():
            tax = species_annot_db[species_full].TaxID()
            
    global l
    
    ### See if the files are already there
    verifyTSG, target_scan_target_file = verifyExternalDownload('TargetScanGenes')
    verifyTSS, target_scan_sequence_file = verifyExternalDownload('TargetScanSequences')

    if verifyTSG == 'no' or verifyTSS == 'no': ### used to be - if force == 'yes'
        if parse_sequences == 'no':
            ### Then download the latest annotations and sequences
            target_scan_target_file = downloadFile('TargetScanGenes')
            target_scan_sequence_file = downloadFile('TargetScanSequences')

    ### Cross-species TargetScan file with UTR seqeunces for all genes with reported targets in the conserved family file
    ### Although this file includes valid sequence data that appears to match up to the target file, the target file
    ### appears to only list the seed seqeunce location (UTR start and stop) and not the full binding sequence and thus
    ### is not ammenable to probe set alignment.
    print 'parsing', target_scan_sequence_file
    fn=filepath(target_scan_sequence_file); x=0; target_scan_gene_utr_seq={}
    for line in open(fn,'rU').xreadlines():         
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if x==0: x=1
        else:
            symbol = string.upper(t[2]); tax_id = t[3]; utr_seq = t[4]
            if tax_id == tax:
                utr_seq_no_gaps = string.replace(utr_seq,'-','')
                utr_seq_no_gaps = string.replace(utr_seq_no_gaps,'U','T')
                if symbol in symbol_ensembl_current and len(utr_seq_no_gaps)>0:
                    target_scan_gene_utr_seq[symbol] = utr_seq_no_gaps
    print 'UTR sequence for',len(target_scan_gene_utr_seq),'TargetScan genes stored in memory.'
        
    mir_sequences = []; count=0
    print 'parsing', target_scan_target_file
    #verifyFile(target_scan_target_file,species) ### Makes sure file is local and if not downloads.
    fn=filepath(target_scan_target_file); x=0; k=[]; l=[]
    for line in open(fn,'rU').xreadlines():         
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if x==0:
            x=1
            data = string.lower(data)
            t = string.split(data,'\t')
            i=0
            for value in t:
                if 'mir' in value: m = i
                elif 'gene id' in value: g = i
                elif 'gene symbol' in value: s = i
                elif 'transcript' in value: r = i
                elif 'species id' in value: txi = i
                elif 'utr start' in value: us = i
                elif 'utr end' in value: ue = i
                i+=1
        else:
            mir = t[m]; geneid = t[g]; gene_symbol = string.upper(t[s]); taxid = t[txi]; utr_start = int(t[us]); utr_end  = int(t[ue])
            ### Old format
            #mir = t[0]; gene_symbol = string.upper(t[1]); taxid = t[2]; utr_start = t[3]; utr_end = t[4]
            if '/' in mir:
                mir_list=[]
                mirs = string.split(mir,'/')
                for mirid in mirs[1:]:
                    mirid = 'miR-'+mirid
                    mir_list.append(mirid)
                mir_list.append(mirs[0])
            else: mir_list = [mir]

            if taxid == tax: ###human
                #target_scan_gene_utr_seq[symbol] = utr_seq_no_gaps
                if gene_symbol in symbol_ensembl_current: ensembl_geneids = symbol_ensembl_current[gene_symbol]; proceed = 'yes'; k.append(gene_symbol)
                else: proceed = 'no'; l.append(gene_symbol)
                if gene_symbol in target_scan_gene_utr_seq:
                    ### TargetScan provides the core, while processed miRs are typically 22nt - seems to approximate other databases better
                    adj_start = utr_start-15
                    if adj_start < 0: adj_start=0
                    mir_sequences = target_scan_gene_utr_seq[gene_symbol][adj_start:utr_end+1]
                    #if string.lower(gene_symbol) == 'tns3' and mir == 'miR-182': print mir,gene_symbol,taxid,utr_start,utr_end,mir_sequences
                else: mir_sequences=[]
                ###Already multiple geneids associated with each symbol so don't need to worry about renundancy
                if proceed == 'yes':
                    for ensembl_geneid in ensembl_geneids:
                        for mir in mir_list:
                            #if ensembl_geneid == 'ENSG00000137815' and mir == 'miR-214': print mir,gene_symbol,taxid,utr_start,utr_end,mir_sequences,target_scan_gene_utr_seq[gene_symbol];sys.exit()
                            if parse_sequences == 'yes':
                                if (prefix+mir,ensembl_geneid) in combined_results:
                                    combined_results[(prefix+mir,ensembl_geneid)].append(mir_sequences); count+=1
                            else:
                                #if ensembl_geneid == 'ENSMUSG00000029467': print mir
                                y = MicroRNATargetData(ensembl_geneid,gene_symbol,mir_sequences,prefix+mir,'TargetScan')
                                count+=1
                                try: microRNA_target_db[prefix+mir].append(y)
                                except KeyError: microRNA_target_db[prefix+mir] = [y]
    k = unique.unique(k); l = unique.unique(l)
    print 'ensembls-found:',len(k),', not found:',len(l)
    print l[:10]
    print count, 'miRNA-target relationships added for TargetScan'
def pictarImport(parse_sequences,type,added):
    """Annotations originally from the file: ng1536-S3.xls, posted as supplementary data at:
    http://www.nature.com/ng/journal/v37/n5/suppinfo/ng1536_S1.html. The file being parsed here has been pre-matched to Ensembl IDs
    using the ExonModule of LinkEST, for human."""
    mir_sequences=[]
    if species == 'Mm': filename = 'AltDatabase/miRBS/'+species+'/'+'pictar-target-annotated.txt'; tax = '10090'
    else: filename = 'AltDatabase/miRBS/'+'Mm'+'/'+'pictar-target-annotated.txt'; tax = '10116'
        
    #if species == 'Hs': filename = 'AltDatabase/miRBS/'+species+'/'+'pictar-conserved-targets-2005.txt'; tax = '9606'
    if type == 'pre-computed':
        if species == 'Hs': filename = 'AltDatabase/miRBS/'+species+'/'+'pictar-conserved-targets-2005.txt'; tax = '9606'
    else:
        if species == 'Hs': filename = 'AltDatabase/miRBS/'+'Mm'+'/'+'pictar-target-annotated.txt'; tax = '9606'

    import AltAnalyze
    ###Get taxid annotations from the GO-Elite config
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo(); tax_db={}
    for species_full in species_annot_db:
        if species==species_annot_db[species_full].SpeciesCode():
            tax = species_annot_db[species_full].TaxID()
            
    print 'parsing', filename; count=0
    print 'len(symbol_ensembl)', len(symbol_ensembl)
    verifyFile(filename,species) ### Makes sure file is local and if not downloads.
    fn=filepath(filename); x=1
    for line in open(fn,'rU').xreadlines():         
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if x==0: x=1
        else:
            if species == 'Hs':
                if type == 'pre-computed':
                    ensembl_geneid, mir, mir_sequences = t; ensembl_geneids = [ensembl_geneid]
                else:
                    symbol=string.upper(t[2]);mir=t[6];mir_sequences=t[11]
                    if symbol in symbol_ensembl and len(symbol)>0: ensembl_geneids=symbol_ensembl[symbol]
                    else: ensembl_geneids=['']                    
            elif species == 'Mm':
                mm_symbol=string.upper(t[3]);mir=t[6];mir_sequences=t[11]; mir = string.replace(mir,'hsa','mmu')
                if mm_symbol in symbol_ensembl and len(mm_symbol)>0: ensembl_geneids=symbol_ensembl[mm_symbol]
                else: ensembl_geneids=['']
            elif species == 'Rn':
                mm_symbol=string.upper(t[3]);mir=t[6];mir_sequences=t[11]; mir = string.replace(mir,'hsa','rno')
                if mm_symbol in symbol_ensembl and len(mm_symbol)>0: ensembl_geneids=symbol_ensembl[mm_symbol]
                else: ensembl_geneids=['']
            else:
                mm_symbol=string.upper(t[3]);mir=t[6];mir_sequences=t[11]
                if mm_symbol in symbol_ensembl and len(mm_symbol)>0: ensembl_geneids=symbol_ensembl[mm_symbol]
                else: ensembl_geneids=['']                
            for ensembl_geneid in ensembl_geneids:
                if len(ensembl_geneid)>1 and (ensembl_geneid,mir) not in added:
                    if parse_sequences == 'yes':
                        if (mir,ensembl_geneid) in combined_results:
                            combined_results[(mir,ensembl_geneid)].append(string.upper(mir_sequences)); count+=1
                    else:
                        #if count < 800 and '-125b' in mir: print ensembl_geneid, mir, mm_symbol; count+=1
                        #elif count>799: kill
                        y = MicroRNATargetData(ensembl_geneid,'',mir,mir_sequences,'pictar'); count+=1
                        try: microRNA_target_db[mir].append(y)
                        except KeyError: microRNA_target_db[mir] = [y]
                        added[(ensembl_geneid,mir)]=[]
                    
    print count, 'miRNA-target relationships added for PicTar'
    return added