Ejemplo n.º 1
0
def runExtractUniProt(species,species_full,uniprot_filename_url,trembl_filename_url,force):
    global uniprot_ensembl_db;uniprot_ensembl_db={}
    global uniprot_db;uniprot_db={}; global species_name; global uniprot_fildir
    global secondary_to_primary_db; secondary_to_primary_db={}
    import update; reload(update)
    
    species_name = species_full
    
    import UI; species_names = UI.getSpeciesInfo()
    species_full = species_names[species]
    species_full = string.replace(species_full,' ','_')

    uniprot_file = string.split(uniprot_filename_url,'/')[-1]; uniprot_file = string.replace(uniprot_file,'.gz','')
    trembl_file = string.split(trembl_filename_url,'/')[-1]; trembl_file = string.replace(trembl_file,'.gz','')
    uniprot_fildir = 'AltDatabase/uniprot/'+species+'/'
    uniprot_download_fildir = 'AltDatabase/uniprot/'
    uniprot_ens_file = species+'_Ensembl-UniProt.txt'; uniprot_ens_location = uniprot_fildir+uniprot_ens_file
    uniprot_location = uniprot_download_fildir+uniprot_file
    trembl_location = uniprot_download_fildir+trembl_file

    add_trembl_annotations = 'no' ### Currently we don't need these annotations    
    try: importEnsemblUniprot(uniprot_ens_location)
    except IOError:
        try:
            ### Download the data from the AltAnalyze website (if there)
            update.downloadCurrentVersion(uniprot_ens_location,species,'txt')
            importEnsemblUniprot(uniprot_ens_location)
        except Exception: null=[]
    try:
        uniprot_ens_location_built = string.replace(uniprot_ens_location,'UniProt','Uniprot-SWISSPROT')
        uniprot_ens_location_built = string.replace(uniprot_ens_location_built,'uniprot','Uniprot-SWISSPROT')
        importEnsemblUniprot(uniprot_ens_location_built)
    except Exception: null=[]
    
    ### Import UniProt annotations
    counts = update.verifyFile(uniprot_location,'counts')
    if force == 'no' or counts > 8: import_uniprot_db(uniprot_location)
    else:
        ### Directly download the data from UniProt
        gz_filepath, status = update.download(uniprot_filename_url,uniprot_download_fildir,'')

        if status == 'not-removed':
            try: os.remove(gz_filepath) ### Not sure why this works now and not before
            except OSError: status = status     
        import_uniprot_db(uniprot_location)
        
    if add_trembl_annotations == 'yes':
        ### Import TreMBL annotations
        try:
            if force == 'yes': uniprot_location += '!!!!!' ### Force an IOError
            import_uniprot_db(trembl_location)
        except IOError:
            ### Directly download the data from UniProt
            update.download(trembl_filename_url,uniprot_download_fildir,'')
            import_uniprot_db(trembl_location)        
    export()
    exportEnsemblUniprot(uniprot_ens_location)
Ejemplo n.º 2
0
def runExtractUniProt(species,species_full,uniprot_filename_url,trembl_filename_url,force):
    global uniprot_ensembl_db;uniprot_ensembl_db={}
    global uniprot_db;uniprot_db={}; global species_name; global uniprot_fildir
    global secondary_to_primary_db; secondary_to_primary_db={}
    import update; reload(update)
    
    species_name = species_full
    
    import UI; species_names = UI.getSpeciesInfo()
    species_full = species_names[species]
    species_full = string.replace(species_full,' ','_')

    uniprot_file = string.split(uniprot_filename_url,'/')[-1]; uniprot_file = string.replace(uniprot_file,'.gz','')
    trembl_file = string.split(trembl_filename_url,'/')[-1]; trembl_file = string.replace(trembl_file,'.gz','')
    uniprot_fildir = 'AltDatabase/uniprot/'+species+'/'
    uniprot_download_fildir = 'AltDatabase/uniprot/'
    uniprot_ens_file = species+'_Ensembl-UniProt.txt'; uniprot_ens_location = uniprot_fildir+uniprot_ens_file
    uniprot_location = uniprot_download_fildir+uniprot_file
    trembl_location = uniprot_download_fildir+trembl_file

    add_trembl_annotations = 'no' ### Currently we don't need these annotations    
    try: importEnsemblUniprot(uniprot_ens_location)
    except IOError:
        try:
            ### Download the data from the AltAnalyze website (if there)
            update.downloadCurrentVersion(uniprot_ens_location,species,'txt')
            importEnsemblUniprot(uniprot_ens_location)
        except Exception: null=[]
    try:
        uniprot_ens_location_built = string.replace(uniprot_ens_location,'UniProt','Uniprot-SWISSPROT')
        uniprot_ens_location_built = string.replace(uniprot_ens_location_built,'uniprot','Uniprot-SWISSPROT')
        importEnsemblUniprot(uniprot_ens_location_built)
    except Exception: null=[]
    
    ### Import UniProt annotations
    counts = update.verifyFile(uniprot_location,'counts')
    if force == 'no' or counts > 8: import_uniprot_db(uniprot_location)
    else:
        ### Directly download the data from UniProt
        gz_filepath, status = update.download(uniprot_filename_url,uniprot_download_fildir,'')

        if status == 'not-removed':
            try: os.remove(gz_filepath) ### Not sure why this works now and not before
            except OSError: status = status     
        import_uniprot_db(uniprot_location)
        
    if add_trembl_annotations == 'yes':
        ### Import TreMBL annotations
        try:
            if force == 'yes': uniprot_location += '!!!!!' ### Force an IOError
            import_uniprot_db(trembl_location)
        except IOError:
            ### Directly download the data from UniProt
            update.download(trembl_filename_url,uniprot_download_fildir,'')
            import_uniprot_db(trembl_location)        
    export()
    exportEnsemblUniprot(uniprot_ens_location)
Ejemplo n.º 3
0
def command_update(*args):
    """
        Update to latest development version
    """
    import update
    if not (update.download() and update.install()):
        print("UPDATE FAILED")
Ejemplo n.º 4
0
def command_update(*args):
    """
        Update to latest development version
    """
    import update
    if not (update.download() and update.install()):
        print("UPDATE FAILED")
Ejemplo n.º 5
0
def importUCSCTranscriptSequences(species,array_type,probeset_seq_db):
    start_time = time.time()

    if force == 'yes':
        ### Download mRNA sequence file from website
        import UI; species_names = UI.getSpeciesInfo()
        species_full = species_names[species]
        species_full = string.replace(species_full,' ','_')
        ucsc_mRNA_dir = update.getFTPData('hgdownload.cse.ucsc.edu','/goldenPath/currentGenomes/'+species_full+'/bigZips','mrna.fa.gz')
        output_dir = 'AltDatabase/'+species+'/SequenceData/'
        try:
            gz_filepath, status = update.download(ucsc_mRNA_dir,output_dir,'')        
            if status == 'not-removed':
                try: os.remove(gz_filepath) ### Not sure why this works now and not before
                except OSError: status = status
        except Exception: null=[] ### Occurs when file is not available for this species
            
    filename = 'AltDatabase/'+species+'/SequenceData/mrna.fa'
    output_file = 'AltDatabase/'+species+'/SequenceData/output/'+array_type+'_UCSC-mRNA_alignments.txt'
    dataw = export.ExportFile(output_file)      
    output_file = 'AltDatabase/'+species+'/SequenceData/output/sequences/'+array_type+'_UCSC_mRNA_seqmatches.txt'
    datar = export.ExportFile(output_file)

    ucsc_mrna_to_gene = importUCSCTranscriptAssociations(species)    
    
    print "Begining generic fasta import of",filename
    #'>gnl|ENS|Mm#S10859962 Mus musculus 12 days embryo spinal ganglion cDNA /gb=AK051143 /gi=26094349 /ens=Mm.1 /len=2289']
    #'ATCGTGGTGTGCCCAGCTCTTCCAAGGACTGCTGCGCTTCGGGGCCCAGGTGAGTCCCGC'
    fn=filepath(filename); sequence = '|'; ucsc_mRNA_hit_len={}; ucsc_probeset_null_hits={}; k=0
    fn=filepath(filename); sequence = '|'; ucsc_mRNA_hit_len={}; ucsc_probeset_null_hits={}; k=0
    for line in open(fn,'rU').xreadlines():
        try: data, newline= string.split(line,'\n')
        except ValueError: continue
        if len(data)>0:
            if data[0] != '#':
                try:
                    if data[0] == '>':
                        if len(sequence) > 1:
                            if accession in ucsc_mrna_to_gene:
                                gene_found = 'no'
                                for ens_gene in ucsc_mrna_to_gene[accession]:
                                    if ens_gene in probeset_seq_db:
                                        sequence = string.upper(sequence); gene_found = 'yes'
                                        mRNA_seq = sequence[1:]; mRNA_length = len(mRNA_seq)    
                                        k+=1; probeset_seq_data = probeset_seq_db[ens_gene]
                                        results = simpleSeqMatchProtocol(probeset_seq_data,mRNA_seq)
                                        for (call,probeset) in results:
                                            dataw.write(string.join([probeset,str(call),accession],'\t')+'\n')
                                if gene_found == 'yes':
                                    values = [accession,mRNA_seq]; values = string.join(values,'\t')+'\n'
                                    datar.write(values)
                        values = string.split(data,' '); accession = values[0][1:]
                        sequence = '|'; continue
                except IndexError: null = []       
                try:
                    if data[0] != '>': sequence = sequence + data
                except IndexError: print kill; continue
    datar.close()
    end_time = time.time(); time_diff = int(end_time-start_time)
    print "UCSC mRNA sequences analyzed in %d seconds" % time_diff
Ejemplo n.º 6
0
def findSpeciesInUniProtFiles(force):
    ### Download all UniProt annotation files and grab all species names, TaxIDs and corresponding URLs
    
    import AltAnalyze
    ###Get species annotations from the GO-Elite config
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo(); tax_db={}
    for species_full in species_annot_db:
        taxid=species_annot_db[species_full].TaxID()
        tax_db[taxid]=species_full

    if force == 'yes':
        ### Should only need to be run if UniProt changes it's species to file associations or new species supported by Ensembl
        import export; import update
        filesearch = '_sprot_'
        all_swissprot = update.getFTPData('ftp.expasy.org','/databases/uniprot/current_release/knowledgebase/taxonomic_divisions',filesearch)
        for file in all_swissprot:
            gz_filepath, status = update.download(file,'uniprot_temp/','')        
            if status == 'not-removed':
                try: os.remove(gz_filepath) ### Not sure why this works now and not before
                except OSError: status = status
            
    species_uniprot_db={}; altanalyze_species_uniprot_db={}
    dir=read_directory('/uniprot_temp')
    for filename in dir:    
        fn=filepath('uniprot_temp/'+filename)
        for line in open(fn,'r').xreadlines():
            data = cleanUpLine(line)
            if data[0:2] == 'OX':
                taxid = string.split(data,'=')[1][:-1]
                if taxid in tax_db:
                    species_full = tax_db[taxid]
            elif data[0:2] == 'OS':
                species = data[5:]
                species = string.split(species,' ')[:2]
                species_full = string.join(species,' ')
            elif data[0] == '/':
                url = 'ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/'+filename
                ss = string.split(species_full,' ')
                if len(ss)==2: ### Species name is in the format H**o sapiens - and '(' not in species_full and ')' not in species_full and '/' not in species_full
                    try: species_uniprot_db[species_full].append((taxid,'ftp://'+url+'.gz'))
                    except KeyError: species_uniprot_db[species_full] = [(taxid,'ftp://'+url+'.gz')]
                taxid = ''; species_full = ''
    import EnsemblImport
    species_uniprot_db = EnsemblImport.eliminate_redundant_dict_values(species_uniprot_db)
    ### Export all species to UniProt file relationships so this function needs to only be run once
    import export         
    up = export.ExportFile('Config/uniprot-species-file.txt')
    for species_full in species_uniprot_db:
        values = species_uniprot_db[species_full]
        if len(values)>1:
            found = 'no'
            for (taxid,url) in values:
                if taxid in tax_db:
                    if species_full == tax_db[taxid]: found='yes'; print 'ambiguity resolved:',species_full; break
                if found == 'yes': break
        else: (taxid,url) = values[0]
        up.write(string.join([species_full,taxid,url],'\t')+'\n')
    up.close()
Ejemplo n.º 7
0
def findSpeciesInUniProtFiles(force):
    ### Download all UniProt annotation files and grab all species names, TaxIDs and corresponding URLs
    
    import AltAnalyze
    ###Get species annotations from the GO-Elite config
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo(); tax_db={}
    for species_full in species_annot_db:
        taxid=species_annot_db[species_full].TaxID()
        tax_db[taxid]=species_full

    if force == 'yes':
        ### Should only need to be run if UniProt changes it's species to file associations or new species supported by Ensembl
        import export; import update
        filesearch = '_sprot_'
        all_swissprot = update.getFTPData('ftp.expasy.org','/databases/uniprot/current_release/knowledgebase/taxonomic_divisions',filesearch)
        for file in all_swissprot:
            gz_filepath, status = update.download(file,'uniprot_temp/','')        
            if status == 'not-removed':
                try: os.remove(gz_filepath) ### Not sure why this works now and not before
                except OSError: status = status
            
    species_uniprot_db={}; altanalyze_species_uniprot_db={}
    dir=read_directory('/uniprot_temp')
    for filename in dir:    
        fn=filepath('uniprot_temp/'+filename)
        for line in open(fn,'r').xreadlines():
            data = cleanUpLine(line)
            if data[0:2] == 'OX':
                taxid = string.split(data,'=')[1][:-1]
                if taxid in tax_db:
                    species_full = tax_db[taxid]
            elif data[0:2] == 'OS':
                species = data[5:]
                species = string.split(species,' ')[:2]
                species_full = string.join(species,' ')
            elif data[0] == '/':
                url = 'ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/'+filename
                ss = string.split(species_full,' ')
                if len(ss)==2: ### Species name is in the format H**o sapiens - and '(' not in species_full and ')' not in species_full and '/' not in species_full
                    try: species_uniprot_db[species_full].append((taxid,'ftp://'+url+'.gz'))
                    except KeyError: species_uniprot_db[species_full] = [(taxid,'ftp://'+url+'.gz')]
                taxid = ''; species_full = ''
    import EnsemblImport
    species_uniprot_db = EnsemblImport.eliminate_redundant_dict_values(species_uniprot_db)
    ### Export all species to UniProt file relationships so this function needs to only be run once
    import export         
    up = export.ExportFile('Config/uniprot-species-file.txt')
    for species_full in species_uniprot_db:
        values = species_uniprot_db[species_full]
        if len(values)>1:
            found = 'no'
            for (taxid,url) in values:
                if taxid in tax_db:
                    if species_full == tax_db[taxid]: found='yes'; print 'ambiguity resolved:',species_full; break
                if found == 'yes': break
        else: (taxid,url) = values[0]
        up.write(string.join([species_full,taxid,url],'\t')+'\n')
    up.close()
def importmiRNAMap(parse_sequences,force):
    """ Added in AltAnalyze version 2.0, this database provides target sequences for several species and different databases, 
    including miRanda, RNAhybrid and TargetScan. For more information see: http://mirnamap.mbc.nctu.edu.tw/html/about.html"""
    gz_filepath = verifyFileAdvanced('miRNA_targets_',species)
    if force == 'yes' or len(gz_filepath)==0:
        import UI; species_names = UI.getSpeciesInfo()
        species_full = species_names[species]
        species_full = string.replace(species_full,' ','_')
        miRNAMap_dir = update.getFTPData('mirnamap.mbc.nctu.edu.tw','/miRNAMap2/miRNA_Targets/'+species_full,'.txt.tar.gz')
        output_dir = 'AltDatabase/miRBS/'+species+'/'
        gz_filepath, status = update.download(miRNAMap_dir,output_dir,'')
        if status == 'not-removed':
            try: os.remove(gz_filepath) ### Not sure why this works now and not before
            except OSError: status = status 

    fn=filepath(string.replace(gz_filepath,'.tar.gz','')); x=0; count=0
    for line in open(fn,'rU').readlines():             
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if x==0: x=1
        else:
            try:
                miRNA, ensembl_transcript_id, target_start, target_end, miRNA_seq, alignment, target_seq, algorithm, c1, c2, c3 = t
                #if 'GGCTCCTGTCACCTGGGTCCGT'in target_seq:
                #print 'a'; sys.exit()
                #if 'TCF7L1' in symbol or 'TCF3' in symbol:
                #if '-422a' in miRNA:
                #print miRNA;sys.exit()
                #print symbol, mir; sys.exit()
                if ensembl_transcript_id in ens_gene_to_transcript:
                    geneids = ens_gene_to_transcript[ensembl_transcript_id]     
                    target_seq = string.upper(string.replace(target_seq,'-',''))
                    target_seq = string.replace(target_seq,'U','T')
                    for ensembl_geneid in geneids:
                        if parse_sequences == 'yes':
                            if (miRNA,ensembl_geneid) in combined_results:
                                combined_results[(miRNA,ensembl_geneid)].append(target_seq)
                        else:
                            y = MicroRNATargetData(ensembl_geneid,'',miRNA,target_seq,algorithm); count+=1
                            try: microRNA_target_db[miRNA].append(y)
                            except KeyError: microRNA_target_db[miRNA] = [y]
            except Exception: x=1 ### Bad formatting
                       
    print count, 'miRNA-target relationships added for mirnamap'
    return count
def downloadFile(file_type):
    import UI
    file_location_defaults = UI.importDefaultFileLocations()
    try:
        fld = file_location_defaults[file_type]
        url = fld.Location()
    except Exception:
        for fl in fld:
            if species in fl.Species(): url = fl.Location()
    if 'Target' in file_type: output_dir = 'AltDatabase/miRBS/'
    else: output_dir = 'AltDatabase/miRBS/'+species + '/'
    gz_filepath, status = update.download(url,output_dir,'')
    if status == 'not-removed':
        try: os.remove(gz_filepath) ### Not sure why this works now and not before
        except Exception: status = status
    
    filename = string.replace(gz_filepath,'.zip','.txt')
    filename = string.replace(filename,'.gz','.txt')
    filename = string.replace(filename,'.txt.txt','.txt')
    
    return filename
Ejemplo n.º 10
0
def reformatPolyAdenylationCoordinates(species, force):
    """ PolyA annotations are currently only available from UCSC for human, but flat file
    annotations from 2003-2006 are available for multiple species. Convert these to BED format"""
    version = {}
    version['Rn'] = '2003(rn3)'
    version['Dr'] = '2003(zv4)'
    version['Gg'] = '2004(galGal2)'
    version['Hs'] = '2006(hg8)'
    version['Mm'] = '2004(mm5)'

    print 'Exporting polyADB_2 coordinates as BED for', species
    ### Obtain the necessary database files
    url = 'http://altanalyze.org/archiveDBs/all/polyAsite.txt'
    output_dir = 'AltDatabase/ucsc/' + species + '/'
    if force == 'yes':
        filename, status = update.download(url, output_dir, '')
    else:
        filename = output_dir + 'polyAsite.txt'

    ### Import the refseq to Ensembl information
    import gene_associations
    import OBO_import
    import EnsemblImport
    import export
    try:
        ens_unigene = gene_associations.getGeneToUid(species,
                                                     'Ensembl-UniGene')
        print len(ens_unigene), 'Ensembl-UniGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_unigene)
        use_entrez = 'no'
    except Exception:
        ens_entrez = gene_associations.getGeneToUid(species,
                                                    'Ensembl-EntrezGene')
        print len(ens_entrez), 'Ensembl-EntrezGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_entrez)
        use_entrez = 'yes'
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(
        species, 'RNASeq', 'key_by_array')

    export_bedfile = output_dir + species + '_polyADB_2_predictions.bed'
    print 'exporting', export_bedfile
    export_data = export.ExportFile(export_bedfile)
    header = '#' + species + '\t' + 'polyADB_2' + '\t' + version[species] + '\n'
    export_data.write(header)

    fn = filepath(filename)
    x = 0
    not_found = {}
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        if x == 0: x = 1
        else:
            siteid, llid, chr, sitenum, position, supporting_EST, cleavage = string.split(
                data, '\t')
            if chr == 'chrM':
                chr = 'chrMT'  ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if chr == 'M':
                chr = 'MT'  ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if species in siteid:
                if 'NA' not in chr: chr = 'chr' + chr
                strand = '+'
                geneid = siteid
                pos_start = str(int(position) - 1)
                pos_end = position
                if use_entrez == 'no':
                    external_geneid = string.join(
                        string.split(siteid, '.')[:2], '.')
                else:
                    external_geneid = llid
                if external_geneid in external_ensembl:
                    ens_geneid = external_ensembl[external_geneid][0]
                    geneid += '-' + ens_geneid
                    chr, strand, start, end = gene_location_db[ens_geneid]
                else:
                    not_found[external_geneid] = []
                    bed_format = string.join(
                        [chr, pos_start, pos_end, geneid, '0', '-'], '\t'
                    ) + '\n'  ### We don't know the strand, so write out both strands
                    export_data.write(bed_format)
                bed_format = string.join(
                    [chr, pos_start, pos_end, geneid, '0', strand],
                    '\t') + '\n'
                export_data.write(bed_format)
    export_data.close()
Ejemplo n.º 11
0
import update

passive_partition = update.get_passive_partition()

print(passive_partition)

def progress_callback(percentage):
    print(percentage)
    
update.download(url='https://github.com/wipfli/update/releases/download/v0.1.0/rootfs.ext2.xz', passive_partition=passive_partition, progress_callback=progress_callback, total_size=52807640)

print(update.get_checksum(passive_partition))

update.flash_boot_select(passive_partition)
Ejemplo n.º 12
0
 def download_updates(self):
     now = int(time.mktime(time.gmtime()))
     if now > settings.server.get('last_update_check', 0) + 24*60*60:
         settings.server['last_update_check'] = now
         update.download()
Ejemplo n.º 13
0


### Update password crypt
if currentVersion<2050:
	MessageBox(None, u'Vous devez reconfigurer le Bot !', u'update', 0)
	updatef = zipfile.ZipFile('./config_original.zip')
	updatef.extractall()
	
### Get config Gui
try:
	os.chdir(os.path.dirname(sys.argv[0]))
except:
	pass
if currentVersion<3001 or not os.path.exists('config.exe'):
	download('https://github.com/downloads/maxisoft/millenium-bot/config.exe','config.exe')



### ----------------- restart le service
try:
	RunCMD('sc start "MillenuimBot"') 
except:
	pass


### Retour au repertoir de depart 
try:
	os.chdir(os.path.dirname(sys.argv[0]))
except:
	pass
Ejemplo n.º 14
0
def reformatPolyAdenylationCoordinates(species,force):
    """ PolyA annotations are currently only available from UCSC for human, but flat file
    annotations from 2003-2006 are available for multiple species. Convert these to BED format"""
    version={}
    version['Rn'] = '2003(rn3)'
    version['Dr'] = '2003(zv4)'
    version['Gg'] = '2004(galGal2)'
    version['Hs'] = '2006(hg8)'
    version['Mm'] = '2004(mm5)'

    print 'Exporting polyADB_2 coordinates as BED for',species
    ### Obtain the necessary database files
    url = 'http://altanalyze.org/archiveDBs/all/polyAsite.txt'
    output_dir = 'AltDatabase/ucsc/'+species + '/'
    if force == 'yes':
        filename, status = update.download(url,output_dir,'')
    else: filename = output_dir+'polyAsite.txt'

    ### Import the refseq to Ensembl information
    import gene_associations; import OBO_import; import EnsemblImport; import export
    try:
        ens_unigene = gene_associations.getGeneToUid(species,'Ensembl-UniGene')
        print len(ens_unigene),'Ensembl-UniGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_unigene); use_entrez='no'
    except Exception:
        ens_entrez = gene_associations.getGeneToUid(species,'Ensembl-EntrezGene')
        print len(ens_entrez),'Ensembl-EntrezGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_entrez); use_entrez='yes'
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array')
    
    export_bedfile = output_dir+species+'_polyADB_2_predictions.bed'
    print 'exporting',export_bedfile
    export_data = export.ExportFile(export_bedfile)
    header = '#'+species+'\t'+'polyADB_2'+'\t'+version[species]+'\n'
    export_data.write(header)
    
    fn=filepath(filename); x=0; not_found={}
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        if x==0: x=1
        else:
            siteid,llid,chr,sitenum,position,supporting_EST,cleavage = string.split(data,'\t')
            if chr == 'chrM': chr = 'chrMT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if chr == 'M': chr = 'MT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if species in siteid:
                if 'NA' not in chr: chr = 'chr'+chr
                strand = '+'; geneid = siteid
                pos_start = str(int(position)-1); pos_end = position
                if use_entrez=='no':
                    external_geneid = string.join(string.split(siteid,'.')[:2],'.')
                else: external_geneid=llid
                if external_geneid in external_ensembl:
                    ens_geneid = external_ensembl[external_geneid][0]
                    geneid += '-'+ens_geneid
                    chr,strand,start,end = gene_location_db[ens_geneid]
                else:
                    not_found[external_geneid]=[]
                    bed_format = string.join([chr,pos_start,pos_end,geneid,'0','-'],'\t')+'\n' ### We don't know the strand, so write out both strands
                    export_data.write(bed_format)
                bed_format = string.join([chr,pos_start,pos_end,geneid,'0',strand],'\t')+'\n'
                export_data.write(bed_format)
    export_data.close()   
Ejemplo n.º 15
0
def importUCSCTranscriptSequences(species, array_type, probeset_seq_db):
    start_time = time.time()

    if force == 'yes':
        ### Download mRNA sequence file from website
        import UI
        species_names = UI.getSpeciesInfo()
        species_full = species_names[species]
        species_full = string.replace(species_full, ' ', '_')
        ucsc_mRNA_dir = update.getFTPData(
            'hgdownload.cse.ucsc.edu',
            '/goldenPath/currentGenomes/' + species_full + '/bigZips',
            'mrna.fa.gz')
        output_dir = 'AltDatabase/' + species + '/SequenceData/'
        try:
            gz_filepath, status = update.download(ucsc_mRNA_dir, output_dir,
                                                  '')
            if status == 'not-removed':
                try:
                    os.remove(gz_filepath
                              )  ### Not sure why this works now and not before
                except OSError:
                    status = status
        except Exception:
            null = []  ### Occurs when file is not available for this species

    filename = 'AltDatabase/' + species + '/SequenceData/mrna.fa'
    output_file = 'AltDatabase/' + species + '/SequenceData/output/' + array_type + '_UCSC-mRNA_alignments.txt'
    dataw = export.ExportFile(output_file)
    output_file = 'AltDatabase/' + species + '/SequenceData/output/sequences/' + array_type + '_UCSC_mRNA_seqmatches.txt'
    datar = export.ExportFile(output_file)

    ucsc_mrna_to_gene = importUCSCTranscriptAssociations(species)

    print "Begining generic fasta import of", filename
    #'>gnl|ENS|Mm#S10859962 Mus musculus 12 days embryo spinal ganglion cDNA /gb=AK051143 /gi=26094349 /ens=Mm.1 /len=2289']
    #'ATCGTGGTGTGCCCAGCTCTTCCAAGGACTGCTGCGCTTCGGGGCCCAGGTGAGTCCCGC'
    fn = filepath(filename)
    sequence = '|'
    ucsc_mRNA_hit_len = {}
    ucsc_probeset_null_hits = {}
    k = 0
    fn = filepath(filename)
    sequence = '|'
    ucsc_mRNA_hit_len = {}
    ucsc_probeset_null_hits = {}
    k = 0
    for line in open(fn, 'rU').xreadlines():
        try:
            data, newline = string.split(line, '\n')
        except ValueError:
            continue
        if len(data) > 0:
            if data[0] != '#':
                try:
                    if data[0] == '>':
                        if len(sequence) > 1:
                            if accession in ucsc_mrna_to_gene:
                                gene_found = 'no'
                                for ens_gene in ucsc_mrna_to_gene[accession]:
                                    if ens_gene in probeset_seq_db:
                                        sequence = string.upper(sequence)
                                        gene_found = 'yes'
                                        mRNA_seq = sequence[1:]
                                        mRNA_length = len(mRNA_seq)
                                        k += 1
                                        probeset_seq_data = probeset_seq_db[
                                            ens_gene]
                                        results = simpleSeqMatchProtocol(
                                            probeset_seq_data, mRNA_seq)
                                        for (call, probeset) in results:
                                            dataw.write(
                                                string.join([
                                                    probeset,
                                                    str(call), accession
                                                ], '\t') + '\n')
                                if gene_found == 'yes':
                                    values = [accession, mRNA_seq]
                                    values = string.join(values, '\t') + '\n'
                                    datar.write(values)
                        values = string.split(data, ' ')
                        accession = values[0][1:]
                        sequence = '|'
                        continue
                except IndexError:
                    null = []
                try:
                    if data[0] != '>': sequence = sequence + data
                except IndexError:
                    print kill
                    continue
    datar.close()
    end_time = time.time()
    time_diff = int(end_time - start_time)
    print "UCSC mRNA sequences analyzed in %d seconds" % time_diff
Ejemplo n.º 16
0
def importUCSCTranscriptSequences(species, array_type, probeset_seq_db):
    start_time = time.time()

    if force == "yes":
        ### Download mRNA sequence file from website
        import UI

        species_names = UI.getSpeciesInfo()
        species_full = species_names[species]
        species_full = string.replace(species_full, " ", "_")
        ucsc_mRNA_dir = update.getFTPData(
            "hgdownload.cse.ucsc.edu", "/goldenPath/currentGenomes/" + species_full + "/bigZips", "mrna.fa.gz"
        )
        output_dir = "AltDatabase/" + species + "/SequenceData/"
        try:
            gz_filepath, status = update.download(ucsc_mRNA_dir, output_dir, "")
            if status == "not-removed":
                try:
                    os.remove(gz_filepath)  ### Not sure why this works now and not before
                except OSError:
                    status = status
        except Exception:
            null = []  ### Occurs when file is not available for this species

    filename = "AltDatabase/" + species + "/SequenceData/mrna.fa"
    output_file = "AltDatabase/" + species + "/SequenceData/output/" + array_type + "_UCSC-mRNA_alignments.txt"
    dataw = export.ExportFile(output_file)
    output_file = (
        "AltDatabase/" + species + "/SequenceData/output/sequences/" + array_type + "_UCSC_mRNA_seqmatches.txt"
    )
    datar = export.ExportFile(output_file)

    ucsc_mrna_to_gene = importUCSCTranscriptAssociations(species)

    print "Begining generic fasta import of", filename
    #'>gnl|ENS|Mm#S10859962 Mus musculus 12 days embryo spinal ganglion cDNA /gb=AK051143 /gi=26094349 /ens=Mm.1 /len=2289']
    #'ATCGTGGTGTGCCCAGCTCTTCCAAGGACTGCTGCGCTTCGGGGCCCAGGTGAGTCCCGC'
    fn = filepath(filename)
    sequence = "|"
    ucsc_mRNA_hit_len = {}
    ucsc_probeset_null_hits = {}
    k = 0
    fn = filepath(filename)
    sequence = "|"
    ucsc_mRNA_hit_len = {}
    ucsc_probeset_null_hits = {}
    k = 0
    for line in open(fn, "rU").xreadlines():
        try:
            data, newline = string.split(line, "\n")
        except ValueError:
            continue
        if len(data) > 0:
            if data[0] != "#":
                try:
                    if data[0] == ">":
                        if len(sequence) > 1:
                            if accession in ucsc_mrna_to_gene:
                                gene_found = "no"
                                for ens_gene in ucsc_mrna_to_gene[accession]:
                                    if ens_gene in probeset_seq_db:
                                        sequence = string.upper(sequence)
                                        gene_found = "yes"
                                        mRNA_seq = sequence[1:]
                                        mRNA_length = len(mRNA_seq)
                                        k += 1
                                        probeset_seq_data = probeset_seq_db[ens_gene]
                                        results = simpleSeqMatchProtocol(probeset_seq_data, mRNA_seq)
                                        for (call, probeset) in results:
                                            dataw.write(string.join([probeset, str(call), accession], "\t") + "\n")
                                if gene_found == "yes":
                                    values = [accession, mRNA_seq]
                                    values = string.join(values, "\t") + "\n"
                                    datar.write(values)
                        values = string.split(data, " ")
                        accession = values[0][1:]
                        sequence = "|"
                        continue
                except IndexError:
                    null = []
                try:
                    if data[0] != ">":
                        sequence = sequence + data
                except IndexError:
                    print kill
                    continue
    datar.close()
    end_time = time.time()
    time_diff = int(end_time - start_time)
    print "UCSC mRNA sequences analyzed in %d seconds" % time_diff
Ejemplo n.º 17
0
def downloadHMDBMetaboCardFlatFile():
    url = 'http://www.hmdb.ca/public/downloads/current/metabocards.zip'
    fln, status = update.download(url, 'BuildDBs/HMDB/', '')
Ejemplo n.º 18
0
def downloadKEGGPathwayIDs():
    url = 'ftp://ftp.genome.jp/pub/kegg/pathway/map_title.tab'
    fln, status = update.download(url, 'BuildDBs/HMDB/', '')
Ejemplo n.º 19
0
 def download_updates(self):
     now = int(time.mktime(time.gmtime()))
     if now > settings.server.get('last_update_check', 0) + 24 * 60 * 60:
         settings.server['last_update_check'] = now
         update.download()