Exemple #1
0
def importUCSCTranscriptSequences(species,array_type,probeset_seq_db):
    """ Match UCSC mRNA sequences against probeset sequences and export the hits.

    species -- key used to look up the full species name and to build the AltDatabase/<species>/SequenceData paths
    array_type -- prefix of the two output file names
    probeset_seq_db -- dictionary keyed by gene ID; each value is handed to simpleSeqMatchProtocol

    'force' is not a parameter: it is resolved outside this function. When it equals 'yes', mrna.fa.gz is
    requested from the UCSC FTP site before parsing. Every fasta record of mrna.fa whose accession is present in
    the importUCSCTranscriptAssociations mapping, and which has at least one associated gene in probeset_seq_db,
    is matched with simpleSeqMatchProtocol: each (call, probeset) result is written to
    <array_type>_UCSC-mRNA_alignments.txt and the upper-cased sequence to <array_type>_UCSC_mRNA_seqmatches.txt.
    Nothing is returned. """
    start_time = time.time()

    if force == 'yes':
        ### Download mRNA sequence file from website
        import UI
        species_full = string.replace(UI.getSpeciesInfo()[species],' ','_')
        ucsc_mRNA_dir = update.getFTPData('hgdownload.cse.ucsc.edu','/goldenPath/currentGenomes/'+species_full+'/bigZips','mrna.fa.gz')
        output_dir = 'AltDatabase/'+species+'/SequenceData/'
        try:
            gz_filepath, status = update.download(ucsc_mRNA_dir,output_dir,'')
            if status == 'not-removed':
                try: os.remove(gz_filepath)
                except OSError: pass ### Best-effort cleanup of the downloaded archive
        except Exception: pass ### Occurs when file is not available for this species

    filename = 'AltDatabase/'+species+'/SequenceData/mrna.fa'
    output_root = 'AltDatabase/'+species+'/SequenceData/output/'
    dataw = export.ExportFile(output_root+array_type+'_UCSC-mRNA_alignments.txt')
    datar = export.ExportFile(output_root+'sequences/'+array_type+'_UCSC_mRNA_seqmatches.txt')

    ucsc_mrna_to_gene = importUCSCTranscriptAssociations(species)

    def exportRecordMatches(accession,seq_lines):
        ### Export the probeset matches for one completed fasta record
        if accession is None or len(seq_lines)==0: return ### No header or no sequence read yet
        if accession not in ucsc_mrna_to_gene: return
        mRNA_seq = string.upper(string.join(seq_lines,'')); gene_found = False
        try:
            for ens_gene in ucsc_mrna_to_gene[accession]:
                if ens_gene in probeset_seq_db:
                    gene_found = True
                    for (call,probeset) in simpleSeqMatchProtocol(probeset_seq_db[ens_gene],mRNA_seq):
                        dataw.write(string.join([probeset,str(call),accession],'\t')+'\n')
            if gene_found:
                datar.write(string.join([accession,mRNA_seq],'\t')+'\n')
        except IndexError: pass ### As before, an IndexError while matching only skips this record

    print('Begining generic fasta import of %s' % filename)
    #'>gnl|ENS|Mm#S10859962 Mus musculus 12 days embryo spinal ganglion cDNA /gb=AK051143 /gi=26094349 /ens=Mm.1 /len=2289']
    #'ATCGTGGTGTGCCCAGCTCTTCCAAGGACTGCTGCGCTTCGGGGCCCAGGTGAGTCCCGC'
    accession = None; seq_lines = []
    fasta = open(filepath(filename),'rU')
    try:
        for line in fasta:
            data = line.rstrip('\n') ### Also keeps a final line that has no trailing newline
            if len(data)==0 or data[0] == '#': continue
            if data[0] == '>':
                ### A new header completes the previous record
                exportRecordMatches(accession,seq_lines)
                accession = string.split(data,' ')[0][1:]; seq_lines = []
            else: seq_lines.append(data)
        exportRecordMatches(accession,seq_lines) ### The last record has no following header line
    finally:
        fasta.close(); dataw.close(); datar.close()
    time_diff = int(time.time()-start_time)
    print('UCSC mRNA sequences analyzed in %d seconds' % time_diff)
def findSpeciesInUniProtFiles(force):
    """ Write Config/uniprot-species-file.txt: one line per species with its TaxID and UniProt file URL.

    force -- when 'yes', every file matching '_sprot_' in the UniProt taxonomic_divisions FTP directory is
             first downloaded into uniprot_temp/

    Each file listed by read_directory('/uniprot_temp') is scanned line by line: an 'OS' line sets the species
    name (first two words), an 'OX' line sets the TaxID (and replaces the species name with the GO-Elite one when
    the TaxID is known), and a line starting with '/' closes the record. Only two-word species names are kept.
    When a species ends up with several (TaxID, URL) pairs, the first pair whose TaxID maps back to that species
    name in the GO-Elite annotations is exported; if none does, the last pair is exported. Nothing is returned. """

    import AltAnalyze
    ###Get species annotations from the GO-Elite config
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo(); tax_db={}
    for annotated_species in species_annot_db:
        tax_db[species_annot_db[annotated_species].TaxID()]=annotated_species

    if force == 'yes':
        ### Should only need to be run if UniProt changes it's species to file associations or new species supported by Ensembl
        import update
        all_swissprot = update.getFTPData('ftp.expasy.org','/databases/uniprot/current_release/knowledgebase/taxonomic_divisions','_sprot_')
        for ftp_file in all_swissprot:
            gz_filepath, status = update.download(ftp_file,'uniprot_temp/','')
            if status == 'not-removed':
                try: os.remove(gz_filepath)
                except OSError: pass ### Best-effort cleanup of the downloaded archive

    species_uniprot_db={}
    taxid = ''; species_full = '' ### Start clean so values from the tax_db loop cannot leak into the first record
    for filename in read_directory('/uniprot_temp'):
        fn=filepath('uniprot_temp/'+filename)
        url = 'ftp://'+'ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/'+filename+'.gz'
        uniprot_file = open(fn,'r')
        try:
            for line in uniprot_file:
                data = cleanUpLine(line)
                if data[0:2] == 'OX':
                    taxid = string.split(data,'=')[1][:-1]
                    if taxid in tax_db:
                        species_full = tax_db[taxid]
                elif data[0:2] == 'OS':
                    species_full = string.join(string.split(data[5:],' ')[:2],' ')
                elif data[0:1] == '/': ### Slice rather than index so an empty line cannot raise IndexError
                    if len(string.split(species_full,' '))==2: ### Species name is in the format H**o sapiens
                        species_uniprot_db.setdefault(species_full,[]).append((taxid,url))
                    taxid = ''; species_full = ''
        finally: uniprot_file.close()
    import EnsemblImport
    species_uniprot_db = EnsemblImport.eliminate_redundant_dict_values(species_uniprot_db)
    ### Export all species to UniProt file relationships so this function needs to only be run once
    import export
    up = export.ExportFile('Config/uniprot-species-file.txt')
    for species_full in species_uniprot_db:
        values = species_uniprot_db[species_full]
        taxid,url = values[-1] ### Exported when no entry can be confirmed against tax_db (or there is only one)
        if len(values)>1:
            for (candidate_taxid,candidate_url) in values:
                if tax_db.get(candidate_taxid) == species_full:
                    taxid,url = candidate_taxid,candidate_url
                    print('ambiguity resolved: %s' % species_full); break
        up.write(string.join([species_full,taxid,url],'\t')+'\n')
    up.close()
Exemple #3
0
def findSpeciesInUniProtFiles(force):
    """ Write Config/uniprot-species-file.txt: one line per species with its TaxID and UniProt file URL.

    force -- when 'yes', every file matching '_sprot_' in the UniProt taxonomic_divisions FTP directory is
             first downloaded into uniprot_temp/

    Each file listed by read_directory('/uniprot_temp') is scanned line by line: an 'OS' line sets the species
    name (first two words), an 'OX' line sets the TaxID (and replaces the species name with the GO-Elite one when
    the TaxID is known), and a line starting with '/' closes the record. Only two-word species names are kept.
    When a species ends up with several (TaxID, URL) pairs, the first pair whose TaxID maps back to that species
    name in the GO-Elite annotations is exported; if none does, the last pair is exported. Nothing is returned. """

    import AltAnalyze
    ###Get species annotations from the GO-Elite config
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo(); tax_db={}
    for annotated_species in species_annot_db:
        tax_db[species_annot_db[annotated_species].TaxID()]=annotated_species

    if force == 'yes':
        ### Should only need to be run if UniProt changes it's species to file associations or new species supported by Ensembl
        import update
        all_swissprot = update.getFTPData('ftp.expasy.org','/databases/uniprot/current_release/knowledgebase/taxonomic_divisions','_sprot_')
        for ftp_file in all_swissprot:
            gz_filepath, status = update.download(ftp_file,'uniprot_temp/','')
            if status == 'not-removed':
                try: os.remove(gz_filepath)
                except OSError: pass ### Best-effort cleanup of the downloaded archive

    species_uniprot_db={}
    taxid = ''; species_full = '' ### Start clean so values from the tax_db loop cannot leak into the first record
    for filename in read_directory('/uniprot_temp'):
        fn=filepath('uniprot_temp/'+filename)
        url = 'ftp://'+'ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/'+filename+'.gz'
        uniprot_file = open(fn,'r')
        try:
            for line in uniprot_file:
                data = cleanUpLine(line)
                if data[0:2] == 'OX':
                    taxid = string.split(data,'=')[1][:-1]
                    if taxid in tax_db:
                        species_full = tax_db[taxid]
                elif data[0:2] == 'OS':
                    species_full = string.join(string.split(data[5:],' ')[:2],' ')
                elif data[0:1] == '/': ### Slice rather than index so an empty line cannot raise IndexError
                    if len(string.split(species_full,' '))==2: ### Species name is in the format H**o sapiens
                        species_uniprot_db.setdefault(species_full,[]).append((taxid,url))
                    taxid = ''; species_full = ''
        finally: uniprot_file.close()
    import EnsemblImport
    species_uniprot_db = EnsemblImport.eliminate_redundant_dict_values(species_uniprot_db)
    ### Export all species to UniProt file relationships so this function needs to only be run once
    import export
    up = export.ExportFile('Config/uniprot-species-file.txt')
    for species_full in species_uniprot_db:
        values = species_uniprot_db[species_full]
        taxid,url = values[-1] ### Exported when no entry can be confirmed against tax_db (or there is only one)
        if len(values)>1:
            for (candidate_taxid,candidate_url) in values:
                if tax_db.get(candidate_taxid) == species_full:
                    taxid,url = candidate_taxid,candidate_url
                    print('ambiguity resolved: %s' % species_full); break
        up.write(string.join([species_full,taxid,url],'\t')+'\n')
    up.close()
def importmiRNAMap(parse_sequences,force):
    """ Added in AltAnalyze version 2.0, this database provides target sequences for several species and different databases, 
    including miRanda, RNAhybrid and TargetScan. For more information see: http://mirnamap.mbc.nctu.edu.tw/html/about.html

    parse_sequences -- when 'yes', each target sequence is appended to the matching (miRNA, gene) entry already
                       present in combined_results; otherwise a MicroRNATargetData object is added to
                       microRNA_target_db under its miRNA
    force -- when 'yes', or when verifyFileAdvanced returns an empty result, the target file is downloaded first

    Relies on names resolved outside this function: species, update, ens_gene_to_transcript, combined_results
    and microRNA_target_db. The first line of the file is skipped, as are rows that do not have exactly 11
    tab-separated columns. Returns the number of MicroRNATargetData objects created (always 0 when
    parse_sequences is 'yes'). """
    gz_filepath = verifyFileAdvanced('miRNA_targets_',species)
    if force == 'yes' or len(gz_filepath)==0:
        import UI
        species_full = string.replace(UI.getSpeciesInfo()[species],' ','_')
        miRNAMap_dir = update.getFTPData('mirnamap.mbc.nctu.edu.tw','/miRNAMap2/miRNA_Targets/'+species_full,'.txt.tar.gz')
        output_dir = 'AltDatabase/miRBS/'+species+'/'
        gz_filepath, status = update.download(miRNAMap_dir,output_dir,'')
        if status == 'not-removed':
            try: os.remove(gz_filepath)
            except OSError: pass ### Best-effort cleanup of the downloaded archive

    fn=filepath(string.replace(gz_filepath,'.tar.gz','')); count=0; header_skipped=False
    target_file = open(fn,'rU')
    try:
        for line in target_file: ### Stream the file instead of loading every line into memory
            if not header_skipped: header_skipped=True; continue
            t = string.split(cleanUpLine(line),'\t')
            ### Expected columns: miRNA, ensembl_transcript_id, target_start, target_end, miRNA_seq, alignment, target_seq, algorithm, c1, c2, c3
            if len(t) != 11: continue ### Bad formatting
            miRNA = t[0]; ensembl_transcript_id = t[1]; target_seq = t[6]; algorithm = t[7]
            if ensembl_transcript_id not in ens_gene_to_transcript: continue
            geneids = ens_gene_to_transcript[ensembl_transcript_id]
            target_seq = string.upper(string.replace(target_seq,'-',''))
            target_seq = string.replace(target_seq,'U','T')
            for ensembl_geneid in geneids:
                if parse_sequences == 'yes':
                    if (miRNA,ensembl_geneid) in combined_results:
                        combined_results[(miRNA,ensembl_geneid)].append(target_seq)
                else:
                    y = MicroRNATargetData(ensembl_geneid,'',miRNA,target_seq,algorithm); count+=1
                    try: microRNA_target_db[miRNA].append(y)
                    except KeyError: microRNA_target_db[miRNA] = [y]
    finally: target_file.close()

    print('%s miRNA-target relationships added for mirnamap' % count)
    return count
Exemple #5
0
        for line in open(fn,'rU').xreadlines():
            counts+=1
            if counts>10: break
    except Exception:
        counts=0
    if species_name == 'counts': ### Used if the file cannot be downloaded from http://www.altanalyze.org
        return counts
    elif counts == 0:
        if species_name in filename: server_folder = species_name ### Folder equals species unless it is a universal file
        elif 'Mm' in filename: server_folder = 'Mm' ### For PicTar
        else: server_folder = 'all'
        print 'Downloading:',server_folder,filename
        update.downloadCurrentVersion(filename,server_folder,'txt')
    else:
        return counts
    
if __name__ == '__main__':
    ### Developer scratch entry point. Only the verifyFile call runs: the first sys.exit() ends the script,
    ### so every statement after it is unreachable as written.
    species = 'Hs'
    #species_full = 'Drosophila_melanogaster'
    filename = 'AltDatabase/ucsc/' + species + '/polyaDb.txt'
    verifyFile(filename, species) ### Makes sure file is local and if not downloads.
    sys.exit()

    importEnsExonStructureData(species, [], [], [])
    sys.exit()

    reformatPolyAdenylationCoordinates(species, 'no')
    sys.exit()

    #test = 'yes'
    #test_gene = ['ENSG00000140153','ENSG00000075413']
    import UCSCImport
    import update
    ### species_full is only assigned in the commented-out line above
    knownAlt_dir = update.getFTPData(
        'hgdownload.cse.ucsc.edu',
        '/goldenPath/currentGenomes/' + species_full + '/database',
        'knownAlt.txt.gz')
    polyA_dir = update.getFTPData(
        'hgdownload.cse.ucsc.edu',
        '/goldenPath/currentGenomes/' + species_full + '/database',
        'polyaDb.txt.gz')
    output_dir = 'AltDatabase/ucsc/' + species + '/'
    UCSCImport.downloadFiles(knownAlt_dir, output_dir)
    UCSCImport.downloadFiles(polyA_dir, output_dir)
    sys.exit()

    ensembl_ucsc_splicing_annotations = importEnsExonStructureData(
        species, ensembl_gene_coordinates, ensembl_annotations, exon_annotation_db)
Exemple #6
0
def importUCSCTranscriptSequences(species, array_type, probeset_seq_db):
    """ Match UCSC mRNA sequences against probeset sequences and export the hits.

    species -- key used to look up the full species name and to build the AltDatabase/<species>/SequenceData paths
    array_type -- prefix of the two output file names
    probeset_seq_db -- dictionary keyed by gene ID; each value is handed to simpleSeqMatchProtocol

    'force' is not a parameter: it is resolved outside this function. When it equals 'yes', mrna.fa.gz is
    requested from the UCSC FTP site before parsing. Every fasta record of mrna.fa whose accession is present in
    the importUCSCTranscriptAssociations mapping, and which has at least one associated gene in probeset_seq_db,
    is matched with simpleSeqMatchProtocol: each (call, probeset) result is written to
    <array_type>_UCSC-mRNA_alignments.txt and the upper-cased sequence to <array_type>_UCSC_mRNA_seqmatches.txt.
    Nothing is returned. """
    start_time = time.time()

    if force == 'yes':
        ### Download mRNA sequence file from website
        import UI
        species_full = string.replace(UI.getSpeciesInfo()[species], ' ', '_')
        ucsc_mRNA_dir = update.getFTPData(
            'hgdownload.cse.ucsc.edu',
            '/goldenPath/currentGenomes/' + species_full + '/bigZips',
            'mrna.fa.gz')
        output_dir = 'AltDatabase/' + species + '/SequenceData/'
        try:
            gz_filepath, status = update.download(ucsc_mRNA_dir, output_dir, '')
            if status == 'not-removed':
                try:
                    os.remove(gz_filepath)
                except OSError:
                    pass  ### Best-effort cleanup of the downloaded archive
        except Exception:
            pass  ### Occurs when file is not available for this species

    filename = 'AltDatabase/' + species + '/SequenceData/mrna.fa'
    output_root = 'AltDatabase/' + species + '/SequenceData/output/'
    dataw = export.ExportFile(output_root + array_type + '_UCSC-mRNA_alignments.txt')
    datar = export.ExportFile(output_root + 'sequences/' + array_type + '_UCSC_mRNA_seqmatches.txt')

    ucsc_mrna_to_gene = importUCSCTranscriptAssociations(species)

    def exportRecordMatches(accession, seq_lines):
        ### Export the probeset matches for one completed fasta record
        if accession is None or len(seq_lines) == 0:
            return  ### No header or no sequence read yet
        if accession not in ucsc_mrna_to_gene:
            return
        mRNA_seq = string.upper(string.join(seq_lines, ''))
        gene_found = False
        try:
            for ens_gene in ucsc_mrna_to_gene[accession]:
                if ens_gene in probeset_seq_db:
                    gene_found = True
                    results = simpleSeqMatchProtocol(probeset_seq_db[ens_gene], mRNA_seq)
                    for (call, probeset) in results:
                        dataw.write(string.join([probeset, str(call), accession], '\t') + '\n')
            if gene_found:
                datar.write(string.join([accession, mRNA_seq], '\t') + '\n')
        except IndexError:
            pass  ### As before, an IndexError while matching only skips this record

    print('Begining generic fasta import of %s' % filename)
    #'>gnl|ENS|Mm#S10859962 Mus musculus 12 days embryo spinal ganglion cDNA /gb=AK051143 /gi=26094349 /ens=Mm.1 /len=2289']
    #'ATCGTGGTGTGCCCAGCTCTTCCAAGGACTGCTGCGCTTCGGGGCCCAGGTGAGTCCCGC'
    accession = None
    seq_lines = []
    fasta = open(filepath(filename), 'rU')
    try:
        for line in fasta:
            data = line.rstrip('\n')  ### Also keeps a final line that has no trailing newline
            if len(data) == 0 or data[0] == '#':
                continue
            if data[0] == '>':
                ### A new header completes the previous record
                exportRecordMatches(accession, seq_lines)
                accession = string.split(data, ' ')[0][1:]
                seq_lines = []
            else:
                seq_lines.append(data)
        exportRecordMatches(accession, seq_lines)  ### The last record has no following header line
    finally:
        fasta.close()
        dataw.close()
        datar.close()
    time_diff = int(time.time() - start_time)
    print('UCSC mRNA sequences analyzed in %d seconds' % time_diff)
Exemple #7
0
def importUCSCTranscriptSequences(species, array_type, probeset_seq_db):
    """Match UCSC mRNA sequences against probeset sequences and export the hits.

    species -- key used to look up the full species name and to build the AltDatabase/<species>/SequenceData paths
    array_type -- prefix of the two output file names
    probeset_seq_db -- dictionary keyed by gene ID; each value is handed to simpleSeqMatchProtocol

    'force' is not a parameter: it is resolved outside this function. When it equals 'yes', mrna.fa.gz is
    requested from the UCSC FTP site before parsing. Every fasta record of mrna.fa whose accession is present in
    the importUCSCTranscriptAssociations mapping, and which has at least one associated gene in probeset_seq_db,
    is matched with simpleSeqMatchProtocol: each (call, probeset) result is written to
    <array_type>_UCSC-mRNA_alignments.txt and the upper-cased sequence to <array_type>_UCSC_mRNA_seqmatches.txt.
    Nothing is returned.
    """
    start_time = time.time()

    if force == "yes":
        ### Download mRNA sequence file from website
        import UI

        species_full = string.replace(UI.getSpeciesInfo()[species], " ", "_")
        ucsc_mRNA_dir = update.getFTPData(
            "hgdownload.cse.ucsc.edu", "/goldenPath/currentGenomes/" + species_full + "/bigZips", "mrna.fa.gz"
        )
        output_dir = "AltDatabase/" + species + "/SequenceData/"
        try:
            gz_filepath, status = update.download(ucsc_mRNA_dir, output_dir, "")
            if status == "not-removed":
                try:
                    os.remove(gz_filepath)
                except OSError:
                    pass  ### Best-effort cleanup of the downloaded archive
        except Exception:
            pass  ### Occurs when file is not available for this species

    filename = "AltDatabase/" + species + "/SequenceData/mrna.fa"
    output_root = "AltDatabase/" + species + "/SequenceData/output/"
    dataw = export.ExportFile(output_root + array_type + "_UCSC-mRNA_alignments.txt")
    datar = export.ExportFile(output_root + "sequences/" + array_type + "_UCSC_mRNA_seqmatches.txt")

    ucsc_mrna_to_gene = importUCSCTranscriptAssociations(species)

    def exportRecordMatches(accession, seq_lines):
        ### Export the probeset matches for one completed fasta record
        if accession is None or len(seq_lines) == 0:
            return  ### No header or no sequence read yet
        if accession not in ucsc_mrna_to_gene:
            return
        mRNA_seq = string.upper(string.join(seq_lines, ""))
        gene_found = False
        try:
            for ens_gene in ucsc_mrna_to_gene[accession]:
                if ens_gene in probeset_seq_db:
                    gene_found = True
                    results = simpleSeqMatchProtocol(probeset_seq_db[ens_gene], mRNA_seq)
                    for (call, probeset) in results:
                        dataw.write(string.join([probeset, str(call), accession], "\t") + "\n")
            if gene_found:
                datar.write(string.join([accession, mRNA_seq], "\t") + "\n")
        except IndexError:
            pass  ### As before, an IndexError while matching only skips this record

    print("Begining generic fasta import of %s" % filename)
    #'>gnl|ENS|Mm#S10859962 Mus musculus 12 days embryo spinal ganglion cDNA /gb=AK051143 /gi=26094349 /ens=Mm.1 /len=2289']
    #'ATCGTGGTGTGCCCAGCTCTTCCAAGGACTGCTGCGCTTCGGGGCCCAGGTGAGTCCCGC'
    accession = None
    seq_lines = []
    fasta = open(filepath(filename), "rU")
    try:
        for line in fasta:
            data = line.rstrip("\n")  ### Also keeps a final line that has no trailing newline
            if len(data) == 0 or data[0] == "#":
                continue
            if data[0] == ">":
                ### A new header completes the previous record
                exportRecordMatches(accession, seq_lines)
                accession = string.split(data, " ")[0][1:]
                seq_lines = []
            else:
                seq_lines.append(data)
        exportRecordMatches(accession, seq_lines)  ### The last record has no following header line
    finally:
        fasta.close()
        dataw.close()
        datar.close()
    time_diff = int(time.time() - start_time)
    print("UCSC mRNA sequences analyzed in %d seconds" % time_diff)
if __name__ == '__main__':
    ### Developer scratch entry point. Only the verifyFile call runs: the first sys.exit() ends the script,
    ### so every statement after it is unreachable as written.
    species = 'Hs'  #species_full = 'Drosophila_melanogaster'
    filename = 'AltDatabase/ucsc/' + species + '/polyaDb.txt'
    verifyFile(filename, species)  ### Makes sure file is local and if not downloads.
    sys.exit()
    importEnsExonStructureData(species, [], [], []); sys.exit()
    reformatPolyAdenylationCoordinates(species, 'no'); sys.exit()
    #test = 'yes'
    #test_gene = ['ENSG00000140153','ENSG00000075413']
    import UCSCImport; import update
    ### species_full is only assigned in the commented-out text above
    knownAlt_dir = update.getFTPData('hgdownload.cse.ucsc.edu', '/goldenPath/currentGenomes/' + species_full + '/database', 'knownAlt.txt.gz')
    polyA_dir = update.getFTPData('hgdownload.cse.ucsc.edu', '/goldenPath/currentGenomes/' + species_full + '/database', 'polyaDb.txt.gz')
    output_dir = 'AltDatabase/ucsc/' + species + '/'
    UCSCImport.downloadFiles(knownAlt_dir, output_dir); UCSCImport.downloadFiles(polyA_dir, output_dir); sys.exit()
    ensembl_ucsc_splicing_annotations = importEnsExonStructureData(species, ensembl_gene_coordinates, ensembl_annotations, exon_annotation_db)