Ejemplo n.º 1
0
def geneCoordinates(species,symbols):
    genes=[]
    import EnsemblImport
    ensembl_annotation_db = EnsemblImport.reimportEnsemblAnnotations(species,symbolKey=True)
    for symbol in symbols:
        ens_geneid = ensembl_annotation_db[symbol]
        genes.append((ens_geneid,symbol))
    
    ### Get gene genomic locations
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array')
    search_locations=[]
    for (gene,symbol) in genes:
        chr,strand,start,end = gene_location_db[gene]
        if symbol == 'SRSF10': chr = 'chr1'; strand = '-'; start = '24295573'; end = '24306953'
        if len(chr)>6: print symbol, 'bad chromosomal reference:',chr
        else:
            search_locations.append([chr,strand,start,end,symbol])
Ejemplo n.º 2
0
def alignProbesetsToTranscripts(species,array_type,Analysis_type,Force, CoordinateBasedMatching = False):
    global force; force = Force; global analysis_type; analysis_type = Analysis_type
    global coordinateBasedMatching; coordinateBasedMatching = CoordinateBasedMatching
    """Match exon or junction probeset sequences to Ensembl and USCS mRNA transcripts"""
      
    if array_type == 'AltMouse' or array_type == 'junction' or array_type == 'RNASeq':
        data_type = 'junctions'; probeset_seq_file=''; biotype = 'gene'
        if data_type == 'junctions' and analysis_type == 'reciprocal':
            start_time = time.time() ### Indicates whether to store information at the level of genes or probesets
            probeset_seq_db,pairwise_probeset_combinations = importJunctionAnnotationDatabaseAndSequence(species,array_type,biotype)
            end_time = time.time(); time_diff = int(end_time-start_time)
        elif analysis_type == 'single':
            start_time = time.time()
            probeset_seq_db,pairwise_probeset_combinations = importAllJunctionSequences(species,array_type)
            end_time = time.time(); time_diff = int(end_time-start_time)
        print "Analyses finished in %d seconds" % time_diff
    elif array_type == 'exon':
        data_type = 'exon'
        probeset_annotations_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_probesets.txt'
        ###Import probe-level associations
        exon_db = importSplicingAnnotationDatabase(probeset_annotations_file)
        start_time = time.time()
        probeset_seq_db = importProbesetSequences(exon_db,species)
        end_time = time.time(); time_diff = int(end_time-start_time)
        print "Analyses finished in %d seconds" % time_diff

    ### Match probesets to mRNAs\=
    import EnsemblImport
    if coordinateBasedMatching == True and array_type == 'RNASeq':
        EnsemblImport.exportTranscriptExonIDAssociations(species)
        matchTranscriptExonIDsToJunctionIDs(species,array_type,probeset_seq_db) ### no sequences in probeset_seq_db, just junctionIDs
    else:
        #matchTranscriptExonIDsToJunctionIDs(species,array_type,probeset_seq_db) ### no sequences in probeset_seq_db, just junctionIDs
        importEnsemblTranscriptSequence(species,array_type,probeset_seq_db)
        try: mRNASeqAlign.importUCSCTranscriptSequences(species,array_type,probeset_seq_db)
        except Exception: pass ### If the species not supported by UCSC - the UCSC file is not written, but the other mRNA_alignments files should be available

    probeset_seq_db={} ### Re-set db

    ### Import results if junction array to make comparisons valid for junction-pairs rather than a single probeset    
    if data_type == 'junctions':
        ### Re-import matches from above and export matching and non-matching transcripts for each probeset to a new file
        import_dir = '/AltDatabase/'+species+'/SequenceData/output'
        g = GrabFiles(); g.setdirectory(import_dir)
        align_files = g.searchdirectory('mRNA_alignments')
        reAnalyzeRNAProbesetMatches(align_files,species,array_type,pairwise_probeset_combinations)
Ejemplo n.º 3
0
def findSpeciesInUniProtFiles(force):
    ### Download all UniProt annotation files and grab all species names, TaxIDs and corresponding URLs
    
    import AltAnalyze
    ###Get species annotations from the GO-Elite config
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo(); tax_db={}
    for species_full in species_annot_db:
        taxid=species_annot_db[species_full].TaxID()
        tax_db[taxid]=species_full

    if force == 'yes':
        ### Should only need to be run if UniProt changes it's species to file associations or new species supported by Ensembl
        import export; import update
        filesearch = '_sprot_'
        all_swissprot = update.getFTPData('ftp.expasy.org','/databases/uniprot/current_release/knowledgebase/taxonomic_divisions',filesearch)
        for file in all_swissprot:
            gz_filepath, status = update.download(file,'uniprot_temp/','')        
            if status == 'not-removed':
                try: os.remove(gz_filepath) ### Not sure why this works now and not before
                except OSError: status = status
            
    species_uniprot_db={}; altanalyze_species_uniprot_db={}
    dir=read_directory('/uniprot_temp')
    for filename in dir:    
        fn=filepath('uniprot_temp/'+filename)
        for line in open(fn,'r').xreadlines():
            data = cleanUpLine(line)
            if data[0:2] == 'OX':
                taxid = string.split(data,'=')[1][:-1]
                if taxid in tax_db:
                    species_full = tax_db[taxid]
            elif data[0:2] == 'OS':
                species = data[5:]
                species = string.split(species,' ')[:2]
                species_full = string.join(species,' ')
            elif data[0] == '/':
                url = 'ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/'+filename
                ss = string.split(species_full,' ')
                if len(ss)==2: ### Species name is in the format H**o sapiens - and '(' not in species_full and ')' not in species_full and '/' not in species_full
                    try: species_uniprot_db[species_full].append((taxid,'ftp://'+url+'.gz'))
                    except KeyError: species_uniprot_db[species_full] = [(taxid,'ftp://'+url+'.gz')]
                taxid = ''; species_full = ''
    import EnsemblImport
    species_uniprot_db = EnsemblImport.eliminate_redundant_dict_values(species_uniprot_db)
    ### Export all species to UniProt file relationships so this function needs to only be run once
    import export         
    up = export.ExportFile('Config/uniprot-species-file.txt')
    for species_full in species_uniprot_db:
        values = species_uniprot_db[species_full]
        if len(values)>1:
            found = 'no'
            for (taxid,url) in values:
                if taxid in tax_db:
                    if species_full == tax_db[taxid]: found='yes'; print 'ambiguity resolved:',species_full; break
                if found == 'yes': break
        else: (taxid,url) = values[0]
        up.write(string.join([species_full,taxid,url],'\t')+'\n')
    up.close()
Ejemplo n.º 4
0
def findSpeciesInUniProtFiles(force):
    ### Download all UniProt annotation files and grab all species names, TaxIDs and corresponding URLs
    
    import AltAnalyze
    ###Get species annotations from the GO-Elite config
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo(); tax_db={}
    for species_full in species_annot_db:
        taxid=species_annot_db[species_full].TaxID()
        tax_db[taxid]=species_full

    if force == 'yes':
        ### Should only need to be run if UniProt changes it's species to file associations or new species supported by Ensembl
        import export; import update
        filesearch = '_sprot_'
        all_swissprot = update.getFTPData('ftp.expasy.org','/databases/uniprot/current_release/knowledgebase/taxonomic_divisions',filesearch)
        for file in all_swissprot:
            gz_filepath, status = update.download(file,'uniprot_temp/','')        
            if status == 'not-removed':
                try: os.remove(gz_filepath) ### Not sure why this works now and not before
                except OSError: status = status
            
    species_uniprot_db={}; altanalyze_species_uniprot_db={}
    dir=read_directory('/uniprot_temp')
    for filename in dir:    
        fn=filepath('uniprot_temp/'+filename)
        for line in open(fn,'r').xreadlines():
            data = cleanUpLine(line)
            if data[0:2] == 'OX':
                taxid = string.split(data,'=')[1][:-1]
                if taxid in tax_db:
                    species_full = tax_db[taxid]
            elif data[0:2] == 'OS':
                species = data[5:]
                species = string.split(species,' ')[:2]
                species_full = string.join(species,' ')
            elif data[0] == '/':
                url = 'ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/'+filename
                ss = string.split(species_full,' ')
                if len(ss)==2: ### Species name is in the format H**o sapiens - and '(' not in species_full and ')' not in species_full and '/' not in species_full
                    try: species_uniprot_db[species_full].append((taxid,'ftp://'+url+'.gz'))
                    except KeyError: species_uniprot_db[species_full] = [(taxid,'ftp://'+url+'.gz')]
                taxid = ''; species_full = ''
    import EnsemblImport
    species_uniprot_db = EnsemblImport.eliminate_redundant_dict_values(species_uniprot_db)
    ### Export all species to UniProt file relationships so this function needs to only be run once
    import export         
    up = export.ExportFile('Config/uniprot-species-file.txt')
    for species_full in species_uniprot_db:
        values = species_uniprot_db[species_full]
        if len(values)>1:
            found = 'no'
            for (taxid,url) in values:
                if taxid in tax_db:
                    if species_full == tax_db[taxid]: found='yes'; print 'ambiguity resolved:',species_full; break
                if found == 'yes': break
        else: (taxid,url) = values[0]
        up.write(string.join([species_full,taxid,url],'\t')+'\n')
    up.close()
Ejemplo n.º 5
0
def geneCoordinates(species,symbols):
    genes=[]
    import EnsemblImport
    ensembl_annotation_db = EnsemblImport.reimportEnsemblAnnotations(species,symbolKey=True)
    for symbol in symbols:
        if symbol in ensembl_annotation_db:
            ens_geneid = ensembl_annotation_db[symbol]
            genes.append((ens_geneid,symbol))
        else:
            print symbol, 'not found'
    
    ### Get gene genomic locations
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array')
    search_locations=[]
    for (gene,symbol) in genes:
        chr,strand,start,end = gene_location_db[gene]
        #if symbol == 'SRSF10': chr = 'chr1'; strand = '-'; start = '24295573'; end = '24306953'
        if len(chr)>6: print symbol, 'bad chromosomal reference:',chr
        else:
            search_locations.append([chr,strand,start,end,symbol])
Ejemplo n.º 6
0
def exportChromosomeStrandCoordinates(species):
    import EnsemblImport
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(
        species, 'RNASeq', 'key_by_array')

    import ExpressionBuilder
    gene_biotype_db = ExpressionBuilder.importTranscriptBiotypeAnnotations(
        species)
    export_path = 'GenMAPPDBs/' + species + '/chr_gene_locations.txt'
    export_data = export.ExportFile(export_path)

    import ExonAnalyze_module
    gene_annotation_file = "AltDatabase/ensembl/" + species + "/" + species + "_Ensembl-annotations.txt"
    annotate_db = ExonAnalyze_module.import_annotations(
        gene_annotation_file, 'RNASeq')

    print 'Annotations for', len(gene_location_db), 'genes imported'

    sorted_list = []
    protein_coding = 0
    for gene in gene_location_db:
        chr, strand, start, end = gene_location_db[gene]
        if gene in gene_biotype_db:
            biotype = gene_biotype_db[gene][-1]
            if biotype == 'protein_coding': protein_coding += 1

        else: biotype = 'NA'
        if len(chr) < 7:
            sorted_list.append(
                [chr, strand, int(start),
                 int(end), gene, biotype])
        #else: print chr;sys.exit()
    print len(sorted_list), 'genes for typical chromosomes present'
    print protein_coding, 'protein coding genes present'
    sorted_list.sort()
    for values in sorted_list:
        chr, strand, start, end, gene, biotype = values
        try:
            symbol = annotate_db[gene].Symbol()
        except Exception:
            symbol = ''
        values = [gene, symbol, chr, strand, str(start), str(end), biotype]
        export_data.write(string.join(values, '\t') + '\n')
    export_data.close()
    print species, 'chromosome locations exported to:\n', export_path
Ejemplo n.º 7
0
def grabExonIntronPromoterSequences(species,array_type,data_type,output_types):
    """Export sequence files for every result file found in the sequence_input directory.

    species -- species code used to build the AltDatabase paths
    array_type -- array/platform name used to build the input and output paths
    data_type -- 'probeset' or 'gene'; selects which sequence_input directory is read
    output_types -- a single output type name, or 'all' (expanded to every
        supported type when data_type is 'probeset')

    For each input file the matching entries are re-imported, sequence data is
    attached, and exportExonIntronPromoterSequences is called once per
    requested output type. The output goes to the parallel sequence_output
    directory.
    """
    ### output_types could be adjacent intron sequences, adjacent exon sequences, targets exon sequence or promoter
    if data_type == 'probeset': sequence_input_dir = '/AltResults/AlternativeOutput/'+array_type+'/sequence_input'
    if data_type == 'gene': sequence_input_dir = '/ExpressionOutput/'+array_type+'/sequence_input'

    ### Resolve the requested output types once. Re-wrapping output_types inside the
    ### per-file loop nested the list one level deeper for every additional input file,
    ### which made the filename concatenation below fail from the second file on.
    if output_types == 'all' and data_type == 'probeset':
        requested_types = ['alt-promoter','promoter','exon','adjacent-exons','adjacent-introns']
    else: requested_types = [output_types]

    ### Paths that do not depend on the input file
    sequence_output_dir = string.replace(sequence_input_dir,'_input','_output')
    export_exon_filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_probesets.txt'
    gene_seq_filename = 'AltDatabase/ensembl/'+species+'/'+species+'_gene-seq-2000_flank'
    analysis_type = 'get_sequence'

    for input_file in read_directory(sequence_input_dir):
        filedir = sequence_input_dir[1:]+'/'+input_file
        filter_db = inputResultFiles(filedir,data_type)
        ensembl_probeset_db = ExonArrayEnsemblRules.reimportEnsemblProbesetsForSeqExtraction(export_exon_filename,data_type,filter_db)
        ensembl_probeset_db = EnsemblImport.import_sequence_data(gene_seq_filename,ensembl_probeset_db,species,analysis_type)

        for output_type in requested_types:
            ### input_file[:-4] drops the last four characters of the input file name
            filename = sequence_output_dir[1:]+'/'+input_file[:-4]+'-'+data_type+'-'+output_type+'.txt'
            exportExonIntronPromoterSequences(filename, ensembl_probeset_db,data_type,output_type)
Ejemplo n.º 8
0
def exportChromosomeStrandCoordinates(species):
    import EnsemblImport
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array')

    import ExpressionBuilder
    gene_biotype_db = ExpressionBuilder.importTranscriptBiotypeAnnotations(species)
    export_path = 'GenMAPPDBs/'+species+'/chr_gene_locations.txt'
    export_data = export.ExportFile(export_path)

    import ExonAnalyze_module
    gene_annotation_file = "AltDatabase/ensembl/"+species+"/"+species+"_Ensembl-annotations.txt"
    annotate_db = ExonAnalyze_module.import_annotations(gene_annotation_file,'RNASeq')
      
    print 'Annotations for',len(gene_location_db),'genes imported'
    
    sorted_list=[]; protein_coding=0 
    for gene in gene_location_db:
        chr,strand,start,end = gene_location_db[gene]
        if gene in gene_biotype_db:
            biotype = gene_biotype_db[gene][-1]
            if biotype == 'protein_coding': protein_coding+=1
                
        else: biotype = 'NA'
        if len(chr)<7:
            sorted_list.append([chr,strand,int(start),int(end),gene,biotype])
        #else: print chr;sys.exit()
    print len(sorted_list),'genes for typical chromosomes present'
    print protein_coding, 'protein coding genes present'
    sorted_list.sort()        
    for values in sorted_list:
        chr,strand,start,end,gene,biotype=values
        try: symbol = annotate_db[gene].Symbol()
        except Exception: symbol = ''
        values = [gene,symbol,chr,strand,str(start),str(end),biotype]
        export_data.write(string.join(values,'\t')+'\n')
    export_data.close()
    print species, 'chromosome locations exported to:\n',export_path
Ejemplo n.º 9
0
def reformatPolyAdenylationCoordinates(species,force):
    """ PolyA annotations are currently only available from UCSC for human, but flat file
    annotations from 2003-2006 are available for multiple species. Convert these to BED format"""
    version={}
    version['Rn'] = '2003(rn3)'
    version['Dr'] = '2003(zv4)'
    version['Gg'] = '2004(galGal2)'
    version['Hs'] = '2006(hg8)'
    version['Mm'] = '2004(mm5)'

    print 'Exporting polyADB_2 coordinates as BED for',species
    ### Obtain the necessary database files
    url = 'http://altanalyze.org/archiveDBs/all/polyAsite.txt'
    output_dir = 'AltDatabase/ucsc/'+species + '/'
    if force == 'yes':
        filename, status = update.download(url,output_dir,'')
    else: filename = output_dir+'polyAsite.txt'

    ### Import the refseq to Ensembl information
    import gene_associations; import OBO_import; import EnsemblImport; import export
    try:
        ens_unigene = gene_associations.getGeneToUid(species,'Ensembl-UniGene')
        print len(ens_unigene),'Ensembl-UniGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_unigene); use_entrez='no'
    except Exception:
        ens_entrez = gene_associations.getGeneToUid(species,'Ensembl-EntrezGene')
        print len(ens_entrez),'Ensembl-EntrezGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_entrez); use_entrez='yes'
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array')
    
    export_bedfile = output_dir+species+'_polyADB_2_predictions.bed'
    print 'exporting',export_bedfile
    export_data = export.ExportFile(export_bedfile)
    header = '#'+species+'\t'+'polyADB_2'+'\t'+version[species]+'\n'
    export_data.write(header)
    
    fn=filepath(filename); x=0; not_found={}
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        if x==0: x=1
        else:
            siteid,llid,chr,sitenum,position,supporting_EST,cleavage = string.split(data,'\t')
            if chr == 'chrM': chr = 'chrMT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if chr == 'M': chr = 'MT' ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if species in siteid:
                if 'NA' not in chr: chr = 'chr'+chr
                strand = '+'; geneid = siteid
                pos_start = str(int(position)-1); pos_end = position
                if use_entrez=='no':
                    external_geneid = string.join(string.split(siteid,'.')[:2],'.')
                else: external_geneid=llid
                if external_geneid in external_ensembl:
                    ens_geneid = external_ensembl[external_geneid][0]
                    geneid += '-'+ens_geneid
                    chr,strand,start,end = gene_location_db[ens_geneid]
                else:
                    not_found[external_geneid]=[]
                    bed_format = string.join([chr,pos_start,pos_end,geneid,'0','-'],'\t')+'\n' ### We don't know the strand, so write out both strands
                    export_data.write(bed_format)
                bed_format = string.join([chr,pos_start,pos_end,geneid,'0',strand],'\t')+'\n'
                export_data.write(bed_format)
    export_data.close()   
Ejemplo n.º 10
0
def alignProbesetsToTranscripts(species,
                                array_type,
                                Analysis_type,
                                Force,
                                CoordinateBasedMatching=False):
    global force
    force = Force
    global analysis_type
    analysis_type = Analysis_type
    global coordinateBasedMatching
    coordinateBasedMatching = CoordinateBasedMatching
    """Match exon or junction probeset sequences to Ensembl and USCS mRNA transcripts"""

    if array_type == 'AltMouse' or array_type == 'junction' or array_type == 'RNASeq':
        data_type = 'junctions'
        probeset_seq_file = ''
        biotype = 'gene'
        if data_type == 'junctions' and analysis_type == 'reciprocal':
            start_time = time.time(
            )  ### Indicates whether to store information at the level of genes or probesets
            probeset_seq_db, pairwise_probeset_combinations = importJunctionAnnotationDatabaseAndSequence(
                species, array_type, biotype)
            end_time = time.time()
            time_diff = int(end_time - start_time)
        elif analysis_type == 'single':
            start_time = time.time()
            probeset_seq_db, pairwise_probeset_combinations = importAllJunctionSequences(
                species, array_type)
            end_time = time.time()
            time_diff = int(end_time - start_time)
        print "Analyses finished in %d seconds" % time_diff
    elif array_type == 'exon':
        data_type = 'exon'
        probeset_annotations_file = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_probesets.txt'
        ###Import probe-level associations
        exon_db = importSplicingAnnotationDatabase(probeset_annotations_file)
        start_time = time.time()
        probeset_seq_db = importProbesetSequences(exon_db, species)
        end_time = time.time()
        time_diff = int(end_time - start_time)
        print "Analyses finished in %d seconds" % time_diff

    ### Match probesets to mRNAs\=
    import EnsemblImport
    if coordinateBasedMatching == True and array_type == 'RNASeq':
        EnsemblImport.exportTranscriptExonIDAssociations(species)
        matchTranscriptExonIDsToJunctionIDs(
            species, array_type, probeset_seq_db
        )  ### no sequences in probeset_seq_db, just junctionIDs
    else:
        #matchTranscriptExonIDsToJunctionIDs(species,array_type,probeset_seq_db) ### no sequences in probeset_seq_db, just junctionIDs
        importEnsemblTranscriptSequence(species, array_type, probeset_seq_db)
        try:
            mRNASeqAlign.importUCSCTranscriptSequences(species, array_type,
                                                       probeset_seq_db)
        except Exception:
            pass  ### If the species not supported by UCSC - the UCSC file is not written, but the other mRNA_alignments files should be available

    probeset_seq_db = {}  ### Re-set db

    ### Import results if junction array to make comparisons valid for junction-pairs rather than a single probeset
    if data_type == 'junctions':
        ### Re-import matches from above and export matching and non-matching transcripts for each probeset to a new file
        import_dir = '/AltDatabase/' + species + '/SequenceData/output'
        g = GrabFiles()
        g.setdirectory(import_dir)
        align_files = g.searchdirectory('mRNA_alignments')
        reAnalyzeRNAProbesetMatches(align_files, species, array_type,
                                    pairwise_probeset_combinations)
Ejemplo n.º 11
0
def reformatPolyAdenylationCoordinates(species, force):
    """ PolyA annotations are currently only available from UCSC for human, but flat file
    annotations from 2003-2006 are available for multiple species. Convert these to BED format"""
    version = {}
    version['Rn'] = '2003(rn3)'
    version['Dr'] = '2003(zv4)'
    version['Gg'] = '2004(galGal2)'
    version['Hs'] = '2006(hg8)'
    version['Mm'] = '2004(mm5)'

    print 'Exporting polyADB_2 coordinates as BED for', species
    ### Obtain the necessary database files
    url = 'http://altanalyze.org/archiveDBs/all/polyAsite.txt'
    output_dir = 'AltDatabase/ucsc/' + species + '/'
    if force == 'yes':
        filename, status = update.download(url, output_dir, '')
    else:
        filename = output_dir + 'polyAsite.txt'

    ### Import the refseq to Ensembl information
    import gene_associations
    import OBO_import
    import EnsemblImport
    import export
    try:
        ens_unigene = gene_associations.getGeneToUid(species,
                                                     'Ensembl-UniGene')
        print len(ens_unigene), 'Ensembl-UniGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_unigene)
        use_entrez = 'no'
    except Exception:
        ens_entrez = gene_associations.getGeneToUid(species,
                                                    'Ensembl-EntrezGene')
        print len(ens_entrez), 'Ensembl-EntrezGene entries imported'
        external_ensembl = OBO_import.swapKeyValues(ens_entrez)
        use_entrez = 'yes'
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(
        species, 'RNASeq', 'key_by_array')

    export_bedfile = output_dir + species + '_polyADB_2_predictions.bed'
    print 'exporting', export_bedfile
    export_data = export.ExportFile(export_bedfile)
    header = '#' + species + '\t' + 'polyADB_2' + '\t' + version[species] + '\n'
    export_data.write(header)

    fn = filepath(filename)
    x = 0
    not_found = {}
    for line in open(fn, 'rU').xreadlines():
        data = cleanUpLine(line)
        if x == 0: x = 1
        else:
            siteid, llid, chr, sitenum, position, supporting_EST, cleavage = string.split(
                data, '\t')
            if chr == 'chrM':
                chr = 'chrMT'  ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if chr == 'M':
                chr = 'MT'  ### MT is the Ensembl convention whereas M is the Affymetrix and UCSC convention
            if species in siteid:
                if 'NA' not in chr: chr = 'chr' + chr
                strand = '+'
                geneid = siteid
                pos_start = str(int(position) - 1)
                pos_end = position
                if use_entrez == 'no':
                    external_geneid = string.join(
                        string.split(siteid, '.')[:2], '.')
                else:
                    external_geneid = llid
                if external_geneid in external_ensembl:
                    ens_geneid = external_ensembl[external_geneid][0]
                    geneid += '-' + ens_geneid
                    chr, strand, start, end = gene_location_db[ens_geneid]
                else:
                    not_found[external_geneid] = []
                    bed_format = string.join(
                        [chr, pos_start, pos_end, geneid, '0', '-'], '\t'
                    ) + '\n'  ### We don't know the strand, so write out both strands
                    export_data.write(bed_format)
                bed_format = string.join(
                    [chr, pos_start, pos_end, geneid, '0', strand],
                    '\t') + '\n'
                export_data.write(bed_format)
    export_data.close()
def getAnnotations(Species,array_type,reannotate_exon_seq,force):
    """Annotate critical exons for an array type using Ensembl data (sync'ed to genome release).

    Species -- species code; published as the module-level global `species`
    array_type -- array/platform name used in the export paths
    reannotate_exon_seq -- when 'yes', JunctionArray.reAnnotateCriticalExonSequences is run at the end
    force -- passed through to importCriticalExonLocations

    The exported <species>_Ensembl_probesets.txt file is copied to a name that
    includes array_type and the original is removed.
    """
    ### Publish the species and the test settings as module-level globals
    global species; species = Species; global test; global test_cluster
    test = 'no'; test_cluster = ['TC0701360']; data_type = 'mRNA'

    ### NOTE(review): ensembl_exon_db is declared global twice below - possibly a different name was intended; confirm
    global ensembl_exon_db; global ensembl_exon_db; global exon_clusters; global exon_region_db
    ensembl_exon_db,ensembl_annot_db,exon_clusters,intron_clusters,exon_region_db,intron_retention_db,ucsc_splicing_annot_db,ens_transcript_db = EnsemblImport.getEnsemblAssociations(species,data_type,test)
    ensembl_probeset_db = importCriticalExonLocations(species,array_type,ensembl_exon_db,force) ###Get Pre-computed genomic locations for critical exons
    ensembl_probeset_db = ExonArrayEnsemblRules.annotateExons(ensembl_probeset_db,exon_clusters,ensembl_exon_db,exon_region_db,intron_retention_db,intron_clusters,ucsc_splicing_annot_db); constitutive_gene_db={}
    ExonArrayEnsemblRules.exportEnsemblLinkedProbesets(array_type,ensembl_probeset_db,species)
    print "\nCritical exon data exported coordinates, exon associations and splicing annotations exported..."
    
    ### Change filenames to reflect junction array type
    export_filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_probesets.txt'; ef=filepath(export_filename)
    ### Insert the array type in front of '_probe' to form the replacement file name
    export_replacement = string.replace(export_filename,'_probe','_'+array_type+'_probe')
    er=filepath(export_replacement); shutil.copyfile(ef,er); os.remove(ef) ### Copy file to a new name

    ### Export full exon sequence for probesets/critical exons to replace the original incomplete sequence (used for miRNA analyses)
    if reannotate_exon_seq == 'yes':
        JunctionArray.reAnnotateCriticalExonSequences(species,array_type)