예제 #1
0
def buildAltMouseExonAnnotations(species,array_type,force,genomic_build):
    """Code required to:
    1) Extract out Affymetrix provided exon sequence (probeset sequence extracted from "probeset_sequence_reversed.txt", derived
       directly from the Affymetrix AltMouse probe annotation file), from the "SEQUENCE-transcript-dbase.txt" (built using
       dump-chip1 .gff sequence and AltMerge-Peptide Informatics script "sequence_analysis_AltMouse_refseq.py").
    2) Once exported, grab full length exon sequences using exon/intron coordinates matches to full-length gene sequences with 2kb
       flanking sequence to efficiently predict microRNA binding site exclusion (reAnnotateCriticalExonSequences) and later for
       coordinate mapping to get exons aligning with UCSC annotated splicing annotations and exons. This sequence data replaced
       the previous file (don't need to re-run this - see rederive_exonseq == 'yes' below for reference).
    3) Match the updated exon sequences to the most recent genomic coordinates and build the exact equivalent of the exon array
       Mm_Ensembl_probeset.txt database (same structure and ExonArrayEnsemblRules.py code). This involves running EnsemblImport.
    This code should be run before the exon array location build code since the "Mm_Ensembl_probeset.txt" is created and then re-
    written as "Mm_AltMouse_Ensembl_probeset.txt".
    """
    
    import JunctionArray
    import JunctionArrayEnsemblRules    
    rederive_exonseq = 'no'
    ### Only needs to be run once, to export exon sequence for AltMouse array the original (1 and 2 above)
    if rederive_exonseq == 'yes':
        import AltAnalyze
        import ExonAnnotate_module
        import ExonAnalyze_module
        agglomerate_inclusion_probesets = 'no'; onlyAnalyzeJunctions='no'
        probeset_annotations_file = "AltDatabase/"+species+"/"+array_type+"/"+"MASTER-probeset-transcript.txt"
        verifyFile(probeset_annotations_file,array_type) ### Will force download if missing
        exon_db={}; filtered_arrayids={};filter_status='no'
        constituitive_probeset_db,exon_db,genes_being_analyzed = AltAnalyze.importSplicingAnnotationDatabase(probeset_annotations_file,array_type,filtered_arrayids,filter_status)
        alt_junction_db,critical_exon_db,exon_dbase,exon_inclusion_db,exon_db = ExonAnnotate_module.identifyPutativeSpliceEvents(exon_db,constituitive_probeset_db,{},agglomerate_inclusion_probesets,onlyAnalyzeJunctions)
        ExonAnnotate_module.exportJunctionComparisons(alt_junction_db,critical_exon_db,exon_dbase)
        print "Finished exporting junctions used in AltMouse array comparisons."

        ExonAnalyze_module.exportAltMouseExonSequence()
        JunctionArray.reAnnotateCriticalExonSequences(species,array_type)

    ### Get UCSC associations (download databases if necessary)
    mRNA_Type = 'mrna'; run_from_scratch = 'yes'
    export_all_associations = 'no' ### YES only for protein prediction analysis
    buildUCSCAnnoationFiles(species,mRNA_Type,export_all_associations,run_from_scratch,force)

    reannotate_exon_seq = 'yes'
    print 'genomic_build', genomic_build
    if genomic_build == 'new':
        ### Need to run with every new genomic build (match up new coordinates
        print "Begining to derive exon sequence from new genomic build"
        JunctionArray.identifyCriticalExonLocations(species,array_type)
        reannotate_exon_seq = 'yes'
    JunctionArrayEnsemblRules.getAnnotations(species,array_type,reannotate_exon_seq,force)
    
    ### Download files required during AltAnalyze analysis but not during the database build process
    filename = "AltDatabase/"+species+"/"+array_type+"/"+"MASTER-probeset-transcript.txt"
    verifyFile(filename,array_type) ### Will force download if missing
    filename = "AltDatabase/"+species+'/'+ array_type+'/'+array_type+"_annotations.txt"
    verifyFile(filename,array_type) ### Will force download if missing
예제 #2
0
def import_annotations(filename, array_type):
    """Read a tab-delimited gene annotation file into a dictionary.

    The first row of the file is treated as a header and skipped. Every other
    row becomes one ExonAnalyze_module.GeneAnnotationData object.

    filename: annotation file; resolved through filepath() before opening
    array_type: 'AltMouse' rows hold
                    gene, description, ll_id, symbol[, rna_processing_annot];
                any other value expects
                    ensembl, description, symbol[, rna_processing_annot]

    Returns a dictionary keyed by the first column (gene ID) of each row.
    Raises ValueError for a row with an unexpected number of columns, or (for
    AltMouse) a description that does not contain exactly two double quotes.
    """
    import ExonAnalyze_module
    fn = filepath(filename)
    annotate_db = {}
    is_altmouse = array_type == 'AltMouse'
    annotation_file = open(fn, 'rU')
    ### try/finally so the handle is released even when a row fails to parse
    try:
        for row_number, line in enumerate(annotation_file):
            data = cleanUpLine(line)
            if row_number == 0: continue  ### header row
            fields = string.split(data, '\t')
            if is_altmouse:
                try:
                    affygene, description, ll_id, symbol, rna_processing_annot = fields
                except ValueError:
                    ### The trailing annotation column is optional
                    affygene, description, ll_id, symbol = fields
                if '"' in description:
                    ### Keep only the text between the pair of double quotes
                    null, description, null = string.split(description, '"')
                ### NOTE(review): the RNA processing annotation has always been blanked for
                ### AltMouse, even when the column is present - confirm this is intended.
                rna_processing_annot = ''
                annotate_db[affygene] = ExonAnalyze_module.GeneAnnotationData(
                    affygene, description, symbol, ll_id, rna_processing_annot)
            else:
                rna_processing_annot = ''
                try:
                    ensembl, description, symbol, rna_processing_annot = fields
                except ValueError:
                    ### The trailing annotation column is optional
                    ensembl, description, symbol = fields
                ### The Ensembl ID is passed in both the gene and external ID positions
                annotate_db[ensembl] = ExonAnalyze_module.GeneAnnotationData(
                    ensembl, description, symbol, ensembl, rna_processing_annot)
    finally:
        annotation_file.close()
    return annotate_db
예제 #3
0
def exportChromosomeStrandCoordinates(species):
    import EnsemblImport
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(
        species, 'RNASeq', 'key_by_array')

    import ExpressionBuilder
    gene_biotype_db = ExpressionBuilder.importTranscriptBiotypeAnnotations(
        species)
    export_path = 'GenMAPPDBs/' + species + '/chr_gene_locations.txt'
    export_data = export.ExportFile(export_path)

    import ExonAnalyze_module
    gene_annotation_file = "AltDatabase/ensembl/" + species + "/" + species + "_Ensembl-annotations.txt"
    annotate_db = ExonAnalyze_module.import_annotations(
        gene_annotation_file, 'RNASeq')

    print 'Annotations for', len(gene_location_db), 'genes imported'

    sorted_list = []
    protein_coding = 0
    for gene in gene_location_db:
        chr, strand, start, end = gene_location_db[gene]
        if gene in gene_biotype_db:
            biotype = gene_biotype_db[gene][-1]
            if biotype == 'protein_coding': protein_coding += 1

        else: biotype = 'NA'
        if len(chr) < 7:
            sorted_list.append(
                [chr, strand, int(start),
                 int(end), gene, biotype])
        #else: print chr;sys.exit()
    print len(sorted_list), 'genes for typical chromosomes present'
    print protein_coding, 'protein coding genes present'
    sorted_list.sort()
    for values in sorted_list:
        chr, strand, start, end, gene, biotype = values
        try:
            symbol = annotate_db[gene].Symbol()
        except Exception:
            symbol = ''
        values = [gene, symbol, chr, strand, str(start), str(end), biotype]
        export_data.write(string.join(values, '\t') + '\n')
    export_data.close()
    print species, 'chromosome locations exported to:\n', export_path
예제 #4
0
def exportChromosomeStrandCoordinates(species):
    import EnsemblImport
    gene_location_db = EnsemblImport.getEnsemblGeneLocations(species,'RNASeq','key_by_array')

    import ExpressionBuilder
    gene_biotype_db = ExpressionBuilder.importTranscriptBiotypeAnnotations(species)
    export_path = 'GenMAPPDBs/'+species+'/chr_gene_locations.txt'
    export_data = export.ExportFile(export_path)

    import ExonAnalyze_module
    gene_annotation_file = "AltDatabase/ensembl/"+species+"/"+species+"_Ensembl-annotations.txt"
    annotate_db = ExonAnalyze_module.import_annotations(gene_annotation_file,'RNASeq')
      
    print 'Annotations for',len(gene_location_db),'genes imported'
    
    sorted_list=[]; protein_coding=0 
    for gene in gene_location_db:
        chr,strand,start,end = gene_location_db[gene]
        if gene in gene_biotype_db:
            biotype = gene_biotype_db[gene][-1]
            if biotype == 'protein_coding': protein_coding+=1
                
        else: biotype = 'NA'
        if len(chr)<7:
            sorted_list.append([chr,strand,int(start),int(end),gene,biotype])
        #else: print chr;sys.exit()
    print len(sorted_list),'genes for typical chromosomes present'
    print protein_coding, 'protein coding genes present'
    sorted_list.sort()        
    for values in sorted_list:
        chr,strand,start,end,gene,biotype=values
        try: symbol = annotate_db[gene].Symbol()
        except Exception: symbol = ''
        values = [gene,symbol,chr,strand,str(start),str(end),biotype]
        export_data.write(string.join(values,'\t')+'\n')
    export_data.close()
    print species, 'chromosome locations exported to:\n',export_path
예제 #5
0
def displayExpressionGraph(species,
                           Platform,
                           exp_file,
                           gene,
                           transpose,
                           display=True,
                           showIntrons=False,
                           analysisType='plot'):
    ### Get gene annotations (users can provide an Ensembl or symbol)
    print 'Importing exon-level expression data for visualization (be patient)...'
    import ExonAnalyze_module
    global platform
    platform = Platform
    if platform != 'AltMouse':
        gene_annotation_file = "AltDatabase/ensembl/" + species + "/" + species + "_Ensembl-annotations.txt"
    else:
        gene_annotation_file = "AltDatabase/" + species + "/" + platform + "/" + platform + "_gene_annotations.txt"

    genes = []
    gene = string.replace(gene, '|', ',')
    gene = string.replace(gene, ' ', ',')
    if ',' in gene:
        genes += string.split(gene, ',')
    else:
        genes.append(gene)
    gene_db = {}
    for gene in genes:
        try:
            if 'ENS' in gene:
                try:
                    annotate_db  ### If variable is defined
                except Exception:
                    annotate_db = ExonAnalyze_module.import_annotations(
                        gene_annotation_file, platform,
                        keyBySymbol=False)  ### Make an SQLite call
                gene_symbol = annotate_db[gene].Symbol()
            else:
                try:
                    annotate_db  ### If variable is defined
                except Exception:
                    annotate_db = ExonAnalyze_module.import_annotations(
                        gene_annotation_file, platform, keyBySymbol=True)
                gene_symbol = gene
                gene = annotate_db[gene].GeneID()
            gene_db[gene] = gene_symbol
        except Exception:
            print gene, 'not in database'

    if len(gene_db) == 0:
        force_no_gene_found_error
    if 'AltResults' in exp_file:
        root_dir = string.split(exp_file, 'AltResults')[0] + 'ExonPlots/'
    else:
        root_dir = string.split(exp_file, 'ExpressionInput')[0] + 'ExonPlots/'

    import ExonAnalyze_module
    if platform == 'RNASeq': datatype = 'exons'
    else: datatype = 'probesets'
    export_exon_filename = 'AltDatabase/' + species + '/' + platform + '/' + species + '_Ensembl_' + datatype + '.txt'
    if verifyFileLength(export_exon_filename) == 0:
        rootdir = string.replace(root_dir, 'ExonPlots/', '')
        export_exon_filename = rootdir + '/' + export_exon_filename

    import ExonArrayEnsemblRules
    ensembl_exon_db = ExonArrayEnsemblRules.reimportEnsemblProbesetsForSeqExtraction(
        export_exon_filename, 'gene-probesets',
        gene_db)  ### Make an SQLite call

    filter_db = {}
    for gene in ensembl_exon_db:
        ensembl_exon_db[gene].sort()
        for (index, ed, id) in ensembl_exon_db[gene]:
            filter_db[id] = []

    try:
        os.mkdir(root_dir)
    except Exception:
        None  ### dir exists
    print 'Image results being saved to the folder "ExonPlots" in the AltAnalyze results directory.'
    importTableEntries(exp_file,
                       filter_db,
                       ensembl_exon_db,
                       gene_db,
                       root_dir,
                       transpose,
                       display,
                       showIntrons,
                       analysisType=analysisType)  ### Make an SQLite call
예제 #6
0
파일: QC.py 프로젝트: venkatmi/altanalyze
def displayExpressionGraph(species,Platform,exp_file,gene,transpose,display=True,showIntrons=False,analysisType='plot'):
    ### Get gene annotations (users can provide an Ensembl or symbol)
    print 'Importing exon-level expression data for visualization (be patient)...'
    import ExonAnalyze_module
    global platform
    platform = Platform
    if platform != 'AltMouse': gene_annotation_file = "AltDatabase/ensembl/"+species+"/"+species+"_Ensembl-annotations.txt"
    else: gene_annotation_file = "AltDatabase/"+species+"/"+platform+"/"+platform+"_gene_annotations.txt"

    genes=[]
    gene=string.replace(gene,'|',',')
    gene=string.replace(gene,' ',',')
    if ',' in gene:
        genes += string.split(gene,',')
    else: genes.append(gene)    
    gene_db={}
    for gene in genes:
        try:
            if 'ENS' in gene:
                try: annotate_db ### If variable is defined
                except Exception:
                    annotate_db = ExonAnalyze_module.import_annotations(gene_annotation_file,platform,keyBySymbol=False) ### Make an SQLite call
                gene_symbol = annotate_db[gene].Symbol()
            else:
                try: annotate_db ### If variable is defined
                except Exception:
                    annotate_db = ExonAnalyze_module.import_annotations(gene_annotation_file,platform,keyBySymbol=True)
                gene_symbol = gene
                gene = annotate_db[gene].GeneID()
            gene_db[gene]=gene_symbol
        except Exception:
            #if len(gene)>0: print gene, 'not in database'
            pass
        
    if len(gene_db)==0:
        force_no_gene_found_error
    if 'AltResults' in exp_file:
        root_dir = string.split(exp_file,'AltResults')[0]+'ExonPlots/'
    else:
        root_dir = string.split(exp_file,'ExpressionInput')[0]+'ExonPlots/'
        
    import ExonAnalyze_module
    if platform == 'RNASeq': datatype = 'exons'
    else: datatype = 'probesets'
    export_exon_filename = 'AltDatabase/'+species+'/'+platform+'/'+species+'_Ensembl_'+datatype+'.txt'
    if verifyFileLength(export_exon_filename) == 0:
        rootdir = string.replace(root_dir,'ExonPlots/','')
        export_exon_filename = rootdir+'/'+export_exon_filename
    
    import ExonArrayEnsemblRules
    ensembl_exon_db = ExonArrayEnsemblRules.reimportEnsemblProbesetsForSeqExtraction(export_exon_filename,'gene-probesets',gene_db) ### Make an SQLite call
    
    filter_db = {}
    for gene in ensembl_exon_db:
        ensembl_exon_db[gene].sort()
        
        for (index,ed,id) in ensembl_exon_db[gene]:
            filter_db[id] = []
            
    try: os.mkdir(root_dir)
    except Exception: None ### dir exists
    print 'Image results being saved to the folder "ExonPlots" in the AltAnalyze results directory.'
    importTableEntries(exp_file,filter_db,ensembl_exon_db,gene_db,root_dir,transpose,display,showIntrons,analysisType=analysisType) ### Make an SQLite call
예제 #7
0
def alignAllDomainsToTranscripts(species,platform):
    """ This function is only run during the database build process to create files available for subsequent download.
    This recapitulates several functions executed during the database build process but does so explicitely for each
    isoform with the goal of obtained genomic coordinates of each protein feature post de novo sequence alignment.
    This includes all Ensembl proteins, UCSC mRNAs and in silico translated RNAs """
    
    ### Import all transcript to gene associations for Ensembl and UCSC transcripts
    global gene_transcript_db
    gene_transcript_db={}
    option = 'transcript'
    print 'Importing transcript data into memory'
    filename = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_transcript-annotations.txt'
    importEnsExonStructureData(filename,option)
    filename = 'AltDatabase/ucsc/'+species+'/'+species+'_UCSC_transcript_structure_mrna.txt'
    try: importEnsExonStructureData(filename,option)
    except Exception: None ### Not available for all species - needs to be built prior to transcript model creation
    
    import FeatureAlignment
    ucsc_transcripts={}
    gene_db = {}
    gene_transcript_db = FeatureAlignment.eliminateRedundant(gene_transcript_db)
    for gene in gene_transcript_db:
        for (ac,type) in gene_transcript_db[gene]:
            if type != 'Ensembl':
                ucsc_transcripts[ac]=[] ### Store all the untranslated UCSC mRNAs
        gene_db[gene] = [gene] ### mimics the necessary structure for FeatureAlignment
    ### Identify untranslated Ensembl transcripts
    
    print 'Importing Ensembl transcript to protein'
    ens_transcript_protein_db = importEnsemblTranscriptAssociations(species)
    
    ### Import protein ID and protein sequence into a dictionary
    #global protein_sequence_db
    #protein_sequence_db = FeatureAlignment.remoteEnsemblProtSeqImport(species) ### All Ensembl protein sequences
    
    """This code imports all protein sequences (NCBI, Ensembl, in silico translated) associated with optimal isoform pairs,
    however, not all isoforms analyzed in the database are here, hence, this should be considered a subset of in silico
    translated Ensembl mRNAs, UCSC ,RNAs, and known analyzed UCSC proteins"""
    #ucsc_transcripts={}
    #ucsc_transcripts['BC065499']=[]
    #ucsc_transcripts['AK309510']=[] ### in silico translated
    #ens_transcript_protein_db={}
    ### Download or translate ANY AND ALL mRNAs considered by AltAnalyze via in silico translation
    import IdentifyAltIsoforms
    analysis_type = 'fetch_new' # analysis_type = 'fetch' ???

    #IdentifyAltIsoforms.remoteTranslateRNAs(species,ucsc_transcripts,ens_transcript_protein_db,analysis_type)
    ### Derive all protein ID, domain and genomic coordinate data from Ensembl and UniProt
    """ This data is available for Ensembl and UniProt isoforms but we re-derive the associations based on sequence for completeness """

    ### Get the domain sequences and genomic coordinates
    """
    # for testing
    gt = {}; y=0
    for gene in gene_db:
        if y < 20:
            gt[gene] = gene_db[gene]
        else: break
        y+=1
    """
    protein_ft_db,domain_gene_counts = FeatureAlignment.grab_exon_level_feature_calls(species,platform,gene_db)
    import ExonAnalyze_module
    seq_files, mRNA_protein_seq_db = IdentifyAltIsoforms.importProteinSequences(species,'getSequence') ### Import all available protein sequences (downloaded or in silico)
    coordinate_type = 'genomic'; coordinate_type = 'protein'
    ExonAnalyze_module.getFeatureIsoformGenomePositions(species,protein_ft_db,mRNA_protein_seq_db,gene_transcript_db,coordinate_type)
예제 #8
0
def buildAltMouseExonAnnotations(species, array_type, force, genomic_build):
    """Code required to:
    1) Extract out Affymetrix provided exon sequence (probeset sequence extracted from "probeset_sequence_reversed.txt", derived
       directly from the Affymetrix AltMouse probe annotation file), from the "SEQUENCE-transcript-dbase.txt" (built using
       dump-chip1 .gff sequence and AltMerge-Peptide Informatics script "sequence_analysis_AltMouse_refseq.py").
    2) Once exported, grab full length exon sequences using exon/intron coordinates matches to full-length gene sequences with 2kb
       flanking sequence to efficiently predict microRNA binding site exclusion (reAnnotateCriticalExonSequences) and later for
       coordinate mapping to get exons aligning with UCSC annotated splicing annotations and exons. This sequence data replaced
       the previous file (don't need to re-run this - see rederive_exonseq == 'yes' below for reference).
    3) Match the updated exon sequences to the most recent genomic coordinates and build the exact equivalent of the exon array
       Mm_Ensembl_probeset.txt database (same structure and ExonArrayEnsemblRules.py code). This involves running EnsemblImport.
    This code should be run before the exon array location build code since the "Mm_Ensembl_probeset.txt" is created and then re-
    written as "Mm_AltMouse_Ensembl_probeset.txt".
    """

    import JunctionArray
    import JunctionArrayEnsemblRules
    rederive_exonseq = 'no'
    ### Only needs to be run once, to export exon sequence for AltMouse array the original (1 and 2 above)
    if rederive_exonseq == 'yes':
        import AltAnalyze
        import ExonAnnotate_module
        import ExonAnalyze_module
        agglomerate_inclusion_probesets = 'no'
        onlyAnalyzeJunctions = 'no'
        probeset_annotations_file = "AltDatabase/" + species + "/" + array_type + "/" + "MASTER-probeset-transcript.txt"
        verifyFile(probeset_annotations_file,
                   array_type)  ### Will force download if missing
        exon_db = {}
        filtered_arrayids = {}
        filter_status = 'no'
        constituitive_probeset_db, exon_db, genes_being_analyzed = AltAnalyze.importSplicingAnnotationDatabase(
            probeset_annotations_file, array_type, filtered_arrayids,
            filter_status)
        alt_junction_db, critical_exon_db, exon_dbase, exon_inclusion_db, exon_db = ExonAnnotate_module.identifyPutativeSpliceEvents(
            exon_db, constituitive_probeset_db, {},
            agglomerate_inclusion_probesets, onlyAnalyzeJunctions)
        ExonAnnotate_module.exportJunctionComparisons(alt_junction_db,
                                                      critical_exon_db,
                                                      exon_dbase)
        print "Finished exporting junctions used in AltMouse array comparisons."

        ExonAnalyze_module.exportAltMouseExonSequence()
        JunctionArray.reAnnotateCriticalExonSequences(species, array_type)

    ### Get UCSC associations (download databases if necessary)
    mRNA_Type = 'mrna'
    run_from_scratch = 'yes'
    export_all_associations = 'no'  ### YES only for protein prediction analysis
    buildUCSCAnnoationFiles(species, mRNA_Type, export_all_associations,
                            run_from_scratch, force)

    reannotate_exon_seq = 'yes'
    print 'genomic_build', genomic_build
    if genomic_build == 'new':
        ### Need to run with every new genomic build (match up new coordinates
        print "Begining to derive exon sequence from new genomic build"
        JunctionArray.identifyCriticalExonLocations(species, array_type)
        reannotate_exon_seq = 'yes'
    JunctionArrayEnsemblRules.getAnnotations(species, array_type,
                                             reannotate_exon_seq, force)

    ### Download files required during AltAnalyze analysis but not during the database build process
    filename = "AltDatabase/" + species + "/" + array_type + "/" + "MASTER-probeset-transcript.txt"
    verifyFile(filename, array_type)  ### Will force download if missing
    filename = "AltDatabase/" + species + '/' + array_type + '/' + array_type + "_annotations.txt"
    verifyFile(filename, array_type)  ### Will force download if missing
예제 #9
0
def alignAllDomainsToTranscripts(species,platform):
    """ This function is only run during the database build process to create files available for subsequent download.
    This recapitulates several functions executed during the database build process but does so explicitely for each
    isoform with the goal of obtained genomic coordinates of each protein feature post de novo sequence alignment.
    This includes all Ensembl proteins, UCSC mRNAs and in silico translated RNAs """
    
    ### Import all transcript to gene associations for Ensembl and UCSC transcripts
    global gene_transcript_db
    gene_transcript_db={}
    option = 'transcript'
    print 'Importing transcript data into memory'
    filename = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_transcript-annotations.txt'
    importEnsExonStructureData(filename,option)
    filename = 'AltDatabase/ucsc/'+species+'/'+species+'_UCSC_transcript_structure_mrna.txt'
    try: importEnsExonStructureData(filename,option)
    except Exception: None ### Not available for all species - needs to be built prior to transcript model creation
    
    import FeatureAlignment
    ucsc_transcripts={}
    gene_db = {}
    gene_transcript_db = FeatureAlignment.eliminateRedundant(gene_transcript_db)
    for gene in gene_transcript_db:
        for (ac,type) in gene_transcript_db[gene]:
            if type != 'Ensembl':
                ucsc_transcripts[ac]=[] ### Store all the untranslated UCSC mRNAs
        gene_db[gene] = [gene] ### mimics the necessary structure for FeatureAlignment
    ### Identify untranslated Ensembl transcripts
    
    print 'Importing Ensembl transcript to protein'
    ens_transcript_protein_db = importEnsemblTranscriptAssociations(species)
    
    ### Import protein ID and protein sequence into a dictionary
    #global protein_sequence_db
    #protein_sequence_db = FeatureAlignment.remoteEnsemblProtSeqImport(species) ### All Ensembl protein sequences
    
    """This code imports all protein sequences (NCBI, Ensembl, in silico translated) associated with optimal isoform pairs,
    however, not all isoforms analyzed in the database are here, hence, this should be considered a subset of in silico
    translated Ensembl mRNAs, UCSC ,RNAs, and known analyzed UCSC proteins"""
    #ucsc_transcripts={}
    #ucsc_transcripts['BC065499']=[]
    #ucsc_transcripts['AK309510']=[] ### in silico translated
    #ens_transcript_protein_db={}
    ### Download or translate ANY AND ALL mRNAs considered by AltAnalyze via in silico translation
    import IdentifyAltIsoforms
    analysis_type = 'fetch_new' # analysis_type = 'fetch' ???

    #IdentifyAltIsoforms.remoteTranslateRNAs(species,ucsc_transcripts,ens_transcript_protein_db,analysis_type)
    ### Derive all protein ID, domain and genomic coordinate data from Ensembl and UniProt
    """ This data is available for Ensembl and UniProt isoforms but we re-derive the associations based on sequence for completeness """

    ### Get the domain sequences and genomic coordinates
    """
    # for testing
    gt = {}; y=0
    for gene in gene_db:
        if y < 20:
            gt[gene] = gene_db[gene]
        else: break
        y+=1
    """
    protein_ft_db,domain_gene_counts = FeatureAlignment.grab_exon_level_feature_calls(species,platform,gene_db)
    import ExonAnalyze_module
    seq_files, mRNA_protein_seq_db = IdentifyAltIsoforms.importProteinSequences(species,'getSequence') ### Import all available protein sequences (downloaded or in silico)
    coordinate_type = 'genomic'; #coordinate_type = 'protein'
    ExonAnalyze_module.getFeatureIsoformGenomePositions(species,protein_ft_db,mRNA_protein_seq_db,gene_transcript_db,coordinate_type)