def alignAllDomainsToTranscripts(species,platform):
    """ Align protein features (domains) to every transcript isoform of a species.

    This function is only run during the database build process to create files available for subsequent download.
    It recapitulates several functions executed during the database build process but does so explicitly for each
    isoform, with the goal of obtaining genomic coordinates of each protein feature post de novo sequence alignment.
    This includes all Ensembl proteins, UCSC mRNAs and in silico translated RNAs.

    species  -- species code used to build the AltDatabase/ensembl and AltDatabase/ucsc file paths
    platform -- passed unchanged to FeatureAlignment.grab_exon_level_feature_calls

    Returns None. Rebinds the module-level global gene_transcript_db and hands the final step to
    ExonAnalyze_module.getFeatureIsoformGenomePositions. """

    ### Import all transcript to gene associations for Ensembl and UCSC transcripts
    global gene_transcript_db
    gene_transcript_db={} ### reset before importing - presumably filled by importEnsExonStructureData (defined elsewhere; confirm)
    option = 'transcript'
    print('Importing transcript data into memory')
    filename = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_transcript-annotations.txt'
    importEnsExonStructureData(filename,option)
    filename = 'AltDatabase/ucsc/'+species+'/'+species+'_UCSC_transcript_structure_mrna.txt'
    try: importEnsExonStructureData(filename,option)
    except Exception as e:
        ### Not available for all species - needs to be built prior to transcript model creation.
        ### The import stays best-effort, but the skip is reported instead of being silently hidden
        print('UCSC transcript structures not imported from %s (%r)' % (filename,e))

    import FeatureAlignment
    gene_transcript_db = FeatureAlignment.eliminateRedundant(gene_transcript_db)
    ### Each gene maps to a list holding only itself - mimics the necessary structure for FeatureAlignment
    gene_db = dict((gene,[gene]) for gene in gene_transcript_db)

    print('Importing Ensembl transcript to protein')
    ### The returned associations are only needed by the disabled translation step described below; the call
    ### itself is kept in case the importer has side effects that are not visible from this function
    importEnsemblTranscriptAssociations(species)

    ### NOTE: the step that downloads or in silico translates ANY AND ALL mRNAs considered by AltAnalyze
    ### (IdentifyAltIsoforms.remoteTranslateRNAs, fed with the non-Ensembl accessions of gene_transcript_db, the
    ### Ensembl transcript-to-protein associations and analysis type 'fetch_new') is disabled here.
    ### The protein sequences imported below (NCBI, Ensembl, in silico translated) are those associated with optimal
    ### isoform pairs; not all isoforms analyzed in the database are present, hence this should be considered a subset.
    import IdentifyAltIsoforms

    ### Derive all protein ID, domain and genomic coordinate data from Ensembl and UniProt.
    ### This data is available for Ensembl and UniProt isoforms but we re-derive the associations based on
    ### sequence for completeness

    ### Get the domain sequences and genomic coordinates (second return value is not used here)
    protein_ft_db,_domain_gene_counts = FeatureAlignment.grab_exon_level_feature_calls(species,platform,gene_db)
    import ExonAnalyze_module
    ### Import all available protein sequences (downloaded or in silico); first return value is not used here
    _seq_files,mRNA_protein_seq_db = IdentifyAltIsoforms.importProteinSequences(species,'getSequence')
    ### NOTE(review): this was previously assigned 'genomic' and immediately overwritten with 'protein'. The
    ### effective value ('protein') is kept, although the docstring speaks of genomic coordinates - confirm intent
    coordinate_type = 'protein'
    ExonAnalyze_module.getFeatureIsoformGenomePositions(species,protein_ft_db,mRNA_protein_seq_db,gene_transcript_db,coordinate_type)
Exemple #2
0
def alignAllDomainsToTranscripts(species,platform):
    """ Align protein features (domains) to every transcript isoform of a species.

    This function is only run during the database build process to create files available for subsequent download.
    It recapitulates several functions executed during the database build process but does so explicitly for each
    isoform, with the goal of obtaining genomic coordinates of each protein feature post de novo sequence alignment.
    This includes all Ensembl proteins, UCSC mRNAs and in silico translated RNAs.

    species  -- species code used to build the AltDatabase/ensembl and AltDatabase/ucsc file paths
    platform -- passed unchanged to FeatureAlignment.grab_exon_level_feature_calls

    Returns None. Rebinds the module-level global gene_transcript_db and hands the final step to
    ExonAnalyze_module.getFeatureIsoformGenomePositions. """

    ### Import all transcript to gene associations for Ensembl and UCSC transcripts
    global gene_transcript_db
    gene_transcript_db={} ### reset before importing - presumably filled by importEnsExonStructureData (defined elsewhere; confirm)
    option = 'transcript'
    print('Importing transcript data into memory')
    ensembl_structure_file = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_transcript-annotations.txt'
    importEnsExonStructureData(ensembl_structure_file,option)
    ucsc_structure_file = 'AltDatabase/ucsc/'+species+'/'+species+'_UCSC_transcript_structure_mrna.txt'
    try: importEnsExonStructureData(ucsc_structure_file,option)
    except Exception as e:
        ### Not available for all species - needs to be built prior to transcript model creation.
        ### The import stays best-effort, but the skip is reported instead of being silently hidden
        print('UCSC transcript structures not imported from %s (%r)' % (ucsc_structure_file,e))

    import FeatureAlignment
    gene_transcript_db = FeatureAlignment.eliminateRedundant(gene_transcript_db)
    ### Each gene maps to a list holding only itself - mimics the necessary structure for FeatureAlignment
    gene_db = {}
    for gene in gene_transcript_db:
        gene_db[gene] = [gene]

    print('Importing Ensembl transcript to protein')
    ### The returned associations are only needed by the disabled translation step described below; the call
    ### itself is kept in case the importer has side effects that are not visible from this function
    importEnsemblTranscriptAssociations(species)

    ### NOTE: the step that downloads or in silico translates ANY AND ALL mRNAs considered by AltAnalyze
    ### (IdentifyAltIsoforms.remoteTranslateRNAs, fed with the non-Ensembl accessions of gene_transcript_db, the
    ### Ensembl transcript-to-protein associations and analysis type 'fetch_new') is disabled here.
    ### The protein sequences imported below (NCBI, Ensembl, in silico translated) are those associated with optimal
    ### isoform pairs; not all isoforms analyzed in the database are present, hence this should be considered a subset.
    import IdentifyAltIsoforms

    ### Derive all protein ID, domain and genomic coordinate data from Ensembl and UniProt.
    ### This data is available for Ensembl and UniProt isoforms but we re-derive the associations based on
    ### sequence for completeness

    ### Get the domain sequences and genomic coordinates (second return value is not used here)
    protein_ft_db,_domain_gene_counts = FeatureAlignment.grab_exon_level_feature_calls(species,platform,gene_db)
    import ExonAnalyze_module
    ### Import all available protein sequences (downloaded or in silico); first return value is not used here
    _seq_files,mRNA_protein_seq_db = IdentifyAltIsoforms.importProteinSequences(species,'getSequence')
    ### 'protein' is the alternative value that appeared here commented out
    coordinate_type = 'genomic'
    ExonAnalyze_module.getFeatureIsoformGenomePositions(species,protein_ft_db,mRNA_protein_seq_db,gene_transcript_db,coordinate_type)