Ejemplo n.º 1
0
def formatAttributeForExport(attribute_db,filename):
    import IdentifyAltIsoforms
    export_db={}
    for (gene,probeset) in attribute_db:
        attribute_list = attribute_db[(gene,probeset)]; attribute_list2=[]
        for (attribute,direction) in attribute_list: attribute_list2.append(attribute+'|'+direction)
        export_db[probeset]=attribute_list2
    print 'Exporting:',filename
    IdentifyAltIsoforms.exportSimple(export_db,filename,'')
Ejemplo n.º 2
0
def reAnalyzeRNAProbesetMatches(align_files,species,array_type,pairwise_probeset_combinations):
    """Import matching and non-matching probesets and export the valid comparisons"""
    align_files2=[]
    for file in align_files:
        if array_type in file: align_files2.append(file)
    align_files = align_files2
    
    matching={}; not_matching={}
    for filename in align_files:
        print 'Reading',filename
        start_time = time.time()
        fn=filepath(filename)
        for line in open(fn,'rU').xreadlines():
            values = string.replace(line,'\n','')
            probeset,call,accession = string.split(values,'\t')
            if call == '1':
                try: matching[probeset].append(accession)
                except KeyError: matching[probeset] = [accession]
            else:
                try: not_matching[probeset].append(accession)
                except KeyError: not_matching[probeset] = [accession]

    probeset_matching_pairs={}; matching_in_both=0; match_and_null=0; no_matches=0; no_nulls=0
    for (probeset1,probeset2) in pairwise_probeset_combinations:
        if probeset1 in matching and probeset2 in matching:
            matching[probeset1].sort(); matching[probeset2].sort()
            match1 = string.join(matching[probeset1],'|')
            match2 = string.join(matching[probeset2],'|')
            if match1 != match2:
                probeset_matching_pairs[probeset1+'|'+probeset2] = [match1,match2]
            matching_in_both+=1
        else:
            if probeset1 in matching and probeset1 in not_matching:
                match = string.join(matching[probeset1],'|')
                null_match = string.join(filterNullMatch(not_matching[probeset1],matching[probeset1]),'|')
                probeset_matching_pairs[probeset1] = [match,null_match]
                match_and_null+=1
            elif probeset2 in matching and probeset2 in not_matching:
                match = string.join(matching[probeset2],'|')
                null_match = string.join(filterNullMatch(not_matching[probeset2],matching[probeset2]),'|')
                probeset_matching_pairs[probeset2] = [match,null_match]
                match_and_null+=1
            elif probeset1 in matching or probeset2 in matching: no_nulls+=1
            else:
                no_matches+=1
                #if no_matches<10: print probeset1,probeset2

    print matching_in_both, "probeset pairs with matching isoforms for both recipricol probesets."
    print match_and_null, "probeset pairs with a match for one and null for that one."
    print no_nulls, "probeset pairs with only one match."
    print no_matches, "probeset pairs with no matches."
    
    import IdentifyAltIsoforms
    export_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_all-transcript-matches.txt'
    if analysis_type == 'single':
        export_file = 'AltDatabase/'+species+'/'+array_type+'/junction/'+species+'_all-transcript-matches.txt'
    IdentifyAltIsoforms.exportSimple(probeset_matching_pairs,export_file,'')
Ejemplo n.º 3
0
def formatAttributeForExport(attribute_db, filename):
    import IdentifyAltIsoforms
    export_db = {}
    for (gene, probeset) in attribute_db:
        attribute_list = attribute_db[(gene, probeset)]
        attribute_list2 = []
        for (attribute, direction) in attribute_list:
            attribute_list2.append(attribute + '|' + direction)
        export_db[probeset] = attribute_list2
    print 'Exporting:', filename
    IdentifyAltIsoforms.exportSimple(export_db, filename, '')
Ejemplo n.º 4
0
def getCodingGenomicCoordinates(species):
    """Reset the global CDS tables for species and import exon structure data.

    Rebinds two module globals: cds_location_db receives the result of
    IdentifyAltIsoforms.importCDScoordinates(species) and cds_genomic_db is
    reset to an empty dict. importEnsExonStructureData is then run with the
    'exon' option on the Ensembl transcript annotation file and, best-effort,
    on the UCSC mRNA structure file (any Exception from the UCSC import is
    ignored). Presumably importEnsExonStructureData fills cds_genomic_db -
    that function is not visible here, confirm.
    """
    global cds_location_db, cds_genomic_db
    import IdentifyAltIsoforms

    cds_location_db = IdentifyAltIsoforms.importCDScoordinates(species)
    cds_genomic_db = {}

    import_mode = 'exon'
    ensembl_annotations = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_transcript-annotations.txt'
    importEnsExonStructureData(ensembl_annotations, import_mode)

    ucsc_structures = 'AltDatabase/ucsc/'+species+'/'+species+'_UCSC_transcript_structure_mrna.txt'
    try:
        importEnsExonStructureData(ucsc_structures, import_mode)
    except Exception:
        ### Not available for all species - needs to be built prior to transcript model creation
        pass
Ejemplo n.º 5
0
def getCodingGenomicCoordinates(species):
    """Load CDS coordinates and exon structures for species into module globals.

    Side effects only: the global cds_location_db is replaced by the value
    returned from IdentifyAltIsoforms.importCDScoordinates(species), the global
    cds_genomic_db is replaced by a new empty dict, and
    importEnsExonStructureData is called with option 'exon' first for the
    Ensembl annotation file (errors propagate) and then for the UCSC mRNA
    structure file (any Exception is swallowed, since that file does not exist
    for every species).
    """
    global cds_location_db
    global cds_genomic_db
    import IdentifyAltIsoforms

    cds_genomic_db = {}
    cds_location_db = IdentifyAltIsoforms.importCDScoordinates(species)

    exon_option = 'exon'
    importEnsExonStructureData(
        'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_transcript-annotations.txt',
        exon_option)

    ucsc_filename = 'AltDatabase/ucsc/'+species+'/'+species+'_UCSC_transcript_structure_mrna.txt'
    try:
        importEnsExonStructureData(ucsc_filename, exon_option)
    except Exception:
        pass ### UCSC structures must be built before transcript model creation and may be missing
Ejemplo n.º 6
0
def alignAllDomainsToTranscripts(species,platform):
    """ This function is only run during the database build process to create files available for subsequent download.
    This recapitulates several functions executed during the database build process but does so explicitely for each
    isoform with the goal of obtained genomic coordinates of each protein feature post de novo sequence alignment.
    This includes all Ensembl proteins, UCSC mRNAs and in silico translated RNAs """
    
    ### Import all transcript to gene associations for Ensembl and UCSC transcripts
    global gene_transcript_db
    gene_transcript_db={}
    option = 'transcript'
    print 'Importing transcript data into memory'
    filename = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_transcript-annotations.txt'
    importEnsExonStructureData(filename,option)
    filename = 'AltDatabase/ucsc/'+species+'/'+species+'_UCSC_transcript_structure_mrna.txt'
    try: importEnsExonStructureData(filename,option)
    except Exception: None ### Not available for all species - needs to be built prior to transcript model creation
    
    import FeatureAlignment
    ucsc_transcripts={}
    gene_db = {}
    gene_transcript_db = FeatureAlignment.eliminateRedundant(gene_transcript_db)
    for gene in gene_transcript_db:
        for (ac,type) in gene_transcript_db[gene]:
            if type != 'Ensembl':
                ucsc_transcripts[ac]=[] ### Store all the untranslated UCSC mRNAs
        gene_db[gene] = [gene] ### mimics the necessary structure for FeatureAlignment
    ### Identify untranslated Ensembl transcripts
    
    print 'Importing Ensembl transcript to protein'
    ens_transcript_protein_db = importEnsemblTranscriptAssociations(species)
    
    ### Import protein ID and protein sequence into a dictionary
    #global protein_sequence_db
    #protein_sequence_db = FeatureAlignment.remoteEnsemblProtSeqImport(species) ### All Ensembl protein sequences
    
    """This code imports all protein sequences (NCBI, Ensembl, in silico translated) associated with optimal isoform pairs,
    however, not all isoforms analyzed in the database are here, hence, this should be considered a subset of in silico
    translated Ensembl mRNAs, UCSC ,RNAs, and known analyzed UCSC proteins"""
    #ucsc_transcripts={}
    #ucsc_transcripts['BC065499']=[]
    #ucsc_transcripts['AK309510']=[] ### in silico translated
    #ens_transcript_protein_db={}
    ### Download or translate ANY AND ALL mRNAs considered by AltAnalyze via in silico translation
    import IdentifyAltIsoforms
    analysis_type = 'fetch_new' # analysis_type = 'fetch' ???

    #IdentifyAltIsoforms.remoteTranslateRNAs(species,ucsc_transcripts,ens_transcript_protein_db,analysis_type)
    ### Derive all protein ID, domain and genomic coordinate data from Ensembl and UniProt
    """ This data is available for Ensembl and UniProt isoforms but we re-derive the associations based on sequence for completeness """

    ### Get the domain sequences and genomic coordinates
    """
    # for testing
    gt = {}; y=0
    for gene in gene_db:
        if y < 20:
            gt[gene] = gene_db[gene]
        else: break
        y+=1
    """
    protein_ft_db,domain_gene_counts = FeatureAlignment.grab_exon_level_feature_calls(species,platform,gene_db)
    import ExonAnalyze_module
    seq_files, mRNA_protein_seq_db = IdentifyAltIsoforms.importProteinSequences(species,'getSequence') ### Import all available protein sequences (downloaded or in silico)
    coordinate_type = 'genomic'; coordinate_type = 'protein'
    ExonAnalyze_module.getFeatureIsoformGenomePositions(species,protein_ft_db,mRNA_protein_seq_db,gene_transcript_db,coordinate_type)
Ejemplo n.º 7
0
def reAnalyzeRNAProbesetMatches(align_files, species, array_type,
                                pairwise_probeset_combinations):
    """Import matching and non-matching probesets and export the valid comparisons"""
    align_files2 = []
    for file in align_files:
        if array_type in file: align_files2.append(file)
    align_files = align_files2

    matching = {}
    not_matching = {}
    for filename in align_files:
        print 'Reading', filename
        start_time = time.time()
        fn = filepath(filename)
        for line in open(fn, 'rU').xreadlines():
            values = string.replace(line, '\n', '')
            probeset, call, accession = string.split(values, '\t')
            if call == '1':
                try:
                    matching[probeset].append(accession)
                except KeyError:
                    matching[probeset] = [accession]
            else:
                try:
                    not_matching[probeset].append(accession)
                except KeyError:
                    not_matching[probeset] = [accession]

    probeset_matching_pairs = {}
    matching_in_both = 0
    match_and_null = 0
    no_matches = 0
    no_nulls = 0
    for (probeset1, probeset2) in pairwise_probeset_combinations:
        if probeset1 in matching and probeset2 in matching:
            matching[probeset1].sort()
            matching[probeset2].sort()
            match1 = string.join(matching[probeset1], '|')
            match2 = string.join(matching[probeset2], '|')
            if match1 != match2:
                probeset_matching_pairs[probeset1 + '|' +
                                        probeset2] = [match1, match2]
            matching_in_both += 1
        else:
            if probeset1 in matching and probeset1 in not_matching:
                match = string.join(matching[probeset1], '|')
                null_match = string.join(
                    filterNullMatch(not_matching[probeset1],
                                    matching[probeset1]), '|')
                probeset_matching_pairs[probeset1] = [match, null_match]
                match_and_null += 1
            elif probeset2 in matching and probeset2 in not_matching:
                match = string.join(matching[probeset2], '|')
                null_match = string.join(
                    filterNullMatch(not_matching[probeset2],
                                    matching[probeset2]), '|')
                probeset_matching_pairs[probeset2] = [match, null_match]
                match_and_null += 1
            elif probeset1 in matching or probeset2 in matching:
                no_nulls += 1
            else:
                no_matches += 1
                #if no_matches<10: print probeset1,probeset2

    print matching_in_both, "probeset pairs with matching isoforms for both recipricol probesets."
    print match_and_null, "probeset pairs with a match for one and null for that one."
    print no_nulls, "probeset pairs with only one match."
    print no_matches, "probeset pairs with no matches."

    import IdentifyAltIsoforms
    export_file = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_all-transcript-matches.txt'
    if analysis_type == 'single':
        export_file = 'AltDatabase/' + species + '/' + array_type + '/junction/' + species + '_all-transcript-matches.txt'
    IdentifyAltIsoforms.exportSimple(probeset_matching_pairs, export_file, '')
Ejemplo n.º 8
0
def executeParameters(species,array_type,force,genomic_build,update_uniprot,update_ensembl,update_probeset_to_ensembl,update_domain,update_miRs,update_all,update_miR_seq,ensembl_version):    
    """Run the requested database build/update stages for one species and platform.

    Every update_* flag is compared against the string 'yes'. update_all == 'yes' switches on the
    uniprot, ensembl, probeset_to_ensembl, domain and miRs stages (update_miR_seq is left as passed in).
    array_type may carry a sub-type after a '|'; the part before it becomes array_type and the part
    after it specific_array_type, otherwise both are the same value.

    Enabled stages run in this order: Ensembl tables, UniProt annotations, probeset-to-Ensembl
    annotations, domain/isoform analysis, microRNA predictions. Independently of the flags the
    function then performs junction/RNASeq post-processing, a best-effort copy of the combined
    miR target file, and (for non-RNASeq platforms) verification of the probeset-probe files.

    The other arguments (force, genomic_build, ensembl_version) are only forwarded to the called
    builders. Calls sys.exit() when the junction or RNASeq post-processing raises IOError.
    Names such as string, sys, ExonArray, filepath, verifyFile and the build* helpers are not
    imported here and are expected at module level. Nothing is returned.
    """
    if '|' in array_type: array_type, specific_array_type = string.split(array_type,'|') ### To distinguish between array sub-types, like the HJAY and hGlue
    else: specific_array_type = array_type
    
    ### update_all overrides the individual stage flags (update_miR_seq is not touched)
    if update_all == 'yes':
        update_uniprot='yes'; update_ensembl='yes'; update_probeset_to_ensembl='yes'; update_domain='yes'; update_miRs = 'yes'
        
    if update_ensembl == 'yes':
        import EnsemblSQL; reload(EnsemblSQL)

        """ Used to grab all essential Ensembl annotations previously obtained via BioMart"""        
        configType = 'Advanced'; analysisType = 'AltAnalyzeDBs'; externalDBName = ''
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species,configType,analysisType,externalDBName,ensembl_version,force)
        
        """ Used to grab Ensembl-to-External gene associations"""
        configType = 'Basic'; analysisType = 'ExternalOnly'; externalDBName = 'Uniprot/SWISSPROT'
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species,configType,analysisType,externalDBName,ensembl_version,force)
        
        """ Used to grab Ensembl full gene sequence plus promoter and 3'UTRs """
        if array_type == 'AltMouse' or array_type == 'junction' or array_type == 'RNASeq':
            EnsemblSQL.getFullGeneSequences(ensembl_version,species)
            
    if update_uniprot == 'yes':            
        ###Might need to delete the existing versions of downloaded databases or force download
        buildUniProtFunctAnnotations(species,force)
                
    ### Platform specific probeset (or junction/read) to Ensembl exon annotation
    if update_probeset_to_ensembl == 'yes':
        if species == 'Mm' and array_type == 'AltMouse':
            buildAltMouseExonAnnotations(species,array_type,force,genomic_build)
        elif array_type == 'junction':
            buildJunctionExonAnnotations(species,array_type,specific_array_type,force,genomic_build)
        elif array_type == 'RNASeq':
            import RNASeq; test_status = 'no'; data_type = 'mRNA'
            RNASeq.getEnsemblAssociations(species,data_type,test_status,force)
        else: buildExonArrayExonAnnotations(species,array_type,force)

    if update_domain == 'yes':

        ### Get UCSC associations for all Ensembl linked genes (download databases if necessary)
        mRNA_Type = 'mrna'; run_from_scratch = 'yes'
        export_all_associations = 'yes' ### YES only for protein prediction analysis
        buildUCSCAnnoationFiles(species,mRNA_Type,export_all_associations,run_from_scratch,force)

        if (species == 'Mm' and array_type == 'AltMouse'):
            """Imports and re-exports array-Ensembl annotations"""
            import JunctionArray
            ### The return value is discarded immediately; the call is made for its side effects only
            null = JunctionArray.importArrayAnnotations(species,array_type); null={}
        if (species == 'Mm' and array_type == 'AltMouse') or array_type == 'junction' or array_type == 'RNASeq':
            """Performs probeset sequence aligment to Ensembl and UCSC transcripts. To do: Need to setup download if files missing"""
            import mRNASeqAlign; analysis_type = 'reciprocal'
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,analysis_type,force)
       
        import IdentifyAltIsoforms; run_seqcomp = 'no'
        IdentifyAltIsoforms.runProgram(species,array_type,'null',force,run_seqcomp)
        import FeatureAlignment; import JunctionArray
        FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'null')
        
        ### mRNASeqAlign is in scope below because this condition is a subset of the one guarding its import above
        if array_type == 'junction' or array_type == 'RNASeq':
            ### For junction probeset sequences from mRNASeqAlign(), find and assess alternative proteins - export to the folder 'junction'
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'single',force)
            IdentifyAltIsoforms.runProgram(species,array_type,'junction',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'junction')
            ### For exon probesets (and junction exons) align and assess alternative proteins - export to the folder 'exon'
            IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)
            # FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'exon') # not needed
            
            """ Repeat above with CoordinateBasedMatching = True """ 
            ### Perform coordinate based junction mapping to transcripts (requires certain sequence files built in IdentifyAltIsoforms)
            analysis_type = 'reciprocal'
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,analysis_type,force,CoordinateBasedMatching = True)
            IdentifyAltIsoforms.runProgram(species,array_type,'null',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'null')
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'single',force,CoordinateBasedMatching = True)
            IdentifyAltIsoforms.runProgram(species,array_type,'junction',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'junction')
            IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)
            if array_type == 'RNASeq':
                JunctionArray.combineExonJunctionAnnotations(species,array_type)
                
    if update_miRs == 'yes':
        if update_miR_seq == 'yes':
            import MatchMiRTargetPredictions; only_add_sequence_to_previous_results = 'no'
            MatchMiRTargetPredictions.runProgram(species,force,only_add_sequence_to_previous_results)

        ### Each sequence module is run twice: first with 'strict', then with 'lax' stringency
        if array_type == 'exon' or array_type == 'gene':        
            import ExonSeqModule
            stringency = 'strict'; process_microRNA_predictions = 'yes'; mir_source = 'multiple'
            ExonSeqModule.runProgram(species,array_type,process_microRNA_predictions,mir_source,stringency)
            stringency = 'lax'
            ExonSeqModule.runProgram(species,array_type,process_microRNA_predictions,mir_source,stringency)
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        else:
            import JunctionSeqModule
            stringency = 'strict'; mir_source = 'multiple'
            JunctionSeqModule.runProgram(species,array_type,mir_source,stringency,force)
            stringency = 'lax'
            JunctionSeqModule.runProgram(species,array_type,mir_source,stringency,force)

    ### The remaining steps run regardless of the update_* flags
    if array_type == 'junction':
        try:
            import JunctionArray; import JunctionArrayEnsemblRules
            JunctionArray.filterForCriticalExons(species,array_type)
            JunctionArray.overRideExonEntriesWithJunctions(species,array_type)
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        except IOError: print 'No built junction files to analyze';sys.exit()
    if array_type == 'RNASeq' and (species == 'Hs' or species == 'Mm' or species == 'Rn'):
        import JunctionArray; import JunctionArrayEnsemblRules
        try: JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
        except IOError: print 'No Ensembl_exons.txt file to analyze';sys.exit()
    
    ### Best-effort copy of the combined miR target file to the species' ensembl folder; any failure is ignored
    try:
        filename = 'AltDatabase/'+species+'/SequenceData/miRBS-combined_gene-targets.txt'; ef=filepath(filename)
        er = string.replace(ef,species+'/SequenceData/miRBS-combined_gene-targets.txt','ensembl/'+species+'/'+species+'_microRNA-Ensembl.txt')
        import shutil; shutil.copyfile(ef,er)
    except Exception: null=[]
    if array_type != 'RNASeq':
        ### Get the probeset-probe relationships from online - needed for FIRMA analysis
        filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_probeset-probes.txt'
        ### Sub-types whose name contains 'lue' (e.g. hGlue) use their own server folder
        if array_type == 'junction' and 'lue' in specific_array_type:
            server_folder = 'junction/hGlue'
            verifyFile(filename,server_folder) ### Will force download if missing
            verifyFile('AltDatabase/'+species+'/'+array_type+'/platform.txt',server_folder) ### Will force download if missing
        elif array_type != 'AltMouse': verifyFile(filename,array_type) ### Will force download if missing
        if (array_type == 'exon' or array_type == 'AltMouse') and species != 'Rn':
            try:
                ### Available for select exon-arrays and AltMouse
                probeset_to_remove_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_probes_to_remove.txt'
                verifyFile(probeset_to_remove_file,array_type)
            except Exception: null=[]
Ejemplo n.º 9
0
def executeParameters(species, array_type, force, genomic_build,
                      update_uniprot, update_ensembl,
                      update_probeset_to_ensembl, update_domain, update_miRs,
                      update_all, update_miR_seq, ensembl_version):
    """Run the requested database build/update stages for one species and platform.

    Every update_* flag is compared against the string 'yes'. When update_all
    is 'yes' the uniprot, ensembl, probeset_to_ensembl, domain and miRs stages
    are all switched on (update_miR_seq keeps the value passed in). array_type
    may carry a sub-type after a '|'; the part before it becomes array_type and
    the part after it specific_array_type, otherwise both are the same value.

    Enabled stages run in this order: Ensembl tables, UniProt annotations,
    probeset-to-Ensembl annotations, domain/isoform analysis, microRNA
    predictions. Regardless of the flags, the function afterwards performs
    junction/RNASeq post-processing, a best-effort copy of the combined miR
    target file and, for non-RNASeq platforms, verification of the
    probeset-probe files.

    force, genomic_build and ensembl_version are only forwarded to the called
    builders. sys.exit() is called when the junction or RNASeq post-processing
    raises IOError. Names such as string, sys, ExonArray, filepath, verifyFile
    and the build* helpers are not imported here and are expected at module
    level. Nothing is returned.
    """
    if '|' in array_type:
        array_type, specific_array_type = string.split(
            array_type, '|'
        )  ### To distinguish between array sub-types, like the HJAY and hGlue
    else:
        specific_array_type = array_type

    # update_all overrides the individual stage flags (update_miR_seq is not touched)
    if update_all == 'yes':
        update_uniprot = 'yes'
        update_ensembl = 'yes'
        update_probeset_to_ensembl = 'yes'
        update_domain = 'yes'
        update_miRs = 'yes'

    if update_ensembl == 'yes':
        import EnsemblSQL
        reload(EnsemblSQL)
        """ Used to grab all essential Ensembl annotations previously obtained via BioMart"""
        configType = 'Advanced'
        analysisType = 'AltAnalyzeDBs'
        externalDBName = ''
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species, configType,
                                                       analysisType,
                                                       externalDBName,
                                                       ensembl_version, force)
        """ Used to grab Ensembl-to-External gene associations"""
        configType = 'Basic'
        analysisType = 'ExternalOnly'
        externalDBName = 'Uniprot/SWISSPROT'
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species, configType,
                                                       analysisType,
                                                       externalDBName,
                                                       ensembl_version, force)
        """ Used to grab Ensembl full gene sequence plus promoter and 3'UTRs """
        if array_type == 'AltMouse' or array_type == 'junction' or array_type == 'RNASeq':
            EnsemblSQL.getFullGeneSequences(ensembl_version, species)

    if update_uniprot == 'yes':
        ###Might need to delete the existing versions of downloaded databases or force download
        buildUniProtFunctAnnotations(species, force)

    # Platform specific probeset (or junction/read) to Ensembl exon annotation
    if update_probeset_to_ensembl == 'yes':
        if species == 'Mm' and array_type == 'AltMouse':
            buildAltMouseExonAnnotations(species, array_type, force,
                                         genomic_build)
        elif array_type == 'junction':
            buildJunctionExonAnnotations(species, array_type,
                                         specific_array_type, force,
                                         genomic_build)
        elif array_type == 'RNASeq':
            import RNASeq
            test_status = 'no'
            data_type = 'mRNA'
            RNASeq.getEnsemblAssociations(species, data_type, test_status,
                                          force)
        else:
            buildExonArrayExonAnnotations(species, array_type, force)

    if update_domain == 'yes':

        ### Get UCSC associations for all Ensembl linked genes (download databases if necessary)
        mRNA_Type = 'mrna'
        run_from_scratch = 'yes'
        export_all_associations = 'yes'  ### YES only for protein prediction analysis
        buildUCSCAnnoationFiles(species, mRNA_Type, export_all_associations,
                                run_from_scratch, force)

        if (species == 'Mm' and array_type == 'AltMouse'):
            """Imports and re-exports array-Ensembl annotations"""
            import JunctionArray
            # The return value is discarded immediately; the call is made for its side effects only
            null = JunctionArray.importArrayAnnotations(species, array_type)
            null = {}
        if (species == 'Mm' and array_type == 'AltMouse'
            ) or array_type == 'junction' or array_type == 'RNASeq':
            """Performs probeset sequence aligment to Ensembl and UCSC transcripts. To do: Need to setup download if files missing"""
            import mRNASeqAlign
            analysis_type = 'reciprocal'
            mRNASeqAlign.alignProbesetsToTranscripts(species, array_type,
                                                     analysis_type, force)

        import IdentifyAltIsoforms
        run_seqcomp = 'no'
        IdentifyAltIsoforms.runProgram(species, array_type, 'null', force,
                                       run_seqcomp)
        import FeatureAlignment
        FeatureAlignment.findDomainsByGenomeCoordinates(
            species, array_type, 'null')

        # mRNASeqAlign is in scope below because this condition is a subset of
        # the one guarding its import above
        if array_type == 'junction' or array_type == 'RNASeq':
            ### For junction probeset sequences from mRNASeqAlign(), find and assess alternative proteins - export to the folder 'junction'
            mRNASeqAlign.alignProbesetsToTranscripts(species, array_type,
                                                     'single', force)
            IdentifyAltIsoforms.runProgram(species, array_type, 'junction',
                                           force, run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(
                species, array_type, 'junction')
            # This test repeats the enclosing condition, so it is always true here
            if array_type == 'junction' or array_type == 'RNASeq':
                ### For exon probesets (and junction exons) align and assess alternative proteins - export to the folder 'exon'
                IdentifyAltIsoforms.runProgram(species, array_type, 'exon',
                                               force, run_seqcomp)
                # FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'exon') # not needed
                if array_type == 'RNASeq':
                    import JunctionArray
                    JunctionArray.combineExonJunctionAnnotations(
                        species, array_type)

    if update_miRs == 'yes':
        if update_miR_seq == 'yes':
            import MatchMiRTargetPredictions
            only_add_sequence_to_previous_results = 'no'
            MatchMiRTargetPredictions.runProgram(
                species, force, only_add_sequence_to_previous_results)

        # Each sequence module is run twice: first with 'strict', then with 'lax' stringency
        if array_type == 'exon' or array_type == 'gene':
            import ExonSeqModule
            stringency = 'strict'
            process_microRNA_predictions = 'yes'
            mir_source = 'multiple'
            ExonSeqModule.runProgram(species, array_type,
                                     process_microRNA_predictions, mir_source,
                                     stringency)
            stringency = 'lax'
            ExonSeqModule.runProgram(species, array_type,
                                     process_microRNA_predictions, mir_source,
                                     stringency)
            ExonArray.exportMetaProbesets(
                array_type, species)  ### Export metaprobesets for this build
        else:
            import JunctionSeqModule
            stringency = 'strict'
            mir_source = 'multiple'
            JunctionSeqModule.runProgram(species, array_type, mir_source,
                                         stringency, force)
            stringency = 'lax'
            JunctionSeqModule.runProgram(species, array_type, mir_source,
                                         stringency, force)

    # The remaining steps run regardless of the update_* flags
    if array_type == 'junction':
        try:
            import JunctionArray
            import JunctionArrayEnsemblRules
            JunctionArray.filterForCriticalExons(species, array_type)
            JunctionArray.overRideExonEntriesWithJunctions(species, array_type)
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(
                species, array_type)
            ExonArray.exportMetaProbesets(
                array_type, species)  ### Export metaprobesets for this build
        except IOError:
            print 'No built junction files to analyze'
            sys.exit()
    if array_type == 'RNASeq' and (species == 'Hs' or species == 'Mm'
                                   or species == 'Rn'):
        import JunctionArray
        import JunctionArrayEnsemblRules
        try:
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(
                species, array_type)
        except IOError:
            print 'No Ensembl_exons.txt file to analyze'
            sys.exit()

    # Best-effort copy of the combined miR target file to the species' ensembl
    # folder; any failure is ignored
    try:
        filename = 'AltDatabase/' + species + '/SequenceData/miRBS-combined_gene-targets.txt'
        ef = filepath(filename)
        er = string.replace(
            ef, species + '/SequenceData/miRBS-combined_gene-targets.txt',
            'ensembl/' + species + '/' + species + '_microRNA-Ensembl.txt')
        import shutil
        shutil.copyfile(ef, er)
    except Exception:
        null = []
    if array_type != 'RNASeq':
        ### Get the probeset-probe relationships from online - needed for FIRMA analysis
        filename = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_probeset-probes.txt'
        # Sub-types whose name contains 'lue' (e.g. hGlue) use their own server folder
        if array_type == 'junction' and 'lue' in specific_array_type:
            server_folder = 'junction/hGlue'
            verifyFile(filename,
                       server_folder)  ### Will force download if missing
            verifyFile('AltDatabase/' + species + '/' + array_type +
                       '/platform.txt',
                       server_folder)  ### Will force download if missing
        elif array_type != 'AltMouse':
            verifyFile(filename,
                       array_type)  ### Will force download if missing
        if (array_type == 'exon'
                or array_type == 'AltMouse') and species != 'Rn':
            try:
                ### Available for select exon-arrays and AltMouse
                probeset_to_remove_file = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_probes_to_remove.txt'
                verifyFile(probeset_to_remove_file, array_type)
            except Exception:
                null = []
Ejemplo n.º 10
0
def alignAllDomainsToTranscripts(species,platform):
    """ This function is only run during the database build process to create files available for subsequent download.
    This recapitulates several functions executed during the database build process but does so explicitely for each
    isoform with the goal of obtained genomic coordinates of each protein feature post de novo sequence alignment.
    This includes all Ensembl proteins, UCSC mRNAs and in silico translated RNAs """
    
    ### Import all transcript to gene associations for Ensembl and UCSC transcripts
    global gene_transcript_db
    gene_transcript_db={}
    option = 'transcript'
    print 'Importing transcript data into memory'
    filename = 'AltDatabase/ensembl/'+species+'/'+species+'_Ensembl_transcript-annotations.txt'
    importEnsExonStructureData(filename,option)
    filename = 'AltDatabase/ucsc/'+species+'/'+species+'_UCSC_transcript_structure_mrna.txt'
    try: importEnsExonStructureData(filename,option)
    except Exception: None ### Not available for all species - needs to be built prior to transcript model creation
    
    import FeatureAlignment
    ucsc_transcripts={}
    gene_db = {}
    gene_transcript_db = FeatureAlignment.eliminateRedundant(gene_transcript_db)
    for gene in gene_transcript_db:
        for (ac,type) in gene_transcript_db[gene]:
            if type != 'Ensembl':
                ucsc_transcripts[ac]=[] ### Store all the untranslated UCSC mRNAs
        gene_db[gene] = [gene] ### mimics the necessary structure for FeatureAlignment
    ### Identify untranslated Ensembl transcripts
    
    print 'Importing Ensembl transcript to protein'
    ens_transcript_protein_db = importEnsemblTranscriptAssociations(species)
    
    ### Import protein ID and protein sequence into a dictionary
    #global protein_sequence_db
    #protein_sequence_db = FeatureAlignment.remoteEnsemblProtSeqImport(species) ### All Ensembl protein sequences
    
    """This code imports all protein sequences (NCBI, Ensembl, in silico translated) associated with optimal isoform pairs,
    however, not all isoforms analyzed in the database are here, hence, this should be considered a subset of in silico
    translated Ensembl mRNAs, UCSC ,RNAs, and known analyzed UCSC proteins"""
    #ucsc_transcripts={}
    #ucsc_transcripts['BC065499']=[]
    #ucsc_transcripts['AK309510']=[] ### in silico translated
    #ens_transcript_protein_db={}
    ### Download or translate ANY AND ALL mRNAs considered by AltAnalyze via in silico translation
    import IdentifyAltIsoforms
    analysis_type = 'fetch_new' # analysis_type = 'fetch' ???

    #IdentifyAltIsoforms.remoteTranslateRNAs(species,ucsc_transcripts,ens_transcript_protein_db,analysis_type)
    ### Derive all protein ID, domain and genomic coordinate data from Ensembl and UniProt
    """ This data is available for Ensembl and UniProt isoforms but we re-derive the associations based on sequence for completeness """

    ### Get the domain sequences and genomic coordinates
    """
    # for testing
    gt = {}; y=0
    for gene in gene_db:
        if y < 20:
            gt[gene] = gene_db[gene]
        else: break
        y+=1
    """
    protein_ft_db,domain_gene_counts = FeatureAlignment.grab_exon_level_feature_calls(species,platform,gene_db)
    import ExonAnalyze_module
    seq_files, mRNA_protein_seq_db = IdentifyAltIsoforms.importProteinSequences(species,'getSequence') ### Import all available protein sequences (downloaded or in silico)
    coordinate_type = 'genomic'; #coordinate_type = 'protein'
    ExonAnalyze_module.getFeatureIsoformGenomePositions(species,protein_ft_db,mRNA_protein_seq_db,gene_transcript_db,coordinate_type)