Ejemplo n.º 1
0
def buildJunctionExonAnnotations(species, array_type, specific_array_type,
                                 force, genomic_build):
    """Run the junction-array annotation build steps for `species`.

    After the UCSC annotation files are built, the pipeline performs:
      1) Extract transcript cluster-to-gene annotations.
      2) Extract exon sequences for junctions and exon probesets from the
         Affymetrix annotation file (version 2.0).
      3) Map these sequences to Ensembl gene sequences (build specific) plus
         and minus 2KB, upstream and downstream.
      4) Obtain AltAnalyze exon region annotations and full-length exon
         sequences for each exon probeset.
      5) Consolidate these into an Ensembl_probeset.txt file (rather than
         Ensembl_junction_probeset.txt) with junctions having a single
         probeset identifier.
      6) Determine which junctions and junction-exons represent reciprocal
         junctions using:
         a) AltAnalyze identified reciprocal junctions from Ensembl and UCSC,
         b) Affymetrix suggested reciprocal junctions based on common exon
            cluster annotations, creating Mm_junction_comps_updated.txt, and
         c) de novo comparison of all exon-junction region IDs for all
            junctions using the EnsemblImport method compareJunctions().
    """
    # NOTE(review): the caller-supplied `force` is replaced with 'no' before
    # it is ever read, and `genomic_build` is not used in this function.
    force = 'no'

    # Get UCSC associations (download databases if necessary).
    # Positional arguments after species: mRNA type, export-all-associations
    # ('yes' only for protein prediction analysis), run-from-scratch, force.
    buildUCSCAnnoationFiles(species, 'mrna', 'no', 'yes', force)

    # Get genomic locations and initial annotations for exon sequences
    # (exon probesets and junctions).
    import JunctionArray
    import JunctionArrayEnsemblRules

    # Steps 1-3
    JunctionArray.getJunctionExonLocations(
        species, array_type, specific_array_type)

    # Step 4 (always re-annotate the exon sequences)
    reannotate_exon_seq = 'yes'
    JunctionArrayEnsemblRules.getAnnotations(
        species, array_type, reannotate_exon_seq, force)

    # Steps 5-6
    JunctionArray.identifyJunctionComps(
        species, array_type, specific_array_type)
Ejemplo n.º 2
0
def buildJunctionExonAnnotations(species,array_type,specific_array_type,force,genomic_build):
    """Run the junction-array annotation build steps for `species`.

    The pipeline performs:
      1) Extract transcript cluster-to-gene annotations.
      2) Extract exon sequences for junctions and exon probesets from the
         Affymetrix annotation file (version 2.0).
      3) Map these sequences to Ensembl gene sequences (build specific) plus
         and minus 2KB, upstream and downstream.
      4) Obtain AltAnalyze exon region annotations and full-length exon
         sequences for each exon probeset.
      5) Consolidate these into an Ensembl_probeset.txt file (rather than
         Ensembl_junction_probeset.txt) with junctions having a single
         probeset identifier.
      6) Determine which junctions and junction-exons represent reciprocal
         junctions using:
         a) AltAnalyze identified reciprocal junctions from Ensembl and UCSC,
         b) Affymetrix suggested reciprocal junctions based on common exon
            cluster annotations, creating Mm_junction_comps_updated.txt, and
         c) de novo comparison of all exon-junction region IDs for all
            junctions using the EnsemblImport method compareJunctions().
    """
    # NOTE(review): the caller-supplied `force` is replaced with 'no' before
    # it is ever read, and `genomic_build` is not used in this function.
    force = 'no'

    # The UCSC association build (download databases if necessary) is
    # disabled in this version:
    # buildUCSCAnnoationFiles(species, 'mrna', 'no', 'yes', force)

    # Get genomic locations and initial annotations for exon sequences
    # (exon probesets and junctions).
    import JunctionArray
    import JunctionArrayEnsemblRules

    # Steps 1-3
    JunctionArray.getJunctionExonLocations(
        species, array_type, specific_array_type)

    # Step 4 (always re-annotate the exon sequences)
    reannotate_exon_seq = 'yes'
    JunctionArrayEnsemblRules.getAnnotations(
        species, array_type, reannotate_exon_seq, force)

    # Steps 5-6
    JunctionArray.identifyJunctionComps(
        species, array_type, specific_array_type)
Ejemplo n.º 3
0
def buildAltMouseExonAnnotations(species,array_type,force,genomic_build):
    """Code required to:
    1) Extract out Affymetrix provided exon sequence (probeset sequence extracted from "probeset_sequence_reversed.txt", derived
       directly from the Affymetrix AltMouse probe annotation file), from the "SEQUENCE-transcript-dbase.txt" (built using
       dump-chip1 .gff sequence and AltMerge-Peptide Informatics script "sequence_analysis_AltMouse_refseq.py").
    2) Once exported, grab full length exon sequences using exon/intron coordinates matches to full-length gene sequences with 2kb
       flanking sequence to efficiently predict microRNA binding site exclusion (reAnnotateCriticalExonSequences) and later for
       coordinate mapping to get exons aligning with UCSC annotated splicing annotations and exons. This sequence data replaced
       the previous file (don't need to re-run this - see rederive_exonseq == 'yes' below for reference).
    3) Match the updated exon sequences to the most recent genomic coordinates and build the exact equivalent of the exon array
       Mm_Ensembl_probeset.txt database (same structure and ExonArrayEnsemblRules.py code). This involves running EnsemblImport.
    This code should be run before the exon array location build code since the "Mm_Ensembl_probeset.txt" is created and then re-
    written as "Mm_AltMouse_Ensembl_probeset.txt".
    """
    
    import JunctionArray
    import JunctionArrayEnsemblRules    
    rederive_exonseq = 'no'
    ### Only needs to be run once, to export exon sequence for AltMouse array the original (1 and 2 above)
    if rederive_exonseq == 'yes':
        import AltAnalyze
        import ExonAnnotate_module
        import ExonAnalyze_module
        agglomerate_inclusion_probesets = 'no'; onlyAnalyzeJunctions='no'
        probeset_annotations_file = "AltDatabase/"+species+"/"+array_type+"/"+"MASTER-probeset-transcript.txt"
        verifyFile(probeset_annotations_file,array_type) ### Will force download if missing
        exon_db={}; filtered_arrayids={};filter_status='no'
        constituitive_probeset_db,exon_db,genes_being_analyzed = AltAnalyze.importSplicingAnnotationDatabase(probeset_annotations_file,array_type,filtered_arrayids,filter_status)
        alt_junction_db,critical_exon_db,exon_dbase,exon_inclusion_db,exon_db = ExonAnnotate_module.identifyPutativeSpliceEvents(exon_db,constituitive_probeset_db,{},agglomerate_inclusion_probesets,onlyAnalyzeJunctions)
        ExonAnnotate_module.exportJunctionComparisons(alt_junction_db,critical_exon_db,exon_dbase)
        print "Finished exporting junctions used in AltMouse array comparisons."

        ExonAnalyze_module.exportAltMouseExonSequence()
        JunctionArray.reAnnotateCriticalExonSequences(species,array_type)

    ### Get UCSC associations (download databases if necessary)
    mRNA_Type = 'mrna'; run_from_scratch = 'yes'
    export_all_associations = 'no' ### YES only for protein prediction analysis
    buildUCSCAnnoationFiles(species,mRNA_Type,export_all_associations,run_from_scratch,force)

    reannotate_exon_seq = 'yes'
    print 'genomic_build', genomic_build
    if genomic_build == 'new':
        ### Need to run with every new genomic build (match up new coordinates
        print "Begining to derive exon sequence from new genomic build"
        JunctionArray.identifyCriticalExonLocations(species,array_type)
        reannotate_exon_seq = 'yes'
    JunctionArrayEnsemblRules.getAnnotations(species,array_type,reannotate_exon_seq,force)
    
    ### Download files required during AltAnalyze analysis but not during the database build process
    filename = "AltDatabase/"+species+"/"+array_type+"/"+"MASTER-probeset-transcript.txt"
    verifyFile(filename,array_type) ### Will force download if missing
    filename = "AltDatabase/"+species+'/'+ array_type+'/'+array_type+"_annotations.txt"
    verifyFile(filename,array_type) ### Will force download if missing
def importCriticalExonLocations(species,array_type,ensembl_exon_db,force):
    ###ensembl_exon_db[(geneid,chr,strand)] = [[E5,exon_info]] #exon_info = (exon_start,exon_stop,exon_id,exon_annot)
    ###ensembl_probeset_db[geneid,chr,strand].append(probeset_data) #probeset_data = [start,stop,probeset_id,exon_class,transcript_cluster_id]
    gene_info_db = {}
    for (ens_geneid,chr,strand) in ensembl_exon_db: gene_info_db[ens_geneid] = chr,strand
    filename = 'AltDatabase/'+species+'/'+array_type+'/'+array_type+'_critical_exon_locations.txt'
    array_ensembl={}

    ###Get the most recent gene-symbol annotations (applicable with a new Ensembl build for the same genomic build)
    ensembl_symbol_db = getEnsemblAnnotations(species)
    primary_gene_annotation_file = 'AltDatabase/'+species +'/'+ array_type +'/'+ array_type+ '_gene_annotations.txt'
    update.verifyFile(primary_gene_annotation_file,array_type)
    array_gene_annotations = JunctionArray.importGeneric(primary_gene_annotation_file)
            
    for array_geneid in array_gene_annotations:    
        t = array_gene_annotations[array_geneid]; description=t[0];entrez=t[1];symbol=t[2]
        if symbol in ensembl_symbol_db and len(symbol)>0 and len(array_geneid)>0:
            ens_geneid = ensembl_symbol_db[symbol]
            if len(ens_geneid)>0: array_ensembl[array_geneid]= ens_geneid
          
    update.verifyFile(filename,array_type)  
    ensembl_probeset_db = importJunctionLocationData(filename,array_ensembl,gene_info_db,test)
       
    print len(ensembl_probeset_db), "Genes inlcuded in",array_type,"location database"
    return ensembl_probeset_db
def getAnnotations(Species,array_type,reannotate_exon_seq,force):
    """Annotate Affymetrix exon array data using files Ensembl data (sync'ed to genome release)."""
    global species; species = Species; global test; global test_cluster
    test = 'no'; test_cluster = ['TC0701360']; data_type = 'mRNA'

    global ensembl_exon_db; global ensembl_exon_db; global exon_clusters; global exon_region_db
    ensembl_exon_db,ensembl_annot_db,exon_clusters,intron_clusters,exon_region_db,intron_retention_db,ucsc_splicing_annot_db,ens_transcript_db = EnsemblImport.getEnsemblAssociations(species,data_type,test)
    ensembl_probeset_db = importCriticalExonLocations(species,array_type,ensembl_exon_db,force) ###Get Pre-computed genomic locations for critical exons
    ensembl_probeset_db = ExonArrayEnsemblRules.annotateExons(ensembl_probeset_db,exon_clusters,ensembl_exon_db,exon_region_db,intron_retention_db,intron_clusters,ucsc_splicing_annot_db); constitutive_gene_db={}
    ExonArrayEnsemblRules.exportEnsemblLinkedProbesets(array_type,ensembl_probeset_db,species)
    print "\nCritical exon data exported coordinates, exon associations and splicing annotations exported..."
    
    ### Change filenames to reflect junction array type
    export_filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_probesets.txt'; ef=filepath(export_filename)
    export_replacement = string.replace(export_filename,'_probe','_'+array_type+'_probe')
    er=filepath(export_replacement); shutil.copyfile(ef,er); os.remove(ef) ### Copy file to a new name

    ### Export full exon seqeunce for probesets/critical exons to replace the original incomplete sequence (used for miRNA analyses)
    if reannotate_exon_seq == 'yes':
        JunctionArray.reAnnotateCriticalExonSequences(species,array_type)
Ejemplo n.º 6
0
def executeParameters(species,array_type,force,genomic_build,update_uniprot,update_ensembl,update_probeset_to_ensembl,update_domain,update_miRs,update_all,update_miR_seq,ensembl_version):    
    if '|' in array_type: array_type, specific_array_type = string.split(array_type,'|') ### To destinguish between array sub-types, like the HJAY and hGlue
    else: specific_array_type = array_type
    
    if update_all == 'yes':
        update_uniprot='yes'; update_ensembl='yes'; update_probeset_to_ensembl='yes'; update_domain='yes'; update_miRs = 'yes'
        
    if update_ensembl == 'yes':
        import EnsemblSQL; reload(EnsemblSQL)

        """ Used to grab all essential Ensembl annotations previously obtained via BioMart"""        
        configType = 'Advanced'; analysisType = 'AltAnalyzeDBs'; externalDBName = ''
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species,configType,analysisType,externalDBName,ensembl_version,force)
        
        """ Used to grab Ensembl-to-External gene associations"""
        configType = 'Basic'; analysisType = 'ExternalOnly'; externalDBName = 'Uniprot/SWISSPROT'
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species,configType,analysisType,externalDBName,ensembl_version,force)
        
        """ Used to grab Ensembl full gene sequence plus promoter and 3'UTRs """
        if array_type == 'AltMouse' or array_type == 'junction' or array_type == 'RNASeq':
            EnsemblSQL.getFullGeneSequences(ensembl_version,species)
            
    if update_uniprot == 'yes':            
        ###Might need to delete the existing versions of downloaded databases or force download
        buildUniProtFunctAnnotations(species,force)
                
    if update_probeset_to_ensembl == 'yes':
        if species == 'Mm' and array_type == 'AltMouse':
            buildAltMouseExonAnnotations(species,array_type,force,genomic_build)
        elif array_type == 'junction':
            buildJunctionExonAnnotations(species,array_type,specific_array_type,force,genomic_build)
        elif array_type == 'RNASeq':
            import RNASeq; test_status = 'no'; data_type = 'mRNA'
            RNASeq.getEnsemblAssociations(species,data_type,test_status,force)
        else: buildExonArrayExonAnnotations(species,array_type,force)

    if update_domain == 'yes':

        ### Get UCSC associations for all Ensembl linked genes (download databases if necessary)        if species == 'Mm' and array_type == 'AltMouse':
        mRNA_Type = 'mrna'; run_from_scratch = 'yes'
        export_all_associations = 'yes' ### YES only for protein prediction analysis
        buildUCSCAnnoationFiles(species,mRNA_Type,export_all_associations,run_from_scratch,force)

        if (species == 'Mm' and array_type == 'AltMouse'):
            """Imports and re-exports array-Ensembl annotations"""
            import JunctionArray
            null = JunctionArray.importArrayAnnotations(species,array_type); null={}
        if (species == 'Mm' and array_type == 'AltMouse') or array_type == 'junction' or array_type == 'RNASeq':
            """Performs probeset sequence aligment to Ensembl and UCSC transcripts. To do: Need to setup download if files missing"""
            import mRNASeqAlign; analysis_type = 'reciprocal'
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,analysis_type,force)
       
        import IdentifyAltIsoforms; run_seqcomp = 'no'
        IdentifyAltIsoforms.runProgram(species,array_type,'null',force,run_seqcomp)
        import FeatureAlignment; import JunctionArray
        FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'null')
        
        if array_type == 'junction' or array_type == 'RNASeq':
            ### For junction probeset sequences from mRNASeqAlign(), find and assess alternative proteins - export to the folder 'junction'
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'single',force)
            IdentifyAltIsoforms.runProgram(species,array_type,'junction',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'junction')
            ### For exon probesets (and junction exons) align and assess alternative proteins - export to the folder 'exon'
            IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)
            # FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'exon') # not needed
            
            """ Repeat above with CoordinateBasedMatching = True """ 
            ### Peform coordinate based junction mapping to transcripts (requires certain sequence files built in IdentifyAltIosofmrs)
            analysis_type = 'reciprocal'
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,analysis_type,force,CoordinateBasedMatching = True)
            IdentifyAltIsoforms.runProgram(species,array_type,'null',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'null')
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'single',force,CoordinateBasedMatching = True)
            IdentifyAltIsoforms.runProgram(species,array_type,'junction',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'junction')
            IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)
            if array_type == 'RNASeq':
                JunctionArray.combineExonJunctionAnnotations(species,array_type)
                
    if update_miRs == 'yes':
        if update_miR_seq == 'yes':
            import MatchMiRTargetPredictions; only_add_sequence_to_previous_results = 'no'
            MatchMiRTargetPredictions.runProgram(species,force,only_add_sequence_to_previous_results)

        if array_type == 'exon' or array_type == 'gene':        
            import ExonSeqModule
            stringency = 'strict'; process_microRNA_predictions = 'yes'; mir_source = 'multiple'
            ExonSeqModule.runProgram(species,array_type,process_microRNA_predictions,mir_source,stringency)
            stringency = 'lax'
            ExonSeqModule.runProgram(species,array_type,process_microRNA_predictions,mir_source,stringency)
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        else:
            import JunctionSeqModule
            stringency = 'strict'; mir_source = 'multiple'
            JunctionSeqModule.runProgram(species,array_type,mir_source,stringency,force)
            stringency = 'lax'
            JunctionSeqModule.runProgram(species,array_type,mir_source,stringency,force)

    if array_type == 'junction':
        try:
            import JunctionArray; import JunctionArrayEnsemblRules
            JunctionArray.filterForCriticalExons(species,array_type)
            JunctionArray.overRideExonEntriesWithJunctions(species,array_type)
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        except IOError: print 'No built junction files to analyze';sys.exit()
    if array_type == 'RNASeq' and (species == 'Hs' or species == 'Mm' or species == 'Rn'):
        import JunctionArray; import JunctionArrayEnsemblRules
        try: JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
        except IOError: print 'No Ensembl_exons.txt file to analyze';sys.exit()
    
    try:
        filename = 'AltDatabase/'+species+'/SequenceData/miRBS-combined_gene-targets.txt'; ef=filepath(filename)
        er = string.replace(ef,species+'/SequenceData/miRBS-combined_gene-targets.txt','ensembl/'+species+'/'+species+'_microRNA-Ensembl.txt')
        import shutil; shutil.copyfile(ef,er)
    except Exception: null=[]
    if array_type != 'RNASeq':
        ### Get the probeset-probe relationships from online - needed for FIRMA analysis
        filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_probeset-probes.txt'
        if array_type == 'junction' and 'lue' in specific_array_type:
            server_folder = 'junction/hGlue'
            verifyFile(filename,server_folder) ### Will force download if missing
            verifyFile('AltDatabase/'+species+'/'+array_type+'/platform.txt',server_folder) ### Will force download if missing
        elif array_type != 'AltMouse': verifyFile(filename,array_type) ### Will force download if missing
        if (array_type == 'exon' or array_type == 'AltMouse') and species != 'Rn':
            try:
                ### Available for select exon-arrays and AltMouse
                probeset_to_remove_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_probes_to_remove.txt'
                verifyFile(probeset_to_remove_file,array_type)
            except Exception: null=[]
    output_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_'+array_type+'-exon_probesets.txt'
    fn=filepath(output_file); data = open(fn,'w')
    data.write(array_type+'_probeset\texon_probeset\n')
    
    for probeset in junction_to_exonids:
        exon_probeset = junction_to_exonids[probeset]
        data.write(probeset+'\t'+exon_probeset+'\n')    
    data.close()
    
if __name__ == '__main__':
    # Ad-hoc driver used when this module is run as a script.
    m = 'Mm'
    h = 'Hs'
    Species = m
    # In theory, could be another type of junction or combination array
    array_type = 'RNASeq'

    annotateJunctionIDsAsExon(Species, array_type)
    sys.exit()

    # NOTE(review): everything below is unreachable because of the
    # sys.exit() call above; it is kept as reference for manual runs.

    #reimportJunctionComps(Species,array_type,'original');kill
    #JunctionArray.getJunctionExonLocations(Species,array_type)

    # Disabled: get UCSC associations (download databases if necessary)
    #   import UCSCImport
    #   mRNA_Type = 'mrna'; run_from_scratch = 'yes'; force='no'
    #   export_all_associations = 'no' ### YES only for protein prediction analysis
    #   UCSCImport.runUCSCEnsemblAssociations(Species,mRNA_Type,export_all_associations,run_from_scratch,force)

    getAnnotations(Species, array_type, 'yes', 'no')
    JunctionArray.identifyJunctionComps(Species, array_type)
    #importAndReformatEnsemblJunctionAnnotations(Species,array_type)
    
    
Ejemplo n.º 8
0
def executeParameters(species, array_type, force, genomic_build,
                      update_uniprot, update_ensembl,
                      update_probeset_to_ensembl, update_domain, update_miRs,
                      update_all, update_miR_seq, ensembl_version):
    if '|' in array_type:
        array_type, specific_array_type = string.split(
            array_type, '|'
        )  ### To destinguish between array sub-types, like the HJAY and hGlue
    else:
        specific_array_type = array_type

    if update_all == 'yes':
        update_uniprot = 'yes'
        update_ensembl = 'yes'
        update_probeset_to_ensembl = 'yes'
        update_domain = 'yes'
        update_miRs = 'yes'

    if update_ensembl == 'yes':
        import EnsemblSQL
        reload(EnsemblSQL)
        """ Used to grab all essential Ensembl annotations previously obtained via BioMart"""
        configType = 'Advanced'
        analysisType = 'AltAnalyzeDBs'
        externalDBName = ''
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species, configType,
                                                       analysisType,
                                                       externalDBName,
                                                       ensembl_version, force)
        """ Used to grab Ensembl-to-External gene associations"""
        configType = 'Basic'
        analysisType = 'ExternalOnly'
        externalDBName = 'Uniprot/SWISSPROT'
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species, configType,
                                                       analysisType,
                                                       externalDBName,
                                                       ensembl_version, force)
        """ Used to grab Ensembl full gene sequence plus promoter and 3'UTRs """
        if array_type == 'AltMouse' or array_type == 'junction' or array_type == 'RNASeq':
            EnsemblSQL.getFullGeneSequences(ensembl_version, species)

    if update_uniprot == 'yes':
        ###Might need to delete the existing versions of downloaded databases or force download
        buildUniProtFunctAnnotations(species, force)

    if update_probeset_to_ensembl == 'yes':
        if species == 'Mm' and array_type == 'AltMouse':
            buildAltMouseExonAnnotations(species, array_type, force,
                                         genomic_build)
        elif array_type == 'junction':
            buildJunctionExonAnnotations(species, array_type,
                                         specific_array_type, force,
                                         genomic_build)
        elif array_type == 'RNASeq':
            import RNASeq
            test_status = 'no'
            data_type = 'mRNA'
            RNASeq.getEnsemblAssociations(species, data_type, test_status,
                                          force)
        else:
            buildExonArrayExonAnnotations(species, array_type, force)

    if update_domain == 'yes':

        ### Get UCSC associations for all Ensembl linked genes (download databases if necessary)        if species == 'Mm' and array_type == 'AltMouse':
        mRNA_Type = 'mrna'
        run_from_scratch = 'yes'
        export_all_associations = 'yes'  ### YES only for protein prediction analysis
        buildUCSCAnnoationFiles(species, mRNA_Type, export_all_associations,
                                run_from_scratch, force)

        if (species == 'Mm' and array_type == 'AltMouse'):
            """Imports and re-exports array-Ensembl annotations"""
            import JunctionArray
            null = JunctionArray.importArrayAnnotations(species, array_type)
            null = {}
        if (species == 'Mm' and array_type == 'AltMouse'
            ) or array_type == 'junction' or array_type == 'RNASeq':
            """Performs probeset sequence aligment to Ensembl and UCSC transcripts. To do: Need to setup download if files missing"""
            import mRNASeqAlign
            analysis_type = 'reciprocal'
            mRNASeqAlign.alignProbesetsToTranscripts(species, array_type,
                                                     analysis_type, force)

        import IdentifyAltIsoforms
        run_seqcomp = 'no'
        IdentifyAltIsoforms.runProgram(species, array_type, 'null', force,
                                       run_seqcomp)
        import FeatureAlignment
        FeatureAlignment.findDomainsByGenomeCoordinates(
            species, array_type, 'null')

        if array_type == 'junction' or array_type == 'RNASeq':
            ### For junction probeset sequences from mRNASeqAlign(), find and assess alternative proteins - export to the folder 'junction'
            mRNASeqAlign.alignProbesetsToTranscripts(species, array_type,
                                                     'single', force)
            IdentifyAltIsoforms.runProgram(species, array_type, 'junction',
                                           force, run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(
                species, array_type, 'junction')
            if array_type == 'junction' or array_type == 'RNASeq':
                ### For exon probesets (and junction exons) align and assess alternative proteins - export to the folder 'exon'
                IdentifyAltIsoforms.runProgram(species, array_type, 'exon',
                                               force, run_seqcomp)
                # FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'exon') # not needed
                if array_type == 'RNASeq':
                    import JunctionArray
                    JunctionArray.combineExonJunctionAnnotations(
                        species, array_type)

    if update_miRs == 'yes':
        if update_miR_seq == 'yes':
            import MatchMiRTargetPredictions
            only_add_sequence_to_previous_results = 'no'
            MatchMiRTargetPredictions.runProgram(
                species, force, only_add_sequence_to_previous_results)

        if array_type == 'exon' or array_type == 'gene':
            import ExonSeqModule
            stringency = 'strict'
            process_microRNA_predictions = 'yes'
            mir_source = 'multiple'
            ExonSeqModule.runProgram(species, array_type,
                                     process_microRNA_predictions, mir_source,
                                     stringency)
            stringency = 'lax'
            ExonSeqModule.runProgram(species, array_type,
                                     process_microRNA_predictions, mir_source,
                                     stringency)
            ExonArray.exportMetaProbesets(
                array_type, species)  ### Export metaprobesets for this build
        else:
            import JunctionSeqModule
            stringency = 'strict'
            mir_source = 'multiple'
            JunctionSeqModule.runProgram(species, array_type, mir_source,
                                         stringency, force)
            stringency = 'lax'
            JunctionSeqModule.runProgram(species, array_type, mir_source,
                                         stringency, force)

    if array_type == 'junction':
        try:
            import JunctionArray
            import JunctionArrayEnsemblRules
            JunctionArray.filterForCriticalExons(species, array_type)
            JunctionArray.overRideExonEntriesWithJunctions(species, array_type)
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(
                species, array_type)
            ExonArray.exportMetaProbesets(
                array_type, species)  ### Export metaprobesets for this build
        except IOError:
            print 'No built junction files to analyze'
            sys.exit()
    if array_type == 'RNASeq' and (species == 'Hs' or species == 'Mm'
                                   or species == 'Rn'):
        import JunctionArray
        import JunctionArrayEnsemblRules
        try:
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(
                species, array_type)
        except IOError:
            print 'No Ensembl_exons.txt file to analyze'
            sys.exit()

    try:
        filename = 'AltDatabase/' + species + '/SequenceData/miRBS-combined_gene-targets.txt'
        ef = filepath(filename)
        er = string.replace(
            ef, species + '/SequenceData/miRBS-combined_gene-targets.txt',
            'ensembl/' + species + '/' + species + '_microRNA-Ensembl.txt')
        import shutil
        shutil.copyfile(ef, er)
    except Exception:
        null = []
    if array_type != 'RNASeq':
        ### Get the probeset-probe relationships from online - needed for FIRMA analysis
        filename = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_probeset-probes.txt'
        if array_type == 'junction' and 'lue' in specific_array_type:
            server_folder = 'junction/hGlue'
            verifyFile(filename,
                       server_folder)  ### Will force download if missing
            verifyFile('AltDatabase/' + species + '/' + array_type +
                       '/platform.txt',
                       server_folder)  ### Will force download if missing
        elif array_type != 'AltMouse':
            verifyFile(filename,
                       array_type)  ### Will force download if missing
        if (array_type == 'exon'
                or array_type == 'AltMouse') and species != 'Rn':
            try:
                ### Available for select exon-arrays and AltMouse
                probeset_to_remove_file = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_probes_to_remove.txt'
                verifyFile(probeset_to_remove_file, array_type)
            except Exception:
                null = []
Ejemplo n.º 9
0
def buildAltMouseExonAnnotations(species, array_type, force, genomic_build):
    """Code required to:
    1) Extract out Affymetrix provided exon sequence (probeset sequence extracted from "probeset_sequence_reversed.txt", derived
       directly from the Affymetrix AltMouse probe annotation file), from the "SEQUENCE-transcript-dbase.txt" (built using
       dump-chip1 .gff sequence and AltMerge-Peptide Informatics script "sequence_analysis_AltMouse_refseq.py").
    2) Once exported, grab full length exon sequences using exon/intron coordinates matches to full-length gene sequences with 2kb
       flanking sequence to efficiently predict microRNA binding site exclusion (reAnnotateCriticalExonSequences) and later for
       coordinate mapping to get exons aligning with UCSC annotated splicing annotations and exons. This sequence data replaced
       the previous file (don't need to re-run this - see rederive_exonseq == 'yes' below for reference).
    3) Match the updated exon sequences to the most recent genomic coordinates and build the exact equivalent of the exon array
       Mm_Ensembl_probeset.txt database (same structure and ExonArrayEnsemblRules.py code). This involves running EnsemblImport.
    This code should be run before the exon array location build code since the "Mm_Ensembl_probeset.txt" is created and then re-
    written as "Mm_AltMouse_Ensembl_probeset.txt".
    """

    import JunctionArray
    import JunctionArrayEnsemblRules
    rederive_exonseq = 'no'
    ### Only needs to be run once, to export exon sequence for AltMouse array the original (1 and 2 above)
    if rederive_exonseq == 'yes':
        import AltAnalyze
        import ExonAnnotate_module
        import ExonAnalyze_module
        agglomerate_inclusion_probesets = 'no'
        onlyAnalyzeJunctions = 'no'
        probeset_annotations_file = "AltDatabase/" + species + "/" + array_type + "/" + "MASTER-probeset-transcript.txt"
        verifyFile(probeset_annotations_file,
                   array_type)  ### Will force download if missing
        exon_db = {}
        filtered_arrayids = {}
        filter_status = 'no'
        constituitive_probeset_db, exon_db, genes_being_analyzed = AltAnalyze.importSplicingAnnotationDatabase(
            probeset_annotations_file, array_type, filtered_arrayids,
            filter_status)
        alt_junction_db, critical_exon_db, exon_dbase, exon_inclusion_db, exon_db = ExonAnnotate_module.identifyPutativeSpliceEvents(
            exon_db, constituitive_probeset_db, {},
            agglomerate_inclusion_probesets, onlyAnalyzeJunctions)
        ExonAnnotate_module.exportJunctionComparisons(alt_junction_db,
                                                      critical_exon_db,
                                                      exon_dbase)
        print "Finished exporting junctions used in AltMouse array comparisons."

        ExonAnalyze_module.exportAltMouseExonSequence()
        JunctionArray.reAnnotateCriticalExonSequences(species, array_type)

    ### Get UCSC associations (download databases if necessary)
    mRNA_Type = 'mrna'
    run_from_scratch = 'yes'
    export_all_associations = 'no'  ### YES only for protein prediction analysis
    buildUCSCAnnoationFiles(species, mRNA_Type, export_all_associations,
                            run_from_scratch, force)

    reannotate_exon_seq = 'yes'
    print 'genomic_build', genomic_build
    if genomic_build == 'new':
        ### Need to run with every new genomic build (match up new coordinates
        print "Begining to derive exon sequence from new genomic build"
        JunctionArray.identifyCriticalExonLocations(species, array_type)
        reannotate_exon_seq = 'yes'
    JunctionArrayEnsemblRules.getAnnotations(species, array_type,
                                             reannotate_exon_seq, force)

    ### Download files required during AltAnalyze analysis but not during the database build process
    filename = "AltDatabase/" + species + "/" + array_type + "/" + "MASTER-probeset-transcript.txt"
    verifyFile(filename, array_type)  ### Will force download if missing
    filename = "AltDatabase/" + species + '/' + array_type + '/' + array_type + "_annotations.txt"
    verifyFile(filename, array_type)  ### Will force download if missing