def buildJunctionExonAnnotations(species,array_type,specific_array_type,force,genomic_build):
    """Build the exon and junction annotation files for a junction array.

    The calls made below:
    1) Extract transcript cluster-to-gene annotations
    2) Extract exon sequences for junctions and exon probesets from the
       Affymetrix annotation file (version 2.0)
    3) Map these sequences to Ensembl gene sequences (build specific) plus
       and minus 2KB, upstream and downstream
    4) Obtain AltAnalyze exon region annotations and obtain full-length exon
       sequences for each exon probeset
    5) Consolidate these into an Ensembl_probeset.txt file (rather than
       Ensembl_junction_probeset.txt) with junctions having a single
       probeset identifier
    6) Determine which junctions and junction-exons represent reciprocal
       junctions using:
       a) AltAnalyze identified reciprocal junctions from Ensembl and UCSC
       b) Affymetrix suggested reciprocal junctions based on common exon
          cluster annotations, creating Mm_junction_comps_updated.txt
       c) De novo comparison of all exon-junction region IDs for all
          junctions using the EnsemblImport method compareJunctions()

    species, array_type and specific_array_type are handed straight to the
    JunctionArray / JunctionArrayEnsemblRules calls. The incoming force value
    is not honoured (see below) and genomic_build is not used by this function.
    """
    from build_scripts import JunctionArray
    from build_scripts import JunctionArrayEnsemblRules

    ### The caller-supplied force value is overridden: 'no' is always passed on
    force = 'no'

    ### The UCSC association step (buildUCSCAnnoationFiles) is disabled for this platform

    ### Steps 1-3: genomic locations and initial annotations for exon sequences
    ### (exon probesets and junctions)
    JunctionArray.getJunctionExonLocations(species,array_type,specific_array_type)

    ### Step 4
    reannotate_exon_seq = 'yes'
    JunctionArrayEnsemblRules.getAnnotations(species,array_type,reannotate_exon_seq,force)

    ### Steps 5-6
    JunctionArray.identifyJunctionComps(species,array_type,specific_array_type)
def buildAltMouseExonAnnotations(species,array_type,force,genomic_build):
    """Build the exon annotation database for the AltMouse array.

    Code required to:
    1) Extract out Affymetrix provided exon sequence (probeset sequence
       extracted from "probeset_sequence_reversed.txt", derived directly from
       the Affymetrix AltMouse probe annotation file), from the
       "SEQUENCE-transcript-dbase.txt" (built using dump-chip1 .gff sequence
       and AltMerge-Peptide Informatics script
       "sequence_analysis_AltMouse_refseq.py").
    2) Once exported, grab full length exon sequences using exon/intron
       coordinate matches to full-length gene sequences with 2kb flanking
       sequence to efficiently predict microRNA binding site exclusion
       (reAnnotateCriticalExonSequences) and later for coordinate mapping to
       get exons aligning with UCSC annotated splicing annotations and exons.
       This sequence data replaced the previous file (no need to re-run
       this - see rederive_exonseq == 'yes' below for reference).
    3) Match the updated exon sequences to the most recent genomic
       coordinates and build the exact equivalent of the exon array
       Mm_Ensembl_probeset.txt database (same structure and
       ExonArrayEnsemblRules.py code). This involves running EnsemblImport.

    This code should be run before the exon array location build code since
    the "Mm_Ensembl_probeset.txt" is created and then re-written as
    "Mm_AltMouse_Ensembl_probeset.txt".

    force is forwarded to the UCSC and Ensembl annotation steps;
    genomic_build == 'new' additionally re-derives the critical exon locations.
    """
    from build_scripts import JunctionArray
    from build_scripts import JunctionArrayEnsemblRules

    ### Only needs to be run once, to export exon sequence for the AltMouse
    ### array (steps 1 and 2 above); hard-wired off and kept for reference
    rederive_exonseq = 'no'
    if rederive_exonseq == 'yes':
        import AltAnalyze
        from import_scripts import ExonAnnotate_module
        from build_scripts import ExonAnalyze_module

        agglomerate_inclusion_probesets = 'no'
        onlyAnalyzeJunctions = 'no'
        filter_status = 'no'
        filtered_arrayids = {}
        probeset_annotations_file = "AltDatabase/"+species+"/"+array_type+"/"+"MASTER-probeset-transcript.txt"
        verifyFile(probeset_annotations_file,array_type) ### Will force download if missing

        imported = AltAnalyze.importSplicingAnnotationDatabase(
            probeset_annotations_file,array_type,filtered_arrayids,filter_status)
        constitutive_db,exon_db,_genes_analyzed = imported

        splice_events = ExonAnnotate_module.identifyPutativeSpliceEvents(
            exon_db,constitutive_db,{},agglomerate_inclusion_probesets,onlyAnalyzeJunctions)
        alt_junction_db,critical_exon_db,exon_dbase,_exon_inclusion_db,exon_db = splice_events

        ExonAnnotate_module.exportJunctionComparisons(alt_junction_db,critical_exon_db,exon_dbase)
        print("Finished exporting junctions used in AltMouse array comparisons.")

        ExonAnalyze_module.exportAltMouseExonSequence()
        JunctionArray.reAnnotateCriticalExonSequences(species,array_type)

    ### Get UCSC associations (download databases if necessary)
    mRNA_Type = 'mrna'
    run_from_scratch = 'yes'
    export_all_associations = 'no' ### YES only for protein prediction analysis
    buildUCSCAnnoationFiles(species,mRNA_Type,export_all_associations,run_from_scratch,force)

    ### Exon sequence is re-annotated whether or not the genomic build is new
    reannotate_exon_seq = 'yes'
    print('%s %s' % ('genomic_build', genomic_build))
    if genomic_build == 'new':
        ### Need to run with every new genomic build (match up new coordinates)
        print("Begining to derive exon sequence from new genomic build")
        JunctionArray.identifyCriticalExonLocations(species,array_type)
    JunctionArrayEnsemblRules.getAnnotations(species,array_type,reannotate_exon_seq,force)

    ### Download files required during AltAnalyze analysis but not during the database build process
    master_file = "AltDatabase/"+species+"/"+array_type+"/"+"MASTER-probeset-transcript.txt"
    verifyFile(master_file,array_type) ### Will force download if missing
    annotations_file = "AltDatabase/"+species+'/'+ array_type+'/'+array_type+"_annotations.txt"
    verifyFile(annotations_file,array_type) ### Will force download if missing
def executeParameters(species,array_type,force,genomic_build,update_uniprot,update_ensembl,update_probeset_to_ensembl,update_domain,update_miRs,update_all,update_miR_seq,ensembl_version):
    """Run the database build steps selected by the 'yes'/'no' update flags.

    array_type may be given as 'platform|sub-type'; the part before the '|'
    becomes the platform and the part after it the specific array type.
    Otherwise both are the same value.

    update_all == 'yes' switches on the UniProt, Ensembl, probeset-to-Ensembl,
    domain and miR updates (update_miR_seq is left as supplied). After the
    flag-driven steps, the junction/RNASeq post-processing, the miR target
    file copy and the probeset file checks run regardless of the flags.

    Calls sys.exit() when the junction or RNASeq post-processing raises
    IOError.
    """
    ### To distinguish between array sub-types, like the HJAY and hGlue
    if '|' in array_type:
        array_type, specific_array_type = array_type.split('|')
    else:
        specific_array_type = array_type

    is_altmouse = (species == 'Mm' and array_type == 'AltMouse')

    if update_all == 'yes':
        update_uniprot = update_ensembl = update_probeset_to_ensembl = 'yes'
        update_domain = update_miRs = 'yes'

    if update_ensembl == 'yes':
        from build_scripts import EnsemblSQL; reload(EnsemblSQL)
        ### Each entry is (configType, analysisType, externalDBName)
        ensembl_builds = [
            ### All essential Ensembl annotations previously obtained via BioMart
            ('Advanced','AltAnalyzeDBs',''),
            ### Ensembl-to-External gene associations
            ('Basic','ExternalOnly','Uniprot/SWISSPROT'),
        ]
        for configType, analysisType, externalDBName in ensembl_builds:
            EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species,configType,analysisType,externalDBName,ensembl_version,force)
        ### Ensembl full gene sequence plus promoter and 3'UTRs
        if array_type in ('AltMouse','junction','RNASeq'):
            EnsemblSQL.getFullGeneSequences(ensembl_version,species)

    if update_uniprot == 'yes':
        ### Might need to delete the existing versions of downloaded databases or force download
        buildUniProtFunctAnnotations(species,force)

    if update_probeset_to_ensembl == 'yes':
        if is_altmouse:
            buildAltMouseExonAnnotations(species,array_type,force,genomic_build)
        elif array_type == 'junction':
            buildJunctionExonAnnotations(species,array_type,specific_array_type,force,genomic_build)
        elif array_type == 'RNASeq':
            import RNASeq
            data_type = 'mRNA'
            test_status = 'no'
            RNASeq.getEnsemblAssociations(species,data_type,test_status,force)
        else:
            buildExonArrayExonAnnotations(species,array_type,force)

    if update_domain == 'yes':
        ### Coordinate-only mapping for RNASeq: this will provide more accurate
        ### results as many junctions have missing sequences
        only_rely_on_coordinate_mapping = (array_type == 'RNASeq')
        align_by_sequence = not only_rely_on_coordinate_mapping

        from build_scripts import FeatureAlignment
        from build_scripts import JunctionArray
        from build_scripts import mRNASeqAlign
        from build_scripts import IdentifyAltIsoforms

        ### Get UCSC associations for all Ensembl linked genes (download databases if necessary)
        ### NOTE(review): in the collapsed original the text
        ### "if species == 'Mm' and array_type == 'AltMouse':" trails the comment above;
        ### it is treated as part of that comment, so this step runs for every platform - confirm
        mRNA_Type = 'mrna'
        run_from_scratch = 'yes'
        export_all_associations = 'yes' ### YES only for protein prediction analysis
        buildUCSCAnnoationFiles(species,mRNA_Type,export_all_associations,run_from_scratch,force)

        if is_altmouse:
            ### Imports and re-exports array-Ensembl annotations (result is discarded)
            JunctionArray.importArrayAnnotations(species,array_type)

        if (is_altmouse or array_type in ('junction','RNASeq')) and align_by_sequence:
            ### Performs probeset sequence alignment to Ensembl and UCSC transcripts.
            ### To do: Need to setup download if files missing
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'reciprocal',force)

        run_seqcomp = 'no'
        if align_by_sequence:
            IdentifyAltIsoforms.runProgram(species,array_type,'null',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'null')

        if array_type in ('junction','RNASeq'):
            if align_by_sequence:
                ### For junction probeset sequences from mRNASeqAlign(), find and assess
                ### alternative proteins - export to the folder 'junction'
                mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'single',force)
                IdentifyAltIsoforms.runProgram(species,array_type,'junction',force,run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'junction')
                ### For exon probesets (and junction exons) align and assess
                ### alternative proteins - export to the folder 'exon'
                IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'exon') # not needed

            ### Repeat above with CoordinateBasedMatching = True: perform coordinate based
            ### junction mapping to transcripts (requires certain sequence files built in
            ### IdentifyAltIsoforms)
            for analysis_type, export_folder in (('reciprocal','null'),('single','junction')):
                mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,analysis_type,force,CoordinateBasedMatching = True)
                IdentifyAltIsoforms.runProgram(species,array_type,export_folder,force,run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,export_folder)
            IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)

            if array_type == 'RNASeq':
                JunctionArray.combineExonJunctionAnnotations(species,array_type)

    if update_miRs == 'yes':
        if update_miR_seq == 'yes':
            from build_scripts import MatchMiRTargetPredictions
            only_add_sequence_to_previous_results = 'no'
            MatchMiRTargetPredictions.runProgram(species,force,only_add_sequence_to_previous_results)

        mir_source = 'multiple'
        if array_type in ('exon','gene'):
            from build_scripts import ExonSeqModule
            process_microRNA_predictions = 'yes'
            for stringency in ('strict','lax'):
                ExonSeqModule.runProgram(species,array_type,process_microRNA_predictions,mir_source,stringency)
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        else:
            from build_scripts import JunctionSeqModule
            for stringency in ('strict','lax'):
                JunctionSeqModule.runProgram(species,array_type,mir_source,stringency,force)

    ### NOTE(review): the junction and RNASeq post-processing below is placed at
    ### function level (independent of the update flags) - confirm against the
    ### original indentation
    if array_type == 'junction':
        try:
            from build_scripts import JunctionArray; from build_scripts import JunctionArrayEnsemblRules
            JunctionArray.filterForCriticalExons(species,array_type)
            JunctionArray.overRideExonEntriesWithJunctions(species,array_type)
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        except IOError:
            print('No built junction files to analyze'); sys.exit()

    if array_type == 'RNASeq' and species in ('Hs','Mm','Rn'):
        from build_scripts import JunctionArray; from build_scripts import JunctionArrayEnsemblRules
        try:
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
        except IOError:
            print('No Ensembl_exons.txt file to analyze'); sys.exit()

    ### Best-effort copy of the combined miR target file into the ensembl folder;
    ### any failure is ignored
    try:
        filename = 'AltDatabase/'+species+'/SequenceData/miRBS-combined_gene-targets.txt'
        ef = filepath(filename)
        er = ef.replace(species+'/SequenceData/miRBS-combined_gene-targets.txt','ensembl/'+species+'/'+species+'_microRNA-Ensembl.txt')
        import shutil; shutil.copyfile(ef,er)
    except Exception:
        pass

    if array_type != 'RNASeq':
        ### Get the probeset-probe relationships from online - needed for FIRMA analysis
        filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_probeset-probes.txt'
        if array_type == 'junction' and 'lue' in specific_array_type:
            server_folder = 'junction/hGlue'
            verifyFile(filename,server_folder) ### Will force download if missing
            verifyFile('AltDatabase/'+species+'/'+array_type+'/platform.txt',server_folder) ### Will force download if missing
        elif array_type != 'AltMouse':
            verifyFile(filename,array_type) ### Will force download if missing

        if array_type in ('exon','AltMouse') and species != 'Rn':
            ### Available for select exon-arrays and AltMouse; failures are ignored
            try:
                probeset_to_remove_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_probes_to_remove.txt'
                verifyFile(probeset_to_remove_file,array_type)
            except Exception:
                pass