def runProgram(Species,Array_type,mir_source,stringency,Force):
    """ Import critical-exon sequences for one species/array type and align miRNA target predictions to them.

    Sets the module-level globals species, array_type and force from the arguments.
    The directory AltDatabase/<species>/<array_type> is searched for a file whose name contains
    'critical-exon-seq_updated'; a file containing 'critical-exon-seq' is used when no updated
    file is present. When neither file exists, or when importing the located file raises
    UnboundLocalError, the '_critical-exon-seq_updated.txt' file is requested through
    update.downloadCurrentVersion and imported instead.

    mir_source and stringency are passed through to ExonSeqModule.alignmiRNAData.
    Nothing is returned. """
    global species; global array_type; global force
    process_microRNA_predictions = 'yes'
    species = Species
    array_type = Array_type
    force = Force

    import_dir = '/AltDatabase/'+species+'/'+array_type
    filedir = import_dir[1:]+'/'
    dir_list = read_directory(import_dir) ### All file names found in the array-type directory

    updated_seq_file = None
    original_seq_file = None
    for input_file in dir_list:
        if 'critical-exon-seq_updated' in input_file:
            updated_seq_file = filedir+input_file
        elif 'critical-exon-seq' in input_file:
            original_seq_file = filedir+input_file
    ### Prefer the updated file; either value stays None when no matching file was found
    probeset_seq_file = updated_seq_file or original_seq_file

    data_type = 'critical-exons'
    splice_event_db = None
    if probeset_seq_file is not None:
        try:
            splice_event_db = getParametersAndExecute(probeset_seq_file,array_type,species,data_type)
        except UnboundLocalError:
            probeset_seq_file = None ### Import failed - fall through to the download below

    if probeset_seq_file is None:
        ### Previously a missing local file raised an uncaught UnboundLocalError before this
        ### fallback could run; now the download happens for both failure modes
        probeset_seq_file = 'AltDatabase/'+species+'/'+array_type+'/'+array_type+'_critical-exon-seq_updated.txt'
        update.downloadCurrentVersion(probeset_seq_file,array_type,'txt')
        splice_event_db = getParametersAndExecute(probeset_seq_file,array_type,species,data_type)

    if process_microRNA_predictions == 'yes':
        print('stringency: %s' % (stringency,))
        try:
            ensembl_mirna_db = ExonSeqModule.importmiRNATargetPredictionsAdvanced(species)
            ExonSeqModule.alignmiRNAData(array_type,mir_source,species,stringency,ensembl_mirna_db,splice_event_db)
        except Exception as error:
            ### Still best-effort (no re-raise), but the failure is reported instead of silently dropped
            print('WARNING: miRNA target alignment was skipped for %s %s: %s' % (species,array_type,error))
def _iterTabDelimitedRows(fn):
    """ Yield the tab-separated fields of every line in fn after the first (header) line. """
    with open(fn,'r') as handle: ### Closed when the generator is exhausted or discarded
        header_skipped = False
        for line in handle:
            if not header_skipped:
                header_skipped = True
                continue
            ### rstrip (rather than unpacking a split on the newline) also accepts a final line without a newline
            yield line.rstrip('\n').split('\t')

def importSplicingAnnotationDatabaseAndSequence(species,array_type,biotype):
    """ Build a database of junction probeset sequences from the AltDatabase junction files.

    Reads <array_type>_critical-junction-seq.txt (probeset, probeset sequence, junction sequence)
    and <array_type>_junction-comparisons.txt (array gene, two reciprocal probesets, critical exons)
    from AltDatabase/<species>/<array_type>/. For 'AltMouse' the array-gene to Ensembl-gene
    relationships file is read as well (after update.verifyFile); for 'junction' and 'RNASeq'
    each array gene is mapped to itself.

    Returns a dictionary of ExonSeqModule.JunctionDataSimple objects:
      biotype == 'gene': Ensembl gene ID -> list of objects (one per probeset with sequence)
      otherwise:         probeset ID -> a single object carrying the list of Ensembl gene IDs

    Raises ValueError if a data line does not have the expected number of tab-separated columns. """
    file_prefix = 'AltDatabase/'+species+'/'+array_type+'/'+array_type

    array_ens_db={}
    if array_type == 'AltMouse':
        filename = file_prefix+'-Ensembl_relationships.txt'
        update.verifyFile(filename,array_type) ### Will force download if missing
        for array_gene,ens_gene in _iterTabDelimitedRows(filepath(filename)):
            array_ens_db.setdefault(array_gene,[]).append(ens_gene)

    probeset_seq_db={}
    filename = file_prefix+'_critical-junction-seq.txt'
    for probeset,probeset_seq,junction_seq in _iterTabDelimitedRows(filepath(filename)):
        ### The '|' characters are stripped from the junction sequence before it is stored
        probeset_seq_db[probeset] = probeset_seq,junction_seq.replace('|','')

    ### Import reciprocal junctions, so these can be compared directly instead of hits to nulls
    ### and combined with the sequence data
    probeset_gene_seq_db={}
    filename = file_prefix+'_junction-comparisons.txt'
    for array_gene,probeset1,probeset2,critical_exons in _iterTabDelimitedRows(filepath(filename)):
        probesets = [probeset1,probeset2]
        if array_type == 'junction' or array_type == 'RNASeq':
            array_ens_db[array_gene]=[array_gene]
        if array_gene not in array_ens_db:
            continue
        ensembl_gene_ids = array_ens_db[array_gene]
        for probeset_id in probesets:
            if probeset_id not in probeset_seq_db:
                continue
            probeset_seq,junction_seq = probeset_seq_db[probeset_id]
            if biotype == 'gene':
                for ensembl_gene_id in ensembl_gene_ids:
                    probe_data = ExonSeqModule.JunctionDataSimple(probeset_id,ensembl_gene_id,array_gene,probesets,critical_exons)
                    probe_data.SetExonSeq(probeset_seq)
                    probe_data.SetJunctionSeq(junction_seq)
                    probeset_gene_seq_db.setdefault(ensembl_gene_id,[]).append(probe_data)
            else:
                ### Keyed by probeset; the object receives the full list of Ensembl gene IDs
                probe_data = ExonSeqModule.JunctionDataSimple(probeset_id,ensembl_gene_ids,array_gene,probesets,critical_exons)
                probe_data.SetExonSeq(probeset_seq)
                probe_data.SetJunctionSeq(junction_seq)
                probeset_gene_seq_db[probeset_id] = probe_data

    ### The count is the number of dictionary keys (genes for biotype 'gene', probesets otherwise)
    print('%d genes with probeset sequence associated' % len(probeset_gene_seq_db))
    return probeset_gene_seq_db
def getParametersAndExecute(probeset_seq_file,array_type,species,data_type):
    """ Import probeset sequences for the requested data_type and report the elapsed time.

    data_type 'critical-exons': imports the Ensembl exon (RNASeq) or probeset annotation file for
    the array type, then reads probeset_seq_file against it with importProbesetSeqeunces.
    data_type 'junctions': delegates to importSplicingAnnotationDatabaseAndSequence at gene level.

    Returns the resulting probeset sequence database. Any other data_type leaves the result
    unassigned, so the function raises UnboundLocalError. """
    if data_type == 'critical-exons':
        if array_type != 'RNASeq':
            annotation_name = species+'_Ensembl_'+array_type+'_probesets.txt'
        else:
            annotation_name = species+'_Ensembl_exons.txt'
        probeset_annotations_file = 'AltDatabase/'+species+'/'+array_type+'/'+annotation_name
        ### Import probe-level associations (not included in the timing below)
        exon_db = ExonSeqModule.importSplicingAnnotationDatabase(probeset_annotations_file,array_type)
        timer_started = time.time()
        ### Done locally with a function that works on tab-delimited rather than fasta sequences (exon array)
        probeset_seq_db = importProbesetSeqeunces(probeset_seq_file,exon_db,species)
        timer_stopped = time.time()
        time_diff = int(timer_stopped-timer_started)
    elif data_type == 'junctions':
        timer_started = time.time()
        ### 'gene' indicates that information is stored at the level of genes rather than probesets
        probeset_seq_db = importSplicingAnnotationDatabaseAndSequence(species,array_type,'gene')
        timer_stopped = time.time()
        time_diff = int(timer_stopped-timer_started)
    print("Analyses finished in %d seconds" % time_diff)
    return probeset_seq_db
def executeParameters(species,array_type,force,genomic_build,update_uniprot,update_ensembl,update_probeset_to_ensembl,update_domain,update_miRs,update_all,update_miR_seq,ensembl_version):
    """ Run the database build/update steps selected by the 'yes' flags for one species and array type.

    array_type may be supplied as '<type>|<sub-type>'; the part before the '|' drives all of the
    branching below and the part after it is kept as specific_array_type. Setting update_all to
    'yes' switches on the UniProt, Ensembl, probeset-to-Ensembl, domain and miR steps
    (update_miR_seq is left as supplied). species, force, genomic_build and ensembl_version are
    passed through to the build functions that are called.

    Nothing is returned. sys.exit() is called if the junction or RNASeq annotation step at the
    end raises IOError. """
    ### NOTE(review): the nesting of this function was reconstructed from whitespace-collapsed
    ### source - confirm the block boundaries against the repository copy
    if '|' in array_type: array_type, specific_array_type = string.split(array_type,'|') ### To distinguish between array sub-types, like the HJAY and hGlue
    else: specific_array_type = array_type

    if update_all == 'yes':
        update_uniprot='yes'; update_ensembl='yes'; update_probeset_to_ensembl='yes'; update_domain='yes'; update_miRs = 'yes'

    ### Step 1: Ensembl relational tables (two passes with different configurations)
    if update_ensembl == 'yes':
        from build_scripts import EnsemblSQL; reload(EnsemblSQL)

        """ Used to grab all essential Ensembl annotations previously obtained via BioMart"""
        configType = 'Advanced'; analysisType = 'AltAnalyzeDBs'; externalDBName = ''
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species,configType,analysisType,externalDBName,ensembl_version,force)

        """ Used to grab Ensembl-to-External gene associations"""
        configType = 'Basic'; analysisType = 'ExternalOnly'; externalDBName = 'Uniprot/SWISSPROT'
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species,configType,analysisType,externalDBName,ensembl_version,force)

        """ Used to grab Ensembl full gene sequence plus promoter and 3'UTRs """
        if array_type == 'AltMouse' or array_type == 'junction' or array_type == 'RNASeq':
            EnsemblSQL.getFullGeneSequences(ensembl_version,species)

    ### Step 2: UniProt functional annotations
    if update_uniprot == 'yes':
        ### Might need to delete the existing versions of downloaded databases or force download
        buildUniProtFunctAnnotations(species,force)

    ### Step 3: probeset/exon-to-Ensembl annotations, one builder per platform
    if update_probeset_to_ensembl == 'yes':
        if species == 'Mm' and array_type == 'AltMouse':
            buildAltMouseExonAnnotations(species,array_type,force,genomic_build)
        elif array_type == 'junction':
            buildJunctionExonAnnotations(species,array_type,specific_array_type,force,genomic_build)
        elif array_type == 'RNASeq':
            import RNASeq; test_status = 'no'; data_type = 'mRNA'
            RNASeq.getEnsemblAssociations(species,data_type,test_status,force)
        else: buildExonArrayExonAnnotations(species,array_type,force)

    ### Step 4: protein domain/isoform associations
    if update_domain == 'yes':
        if array_type == 'RNASeq':
            only_rely_on_coordinate_mapping = True ### This will provide more accurate results as many junctions have missing sequences
        else:
            only_rely_on_coordinate_mapping = False

        from build_scripts import FeatureAlignment
        from build_scripts import JunctionArray
        from build_scripts import mRNASeqAlign
        from build_scripts import IdentifyAltIsoforms
        ### Get UCSC associations for all Ensembl linked genes (download databases if necessary)
        if species == 'Mm' and array_type == 'AltMouse':
            mRNA_Type = 'mrna'; run_from_scratch = 'yes'
            export_all_associations = 'yes' ### YES only for protein prediction analysis
            buildUCSCAnnoationFiles(species,mRNA_Type,export_all_associations,run_from_scratch,force)

        if (species == 'Mm' and array_type == 'AltMouse'):
            """Imports and re-exports array-Ensembl annotations"""
            null = JunctionArray.importArrayAnnotations(species,array_type); null={}
        if (species == 'Mm' and array_type == 'AltMouse') or array_type == 'junction' or array_type == 'RNASeq':
            ### Sequence-based alignment is skipped for RNASeq (coordinate mapping only, see flag above)
            if only_rely_on_coordinate_mapping == False:
                """Performs probeset sequence aligment to Ensembl and UCSC transcripts. To do: Need to setup download if files missing"""
                analysis_type = 'reciprocal'
                mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,analysis_type,force)

        run_seqcomp = 'no'
        if only_rely_on_coordinate_mapping == False:
            IdentifyAltIsoforms.runProgram(species,array_type,'null',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'null')

        if array_type == 'junction' or array_type == 'RNASeq':
            if only_rely_on_coordinate_mapping == False:
                ### For junction probeset sequences from mRNASeqAlign(), find and assess alternative proteins - export to the folder 'junction'
                mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'single',force)
                IdentifyAltIsoforms.runProgram(species,array_type,'junction',force,run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'junction')
                ### For exon probesets (and junction exons) align and assess alternative proteins - export to the folder 'exon'
                IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'exon') # not needed

            """ Repeat above with CoordinateBasedMatching = True """
            ### Perform coordinate based junction mapping to transcripts (requires certain sequence files built in IdentifyAltIsoforms)
            analysis_type = 'reciprocal'
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,analysis_type,force,CoordinateBasedMatching = True)
            IdentifyAltIsoforms.runProgram(species,array_type,'null',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'null')
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'single',force,CoordinateBasedMatching = True)
            IdentifyAltIsoforms.runProgram(species,array_type,'junction',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'junction')
            IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)
            if array_type == 'RNASeq':
                JunctionArray.combineExonJunctionAnnotations(species,array_type)

    ### Step 5: microRNA binding site predictions, run once with 'strict' and once with 'lax' stringency
    if update_miRs == 'yes':
        if update_miR_seq == 'yes':
            from build_scripts import MatchMiRTargetPredictions; only_add_sequence_to_previous_results = 'no'
            MatchMiRTargetPredictions.runProgram(species,force,only_add_sequence_to_previous_results)

        if array_type == 'exon' or array_type == 'gene':
            from build_scripts import ExonSeqModule
            stringency = 'strict'; process_microRNA_predictions = 'yes'; mir_source = 'multiple'
            ExonSeqModule.runProgram(species,array_type,process_microRNA_predictions,mir_source,stringency)
            stringency = 'lax'
            ExonSeqModule.runProgram(species,array_type,process_microRNA_predictions,mir_source,stringency)
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        else:
            from build_scripts import JunctionSeqModule
            stringency = 'strict'; mir_source = 'multiple'
            JunctionSeqModule.runProgram(species,array_type,mir_source,stringency,force)
            stringency = 'lax'
            JunctionSeqModule.runProgram(species,array_type,mir_source,stringency,force)

    ### The steps below do not depend on any of the update_* flags
    if array_type == 'junction':
        try:
            from build_scripts import JunctionArray; from build_scripts import JunctionArrayEnsemblRules
            JunctionArray.filterForCriticalExons(species,array_type)
            JunctionArray.overRideExonEntriesWithJunctions(species,array_type)
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        except IOError: print 'No built junction files to analyze';sys.exit()
    if array_type == 'RNASeq' and (species == 'Hs' or species == 'Mm' or species == 'Rn'):
        from build_scripts import JunctionArray; from build_scripts import JunctionArrayEnsemblRules
        try: JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
        except IOError: print 'No Ensembl_exons.txt file to analyze';sys.exit()

    ### Copy the combined miR binding site gene-target file to ensembl/<species>/<species>_microRNA-Ensembl.txt;
    ### any failure (for example a missing source file) is deliberately ignored
    try:
        filename = 'AltDatabase/'+species+'/SequenceData/miRBS-combined_gene-targets.txt'; ef=filepath(filename)
        er = string.replace(ef,species+'/SequenceData/miRBS-combined_gene-targets.txt','ensembl/'+species+'/'+species+'_microRNA-Ensembl.txt')
        import shutil; shutil.copyfile(ef,er)
    except Exception: null=[]
    if array_type != 'RNASeq':
        ### Get the probeset-probe relationships from online - needed for FIRMA analysis
        filename = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_probeset-probes.txt'
        if array_type == 'junction' and 'lue' in specific_array_type:
            ### Sub-types whose name contains 'lue' (e.g. hGlue) are fetched from their own server folder
            server_folder = 'junction/hGlue'
            verifyFile(filename,server_folder) ### Will force download if missing
            verifyFile('AltDatabase/'+species+'/'+array_type+'/platform.txt',server_folder) ### Will force download if missing
        elif array_type != 'AltMouse': verifyFile(filename,array_type) ### Will force download if missing
        if (array_type == 'exon' or array_type == 'AltMouse') and species != 'Rn':
            try:
                ### Available for select exon-arrays and AltMouse; failures are ignored
                probeset_to_remove_file = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_probes_to_remove.txt'
                verifyFile(probeset_to_remove_file,array_type)
            except Exception: null=[]
def annotateJunctionIDsAsExon(species, array_type):
    """ Map junction (or RNASeq) IDs to exon-array probesets that share a gene and exon region.

    Imports the junction annotation file for array_type (the '<species>_Ensembl_exons.txt' file
    when array_type is 'RNASeq') and the exon-array annotation file
    'AltDatabase/<species>/exon/<species>_Ensembl_probesets.txt', then writes the two-column,
    tab-delimited file
    'AltDatabase/<species>/<array_type>/<species>_<array_type>-exon_probesets.txt'
    with one line per junction ID for which a matching exon probeset was found.

    Nothing is returned. """
    from build_scripts import ExonSeqModule

    probeset_annotations_file = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_junction_probesets-filtered.txt'
    if array_type == 'RNASeq':
        probeset_annotations_file = probeset_annotations_file.replace(
            'junction_probesets-filtered', 'exons')
    junction_exon_db = ExonSeqModule.importSplicingAnnotationDatabase(
        probeset_annotations_file, array_type)

    probeset_annotations_file = 'AltDatabase/' + species + '/exon/' + species + '_Ensembl_probesets.txt'
    exon_db = ExonSeqModule.importSplicingAnnotationDatabase(
        probeset_annotations_file, array_type)

    ### Index exon-array annotations by (gene, exon region); probesets annotated with a single
    ### region are kept apart from those annotated with several '|'-separated regions
    multiple_exon_regions = {}
    unique_exon_regions = {}
    for probeset in exon_db:
        exon_entry = exon_db[probeset]
        geneid = exon_entry.GeneID()
        region_id = exon_entry.ExonRegionID()
        if '|' in region_id:
            for exonid in region_id.split('|'):
                multiple_exon_regions[geneid, exonid] = exon_entry
        else:
            unique_exon_regions[geneid, region_id] = exon_entry

    ### Multi-region probesets only fill in regions that no single-region probeset covers
    for uid in multiple_exon_regions:
        if uid not in unique_exon_regions:
            unique_exon_regions[uid] = multiple_exon_regions[uid]

    ### Look up each junction's exon region(s) in the exon-array index
    junction_to_exonids = {}
    for probeset in junction_exon_db:
        junction_entry = junction_exon_db[probeset]
        geneid = junction_entry.GeneID()
        region_id = junction_entry.ExonRegionID()
        if '|' in region_id:
            ### When several regions match, the last matching region determines the probeset
            for exonid in region_id.split('|'):
                if (geneid, exonid) in unique_exon_regions:
                    junction_to_exonids[probeset] = unique_exon_regions[geneid, exonid].Probeset()
        else:
            ### Single-region IDs are looked up with '.' replaced by '-'
            ### NOTE(review): the multi-region branch above does not apply this replacement - confirm this is intended
            region_key = (geneid, region_id.replace('.', '-'))
            if region_key in unique_exon_regions:
                junction_to_exonids[probeset] = unique_exon_regions[region_key].Probeset()

    output_file = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_' + array_type + '-exon_probesets.txt'
    ### The with-block closes the export file even if a write fails
    with open(filepath(output_file), 'w') as data:
        data.write(array_type + '_probeset\texon_probeset\n')
        for probeset in junction_to_exonids:
            data.write(probeset + '\t' + junction_to_exonids[probeset] + '\n')