def formatAttributeForExport(attribute_db,filename):
    """Flatten probeset attributes to 'attribute|direction' strings and export them.

    attribute_db is keyed by (gene, probeset) tuples and holds lists of
    (attribute, direction) pairs. The gene part of the key is dropped, so the
    exported dictionary is keyed by probeset only. Any '|' inside an attribute
    is replaced by a space because '|' is used as the attribute/direction
    delimiter in the exported value. The result is written with
    IdentifyAltIsoforms.exportSimple(); nothing is returned.
    """
    from build_scripts import IdentifyAltIsoforms

    export_db = {}
    for key in attribute_db:
        gene, probeset = key
        export_db[probeset] = [
            label.replace('|', ' ') + '|' + direction
            for (label, direction) in attribute_db[key]
        ]

    print('Exporting: %s' % (filename,))
    IdentifyAltIsoforms.exportSimple(export_db, filename, '')
def executeParameters(species,array_type,force,genomic_build,update_uniprot,update_ensembl,update_probeset_to_ensembl,update_domain,update_miRs,update_all,update_miR_seq,ensembl_version):
    """Run the database build steps selected by the update flags.

    Every ``update_*`` argument is a string flag; a step only runs when its
    flag equals 'yes'. ``update_all == 'yes'`` switches on the UniProt,
    Ensembl, probeset-to-Ensembl, domain and miR steps (``update_miR_seq`` is
    not affected by it).

    species -- species code; 'Mm', 'Hs' and 'Rn' are tested for explicitly
    array_type -- platform name, optionally 'platform|subtype'
    force, genomic_build, ensembl_version -- passed through to the builders

    After the flagged steps, the junction/RNASeq annotation post-processing,
    the microRNA-Ensembl file copy and the probeset file verification run
    regardless of the flags. The process exits via sys.exit() when the
    junction or RNASeq annotation step raises IOError. Returns None.
    """
    ### 'platform|subtype' distinguishes array sub-types, like the HJAY and hGlue
    if '|' in array_type:
        array_type, specific_array_type = array_type.split('|')
    else:
        specific_array_type = array_type

    if update_all == 'yes':
        update_uniprot = 'yes'
        update_ensembl = 'yes'
        update_probeset_to_ensembl = 'yes'
        update_domain = 'yes'
        update_miRs = 'yes'

    is_alt_mouse = (species == 'Mm' and array_type == 'AltMouse')
    is_splicing_platform = array_type in ('junction', 'RNASeq')

    if update_ensembl == 'yes':
        from build_scripts import EnsemblSQL
        reload(EnsemblSQL)

        ### Grab all essential Ensembl annotations previously obtained via BioMart
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species,'Advanced','AltAnalyzeDBs','',ensembl_version,force)

        ### Grab Ensembl-to-External gene associations
        EnsemblSQL.buildEnsemblRelationalTablesFromSQL(species,'Basic','ExternalOnly','Uniprot/SWISSPROT',ensembl_version,force)

        ### Grab Ensembl full gene sequence plus promoter and 3'UTRs
        if array_type == 'AltMouse' or is_splicing_platform:
            EnsemblSQL.getFullGeneSequences(ensembl_version,species)

    if update_uniprot == 'yes':
        ### Might need to delete the existing versions of downloaded databases or force download
        buildUniProtFunctAnnotations(species,force)

    if update_probeset_to_ensembl == 'yes':
        if is_alt_mouse:
            buildAltMouseExonAnnotations(species,array_type,force,genomic_build)
        elif array_type == 'junction':
            buildJunctionExonAnnotations(species,array_type,specific_array_type,force,genomic_build)
        elif array_type == 'RNASeq':
            import RNASeq
            data_type = 'mRNA'
            test_status = 'no'
            RNASeq.getEnsemblAssociations(species,data_type,test_status,force)
        else:
            buildExonArrayExonAnnotations(species,array_type,force)

    if update_domain == 'yes':
        ### For RNASeq rely on coordinates only: this will provide more accurate
        ### results as many junctions have missing sequences
        only_rely_on_coordinate_mapping = (array_type == 'RNASeq')
        align_by_sequence = not only_rely_on_coordinate_mapping

        from build_scripts import FeatureAlignment
        from build_scripts import JunctionArray
        from build_scripts import mRNASeqAlign
        from build_scripts import IdentifyAltIsoforms

        ### Get UCSC associations for all Ensembl linked genes (download databases if necessary)
        ### NOTE(review): the original comment line ended with the text
        ### "if species == 'Mm' and array_type == 'AltMouse':"; it is treated here
        ### as comment residue, so this step runs for every platform - confirm.
        mRNA_Type = 'mrna'
        run_from_scratch = 'yes'
        export_all_associations = 'yes' ### YES only for protein prediction analysis
        buildUCSCAnnoationFiles(species,mRNA_Type,export_all_associations,run_from_scratch,force)

        if is_alt_mouse:
            ### Imports and re-exports array-Ensembl annotations (the return value is not used)
            JunctionArray.importArrayAnnotations(species,array_type)

        if (is_alt_mouse or is_splicing_platform) and align_by_sequence:
            ### Performs probeset sequence aligment to Ensembl and UCSC transcripts.
            ### To do: Need to setup download if files missing
            mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'reciprocal',force)

        run_seqcomp = 'no'
        if align_by_sequence:
            IdentifyAltIsoforms.runProgram(species,array_type,'null',force,run_seqcomp)
            FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'null')

        if is_splicing_platform:
            if align_by_sequence:
                ### For junction probeset sequences from mRNASeqAlign(), find and assess
                ### alternative proteins - export to the folder 'junction'
                mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,'single',force)
                IdentifyAltIsoforms.runProgram(species,array_type,'junction',force,run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'junction')
                ### For exon probesets (and junction exons) align and assess
                ### alternative proteins - export to the folder 'exon'
                IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,'exon') # not needed

            ### Repeat above with CoordinateBasedMatching = True: coordinate based junction
            ### mapping to transcripts (requires certain sequence files built in IdentifyAltIsoforms)
            for (analysis_type, export_folder) in (('reciprocal','null'), ('single','junction')):
                mRNASeqAlign.alignProbesetsToTranscripts(species,array_type,analysis_type,force,CoordinateBasedMatching = True)
                IdentifyAltIsoforms.runProgram(species,array_type,export_folder,force,run_seqcomp)
                FeatureAlignment.findDomainsByGenomeCoordinates(species,array_type,export_folder)
            IdentifyAltIsoforms.runProgram(species,array_type,'exon',force,run_seqcomp)
            if array_type == 'RNASeq':
                JunctionArray.combineExonJunctionAnnotations(species,array_type)

    if update_miRs == 'yes':
        if update_miR_seq == 'yes':
            from build_scripts import MatchMiRTargetPredictions
            only_add_sequence_to_previous_results = 'no'
            MatchMiRTargetPredictions.runProgram(species,force,only_add_sequence_to_previous_results)

        mir_source = 'multiple'
        if array_type in ('exon', 'gene'):
            from build_scripts import ExonSeqModule
            process_microRNA_predictions = 'yes'
            for stringency in ('strict', 'lax'):
                ExonSeqModule.runProgram(species,array_type,process_microRNA_predictions,mir_source,stringency)
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        else:
            from build_scripts import JunctionSeqModule
            for stringency in ('strict', 'lax'):
                JunctionSeqModule.runProgram(species,array_type,mir_source,stringency,force)

    if array_type == 'junction':
        try:
            from build_scripts import JunctionArray
            from build_scripts import JunctionArrayEnsemblRules
            JunctionArray.filterForCriticalExons(species,array_type)
            JunctionArray.overRideExonEntriesWithJunctions(species,array_type)
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
            ExonArray.exportMetaProbesets(array_type,species) ### Export metaprobesets for this build
        except IOError:
            print('No built junction files to analyze')
            sys.exit()

    if array_type == 'RNASeq' and species in ('Hs', 'Mm', 'Rn'):
        from build_scripts import JunctionArray
        from build_scripts import JunctionArrayEnsemblRules
        try:
            JunctionArrayEnsemblRules.annotateJunctionIDsAsExon(species,array_type)
        except IOError:
            print('No Ensembl_exons.txt file to analyze')
            sys.exit()

    ### Best-effort copy of the combined miR binding-site file to the ensembl
    ### folder; any failure is deliberately ignored
    try:
        source_suffix = species+'/SequenceData/miRBS-combined_gene-targets.txt'
        ef = filepath('AltDatabase/'+source_suffix)
        er = ef.replace(source_suffix,'ensembl/'+species+'/'+species+'_microRNA-Ensembl.txt')
        import shutil
        shutil.copyfile(ef,er)
    except Exception:
        pass

    if array_type != 'RNASeq':
        ### Get the probeset-probe relationships from online - needed for FIRMA analysis
        platform_dir = 'AltDatabase/'+species+'/'+array_type+'/'
        probe_file = platform_dir+species+'_probeset-probes.txt'
        if array_type == 'junction' and 'lue' in specific_array_type:
            server_folder = 'junction/hGlue'
            verifyFile(probe_file,server_folder) ### Will force download if missing
            verifyFile(platform_dir+'platform.txt',server_folder) ### Will force download if missing
        elif array_type != 'AltMouse':
            verifyFile(probe_file,array_type) ### Will force download if missing

        if array_type in ('exon', 'AltMouse') and species != 'Rn':
            ### Available for select exon-arrays and AltMouse; failures are ignored
            try:
                probeset_to_remove_file = platform_dir+species+'_probes_to_remove.txt'
                verifyFile(probeset_to_remove_file,array_type)
            except Exception:
                pass
def reAnalyzeRNAProbesetMatches(align_files, species, array_type, pairwise_probeset_combinations):
    """Import matching and non-matching probesets and export the valid comparisons.

    align_files -- alignment result filenames; only names containing
        array_type are read. Each line must hold three tab-separated fields:
        probeset, call, accession. A call of '1' marks the accession as a
        match for the probeset, anything else as a non-match.
    species, array_type -- used to build the export path under 'AltDatabase/'.
    pairwise_probeset_combinations -- iterable of (probeset1, probeset2) pairs.

    For a pair where both probesets have matches, the pair is exported (keyed
    'probeset1|probeset2') when the sorted, '|'-joined accession lists differ.
    Otherwise the first probeset of the pair that has both matches and
    non-matches is exported on its own with its matches and its filtered
    non-matches (see filterNullMatch). Summary counts are printed and the
    result is written with IdentifyAltIsoforms.exportSimple(). Returns None.

    Raises ValueError if a line does not split into exactly three fields.
    """
    matching = {}
    not_matching = {}
    for filename in align_files:
        if array_type not in filename:
            continue
        print('Reading %s' % (filename,))
        fn = filepath(filename)
        ### 'rU' keeps the universal-newline reading of the original; the with
        ### block guarantees the handle is closed, which the original did not do
        with open(fn, 'rU') as align_handle:
            for line in align_handle:
                values = line.replace('\n', '')
                probeset, call, accession = values.split('\t')
                if call == '1':
                    bucket = matching
                else:
                    bucket = not_matching
                bucket.setdefault(probeset, []).append(accession)

    probeset_matching_pairs = {}

    def recordMatchAndNull(probeset):
        ### Export a single probeset with its matches and its filtered non-matches
        match = '|'.join(matching[probeset])
        null_match = '|'.join(filterNullMatch(not_matching[probeset], matching[probeset]))
        probeset_matching_pairs[probeset] = [match, null_match]

    matching_in_both = 0
    match_and_null = 0
    no_matches = 0
    no_nulls = 0
    for (probeset1, probeset2) in pairwise_probeset_combinations:
        if probeset1 in matching and probeset2 in matching:
            ### Sorted in place (as before) so equal accession sets compare equal
            matching[probeset1].sort()
            matching[probeset2].sort()
            match1 = '|'.join(matching[probeset1])
            match2 = '|'.join(matching[probeset2])
            if match1 != match2:
                probeset_matching_pairs[probeset1 + '|' + probeset2] = [match1, match2]
            matching_in_both += 1
        elif probeset1 in matching and probeset1 in not_matching:
            recordMatchAndNull(probeset1)
            match_and_null += 1
        elif probeset2 in matching and probeset2 in not_matching:
            recordMatchAndNull(probeset2)
            match_and_null += 1
        elif probeset1 in matching or probeset2 in matching:
            no_nulls += 1
        else:
            no_matches += 1

    print('%s probeset pairs with matching isoforms for both recipricol probesets.' % (matching_in_both,))
    print('%s probeset pairs with a match for one and null for that one.' % (match_and_null,))
    print('%s probeset pairs with only one match.' % (no_nulls,))
    print('%s probeset pairs with no matches.' % (no_matches,))

    from build_scripts import IdentifyAltIsoforms
    export_dir = 'AltDatabase/' + species + '/' + array_type + '/'
    ### NOTE(review): analysis_type is neither a parameter nor assigned in this
    ### function; it is presumably a module-level global set elsewhere - confirm.
    if analysis_type == 'single':
        export_dir = export_dir + 'junction/'
    export_file = export_dir + species + '_all-transcript-matches.txt'
    IdentifyAltIsoforms.exportSimple(probeset_matching_pairs, export_file, '')