def exportMetaProbesets(array_type,species): import AltAnalyze; reload(AltAnalyze) import export probeset_types = ['core','extended','full'] if array_type == 'junction': probeset_types = ['all'] for probeset_type in probeset_types: exon_db,null = AltAnalyze.importSplicingAnnotations(array_type,species,probeset_type,'yes','') gene_db={}; null=[] for probeset in exon_db: ### At this point, exon_db is filtered by the probeset_type (e.g., core) ensembl_gene_id = exon_db[probeset].GeneID() try: gene_db[ensembl_gene_id].append(probeset) except Exception: gene_db[ensembl_gene_id] = [probeset] exon_db=[]; uid=0 output_dir = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_'+array_type+'_'+probeset_type+'.mps' #output_cv_dir = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Conversion_'+array_type+'_'+probeset_type+'.txt' #data_conversion = export.ExportFile(output_cv_dir) data = export.ExportFile(output_dir) data.write('probeset_id\ttranscript_cluster_id\tprobeset_list\tprobe_count\n') print "Exporting",len(gene_db),"to",output_dir for ensembl_gene_id in gene_db: probeset_strlist = string.join(gene_db[ensembl_gene_id],' '); uid+=1 line = string.join([str(uid),str(uid),probeset_strlist,str(len(gene_db[ensembl_gene_id])*4)],'\t')+'\n' data.write(line) #conversion_line = string.join([str(uid),ensembl_gene_id],'\t')+'\n'; data_conversion.write(conversion_line) data.close(); #data_conversion.close()
def findSpeciesInUniProtFiles(force):
    """Download all UniProt annotation files and grab all species names, TaxIDs and
    corresponding URLs, exporting the result to Config/uniprot-species-file.txt.

    force -- 'yes' re-downloads the UniProt taxonomic-division flat files first;
             otherwise only files already present in uniprot_temp/ are parsed.
    Side effect only (writes the Config file); returns None.
    """
    import AltAnalyze
    ###Get species annotations from the GO-Elite config (full name -> TaxID)
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo(); tax_db={}
    for species_full in species_annot_db:
        taxid=species_annot_db[species_full].TaxID()
        tax_db[taxid]=species_full  # inverted index: TaxID -> full species name
    if force == 'yes':
        ### Should only need to be run if UniProt changes its species-to-file associations or new species supported by Ensembl
        import export; import update
        filesearch = '_sprot_'
        all_swissprot = update.getFTPData('ftp.expasy.org','/databases/uniprot/current_release/knowledgebase/taxonomic_divisions',filesearch)
        for file in all_swissprot:
            gz_filepath, status = update.download(file,'uniprot_temp/','')
            if status == 'not-removed':
                try: os.remove(gz_filepath) ### Not sure why this works now and not before
                except OSError: status = status  # deliberately ignore removal failure
    species_uniprot_db={}; altanalyze_species_uniprot_db={}
    dir=read_directory('/uniprot_temp')
    for filename in dir:
        fn=filepath('uniprot_temp/'+filename)
        for line in open(fn,'r').xreadlines():
            data = cleanUpLine(line)
            if data[0:2] == 'OX':
                ### Taxonomy cross-reference line (e.g. "OX   NCBI_TaxID=9606;")
                taxid = string.split(data,'=')[1][:-1]  # strip the trailing ';'
                if taxid in tax_db:
                    species_full = tax_db[taxid]
            elif data[0:2] == 'OS':
                ### Organism species line: keep only the first two words (genus + species)
                species = data[5:]
                species = string.split(species,' ')[:2]
                species_full = string.join(species,' ')
            elif data[0] == '/':
                ### '//' record terminator - flush the accumulated species entry
                url = 'ftp.expasy.org/databases/uniprot/current_release/knowledgebase/taxonomic_divisions/'+filename
                ss = string.split(species_full,' ')
                if len(ss)==2: ### Species name is in the format "Homo sapiens" - and '(' not in species_full and ')' not in species_full and '/' not in species_full
                    try: species_uniprot_db[species_full].append((taxid,'ftp://'+url+'.gz'))
                    except KeyError: species_uniprot_db[species_full] = [(taxid,'ftp://'+url+'.gz')]
                taxid = ''; species_full = ''
    import EnsemblImport
    species_uniprot_db = EnsemblImport.eliminate_redundant_dict_values(species_uniprot_db)
    ### Export all species to UniProt file relationships so this function needs to only be run once
    import export
    up = export.ExportFile('Config/uniprot-species-file.txt')
    for species_full in species_uniprot_db:
        values = species_uniprot_db[species_full]
        if len(values)>1:
            ### Multiple (taxid,url) pairs: prefer the one whose TaxID maps back to this name
            found = 'no'
            for (taxid,url) in values:
                if taxid in tax_db:
                    if species_full == tax_db[taxid]: found='yes'; print 'ambiguity resolved:',species_full; break
            if found == 'yes': break  # NOTE(review): this exits the OUTER export loop, truncating the file - looks unintended; confirm
        else: (taxid,url) = values[0]
        up.write(string.join([species_full,taxid,url],'\t')+'\n')
    up.close()
def buildAltMouseExonAnnotations(species,array_type,force,genomic_build):
    """Code required to:
    1) Extract out Affymetrix provided exon sequence (probeset sequence extracted from
       "probeset_sequence_reversed.txt", derived directly from the Affymetrix AltMouse probe
       annotation file), from the "SEQUENCE-transcript-dbase.txt" (built using dump-chip1 .gff
       sequence and AltMerge-Peptide Informatics script "sequence_analysis_AltMouse_refseq.py").
    2) Once exported, grab full length exon sequences using exon/intron coordinates matches
       to full-length gene sequences with 2kb flanking sequence to efficiently predict
       microRNA binding site exclusion (reAnnotateCriticalExonSequences) and later for
       coordinate mapping to get exons aligning with UCSC annotated splicing annotations and
       exons. This sequence data replaced the previous file (don't need to re-run this - see
       rederive_exonseq == 'yes' below for reference).
    3) Match the updated exon sequences to the most recent genomic coordinates and build the
       exact equivalent of the exon array Mm_Ensembl_probeset.txt database (same structure
       and ExonArrayEnsemblRules.py code). This involves running EnsemblImport. This code
       should be run before the exon array location build code since the
       "Mm_Ensembl_probeset.txt" is created and then re-written as
       "Mm_AltMouse_Ensembl_probeset.txt".
    """
    from build_scripts import JunctionArray
    from build_scripts import JunctionArrayEnsemblRules
    rederive_exonseq = 'no'
    ### Only needs to be run once, to export exon sequence for the AltMouse array (steps 1 and 2 above)
    if rederive_exonseq == 'yes':
        import AltAnalyze
        from import_scripts import ExonAnnotate_module
        from build_scripts import ExonAnalyze_module
        agglomerate_inclusion_probesets = 'no'; onlyAnalyzeJunctions='no'
        probeset_annotations_file = "AltDatabase/"+species+"/"+array_type+"/"+"MASTER-probeset-transcript.txt"
        verifyFile(probeset_annotations_file,array_type) ### Will force download if missing
        exon_db={}; filtered_arrayids={};filter_status='no'
        constituitive_probeset_db,exon_db,genes_being_analyzed = AltAnalyze.importSplicingAnnotationDatabase(probeset_annotations_file,array_type,filtered_arrayids,filter_status)
        alt_junction_db,critical_exon_db,exon_dbase,exon_inclusion_db,exon_db = ExonAnnotate_module.identifyPutativeSpliceEvents(exon_db,constituitive_probeset_db,{},agglomerate_inclusion_probesets,onlyAnalyzeJunctions)
        ExonAnnotate_module.exportJunctionComparisons(alt_junction_db,critical_exon_db,exon_dbase)
        print "Finished exporting junctions used in AltMouse array comparisons."
        ### These two calls depend on the modules imported just above, so they only run when rederiving
        ExonAnalyze_module.exportAltMouseExonSequence()
        JunctionArray.reAnnotateCriticalExonSequences(species,array_type)
    ### Get UCSC associations (download databases if necessary)
    mRNA_Type = 'mrna'; run_from_scratch = 'yes'
    export_all_associations = 'no' ### YES only for protein prediction analysis
    buildUCSCAnnoationFiles(species,mRNA_Type,export_all_associations,run_from_scratch,force)
    reannotate_exon_seq = 'yes'
    print 'genomic_build', genomic_build
    if genomic_build == 'new':
        ### Need to run with every new genomic build (match up new coordinates)
        print "Begining to derive exon sequence from new genomic build"
        JunctionArray.identifyCriticalExonLocations(species,array_type)
        reannotate_exon_seq = 'yes'
    JunctionArrayEnsemblRules.getAnnotations(species,array_type,reannotate_exon_seq,force)
    ### Download files required during AltAnalyze analysis but not during the database build process
    filename = "AltDatabase/"+species+"/"+array_type+"/"+"MASTER-probeset-transcript.txt"
    verifyFile(filename,array_type) ### Will force download if missing
    filename = "AltDatabase/"+species+'/'+ array_type+'/'+array_type+"_annotations.txt"
    verifyFile(filename,array_type) ### Will force download if missing
def buildAltMouseExonAnnotations(species,array_type,force,genomic_build):
    """Code required to:
    1) Extract out Affymetrix provided exon sequence (probeset sequence extracted from
       "probeset_sequence_reversed.txt", derived directly from the Affymetrix AltMouse probe
       annotation file), from the "SEQUENCE-transcript-dbase.txt" (built using dump-chip1 .gff
       sequence and AltMerge-Peptide Informatics script "sequence_analysis_AltMouse_refseq.py").
    2) Once exported, grab full length exon sequences using exon/intron coordinates matches
       to full-length gene sequences with 2kb flanking sequence to efficiently predict
       microRNA binding site exclusion (reAnnotateCriticalExonSequences) and later for
       coordinate mapping to get exons aligning with UCSC annotated splicing annotations and
       exons. This sequence data replaced the previous file (don't need to re-run this - see
       rederive_exonseq == 'yes' below for reference).
    3) Match the updated exon sequences to the most recent genomic coordinates and build the
       exact equivalent of the exon array Mm_Ensembl_probeset.txt database (same structure
       and ExonArrayEnsemblRules.py code). This involves running EnsemblImport. This code
       should be run before the exon array location build code since the
       "Mm_Ensembl_probeset.txt" is created and then re-written as
       "Mm_AltMouse_Ensembl_probeset.txt".

    NOTE(review): this file contains a second, earlier definition of this same function
    using package-qualified imports; this later definition is the one in effect at runtime.
    """
    import JunctionArray
    import JunctionArrayEnsemblRules
    rederive_exonseq = 'no'
    ### Only needs to be run once, to export exon sequence for the AltMouse array (steps 1 and 2 above)
    if rederive_exonseq == 'yes':
        import AltAnalyze
        import ExonAnnotate_module
        import ExonAnalyze_module
        agglomerate_inclusion_probesets = 'no'; onlyAnalyzeJunctions='no'
        probeset_annotations_file = "AltDatabase/"+species+"/"+array_type+"/"+"MASTER-probeset-transcript.txt"
        verifyFile(probeset_annotations_file,array_type) ### Will force download if missing
        exon_db={}; filtered_arrayids={};filter_status='no'
        constituitive_probeset_db,exon_db,genes_being_analyzed = AltAnalyze.importSplicingAnnotationDatabase(probeset_annotations_file,array_type,filtered_arrayids,filter_status)
        alt_junction_db,critical_exon_db,exon_dbase,exon_inclusion_db,exon_db = ExonAnnotate_module.identifyPutativeSpliceEvents(exon_db,constituitive_probeset_db,{},agglomerate_inclusion_probesets,onlyAnalyzeJunctions)
        ExonAnnotate_module.exportJunctionComparisons(alt_junction_db,critical_exon_db,exon_dbase)
        print "Finished exporting junctions used in AltMouse array comparisons."
        ### These two calls depend on the modules imported just above, so they only run when rederiving
        ExonAnalyze_module.exportAltMouseExonSequence()
        JunctionArray.reAnnotateCriticalExonSequences(species,array_type)
    ### Get UCSC associations (download databases if necessary)
    mRNA_Type = 'mrna'; run_from_scratch = 'yes'
    export_all_associations = 'no' ### YES only for protein prediction analysis
    buildUCSCAnnoationFiles(species,mRNA_Type,export_all_associations,run_from_scratch,force)
    reannotate_exon_seq = 'yes'
    print 'genomic_build', genomic_build
    if genomic_build == 'new':
        ### Need to run with every new genomic build (match up new coordinates)
        print "Begining to derive exon sequence from new genomic build"
        JunctionArray.identifyCriticalExonLocations(species,array_type)
        reannotate_exon_seq = 'yes'
    JunctionArrayEnsemblRules.getAnnotations(species,array_type,reannotate_exon_seq,force)
    ### Download files required during AltAnalyze analysis but not during the database build process
    filename = "AltDatabase/"+species+"/"+array_type+"/"+"MASTER-probeset-transcript.txt"
    verifyFile(filename,array_type) ### Will force download if missing
    filename = "AltDatabase/"+species+'/'+ array_type+'/'+array_type+"_annotations.txt"
    verifyFile(filename,array_type) ### Will force download if missing
def getUniProtURLsForAllSupportedSpecies(): ### Import all UniProt supproted species and URLs species_uniprot_db = {} fn = filepath('Config/uniprot-species-file.txt') for line in open(fn, 'r').xreadlines(): data = cleanUpLine(line) species_full, taxid, url = string.split(data, '\t') if 'H**o sapiens' not in species_full: ### There's a separate file for us humans (so egotistical!!!) species_uniprot_db[species_full] = taxid, url import AltAnalyze ###Get species annotations from the GO-Elite config species_annot_db = AltAnalyze.importGOEliteSpeciesInfo() ### Export all urls for currently supported species import UI file_location_defaults = UI.importDefaultFileLocations() location_db = {} species_added = [] for species_full in species_annot_db: if species_full in species_uniprot_db: taxid, url = species_uniprot_db[species_full] species_code = species_annot_db[species_full].SpeciesCode() try: location_db[url].append(species_code) except Exception: location_db[url] = [species_code] species_added.append(species_full) for species_full in species_annot_db: taxid = species_annot_db[species_full].TaxID() species_code = species_annot_db[species_full].SpeciesCode() if species_full not in species_added: for species_name in species_uniprot_db: tax, url = species_uniprot_db[species_name] if tax == taxid: location_db[url].append(species_code) print species_code for url in location_db: species = string.join(location_db[url], '|') fl = UI.FileLocationData('ftp', url, species) try: file_location_defaults['UniProt'].append(fl) except KeyError: file_location_defaults['UniProt'] = [fl] UI.exportDefaultFileLocations(file_location_defaults)
def getUniProtURLsForAllSupportedSpecies(): ### Import all UniProt supproted species and URLs species_uniprot_db={} fn=filepath('Config/uniprot-species-file.txt') for line in open(fn,'r').xreadlines(): data = cleanUpLine(line) species_full,taxid,url = string.split(data,'\t') if 'H**o sapiens' not in species_full: ### There's a separate file for us humans (so egotistical!!!) species_uniprot_db[species_full] = taxid,url import AltAnalyze ###Get species annotations from the GO-Elite config species_annot_db=AltAnalyze.importGOEliteSpeciesInfo() ### Export all urls for currently supported species import UI file_location_defaults = UI.importDefaultFileLocations() location_db={}; species_added=[] for species_full in species_annot_db: if species_full in species_uniprot_db: taxid,url = species_uniprot_db[species_full] species_code = species_annot_db[species_full].SpeciesCode() try: location_db[url].append(species_code) except Exception: location_db[url] = [species_code] species_added.append(species_full) for species_full in species_annot_db: taxid = species_annot_db[species_full].TaxID() species_code = species_annot_db[species_full].SpeciesCode() if species_full not in species_added: for species_name in species_uniprot_db: tax,url = species_uniprot_db[species_name] if tax == taxid: location_db[url].append(species_code) print species_code for url in location_db: species = string.join(location_db[url],'|') fl = UI.FileLocationData('ftp', url, species) try: file_location_defaults['UniProt'].append(fl) except KeyError: file_location_defaults['UniProt'] = [fl] UI.exportDefaultFileLocations(file_location_defaults)
def importSplicingAnnotations(species,array_type,avg_all_for_ss):
    """Load splicing annotations for the platform, choosing the probeset class.

    Exon and gene arrays use the 'full' probeset class; every other platform
    uses 'all'. Delegates to AltAnalyze.importSplicingAnnotations with the
    module-level root_dir and returns (exon_db, constitutive_probeset_db).
    """
    if array_type in ('exon','gene'):
        probeset_type = 'full'
    else:
        probeset_type = 'all'
    return AltAnalyze.importSplicingAnnotations(array_type,species,probeset_type,avg_all_for_ss,root_dir)
def remoteRun(fl,Species,Array_type,expression_threshold,filter_method_type,p_val,express_data_format,altanalyze_file_list,avg_all_for_ss):
    """Filter pre-computed expression/DABG files for downstream alternative-exon analysis.

    fl -- file-location object queried (defensively, accessor by accessor) for
          normalization method and per-feature expression thresholds.
    The remaining arguments seed module-level globals consumed by expr_analysis()
    and other helpers. Returns the list of filtered output file paths.
    """
    start_time = time.time()
    ### Publish the analysis parameters as module-level globals (read by helper functions)
    global p; global filter_method; global exp_data_format; global array_type; global species; global root_dir; global original_exp_threshold
    global normalization_method; global exon_exp_threshold; global gene_rpkm_threshold; global junction_exp_threshold
    global exon_rpkm_threshold; global gene_exp_threshold
    original_exp_threshold = expression_threshold
    aspire_output_list=[]; aspire_output_gene_list=[]
    filter_method = filter_method_type
    altanalyze_files = altanalyze_file_list
    p = p_val; species = Species; array_type = Array_type
    exp_data_format = express_data_format
    ### Define global variables from the object fl (each falls back when the accessor is absent/raises)
    try: normalization_method = fl.FeatureNormalization()
    except Exception: normalization_method = 'NA'
    try: exon_exp_threshold = fl.ExonExpThreshold()
    except Exception: exon_exp_threshold = 0
    try: gene_rpkm_threshold = fl.RPKMThreshold()
    except Exception: gene_rpkm_threshold = 0
    root_dir = fl.RootDir()
    try: junction_exp_threshold = fl.JunctionExpThreshold()
    except Exception: junction_exp_threshold = 0
    try: exon_rpkm_threshold = fl.ExonRPKMThreshold()
    except Exception: exon_rpkm_threshold = 0
    try: gene_exp_threshold = fl.GeneExpThreshold()
    except Exception: gene_exp_threshold = 0
    if 'exon' in array_type: array_type = 'exon' ###In AnalayzeExpressionDataset module, this is named 'exon-array'
    global log_expression_threshold; global nonlog_exp_threshold; nonlog_exp_threshold = expression_threshold
    try: log_expression_threshold = math.log(expression_threshold,2)
    except Exception: log_expression_threshold = 0 ###Occurs if expression_threshold == 0
    import_dir = root_dir+'AltExpression/pre-filtered/expression/'; import_dir_dabg = root_dir+'AltExpression/pre-filtered/dabg/'
    try: dir_list = read_directory(import_dir) #send a sub_directory to a function to identify all files in a directory
    except Exception: dir_list=[]
    try: dir_list2 = read_directory(import_dir_dabg)
    except Exception: dir_list2=[]
    if len(altanalyze_files) == 0: altanalyze_files = dir_list ###if no filenames input
    ### Choose the junction/probeset annotation database for this platform
    if array_type == 'RNASeq':
        altmerge_db = root_dir+'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_junctions.txt'
    elif array_type != 'AltMouse':
        altmerge_db = 'AltDatabase/'+species+'/'+array_type+'/'+species+'_Ensembl_probesets.txt'
    else:
        altmerge_db = "AltDatabase/"+species+"/"+array_type+"/MASTER-probeset-transcript.txt"
    ###Import probe-level associations
    if array_type != 'AltMouse':
        exon_db,altmerge_constituitive = importSplicingAnnotations(species,array_type,avg_all_for_ss)
    else:
        exon_db,altmerge_constituitive = import_altmerge(altmerge_db,array_type) ### Prior to version 2.0, this function was distinct from that in AltAnalyze(), so replaced it for consistency
    global altanalzye_input; altanalyze_output=[]
    if len(dir_list)>0:
        for altanalzye_input in dir_list: #loop through each file in the directory to output results
            if altanalzye_input in altanalyze_files:
                ### DABG p-values are only analyzed when a matching dabg file exists
                if altanalzye_input in dir_list2: analyze_dabg = 'yes'
                else: analyze_dabg = 'no'
                ind_start_time = time.time()
                array_db = import_dir + "/"+ altanalzye_input
                dabg_db = import_dir_dabg + "/"+ altanalzye_input
                dataset_name = altanalzye_input[0:-4] + '-'  # strip the .txt extension, append '-'
                print "Begining to filter",dataset_name[0:-1]
                ###Import expression data and stats
                try:
                    output_file = expr_analysis(array_db,dabg_db,altmerge_constituitive,exon_db,analyze_dabg) #filter the expression data based on fold and p-value OR expression threshold
                    altanalyze_output.append(output_file)
                except KeyError: print "Impropper array type (",dataset_name[0:-1],") for",array_type,species,'. Skipping array.'
                ind_end_time = time.time(); time_diff = int(ind_end_time-ind_start_time)
        end_time = time.time(); time_diff = int(end_time-start_time)
        AltAnalyze.clearObjectsFromMemory(exon_db)
        exon_db={}; altmerge_constituitive={}; constitutive_probeset_db={}  # release large annotation structures
    else: print "No expression files to filter found..."
    return altanalyze_output
def importSplicingAnnotations(species, array_type, avg_all_for_ss):
    """Load splicing annotations for the platform, choosing the probeset class.

    Exon and gene arrays use the 'full' probeset class; every other platform
    uses 'all'. Delegates to AltAnalyze.importSplicingAnnotations with the
    module-level root_dir and returns (exon_db, constitutive_probeset_db).
    """
    if array_type in ('exon', 'gene'):
        probeset_type = 'full'
    else:
        probeset_type = 'all'
    return AltAnalyze.importSplicingAnnotations(
        array_type, species, probeset_type, avg_all_for_ss, root_dir)
def remoteRun(fl, Species, Array_type, expression_threshold,
              filter_method_type, p_val, express_data_format,
              altanalyze_file_list, avg_all_for_ss):
    """Filter pre-computed expression/DABG files for downstream alternative-exon analysis.

    fl -- file-location object queried (defensively, accessor by accessor) for
          normalization method and per-feature expression thresholds.
    The remaining arguments seed module-level globals consumed by expr_analysis()
    and other helpers. Returns the list of filtered output file paths.
    """
    start_time = time.time()
    ### Publish the analysis parameters as module-level globals (read by helper functions)
    global p
    global filter_method
    global exp_data_format
    global array_type
    global species
    global root_dir
    global original_exp_threshold
    global normalization_method
    global exon_exp_threshold
    global gene_rpkm_threshold
    global junction_exp_threshold
    global exon_rpkm_threshold
    global gene_exp_threshold
    original_exp_threshold = expression_threshold
    aspire_output_list = []
    aspire_output_gene_list = []
    filter_method = filter_method_type
    altanalyze_files = altanalyze_file_list
    p = p_val
    species = Species
    array_type = Array_type
    exp_data_format = express_data_format
    ### Define global variables from the object fl (each falls back when the accessor is absent/raises)
    try:
        normalization_method = fl.FeatureNormalization()
    except Exception:
        normalization_method = 'NA'
    try:
        exon_exp_threshold = fl.ExonExpThreshold()
    except Exception:
        exon_exp_threshold = 0
    try:
        gene_rpkm_threshold = fl.RPKMThreshold()
    except Exception:
        gene_rpkm_threshold = 0
    root_dir = fl.RootDir()
    try:
        junction_exp_threshold = fl.JunctionExpThreshold()
    except Exception:
        junction_exp_threshold = 0
    try:
        exon_rpkm_threshold = fl.ExonRPKMThreshold()
    except Exception:
        exon_rpkm_threshold = 0
    try:
        gene_exp_threshold = fl.GeneExpThreshold()
    except Exception:
        gene_exp_threshold = 0
    if 'exon' in array_type:
        array_type = 'exon'  ###In AnalayzeExpressionDataset module, this is named 'exon-array'
    global log_expression_threshold
    global nonlog_exp_threshold
    nonlog_exp_threshold = expression_threshold
    try:
        log_expression_threshold = math.log(expression_threshold, 2)
    except Exception:
        log_expression_threshold = 0  ###Occurs if expression_threshold == 0
    import_dir = root_dir + 'AltExpression/pre-filtered/expression/'
    import_dir_dabg = root_dir + 'AltExpression/pre-filtered/dabg/'
    try:
        dir_list = read_directory(
            import_dir
        )  #send a sub_directory to a function to identify all files in a directory
    except Exception:
        dir_list = []
    try:
        dir_list2 = read_directory(import_dir_dabg)
    except Exception:
        dir_list2 = []
    if len(altanalyze_files) == 0:
        altanalyze_files = dir_list  ###if no filenames input
    ### Choose the junction/probeset annotation database for this platform
    if array_type == 'RNASeq':
        altmerge_db = root_dir + 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_junctions.txt'
    elif array_type != 'AltMouse':
        altmerge_db = 'AltDatabase/' + species + '/' + array_type + '/' + species + '_Ensembl_probesets.txt'
    else:
        altmerge_db = "AltDatabase/" + species + "/" + array_type + "/MASTER-probeset-transcript.txt"
    ###Import probe-level associations
    if array_type != 'AltMouse':
        exon_db, altmerge_constituitive = importSplicingAnnotations(
            species, array_type, avg_all_for_ss)
    else:
        exon_db, altmerge_constituitive = import_altmerge(
            altmerge_db, array_type
        )  ### Prior to version 2.0, this function was distinct from that in AltAnalyze(), so replaced it for consistency
    global altanalzye_input
    altanalyze_output = []
    if len(dir_list) > 0:
        for altanalzye_input in dir_list:  #loop through each file in the directory to output results
            if altanalzye_input in altanalyze_files:
                ### DABG p-values are only analyzed when a matching dabg file exists
                if altanalzye_input in dir_list2:
                    analyze_dabg = 'yes'
                else:
                    analyze_dabg = 'no'
                ind_start_time = time.time()
                array_db = import_dir + "/" + altanalzye_input
                dabg_db = import_dir_dabg + "/" + altanalzye_input
                dataset_name = altanalzye_input[0:-4] + '-'  # strip the .txt extension, append '-'
                print "Begining to filter", dataset_name[0:-1]
                ###Import expression data and stats
                try:
                    output_file = expr_analysis(
                        array_db, dabg_db, altmerge_constituitive, exon_db,
                        analyze_dabg
                    )  #filter the expression data based on fold and p-value OR expression threshold
                    altanalyze_output.append(output_file)
                except KeyError:
                    print "Impropper array type (", dataset_name[
                        0:
                        -1], ") for", array_type, species, '. Skipping array.'
                ind_end_time = time.time()
                time_diff = int(ind_end_time - ind_start_time)
        end_time = time.time()
        time_diff = int(end_time - start_time)
        AltAnalyze.clearObjectsFromMemory(exon_db)
        ### Release large annotation structures
        exon_db = {}
        altmerge_constituitive = {}
        constitutive_probeset_db = {}
    else:
        print "No expression files to filter found..."
    return altanalyze_output
def TargetScanImport(parse_sequences,force):
    """The TargetScan data is currently extracted from a cross-species conserved family file.
    This file only contains gene symbol, microRNA name and 3'UTR seed locations.

    parse_sequences -- 'yes' appends binding-site sequences into the module-level
    combined_results dict; otherwise MicroRNATargetData objects are accumulated
    into the module-level microRNA_target_db. Reads the module-level globals
    species and symbol_ensembl_current.
    """
    ### Hard-coded fallbacks; tax is refined from the GO-Elite config below
    if species == 'Mm': tax = '10090'; prefix = 'mmu-'
    elif species == 'Hs': tax = '9606'; prefix = 'hsa-'
    elif species == 'Rn': tax = '10116'; prefix = 'rno-'
    else: prefix = 'hsa-'
    import AltAnalyze
    ###Get taxid annotations from the GO-Elite config
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo(); tax_db={}
    for species_full in species_annot_db:
        if species==species_annot_db[species_full].SpeciesCode():
            tax = species_annot_db[species_full].TaxID()
    global l  # l collects gene symbols with no Ensembl mapping (reported at the end)
    ### See if the files are already there
    verifyTSG, target_scan_target_file = verifyExternalDownload('TargetScanGenes')
    verifyTSS, target_scan_sequence_file = verifyExternalDownload('TargetScanSequences')
    if verifyTSG == 'no' or verifyTSS == 'no': ### used to be - if force == 'yes'
        if parse_sequences == 'no':
            ### Then download the latest annotations and sequences
            target_scan_target_file = downloadFile('TargetScanGenes')
            target_scan_sequence_file = downloadFile('TargetScanSequences')
    ### Cross-species TargetScan file with UTR seqeunces for all genes with reported targets in the conserved family file
    ### Although this file includes valid sequence data that appears to match up to the target file, the target file
    ### appears to only list the seed seqeunce location (UTR start and stop) and not the full binding sequence and thus
    ### is not ammenable to probe set alignment.
    print 'parsing', target_scan_sequence_file
    fn=filepath(target_scan_sequence_file); x=0; target_scan_gene_utr_seq={}
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if x==0: x=1  # skip the header row
        else:
            symbol = string.upper(t[2]); tax_id = t[3]; utr_seq = t[4]
            if tax_id == tax:
                ### Remove alignment gaps and convert RNA (U) to DNA (T) bases
                utr_seq_no_gaps = string.replace(utr_seq,'-','')
                utr_seq_no_gaps = string.replace(utr_seq_no_gaps,'U','T')
                if symbol in symbol_ensembl_current and len(utr_seq_no_gaps)>0:
                    target_scan_gene_utr_seq[symbol] = utr_seq_no_gaps
    print 'UTR sequence for',len(target_scan_gene_utr_seq),'TargetScan genes stored in memory.'
    mir_sequences = []; count=0
    print 'parsing', target_scan_target_file
    #verifyFile(target_scan_target_file,species) ### Makes sure file is local and if not downloads.
    fn=filepath(target_scan_target_file); x=0; k=[]; l=[]
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if x==0:
            x=1
            ### Locate each needed column from the header row by keyword
            ### NOTE(review): if a keyword is missing from the header the index
            ### (m/g/s/txi/us/ue) stays unbound and the data rows raise NameError;
            ### the 'transcript' index r is captured but never used - confirm intent
            data = string.lower(data)
            t = string.split(data,'\t')
            i=0
            for value in t:
                if 'mir' in value: m = i
                elif 'gene id' in value: g = i
                elif 'gene symbol' in value: s = i
                elif 'transcript' in value: r = i
                elif 'species id' in value: txi = i
                elif 'utr start' in value: us = i
                elif 'utr end' in value: ue = i
                i+=1
        else:
            mir = t[m]; geneid = t[g]; gene_symbol = string.upper(t[s]); taxid = t[txi]; utr_start = int(t[us]); utr_end = int(t[ue])
            ### Old format
            #mir = t[0]; gene_symbol = string.upper(t[1]); taxid = t[2]; utr_start = t[3]; utr_end = t[4]
            if '/' in mir:
                ### Expand compound names like "miR-1/206": trailing members get the 'miR-' prefix restored
                mir_list=[]
                mirs = string.split(mir,'/')
                for mirid in mirs[1:]:
                    mirid = 'miR-'+mirid
                    mir_list.append(mirid)
                mir_list.append(mirs[0])
            else: mir_list = [mir]
            if taxid == tax: ###human
                #target_scan_gene_utr_seq[symbol] = utr_seq_no_gaps
                if gene_symbol in symbol_ensembl_current: ensembl_geneids = symbol_ensembl_current[gene_symbol]; proceed = 'yes'; k.append(gene_symbol)
                else: proceed = 'no'; l.append(gene_symbol)
                if gene_symbol in target_scan_gene_utr_seq:
                    ### TargetScan provides the core, while processed miRs are typically 22nt - seems to approximate other databases better
                    adj_start = utr_start-15
                    if adj_start < 0: adj_start=0
                    mir_sequences = target_scan_gene_utr_seq[gene_symbol][adj_start:utr_end+1]
                    #if string.lower(gene_symbol) == 'tns3' and mir == 'miR-182': print mir,gene_symbol,taxid,utr_start,utr_end,mir_sequences
                else: mir_sequences=[]
                ###Already multiple geneids associated with each symbol so don't need to worry about redundancy
                if proceed == 'yes':
                    for ensembl_geneid in ensembl_geneids:
                        for mir in mir_list:
                            if parse_sequences == 'yes':
                                if (prefix+mir,ensembl_geneid) in combined_results:
                                    combined_results[(prefix+mir,ensembl_geneid)].append(mir_sequences); count+=1
                            else:
                                #if ensembl_geneid == 'ENSMUSG00000029467': print mir
                                y = MicroRNATargetData(ensembl_geneid,gene_symbol,mir_sequences,prefix+mir,'TargetScan')
                                count+=1
                                try: microRNA_target_db[prefix+mir].append(y)
                                except KeyError: microRNA_target_db[prefix+mir] = [y]
    k = unique.unique(k); l = unique.unique(l)
    print 'ensembls-found:',len(k),', not found:',len(l)
    print l[:10]
    print count, 'miRNA-target relationships added for TargetScan'
def pictarImport(parse_sequences,type,added):
    """Annotations originally from the file: ng1536-S3.xls, posted as supplementary data at:
    http://www.nature.com/ng/journal/v37/n5/suppinfo/ng1536_S1.html. The file being parsed here
    has been pre-matched to Ensembl IDs using the ExonModule of LinkEST, for human.

    parse_sequences -- 'yes' appends binding-site sequences into combined_results;
                       otherwise MicroRNATargetData objects go into microRNA_target_db.
    type  -- 'pre-computed' selects the human 2005 conserved-targets file.
    added -- dict of (ensembl_geneid,mir) pairs already recorded; updated and returned
             so callers can de-duplicate across sources.
    """
    mir_sequences=[]
    ### Hard-coded tax fallbacks; overwritten from the GO-Elite config below
    if species == 'Mm': filename = 'AltDatabase/miRBS/'+species+'/'+'pictar-target-annotated.txt'; tax = '10090'
    else: filename = 'AltDatabase/miRBS/'+'Mm'+'/'+'pictar-target-annotated.txt'; tax = '10116'
    #if species == 'Hs': filename = 'AltDatabase/miRBS/'+species+'/'+'pictar-conserved-targets-2005.txt'; tax = '9606'
    if type == 'pre-computed':
        if species == 'Hs': filename = 'AltDatabase/miRBS/'+species+'/'+'pictar-conserved-targets-2005.txt'; tax = '9606'
    else:
        if species == 'Hs': filename = 'AltDatabase/miRBS/'+'Mm'+'/'+'pictar-target-annotated.txt'; tax = '9606'
    import AltAnalyze
    ###Get taxid annotations from the GO-Elite config
    species_annot_db=AltAnalyze.importGOEliteSpeciesInfo(); tax_db={}
    for species_full in species_annot_db:
        if species==species_annot_db[species_full].SpeciesCode():
            tax = species_annot_db[species_full].TaxID()
    print 'parsing', filename; count=0
    print 'len(symbol_ensembl)', len(symbol_ensembl)
    verifyFile(filename,species) ### Makes sure file is local and if not downloads.
    ### NOTE(review): x starts at 1, so the 'if x==0' header skip never fires and the
    ### header row is processed as data (benign: header symbols don't map to Ensembl) - confirm
    fn=filepath(filename); x=1
    for line in open(fn,'rU').xreadlines():
        data = cleanUpLine(line)
        t = string.split(data,'\t')
        if x==0: x=1
        else:
            ### Per-species column handling; non-human files carry hsa- miR names
            ### that are re-prefixed for the target species
            if species == 'Hs':
                if type == 'pre-computed':
                    ensembl_geneid, mir, mir_sequences = t; ensembl_geneids = [ensembl_geneid]
                else:
                    symbol=string.upper(t[2]);mir=t[6];mir_sequences=t[11]
                    if symbol in symbol_ensembl and len(symbol)>0: ensembl_geneids=symbol_ensembl[symbol]
                    else: ensembl_geneids=['']
            elif species == 'Mm':
                mm_symbol=string.upper(t[3]);mir=t[6];mir_sequences=t[11]; mir = string.replace(mir,'hsa','mmu')
                if mm_symbol in symbol_ensembl and len(mm_symbol)>0: ensembl_geneids=symbol_ensembl[mm_symbol]
                else: ensembl_geneids=['']
            elif species == 'Rn':
                mm_symbol=string.upper(t[3]);mir=t[6];mir_sequences=t[11]; mir = string.replace(mir,'hsa','rno')
                if mm_symbol in symbol_ensembl and len(mm_symbol)>0: ensembl_geneids=symbol_ensembl[mm_symbol]
                else: ensembl_geneids=['']
            else:
                mm_symbol=string.upper(t[3]);mir=t[6];mir_sequences=t[11]
                if mm_symbol in symbol_ensembl and len(mm_symbol)>0: ensembl_geneids=symbol_ensembl[mm_symbol]
                else: ensembl_geneids=['']
            for ensembl_geneid in ensembl_geneids:
                ### Skip unmapped symbols ('') and pairs already recorded by another source
                if len(ensembl_geneid)>1 and (ensembl_geneid,mir) not in added:
                    if parse_sequences == 'yes':
                        if (mir,ensembl_geneid) in combined_results:
                            combined_results[(mir,ensembl_geneid)].append(string.upper(mir_sequences)); count+=1
                    else:
                        #if count < 800 and '-125b' in mir: print ensembl_geneid, mir, mm_symbol; count+=1
                        #elif count>799: kill
                        y = MicroRNATargetData(ensembl_geneid,'',mir,mir_sequences,'pictar'); count+=1
                        try: microRNA_target_db[mir].append(y)
                        except KeyError: microRNA_target_db[mir] = [y]
                    added[(ensembl_geneid,mir)]=[]
    print count, 'miRNA-target relationships added for PicTar'
    return added