def get_directory(self):
    """returns the HTML page for the directory listing"""
    logging.info('RSAT - get_directory()')
    cache_file = "/".join([self.cache_dir, 'rsat_dir.html'])
    listing_url = "/".join([self.base_url, RsatDatabase.DIR_PATH])
    return util.read_url_cached(listing_url, cache_file)
def get_rsat_organism(self, kegg_organism):
    """Return the RSAT organism directory name matching the configured
    species, validated against the expected NCBI taxonomy code.

    Both self.kegg_species and kegg_organism are matched against the
    cached RSAT directory listing; whichever suggestion's NCBI code
    agrees with self.ncbi_code wins.  Falls back to the first
    suggestion (with a warning) if neither agrees.
    """
    logging.debug('RSAT - get_rsat_organism(%s)', kegg_organism)
    cache_file = "/".join([self.cache_dir, 'rsat_dir.html'])
    text = util.read_url_cached(
        "/".join([self.base_url, RsatDatabase.DIR_PATH]), cache_file)
    suggestion1 = util.best_matching_links(self.kegg_species, text)[0].rstrip('/')
    suggestion2 = util.best_matching_links(kegg_organism, text)[0].rstrip('/')
    # Prefer whichever suggestion maps to the expected NCBI taxonomy code.
    if str(self.__get_ncbi_code(suggestion1)) == str(self.ncbi_code):
        return suggestion1
    if suggestion1 != suggestion2 and \
            str(self.__get_ncbi_code(suggestion2)) == str(self.ncbi_code):
        return suggestion2
    # logging.warn() is a deprecated alias; use warning()
    logging.warning("can't find the correct RSAT mapping !")
    return suggestion1
def get_rsat_organism(self, kegg_organism):
    """Return the RSAT organism directory name matching the configured
    species, validated against the expected NCBI taxonomy code.

    Candidates are derived from both self.kegg_species and
    kegg_organism; the one whose NCBI code equals self.ncbi_code is
    returned.  If neither matches, the first candidate is returned
    after logging a warning.
    """
    logging.debug('RSAT - get_rsat_organism(%s)', kegg_organism)
    cache_file = "/".join([self.cache_dir, 'rsat_dir.html'])
    text = util.read_url_cached(
        "/".join([self.base_url, RsatDatabase.DIR_PATH]), cache_file)
    suggestion1 = util.best_matching_links(self.kegg_species, text)[0].rstrip('/')
    suggestion2 = util.best_matching_links(kegg_organism, text)[0].rstrip('/')
    # Check each distinct candidate's NCBI code against the expected one.
    if str(self.__get_ncbi_code(suggestion1)) == str(self.ncbi_code):
        return suggestion1
    if suggestion2 != suggestion1 and \
            str(self.__get_ncbi_code(suggestion2)) == str(self.ncbi_code):
        return suggestion2
    # logging.warn() is a deprecated alias; use warning()
    logging.warning("can't find the correct RSAT mapping !")
    return suggestion1
def get_rsat_organism(self, kegg_organism):
    """Return the RSAT organism directory name that best matches
    kegg_organism in the RSAT directory listing."""
    # original docstring/log message were copy-pasted from get_directory()
    logging.info('RSAT - get_rsat_organism(%s)', kegg_organism)
    cache_file = "/".join([self.cache_dir, 'rsat_dir.html'])
    text = util.read_url_cached(
        "/".join([self.base_url, RsatDatabase.DIR_PATH]), cache_file)
    return util.best_matching_links(kegg_organism, text)[0].rstrip('/')
def get_organism_names(self, organism):
    """returns the specified organism name file contents"""
    logging.info('RSAT - get_organism_names(%s)', organism)
    cache_file = "/".join([self.cache_dir, 'rsatnames_' + organism])
    names_url = "/".join([self.base_url, RsatDatabase.DIR_PATH,
                          organism, RsatDatabase.ORGANISM_NAMES_PATH])
    return util.read_url_cached(names_url, cache_file)
def get_feature_names(self, organism):
    """Return the specified organism's feature name file contents."""
    cache_name = organism + '_' + self.feature_name + '_names'
    cache_file = "/".join([self.cache_dir, cache_name])
    rsat_url = "/".join([self.base_url, RsatDatabase.DIR_PATH,
                         organism, self.feature_names_path])
    return util.read_url_cached(rsat_url, cache_file)
def get_contig_sequence(self, organism, contig):
    """Return the specified contig sequence, upper-cased."""
    cache_file = "/".join([self.cache_dir, organism + '_' + contig])
    contig_url = "/".join([self.base_url, RsatDatabase.DIR_PATH,
                           organism, 'genome', contig + '.raw'])
    return util.read_url_cached(contig_url, cache_file).upper()
def get_rsat_organism(self, kegg_organism):
    """Return the RSAT organism directory name that best matches
    kegg_organism in the RSAT directory listing."""
    # original docstring/log message were copy-pasted from get_directory()
    logging.info('RSAT - get_rsat_organism(%s)', kegg_organism)
    cache_file = "/".join([self.cache_dir, 'rsat_dir.html'])
    text = util.read_url_cached(
        "/".join([self.base_url, RsatDatabase.DIR_PATH]), cache_file)
    return util.best_matching_links(kegg_organism, text)[0].rstrip('/')
def get_features(self, organism):
    """Return the organism's feature file contents, with the columns
    rewritten into the canonical order: id, type, name, contig,
    start_pos, end_pos, strand.

    Note: the current version only tries to read from feature.tab
    while the original cMonkey will fall back to cds.tab if that fails
    """
    logging.debug('RSAT - get_features(%s)', organism)
    cache_file = "/".join([self.cache_dir,
                           organism + '_' + self.feature_name])
    raw_text = util.read_url_cached(
        "/".join([self.base_url, RsatDatabase.DIR_PATH, organism,
                  self.feature_path]),
        cache_file)

    # Later parts assume that the features file has exactly these
    # columns, in exactly this order.
    field_order = ['id', 'type', 'name', 'contig', 'start_pos',
                   'end_pos', 'strand']
    # Drop blank lines (including whitespace-only lines, which used to
    # crash the parser with an IndexError).
    lines = [line for line in raw_text.split('\n') if line.strip()]

    idxs = {}       # field name -> 1-based source column ('-- field' headers)
    targ_idxs = []  # 0-based source column for each canonical output column
    out_chunks = []
    for line in lines:
        parts = line.split()
        if parts[0] == '--':
            if parts[1] == 'field':
                # Header line of the form '-- field <idx> <name>'
                idxs[parts[3]] = parts[2]
                if parts[3] in field_order:
                    # Renumber the header to the canonical position;
                    # headers of non-canonical fields are dropped since
                    # their columns do not appear in the output.
                    new_idx = str(field_order.index(parts[3]) + 1)
                    out_chunks.append('-- field ' + new_idx + '\t' +
                                      parts[3] + '\n')
            else:
                # Pass other comment lines through unchanged.
                out_chunks.append(line + '\n')
        else:
            if not targ_idxs:
                # First data row: resolve the column permutation from
                # the header declarations seen so far.
                targ_idxs = [int(idxs[field]) - 1 for field in field_order]
            # Split on tabs so empty fields keep their positions.
            fields = line.split('\t')
            out_line = '\t'.join(fields[idx].strip() for idx in targ_idxs)
            # Some RSAT files have a contig with ':'s instead of '_'s
            out_chunks.append(out_line.replace(':', '_') + '\n')
    #To Do: Overwrite cache file & add early check to see if we need the sub
    return ''.join(out_chunks)
def get_feature_names(self, organism):
    """Return the specified organism's feature name file contents."""
    cache_file = "/".join([self.cache_dir, organism + '_feature_names'])
    names_url = "/".join([self.base_url, RsatDatabase.DIR_PATH,
                          organism, RsatDatabase.FEATURE_NAMES_PATH])
    return util.read_url_cached(names_url, cache_file)
def get_operon_predictions_for(self, organism_id):
    """Retrieve operon predictions for the specified organism"""
    logging.info("MicrobesOnline.get_operon_predictions_for(%s)",
                 organism_id)
    filename = 'gnc%s.named' % str(organism_id)
    url = '/'.join([self.base_url, 'operons', filename])
    cache_file = '/'.join([self.cache_dir, filename])
    return util.read_url_cached(url, cache_file)
def get_taxonomy_id(self, organism):
    """Return the (patched) NCBI taxonomy id for the given RSAT
    organism, read from its organism names file."""
    # original docstring/log message were copy-pasted from get_organism_names()
    logging.info('RSAT - get_taxonomy_id(%s)', organism)
    cache_file = "/".join([self.cache_dir, 'rsatnames_' + organism])
    text = util.read_url_cached(
        "/".join([self.base_url, RsatDatabase.DIR_PATH, organism,
                  RsatDatabase.ORGANISM_NAMES_PATH]), cache_file)
    organism_names_dfile = util.dfile_from_text(text, comment='--')
    # first column of the first data line holds the taxonomy id
    return patches.patch_ncbi_taxonomy(organism_names_dfile.lines[0][0])
def get_feature_names(self, organism):
    """Return the specified organism's feature name file contents."""
    cache_file = "/".join([self.cache_dir, organism + '_feature_names'])
    url = "/".join([self.base_url, RsatDatabase.DIR_PATH, organism,
                    RsatDatabase.FEATURE_NAMES_PATH])
    return util.read_url_cached(url, cache_file)
def get_feature_names(self, organism):
    """Return the specified organism's feature name file contents."""
    cache_file = "/".join([self.cache_dir,
                           organism + '_' + self.feature_name + '_names'])
    rsat_url = "/".join([self.base_url, RsatDatabase.DIR_PATH,
                         organism, self.feature_names_path])
    return util.read_url_cached(rsat_url, cache_file)
def get_genome_for(self, organism_id):
    """Returns the genome from Microbes Online, stored in FASTA format"""
    logging.info('MicrobesOnline.get_genome_for(%s)', str(organism_id))
    query = 'genomeInfo.cgi?tId=%s;export=genome' % str(organism_id)
    url = '/'.join([self.base_url, 'cgi-bin', query])
    cache_file = '/'.join([self.cache_dir,
                           'mo_genome_%s.fasta' % str(organism_id)])
    return util.read_url_cached(url, cache_file)
def get_genome_info_for(self, organism_id):
    """Returns the Genome info from Microbes Online"""
    logging.info('MicrobesOnline.get_genome_info_for(%s)',
                 str(organism_id))
    query = 'genomeInfo.cgi?tId=%s;export=tab' % str(organism_id)
    url = '/'.join([self.base_url, 'cgi-bin', query])
    cache_file = '/'.join([self.cache_dir,
                           'mo_%s.genome_info' % str(organism_id)])
    return util.read_url_cached(url, cache_file)
def get_contig_sequence(self, organism, contig):
    """Return the specified contig sequence, upper-cased and joined."""
    cache_file = "/".join([self.cache_dir, organism + '_' + contig])
    contig_url = "/".join([self.base_url, RsatDatabase.DIR_PATH,
                           organism, 'genome', contig + '.raw'])
    raw_seq = util.read_url_cached(contig_url, cache_file).upper()
    return join_contig_sequence(raw_seq)
def __get_ncbi_code(self, rsat_organism):
    """Retrieve the NCBI taxonomy code from the organism.tab file of
    the given RSAT organism, or None if it cannot be determined."""
    try:
        cache_file = "/".join([self.cache_dir, '%s.tab' % rsat_organism])
        url = "/".join([self.base_url, RsatDatabase.DIR_PATH,
                        rsat_organism, RsatDatabase.ORGANISM_PATH])
        text = util.read_url_cached(url, cache_file)
        # first non-comment line; first tab-separated column is the code
        spec = [line for line in text.split('\n')
                if not line.startswith('--')][0]
        return spec.strip().split('\t')[0]
    except Exception:
        # narrowed from a bare 'except:', which would also swallow
        # KeyboardInterrupt/SystemExit; any download/parse failure
        # is deliberately reported as "unknown" (None)
        return None
def get_taxonomy_id(self, organism):
    """Return the (patched) NCBI taxonomy id for the given RSAT
    organism, read from its organism names file."""
    # original docstring/log message were copy-pasted from get_organism_names()
    logging.info('RSAT - get_taxonomy_id(%s)', organism)
    cache_file = "/".join([self.cache_dir, 'rsatnames_' + organism])
    text = util.read_url_cached(
        "/".join([self.base_url, RsatDatabase.DIR_PATH, organism,
                  RsatDatabase.ORGANISM_NAMES_PATH]), cache_file)
    organism_names_dfile = util.dfile_from_text(text, comment='--')
    # first column of the first data line holds the taxonomy id
    return patches.patch_ncbi_taxonomy(organism_names_dfile.lines[0][0])
def get_features(self, organism):
    """returns the specified organism's feature file contents

    Note: the current version only tries to read from feature.tab
    while the original cMonkey will fall back to cds.tab if that fails
    """
    cache_file = "/".join([self.cache_dir, organism + '_features'])
    feature_url = "/".join([self.base_url, RsatDatabase.DIR_PATH,
                            organism, RsatDatabase.FEATURE_PATH])
    return util.read_url_cached(feature_url, cache_file)
def get_features(self, organism):
    """returns the specified organism's feature file contents

    Note: the current version only tries to read from feature.tab
    while the original cMonkey will fall back to cds.tab if that fails
    """
    url = "/".join([self.base_url, RsatDatabase.DIR_PATH,
                    organism, RsatDatabase.FEATURE_PATH])
    cache_file = "/".join([self.cache_dir, organism + '_features'])
    return util.read_url_cached(url, cache_file)
def read_pssms():
    """Read all PSSMs from the cached RegulonDB PSSM set and return
    them as a list, one entry per 'Transcription Factor Name:' block
    parsed by read_pssm()."""
    # parenthesized print works under both Python 2 and 3 for one arg
    print("reading PSSMs...")
    pssm_text = util.read_url_cached(
        'http://regulondb.ccg.unam.mx/data/PSSMSet.txt',
        'cache/regulondb_pssms.txt').split('\n')
    # derive the slice offset from the marker instead of a magic 27
    tf_prefix = 'Transcription Factor Name: '
    num_lines = len(pssm_text)
    line_number = 0
    pssms = []
    while line_number < num_lines:
        line = pssm_text[line_number]
        if line.startswith(tf_prefix):
            # read_pssm consumes the PSSM body and returns the line
            # number where scanning should resume
            line_number, pssm = read_pssm(line[len(tf_prefix):],
                                          pssm_text, line_number + 1)
            pssms.append(pssm)
        line_number += 1
    return pssms
def __get_ncbi_code(self, rsat_organism):
    """Retrieve the NCBI taxonomy code from the organism.tab file of
    the given RSAT organism, or None if it cannot be determined."""
    try:
        cache_file = "/".join([self.cache_dir, '%s.tab' % rsat_organism])
        url = "/".join([self.base_url, RsatDatabase.DIR_PATH,
                        rsat_organism, RsatDatabase.ORGANISM_PATH])
        text = util.read_url_cached(url, cache_file)
        # first non-comment line; first tab-separated column is the code
        spec = [line for line in text.split('\n')
                if not line.startswith('--')][0]
        return spec.strip().split('\t')[0]
    except Exception:
        # narrowed from a bare 'except:', which would also swallow
        # KeyboardInterrupt/SystemExit; failures mean "unknown" (None)
        return None
def read_pssms():
    """Read all PSSMs from the cached RegulonDB PSSM set and return
    them as a list, one entry per 'Transcription Factor Name:' block
    parsed by read_pssm()."""
    # parenthesized print works under both Python 2 and 3 for one arg
    print("reading PSSMs...")
    pssm_text = util.read_url_cached(
        'http://regulondb.ccg.unam.mx/data/PSSMSet.txt',
        'cache/regulondb_pssms.txt').split('\n')
    # derive the slice offset from the marker instead of a magic 27
    tf_prefix = 'Transcription Factor Name: '
    num_lines = len(pssm_text)
    line_number = 0
    pssms = []
    while line_number < num_lines:
        line = pssm_text[line_number]
        if line.startswith(tf_prefix):
            # read_pssm consumes the PSSM body and returns the line
            # number where scanning should resume
            line_number, pssm = read_pssm(line[len(tf_prefix):],
                                          pssm_text, line_number + 1)
            pssms.append(pssm)
        line_number += 1
    return pssms
def get_contig_sequence(self, organism, contig):
    """Return the specified contig sequence, upper-cased and joined.

    Raises the underlying exception when the download fails; the
    original code fell through to the final return and crashed with a
    confusing NameError on the unbound 'seqstr'.
    """
    logging.debug('RSAT - get_contig_sequence(%s, %s)', organism, contig)
    cache_file = "/".join([self.cache_dir, organism + '_' + contig])
    url = "/".join([self.base_url, RsatDatabase.DIR_PATH, organism,
                    'genome', contig + '.raw'])
    #10-07-14 Crashed here with URL timeout. Maybe RSAT limits downloads?
    # On 10-08-14 I could download the other files with pdb.set_trace()
    # Maybe all I will need is a pause between files?
    try:
        seqstr = util.read_url_cached(url, cache_file).upper()
    except Exception:
        logging.error("Error downloading file: %s", url)
        logging.error("RSAT occasionally has connectivity problems.")
        logging.error("Try again later, or try a different RSAT mirror")
        logging.error("using the parameter --rsat_base_url")
        raise
    return join_contig_sequence(seqstr)
def get_contig_sequence(self, organism, contig):
    """Return the specified contig sequence, upper-cased and joined.

    Raises the underlying exception when the download fails; the
    original code fell through to the final return and crashed with a
    confusing NameError on the unbound 'seqstr'.
    """
    logging.debug('RSAT - get_contig_sequence(%s, %s)', organism, contig)
    cache_file = "/".join([self.cache_dir, organism + '_' + contig])
    url = "/".join([self.base_url, RsatDatabase.DIR_PATH, organism,
                    'genome', contig + '.raw'])
    #10-07-14 Crashed here with URL timeout. Maybe RSAT limits downloads?
    # On 10-08-14 I could download the other files with pdb.set_trace()
    # Maybe all I will need is a pause between files?
    try:
        seqstr = util.read_url_cached(url, cache_file).upper()
    except Exception:
        logging.error("Error downloading file: %s", url)
        logging.error("RSAT occasionally has connectivity problems.")
        logging.error("Try again later, or try a different RSAT mirror")
        logging.error("using the parameter --rsat_base_url")
        raise
    return join_contig_sequence(seqstr)
def get_features(self, organism):
    """Return the organism's feature file contents, with the columns
    rewritten into the canonical order: id, type, name, contig,
    start_pos, end_pos, strand.

    Note: the current version only tries to read from feature.tab
    while the original cMonkey will fall back to cds.tab if that fails
    """
    logging.debug('RSAT - get_features(%s)', organism)
    cache_file = "/".join([self.cache_dir,
                           organism + '_' + self.feature_name])
    raw_text = util.read_url_cached(
        "/".join([self.base_url, RsatDatabase.DIR_PATH, organism,
                  self.feature_path]),
        cache_file)

    # Later parts assume that the features file has exactly these
    # columns, in exactly this order.
    field_order = ['id', 'type', 'name', 'contig', 'start_pos',
                   'end_pos', 'strand']
    # Drop blank lines (including whitespace-only lines, which used to
    # crash the parser with an IndexError).
    lines = [line for line in raw_text.split('\n') if line.strip()]

    idxs = {}       # field name -> 1-based source column ('-- field' headers)
    targ_idxs = []  # 0-based source column for each canonical output column
    out_chunks = []
    for line in lines:
        parts = line.split()
        if parts[0] == '--':
            if parts[1] == 'field':
                # Header line of the form '-- field <idx> <name>'
                idxs[parts[3]] = parts[2]
                if parts[3] in field_order:
                    # Renumber the header to the canonical position;
                    # headers of non-canonical fields are dropped since
                    # their columns do not appear in the output.
                    new_idx = str(field_order.index(parts[3]) + 1)
                    out_chunks.append('-- field ' + new_idx + '\t' +
                                      parts[3] + '\n')
            else:
                # Pass other comment lines through unchanged.
                out_chunks.append(line + '\n')
        else:
            if not targ_idxs:
                # First data row: resolve the column permutation from
                # the header declarations seen so far.
                targ_idxs = [int(idxs[field]) - 1 for field in field_order]
            # Split on tabs, not whitespace: whitespace splitting shifted
            # columns whenever a field was empty or contained a space.
            fields = line.split('\t')
            out_line = '\t'.join(fields[idx].strip() for idx in targ_idxs)
            # Some RSAT files have a contig with ':'s instead of '_'s
            out_chunks.append(out_line.replace(':', '_') + '\n')
    #To Do: Overwrite cache file & add early check to see if we need the sub
    return ''.join(out_chunks)