Example #1
0
 def get_directory(self):
     """Return the RSAT directory listing page.

     The page at ``<base_url>/<RsatDatabase.DIR_PATH>`` is read through
     ``util.read_url_cached``, using ``<cache_dir>/rsat_dir.html`` as the
     cache file.
     """
     logging.info('RSAT - get_directory()')
     cache_file = "/".join([self.cache_dir, 'rsat_dir.html'])
     listing_url = "/".join([self.base_url, RsatDatabase.DIR_PATH])
     return util.read_url_cached(listing_url, cache_file)
Example #2
0
 def get_rsat_organism(self, kegg_organism):
     """Choose the RSAT organism name that corresponds to this organism.

     Two candidates are taken from the RSAT directory listing: the best
     matching link for ``self.kegg_species`` and the best matching link for
     ``kegg_organism`` (trailing '/' removed from both). A candidate is
     accepted when the NCBI code looked up for it equals ``self.ncbi_code``
     (compared as strings). The ``self.kegg_species`` candidate is checked
     first; the second candidate is only looked up when it differs from the
     first and the first did not match.

     If no candidate matches, a warning is logged and the
     ``self.kegg_species`` candidate is returned as a best effort.
     """
     logging.debug('RSAT - get_rsat_organism(%s)', kegg_organism)
     cache_file = "/".join([self.cache_dir, 'rsat_dir.html'])
     listing_url = "/".join([self.base_url, RsatDatabase.DIR_PATH])
     text = util.read_url_cached(listing_url, cache_file)
     suggestion1 = util.best_matching_links(self.kegg_species,
                                            text)[0].rstrip('/')
     suggestion2 = util.best_matching_links(kegg_organism,
                                            text)[0].rstrip('/')

     wanted_code = str(self.ncbi_code)
     if str(self.__get_ncbi_code(suggestion1)) == wanted_code:
         return suggestion1
     # Only spend a second lookup when there really is a second candidate.
     if (suggestion1 != suggestion2 and
             str(self.__get_ncbi_code(suggestion2)) == wanted_code):
         return suggestion2

     # logging.warn is a deprecated alias; logging.warning is the
     # supported spelling.
     logging.warning("can't find the correct RSAT mapping !")
     return suggestion1
Example #3
0
 def get_rsat_organism(self, kegg_organism):
     """Choose the RSAT organism name that corresponds to this organism.

     The RSAT directory listing is searched for the best matching link for
     ``self.kegg_species`` and for ``kegg_organism`` (trailing '/' removed
     from both). The first candidate whose NCBI code equals
     ``self.ncbi_code`` (compared as strings) is returned, with the
     ``self.kegg_species`` candidate checked first. The second candidate is
     only looked up when it differs from the first and the first did not
     match.

     If neither candidate matches, a warning is logged and the
     ``self.kegg_species`` candidate is returned as a best effort.
     """
     logging.debug('RSAT - get_rsat_organism(%s)', kegg_organism)
     cache_file = "/".join([self.cache_dir, 'rsat_dir.html'])
     text = util.read_url_cached("/".join([self.base_url,
                                           RsatDatabase.DIR_PATH]),
                                 cache_file)
     suggestion1 = util.best_matching_links(self.kegg_species, text)[0].rstrip('/')
     suggestion2 = util.best_matching_links(kegg_organism, text)[0].rstrip('/')

     expected = str(self.ncbi_code)
     candidates = [suggestion1]
     if suggestion2 != suggestion1:
         candidates.append(suggestion2)

     # Look codes up lazily so a match on the first candidate avoids the
     # second lookup altogether.
     for candidate in candidates:
         if str(self.__get_ncbi_code(candidate)) == expected:
             return candidate

     # logging.warn is a deprecated alias; logging.warning is the
     # supported spelling.
     logging.warning("can't find the correct RSAT mapping !")
     return suggestion1
Example #4
0
 def get_rsat_organism(self, kegg_organism):
     """Return the RSAT directory entry that best matches ``kegg_organism``.

     The RSAT directory listing is read through ``util.read_url_cached``
     and handed to ``util.best_matching_links``; the first result is
     returned with any trailing '/' removed.
     """
     # The message used to name get_directory(), which made the log
     # misleading; report the method that is actually running.
     logging.info('RSAT - get_rsat_organism(%s)', kegg_organism)
     cache_file = "/".join([self.cache_dir, 'rsat_dir.html'])
     listing_url = "/".join([self.base_url, RsatDatabase.DIR_PATH])
     text = util.read_url_cached(listing_url, cache_file)
     return util.best_matching_links(kegg_organism, text)[0].rstrip('/')
Example #5
0
 def get_organism_names(self, organism):
     """Return the contents of the organism names file for ``organism``.

     The file is read through ``util.read_url_cached`` and cached as
     ``<cache_dir>/rsatnames_<organism>``.
     """
     logging.info('RSAT - get_organism_names(%s)', organism)
     cache_file = "/".join([self.cache_dir, 'rsatnames_' + organism])
     url_parts = [self.base_url, RsatDatabase.DIR_PATH, organism,
                  RsatDatabase.ORGANISM_NAMES_PATH]
     return util.read_url_cached("/".join(url_parts), cache_file)
Example #6
0
 def get_feature_names(self, organism):
     """Return the contents of the feature names file for ``organism``.

     The URL is built from ``self.feature_names_path``; the cache file name
     is ``<organism>_<feature_name>_names`` inside ``self.cache_dir``.
     """
     cache_parts = [self.cache_dir,
                    organism + '_' + self.feature_name + '_names']
     cache_file = "/".join(cache_parts)
     url_parts = [self.base_url, RsatDatabase.DIR_PATH, organism,
                  self.feature_names_path]
     return util.read_url_cached("/".join(url_parts), cache_file)
Example #7
0
 def get_contig_sequence(self, organism, contig):
     """Return the upper-cased sequence text of one contig.

     The sequence is read from ``.../<organism>/genome/<contig>.raw``
     through ``util.read_url_cached`` and cached as
     ``<cache_dir>/<organism>_<contig>``.
     """
     cache_file = "/".join([self.cache_dir, organism + '_' + contig])
     url_parts = [self.base_url, RsatDatabase.DIR_PATH, organism,
                  'genome', contig + '.raw']
     raw_sequence = util.read_url_cached("/".join(url_parts), cache_file)
     return raw_sequence.upper()
Example #8
0
 def get_rsat_organism(self, kegg_organism):
     """Return the RSAT directory entry that best matches ``kegg_organism``.

     The RSAT directory listing is read through ``util.read_url_cached``
     and searched with ``util.best_matching_links``; the first result is
     returned with any trailing '/' removed.
     """
     # The message used to name get_directory(), which made the log
     # misleading; report the method that is actually running.
     logging.info('RSAT - get_rsat_organism(%s)', kegg_organism)
     cache_file = "/".join([self.cache_dir, 'rsat_dir.html'])
     text = util.read_url_cached("/".join([self.base_url,
                                           RsatDatabase.DIR_PATH]),
                                 cache_file)
     best_links = util.best_matching_links(kegg_organism, text)
     return best_links[0].rstrip('/')
Example #9
0
    def get_features(self, organism):
        """Return the organism's feature table with columns in a fixed order.

        The feature file is read through ``util.read_url_cached`` and
        rewritten line by line:

        * ``-- field <n> <name>`` header lines record the source column of
          each field. Those naming a wanted field are re-emitted with the
          field's new 1-based position; the others are dropped.
        * Any other line whose first token is ``--`` is copied unchanged.
        * Data lines are reduced to the columns id, type, name, contig,
          start_pos, end_pos, strand (in that order, tab separated), and
          every ':' in the selected values is replaced by '_'.
        * Empty and whitespace-only lines are skipped.

        Returns the rewritten table as one string in which every emitted
        line is newline-terminated.

        Raises KeyError if a data line is reached before every wanted field
        has been declared, and IndexError if a data line has too few
        columns.
        """
        logging.debug('RSAT - get_features(%s)', organism)
        cache_file = "/".join([self.cache_dir, organism + '_' + self.feature_name])
        url = "/".join([self.base_url, RsatDatabase.DIR_PATH, organism,
                        self.feature_path])
        text = util.read_url_cached(url, cache_file)

        # Later processing assumes exactly these columns, in this order.
        field_order = ['id', 'type', 'name', 'contig', 'start_pos', 'end_pos',
                       'strand']

        field_columns = {}   # field name -> 1-based source column (as text)
        target_columns = []  # 0-based source columns, in output order
        out_lines = []
        for raw_line in text.split('\n'):
            tokens = raw_line.split()
            if not tokens:
                # Blank or whitespace-only (e.g. a lone '\r'): nothing to do.
                continue

            if tokens[0] == '--':
                if len(tokens) >= 4 and tokens[1] == 'field':
                    field_name = tokens[3]
                    field_columns[field_name] = tokens[2]
                    if field_name in field_order:
                        new_column = field_order.index(field_name) + 1
                        out_lines.append('-- field %d\t%s'
                                         % (new_column, field_name))
                else:
                    out_lines.append(raw_line)
                continue

            if not target_columns:
                # Built lazily: all field headers precede the first data line.
                target_columns = [int(field_columns[name]) - 1
                                  for name in field_order]

            # Split on tabs so empty fields keep their position; only fall
            # back to whitespace splitting when the line has no tab at all.
            columns = raw_line.split('\t')
            if len(columns) == 1:
                columns = tokens
            reordered = '\t'.join(columns[idx].strip()
                                  for idx in target_columns)
            # Some RSAT files have a contig with ':'s instead of '_'s
            out_lines.append(reordered.replace(':', '_'))

        #To Do: Overwrite cache file & add early check to see if we need the sub
        return ''.join(out_line + '\n' for out_line in out_lines)
Example #10
0
 def get_feature_names(self, organism):
     """Return the contents of the feature names file for ``organism``.

     The file is read through ``util.read_url_cached`` and cached as
     ``<cache_dir>/<organism>_feature_names``.
     """
     cache_file = "/".join([self.cache_dir, organism + '_feature_names'])
     url_parts = [self.base_url, RsatDatabase.DIR_PATH, organism,
                  RsatDatabase.FEATURE_NAMES_PATH]
     names_url = "/".join(url_parts)
     return util.read_url_cached(names_url, cache_file)
 def get_operon_predictions_for(self, organism_id):
     """Return the operon predictions file for ``organism_id``.

     The file ``gnc<organism_id>.named`` is read from the ``operons``
     directory under ``self.base_url`` through ``util.read_url_cached`` and
     cached under the same file name in ``self.cache_dir``.
     """
     logging.info("MicrobesOnline.get_operon_predictions_for(%s)",
                  organism_id)
     file_name = 'gnc%s.named' % str(organism_id)
     url = '/'.join([self.base_url, 'operons', file_name])
     cache_file = '/'.join([self.cache_dir, file_name])
     return util.read_url_cached(url, cache_file)
 def get_operon_predictions_for(self, organism_id):
     """Return the operon predictions file for ``organism_id``.

     Both the remote file (in the ``operons`` directory under
     ``self.base_url``) and the cache file (in ``self.cache_dir``) are
     named ``gnc<organism_id>.named``.
     """
     logging.info("MicrobesOnline.get_operon_predictions_for(%s)",
                  organism_id)
     predictions_name = 'gnc%s.named' % str(organism_id)
     remote_parts = [self.base_url, 'operons', predictions_name]
     local_parts = [self.cache_dir, predictions_name]
     url = '/'.join(remote_parts)
     cache_file = '/'.join(local_parts)
     return util.read_url_cached(url, cache_file)
Example #13
0
 def get_taxonomy_id(self, organism):
     """Return the taxonomy id found in the organism names file.

     The organism names file for ``organism`` is read through
     ``util.read_url_cached`` and parsed with ``util.dfile_from_text``
     (lines starting with '--' are comments). The first column of the first
     parsed line is passed through ``patches.patch_ncbi_taxonomy`` and the
     result is returned.
     """
     # The message used to name get_organism_names(); report the method
     # that is actually running.
     logging.info('RSAT - get_taxonomy_id(%s)', organism)
     cache_file = "/".join([self.cache_dir, 'rsatnames_' + organism])
     names_url = "/".join([self.base_url, RsatDatabase.DIR_PATH, organism,
                           RsatDatabase.ORGANISM_NAMES_PATH])
     text = util.read_url_cached(names_url, cache_file)
     organism_names_dfile = util.dfile_from_text(text, comment='--')
     return patches.patch_ncbi_taxonomy(organism_names_dfile.lines[0][0])
Example #14
0
 def get_feature_names(self, organism):
     """Return the contents of the feature names file for ``organism``.

     The remote path ends in ``RsatDatabase.FEATURE_NAMES_PATH``; the cache
     file is ``<cache_dir>/<organism>_feature_names``.
     """
     cache_file = "/".join([self.cache_dir, organism + '_feature_names'])
     remote_url = "/".join([self.base_url, RsatDatabase.DIR_PATH, organism,
                            RsatDatabase.FEATURE_NAMES_PATH])
     return util.read_url_cached(remote_url, cache_file)
Example #15
0
 def get_feature_names(self, organism):
     """Return the contents of the feature names file for ``organism``.

     The remote path ends in ``self.feature_names_path``; the cache file is
     ``<cache_dir>/<organism>_<feature_name>_names``.
     """
     cache_file = "/".join([
         self.cache_dir,
         organism + '_' + self.feature_name + '_names',
     ])
     url_parts = [self.base_url, RsatDatabase.DIR_PATH, organism,
                  self.feature_names_path]
     return util.read_url_cached("/".join(url_parts), cache_file)
 def get_genome_for(self, organism_id):
     """Return the genome export for ``organism_id`` from Microbes Online.

     Requests ``genomeInfo.cgi`` with ``export=genome`` through
     ``util.read_url_cached`` and caches the result as
     ``<cache_dir>/mo_genome_<organism_id>.fasta``.
     """
     organism_key = str(organism_id)
     logging.info('MicrobesOnline.get_genome_for(%s)', organism_key)
     query = 'genomeInfo.cgi?tId=%s;export=genome' % organism_key
     url = '/'.join([self.base_url, 'cgi-bin', query])
     cache_name = 'mo_genome_%s.fasta' % organism_key
     cache_file = '/'.join([self.cache_dir, cache_name])
     return util.read_url_cached(url, cache_file)
 def get_genome_info_for(self, organism_id):
     """Return the genome info export for ``organism_id`` from Microbes Online.

     Requests ``genomeInfo.cgi`` with ``export=tab`` through
     ``util.read_url_cached`` and caches the result as
     ``<cache_dir>/mo_<organism_id>.genome_info``.
     """
     organism_key = str(organism_id)
     logging.info('MicrobesOnline.get_genome_info_for(%s)', organism_key)
     query = 'genomeInfo.cgi?tId=%s;export=tab' % organism_key
     url = '/'.join([self.base_url, 'cgi-bin', query])
     cache_name = 'mo_%s.genome_info' % organism_key
     cache_file = '/'.join([self.cache_dir, cache_name])
     return util.read_url_cached(url, cache_file)
Example #18
0
 def get_contig_sequence(self, organism, contig):
     """Return the sequence of one contig.

     The text of ``.../<organism>/genome/<contig>.raw`` is read through
     ``util.read_url_cached``, upper-cased and then passed to
     ``join_contig_sequence``, whose result is returned.
     """
     cache_file = "/".join([self.cache_dir, organism + '_' + contig])
     url_parts = [self.base_url, RsatDatabase.DIR_PATH, organism, 'genome',
                  contig + '.raw']
     raw_text = util.read_url_cached("/".join(url_parts), cache_file)
     return join_contig_sequence(raw_text.upper())
Example #19
0
 def __get_ncbi_code(self, rsat_organism):
     """Look up the NCBI code of ``rsat_organism`` in its organism file.

     The organism file is read through ``util.read_url_cached``. The first
     line that does not start with '--' is stripped and split on tabs, and
     its first column is returned.

     This is a best-effort lookup: any ordinary error (download failure,
     unexpected content) yields None instead of raising. KeyboardInterrupt
     and SystemExit are not swallowed.
     """
     try:
         cache_file = "/".join([self.cache_dir, '%s.tab' % rsat_organism])
         url = "/".join([self.base_url, RsatDatabase.DIR_PATH, rsat_organism,
                         RsatDatabase.ORGANISM_PATH])
         text = util.read_url_cached(url, cache_file)
         for line in text.split('\n'):
             if not line.startswith('--'):
                 return line.strip().split('\t')[0]
         # Only comment lines were found.
         return None
     except Exception:
         return None
Example #20
0
 def get_taxonomy_id(self, organism):
     """Return the taxonomy id found in the organism names file.

     The organism names file for ``organism`` is read through
     ``util.read_url_cached`` and parsed with ``util.dfile_from_text``
     (lines starting with '--' are comments). The first column of the first
     parsed line is passed through ``patches.patch_ncbi_taxonomy`` and the
     result is returned.
     """
     # The message used to name get_organism_names(); report the method
     # that is actually running.
     logging.info('RSAT - get_taxonomy_id(%s)', organism)
     cache_file = "/".join([self.cache_dir, 'rsatnames_' + organism])
     text = util.read_url_cached(
         "/".join([
             self.base_url, RsatDatabase.DIR_PATH, organism,
             RsatDatabase.ORGANISM_NAMES_PATH
         ]), cache_file)
     organism_names_dfile = util.dfile_from_text(text, comment='--')
     first_value = organism_names_dfile.lines[0][0]
     return patches.patch_ncbi_taxonomy(first_value)
Example #21
0
 def get_features(self, organism):
     """Return the contents of the feature file for ``organism``.

     Only the file at ``RsatDatabase.FEATURE_PATH`` is read (through
     ``util.read_url_cached``). The original cMonkey falls back to cds.tab
     when feature.tab cannot be read; this version does not.
     """
     cache_file = "/".join([self.cache_dir, organism + '_features'])
     url_parts = [self.base_url, RsatDatabase.DIR_PATH, organism,
                  RsatDatabase.FEATURE_PATH]
     return util.read_url_cached("/".join(url_parts), cache_file)
Example #22
0
 def get_features(self, organism):
     """Return the contents of the feature file for ``organism``.

     The file at ``RsatDatabase.FEATURE_PATH`` is read through
     ``util.read_url_cached`` and cached as
     ``<cache_dir>/<organism>_features``. Unlike the original cMonkey,
     there is no fallback to cds.tab when feature.tab cannot be read.
     """
     cache_file = "/".join([self.cache_dir, organism + '_features'])
     features_url = "/".join([self.base_url, RsatDatabase.DIR_PATH,
                              organism, RsatDatabase.FEATURE_PATH])
     return util.read_url_cached(features_url, cache_file)
def read_pssms():
    """Read every PSSM from the RegulonDB PSSM set.

    The PSSM set text is read through ``util.read_url_cached`` and scanned
    line by line. Each line starting with 'Transcription Factor Name: '
    opens a record that is parsed by ``read_pssm``, which receives the name
    (the rest of that line), all lines, and the index of the next line.

    Returns the list of parsed PSSMs in file order.
    """
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print("reading PSSMs...")
    name_prefix = 'Transcription Factor Name: '
    pssm_text = util.read_url_cached('http://regulondb.ccg.unam.mx/data/PSSMSet.txt',
                                     'cache/regulondb_pssms.txt').split('\n')
    num_lines = len(pssm_text)
    line_number = 0
    pssms = []
    while line_number < num_lines:
        line = pssm_text[line_number]
        if line.startswith(name_prefix):
            # read_pssm returns the line index to continue from, so the
            # scan position is taken from its result.
            line_number, pssm = read_pssm(line[len(name_prefix):], pssm_text,
                                          line_number + 1)
            pssms.append(pssm)
        line_number += 1
    return pssms
Example #24
0
 def __get_ncbi_code(self, rsat_organism):
     """Look up the NCBI code of ``rsat_organism`` in its organism file.

     The organism file is read through ``util.read_url_cached``. The first
     line that does not start with '--' is stripped and split on tabs, and
     its first column is returned.

     This is a best-effort lookup: any ordinary error (download failure,
     unexpected content) yields None instead of raising. KeyboardInterrupt
     and SystemExit are not swallowed.
     """
     try:
         cache_file = "/".join([self.cache_dir, '%s.tab' % rsat_organism])
         url = "/".join([
             self.base_url, RsatDatabase.DIR_PATH, rsat_organism,
             RsatDatabase.ORGANISM_PATH
         ])
         text = util.read_url_cached(url, cache_file)
         data_lines = (line for line in text.split('\n')
                       if not line.startswith('--'))
         for spec in data_lines:
             return spec.strip().split('\t')[0]
         # Only comment lines were found.
         return None
     except Exception:
         return None
def read_pssms():
    """Read every PSSM from the RegulonDB PSSM set.

    The PSSM set text is read through ``util.read_url_cached`` and scanned
    line by line. Each line starting with 'Transcription Factor Name: '
    opens a record that is parsed by ``read_pssm``, which receives the name
    (the rest of that line), all lines, and the index of the next line.

    Returns the list of parsed PSSMs in file order.
    """
    # Parenthesised form prints the same text on Python 2 and Python 3.
    print("reading PSSMs...")
    name_prefix = 'Transcription Factor Name: '
    prefix_length = len(name_prefix)
    pssm_text = util.read_url_cached(
        'http://regulondb.ccg.unam.mx/data/PSSMSet.txt',
        'cache/regulondb_pssms.txt').split('\n')
    num_lines = len(pssm_text)
    line_number = 0
    pssms = []
    while line_number < num_lines:
        line = pssm_text[line_number]
        if line.startswith(name_prefix):
            # read_pssm returns the line index to continue from, so the
            # scan position is taken from its result.
            line_number, pssm = read_pssm(line[prefix_length:], pssm_text,
                                          line_number + 1)
            pssms.append(pssm)
        line_number += 1
    return pssms
Example #26
0
 def get_contig_sequence(self, organism, contig):
     """Return the sequence of one contig.

     The text of ``.../<organism>/genome/<contig>.raw`` is read through
     ``util.read_url_cached``, upper-cased and passed to
     ``join_contig_sequence``, whose result is returned.

     If reading fails, advice on retrying or switching RSAT mirror is
     logged and the original exception is re-raised.
     """
     logging.debug('RSAT - get_contig_sequence(%s, %s)',
                 organism, contig)
     cache_file = "/".join([self.cache_dir, organism + '_' + contig])
     url = "/".join([self.base_url, RsatDatabase.DIR_PATH, organism,
                     'genome', contig + '.raw'])

     #10-07-14 Crashed here with URL timeout.  Maybe RSAT limits downloads?
     #  On 10-08-14 I could download the other files with pdb.set_trace()
     #  Maybe all I will need is a pause between files?
     try:
         seqstr = util.read_url_cached(url, cache_file).upper()
     except Exception:
         logging.error("Error downloading file: %s", url)
         logging.error("RSAT occasionally has connectivity problems.")
         logging.error("Try again later, or try a different RSAT mirror")
         logging.error("using the parameter --rsat_base_url")
         # Without this, execution fell through to the return below with
         # seqstr unbound and died with an unrelated UnboundLocalError.
         raise
     return join_contig_sequence(seqstr)
Example #27
0
    def get_contig_sequence(self, organism, contig):
        """Return the sequence of one contig.

        The text of ``.../<organism>/genome/<contig>.raw`` is read through
        ``util.read_url_cached``, upper-cased and passed to
        ``join_contig_sequence``, whose result is returned.

        If reading fails, advice on retrying or switching RSAT mirror is
        logged and the original exception is re-raised.
        """
        logging.debug('RSAT - get_contig_sequence(%s, %s)', organism, contig)
        cache_file = "/".join([self.cache_dir, organism + '_' + contig])
        url = "/".join([
            self.base_url, RsatDatabase.DIR_PATH, organism, 'genome',
            contig + '.raw'
        ])

        #10-07-14 Crashed here with URL timeout.  Maybe RSAT limits downloads?
        #  On 10-08-14 I could download the other files with pdb.set_trace()
        #  Maybe all I will need is a pause between files?
        try:
            seqstr = util.read_url_cached(url, cache_file).upper()
        except Exception:
            logging.error("Error downloading file: %s", url)
            logging.error("RSAT occasionally has connectivity problems.")
            logging.error("Try again later, or try a different RSAT mirror")
            logging.error("using the parameter --rsat_base_url")
            # Without this, execution fell through to the return below with
            # seqstr unbound and died with an unrelated UnboundLocalError.
            raise
        return join_contig_sequence(seqstr)
Example #28
0
    def get_features(self, organism):
        """Return the organism's feature table with columns in a fixed order.

        The feature file is read through ``util.read_url_cached`` and
        rewritten line by line:

        * ``-- field <n> <name>`` header lines record the source column of
          each field. Those naming a wanted field are re-emitted with the
          field's new 1-based position; the others are dropped.
        * Any other line whose first token is ``--`` is copied unchanged.
        * Data lines are reduced to the columns id, type, name, contig,
          start_pos, end_pos, strand (in that order, tab separated), and
          every ':' in the selected values is replaced by '_'.
        * Empty and whitespace-only lines are skipped.

        Returns the rewritten table as one string in which every emitted
        line is newline-terminated.

        Raises KeyError if a data line is reached before every wanted field
        has been declared, and IndexError if a data line has too few
        columns.
        """
        logging.debug('RSAT - get_features(%s)', organism)
        cache_file = "/".join(
            [self.cache_dir, organism + '_' + self.feature_name])
        url = "/".join([
            self.base_url, RsatDatabase.DIR_PATH, organism,
            self.feature_path
        ])
        text = util.read_url_cached(url, cache_file)

        # Later processing assumes exactly these columns, in this order.
        field_order = [
            'id', 'type', 'name', 'contig', 'start_pos', 'end_pos', 'strand'
        ]

        field_columns = {}  # field name -> 1-based source column (as text)
        target_columns = []  # 0-based source columns, in output order
        out_lines = []
        for raw_line in text.split('\n'):
            tokens = raw_line.split()
            if not tokens:
                # Blank or whitespace-only (e.g. a lone '\r'): nothing to do.
                continue

            if tokens[0] == '--':
                if len(tokens) >= 4 and tokens[1] == 'field':
                    field_name = tokens[3]
                    field_columns[field_name] = tokens[2]
                    if field_name in field_order:
                        new_column = field_order.index(field_name) + 1
                        out_lines.append('-- field %d\t%s' %
                                         (new_column, field_name))
                else:
                    out_lines.append(raw_line)
                continue

            if not target_columns:
                # Built lazily: all field headers precede the first data line.
                target_columns = [
                    int(field_columns[name]) - 1 for name in field_order
                ]

            # Split on tabs so that empty fields and values containing
            # spaces keep their column; plain whitespace splitting shifted
            # them. Lines without any tab still use the whitespace tokens.
            columns = raw_line.split('\t')
            if len(columns) == 1:
                columns = tokens
            reordered = '\t'.join(columns[idx].strip()
                                  for idx in target_columns)
            # Some RSAT files have a contig with ':'s instead of '_'s
            out_lines.append(reordered.replace(':', '_'))

        #To Do: Overwrite cache file & add early check to see if we need the sub
        return ''.join(out_line + '\n' for out_line in out_lines)