def test_create_from_delimited_file2_case_sensitive(self):
     """Test create_from_delimited_file2() with case_sensitive=True.

     Builds a thesaurus from MockDelimitedFile2 and expects lookups by the
     lower-case alternative names 'alt1', 'alt2' and 'alt3' to return the
     lower-case gene names 'gene1', 'gene1' and 'gene2'.
     """
     thes = thesaurus.create_from_delimited_file2(MockDelimitedFile2(),
                                                  case_sensitive=True)
     # assertEqual is used instead of the deprecated assertEquals alias,
     # which no longer exists in Python 3.12+
     self.assertEqual('gene1', thes['alt1'])
     self.assertEqual('gene1', thes['alt2'])
     self.assertEqual('gene2', thes['alt3'])
 def test_create_from_delimited_file2_case_sensitive(self):
     """Test create_from_delimited_file2() with case_sensitive=True.

     Builds a thesaurus from MockDelimitedFile2 and expects lookups by the
     lower-case alternative names 'alt1', 'alt2' and 'alt3' to return the
     lower-case gene names 'gene1', 'gene1' and 'gene2'.
     """
     thes = thesaurus.create_from_delimited_file2(MockDelimitedFile2(),
                                                  case_sensitive=True)
     # assertEqual is used instead of the deprecated assertEquals alias,
     # which no longer exists in Python 3.12+
     self.assertEqual('gene1', thes['alt1'])
     self.assertEqual('gene1', thes['alt2'])
     self.assertEqual('gene2', thes['alt3'])
 def test_create_from_delimited_file2(self):
     """Test create_from_delimited_file2() with case_sensitive=False.

     Builds a thesaurus from MockDelimitedFile2 and expects lookups by the
     upper-case alternative names 'ALT1', 'ALT2' and 'ALT3' to return the
     upper-case gene names 'GENE1', 'GENE1' and 'GENE2'.
     """
     thes = thesaurus.create_from_delimited_file2(MockDelimitedFile2(),
                                                  case_sensitive=False)
     # assertEqual is used instead of the deprecated assertEquals alias,
     # which no longer exists in Python 3.12+
     self.assertEqual('GENE1', thes['ALT1'])
     self.assertEqual('GENE1', thes['ALT2'])
     self.assertEqual('GENE2', thes['ALT3'])
 def test_create_from_delimited_file2(self):
     """Test create_from_delimited_file2() with case_sensitive=False.

     Builds a thesaurus from MockDelimitedFile2 and expects lookups by the
     upper-case alternative names 'ALT1', 'ALT2' and 'ALT3' to return the
     upper-case gene names 'GENE1', 'GENE1' and 'GENE2'.
     """
     thes = thesaurus.create_from_delimited_file2(MockDelimitedFile2(),
                                                  case_sensitive=False)
     # assertEqual is used instead of the deprecated assertEquals alias,
     # which no longer exists in Python 3.12+
     self.assertEqual('GENE1', thes['ALT1'])
     self.assertEqual('GENE1', thes['ALT2'])
     self.assertEqual('GENE2', thes['ALT3'])
 def thesaurus(self):
     """Return the synonyms, loading them on first use.

     If synonyms are already stored on the instance (and truthy) they are
     returned as is; otherwise they are created from the thesaurus file
     via thesaurus.create_from_delimited_file2(), stored and returned.
     """
     # fast path: reuse what was loaded earlier
     if self.__synonyms:
         return self.__synonyms

     # nothing cached yet: build from the delimited thesaurus file
     self.__synonyms = thesaurus.create_from_delimited_file2(
         self.__thesaurus_filename)
     return self.__synonyms
Example #6
0
    def make_organism(self):
        """Build and return the organism object to work on.

        Steps, in order:
          1. read the KEGG and GO mapping files (user path first, then the
             system path)
          2. pick the RSAT source (local files or RSAT database) and the
             operon source (operon file or Microbes Online)
          3. assemble the network factories for STRING and/or operons,
             each with an equal share of the total network weight
          4. create an org.Microbe or an org.RSATOrganism, depending on
             whether the organism code is listed in VERTEBRATES
          5. insert the network names and the configured sequence types
             into the 'statstypes' table

        Side effects: when no STRING file is configured, sets
        self['string_file'] and fetches the file via util.get_url_cached().

        Raises:
            Exception: if neither the user nor the system KEGG/GO file
                exists, or if 'rsat_dir' is set without 'rsat_organism'.
        """
        self.__make_dirs_if_needed()

        # KEGG mapping: a user-level file takes precedence over the system one
        if os.path.exists(USER_KEGG_FILE_PATH):
            keggfile = util.read_dfile(USER_KEGG_FILE_PATH, comment='#')
        elif os.path.exists(SYSTEM_KEGG_FILE_PATH):
            keggfile = util.read_dfile(SYSTEM_KEGG_FILE_PATH, comment='#')
        else:
            raise Exception('KEGG file not found !!')

        # GO mapping: same user-before-system lookup order
        if os.path.exists(USER_GO_FILE_PATH):
            gofile = util.read_dfile(USER_GO_FILE_PATH)
        elif os.path.exists(SYSTEM_GO_FILE_PATH):
            gofile = util.read_dfile(SYSTEM_GO_FILE_PATH)
        else:
            raise Exception('GO file not found !!')

        # RSAT source: local files when 'rsat_dir' is set (this requires
        # 'rsat_organism' as well), otherwise the database at RSAT_BASE_URL
        if self['rsat_dir']:
            if not self['rsat_organism']:
                raise Exception('override RSAT loading: please specify --rsat_organism')
            logging.info("using RSAT files for '%s'", self['rsat_organism'])
            rsatdb = rsat.RsatFiles(self['rsat_dir'], self['rsat_organism'], self['ncbi_code'])
        else:
            rsatdb = rsat.RsatDatabase(rsat.RSAT_BASE_URL, self['cache_dir'])

        # operon source: an explicit operon file, or Microbes Online
        if self['operon_file']:
            logging.info("using operon file at '%s'", self['operon_file'])
            mo_db = microbes_online.MicrobesOnlineOperonFile(self['operon_file'])
        else:
            logging.info("attempting automatic download of operons from Microbes Online")
            mo_db = microbes_online.MicrobesOnline(self['cache_dir'])

        stringfile = self['string_file']
        # kegg_map is looked up by organism code below
        # NOTE(review): presumably column 1 is the key and column 3 the value
        # - confirm against util.make_dfile_map
        kegg_map = util.make_dfile_map(keggfile, 1, 3)
        ncbi_code = self['ncbi_code']
        nw_factories = []
        # every organism code not listed in VERTEBRATES is treated as a microbe
        is_microbe = self['organism_code'] not in VERTEBRATES

        # determine the final weights. note: for now, we will just check whether
        # we have 1 or 2 networks
        num_networks = 0
        if not self['nonetworks'] and self['use_string']:
            num_networks += 1
        if is_microbe and not self['nonetworks'] and self['use_operons']:
            num_networks += 1
        # the weight is split evenly among the enabled networks
        # (stays 0.0 when none is enabled)
        network_weight = 0.0
        if num_networks > 0:
            network_weight = 1.0 / num_networks


        # do we use STRING ?
        if not self['nonetworks'] and self['use_string']:
            # download if not provided
            if stringfile is None:
                # no NCBI code configured: use the taxonomy id reported by
                # the RSAT species info
                if ncbi_code is None:
                    rsat_info = org.RsatSpeciesInfo(rsatdb,
                                                    kegg_map[self['organism_code']],
                                                    self['rsat_organism'], None)
                    ncbi_code = rsat_info.taxonomy_id

                logging.info("NCBI CODE IS: %s", ncbi_code)
                url = STRING_URL_PATTERN % ncbi_code
                # the file is stored in the cache directory as '<ncbi_code>.gz'
                # and the path is recorded in the configuration
                stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
                self['string_file'] = stringfile
                logging.info("Automatically using STRING file in '%s'", stringfile)
                util.get_url_cached(url, stringfile)
            else:
                logging.info("Loading STRING file at '%s'", stringfile)

            # create and add network
            nw_factories.append(stringdb.get_network_factory(
                self['organism_code'], stringfile, network_weight))

        # do we use operons ?
        if is_microbe and not self['nonetworks'] and self['use_operons']:
            logging.debug('adding operon network factory')
            # NOTE(review): '/' yields a float under Python 3 - confirm
            # whether max_operon_size is expected to be an integer
            nw_factories.append(microbes_online.get_network_factory(
                mo_db, max_operon_size=self.ratios.num_rows / 20,
                weight=network_weight))

        orgcode = self['organism_code']
        logging.debug("Creating Microbe object for '%s'", orgcode)
        keggorg = kegg_map[orgcode]
        # NOTE(review): this passes self['ncbi_code'], not the local ncbi_code
        # that may have been resolved from RSAT above - confirm this is intended
        rsat_info = org.RsatSpeciesInfo(rsatdb, keggorg, self['rsat_organism'],
                                        self['ncbi_code'])
        # GO taxonomy entry, looked up by the RSAT species' GO species name
        gotax = util.make_dfile_map(gofile, 0, 1)[rsat_info.go_species()]
        # synonyms are optional: only loaded when a synonym file is configured
        synonyms = None
        if self['synonym_file'] is not None:
            synonyms = thesaurus.create_from_delimited_file2(self['synonym_file'],
                                                             self['case_sensitive'])

        # microbes additionally receive the operon database and the
        # 'use_operons' setting
        if is_microbe:
            organism = org.Microbe(orgcode, keggorg, rsat_info, gotax, mo_db, nw_factories,
                                   self['search_distances'], self['scan_distances'],
                                   self['use_operons'], self.ratios, synonyms)
        else:
            organism = org.RSATOrganism(orgcode, keggorg, rsat_info, gotax, nw_factories,
                                        self['search_distances'], self['scan_distances'],
                                        self.ratios, synonyms)

        # register the network names and sequence types as stats types
        # NOTE(review): 'with conn' presumably commits on success
        # (sqlite3-style connection) - confirm against __dbconn()
        conn = self.__dbconn()
        with conn:
            for network in organism.networks():
                conn.execute("insert into statstypes values ('network',?)", [network.name])
            for sequence_type in self['sequence_types']:
                conn.execute("insert into statstypes values ('seqtype',?)", [sequence_type])

        return organism
Example #7
0
    def make_organism(self):
        """Build and return the organism object to work on.

        Steps, in order:
          1. obtain the NCBI code and KEGG species from __get_kegg_data()
             and read the GO mapping file (user path first, then the
             system path)
          2. pick the RSAT source (local files or RSAT database) and the
             operon source (operon file or Microbes Online)
          3. assemble the network factories for STRING and/or operons,
             each with an equal share of the total network weight
          4. create the organism object, passing self['fasta_file'] along
          5. insert the network names and the configured sequence types
             into the 'statstypes' table

        Side effects: when no STRING file is configured, sets
        self['string_file'] and fetches the file via util.get_url_cached().

        Raises:
            Exception: if neither the user nor the system GO file exists,
                or if 'rsat_dir' is set without 'rsat_organism'.
        """
        self.__make_dirs_if_needed()
        ncbi_code, kegg_species = self.__get_kegg_data()

        # GO mapping: a user-level file takes precedence over the system one
        if os.path.exists(USER_GO_FILE_PATH):
            gofile = util.read_dfile(USER_GO_FILE_PATH)
        elif os.path.exists(SYSTEM_GO_FILE_PATH):
            gofile = util.read_dfile(SYSTEM_GO_FILE_PATH)
        else:
            raise Exception('GO file not found !!')

        # RSAT source: local files when 'rsat_dir' is set (this requires
        # 'rsat_organism' as well), otherwise the database at 'rsat_base_url'
        if self['rsat_dir']:
            if not self['rsat_organism']:
                raise Exception('override RSAT loading: please specify --rsat_organism')
            logging.info("using RSAT files for '%s'", self['rsat_organism'])
            rsatdb = rsat.RsatFiles(self['rsat_dir'], self['rsat_organism'], ncbi_code, self['rsat_features'], self['rsat_base_url'])
        else:
            rsatdb = rsat.RsatDatabase(self['rsat_base_url'], self['cache_dir'], kegg_species, ncbi_code, self['rsat_features'])

        # operon source: an explicit operon file, or Microbes Online
        if self['operon_file']:
            logging.info("using operon file at '%s'", self['operon_file'])
            mo_db = microbes_online.MicrobesOnlineOperonFile(self['operon_file'])
        else:
            logging.info("attempting automatic download of operons from Microbes Online")
            mo_db = microbes_online.MicrobesOnline(self['cache_dir'])

        stringfile = self['string_file']
        nw_factories = []
        # every organism code not listed in VERTEBRATES is treated as a microbe
        is_microbe = self['organism_code'] not in VERTEBRATES

        # determine the final weights. note: for now, we will just check whether
        # we have 1 or 2 networks
        num_networks = 0
        if not self['nonetworks'] and self['use_string']:
            num_networks += 1
        if is_microbe and not self['nonetworks'] and self['use_operons']:
            num_networks += 1
        # the weight is split evenly among the enabled networks
        # (stays 0.0 when none is enabled)
        network_weight = 0.0
        if num_networks > 0:
            network_weight = 1.0 / num_networks


        # do we use STRING ?
        if not self['nonetworks'] and self['use_string']:
            # download if not provided
            if stringfile is None:
                # no NCBI code available: use the taxonomy id reported by
                # the RSAT species info
                if ncbi_code is None:
                    rsat_info = org.RsatSpeciesInfo(rsatdb, kegg_species,
                                                    self['rsat_organism'], None)
                    ncbi_code = rsat_info.taxonomy_id

                logging.info("NCBI CODE IS: %s", ncbi_code)
                url = STRING_URL_PATTERN % ncbi_code
                # the file is stored in the cache directory as '<ncbi_code>.gz'
                # and the path is recorded in the configuration
                stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
                self['string_file'] = stringfile
                logging.info("Automatically using STRING file in '%s'", stringfile)
                util.get_url_cached(url, stringfile)
            else:
                logging.info("Loading STRING file at '%s'", stringfile)

            # create and add network
            nw_factories.append(stringdb.get_network_factory(
                self['organism_code'], stringfile, network_weight))

        # do we use operons ?
        if is_microbe and not self['nonetworks'] and self['use_operons']:
            logging.debug('adding operon network factory')
            # NOTE(review): '/' yields a float under Python 3 - confirm
            # whether max_operon_size is expected to be an integer
            nw_factories.append(microbes_online.get_network_factory(
                mo_db, max_operon_size=self.ratios.num_rows / 20,
                weight=network_weight))

        orgcode = self['organism_code']
        logging.debug("Creating Microbe object for '%s'", orgcode)
        # uses the local ncbi_code, which may have been resolved from RSAT above
        rsat_info = org.RsatSpeciesInfo(rsatdb, kegg_species, self['rsat_organism'],
                                        ncbi_code)
        # GO taxonomy entry, looked up by the RSAT species' GO species name
        gotax = util.make_dfile_map(gofile, 0, 1)[rsat_info.go_species()]
        # synonyms are optional: only loaded when a synonym file is configured
        synonyms = None
        if self['synonym_file'] is not None:
            synonyms = thesaurus.create_from_delimited_file2(self['synonym_file'],
                                                             self['case_sensitive'])

        # New logic: test to see if there's a FASTA file. If not, then
        # download it from RSAT, process it, and then return the new file name.
        # NOTE(review): no such check is visible in this method; self['fasta_file']
        # is simply passed on to the organism constructor - confirm where the
        # described logic lives.

        # NOTE(review): the assignment below overrides the is_microbe value
        # computed from VERTEBRATES above, so the RSATOrganism branch can
        # never run - confirm this is intentional.
        is_microbe = True
        if is_microbe:
           organism = org.Microbe(orgcode, kegg_species, rsat_info, gotax, mo_db,
                                   nw_factories,
                                   self['search_distances'], self['scan_distances'],
                                   self['use_operons'], self.ratios, synonyms,
                                   self['fasta_file'])
        else:
            organism = org.RSATOrganism(orgcode, kegg_species, rsat_info, gotax,
                                        nw_factories,
                                        self['search_distances'], self['scan_distances'],
                                        self.ratios, synonyms,
                                        self['fasta_file'])

        # register the network names and sequence types as stats types
        # NOTE(review): 'with conn' presumably commits on success
        # (sqlite3-style connection) - confirm against __dbconn()
        conn = self.__dbconn()
        with conn:
            for network in organism.networks():
                conn.execute("insert into statstypes values ('network',?)", [network.name])
            for sequence_type in self['sequence_types']:
                conn.execute("insert into statstypes values ('seqtype',?)", [sequence_type])

        return organism
Example #8
0
 def thesaurus(self):
     """Return the synonyms, loading them on first use.

     If synonyms are already stored on the instance (and truthy) they are
     returned as is; otherwise they are created from the thesaurus file
     via thesaurus.create_from_delimited_file2(), stored and returned.
     """
     # fast path: reuse what was loaded earlier
     if self.__synonyms:
         return self.__synonyms

     # nothing cached yet: build from the delimited thesaurus file
     self.__synonyms = thesaurus.create_from_delimited_file2(
         self.__thesaurus_filename)
     return self.__synonyms