def make_halo(ratio_matrix, search_distances, scan_distances):
    """Create the organism object for organism code 'hal'.

    Loads the KEGG and GO delimited files, creates the RSAT and Microbes
    Online data sources and registers two network factories (one built by
    ``stringdb``, one built by ``microbes_online``), each with weight 0.5.

    :param ratio_matrix: object whose ``num_rows`` attribute determines the
        maximum operon size (``num_rows / 20``)
    :param search_distances: forwarded unchanged to ``MicrobeFactory.create``
    :param scan_distances: forwarded unchanged to ``MicrobeFactory.create``
    :return: the result of ``MicrobeFactory.create`` for 'hal'
    """
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(rsat.RSAT_BASE_URL, CACHE_DIR)
    mo_db = microbes_online.MicrobesOnline(CACHE_DIR)
    stringfile = 'testdata/string_links_64091.tab'

    nw_factories = []
    # None is a singleton: compare by identity, not equality
    if stringfile is not None:
        nw_factories.append(
            stringdb.get_network_factory2('hal', stringfile, 0.5,
                                          normalized=True))
    else:
        # logging.warn is a deprecated alias of logging.warning
        logging.warning("no STRING file specified !")

    # the Microbes Online network is always added, independent of STRING
    nw_factories.append(
        microbes_online.get_network_factory(
            mo_db, max_operon_size=ratio_matrix.num_rows / 20, weight=0.5))

    org_factory = org.MicrobeFactory(org.make_kegg_code_mapper(keggfile),
                                     org.make_rsat_organism_mapper(rsatdb),
                                     org.make_go_taxonomy_mapper(gofile),
                                     mo_db,
                                     nw_factories)
    return org_factory.create('hal', search_distances, scan_distances)
def make_halo(search_distances, scan_distances, ratios=None):
    """Create the ``org.Microbe`` object for organism code 'hal'.

    Loads the KEGG and GO delimited files, creates the RSAT and Microbes
    Online data sources, registers the STRING network factory and, only when
    ``ratios`` is given, the Microbes Online network factory as well.

    :param search_distances: forwarded unchanged to ``org.Microbe``
    :param scan_distances: forwarded unchanged to ``org.Microbe``
    :param ratios: optional object with a ``num_rows`` attribute; when it is
        None no Microbes Online network factory is registered
    :return: the constructed ``org.Microbe`` instance
    """
    # numeric code that is handed to both RsatDatabase and RsatSpeciesInfo
    ncbi_code = 64091

    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, CACHE_DIR,
                               'Halobacterium sp', ncbi_code)
    mo_db = microbes_online.MicrobesOnline(CACHE_DIR)
    stringfile = 'testdata/string_links_64091.tab'

    nw_factories = []
    # None is a singleton: compare by identity, not equality
    if stringfile is not None:
        nw_factories.append(
            stringdb.get_network_factory('hal', stringfile, 0.5))
    else:
        # logging.warn is a deprecated alias of logging.warning
        logging.warning("no STRING file specified !")

    # the operon size limit is derived from the ratios, so this network
    # can only be set up when ratios were supplied
    if ratios is not None:
        nw_factories.append(microbes_online.get_network_factory(
            mo_db, max_operon_size=ratios.num_rows / 20, weight=0.5))

    keggorg = util.make_dfile_map(keggfile, 1, 3)['hal']
    rsat_organism = rsatdb.get_rsat_organism(keggorg)
    rsat_info = org.RsatSpeciesInfo(rsatdb, keggorg, rsat_organism, ncbi_code)
    gotax = util.make_dfile_map(gofile, 0, 1)[rsat_info.go_species()]
    return org.Microbe('hal', keggorg, rsat_info, gotax, mo_db, nw_factories,
                       search_distances, scan_distances, True, None)
def test_get_network_factory(self):
    """Happy path: the factory builds a network from three mock features.

    Expects 3 edges, a total score of 6000 and the weight that was handed
    to ``get_network_factory`` (123).
    """
    mo_db = MockMicrobesOnline('testdata/gnc64091.named')
    make_network = mo.get_network_factory(mo_db, 20, 123)
    features = {
        'gene1': st.Feature('feature1', 'typ1', 'feature_name1',
                            st.Location('contig1', 24, 89, False)),
        'gene2': st.Feature('feature2', 'typ1', 'feature_name2',
                            st.Location('contig1', 15, 21, False)),
        'gene3': st.Feature('feature3', 'typ2', 'feature_name3',
                            st.Location('contig1', 100, 154, False)),
    }
    network = make_network(MockOrganism('64091', features))

    # assertEquals is a deprecated alias (removed in Python 3.12)
    self.assertEqual(3, network.num_edges())
    self.assertEqual(6000, network.total_score())
    self.assertEqual(123, network.weight)
def make_microbe(self):
    """Create the organism object for the configured organism code.

    When no STRING file is configured, the taxonomy id is looked up through
    the RSAT mapper, the matching STRING file is fetched into the cache
    directory with ``util.get_url_cached`` and its path is stored back under
    ``self['string_file']``.  Two network factories (STRING and Microbes
    Online) are registered, each with weight 0.5.

    :return: the result of ``MicrobeFactory.create`` for
        ``self['organism_code']``
    """
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, self['cache_dir'])
    mo_db = microbes_online.MicrobesOnline()
    stringfile = self.config_params['string_file']
    kegg_mapper = org.make_kegg_code_mapper(keggfile)
    rsat_mapper = org.make_rsat_organism_mapper(rsatdb)

    # automatically download STRING file
    if stringfile is None:
        rsat_info = rsat_mapper(kegg_mapper(self['organism_code']))
        ncbi_code = rsat_info.taxonomy_id
        # report through logging rather than printing to stdout
        logging.info("NCBI CODE IS: %s", ncbi_code)
        url = STRING_URL_PATTERN % ncbi_code
        stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
        self['string_file'] = stringfile
        logging.info("Automatically using STRING file in '%s'", stringfile)
        util.get_url_cached(url, stringfile)

    nw_factories = []
    # stringfile is always set at this point (configured or downloaded);
    # the else branch is kept purely as a safeguard
    if stringfile is not None:
        nw_factories.append(stringdb.get_network_factory2(
            self['organism_code'], stringfile, 0.5))
    else:
        # logging.warn is a deprecated alias of logging.warning
        logging.warning("no STRING file specified !")

    # the Microbes Online network is always added, independent of STRING
    nw_factories.append(microbes_online.get_network_factory(
        mo_db, max_operon_size=self.ratio_matrix.num_rows / 20, weight=0.5))

    org_factory = org.MicrobeFactory(kegg_mapper,
                                     rsat_mapper,
                                     org.make_go_taxonomy_mapper(gofile),
                                     mo_db,
                                     nw_factories)
    return org_factory.create(self['organism_code'],
                              self['search_distances'],
                              self['scan_distances'])
def make_halo(ratio_matrix, search_distances, scan_distances):
    """Create the organism object for organism code 'hal'.

    Loads the KEGG and GO delimited files, creates the RSAT and Microbes
    Online data sources and registers two network factories (one built by
    ``stringdb``, one built by ``microbes_online``), each with weight 0.5.

    :param ratio_matrix: object whose ``num_rows`` attribute determines the
        maximum operon size (``num_rows / 20``)
    :param search_distances: forwarded unchanged to ``MicrobeFactory.create``
    :param scan_distances: forwarded unchanged to ``MicrobeFactory.create``
    :return: the result of ``MicrobeFactory.create`` for 'hal'
    """
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, CACHE_DIR)
    mo_db = microbes_online.MicrobesOnline()
    stringfile = 'string_links_64091.tab'

    nw_factories = []
    # None is a singleton: compare by identity, not equality
    if stringfile is not None:
        nw_factories.append(
            stringdb.get_network_factory2('hal', stringfile, 0.5))
    else:
        # logging.warn is a deprecated alias of logging.warning
        logging.warning("no STRING file specified !")

    # the Microbes Online network is always added, independent of STRING
    nw_factories.append(microbes_online.get_network_factory(
        mo_db, max_operon_size=ratio_matrix.num_rows / 20, weight=0.5))

    org_factory = org.MicrobeFactory(org.make_kegg_code_mapper(keggfile),
                                     org.make_rsat_organism_mapper(rsatdb),
                                     org.make_go_taxonomy_mapper(gofile),
                                     mo_db,
                                     nw_factories)
    return org_factory.create('hal', search_distances, scan_distances)
def make_microbe(self):
    """Create the organism object for the configured organism code.

    Loads the KEGG and GO delimited files, creates the RSAT and Microbes
    Online data sources and registers the network factories: the STRING one
    only if ``self.config_params['string_file']`` is set (a warning is
    logged otherwise), the Microbes Online one always.

    :return: the result of ``MicrobeFactory.create`` for
        ``self['organism_code']``
    """
    keggfile = util.DelimitedFile.read(KEGG_FILE_PATH, comment='#')
    gofile = util.DelimitedFile.read(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, self['cache_dir'])
    mo_db = microbes_online.MicrobesOnline()
    stringfile = self.config_params['string_file']

    nw_factories = []
    # None is a singleton: compare by identity, not equality
    if stringfile is not None:
        nw_factories.append(stringdb.get_network_factory2(stringfile, 0.5))
    else:
        # logging.warn is a deprecated alias of logging.warning
        logging.warning("no STRING file specified !")

    # the Microbes Online network is always added, independent of STRING
    nw_factories.append(microbes_online.get_network_factory(
        mo_db, max_operon_size=self.ratio_matrix.num_rows() / 20, weight=0.5))

    org_factory = org.MicrobeFactory(org.make_kegg_code_mapper(keggfile),
                                     org.make_rsat_organism_mapper(rsatdb),
                                     org.make_go_taxonomy_mapper(gofile),
                                     mo_db,
                                     nw_factories)
    return org_factory.create(self['organism_code'],
                              self['search_distances'],
                              self['scan_distances'])
def test_get_network_factory(self):
    """Happy path: the factory builds a network from three mock features.

    The network is created with ``check_size=False`` and is expected to
    have 3 edges, a total score of 6000 and the weight that was handed to
    ``get_network_factory`` (123).
    """
    mo_db = MockMicrobesOnline('testdata/gnc64091.named')
    make_network = mo.get_network_factory(mo_db, 20, 123)
    features = {
        'gene1': st.Feature('feature1', 'typ1', 'feature_name1',
                            st.Location('contig1', 24, 89, False)),
        'gene2': st.Feature('feature2', 'typ1', 'feature_name2',
                            st.Location('contig1', 15, 21, False)),
        'gene3': st.Feature('feature3', 'typ2', 'feature_name3',
                            st.Location('contig1', 100, 154, False)),
    }
    # every gene name maps onto itself
    identity_map = {'gene1': 'gene1', 'gene2': 'gene2', 'gene3': 'gene3'}
    network = make_network(MockOrganism('64091', features, identity_map),
                           check_size=False)

    # assertEquals is a deprecated alias (removed in Python 3.12)
    self.assertEqual(3, network.num_edges())
    self.assertEqual(6000, network.total_score())
    self.assertEqual(123, network.weight)
def make_microbe(self):
    """Assemble and return the organism this run operates on.

    In order: call ``__make_dirs_if_needed``, load the KEGG and GO tables
    (the user-level file takes precedence over the system-level one), pick
    the RSAT and operon data sources, collect the enabled network factories
    and let ``org.MicrobeFactory`` build the organism.

    Raises ``Exception`` when neither KEGG file or neither GO file exists,
    or when ``rsat_dir`` is set without ``rsat_organism``.
    """
    self.__make_dirs_if_needed()

    # KEGG table: the first existing candidate wins
    for kegg_path in (USER_KEGG_FILE_PATH, SYSTEM_KEGG_FILE_PATH):
        if os.path.exists(kegg_path):
            keggfile = util.read_dfile(kegg_path, comment='#')
            break
    else:
        raise Exception('KEGG file not found !!')

    # GO table: same lookup order
    for go_path in (USER_GO_FILE_PATH, SYSTEM_GO_FILE_PATH):
        if os.path.exists(go_path):
            gofile = util.read_dfile(go_path)
            break
    else:
        raise Exception('GO file not found !!')

    # RSAT source: local files if a directory is configured, else the
    # remote database backed by the cache directory
    if not self['rsat_dir']:
        rsatdb = rsat.RsatDatabase(rsat.RSAT_BASE_URL, self['cache_dir'])
    else:
        if not self['rsat_organism']:
            raise Exception(
                'override RSAT loading: please specify --rsat_organism')
        logging.info("using RSAT files for '%s'", self['rsat_organism'])
        rsatdb = rsat.RsatFiles(self['rsat_dir'],
                                self['rsat_organism'],
                                self['ncbi_code'])

    # operon source: explicit file if configured, else Microbes Online
    if not self['operon_file']:
        logging.info(
            "attempting automatic download of operons from Microbes Online")
        mo_db = microbes_online.MicrobesOnline(self['cache_dir'])
    else:
        logging.info("using operon file at '%s'", self['operon_file'])
        mo_db = microbes_online.MicrobesOnlineOperonFile(self['operon_file'])

    stringfile = self['string_file']
    kegg_mapper = org.make_kegg_code_mapper(keggfile)
    rsat_mapper = org.make_rsat_organism_mapper(rsatdb)
    ncbi_code = self['ncbi_code']
    nw_factories = []

    # STRING network
    if self['donetworks'] and self['use_string']:
        if stringfile is not None:
            logging.info("Loading STRING file at '%s'", stringfile)
        else:
            # nothing configured: derive the download location from the
            # NCBI code, asking RSAT for the code if it is unknown
            if ncbi_code is None:
                kegg_organism = kegg_mapper(self['organism_code'])
                rsat_info = rsat_mapper(kegg_organism, self['rsat_organism'])
                ncbi_code = rsat_info.taxonomy_id
            logging.info("NCBI CODE IS: %s", ncbi_code)
            url = STRING_URL_PATTERN % ncbi_code
            stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
            self['string_file'] = stringfile
            logging.info("Automatically using STRING file in '%s'",
                         stringfile)
            util.get_url_cached(url, stringfile)

        string_factory = stringdb.get_network_factory2(
            self['organism_code'], stringfile, 0.5)
        nw_factories.append(string_factory)

    # operon network
    if self['donetworks'] and self['use_operons']:
        logging.info('adding operon network factory')
        operon_factory = microbes_online.get_network_factory(
            mo_db,
            max_operon_size=self.ratio_matrix.num_rows / 20,
            weight=0.5)
        nw_factories.append(operon_factory)

    go_mapper = org.make_go_taxonomy_mapper(gofile)
    factory = org.MicrobeFactory(kegg_mapper, rsat_mapper, go_mapper,
                                 mo_db, nw_factories, self['ncbi_code'])
    return factory.create(self['organism_code'],
                          self['search_distances'],
                          self['scan_distances'],
                          self['use_operons'],
                          self['rsat_organism'],
                          self.ratio_matrix)
def make_organism(self):
    """Assemble and return the organism this run operates on.

    In order: call ``__make_dirs_if_needed``, load the KEGG and GO tables
    (the user-level file takes precedence over the system-level one), pick
    the RSAT and operon data sources, collect the enabled network factories
    with equal weights, build either an ``org.Microbe`` or an
    ``org.RSATOrganism`` (for organism codes listed in ``VERTEBRATES``) and
    record the network names and sequence types in the ``statstypes`` table.

    Raises ``Exception`` when neither KEGG file or neither GO file exists,
    or when ``rsat_dir`` is set without ``rsat_organism``.
    """
    self.__make_dirs_if_needed()

    # KEGG table: the first existing candidate wins
    for kegg_path in (USER_KEGG_FILE_PATH, SYSTEM_KEGG_FILE_PATH):
        if os.path.exists(kegg_path):
            keggfile = util.read_dfile(kegg_path, comment='#')
            break
    else:
        raise Exception('KEGG file not found !!')

    # GO table: same lookup order
    for go_path in (USER_GO_FILE_PATH, SYSTEM_GO_FILE_PATH):
        if os.path.exists(go_path):
            gofile = util.read_dfile(go_path)
            break
    else:
        raise Exception('GO file not found !!')

    # RSAT source: local files if a directory is configured, else the
    # remote database backed by the cache directory
    if not self['rsat_dir']:
        rsatdb = rsat.RsatDatabase(rsat.RSAT_BASE_URL, self['cache_dir'])
    else:
        if not self['rsat_organism']:
            raise Exception(
                'override RSAT loading: please specify --rsat_organism')
        logging.info("using RSAT files for '%s'", self['rsat_organism'])
        rsatdb = rsat.RsatFiles(self['rsat_dir'],
                                self['rsat_organism'],
                                self['ncbi_code'])

    # operon source: explicit file if configured, else Microbes Online
    if not self['operon_file']:
        logging.info(
            "attempting automatic download of operons from Microbes Online")
        mo_db = microbes_online.MicrobesOnline(self['cache_dir'])
    else:
        logging.info("using operon file at '%s'", self['operon_file'])
        mo_db = microbes_online.MicrobesOnlineOperonFile(self['operon_file'])

    stringfile = self['string_file']
    kegg_map = util.make_dfile_map(keggfile, 1, 3)
    ncbi_code = self['ncbi_code']
    nw_factories = []
    is_microbe = self['organism_code'] not in VERTEBRATES

    # Which networks are active?  Every active network gets the same share
    # of the total weight (for now there can only be one or two).
    with_string = bool(not self['nonetworks'] and self['use_string'])
    with_operons = bool(is_microbe and not self['nonetworks']
                        and self['use_operons'])
    num_networks = int(with_string) + int(with_operons)
    network_weight = 1.0 / num_networks if num_networks > 0 else 0.0

    # STRING network
    if with_string:
        if stringfile is not None:
            logging.info("Loading STRING file at '%s'", stringfile)
        else:
            # nothing configured: derive the download location from the
            # NCBI code, asking RSAT for the code if it is unknown
            if ncbi_code is None:
                rsat_info = org.RsatSpeciesInfo(
                    rsatdb, kegg_map[self['organism_code']],
                    self['rsat_organism'], None)
                ncbi_code = rsat_info.taxonomy_id
            logging.info("NCBI CODE IS: %s", ncbi_code)
            url = STRING_URL_PATTERN % ncbi_code
            stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
            self['string_file'] = stringfile
            logging.info("Automatically using STRING file in '%s'",
                         stringfile)
            util.get_url_cached(url, stringfile)

        string_factory = stringdb.get_network_factory(
            self['organism_code'], stringfile, network_weight)
        nw_factories.append(string_factory)

    # operon network
    if with_operons:
        logging.debug('adding operon network factory')
        operon_factory = microbes_online.get_network_factory(
            mo_db,
            max_operon_size=self.ratios.num_rows / 20,
            weight=network_weight)
        nw_factories.append(operon_factory)

    orgcode = self['organism_code']
    logging.debug("Creating Microbe object for '%s'", orgcode)
    keggorg = kegg_map[orgcode]
    # note: this uses the configured NCBI code, not the one derived above
    rsat_info = org.RsatSpeciesInfo(rsatdb, keggorg,
                                    self['rsat_organism'],
                                    self['ncbi_code'])
    gotax = util.make_dfile_map(gofile, 0, 1)[rsat_info.go_species()]

    if self['synonym_file'] is None:
        synonyms = None
    else:
        synonyms = thesaurus.create_from_delimited_file2(
            self['synonym_file'], self['case_sensitive'])

    if not is_microbe:
        organism = org.RSATOrganism(orgcode, keggorg, rsat_info, gotax,
                                    nw_factories,
                                    self['search_distances'],
                                    self['scan_distances'],
                                    self.ratios, synonyms)
    else:
        organism = org.Microbe(orgcode, keggorg, rsat_info, gotax, mo_db,
                               nw_factories,
                               self['search_distances'],
                               self['scan_distances'],
                               self['use_operons'],
                               self.ratios, synonyms)

    # register the network names and sequence types as statistics types
    conn = self.__dbconn()
    with conn:
        for nw in organism.networks():
            conn.execute("insert into statstypes values ('network',?)",
                         [nw.name])
        for seqtype in self['sequence_types']:
            conn.execute("insert into statstypes values ('seqtype',?)",
                         [seqtype])

    return organism
def make_microbe(self):
    """Build the organism object for the current configuration.

    Loads the KEGG and GO tables (user-level file first, then the
    system-level one), selects the RSAT and operon back ends, registers the
    STRING and operon network factories if they are switched on and hands
    everything to ``org.MicrobeFactory``.

    Raises ``Exception`` if a KEGG or GO file cannot be found, or if
    ``rsat_dir`` is configured but ``rsat_organism`` is not.
    """
    self.__make_dirs_if_needed()

    # locate the KEGG table
    for kegg_candidate in (USER_KEGG_FILE_PATH, SYSTEM_KEGG_FILE_PATH):
        if os.path.exists(kegg_candidate):
            keggfile = util.read_dfile(kegg_candidate, comment='#')
            break
    else:
        raise Exception('KEGG file not found !!')

    # locate the GO table
    for go_candidate in (USER_GO_FILE_PATH, SYSTEM_GO_FILE_PATH):
        if os.path.exists(go_candidate):
            gofile = util.read_dfile(go_candidate)
            break
    else:
        raise Exception('GO file not found !!')

    # RSAT back end
    if not self['rsat_dir']:
        rsatdb = rsat.RsatDatabase(rsat.RSAT_BASE_URL, self['cache_dir'])
    else:
        if not self['rsat_organism']:
            raise Exception(
                'override RSAT loading: please specify --rsat_organism')
        logging.info("using RSAT files for '%s'", self['rsat_organism'])
        rsatdb = rsat.RsatFiles(self['rsat_dir'],
                                self['rsat_organism'],
                                self['ncbi_code'])

    # operon back end
    if not self['operon_file']:
        logging.info(
            "attempting automatic download of operons from Microbes Online")
        mo_db = microbes_online.MicrobesOnline(self['cache_dir'])
    else:
        logging.info("using operon file at '%s'", self['operon_file'])
        mo_db = microbes_online.MicrobesOnlineOperonFile(self['operon_file'])

    stringfile = self['string_file']
    kegg_mapper = org.make_kegg_code_mapper(keggfile)
    rsat_mapper = org.make_rsat_organism_mapper(rsatdb)
    ncbi_code = self['ncbi_code']
    nw_factories = []

    # STRING network factory
    if self['donetworks'] and self['use_string']:
        if stringfile is not None:
            logging.info("Loading STRING file at '%s'", stringfile)
        else:
            # no file given: download it, keyed by the NCBI code (looked
            # up through RSAT when it is not configured)
            if ncbi_code is None:
                kegg_organism = kegg_mapper(self['organism_code'])
                rsat_info = rsat_mapper(kegg_organism, self['rsat_organism'])
                ncbi_code = rsat_info.taxonomy_id
            logging.info("NCBI CODE IS: %s", ncbi_code)
            url = STRING_URL_PATTERN % ncbi_code
            stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
            self['string_file'] = stringfile
            logging.info("Automatically using STRING file in '%s'",
                         stringfile)
            util.get_url_cached(url, stringfile)

        nw_factories.append(
            stringdb.get_network_factory2(self['organism_code'],
                                          stringfile, 0.5))

    # operon network factory
    if self['donetworks'] and self['use_operons']:
        logging.info('adding operon network factory')
        nw_factories.append(
            microbes_online.get_network_factory(
                mo_db,
                max_operon_size=self.ratio_matrix.num_rows / 20,
                weight=0.5))

    go_mapper = org.make_go_taxonomy_mapper(gofile)
    microbe_factory = org.MicrobeFactory(kegg_mapper, rsat_mapper, go_mapper,
                                         mo_db, nw_factories,
                                         self['ncbi_code'])
    return microbe_factory.create(self['organism_code'],
                                  self['search_distances'],
                                  self['scan_distances'],
                                  self['use_operons'],
                                  self['rsat_organism'],
                                  self.ratio_matrix)
def make_organism(self):
    """Assemble and return the organism object to work on.

    Obtains the NCBI code and KEGG species from ``__get_kegg_data``, loads
    the GO table (user-level file first, then the system-level one), picks
    the RSAT and operon data sources, collects the enabled network factories
    with equal weights, builds the organism and records the network names
    and sequence types in the ``statstypes`` table.

    Raises ``Exception`` when no GO file exists, or when ``rsat_dir`` is set
    without ``rsat_organism``.
    """
    self.__make_dirs_if_needed()
    ncbi_code, kegg_species = self.__get_kegg_data()

    # GO table: the user-level file takes precedence over the system one
    if os.path.exists(USER_GO_FILE_PATH):
        gofile = util.read_dfile(USER_GO_FILE_PATH)
    elif os.path.exists(SYSTEM_GO_FILE_PATH):
        gofile = util.read_dfile(SYSTEM_GO_FILE_PATH)
    else:
        raise Exception('GO file not found !!')

    # RSAT source: local files if a directory is configured, else the
    # database at 'rsat_base_url' together with the cache directory
    if self['rsat_dir']:
        if not self['rsat_organism']:
            raise Exception('override RSAT loading: please specify --rsat_organism')
        logging.info("using RSAT files for '%s'", self['rsat_organism'])
        rsatdb = rsat.RsatFiles(self['rsat_dir'], self['rsat_organism'], ncbi_code,
                                self['rsat_features'], self['rsat_base_url'])
    else:
        rsatdb = rsat.RsatDatabase(self['rsat_base_url'], self['cache_dir'],
                                   kegg_species, ncbi_code, self['rsat_features'])

    # operon source: explicit file if configured, else Microbes Online
    if self['operon_file']:
        logging.info("using operon file at '%s'", self['operon_file'])
        mo_db = microbes_online.MicrobesOnlineOperonFile(self['operon_file'])
    else:
        logging.info("attempting automatic download of operons from Microbes Online")
        mo_db = microbes_online.MicrobesOnline(self['cache_dir'])

    stringfile = self['string_file']
    nw_factories = []
    is_microbe = self['organism_code'] not in VERTEBRATES

    # determine the final weights. note: for now, we will just check whether
    # we have 1 or 2 networks
    num_networks = 0
    if not self['nonetworks'] and self['use_string']:
        num_networks += 1
    if is_microbe and not self['nonetworks'] and self['use_operons']:
        num_networks += 1

    # every active network gets the same share of the total weight
    network_weight = 0.0
    if num_networks > 0:
        network_weight = 1.0 / num_networks

    # do we use STRING ?
    if not self['nonetworks'] and self['use_string']:
        # download if not provided
        if stringfile is None:
            # the download URL is keyed by the NCBI code; ask RSAT for it
            # when __get_kegg_data did not supply one
            if ncbi_code is None:
                rsat_info = org.RsatSpeciesInfo(rsatdb, kegg_species,
                                                self['rsat_organism'], None)
                ncbi_code = rsat_info.taxonomy_id

            logging.info("NCBI CODE IS: %s", ncbi_code)
            url = STRING_URL_PATTERN % ncbi_code
            stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
            # remember the resulting path in the configuration
            self['string_file'] = stringfile
            logging.info("Automatically using STRING file in '%s'", stringfile)
            util.get_url_cached(url, stringfile)
        else:
            logging.info("Loading STRING file at '%s'", stringfile)

        # create and add network
        nw_factories.append(stringdb.get_network_factory(
            self['organism_code'], stringfile, network_weight))

    # do we use operons ?
    if is_microbe and not self['nonetworks'] and self['use_operons']:
        logging.debug('adding operon network factory')
        nw_factories.append(microbes_online.get_network_factory(
            mo_db, max_operon_size=self.ratios.num_rows / 20,
            weight=network_weight))

    orgcode = self['organism_code']
    logging.debug("Creating Microbe object for '%s'", orgcode)
    # unlike the lookup in the STRING branch, this passes ncbi_code, which
    # may have been filled in from RSAT above
    rsat_info = org.RsatSpeciesInfo(rsatdb, kegg_species, self['rsat_organism'],
                                    ncbi_code)
    gotax = util.make_dfile_map(gofile, 0, 1)[rsat_info.go_species()]
    synonyms = None
    if self['synonym_file'] is not None:
        synonyms = thesaurus.create_from_delimited_file2(self['synonym_file'],
                                                         self['case_sensitive'])

    # New logic: test to see if there's a fasta file. If not, then
    # download it from RSAT, process it, and then return the new file name.
    # NOTE(review): no such download logic is visible in this function; it
    # only passes self['fasta_file'] on to the organism - confirm where the
    # download is meant to happen.
    # NOTE(review): the assignment below unconditionally overrides the
    # VERTEBRATES check made earlier, so the RSATOrganism branch can never
    # run, although the earlier value was already used for the network
    # weights and the operon network - confirm this is intended.
    is_microbe = True
    if is_microbe:
        organism = org.Microbe(orgcode, kegg_species, rsat_info, gotax, mo_db,
                               nw_factories,
                               self['search_distances'], self['scan_distances'],
                               self['use_operons'], self.ratios, synonyms,
                               self['fasta_file'])
    else:
        organism = org.RSATOrganism(orgcode, kegg_species, rsat_info, gotax,
                                    nw_factories,
                                    self['search_distances'], self['scan_distances'],
                                    self.ratios, synonyms,
                                    self['fasta_file'])

    # register the network names and sequence types as statistics types;
    # both loops run inside the same "with conn" block
    conn = self.__dbconn()
    with conn:
        for network in organism.networks():
            conn.execute("insert into statstypes values ('network',?)", [network.name])
        for sequence_type in self['sequence_types']:
            conn.execute("insert into statstypes values ('seqtype',?)", [sequence_type])

    return organism