def make_halo(ratio_matrix, search_distances, scan_distances):
    """returns the organism object to work on"""
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(rsat.RSAT_BASE_URL, CACHE_DIR)
    mo_db = microbes_online.MicrobesOnline(CACHE_DIR)
    stringfile = 'testdata/string_links_64091.tab'

    nw_factories = []
    if stringfile != None:
        nw_factories.append(
            stringdb.get_network_factory2('hal',
                                          stringfile,
                                          0.5,
                                          normalized=True))
    else:
        logging.warn("no STRING file specified !")

    nw_factories.append(
        microbes_online.get_network_factory(
            mo_db, max_operon_size=ratio_matrix.num_rows / 20, weight=0.5))

    org_factory = org.MicrobeFactory(org.make_kegg_code_mapper(keggfile),
                                     org.make_rsat_organism_mapper(rsatdb),
                                     org.make_go_taxonomy_mapper(gofile),
                                     mo_db, nw_factories)

    return org_factory.create('hal', search_distances, scan_distances)
def make_halo(search_distances, scan_distances, ratios=None):
    """returns the organism object to work on"""
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, CACHE_DIR,
                               'Halobacterium sp', 64091)
    mo_db = microbes_online.MicrobesOnline(CACHE_DIR)
    stringfile = 'testdata/string_links_64091.tab'

    nw_factories = []
    if stringfile != None:
        nw_factories.append(stringdb.get_network_factory('hal', stringfile, 0.5))
    else:
        logging.warn("no STRING file specified !")

    if ratios is not None:
        nw_factories.append(microbes_online.get_network_factory(
            mo_db, max_operon_size=ratios.num_rows / 20, weight=0.5))

    keggorg = util.make_dfile_map(keggfile, 1, 3)['hal']
    rsat_organism = rsatdb.get_rsat_organism(keggorg)
    rsat_info = org.RsatSpeciesInfo(rsatdb, keggorg, rsat_organism, 64091)
    gotax = util.make_dfile_map(gofile, 0, 1)[rsat_info.go_species()]
    return org.Microbe('hal', keggorg, rsat_info, gotax, mo_db, nw_factories,
                       search_distances, scan_distances, True, None)
 def test_get_network_factory(self):
     """test happy path"""
     microbes_online = MockMicrobesOnline('testdata/gnc64091.named')
     network = mo.get_network_factory(microbes_online, 20, 123)(MockOrganism(
             '64091',
              {'gene1': st.Feature('feature1', 'typ1', 'feature_name1',
                                   st.Location('contig1', 24, 89, False)),
               'gene2': st.Feature('feature2', 'typ1', 'feature_name2',
                                   st.Location('contig1', 15, 21, False)),
               'gene3': st.Feature('feature3', 'typ2', 'feature_name3',
                                   st.Location('contig1', 100, 154, False))
               }))
     self.assertEquals(3, network.num_edges())
     self.assertEquals(6000, network.total_score())
     self.assertEquals(123, network.weight)
Example #4
0
    def make_microbe(self):
        """returns the organism object to work on"""
        keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
        gofile = util.read_dfile(GO_FILE_PATH)
        rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, self['cache_dir'])
        mo_db = microbes_online.MicrobesOnline()
        stringfile = self.config_params['string_file']
        kegg_mapper = org.make_kegg_code_mapper(keggfile)
        rsat_mapper = org.make_rsat_organism_mapper(rsatdb)

        # automatically download STRING file
        if stringfile == None:
            rsat_info = rsat_mapper(kegg_mapper(self['organism_code']))
            ncbi_code = rsat_info.taxonomy_id
            print "NCBI CODE IS: ", ncbi_code
            url = STRING_URL_PATTERN % ncbi_code
            stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
            self['string_file'] = stringfile
            logging.info("Automatically using STRING file in '%s'", stringfile)
            util.get_url_cached(url, stringfile)

        nw_factories = []
        if stringfile != None:
            nw_factories.append(stringdb.get_network_factory2(
                    self['organism_code'], stringfile, 0.5))
        else:
            logging.warn("no STRING file specified !")

        nw_factories.append(microbes_online.get_network_factory(
                mo_db, max_operon_size=self.ratio_matrix.num_rows / 20,
                weight=0.5))

        org_factory = org.MicrobeFactory(kegg_mapper,
                                         rsat_mapper,
                                         org.make_go_taxonomy_mapper(gofile),
                                         mo_db,
                                         nw_factories)
        return org_factory.create(self['organism_code'],
                                  self['search_distances'],
                                  self['scan_distances'])
def make_halo(ratio_matrix, search_distances, scan_distances):
    """returns the organism object to work on"""
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, CACHE_DIR        )
    mo_db = microbes_online.MicrobesOnline()
    stringfile = 'string_links_64091.tab'

    nw_factories = []
    if stringfile != None:
        nw_factories.append(stringdb.get_network_factory2('hal', stringfile, 0.5))
    else:
        logging.warn("no STRING file specified !")

    nw_factories.append(microbes_online.get_network_factory(
            mo_db, max_operon_size=ratio_matrix.num_rows / 20, weight=0.5))

    org_factory = org.MicrobeFactory(org.make_kegg_code_mapper(keggfile),
                                     org.make_rsat_organism_mapper(rsatdb),
                                     org.make_go_taxonomy_mapper(gofile),
                                     mo_db,
                                     nw_factories)

    return org_factory.create('hal', search_distances, scan_distances)
Example #6
0
    def make_microbe(self):
        """returns the organism object to work on"""
        keggfile = util.DelimitedFile.read(KEGG_FILE_PATH, comment='#')
        gofile = util.DelimitedFile.read(GO_FILE_PATH)
        rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, self['cache_dir'])
        mo_db = microbes_online.MicrobesOnline()
        stringfile = self.config_params['string_file']

        nw_factories = []
        if stringfile != None:
            nw_factories.append(stringdb.get_network_factory2(stringfile, 0.5))
        else:
            logging.warn("no STRING file specified !")

        nw_factories.append(microbes_online.get_network_factory(
                mo_db, max_operon_size=self.ratio_matrix.num_rows() / 20, weight=0.5))

        org_factory = org.MicrobeFactory(org.make_kegg_code_mapper(keggfile),
                                         org.make_rsat_organism_mapper(rsatdb),
                                         org.make_go_taxonomy_mapper(gofile),
                                         mo_db,
                                         nw_factories)
        return org_factory.create(self['organism_code'], self['search_distances'],
                                  self['scan_distances'])
Example #7
0
 def test_get_network_factory(self):
     """test happy path"""
     microbes_online = MockMicrobesOnline('testdata/gnc64091.named')
     network = mo.get_network_factory(microbes_online, 20, 123)(
         MockOrganism(
             '64091', {
                 'gene1':
                 st.Feature('feature1', 'typ1', 'feature_name1',
                            st.Location('contig1', 24, 89, False)),
                 'gene2':
                 st.Feature('feature2', 'typ1', 'feature_name2',
                            st.Location('contig1', 15, 21, False)),
                 'gene3':
                 st.Feature('feature3', 'typ2', 'feature_name3',
                            st.Location('contig1', 100, 154, False))
             }, {
                 'gene1': 'gene1',
                 'gene2': 'gene2',
                 'gene3': 'gene3'
             }),
         check_size=False)
     self.assertEquals(3, network.num_edges())
     self.assertEquals(6000, network.total_score())
     self.assertEquals(123, network.weight)
Example #8
0
    def make_microbe(self):
        """returns the organism object to work on"""
        self.__make_dirs_if_needed()

        if os.path.exists(USER_KEGG_FILE_PATH):
            keggfile = util.read_dfile(USER_KEGG_FILE_PATH, comment='#')
        elif os.path.exists(SYSTEM_KEGG_FILE_PATH):
            keggfile = util.read_dfile(SYSTEM_KEGG_FILE_PATH, comment='#')
        else:
            raise Exception('KEGG file not found !!')

        if os.path.exists(USER_GO_FILE_PATH):
            gofile = util.read_dfile(USER_GO_FILE_PATH)
        elif os.path.exists(SYSTEM_GO_FILE_PATH):
            gofile = util.read_dfile(SYSTEM_GO_FILE_PATH)
        else:
            raise Exception('GO file not found !!')

        if self['rsat_dir']:
            if not self['rsat_organism']:
                raise Exception(
                    'override RSAT loading: please specify --rsat_organism')
            logging.info("using RSAT files for '%s'", self['rsat_organism'])
            rsatdb = rsat.RsatFiles(self['rsat_dir'], self['rsat_organism'],
                                    self['ncbi_code'])
        else:
            rsatdb = rsat.RsatDatabase(rsat.RSAT_BASE_URL, self['cache_dir'])

        if self['operon_file']:
            logging.info("using operon file at '%s'", self['operon_file'])
            mo_db = microbes_online.MicrobesOnlineOperonFile(
                self['operon_file'])
        else:
            logging.info(
                "attempting automatic download of operons from Microbes Online"
            )
            mo_db = microbes_online.MicrobesOnline(self['cache_dir'])

        stringfile = self['string_file']
        kegg_mapper = org.make_kegg_code_mapper(keggfile)
        rsat_mapper = org.make_rsat_organism_mapper(rsatdb)
        ncbi_code = self['ncbi_code']
        nw_factories = []

        # do we use STRING ?
        if self['donetworks'] and self['use_string']:
            # download if not provided
            if stringfile is None:
                if ncbi_code is None:
                    rsat_info = rsat_mapper(kegg_mapper(self['organism_code']),
                                            self['rsat_organism'])
                    ncbi_code = rsat_info.taxonomy_id

                logging.info("NCBI CODE IS: %s", ncbi_code)
                url = STRING_URL_PATTERN % ncbi_code
                stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
                self['string_file'] = stringfile
                logging.info("Automatically using STRING file in '%s'",
                             stringfile)
                util.get_url_cached(url, stringfile)
            else:
                logging.info("Loading STRING file at '%s'", stringfile)

            # create and add network
            nw_factories.append(
                stringdb.get_network_factory2(self['organism_code'],
                                              stringfile, 0.5))

        # do we use operons ?
        if self['donetworks'] and self['use_operons']:
            logging.info('adding operon network factory')
            nw_factories.append(
                microbes_online.get_network_factory(
                    mo_db,
                    max_operon_size=self.ratio_matrix.num_rows / 20,
                    weight=0.5))

        org_factory = org.MicrobeFactory(kegg_mapper, rsat_mapper,
                                         org.make_go_taxonomy_mapper(gofile),
                                         mo_db, nw_factories,
                                         self['ncbi_code'])
        return org_factory.create(self['organism_code'],
                                  self['search_distances'],
                                  self['scan_distances'], self['use_operons'],
                                  self['rsat_organism'], self.ratio_matrix)
Example #9
0
    def make_organism(self):
        """returns the organism object to work on"""
        self.__make_dirs_if_needed()

        if os.path.exists(USER_KEGG_FILE_PATH):
            keggfile = util.read_dfile(USER_KEGG_FILE_PATH, comment='#')
        elif os.path.exists(SYSTEM_KEGG_FILE_PATH):
            keggfile = util.read_dfile(SYSTEM_KEGG_FILE_PATH, comment='#')
        else:
            raise Exception('KEGG file not found !!')

        if os.path.exists(USER_GO_FILE_PATH):
            gofile = util.read_dfile(USER_GO_FILE_PATH)
        elif os.path.exists(SYSTEM_GO_FILE_PATH):
            gofile = util.read_dfile(SYSTEM_GO_FILE_PATH)
        else:
            raise Exception('GO file not found !!')

        if self['rsat_dir']:
            if not self['rsat_organism']:
                raise Exception('override RSAT loading: please specify --rsat_organism')
            logging.info("using RSAT files for '%s'", self['rsat_organism'])
            rsatdb = rsat.RsatFiles(self['rsat_dir'], self['rsat_organism'], self['ncbi_code'])
        else:
            rsatdb = rsat.RsatDatabase(rsat.RSAT_BASE_URL, self['cache_dir'])

        if self['operon_file']:
            logging.info("using operon file at '%s'", self['operon_file'])
            mo_db = microbes_online.MicrobesOnlineOperonFile(self['operon_file'])
        else:
            logging.info("attempting automatic download of operons from Microbes Online")
            mo_db = microbes_online.MicrobesOnline(self['cache_dir'])

        stringfile = self['string_file']
        kegg_map = util.make_dfile_map(keggfile, 1, 3)
        ncbi_code = self['ncbi_code']
        nw_factories = []
        is_microbe = self['organism_code'] not in VERTEBRATES

        # determine the final weights. note: for now, we will just check whether
        # we have 1 or 2 networks
        num_networks = 0
        if not self['nonetworks'] and self['use_string']:
            num_networks += 1
        if is_microbe and not self['nonetworks'] and self['use_operons']:
            num_networks += 1
        network_weight = 0.0
        if num_networks > 0:
            network_weight = 1.0 / num_networks
        
        
        # do we use STRING ?
        if not self['nonetworks'] and self['use_string']:
            # download if not provided
            if stringfile is None:
                if ncbi_code is None:
                    rsat_info = org.RsatSpeciesInfo(rsatdb,
                                                    kegg_map[self['organism_code']],
                                                    self['rsat_organism'], None)
                    ncbi_code = rsat_info.taxonomy_id

                logging.info("NCBI CODE IS: %s", ncbi_code)
                url = STRING_URL_PATTERN % ncbi_code
                stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
                self['string_file'] = stringfile
                logging.info("Automatically using STRING file in '%s'", stringfile)
                util.get_url_cached(url, stringfile)
            else:
                logging.info("Loading STRING file at '%s'", stringfile)

            # create and add network
            nw_factories.append(stringdb.get_network_factory(
                self['organism_code'], stringfile, network_weight))

        # do we use operons ?
        if is_microbe and not self['nonetworks'] and self['use_operons']:
            logging.debug('adding operon network factory')
            nw_factories.append(microbes_online.get_network_factory(
                mo_db, max_operon_size=self.ratios.num_rows / 20,
                weight=network_weight))

        orgcode = self['organism_code']
        logging.debug("Creating Microbe object for '%s'", orgcode)
        keggorg = kegg_map[orgcode]
        rsat_info = org.RsatSpeciesInfo(rsatdb, keggorg, self['rsat_organism'],
                                        self['ncbi_code'])
        gotax = util.make_dfile_map(gofile, 0, 1)[rsat_info.go_species()]
        synonyms = None
        if self['synonym_file'] is not None:
            synonyms = thesaurus.create_from_delimited_file2(self['synonym_file'],
                                                             self['case_sensitive'])

        if is_microbe:
            organism = org.Microbe(orgcode, keggorg, rsat_info, gotax, mo_db, nw_factories,
                                   self['search_distances'], self['scan_distances'],
                                   self['use_operons'], self.ratios, synonyms)
        else:
            organism = org.RSATOrganism(orgcode, keggorg, rsat_info, gotax, nw_factories,
                                        self['search_distances'], self['scan_distances'],
                                        self.ratios, synonyms)
        
        conn = self.__dbconn()
        with conn:
            for network in organism.networks():
                conn.execute("insert into statstypes values ('network',?)", [network.name])
            for sequence_type in self['sequence_types']:
                conn.execute("insert into statstypes values ('seqtype',?)", [sequence_type])
            
        return organism
Example #10
0
    def make_microbe(self):
        """returns the organism object to work on"""
        self.__make_dirs_if_needed()

        if os.path.exists(USER_KEGG_FILE_PATH):
            keggfile = util.read_dfile(USER_KEGG_FILE_PATH, comment='#')
        elif os.path.exists(SYSTEM_KEGG_FILE_PATH):
            keggfile = util.read_dfile(SYSTEM_KEGG_FILE_PATH, comment='#')
        else:
            raise Exception('KEGG file not found !!')

        if os.path.exists(USER_GO_FILE_PATH):
            gofile = util.read_dfile(USER_GO_FILE_PATH)
        elif os.path.exists(SYSTEM_GO_FILE_PATH):
            gofile = util.read_dfile(SYSTEM_GO_FILE_PATH)
        else:
            raise Exception('GO file not found !!')

        if self['rsat_dir']:
            if not self['rsat_organism']:
                raise Exception('override RSAT loading: please specify --rsat_organism')
            logging.info("using RSAT files for '%s'", self['rsat_organism'])
            rsatdb = rsat.RsatFiles(self['rsat_dir'], self['rsat_organism'], self['ncbi_code'])
        else:
            rsatdb = rsat.RsatDatabase(rsat.RSAT_BASE_URL, self['cache_dir'])

        if self['operon_file']:
            logging.info("using operon file at '%s'", self['operon_file'])
            mo_db = microbes_online.MicrobesOnlineOperonFile(self['operon_file'])
        else:
            logging.info("attempting automatic download of operons from Microbes Online")
            mo_db = microbes_online.MicrobesOnline(self['cache_dir'])

        stringfile = self['string_file']
        kegg_mapper = org.make_kegg_code_mapper(keggfile)
        rsat_mapper = org.make_rsat_organism_mapper(rsatdb)
        ncbi_code = self['ncbi_code']
        nw_factories = []

        # do we use STRING ?
        if self['donetworks'] and self['use_string']:
            # download if not provided
            if stringfile is None:
                if ncbi_code is None:
                    rsat_info = rsat_mapper(kegg_mapper(self['organism_code']),
                                            self['rsat_organism'])
                    ncbi_code = rsat_info.taxonomy_id

                logging.info("NCBI CODE IS: %s", ncbi_code)
                url = STRING_URL_PATTERN % ncbi_code
                stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
                self['string_file'] = stringfile
                logging.info("Automatically using STRING file in '%s'", stringfile)
                util.get_url_cached(url, stringfile)
            else:
                logging.info("Loading STRING file at '%s'", stringfile)

            # create and add network
            nw_factories.append(stringdb.get_network_factory2(
                self['organism_code'], stringfile, 0.5))

        # do we use operons ?
        if self['donetworks'] and self['use_operons']:
            logging.info('adding operon network factory')
            nw_factories.append(microbes_online.get_network_factory(
                mo_db, max_operon_size=self.ratio_matrix.num_rows / 20,
                weight=0.5))

        org_factory = org.MicrobeFactory(kegg_mapper,
                                         rsat_mapper,
                                         org.make_go_taxonomy_mapper(gofile),
                                         mo_db,
                                         nw_factories,
                                         self['ncbi_code'])
        return org_factory.create(self['organism_code'],
                                  self['search_distances'],
                                  self['scan_distances'],
                                  self['use_operons'],
                                  self['rsat_organism'],
                                  self.ratio_matrix)
Example #11
0
    def make_organism(self):
        """returns the organism object to work on"""
        self.__make_dirs_if_needed()
        ncbi_code, kegg_species = self.__get_kegg_data()

        if os.path.exists(USER_GO_FILE_PATH):
            gofile = util.read_dfile(USER_GO_FILE_PATH)
        elif os.path.exists(SYSTEM_GO_FILE_PATH):
            gofile = util.read_dfile(SYSTEM_GO_FILE_PATH)
        else:
            raise Exception('GO file not found !!')

        if self['rsat_dir']:
            if not self['rsat_organism']:
                raise Exception('override RSAT loading: please specify --rsat_organism')
            logging.info("using RSAT files for '%s'", self['rsat_organism'])
            rsatdb = rsat.RsatFiles(self['rsat_dir'], self['rsat_organism'], ncbi_code, self['rsat_features'], self['rsat_base_url'])
        else:
            rsatdb = rsat.RsatDatabase(self['rsat_base_url'], self['cache_dir'], kegg_species, ncbi_code, self['rsat_features'])

        if self['operon_file']:
            logging.info("using operon file at '%s'", self['operon_file'])
            mo_db = microbes_online.MicrobesOnlineOperonFile(self['operon_file'])
        else:
            logging.info("attempting automatic download of operons from Microbes Online")
            mo_db = microbes_online.MicrobesOnline(self['cache_dir'])

        stringfile = self['string_file']
        nw_factories = []
        is_microbe = self['organism_code'] not in VERTEBRATES

        # determine the final weights. note: for now, we will just check whether
        # we have 1 or 2 networks
        num_networks = 0
        if not self['nonetworks'] and self['use_string']:
            num_networks += 1
        if is_microbe and not self['nonetworks'] and self['use_operons']:
            num_networks += 1
        network_weight = 0.0
        if num_networks > 0:
            network_weight = 1.0 / num_networks
        
        
        # do we use STRING ?
        if not self['nonetworks'] and self['use_string']:
            # download if not provided
            if stringfile is None:
                if ncbi_code is None:
                    rsat_info = org.RsatSpeciesInfo(rsatdb, kegg_species,
                                                    self['rsat_organism'], None)
                    ncbi_code = rsat_info.taxonomy_id

                logging.info("NCBI CODE IS: %s", ncbi_code)
                url = STRING_URL_PATTERN % ncbi_code
                stringfile = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
                self['string_file'] = stringfile
                logging.info("Automatically using STRING file in '%s'", stringfile)
                util.get_url_cached(url, stringfile)
            else:
                logging.info("Loading STRING file at '%s'", stringfile)

            # create and add network
            nw_factories.append(stringdb.get_network_factory(
                self['organism_code'], stringfile, network_weight))

        # do we use operons ?
        if is_microbe and not self['nonetworks'] and self['use_operons']:
            logging.debug('adding operon network factory')
            nw_factories.append(microbes_online.get_network_factory(
                mo_db, max_operon_size=self.ratios.num_rows / 20,
                weight=network_weight))

        orgcode = self['organism_code']
        logging.debug("Creating Microbe object for '%s'", orgcode)
        rsat_info = org.RsatSpeciesInfo(rsatdb, kegg_species, self['rsat_organism'],
                                        ncbi_code)
        gotax = util.make_dfile_map(gofile, 0, 1)[rsat_info.go_species()]
        synonyms = None
        if self['synonym_file'] is not None:
            synonyms = thesaurus.create_from_delimited_file2(self['synonym_file'],
                                                             self['case_sensitive'])

        #New logic: test to see if there's a fastafile.  If not, then
        #Download it from rsat, process it, and then return the new file name
    
        is_microbe = True
        if is_microbe:
           organism = org.Microbe(orgcode, kegg_species, rsat_info, gotax, mo_db,
                                   nw_factories,
                                   self['search_distances'], self['scan_distances'],
                                   self['use_operons'], self.ratios, synonyms,
                                   self['fasta_file'])
        else:
            organism = org.RSATOrganism(orgcode, kegg_species, rsat_info, gotax,
                                        nw_factories,
                                        self['search_distances'], self['scan_distances'],
                                        self.ratios, synonyms,
                                        self['fasta_file'])
        
        conn = self.__dbconn()
        with conn:
            for network in organism.networks():
                conn.execute("insert into statstypes values ('network',?)", [network.name])
            for sequence_type in self['sequence_types']:
                conn.execute("insert into statstypes values ('seqtype',?)", [sequence_type])
            
        return organism