Example #1
0
def make_halo(search_distances, scan_distances, ratios=None):
    """Build the 'hal' (Halobacterium sp, taxonomy id 64091) organism object.

    Args:
        search_distances: passed through unchanged to ``org.Microbe``.
        scan_distances: passed through unchanged to ``org.Microbe``.
        ratios: optional object exposing ``num_rows``; when it is not None
            an operon network factory is added whose ``max_operon_size``
            is ``ratios.num_rows / 20``.

    Returns:
        The ``org.Microbe`` instance created from the KEGG, GO, RSAT and
        Microbes Online data sources configured below.
    """
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, CACHE_DIR, 'Halobacterium sp',
                               64091)
    mo_db = microbes_online.MicrobesOnline(CACHE_DIR)
    stringfile = 'testdata/string_links_64091.tab'

    nw_factories = []
    # stringfile is a hard-coded literal, so the warning branch is only
    # reached if the assignment above is changed to None.
    if stringfile is not None:
        nw_factories.append(
            stringdb.get_network_factory('hal', stringfile, 0.5))
    else:
        logging.warning("no STRING file specified !")

    if ratios is not None:
        # NOTE(review): '/' is true division on Python 3, so this passes a
        # float - confirm the factory accepts a non-integer size.
        nw_factories.append(
            microbes_online.get_network_factory(
                mo_db, max_operon_size=ratios.num_rows / 20, weight=0.5))

    # column 1 -> column 3 of the KEGG table, looked up by organism code
    keggorg = util.make_dfile_map(keggfile, 1, 3)['hal']
    rsat_organism = rsatdb.get_rsat_organism(keggorg)
    rsat_info = org.RsatSpeciesInfo(rsatdb, keggorg, rsat_organism, 64091)
    # column 0 -> column 1 of the GO table, looked up by GO species name
    gotax = util.make_dfile_map(gofile, 0, 1)[rsat_info.go_species()]
    return org.Microbe('hal', keggorg, rsat_info, gotax, mo_db, nw_factories,
                       search_distances, scan_distances, True, None)
Example #2
0
def make_halo(search_distances, scan_distances, ratios=None):
    """Create the organism object for 'hal' (Halobacterium sp, id 64091).

    Args:
        search_distances: forwarded as-is to ``org.Microbe``.
        scan_distances: forwarded as-is to ``org.Microbe``.
        ratios: optional object with a ``num_rows`` attribute; if given,
            an operon network factory with
            ``max_operon_size=ratios.num_rows / 20`` is registered.

    Returns:
        An ``org.Microbe`` built from the KEGG/GO tables, the RSAT database
        and the Microbes Online database set up in this function.
    """
    keggfile = util.read_dfile(KEGG_FILE_PATH, comment='#')
    gofile = util.read_dfile(GO_FILE_PATH)
    rsatdb = rsat.RsatDatabase(RSAT_BASE_URL, CACHE_DIR,
                               'Halobacterium sp', 64091)
    mo_db = microbes_online.MicrobesOnline(CACHE_DIR)
    stringfile = 'testdata/string_links_64091.tab'

    nw_factories = []
    # The path above is a constant, so the else branch only fires if that
    # assignment is edited to None.
    if stringfile is not None:
        nw_factories.append(stringdb.get_network_factory('hal', stringfile, 0.5))
    else:
        logging.warning("no STRING file specified !")

    if ratios is not None:
        # NOTE(review): true division gives a float under Python 3 - verify
        # that get_network_factory tolerates a non-integer max_operon_size.
        nw_factories.append(microbes_online.get_network_factory(
            mo_db, max_operon_size=ratios.num_rows / 20, weight=0.5))

    # KEGG table: map column 1 to column 3, keyed by the organism code
    keggorg = util.make_dfile_map(keggfile, 1, 3)['hal']
    rsat_organism = rsatdb.get_rsat_organism(keggorg)
    rsat_info = org.RsatSpeciesInfo(rsatdb, keggorg, rsat_organism, 64091)
    # GO table: map column 0 to column 1, keyed by the GO species name
    gotax = util.make_dfile_map(gofile, 0, 1)[rsat_info.go_species()]
    return org.Microbe('hal', keggorg, rsat_info, gotax, mo_db, nw_factories,
                       search_distances, scan_distances, True, None)
Example #3
0
 def test_get_network_factory(self):
     """Happy path: a three-gene organism yields the expected network.

     Builds the factory with max operon size 20 and weight 123, applies
     it to a mock organism with three features on 'contig1', and checks
     the resulting edge count, total score and weight.
     """
     microbes_online = MockMicrobesOnline('testdata/gnc64091.named')
     factory = mo.get_network_factory(microbes_online, 20, 123)
     features = {
         'gene1': st.Feature('feature1', 'typ1', 'feature_name1',
                             st.Location('contig1', 24, 89, False)),
         'gene2': st.Feature('feature2', 'typ1', 'feature_name2',
                             st.Location('contig1', 15, 21, False)),
         'gene3': st.Feature('feature3', 'typ2', 'feature_name3',
                             st.Location('contig1', 100, 154, False)),
     }
     # every gene maps to itself in the second dictionary
     synonyms = {'gene1': 'gene1', 'gene2': 'gene2', 'gene3': 'gene3'}
     organism = MockOrganism('64091', features, synonyms)

     network = factory(organism, check_size=False)

     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(3, network.num_edges())
     self.assertEqual(6000, network.total_score())
     self.assertEqual(123, network.weight)
Example #4
0
 def test_get_network_factory(self):
     """Happy path: the factory builds the expected network for 3 genes.

     The factory is created with max operon size 20 and weight 123 and
     called on a mock organism holding three features on 'contig1'; the
     test then verifies edge count, total score and weight.
     """
     microbes_online = MockMicrobesOnline('testdata/gnc64091.named')
     make_network = mo.get_network_factory(microbes_online, 20, 123)
     gene_features = {
         'gene1':
         st.Feature('feature1', 'typ1', 'feature_name1',
                    st.Location('contig1', 24, 89, False)),
         'gene2':
         st.Feature('feature2', 'typ1', 'feature_name2',
                    st.Location('contig1', 15, 21, False)),
         'gene3':
         st.Feature('feature3', 'typ2', 'feature_name3',
                    st.Location('contig1', 100, 154, False)),
     }
     # identity mapping: each gene name maps to itself
     gene_names = {
         'gene1': 'gene1',
         'gene2': 'gene2',
         'gene3': 'gene3',
     }
     network = make_network(
         MockOrganism('64091', gene_features, gene_names),
         check_size=False)

     # assertEqual replaces the deprecated assertEquals alias
     self.assertEqual(3, network.num_edges())
     self.assertEqual(6000, network.total_score())
     self.assertEqual(123, network.weight)
Example #5
0
    def make_organism(self):
        """Assemble and return the organism object for this run.

        Reads the GO file, selects the RSAT and operon data sources from
        the configuration, creates the enabled network factories, builds
        the organism and records its network names and the configured
        sequence types in the ``statstypes`` table.

        Raises:
            Exception: if neither GO file exists, or if ``rsat_dir`` is set
                without ``rsat_organism``.
        """
        self.__make_dirs_if_needed()
        ncbi_code, kegg_species = self.__get_kegg_data()

        # the user-level GO file is tried before the system-level one
        for go_path in (USER_GO_FILE_PATH, SYSTEM_GO_FILE_PATH):
            if os.path.exists(go_path):
                go_table = util.read_dfile(go_path)
                break
        else:
            raise Exception('GO file not found !!')

        # RSAT source: local files when a directory is configured,
        # otherwise the RSAT database
        if not self['rsat_dir']:
            rsatdb = rsat.RsatDatabase(self['rsat_base_url'],
                                       self['cache_dir'],
                                       kegg_species,
                                       ncbi_code,
                                       self['rsat_features'])
        else:
            if not self['rsat_organism']:
                raise Exception('override RSAT loading: please specify --rsat_organism')
            logging.info("using RSAT files for '%s'", self['rsat_organism'])
            rsatdb = rsat.RsatFiles(self['rsat_dir'],
                                    self['rsat_organism'],
                                    ncbi_code,
                                    self['rsat_features'],
                                    self['rsat_base_url'])

        # operon source: explicit file when given, otherwise Microbes Online
        if not self['operon_file']:
            logging.info("attempting automatic download of operons from Microbes Online")
            mo_db = microbes_online.MicrobesOnline(self['cache_dir'])
        else:
            logging.info("using operon file at '%s'", self['operon_file'])
            mo_db = microbes_online.MicrobesOnlineOperonFile(self['operon_file'])

        factories = []
        string_path = self['string_file']
        is_microbe = self['organism_code'] not in VERTEBRATES

        # decide which networks are active; the weight is split evenly
        # between them (currently at most two)
        networks_on = not self['nonetworks']
        with_string = networks_on and self['use_string']
        with_operons = is_microbe and networks_on and self['use_operons']
        active = [flag for flag in (with_string, with_operons) if flag]
        network_weight = 1.0 / len(active) if active else 0.0

        # STRING network
        if with_string:
            if string_path is not None:
                logging.info("Loading STRING file at '%s'", string_path)
            else:
                # no file provided: download it into the cache directory
                if ncbi_code is None:
                    species_info = org.RsatSpeciesInfo(
                        rsatdb, kegg_species, self['rsat_organism'], None)
                    ncbi_code = species_info.taxonomy_id

                logging.info("NCBI CODE IS: %s", ncbi_code)
                url = STRING_URL_PATTERN % ncbi_code
                string_path = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
                self['string_file'] = string_path
                logging.info("Automatically using STRING file in '%s' (URL: %s)",
                             string_path, url)
                util.get_url_cached(url, string_path)

            string_factory = stringdb.get_network_factory(
                self['organism_code'], string_path, network_weight)
            factories.append(string_factory)

        # operon network
        if with_operons:
            logging.debug('adding operon network factory')
            operon_factory = microbes_online.get_network_factory(
                mo_db,
                max_operon_size=self.ratios.num_rows / 20,
                weight=network_weight)
            factories.append(operon_factory)

        code = self['organism_code']
        logging.debug("Creating Microbe object for '%s'", code)
        rsat_info = org.RsatSpeciesInfo(rsatdb, kegg_species,
                                        self['rsat_organism'], ncbi_code)
        gotax = util.make_dfile_map(go_table, 0, 1)[rsat_info.go_species()]

        if self['synonym_file'] is None:
            synonyms = None
        else:
            synonyms = thesaurus.create_from_delimited_file2(
                self['synonym_file'], self['case_sensitive'])

        # New logic: test to see if there's a fastafile.  If not, then
        # download it from rsat, process it, and then return the new file name

        # NOTE(review): is_microbe is forced to True here, so the
        # RSATOrganism branch below is never taken - confirm this is intended.
        is_microbe = True
        if not is_microbe:
            organism = org.RSATOrganism(
                code, kegg_species, rsat_info, gotax, factories,
                self['search_distances'], self['scan_distances'],
                self.ratios, synonyms, self['fasta_file'])
        else:
            organism = org.Microbe(
                code, kegg_species, rsat_info, gotax, mo_db, factories,
                self['search_distances'], self['scan_distances'],
                self['use_operons'], self.ratios, synonyms,
                self['fasta_file'])

        # record the network names and sequence types in the database
        conn = self.__dbconn()
        with conn:
            for net in organism.networks():
                conn.execute("insert into statstypes values ('network',?)", [net.name])
            for seqtype in self['sequence_types']:
                conn.execute("insert into statstypes values ('seqtype',?)", [seqtype])

        return organism
    def make_organism(self):
        """Assemble and return the organism object for this run.

        Locates and reads the GO file, selects the RSAT and operon data
        sources from the configuration, creates the enabled network
        factories, builds the organism and records its network names and
        the configured sequence types in the ``statstypes`` table.

        Raises:
            Exception: if ``rsat_dir`` is set without ``rsat_organism``.
        """
        self.__make_dirs_if_needed()
        ncbi_code, kegg_species = self.__get_kegg_data()

        # resolve the GO file through the "cmonkey2" distribution; fall
        # back to the plain path when that distribution is not found
        try:
            requirement = Requirement.parse("cmonkey2")
            go_path = resource_filename(requirement, USER_GO_FILE_PATH)
        except DistributionNotFound:
            go_path = USER_GO_FILE_PATH

        go_table = util.read_dfile(go_path)

        # RSAT source: local files when a directory is configured,
        # otherwise the RSAT database
        if not self['rsat_dir']:
            rsatdb = rsat.RsatDatabase(self['rsat_base_url'],
                                       self['cache_dir'],
                                       kegg_species,
                                       ncbi_code,
                                       self['rsat_features'])
        else:
            if not self['rsat_organism']:
                raise Exception(
                    'override RSAT loading: please specify --rsat_organism')
            logging.info("using RSAT files for '%s'", self['rsat_organism'])
            rsatdb = rsat.RsatFiles(self['rsat_dir'],
                                    self['rsat_organism'],
                                    ncbi_code,
                                    self['rsat_features'],
                                    self['rsat_base_url'])

        # operon source: explicit file when given, otherwise Microbes Online
        if not self['operon_file']:
            logging.info(
                "attempting automatic download of operons from Microbes Online"
            )
            mo_db = microbes_online.MicrobesOnline(self['cache_dir'])
        else:
            logging.info("using operon file at '%s'", self['operon_file'])
            mo_db = microbes_online.MicrobesOnlineOperonFile(
                self['operon_file'])

        factories = []
        string_path = self['string_file']
        is_microbe = self['organism_code'] not in VERTEBRATES

        # decide which networks are active; the weight is split evenly
        # between them (currently at most two)
        networks_on = not self['nonetworks']
        with_string = networks_on and self['use_string']
        with_operons = is_microbe and networks_on and self['use_operons']
        active = [flag for flag in (with_string, with_operons) if flag]
        network_weight = 1.0 / len(active) if active else 0.0

        # STRING network
        if with_string:
            if string_path is not None:
                logging.info("Loading STRING file at '%s'", string_path)
            else:
                # no file provided: download it into the cache directory
                if ncbi_code is None:
                    species_info = org.RsatSpeciesInfo(
                        rsatdb, kegg_species, self['rsat_organism'], None)
                    ncbi_code = species_info.taxonomy_id

                logging.info("NCBI CODE IS: %s", ncbi_code)
                url = STRING_URL_PATTERN % ncbi_code
                string_path = "%s/%s.gz" % (self['cache_dir'], ncbi_code)
                self['string_file'] = string_path
                logging.info(
                    "Automatically using STRING file in '%s' (URL: %s)",
                    string_path, url)
                util.get_url_cached(url, string_path)

            string_factory = stringdb.get_network_factory(
                self['organism_code'], string_path, network_weight)
            factories.append(string_factory)

        # operon network
        if with_operons:
            logging.debug('adding operon network factory')
            operon_factory = microbes_online.get_network_factory(
                mo_db,
                max_operon_size=self.ratios.num_rows / 20,
                weight=network_weight)
            factories.append(operon_factory)

        code = self['organism_code']
        logging.debug("Creating Microbe object for '%s'", code)
        rsat_info = org.RsatSpeciesInfo(rsatdb, kegg_species,
                                        self['rsat_organism'], ncbi_code)
        gotax = util.make_dfile_map(go_table, 0, 1)[rsat_info.go_species()]

        if self['synonym_file'] is None:
            synonyms = None
        else:
            synonyms = thesaurus.create_from_delimited_file2(
                self['synonym_file'], self['case_sensitive'])

        # New logic: test to see if there's a fastafile.  If not, then
        # download it from rsat, process it, and then return the new file name

        # NOTE(review): is_microbe is forced to True here, so the
        # RSATOrganism branch below is never taken - confirm this is intended.
        is_microbe = True
        if not is_microbe:
            organism = org.RSATOrganism(
                code, kegg_species, rsat_info, gotax, factories,
                self['search_distances'], self['scan_distances'],
                self.ratios, synonyms, self['fasta_file'])
        else:
            organism = org.Microbe(
                code, kegg_species, rsat_info, gotax, mo_db, factories,
                self['search_distances'], self['scan_distances'],
                self['use_operons'], self.ratios, synonyms,
                self['fasta_file'])

        # record the network names and sequence types in the database
        conn = self.__dbconn()
        with conn:
            for net in organism.networks():
                conn.execute("insert into statstypes values ('network',?)",
                             [net.name])
            for seqtype in self['sequence_types']:
                conn.execute("insert into statstypes values ('seqtype',?)",
                             [seqtype])

        return organism