def get_db(self, dataset):
    if dataset in self.baseline_dbs:
        return self.baseline_dbs[dataset]
    elif dataset in self.baseline_db_files:
        self._load_baseline_db(dataset)
        return self.baseline_dbs[dataset]
    else:
        Logger.log(f"could not find baseline db for dataset: {dataset}", "ERROR")
def _process_response(self, response):
    if response.status_code == 200:
        return True
    elif response.status_code == 404:
        Logger.log("could not reach server", "ERROR")
        return False
    else:
        Logger.log("unknown error", "ERROR")
        return False
def _ask(self):
    if self.grid_created is False:
        self._create_grid()
    param = self.samples.pop(0)
    if len(self.samples) == 0:
        message = ('Last parameter being provided - there will not be any more '
                   'available samples in the grid.')
        Logger.log(message, 'INFO')
    return ParameterVector(array=param, param_space=self.param_space)
def maxima(self):
    message = 'DiscreteMichalewicz has an infinite number of maxima'
    Logger.log(message, 'INFO')
    # some maxima
    maxima = []
    params = product([0, 1], repeat=self.param_dim)
    for param in params:
        param = list(param)
        value = self._run(param)
        maxima.append({'params': param, 'value': value})
    return maxima
def list(self): Logger.log("connecting to server", "INFO") url = f"{self.URL}/list_datasets" print("URL", url) response = requests.post(url, data={}) if response.status_code == 200: datasets = response.json()["datasets"] return sorted(datasets) else: return self._process_response(response) return self._process_response(response)
def check_module(module_name, message, **kwargs):
    try:
        _ = __import__(module_name)
    except ModuleNotFoundError:
        from olympus import Logger
        # parse the name of the missing module out of the traceback
        error = traceback.format_exc()
        for line in error.split('\n'):
            if 'ModuleNotFoundError' in line:
                module = line.strip().strip("'").split("'")[-1]
        kwargs.update(locals())
        message = f'{message}'.format(**kwargs)
        Logger.log(message, 'ERROR')
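# Usage sketch (hedged): guarding an optional dependency. The module name and
# message template below are illustrative; "{module}" is filled from the parsed
# traceback, and any extra keyword arguments are available as placeholders too.
check_module(
    'pyDOE',
    'The planner "{planner}" requires "{module}", which could not be found. '
    'Please install "{module}" to use this planner.',
    planner='LatinHypercube',
)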
def _validate_args(transformations):
    # check validity of transformation argument
    for transformation in transformations:
        if not (hasattr(DataTransformer, f'_forward_{transformation}')
                and hasattr(DataTransformer, f'_backward_{transformation}')):
            raise NotImplementedError(
                f'transformation {transformation} not implemented. Please select one of the '
                f'available transformations.')
    if 'periodic' in transformations and transformations.index('periodic') != 0:
        message = 'periodic transform is allowed only as the first transformation'
        Logger.log(message, 'ERROR')
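# Sketch (hedged) of what the validation above enforces: each requested
# transformation needs matching _forward_/_backward_ methods on DataTransformer,
# and 'periodic' is only allowed in first position. 'standardize' is assumed to
# be an available transformation.
_validate_args(transformations=['periodic', 'standardize'])  # OK
_validate_args(transformations=['standardize', 'periodic'])  # logs an ERROR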
def __init__(self, planner, emulator=None, surface=None, campaign=Campaign(), database=None):
    """The Evaluator performs higher-level operations that Planners and Emulators
    do not handle on their own: communicating parameters and measurements between
    each other, keeping track of them to ensure they match, and storing them in a
    Campaign object. All of this can also be done by the user with planner,
    emulator and campaign objects directly, which allows more customization, but
    the Evaluator provides a convenient higher-level interface for common
    optimization tasks.

    Args:
        planner (Planner): an instance of a Planner.
        emulator (Emulator): an instance of a trained Emulator.
        surface (Surface): an instance of a Surface.
        campaign (Campaign): an instance of a Campaign. By default, a new
            Campaign instance is created. If this is set to None, no campaign
            info will be stored.
        database (object): ...
    """
    Object.__init__(**locals())

    if emulator is not None:
        assert surface is None
        self.emulator_type = 'numerical'
    elif surface is not None:
        assert emulator is None
        self.emulator_type = 'analytic'
        self.emulator = surface
    else:
        Logger.log('One of emulator or surface needs to be provided', 'FATAL')

    # provide the planner with the parameter space.
    # NOTE: right now, outside of Evaluator, the param_space for a planner
    # needs to be set "manually" by the user
    self.planner.set_param_space(self.emulator.param_space)  # param space in emulator as it originates from dataset

    if self.campaign is not None:
        self.campaign.set_planner_specs(planner)
        self.campaign.set_emulator_specs(emulator)
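# Usage sketch (hedged): how an Evaluator is typically constructed and run.
# Assumes the top-level Planner/Surface factories and an Evaluator.optimize
# method accepting num_iter; exact names may differ between Olympus versions.
from olympus import Evaluator, Planner, Surface

planner = Planner('RandomSearch')        # any available planner kind
surface = Surface('Dejong')              # analytic benchmark surface
evaluator = Evaluator(planner=planner, surface=surface)
evaluator.optimize(num_iter=10)          # run a short campaign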
def _validate_dataset_args(kind, data, columns, target_names):
    if kind is not None:
        # -----------------------------------
        # check that a correct name is passed
        # -----------------------------------
        # TODO: reduce redundant code by importing the list from where we have it already
        module_path = os.path.dirname(os.path.abspath(__file__))
        olympus_datasets = []
        for dir_name in glob(f"{module_path}/dataset_*"):
            dir_name = dir_name.split("/")[-1][8:]
            olympus_datasets.append(dir_name)
        if kind not in olympus_datasets:
            message = ("Could not find dataset `{0}`. Please choose from one of the available "
                       "datasets: {1}.".format(kind, ", ".join(list(olympus_datasets))))
            Logger.log(message, "FATAL")
        # --------------------------------------------------------------
        # we will discard these arguments, so check if they are provided
        # --------------------------------------------------------------
        if data is not None:
            message = ("One of the Olympus datasets has been loaded via the argument `kind`, "
                       "argument `data` will be discarded")
            Logger.log(message, "WARNING")
        if columns is not None:
            message = ("One of the Olympus datasets has been loaded via the argument `kind`, "
                       "argument `columns` will be discarded")
            Logger.log(message, "WARNING")
        if target_names is not None:
            message = ("One of the Olympus datasets has been loaded via the argument `kind`, "
                       "argument `target_names` will be discarded")
            Logger.log(message, "WARNING")
def _create_optimizer(self):
    from pyDOE import lhs
    if self.budget is None:
        message = (f'Please provide a number of samples for this planner. Since no number of '
                   f'samples was provided, falling back to setting `budget` to {len(self.param_space)}')
        Logger.log(message, 'WARNING')
        self.budget = len(self.param_space)
    # draw a Latin hypercube in the unit cube, then rescale each dimension to
    # the bounds of the corresponding parameter
    self.samples = lhs(len(self.param_space), samples=self.budget)
    for index, param in enumerate(self.param_space):
        self.samples[:, index] = (param.high - param.low) * self.samples[:, index] + param.low
    self.samples = list(self.samples)
    self.has_optimizer = True
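# Minimal sketch (hedged) of the Latin-hypercube rescaling used above: pyDOE's
# lhs returns samples in the unit cube, which are mapped linearly onto
# [low, high] per dimension. The bounds below are illustrative.
import numpy as np
from pyDOE import lhs

lows, highs = np.array([0.0, -1.0]), np.array([1.0, 1.0])
unit = lhs(2, samples=5)               # 5 samples in the 2d unit cube
scaled = (highs - lows) * unit + lows  # same affine map as in _create_optimizer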
def _validate_planner_kind(kind):
    # if we received a string
    if type(kind) == str:
        from . import PlannerLoader
        kind = PlannerLoader.file_to_class(kind)
        avail_planners = get_planners_list()
        if kind not in avail_planners:
            message = ('Planner "{0}" not available in Olympus. Please choose '
                       "from one of the available planners: {1}".format(kind, ", ".join(avail_planners)))
            Logger.log(message, "FATAL")
    # if we get an instance of a planner class
    elif isinstance(kind, AbstractPlanner):
        # make sure it has the necessary methods
        for method in ["_set_param_space", "_tell", "_ask"]:
            implementation = getattr(kind, method, None)
            if not callable(implementation):
                message = f'The object {kind} does not implement the necessary method "{method}"'
                Logger.log(message, "FATAL")
    # if we received a custom planner class
    # NOTE: the isinstance(kind, type) guard avoids a TypeError from issubclass
    # when `kind` is neither a string, an instance, nor a class
    elif isinstance(kind, type) and issubclass(kind, AbstractPlanner):
        # make sure it has the necessary methods
        for method in ["_set_param_space", "_tell", "_ask"]:
            implementation = getattr(kind, method, None)
            if not callable(implementation):
                message = f'The object {kind} does not implement the necessary method "{method}"'
                Logger.log(message, "FATAL")
    # if we do not know what was passed, raise an error
    else:
        message = 'Could not initialize Planner: the argument "kind" is neither a string nor an AbstractPlanner subclass'
        Logger.log(message, "FATAL")
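# Sketch (hedged): the minimal interface a custom planner must implement to
# pass the validation above; the method bodies are illustrative stubs.
class MyPlanner(AbstractPlanner):

    def _set_param_space(self, param_space):
        self.param_space = param_space

    def _tell(self, observations):
        self._observations = observations

    def _ask(self):
        # a real planner would return a ParameterVector here
        return None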
def run(self, params, return_paramvector=False):
    """Evaluate the surface at the chosen location.

    Args:
        params (array): set of input parameters for which to return the function value.
        return_paramvector (bool): whether to return a ``ParameterVector`` object
            instead of a list of lists. Default is False.

    Returns:
        values (ParameterVector): function values evaluated at the chosen locations.
    """
    if isinstance(params, (float, int)):
        params = np.array([params])
    elif type(params) == list:
        params = np.array(params)
    if len(params.shape) == 1:
        params = np.expand_dims(params, axis=0)

    # TODO: these validations could be moved to ParameterSpace class
    # validate params
    if params.shape[1] != len(self.param_space):
        message = (f'Dimensions of provided params ({params.shape[1]}) do not match expected '
                   f'dimension ({len(self.param_space)})')
        Logger.log(message, 'ERROR')

    # this raises warnings for out-of-bounds parameters
    for param_set in params:
        self.param_space.validate(param_set)

    # get values from the surface class
    y_preds = [[self._run(param_set)] for param_set in params]  # 2d array

    # if we are not asking for a ParamVector, we can just return y_preds
    if return_paramvector is False:
        return y_preds

    # return a ParameterVector
    # NOTE: while we do not allow batches or multiple objectives yet, this code
    # is supposed to be able to support those
    y_pred_objects = []  # list of ParamVectors with all samples and objectives
    # iterate over all samples (if we returned a batch of predictions)
    for y_pred in y_preds:
        y_pred_object = ParameterVector()
        # iterate over all objectives/targets
        for target_name, y in zip(['target_0'], y_pred):
            y_pred_object.from_dict({target_name: y})
        # append object to list
        y_pred_objects.append(y_pred_object)
    return y_pred_objects
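# Usage sketch (hedged): evaluating a 2d surface at one and several points;
# 'Dejong' is an illustrative surface kind.
from olympus import Surface

surface = Surface(kind='Dejong')
values = surface.run([[0.1, 0.2], [0.5, 0.5]])               # -> [[y0], [y1]]
vectors = surface.run([0.1, 0.2], return_paramvector=True)   # -> [ParameterVector]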
def _validate_noise_kind(kind):
    # if we received a string
    if type(kind) == str:
        avail_noises = get_noises_list()
        if kind not in avail_noises:
            message = ('Noise "{0}" not available in Olympus. Please choose '
                       'from one of the available noise objects: {1}'.format(kind, ', '.join(avail_noises)))
            Logger.log(message, 'FATAL')
    # if we get an instance of a noise class
    elif isinstance(kind, AbstractNoise):
        # make sure it has the necessary methods
        for method in ['_add_noise']:
            implementation = getattr(kind, method, None)
            if not callable(implementation):
                message = f'The object {kind} does not implement the necessary method "{method}"'
                Logger.log(message, 'FATAL')
    # if we received a custom noise class
    elif isinstance(kind, type) and issubclass(kind, AbstractNoise):
        # make sure it has the necessary methods
        for method in ['_add_noise']:
            implementation = getattr(kind, method, None)
            if not callable(implementation):
                message = f'The object {kind} does not implement the necessary method "{method}"'
                Logger.log(message, 'FATAL')
    # if we do not know what was passed, raise an error
    else:
        message = 'Could not initialize Noise: the argument "kind" is neither a string nor an AbstractNoise subclass'
        Logger.log(message, 'FATAL')
def list(self): Logger.log("connecting to github", "INFO") tmp_file = "remote_folders" remote_datasets = [] subprocess.call(f"svn ls -R {self.URL} > {tmp_file}", shell=True) with open(tmp_file, "r") as content: for line in content: dataset_name = line.split("/")[0] if not dataset_name in remote_datasets: remote_datasets.append(dataset_name) os.remove(tmp_file) remote_datasets = [ remote_dataset[8:] for remote_dataset in remote_datasets ] return sorted(remote_datasets)
def train(self, data):
    """Computes the statistics (e.g. mean and standard deviation) needed for the
    chosen transformation from the provided dataset. With the exception of the
    'identity' transform, the DataTransformer always needs to be trained before
    the `transform` and `back_transform` methods can be used.

    Args:
        data (array, Dataset): the data used to compute the statistics needed for
            the transformation. This can be a 2d numpy array, or a Dataset object.
    """
    self._dims = None  # reset _dims if we retrain the DataTransformer

    # for splitting periodic variables we need a dataset, so that we can check
    # which variables are periodic and what their lower/upper bounds are
    if 'periodic' in self.transformations:
        if isinstance(data, Dataset) is False:
            message = ('in order to transform periodic variables you need to provide a '
                       'Dataset object as the data argument')
            Logger.log(message, 'ERROR')
        # remember the input dimensions
        self._dims = np.shape(data.data)[1]
        # extract the info about periodic variables
        self._parse_dataset_for_periodic(data)
        # Now swap dataset for data after the periodic transform. This is done just
        # in case the periodic transform is composed with other transformations that
        # will then require operating on a higher dimensional array: the means,
        # stddev etc. statistics will need to have matching dimensions
        data = self._forward_periodic(data.data.to_numpy())
    else:
        # allow passing a dataset
        if isinstance(data, Dataset) is True:
            data = data.data.to_numpy()
        self._validate_data(data)
        # remember the input dimensions
        self._dims = np.shape(data)[1]

    # ------------------------
    # Get stats about the data
    # ------------------------
    data = np.array(data)
    self._mean = np.mean(data, axis=0)
    self._stddev = np.std(data, axis=0)
    self._min = np.amin(data, axis=0)
    self._max = np.amax(data, axis=0)
    self.trained = True
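# Usage sketch (hedged): training on a plain 2d array and round-tripping it;
# assumes the DataTransformer constructor accepts a list of transformation
# names and that 'standardize' is among them.
import numpy as np

data = np.random.uniform(size=(100, 3))
transformer = DataTransformer(transformations=['standardize'])
transformer.train(data)
scaled = transformer.transform(data)
recovered = transformer.back_transform(scaled)  # approximately equal to data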
def add(self, param):
    """Adds a parameter, or a list of parameters, to the parameter space.

    Args:
        param: an ObjectParameter instance, or a list of ObjectParameter instances.
    """
    if isinstance(param, ObjectParameter):
        self._add_param(param)
    elif isinstance(param, list):
        for _param in param:
            self._add_param(_param)
    else:
        Logger.log('Please provide a valid parameter', 'ERROR')
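# Usage sketch (hedged): populating a parameter space one parameter at a time
# or with a list; the Parameter keyword names below are illustrative.
param_space = ParameterSpace()
param_space.add(Parameter(name='temperature', low=20.0, high=90.0))
param_space.add([Parameter(name='x0'), Parameter(name='x1')])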
def get_param(self, name):
    """
    Args:
        name: name of the parameter to retrieve.

    Returns:
        the matching parameter, or None if no parameter with that name exists.
    """
    for param in self.parameters:
        if param['name'] == name:
            return param
    message = 'Could not find Parameter with name {0} in {1}'.format(name, str(self))
    Logger.log(message, 'WARNING')
    return None
def _guess_db_kind(self, file_name):
    from . import databases
    file_type = file_name.split(".")[-1]
    # find the first registered database that accepts this file type; the
    # for/else runs only if no break occurred, i.e. no match was found
    for db_kind, db in databases.items():
        if db.is_valid_file_type(file_type):
            break
    else:
        from . import db_types
        Logger.log(
            "Could not find database type {}. Please choose from {}".format(file_type, db_types),
            "ERROR",
        )
        return None
    return db_kind
def add_db(self, kind, *args, **kwargs):
    try:
        database = __import__(f"olympus.databases.database_{kind}", fromlist=[f"Wrapper_{kind}"])
    except ModuleNotFoundError:
        # fall back to the pickle database if the requested backend is missing
        Logger.log(" ... proceeding with pickle database", "INFO", only_once=True)
        kind = "pickle"
        database = __import__(f"olympus.databases.database_{kind}", fromlist=[f"Wrapper_{kind}"])
    database = getattr(database, f"Wrapper_{kind}")
    db = database(*args, **kwargs)
    self.dbs[db.name] = db
    if self.active_db is None:
        self.active_db = db
def _get_dataset(dataset_name):
    # check if the dataset already exists
    expected_files = ["dataset.zip", "data.csv", "description.txt"]
    exists = False
    for expected_file in expected_files:
        exists = exists or os.path.isfile(f"{__home__}/datasets/dataset_{dataset_name}/{expected_file}")
    if exists:
        Logger.log("The dataset already exists", "INFO")
        return
    # download the dataset, checking github first
    for Connector in [ConnectorGithub, ConnectorServer]:
        success = Connector().get_dataset(dataset_name)
        if success:
            break
def __init__(self, kind='continuous', **kwargs):
    if kind in self.KINDS:
        self.kind = kind
        # copy the kind-specific ATT_* attributes onto this instance
        for prop in dir(self.KINDS[kind]):
            if prop.startswith('ATT_'):
                setattr(self, prop, getattr(self.KINDS[kind], prop))
        self.KINDS[kind].__init__(self)
        # only accept kwargs that correspond to a known attribute of this kind
        for key, value in kwargs.items():
            if 'ATT_{}'.format(key.upper()) in dir(self):
                self.add(key, value)
        if not self.KINDS[kind]._validate(self):
            message = 'Could not validate {}'.format(str(self))
            Logger.log(message, 'WARNING')
    else:
        message = ('Could not initialize parameter. Parameter kind {} is unknown. '
                   'Please choose from {}'.format(kind, ','.join(list(self.KINDS.keys()))))
        Logger.log(message, 'ERROR')
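# Usage sketch (hedged): the kwargs accepted depend on the ATT_* attributes of
# the chosen kind; a continuous parameter with bounds is shown, and an unknown
# kind only logs an ERROR. Keyword names are assumptions.
p = Parameter(kind='continuous', name='temperature', low=20.0, high=90.0)
bad = Parameter(kind='imaginary')  # logs "Parameter kind imaginary is unknown..."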
def _register_dbs(self, datasets_):
    # only register complete baselines
    datasets = datasets_.copy()
    self.baseline_db_files = {}
    self.baseline_dbs = {}
    for db_file in glob.glob(f"{__home__}/db_baseline_*sqlite"):
        # extract the dataset name from "db_baseline_<dataset>.sqlite"
        dataset = db_file.split("/")[-1].split(".")[0][12:]
        self.baseline_db_files[dataset] = db_file
        if dataset in datasets:
            datasets.remove(dataset)
        else:
            Logger.log(f"found complete baseline for unreported dataset: {dataset}", "WARNING")
    for dataset in datasets:
        Logger.log(f"could not find complete baseline for dataset: {dataset}", "WARNING")
def __init__(self, from_dict=None, from_json=None, name='CustomConfig'):
    """
    Args:
        from_dict (dict): dictionary from which to populate the Config.
        from_json (str): path to a JSON file from which to populate the Config.
        name (str): name of the Config.
    """
    super(Config, self).__init__(me=name)
    self.name = name
    if from_dict is not None:
        self.from_dict(info_dict=from_dict)
    elif from_json is not None:
        self.from_json(json_file=from_json)
    if from_dict is not None and from_json is not None:
        message = ('you have passed both "from_dict" and "from_json" arguments to Config: '
                   '"from_json" will be discarded')
        Logger.log(message, 'WARNING')
def _generate_first_population(self):
    if self.verbose is True:
        Logger.log(f'Creating first population of size {self.pop_size}', 'INFO')
    # structure initializers
    bounds = [param['domain'] for param in self._param_space]
    self.toolbox.register("individual", self.initIndividual, icls=creator.Individual, bounds=bounds)
    self.toolbox.register("population", tools.initRepeat, list, self.toolbox.individual)
    self.pop = self.toolbox.population(n=self.pop_size)
    self.latest_pop_size = self.pop_size
    # delete creator classes
    del creator.Individual
    del creator.FitnessMin
def _load_summaries(self, datasets_):
    datasets = datasets_.copy()
    self.baseline_summaries = {}
    # nothing to do if the summary is not available
    if not os.path.isfile(self.summary_file):
        return
    # load summary
    with open(self.summary_file, "rb") as content:
        baseline_summaries = pickle.load(content)
    for dataset, summary in baseline_summaries.items():
        if dataset in datasets:
            self.baseline_summaries[dataset] = summary
            datasets.remove(dataset)
        else:
            Logger.log(f"found summary for unreported dataset: {dataset}", "WARNING")
    for dataset in datasets:
        Logger.log(f"could not find summary for dataset: {dataset}", "WARNING")
def _set_dataset(self, dataset):
    """Registers a dataset for the emulator.

    Args:
        dataset (str): name of available dataset, or Dataset object.
    """
    if type(dataset) == str:
        self.dataset = Dataset(kind=dataset)
    elif isinstance(dataset, Dataset):
        self.dataset = dataset
    else:
        raise NotImplementedError
    # check that the param_space is defined
    if self.dataset.param_space is None:
        message = ("The param_space information is not present in the Dataset object provided. "
                   "Please use Dataset.set_param_space to define the type of variables present in "
                   "the dataset before instantiating the Emulator.")
        Logger.log(message, "ERROR")
def validate(self, param_vector):
    """Checks that all entries of param_vector fall within the parameter bounds.

    Args:
        param_vector: array of parameter values to validate.

    Returns:
        bool: whether all parameters are within bounds.
    """
    bounds = self.param_bounds
    valid = True
    for i, bound in enumerate(bounds):
        entry = param_vector[i]
        valid = valid and bound[0] <= entry
        valid = valid and entry <= bound[1]
    if valid is False:
        message = f'Not all parameters of {param_vector} are within bounds!'
        Logger.log(message, 'WARNING')
    return valid
def get(self, dataset, kind="summary"): """ Retrieves baseline for a given dataset Args: dataset (str): name of the dataset for which baseline should be retrieved kind (str): indicates format of baseline; choose from "summary", "db", or "campaigns" Returns: requested baseline """ if kind == "summary": return self.get_summary(dataset) elif kind == "db": return self.get_db(dataset) elif kind == "campaigns": return self.get_campaigns(dataset) else: Logger.log( f'could not understand kind: "{kind}". Please choose from "summary", "db", or "campaigns"', "ERROR", )
def load_dataset(kind):
    """Loads a dataset from the Olympus dataset library.

    Args:
        kind (str): kind of Olympus dataset to load.

    Returns:
        (tuple): tuple containing:
            data (array): numpy array where each row is a sample and each column is a
                feature/target. The columns are sorted such that features come first
                and then targets.
            config (dict): dict containing information on the features and targets
                present in data.
            description (str): string describing the dataset.
    """
    _validate_dataset_args(kind=kind, data=None, columns=None, target_names=None)
    datasets_path = os.path.dirname(os.path.abspath(__file__))

    # load description
    with open(f"{datasets_path}/dataset_{kind}/description.txt") as txtfile:
        description = txtfile.read()

    # load info on features/targets
    with open(f"{datasets_path}/dataset_{kind}/config.json", "r") as content:
        config = json.loads(content.read())

    # load data
    csv_file = f"{datasets_path}/dataset_{kind}/data.csv"
    try:
        data = read_csv(csv_file, header=None).to_numpy()
    except FileNotFoundError:
        Logger.log(f"Could not find data.csv for dataset {kind}", "FATAL")

    return data, config, description
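# Usage sketch (hedged): loading one of the shipped datasets; 'hplc' is an
# illustrative dataset name.
data, config, description = load_dataset('hplc')
print(description)
print(config.keys())
print(data.shape)  # (n_samples, n_features + n_targets)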
def _set_param_space(self, param_space):
    self.param_space = param_space
    self.dims = len(self.param_space)
    self.bounds = self.param_space.param_bounds
    # if budget provided, define levels
    if self.budget is not None:
        self._get_approximate_levels()
    # allow providing a list of levels to tune the budget
    if isinstance(self.levels, int):
        self._levels = [self.levels] * self.dims
    elif isinstance(self.levels, (list, np.ndarray)):
        if len(self.levels) != self.dims:
            message = (f'The number of levels provided ({len(self.levels)}) does not match the '
                       f'dimensionality of the parameter space ({self.dims}).')
            Logger.log(message, 'ERROR')
        self._levels = list(self.levels)
    else:
        raise ValueError('Argument `levels` can only be an integer or a list.')
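# Sketch (hedged) of how `levels` shapes the resulting grid: an integer gives
# every dimension the same number of levels, while a list sets them per
# dimension. The planner class name `Grid` is an assumption.
planner = Grid(levels=3)              # 3 levels per dimension
planner.set_param_space(param_space)  # 2d space -> 3x3 = 9 grid points

planner = Grid(levels=[3, 5])         # per-dimension levels -> a 3x5 grid
planner.set_param_space(param_space)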