def __init__(self, *args, **kwargs):
    super(OptTask, self).__init__(*args, **kwargs)

    # Configuration attrs
    lp = self.get("launchpad", LaunchPad.auto_load())
    if isinstance(lp, LaunchPad):
        lp = lp.to_dict()
    self.lpad = LaunchPad.from_dict(lp)
    self.opt_label = self.get("opt_label", "opt_default")
    self.c = getattr(self.lpad.db, self.opt_label)
    self.config = self.c.find_one({"doctype": "config"})
    if self.config is None:
        raise NotConfiguredError(
            "Please use MissionControl().configure to configure the "
            "optimization database ({} - {}) before running "
            "OptTask.".format(self.lpad.db, self.opt_label))
    self.wf_creator = deserialize(self.config["wf_creator"])
    self.x_dims = self.config["dimensions"]
    self._xdim_types = self.config["dim_types"]
    self.is_discrete_all = self.config["is_discrete_all"]
    self.is_discrete_any = self.config["is_discrete_any"]
    self.wf_creator_args = self.config["wf_creator_args"] or []
    self.wf_creator_kwargs = self.config["wf_creator_kwargs"] or {}
    self.predictor = self.config["predictor"]
    self.predictor_args = self.config["predictor_args"] or []
    self.predictor_kwargs = self.config["predictor_kwargs"] or {}
    self.maximize = self.config["maximize"]
    self.n_search_pts = self.config["n_search_pts"]
    self.n_train_pts = self.config["n_train_pts"]
    self.n_bootstraps = self.config["n_bootstraps"]
    self.acq = self.config["acq"]
    self.space_file = self.config["space_file"]
    self.onehot_categorical = self.config["onehot_categorical"]
    self.duplicate_check = self.config["duplicate_check"]
    self.get_z = self.config["get_z"]
    if self.get_z:
        self.get_z = deserialize(self.config["get_z"])
    else:
        self.get_z = lambda *ars, **kws: []
    self.get_z_args = self.config["get_z_args"] or []
    self.get_z_kwargs = self.config["get_z_kwargs"] or {}
    self.z_file = self.config["z_file"]
    self.enforce_sequential = self.config["enforce_sequential"]
    self.tolerances = self.config["tolerances"]
    self.batch_size = self.config["batch_size"]
    self.timeout = self.config["timeout"]

    # Declared attrs
    self.n_objs = None
    plist = [RandomForestRegressor, GaussianProcessRegressor,
             ExtraTreesRegressor, GradientBoostingRegressor]
    self.builtin_predictors = {p.__name__: p for p in plist}
    self._n_cats = 0
    self._encoding_info = []

    # Query formats
    self._completed = {"x": {"$exists": 1},
                       "y": {"$exists": 1, "$ne": "reserved"},
                       "z": {"$exists": 1}}
    self._manager = {"lock": {"$exists": 1},
                     "queue": {"$exists": 1}}
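Below is a minimal sketch of the kind of wf_creator this __init__ deserializes from the config document. The objective function path "my_pkg.my_module.obj_func", the firework names, and the two-Firework layout are illustrative assumptions (modeled on the script generated by auto_setup further down), not rocketsled's canonical implementation.

from fireworks import Firework, PyTask, Workflow

from rocketsled import OptTask


def wf_creator(x):
    # OptTask reads the current guess from '_x' in the spec; the objective
    # task must write the objective value to '_y' before the OptTask
    # firework runs (PyTask's `outputs` pushes the return value downstream).
    spec = {"_x": x}
    obj_task = PyTask(func="my_pkg.my_module.obj_func", args=[x],
                      outputs=["_y"])
    # launchpad defaults to LaunchPad.auto_load() inside OptTask.__init__
    opt_task = OptTask(opt_label="opt_default")
    fw_obj = Firework([obj_task], spec=spec, name="ObjectiveFW")
    fw_opt = Firework([opt_task], spec=spec, name="RocketsledFW")
    return Workflow([fw_obj, fw_opt], {fw_obj: [fw_opt], fw_opt: []},
                    name="rocketsled example workflow")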
def auto_setup(func, dimensions, wfname=None, launch_ready=False, **kwargs):
    """
    Automatically set up a FireWorks-based optimization loop with OptTask
    with your own function.

    The loop is set up as a script, which is written to the
    rocketsled/auto_sleds directory. All you need to do is set up your
    FireWorks launchpad and run the script created, to get started.

    Make sure to pass in necessary launchpad data to OptTask through kwargs
    of this function!

    Args:
        func (function object): A function object accepting a single
            positional argument, x, a vector of ints/floats/strs and
            returning a single scalar, y.
        wfname (str): The base name you want for the workflow.
        dimensions (list): A list of dimensions constraining each of the
            variables in x. Each 2-tuple in the list defines one dimension
            in the search space in (low, high) format. For categorical
            dimensions, includes all possible categories as a list.
            Example: dimensions = [(1,100), (9.293, 18.2838),
            ("red", "blue", "green")]
        launch_ready (bool): If True, the created document can be executed
            immediately.
        kwargs: Arguments to be passed as options to OptTask. Valid arguments
            to be passed are any valid args for OptTask. For example, lpad,
            host, port, name, opt_label, acq, predictor, etc...
    """
    # Determine the name and directory
    dir = os.path.dirname(os.path.abspath(__file__)) + '/auto_sleds'
    time_now = datetime.datetime.utcnow().strftime(FW_BLOCK_FORMAT)

    if not os.path.exists(dir):
        os.mkdir(dir)
    if not os.path.exists(dir + "/__init__.py"):
        with open(dir + "/__init__.py", "w") as ipy:
            ipy.write('"""\n This file has been autocreated by '
                      'auto_setup.py\n"""')
    if wfname:
        if "/" in wfname or " " in wfname:
            raise ValueError("Please do not use ' ' or '/' in the wf name.")
    else:
        wfname = "autosled_" + time_now

    filename = dir + "/" + wfname + ".py"
    if os.path.exists(filename):
        warnings.warn("That workflow file has already been created! "
                      "Appending the current datetime to the filename.")
        filename = dir + "/" + wfname + "_" + time_now + ".py"
        wfname += "_" + time_now

    wfc = "rocketsled.auto_sleds." + wfname + ".wf_creator"
    kwargs['wf_creator'] = wfc
    kwargs['dimensions'] = dimensions

    if func is not None:
        if not hasattr(func, '__call__'):
            raise TypeError("func must be a callable function.")

        rawfunc = os.path.abspath(sys.modules.get(func.__module__).__file__)
        if rawfunc.endswith(".pyc"):
            rawfunc = rawfunc[:-3] + func.__name__
        else:
            rawfunc = rawfunc[:-2] + func.__name__

        # See if import will work
        try:
            deserialize(rawfunc)
        except AttributeError:
            warnings.warn("Import attempt failed! File will still be "
                          "written, but if launching does not work, make "
                          "sure the function exists, is named properly, and "
                          "the string argument of the function's location is "
                          "/the/full/path/to/my/module.myfunc")

        funcpath = "rocketsled.auto_sleds." + wfname + ".f"

        # For the name of the workflow, prevent fws name indexing from
        # causing bson errors with large dims; also prevents webgui ugliness
        # with large dims
        pointstr = " @ ' + str(x)" if len(dimensions) < 5 else "'"

        with open(filename, 'w') as f:
            try:
                f.write("from __future__ import unicode_literals\n")
                f.write('"""\n')
                f.write("This is an automatically created script from "
                        "auto_setup.\n"
                        "If you are not comfortable working with FireWorks "
                        "and PyTask, do NOT move this\nfile out of this "
                        "directory or rename it if you want to run this "
                        "workflow!\n\nIf you are comfortable working with "
                        "FireWorks and PyTask, feel free to edit\nand/or "
                        "move this file to suit your needs. See the OptTask "
                        "documentation and the\nexamples for more "
                        "information on setting up workflow creators.\n")
                f.write('"""\n')
                f.write("from fireworks import PyTask, Firework, Workflow, "
                        "LaunchPad\n")
                f.write("from fireworks.core.rocket_launcher import "
                        "rapidfire\n")
                f.write("from rocketsled.utils import deserialize, "
                        "random_guess\n")
                f.write("from rocketsled import OptTask\n\n\n")
                f.write("# This is your function, imported to rocketsled to "
                        "use with PyTask.\n")
                f.write("f = deserialize('" + rawfunc + "')\n\n")
                f.write("def wf_creator(x):\n")
                f.write("    spec = {'_x_opt':x}\n")
                f.write("    pt = " + PyTask_as_string(funcpath) + "\n")
                f.write("    ot = " + OptTask_as_string(**kwargs) + "\n")
                f.write("    fw0 = Firework([pt], spec=spec, "
                        "name='PyTaskFW')\n")
                f.write("    fw1 = Firework([ot], spec=spec, "
                        "name='RocketsledFW')\n")
                f.write("    wf = Workflow([fw0, fw1], {fw0: [fw1], fw1: []},"
                        " name='" + wfname + pointstr + ")\n")
                f.write("    return wf\n")
                f.write("\n\nif __name__=='__main__': \n\n")
                f.write("    # Make sure the launchpad below is correct, and "
                        "make changes if necessary if\n    # it does not "
                        "match the OptTask db ^^^:\n")
                if all(s in kwargs for s in ['host', 'port', 'name']):
                    h = kwargs['host']
                    p = kwargs['port']
                    n = kwargs['name']
                    f.write("    lpad = LaunchPad(host='{}', port={}, "
                            "name='{}')\n".format(h, p, n))
                elif 'lpad' in kwargs:
                    if isinstance(kwargs['lpad'], LaunchPad):
                        lpad = kwargs['lpad'].to_dict()
                    else:
                        lpad = kwargs['lpad']
                    f.write("    lpad = LaunchPad.from_dict(" + str(lpad) +
                            ")\n")
                else:
                    f.write("    lpad = LaunchPad.auto_load()\n")
                f.write("    # lpad.reset(password=None, "
                        "require_password=False)\n")
                f.write("\n    # Define your workflow to start...\n")
                f.write("    wf1 = wf_creator(random_guess(" +
                        str(dimensions) + "))\n\n")
                f.write("    # Add it to the launchpad and launch!\n")
                f.write("    lpad.add_wf(wf1)\n")
                if launch_ready:
                    f.write("    rapidfire(lpad, nlaunches=5, sleep_time=0)")
                else:
                    f.write("    # rapidfire(lpad, nlaunches=5, "
                            "sleep_time=0)")
            except Exception:
                raise
        print("\nFile successfully created!\nFind your auto sled at "
              "{}\n".format(filename))
    else:
        raise ValueError("Please specify a callable function or a properly "
                         "formatted string location of the function")
    return filename
def optimize(self, fw_spec, manager_id):
    """
    Run the optimization algorithm.

    Args:
        fw_spec (dict): The firework spec.
        manager_id (ObjectId): The MongoDB object id of the manager
            document.

    Returns:
        x (iterable): The current x guess.
        y (iterable): The current y (objective function) value
        z (iterable): The z vector associated with x
        all_xz_new ([list] or [tuple]): The predicted next best guess(es),
            including their associated z vectors
        n_completed (int): The number of completed guesses/workflows
    """
    x = list(fw_spec["_x"])
    y = fw_spec["_y"]
    if isinstance(y, (list, tuple)):
        if len(y) == 1:
            y = y[0]
        self.n_objs = len(y)
        if self.acq not in ("maximin", None):
            raise ValueError(
                "{} is not a valid acquisition function for multiobjective "
                "optimization".format(self.acq))
    else:
        if self.acq == "maximin":
            raise ValueError(
                "Maximin is not a valid acquisition function for single "
                "objective optimization.")
        self.n_objs = 1

    # If process A suggests a certain guess and runs it, process B may
    # suggest the same guess while process A is running its new workflow.
    # Therefore, process A must reserve the guess. Line below releases
    # reservation on this document in case of workflow failure or end of
    # workflow.
    self.c.delete_one({"x": x, "y": "reserved"})

    # fetch additional attributes for constructing ML model
    z = self.get_z(x, *self.get_z_args, **self.get_z_kwargs)

    # use all possible training points as default
    n_completed = self.c.count_documents(self._completed)
    if not self.n_train_pts or self.n_train_pts > n_completed:
        self.n_train_pts = n_completed

    # check if optimization should be done, if in batch mode
    batch_mode = False if self.batch_size == 1 else True
    batch_ready = (n_completed not in (0, 1)
                   and (n_completed + 1) % self.batch_size == 0)

    x = convert_native(x)
    y = convert_native(y)
    z = convert_native(z)

    if batch_mode and not batch_ready:
        # 'None' predictor means this job was not used for
        # an optimization run.
        if self.c.find_one({"x": x}):
            if self.c.find_one({"x": x, "y": "reserved"}):
                # For reserved guesses: update everything
                self.c.find_one_and_update(
                    {"x": x, "y": "reserved"},
                    {"$set": {"y": y, "z": z, "z_new": [], "x_new": [],
                              "predictor": None,
                              "index": n_completed + 1}},
                )
            else:
                # For completed guesses (ie, this workflow is a forced
                # duplicate), do not update index, but update everything else
                self.c.find_one_and_update(
                    {"x": x},
                    {"$set": {"y": y, "z": z, "z_new": [], "x_new": [],
                              "predictor": None}},
                )
        else:
            # For new guesses: insert x, y, z, index, predictor, and dummy
            # new guesses
            self.c.insert_one({"x": x, "y": y, "z": z, "x_new": [],
                               "z_new": [], "predictor": None,
                               "index": n_completed + 1})
        self.pop_lock(manager_id)
        raise BatchNotReadyError

    # Mongo aggregation framework may give duplicate documents, so we
    # cannot use $sample to randomize the training points used
    searched_indices = random.sample(range(1, n_completed + 1),
                                     self.n_train_pts)
    searched_docs = self.c.find({"index": {"$in": searched_indices}},
                                batch_size=10000)
    reserved_docs = self.c.find({"y": "reserved"}, batch_size=10000)
    reserved = []
    for doc in reserved_docs:
        reserved.append(doc["x"])

    all_y = [None] * n_completed
    all_y.append(y)
    all_x_searched = [None] * n_completed
    all_x_searched.append(x)
    z = list(z)
    all_xz_searched = [None] * n_completed
    all_xz_searched.append(x + z)
    for i, doc in enumerate(searched_docs):
        all_x_searched[i] = doc["x"]
        all_xz_searched[i] = doc["x"] + doc["z"]
        all_y[i] = doc["y"]

    all_x_space = self._discretize_space(self.x_dims)
    all_x_space = list(all_x_space) if self.z_file else all_x_space
    all_x_unsearched = []
    for xi in all_x_space:
        xj = list(xi)
        if xj not in all_x_searched and xj not in reserved:
            all_x_unsearched.append(xj)
            if len(all_x_unsearched) == self.n_search_pts:
                break

    if self.z_file:
        if path.exists(self.z_file):
            with open(self.z_file, "rb") as f:
                xz_map = pickle.load(f)
        else:
            xz_map = {tuple(xi): self.get_z(xi, *self.get_z_args,
                                            **self.get_z_kwargs)
                      for xi in all_x_space}
            with open(self.z_file, "wb") as f:
                pickle.dump(xz_map, f)
        all_xz_unsearched = [xi + xz_map[tuple(xi)]
                             for xi in all_x_unsearched]
    else:
        all_xz_unsearched = [
            xi + self.get_z(xi, *self.get_z_args, **self.get_z_kwargs)
            for xi in all_x_unsearched
        ]

    # if there are no more unsearched points in the entire space, either
    # they have been searched (ie have x, y, and z) or have been reserved.
    if len(all_xz_unsearched) < 1:
        if self.is_discrete_all:
            raise ExhaustedSpaceError("The discrete space has been searched"
                                      " exhaustively.")
        else:
            raise TypeError("A comprehensive list of points was exhausted "
                            "but the dimensions are not discrete.")

    z_dims = self._z_dims(all_xz_unsearched, all_xz_searched)
    xz_dims = self.x_dims + z_dims

    # run machine learner on Z or X features
    if self.predictor in self.builtin_predictors:
        model = self.builtin_predictors[self.predictor]
        all_xz_searched = self._encode(all_xz_searched, xz_dims)
        all_xz_unsearched = self._encode(all_xz_unsearched, xz_dims)
        all_xz_new_onehot = []
        for _ in range(self.batch_size):
            xz1h = self._predict(
                all_xz_searched,
                all_y,
                all_xz_unsearched,
                model(*self.predictor_args, **self.predictor_kwargs),
                self.maximize,
                scaling=True,
            )
            ix = all_xz_unsearched.index(xz1h)
            all_xz_unsearched.pop(ix)
            all_xz_new_onehot.append(xz1h)
        all_xz_new = [self._decode(xz_onehot, xz_dims)
                      for xz_onehot in all_xz_new_onehot]
    elif self.predictor == "random":
        all_xz_new = random.sample(all_xz_unsearched, self.batch_size)
    else:
        # If using a custom predictor, automatically convert categorical
        # info to one-hot encoded ints. Used when a custom predictor cannot
        # natively use categorical info.
        if self.onehot_categorical:
            all_xz_searched = self._encode(all_xz_searched, xz_dims)
            all_xz_unsearched = self._encode(all_xz_unsearched, xz_dims)
        try:
            predictor_fun = deserialize(self.predictor)
        except Exception as E:
            raise NameError("The custom predictor {} didn't import "
                            "correctly!\n{}".format(self.predictor, E))
        all_xz_new = predictor_fun(
            all_xz_searched,
            all_y,
            self.x_dims,
            all_xz_unsearched,
            *self.predictor_args,
            **self.predictor_kwargs,
        )
        if self.onehot_categorical:
            all_xz_new = self._decode(all_xz_new, xz_dims)
        if not isinstance(all_xz_new[0], (list, tuple)):
            all_xz_new = [all_xz_new]

    # duplicate checking for custom optimizer functions
    if self.duplicate_check:
        if not self.enforce_sequential:
            raise ValueError("Duplicate checking cannot work when "
                             "optimizations are not enforced sequentially.")
        if (self.predictor not in self.builtin_predictors
                and self.predictor != "random"):
            all_x_new = [split_xz(xz_new, self.x_dims, x_only=True)
                         for xz_new in all_xz_new]
            all_x_searched = [split_xz(xz, self.x_dims, x_only=True)
                              for xz in all_xz_searched]
            if self.tolerances:
                for n, x_new in enumerate(all_x_new):
                    if is_duplicate_by_tolerance(
                            x_new, all_x_searched,
                            tolerances=self.tolerances):
                        all_xz_new[n] = random.choice(all_xz_unsearched)
            else:
                if self.is_discrete_all:
                    # test only for x, not xz, because custom predicted z
                    # may not be accounted for
                    for n, x_new in enumerate(all_x_new):
                        if x_new in all_x_searched or x_new == x:
                            all_xz_new[n] = random.choice(all_xz_unsearched)
                else:
                    raise ValueError("Define tolerances parameter to "
                                     "duplicate check floats.")
    return x, y, z, all_xz_new, n_completed
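A sketch of the custom-predictor interface implied by the non-builtin branch above: optimize() calls the deserialized function as predictor_fun(all_xz_searched, all_y, x_dims, all_xz_unsearched, *predictor_args, **predictor_kwargs) and expects one new xz vector (or a list of them) drawn from the unsearched set. The nearest-neighbor heuristic and the name my_predictor are illustrative only, and assume a single scalar objective being minimized.

def my_predictor(all_xz_searched, all_y, x_dims, all_xz_unsearched):
    # Pick the unsearched point closest (in its numeric features) to the
    # best searched point; categorical entries are ignored in the distance.
    best = all_xz_searched[min(range(len(all_y)), key=lambda i: all_y[i])]

    def sq_dist(point):
        return sum((a - b) ** 2 for a, b in zip(point, best)
                   if isinstance(a, (int, float))
                   and isinstance(b, (int, float)))

    # optimize() guarantees all_xz_unsearched is non-empty at this point.
    return min(all_xz_unsearched, key=sq_dist)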
def test_deserialize(self):
    cwd = os.path.dirname(os.path.realpath(__file__))
    funcstr = cwd + "/deserialize_func.obj_func"
    f = deserialize(funcstr)
    self.assertEqual(f([1, 2, 3]), 6)
    self.assertAlmostEqual(f([1.0, 2.0, 3.0]), 6.0)
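The test above implies a companion module, deserialize_func.py, in the same directory. Its exact contents are not shown here; a minimal version consistent with the assertions would be:

# deserialize_func.py (assumed contents)
def obj_func(x):
    # Sum of the entries, so obj_func([1, 2, 3]) == 6.
    return sum(x)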
def configure(self, wf_creator, dimensions, **kwargs):
    """
    Set up the optimization config. Required before using OptTask, but only
    needs to be done once. To reconfigure, use MissionControl.reset and then
    use configure again.

    Defaults can be found in defaults.yaml.

    Args:
        wf_creator (function or str): The function object that creates the
            workflow based on a unique vector, x. Alternatively, the full
            string module path to that function, e.g.
            "mypkg.mymodule.my_wf_creator", which must be importable and
            found in PYTHONPATH.
        dimensions ([tuple]): each 2-tuple in the list defines one dimension
            in the search space in (low, high) format. For categorical or
            discontinuous dimensions, includes all possible categories or
            values as a list of any length or a tuple of length > 2.
            Example: dimensions = dim = [(1,100), (9.293, 18.2838),
            ("red", "blue", "green")].
        **kwargs: Keyword arguments for defining the optimization. A full
            list of possible kwargs is given below:

            Optimization data:
                opt_label (str): The label to use for this collection of
                    optimization data.

            Workflow creator function:
                wf_creator_args (list): the positional args to be passed to
                    the wf_creator function alongside the new x vector
                wf_creator_kwargs (dict): details the kwargs to be passed to
                    the wf_creator function alongside the new x vector

            Predictors (optimization):
                predictor (function or str): a function which, given a list
                    of searched points and unsearched points, returns an
                    optimized guess.

                    To use a builtin predictor, pass in one of:
                        'GaussianProcessRegressor',
                        'RandomForestRegressor',
                        'ExtraTreesRegressor',
                        'GradientBoostingRegressor',
                        'random' (random guess)
                    The default is 'GaussianProcessRegressor'.

                    To use a custom predictor, pass in the function object.
                    Alternatively, the full string module path to that
                    function, e.g. "mypkg.mymodule.my_predictor", which must
                    be importable and found in PYTHONPATH.
                    Example builtin predictor: 'GaussianProcessRegressor'
                    Example custom predictor: my_predictor
                    Example custom predictor 2:
                        'my_pkg.my_module.my_predictor'
                predictor_args (list): the positional args to be passed to
                    the model along with a list of points to be searched.
                    For sklearn-based predictors included in OptTask, these
                    positional args are passed to the init method of the
                    chosen model. For custom predictors, these are passed to
                    the chosen predictor function alongside the searched
                    guesses, the output from searched guesses, and an
                    unsearched space to be used with optimization.
                predictor_kwargs (dict): the kwargs to be passed to the
                    model. Similar to predictor_args.
                n_search_pts (int): The number of points to be searched in
                    the search space when choosing the next best point.
                    Choosing more points to search may increase the
                    effectiveness of the optimization but take longer to
                    evaluate. The default is 1000.
                n_train_pts (int): The number of already explored points to
                    be chosen for training. Default is None, meaning all
                    available points will be used for training. Reduce the
                    number of points to decrease training times.
                n_bootstraps (int): The number of times each optimization
                    should sample, train, and predict values when generating
                    uncertainty estimates for prediction. At least 10 data
                    points must be present for bootstrapping. Not used if:
                    acq not specified, custom predictor used, or
                    GaussianProcessRegressor used.
                acq (str): The acquisition function to use. Can be 'ei' for
                    expected improvement, 'pi' for probability of
                    improvement, 'lcb' for lower confidence bound, or None
                    for greedy selection (the highest predicted point is
                    picked). Only works with builtin predictors.
                space_file (str): The fully specified path of a pickle file
                    containing a list of all possible searchable vectors.
                    For example '/Users/myuser/myfolder/myspace.p'. When
                    loaded, this space_file should be a list of tuples.
                onehot_categorical (bool): If True, preprocesses categorical
                    data (strings) to one-hot encoded binary arrays for use
                    with custom predictor functions. Default False.
                duplicate_check (bool): If True, checks that custom
                    optimizers are not making duplicate guesses; all
                    built-in optimizers cannot duplicate guess. If the
                    custom predictor suggests a duplicate, OptTask picks a
                    random guess out of the remaining untried space. Default
                    is no duplicate check, and an error is raised if a
                    duplicate is suggested.
                tolerances (list): The tolerance of each feature when
                    duplicate checking. For categorical features, put None.
                    Example: Our dimensions are [(1, 100), ['red', 'blue'],
                    (2.0, 20.0)]. We want our first parameter to be a
                    duplicate only if it is exact, and our third parameter
                    to be a duplicate if it is within 1e-6. Then:
                    tolerances=[0, None, 1e-6]
                maximize (bool): If True, maximizes the objective function
                    instead of minimizing. Defaults to False, meaning
                    minimize.

            z-vector features:
                get_z (string): the fully-qualified name of a function
                    which, given an x vector, returns another vector z which
                    provides extra information to the machine learner. The
                    features defined in z are not used to run the workflow,
                    but are used for learning. If z_features are enabled,
                    ONLY z features will be used for learning (x vectors
                    essentially become tags or identifiers only). Examples:
                        get_z = 'my_pkg.my_module.my_fun'
                        get_z = '/path/to/folder/containing/my_dir/my_module.my_fun'
                get_z_args (list): the positional arguments to be passed to
                    the get_z function alongside x
                get_z_kwargs (dict): the kwargs to be passed to the get_z
                    function alongside x
                z_file (str): The filename (pickle file) where OptTask
                    should save/cache z calculations. Specify this argument
                    if calculating z for many points (n_search_pts) is not
                    trivial and will cost time in computing. With this
                    argument specified, each z will only be calculated once.
                    Defaults to None, meaning that all unexplored z are
                    re-calculated each iteration.
                    Example: z_file = '/path/to/z_guesses.p'

            Parallelism:
                enforce_sequential (bool): WARNING: Experimental feature!
                    If True, enforces that RS optimizations are run
                    sequentially (default), which prevents duplicate guesses
                    from ever being run. If False, allows OptTasks to run
                    optimizations in parallel, which may cause duplicate
                    guesses with high parallelism.
                batch_size (int): The number of jobs to submit per batch for
                    a batch optimization. For example, batch_size=5 will
                    optimize every 5th job, then submit another 5 jobs based
                    on the best 5 predictions (recomputing the acquisition
                    function after each prediction).
                timeout (int): The number of seconds to wait before
                    resetting the lock on the db.

    Returns:
        None: If you want to run the OptTask workflow, you'll need to pass
            in the launchpad and opt_label arguments in your wf_creator.
    """
    config = get_default_opttask_kwargs()
    config["launchpad"] = self.launchpad.to_db_dict()
    config["opt_label"] = self.opt_label
    for kw in kwargs.keys():
        if kw not in config:
            raise KeyError(
                "{} not a valid argument for setup_config. Choose "
                "from: {}".format(kw, list(config.keys())))
        elif kw in ["get_z", "predictor"]:
            if hasattr(kwargs[kw], '__call__'):
                config[kw] = serialize(kwargs[kw])
            else:
                config[kw] = kwargs[kw]
        else:
            config[kw] = kwargs[kw]
    if hasattr(wf_creator, '__call__'):
        wf_creator = serialize(wf_creator)
    config["wf_creator"] = wf_creator
    config["dimensions"] = dimensions

    # Determine data types of dimensions
    config["dim_types"] = check_dims(dimensions)
    config["is_discrete_any"] = is_discrete(dimensions, criteria="any")
    config["is_discrete_all"] = is_discrete(dimensions, criteria="all")

    # Ensure importable functions are importable
    try:
        deserialize(wf_creator)
    except ImportError as IE:
        self.logger.warn("wf_creator " + IMPORT_WARNING + "\n" + str(IE))
    try:
        pre = config["predictor"]
        if pre:
            if "." in pre:
                deserialize(pre)
    except ImportError as IE:
        self.logger.warn("predictor " + IMPORT_WARNING + "\n" + str(IE))
    try:
        getz = config["get_z"]
        if getz:
            if "." in getz:
                deserialize(getz)
    except ImportError as IE:
        self.logger.warn("get_z " + IMPORT_WARNING + "\n" + str(IE))

    # Ensure acquisition function is valid (for builtin predictors)
    acq_funcs = [None, 'ei', 'pi', 'lcb', 'maximin']
    if config['acq'] not in acq_funcs:
        raise ValueError(
            "Invalid acquisition function. Use 'ei', 'pi', 'lcb', "
            "'maximin' (multiobjective), or None.")

    config["doctype"] = "config"
    self.config = config
    if self.c.find_one({"doctype": "config"}):
        raise ValueError("A config is already present in this Launchpad "
                         "for opt_label=={}. Please use the MissionControl"
                         " reset method to reset the database config."
                         "".format(self.opt_label))
    else:
        self.c.insert_one(self.config)
        self.logger.info("Rocketsled configuration succeeded.")
    self.is_configured = True