def stash(self, x, y, z, all_xz_new, n_completed):
    """
    Write documents to the database after optimization.

    Args:
        x (iterable): The current x guess.
        y (iterable): The current y (objective function) value.
        z (iterable): The z vector associated with x.
        all_xz_new ([list] or [tuple]): The predicted next best guess(es),
            including their associated z vectors.
        n_completed (int): The number of completed guesses/workflows.

    Returns:
        opt_id (ObjectId): The id of the reservation document inserted for
            the new guess. If multiple opt_ids are valid (i.e., batch mode
            is enabled), the last opt_id is returned.
    """
    for xz_new in all_xz_new:
        # separate 'predicted' z features from the new x vector
        x_new, z_new = split_xz(xz_new, self.x_dims)
        x_new = convert_native(x_new)
        z_new = convert_native(z_new)

        # check whether the current guess is a duplicate
        # (such as a forced identical first guess)
        forced_dupe = self.c.find_one({"x": x})

        acqmap = {
            "ei": "Expected Improvement",
            "pi": "Probability of Improvement",
            "lcb": "Lower Confidence Boundary",
            None: "Highest Value",
            "maximin": "Maximin Expected Improvement",
        }
        if self.predictor in self.builtin_predictors:
            predictorstr = (
                self.predictor + " with acquisition: " + acqmap[self.acq]
            )
            if self.n_objs > 1:
                predictorstr += " using {} objectives".format(self.n_objs)
        else:
            predictorstr = self.predictor

        if forced_dupe:
            # only update the fields which should be updated
            self.c.find_one_and_update(
                {"x": x},
                {
                    "$set": {
                        "y": y,
                        "z": z,
                        "z_new": z_new,
                        "x_new": x_new,
                        "predictor": predictorstr,
                    }
                },
            )
        else:
            # insert all the fields, as it is a new document
            self.c.insert_one(
                {
                    "z": z,
                    "y": y,
                    "x": x,
                    "z_new": z_new,
                    "x_new": x_new,
                    "predictor": predictorstr,
                    "index": n_completed + 1,
                }
            )

        # ensure previously finished workflow results are not overwritten
        # by concurrent predictions
        if (
            self.c.count_documents(
                {"x": x_new, "y": {"$exists": 1, "$ne": "reserved"}}
            )
            == 0
        ):
            # reserve the new x to prevent parallel processes from
            # registering it as unsearched, since the next iteration of
            # this process will be exploring it
            res = self.c.insert_one({"x": x_new, "y": "reserved"})
            opt_id = res.inserted_id
        else:
            raise ValueError(
                "The predictor suggested a guess which has already been "
                "tried: {}".format(x_new)
            )
    return opt_id
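# Note on document shapes: after stash() runs, the collection holds two kinds
# of documents (a sketch inferred from the insert/update calls above; the
# concrete field values below are illustrative only, not from a real run):
#
#   completed guess:
#       {"x": [...], "y": 0.73, "z": [...],
#        "x_new": [...], "z_new": [...],
#        "predictor": "<predictor name, possibly with acquisition info>",
#        "index": 4}
#
#   reservation for the suggested next guess (consumed or released by a
#   later optimization run):
#       {"x": [...], "y": "reserved"}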
def optimize(self, fw_spec, manager_id):
    """
    Run the optimization algorithm.

    Args:
        fw_spec (dict): The firework spec.
        manager_id (ObjectId): The MongoDB object id of the manager
            document.

    Returns:
        x (iterable): The current x guess.
        y (iterable): The current y (objective function) value.
        z (iterable): The z vector associated with x.
        all_xz_new ([list] or [tuple]): The predicted next best guess(es),
            including their associated z vectors.
        n_completed (int): The number of completed guesses/workflows.
    """
    x = list(fw_spec["_x"])
    y = fw_spec["_y"]
    if isinstance(y, (list, tuple)):
        if len(y) == 1:
            y = y[0]
        self.n_objs = len(y)
        if self.acq not in ("maximin", None):
            raise ValueError(
                "{} is not a valid acquisition function for multiobjective "
                "optimization".format(self.acq)
            )
    else:
        if self.acq == "maximin":
            raise ValueError(
                "Maximin is not a valid acquisition function for single "
                "objective optimization."
            )
        self.n_objs = 1

    # If process A suggests a certain guess and runs it, process B may
    # suggest the same guess while process A is running its new workflow.
    # Therefore, process A must reserve the guess. The line below releases
    # the reservation on this document in case of workflow failure or end
    # of workflow.
    self.c.delete_one({"x": x, "y": "reserved"})

    # fetch additional attributes for constructing the ML model
    z = self.get_z(x, *self.get_z_args, **self.get_z_kwargs)

    # use all possible training points as the default
    n_completed = self.c.count_documents(self._completed)
    if not self.n_train_pts or self.n_train_pts > n_completed:
        self.n_train_pts = n_completed

    # check if optimization should be done, if in batch mode
    batch_mode = False if self.batch_size == 1 else True
    batch_ready = (
        n_completed not in (0, 1)
        and (n_completed + 1) % self.batch_size == 0
    )

    x = convert_native(x)
    y = convert_native(y)
    z = convert_native(z)

    if batch_mode and not batch_ready:
        # A 'None' predictor means this job was not used for an
        # optimization run.
        if self.c.find_one({"x": x}):
            if self.c.find_one({"x": x, "y": "reserved"}):
                # For reserved guesses: update everything
                self.c.find_one_and_update(
                    {"x": x, "y": "reserved"},
                    {
                        "$set": {
                            "y": y,
                            "z": z,
                            "z_new": [],
                            "x_new": [],
                            "predictor": None,
                            "index": n_completed + 1,
                        }
                    },
                )
            else:
                # For completed guesses (i.e., this workflow is a forced
                # duplicate): do not update the index, but update
                # everything else
                self.c.find_one_and_update(
                    {"x": x},
                    {
                        "$set": {
                            "y": y,
                            "z": z,
                            "z_new": [],
                            "x_new": [],
                            "predictor": None,
                        }
                    },
                )
        else:
            # For new guesses: insert x, y, z, index, predictor, and
            # dummy new guesses
            self.c.insert_one(
                {
                    "x": x,
                    "y": y,
                    "z": z,
                    "x_new": [],
                    "z_new": [],
                    "predictor": None,
                    "index": n_completed + 1,
                }
            )
        self.pop_lock(manager_id)
        raise BatchNotReadyError

    # The Mongo aggregation framework may give duplicate documents, so we
    # cannot use $sample to randomize the training points used
    searched_indices = random.sample(
        range(1, n_completed + 1), self.n_train_pts
    )
    searched_docs = self.c.find(
        {"index": {"$in": searched_indices}}, batch_size=10000
    )
    reserved_docs = self.c.find({"y": "reserved"}, batch_size=10000)
    reserved = []
    for doc in reserved_docs:
        reserved.append(doc["x"])

    all_y = [None] * n_completed
    all_y.append(y)
    all_x_searched = [None] * n_completed
    all_x_searched.append(x)
    z = list(z)
    all_xz_searched = [None] * n_completed
    all_xz_searched.append(x + z)
    for i, doc in enumerate(searched_docs):
        all_x_searched[i] = doc["x"]
        all_xz_searched[i] = doc["x"] + doc["z"]
        all_y[i] = doc["y"]

    all_x_space = self._discretize_space(self.x_dims)
    all_x_space = list(all_x_space) if self.z_file else all_x_space
    all_x_unsearched = []
    for xi in all_x_space:
        xj = list(xi)
        if xj not in all_x_searched and xj not in reserved:
            all_x_unsearched.append(xj)
            if len(all_x_unsearched) == self.n_search_pts:
                break

    if self.z_file:
        if path.exists(self.z_file):
            with open(self.z_file, "rb") as f:
                xz_map = pickle.load(f)
        else:
            xz_map = {
                tuple(xi): self.get_z(
                    xi, *self.get_z_args, **self.get_z_kwargs
                )
                for xi in all_x_space
            }
            with open(self.z_file, "wb") as f:
                pickle.dump(xz_map, f)
        all_xz_unsearched = [
            xi + xz_map[tuple(xi)] for xi in all_x_unsearched
        ]
    else:
        all_xz_unsearched = [
            xi + self.get_z(xi, *self.get_z_args, **self.get_z_kwargs)
            for xi in all_x_unsearched
        ]

    # If there are no more unsearched points in the entire space, either
    # they have been searched (i.e., have x, y, and z) or have been
    # reserved.
    if len(all_xz_unsearched) < 1:
        if self.is_discrete_all:
            raise ExhaustedSpaceError(
                "The discrete space has been searched exhaustively."
            )
        else:
            raise TypeError(
                "A comprehensive list of points was exhausted "
                "but the dimensions are not discrete."
            )
    z_dims = self._z_dims(all_xz_unsearched, all_xz_searched)
    xz_dims = self.x_dims + z_dims

    # run the machine learner on Z or X features
    if self.predictor in self.builtin_predictors:
        model = self.builtin_predictors[self.predictor]
        all_xz_searched = self._encode(all_xz_searched, xz_dims)
        all_xz_unsearched = self._encode(all_xz_unsearched, xz_dims)
        all_xz_new_onehot = []
        for _ in range(self.batch_size):
            xz1h = self._predict(
                all_xz_searched,
                all_y,
                all_xz_unsearched,
                model(*self.predictor_args, **self.predictor_kwargs),
                self.maximize,
                scaling=True,
            )
            ix = all_xz_unsearched.index(xz1h)
            all_xz_unsearched.pop(ix)
            all_xz_new_onehot.append(xz1h)
        all_xz_new = [
            self._decode(xz_onehot, xz_dims)
            for xz_onehot in all_xz_new_onehot
        ]
    elif self.predictor == "random":
        all_xz_new = random.sample(all_xz_unsearched, self.batch_size)
    else:
        # If using a custom predictor, automatically convert categorical
        # info to one-hot encoded ints. Used when a custom predictor
        # cannot natively use categorical info.
        if self.onehot_categorical:
            all_xz_searched = self._encode(all_xz_searched, xz_dims)
            all_xz_unsearched = self._encode(all_xz_unsearched, xz_dims)
        try:
            predictor_fun = deserialize(self.predictor)
        except Exception as E:
            raise NameError(
                "The custom predictor {} didn't import "
                "correctly!\n{}".format(self.predictor, E)
            )
        all_xz_new = predictor_fun(
            all_xz_searched,
            all_y,
            self.x_dims,
            all_xz_unsearched,
            *self.predictor_args,
            **self.predictor_kwargs,
        )
        if self.onehot_categorical:
            all_xz_new = self._decode(all_xz_new, xz_dims)
        if not isinstance(all_xz_new[0], (list, tuple)):
            all_xz_new = [all_xz_new]

    # duplicate checking for custom optimizer functions
    if self.duplicate_check:
        if not self.enforce_sequential:
            raise ValueError(
                "Duplicate checking cannot work when optimizations are "
                "not enforced sequentially."
            )
        if (
            self.predictor not in self.builtin_predictors
            and self.predictor != "random"
        ):
            all_x_new = [
                split_xz(xz_new, self.x_dims, x_only=True)
                for xz_new in all_xz_new
            ]
            all_x_searched = [
                split_xz(xz, self.x_dims, x_only=True)
                for xz in all_xz_searched
            ]
            if self.tolerances:
                for n, x_new in enumerate(all_x_new):
                    if is_duplicate_by_tolerance(
                        x_new, all_x_searched, tolerances=self.tolerances
                    ):
                        all_xz_new[n] = random.choice(all_xz_unsearched)
            else:
                if self.is_discrete_all:
                    # test only for x, not xz, because a custom predicted
                    # z may not be accounted for
                    for n, x_new in enumerate(all_x_new):
                        if x_new in all_x_searched or x_new == x:
                            all_xz_new[n] = random.choice(
                                all_xz_unsearched
                            )
                else:
                    raise ValueError(
                        "Define tolerances parameter to duplicate check "
                        "floats."
                    )
    return x, y, z, all_xz_new, n_completed
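# For reference, the custom-predictor branch in optimize() calls the
# deserialized function as
#     predictor_fun(all_xz_searched, all_y, x_dims, all_xz_unsearched,
#                   *predictor_args, **predictor_kwargs)
# and expects back either one new xz vector or a list of them. A minimal
# sketch of such a function follows; the name "nearest_to_best" and its
# heuristic are illustrative assumptions (not part of this module), and it
# assumes all dimensions are numeric and a single objective.
def nearest_to_best(XZ_searched, Y, x_dims, XZ_unsearched):
    """Toy predictor: return the unsearched point closest (by summed
    absolute difference) to the best (lowest-y) searched point."""
    best_xz = XZ_searched[Y.index(min(Y))]
    return min(
        XZ_unsearched,
        key=lambda xz: sum(abs(a - b) for a, b in zip(xz, best_xz)),
    )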
def test_convert_native(self):
    # np.int, np.float, and np.str were aliases for the builtins and have
    # been removed from recent numpy releases; use the actual numpy scalar
    # types so the conversion to native Python types is exercised.
    a = [np.int64(10), np.float64(12.2), np.str_("a str"), 12.3, 100, "ok"]
    native = convert_native(a)
    self.assertListEqual(
        [type(i) for i in native], [int, float, str, float, int, str]
    )