def _standardize_user_data(self, s, a, s_next, r, absorbing, theta,
                           check_batch_dim=False):
    """
    Args:
        s (numpy.array): the samples of the state (nsamples, state_dim)
        a (numpy.array): the samples of the action (nsamples, action_dim)
        s_next (numpy.array): the samples of the next (reached) state
            (nsamples, state_dim)
        r (numpy.array): the samples of the reward (nsamples,)
        absorbing (numpy.array): the absorbing-state flags (nsamples,)
        theta (numpy.array): the samples of the Q-function parameters
            (1, n_params)
        check_batch_dim (bool): default False

    Returns:
        The standardized values (s, a, s_next, r, absorbing, theta)
    """
    s = standardize_input_data(
        s, ['s'],
        [(None, self.state_dim)] if self.state_dim is not None else None,
        exception_prefix='state')
    a = standardize_input_data(
        a, ['a'],
        [(None, self.action_dim)] if self.action_dim is not None else None,
        exception_prefix='action')
    # r = standardize_input_data(r, ['r'], [(None, 1)],
    #                            check_batch_dim=False,
    #                            exception_prefix='reward')
    s_next = standardize_input_data(
        s_next, ['s_next'],
        [(None, self.state_dim)] if self.state_dim is not None else None,
        exception_prefix='state_next')
    # shapes must be a list of tuples: the original passed a bare tuple here,
    # which silently skips the shape check for theta
    theta = standardize_input_data(
        theta, ['theta'],
        [(None, self.bellman_model.n_inputs())],
        exception_prefix='theta')
    check_array_lengths(s, a, s_next)
    return s, a, s_next, r, absorbing, theta
def _standardize_user_data(self, s, a, s_next, r, check_batch_dim=False):
    """
    Args:
        s (numpy.array): the samples of the state (nsamples, state_dim)
        a (numpy.array): the samples of the action (nsamples, action_dim)
        s_next (numpy.array): the samples of the next (reached) state
            (nsamples, state_dim)
        r (numpy.array): the samples of the reward (nsamples,)
        check_batch_dim (bool): default False

    Returns:
        The standardized values (s, a, s_next, r)
    """
    s = standardize_input_data(
        s, ['s'],
        [(None, self.state_dim)] if self.state_dim is not None else None,
        check_batch_dim=check_batch_dim, exception_prefix='state')
    a = standardize_input_data(
        a, ['a'],
        [(None, self.action_dim)] if self.action_dim is not None else None,
        check_batch_dim=check_batch_dim, exception_prefix='action')
    # r = standardize_input_data(r, ['r'], [(None, 1)],
    #                            check_batch_dim=False,
    #                            exception_prefix='reward')
    s_next = standardize_input_data(
        s_next, ['s_next'],
        [(None, self.state_dim)] if self.state_dim is not None else None,
        check_batch_dim=check_batch_dim, exception_prefix='state_next')
    check_array_lengths(s, a, s_next)
    return s, a, s_next, r
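# For intuition, a minimal sketch of the contract the helpers above rely on.
# standardize_input_data is the real Keras 1.x utility; the mock below only
# illustrates its behavior (wrap a single array into a one-element list and
# check the trailing dimensions), it is not the actual implementation.
import numpy as np

def _standardize_like_keras(data, names, shapes=None, exception_prefix=''):
    arrays = data if isinstance(data, list) else [data]
    if shapes is not None:
        for arr, name, shape in zip(arrays, names, shapes):
            if shape is not None and arr.shape[1:] != shape[1:]:
                raise ValueError('Error when checking %s: expected %s to '
                                 'have shape %s but got %s'
                                 % (exception_prefix, name, shape, arr.shape))
    return arrays

# e.g. wraps a (10, 3) state batch into [array of shape (10, 3)]:
s = _standardize_like_keras(np.zeros((10, 3)), ['s'], [(None, 3)],
                            exception_prefix='state')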
def _standardize_user_data(model, x, y, sample_weight=None, class_weight=None,
                           check_batch_dim=True, batch_size=None):
    if not hasattr(model, 'optimizer'):
        raise Exception('You must compile a model before training/testing.'
                        ' Use `model.compile(optimizer, loss)`.')

    output_shapes = []
    for output_shape, loss_fn in zip(model.internal_output_shapes,
                                     model.loss_functions):
        if loss_fn.__name__ == 'sparse_categorical_crossentropy':
            output_shapes.append(output_shape[:-1] + (1,))
        elif getattr(objectives, loss_fn.__name__, None) is None:
            output_shapes.append(None)
        else:
            output_shapes.append(output_shape)
    x = standardize_input_data(x, model.input_names,
                               model.internal_input_shapes,
                               check_batch_dim=False,
                               exception_prefix='model input')
    y = standardize_input_data(y, model.output_names, output_shapes,
                               check_batch_dim=False,
                               exception_prefix='model target')
    sample_weights = standardize_sample_weights(sample_weight,
                                                model.output_names)
    class_weights = standardize_class_weights(class_weight,
                                              model.output_names)
    sample_weights = [standardize_weights(ref, sw, cw, mode)
                      for (ref, sw, cw, mode)
                      in zip(y, sample_weights, class_weights,
                             model.sample_weight_modes)]
    # We only need to comment out check_array_lengths(x, y, sample_weights)
    # in the next line to let the model compile and train.
    # check_array_lengths(x, y, sample_weights)
    check_loss_and_target_compatibility(y, model.loss_functions,
                                        model.internal_output_shapes)
    if model.stateful and batch_size:
        if x[0].shape[0] % batch_size != 0:
            raise Exception('In a stateful network, '
                            'you should only pass inputs with '
                            'a number of samples that can be '
                            'divided by the batch size. Found: ' +
                            str(x[0].shape[0]) + ' samples')
    return x, y, sample_weights
def finetune(self, X, Y, batch_size=128, gp_n_iter=10, verbose=1):
    '''Finetune the output GP layers assuming the network is pre-trained.

    Arguments:
    ----------
        X : np.ndarray or list of np.ndarrays
        Y : np.ndarray or list of np.ndarrays
        batch_size : uint (default: 128)
            Batch size used for data streaming through the network.
        gp_n_iter : uint (default: 10)
            Number of iterations for GP training.
        verbose : uint (default: 1)
            Verbosity mode, 0 or 1.
    '''
    # Validate user data
    X = standardize_input_data(X, self.input_names,
                               self.internal_input_shapes,
                               check_batch_dim=False)

    H = self.transform(X, batch_size=batch_size)

    if verbose:
        print("Finetuning output GPs...")

    for gp, h, y in zip(self.gp_output_layers, H, Y):
        # Update GP data (and grid if necessary)
        gp.backend.update_data('tr', h, y)
        if gp.update_grid:
            gp.backend.update_grid('tr')
        # Train GP
        gp.hyp = gp.backend.train(gp_n_iter, verbose=verbose)

    if verbose:
        print("Done.")
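# A hypothetical usage sketch for finetune. `model`, X_train and Y_train are
# illustrative assumptions (shapes included), not part of the source: the
# method only requires a pre-trained network with GP output layers.
import numpy as np

X_train = np.random.rand(512, 16).astype('float32')   # 512 samples, 16 features
Y_train = [np.random.rand(512, 1).astype('float32')]  # one target per output GP

model.finetune(X_train, Y_train, batch_size=128, gp_n_iter=10, verbose=1)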
def __init__(self, estimator, gamma, discrete_actions, optimizer="adam",
             state_dim=None, action_dim=None, norm_value=2,
             update_theta_every=1, horizon=10, verbose=0):
    super(GenGradFQI, self).__init__(estimator, state_dim, action_dim,
                                     discrete_actions, gamma, horizon,
                                     verbose)
    # save MDP information
    self.norm_value = norm_value
    self.update_theta_every = update_theta_every \
        if update_theta_every > 0 else -1

    # create theano variables
    self.T_Y = T.dvector()

    # define bellman operator (check that BOP has only one output)
    assert isinstance(estimator.inputs, list)
    assert len(estimator.inputs) == 1
    assert isinstance(estimator.outputs, list)
    assert len(estimator.outputs) == 1

    # construct (theano) Bellman error
    v = self._estimator.outputs[0] - self.T_Y
    if self.norm_value == np.inf:
        err = T.max(v ** 2)
    else:
        err = T.mean(v ** self.norm_value) ** (1.0 / self.norm_value)
    self.fqi_loss = err

    # define function to be used for train and drawing actions
    self.train_function = None

    # get keras optimizer
    self.optimizer = optimizers.get(optimizer)

    # validate input data (the output is a list storing the validated input)
    self.discrete_actions = standardize_input_data(
        discrete_actions, ["discrete_actions"],
        [(None, self.action_dim)] if self.action_dim is not None else None,
        exception_prefix="discrete_actions")
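# For intuition, the loss built symbolically above, written in plain NumPy.
# This is a sketch mirroring the Theano expression; q_values, targets and p
# are illustrative names, not identifiers from the source.
import numpy as np

def fqi_loss_numpy(q_values, targets, p=2):
    """L_p-style Bellman residual: mean(v**p)**(1/p), with the max squared
    residual when p is infinity, exactly as in the Theano graph above."""
    v = q_values - targets
    if p == np.inf:
        return np.max(v ** 2)
    return np.mean(v ** p) ** (1.0 / p)

# e.g. the RMS residual of two Q-value estimates against their targets:
fqi_loss_numpy(np.array([1.0, 2.0]), np.array([0.5, 2.5]), p=2)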
def __init__(self, q_model, gamma, discrete_actions, optimizer,
             state_dim=None, action_dim=None, incremental=True):
    # save MDP information
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.incremental = incremental
    self.gamma = gamma

    # create theano variables
    T_s = T.matrix()
    T_a = T.matrix()
    T_s_next = T.matrix()
    T_r = T.vector()
    # T_r = T.dmatrix()
    T_discrete_actions = T.matrix()

    # store models of bellman apx and Q-function
    self.q_model = q_model

    # construct (theano) Bellman error
    self.T_bellman_err = self.bellman_error(T_s, T_a, T_s_next, T_r,
                                            self.gamma, T_discrete_actions)

    # define function to be used for train and drawing actions
    self.train_function = None
    self.draw_action_function = None

    self.T_s = T_s
    self.T_a = T_a
    self.T_s_next = T_s_next
    self.T_r = T_r
    self.T_discrete_actions = T_discrete_actions

    # get keras optimizer
    self.optimizer = optimizers.get(optimizer)

    # validate input data (the output is a list storing the validated input)
    self.discrete_actions = standardize_input_data(
        discrete_actions, ['discrete_actions'],
        [(None, self.action_dim)] if self.action_dim is not None else None,
        check_batch_dim=False, exception_prefix='discrete_actions')
def draw_action(self, state, done, flag):
    """
    Samples the action to be executed.

    Args:
        state (numpy.array): the state to be evaluated
            (1, state_dim) or (state_dim,)
        done: ??
        flag: ??

    Returns:
        The action to be executed in the state
    """
    state = state.astype(theano.config.floatX)
    self._make_draw_action_function()
    state = standardize_input_data(
        state, ['state'],
        [(None, self.state_dim)] if self.state_dim is not None else None,
        check_batch_dim=False, exception_prefix='draw_state')
    # index zero since standardize_input_data returns lists of numpy matrices
    return self.draw_action_function(state[0], self.discrete_actions[0])
def predict(self, X, X_tr=None, Y_tr=None, batch_size=128, return_var=False,
            verbose=0):
    '''Generate output predictions for the input samples batch by batch.

    Arguments:
    ----------
        X : np.ndarray or list of np.ndarrays
        batch_size : uint (default: 128)
        return_var : bool (default: False)
            Whether predictive variance is returned.
        verbose : uint (default: 0)
            Verbosity mode, 0 or 1.

    Returns:
    --------
        preds : a list or a tuple of lists
            Lists of output predictions and variance estimates.
    '''
    # Update GP data if provided (and grid if necessary)
    if X_tr is not None and Y_tr is not None:
        X_tr, Y_tr, _ = self._standardize_user_data(X_tr, Y_tr,
                                                    check_batch_dim=False,
                                                    batch_size=batch_size)
        H_tr = self.transform(X_tr, batch_size=batch_size)
        for gp, h, y in zip(self.gp_output_layers, H_tr, Y_tr):
            gp.backend.update_data('tr', h, y)
            if gp.update_grid:
                gp.backend.update_grid('tr')

    # Validate user data
    X = standardize_input_data(X, self.input_names,
                               self.internal_input_shapes,
                               check_batch_dim=False)

    H = self.transform(X, batch_size=batch_size)

    preds = []
    for gp, h in zip(self.gp_output_layers, H):
        preds.append(gp.backend.predict(h, return_var=return_var))

    if return_var:
        # wrap in list() so this also works on Python 3, where map is lazy
        preds = list(map(list, zip(*preds)))

    return preds
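# A hypothetical call to predict with predictive variances. `model` and the
# input shape are illustrative assumptions, not part of the source.
import numpy as np

X_test = np.random.rand(100, 16).astype('float32')
means, variances = model.predict(X_test, return_var=True)  # tuple of lists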
def draw_action(self, state, done, flag):
    """
    Samples the action to be executed.

    Args:
        state (numpy.array): the state to be evaluated
            (1, state_dim) or (state_dim,)
        done: ??
        flag: ??

    Returns:
        The action to be executed in the state
    """
    self._make_draw_action_function()
    state = standardize_input_data(
        state, ['state'],
        [(None, self.state_dim)] if self.state_dim is not None else None,
        exception_prefix='draw_state')
    # index zero since standardize_input_data returns lists of numpy matrices
    return self.draw_action_function(state[0], self.learned_theta_value,
                                     self.discrete_actions[0])
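# A hypothetical greedy-action query. `agent` is assumed to be an instance of
# the class above; the state vector is illustrative, and since `done`/`flag`
# are undocumented (see the ?? in the docstring), placeholder values are
# passed here purely for the sketch.
import numpy as np

state = np.array([0.1, -0.3])  # (state_dim,)
action = agent.draw_action(state, done=False, flag=None)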
def fit(self, s, a, s_next, r, absorbing, theta, batch_size=32, nb_epoch=10,
        shuffle=True, theta_metrics={}):
    """
    Args:
        s (numpy.array): the samples of the state (nsamples, state_dim)
        a (numpy.array): the samples of the action (nsamples, action_dim)
        s_next (numpy.array): the samples of the next (reached) state
            (nsamples, state_dim)
        r (numpy.array): the samples of the reward (nsamples,)
        absorbing (numpy.array): the absorbing-state flags (nsamples,)
        theta (numpy.array): the samples of the Q-function parameters
            (1, n_params)
        batch_size (int): dimension of the batch used for a single step
            of the gradient
        nb_epoch (int): number of epochs
        shuffle (boolean): whether to shuffle the training data before
            each epoch.
        theta_metrics (dict): dictionary storing the pairs
            (name: callable object). The callable object/function is used
            to evaluate the Q-function parameters at each iteration.
            The signature of the callable is simple: f(theta)
            e.g.: theta_metrics={'k': lambda theta: evaluate(theta)})

    Returns:
        A dictionary storing train information
    """
    s, a, s_next, r, absorbing, theta = self._standardize_user_data(
        s, a, s_next, r, absorbing, theta, check_batch_dim=False)

    all_actions = standardize_input_data(
        self.discrete_actions, ['all_actions'],
        [(None, self.action_dim)] if self.action_dim is not None else None,
        exception_prefix='discrete_actions')

    n_updates = 0
    history = {"theta": [], "rho": []}
    for k in theta_metrics.keys():
        history.update({k: []})

    ins = s + a + s_next + [r, absorbing]
    self._make_train_function()
    f = self.train_function

    nb_train_sample = ins[0].shape[0]
    index_array = np.arange(nb_train_sample)

    # append evolution of theta for independent case
    for _ in range(len(self.theta_list) - 1):
        if self.incremental:
            tmp = theta[-1] + self.bellman_model.predict(theta[-1])
        else:
            tmp = self.bellman_model.predict(theta[-1])
        theta += [tmp]

    term_condition = self.term_condition
    stop = False
    old_theta = theta

    for epoch in range(nb_epoch):
        if stop:
            break
        if shuffle == 'batch':
            index_array = batch_shuffle(index_array, batch_size)
        elif shuffle:
            np.random.shuffle(index_array)

        batches = make_batches(nb_train_sample, batch_size)
        for batch_index, (batch_start, batch_end) in enumerate(batches):
            history["theta"].append(theta[0])
            if hasattr(self.bellman_model, '_model'):
                history["rho"].append(
                    self.bellman_model._model.get_weights())
            else:
                history["rho"].append(self.bellman_model.get_weights())
            for k, v in iteritems(theta_metrics):
                history[k].append(v(theta))

            batch_ids = index_array[batch_start:batch_end]
            try:
                if type(ins[-1]) is float:
                    # do not slice the training phase flag
                    ins_batch = slice_X(ins[:-1], batch_ids) + [ins[-1]]
                else:
                    ins_batch = slice_X(ins, batch_ids)
            except TypeError:
                raise Exception('TypeError while preparing batch. '
                                'If using HDF5 input data, '
                                'pass shuffle="batch".')

            inp = ins_batch + theta + all_actions
            outs = f(*inp)
            n_updates += 1

            if self.update_theta_every > 0 \
                    and n_updates % self.update_theta_every == 0:
                tmp = self.apply_bo(theta[0],
                                    n_times=self.steps_per_theta_update)
                theta = [tmp]
                for _ in range(len(self.theta_list) - 1):
                    if self.incremental:
                        tmp = tmp + self.bellman_model.predict(tmp)
                    else:
                        tmp = self.bellman_model.predict(tmp)
                    theta += [tmp]

                if term_condition is not None:
                    stop = term_condition(old_theta, theta)
                    if stop:
                        break
                    old_theta = theta

    # finally apply the bellman operator K-times to get the final point
    self.learned_theta_value = self.apply_bo(theta[0], n_times=100)
    if self.verbose > 1:
        print('learned theta: {}'.format(self.learned_theta_value))

    self.history = history
    return history
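# The `incremental` flag above toggles between applying the learned operator
# as a residual update and applying it directly. A NumPy sketch of the two
# update rules, with a hypothetical f_rho standing in for the symbolic
# bellman_model.predict:
import numpy as np

def apply_bo_sketch(theta, f_rho, n_times=1, incremental=True):
    """Repeated Bellman-operator application as used in fit/apply_bo above."""
    for _ in range(n_times):
        theta = theta + f_rho(theta) if incremental else f_rho(theta)
    return theta

# e.g. with a toy linear operator in place of the learned rho:
f_rho = lambda th: 0.1 * th
apply_bo_sketch(np.ones((1, 4)), f_rho, n_times=3, incremental=True)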
def fit(self, sast, r, batch_size=32, nb_epoch=10, shuffle=True,
        theta_metrics={}):
    """
    Args:
        sast (numpy.array): the samples of (state, action, next_state,
            absorbing flag) concatenated row-wise
            (nsamples, 2 * state_dim + action_dim + 1)
        r (numpy.array): the samples of the reward (nsamples,)
        batch_size (int): dimension of the batch used for a single step
            of the gradient
        nb_epoch (int): number of epochs
        shuffle (boolean): whether to shuffle the training data before
            each epoch.
        theta_metrics (dict): dictionary storing the pairs
            (name: callable object). The callable object/function is used
            to evaluate the Q-function parameters at each iteration.
            The signature of the callable is simple: f(theta)
            e.g.: theta_metrics={'k': lambda theta: evaluate(theta)})

    Returns:
        A dictionary storing train information
    """
    sast = standardize_input_data(
        sast, ["sast"],
        (None, 2 * self.state_dim + self.action_dim + 1),
        exception_prefix="sast")[0]

    next_states_idx = self.state_dim + self.action_dim
    sa = sast[:, :next_states_idx]
    s_next = sast[:, next_states_idx:-1]
    absorbing = sast[:, -1]

    n_updates = 0

    maxq, maxa = self.maxQA(s_next, absorbing)
    if hasattr(self._estimator, "adapt"):
        # update estimator structure
        self._estimator.adapt(iteration=self._iteration)

    # y = np.reshape(r + self.gamma * maxq, (-1, 1))
    y = r + self.gamma * maxq
    ins = [sa, y]

    self._make_train_function()
    f = self.train_function

    nb_train_sample = sa.shape[0]
    index_array = np.arange(nb_train_sample)

    history = {"theta": []}
    for k in theta_metrics.keys():
        history.update({k: []})

    for epoch in range(nb_epoch):
        if shuffle == "batch":
            index_array = batch_shuffle(index_array, batch_size)
        elif shuffle:
            np.random.shuffle(index_array)

        batches = make_batches(nb_train_sample, batch_size)
        for batch_index, (batch_start, batch_end) in enumerate(batches):
            if hasattr(self._estimator, "_model"):
                # note: the original read `self._model`, which is not defined
                # on this class; the estimator's wrapped model is meant here
                ltheta = self._estimator._model.get_weights()
            else:
                ltheta = self._estimator.get_weights()
            history["theta"].append(ltheta)
            for k, v in iteritems(theta_metrics):
                history[k].append(v(ltheta))

            batch_ids = index_array[batch_start:batch_end]
            try:
                if type(ins[-1]) is float:
                    # do not slice the training phase flag
                    ins_batch = slice_X(ins[:-1], batch_ids) + [ins[-1]]
                else:
                    ins_batch = slice_X(ins, batch_ids)
            except TypeError:
                raise Exception("TypeError while preparing batch. "
                                "If using HDF5 input data, "
                                'pass shuffle="batch".')

            outs = f(*ins_batch)
            n_updates += 1

            if self.update_theta_every > 0 \
                    and n_updates % self.update_theta_every == 0:
                maxq, maxa = self.maxQA(s_next, absorbing)
                if hasattr(self._estimator, "adapt"):
                    # update estimator structure
                    self._estimator.adapt(iteration=self._iteration)
                # y = np.reshape(r + self.gamma * maxq, (-1, 1))
                y = r + self.gamma * maxq
                ins = [ins[0], y]

    if self._verbose > 1:
        print("learned theta: {}".format(self._estimator.get_weights()))

    return history
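# For intuition, the FQI target assembled above in plain NumPy. This is a
# sketch: zeroing the bootstrap term for absorbing states is an assumption
# about what maxQA does, since its body is not shown in this section.
import numpy as np

def fqi_target_sketch(r, q_next, absorbing, gamma):
    """Sketch of y = r + gamma * max_a' Q(s', a') with absorbing states
    masked out of the bootstrap (assumed maxQA behavior)."""
    maxq = q_next.max(axis=1) * (1.0 - absorbing)
    return r + gamma * maxq

# toy example: 2 samples, 3 discrete actions, second sample absorbing
q_next = np.array([[1.0, 2.0, 0.5], [0.3, 0.1, 0.2]])
fqi_target_sketch(np.array([0.0, 1.0]), q_next, np.array([0.0, 1.0]), 0.99)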
def fit(self, s, a, s_next, r, batch_size=32, nb_epoch=10, verbose=1,
        callbacks=[], validation_split=0., validation_data=None,
        shuffle=True, theta_metrics={}):
    """
    Args:
        s (numpy.array): the samples of the state (nsamples, state_dim)
        a (numpy.array): the samples of the action (nsamples, action_dim)
        s_next (numpy.array): the samples of the next (reached) state
            (nsamples, state_dim)
        r (numpy.array): the samples of the reward (nsamples,)
        batch_size (int): dimension of the batch used for a single step
            of the gradient
        nb_epoch (int): number of epochs
        verbose (int): 0 or 1. Verbosity mode. 0 = silent, 1 = verbose.
        callbacks (list): list of callbacks to be called during training.
            See [Keras Callbacks](https://keras.io/callbacks/).
        validation_split (float): float between 0 and 1: fraction of the
            training data to be used as validation data. The model will
            set apart this fraction of the training data, will not train
            on it, and will evaluate the loss and any model metrics on
            this data at the end of each epoch.
        validation_data (tuple): data on which to evaluate the loss and
            any model metrics at the end of each epoch. The model will
            not be trained on this data. This could be a tuple
            (val_s, val_a, val_s_next, val_r) or a tuple
            (val_s, val_a, val_s_next, val_r, val_theta).
        shuffle (boolean): whether to shuffle the training data before
            each epoch.
        theta_metrics (dict): dictionary storing the pairs
            (name: callable object). The callable object/function is used
            to evaluate the Q-function parameters at each iteration.
            The signature of the callable is simple: f(theta)
            e.g.: theta_metrics={'k': lambda theta: evaluate(theta)})

    Returns:
        A PBOHistory instance storing train information
    """
    s, a, s_next, r = self._standardize_user_data(s, a, s_next, r,
                                                  check_batch_dim=False)

    all_actions = standardize_input_data(
        self.discrete_actions, ['all_actions'],
        [(None, self.action_dim)] if self.action_dim is not None else None,
        check_batch_dim=False, exception_prefix='discrete_actions')

    # # prepare validation data
    # if validation_data:
    #     do_validation = True
    #     if len(validation_data) == 4:
    #         val_s, val_a, val_s_next, val_r = validation_data
    #     elif len(validation_data) == 5:
    #         val_s, val_a, val_s_next, val_r, val_theta = validation_data
    #     else:
    #         raise
    #
    #     val_s, val_a, val_s_next, val_r, val_theta = \
    #         self._standardize_user_data(
    #             val_s, val_a, val_s_next, val_r, val_theta,
    #             check_batch_dim=False,
    #             batch_size=batch_size)
    #     self._make_test_function()
    #     val_f = self.test_function
    #     val_ins = val_s + val_a + val_s_next + [val_r]
    #
    # elif validation_split and 0. < validation_split < 1.:
    #     do_validation = True
    #     split_at = int(len(x[0]) * (1. - validation_split))
    #     x, val_x = (slice_X(x, 0, split_at), slice_X(x, split_at))
    #     y, val_y = (slice_X(y, 0, split_at), slice_X(y, split_at))
    #     sample_weights, val_sample_weights = (
    #         slice_X(sample_weights, 0, split_at),
    #         slice_X(sample_weights, split_at))
    #     self._make_test_function()
    #     val_f = self.test_function
    #     if self.uses_learning_phase and type(K.learning_phase()) is not int:
    #         val_ins = val_x + val_y + val_sample_weights + [0.]
    #     else:
    #         val_ins = val_x + val_y + val_sample_weights
    # else:
    #     do_validation = False
    #     val_f = None
    #     val_ins = None
    do_validation = False
    val_f = None
    val_ins = None

    ins = s + a + s_next + [r]
    self._make_train_function()
    f = self.train_function

    # prepare display labels
    out_labels = ['bellman_error']
    if do_validation:
        callback_metrics = copy.copy(out_labels) + \
            ['val_' + n for n in out_labels]
    else:
        callback_metrics = copy.copy(out_labels)

    return self._fit_loop(f, ins, all_actions,
                          out_labels=out_labels,
                          batch_size=batch_size, nb_epoch=nb_epoch,
                          verbose=verbose, callbacks=callbacks,
                          val_f=val_f, val_ins=val_ins, shuffle=shuffle,
                          callback_metrics=callback_metrics,
                          theta_metrics=theta_metrics)
def __init__(self, bellman_model, q_model, steps_ahead, gamma,
             discrete_actions, optimizer, state_dim=None, action_dim=None,
             incremental=True, norm_value=np.inf, update_theta_every=1,
             steps_per_theta_update=None, independent=False, verbose=0,
             term_condition=None):
    # save MDP information
    self.state_dim = state_dim
    self.action_dim = action_dim
    self.incremental = incremental
    self.gamma = gamma
    self.norm_value = norm_value
    self.update_theta_every = update_theta_every \
        if update_theta_every > 0 else -1
    self.verbose = verbose
    self.independent = independent
    self.steps_per_theta_update = steps_ahead \
        if steps_per_theta_update is None else max(1, steps_per_theta_update)

    # create theano variables
    T_s = T.dmatrix()
    T_a = T.dmatrix()
    T_s_next = T.dmatrix()
    T_r = T.dvector()
    T_absorbing = T.dvector()
    # T_r = T.dmatrix()
    T_discrete_actions = T.dmatrix()

    # store models of bellman apx and Q-function
    self.bellman_model = bellman_model
    self.q_model = q_model
    self.steps_ahead = steps_ahead

    # define bellman operator (check that BOP has only one output)
    assert isinstance(bellman_model.inputs, list)
    assert len(bellman_model.inputs) == 1
    assert isinstance(bellman_model.outputs, list)
    assert len(bellman_model.outputs) == 1

    # construct (theano) Bellman error
    self.theta_list = [bellman_model.inputs[0]]
    if not independent:
        self.T_bellman_err, _ = self.k_step_bellman_error(
            T_s, T_a, T_s_next, T_r, T_absorbing, self.theta_list[0],
            gamma, T_discrete_actions, steps_ahead)
        assert len(self.theta_list) == 1
    else:
        # theta_0, theta_1, ..., theta_steps
        self.theta_list += [T.fmatrix(str(ll))
                            for ll in range(steps_ahead - 1)]
        T_bellman_err = None
        for theta in self.theta_list:
            if T_bellman_err is None:
                T_bellman_err = self.bellman_error(
                    T_s, T_a, T_s_next, T_r, theta, gamma,
                    T_discrete_actions)[0]
            else:
                T_bellman_err = T_bellman_err + self.bellman_error(
                    T_s, T_a, T_s_next, T_r, theta, gamma,
                    T_discrete_actions)[0]
        self.T_bellman_err = T_bellman_err
        assert len(self.theta_list) == steps_ahead

    # define function to be used for train and drawing actions
    self.train_function = None
    self.draw_action_function = None

    self.T_s = T_s
    self.T_a = T_a
    self.T_s_next = T_s_next
    self.T_r = T_r
    self.T_discrete_actions = T_discrete_actions
    self.T_absorbing = T_absorbing

    # get keras optimizer
    self.optimizer = optimizers.get(optimizer)

    # validate input data (the output is a list storing the validated input)
    self.discrete_actions = standardize_input_data(
        discrete_actions, ['discrete_actions'],
        [(None, self.action_dim)] if self.action_dim is not None else None,
        exception_prefix='discrete_actions')

    if isinstance(term_condition, str):
        self.term_condition = DEFAULT_TERM[term_condition]
    else:
        self.term_condition = term_condition
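# In the `independent` branch above, the k-step objective is simply the sum
# of single-step Bellman errors, one per theta in the list. A numeric sketch:
# `bellman_error_value` is a hypothetical stand-in for evaluating the
# symbolic self.bellman_error(...)[0] at a concrete theta.
def independent_k_step_error(thetas, bellman_error_value):
    return sum(bellman_error_value(theta) for theta in thetas)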