def __init__(
    self,
    fc_parameters: DiscreteActionModelParameters,
    cnn_parameters: CNNModelParameters,
    img_height: int,
    img_width: int,
) -> None:
    MLConvTrainer.__init__(self, "ml_conv_trainer", fc_parameters.training,
                           cnn_parameters, img_height, img_width)

    self.target_network = TargetNetwork(
        self, fc_parameters.rl.target_update_rate)

    self.reward_burnin = fc_parameters.rl.reward_burnin
    self.maxq_learning = fc_parameters.rl.maxq_learning
    self.rl_discount_rate = fc_parameters.rl.gamma

    self.training_iteration = 0
    self._buffers = None
    self.minibatch_size = fc_parameters.training.minibatch_size
    self.skip_normalization = True
class RLTrainer(MLTrainer):
    def __init__(
        self,
        parameters: Union[DiscreteActionModelParameters,
                          ContinuousActionModelParameters],
    ) -> None:
        logger.info(str(parameters))

        assert parameters.training.layers[0] >= 0,\
            "Set layers[0] to the number of features"
        self.num_features = parameters.training.layers[0]

        MLTrainer.__init__(self, "rl_trainer", parameters.training)

        self.target_network = TargetNetwork(self,
                                            parameters.rl.target_update_rate)

        self.reward_burnin = parameters.rl.reward_burnin
        self.maxq_learning = parameters.rl.maxq_learning
        self.rl_discount_rate = parameters.rl.gamma

        self.training_iteration = 0
        self.minibatch_size = parameters.training.minibatch_size

    @property
    def sarsa(self) -> bool:
        """
        Returns whether or not this trainer generates target values using
        SARSA.
        """
        return not self.maxq_learning

    def stream_tdp(self, tdp: TrainingDataPage,
                   evaluator: Optional[Evaluator] = None) -> None:
        """
        Loads a large batch of transitions from a page of training data. This
        batch will be further broken down into minibatches for training.

        :param tdp: TrainingDataPage object that supplies transitions.
        :param evaluator: Evaluator object to record TD and compute MC losses.
        """
        raise NotImplementedError()

    def get_max_q_values(self, next_states: np.ndarray,
                         possible_next_actions) -> np.ndarray:
        """
        Takes in an array of next_states and outputs an array of the same
        shape whose ith entry = max_{pna} Q(state_i, pna). Uses target network
        for Q(state_i, pna) approximation.

        :param next_states: Numpy array with shape (batch_size, state_dim).
            Each row contains a representation of a state.
        :param possible_next_actions: See subclass' `get_max_q_values`
            documentation.
        """
        raise NotImplementedError()

    def get_sarsa_values(self, next_states: np.ndarray,
                         next_actions: np.ndarray) -> np.ndarray:
        """
        Takes in a set of next_states and corresponding next_actions. For
        each (next_state_i, next_action_i) pair, calculates
        Q(next_state, next_action). Returns these q values in a Numpy array
        of shape (batch_size, 1).

        :param next_states: Numpy array with shape (batch_size, state_dim).
            The ith row is a representation of the ith transition's
            next_state.
        :param next_actions: See subclass' `get_sarsa_values` documentation.
        """
        raise NotImplementedError()

    def update_model(self, states: np.ndarray, actions: np.ndarray,
                     q_vals_target: np.ndarray) -> None:
        """
        Takes in states, actions, and target q values. Updates the model:
            Runs the forward pass, computing Q(states, actions).
                Q(states, actions)[i][j] is an approximation of
                Q*(states[i], action_j).
            Computes the loss of Q(states, actions) with respect to
                q_vals_targets.
            Updates Q Network's weights according to loss and optimizer.

        :param states: Numpy array with shape (batch_size, state_dim). The
            ith row is a representation of the ith transition's state.
        :param actions: Numpy array with shape (batch_size, action_dim). The
            ith row is a representation of the ith transition's action.
        :param q_vals_targets: Numpy array with shape (batch_size, 1). The
            ith row is the label to train against for the data from the ith
            transition.
        """
        raise NotImplementedError()

    def stream(self, states, actions, rewards, next_states, next_actions,
               not_terminals, possible_next_actions, reward_timelines,
               evaluator):
        """
        Load large batch as training set. This batch will be broken down into
        minibatches. Assumes that states, next_states, and actions (in the
        parametric action case) need no further normalization.
        """
        assert rewards.ndim == 2
        assert not_terminals.ndim == 2

        page_size = states.shape[0]
        assert page_size == self.minibatch_size

        self.train(
            states,
            actions,
            rewards,
            next_states,
            next_actions,
            not_terminals,
            possible_next_actions,
        )
        if evaluator is not None:
            evaluator.report(
                reward_timelines,
                self.get_q_values(states, actions),
                workspace.FetchBlob(self.loss_blob),
            )

    def train(
        self,
        states: np.ndarray,
        actions: np.ndarray,
        rewards: np.ndarray,
        next_states: np.ndarray,
        next_actions: Optional[np.ndarray],
        not_terminals: np.ndarray,
        possible_next_actions,
    ) -> None:
        """
        Takes in a batch of transitions. For transition i, calculates the
        target qval:
            next_q_values_i = {
                max_{pna_i} Q(next_state_i, pna_i), self.maxq_learning
                Q(next_state_i, next_action_i),     self.sarsa
            }
            q_val_target_i = {
                r_i + gamma * next_q_values_i, not_terminals_i
                r_i,                           !not_terminals_i
            }
        Trains the Q Network on the q_val_targets as labels.

        :param states: Numpy array with shape (batch_size, state_dim). The
            ith row is a representation of the ith transition's state.
        :param actions: See subclass' `train` documentation.
        :param rewards: Numpy array with shape (batch_size, 1). The ith entry
            is the reward experienced at the ith transition.
        :param not_terminals: Numpy array with shape (batch_size, 1). The ith
            entry is equal to 1 iff the ith transition's state is not
            terminal.
        :param next_states: Numpy array with shape (batch_size, state_dim).
            The ith row is a representation of the ith transition's next
            state.
        :param next_actions: See subclass' `train` documentation.
        :param possible_next_actions: See subclass' `train` documentation.
        """
        batch_size = self.minibatch_size
        assert rewards.shape == (batch_size, 1), \
            "Invalid reward shape: " + str(rewards.shape) + " != " + \
            str(self.minibatch_size)
        assert rewards.dtype == np.float32
        assert not_terminals.shape == (batch_size, 1), \
            'terminals invalid ' + str(not_terminals.shape)

        q_vals_target = np.copy(rewards)
        if self.training_iteration >= self.reward_burnin:
            if self.training_iteration == self.reward_burnin:
                logger.info(
                    "Minibatch number == reward_burnin. Starting RL updates.")
            if self.maxq_learning:
                next_q_values = self.get_max_q_values(next_states,
                                                      possible_next_actions)
            else:
                next_q_values = self.get_sarsa_values(next_states,
                                                      next_actions)

            q_vals_target += not_terminals * self.rl_discount_rate * \
                next_q_values

        self.update_model(states, actions, q_vals_target)

        if self.training_iteration >= self.reward_burnin:
            self.target_network.enable_slow_updates()
        self.target_network.target_update()
        self.training_iteration += 1
class RLTrainer(MLTrainer):
    def __init__(
        self,
        state_normalization_parameters: Dict[str, NormalizationParameters],
        parameters: Union[DiscreteActionModelParameters,
                          ContinuousActionModelParameters],
        skip_normalization: Optional[bool] = False,
    ) -> None:
        print(state_normalization_parameters)
        print(parameters)

        self._state_normalization_parameters = state_normalization_parameters
        MLTrainer.__init__(self, "rl_trainer", parameters.training)

        self.target_network = TargetNetwork(self,
                                            parameters.rl.target_update_rate)

        self.reward_burnin = parameters.rl.reward_burnin
        self.maxq_learning = parameters.rl.maxq_learning
        self.rl_discount_rate = parameters.rl.gamma

        self.training_iteration = 0
        self._buffers = None
        self.minibatch_size = parameters.training.minibatch_size

        self.skip_normalization = skip_normalization
        self._prepare_state_normalization()

    def _normalize_states(self, states: np.ndarray) -> np.ndarray:
        """
        Normalizes input states and replaces NaNs with 0. Returns a matrix of
        the same shape. Make sure to have set up the underlying normalization
        net with `_prepare_state_normalization`.

        :param states: Numpy array with shape (batch_size, state_dim)
            containing raw state inputs
        """
        if self.skip_normalization:
            return states
        return normalize_dense_matrix(states, self._state_features,
                                      self._state_normalization_parameters,
                                      self.state_norm_blobs,
                                      self.state_norm_net,
                                      self.state_norm_blobname_template,
                                      self.num_state_features)

    def _prepare_state_normalization(self):
        """
        Sets up operators for the state normalization net.
        """
        if self.skip_normalization:
            return
        self._state_features = list(
            self._state_normalization_parameters.keys())
        self.state_norm_net = core.Net("state_norm_net")
        self.state_norm_blobname_template = '{}_input_state'
        self.state_norm_blobs = prepare_normalization(
            self.state_norm_net, self._state_normalization_parameters,
            self._state_features, self.state_norm_blobname_template, True)

    def get_state_features(self) -> List[str]:
        return self._state_features

    @property
    def num_state_features(self) -> int:
        """
        Returns the number of features in each preprocessed state.
        """
        raise NotImplementedError()

    @property
    def sarsa(self) -> bool:
        """
        Returns whether or not this trainer generates target values using
        SARSA.
        """
        return not self.maxq_learning

    def predictor(self) -> RLPredictor:
        """
        Builds a Predictor using the networks underlying this Trainer.
        """
        raise NotImplementedError()

    def stream_tdp(self, tdp: TrainingDataPage,
                   evaluator: Optional[Evaluator] = None) -> None:
        """
        Loads a large batch of transitions from a page of training data. This
        batch will be further broken down into minibatches for training.

        :param tdp: TrainingDataPage object that supplies transitions.
        :param evaluator: Evaluator object to record TD and compute MC losses.
        """
        raise NotImplementedError()

    def get_max_q_values(self, next_states: np.ndarray,
                         possible_next_actions) -> np.ndarray:
        """
        Takes in an array of next_states and outputs an array of the same
        shape whose ith entry = max_{pna} Q(state_i, pna). Uses target network
        for Q(state_i, pna) approximation.

        :param next_states: Numpy array with shape (batch_size, state_dim).
            Each row contains a representation of a state.
        :param possible_next_actions: See subclass' `get_max_q_values`
            documentation.
        """
        raise NotImplementedError()

    def get_sarsa_values(self, next_states: np.ndarray,
                         next_actions: np.ndarray) -> np.ndarray:
        """
        Takes in a set of next_states and corresponding next_actions. For
        each (next_state_i, next_action_i) pair, calculates
        Q(next_state, next_action). Returns these q values in a Numpy array
        of shape (batch_size, 1).

        :param next_states: Numpy array with shape (batch_size, state_dim).
            The ith row is a representation of the ith transition's
            next_state.
        :param next_actions: See subclass' `get_sarsa_values` documentation.
        """
        raise NotImplementedError()

    def update_model(self, states: np.ndarray, actions: np.ndarray,
                     q_vals_target: np.ndarray) -> None:
        """
        Takes in states, actions, and target q values. Updates the model:
            Runs the forward pass, computing Q(states, actions).
                Q(states, actions)[i][j] is an approximation of
                Q*(states[i], action_j).
            Computes the loss of Q(states, actions) with respect to
                q_vals_targets.
            Updates Q Network's weights according to loss and optimizer.

        :param states: Numpy array with shape (batch_size, state_dim). The
            ith row is a representation of the ith transition's state.
        :param actions: Numpy array with shape (batch_size, action_dim). The
            ith row is a representation of the ith transition's action.
        :param q_vals_targets: Numpy array with shape (batch_size, 1). The
            ith row is the label to train against for the data from the ith
            transition.
        """
        raise NotImplementedError()

    def _validate_train_inputs(
        self,
        states: np.ndarray,
        actions: np.ndarray,
        rewards: np.ndarray,
        next_states: np.ndarray,
        next_actions: Optional[np.ndarray],
        not_terminals: np.ndarray,
        possible_next_actions: np.ndarray,
    ):
        raise NotImplementedError()

    def stream(self, states, actions, rewards, next_states, next_actions,
               not_terminals, possible_next_actions, reward_timelines,
               evaluator):
        """
        Load large batch as training set. This batch will be broken down into
        minibatches. Assumes that states, next_states, and actions (in the
        parametric action case) need no further normalization.
        """
        if rewards.ndim == 1:
            rewards = rewards.reshape(-1, 1)
        if not_terminals.ndim == 1:
            not_terminals = not_terminals.reshape(-1, 1)

        use_next_actions = next_actions is not None and self.sarsa
        use_pna = possible_next_actions is not None and self.maxq_learning
        use_rt = reward_timelines is not None

        num_buffers = 8
        if self._buffers is not None and self._buffers[0].shape[0] > 0:
            actions = np.concatenate([self._buffers[0], actions])
            states = np.concatenate([self._buffers[1], states])
            rewards = np.concatenate([self._buffers[2], rewards])
            next_states = np.concatenate([self._buffers[3], next_states])
            if use_next_actions:
                next_actions = np.concatenate(
                    [self._buffers[4], next_actions])
            not_terminals = np.concatenate([self._buffers[5], not_terminals])
            if use_pna:
                possible_next_actions = np.concatenate(
                    [self._buffers[6], possible_next_actions])
            if use_rt:
                reward_timelines = np.concatenate(
                    [self._buffers[7], reward_timelines])

        self._buffers = None
        page_size = states.shape[0]

        for batch_start in range(0, page_size, self.minibatch_size):
            batch_end = batch_start + self.minibatch_size
            if page_size < batch_end:
                # Not enough rows for a full minibatch: stash the remainder
                # until the next page arrives.
                self._buffers = [[] for _ in range(num_buffers)]
                self._buffers[0] = actions[batch_start:]
                self._buffers[1] = states[batch_start:]
                self._buffers[2] = rewards[batch_start:]
                self._buffers[3] = next_states[batch_start:]
                if use_next_actions:
                    self._buffers[4] = next_actions[batch_start:]
                self._buffers[5] = not_terminals[batch_start:]
                if use_pna:
                    self._buffers[6] = possible_next_actions[batch_start:]
                if use_rt:
                    self._buffers[7] = reward_timelines[batch_start:]
            else:
                na_batch = (next_actions[batch_start:batch_end]
                            if use_next_actions else None)
                pna_batch = (possible_next_actions[batch_start:batch_end]
                             if use_pna else None)
                rt_batch = (reward_timelines[batch_start:batch_end]
                            if use_rt else None)
                states_batch = states[batch_start:batch_end]
                actions_batch = actions[batch_start:batch_end]
                self.train(states_batch, actions_batch,
                           rewards[batch_start:batch_end],
                           next_states[batch_start:batch_end], na_batch,
                           not_terminals[batch_start:batch_end], pna_batch)
                if evaluator is not None:
                    evaluator.report(
                        rt_batch,
                        self.get_q_values(states_batch, actions_batch),
                        workspace.FetchBlob(self.loss_blob))

    def train(self, states: np.ndarray, actions: np.ndarray,
              rewards: np.ndarray, next_states: np.ndarray,
              next_actions: Optional[np.ndarray], not_terminals: np.ndarray,
              possible_next_actions: Optional[List]) -> None:
        """
        Takes in a batch of transitions. For transition i, calculates the
        target qval:
            next_q_values_i = {
                max_{pna_i} Q(next_state_i, pna_i), self.maxq_learning
                Q(next_state_i, next_action_i),     self.sarsa
            }
            q_val_target_i = {
                r_i + gamma * next_q_values_i, not_terminals_i
                r_i,                           !not_terminals_i
            }
        Trains the Q Network on the q_val_targets as labels.

        :param states: Numpy array with shape (batch_size, state_dim). The
            ith row is a representation of the ith transition's state.
        :param actions: See subclass' `train` documentation.
        :param rewards: Numpy array with shape (batch_size, 1). The ith entry
            is the reward experienced at the ith transition.
        :param not_terminals: Numpy array with shape (batch_size, 1). The ith
            entry is equal to 1 iff the ith transition's state is not
            terminal.
        :param next_states: Numpy array with shape (batch_size, state_dim).
            The ith row is a representation of the ith transition's next
            state.
        :param next_actions: See subclass' `train` documentation.
        :param possible_next_actions: See subclass' `train` documentation.
        """
        self._validate_train_inputs(states, actions, rewards, next_states,
                                    next_actions, not_terminals,
                                    possible_next_actions)

        batch_size = self.minibatch_size
        assert rewards.shape == (batch_size, 1)
        assert rewards.dtype == np.float32
        assert not_terminals.shape == (batch_size, 1)

        q_vals_target = np.copy(rewards)
        if self.training_iteration >= self.reward_burnin:
            if self.training_iteration == self.reward_burnin:
                logger.info(
                    "Minibatch number == reward_burnin. Starting RL updates.")
            if self.maxq_learning:
                next_q_values = self.get_max_q_values(next_states,
                                                      possible_next_actions)
            else:
                next_q_values = self.get_sarsa_values(next_states,
                                                      next_actions)

            q_vals_target += not_terminals * self.rl_discount_rate * \
                next_q_values

        self.update_model(states, actions, q_vals_target)

        if self.training_iteration >= self.reward_burnin:
            self.target_network.enable_slow_updates()
        self.target_network.target_update()
        self.training_iteration += 1
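# Illustrative sketch (assumed toy sizes) of the leftover-carry behaviour in
# `stream`: rows that do not fill a whole minibatch are stashed in
# self._buffers and concatenated in front of the next page, so transitions
# are never dropped at page boundaries.
_minibatch_size = 16
_page_size = 40                                            # e.g. states.shape[0] for one page
_full_batches = _page_size // _minibatch_size              # 2 minibatches trained this page
_leftover = _page_size - _full_batches * _minibatch_size   # 8 rows buffered for the next page
assert (_full_batches, _leftover) == (2, 8)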
class RLTrainer:
    num_trainers = 0
    DEFAULT_TRAINING_NUM_WORKERS = 4

    def __init__(
        self,
        parameters: Union[DiscreteActionModelParameters,
                          ContinuousActionModelParameters],
    ) -> None:
        logger.info(str(parameters))

        RLTrainer.num_trainers += 1
        self.model_id = RL_TRAINER_PREFIX + str(RLTrainer.num_trainers)

        if parameters.training.cnn_parameters is not None:
            self.conv_ml_trainer = ConvMLTrainer(
                CONV_ML_TRAINER_PREFIX + str(RLTrainer.num_trainers),
                parameters.training.cnn_parameters,
            )

            # The final layer of the conv net is the input to the fc net.
            parameters.training.layers[
                0] = self.conv_ml_trainer.get_output_size()

            self.conv_target_network = ConvTargetNetwork(
                CONV_TARGET_NETWORK_PREFIX + str(RLTrainer.num_trainers),
                parameters.training.cnn_parameters,
                parameters.rl.target_update_rate,
                self.conv_ml_trainer,
            )
        else:
            self.conv_ml_trainer = None
            self.conv_target_network = None

        assert (parameters.training.layers[0] >= 0), \
            "Set layers[0] to the number of features"

        self.ml_trainer = MLTrainer(
            ML_TRAINER_PREFIX + str(RLTrainer.num_trainers),
            parameters.training)

        self.target_network = TargetNetwork(
            TARGET_NETWORK_PREFIX + str(RLTrainer.num_trainers),
            parameters.training,
            parameters.rl.target_update_rate,
            self.ml_trainer,
        )

        self.reward_burnin = parameters.rl.reward_burnin
        self.maxq_learning = parameters.rl.maxq_learning
        self.rl_discount_rate = parameters.rl.gamma
        self.rl_temperature = parameters.rl.temperature
        self.use_seq_num_diff_as_time_diff = (
            parameters.rl.use_seq_num_diff_as_time_diff)
        self.training_iteration = 0
        self.minibatch_size = parameters.training.minibatch_size
        self.parameters = parameters
        self.loss_blob: Optional[str] = None

        workspace.FeedBlob("states", np.array([0], dtype=np.float32))
        workspace.FeedBlob("actions", np.array([0], dtype=np.float32))
        workspace.FeedBlob("rewards", np.array([0], dtype=np.float32))
        workspace.FeedBlob("next_states", np.array([0], dtype=np.float32))
        workspace.FeedBlob("not_terminals", np.array([0], dtype=np.float32))
        if self.maxq_learning:
            workspace.FeedBlob("possible_next_actions",
                               np.array([0], dtype=np.float32))
            workspace.FeedBlob("possible_next_actions_lengths",
                               np.array([0], dtype=np.float32))
        else:
            workspace.FeedBlob("next_actions",
                               np.array([0], dtype=np.float32))
        # Setting to 1 serves as a 1 unit time_diff if not set by user
        workspace.FeedBlob("time_diff", np.array([1], dtype=np.float32))

        self.rl_train_model: Optional[ModelHelper] = None
        self.reward_train_model: Optional[ModelHelper] = None
        self.q_score_model: Optional[ModelHelper] = None
        self._create_reward_train_net()
        self._create_rl_train_net()
        self._create_q_score_net()
        assert self.rl_train_model is not None
        assert self.reward_train_model is not None
        assert self.q_score_model is not None

    def get_possible_next_actions(self):
        raise NotImplementedError()

    def get_max_q_values(self, next_states: str, possible_next_actions,
                         use_target_network: bool) -> str:
        """
        Takes in an array of next_states and outputs an array of the same
        shape whose ith entry = max_{pna} Q(state_i, pna). Uses target network
        for Q(state_i, pna) approximation.

        :param next_states: Numpy array with shape (batch_size, state_dim).
            Each row contains a representation of a state.
        :param possible_next_actions: See subclass' `get_max_q_values`
            documentation.
        """
        raise NotImplementedError()

    def get_q_values(self, states: str, actions: str,
                     use_target_network: bool) -> str:
        """
        Takes in a set of states and corresponding actions. For each
        (state_i, action_i) pair, calculates Q(state, action). Returns these
        q values in a Numpy array of shape (batch_size, 1).

        :param states: Numpy array with shape (batch_size, state_dim). The
            ith row is a representation of the ith transition's state.
        :param actions: See subclass' `get_q_values` documentation.
        """
        raise NotImplementedError()

    def update_model(self, states: str, actions: str,
                     q_vals_target: str) -> None:
        """
        Takes in states, actions, and target q values. Updates the model:
            Runs the forward pass, computing Q(states, actions).
                Q(states, actions)[i][j] is an approximation of
                Q*(states[i], action_j).
            Computes the loss of Q(states, actions) with respect to
                q_vals_targets.
            Updates Q Network's weights according to loss and optimizer.

        :param states: Numpy array with shape (batch_size, state_dim). The
            ith row is a representation of the ith transition's state.
        :param actions: Numpy array with shape (batch_size, action_dim). The
            ith row is a representation of the ith transition's action.
        :param q_vals_targets: Numpy array with shape (batch_size, 1). The
            ith row is the label to train against for the data from the ith
            transition.
        """
        raise NotImplementedError()

    def _create_reward_train_net(self) -> None:
        raise NotImplementedError()

    def _create_rl_train_net(self) -> None:
        raise NotImplementedError()

    def _create_q_score_net(self) -> None:
        self.q_score_model = ModelHelper(name="q_score_" + self.model_id)
        C2.set_model(self.q_score_model)
        self.q_score_output = self.get_q_values("states", "actions", True)
        workspace.RunNetOnce(self.q_score_model.param_init_net)
        self.q_score_model.net.Proto().num_workers = (
            RLTrainer.DEFAULT_TRAINING_NUM_WORKERS)
        self.q_score_model.net.Proto().type = "async_scheduling"
        workspace.CreateNet(self.q_score_model.net)
        C2.set_model(None)

    def train_numpy(self, tdp: TrainingDataPage,
                    evaluator: Optional[Evaluator]):
        workspace.FeedBlob("states", tdp.states)
        workspace.FeedBlob("actions", tdp.actions)
        workspace.FeedBlob("rewards", tdp.rewards)
        workspace.FeedBlob("next_states", tdp.next_states)
        workspace.FeedBlob("not_terminals", tdp.not_terminals)
        workspace.FeedBlob("time_diff", np.array([1], dtype=np.float32))
        if self.maxq_learning:
            if isinstance(tdp.possible_next_actions, StackedArray):
                workspace.FeedBlob("possible_next_actions",
                                   tdp.possible_next_actions.values)
                workspace.FeedBlob("possible_next_actions_lengths",
                                   tdp.possible_next_actions.lengths)
            else:
                workspace.FeedBlob("possible_next_actions",
                                   tdp.possible_next_actions)
        else:
            workspace.FeedBlob("next_actions", tdp.next_actions)
        self.train()
        if evaluator is not None:
            self.evaluate(evaluator, tdp.actions, tdp.propensities,
                          tdp.episode_values)

    def train(self) -> None:
        assert self.rl_train_model is not None
        assert self.reward_train_model is not None
        assert self.q_score_model is not None

        if self.training_iteration >= self.reward_burnin:
            if self.training_iteration == self.reward_burnin:
                logger.info(
                    "Minibatch number == reward_burnin. Starting RL updates.")
                self.target_network.enable_slow_updates()
                if self.conv_target_network:
                    self.conv_target_network.enable_slow_updates()
            workspace.RunNet(self.rl_train_model.net)
        else:
            workspace.RunNet(self.reward_train_model.net)

        workspace.RunNet(self.target_network._update_model.net)
        if self.conv_target_network:
            workspace.RunNet(self.conv_target_network._update_model.net)
        self.training_iteration += 1
        workspace.RunNet(self.q_score_model.net)

    def evaluate(
        self,
        evaluator: Optional[Evaluator],
        logged_actions: Optional[np.ndarray],
        logged_propensities: Optional[np.ndarray],
        logged_values: Optional[np.ndarray],
    ):
        raise NotImplementedError()

    def build_predictor(self, model, input_blob, output_blob) -> List[str]:
        retval: List[str] = []
        if self.conv_ml_trainer is not None:
            conv_output = model.net.NextBlob("conv_output")
            retval = self.conv_ml_trainer.build_predictor(
                model, input_blob, conv_output)
            conv_output_flat = model.net.NextBlob("conv_output_flat")
            model.net.Flatten([conv_output], [conv_output_flat])
            input_blob = conv_output_flat
        retval += self.ml_trainer.build_predictor(model, input_blob,
                                                  output_blob)
        return retval
class RLTrainer(MLTrainer):
    def __init__(
        self,
        parameters: Union[DiscreteActionModelParameters,
                          ContinuousActionModelParameters],
    ) -> None:
        logger.info(str(parameters))

        assert parameters.training.layers[0] >= 0,\
            "Set layers[0] to the number of features"
        self.num_features = parameters.training.layers[0]

        MLTrainer.__init__(self, RL_TRAINER_MODEL_ID, parameters.training)

        self.target_network = TargetNetwork(
            self, parameters.rl.target_update_rate
        )

        self.reward_burnin = parameters.rl.reward_burnin
        self.maxq_learning = parameters.rl.maxq_learning
        self.rl_discount_rate = parameters.rl.gamma
        self.rl_temperature = parameters.rl.temperature

        self.training_iteration = 0
        self.minibatch_size = parameters.training.minibatch_size
        self.parameters = parameters
        self.loss_blob: Optional[str] = None

        workspace.FeedBlob('states', np.array([0], dtype=np.float32))
        workspace.FeedBlob('actions', np.array([0], dtype=np.float32))
        workspace.FeedBlob('rewards', np.array([0], dtype=np.float32))
        workspace.FeedBlob('next_states', np.array([0], dtype=np.float32))
        workspace.FeedBlob('not_terminals', np.array([0], dtype=np.float32))
        workspace.FeedBlob('next_actions', np.array([0], dtype=np.float32))
        workspace.FeedBlob(
            'possible_next_actions', np.array([0], dtype=np.float32)
        )
        workspace.FeedBlob(
            'possible_next_actions_lengths', np.array([0], dtype=np.float32)
        )

        self.rl_train_model: Optional[ModelHelper] = None
        self.reward_train_model: Optional[ModelHelper] = None
        self.q_score_model: Optional[ModelHelper] = None
        self._create_reward_train_net()
        self._create_rl_train_net()
        self._create_q_score_net()
        assert self.rl_train_model is not None
        assert self.reward_train_model is not None
        assert self.q_score_model is not None

    def get_possible_next_actions(self):
        raise NotImplementedError()

    def get_max_q_values(
        self,
        next_states: str,
        possible_next_actions,
        use_target_network: bool,
    ) -> str:
        """
        Takes in an array of next_states and outputs an array of the same
        shape whose ith entry = max_{pna} Q(state_i, pna). Uses target network
        for Q(state_i, pna) approximation.

        :param next_states: Numpy array with shape (batch_size, state_dim).
            Each row contains a representation of a state.
        :param possible_next_actions: See subclass' `get_max_q_values`
            documentation.
        """
        raise NotImplementedError()

    def get_q_values(
        self,
        states: str,
        actions: str,
        use_target_network: bool,
    ) -> str:
        """
        Takes in a set of states and corresponding actions. For each
        (state_i, action_i) pair, calculates Q(state, action). Returns these
        q values in a Numpy array of shape (batch_size, 1).

        :param states: Numpy array with shape (batch_size, state_dim). The
            ith row is a representation of the ith transition's state.
        :param actions: See subclass' `get_q_values` documentation.
        """
        raise NotImplementedError()

    def update_model(
        self,
        states: str,
        actions: str,
        q_vals_target: str,
    ) -> None:
        """
        Takes in states, actions, and target q values. Updates the model:
            Runs the forward pass, computing Q(states, actions).
                Q(states, actions)[i][j] is an approximation of
                Q*(states[i], action_j).
            Computes the loss of Q(states, actions) with respect to
                q_vals_targets.
            Updates Q Network's weights according to loss and optimizer.

        :param states: Numpy array with shape (batch_size, state_dim). The
            ith row is a representation of the ith transition's state.
        :param actions: Numpy array with shape (batch_size, action_dim). The
            ith row is a representation of the ith transition's action.
        :param q_vals_targets: Numpy array with shape (batch_size, 1). The
            ith row is the label to train against for the data from the ith
            transition.
        """
        raise NotImplementedError()

    def _create_reward_train_net(self) -> None:
        raise NotImplementedError()

    def _create_rl_train_net(self) -> None:
        raise NotImplementedError()

    def _create_q_score_net(self) -> None:
        self.q_score_model = ModelHelper(name="q_score_" + self.model_id)
        C2.set_model(self.q_score_model)
        self.q_score_output = self.get_q_values('states', 'actions', True)
        workspace.RunNetOnce(self.q_score_model.param_init_net)
        workspace.CreateNet(self.q_score_model.net)
        C2.set_model(None)

    def train_numpy(
        self,
        tdp: TrainingDataPage,
        evaluator: Optional[Evaluator],
    ):
        workspace.FeedBlob('states', tdp.states)
        workspace.FeedBlob('actions', tdp.actions)
        workspace.FeedBlob('rewards', tdp.rewards)
        workspace.FeedBlob('next_states', tdp.next_states)
        workspace.FeedBlob('not_terminals', tdp.not_terminals)
        if self.maxq_learning:
            if isinstance(tdp.possible_next_actions, StackedArray):
                workspace.FeedBlob(
                    'possible_next_actions', tdp.possible_next_actions.values
                )
                workspace.FeedBlob(
                    'possible_next_actions_lengths',
                    tdp.possible_next_actions.lengths
                )
            else:
                workspace.FeedBlob(
                    'possible_next_actions', tdp.possible_next_actions
                )
        else:
            workspace.FeedBlob('next_actions', tdp.next_actions)
        self.train(tdp.reward_timelines, evaluator)

    def train(
        self,
        reward_timelines: Optional[List[Dict[int, float]]],
        evaluator: Optional[Evaluator],
    ) -> None:
        assert self.rl_train_model is not None
        assert self.reward_train_model is not None
        assert self.q_score_model is not None

        if self.training_iteration >= self.reward_burnin:
            if self.training_iteration == self.reward_burnin:
                logger.info(
                    "Minibatch number == reward_burnin. Starting RL updates."
                )
                self.target_network.enable_slow_updates()
            workspace.RunNet(self.rl_train_model.net)
        else:
            workspace.RunNet(self.reward_train_model.net)

        self.target_network.target_update()
        self.training_iteration += 1
        workspace.RunNet(self.q_score_model.net)
        if evaluator is not None:
            assert reward_timelines is not None
            assert self.loss_blob is not None
            evaluator.report(
                reward_timelines,
                workspace.FetchBlob(self.q_score_output),
                workspace.FetchBlob(self.loss_blob),
            )
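# Illustrative sketch (toy data, assumed dtypes) of the ragged layout that
# `train_numpy` feeds for the max-Q branch when tdp.possible_next_actions is a
# StackedArray-like pair: a flat values array plus a per-transition lengths
# array, matching the 'possible_next_actions' and
# 'possible_next_actions_lengths' blobs.
import numpy as np

_pna_values = np.array([0, 1, 2, 0, 1, 3], dtype=np.float32)  # flattened action encodings
_pna_lengths = np.array([2, 1, 3], dtype=np.int32)            # possible actions per transition
assert int(_pna_lengths.sum()) == _pna_values.shape[0]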