def update_model(self, states: str, actions: str, q_vals_target: str) -> None:
    """
    Add the training ops for a Q-network that scores (state, action) pairs.

    The ops are created in this order:

    1. ``q_vals_target`` is wrapped in ``C2.StopGradient``.
    2. ``states`` and ``actions`` are concatenated along axis 1.
    3. The forward pass is built from the concatenated input into a fresh
       ``"train_output"`` blob, which is used as the predicted Q-values.
    4. Loss ops are generated from the predictions and the stopped target;
       the resulting blob is stored on ``self.loss_blob``.
    5. Gradient operators are added for that loss.
    6. ``C2.NanCheck`` is applied to the gradient of every parameter that
       has an entry in ``model.param_to_grad``.
    7. Parameter update ops are added by ``self.ml_trainer``.

    NOTE(review): the parameters are annotated ``str``, while the earlier
    documentation described them as Numpy arrays. The shapes below are taken
    from that earlier documentation and are not checked here -- confirm
    against the caller.

    :param states: Per-transition state representation, one row per
        transition; described as (batch_size, state_dim).
    :param actions: Per-transition action representation, one row per
        transition; described as (batch_size, action_dim).
    :param q_vals_target: Label to train against for each transition;
        described as (batch_size, 1).
    """
    training_model = C2.model()
    stopped_target = C2.StopGradient(q_vals_target)
    q_values = C2.NextBlob("train_output")

    # Concat yields two outputs; only the concatenated blob is needed here.
    state_action_input, _ = C2.Concat(states, actions, axis=1)

    # The trailing False is forwarded unchanged; its meaning is defined by
    # make_forward_pass_ops.
    self.ml_trainer.make_forward_pass_ops(
        training_model,
        state_action_input,
        q_values,
        False,
    )

    self.loss_blob = self.ml_trainer.generateLossOps(
        training_model,
        q_values,
        stopped_target,
    )
    training_model.AddGradientOperators([self.loss_blob])

    for param in training_model.params:
        if param not in training_model.param_to_grad:
            continue
        # The value returned by NanCheck was never consumed, so the call is
        # made purely for its effect.
        C2.NanCheck(training_model.param_to_grad[param])

    self.ml_trainer.addParameterUpdateOps(training_model)
def update_model(self, states: str, actions: str, q_vals_target: str) -> None:
    """
    Add the training ops for a Q-network with one output per action.

    The ops are created in this order:

    1. ``q_vals_target`` is wrapped in ``C2.StopGradient``.
    2. If ``self.conv_ml_trainer`` is set, its conv pass is built from
       ``states`` into a fresh ``"conv_output"`` blob, and that blob
       replaces ``states`` as the forward-pass input.
    3. The forward pass is built into a fresh ``"train_output"`` blob.
    4. That output is multiplied element-wise by ``actions``, reduced with
       ``C2.ReduceBackSum`` and expanded with ``dims=[1]`` to give the
       predicted Q-values.
    5. Loss ops are generated from the predictions and the stopped target;
       the resulting blob is stored on ``self.loss_blob``.
    6. Gradient operators are added for that loss.
    7. ``C2.NanCheck`` is applied to the gradient of every parameter that
       has an entry in ``model.param_to_grad``.
    8. Parameter update ops are added by ``self.ml_trainer``.

    NOTE(review): the parameters are annotated ``str``, while the earlier
    documentation described them as Numpy arrays. The shapes below are taken
    from that earlier documentation and are not checked here -- confirm
    against the caller.

    :param states: Per-transition state representation, one row per
        transition; described as (batch_size, state_dim).
    :param actions: One-hot representation of the action taken in each
        transition; described as (batch_size, action_dim).
    :param q_vals_target: Label to train against for each transition;
        described as (batch_size, 1).
    """
    training_model = C2.model()
    stopped_target = C2.StopGradient(q_vals_target)
    network_output = C2.NextBlob("train_output")

    network_input = states
    if self.conv_ml_trainer is not None:
        conv_features = C2.NextBlob("conv_output")
        self.conv_ml_trainer.make_conv_pass_ops(
            training_model,
            states,
            conv_features,
        )
        network_input = conv_features

    # The trailing False is forwarded unchanged; its meaning is defined by
    # make_forward_pass_ops.
    self.ml_trainer.make_forward_pass_ops(
        training_model,
        network_input,
        network_output,
        False,
    )

    # Mask the outputs with the action rows, then collapse and re-expand.
    masked_output = C2.Mul(network_output, actions)
    selected_q = C2.ReduceBackSum(masked_output)
    q_values = C2.ExpandDims(selected_q, dims=[1])

    self.loss_blob = self.ml_trainer.generateLossOps(
        training_model,
        q_values,
        stopped_target,
    )
    training_model.AddGradientOperators([self.loss_blob])

    for param in training_model.params:
        if param not in training_model.param_to_grad:
            continue
        # The value returned by NanCheck was never consumed, so the call is
        # made purely for its effect.
        C2.NanCheck(training_model.param_to_grad[param])

    self.ml_trainer.addParameterUpdateOps(training_model)
def update_model(
    self,
    states: str,
    actions: str,
    q_vals_target: str,
) -> None:
    """
    Add the training ops for a Q-network with one output per action.

    The ops are created in this order:

    1. ``q_vals_target`` is wrapped in ``C2.StopGradient``.
    2. ``MakeForwardPassOps`` builds the forward pass from ``states`` into a
       fresh ``"train_output"`` blob, using this object's model id, weights,
       biases, activations, layers and dropout ratio.
    3. That output is multiplied element-wise by ``actions``, reduced with
       ``C2.ReduceBackSum`` and expanded with ``dims=[1]`` to give the
       predicted Q-values.
    4. ``GenerateLossOps`` produces the loss from the predictions and the
       stopped target; the resulting blob is stored on ``self.loss_blob``.
    5. Gradient operators are added for that loss.
    6. ``C2.NanCheck`` is applied to the gradient of every parameter that
       has an entry in ``model.param_to_grad``.
    7. ``AddParameterUpdateOps`` is called with this object's optimizer,
       learning rate, gamma and learning-rate policy.

    NOTE(review): the parameters are annotated ``str``, while the earlier
    documentation described them as Numpy arrays. The shapes below are taken
    from that earlier documentation and are not checked here -- confirm
    against the caller.

    :param states: Per-transition state representation, one row per
        transition; described as (batch_size, state_dim).
    :param actions: One-hot representation of the action taken in each
        transition; described as (batch_size, action_dim).
    :param q_vals_target: Label to train against for each transition;
        described as (batch_size, 1).
    """
    training_model = C2.model()
    stopped_target = C2.StopGradient(q_vals_target)
    network_output = C2.NextBlob("train_output")

    # The trailing False is forwarded unchanged; its meaning is defined by
    # MakeForwardPassOps.
    MakeForwardPassOps(
        training_model,
        self.model_id,
        states,
        network_output,
        self.weights,
        self.biases,
        self.activations,
        self.layers,
        self.dropout_ratio,
        False,
    )

    # Mask the outputs with the action rows, then collapse and re-expand.
    masked_output = C2.Mul(network_output, actions)
    selected_q = C2.ReduceBackSum(masked_output)
    q_values = C2.ExpandDims(selected_q, dims=[1])

    self.loss_blob = GenerateLossOps(training_model, q_values, stopped_target)
    training_model.AddGradientOperators([self.loss_blob])

    for param in training_model.params:
        if param not in training_model.param_to_grad:
            continue
        # The value returned by NanCheck was never consumed, so the call is
        # made purely for its effect.
        C2.NanCheck(training_model.param_to_grad[param])

    AddParameterUpdateOps(
        training_model,
        optimizer_input=self.optimizer,
        base_learning_rate=self.learning_rate,
        gamma=self.gamma,
        policy=self.lr_policy,
    )