class CriticCNNPolicy:
    """Convolutional state-value critic built on CNTK.

    A single image frame (with a leading axis of size 1) is mapped to one
    scalar output, trained with a squared-error loss against a target value
    supplied by the caller.
    """

    def __init__(self, name, observation_space_shape, num_actions,
                 pretrained_policy=None, *args, **kwargs):
        """Build the graph and attach an Adam-based trainer.

        Args:
            name: Label stored on the instance.
            observation_space_shape: Tuple with the shape of one frame.
            num_actions: Stored on the instance; the critic graph itself
                does not read it.
            pretrained_policy: Optional argument forwarded to
                ``C.Function.load``; ``None`` creates a fresh network.
            *args: Accepted for signature compatibility and ignored.
            **kwargs: Accepted for signature compatibility and ignored.
        """
        self.name = name
        self.observation_space_shape = observation_space_shape
        self.num_actions = num_actions
        self._build_network(pretrained_policy)

        # Learning-rate list: three plateaus of 600 entries followed by a
        # final, smallest rate.
        learning_rates = ([0.0001] * 600 + [0.00005] * 600 +
                          [0.00001] * 600 + [0.000005])
        learner = adam(self.value.parameters, lr=learning_rates, momentum=0.9)
        self.trainer = Trainer(self.value, self.loss, [learner])

    def _build_network(self, pretrained_policy):
        """Create input variables, the value head and the loss node."""
        self.image_frame = C.input_variable(
            (1, ) + self.observation_space_shape)
        self.target_current_state_value = C.input_variable((1, ))

        if pretrained_policy is not None:
            self.value = C.Function.load(pretrained_policy)(self.image_frame)
        else:
            # (layer name, filter shape, filter count, strides)
            conv_specs = (
                ('conv_1', (7, 7), 32, (4, 4)),
                ('conv_2', (5, 5), 64, (2, 2)),
                ('conv_3', (3, 3), 128, (1, 1)),
            )
            features = self.image_frame
            for layer_name, filter_shape, num_filters, strides in conv_specs:
                features = C.layers.Convolution2D(
                    filter_shape=filter_shape,
                    num_filters=num_filters,
                    strides=strides,
                    pad=True,
                    name=layer_name,
                    activation=C.relu)(features)
            features = C.layers.Dense(
                64, activation=C.relu, name='dense_1')(features)
            self.value = C.layers.Dense(1, name='dense_2')(features)

        self.loss = C.squared_error(self.target_current_state_value,
                                    self.value)

    def optimise(self, image_frame, target_current_state_value):
        """Run one training minibatch on the given frames and targets."""
        feed = {
            self.image_frame: image_frame,
            self.target_current_state_value: target_current_state_value,
        }
        self.trainer.train_minibatch(feed)

    def predict(self, image_frame):
        """Return the evaluated value output for ``image_frame``."""
        feed = {self.image_frame: image_frame}
        return self.value.eval(feed)
class StackedFrameCNNPolicy:
    """Convolutional Q-value network over a stack of frames, built on CNTK.

    The input has ``num_frames_to_stack`` as its leading axis followed by the
    observation shape; the output has one value per action.
    """

    def __init__(self, name, num_frames_to_stack, observation_space_shape,
                 num_actions, pretrained_policy=None, *args, **kwargs):
        """Build the graph, the frame stacker and an SGD trainer.

        Args:
            name: Label stored on the instance.
            num_frames_to_stack: Size of the leading input axis; also passed
                to ``FrameStacker`` as ``stack_size``.
            observation_space_shape: Tuple with the shape of one frame.
            num_actions: Width of the Q-value output.
            pretrained_policy: Optional argument forwarded to
                ``C.Function.load``; ``None`` creates a fresh network.
            *args: Accepted for signature compatibility and ignored.
            **kwargs: Accepted for signature compatibility and ignored.
        """
        self.name = name
        self.num_frames_to_stack = num_frames_to_stack
        self.observation_space_shape = observation_space_shape
        self.frame_stacker = FrameStacker(stack_size=num_frames_to_stack,
                                          frame_shape=observation_space_shape)
        self.num_actions = num_actions
        self._build_network(pretrained_policy)
        self.trainer = Trainer(self.q, self.loss,
                               [sgd(self.q.parameters, lr=0.000001)])

    def _build_network(self, pretrained_policy):
        """Create the input variables, the Q head and the loss node."""
        self.image_frame = C.input_variable(
            (self.num_frames_to_stack, ) + self.observation_space_shape)

        if pretrained_policy is None:
            h = C.layers.Convolution2D(filter_shape=(7, 7),
                                       num_filters=32,
                                       strides=(4, 4),
                                       pad=True,
                                       name='conv_1',
                                       activation=C.relu)(self.image_frame)
            h = C.layers.Convolution2D(filter_shape=(5, 5),
                                       num_filters=64,
                                       strides=(2, 2),
                                       pad=True,
                                       name='conv_2',
                                       activation=C.relu)(h)
            h = C.layers.Convolution2D(filter_shape=(3, 3),
                                       num_filters=128,
                                       strides=(1, 1),
                                       pad=True,
                                       name='conv_3',
                                       activation=C.relu)(h)
            h = C.layers.Dense(64, activation=C.relu, name='dense_1')(h)
            # The output layer previously reused the name 'dense_1', which
            # made the two dense layers indistinguishable by name.
            self.q = C.layers.Dense(self.num_actions, name='dense_2')(h)
        else:
            self.q = C.Function.load(pretrained_policy)(self.image_frame)

        self.q_target = C.input_variable(self.num_actions)
        self.loss = C.mean(C.losses.squared_error(self.q_target, self.q))

    def optimise(self, image_frame, q_target):
        """Run one training minibatch on the given frames and Q targets."""
        self.trainer.train_minibatch({
            self.image_frame: image_frame,
            self.q_target: q_target
        })

    def predict(self, image_frame):
        """Return the evaluated Q output for ``image_frame``."""
        return self.q.eval({self.image_frame: image_frame})
class ActorNNPolicy:
    """Dense softmax actor over a single image frame, built on CNTK.

    The loss is ``-td_error * log(p(action))`` where ``p(action)`` is the
    probability the network assigns to the action actually taken.
    """

    def __init__(self, name, observation_space_shape, num_actions,
                 pretrained_policy=None, *args, **kwargs):
        """Build the graph and attach an Adam-based trainer.

        Args:
            name: Label stored on the instance.
            observation_space_shape: Tuple with the shape of one frame.
            num_actions: Number of entries in the softmax output.
            pretrained_policy: Optional argument forwarded to
                ``C.Function.load``; ``None`` creates a fresh network.
            *args: Accepted for signature compatibility and ignored.
            **kwargs: Accepted for signature compatibility and ignored.
        """
        self.name = name
        self.observation_space_shape = observation_space_shape
        self.num_actions = num_actions
        self._build_network(pretrained_policy)

        learner = adam(self.probabilities.parameters, lr=0.0001, momentum=0.9)
        self.trainer = Trainer(self.probabilities, self.loss, [learner])

    def _build_network(self, pretrained_policy):
        """Create input variables, the probability head and the loss node."""
        self.image_frame = C.input_variable(
            (1, ) + self.observation_space_shape)
        self.td_error = C.input_variable((1, ))
        self.action_index = C.input_variable((1, ))
        action_mask = C.one_hot(self.action_index, self.num_actions)

        if pretrained_policy is not None:
            self.probabilities = C.Function.load(pretrained_policy)(
                self.image_frame)
        else:
            hidden = C.layers.Dense(
                64, activation=C.relu, name='dense_1')(self.image_frame)
            self.probabilities = C.layers.Dense(
                self.num_actions, name='dense_2',
                activation=C.softmax)(hidden)

        # Pick out the probability of the taken action via the one-hot mask.
        taken_probability = C.ops.times_transpose(self.probabilities,
                                                  action_mask)
        self.log_probability = C.ops.log(taken_probability)
        self.loss = -self.td_error * self.log_probability

    def optimise(self, image_frame, td_error, action_index):
        """Run one training minibatch for the given transitions."""
        feed = {
            self.image_frame: image_frame,
            self.td_error: td_error,
            self.action_index: action_index,
        }
        self.trainer.train_minibatch(feed)

    def predict(self, image_frame):
        """Return the evaluated action probabilities for ``image_frame``."""
        feed = {self.image_frame: image_frame}
        return self.probabilities.eval(feed)
class REINFORCENNPolicy:
    """Dense softmax policy network for a REINFORCE-style update (CNTK).

    The loss node is ``sum(log(p(action)) * target)`` where ``p(action)`` is
    the probability assigned to the action taken.
    """

    def __init__(self, name, observation_space_shape, num_actions,
                 pretrained_policy=None, *args, **kwargs):
        """Build the graph and attach an SGD trainer.

        Args:
            name: Label stored on the instance.
            observation_space_shape: Shape of the network input.
            num_actions: Number of entries in the softmax output.
            pretrained_policy: Optional argument forwarded to
                ``C.Function.load``; ``None`` creates a fresh network.
            *args: Accepted for signature compatibility and ignored.
            **kwargs: Accepted for signature compatibility and ignored.
        """
        self.name = name
        self.observation_space_shape = observation_space_shape
        self.num_actions = num_actions
        self._build_network(pretrained_policy)
        self.trainer = Trainer(
            self.action_probabilities, self.loss,
            [sgd(self.action_probabilities.parameters, lr=0.000001)])

    def _build_network(self, pretrained_policy):
        """Create input variables, the probability head and the loss node."""
        self.input = C.input_variable(self.observation_space_shape,
                                      name='image frame')
        self.target = C.input_variable((1, ), name='q_target')
        self.action_index = C.input_variable((1, ))
        one_hot_action = C.ops.squeeze(
            C.one_hot(self.action_index, self.num_actions))

        if pretrained_policy is None:
            # Each layer gets its own name; previously all three were called
            # 'dense_1', so they could not be told apart by name.
            h = C.layers.Dense(64, activation=C.relu,
                               name='dense_1')(self.input)
            h = C.layers.Dense(32, activation=C.relu, name='dense_2')(h)
            self.action_probabilities = C.layers.Dense(
                self.num_actions, activation=C.softmax, name='dense_3')(h)
        else:
            self.action_probabilities = C.Function.load(pretrained_policy)(
                self.input)

        selected_action_probablity = C.ops.times_transpose(
            self.action_probabilities, one_hot_action)
        self.log_probability = C.ops.log(selected_action_probablity)
        # NOTE(review): the trainer minimises this loss, and there is no
        # leading minus sign here (the actor classes in this file use
        # ``-td_error * log_probability``). Confirm whether callers pass a
        # negated target before changing the sign.
        self.loss = C.sum(self.log_probability * self.target)

    def optimise(self, state, action, target):
        """Run one training minibatch for the given states/actions/targets."""
        self.trainer.train_minibatch({
            self.input: state,
            self.action_index: action,
            self.target: target
        })

    def predict(self, state):
        """Return the evaluated action probabilities for ``state``."""
        return self.action_probabilities.eval({self.input: state})
class ActorNNPolicy:
    """Three-layer dense softmax actor built on CNTK.

    The loss is ``-td_error * log(p(action))`` where ``p(action)`` is the
    probability the network assigns to the action actually taken.
    """

    def __init__(self, name, observation_space_shape, num_actions,
                 pretrained_policy=None, *args, **kwargs):
        """Build the graph and attach an Adam-based trainer.

        Args:
            name: Label stored on the instance.
            observation_space_shape: Shape of the network input.
            num_actions: Number of entries in the softmax output.
            pretrained_policy: Optional argument forwarded to
                ``C.Function.load``; ``None`` creates a fresh network.
            *args: Accepted for signature compatibility and ignored.
            **kwargs: Accepted for signature compatibility and ignored.
        """
        self.name = name
        self.observation_space_shape = observation_space_shape
        self.num_actions = num_actions
        self._build_network(pretrained_policy)

        # The trainer tracks the log-probability node as its model, while the
        # learner updates the parameters reachable from the probabilities.
        learner = adam(self.probabilities.parameters, lr=0.001, momentum=0.9)
        self.trainer = Trainer(self.log_probability, self.loss, [learner])

    def _build_network(self, pretrained_policy):
        """Create input variables, the probability head and the loss node."""
        self.input = C.input_variable(self.observation_space_shape)
        self.td_error = C.input_variable((1, ))
        self.action_index = C.input_variable((1, ))
        action_mask = C.ops.squeeze(
            C.one_hot(self.action_index, self.num_actions))

        if pretrained_policy is not None:
            self.probabilities = C.Function.load(pretrained_policy)(self.input)
        else:
            hidden = C.layers.Dense(
                64, activation=C.relu, name='dense_1')(self.input)
            hidden = C.layers.Dense(
                64, activation=C.tanh, name='dense_2')(hidden)
            self.probabilities = C.layers.Dense(
                self.num_actions, name='dense_3',
                activation=C.softmax)(hidden)

        # Pick out the probability of the taken action via the one-hot mask.
        taken_probability = C.ops.times_transpose(self.probabilities,
                                                  action_mask)
        self.log_probability = C.ops.log(taken_probability)
        self.loss = -self.td_error * self.log_probability

    def optimise(self, state, td_error, action_index):
        """Run one training minibatch for the given transitions."""
        feed = {
            self.input: state,
            self.td_error: td_error,
            self.action_index: action_index,
        }
        self.trainer.train_minibatch(feed)

    def predict(self, state):
        """Return the evaluated action probabilities for ``state``."""
        feed = {self.input: state}
        return self.probabilities.eval(feed)
class CriticNNPolicy:
    """Three-layer dense state-value critic built on CNTK.

    Produces one scalar output per input and is trained with a squared-error
    loss against a target value supplied by the caller.
    """

    def __init__(self, name, observation_space_shape, num_actions,
                 pretrained_policy=None, *args, **kwargs):
        """Build the graph and attach an Adam-based trainer.

        Args:
            name: Label stored on the instance.
            observation_space_shape: Shape of the network input.
            num_actions: Stored on the instance; the critic graph itself
                does not read it.
            pretrained_policy: Optional argument forwarded to
                ``C.Function.load``; ``None`` creates a fresh network.
            *args: Accepted for signature compatibility and ignored.
            **kwargs: Accepted for signature compatibility and ignored.
        """
        self.name = name
        self.observation_space_shape = observation_space_shape
        self.num_actions = num_actions
        self._build_network(pretrained_policy)

        learner = adam(self.value.parameters, lr=0.001, momentum=0.9)
        self.trainer = Trainer(self.value, self.loss, [learner])

    def _build_network(self, pretrained_policy):
        """Create input variables, the value head and the loss node."""
        self.input = C.input_variable(self.observation_space_shape)
        self.target_current_state_value = C.input_variable((1, ))

        if pretrained_policy is not None:
            self.value = C.Function.load(pretrained_policy)(self.input)
        else:
            hidden = self.input
            for layer_name in ('dense_1', 'dense_2'):
                hidden = C.layers.Dense(
                    64, activation=C.relu, name=layer_name)(hidden)
            self.value = C.layers.Dense(1, name='dense_3')(hidden)

        self.loss = C.squared_error(self.target_current_state_value,
                                    self.value)

    def optimise(self, state, target_current_state_value):
        """Run one training minibatch on the given states and targets."""
        feed = {
            self.input: state,
            self.target_current_state_value: target_current_state_value,
        }
        self.trainer.train_minibatch(feed)

    def predict(self, state):
        """Return the evaluated value output for ``state``."""
        feed = {self.input: state}
        return self.value.eval(feed)
class SimpleNNPolicy:
    """Three-layer dense Q-value network built on CNTK.

    Outputs one value per action and is trained with a squared-error loss
    against caller-supplied Q targets.
    """

    def __init__(self, name, observation_space_shape, num_actions,
                 pretrained_policy=None, *args, **kwargs):
        """Build the graph and attach an SGD trainer.

        Args:
            name: Label stored on the instance.
            observation_space_shape: Shape of the network input.
            num_actions: Width of the Q-value output and of the target input.
            pretrained_policy: Optional argument forwarded to
                ``C.Function.load``; ``None`` creates a fresh network.
            *args: Accepted for signature compatibility and ignored.
            **kwargs: Accepted for signature compatibility and ignored.
        """
        self.name = name
        self.observation_space_shape = observation_space_shape
        self.num_actions = num_actions
        self._build_network(pretrained_policy)

        learner = sgd(self.q.parameters, lr=5e-4)
        self.trainer = Trainer(self.q, self.loss, [learner])

    def _build_network(self, pretrained_policy):
        """Create input variables, the Q head and the loss node."""
        self.input = C.input_variable(self.observation_space_shape,
                                      name='image frame')

        if pretrained_policy is not None:
            self.q = C.Function.load(pretrained_policy)(self.input)
        else:
            hidden = self.input
            for layer_name in ('dense_1', 'dense_2'):
                hidden = C.layers.Dense(
                    64, activation=C.relu, name=layer_name)(hidden)
            # Linear output layer: one unactivated value per action.
            self.q = C.layers.Dense(
                self.num_actions, activation=None, name='output')(hidden)

        self.q_target = C.input_variable(self.num_actions, name='q_target')
        self.loss = C.losses.squared_error(self.q_target, self.q)

    def optimise(self, state, q_target):
        """Run one training minibatch on the given states and Q targets."""
        feed = {
            self.input: state,
            self.q_target: q_target,
        }
        self.trainer.train_minibatch(feed)

    def predict(self, state):
        """Return the evaluated Q output for ``state``."""
        feed = {self.input: state}
        return self.q.eval(feed)
class QNeuralNetwork(CntkModel, QModel):
    """Deep Q-network learner implemented with CNTK."""

    def __init__(self, in_shape, output_shape, device_id=None,
                 learning_rate=0.00025, momentum=0.9, minibatch_size=32,
                 update_interval=10000, n_workers=1, visualizer=None):
        """
        Q Neural Network following the Mnih et al. implementation and
        default options.

        The network built by ``_build_model`` has the following topology:
            Convolution(32, (8, 8), strides (4, 4))
            Convolution(64, (4, 4), strides (2, 2))
            Convolution(64, (3, 3), strides (1, 1))
            Dense(512)
            Dense(output_shape), no activation

        :param in_shape: Shape of the observations perceived by the learner
            (the neural net input)
        :param output_shape: Size of the action space (mapped to the number
            of output neurons)
        :param device_id: Forwarded to ``CntkModel.__init__``
            (default: None)
        :param learning_rate: Learning rate, must be > 0 (default: 0.00025)
        :param momentum: Momentum handed to the Adam learner, must satisfy
            0 <= momentum < 1 (default: 0.9)
        :param minibatch_size: Accepted for interface compatibility; not
            read inside this class (default: 32)
        :param update_interval: Number of ``train`` calls between refreshes
            of the frozen target network (default: 10000)
        :param n_workers: Forwarded to ``CntkModel.__init__``
            (default: 1)
        :param visualizer: Forwarded to ``CntkModel.__init__``
            (default: None)

        Ref: Mnih et al.: "Human-level control through deep reinforcement
             learning." Nature 518.7540 (2015): 529-533.
        """
        assert learning_rate > 0, 'learning_rate should be > 0'
        assert 0. <= momentum < 1, 'momentum should be 0 <= momentum < 1'

        QModel.__init__(self, in_shape, output_shape)
        CntkModel.__init__(self, device_id, False, n_workers, visualizer)

        self._nb_actions = output_shape
        self._steps = 0
        self._target_update_interval = update_interval
        self._target = None

        # Graph inputs: observations, scalar Q targets and the action mask.
        self._environment = input(
            in_shape, name='env',
            dynamic_axes=(Axis.default_batch_axis()))
        self._q_targets = input(
            1, name='q_targets',
            dynamic_axes=(Axis.default_batch_axis()))
        self._actions = input(
            output_shape, name='actions',
            dynamic_axes=(Axis.default_batch_axis()))

        # Online network plus a frozen clone used as the target network.
        self._model = self._build_model()(self._environment)
        self._target = self._model.clone(
            CloneMethod.freeze, {self._environment: self._environment})

        # Adam learner with per-minibatch learning rate.
        rate_schedule = learning_rate_schedule(learning_rate,
                                               UnitType.minibatch)
        first_moment = momentum_schedule(momentum)
        second_moment = momentum_schedule(0.999)
        learner = adam(self._model.parameters, rate_schedule,
                       momentum=first_moment, unit_gain=True,
                       variance_momentum=second_moment)

        if self.distributed_training:
            raise NotImplementedError('ASGD not implemented yet.')

        # Multiplying by the one-hot action mask and summing keeps only the
        # Q-value of the action that was taken.
        taken_q = reduce_sum(self._model * self._actions, axis=0)

        # Huber loss between the taken action's Q-value and its target.
        loss_fn = huber_loss(taken_q, self._q_targets, 1.0)

        self._learner = learner
        self._trainer = Trainer(self._model, (loss_fn, None), learner)

    @property
    def loss_val(self):
        """Average loss reported by the trainer for the previous minibatch."""
        return self._trainer.previous_minibatch_loss_average

    def _build_model(self):
        """Return the (not yet applied) convolutional Q-network."""
        with default_options(init=he_uniform(), activation=relu, bias=True):
            stack = [
                Convolution((8, 8), 32, strides=(4, 4)),
                Convolution((4, 4), 64, strides=(2, 2)),
                Convolution((3, 3), 64, strides=(1, 1)),
                Dense(512, init=he_normal(0.01)),
                Dense(self._nb_actions, activation=None,
                      init=he_normal(0.01)),
            ]
            network = Sequential(stack)
        return network

    def train(self, x, q_value_targets, actions=None):
        """Run one training minibatch and refresh the target when due.

        :param x: Observations; a batch axis is prepended when ``x`` has the
            same rank as the environment input.
        :param q_value_targets: Target Q-values; rank-1 input is reshaped to
            a column.
        :param actions: Indices of the actions taken; required despite the
            ``None`` default.
        """
        assert actions is not None, 'actions cannot be None'

        # Reshape rank-1 targets [N] => [N, 1]
        if check_rank(q_value_targets.shape, 1):
            q_value_targets = q_value_targets.reshape((-1, 1))

        # Same [N] => [N, 1] reshape for the action indices fed to one_hot
        if check_rank(actions.shape, 1):
            actions = actions.reshape((-1, 1))

        # A single observation still needs a batch axis
        if check_rank(x.shape, len(self._environment.shape)):
            x = prepend_batch_axis(x)

        feed = {
            self._environment: x,
            self._actions: Value.one_hot(actions, self._nb_actions),
            self._q_targets: q_value_targets,
        }
        self._trainer.train_minibatch(feed)

        # Count train calls and periodically re-freeze the online network
        # as the new target network.
        self._steps += 1
        if self._steps % self._target_update_interval == 0:
            self._target = self._model.clone(
                CloneMethod.freeze, {self._environment: self._environment})

    def evaluate(self, data, model=QModel.ACTION_VALUE_NETWORK):
        """Evaluate the online or target network and squeeze the result.

        :param data: Observations; a batch axis is prepended when ``data``
            has the same rank as ``self.input_shape``.
        :param model: ``QModel.TARGET_NETWORK`` selects the frozen target
            network; any other value selects the online network.
        """
        # A single sample is expanded to (minibatch = 1, input_shape...)
        if len(data.shape) == len(self.input_shape):
            data = prepend_batch_axis(data)

        if model == QModel.TARGET_NETWORK:
            network = self._target
        else:
            network = self._model
        return network.eval({self._environment: data}).squeeze()
class ActorStackedFrameCNNPolicy:
    """Convolutional softmax actor over a stack of frames, built on CNTK.

    The loss is ``-td_error * log(p(action))`` where ``p(action)`` is the
    probability the network assigns to the action actually taken.
    """

    def __init__(self, name, num_frames_to_stack, observation_space_shape,
                 num_actions, pretrained_policy=None, *args, **kwargs):
        """Build the graph and attach an Adam-based trainer.

        Args:
            name: Label stored on the instance.
            num_frames_to_stack: Size of the leading input axis.
            observation_space_shape: Tuple with the shape of one frame.
            num_actions: Number of entries in the softmax output.
            pretrained_policy: Optional argument forwarded to
                ``C.Function.load``; ``None`` creates a fresh network.
            *args: Accepted for signature compatibility and ignored.
            **kwargs: Accepted for signature compatibility and ignored.
        """
        self.name = name
        self.num_frames_to_stack = num_frames_to_stack
        self.observation_space_shape = observation_space_shape
        self.num_actions = num_actions
        self._build_network(pretrained_policy)

        learner = adam(self.probabilities.parameters, lr=0.00001,
                       momentum=0.9)
        self.trainer = Trainer(self.log_probability, self.loss, [learner])

    def _build_network(self, pretrained_policy):
        """Create input variables, the probability head and the loss node."""
        self.image_frame = C.input_variable(
            (self.num_frames_to_stack, ) + self.observation_space_shape)
        self.td_error = C.input_variable((1, ))
        self.action_index = C.input_variable((1, ))
        action_mask = C.ops.squeeze(
            C.one_hot(self.action_index, self.num_actions))

        if pretrained_policy is not None:
            self.probabilities = C.Function.load(pretrained_policy)(
                self.image_frame)
        else:
            # (layer name, filter shape, filter count, strides)
            conv_specs = (
                ('conv_1', (7, 7), 32, (4, 4)),
                ('conv_2', (5, 5), 64, (2, 2)),
                ('conv_3', (3, 3), 128, (1, 1)),
            )
            features = self.image_frame
            for layer_name, filter_shape, num_filters, strides in conv_specs:
                features = C.layers.Convolution2D(
                    filter_shape=filter_shape,
                    num_filters=num_filters,
                    strides=strides,
                    pad=True,
                    name=layer_name,
                    activation=C.relu)(features)
            features = C.layers.Dense(
                64, activation=C.relu, name='dense_1')(features)
            self.probabilities = C.layers.Dense(
                self.num_actions, name='dense_2',
                activation=C.softmax)(features)

        # Pick out the probability of the taken action via the one-hot mask.
        taken_probability = C.ops.times_transpose(self.probabilities,
                                                  action_mask)
        self.log_probability = C.ops.log(taken_probability)
        self.loss = -self.td_error * self.log_probability

    def optimise(self, image_frame, td_error, action_index):
        """Run one training minibatch for the given transitions."""
        feed = {
            self.image_frame: image_frame,
            self.td_error: td_error,
            self.action_index: action_index,
        }
        self.trainer.train_minibatch(feed)

    def predict(self, image_frame):
        """Return the evaluated action probabilities for ``image_frame``."""
        feed = {self.image_frame: image_frame}
        return self.probabilities.eval(feed)
class CriticCNNPolicy:
    """Convolutional critic trained on a one-step bootstrapped target (CNTK).

    The same value network is applied to the current and the next frame; the
    loss is the squared error between the current value and
    ``DISCOUNT_FACTOR * next_value + reward``. ``DISCOUNT_FACTOR`` is a
    module-level name defined outside this class.
    """

    def __init__(self, name, observation_space_shape, num_actions,
                 pretrained_policy=None, *args, **kwargs):
        """Build the graph and attach an Adam-based trainer.

        Args:
            name: Label stored on the instance.
            observation_space_shape: Tuple with the shape of one frame.
            num_actions: Stored on the instance; the critic graph itself
                does not read it.
            pretrained_policy: Optional argument forwarded to
                ``C.Function.load``; ``None`` creates a fresh network.
            *args: Accepted for signature compatibility and ignored.
            **kwargs: Accepted for signature compatibility and ignored.
        """
        self.name = name
        self.observation_space_shape = observation_space_shape
        self.num_actions = num_actions
        self._build_network(pretrained_policy)
        self.trainer = Trainer(
            self.value, self.loss,
            [adam(self.value.parameters, lr=0.0001, momentum=0.9)])

    def _build_network(self, pretrained_policy):
        """Create input variables, both value heads and the loss node."""
        self.image_frame = C.input_variable(
            (1, ) + self.observation_space_shape)
        self.next_image_frame = C.input_variable(
            (1, ) + self.observation_space_shape)
        self.reward = C.input_variable((1, ))

        if pretrained_policy is None:
            # The first layer is deliberately left unapplied so the whole
            # stack stays a function that can be applied to both frames.
            h = C.layers.Convolution2D(filter_shape=(7, 7),
                                       num_filters=32,
                                       strides=(4, 4),
                                       pad=True,
                                       name='conv_1',
                                       activation=C.relu)
            h = C.layers.Convolution2D(filter_shape=(5, 5),
                                       num_filters=64,
                                       strides=(2, 2),
                                       pad=True,
                                       name='conv_2',
                                       activation=C.relu)(h)
            h = C.layers.Convolution2D(filter_shape=(3, 3),
                                       num_filters=128,
                                       strides=(1, 1),
                                       pad=True,
                                       name='conv_3',
                                       activation=C.relu)(h)
            h = C.layers.Dense(64, activation=C.relu, name='dense_1')(h)
            v = C.layers.Dense(1, name='dense_2')(h)

            self.value = v(self.image_frame)
            self.next_value = v(self.next_image_frame)
            self.output = C.combine([self.value, self.next_value])
        else:
            self.output = C.Function.load(pretrained_policy)(
                self.image_frame, self.next_image_frame)
            # The previous code indexed ``self.output`` with
            # ``self.value.output`` before ``self.value`` existed, which
            # raised AttributeError. Take the heads from the loaded
            # function's outputs instead.
            # NOTE(review): assumes the saved model is the two-output
            # combine built above, in (value, next_value) order — confirm
            # against the code that saves the model.
            value_output, next_value_output = self.output.outputs
            self.value = C.combine([value_output])
            self.next_value = C.combine([next_value_output])

        target = DISCOUNT_FACTOR * self.next_value + self.reward
        self.loss = C.squared_error(target, self.value)

    def optimise(self, image_frame, next_image_frame, rewards):
        """Run one training minibatch for the given transitions."""
        self.trainer.train_minibatch({
            self.image_frame: image_frame,
            self.next_image_frame: next_image_frame,
            self.reward: rewards
        })

    def predict(self, image_frame):
        """Return the evaluated value output for ``image_frame``."""
        return self.value.eval({self.image_frame: image_frame})
class ActorCriticCNNPolicy:
    """Convolutional actor-critic with a common feature stack (CNTK).

    One convolutional stack feeds a softmax action head and a scalar value
    head. The loss is the equally weighted sum of the actor loss
    ``-advantage * log(p(action))`` and the critic loss
    ``squared_error(target_value, value)``.
    """

    def __init__(self, name, observation_space_shape, num_actions,
                 pretrained_policy=None, *args, **kwargs):
        """Build the graph and attach an Adam-based trainer.

        Args:
            name: Label stored on the instance.
            observation_space_shape: Tuple with the shape of one frame.
            num_actions: Number of entries in the softmax output.
            pretrained_policy: Optional argument forwarded to
                ``C.Function.load``; ``None`` creates a fresh network.
            *args: Accepted for signature compatibility and ignored.
            **kwargs: Accepted for signature compatibility and ignored.
        """
        self.name = name
        self.observation_space_shape = observation_space_shape
        self.num_actions = num_actions
        self._build_network(pretrained_policy)
        self.trainer = Trainer(
            self.output, self.loss,
            [adam(self.output.parameters, lr=0.00003, momentum=0.9)])

    def _build_network(self, pretrained_policy):
        """Create input variables, the actor/critic heads and the loss."""
        self.image_frame = C.input_variable(
            (1, ) + self.observation_space_shape)
        self.next_image_frame = C.input_variable(
            (1, ) + self.observation_space_shape)
        self.advantage = C.input_variable((1, ))
        self.action_index = C.input_variable((1, ))
        self.target_value = C.input_variable((1, ))
        one_hot_action = C.one_hot(self.action_index, self.num_actions)

        if pretrained_policy is None:
            # The first layer is deliberately left unapplied so the stack
            # stays a function that can be applied to several inputs.
            h = C.layers.Convolution2D(filter_shape=(7, 7),
                                       num_filters=32,
                                       strides=(4, 4),
                                       pad=True,
                                       name='conv_1',
                                       activation=C.relu)
            h = C.layers.Convolution2D(filter_shape=(5, 5),
                                       num_filters=64,
                                       strides=(2, 2),
                                       pad=True,
                                       name='conv_2',
                                       activation=C.relu)(h)
            h = C.layers.Convolution2D(filter_shape=(3, 3),
                                       num_filters=128,
                                       strides=(1, 1),
                                       pad=True,
                                       name='conv_3',
                                       activation=C.relu)(h)
            h = C.layers.Dense(64, activation=C.relu, name='dense_1')(h)

            self.probabilities = C.layers.Dense(
                self.num_actions, name='dense_2',
                activation=C.softmax)(h(self.image_frame))
            v = C.layers.Dense(1, name='dense_3')(h)
            self.value = v(self.image_frame)
            self.next_value = v(self.next_image_frame)
            self.output = C.combine(
                [self.probabilities, self.value, self.next_value])
        else:
            # Previously this branch never assigned ``self.output`` (needed
            # by ``__init__``) and tried to unpack the loaded function with
            # ``list(...)``. Bind the output and split it via ``outputs``.
            # NOTE(review): assumes the saved model is the three-output
            # combine built above, in (probabilities, value, next_value)
            # order — confirm against the code that saves the model.
            self.output = C.Function.load(pretrained_policy)(
                self.image_frame, self.next_image_frame)
            probabilities_output, value_output, next_value_output = (
                self.output.outputs)
            self.probabilities = C.combine([probabilities_output])
            self.value = C.combine([value_output])
            self.next_value = C.combine([next_value_output])

        self.values_output = C.combine([self.value, self.next_value])

        selected_action_probablity = C.ops.times_transpose(
            self.probabilities, one_hot_action)
        self.log_probability = C.ops.log(selected_action_probablity)
        self.actor_loss = -self.advantage * self.log_probability
        self.critic_loss = C.squared_error(self.target_value, self.value)
        self.loss = 0.5 * self.actor_loss + 0.5 * self.critic_loss

    def optimise(self, image_frame, action_index, advantage, targets):
        """Run one training minibatch for the given transitions."""
        self.trainer.train_minibatch({
            self.image_frame: image_frame,
            self.advantage: advantage,
            self.action_index: action_index,
            self.target_value: targets
        })

    def predict(self, image_frame):
        """Return the evaluated action probabilities for ``image_frame``."""
        return self.probabilities.eval({self.image_frame: image_frame})

    def values(self, image_frame, next_image_frame):
        """Return ``(value, next_value)`` evaluated on the two frames."""
        output = self.values_output.eval({
            self.image_frame: image_frame,
            self.next_image_frame: next_image_frame
        })
        return output[self.value.output], output[self.next_value.output]
class QNeuralNetwork(CntkModel, QModel):
    """Deep Q-network learner implemented with CNTK."""

    def __init__(self, in_shape, output_shape, device_id=None,
                 learning_rate=0.00025, momentum=0.9, minibatch_size=32,
                 update_interval=10000, n_workers=1, visualizer=None):
        """
        Q Neural Network following the Mnih et al. implementation and
        default options.

        The network built by ``_build_model`` has the following topology:
            Convolution(32, (8, 8), strides (4, 4))
            Convolution(64, (4, 4), strides (2, 2))
            Convolution(64, (3, 3), strides (1, 1))
            Dense(512)
            Dense(output_shape), no activation

        :param in_shape: Shape of the observations perceived by the learner
            (the neural net input)
        :param output_shape: Size of the action space (mapped to the number
            of output neurons)
        :param device_id: Forwarded to ``CntkModel.__init__``
            (default: None)
        :param learning_rate: Learning rate, must be > 0 (default: 0.00025)
        :param momentum: Momentum handed to the Adam learner, must satisfy
            0 <= momentum < 1 (default: 0.9)
        :param minibatch_size: Accepted for interface compatibility; not
            read inside this class (default: 32)
        :param update_interval: Number of ``train`` calls between refreshes
            of the frozen target network (default: 10000)
        :param n_workers: Forwarded to ``CntkModel.__init__``
            (default: 1)
        :param visualizer: Forwarded to ``CntkModel.__init__``
            (default: None)

        Ref: Mnih et al.: "Human-level control through deep reinforcement
             learning." Nature 518.7540 (2015): 529-533.
        """
        assert learning_rate > 0, 'learning_rate should be > 0'
        assert 0. <= momentum < 1, 'momentum should be 0 <= momentum < 1'

        QModel.__init__(self, in_shape, output_shape)
        CntkModel.__init__(self, device_id, False, n_workers, visualizer)

        self._nb_actions = output_shape
        self._steps = 0
        self._target_update_interval = update_interval
        self._target = None

        # Graph inputs: observations, scalar Q targets and the action mask.
        self._environment = input(
            in_shape, name='env',
            dynamic_axes=(Axis.default_batch_axis()))
        self._q_targets = input(
            1, name='q_targets',
            dynamic_axes=(Axis.default_batch_axis()))
        self._actions = input(
            output_shape, name='actions',
            dynamic_axes=(Axis.default_batch_axis()))

        # Online network plus a frozen clone used as the target network.
        self._model = self._build_model()(self._environment)
        self._target = self._model.clone(
            CloneMethod.freeze, {self._environment: self._environment})

        # Adam learner with per-minibatch learning rate.
        rate_schedule = learning_rate_schedule(learning_rate,
                                               UnitType.minibatch)
        first_moment = momentum_schedule(momentum)
        second_moment = momentum_schedule(0.999)
        learner = adam(self._model.parameters, rate_schedule,
                       momentum=first_moment, unit_gain=True,
                       variance_momentum=second_moment)

        if self.distributed_training:
            raise NotImplementedError('ASGD not implemented yet.')

        # Multiplying by the one-hot action mask and summing keeps only the
        # Q-value of the action that was taken.
        taken_q = reduce_sum(self._model * self._actions, axis=0)

        # Huber loss between the taken action's Q-value and its target.
        loss_fn = huber_loss(taken_q, self._q_targets, 1.0)

        self._learner = learner
        self._trainer = Trainer(self._model, (loss_fn, None), learner)

    @property
    def loss_val(self):
        """Average loss reported by the trainer for the previous minibatch."""
        return self._trainer.previous_minibatch_loss_average

    def _build_model(self):
        """Return the (not yet applied) convolutional Q-network."""
        with default_options(init=he_uniform(), activation=relu, bias=True):
            stack = [
                Convolution((8, 8), 32, strides=(4, 4)),
                Convolution((4, 4), 64, strides=(2, 2)),
                Convolution((3, 3), 64, strides=(1, 1)),
                Dense(512, init=he_normal(0.01)),
                Dense(self._nb_actions, activation=None,
                      init=he_normal(0.01)),
            ]
            network = Sequential(stack)
        return network

    def train(self, x, q_value_targets, actions=None):
        """Run one training minibatch and refresh the target when due.

        :param x: Observations; a batch axis is prepended when ``x`` has the
            same rank as the environment input.
        :param q_value_targets: Target Q-values; rank-1 input is reshaped to
            a column.
        :param actions: Indices of the actions taken; required despite the
            ``None`` default.
        """
        assert actions is not None, 'actions cannot be None'

        # Reshape rank-1 targets [N] => [N, 1]
        if check_rank(q_value_targets.shape, 1):
            q_value_targets = q_value_targets.reshape((-1, 1))

        # Same [N] => [N, 1] reshape for the action indices fed to one_hot
        if check_rank(actions.shape, 1):
            actions = actions.reshape((-1, 1))

        # A single observation still needs a batch axis
        if check_rank(x.shape, len(self._environment.shape)):
            x = prepend_batch_axis(x)

        feed = {
            self._environment: x,
            self._actions: Value.one_hot(actions, self._nb_actions),
            self._q_targets: q_value_targets,
        }
        self._trainer.train_minibatch(feed)

        # Count train calls and periodically re-freeze the online network
        # as the new target network.
        self._steps += 1
        if self._steps % self._target_update_interval == 0:
            self._target = self._model.clone(
                CloneMethod.freeze, {self._environment: self._environment})

    def evaluate(self, data, model=QModel.ACTION_VALUE_NETWORK):
        """Evaluate the online or target network and squeeze the result.

        :param data: Observations; a batch axis is prepended when ``data``
            has the same rank as ``self.input_shape``.
        :param model: ``QModel.TARGET_NETWORK`` selects the frozen target
            network; any other value selects the online network.
        """
        # A single sample is expanded to (minibatch = 1, input_shape...)
        if len(data.shape) == len(self.input_shape):
            data = prepend_batch_axis(data)

        if model == QModel.TARGET_NETWORK:
            network = self._target
        else:
            network = self._model
        return network.eval({self._environment: data}).squeeze()