def test_linear_approximator():
    np.random.seed(88)

    noise = 1e-3

    a = np.random.rand(1000, 3)
    k = np.random.rand(3, 2)
    b = a.dot(k) + np.random.randn(1000, 2) * noise

    approximator = Regressor(LinearApproximator, input_shape=(3,),
                             output_shape=(2,))
    approximator.fit(a, b)

    khat = approximator.get_weights()
    deltaK = khat - k.T.flatten()

    assert np.linalg.norm(deltaK) < noise

    point = np.random.randn(3,)
    derivative = approximator.diff(point)

    lp = len(point)
    for i in range(derivative.shape[1]):
        assert (derivative[i * lp:(i + 1) * lp, i] == point).all()
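# A standalone illustration (not part of the test) of why the diff() check
# above holds: for a linear model y = W x with W of shape (n_out, n_in) stored
# row-major, the gradient of output i w.r.t. the flattened weights is the
# input vector placed in the i-th block and zeros elsewhere. The shapes below
# are assumptions chosen to match the assertion, not the LinearApproximator
# internals.
import numpy as np

x = np.array([1., 2., 3.])
n_out, n_in = 2, len(x)
jacobian = np.zeros((n_out * n_in, n_out))
for i in range(n_out):
    jacobian[i * n_in:(i + 1) * n_in, i] = x
print(jacobian)   # column i holds x in rows i*n_in:(i+1)*n_in, as asserted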
def test_pytorch_approximator():
    np.random.seed(88)
    torch.manual_seed(88)

    noise = 1e-3 ** 2

    a = np.random.rand(1000, 4)
    k = np.random.rand(4, 2)
    b = np.sin(a).dot(k) + np.random.randn(1000, 2) * noise

    approximator = Regressor(PyTorchApproximator, input_shape=(4,),
                             output_shape=(2,), network=ExampleNet,
                             optimizer={'class': optim.Adam, 'params': {}},
                             loss=F.mse_loss, n_neurons=100, n_hidden=1,
                             n_epochs=200, batch_size=100, quiet=True)

    approximator.fit(a, b)

    bhat = approximator.predict(a)
    error = np.linalg.norm(b - bhat, 'fro') / 1000
    error_inf = np.max(np.abs(b - bhat))

    print(b[:10])
    print(bhat[:10])
    print(error_inf)

    assert error < 2e-4

    gradient = approximator.diff(a[0])

    assert gradient.shape[1] == 2

    old_weights = approximator.get_weights()
    approximator.set_weights(old_weights)
    new_weights = approximator.get_weights()

    assert np.array_equal(new_weights, old_weights)

    random_weights = np.random.randn(*old_weights.shape).astype(np.float32)
    approximator.set_weights(random_weights)
    random_weight_new = approximator.get_weights()

    assert np.array_equal(random_weights, random_weight_new)
    assert not np.any(np.equal(random_weights, old_weights))

    bhat_random = approximator.predict(a)

    assert not np.array_equal(bhat, bhat_random)
def test_pytorch_approximator():
    np.random.seed(1)
    torch.manual_seed(1)

    n_actions = 2
    s = np.random.rand(1000, 4)
    a = np.random.randint(n_actions, size=(1000, 1))
    q = np.random.rand(1000)

    approximator = Regressor(TorchApproximator, input_shape=(4,),
                             output_shape=(2,), n_actions=n_actions,
                             network=ExampleNet,
                             optimizer={'class': optim.Adam, 'params': {}},
                             loss=F.mse_loss, batch_size=100, quiet=True)

    approximator.fit(s, a, q, n_epochs=20)

    x_s = np.random.rand(2, 4)
    x_a = np.random.randint(n_actions, size=(2, 1))
    y = approximator.predict(x_s, x_a)
    y_test = np.array([0.37191153, 0.5920861])

    assert np.allclose(y, y_test)

    y = approximator.predict(x_s)
    y_test = np.array([[0.47908658, 0.37191153],
                       [0.5920861, 0.27575058]])

    assert np.allclose(y, y_test)

    gradient = approximator.diff(x_s[0], x_a[0])
    gradient_test = np.array([0., 0., 0., 0., 0.02627479, 0.76513696,
                              0.6672573, 0.35979462, 0., 1.])

    assert np.allclose(gradient, gradient_test)

    gradient = approximator.diff(x_s[0])
    gradient_test = np.array([[0.02627479, 0.], [0.76513696, 0.],
                              [0.6672573, 0.], [0.35979462, 0.],
                              [0., 0.02627479], [0., 0.76513696],
                              [0., 0.6672573], [0., 0.35979462],
                              [1., 0.], [0., 1.]])

    assert np.allclose(gradient, gradient_test)

    old_weights = approximator.get_weights()
    approximator.set_weights(old_weights)
    new_weights = approximator.get_weights()

    assert np.array_equal(new_weights, old_weights)

    random_weights = np.random.randn(*old_weights.shape).astype(np.float32)
    approximator.set_weights(random_weights)
    random_weight_new = approximator.get_weights()

    assert np.array_equal(random_weights, random_weight_new)
    assert not np.any(np.equal(random_weights, old_weights))
class DQN(Agent):
    """
    Deep Q-Network algorithm.
    "Human-Level Control Through Deep Reinforcement Learning".
    Mnih V. et al. 2015.

    """
    def __init__(self, approximator, policy, mdp_info, batch_size,
                 initial_replay_size, max_replay_size,
                 target_update_frequency=2500, fit_params=None,
                 approximator_params=None, n_approximators=1,
                 history_length=1, clip_reward=True, max_no_op_actions=0,
                 no_op_action_value=0, dtype=np.float32):
        """
        Constructor.

        Args:
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            target_update_frequency (int): the number of samples collected
                between each update of the target network;
            fit_params (dict, None): parameters of the fitting algorithm of
                the approximator;
            approximator_params (dict, None): parameters of the approximator
                to build;
            n_approximators (int, 1): the number of approximators to use in
                ``AveragedDQN``;
            history_length (int, 1): the number of samples composing a state;
            clip_reward (bool, True): whether to clip the reward or not;
            max_no_op_actions (int, 0): maximum number of no-op actions that
                can be sampled;
            no_op_action_value (int, 0): value of the no-op action;
            dtype (object, np.float32): dtype of the state array.

        """
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency
        self._max_no_op_actions = max_no_op_actions
        self._no_op_action_value = no_op_action_value

        self._replay_memory = ReplayMemory(mdp_info, initial_replay_size,
                                           max_replay_size, history_length,
                                           dtype)
        self._buffer = Buffer(history_length, dtype)

        self._n_updates = 0
        self._episode_steps = 0
        self._no_op_actions = None

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(approximator_params)
        apprx_params_target['name'] = 'target'
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        if self._n_approximators == 1:
            self.target_approximator.model.set_weights(
                self.approximator.model.get_weights())
        else:
            for i in range(self._n_approximators):
                self.target_approximator.model[i].set_weights(
                    self.approximator.model.get_weights())

        super(DQN, self).__init__(policy, mdp_info)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ =\
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self.approximator.fit(state, action, q, **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        q = self.target_approximator.predict(next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        return np.max(q, axis=1)

    def draw_action(self, state):
        self._buffer.add(state)

        if self._episode_steps < self._no_op_actions:
            action = np.array([self._no_op_action_value])
            self.policy.update()
        else:
            extended_state = self._buffer.get()

            action = super(DQN, self).draw_action(extended_state)

        self._episode_steps += 1

        return action

    def episode_start(self):
        if self._max_no_op_actions == 0:
            self._no_op_actions = 0
        else:
            self._no_op_actions = np.random.randint(
                self._buffer.size, self._max_no_op_actions + 1)
        self._episode_steps = 0
class DQN(Agent):
    """
    Deep Q-Network algorithm.
    "Human-Level Control Through Deep Reinforcement Learning".
    Mnih V. et al. 2015.

    """
    def __init__(self, approximator, policy, mdp_info, batch_size,
                 initial_replay_size, max_replay_size,
                 target_update_frequency=2500, fit_params=None,
                 approximator_params=None, n_approximators=1,
                 history_length=1, clip_reward=True, max_no_op_actions=0,
                 no_op_action_value=0, dtype=np.float32):
        """
        Constructor.

        Args:
            batch_size (int): the number of samples in a batch;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            target_update_frequency (int): the number of samples collected
                between each update of the target network;
            fit_params (dict, None): parameters of the fitting algorithm of
                the approximator;
            approximator_params (dict, None): parameters of the approximator
                to build;
            n_approximators (int, 1): the number of approximators to use in
                ``AveragedDQN``;
            history_length (int, 1): the number of samples composing a state;
            clip_reward (bool, True): whether to clip the reward or not;
            max_no_op_actions (int, 0): maximum number of no-op actions that
                can be sampled;
            no_op_action_value (int, 0): value of the no-op action;
            dtype (object, np.float32): dtype of the state array.

        """
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency
        self._max_no_op_actions = max_no_op_actions
        self._no_op_action_value = no_op_action_value

        self._replay_memory = ReplayMemory(mdp_info, initial_replay_size,
                                           max_replay_size, history_length,
                                           dtype)
        self._buffer = Buffer(history_length, dtype)

        self._n_updates = 0
        self._episode_steps = 0
        self._no_op_actions = None

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_target = deepcopy(approximator_params)
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        if self._n_approximators == 1:
            self.target_approximator.model.set_weights(
                self.approximator.model.get_weights())
        else:
            for i in range(self._n_approximators):
                self.target_approximator.model[i].set_weights(
                    self.approximator.model.get_weights())

        super().__init__(policy, mdp_info)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ =\
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self.approximator.fit(state, action, q, **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        q = self.target_approximator.predict(next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        return np.max(q, axis=1)

    def draw_action(self, state):
        self._buffer.add(state)

        if self._episode_steps < self._no_op_actions:
            action = np.array([self._no_op_action_value])
            self.policy.update()
        else:
            extended_state = self._buffer.get()

            action = super(DQN, self).draw_action(extended_state)

        self._episode_steps += 1

        return action

    def episode_start(self):
        if self._max_no_op_actions == 0:
            self._no_op_actions = 0
        else:
            self._no_op_actions = np.random.randint(
                self._buffer.size, self._max_no_op_actions + 1)
        self._episode_steps = 0
class DQN(Agent):
    """
    Deep Q-Network algorithm.
    "Human-Level Control Through Deep Reinforcement Learning".
    Mnih V. et al. 2015.

    """
    def __init__(self, approximator, policy, mdp_info, batch_size,
                 approximator_params, target_update_frequency,
                 replay_memory=None, initial_replay_size=500,
                 max_replay_size=5000, fit_params=None, n_approximators=1,
                 clip_reward=True):
        """
        Constructor.

        Args:
            approximator (object): the approximator to use to fit the
                Q-function;
            batch_size (int): the number of samples in a batch;
            approximator_params (dict): parameters of the approximator to
                build;
            target_update_frequency (int): the number of samples collected
                between each update of the target network;
            replay_memory ([ReplayMemory, PrioritizedReplayMemory], None): the
                object of the replay memory to use; if None, a default replay
                memory is created;
            initial_replay_size (int): the number of samples to collect before
                starting the learning;
            max_replay_size (int): the maximum number of samples in the replay
                memory;
            fit_params (dict, None): parameters of the fitting algorithm of
                the approximator;
            n_approximators (int, 1): the number of approximators to use in
                ``AveragedDQN``;
            clip_reward (bool, True): whether to clip the reward or not.

        """
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency

        if replay_memory is not None:
            self._replay_memory = replay_memory
            if isinstance(replay_memory, PrioritizedReplayMemory):
                self._fit = self._fit_prioritized
            else:
                self._fit = self._fit_standard
        else:
            self._replay_memory = ReplayMemory(initial_replay_size,
                                               max_replay_size)
            self._fit = self._fit_standard

        self._n_updates = 0

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_target = deepcopy(approximator_params)
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        if self._n_approximators == 1:
            self.target_approximator.model.set_weights(
                self.approximator.model.get_weights())
        else:
            for i in range(self._n_approximators):
                self.target_approximator.model[i].set_weights(
                    self.approximator.model.get_weights())

        super().__init__(policy, mdp_info)

    def fit(self, dataset):
        self._fit(dataset)

        self._n_updates += 1
        if self._n_updates % self._target_update_frequency == 0:
            self._update_target()

    def _fit_standard(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ = \
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self.approximator.fit(state, action, q, **self._fit_params)

    def _fit_prioritized(self, dataset):
        self._replay_memory.add(
            dataset, np.ones(len(dataset)) * self._replay_memory.max_priority)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, idxs, is_weight = \
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next
            td_error = q - self.approximator.predict(state, action)

            self._replay_memory.update(td_error, idxs)

            self.approximator.fit(state, action, q, weights=is_weight,
                                  **self._fit_params)

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        q = self.target_approximator.predict(next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        return np.max(q, axis=1)

    def draw_action(self, state):
        action = super(DQN, self).draw_action(np.array(state))

        return action
class DQN(Agent):
    """
    Deep Q-Network algorithm.
    "Human-Level Control Through Deep Reinforcement Learning".
    Mnih V. et al. 2015.

    """
    def __init__(self, approximator, policy, mdp_info, params):
        alg_params = params['algorithm_params']
        self._batch_size = alg_params.get('batch_size')
        self._n_approximators = alg_params.get('n_approximators', 1)
        self._clip_reward = alg_params.get('clip_reward', True)
        self._train_frequency = alg_params.get('train_frequency')
        self._target_update_frequency = alg_params.get(
            'target_update_frequency')
        self._max_no_op_actions = alg_params.get('max_no_op_actions', 0)
        self._no_op_action_value = alg_params.get('no_op_action_value', 0)

        self._replay_memory = ReplayMemory(
            mdp_info, alg_params.get('initial_replay_size'),
            alg_params.get('max_replay_size'),
            alg_params.get('history_length', 1))
        self._buffer = Buffer(size=alg_params.get('history_length', 1))

        self._n_updates = 0
        self._episode_steps = 0
        self._no_op_actions = None

        apprx_params_train = deepcopy(params['approximator_params'])
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(params['approximator_params'])
        apprx_params_target['name'] = 'target'
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             n_models=self._n_approximators,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        if self._n_approximators == 1:
            self.target_approximator.model.set_weights(
                self.approximator.model.get_weights())
        else:
            for i in range(self._n_approximators):
                self.target_approximator.model[i].set_weights(
                    self.approximator.model.get_weights())

        super(DQN, self).__init__(policy, mdp_info, params)

    def fit(self, dataset):
        self._replay_memory.add(dataset)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _ =\
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward + self.mdp_info.gamma * q_next

            self.approximator.fit(state, action, q,
                                  **self.params['fit_params'])

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        q = self.target_approximator.predict(next_state)
        if np.any(absorbing):
            q *= 1 - absorbing.reshape(-1, 1)

        return np.max(q, axis=1)

    def draw_action(self, state):
        self._buffer.add(state)

        if self._episode_steps < self._no_op_actions:
            action = np.array([self._no_op_action_value])
            self.policy.update()
        else:
            extended_state = self._buffer.get()

            action = super(DQN, self).draw_action(extended_state)

        self._episode_steps += 1

        return action

    def episode_start(self):
        self._no_op_actions = np.random.randint(self._buffer.size,
                                                self._max_no_op_actions + 1)
        self._episode_steps = 0
class DQN(Agent):
    def __init__(self, approximator, policy, mdp_info, batch_size,
                 target_update_frequency, initial_replay_size,
                 train_frequency, max_replay_size, fit_params=None,
                 approximator_params=None, n_approximators=1,
                 history_length=1, clip_reward=True, max_no_op_actions=0,
                 no_op_action_value=0, p_mask=2 / 3., dtype=np.float32,
                 weighted_update=False):
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = \
            target_update_frequency // train_frequency
        self._max_no_op_actions = max_no_op_actions
        self._no_op_action_value = no_op_action_value
        self._p_mask = p_mask
        self.weighted_update = weighted_update

        self._replay_memory = ReplayMemory(mdp_info, initial_replay_size,
                                           max_replay_size, history_length,
                                           n_approximators, dtype)
        self._buffer = Buffer(history_length, dtype)

        self._n_updates = 0
        self._episode_steps = 0
        self._no_op_actions = None

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(approximator_params)
        apprx_params_target['name'] = 'target'
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

        super(DQN, self).__init__(policy, mdp_info)

    def fit(self, dataset):
        mask = np.random.binomial(1, self._p_mask,
                                  size=(len(dataset), self._n_approximators))
        self._replay_memory.add(dataset, mask)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, mask =\
                self._replay_memory.get(self._batch_size)

            q = np.array(self.approximator.predict(state))[0]
            q = q.reshape((self._n_approximators * self._batch_size, -1))
            q = q[np.arange(self._n_approximators * self._batch_size),
                  np.tile(action.ravel(), self._n_approximators)]
            q = q.reshape((self._n_approximators, self._batch_size)).T
            idxs = q.argsort()

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)

            q_next_ordered = np.sort(q_next)
            # order target values to match the source values
            for i in range(idxs.shape[0]):
                q_next[i, idxs[i]] = q_next_ordered[i]

            q = reward.reshape(self._batch_size,
                               1) + self.mdp_info.gamma * q_next

            self.approximator.fit(state, action, q, mask=mask,
                                  **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        q = np.array(self.target_approximator.predict(next_state))[0]
        for i in range(q.shape[1]):
            if absorbing[i]:
                q[:, i, :] *= 1. - absorbing[i]

        if not self.weighted_update:
            # find best actions
            best_actions = np.argmax(np.mean(q, axis=0), axis=1)
            max_q = np.zeros((q.shape[1], q.shape[0]))
            for i in range(q.shape[1]):
                max_q[i, :] = q[:, i, best_actions[i]]

            return max_q
        else:
            N = q.shape[0]
            num_actions = q.shape[2]
            batch_size = q.shape[1]
            probs = np.zeros((batch_size, num_actions))
            weights = 1 / N
            # calculate probability of being maximum
            for b in range(batch_size):
                for i in range(num_actions):
                    particles = q[:, b, i]
                    p = 0
                    for k in range(N):
                        p2 = 1
                        p_k = particles[k]
                        for j in range(num_actions):
                            if j != i:
                                particles2 = q[:, b, j]
                                p3 = 0
                                for l in range(N):
                                    if particles2[l] <= p_k:
                                        p3 += weights
                                p2 *= p3
                        p += weights * p2
                    probs[b, i] = p

            max_q = np.zeros((batch_size, N))
            for i in range(batch_size):
                particles = np.zeros(N)
                for j in range(num_actions):
                    particles += q[:, i, j] * probs[i, j]
                max_q[i, :] = particles

            return max_q

    def draw_action(self, state):
        self._buffer.add(state)

        if self._episode_steps < self._no_op_actions:
            action = np.array([self._no_op_action_value])
            self.policy.update_epsilon(state)
        else:
            extended_state = self._buffer.get()

            action = super(DQN, self).draw_action(extended_state)

        self._episode_steps += 1

        return action

    def episode_start(self):
        if self._max_no_op_actions == 0:
            self._no_op_actions = 0
        else:
            self._no_op_actions = np.random.randint(
                self._buffer.size, self._max_no_op_actions + 1)
        self._episode_steps = 0
        self.policy.set_idx(np.random.randint(self._n_approximators))
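# A standalone NumPy sketch (assumed names, not library code) of the
# "probability of being the maximum" that the weighted-update loops above
# accumulate: for particles of shape (n_particles, n_actions) it returns, for
# each action i, (1/N) * sum_k prod_{j != i} F_j(q[k, i]), where F_j is the
# empirical CDF of action j's particles. The result is left unnormalized,
# exactly as in the nested loops.
import numpy as np

def prob_max(particles):
    n, n_act = particles.shape
    # comp[k, i, l, j] = 1 if the l-th particle of action j is <= the k-th
    # particle of action i
    comp = particles[None, None, :, :] <= particles[:, :, None, None]
    cdf = comp.mean(axis=2)                    # (n, n_act, n_act)
    idx = np.arange(n_act)
    cdf[:, idx, idx] = 1.                      # drop j == i from the product
    return cdf.prod(axis=2).mean(axis=0)       # (n_act,)

rng = np.random.RandomState(0)
q_particles = rng.randn(10, 3)                 # 10 particles for 3 actions
print(prob_max(q_particles))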
def test_linear_approximator():
    np.random.seed(1)

    # Generic regressor
    a = np.random.rand(1000, 3)

    k = np.random.rand(3, 2)
    b = a.dot(k) + np.random.randn(1000, 2)

    approximator = Regressor(LinearApproximator, input_shape=(3,),
                             output_shape=(2,))

    approximator.fit(a, b)

    x = np.random.rand(2, 3)
    y = approximator.predict(x)
    y_test = np.array([[0.57638247, 0.1573216],
                       [0.11388247, 0.24123678]])

    assert np.allclose(y, y_test)

    point = np.random.randn(3,)
    derivative = approximator.diff(point)

    lp = len(point)
    for i in range(derivative.shape[1]):
        assert (derivative[i * lp:(i + 1) * lp, i] == point).all()

    old_weights = approximator.get_weights()
    approximator.set_weights(old_weights)
    new_weights = approximator.get_weights()

    assert np.array_equal(new_weights, old_weights)

    random_weights = np.random.randn(*old_weights.shape).astype(np.float32)
    approximator.set_weights(random_weights)
    random_weight_new = approximator.get_weights()

    assert np.array_equal(random_weights, random_weight_new)
    assert not np.any(np.equal(random_weights, old_weights))

    # Action regressor + Ensemble
    n_actions = 2
    s = np.random.rand(1000, 3)
    a = np.random.randint(n_actions, size=(1000, 1))
    q = np.random.rand(1000)

    approximator = Regressor(LinearApproximator, input_shape=(3,),
                             n_actions=n_actions, n_models=5)

    approximator.fit(s, a, q)

    x_s = np.random.rand(2, 3)
    x_a = np.random.randint(n_actions, size=(2, 1))
    y = approximator.predict(x_s, x_a, prediction='mean')
    y_test = np.array([0.49225698, 0.69660881])

    assert np.allclose(y, y_test)

    y = approximator.predict(x_s, x_a, prediction='sum')
    y_test = np.array([2.46128492, 3.48304404])

    assert np.allclose(y, y_test)

    y = approximator.predict(x_s, x_a, prediction='min')
    y_test = np.array([[0.49225698, 0.69660881]])

    assert np.allclose(y, y_test)

    y = approximator.predict(x_s)
    y_test = np.array([[0.49225698, 0.44154141],
                       [0.69660881, 0.69060195]])

    assert np.allclose(y, y_test)

    approximator = Regressor(LinearApproximator, input_shape=(3,),
                             n_actions=n_actions)

    approximator.fit(s, a, q)

    gradient = approximator.diff(x_s[0], x_a[0])
    gradient_test = np.array([0.88471362, 0.11666548, 0.45466254,
                              0., 0., 0.])

    assert np.allclose(gradient, gradient_test)
class GaussianDQN(Agent):
    def __init__(self, approximator, policy, mdp_info, batch_size,
                 target_update_frequency, initial_replay_size,
                 max_replay_size, fit_params=None, approximator_params=None,
                 clip_reward=True, update_type='weighted', delta=0.1,
                 store_prob=False, q_max=100, max_spread=None):
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency
        self.update_type = update_type
        self.delta = delta
        self.standard_bound = norm.ppf(1 - self.delta, loc=0, scale=1)
        self.store_prob = store_prob
        self.q_max = q_max
        self.max_spread = max_spread

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        self._n_updates = 0
        self._epsilon = 1e-7

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(approximator_params)
        apprx_params_target['name'] = 'target'
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

        super(GaussianDQN, self).__init__(policy, mdp_info)

    @staticmethod
    def _compute_prob_max(mean_list, sigma_list):
        n_actions = len(mean_list)
        lower_limit = mean_list - 8 * sigma_list
        upper_limit = mean_list + 8 * sigma_list
        epsilon = 1e2
        n_trapz = 100
        x = np.zeros(shape=(n_trapz, n_actions))
        y = np.zeros(shape=(n_trapz, n_actions))
        integrals = np.zeros(n_actions)
        for j in range(n_actions):
            if sigma_list[j] < epsilon:
                p = 1
                for k in range(n_actions):
                    if k != j:
                        p *= norm.cdf(mean_list[j], loc=mean_list[k],
                                      scale=sigma_list[k])
                integrals[j] = p
            else:
                x[:, j] = np.linspace(lower_limit[j], upper_limit[j], n_trapz)
                y[:, j] = norm.pdf(x[:, j], loc=mean_list[j],
                                   scale=sigma_list[j])
                for k in range(n_actions):
                    if k != j:
                        y[:, j] *= norm.cdf(x[:, j], loc=mean_list[k],
                                            scale=sigma_list[k])
                integrals[j] = (upper_limit[j] - lower_limit[j]) / (
                    2 * (n_trapz - 1)) * (y[0, j] + y[-1, j] +
                                          2 * np.sum(y[1:-1, j]))
        # print(np.sum(integrals))
        # assert np.isclose(np.sum(integrals), 1)
        with np.errstate(divide='raise'):
            try:
                return integrals / np.sum(integrals)
            except FloatingPointError:
                print(integrals)
                print(mean_list)
                print(sigma_list)
                input()

    def fit(self, dataset):
        mask = np.ones((len(dataset), 2))
        self._replay_memory.add(dataset, mask)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, mask = \
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next, sigma_next, prob_explore = self._next_q(
                next_state, absorbing)

            q = reward + self.mdp_info.gamma * q_next
            sigma = self.mdp_info.gamma * sigma_next

            stacked = np.stack([q, sigma])

            self.approximator.fit(state, action, stacked,
                                  prob_exploration=prob_explore,
                                  **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        q_and_sigma = self.target_approximator.predict(next_state).squeeze()
        q = q_and_sigma[0, :, :]
        sigma = q_and_sigma[1, :, :]

        for i in range(q.shape[0]):
            if absorbing[i]:
                q[i] *= 0
                sigma[i] *= self._epsilon

        max_q = np.zeros((q.shape[0]))
        max_sigma = np.zeros((q.shape[0]))
        probs = []
        prob_explore = np.zeros(q.shape[0])
        for i in range(q.shape[0]):  # for each batch
            means = q[i, :]
            sigmas = sigma[i, :]
            prob = GaussianDQN._compute_prob_max(means, sigmas)
            probs.append(prob)
            prob_explore[i] = 1. - np.max(prob)

        if self.update_type == 'mean':
            best_actions = np.argmax(q, axis=1)
            for i in range(q.shape[0]):
                max_q[i] = q[i, best_actions[i]]
                max_sigma[i] = sigma[i, best_actions[i]]
        elif self.update_type == 'weighted':
            for i in range(q.shape[0]):  # for each batch
                means = q[i, :]
                sigmas = sigma[i, :]
                prob = probs[i]
                max_q[i] = np.sum(means * prob)
                max_sigma[i] = np.sum(sigmas * prob)
        elif self.update_type == 'optimistic':
            for i in range(q.shape[0]):  # for each batch
                means = q[i, :]
                sigmas = sigma[i, :]
                bounds = sigmas * self.standard_bound + means
                bounds = np.clip(bounds, -self.q_max, self.q_max)
                next_index = np.random.choice(
                    np.argwhere(bounds == np.max(bounds)).ravel())
                max_q[i] = q[i, next_index]
                max_sigma[i] = sigma[i, next_index]
        else:
            raise ValueError("Update type not implemented")

        return max_q, max_sigma, np.mean(prob_explore)

    def draw_action(self, state):
        action = super(GaussianDQN, self).draw_action(np.array(state))

        return action

    def episode_start(self):
        return
class Optimistic_AC(Agent):
    def __init__(self, approximator, policy, mdp_info, batch_size,
                 target_update_frequency, initial_replay_size,
                 max_replay_size, fit_params=None, approximator_params=None,
                 n_approximators=1, clip_reward=True, weighted_update=False,
                 update_type='weighted', delta=0.1, q_max=100,
                 store_prob=False, max_spread=None):
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency
        self.weighted_update = weighted_update
        self.update_type = update_type
        self.q_max = q_max
        self.store_prob = store_prob
        self.max_spread = max_spread
        quantiles = [i * 1. / (n_approximators - 1)
                     for i in range(n_approximators)]
        for p in range(n_approximators):
            if quantiles[p] >= 1 - delta:
                self.delta_index = p
                break

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        self._n_updates = 0

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(approximator_params)
        apprx_params_target['name'] = 'target'
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

        super(Optimistic_AC, self).__init__(policy, mdp_info)

    @staticmethod
    def _compute_prob_max(q_list):
        q_array = np.array(q_list).T
        score = (q_array[:, :, None, None] >= q_array).astype(int)
        prob = score.sum(axis=3).prod(axis=2).sum(axis=1)
        prob = prob.astype(np.float32)
        return prob / np.sum(prob)

    @staticmethod
    def scale(x, out_range=(-1, 1), axis=None):
        domain = np.min(x, axis), np.max(x, axis)
        y = (x - (domain[1] + domain[0]) / 2) / (domain[1] - domain[0])
        return y * (out_range[1] - out_range[0]) + (
            out_range[1] + out_range[0]) / 2

    def fit(self, dataset):
        mask = np.ones((len(dataset), self._n_approximators))
        self._replay_memory.add(dataset, mask)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, mask =\
                self._replay_memory.get(self._batch_size)

            self.policy.update(state, self.approximator)
            if self.update_type == 'ensemble_policy':
                self.ensemble_policy.update(state, self.approximator)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next, prob_explore = self._next_q(next_state, absorbing)

            if self.max_spread is not None:
                for i in range(q_next.shape[0]):  # for each batch
                    min_range = np.min(q_next[i])
                    max_range = np.max(q_next[i])
                    if max_range - min_range > self.max_spread:
                        clip_range = (max_range - min_range) - self.max_spread
                        out_range = [min_range + clip_range / 2,
                                     max_range - clip_range / 2]
                        q_next[i] = Optimistic_AC.scale(q_next[i],
                                                        out_range=out_range,
                                                        axis=None)

            q = reward.reshape(self._batch_size,
                               1) + self.mdp_info.gamma * q_next

            margin = 0.05
            self.approximator.fit(state, action, q, mask=mask,
                                  prob_exploration=prob_explore,
                                  **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        if self.update_type == 'sarsa':
            a = self.policy.predict(next_state)
            q = np.array(self.target_approximator.predict(next_state, a))[0]
            for i in range(q.shape[1]):
                if absorbing[i]:
                    q[:, i, :] *= 0

            return q, 0
        elif self.update_type == 'ensemble_policy':
            # one row of target particles per batch element
            max_q = np.zeros((next_state.shape[0], self._n_approximators))
            for b in range(next_state.shape[0]):
                s = next_state[b]
                actions = self.ensemble_policy.predict(s)  # num_policies x num_particles
                q = np.array(self.target_approximator.predict(
                    [s] * actions.shape[0], actions))[0]
                particles = q[:, :]
                particles = np.sort(particles, axis=0)
                means = np.mean(particles, axis=0)
                bounds = means + particles[self.delta_index, :]
                bounds = np.clip(bounds, -self.q_max, self.q_max)
                next_index = np.random.choice(
                    np.argwhere(bounds == np.max(bounds)).ravel())
                max_q[b, :] = particles[:, next_index]

            return max_q, 0
        else:
            raise ValueError("Update type not supported")

    def draw_action(self, state):
        action = super(Optimistic_AC, self).draw_action(np.array(state))

        return action

    def episode_start(self):
        self.policy.set_idx(np.random.randint(self._n_approximators))
class BootstrappedDQN(Agent):
    def __init__(self, approximator, policy, mdp_info, batch_size,
                 target_update_frequency, initial_replay_size,
                 max_replay_size, fit_params=None, approximator_params=None,
                 n_approximators=1, clip_reward=True, p_mask=2 / 3.):
        self._fit_params = dict() if fit_params is None else fit_params

        self._batch_size = batch_size
        self._n_approximators = n_approximators
        self._clip_reward = clip_reward
        self._target_update_frequency = target_update_frequency
        self._p_mask = p_mask

        self._replay_memory = ReplayMemory(initial_replay_size,
                                           max_replay_size)

        self._n_updates = 0
        self._episode_steps = 0

        apprx_params_train = deepcopy(approximator_params)
        apprx_params_train['name'] = 'train'
        apprx_params_target = deepcopy(approximator_params)
        apprx_params_target['name'] = 'target'
        self.approximator = Regressor(approximator, **apprx_params_train)
        self.target_approximator = Regressor(approximator,
                                             **apprx_params_target)
        policy.set_q(self.approximator)

        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

        super(BootstrappedDQN, self).__init__(policy, mdp_info)

    def fit(self, dataset):
        mask = np.random.binomial(1, self._p_mask,
                                  size=(len(dataset), self._n_approximators))
        self._replay_memory.add(dataset, mask)
        if self._replay_memory.initialized:
            state, action, reward, next_state, absorbing, _, mask =\
                self._replay_memory.get(self._batch_size)

            if self._clip_reward:
                reward = np.clip(reward, -1, 1)

            q_next = self._next_q(next_state, absorbing)
            q = reward.reshape(self._batch_size,
                               1) + self.mdp_info.gamma * q_next

            self.approximator.fit(state, action, q, mask=mask,
                                  **self._fit_params)

            self._n_updates += 1

            if self._n_updates % self._target_update_frequency == 0:
                self._update_target()

    def _update_target(self):
        """
        Update the target network.

        """
        self.target_approximator.model.set_weights(
            self.approximator.model.get_weights())

    def _next_q(self, next_state, absorbing):
        """
        Args:
            next_state (np.ndarray): the states where next action has to be
                evaluated;
            absorbing (np.ndarray): the absorbing flag for the states in
                ``next_state``.

        Returns:
            Maximum action-value for each state in ``next_state``.

        """
        q = np.array(self.target_approximator.predict(next_state))[0]
        for i in range(q.shape[1]):
            if absorbing[i]:
                q[:, i, :] *= 1. - absorbing[i]

        max_q = np.max(q, axis=2)

        return max_q.T

    def draw_action(self, state):
        action = super(BootstrappedDQN, self).draw_action(np.array(state))
        self._episode_steps += 1

        return action

    def episode_start(self):
        self._episode_steps = 0
        self.policy.set_idx(np.random.randint(self._n_approximators))
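# A toy sketch (assumed names, not the library API) of the bootstrap masking
# used in BootstrappedDQN.fit above: each transition is assigned to every head
# independently with probability p_mask, and a head's loss only counts the
# transitions whose mask bit is 1.
import numpy as np

rng = np.random.RandomState(0)
n_transitions, n_heads, p_mask = 6, 4, 2 / 3.
mask = rng.binomial(1, p_mask, size=(n_transitions, n_heads))

# masked mean-squared error for one head, given predictions and targets
pred = rng.rand(n_transitions)
target = rng.rand(n_transitions)
head = 0
per_sample = (pred - target) ** 2 * mask[:, head]
loss = per_sample.sum() / max(mask[:, head].sum(), 1)
print(mask)
print(loss)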