def __init__(self, env, q_network, loss_func=None, optimizer=None,
             gamma=0.99, buffer_size=1e6, node_selector=None,
             test_node_selector=None, logger=None):
    super(DQN, self).__init__()
    if loss_func is None:
        loss_func = rm.ClippedMeanSquaredError()
    if optimizer is None:
        optimizer = rm.Rmsprop(lr=0.00025, g=0.95)

    # Reset parameters.
    self._q_network = q_network

    # Copy network architectures.
    # Target network.
    self._target_q_network = copy.deepcopy(self._q_network)
    # The network which earned the highest summed reward in each update period.
    self._best_q_network = copy.deepcopy(self._q_network)

    self._gamma = gamma
    self.env = env
    self.loss_func = loss_func
    self._optimizer = optimizer
    self.gamma = gamma
    self.epsilon_update = None
    self.node_selector = MaxNodeChooser() if node_selector is None else node_selector
    self.test_node_selector = MaxNodeChooser() if test_node_selector is None \
        else test_node_selector

    # Check Env class type.
    if isinstance(env, BaseEnv):
        action_shape = env.action_shape
        state_shape = env.state_shape
    else:
        raise Exception("Argument env must be an object of BaseEnv class.")

    # Check state and action shapes against the environment and the q-network.
    assert state_shape == self.env.reset().shape, \
        "Expected state shape is {} but actual is {}".format(
            state_shape, self.env.reset().shape)

    action = self._q_network(np.zeros((1, *state_shape))).as_ndarray()
    assert action.shape[1:] == action_shape, \
        "Expected action shape is {} but actual is {}".format(
            action_shape, action.shape[1:])

    self._action_shape = action_shape
    self._state_shape = state_shape
    self._buffer = ReplayBuffer([1, ], self._state_shape, buffer_size)
    self._initialize()

    # Logger.
    logger = DQNLogger() if logger is None else logger
    assert isinstance(logger, Logger), "Argument logger must be a Logger instance"
    logger._key_check(log_key=_dqn_keys, log_key_epoch=_dqn_keys_epoch)
    self.logger = logger
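# ---------------------------------------------------------------------------
# Usage sketch for the constructor above (a minimal, hypothetical example).
# "CartPoleLikeEnv", its shapes, and the layer sizes are illustrative only;
# it implements just the pieces the constructor actually touches
# (`action_shape`, `state_shape`, `reset()`). A real environment would also
# implement the rest of the BaseEnv interface used during training, and the
# exact BaseEnv initialization details may differ. `BaseEnv` and `DQN` are the
# classes shown in this module.
# ---------------------------------------------------------------------------
import numpy as np
import renom as rm


class CartPoleLikeEnv(BaseEnv):
    """Hypothetical BaseEnv subclass with a 4-dim state and 2 actions."""

    def __init__(self):
        self.action_shape = (2,)
        self.state_shape = (4,)

    def reset(self):
        # Initial observation; its shape must equal `state_shape`.
        return np.zeros(self.state_shape)


q_network = rm.Sequential([
    rm.Dense(32),
    rm.Relu(),
    rm.Dense(2),  # output width must match the environment's action_shape
])

agent = DQN(CartPoleLikeEnv(), q_network, gamma=0.99, buffer_size=1e5)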
def train(self, env, loss_func=rm.ClippedMeanSquaredError(),
          optimizer=rm.Rmsprop(lr=0.00025, g=0.95),
          epoch=100, batch_size=32, random_step=1000, one_epoch_step=20000,
          test_step=1000, test_env=None, update_period=10000,
          greedy_step=1000000, min_greedy=0.0, max_greedy=0.9,
          test_greedy=0.95, train_frequency=4):
    """This method executes training of the q-network.
    Training is performed with the epsilon-greedy method.

    Args:
        env (function): A function which accepts an action as an argument and
            returns prestate, state, reward and terminal.
        loss_func (Model): Loss function for training the q-network.
        optimizer (Optimizer): Optimizer object for training the q-network.
        epoch (int): Number of epochs for training.
        batch_size (int): Batch size.
        random_step (int): Number of random steps executed before training starts.
        one_epoch_step (int): Number of steps in one epoch.
        test_step (int): Number of test steps.
        test_env (function): An environment function used for testing.
        update_period (int): Period (in training steps) of updating the target network.
        greedy_step (int): Number of steps over which the greedy rate is annealed
            from min_greedy to max_greedy.
        min_greedy (float): Minimum greedy rate.
        max_greedy (float): Maximum greedy rate.
        test_greedy (float): Greedy rate used during the test phase.
        train_frequency (int): A training update is performed once every this many steps.

    Returns:
        (dict): A dictionary which includes the reward lists of training and testing
            and the loss list.

    Example:
        >>> import renom as rm
        >>> from renom.algorithm.reinforcement.dqn import DQN
        >>>
        >>> state_size = (4, 84, 84)
        >>> action_pattern = 4
        >>>
        >>> q_network = rm.Sequential([
        ...    rm.Conv2d(32, filter=8, stride=4),
        ...    rm.Relu(),
        ...    rm.Conv2d(64, filter=4, stride=2),
        ...    rm.Relu(),
        ...    rm.Conv2d(64, filter=3, stride=1),
        ...    rm.Relu(),
        ...    rm.Flatten(),
        ...    rm.Dense(512),
        ...    rm.Relu(),
        ...    rm.Dense(action_pattern)
        ... ])
        >>>
        >>> def environment(action):
        ...     prestate = ...
        ...     state = ...
        ...     reward = ...
        ...     terminal = ...
        ...     return prestate, state, reward, terminal
        >>>
        >>> # Instantiation of DQN object
        >>> dqn = DQN(q_network,
        ...           state_size=state_size,
        ...           action_pattern=action_pattern,
        ...           gamma=0.99,
        ...           buffer_size=buffer_size)
        >>>
        >>> # Training
        >>> train_history = dqn.train(environment,
        ...                           loss_func=rm.ClippedMeanSquaredError(clip=(-1, 1)),
        ...                           epoch=50,
        ...                           random_step=5000,
        ...                           one_epoch_step=25000,
        ...                           test_step=2500,
        ...                           test_env=environment,
        ...                           optimizer=rm.Rmsprop(lr=0.00025, g=0.95))
        >>>
        Executing random action for 5000 step...
        epoch 000 avg loss:0.0060 avg reward:0.023: 100%|██████████| 25000/25000 [19:12<00:00, 21.70it/s]
            /// Result
            Average train error: 0.006
            Avg train reward in one epoch: 1.488
            Avg test reward in one epoch: 1.216
            Test reward: 63.000
            Greedy: 0.0225
            Buffer: 29537
        ...
        >>>
        >>> print(train_history["train_reward"])
    """
    # History of learning.
    train_reward_list = []
    test_reward_list = []
    train_error_list = []

    greedy = min_greedy
    g_step = (max_greedy - min_greedy) / greedy_step

    if test_env is None:
        test_env = env

    print("Executing random action for %d step..."
          % random_step)
    for r in range(random_step):
        action = int(np.random.rand() * self._action_size)
        prestate, state, reward, terminal = env(action)
        if prestate is not None:
            self._buffer.store(prestate, np.array(action),
                               np.array(reward), state, np.array(terminal))

    state = None
    prestate = None
    count = 0
    for e in range(epoch):
        loss = 0
        sum_reward = 0
        train_one_epoch_reward = []
        train_each_epoch_reward = []
        test_one_epoch_reward = []
        test_each_epoch_reward = []
        tq = tqdm(range(one_epoch_step))
        for j in range(one_epoch_step):
            # Epsilon-greedy action selection.
            if greedy > np.random.rand() and state is not None:
                action = np.argmax(np.atleast_2d(
                    self._network(state[None, ...])), axis=1)
            else:
                action = int(np.random.rand() * self._action_size)
            prestate, state, reward, terminal = env(action)
            greedy += g_step
            greedy = np.clip(greedy, min_greedy, max_greedy)
            sum_reward += reward

            if prestate is not None:
                self._buffer.store(prestate, np.array(action),
                                   np.array(reward), state, np.array(terminal))
                train_one_epoch_reward.append(reward)
            else:
                if len(train_one_epoch_reward) > 0:
                    train_each_epoch_reward.append(np.sum(train_one_epoch_reward))
                train_one_epoch_reward = []

            if j % train_frequency == 0:
                # Training step.
                train_prestate, train_action, train_reward, train_state, train_terminal = \
                    self._buffer.get_minibatch(batch_size)

                self._network.set_models(inference=True)
                self._target_network.set_models(inference=True)

                target = self._network(train_prestate).as_ndarray()
                target.setflags(write=True)

                # train_state = train_state.reshape(batch_size, *self._state_size)
                value = self._target_network(train_state).as_ndarray() \
                    * self._gamma * (~train_terminal[:, None])

                for i in range(batch_size):
                    a = int(train_action[i, 0])
                    target[i, a] = train_reward[i] + value[i, a]

                self._network.set_models(inference=False)
                with self._network.train():
                    z = self._network(train_prestate)
                    l = loss_func(z, target)
                l.grad().update(optimizer)
                loss += l.as_ndarray()

                if count % update_period == 0:
                    self.update()
                    count = 0
                count += 1

            msg = "epoch {:03d} loss:{:6.4f} sum reward:{:5.3f}".format(
                e, float(l.as_ndarray()), sum_reward)
            tq.set_description(msg)
            tq.update(1)

        train_reward_list.append(sum_reward)
        train_error_list.append(float(loss) / (j + 1))

        msg = "epoch {:03d} avg loss:{:6.4f} avg reward:{:5.3f}".format(
            e, float(loss) / (j + 1), sum_reward / one_epoch_step)
        tq.set_description(msg)
        tq.update(0)
        tq.refresh()
        tq.close()

        # Test phase.
        state = None
        sum_reward = 0
        for j in range(test_step):
            if test_greedy > np.random.rand() and state is not None:
                action = self.action(state)
            else:
                action = int(np.random.rand() * self._action_size)
            prestate, state, reward, terminal = test_env(action)

            if prestate is not None:
                test_one_epoch_reward.append(reward)
            else:
                if len(test_one_epoch_reward) > 0:
                    test_each_epoch_reward.append(np.sum(test_one_epoch_reward))
                test_one_epoch_reward = []

            sum_reward += float(reward)

        test_reward_list.append(sum_reward)

        tq.write("    /// Result")
        tq.write("    Average train error: {:5.3f}".format(float(loss) / one_epoch_step))
        tq.write("    Avg train reward in one epoch: {:5.3f}".format(
            np.mean(train_each_epoch_reward)))
        tq.write("    Avg test reward in one epoch: {:5.3f}".format(
            np.mean(test_each_epoch_reward)))
        tq.write("    Test reward: {:5.3f}".format(sum_reward))
        tq.write("    Greedy: {:1.4f}".format(greedy))
        tq.write("    Buffer: {}".format(len(self._buffer)))

        sleep(0.25)  # This is for jupyter notebook representation.

    return {
        "train_reward": train_reward_list,
        "train_error": train_error_list,
        "test_reward": test_reward_list
    }
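# ---------------------------------------------------------------------------
# Sketch of the TD-target construction performed in the training step above,
# rewritten with plain numpy arrays so the indexing is easier to follow.
# The batch values (q_pre, q_next, actions, rewards, terminals) are random
# placeholders. Note that the loop above bootstraps from the target network's
# value at the *taken* action; the textbook DQN target instead takes the
# maximum over next-state Q-values (shown in the trailing comment).
# ---------------------------------------------------------------------------
import numpy as np

batch_size, n_actions, gamma = 4, 3, 0.99
q_pre = np.random.rand(batch_size, n_actions)    # q-network output for prestates
q_next = np.random.rand(batch_size, n_actions)   # target-network output for states
actions = np.random.randint(0, n_actions, (batch_size, 1))
rewards = np.random.rand(batch_size)
terminals = np.zeros(batch_size, dtype=bool)

target = q_pre.copy()
value = q_next * gamma * (~terminals[:, None])   # zero the bootstrap term at terminals
for i in range(batch_size):
    a = int(actions[i, 0])
    target[i, a] = rewards[i] + value[i, a]
    # Textbook DQN: target[i, a] = rewards[i] + gamma * q_next[i].max() * (not terminals[i])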
def train(self, env, loss_func=rm.ClippedMeanSquaredError(),
          optimizer_critic=rm.Adam(lr=0.0001),
          optimizer_actor=rm.Adam(lr=0.0001),
          episode=100, batch_size=32, random_step=1000, one_episode_step=5000,
          test_step=1000, test_env=None, update_period=10000,
          greedy_step=1000000, min_greedy=0.1, max_greedy=0.9,
          exploration_rate=1., test_greedy=0.95, callbacks=None):
    greedy = min_greedy
    g_step = (max_greedy - min_greedy) / greedy_step

    if test_env is None:
        test_env = env

    print("Executing random action for %d step..." % random_step)
    for r in range(random_step):
        action = np.random.rand(*self._action_size)
        prestate, action, reward, state, terminal = env(action)
        if prestate is not None:
            self._buffer.store(prestate, np.array(action),
                               np.array(reward), state, np.array(terminal))

    state = None
    prestate = None
    count = 0
    for e in range(episode):
        loss = 0
        tq = tqdm(range(one_episode_step))
        for j in range(one_episode_step):
            if state is not None:
                # Deterministic policy plus exploration noise that decays as greedy grows.
                action = np.atleast_2d(self.action(state[None, ...])) + \
                    np.random.randn(batch_size, self._action_size) * \
                    (1 - greedy) * exploration_rate
            else:
                # No state yet (first step); fall back to a random action
                # as in the warm-up phase above.
                action = np.random.rand(*self._action_size)
            prestate, action, reward, state, terminal = env(action)
            greedy += g_step
            greedy = np.clip(greedy, min_greedy, max_greedy)

            if prestate is not None:
                self._buffer.store(prestate, np.array(action),
                                   np.array(reward), state, np.array(terminal))

            # Training step.
            train_prestate, train_action, train_reward, train_state, train_terminal = \
                self._buffer.get_minibatch(batch_size)

            target = np.zeros((batch_size, self._action_size), dtype=state.dtype)
            for i in range(batch_size):
                target[i, int(train_action[i, 0])] = train_reward[i]

            self._target_actor.set_models(inference=True)
            self._target_critic.set_models(inference=True)
            action_state_value = self._target_critic(
                train_state, self._target_actor(train_state))
            target += (action_state_value * self._gamma *
                       (~train_terminal[:, None])).as_ndarray()

            # Critic update: minimize the TD error.
            self._actor.set_models(inference=True)
            self._critic.set_models(inference=False)
            with self._critic.train():
                z = self._critic(train_prestate, self._actor(train_prestate))
                ls = loss_func(z, target)
            with self._actor.prevent_update():
                ls.grad().update(optimizer_critic)

            # Actor update: seed the backward pass with -1 so the optimizer
            # ascends the critic's output.
            self._actor.set_models(inference=True)
            self._critic.set_models(inference=False)
            with self._critic.train():
                z = self._critic(train_prestate, self._actor(train_prestate))
            with self._actor.prevent_update():
                z.grad(-1.).update(optimizer_actor)

            loss += ls.as_ndarray()

            if count % update_period == 0:
                self.update()
                count = 0
            count += 1

            tq.set_description("episode {:03d} loss:{:6.4f}".format(
                e, float(ls.as_ndarray())))
            tq.update(1)

        tq.set_description("episode {:03d} avg loss:{:6.4f}".format(
            e, float(loss) / (j + 1)))
        tq.update(0)
        tq.refresh()
        tq.close()

        # Test phase.
        state = None
        sum_reward = 0
        for j in range(test_step):
            if state is not None:
                action = self.action(state) + \
                    np.random.randn(batch_size, self._action_size) * \
                    (1 - test_greedy) * exploration_rate
            else:
                action = np.random.rand(*self._action_size)
            prestate, action, reward, state, terminal = test_env(action)
            sum_reward += float(reward)

        tq.write("    /// Result")
        tq.write("    Average train error:{:1.6f}".format(float(loss) / one_episode_step))
        tq.write("    Test reward:{}".format(sum_reward))
        tq.write("    Greedy:{:1.4f}".format(greedy))
        tq.write("    Buffer:{}".format(len(self._buffer)))

        if isinstance(callbacks, dict):
            func = callbacks.get("end_episode", False)
            if func:
                func()

        sleep(0.25)  # This is for jupyter notebook representation.
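# ---------------------------------------------------------------------------
# Usage sketch for the actor-critic training loop above. The environment
# function, its state/action sizes, and the `agent` object are hypothetical;
# only the 5-tuple return convention (prestate, action, reward, state,
# terminal) and the "end_episode" callback key come from the method itself.
# ---------------------------------------------------------------------------
import numpy as np


def environment(action):
    # Replace with a real simulator: apply `action`, observe the next state.
    prestate = np.zeros((3,))
    state = np.zeros((3,))
    reward = 0.0
    terminal = False
    return prestate, action, reward, state, terminal


def on_episode_end():
    print("episode finished")


# `agent` is assumed to be an already constructed actor-critic object that
# provides this train() method (actor/critic networks built in its __init__).
# agent.train(environment,
#             episode=10,
#             one_episode_step=1000,
#             random_step=500,
#             exploration_rate=0.5,
#             callbacks={"end_episode": on_episode_end})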