Example #1
    def __init__(self,
                 env,
                 q_network,
                 loss_func=None,
                 optimizer=None,
                 gamma=0.99,
                 buffer_size=1e6,
                 node_selector=None,
                 test_node_selector=None,
                 logger=None):
        super(DQN, self).__init__()

        if loss_func is None:
            loss_func = rm.ClippedMeanSquaredError()

        if optimizer is None:
            optimizer = rm.Rmsprop(lr=0.00025, g=0.95)

        # Reset parameters.
        self._q_network = q_network

        # Copy network architectures.
        # Target network.
        self._target_q_network = copy.deepcopy(self._q_network)
        # The network which earned the highest summed reward in each update period.
        self._best_q_network = copy.deepcopy(self._q_network)

        self._gamma = gamma
        self.env = env
        self.loss_func = loss_func
        self._optimizer = optimizer
        self.gamma = gamma
        self.epsilon_update = None

        if node_selector is None:
            node_selector = MaxNodeChooser()
        if test_node_selector is None:
            test_node_selector = MaxNodeChooser()
        self.node_selector = node_selector
        self.test_node_selector = test_node_selector

        # Check Env class type.
        if isinstance(env, BaseEnv):
            action_shape = env.action_shape
            state_shape = env.state_shape
        else:
            raise Exception("Argument env must be an object of the BaseEnv class.")

        # Check state and action shape
        assert state_shape == self.env.reset().shape, \
            "Expected state shape is {} but acctual is {}".format(
                state_shape, self.env.reset().shape)

        action = self._q_network(np.zeros((1, *state_shape))).as_ndarray()
        assert action.shape[1:] == action_shape, \
            "Expected action shape is {} but acctual is {}".format(action_shape, action.shape[1:])

        self._action_shape = action_shape
        self._state_shape = state_shape
        self._buffer = ReplayBuffer([1], self._state_shape, buffer_size)
        self._initialize()

        # logger
        logger = DQNLogger() if logger is None else logger
        assert isinstance(logger, Logger), \
            "Argument logger must be an instance of the Logger class"
        logger._key_check(log_key=_dqn_keys, log_key_epoch=_dqn_keys_epoch)
        self.logger = logger
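
This constructor requires env to be an instance of BaseEnv exposing action_shape, state_shape and reset(), and a q_network whose output width matches action_shape. Below is a minimal usage sketch under those assumptions; ToyEnv, its shapes and the layer sizes are hypothetical, the import paths of DQN and BaseEnv are not shown in the source, and BaseEnv may require further methods that this constructor does not call.

import numpy as np
import renom as rm
# DQN and BaseEnv are assumed to be importable from the package that
# contains the code above; the source does not show the import path.

class ToyEnv(BaseEnv):
    # Hypothetical environment: only the attributes and the reset() method
    # that the constructor above actually touches are defined here.
    action_shape = (2,)
    state_shape = (4,)

    def reset(self):
        # Must return an array whose shape equals state_shape.
        return np.zeros(self.state_shape)

q_network = rm.Sequential([
    rm.Dense(64),
    rm.Relu(),
    rm.Dense(2),   # output width must match action_shape
])

# gamma and buffer_size keep the defaults from the signature above.
dqn = DQN(env=ToyEnv(), q_network=q_network, gamma=0.99, buffer_size=1e6)
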
Example #2
    def train(self,
              env,
              loss_func=rm.ClippedMeanSquaredError(),
              optimizer=rm.Rmsprop(lr=0.00025, g=0.95),
              epoch=100,
              batch_size=32,
              random_step=1000,
              one_epoch_step=20000,
              test_step=1000,
              test_env=None,
              update_period=10000,
              greedy_step=1000000,
              min_greedy=0.0,
              max_greedy=0.9,
              test_greedy=0.95,
              train_frequency=4):
        """This method executes training of a q-network.
        Training will be done with epsilon-greedy method.

        Args:
            env (function): A function which accepts an action as an argument
                and returns prestate, state, reward and terminal.
            loss_func (Model): Loss function for training q-network.
            optimizer (Optimizer): Optimizer object for training q-network.
            epoch (int): Number of epoch for training.
            batch_size (int): Batch size.
            random_step (int): Number of random steps which will be executed before training.
            one_epoch_step (int): Number of steps in one epoch.
            test_step (int): Number of test steps run after each epoch.
            test_env (function): An environment function used for the test steps.
            update_period (int): Period (in training steps) of updating the target network.
            greedy_step (int): Number of steps over which the greedy rate is annealed
                from min_greedy to max_greedy.
            min_greedy (float): Minimum greedy rate.
            max_greedy (float): Maximum greedy rate.
            test_greedy (float): Greedy rate used during the test steps.
            train_frequency (int): Training is performed once every train_frequency steps.

        Returns:
            (dict): A dictionary which includes reward list of training and loss list.

        Example:
            >>> import renom as rm
            >>> from renom.algorithm.reinforcement.dqn import DQN
            >>>
            >>> state_size = (4, 84, 84)
            >>> action_pattern = 4
            >>>
            >>> q_network = rm.Sequential([
            ...    rm.Conv2d(32, filter=8, stride=4),
            ...    rm.Relu(),
            ...    rm.Conv2d(64, filter=4, stride=2),
            ...    rm.Relu(),
            ...    rm.Conv2d(64, filter=3, stride=1),
            ...    rm.Relu(),
            ...    rm.Flatten(),
            ...    rm.Dense(512),
            ...    rm.Relu(),
            ...    rm.Dense(action_pattern)
            ... ])
            >>>
            >>> def environment(action):
            ...     prestate = ...
            ...     state = ...
            ...     reward = ...
            ...     terminal = ...
            ...     return prestate, state, reward, terminal
            >>>
            >>> # Instantiation of DQN object
            >>> dqn = DQN(q_network,
            ...           state_size=state_size,
            ...           action_pattern=action_pattern,
            ...           ganma=0.99,
            ...           buffer_size=buffer_size)
            >>>
            >>> # Training
            >>> train_history = dqn.train(environment,
            ...           loss_func=rm.ClippedMeanSquaredError(clip=(-1, 1)),
            ...           epoch=50,
            ...           random_step=5000,
            ...           one_epoch_step=25000,
            ...           test_step=2500,
            ...           test_env=environment,
            ...           optimizer=rm.Rmsprop(lr=0.00025, g=0.95))
            >>>
            Executing random actions for 5000 steps...
            epoch 000 avg loss:0.0060 avg reward:0.023: 100%|██████████| 25000/25000 [19:12<00:00, 21.70it/s]
                /// Result
                Average train error: 0.006
                Avg train reward in one epoch: 1.488
                Avg test reward in one epoch: 1.216
                Test reward: 63.000
                Greedy: 0.0225
                Buffer: 29537
                ...
            >>>
            >>> print(train_history["train_reward"])

        """

        # History of Learning
        train_reward_list = []
        test_reward_list = []
        train_error_list = []

        greedy = min_greedy
        g_step = (max_greedy - min_greedy) / greedy_step

        if test_env is None:
            test_env = env

        print("Executing random action for %d step..." % random_step)
        for r in range(random_step):
            action = int(np.random.rand() * self._action_size)
            prestate, state, reward, terminal = env(action)
            if prestate is not None:
                self._buffer.store(prestate, np.array(action),
                                   np.array(reward), state, np.array(terminal))

        state = None
        prestate = None
        count = 0
        for e in range(epoch):
            loss = 0
            sum_reward = 0
            train_one_epoch_reward = []
            train_each_epoch_reward = []

            test_one_epoch_reward = []
            test_each_epoch_reward = []

            tq = tqdm(range(one_epoch_step))
            for j in range(one_epoch_step):
                if greedy > np.random.rand() and state is not None:
                    action = np.argmax(np.atleast_2d(
                        self._network(state[None, ...])),
                                       axis=1)
                else:
                    action = int(np.random.rand() * self._action_size)
                prestate, state, reward, terminal = env(action)
                greedy += g_step
                greedy = np.clip(greedy, min_greedy, max_greedy)
                sum_reward += reward

                if prestate is not None:
                    self._buffer.store(prestate, np.array(action),
                                       np.array(reward), state,
                                       np.array(terminal))
                    train_one_epoch_reward.append(reward)
                else:
                    if len(train_one_epoch_reward) > 0:
                        train_each_epoch_reward.append(
                            np.sum(train_one_epoch_reward))
                    train_one_epoch_reward = []

                if j % train_frequency == 0:
                    # Training
                    train_prestate, train_action, train_reward, train_state, train_terminal = \
                        self._buffer.get_minibatch(batch_size)

                    self._network.set_models(inference=True)
                    self._target_network.set_models(inference=True)

                    target = self._network(train_prestate).as_ndarray()
                    target.setflags(write=True)

                    # train_state = train_state.reshape(batch_size, *self._state_size)
                    value = self._target_network(train_state).as_ndarray(
                    ) * self._ganma * (~train_terminal[:, None])

                    for i in range(batch_size):
                        a = train_action[i, 0].astype(np.integer)
                        target[i, a] = train_reward[i] + value[i, a]

                    self._network.set_models(inference=False)
                    with self._network.train():
                        z = self._network(train_prestate)
                        l = loss_func(z, target)
                    l.grad().update(optimizer)
                    loss += l.as_ndarray()

                    if count % update_period == 0:
                        self.update()
                        count = 0
                    count += 1

                msg = "epoch {:03d} loss:{:6.4f} sum reward:{:5.3f}".format(
                    e, float(l.as_ndarray()), sum_reward)
                tq.set_description(msg)
                tq.update(1)

            train_reward_list.append(sum_reward)
            train_error_list.append(float(loss) / (j + 1))

            msg = ("epoch {:03d} avg loss:{:6.4f} avg reward:{:5.3f}".format(
                e,
                float(loss) / (j + 1), sum_reward / one_epoch_step))
            tq.set_description(msg)
            tq.update(0)
            tq.refresh()
            tq.close()

            # Test
            state = None
            sum_reward = 0
            for j in range(test_step):
                if test_greedy > np.random.rand() and state is not None:
                    action = self.action(state)
                else:
                    action = int(np.random.rand() * self._action_size)
                prestate, state, reward, terminal = test_env(action)

                if prestate is not None:
                    test_one_epoch_reward.append(reward)
                else:
                    if len(test_one_epoch_reward) > 0:
                        test_each_epoch_reward.append(
                            np.sum(test_one_epoch_reward))
                    test_one_epoch_reward = []

                sum_reward += float(reward)
            test_reward_list.append(sum_reward)

            tq.write("    /// Result")
            tq.write("    Average train error: {:5.3f}".format(
                float(loss) / one_epoch_step))
            tq.write("    Avg train reward in one epoch: {:5.3f}".format(
                np.mean(train_each_epoch_reward)))
            tq.write("    Avg test reward in one epoch: {:5.3f}".format(
                np.mean(test_each_epoch_reward)))
            tq.write("    Test reward: {:5.3f}".format(sum_reward))
            tq.write("    Greedy: {:1.4f}".format(greedy))
            tq.write("    Buffer: {}".format(len(self._buffer)))

            sleep(0.25)  # This is for jupyter notebook representation.

        return {
            "train_reward": train_reward_list,
            "train_error": train_error_list,
            "test_reward": test_reward_list
        }
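
The loop above relies on the environment contract described in the docstring: env(action) returns (prestate, state, reward, terminal), and prestate is None on the first step of an episode, which the loop uses to close the per-episode reward accumulator instead of storing a transition. A minimal sketch of such a wrapper around a hypothetical game object (its reset and step methods are assumptions, not part of the source):

import numpy as np

def make_environment(game):
    # Wraps a hypothetical `game` object into the callable that train()
    # expects. The contract, taken from the code above: return
    # (prestate, state, reward, terminal), with prestate=None on the
    # first step of every episode.
    holder = {"state": None}

    def environment(action):
        if holder["state"] is None:
            # Episode start: no previous state, so nothing to store yet.
            holder["state"] = game.reset()            # assumed interface
            return None, holder["state"], 0.0, False
        prestate = holder["state"]
        state, reward, terminal = game.step(action)   # assumed interface
        holder["state"] = None if terminal else state
        return prestate, state, reward, terminal

    return environment
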
Example #3
    def train(self,
              env,
              loss_func=rm.ClippedMeanSquaredError(),
              optimizer_critic=rm.Adam(lr=0.0001),
              optimizer_actor=rm.Adam(lr=0.0001),
              episode=100,
              batch_size=32,
              random_step=1000,
              one_episode_step=5000,
              test_step=1000,
              test_env=None,
              update_period=10000,
              greedy_step=1000000,
              min_greedy=0.1,
              max_greedy=0.9,
              exploration_rate=1.,
              test_greedy=0.95,
              callbacks=None):

        greedy = min_greedy
        g_step = (max_greedy - min_greedy) / greedy_step

        if test_env is None:
            test_env = env

        print("Execute random action for %d step..." % random_step)
        for r in range(random_step):
            action = np.random.rand(*self._action_size)
            prestate, action, reward, state, terminal = env(action)
            if prestate is not None:
                self._buffer.store(prestate, np.array(action),
                                   np.array(reward), state, np.array(terminal))
        state = None
        prestate = None
        count = 0
        for e in range(episode):
            loss = 0
            tq = tqdm(range(one_episode_step))
            for j in range(one_episode_step):
                if state is not None:
                    action = np.atleast_2d(self.action(state[None, ...])) + \
                        np.random.randn(batch_size, self._action_size) * (1 - greedy) * exploration_rate
                else:
                    # No state yet (start of training): fall back to a random action.
                    action = np.random.rand(*self._action_size)
                prestate, action, reward, state, terminal = env(action)
                greedy += g_step
                greedy = np.clip(greedy, min_greedy, max_greedy)
                if prestate is not None:
                    self._buffer.store(prestate, np.array(action),
                                       np.array(reward), state,
                                       np.array(terminal))

                # Training
                train_prestate, train_action, train_reward, train_state, train_terminal = \
                    self._buffer.get_minibatch(batch_size)

                target = np.zeros((batch_size, self._action_size),
                                  dtype=state.dtype)
                for i in range(batch_size):
                    target[i, train_action[
                        i, 0].astype(np.integer)] = train_reward[i]

                self._target_actor.set_models(inference=True)
                self._target_critic.set_models(inference=True)
                action_state_value = self._target_critic(
                    train_state, self._target_actor(train_state))
                target += (action_state_value * self._ganma *
                           (~train_terminal[:, None])).as_ndarray()

                self._actor.set_models(inference=True)
                self._critic.set_models(inference=False)
                with self._critic.train():
                    z = self._critic(train_prestate,
                                     self._actor(train_prestate))
                    ls = loss_func(z, target)

                with self._actor.prevent_update():
                    ls.grad().update(optimizer_critic)

                self._actor.set_models(inference=True)
                self._critic.set_models(inference=False)
                with self._critic.train():
                    z = self._critic(train_prestate,
                                     self._actor(train_prestate))

                with self._actor.prevent_update():
                    z.grad(-1.).update(optimizer_actor)

                loss += ls.as_ndarray()
                if count % update_period == 0:
                    self.update()
                    count = 0
                count += 1
                tq.set_description("episode {:03d} loss:{:6.4f}".format(
                    e, float(ls.as_ndarray())))
                tq.update(1)
            tq.set_description("episode {:03d} avg loss:{:6.4f}".format(
                e,
                float(loss) / (j + 1)))
            tq.update(0)
            tq.refresh()
            tq.close()

            # Test
            state = None
            sum_reward = 0

            for j in range(test_step):
                if state is not None:
                    action = self.action(state) +\
                        np.random.randn(batch_size, self._action_size) * \
                        (1 - test_greedy) * exploration_rate
                prestate, action, reward, state, terminal = test_env(action)
                sum_reward += float(reward)

            tq.write("    /// Result")
            tq.write("    Average train error:{:1.6f}".format(
                float(loss) / one_episode_step))
            tq.write("    Test reward:{}".format(sum_reward))
            tq.write("    Greedy:{:1.4f}".format(greedy))
            tq.write("    Buffer:{}".format(len(self._buffer)))

            if isinstance(callbacks, dict):
                func = callbacks.get("end_episode", False)
                if func:
                    func()

            sleep(0.25)  # This is for jupyter notebook representation.
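
The callbacks argument above is only inspected for an "end_episode" key, and the stored callable is invoked with no arguments at the end of every episode. A minimal usage sketch, where agent stands in for an already-constructed instance of this class and env for the environment function:

def on_end_episode():
    # Invoked once per episode by the training loop above;
    # a typical use is saving weights or logging metrics.
    print("episode finished")

agent.train(env, callbacks={"end_episode": on_end_episode})
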