Example #1
    def __init__(self, env, config):
        super(DQN, self).__init__("dqn", config)
        
        self.env = env
        self.batch_size = config.batch_size
        self.replay_buffer = ReplayBuffer(config.batch_size, config.memory_size, env.observation_space.shape)
        self.num_train = 0

        self.input_layer = tf.placeholder(tf.float32, (None,) + self.env.observation_space.shape)

        with tf.variable_scope(self.name):
            self._build_network()

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        model_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.name)

        self._saver = tf.train.Saver(model_vars)
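
A note on the buffer interface: Examples #1 and #3 construct the replay buffer as ReplayBuffer(batch_size, memory_size, observation_shape) and later call put(), sample() and size on it, but the buffer implementation itself is not shown on this page. The sketch below is only a minimal ring buffer consistent with those calls; the Batch field names (obs, action, reward, done, obs_next) are inferred from Example #3 and are assumptions, not the original code.

# Sketch only: a ring buffer matching the calls made in Examples #1 and #3.
# The `Batch` field names are inferred from Example #3 (assumptions).
import collections

import numpy as np

Batch = collections.namedtuple("Batch", "obs action reward done obs_next")


class ReplayBuffer(object):
    def __init__(self, batch_size, memory_size, obs_shape):
        self.batch_size = batch_size
        self.memory_size = memory_size
        self.obs = np.zeros((memory_size, ) + obs_shape, dtype=np.float32)
        self.action = np.zeros(memory_size, dtype=np.int32)
        self.reward = np.zeros(memory_size, dtype=np.float32)
        self.done = np.zeros(memory_size, dtype=np.bool_)
        self._next = 0
        self.size = 0

    def put(self, obs, action, reward, done):
        i = self._next
        self.obs[i], self.action[i] = obs, action
        self.reward[i], self.done[i] = reward, done
        self._next = (i + 1) % self.memory_size
        self.size = min(self.size + 1, self.memory_size)

    def sample(self):
        # sample indices whose successor is also stored, so obs_next is valid
        idx = np.random.randint(0, max(self.size - 1, 1), size=self.batch_size)
        return Batch(self.obs[idx], self.action[idx], self.reward[idx],
                     self.done[idx], self.obs[(idx + 1) % self.memory_size])
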
Example #2
def train_network(config: AlphaZeroConfig, storage: SharedStorage,
                  replay_buffer: ReplayBuffer):
    network = Network()
    optimizer = tf.train.MomentumOptimizer(config.learning_rate_schedule,
                                           config.momentum)
    for i in range(config.training_steps):
        if i % config.checkpoint_interval == 0:
            storage.save_network(i, network)
        batch = replay_buffer.sample_batch()
        update_weights(optimizer, network, batch, config.weight_decay)
    storage.save_network(config.training_steps, network)
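
Example #2 calls an update_weights helper that is not shown on this page. The sketch below follows the loss described in the published AlphaZero pseudocode (value mean-squared error plus policy cross-entropy plus L2 weight decay); like the example itself it stays at pseudocode level, and network.inference() and network.get_weights() are assumed to exist with the shapes that pseudocode uses.

# Sketch only, at the same pseudocode level as Example #2; the `network`
# methods used here are assumptions borrowed from the AlphaZero pseudocode.
def update_weights(optimizer, network, batch, weight_decay):
    loss = 0
    for image, (target_value, target_policy) in batch:
        value, policy_logits = network.inference(image)
        loss += (
            tf.losses.mean_squared_error(value, target_value) +
            tf.nn.softmax_cross_entropy_with_logits(
                logits=policy_logits, labels=target_policy))

    # L2 regularization on all trainable weights
    for weights in network.get_weights():
        loss += weight_decay * tf.nn.l2_loss(weights)

    optimizer.minimize(loss)
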
Example #3
import random
import time

import numpy as np
import tensorflow as tf
import tqdm

# `BaseModel` and `ReplayBuffer` are project-local classes; their import
# paths are not shown on this page.


class DQN(BaseModel):
    def __init__(self, env, config):
        super(DQN, self).__init__("dqn", config)

        self.env = env
        self.batch_size = config.batch_size
        self.replay_buffer = ReplayBuffer(config.batch_size,
                                          config.memory_size,
                                          env.observation_space.shape)
        self.num_train = 0

        self.input_layer = tf.placeholder(tf.float32, (None, ) +
                                          self.env.observation_space.shape)

        with tf.variable_scope(self.name):
            self._build_network()

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        model_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                       self.name)

        self._saver = tf.train.Saver(model_vars)

    def _greedy_policy(self, obs):
        # epsilon-greedy: explore with probability `eps`, otherwise act greedily
        if random.random() < self.eps:
            action = self.env.action_space.sample()
        else:
            with self.sess.as_default():
                # a single 4-dim observation is reshaped into a batch of one
                action = self.q_action.eval(
                    {self.input_layer: obs.reshape((1, 4))})[0]

        return action

    def pick_action(self, obs, policy='greedy', train=True):
        # run eval network
        if train:
            if policy == "greedy":
                return self._greedy_policy(obs)
            else:
                return self.env.action_space.sample()
        else:
            with self.sess.as_default():
                action = self.q_action.eval(
                    {self.input_layer: obs.reshape((1, 4))})[0]
            return action

    def perceive(self, obs, action, reward, done):
        self.replay_buffer.put(obs, action, reward, done)

    def train(self, num_train):
        self._train(num_train)
        # linearly anneal the exploration rate, floored at `eps_decay`
        self.eps = max(self.eps_decay, self.eps - self.eps_decay)

    def _build_network(self):
        """Build the DQN/DDQN graph: an evaluation network and a target network
        under separate variable scopes, plus the target-update and optimization ops.
        """
        activation_func = tf.nn.relu

        # === Build Evaluation Network ===
        with tf.variable_scope("eval"):
            self.eval_scope_name = tf.get_variable_scope().name

            self.l1 = tf.layers.dense(self.input_layer,
                                      units=20,
                                      activation=activation_func,
                                      name="eval_l1")
            self.l2 = tf.layers.dense(self.l1,
                                      units=20,
                                      activation=activation_func,
                                      name="eval_l2")
            self.l3 = tf.layers.dense(self.l2,
                                      units=20,
                                      activation=activation_func,
                                      name="eval_l3")

            if self.dueling:
                # the dueling architecture is not implemented in this snippet
                pass
            else:
                # plain dense output layer: one Q-value per action
                # (a linear activation is more common for Q outputs than ReLU)
                self.e_q = tf.layers.dense(self.l3,
                                           units=self.env.action_space.n,
                                           activation=activation_func,
                                           use_bias=False,
                                           name="eval_q")

            # index of the maximal Q-value, i.e. the greedy action
            self.q_action = tf.argmax(self.e_q,
                                      axis=1,
                                      name="eval_action_select")

        # === Build Target Network ===
        with tf.variable_scope("target"):
            self.target_scope_name = tf.get_variable_scope().name

            self.t_l1 = tf.layers.dense(self.input_layer,
                                        units=20,
                                        activation=activation_func,
                                        name="target_l1")
            self.t_l2 = tf.layers.dense(self.t_l1,
                                        units=20,
                                        activation=activation_func,
                                        name="target_l2")
            self.t_l3 = tf.layers.dense(self.t_l2,
                                        units=20,
                                        activation=activation_func,
                                        name="target_l3")

            if self.dueling:
                # the dueling architecture is not implemented in this snippet
                pass
            else:
                # plain dense output layer: one Q-value per action
                self.t_q = tf.layers.dense(self.t_l3,
                                           units=self.env.action_space.n,
                                           activation=activation_func,
                                           use_bias=False,
                                           name="target_q")

            # For double DQN, the target network is evaluated at the action chosen
            # by the evaluation network, so Q selection accepts an index tensor
            # built from the evaluation network's argmax.
            self.target_q_idx_input = tf.placeholder(
                tf.int32, shape=(None, None), name="DDQN_max_action_index")
            self.target_q_action_with_idx = tf.gather_nd(
                self.t_q, self.target_q_idx_input)

        # === Define the process of network update ===
        with tf.variable_scope("update"):
            self.update_op = []

            eval_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            self.eval_scope_name)
            target_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                              self.target_scope_name)

            for i in range(len(target_params)):
                self.update_op.append(
                    tf.assign(target_params[i], eval_params[i]))

        # === Define the optimization ===
        with tf.variable_scope("optimization"):
            self.t_q_input = tf.placeholder(tf.float32,
                                            shape=(None, ),
                                            name="target_q_input")
            self.action_input = tf.placeholder(tf.int32,
                                               shape=(None, ),
                                               name="action_input")

            action_one_hot = tf.one_hot(self.action_input,
                                        self.env.action_space.n,
                                        on_value=1.0,
                                        off_value=0.0,
                                        name="action_one_hot")
            self.q_eval_with_act = tf.reduce_sum(self.e_q * action_one_hot,
                                                 axis=1,
                                                 name="q_eval_with_action")

            temp = tf.square(self.t_q_input - self.q_eval_with_act)
            self.loss = 0.5 * tf.reduce_mean(temp)

            # TODO: consider adding a variable learning rate

            self.train_op = tf.train.RMSPropOptimizer(
                self.learning_rate).minimize(self.loss)

    def _update(self):
        """Implement the network update
        """
        self.sess.run(self.update_op)

    def _train(self, num_train):
        """Run mini-batch updates over the whole replay buffer and
        periodically sync the target network."""

        print("\n[*] Begin #{0} training / EPS: {1:.3f} / MemorySize: {2} ...".
              format(num_train, self.eps, self.replay_buffer.size))
        time.sleep(0.5)

        loss = []
        target_q_value = []
        eval_q_value = []
        start_time = time.time()

        buffer_size = self.replay_buffer.size
        self.iteration = (buffer_size + self.batch_size - 1) // self.batch_size

        for i in tqdm.tqdm(range(self.iteration), ncols=60):
            # emulator for training
            info = self._mini_batch()
            loss.append(info["loss"])
            target_q_value.append(info["target_q"])
            eval_q_value.append(info["eval_q"])
            if (i + 1) % self.update_every == 0:
                self._update()

        end_time = time.time()
        time.sleep(0.01)

        # loss record
        mean_loss = sum(loss) / len(loss)
        max_q, min_q = max(target_q_value[-1]), min(target_q_value[-1])
        max_e, min_e = max(eval_q_value[-1]), min(eval_q_value[-1])

        self.loss_record.append(mean_loss)

        print(
            "\n[*] Time consumption: {0:.3f}s, Average loss: {1:.6f}, Max-q: {2:.6f}, Min-q: {3:.6f}, Max-e: {4:.6f}, Min-e: {5:.6f}"
            .format(end_time - start_time, mean_loss, max_q, min_q, max_e,
                    min_e))

    def _mini_batch(self):
        """Implement mini-batch training
        """

        info = dict(loss=0.0, time_consumption=0.0)  # result container

        # sample from replay-buffer
        data_batch = self.replay_buffer.sample()

        with self.sess.as_default():
            if self.use_double:
                pred_act_batch = self.q_action.eval({
                    self.input_layer:
                    data_batch.obs_next
                })  # get the action of next observation
                max_q_value = self.target_q_action_with_idx.eval({
                    self.input_layer:
                    data_batch.obs_next,
                    self.target_q_idx_input:
                    [[idx, act_idx]
                     for idx, act_idx in enumerate(pred_act_batch)]
                })
            else:
                t_q_value = self.t_q.eval(
                    {self.input_layer: data_batch.obs_next})
                max_q_value = np.max(t_q_value, axis=1)

        # Bellman target: reward for terminal transitions, otherwise
        # reward + max_a Q_target(s', a) scaled by `self.eps`, which plays
        # the role of the discount factor in this snippet
        target_q = np.where(data_batch.done, data_batch.reward,
                            data_batch.reward + max_q_value * self.eps)

        info["loss"], info["eval_q"], _ = self.sess.run(
            [self.loss, self.q_eval_with_act, self.train_op],
            {
                self.t_q_input: target_q,
                self.action_input: data_batch.action,
                self.input_layer: data_batch.obs
                # self.learning_rate_step: self.train_step
            })

        info["target_q"] = target_q

        return info
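
The class above targets a Gym-style environment with a flat 4-dimensional observation such as CartPole-v0 (note the hard-coded obs.reshape((1, 4))). A minimal driver loop could look like the sketch below; the episode schedule, the classic 4-tuple env.step() API and the config object (the same one passed to DQN) are assumptions for illustration.

# Sketch only: usage of the DQN class above; `config` is assumed to carry the
# attributes the class reads (batch_size, memory_size, eps, eps_decay, ...).
import gym

env = gym.make("CartPole-v0")
agent = DQN(env, config)

for episode in range(500):
    obs = env.reset()
    done = False
    while not done:
        action = agent.pick_action(obs, policy="greedy", train=True)
        obs_next, reward, done, _ = env.step(action)
        agent.perceive(obs, action, reward, done)  # store the transition
        obs = obs_next
    # run mini-batch updates once the buffer holds at least one batch
    if agent.replay_buffer.size >= agent.batch_size:
        agent.train(episode)
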
Example #4
def run_selfplay(config: AlphaZeroConfig, storage: SharedStorage,
                 replay_buffer: ReplayBuffer):
    while True:
        network = storage.latest_network()
        game = play_game(config, network)
        replay_buffer.save_game(game)
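
Examples #2, #4 and #5 share a ReplayBuffer that stores whole self-play games and samples training positions from them; its implementation is not shown on this page. The sketch below mirrors the window-style buffer from the published AlphaZero pseudocode, and the game.history, game.make_image() and game.make_target() attributes are assumptions about the objects returned by play_game.

# Sketch only: a window-style game buffer in the spirit of the AlphaZero
# pseudocode; the `game` attributes used here are assumptions.
import numpy as np


class ReplayBuffer(object):
    def __init__(self, config):
        self.window_size = config.window_size
        self.batch_size = config.batch_size
        self.buffer = []

    def save_game(self, game):
        # keep only the most recent `window_size` games
        if len(self.buffer) > self.window_size:
            self.buffer.pop(0)
        self.buffer.append(game)

    def sample_batch(self):
        # sample games proportionally to their length, then one position per game
        move_sum = float(sum(len(g.history) for g in self.buffer))
        games = np.random.choice(
            self.buffer,
            size=self.batch_size,
            p=[len(g.history) / move_sum for g in self.buffer])
        game_pos = [(g, np.random.randint(len(g.history))) for g in games]
        return [(g.make_image(i), g.make_target(i)) for g, i in game_pos]
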
Example #5
    config = AlphaZeroConfig()
    config.num_simulations = 400
    config.window_size = 512
    config.batch_size = 128
    config.num_sampling_moves = 40
    # A typical competitive Checkers game lasts for ~49 half-moves
    # Ref: https://boardgames.stackexchange.com/questions/34659/how-many-turns-does-an-average-game-of-checkers-draughts-go-for
    config.max_moves = 200

    # Log all hyperparameters
    print('Hyperparameters')
    for attr, val in vars(config).items():
        print(attr, val)

    storage = SharedStorage(make_uniform_network)
    buffer = ReplayBuffer(config)

    model = CheckersNetwork()
    model.cuda()
    # # HACK: Continue from adam-0-1/
    # model.load_state_dict(torch.load('logs/adam-0-1/model-1999-l52.9.pt'))
    # storage.save_network(0, model)
    # optimizer = optim.SGD(model.parameters(), lr=2e-2, momentum=config.momentum, weight_decay=config.weight_decay)
    optimizer = optim.Adam(model.parameters(),
                           lr=1e-4,
                           weight_decay=config.weight_decay)
    val_loss = nn.MSELoss(reduction='sum')

    for step in range(2000):
        # Generate some games
        for i in range(1):