def train_network(config: AlphaZeroConfig, storage: SharedStorage,
                  replay_buffer: ReplayBuffer):
    network = Network()
    optimizer = tf.train.MomentumOptimizer(config.learning_rate_schedule,
                                           config.momentum)
    for i in range(config.training_steps):
        if i % config.checkpoint_interval == 0:
            storage.save_network(i, network)
        batch = replay_buffer.sample_batch()
        update_weights(optimizer, network, batch, config.weight_decay)
    storage.save_network(config.training_steps, network)
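
# `update_weights` is called above but not defined in this excerpt. Below is a
# minimal sketch in the style of the published AlphaZero pseudocode: the loss
# is a value-head mean-squared error plus a policy-head cross-entropy, with L2
# weight decay on all weights. `Network.inference` and `network.get_weights`
# are assumptions carried over from that pseudocode, not defined here.
def update_weights(optimizer: tf.train.Optimizer, network: Network, batch,
                   weight_decay: float):
    loss = 0
    for image, (target_value, target_policy) in batch:
        # Forward pass: predicted value and policy logits for this position.
        value, policy_logits = network.inference(image)
        # Value head: squared error against the game outcome.
        # Policy head: cross-entropy against the search visit distribution.
        loss += (
            tf.losses.mean_squared_error(value, target_value) +
            tf.nn.softmax_cross_entropy_with_logits(
                logits=policy_logits, labels=target_policy))
    # L2 regularisation on all network weights.
    for weights in network.get_weights():
        loss += weight_decay * tf.nn.l2_loss(weights)
    optimizer.minimize(loss)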
class DQN(BaseModel):
    def __init__(self, env, config):
        super(DQN, self).__init__("dqn", config)

        self.env = env
        self.batch_size = config.batch_size
        self.replay_buffer = ReplayBuffer(config.batch_size, config.memory_size,
                                          env.observation_space.shape)
        self.num_train = 0

        self.input_layer = tf.placeholder(
            tf.float32, (None,) + self.env.observation_space.shape)

        with tf.variable_scope(self.name):
            self._build_network()

        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        model_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.name)
        self._saver = tf.train.Saver(model_vars)

    def _greedy_policy(self, obs):
        # Epsilon-greedy: explore with probability `eps`, otherwise pick the
        # action with the highest estimated Q-value.
        # `obs` is assumed to be a flat 4-dimensional vector (e.g. CartPole).
        if random.random() < self.eps:
            action = self.env.action_space.sample()
        else:
            with self.sess.as_default():
                action = self.q_action.eval(
                    {self.input_layer: obs.reshape((1, 4))})[0]
        return action

    def pick_action(self, obs, policy='greedy', train=True):
        # Run the evaluation network.
        if train:
            if policy == "greedy":
                return self._greedy_policy(obs)
            else:
                return self.env.action_space.sample()
        else:
            with self.sess.as_default():
                action = self.q_action.eval(
                    {self.input_layer: obs.reshape((1, 4))})[0]
            return action

    def perceive(self, obs, action, reward, done):
        self.replay_buffer.put(obs, action, reward, done)

    def train(self, num_train):
        self._train(num_train)
        self.eps = max(self.eps_decay, self.eps - self.eps_decay)

    def _build_network(self):
        """Build the DQN / DDQN graph: an evaluation network, a target network,
        the target-update ops, and the optimization ops."""
        activation_func = tf.nn.relu

        # === Build Evaluation Network ===
        with tf.variable_scope("eval"):
            self.eval_scope_name = tf.get_variable_scope().name
            self.l1 = tf.layers.dense(self.input_layer, units=20,
                                      activation=activation_func, name="eval_l1")
            self.l2 = tf.layers.dense(self.l1, units=20,
                                      activation=activation_func, name="eval_l2")
            self.l3 = tf.layers.dense(self.l2, units=20,
                                      activation=activation_func, name="eval_l3")

            if self.dueling:
                pass  # dueling architecture not implemented yet
            else:
                # dense output layer over actions
                self.e_q = tf.layers.dense(self.l3, units=self.env.action_space.n,
                                           activation=activation_func,
                                           use_bias=False, name="eval_q")

            # index of the best Q-value, which maps directly to the action index
            self.q_action = tf.argmax(self.e_q, axis=1, name="eval_action_select")

        # === Build Target Network ===
        with tf.variable_scope("target"):
            self.target_scope_name = tf.get_variable_scope().name
            self.t_l1 = tf.layers.dense(self.input_layer, units=20,
                                        activation=activation_func, name="target_l1")
            self.t_l2 = tf.layers.dense(self.t_l1, units=20,
                                        activation=activation_func, name="target_l2")
            self.t_l3 = tf.layers.dense(self.t_l2, units=20,
                                        activation=activation_func, name="target_l3")

            if self.dueling:
                pass  # dueling architecture not implemented yet
            else:
                # dense output layer over actions
                self.t_q = tf.layers.dense(self.t_l3, units=self.env.action_space.n,
                                           activation=activation_func,
                                           use_bias=False, name="target_q")

            # When training with double DQN, the target network evaluates the
            # action chosen by the evaluation network, so the Q selection accepts
            # an `index` tensor that depends on the evaluation network's choice.
            self.target_q_idx_input = tf.placeholder(
                tf.int32, shape=(None, None), name="DDQN_max_action_index")
            self.target_q_action_with_idx = tf.gather_nd(
                self.t_q, self.target_q_idx_input)

        # === Define the target-network update ===
        with tf.variable_scope("update"):
            self.update_op = []
            eval_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                            self.eval_scope_name)
            target_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                              self.target_scope_name)

            for i in range(len(target_params)):
                self.update_op.append(
                    tf.assign(target_params[i], eval_params[i]))

        # === Define the optimization ===
        with tf.variable_scope("optimization"):
            self.t_q_input = tf.placeholder(tf.float32, shape=(None,),
                                            name="target_q_input")
            self.action_input = tf.placeholder(tf.int32, shape=(None,),
                                               name="action_input")

            action_one_hot = tf.one_hot(self.action_input, self.env.action_space.n,
                                        on_value=1.0, off_value=0.0,
                                        name="action_one_hot")
            self.q_eval_with_act = tf.reduce_sum(self.e_q * action_one_hot, axis=1,
                                                 name="q_eval_with_action")

            temp = tf.square(self.t_q_input - self.q_eval_with_act)
            self.loss = 0.5 * tf.reduce_mean(temp)

            # TODO: consider adding a learning-rate schedule
            self.train_op = tf.train.RMSPropOptimizer(
                self.learning_rate).minimize(self.loss)

    def _update(self):
        """Copy the evaluation-network weights to the target network."""
        self.sess.run(self.update_op)

    def _train(self, num_train):
        """Run one training pass over the replay buffer in mini-batches."""
        print("\n[*] Begin #{0} training / EPS: {1:.3f} / MemorySize: {2} ...".
              format(num_train, self.eps, self.replay_buffer.size))
        time.sleep(0.5)

        loss = []
        target_q_value = []
        eval_q_value = []

        start_time = time.time()
        buffer_size = self.replay_buffer.size
        self.iteration = (buffer_size + self.batch_size - 1) // self.batch_size

        for i in tqdm.tqdm(range(self.iteration), ncols=60):
            info = self._mini_batch()

            loss.append(info["loss"])
            target_q_value.append(info["target_q"])
            eval_q_value.append(info["eval_q"])

            # Periodically sync the target network with the evaluation network.
            if (i + 1) % self.update_every == 0:
                self._update()

        end_time = time.time()
        time.sleep(0.01)

        # loss record
        mean_loss = sum(loss) / len(loss)
        max_q, min_q = max(target_q_value[-1]), min(target_q_value[-1])
        max_e, min_e = max(eval_q_value[-1]), min(eval_q_value[-1])

        self.loss_record.append(mean_loss)

        print(
            "\n[*] Time consumption: {0:.3f}s, Average loss: {1:.6f}, Max-q: {2:.6f}, Min-q: {3:.6f}, Max-e: {4:.6f}, Min-e: {5:.6f}"
            .format(end_time - start_time, mean_loss, max_q, min_q, max_e, min_e))

    def _mini_batch(self):
        """Run a single mini-batch update."""
        info = dict(loss=0.0, time_consumption=0.0)  # per-batch statistics

        # sample from the replay buffer
        data_batch = self.replay_buffer.sample()

        with self.sess.as_default():
            if self.use_double:
                # Double DQN: select the next action with the evaluation network,
                # then evaluate that action with the target network.
                pred_act_batch = self.q_action.eval({
                    self.input_layer: data_batch.obs_next
                })
                max_q_value = self.target_q_action_with_idx.eval({
                    self.input_layer: data_batch.obs_next,
                    self.target_q_idx_input: [[idx, act_idx] for idx, act_idx in
                                              enumerate(pred_act_batch)]
                })
            else:
                t_q_value = self.t_q.eval(
                    {self.input_layer: data_batch.obs_next})
                max_q_value = np.max(t_q_value, axis=1)

        # Bellman target: reward only for terminal transitions, otherwise
        # reward plus the scaled bootstrapped value.
        # target_q = (1. - data_batch.done) * max_q_value * self.eps + data_batch.reward
        target_q = np.where(data_batch.done, data_batch.reward,
                            data_batch.reward + max_q_value * self.eps)

        info["loss"], info["eval_q"], _ = self.sess.run(
            [self.loss, self.q_eval_with_act, self.train_op], {
                self.t_q_input: target_q,
                self.action_input: data_batch.action,
                self.input_layer: data_batch.obs
                # self.learning_rate_step: self.train_step
            })
        info["target_q"] = target_q
        return info
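
# A minimal driver loop for the class above, purely as a sketch. It assumes a
# classic Gym step API with a flat 4-dimensional observation (e.g. CartPole-v1)
# and that BaseModel copies the config fields used above (eps, eps_decay,
# update_every, learning_rate, dueling, use_double, ...) onto the instance;
# the SimpleNamespace config here is only illustrative.
from types import SimpleNamespace
import gym

env = gym.make("CartPole-v1")
config = SimpleNamespace(batch_size=32, memory_size=10000, eps=1.0,
                         eps_decay=0.01, update_every=50, learning_rate=1e-3,
                         dueling=False, use_double=True)
agent = DQN(env, config)

for episode in range(200):
    obs = env.reset()
    done = False
    while not done:
        action = agent.pick_action(obs)
        next_obs, reward, done, _ = env.step(action)
        # Store the transition; the buffer pairs consecutive observations.
        agent.perceive(obs, action, reward, done)
        obs = next_obs
    # Train once the buffer holds at least one batch of transitions.
    if agent.replay_buffer.size >= config.batch_size:
        agent.train(episode)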
def run_selfplay(config: AlphaZeroConfig, storage: SharedStorage,
                 replay_buffer: ReplayBuffer):
    while True:
        network = storage.latest_network()
        game = play_game(config, network)
        replay_buffer.save_game(game)
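
# The two jobs above are typically tied together by a top-level entry point in
# the style of the published AlphaZero pseudocode: several self-play actors
# fill the replay buffer while a single trainer consumes it. This is only a
# sketch; `launch_job`, `config.num_actors`, and the no-argument SharedStorage
# constructor are assumptions, not defined in this document (the Checkers
# script below constructs SharedStorage differently).
def alphazero(config: AlphaZeroConfig):
    storage = SharedStorage()
    replay_buffer = ReplayBuffer(config)

    # Self-play actors run forever, always using the freshest network.
    for _ in range(config.num_actors):
        launch_job(run_selfplay, config, storage, replay_buffer)

    # The trainer samples from the shared replay buffer and checkpoints
    # networks back into shared storage for the actors to pick up.
    train_network(config, storage, replay_buffer)

    return storage.latest_network()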
config = AlphaZeroConfig()
config.num_simulations = 400
config.window_size = 512
config.batch_size = 128
config.num_sampling_moves = 40
# A typical competitive Checkers game lasts for ~49 half-moves
# Ref: https://boardgames.stackexchange.com/questions/34659/how-many-turns-does-an-average-game-of-checkers-draughts-go-for
config.max_moves = 200

# Log all hyperparameters
print('Hyperparameters')
for attr, val in vars(config).items():
    print(attr, val)

storage = SharedStorage(make_uniform_network)
buffer = ReplayBuffer(config)

model = CheckersNetwork()
model.cuda()
# # HACK: Continue from adam-0-1/
# model.load_state_dict(torch.load('logs/adam-0-1/model-1999-l52.9.pt'))
# storage.save_network(0, model)

# optimizer = optim.SGD(model.parameters(), lr=2e-2, momentum=config.momentum, weight_decay=config.weight_decay)
optimizer = optim.Adam(model.parameters(), lr=1e-4, weight_decay=config.weight_decay)
val_loss = nn.MSELoss(reduction='sum')

for step in range(2000):
    # Generate some games
    for i in range(1):
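        # NOTE: the original loop body is truncated at this point. The two
        # lines below are only an illustrative sketch of how it might
        # continue, reusing the pieces defined above (latest_network,
        # play_game, save_game); they are an assumption, not the author's code.
        game = play_game(config, storage.latest_network())
        buffer.save_game(game)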