env.init()

for epoch in range(1, num_epochs + 1):
    steps, num_episodes = 0, 0
    losses, rewards = [], []
    env.display_screen = False

    # training loop
    while steps < num_steps_train:
        episode_reward = 0.0
        agent.start_episode()

        while env.game_over() == False and steps < num_steps_train:
            state = env.getGameState()
            reward, action = agent.act(state, epsilon=epsilon)
            memory.add([state, action, reward, env.game_over()])

            if steps % update_frequency == 0:
                loss = memory.train_agent_batch(agent)

                if loss is not None:
                    losses.append(loss)
                    # anneal the exploration rate, but never below epsilon_min
                    epsilon = max(epsilon_min, epsilon - epsilon_rate)

            episode_reward += reward
            steps += 1

        num_episodes += 1
        if num_episodes % 5 == 0:
            print "Episode {:01d}: Reward {:0.1f}".format(num_episodes, episode_reward)

        rewards.append(episode_reward)
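The loop above targets a PLE-style environment (env.init(), env.getGameState(), env.game_over()) and an agent whose act() both executes the chosen action and returns the resulting reward along with the action. As a point of reference, here is a minimal sketch of such an agent interface. The names Agent and q_model are hypothetical, and the sketch assumes the game state is exposed as a flat numeric vector; only env.act() returning the step reward and env.getActionSet() follow the PLE API.

import numpy as np

class Agent(object):
    """Hypothetical epsilon-greedy agent matching the act(state, epsilon) call in the loop."""

    def __init__(self, env, q_model):
        self.env = env                      # PLE-style environment
        self.actions = env.getActionSet()   # list of available actions
        self.q_model = q_model              # anything with predict(features) -> Q-values

    def start_episode(self):
        pass  # reset any per-episode bookkeeping

    def act(self, state, epsilon=0.1):
        # explore with probability epsilon, otherwise act greedily on predicted Q-values
        if np.random.random() < epsilon:
            action = self.actions[np.random.randint(len(self.actions))]
        else:
            features = np.asarray(state, dtype=np.float32).reshape(1, -1)
            action = self.actions[int(np.argmax(self.q_model.predict(features)))]
        reward = self.env.act(action)  # PLE executes the action and returns its reward
        return reward, action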
class DqnPolicy(BaseTFModel):
    def __init__(self, env, training,
                 name=None,
                 model_path=None,
                 gamma=0.99,
                 lr=0.001,
                 lr_decay=1.0,
                 epsilon=1.0,
                 epsilon_final=0.02,
                 batch_size=32,
                 memory_capacity=100000,
                 model_params={},
                 layer_sizes=[32, 32],
                 target_update_type='hard',
                 target_update_params={},
                 double_q=True,
                 dueling=True,
                 **kwargs):
        if name is None:
            self.name = self.__class__.__name__
        else:
            self.name = name

        if model_path is None:
            self.model_path = os.path.join('model', self.name)
        else:
            self.model_path = model_path

        self.env = env
        self.training = training
        self.gamma = gamma
        self.lr = lr
        self.lr_decay = lr_decay
        self.epsilon = epsilon
        self.epsilon_final = epsilon_final
        self.batch_size = batch_size
        self.memory_capacity = memory_capacity
        self.model_params = model_params
        self.layer_sizes = layer_sizes
        self.double_q = double_q
        self.dueling = dueling

        self.target_update_type = target_update_type
        self.target_update_every_step = target_update_params.get('every_step', 100)
        self.target_update_tau = target_update_params.get('tau', 0.05)

        self.memory = ReplayMemory(capacity=memory_capacity)

        self.action_size = self.env.action_space.n
        self.state_size = np.prod(list(self.env.observation_space.shape))
        print 'action_size: {a}, state_size: {s}'.format(a=self.action_size, s=self.state_size)

        if self.training:
            # clear existing model files
            if os.path.exists(self.model_path):
                print 'deleting existing model files at {}'.format(self.model_path)
                if os.path.isdir(self.model_path):
                    shutil.rmtree(self.model_path)
                else:
                    os.remove(self.model_path)

        BaseTFModel.__init__(self, self.name, self.model_path, saver_max_to_keep=5)

        print 'building graph ...'
        with self.graph.as_default():
            self.__build_graph()

    def act(self, state, epsilon=0.1):
        """
        :param state: 1d np.ndarray
        :param epsilon: exploration rate, used only in training mode
        :return: int
        """
        assert isinstance(state, np.ndarray) and state.ndim == 1

        if self.training and np.random.random() < epsilon:
            return self.env.action_space.sample()

        with self.sess.as_default():
            return self.actions_selected_by_q.eval({self.states: state.reshape((1, -1))})[0]

    def train(self, n_episodes=500, annealing_episodes=450, every_episode=10, **kwargs):
        if self.training is False:
            raise Exception('prohibited to call train() for a non-training model')

        reward_history = [0.0]
        reward_averaged = []

        lr = self.lr
        eps = self.epsilon
        annealing_episodes = annealing_episodes or n_episodes
        eps_drop = (self.epsilon - self.epsilon_final) / annealing_episodes
        print "eps_drop: {}".format(eps_drop)
        step = 0

        # calling the property method of BaseTFModel to start a session
        self.sess.run(self.init_vars)
        self.__init_target_q_net()

        for n_episode in range(n_episodes):
            ob = self.env.reset()
            done = False
            traj = []
            reward = 0.

            while not done:
                a = self.act(ob, eps)
                assert a >= 0
                new_ob, r, done, _ = self.env.step(a)
                step += 1
                reward += r

                traj.append(Transition(ob, a, r, new_ob, done))
                ob = new_ob

                # Not enough samples in the buffer yet.
                if self.memory.size < self.batch_size:
                    continue

                # Train with a mini-batch of samples.
                batch_data = self.memory.sample(self.batch_size)
                feed_dict = {
                    self.learning_rate: lr,
                    self.states: batch_data['s'],
                    self.actions: batch_data['a'],
                    self.rewards: batch_data['r'],
                    self.states_next: batch_data['s_next'],
                    self.done_flags: batch_data['done']
                }

                if self.double_q:
                    actions_next = self.sess.run(self.actions_selected_by_q,
                                                 {self.states: batch_data['s_next']})
                    feed_dict.update({self.actions_next: actions_next})

                _, q_val, q_target_val, loss, summ_str = self.sess.run(
                    [self.optimizer, self.q, self.q_target, self.loss, self.merged_summary],
                    feed_dict=feed_dict)
                self.writer.add_summary(summ_str, step)

                # update the target q net if necessary
                self.__update_target_q_net(step)

            self.memory.add(traj)
            reward_history.append(reward)
            reward_averaged.append(np.mean(reward_history[-10:]))

            # Anneal the learning rate and the exploration rate after every episode.
            lr *= self.lr_decay
            if eps > self.epsilon_final:
                eps -= eps_drop

            if reward_history and every_episode and n_episode % every_episode == 0:
                print "[episodes: {}/step: {}], best: {}, avg: {:.2f}:{}, lr: {:.4f}, eps: {:.4f}".format(
                    n_episode, step, np.max(reward_history),
                    np.mean(reward_history[-10:]), reward_history[-5:], lr, eps)

        self.save_model(step=step)

        print "[training completed] episodes: {}, Max reward: {}, Average reward: {}".format(
            len(reward_history), np.max(reward_history), np.mean(reward_history))

        fig_path = os.path.join(self.model_path, 'figs')
        makedirs(fig_path)
        fig_file = os.path.join(fig_path, '{n}-{t}.png'.format(n=self.name, t=int(time.time())))
        plot_learning_curve(fig_file,
                            {'reward': reward_history, 'reward_avg': reward_averaged},
                            xlabel='episode')

    def evaluate(self, n_episodes):
        if self.training:
            raise Exception('prohibited to call evaluate() for a training model')

        reward_history = []
        for episode in xrange(n_episodes):
            state = self.env.reset()
            reward_episode = 0.

            while True:
                action = self.act(state)
                new_state, reward, done, _ = self.env.step(action)
                reward_episode += reward
                state = new_state
                if done:
                    break

            reward_history.append(reward_episode)

        return reward_history

    def __build_graph(self):
        self.__create_q_networks()

        # q is the Q(s, a) of the behavior policy
        self.actions_selected_by_q = tf.argmax(self.q, axis=-1, name='action_selected')
        action_one_hot = tf.one_hot(self.actions, self.action_size,
                                    dtype=tf.float32, name='action_one_hot')
        pred = tf.reduce_sum(self.q * action_one_hot, axis=-1, name='pred')

        # q_target is the Q(s, a) of the target policy, which is what we are learning.
        if self.double_q:
            action_next_one_hot = tf.one_hot(self.actions_next, self.action_size,
                                             dtype=tf.float32, name='action_next_one_hot')
            max_q_next_target = tf.reduce_sum(self.q_target * action_next_one_hot,
                                              axis=-1, name='max_q_next_target')
        else:
            max_q_next_target = tf.reduce_max(self.q_target, axis=-1)

        y = self.rewards + (1. - self.done_flags) * self.gamma * max_q_next_target

        self.loss = tf.reduce_mean(tf.square(pred - tf.stop_gradient(y)), name="loss_mse_train")
        self.optimizer = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss, name="adam")
        self.init_vars = tf.global_variables_initializer()

        with tf.variable_scope('summary'):
            q_summ = []
            avg_q = tf.reduce_mean(self.q, 0)
            for idx in range(self.action_size):
                q_summ.append(tf.summary.histogram('q/%s' % idx, avg_q[idx]))
            self.q_summ = tf.summary.merge(q_summ, 'q_summary')

            self.q_y_summ = tf.summary.histogram("batch/y", y)
            self.q_pred_summ = tf.summary.histogram("batch/pred", pred)
            self.loss_summ = tf.summary.scalar("loss", self.loss)

            self.merged_summary = tf.summary.merge_all(key=tf.GraphKeys.SUMMARIES)

    def __create_q_networks(self):
        # mini-batch placeholders
        self.states = tf.placeholder(tf.float32, shape=(None, self.state_size), name='state')
        self.states_next = tf.placeholder(tf.float32, shape=(None, self.state_size), name='state_next')
        self.actions = tf.placeholder(tf.int32, shape=(None,), name='action')
        # actions_next is not the actual actions in the next step;
        # it is used to predict the action value in the Bellman equation.
        self.actions_next = tf.placeholder(tf.int32, shape=(None,), name='action_next')
        self.rewards = tf.placeholder(tf.float32, shape=(None,), name='reward')
        self.done_flags = tf.placeholder(tf.float32, shape=(None,), name='done')
        self.learning_rate = tf.placeholder(tf.float32, shape=None, name='learning_rate')

        if self.dueling:
            with tf.variable_scope('Q_primary'):
                self.q_hidden = dense_nn(self.states, self.layer_sizes[:-1],
                                         name='q_hidden', training=self.training)
                # advantage function A(s, a)
                self.adv = dense_nn(self.q_hidden, [self.layer_sizes[-1], self.action_size],
                                    name='adv', training=self.training)
                # state value function V(s)
                self.v = dense_nn(self.q_hidden, [self.layer_sizes[-1], 1],
                                  name='v', training=self.training)
                self.q = self.v + (self.adv - tf.reduce_mean(
                    self.adv, reduction_indices=1, keep_dims=True))

            with tf.variable_scope('Q_target'):
                self.q_target_hidden = dense_nn(self.states_next, self.layer_sizes[:-1],
                                                name='q_hidden', training=self.training)
                self.adv_target = dense_nn(self.q_target_hidden,
                                           [self.layer_sizes[-1], self.action_size],
                                           name='adv', training=self.training)
                self.v_target = dense_nn(self.q_target_hidden, [self.layer_sizes[-1], 1],
                                         name='v', training=self.training)
                self.q_target = self.v_target + (self.adv_target - tf.reduce_mean(
                    self.adv_target, reduction_indices=1, keep_dims=True))
        else:
            self.q = dense_nn(self.states, self.layer_sizes + [self.action_size],
                              name='Q_primary', training=self.training)
            self.q_target = dense_nn(self.states_next, self.layer_sizes + [self.action_size],
                                     name='Q_target', training=self.training)

        self.q_vars = self.scope_vars('Q_primary')
        self.q_target_vars = self.scope_vars('Q_target')
        assert len(self.q_vars) == len(self.q_target_vars), "Two Q-networks are not same in structure."

    def __init_target_q_net(self):
        self.__update_target_q_net_hard()

    def __update_target_q_net_hard(self):
        self.sess.run([v_t.assign(v) for v_t, v in zip(self.q_target_vars, self.q_vars)])

    def __update_target_q_net_soft(self, tau=0.05):
        self.sess.run([v_t.assign(v_t * (1. - tau) + v * tau)
                       for v_t, v in zip(self.q_target_vars, self.q_vars)])

    def __update_target_q_net(self, step):
        if self.target_update_type == 'hard':
            if step % self.target_update_every_step == 0:
                self.__update_target_q_net_hard()
        else:
            self.__update_target_q_net_soft(self.target_update_tau)
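For orientation, here is a usage sketch for the class above. The Gym environment name and the episode counts are illustrative assumptions, and the sketch assumes BaseTFModel builds the graph, session, writer and saver that train() and evaluate() rely on, and that it restores the latest checkpoint when a non-training instance is created.

import gym

# training run: constructing with training=True wipes old model files, then trains
env = gym.make('CartPole-v0')
policy = DqnPolicy(env, training=True,
                   gamma=0.99, lr=0.001,
                   layer_sizes=[32, 32],
                   double_q=True, dueling=True,
                   target_update_type='hard',
                   target_update_params={'every_step': 100})
policy.train(n_episodes=500, annealing_episodes=450, every_episode=10)

# evaluation run: a separate non-training instance, assumed to load the saved checkpoint
eval_policy = DqnPolicy(env, training=False)
rewards = eval_policy.evaluate(n_episodes=10)
print(sum(rewards) / len(rewards))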
class Execute:
    def __init__(self, path):
        self.config = Configuration.construct(path)
        self.env = Environment(self.config)
        self.memory = ReplayMemory(self.config)
        self.model = Model(self.config)
        self.ep = None

    def get_epsilon(self, is_play):
        if is_play:
            return self.config.play.ep

        ep_start = self.config.train.ep.start
        ep_final = self.config.train.ep.final
        ep_num_frames = self.config.train.ep.num_frames
        decay = (ep_start - ep_final) / ep_num_frames
        if self.ep is None:
            self.ep = ep_start
        self.ep = max(self.ep - decay, ep_final)
        return self.ep

    def log(self, **kwargs):
        log = ""
        for name, value in kwargs.items():
            log += f"{name}: {value}, "
        print(log)

    def run_episode(self, episode=1, steps=0, is_play=True, debug=False):
        config = self.config

        self.env.reset()
        action = 1
        _, _, curr_state, is_done = self.env.step(action)
        total_reward = 0

        update_net = 0
        C = config.train.network_update_freq
        t = 0
        T = config.max_episode_length

        while not is_done and t < T:
            if t % config.action_repeat == 0:
                ep = self.get_epsilon(is_play)
                action = self.model.choose_action(curr_state, ep)

            prev_state, reward, curr_state, is_done = self.env.step(action)
            total_reward += reward
            t += 1

            if is_play:
                self.env.render("human")
                if debug and t % config.play.debug.time == 0:
                    self.log(ftype=self.env.get_frame_type(), action=action, reward=total_reward)
                continue

            self.memory.add((prev_state, action, reward, curr_state, is_done))

            if self.memory.get_size() > config.train.replay_start_size:
                for i in range(config.train.batch_run):
                    batch = self.memory.sample()
                    self.model.optimize(batch)
                    steps = (steps + 1) % C
                    if steps % C == 0:
                        self.model.update_qhat()
                        update_net += 1

        if not is_play and debug and episode % config.train.debug.time == 0:
            self.log(ftype=self.env.get_frame_type(), total_reward=total_reward,
                     network_update_steps=update_net, episode_time=t, ep=ep)

        return total_reward, steps

    def load_model(self):
        ftype = self.env.get_frame_type()
        in_size = self.env.get_in_size()
        num_actions = self.env.get_num_actions()
        self.model.load_model(ftype, in_size, num_actions)

    def play(self, debug=False):
        self.load_model()
        for ep in range(1):
            self.run_episode(is_play=True, debug=debug)

    def train(self, debug=False):
        self.load_model()
        optimize_steps = 0
        episodes = self.config.train.episodes
        for episode in range(1, episodes + 1):
            reward, steps = self.run_episode(episode=episode, steps=optimize_steps,
                                             is_play=False, debug=debug)
            optimize_steps += steps
            if episode % self.config.train.save_model_episode == 0:
                self.model.save_model()

        self.model.update_qhat()
        self.model.save_model()

    def close(self):
        self.env.close()
        self.memory.close()
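A minimal driver for the wrapper above might look as follows. The config path is a placeholder, and the keys it must contain are exactly the ones the class reads (train.episodes, train.ep.*, play.ep, action_repeat, max_episode_length, and so on); Configuration, Environment, ReplayMemory and Model are the project's own components.

# assumed entry point; 'config/game.yaml' is a placeholder path
executor = Execute('config/game.yaml')
try:
    executor.train(debug=True)    # run config.train.episodes training episodes
    # executor.play(debug=True)   # or render a single episode with the saved model
finally:
    executor.close()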
def train(self, config: TrainConfig):
    # experience replay memory
    replay_mem = ReplayMemory(config.memmory_capacity)

    # reward history
    reward = 0
    reward_history = []
    reward_avg = []

    # learning rate and exploration schedule
    alpha = config.lrn_rate
    eps = config.epsilon
    eps_delta = (config.epsilon - config.epsilon_final) / config.warmup_episodes

    step = 0
    for epi in range(config.total_episodes):
        obs = self.env.reset()
        done = False
        traj = []
        reward = 0

        while not done:
            # choose an action with epsilon-greedy exploration
            action = self.act(obs, eps)
            obs_next, r, done, info = self.env.step(action)
            reward += r
            step += 1

            # record the trajectory
            traj.append(Transition(obs.flatten(), action, r, obs_next.flatten(), done))
            obs = obs_next

            if replay_mem.size < self.batch_size:
                continue

            # update the Q network with a mini-batch of replay samples
            batch_data = replay_mem.sample(self.batch_size)
            feed_dict = {
                self.learning_rate: alpha,
                self.states: batch_data['s'],
                self.actions: batch_data['a'],
                self.rewards: batch_data['r'],
                self.next_states: batch_data['s_next'],
                self.dones: batch_data['done'],
                self.epi_reward: reward_history[-1]
            }
            _, q, q_target, loss, summary = self.session.run(
                [self.optimizer, self.Q, self.Q_target, self.loss, self.merged_summary],
                feed_dict)

            # hard-update the target Q network every N steps
            if step % config.target_update_every_steps == 0:
                self._update_target_q_net()

            self.writer.add_summary(summary)

        replay_mem.add(traj)

        # one episode done
        reward_history.append(reward)
        reward_avg.append(np.mean(reward_history[-10:]))

        # anneal the learning rate and exploration rate
        alpha *= config.lrn_rate_decay
        if eps > config.epsilon_final:
            eps -= eps_delta

        # report progress
        if reward_history and config.log_every_episodes and epi % config.log_every_episodes == 0:
            print("[episodes:{}/step:{}], best:{}, avg:{:.2f}:{}, lrn_rate:{:.4f}, eps:{:.4f}".format(
                epi, step, np.max(reward_history), np.mean(reward_history[-10:]),
                reward_history[-5:], alpha, eps))

    self.save_checkpoint(step=step)
    print("[FINAL] episodes: {}, Max reward: {}, Average reward: {}".format(
        len(reward_history), np.max(reward_history), np.mean(reward_history)))

    return {'rwd': reward_history, 'rwd_avg': reward_avg}
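train() only reads a handful of fields from its TrainConfig argument. A minimal sketch of such a config object is shown below; the field names are kept exactly as the code spells them (including memmory_capacity), while the default values are illustrative assumptions, not values from the source.

class TrainConfig(object):
    """Bag of hyperparameters read by train(); defaults are illustrative only."""
    memmory_capacity = 100000          # replay buffer size (spelled as in the code above)
    lrn_rate = 0.001                   # initial learning rate
    lrn_rate_decay = 1.0               # per-episode multiplicative decay
    epsilon = 1.0                      # initial exploration rate
    epsilon_final = 0.02               # final exploration rate
    warmup_episodes = 450              # episodes over which epsilon is annealed
    total_episodes = 500               # number of training episodes
    target_update_every_steps = 100    # hard-update period for the target network
    log_every_episodes = 10            # progress report frequency

# e.g. history = agent.train(TrainConfig())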
class ActorCriticPolicy(BaseTFModel):
    def __init__(self, env, training,
                 name=None,
                 model_path=None,
                 gamma=0.9,
                 lr_a=0.01,
                 lr_a_decay=0.999,
                 lr_c=0.01,
                 lr_c_decay=0.999,
                 epsilon=1.0,
                 epsilon_final=0.05,
                 batch_size=16,
                 layer_sizes=[32],
                 grad_clip_norm=None,
                 act='bayesian',
                 seed=None,
                 **kwargs):
        """
        :param env:
        :param training:
        :param name:
        :param model_path:
        :param gamma:
        :param lr_a:
        :param lr_a_decay:
        :param lr_c:
        :param lr_c_decay:
        :param epsilon:
        :param epsilon_final:
        :param batch_size:
        :param layer_sizes:
        :param grad_clip_norm:
        :param act: 'bayesian' or 'epsilon'
        :param seed:
        """
        if name is None:
            self.name = self.__class__.__name__
        else:
            self.name = name

        if model_path is None:
            self.model_path = os.path.join('model', self.name)
        else:
            self.model_path = model_path

        self.env = env
        self.training = training
        self.gamma = gamma
        self.lr_a = lr_a
        self.lr_a_decay = lr_a_decay
        self.lr_c = lr_c
        self.lr_c_decay = lr_c_decay
        self.epsilon = epsilon
        self.epsilon_final = epsilon_final
        self.batch_size = batch_size
        self.layer_sizes = layer_sizes
        self.grad_clip_norm = grad_clip_norm
        self.seed = seed

        self.memory = ReplayMemory(tuple_class=Record)

        self.action_size = self.env.action_space.n
        self.state_size = np.prod(list(self.env.observation_space.shape))
        print 'action_size: {a}, state_size: {s}'.format(a=self.action_size, s=self.state_size)

        if self.training:
            # clear existing model files
            if os.path.exists(self.model_path):
                print 'deleting existing model files at {}'.format(self.model_path)
                if os.path.isdir(self.model_path):
                    shutil.rmtree(self.model_path)
                else:
                    os.remove(self.model_path)

        BaseTFModel.__init__(self, self.name, self.model_path, saver_max_to_keep=5)

        print 'building graph ...'
        with self.graph.as_default():
            if self.seed is not None:
                np.random.seed(self.seed)
                tf.set_random_seed(int(self.seed / 3))
            self.__build_graph()

        if act == 'bayesian':
            self.act = self.act_bayesian
        elif act == 'epsilon':
            self.act = self.act_epsilon
        else:
            raise Exception('not supported act {}'.format(act))

    def act_epsilon(self, state, **kwargs):
        """
        Epsilon-greedy exploration; not very effective for large action spaces.
        :param state: 1d np.ndarray
        :param kwargs: expects 'epsilon'
        :return: int
        """
        if self.training and np.random.random() < kwargs['epsilon']:
            return self.env.action_space.sample()

        proba = self.sess.run(self.actor_proba, {self.states: state.reshape((1, -1))})[0]
        return np.argmax(proba)

    def act_bayesian(self, state, **kwargs):
        """
        :param state: 1d np.ndarray
        :return: int
        """
        assert isinstance(state, np.ndarray) and state.ndim == 1

        if self.training:
            # sample an action from the actor's output distribution
            return self.sess.run(self.sampled_actions, {self.states: state.reshape((1, -1))})
        else:
            # act greedily w.r.t. the actor's output
            return self.sess.run(self.selected_actions, {self.states: state.reshape((1, -1))})

    def __build_graph(self):
        # c: critic, a: actor
        self.learning_rate_c = tf.placeholder(tf.float32, shape=None, name='learning_rate_c')
        self.learning_rate_a = tf.placeholder(tf.float32, shape=None, name='learning_rate_a')

        # inputs
        self.states = tf.placeholder(tf.float32, shape=(None, self.state_size), name='state')
        self.states_next = tf.placeholder(tf.float32, shape=(None, self.state_size), name='state_next')
        self.actions = tf.placeholder(tf.int32, shape=(None,), name='action')
        self.rewards = tf.placeholder(tf.float32, shape=(None,), name='reward')

        # actor: action logits
        self.actor = dense_nn(self.states, self.layer_sizes + [self.action_size],
                              training=self.training, name='actor')
        # integer tensors
        self.sampled_actions = tf.squeeze(tf.multinomial(self.actor, 1))
        self.selected_actions = tf.squeeze(tf.argmax(self.actor, axis=-1))
        self.actor_proba = tf.nn.softmax(self.actor)
        self.actor_vars = self.scope_vars('actor')

        # critic: action value (Q-value)
        self.critic = dense_nn(self.states, self.layer_sizes + [1],
                               training=self.training, name='critic')
        self.critic_vars = self.scope_vars('critic')

        self.td_targets = self.rewards + self.gamma * tf.squeeze(
            dense_nn(self.states_next, self.layer_sizes + [1],
                     training=self.training, name='critic', reuse=True))

        action_ohe = tf.one_hot(self.actions, self.action_size,
                                dtype=tf.float32, name='action_one_hot')
        self.pred_value = tf.reduce_sum(self.critic * action_ohe, axis=-1, name='q_action')
        self.td_errors = tf.stop_gradient(self.td_targets) - self.pred_value

        with tf.variable_scope('critic_train'):
            # self.reg_c = tf.reduce_mean([tf.nn.l2_loss(x) for x in self.critic_vars])
            self.loss_c = tf.reduce_mean(tf.square(self.td_errors))  # + 0.001 * self.reg_c
            self.optim_c = tf.train.AdamOptimizer(self.learning_rate_c)
            self.grads_c = self.optim_c.compute_gradients(self.loss_c, self.critic_vars)
            if self.grad_clip_norm:
                self.grads_c = [(tf.clip_by_norm(grad, self.grad_clip_norm), var)
                                for grad, var in self.grads_c]
            self.train_op_c = self.optim_c.apply_gradients(self.grads_c)

        with tf.variable_scope('actor_train'):
            # self.reg_a = tf.reduce_mean([tf.nn.l2_loss(x) for x in self.actor_vars])
            self.loss_a = tf.reduce_mean(
                tf.stop_gradient(self.td_errors) *
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    logits=self.actor, labels=self.actions),
                name='loss_actor')  # + 0.001 * self.reg_a
            self.optim_a = tf.train.AdamOptimizer(self.learning_rate_a)
            self.grads_a = self.optim_a.compute_gradients(self.loss_a, self.actor_vars)
            if self.grad_clip_norm:
                self.grads_a = [(tf.clip_by_norm(grad, self.grad_clip_norm), var)
                                for grad, var in self.grads_a]
            self.train_op_a = self.optim_a.apply_gradients(self.grads_a)

        with tf.variable_scope('summary'):
            self.grads_a_summ = [tf.summary.scalar('grads/a_' + var.name, tf.norm(grad))
                                 for grad, var in self.grads_a if grad is not None]
            self.grads_c_summ = [tf.summary.scalar('grads/c_' + var.name, tf.norm(grad))
                                 for grad, var in self.grads_c if grad is not None]
            self.loss_c_summ = tf.summary.scalar('loss/critic', self.loss_c)
            self.loss_a_summ = tf.summary.scalar('loss/actor', self.loss_a)
            self.merged_summary = tf.summary.merge_all(key=tf.GraphKeys.SUMMARIES)

        self.train_ops = [self.train_op_a, self.train_op_c]
        self.init_vars = tf.global_variables_initializer()

    def train(self, n_episodes, annealing_episodes=None, every_episode=None,
              done_rewards=None, **kwargs):
        if self.training is False:
            raise Exception('prohibited to call train() for a non-training model')

        step = 0
        reward_history = []
        reward_averaged = []

        lr_c = self.lr_c
        lr_a = self.lr_a

        eps = self.epsilon
        annealing_episodes = annealing_episodes or n_episodes
        eps_drop = (eps - self.epsilon_final) / annealing_episodes
        print "eps_drop: {}".format(eps_drop)

        self.sess.run(self.init_vars)

        for n_episode in range(n_episodes):
            ob = self.env.reset()
            episode_reward = 0.
            done = False

            while not done:
                a = self.act(ob, epsilon=eps)
                ob_next, r, done, _ = self.env.step(a)
                step += 1
                episode_reward += r

                if done:
                    r = done_rewards or 0.

                self.memory.add(Record(ob, a, r, ob_next))
                ob = ob_next

                while self.memory.size >= self.batch_size:
                    batch = self.memory.pop(self.batch_size)
                    _, summ_str = self.sess.run(
                        [self.train_ops, self.merged_summary],
                        feed_dict={
                            self.learning_rate_c: lr_c,
                            self.learning_rate_a: lr_a,
                            self.states: batch['s'],
                            self.actions: batch['a'],
                            self.rewards: batch['r'],
                            self.states_next: batch['s_next']
                        })
                    self.writer.add_summary(summ_str, step)

            reward_history.append(episode_reward)
            reward_averaged.append(np.mean(reward_history[-10:]))

            lr_c *= self.lr_c_decay
            lr_a *= self.lr_a_decay
            if eps > self.epsilon_final:
                eps -= eps_drop

            if reward_history and every_episode and n_episode % every_episode == 0:
                print("[episodes: {}/step: {}], best: {}, avg10: {:.2f}: {}, lr: {:.4f} | {:.4f} eps: {:.4f}".format(
                    n_episode, step, np.max(reward_history), np.mean(reward_history[-10:]),
                    reward_history[-5:], lr_c, lr_a, eps))

        self.save_model(step=step)

        print "[training completed] episodes: {}, Max reward: {}, Average reward: {}".format(
            len(reward_history), np.max(reward_history), np.mean(reward_history))

        fig_path = os.path.join(self.model_path, 'figs')
        makedirs(fig_path)
        fig_file = os.path.join(fig_path, '{n}-{t}.png'.format(n=self.name, t=int(time.time())))
        plot_learning_curve(fig_file,
                            {'reward': reward_history, 'reward_avg': reward_averaged},
                            xlabel='episode')

    def evaluate(self, n_episodes):
        if self.training:
            raise Exception('prohibited to call evaluate() for a training model')

        reward_history = []
        for episode in xrange(n_episodes):
            state = self.env.reset()
            reward_episode = 0.

            while True:
                action = self.act(state)
                new_state, reward, done, _ = self.env.step(action)
                reward_episode += reward
                state = new_state
                if done:
                    break

            reward_history.append(reward_episode)

        return reward_history
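As with DqnPolicy, a short usage sketch; the environment name and episode counts are illustrative assumptions, and BaseTFModel is again assumed to supply the session, summary writer and saver used above.

import gym

env = gym.make('CartPole-v0')

# With act='bayesian', actions are sampled from the actor's output distribution
# during training and chosen greedily at evaluation time.
policy = ActorCriticPolicy(env, training=True,
                           gamma=0.9, lr_a=0.01, lr_c=0.01,
                           layer_sizes=[32], act='bayesian')
policy.train(n_episodes=800, annealing_episodes=720, every_episode=10)

# Alternatively, act='epsilon' uses epsilon-greedy exploration over the actor's argmax:
# policy = ActorCriticPolicy(env, training=True, act='epsilon')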