# Assumed imports for this training script; ReplayBuffer and agent_factory are
# project-local helpers used below.
import gym
import numpy as np
import tensorflow as tf


def main(config):
    env_name = config['run']['env']
    env = gym.make(env_name)

    np.random.seed(config['random_seed'])
    tf.random.set_seed(config['random_seed'])
    env.seed(config['random_seed'])

    batch_size = config['train']['batch_size']
    state_dim = env.observation_space.shape
    # Use action_dim[0]: (a_dim,) --> a_dim
    action_dim = env.action_space.shape[0]
    # Define action boundaries for continuous but bounded action space
    action_low = env.action_space.low
    action_high = env.action_space.high

    print(f'-------- {env_name} --------')
    print('STATE DIM: ', state_dim)
    print('ACTION DIM: ', action_dim)
    print('ACTION LOW: ', action_low)
    print('ACTION HIGH: ', action_high)
    print('----------------------------')

    # Initialize memory for experience replay
    replay_buffer = ReplayBuffer(config['train']['replay_buffer_size'],
                                 config['random_seed'])

    # Take a random action in the environment to initialize networks
    env.reset()
    _, initial_reward, _, _ = env.step(env.action_space.sample())

    # Use agent_factory to build the agent using the algorithm specified in the config file
    Agent = agent_factory(config['agent']['model'])
    agent = Agent(config, state_dim, action_dim, action_low, action_high,
                  initial_reward)

    for episode in range(int(config['train']['max_episodes'])):
        s = env.reset()
        s = s / 255.0

        episode_reward = 0
        episode_average_max_q = 0
        loss_actor = criticQ = criticV = None

        for step in range(int(config['train']['max_episode_len'])):
            if config['run']['render_env']:
                env.render()

            # 1. Use current behavioural policy network to predict an action to take
            # TODO: the [0] works for new SAC. Check again with DDPG updates.
            a = agent.actor.model.predict(np.expand_dims(s, 0))[0]
            # print('ACTION: ', a)
            # print('a[0]: ', a[0])

            # 2. Use action to take step in environment and receive next step, reward, etc.
            s2, r, terminal, info = env.step(a[0])
            s2 = s2 / 255.0

            # 3. Update the replay buffer with the most recent experience
            replay_buffer.add(np.reshape(s, state_dim),
                              np.reshape(a, action_dim), r,
                              np.reshape(s2, state_dim), terminal)

            # 4. When there are enough experiences in the replay buffer, sample minibatches of training experiences
            if replay_buffer.size() > batch_size:
                experience = replay_buffer.sample_batch(batch_size)

                # Train current behavioural networks
                # predicted_Q_value = agent.train_networks(experience)
                loss_actor, criticQ, criticV = agent.train_networks(experience)

                # Update for logging
                # episode_average_max_q += np.amax(predicted_Q_value)

                # Soft update of frozen target networks
                agent.update_target_networks()

            # Update information for next step
            s = s2
            episode_reward += r

            if terminal:
                print(f'Episode {episode} training losses: '
                      f'ACTOR: {loss_actor} | CRITIC_Q: {criticQ} | CRITIC_V: {criticV}')
                # print(f'| Reward: {int(episode_reward)} | Episode: {episode} | Qmax: {episode_average_max_q / float(step)}')
                break

    if config['run']['use_gym_monitor']:
        env.monitor.close()
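# Minimal sketch of the ReplayBuffer interface assumed by the main() loops in this
# file (seeded constructor, add / size / sample_batch). The project's real buffer
# may differ; this is only an illustration of the contract used above.
import random
from collections import deque


class ReplayBuffer:
    def __init__(self, buffer_size, random_seed=1234):
        self.buffer = deque(maxlen=buffer_size)
        random.seed(random_seed)

    def add(self, s, a, r, s2, terminal):
        # Store one transition tuple.
        self.buffer.append((s, a, r, s2, terminal))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        # Uniformly sample a minibatch and stack each field into an array.
        batch = random.sample(self.buffer, batch_size)
        s, a, r, s2, t = map(np.array, zip(*batch))
        return s, a, r, s2, t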
class Trainer():
    def __init__(self, params: Parameters):
        self.parms = params
        self.env = Env(params.game, params.gamma, norm_rewards=None, norm_states=False)
        self.buffer = ReplayBuffer(params.replay_size)

        # Seed
        self.env.seed(params.seed)
        np.random.seed(params.seed)
        tf.random.set_seed(params.seed)

        self.critic = DDPGValueNet(feature_shape=self.env.features_shape,
                                   a_num=self.env.num_actions,
                                   lr=params.lr_c)
        self.target_critic = DDPGValueNet(feature_shape=self.env.features_shape,
                                          a_num=self.env.num_actions,
                                          lr=params.lr_c)
        self._copy_para(self.critic.model, self.target_critic.model)

        self.actor = CtsPolicy(action_bound=self.env.action_bound,
                               action_dim=self.env.num_actions,
                               lr=params.lr_a)
        self.target_actor = CtsPolicy(action_bound=self.env.action_bound,
                                      action_dim=self.env.num_actions,
                                      lr=params.lr_a)
        self._copy_para(self.actor, self.target_actor)

        self.ema = tf.train.ExponentialMovingAverage(decay=1.0 - self.parms.tau)

    def _copy_para(self, from_model, to_model):
        """
        Copy parameters for soft updating
        :param from_model: latest model
        :param to_model: target model
        :return: None
        """
        for i, j in zip(from_model.trainable_weights, to_model.trainable_weights):
            j.assign(i)

    def _ema_update(self):
        paras = self.actor.trainable_weights + self.critic.model.trainable_weights
        self.ema.apply(paras)
        for i, j in zip(self.target_actor.trainable_weights +
                        self.target_critic.model.trainable_weights, paras):
            i.assign(self.ema.average(j))

    def _train(self):
        # Sample
        batch = self.buffer.sample(self.parms.batch_size)
        s = np.array([batch_[0] for batch_ in batch])
        a = np.array([batch_[1] for batch_ in batch])
        r = np.array([batch_[2] for batch_ in batch])
        s_next = np.array([batch_[3] for batch_ in batch])
        not_done = np.array([not batch_[4] for batch_ in batch])

        # Reshape
        r = r[:, np.newaxis]
        not_done = not_done[:, np.newaxis]

        # Train critic
        with tf.GradientTape() as tape:
            pi_next = self.target_actor(s_next)
            a_next = pi_next.sample()
            q_next = self.target_critic([s_next, a_next])
            y = r + self.parms.gamma * q_next * not_done
            q = self.critic([s, a])
            c_loss = tf.losses.mean_squared_error(y, q)
        c_grads = tape.gradient(c_loss, self.critic.model.trainable_weights)
        self.critic.model.optimizer.apply_gradients(
            zip(c_grads, self.critic.model.trainable_weights))

        # Train actor
        with tf.GradientTape() as tape:
            pi = self.actor(s)
            a = pi.sample()
            q = self.critic([s, a])
            a_loss = -tf.reduce_mean(q)
        a_grads = tape.gradient(a_loss, self.actor.trainable_weights)
        self.actor.optimizer.apply_gradients(
            zip(a_grads, self.actor.trainable_weights))

        self._ema_update()

    def train_step(self):
        # Episode information
        episode_ret = []

        # Initialize s
        s = self.env.reset()
        for _ in range(self.parms.train_step_len):
            # Interact
            pi = self.actor(s[np.newaxis, :])  # batch_size=1
            a = pi.sample()[0]
            s_next, r, done, info = self.env.step(a)

            # Store
            self.buffer.store((s, a, r, s_next, done))

            # Train
            if self.buffer.size() > self.parms.start_size:
                self._train()

            if done:
                _, ret = info['done']
                episode_ret.append(ret)
                s_next = self.env.reset()
            s = s_next

        return np.mean(episode_ret)
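# Hypothetical usage sketch for the DDPG Trainer above. The project's Parameters
# class is not shown, so a SimpleNamespace with the fields the Trainer reads
# (game, gamma, seed, replay_size, lr_a, lr_c, tau, batch_size, start_size,
# train_step_len) stands in for it; all names and values here are assumptions.
from types import SimpleNamespace

params = SimpleNamespace(game='Pendulum-v0', gamma=0.99, seed=0,
                         replay_size=100_000, lr_a=1e-3, lr_c=2e-3, tau=0.01,
                         batch_size=64, start_size=1000, train_step_len=200)
trainer = Trainer(params)
for iteration in range(300):
    # Each train_step interacts with the env for train_step_len steps and
    # returns the mean return of episodes completed in that window.
    mean_return = trainer.train_step()
    print('iteration {}: mean episode return {:.2f}'.format(iteration, mean_return))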
def q_learning(sess, env, agent, num_episodes, max_time_per_episode,
               discount_factor=0.99, epsilon=0.4, epsilon_decay=.95,
               use_experience_replay=False, max_replay_buffer_size=4000,
               batch_size=128, target=None, tf_saver=None, save_path=None,
               save_interval=None):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.
    Implements the options of online learning or using experience replay and also
    target calculation by target networks, depending on the flags. You can reuse
    your Q-learning implementation of the last exercise.

    Args:
        env: PLE game
        approx: Action-Value function estimator
        num_episodes: Number of episodes to run for.
        max_time_per_episode: maximum number of time steps before episode is terminated
        discount_factor: gamma, discount factor of future rewards.
        epsilon: Chance to sample a random action. Float between 0 and 1.
        epsilon_decay: decay rate of epsilon parameter
        use_experience_replay: Indicator if experience replay should be used.
        batch_size: Number of samples per batch.
        target: Slowly updated target network to calculate the targets. Ignored if None.

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """
    # Keeps track of useful statistics
    stats = EpisodeStats(episode_lengths=np.zeros(num_episodes),
                         episode_rewards=np.zeros(num_episodes))

    replay_buffer = ReplayBuffer(max_replay_buffer_size)
    action_set = env.getActionSet()

    for i_episode in range(num_episodes):
        # The policy we're following
        policy = make_epsilon_greedy_policy(agent.predict, len(action_set))

        # Print out which episode we're on, useful for debugging.
        # Also print reward for last episode
        last_reward = stats.episode_rewards[i_episode - 1]
        avg_reward = np.mean(stats.episode_rewards[max(i_episode - 100, 0):i_episode])
        print("\rEpisode {}/{} ({}), avg reward: {}".format(
            i_episode + 1, num_episodes, last_reward, avg_reward), end="")
        # sys.stdout.flush()

        # Reset the current environment
        env.reset_game()
        state = list(env.getGameState())

        done = False
        loss = None

        # Iterate through steps
        for t in range(max_time_per_episode):
            if env.game_over():
                done = True

            # Update target network maybe
            if target:
                pass

            # Take a step
            action_probs = policy([state], epsilon)
            action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
            reward = env.act(action_set[action])
            next_state = list(env.getGameState())

            # episode stats
            stats.episode_lengths[i_episode] = t
            # print(reward)
            stats.episode_rewards[i_episode] += reward

            if done:
                print("\rStep {} ({}) loss: {}\n".format(
                    t, max_time_per_episode, loss), end="")
                break

            if use_experience_replay:
                # Update replay buffer
                replay_buffer.add_transition(state, action, next_state, reward, done)

                # Sample minibatch from replay buffer
                batch_states, batch_actions, batch_next_states, batch_rewards, batch_dones = \
                    replay_buffer.next_batch(min(batch_size, replay_buffer.size()))
                batch_actions = list(zip(range(len(batch_actions)), batch_actions))

                # Calculate TD target for batch. Use "old" fixed parameters of the target
                # network if one is available, else the current value function estimate.
                batch_next_q_values = (target if target else agent.train_model).predict(
                    batch_next_states, None, None)
                batch_best_next_action = np.argmax(batch_next_q_values, axis=1)
                batch_td_target = [
                    batch_rewards[j] + discount_factor *
                    batch_next_q_values[j][batch_best_next_action[j]]
                    for j in range(len(batch_states))
                ]

                # Update Q value estimator parameters by optimizing between Q network and Q-learning targets
                loss = agent.train(batch_states, batch_actions, batch_td_target)
            else:
                next_q_values = (target if target else agent).predict(
                    [next_state], None, None)
                best_next_action = np.argmax(next_q_values, axis=1)
                # Index the Q-value of the greedy next action (rather than multiplying the
                # whole Q vector by the action index).
                td_target = reward + discount_factor * next_q_values[0][best_next_action[0]]
                loss = agent.train([state], [[0, action]], td_target)

            if target:
                target.update()

            epsilon *= epsilon_decay
            state = next_state

        if i_episode % save_interval == 0:
            tf_saver.save(sess, save_path, global_step=i_episode)

    return stats
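# Sketch of the make_epsilon_greedy_policy helper referenced above, inferred from
# how it is called: it wraps a Q-value predictor (called as predict(states, None,
# None), as elsewhere in this function) into a function returning per-action
# probabilities. The project's real helper may differ in signature and details.
def make_epsilon_greedy_policy(predict_fn, num_actions):
    def policy_fn(states, epsilon):
        # Query Q-values for the (single) given state.
        q_values = np.asarray(predict_fn(states, None, None))[0]
        # Uniform exploration mass, plus the remaining mass on the greedy action.
        probs = np.ones(num_actions) * epsilon / num_actions
        best_action = np.argmax(q_values)
        probs[best_action] += 1.0 - epsilon
        return probs
    return policy_fn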
def q_learning(q_network, env, test_env, seed, total_timesteps, log_interval,
               test_interval, show_interval, logdir, lr, max_grad_norm,
               units_per_hlayer, activ_fcn, gamma=0.95, epsilon=0.4,
               epsilon_decay=.95, buffer_size=4000, batch_size=128,
               trace_length=32, tau=0.99, update_interval=30, early_stop=False,
               keep_model=2, save_model=True, restore_model=False,
               save_traj=False):
    # """
    # Q-Learning algorithm for off-policy TD control using Function Approximation.
    # Finds the optimal greedy policy while following an epsilon-greedy policy.
    # Implements the options of online learning or using experience replay and also
    # target calculation by target networks, depending on the flags. You can reuse
    # your Q-learning implementation of the last exercise.
    #
    # Args:
    #     env: PLE game
    #     approx: Action-Value function estimator
    #     num_episodes: Number of episodes to run for.
    #     max_time_per_episode: maximum number of time steps before episode is terminated
    #     discount_factor: gamma, discount factor of future rewards.
    #     epsilon: Chance to sample a random action. Float between 0 and 1.
    #     epsilon_decay: decay rate of epsilon parameter
    #     use_experience_replay: Indicator if experience replay should be used.
    #     batch_size: Number of samples per batch.
    #     target: Slowly updated target network to calculate the targets. Ignored if None.
    #
    # Returns:
    #     An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    # """
    logger = logging.getLogger(__name__)
    # logger.info(datetime.time)
    tf.reset_default_graph()
    set_global_seeds(seed)

    # Params
    ob_space = env.observation_space
    ac_space = env.action_space
    nd, = ob_space.shape
    n_ac = ac_space.n

    # Create learning agent and the replay buffer
    agent = DQNAgent(q_network=q_network,
                     ob_space=ob_space,
                     ac_space=ac_space,
                     lr=lr,
                     max_grad_norm=max_grad_norm,
                     units_per_hlayer=units_per_hlayer,
                     activ_fcn=activ_fcn,
                     log_interval=log_interval,
                     logdir=logdir,
                     batch_size=batch_size,
                     trace_length=trace_length,
                     update_interval=update_interval,
                     tau=tau,
                     keep_model=keep_model)
    summary_writer = agent.get_summary_writer()
    result_path = os.path.join(logdir, 'train_results.csv')
    if save_traj:
        rew_traj = []
        rew_results_path = os.path.join(logdir, ('lr' + str(lr) + '_tracking_results.csv'))
    else:
        rew_results_path = None

    replay_buffer = ReplayBuffer(buffer_size)

    # Keeps track of useful statistics
    stats = {'episode_lengths': [], 'episode_rewards': []}

    if restore_model:
        for el in os.listdir(logdir):
            if 'final' in el and '.meta' in el:
                # Load pre-trained model and set network parameters
                logger.info('load %s' % os.path.join(logdir, el[:-5]))
                agent.load(os.path.join(logdir, el[:-5]))
                # Reset global step parameter.
                agent.sess.run(agent.global_step.assign(0))

    # ------------------ TRAINING --------------------------------------------
    logger.info("Start Training")
    early_stopped = False
    i_episode, i_sample, i_train = 0, 0, 0
    ep_len, ep_rew = 0, 0
    horizon = 100
    reward_window = deque(maxlen=horizon)
    avg_rm = deque(maxlen=30)
    nbatch = batch_size * trace_length
    return_threshold = -0.05  # 40

    # Reset env
    obs = env.reset()
    obs = normalize_obs(obs)
    done = False

    rnn_state0 = agent.step_initial_state
    if rnn_state0 is None:
        # If we use a normal feed-forward architecture, we sample a batch of
        # single samples, not a batch of sequences.
        trace_length = 1

    # Set the target network to be equal to the primary network
    agent.update_target(agent.target_ops)

    while i_sample < total_timesteps:
        if np.random.rand(1) < epsilon:
            _, next_rnn_state = agent.step([obs], rnn_state0)  # epsilon greedy action
            action = np.random.randint(0, n_ac)
        else:
            AP, next_rnn_state = agent.step([obs], rnn_state0)  # epsilon greedy action
            action = AP[0]
        next_obs, reward, done, _ = env.step(action)
        next_obs = normalize_obs(next_obs)
        i_sample += 1

        # render only every i-th episode
        if show_interval != 0:
            if i_episode % show_interval == 0:
                env.render()

        ep_len += 1
        ep_rew += reward
        reward_window.append(reward)

        # When episode is done, add episode information to tensorboard summary and stats
        if done:  # env.game_over():
            next_obs = list(np.zeros_like(next_obs, dtype=np.float32))
            stats['episode_lengths'].append(ep_len)
            stats['episode_rewards'].append(ep_rew)
            if summary_writer is not None:
                summary = tf.Summary()
                summary.value.add(tag='envs/ep_return',
                                  simple_value=stats['episode_rewards'][i_episode])
                summary.value.add(tag="envs/ep_length",
                                  simple_value=stats['episode_lengths'][i_episode])
                summary_writer.add_summary(summary, i_episode)
                summary_writer.flush()
            if save_model and ep_rew > return_threshold:
                return_threshold = ep_rew
                logger.info('Save model at max reward %s' % return_threshold)
                agent.save('inter_model')
            i_episode += 1
            ep_len, ep_rew = 0, 0

        # Update replay buffer
        replay_buffer.add_transition(obs, action, next_obs, reward, done)
        if save_traj:
            rew_traj.append(reward)

        # Update model parameters every #update_interval steps. Use real experience and replayed experience.
        if replay_buffer.size() > nbatch and (i_sample % update_interval == 0):
            if env.spec._env_name == 'ContFlappyBird':
                rm = sum(reward_window) / horizon
                if summary_writer is not None:
                    s_summary = tf.Summary()
                    s_summary.value.add(tag='envs/isample_return', simple_value=rm)
                    summary_writer.add_summary(s_summary, i_sample)
                    summary_writer.flush()
                if save_model and rm > return_threshold:
                    return_threshold = rm
                    logger.info('Save model at max rolling mean %s' % return_threshold)
                    agent.save('inter_model')
                avg_rm.append(rm)

            if early_stop:
                if (i_sample > 60000) and (i_sample <= (60000 + update_interval)):
                    if (sum(avg_rm) / 30) <= -0.88:
                        print('early stopping triggered')
                        early_stopped = True
                        break

            agent.update_target(agent.target_ops)

            # reset rnn state (history knowledge) before every training step
            rnn_state_train = agent.train_initial_state

            # Sample training mini-batch from replay buffer
            if rnn_state_train is not None:
                mb_obs, mb_actions, mb_next_obs, mb_rewards, _, batch_dones = \
                    replay_buffer.recent_and_next_batch_of_seq(batch_size, trace_length)
            else:
                mb_obs, mb_actions, mb_next_obs, mb_rewards, _, batch_dones = \
                    replay_buffer.recent_and_next_batch(batch_size)

            # Calculate TD target for batch. Use the "old" fixed parameters of the
            # target network to compute the targets.
            # mb_next_obs = np.reshape(mb_next_obs, (-1, nd))
            mb_next_q_values, _ = agent.target_model.predict(mb_next_obs, rnn_state_train)
            mb_best_next_action = np.argmax(mb_next_q_values, axis=1)
            mb_td_target = [
                mb_rewards[j] + gamma * mb_next_q_values[j][mb_best_next_action[j]]
                for j in range(nbatch)
            ]

            # Update Q value estimator parameters by optimizing between Q network and Q-learning targets
            loss = agent.train(mb_obs, mb_actions, mb_td_target, rnn_state_train)
            i_train += 1

            # If test_interval > 0 the learned model is evaluated every "test_interval" gradient updates
            if test_interval > 0 and i_train > 0 and (i_train % test_interval == 0):
                ep_return = agent.test_run(test_env, n_eps=10, n_pipes=2000)
                with open(result_path, "a") as csvfile:
                    writer = csv.writer(csvfile)
                    ep_return[0:0] = [i_sample, i_train]
                    writer.writerow(ep_return)

        if done:
            # Reset the environment
            next_obs = env.reset()
            next_obs = normalize_obs(next_obs)
            epsilon *= epsilon_decay

        obs = next_obs
        rnn_state0 = next_rnn_state

    # Save final model when training is finished.
    if save_model:
        agent.save('final_model')
        logger.info('Finished Training. Saving Final model.')

    if rew_results_path is not None:
        logger.info('Save reward trajectory to %s' % rew_results_path)
        with open(rew_results_path, "a") as csvfile:
            writer = csv.writer(csvfile)
            traj = np.asanyarray(rew_traj).reshape(-1).tolist()
            traj[0:0] = [np.mean(traj)]  # i_train, i_sample
            writer.writerow(traj)

    logger.info('*******************************************************')
    logger.info('Total number of interactions with the environment: %s' % i_sample)
    logger.info('Total number of parameter updates during training: %s' % i_train)
    logger.info('*******************************************************\n')

    return early_stopped, i_sample
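# Minimal sketch of the normalize_obs helper used above; here assumed to rescale
# the raw observation vector into a fixed range with hard-coded bounds. The
# project's actual helper may use environment-specific statistics instead.
def normalize_obs(obs, obs_low=-10.0, obs_high=10.0):
    obs = np.asarray(obs, dtype=np.float32)
    # Map [obs_low, obs_high] linearly onto [-1, 1].
    return 2.0 * (obs - obs_low) / (obs_high - obs_low) - 1.0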
class Trainer():
    def __init__(self, params: Parameters):
        self.parms = params
        self.env = Env(params.game, params.gamma, norm_rewards=None, norm_states=False)
        self.buffer = ReplayBuffer(params.replay_size)

        # Seed
        self.env.seed(params.seed)
        np.random.seed(params.seed)
        tf.random.set_seed(params.seed)

        # Four critic nets
        critic_nets = [
            DDPGValueNet(feature_shape=self.env.features_shape,
                         a_num=self.env.num_actions,
                         lr=params.lr_c) for _ in range(4)
        ]
        self.critic1, self.critic2, self.target_critic1, self.target_critic2 = critic_nets

        # Two actor nets
        self.actor = CtsPolicy(action_bound=self.env.action_bound,
                               action_dim=self.env.num_actions,
                               lr=params.lr_a)
        self.target_actor = CtsPolicy(action_bound=self.env.action_bound,
                                      action_dim=self.env.num_actions,
                                      lr=params.lr_a)

        # Copy params
        self._copy_para(self.critic1, self.target_critic1)
        self._copy_para(self.critic2, self.target_critic2)
        self._copy_para(self.actor, self.target_actor)

        self.train_step_cnt = 0

    def _copy_para(self, from_model, to_model):
        """
        Copy parameters for soft updating
        :param from_model: latest model
        :param to_model: target model
        :return: None
        """
        for i, j in zip(from_model.trainable_weights, to_model.trainable_weights):
            j.assign(i)

    def _target_soft_update(self, net, target_net):
        """ soft update the target net with Polyak averaging """
        for target_param, param in zip(target_net.trainable_weights, net.trainable_weights):
            # copy weight value into target parameters
            target_param.assign(target_param * (1.0 - self.parms.tau) +
                                param * self.parms.tau)

    def _train(self):
        # Sample
        batch = self.buffer.sample(self.parms.batch_size)
        s = np.array([batch_[0] for batch_ in batch])
        a = np.array([batch_[1] for batch_ in batch])
        r = np.array([batch_[2] for batch_ in batch])
        s_next = np.array([batch_[3] for batch_ in batch])
        not_done = np.array([not batch_[4] for batch_ in batch])

        # Reshape
        r = r[:, np.newaxis]
        not_done = not_done[:, np.newaxis]

        # Set target y using the clipped double-Q minimum
        pi_next = self.target_actor(s_next)
        a_next = pi_next.sample()
        q_next = tf.minimum(self.target_critic1([s_next, a_next]),
                            self.target_critic2([s_next, a_next]))
        y = r + self.parms.gamma * q_next * not_done

        # Train critic1
        with tf.GradientTape() as c1_tape:
            q1 = self.critic1([s, a])
            c1_loss = tf.losses.mean_squared_error(y, q1)
        c1_grads = c1_tape.gradient(c1_loss, self.critic1.trainable_weights)
        self.critic1.optimizer.apply_gradients(
            zip(c1_grads, self.critic1.trainable_weights))

        # Train critic2
        with tf.GradientTape() as c2_tape:
            q2 = self.critic2([s, a])
            c2_loss = tf.losses.mean_squared_error(y, q2)
        c2_grads = c2_tape.gradient(c2_loss, self.critic2.trainable_weights)
        self.critic2.optimizer.apply_gradients(
            zip(c2_grads, self.critic2.trainable_weights))

        # Train actor (delayed policy update)
        if self.train_step_cnt % self.parms.actor_interval == 0:
            with tf.GradientTape() as a_tape:
                pi = self.actor(s)
                a = pi.sample()
                q = self.critic1([s, a])
                a_loss = -tf.reduce_mean(q)
            a_grads = a_tape.gradient(a_loss, self.actor.trainable_weights)
            self.actor.optimizer.apply_gradients(
                zip(a_grads, self.actor.trainable_weights))

            # update params
            self._target_soft_update(self.actor, self.target_actor)
            self._target_soft_update(self.critic1, self.target_critic1)
            self._target_soft_update(self.critic2, self.target_critic2)

    def train_step(self):
        # Episode information
        episode_ret = []

        # Initialize s
        s = self.env.reset()
        for _ in range(self.parms.train_step_len):
            # Interact
            pi = self.actor(s[np.newaxis, :])  # batch_size=1
            a = pi.sample()[0]
            s_next, r, done, info = self.env.step(a)

            # Store
            self.buffer.store((s, a, r, s_next, done))

            # Train
            if self.buffer.size() > self.parms.start_size:
                self._train()
                self.train_step_cnt += 1

            if done:
                _, ret = info['done']
                episode_ret.append(ret)
                s_next = self.env.reset()
            s = s_next

        return np.mean(episode_ret)
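# Rough sketch of the CtsPolicy interface assumed by both trainers above: a
# subclassed Keras model whose call() returns a Gaussian action distribution,
# exposing trainable_weights and an .optimizer attribute. This is an assumption
# for illustration; the project's actual policy network will differ in
# architecture and in how actions are bounded.
import tensorflow_probability as tfp


class CtsPolicy(tf.keras.Model):
    def __init__(self, action_bound, action_dim, lr, hidden=64):
        super().__init__()
        self.action_bound = action_bound
        self.h1 = tf.keras.layers.Dense(hidden, activation='relu')
        self.mu = tf.keras.layers.Dense(action_dim, activation='tanh')
        # State-independent log standard deviation.
        self.log_std = tf.Variable(tf.zeros(action_dim), trainable=True)
        self.optimizer = tf.keras.optimizers.Adam(lr)

    def call(self, states):
        # Weights are created lazily on the first call, as usual for subclassed models.
        x = self.h1(tf.convert_to_tensor(states, dtype=tf.float32))
        mean = self.mu(x) * self.action_bound
        return tfp.distributions.Normal(mean, tf.exp(self.log_std))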
class DQNAgent:
    def __init__(self, env):
        self.env = env
        self.state_size = env.observation_space.shape
        self.action_size = env.action_space.n

        self.replay_buffer = ReplayBuffer(buffer_size=1000000, batch_size=32)

        self.target_update_frequency = 16
        self.target_update_counter = 0

        self.gamma = 0.95
        self.initial_epsilon = 1
        self.epsilon = self.initial_epsilon
        self.epsilon_decay_rate = 0.99995
        self.min_epsilon = 0.01
        self.rho = 0.95
        self.learning_rate = 0.00025

        self.training_scores = []

        # main model - gets trained every step
        self.model = self.build_model()

        # Target model - this is what we .predict against every step
        self.target_model = self.build_model()
        self.target_model.set_weights(self.model.get_weights())

    def build_model(self):
        # Neural Network architecture for the Deep-Q learning model
        model = Sequential()
        model.add(Conv2D(filters=32, kernel_size=8, strides=4,
                         activation='relu', input_shape=self.state_size))
        model.add(Conv2D(filters=32, kernel_size=4, strides=2, activation='relu'))
        model.add(Conv2D(filters=32, kernel_size=3, strides=1, activation='relu'))
        model.add(Flatten())
        model.add(Dense(256, activation='relu',
                        kernel_regularizer=regularizers.l2(0.001)))
        model.add(Dense(128, activation='relu',
                        kernel_regularizer=regularizers.l2(0.001)))
        model.add(Dense(self.action_size, activation='linear'))
        model.compile(loss=utils.huber_loss_mean,
                      optimizer=RMSprop(lr=self.learning_rate,
                                        rho=self.rho,
                                        epsilon=self.min_epsilon),
                      metrics=["accuracy"])
        model.summary()
        return model

    def reset_episode(self, initial_state):
        """Reset variables for a new episode."""
        # Gradually decrease exploration rate
        self.epsilon *= self.epsilon_decay_rate
        self.epsilon = max(self.epsilon, self.min_epsilon)

        self.prev_state = self.preprocess_state(initial_state)
        self.prev_action = np.argmax(self.model.predict(self.prev_state))
        return self.prev_action

    def preprocess_state(self, state):
        # Preprocessing code
        return np.expand_dims(np.array(state), axis=0)

    def reset_exploration(self, epsilon=None):
        """Reset exploration rate used when training."""
        self.epsilon = epsilon if epsilon is not None else self.initial_epsilon

    def plot_scores(self, scores, rolling_window=100):
        """Plot scores and optional rolling mean using specified window."""
        plt.title("Scores")
        plt.xlabel("Episodes -->")
        plt.ylabel("Scores -->")
        plt.plot(scores)
        rolling_mean = pd.Series(scores).rolling(rolling_window).mean()
        plt.plot(rolling_mean)

    def act(self, next_state, reward, done, mode="train", time_delay=None):
        """Pick next action and update weights of the neural network (when mode != 'test')."""
        next_state = self.preprocess_state(next_state)

        if mode == "test":
            # Test mode: Simply produce an action
            action = np.argmax(self.model.predict(next_state))
            if time_delay is not None:
                # Adding time delay to watch the agent perform at a little slower pace.
                time.sleep(time_delay)
        else:
            # Exploration vs. exploitation
            do_exploration = np.random.uniform(0, 1) < self.epsilon
            if do_exploration:
                # Pick a random action
                action = np.random.randint(0, self.action_size)
            else:
                # Pick the best action from the Q network
                action = np.argmax(self.model.predict(next_state))

            # Store the experience in replay memory
            self.replay_buffer.add(self.prev_state, self.prev_action, reward,
                                   next_state, done)

            # Learn
            self.replay(done)

        # Roll over current state, action for next step
        self.prev_state = next_state
        self.prev_action = action
        return action

    def replay(self, done):
        if self.replay_buffer.size() < self.replay_buffer.batch_size:
            return

        terminal_state = done  # Determine if the episode has ended.
        minibatch = self.replay_buffer.sample()

        # X: states, y: predictions
        X = []
        y = []

        prev_states = np.array([transition[0][0] for transition in minibatch])
        prev_qs = self.model.predict(prev_states)
        next_states = np.array([transition[3][0] for transition in minibatch])
        next_qs = self.target_model.predict(next_states)

        for index, (prev_state, prev_action, reward, next_state, done) in enumerate(minibatch):
            # Setting the target for the model to improve upon
            if not done:
                target = reward + (self.gamma * np.max(next_qs[index]))
            else:
                target = reward

            new_q_value = prev_qs[index]
            new_q_value[prev_action] = target

            X.append(prev_state)
            y.append(new_q_value)

        # Fit on all samples as one batch, log only on terminal state
        self.model.fit(np.vstack(X), np.vstack(y),
                       batch_size=self.replay_buffer.batch_size,
                       verbose=0, shuffle=False)

        if terminal_state:
            self.target_update_counter += 1

        # If counter reaches set value, update target network with weights of main network
        if self.target_update_counter > self.target_update_frequency:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0

    def run(self, num_episodes=20000, mode="train", time_delay=0.01,
            score_threshold=None, weights_path=None, scores_path=None):
        """Run agent in given reinforcement learning environment and return scores."""
        scores = []
        max_score = -np.inf
        min_score = np.inf
        max_avg_score = -np.inf
        avg_score = -np.inf

        for i_episode in range(1, num_episodes + 1):
            # Initialize episode
            state = self.env.reset()
            action = self.reset_episode(state)
            total_reward = 0
            done = False

            # Roll out steps until done
            while not done:
                next_state, reward, done, info = self.env.step(action)
                total_reward += reward
                action = self.act(next_state, reward, done, mode, time_delay)
                self.env.render()

            # Save final score
            scores.append(total_reward)

            # Print episode stats
            if mode == 'train':
                self.training_done = True
                if total_reward > max_score:
                    max_score = total_reward
                if total_reward < min_score:
                    min_score = total_reward
                if len(scores) > 100:
                    avg_score = np.mean(scores[-100:])
                    if avg_score > max_avg_score:
                        max_avg_score = avg_score
                if weights_path is not None and i_episode % 100 == 0:
                    self.model.save_weights(weights_path)
                if scores_path is not None:
                    logs = {"scores": scores}
                    logs = pd.DataFrame.from_dict(data=logs, orient='index')
                    logs.to_csv(scores_path, index=False)

                print("\rEpisode {}/{} | Episode Score: {} | Min. Score: {} | Max. Score: {} | "
                      "Current Avg. Score: {} | Max. Average Score: {} | epsilon: {}"
                      .format(i_episode, num_episodes, total_reward, min_score,
                              max_score, avg_score, max_avg_score, self.epsilon),
                      end="")
                sys.stdout.flush()

            # Terminating loop if the agent achieves reward threshold
            if score_threshold is not None and max_avg_score > score_threshold:
                print("\nEnvironment solved after {} episodes".format(i_episode))
                break

        # Close rendering
        self.env.close()

        if mode == "test":
            print("\nScore: ", np.mean(scores))
        else:
            self.training_scores.append(scores)
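# Sketch of the utils.huber_loss_mean referenced in build_model above, written
# against the Keras backend; the project's own helper may differ (e.g. in the
# value of delta or the reduction).
from tensorflow.keras import backend as K


def huber_loss_mean(y_true, y_pred, delta=1.0):
    # Quadratic for small errors, linear for large ones, averaged over the batch.
    error = y_true - y_pred
    quadratic = K.minimum(K.abs(error), delta)
    linear = K.abs(error) - quadratic
    return K.mean(0.5 * K.square(quadratic) + delta * linear)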
def main(config):
    tf.compat.v1.reset_default_graph()

    env_name = config['run']['env']
    env = gym.make(env_name)

    np.random.seed(config['random_seed'])
    tf.compat.v1.set_random_seed(config['random_seed'])
    env.seed(config['random_seed'])

    batch_size = config['train']['batch_size']
    state_dim = env.observation_space.shape
    # Use action_dim[0]: (a_dim,) --> a_dim
    action_dim = env.action_space.shape[0]
    # Define action boundaries for continuous but bounded action space
    action_low = env.action_space.low
    action_high = env.action_space.high

    print(f'-------- {env_name} --------')
    print('STATE DIM: ', state_dim)
    print('ACTION DIM: ', action_dim)
    print('ACTION LOW: ', action_low)
    print('ACTION HIGH: ', action_high)
    print('----------------------------')

    # Initialize memory for experience replay
    replay_buffer = ReplayBuffer(config['train']['replay_buffer_size'],
                                 config['random_seed'])

    # Set up summary TF operations
    summary_ops, summary_vars = build_summaries()

    with tf.compat.v1.Session() as sess:
        # sess.run(tf.compat.v1.global_variables_initializer())
        writer = tf.compat.v1.summary.FileWriter(config['output']['summary_dir'],
                                                 sess.graph)

        # Use agent_factory to build the agent using the algorithm specified in the config file.
        Agent = agent_factory(config['agent']['model'])
        agent = Agent(config, state_dim, action_dim, action_low, action_high, sess)

        sess.run(tf.compat.v1.global_variables_initializer())

        for i in range(int(config['train']['max_episodes'])):
            s = env.reset()

            episode_reward = 0
            episode_average_max_q = 0

            for j in range(int(config['train']['max_episode_len'])):
                if config['run']['render_env']:
                    env.render()

                # 1. Predict an action to take
                a = agent.actor.predict_action(np.expand_dims(s, 0))

                # 2. Use action to take step in environment and receive next step, reward, etc.
                s2, r, terminal, info = env.step(a[0])

                # 3. Update the replay buffer with the most recent experience
                replay_buffer.add(np.reshape(s, state_dim),
                                  np.reshape(a, action_dim), r,
                                  np.reshape(s2, state_dim), terminal)

                # 4. When there are enough experiences in the replay buffer, sample minibatches of training experiences
                if replay_buffer.size() > batch_size:
                    experience = replay_buffer.sample_batch(batch_size)

                    # Train current behavioural networks
                    predicted_Q_value = agent.train_networks(experience)

                    # Update for logging
                    episode_average_max_q += np.amax(predicted_Q_value)

                    # Update target networks
                    agent.update_target_networks()

                # Update information for next step
                s = s2
                episode_reward += r

                if terminal:
                    summary_str = sess.run(summary_ops,
                                           feed_dict={
                                               summary_vars[0]: episode_reward,
                                               summary_vars[1]: episode_average_max_q / float(j)
                                           })
                    writer.add_summary(summary_str, i)
                    writer.flush()

                    print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                        int(episode_reward), i, (episode_average_max_q / float(j))))
                    break

    if config['run']['use_gym_monitor']:
        env.monitor.close()
def main(config):
    env_name = config['run']['env']
    env = gym.make(env_name)

    np.random.seed(config['random_seed'])
    tf.set_random_seed(config['random_seed'])
    env.seed(config['random_seed'])

    batch_size = config['train']['batch_size']
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    # Define action boundaries for continuous but bounded action space
    action_bound = env.action_space.high

    print(f'-------- {env_name} --------')
    print('ACTION SPACE: ', action_dim)
    print('ACTION BOUND: ', action_bound)
    print('STATE SPACE: ', state_dim)
    print('----------------------------')

    # TODO (20190831, JP): add normalization for envs that require it.

    # Ensure action bound is symmetric - important
    assert np.all(env.action_space.high == -env.action_space.low)

    # Use agent_factory to build the agent using the algorithm specified in the config file.
    Agent = agent_factory(config['agent']['model'])
    agent = Agent(config, state_dim, action_dim, action_bound)

    # Initialize memory for experience replay
    replay_buffer = ReplayBuffer(config['train']['replay_buffer_size'],
                                 config['random_seed'])
    print(replay_buffer)

    # Set up summary TF operations
    summary_ops, summary_vars = build_summaries()

    with tf.Session() as sess:
        sess.run(tf.compat.v1.global_variables_initializer())
        writer = tf.summary.FileWriter(config['output']['summary_dir'], sess.graph)

        # Initialize target network weights
        agent.update_target_networks(sess)

        for i in range(int(config['train']['max_episodes'])):
            s = env.reset()

            episode_reward = 0
            episode_average_max_q = 0

            for j in range(int(config['train']['max_episode_len'])):
                if config['run']['render_env']:
                    env.render()

                # 1. Predict an action to take
                a = agent.actor_predict_action(np.reshape(s, (1, state_dim)), sess)

                # 2. Use action to take step in environment and receive next step, reward, etc.
                s2, r, terminal, info = env.step(a[0])

                # 3. Update the replay buffer with the most recent experience
                replay_buffer.add(np.reshape(s, (state_dim,)),
                                  np.reshape(a, (action_dim,)), r,
                                  np.reshape(s2, (state_dim,)), terminal)

                # 4. When there are enough experiences in the replay buffer, sample minibatches of training experiences
                if replay_buffer.size() > batch_size:
                    s_batch, a_batch, r_batch, s2_batch, t_batch = \
                        replay_buffer.sample_batch(batch_size)

                    # Train current behavioural networks
                    predicted_Q_value = agent.train_networks(s_batch, a_batch, r_batch,
                                                             s2_batch, t_batch, sess)

                    # Update for logging
                    episode_average_max_q += np.amax(predicted_Q_value)

                    # Update target networks
                    agent.update_target_networks(sess)

                # Update information for next step
                s = s2
                episode_reward += r

                # TODO (20190815, JP): as this could be different for each agent, do
                # agent.summarize_episode(summary_ops, summary_vars, episode_reward, sess)
                # for when each agent requires its own summaries?
                if terminal:
                    summary_str = sess.run(summary_ops,
                                           feed_dict={
                                               summary_vars[0]: episode_reward,
                                               summary_vars[1]: episode_average_max_q / float(j)
                                           })
                    writer.add_summary(summary_str, i)
                    writer.flush()

                    print('| Reward: {:d} | Episode: {:d} | Qmax: {:.4f}'.format(
                        int(episode_reward), i, (episode_average_max_q / float(j))))
                    break

    if config['run']['use_gym_monitor']:
        env.monitor.close()
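# Minimal sketch of the build_summaries helper assumed by the main() variants
# above: two scalar summaries (episode reward and average max Q) whose variables
# are fed directly through feed_dict, as the training loops do. Written with the
# TF1 compat API; the project's version may organise this differently.
def build_summaries():
    episode_reward = tf.Variable(0.)
    tf.compat.v1.summary.scalar('Reward', episode_reward)
    episode_ave_max_q = tf.Variable(0.)
    tf.compat.v1.summary.scalar('Qmax_Value', episode_ave_max_q)

    summary_vars = [episode_reward, episode_ave_max_q]
    summary_ops = tf.compat.v1.summary.merge_all()
    return summary_ops, summary_vars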