import numpy as np
import torch

# Memory, ActorCritic and load_checkpoint are defined in the project's other modules.


class ActorCriticAgent:
    """Advantage Actor-Critic (A2C) agent."""

    def __init__(self, num_actions, checkpoint=None):
        self.network, self.trainable_parameters = self.init_network(num_actions)
        self.optimizer = torch.optim.Adam(self.trainable_parameters, lr=1e-4)
        self.memory = Memory()
        if checkpoint is not None:
            load_checkpoint(self.network, self.optimizer, checkpoint)

    def init_network(self, num_actions):
        network = {'actor_critic': ActorCritic(num_actions)}
        trainable_parameters = list(network['actor_critic'].parameters())
        return network, trainable_parameters

    def play(self, environment, max_games=1, max_steps=500, train=False,
             verbose=False, recorder=None):
        n_steps = 0
        n_games = 0
        current_game_infos = {'game': n_games + 1, 'reward': 0, 'game_duration': 0}
        observation = environment.reset()
        if recorder is not None:
            recorder.reset()
            recorder.record(environment)

        while (n_steps < max_steps) and (n_games < max_games):
            self.init_rollout(observation)

            # Collect a rollout of at most 20 environment steps.
            for rollout_step in range(20):
                value, log_policy, action = self.network['actor_critic'](observation)
                self.memory.append({'value': value,
                                    'log_policy': log_policy,
                                    'action': action})
                observation, extrinsic_reward, is_game_over, infos = environment.step(
                    action.numpy()[0])
                if recorder is not None:
                    recorder.record(environment)

                reward = self.get_reward(observation, extrinsic_reward)
                self.memory.append({'reward': reward})

                current_game_infos['reward'] += reward
                current_game_infos['game_duration'] += 1
                n_steps += 1

                if is_game_over:
                    n_games += 1
                    print(current_game_infos)
                    current_game_infos = {'game': n_games + 1, 'reward': 0,
                                          'game_duration': 0}
                    observation = environment.reset()
                    break

            self.end_rollout(observation, is_game_over)
            if verbose:
                print(current_game_infos)
            if train:
                loss = self.compute_loss()
                self.backpropagate(loss)

        if recorder is not None:
            recorder.stop()

    def init_rollout(self, observation):
        self.memory.reset()
        self.network['actor_critic'].detach_internal_state()

    def end_rollout(self, observation, is_game_over):
        # Bootstrap the return with the critic's value of the last observation,
        # or with 0 if the episode ended during the rollout.
        if is_game_over:
            next_value = torch.Tensor([[0]])
            self.network['actor_critic'].reset_internal_state()
        else:
            next_value = self.network['actor_critic'](observation)[0].detach()
        self.memory.append({'value': next_value})

    def get_reward(self, observation, extrinsic_reward):
        return np.clip(extrinsic_reward, -1, 1)

    def compute_loss(self):
        loss = self.network['actor_critic'].loss(self.memory)
        return loss

    def backpropagate(self, loss, max_gradient_norm=40):
        self.optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.trainable_parameters, max_gradient_norm)
        self.optimizer.step()
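The rollout loss itself is delegated to ActorCritic.loss, which is not shown here. For reference, below is a minimal sketch of what an advantage actor-critic loss over such a rollout typically computes. The function name a2c_loss, the assumption that Memory exposes per-step values, log-policies, actions, and rewards as lists, and the coefficient values are illustrative assumptions, not the project's actual implementation.

# Hypothetical sketch of an A2C rollout loss (not the project's ActorCritic.loss).
import torch

def a2c_loss(values, log_policies, actions, rewards, next_value,
             gamma=0.99, value_coef=0.5, entropy_coef=0.01):
    """values: list of (1, 1) critic outputs; log_policies: list of (1, num_actions)
    log-probabilities; actions: list of integer index tensors with one element;
    rewards: list of floats; next_value: bootstrap value for the state after the
    rollout (zero if the episode ended)."""
    # n-step bootstrapped returns, computed backwards from the bootstrap value.
    returns = []
    R = next_value
    for reward in reversed(rewards):
        R = reward + gamma * R
        returns.insert(0, R)

    policy_loss, value_loss, entropy = 0.0, 0.0, 0.0
    for value, log_policy, action, R in zip(values, log_policies, actions, returns):
        advantage = R - value                         # critic's estimation error
        value_loss = value_loss + advantage.pow(2)
        log_prob = log_policy.gather(1, action.view(1, 1))
        policy_loss = policy_loss - log_prob * advantage.detach()
        entropy = entropy - (log_policy.exp() * log_policy).sum()

    # Entropy is subtracted so that maximizing it encourages exploration.
    return policy_loss + value_coef * value_loss - entropy_coef * entropy

Under these assumptions, compute_loss would reduce to calling a2c_loss with the tensors accumulated during the rollout plus the bootstrap value appended by end_rollout.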
# This training loop additionally relies on time, rospy, numpy (np),
# collections.deque, and the project's Memory buffer and preprocess helper
# being imported at module level.

def run(self):
    total_time = time.time()
    steps = 0
    try:
        save_interval = 0
        memory = Memory()
        do_training = True
        while do_training:
            episode_time = time.time()
            # Randomize the robot's spawn pose at the start of each episode.
            self.env.initial_position = {
                'p_x': np.random.uniform(1, 4), 'p_y': 3.7, 'p_z': 0.05,
                'o_x': 0, 'o_y': 0.0,
                'o_z': np.random.uniform(0.4, 1), 'o_w': 0.855
            }
            state = self.env.reset()
            state = preprocess(state, self.img_y_offset,
                               self.img_x_scale, self.img_y_scale)
            # state = np.expand_dims(state, axis=0)
            done = False
            cumulated_reward = 0

            # Initialize both frame stacks with copies of the first frame.
            stacked_states = deque(maxlen=self.n_frames)
            stacked_next_states = deque(maxlen=self.n_frames)
            for i in range(self.n_frames):
                stacked_states.append(state)
                stacked_next_states.append(state)

            episode_steps = 0
            while not done:
                steps += 1
                episode_steps += 1
                action = self.agent.act(
                    np.expand_dims(np.stack(stacked_states, axis=2), axis=0))
                next_state, reward, done, _ = self.env.step(action)
                next_state = preprocess(next_state, self.img_y_offset,
                                        self.img_x_scale, self.img_y_scale)
                stacked_next_states.append(next_state)
                memory.append(action, np.stack(stacked_states, axis=2),
                              np.stack(stacked_next_states, axis=2),
                              reward, done)
                stacked_states.append(next_state)
                cumulated_reward += reward

                save_interval += 1
                if save_interval >= self.sample_batch_size:
                    # Train on the collected batch, then start a fresh buffer.
                    save_interval = 0
                    replay_time = time.time()
                    self.agent.replay(memory)
                    rospy.loginfo("Replay time {}".format(
                        time.time() - replay_time))
                    if steps >= self.sample_batch_size * 200:
                        do_training = False
                    memory = Memory()

            if self.highest_reward < cumulated_reward:
                self.highest_reward = cumulated_reward
            rospy.loginfo("total episode_steps {}, reward {}/{}".format(
                episode_steps, cumulated_reward, self.highest_reward))
            rospy.loginfo("Episode time {}, total {}".format(
                self.format_time(episode_time), self.format_time(total_time)))
            rospy.loginfo("exploration_rate {}".format(
                self.agent.exploration_rate))
    finally:
        self.env.close()
        rospy.loginfo("Total time: {}".format(self.format_time(total_time)))
        rospy.loginfo("Total steps: {}".format(steps))
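The preprocess helper is also defined elsewhere in the project; the loop only assumes it turns a raw camera frame into a single cropped, rescaled channel so that n_frames of them can be stacked along axis 2. A minimal sketch under those assumptions follows; the cv2 calls, the grayscale conversion, and the [0, 1] normalization are illustrative choices, not the project's actual code.

# Hypothetical sketch of a frame-preprocessing helper (not the project's preprocess).
import cv2
import numpy as np

def preprocess(frame, y_offset, x_scale, y_scale):
    cropped = frame[y_offset:, :, :]                      # drop the top rows of the image
    resized = cv2.resize(cropped, None, fx=x_scale, fy=y_scale)
    gray = cv2.cvtColor(resized, cv2.COLOR_RGB2GRAY)      # single channel for stacking
    return gray.astype(np.float32) / 255.0                # normalize to [0, 1]

With n_frames = 4 and, say, 64x64 preprocessed frames, np.stack(stacked_states, axis=2) yields a (64, 64, 4) array, and np.expand_dims(..., axis=0) adds the batch dimension that agent.act expects.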