Example #1
    def test_phase(self, i_run, i_episode, writer_test):
        """Run self.eval_episode evaluation episodes, log the mean reward and save the model."""
        total_reward = 0
        ts = time.time()
        for i in range(self.eval_episode):
            old = self.env.reset()
            state_buffer = StateBuffer(self.state_buffer_size, old)
            episode_reward = 0
            done = False
            while not done:
                state = state_buffer.get_state()
                action = self.select_action(state, eval=True)

                next_state, reward, done, _ = self.env.step(action)
                episode_reward += reward

                state_buffer.push(next_state)
            total_reward += episode_reward

        writer_test.add_scalar('reward/test', total_reward / self.eval_episode,
                               i_episode)

        self.logger.info("----------------------------------------")
        self.logger.info(
            f"Test {self.eval_episode} ep.: {i_episode}, mean_r: {round(total_reward / self.eval_episode, 2)}"
            f", time_spent {round(time.time() - ts, 2)}s")
        self.save_model(self.env_name,
                        "./runs/" + self.folder + f"run_{i_run}/", i_episode)
        self.logger.info('Saving models...')
        self.logger.info("----------------------------------------")
Example #2
 def do_one_test(self):
     """Run a single evaluation episode and return its total reward."""
     old = self.env.reset()
     state_buffer = StateBuffer(self.state_buffer_size, old)
     episode_reward = 0
     done = False
     while not done:
         state = state_buffer.get_state()
         action = self.select_action(state, eval=True)
         
         next_state, reward, done, _ = self.env.step(action)
         episode_reward += reward
         
         state_buffer.push(next_state)
     return episode_reward
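
All of the examples build a StateBuffer(self.state_buffer_size, first_observation) and then rely on push(), get_state() and get_tensor(), but the class itself is never shown. A plausible minimal version is a fixed-length frame stack of the most recent observations used as the CNN input; the layout below (stacking NumPy frames along a leading axis, FloatTensor conversion in get_tensor) is an assumption, not the original implementation.

from collections import deque

import numpy as np
import torch


class StateBuffer:
    """Sketch of a frame-stacking buffer holding the last `size` observations."""

    def __init__(self, size, initial_obs):
        # Start full so that get_state() is valid before the first push().
        self.buffer = deque([initial_obs] * size, maxlen=size)

    def push(self, obs):
        # Drop the oldest observation and append the newest one.
        self.buffer.append(obs)

    def get_state(self):
        # Stack the frames along a leading axis, e.g. (size, H, W) for grayscale images.
        return np.stack(self.buffer, axis=0)

    def get_tensor(self):
        # Same stacked state as a float tensor, handy for TensorBoard image logging.
        return torch.FloatTensor(self.get_state())
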
Example #3
 def train(self, num_run=1, restore=False):
     memory = None
     start_episode = 0
     start_updates = 0
     start_run = 0
     start_total_numsteps = 0
     start_running_episode_reward = 0
     start_running_episode_reward_100 = 0
     start_rewards = []
     start_last_episode_steps = 0
     start_episode_reward = 0
     start_episode_steps = 0
     start_timing = 0
     start_total_timing = 0
     
     # Restore Phase
     if restore:
         # TODO: Not tested deeply yet
         with open(self.folder + "memory.pkl", "rb") as pickle_out:
             memory = ReplayMemory(self.replay_size, self.seed)
             memory.load(pickle_out)
         with open(self.folder + "context.json", "r+") as pickle_out:
             (start_episode, start_run, start_updates, start_total_numsteps, start_running_episode_reward,
              start_running_episode_reward_100, start_last_episode_steps, start_episode_reward, start_episode_steps,
              start_timing, start_total_timing) = json.load(pickle_out)
         with open(self.folder + "rewards.pkl", "rb") as pickle_out:
             start_rewards = pickle.load(pickle_out)
         self.restore_model()
         self.logger.important("Load completed!")
     
     in_ts = time.time()
     
     # Start of the iteration on runs
     for i_run in range(start_run, num_run):
         
         # Break the loop if the phase "Save'n'Close" is triggered
         if self.env.is_save_and_close():
             break
         
         self.logger.important(f"START TRAINING RUN {i_run}")
         
         # Set Seed for repeatability
         torch.manual_seed(self.seed + i_run)
         np.random.seed(self.seed + i_run)
         self.env.seed(self.seed + i_run)
         self.env.action_space.np_random.seed(self.seed + i_run)
         
         # Setup TensorboardX
         writer_train = SummaryWriter(log_dir=self.folder + 'run_' + str(i_run) + '/train')
         writer_learn = SummaryWriter(log_dir=self.folder + 'run_' + str(i_run) + '/learn')
         writer_test = SummaryWriter(log_dir=self.folder + 'run_' + str(i_run) + '/test')
         
         # Setup Replay Memory: create a new memory unless restoring
         if not restore:
             memory = ReplayMemory(self.replay_size, self.seed)
         # Create a backup memory for Forget-Phase
         backup_memory = copy.deepcopy(memory)
         
         # TRAINING LOOP
         # All these variables must be backed up and restored
         updates = start_updates
         total_numsteps = start_total_numsteps
         running_episode_reward = start_running_episode_reward
         running_episode_reward_100 = start_running_episode_reward_100
         rewards = list(start_rewards)  # copy, so each run appends to its own reward list
         i_episode = start_episode
         last_episode_steps = start_last_episode_steps
         episode_reward = start_episode_reward
         episode_steps = start_episode_steps
         timing = start_timing
         total_timing = start_total_timing
         updates_episode = 0
         episode_images = list()
         
         '''
             LOOP: Episode
         '''
         while True:
             
             # Stop the robot
             self.env.stop_all_motors()
             
             # Wait for the human to release control
             while self.env.is_human_controlled():
                 pass
             
             # Forget phase (if requested)
             if self.env.is_forget_enabled():
                 i_episode -= 1
                 self.logger.info("Memory size before forget: {}".format(len(memory)))
                 # Restore Nets
                 self.restore_model()
                 self.env.reset_forget()
                 # Restore Memory from the backup taken before the episode started
                 memory = copy.deepcopy(backup_memory)
                 self.logger.info("Memory size after forget: {}".format(len(memory)))
                 self.logger.info("Last Episode Forgotten")
             elif i_episode != start_episode:
                 # LEARNING AND PRINTING PHASE
                 ep_print = i_episode - 1
                 last_episode_steps = episode_steps
                 if self.pics:
                     for i, image in enumerate(episode_images):
                         writer_train.add_image('episode_{}'
                                                .format(str(ep_print)), image.unsqueeze(0),
                                                i)
                 
                 if len(memory) > self.min_replay_size and ep_print > self.warm_up_episodes:
                     updates = self.learning_phase((last_episode_steps // 10) * 10 + 10, memory, updates,
                                                   writer_learn)
                 self.print_nets(writer_train, ep_print)
                 rewards.append(episode_reward)
                 running_episode_reward += (episode_reward - running_episode_reward) / (ep_print + 1)
                 if len(rewards) < 100:
                     running_episode_reward_100 = running_episode_reward
                 else:
                     last_100 = rewards[-100:]
                     running_episode_reward_100 = np.array(last_100).mean()
                 
                 writer_train.add_scalar('reward/train', episode_reward, ep_print)
                 writer_train.add_scalar('reward/steps', last_episode_steps, ep_print)
                 writer_train.add_scalar('reward/running_mean', running_episode_reward, ep_print)
                 writer_train.add_scalar('reward/running_mean_last_100', running_episode_reward_100, ep_print)
                 self.logger.info("Ep. {}/{}, t {}, r_t {}, 100_mean {}, time_spent {}s | {}s "
                                  .format(ep_print, self.num_episode, episode_steps, round(episode_reward, 2),
                                          round(running_episode_reward_100, 2), round(timing, 2),
                                          str(datetime.timedelta(seconds=total_timing))))
             
             # Safety stop: wait while a human takes control (useful during longer training phases)
             while self.env.is_human_controlled():
                 pass
             
             # Test phase (if scheduled)
             if i_episode % self.eval_every == 0 and self.eval and i_episode != 0 and not restore:
                 self.test_phase(writer_test, i_run, updates)
                 # Wait for the human to release control
                 while self.env.is_human_controlled():
                     pass
             
             # TODO: HP Checkpoint and check correctness of checkpoint restoring
             if i_episode % self.eval_every == 0 and i_episode != 0 and not restore:
                 self.logger.important("Saving context...")
                 self.logger.info("To restart from here set this flag: --restore " + self.folder)
                 # Save Replay, net weights, hp, i_episode and i_run
                 with open(self.folder + "memory.pkl", "wb") as pickle_out:
                     memory.dump(pickle_out)
                 with open(self.folder + "context.json", "w+") as pickle_out:
                     json.dump((i_episode, i_run, updates, total_numsteps, running_episode_reward,
                                running_episode_reward_100, last_episode_steps, episode_reward, episode_steps,
                                timing, total_timing), pickle_out)
                 with open(self.folder + "rewards.pkl", "wb") as pickle_out:
                     pickle.dump(rewards, pickle_out)
                 self.backup_model()
                 # Mirror the run folder into a "<folder>_bak" copy
                 bak_folder = self.folder[:-1] + "_bak" + self.folder[-1:]
                 if os.path.exists(bak_folder):
                     shutil.rmtree(bak_folder)
                 self.logger.info("Backup folder: " + bak_folder)
                 shutil.copytree(self.folder, bak_folder)
                 self.logger.important("Save completed!")
             
             # Episode limit for this run reached: start a new run
             if i_episode > self.num_episode:
                 break
             
             # Backup NNs and memory (useful in case of Forget Phase)
             self.backup_model()
             backup_memory = copy.deepcopy(memory)
             
             # Setup the episode
             self.logger.important(f"START EPISODE {i_episode}")
             ts = time.time()
             episode_reward = episode_steps = 0
             done = False
             info = {'undo': False}
             state = self.env.reset()
             state_buffer = None
             
             # If CNNs are used (self.pics), observations are wrapped in a StateBuffer (see doc).
             if self.pics:
                 state_buffer = StateBuffer(self.state_buffer_size, state)
                 state = state_buffer.get_state()
                 episode_images = list()
             updates_episode = 0
             
             # Start of the episode
             while not done:
                 if self.pics:
                     episode_images.append(state_buffer.get_tensor()[0])
                 
                 if i_episode < self.warm_up_episodes or len(memory) < self.min_replay_size:
                     # Warm_up phase -> Completely random choice of an action
                     action = self.env.action_space.sample()
                 else:
                     # Training phase -> Action sampled from policy
                     action = self.select_action(state)
                 
                 assert action.shape == self.env.action_space.shape
                 assert action is not None
                 writer_train.add_histogram('action_speed/episode_{}'
                                            .format(str(i_episode)), torch.tensor(action[0]), episode_steps)
                 writer_train.add_histogram('action_turn/episode_{}'
                                            .format(str(i_episode)), torch.tensor(action[1]), episode_steps)
                 
                 # Make the action
                 next_state, reward, done, info = self.env.step(action)
                 
                 # Save the step
                 if self.pics:
                     state_buffer.push(next_state)
                     next_state = state_buffer.get_state()
                 episode_steps += 1
                 total_numsteps += 1
                 episode_reward += reward
                 mask = 1 if done else float(not done)
                 
                 # Push the transition into the memory only after the first 5 steps of the episode
                 if episode_steps > 5:
                     memory.push(state, action, reward, next_state, mask)
                 state = next_state
             print("Memory {}/{}".format(len(memory), self.replay_size))
             timing = time.time() - ts
             total_timing = time.time() - in_ts
             start_episode = 0
             i_episode += 1
             # Disable restore phase after the restored run
             restore = False
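
Besides the usual push(), sample() and len(), Example #3 also expects the replay buffer to support dump()/load() for checkpointing, forget_last() for the forget phase, and copy.deepcopy() for per-episode backups. None of these are shown above; the class below is a guess at a compatible shape (uniform sampling, pickle-based persistence), not the original ReplayMemory.

import pickle
import random


class ReplayMemory:
    """Sketch of a replay buffer compatible with the calls made in the examples."""

    def __init__(self, capacity, seed=0):
        self.capacity = capacity
        self.buffer = []
        self.rng = random.Random(seed)

    def push(self, state, action, reward, next_state, mask):
        # Append the transition and trim from the front once over capacity (FIFO).
        self.buffer.append((state, action, reward, next_state, mask))
        if len(self.buffer) > self.capacity:
            self.buffer.pop(0)

    def sample(self, batch_size):
        # Uniformly sample a batch and split it into per-field tuples.
        batch = self.rng.sample(self.buffer, batch_size)
        state, action, reward, next_state, mask = zip(*batch)
        return state, action, reward, next_state, mask

    def forget_last(self, n):
        # Drop the n most recently pushed transitions (used by the forget phase).
        if n > 0:
            del self.buffer[-n:]

    def dump(self, file_handle):
        # Persist the transitions for the checkpoint/restore phase.
        pickle.dump(self.buffer, file_handle)

    def load(self, file_handle):
        self.buffer = pickle.load(file_handle)

    def __len__(self):
        return len(self.buffer)
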
Example #4
    def train(self, num_run=1):
        in_ts = time.time()
        for i_run in range(num_run):
            self.logger.important(f"START TRAINING RUN {i_run}")

            # Set Seed for repeatability
            torch.manual_seed(self.seed + i_run)
            np.random.seed(self.seed + i_run)
            self.env.seed(self.seed + i_run)
            self.env.action_space.np_random.seed(self.seed + i_run)

            # Setup TensorboardX
            writer_train = SummaryWriter(log_dir='runs/' + self.folder +
                                         'run_' + str(i_run) + '/train')
            writer_test = SummaryWriter(log_dir='runs/' + self.folder +
                                        'run_' + str(i_run) + '/test')

            # Setup Replay Memory
            memory = ReplayMemory(self.replay_size)

            # TRAINING LOOP
            total_numsteps = updates = running_episode_reward = running_episode_reward_100 = 0
            rewards = []
            i_episode = 0
            last_episode_steps = 0
            while True:
                self.env.stop_all_motors()
                while self.env.is_human_controlled():
                    continue
                if self.env.is_forget_enabled():
                    self.restore_model()
                    memory.forget_last(last_episode_steps)
                    i_episode -= 1
                    self.logger.info("Last Episode Forgotten")
                if self.env.is_test_phase():
                    self.test_phase(i_run, i_episode, writer_test)
                    continue
                if i_episode > self.num_episode:
                    break
                self.backup_model()
                self.logger.important(f"START EPISODE {i_episode}")
                ts = time.time()
                episode_reward = episode_steps = 0
                done = False
                info = {'undo': False}
                state = self.env.reset()
                state_buffer = None
                if self.pics:
                    state_buffer = StateBuffer(self.state_buffer_size, state)
                    state = state_buffer.get_state()

                critic_1_loss_acc = critic_2_loss_acc = policy_loss_acc = ent_loss_acc = alpha_acc = 0

                while not done:
                    if self.pics:
                        writer_train.add_image(
                            'episode_{}'.format(str(i_episode)),
                            state_buffer.get_tensor(), episode_steps)
                    if len(memory) < self.warm_up_steps:
                        action = self.env.action_space.sample()
                    else:
                        action = self.select_action(
                            state)  # Sample action from policy
                        if len(memory) > self.batch_size:
                            # Number of updates per step in environment
                            for i in range(self.updates_per_step):
                                # Update parameters of all the networks
                                critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = self.update_parameters(
                                    memory, self.batch_size, updates)

                                critic_1_loss_acc += critic_1_loss
                                critic_2_loss_acc += critic_2_loss
                                policy_loss_acc += policy_loss
                                ent_loss_acc += ent_loss
                                alpha_acc += alpha
                                updates += 1

                    next_state, reward, done, info = self.env.step(
                        action)  # Step
                    if self.pics:
                        state_buffer.push(next_state)
                        next_state = state_buffer.get_state()
                    episode_steps += 1
                    total_numsteps += 1
                    episode_reward += reward
                    mask = 1 if done else float(not done)
                    memory.push(state, action, reward, next_state,
                                mask)  # Append transition to memory

                    state = next_state
                last_episode_steps = episode_steps
                i_episode += 1

                rewards.append(episode_reward)
                running_episode_reward += (episode_reward -
                                           running_episode_reward) / i_episode
                if len(rewards) < 100:
                    running_episode_reward_100 = running_episode_reward
                else:
                    last_100 = rewards[-100:]
                    running_episode_reward_100 = np.array(last_100).mean()
                writer_train.add_scalar('loss/critic_1',
                                        critic_1_loss_acc / episode_steps,
                                        i_episode)
                writer_train.add_scalar('loss/critic_2',
                                        critic_2_loss_acc / episode_steps,
                                        i_episode)
                writer_train.add_scalar('loss/policy',
                                        policy_loss_acc / episode_steps,
                                        i_episode)
                writer_train.add_scalar('loss/entropy_loss',
                                        ent_loss_acc / episode_steps,
                                        i_episode)
                writer_train.add_scalar('entropy_temperature/alpha',
                                        alpha_acc / episode_steps, i_episode)
                writer_train.add_scalar('reward/train', episode_reward,
                                        i_episode)
                writer_train.add_scalar('reward/running_mean',
                                        running_episode_reward, i_episode)
                writer_train.add_scalar('reward/running_mean_last_100',
                                        running_episode_reward_100, i_episode)
                self.logger.info(
                    "Ep. {}/{}, t {}, r_t {}, 100_mean {}, time_spent {}s | {}s "
                    .format(
                        i_episode, self.num_episode, episode_steps,
                        round(episode_reward, 2),
                        round(running_episode_reward_100, 2),
                        round(time.time() - ts, 2),
                        str(datetime.timedelta(seconds=time.time() - in_ts))))
            self.env.close()
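
The training loops also rely on three model-persistence helpers that never appear in the snippets: backup_model() is called before every episode, restore_model() during the forget and restore phases, and save_model(env_name, path, i_episode) after a test phase. A typical PyTorch shape for them, assuming the agent keeps its networks in self.policy and self.critic, could look like the sketch below; the file names and attribute names are illustrative only.

    def save_model(self, env_name, path, i_episode):
        # Persist policy and critic weights for a given episode under `path`.
        os.makedirs(path, exist_ok=True)
        torch.save(self.policy.state_dict(),
                   os.path.join(path, "sac_policy_{}_{}.pth".format(env_name, i_episode)))
        torch.save(self.critic.state_dict(),
                   os.path.join(path, "sac_critic_{}_{}.pth".format(env_name, i_episode)))

    def backup_model(self):
        # Snapshot the current weights inside the run folder (used by the forget phase
        # and copied along with the rest of the checkpoint).
        torch.save(self.policy.state_dict(), self.folder + "policy_backup.pth")
        torch.save(self.critic.state_dict(), self.folder + "critic_backup.pth")

    def restore_model(self):
        # Roll the networks back to the last snapshot.
        self.policy.load_state_dict(torch.load(self.folder + "policy_backup.pth"))
        self.critic.load_state_dict(torch.load(self.folder + "critic_backup.pth"))
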
Example #5
            critic_1_loss_acc = critic_2_loss_acc = policy_loss_acc = ent_loss_acc = alpha_acc = 0

            while not done:
                # if cnn:
                #     writer_train.add_images('episode_{}'.format(str(i_episode)), state_buffer.get_tensor(), episode_steps)
                if i_episode < args.warm_up_episode:
                    action = env.action_space.sample()  # Sample random action
                else:
                    action = agent.select_action(
                        state)  # Sample action from policy

                next_state, reward, done, _ = env.step(action)  # Step
                env.render()
                if cnn:
                    state_buffer.push(next_state)
                    next_state = state_buffer.get_state()
                episode_steps += 1
                total_numsteps += 1
                episode_reward += reward
                # Ignore the "done" signal if it comes from hitting the time horizon.
                # (https://github.com/openai/spinningup/blob/master/spinup/algos/sac/sac.py)
                mask = 1 if done else float(not done)
                memory.push(state, action, reward, next_state,
                            mask)  # Append transition to memory

                state = next_state
            if len(memory) > args.batch_size and i_episode > args.warm_up_episode:
                # Number of updates per step in environment
                # Update parameters of all the networks
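                # NOTE: the original example is truncated here. A presumed continuation,
                # modelled on the update loop in Example #4, would run a batch of SAC
                # updates and accumulate the losses for the per-episode summaries
                # (agent, args.updates_per_step and the update count are assumptions):
                for _ in range(episode_steps * args.updates_per_step):
                    critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(
                        memory, args.batch_size, updates)
                    critic_1_loss_acc += critic_1_loss
                    critic_2_loss_acc += critic_2_loss
                    policy_loss_acc += policy_loss
                    ent_loss_acc += ent_loss
                    alpha_acc += alpha
                    updates += 1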