Example no. 1
    def explore_and_collect(self, episode):
        tqdm.write("Collect new data:")
        reward = 0
        # Data collection
        with torch.no_grad():
            done = False
            observation, total_reward = self.env.reset(), 0
            belief, posterior_state, action = torch.zeros(1, self.parms.belief_size, device=self.parms.device), torch.zeros(1, self.parms.state_size, device=self.parms.device), torch.zeros(1, self.env.action_size, device=self.parms.device)
            t = 0
            real_rew = []
            predicted_rew = [] 
            total_steps = self.parms.max_episode_length // self.env.action_repeat
            explore = True

            for t in tqdm(range(total_steps)):
                # Here we need to explore
                belief, posterior_state, action, next_observation, reward, done, pred_next_rew = self.update_belief_and_act(self.env, belief, posterior_state, action, observation.to(device=self.parms.device), [reward], self.env.action_range[0], self.env.action_range[1], explore=explore)
                self.D.append(observation, action.cpu(), reward, done)
                real_rew.append(reward)
                predicted_rew.append(pred_next_rew.to(device=self.parms.device).item())
                total_reward += reward
                observation = next_observation
                if self.parms.flag_render:
                    self.env.render()
                if done:
                    break

        # Update and plot train reward metrics
        self.metrics['steps'].append( (t * self.env.action_repeat) + self.metrics['steps'][-1])
        self.metrics['episodes'].append(episode)
        self.metrics['train_rewards'].append(total_reward)
        self.metrics['predicted_rewards'].append(np.array(predicted_rew).sum())

        lineplot(self.metrics['episodes'][-len(self.metrics['train_rewards']):], self.metrics['train_rewards'], 'train_rewards', self.statistics_path)
        double_lineplot(self.metrics['episodes'], self.metrics['train_rewards'], self.metrics['predicted_rewards'], "train_r_vs_pr", self.statistics_path)
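
For context, a minimal sketch of what an update_belief_and_act helper along these lines typically does in a PlaNet-style agent: encode the observation, take a one-step posterior through the transition model, ask the planner for the next action, optionally add exploration noise, and step the environment. The signatures, the planner call and the noise scheme below are assumptions for illustration, not this repository's implementation (which also returns the reward model's predicted reward).

import torch

def update_belief_and_act_sketch(env, planner, transition_model, encoder,
                                 belief, posterior_state, action, observation,
                                 min_action, max_action, explore=False,
                                 action_noise=0.3):
    # One-step posterior update: the transition model takes the previous state,
    # the previous action and the encoded current observation (sequence length 1).
    belief, _, _, _, posterior_state, _, _ = transition_model(
        posterior_state, action.unsqueeze(dim=0), belief,
        encoder(observation).unsqueeze(dim=0))
    belief, posterior_state = belief.squeeze(dim=0), posterior_state.squeeze(dim=0)
    # Query the planner (e.g. CEM over imagined rollouts) for the next action.
    action = planner(belief, posterior_state)
    if explore:
        # Gaussian exploration noise, clipped to the valid action range.
        action = torch.clamp(action + action_noise * torch.randn_like(action),
                             min_action, max_action)
    next_observation, reward, done = env.step(action[0].cpu())
    return belief, posterior_state, action, next_observation, reward, done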
Example no. 2
    def test_model(self, episode=None):  # no exploration here
        if episode is None:
            episode = self.tested_episodes


        # Set models to eval mode
        self.transition_model.eval()
        self.observation_model.eval()
        self.reward_model.eval()
        self.encoder.eval()
        
        # Initialise parallelised test environments
        test_envs = EnvBatcher(ControlSuiteEnv, (self.parms.env_name, self.parms.seed, self.parms.max_episode_length, self.parms.bit_depth), {}, self.parms.test_episodes)
        total_steps = self.parms.max_episode_length // test_envs.action_repeat
        rewards = np.zeros(self.parms.test_episodes)
        
        real_rew = torch.zeros([total_steps,self.parms.test_episodes])
        predicted_rew = torch.zeros([total_steps,self.parms.test_episodes])

        with torch.no_grad():
            observation, total_rewards, video_frames = test_envs.reset(), np.zeros((self.parms.test_episodes, )), []            
            belief, posterior_state, action = torch.zeros(self.parms.test_episodes, self.parms.belief_size, device=self.parms.device), torch.zeros(self.parms.test_episodes, self.parms.state_size, device=self.parms.device), torch.zeros(self.parms.test_episodes, self.env.action_size, device=self.parms.device)
            tqdm.write("Testing model.")
            for t in range(total_steps):     
                belief, posterior_state, action, next_observation, rewards, done, pred_next_rew  = self.update_belief_and_act(test_envs,  belief, posterior_state, action, observation.to(device=self.parms.device), list(rewards), self.env.action_range[0], self.env.action_range[1])
                total_rewards += rewards.numpy()
                real_rew[t] = rewards
                predicted_rew[t]  = pred_next_rew

                observation = self.env.get_original_frame().unsqueeze(dim=0)

                video_frames.append(make_grid(torch.cat([observation, self.observation_model(belief, posterior_state).cpu()], dim=3) + 0.5, nrow=5).numpy())  # Decentre
                observation = next_observation
                if done.sum().item() == self.parms.test_episodes:
                    break
            
        real_rew = torch.transpose(real_rew, 0, 1)
        predicted_rew = torch.transpose(predicted_rew, 0, 1)
        
        #save and plot metrics 
        self.tested_episodes += 1
        self.metrics['test_episodes'].append(episode)
        self.metrics['test_rewards'].append(total_rewards.tolist())

        lineplot(self.metrics['test_episodes'], self.metrics['test_rewards'], 'test_rewards', self.statistics_path)
        
        write_video(video_frames, 'test_episode_%s' % str(episode), self.video_path)  # Lossy compression
        # Set models to train mode
        self.transition_model.train()
        self.observation_model.train()
        self.reward_model.train()
        self.encoder.train()
        # Close test environments
        test_envs.close()
        return self.metrics
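
For reference, a rough sketch of how a batched test-environment wrapper such as the EnvBatcher used above can be structured: hold n independent environments, stack their observations, and mask the rewards of episodes that have already finished. The class below is an assumption for illustration (it presumes each sub-environment returns tensor observations with a leading batch dimension of 1), not the actual EnvBatcher.

import torch

class EnvBatcherSketch:
    def __init__(self, env_class, env_args, env_kwargs, n):
        self.n = n
        self.envs = [env_class(*env_args, **env_kwargs) for _ in range(n)]
        self.dones = [True] * n

    def reset(self):
        observations = [env.reset() for env in self.envs]
        self.dones = [False] * self.n
        return torch.cat(observations)

    def step(self, actions):
        # Step every sub-environment; freeze rewards of episodes that ended earlier.
        results = [env.step(action) for env, action in zip(self.envs, actions)]
        observations, rewards, dones = zip(*results)
        rewards = [0.0 if done_before else r
                   for r, done_before in zip(rewards, self.dones)]
        self.dones = [d or done_before for d, done_before in zip(dones, self.dones)]
        return torch.cat(observations), torch.tensor(rewards), torch.tensor(self.dones)

    def close(self):
        for env in self.envs:
            env.close()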
Example no. 3
    def fit_buffer(self, episode):
        #####
        # Fit the models on sequence chunks sampled from the replay buffer
        #####

        # Model fitting
        losses = []
        tqdm.write("Fitting buffer")
        for s in tqdm(range(self.parms.collect_interval)):

            # Draw sequence chunks {(o_t, a_t, r_t+1, terminal_t+1)} ~ D uniformly at random from the dataset (including terminal flags)
            observations, actions, rewards, nonterminals = self.D.sample(self.parms.batch_size, self.parms.chunk_size)  # Transitions start at time t = 0
            # Create initial belief and state for time t = 0
            init_belief, init_state = torch.zeros(self.parms.batch_size, self.parms.belief_size, device=self.parms.device), torch.zeros(self.parms.batch_size, self.parms.state_size, device=self.parms.device)
            encoded_obs = bottle(self.encoder, (observations[1:], ))

            # Update belief/state using posterior from previous belief/state, previous action and current observation (over entire sequence at once)
            beliefs, prior_states, prior_means, prior_std_devs, posterior_states, posterior_means, posterior_std_devs = self.transition_model(init_state, actions[:-1], init_belief, encoded_obs, nonterminals[:-1])
            
            # Calculate observation likelihood, reward likelihood and KL losses (for t = 0 only for latent overshooting); sum over final dims, average over batch and time (original implementation, though paper seems to miss 1/T scaling?)
            # LOSS
            observation_loss = F.mse_loss(bottle(self.observation_model, (beliefs, posterior_states)), observations[1:], reduction='none').sum((2, 3, 4)).mean(dim=(0, 1))
            kl_loss = torch.max(kl_divergence(Normal(posterior_means, posterior_std_devs), Normal(prior_means, prior_std_devs)).sum(dim=2), self.free_nats).mean(dim=(0, 1))  
            reward_loss = F.mse_loss(bottle(self.reward_model, (beliefs, posterior_states)), rewards[:-1], reduction='none').mean(dim=(0, 1))            

            # Update model parameters
            self.optimiser.zero_grad()

            (observation_loss + reward_loss + kl_loss).backward() # BACKPROPAGATION
            nn.utils.clip_grad_norm_(self.param_list, self.parms.grad_clip_norm, norm_type=2)
            self.optimiser.step()
            # Store (0) observation loss (1) reward loss (2) KL loss
            losses.append([observation_loss.item(), reward_loss.item(), kl_loss.item()])#, regularizer_loss.item()])

        #save statistics and plot them
        losses = tuple(zip(*losses))  
        self.metrics['observation_loss'].append(losses[0])
        self.metrics['reward_loss'].append(losses[1])
        self.metrics['kl_loss'].append(losses[2])
      
        lineplot(self.metrics['episodes'][-len(self.metrics['observation_loss']):], self.metrics['observation_loss'], 'observation_loss', self.statistics_path)
        lineplot(self.metrics['episodes'][-len(self.metrics['reward_loss']):], self.metrics['reward_loss'], 'reward_loss', self.statistics_path)
        lineplot(self.metrics['episodes'][-len(self.metrics['kl_loss']):], self.metrics['kl_loss'], 'kl_loss', self.statistics_path)
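
As a standalone illustration of the free-nats clamp used in the KL term above (the model is not penalised for the first few nats of divergence between posterior and prior), with made-up tensor shapes and an example allowance of 3 nats:

import torch
from torch.distributions import Normal, kl_divergence

T, B, Z = 10, 4, 8                      # chunk length, batch size, latent size
free_nats = torch.full((1,), 3.0)       # example allowance of 3 nats
posterior = Normal(torch.randn(T, B, Z), torch.rand(T, B, Z) + 0.1)
prior = Normal(torch.zeros(T, B, Z), torch.ones(T, B, Z))

kl = kl_divergence(posterior, prior).sum(dim=2)      # (T, B): nats per timestep
kl_loss = torch.max(kl, free_nats).mean(dim=(0, 1))  # clamp from below, then average
print(kl_loss.item())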
Example no. 4
        (observation_loss + reward_loss + kl_loss).backward()
        nn.utils.clip_grad_norm_(param_list, args.grad_clip_norm, norm_type=2)
        optimiser.step()
        transition_model.rnn.memory = None
        # Store (0) observation loss (1) reward loss (2) KL loss
        losses.append(
            [observation_loss.item(),
             reward_loss.item(),
             kl_loss.item()])

    # Update and plot loss metrics
    losses = tuple(zip(*losses))
    metrics['observation_loss'].append(losses[0])
    metrics['reward_loss'].append(losses[1])
    metrics['kl_loss'].append(losses[2])
    lineplot(metrics['episodes'][-len(metrics['observation_loss']):],
             metrics['observation_loss'], 'observation_loss', results_dir)
    lineplot(metrics['episodes'][-len(metrics['reward_loss']):],
             metrics['reward_loss'], 'reward_loss', results_dir)
    lineplot(metrics['episodes'][-len(metrics['kl_loss']):],
             metrics['kl_loss'], 'kl_loss', results_dir)

    print("Starting data collection")
    # Data collection
    with torch.no_grad():
        observation, total_reward = env.reset(), 0
        belief, posterior_state, action = torch.zeros(
            1, args.belief_size, device=args.device), torch.zeros(
                1, args.state_size, device=args.device), torch.zeros(
                    1, env.action_size, device=args.device)
Example no. 5
                                    ]),
                                    policy_trajectories['log_prob_actions'].
                                    exp())  # TODO: Implement terminal masking?

                # Perform PPO updates
                for epoch in tqdm(range(args.ppo_epochs), leave=False):
                    ppo_update(agent, policy_trajectories, agent_optimiser,
                               args.ppo_clip, epoch, args.value_loss_coeff,
                               args.entropy_loss_coeff)

    # Evaluate agent and plot metrics
    if step % args.evaluation_interval == 0:
        metrics['test_steps'].append(step)
        metrics['test_returns'].append(
            evaluate_agent(agent, args.evaluation_episodes, seed=args.seed))
        lineplot(metrics['test_steps'], metrics['test_returns'],
                 'test_returns')
        if args.imitation != 'BC':
            lineplot(metrics['train_steps'], metrics['train_returns'],
                     'train_returns')

if args.save_trajectories:
    # Store trajectories from agent after training
    _, trajectories = evaluate_agent(agent,
                                     args.evaluation_episodes,
                                     return_trajectories=True,
                                     seed=args.seed)
    torch.save(trajectories, os.path.join('results', 'trajectories.pth'))

# Save agent and metrics
torch.save(agent.state_dict(), os.path.join('results', 'agent.pth'))
if args.imitation in ['AIRL', 'GAIL']:
Example no. 6
#####################################################

transition_model.eval()
decoder.eval()
encoder.eval()

## If testing, run planning
test_env = Env(args.env,
               args.seed,
               args.max_episode_length,
               args.action_repeat,
               problem=args.task_name)
num_trials = 200
with torch.no_grad():
    newpath = os.path.join(results_dir, "planning")
    os.makedirs(newpath, exist_ok=True)
    rewards = []
    for tr in range(num_trials):
        trpath = os.path.join(newpath, "planning_" + str(tr))
        os.makedirs(trpath, exist_ok=True)

        ## If running planning trials
        success = run_trial(test_env, encoder, transition_model, decoder,
                            trpath, args)
        rewards.append(success)
        print("******", np.mean(rewards))
    metrics["test_rewards"].append(np.mean(rewards))
    test_env.close()
lineplot(metrics['teststeps'][-len(metrics['test_rewards']):],
         metrics['test_rewards'], 'test_rewards', results_dir)
Example no. 7
    ## Logging Models/Images
    if (s % args.checkpoint_interval == 0):
        newpath = os.path.join(results_dir, str(s))
        os.makedirs(newpath, exist_ok=True)
        metrics['teststeps'].append(s)
        video_frames = []

        for p in range(predlen + 1):
            video_frames.append(
                make_grid(torch.cat([
                    observations[p, :5, :3, :, :].cpu().detach(),
                    observations[0, :5, 3:, :, :].cpu().detach(),
                    target[p, :5, :, :, :].cpu().detach(),
                    target_preds_logging[p, :5, :, :, :].cpu().detach(),
                ],
                                    dim=3),
                          nrow=1).numpy() / 255.0)

        write_video(video_frames, 'train_step%s' % s,
                    newpath)  # Lossy compression
        lineplot(metrics['trainsteps'][-len(metrics['observation_loss']):],
                 metrics['observation_loss'], 'observation_loss', results_dir)

        torch.save(
            {
                'transition_model': transition_model.state_dict(),
                'decoder': decoder.state_dict(),
                'encoder': encoder.state_dict(),
                'optimiser': optimiser.state_dict()
            }, os.path.join(results_dir, 'models_%d.pth' % s))
Example no. 8
                       torch.cat(overshooting_vars[6], dim=1)),
                Normal(prior_means, prior_std_devs)
            ) * seq_mask).sum(dim=2), free_nats).mean(dim=(0, 1)) * (
                args.chunk_size
                - 1
            )  # Update KL loss (compensating for extra average over each overshooting/open loop sequence)

        # Apply linearly ramping learning rate schedule
        if args.learning_rate_schedule != 0:
            for group in optimiser.param_groups:
                group['lr'] = min(
                    group['lr'] +
                    args.learning_rate / args.learning_rate_schedule,
                    args.learning_rate)
        # Update model parameters
        optimiser.zero_grad()
        (observation_loss + kl_loss).backward()
        nn.utils.clip_grad_norm_(param_list, args.grad_clip_norm, norm_type=2)
        optimiser.step()
        # Store (0) observation loss (1) KL loss
        losses.append([observation_loss.item(), kl_loss.item()])

    # Update and plot loss metrics
    losses = tuple(zip(*losses))
    metrics['observation_loss'].append(losses[0])
    metrics['kl_loss'].append(losses[1])
    lineplot(metrics['episodes'][-len(metrics['observation_loss']):],
             metrics['observation_loss'], 'observation_loss', results_dir)
    lineplot(metrics['episodes'][-len(metrics['kl_loss']):],
             metrics['kl_loss'], 'kl_loss', results_dir)
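
The linearly ramping learning-rate schedule applied above can be illustrated in isolation: starting from 0, the optimiser's lr grows by learning_rate / learning_rate_schedule per update until it saturates at learning_rate. The model and the numbers below are placeholders.

import torch

model = torch.nn.Linear(4, 4)
learning_rate, learning_rate_schedule = 1e-3, 100   # reach 1e-3 after 100 updates
optimiser = torch.optim.Adam(model.parameters(), lr=0.0)

for update in range(200):
    for group in optimiser.param_groups:
        group['lr'] = min(group['lr'] + learning_rate / learning_rate_schedule,
                          learning_rate)
    # ... loss.backward() and optimiser.step() would go here ...

assert abs(optimiser.param_groups[0]['lr'] - learning_rate) < 1e-12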
Example no. 9
def train(config_filepath, save_dir, device, visualize_interval):
    conf = load_toml_config(config_filepath)
    data_dir, log_dir = create_save_dir(save_dir)
    # Save config file
    shutil.copyfile(config_filepath,
                    os.path.join(save_dir, os.path.basename(config_filepath)))
    device = torch.device(device)

    # Set up log metrics
    metrics = {
        'episode': [],
        'episodic_step': [],
        'collected_total_samples': [],
        'reward': [],
        'q_loss': [],
        'policy_loss': [],
        'alpha_loss': [],
        'alpha': [],
        'policy_switch_epoch': [],
        'policy_switch_sample': [],
        'test_episode': [],
        'test_reward': [],
    }

    policy_switch_samples = conf.policy_switch_samples if hasattr(
        conf, "policy_switch_samples") else None
    total_collected_samples = 0

    # Create environment
    env = make_env(conf.environment, render=False)

    # Instantiate modules
    # memory = ReplayBuffer(int(conf.replay_buffer_capacity), env.observation_space.shape, env.action_space.shape)
    memory = ReplayMemory(conf.replay_buffer_capacity)
    agent = getattr(agents, conf.agent_type)(env.observation_space,
                                             env.action_space,
                                             device=device,
                                             **conf.agent)

    # Load checkpoint if specified in config
    if conf.checkpoint != '':
        ckpt = torch.load(conf.checkpoint, map_location=device)
        metrics = ckpt['metrics']
        agent.load_state_dict(ckpt['agent'])
        memory.load_state_dict(ckpt['memory'])
        policy_switch_samples = ckpt['policy_switch_samples']
        total_collected_samples = ckpt['total_collected_samples']

    def save_checkpoint():
        # Save checkpoint
        ckpt = {
            'metrics': metrics,
            'agent': agent.state_dict(),
            'memory': memory.state_dict(),
            'policy_switch_samples': policy_switch_samples,
            'total_collected_samples': total_collected_samples
        }
        path = os.path.join(data_dir, 'checkpoint.pth')
        torch.save(ckpt, path)

        # Save agent model only
        model_ckpt = {'agent': agent.state_dict()}
        model_path = os.path.join(data_dir, 'model.pth')
        torch.save(model_ckpt, model_path)

        # Save metrics only
        metrics_ckpt = {'metrics': metrics}
        metrics_path = os.path.join(data_dir, 'metrics.pth')
        torch.save(metrics_ckpt, metrics_path)

    # Train agent
    init_episode = 0 if len(
        metrics['episode']) == 0 else metrics['episode'][-1] + 1
    pbar = tqdm.tqdm(range(init_episode, conf.episodes))
    reward_moving_avg = None
    agent_update_count = 0
    for episode in pbar:
        episodic_reward = 0
        o = env.reset()
        q1_loss, q2_loss, policy_loss, alpha_loss, alpha = None, None, None, None, None

        for t in range(conf.horizon):
            if total_collected_samples <= conf.random_sample_num:  # Select random actions at the beginning of training.
                h = env.action_space.sample()
            elif memory.step <= conf.random_sample_num:  # Select actions from a random latent variable right after inserting a new subpolicy.
                h = agent.select_action(o, random=True)
            else:
                h = agent.select_action(o)

            a = agent.post_process_action(
                o, h)  # Convert abstract action h to actual action a

            o_next, r, done, _ = env.step(a)
            total_collected_samples += 1
            episodic_reward += r
            memory.push(o, h, r, o_next, done)
            o = o_next

            if memory.step > conf.random_sample_num:
                # Update agent
                batch_data = memory.sample(conf.agent_update_batch_size)
                q1_loss, q2_loss, policy_loss, alpha_loss, alpha = agent.update_parameters(
                    batch_data, agent_update_count)
                agent_update_count += 1

            if done:
                break

        # Describe and save episodic metrics
        reward_moving_avg = episodic_reward if reward_moving_avg is None else (
            1. - MOVING_AVG_COEF
        ) * reward_moving_avg + MOVING_AVG_COEF * episodic_reward
        pbar.set_description(
            "EPISODE {} (total samples {}, subpolicy samples {}) --- Step {}, Reward {:.1f} (avg {:.1f})"
            .format(episode, total_collected_samples, memory.step, t,
                    episodic_reward, reward_moving_avg))
        metrics['episode'].append(episode)
        metrics['reward'].append(episodic_reward)
        metrics['episodic_step'].append(t)
        metrics['collected_total_samples'].append(total_collected_samples)
        if episode % visualize_interval == 0:
            # Visualize metrics
            lineplot(metrics['episode'][-len(metrics['reward']):],
                     metrics['reward'], 'REWARD', log_dir)
            reward_avg = np.array(metrics['reward']) / np.array(
                metrics['episodic_step'])
            lineplot(metrics['episode'][-len(reward_avg):], reward_avg,
                     'AVG_REWARD', log_dir)
            lineplot(
                metrics['collected_total_samples'][-len(metrics['reward']):],
                metrics['reward'],
                'SAMPLE-REWARD',
                log_dir,
                xaxis='sample')

        # Save metrics for agent update
        if q1_loss is not None:
            metrics['q_loss'].append(np.mean([q1_loss, q2_loss]))
            metrics['policy_loss'].append(policy_loss)
            metrics['alpha_loss'].append(alpha_loss)
            metrics['alpha'].append(alpha)
            if episode % visualize_interval == 0:
                lineplot(metrics['episode'][-len(metrics['q_loss']):],
                         metrics['q_loss'], 'Q_LOSS', log_dir)
                lineplot(metrics['episode'][-len(metrics['policy_loss']):],
                         metrics['policy_loss'], 'POLICY_LOSS', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha_loss']):],
                         metrics['alpha_loss'], 'ALPHA_LOSS', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha']):],
                         metrics['alpha'], 'ALPHA', log_dir)

        # Insert new subpolicy layer and reset memory if a specific amount of samples is collected
        if policy_switch_samples and len(
                policy_switch_samples
        ) > 0 and total_collected_samples >= policy_switch_samples[0]:
            print(
                "----------------------\nInser new policy\n----------------------"
            )
            agent.insert_subpolicy()
            memory.reset()
            metrics['policy_switch_epoch'].append(episode)
            metrics['policy_switch_sample'].append(total_collected_samples)
            policy_switch_samples = policy_switch_samples[1:]

        # Test a policy
        if episode % conf.test_interval == 0:
            test_rewards = []
            for _ in range(conf.test_times):
                episodic_reward = 0
                obs = env.reset()
                for t in range(conf.horizon):
                    h = agent.select_action(obs, eval=True)
                    a = agent.post_process_action(obs, h)
                    obs_next, r, done, _ = env.step(a)
                    episodic_reward += r
                    obs = obs_next

                    if done:
                        break

                test_rewards.append(episodic_reward)

            test_reward_avg, test_reward_std = np.mean(test_rewards), np.std(
                test_rewards)
            print("   TEST --- ({} episodes) Reward {:.1f} (pm {:.1f})".format(
                conf.test_times, test_reward_avg, test_reward_std))
            metrics['test_episode'].append(episode)
            metrics['test_reward'].append(test_rewards)
            lineplot(metrics['test_episode'][-len(metrics['test_reward']):],
                     metrics['test_reward'], "TEST_REWARD", log_dir)

        # Save checkpoint
        if episode % conf.checkpoint_interval == 0:
            save_checkpoint()

    # Save the final model
    torch.save({'agent': agent.state_dict()},
               os.path.join(data_dir, 'final_model.pth'))
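
A tiny self-contained illustration of the exponential moving average of episodic reward kept above, and of why the seeding check should be "is not None" rather than plain truthiness (a legitimate running average of exactly 0.0 must not be re-seeded). MOVING_AVG_COEF = 0.1 is an assumed value here.

MOVING_AVG_COEF = 0.1

reward_moving_avg = None
for episodic_reward in [0.0, 10.0, 20.0, 0.0]:
    if reward_moving_avg is None:
        reward_moving_avg = episodic_reward          # seed with the first episode
    else:
        reward_moving_avg = ((1.0 - MOVING_AVG_COEF) * reward_moving_avg
                             + MOVING_AVG_COEF * episodic_reward)
print(reward_moving_avg)  # ~2.61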
Example no. 10
def train(config_file_path: str, save_dir: str, use_vime: bool, random_policy: bool, device: str, visualize_interval: int):
    conf_d = toml.load(open(config_file_path))
    conf = namedtuple('Config', conf_d.keys())(*conf_d.values())

    # Check if saving directory is valid
    if "test" in save_dir and os.path.exists(save_dir):
        shutil.rmtree(save_dir)
    if os.path.exists(save_dir):
        raise ValueError("Directory {} already exists.".format(save_dir))
    # Create save dir
    os.makedirs(save_dir)
    ckpt_dir = os.path.join(save_dir, 'checkpoints')
    os.makedirs(ckpt_dir)
    log_dir = os.path.join(save_dir, 'logs')
    os.makedirs(log_dir)
    # Save config file
    shutil.copyfile(config_file_path, os.path.join(save_dir, os.path.basename(config_file_path)))

    # Set random variable
    np.random.seed(int(time.time()))
    torch.manual_seed(int(time.time()))
    device = torch.device(device)
    if device.type == 'cuda':
        torch.cuda.manual_seed(int(time.time()))

    # Set up log metrics
    metrics = {
        'episode': [],
        'collected_samples': [],
        'reward': [],  # cumulative reward
        'curiosity_reward': [],  # cumulative reward including the information gain bonus
        'likelihood': [],  # likelihood under the learned dynamics model
        'D_KL_median': [], 'D_KL_mean': [],
        'q1_loss': [], 'policy_loss': [], 'alpha_loss': [], 'alpha': [],
        'ELBO': [],
        'step': [], 'step_reward': [],
        'test_episode': [], 'test_reward': [],
    }

    # Set up environment
    print("----------------------------------------\nTrain in {}\n----------------------------------------".format(conf.environment))
    env = gym.make(conf.environment)

    if use_vime:
        print("Use VIME")
    if random_policy:
        print("Keep using random policy.")

    # Training set up
    agent = SAC(env.observation_space, env.action_space, device, **conf.agent)
    memory = ReplayBuffer(conf.replay_buffer_capacity, env.observation_space.shape, env.action_space.shape)
    vime = VIME(env.observation_space.shape[0], env.action_space.shape[0], device, **conf.vime) if use_vime else None
    # Load checkpoint if specified in config
    if conf.checkpoint != '':
        ckpt = torch.load(conf.checkpoint, map_location=device)
        metrics = ckpt['metrics']
        agent.load_state_dict(ckpt['agent'])
        memory.load_state_dict(ckpt['memory'])
        if use_vime:
            vime.load_state_dict(ckpt['vime'])

    def save_checkpoint():
        # Save checkpoint
        ckpt = {'metrics': metrics, 'agent': agent.state_dict(), 'memory': memory.state_dict()}
        if use_vime:
            ckpt['vime'] = vime.state_dict()
        path = os.path.join(ckpt_dir, 'checkpoint.pth')
        torch.save(ckpt, path)

        # Save agent model only
        model_ckpt = {'agent': agent.state_dict()}
        model_path = os.path.join(ckpt_dir, 'model.pth')
        torch.save(model_ckpt, model_path)

        # Save metrics only
        metrics_ckpt = {'metrics': metrics}
        metrics_path = os.path.join(ckpt_dir, 'metrics.pth')
        torch.save(metrics_ckpt, metrics_path)

    # Train agent
    init_episode = 0 if len(metrics['episode']) == 0 else metrics['episode'][-1] + 1
    pbar = tqdm.tqdm(range(init_episode, conf.episodes))
    reward_moving_avg = None
    moving_avg_coef = 0.1
    agent_update_count = 0
    total_steps = 0

    for episode in pbar:
        o = env.reset()
        rewards, curiosity_rewards = [], []
        info_gains = []
        log_likelihoods = []
        q1_losses, q2_losses, policy_losses, alpha_losses, alphas = [],[],[],[],[]

        for t in range(conf.horizon):
            if len(memory) < conf.random_sample_num or random_policy:
                a = env.action_space.sample()
            else:
                a = agent.select_action(o, eval=False)

            o_next, r, done, _ = env.step(a)
            total_steps += 1
            metrics['step'].append(total_steps)
            metrics['step_reward'].append(r)
            done = False if t == env._max_episode_steps - 1 else bool(done)  # done should be False if an episode is terminated forcefully
            rewards.append(r)

            if use_vime and len(memory) >= conf.random_sample_num:
                # Calculate curiosity reward in VIME
                info_gain, log_likelihood = vime.calc_info_gain(o, a, o_next)
                assert not np.isnan(info_gain).any() and not np.isinf(info_gain).any(), "invalid information gain, {}".format(info_gain)
                info_gains.append(info_gain)
                log_likelihoods.append(log_likelihood)
                vime.memorize_episodic_info_gains(info_gain)            
                r = vime.calc_curiosity_reward(r, info_gain)
            curiosity_rewards.append(r)

            memory.append(o, a, r, o_next, done)
            o = o_next

            # Update agent
            if len(memory) >= conf.random_sample_num and not random_policy:
                for _ in range(conf.agent_update_per_step):
                    batch_data = memory.sample(conf.agent_update_batch_size)
                    q1_loss, q2_loss, policy_loss, alpha_loss, alpha = agent.update_parameters(batch_data, agent_update_count)
                    q1_losses.append(q1_loss)
                    q2_losses.append(q2_loss)
                    policy_losses.append(policy_loss)
                    alpha_losses.append(alpha_loss)
                    alphas.append(alpha)
                    agent_update_count += 1

            if done:
                break

        if len(log_likelihoods) == 0:
            log_likelihoods.append(-np.inf)

        # Display performance
        episodic_reward = np.sum(rewards)
        reward_moving_avg = episodic_reward if reward_moving_avg is None else (1-moving_avg_coef) * reward_moving_avg + moving_avg_coef * episodic_reward
        if use_vime:
            pbar.set_description("EPISODE {}, TOTAL STEPS {}, SAMPLES {} --- Steps {}, Curiosity {:.1f}, Rwd {:.1f} (m.avg {:.1f}), Likelihood {:.2E}".format(
                episode, memory.step, len(memory), len(rewards), np.sum(curiosity_rewards), episodic_reward, reward_moving_avg, np.mean(np.exp(log_likelihoods))))
        else:
            pbar.set_description("EPISODE {}, TOTAL STEPS {}, SAMPLES {} --- Steps {}, Rwd {:.1f} (mov avg {:.1f})".format(
                episode, memory.step, len(memory), len(rewards), episodic_reward, reward_moving_avg))

        # Save episodic metrics
        metrics['episode'].append(episode)
        metrics['collected_samples'].append(total_steps)
        metrics['reward'].append(episodic_reward)
        metrics['curiosity_reward'].append(np.sum(curiosity_rewards))
        metrics['likelihood'].append(np.mean(np.exp(log_likelihoods)))
        if episode % visualize_interval == 0:
            lineplot(metrics['step'][-len(metrics['step_reward']):], metrics['step_reward'], 'stepwise_reward', log_dir, xaxis='total step')
            lineplot(metrics['episode'][-len(metrics['reward']):], metrics['reward'], 'reward', log_dir)
            lineplot(metrics['collected_samples'][-len(metrics['reward']):], metrics['reward'], 'sample-reward', log_dir, xaxis='total step')
            lineplot(metrics['episode'][-len(metrics['curiosity_reward']):], metrics['curiosity_reward'], 'curiosity_reward', log_dir)
            lineplot(metrics['episode'][-len(metrics['likelihood']):], metrics['likelihood'], 'likelihood', log_dir)
        # Agent update related metrics
        if len(policy_losses) > 0 and not random_policy:
            metrics['q1_loss'].append(np.mean(q1_losses))
            metrics['policy_loss'].append(np.mean(policy_losses))
            metrics['alpha_loss'].append(np.mean(alpha_losses))
            metrics['alpha'].append(np.mean(alphas))
            if episode % visualize_interval == 0:
                lineplot(metrics['episode'][-len(metrics['q1_loss']):], metrics['q1_loss'], 'q1_loss', log_dir)
                lineplot(metrics['episode'][-len(metrics['policy_loss']):], metrics['policy_loss'], 'policy_loss', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha_loss']):], metrics['alpha_loss'], 'alpha_loss', log_dir)
                lineplot(metrics['episode'][-len(metrics['alpha']):], metrics['alpha'], 'alpha', log_dir)

        # Update VIME
        if use_vime and len(memory) >= conf.random_sample_num:
            for _ in range(conf.vime_update_per_episode):
                batch_s, batch_a, _, batch_s_next, _ = memory.sample(conf.vime_update_batch_size)
                elbo = vime.update_posterior(batch_s, batch_a, batch_s_next)
            metrics['ELBO'].append(elbo)
            lineplot(metrics['episode'][-len(metrics['ELBO']):], metrics['ELBO'], 'ELBO', log_dir)
            if len(info_gains) > 0:
                metrics['D_KL_median'].append(np.median(info_gains))
                metrics['D_KL_mean'].append(np.mean(info_gains))
                multiple_lineplot(metrics['episode'][-len(metrics['D_KL_median']):], np.array([metrics['D_KL_median'], metrics['D_KL_mean']]).T, 'D_KL', ['median', 'mean'], log_dir)

        # Test current policy
        if episode % conf.test_interval == 0:
            rewards = []
            for _ in range(conf.test_times):
                o = env.reset()
                done = False
                episode_reward = 0
                while not done:
                    a = agent.select_action(o, eval=True)
                    o_next, r, done, _ = env.step(a)
                    episode_reward += r
                    o = o_next

                rewards.append(episode_reward)

            mean, std = np.mean(rewards), np.std(rewards)
            print("\nTEST AT EPISODE {} ({} episodes) --- Avg. Reward {:.2f} (+- {:.2f})".format(episode, conf.test_times, mean, std))

            metrics['test_episode'].append(episode)
            metrics['test_reward'].append(rewards)
            lineplot(metrics['test_episode'][-len(metrics['test_reward']):], metrics['test_reward'], 'test_reward', log_dir)
            

        # Save checkpoint
        if episode % conf.checkpoint_interval == 0:
            save_checkpoint()

    save_checkpoint()
    # Save the final model
    torch.save({'agent': agent.state_dict()}, os.path.join(ckpt_dir, 'final_model.pth'))
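
For orientation only, a heavily hedged sketch of the VIME-style reward augmentation driven by the calls above (vime.memorize_episodic_info_gains and vime.calc_curiosity_reward). In VIME the extrinsic reward is augmented with an information-gain bonus, commonly normalised by recent gains; the eta coefficient and the normalisation below are assumptions, not this repository's exact code.

from collections import deque
import numpy as np

class CuriosityRewardSketch:
    def __init__(self, eta=0.1, history_len=10):
        self.eta = eta                                   # bonus weight (assumed)
        self.recent_gain_medians = deque(maxlen=history_len)

    def memorize_episodic_info_gains(self, info_gains):
        # Remember the median information gain of recently observed data.
        self.recent_gain_medians.append(np.median(info_gains))

    def calc_curiosity_reward(self, reward, info_gain):
        # Augment the extrinsic reward with a normalised information-gain bonus.
        normaliser = (np.mean(self.recent_gain_medians)
                      if len(self.recent_gain_medians) > 0 else 1.0)
        return reward + self.eta * info_gain / max(normaliser, 1e-8)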