Example #1
def run(config, seed, device, logdir):
    set_global_seeds(seed)
    
    print('Initializing...')
    agent = Agent(config, make_env(config, seed), device)
    es = CMAES([config['train.mu0']]*agent.num_params, config['train.std0'], 
               {'popsize': config['train.popsize'], 
                'seed': seed})
    train_logs = []
    checkpoint_count = 0
    with ProcessPoolExecutor(max_workers=config['train.popsize'],
                             initializer=initializer,
                             initargs=(config, seed, device)) as executor:
        print('Finished initialization. Training starts...')
        for generation in range(config['train.generations']):
            start_time = time.perf_counter()
            solutions = es.ask()
            out = list(executor.map(fitness, solutions, chunksize=2))
            Rs, Hs = zip(*out)
            es.tell(solutions, [-R for R in Rs])
            logger = Logger()
            logger('generation', generation+1)
            logger('num_seconds', round(time.perf_counter() - start_time, 1))
            logger('Returns', describe(Rs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('Horizons', describe(Hs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('fbest', es.result.fbest)
            train_logs.append(logger.logs)
            if generation == 0 or (generation+1)%config['log.freq'] == 0:
                logger.dump(keys=None, index=0, indent=0, border='-'*50)
            if (generation+1) >= int(config['train.generations']*(checkpoint_count/(config['checkpoint.num'] - 1))):
                agent.from_vec(tensorify(es.result.xbest, 'cpu'))
                agent.checkpoint(logdir, generation+1)
                checkpoint_count += 1
    pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
    return None
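Note: the executor above maps a module-level `fitness` over the CMA-ES candidate solutions and seeds each worker through `initializer`, but neither helper is shown. Below is a minimal sketch of what they might look like, assuming each worker keeps its own environment/agent pair and that `fitness` returns an `(episode_return, episode_horizon)` pair as unpacked by `zip(*out)`; the names and rollout details are assumptions, not the original implementation.

_worker_env = None
_worker_agent = None

def initializer(config, seed, device):
    # Runs once per worker process: build a private environment/agent pair.
    global _worker_env, _worker_agent
    _worker_env = make_env(config, seed)
    _worker_agent = Agent(config, _worker_env, device)

def fitness(solution):
    # Evaluate one candidate parameter vector; assumes a single (non-vectorized) env.
    _worker_agent.from_vec(tensorify(solution, 'cpu'))
    observation = _worker_env.reset()
    R, H = 0.0, 0
    for _ in range(_worker_env.spec.max_episode_steps):
        with torch.no_grad():
            action = _worker_agent.choose_action(observation, mode='eval')['action']
        observation, reward, done, _ = _worker_env.step(action)
        R += float(reward)
        H += 1
        if done:
            break
    return R, H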
Example #2
    def eval(self, n=None, **kwargs):
        start_time = perf_counter()
        returns = []
        horizons = []
        for _ in range(self.config['eval.num_episode']):
            observation = self.eval_env.reset()
            for _ in range(self.eval_env.spec.max_episode_steps):
                with torch.no_grad():
                    action = self.agent.choose_action(observation, mode='eval')['action']
                next_observation, reward, done, info = self.eval_env.step(action)
                if done[0]:  # index [0]: single vectorized environment
                    returns.append(info[0]['episode']['return'])
                    horizons.append(info[0]['episode']['horizon'])
                    break
                observation = next_observation
        logger = Logger()
        logger('num_seconds', round(perf_counter() - start_time, 1))
        logger('accumulated_trained_timesteps', kwargs['accumulated_trained_timesteps'])
        logger('accumulated_trained_episodes', kwargs['accumulated_trained_episodes'])
        logger('online_return', describe(returns, axis=-1, repr_indent=1, repr_prefix='\n'))
        logger('online_horizon', describe(horizons, axis=-1, repr_indent=1, repr_prefix='\n'))

        monitor_env = get_wrapper(self.eval_env, 'VecMonitor')
        logger('running_return', describe(monitor_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
        logger('running_horizon', describe(monitor_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
        logger.dump(keys=None, index=0, indent=0, border=color_str('+'*50, color='green'))
        return logger.logs
Example #3
    def train(self, n=None, **kwargs):
        self.agent.train()
        t0 = time.perf_counter()

        D = self.runner(self.agent, self.env,
                        self.config['train.timestep_per_iter'])
        out_agent = self.agent.learn(D)

        logger = Logger()
        logger('train_iteration', n + 1)
        logger('num_seconds', round(time.perf_counter() - t0, 1))
        for key, value in out_agent.items():
            logger(key, value)
        logger('num_trajectories', len(D))
        logger('num_timesteps', sum([traj.T for traj in D]))
        logger('accumulated_trained_timesteps', self.agent.total_timestep)
        logger(
            'return',
            describe([sum(traj.rewards) for traj in D],
                     axis=-1,
                     repr_indent=1,
                     repr_prefix='\n'))

        E = [
            traj[-1].info['episode'] for traj in D
            if 'episode' in traj[-1].info
        ]
        logger(
            'online_return',
            describe([e['return'] for e in E],
                     axis=-1,
                     repr_indent=1,
                     repr_prefix='\n'))
        logger(
            'online_horizon',
            describe([e['horizon'] for e in E],
                     axis=-1,
                     repr_indent=1,
                     repr_prefix='\n'))
        logger(
            'running_return',
            describe(self.env.return_queue,
                     axis=-1,
                     repr_indent=1,
                     repr_prefix='\n'))
        logger(
            'running_horizon',
            describe(self.env.horizon_queue,
                     axis=-1,
                     repr_indent=1,
                     repr_prefix='\n'))
        return logger
Example #4
def run(config, seed, device, logdir):
    set_global_seeds(seed)
    torch.set_num_threads(1)  # VERY IMPORTANT TO AVOID GETTING STUCK

    print('Initializing...')
    agent = Agent(config, make_env(config, seed, 'eval'), device)
    es = OpenAIES(
        [config['train.mu0']] * agent.num_params, config['train.std0'], {
            'popsize': config['train.popsize'],
            'seed': seed,
            'sigma_scheduler_args': config['train.sigma_scheduler_args'],
            'lr': config['train.lr'],
            'lr_decay': config['train.lr_decay'],
            'min_lr': config['train.min_lr'],
            'antithetic': config['train.antithetic'],
            'rank_transform': config['train.rank_transform']
        })
    train_logs = []
    checkpoint_count = 0
    with Pool(processes=config['train.popsize'] //
              config['train.worker_chunksize']) as pool:
        print('Finished initialization. Training starts...')
        for generation in range(config['train.generations']):
            t0 = time.perf_counter()
            solutions = es.ask()
            data = [(config, seed, device, solution) for solution in solutions]
            out = pool.map(CloudpickleWrapper(fitness),
                           data,
                           chunksize=config['train.worker_chunksize'])
            Rs, Hs = zip(*out)
            es.tell(solutions, [-R for R in Rs])
            logger = Logger()
            logger('generation', generation + 1)
            logger('num_seconds', round(time.perf_counter() - t0, 1))
            logger('Returns',
                   describe(Rs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('Horizons',
                   describe(Hs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('fbest', es.result.fbest)
            train_logs.append(logger.logs)
            if generation == 0 or (generation + 1) % config['log.freq'] == 0:
                logger.dump(keys=None, index=0, indent=0, border='-' * 50)
            if (generation + 1) >= int(config['train.generations'] *
                                       (checkpoint_count /
                                        (config['checkpoint.num'] - 1))):
                agent.from_vec(tensorify(es.result.xbest, 'cpu'))
                agent.checkpoint(logdir, generation + 1)
                checkpoint_count += 1
    pickle_dump(obj=train_logs, f=logdir / 'train_logs', ext='.pkl')
    return None
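Here the workers are stateless: `fitness` is mapped over `(config, seed, device, solution)` tuples and wrapped in `CloudpickleWrapper` so it survives pickling into the `Pool`. A rough sketch of such a tuple-based fitness function, under the same assumptions as the sketch after Example #1 (not the original implementation):

def fitness(data):
    # Each call rebuilds its own environment and agent from the tuple contents.
    config, seed, device, solution = data
    env = make_env(config, seed, 'eval')
    agent = Agent(config, env, device)
    agent.from_vec(tensorify(solution, 'cpu'))
    observation = env.reset()
    R, H = 0.0, 0
    for _ in range(env.spec.max_episode_steps):
        with torch.no_grad():
            action = agent.choose_action(observation, mode='eval')['action']
        observation, reward, done, _ = env.step(action)
        R += float(reward)
        H += 1
        if done:
            break
    return R, H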
Example #5
    def eval(self, n=None, **kwargs):
        t0 = time.perf_counter()
        with torch.no_grad():
            D = self.runner(self.agent, self.eval_env, 10, mode='eval')

        logger = Logger()
        logger('eval_iteration', n+1)
        logger('num_seconds', round(time.perf_counter() - t0, 1))
        logger('accumulated_trained_timesteps', self.agent.total_timestep)
        logger('online_return', describe([sum(traj.rewards) for traj in D], axis=-1, repr_indent=1, repr_prefix='\n'))
        logger('online_horizon', describe([traj.T for traj in D], axis=-1, repr_indent=1, repr_prefix='\n'))
        logger('running_return', describe(self.eval_env.return_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
        logger('running_horizon', describe(self.eval_env.horizon_queue, axis=-1, repr_indent=1, repr_prefix='\n'))
        logger.dump(keys=None, index=0, indent=0, border=color_str('+'*50, color='green'))
        return logger.logs
Example #6
    def learn(self, D, **kwargs):
        logprobs = [torch.cat(traj.get_infos('action_logprob')) for traj in D]
        entropies = [torch.cat(traj.get_infos('entropy')) for traj in D]
        Vs = [torch.cat(traj.get_infos('V')) for traj in D]
        last_Vs = [traj.extra_info['last_info']['V'] for traj in D]
        Qs = [
            bootstrapped_returns(self.config['agent.gamma'], traj.rewards,
                                 last_V, traj.reach_terminal)
            for traj, last_V in zip(D, last_Vs)
        ]
        As = [
            gae(self.config['agent.gamma'], self.config['agent.gae_lambda'],
                traj.rewards, V, last_V, traj.reach_terminal)
            for traj, V, last_V in zip(D, Vs, last_Vs)
        ]

        # Metrics -> Tensor, device
        logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(),
                                      [logprobs, entropies, Vs])
        Qs, As = map(
            lambda x: tensorify(np.concatenate(x).copy(), self.device),
            [Qs, As])
        if self.config['agent.standardize_adv']:
            As = (As - As.mean()) / (As.std() + 1e-4)
        assert all([x.ndim == 1 for x in [logprobs, entropies, Vs, Qs, As]])

        # Loss
        policy_loss = -logprobs * As.detach()
        entropy_loss = -entropies
        value_loss = F.mse_loss(Vs, Qs, reduction='none')
        loss = policy_loss + self.config[
            'agent.value_coef'] * value_loss + self.config[
                'agent.entropy_coef'] * entropy_loss
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(
            self.parameters(), self.config['agent.max_grad_norm'])
        self.optimizer.step()
        if self.config['agent.use_lr_scheduler']:
            self.lr_scheduler.step(self.total_timestep)
        self.total_timestep += sum([traj.T for traj in D])

        out = {}
        if self.config['agent.use_lr_scheduler']:
            out['current_lr'] = self.lr_scheduler.get_lr()
        out['loss'] = loss.item()
        out['grad_norm'] = grad_norm
        out['policy_loss'] = policy_loss.mean().item()
        out['entropy_loss'] = entropy_loss.mean().item()
        out['policy_entropy'] = -out['entropy_loss']
        out['value_loss'] = value_loss.mean().item()
        out['V'] = describe(numpify(Vs, 'float').squeeze(),
                            axis=-1,
                            repr_indent=1,
                            repr_prefix='\n')
        out['explained_variance'] = ev(y_true=numpify(Qs, 'float'),
                                       y_pred=numpify(Vs, 'float'))
        return out
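The advantages above come from `gae(...)`, i.e. generalized advantage estimation (Schulman et al., 2016); the library's own signature is the one used in the call above. As a self-contained reference, a standalone sketch of the computation:

import numpy as np

def gae_estimate(gamma, lam, rewards, Vs, last_V, reach_terminal):
    # delta_t = r_t + gamma*V_{t+1} - V_t, bootstrapping with last_V unless
    # the trajectory truly ended at a terminal state.
    Vs = np.append(np.asarray(Vs, dtype=np.float32),
                   0.0 if reach_terminal else float(last_V))
    deltas = np.asarray(rewards, dtype=np.float32) + gamma * Vs[1:] - Vs[:-1]
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        # A_t = delta_t + gamma*lambda*A_{t+1}
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    return advantages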
Example #7
    def train(self, n=None, **kwargs):
        self.agent.train()
        start_time = perf_counter()

        D = self.runner(self.agent, self.env,
                        self.config['train.timestep_per_iter'])
        out_agent = self.agent.learn(D)

        logger = Logger()
        logger('train_iteration', n + 1)
        logger('num_seconds', round(perf_counter() - start_time, 1))
        for key, value in out_agent.items():
            logger(key, value)
        logger('num_trajectories', len(D))
        logger('num_timesteps', sum([len(traj) for traj in D]))
        logger('accumulated_trained_timesteps', self.agent.total_timestep)
        G = [traj.numpy_rewards.sum() for traj in D]
        logger('return', describe(G, axis=-1, repr_indent=1, repr_prefix='\n'))

        infos = [
            info for info in chain.from_iterable([traj.infos for traj in D])
            if 'episode' in info
        ]
        online_returns = [info['episode']['return'] for info in infos]
        online_horizons = [info['episode']['horizon'] for info in infos]
        logger(
            'online_return',
            describe(online_returns, axis=-1, repr_indent=1, repr_prefix='\n'))
        logger(
            'online_horizon',
            describe(online_horizons, axis=-1, repr_indent=1,
                     repr_prefix='\n'))

        monitor_env = get_wrapper(self.env, 'VecMonitor')
        logger(
            'running_return',
            describe(monitor_env.return_queue,
                     axis=-1,
                     repr_indent=1,
                     repr_prefix='\n'))
        logger(
            'running_horizon',
            describe(monitor_env.horizon_queue,
                     axis=-1,
                     repr_indent=1,
                     repr_prefix='\n'))
        return logger
Example #8
    def learn(self, D, **kwargs):
        replay = kwargs['replay']
        episode_length = kwargs['episode_length']
        out = {}
        out['actor_loss'] = []
        out['critic_loss'] = []
        Q_vals = []
        for i in range(episode_length):
            observations, actions, rewards, next_observations, masks = replay.sample(
                self.config['replay.batch_size'])

            Qs = self.critic(observations, actions).squeeze()
            with torch.no_grad():
                next_Qs = self.critic_target(
                    next_observations,
                    self.actor_target(next_observations)).squeeze()
            targets = rewards + self.config[
                'agent.gamma'] * masks * next_Qs.detach()

            critic_loss = F.mse_loss(Qs, targets)
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_grad_norm = nn.utils.clip_grad_norm_(
                self.critic.parameters(), self.config['agent.max_grad_norm'])
            self.critic_optimizer.step()

            actor_loss = -self.critic(observations,
                                      self.actor(observations)).mean()
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            actor_loss.backward()
            actor_grad_norm = nn.utils.clip_grad_norm_(
                self.actor.parameters(), self.config['agent.max_grad_norm'])
            self.actor_optimizer.step()

            self.polyak_update_target()

            out['actor_loss'].append(actor_loss)
            out['critic_loss'].append(critic_loss)
            Q_vals.append(Qs)
        out['actor_loss'] = torch.stack(out['actor_loss']).mean().item()
        out['actor_grad_norm'] = actor_grad_norm
        out['critic_loss'] = torch.stack(out['critic_loss']).mean().item()
        out['critic_grad_norm'] = critic_grad_norm
        describe_it = lambda x: describe(numpify(torch.cat(x), 'float').
                                         squeeze(),
                                         axis=-1,
                                         repr_indent=1,
                                         repr_prefix='\n')
        out['Q'] = describe_it(Q_vals)
        return out
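`self.polyak_update_target()` is called after each update but not shown. Below is a minimal sketch of the usual soft target update, assuming a small mixing coefficient stored under a config key of our choosing (both the key and the attribute names are assumptions):

def polyak_update_target(self):
    # Soft update: target <- (1 - tau)*target + tau*online, with a small tau.
    tau = self.config['agent.polyak_tau']  # hypothetical config key
    for online, target in [(self.actor, self.actor_target),
                           (self.critic, self.critic_target)]:
        for p, p_targ in zip(online.parameters(), target.parameters()):
            p_targ.data.mul_(1.0 - tau).add_(tau * p.data)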
Example #9
    def learn(self, D, **kwargs):
        # Compute all metrics, D: list of Trajectory
        Ts = [len(traj) for traj in D]
        behavior_logprobs = [
            torch.cat(traj.get_all_info('action_logprob')) for traj in D
        ]
        out_agent = self.choose_action(
            np.concatenate([traj.numpy_observations[:-1] for traj in D], 0))
        logprobs = out_agent['action_logprob'].squeeze()
        entropies = out_agent['entropy'].squeeze()
        Vs = out_agent['V'].squeeze()
        with torch.no_grad():
            last_observations = tensorify(
                np.concatenate([traj.last_observation for traj in D], 0),
                self.device)
            last_Vs = self.V_head(
                self.feature_network(last_observations)).squeeze(-1)

        vs, As = [], []
        for traj, behavior_logprob, logprob, V, last_V in zip(
                D, behavior_logprobs,
                logprobs.detach().cpu().split(Ts),
                Vs.detach().cpu().split(Ts), last_Vs):
            v, A = vtrace(behavior_logprob, logprob, self.gamma, traj.rewards,
                          V, last_V, traj.reach_terminal, self.clip_rho,
                          self.clip_pg_rho)
            vs.append(v)
            As.append(A)

        # Metrics -> Tensor, device
        vs, As = map(
            lambda x: tensorify(np.concatenate(x).copy(), self.device),
            [vs, As])
        if self.config['agent.standardize_adv']:
            As = (As - As.mean()) / (As.std() + 1e-8)

        assert all(
            [x.ndimension() == 1 for x in [logprobs, entropies, Vs, vs, As]])

        # Loss
        policy_loss = -logprobs * As
        entropy_loss = -entropies
        value_loss = F.mse_loss(Vs, vs, reduction='none')

        loss = policy_loss + self.config[
            'agent.value_coef'] * value_loss + self.config[
                'agent.entropy_coef'] * entropy_loss
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(
            self.parameters(), self.config['agent.max_grad_norm'])
        self.optimizer.step()
        if self.config['agent.use_lr_scheduler']:
            self.lr_scheduler.step(self.total_timestep)

        self.total_timestep += sum([len(traj) for traj in D])
        out = {}
        if self.config['agent.use_lr_scheduler']:
            out['current_lr'] = self.lr_scheduler.get_lr()
        out['loss'] = loss.item()
        out['grad_norm'] = grad_norm
        out['policy_loss'] = policy_loss.mean().item()
        out['entropy_loss'] = entropy_loss.mean().item()
        out['policy_entropy'] = -entropy_loss.mean().item()
        out['value_loss'] = value_loss.mean().item()
        out['V'] = describe(numpify(Vs, 'float').squeeze(),
                            axis=-1,
                            repr_indent=1,
                            repr_prefix='\n')
        out['explained_variance'] = ev(y_true=numpify(vs, 'float'),
                                       y_pred=numpify(Vs, 'float'))
        return out
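For reference, `vtrace(...)` is expected to compute the off-policy corrected targets from IMPALA (Espeholt et al., 2018). With truncated importance ratios $\rho_t = \min(\bar{\rho}, \pi(a_t|x_t)/\mu(a_t|x_t))$ and $c_i = \min(\bar{c}, \pi(a_i|x_i)/\mu(a_i|x_i))$, where `clip_rho` plays the role of $\bar{\rho}$ and `clip_pg_rho` bounds the ratio used in the advantage, the value targets and policy-gradient advantages are

$$v_s = V(x_s) + \sum_{t=s}^{s+n-1} \gamma^{t-s} \Big(\prod_{i=s}^{t-1} c_i\Big)\, \rho_t \big(r_t + \gamma V(x_{t+1}) - V(x_t)\big), \qquad A_s = \rho_s \big(r_s + \gamma v_{s+1} - V(x_s)\big).$$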
Example #10
    def learn(self, D, **kwargs):
        replay = kwargs['replay']
        episode_length = kwargs['episode_length']
        out = {}
        out['actor_loss'] = []
        out['critic_loss'] = []
        out['alpha_loss'] = []
        Q1_vals = []
        Q2_vals = []
        logprob_vals = []
        for i in range(episode_length):
            observations, actions, rewards, next_observations, masks = replay.sample(
                self.config['replay.batch_size'])

            Qs1, Qs2 = self.critic(observations, actions)
            with torch.no_grad():
                _, policy_action, log_pi = self.actor(next_observations)
                next_Qs1, next_Qs2 = self.critic_target(
                    next_observations, policy_action)
                next_Qs = torch.min(next_Qs1,
                                    next_Qs2) - self.alpha.detach() * log_pi
                Q_targets = rewards.unsqueeze(-1) + self.config[
                    'agent.gamma'] * masks.unsqueeze(-1) * next_Qs

            critic_loss = F.mse_loss(Qs1, Q_targets) + F.mse_loss(
                Qs2, Q_targets)
            self.optimizer_zero_grad()
            critic_loss.backward()
            critic_grad_norm = nn.utils.clip_grad_norm_(
                self.critic.parameters(), self.config['agent.max_grad_norm'])
            self.critic_optimizer.step()

            if i % self.config['agent.policy_delay'] == 0:
                _, pi, log_pi = self.actor(observations)
                actor_Qs1, actor_Qs2 = self.critic(observations, pi)
                actor_Qs = torch.min(actor_Qs1, actor_Qs2)

                actor_loss = (self.alpha.detach() * log_pi - actor_Qs).mean()

                self.optimizer_zero_grad()
                actor_loss.backward()
                actor_grad_norm = nn.utils.clip_grad_norm_(
                    self.actor.parameters(),
                    self.config['agent.max_grad_norm'])
                self.actor_optimizer.step()

                alpha_loss = torch.mean(
                    self.alpha * (-log_pi - self.target_entropy).detach())

                self.optimizer_zero_grad()
                alpha_loss.backward()
                self.log_alpha_optimizer.step()

                self.polyak_update_target()

                out['actor_loss'].append(actor_loss)
                out['alpha_loss'].append(alpha_loss)
            out['critic_loss'].append(critic_loss)
            Q1_vals.append(Qs1)
            Q2_vals.append(Qs2)
            logprob_vals.append(log_pi)
        out['actor_loss'] = torch.tensor(out['actor_loss']).mean().item()
        out['actor_grad_norm'] = actor_grad_norm
        out['critic_loss'] = torch.tensor(out['critic_loss']).mean().item()
        out['critic_grad_norm'] = critic_grad_norm
        describe_it = lambda x: describe(torch.cat(x).detach().cpu().numpy().
                                         squeeze(),
                                         axis=-1,
                                         repr_indent=1,
                                         repr_prefix='\n')
        out['Q1'] = describe_it(Q1_vals)
        out['Q2'] = describe_it(Q2_vals)
        out['logprob'] = describe_it(logprob_vals)
        out['alpha_loss'] = torch.tensor(out['alpha_loss']).mean().item()
        out['alpha'] = self.alpha.item()
        return out
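The SAC examples call `self.optimizer_zero_grad()` before every backward pass; presumably it just clears all three optimizers so gradients from the critic, actor, and temperature updates never mix. A sketch under that assumption:

def optimizer_zero_grad(self):
    # Clear gradients on every optimizer before the next backward pass.
    self.actor_optimizer.zero_grad()
    self.critic_optimizer.zero_grad()
    self.log_alpha_optimizer.zero_grad()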
Example #11
    def learn(self, D, **kwargs):
        replay = kwargs['replay']
        episode_length = kwargs['episode_length']
        out = {}
        out['actor_loss'] = []
        out['critic_loss'] = []
        out['alpha_loss'] = []
        Q1_vals = []
        Q2_vals = []
        logprob_vals = []
        for i in range(episode_length):
            observations, actions, rewards, next_observations, masks = replay.sample(
                self.config['replay.batch_size'])

            Qs1, Qs2 = self.critic(observations, actions)
            Qs1, Qs2 = map(lambda x: x.squeeze(-1), [Qs1, Qs2])
            with torch.no_grad():
                out_actor = self.choose_action(next_observations, mode='train')
                next_actions = out_actor['action']
                next_actions_logprob = out_actor['action_logprob']
                next_Qs1, next_Qs2 = self.critic_target(
                    next_observations, next_actions)
                next_Qs = torch.min(next_Qs1, next_Qs2).squeeze(
                    -1) - self.alpha.detach() * next_actions_logprob
                Q_targets = rewards + self.config[
                    'agent.gamma'] * masks * next_Qs

            critic_loss = F.mse_loss(Qs1, Q_targets.detach()) + F.mse_loss(
                Qs2, Q_targets.detach())
            self.optimizer_zero_grad()
            critic_loss.backward()
            critic_grad_norm = nn.utils.clip_grad_norm_(
                self.critic.parameters(), self.config['agent.max_grad_norm'])
            self.critic_optimizer.step()

            if i % self.config['agent.policy_delay'] == 0:
                out_actor = self.choose_action(observations, mode='train')
                policy_actions = out_actor['action']
                policy_actions_logprob = out_actor['action_logprob']

                actor_Qs1, actor_Qs2 = self.critic(observations,
                                                   policy_actions)
                actor_Qs = torch.min(actor_Qs1, actor_Qs2).squeeze(-1)
                actor_loss = torch.mean(self.alpha.detach() *
                                        policy_actions_logprob - actor_Qs)

                self.optimizer_zero_grad()
                actor_loss.backward()
                actor_grad_norm = nn.utils.clip_grad_norm_(
                    self.actor.parameters(),
                    self.config['agent.max_grad_norm'])
                self.actor_optimizer.step()

                alpha_loss = torch.mean(
                    self.log_alpha *
                    (-policy_actions_logprob - self.target_entropy).detach())

                self.optimizer_zero_grad()
                alpha_loss.backward()
                self.log_alpha_optimizer.step()

                self.polyak_update_target()

                out['actor_loss'].append(actor_loss)
                out['alpha_loss'].append(alpha_loss)
            out['critic_loss'].append(critic_loss)
            Q1_vals.append(Qs1)
            Q2_vals.append(Qs2)
            logprob_vals.append(policy_actions_logprob)
        out['actor_loss'] = torch.tensor(out['actor_loss']).mean().item()
        out['actor_grad_norm'] = actor_grad_norm
        out['critic_loss'] = torch.tensor(out['critic_loss']).mean().item()
        out['critic_grad_norm'] = critic_grad_norm
        describe_it = lambda x: describe(numpify(torch.cat(x), 'float').
                                         squeeze(),
                                         axis=-1,
                                         repr_indent=1,
                                         repr_prefix='\n')
        out['Q1'] = describe_it(Q1_vals)
        out['Q2'] = describe_it(Q2_vals)
        out['logprob'] = describe_it(logprob_vals)
        out['alpha_loss'] = torch.tensor(out['alpha_loss']).mean().item()
        out['alpha'] = self.alpha.item()
        return out
Example #12
    def learn(self, D, **kwargs):
        replay = kwargs['replay']
        T = kwargs['T']
        list_actor_loss = []
        list_critic_loss = []
        list_alpha_loss = []
        Q1_vals = []
        Q2_vals = []
        logprob_vals = []
        for i in range(T):
            observations, actions, rewards, next_observations, masks = replay.sample(self.config['replay.batch_size'])
            
            Qs1, Qs2 = self.critic(observations, actions)
            with torch.no_grad():
                action_dist = self.actor(next_observations)
                next_actions = action_dist.rsample()
                next_actions_logprob = action_dist.log_prob(next_actions).unsqueeze(-1)
                next_Qs1, next_Qs2 = self.critic_target(next_observations, next_actions)
                next_Qs = torch.min(next_Qs1, next_Qs2) - self.alpha.detach()*next_actions_logprob
                targets = rewards + self.config['agent.gamma']*masks*next_Qs
            critic_loss = F.mse_loss(Qs1, targets.detach()) + F.mse_loss(Qs2, targets.detach())
            self.optimizer_zero_grad()
            critic_loss.backward()
            critic_grad_norm = nn.utils.clip_grad_norm_(self.critic.parameters(), self.config['agent.max_grad_norm'])
            self.critic_optimizer.step()
            
            action_dist = self.actor(observations)
            policy_actions = action_dist.rsample()
            policy_actions_logprob = action_dist.log_prob(policy_actions).unsqueeze(-1)
            actor_Qs1, actor_Qs2 = self.critic(observations, policy_actions)
            actor_Qs = torch.min(actor_Qs1, actor_Qs2)
            actor_loss = torch.mean(self.alpha.detach()*policy_actions_logprob - actor_Qs)
            self.optimizer_zero_grad()
            actor_loss.backward()
            actor_grad_norm = nn.utils.clip_grad_norm_(self.actor.parameters(), self.config['agent.max_grad_norm'])
            self.actor_optimizer.step()
            
            alpha_loss = torch.mean(self.log_alpha*(-policy_actions_logprob - self.target_entropy).detach())
            self.optimizer_zero_grad()
            alpha_loss.backward()
            self.log_alpha_optimizer.step()

            self.polyak_update_target()
            list_actor_loss.append(actor_loss)
            list_critic_loss.append(critic_loss)
            list_alpha_loss.append(alpha_loss)
            Q1_vals.append(Qs1)
            Q2_vals.append(Qs2)
            logprob_vals.append(policy_actions_logprob)
        self.total_timestep += T
        
        out = {}
        out['actor_loss'] = torch.tensor(list_actor_loss).mean(0).item()
        out['actor_grad_norm'] = actor_grad_norm
        out['critic_loss'] = torch.tensor(list_critic_loss).mean(0).item()
        out['critic_grad_norm'] = critic_grad_norm
        describe_it = lambda x: describe(numpify(torch.cat(x), 'float').squeeze(), axis=-1, repr_indent=1, repr_prefix='\n')
        out['Q1'] = describe_it(Q1_vals)
        out['Q2'] = describe_it(Q2_vals)
        out['logprob'] = describe_it(logprob_vals)
        out['alpha_loss'] = torch.tensor(list_alpha_loss).mean(0).item()
        out['alpha'] = self.alpha.item()
        return out
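The temperature is optimized in log space (`self.log_alpha` has its own optimizer) while the losses use `self.alpha`. A common way to tie the two together, shown here as an assumption about how the agent defines it:

@property
def alpha(self):
    # Keeping the learnable parameter in log space guarantees alpha > 0.
    return self.log_alpha.exp()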
Example #13
    def learn(self, D, **kwargs):
        replay = kwargs['replay']
        T = kwargs['T']
        list_actor_loss = []
        list_critic_loss = []
        Q1_vals = []
        Q2_vals = []
        for i in range(T):
            observations, actions, rewards, next_observations, masks = replay.sample(
                self.config['replay.batch_size'])

            Qs1, Qs2 = self.critic(observations, actions)
            with torch.no_grad():
                next_actions = self.actor_target(next_observations)
                eps = torch.empty_like(next_actions).normal_(
                    0.0, self.config['agent.target_noise'])
                eps = eps.clamp(-self.config['agent.target_noise_clip'],
                                self.config['agent.target_noise_clip'])
                next_actions = torch.clamp(next_actions + eps,
                                           -self.max_action, self.max_action)
                next_Qs1, next_Qs2 = self.critic_target(
                    next_observations, next_actions)
                next_Qs = torch.min(next_Qs1, next_Qs2)
                targets = rewards + self.config['agent.gamma'] * masks * next_Qs
            critic_loss = F.mse_loss(Qs1, targets.detach()) + F.mse_loss(
                Qs2, targets.detach())
            self.actor_optimizer.zero_grad()
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            critic_grad_norm = nn.utils.clip_grad_norm_(
                self.critic.parameters(), self.config['agent.max_grad_norm'])
            self.critic_optimizer.step()

            if i % self.config['agent.policy_delay'] == 0:
                actor_loss = -self.critic.Q1(observations,
                                             self.actor(observations)).mean()
                self.actor_optimizer.zero_grad()
                self.critic_optimizer.zero_grad()
                actor_loss.backward()
                actor_grad_norm = nn.utils.clip_grad_norm_(
                    self.actor.parameters(),
                    self.config['agent.max_grad_norm'])
                self.actor_optimizer.step()

                self.polyak_update_target()
                list_actor_loss.append(actor_loss)
            list_critic_loss.append(critic_loss)
            Q1_vals.append(Qs1)
            Q2_vals.append(Qs2)
        self.total_timestep += T

        out = {}
        out['actor_loss'] = torch.tensor(list_actor_loss).mean(0).item()
        out['actor_grad_norm'] = actor_grad_norm
        out['critic_loss'] = torch.tensor(list_critic_loss).mean(0).item()
        out['critic_grad_norm'] = critic_grad_norm
        describe_it = lambda x: describe(numpify(torch.cat(x), 'float').
                                         squeeze(),
                                         axis=-1,
                                         repr_indent=1,
                                         repr_prefix='\n')
        out['Q1'] = describe_it(Q1_vals)
        out['Q2'] = describe_it(Q2_vals)
        return out
Example #14
    def learn(self, D, **kwargs):
        # Compute all metrics, D: list of Trajectory
        logprobs = [
            torch.cat(traj.get_all_info('action_logprob')) for traj in D
        ]
        entropies = [torch.cat(traj.get_all_info('entropy')) for traj in D]
        Vs = [torch.cat(traj.get_all_info('V')) for traj in D]

        with torch.no_grad():
            last_observations = tensorify(
                np.concatenate([traj.last_observation for traj in D], 0),
                self.device)
            last_Vs = self.V_head(
                self.feature_network(last_observations)).squeeze(-1)
        Qs = [
            bootstrapped_returns(self.config['agent.gamma'], traj, last_V)
            for traj, last_V in zip(D, last_Vs)
        ]
        As = [
            gae(self.config['agent.gamma'], self.config['agent.gae_lambda'],
                traj, V, last_V) for traj, V, last_V in zip(D, Vs, last_Vs)
        ]

        # Metrics -> Tensor, device
        logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(),
                                      [logprobs, entropies, Vs])
        Qs, As = map(
            lambda x: tensorify(np.concatenate(x).copy(), self.device),
            [Qs, As])
        if self.config['agent.standardize_adv']:
            As = (As - As.mean()) / (As.std() + 1e-8)

        assert all(
            [x.ndimension() == 1 for x in [logprobs, entropies, Vs, Qs, As]])

        # Loss
        policy_loss = -logprobs * As
        entropy_loss = -entropies
        value_loss = F.mse_loss(Vs, Qs, reduction='none')

        loss = policy_loss + self.config[
            'agent.value_coef'] * value_loss + self.config[
                'agent.entropy_coef'] * entropy_loss
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(
            self.parameters(), self.config['agent.max_grad_norm'])
        self.optimizer.step()
        if self.config['agent.use_lr_scheduler']:
            self.lr_scheduler.step(self.total_timestep)

        self.total_timestep += sum([len(traj) for traj in D])
        out = {}
        if self.config['agent.use_lr_scheduler']:
            out['current_lr'] = self.lr_scheduler.get_lr()
        out['loss'] = loss.item()
        out['grad_norm'] = grad_norm
        out['policy_loss'] = policy_loss.mean().item()
        out['entropy_loss'] = entropy_loss.mean().item()
        out['policy_entropy'] = -entropy_loss.mean().item()
        out['value_loss'] = value_loss.mean().item()
        out['V'] = describe(numpify(Vs, 'float').squeeze(),
                            axis=-1,
                            repr_indent=1,
                            repr_prefix='\n')
        out['explained_variance'] = ev(y_true=numpify(Qs, 'float'),
                                       y_pred=numpify(Vs, 'float'))
        return out
Example #15
def evaluator(config, logdir, seed, make_env, learner_agent):
    torch.set_num_threads(1)  # VERY IMPORTANT TO AVOID GETTING STUCK
    eval_logs = []
    env = make_env(config, seed, 'train')
    agent = Agent(config, env, torch.device('cpu'))
    runner = EpisodeRunner(reset_on_call=True)
    evaluated_steps = config['eval.freq']
    while learner_agent.total_timestep < config['train.timestep']:
        if learner_agent.total_timestep < evaluated_steps:
            time.sleep(1.0)
        else:
            t0 = time.perf_counter()
            agent.load_state_dict(
                learner_agent.state_dict())  # copy to CPU by default
            with torch.no_grad():
                D = []
                for _ in range(config['eval.num_episode']):
                    D += runner(agent, env, env.spec.max_episode_steps)
            logger = Logger()
            logger('num_seconds', round(time.perf_counter() - t0, 1))
            logger('num_trajectories', len(D))
            logger('num_timesteps', sum([len(traj) for traj in D]))
            logger('accumulated_trained_timesteps',
                   learner_agent.total_timestep)

            infos = [
                info
                for info in chain.from_iterable([traj.infos for traj in D])
                if 'episode' in info
            ]
            online_returns = [info['episode']['return'] for info in infos]
            online_horizons = [info['episode']['horizon'] for info in infos]
            logger(
                'online_return',
                describe(online_returns,
                         axis=-1,
                         repr_indent=1,
                         repr_prefix='\n'))
            logger(
                'online_horizon',
                describe(online_horizons,
                         axis=-1,
                         repr_indent=1,
                         repr_prefix='\n'))

            monitor_env = get_wrapper(env, 'VecMonitor')
            logger(
                'running_return',
                describe(monitor_env.return_queue,
                         axis=-1,
                         repr_indent=1,
                         repr_prefix='\n'))
            logger(
                'running_horizon',
                describe(monitor_env.horizon_queue,
                         axis=-1,
                         repr_indent=1,
                         repr_prefix='\n'))
            logger.dump(keys=None,
                        index=0,
                        indent=0,
                        border=color_str('+' * 50, color='green'))
            eval_logs.append(logger.logs)

            evaluated_steps += config['eval.freq']
    pickle_dump(obj=eval_logs, f=logdir / 'eval_logs', ext='.pkl')
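The `evaluator` above is meant to run in a separate process, polling `learner_agent.total_timestep` while training proceeds elsewhere. A hedged sketch of how it might be launched (the learner-side setup is not shown in the source and is assumed here):

import torch.multiprocessing as mp

if __name__ == '__main__':
    mp.set_start_method('spawn')
    # Put the learner agent's parameters in shared memory so the evaluator's
    # load_state_dict() sees the latest weights.
    learner_agent.share_memory()
    p = mp.Process(target=evaluator,
                   args=(config, logdir, seed, make_env, learner_agent))
    p.start()
    # ... the main process runs the training loop and updates learner_agent ...
    p.join()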