Example #1
    def learn(self, D, **kwargs):
        # Compute all metrics, D: list of Trajectory
        logprobs = [
            torch.cat(traj.get_all_info('action_logprob')) for traj in D
        ]
        entropies = [torch.cat(traj.get_all_info('entropy')) for traj in D]
        Vs = [torch.cat(traj.get_all_info('V')) for traj in D]

        with torch.no_grad():
            last_observations = tensorify(
                np.concatenate([traj.last_observation for traj in D], 0),
                self.device)
            last_Vs = self.value(last_observations).squeeze(-1)
        Qs = [
            bootstrapped_returns(self.config['agent.gamma'], traj, last_V)
            for traj, last_V in zip(D, last_Vs)
        ]
        As = [
            gae(self.config['agent.gamma'], self.config['agent.gae_lambda'],
                traj, V, last_V) for traj, V, last_V in zip(D, Vs, last_Vs)
        ]

        # Metrics -> Tensor, device
        logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(),
                                      [logprobs, entropies, Vs])
        Qs, As = map(
            lambda x: tensorify(np.concatenate(x).copy(), self.device),
            [Qs, As])
        if self.config['agent.standardize_adv']:
            As = (As - As.mean()) / (As.std() + 1e-8)

        assert all(
            [x.ndimension() == 1 for x in [logprobs, entropies, Vs, Qs, As]])

        dataset = Dataset(D, logprobs, entropies, Vs, Qs, As)
        dataloader = DataLoader(dataset,
                                self.config['train.batch_size'],
                                shuffle=True)
        for epoch in range(self.config['train.num_epochs']):
            # logs holds the per-minibatch outputs of the most recent epoch;
            # the averages reported below therefore come from the final epoch
            logs = [self.learn_one_update(data) for data in dataloader]

        self.total_timestep += sum([len(traj) for traj in D])
        out = {}
        if self.config['agent.use_lr_scheduler']:
            out['current_lr'] = self.policy_lr_scheduler.get_lr()
        out['policy_grad_norm'] = np.mean(
            [item['policy_grad_norm'] for item in logs])
        out['value_grad_norm'] = np.mean(
            [item['value_grad_norm'] for item in logs])
        out['policy_loss'] = np.mean([item['policy_loss'] for item in logs])
        out['policy_entropy'] = np.mean(
            [item['policy_entropy'] for item in logs])
        out['value_loss'] = np.mean([item['value_loss'] for item in logs])
        out['explained_variance'] = np.mean(
            [item['explained_variance'] for item in logs])
        out['approx_kl'] = np.mean([item['approx_kl'] for item in logs])
        out['clip_frac'] = np.mean([item['clip_frac'] for item in logs])
        return out
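
A minimal standalone sketch of the gae recursion used above (an assumed, simplified signature taking raw arrays; the library version works on the Trajectory object and handles terminal states, which this sketch omits):

import numpy as np

def gae_sketch(gamma, lam, rewards, Vs, last_V):
    # delta_t = r_t + gamma * V_{t+1} - V_t, then A_t = sum_k (gamma*lam)^k * delta_{t+k}
    values = np.append(np.asarray(Vs, dtype=np.float64), float(last_V))
    deltas = np.asarray(rewards, dtype=np.float64) + gamma * values[1:] - values[:-1]
    advantages = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        advantages[t] = running
    return advantages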
Example #2
 def sample(self, batch_size):
     # Uniformly sample transition indices, then move each field to the device
     idx = np.random.randint(0, self.size, size=batch_size)
     return list(
         map(lambda x: tensorify(x, self.device), [
             self.observations[idx], self.actions[idx], self.rewards[idx],
             self.next_observations[idx], self.masks[idx]
         ]))
Example #3
    def learn(self, D, **kwargs):
        logprobs = [torch.cat(traj.get_infos('action_logprob')) for traj in D]
        entropies = [torch.cat(traj.get_infos('entropy')) for traj in D]
        Vs = [torch.cat(traj.get_infos('V')) for traj in D]
        last_Vs = [traj.extra_info['last_info']['V'] for traj in D]
        Qs = [
            bootstrapped_returns(self.config['agent.gamma'], traj.rewards,
                                 last_V, traj.reach_terminal)
            for traj, last_V in zip(D, last_Vs)
        ]
        As = [
            gae(self.config['agent.gamma'], self.config['agent.gae_lambda'],
                traj.rewards, V, last_V, traj.reach_terminal)
            for traj, V, last_V in zip(D, Vs, last_Vs)
        ]

        # Metrics -> Tensor, device
        logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(),
                                      [logprobs, entropies, Vs])
        Qs, As = map(
            lambda x: tensorify(np.concatenate(x).copy(), self.device),
            [Qs, As])
        if self.config['agent.standardize_adv']:
            As = (As - As.mean()) / (As.std() + 1e-4)
        assert all([x.ndim == 1 for x in [logprobs, entropies, Vs, Qs, As]])

        # Loss
        policy_loss = -logprobs * As.detach()
        entropy_loss = -entropies
        value_loss = F.mse_loss(Vs, Qs, reduction='none')
        loss = (policy_loss
                + self.config['agent.value_coef'] * value_loss
                + self.config['agent.entropy_coef'] * entropy_loss)
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(
            self.parameters(), self.config['agent.max_grad_norm'])
        self.optimizer.step()
        if self.config['agent.use_lr_scheduler']:
            self.lr_scheduler.step(self.total_timestep)
        self.total_timestep += sum([traj.T for traj in D])

        out = {}
        if self.config['agent.use_lr_scheduler']:
            out['current_lr'] = self.lr_scheduler.get_lr()
        out['loss'] = loss.item()
        out['grad_norm'] = grad_norm
        out['policy_loss'] = policy_loss.mean().item()
        out['entropy_loss'] = entropy_loss.mean().item()
        out['policy_entropy'] = -out['entropy_loss']
        out['value_loss'] = value_loss.mean().item()
        out['V'] = describe(numpify(Vs, 'float').squeeze(),
                            axis=-1,
                            repr_indent=1,
                            repr_prefix='\n')
        out['explained_variance'] = ev(y_true=numpify(Qs, 'float'),
                                       y_pred=numpify(Vs, 'float'))
        return out
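
A minimal sketch of what a bootstrapped_returns helper with the signature used above (gamma, rewards, last_V, reach_terminal) might compute; this is an illustration, not necessarily the library's implementation:

def bootstrapped_returns_sketch(gamma, rewards, last_V, reach_terminal):
    # Discounted return for each step, bootstrapped from last_V at the end
    # of the trajectory unless a terminal state was reached.
    R = 0.0 if reach_terminal else float(last_V)
    out = []
    for r in reversed(rewards):
        R = r + gamma * R
        out.append(R)
    return out[::-1]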
Example #4
def run(config, seed, device, logdir):
    set_global_seeds(seed)
    
    print('Initializing...')
    agent = Agent(config, make_env(config, seed), device)
    es = CMAES([config['train.mu0']]*agent.num_params, config['train.std0'], 
               {'popsize': config['train.popsize'], 
                'seed': seed})
    train_logs = []
    checkpoint_count = 0
    with ProcessPoolExecutor(max_workers=config['train.popsize'],
                             initializer=initializer,
                             initargs=(config, seed, device)) as executor:
        print('Finish initialization. Training starts...')
        for generation in range(config['train.generations']):
            start_time = time.perf_counter()
            solutions = es.ask()
            out = list(executor.map(fitness, solutions, chunksize=2))
            Rs, Hs = zip(*out)
            es.tell(solutions, [-R for R in Rs])
            logger = Logger()
            logger('generation', generation+1)
            logger('num_seconds', round(time.perf_counter() - start_time, 1))
            logger('Returns', describe(Rs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('Horizons', describe(Hs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('fbest', es.result.fbest)
            train_logs.append(logger.logs)
            if generation == 0 or (generation+1)%config['log.freq'] == 0:
                logger.dump(keys=None, index=0, indent=0, border='-'*50)
            if (generation+1) >= int(config['train.generations'] *
                                     (checkpoint_count/(config['checkpoint.num'] - 1))):
                agent.from_vec(tensorify(es.result.xbest, 'cpu'))
                agent.checkpoint(logdir, generation+1)
                checkpoint_count += 1
    pickle_dump(obj=train_logs, f=logdir/'train_logs', ext='.pkl')
    return None
Example #5
 def choose_action(self, x, **kwargs):
     obs = tensorify(x.observation, self.device).unsqueeze(0)
     features = self.feature_network(obs)
     action_dist = self.action_head(features)
     action = action_dist.sample()
     out = {}
     out['raw_action'] = numpify(action,
                                 self.env.action_space.dtype).squeeze(0)
     return out
Example #6
 def choose_action(self, x, **kwargs):
     obs = tensorify(x.observation, self.device).unsqueeze(0)
     with torch.no_grad():
         if kwargs['mode'] == 'train':
             action = numpify(self.actor(obs).sample(), 'float')
         elif kwargs['mode'] == 'eval':
             action = numpify(torch.tanh(self.actor.mean_forward(obs)), 'float')
     out = {}
     out['raw_action'] = action.squeeze(0)
     return out
Example #7
    def choose_action(self, obs, **kwargs):
        obs = tensorify(obs, self.device)
        out = {}
        features = self.feature_network(obs)

        action_dist = self.action_head(features)
        out['entropy'] = action_dist.entropy()
        action = action_dist.sample()
        out['raw_action'] = numpify(action, 'float')
        return out
Example #8
def test_tensorify():
    # tensor
    x = torch.tensor(2.43)
    y = tensorify(x, 'cpu')
    assert torch.equal(x, y)
    del x, y

    x = torch.randn(10)
    y = tensorify(x, 'cpu')
    assert torch.equal(x, y)
    del x, y

    x = torch.randn(10, 20, 30)
    y = tensorify(x, 'cpu')
    assert torch.equal(x, y)
    del x, y

    # ndarray
    x = np.array(2.43)
    y = tensorify(x, 'cpu')
    assert np.allclose(x, y.item())
    del x, y

    x = np.random.randn(10)
    y = tensorify(x, 'cpu')
    assert np.allclose(x, y)
    del x, y

    x = np.random.randn(10, 20, 30)
    y = tensorify(x, 'cpu')
    assert np.allclose(x, y)
    del x, y

    # raw list
    x = [2.43]
    y = tensorify(x, 'cpu')
    assert np.allclose(x, y.item())
    del x, y

    x = [1, 2, 3, 4, 5, 6]
    y = tensorify(x, 'cpu')
    assert np.allclose(x, y)
    del x, y

    x = [[1, 2], [3, 4], [5, 6]]
    y = tensorify(x, 'cpu')
    assert np.allclose(x, y)
    del x, y
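
A minimal tensorify sketch that would satisfy the test above (an illustration only; the library's actual implementation may differ):

import numpy as np
import torch

def tensorify(x, device):
    # Pass tensors through (moved to the target device); convert ndarrays
    # and nested lists to float tensors on the target device.
    if torch.is_tensor(x):
        return x.to(device)
    return torch.as_tensor(np.asarray(x), dtype=torch.float32, device=device)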
Example #9
 def choose_action(self, obs, **kwargs):
     obs = tensorify(obs, self.device)
     with torch.no_grad():
         action = numpify(self.actor(obs), 'float')
     if kwargs['mode'] == 'train':
         eps = np.random.normal(0.0, self.config['agent.action_noise'], size=action.shape)
         action = np.clip(action + eps, self.env.action_space.low, self.env.action_space.high)
     out = {}
     out['action'] = action
     return out
Example #10
def fitness(data):
    torch.set_num_threads(1)  # VERY IMPORTANT TO AVOID GETTING STUCK
    config, seed, device, param = data
    env = make_env(config, seed, 'train')
    agent = Agent(config, env, device)
    agent.from_vec(tensorify(param, 'cpu'))
    runner = EpisodeRunner()
    with torch.no_grad():
        D = runner(agent, env, 10)
    R = np.mean([sum(traj.rewards) for traj in D])
    H = np.mean([traj.T for traj in D])
    return R, H
Example #11
 def choose_action(self, x, **kwargs):
     obs = tensorify(x.observation, self.device).unsqueeze(0)
     features = self.feature_network(obs)
     action_dist = self.action_head(features)
     V = self.V_head(features)
     action = action_dist.sample()
     out = {}
     out['action_dist'] = action_dist
     out['V'] = V
     out['entropy'] = action_dist.entropy()
     out['action'] = action
     out['raw_action'] = numpify(action, self.env.action_space.dtype).squeeze(0)
     out['action_logprob'] = action_dist.log_prob(action.detach())
     return out
Example #12
def run(config, seed, device, logdir):
    set_global_seeds(seed)
    torch.set_num_threads(1)  # VERY IMPORTANT TO AVOID GETTING STUCK

    print('Initializing...')
    agent = Agent(config, make_env(config, seed, 'eval'), device)
    es = OpenAIES(
        [config['train.mu0']] * agent.num_params, config['train.std0'], {
            'popsize': config['train.popsize'],
            'seed': seed,
            'sigma_scheduler_args': config['train.sigma_scheduler_args'],
            'lr': config['train.lr'],
            'lr_decay': config['train.lr_decay'],
            'min_lr': config['train.min_lr'],
            'antithetic': config['train.antithetic'],
            'rank_transform': config['train.rank_transform']
        })
    train_logs = []
    checkpoint_count = 0
    with Pool(processes=config['train.popsize'] //
              config['train.worker_chunksize']) as pool:
        print('Finish initialization. Training starts...')
        for generation in range(config['train.generations']):
            t0 = time.perf_counter()
            solutions = es.ask()
            data = [(config, seed, device, solution) for solution in solutions]
            out = pool.map(CloudpickleWrapper(fitness),
                           data,
                           chunksize=config['train.worker_chunksize'])
            Rs, Hs = zip(*out)
            es.tell(solutions, [-R for R in Rs])
            logger = Logger()
            logger('generation', generation + 1)
            logger('num_seconds', round(time.perf_counter() - t0, 1))
            logger('Returns',
                   describe(Rs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('Horizons',
                   describe(Hs, axis=-1, repr_indent=1, repr_prefix='\n'))
            logger('fbest', es.result.fbest)
            train_logs.append(logger.logs)
            if generation == 0 or (generation + 1) % config['log.freq'] == 0:
                logger.dump(keys=None, index=0, indent=0, border='-' * 50)
            if (generation + 1) >= int(config['train.generations'] *
                                       (checkpoint_count /
                                        (config['checkpoint.num'] - 1))):
                agent.from_vec(tensorify(es.result.xbest, 'cpu'))
                agent.checkpoint(logdir, generation + 1)
                checkpoint_count += 1
    pickle_dump(obj=train_logs, f=logdir / 'train_logs', ext='.pkl')
    return None
Example #13
def fitness(param):
    # agent and env are assumed to be module-level globals (e.g. created by a worker initializer)
    agent.from_vec(tensorify(param, 'cpu'))
    R = []
    H = []
    with torch.no_grad():
        for i in range(10):
            observation = env.reset()
            for t in range(env.spec.max_episode_steps):
                action = agent.choose_action(observation)['raw_action']
                observation, reward, done, info = env.step(action)
                if done[0]:
                    R.append(info[0]['episode']['return'])
                    H.append(info[0]['episode']['horizon'])
                    break
    return np.mean(R), np.mean(H)
Example #14
    def choose_action(self, obs, **kwargs):
        obs = tensorify(obs, self.device)
        out = {}

        action_dist = self.policy(obs)
        out['action_dist'] = action_dist
        out['entropy'] = action_dist.entropy()

        action = action_dist.sample()
        out['action'] = action
        out['raw_action'] = numpify(action, 'float')
        out['action_logprob'] = action_dist.log_prob(action.detach())

        V = self.value(obs)
        out['V'] = V
        return out
Example #15
    def choose_action(self, obs, **kwargs):
        obs = tensorify(obs, self.device)
        out = {}
        features = self.feature_network(obs)

        action_dist = self.action_head(features)
        out['action_dist'] = action_dist
        out['entropy'] = action_dist.entropy()

        action = action_dist.sample()
        out['action'] = action
        out['raw_action'] = numpify(action, 'float')
        out['action_logprob'] = action_dist.log_prob(action.detach())

        V = self.V_head(features)
        out['V'] = V
        return out
Example #16
 def choose_action(self, obs, **kwargs):
     obs = tensorify(obs, self.device)
     out = {}
     if kwargs['mode'] == 'train':
         dist = self.actor(obs)
         action = dist.rsample()
         out['action'] = action
         out['action_logprob'] = dist.log_prob(action)
     elif kwargs['mode'] == 'stochastic':
         with torch.no_grad():
             out['action'] = numpify(self.actor(obs).sample(), 'float')
     elif kwargs['mode'] == 'eval':
         with torch.no_grad():
             out['action'] = numpify(
                 torch.tanh(self.actor.mean_forward(obs)), 'float')
     else:
         raise NotImplementedError
     return out
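
The 'train' branch above uses rsample() so that gradients flow through the sampled action, while the other branches sample under no_grad. A self-contained illustration of the difference using a plain torch.distributions.Normal (not the agent's actor):

import torch
from torch.distributions import Normal

mu = torch.zeros(3, requires_grad=True)
dist = Normal(mu, torch.ones(3))

a = dist.rsample()      # reparameterized: a = mu + sigma * eps, differentiable w.r.t. mu
a.sum().backward()
print(mu.grad)          # gradients reached mu: tensor([1., 1., 1.])

b = dist.sample()       # plain sampling: detached from the autograd graph
print(b.requires_grad)  # False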
Example #17
 def choose_action(self, x, **kwargs):
     if x.first():
         self.state = self.reset(1)
     obs = tensorify(x.observation, self.device).unsqueeze(0)
     obs = obs.unsqueeze(0)  # add seq_dim
     features, [next_state] = self.feature_network(obs, [self.state])
     if 'last_info' not in kwargs:
         self.state = next_state
     features = features.squeeze(0)  # squeeze seq_dim
     action_dist = self.action_head(features)
     V = self.V_head(features)
     action = action_dist.sample()
     out = {}
     out['action_dist'] = action_dist
     out['V'] = V
     out['entropy'] = action_dist.entropy()
     out['action'] = action
     out['raw_action'] = numpify(action, self.env.action_space.dtype).squeeze(0)
     out['action_logprob'] = action_dist.log_prob(action.detach())
     return out
Example #18
    def learn(self, D, **kwargs):
        # Compute all metrics, D: list of Trajectory
        logprobs = [
            torch.cat(traj.get_all_info('action_logprob')) for traj in D
        ]
        entropies = [torch.cat(traj.get_all_info('entropy')) for traj in D]
        Vs = [torch.cat(traj.get_all_info('V')) for traj in D]

        with torch.no_grad():
            last_observations = tensorify(
                np.concatenate([traj.last_observation for traj in D], 0),
                self.device)
            last_Vs = self.V_head(
                self.feature_network(last_observations)).squeeze(-1)
        Qs = [
            bootstrapped_returns(self.config['agent.gamma'], traj, last_V)
            for traj, last_V in zip(D, last_Vs)
        ]
        As = [
            gae(self.config['agent.gamma'], self.config['agent.gae_lambda'],
                traj, V, last_V) for traj, V, last_V in zip(D, Vs, last_Vs)
        ]

        # Metrics -> Tensor, device
        logprobs, entropies, Vs = map(lambda x: torch.cat(x).squeeze(),
                                      [logprobs, entropies, Vs])
        Qs, As = map(
            lambda x: tensorify(np.concatenate(x).copy(), self.device),
            [Qs, As])
        if self.config['agent.standardize_adv']:
            As = (As - As.mean()) / (As.std() + 1e-8)

        assert all(
            [x.ndimension() == 1 for x in [logprobs, entropies, Vs, Qs, As]])

        # Loss
        policy_loss = -logprobs * As
        entropy_loss = -entropies
        value_loss = F.mse_loss(Vs, Qs, reduction='none')

        loss = (policy_loss
                + self.config['agent.value_coef'] * value_loss
                + self.config['agent.entropy_coef'] * entropy_loss)
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(
            self.parameters(), self.config['agent.max_grad_norm'])
        self.optimizer.step()
        if self.config['agent.use_lr_scheduler']:
            self.lr_scheduler.step(self.total_timestep)

        self.total_timestep += sum([len(traj) for traj in D])
        out = {}
        if self.config['agent.use_lr_scheduler']:
            out['current_lr'] = self.lr_scheduler.get_lr()
        out['loss'] = loss.item()
        out['grad_norm'] = grad_norm
        out['policy_loss'] = policy_loss.mean().item()
        out['entropy_loss'] = entropy_loss.mean().item()
        out['policy_entropy'] = -entropy_loss.mean().item()
        out['value_loss'] = value_loss.mean().item()
        out['V'] = describe(numpify(Vs, 'float').squeeze(),
                            axis=-1,
                            repr_indent=1,
                            repr_prefix='\n')
        out['explained_variance'] = ev(y_true=numpify(Qs, 'float'),
                                       y_pred=numpify(Vs, 'float'))
        return out
Example #19
    def learn(self, D, **kwargs):
        # Compute all metrics, D: list of Trajectory
        Ts = [len(traj) for traj in D]
        behavior_logprobs = [
            torch.cat(traj.get_all_info('action_logprob')) for traj in D
        ]
        out_agent = self.choose_action(
            np.concatenate([traj.numpy_observations[:-1] for traj in D], 0))
        logprobs = out_agent['action_logprob'].squeeze()
        entropies = out_agent['entropy'].squeeze()
        Vs = out_agent['V'].squeeze()
        with torch.no_grad():
            last_observations = tensorify(
                np.concatenate([traj.last_observation for traj in D], 0),
                self.device)
            last_Vs = self.V_head(
                self.feature_network(last_observations)).squeeze(-1)

        vs, As = [], []
        for traj, behavior_logprob, logprob, V, last_V in zip(
                D, behavior_logprobs,
                logprobs.detach().cpu().split(Ts),
                Vs.detach().cpu().split(Ts), last_Vs):
            v, A = vtrace(behavior_logprob, logprob, self.gamma, traj.rewards,
                          V, last_V, traj.reach_terminal, self.clip_rho,
                          self.clip_pg_rho)
            vs.append(v)
            As.append(A)

        # Metrics -> Tensor, device
        vs, As = map(
            lambda x: tensorify(np.concatenate(x).copy(), self.device),
            [vs, As])
        if self.config['agent.standardize_adv']:
            As = (As - As.mean()) / (As.std() + 1e-8)

        assert all(
            [x.ndimension() == 1 for x in [logprobs, entropies, Vs, vs, As]])

        # Loss
        policy_loss = -logprobs * As
        entropy_loss = -entropies
        value_loss = F.mse_loss(Vs, vs, reduction='none')

        loss = (policy_loss
                + self.config['agent.value_coef'] * value_loss
                + self.config['agent.entropy_coef'] * entropy_loss)
        loss = loss.mean()

        self.optimizer.zero_grad()
        loss.backward()
        grad_norm = nn.utils.clip_grad_norm_(
            self.parameters(), self.config['agent.max_grad_norm'])
        self.optimizer.step()
        if self.config['agent.use_lr_scheduler']:
            self.lr_scheduler.step(self.total_timestep)

        self.total_timestep += sum([len(traj) for traj in D])
        out = {}
        if self.config['agent.use_lr_scheduler']:
            out['current_lr'] = self.lr_scheduler.get_lr()
        out['loss'] = loss.item()
        out['grad_norm'] = grad_norm
        out['policy_loss'] = policy_loss.mean().item()
        out['entropy_loss'] = entropy_loss.mean().item()
        out['policy_entropy'] = -entropy_loss.mean().item()
        out['value_loss'] = value_loss.mean().item()
        out['V'] = describe(numpify(Vs, 'float').squeeze(),
                            axis=-1,
                            repr_indent=1,
                            repr_prefix='\n')
        out['explained_variance'] = ev(y_true=numpify(vs, 'float'),
                                       y_pred=numpify(Vs, 'float'))
        return out
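
A minimal standalone sketch of the v-trace recursion (IMPALA) that the vtrace call above relies on; the signature is assumed, c_bar is fixed to 1, and terminal handling is reduced to zeroing the bootstrap value:

import numpy as np

def vtrace_sketch(behavior_logprob, target_logprob, gamma, rewards, Vs, last_V,
                  reach_terminal, clip_rho=1.0, clip_pg_rho=1.0):
    # Importance ratios between target and behavior policy, clipped as in IMPALA
    rhos = np.exp(np.asarray(target_logprob) - np.asarray(behavior_logprob))
    clipped_rhos = np.minimum(clip_rho, rhos)
    cs = np.minimum(1.0, rhos)
    Vs = np.asarray(Vs, dtype=np.float64)
    rewards = np.asarray(rewards, dtype=np.float64)
    bootstrap = 0.0 if reach_terminal else float(last_V)
    values_next = np.append(Vs[1:], bootstrap)
    deltas = clipped_rhos * (rewards + gamma * values_next - Vs)
    # Backward recursion: (v_s - V_s) = delta_s + gamma * c_s * (v_{s+1} - V_{s+1})
    vs_minus_V = np.zeros_like(Vs)
    acc = 0.0
    for t in reversed(range(len(Vs))):
        acc = deltas[t] + gamma * cs[t] * acc
        vs_minus_V[t] = acc
    vs = vs_minus_V + Vs
    # Policy-gradient advantages use the v-trace targets shifted by one step
    vs_next = np.append(vs[1:], bootstrap)
    pg_advantages = np.minimum(clip_pg_rho, rhos) * (rewards + gamma * vs_next - Vs)
    return vs, pg_advantages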