def build(self, mdp_info):
    policy = GaussianTorchPolicy(Network,
                                 mdp_info.observation_space.shape,
                                 mdp_info.action_space.shape,
                                 **self.policy_params)
    self.critic_params["input_shape"] = mdp_info.observation_space.shape
    self.alg_params['critic_params'] = self.critic_params

    return TRPO(mdp_info, policy, **self.alg_params)
def learn(alg, alg_params):
    mdp = InvertedPendulum(horizon=50)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    policy_params = dict(std_0=1., use_cuda=False)

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    alg_params['critic_params'] = critic_params

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)
    core.learn(n_episodes=2, n_episodes_per_fit=1)

    return agent
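# Illustrative usage sketch (not part of the original source): the TRPO
# hyperparameter values below are assumptions, included only to show how
# `learn` is meant to be called with an algorithm class and its parameters.
trpo_params = dict(ent_coeff=0.0, max_kl=.001, lam=.98,
                   n_epochs_line_search=10, n_epochs_cg=10,
                   cg_damping=1e-2, cg_residual_tol=1e-10)
agent = learn(TRPO, trpo_params)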
def experiment(alg, env_id, horizon, gamma, n_epochs, n_steps,
               n_steps_per_fit, n_episodes_test, alg_params, policy_params):
    print(alg.__name__)

    mdp = Gym(env_id, horizon, gamma)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         n_features=32,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    alg_params['critic_params'] = critic_params

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    # Evaluate the untrained policy before the first epoch
    dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy()

    tqdm.write('END OF EPOCH 0')
    tqdm.write('J: {}, R: {}, entropy: {}'.format(J, R, E))
    tqdm.write('##################################################################################################')

    # Alternate learning and evaluation, epoch by epoch
    for it in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_episodes=n_episodes_test, render=False)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy()

        tqdm.write('END OF EPOCH ' + str(it + 1))
        tqdm.write('J: {}, R: {}, entropy: {}'.format(J, R, E))
        tqdm.write('##################################################################################################')

    print('Press a button to visualize')
    input()
    core.evaluate(n_episodes=5, render=True)
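# Illustrative usage sketch (not part of the original source): the environment
# id and all hyperparameter values below are assumptions, chosen only to show
# how this `experiment` function can be invoked for TRPO.
if __name__ == '__main__':
    policy_params = dict(std_0=1., n_features=32, use_cuda=False)

    trpo_params = dict(ent_coeff=0.0,
                       max_kl=.001,
                       lam=.95,
                       n_epochs_line_search=10,
                       n_epochs_cg=10,
                       cg_damping=1e-2,
                       cg_residual_tol=1e-10)

    experiment(alg=TRPO, env_id='Pendulum-v0', horizon=200, gamma=.99,
               n_epochs=40, n_steps=30000, n_steps_per_fit=3000,
               n_episodes_test=25, alg_params=trpo_params,
               policy_params=policy_params)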
def experiment(alg, env_id, horizon, gamma, n_epochs, n_steps,
               n_steps_per_fit, n_step_test, alg_params, policy_params):
    logger = Logger(alg.__name__, results_dir=None)
    logger.strong_line()
    logger.info('Experiment Algorithm: ' + alg.__name__)

    mdp = Gym(env_id, horizon, gamma)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.RMSprop,
                                    'params': {'lr': 7e-4,
                                               'eps': 1e-5}},
                         loss=F.mse_loss,
                         n_features=64,
                         batch_size=64,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    alg_params['critic_params'] = critic_params

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)

    # Evaluate the untrained policy before the first epoch
    dataset = core.evaluate(n_steps=n_step_test, render=False)
    J = np.mean(compute_J(dataset, mdp.info.gamma))
    R = np.mean(compute_J(dataset))
    E = agent.policy.entropy()
    logger.epoch_info(0, J=J, R=R, entropy=E)

    # Alternate learning and evaluation, epoch by epoch
    for it in trange(n_epochs):
        core.learn(n_steps=n_steps, n_steps_per_fit=n_steps_per_fit)
        dataset = core.evaluate(n_steps=n_step_test, render=False)
        J = np.mean(compute_J(dataset, mdp.info.gamma))
        R = np.mean(compute_J(dataset))
        E = agent.policy.entropy()
        logger.epoch_info(it + 1, J=J, R=R, entropy=E)

    logger.info('Press a button to visualize')
    input()
    core.evaluate(n_episodes=5, render=True)
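# Illustrative usage sketch (not part of the original source): the actor
# optimizer and policy settings mirror the A2C parameters used in the tests
# below, while the environment id, epoch and step counts are assumptions that
# only show how this `experiment` function can be invoked.
if __name__ == '__main__':
    policy_params = dict(std_0=1., n_features=64, use_cuda=False)

    a2c_params = dict(actor_optimizer={'class': optim.RMSprop,
                                       'params': {'lr': 7e-4,
                                                  'eps': 3e-3}},
                      max_grad_norm=0.5,
                      ent_coeff=0.01)

    experiment(alg=A2C, env_id='Pendulum-v0', horizon=200, gamma=.99,
               n_epochs=40, n_steps=1000, n_steps_per_fit=5,
               n_step_test=2000, alg_params=a2c_params,
               policy_params=policy_params)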
def test_a2c():
    mdp = Gym(name='Pendulum-v0', horizon=200, gamma=.99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    policy_params = dict(std_0=1., n_features=64, use_cuda=False)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.RMSprop,
                                    'params': {'lr': 7e-4,
                                               'eps': 1e-5}},
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    algorithm_params = dict(critic_params=critic_params,
                            actor_optimizer={'class': optim.RMSprop,
                                             'params': {'lr': 7e-4,
                                                        'eps': 3e-3}},
                            max_grad_norm=0.5,
                            ent_coeff=0.01)

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    agent = A2C(mdp.info, policy, **algorithm_params)

    core = Core(agent, mdp)
    core.learn(n_episodes=10, n_episodes_per_fit=5)

    # Expected policy weights for the fixed seeds set above
    w = agent.policy.get_weights()
    w_test = np.array([-1.6307759, 1.0356185, -0.34508315, 0.27108294,
                       -0.01047843])

    assert np.allclose(w, w_test)
def learn(alg, alg_params):
    # Minimal one-layer network used for both the actor and the critic
    class Network(nn.Module):
        def __init__(self, input_shape, output_shape, **kwargs):
            super(Network, self).__init__()

            n_input = input_shape[-1]
            n_output = output_shape[0]

            self._h = nn.Linear(n_input, n_output)

            nn.init.xavier_uniform_(self._h.weight,
                                    gain=nn.init.calculate_gain('relu'))

        def forward(self, state, **kwargs):
            return F.relu(self._h(torch.squeeze(state, 1).float()))

    mdp = Gym('Pendulum-v0', 200, .99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.Adam,
                                    'params': {'lr': 3e-4}},
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    policy_params = dict(std_0=1., use_cuda=False)

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    alg_params['critic_params'] = critic_params

    agent = alg(mdp.info, policy, **alg_params)

    core = Core(agent, mdp)
    core.learn(n_episodes=2, n_episodes_per_fit=1)

    return policy
def learn_a2c():
    mdp = Gym(name='Pendulum-v0', horizon=200, gamma=.99)
    mdp.seed(1)
    np.random.seed(1)
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)

    policy_params = dict(std_0=1., n_features=64, use_cuda=False)

    critic_params = dict(network=Network,
                         optimizer={'class': optim.RMSprop,
                                    'params': {'lr': 7e-4,
                                               'eps': 1e-5}},
                         loss=F.mse_loss,
                         input_shape=mdp.info.observation_space.shape,
                         output_shape=(1,))

    algorithm_params = dict(critic_params=critic_params,
                            actor_optimizer={'class': optim.RMSprop,
                                             'params': {'lr': 7e-4,
                                                        'eps': 3e-3}},
                            max_grad_norm=0.5,
                            ent_coeff=0.01)

    policy = GaussianTorchPolicy(Network,
                                 mdp.info.observation_space.shape,
                                 mdp.info.action_space.shape,
                                 **policy_params)

    agent = A2C(mdp.info, policy, **algorithm_params)

    core = Core(agent, mdp)
    core.learn(n_episodes=10, n_episodes_per_fit=5)

    return agent
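# Illustrative follow-up sketch (assumption, not from the original source):
# evaluate the agent returned by learn_a2c() on a fresh copy of the same
# environment and report the discounted return J and the undiscounted
# return R, using the same Core/compute_J utilities as above.
if __name__ == '__main__':
    agent = learn_a2c()

    mdp = Gym(name='Pendulum-v0', horizon=200, gamma=.99)
    core = Core(agent, mdp)
    dataset = core.evaluate(n_episodes=5, render=False)

    print('J:', np.mean(compute_J(dataset, mdp.info.gamma)))
    print('R:', np.mean(compute_J(dataset)))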