import math

import gym
import numpy as np
import torch
from gym.envs.registration import register

from src.configs.configs import TORCH_DTYPE, NP_DTYPE
from src.nopg.nopg import NOPG

# Use the GPU if available; if its memory is insufficient, use only the CPU
# DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE = torch.device('cpu')

##########################################################################################
# Create the Environment (MDP)
register(
    id='Pendulum-v1',
    entry_point='gym.envs.classic_control:PendulumEnv',
    max_episode_steps=500,
)
env = gym.make('Pendulum-v1')
mdp = MDP(env)

##########################################################################################
# Gather an Off-Policy Dataset
sampling_params = {
    'sampling_type': 'behavioral',
    # Pendulum observations are (cos(theta), sin(theta), theta_dot);
    # atan2 recovers the internal state (theta, theta_dot)
    'transform_to_internal_state': lambda x: (math.atan2(x[1], x[0]), x[2]),
    'initial_state': np.array([1., 0., 0.], dtype=NP_DTYPE),  # theta=0., theta_dot=0.
    'policy': policy_gmm,
    'n_samples': 1500,
    'max_ep_transitions': 500
}
dataset = mdp.get_samples(**sampling_params)
dataset.update_dataset_internal()
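
# `policy_gmm` above is defined elsewhere in the repo. As a purely hypothetical
# stand-in (an illustrative sketch, not the repo's actual behavioral policy), it
# could be a two-component Gaussian mixture over the Pendulum torque range:
def policy_gmm_sketch(obs):
    # this behavioral policy ignores the observation and just explores
    means = np.array([-1.5, 1.5])  # component means (torque); illustrative values
    stds = np.array([1.0, 1.0])    # component standard deviations
    k = np.random.randint(2)       # pick a mixture component uniformly at random
    a = np.random.normal(means[k], stds[k], size=(1,))
    return np.clip(a, -2., 2.).astype(NP_DTYPE)  # Pendulum-v1 torque limits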
import math

import gym
import numpy as np
import torch
from gym.envs.registration import register

from src.configs.configs import TORCH_DTYPE, NP_DTYPE
from src.nopg.nopg import NOPG

# Use the GPU if available; if its memory is insufficient, use only the CPU
# DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE = torch.device('cpu')

##########################################################################################
# Create the Environment (MDP)
register(
    id='Pendulum-v1',
    entry_point='gym.envs.classic_control:PendulumEnv',
    max_episode_steps=500,
)
env = gym.make('Pendulum-v1')
mdp = MDP(env)

##########################################################################################
# Gather an Off-Policy Dataset
states = mdp.discretize_space(space='state', levels=[20, 20])  # theta, theta_dot
actions = mdp.discretize_space(space='action', levels=[2])
sampling_params = {
    'sampling_type': 'uniform',
    'states': states,
    'actions': actions,
    # use when the observation and state space do not exactly match
    'transform_to_internal_state': lambda x: (math.atan2(x[1], x[0]), x[2]),
    'render': False
}
dataset = mdp.get_samples(**sampling_params)
dataset.update_dataset_internal()
def test_nopg_d_cartpole_behavioral(self):
    ##########################################################################################
    # Create the Environment (MDP)
    env = gym.make('CartpoleStabShort-v1')
    mdp = MDP(env)

    ##########################################################################################
    # Gather an Off-Policy Dataset
    sampling_params = {
        'sampling_type': 'behavioral',
        'policy': lambda x: np.random.uniform(low=-5., high=5., size=(1,)),
        'n_samples': 200,  # n_trajectories: 2
    }
    dataset = mdp.get_samples(**sampling_params)
    dataset.update_dataset_internal()

    # Compute Kernel Bandwidths (state, action, state next)
    s_band_factor = [1., 1., 1., 1., 1.]
    s_n_band_factor = s_band_factor
    a_band_factor = [20.]
    dataset.kde_bandwidths_internal(s_band_factor=s_band_factor, a_band_factor=a_band_factor,
                                    s_n_band_factor=s_n_band_factor)

    ##########################################################################################
    # Define the Policy Network
    policy_params = {
        'policy_class': 'deterministic',
        'neurons': [mdp.s_dim, 10, mdp.a_dim],  # [state_dim, hidden1, ..., hiddenN, action_dim]
        'activations': [nn.functional.relu],  # one activation function per hidden layer
        'f_out': [lambda x: 5.0 * torch.tanh(x)],
        'device': DEVICE
    }
    policy = Policy(**policy_params).to(device=DEVICE, dtype=TORCH_DTYPE)

    ##########################################################################################
    # Optimize the policy with NOPG
    nopg_params = {
        'initial_states': np.array([env.reset() for _ in range(5)]),  # for multiple initial states
        'gamma': 0.99
    }
    nopg = NOPG(dataset, policy, **nopg_params)

    n_policy_updates = 10

    def optimizer(x):
        return optim.Adam(x, lr=1e-2)

    evaluation_params = {
        'eval_mdp': mdp,
        'eval_every_n': 200,
        'eval_n_episodes': 1,
        'eval_render': False
    }
    nopg.fit(n_policy_updates=n_policy_updates, optimizer=optimizer, **evaluation_params)
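
    # After fitting, the learned policy can be rolled out directly. A minimal
    # sketch, assuming Policy instances are callable on batched state tensors
    # (an assumption for illustration; the repo's evaluation utilities may differ):
    s = env.reset()
    for _ in range(200):
        with torch.no_grad():
            s_t = torch.as_tensor(s, dtype=TORCH_DTYPE, device=DEVICE).reshape(1, -1)
            a = policy(s_t).squeeze(0).cpu().numpy()
        s, r, done, _ = env.step(a)
        if done:
            break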
def test_nopg_d_pendulum_uniform_sparsify_P_kl_divergence(self):
    ##########################################################################################
    # Create the Environment (MDP)
    env = gym.make('Pendulum-v1')
    mdp = MDP(env)

    ##########################################################################################
    # Gather an Off-Policy Dataset
    states = mdp.discretize_space(space='state', levels=[10, 10])  # theta, theta_dot
    actions = mdp.discretize_space(space='action', levels=[2])
    sampling_params = {
        'sampling_type': 'uniform',
        'states': states,
        'actions': actions,
        'transform_to_internal_state': lambda x: (math.atan2(x[1], x[0]), x[2]),
        'render': False
    }
    dataset = mdp.get_samples(**sampling_params)
    dataset.update_dataset_internal()

    # Compute Kernel Bandwidths (state, action, state next)
    s_band_factor = [1., 1., 1.]  # Pendulum uniform
    s_n_band_factor = s_band_factor
    a_band_factor = [50.]  # Pendulum uniform
    dataset.kde_bandwidths_internal(s_band_factor=s_band_factor, a_band_factor=a_band_factor,
                                    s_n_band_factor=s_n_band_factor)

    ##########################################################################################
    # Define the Policy Network
    policy_params = {
        'policy_class': 'deterministic',
        'neurons': [mdp.s_dim, 10, mdp.a_dim],  # [state_dim, hidden1, ..., hiddenN, action_dim]
        'activations': [nn.functional.relu],  # one activation function per hidden layer
        'f_out': [lambda x: 2.0 * torch.tanh(x)],
        'device': DEVICE
    }
    policy = Policy(**policy_params).to(device=DEVICE, dtype=TORCH_DTYPE)

    ##########################################################################################
    # Optimize the policy with NOPG
    nopg_params = {
        'initial_states': np.array([-1., 0., 0.]).reshape((-1, 3)),
        'gamma': 0.97,
        'sparsify_P': {'kl_max': 0.001, 'kl_interval_k': 10, 'kl_repeat_every_n_iterations': 3}
    }
    nopg = NOPG(dataset, policy, **nopg_params)

    n_policy_updates = 10

    def optimizer(x):
        return optim.Adam(x, lr=1e-2)

    evaluation_params = {
        'eval_mdp': mdp,
        'eval_every_n': 200,
        'eval_n_episodes': 1,
        'eval_initial_state': np.array([-1., 0., 0.]),  # or None
        'eval_transform_to_internal_state': lambda x: (math.atan2(x[1], x[0]), x[2]),  # or None
        'eval_render': False
    }
    nopg.fit(n_policy_updates=n_policy_updates, optimizer=optimizer, **evaluation_params)
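
    # NOTE: besides the KL-based schedule used above, the options shown in
    # test_nopg_d_pendulum_uniform below also include a simpler fixed top-k
    # sparsification of the transition model:
    # 'sparsify_P': {'P_sparse_k': 10}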
def test_nopg_s_pendulum_behavioral_sample_P(self):
    ##########################################################################################
    # Create the Environment (MDP)
    env = gym.make('Pendulum-v1')
    mdp = MDP(env)

    ##########################################################################################
    # Gather an Off-Policy Dataset
    sampling_params = {
        'sampling_type': 'behavioral',
        'transform_to_internal_state': lambda x: (math.atan2(x[1], x[0]), x[2]),
        'initial_state': np.array([1., 0., 0.], dtype=NP_DTYPE),  # theta=0., theta_dot=0.
        'policy': policy_gmm,
        'n_samples': 100,  # n_trajectories: 2
        'max_ep_transitions': 500
    }
    dataset = mdp.get_samples(**sampling_params)
    dataset.update_dataset_internal()

    # Compute Kernel Bandwidths (state, action, state next)
    s_band_factor = [10., 10., 1.]  # Pendulum behavioral
    s_n_band_factor = s_band_factor
    a_band_factor = [10.]  # Pendulum behavioral
    dataset.kde_bandwidths_internal(s_band_factor=s_band_factor, a_band_factor=a_band_factor,
                                    s_n_band_factor=s_n_band_factor)

    ##########################################################################################
    # Define the Policy Network
    policy_params = {
        'policy_class': 'stochastic',
        'neurons': [mdp.s_dim, 10, mdp.a_dim],  # [state_dim, hidden1, ..., hiddenN, action_dim]
        'activations': [nn.functional.relu],  # one activation function per hidden layer
        # two output heads for a stochastic policy: mean and diagonal covariance
        'f_out': [lambda x: 2.0 * torch.tanh(x), lambda x: 2.0 * torch.sigmoid(x)],
        'device': DEVICE
    }
    policy = Policy(**policy_params).to(device=DEVICE, dtype=TORCH_DTYPE)

    ##########################################################################################
    # Optimize the policy with NOPG
    nopg_params = {
        'initial_states': np.array([-1., 0., 0.]).reshape((-1, 3)),
        'gamma': 0.97,
        # number of Monte Carlo samples drawn from the transition model and from
        # the stochastic policy, respectively (inferred from the parameter names)
        'MC_samples_P': 2,
        'MC_samples_stochastic_policy': 2
    }
    nopg = NOPG(dataset, policy, **nopg_params)

    n_policy_updates = 10

    def optimizer(x):
        return optim.Adam(x, lr=1e-2)

    evaluation_params = {
        'eval_mdp': mdp,
        'eval_every_n': 200,
        'eval_n_episodes': 1,
        'eval_initial_state': np.array([-1., 0., 0.]),  # or None
        'eval_transform_to_internal_state': lambda x: (math.atan2(x[1], x[0]), x[2]),  # or None
        'eval_render': False
    }
    nopg.fit(n_policy_updates=n_policy_updates, optimizer=optimizer, **evaluation_params)
def test_nopg_d_pendulum_uniform(self):
    ##########################################################################################
    # Create the Environment (MDP)
    env = gym.make('Pendulum-v1')
    mdp = MDP(env)

    ##########################################################################################
    # Gather an Off-Policy Dataset
    states = mdp.discretize_space(space='state', levels=[10, 10])  # theta, theta_dot
    actions = mdp.discretize_space(space='action', levels=[2])
    sampling_params = {
        'sampling_type': 'uniform',
        'states': states,
        'actions': actions,
        'transform_to_internal_state': lambda x: (math.atan2(x[1], x[0]), x[2]),
        'render': False
    }
    # sampling_params = {
    #     'sampling_type': 'behavioral',
    #     'transform_to_internal_state': lambda x: (math.atan2(x[1], x[0]), x[2]),
    #     'initial_state': np.array([1., 0., 0.], dtype=NP_DTYPE),  # theta=0., theta_dot=0.
    #     'policy': policy_gmm,
    #     'n_samples': 1000,  # n_trajectories: 2
    #     'max_ep_transitions': 500
    # }
    dataset = mdp.get_samples(**sampling_params)
    dataset.update_dataset_internal()

    # Compute Kernel Bandwidths (state, action, state next)
    s_band_factor = [1., 1., 1.]  # Pendulum uniform
    # s_band_factor = [10., 10., 1.]  # Pendulum behavioral
    s_n_band_factor = s_band_factor
    a_band_factor = [50.]  # Pendulum uniform
    # a_band_factor = [10.]  # Pendulum behavioral
    dataset.kde_bandwidths_internal(s_band_factor=s_band_factor, a_band_factor=a_band_factor,
                                    s_n_band_factor=s_n_band_factor)

    ##########################################################################################
    # Define the Policy Network
    policy_params = {
        'policy_class': 'deterministic',
        'neurons': [mdp.s_dim, 10, mdp.a_dim],  # [state_dim, hidden1, ..., hiddenN, action_dim]
        'activations': [nn.functional.relu],  # one activation function per hidden layer
        'f_out': [lambda x: 2.0 * torch.tanh(x)],
        'device': DEVICE
        # NOTE: for a stochastic policy, f_out is a list with 2 entries (mean and diagonal covariance)
        # 'policy_class': 'stochastic',
        # 'f_out': [lambda x: 2.0 * torch.tanh(x), lambda x: 2.0 * torch.sigmoid(x)]
    }
    policy = Policy(**policy_params).to(device=DEVICE, dtype=TORCH_DTYPE)

    ##########################################################################################
    # Optimize the policy with NOPG
    nopg_params = {
        'initial_states': np.array([-1., 0., 0.]).reshape((-1, 3)),
        # 'initial_states': np.array([env.reset() for _ in range(20)]),  # for multiple initial states
        'gamma': 0.97
        # 'MC_samples_stochastic_policy': 15
        # 'MC_samples_P': 15
        # 'sparsify_P': {'P_sparse_k': 10}
        # 'sparsify_P': {'kl_max': 0.001, 'kl_interval_k': 20, 'kl_repeat_every_n_iterations': 200}
    }
    nopg = NOPG(dataset, policy, **nopg_params)

    n_policy_updates = 10

    def optimizer(x):
        return optim.Adam(x, lr=1e-2)

    evaluation_params = {
        'eval_mdp': mdp,
        'eval_every_n': 200,
        'eval_n_episodes': 1,
        'eval_initial_state': np.array([-1., 0., 0.]),  # or None
        'eval_transform_to_internal_state': lambda x: (math.atan2(x[1], x[0]), x[2]),  # or None
        'eval_render': False
    }
    nopg.fit(n_policy_updates=n_policy_updates, optimizer=optimizer, **evaluation_params)
import os

import gym
import torch
from gym.envs.registration import register

# Use the GPU if available; if its memory is insufficient, use only the CPU
# DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE = torch.device('cpu')

##########################################################################################
# Create the Environment (MDP)
register(id='Qube-100-v1',
         entry_point='quanser_robots.qube.qube:Qube',
         max_episode_steps=1500,
         kwargs={'fs': 200.0, 'fs_ctrl': 200.0})
env = gym.make('Qube-100-v1')
mdp = MDP(env)

##########################################################################################
# Gather an Off-Policy Dataset
results_dir = '/home/carvalho/Documents/projects/nopg/results/qube/nopgs/'
os.makedirs(results_dir, exist_ok=True)

# Load trajectories from file
filename = '/home/carvalho/Documents/projects/nopg/datasets/qube/15_trajectories.npy'
dataset = Dataset(results_dir=results_dir)
dataset.load_trajectories_from_file(filename, n_trajectories=8)
dataset.update_dataset_internal()

# Compute Kernel Bandwidths (state, action, state next)
s_band_factor = [15., 15., 15., 15., 1., 1.]
s_n_band_factor = s_band_factor
a_band_factor = [7.]
dataset.kde_bandwidths_internal(s_band_factor=s_band_factor, a_band_factor=a_band_factor,
                                s_n_band_factor=s_n_band_factor)
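
# The rest of this script would mirror the Pendulum/Cartpole examples above. A
# minimal sketch of the remaining steps (the network size, gamma, learning rate,
# and update count below are illustrative placeholders, not the values used in
# the repo):
policy_params = {
    'policy_class': 'deterministic',
    'neurons': [mdp.s_dim, 50, mdp.a_dim],
    'activations': [nn.functional.relu],
    'f_out': [lambda x: 5.0 * torch.tanh(x)],
    'device': DEVICE
}
policy = Policy(**policy_params).to(device=DEVICE, dtype=TORCH_DTYPE)

nopg_params = {
    'initial_states': np.array([env.reset() for _ in range(5)]),
    'gamma': 0.99
}
nopg = NOPG(dataset, policy, **nopg_params)
nopg.fit(n_policy_updates=100, optimizer=lambda x: optim.Adam(x, lr=1e-3))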