def __init__(self, gym_env, seed: int = 0, horizon: int = None, clip: float = None):
    """
    :param gym_env: Name of the gym environment
    :type gym_env: str
    :param seed: The seed for the environment
    :type seed: int
    :param horizon: Maximal number of time steps in the simulation per rollout
    :type horizon: int or None
    :param clip: The maximal absolute value for the action, i.e. the actions will be clipped to [-clip, clip]
    :type clip: float or None
    """
    env = GentlyTerminating(gym.make(gym_env))
    self.__env = env
    self.__horizon = self.__env.spec.timestep_limit if horizon is None \
        else horizon
    self.act_low = self.__env.action_space.low \
        if clip is None else -np.ones(1) * clip
    self.act_high = self.__env.action_space.high \
        if clip is None else np.ones(1) * clip
    self.seed(seed)
    self.__name = gym_env
def run_rs(args=None):
    """
    Initializes random search with the given arguments. Default values are used if none are provided.

    :param args: parameter dictionary
    """
    parser = cmd_util.rs_args_parser()
    args = parser.parse_known_args(args)[0]
    env = GentlyTerminating(gym.make(args.env))
    rs_params = load_input_to_dict(args)

    if args.resume:
        if args.path is not None:
            rs_params = torch.load(args.path + '/hyper_params.pt')
            rs = RandomSearch(env, hyperparams=rs_params, path=args.path, resume_training=True)
        else:
            print("Path not provided")
            return

    if not args.resume:
        if args.path is None:
            path = os.path.dirname(os.path.abspath(__file__)) + '/data/' + args.alg + '-' + \
                   env.unwrapped.spec.id + '_' + \
                   datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        else:
            path = args.path
        checkpoint_path = path + '/checkpoint'
        best_policy_path = path + '/best_policy'
        os.makedirs(checkpoint_path)
        os.makedirs(best_policy_path)
        torch.save(rs_params, path + '/hyper_params.pt')
        with open(path + '/info.txt', 'w') as f:
            print(rs_params, file=f)
        rs = RandomSearch(env, hyperparams=rs_params, path=path)

    if args.alg == 'arsv1':
        print("Start training augmented random search v1")
        rs.ars_v1()
    elif args.alg == 'arsv1ff':
        print("Start training augmented random search v1 with random fourier features")
        rs.ars_v1_ff()
    elif args.alg == 'arsv2':
        print("Start training augmented random search v2")
        rs.ars_v2()
    else:
        print("Version not available")
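# Hedged usage sketch of run_rs(). The CLI flag names accepted by
# cmd_util.rs_args_parser() are assumed from the args.<name> accesses above
# and may differ in the actual project.
if __name__ == "__main__":
    run_rs(['--env', 'CartpoleSwingShort-v0', '--alg', 'arsv1'])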
def choose_environment(selection=0):
    if selection == 0:
        return gym.make('CartpoleSwingShort-v0')
    if selection == 1:
        return gym.make('Qube-v0')
    if selection == 2:
        return gym.make('Levitation-v1')
    if selection == 3:
        env = GentlyTerminating(gym.make('CartpoleSwingRR-v0'))
        env.action_space.high = np.array([6.0])
        env.action_space.low = np.array([-6.0])
        return env
    else:
        return gym.make('Pendulum-v0')
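# Hedged usage sketch of choose_environment(); the selection indices follow
# the mapping in the function above.
if __name__ == "__main__":
    env = choose_environment(selection=1)   # Qube-v0 simulation
    obs = env.reset()
    print(env.observation_space, env.action_space)
    env.close()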
def run(args=None):
    """
    Initializes a PPO object and starts training.

    :param args: arguments for PPO
    """
    parser = cmd_util.ppo_args_parser()
    args = parser.parse_known_args(args)[0]
    env = GentlyTerminating(gym.make(args.env))
    ppo_params = load_input_to_dict(args)

    if args.resume:
        if args.path is not None:
            resume_training(env, args.path)
        else:
            print("Path not provided, training not continued")

    if not args.resume:
        if args.path is None:
            path = os.path.dirname(os.path.abspath(__file__)) + '/data/ppo' + \
                   env.unwrapped.spec.id + '_' + \
                   datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        else:
            path = args.path
        checkpoint_path = path + '/checkpoint'
        best_policy_path = path + '/best_policy'
        os.makedirs(checkpoint_path)
        os.makedirs(best_policy_path)
        torch.save(ppo_params, path + '/hyper_params.pt')
        with open(path + '/info.txt', 'w') as f:
            print(ppo_params, file=f)
        ppo = PPO(env, hyper_params=ppo_params, path=path)
        ppo.run_ppo()
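# Hedged usage sketch of run(); the flag names accepted by
# cmd_util.ppo_args_parser() are assumed from the args.<name> accesses above.
if __name__ == "__main__":
    run(['--env', 'Qube-100-v0'])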
def main():
    env = GentlyTerminating(gym.make('CartpoleRR-v0'))
    print("\n\nMetronome Example:")
    ctrl = MetronomCtrl()

    print("\tCalibrate the System:", end="")
    obs = env.reset()
    print("\tDone")

    print("\tSwing Pendulum:", end="")
    while not ctrl.done:
        # env.render()
        act = ctrl(obs)
        obs, _, _, _ = env.step(act)
    print("\t\t\tDone")

    print("\tReset the System:", end="")
    obs = env.reset()
    print("\t\tDone")
    env.close()
import os

import gym
import torch
from torch.distributions import Normal

from quanser_robots import GentlyTerminating
# PPO is this project's implementation (see the PPO class below);
# the exact import path depends on the repository layout.

path = os.path.dirname(__file__)


def load_model(env, path):
    hyper_params = torch.load(path + '/hyper_params.pt', map_location='cpu')
    policy = PPO(env, path, hyper_params).ac_net
    checkpoint = torch.load(path + '/model/save_file.pt', map_location='cpu')
    policy.load_state_dict(checkpoint['model_state_dict'])
    return policy


if __name__ == "__main__":
    env = GentlyTerminating(gym.make('QubeRR-v0'))
    model = load_model(env=env, path=path)
    state = env.reset()
    done = False
    while not done:
        mean, _, _ = model(torch.FloatTensor(state))
        # deterministic evaluation: stddev 0, so the sampled action equals the mean
        dist = Normal(mean, 0)
        action = dist.sample().cpu().detach().numpy()
        state, reward, done, _ = env.step(action)
        env.render()
        print(state, action, reward)
    env.close()
# coding: utf-8
import argparse

from DQN import *
from quanser_robots import GentlyTerminating

plt.style.use('seaborn')

env = GentlyTerminating(gym.make('QubeRR-v0'))

config_path = "config.yml"
print_config(config_path)
config = load_config(config_path)
training_config = config["training_config"]
config["model_config"]["load_model"] = True

n_episodes = 10
max_episode_step = 10000

print("*********************************************")
print("Testing the model for 10 episodes with 10000 maximum steps per episode")
print("*********************************************")

policy = Policy(env, config)

losses = []
all_rewards = []
avg_rewards = []
epsilons = []
s_all = []
a_all = []
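# The excerpt ends before the evaluation loop. A minimal, hedged sketch of how
# the prepared buffers might be filled; policy.act(state, epsilon) is a
# hypothetical interface, the real method name on the project's Policy class
# may differ.
for episode in range(n_episodes):
    state = env.reset()
    episode_reward = 0.0
    for step in range(max_episode_step):
        action = policy.act(state, epsilon=0.0)   # greedy action, no exploration
        next_state, reward, done, _ = env.step(action)
        s_all.append(state)
        a_all.append(action)
        episode_reward += reward
        state = next_state
        if done:
            break
    all_rewards.append(episode_reward)
    avg_rewards.append(sum(all_rewards) / len(all_rewards))
print("average test reward:", sum(all_rewards) / len(all_rewards))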
import numpy as np
import matplotlib.pyplot as plt
import gym
from quanser_robots import GentlyTerminating
from quanser_robots.ball_balancer.ctrl import QPDCtrl


if __name__ == "__main__":
    env = GentlyTerminating(gym.make('BallBalancerRR-v0'))
    ctrl = QPDCtrl()
    obs = env.reset()
    done = False
    obs_hist = [obs]
    act_hist = []
    rew_hist = []
    while not done:
        env.render()
        act = ctrl(obs)
        obs, rew, done, _ = env.step(act)
        act_hist.append(act)
        obs_hist.append(obs)
        rew_hist.append(rew)
    env.close()

    # Visualization
    fig, axes = plt.subplots(6, 1, figsize=(6, 8), tight_layout=True)
    obs_hist = np.stack(obs_hist)
    act_hist = np.stack(act_hist)
    rew_hist = np.stack(rew_hist)
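    # The plotting of the stacked histories is cut off in this excerpt.
    # A minimal, hedged sketch of how the six axes might be filled; which
    # observation channels belong on which axis is not specified in the source.
    n_obs_axes = min(4, obs_hist.shape[1])
    for k in range(n_obs_axes):
        axes[k].plot(obs_hist[:, k])
        axes[k].set_ylabel("obs[{}]".format(k))
    axes[4].plot(act_hist)
    axes[4].set_ylabel("action")
    axes[5].plot(rew_hist)
    axes[5].set_ylabel("reward")
    axes[5].set_xlabel("time step")
    plt.show()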
""" The minimal program that shows the basic control loop on the simulated swing-up. """ import gym from quanser_robots import GentlyTerminating from quanser_robots.qube import SwingUpCtrl env = GentlyTerminating(gym.make('Qube-100-v0')) ctrl = SwingUpCtrl() obs = env.reset() done = False while not done: env.render() act = ctrl(obs) obs, _, done, _ = env.step(act) env.close()
parser.add_argument("-e", "--episodes", type=int, default=60, help="number of episodes, that shall be performed per TRPO step") parser.add_argument("--layers", type=int, default=[64, 64], nargs="+", help="dimensions of layers in policy network and eventually of the value network") parser.add_argument("--gae", action='store_true', help="shall general advantage estimation be used?") parser.add_argument("--lambd", type=float, default=0.9, help="Parameter for general advantage estimation") args = parser.parse_args() if args.save is not None: settings_file = open("settings/%s.txt" %args.save, "w+") settings_file.write(str(args.__dict__)) settings_file.close() plotter = LearningCurvePlotter(args.iterations, args.save) env = GentlyTerminating(gym.make(args.env)) # Load policy if args.load is not None: input = open("policies/%s.pkl" %args.load, "rb") data = pickle.load(input) policy = data.get("policy") else: policy = Policy(env.observation_space.shape[0], env.action_space.shape[0], args.layers) if args.gae: gae = GAE(args.gamma, args.lambd, env.observation_space.shape[0], args.layers) for i in range(args.iterations): print("Iteration ", i, ":")
import gym
from quanser_robots import GentlyTerminating
from lax.a2c_lax import learn

if __name__ == '__main__':
    seed = 42
    # env = gym.make('Pendulum-v0')
    # env = GentlyTerminating(gym.make('CartpoleStabShort-v0'))
    # env = GentlyTerminating(gym.make('Qube-100-v0'))
    env = GentlyTerminating(gym.make('CartpoleSwingShort-v0'))
    # env = GentlyTerminating(gym.make('LunarLanderContinuous-v2'))
    # env = GentlyTerminating(gym.make('BipedalWalker-v2'))
    # env = GentlyTerminating(gym.make('BipedalWalkerHardcore-v2'))
    # env = GentlyTerminating(gym.make('HalfCheetah-v3'))
    # env.unwrapped._dt = 0.01
    # env.unwrapped._sigma = 1e-4
    # env.spec._max_episode_steps = 100
    # env._max_episode_steps = 100

    learn(env, seed=seed, obfilter=True, total_steps=int(50e6), tsteps_per_batch=5000,
          cv_opt_epochs=5, lax=False, gamma=0.99, lamb=0.97, check_kl=True, animate=True,
          vf_opt_epochs=50, save_loc='evals')
""" This example shows how to change physics parameters upon environment reset. """ import gym from quanser_robots import GentlyTerminating from quanser_robots.qube import Parameterized env = Parameterized(GentlyTerminating(gym.make('Qube-100-v0'))) # Show all adjustable physics parameters print(env.params()) # Pass a dictionary of modified physics parameters upon environment reset env.reset({'g': 10.0}) print(env.params()) # only the provided parameters are modified # Upon reset, previous parameters are used and not the default ones env.reset({'Rm': 9.0}) print(env.params())
class PPO:
    def __init__(self, env, path, hyper_params, continue_training=False):
        """
        This class provides a PPO implementation

        :param env: gym environment
        :param path: path where to save checkpoints and results
        :param hyper_params: hyper parameters for PPO
        :param continue_training: whether to continue training or start anew
        """
        self.env = GentlyTerminating(env)
        self.path = path
        self.num_iterations = hyper_params['num_iterations']  # number of total training iterations
        self.lamb = hyper_params['lambda']  # lambda for generalized advantage estimate
        self.cliprange = hyper_params['cliprange']  # ppo cliprange of importance weights
        self.gamma = hyper_params['gamma']  # gamma for generalized advantage estimate
        self.ppo_epochs = hyper_params['ppo_epochs']  # number of ppo optimization epochs
        self.horizon = hyper_params['horizon']  # number of training samples per iteration
        self.minibatches = hyper_params['minibatches']  # minibatch size for ppo optimization
        self.vf_coef = hyper_params['vf_coef']  # value function coefficient
        self.entropy_coef = hyper_params['entropy_coef']  # entropy coefficient
        self.num_hidden_neurons = hyper_params['num_hidden_neurons']  # number of hidden neurons
        self.policy_std = hyper_params['policy_std']  # initial policy stddev
        self.lr = hyper_params['lr']  # learning rate
        self.max_grad_norm = hyper_params['max_grad_norm']  # maximum gradient norm for param update
        self.num_evals = hyper_params['num_evals']  # number of policy evaluations to compute expected reward
        self.eval_step = hyper_params['eval_step']  # policy gets evaluated after every eval_step

        self.num_inputs = self.env.observation_space.shape[0]
        self.num_outputs = self.env.action_space.shape[0]
        self.num_states = self.num_inputs
        self.cumulative_rollout_rewards = np.array([])
        self.cum_eval_rewards = np.array([])
        self.cum_eval_rewards_std = np.array([])
        self.entropy = np.array([])
        self.epoch = 0

        # initialize actor critic network
        self.ac_net = actor_critic.ActorCriticMLPShared(
            num_inputs=self.num_inputs,
            num_hidden_neurons=self.num_hidden_neurons,
            num_outputs=self.num_outputs,
            layer_norm=hyper_params['layer_norm'],
            std=self.policy_std)

        self.ac_optim = optim.Adam(self.ac_net.parameters(), lr=self.lr)

        if continue_training:
            self.ac_net, self.ac_optim, self.cumulative_rollout_rewards, \
                self.cum_eval_rewards, self.cum_eval_rewards_std, self.epoch, self.entropy = \
                model_handler.load_model(path=path, model=self.ac_net,
                                         optimizer=self.ac_optim,
                                         from_checkpoint=True)
            self.ac_net.train()

    def collect_trajectories(self):
        """
        Collects multiple trajectories limited by horizon

        :return: values, old_log_probs, actions, states, rewards, last_value, masks, entropy
        """
        # init arrays for data collection
        rewards = np.empty(shape=self.horizon)
        values = torch.empty(self.horizon)
        states = torch.empty(size=(self.horizon, self.num_states))
        actions = torch.empty(self.horizon, 1)
        masks = np.empty(self.horizon)
        old_log_probs = torch.empty(size=(self.horizon, 1))

        state = self.env.reset()
        cum_reward = 0
        for i in range(self.horizon):
            state = torch.FloatTensor(state)

            # sample action from normal distribution
            mean, std, value = self.ac_net(state)
            dist = Normal(mean, std)
            action = dist.sample()
            next_state, reward, done, info = self.env.step(action.cpu().detach().numpy()[0])

            # save values and rewards for gae
            log_prob = dist.log_prob(action)
            values[i] = value
            old_log_probs[i] = log_prob
            states[i] = state
            actions[i] = action
            state = next_state
            rewards[i] = reward
            masks[i] = 1 - done
            cum_reward += reward

            if done:
                state = self.env.reset()

        _, _, last_value = self.ac_net(torch.FloatTensor(next_state))
        last_value = last_value.detach()
        values = values.detach()
        entropy = dist.entropy().detach().numpy()[0][0]
        old_log_probs = old_log_probs.detach()

        return values, old_log_probs, actions, states, rewards, last_value, masks, entropy

    def ppo_update(self, advantage_estimates, states, actions, old_log_probs,
                   returns, cliprange=0.2):
        """
        Performs the proximal policy update over minibatches of the collected data

        :param advantage_estimates: computed advantage estimates
        :param states: collected states of a trajectory
        :param actions: collected actions of a trajectory
        :param old_log_probs: log probabilities of the actions under the old policy
        :param returns: discounted returns used as value targets
        :param cliprange: ppo cliprange of importance weights
        """
        randomized_inds = np.arange(self.horizon)

        # normalize advantages
        advantage_estimates = (advantage_estimates - advantage_estimates.mean()) / \
                              (advantage_estimates.std() + 1e-8)

        for k in range(self.ppo_epochs):
            # shuffle inputs every ppo epoch
            np.random.shuffle(randomized_inds)
            old_log_probs = old_log_probs[randomized_inds]
            actions = actions[randomized_inds]
            advantage_estimates = advantage_estimates[randomized_inds]
            states = states[randomized_inds]
            returns = returns[randomized_inds]

            for start in range(0, self.horizon, self.minibatches):
                end = start + self.minibatches

                mean, std, current_policy_value = self.ac_net(states[start:end])
                dist = Normal(mean, std)
                new_log_prob = dist.log_prob(actions[start:end])
                entropy = dist.entropy().mean()

                # importance weights
                ratio = torch.exp(new_log_prob - old_log_probs[start:end])
                advantage_batch = advantage_estimates[start:end]

                surr = ratio * advantage_batch
                clipped_surr = torch.clamp(ratio, 1 - cliprange, 1 + cliprange) * advantage_batch
                pg_loss = torch.min(surr, clipped_surr).mean()

                target_value = returns[start:end]
                vf_loss = ((current_policy_value - target_value).pow(2)).mean()

                loss = -(pg_loss - self.vf_coef * vf_loss + self.entropy_coef * entropy)

                self.ac_optim.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.ac_net.parameters(), self.max_grad_norm)
                self.ac_optim.step()

    def run_ppo(self):
        """
        Runs ppo and logs data
        """
        check_reward = 0
        for epoch in range(self.epoch, self.num_iterations + 1):
            # collect trajectory data
            values, old_log_probs, actions, states, \
                rewards, last_value, masks, entropy = self.collect_trajectories()

            # compute generalized advantages from trajectories
            advantage_est, returns = compute_gae(rewards, values, last_value, masks,
                                                 self.lamb, self.gamma)

            # interesting to check how the model behaves
            total_rollout_reward = rewards.sum()
            self.cumulative_rollout_rewards = np.append(
                self.cumulative_rollout_rewards, total_rollout_reward)
            self.entropy = np.append(self.entropy, entropy)

            # plotting and evaluating policy
            if epoch % self.eval_step == 0:
                check_reward = self.logger(check_reward, epoch)

            # actual ppo optimization
            self.ppo_update(advantage_est, states, actions, old_log_probs, returns,
                            cliprange=self.cliprange)

    def logger(self, check_reward, epoch):
        """
        Evaluates the current model and saves it if it is the best scoring one so far.

        :param check_reward: best evaluation reward seen so far
        :param epoch: current training epoch
        :return: updated best evaluation reward
        """
        eval_reward, eval_std = eval_policy(
            env=self.env,
            model=self.ac_net,
            num_evals=self.num_evals,
        )
        self.cum_eval_rewards = np.append(self.cum_eval_rewards, eval_reward)
        self.cum_eval_rewards_std = np.append(self.cum_eval_rewards_std, eval_std)

        plot_utility.plt_expected_cum_reward(self.path, self.cum_eval_rewards, self.eval_step)

        print("---------------------------------------------------")
        print("Expected cumulative reward: {} after {} epochs:".format(eval_reward, epoch))
        print("---------------------------------------------------")

        model_handler.save_model(model=self.ac_net,
                                 optimizer=self.ac_optim,
                                 train_rewards=self.cumulative_rollout_rewards,
                                 eval_rewards=self.cum_eval_rewards,
                                 eval_rewards_std=self.cum_eval_rewards_std,
                                 epoch=epoch,
                                 entropy=self.entropy,
                                 path=self.path + '/checkpoint')

        if check_reward < eval_reward:
            print("Found new high scoring model")
            check_reward = eval_reward
            model_handler.save_model(model=self.ac_net,
                                     optimizer=self.ac_optim,
                                     train_rewards=self.cumulative_rollout_rewards,
                                     eval_rewards=self.cum_eval_rewards,
                                     eval_rewards_std=self.cum_eval_rewards_std,
                                     epoch=epoch,
                                     entropy=self.entropy,
                                     path=self.path + '/best_policy')
        return check_reward
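# The helper compute_gae() used in run_ppo() is not part of this excerpt.
# A minimal sketch under the usual GAE(lambda) definition, matching the call
# signature above; the project's actual implementation may differ.
def compute_gae(rewards, values, last_value, masks, lamb, gamma):
    """Returns advantage estimates and value targets (returns) per time step."""
    horizon = len(rewards)
    advantages = torch.zeros(horizon)
    gae = 0.0
    next_value = last_value
    for t in reversed(range(horizon)):
        # delta_t = r_t + gamma * V(s_{t+1}) * mask_t - V(s_t)
        delta = rewards[t] + gamma * next_value * masks[t] - values[t]
        gae = delta + gamma * lamb * masks[t] * gae
        advantages[t] = gae
        next_value = values[t]
    returns = advantages + values
    return advantages, returns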
import os

import gym
import torch
from torch.distributions import Normal

from quanser_robots import GentlyTerminating
# PPO is this project's implementation; the exact import path depends on the
# repository layout.

path = os.path.dirname(__file__)


def load_model(env, path):
    hyper_params = torch.load(path + '/hyper_params.pt', map_location='cpu')
    policy = PPO(env, path, hyper_params).ac_net
    checkpoint = torch.load(path + '/model/save_file.pt', map_location='cpu')
    policy.load_state_dict(checkpoint['model_state_dict'])
    return policy


if __name__ == "__main__":
    env = GentlyTerminating(gym.make('CartpoleSwingShort-v0'))
    model = load_model(env=env, path=path)
    state = env.reset()
    done = False
    while not done:
        mean, _, _ = model(torch.FloatTensor(state))
        # deterministic evaluation: stddev 0, so the sampled action equals the mean
        dist = Normal(mean, 0)
        action = dist.sample().cpu().detach().numpy()
        state, reward, done, _ = env.step(action)
        env.render()
        print(state, action, reward)
    env.close()
# FuturaPend = Qube-v0
env_names = {
    0: "Levitation-v0",
    1: "CartpoleSwingShort-v0",
    2: "Qube-v0",
    3: "Pendulum-v2"
}
ENV_NAME = env_names[3]
sampling_type = "uniform"

print("Sampling env:")
print(ENV_NAME)

env = GentlyTerminating(gym.make(ENV_NAME))

print("Observation space:")
print(env.observation_space)
print("Low:")
print(env.observation_space.low)
print("High:")
print(env.observation_space.high)

print("Action space:")
print(env.action_space)
print("Low:")
print(env.action_space.low)
print("High:")
print(env.action_space.high)

states = []
actions = []
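# The excerpt ends before the sampling loop. A minimal, hedged sketch of
# uniform state/action sampling consistent with sampling_type = "uniform"
# above; the sample count (1000) is an assumption, not from the source.
n_samples = 1000
for _ in range(n_samples):
    states.append(env.observation_space.sample())
    actions.append(env.action_space.sample())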
class DDPG:
    def __init__(self, env, action_space_limits, dirname="out", buffer_size=10000, batch_size=64,
                 is_quanser_env=True, gamma=.99, tau=1e-2, steps=100000, warmup_samples=1000,
                 noise_decay=0.9, transform=lambda x: x, actor_lr=1e-3, critic_lr=1e-3,
                 lr_decay=1.0, lr_min=1.e-7, trial_horizon=5000, batch_norm=True,
                 actor_hidden_layers=[10, 10, 10], critic_hidden_layers=[10, 10, 10], device="cpu"):
        """
        DDPG algorithm implementation as in https://arxiv.org/abs/1509.02971

        param env: the gym environment to deal with
        param dirname: directory (existing or not) in which intermediate models will be saved
        param action_space_limits: sets a limit on the action space
        param buffer_size: the size of the replay buffer
        param batch_size: size of the batches sampled from the replay buffer during training
        param is_quanser_env: True if the given env is from quanser_robots, else False
        param gamma: discount factor of the expected return
        param tau: update factor from source to target network
        param steps: number of steps that will be performed during training time
        param warmup_samples: number of random samples placed into the replay buffer before
                              training the actor and critic networks
        param noise_decay: gaussian noise on actions is reduced multiplicatively by this factor
                           in every episode
        param transform: function to transform the observation space of the given environment
        param actor_lr: learning rate of the adam optimizer for the actor network
        param critic_lr: learning rate of the adam optimizer for the critic network
        param lr_decay: learning rate decay of the adam optimizers
        param lr_min: lower bound on the learning rate of the adam optimizers
        param trial_horizon: maximum number of steps to take per episode
        param actor_hidden_layers: hidden layers of the actor network as a numeric list
        param critic_hidden_layers: hidden layers of the critic network as a numeric list
        param device: device on which to train the torch networks, either cpu or gpu
        """
        self.device = device
        # algorithm timestamp
        self.started = datetime.datetime.now()
        self.env = env
        self.is_quanser_env = is_quanser_env
        self.dirname = dirname
        self.env_low = torch.tensor(action_space_limits[0], device=self.device, dtype=torch.float)
        self.env_high = torch.tensor(action_space_limits[1], device=self.device, dtype=torch.float)
        self.warmup_samples = warmup_samples
        self.total_steps = steps
        self.transformObservation = transform

        # replay buffer parameters + initialization
        self.buffer_size = buffer_size
        self.replayBuffer = ReplayBuffer(self.buffer_size, self.device)
        self.batch_size = batch_size
        self.n_batches = warmup_samples

        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]

        # optimizer parameters
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.lr_decay = lr_decay
        self.lr_min = lr_min

        # actor and critic parameters + initialization
        self.actor_hidden_layers = actor_hidden_layers
        self.critic_hidden_layers = critic_hidden_layers
        self.actor_network = ActorNetwork(
            [self.state_dim, *self.actor_hidden_layers, self.action_dim],
            torch.tensor(self.env_low[0], device=self.device, dtype=torch.float),
            torch.tensor(self.env_high[0], device=self.device, dtype=torch.float),
            batch_norm=batch_norm).to(self.device)
        self.critic_network = CriticNetwork(
            [self.state_dim + self.action_dim, *self.critic_hidden_layers, 1],
            batch_norm=batch_norm).to(self.device)
        self.actor_target = copy.deepcopy(self.actor_network).to(self.device)
        self.critic_target = copy.deepcopy(self.critic_network).to(self.device)

        # optimizer initialization
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=actor_lr)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=critic_lr)
        self.actor_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(self.actor_optim, lr_decay)
        self.critic_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(self.critic_optim, lr_decay)

        # training parameters
        self.loss = nn.MSELoss()
        self.noise_decay = torch.tensor(noise_decay, device=self.device, dtype=torch.float)
        self.trial_horizon = trial_horizon
        self.gamma = torch.tensor(gamma, device=self.device, dtype=torch.float)
        self.tau = torch.tensor(tau, device=self.device, dtype=torch.float)

        # gaussian noise on actions
        self.noise_torch = torch.distributions.normal.Normal(0, self.env_high[0])

    def action_selection(self, state):
        """
        Selects the best action according to the q-function for a given state

        param state: current state
        return: action with the highest q-value
        """
        with torch.no_grad():
            self.actor_network.eval()
            action = self.actor_network(state)
            self.actor_network.train()
        return action

    def soft_update(self, source, target):
        """
        Updates the weights of the given target network (nn.Module) towards the weights
        of the given source network (nn.Module)

        param source: nn.Module the weights are taken from
        param target: nn.Module whose weights are updated
        """
        for target_w, source_w in zip(target.parameters(), source.parameters()):
            target_w.data.copy_(
                (1.0 - self.tau) * target_w.data + self.tau * source_w.data
            )

    def update_actor(self, loss):
        """
        Updates the actor network with the given loss
        """
        # update actor
        self.actor_optim.zero_grad()
        loss.backward()
        self.actor_optim.step()

    def update_critic(self, loss):
        """
        Updates the critic network with the given loss
        """
        # update critic
        self.critic_optim.zero_grad()
        loss.backward(retain_graph=True)
        self.critic_optim.step()

    def forward_actor_network(self, network, state):
        """
        Forwards a state through either the target or the training ActorNetwork

        param network: either target or training ActorNetwork
        param state: state to forward through the network
        return: action for the environment step
        """
        state = torch.tensor(state, dtype=torch.float32).to(self.device).unsqueeze(0)
        action = network(state).squeeze()
        # dimensionality check of actions
        action = action.unsqueeze(0).cpu().detach().numpy() if action.dim() == 0 else action
        if self.is_quanser_env:
            action = np.array(action)
        return action

    def trial(self):
        """
        Tests the target actor in the environment

        return: average total reward
        """
        print("trial average total reward:")
        self.actor_target.eval()
        with torch.no_grad():
            episodes = 5
            average_reward = 0
            for episode in range(episodes):
                obs = self.env.reset()
                total_reward = 0
                for t in range(self.trial_horizon):
                    state = self.transformObservation(obs)
                    action = self.forward_actor_network(self.actor_target, state)
                    obs, reward, done, _ = self.env.step(action)
                    total_reward += reward
                    if done:
                        break
                # calculate average reward with incremental average
                average_reward += total_reward / episodes
            print(average_reward)
        self.actor_target.train()
        return average_reward

    def save_model(self, reward):
        """
        Saves the intermediate actor and critic target networks in the self.dirname directory

        param reward: used as part of the filename
        """
        if not os.path.exists(self.dirname):
            os.makedirs(self.dirname)
        torch.save(self.actor_target.state_dict(),
                   os.path.join(self.dirname, "actortarget_{}".format(reward)))
        torch.save(self.critic_target.state_dict(),
                   os.path.join(self.dirname, "critictarget_{}".format(reward)))

    def update(self):
        """
        Calculates the losses w.r.t. the DDPG paper https://arxiv.org/abs/1509.02971

        return: actor and critic loss
        """
        sample_batch = self.replayBuffer.sample_batch(self.batch_size)
        s_batch, a_batch, r_batch, s_2_batch, done_batch = sample_batch

        # calculate policy/actor loss
        actor_loss = self.critic_network(s_batch, self.actor_network(s_batch))
        actor_loss = -actor_loss.mean()

        # calculate value/critic loss
        next_action = self.actor_target(s_2_batch)
        critic_target_prediction = self.critic_target(s_2_batch, next_action)
        expected_critic = r_batch + self.gamma * (1. - done_batch) * critic_target_prediction

        critic_pred = self.critic_network(s_batch, a_batch)
        critic_loss = self.loss(critic_pred, expected_critic)

        return actor_loss, critic_loss

    def info_print(self, step, total_reward, reward_record):
        """
        Status print of this training session per episode
        """
        statusprint = "{} /{} | {:.0f} /{:.0f} | {} /{} | alr,clr: {:.2E} {:.2E}"
        print(statusprint.format(step, self.total_steps, total_reward, reward_record,
                                 self.replayBuffer.count, self.replayBuffer.buffer_size,
                                 self.actor_lr_scheduler.get_lr()[0],
                                 self.critic_lr_scheduler.get_lr()[0]))

    def train_rr(self):
        """
        A training session w.r.t. the training parameters in the real environment

        :return: reward trajectory of this training session as a list
        """
        self.env = GentlyTerminating(self.env)
        print("Training in real environment started...")
        reward_record = 0
        total_reward = 0
        episode = 0
        rew = []
        step = 0
        while step < self.total_steps:
            state = self.transformObservation(self.env.reset())
            done = False

            self.info_print(step, total_reward, reward_record)

            total_reward = 0
            i = 0
            while not done:
                action = self.action_selection(
                    torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)).squeeze()
                action = self.noise_torch.sample((self.action_dim,)) * self.noise_decay ** episode + action
                action = torch.clamp(action, min=self.env_low[0], max=self.env_high[0])
                action = action.to("cpu").detach().numpy()

                next_state, reward, done, _ = self.env.step(action)
                done = done or i >= self.trial_horizon
                next_state = self.transformObservation(next_state)

                total_reward += reward
                step += 1
                i = i + 1

                self.replayBuffer.add(state, action, reward, next_state, done)
                state = next_state

            # we do this at the end of every episode because it takes too much time between steps
            if self.replayBuffer.count >= self.n_batches:
                actor_loss, critic_loss = self.update()
                self.update_actor(actor_loss)
                self.update_critic(critic_loss)
                self.soft_update(self.actor_network, self.actor_target)
                self.soft_update(self.critic_network, self.critic_target)

            if self.replayBuffer.count >= self.n_batches:
                if self.critic_lr_scheduler.get_lr()[0] > self.lr_min:
                    self.critic_lr_scheduler.step()
                if self.actor_lr_scheduler.get_lr()[0] > self.lr_min:
                    self.actor_lr_scheduler.step()

            episode += 1

            # if our actor is really good, test the target actor. If the target actor is good too, save it.
            if reward_record < total_reward and total_reward > 50:
                trial_average_reward = self.trial()
                if trial_average_reward > reward_record:
                    print("New record")
                    reward_record = trial_average_reward
                    self.save_model(trial_average_reward)
            rew.append(total_reward)

        # test & save final model
        trial_average_reward = self.trial()
        self.save_model("{:.2f}_final".format(trial_average_reward))
        return rew

    def load_model(self, dirname):
        """
        Sets the actor network to a given model

        :param dirname: path of the policy that will be loaded
        """
        if not os.path.exists(os.path.join(dirname)):
            print("no model checkpoint found")
            return
        self.actor_network.load_state_dict(torch.load(os.path.join(dirname), map_location='cpu'))

    def trial_sim(self, episodes):
        """
        A trial with the given number of episodes in the simulated environment

        :return: reward trajectory as a list
        """
        rew = []
        self.actor_network.eval()
        for step in range(episodes):
            done = False
            obs = self.env.reset()
            total_reward = 0
            i = 0
            while not done:
                state = obs
                action = self.forward_actor_network(self.actor_network, state)
                if step == 0:
                    self.env.render()
                obs, reward, done, _ = self.env.step(action)
                done = done or i >= self.trial_horizon - 1
                total_reward += reward
                i += 1
            rew.append(total_reward)
        self.actor_network.train()
        return rew

    def trial_rr(self, episodes):
        """
        A trial with the given number of episodes in the real environment

        :return: reward trajectory as a list
        """
        self.env = GentlyTerminating(self.env)
        return self.trial_sim(episodes)

    def train_sim(self):
        """
        A training session w.r.t. the training parameters in a simulated environment

        return: reward trajectory achieved during this training session
        """
        print("Training in simulation started...")
        reward_record = 0
        total_reward = 0
        episode = 0
        rew = []
        step = 0
        while step < self.total_steps:
            state = self.transformObservation(self.env.reset())
            done = False

            self.info_print(step, total_reward, reward_record)

            total_reward = 0
            i = 0
            while not done:
                action = self.action_selection(
                    torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)).squeeze()
                action = self.noise_torch.sample((self.action_dim,)) * self.noise_decay ** episode + action
                action = torch.clamp(action, min=self.env_low[0], max=self.env_high[0])
                action = action.to("cpu").detach().numpy()

                next_state, reward, done, _ = self.env.step(action)
                done = done or i >= self.trial_horizon
                next_state = self.transformObservation(next_state)

                total_reward += reward

                self.replayBuffer.add(state, action, reward, next_state, done)
                state = next_state

                if self.replayBuffer.count >= self.n_batches:
                    actor_loss, critic_loss = self.update()
                    self.update_actor(actor_loss)
                    self.update_critic(critic_loss)
                    self.soft_update(self.actor_network, self.actor_target)
                    self.soft_update(self.critic_network, self.critic_target)

                step += 1
                i = i + 1

            if self.replayBuffer.count >= self.n_batches:
                if self.critic_lr_scheduler.get_lr()[0] > self.lr_min:
                    self.critic_lr_scheduler.step()
                if self.actor_lr_scheduler.get_lr()[0] > self.lr_min:
                    self.actor_lr_scheduler.step()

            episode += 1

            # if our actor is really good, test the target actor. If the target actor is good too, save it.
            if reward_record < total_reward and total_reward > 50:
                trial_average_reward = self.trial()
                if trial_average_reward > reward_record:
                    print("New record")
                    reward_record = trial_average_reward
                    self.save_model(trial_average_reward)
            rew.append(total_reward)

        # test & save final model
        trial_average_reward = self.trial()
        self.save_model("{:.2f}_final".format(trial_average_reward))
        return rew
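# Hedged usage sketch of the DDPG class above; the environment choice and
# hyperparameter values here are placeholders, not the project's settings
# beyond the defaults in __init__.
if __name__ == "__main__":
    import gym
    from quanser_robots import GentlyTerminating

    env = GentlyTerminating(gym.make('CartpoleSwingShort-v0'))
    agent = DDPG(env,
                 action_space_limits=(env.action_space.low, env.action_space.high),
                 dirname="out/cartpole_ddpg",
                 steps=100000,
                 device="cpu")
    rewards = agent.train_sim()      # train in simulation
    print(agent.trial_sim(5))        # evaluate the trained actor for 5 episodes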
""" Analytic real-robot swing-up controller with trajectory visualization. """ import numpy as np import matplotlib.pyplot as plt import gym from quanser_robots import GentlyTerminating from quanser_robots.qube import SwingUpCtrl import time plt.style.use('seaborn') env = GentlyTerminating(gym.make('QubeRR-100-v0')) ctrl = SwingUpCtrl() obs = env.reset() s_all, a_all = [], [] done = False t0 = time.perf_counter() n = 0 while not done: env.render() act = ctrl(obs) obs, rwd, done, info = env.step(act) s_all.append(info['s']) a_all.append(info['a']) n += 1 t1 = time.perf_counter() print("freq = {}, time = {}".format(n / (t1-t0), t1-t0))