Example #1
 def trial_rr(self, episodes):
     """
     A trial with given episodes in the real environment
     :return: reward trajectory as a list
     """
     self.env = GentlyTerminating(self.env)
     return self.trial_sim(episodes)
Example #2
    def __init__(self,
                 gym_env,
                 seed: int = 0,
                 horizon: int = None,
                 clip: float = None):
        """
        :param gym_env: Name of the gym environment
        :type gym_env: str

        :param seed: The seed for the environment
        :type seed: int

        :param horizon: Number of maximal time steps in the simulation
            per roll out
        :type horizon: int or None

        :param clip: The maximal absolute value for the action,
            i.e. the actions will be clipped to [-clip, clip]
        :type clip: float or None
        """
        env = GentlyTerminating(gym.make(gym_env))
        self.__env = env
        self.__horizon = self.__env.spec.timestep_limit if horizon is None\
            else horizon
        self.act_low = self.__env.action_space.low \
            if clip is None else -np.ones(1) * clip
        self.act_high = self.__env.action_space.high \
            if clip is None else np.ones(1) * clip
        self.seed(seed)
        self.__name = gym_env
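A quick illustration of the clipping convention described in the docstring above; the clip value and the sample action below are purely illustrative and not part of the original example.
import numpy as np

clip = 5.0                                   # illustrative clip value
act_low = -np.ones(1) * clip                 # [-5.]
act_high = np.ones(1) * clip                 # [5.]
action = np.clip(np.array([7.3]), act_low, act_high)
print(action)                                # [5.], i.e. clipped into [-clip, clip]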
Example #3
def run_rs(args=None):
    """
    Initializes random search with the given arguments. Uses default values if not provided.

    :param args: parameter dictionary
    """
    parser = cmd_util.rs_args_parser()
    args = parser.parse_known_args(args)[0]
    env = GentlyTerminating(gym.make(args.env))
    rs_params = load_input_to_dict(args)

    if args.resume:
        if args.path is not None:
            rs_params = torch.load(args.path + '/hyper_params.pt')
            rs = RandomSearch(env, hyperparams=rs_params, path=args.path, resume_training=True)
        else:
            print("Path not provided")
            return

    if not args.resume:
        if args.path is None:
            path = os.path.dirname(os.path.abspath(__file__)) + '/data/'+args.alg + '-' + env.unwrapped.spec.id + '_' + \
                   datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        else:
            path = args.path

        checkpoint_path = path + '/checkpoint'
        best_policy_path = path + '/best_policy'

        os.makedirs(checkpoint_path)
        os.makedirs(best_policy_path)

        torch.save(rs_params, path + '/hyper_params.pt')

        with open(path + '/info.txt', 'w') as f:
            print(rs_params, file=f)

        rs = RandomSearch(env, hyperparams=rs_params, path=path)

    if args.alg == 'arsv1':
        print("Start training augmented random search v1")
        rs.ars_v1()

    elif args.alg == 'arsv1ff':
        print("Start training augmented random search v1 with random fourier features")
        rs.ars_v1_ff()
    elif args.alg == 'arsv2':
        print("Start training augmented random search v2")
        rs.ars_v2()
    else:
        print("Version not available")
Example #4
def choose_environment(selection=0):
    """Returns a gym environment according to the given selection index."""
    if selection == 0:
        return gym.make('CartpoleSwingShort-v0')
    elif selection == 1:
        return gym.make('Qube-v0')
    elif selection == 2:
        return gym.make('Levitation-v1')
    elif selection == 3:
        env = GentlyTerminating(gym.make('CartpoleSwingRR-v0'))
        env.action_space.high = np.array([6.0])
        env.action_space.low = np.array([-6.0])
        return env
    else:
        return gym.make('Pendulum-v0')
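A short usage sketch of the selector above; selection 3 returns the real-robot cartpole with the clamped action bounds.
env = choose_environment(3)   # real-robot cartpole wrapped in GentlyTerminating
print(env.action_space.low, env.action_space.high)   # [-6.] [6.]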
Example #5
def run(args=None):
	"""
	Initializes PPO object and starts training

	:param env: gym environment
	:param args: arguments for PPO
	"""
	parser = cmd_util.ppo_args_parser()
	args = parser.parse_known_args(args)[0]
	env = GentlyTerminating(gym.make(args.env))
	ppo_params = load_input_to_dict(args)


	if args.resume:
		if args.path is not None:
			resume_training(env, args.path)
		else:
			print("Path not provided, training not continued")

	if not args.resume:
		if args.path is None:
			path = os.path.dirname(os.path.abspath(__file__)) + '/data/ppo' + env.unwrapped.spec.id + '_' + \
				   datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
		else:
			path = args.path

		checkpoint_path = path + '/checkpoint'
		best_policy_path = path + '/best_policy'
		os.makedirs(checkpoint_path)
		os.makedirs(best_policy_path)
		torch.save(ppo_params, path+'/hyper_params.pt')

		with open(path+'/info.txt', 'w') as f:
			print(ppo_params, file=f)


		ppo = PPO(env, hyper_params=ppo_params, path=path)
		ppo.run_ppo()
Example #6
def main():
    env = GentlyTerminating(gym.make('CartpoleRR-v0'))

    print("\n\nMetronom Example:")
    ctrl = MetronomCtrl()

    print("\tCalibrate the System:", end="")
    obs = env.reset()
    print("\tDone")

    print("\tSwing Pendulum:", end="")
    while not ctrl.done:

        # env.render()
        act = ctrl(obs)
        obs, _, _, _ = env.step(act)

    print("\t\t\tDone")

    print("\tReset the System:", end="")
    obs = env.reset()
    print("\t\tDone")

    env.close()
Example #7
import os

import gym
import torch
from torch.distributions import Normal

from quanser_robots import GentlyTerminating
# PPO (the class shown in a later example) is assumed to be importable from the
# local project, e.g. from a module such as: from ppo import PPO

path = os.path.dirname(__file__)

def load_model(env, path):
    hyper_params = torch.load(path + '/hyper_params.pt', map_location='cpu')
    policy = PPO(env, path, hyper_params).ac_net
    checkpoint = torch.load(path + '/model/save_file.pt', map_location='cpu')
    policy.load_state_dict(checkpoint['model_state_dict'])
    return policy

if __name__ == "__main__":

    env = GentlyTerminating(gym.make('QubeRR-v0'))
    model = load_model(env=env, path=path)

    state = env.reset()
    done = False

    while not done:
        mean, _, _ = model(torch.FloatTensor(state))
        dist = Normal(mean, 0)
        action = dist.sample().cpu().detach().numpy()
        state, reward, done, _ = env.step(action)
        env.render()

        print(state, action, reward)

    env.close()
Example #8
# coding: utf-8

from DQN import *
import argparse
from quanser_robots import GentlyTerminating

plt.style.use('seaborn')
env = GentlyTerminating(gym.make('QubeRR-v0'))

config_path = "config.yml"
print_config(config_path)
config = load_config(config_path)
training_config = config["training_config"]
config["model_config"]["load_model"] = True

n_episodes = 10
max_episode_step = 10000
print("*********************************************")
print("Testing the model for 10 episodes with 10000 maximum steps per episode")
print("*********************************************")

policy = Policy(env, config)

losses = []
all_rewards = []
avg_rewards = []
epsilons = []

s_all = []
a_all = []
Example #9
import numpy as np
import matplotlib.pyplot as plt
import gym

from quanser_robots import GentlyTerminating
from quanser_robots.ball_balancer.ctrl import QPDCtrl

if __name__ == "__main__":
    env = GentlyTerminating(gym.make('BallBalancerRR-v0'))
    ctrl = QPDCtrl()

    obs = env.reset()
    done = False
    obs_hist = [obs]
    act_hist = []
    rew_hist = []

    while not done:
        env.render()
        act = ctrl(obs)
        obs, rew, done, _ = env.step(act)
        act_hist.append(act)
        obs_hist.append(obs)
        rew_hist.append(rew)
    env.close()

    # Visualization
    fig, axes = plt.subplots(6, 1, figsize=(6, 8), tight_layout=True)

    obs_hist = np.stack(obs_hist)
    act_hist = np.stack(act_hist)
    rew_hist = np.stack(rew_hist)
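    # The plotting code that fills the six axes is cut off above; the layout
    # below is an assumption, not the original figure.
    axes[0].plot(obs_hist)
    axes[0].set_ylabel('obs')
    axes[1].plot(act_hist)
    axes[1].set_ylabel('act')
    axes[2].plot(rew_hist)
    axes[2].set_ylabel('rew')
    axes[-1].set_xlabel('time step')
    plt.show()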
"""
The minimal program that shows the basic control loop on the simulated swing-up.
"""

import gym
from quanser_robots import GentlyTerminating
from quanser_robots.qube import SwingUpCtrl

env = GentlyTerminating(gym.make('Qube-100-v0'))

ctrl = SwingUpCtrl()
obs = env.reset()
done = False
while not done:
    env.render()
    act = ctrl(obs)
    obs, _, done, _ = env.step(act)

env.close()
Example #11
parser.add_argument("-e", "--episodes", type=int, default=60,
                    help="number of episodes, that shall be performed per TRPO step")
parser.add_argument("--layers", type=int, default=[64, 64], nargs="+",
                    help="dimensions of layers in policy network and eventually of the value network")
parser.add_argument("--gae", action='store_true',
                    help="shall general advantage estimation be used?")
parser.add_argument("--lambd", type=float, default=0.9,
                    help="Parameter for general advantage estimation")
args = parser.parse_args()
if args.save is not None:
    with open("settings/%s.txt" % args.save, "w+") as settings_file:
        settings_file.write(str(args.__dict__))

plotter = LearningCurvePlotter(args.iterations, args.save)
env = GentlyTerminating(gym.make(args.env))

# Load policy
if args.load is not None:
    with open("policies/%s.pkl" % args.load, "rb") as policy_file:
        data = pickle.load(policy_file)
    policy = data.get("policy")
else:
    policy = Policy(env.observation_space.shape[0], env.action_space.shape[0], args.layers)

if args.gae:
    gae = GAE(args.gamma, args.lambd, env.observation_space.shape[0], args.layers)

for i in range(args.iterations):
    print("Iteration ", i, ":")
Example #12
import gym
from quanser_robots import GentlyTerminating

from lax.a2c_lax import learn


if __name__ == '__main__':
    seed = 42
    # env = gym.make('Pendulum-v0')
    # env = GentlyTerminating(gym.make('CartpoleStabShort-v0'))
    # env = GentlyTerminating(gym.make('Qube-100-v0'))
    env = GentlyTerminating(gym.make('CartpoleSwingShort-v0'))
    # env = GentlyTerminating(gym.make('LunarLanderContinuous-v2'))
    # env = GentlyTerminating(gym.make('BipedalWalker-v2'))
    # env = GentlyTerminating(gym.make('BipedalWalkerHardcore-v2'))
    # env = GentlyTerminating(gym.make('HalfCheetah-v3'))

    # env.unwrapped._dt = 0.01
    # env.unwrapped._sigma = 1e-4
    # env.spec._max_episode_steps = 100
    # env._max_episode_steps = 100

    learn(env, seed=seed, obfilter=True, total_steps=int(50e6), tsteps_per_batch=5000, cv_opt_epochs=5, lax=False,
          gamma=0.99, lamb=0.97, check_kl=True, animate=True, vf_opt_epochs=50, save_loc='evals')
Example #13
"""
This example shows how to change physics parameters upon environment reset.
"""

import gym
from quanser_robots import GentlyTerminating
from quanser_robots.qube import Parameterized

env = Parameterized(GentlyTerminating(gym.make('Qube-100-v0')))

# Show all adjustable physics parameters
print(env.params())

# Pass a dictionary of modified physics parameters upon environment reset
env.reset({'g': 10.0})
print(env.params())  # only the provided parameters are modified

# Upon reset, previous parameters are used and not the default ones
env.reset({'Rm': 9.0})
print(env.params())
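A short sketch of how this interface could be used to vary a parameter across episodes, consistent with the env defined above; the parameter values below are illustrative.
import numpy as np

for g in np.linspace(9.0, 11.0, 5):
    env.reset({'g': float(g)})   # modified gravity for this episode
    print(env.params())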
Example #14
    def __init__(self, env, path, hyper_params, continue_training=False):
        """
        This class provides a PPO implementation

        :param env: gym environment
        :param path: path where to save checkpoints and results
        :param hyper_params: hyper parameters for PPO
        :param continue_training: whether to continue training or start anew
        """

        self.env = GentlyTerminating(env)
        self.path = path
        self.num_iterations = hyper_params[
            'num_iterations']  # number of total training iterations
        self.lamb = hyper_params[
            'lambda']  # lambda for generalized advantage estimation
        self.cliprange = hyper_params[
            'cliprange']  # PPO clip range for the importance weights
        self.gamma = hyper_params[
            'gamma']  # discount factor for generalized advantage estimation
        self.ppo_epochs = hyper_params[
            'ppo_epochs']  # number of PPO optimization epochs
        self.horizon = hyper_params[
            'horizon']  # number of training samples per iteration
        self.minibatches = hyper_params[
            'minibatches']  # minibatch size for ppo optimization
        self.vf_coef = hyper_params['vf_coef']  # value function coefficient
        self.entropy_coef = hyper_params['entropy_coef']  # entropy coefficient
        self.num_hidden_neurons = hyper_params[
            'num_hidden_neurons']  # number of hidden neurons
        self.policy_std = hyper_params['policy_std']  # initial policy stddev
        self.lr = hyper_params['lr']  # learning rate
        self.max_grad_norm = hyper_params[
            'max_grad_norm']  # maximum gradient norm for param update
        self.num_evals = hyper_params[
            'num_evals']  # number of policy evaluations to compute expected reward
        self.eval_step = hyper_params[
            'eval_step']  # policy gets evaluated after every eval_step

        self.num_inputs = self.env.observation_space.shape[0]
        self.num_outputs = self.env.action_space.shape[0]
        self.num_states = self.num_inputs
        self.cumulative_rollout_rewards = np.array([])
        self.cum_eval_rewards = np.array([])
        self.cum_eval_rewards_std = np.array([])
        self.entropy = np.array([])
        self.epoch = 0

        # initialize actor critic network
        self.ac_net = actor_critic.ActorCriticMLPShared(
            num_inputs=self.num_inputs,
            num_hidden_neurons=self.num_hidden_neurons,
            num_outputs=self.num_outputs,
            layer_norm=hyper_params['layer_norm'],
            std=self.policy_std)

        self.ac_optim = optim.Adam(self.ac_net.parameters(), lr=self.lr)

        if continue_training:
            self.ac_net, self.ac_optim, self.cumulative_rollout_rewards, \
            self.cum_eval_rewards, self.cum_eval_rewards_std, self.epoch, self.entropy = \
                model_handler.load_model(path=path,
                                         model=self.ac_net,
                                         optimizer=self.ac_optim,
                                         from_checkpoint=True)
            self.ac_net.train()
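The constructor above reads a fixed set of keys from hyper_params; below is a sketch of such a dictionary, with the keys taken from the code and purely illustrative values.
# Illustrative values only; the keys match those read by __init__ above.
ppo_params = {
    'num_iterations': 1000,      # total training iterations
    'lambda': 0.95,              # GAE lambda
    'cliprange': 0.2,            # PPO clip range
    'gamma': 0.99,               # discount factor
    'ppo_epochs': 10,            # optimization epochs per iteration
    'horizon': 2048,             # samples collected per iteration
    'minibatches': 64,           # minibatch size
    'vf_coef': 0.5,              # value function loss coefficient
    'entropy_coef': 0.01,        # entropy bonus coefficient
    'num_hidden_neurons': 64,    # hidden layer width
    'policy_std': 1.0,           # initial policy standard deviation
    'lr': 3e-4,                  # learning rate
    'max_grad_norm': 0.5,        # gradient clipping norm
    'num_evals': 5,              # rollouts per evaluation
    'eval_step': 10,             # evaluate every eval_step epochs
    'layer_norm': True,          # use layer normalization in the network
}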
Example #15
class PPO():
    def __init__(self, env, path, hyper_params, continue_training=False):
        """
        This class provides a PPO implementation

        :param env: gym environment
        :param path: path where to save checkpoints and results
        :param hyper_params: hyper parameters for PPO
        :param continue_training: whether to continue training or start anew
        """

        self.env = GentlyTerminating(env)
        self.path = path
        self.num_iterations = hyper_params[
            'num_iterations']  # number of total training iterations
        self.lamb = hyper_params[
            'lambda']  # lambda for generalized advantage estimation
        self.cliprange = hyper_params[
            'cliprange']  # PPO clip range for the importance weights
        self.gamma = hyper_params[
            'gamma']  # discount factor for generalized advantage estimation
        self.ppo_epochs = hyper_params[
            'ppo_epochs']  # number of PPO optimization epochs
        self.horizon = hyper_params[
            'horizon']  # number of training samples per iteration
        self.minibatches = hyper_params[
            'minibatches']  # minibatch size for ppo optimization
        self.vf_coef = hyper_params['vf_coef']  # value function coefficient
        self.entropy_coef = hyper_params['entropy_coef']  # entropy coefficient
        self.num_hidden_neurons = hyper_params[
            'num_hidden_neurons']  # number of hidden neurons
        self.policy_std = hyper_params['policy_std']  # initial policy stddev
        self.lr = hyper_params['lr']  # learning rate
        self.max_grad_norm = hyper_params[
            'max_grad_norm']  # maximum gradient norm for param update
        self.num_evals = hyper_params[
            'num_evals']  # number of policy evaluations to compute expected reward
        self.eval_step = hyper_params[
            'eval_step']  # policy gets evaluated after every eval_step

        self.num_inputs = self.env.observation_space.shape[0]
        self.num_outputs = self.env.action_space.shape[0]
        self.num_states = self.num_inputs
        self.cumulative_rollout_rewards = np.array([])
        self.cum_eval_rewards = np.array([])
        self.cum_eval_rewards_std = np.array([])
        self.entropy = np.array([])
        self.epoch = 0

        # initialize actor critic network
        self.ac_net = actor_critic.ActorCriticMLPShared(
            num_inputs=self.num_inputs,
            num_hidden_neurons=self.num_hidden_neurons,
            num_outputs=self.num_outputs,
            layer_norm=hyper_params['layer_norm'],
            std=self.policy_std)

        self.ac_optim = optim.Adam(self.ac_net.parameters(), lr=self.lr)

        if continue_training:
            self.ac_net, self.ac_optim, self.cumulative_rollout_rewards, \
            self.cum_eval_rewards, self.cum_eval_rewards_std, self.epoch, self.entropy = \
                model_handler.load_model(path=path,
                                         model=self.ac_net,
                                         optimizer=self.ac_optim,
                                         from_checkpoint=True)
            self.ac_net.train()

    def collect_trajectories(self):
        """
        collects multiple trajectories limited by horizon

        :return: values,old_log_probs, actions, states, rewards, masks, entropy
        """
        # init arrays for data collection
        rewards = np.empty(shape=self.horizon)
        values = torch.empty(self.horizon)
        states = torch.empty(size=(self.horizon, self.num_states))
        actions = torch.empty(self.horizon, 1)
        masks = np.empty(self.horizon)
        old_log_probs = torch.empty(size=(self.horizon, 1))
        state = self.env.reset()
        cum_reward = 0

        for i in range(self.horizon):
            state = torch.FloatTensor(state)

            # sample action from a normal distribution
            mean, std, value = self.ac_net(state)
            dist = Normal(mean, std)
            action = dist.sample()

            next_state, reward, done, info = self.env.step(
                action.cpu().detach().numpy()[0])

            # save values and rewards for gae
            log_prob = dist.log_prob(action)
            values[i] = value
            old_log_probs[i] = log_prob
            states[i] = state
            actions[i] = action
            state = next_state
            rewards[i] = reward
            masks[i] = 1 - done
            cum_reward += reward

            if done:
                state = self.env.reset()

        _, _, last_value = self.ac_net(torch.FloatTensor(next_state))
        last_value = last_value.detach()
        values = values.detach()
        entropy = dist.entropy().detach().numpy()[0][0]
        old_log_probs = old_log_probs.detach()

        return values, old_log_probs, actions, states, rewards, last_value, masks, entropy

    def ppo_update(self,
                   advantage_estimates,
                   states,
                   actions,
                   old_log_probs,
                   returns,
                   cliprange=0.2):
        """
        This method performs proximal policy update over batches of inputs

        :param ppo_epochs: number of ppo optimization epochs per trajectory
        :param advantage_estimates: computed advantage estimates
        :param states: collected number of states of a trajectory
        :param actions: collected number of actions of a trajectory
        :param values: collected number of values of a trajectory
        :param old_log_probs: old log probabilities.
        :param actor_net: current actor network (samples policy)
        :param critic_net: current critic network (sample new values)
        :param minibatch_size: size of minibatches for each ppo epoch

        """
        randomized_inds = np.arange(self.horizon)

        # normalize advantages
        advantage_estimates = (advantage_estimates - advantage_estimates.mean()) / \
                              (advantage_estimates.std() + 1e-8)
        for k in range(self.ppo_epochs):
            # shuffle inputs every ppo epoch
            np.random.shuffle(randomized_inds)
            old_log_probs = old_log_probs[randomized_inds]
            actions = actions[randomized_inds]
            advantage_estimates = advantage_estimates[randomized_inds]
            states = states[randomized_inds]
            returns = returns[randomized_inds]

            for start in range(0, self.horizon, self.minibatches):
                end = start + self.minibatches
                mean, std, current_policy_value = self.ac_net(
                    states[start:end])

                dist = Normal(mean, std)
                new_log_prob = dist.log_prob(actions[start:end])
                entropy = dist.entropy().mean()

                # importance weights
                ratio = torch.exp(new_log_prob - old_log_probs[start:end])

                advantage_batch = advantage_estimates[start:end]

                surr = ratio * advantage_batch
                clipped_surr = torch.clamp(ratio, 1 - cliprange,
                                           1 + cliprange) * advantage_batch
                pg_loss = torch.min(surr, clipped_surr).mean()

                target_value = returns[start:end]
                vf_loss = ((current_policy_value - target_value).pow(2)).mean()

                loss = -(pg_loss - self.vf_coef * vf_loss +
                         self.entropy_coef * entropy)

                self.ac_optim.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(self.ac_net.parameters(),
                                               self.max_grad_norm)
                self.ac_optim.step()

    def run_ppo(self):
        """
        runs ppo and logs data

        """
        check_reward = 0
        for epoch in range(self.epoch, self.num_iterations + 1):

            # collect trajectory data
            values, old_log_probs, actions, states, \
            rewards, last_value, masks, entropy = self.collect_trajectories()

            # computes general advantages from trajectories
            advantage_est, returns = compute_gae(rewards, values, last_value,
                                                 masks, self.lamb, self.gamma)

            # interesting to check how model behaves
            total_rollout_reward = rewards.sum()

            self.cumulative_rollout_rewards = np.append(
                self.cumulative_rollout_rewards, total_rollout_reward)
            self.entropy = np.append(self.entropy, entropy)

            # plotting and evaluating policy
            if epoch % self.eval_step == 0:
                check_reward = self.logger(check_reward, epoch)

            # actual ppo optimization
            self.ppo_update(advantage_est,
                            states,
                            actions,
                            old_log_probs,
                            returns,
                            cliprange=self.cliprange)

    def logger(self, check_reward, epoch):
        """
        evaluates current model, checks if to save best scoring model.

        :param check_reward:
        :param epoch:
        :return:
        """
        eval_reward, eval_std = eval_policy(
            env=self.env,
            model=self.ac_net,
            num_evals=self.num_evals,
        )
        self.cum_eval_rewards = np.append(self.cum_eval_rewards, eval_reward)
        self.cum_eval_rewards_std = np.append(self.cum_eval_rewards_std,
                                              eval_std)
        plot_utility.plt_expected_cum_reward(self.path, self.cum_eval_rewards,
                                             self.eval_step)
        print("---------------------------------------------------")
        print("Expected cumulative reward: {} after {} epochs:".format(
            eval_reward, epoch))
        print("---------------------------------------------------")
        model_handler.save_model(model=self.ac_net,
                                 optimizer=self.ac_optim,
                                 train_rewards=self.cumulative_rollout_rewards,
                                 eval_rewards=self.cum_eval_rewards,
                                 eval_rewards_std=self.cum_eval_rewards_std,
                                 epoch=epoch,
                                 entropy=self.entropy,
                                 path=self.path + '/checkpoint')
        if check_reward < eval_reward:
            print("Found new high scoring model")
            check_reward = eval_reward
            model_handler.save_model(
                model=self.ac_net,
                optimizer=self.ac_optim,
                train_rewards=self.cumulative_rollout_rewards,
                eval_rewards=self.cum_eval_rewards,
                eval_rewards_std=self.cum_eval_rewards_std,
                epoch=epoch,
                entropy=self.entropy,
                path=self.path + '/best_policy')

        return check_reward
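run_ppo above calls a compute_gae helper that is not shown in these examples. Below is a minimal sketch of what such a function typically looks like, assuming the (rewards, values, last_value, masks, lamb, gamma) call signature used above and the standard generalized advantage estimation recursion; it is not the original helper.
import torch

def compute_gae(rewards, values, last_value, masks, lamb, gamma):
    """Sketch of generalized advantage estimation (assumed, not the original helper)."""
    horizon = len(rewards)
    advantages = torch.zeros(horizon)
    gae = 0.0
    next_value = last_value
    for t in reversed(range(horizon)):
        # TD residual; masks[t] == 0 at episode boundaries, which stops bootstrapping
        delta = rewards[t] + gamma * next_value * masks[t] - values[t]
        gae = delta + gamma * lamb * masks[t] * gae
        advantages[t] = gae
        next_value = values[t]
    returns = advantages + values
    return advantages, returns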
Example #16
import os

import gym
import torch
from torch.distributions import Normal

from quanser_robots import GentlyTerminating
# PPO (the class shown in an earlier example) is assumed to be importable from
# the local project, e.g. from a module such as: from ppo import PPO

path = os.path.dirname(__file__)


def load_model(env, path):
    hyper_params = torch.load(path + '/hyper_params.pt', map_location='cpu')
    policy = PPO(env, path, hyper_params).ac_net
    checkpoint = torch.load(path + '/model/save_file.pt', map_location='cpu')
    policy.load_state_dict(checkpoint['model_state_dict'])
    return policy


if __name__ == "__main__":

    env = GentlyTerminating(gym.make('CartpoleSwingShort-v0'))
    model = load_model(env=env, path=path)

    state = env.reset()
    done = False

    while not done:
        mean, _, _ = model(torch.FloatTensor(state))
        dist = Normal(mean, 0)
        action = dist.sample().cpu().detach().numpy()
        state, reward, done, _ = env.step(action)
        env.render()

        print(state, action, reward)

    env.close()
Example #17
# FuturaPend = Qube-v0
import gym
from quanser_robots import GentlyTerminating

env_names = {
    0: "Levitation-v0",
    1: "CartpoleSwingShort-v0",
    2: "Qube-v0",
    3: "Pendulum-v2"
}

ENV_NAME = env_names[3]
sampling_type = "uniform"

print("Sampling env:")
print(ENV_NAME)

env = GentlyTerminating(gym.make(ENV_NAME))
print("Observation space:")
print(env.observation_space)
print("Low:")
print(env.observation_space.low)
print("High:")
print(env.observation_space.high)
print("Action space:")
print(env.action_space)
print("Low:")
print(env.action_space.low)
print("High:")
print(env.action_space.high)

states = []
actions = []
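The sampling loop itself is cut off above; below is a minimal sketch of uniform sampling consistent with the variables defined in this example (the number of samples is an assumption).
n_samples = 1000  # assumption: the original sample count is not shown
obs = env.reset()
for _ in range(n_samples):
    action = env.action_space.sample()   # uniform within the action bounds
    states.append(obs)
    actions.append(action)
    obs, reward, done, _ = env.step(action)
    if done:
        obs = env.reset()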
Example #18
class DDPG:
    
    def __init__(self, env, action_space_limits, dirname="out", buffer_size=10000, batch_size=64, is_quanser_env=True,
                 gamma=.99, tau=1e-2, steps=100000, warmup_samples=1000, noise_decay=0.9,
                 transform=lambda x: x, actor_lr=1e-3, critic_lr=1e-3, lr_decay=1.0, lr_min=1.e-7, trial_horizon=5000,
                 batch_norm=True,
                 actor_hidden_layers=[10, 10, 10], critic_hidden_layers=[10, 10, 10], device="cpu"):
        """
        DDPG algorithm implementation as in https://arxiv.org/abs/1509.02971
        param env: the gym environment to deal with
        param dirname: non-existing or existing directory in which the calculated immediate models will be saved in
        param action_space_limits: sets a limit on action space
        param buffer_size: the size of the replay buffer 
        param batch_size: size of batches to learn with while training, extracted from replay buffer
        param is_quaner_env: True if given env is from quaner_robots, else false
        param gamma: interest rate of expected return
        param tau: update factor from source to target network
        param steps: number of steps that will be performed during training time
        param warmup_samples: number of random samples placed into replay buffer before training actor and critic network
        param noise_decay: gaussian noise on actions will be reduced multiplicative in every episode by this factor
        param transform: function to transform observation space of given environment
        param actor_lr: learning rate of adam optimizer for actor network
        param critic_lr: learning rate of adam optimizer for critic network
        param lr_decay: learning rate decay of adam optimizers
        param lr_min: lower bound of learning rate of adam_optimizers
        param trial_horizon: maximum steps to take per episode
        param actor_hidden_layers: hidden layers of actor network as a numeric list
        param critic_hidden_layers: hidden layers of critic network as a numeric list
        param device: on which device to train your torch nn.Models on either cpu or gpu
        """

        self.device = device
        # algorithm timestamp
        self.started = datetime.datetime.now()
        
        self.env = env
        self.is_quanser_env = is_quanser_env
        self.dirname = dirname
        self.env_low = torch.tensor(action_space_limits[0], device=self.device, dtype=torch.float)
        self.env_high = torch.tensor(action_space_limits[1], device=self.device, dtype=torch.float)
        self.warmup_samples = warmup_samples
        self.total_steps = steps
        self.transformObservation = transform

        # replay buffer parameters + initialization
        self.buffer_size = buffer_size
        self.replayBuffer = ReplayBuffer(self.buffer_size, self.device)
        self.batch_size = batch_size
        self.n_batches = warmup_samples

        self.state_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        
        # optimizer parameters
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.lr_decay = lr_decay
        self.lr_min = lr_min
  
        # actor and critic parameters + initialization
        self.actor_hidden_layers = actor_hidden_layers
        self.critic_hidden_layers = critic_hidden_layers
        self.actor_network = ActorNetwork([self.state_dim, *self.actor_hidden_layers, self.action_dim],
                                          torch.tensor(self.env_low[0], device=self.device, dtype=torch.float),
                                          torch.tensor(self.env_high[0], device=self.device, dtype=torch.float),
                                          batch_norm=batch_norm).to(self.device)
        self.critic_network = CriticNetwork([self.state_dim + self.action_dim, *self.critic_hidden_layers, 1], batch_norm=batch_norm).to(self.device)
        self.actor_target = copy.deepcopy(self.actor_network).to(self.device)
        self.critic_target = copy.deepcopy(self.critic_network).to(self.device)
        
        # optimizer initialization
        self.actor_optim = torch.optim.Adam(self.actor_network.parameters(), lr=actor_lr)
        self.critic_optim = torch.optim.Adam(self.critic_network.parameters(), lr=critic_lr)
        self.actor_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(self.actor_optim, lr_decay)
        self.critic_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR(self.critic_optim, lr_decay)
        
        # training parameters
        self.loss = nn.MSELoss()
        self.noise_decay = torch.tensor(noise_decay, device=self.device, dtype=torch.float)
        self.trial_horizon = trial_horizon
        self.gamma = torch.tensor(gamma, device=self.device, dtype=torch.float)
        self.tau = torch.tensor(tau, device=self.device, dtype=torch.float)
        # gaussian noise on actions used
        self.noise_torch = torch.distributions.normal.Normal(0, self.env_high[0])

    def action_selection(self, state):
        """
        Selects best action according to q-function for a given state
        param state: current state
        return: action with highest q-value
        """
        with torch.no_grad():
            self.actor_network.eval()
            action = self.actor_network(state)
            self.actor_network.train()
            return action

    def soft_update(self, source, target):
        """
        Updates the weights of given target network (nn.Module) by the weights of given source network (nn.Module) 
        param source: nn.Module which weights will be taken from
        param target: nn.Module which weights will be updated to
        """
        for target_w, source_w  in zip(target.parameters(), source.parameters()):
            target_w.data.copy_(
                (1.0 - self.tau) * target_w.data \
                + self.tau * source_w.data
            )

    def update_actor(self, loss):
        """
        Updates actor network by given calculated loss
        """
        # update actor
        self.actor_optim.zero_grad()
        loss.backward()
        self.actor_optim.step()

    def update_critic(self, loss):
        """
        Updates critic network by given calculated loss
        """
        # update critic
        self.critic_optim.zero_grad()
        loss.backward(retain_graph=True)
        self.critic_optim.step()
   
    def forward_actor_network(self, network, state):
        """
        Forwards state through either target or training ActorNetwork
        param network: either target or training ActorNetwork
        param state: state to forward through network
        return: action for environment step 
        """
        state = torch.tensor(state, dtype=torch.float32).to(self.device).unsqueeze(0)
        action = network(state).squeeze()
        # dimensionality check of actions
        action = action.unsqueeze(0).cpu().detach().numpy() if action.dim() == 0 else action
        if self.is_quanser_env:
            action = np.array(action)
        return action

    def trial(self):
        """
        Test the target actor in the environment
        return: average total reward
        """
        print("trial average total reward:")
        self.actor_target.eval()
        with torch.no_grad():
            episodes = 5
            average_reward = 0
            for episode in range(episodes):
                obs = self.env.reset()
                total_reward = 0
                for t in range(self.trial_horizon):
                    state = self.transformObservation(obs)
                    action = self.forward_actor_network(self.actor_target, state)
                    obs, reward, done, _ = self.env.step(action)
                    total_reward += reward
                    if done:
                        break
                # calculate average reward with incremental average
                average_reward += total_reward/episodes
        print(average_reward)
        self.actor_target.train()
        return average_reward

    def save_model(self, reward):
        """
        Saves the immediate actor and critic target network in given self.dirame directory
        param reward: will be displayed as filename
        """
        if not os.path.exists(self.dirname):
            os.makedirs(self.dirname)
        torch.save(self.actor_target.state_dict(), os.path.join(self.dirname, "actortarget_{}".format(reward)))
        torch.save(self.critic_target.state_dict(), os.path.join(self.dirname, "critictarget_{}".format(reward)))

    def update(self):
        """
        Calculating loss w.r.t. DDPG paper https://arxiv.org/abs/1509.02971
        return: actor and critic loss        
        """
        sample_batch = self.replayBuffer.sample_batch(self.batch_size)
        s_batch, a_batch, r_batch, s_2_batch, done_batch = sample_batch

        # calculate policy/actor loss
        actor_loss = self.critic_network(s_batch, self.actor_network(s_batch))
        actor_loss = - actor_loss.mean() 

        # calculate value/critic loss
        next_action = self.actor_target(s_2_batch)
        critic_target_prediction = self.critic_target(s_2_batch, next_action)
        expected_critic = r_batch + self.gamma * (1. - done_batch) * critic_target_prediction

        critic_pred = self.critic_network(s_batch, a_batch)
        critic_loss = self.loss(critic_pred, expected_critic)

        return actor_loss, critic_loss

    def info_print(self, step, total_reward, reward_record):
        """
        Status print of this training session per episode
        """
        statusprint = "{} /{} | {:.0f} /{:.0f} | {} /{} | alr,clr: {:.2E} {:.2E}"
        print(statusprint.format(step, self.total_steps, total_reward, reward_record, self.replayBuffer.count, self.replayBuffer.buffer_size, self.actor_lr_scheduler.get_lr()[0], self.critic_lr_scheduler.get_lr()[0]))

    def train_rr(self):
        """
        A training session w.r.t. training parameters in a real environment
        :return: list of per-episode total rewards for this training session
        """
        self.env = GentlyTerminating(self.env)
        print("Training in real environment started...")
        reward_record = 0
        total_reward = 0
        episode = 0
        rew = []
        step = 0
        while step < self.total_steps:
            state = self.transformObservation(self.env.reset())
            done = False

            self.info_print(step, total_reward, reward_record)

            total_reward = 0
            i = 0
            while not done:

                action = self.action_selection(
                    torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)).squeeze()

                action = self.noise_torch.sample((self.action_dim,)) * self.noise_decay ** episode + action

                action = torch.clamp(action, min=self.env_low[0], max=self.env_high[0])

                action = action.to("cpu").detach().numpy()
                next_state, reward, done, _ = self.env.step(action)
                done = done or i >= self.trial_horizon
                next_state = self.transformObservation(next_state)

                total_reward += reward

                step += 1
                i = i + 1

                self.replayBuffer.add(state, action, reward, next_state, done)
                state = next_state
            # we do this at the end of every episode because updating during the episode takes too much time
            if self.replayBuffer.count >= self.n_batches:
                actor_loss, critic_loss = self.update()

                self.update_actor(actor_loss)
                self.update_critic(critic_loss)

                self.soft_update(self.actor_network, self.actor_target)
                self.soft_update(self.critic_network, self.critic_target)

            if self.replayBuffer.count >= self.n_batches:
                if self.critic_lr_scheduler.get_lr()[0] > self.lr_min:
                    self.critic_lr_scheduler.step()
                if self.actor_lr_scheduler.get_lr()[0] > self.lr_min:
                    self.actor_lr_scheduler.step()
                episode += 1
                # if our actor is really good, test the target actor. If the target actor is good too, save it.
                if reward_record < total_reward and total_reward > 50:
                    trial_average_reward = self.trial()
                    if trial_average_reward > reward_record:
                        print("New record")
                        reward_record = trial_average_reward
                        self.save_model(trial_average_reward)
                rew.append(total_reward)
        # test & save final model
        trial_average_reward = self.trial()
        self.save_model("{:.2f}_final".format(trial_average_reward))

        return rew

    def load_model(self, dirname):
        """
        Setting actor network to given model
        :param dirname: specified policy that will be loaded
        :return:
        """
        if not os.path.exists(os.path.join(dirname)):
            print("no model checkoutpoint found")
            return
        self.actor_network.load_state_dict(torch.load(os.path.join(dirname), map_location='cpu'))

    def trial_sim(self, episodes):
        """
        A trial with given episodes in the simulated environment
        :return: reward trajectory as a list
        """
        rew = []
        self.actor_network.eval()
        for step in range(episodes):
            done = False
            obs = self.env.reset()

            total_reward = 0
            i = 0
            while not done:
                state = obs
                action = self.forward_actor_network(self.actor_network, state)
                if step == 0:
                    self.env.render()
                obs, reward, done, _ = self.env.step(action)

                done = done or i >= self.trial_horizon - 1
                total_reward += reward
                i += 1

            rew.append(total_reward)

        self.actor_network.train()
        return rew

    def trial_rr(self, episodes):
        """
        A trial with given episodes in the real environment
        :return: reward trajectory as a list
        """
        self.env = GentlyTerminating(self.env)
        return self.trial_sim(episodes)

    def train_sim(self):
        """
        A training session w.r.t. training parameters in a simulated environment
        :return: list of per-episode total rewards achieved during this training session
        """
        print("Training in simulation started...")
        reward_record = 0
        total_reward = 0
        episode = 0
        rew = []
        step = 0
        while step < self.total_steps:
            state = self.transformObservation(self.env.reset())
            done = False

            self.info_print(step, total_reward, reward_record)

            total_reward = 0
            i = 0
            while not done:

                action = self.action_selection(
                torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)).squeeze()

                action = self.noise_torch.sample((self.action_dim,)) * self.noise_decay ** episode + action

                action = torch.clamp(action, min=self.env_low[0], max=self.env_high[0])

                action = action.to("cpu").detach().numpy()
                next_state, reward, done, _ = self.env.step(action)
                done = done or i >= self.trial_horizon
                next_state = self.transformObservation(next_state)

                total_reward += reward

                self.replayBuffer.add(state, action, reward, next_state, done)
                state = next_state
                if self.replayBuffer.count >= self.n_batches:

                    actor_loss, critic_loss = self.update()

                    self.update_actor(actor_loss)
                    self.update_critic(critic_loss)

                    self.soft_update(self.actor_network, self.actor_target)
                    self.soft_update(self.critic_network, self.critic_target)

                step += 1
                i = i + 1
            if self.replayBuffer.count >= self.n_batches:
                if self.critic_lr_scheduler.get_lr()[0] > self.lr_min:
                    self.critic_lr_scheduler.step()
                if self.actor_lr_scheduler.get_lr()[0] > self.lr_min:
                    self.actor_lr_scheduler.step()
                episode += 1
                # if our actor is really good, test the target actor. If the target actor is good too, save it.
                if reward_record < total_reward and total_reward > 50:
                    trial_average_reward = self.trial()
                    if trial_average_reward > reward_record:
                        print("New record")
                        reward_record = trial_average_reward
                        self.save_model(trial_average_reward)
                rew.append(total_reward)

        # test & save final model
        trial_average_reward = self.trial()
        self.save_model("{:.2f}_final".format(trial_average_reward))

        return rew
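A hypothetical usage sketch of the DDPG class above; the environment, its import, and the chosen arguments are illustrative and not part of the original example.
import gym
from quanser_robots import GentlyTerminating

env = GentlyTerminating(gym.make('CartpoleSwingShort-v0'))   # illustrative choice
agent = DDPG(env,
             action_space_limits=(env.action_space.low, env.action_space.high),
             dirname="out",
             steps=100000)
rewards = agent.train_sim()   # returns the per-episode reward list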
Example #19
    def train_rr(self):
        """
        A training session w.r.t. training parameters in a real environment
        :return: list of per-episode total rewards for this training session
        """
        self.env = GentlyTerminating(self.env)
        print("Training in real environment started...")
        reward_record = 0
        total_reward = 0
        episode = 0
        rew = []
        step = 0
        while step < self.total_steps:
            state = self.transformObservation(self.env.reset())
            done = False

            self.info_print(step, total_reward, reward_record)

            total_reward = 0
            i = 0
            while not done:

                action = self.action_selection(
                    torch.tensor(state, dtype=torch.float32, device=self.device).unsqueeze(0)).squeeze()

                action = self.noise_torch.sample((self.action_dim,)) * self.noise_decay ** episode + action

                action = torch.clamp(action, min=self.env_low[0], max=self.env_high[0])

                action = action.to("cpu").detach().numpy()
                next_state, reward, done, _ = self.env.step(action)
                done = done or i >= self.trial_horizon
                next_state = self.transformObservation(next_state)

                total_reward += reward

                step += 1
                i = i + 1

                self.replayBuffer.add(state, action, reward, next_state, done)
                state = next_state
            # we do this at the end of every episode because updating during the episode takes too much time
            if self.replayBuffer.count >= self.n_batches:
                actor_loss, critic_loss = self.update()

                self.update_actor(actor_loss)
                self.update_critic(critic_loss)

                self.soft_update(self.actor_network, self.actor_target)
                self.soft_update(self.critic_network, self.critic_target)

            if self.replayBuffer.count >= self.n_batches:
                if self.critic_lr_scheduler.get_lr()[0] > self.lr_min:
                    self.critic_lr_scheduler.step()
                if self.actor_lr_scheduler.get_lr()[0] > self.lr_min:
                    self.actor_lr_scheduler.step()
                episode += 1
                # if our actor is really good, test the target actor. If the target actor is good too, save it.
                if reward_record < total_reward and total_reward > 50:
                    trial_average_reward = self.trial()
                    if trial_average_reward > reward_record:
                        print("New record")
                        reward_record = trial_average_reward
                        self.save_model(trial_average_reward)
                rew.append(total_reward)
        # test & save final model
        trial_average_reward = self.trial()
        self.save_model("{:.2f}_final".format(trial_average_reward))

        return rew
Example #20
"""
Analytic real-robot swing-up controller with trajectory visualization.
"""

import numpy as np
import matplotlib.pyplot as plt
import gym
from quanser_robots import GentlyTerminating
from quanser_robots.qube import SwingUpCtrl
import time

plt.style.use('seaborn')


env = GentlyTerminating(gym.make('QubeRR-100-v0'))

ctrl = SwingUpCtrl()
obs = env.reset()
s_all, a_all = [], []
done = False
t0 = time.perf_counter()
n = 0
while not done:
    env.render()
    act = ctrl(obs)
    obs, rwd, done, info = env.step(act)
    s_all.append(info['s'])
    a_all.append(info['a'])
    n += 1
t1 = time.perf_counter()
print("freq = {}, time = {}".format(n / (t1-t0), t1-t0))