def get_env_and_controller(long_pendulum=True, simulation=False, swinging=False, mouse_control=False):
    # Build the environment name from the task (swing-up vs. stabilization),
    # the pendulum length, and the simulation / real-robot ("RR") suffix.
    pendulum_str = {True: "Long", False: "Short"}
    simulation_str = {True: "", False: "RR"}
    task_str = {True: "Swing", False: "Stab"}

    if not simulation:
        # Real-robot environment names do not encode the pendulum length.
        pendulum_str = {True: "", False: ""}

    mu = 7.5 if long_pendulum else 19.  # depends on the pendulum length (unused in this function)
    env_name = "Cartpole%s%s%s-v0" % (task_str[swinging],
                                      pendulum_str[long_pendulum],
                                      simulation_str[simulation])

    if not mouse_control:
        return Logger(GentlyTerminating(gym.make(env_name))), SwingUpCtrl(long=long_pendulum)
    else:
        return Logger(GentlyTerminating(gym.make(env_name))), MouseCtrl()
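# Minimal usage sketch (an illustration, not part of the original module). It assumes
# the returned controller is callable on an observation and returns the action to
# apply, as in the quanser_robots examples, and that Logger forwards the standard
# gym API of the wrapped environment.
if __name__ == "__main__":
    env, ctrl = get_env_and_controller(long_pendulum=True, simulation=True, swinging=True)
    obs = env.reset()
    done = False
    while not done:
        env.render()
        act = ctrl(obs)                    # controller maps observation -> action
        obs, rwd, done, _ = env.step(act)
    env.close()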
def test():
    # Load the configuration and force loading of a pre-trained model.
    config_path = "config.yml"
    print_config(config_path)
    config = load_config(config_path)
    training_config = config["training_config"]
    config["model_config"]["load_model"] = True

    env_id = "DoublePendulum-v0"
    env = GentlyTerminating(gym.make(env_id))

    n_episodes = 10
    max_episode_step = 10000
    print("*********************************************")
    print("Testing the model for 10 episodes with 10000 maximum steps per episode")
    print("*********************************************")

    policy = Policy(env, config)

    losses = []
    all_rewards = []
    avg_rewards = []
    epsilons = []

    for i_episode in range(n_episodes):
        episode_reward = 0
        state = env.reset()
        state[4] /= 10  # scale this state entry, consistent with training
        epsilon = 0     # act greedily during testing
        epsilons.append(epsilon)

        for step in range(max_episode_step):
            env.render()
            time.sleep(0.01)
            action = policy.act(state, epsilon)
            # Map the discrete action index to a continuous command in [-6, 6].
            f_action = 6 * (action - (policy.n_actions - 1) / 2) / (
                (policy.n_actions - 1) / 2)
            next_state, reward, done, _ = env.step(f_action)
            reward = 10 * reward
            next_state[4] /= 10
            policy.replay_buffer.push(state, action[0], reward, next_state, done)
            state = next_state
            episode_reward += reward
            if done:
                break

        print("episode: %s, episode reward: %s" % (i_episode, episode_reward))
        all_rewards.append(episode_reward)
        avg_rewards.append(np.mean(all_rewards[-3:]))

    env.close()
    plot_fig(n_episodes, all_rewards, avg_rewards, losses)
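# The discrete-to-continuous action mapping used above is easy to misread, so here is
# a small self-contained check (an illustration, not part of the original file). With
# n_actions = 5 the indices 0..4 are mapped linearly onto [-6, 6]: 0 -> -6, 2 -> 0, 4 -> +6.
def discrete_to_continuous(action_idx, n_actions, max_cmd=6.0):
    """Linearly map an index in {0, ..., n_actions - 1} to [-max_cmd, +max_cmd]."""
    half = (n_actions - 1) / 2
    return max_cmd * (action_idx - half) / half

assert discrete_to_continuous(0, 5) == -6.0
assert discrete_to_continuous(2, 5) == 0.0
assert discrete_to_continuous(4, 5) == 6.0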
def train():
    # Load the configuration settings.
    config_path = "config.yml"
    print_config(config_path)
    config = load_config(config_path)
    training_config = config["training_config"]

    seed = training_config["random_seed"]
    n_episodes = training_config["n_episodes"]
    max_episode_step = training_config["max_episode_step"]
    n_update_target = training_config["n_update_target"]
    exp_number = training_config["exp_number"]
    save_model_path = training_config["save_model_path"]
    render_flag = training_config["render"]
    save_best = training_config["save_best"]

    # Use either a fixed epsilon or an exponentially decaying schedule.
    if training_config["use_fix_epsilon"]:
        epsilon_by_frame = lambda frame_idx: training_config["fix_epsilon"]
    else:
        epsilon_start = training_config["epsilon_start"]
        epsilon_final = training_config["epsilon_final"]
        epsilon_decay = training_config["epsilon_decay"]
        epsilon_by_frame = lambda frame_idx: epsilon_final + (
            epsilon_start - epsilon_final) * np.exp(-1. * frame_idx / epsilon_decay)

    torch.manual_seed(seed)
    np.random.seed(seed)

    # Environment initialization.
    env_id = "Qube-v0"
    env = GentlyTerminating(gym.make(env_id))

    # Initialize the DQN algorithm object.
    policy = Policy(env, config)

    losses = []
    all_rewards = []
    avg_rewards = []
    epsilons = []

    # Train the Q-network for n_episodes episodes.
    for i_episode in range(n_episodes):
        episode_reward = 0
        state = env.reset()
        state[4:6] /= 20  # scale the velocity entries
        epsilon = epsilon_by_frame(i_episode)
        epsilons.append(epsilon)

        for step in range(max_episode_step):
            if render_flag:
                env.render()
            # Choose a discrete action and map it to a continuous command in [-5, 5].
            action = policy.act(state, epsilon)
            f_action = 5 * (action - (policy.n_actions - 1) / 2) / (
                (policy.n_actions - 1) / 2)
            next_state, reward, done, _ = env.step(f_action)
            reward = 100 * reward
            next_state[4:6] /= 20
            policy.replay_buffer.push(state, action[0], reward, next_state, done)
            state = next_state
            episode_reward += reward
            if done:
                break
            if len(policy.replay_buffer) > policy.batch_size:
                loss = policy.train()
                losses.append(loss.item())

        all_rewards.append(episode_reward)
        avg_rewards.append(np.mean(all_rewards[-10:]))

        if i_episode % 50 == 0:
            # Save the results figure every 50 episodes.
            save_fig(i_episode, all_rewards, avg_rewards, losses, epsilons, exp_number)
        if i_episode % n_update_target == 0:
            # Update the target network.
            policy.update_target()

        policy.save_model(save_model_path)
        if save_best and i_episode > 100:
            ratio = 1.1
            if episode_reward > ratio * np.mean(all_rewards[-10:]):
                print("Save model with episode reward %s " % episode_reward)
                print("Model path: %s " % save_model_path)
                break

    env.close()
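# Sketch of the exponential epsilon schedule used above (an illustration with made-up
# numbers, not values from config.yml): epsilon starts near epsilon_start and decays
# towards epsilon_final with time constant epsilon_decay.
import numpy as np

epsilon_start, epsilon_final, epsilon_decay = 1.0, 0.01, 500
epsilon_by_frame = lambda frame_idx: epsilon_final + (
    epsilon_start - epsilon_final) * np.exp(-1. * frame_idx / epsilon_decay)

for frame in (0, 100, 500, 2000):
    print(frame, round(epsilon_by_frame(frame), 3))
# 0 -> 1.0, 100 -> ~0.82, 500 -> ~0.37, 2000 -> ~0.03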
parser.add_argument('--max-kl', type=float, default=1e-2, metavar='G',
                    help='max kl value (default: 1e-2)')
parser.add_argument('--damping', type=float, default=1e-1, metavar='G',
                    help='damping (default: 1e-1)')
parser.add_argument('--seed', type=int, default=543, metavar='N',
                    help='random seed (default: 543)')
parser.add_argument('--batch-size', type=int, default=15000, metavar='N',
                    help='number of transitions per update (default: 15000)')
parser.add_argument('--render', action='store_true',
                    help='render the environment')
parser.add_argument('--log-interval', type=int, default=1, metavar='N',
                    help='interval between training status logs (default: 1)')
args = parser.parse_args()

env_id = 'BallBalancerSim-v0'
env = GentlyTerminating(gym.make(env_id))

num_inputs = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

env.seed(args.seed)
torch.manual_seed(args.seed)

# Either resume from pre-trained networks or initialize new ones.
loadpretrained = False
if loadpretrained:
    print("load model")
    policy_net = torch.load("policynet.pth")
    value_net = torch.load("valuenet.pth")
else:
    policy_net = Policy(num_inputs, num_actions)
    value_net = Value(num_inputs)
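# The loadpretrained branch above expects whole-module checkpoints produced by
# torch.save (not state_dicts). A minimal checkpointing sketch (an illustration, not
# the original training loop) that writes files the branch can load back later:
def save_checkpoint(policy_net, value_net,
                    policy_path="policynet.pth", value_path="valuenet.pth"):
    # Persist the full modules so they can be restored with torch.load above.
    torch.save(policy_net, policy_path)
    torch.save(value_net, value_path)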
# coding: utf-8
import gym
import torch.utils.data as data
from dynamics import *
from controller import *
from utils import *
from quanser_robots.common import GentlyTerminating
import time

# datasets: numpy array, size: [sample number, input dimension]
# labels:   numpy array, size: [sample number, output dimension]

env_id = "Qube-100-v0"  # alternatively "CartPole-v0"
env = GentlyTerminating(gym.make(env_id))

config_path = "config.yml"
config = load_config(config_path)
print_config(config_path)

batchsize_list = []
total_rewardlist = []

for i in range(4):
    batchsize_list.append(config["training_config"]["batch_size"])
    # Learn a dynamics model from randomly collected data, then run MPC with it.
    model = DynamicModel(config)
    data_fac = DatasetFactory(env, config)
    data_fac.collect_random_dataset()
    loss = model.train(data_fac.random_trainset, data_fac.random_testset)

    mpc = MPC(env, config)
    rewards_list = []
    for itr in range(config["dataset_config"]["n_mpc_itrs"] // 2):
        t = time.time()
        print("**********************************************")
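# The header comment above describes datasets/labels as numpy arrays of shape
# [sample number, dimension]. A minimal sketch (an illustration; DatasetFactory's
# real internals are not shown here) of turning such arrays into mini-batches with
# the torch.utils.data import used in that file:
import numpy as np
import torch
import torch.utils.data as data

datasets = np.random.randn(128, 6).astype(np.float32)   # hypothetical model inputs
labels = np.random.randn(128, 4).astype(np.float32)     # hypothetical model targets
loader = data.DataLoader(
    data.TensorDataset(torch.from_numpy(datasets), torch.from_numpy(labels)),
    batch_size=32,
    shuffle=True)
for x_batch, y_batch in loader:
    pass  # e.g. one gradient step of the dynamics model per batch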
# coding: utf-8
import gym
import torch.utils.data as data
from dynamics import *
from controller import *
from utils import *
from quanser_robots.common import GentlyTerminating
from quanser_robots.qube import Parameterized
from quanser_robots.qube import SwingUpCtrl
import matplotlib.pyplot as plt
import time

env = Parameterized(GentlyTerminating(gym.make('Qube-100-v0')))

# Show all adjustable physics parameters
print(env.params())

# env = GentlyTerminating(gym.make('BallBalancerSim-v0'))
# obs = env.reset()
# done = False
# while not done:
#     env.render()
#     act = env.action_space.sample()
#     obs, _, done, _ = env.step(act)
#
# env.close()
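# matplotlib and SwingUpCtrl are imported above but not used in that snippet. A small
# sketch (an illustration, not part of the original file) that records a short random
# rollout and plots the first observation dimension; it assumes the standard gym API
# exposed by the wrapped environment.
obs_log = []
obs = env.reset()
for _ in range(200):
    act = env.action_space.sample()
    obs, rwd, done, _ = env.step(act)
    obs_log.append(obs[0])
    if done:
        break
env.close()

plt.plot(obs_log)
plt.xlabel("step")
plt.ylabel("obs[0]")
plt.show()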