Example no. 1
import time

import gym
import numpy as np
import torch
from quanser_robots.common import GentlyTerminating

# Logger, SwingUpCtrl, MouseCtrl, Policy, load_config, print_config, plot_fig and
# save_fig are assumed to be provided by this project's own modules.


def get_env_and_controller(long_pendulum=True, simulation=False, swinging=False, mouse_control=False):
    pendulum_str = {True: "Long", False: "Short"}
    simulation_str = {True: "", False: "RR"}
    task_str = {True: "Swing", False: "Stab"}

    if not simulation:
        pendulum_str = {True: "", False: ""}

    mu = 7.5 if long_pendulum else 19.
    env_name = "Cartpole%s%s%s-v0" % (task_str[swinging], pendulum_str[long_pendulum], simulation_str[simulation])
    if not mouse_control:
        return Logger(GentlyTerminating(gym.make(env_name))), SwingUpCtrl(long=long_pendulum)
    else:
        return Logger(GentlyTerminating(gym.make(env_name))), MouseCtrl()
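# Usage sketch (an assumption about the returned objects, not code from the project):
#   env, ctrl = get_env_and_controller(long_pendulum=True, simulation=True, swinging=True)
#   obs = env.reset()
#   act = ctrl(obs)          # SwingUpCtrl instances are typically called on the observation
#   obs, rew, done, info = env.step(act)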

def test():
    config_path = "config.yml"
    print_config(config_path)
    config = load_config(config_path)
    training_config = config["training_config"]
    config["model_config"]["load_model"] = True

    env_id = "DoublePendulum-v0"
    env = GentlyTerminating(gym.make(env_id))

    n_episodes = 10
    max_episode_step = 10000
    print("*********************************************")
    print(
        "Testing the model for 10 episodes with 10000 maximum steps per episode"
    )
    print("*********************************************")

    policy = Policy(env, config)

    losses = []
    all_rewards = []
    avg_rewards = []
    epsilons = []
    for i_episode in range(n_episodes):
        episode_reward = 0
        state = env.reset()
        state[4] /= 10
        epsilon = 0
        epsilons.append(epsilon)
        for step in range(max_episode_step):
            env.render()
            time.sleep(0.01)
            action = policy.act(state, epsilon)

            f_action = 6 * (action - (policy.n_actions - 1) / 2) / (
                (policy.n_actions - 1) / 2)
            next_state, reward, done, _ = env.step(f_action)
            reward = 10 * reward
            next_state[4] /= 10

            policy.replay_buffer.push(state, action[0], reward, next_state,
                                      done)

            state = next_state
            episode_reward += reward

            if done:
                break
        print(" episode: %s, episode reward: %s" % (i_episode, episode_reward))
        all_rewards.append(episode_reward)
        avg_rewards.append(np.mean(all_rewards[-3:]))

    env.close()
    plot_fig(n_episodes, all_rewards, avg_rewards, losses)
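# Both test() and train() rescale the discrete action index returned by the policy
# into a symmetric continuous command (scale 6 above, scale 5 in train()).  A small
# stand-alone sketch of that mapping; the names here are illustrative, not the project's:
def discrete_to_continuous(action_idx, n_actions, max_cmd):
    """Map an index in {0, ..., n_actions - 1} to a command in [-max_cmd, max_cmd]."""
    mid = (n_actions - 1) / 2
    return max_cmd * (action_idx - mid) / mid
# e.g. with n_actions=5 and max_cmd=6, indices 0..4 map to -6, -3, 0, 3, 6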

def train():
    '''Load the configuration settings'''
    config_path = "config.yml"
    print_config(config_path)
    config = load_config(config_path)
    training_config = config["training_config"]
    seed = training_config["random_seed"]
    n_episodes = training_config["n_episodes"]
    max_episode_step = training_config["max_episode_step"]
    n_update_target = training_config["n_update_target"]
    exp_number = training_config["exp_number"]
    save_model_path = training_config["save_model_path"]
    render_flag = training_config["render"]
    save_best = training_config["save_best"]
    '''Use a fixed epsilon or an exponential decay schedule?'''
    if training_config["use_fix_epsilon"]:
        epsilon_by_frame = lambda frame_idx: training_config["fix_epsilon"]
    else:
        epsilon_start = training_config["epsilon_start"]
        epsilon_final = training_config["epsilon_final"]
        epsilon_decay = training_config["epsilon_decay"]
        epsilon_by_frame = lambda frame_idx: epsilon_final + (
            epsilon_start - epsilon_final) * np.exp(-1. * frame_idx /
                                                    epsilon_decay)
    torch.manual_seed(seed)
    np.random.seed(seed)
    '''Environment initialization'''
    env_id = "Qube-v0"
    env = GentlyTerminating(gym.make(env_id))
    '''Initialize the DQN algorithm object'''
    policy = Policy(env, config)
    losses = []
    all_rewards = []
    avg_rewards = []
    epsilons = []
    '''Training the q-network with n episodes'''
    for i_episode in range(n_episodes):
        episode_reward = 0
        state = env.reset()
        state[4:6] /= 20
        epsilon = epsilon_by_frame(i_episode)
        epsilons.append(epsilon)
        for step in range(max_episode_step):
            if render_flag:
                env.render()
            '''Choose action'''
            action = policy.act(state, epsilon)
            f_action = 5 * (action - (policy.n_actions - 1) / 2) / (
                (policy.n_actions - 1) / 2)
            next_state, reward, done, _ = env.step(f_action)
            reward = 100 * reward
            next_state[4:6] /= 20
            policy.replay_buffer.push(state, action[0], reward, next_state,
                                      done)
            state = next_state
            episode_reward += reward

            if done:
                break

            if len(policy.replay_buffer) > policy.batch_size:
                loss = policy.train()
                losses.append(loss.item())

        all_rewards.append(episode_reward)
        avg_rewards.append(np.mean(all_rewards[-10:]))

        if i_episode % 50 == 0:
            '''Save the results figure every 50 episodes'''
            save_fig(i_episode, all_rewards, avg_rewards, losses, epsilons,
                     exp_number)

        if i_episode % n_update_target == 0:
            '''Update the target network'''
            policy.update_target()

        policy.save_model(save_model_path)
        if save_best and i_episode > 100:
            ratio = 1.1
            if episode_reward > ratio * np.mean(all_rewards[-10:]):
                print("Save model with episode reward %s " % (episode_reward))
                print("Model path: %s " % (save_model_path))
                break

    env.close()
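# Illustration of the exponential epsilon schedule used in train(); the constants
# below are placeholders, not the values stored in config.yml.
demo_epsilon = lambda frame_idx: 0.01 + (1.0 - 0.01) * np.exp(-1. * frame_idx / 200)
# demo_epsilon(0) -> 1.0, demo_epsilon(200) -> ~0.374, demo_epsilon(1000) -> ~0.017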
Example no. 4
import argparse

import gym
import torch
from quanser_robots.common import GentlyTerminating

# Assumed minimal setup so this excerpt runs on its own; Policy and Value are taken
# to be this project's own network definitions.
parser = argparse.ArgumentParser(description='Training arguments')

parser.add_argument('--max-kl', type=float, default=1e-2, metavar='G',
                    help='max kl value (default: 1e-2)')
parser.add_argument('--damping', type=float, default=1e-1, metavar='G',
                    help='damping (default: 1e-1)')
parser.add_argument('--seed', type=int, default=543, metavar='N',
                    help='random seed (default: 543)')
parser.add_argument('--batch-size', type=int, default=15000, metavar='N',
                    help='batch size (default: 15000)')
parser.add_argument('--render', action='store_true',
                    help='render the environment')
parser.add_argument('--log-interval', type=int, default=1, metavar='N',
                    help='interval between training status logs (default: 1)')
args = parser.parse_args()

env_id = 'BallBalancerSim-v0'
env = GentlyTerminating(gym.make(env_id))

num_inputs = env.observation_space.shape[0]
num_actions = env.action_space.shape[0]

env.seed(args.seed)
torch.manual_seed(args.seed)

loadpretrained = False
if loadpretrained:
    print("load model")
    policy_net = torch.load("policynet.pth")
    value_net = torch.load("valuenet.pth")
else:
    policy_net = Policy(num_inputs, num_actions)
    value_net = Value(num_inputs)
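# Counterpart of the "loadpretrained" branch above: once trained, the networks can be
# stored under the same paths (a sketch assuming whole-module serialisation, which the
# torch.load calls above imply).
# torch.save(policy_net, "policynet.pth")
# torch.save(value_net, "valuenet.pth")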
Example no. 5
# coding: utf-8
import gym
import torch.utils.data as data
from dynamics import *
from controller import *
from utils import *
from quanser_robots.common import GentlyTerminating
import time

# datasets: numpy array, shape [sample number, input dimension]
# labels:   numpy array, shape [sample number, output dimension]
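# Illustration of the expected shapes (dimension names are placeholders, not the
# project's actual state/action sizes):
#   datasets = np.zeros((n_samples, input_dim))    # one row per recorded transition
#   labels   = np.zeros((n_samples, output_dim))   # one row of model targets per transition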

env_id = "Qube-100-v0"  # "CartPole-v0"
env = GentlyTerminating(gym.make(env_id))
config_path = "config.yml"
config = load_config(config_path)
print_config(config_path)

batchsize_list = []
total_rewardlist = []
for i in range(4):
    batchsize_list.append(config["training_config"]["batch_size"])
    model = DynamicModel(config)
    data_fac = DatasetFactory(env, config)
    data_fac.collect_random_dataset()
    loss = model.train(data_fac.random_trainset, data_fac.random_testset)
    mpc = MPC(env, config)
    rewards_list = []
    for itr in range(config["dataset_config"]["n_mpc_itrs"] // 2):
        t = time.time()
        print("**********************************************")
Example no. 6
# coding: utf-8
import gym
import torch.utils.data as data
from dynamics import *
from controller import *
from utils import *
from quanser_robots.common import GentlyTerminating
from quanser_robots.qube import Parameterized
from quanser_robots.qube import SwingUpCtrl
import matplotlib.pyplot as plt
import time

env = Parameterized(GentlyTerminating(gym.make('Qube-100-v0')))

# Show all adjustable physics parameters
print(env.params())


# env = GentlyTerminating(gym.make('BallBalancerSim-v0'))
# obs = env.reset()
# done = False
# while not done:
#     env.render()
#     act = env.action_space.sample()
#     obs, _, done, _ = env.step(act)
#
# env.close()
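# Sketch of a swing-up rollout with the imported SwingUpCtrl, mirroring the commented
# example above (the callable-controller interface is an assumption based on the
# quanser_robots examples, not code from this project):
# ctrl = SwingUpCtrl()
# obs = env.reset()
# done = False
# while not done:
#     env.render()
#     act = ctrl(obs)
#     obs, _, done, _ = env.step(act)
# env.close()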