Code example #1
def train_and_evaluate(args, monitor_path, checkpoint_step_filename,
                       checkpoint_weights_filename, weights_filename,
                       log_filename):

    env = gym.make(args["env_name"])
    env = Monitor(env,
                  monitor_path,
                  resume=True,
                  uid=args["run_id"],
                  video_callable=lambda episode_num:
                  episode_num % args["record_video_every"] == 0)
    np.random.seed(args["random_seed"])
    env.seed(args["random_seed"])
    starting_step = 0
    if os.path.exists(checkpoint_step_filename):
        with open(checkpoint_step_filename, 'r') as f:
            starting_step = int(f.read())
    args["starting_step"] = starting_step
    dqn = make_deep_q_network(env, args)
    if args["starting_step"] > 0:
        dqn.load_weights(checkpoint_weights_filename)

    callbacks = [
        ReloadModelIntervalCheckpoint(checkpoint_weights_filename,
                                      step_path=checkpoint_step_filename,
                                      interval=args["checkpoint_frequency"],
                                      starting_step=starting_step),
        MyTrainLogger(args["checkpoint_frequency"], args["training_steps"],
                      starting_step, log_filename)
    ]

    if args["mode"] == "Train":
        dqn.fit(env,
                callbacks=callbacks,
                verbose=0,
                nb_steps=args["training_steps"] - starting_step,
                nb_max_start_steps=args["strarting_fire_steps"],
                start_step_policy=lambda obs: 1)  # 1 is fire action

        dqn.save_weights(weights_filename, overwrite=True)
    else:
        dqn.load_weights(weights_filename)

    env = gym.make(args["env_name"])
    env = Monitor(env, monitor_path, resume=True, uid=args["run_id"] + "_test")
    np.random.seed(args["random_seed"])
    env.seed(args["random_seed"])
    dqn.test(env,
             nb_episodes=1,
             visualize=False,
             nb_max_start_steps=args["strarting_fire_steps"],
             start_step_policy=lambda obs: 1)  # 1 is fire action
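
# A hedged usage sketch (not part of the original snippet): the keys below are
# the ones train_and_evaluate reads above, but the concrete values, the
# environment id and the file names are illustrative assumptions.
if __name__ == "__main__":
    example_args = {
        "env_name": "BreakoutDeterministic-v4",  # assumed Atari env where action 1 is FIRE
        "run_id": "run_0",
        "record_video_every": 50,
        "random_seed": 123,
        "checkpoint_frequency": 10000,
        "training_steps": 1000000,
        "starting_fire_steps": 10,
        "mode": "Train",
    }
    train_and_evaluate(example_args,
                       monitor_path="monitor/",
                       checkpoint_step_filename="checkpoint_step.txt",
                       checkpoint_weights_filename="checkpoint_weights.h5f",
                       weights_filename="final_weights.h5f",
                       log_filename="training_log.txt")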
Code example #2
"""
Use this file to check that your implementation complies with our evaluation
interface.
"""

import gym
from gym.wrappers.monitor import Monitor
from challenge1 import get_model, get_policy

# 1. Learn the model f: s, a -> s', r
env = Monitor(gym.make('Pendulum-v0'),
              'training',
              video_callable=False,
              force=True)
env.seed(98251624)
max_num_samples = 10000
model = get_model(env, max_num_samples)
env.close()

# Your model will be tested on the quality of prediction
obs = env.reset()
act = env.action_space.sample()
nobs, rwd, _, _ = env.step(act)
nobs_pred, rwd_pred = model(obs, act)
print(f'truth = {nobs, rwd}\nmodel = {nobs_pred, rwd_pred}')
env.close()

# 2. Perform dynamic programming using the learned model
env = Monitor(gym.make('Pendulum-v0'), 'evaluation', force=True)
env.seed(31186490)
policy = get_policy(model, env.observation_space, env.action_space)
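
# A hedged rollout sketch (not part of the original template), assuming the
# policy returned by get_policy is a callable that maps an observation to an
# action compatible with env.action_space:
obs = env.reset()
done = False
ret = 0.0
while not done:
    obs, rwd, done, _ = env.step(policy(obs))
    ret += rwd
print(f'episode return = {ret}')
env.close()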
Code example #3
import gym, time
import numpy as np
from getModel import getModelQube, getModelPendel
from gym.wrappers.monitor import Monitor
from sklearn.neural_network import MLPRegressor
from challenge1_template import get_model, get_policy
from scipy import spatial

env = Monitor(gym.make('Pendulum-v0'),
              'training',
              video_callable=False,
              force=True)
env.seed(98251624)
max_num_samples = 10000
model = get_model(env, max_num_samples)

max_state = env.observation_space.high
min_state = env.observation_space.low
max_action = env.action_space.high
min_action = env.action_space.low
discrete_states = 100
discrete_actions = 4
discount_factor = 0.99
theta = 1


def discretizeSpace(min_state, max_state, discrete_num):
    # evenly spaced grid per state dimension (the original snippet breaks off
    # here; this completion is an assumption based on the surrounding code)
    discrete_space = []
    for i in range(len(max_state)):
        low = min_state[i]
        high = max_state[i]
        discrete_space.append(np.linspace(low, high, discrete_num))
    return discrete_space
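
# A hedged usage sketch (not part of the original snippet): assuming
# discretizeSpace returns one evenly spaced grid per dimension, the grids can
# be combined into the full tables of discrete states and actions that value
# iteration with discount_factor and theta would operate on.
import itertools

state_grid = discretizeSpace(min_state, max_state, discrete_states)
action_grid = discretizeSpace(min_action, max_action, discrete_actions)
states = np.array(list(itertools.product(*state_grid)))    # every combination of per-dimension values
actions = np.array(list(itertools.product(*action_grid)))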
Code example #4
def test(args,
         worker_id: int,
         global_model: torch.nn.Module,
         T: Value,
         global_reward: Value = None,
         optimizer: torch.optim.Optimizer = None,
         global_model_critic: CriticNetwork = None,
         optimizer_critic: torch.optim.Optimizer = None):
    """
    Start worker in _test mode, i.e. no training is done, only testing is used to validate current performance
    loosely based on https://github.com/ikostrikov/pytorch-a3c/blob/master/_test.py
    :param args: console arguments
    :param worker_id: id of worker to differentiatethem and init different seeds
    :param global_model: global model, which is optimized/ for split models: actor
    :param T: global counter of steps
    :param global_reward: global running reward value
    :param optimizer: optimizer for shared model/ for split models: actor model
    :param global_model_critic: optional global critic model for split networks
    :param optimizer_critic: optional critic optimizer for split networks
    :return: None
    """

    logging.info("test worker started.")
    torch.manual_seed(args.seed + worker_id)

    if "RR" in args.env_name:
        env = quanser_robots.GentlyTerminating(gym.make(args.env_name))
    else:
        if args.monitor:
            env = Monitor(gym.make(args.env_name),
                          '100_test_runs',
                          video_callable=lambda count: count % 100 == 0,
                          force=True)
        else:
            env = gym.make(args.env_name)

    env.seed(args.seed + worker_id)

    normalizer = get_normalizer(args.normalizer, env)

    # get an instance of the current global model state
    model = copy.deepcopy(global_model)
    model.eval()

    model_critic = None
    if global_model_critic:
        model_critic = copy.deepcopy(global_model_critic)
        model_critic.eval()

    state = torch.from_numpy(env.reset())

    writer = SummaryWriter(comment='_test', log_dir='experiments/runs/')
    start_time = time.time()

    t = 0
    episode_reward = 0

    done = False
    global_iter = 0
    best_global_reward = -np.inf
    best_test_reward = -np.inf

    while True:

        # Get params from shared global model
        model.load_state_dict(global_model.state_dict())
        if not args.shared_model:
            model_critic.load_state_dict(global_model_critic.state_dict())

        rewards = []
        eps_len = []

        sleep = True

        # run args.test_runs episodes to estimate the current average performance
        for i in range(args.test_runs):
            while not done:
                t += 1

                if not args.no_render:
                    if i == 0 and "RR" not in args.env_name:
                        env.render()
                        if args.monitor and sleep:  # add a small delay to do a screen capture of the test run if needed
                            time.sleep(1)
                            sleep = False

                # apply min/max scaling on the environment

                with torch.no_grad():

                    # select mean of normal dist as action --> Expectation
                    if args.shared_model:
                        _, mu, _ = model(normalizer(state))
                    else:
                        mu, _ = model(normalizer(state))

                    action = mu.detach()

                state, reward, done, _ = env.step(
                    np.clip(action.numpy(), -args.max_action, args.max_action))

                done = done or t >= args.max_episode_length
                episode_reward += reward

                if done:
                    # reset the cumulative episode reward and step counter as well as the env
                    rewards.append(episode_reward)
                    episode_reward = 0

                    eps_len.append(t)
                    t = 0

                    state = env.reset()

                state = torch.from_numpy(state)

            # reset the done flag so the next evaluation episode can start
            done = False

        time_print = time.strftime("%Hh %Mm %Ss",
                                   time.gmtime(time.time() - start_time))

        std_reward = np.std(rewards)
        rewards = np.mean(rewards)

        new_best = rewards > best_test_reward
        writer.add_scalar("reward/test", rewards, int(T.value))
        writer.add_scalar("episode/length", np.mean(eps_len), int(T.value))

        log_string = f"Time: {time_print}, T={T.value} -- n_runs={args.test_runs} -- mean total reward={rewards:.5f} " \
            f" +/- {std_reward:.5f} -- mean episode length={np.mean(eps_len):.5f}" \
            f" +/- {np.std(eps_len):.5f} -- global reward={global_reward.value:.5f}"

        if new_best:
            # log at info level to highlight that a new best test reward was reached
            logging.info(log_string)

            best_global_reward = global_reward.value if global_reward.value > best_global_reward else best_global_reward
            best_test_reward = rewards if rewards > best_test_reward else best_test_reward
            model_type = 'shared' if args.shared_model else 'split'

            save_checkpoint(
                {
                    'epoch': T.value,
                    'model': model.state_dict(),
                    'model_critic': model_critic.state_dict()
                    if model_critic is not None else None,
                    'global_reward': global_reward.value,
                    # only save optimizers if shared ones are used
                    'optimizer': optimizer.state_dict() if optimizer else None,
                    'optimizer_critic': optimizer_critic.state_dict()
                    if optimizer_critic else None,
                },
                path=f"./experiments/checkpoints/model_{model_type}_T-{T.value}"
                f"_global-{global_reward.value:.5f}_test-{rewards:.5f}.pth.tar")
        else:
            # log only at debug level when no progress was made
            logging.debug(log_string)

        global_iter += 1

        # run evaluation only once in test mode
        if args.test:
            break
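
# A hedged helper sketch (not part of the original snippet): save_checkpoint is
# called above but not shown here. Assuming it simply serializes the given dict
# to the given path, a minimal version could look like this:
import os
import torch


def save_checkpoint(state: dict, path: str) -> None:
    # create the checkpoint directory if needed, then write the state dict
    os.makedirs(os.path.dirname(path), exist_ok=True)
    torch.save(state, path)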