import os

import gym
import numpy as np
from gym.wrappers.monitor import Monitor

# make_deep_q_network, ReloadModelIntervalCheckpoint and MyTrainLogger are
# project-local helpers defined elsewhere in this repository.


def train_and_evaluate(args, monitor_path, checkpoint_step_filename,
                       checkpoint_weights_filename, weights_filename, log_filename):
    env = gym.make(args["env_name"])
    env = Monitor(env, monitor_path, resume=True, uid=args["run_id"],
                  video_callable=lambda episode_num: episode_num % args["record_video_every"] == 0)
    np.random.seed(args["random_seed"])
    env.seed(args["random_seed"])

    # Resume from the last checkpointed step if a checkpoint file exists.
    starting_step = 0
    if os.path.exists(checkpoint_step_filename):
        with open(checkpoint_step_filename, 'r') as f:
            starting_step = int(f.read())
    args["starting_step"] = starting_step

    dqn = make_deep_q_network(env, args)
    if args["starting_step"] > 0:
        dqn.load_weights(checkpoint_weights_filename)

    callbacks = [
        ReloadModelIntervalCheckpoint(checkpoint_weights_filename,
                                      step_path=checkpoint_step_filename,
                                      interval=args["checkpoint_frequency"],
                                      starting_step=starting_step),
        MyTrainLogger(args["checkpoint_frequency"], args["training_steps"],
                      starting_step, log_filename)
    ]

    if args["mode"] == "Train":
        dqn.fit(env, callbacks=callbacks, verbose=0,
                nb_steps=args["training_steps"] - starting_step,
                nb_max_start_steps=args["starting_fire_steps"],
                start_step_policy=lambda obs: 1)  # 1 is the fire action
        dqn.save_weights(weights_filename, overwrite=True)
    else:
        dqn.load_weights(weights_filename)
        env = gym.make(args["env_name"])
        env = Monitor(env, monitor_path, resume=True, uid=args["run_id"] + "_test")
        np.random.seed(args["random_seed"])
        env.seed(args["random_seed"])
        dqn.test(env, nb_episodes=1, visualize=False,
                 nb_max_start_steps=args["starting_fire_steps"],
                 start_step_policy=lambda obs: 1)  # 1 is the fire action
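# A hypothetical invocation of train_and_evaluate, sketched only to document the
# args keys the function reads above; the environment name, values and file
# paths below are assumptions, not part of the original code.
if __name__ == "__main__":
    example_args = {
        "env_name": "BreakoutDeterministic-v4",  # hypothetical Atari env with a fire action
        "run_id": "run_0",
        "record_video_every": 50,
        "random_seed": 123,
        "checkpoint_frequency": 10000,
        "training_steps": 1000000,
        "starting_fire_steps": 10,
        "mode": "Train",
    }
    train_and_evaluate(example_args,
                       monitor_path="monitor/",
                       checkpoint_step_filename="checkpoint_step.txt",
                       checkpoint_weights_filename="checkpoint_weights.h5f",
                       weights_filename="final_weights.h5f",
                       log_filename="train_log.json")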
""" Use this file to check that your implementation complies with our evaluation interface. """ import gym from gym.wrappers.monitor import Monitor from challenge1 import get_model, get_policy # 1. Learn the model f: s, a -> s', r env = Monitor(gym.make('Pendulum-v0'), 'training', video_callable=False, force=True) env.seed(98251624) max_num_samples = 10000 model = get_model(env, max_num_samples) env.close() # Your model will be tested on the quality of prediction obs = env.reset() act = env.action_space.sample() nobs, rwd, _, _ = env.step(act) nobs_pred, rwd_pred = model(obs, act) print(f'truth = {nobs, rwd}\nmodel = {nobs_pred, rwd_pred}') env.close() # 2. Perform dynamic programming using the learned model env = Monitor(gym.make('Pendulum-v0'), 'evaluation', force=True) env.seed(31186490) policy = get_policy(model, env.observation_space, env.action_space)
import time

import gym
import numpy as np
from gym.wrappers.monitor import Monitor
from scipy import spatial
from sklearn.neural_network import MLPRegressor

from getModel import getModelQube, getModelPendel
from challenge1_template import get_model, get_policy

env = Monitor(gym.make('Pendulum-v0'), 'training', video_callable=False, force=True)
env.seed(98251624)
max_num_samples = 10000
model = get_model(env, max_num_samples)

max_state = env.observation_space.high
min_state = env.observation_space.low
max_action = env.action_space.high
min_action = env.action_space.low

discrete_states = 100
discrete_actions = 4
discount_factor = 0.99
theta = 1  # convergence threshold for the dynamic-programming sweep


def discretize_space(min_state, max_state, discrete_num):
    """Discretize each dimension of a continuous box space into evenly spaced bins."""
    discrete_space = []
    for i in range(len(max_state)):
        low = min_state[i]
        high = max_state[i]
        discrete_space.append(np.linspace(low, high, discrete_num))
    return discrete_space
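# A sketch of how the pieces above could feed a value-iteration sweep
# (hypothetical continuation: nearest_state_index and value_iteration are
# illustrative helpers, not part of the original file; note that a full
# 100-bins-per-dimension grid over Pendulum's 3-d observation space is large,
# so a coarser grid may be needed in practice).
from itertools import product


def nearest_state_index(s, state_grids):
    # map a continuous state to the flat index of its nearest grid point,
    # using row-major ordering consistent with itertools.product below
    idx = 0
    for d, grid in enumerate(state_grids):
        i = int(np.argmin(np.abs(grid - s[d])))
        idx = idx * len(grid) + i
    return idx


def value_iteration(model, state_grids, action_grid, discount_factor, theta):
    states = [np.array(s) for s in product(*state_grids)]
    V = np.zeros(len(states))
    while True:
        delta = 0.0
        for idx, s in enumerate(states):
            q_values = []
            for a in action_grid:
                # query the learned model instead of the real environment
                nobs, rwd = model(s, np.array([a]))
                nidx = nearest_state_index(nobs, state_grids)
                q_values.append(rwd + discount_factor * V[nidx])
            best = max(q_values)
            delta = max(delta, abs(best - V[idx]))
            V[idx] = best
        if delta < theta:  # stop once the largest update falls below theta
            return V, states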
import copy
import logging
import time
from multiprocessing import Value

import gym
import numpy as np
import quanser_robots
import torch
from gym.wrappers.monitor import Monitor
from tensorboardX import SummaryWriter  # or torch.utils.tensorboard

# CriticNetwork, get_normalizer and save_checkpoint are project-local helpers
# defined elsewhere in this repository.


def test(args, worker_id: int, global_model: torch.nn.Module, T: Value, global_reward: Value = None,
         optimizer: torch.optim.Optimizer = None, global_model_critic: CriticNetwork = None,
         optimizer_critic: torch.optim.Optimizer = None):
    """
    Start a worker in test mode: no training is done, only evaluation runs that validate current performance.
    Loosely based on https://github.com/ikostrikov/pytorch-a3c/blob/master/_test.py

    :param args: console arguments
    :param worker_id: id of the worker, used to differentiate workers and init different seeds
    :param global_model: global model which is optimized / for split models: the actor
    :param T: global counter of steps
    :param global_reward: global running reward value
    :param optimizer: optimizer for the shared model / for split models: the actor optimizer
    :param global_model_critic: optional global critic model for split networks
    :param optimizer_critic: optional critic optimizer for split networks
    :return: None
    """
    logging.info("test worker started.")
    torch.manual_seed(args.seed + worker_id)

    if "RR" in args.env_name:
        env = quanser_robots.GentlyTerminating(gym.make(args.env_name))
    elif args.monitor:
        env = Monitor(gym.make(args.env_name), '100_test_runs',
                      video_callable=lambda count: count % 100 == 0, force=True)
    else:
        env = gym.make(args.env_name)

    env.seed(args.seed + worker_id)
    normalizer = get_normalizer(args.normalizer, env)

    # get an instance of the current global model state
    model = copy.deepcopy(global_model)
    model.eval()
    model_critic = None
    if global_model_critic:
        model_critic = copy.deepcopy(global_model_critic)
        model_critic.eval()

    state = torch.from_numpy(env.reset())

    writer = SummaryWriter(comment='_test', log_dir='experiments/runs/')
    start_time = time.time()

    t = 0
    episode_reward = 0
    done = False
    global_iter = 0
    best_global_reward = -np.inf
    best_test_reward = -np.inf

    while True:
        # pull the latest params from the shared global model
        model.load_state_dict(global_model.state_dict())
        if not args.shared_model:
            model_critic.load_state_dict(global_model_critic.state_dict())

        rewards = []
        eps_len = []
        sleep = True

        # run several episodes to estimate the current average performance
        for i in range(args.test_runs):
            while not done:
                t += 1

                # render only the first test episode, and never for real robots ("RR" envs)
                if not args.no_render and i == 0 and "RR" not in args.env_name:
                    env.render()
                    if args.monitor and sleep:
                        # add a small delay to allow a screen capture of the test run if needed
                        time.sleep(1)
                        sleep = False

                with torch.no_grad():
                    # select the mean of the normal dist as action --> expectation
                    if args.shared_model:
                        _, mu, _ = model(normalizer(state))
                    else:
                        mu, _ = model(normalizer(state))
                    action = mu.detach()

                # apply min/max clipping before stepping the environment
                state, reward, done, _ = env.step(np.clip(action.numpy(), -args.max_action, args.max_action))
                done = done or t >= args.max_episode_length
                episode_reward += reward

                if done:
                    # reset cumulated reward and episode counter as well as the env
                    rewards.append(episode_reward)
                    episode_reward = 0
                    eps_len.append(t)
                    t = 0
                    state = torch.from_numpy(env.reset())

            # necessary to make more than one run
            done = False

        time_print = time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start_time))

        std_reward = np.std(rewards)
        rewards = np.mean(rewards)

        new_best = rewards > best_test_reward
        writer.add_scalar("reward/test", rewards, int(T.value))
        writer.add_scalar("episode/length", np.mean(eps_len), int(T.value))

        log_string = f"Time: {time_print}, T={T.value} -- n_runs={args.test_runs} " \
                     f"-- mean total reward={rewards:.5f} +/- {std_reward:.5f} " \
                     f"-- mean episode length={np.mean(eps_len):.5f} +/- {np.std(eps_len):.5f} " \
                     f"-- global reward={global_reward.value:.5f}"

        if new_best:
            # highlight messages if progress was made
            logging.info(log_string)
            best_global_reward = max(global_reward.value, best_global_reward)
            best_test_reward = max(rewards, best_test_reward)
            model_type = 'shared' if args.shared_model else 'split'

            save_checkpoint(
                {
                    'epoch': T.value,
                    'model': model.state_dict(),
                    'model_critic': model_critic.state_dict() if model_critic is not None else None,
                    'global_reward': global_reward.value,
                    # only save optimizers if shared ones are used
                    'optimizer': optimizer.state_dict() if optimizer else None,
                    'optimizer_critic': optimizer_critic.state_dict() if optimizer_critic else None,
                },
                path=f"./experiments/checkpoints/model_{model_type}_T-{T.value}"
                     f"_global-{global_reward.value:.5f}_test-{rewards:.5f}.pth.tar"
            )
        else:
            # emit only debug messages if no progress was made
            logging.debug(log_string)

        global_iter += 1

        # run evaluation only once in test mode
        if args.test:
            break
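# save_checkpoint above is a project-local helper; a minimal sketch of what it
# might look like is given below (an assumption for illustration, not the
# original implementation).
def save_checkpoint(state: dict, path: str) -> None:
    # serialize the state_dicts and metadata so a run can later be resumed
    # via torch.load(path) followed by load_state_dict on each component
    torch.save(state, path)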