def _init():
    if args.original_gym:
        env = gym.make("CarRacing-v0")
    else:
        env = gym.make("CarRacingSoftFS{}-v0".format(args.frame_skip))
    env = wrappers.VaeCarWrapper(env, silent=silent)
    if not args.recodex:
        env = wrappers.TerminateEarlyWrapper(env)
    if args.discrete_actions:
        env = wrappers.CarDiscretizatinoWrapper(env, args.action_map == "large")
    env = wrappers.EvaluationWrapper(env, args.seed,
                                     evaluate_for=args.evaluate_for,
                                     render_every=args.render_every,
                                     report_each=1,
                                     logname=args.logdir + "/" + get_params_str(seed))
    env = wrappers.RewardWrapper(env, green_penalty=args.green_penalty,
                                 args=args, silent=silent)
    return env
def _init():
    env = gym.make("CarRacingSoftFS{}-v0".format(args.frame_skip))
    env = wrappers.VaeCarWrapper(env, silent=True)
    globals()['vae'] = env.vae
    globals()['vae_wrapper'] = env
    env = wrappers.CarDiscretizatinoWrapper(env, args.action_map == "large")
    env = wrappers.EvaluationWrapper(env, args.seed,
                                     evaluate_for=args.evaluate_for,
                                     report_each=1, logname="/dev/null")
    return env
def _init():
    env = gym.make("CarRacingSoftFS{}-v0".format(1))
    env = wrappers.VaeCarWrapper(env, silent=silent)
    env = wrappers.TerminateEarlyWrapper(env)
    env = wrappers.CarDiscretizatinoWrapper(env)
    env = wrappers.EvaluationWrapper(env, np.random.randint(0, 100000),
                                     evaluate_for=1, report_each=1,
                                     logname="/dev/null")
    return env
def create_env(args, report_each=100, **kwargs):
    # Create the environment
    env = wrappers.EvaluationWrapper(gym.make("Taxi-v3"), seed=args.seed,
                                     report_each=report_each, **kwargs)

    # Extract a deterministic MDP into three NumPy arrays:
    # - R[state][action] is the reward
    # - D[state][action] is the True/False value indicating end of episode
    # - N[state][action] is the next state
    R, D, N = [
        np.array([[env.P[s][a][0][i] for a in range(env.action_space.n)]
                  for s in range(env.observation_space.n)])
        for i in [2, 3, 1]
    ]
    return env, R, D, N
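# A hedged usage sketch (an illustration, not part of the original snippet):
# with the deterministic MDP extracted into R, D and N above, one sweep of
# value iteration becomes a single NumPy expression. The discount factor
# `gamma` below is an assumed parameter.
import numpy as np

def value_iteration_sweep(V, R, D, N, gamma=0.99):
    # Backup Q[s, a] = R[s, a] + gamma * V[N[s, a]], without bootstrapping
    # on transitions marked as terminal in D.
    Q = R + gamma * np.where(D, 0.0, V[N])
    return np.max(Q, axis=1)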
def _init():
    if args.original_gym:
        env = gym.make("CarRacing-v0")
    else:
        env = gym.make("CarRacingSoftFS{}-v0".format(args.frame_skip))
    env = my_wrappers.VaeCarWrapper(env, silent=silent)
    if not args.recodex:
        env = my_wrappers.TerminateEarlyWrapper(env)
    if args.discrete_actions:
        env = my_wrappers.CarDiscretizatinoWrapper(env)
    env = wrappers.EvaluationWrapper(env, args.seed,
                                     evaluate_for=evaluate_for, report_each=1)
    # env = my_wrappers.RewardWrapper(env, green_penalty=args.green_penalty,
    #                                 speed_limit=args.speed_limit,
    #                                 speed_limit_end=args.speed_limit_end * args.total_timesteps,
    #                                 silent=silent)
    return env
            # TODO(paac): Train network using current states, chosen actions and estimated returns
            network.train(states, actions, returns)
            states = next_states

        # Periodic evaluation
        total_reward = []
        for _ in range(args.evaluate_for):
            total_reward.append(evaluate_episode())
        print(f"Mean return over {args.evaluate_for} episodes: {np.mean(total_reward)}")
        if np.mean(total_reward) > 90:
            training = False

    # Final evaluation
    while True:
        evaluate_episode(start_evaluation=True)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(
        wrappers.DiscreteMountainCarWrapper(gym.make("MountainCarContinuous-v0"),
                                            tiles=args.tiles),
        args.seed)

    main(env, args)
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            # TODO: Compute `action` using the epsilon-greedy policy.
            action = None

            next_state, reward, done, _ = env.step(action)
            state = next_state

        # TODO: Compute returns from the received rewards and update Q and C.

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose a greedy action
            action = None
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(
        wrappers.DiscreteCartPoleWrapper(gym.make("CartPole-v1")), args.seed)

    main(env, args)
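# A possible way to fill the epsilon-greedy TODO above -- a sketch assuming `Q`
# is a NumPy array of shape [n_states, n_actions] and `generator` is a
# np.random.RandomState; both names are illustrative, not from the original.
import numpy as np

def epsilon_greedy(Q, state, epsilon, generator):
    # Explore uniformly with probability epsilon, otherwise act greedily.
    if generator.uniform() < epsilon:
        return generator.randint(Q.shape[1])
    return np.argmax(Q[state])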
def main(env, args):
    global model

    # Fix random seeds and number of threads
    np.random.seed(args.seed)

    if args.recodex:
        models = []
        for path in args.load_from:
            models.append(SAC.load(path))

        while True:
            state, done = env.reset(start_evaluation=True), False
            ret = 0
            while not done:
                # Note: the ensemble sums the deterministic predictions and scales
                # by 1 / sqrt(len(models)) rather than taking a plain mean.
                action = np.sum(
                    [m.predict(state, deterministic=True)[0] for m in models],
                    axis=0) / len(models) ** 0.5
                if not args.no_render:
                    env.render()
                state, reward, done, _ = env.step(action)
                ret += reward
            print("Episode return:", ret)
    else:
        tensorboard_log_dir = None if args.tensorboard_log_dir is None \
            else os.path.join(args.tensorboard_log_dir, get_exp_name())
        model = SAC("MlpPolicy",
                    env,
                    learning_rate=lr_schedule,
                    buffer_size=args.buffer_size,
                    learning_starts=args.learning_starts,
                    n_episodes_rollout=args.train_episodes,
                    batch_size=args.batch_size,
                    tau=args.tau,
                    gamma=args.gamma,
                    train_freq=args.train_freq,
                    gradient_steps=args.gradient_steps,
                    ent_coef="auto" if args.ent_coef == "auto" else float(args.ent_coef),
                    use_sde=False,
                    policy_kwargs=dict(log_std_init=-3, net_arch=args.net_arch, use_expln=True),
                    tensorboard_log=tensorboard_log_dir,
                    rew_skip_thres=args.rew_skip_thres,
                    seed=args.seed)
        model.verbose = 2
        callbacks = [
            CheckpointCallback(20000, "checkpoints", name_prefix=get_exp_name()),
            EvalCallback(gym.make(getEnvName()),
                         callback_on_new_best=SaveBestModelCallback(
                             save_path="best/" + get_exp_name() + "_best_model.zip"),
                         eval_freq=20000,
                         n_eval_episodes=5,
                         deterministic=True),
            EpisodeCallback(env, model),
        ]
        print(args.log_interval)
        model.learn(args.timesteps, log_interval=args.log_interval, callback=callbacks)

        # Final evaluation
        env = wrappers.EvaluationWrapper(gym.make(getEnvName()),
                                         evaluate_for=200, seed=args.seed)
        while True:
            state, done = env.reset(start_evaluation=True), False
            while not done:
                action, _states = model.predict(state, deterministic=True)
                state, reward, done, _ = env.step(action)
        model.save(get_exp_name())
    training = True
    while training:
        # Training
        for _ in range(args.evaluate_each):
            # TODO: Choose actions using network.predict_actions
            actions = None

            # TODO: Perform steps in the vectorized environment

            # TODO: Compute estimates of returns by one-step bootstrapping

            # TODO: Train network using current states, chosen actions and estimated returns

        # Periodic evaluation
        for _ in range(args.evaluate_for):
            evaluate_episode()

    # Final evaluation
    while True:
        evaluate_episode(start_evaluation=True)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(gym.make(args.env), args.seed)

    main(env, args)
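# A sketch of the "one-step bootstrapping" TODO above. Assumed (not from the
# original): `rewards`, `dones` and `next_values` are NumPy arrays with one
# entry per parallel environment, where `next_values` would come from a value
# head such as a hypothetical `network.predict_values(next_states)`.
import numpy as np

def one_step_returns(rewards, dones, next_values, gamma):
    # Bootstrap with gamma * V(s'), except where the episode terminated.
    return rewards + gamma * np.where(dones, 0.0, next_values)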
    # Fix random seeds and number of threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    if args.recodex:
        # TODO: Perform evaluation of a trained model.
        while True:
            state, done = env.reset(start_evaluation=True), False
            while not done:
                # TODO: Choose an action
                action = None
                state, reward, done, _ = env.step(action)
    else:
        # TODO: Perform training
        pass


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(gym.make("BipedalWalker-v3"), args.seed)

    main(env, args)
    try:
        # Final evaluation
        returns = []
        while True:
            state, done = env.reset(start_evaluation=True), False
            r = 0
            while not done:
                action = np.argmax(W[state].sum(axis=0))
                state, reward, done, _ = env.step(action)
                r += reward
            returns.append(r)
    except KeyboardInterrupt:
        if not args.recodex:
            np.save(f"{sum(returns)}_{args.tiles}_W_matrix.npy", W)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(
        wrappers.DiscreteMountainCarWrapper(gym.make("MountainCar1000-v0"), tiles=args.tiles),
        args.seed,
        logname=f"{args.logdir}/alpha={args.alpha},alpha_dec={args.alpha_dec},"
                f"epsilon={args.epsilon},epsilon_final={args.epsilon_final},"
                f"epsilon_final_at={args.epsilon_final_at},episodes={args.episodes},"
                f"tiles={args.tiles},gamma={args.gamma},seed={args.seed}")

    main(env, args)
    env = wrappers.EvaluationWrapper(gym.make(getEnvName()),
                                     evaluate_for=200, seed=args.seed)
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            action, _states = model.predict(state, deterministic=True)
            state, reward, done, _ = env.step(action)
    model.save(get_exp_name())


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    if not args.recodex:
        # Previously wrapped in TimeLimit(..., max_episode_steps=1600).
        env = wrappers.EvaluationWrapper(gym.make(getEnvName()),
                                         evaluate_for=10, seed=args.seed)
        if args.frame_skip > 1:
            env = wrappers.FrameSkipWrapper(env, args.frame_skip)
    else:
        env = wrappers.EvaluationWrapper(gym.make(getEnvName()), seed=args.seed)

    main(env, args)
    # # TODO: Perform a training episode
    # state, done = env.reset(), False
    # while not done:
    #     if args.render_each and env.episode and env.episode % args.render_each == 0:
    #         env.render()
    #
    #     state, reward, done, _ = env.step(action)

    # Final evaluation
    q = q1 + q2
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            if args.render_each and env.episode and env.episode % args.render_each == 0:
                env.render()
            action = np.argmax(q[state])
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(
        wrappers.DiscreteLunarLanderWrapper(gym.make("LunarLander-v2")),
        seed=args.seed,
        logname=f"double=True,alpha={args.alpha},alpha_decay_exp={args.alpha_decay_exp},"
                f"epsilon={args.epsilon},start_epsilon={args.start_epsilon},gamma={args.gamma},"
                f"n={args.n},epsilon_decay={args.epsilon_decay},alpha_decay={args.alpha_decay},"
                f"expert_every={args.expert_training_every},episodes={args.episodes},"
                f"seed={args.seed},init_random_actions={args.init_random_actions}",
        evaluate_for=args.evaluate_for)

    main(env, args)
    training = True
    while training:
        # To generate an expert trajectory, you can use
        state, trajectory = env.expert_trajectory()

        # TODO: Perform a training episode
        state, done = env.reset(), False
        while not done:
            if args.render_each and env.episode and env.episode % args.render_each == 0:
                env.render()

            state, reward, done, _ = env.step(action)

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose (greedy) action
            action = None
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(
        wrappers.DiscreteLunarLanderWrapper(gym.make("LunarLander-v2")), args.seed)

    main(env, args)
            next_state, reward, done, _ = env.step(action)

            # TODO: Update the action-value estimates
            q[state, action] += (alpha_schedule(args, e) if args.decrease_alpha else args.alpha) * (
                reward + args.gamma * np.max(q[next_state]) - q[state, action])

            state = next_state

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose (greedy) action
            action = np.argmax(q[state])
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(
        wrappers.DiscreteMountainCarWrapper(gym.make("MountainCar1000-v0")),
        args.seed,
        logname=f"alpha={args.alpha},epsilon={args.epsilon},gamma={args.gamma},"
                f"init_bias={args.init_bias},de={args.decrease_epsilon},"
                f"da={args.decrease_alpha},seed={args.seed}",
        evaluate_for=100)

    main(env, args)
            if step % args.target_update_freq == 0:
                target.copy_weights_from(network)

            state = next_state

        if args.epsilon_final_at:
            epsilon = np.interp(env.episode + 1,
                                [0, args.epsilon_final_at * args.episodes],
                                [args.epsilon, args.epsilon_final])

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            if args.render_each == 1:
                env.render()
            action = np.argmax(network.predict(np.array([state], np.float32))[0])
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(gym.make("CartPole-v1"), args.seed,
                                     logname="logs/" + get_log_name())

    main(env, args)
    training = True
    while training:
        # Generate the required number of episodes
        for _ in range(args.evaluate_each // args.batch_size):
            episodes = []
            for _ in range(args.batch_size):
                episodes.append(env.expert_episode())

            # Train the network
            network.train(episodes)

        # TODO: Maybe evaluate the current performance, using the
        # `evaluate_episode()` method returning the achieved return,
        # and setting `training=False` when the performance is high enough.

    # Final evaluation
    while True:
        evaluate_episode(True)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(memory_game_environment.make(args.cards),
                                     args.seed,
                                     evaluate_for=args.evaluate_for,
                                     report_each=args.evaluate_for)

    main(env, args)
                           name_prefix=get_exp_name()),
        EvalCallback(gym.make("BipedalWalker-v3"),
                     callback_on_new_best=SaveBestModelCallback(),
                     eval_freq=10000,
                     n_eval_episodes=5,
                     deterministic=True),
        # EpisodeCallback(env)
    ]
    print(args.log_interval)
    model.learn(args.timesteps, log_interval=args.log_interval, callback=callbacks)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    if not args.recodex:
        # Previously wrapped in TimeLimit(..., max_episode_steps=1600).
        env = wrappers.EvaluationWrapper(gym.make("BipedalWalkerHardcore-v3"),
                                         evaluate_for=100, seed=args.seed)
    else:
        env = wrappers.EvaluationWrapper(gym.make("BipedalWalkerHardcore-v3"),
                                         seed=args.seed)

    main(env, args)
            replay_buffer.append(Transition(state, action, reward, done, next_state))

            # TODO: If the replay_buffer is large enough, perform a training batch
            # of `args.batch_size` uniformly randomly chosen transitions.
            #
            # After you choose `states` and suitable targets, you can train the network as
            #   network.train(states, ...)

            state = next_state

        if args.epsilon_final_at:
            epsilon = np.interp(env.episode + 1, [0, args.epsilon_final_at],
                                [args.epsilon, args.epsilon_final])

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose (greedy) action
            action = None
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(gym.make("CartPole-v1"), args.seed)

    main(env, args)
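# A sketch of the replay TODO above. Assumptions (illustrative, not from the
# original): `replay_buffer` holds the Transition tuples appended above,
# `network` and a periodically synchronized `target` network expose `predict`
# on float32 batches, and training uses the standard one-step Q-learning target.
import random
import numpy as np

def train_on_replay_batch(replay_buffer, network, target, batch_size, gamma):
    if len(replay_buffer) < batch_size:
        return
    batch = random.sample(replay_buffer, batch_size)
    states = np.array([t.state for t in batch], np.float32)
    next_states = np.array([t.next_state for t in batch], np.float32)
    # Target: r + gamma * max_a' Q_target(s', a'), with no bootstrap at episode end.
    q_next = np.max(target.predict(next_states), axis=1)
    targets = network.predict(states)
    for i, t in enumerate(batch):
        targets[i, t.action] = t.reward + (0.0 if t.done else gamma * q_next[i])
    network.train(states, targets)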
def main(args):
    # Create the environment
    env = wrappers.EvaluationWrapper(gym.make("Taxi-v3"), seed=args.seed, report_each=100)

    # Fix random seed and create a generator
    generator = np.random.RandomState(args.seed)

    Q = np.zeros((env.observation_space.n, env.action_space.n))

    for _ in range(args.episodes):
        next_state, done = env.reset(), False

        # Generate an episode and update Q using the given TD method
        next_action = (np.argmax(Q[next_state]) if generator.uniform() >= args.epsilon
                       else env.action_space.sample())
        next_action_prob = args.epsilon / env.action_space.n + \
            (1 - args.epsilon) * (next_action == np.argmax(Q[next_state]))

        while not done:
            action, action_prob, state = next_action, next_action_prob, next_state
            next_state, reward, done, _ = env.step(action)
            if not done:
                next_action = (np.argmax(Q[next_state]) if generator.uniform() >= args.epsilon
                               else env.action_space.sample())
                next_action_prob = args.epsilon / env.action_space.n + \
                    (1 - args.epsilon) * (next_action == np.argmax(Q[next_state]))

            target_policy = np.eye(env.action_space.n)[np.argmax(Q, axis=1)]
            if not args.off_policy:
                target_policy = (1 - args.epsilon) * target_policy + \
                    args.epsilon / env.action_space.n * np.ones_like(target_policy)

            # TODO: Perform the update to the state-action value function `Q`, using
            # a TD update with the following parameters:
            # - `args.n`: use the `args.n`-step method
            # - `args.off_policy`:
            #   - if False, the epsilon-greedy behaviour policy is also the target policy
            #   - if True, the target policy is the greedy policy; for SARSA (with any
            #     `args.n`) and expected SARSA (with `args.n` > 1), importance sampling
            #     must be used
            # - `args.mode`: this argument can have the following values:
            #   - "sarsa": regular SARSA algorithm
            #   - "expected_sarsa": expected SARSA algorithm
            #   - "tree_backup": tree backup algorithm
            #
            # Perform the updates as soon as you can -- whenever you have all the
            # information to update `Q[state, action]`, do it. For each `action`, use its
            # corresponding `action_prob` at the time of taking the `action` as the
            # behaviour policy action probability, and the current `target_policy` as the
            # target policy (everywhere in the update).
            #
            # Do not forget that when `done` is True, bootstrapping on the
            # `next_state` is not used.
            #
            # Also note that when the episode ends and `args.n` > 1, there will
            # be several state-action pairs that also need to be updated. Perform
            # the updates in the order in which you encountered the state-action
            # pairs, and during these updates use the `target_policy` computed
            # above (do not modify it during these post-episode updates).

    return Q
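# A minimal sketch of the simplest case of the TODO above: one-step, on-policy
# SARSA (args.n == 1, args.off_policy == False, args.mode == "sarsa"). The full
# assignment additionally requires n-step returns, importance sampling and the
# tree-backup variant; `alpha` and `gamma` stand in for args.alpha and args.gamma.
def sarsa_update(Q, state, action, reward, done, next_state, next_action, alpha, gamma):
    # Bootstrap on Q[next_state, next_action] unless the episode has ended.
    target = reward + (0 if done else gamma * Q[next_state, next_action])
    Q[state, action] += alpha * (target - Q[state, action])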
            # network._baseline_model.optimizer.lr = warmup_lr_schedule(ep)
            # network._model.optimizer.lr = warmup_lr_schedule(ep)
            network.train(batch_states, batch_actions, batch_returns)
            print(steps)
            steps += 1
            network._model.save("checkpoint")
    except KeyboardInterrupt:
        pass

    # if args.recodex:
    #     network._model.load_weights ...

    print("Evaluation!")

    # Final evaluation
    while True:
        state, done = env.reset(True), False
        while not done:
            action = np.argmax(network.predict([state])[0])
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    env = wrappers.EvaluationWrapper(gym.make("CartPolePixels-v0"), args.seed)

    main(env, args)
import tensorflow as tf
import random

from vae.vae import CVAE
# from env import make_env
from utils import PARSER

args = PARSER.parse_args(['--config_path', 'configs/carracing.config'])

import pygame
pygame.init()
screen = pygame.display.set_mode((600, 300))

frame_skip = 3
seed = 2
env = wrappers.EvaluationWrapper(
    wrappers.VaeCarWrapper(gym.make("CarRacingSoftFS{}-v0".format(frame_skip))),
    seed,
    evaluate_for=15,
    report_each=1)

DATA_DIR = "export"
model_path_name = "models/tf_vae"

vae = CVAE(args)
vae.set_weights(tf.keras.models.load_model(model_path_name, compile=False).get_weights())

filelist = os.listdir(DATA_DIR)
obs = np.load(os.path.join(DATA_DIR, random.choice(filelist)))["obs"]
obs = obs.astype(np.float32) / 255.0


def resize(img, factor):
                if i % 100 == 0:
                    network.update_target_weights()
                    network.save()

                state = next_state

            epsilon = np.exp(np.interp(env.episode + 1, [0, 5000],
                                       [np.log(0.25), np.log(0.01)]))

    elif args.evolution:
        es = train(load_from='saved_model.pkl')
        best_params = es.best.get()[0]
        np.save('best_params', best_params)
        play(best_params, render=True)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(gym.make("CartPolePixels-v0"), args.seed,
                                     report_each=10, evaluate_for=15)

    main(env, args)
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            # TODO: Perform an action.
            action = None
            next_state, reward, done, _ = env.step(action)

            # TODO: Update the action-value estimates

            state = next_state

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose (greedy) action
            action = None
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(
        wrappers.DiscreteMountainCarWrapper(gym.make("MountainCar1000-v0")),
        args.seed)

    main(env, args)