import argparse

import gym
import numpy as np
import tensorflow as tf

import wrappers


def main(env: wrappers.EvaluationEnv, args: argparse.Namespace) -> None:
    # Fix random seeds and number of threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Construct the network
    network = Network(env, args)

    def evaluate_episode(start_evaluation: bool = False) -> float:
        rewards, state, done = 0, env.reset(start_evaluation), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            # TODO: Predict the action using the greedy policy
            action = None
            state, reward, done, _ = env.step(action)
            rewards += reward
        return rewards

    # Create the vectorized environment
    vector_env = gym.vector.AsyncVectorEnv([
        lambda: wrappers.DiscreteMountainCarWrapper(
            gym.make("MountainCarContinuous-v0"), tiles=args.tiles)
    ] * args.workers)
    vector_env.seed(args.seed)
    states = vector_env.reset()

    training = True
    while training:
        # Training
        for _ in range(args.evaluate_each):
            # TODO: Predict action distribution using `network.predict_actions`
            # and then sample it using for example `np.random.normal`. Do not
            # forget to clip the actions to the `env.action_space.{low,high}`
            # range, for example using `np.clip`.
            actions = None

            # TODO(paac): Perform steps in the vectorized environment

            # TODO(paac): Compute estimates of returns by one-step bootstrapping

            # TODO(paac): Train network using current states, chosen actions and estimated returns

        # Periodic evaluation
        for _ in range(args.evaluate_for):
            evaluate_episode()

    # Final evaluation
    while True:
        evaluate_episode(start_evaluation=True)
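The `Network` class used above is not part of this excerpt. Judging from its call sites (`predict_actions` returning `(mus, sds)`, `predict_values`, and `train(states, actions, returns)`), it is an actor-critic with a Gaussian policy head. Below is a minimal sketch, not the reference implementation: `args.hidden_layer_size` and `args.learning_rate` are hypothetical hyperparameter fields, and states are assumed to already be fixed-size float vectors of length `env.observation_space.nvec[-1]` (the multi-hot encoding used in the variant that follows).

import numpy as np
import tensorflow as tf


class Network:
    # Hypothetical actor-critic matching the call sites above; layer sizes
    # and optimizer settings are assumptions, not the course's solution.
    def __init__(self, env, args):
        inputs = tf.keras.layers.Input(shape=[env.observation_space.nvec[-1]])

        # Policy head: mean in [-1, 1] via tanh, positive std via softplus.
        hidden = tf.keras.layers.Dense(args.hidden_layer_size, activation="relu")(inputs)
        mus = tf.keras.layers.Dense(env.action_space.shape[0], activation="tanh")(hidden)
        sds = tf.keras.layers.Dense(env.action_space.shape[0], activation="softplus")(hidden)
        self._actor = tf.keras.Model(inputs, [mus, sds])
        self._actor_optimizer = tf.keras.optimizers.Adam(args.learning_rate)

        # Independent critic estimating the state value.
        hidden = tf.keras.layers.Dense(args.hidden_layer_size, activation="relu")(inputs)
        values = tf.keras.layers.Dense(1)(hidden)
        self._critic = tf.keras.Model(inputs, values)
        self._critic.compile(optimizer=tf.keras.optimizers.Adam(args.learning_rate),
                             loss=tf.keras.losses.MeanSquaredError())

    @tf.function
    def _actor_step(self, states, actions, returns):
        with tf.GradientTape() as tape:
            mus, sds = self._actor(states, training=True)
            # Diagonal-Gaussian log-density, written out explicitly.
            log_prob = -0.5 * tf.reduce_sum(
                tf.math.log(2 * np.pi * tf.square(sds))
                + tf.square((actions - mus) / sds), axis=-1)
            advantages = returns - tf.squeeze(self._critic(states), axis=-1)
            loss = -tf.reduce_mean(log_prob * tf.stop_gradient(advantages))
        gradients = tape.gradient(loss, self._actor.trainable_variables)
        self._actor_optimizer.apply_gradients(zip(gradients, self._actor.trainable_variables))

    def train(self, states, actions, returns):
        states = np.asarray(states, np.float32)
        actions = np.asarray(actions, np.float32)
        returns = np.asarray(returns, np.float32)
        self._actor_step(states, actions, returns)
        self._critic.train_on_batch(states, returns)

    def predict_actions(self, states):
        mus, sds = self._actor(np.asarray(states, np.float32))
        return mus.numpy(), sds.numpy()

    def predict_values(self, states):
        return self._critic(np.asarray(states, np.float32)).numpy()[:, 0]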
def main(env, args):
    # Fix random seeds and number of threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Construct the network
    network = Network(env, args)
    weights = env.observation_space.nvec[-1]

    def evaluate_episode(start_evaluation=False):
        rewards, state, done = 0, env.reset(start_evaluation), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            # Act greedily with the predicted mean action.
            state = multi_hot_states([state], weights)
            action = network.predict_actions(state)[0][0]
            state, reward, done, _ = env.step(action)
            rewards += reward
        return rewards

    # Create the vectorized environment
    vector_env = gym.vector.AsyncVectorEnv([
        lambda: wrappers.DiscreteMountainCarWrapper(
            gym.make("MountainCarContinuous-v0"), tiles=args.tiles)
    ] * args.workers)
    vector_env.seed(args.seed)
    states = vector_env.reset()
    states = multi_hot_states(states, weights)

    training = True
    while training:
        # Training
        for _ in range(args.evaluate_each):
            # Predict the action distribution using `network.predict_actions`,
            # sample from it and clip the actions to the
            # `env.action_space.{low,high}` range.
            mus, sds = network.predict_actions(states)
            actions = np.random.normal(mus, sds)
            actions = np.clip(actions, env.action_space.low, env.action_space.high)

            # Perform steps in the vectorized environment
            next_states, rewards, dones, _ = vector_env.step(actions)
            next_states = multi_hot_states(next_states, weights)

            # Compute estimates of returns by one-step bootstrapping
            predicted_values = network.predict_values(next_states)
            returns = rewards + args.gamma * np.array([
                0 if done else pred for done, pred in zip(dones, predicted_values)
            ])

            # Train network using current states, chosen actions and estimated returns
            network.train(states, actions, returns)
            states = next_states

        # Periodic evaluation
        total_reward = []
        for _ in range(args.evaluate_for):
            total_reward.append(evaluate_episode())
        print(f"Mean return over {args.evaluate_for} episodes: {np.mean(total_reward)}")
        if np.mean(total_reward) > 90:
            training = False

    # Final evaluation
    while True:
        evaluate_episode(start_evaluation=True)
if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(
        wrappers.DiscreteMountainCarWrapper(
            gym.make("MountainCarContinuous-v0"), tiles=args.tiles),
        args.seed)

    main(env, args)
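`multi_hot_states` is called above but not defined in this excerpt. Since `weights = env.observation_space.nvec[-1]` is the number of distinct tile indices, a plausible sketch encodes each observation (a vector of active tile indices) as a {0, 1} vector of that length:

import numpy as np


def multi_hot_states(states, weights):
    # Encode a batch of tile-index observations (shape (batch, tiles),
    # integers in [0, weights)) as multi-hot vectors of shape (batch, weights).
    encoded = np.zeros((len(states), weights), dtype=np.float32)
    for row, indices in zip(encoded, states):
        row[np.asarray(indices)] = 1
    return encoded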
    try:
        # Final evaluation
        returns = []
        while True:
            state, done = env.reset(start_evaluation=True), False
            r = 0
            while not done:
                action = np.argmax(W[state].sum(axis=0))
                state, reward, done, _ = env.step(action)
                r += reward
            returns.append(r)
    except KeyboardInterrupt:
        if not args.recodex:
            np.save(f"{sum(returns)}_{args.tiles}_W_matrix.npy", W)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(
        wrappers.DiscreteMountainCarWrapper(
            gym.make("MountainCar1000-v0"), tiles=args.tiles),
        args.seed,
        logname=f"{args.logdir}/alpha={args.alpha},alpha_dec={args.alpha_dec},"
                f"epsilon={args.epsilon},epsilon_final={args.epsilon_final},"
                f"epsilon_final_at={args.epsilon_final_at},episodes={args.episodes},"
                f"tiles={args.tiles},gamma={args.gamma},seed={args.seed}")

    main(env, args)
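In the evaluation loop above, `W` is the weight matrix of a tile-coded linear approximator: `state` is a vector of active tile indices, so `W[state]` gathers one weight row per tiling and `.sum(axis=0)` yields the Q-value of every action. A toy illustration with hypothetical sizes:

import numpy as np

# Hypothetical sizes: 1024 distinct tiles across 8 tilings, 3 actions.
W = np.random.rand(1024, 3)
state = np.array([3, 147, 292, 441, 580, 721, 866, 1013])  # one active tile per tiling

q_values = W[state].sum(axis=0)    # shape (3,): per-action sum of active-tile weights
action = int(np.argmax(q_values))  # greedy action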
def main(env, args):
    global vector_env

    # Fix random seeds and number of threads
    np.random.seed(args.seed)
    tf.random.set_seed(args.seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Construct the network
    network = Network(env, args)

    def evaluate_episode(start_evaluation=False):
        rewards, state, done = 0, env.reset(start_evaluation), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            mus, sds = network.predict_actions([state])
            mu, sd = mus[0], sds[0]
            # Act greedily with the mean action; sampling via
            # `np.random.normal(mu, sd)` would evaluate the stochastic policy.
            action = np.clip(mu, -1, 1)
            state, reward, done, _ = env.step([action])
            rewards += reward
        return rewards

    # Create the vectorized environment
    vector_env = gym.vector.AsyncVectorEnv([
        lambda: wrappers.DiscreteMountainCarWrapper(
            gym.make("MountainCarContinuous-v0"), tiles=args.tiles)
    ] * args.workers)
    vector_env.seed(args.seed)
    states = vector_env.reset()

    training = True
    while training:
        # Training
        for _ in range(args.evaluate_each):
            # Predict the action distribution, sample from it and clip
            # the actions to the valid [-1, 1] range.
            mus, sds = network.predict_actions(states)
            actions = np.reshape(np.random.normal(mus, sds), (args.workers, 1))

            # Perform steps in the vectorized environment
            next_states, rewards, dones, _ = vector_env.step(np.clip(actions, -1, 1))

            # Compute estimates of returns by one-step bootstrapping
            predicted_values = network.predict_values(next_states)
            return_estimates = rewards + args.gamma * np.array([
                0 if done else pred for done, pred in zip(dones, predicted_values)
            ])

            # Train network using current states, chosen actions and estimated returns
            network.train(states, actions, return_estimates)
            states = next_states

        # Periodic evaluation
        for _ in range(args.evaluate_for):
            evaluate_episode()
        if sum(env._episode_returns[-100:]) / min(100, len(env._episode_returns)) > 90:
            training = False

    # Final evaluation
    while True:
        evaluate_episode(start_evaluation=True)
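The zip-based comprehension above zeroes the bootstrap term for episodes that just ended. When `predicted_values` is a flat array of shape `(workers,)`, the same estimate can be written as a single vectorized expression; a self-contained check of the equivalence:

import numpy as np

gamma = 0.99
rewards = np.array([0.5, -1.0, 0.0])
dones = np.array([False, True, False])
predicted_values = np.array([10.0, 8.0, 12.0])

vectorized = rewards + gamma * np.where(dones, 0, predicted_values)
loop_based = rewards + gamma * np.array([
    0 if done else pred for done, pred in zip(dones, predicted_values)
])
assert np.allclose(vectorized, loop_based)  # both give [10.4, -1.0, 11.88]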
        state, done = env.reset(), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            # TODO: Perform an action.
            action = None
            next_state, reward, done, _ = env.step(action)

            # TODO: Update the action-value estimates
            state = next_state

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # TODO: Choose (greedy) action
            action = None
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationEnv(
        wrappers.DiscreteMountainCarWrapper(gym.make("MountainCar1000-v0")),
        args.seed)

    main(env, args)
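The exploration TODO in the template above is typically resolved with ε-greedy action selection over the tabular estimates, as the greedy `np.argmax(q[state])` in the solution below suggests. A minimal sketch, assuming a Q-table `q` of shape `(states, actions)`:

import numpy as np


def epsilon_greedy(q, state, epsilon):
    # With probability epsilon take a uniformly random action,
    # otherwise the greedy one.
    if np.random.uniform() < epsilon:
        return np.random.randint(q.shape[1])
    return int(np.argmax(q[state]))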
            next_state, reward, done, _ = env.step(action)

            # Update the action-value estimates with the Q-learning rule
            alpha = alpha_schedule(args, e) if args.decrease_alpha else args.alpha
            q[state, action] += alpha * (
                reward + args.gamma * np.max(q[next_state]) - q[state, action])
            state = next_state

    # Final evaluation
    while True:
        state, done = env.reset(start_evaluation=True), False
        while not done:
            # Choose the greedy action
            action = np.argmax(q[state])
            state, reward, done, _ = env.step(action)


if __name__ == "__main__":
    args = parser.parse_args([] if "__file__" not in globals() else None)

    # Create the environment
    env = wrappers.EvaluationWrapper(
        wrappers.DiscreteMountainCarWrapper(gym.make("MountainCar1000-v0")),
        args.seed,
        logname=f"alpha={args.alpha},epsilon={args.epsilon},gamma={args.gamma},"
                f"init_bias={args.init_bias},de={args.decrease_epsilon},"
                f"da={args.decrease_alpha},seed={args.seed}",
        evaluate_for=100)

    main(env, args)
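`alpha_schedule(args, e)` is referenced above but not defined in this excerpt. Given the `alpha_dec`/`decrease_alpha` flags appearing across the variants, one plausible form is a linear decay of the learning rate over training episodes; the `args.alpha_final` and `args.episodes` field names below are assumptions:

import numpy as np


def alpha_schedule(args, episode):
    # Linearly interpolate the learning rate from args.alpha down to
    # args.alpha_final over args.episodes episodes (hypothetical fields).
    fraction = min(episode / args.episodes, 1.0)
    return float(np.interp(fraction, [0, 1], [args.alpha, args.alpha_final]))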