def target_fn(x) -> float:
    env = cart_pole_evaluator.environment()

    epsilon, epsilon_final, gamma = x
    args.epsilon = epsilon
    args.epsilon_final = epsilon_final
    args.gamma = gamma

    Q = np.zeros((env.states, env.actions), dtype=np.float32)
    C = np.zeros_like(Q)

    train(args, env, Q, C)

    # Perform last 100 evaluation episodes
    mean_value = evaluate(args, env, Q)
    return -mean_value
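`target_fn` negates the mean return of the final evaluation episodes so that a black-box minimizer can search over `(epsilon, epsilon_final, gamma)`. The optimizer actually used is not shown in this fragment; below is a minimal sketch using SciPy's `differential_evolution`, with purely illustrative bounds.

# Hypothetical usage sketch of target_fn with a black-box minimizer.
# The choice of differential_evolution and the bounds below are assumptions,
# not the original author's setup.
from scipy.optimize import differential_evolution

bounds = [
    (0.05, 0.5),     # epsilon
    (0.0001, 0.05),  # epsilon_final
    (0.9, 1.0),      # gamma
]
result = differential_evolution(target_fn, bounds, maxiter=10, seed=42)
print("best (epsilon, epsilon_final, gamma):", result.x, "mean return:", -result.fun)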
                    type=int, help="Render some episodes.")
parser.add_argument("--threads", default=0, type=int, help="Maximum number of threads to use.")
args = parser.parse_args()

# Fix random seed
np.random.seed(42)
tf.random.set_seed(42)
tf.config.threading.set_inter_op_parallelism_threads(args.threads)
tf.config.threading.set_intra_op_parallelism_threads(args.threads)

# Create the environment
env = cart_pole_evaluator.environment(discrete=False)

# Construct the network
network = Network(env, args)

# Training
for _ in range(args.episodes // args.batch_size):
    batch_states, batch_actions, batch_returns = [], [], []
    for _ in range(args.batch_size):
        # Perform episode
        states, actions, rewards = [], [], []
        state, done = env.reset(), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()
help="Training episodes.") parser.add_argument("--epsilon", default=0.2, type=float, help="Exploration factor.") parser.add_argument("--render_each", default=0, type=int, help="Render some episodes.") args = parser.parse_args() # Fix random seed np.random.seed(42) # Create the environment env = cart_pole_evaluator.environment(discrete=True) # Create Q, C and other variables # TODO: # - Create Q, a zero-filled NumPy array with shape [env.states, env.actions], # representing estimated Q value of a given (state, action) pair. # - Create C, a zero-filled NumPy array with shape [env.states, env.actions], # representing number of observed returns of a given (state, action) pair. Q = np.zeros([env.states, env.actions]) C = np.zeros([env.states, env.actions]) for _ in range(args.episodes): # Perform episode state = env.reset() states, actions, rewards = [], [], [] while True:
# Parse arguments
parser = argparse.ArgumentParser()
# TODO: Define reasonable defaults and optionally more parameters
parser.add_argument("--episodes", default=600, type=int, help="Training episodes.")
parser.add_argument("--epsilon", default=0.5, type=float, help="Exploration factor.")
parser.add_argument("--gamma", default=0.3, type=float, help="Discount factor of the rewards.")
parser.add_argument("--recodex", default=False, action="store_true", help="Evaluation in ReCodEx.")
parser.add_argument("--render_each", default=50, type=int, help="Render some episodes.")
parser.add_argument("--seed", default=42, type=int, help="Random seed.")
args = parser.parse_args([] if "__file__" not in globals() else None)

# Fix random seeds and threads
np.random.seed(args.seed)

# Create the environment
env = cart_pole_evaluator.environment(discrete=True, seed=args.seed)

# Create Q, C and other variables
# TODO:
# - Create Q, a zero-filled NumPy array with shape [env.states, env.actions],
#   representing estimated Q value of a given (state, action) pair.
# - Create C, a zero-filled NumPy array with shape [env.states, env.actions],
#   representing number of observed returns of a given (state, action) pair.
Q = np.zeros([env.states, env.actions])
C = np.zeros([env.states, env.actions])

for _ in range(args.episodes):
    # Perform episode
    state = env.reset()
parser.add_argument("--epsilon", default=0.15, type=float, help="Exploration factor.") parser.add_argument("--epsilon_final", default=0.001, type=float, help="Final exploration factor.") parser.add_argument("--gamma", default=0.99, type=float, help="Discounting factor.") args = parser.parse_args() # Create the environment env = cart_pole_evaluator.environment() env2 = cart_pole_evaluator.environment() env3 = cart_pole_evaluator.environment() Q = np.zeros((env.states, env.actions)) C = np.zeros((env.states, env.actions)) Q2 = np.zeros((env.states, env.actions)) C2 = np.zeros((env.states, env.actions)) Q3 = np.zeros((env.states, env.actions)) C3 = np.zeros((env.states, env.actions)) Qs = ((Q, C, env), (Q2, C2, env2), (Q3, C3, env3)) #a = (args.epsilon_final/args.epsilon) ** (1/max(args.episodes*0.9, 1000)) #a = (args.epsilon_final - args.epsilon)/(args.episodes * 0.9) d = args.epsilon_final c = args.epsilon
def main(args, seed):
    # Fix random seeds and number of threads
    np.random.seed(seed)
    tf.random.set_seed(seed)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Create the environment
    env = cart_pole_evaluator.environment(discrete=False)
    # env2 = cart_pole_evaluator.environment(discrete=False)
    # print(env.actions)
    # print(env.state_shape)
    # print(env.action_shape)

    # Construct the network
    network = Network(env, args)

    A = np.array(range(env.actions))
    N = args.episodes // args.batch_size
    training = True

    # Training
    for n in range(N):
        batch_states, batch_actions, batch_returns = [], [], []
        for _ in range(args.batch_size):
            print('Episode {}/{}'.format(env.episode + 1, args.episodes))

            # Perform episode
            states, actions, rewards = [], [], []
            state, done = env.reset(), False
            while not done:
                if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                    env.render()

                # Compute action probabilities using `network.predict` and current `state`
                probabilities = network.predict([state])[0]
                S = sum(probabilities)

                # Choose `action` according to `probabilities` distribution (np.random.choice can be used)
                action = np.random.choice(A, p=probabilities / S)

                next_state, reward, done, _ = env.step(action)

                states.append(state)
                actions.append(action)
                rewards.append(reward)

                state = next_state

            # Compute returns by summing rewards (with discounting)
            G = 0
            Gs = []
            rewards.reverse()
            for r in rewards:
                G = r + args.gamma * G
                Gs.append(G)
            Gs.reverse()

            # Add states, actions and returns to the training batch
            batch_states.append(states)
            batch_actions.append(actions)
            batch_returns.append(Gs)

            # print('Reward {} -- mean[-10:] {}'.format(env._episode_returns[-1], np.mean(env._episode_returns[-10:])))
            # print('Last return: {}'.format(round(np.mean(env._episode_returns[-args.batch_size:]), 2)))
            if round(np.mean(env._episode_returns[-10:]), 2) > 460:
                training = False

        if not training:
            break

        # Train using the generated batch
        network.train(batch_states, batch_actions, batch_returns)
        # print('Training {}/{} done in {}s'.format(n + 1, N, round(time.time() - T, 2)))

    # Final evaluation
    while True:
        state, done = env.reset(True), False
        # R = 0
        while not done:
            # Compute action `probabilities` using `network.predict` and current `state`
            # Choose greedy action this time
            probabilities = network.predict([state])[0]
            action = np.argmax(probabilities)
            state, reward, done, _ = env.step(action)
            # R += reward

    return np.mean(env._episode_returns[-100:])
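The loop above only requires that `Network.predict(states)` return per-action probabilities and that `Network.train(batch_states, batch_actions, batch_returns)` accept lists of per-episode lists. A minimal Keras sketch of such a policy network follows; the layer sizes, the `hidden_layer_size`/`learning_rate` arguments, and the flattening of the batches are assumptions rather than the author's actual implementation.

# Minimal REINFORCE policy network sketch (assumption, not the original implementation).
import numpy as np
import tensorflow as tf


class Network:
    def __init__(self, env, args):
        # Softmax policy over the discrete actions.
        self._model = tf.keras.Sequential([
            tf.keras.layers.Dense(args.hidden_layer_size, activation="relu", input_shape=env.state_shape),
            tf.keras.layers.Dense(env.actions, activation="softmax"),
        ])
        # Sparse categorical cross-entropy weighted by the returns yields the
        # REINFORCE gradient, -G_t * grad log pi(a_t | s_t).
        self._model.compile(
            optimizer=tf.keras.optimizers.Adam(args.learning_rate),
            loss="sparse_categorical_crossentropy",
        )

    def predict(self, states):
        return self._model.predict_on_batch(np.asarray(states, np.float32))

    def train(self, batch_states, batch_actions, batch_returns):
        # Flatten the per-episode lists into flat training arrays.
        states = np.asarray([s for episode in batch_states for s in episode], np.float32)
        actions = np.asarray([a for episode in batch_actions for a in episode], np.int32)
        returns = np.asarray([g for episode in batch_returns for g in episode], np.float32)
        self._model.train_on_batch(states, actions, sample_weight=returns)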
                    default=0.005, type=float, help="Final exploration factor.")
parser.add_argument("--gamma", default=1.0, type=float, help="Discounting factor.")
args = parser.parse_args()

# Bookkeeping for my own learning control
rewards_history = []
target_reward = 500
treshold_reward = 485

# Create the environment
env = cpe.environment()

# Initialize from the arguments
eps = args.epsilon
gamma = args.gamma

# Initialize the policy
policy = np.zeros((env.states, env.actions)) + 1 / env.actions

# Combines Q and Returns from the algorithm: remembers the average return for each (state, action) combination
avgReturn = np.zeros((env.states, env.actions), dtype=float)
stateActionSeen = np.zeros((env.states, env.actions), dtype=int)

# Could be improved with automatic policy search resets after the learning
# process stops improving for a certain number of episodes.
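For context, in on-policy first-visit Monte Carlo control the uniform `policy` above is usually nudged toward an epsilon-soft greedy policy whenever a return is recorded. The episode loop is not part of this fragment, so the helpers below are a sketch of the standard textbook updates using this script's arrays, not necessarily what it actually does.

# Sketch (assumption): standard epsilon-soft MC control updates for this script's arrays.
import numpy as np


def record_return(avgReturn, stateActionSeen, state, action, G):
    # Incremental average of observed returns for the (state, action) pair.
    stateActionSeen[state, action] += 1
    avgReturn[state, action] += (G - avgReturn[state, action]) / stateActionSeen[state, action]


def update_policy(policy, avgReturn, state, eps, n_actions):
    # Epsilon-soft greedy improvement: the best action gets 1 - eps + eps / |A|,
    # every other action gets eps / |A|.
    best_action = int(np.argmax(avgReturn[state]))
    policy[state, :] = eps / n_actions
    policy[state, best_action] += 1 - eps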
class Network:
    def __init__(self, env, args):
        # TODO: Create a suitable network
        # Warning: If you plan to use Keras `.train_on_batch` and/or `.predict_on_batch`
        # methods, pass `experimental_run_tf_function=False` to compile. There is
        # a bug in TF 2.0 which causes the `*_on_batch` methods not to use `tf.function`.
        # Otherwise, if you are training manually, using `tf.function` is a good idea
        # to get good performance.
        pass

    # Define a training method. Generally you have two possibilities:
    # - pass new q_values of all actions for a given state; all but one are the same as before
    # - pass only one new q_value for a given state, including the index of the action to which
    #   the new q_value belongs
    def train(self, states, ...):
        # TODO
        pass

    def predict(self, states):
        # TODO
        pass


if __name__ == "__main__":
    # Parse arguments
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", default=32, type=int, help="Batch size.")
    parser.add_argument("--episodes", default=1000, type=int, help="Episodes for epsilon decay.")
    parser.add_argument("--epsilon", default=0.3, type=float, help="Exploration factor.")
    parser.add_argument("--epsilon_final", default=0.01, type=float, help="Final exploration factor.")
    parser.add_argument("--gamma", default=1.0, type=float, help="Discounting factor.")
    parser.add_argument("--hidden_layers", default=1, type=int, help="Number of hidden layers.")
    parser.add_argument("--hidden_layer_size", default=20, type=int, help="Size of hidden layer.")
    parser.add_argument("--learning_rate", default=0.001, type=float, help="Learning rate.")
    parser.add_argument("--render_each", default=0, type=int, help="Render some episodes.")
    parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")
    args = parser.parse_args()

    # Fix random seeds and number of threads
    np.random.seed(42)
    tf.random.set_seed(42)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Create the environment
    env = cart_pole_evaluator.environment(discrete=False)

    # Construct the network
    network = Network(env, args)

    # Replay memory; maxlen parameter can be passed to deque for a size limit,
    # which we however do not need in this simple task.
    replay_buffer = collections.deque()
    Transition = collections.namedtuple("Transition", ["state", "action", "reward", "done", "next_state"])

    evaluating = False
    epsilon = args.epsilon
    training = True
    while training:
        # Perform episode
        state, done = env.reset(), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()

            # TODO: Compute action using epsilon-greedy policy. You can compute
            # the q_values of a given state using
            #   q_values = network.predict(np.array([state], np.float32))[0]

            next_state, reward, done, _ = env.step(action)

            # Append state, action, reward, done and next_state to replay_buffer
            replay_buffer.append(Transition(state, action, reward, done, next_state))

            # TODO: If the replay_buffer is large enough, perform a training batch
            # of `args.batch_size` uniformly randomly chosen transitions.
            #
            # After you choose `states` and suitable targets, you can train the network as
            #   network.train(states, ...)
            state = next_state

        if args.epsilon_final:
            epsilon = np.exp(np.interp(env.episode + 1,
                                       [0, args.episodes],
                                       [np.log(args.epsilon), np.log(args.epsilon_final)]))

    # Final evaluation
    while True:
        state, done = env.reset(True), False
        while not done:
            action = np.argmax(network.predict(np.array([state], np.float32))[0])
            state, reward, done, _ = env.step(action)
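One way to fill in the `Network` skeleton above is the first variant from its TODO: train on full q_value vectors in which only the taken action's target changes. The layer setup, the use of `.train_on_batch`/`.predict_on_batch`, and the replay-batch target computation below are assumptions consistent with the template, not the original solution.

# Minimal Q-network sketch (assumption), matching the skeleton's first TODO variant.
import numpy as np
import random
import tensorflow as tf


class Network:
    def __init__(self, env, args):
        self._model = tf.keras.Sequential()
        self._model.add(tf.keras.layers.Input(shape=env.state_shape))
        for _ in range(args.hidden_layers):
            self._model.add(tf.keras.layers.Dense(args.hidden_layer_size, activation="relu"))
        self._model.add(tf.keras.layers.Dense(env.actions))
        # experimental_run_tf_function=False follows the template's TF 2.0 warning
        # about the `*_on_batch` methods.
        self._model.compile(optimizer=tf.keras.optimizers.Adam(args.learning_rate),
                            loss="mse",
                            experimental_run_tf_function=False)

    def train(self, states, q_values):
        self._model.train_on_batch(np.asarray(states, np.float32), np.asarray(q_values, np.float32))

    def predict(self, states):
        return self._model.predict_on_batch(np.asarray(states, np.float32))

The replay-buffer training step hinted at by the second TODO could then look like this, again as an assumption:

# Sketch (assumption) of one uniformly sampled replay-buffer training step.
if len(replay_buffer) >= args.batch_size:
    batch = random.sample(replay_buffer, args.batch_size)
    states = np.array([t.state for t in batch], np.float32)
    next_qs = network.predict(np.array([t.next_state for t in batch], np.float32))
    q_values = network.predict(states)
    for i, t in enumerate(batch):
        q_values[i, t.action] = t.reward + (0 if t.done else args.gamma * np.max(next_qs[i]))
    network.train(states, q_values)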
def main():
    # Fix random seed
    np.random.seed(42)

    # Parse arguments
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--episodes", default=500, type=int, help="Training episodes.")
    parser.add_argument("--render_each", default=0, type=int, help="Render some episodes.")
    parser.add_argument("--epsilon", default=0.2, type=float, help="Exploration factor.")
    parser.add_argument("--epsilon_final", default=0.1, type=float, help="Final exploration factor.")
    parser.add_argument("--gamma", default=0.99, type=float, help="Discounting factor.")
    args = parser.parse_args()
    print(args)

    # Create the environment
    env = cart_pole_evaluator.environment()

    training = True
    Q = np.zeros([env.states, env.actions], dtype=np.float32)
    Q.fill(500)
    # Q.fill(1 / args.epsilon)
    C = np.zeros([env.states, env.actions], dtype=np.float32)

    eps_diff = (args.epsilon_final - args.epsilon) / float(args.episodes)
    eps_curr = args.epsilon

    while training:
        trajectory = []

        # Perform a training episode
        state, done = env.reset(), False
        while not done:
            if args.render_each and env.episode and env.episode % args.render_each == 0:
                env.render()

            if random.random() < eps_curr:
                action = random.randint(0, env.actions - 1)
            else:
                action = np.argmax(Q[state]).item()

            next_state, reward, done, _ = env.step(action)
            trajectory.append([state, action, reward])
            state = next_state

        # Every-visit Monte Carlo update with incremental averaging of returns
        G = 0.0
        for state, action, reward in reversed(trajectory):
            G = args.gamma * G + reward
            # returns[(state, action)].append(G)
            # Q[state, action] = np.mean(returns[(state, action)]).item()
            C[state, action] += 1
            Q[state, action] += (G - Q[state, action]) / C[state, action]

        eps_curr += eps_diff

        if args.render_each and env.episode % args.render_each == 0:
            print(f"eps curr: {eps_curr}")

            # Evaluation episode
            state, done = env.reset(), False
            while not done:
                env.render()
                action = np.argmax(Q[state]).item()
                state, _, done, _ = env.step(action)

        if env.episode > args.episodes:
            break

    # Perform last 100 evaluation episodes
    for _ in range(100):
        state, done = env.reset(True), False
        while not done:
            action = np.argmax(Q[state]).item()
            state, _, done, _ = env.step(action)
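How `main()` is invoked is not shown in this fragment; presumably the script ends with the usual entry-point guard:

if __name__ == "__main__":
    main()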