def environment(discrete=True):
    if discrete:
        separators = [
            np.linspace(-.4, .4, num=5 + 1)[1:-1],   # x
            np.linspace(-.05, .9, num=6 + 1)[1:-1],  # y
            np.linspace(-.5, .5, num=5 + 1)[1:-1],   # vel x
            np.linspace(-.8, .8, num=7 + 1)[1:-1],   # vel y
            np.linspace(-.2, .2, num=3 + 1)[1:-1],   # rot
            np.linspace(-.2, .2, num=5 + 1)[1:-1],   # ang vel
            [.5],  # lc
            [.5],  # rc
        ]
        evaluator = gym_evaluator.GymEnvironment("LunarLander-v2", separators=separators)
    else:
        evaluator = gym_evaluator.GymEnvironment("LunarLander-v2")

    evaluator._expert = gym.make("LunarLander-v2")
    evaluator._expert.seed(42)
    evaluator._expert.continuous = not discrete

    def expert_trajectory():
        state, trajectory, done = evaluator._expert.reset(), [], False
        initial_state = evaluator._maybe_discretize(state)
        while not done:
            action = gym.envs.box2d.lunar_lander.heuristic(evaluator._expert, state)
            state, reward, done, _ = evaluator._expert.step(action)
            trajectory.append((action, reward, evaluator._maybe_discretize(state)))
        return initial_state, trajectory
    evaluator.expert_trajectory = expert_trajectory

    return evaluator
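# A minimal usage sketch (not part of the original file): expert trajectories
# can pre-fill a tabular action-value estimate before learning starts. It
# assumes the discrete evaluator exposes `states` and `actions` counts, as the
# tabular templates in this repository do; `q`, the step size 0.1 and the
# discount 0.99 are illustrative only.
env = environment(discrete=True)
q = np.zeros((env.states, env.actions))
for _ in range(100):
    state, trajectory = env.expert_trajectory()
    for action, reward, next_state in trajectory:
        # One-step Q-learning backup along the expert trajectory.
        q[state, action] += 0.1 * (reward + 0.99 * np.max(q[next_state]) - q[state, action])
        state = next_state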
def environment(discrete=True):
    if discrete:
        bins = 12
        separators = [
            np.linspace(-1.2, 0.6, num=bins + 1)[1:-1],    # car position
            np.linspace(-0.07, 0.07, num=bins + 1)[1:-1],  # car velocity
        ]
        return gym_evaluator.GymEnvironment("MountainCarLimit1000-v0", bins, separators)
    return gym_evaluator.GymEnvironment("MountainCarLimit1000-v0")
def environment(discrete=True, tiles=None, verbose=True):
    if discrete:
        bins = 24 if tiles is None or tiles <= 1 else 12 if tiles <= 3 else 8
        separators = [
            np.linspace(-1.2, 0.6, num=bins + 1)[1:-1],    # car position
            np.linspace(-0.07, 0.07, num=bins + 1)[1:-1],  # car velocity
        ]
        return gym_evaluator.GymEnvironment("MountainCarLimit1000-v0",
                                            separators=separators, tiles=tiles, verbose=verbose)
    return gym_evaluator.GymEnvironment("MountainCarLimit1000-v0", verbose=verbose)
def environment(discrete=True):
    if discrete:
        bins = 8
        # Note: CartPole observations are ordered
        # [cart position, cart velocity, pole angle, pole angle velocity].
        separators = [
            np.linspace(-2.4, 2.4, num=bins + 1)[1:-1],  # cart position
            np.linspace(-3, 3, num=bins + 1)[1:-1],      # cart velocity
            np.linspace(-0.5, 0.5, num=bins + 1)[1:-1],  # pole angle
            np.linspace(-2, 2, num=bins + 1)[1:-1],      # pole angle velocity
        ]
        return gym_evaluator.GymEnvironment("CartPole-v1", bins, separators)
    return gym_evaluator.GymEnvironment("CartPole-v1")
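# A short illustration (not in the original files) of what the `separators`
# above do: np.digitize buckets each state component by its separator array,
# yielding a tuple of bin indices, which gym_evaluator then combines into a
# single discrete state index.
import numpy as np

seps = [
    np.linspace(-2.4, 2.4, num=9)[1:-1],   # 8 bins -> 7 inner separators
    np.linspace(-3, 3, num=9)[1:-1],
    np.linspace(-0.5, 0.5, num=9)[1:-1],
    np.linspace(-2, 2, num=9)[1:-1],
]
state = [0.1, -1.2, 0.03, 0.4]
bins = [int(np.digitize(s, sep)) for s, sep in zip(state, seps)]
print(bins)  # [4, 2, 4, 4]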
def environment(cards):
    env_name = "MemoryGame{}-v0".format(cards)
    if env_name not in memory_games:
        gym.envs.register(id=env_name,
                          entry_point=lambda: MemoryGame(cards),
                          max_episode_steps=2 * cards,
                          reward_threshold=0)
        memory_games.add(env_name)
    env = gym_evaluator.GymEnvironment(env_name)
    env._expert = gym.make(env_name)

    def expert_episode():
        state = env._expert.reset()
        episode, seen, done = [], {}, False
        while not done:
            last_action, observation = state
            if observation in seen:
                action = seen.pop(observation)
                if action == last_action - 1:
                    action = cards
            else:
                seen[observation] = last_action
                action = cards
            episode.append((state, action))
            state, _, done, _ = env._expert.step(action)
        episode.append((state, None))
        return episode
    env.expert_episode = expert_episode

    return env
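# A hedged usage sketch (not part of the original file): expert episodes can
# provide supervised targets for a memory-game policy before RL fine-tuning.
# The batching below assumes states are fixed-size tuples, and the
# `model.train_on_batch` call is illustrative only.
env = environment(cards=8)
episode = env.expert_episode()
states, actions = zip(*episode[:-1])  # the final entry carries action None
# e.g., model.train_on_batch(np.array(states), np.array(actions))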
def environment(frame_skip=1):
    if frame_skip not in FRAME_SKIPS:
        raise ValueError("Unsupported frame skip {}, only {} are supported".format(
            frame_skip, list(FRAME_SKIPS)))
    return gym_evaluator.GymEnvironment("CarRacingCustomDrawFrameSkip{}-v0".format(frame_skip))
def environment(cards):
    env_name = "MemoryGame{}-v0".format(cards)
    if env_name not in memory_games:
        gym.envs.register(id=env_name,
                          entry_point=lambda: MemoryGame(cards),
                          max_episode_steps=2 * cards,
                          reward_threshold=0)
        memory_games.add(env_name)
    return gym_evaluator.GymEnvironment(env_name)
def environment():
    env = gym_evaluator.GymEnvironment("CarRacingCustomDraw-v0")

    def step(action, frame_skip=1):
        env._env.unwrapped.frame_skip = frame_skip
        return gym_evaluator.GymEnvironment.step(env, action)
    env.step = step

    return env
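# Usage sketch (not in the original file): the overridden `step` repeats the
# chosen action for `frame_skip` frames. The [steer, gas, brake] action format
# is the standard CarRacing convention and is assumed here.
env = environment()
state = env.reset()
state, reward, done, _ = env.step([0., 1., 0.], frame_skip=4)  # accelerate for 4 frames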
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

# Create the environment
env = gym_evaluator.GymEnvironment(args.env)

# Construct the network
network = Network(env, args)

# Initialize parallel workers by env.parallel_init
states = env.parallel_init(args.workers)

saved_model_path = Path(__file__).parent / 'paac_models_weights'
training = not args.use_pretrained

summary_writer = tf.summary.create_file_writer(
    str(Path(__file__).parent / 'logs' /
        f'train-{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}'))
summary_writer.set_as_default()
def environment():
    return gym_evaluator.GymEnvironment("CartPolePixels-v0")
def environment(seed=None):
    return gym_evaluator.GymEnvironment("CartPolePixels-v0", seed=seed)
class Network:
    def __init__(self, env, args):
        # TODO: Similarly to reinforce, define two models:
        # - _policy, which predicts a distribution over the actions
        # - _value, which predicts the value function
        # Use independent networks for both of them, each with
        # `args.hidden_layer` neurons in one hidden layer,
        # and train them using Adam with the given `args.learning_rate`.
        raise NotImplementedError()

    def train(self, states, actions, returns):
        states, actions, returns = np.array(states, np.float32), np.array(actions, np.int32), np.array(returns, np.float32)
        # TODO: Train the policy network using the policy gradient theorem
        # and the value network using MSE.

    def predict_actions(self, states):
        states = np.array(states, np.float32)
        return self._policy.predict_on_batch(states)

    def predict_values(self, states):
        states = np.array(states, np.float32)
        return self._value.predict_on_batch(states)[:, 0]


if __name__ == "__main__":
    # Parse arguments
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v1", type=str, help="Environment.")
    parser.add_argument("--evaluate_each", default=100, type=int, help="Evaluate each number of batches.")
    parser.add_argument("--evaluate_for", default=10, type=int, help="Evaluate for number of batches.")
    parser.add_argument("--gamma", default=None, type=float, help="Discounting factor.")
    parser.add_argument("--hidden_layer", default=None, type=int, help="Size of hidden layer.")
    parser.add_argument("--learning_rate", default=None, type=float, help="Learning rate.")
    parser.add_argument("--render_each", default=0, type=int, help="Render some episodes.")
    parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")
    parser.add_argument("--workers", default=1, type=int, help="Number of parallel workers.")
    args = parser.parse_args()

    # Fix random seeds and number of threads
    np.random.seed(42)
    tf.random.set_seed(42)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Create the environment
    env = gym_evaluator.GymEnvironment(args.env)

    # Construct the network
    network = Network(env, args)

    # Initialize parallel workers by env.parallel_init
    states = env.parallel_init(args.workers)
    while True:
        # Training
        for _ in range(args.evaluate_each):
            # TODO: Choose actions using network.predict_actions

            # TODO: Perform steps by env.parallel_step

            # TODO: Compute return estimates by
            # - extracting next_states from steps
            # - computing value function approximation in next_states
            # - estimating returns by reward + (0 if done else args.gamma * next_state_value)

            # TODO: Train network using current states, chosen actions and estimated returns
            pass

        # Periodic evaluation
        returns = []
        for _ in range(args.evaluate_for):
            returns.append(0)
            state, done = env.reset(), False
            while not done:
                if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                    env.render()
                probabilities = network.predict_actions([state])[0]
                action = np.argmax(probabilities)
                state, reward, done, _ = env.step(action)
                returns[-1] += reward
        print("Evaluation of {} episodes: {}".format(args.evaluate_for, np.mean(returns)))
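# A hedged sketch (not the official solution) of the two TODO models above,
# under the assumption that plain Keras Sequential models suffice for
# CartPole-sized inputs. `build_paac_models` is an illustrative helper name,
# and `--gamma`, `--hidden_layer` and `--learning_rate` must be given concrete
# values on the command line.
import tensorflow as tf

def build_paac_models(env, args):
    policy = tf.keras.Sequential([
        tf.keras.layers.Dense(args.hidden_layer, activation="relu", input_shape=env.state_shape),
        tf.keras.layers.Dense(env.actions, activation="softmax"),
    ])
    policy.compile(optimizer=tf.keras.optimizers.Adam(args.learning_rate),
                   loss="sparse_categorical_crossentropy")
    value = tf.keras.Sequential([
        tf.keras.layers.Dense(args.hidden_layer, activation="relu", input_shape=env.state_shape),
        tf.keras.layers.Dense(1),
    ])
    value.compile(optimizer=tf.keras.optimizers.Adam(args.learning_rate), loss="mse")
    return policy, value

# The train step can then weight the crossentropy by the advantage:
#   advantages = returns - value.predict_on_batch(states)[:, 0]
#   policy.train_on_batch(states, actions, sample_weight=advantages)
#   value.train_on_batch(states, returns)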
import gym
import numpy as np
import gym_evaluator
import _pickle as pickle
import sys

env = gym_evaluator.GymEnvironment('BipedalWalker-v2')
#env = gym.make('BipedalWalker-v2')
np.random.seed(10)

hl_size = 100
version = 1
npop = 50
sigma = 0.1
alpha = 0.03
iter_num = 300
aver_reward = None
allow_writing = True
reload = False

print(hl_size, version, npop, sigma, alpha, iter_num)

if reload:
    model = pickle.load(open('model-pedal%d.p' % version, 'rb'))
else:
    model = {}
    model['W1'] = np.random.randn(24, hl_size) / np.sqrt(24)
    model['W2'] = np.random.randn(hl_size, 4) / np.sqrt(hl_size)
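# A hedged completion sketch: the file sets up a two-layer policy for the
# 24-dimensional BipedalWalker observation and 4-dimensional action, which
# suggests an evolution-strategies loop along these lines (names such as
# `get_action` and `f` are illustrative, not from the original).
def get_action(state, model):
    h = np.tanh(np.dot(state, model['W1']))  # hidden layer
    return np.tanh(np.dot(h, model['W2']))   # actions in [-1, 1]

def f(model):
    # Episode return of the deterministic policy given by `model`.
    state, total, done = env.reset(), 0, False
    while not done:
        state, reward, done, _ = env.step(get_action(state, model))
        total += reward
    return total

for i in range(iter_num):
    # Sample npop Gaussian perturbations of every weight matrix.
    noise = {k: np.random.randn(npop, *v.shape) for k, v in model.items()}
    rewards = np.array([f({k: v + sigma * noise[k][j] for k, v in model.items()})
                        for j in range(npop)])
    advantages = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
    for k in model:
        # ES gradient estimate: average of noise weighted by normalized returns.
        model[k] += alpha / (npop * sigma) * np.dot(noise[k].transpose(1, 2, 0), advantages)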
class Network:
    def __init__(self, env, args):
        # Store the arguments
        self.args = args

        # TODO: Create the actor. The input should be a batch of _sequences_ of
        # states (so the input shape is `[None] + env.state_shape`), each state
        # processed independently by the same network with a dense layer of
        # args.hidden_layer units with ReLU activation, followed by a softmax
        # layer with `env.actions` units.
        #
        # We use sequences of states on the input, because we want to predict
        # probabilities of up to `args.n` following states.
        #
        # We train the actor using sparse categorical crossentropy loss
        # and Adam optimizer with args.learning_rate.

        # TODO: Create the critic. The input should again be a batch of _sequences_
        # of states, each processed independently by a network with a dense layer of
        # args.hidden_layer units with ReLU activation, followed by a dense layer
        # with 1 output and no activation.
        #
        # We train the critic using MSE loss and Adam optimizer with args.learning_rate.
        raise NotImplementedError()

    # Do not change the method signature, as this method is used for testing in ReCodEx.
    @staticmethod
    def vtrace(args, actions, action_probabilities, rewards, actor_probabilities, critic_values):
        """Compute loss for the V-trace algorithm.

        Arguments:
          args: command line arguments
          actions: [batch_size, n] chosen actions
          action_probabilities: [batch_size, n] probabilities of the chosen actions
            under the behaviour policy; guaranteed to be 1 for actions after
            episode termination
          rewards: [batch_size, n] observed rewards; guaranteed to be 0 for
            rewards after episode termination
          actor_probabilities: [batch_size, n, num_actions] probabilities of
            actions under the current (target) policy
          critic_values: [batch_size, n+1] critic estimates of the values of
            encountered states; guaranteed to be 0 for states after episode
            termination
        """
        # TODO: Compute target policy probability of given actions
        # into `actor_action_probabilities`, i.e., symbolically
        #   actor_action_probabilities = actor_probabilities[:, :, actions[:, :]]

        rhos, cs = [], []
        # TODO: Compute clipped rho-s and c-s, as a Python list with
        # args.n elements, each a tensor (values for a whole batch).
        # The values rhos[i] and cs[i] should be the importance sampling
        # ratio for actions[:, i] clipped by `args.clip_rho` and
        # `args.clip_c`, respectively.

        vs = [None] * (args.n + 1)
        # TODO: Compute vs from the last one to the first one.
        # The `vs[args.n]` is just `critic_values[:, args.n]`.
        # The others can be computed recursively as
        #   vs[t] = critic_values[:, t] + delta_t V + gamma * cs[t] * (vs[t+1] - critic_values[:, t+1])

        # TODO: Return a pair with the following elements:
        # - the coefficient for the actor loss, i.e., the product of the importance
        #   sampling factor (rhos[0]) and the estimated q_value
        #   (rewards + gamma * vs[1]) minus the baseline of critic_values
        # - the target for the critic, i.e., vs[0]

    @tf.function
    def train(self, steps, states, actions, action_probabilities, rewards):
        # TODO: Run the actor on the first `args.n` states and the critic on all `args.n+1` states.

        # TODO: Only the first `steps` of `states` are valid (so `steps` might be `args.n+1`
        # if all `states` are non-terminal), so the critic predictions for the
        # states after the `steps` ones must be set to zero.

        # TODO: Run the `vtrace` method, with the last two arguments being the actor
        # and critic predictions, obtaining `actor_weights` and `critic_targets`.
        # TODO: Train the actor, using the first state of every batch instance, with
        # - sparse categorical crossentropy loss, weighted by `actor_weights`,
        # - plus entropy regularization with weight self.args.entropy_regularization.
        #   The entropy of a given categorical distribution `d` is
        #     tf.reduce_sum(-d * tf.math.log(d), axis=-1)

        # TODO: Train the critic using the first state of every batch instance,
        # utilizing MSE loss with `critic_targets` as gold values.
        raise NotImplementedError()

    @tf.function
    def _predict_actions(self, states):
        return self._actor(states)

    def predict_actions(self, states):
        states = np.expand_dims(np.array(states, np.float32), axis=1)
        return self._predict_actions(states).numpy()[:, 0]


if __name__ == "__main__":
    # Parse arguments
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", default=None, type=int, help="Number of transitions to train on.")
    parser.add_argument("--clip_c", default=1., type=float, help="Clip value for c in V-trace.")
    parser.add_argument("--clip_rho", default=1., type=float, help="Clip value for rho in V-trace.")
    parser.add_argument("--entropy_regularization", default=0.1, type=float, help="Entropy regularization weight.")
    parser.add_argument("--env", default="CartPole-v1", type=str, help="Environment.")
    parser.add_argument("--evaluate_each", default=100, type=int, help="Evaluate each number of episodes.")
    parser.add_argument("--evaluate_for", default=10, type=int, help="Evaluate for number of episodes.")
    parser.add_argument("--gamma", default=None, type=float, help="Discounting factor.")
    parser.add_argument("--hidden_layer", default=None, type=int, help="Size of hidden layer.")
    parser.add_argument("--learning_rate", default=None, type=float, help="Learning rate.")
    parser.add_argument("--n", default=None, type=int, help="Number of steps to use in V-trace.")
    parser.add_argument("--replay_buffer_maxlen", default=None, type=int, help="Replay buffer maxlen.")
    parser.add_argument("--render_each", default=0, type=int, help="Render some episodes.")
    parser.add_argument("--target_return", default=495, type=float, help="Target return.")
    parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")
    args = parser.parse_args()

    # Fix random seeds and number of threads
    np.random.seed(42)
    tf.random.set_seed(42)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Create the environment
    env = gym_evaluator.GymEnvironment(args.env)

    # Construct the network
    network = Network(env, args)

    # Replay memory
    replay_buffer = collections.deque(maxlen=args.replay_buffer_maxlen)
    Transition = collections.namedtuple("Transition", ["state", "action", "action_probability", "reward", "done"])

    def evaluate_episode(evaluating=False):
        rewards = 0
        state, done = env.reset(evaluating), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()
            probabilities = network.predict_actions([state])[0]
            action = np.argmax(probabilities)
            state, reward, done, _ = env.step(action)
            rewards += reward
        return rewards

    while True:
        # Training
        for _ in range(args.evaluate_each):
            state, done = env.reset(), False
            while not done:
                probabilities = network.predict_actions([state])[0]
                action = np.random.choice(np.arange(len(probabilities)), p=probabilities)
                next_state, reward, done, _ = env.step(action)
                replay_buffer.append(Transition(state, action, probabilities[action], reward, done))
                state = next_state

                if len(replay_buffer) > args.n * args.batch_size:
                    steps = np.zeros((args.batch_size), dtype=np.int32)
                    states = np.zeros([args.batch_size, args.n + 1] + env.state_shape, dtype=np.float32)
                    actions = np.zeros((args.batch_size, args.n), dtype=np.int32)
                    action_probabilities = np.ones((args.batch_size, args.n), dtype=np.float32)
                    rewards = np.zeros((args.batch_size, args.n), dtype=np.float32)
                    # TODO: Prepare a batch.
                    #
                    # Each batch instance is a sequence of `args.n+1` consecutive `states` and
                    # `args.n` consecutive `actions`, `action_probabilities` and `rewards`.
                    # The `steps` indicate how many `states` in range [1,2,...,args.n+1] are valid.
                    #
                    # To generate a batch, sample `args.batch_size` indices from replay_buffer
                    # (ignoring the last `args.n` ones to avoid overflow). Then fill for every
                    # sampled index the consecutive states, actions, action_probabilities and
                    # rewards -- if `done` is not set, all of them are filled and `steps` is
                    # set to `args.n+1`. If `done` is set, only a subset of states, actions,
                    # action_probabilities and rewards are set, and `steps` is set to the
                    # number of valid states (<`args.n+1`).

                    network.train(steps, states, actions, action_probabilities, rewards)

        # Periodic evaluation
        returns = []
        for _ in range(args.evaluate_for):
            returns.append(evaluate_episode())
        print("Evaluation of {} episodes: {}".format(args.evaluate_for, np.mean(returns)))

        if np.mean(returns) >= args.target_return:
            print("Reached mean average return of {}, running final evaluation.".format(np.mean(returns)))
            while True:
                evaluate_episode(True)
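# A hedged sketch (not the reference solution) of the `vtrace` computation
# above, assuming eager TensorFlow tensors and the argument contract from the
# docstring; `vtrace_sketch` is an illustrative name.
import tensorflow as tf

def vtrace_sketch(args, actions, action_probabilities, rewards, actor_probabilities, critic_values):
    # Probability of each chosen action under the current (target) policy.
    actor_action_probabilities = tf.gather(actor_probabilities, actions, batch_dims=2)

    rhos, cs = [], []
    for i in range(args.n):
        ratio = actor_action_probabilities[:, i] / action_probabilities[:, i]
        rhos.append(tf.minimum(ratio, args.clip_rho))
        cs.append(tf.minimum(ratio, args.clip_c))

    # Backward recursion for the v-trace targets.
    vs = [None] * (args.n + 1)
    vs[args.n] = critic_values[:, args.n]
    for t in reversed(range(args.n)):
        delta = rhos[t] * (rewards[:, t] + args.gamma * critic_values[:, t + 1] - critic_values[:, t])
        vs[t] = critic_values[:, t] + delta + args.gamma * cs[t] * (vs[t + 1] - critic_values[:, t + 1])

    # Actor-loss coefficient and critic target, as described in the TODOs.
    actor_weights = rhos[0] * (rewards[:, 0] + args.gamma * vs[1] - critic_values[:, 0])
    return actor_weights, vs[0]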
    def construct(self, args, state_shape, action_components, action_lows, action_highs):
        with self.session.graph.as_default():
            self.states = tf.placeholder(tf.float32, [None] + state_shape)
            self.actions = tf.placeholder(tf.float32, [None, action_components])
            self.returns = tf.placeholder(tf.float32, [None])

            # Actor
            def actor(inputs):
                # TODO: Implement the actor network, starting with `inputs` and returning
                # `action_components` values for each batch example. Usually, one
                # or two hidden layers are employed.
                #
                # Each action_component[i] should be mapped to the range
                # [action_lows[i]..action_highs[i]], for example using tf.nn.sigmoid
                # and suitable rescaling.
                raise NotImplementedError()

            with tf.variable_scope("actor"):
                self.mus = actor(self.states)
            with tf.variable_scope("target_actor"):
                target_actions = actor(self.states)

            # Critic from given actions
            def critic(inputs, actions):
                # TODO: Implement the critic network, starting with `inputs` and `actions`
                # and producing a vector of predicted returns. Usually, `inputs` are fed
                # through a hidden layer first, then concatenated with `actions` and fed
                # through two more hidden layers, before computing the returns.
                raise NotImplementedError()

            with tf.variable_scope("critic"):
                values_of_given = critic(self.states, self.actions)
            with tf.variable_scope("critic", reuse=True):
                values_of_predicted = critic(self.states, self.mus)
            with tf.variable_scope("target_critic"):
                self.target_values = critic(self.states, target_actions)

            # Update ops
            update_target_ops = []
            for target_var, var in zip(tf.global_variables("target_actor") + tf.global_variables("target_critic"),
                                       tf.global_variables("actor") + tf.global_variables("critic")):
                update_target_ops.append(target_var.assign((1. - args.target_tau) * target_var + args.target_tau * var))

            # TODO: Training.
            # Define `actor_loss` and `critic_loss` and then:
            # - train the critic (if required, using critic variables only,
            #   by using the `var_list` argument of `Optimizer.minimize`)
            # - train the actor (if required, using actor variables only,
            #   by using the `var_list` argument of `Optimizer.minimize`)
            # - update the target network variables
            # You can group several operations into one using `tf.group`.
            global_step = tf.train.create_global_step()
            self.training = tf.group(...)
            # Initialize variables
            self.session.run(tf.global_variables_initializer())

    def predict_actions(self, states):
        return self.session.run(self.mus, {self.states: states})

    def predict_values(self, states):
        return self.session.run(self.target_values, {self.states: states})

    def train(self, states, actions, returns):
        self.session.run(self.training, {self.states: states, self.actions: actions, self.returns: returns})


class OrnsteinUhlenbeckNoise:
    """Ornstein-Uhlenbeck process."""

    def __init__(self, shape, mu, theta, sigma):
        self.mu = mu * np.ones(shape)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = np.copy(self.mu)

    def sample(self):
        self.state += self.theta * (self.mu - self.state) + np.random.normal(scale=self.sigma, size=self.state.shape)
        return self.state


if __name__ == "__main__":
    # Fix random seed
    np.random.seed(42)

    # Parse arguments
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", default=None, type=int, help="Batch size.")
    parser.add_argument("--env", default="Pendulum-v0", type=str, help="Environment.")
    parser.add_argument("--evaluate_each", default=100, type=int, help="Evaluate each number of episodes.")
    parser.add_argument("--evaluate_for", default=10, type=int, help="Evaluate for number of episodes.")
    parser.add_argument("--noise_sigma", default=0.2, type=float, help="OU noise sigma.")
    parser.add_argument("--noise_theta", default=0.15, type=float, help="OU noise theta.")
    parser.add_argument("--gamma", default=None, type=float, help="Discounting factor.")
    parser.add_argument("--hidden_layer", default=None, type=int, help="Size of hidden layer.")
    parser.add_argument("--learning_rate", default=None, type=float, help="Learning rate.")
    parser.add_argument("--render_each", default=0, type=int, help="Render some episodes.")
    parser.add_argument("--target_tau", default=None, type=float, help="Target network update weight.")
    parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")
    args = parser.parse_args()

    # Create the environment
    env = gym_evaluator.GymEnvironment(args.env)
    assert len(env.action_shape) == 1
    action_lows, action_highs = map(np.array, env.action_ranges)

    # Construct the network
    network = Network(threads=args.threads)
    network.construct(args, env.state_shape, env.action_shape[0], action_lows, action_highs)

    # Replay memory; the maxlen parameter can be passed to deque for a size limit,
    # which we however do not need in this simple task.
    replay_buffer = collections.deque()
    Transition = collections.namedtuple("Transition", ["state", "action", "reward", "done", "next_state"])

    def evaluate_episode(evaluating=False):
        rewards = 0
        state, done = env.reset(evaluating), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()
            action = network.predict_actions([state])[0]
            state, reward, done, _ = env.step(action)
            rewards += reward
        return rewards

    noise = OrnsteinUhlenbeckNoise(env.action_shape[0], 0., args.noise_theta, args.noise_sigma)
    while True:
        # Training
        for _ in range(args.evaluate_each):
            state, done = env.reset(), False
            noise.reset()
            while not done:
                # TODO: Perform an action and store the transition in the replay buffer

                # If the replay_buffer is large enough, perform training
                if len(replay_buffer) >= args.batch_size:
                    batch = np.random.choice(len(replay_buffer), size=args.batch_size, replace=False)
                    states, actions, rewards, dones, next_states = zip(*[replay_buffer[i] for i in batch])
                    # TODO: Perform the training

        # Evaluation
        returns = []
        for _ in range(args.evaluate_for):
            returns.append(evaluate_episode())
        print("Evaluation of {} episodes: {}".format(args.evaluate_for, np.mean(returns)))
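# A hedged sketch (one possible completion, not the reference solution) of the
# two TODOs in the training loop above. It reuses `env`, `network`, `noise`,
# `args` and `Transition` from this file; `action_lows`/`action_highs` come
# from `env.action_ranges` as above, and `collect_and_train` is an
# illustrative helper name.
def collect_and_train(state, replay_buffer):
    # Act with Ornstein-Uhlenbeck exploration noise, clipped to the valid range.
    action = np.clip(network.predict_actions([state])[0] + noise.sample(),
                     action_lows, action_highs)
    next_state, reward, done, _ = env.step(action)
    replay_buffer.append(Transition(state, action, reward, done, next_state))

    # If the replay buffer is large enough, perform one DDPG update.
    if len(replay_buffer) >= args.batch_size:
        batch = np.random.choice(len(replay_buffer), size=args.batch_size, replace=False)
        states, actions, rewards, dones, next_states = zip(*[replay_buffer[i] for i in batch])
        # One-step TD targets bootstrapped from the target actor and critic.
        returns = np.array(rewards) + args.gamma * (1 - np.array(dones)) * network.predict_values(np.array(next_states))
        network.train(np.array(states), np.array(actions), returns)
    return next_state, done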