def main_pendulum(logdir, seed, n_iter, gamma, min_timesteps_per_batch, initial_stepsize, desired_kl, vf_type, vf_params, animate=False): tf.set_random_seed(seed) np.random.seed(seed) env = gym.make("Pendulum-v0") ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] logz.configure_output_dir(logdir) if vf_type == 'linear': vf = LinearValueFunction(**vf_params) elif vf_type == 'nn': vf = NnValueFunction(ob_dim=ob_dim, **vf_params) YOUR_CODE_HERE sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently) update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) sess = tf.Session() sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 total_timesteps = 0 stepsize = initial_stepsize for i in range(n_iter): print("********** Iteration %i ************"%i) YOUR_CODE_HERE if kl > desired_kl * 2: stepsize /= 1.5 print('stepsize -> %s'%stepsize) elif kl < desired_kl / 2: stepsize *= 1.5 print('stepsize -> %s'%stepsize) else: print('stepsize OK') # Log diagnostics logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logz.log_tabular("KLOldNew", kl) logz.log_tabular("Entropy", ent) logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) logz.log_tabular("TimestepsSoFar", total_timesteps) # If you're overfitting, EVAfter will be way larger than EVBefore. # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias logz.dump_tabular()
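# The stepsize adaptation in the loop above follows a simple multiplicative rule keyed
# on the measured KL divergence between the old and new policies. A minimal standalone
# sketch of that rule (the function name is hypothetical; the thresholds and the 1.5
# factor mirror the code above, nothing else is assumed):
def adapt_stepsize(stepsize, kl, desired_kl, factor=1.5):
    """Shrink the stepsize when the policy moved too far, grow it when it barely moved."""
    if kl > desired_kl * 2:
        return stepsize / factor
    elif kl < desired_kl / 2:
        return stepsize * factor
    return stepsize

# Example usage inside the training loop: stepsize = adapt_stepsize(stepsize, kl, desired_kl)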
def train(self, num_iter): start = time.time() for i in range(num_iter): t1 = time.time() self.train_step() t2 = time.time() print('total time of one step', t2 - t1) print('iter ', i,' done') # record statistics every 10 iterations if ((i + 1) % 10 == 0): rewards = self.aggregate_rollouts(num_rollouts = 100, evaluate = True) w = ray.get(self.workers[0].get_weights_plus_stats.remote()) np.savez(self.logdir + "/lin_policy_plus", w) print(sorted(self.params.items())) logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", i + 1) logz.log_tabular("AverageReward", np.mean(rewards)) logz.log_tabular("StdRewards", np.std(rewards)) logz.log_tabular("MaxRewardRollout", np.max(rewards)) logz.log_tabular("MinRewardRollout", np.min(rewards)) logz.log_tabular("timesteps", self.timesteps) logz.dump_tabular() t1 = time.time() # get statistics from all workers for j in range(self.num_workers): self.policy.observation_filter.update(ray.get(self.workers[j].get_filter.remote())) self.policy.observation_filter.stats_increment() # make sure master filter buffer is clear self.policy.observation_filter.clear_buffer() # sync all workers filter_id = ray.put(self.policy.observation_filter) setting_filters_ids = [worker.sync_filter.remote(filter_id) for worker in self.workers] # waiting for sync of all workers ray.get(setting_filters_ids) increment_filters_ids = [worker.stats_increment.remote() for worker in self.workers] # waiting for increment of all workers ray.get(increment_filters_ids) t2 = time.time() print('Time to sync statistics:', t2 - t1) return
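# The filter synchronization above assumes every worker maintains running observation
# statistics that the master merges and broadcasts back. A minimal sketch of such a
# filter (a hypothetical class, not the actual `observation_filter` implementation),
# tracking a running mean and variance with Welford-style updates:
import numpy as np

class RunningStat:
    def __init__(self, shape):
        self.n = 0
        self.mean = np.zeros(shape)
        self.m2 = np.zeros(shape)

    def push(self, x):
        # incorporate one observation into the running statistics
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)

    @property
    def std(self):
        return np.sqrt(self.m2 / max(self.n - 1, 1))

    def normalize(self, x):
        return (x - self.mean) / (self.std + 1e-8)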
def train_PG( exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. #========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = tf.placeholder(shape=[None], name="advn", dtype=tf.float32) #========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) # # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. Computing the log probability of a set of actions that were actually taken, # according to the policy. 
# # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # #========================================================================================# sy_logprob_n = None if discrete: sy_logits_na = build_mlp(sy_ob_no, ac_dim, "mlp", n_layers=n_layers, size=size)(sy_ob_no) sy_sampled_ac = tf.squeeze(tf.multinomial( sy_logits_na, 1)) # Hint: Use the tf.multinomial op sy_logprob_n = tf.nn.sparse_softmax_cross_entropy_with_logits( labels=sy_ac_na, logits=sy_logits_na) else: sy_mean = build_mlp(sy_ob_no, ac_dim, "mlp", n_layers=n_layers, size=size)(sy_ob_no) #will learn this when doing the loss sy_logstd = tf.get_variable("logstd", shape=[ ac_dim ]) # logstd should just be a trainable variable, not a network output. #I guess I could also iterate over passing the mean and std, but less cool/efficient then reparametrization trick sy_sampled_ac = tf.squeeze( sy_mean + tf.exp(sy_logstd) * tf.random_normal(tf.shape(sy_mean)), axis=[1]) # Hint: Use the log probability under a multivariate gaussian. sy_logprob_n = -tf.contrib.distributions.MultivariateNormalDiag( loc=sy_mean, scale_diag=tf.exp(sy_logstd)).log_prob(sy_ac_na) #========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation #========================================================================================# loss = tf.reduce_mean( sy_logprob_n * sy_adv_n ) # Loss function that we'll differentiate to get the policy gradient. update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: baseline_prediction = tf.squeeze( build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size)) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline. 
# YOUR_CODE_HERE #baseline_update_op = TODO #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 after_loss = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards = [], [], [] animate_this_episode = (len(paths) == 0 and (itr % 20 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]}) acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > max_path_length: break path = { "observation": np.array(obs), "reward": np.array(rewards), "action": np.array(acs) } paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. 
        #
        #====================================================================================#

        # YOUR_CODE_HERE
        q_n = 0
        rewards_by_episode = [path['reward'] for path in paths]
        if not reward_to_go:
            # Ret(tau): full discounted return of the path, repeated at every timestep
            q_n = np.concatenate([[
                sum([
                    reward_path[i] * gamma**i
                    for i in range(len(reward_path))
                ])
            ] * len(reward_path) for reward_path in rewards_by_episode])
        else:
            # reward-to-go: discounted sum of rewards from timestep i onward
            q_n = np.concatenate([[
                sum([
                    reward_path[j] * gamma**(j - i)
                    for j in range(i, len(reward_path))
                ]) for i in range(len(reward_path))
            ] for reward_path in rewards_by_episode])
        assert len(q_n) == len(ob_no)

        #====================================================================================#
        # ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go
            # at each timestep for each trajectory, and save the result in a variable 'b_n'
            # like 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)
            assert False
            #b_n = TODO
            #adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        # ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # YOUR_CODE_HERE
            adv_mean = np.mean(adv_n)
            adv_std = np.std(adv_n)
            adv_n = (adv_n - adv_mean) / adv_std

        #====================================================================================#
        # ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#

        if nn_baseline:
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)
            # YOUR_CODE_HERE
            pass

        #====================================================================================#
        # ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.
        before_loss = after_loss
        _, after_loss = sess.run([update_op, loss],
                                 feed_dict={
                                     sy_ob_no: ob_no,
                                     sy_ac_na: ac_na,
                                     sy_adv_n: adv_n
                                 })
        #after_loss = sess.run([loss], feed_dict={sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n})

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.log_tabular("LossBefore", before_loss)
        logz.log_tabular("LossAfter", after_loss)
        logz.dump_tabular()
        logz.pickle_tf_vars()
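# The Q-value comprehensions in the training loop above are O(T^2) per path. An
# equivalent O(T) backward pass covering both cases (a numpy sketch; `rewards` stands
# for one path's reward array, as in `rewards_by_episode` above):
import numpy as np

def path_q_values(rewards, gamma, reward_to_go=True):
    T = len(rewards)
    q = np.zeros(T)
    running = 0.0
    for t in reversed(range(T)):
        # running = sum_{t' >= t} gamma**(t'-t) * r_{t'}
        running = rewards[t] + gamma * running
        q[t] = running
    if not reward_to_go:
        # full discounted return Ret(tau), repeated at every timestep
        q = np.full(T, q[0])
    return q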
def train_PG( exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, n_job=1, epoch=1, gae_lambda=None, # network arguments n_layers=1, size=32): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. #========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) #========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) # # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. 
Computing the log probability of a set of actions that were actually taken, # according to the policy. # # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # #========================================================================================# if discrete: # YOUR_CODE_HERE sy_logits_na = build_mlp(sy_ob_no, ac_dim, "nn_policy", n_layers, size) # Hint: Use the tf.multinomial op sy_sampled_ac = tf.squeeze(tf.multinomial(sy_logits_na, 1), 1) sy_logprob_na = tf.nn.log_softmax(sy_logits_na) sy_index_n = tf.stack([tf.range(tf.shape(sy_logits_na)[0]), sy_ac_na], 1) sy_logprob_n = tf.gather_nd(sy_logprob_na, sy_index_n) else: # YOUR_CODE_HERE sy_mu_na = build_mlp(sy_ob_no, ac_dim, "nn_mu", n_layers, size) # logstd should just be a trainable variable, not a network output. sy_logstd = tf.get_variable("nn_logstd", shape=[1, ac_dim], initializer=tf.zeros_initializer()) norm_dist = tf.distributions.Normal(sy_mu_na, tf.exp(sy_logstd)) sy_sampled_ac = norm_dist.sample() # Hint: Use the log probability under a multivariate gaussian. sy_logprob_n = tf.reduce_sum(norm_dist.log_prob(sy_ac_na), 1) #========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation #========================================================================================# # Loss function that we'll differentiate to get the policy gradient. loss = -tf.reduce_sum(sy_logprob_n * sy_adv_n) update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline or gae_lambda: baseline_prediction = tf.squeeze( build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size)) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline. 
# YOUR_CODE_HERE baseline_target = tf.placeholder(tf.float32, [None]) baseline_loss = tf.losses.mean_squared_error(baseline_target, baseline_prediction) baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize( baseline_loss) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 # Create environments envs = MultiEnv(env_name, n_job) for itr in range(n_iter): print("********** Iteration %i ************" % itr) # Start timer for sample timing time_start = time.time() # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate) observations = envs.reset() paths_done = [False] * n_job paths_observations = [[] for _ in range(n_job)] paths_actions = [[] for _ in range(n_job)] paths_rewards = [[] for _ in range(n_job)] steps = 0 while True: if animate_this_episode: envs.render() time.sleep(0.05) # Append observations for i in range(n_job): if not paths_done[i]: paths_observations[i].append(observations[i]) # Get actions from current policy actions = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: observations}) for i in range(n_job): if not paths_done[i]: paths_actions[i].append(actions[i]) # Run step observations, rewards, path_done_next = envs.step(actions) # Append rewards for i in range(n_job): if not paths_done[i]: paths_rewards[i].append(rewards[i]) steps += 1 paths_done = path_done_next if np.all(paths_done) or steps > max_path_length: break # Append paths for i in range(n_job): path = { "observation": np.array(paths_observations[i]), "reward": np.array(paths_rewards[i]), "action": np.array(paths_actions[i]) } paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Get sample time time_used = time.time() - time_start # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. 
# # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # #====================================================================================# # YOUR_CODE_HERE q_n = np.asanyarray([]) for path in paths: reward = path['reward'] length = len(reward) if not reward_to_go: q_path = np.sum(reward * np.logspace(0, length - 1, length, base=gamma)) q_n = np.append(q_n, np.ones_like(reward) * q_path) else: q_path = np.zeros_like(reward) # Accumulate reward from right to left temp = reward.copy() for t in range(length): q_path += np.pad(temp[t:], (0, t), 'constant') temp *= gamma q_n = np.append(q_n, q_path) #====================================================================================# # ----------SECTION 5---------- # Computing Baselines #====================================================================================# if nn_baseline or gae_lambda: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = sess.run(baseline_prediction, {sy_ob_no: ob_no}) # Rescale to normal distribution b_std = np.std(b_n) b_mean = np.mean(b_n) b_n = (b_n - b_mean) / b_std # Rescale to Q-value distribution q_std = np.std(q_n) q_mean = np.mean(q_n) b_n = q_mean + b_n * q_std if gae_lambda: # Generalized advantage estimator adv_n = np.zeros_like(q_n) index_start = 0 for path in paths: reward = path['reward'] length = len(reward) index_end = index_start + length path_v = b_n[index_start:index_end] path_v_next = b_n[index_start + 1:index_end] path_v_next = np.append(path_v_next, 0) delta = reward + gamma * path_v_next - path_v # Accumulate critic from right to left temp = delta.copy() for t in range(length): adv_n[index_start:index_end] += np.pad( temp[t:], (0, t), 'constant') temp *= gamma * gae_lambda index_start = index_end else: # Baseline estimator adv_n = q_n - b_n else: adv_n = q_n.copy() #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
# YOUR_CODE_HERE adv_std = np.std(adv_n) + 1e-5 adv_mean = np.mean(adv_n) adv_n = (adv_n - adv_mean) / adv_std #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline or gae_lambda: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) # YOUR_CODE_HERE target_n = (q_n - q_mean) / q_std feed_dict = {sy_ob_no: ob_no, baseline_target: target_n} for i in range(epoch): sess.run(baseline_update_op, feed_dict) #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. # YOUR_CODE_HERE feed_dict = { sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n / len(paths) } # Save the loss function before the update loss_before = sess.run(loss, feed_dict) # Train for i in range(epoch): sess.run(update_op, feed_dict) # Save the loss function after the update loss_after = sess.run(loss, feed_dict) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.log_tabular("LossBeforeUpdate", loss_before) logz.log_tabular("LossAfterUpdate", loss_after) logz.log_tabular("LossUpdated", loss_after - loss_before) logz.log_tabular("SampleTime", time_used) logz.dump_tabular() logz.pickle_tf_vars()
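# The GAE block in the function above accumulates discounted deltas with repeated
# padding, which is O(T^2) per path. An equivalent O(T) backward recursion over a
# single path (a sketch; like the code above, it bootstraps with V = 0 after the
# final state):
import numpy as np

def gae_path(rewards, values, gamma, lam):
    T = len(rewards)
    values_next = np.append(values[1:], 0.0)
    # one-step TD residuals
    deltas = rewards + gamma * values_next - values
    adv = np.zeros(T)
    running = 0.0
    for t in reversed(range(T)):
        # adv[t] = sum_{k >= t} (gamma * lam)**(k - t) * deltas[k]
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    return adv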
def deepq(env, max_episode_steps, n_experiments, n_total_steps, seed, gamma, learning_rate, conv_sizes, fc_sizes, n_init_buffer_size, n_buffer_size, batch_size, epsilon_start, epsilon_end, exploration_fraction, update_target_freq, logging_dir="log", isRenderding=True, isRecordingVideo=True, recordingVideo_dir="video", rec_per_episodes=100, chckp_dir="checkpoint", checkpt_save_freq=100, test_name="test", device="CPU"): # Get environment name env_name = env.spec.id if max_episode_steps > 0: env._max_episode_steps = max_episode_steps print("Env max_step_per_episode:{}".format(env._max_episode_steps)) # Identify states and action dimensions isDiscrete = isinstance(env.action_space, gym.spaces.Discrete) n_actions = env.action_space.n if isDiscrete else env.action_space.shape[0] # State processor state_shape = env.observation_space.shape state_size = [84, 84, 4] # list(state_shape) state_processor = StateProcessor(input_shape=state_shape, output_shape=state_size[:-1]) if device in {"gpu", "GPU"}: tf_device = '/device:GPU:0' else: tf_device = '/device:CPU:0' with tf.device(tf_device): value_model = ValueEstimator(scope="q_func", state_size=state_size, action_size=n_actions, conv_sizes=conv_sizes, fc_sizes=fc_sizes, learning_rate=learning_rate, isDiscrete=isDiscrete) target_value_model = ValueEstimator(scope="t_q_func", state_size=state_size, action_size=n_actions, conv_sizes=conv_sizes, fc_sizes=fc_sizes, learning_rate=learning_rate, isDiscrete=isDiscrete) init_time = time.strftime("%d-%m-%Y_%H-%M-%S") for exp in range(n_experiments): # Set random seed rand_seed = seed + 10 * exp tf.set_random_seed(rand_seed) np.random.seed(rand_seed) # Global step global_step = tf.Variable(0, name="global_step", trainable=False) # Init TF session with tf.Session() as sess: sess.run(tf.global_variables_initializer()) # TF saver saver = tf.train.Saver() chckp_dir = os.path.join(chckp_dir, env_name, test_name, init_time, str(exp)) if not os.path.exists(chckp_dir): os.makedirs(chckp_dir) latest_checkpoint = tf.train.latest_checkpoint( checkpoint_dir=chckp_dir) if latest_checkpoint: print( "Loading model checkpoint {}...".format(latest_checkpoint)) saver.restore(sess, latest_checkpoint) # Configure output directory for logging # Data logging paths if isRecordingVideo: recordingVideo_dir = os.path.join(recordingVideo_dir, env_name, test_name, init_time, str(exp)) if not os.path.exists(recordingVideo_dir): os.makedirs(recordingVideo_dir) logging_dir = os.path.join(logging_dir, env_name, test_name, init_time) if not os.path.exists(logging_dir): os.makedirs(logging_dir) logz.configure_output_dir(os.path.join(logging_dir, str(exp))) # Log experimental parameters args = inspect.getargspec(deepq)[0] locals_ = locals() params = { k: locals_[k] if k in locals_ and isinstance(locals_[k], (int, str, float)) else None for k in args } logz.save_params(params) print("Parameter Lists") for param in params: if params[param]: print(param + ": {}".format(params[param])) # Global step total_step = tf.train.global_step(sess, global_step) # Epsilon decaying schedule epsilons = np.linspace(epsilon_start, epsilon_end, int(exploration_fraction * n_total_steps)) # The policy we're following policy = make_epsilon_greedy_policy(value_model, env.action_space.n) # Create a replay buffer replay_memory = [] print("Collecting initial replay buffer") state = env.reset() state = state_processor.process( state, sess) # TODO: DO NOT PROCESS IMAGE TO GRAYSCALE state = np.stack([state] * 4, axis=2) # Sequential images (4 frames) for idx in 
range(n_init_buffer_size): action_probs = policy(state, epsilons[0], sess) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) next_state, reward, done, _ = env.step(action) next_state = state_processor.process(next_state, sess) # Append next_state next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, axis=2), axis=2) replay_memory.append( Transition(state, action, reward, next_state, done)) if done: state = env.reset() state = state_processor.process(state, sess) state = np.stack([state] * 4, axis=2) else: state = next_state print("==========================================") print("Exp: ", exp) print("==========================================") # Stat variables episode_reward_sum = 0 episode_length = 0 loss_sum = 0 loss_steps = 0 ep = 0 # Episode # Reset env variables state = env.reset() state = state_processor.process( state, sess) # TODO: DO NOT PROCESS IMAGE TO GRAYSCALE state = np.stack([state] * 4, axis=2) # Sequential images (4 frames) video_recorder = None if isRenderding and isRecordingVideo and (ep == 0 or ep % rec_per_episodes == 0): video_recorder = VideoRecorder( env, os.path.join( recordingVideo_dir, "vid_{}_{}_{}_{}.mp4".format(env_name, exp, test_name, ep)), enabled=True) print("Recording a video of this episode {} in experiment {}". format(ep, exp)) # Iterate total n steps of simulation across numerous episodes for total_step in range(n_total_steps): # Epsilon for this time step epsilon = epsilons[min( total_step, int(exploration_fraction * n_total_steps) - 1)] # Update target Q-function with online Q-function if total_step % update_target_freq == 0: copy_model_parameters(value_model, target_value_model, sess) print("Copied model parameters to target network.") # Take a step action_probs = policy(state, epsilon, sess) action = np.random.choice(np.arange(len(action_probs)), p=action_probs) next_state, reward, done, _ = env.step(action) next_state = state_processor.process(next_state, sess) next_state = np.append(state[:, :, 1:], np.expand_dims(next_state, axis=2), axis=2) if video_recorder and isRenderding: env.render() video_recorder.capture_frame() # Check whether replay buffer is full if len(replay_memory) == n_buffer_size: replay_memory.pop(0) # Save transition to replay buffer replay_memory.append( Transition(state, action, reward, next_state, done)) # Update online Q-function # Sample randomized minibatch from replay buffer samples = random.sample(replay_memory, batch_size) states_batch, actions_batch, reward_batch, next_state_batch, done_batch = map( np.array, zip(*samples)) # Calculate action-values from double Q-functions q_values_next = value_model.predict( next_state_batch, sess) # Q values per each possible actions selected_actions = np.argmax( q_values_next, axis=1 ) # Use Q-function (not target) to get the max action # Get max action-value using max action from online Q-values target_q_values_next = target_value_model.predict( next_state_batch, sess) selected_target_values = gamma * target_q_values_next[ np.arange(batch_size), selected_actions] targets_batch = reward_batch + np.invert(done_batch).astype( np.float32) * selected_target_values # Update Q(action-value) function states_batch = np.array(states_batch) loss = value_model.update(states_batch, actions_batch, targets_batch, sess=sess) loss_sum += loss loss_steps += 1 if done: # Close video recorder if video_recorder: video_recorder.close() video_recorder = None print( "===================== End of Episode:{} @ step:{} =====================" .format(ep, total_step)) # 
Log progress logz.log_tabular("Episode", ep) logz.log_tabular("Episode length", episode_length) logz.log_tabular("Total steps", total_step) logz.log_tabular("Mean rewards", episode_reward_sum / episode_length) logz.dump_tabular() logz.pickle_tf_vars() # Reset env and stat variables state = env.reset() state = state_processor.process( state, sess) # TODO: DO NOT PROCESS IMAGE TO GRAYSCALE state = np.stack([state] * 4, axis=2) # Sequential images (4 frames) episode_reward_sum = 0 episode_length = 0 loss_sum = 0 loss_steps = 0 ep += 1 # Save model per episode if ep % checkpt_save_freq == 0 or ep == 0: saver.save(tf.get_default_session(), chckp_dir, global_step=total_step) # Recording videos if video_recorder: video_recorder.close() if isRenderding and isRecordingVideo and ( ep == 0 or ep % rec_per_episodes == 0): video_recorder = VideoRecorder( env, os.path.join( recordingVideo_dir, "vid_{}_{}_{}_{}.mp4".format( env_name, exp, test_name, ep)), enabled=True) print( "Recording a video of this episode {} in experiment {}" .format(ep, exp)) else: # Update episode stats episode_reward_sum += reward episode_length += 1 state = next_state print( "===================== End of Last Episode:{} @ step:{} =====================" .format(ep, total_step)) # Log progress logz.log_tabular("Episode", ep) logz.log_tabular("Episode length", episode_length) logz.log_tabular("Total steps", total_step) logz.log_tabular("Mean rewards", episode_reward_sum / episode_length) logz.dump_tabular() logz.pickle_tf_vars() # Save session saver.save(tf.get_default_session(), chckp_dir, global_step=total_step)
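# The target computation above is Double DQN: the online network selects the next
# action, the target network evaluates it. A minimal numpy sketch of that step
# (array names are placeholders, not the batch variables above):
import numpy as np

def double_dqn_targets(rewards, dones, q_next_online, q_next_target, gamma):
    # q_next_online / q_next_target: [batch, n_actions] value estimates for s'
    best_actions = np.argmax(q_next_online, axis=1)
    bootstrap = q_next_target[np.arange(len(rewards)), best_actions]
    # no bootstrapping through terminal transitions
    return rewards + (1.0 - dones.astype(np.float32)) * gamma * bootstrap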
def train_ppo(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch, max_path_length, learning_rate, num_target_updates, num_grad_steps_per_target_update, animate, logdir, normalize_advantages, seed, n_layers, size): start = time.time() # ========================================================================================# # Set Up Logger # ========================================================================================# setup_logger(logdir, locals()) # ========================================================================================# # Set Up Env # ========================================================================================# # Make the gym environment env = gym.make(env_name) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) env.seed(seed) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps # Is this env continuous, or self.discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] # ========================================================================================# # Initialize Agent # ========================================================================================# computation_graph_args = { 'n_layers': n_layers, 'ob_dim': ob_dim, 'ac_dim': ac_dim, 'discrete': discrete, 'size': size, 'learning_rate': learning_rate, 'num_target_updates': num_target_updates, 'num_grad_steps_per_target_update': num_grad_steps_per_target_update, } sample_trajectory_args = { 'animate': animate, 'max_path_length': max_path_length, 'min_timesteps_per_batch': min_timesteps_per_batch, } estimate_advantage_args = { 'gamma': gamma, 'normalize_advantages': normalize_advantages, } agent = Agent(computation_graph_args, sample_trajectory_args, estimate_advantage_args, seed) # estimate_return_args # build computation graph agent.build_computation_graph() # tensorflow: config, session, variable initialization agent.init_tf_sess() # ========================================================================================# # Training Loop # ========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) paths, timesteps_this_batch = agent.sample_trajectories(itr, env) total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) re_n = np.concatenate([path["reward"] for path in paths]) next_ob_no = np.concatenate( [path["next_observation"] for path in paths]) terminal_n = np.concatenate([path["terminal"] for path in paths]) logp = np.concatenate([path["logp"] for path in paths]) # Call tensorflow operations to: # (1) update the critic, by calling agent.update_critic # (2) use the updated critic to compute the advantage by, calling agent.estimate_advantage # (3) use the estimated advantage values to update the actor, by calling agent.update_actor # YOUR CODE HERE closs = agent.update_critic(ob_no, next_ob_no, re_n, terminal_n) adv = agent.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) aloss = agent.update_actor(ob_no, ac_na, adv, logp) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for 
                      path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.log_tabular("closs", closs)
        logz.log_tabular("aloss", aloss)
        logz.dump_tabular()
        logz.pickle_tf_vars()
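# `agent.estimate_advantage` above is defined elsewhere. A common actor-critic choice,
# consistent with the arguments it receives (observations, next observations, rewards,
# terminal flags), is the one-step bootstrapped estimate below -- a sketch under that
# assumption, not the actual Agent code:
import numpy as np

def bootstrapped_advantage(v_s, v_next, rewards, terminals, gamma, normalize=True):
    # A(s, a) ~= r + gamma * V(s') * (1 - done) - V(s)
    adv = rewards + gamma * v_next * (1.0 - terminals) - v_s
    if normalize:
        adv = (adv - adv.mean()) / (adv.std() + 1e-8)
    return adv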
def main_pendulum(logdir, seed, n_iter, gamma, min_timesteps_per_batch, initial_stepsize, desired_kl, vf_type, vf_params, animate=False): tf.set_random_seed(seed) np.random.seed(seed) env = gym.make("Pendulum-v0") ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] logz.configure_output_dir(logdir) if vf_type == 'linear': vf = LinearValueFunction(**vf_params) elif vf_type == 'nn': vf = NnValueFunction(ob_dim=ob_dim, **vf_params) #YOUR_CODE_HERE sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations sy_ac_n = tf.placeholder( shape=[None, ac_dim], name="ac", dtype=tf.float32 ) # batch of actions taken by the policy, used for policy gradient computation sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage function estimate # a network mapping state to probability of action sy_h1 = lrelu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0))) # hidden layer sy_mean_na = dense(sy_h1, ac_dim, "mean", weight_init=normc_initializer(0.1)) # mean output sy_logstd_a = tf.get_variable( "logstdev", [ac_dim], initializer=tf.zeros_initializer()) # log std # sample the action anc calculate its log probability U = tf.random_normal([tf.shape(sy_ob_no)[0], ac_dim], 0.0, 1.0) # a number from standard normal distribution sy_sampled_ac = ( U * tf.exp(sy_logstd_a) + sy_mean_na )[0] # convert standard normal to normal with given mean and var, used to update state and not for policy gradient #sy_logprob_n = -(sy_ac_n - sy_mean_na)**2/tf.exp(2*sy_logstd_a) - tf.log(2*np.pi)/2 - sy_logstd_a sy_logprob_n = tf.reduce_sum( -(sy_ac_n - sy_mean_na)**2 / tf.exp(2 * sy_logstd_a) / 2 - sy_logstd_a, axis=1) # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>> sy_oldmean_na = tf.placeholder(shape=[None, ac_dim], name="oldmean", dtype=tf.float32) # mean before update sy_oldlogstd_na = tf.placeholder(shape=[ac_dim], name="oldstd", dtype=tf.float32) # std before update sy_n = tf.shape(sy_ob_no)[0] # KL divergence sy_kl = tf.reduce_sum(sy_logstd_a-sy_oldlogstd_na - 0.5 + (tf.exp(sy_oldlogstd_na*2) + (sy_mean_na - \ sy_oldmean_na)**2)/(2*tf.exp(sy_logstd_a*2))) / tf.to_float(sy_n) # entropy sy_ent = tf.reduce_sum(sy_logstd_a + 0.5 * tf.log(2 * np.pi * np.e)) / tf.to_float(sy_n) # end of your code sy_surr = -tf.reduce_mean( sy_adv_n * sy_logprob_n ) # Loss function that we'll differentiate to get the policy gradient sy_stepsize = tf.placeholder( shape=[], dtype=tf.float32 ) # Symbolic, in case you want to change the stepsize during optimization. 
(We're not doing that currently) update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) sess = tf.Session() sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 total_timesteps = 0 stepsize = initial_stepsize for i in range(n_iter): print("********** Iteration %i ************" % i) #YOUR_CODE_HERE # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() terminated = False obs, acs, rewards = [], [], [] animate_this_episode = (len(paths) == 0 and (i % 10 == 0) and animate) while True: if animate_this_episode: env.render() obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]}) acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) if done: break path = { "observation": np.array(obs), "terminated": terminated, "reward": np.array(rewards), "action": np.array(acs) } paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Estimate advantage function vtargs, vpreds, advs = [], [], [] for path in paths: rew_t = path["reward"] return_t = discount(rew_t, gamma) vpred_t = vf.predict(path["observation"]) adv_t = return_t - vpred_t advs.append(adv_t) vtargs.append(return_t) vpreds.append(vpred_t) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) ac_n = np.concatenate([path["action"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) vtarg_n = np.concatenate(vtargs) vpred_n = np.concatenate(vpreds) vf.fit(ob_no, vtarg_n) # Policy update _, old_mean_na, old_logstd_na = sess.run( [update_op, sy_mean_na, sy_logstd_a], feed_dict={ sy_ob_no: ob_no, sy_ac_n: ac_n, sy_adv_n: standardized_adv_n, sy_stepsize: stepsize }) kl, ent = sess.run( [sy_kl, sy_ent], feed_dict={ sy_ob_no: ob_no, sy_oldmean_na: old_mean_na, sy_oldlogstd_na: old_logstd_na }) # end of your code if kl > desired_kl * 2: stepsize /= 1.5 print('stepsize -> %s' % stepsize) elif kl < desired_kl / 2: stepsize *= 1.5 print('stepsize -> %s' % stepsize) else: print('stepsize OK') # Log diagnostics logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logz.log_tabular("KLOldNew", kl) logz.log_tabular("Entropy", ent) logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) logz.log_tabular("TimestepsSoFar", total_timesteps) # If you're overfitting, EVAfter will be way larger than EVBefore. # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias logz.dump_tabular()
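# `discount` and `explained_variance_1d` are called above but defined elsewhere.
# Implementations consistent with that usage (a sketch, not necessarily the original
# utilities):
import numpy as np
import scipy.signal

def discount(x, gamma):
    # y[t] = sum_{k >= t} gamma**(k - t) * x[k]
    return scipy.signal.lfilter([1.0], [1.0, -gamma], x[::-1])[::-1]

def explained_variance_1d(ypred, y):
    # 1 - Var[y - ypred] / Var[y]; close to 1 means the value function explains the returns well
    vary = np.var(y)
    return np.nan if vary == 0 else 1 - np.var(y - ypred) / vary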
def train_PG( exp_name, #env_name, n_iter, gamma, min_timesteps_per_batch, max_path_length, learning_rate, reward_to_go, animate, logdir, normalize_advantages, nn_baseline, seed, n_layers, size, pg_step): start = time.time() #========================================================================================# # Set Up Logger #========================================================================================# #setup_logger(logdir, locals()) #========================================================================================# # Set Up Env #========================================================================================# # Make the gym environment img = io.imread("Clear.png") # Starting image img = color.rgb2gray(img) env = MRILib(img, 'SyntheticImagesRecognizer_100K.hdf5', dim=2) # Set random seeds # tf.set_random_seed(seed) # np.random.seed(seed) # env.seed(seed) # Maximum length for episodes max_path_length = max_path_length or 100 # Observation and action sizes ob_dim = 4 ac_dim = 4 #========================================================================================# # Initialize Agent #========================================================================================# computation_graph_args = { 'n_layers': n_layers, 'ob_dim': ob_dim, 'ac_dim': ac_dim, 'discrete': True, 'size': size, 'learning_rate': learning_rate, 'pg_step': pg_step } sample_trajectory_args = { 'animate': animate, 'max_path_length': max_path_length, 'min_timesteps_per_batch': min_timesteps_per_batch, } estimate_return_args = { 'gamma': gamma, 'reward_to_go': reward_to_go, 'nn_baseline': nn_baseline, 'normalize_advantages': normalize_advantages, } agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args) # build computation graph agent.build_computation_graph() # tensorflow: config, session, variable initialization agent.init_tf_sess() #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) paths, timesteps_this_batch = agent.sample_trajectories(itr, env) total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) re_n = [path["reward"] for path in paths] q_n, adv_n = agent.estimate_return(ob_no, re_n) agent.update_parameters(ob_no, ac_na, q_n, adv_n) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() if (args.log): logz.pickle_tf_vars() fig, axarr = plt.subplots(1,2) #normalized = tmp / np.sum(tmp, axis = None) img = scipy.signal.convolve2d(color.rgb2gray(io.imread("Clear.png")), env.filter) #img = skimage.restoration.unsupervised_wiener(env.image, normalized)[0] 
    histmatched = skimage.exposure.equalize_hist(img)
    axarr[0].imshow(env.filter)
    axarr[1].imshow(img)
    plt.show()
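# `pathlength` is used throughout this section but not defined in it. Its usage
# (summing timesteps per path, computing EpLenMean) is consistent with the one-liner
# below -- an assumption about the helper, not its actual source:
def pathlength(path):
    return len(path["reward"])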
def train_PG(logdir, path, sim_mode=False): start = time.time() # Initialize the ROS/Sim Environment env = ros_env.Env(path, train_mode=True, sim_mode=sim_mode) # initialize the ROS agent agent = Agent(path, sim_mode=sim_mode) # Set Up Logger setup_logger(logdir, locals()) # Set random seeds tf.set_random_seed(agent.seed) np.random.seed(agent.seed) # Maximum length for episodes max_path_length = agent.max_path_length # Observation and action sizes ob_dim = agent.ob_dim ac_dim = agent.ac_dim """ Placeholders for batch observations/actions/advantages in policy gradient loss function. See Agent.build_computation_graph for notation sy_ob_no: placeholder for observations sy_ac_na: placeholder for actions sy_adv_n: placeholder for advantages """ sy_ob_no = tf.placeholder(shape=[None, agent.ob_dim], name="ob", dtype=tf.float32) sy_ac_na = tf.placeholder(shape=[None, agent.ac_dim], name="ac", dtype=tf.float32) sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) """ The policy takes in an observation and produces a distribution over the action space Constructs the symbolic operation for the policy network outputs, which are the parameters of the policy distribution p(a|s) """ # output activations are left linear by not passing any arg sy_mean = build_mlp(sy_ob_no, agent.ac_dim, "policy-ddpg", agent.n_layers, agent.size, activation=tf.tanh) #print sy_mean.name sy_logstd = tf.Variable(tf.zeros([1, agent.ac_dim]), dtype=tf.float32, name="logstd") """ Constructs a symbolic operation for stochastically sampling from the policy distribution use the reparameterization trick: The output from a Gaussian distribution with mean 'mu' and std 'sigma' is mu + sigma * z, z ~ N(0, I) This reduces the problem to just sampling z. (use tf.random_normal!) """ sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * tf.random_normal( tf.shape(sy_mean)) """ We can also compute the logprob of the actions that were actually taken by the policy. This is used in the loss function. Constructs a symbolic operation for computing the log probability of a set of actions that were actually taken according to the policy use the log probability under a multivariate gaussian. """ action_normalized = (sy_ac_na - sy_mean) / tf.exp(sy_logstd) sy_logprob_n = -0.5 * tf.reduce_sum(tf.square(action_normalized), axis=1) #=================================================================# # Loss Function and Training Operation #=================================================================# loss = -tf.reduce_mean(tf.multiply(sy_logprob_n, sy_adv_n)) update_op = tf.train.AdamOptimizer(agent.learning_rate).minimize(loss) #==============================================================# # Optional Baseline # # Define placeholders for targets, a loss function and an update op # for fitting a neural network baseline. These will be used to fit the # neural network baseline. 
#===============================================================# if agent.nn_baseline: baseline_prediction = tf.squeeze( build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=agent.n_layers, size=agent.size)) sy_target_n = tf.placeholder(shape=[None], name="sy_target_n", dtype=tf.float32) baseline_loss = tf.nn.l2_loss(baseline_prediction - sy_target_n) baseline_update_op = tf.train.AdamOptimizer( agent.learning_rate).minimize(baseline_loss) # tensorflow: config, session, variable initialization tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 # Add ops to save and restore all the variables. saver = tf.train.Saver() #====================================================================# # Training Loop #====================================================================# total_timesteps = 0 for itr in range(agent.n_iter): print("********** Iteration %i ************" % itr) itr_mesg = "Iteration started at " itr_mesg += time.strftime("%d-%m-%Y_%H-%M-%S") print(itr_mesg) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards = [], [], [] steps = 0 while True: #time.sleep(0.05) obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]}) #print sy_sampled_ac.name (add:0) #print sy_ob_no.name (ob:0) ac = ac[0] acs.append(ac) # returns obs, reward and done status ob, rew, done = env.step(ac) rewards.append(rew) steps += 1 if done or steps > agent.max_path_length: break path = { "observation": np.array(obs, dtype=np.float32), "reward": np.array(rewards, dtype=np.float32), "action": np.array(acs, dtype=np.float32) } paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > agent.min_timesteps_per_batch: break total_timesteps += timesteps_this_batch ''' # Build arrays for observation and action for the # policy gradient update by concatenating across paths ''' ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) re_n = [path["reward"] for path in paths] """ Monte Carlo estimation of the Q function. Estimates the returns over a set of trajectories. Store the Q-values for all timesteps and all trajectories in a variable 'q_n', like the 'ob_no' and 'ac_na' above. """ if agent.reward_to_go: q_n = [] for path in paths: q = np.zeros(pathlength(path)) q[-1] = path['reward'][-1] for i in reversed(range(pathlength(path) - 1)): q[i] = path['reward'][i] + agent.gamma * q[i + 1] q_n.extend(q) else: q_n = [] for path in paths: ret_tau = 0 for i in range(pathlength(path)): ret_tau += (agent.gamma**i) * path['reward'][i] q = np.ones(shape=[pathlength(path)]) * ret_tau q_n.extend(q) """ Compute advantages by (possibly) subtracting a baseline from the estimated Q values let sum_of_path_lengths be the sum of the lengths of the paths sampled. """ #===========================================================# # # Computing Baselines #===========================================================# if agent.nn_baseline: # If nn_baseline is True, use your neural network to predict # reward-to-go at each timestep for each trajectory, and save the # result in a variable 'b_n' like 'ob_no', 'ac_na', and 'q_n'. # # rescale the output from the nn_baseline to match the # statistics (mean and std) of the current batch of Q-values. 
b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no}) m1 = np.mean(b_n) s1 = np.std(b_n) m2 = np.mean(q_n) s2 = np.std(q_n) b_n = b_n - m1 b_n = m2 + b_n * (s2 / (s1 + 1e-8)) adv_n = q_n - b_n else: adv_n = q_n.copy() #=========================================================# # Advantage Normalization #=========================================================# if agent.normalize_advantages: # On the next line, implement a trick which is known # empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean # zero and std=1. adv_n = preprocessing.scale(adv_n) """ Update the parameters of the policy and (possibly) the neural network baseline, which is trained to approximate the value function """ #========================================================# # Optimizing Neural Network Baseline #========================================================# if agent.nn_baseline: # If a neural network baseline is used, set up the targets and # the inputs for the baseline. # # Fit it to the current batch in order to use for the next # iteration. Use the baseline_update_op you defined earlier. # # Instead of trying to target raw Q-values directly, # rescale the targets to have mean zero and std=1. target_n = preprocessing.scale(q_n) sess.run(baseline_update_op, feed_dict={ sy_target_n: target_n, sy_ob_no: ob_no }) #=================================================================# # Performing the Policy Update #=================================================================# # Call the update operation necessary to perform the policy # gradient update based on the current batch of rollouts. _, after_loss = sess.run([update_op, loss], feed_dict={ sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n }) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.log_tabular("After-Loss", after_loss) logz.dump_tabular() logz.pickle_tf_vars() model_file = os.path.join(logdir, "model.ckpt") save_path = saver.save(sess, model_file) print("Model saved in file: %s" % save_path) env.close_env_log()
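# The Monte Carlo return estimates used in the function above (reward-to-go
# and whole-trajectory returns) can be exercised in isolation. A minimal
# numpy-only sketch; the helper names are illustrative and not part of the
# codebase above:

import numpy as np


def discounted_rewards_to_go(rewards, gamma):
    """Q_t = sum_{t'>=t} gamma^(t'-t) * r_{t'}, computed with a backward pass."""
    q = np.zeros(len(rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        q[t] = running
    return q


def full_trajectory_return(rewards, gamma):
    """Every timestep of the path gets the same discounted total return."""
    ret = sum((gamma ** i) * r for i, r in enumerate(rewards))
    return np.full(len(rewards), ret, dtype=np.float64)

# e.g. discounted_rewards_to_go([1., 1., 1.], 0.9) -> [2.71, 1.9, 1.]
#      full_trajectory_return([1., 1., 1.], 0.9)   -> [2.71, 2.71, 2.71]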
def train_PG( exp_name='', #参数方案的名称 env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32, gae_lambda=-1.0, batch_epochs=1, model_tag='vanilla', #ppo parameter clip_ratio=0.2, ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getfullargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps # ========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None # ========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] # ========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. # ========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # ========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) # # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. 
Computing the log probability of a set of actions that were actually taken, # according to the policy. # # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # # ========================================================================================# if discrete: # YOUR_CODE_HERE scope_name = 'discrete' old_scope_name = 'discrete_old' sy_logits_na = build_mlp(sy_ob_no, ac_dim, scope_name, n_layers, size) # softmax生成prob被压缩在sparse_softmax_cross_entropy_with_logits中,提升效率 # 因此sy_logits_na是没有归一化的,但不影响分布sample的生成 sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1), [-1]) # Hint: Use the tf.multinomial op # 这里加负号为了兼容 continuous的情况,loss也加负号 sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits( labels=sy_ac_na, logits=sy_logits_na) old_logits_na = build_mlp(sy_ob_no, ac_dim, old_scope_name, n_layers, size) old_sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits( labels=sy_ac_na, logits=old_logits_na) else: # YOUR_CODE_HERE scope_name = 'continuous' old_scope_name = 'continuous_old' sy_mean = build_mlp(sy_ob_no, ac_dim, scope_name, n_layers, size) # logstd should just be a trainable variable, not a network output. # ??? why sy_logstd = tf.get_variable('std', [ac_dim], dtype=tf.float32) sy_sampled_ac = tf.random_normal(shape=tf.shape(sy_mean), mean=sy_mean, stddev=sy_logstd) # Hint: Use the log probability under a multivariate gaussian. sy_logprob_n = tf.contrib.distributions.MultivariateNormalDiag( loc=sy_mean, scale_diag=tf.exp(sy_logstd)).log_prob(sy_ac_na) old_sy_mean = build_mlp(sy_ob_no, ac_dim, old_scope_name, n_layers, size) old_sy_logprob_n = tf.contrib.distributions.MultivariateNormalDiag( loc=old_sy_mean, scale_diag=tf.exp(sy_logstd)).log_prob(sy_ac_na) old_network_param = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, old_scope_name) network_param = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope_name) param_assign_op = [ tf.assign(old_value, new_value) for (old_value, new_value) in zip(old_network_param, network_param) ] # ========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation # ========================================================================================# # Loss function that we'll differentiate to get the policy gradient. # ppo clip loss if model_tag == 'ppo': # 和tensorforce不同 这里stop_gradient之后的梯度为0,导致lossDelta为0 #old_log_prob = tf.stop_gradient(input=sy_logprob_n) prob_ratio = tf.exp(x=(sy_logprob_n - old_sy_logprob_n)) # 这里无法指定axis=1 因为只有一维,剩下的一维就是[?] 
即batch_size prob_ratio = tf.reduce_mean(input_tensor=prob_ratio) clipped_prob_ratio = tf.clip_by_value( t=prob_ratio, clip_value_min=(1.0 - clip_ratio), clip_value_max=(1.0 + clip_ratio)) loss = tf.reduce_mean(-tf.minimum(x=(prob_ratio * sy_adv_n), y=(clipped_prob_ratio * sy_adv_n))) else: #vanilla pg loss = tf.reduce_mean(-sy_logprob_n * sy_adv_n) #loss = tf.Print(loss, [loss, loss.shape], 'debug loss') tf.summary.scalar('loss', loss) update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) # ========================================================================================# # ----------SECTION 5---------- # Optional Baseline # ========================================================================================# if nn_baseline: baseline_prediction = tf.squeeze( build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size)) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline. baseline_targets = tf.placeholder(shape=[None], name='baseline_targets', dtype=tf.float32) baseline_loss = tf.nn.l2_loss(baseline_prediction - baseline_targets) tf.summary.scalar('baseline_loss', baseline_loss) baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize( baseline_loss) # ========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization # ========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() # pylint: disable=E1101 # ========================================================================================# # Training Loop # ========================================================================================# total_timesteps = 0 for itr in range(n_iter): # Collect paths until we have enough timesteps # 每一轮结束或者超过max_path_length时会结束一次path # 每一轮path结束后填充到paths中,检查一次总的batch步数是否超过batch需求数,超过了则退出,开始训练 # 因此每次训练的都是完整的数据 # PG算法每次都使用当前分布sample action,不涉及exploration # TODO 改成observation和train分开两个进程,这样不用互相等待 timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards = [], [], [] animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]}) ac = ac[0] acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > max_path_length: break path = { "observation": np.array(obs), "reward": np.array(rewards), "action": np.array(acs) } paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) # ====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). 
# # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # # ====================================================================================# # YOUR_CODE_HERE q_n = [] reward_n = [] for path in paths: reward = path['reward'] max_step = len(reward) reward_n.extend(reward) # 从当前t开始的value估算 if reward_to_go: q = [ np.sum( np.power(gamma, np.arange(max_step - t)) * reward[t:]) for t in range(max_step) ] else: # 整个trajectory的q值估算 q = [ np.sum(np.power(gamma, np.arange(max_step)) * reward) for t in range(max_step) ] q_n.extend(q) for epoch in range(batch_epochs): # ====================================================================================# # ----------SECTION 5---------- # Computing Baselines # ====================================================================================# #print('run %d epoch' % epoch) if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no}) # b_n_norm = b_n - np.mean(b_n, axis=0) / (np.std(b_n, axis=0) + 1e-7) # 这里b_n要根据qn设置回来,因为b_n在下面optimize时是标准化过的 b_n = b_n * np.std(q_n, axis=0) + np.mean(q_n, axis=0) if gae_lambda > 0: adv_n = lambda_advantage(reward_n, b_n, len(reward_n), gae_lambda * gamma) else: adv_n = q_n - b_n else: adv_n = q_n.copy() # ====================================================================================# # ----------SECTION 4---------- # Advantage Normalization # ====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
# YOUR_CODE_HERE adv_mean = np.mean(adv_n, axis=0) adv_std = np.std(adv_n, axis=0) adv_n = (adv_n - adv_mean) / (adv_std + 1e-7) # ====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline # ====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) # 标准化的q_n作为baseline的优化目标 q_n_mean = np.mean(q_n, axis=0) q_n_std = np.std(q_n, axis=0) q_n = (q_n - q_n_mean) / (q_n_std + 1e-7) sess.run(baseline_update_op, feed_dict={ sy_ob_no: ob_no, baseline_targets: q_n }) # ====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update # ====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. # 输出两次loss是为了下面的log feed_dict = {sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: adv_n} sess.run(param_assign_op, feed_dict) loss_1 = sess.run(loss, feed_dict) sess.run(update_op, feed_dict) loss_2 = sess.run(loss, feed_dict) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("LossDelta", loss_1 - loss_2) logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
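# The gae_lambda branch in the function above calls a lambda_advantage helper
# that is not defined in this file (it is invoked with a single combined
# gae_lambda * gamma factor). The standard generalized-advantage recursion it
# presumably approximates is sketched below; the signature is illustrative and
# does not claim to match the hidden helper:

import numpy as np


def gae_advantages(rewards, values, gamma, lam):
    """A_t = delta_t + gamma*lam*A_{t+1}, with delta_t = r_t + gamma*V_{t+1} - V_t.

    `values` holds the baseline prediction for each timestep of one trajectory;
    V_{T+1} is taken to be zero because the episode ends.
    """
    T = len(rewards)
    adv = np.zeros(T, dtype=np.float64)
    next_value = 0.0
    running = 0.0
    for t in reversed(range(T)):
        delta = rewards[t] + gamma * next_value - values[t]
        running = delta + gamma * lam * running
        adv[t] = running
        next_value = values[t]
    return adv

# e.g. gae_advantages([1., 1., 1.], [0.5, 0.5, 0.5], gamma=0.99, lam=0.95)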
def train(self, train_db, val_db, test_db): ################################################################## ## LOG ################################################################## logz.configure_output_dir(self.cfg.model_dir) logz.save_config(self.cfg) ################################################################## ## Main loop ################################################################## start = time() min_val_loss = 100000000 for epoch in range(self.epoch, self.cfg.n_epochs): ################################################################## ## Training ################################################################## torch.cuda.empty_cache() train_loss = self.train_epoch(train_db, epoch) ################################################################## ## Validation ################################################################## torch.cuda.empty_cache() val_loss = self.validate_epoch(val_db, epoch) # val_loss = train_loss ################################################################## ## Sample ################################################################## torch.cuda.empty_cache() self.sample_for_vis(epoch, test_db, self.cfg.n_samples) torch.cuda.empty_cache() ################################################################## ## Logging ################################################################## # update optim scheduler current_val_loss = np.mean(val_loss) logz.log_tabular("Time", time() - start) logz.log_tabular("Iteration", epoch) logz.log_tabular("AverageTotalError", np.mean(train_loss[:, 0])) logz.log_tabular("AveragePredError", np.mean(train_loss[:, 1])) logz.log_tabular("AverageImageError", np.mean(train_loss[:, 2])) logz.log_tabular("AverageFeat0Error", np.mean(train_loss[:, 3])) logz.log_tabular("AverageFeat1Error", np.mean(train_loss[:, 4])) logz.log_tabular("AverageFeat2Error", np.mean(train_loss[:, 5])) logz.log_tabular("AverageFeat3Error", np.mean(train_loss[:, 6])) logz.log_tabular("AverageFeat4Error", np.mean(train_loss[:, 7])) logz.log_tabular("ValAverageTotalError", np.mean(val_loss[:, 0])) logz.log_tabular("ValAveragePredError", np.mean(val_loss[:, 1])) logz.log_tabular("ValAverageImageError", np.mean(val_loss[:, 2])) logz.log_tabular("ValAverageFeat0Error", np.mean(val_loss[:, 3])) logz.log_tabular("ValAverageFeat1Error", np.mean(val_loss[:, 4])) logz.log_tabular("ValAverageFeat2Error", np.mean(val_loss[:, 5])) logz.log_tabular("ValAverageFeat3Error", np.mean(val_loss[:, 6])) logz.log_tabular("ValAverageFeat4Error", np.mean(val_loss[:, 7])) logz.dump_tabular() ################################################################## ## Checkpoint ################################################################## if min_val_loss > current_val_loss: min_val_loss = current_val_loss self.save_checkpoint(epoch) torch.cuda.empty_cache()
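# The checkpointing rule in the trainer above ("save whenever the mean
# validation loss improves") in isolation, as a minimal sketch. The torch.save
# payload and the 'ckpt-best.pth' filename are assumptions for illustration
# only, not the trainer's actual save_checkpoint format:

import os
import numpy as np
import torch


def maybe_save_best(net, val_losses, best_so_far, model_dir, epoch):
    """Save a checkpoint only when this epoch's mean validation loss improves."""
    current = float(np.mean(val_losses))
    if current < best_so_far:
        os.makedirs(model_dir, exist_ok=True)
        torch.save({'epoch': epoch, 'state_dict': net.state_dict()},
                   os.path.join(model_dir, 'ckpt-best.pth'))
        return current
    return best_so_far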
def run_experiment(exp_params, learner_params, discriminator_params): # Experiment parameters file_location = exp_params.get('expert_samples_location', 'expert_data') prior_file_location = exp_params.get('prior_samples_location', 'prior_data') env_name = exp_params.get('env_name', 'InvertedPendulum-v2') env_type = exp_params.get('env_type', 'expert') exp_name = exp_params.get('exp_name', '{}_{}'.format(env_name, env_type)) exp_num = exp_params.get('exp_num', 0) epochs = exp_params.get('epochs', 100) test_runs_per_epoch = exp_params.get('test_runs_per_epoch', 10) steps_per_epoch = exp_params.get('steps_per_epoch', 1000) init_random_samples = exp_params.get('init_random_samples', 5000) training_starts = exp_params.get('training_starts', 0) episode_limit = exp_params.get('episode_limit', 200) return_threshold = exp_params.get('return_threshold', 1e4) visualize_collected_observations = exp_params.get('visualize_collected_observations', False) # Learner parameters l_type = learner_params.get('l_type', 'TD3') l_buffer_size = learner_params.get('l_buffer_size', 10000) l_exploration_noise = learner_params.get('l_exploration_noise', 0.2) l_learning_rate = learner_params.get('l_learning_rate', 1e-3) l_batch_size = learner_params.get('l_batch_size', 128) l_updates_per_step = learner_params.get('l_updates_per_step', 1) l_act_delay = learner_params.get('l_act_delay', 2) l_gamma = learner_params.get('l_gamma', 0.99) l_polyak = learner_params.get('l_polyak', 0.995) l_train_actor_noise = learner_params.get('l_train_actor_noise', 0.1) l_entropy_coefficient = learner_params.get('l_entropy_coefficient', 0.2) l_tune_entropy_coefficient = learner_params.get('l_tune_entropy_coefficient', True) l_target_entropy = learner_params.get('l_target_entropy', None) l_clip_actor_gradients = learner_params.get('l_clip_actor_gradients', False) # Discriminator parameters d_type = discriminator_params.get('d_type', 'latent') d_domain_constant = discriminator_params.get('d_domain_constant', 0.25) d_rew = discriminator_params.get('d_rew', 'mixed') d_rew_noise = discriminator_params.get('d_rew_noise', True) d_learning_rate = discriminator_params.get('d_learning_rate', 1e-3) d_updates_per_step = discriminator_params.get('d_updates_per_step', 1) d_stability_constant = discriminator_params.get('d_stability_constant', 0.0) d_e_batch_size = discriminator_params.get('d_e_batch_size', 64) d_l_batch_size = discriminator_params.get('d_l_batch_size', 64) d_sn_discriminator = discriminator_params.get('d_sn_discriminator', False) d_use_prior_data = discriminator_params.get('d_use_prior_data', False) d_pre_filters = discriminator_params.get('d_pre_filters', [32, 32, 1]) d_hidden_units = discriminator_params.get('d_hidden_units', [32]) d_pre_scale_stddev = discriminator_params.get('d_pre_scale_stddev', 1.0) n_expert_demos = discriminator_params.get('n_expert_demos', None) n_expert_prior_demos = discriminator_params.get('n_expert_prior_demos', None) n_agent_prior_demos = discriminator_params.get('n_agent_prior_demos', n_expert_prior_demos) if env_name == 'InvertedPendulum-v2': im_side = 32 im_shape = [im_side, im_side] expert_prior_location = 'Expert' + env_name if env_type == 'expert': env = ExpertInvertedPendulumEnv() agent_prior_location = 'Expert' + env_name elif env_type == 'agent' or env_type == 'colored': env = AgentInvertedPendulumEnv() agent_prior_location = 'Agent' + env_name elif env_type == 'to_two': env = ExpertInvertedDoublePendulumEnv() agent_prior_location = 'ExpertInvertedDoublePendulum-v2' elif env_type == 'to_colored_two': 
env = AgentInvertedDoublePendulumEnv() agent_prior_location = 'AgentInvertedDoublePendulum-v2' else: raise NotImplementedError elif env_name == 'InvertedDoublePendulum-v2': im_side = 32 im_shape = [im_side, im_side] expert_prior_location = 'ExpertInvertedDoublePendulum-v2' if env_type == 'expert': agent_prior_location = 'ExpertInvertedDoublePendulum-v2' env = ExpertInvertedDoublePendulumEnv() elif env_type == 'colored': env = AgentInvertedDoublePendulumEnv() agent_prior_location = 'AgentInvertedDoublePendulum-v2' elif env_type == 'to_one': agent_prior_location = 'ExpertInvertedPendulum-v2' env = ExpertInvertedPendulumEnv() elif env_type == 'agent' or env_type == 'to_colored_one': agent_prior_location = 'AgentInvertedPendulum-v2' env = AgentInvertedPendulumEnv() else: raise NotImplementedError elif env_name == 'ThreeReacherEasy-v2': im_side = 48 im_shape = [im_side, im_side] expert_prior_location = 'Expert' + env_name if env_type == 'expert': env = ThreeReacherEasyEnv() agent_prior_location = 'Expert' + env_name elif env_type == 'agent' or env_type == 'to_two': agent_prior_location = 'ExpertReacherEasy-v2' env = ReacherEasyEnv() elif env_type == 'tilted': agent_prior_location = 'AgentThreeReacherEasy-v2' env = Tilted3ReacherEasyEnv() elif env_type == 'to_tilted_two': env = TiltedReacherEasyEnv() agent_prior_location = 'AgentReacherEasy-v2' else: raise NotImplementedError elif env_name == 'ReacherEasy-v2': im_side = 48 im_shape = [im_side, im_side] expert_prior_location = 'ExpertReacherEasy-v2' if env_type == 'expert': env = ReacherEasyEnv() agent_prior_location = 'ExpertReacherEasy-v2' elif env_type == 'agent' or env_type == 'tilted': env = TiltedReacherEasyEnv() agent_prior_location = 'AgentReacherEasy-v2' elif env_type == 'to_three': env = ThreeReacherEasyEnv() agent_prior_location = 'ExpertThreeReacherEasy-v2' elif env_type == 'to_tilted_three': agent_prior_location = 'AgentThreeReacherEasy-v2' env = Tilted3ReacherEasyEnv() else: raise NotImplementedError elif env_name == 'Hopper-v2': im_side = 64 im_shape = [im_side, im_side] expert_prior_location = 'Hopper-v2' if env_type == 'expert': env = HopperEnv() agent_prior_location = 'Hopper-v2' elif env_type == 'flexible': env = HopperFlexibleEnv() agent_prior_location = 'HopperFlexible-v2' else: raise NotImplementedError elif env_name == 'HalfCheetah-v2': im_side = 64 im_shape = [im_side, im_side] expert_prior_location = 'HalfCheetah-v2' if env_type == 'expert': env = ExpertHalfCheetahEnv() agent_prior_location = 'HalfCheetah-v2' elif env_type == 'locked_legs': env = LockedLegsHalfCheetahEnv() agent_prior_location = 'LockedLegsHalfCheetah-v2' else: raise NotImplementedError elif env_name == 'Striker-v2': im_side = 48 im_shape = [im_side, im_side] expert_prior_location = 'Striker-v2' if env_type == 'expert': env = StrikerEnv() agent_prior_location = 'Striker-v2' elif env_type == 'to_human': env = StrikerHumanSimEnv() agent_prior_location = 'StrikerHuman-v2' else: raise NotImplementedError elif env_name == 'StrikerHumanSim-v2': im_side = 48 im_shape = [im_side, im_side] expert_prior_location = 'StrikerHumanSim-v2' if env_type == 'expert': env = StrikerHumanSimEnv() agent_prior_location = 'StrikerHumanSim-v2' elif env_type == 'to_robot': env = StrikerEnv() agent_prior_location = 'Striker-v2' else: raise NotImplementedError elif env_name == 'Pusher-v2': im_side = 48 im_shape = [im_side, im_side] expert_prior_location = 'Pusher-v2' if env_type == 'expert': env = PusherEnv() agent_prior_location = 'Pusher-v2' elif env_type == 'to_human': env = 
PusherHumanSimEnv() agent_prior_location = 'PusherHuman-v2' else: raise NotImplementedError elif env_name == 'PusherHumanSim-v2': im_side = 48 im_shape = [im_side, im_side] expert_prior_location = 'PusherHumanSim-v2' if env_type == 'expert': env = PusherHumanSimEnv() agent_prior_location = 'PusherHumanSim-v2' elif env_type == 'to_robot': env = PusherEnv() agent_prior_location = 'Pusher-v2' else: raise NotImplementedError else: raise NotImplementedError expert_buffer = DemonstrationsReplayBuffer( load_expert_trajectories(env_name, file_location, visual_data=True, load_ids=True, max_demos=n_expert_demos)) expert_visual_data_shape = expert_buffer.get_random_batch(1)['ims'][0].shape print('Visual data shape: {}'.format(expert_visual_data_shape)) past_frames = expert_visual_data_shape[0] print('Past frames: {}'.format(past_frames)) if d_use_prior_data: prior_expert_buffer = DemonstrationsReplayBuffer(load_expert_trajectories( agent_prior_location, prior_file_location, visual_data=True, load_ids=True, max_demos=n_expert_prior_demos)) prior_agent_buffer = DemonstrationsReplayBuffer(load_expert_trajectories( expert_prior_location, prior_file_location, visual_data=True, load_ids=True, max_demos=n_agent_prior_demos)) else: prior_expert_buffer, prior_agent_buffer = None, None if d_type == 'latent' or d_type == 'pretrained_ae': im_shape += [3] else: im_shape += [3 * past_frames] action_size = env.action_space.shape[0] if exp_num == -1: logz.configure_output_dir(None, True) else: log_dir = osp.join('experiments_data/', '{}/{}'.format(exp_name, exp_num)) logz.configure_output_dir(log_dir, True) params = { 'exp': exp_params, 'learner': learner_params, 'discriminator': discriminator_params, } print(params) logz.save_params(params) if l_type == 'TD3': def make_actor(): actor = Actor([tf.keras.layers.Dense(400, 'relu', kernel_initializer='orthogonal'), tf.keras.layers.Dense(300, 'relu', kernel_initializer='orthogonal'), tf.keras.layers.Dense(action_size, 'tanh', kernel_initializer=tf.keras.initializers.Orthogonal(0.01))]) return actor def make_critic(): critic = Critic([tf.keras.layers.Dense(400, 'relu', kernel_initializer='orthogonal'), tf.keras.layers.Dense(300, 'relu', kernel_initializer='orthogonal'), tf.keras.layers.Dense(1, kernel_initializer=tf.keras.initializers.Orthogonal(0.01))]) return critic elif l_type == 'SAC': def make_actor(): actor = StochasticActor([tf.keras.layers.Dense(256, 'relu', kernel_initializer='orthogonal'), tf.keras.layers.Dense(256, 'relu', kernel_initializer='orthogonal'), tf.keras.layers.Dense(action_size * 2, kernel_initializer=tf.keras.initializers.Orthogonal(0.01))]) return actor def make_critic(): critic = Critic([tf.keras.layers.Dense(256, 'relu', kernel_initializer='orthogonal'), tf.keras.layers.Dense(256, 'relu', kernel_initializer='orthogonal'), tf.keras.layers.Dense(1, kernel_initializer=tf.keras.initializers.Orthogonal(0.01))]) return critic if l_target_entropy is None: l_target_entropy = -1 * (np.prod(env.action_space.shape)) else: raise NotImplementedError d_optimizer = tf.keras.optimizers.Adam(learning_rate=d_learning_rate) tfl = tf.keras.layers if d_type == 'latent': pre_layers = [tfl.Reshape(im_shape)] else: pre_layers = [tfl.Permute((2, 3, 1, 4)), tfl.Reshape(im_shape)] if (d_type == 'latent') or (not d_sn_discriminator): for filters in d_pre_filters[:-1]: pre_layers += [tfl.Conv2D(filters, 3, activation='tanh', padding='same'), tfl.MaxPooling2D(2, padding='same')] pre_layers += [tfl.Conv2D(d_pre_filters[-1], 3, padding='same'), tfl.MaxPooling2D(2, 
padding='same'), tfl.Reshape([-1])] else: for filters in d_pre_filters[:-1]: pre_layers += [SpectralNormalization( tfl.Conv2D(filters, 3, padding='same')), tfl.LeakyReLU(), tfl.MaxPooling2D(2, padding='same')] pre_layers += [SpectralNormalization( tfl.Conv2D(d_pre_filters[-1], 3, padding='same')), tfl.MaxPooling2D(2, padding='same'), tfl.Reshape([-1])] def make_disc(): if d_sn_discriminator: disc_layers = [SpectralNormalization( tfl.Dense(units, activation='relu')) for units in d_hidden_units] disc_layers.append(SpectralNormalization(tfl.Dense(1))) else: disc_layers = [tfl.Dense(units, activation='tanh') for units in d_hidden_units] disc_layers.append(tfl.Dense(1)) return InvariantDiscriminator(disc_layers, d_stability_constant, d_rew) if d_type == 'latent': def make_pre(): pre = GaussianPreprocessor(pre_layers, d_pre_scale_stddev) return pre else: def make_pre(): pre = DeterministicPreprocessor(pre_layers) return pre l_optimizer = tf.keras.optimizers.Adam(l_learning_rate) if l_type == 'TD3': l_agent = DDPG(make_actor=make_actor, make_critic=make_critic, make_critic2=make_critic, actor_optimizer=l_optimizer, critic_optimizer=l_optimizer, gamma=l_gamma, polyak=l_polyak, train_actor_noise=l_train_actor_noise, clip_actor_gradients=l_clip_actor_gradients,) elif l_type == 'SAC': l_agent = SAC(make_actor=make_actor, make_critic=make_critic, make_critic2=make_critic, actor_optimizer=l_optimizer, critic_optimizer=l_optimizer, gamma=l_gamma, polyak=l_polyak, entropy_coefficient=l_entropy_coefficient, tune_entropy_coefficient=l_tune_entropy_coefficient, target_entropy=l_target_entropy, clip_actor_gradients=l_clip_actor_gradients,) else: raise NotImplementedError sampler = Sampler(env, episode_limit, init_random_samples, visual_env=True) gail = DomainConfusionDisentanGAIL(agent=l_agent, make_discriminator=make_disc, make_preprocessing=make_pre, expert_buffer=expert_buffer, prior_expert_buffer=prior_expert_buffer, prior_agent_buffer=prior_agent_buffer, d_optimizer=d_optimizer, d_domain_constant=d_domain_constant, stab_const=d_stability_constant, past_frames=past_frames,) agent_buffer = LearnerAgentReplayBuffer(gail, l_buffer_size, reward_noise=d_rew_noise) test_input = expert_buffer.get_random_batch(1) test_input['obs'] = np.expand_dims( (env.reset()['obs']).astype('float32'), axis=0) gail(test_input) gail.summary() mean_test_returns = [] mean_test_std = [] steps = [] step_counter = 0 logz.log_tabular('Iteration', 0) logz.log_tabular('Steps', step_counter) print('Epoch {}/{} - total steps {}'.format(0, epochs, step_counter)) out = sampler.evaluate(l_agent, test_runs_per_epoch, False) mean_test_returns.append(out['mean']) mean_test_std.append(out['std']) steps.append(step_counter) for k, v in out.items(): logz.log_tabular(k, v) logz.dump_tabular() for e in range(epochs): while step_counter < (e + 1) * steps_per_epoch: traj_data = sampler.sample_trajectory(l_agent, l_exploration_noise) agent_buffer.add(traj_data) n = traj_data['n'] step_counter += traj_data['n'] if step_counter > training_starts: gail.train(agent_buffer=agent_buffer, l_batch_size=l_batch_size, l_updates=l_updates_per_step * n, l_act_delay=l_act_delay, d_updates=d_updates_per_step * n, d_e_batch_size=d_e_batch_size, d_l_batch_size=d_l_batch_size,) logz.log_tabular('Iteration', e + 1) logz.log_tabular('Steps', step_counter) print('Epoch {}/{} - total steps {}'.format(e + 1, epochs, step_counter)) traj_test = sampler.sample_test_trajectories(l_agent, 0.0, test_runs_per_epoch) out = log_trajectory_statistics(traj_test['ret'], False) 
        mean_test_returns.append(out['mean'])
        mean_test_std.append(out['std'])
        steps.append(step_counter)
        for k, v in out.items():
            logz.log_tabular(k, v)
        logz.dump_tabular()
        if visualize_collected_observations:
            training_sample = traj_data['ims'][-1, 0]
            print('Visualization of latest training sample')
            plt.imshow(training_sample)
            plt.show()
            test_sample = traj_test['ims'][-1, 0]
            print('Visualization of latest test sample')
            plt.imshow(test_sample)
            plt.show()
        if out['mean'] >= return_threshold:
            print('Early termination due to reaching return threshold')
            break
    return gail, sampler
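# A possible invocation of run_experiment. The keys below are the ones read
# via .get(...) in the function above; the values are illustrative defaults,
# not a tuned configuration:

example_exp_params = {
    'env_name': 'InvertedPendulum-v2',
    'env_type': 'expert',
    'exp_num': 0,
    'epochs': 100,
    'steps_per_epoch': 1000,
    'episode_limit': 200,
}
example_learner_params = {
    'l_type': 'SAC',
    'l_buffer_size': 10000,
    'l_batch_size': 128,
    'l_gamma': 0.99,
}
example_discriminator_params = {
    'd_type': 'latent',
    'd_rew': 'mixed',
    'd_learning_rate': 1e-3,
}

# gail, sampler = run_experiment(example_exp_params,
#                                example_learner_params,
#                                example_discriminator_params)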
def train(self, num_iter): wandb.login() run = wandb.init(project="project-local", entity="ieor-4575", tags=[f"training-easy"]) rewards_record = [] start = time.time() for i in range(num_iter): t1 = time.time() self.train_step() t2 = time.time() print('total training time: ', t2 - t1) print('iter ', i, ' done') # record statistics every 10 iterations if ((i + 1) % 1 == 0): t3 = time.time() rewards = self.aggregate_rollouts(num_rollouts=5, evaluate=True) t4 = time.time() print('total evaluation time: ', t4 - t3) if ((i + 1) % 10 == 0): w = ray.get( self.workers[0].get_weights_plus_stats.remote()) np.savez(self.logdir + f"/lin_policy_plus_{i}", w) print(sorted(self.params.items())) logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", i + 1) logz.log_tabular("AverageReward", np.mean(rewards)) logz.log_tabular("StdRewards", np.std(rewards)) logz.log_tabular("MaxRewardRollout", np.max(rewards)) logz.log_tabular("MinRewardRollout", np.min(rewards)) logz.log_tabular("timesteps", self.timesteps) logz.dump_tabular() rewards_record.append(np.mean(rewards)) fixedWindow = 10 movingAverage = 0 if len(rewards_record) >= fixedWindow: movingAverage = np.mean( rewards_record[len(rewards_record) - fixedWindow:len(rewards_record) - 1]) wandb.log({ "Training reward": rewards_record[-1], "movingAverage": movingAverage, "AverageReward": np.mean(rewards), 'StdRewards': np.std(rewards), 'MaxRewardRollout': np.max(rewards), 'MinRewardRollout': np.min(rewards) }) t1 = time.time() # get statistics from all workers for j in range(self.num_workers): self.policy.observation_filter.update( ray.get(self.workers[j].get_filter.remote())) self.policy.observation_filter.stats_increment() # make sure master filter buffer is clear self.policy.observation_filter.clear_buffer() # sync all workers filter_id = ray.put(self.policy.observation_filter) setting_filters_ids = [ worker.sync_filter.remote(filter_id) for worker in self.workers ] # waiting for sync of all workers ray.get(setting_filters_ids) increment_filters_ids = [ worker.stats_increment.remote() for worker in self.workers ] # waiting for increment of all workers ray.get(increment_filters_ids) t2 = time.time() print('Time to sync statistics:', t2 - t1) return
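# The master/worker synchronisation in the trainer above assumes an
# observation filter whose worker-side statistics can be merged into the
# master and then committed. The actual filter class is not shown in this
# file; a minimal running mean/std filter with the same
# update / stats_increment / clear_buffer flavour might look like this
# (illustrative only):

import numpy as np


class RunningObsFilter:
    """Tracks mean/std via (count, sum, sum of squares). Workers fill a
    buffer; the master merges worker buffers, commits them, and clears."""

    def __init__(self, shape):
        self.count = 0
        self.sum = np.zeros(shape)
        self.sumsq = np.zeros(shape)
        self.buf_count = 0
        self.buf_sum = np.zeros(shape)
        self.buf_sumsq = np.zeros(shape)

    def observe(self, x):
        x = np.asarray(x, dtype=np.float64)
        self.buf_count += 1
        self.buf_sum += x
        self.buf_sumsq += x * x

    def update(self, other):
        # merge another filter's buffered statistics into our buffer
        self.buf_count += other.buf_count
        self.buf_sum += other.buf_sum
        self.buf_sumsq += other.buf_sumsq

    def stats_increment(self):
        # commit the buffered statistics
        self.count += self.buf_count
        self.sum += self.buf_sum
        self.sumsq += self.buf_sumsq

    def clear_buffer(self):
        self.buf_count = 0
        self.buf_sum = np.zeros_like(self.buf_sum)
        self.buf_sumsq = np.zeros_like(self.buf_sumsq)

    @property
    def mean(self):
        return self.sum / max(self.count, 1)

    @property
    def std(self):
        var = self.sumsq / max(self.count, 1) - self.mean ** 2
        return np.sqrt(np.maximum(var, 1e-8))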
def train_PG( exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, gae_lambda=0.99, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps # ========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the # numerical values that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for # that axis is None # ========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] # ========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy # gradient loss function. # ========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # ========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian # distribution over actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces # actions. # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std # 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. # (Hint: use tf.random_normal!) # Should have shape [None, ac_dim] # # p.s. these ops should be functions of the policy network output ops. # # 3. 
Computing the log probability of a set of actions that were actually # taken, according to the policy. # # Note: these ops should be functions of the placeholder 'sy_ac_na', # and the policy network output ops. # # ========================================================================================# if discrete: # Get the logits from neural network output sy_logits_na = build_mlp(sy_ob_no, ac_dim, "pi", n_layers=n_layers, size=size) # Sample one action for each sample from the above probability # distributionin, and then use [-1] to flatten from [None, 1] to [None] sy_sampled_ac = tf.reshape(tf.multinomial(sy_logits_na, 1), [-1]) # Compute likelihood of an action beging chosen from the action space # only single action is needed/classified, use sparse_... here sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits( labels=sy_ac_na, logits=sy_logits_na) else: # YOUR_CODE_HERE # Assume independent continuous action sy_mean = build_mlp(sy_ob_no, ac_dim, "pi", n_layers=n_layers, size=size) # logstd should just be a trainable variable, not a network output. sy_logstd = tf.Variable(tf.zeros(shape=[1, ac_dim]), name="ac_log_std", dtype=tf.float32) sy_std = tf.exp(sy_logstd) sy_sampled_ac_k = tf.random_normal(tf.shape(sy_mean)) sy_sampled_ac = sy_mean + sy_std * sy_sampled_ac_k # Hint: Use the log probability under a multivariate gaussian for each # row, using the formula. (action independent) sy_logprob_n = -0.5 * tf.reduce_sum(tf.square( (sy_ac_na - sy_mean) / sy_std), axis=1) # ========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation # ========================================================================================# # Loss function that we'll differentiate to get the policy gradient. loss = -tf.reduce_mean(sy_logprob_n * sy_adv_n) update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) # ========================================================================================# # ----------SECTION 5---------- # Optional Baseline # ========================================================================================# if nn_baseline: baseline_prediction = tf.squeeze( build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size)) # Define placeholders for targets, a loss function and an update op for # fitting a neural network baseline. These will be used to fit the # neural network baseline. 
# YOUR_CODE_HERE baseline_target = tf.placeholder(shape=[None], name="baseline_target", dtype=tf.float32) baseline_loss = tf.nn.l2_loss(baseline_target - baseline_prediction) baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize( baseline_loss) # ========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization # ========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() # pylint: disable=E1101 # ========================================================================================# # Training Loop # ========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] # one batch starts while True: ob = env.reset() obs, acs, rewards = [], [], [] animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate) steps = 0 # single path starts while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]}) ac = ac[0] acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > max_path_length: break # single path end path = { "observation": np.array(obs), "reward": np.array(rewards), "action": np.array(acs) } paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break # one batch ends total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update # by concatenating across paths ob_no = np.concatenate([p["observation"] for p in paths]) ac_na = np.concatenate([p["action"] for p in paths]) # ====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be # used to compute advantages (which will in turn be fed to the # placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag # 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward # summed over entire trajectory (regardless of which time step # the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of # rewards starting from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a # variable 'q_n', like the 'ob_no' and 'ac_na' above. 
# # ====================================================================================# # YOUR_CODE_HERE # Use accumulate/reduce here to calculate the reward along the paths q_n = [] for p in paths: if reward_to_go: q_n += list( itertools.accumulate( p["reward"][::-1], lambda ss_r, cur_r: cur_r + gamma * ss_r))[::-1] else: q_n += [ functools.reduce(lambda ss_r, cur_r: cur_r + gamma * ss_r, p["reward"][::-1]) ] * len(p["reward"]) q_n = np.array(q_n) # ====================================================================================# # ----------SECTION 5---------- # Computing Baselines # ====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict # reward-to-go at each timestep for each trajectory, and save the # result in a variable 'b_n' like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the # statistics (mean and std) of the current or previous batch of # Q-values. (Goes with Hint #bl2 below.) b_n = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no}) # normalize to mean and std that are the same with q_n q_mean, q_std = np.mean(q_n), np.std(q_n) b_n = (b_n - np.mean(b_n)) / (np.std(b_n) + 1e-9) + 1e-9 b_n = q_mean + b_n * q_std # critics using state-dependent baselines # adv_n = q_n - b_n # Generalized advantage estimation adv_n, ll = [], 0 for p in paths: pre_v, pre_t, adv_cur = 0, 0, [] ll += len(p["reward"]) for v, r in zip(b_n[ll - 1::-1], p["reward"][::-1]): adv_cur.append(pre_v * gamma - v + r + pre_t * gamma * gae_lambda) pre_v, pre_t = v, adv_cur[-1] if reward_to_go: adv_n += adv_cur[::-1] else: adv_n += [adv_cur[-1]] * len(adv_cur) adv_n = np.array(adv_n) # Recalculate the advantages for value function estimation q_n = adv_n + b_n else: adv_n = q_n.copy() # ====================================================================================# # ----------SECTION 4---------- # Advantage Normalization # ====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to # reduce variance in policy gradient methods: normalize adv_n to # have mean zero and std=1. # YOUR_CODE_HERE # Without SECTION 5 # scale from sklearn == standardization != normalize from sklearn adv_n = scale(adv_n) # ====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline # ====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the # inputs for the baseline. # # Fit it to the current batch in order to use for the next # iteration. Use the baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, # rescale the targets to have mean zero and std=1. (Goes with Hint # #bl1 above.) q_n_0 = scale(q_n) sess.run(baseline_update_op, feed_dict={ sy_ob_no: ob_no, baseline_target: q_n_0 }) # ====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update # ====================================================================================# # Call the update operation necessary to perform the policy gradient # update based on the current batch of rollouts. 
        #
        # For debug purposes, you may wish to save the value of the loss
        # function before and after an update, and then log them below.

        # YOUR_CODE_HERE
        sess.run(update_op,
                 feed_dict={
                     sy_ob_no: ob_no,
                     sy_ac_na: ac_na,
                     sy_adv_n: adv_n,
                 })

        # Log diagnostics
        returns = [p["reward"].sum() for p in paths]
        ep_lengths = [pathlength(p) for p in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
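# The two baseline rescaling hints used above (#bl1 / #bl2) in isolation: the
# baseline network is fit to standardized targets, so its raw output has to be
# mapped back onto the statistics of the current Q-values before it is
# subtracted. A numpy-only sketch with illustrative function names:

import numpy as np


def rescale_baseline_to_q(b_n, q_n, eps=1e-8):
    """Map standardized baseline predictions onto the mean/std of q_n (#bl1)."""
    b_n = (b_n - np.mean(b_n)) / (np.std(b_n) + eps)
    return np.mean(q_n) + b_n * np.std(q_n)


def standardize_targets(q_n, eps=1e-8):
    """Targets with mean zero and std one for fitting the baseline (#bl2)."""
    return (q_n - np.mean(q_n)) / (np.std(q_n) + eps)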
def train_PG(exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32 ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. #========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages # sy_adv_n = TODO #========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) # # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. Computing the log probability of a set of actions that were actually taken, # according to the policy. 
# # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # #========================================================================================# if discrete: # YOUR_CODE_HERE sy_logits_na = build_mlp(sy_ob_no, ac_dim, "", n_layers=3, size=64, activation=tf.tanh, output_activation=tf.softmax) sy_sampled_ac = tf.multinomial(sy_logits_na, None) # Hint: Use the tf.multinomial op sy_logprob_n = tf.log(tf.multiply(sy_ac_na, sy_sampled_ac)) else: # YOUR_CODE_HERE # sy_mean = TODO # sy_logstd = TODO # logstd should just be a trainable variable, not a network output. # sy_sampled_ac = TODO # sy_logprob_n = TODO # Hint: Use the log probability under a multivariate gaussian. #========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation #========================================================================================# loss = TODO # Loss function that we'll differentiate to get the policy gradient. update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: baseline_prediction = tf.squeeze(build_mlp( sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size)) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline. # YOUR_CODE_HERE baseline_update_op = TODO #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards = [], [], [] animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]}) ac = ac[0] acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > max_path_length: break path = {"observation" : np.array(obs), "reward" : np.array(rewards), "action" : np.array(acs)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) #====================================================================================# # ----------SECTION 
4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # #====================================================================================# # YOUR_CODE_HERE q_n = TODO #====================================================================================# # ----------SECTION 5---------- # Computing Baselines #====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = TODO adv_n = q_n - b_n else: adv_n = q_n.copy() #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. # YOUR_CODE_HERE pass #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) 
# YOUR_CODE_HERE pass #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. # YOUR_CODE_HERE # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars() def main(): import argparse parser = argparse.ArgumentParser() parser.add_argument('env_name', type=str) parser.add_argument('--exp_name', type=str, default='vpg') parser.add_argument('--render', action='store_true') parser.add_argument('--discount', type=float, default=1.0) parser.add_argument('--n_iter', '-n', type=int, default=100) parser.add_argument('--batch_size', '-b', type=int, default=1000) parser.add_argument('--ep_len', '-ep', type=float, default=-1.) parser.add_argument('--learning_rate', '-lr', type=float, default=5e-3) parser.add_argument('--reward_to_go', '-rtg', action='store_true') parser.add_argument('--dont_normalize_advantages', '-dna', action='store_true') parser.add_argument('--nn_baseline', '-bl', action='store_true') parser.add_argument('--seed', type=int, default=1) parser.add_argument('--n_experiments', '-e', type=int, default=1) parser.add_argument('--n_layers', '-l', type=int, default=1) parser.add_argument('--size', '-s', type=int, default=32) args = parser.parse_args() if not(os.path.exists('data')): os.makedirs('data') logdir = args.exp_name + '_' + args.env_name + '_' + time.strftime("%d-%m-%Y_%H-%M-%S") logdir = os.path.join('data', logdir) if not(os.path.exists(logdir)): os.makedirs(logdir) max_path_length = args.ep_len if args.ep_len > 0 else None for e in range(args.n_experiments): seed = args.seed + 10*e print('Running experiment with seed %d'%seed) def train_func(): train_PG( exp_name=args.exp_name, env_name=args.env_name, n_iter=args.n_iter, gamma=args.discount, min_timesteps_per_batch=args.batch_size, max_path_length=max_path_length, learning_rate=args.learning_rate, reward_to_go=args.reward_to_go, animate=args.render, logdir=os.path.join(logdir,'%d'%seed), normalize_advantages=not(args.dont_normalize_advantages), nn_baseline=args.nn_baseline, seed=seed, n_layers=args.n_layers, size=args.size ) # Awkward hacky process runs, because Tensorflow does not like # repeatedly calling train_PG in the same thread. p = Process(target=train_func, args=tuple()) p.start() p.join() if __name__ == "__main__": main()
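# A hedged TF1-style sketch (not the assignment's reference solution) of one way
# the discrete-case TODOs in the train_PG skeleton above could be filled in.
# build_mlp is replaced by an inline tf.layers stack so the snippet stays
# self-contained, and the helper name discrete_policy_ops is illustrative.
import tensorflow as tf

def discrete_policy_ops(sy_ob_no, sy_ac_na, sy_adv_n, ac_dim, learning_rate):
    # Policy network: raw (unnormalized) logits over the discrete actions.
    h = tf.layers.dense(sy_ob_no, 32, activation=tf.tanh)
    sy_logits_na = tf.layers.dense(h, ac_dim, activation=None)
    # Sample one action per row of the batch; resulting shape is [None].
    sy_sampled_ac = tf.squeeze(tf.multinomial(sy_logits_na, 1), axis=[1])
    # log pi(a_t | s_t) for the actions that were actually taken.
    sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=sy_ac_na, logits=sy_logits_na)
    # Surrogate loss whose gradient is the policy gradient estimator.
    loss = -tf.reduce_mean(sy_logprob_n * sy_adv_n)
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return sy_logits_na, sy_sampled_ac, sy_logprob_n, loss, update_op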
def main_pendulum(n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, stepsize=1e-2, animate=False, logfile=None): env = gym.make("Pendulum-v0") ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] logz.configure_output_file(logfile) #vf = LinearValueFunction() vf = NeuralValueFunction(ob_dim) # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in these functions sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations sy_ac_n = tf.placeholder(shape=[None], name="ac", dtype=tf.float32) # batch of actions taken by the policy, used for policy gradient computation sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage function estimate sy_h1 = tf.nn.relu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0))) # hidden layer sy_mean_n = dense(sy_h1, ac_dim, "final", weight_init=normc_initializer(0.05)) # Mean control output sy_logstd_n = tf.Variable(tf.zeros([ac_dim])) sy_std_n = tf.exp(sy_logstd_n) # Get probabilities from normal distribution and sample from distribution dist = tf.contrib.distributions.Normal(mu=tf.reshape(sy_mean_n,[-1]), sigma=sy_std_n) sy_logprob_n = tf.reshape(tf.log(dist.pdf(sy_ac_n)),[-1]) sy_n = tf.shape(sy_ob_no)[0] sy_sampled_ac = dist.sample(sy_n) # sampled actions, used for defining the policy (NOT computing the policy gradient) # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>> sy_mean_n_old = tf.placeholder(shape=[None, ac_dim], name='old_mean', dtype=tf.float32) sy_std_n_old = tf.placeholder(shape=[ac_dim], name='old_std', dtype=tf.float32) sy_kl = tf.reduce_sum(tf.log(sy_std_n/sy_std_n_old)+(sy_std_n_old**0.5+(sy_mean_n_old-sy_mean_n)**0.5)/(2*sy_std_n**0.5)-0.5)/tf.to_float(sy_n) sy_ent = tf.reduce_sum(-(1+tf.log(2*math.pi*sy_std_n**2))*0.5) # <<<<<<<<<<<<< sy_surr = -tf.reduce_mean(sy_adv_n*sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. 
(We're not doing that currently) update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) sess = tf.Session() sess.__enter__() sess.run(tf.global_variables_initializer()) total_timesteps = 0 obs_mean = np.zeros(ob_dim) obs_std = np.zeros(ob_dim) for i in range(n_iter): print("********** Iteration %i ************"%i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() terminated = False obs, acs, rewards = [], [], [] animate_this_episode=(len(paths)==0 and (i % 10 == 0) and animate) while True: if animate_this_episode: env.render() obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]}) acs.append(ac.flatten()) ob, rew, done, _ = env.step(ac) rewards.append(rew.flatten()) ob = ob.flatten() if done: break path = {"observation" : np.array(obs), "terminated" : terminated, "reward" : np.array(rewards), "action" : np.array(acs)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Estimate advantage function vtargs, vpreds, advs = [], [], [] for path in paths: rew_t = path["reward"] return_t = discount(rew_t, gamma) vpred_t = vf.predict((path["observation"]-obs_mean)/(obs_std+1e-8)) adv_t = return_t.flatten() - vpred_t advs.append(adv_t) vtargs.append(return_t) vpreds.append(vpred_t) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) ac_n = np.concatenate([path["action"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n-adv_n.mean())/(adv_n.std()+1e-8) vtarg_n = np.concatenate(vtargs).flatten() vpred_n = np.concatenate(vpreds) obs_mean = np.average(ob_no,axis=0) obs_std = np.std(ob_no,axis=0) vf.fit((ob_no-obs_mean)/(obs_std+1e-8), vtarg_n) # Policy update _, mean_n, std_n = sess.run([update_op, sy_mean_n, sy_std_n], feed_dict={sy_ob_no:ob_no, sy_ac_n:ac_n.flatten(), sy_adv_n:standardized_adv_n, sy_stepsize:stepsize}) kl, ent = sess.run([sy_kl, sy_ent], feed_dict={sy_ob_no:ob_no, sy_mean_n_old: mean_n, sy_std_n_old: std_n}) desired_kl = 2e-3 if kl > desired_kl * 2: stepsize /= 1.5 print('stepsize -> %s'%stepsize) elif kl < desired_kl / 2: stepsize *= 1.5 print('stepsize -> %s'%stepsize) else: print('stepsize OK') # Log diagnostics logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logz.log_tabular("KLOldNew", kl) logz.log_tabular("Entropy", ent) logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) logz.log_tabular("TimestepsSoFar", total_timesteps) # If you're overfitting, EVAfter will be way larger than EVBefore. # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias logz.dump_tabular()
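# Standalone NumPy sketch of the diagonal-Gaussian diagnostics that the ops
# above compute symbolically: KL(old || new) averaged over the batch and the
# entropy of the current policy. Function and argument names are illustrative.
import numpy as np

def gaussian_kl(mean_old, std_old, mean_new, std_new):
    # Per-dimension KL(N(mean_old, std_old^2) || N(mean_new, std_new^2)),
    # summed over action dimensions and averaged over the batch.
    per_dim = (np.log(std_new / std_old)
               + (std_old ** 2 + (mean_old - mean_new) ** 2) / (2.0 * std_new ** 2)
               - 0.5)
    return per_dim.sum(axis=-1).mean()

def gaussian_entropy(std):
    # Entropy of a diagonal Gaussian: 0.5 * sum_i (1 + log(2 * pi * std_i^2)).
    return 0.5 * np.sum(1.0 + np.log(2.0 * np.pi * std ** 2))

# Sanity check: the KL between identical policies is zero, e.g.
# gaussian_kl(np.zeros((4, 1)), np.ones(1), np.zeros((4, 1)), np.ones(1)) -> 0.0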
def logger(id_, out):
    logz.log_tabular('Iteration', id_)
    logz.log_tabular('AverageReturn', out[0])
    logz.log_tabular('StdReturn', out[1])
    logz.dump_tabular()
def main(): DATASET_SIZE = 30000 STEPS = 200000 VALIDSET_SIZE = 2000 LR = 0.0003 BATCH_SIZE = 64 if not (os.path.exists('data/pre_trained_model')): os.makedirs('data/pre_trained_model') home = os.path.expanduser('~') expdir = os.path.join(home, 'robotics_drl/reacher/data/pre_trained_model') logz.configure_output_dir(d=expdir) D = deque(maxlen=DATASET_SIZE) V = deque(maxlen=VALIDSET_SIZE) env = environment(continuous_control=True, obs_lowdim=False, rpa=4, frames=4) obs = env.reset() home = os.path.expanduser("~") path = home + "/robotics_drl/reacher" os.chdir(path) torchvision.utils.save_image(obs.view(-1, 64, 64)[0, :, :], "test_inverted.png", normalize=True) net = network().to(device) net.apply(weights_init) optimiser = optim.Adam(net.parameters(), lr=LR) #pbar = tqdm(range(1, STEPS + 1), unit_scale=1, smoothing=0) for i in range(DATASET_SIZE): action = env.sample_action() obs, _, _ = env.step(action) target_pos = env.target_position() joint_pos = env.agent.get_joint_positions() #joint_pos = [cos(joint_pos[0]),sin(joint_pos[1])] joint_vel = env.agent.get_joint_velocities() D.append({ "target_pos": to_torch(target_pos[:2]).view(1, -1), "joint_pos": to_torch(joint_pos).view(1, -1), "joint_vel": to_torch(joint_vel).view(1, -1), "img": to_torch(obs).unsqueeze(dim=0) }) if i % 50 == 0 and i != 0: env.reset() for i in range(VALIDSET_SIZE): action = env.sample_action() obs, _, _ = env.step(action) target_pos = env.target_position() joint_pos = env.agent.get_joint_positions() #joint_pos = [cos(joint_pos[0]),sin(joint_pos[1])] joint_vel = env.agent.get_joint_velocities() V.append({ "target_pos": to_torch(target_pos[:2]).view(1, -1), "joint_pos": to_torch(joint_pos).view(1, -1), "joint_vel": to_torch(joint_vel).view(1, -1), "img": to_torch(obs).unsqueeze(dim=0) }) if i % 50 == 0 and i != 0: env.reset() for step in range(STEPS): if len(D) > BATCH_SIZE: loss = get_loss(D, BATCH_SIZE, net) optimiser.zero_grad() loss.backward() optimiser.step() if step % 800 == 0 and step != 0: net.eval() loss_v = get_loss(V, VALIDSET_SIZE, net) net.train() logz.log_tabular('Loss training', loss.item()) logz.log_tabular('Loss validation', loss_v.item()) logz.dump_tabular() #for param in net.parameters(): # print(param.data.size()) #pbar.set_description() if step % 20000 == 0 and step != 0: home = os.path.expanduser("~") path = home + "/robotics_drl/reacher/data/pre_trained_model" torch.save(net.state_dict(), os.path.join(path, "model%s.pkl" % step)) #home = os.path.expanduser("~") #path = home + "/robotics_drl/reacher/pre_trained_net_reacher/model.pkl" #net.load_state_dict(torch.load(path)) #net.eval() #get_loss(V,10,net) env.terminate()
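# A small PyTorch sketch (illustrative only, since get_loss is defined
# elsewhere) of how a training minibatch could be drawn from a deque of
# transition dicts like D and V above and stacked into batched tensors.
import random
import torch

def sample_batch(dataset, batch_size):
    batch = random.sample(dataset, min(batch_size, len(dataset)))
    imgs = torch.cat([b["img"] for b in batch], dim=0)
    # Proprioceptive targets: joint positions/velocities and target position.
    targets = torch.cat([torch.cat([b["joint_pos"], b["joint_vel"], b["target_pos"]],
                                   dim=1) for b in batch], dim=0)
    return imgs, targets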
def train_ga(exp_name='', env_name='HalfCheetah-v2', logdir=None, prob_save=0.05, n_gen=100, gamma=0.5, sigma=1e-3, pop_size=100, fitness_eval_episodes=40, max_steps=150, n_elite=20, seed=1, n_layers=2, size=64, network_activation='leaky_relu', output_activation='tanh'): start = time.time() logz.configure_output_dir(logdir) args = inspect.getargspec(train_ga)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) torch.manual_seed(seed) np.random.seed(seed) env = gym.make(env_name) env.seed(seed) discrete = isinstance(env.action_space, gym.spaces.Discrete) max_steps = int(max_steps or env.spec.max_episode_steps) input_size = env.observation_space.shape[0] output_size = env.action_space.n if discrete else env.action_space.shape[0] if network_activation == 'relu': activation = torch.nn.functional.relu elif network_activation == 'leaky_relu': activation = torch.nn.functional.leaky_relu else: activation = torch.nn.functional.tanh if output_activation == 'relu': output_a = torch.nn.functional.relu elif output_activation == 'leaky_relu': output_a = torch.nn.functional.leaky_relu elif output_activation == 'tanh': output_a = torch.nn.functional.tanh else: output_a = None center_return_all = [] member_archive = Archive(prob_save) population = get_init_population(pop_size, input_size, output_size, n_layers, size, activation, output_a, discrete) for member in population: member.setScore( compute_fitness(env, member, member_archive, fitness_eval_episodes, gamma, max_steps, discrete)) sort_members_in_place(population, reverse=True) #save in archive for member in population: member_archive.save(member) population = population[:n_elite] center_return_list = [] current_best_fitness_score = float(population[0].score) current_best_reward_score = float(population[0].reward_score) center_return_list.append(current_best_reward_score) for i_gen in range(n_gen): offsprings = [] for i in range(int((pop_size - n_elite) / 2)): parent_index = random.randint(0, n_elite - 1) parent = population[parent_index] offspring1, offspring2 = perturb_member(parent, sigma, input_size, output_size, n_layers, size, activation, output_a, discrete) offsprings.append(offspring1) offsprings.append(offspring2) for member in offsprings: member.setScore((compute_fitness(env, member, member_archive, fitness_eval_episodes, gamma, max_steps, discrete))) population = population + offsprings sort_members_in_place(population, reverse=True) for member in offsprings: member_archive.save(member) population = population[:n_elite] current_best_fitness_score = float(population[0].score) current_best_reward_score = float(population[0].reward_score) logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", i_gen) logz.log_tabular("AverageFitness", current_best_fitness_score) logz.log_tabular("stdFitness", -1) logz.log_tabular("AverageReturn", current_best_reward_score) logz.log_tabular("stdReturn", -1) logz.log_tabular("dontcare1", -1) logz.log_tabular("dontcare2", -1) logz.log_tabular("dontcare3", -1) logz.log_tabular("dontcare4", -1) logz.dump_tabular() logz.pickle_tf_vars() center_return_list.append(current_best_reward_score) center_return_all.append(center_return_list) env.close()
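# A toy NumPy sketch of the Gaussian-perturbation idea behind perturb_member
# above, applied to a flat parameter vector: mirrored noise yields the two
# offspring produced per selected parent. Names and the mirroring choice are
# assumptions, not taken from the original code.
import numpy as np

def perturb_flat_params(parent_params, sigma, rng=np.random):
    noise = rng.randn(*parent_params.shape)
    return parent_params + sigma * noise, parent_params - sigma * noise

# offspring_a, offspring_b = perturb_flat_params(np.zeros(10), sigma=1e-3)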
def main_cartpole(n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, stepsize=1e-2, animate=True, logdir=None): env = gym.make("CartPole-v0") ob_dim = env.observation_space.shape[0] num_actions = env.action_space.n logz.configure_output_dir(logdir) vf = LinearValueFunction() # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in these function sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations sy_ac_n = tf.placeholder( shape=[None], name="ac", dtype=tf.int32 ) # batch of actions taken by the policy, used for policy gradient computation sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage function estimate sy_h1 = lrelu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0))) # hidden layer sy_logits_na = dense( sy_h1, num_actions, "final", weight_init=normc_initializer(0.05) ) # "logits", describing probability distribution of final layer # we use a small initialization for the last layer, so the initial policy has maximal entropy sy_oldlogits_na = tf.placeholder( shape=[None, num_actions], name='oldlogits', dtype=tf.float32) # logits BEFORE update (just used for KL diagnostic) sy_logp_na = tf.nn.log_softmax(sy_logits_na) # logprobability of actions sy_sampled_ac = categorical_sample_logits( sy_logits_na )[0] # sampled actions, used for defining the policy (NOT computing the policy gradient) sy_n = tf.shape(sy_ob_no)[0] sy_logprob_n = fancy_slice_2d( sy_logp_na, tf.range(sy_n), sy_ac_n ) # log-prob of actions taken -- used for policy gradient calculation # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>> sy_oldlogp_na = tf.nn.log_softmax(sy_oldlogits_na) sy_oldp_na = tf.exp(sy_oldlogp_na) sy_kl = tf.reduce_sum(sy_oldp_na * (sy_oldlogp_na - sy_logp_na)) / tf.to_float(sy_n) sy_p_na = tf.exp(sy_logp_na) sy_ent = tf.reduce_sum(-sy_p_na * sy_logp_na) / tf.to_float(sy_n) # <<<<<<<<<<<<< sy_surr = -tf.reduce_mean( sy_adv_n * sy_logprob_n ) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") sy_stepsize = tf.placeholder( shape=[], dtype=tf.float32 ) # Symbolic, in case you want to change the stepsize during optimization. (We're not doing that currently) update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) # use single thread. 
on such a small problem, multithreading gives you a slowdown # this way, we can better use multiple cores for different experiments sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 total_timesteps = 0 for i in range(n_iter): print("********** Iteration %i ************" % i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() terminated = False obs, acs, rewards = [], [], [] animate_this_episode = (len(paths) == 0 and (i % 10 == 0) and animate) while True: if animate_this_episode: env.render() obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]}) acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) if done: break path = { "observation": np.array(obs), "terminated": terminated, "reward": np.array(rewards), "action": np.array(acs) } paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Estimate advantage function vtargs, vpreds, advs = [], [], [] for path in paths: rew_t = path["reward"] return_t = discount(rew_t, gamma) vpred_t = vf.predict(path["observation"]) adv_t = return_t - vpred_t advs.append(adv_t) vtargs.append(return_t) vpreds.append(vpred_t) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) ac_n = np.concatenate([path["action"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) vtarg_n = np.concatenate(vtargs) vpred_n = np.concatenate(vpreds) vf.fit(ob_no, vtarg_n) # Policy update _, oldlogits_na = sess.run( [update_op, sy_logits_na], feed_dict={ sy_ob_no: ob_no, sy_ac_n: ac_n, sy_adv_n: standardized_adv_n, sy_stepsize: stepsize }) kl, ent = sess.run([sy_kl, sy_ent], feed_dict={ sy_ob_no: ob_no, sy_oldlogits_na: oldlogits_na }) # Log diagnostics logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logz.log_tabular("KLOldNew", kl) logz.log_tabular("Entropy", ent) logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) logz.log_tabular("TimestepsSoFar", total_timesteps) # If you're overfitting, EVAfter will be way larger than EVBefore. # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias logz.dump_tabular()
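# NumPy sketch of the log-probability gather that fancy_slice_2d performs in
# the graph above: a numerically stable log-softmax followed by picking
# log pi(a_t | s_t) for the action actually taken in each row.
import numpy as np

def logprob_of_taken_actions(logits, actions):
    m = logits.max(axis=1, keepdims=True)
    logp = (logits - m) - np.log(np.exp(logits - m).sum(axis=1, keepdims=True))
    return logp[np.arange(len(actions)), actions]

# Example: logprob_of_taken_actions(np.array([[0.0, 1.0]]), np.array([1]))
# returns log(e / (1 + e)) ~= -0.313.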
def main_pendulum(logdir, seed, n_iter, gamma, min_timesteps_per_batch, initial_stepsize, desired_kl, vf_type, vf_params, animate=False): tf.set_random_seed(seed) np.random.seed(seed) env = QuadCopter(SIM_TIME_STEP, inverted_pendulum=False) ob_dim = env.stateSpace ac_dim = env.actionSpace ac_lim = env.actionLimit print("Quadcopter created") print('state_dim: ', ob_dim) print('action_dim: ', ac_dim) print('action_limit: ', ac_lim) print('max time: ', MAX_EP_TIME) print('max step: ', MAX_EP_STEPS) hover_position = np.asarray([0, 0, 0]) task = hover(hover_position) logz.configure_output_dir(logdir) if vf_type == 'linear': vf = LinearValueFunction(**vf_params) elif vf_type == 'nn': vf = NnValueFunction(ob_dim=ob_dim, **vf_params) # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in these function sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations sy_ac_n = tf.placeholder( shape=[None, ac_dim], name="ac", dtype=tf.float32 ) # batch of actions taken by the policy, used for policy gradient computation sy_adv_n = tf.placeholder(shape=[None, 1], name="adv", dtype=tf.float32) # advantage function estimate sy_h1 = tf.nn.relu( dense(sy_ob_no, 400, "h1", weight_init=normc_initializer(1.0))) # hidden layer sy_h2 = tf.nn.relu( dense(sy_h1, 300, "h2", weight_init=normc_initializer(1.0))) # hidden layer # mean_na = dense(sy_h1, ac_dim, "mean", weight_init=normc_initializer(0.05)) # "logits", describing probability distribution of final layer mean_na = tf.tanh( dense(sy_h2, ac_dim, "final", weight_init=normc_initializer( 0.1))) * ac_lim # Mean control output # std_a = tf.constant(1.0, dtype=tf.float32, shape=[ac_dim]) std_a = tf.get_variable("logstdev", [ac_dim], initializer=tf.ones_initializer()) # std_a = tf.constant(1.0, shape=[ac_dim], dtype=tf.float32) sy_sampled_ac = sample_gaussian( ac_dim, mean_na, std_a ) # sampled actions, used for defining the policy (NOT computing the policy gradient) # sy_sampled_ac = tf.zeros([1, ac_dim]) sy_prob_n = (1.0 / tf.sqrt( (tf.square(std_a) * 2 * 3.1415926))) * tf.exp(-0.5 * tf.square( (sy_ac_n - mean_na) / std_a)) # sy_prob_n = (1.0/(std_a*2.5067)) * tf.exp(-0.5*tf.square((sy_ac_n - mean_na)/std_a)) sy_logprob_n = tf.log(sy_prob_n) # sub = tf.subtract(sy_ac_n, mean_na) # mul = tf.multiply(sub, sy_h1) # sy_logprob_n = tf.log(tf.divide(sub, tf.square(std_a))) # log-prob of actions taken -- used for policy gradient calculation # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>> sy_n = tf.shape(sy_ob_no)[0] old_mean_na = tf.placeholder( shape=[None, ac_dim], name='old_mean_a', dtype=tf.float32) # mean_a BEFORE update (just used for KL diagnostic) old_std_a = tf.placeholder( shape=[ac_dim], name='old_std_a', dtype=tf.float32) # std_a BEFORE update (just used for KL diagnostic) # KL sy_kl = tf.reduce_mean( tf.log(std_a / old_std_a) + (tf.square(old_std_a) + tf.square(old_mean_na - mean_na)) / (2 * tf.square(std_a)) - 0.5) # entropy sy_p_na = tf.exp(mean_na) sy_ent = tf.reduce_sum(-sy_p_na * mean_na) / tf.to_float(sy_n) # <<<<<<<<<<<<< sy_surr = -tf.reduce_mean( sy_adv_n * sy_logprob_n ) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") sy_stepsize = tf.placeholder( shape=[], dtype=tf.float32 ) # Symbolic, in case you want to change the stepsize during optimization. 
(We're not doing that currently) update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) sess = tf.Session() sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 total_timesteps = 0 stepsize = initial_stepsize for i in range(n_iter): print("********** Iteration %i ************" % i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() terminated = False obs, acs, rewards = [], [], [] j = 0 while True: j += 1 ob = ob.reshape(ob.shape[0], ) obs.append(ob) # print ob # mean = sess.run(mean_na, feed_dict={sy_ob_no : ob[None]})[0] ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})[0] # print ac ob, done, _ = env.step(ac) rew = task.reward(ob, done, _) # ac = np.asscalar(ac) acs.append(ac) rew = np.asscalar(rew) rewards.append(rew) if done or j >= MAX_EP_STEPS: # print "done" break path = { "observation": np.array(obs), "terminated": terminated, "reward": np.array(rewards), "action": np.array(acs) } paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Estimate advantage function vtargs, vpreds, advs = [], [], [] for path in paths: rew_t = path["reward"] return_t = discount(rew_t, gamma) vpred_t = vf.predict(path["observation"]) adv_t = return_t - vpred_t # print("return_t: ", return_t.shape) # print("vpred_t: ", vpred_t.shape) # print("adv_t: ", adv_t.shape) advs.append(adv_t) vtargs.append(return_t) vpreds.append(vpred_t) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) ac_n = np.concatenate([path["action"] for path in paths]) ac_n = ac_n.reshape([-1, ac_dim]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) standardized_adv_n = standardized_adv_n.reshape([-1, 1]) vtarg_n = np.concatenate(vtargs) vpred_n = np.concatenate(vpreds) vf.fit(ob_no, vtarg_n) # Policy update # print standardized_adv_n surr, adv, logp = sess.run( [sy_surr, sy_adv_n, sy_prob_n], feed_dict={ sy_ob_no: ob_no, sy_ac_n: ac_n, sy_adv_n: standardized_adv_n, sy_stepsize: stepsize }) _, old_mean, old_std = sess.run( [update_op, mean_na, std_a], feed_dict={ sy_ob_no: ob_no, sy_ac_n: ac_n, sy_adv_n: standardized_adv_n, sy_stepsize: stepsize }) kl, ent = sess.run([sy_kl, sy_ent], feed_dict={ sy_ob_no: ob_no, old_mean_na: old_mean, old_std_a: old_std }) # KL if kl > desired_kl * 2: stepsize /= 1.5 print('stepsize -> %s' % stepsize) elif kl < desired_kl / 2: stepsize *= 1.5 print('stepsize -> %s' % stepsize) else: print('stepsize OK') # Log diagnostics logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) # logz.log_tabular("std", old_std) logz.log_tabular("KLOldNew", kl) logz.log_tabular("Entropy", ent) logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) logz.log_tabular("TimestepsSoFar", total_timesteps) # If you're overfitting, EVAfter will be way larger than EVBefore. # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias logz.dump_tabular()
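# NumPy sketch of the diagonal-Gaussian log-density that sy_logprob_n above
# builds via tf.log of the pdf; working directly in log space, as here, is the
# numerically safer variant. Argument names are illustrative.
import numpy as np

def gaussian_logprob(actions, mean, std):
    # Sum over action dimensions of log N(a_i | mean_i, std_i^2), per batch row.
    return np.sum(-0.5 * np.log(2.0 * np.pi * std ** 2)
                  - 0.5 * ((actions - mean) / std) ** 2, axis=-1)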
def train(env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None): """ Arguments: onpol_iters Number of iterations of onpolicy aggregation for the loop to run. dynamics_iters Number of iterations of training for the dynamics model |_ which happen per iteration of the aggregation loop. batch_size Batch size for dynamics training. num_paths_random Number of paths/trajectories/rollouts generated | by a random agent. We use these to train our |_ initial dynamics model. num_paths_onpol Number of paths to collect at each iteration of |_ aggregation, using the Model Predictive Control policy. num_simulated_paths How many fictitious rollouts the MPC policy | should generate each time it is asked for an |_ action. env_horizon Number of timesteps in each path. mpc_horizon The MPC policy generates actions by imagining | fictitious rollouts, and picking the first action | of the best fictitious rollout. This argument is | how many timesteps should be in each fictitious |_ rollout. n_layers/size/activations Neural network architecture arguments. """ logz.configure_output_dir(logdir) #======================================================== # # First, we need a lot of data generated by a random # agent, with which we'll begin to train our dynamics # model. random_controller = RandomController(env) paths = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, render=False, verbose=False) #======================================================== # # The random data will be used to get statistics (mean # and std) for the observations, actions, and deltas # (where deltas are o_{t+1} - o_t). These will be used # for normalizing inputs and denormalizing outputs # from the dynamics network. # normalization = compute_normalization(paths) #======================================================== # # Build dynamics model and MPC controllers. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. # sess.__enter__() tf.global_variables_initializer().run() #======================================================== # # Take multiple iterations of onpolicy aggregation at each iteration refitting the dynamics model to current dataset and then taking onpolicy samples and aggregating to the dataset. 
# Note: You don't need to use a mixing ratio in this assignment for new and old data as described in https://arxiv.org/abs/1708.02596 # for itr in range(onpol_iters): dyn_model.fit(paths) new_paths = sample(env, mpc_controller, num_paths=num_paths_onpol, horizon=env_horizon, render=False, verbose=False) costs = [] returns = [] for new_path in new_paths: cost = path_cost(cost_fn, new_path) costs.append(cost) returns.append(new_path['return']) costs = np.array(costs) returns = np.array(returns) paths = paths + new_paths # Aggregation # LOGGING # Statistics for performance of MPC policy using # our learned dynamics model logz.log_tabular('Iteration', itr) # In terms of cost function which your MPC controller uses to plan logz.log_tabular('AverageCost', np.mean(costs)) logz.log_tabular('StdCost', np.std(costs)) logz.log_tabular('MinimumCost', np.min(costs)) logz.log_tabular('MaximumCost', np.max(costs)) # In terms of true environment reward of your rolled out trajectory using the MPC controller logz.log_tabular('AverageReturn', np.mean(returns)) logz.log_tabular('StdReturn', np.std(returns)) logz.log_tabular('MinimumReturn', np.min(returns)) logz.log_tabular('MaximumReturn', np.max(returns)) logz.dump_tabular()
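# A compact NumPy sketch of the random-shooting scheme the MPCcontroller above
# relies on: sample many candidate action sequences, roll each out through the
# learned dynamics model, score them with the cost function, and execute the
# first action of the cheapest imagined rollout. dynamics_fn and cost_fn are
# stand-ins with assumed batched signatures, not the repo's actual interfaces.
import numpy as np

def random_shooting_action(state, dynamics_fn, cost_fn, action_dim,
                           horizon=15, num_simulated_paths=1000, rng=np.random):
    actions = rng.uniform(-1.0, 1.0, size=(num_simulated_paths, horizon, action_dim))
    states = np.repeat(state[None], num_simulated_paths, axis=0)
    total_cost = np.zeros(num_simulated_paths)
    for t in range(horizon):
        next_states = dynamics_fn(states, actions[:, t])
        total_cost += cost_fn(states, actions[:, t], next_states)
        states = next_states
    return actions[np.argmin(total_cost), 0]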
def train( env, cost_fn, logdir=None, render=False, learning_rate=1e-3, onpol_iters=10, dynamics_iters=60, batch_size=512, num_paths_random=10, num_paths_onpol=10, num_simulated_paths=10000, env_horizon=1000, mpc_horizon=15, n_layers=2, size=500, activation=tf.nn.relu, output_activation=None, clip_param=0.2, entcoeff=0.0, gamma=0.99, lam=0.95, optim_epochs=10, optim_batchsize=64, schedule='linear', bc_lr=1e-3, ppo_lr=3e-4, timesteps_per_actorbatch=1000, MPC=True, BEHAVIORAL_CLONING=True, PPO=True, ): start = time.time() logz.configure_output_dir(logdir) print("-------- env info --------") print("observation_space: ", env.observation_space.shape) print("action_space: ", env.action_space.shape) print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING) print("PPO: ", PPO) print(" ") random_controller = RandomController(env) model_data_buffer = DataBuffer() ppo_data_buffer = DataBuffer_general(10000, 4) bc_data_buffer = DataBuffer_general(BC_BUFFER_SIZE, 2) # random sample path print("collecting random data ..... ") paths = sample(env, random_controller, num_paths=num_paths_random, horizon=env_horizon, render=False, verbose=False) # add into buffer for path in paths: for n in range(len(path['observations'])): model_data_buffer.add(path['observations'][n], path['actions'][n], path['next_observations'][n]) print("model data buffer size: ", model_data_buffer.size) normalization = compute_normalization(model_data_buffer) #======================================================== # # Build dynamics model and MPC controllers and Behavioral cloning network. # sess = tf.Session() dyn_model = NNDynamicsModel(env=env, n_layers=n_layers, size=size, activation=activation, output_activation=output_activation, normalization=normalization, batch_size=batch_size, iterations=dynamics_iters, learning_rate=learning_rate, sess=sess) mpc_controller = MPCcontroller(env=env, dyn_model=dyn_model, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) policy_nn = MlpPolicy_bc(sess=sess, env=env, hid_size=128, num_hid_layers=2, clip_param=clip_param, entcoeff=entcoeff) mpc_controller_bc_ppo = MPCcontroller_BC_PPO( env=env, dyn_model=dyn_model, bc_ppo_network=policy_nn, horizon=mpc_horizon, cost_fn=cost_fn, num_simulated_paths=num_simulated_paths) #======================================================== # # Tensorflow session building. 
# sess.__enter__() tf.global_variables_initializer().run() # init or load checkpoint with saver saver = tf.train.Saver() checkpoint = tf.train.get_checkpoint_state(CHECKPOINT_DIR) if checkpoint and checkpoint.model_checkpoint_path and LOAD_MODEL: saver.restore(sess, checkpoint.model_checkpoint_path) print("checkpoint loaded:", checkpoint.model_checkpoint_path) else: print("Could not find old checkpoint") if not os.path.exists(CHECKPOINT_DIR): os.mkdir(CHECKPOINT_DIR) #======================================================== # # Prepare for rollouts # episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=100) # rolling buffer for episode lengths rewbuffer = deque(maxlen=100) # rolling buffer for episode rewards max_timesteps = num_paths_onpol * env_horizon bc = False bc_ppo_mpc = False for itr in range(onpol_iters): print("onpol_iters: ", itr) if MPC: dyn_model.fit(model_data_buffer) if schedule == 'constant': cur_lrmult = 1.0 elif schedule == 'linear': # cur_lrmult = max(1.0 - float(timesteps_so_far) / max_timesteps, 0) cur_lrmult = 1.0 print("bc learning_rate: ", bc_lr) print("ppo learning_rate: ", ppo_lr) # saver.save(sess, CHECKPOINT_DIR) bc_return = behavioral_cloning_eval(sess, env, policy_nn, env_horizon) if bc_return > 100: bc_ppo_mpc = True else: bc_ppo_mpc = False ppo_data_buffer.clear() if (itr % 2 != 0 and bc_ppo_mpc) or not MPC: direct_mpc = False else: direct_mpc = True seg = traj_segment_generator(policy_nn, mpc_controller, mpc_controller_bc_ppo, bc_data_buffer, env, MPC, direct_mpc, bc_ppo_mpc, env_horizon) add_vtarg_and_adv(seg, gamma, lam) # check if seg is good ep_lengths = seg["ep_lens"] returns = seg["ep_rets"] if np.mean(returns) > 100: bc = True else: bc = False print("BEHAVIORAL_CLONING: ", BEHAVIORAL_CLONING and bc) ob, ac, mpcac, rew, nxt_ob, atarg, tdlamret = seg["ob"], seg[ "ac"], seg["mpcac"], seg["rew"], seg["nxt_ob"], seg["adv"], seg[ "tdlamret"] vpredbefore = seg["vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean() ) / atarg.std() # standardized advantage function estimate for n in range(len(ob)): if PPO: ppo_data_buffer.add([ob[n], ac[n], atarg[n], tdlamret[n]]) if BEHAVIORAL_CLONING and bc: bc_data_buffer.add([ob[n], mpcac[n]]) if MPC: model_data_buffer.add(ob[n], ac[n], nxt_ob[n]) print("ppo_data_buffer size", ppo_data_buffer.size) print("bc_data_buffer size", bc_data_buffer.size) print("model data buffer size: ", model_data_buffer.size) # optim_batchsize = optim_batchsize or ob.shape[0] if hasattr(policy_nn, "ob_rms"): policy_nn.ob_rms.update(ob) # update running mean/std for policy policy_nn.assign_old_eq_new( ) # set old parameter values to new parameter values for op_ep in range(optim_epochs): # losses = [] # list of tuples, each of which gives the loss for a minibatch # for i in range(int(timesteps_per_actorbatch/optim_batchsize)): if PPO: sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target = ppo_data_buffer.sample( optim_batchsize) newlosses = policy_nn.lossandupdate_ppo( sample_ob_no, sample_ac_na, sample_adv_n, sample_b_n_target, cur_lrmult, ppo_lr * cur_lrmult) # losses.append(newlosses) if BEHAVIORAL_CLONING and bc: sample_ob_no, sample_ac_na = bc_data_buffer.sample( optim_batchsize) # print("sample_ob_no", sample_ob_no.shape) # print("sample_ac_na", sample_ac_na.shape) policy_nn.update_bc(sample_ob_no, sample_ac_na, bc_lr * cur_lrmult) if op_ep % (100) == 0 and BEHAVIORAL_CLONING and bc: print('epcho: ', op_ep) behavioral_cloning_eval(sess, env, policy_nn, 
env_horizon) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 # if np.mean(returns) > 1000: # filename = "seg_data.pkl" # pickle.dump(seg, open(filename, 'wb')) # print("saved", filename) logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", iters_so_far) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) # logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", timesteps_so_far) logz.dump_tabular() logz.pickle_tf_vars()
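# NumPy sketch of the GAE(lambda) computation that add_vtarg_and_adv above
# presumably performs: advantages from one-step TD residuals plus the matching
# lambda-return targets. Inputs are 1-D arrays over a rollout; names and the
# done-flag convention are assumptions.
import numpy as np

def gae_advantages(rewards, vpreds, dones, gamma, lam, last_vpred=0.0):
    T = len(rewards)
    adv = np.zeros(T)
    lastgaelam = 0.0
    for t in reversed(range(T)):
        nonterminal = 1.0 - float(dones[t])
        next_v = vpreds[t + 1] if t + 1 < T else last_vpred
        delta = rewards[t] + gamma * next_v * nonterminal - vpreds[t]
        lastgaelam = delta + gamma * lam * nonterminal * lastgaelam
        adv[t] = lastgaelam
    return adv, adv + np.asarray(vpreds)  # (advantages, "tdlamret" targets)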
def train(episodes, learning_rate, batch_size, gamma, eps_start, eps_end, eps_decay, target_update, max_steps, buffer_size, random_link, random_target, repeat_actions, logdir): setup_logger(logdir, locals()) env = environment() eval_policy = evaluation(env,logdir) env.reset_robot_position(random_=True) env.reset_target_position(random_=False) # resize = T.Compose([T.ToPILImage(), # T.Grayscale(num_output_channels=1), # T.Resize(64, interpolation = Image.BILINEAR), # T.ToTensor()]) img = env.get_obs() img = torch.from_numpy(img.copy()) # img_height, img_width, _ = img.shape policy_net = DQN_FC().to(device) target_net = DQN_FC().to(device) target_net.load_state_dict(policy_net.state_dict()) target_net.eval() optimizer = optim.Adam(policy_net.parameters(), lr = learning_rate) memory = Replay_Buffer(buffer_size) obs = env.get_obs() obs = torch.from_numpy((obs)).view(1,-1) successes = 0 target_upd = 0 grad_upd = 0 steps_train = 0 for ep in range(1,episodes+1): env.reset_robot_position(random_=random_link) env.reset_target_position(random_=random_target) #target after link reset so vel=0 rewards_ep = 0 steps_ep = 0 steps_all = [] rewards_all = [] sampling_time = 0 start_time = time.time() while True: action, eps_threshold = select_actions(obs, eps_start, eps_end, eps_decay, steps_train, policy_net,env) reward, done = env.step_(action) reward = torch.tensor(reward,dtype=torch.float).view(-1,1) obs_next = env.get_obs() obs_next = torch.from_numpy(obs_next).view(1,-1).to(device) transition = {'s': obs.to(device), 'a': action.to(device), 'r': reward, "s'": obs_next.to(device) } steps_ep += 1 steps_train += 1 rewards_ep += reward memory_state = memory.push(transition) obs = env.get_obs() obs = torch.from_numpy((obs)).view(1,-1) if done: rewards_all.append(rewards_ep/steps_ep) steps_all.append(steps_ep) successes += 1 break elif steps_ep == max_steps: rewards_all.append(rewards_ep) steps_all.append(steps_ep) break status = optimize_model(policy_net, target_net, optimizer, memory, gamma, batch_size) if status != False: grad_upd += 1 for param, target_param in zip(policy_net.parameters(),target_net.parameters()): target_param.data = 0.995 * target_param.data + (1 - 0.995) * param.data #target_net.load_state_dict(policy_net.state_dict()) target_net.eval() end_time = time.time() sampling_time += end_time-start_time sampling_time /= ep if ep % 40 == 0: return_val, steps_val = eval_policy.sample_episode(policy_net,save_video=True if ep%500==0 else False, n_episodes=5) qvalue_eval = eval_policy.get_qvalue(policy_net) logz.log_tabular('Averaged Steps Traning',np.around(np.average(steps_all),decimals=0)) # last 10 episodes logz.log_tabular('Averaged Return Training',np.around(np.average(rewards_all),decimals=2)) logz.log_tabular('Averaged Steps Validation',np.around(np.average(steps_val),decimals=0)) logz.log_tabular('Averaged Return Validation',np.around(np.average(return_val),decimals=2)) logz.log_tabular('Cumulative Successes',successes) logz.log_tabular('Number of episodes',ep) logz.log_tabular('Sampling time (s)', sampling_time) logz.log_tabular('Epsilon threshold', eps_threshold) logz.log_tabular('Gradient update', grad_upd ) logz.log_tabular('Average q-value evaluation', qvalue_eval) logz.dump_tabular() steps_all = [] rewards_all = [] logz.save_pytorch_model(policy_net.state_dict()) env.terminate()
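# Standalone PyTorch sketch of the soft (Polyak) target-network update done
# inline in the loop above; tau = 0.005 corresponds to the 0.995 retention
# factor used there.
import torch

def soft_update(policy_net, target_net, tau=0.005):
    with torch.no_grad():
        for p, tp in zip(policy_net.parameters(), target_net.parameters()):
            tp.data.mul_(1.0 - tau).add_(tau * p.data)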
def main_pendulum(logdir, seed, n_iter, gamma, min_timesteps_per_batch, initial_stepsize, desired_kl, vf_type, vf_params, animate=False): tf.set_random_seed(seed) np.random.seed(seed) env = gym.make("Pendulum-v0") ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] logz.configure_output_dir(logdir, CLEAR_LOGS) if vf_type == 'linear': vf = LinearValueFunction(**vf_params) elif vf_type == 'nn': vf = NnValueFunction(ob_dim=ob_dim, **vf_params) sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations sy_ac_n = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # batch of actions taken by the policy, used for policy gradient computation sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage function estimate sy_h1 = lrelu(dense(sy_ob_no, 128, "h1", weight_init=normc_initializer(1.0))) sy_h2 = lrelu(dense(sy_h1, 128, "h2", weight_init=normc_initializer(1.0))) sy_mean_na = dense(sy_h2, ac_dim, 'mean_na', weight_init=normc_initializer(0.1)) # Mean control output sy_logstd_a = tf.get_variable('logstdev', [ac_dim], initializer=tf.zeros_initializer) # Variance sy_std_a = tf.exp(sy_logstd_a) sy_dist = tf.contrib.distributions.Normal(mu=sy_mean_na, sigma=sy_std_a, validate_args=True) sy_sampled_ac = sy_dist.sample(ac_dim)[0, :, 0] sy_logprob_n = tf.squeeze(tf.log(sy_dist.prob(sy_ac_n))) # log-prob of actions taken -- used for policy gradient calculation sy_old_mean_na = tf.placeholder(shape=[None, ac_dim], name='old_mean_na', dtype=tf.float32) sy_old_logstd_a = tf.placeholder(shape=[ac_dim], name='old_logstdev', dtype=tf.float32) sy_old_std_a = tf.exp(sy_old_logstd_a) sy_old_dist = tf.contrib.distributions.Normal(mu=sy_old_mean_na, sigma=sy_old_std_a, validate_args=True) sy_kl = tf.reduce_mean(tf.contrib.distributions.kl(sy_old_dist, sy_dist, allow_nan=False)) sy_ent = tf.reduce_mean(sy_dist.entropy()) sy_surr = -tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) sess = tf.Session() sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() # pylint: disable=E1101 total_timesteps = 0 stepsize = initial_stepsize for i in range(n_iter): print("********** Iteration %i ************" % i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() terminated = False obs, acs, rewards = [], [], [] animate_this_episode = (len(paths) == 0 and (i % 10 == 0) and animate) while True: if animate_this_episode: env.render() obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]}) acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) if done: break path = {"observation": np.array(obs), "terminated": terminated, "reward": np.array(rewards), "action": np.array(acs)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Estimate advantage function vtargs, vpreds, advs = [], [], [] for path in paths: rew_t = path["reward"] return_t = discount(rew_t, gamma) vpred_t = vf.predict(path["observation"]) adv_t = return_t - vpred_t advs.append(adv_t) vtargs.append(return_t) vpreds.append(vpred_t) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) ac_n = 
np.concatenate([path["action"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8) vtarg_n = np.concatenate(vtargs) vpred_n = np.concatenate(vpreds) vf.fit(ob_no, vtarg_n) # Policy update _, old_mean_na, old_logstd_a = sess.run([update_op, sy_mean_na, sy_logstd_a], feed_dict={ sy_ob_no: ob_no, sy_ac_n: ac_n, sy_adv_n: standardized_adv_n, sy_stepsize: stepsize}) kl, ent = sess.run([sy_kl, sy_ent], feed_dict={ sy_ob_no: ob_no, sy_old_mean_na: old_mean_na, sy_old_logstd_a: old_logstd_a}) if kl > desired_kl * 2: stepsize /= 1.5 print('stepsize -> %s' % stepsize) elif kl < desired_kl / 2: stepsize *= 1.5 print('stepsize -> %s' % stepsize) else: print('stepsize OK') # Log diagnostics logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logz.log_tabular("KLOldNew", kl) logz.log_tabular("Entropy", ent) logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) logz.log_tabular("TimestepsSoFar", total_timesteps) # If you're overfitting, EVAfter will be way larger than EVBefore. # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias logz.dump_tabular()
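# Plain-Python sketch of the KL-based adaptive stepsize rule used above (and in
# the other training loops in this file): shrink the learning rate when the KL
# between consecutive policies overshoots the target, grow it when it
# undershoots, and leave it unchanged otherwise.
def adapt_stepsize(stepsize, kl, desired_kl, factor=1.5):
    if kl > desired_kl * 2:
        return stepsize / factor
    if kl < desired_kl / 2:
        return stepsize * factor
    return stepsize

# stepsize = adapt_stepsize(stepsize, kl, desired_kl)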
def train_PG( exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] print("OB AND ACTION DIM=============") print(ob_dim) print(ac_dim) #========================================================================================# # ----------SECTION 4---------- # Placeholders # # Need these for batch observations / actions / advantages in policy gradient loss function. #========================================================================================# sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) if discrete: sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) else: sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32) # Define a placeholder for advantages # CODE sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) #========================================================================================# # ----------SECTION 4---------- # Networks # # Make symbolic operations for # 1. Policy network outputs which describe the policy distribution. # a. For the discrete case, just logits for each action. # # b. For the continuous case, the mean / log std of a Gaussian distribution over # actions. # # Hint: use the 'build_mlp' function you defined in utilities. # # Note: these ops should be functions of the placeholder 'sy_ob_no' # # 2. Producing samples stochastically from the policy distribution. # a. For the discrete case, an op that takes in logits and produces actions. # # Should have shape [None] # # b. For the continuous case, use the reparameterization trick: # The output from a Gaussian distribution with mean 'mu' and std 'sigma' is # # mu + sigma * z, z ~ N(0, I) # # This reduces the problem to just sampling z. (Hint: use tf.random_normal!) # # Should have shape [None, ac_dim] # # Note: these ops should be functions of the policy network output ops. # # 3. 
Computing the log probability of a set of actions that were actually taken, # according to the policy. # # Note: these ops should be functions of the placeholder 'sy_ac_na', and the # policy network output ops. # #========================================================================================# if discrete: # YOUR_CODE_HERE # Takes in the observation and returns the logits for actions as per our policy net sy_logits_na = build_mlp(input_placeholder=sy_ob_no, output_size=ac_dim, scope="Discrete") # Sample an action to be taken. sy_sampled_ac = tf.squeeze(tf.multinomial(sy_logits_na, 1), [1]) tf.assert_rank(sy_logits_na, 2) tf.assert_rank(sy_sampled_ac, 1) # Figure out the log probablity (as per our current policy) of the action that was actually # taken. action_one_hot = tf.one_hot(indices=sy_ac_na, depth=ac_dim) action_taken_logit = tf.reduce_sum(action_one_hot * sy_logits_na, axis=1) normalizer = tf.reduce_sum(tf.exp(sy_logits_na), axis=1) sy_logprob_n = action_taken_logit - tf.log(normalizer) tf.assert_rank(sy_logprob_n, 1) else: # YOUR_CODE_HERE sy_mean = build_mlp(input_placeholder=sy_ob_no, output_size=ac_dim, scope="Continuous") sy_logstd = tf.Variable( 0.0, name="sy_logstd" ) # logstd should just be a trainable variable, not a network output. # For sampling, we use a reparameterization trick. mu + sigma * z, where z ~ N(O, I) # Hint: Use the log probability under a multivariate gaussian. # For finding the probability of the action (multi-dimensional) that was actually taken, first # define a normal distribution with the above mean and std. Note that this defines multiple scalar # distributions with same variance. Equivalent of multi variate gaussian with diagonal covariance matrix # with same diagonal value of std (independent variables). # NOTE: we technically don't need the tf.exp() on the std, since we can assume that the variable is # representing the std directly than its log, and force > 0. However, that may introduce some numerical # instability and leads to some nans in loss and actions. dist = tf.distributions.Normal(loc=sy_mean, scale=tf.exp(sy_logstd)) # Since we are using independent Normal vars to represent a multivariate Gaussian with independent # variables, to get the overall probablity, we have to multiply the individual probabilities # obtained from the Normal. # P(x1, x2) = P(x1) * P(x2). Thus summing in log domain. sy_logprob_n = tf.reduce_sum(dist.log_prob(sy_ac_na), axis=1) # sy_sampled_ac = sy_mean + sy_logstd * tf.random_normal(shape=[ac_dim]) sy_sampled_ac = dist.sample() tf.assert_rank(sy_sampled_ac, 2) tf.assert_rank(sy_logprob_n, 1) #========================================================================================# # ----------SECTION 4---------- # Loss Function and Training Operation #========================================================================================# # Note the -ve sign, since the remainder is the reward, whereas we are defining loss. loss = -tf.reduce_mean( sy_logprob_n * sy_adv_n ) # Loss function that we'll differentiate to get the policy gradient. 
update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: baseline_prediction = tf.squeeze( build_mlp(sy_ob_no, 1, "nn_baseline", n_layers=n_layers, size=size)) # Define placeholders for targets, a loss function and an update op for fitting a # neural network baseline. These will be used to fit the neural network baseline. # YOUR_CODE_HERE # Targets for the baseline will be provided by the paths collected from experience target_bn = tf.placeholder(shape=[None], name="target_bn", dtype=tf.float32) loss_bn = tf.losses.mean_squared_error(target_bn, baseline_prediction) baseline_update_op = tf.train.AdamOptimizer(learning_rate).minimize( loss_bn) #========================================================================================# # Tensorflow Engineering: Config, Session, Variable initialization #========================================================================================# tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1) sess = tf.Session(config=tf_config) sess.__enter__() # equivalent to `with sess:` tf.global_variables_initializer().run() #pylint: disable=E1101 #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards = [], [], [] animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]}) ac = ac[0] acs.append(ac) #print("OBS, ACTION") #print(ob) #print(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > max_path_length: break path = { "observation": np.array(obs), "reward": np.array(rewards), "action": np.array(acs) } paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. 
# # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # #====================================================================================# # YOUR_CODE_HERE if reward_to_go is False: # trajectory (path) based PG. # Get the reward for each path as the sum of rewards along the path. # In this scheme, the reward Ret(tau) is the same for every timestamp along # the path. # So just replicate the path reward for each timestamp along that path. rewards_path_repl = [[ np.sum( np.power(gamma, i) * rew for i, rew in enumerate(path["reward"])) ] * len(path["reward"]) for path in paths] # Concate the paths similar to ob_no and ac_na. q_n = np.concatenate(rewards_path_repl) else: discounted_rewards_paths = [] for path in paths: # path["rewards"] -> array with rewards. discounted_sum = 0 discounted_rewards = [] # go over the rewards in reverse order. multiply by gamma and add to previous sum # to get the next sum. This gets the intended rewards in the reverse order, so ultimately # reverse the resulting array (or alternative would be to fill the array at 0 as we go.) for i, rew in enumerate(path['reward'][::-1]): # print('i, rew: ', i, rew) discounted_sum = gamma * discounted_sum + rew discounted_rewards.append(discounted_sum) discounted_rewards_paths.append(discounted_rewards[::-1]) q_n = np.concatenate(discounted_rewards_paths) #====================================================================================# # ----------SECTION 5---------- # Computing Baselines #====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n_orig = sess.run(baseline_prediction, feed_dict={sy_ob_no: ob_no}) # b_n_orig is expected to be zero mean and std 1 since that is what we are targeting # in the graph training. So scale with the q_n stats. mean_q = np.mean(q_n) std_q = np.std(q_n) # now b_n should have mean of q_n and std of q_n. b_n = b_n_orig * std_q + mean_q adv_n = q_n - b_n else: adv_n = q_n.copy() #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
# YOUR_CODE_HERE adv_n = (adv_n - np.mean(adv_n)) / np.std(adv_n) #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) # YOUR_CODE_HERE # Use the previous network weights to predict the baseline values for calculating # targets = reward[i] + gamma * b_n[i+1] (unless end of episode). This is like the # TD(0) target [if we want Monte Carlo, then the targets will be q_n but that is # going to be noisy]. # b_n should have mean and std same as that of q_n since we scaled that above # before advantage normalization. reward[i] should come from the same distribution # as q_n. # q_values = [] j = 0 for path in paths: path_reward = path["reward"] path_obs = path["observation"] for i in range(len(path_reward)): b_next = b_n[j + 1] if i < len(path_reward) - 1 else 0 q_values.append(path_reward[i] + gamma * b_next) j = j + 1 # Now that we have the targets, we should scale them back to 0 mean and 1 std before # setting it as target for the graph to backprop. q_values = np.array(q_values) targets_ = (q_values - np.mean(q_values)) / np.std(q_values) sess.run(baseline_update_op, feed_dict={ sy_ob_no: ob_no, target_bn: targets_ }) #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. # YOUR_CODE_HERE print("q_n shape: ", q_n.shape) print("ob_no shape: ", ob_no.shape) print("ac_na shape: ", ac_na.shape) update_, loss_, sy_logprob_n_ = sess.run( [update_op, loss, sy_logprob_n], feed_dict={ sy_ob_no: ob_no, sy_ac_na: ac_na, sy_adv_n: q_n }) print("sy_logprob_n [Chosen action log prob] Shape: ", sy_logprob_n_.shape) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("Loss", loss_) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
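#========================================================================================#
# Illustrative sketch (hypothetical helpers, not the assignment's own code): the two
# return estimators that the 'reward_to_go' flag above switches between, written as
# standalone numpy functions so they can be sanity-checked in isolation.
#========================================================================================#
import numpy as np

def trajectory_return(rewards, gamma):
    """Case 1: Ret(tau) = sum_{t'} gamma^t' * r_{t'}, replicated for every timestep."""
    ret = sum((gamma ** t) * r for t, r in enumerate(rewards))
    return np.full(len(rewards), ret)

def reward_to_go_return(rewards, gamma):
    """Case 2: Q_t = sum_{t'>=t} gamma^(t'-t) * r_{t'}, via a single reverse pass."""
    q = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        q[t] = running
    return q

# q_n for a whole batch would then be np.concatenate over the per-path results, mirroring
# the ob_no / ac_na concatenation in the loop above.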
def train(self, train_db, val_db, test_db): ################################################################## ## LOG ################################################################## logz.configure_output_dir(self.cfg.model_dir) logz.save_config(self.cfg) ################################################################## ## NN table ################################################################## if self.cfg.use_hard_mining: self.train_tables = AllCategoriesTables(train_db) self.val_tables = AllCategoriesTables(val_db) self.train_tables.build_nntables_for_all_categories(True) self.val_tables.build_nntables_for_all_categories(True) ################################################################## ## Main loop ################################################################## start = time() min_val_loss = 100000000 for epoch in range(self.epoch, self.cfg.n_epochs): ################################################################## ## Training ################################################################## torch.cuda.empty_cache() train_loss, train_accu = self.train_epoch(train_db, epoch) ################################################################## ## Validation ################################################################## torch.cuda.empty_cache() val_loss, val_accu = self.validate_epoch(val_db, epoch) ################################################################## ## Logging ################################################################## # update optim scheduler current_val_loss = np.mean(val_loss[:, 0]) # self.optimizer.update(current_val_loss, epoch) logz.log_tabular("Time", time() - start) logz.log_tabular("Iteration", epoch) logz.log_tabular("AverageLoss", np.mean(train_loss[:, 0])) logz.log_tabular("AveragePredLoss", np.mean(train_loss[:, 1])) logz.log_tabular("AverageEmbedLoss", np.mean(train_loss[:, 2])) logz.log_tabular("AverageAttnLoss", np.mean(train_loss[:, 3])) logz.log_tabular("AverageObjAccu", np.mean(train_accu[:, 0])) logz.log_tabular("AverageCoordAccu", np.mean(train_accu[:, 1])) logz.log_tabular("AverageScaleAccu", np.mean(train_accu[:, 2])) logz.log_tabular("AverageRatioAccu", np.mean(train_accu[:, 3])) logz.log_tabular("ValAverageLoss", np.mean(val_loss[:, 0])) logz.log_tabular("ValAveragePredLoss", np.mean(val_loss[:, 1])) logz.log_tabular("ValAverageEmbedLoss", np.mean(val_loss[:, 2])) logz.log_tabular("ValAverageAttnLoss", np.mean(val_loss[:, 3])) logz.log_tabular("ValAverageObjAccu", np.mean(val_accu[:, 0])) logz.log_tabular("ValAverageCoordAccu", np.mean(val_accu[:, 1])) logz.log_tabular("ValAverageScaleAccu", np.mean(val_accu[:, 2])) logz.log_tabular("ValAverageRatioAccu", np.mean(val_accu[:, 3])) logz.dump_tabular() ################################################################## ## Checkpoint ################################################################## if self.cfg.use_hard_mining: if (epoch + 1) % 3 == 0: torch.cuda.empty_cache() t0 = time() self.dump_shape_vectors(train_db) torch.cuda.empty_cache() self.dump_shape_vectors(val_db) print("Dump shape vectors completes (time %.2fs)" % (time() - t0)) torch.cuda.empty_cache() t0 = time() self.train_tables.build_nntables_for_all_categories(False) self.val_tables.build_nntables_for_all_categories(False) print("NN completes (time %.2fs)" % (time() - t0)) self.save_checkpoint(epoch)
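#========================================================================================#
# Hedged sketch (an assumption, not the trainer's actual behavior): the loop above
# computes current_val_loss and initializes min_val_loss but never compares them. A common
# pattern is to track the best validation loss and checkpoint only on improvement, e.g.:
#========================================================================================#
class BestCheckpointTracker:
    """Tracks the lowest validation loss seen so far; purely illustrative."""
    def __init__(self):
        self.best = float('inf')

    def update(self, val_loss):
        """Return True when val_loss improves on the best value seen so far."""
        if val_loss < self.best:
            self.best = val_loss
            return True
        return False

# Hypothetical usage inside the epoch loop above:
#   if tracker.update(current_val_loss):
#       self.save_checkpoint(epoch)   # save_checkpoint already exists in the trainer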
def train_PG( exp_name, env_name, n_iter, gamma, min_timesteps_per_batch, mini_batch_size, max_path_length, learning_rate, num_ppo_updates, num_value_iters, animate, logdir, normalize_advantages, nn_critic, seed, n_layers, size, gru_size, history, num_tasks, l2reg, recurrent, grain_size ): start = time.time() #========================================================================================# # Set Up Logger #========================================================================================# setup_logger(logdir, locals()) #========================================================================================# # Set Up Env #========================================================================================# # Make the gym environment envs = {'pm': PointEnv, 'pm-obs': ObservedPointEnv, } env = envs[env_name](num_tasks, grain_size=grain_size) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) random.seed(seed) env.seed(seed) # Maximum length for episodes max_path_length = max_path_length # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.shape[0] task_dim = len(env._goal) # rude, sorry #========================================================================================# # Initialize Agent #========================================================================================# computation_graph_args = { 'n_layers': n_layers, 'ob_dim': ob_dim, 'ac_dim': ac_dim, 'task_dim': task_dim, 'size': size, 'gru_size': gru_size, 'learning_rate': learning_rate, 'history': history, 'num_value_iters': num_value_iters, 'l2reg': l2reg, 'recurrent': recurrent, } sample_trajectory_args = { 'animate': animate, 'max_path_length': max_path_length, 'min_timesteps_per_batch': min_timesteps_per_batch, 'grain_size': grain_size } estimate_return_args = { 'gamma': gamma, 'nn_critic': nn_critic, 'normalize_advantages': normalize_advantages, } agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args) # build computation graph agent.build_computation_graph() # tensorflow: config, session, variable initialization agent.init_tf_sess() #========================================================================================# # Training Loop #========================================================================================# def unpack_sample(data): ''' unpack a sample from the replay buffer ''' ob = data["observations"] ac = data["actions"] re = data["rewards"] hi = data["hiddens"] ma = 1 - data["terminals"] return ob, ac, re, hi, ma # construct PPO replay buffer, perhaps rude to do outside the agent ppo_buffer = PPOReplayBuffer(agent.replay_buffer) total_timesteps = 0 for itr in range(n_iter): # for PPO: flush the replay buffer! 
ppo_buffer.flush() # sample trajectories to fill agent's replay buffer print("********** Iteration %i ************"%itr) stats = [] for _ in range(num_tasks): s, timesteps_this_batch = agent.sample_trajectories(itr, env, min_timesteps_per_batch) total_timesteps += timesteps_this_batch stats += s # compute the log probs, advantages, and returns for all data in agent's buffer # store in ppo buffer for use in multiple ppo updates # TODO: should move inside the agent probably data = agent.replay_buffer.all_batch() ob_no, ac_na, re_n, hidden, masks = unpack_sample(data) fixed_log_probs = agent.sess.run(agent.sy_lp_n, feed_dict={agent.sy_ob_no: ob_no, agent.sy_hidden: hidden, agent.sy_ac_na: ac_na}) q_n, adv_n = agent.estimate_return(ob_no, re_n, hidden, masks) ppo_buffer.add_samples(fixed_log_probs, adv_n, q_n) # update with mini-batches sampled from ppo buffer for _ in range(num_ppo_updates): data = ppo_buffer.random_batch(mini_batch_size) ob_no, ac_na, re_n, hidden, masks = unpack_sample(data) fixed_log_probs = data["log_probs"] adv_n = data["advantages"] q_n = data["returns"] log_probs = agent.sess.run(agent.sy_lp_n, feed_dict={agent.sy_ob_no: ob_no, agent.sy_hidden: hidden, agent.sy_ac_na: ac_na}) agent.update_parameters(ob_no, hidden, ac_na, fixed_log_probs, q_n, adv_n) # compute validation statistics print('Validating...') val_stats = [] for _ in range(num_tasks): vs, timesteps_this_batch = agent.sample_trajectories(itr, env, min_timesteps_per_batch // 10, is_evaluation=True) val_stats += vs # save trajectories for viz #with open("output/{}-epoch{}.pkl".format(exp_name, itr), 'wb') as f: #pickle.dump(agent.val_replay_buffer.all_batch(), f, pickle.HIGHEST_PROTOCOL) #agent.val_replay_buffer.flush() # Log TRAIN diagnostics returns = [sum(s["rewards"]) for s in stats] final_rewards = [s["rewards"][-1] for s in stats] ep_lengths = [s['ep_len'] for s in stats] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("FinalReward", np.mean(final_rewards)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) # Log VAL diagnostics val_returns = [sum(s["rewards"]) for s in val_stats] val_final_rewards = [s["rewards"][-1] for s in val_stats] logz.log_tabular("ValAverageReturn", np.mean(val_returns)) logz.log_tabular("ValFinalReward", np.mean(val_final_rewards)) logz.dump_tabular() logz.pickle_tf_vars()
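#========================================================================================#
# Illustrative sketch of the PPO clipped surrogate that the multiple-update loop above
# depends on; the real objective lives inside agent.update_parameters, which is not shown
# here, and the clip epsilon value is an assumption.
#========================================================================================#
import numpy as np

def ppo_clip_objective(new_log_probs, fixed_log_probs, advantages, clip_eps=0.2):
    """Mean clipped surrogate E[min(r_t * A_t, clip(r_t, 1-eps, 1+eps) * A_t)]."""
    ratio = np.exp(new_log_probs - fixed_log_probs)            # pi_new / pi_old per sample
    unclipped = ratio * advantages
    clipped = np.clip(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    return np.mean(np.minimum(unclipped, clipped))             # maximize (negate for a loss)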
def learn(env, q_func, optimizer_spec, session, exploration=LinearSchedule(1000000, 0.1), stopping_criterion=None, replay_buffer_size=1000000, batch_size=32, gamma=0.99, learning_starts=50000, learning_freq=4, frame_history_len=4, target_update_freq=10000, grad_norm_clipping=10, double_q_learning=False): """Run Deep Q-learning algorithm. You can specify your own convnet using q_func. All schedules are w.r.t. total number of steps taken in the environment. Parameters ---------- env: gym.Env gym environment to train on. q_func: function Model to use for computing the q function. It should accept the following named arguments: img_in: tf.Tensor tensorflow tensor representing the input image num_actions: int number of actions scope: str scope in which all the model related variables should be created reuse: bool whether previously created variables should be reused. optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer session: tf.Session tensorflow session to use. exploration: rl_algs.deepq.utils.schedules.Schedule schedule for probability of chosing random action. stopping_criterion: (env, t) -> bool should return true when it's ok for the RL algorithm to stop. takes in env and the number of steps executed so far. replay_buffer_size: int How many memories to store in the replay buffer. batch_size: int How many transitions to sample each time experience is replayed. gamma: float Discount Factor learning_starts: int After how many environment steps to start replaying experiences learning_freq: int How many steps of environment to take between every experience replay frame_history_len: int How many past frames to include as input to the model. target_update_freq: int How many experience replay rounds (not steps!) to perform between each update to the target Q network grad_norm_clipping: float or None If not None gradients' norms are clipped to this value. """ assert type(env.observation_space) == gym.spaces.Box assert type(env.action_space) == gym.spaces.Discrete ############### # BUILD MODEL # ############### if len(env.observation_space.shape) == 1: # This means we are running on low-dimensional observations (e.g. RAM) input_shape = env.observation_space.shape else: img_h, img_w, img_c = env.observation_space.shape input_shape = (img_h, img_w, frame_history_len * img_c) num_actions = env.action_space.n # set up placeholders # placeholder for current observation (or state) obs_t_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) # placeholder for current action act_t_ph = tf.placeholder(tf.int32, [None]) # placeholder for current reward rew_t_ph = tf.placeholder(tf.float32, [None]) # placeholder for next observation (or state) obs_tp1_ph = tf.placeholder(tf.uint8, [None] + list(input_shape)) # placeholder for end of episode mask # this value is 1 if the next state corresponds to the end of an episode, # in which case there is no Q-value at the next state; at the end of an # episode, only the current state reward contributes to the target, not the # next state Q-value (i.e. target is just rew_t_ph, not rew_t_ph + gamma * q_tp1) done_mask_ph = tf.placeholder(tf.float32, [None]) # casting to float on GPU ensures lower data transfer times. obs_t_float = tf.cast(obs_t_ph, tf.float32) / 255.0 obs_tp1_float = tf.cast(obs_tp1_ph, tf.float32) / 255.0 # Here, you should fill in your own code to compute the Bellman error. This requires # evaluating the current and next Q-values and constructing the corresponding error. 
# TensorFlow will differentiate this error for you, you just need to pass it to the # optimizer. See assignment text for details. # Your code should produce one scalar-valued tensor: total_error # This will be passed to the optimizer in the provided code below. # Your code should also produce two collections of variables: # q_func_vars # target_q_func_vars # These should hold all of the variables of the Q-function network and target network, # respectively. A convenient way to get these is to make use of TF's "scope" feature. # For example, you can create your Q-function network with the scope "q_func" like this: # <something> = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) # And then you can obtain the variables like this: # q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') # Older versions of TensorFlow may require using "VARIABLES" instead of "GLOBAL_VARIABLES" ###### # YOUR CODE HERE ###### q_func_network = q_func(obs_t_float, num_actions, scope="q_func", reuse=False) target_q_func_network = q_func(obs_tp1_float, num_actions, scope="target_q_func", reuse=False) q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='q_func') target_q_func_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='target_q_func') selected_action_q = tf.reduce_sum(tf.one_hot(act_t_ph, depth=num_actions) * q_func_network, axis=1) if double_q_learning: double_q_func_network = q_func(obs_tp1_float, num_actions, scope="q_func", reuse=True) selected_target_action = tf.one_hot(tf.argmax(double_q_func_network, axis=1), depth=num_actions) target_q_value = tf.reduce_sum(target_q_func_network * selected_target_action, axis=1) y = rew_t_ph + done_mask_ph * gamma * target_q_value else: #done_mask_ph is inverted so 0 if the next state corresponds to the end of an episode y = rew_t_ph + done_mask_ph * gamma * tf.reduce_max( target_q_func_network, axis=1) total_error = tf.nn.l2_loss(selected_action_q - y) # construct optimization op (with gradient clipping) learning_rate = tf.placeholder(tf.float32, (), name="learning_rate") optimizer = optimizer_spec.constructor(learning_rate=learning_rate, **optimizer_spec.kwargs) train_fn = minimize_and_clip(optimizer, total_error, var_list=q_func_vars, clip_val=grad_norm_clipping) # update_target_fn will be called periodically to copy Q network to target Q network update_target_fn = [] for var, var_target in zip( sorted(q_func_vars, key=lambda v: v.name), sorted(target_q_func_vars, key=lambda v: v.name)): update_target_fn.append(var_target.assign(var)) update_target_fn = tf.group(*update_target_fn) # construct the replay buffer replay_buffer = ReplayBuffer(replay_buffer_size, frame_history_len) ############### # RUN ENV # ############### model_initialized = False num_param_updates = 0 mean_episode_reward = -float('nan') best_mean_episode_reward = -float('inf') last_obs = env.reset() LOG_EVERY_N_STEPS = 1000 # Configure output directory for logging # Log experimental parameters args = inspect.getargspec(learn)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} if not (os.path.exists('data')): os.makedirs('data') logdir = '' value_params = {} for key, value in sorted(params.items()): try: float(value) except (ValueError, TypeError): pass else: value_params[key] = value logdir += key + str(value) + '_' logdir = logdir[:-1] iteration = 1 while os.path.exists(os.path.join('data/', (logdir + '/' + str(iteration)))): iteration += 1 logdir = os.path.join('data', logdir) if not 
(os.path.exists(logdir)): os.makedirs(logdir) logdir = os.path.join(logdir, str(iteration)) value_params['exp_name'] = logdir logz.configure_output_dir(logdir) logz.save_params(value_params) available_actions = range(num_actions) try: for t in itertools.count(): ### 1. Check stopping criterion if stopping_criterion is not None and stopping_criterion(env, t): break ### 2. Step the env and store the transition # At this point, "last_obs" contains the latest observation that was # recorded from the simulator. Here, your code needs to store this # observation and its outcome (reward, next observation, etc.) into # the replay buffer while stepping the simulator forward one step. # At the end of this block of code, the simulator should have been # advanced one step, and the replay buffer should contain one more # transition. # Specifically, last_obs must point to the new latest observation. # Useful functions you'll need to call: # obs, reward, done, info = env.step(action) # this steps the environment forward one step # obs = env.reset() # this resets the environment if you reached an episode boundary. # Don't forget to call env.reset() to get a new observation if done # is true!! # Note that you cannot use "last_obs" directly as input # into your network, since it needs to be processed to include context # from previous frames. You should check out the replay buffer # implementation in dqn_utils.py to see what functionality the replay # buffer exposes. The replay buffer has a function called # encode_recent_observation that will take the latest observation # that you pushed into the buffer and compute the corresponding # input that should be given to a Q network by appending some # previous frames. # Don't forget to include epsilon greedy exploration! # And remember that the first time you enter this loop, the model # may not yet have been initialized (but of course, the first step # might as well be random, since you haven't trained your net...) ##### # YOUR CODE HERE ##### buffer_index = replay_buffer.store_frame(last_obs) encoded_obsv = replay_buffer.encode_recent_observation() epsilon = exploration.value(t) if not model_initialized or np.random.rand(1) < epsilon: action = np.random.choice(available_actions) else: action_values = session.run( q_func_network, feed_dict={obs_t_float: encoded_obsv[None]}) action = np.argmax(action_values) obs, reward, done, info = env.step(action) replay_buffer.store_effect(buffer_index, action, reward, done) if (done): obs = env.reset() last_obs = obs # at this point, the environment should have been advanced one step (and # reset if done was true), and last_obs should point to the new latest # observation ### 3. Perform experience replay and train the network. # note that this is only done if the replay buffer contains enough samples # for us to learn something useful -- until then, the model will not be # initialized and random actions should be taken if (t > learning_starts and t % learning_freq == 0 and replay_buffer.can_sample(batch_size)): # Here, you should perform training. Training consists of four steps: # 3.a: use the replay buffer to sample a batch of transitions (see the # replay buffer code for function definition, each batch that you sample # should consist of current observations, current actions, rewards, # next observations, and done indicator). 
# 3.b: initialize the model if it has not been initialized yet; to do # that, call # initialize_interdependent_variables(session, tf.global_variables(), { # obs_t_ph: obs_t_batch, # obs_tp1_ph: obs_tp1_batch, # }) # where obs_t_batch and obs_tp1_batch are the batches of observations at # the current and next time step. The boolean variable model_initialized # indicates whether or not the model has been initialized. # Remember that you have to update the target network too (see 3.d)! # 3.c: train the model. To do this, you'll need to use the train_fn and # total_error ops that were created earlier: total_error is what you # created to compute the total Bellman error in a batch, and train_fn # will actually perform a gradient step and update the network parameters # to reduce total_error. When calling session.run on these you'll need to # populate the following placeholders: # obs_t_ph # act_t_ph # rew_t_ph # obs_tp1_ph # done_mask_ph # (this is needed for computing total_error) # learning_rate -- you can get this from optimizer_spec.lr_schedule.value(t) # (this is needed by the optimizer to choose the learning rate) # 3.d: periodically update the target network by calling # session.run(update_target_fn) # you should update every target_update_freq steps, and you may find the # variable num_param_updates useful for this (it was initialized to 0) ##### # YOUR CODE HERE ##### obs_t_batch, act_batch, rew_batch, obs_tp1_batch, done_mask = replay_buffer.sample( batch_size) inverted_done_mask = 1.0 - done_mask train_feed_dict = { learning_rate: optimizer_spec.lr_schedule.value(t), rew_t_ph: rew_batch, obs_t_ph: obs_t_batch, obs_tp1_ph: obs_tp1_batch, act_t_ph: act_batch, done_mask_ph: inverted_done_mask } if not model_initialized: initialize_interdependent_variables( session, tf.global_variables(), { obs_t_ph: obs_t_batch, obs_tp1_ph: obs_tp1_batch }) model_initialized = True session.run(update_target_fn) _, loss_value, = session.run([train_fn, total_error], feed_dict=train_feed_dict) num_param_updates += 1 if num_param_updates % target_update_freq == 0 and model_initialized: session.run(update_target_fn) ### 4. 
Log progress episode_rewards = get_wrapper_by_name( env, "Monitor").get_episode_rewards() if len(episode_rewards) > 0: mean_episode_reward = np.mean(episode_rewards[-100:]) if len(episode_rewards) > 100: best_mean_episode_reward = max(best_mean_episode_reward, mean_episode_reward) if t % LOG_EVERY_N_STEPS == 0 and model_initialized: #print("Timestep %d" % (t,)) #print("mean reward (100 episodes) %f" % mean_episode_reward) #print("best mean reward %f" % best_mean_episode_reward) #print("episodes %d" % len(episode_rewards)) #print("exploration %f" % exploration.value(t)) #print("learning_rate %f" % optimizer_spec.lr_schedule.value(t)) #sys.stdout.flush() # print(q_value[0]) # print(rew_batch[0]) # print(qtn[0]) logz.log_tabular("Timestep", (t, )[0]) logz.log_tabular("MeanReward(100ep)", mean_episode_reward) logz.log_tabular("BestMeanReward", best_mean_episode_reward) logz.log_tabular("Episodes", len(episode_rewards)) logz.log_tabular("LearningRate", optimizer_spec.lr_schedule.value(t)) logz.log_tabular("Epsilon", exploration.value(t)) logz.log_tabular("Loss", np.sum(loss_value)) #logz.log_tabular("Qval", q_value) logz.log_tabular( "WallTime", time.strftime("%d.%m.%y %H:%M:%S", time.localtime())) logz.dump_tabular() #logz.pickle_tf_vars() except KeyboardInterrupt: print("imp") if os.path.exists("/tmp/hw3_vid_dir2/gym"): shutil.move("/tmp/hw3_vid_dir2/gym", logdir) save_q(os.path.join(logdir, 'Q_network'), session) if os.path.exists("/tmp/hw3_vid_dir2/gym"): shutil.move("/tmp/hw3_vid_dir2/gym", logdir) save_q(os.path.join(logdir, 'Q_network'), session)
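#========================================================================================#
# Illustrative numpy sketch of the Bellman targets constructed in the graph above, for
# both vanilla and double Q-learning. It follows the inverted done-mask convention used in
# the training code (1.0 where the episode continues), and is meant only as a reference
# for sanity-checking the TF ops.
#========================================================================================#
import numpy as np

def q_targets(rew, not_done, q_next_target, gamma, q_next_online=None):
    """rew, not_done: shape (B,); q_next_target / q_next_online: shape (B, num_actions)."""
    if q_next_online is None:
        # vanilla DQN: bootstrap with the max of the target network
        bootstrap = q_next_target.max(axis=1)
    else:
        # double DQN: the online network selects the action, the target network evaluates it
        a_star = q_next_online.argmax(axis=1)
        bootstrap = q_next_target[np.arange(len(a_star)), a_star]
    return rew + not_done * gamma * bootstrap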
def train_PG(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch, max_path_length, learning_rate, reward_to_go, animate, logdir, normalize_advantages, nn_baseline, seed, n_layers, size, step_size): start = time.time() #========================================================================================# # Set Up Logger #========================================================================================# setup_logger(logdir, locals()) #========================================================================================# # Set Up Env #========================================================================================# ## # Make the gym environment env = gym.make(env_name) # Set random seeds tf.set_random_seed(seed) np.random.seed(seed) env.seed(seed) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps # Is this env continuous, or self.discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #========================================================================================# # Initialize Agent #========================================================================================# computation_graph_args = { 'n_layers': n_layers, 'ob_dim': ob_dim, 'ac_dim': ac_dim, 'discrete': discrete, 'size': size, 'learning_rate': learning_rate, } sample_trajectory_args = { 'animate': animate, 'max_path_length': max_path_length, 'min_timesteps_per_batch': min_timesteps_per_batch, } estimate_return_args = { 'gamma': gamma, 'reward_to_go': reward_to_go, 'nn_baseline': nn_baseline, 'normalize_advantages': normalize_advantages, } agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args) # build computation graph agent.build_computation_graph() # tensorflow: config, session, variable initialization agent.init_tf_sess() #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************" % itr) paths, timesteps_this_batch = agent.sample_trajectories(itr, env) total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) re_n = [path["reward"] for path in paths] q_n, adv_n = agent.estimate_return(ob_no, re_n) if step_size == 1: agent.update_parameters(ob_no, ac_na, q_n, adv_n) else: for _ in range(step_size): agent.update_parameters(ob_no, ac_na, q_n, adv_n) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
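#========================================================================================#
# Hedged note: since range(1) yields exactly one iteration, the step_size == 1 special
# case above is redundant. A hypothetical, simplified helper with the same behavior:
#========================================================================================#
def run_policy_updates(agent, ob_no, ac_na, q_n, adv_n, num_steps):
    """Run num_steps policy-gradient updates on the same batch (num_steps >= 1)."""
    for _ in range(max(1, num_steps)):
        agent.update_parameters(ob_no, ac_na, q_n, adv_n)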
def train(self, num_iter): max_reward_ever = -1 start = time.time() for i in range(num_iter): t1 = time.time() self.train_step() for iter_ in range(10): self.update_explorer_net() t2 = time.time() print('total time of one step', t2 - t1) print('iter ', i, ' done') # if i == num_iter-1: # np.savez(self.logdir + "/lin_policy_plus" + str(i), w) # record statistics every 10 iterations if ((i + 1) % 20 == 0): rewards = self.aggregate_rollouts(num_rollouts=30, evaluate=True) print("SHAPE", rewards.shape) if (np.mean(rewards) > max_reward_ever): max_reward_ever = np.mean(rewards) # np.savez(self.logdir + "/lin_policy_plus", w) w = ray.get(self.workers[0].get_weights_plus_stats.remote()) np.savez(self.logdir + "/bi_policy_num_plus" + str(i), w) torch.save( self.policy.net.state_dict(), self.logdir + "/bi_policy_num_plus_torch" + str(i) + ".pt") torch.save(self.policy.safeQ.state_dict(), self.logdir + "/safeQ_torch" + str(i) + ".pt") # np.savez(self.logdir + "/bi_policy_num_plus" + str(i), w) # torch.save(self.policy.net.state_dict(),self.logdir + "/bi_policy_num_plus_torch" + str(i)+ ".pt") print(sorted(self.params.items())) logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", i + 1) logz.log_tabular("BestRewardEver", max_reward_ever) logz.log_tabular("AverageReward", np.mean(rewards)) logz.log_tabular("StdRewards", np.std(rewards)) logz.log_tabular("MaxRewardRollout", np.max(rewards)) logz.log_tabular("MinRewardRollout", np.min(rewards)) logz.log_tabular("timesteps", self.timesteps) logz.dump_tabular() t1 = time.time() # get statistics from all workers for j in range(self.num_workers): self.policy.observation_filter.update( ray.get(self.workers[j].get_filter.remote())) self.policy.observation_filter.stats_increment() # make sure master filter buffer is clear self.policy.observation_filter.clear_buffer() # sync all workers filter_id = ray.put(self.policy.observation_filter) setting_filters_ids = [ worker.sync_filter.remote(filter_id) for worker in self.workers ] # waiting for sync of all workers ray.get(setting_filters_ids) increment_filters_ids = [ worker.stats_increment.remote() for worker in self.workers ] # waiting for increment of all workers ray.get(increment_filters_ids) t2 = time.time() print('Time to sync statistics:', t2 - t1) return
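#========================================================================================#
# Illustrative sketch of the kind of mergeable running-statistics filter that the
# worker-sync code above assumes (update / stats_increment / clear_buffer). This is a
# simplified stand-in written with Welford/Chan-style updates, not the project's actual
# observation-filter implementation.
#========================================================================================#
import numpy as np

class RunningStat:
    """Accumulates count / mean / M2 so per-worker buffers can be merged on the master."""
    def __init__(self, dim):
        self.n = 0
        self.mean = np.zeros(dim)
        self.m2 = np.zeros(dim)

    def push(self, x):
        self.n += 1
        delta = x - self.mean
        self.mean += delta / self.n
        self.m2 += delta * (x - self.mean)

    def merge(self, other):
        """Parallel-variance merge (Chan et al.), combining another worker's statistics."""
        if other.n == 0:
            return
        delta = other.mean - self.mean
        total = self.n + other.n
        self.mean += delta * other.n / total
        self.m2 += other.m2 + delta ** 2 * self.n * other.n / total
        self.n = total

    @property
    def std(self):
        return np.sqrt(self.m2 / max(self.n - 1, 1))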
def main_cartpole(n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, stepsize=1e-2, animate=False, logfile=None): env = gym.make("CartPole-v0") ob_dim = env.observation_space.shape[0] num_actions = env.action_space.n logz.configure_output_file(logfile) #vf = LinearValueFunction() vf = NeuralValueFunction(ob_dim) # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in these function sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32) # batch of observations sy_ac_n = tf.placeholder(shape=[None], name="ac", dtype=tf.int32) # batch of actions taken by the policy, used for policy gradient computation sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32) # advantage function estimate sy_h1 = tf.nn.relu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0))) # hidden layer sy_logits_na = dense(sy_h1, num_actions, "final", weight_init=normc_initializer(0.05)) # "logits", describing probability distribution of final layer # we use a small initialization for the last layer, so the initial policy has maximal entropy sy_oldlogits_na = tf.placeholder(shape=[None, num_actions], name='oldlogits', dtype=tf.float32) # logits BEFORE update (just used for KL diagnostic) sy_logp_na = tf.nn.log_softmax(sy_logits_na) # logprobability of actions sy_sampled_ac = categorical_sample_logits(sy_logits_na)[0] # sampled actions, used for defining the policy (NOT computing the policy gradient) sy_n = tf.shape(sy_ob_no)[0] sy_logprob_n = fancy_slice_2d(sy_logp_na, tf.range(sy_n), sy_ac_n) # log-prob of actions taken -- used for policy gradient calculation # The following quantities are just used for computing KL and entropy, JUST FOR DIAGNOSTIC PURPOSES >>>> sy_oldlogp_na = tf.nn.log_softmax(sy_oldlogits_na) sy_oldp_na = tf.exp(sy_oldlogp_na) sy_kl = tf.reduce_sum(sy_oldp_na * (sy_oldlogp_na - sy_logp_na)) / tf.to_float(sy_n) sy_p_na = tf.exp(sy_logp_na) sy_ent = tf.reduce_sum( - sy_p_na * sy_logp_na) / tf.to_float(sy_n) # <<<<<<<<<<<<< sy_surr = - tf.reduce_mean(sy_adv_n * sy_logprob_n) # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss") sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32) # Symbolic, in case you want to change the stepsize during optimization. 
(We're not doing that currently) update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr) sess = tf.Session() sess.__enter__() sess.run(tf.global_variables_initializer()) total_timesteps = 0 obs_mean = np.zeros(ob_dim) obs_std = np.zeros(ob_dim) for i in range(n_iter): print("********** Iteration %i ************"%i) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() terminated = False obs, acs, rewards = [], [], [] animate_this_episode=(len(paths)==0 and (i % 10 == 0) and animate) while True: if animate_this_episode: env.render() obs.append(ob) ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no : ob[None]}) acs.append(ac) ob, rew, done, _ = env.step(ac) rewards.append(rew) if done: break path = {"observation" : np.array(obs), "terminated" : terminated, "reward" : np.array(rewards), "action" : np.array(acs)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch # Estimate advantage function vtargs, vpreds, advs = [], [], [] for path in paths: rew_t = path["reward"] return_t = discount(rew_t, gamma) vpred_t = vf.predict((path["observation"]-obs_mean)/(obs_std+1e-8)) adv_t = return_t - vpred_t advs.append(adv_t) vtargs.append(return_t) vpreds.append(vpred_t) # Build arrays for policy update ob_no = np.concatenate([path["observation"] for path in paths]) ac_n = np.concatenate([path["action"] for path in paths]) adv_n = np.concatenate(advs) standardized_adv_n = (adv_n-adv_n.mean())/(adv_n.std()+1e-8) vtarg_n = np.concatenate(vtargs) vpred_n = np.concatenate(vpreds) obs_mean = np.average(ob_no,axis=0) obs_std = np.std(ob_no,axis=0) vf.fit((ob_no-obs_mean)/(obs_std+1e-8), vtarg_n) # Policy update _, oldlogits_na = sess.run([update_op, sy_logits_na], feed_dict={sy_ob_no:ob_no, sy_ac_n:ac_n, sy_adv_n:standardized_adv_n, sy_stepsize:stepsize}) kl, ent = sess.run([sy_kl, sy_ent], feed_dict={sy_ob_no:ob_no, sy_oldlogits_na:oldlogits_na}) # Log diagnostics logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths])) logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths])) logz.log_tabular("KLOldNew", kl) logz.log_tabular("Entropy", ent) logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n)) logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n)) logz.log_tabular("TimestepsSoFar", total_timesteps) # If you're overfitting, EVAfter will be way larger than EVBefore. # Note that we fit value function AFTER using it to compute the advantage function to avoid introducing bias logz.dump_tabular()
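#========================================================================================#
# Illustrative numpy sketches of the two small helpers the loop above relies on. These
# mirror the usual homework utilities (discounted reward-to-go via a reverse scan, and
# explained variance = 1 - Var[y - ypred] / Var[y]); the real implementations may differ
# in edge-case handling.
#========================================================================================#
import numpy as np

def discount(rewards, gamma):
    """Discounted reward-to-go for a single trajectory."""
    out = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        out[t] = running
    return out

def explained_variance_1d(ypred, y):
    """Close to 1 when the value function explains most of the variance in the returns."""
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary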
def train_AC( exp_name, env_name, n_iter, gamma, min_timesteps_per_batch, max_path_length, learning_rate, num_target_updates, num_grad_steps_per_target_update, animate, logdir, normalize_advantages, seed, n_layers, size, ######################################################################## # Exploration args bonus_coeff, kl_weight, density_lr, density_train_iters, density_batch_size, density_hiddim, dm, replay_size, sigma, ######################################################################## ): start = time.time() #========================================================================================# # Set Up Logger #========================================================================================# setup_logger(logdir, locals()) #========================================================================================# # Set Up Env #========================================================================================# # Make the gym environment ######################################################################## # Exploration if env_name == 'PointMass-v0': from pointmass import PointMass env = PointMass() else: env = gym.make(env_name) dirname = logz.G.output_dir ######################################################################## # Set random seeds # [Mehran Shakeriava] change begin import random random.seed(seed, version=2) # tf.set_random_seed(seed) # np.random.seed(seed) # env.seed(seed) tf.set_random_seed(random.randint(0, 2**32 - 1)) np.random.seed(random.randint(0, 2**32 - 1)) env.seed(random.randint(0, 2**32 - 1)) # [Mehran Shakeriava] change end # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps # Is this env continuous or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #========================================================================================# # Initialize Agent #========================================================================================# computation_graph_args = { 'n_layers': n_layers, 'ob_dim': ob_dim, 'ac_dim': ac_dim, 'discrete': discrete, 'size': size, 'learning_rate': learning_rate, 'num_target_updates': num_target_updates, 'num_grad_steps_per_target_update': num_grad_steps_per_target_update, } sample_trajectory_args = { 'animate': animate, 'max_path_length': max_path_length, 'min_timesteps_per_batch': min_timesteps_per_batch, } estimate_advantage_args = { 'gamma': gamma, 'normalize_advantages': normalize_advantages, } agent = Agent(computation_graph_args, sample_trajectory_args, estimate_advantage_args) #estimate_return_args # build computation graph agent.build_computation_graph() ######################################################################## # Initalize exploration density model if dm != 'none': if env_name == 'PointMass-v0' and dm == 'hist': density_model = Histogram( nbins=env.grid_size, preprocessor=env.preprocess) exploration = DiscreteExploration( density_model=density_model, bonus_coeff=bonus_coeff) elif dm == 'rbf': density_model = RBF(sigma=sigma) exploration = RBFExploration( density_model=density_model, bonus_coeff=bonus_coeff, replay_size=int(replay_size)) elif dm == 'ex2': density_model = Exemplar( ob_dim=ob_dim, hid_dim=density_hiddim, learning_rate=density_lr, kl_weight=kl_weight) exploration = ExemplarExploration( density_model=density_model, bonus_coeff=bonus_coeff, 
train_iters=density_train_iters, bsize=density_batch_size, replay_size=int(replay_size)) exploration.density_model.build_computation_graph() else: raise NotImplementedError ######################################################################## # tensorflow: config, session, variable initialization agent.init_tf_sess() ######################################################################## if dm != 'none': exploration.receive_tf_sess(agent.sess) ######################################################################## #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) paths, timesteps_this_batch = agent.sample_trajectories(itr, env) total_timesteps += timesteps_this_batch # Build arrays for observation, action for the policy gradient update by concatenating # across paths ob_no = np.concatenate([path["observation"] for path in paths]) ac_na = np.concatenate([path["action"] for path in paths]) re_n = np.concatenate([path["reward"] for path in paths]) next_ob_no = np.concatenate([path["next_observation"] for path in paths]) terminal_n = np.concatenate([path["terminal"] for path in paths]) ######################################################################## # Modify the reward to include exploration bonus """ 1. Fit density model if dm == 'ex2': the call to exploration.fit_density_model should return ll, kl, elbo else: the call to exploration.fit_density_model should return nothing 2. Modify the re_n with the reward bonus by calling exploration.modify_reward """ old_re_n = re_n if dm == 'none': pass else: # 1. Fit density model if dm == 'ex2': ### PROBLEM 3 ### YOUR CODE HERE ll, kl, elbo = exploration.fit_density_model(next_ob_no) elif dm == 'hist' or dm == 'rbf': ### PROBLEM 1 ### YOUR CODE HERE ### exploration.fit_density_model(next_ob_no) ###################### else: assert False # 2. Modify the reward ### PROBLEM 1 ### YOUR CODE HERE ### re_n = exploration.modify_reward(re_n, next_ob_no) ###################### print('average state', np.mean(ob_no, axis=0)) print('average action', np.mean(ac_na, axis=0)) # Logging stuff. # Only works for point mass. 
if env_name == 'PointMass-v0': np.save(os.path.join(dirname, '{}'.format(itr)), ob_no) ######################################################################## agent.update_critic(ob_no, next_ob_no, re_n, terminal_n) adv_n = agent.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n) agent.update_actor(ob_no, ac_na, adv_n) if n_iter - itr < 10: max_reward_path_idx = np.argmax(np.array([path["reward"].sum() for path in paths])) print(paths[max_reward_path_idx]['reward']) # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) ######################################################################## logz.log_tabular("Unmodified Rewards Mean", np.mean(old_re_n)) logz.log_tabular("Unmodified Rewards Std", np.std(old_re_n)) logz.log_tabular("Modified Rewards Mean", np.mean(re_n)) logz.log_tabular("Modified Rewards Std", np.std(re_n)) if dm == 'ex2': logz.log_tabular("Log Likelihood Mean", np.mean(ll)) logz.log_tabular("Log Likelihood Std", np.std(ll)) logz.log_tabular("KL Divergence Mean", np.mean(kl)) logz.log_tabular("KL Divergence Std", np.std(kl)) logz.log_tabular("Negative ELBO", -elbo) ######################################################################## logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
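#========================================================================================#
# Illustrative sketch of a count-based bonus of the kind exploration.modify_reward applies
# above (the histogram case). The 1/sqrt(N(s)) shape and the rounding preprocessor are
# assumptions for illustration; the actual Histogram / RBF / Exemplar density models live
# in their own modules.
#========================================================================================#
import numpy as np
from collections import defaultdict

class HistogramBonus:
    def __init__(self, bonus_coeff, preprocessor=lambda ob: tuple(np.round(ob, 1))):
        self.counts = defaultdict(int)
        self.bonus_coeff = bonus_coeff
        self.preprocess = preprocessor        # maps an observation to a discrete bin

    def fit_density_model(self, next_ob_no):
        for ob in next_ob_no:
            self.counts[self.preprocess(ob)] += 1

    def modify_reward(self, re_n, next_ob_no):
        counts = np.array([self.counts[self.preprocess(ob)] for ob in next_ob_no])
        return re_n + self.bonus_coeff / np.sqrt(np.maximum(counts, 1))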
def train_PG(exp_name='', env_name='CartPole-v0', n_iter=100, gamma=1.0, min_timesteps_per_batch=1000, max_path_length=None, learning_rate=5e-3, reward_to_go=True, animate=True, logdir=None, normalize_advantages=True, nn_baseline=False, seed=0, # network arguments n_layers=1, size=32, network_activation='tanh' ): start = time.time() # Configure output directory for logging logz.configure_output_dir(logdir) # Log experimental parameters args = inspect.getargspec(train_PG)[0] locals_ = locals() params = {k: locals_[k] if k in locals_ else None for k in args} logz.save_params(params) # Set random seeds torch.manual_seed(seed) np.random.seed(seed) # Make the gym environment env = gym.make(env_name) # Is this env continuous, or discrete? discrete = isinstance(env.action_space, gym.spaces.Discrete) # Maximum length for episodes max_path_length = max_path_length or env.spec.max_episode_steps #========================================================================================# # Notes on notation: # # Symbolic variables have the prefix sy_, to distinguish them from the numerical values # that are computed later in the function # # Prefixes and suffixes: # ob - observation # ac - action # _no - this tensor should have shape (batch size /n/, observation dim) # _na - this tensor should have shape (batch size /n/, action dim) # _n - this tensor should have shape (batch size /n/) # # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis # is None #========================================================================================# # Observation and action sizes ob_dim = env.observation_space.shape[0] ac_dim = env.action_space.n if discrete else env.action_space.shape[0] #activation function for the network if network_activation=='relu': activation=torch.nn.functional.relu elif network_activation=='leaky_relu': activation=torch.nn.functional.leaky_relu else: activation=torch.nn.functional.tanh #todo: create policy actor=build_mlp(ob_dim, ac_dim, "actor",\ n_layers=n_layers, size=size, activation=activation, discrete=discrete) actor_loss=reinforce_loss actor_optimizer=torch.optim.Adam(actor.parameters(), lr=learning_rate) #todo: initilize Agent: #========================================================================================# # ----------SECTION 5---------- # Optional Baseline #========================================================================================# if nn_baseline: critic=build_mlp(ob_dim,1,"nn_baseline",\ n_layers=n_layers,size=size, discrete=discrete) critic_loss=nn.MSELoss() critic_optimizer=torch.optim.Adam(critic.parameters(), lr=learning_rate) #========================================================================================# # Training Loop #========================================================================================# total_timesteps = 0 for itr in range(n_iter): print("********** Iteration %i ************"%itr) # Collect paths until we have enough timesteps timesteps_this_batch = 0 paths = [] while True: ob = env.reset() obs, acs, rewards, log_probs = [], [], [], [] animate_this_episode=(len(paths)==0 and (itr % 10 == 0) and animate) steps = 0 while True: if animate_this_episode: env.render() time.sleep(0.05) ob = torch.from_numpy(ob).float().unsqueeze(0) obs.append(ob) ac, log_prob = actor.run(ob) acs.append(ac) log_probs.append(log_prob) #format the action from policy if discrete: ac = int(ac) else: ac = ac.squeeze(0).numpy() ob, rew, done, _ = env.step(ac) rewards.append(rew) steps += 1 if done or steps > 
max_path_length: break path = {"observation" : torch.cat(obs, 0), "reward" : torch.Tensor(rewards), "action" : torch.cat(acs, 0), "log_prob" : torch.cat(log_probs, 0)} paths.append(path) timesteps_this_batch += pathlength(path) if timesteps_this_batch > min_timesteps_per_batch: break total_timesteps += timesteps_this_batch ob_no = torch.cat([path["observation"] for path in paths], 0) ac_na = torch.cat([path["action"] for path in paths], 0) #====================================================================================# # ----------SECTION 4---------- # Computing Q-values # # Your code should construct numpy arrays for Q-values which will be used to compute # advantages (which will in turn be fed to the placeholder you defined above). # # Recall that the expression for the policy gradient PG is # # PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t )] # # where # # tau=(s_0, a_0, ...) is a trajectory, # Q_t is the Q-value at time t, Q^{pi}(s_t, a_t), # and b_t is a baseline which may depend on s_t. # # You will write code for two cases, controlled by the flag 'reward_to_go': # # Case 1: trajectory-based PG # # (reward_to_go = False) # # Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over # entire trajectory (regardless of which time step the Q-value should be for). # # For this case, the policy gradient estimator is # # E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)] # # where # # Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}. # # Thus, you should compute # # Q_t = Ret(tau) # # Case 2: reward-to-go PG # # (reward_to_go = True) # # Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting # from time step t. Thus, you should compute # # Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'} # # # Store the Q-values for all timesteps and all trajectories in a variable 'q_n', # like the 'ob_no' and 'ac_na' above. # #====================================================================================# q_n = [] for path in paths: rewards = path['reward'] num_steps = pathlength(path) R=[] if reward_to_go: for t in range(num_steps): R.append((torch.pow(gamma, torch.arange(num_steps-t))*rewards[t:]).sum().view(-1,1)) q_n.append(torch.cat(R)) else: q_n.append((torch.pow(gamma, torch.arange(num_steps)) * rewards).sum() * torch.ones(num_steps, 1)) q_n = torch.cat(q_n, 0) #====================================================================================# # ----------SECTION 5---------- # Computing Baselines #====================================================================================# if nn_baseline: # If nn_baseline is True, use your neural network to predict reward-to-go # at each timestep for each trajectory, and save the result in a variable 'b_n' # like 'ob_no', 'ac_na', and 'q_n'. # # Hint #bl1: rescale the output from the nn_baseline to match the statistics # (mean and std) of the current or previous batch of Q-values. (Goes with Hint # #bl2 below.) b_n = critic(ob_no) q_n_std = q_n.std() q_n_mean = q_n.mean() b_n_scaled = b_n * q_n_std + q_n_mean adv_n = (q_n - b_n_scaled).detach() else: adv_n = q_n #====================================================================================# # ----------SECTION 4---------- # Advantage Normalization #====================================================================================# if normalize_advantages: # On the next line, implement a trick which is known empirically to reduce variance # in policy gradient methods: normalize adv_n to have mean zero and std=1. 
# YOUR_CODE_HERE adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + np.finfo(np.float32).eps.item()) #====================================================================================# # ----------SECTION 5---------- # Optimizing Neural Network Baseline #====================================================================================# if nn_baseline: # ----------SECTION 5---------- # If a neural network baseline is used, set up the targets and the inputs for the # baseline. # # Fit it to the current batch in order to use for the next iteration. Use the # baseline_update_op you defined earlier. # # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the # targets to have mean zero and std=1. (Goes with Hint #bl1 above.) # YOUR_CODE_HERE target = (q_n - q_n_mean) / (q_n_std + np.finfo(np.float32).eps.item()) critic_optimizer.zero_grad() c_loss = critic_loss(b_n, target) c_loss.backward() critic_optimizer.step() #====================================================================================# # ----------SECTION 4---------- # Performing the Policy Update #====================================================================================# # Call the update operation necessary to perform the policy gradient update based on # the current batch of rollouts. # # For debug purposes, you may wish to save the value of the loss function before # and after an update, and then log them below. # YOUR_CODE_HERE log_probs = torch.cat([path["log_prob"] for path in paths], 0) actor_optimizer.zero_grad() loss = actor_loss(log_probs, adv_n, len(paths)) print(loss) loss.backward() actor_optimizer.step() # Log diagnostics returns = [path["reward"].sum() for path in paths] ep_lengths = [pathlength(path) for path in paths] logz.log_tabular("Time", time.time() - start) logz.log_tabular("Iteration", itr) logz.log_tabular("AverageReturn", np.mean(returns)) logz.log_tabular("StdReturn", np.std(returns)) logz.log_tabular("MaxReturn", np.max(returns)) logz.log_tabular("MinReturn", np.min(returns)) logz.log_tabular("EpLenMean", np.mean(ep_lengths)) logz.log_tabular("EpLenStd", np.std(ep_lengths)) logz.log_tabular("TimestepsThisBatch", timesteps_this_batch) logz.log_tabular("TimestepsSoFar", total_timesteps) logz.dump_tabular() logz.pickle_tf_vars()
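#========================================================================================#
# Hedged sketch of a reinforce_loss consistent with how it is called above
# (log_probs, advantages, number of paths). Averaging over trajectories rather than over
# timesteps is one reasonable choice; the actual helper may normalize differently.
#========================================================================================#
import torch

def reinforce_loss(log_probs, adv_n, num_paths):
    """-(1/num_paths) * sum over the batch of log pi(a_t|s_t) * A_t."""
    return -(log_probs.view(-1) * adv_n.view(-1).detach()).sum() / num_paths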
def main_pendulum(logdir, seed, n_iter, gamma, min_timesteps_per_batch, initial_stepsize,
                  desired_kl, vf_type, vf_params, animate=False):
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env = gym.make("Pendulum-v0")
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    logz.configure_output_dir(logdir)

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)  # batch of observations
    sy_h1 = lrelu(dense(sy_ob_no, 32, "h1", weight_init=normc_initializer(1.0)))  # hidden layer
    sy_h2 = lrelu(dense(sy_h1, 16, "h2", weight_init=normc_initializer(1.0)))     # hidden layer
    # Gaussian distribution (mean, stdev) for each action dimension for the batch.
    sy_mean_na = dense(sy_h2, ac_dim, "mean", weight_init=normc_initializer(0.05))
    # Use the same stdev for all inputs.
    sy_logstd_a = tf.get_variable("logstdev", [ac_dim], initializer=tf.zeros_initializer())  # log standard deviation
    # Batch of actions taken by the policy, used for the policy gradient computation.
    sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32)

    # Compute the logprob for each action taken.
    action_dist = tf.contrib.distributions.Normal(loc=sy_mean_na, scale=tf.exp(sy_logstd_a), validate_args=True)
    # sy_logprob_n has shape [batch_size, ac_dim].
    sy_logprob_n = action_dist.log_prob(sy_ac_na)
    # Sample an action given the input observation. This should be a 1-D vector with ac_dim floats in it.
    sy_sampled_ac = action_dist.sample()[0]

    # Old mean/stdev from before the policy update; used only to compute the KL divergence.
    sy_oldmean_na = tf.placeholder(shape=[None, ac_dim], name='oldmean', dtype=tf.float32)
    sy_oldlogstd_a = tf.placeholder(shape=[ac_dim], name='oldlogstdev', dtype=tf.float32)
    old_action_dist = tf.contrib.distributions.Normal(loc=sy_oldmean_na, scale=tf.exp(sy_oldlogstd_a), validate_args=True)
    sy_kl = tf.reduce_mean(tf.contrib.distributions.kl_divergence(action_dist, old_action_dist))
    # Compute entropy
    sy_ent = tf.reduce_mean(action_dist.entropy())

    sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)  # advantage function estimate

    # We take tf.reduce_mean over sy_logprob_n's action axis, since its shape is
    # [batch_size, ac_dim]. It is not obvious what the best way to handle ac_dim > 1 is,
    # but Pendulum's ac_dim is 1, so reduce_mean is fine here.
    sy_surr = - tf.reduce_mean(sy_adv_n * tf.reduce_mean(sy_logprob_n, 1))  # Loss function that we'll differentiate to get the policy gradient ("surr" is for "surrogate loss")

    sy_stepsize = tf.placeholder(shape=[], dtype=tf.float32)
    update_op = tf.train.AdamOptimizer(sy_stepsize).minimize(sy_surr)

    sess = tf.Session()
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  # pylint: disable=E1101

    if vf_type == 'linear':
        vf = LinearValueFunction(**vf_params)
    elif vf_type == 'nn':
        vf = NnValueFunction(ob_dim=ob_dim, session=sess, **vf_params)

    initial_ob = env.reset()  # not used below
    total_timesteps = 0
    stepsize = initial_stepsize

    for i in range(n_iter):
        print("********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            terminated = False
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (i % 10 == 0) and animate)
            while True:
                if animate_this_episode:
                    env.render()
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                if done:
                    break
            path = {"observation": np.array(obs), "terminated": terminated,
                    "reward": np.array(rewards), "action": np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Estimate advantage function
        vtargs, vpreds, advs = [], [], []
        for path in paths:
            rew_t = path["reward"]
            return_t = discount(rew_t, gamma)
            vpred_t = vf.predict(path["observation"])
            adv_t = return_t - vpred_t
            advs.append(adv_t)
            vtargs.append(return_t)
            vpreds.append(vpred_t)

        # Build arrays for policy update
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        adv_n = np.concatenate(advs)
        standardized_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
        vtarg_n = np.concatenate(vtargs)
        vpred_n = np.concatenate(vpreds)
        vf.fit(ob_no, vtarg_n)

        # Policy update
        _, oldmean_na, oldlogstd_a = sess.run(
            [update_op, sy_mean_na, sy_logstd_a],
            feed_dict={sy_ob_no: ob_no, sy_ac_na: ac_na,
                       sy_adv_n: standardized_adv_n, sy_stepsize: stepsize})
        kl, ent = sess.run(
            [sy_kl, sy_ent],
            feed_dict={sy_ob_no: ob_no, sy_oldmean_na: oldmean_na, sy_oldlogstd_a: oldlogstd_a})

        if kl > desired_kl * 2:
            stepsize /= 1.5
            print('stepsize -> %s' % stepsize)
        elif kl < desired_kl / 2:
            stepsize *= 1.5
            print('stepsize -> %s' % stepsize)
        else:
            print('stepsize OK')

        # Log diagnostics
        logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths]))
        logz.log_tabular("EpLenMean", np.mean([pathlength(path) for path in paths]))
        logz.log_tabular("KLOldNew", kl)
        logz.log_tabular("Entropy", ent)
        logz.log_tabular("EVBefore", explained_variance_1d(vpred_n, vtarg_n))
        logz.log_tabular("EVAfter", explained_variance_1d(vf.predict(ob_no), vtarg_n))
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        # If you're overfitting, EVAfter will be way larger than EVBefore.
        # Note that we fit the value function AFTER using it to compute the advantage function,
        # to avoid introducing bias.
        logz.dump_tabular()
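# Note: the `discount` helper used above is not shown in this excerpt. Assuming it computes the
# discounted sum of rewards from each timestep onward (consistent with `return_t - vpred_t`
# being used as the advantage), a common implementation uses scipy's linear filter:

import numpy as np
import scipy.signal

def discount(x, gamma):
    # Discounted cumulative sum, computed right-to-left: y[t] = x[t] + gamma * y[t+1].
    # e.g. discount(np.array([1., 1., 1.]), 0.5) -> [1.75, 1.5, 1.0]
    return scipy.signal.lfilter([1], [1, -gamma], x[::-1], axis=0)[::-1]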
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100,
             gamma=1.0,
             min_timesteps_per_batch=1000,
             max_path_length=None,
             learning_rate=5e-3,
             reward_to_go=True,
             animate=True,
             logdir=None,
             normalize_advantages=True,
             nn_baseline=False,
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    torch.manual_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # TODO: create Agent
    # TODO: initialize Agent

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    # (this block appears to be a leftover from the TensorFlow version; the PyTorch code
    # below does not use `sess`)
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = actor.run(ob)
                print("need to type-check action here: (next two lines)")
                print(ac)
                print(ac.size())
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break

            # One episode finishes; perform the update here
            finish_episode(actor, actor_optimizer, critic=None, critic_optimizer=None)

            path = {"observation": np.array(obs),
                    "reward": np.array(rewards),
                    "action": np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
def train_PG(exp_name='',
             env_name='CartPole-v0',
             n_iter=100,
             gamma=1.0,
             min_timesteps_per_batch=1000,
             max_path_length=None,
             learning_rate=5e-3,
             reward_to_go=True,
             animate=True,
             logdir=None,
             normalize_advantages=True,
             nn_baseline=False,
             seed=0,
             # network arguments
             n_layers=1,
             size=32
             ):

    start = time.time()

    # Configure output directory for logging
    logz.configure_output_dir(logdir)

    # Log experimental parameters
    args = inspect.getargspec(train_PG)[0]
    locals_ = locals()
    params = {k: locals_[k] if k in locals_ else None for k in args}
    logz.save_params(params)

    # Set random seeds
    tf.set_random_seed(seed)
    np.random.seed(seed)

    # Make the gym environment
    env = gym.make(env_name)

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Maximum length for episodes
    max_path_length = max_path_length or env.spec.max_episode_steps

    #========================================================================================#
    # Notes on notation:
    #
    # Symbolic variables have the prefix sy_, to distinguish them from the numerical values
    # that are computed later in the function
    #
    # Prefixes and suffixes:
    # ob - observation
    # ac - action
    # _no - this tensor should have shape (batch size /n/, observation dim)
    # _na - this tensor should have shape (batch size /n/, action dim)
    # _n  - this tensor should have shape (batch size /n/)
    #
    # Note: batch size /n/ is defined at runtime, and until then, the shape for that axis
    # is None
    #========================================================================================#

    # Observation and action sizes
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Placeholders
    #
    # Need these for batch observations / actions / advantages in policy gradient loss function.
    #========================================================================================#

    sy_ob_no = tf.placeholder(shape=[None, ob_dim], name="ob", dtype=tf.float32)
    if discrete:
        sy_ac_na = tf.placeholder(shape=[None], name="ac", dtype=tf.int32)
    else:
        sy_ac_na = tf.placeholder(shape=[None, ac_dim], name="ac", dtype=tf.float32)

    # Define a placeholder for advantages
    sy_adv_n = TODO

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Networks
    #
    # Make symbolic operations for
    #   1. Policy network outputs which describe the policy distribution.
    #       a. For the discrete case, just logits for each action.
    #       b. For the continuous case, the mean / log std of a Gaussian distribution over
    #          actions.
    #
    #      Hint: use the 'build_mlp' function you defined in utilities.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ob_no'
    #
    #   2. Producing samples stochastically from the policy distribution.
    #       a. For the discrete case, an op that takes in logits and produces actions.
    #          Should have shape [None]
    #       b. For the continuous case, use the reparameterization trick:
    #          The output from a Gaussian distribution with mean 'mu' and std 'sigma' is
    #
    #               mu + sigma * z,   z ~ N(0, I)
    #
    #          This reduces the problem to just sampling z. (Hint: use tf.random_normal!)
    #          Should have shape [None, ac_dim]
    #
    #      Note: these ops should be functions of the policy network output ops.
    #
    #   3. Computing the log probability of a set of actions that were actually taken,
    #      according to the policy.
    #
    #      Note: these ops should be functions of the placeholder 'sy_ac_na', and the
    #      policy network output ops.
    #========================================================================================#

    if discrete:
        # YOUR_CODE_HERE
        sy_logits_na = TODO
        sy_sampled_ac = TODO  # Hint: Use the tf.multinomial op
        sy_logprob_n = TODO
    else:
        # YOUR_CODE_HERE
        sy_mean = TODO
        sy_logstd = TODO  # logstd should just be a trainable variable, not a network output.
        sy_sampled_ac = TODO
        sy_logprob_n = TODO  # Hint: Use the log probability under a multivariate gaussian.

    #========================================================================================#
    #                           ----------SECTION 4----------
    # Loss Function and Training Operation
    #========================================================================================#

    loss = TODO  # Loss function that we'll differentiate to get the policy gradient.
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)

    #========================================================================================#
    #                           ----------SECTION 5----------
    # Optional Baseline
    #========================================================================================#

    if nn_baseline:
        baseline_prediction = tf.squeeze(build_mlp(
            sy_ob_no,
            1,
            "nn_baseline",
            n_layers=n_layers,
            size=size))
        # Define placeholders for targets, a loss function and an update op for fitting a
        # neural network baseline. These will be used to fit the neural network baseline.
        # YOUR_CODE_HERE
        baseline_update_op = TODO

    #========================================================================================#
    # Tensorflow Engineering: Config, Session, Variable initialization
    #========================================================================================#

    tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
    sess = tf.Session(config=tf_config)
    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  # pylint: disable=E1101

    #========================================================================================#
    # Training Loop
    #========================================================================================#

    total_timesteps = 0

    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)

        # Collect paths until we have enough timesteps
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (itr % 10 == 0) and animate)
            steps = 0
            while True:
                if animate_this_episode:
                    env.render()
                    time.sleep(0.05)
                obs.append(ob)
                ac = sess.run(sy_sampled_ac, feed_dict={sy_ob_no: ob[None]})
                ac = ac[0]
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                steps += 1
                if done or steps > max_path_length:
                    break
            path = {"observation": np.array(obs),
                    "reward": np.array(rewards),
                    "action": np.array(acs)}
            paths.append(path)
            timesteps_this_batch += pathlength(path)
            if timesteps_this_batch > min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Build arrays for observation, action for the policy gradient update by concatenating
        # across paths
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Computing Q-values
        #
        # Your code should construct numpy arrays for Q-values which will be used to compute
        # advantages (which will in turn be fed to the placeholder you defined above).
        #
        # Recall that the expression for the policy gradient PG is
        #
        #       PG = E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * (Q_t - b_t)]
        #
        # where
        #
        #       tau=(s_0, a_0, ...) is a trajectory,
        #       Q_t is the Q-value at time t, Q^{pi}(s_t, a_t),
        #       and b_t is a baseline which may depend on s_t.
        #
        # You will write code for two cases, controlled by the flag 'reward_to_go':
        #
        #   Case 1: trajectory-based PG (reward_to_go = False)
        #
        #       Instead of Q^{pi}(s_t, a_t), we use the total discounted reward summed over
        #       the entire trajectory (regardless of which time step the Q-value should be
        #       for). For this case, the policy gradient estimator is
        #
        #           E_{tau} [sum_{t=0}^T grad log pi(a_t|s_t) * Ret(tau)]
        #
        #       where Ret(tau) = sum_{t'=0}^T gamma^t' r_{t'}.
        #       Thus, you should compute Q_t = Ret(tau).
        #
        #   Case 2: reward-to-go PG (reward_to_go = True)
        #
        #       Here, you estimate Q^{pi}(s_t, a_t) by the discounted sum of rewards starting
        #       from time step t. Thus, you should compute
        #
        #           Q_t = sum_{t'=t}^T gamma^(t'-t) * r_{t'}
        #
        # Store the Q-values for all timesteps and all trajectories in a variable 'q_n',
        # like the 'ob_no' and 'ac_na' above.
        #====================================================================================#

        # YOUR_CODE_HERE
        q_n = TODO

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Computing Baselines
        #====================================================================================#

        if nn_baseline:
            # If nn_baseline is True, use your neural network to predict reward-to-go at each
            # timestep for each trajectory, and save the result in a variable 'b_n' like
            # 'ob_no', 'ac_na', and 'q_n'.
            #
            # Hint #bl1: rescale the output from the nn_baseline to match the statistics
            # (mean and std) of the current or previous batch of Q-values. (Goes with Hint
            # #bl2 below.)
            b_n = TODO
            adv_n = q_n - b_n
        else:
            adv_n = q_n.copy()

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Advantage Normalization
        #====================================================================================#

        if normalize_advantages:
            # On the next line, implement a trick which is known empirically to reduce variance
            # in policy gradient methods: normalize adv_n to have mean zero and std=1.
            # YOUR_CODE_HERE
            pass

        #====================================================================================#
        #                           ----------SECTION 5----------
        # Optimizing Neural Network Baseline
        #====================================================================================#

        if nn_baseline:
            # If a neural network baseline is used, set up the targets and the inputs for the
            # baseline.
            #
            # Fit it to the current batch in order to use for the next iteration. Use the
            # baseline_update_op you defined earlier.
            #
            # Hint #bl2: Instead of trying to target raw Q-values directly, rescale the
            # targets to have mean zero and std=1. (Goes with Hint #bl1 above.)
            # YOUR_CODE_HERE
            pass

        #====================================================================================#
        #                           ----------SECTION 4----------
        # Performing the Policy Update
        #====================================================================================#

        # Call the update operation necessary to perform the policy gradient update based on
        # the current batch of rollouts.
        #
        # For debug purposes, you may wish to save the value of the loss function before
        # and after an update, and then log them below.
        # YOUR_CODE_HERE

        # Log diagnostics
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
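# Note: the TODOs in the skeleton above are left for the reader. The helpers below are a
# hypothetical sketch of how the discrete-action policy ops and the Q-value computation are
# commonly filled in; they follow the skeleton's naming conventions and assume the `build_mlp`
# helper referenced in its hints, but they are not the file's actual solution.

import numpy as np
import tensorflow as tf

def build_discrete_policy_ops(sy_ob_no, sy_ac_na, sy_adv_n, ac_dim, n_layers, size, learning_rate):
    # Logits over actions as a function of the observation placeholder.
    sy_logits_na = build_mlp(sy_ob_no, ac_dim, "policy", n_layers=n_layers, size=size)
    # Sample one action per row of logits; tf.multinomial returns shape [batch, 1].
    sy_sampled_ac = tf.squeeze(tf.multinomial(sy_logits_na, 1), axis=1)
    # Log-probability of the actions actually taken.
    sy_logprob_n = -tf.nn.sparse_softmax_cross_entropy_with_logits(labels=sy_ac_na,
                                                                   logits=sy_logits_na)
    # Surrogate loss whose gradient is the policy gradient estimator.
    loss = -tf.reduce_mean(sy_logprob_n * sy_adv_n)
    update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss)
    return sy_sampled_ac, loss, update_op

def compute_q_values(paths, gamma, reward_to_go):
    # Hypothetical fill-in for 'q_n = TODO' above: discounted reward-to-go per timestep,
    # or the whole-trajectory return repeated at every timestep.
    q_n = []
    for path in paths:
        r = path["reward"]
        if reward_to_go:
            q = np.zeros_like(r)
            running = 0.0
            for t in reversed(range(len(r))):
                running = r[t] + gamma * running
                q[t] = running
        else:
            ret = sum((gamma ** t) * r_t for t, r_t in enumerate(r))
            q = np.full(len(r), ret)
        q_n.append(q)
    return np.concatenate(q_n)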