import gym

# ReplayBuffer and the SAC model are assumed to be defined elsewhere in the codebase.


class AntAgent:
    def __init__(self, render=False, model=None):
        # Create an environment.
        self.environment = gym.make('MountainCarContinuous-v0')
        # Reset the environment when an agent is initialized.
        self.current_observation = self.reset_environment()
        self.render = render
        self.model = model
        self.buffer = ReplayBuffer()

    def reset_environment(self):
        current_observation = self.environment.reset()
        return current_observation

    def get_action(self, current_observation):
        """Fetch an action according to the model policy."""
        if self.model is None:
            action = self.environment.action_space.sample()
        else:
            action = self.model.predict(current_observation)
        return action

    def get_transitions(self, action):
        """Take one step in the environment and return the observations."""
        next_observation, reward, done, _ = self.environment.step(action)
        if self.render:
            self.environment.render()
        return next_observation, reward, done

    def run_episode(self, num_episodes=1):
        """Run `num_episodes` episodes using the `model` policy."""
        for episode in range(num_episodes):
            self.current_observation = self.reset_environment()
            episode_id = self.buffer.create_episode()
            done = False
            while not done:
                # Build a fresh transition dict each step so buffered samples
                # don't all alias the same mutated dictionary.
                transition = dict()
                transition['current_observation'] = self.current_observation
                transition['action'] = self.get_action(self.current_observation)
                transition['next_observation'], transition['reward'], done = \
                    self.get_transitions(transition['action'])
                self.buffer.add_sample(episode_id, transition)
                # Advance to the next observation so the policy acts on fresh state.
                self.current_observation = transition['next_observation']
            self.buffer.add_episode(episode_id)

    def learn(self, step=0, restore=False):
        """Train the SAC model using transitions in the replay buffer."""
        if self.model is None:
            raise Exception("This agent has no brain! Add a model that "
                            "implements a fit() function to train.")
        # Sample an array of transitions from the replay buffer.
        transition_matrices = self.buffer.fetch_sample()
        if step != 0:
            restore = True
        # Fit the SAC model.
        self.model.fit(transition_matrices, restore=restore, global_step=step)
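
# -----------------------------------------------------------------------------
# A minimal usage sketch (not from the original source): collect random-policy
# experience with AntAgent, then train once a model is attached. `SACModel` is
# a hypothetical class; any object exposing predict()/fit() would do.
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    agent = AntAgent(render=False, model=None)   # samples random actions until a model is set
    agent.run_episode(num_episodes=10)           # fill the replay buffer
    # agent.model = SACModel(...)                # hypothetical model with predict()/fit()
    # agent.learn(step=0)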
import time

import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp

# ReplayBuffer, build_policy, build_critic, and minimize_and_clip are assumed
# to be defined elsewhere in the codebase.


class Agent(object):
    def __init__(self, computation_graph_args, sample_trajectory_args, estimate_return_args):
        super(Agent, self).__init__()
        self.ob_dim = computation_graph_args['ob_dim']
        self.ac_dim = computation_graph_args['ac_dim']
        self.task_dim = computation_graph_args['task_dim']
        self.reward_dim = 1
        self.terminal_dim = 1
        self.meta_ob_dim = self.ob_dim + self.ac_dim + self.reward_dim + self.terminal_dim
        self.scope = 'continuous_logits'
        self.size = computation_graph_args['size']
        self.gru_size = computation_graph_args['gru_size']
        self.n_layers = computation_graph_args['n_layers']
        self.learning_rate = computation_graph_args['learning_rate']
        self.history = computation_graph_args['history']
        self.num_value_iters = computation_graph_args['num_value_iters']
        self.l2reg = computation_graph_args['l2reg']
        self.recurrent = computation_graph_args['recurrent']

        self.animate = sample_trajectory_args['animate']
        self.max_path_length = sample_trajectory_args['max_path_length']
        self.min_timesteps_per_batch = sample_trajectory_args['min_timesteps_per_batch']
        self.generalized = sample_trajectory_args['generalized']
        self.granularity = sample_trajectory_args['granularity']

        self.gamma = estimate_return_args['gamma']
        self.nn_critic = estimate_return_args['nn_critic']
        self.normalize_advantages = estimate_return_args['normalize_advantages']

        self.replay_buffer = ReplayBuffer(100000, [self.history, self.meta_ob_dim],
                                          [self.ac_dim], self.gru_size, self.task_dim)
        self.val_replay_buffer = ReplayBuffer(100000, [self.history, self.meta_ob_dim],
                                              [self.ac_dim], self.gru_size, self.task_dim)

    def init_tf_sess(self):
        tf_config = tf.ConfigProto(inter_op_parallelism_threads=1, intra_op_parallelism_threads=1)
        tf_config.gpu_options.allow_growth = True  # may be needed if using a GPU
        self.sess = tf.Session(config=tf_config)
        self.sess.__enter__()  # equivalent to `with self.sess:`
        tf.global_variables_initializer().run()  # pylint: disable=E1101

    def define_placeholders(self):
        """
        Placeholders for batch observations / actions / advantages in the
        policy gradient loss function. See Agent.build_computation_graph
        for notation.

        returns:
            sy_ob_no: placeholder for meta-observations
            sy_ac_na: placeholder for actions
            sy_adv_n: placeholder for advantages
            sy_hidden: placeholder for the RNN hidden state (PPO stuff)
            sy_lp_n: placeholder for pre-computed log-probs
            sy_fixed_lp_n: placeholder for pre-computed old log-probs
        """
        sy_ob_no = tf.placeholder(shape=[None, self.history, self.meta_ob_dim], name="ob", dtype=tf.float32)
        sy_ac_na = tf.placeholder(shape=[None, self.ac_dim], name="ac", dtype=tf.float32)
        sy_adv_n = tf.placeholder(shape=[None], name="adv", dtype=tf.float32)
        sy_hidden = tf.placeholder(shape=[None, self.gru_size], name="hidden", dtype=tf.float32)
        sy_lp_n = tf.placeholder(shape=[None], name="logprob", dtype=tf.float32)
        sy_fixed_lp_n = tf.placeholder(shape=[None], name="fixed_logprob", dtype=tf.float32)
        return sy_ob_no, sy_ac_na, sy_adv_n, sy_hidden, sy_lp_n, sy_fixed_lp_n

    def policy_forward_pass(self, sy_ob_no, sy_hidden):
        """
        Constructs the symbolic operation for the policy network outputs,
        which are the parameters of the policy distribution p(a|s).

        arguments:
            sy_ob_no: (batch_size, self.history, self.meta_ob_dim)
            sy_hidden: (batch_size, self.gru_size)

        returns:
            The parameters of the policy: a tuple (mean, log_std) of a
            Gaussian distribution over actions. log_std should just be a
            trainable variable, not a network output.
                sy_mean: (batch_size, self.ac_dim)
                sy_logstd: (batch_size, self.ac_dim)
        """
        # ac_dim * 2 because we predict both mean and std.
        sy_policy_params, sy_hidden = build_policy(sy_ob_no, sy_hidden, self.ac_dim * 2, self.scope,
                                                   n_layers=self.n_layers, size=self.size,
                                                   gru_size=self.gru_size, recurrent=self.recurrent)
        return (sy_policy_params, sy_hidden)

    def sample_action(self, policy_parameters):
        """
        Constructs a symbolic operation for stochastically sampling from
        the policy distribution.

        arguments:
            policy_parameters: (mean, log_std) of a Gaussian distribution over actions
                sy_mean: (batch_size, self.ac_dim)
                sy_logstd: (batch_size, self.ac_dim)

        returns:
            sy_sampled_ac: (batch_size, self.ac_dim)
        """
        sy_mean, sy_logstd = policy_parameters
        sy_sampled_ac = sy_mean + tf.exp(sy_logstd) * tf.random_normal(tf.shape(sy_mean), 0, 1)
        return sy_sampled_ac

    def get_log_prob(self, policy_parameters, sy_ac_na):
        """
        Constructs a symbolic operation for computing the log probability of
        a set of actions that were actually taken according to the policy.

        arguments:
            policy_parameters: (mean, log_std) of a Gaussian distribution over actions
                sy_mean: (batch_size, self.ac_dim)
                sy_logstd: (batch_size, self.ac_dim)
            sy_ac_na: (batch_size, self.ac_dim)

        returns:
            sy_lp_n: (batch_size)
        """
        sy_mean, sy_logstd = policy_parameters
        sy_lp_n = tfp.distributions.MultivariateNormalDiag(
            loc=sy_mean, scale_diag=tf.exp(sy_logstd)).log_prob(sy_ac_na)
        return sy_lp_n

    def build_computation_graph(self):
        """
        Notes on notation:

        Symbolic variables have the prefix sy_, to distinguish them from the
        numerical values that are computed later in the function.

        Prefixes and suffixes:
            ob  - observation
            ac  - action
            _no - this tensor should have shape (batch size n, observation dim)
            _na - this tensor should have shape (batch size n, action dim)
            _n  - this tensor should have shape (batch size n)

        Note: batch size n is defined at runtime; until then, the shape for
        that axis is None.
        ----------------------------------------------------------------------
        loss: a function of self.sy_lp_n and self.sy_adv_n that we will
        differentiate to get the policy gradient.
        """
        self.sy_ob_no, self.sy_ac_na, self.sy_adv_n, self.sy_hidden, self.sy_lp_n, self.sy_fixed_lp_n = \
            self.define_placeholders()

        # The policy takes in an observation and produces a distribution over the action space.
        policy_outputs = self.policy_forward_pass(self.sy_ob_no, self.sy_hidden)
        self.policy_parameters = policy_outputs[:-1]

        # Unpack mean and variance.
        self.policy_parameters = tf.split(self.policy_parameters[0], 2, axis=1)

        # We can sample actions from this action distribution.
        # This will be called in Agent.sample_trajectory() where we generate a rollout.
        self.sy_sampled_ac = self.sample_action(self.policy_parameters)

        # We can also compute the logprob of the actions that were actually taken by the policy.
        # This is used in the loss function.
        self.sy_lp_n = self.get_log_prob(self.policy_parameters, self.sy_ac_na)

        # PPO critic update.
        critic_regularizer = tf.contrib.layers.l2_regularizer(1e-3) if self.l2reg else None
        self.critic_prediction = tf.squeeze(build_critic(
            self.sy_ob_no, self.sy_hidden, 1, 'critic_network',
            n_layers=self.n_layers, size=self.size, gru_size=self.gru_size,
            recurrent=self.recurrent, regularizer=critic_regularizer))
        self.sy_target_n = tf.placeholder(shape=[None], name="critic_target", dtype=tf.float32)
        self.critic_loss = tf.losses.mean_squared_error(self.sy_target_n, self.critic_prediction)
        self.critic_weights = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='critic_network')
        self.critic_update_op = tf.train.AdamOptimizer(self.learning_rate).minimize(self.critic_loss)

        # PPO actor update.
        self.sy_fixed_log_prob_n = tf.placeholder(shape=[None], name="fixed_log_prob", dtype=tf.float32)
        self.policy_surr_loss = self.ppo_loss(self.sy_lp_n, self.sy_fixed_lp_n, self.sy_adv_n)
        self.policy_weights = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=self.scope)
        optimizer = tf.train.AdamOptimizer(self.learning_rate)
        self.policy_update_op = minimize_and_clip(optimizer, self.policy_surr_loss,
                                                  var_list=self.policy_weights, clip_val=40)

    def sample_trajectories(self, itr, env, min_timesteps, is_evaluation=False):
        # Collect paths until we have enough timesteps.
        timesteps_this_batch = 0
        stats = []
        while True:
            animate_this_episode = (len(stats) == 0 and (itr % 10 == 0) and self.animate)
            steps, s = self.sample_trajectory(env, animate_this_episode, is_evaluation=is_evaluation)
            stats += s
            timesteps_this_batch += steps
            if timesteps_this_batch > min_timesteps:
                break
        return stats, timesteps_this_batch

    def sample_trajectory(self, env, animate_this_episode, is_evaluation):
        """
        Sample a task, then sample trajectories from that task until
        max(self.history, self.max_path_length) timesteps have been sampled.

        Construct meta-observations by concatenating (s, a, r, d) into one vector.
        Inputs to the policy should have the shape (batch_size, self.history, self.meta_ob_dim);
        zero-pad the input to maintain a consistent input shape.

        Add the entire input as an observation to the replay buffer, along with a, r, d.
        Samples will be drawn from the replay buffer to update the policy.

        arguments:
            env: the env to sample trajectories from
            animate_this_episode: if True then render
            is_evaluation: True if this is an evaluation rollout rather than training
        """
        env.reset_task(generalized=self.generalized, granularity=self.granularity,
                       is_evaluation=is_evaluation)
        stats = []
        # ==============================================================================#
        #                          ----------PROBLEM 1----------
        # ==============================================================================#
        ep_steps = 0
        steps = 0

        num_samples = max(self.history, self.max_path_length + 1)
        meta_obs = np.zeros((num_samples + self.history + 1, self.meta_ob_dim))
        rewards = []

        while True:
            if animate_this_episode:
                env.render()
                time.sleep(0.1)

            if ep_steps == 0:
                ob = env.reset()
                # The first meta-observation has only the observation;
                # set a, r, d to zero and construct the first meta-observation in meta_obs.
                # YOUR CODE HERE
                meta_obs[steps + self.history, :self.ob_dim] = ob
                steps += 1

            # Index into the meta_obs array to get the window that ends with the
            # current timestep. Please name the windowed observation `in_` for
            # compatibility with the code below that adds to the replay buffer.
            # YOUR CODE HERE
            in_ = meta_obs[steps:steps + self.history, :]

            hidden = np.zeros((1, self.gru_size), dtype=np.float32)

            # Get an action from the policy.
            # YOUR CODE HERE
            ac = self.sess.run(self.sy_sampled_ac,
                               feed_dict={self.sy_ob_no: [in_], self.sy_hidden: hidden})
            ac = ac[0]

            # Step the environment.
            # YOUR CODE HERE
            obs, rew, done, _ = env.step(ac)

            ep_steps += 1
            done = bool(done) or ep_steps == self.max_path_length

            # Construct the meta-observation and add it to meta_obs.
            # YOUR CODE HERE
            meta_obs[steps + self.history] = np.concatenate((obs, ac, [rew], [done]))

            rewards.append(rew)
            steps += 1

            # Add the sample to the replay buffer.
            if is_evaluation:
                self.val_replay_buffer.add_sample(in_, ac, rew, done, hidden, env._goal)
            else:
                self.replay_buffer.add_sample(in_, ac, rew, done, hidden, env._goal)

            # Start a new episode.
            if done:
                # Compute stats over the trajectory.
                s = dict()
                s['rewards'] = rewards[-ep_steps:]
                s['ep_len'] = ep_steps
                stats.append(s)
                ep_steps = 0

            if steps >= num_samples:
                break

        return steps, stats

    def compute_advantage(self, ob_no, re_n, hidden, masks, tau=0.95):
        """
        Computes generalized advantage estimation (GAE).

        arguments:
            ob_no: (bsize, history, meta_ob_dim)
            re_n: (bsize,) rewards
            hidden: RNN hidden state
            masks: (bsize,) terminal masks
            tau: scalar

        output:
            advantages: (bsize,)
            returns: (bsize,)

        requires:
            self.gamma
        """
        bsize = len(re_n)
        rewards = np.squeeze(re_n)
        masks = np.squeeze(masks)
        values = self.sess.run(self.critic_prediction,
                               feed_dict={self.sy_ob_no: ob_no, self.sy_hidden: hidden})[:, None]
        gamma = self.gamma

        assert rewards.shape == masks.shape == (bsize,)
        assert values.shape == (bsize, 1)

        returns = np.empty((bsize,))
        deltas = np.empty((bsize,))
        advantages = np.empty((bsize,))

        prev_return = 0
        prev_value = 0
        prev_advantage = 0
        for i in reversed(range(bsize)):
            returns[i] = rewards[i] + gamma * prev_return * masks[i]
            deltas[i] = rewards[i] + gamma * prev_value * masks[i] - values[i]
            advantages[i] = deltas[i] + gamma * tau * prev_advantage * masks[i]
            prev_return = returns[i]
            prev_value = values[i]
            prev_advantage = advantages[i]

        advantages = (advantages - np.mean(advantages, axis=0)) / np.std(advantages, axis=0)
        return advantages, returns

    def estimate_return(self, ob_no, re_n, hidden, masks):
        """
        Estimates the returns over a set of trajectories.

        Let sum_of_path_lengths be the sum of the lengths of the paths sampled from
        Agent.sample_trajectories, and num_paths be the number of paths sampled.

        arguments:
            ob_no: shape (sum_of_path_lengths, history, meta_obs_dim)
            re_n: length num_paths. Each element of re_n is a numpy array
                containing the rewards for the particular path
            hidden: hidden state of the recurrent policy
            masks: terminal masks

        returns:
            q_n: shape (sum_of_path_lengths). A single vector of estimated
                q-values whose length is the sum of the lengths of the paths
            adv_n: shape (sum_of_path_lengths). A single vector of estimated
                advantages whose length is the sum of the lengths of the paths
        """
        adv_n, q_n = self.compute_advantage(ob_no, re_n, hidden, masks)
        return q_n, adv_n

    def update_parameters(self, ob_no, hidden, ac_na, fixed_log_probs, q_n, adv_n):
        """
        Updates the parameters of the policy and the critic with the PPO update.

        arguments:
            ob_no: (minibsize, history, meta_obs_dim)
            hidden: (minibsize, self.gru_size)
            ac_na: (minibsize)
            fixed_log_probs: (minibsize)
            q_n: (minibsize)
            adv_n: (minibsize)

        returns:
            nothing
        """
        self.update_critic(ob_no, hidden, q_n)
        self.update_policy(ob_no, hidden, ac_na, fixed_log_probs, adv_n)

    def update_critic(self, ob_no, hidden, q_n):
        """
        arguments:
            ob_no: (minibsize, history, meta_obs_dim)
            hidden: (minibsize, self.gru_size)
            q_n: (minibsize)

        requires:
            self.num_value_iters
        """
        target_n = (q_n - np.mean(q_n)) / (np.std(q_n) + 1e-8)
        for k in range(self.num_value_iters):
            critic_loss, _ = self.sess.run(
                [self.critic_loss, self.critic_update_op],
                feed_dict={self.sy_target_n: target_n, self.sy_ob_no: ob_no,
                           self.sy_hidden: hidden})
        return critic_loss

    def update_policy(self, ob_no, hidden, ac_na, fixed_log_probs, advantages):
        """
        arguments:
            ob_no: (minibsize, history, meta_obs_dim)
            hidden: (minibsize, self.gru_size)
            ac_na: (minibsize)
            fixed_log_probs: (minibsize)
            advantages: (minibsize)
        """
        policy_surr_loss, _ = self.sess.run(
            [self.policy_surr_loss, self.policy_update_op],
            feed_dict={self.sy_ob_no: ob_no, self.sy_hidden: hidden, self.sy_ac_na: ac_na,
                       self.sy_fixed_lp_n: fixed_log_probs, self.sy_adv_n: advantages})
        return policy_surr_loss

    def ppo_loss(self, log_probs, fixed_log_probs, advantages, clip_epsilon=0.1, entropy_coeff=1e-4):
        """
        given:
            clip_epsilon

        arguments:
            log_probs (mini_bsize,)
            fixed_log_probs (mini_bsize,)
            advantages (mini_bsize,)

        intermediate results:
            log_probs, fixed_log_probs --> ratio
            advantages, ratio --> surr1
            ratio, clip_epsilon, advantages --> surr2
            surr1, surr2 --> policy_surr_loss
        """
        ratio = tf.exp(log_probs - fixed_log_probs)
        surr1 = ratio * advantages
        surr2 = tf.clip_by_value(ratio, clip_value_min=1.0 - clip_epsilon,
                                 clip_value_max=1.0 + clip_epsilon) * advantages
        policy_surr_loss = -tf.reduce_mean(tf.minimum(surr1, surr2))

        # Entropy bonus to encourage exploration.
        probs = tf.exp(log_probs)
        entropy = tf.reduce_sum(-(log_probs * probs))
        policy_surr_loss -= entropy_coeff * entropy

        return policy_surr_loss
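
# -----------------------------------------------------------------------------
# A minimal sketch (not from the original source) of how the Agent above might
# be wired up. The dictionary keys mirror exactly what Agent.__init__ reads;
# the concrete values are illustrative assumptions, not the original settings.
# -----------------------------------------------------------------------------
def build_agent(env):
    computation_graph_args = {
        'ob_dim': env.observation_space.shape[0],
        'ac_dim': env.action_space.shape[0],
        'task_dim': 1,
        'size': 64, 'gru_size': 32, 'n_layers': 1,
        'learning_rate': 5e-5, 'history': 60,
        'num_value_iters': 1, 'l2reg': False, 'recurrent': True,
    }
    sample_trajectory_args = {
        'animate': False, 'max_path_length': 200,
        'min_timesteps_per_batch': 2000,
        'generalized': False, 'granularity': 1,
    }
    estimate_return_args = {
        'gamma': 0.99, 'nn_critic': True, 'normalize_advantages': True,
    }
    agent = Agent(computation_graph_args, sample_trajectory_args, estimate_return_args)
    agent.build_computation_graph()
    agent.init_tf_sess()
    return agent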
# (Fragment of a larger training script: FLAGS, sess, env, repbuf, and the
# EPSILON_* / REPLAY_BUFFER_* constants are defined earlier in the file.)

# Check if we need to load a checkpoint.
if FLAGS.checkpoint:
    _saver = tf.train.Saver(var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                                       scope=GENERATOR_SCOPE))
    _saver.restore(sess, FLAGS.checkpoint)

if FLAGS.mode == 'train':
    # Fill the replay buffer with the minimum number of samples.
    obs = env.reset()
    for i in trange(REPLAY_BUFFER_MIN_SIZE):
        _old_state = obs

        # Generate random play.
        a = env.action_space.sample()
        obs, reward, done, _ = env.step(a)

        # Add to the replay buffer; samples are of type
        # (state, action, reward, next_state, done).
        repbuf.add_sample((_old_state, a, reward, obs, done))

        # Reset if the episode has ended.
        if done:
            obs = env.reset()

    print("Loaded replay buffer with", len(repbuf.buffer), "samples.")

    # Now start training.
    done = True
    episode_reward = 0
    episode_steps = 0
    rewards = deque([], maxlen=1000)
    epsilon = EPSILON_MAX
    episode_rewards_100 = 0
    episode_rewards_1000 = 0
    n_episode = 0
    total_max_q = 0
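
# -----------------------------------------------------------------------------
# A small helper sketched in here (not from the original source): the linear
# epsilon schedule usually paired with an EPSILON_MAX constant like the one
# above. The eps_min and decay_steps values are hypothetical defaults.
# -----------------------------------------------------------------------------
def linear_epsilon(step, eps_max=1.0, eps_min=0.05, decay_steps=100000):
    """Linearly anneal epsilon from eps_max to eps_min over decay_steps steps."""
    frac = min(step / float(decay_steps), 1.0)
    return eps_max + frac * (eps_min - eps_max)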
import gym
import numpy as np

# DQNAlgo, DDPGAlgo, ReplayBuffer, Stats, and modify_done are assumed to be
# defined elsewhere in the codebase.


class StandardRLAgent:
    def __init__(self, env, device):
        self.env = env
        self.device = device
        self.is_discrete_action = isinstance(env.action_space, gym.spaces.discrete.Discrete)
        self.obs_dim = env.observation_space.shape[0]
        gamma = 0.99
        self.gamma = gamma
        if self.is_discrete_action:
            self.num_act = env.action_space.n
            self.algo = DQNAlgo(self.obs_dim, self.num_act, gamma, device=device)
        else:
            self.act_dim = env.action_space.shape[0]
            self.algo = DDPGAlgo(self.obs_dim, self.act_dim, gamma, device=device)
        self.replay_buffer = ReplayBuffer()

    def update_batch(self, batch_size=100):
        if self.replay_buffer.size() < batch_size * 10:
            return {}
        batch = self.replay_buffer.sample_batch(batch_size=batch_size)
        return self.algo.update_batch(batch)

    def run_episode(self):
        s, done = self.env.reset(), False
        # A zero vector stands in for the goal argument used by the
        # goal-conditioned agents, so sample shapes stay consistent.
        zero = np.zeros_like(s)
        stats = Stats()
        epilen = 0
        R = 0.
        while not done:
            epilen += 1
            if len(self.replay_buffer) < 10000:
                a = self.env.action_space.sample()
            elif self.is_discrete_action:
                a = self.algo.get_action(s, zero, epsilon=0.05)
            else:
                a = self.algo.get_action(s, zero, sigma=0.1)
            sp, r, done, _ = self.env.step(a)
            mdone = modify_done(self.env, done)
            self.replay_buffer.add_sample((s, a, r, sp, mdone, zero))
            s = sp
            R += r
            if epilen % 1 == 0:  # always true: update after every environment step
                info = self.update_batch()
                stats.update(info)
        print(f'Epilen: {epilen}\tR: {R:.2f}')
        print(stats)
        return epilen

    def test_episode(self):
        s, done = self.env.reset(), False
        zero = np.zeros_like(s)
        R = 0.
        R0 = 0.
        DiscR = 0.
        gamma_power = 1.
        cnt = 0
        while not done:
            # self.env.render()
            cnt += 1
            # if cnt >= 500: break
            if self.is_discrete_action:
                a = self.algo.get_action(s, zero, epsilon=0.)
            else:
                a = self.algo.get_action(s, zero, sigma=0.)
            sp, r, done, _ = self.env.step(a)
            if cnt == 1:
                R0 = self.algo.get_value(s, zero)
            R += r
            DiscR += r * gamma_power
            gamma_power *= self.gamma
            s = sp
        info = {
            'ExtR': R,
            'DiscExtR': DiscR,
            'DiscExtR_Est': R0,
        }
        return info
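
# -----------------------------------------------------------------------------
# A hypothetical driver for StandardRLAgent (not from the original source):
# train for a fixed number of episodes, evaluating every tenth one. The env
# name is an illustrative assumption.
# -----------------------------------------------------------------------------
import torch


def train_standard_agent(env_name='Pendulum-v0', episodes=200):
    env = gym.make(env_name)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    agent = StandardRLAgent(env, device)
    for ep in range(episodes):
        agent.run_episode()
        if (ep + 1) % 10 == 0:
            print(agent.test_episode())
    return agent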
import gym
import numpy as np
import torch

# DQNWithRewardAlgo, DDPGAlgo, ReplayBuffer, Planner, Stats, and modify_done
# are assumed to be defined elsewhere in the codebase.


class UVFAWithRewardAgent:
    def __init__(self, env, device='cpu', use_td3=True):
        self.env = env
        self.device = device
        self.is_discrete_action = isinstance(env.action_space, gym.spaces.discrete.Discrete)
        self.obs_dim = env.observation_space.shape[0]
        gamma = 0.99
        self.gamma = gamma
        if self.is_discrete_action:
            self.num_act = env.action_space.n
            self.algo = DQNWithRewardAlgo(self.obs_dim, self.num_act, gamma,
                                          use_td3=use_td3, device=device)
        else:
            self.act_dim = env.action_space.shape[0]
            self.algo = DDPGAlgo(self.obs_dim, self.act_dim, gamma,
                                 use_td3=use_td3, device=device)
        self.replay_buffer = ReplayBuffer()
        self.planner = Planner(trans_fn=self.planner_trans_fn, use_td3=use_td3,
                               gamma=gamma, device=device)
        self.estimate_std()

    def estimate_std(self):
        # Roll out random actions to estimate per-dimension state and
        # one-step-delta standard deviations, used for goal sampling.
        tot_cnt = 0
        states = []
        action_deltas = []
        while tot_cnt < 10000:
            s, done = self.env.reset(), False
            while not done:
                tot_cnt += 1
                states.append(s)
                a = self.env.action_space.sample()
                sp, r, done, _ = self.env.step(a)
                action_deltas.append(sp - s)
                s = sp
        self.std = np.std(states, axis=0) + 1e-8
        self.astd = np.std(action_deltas, axis=0) + 1e-8
        print('Std', self.std, 'Action-Std', self.astd)

    def gen_goal(self, s):
        g = s + np.random.randn(*s.shape) * self.astd * np.random.randint(1, 10)
        return g

    def goal_reward(self, s, g):
        # r = (np.linalg.norm((s - g) / self.std, axis=-1) < 0.1).astype(float)
        r = (np.linalg.norm((s - g) / self.astd, axis=-1) < 1).astype(float)
        return r

    def update_batch(self, batch_size=32):
        if self.replay_buffer.size() < batch_size * 10:
            return {}
        batch = self.replay_buffer.sample_batch(batch_size=batch_size)
        return self.algo.update_batch(batch)

    def run_episode(self):
        s, done = self.env.reset(), False
        g = self.gen_goal(s)
        # g = s
        stats = Stats()
        episode = []
        epilen = 0
        extR = 0.
        intR = 0.
        while not done:
            epilen += 1
            if len(self.replay_buffer) < 10000:
                a = self.env.action_space.sample()
            elif self.is_discrete_action:
                a = self.algo.get_action(s, g, epsilon=0.05)
            else:
                a = self.algo.get_action(s, g, sigma=0.1)
            sp, r, done, info = self.env.step(a)
            mdone = modify_done(self.env, done)
            episode.append((s, a, r, sp, mdone, g))
            s = sp
            extR += r
            intR = max(intR, self.goal_reward(s, g) * (self.algo.gamma ** epilen))
            if epilen % 1 == 0:  # always true: update after every environment step
                info = self.update_batch()
                stats.update(info)

        # HER-style relabeling: with probability her_prob, replace the goal with
        # a state visited within the next rpl_len steps, then recompute the
        # goal-reaching reward before storing the transition.
        her_prob = 0.5
        rpl_len = 10
        for i in range(epilen):
            s, a, extr, sp, done, g = episode[i]
            if np.random.random() < her_prob:
                hg_idx = np.random.randint(i, min(epilen, i + rpl_len))
                g = episode[hg_idx][0]
            r = self.goal_reward(sp, g)
            done = np.logical_or((r > 0), done)
            self.replay_buffer.add_sample((s, a, r, extr, sp, done, g))
        print(f'Epilen: {epilen}\tExtR: {extR:.2f}\tIntR: {intR:.2f}')
        print(stats)
        return epilen

    def test_episode(self):
        s, done = self.env.reset(), False
        g = self.gen_goal(s)
        # g = np.array([0.5, 0.0])
        ss = torch.from_numpy(s).float().to(self.device).unsqueeze(0)
        gg = torch.from_numpy(g).float().to(self.device).unsqueeze(0)
        Q0, R0, _ = self.algo.get_values(ss, gg)
        Q0 = float(Q0)
        R0 = float(R0)
        ExtR = 0.
        DiscExtR = 0.
        IntR = 0.
        gamma_power = 1.0
        min_dis = 1e9
        cnt = 0
        while not done:
            # self.env.render()
            cnt += 1
            # if cnt >= 500: break
            if self.is_discrete_action:
                a = self.algo.get_action(s, g, epsilon=0.)
            else:
                a = self.algo.get_action(s, g, sigma=0.)
            sp, extr, done, info = self.env.step(a)
            mdone = modify_done(self.env, done)
            r = self.goal_reward(sp, g)
            ExtR += extr
            DiscExtR += gamma_power * extr
            IntR += gamma_power * r
            gamma_power *= self.gamma
            min_dis = min(min_dis, np.linalg.norm((sp - g) / self.std))
            s = sp
            if r > 0:
                done = True
        info = {
            'ExtR': ExtR,
            'DiscExtR': DiscExtR,
            'DiscExtR_Est': R0,
            'IntR': IntR,
            'IntR_Est': Q0,
        }
        return info

    def planner_trans_fn(self, s, g, *args, **kwargs):
        # Evaluate all (state, goal) pairs: expand s and g into an n x m grid.
        n, m = s.shape[0], g.shape[0]
        s = s.unsqueeze(1).expand(-1, m, -1)
        g = g.unsqueeze(0).expand(n, -1, -1)
        with torch.no_grad():
            G, R, Pi = self.algo.get_values(s, g, *args, **kwargs)
        return G, R, Pi

    def update_planner(self):
        n = 1000
        if len(self.replay_buffer) < n:
            return False
        waypoints = self.replay_buffer.sample_batch(n, replace=False)[0]  # states
        print(waypoints.shape)
        self.planner.set_waypoint_states(waypoints)
        self.planner.update_trans()
        self.planner.pre_plan()
        return True

    def plan_episode(self, show_plan=False):
        s, done = self.env.reset(), False
        if show_plan:
            self.planner.show_plan(s, self.env)
        ExtR = 0.
        DiscExtR = 0.
        V0 = 0.
        gamma_power = 1.0
        step = 0
        while not done:
            step += 1
            # if step >= 10: break
            # if show_plan:
            #     self.env.render()
            a, v = self.planner.plan(s)
            if step == 1:
                V0 = v
            sp, extr, done, info = self.env.step(a)
            ExtR += extr
            DiscExtR += extr * gamma_power
            gamma_power *= self.gamma
            s = sp
        info = {
            'Plan_ExtR': ExtR,
            'Plan_DiscExtR': DiscExtR,
            'Plan_DiscExtR_Est': V0,
        }
        return info
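
# -----------------------------------------------------------------------------
# A hypothetical driver (not from the original source) combining the learned
# goal-conditioned values with the planner: train with HER-style relabeling,
# then periodically refresh the planner and execute a planned episode. The
# episode counts are illustrative assumptions.
# -----------------------------------------------------------------------------
def train_and_plan(env, episodes=500, device='cpu'):
    agent = UVFAWithRewardAgent(env, device=device, use_td3=True)
    for ep in range(episodes):
        agent.run_episode()
        if (ep + 1) % 50 == 0 and agent.update_planner():
            print(agent.plan_episode())
    return agent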
import time
from collections import defaultdict

import numpy as np
import tensorflow as tf

# Actor, Critic, ReplayBuffer, and logz are assumed to be defined elsewhere.


class DDPGAgent(object):
    def __init__(self, sess, env, test_env, args):
        self.sess = sess
        self.args = args
        self.env = env
        self.test_env = test_env
        self.ob_dim = env.observation_space.shape[0]
        self.ac_dim = env.action_space.shape[0]

        # Construct the networks and the experience replay buffer.
        self.actor = Actor(sess, env, args)
        self.critic = Critic(sess, env, args)
        self.rbuffer = ReplayBuffer(args.replay_size, self.ob_dim, self.ac_dim)

        # Initialize then run, also setting current=target to start.
        self._debug_print()
        self.sess.run(tf.global_variables_initializer())
        self.actor.update_target_net(smooth=False)
        self.critic.update_target_net(smooth=False)

    def train(self):
        """ Algorithm 1 in the DDPG paper. """
        num_episodes = 0
        t_start = time.time()
        obs = self.env.reset()

        for t in range(self.args.n_iter):
            if (t % self.args.log_every_t_iter == 0) and (t > self.args.wait_until_rbuffer):
                print("\n*** DDPG Iteration {} ***".format(t))

            # Sample actions with noise injection and manage the buffer.
            act = self.actor.sample_action(obs, train=True)
            new_obs, rew, done, info = self.env.step(act)
            self.rbuffer.add_sample(s=obs, a=act, r=rew, done=done)
            if done:
                obs = self.env.reset()
                num_episodes += 1
            else:
                obs = new_obs

            if (t > self.args.wait_until_rbuffer) and (t % self.args.learning_freq == 0):
                # Sample from the replay buffer.
                states_t_BO, actions_t_BA, rewards_t_B, states_tp1_BO, done_mask_B = \
                    self.rbuffer.sample(num=self.args.batch_size)
                feed = {
                    'obs_t_BO': states_t_BO,
                    'act_t_BA': actions_t_BA,
                    'rew_t_B': rewards_t_B,
                    'obs_tp1_BO': states_tp1_BO,
                    'done_mask_B': done_mask_B
                }
                # Update the critic, get sampled policy gradients, update the actor.
                a_grads_BA, l2_error = self.critic.update_weights(feed)
                actor_gradients = self.actor.update_weights(feed, a_grads_BA)

                # Update both target networks.
                self.critic.update_target_net()
                self.actor.update_target_net()

            if (t % self.args.log_every_t_iter == 0) and (t > self.args.wait_until_rbuffer):
                # Do some rollouts here and then record statistics. Note that
                # some of these stats rely on values computed from sampling the
                # replay buffer, so be careful interpreting them. The code
                # probably needs to guard against this case as well.
                stats = self._do_rollouts()
                hours = (time.time() - t_start) / (60 * 60.)
                logz.log_tabular("MeanReward", np.mean(stats['reward']))
                logz.log_tabular("MaxReward", np.max(stats['reward']))
                logz.log_tabular("MinReward", np.min(stats['reward']))
                logz.log_tabular("StdReward", np.std(stats['reward']))
                logz.log_tabular("MeanLength", np.mean(stats['length']))
                logz.log_tabular("NumTrainingEps", num_episodes)
                logz.log_tabular("L2ErrorCritic", l2_error)
                logz.log_tabular("QaGradL2Norm", np.linalg.norm(a_grads_BA))
                logz.log_tabular("TimeHours", hours)
                logz.log_tabular("Iterations", t)
                logz.dump_tabular()

    def _do_rollouts(self):
        """ Some rollouts to evaluate the agent's progress. Returns a
        dictionary containing relevant statistics. Later, I should
        parallelize this using an array of environments. """
        num_episodes = 50
        stats = defaultdict(list)

        for i in range(num_episodes):
            obs = self.test_env.reset()
            ep_time = 0
            ep_reward = 0

            # Run one episode ...
            while True:
                act = self.actor.sample_action(obs, train=False)
                new_obs, rew, done, info = self.test_env.step(act)
                ep_time += 1
                ep_reward += rew
                if done:
                    break
                # Advance the observation so the actor acts on fresh state.
                obs = new_obs

            # ... and collect its information here.
            stats['length'].append(ep_time)
            stats['reward'].append(ep_reward)

        return stats

    def _debug_print(self):
        print("\n\t(A bunch of debug prints)\n")
        print("\nActor weights")
        for v in self.actor.weights:
            shp = v.get_shape().as_list()
            print("- {} shape:{} size:{}".format(v.name, shp, np.prod(shp)))
        print("Total # of weights: {}.".format(self.actor.num_weights))
        print("\nCritic weights")
        for v in self.critic.weights:
            shp = v.get_shape().as_list()
            print("- {} shape:{} size:{}".format(v.name, shp, np.prod(shp)))
        print("Total # of weights: {}.".format(self.critic.num_weights))
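
# -----------------------------------------------------------------------------
# A hypothetical entry point for DDPGAgent (not from the original source).
# `args` is assumed to be an argparse.Namespace carrying the fields read in
# __init__ and train() (replay_size, n_iter, batch_size, and so on);
# `args.env_name` is likewise an assumed field.
# -----------------------------------------------------------------------------
import gym


def main(args):
    env = gym.make(args.env_name)
    test_env = gym.make(args.env_name)
    with tf.Session() as sess:
        agent = DDPGAgent(sess, env, test_env, args)
        agent.train()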
class UVFAgent:
    def __init__(self, env, device):
        self.env = env
        self.device = device
        self.is_discrete_action = isinstance(env.action_space, gym.spaces.discrete.Discrete)
        self.obs_dim = env.observation_space.shape[0]
        gamma = 0.99
        if self.is_discrete_action:
            self.num_act = env.action_space.n
            self.algo = DQNAlgo(self.obs_dim, self.num_act, gamma, device=device)
        else:
            self.act_dim = env.action_space.shape[0]
            self.algo = DDPGAlgo(self.obs_dim, self.act_dim, gamma, device=device)
        self.replay_buffer = ReplayBuffer()
        self.estimate_std()

    def estimate_std(self):
        # Roll out random actions to estimate per-dimension state and
        # one-step-delta standard deviations, used for goal sampling.
        tot_cnt = 0
        states = []
        action_deltas = []
        while tot_cnt < 10000:
            s, done = self.env.reset(), False
            while not done:
                tot_cnt += 1
                states.append(s)
                a = self.env.action_space.sample()
                sp, r, done, _ = self.env.step(a)
                action_deltas.append(sp - s)
                s = sp
        self.std = np.std(states, axis=0) + 1e-8
        self.astd = np.std(action_deltas, axis=0) + 1e-8
        print('Std', self.std, 'Action-Std', self.astd)

    def gen_goal(self, s):
        g = s + np.random.randn(*s.shape) * self.astd
        return g

    def goal_reward(self, s, g):
        # r = (np.linalg.norm((s - g) / self.std, axis=-1) < 0.1).astype(float)
        r = (np.linalg.norm((s - g) / self.astd, axis=-1) < 1).astype(float)
        return r

    def update_batch(self, batch_size=32):
        if self.replay_buffer.size() < batch_size * 10:
            return {}
        batch = self.replay_buffer.sample_batch(batch_size=batch_size)
        return self.algo.update_batch(batch)

    def run_episode(self):
        s, done = self.env.reset(), False
        g = self.gen_goal(s)
        # g = s
        stats = Stats()
        episode = []
        epilen = 0
        extR = 0.
        intR = 0.
        while not done:
            epilen += 1
            if self.is_discrete_action:
                a = self.algo.get_action(s, g, epsilon=0.05)
            else:
                a = self.algo.get_action(s, g, sigma=0.1)
            sp, r, done, info = self.env.step(a)
            mdone = modify_done(self.env, done)
            episode.append((s, a, r, sp, mdone, g))
            s = sp
            extR += r
            intR = max(intR, self.goal_reward(s, g) * (self.algo.gamma ** epilen))
            if epilen % 4 == 0:  # update every four environment steps
                info = self.update_batch()
                stats.update(info)

        # HER-style relabeling, as in UVFAWithRewardAgent above.
        her_prob = 0.5
        rpl_len = 10
        for i in range(epilen):
            s, a, r, sp, done, g = episode[i]
            if np.random.random() < her_prob:
                hg_idx = np.random.randint(i, min(epilen, i + rpl_len))
                g = episode[hg_idx][0]
            r = self.goal_reward(sp, g)
            done = np.logical_or((r > 0), done)
            self.replay_buffer.add_sample((s, a, r, sp, done, g))
        print(f'Epilen: {epilen}\tExtR: {extR:.2f}\tIntR: {intR:.2f}')
        print(stats)

    def test_episode(self):
        s, done = self.env.reset(), False
        g = self.gen_goal(s)
        # g = np.array([0.5, 0.0])
        R = 0
        min_dis = 1e9
        cnt = 0
        while not done:
            # self.env.render()
            cnt += 1
            # if cnt >= 500: break
            if self.is_discrete_action:
                a = self.algo.get_action(s, g, epsilon=0.)
            else:
                a = self.algo.get_action(s, g, sigma=0.)
            sp, extr, done, info = self.env.step(a)
            r = self.goal_reward(sp, g)
            R += r
            min_dis = min(min_dis, np.linalg.norm((sp - g) / self.std))
            s = sp
            if r > 0:
                done = True
        return R, min_dis
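
# -----------------------------------------------------------------------------
# A hypothetical training loop for UVFAgent (not from the original source):
# alternate goal-conditioned rollouts with HER relabeling and periodic
# evaluation of goal-reaching success. Episode counts are illustrative.
# -----------------------------------------------------------------------------
def train_uvfa(env, episodes=300, device='cpu'):
    agent = UVFAgent(env, device)
    for ep in range(episodes):
        agent.run_episode()
        if (ep + 1) % 20 == 0:
            R, min_dis = agent.test_episode()
            print(f'Eval: IntR={R:.2f}  MinDis={min_dis:.3f}')
    return agent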