def run_policy(env, get_action, max_ep_len=None, num_episodes=100, render=True):

    assert env is not None, \
        "Environment not found!\n\n It looks like the environment wasn't saved, " + \
        "and we can't run the agent in it. :( \n\n Check out the readthedocs " + \
        "page on Experiment Outputs for how to handle this situation."

    logger = EpochLogger()
    o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0
    while n < num_episodes:
        if render:
            env.render()
            time.sleep(1e-3)

        a = get_action(o)
        o, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            print('Episode %d \t EpRet %.3f \t EpLen %d' % (n, ep_ret, ep_len))
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
            n += 1

    logger.log_tabular('EpRet', with_min_and_max=True)
    logger.log_tabular('EpLen', average_only=True)
    logger.dump_tabular()
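# Example usage (illustrative sketch only): run_policy needs a Gym-style env and a
# callable mapping observation -> action. The env name and the random policy below
# are placeholders standing in for a restored agent, not part of the code above.
import gym

env = gym.make('CartPole-v1')
get_action = lambda o: env.action_space.sample()   # stand-in for a loaded policy
run_policy(env, get_action, max_ep_len=200, num_episodes=5, render=False)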
def __init__(self, env_name, port=2000, gpu=0, train_step=2000, evaluation_step=1000,
             max_ep_len=1000, polyak=0.995, start_steps=1000, batch_size=100,
             replay_size=50000, iteration=200, gamma=0.99, act_noise=0.1,
             target_noise=0.2, noise_clip=0.5, pi_lr=1e-4, q_lr=1e-3,
             policy_delay=2, logger_kwargs=dict()):
    self.logger = EpochLogger(**logger_kwargs)
    self.logger.save_config(locals())

    self.iteration = iteration
    self.train_step = train_step
    self.evaluation_step = evaluation_step
    self.env = gym.make(env_name)
    self.obs_dim = self.env.observation_space.shape
    self.act_dim = self.env.action_space.shape[0]
    self.start_steps = start_steps
    self.cur_train_step = 0
    self.cur_tensorboard_step = 0
    self.batch_size = batch_size
    self.max_ep_len = max_ep_len
    self.act_limit = self.env.action_space.high[0]
    self.act_noise = act_noise
    self.target_noise = target_noise
    self.noise_clip = noise_clip
    self.policy_delay = policy_delay
    self.polyak = polyak
    self.gamma = gamma

    self.opti_q = tf.keras.optimizers.Adam(q_lr)
    self.opti_pi = tf.keras.optimizers.Adam(pi_lr)

    if debug_mode:
        self.summary = tf.summary.create_file_writer(
            os.path.join(self.logger.output_dir, "logs"))

    self.actor_critic = core.ActorCritic(self.act_dim, self.act_limit)
    self.target_actor_critic = core.ActorCritic(self.act_dim, self.act_limit)
    self.replay_buffer = ReplayBuffer(replay_size)

    # self.critic = core.Critic()
    # net_params = self.critic.weights
    # self.target_actor_critic.set_weights(self.actor_critic.weights)
    self.target_init(self.target_actor_critic, self.actor_critic)
def __init__(self, env_name, port=2000, gpu=0, batch_size=100, train_step=25000,
             evaluation_step=3000, max_ep_len=6000, epsilon_train=0.1, epsilon_eval=0.01,
             replay_size=100000, epsilon_decay_period=25000, warmup_steps=2000,
             iteration=200, gamma=0.99, q_lr=0.0001, target_update_period=800,
             update_period=4, logger_kwargs=dict()):
    self.logger = EpochLogger(**logger_kwargs)
    self.logger.save_config(locals())

    self.env = CarlaEnv(early_termination_enabled=True, run_offscreen=False,
                        port=port, gpu=gpu)
    self.train_step = train_step
    self.evaluation_step = evaluation_step
    self.max_ep_len = max_ep_len
    self.epsilon_train = epsilon_train
    self.epsilon_eval = epsilon_eval
    self.batch_size = batch_size
    self.replay_size = replay_size
    self.epsilon_decay_period = epsilon_decay_period
    self.warmup_steps = warmup_steps
    self.iteration = iteration
    self.replay_buffer = ReplayBuffer(replay_size)
    self.gamma = gamma
    self.target_update_period = target_update_period
    self.update_period = update_period

    self.build_model()
    self.cur_train_step = 0
    self.cur_tensorboard = 0

    if debug_mode:
        self.summary = tf.summary.create_file_writer(
            os.path.join(self.logger.output_dir, "logs"))

    self.build_model()
    self.savepath = os.path.join(self.logger.output_dir, "saver")
    checkpoint = tf.train.Checkpoint(model=self.model, target_model=self.model_target)
    self.manager = tf.train.CheckpointManager(checkpoint, directory=self.savepath,
                                              max_to_keep=20, checkpoint_name="model.ckpt")
    self.opti_q = tf.keras.optimizers.Adam(q_lr)
class EMAQ: def __init__(self, env_fn, env_name=None, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=100, epochs=10000, replay_size=int(2000000), gamma=0.99, polyak=0.995, lr=3e-4, p_lr=3e-5, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, algo='CQL'): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. 
num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) self.env, self.test_env = env_fn(), env_fn() self.obs_dim = self.env.observation_space.shape self.act_dim = self.env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! self.act_limit = self.env.action_space.high[0] # Create actor-critic module and target networks self.ac = actor_critic(self.env.observation_space, self.env.action_space, **ac_kwargs) self.ac_targ = deepcopy(self.ac) self.gamma = gamma # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in self.ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) self.q_params = itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters()) # Experience buffer self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim, act_dim=self.act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [self.ac.pi, self.ac.q1, self.ac.q2]) self.logger.log( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) self.algo = algo self.lagrange_threshold = 10 self.penalty_lr = 5e-2 self.lamda = Variable(torch.log(torch.exp(torch.Tensor([5])) - 1), requires_grad=True) self.lamda_optimizer = torch.optim.Adam([self.lamda], lr=self.penalty_lr) self.tune_lambda = True if 'lagrange' in self.algo else False self.alpha = 0 self.target_update_freq = 1 self.p_lr = 3e-5 self.lr = 3e-4 self.n_samples = 100 self.env_name = env_name # Set up optimizers for policy and q-function self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.p_lr) self.q_optimizer = Adam(self.q_params, lr=self.lr) self.num_test_episodes = num_test_episodes self.max_ep_len = max_ep_len self.epochs = epochs self.steps_per_epoch = steps_per_epoch self.update_after = update_after self.update_every = update_every self.batch_size = batch_size self.save_freq = save_freq self.polyak = polyak # Set up model saving self.logger.setup_pytorch_saver(self.ac) print("Running Offline RL algorithm: {}".format(self.algo)) def populate_replay_buffer(self): dataset = d4rl.qlearning_dataset(self.env) self.replay_buffer.obs_buf[:dataset['observations']. shape[0], :] = dataset['observations'] self.replay_buffer.act_buf[:dataset['actions']. shape[0], :] = dataset['actions'] self.replay_buffer.obs2_buf[:dataset['next_observations']. shape[0], :] = dataset['next_observations'] self.replay_buffer.rew_buf[:dataset['rewards']. shape[0]] = dataset['rewards'] self.replay_buffer.done_buf[:dataset['terminals']. 
shape[0]] = dataset['terminals'] self.replay_buffer.size = dataset['observations'].shape[0] self.replay_buffer.ptr = (self.replay_buffer.size + 1) % (self.replay_buffer.max_size) # Set up function for computing SAC Q-losses def compute_loss_q(self, data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] sampled_actions_q1 = None sampled_actions_q2 = None for i in range(self.n_samples): z = np.random.randn(a.shape[0], a.shape[1]) z = torch.FloatTensor(z) actions, _ = self.sampling_policy.inverse(z, y=o2) if sampled_actions_q1 is None: sampled_actions_q1 = self.ac_targ.q1(o2, actions).view(-1, 1) sampled_actions_q2 = self.ac_targ.q2(o2, actions).view(-1, 1) else: sampled_actions_q1 = torch.cat( (sampled_actions_q1, self.ac_targ.q1(o2, actions).view( -1, 1)), dim=1) sampled_actions_q2 = torch.cat( (sampled_actions_q2, self.ac_targ.q2(o2, actions).view( -1, 1)), dim=1) q1 = self.ac.q1(o, a) q2 = self.ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = self.ac.pi(o2) # Target Q-values q1_pi_targ = torch.max(sampled_actions_q1, dim=1).values q2_pi_targ = torch.max(sampled_actions_q2, dim=1).values q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + self.gamma * (1 - d) * (q_pi_targ - self.alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, q_info def update(self, data, update_timestep): # First run one gradient descent step for Q1 and Q2 self.q_optimizer.zero_grad() loss_q, q_info = self.compute_loss_q(data) loss_q.backward() self.q_optimizer.step() # Record things self.logger.store(LossQ=loss_q.item(), **q_info) # Finally, update target networks by polyak averaging. if update_timestep % self.target_update_freq == 0: with torch.no_grad(): for p, p_targ in zip(self.ac.parameters(), self.ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(self.polyak) p_targ.data.add_((1 - self.polyak) * p.data) def get_action(self, o, deterministic=False): sampled_actions_q1 = None sampled_actions_q2 = None sampled_actions = [] o = torch.FloatTensor(o).view(1, -1) for i in range(self.n_samples): z = np.random.randn(1, self.act_dim) z = torch.FloatTensor(z) actions, _ = self.sampling_policy.inverse(z, y=o) sampled_actions.append(actions) if sampled_actions_q1 is None: sampled_actions_q1 = self.ac.q1(o, actions).view(-1, 1) sampled_actions_q2 = self.ac.q2(o, actions).view(-1, 1) else: sampled_actions_q1 = torch.cat( (sampled_actions_q1, self.ac.q1(o, actions).view(-1, 1)), dim=1) sampled_actions_q2 = torch.cat( (sampled_actions_q2, self.ac.q2(o, actions).view(-1, 1)), dim=1) q_values = torch.min(sampled_actions_q1, sampled_actions_q2) max_idx = torch.argmax(q_values.view(-1)) return sampled_actions[max_idx].detach().cpu().numpy() def test_agent(self): for j in range(self.num_test_episodes): o, d, ep_ret, ep_len = self.test_env.reset(), False, 0, 0 while not (d or (ep_len == self.max_ep_len)): # Take deterministic actions at test time o, r, d, _ = self.test_env.step(self.get_action(o, True)) ep_ret += r ep_len += 1 self.logger.store(TestEpRet=100 * self.test_env.get_normalized_score(ep_ret), TestEpLen=ep_len) def run(self): # Learn a generative model for data # density_epochs = 50 # self.sampling_policy = core.MADE(self.act_dim, 256, 2 , cond_label_size = self.obs_dim[0]) # density_optimizer = torch.optim.Adam(self.sampling_policy.parameters(), lr=1e-4, weight_decay=1e-6) # for i in range(density_epochs): # sample_indices = np.random.choice( # self.replay_buffer.size, self.replay_buffer.size) # np.random.shuffle(sample_indices) # ctr = 0 # total_loss = 0 # for j in range(0, self.replay_buffer.size, self.batch_size): # actions = self.replay_buffer.act_buf[sample_indices[ctr * self.batch_size:( # ctr + 1) * self.batch_size],:] # actions = torch.FloatTensor(actions) # obs = self.replay_buffer.obs_buf[sample_indices[ctr * self.batch_size:( # ctr + 1) * self.batch_size],:] # obs = torch.FloatTensor(obs) # density_optimizer.zero_grad() # loss = -self.sampling_policy.log_prob(actions,y=obs).mean() # loss.backward() # total_loss+=loss.data * self.batch_size # density_optimizer.step() # ctr+=1 # print("Density training loss: {}".format(total_loss/self.replay_buffer.size)) self.sampling_policy = core.MADE(self.act_dim, 256, 3, cond_label_size=self.obs_dim[0]) self.sampling_policy.load_state_dict( torch.load("behavior_policies/" + self.env_name + ".pt")) # self.sampling_policy = torch.load("marginals/"+self.env_name+".pt") # Prepare for interaction with environment total_steps = self.epochs * self.steps_per_epoch start_time = time.time() o, ep_ret, ep_len = self.env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # # Update handling batch = self.replay_buffer.sample_batch(self.batch_size) self.update(data=batch, update_timestep=t) # End of epoch handling if (t + 1) % self.steps_per_epoch == 0: epoch = (t + 1) // self.steps_per_epoch # Save model if (epoch % self.save_freq == 0) or (epoch == self.epochs): self.logger.save_state({'env': self.env}, None) # Test the performance of the deterministic version of the agent. 
                self.test_agent()

                # Log info about epoch
                self.logger.log_tabular('Epoch', epoch)
                self.logger.log_tabular('TestEpRet', with_min_and_max=True)
                self.logger.log_tabular('TestEpLen', average_only=True)
                self.logger.log_tabular('TotalUpdates', t)
                self.logger.log_tabular('Q1Vals', with_min_and_max=True)
                self.logger.log_tabular('Q2Vals', with_min_and_max=True)
                self.logger.log_tabular('LossQ', average_only=True)
                self.logger.log_tabular('Time', time.time() - start_time)
                self.logger.dump_tabular()
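# Hypothetical launch script for the EMAQ agent above. The d4rl task name, the
# env_fn lambda and the logger output_dir are assumptions for illustration; the
# agent also expects a matching pretrained behavior policy under behavior_policies/.
import gym
import d4rl  # registers the offline tasks with gym

env_name = 'halfcheetah-medium-v0'
agent = EMAQ(lambda: gym.make(env_name), env_name=env_name,
             steps_per_epoch=1000, epochs=100,
             logger_kwargs=dict(output_dir='data/emaq_halfcheetah'))
agent.populate_replay_buffer()   # copy the d4rl transitions into the buffer
agent.run()                      # offline updates with per-epoch evaluation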
class CQL: def __init__(self, env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=1000, epochs=10000, replay_size=int(2e6), gamma=0.99, polyak=0.995, lr=3e-4, p_lr=1e-4, alpha=0.2, batch_size=100, start_steps=10000, update_after=1000, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1,policy_eval_start=0, algo='CQL',min_q_weight=5, automatic_alpha_tuning=False): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. 
num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) self.env, self.test_env = env_fn(), env_fn() self.obs_dim = self.env.observation_space.shape self.act_dim = self.env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! self.act_limit = self.env.action_space.high[0] # Create actor-critic module and target networks self.ac = actor_critic(self.env.observation_space, self.env.action_space, **ac_kwargs) self.ac_targ = deepcopy(self.ac) self.gamma = gamma # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in self.ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) self.q_params = itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters()) # Experience buffer self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim, act_dim=self.act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(core.count_vars(module) for module in [self.ac.pi, self.ac.q1, self.ac.q2]) self.logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n'%var_counts) self.algo = algo self.lagrange_threshold = 10 self.penalty_lr = lr self.tune_lambda = True if 'lagrange' in self.algo else False if self.tune_lambda: print("Tuning Lambda") self.target_action_gap = self.lagrange_threshold self.log_lamda = torch.zeros(1, requires_grad=True, device=device) self.lamda_optimizer = torch.optim.Adam([self.log_lamda],lr=self.penalty_lr) self.lamda = self.log_lamda.exp() self.min_q_weight = 1.0 else: # self.lamda = min_q_weight self.min_q_weight = min_q_weight self.automatic_alpha_tuning = automatic_alpha_tuning if self.automatic_alpha_tuning is True: self.target_entropy = -torch.prod(torch.Tensor(self.env.action_space.shape)).item() self.log_alpha = torch.zeros(1, requires_grad=True, device=device) self.alpha_optim = Adam([self.log_alpha], lr=p_lr) self.alpha = self.log_alpha.exp() else: self.alpha = alpha # self.alpha = alpha # CWR does not require entropy in Q evaluation self.target_update_freq = 1 self.p_lr = p_lr self.lr=lr # Set up optimizers for policy and q-function self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.p_lr) self.q_optimizer = Adam(self.q_params, lr=self.lr) self.num_test_episodes = num_test_episodes self.max_ep_len = max_ep_len self.epochs= epochs self.steps_per_epoch = steps_per_epoch self.update_after = update_after self.update_every = update_every self.batch_size = batch_size self.save_freq = save_freq self.polyak = polyak self.softmax = torch.nn.Softmax(dim=1) self.softplus = torch.nn.Softplus(beta=1, threshold=20) self.policy_eval_start=policy_eval_start self._current_epoch=0 # Set up model saving self.logger.setup_pytorch_saver(self.ac) print("Running Offline RL algorithm: {}".format(self.algo)) def populate_replay_buffer(self): dataset = d4rl.qlearning_dataset(self.env) self.replay_buffer.obs_buf[:dataset['observations'].shape[0],:] = dataset['observations'] self.replay_buffer.act_buf[:dataset['actions'].shape[0],:] = dataset['actions'] 
self.replay_buffer.obs2_buf[:dataset['next_observations'].shape[0],:] = dataset['next_observations'] self.replay_buffer.rew_buf[:dataset['rewards'].shape[0]] = dataset['rewards'] self.replay_buffer.done_buf[:dataset['terminals'].shape[0]] = dataset['terminals'] self.replay_buffer.size = dataset['observations'].shape[0] self.replay_buffer.ptr = (self.replay_buffer.size+1)%(self.replay_buffer.max_size) # Set up function for computing SAC Q-losses def compute_loss_q(self, data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] q1 = self.ac.q1(o,a) q2 = self.ac.q2(o,a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = self.ac.pi(o2) # Target Q-values q1_pi_targ = self.ac_targ.q1(o2, a2) q2_pi_targ = self.ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + self.gamma * (1 - d) * (q_pi_targ - self.alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup)**2).mean() loss_q2 = ((q2 - backup)**2).mean() loss_q = loss_q1 + loss_q2 self.logger.store(CQLalpha=self.lamda) if 'rho' in self.algo: samples = 10 # Sample from previous policy (10 samples) o_rep = o.repeat_interleave(repeats=samples,dim=0) sample_actions, _ = self.ac.pi(o_rep) cql_loss_q1 = self.ac.q1(o_rep,sample_actions).reshape(-1,1) cql_loss_q2 = self.ac.q2(o_rep,sample_actions).reshape(-1,1) cql_loss_q1 = cql_loss_q1-np.log(samples) cql_loss_q2 = cql_loss_q2-np.log(samples) cql_loss_q1 = torch.logsumexp(cql_loss_q1,dim=1).mean()*self.min_q_weight cql_loss_q2 = torch.logsumexp(cql_loss_q2,dim=1).mean()*self.min_q_weight # Sample from dataset cql_loss_q1 -= self.ac.q1(o, a).mean()*self.min_q_weight cql_loss_q2 -= self.ac.q2(o, a).mean()*self.min_q_weight else: samples = 10 q1_pi_samples = None q2_pi_samples = None # Add samples from previous policy o_rep = o.repeat_interleave(repeats=samples,dim=0) o2_rep = o2.repeat_interleave(repeats=samples,dim=0) # o_rep = o.repeat_interleave(samples,1) # Samples from current policy sample_action, logpi = self.ac.pi(o_rep) q1_pi_samples = self.ac.q1(o_rep,sample_action).view(-1,1) - logpi.view(-1,1).detach() q2_pi_samples = self.ac.q2(o_rep,sample_action).view(-1,1) - logpi.view(-1,1).detach() q1_pi_samples = q1_pi_samples.view((o.shape[0],-1)) q2_pi_samples = q2_pi_samples.view((o.shape[0],-1)) sample_next_action, logpi_n = self.ac.pi(o2_rep) q1_next_pi_samples = self.ac.q1(o2_rep,sample_next_action).view(-1,1) - logpi_n.view(-1,1).detach() q2_next_pi_samples = self.ac.q2(o2_rep,sample_next_action).view(-1,1) - logpi_n.view(-1,1).detach() q1_next_pi_samples = q1_next_pi_samples.view((o2.shape[0],-1)) q2_next_pi_samples = q2_next_pi_samples.view((o2.shape[0],-1)) # Add samples from uniform sampling sample_action = np.random.uniform(low=self.env.action_space.low,high=self.env.action_space.high,size=(q1_pi_samples.shape[0]*10,self.env.action_space.high.shape[0])) sample_action = torch.FloatTensor(sample_action).to(device) log_pi = torch.FloatTensor([np.log(1/np.prod(self.env.action_space.high-self.env.action_space.low))]).to(device) q1_rand_samples = self.ac.q1(o_rep,sample_action).view(-1,1) - log_pi.view(-1,1).detach() q2_rand_samples = self.ac.q2(o_rep,sample_action).view(-1,1) - log_pi.view(-1,1).detach() q1_rand_samples = q1_rand_samples.view((o.shape[0],-1)) q2_rand_samples = q2_rand_samples.view((o.shape[0],-1)) cql_loss_q1 = torch.logsumexp(torch.cat([q1_pi_samples,q1_next_pi_samples,q1_rand_samples],dim=1),dim=1).mean()*self.min_q_weight cql_loss_q2 = 
torch.logsumexp(torch.cat((q2_pi_samples,q2_next_pi_samples,q2_rand_samples),dim=1),dim=1).mean()*self.min_q_weight # Sample from dataset cql_loss_q1 -= self.ac.q1(o, a).mean()*self.min_q_weight cql_loss_q2 -= self.ac.q2(o, a).mean()*self.min_q_weight # Update the cql-alpha if 'lagrange' in self.algo: cql_alpha = torch.clamp(self.log_lamda.exp(), min=0.0, max=1000000.0) self.lamda = cql_alpha.item() cql_loss_q1 = cql_alpha*(cql_loss_q1-self.target_action_gap) cql_loss_q2 = cql_alpha*(cql_loss_q2-self.target_action_gap) self.lamda_optimizer.zero_grad() lamda_loss = (-cql_loss_q1-cql_loss_q2)*0.5 lamda_loss.backward(retain_graph=True) self.lamda_optimizer.step() # print(self.log_lamda.exp()) avg_q = 0.5*(cql_loss_q1.mean() + cql_loss_q2.mean()).detach().cpu() loss_q += (cql_loss_q1.mean() + cql_loss_q2.mean()) # Useful info for logging q_info = dict(Q1Vals=q1.detach().cpu().numpy(), Q2Vals=q2.detach().cpu().numpy(), AvgQ = avg_q) return loss_q, q_info # Set up function for computing SAC pi loss def compute_loss_pi(self,data): o = data['obs'] a = data['act'] pi, logp_pi = self.ac.pi(o) q1_pi = self.ac.q1(o, pi) q2_pi = self.ac.q2(o, pi) q_pi = torch.min(q1_pi, q2_pi) loss_pi = (self.alpha * logp_pi - q_pi).mean() # TODO: Verify if this is needed if self._current_epoch<self.policy_eval_start: policy_log_prob = self.ac.pi.get_logprob(o, a) loss_pi = (self.alpha * logp_pi - policy_log_prob).mean() # Useful info for logging pi_info = dict(LogPi=logp_pi.detach().cpu().numpy()) return loss_pi, pi_info, logp_pi def update(self,data, update_timestep): self._current_epoch+=1 # First run one gradient descent step for Q1 and Q2 self.q_optimizer.zero_grad() loss_q, q_info = self.compute_loss_q(data) loss_q.backward() self.q_optimizer.step() # Record things self.logger.store(LossQ=loss_q.item(), **q_info) # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in self.q_params: p.requires_grad = False # Next run one gradient descent step for pi. self.pi_optimizer.zero_grad() loss_pi, pi_info, log_pi = self.compute_loss_pi(data) loss_pi.backward() self.pi_optimizer.step() if self.automatic_alpha_tuning: alpha_loss = -(self.log_alpha * (log_pi + self.target_entropy).detach()).mean() self.alpha_optim.zero_grad() alpha_loss.backward() self.alpha_optim.step() self.alpha = self.log_alpha.exp() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in self.q_params: p.requires_grad = True # Record things self.logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. if update_timestep%self.target_update_freq==0: with torch.no_grad(): for p, p_targ in zip(self.ac.parameters(), self.ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
                    p_targ.data.mul_(self.polyak)
                    p_targ.data.add_((1 - self.polyak) * p.data)

    def get_action(self, o, deterministic=False):
        return self.ac.act(torch.as_tensor(o, dtype=torch.float32).to(device),
                           deterministic)

    def test_agent(self):
        for j in range(self.num_test_episodes):
            o, d, ep_ret, ep_len = self.test_env.reset(), False, 0, 0
            while not (d or (ep_len == self.max_ep_len)):
                # Take deterministic actions at test time
                o, r, d, _ = self.test_env.step(self.get_action(o, True))
                ep_ret += r
                ep_len += 1
            self.logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            # self.logger.store(TestEpRet=100*self.test_env.get_normalized_score(ep_ret), TestEpLen=ep_len)

    def run(self):
        # Prepare for interaction with environment
        total_steps = self.epochs * self.steps_per_epoch
        start_time = time.time()
        o, ep_ret, ep_len = self.env.reset(), 0, 0

        # Main loop: collect experience in env and update/log each epoch
        for t in range(total_steps):
            # Update handling
            batch = self.replay_buffer.sample_batch(self.batch_size)
            self.update(data=batch, update_timestep=t)

            # End of epoch handling
            if (t + 1) % self.steps_per_epoch == 0:
                epoch = (t + 1) // self.steps_per_epoch

                # # Save model
                # if (epoch % self.save_freq == 0) or (epoch == self.epochs):
                #     self.logger.save_state({'env': self.env}, None)

                # Test the performance of the deterministic version of the agent.
                self.test_agent()

                # Log info about epoch
                self.logger.log_tabular('Epoch', epoch)
                self.logger.log_tabular('TestEpRet', with_min_and_max=True)
                self.logger.log_tabular('TestEpLen', average_only=True)
                self.logger.log_tabular('TotalUpdates', t)
                self.logger.log_tabular('Q1Vals', with_min_and_max=True)
                self.logger.log_tabular('Q2Vals', with_min_and_max=True)
                self.logger.log_tabular('LogPi', with_min_and_max=True)
                self.logger.log_tabular('LossPi', average_only=True)
                self.logger.log_tabular('LossQ', average_only=True)
                self.logger.log_tabular('CQLalpha', average_only=True)
                self.logger.log_tabular('Time', time.time() - start_time)
                self.logger.dump_tabular()

    def train(self, training_epochs):
        # Run a fixed number of gradient updates, then evaluate
        for t in range(training_epochs):
            batch = self.replay_buffer.sample_batch(self.batch_size)
            self.update(data=batch, update_timestep=t)
        self.test_agent()

    def collect_episodes(self, num_episodes):
        env_steps = 0
        for j in range(num_episodes):
            o, d, ep_ret, ep_len = self.env.reset(), False, 0, 0
            while not (d or (ep_len == self.max_ep_len)):
                act = self.get_action(o)
                no, r, d, _ = self.env.step(act)
                self.replay_buffer.store(o, act, r, no, d)
                env_steps += 1
                ep_len += 1
                # move on to the next observation
                o = no
        return env_steps

    def log_and_dump(self):
        # Log info about epoch
        self.logger.log_tabular('TestEpRet', with_min_and_max=True)
        self.logger.log_tabular('TestEpLen', average_only=True)
        self.logger.log_tabular('Q1Vals', with_min_and_max=True)
        self.logger.log_tabular('Q2Vals', with_min_and_max=True)
        self.logger.log_tabular('LogPi', with_min_and_max=True)
        self.logger.log_tabular('LossPi', average_only=True)
        self.logger.log_tabular('LossQ', average_only=True)
        self.logger.log_tabular('CQLalpha', average_only=True)
        self.logger.dump_tabular()
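# Hypothetical launch of the CQL agent defined above on a d4rl dataset. The task
# name, algo string and hyperparameters are illustrative choices, not the
# authors' reported settings.
import gym
import d4rl

env_name = 'hopper-medium-v0'
agent = CQL(lambda: gym.make(env_name), algo='CQL_rho', min_q_weight=5,
            steps_per_epoch=1000, epochs=1000,
            logger_kwargs=dict(output_dir='data/cql_hopper'))
agent.populate_replay_buffer()   # fill the buffer from the d4rl dataset
agent.run()                      # offline training with per-epoch evaluation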
class AWAC: def __init__(self, env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=100, epochs=10000, replay_size=int(2000000), gamma=0.99, polyak=0.995, lr=3e-4, p_lr=3e-4, alpha=0.0, batch_size=1024, start_steps=10000, update_after=0, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, algo='SAC'): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. 
num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) self.env, self.test_env = env_fn(), env_fn() self.obs_dim = self.env.observation_space.shape self.act_dim = self.env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! self.act_limit = self.env.action_space.high[0] # Create actor-critic module and target networks self.ac = actor_critic(self.env.observation_space, self.env.action_space, special_policy='awac', **ac_kwargs) self.ac_targ = actor_critic(self.env.observation_space, self.env.action_space, special_policy='awac', **ac_kwargs) self.ac_targ.load_state_dict(self.ac.state_dict()) self.gamma = gamma # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in self.ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) self.q_params = itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters()) # Experience buffer self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim, act_dim=self.act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [self.ac.pi, self.ac.q1, self.ac.q2]) self.logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) self.algo = algo self.p_lr = p_lr self.lr = lr self.alpha = 0 # # Algorithm specific hyperparams # Set up optimizers for policy and q-function self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.p_lr, weight_decay=1e-4) self.q_optimizer = Adam(self.q_params, lr=self.lr) self.num_test_episodes = num_test_episodes self.max_ep_len = max_ep_len self.epochs = epochs self.steps_per_epoch = steps_per_epoch self.update_after = update_after self.update_every = update_every self.batch_size = batch_size self.save_freq = save_freq self.polyak = polyak # Set up model saving self.logger.setup_pytorch_saver(self.ac) print("Running Offline RL algorithm: {}".format(self.algo)) def populate_replay_buffer(self, env_name): data_envs = { 'HalfCheetah-v2': ( "awac_data/hc_action_noise_15.npy", "awac_data/hc_off_policy_15_demos_100.npy"), 'Ant-v2': ( "awac_data/ant_action_noise_15.npy", "awac_data/ant_off_policy_15_demos_100.npy"), 'Walker2d-v2': ( "awac_data/walker_action_noise_15.npy", "awac_data/walker_off_policy_15_demos_100.npy"), } if env_name in data_envs: print('Loading saved data') for file in data_envs[env_name]: if not os.path.exists(file): warnings.warn(colored('Offline data not found. Follow awac_data/instructions.txt to download. 
Running without offline data.', 'red')) break data = np.load(file, allow_pickle=True) for demo in data: for transition in list(zip(demo['observations'], demo['actions'], demo['rewards'], demo['next_observations'], demo['terminals'])): self.replay_buffer.store(*transition) else: dataset = d4rl.qlearning_dataset(self.env) N = dataset['rewards'].shape[0] for i in range(N): self.replay_buffer.store(dataset['observations'][i], dataset['actions'][i], dataset['rewards'][i], dataset['next_observations'][i], float(dataset['terminals'][i])) print("Loaded dataset") # Set up function for computing SAC Q-losses def compute_loss_q(self, data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data['obs2'], data['done'] q1 = self.ac.q1(o, a) q2 = self.ac.q2(o, a) # Bellman backup for Q functions with torch.no_grad(): # Target actions come from *current* policy a2, logp_a2 = self.ac.pi(o2) # Target Q-values q1_pi_targ = self.ac_targ.q1(o2, a2) q2_pi_targ = self.ac_targ.q2(o2, a2) q_pi_targ = torch.min(q1_pi_targ, q2_pi_targ) backup = r + self.gamma * (1 - d) * (q_pi_targ - self.alpha * logp_a2) # MSE loss against Bellman backup loss_q1 = ((q1 - backup) ** 2).mean() loss_q2 = ((q2 - backup) ** 2).mean() loss_q = loss_q1 + loss_q2 # Useful info for logging q_info = dict(Q1Vals=q1.detach().numpy(), Q2Vals=q2.detach().numpy()) return loss_q, q_info # Set up function for computing SAC pi loss def compute_loss_pi(self, data): o = data['obs'] pi, logp_pi = self.ac.pi(o) q1_pi = self.ac.q1(o, pi) q2_pi = self.ac.q2(o, pi) v_pi = torch.min(q1_pi, q2_pi) beta = 2 q1_old_actions = self.ac.q1(o, data['act']) q2_old_actions = self.ac.q2(o, data['act']) q_old_actions = torch.min(q1_old_actions, q2_old_actions) adv_pi = q_old_actions - v_pi weights = F.softmax(adv_pi / beta, dim=0) policy_logpp = self.ac.pi.get_logprob(o, data['act']) loss_pi = (-policy_logpp * len(weights) * weights.detach()).mean() # Useful info for logging pi_info = dict(LogPi=policy_logpp.detach().numpy()) return loss_pi, pi_info def update(self, data, update_timestep): # First run one gradient descent step for Q1 and Q2 self.q_optimizer.zero_grad() loss_q, q_info = self.compute_loss_q(data) loss_q.backward() self.q_optimizer.step() # Record things self.logger.store(LossQ=loss_q.item(), **q_info) # Freeze Q-networks so you don't waste computational effort # computing gradients for them during the policy learning step. for p in self.q_params: p.requires_grad = False # Next run one gradient descent step for pi. self.pi_optimizer.zero_grad() loss_pi, pi_info = self.compute_loss_pi(data) loss_pi.backward() self.pi_optimizer.step() # Unfreeze Q-networks so you can optimize it at next DDPG step. for p in self.q_params: p.requires_grad = True # Record things self.logger.store(LossPi=loss_pi.item(), **pi_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(self.ac.parameters(), self.ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. 
p_targ.data.mul_(self.polyak) p_targ.data.add_((1 - self.polyak) * p.data) def get_action(self, o, deterministic=False): return self.ac.act(torch.as_tensor(o, dtype=torch.float32), deterministic) def test_agent(self): for j in range(self.num_test_episodes): o, d, ep_ret, ep_len = self.test_env.reset(), False, 0, 0 while not (d or (ep_len == self.max_ep_len)): # Take deterministic actions at test time o, r, d, _ = self.test_env.step(self.get_action(o, True)) ep_ret += r ep_len += 1 self.logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) # Get unnormalized score # self.logger.store(TestEpRet=100*self.test_env.get_normalized_score(ep_ret), TestEpLen=ep_len) # Get normalized score def run(self): # Prepare for interaction with environment total_steps = self.epochs * self.steps_per_epoch start_time = time.time() obs, ep_ret, ep_len = self.env.reset(), 0, 0 done = True num_train_episodes = 0 # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Reset stuff if necessary if done and t > 0: self.logger.store(ExplEpRet=ep_ret, ExplEpLen=ep_len) obs, ep_ret, ep_len = self.env.reset(), 0, 0 num_train_episodes += 1 # Collect experience act = self.get_action(obs, deterministic=False) next_obs, rew, done, info = self.env.step(act) self.replay_buffer.store(obs, act, rew, next_obs, done) obs = next_obs # Update handling if t > self.update_after and t % self.update_every == 0: for _ in range(self.update_every): batch = self.replay_buffer.sample_batch(self.batch_size) self.update(data=batch, update_timestep=t) # End of epoch handling if (t + 1) % self.steps_per_epoch == 0: epoch = (t + 1) // self.steps_per_epoch # Save model if (epoch % self.save_freq == 0) or (epoch == self.epochs): self.logger.save_state({'env': self.env}, None) # Test the performance of the deterministic version of the agent. self.test_agent() # Log info about epoch self.logger.log_tabular('Epoch', epoch) self.logger.log_tabular('TestEpRet', with_min_and_max=True) self.logger.log_tabular('TestEpLen', average_only=True) self.logger.log_tabular('TotalUpdates', t) self.logger.log_tabular('Q1Vals', with_min_and_max=True) self.logger.log_tabular('Q2Vals', with_min_and_max=True) self.logger.log_tabular('LogPi', with_min_and_max=True) self.logger.log_tabular('LossPi', average_only=True) self.logger.log_tabular('LossQ', average_only=True) self.logger.log_tabular('Time', time.time() - start_time) self.logger.dump_tabular()
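# Hypothetical AWAC launch. populate_replay_buffer takes the env name so it can
# pick the matching awac_data demo files or fall back to the d4rl dataset; the
# env name and output_dir below are assumptions for illustration.
import gym

env_name = 'HalfCheetah-v2'
agent = AWAC(lambda: gym.make(env_name), steps_per_epoch=100, epochs=1000,
             logger_kwargs=dict(output_dir='data/awac_halfcheetah'))
agent.populate_replay_buffer(env_name)   # offline demos / d4rl transitions
agent.run()   # offline updates interleaved with online data collection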
class Meta_control(object): def __init__(self, **kwargs): for key, value in kwargs.items(): setattr(self, key, value) state_dim = self.env.observation_space.shape[0] action_dim = self.weight_dim # self.env.action_space.shape[0] # print(state_dim, action_dim) # initialize value funciton self.value_Network = Value_Network(state_dim, 256, 1).to(self.device) self.value_net_optimizer = optim.RMSprop( self.value_Network.parameters(), lr=self.value_net_lr) # self.value_net_optimizer = optim.Adam(self.value_Network.parameters(), lr=self.value_net_lr) self.value_Network_target = Value_Network(state_dim, 256, 1).to(self.device) self.value_Network_target.load_state_dict( self.value_Network.state_dict()) self.value_net_loss_func = nn.MSELoss() self.value_net_optimizer.zero_grad() # initialize Q funciton self.soft_Q_Network = Soft_Q_Network(state_dim + action_dim, 256, action_dim).to(self.device) self.soft_Q_net_optimizer = optim.RMSprop( self.soft_Q_Network.parameters(), lr=self.soft_Q_net_lr) # self.soft_Q_net_optimizer = optim.Adam(self.soft_Q_Network.parameters(), lr=self.soft_Q_net_lr) self.soft_Q_Network_target = Soft_Q_Network(state_dim + action_dim, 256, action_dim).to(self.device) self.soft_Q_Network_target.load_state_dict( self.soft_Q_Network.state_dict()) self.soft_Q_net_loss_func = nn.MSELoss() self.soft_Q_net_optimizer.zero_grad() # initialize policy network self.policy_Network = Policy_Network(state_dim, 256, action_dim).to(self.device) self.policy_net_optimizer = optim.RMSprop( self.policy_Network.parameters(), lr=self.policy_net_lr) # self.policy_net_optimizer = optim.Adam(self.policy_Network.parameters(), lr=self.policy_net_lr) self.policy_net_optimizer.zero_grad() # initialize replay buffer self.replay_Buffer = ReplayBuffer(self.replay_buffer_size) # synchronize the parameters of networks in all threads # sync_all_params(self.value_Network.parameters()) # sync_all_params(self.soft_Q_Network.parameters()) # sync_all_params(self.policy_Network.parameters()) # sync_all_params(self.value_Network_target.parameters()) # sync_all_params(self.soft_Q_Network_target.parameters()) def act(self, state): # print(state) state_tensor = torch.tensor(state, dtype=torch.float).unsqueeze(0).to( self.device) mean, log_std = self.policy_Network(state_tensor) normal_distribution = Normal(mean, log_std.exp()) action_sample = normal_distribution.sample() # action_normalized = torch.softmax(action_sample, dim=0) action = torch.tanh(action_sample).squeeze(0).detach().cpu().numpy() return action def value_Network_backward(self, state, log_prob, soft_Q_value): value_predict = self.value_Network(state) value_label = soft_Q_value - log_prob # self.temperature * value_loss = self.value_net_loss_func(value_predict, value_label.detach()) value_loss.backward() # if self.learn_times % self.ave_gradient_times == self.ave_gradient_times - 1: # average_gradients(self.value_net_optimizer.param_groups) # average the gradients of all threads def soft_Q_Network_backward(self, state, action, reward, nxt_state): soft_Q_predict = self.soft_Q_Network(state, action) soft_Q_label = reward + self.discount * self.value_Network_target( nxt_state) soft_Q_loss = self.soft_Q_net_loss_func(soft_Q_predict, soft_Q_label) # print(soft_Q_loss) soft_Q_loss.backward() # if self.learn_times % self.ave_gradient_times == self.ave_gradient_times - 1: # average_gradients(self.soft_Q_net_optimizer.param_groups) # average the gradients of all threads def policy_Network_backward(self, log_prob, soft_Q_value): policy_loss = -torch.mean(soft_Q_value - 
self.temperature * log_prob) ## -mean(Q - H) = -mean(V) policy_loss.backward() # if self.learn_times % self.ave_gradient_times == self.ave_gradient_times - 1: # average_gradients(self.policy_net_optimizer.param_groups) # average the gradients of all threads # print(self.learn_times, 'ave gradients') def target_network_update(self, target_net, eval_net): for target_params, eval_params in zip(target_net.parameters(), eval_net.parameters()): target_params.data.copy_(target_params.data * (1.0 - self.target_update) + eval_params * self.target_update) def backward(self): if self.replay_Buffer.__len__() < self.batch_size: return state, action, reward, nxt_state = self.replay_Buffer.sample( self.batch_size) state_tensor = torch.tensor(state, dtype=torch.float).to(self.device) action_tensor = torch.tensor(action, dtype=torch.float).to(self.device) reward_tensor = torch.tensor( reward, dtype=torch.float).unsqueeze(1).to(self.device) nxt_state_tensor = torch.tensor(nxt_state, dtype=torch.float).to(self.device) action_sample, log_prob, _ = self.policy_Network.evaluate(state_tensor) # soft_Q_value = self.soft_Q_Network(state_tensor, action_sample) soft_Q_value = self.soft_Q_Network_target(state_tensor, action_sample) self.value_Network_backward(state_tensor, log_prob, soft_Q_value) self.soft_Q_Network_backward(state_tensor, action_tensor, reward_tensor, nxt_state_tensor) self.policy_Network_backward(log_prob, soft_Q_value) self.target_network_update(self.value_Network_target, self.value_Network) self.target_network_update(self.soft_Q_Network_target, self.soft_Q_Network) def step(self): self.value_net_optimizer.step() self.soft_Q_net_optimizer.step() self.policy_net_optimizer.step() self.value_net_optimizer.zero_grad() self.soft_Q_net_optimizer.zero_grad() self.policy_net_optimizer.zero_grad() def reserve_network(self, folder, episode): torch.save(self.value_Network.state_dict(), folder + 'E' + str(episode) + '_sac_value_Network.pkl') # torch.save(self.soft_Q_Network.state_dict(), folder + 'E' + str(episode) + '_soft_Q_Network.pkl') # torch.save(self.policy_Network.state_dict(), folder + 'E' + str(episode) + '_policy_Network.pkl') # # np.savetxt(folder + 'reward.txt', self.reward_buf) def load_network(self, folder, episode): self.value_Network.load_state_dict( torch.load(folder + 'E' + str(episode) + '_sac_value_Network.pkl')) self.soft_Q_Network.load_state_dict( torch.load(folder + 'E' + str(episode) + '_soft_Q_Network.pkl')) self.policy_Network.load_state_dict( torch.load(folder + 'E' + str(episode) + '_policy_Network.pkl')) self.value_Network_target.load_state_dict( self.value_Network.state_dict()) self.soft_Q_Network_target.load_state_dict( self.soft_Q_Network.state_dict()) # for target_params, eval_params in zip(self.soft_Q_Network_target.parameters(), self.soft_Q_Network.parameters()): # target_params.data.copy_(eval_params) # for target_params, eval_params in zip(self.value_Network_target.parameters(), self.value_Network.parameters()): # target_params.data.copy_(eval_params) # self.reward_buf = list(np.loadtxt(folder + 'reward.txt')) # def sync_multi_thread(self): # # synchronize the parameters of networks in all threads # sync_all_params(self.value_Network.parameters()) # sync_all_params(self.soft_Q_Network.parameters()) # sync_all_params(self.policy_Network.parameters()) # sync_all_params(self.value_Network_target.parameters()) # sync_all_params(self.soft_Q_Network_target.parameters()) def logger_setup(self, logger_kwargs, **kwargs): self.logger = EpochLogger(**logger_kwargs) for key, value in 
kwargs.items():
            if key != 'env' and key != 'output_dir':
                self.logger.log_tabular(key, value)

    def logger_update(self, kwargs):
        for key, value in kwargs.items():
            self.logger.log_tabular(key, value)
        self.logger.dump_tabular()
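# Hypothetical construction of Meta_control. The class configures itself purely
# through **kwargs (via setattr), so every keyword below is an assumption about
# what the surrounding training script passes in (env, weight_dim, device,
# learning rates, buffer size, discount, temperature, target_update, ...).
import gym
import torch

controller = Meta_control(env=gym.make('Pendulum-v0'), weight_dim=4,
                          device=torch.device('cpu'),
                          value_net_lr=3e-4, soft_Q_net_lr=3e-4, policy_net_lr=3e-4,
                          replay_buffer_size=int(1e5), batch_size=128,
                          discount=0.99, temperature=0.2, target_update=0.005)
state = controller.env.reset()
action_weights = controller.act(state)   # tanh-squashed sample from the Gaussian policy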
class Dqn:
    def __init__(self, env_name, train_step=200, evaluation_step=1000, max_ep_len=200,
                 epsilon_train=0.1, epsilon_eval=0.01, batch_size=32, replay_size=1e6,
                 epsilon_decay_period=100, warmup_steps=0, iteration=200, gamma=0.99,
                 target_update_period=50, update_period=10, logger_kwargs=dict()):
        self.logger = EpochLogger(**logger_kwargs)
        self.logger.save_config(locals())

        self.env = gym.make(env_name)
        self.train_step = train_step
        self.evaluation_step = evaluation_step
        self.max_ep_len = max_ep_len
        self.epsilon_train = epsilon_train
        self.epsilon_eval = epsilon_eval
        self.batch_size = batch_size
        self.replay_size = replay_size
        self.epsilon_decay_period = epsilon_decay_period
        self.warmup_steps = warmup_steps
        self.iteration = iteration
        self.replay_buffer = ReplayBuffer(replay_size)
        self.gamma = gamma
        self.target_update_period = target_update_period
        self.update_period = update_period

        self.build_model()
        self.cur_train_step = 0

        if debug_mode:
            self.summary = tf.summary.FileWriter(os.path.join(self.logger.output_dir, "logs"))
            self.sess = tf.Session()
            self.loss = tf.placeholder(tf.float32, shape=[])
            self.q = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n])
            self.q_target = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n])
            self.target_q = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n])
            tf.summary.scalar("loss", self.loss)
            tf.summary.histogram("q", self.q)
            tf.summary.histogram("q_target", self.q_target)
            tf.summary.histogram("target_q", self.target_q)
            self.merge = tf.summary.merge_all()

    def build_model(self):
        self.input_shape = self.env.observation_space.shape
        self.model, self.model_target = mlp_dqn(self.env.action_space.n, self.input_shape)
        self.model_target.set_weights(self.model.get_weights())

    def choose_action(self, s, eval_mode=False):
        epsilon = self.epsilon_eval if eval_mode \
            else linearly_decaying_epsilon(self.epsilon_decay_period, self.cur_train_step,
                                           self.warmup_steps, self.epsilon_train)
        # print("epsilon:", epsilon)
        if random.random() <= 1 - epsilon:
            q = self.model.predict(s[np.newaxis, :])
            a = np.argmax(q, axis=1)[0]
        else:
            a = self.env.action_space.sample()
        return a

    def run_one_phrase(self, min_step, eval_mode=False):
        step = 0
        episode = 0
        reward = 0.

        while step < min_step:
            reward_episode = 0.
step_episode = 0 done = False obs = self.env.reset() # o = np.array(obs) while not done: a = self.choose_action(np.array(obs), eval_mode) obs_, r, done, _ = self.env.step(a) step += 1 step_episode += 1 reward += r reward_episode += r if not eval_mode: self.cur_train_step += 1 self.replay_buffer.add(np.array(obs), a, np.array(obs_), r, done) if self.cur_train_step > 100: if self.cur_train_step % self.update_period == 0: # data = self.replay_buffer.sample() (s, a, s_, r, d) = self.replay_buffer.sample() target_q = self.model_target.predict(s_) q_ = np.max(target_q, axis=1) q_target = r + (1-d) *self.gamma * q_ q = self.model.predict(s) ori_q = np.copy(q) batch_index = np.arange(self.batch_size) q[batch_index, a] = q_target result = self.model.train_on_batch(np.array(s), q) if debug_mode: merge = self.sess.run(self.merge, feed_dict={self.loss: result[0], self.q: ori_q, self.q_target: q, self.target_q: target_q}) self.summary.add_summary(merge, (self.cur_train_step-100)/self.update_period) # print("result:", result) if self.cur_train_step % self.target_update_period == 0: self.model_target.set_weights(self.model.get_weights()) if step_episode >= self.max_ep_len: break obs = obs_ episode += 1 # print("ep:", episode, "step:", step, "r:", reward) if not eval_mode: self.logger.store(step=step_episode, reward=reward_episode) return reward, episode def train_test(self): for i in range(self.iteration): print("iter:", i+1) self.logger.store(iter=i+1) reward, episode = self.run_one_phrase(self.train_step) print("reward:", reward/episode, "episode:", episode) self.logger.log_tabular("iter", i+1) self.logger.log_tabular("reward", with_min_and_max=True) self.logger.log_tabular("step", with_min_and_max=True) self.logger.dump_tabular() reward, episode = self.run_one_phrase(self.evaluation_step, True) print("reward:", reward / episode, "episode:", episode)
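choose_action relies on linearly_decaying_epsilon, which is defined elsewhere in the repo. A plausible sketch consistent with how it is called above (fully random during warmup, linear decay over the decay period, then a constant floor); the exact implementation may differ:

# Sketch of a linear epsilon schedule matching the call
# linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon) above.
# Not necessarily the repo's exact implementation.
import numpy as np

def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon):
    # During warmup the bonus is clipped to (1 - epsilon), i.e. act fully at random.
    steps_left = decay_period + warmup_steps - step
    bonus = (1.0 - epsilon) * steps_left / decay_period
    bonus = np.clip(bonus, 0.0, 1.0 - epsilon)
    return epsilon + bonus

# Example: decay_period=100, warmup_steps=0, epsilon=0.1
# step 0 -> 1.0, step 50 -> ~0.55, step >= 100 -> 0.1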
class Dqn: def __init__(self, env_name, train_step=250000/4, evaluation_step=125000/4, max_ep_len=27000/4, epsilon_train=0.1, epsilon_eval=0.01, batch_size=32, replay_size=1e6, epsilon_decay_period=250000/4, warmup_steps=20000/4, iteration=200, gamma=0.99, target_update_period=8000/4, update_period=4, logger_kwargs=dict()): self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) # self.env = make_atari(env_name) # self.env = wrap_deepmind(self.env, frame_stack=True) self.env = gym.make(env_name) env = self.env.env self.env = AtariPreprocessing(env) self.train_step = train_step self.evaluation_step = evaluation_step self.max_ep_len = max_ep_len self.epsilon_train = epsilon_train self.epsilon_eval = epsilon_eval self.batch_size = batch_size self.replay_size = replay_size self.epsilon_decay_period = epsilon_decay_period self.warmup_steps = warmup_steps self.iteration = iteration self.replay_buffer = ReplayBuffer(replay_size) self.gamma = gamma self.target_update_period = target_update_period self.update_period = update_period self.build_model() self.cur_train_step = 0 self.observation_shape = (84, 84) self.state_shape = (1,) + self.observation_shape + (4,) self.s = np.zeros(self.state_shape) self.last_s = np.zeros(self.state_shape) if debug_mode: self.summary = tf.summary.FileWriter(os.path.join(self.logger.output_dir, "logs")) self.sess = tf.Session() self.loss = tf.placeholder(tf.float32, shape=[]) self.q = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n]) self.q_target = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n]) self.target_q = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n]) tf.summary.scalar("loss", self.loss) # tf.summary.histogram("q", self.q) # tf.summary.histogram("q_target", self.q_target) # tf.summary.histogram("target_q", self.target_q) self.merge = tf.summary.merge_all() def build_model(self): self.model, self.model_target = nature_dqn(self.env.action_space.n) self.model_target.set_weights(self.model.get_weights()) def choose_action(self, s, eval_mode=False): epsilon = self.epsilon_eval if eval_mode \ else linearly_decaying_epsilon(self.epsilon_decay_period, self.cur_train_step, self.warmup_steps, self.epsilon_train) if random.random() <= 1-epsilon: q = self.model.predict(s[np.newaxis, :]) a = np.argmax(q, axis=1)[0] # print() else: a = self.env.action_space.sample() return a def record_obs(self, observation): self.last_s = copy.copy(self.s) self.s = np.roll(self.s, -1, axis=-1) self.s[0, ..., -1] = np.squeeze(observation) def store(self, s, a, s_, r, done): pass def run_one_phrase(self, min_step, eval_mode=False): step = 0 episode = 0 reward = 0. 
while step < min_step: done = False obs = self.env.reset() # o = np.array(obs) step_episode = 0 reward_episode = 0 while not done: a = self.choose_action(np.array(obs), eval_mode) obs_, r, done, _ = self.env.step(a) step += 1 step_episode += 1 reward += r reward_episode += r if not eval_mode: self.cur_train_step += 1 self.replay_buffer.add(np.array(obs), a, np.array(obs_), r, done) if self.cur_train_step > 20000/4: if self.cur_train_step % self.update_period == 0: # data = self.replay_buffer.sample() (s, a, s_, r, d) = self.replay_buffer.sample() q_ = np.max(self.model_target.predict(s_), axis=1) q_target = r + (1-d)*self.gamma * q_ q = self.model.predict(s) batch_index = np.arange(self.batch_size) q[batch_index, a] = q_target result = self.model.train_on_batch(np.array(s), q) # print("result:", result) merge = self.sess.run(self.merge, feed_dict={self.loss: result[0]}) self.summary.add_summary(merge, (self.cur_train_step-20000)/self.update_period) if self.cur_train_step % self.target_update_period == 0: self.model_target.set_weights(self.model.get_weights()) if step_episode >= self.max_ep_len: break obs = obs_ episode += 1 # print("ep:", episode, "step:", step, "r:", reward) self.logger.store(step=step_episode, reward=reward_episode) return reward, episode def train_test(self): for i in range(self.iteration): print("iter:", i+1) self.logger.store(iter=i+1) reward, episode = self.run_one_phrase(self.train_step) print("reward:", reward/episode, "episode:", episode) self.logger.log_tabular("reward", with_min_and_max=True) self.logger.log_tabular("step", with_min_and_max=True) self.logger.dump_tabular() reward, episode = self.run_one_phrase(self.evaluation_step, True) print("reward:", reward / episode, "episode:", episode)
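The inner update above regresses the online network toward the one-step Q-learning target y = r + (1 - done) * gamma * max_a' Q_target(s', a'), overwriting only the entry of the taken action. A small numpy sketch of that target construction (shapes and names are illustrative):

# Sketch of the one-step Q-learning regression targets used above.
# q_pred: online-network Q-values for current states, shape [batch, n_actions]
# q_next: target-network Q-values for next states, shape [batch, n_actions]
import numpy as np

def build_dqn_targets(q_pred, q_next, actions, rewards, dones, gamma=0.99):
    y = rewards + (1.0 - dones) * gamma * np.max(q_next, axis=1)
    targets = q_pred.copy()
    targets[np.arange(len(actions)), actions] = y  # only the taken action is regressed
    return targets

# Example with a batch of 2 transitions and 3 actions:
q_pred = np.zeros((2, 3))
q_next = np.array([[1.0, 2.0, 0.5], [0.0, 0.0, 3.0]])
targets = build_dqn_targets(q_pred, q_next, actions=np.array([0, 2]),
                            rewards=np.array([1.0, 0.0]),
                            dones=np.array([0.0, 1.0]))
# targets[0, 0] == 1.0 + 0.99 * 2.0 == 2.98, targets[1, 2] == 0.0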
def valor(args): if not hasattr(args, "get"): args.get = args.__dict__.get env_fn = args.get('env_fn', lambda: gym.make('HalfCheetah-v2')) actor_critic = args.get('actor_critic', ActorCritic) ac_kwargs = args.get('ac_kwargs', {}) disc = args.get('disc', Discriminator) dc_kwargs = args.get('dc_kwargs', {}) seed = args.get('seed', 0) episodes_per_epoch = args.get('episodes_per_epoch', 40) epochs = args.get('epochs', 50) gamma = args.get('gamma', 0.99) pi_lr = args.get('pi_lr', 3e-4) vf_lr = args.get('vf_lr', 1e-3) dc_lr = args.get('dc_lr', 2e-3) train_v_iters = args.get('train_v_iters', 80) train_dc_iters = args.get('train_dc_iters', 50) train_dc_interv = args.get('train_dc_interv', 2) lam = args.get('lam', 0.97) max_ep_len = args.get('max_ep_len', 1000) logger_kwargs = args.get('logger_kwargs', {}) context_dim = args.get('context_dim', 4) max_context_dim = args.get('max_context_dim', 64) save_freq = args.get('save_freq', 10) k = args.get('k', 1) logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape ac_kwargs['action_space'] = env.action_space # Model actor_critic = actor_critic(input_dim=obs_dim[0] + max_context_dim, **ac_kwargs) disc = disc(input_dim=obs_dim[0], context_dim=max_context_dim, **dc_kwargs) # Buffer local_episodes_per_epoch = episodes_per_epoch # int(episodes_per_epoch / num_procs()) buffer = Buffer(max_context_dim, obs_dim[0], act_dim[0], local_episodes_per_epoch, max_ep_len, train_dc_interv) # Count variables var_counts = tuple( count_vars(module) for module in [actor_critic.policy, actor_critic.value_f, disc.policy]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d, \t d: %d\n' % var_counts) # Optimizers #Optimizer for RL Policy train_pi = torch.optim.Adam(actor_critic.policy.parameters(), lr=pi_lr) #Optimizer for value function (for actor-critic) train_v = torch.optim.Adam(actor_critic.value_f.parameters(), lr=vf_lr) #Optimizer for decoder train_dc = torch.optim.Adam(disc.policy.parameters(), lr=dc_lr) #pdb.set_trace() # Parameters Sync #sync_all_params(actor_critic.parameters()) #sync_all_params(disc.parameters()) ''' Training function ''' def update(e): obs, act, adv, pos, ret, logp_old = [ torch.Tensor(x) for x in buffer.retrieve_all() ] # Policy #pdb.set_trace() _, logp, _ = actor_critic.policy(obs, act, batch=False) #pdb.set_trace() entropy = (-logp).mean() # Policy loss pi_loss = -(logp * (k * adv + pos)).mean() # Train policy (Go through policy update) train_pi.zero_grad() pi_loss.backward() # average_gradients(train_pi.param_groups) train_pi.step() # Value function v = actor_critic.value_f(obs) v_l_old = F.mse_loss(v, ret) for _ in range(train_v_iters): v = actor_critic.value_f(obs) v_loss = F.mse_loss(v, ret) # Value function train train_v.zero_grad() v_loss.backward() # average_gradients(train_v.param_groups) train_v.step() # Discriminator if (e + 1) % train_dc_interv == 0: print('Discriminator Update!') con, s_diff = [torch.Tensor(x) for x in buffer.retrieve_dc_buff()] _, logp_dc, _ = disc(s_diff, con) d_l_old = -logp_dc.mean() # Discriminator train for _ in range(train_dc_iters): _, logp_dc, _ = disc(s_diff, con) d_loss = -logp_dc.mean() train_dc.zero_grad() d_loss.backward() # average_gradients(train_dc.param_groups) train_dc.step() _, logp_dc, _ = disc(s_diff, con) dc_l_new = -logp_dc.mean() else: d_l_old = 0 dc_l_new = 0 # Log the changes _, logp, _, v = actor_critic(obs, act) 
pi_l_new = -(logp * (k * adv + pos)).mean() v_l_new = F.mse_loss(v, ret) kl = (logp_old - logp).mean() logger.store(LossPi=pi_loss, LossV=v_l_old, KL=kl, Entropy=entropy, DeltaLossPi=(pi_l_new - pi_loss), DeltaLossV=(v_l_new - v_l_old), LossDC=d_l_old, DeltaLossDC=(dc_l_new - d_l_old)) # logger.store(Adv=adv.reshape(-1).numpy().tolist(), Pos=pos.reshape(-1).numpy().tolist()) start_time = time.time() #Resets observations, rewards, done boolean o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 #Creates context distribution where each logit is equal to one (This is first place to make change) context_dim_prob_dict = { i: 1 / context_dim if i < context_dim else 0 for i in range(max_context_dim) } last_phi_dict = {i: 0 for i in range(context_dim)} context_dist = Categorical( probs=torch.Tensor(list(context_dim_prob_dict.values()))) total_t = 0 for epoch in range(epochs): #Sets actor critic and decoder (discriminator) into eval mode actor_critic.eval() disc.eval() #Runs the policy local_episodes_per_epoch before updating the policy for index in range(local_episodes_per_epoch): # Sample from context distribution and one-hot encode it (Step 2) # Every time we run the policy we sample a new context c = context_dist.sample() c_onehot = F.one_hot(c, max_context_dim).squeeze().float() for _ in range(max_ep_len): concat_obs = torch.cat( [torch.Tensor(o.reshape(1, -1)), c_onehot.reshape(1, -1)], 1) ''' Feeds in observation and context into actor_critic which spits out a distribution Label is a sample from the observation pi is the action sampled logp is the log probability of some other action a logp_pi is the log probability of pi v_t is the value function ''' a, _, logp_t, v_t = actor_critic(concat_obs) #Stores context and all other info about the state in the buffer buffer.store(c, concat_obs.squeeze().detach().numpy(), a.detach().numpy(), r, v_t.item(), logp_t.detach().numpy()) logger.store(VVals=v_t) o, r, d, _ = env.step(a.detach().numpy()[0]) ep_ret += r ep_len += 1 total_t += 1 terminal = d or (ep_len == max_ep_len) if terminal: # Key stuff with discriminator dc_diff = torch.Tensor(buffer.calc_diff()).unsqueeze(0) #Context con = torch.Tensor([float(c)]).unsqueeze(0) #Feed in differences between each state in your trajectory and a specific context #Here, this is just the log probability of the label it thinks it is _, _, log_p = disc(dc_diff, con) buffer.end_episode(log_p.detach().numpy()) logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, [actor_critic, disc], None) # Sets actor_critic and discriminator into training mode actor_critic.train() disc.train() update(epoch) #Need to implement curriculum learning here to update context distribution ''' #Psuedocode: Loop through each of d episodes taken in local_episodes_per_epoch and check log probability from discrimantor If >= 0.86, increase k in the following manner: k = min(int(1.5*k + 1), Kmax) Kmax = 64 ''' decoder_accs = [] stag_num = 10 stag_pct = 0.05 if (epoch + 1) % train_dc_interv == 0 and epoch > 0: #pdb.set_trace() con, s_diff = [torch.Tensor(x) for x in buffer.retrieve_dc_buff()] print("Context: ", con) print("num_contexts", len(con)) _, logp_dc, _ = disc(s_diff, con) log_p_context_sample = logp_dc.mean().detach().numpy() print("Log Probability context sample", log_p_context_sample) decoder_accuracy = np.exp(log_p_context_sample) print("Decoder Accuracy", decoder_accuracy) 
logger.store(LogProbabilityContext=log_p_context_sample, DecoderAccuracy=decoder_accuracy) ''' Create score (phi(i)) = -log_p_context_sample.mean() for each specific context Assign phis to each specific context Get p(i) in the following manner: (phi(i) + epsilon) Get Probabilities by doing p(i)/sum of all p(i)'s ''' logp_np = logp_dc.detach().numpy() con_np = con.detach().numpy() phi_dict = {i: 0 for i in range(context_dim)} count_dict = {i: 0 for i in range(context_dim)} for i in range(len(logp_np)): current_con = con_np[i] phi_dict[current_con] += logp_np[i] count_dict[current_con] += 1 print(phi_dict) phi_dict = { k: last_phi_dict[k] if count_dict[k] == 0 else (-1) * v / count_dict[k] for (k, v) in phi_dict.items() } sorted_dict = dict( sorted(phi_dict.items(), key=lambda item: item[1], reverse=True)) sorted_dict_keys = list(sorted_dict.keys()) rank_dict = { sorted_dict_keys[i]: 1 / (i + 1) for i in range(len(sorted_dict_keys)) } rank_dict_sum = sum(list(rank_dict.values())) context_dim_prob_dict = { k: rank_dict[k] / rank_dict_sum if k < context_dim else 0 for k in context_dim_prob_dict.keys() } print(context_dim_prob_dict) decoder_accs.append(decoder_accuracy) stagnated = (len(decoder_accs) > stag_num and (decoder_accs[-stag_num - 1] - decoder_accuracy) / stag_num < stag_pct) if stagnated: new_context_dim = max(int(0.75 * context_dim), 5) elif decoder_accuracy >= 0.86: new_context_dim = min(int(1.5 * context_dim + 1), max_context_dim) if stagnated or decoder_accuracy >= 0.86: print("new_context_dim: ", new_context_dim) new_context_prob_arr = np.array( new_context_dim * [1 / new_context_dim] + (max_context_dim - new_context_dim) * [0]) context_dist = Categorical( probs=ptu.from_numpy(new_context_prob_arr)) context_dim = new_context_dim for i in range(context_dim): if i in phi_dict: last_phi_dict[i] = phi_dict[i] elif i not in last_phi_dict: last_phi_dict[i] = max(phi_dict.values()) buffer.clear_dc_buff() else: logger.store(LogProbabilityContext=0, DecoderAccuracy=0) # Log logger.store(ContextDim=context_dim) logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', total_t) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('LossDC', average_only=True) logger.log_tabular('DeltaLossDC', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.log_tabular('LogProbabilityContext', average_only=True) logger.log_tabular('DecoderAccuracy', average_only=True) logger.log_tabular('ContextDim', average_only=True) logger.dump_tabular()
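The curriculum block above grows the number of active contexts when the decoder is accurate and shrinks it when accuracy stagnates. A minimal sketch of that schedule in isolation, using the 0.86 threshold and the growth/shrink factors from the code above (the stagnation test itself is simplified to a boolean input):

# Sketch of the context-dimension curriculum: grow the number of active skills
# when the decoder is accurate, shrink it when progress stalls.
def next_context_dim(context_dim, decoder_accuracy, stagnated,
                     max_context_dim=64, acc_threshold=0.86):
    if stagnated:
        return max(int(0.75 * context_dim), 5)
    if decoder_accuracy >= acc_threshold:
        return min(int(1.5 * context_dim + 1), max_context_dim)
    return context_dim

# Example: starting from 4 contexts with a well-performing decoder, the schedule
# visits 4 -> 7 -> 11 -> 17 -> 26 -> 40 -> 61 -> 64 (capped).
assert next_context_dim(4, 0.9, False) == 7
assert next_context_dim(61, 0.9, False) == 64
assert next_context_dim(40, 0.5, True) == 30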
def vpg(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, lam=0.97, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): """ Vanilla Policy Gradient (with GAE-Lambda for advantage estimation) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to VPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = VPGBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n'%var_counts) # VPG objectives pi_loss = -tf.reduce_mean(logp * adv_ph) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean(logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean(-logp) # a sample estimate for entropy, also easy to compute # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k:v for k,v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Policy gradient step sess.run(train_pi, feed_dict=inputs) # Value function learning for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl = sess.run([pi_loss, v_loss, approx_kl], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1,-1)}) o2, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) # Update obs (critical!) o = o2 terminal = d or (ep_len == max_ep_len) if terminal or (t==local_steps_per_epoch-1): if not(terminal): print('Warning: trajectory cut off by epoch at %d steps.'%ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = 0 if d else sess.run(v, feed_dict={x_ph: o.reshape(1,-1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs-1): logger.save_state({'env': env}, None) # Perform VPG update! 
update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch+1)*steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time()-start_time) logger.dump_tabular()
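The vpg docstring refers to GAE-Lambda advantage estimation, which happens inside VPGBuffer.finish_path and is not shown here. An illustrative numpy sketch of the estimator, using the usual definitions delta_t = r_t + gamma*V(s_{t+1}) - V(s_t) and A_t = sum_l (gamma*lam)^l * delta_{t+l}; this is not the buffer's exact code:

import numpy as np

def gae_advantages(rewards, values, last_val, gamma=0.99, lam=0.97):
    """Generalized Advantage Estimation over one trajectory.

    rewards: shape [T]; values: shape [T] (V(s_0..s_{T-1}));
    last_val: bootstrap value for the state after the trajectory
    (0 if the episode actually terminated).
    """
    values = np.append(values, last_val)
    deltas = rewards + gamma * values[1:] - values[:-1]
    adv = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    return adv

# Example: a 3-step trajectory that terminates (last_val = 0).
adv = gae_advantages(rewards=np.array([1.0, 1.0, 1.0]),
                     values=np.array([0.5, 0.5, 0.5]), last_val=0.0)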
def ddpg(env_fn, env_name, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, update_after=1000, update_every=50, act_noise=0.1, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Deep Deterministic Policy Gradient (DDPG) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, and a ``q`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q`` should accept a batch of observations and a batch of actions as inputs. When called, these should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``pi`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``q`` (batch,) | Tensor containing the current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to DDPG. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! 
act_limit = env.action_space.high[0] # Create actor-critic module and target networks ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) ac_targ = deepcopy(ac) # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in ac_targ.parameters(): p.requires_grad = False # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.q]) logger.log('\nNumber of parameters: \t pi: %d, \t q: %d\n' % var_counts) # Set up function for computing DDPG Q-loss def compute_loss_q(data): o, a, r, o2, d = data['obs'], data['act'], data['rew'], data[ 'obs2'], data['done'] q = ac.q(o, a) # Bellman backup for Q function with torch.no_grad(): q_pi_targ = ac_targ.q(o2, ac_targ.pi(o2)) backup = r + gamma * (1 - d) * q_pi_targ # MSE loss against Bellman backup loss_q = ((q - backup)**2).mean() # Useful info for logging loss_info = dict(QVals=q.detach().numpy()) return loss_q, loss_info # Set up function for computing DDPG pi loss def compute_loss_pi(data): o = data['obs'] q_pi = ac.q(o, ac.pi(o)) return -q_pi.mean() # Set up optimizers for policy and q-function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) q_optimizer = Adam(ac.q.parameters(), lr=q_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(data): # First run one gradient descent step for Q. q_optimizer.zero_grad() loss_q, loss_info = compute_loss_q(data) loss_q.backward() q_optimizer.step() # Freeze Q-network so you don't waste computational effort # computing gradients for it during the policy learning step. for p in ac.q.parameters(): p.requires_grad = False # Next run one gradient descent step for pi. pi_optimizer.zero_grad() loss_pi = compute_loss_pi(data) loss_pi.backward() pi_optimizer.step() # Unfreeze Q-network so you can optimize it at next DDPG step. for p in ac.q.parameters(): p.requires_grad = True # Record things logger.store(LossQ=loss_q.item(), LossPi=loss_pi.item(), **loss_info) # Finally, update target networks by polyak averaging. with torch.no_grad(): for p, p_targ in zip(ac.parameters(), ac_targ.parameters()): # NB: We use an in-place operations "mul_", "add_" to update target # params, as opposed to "mul" and "add", which would make new tensors. p_targ.data.mul_(polyak) p_targ.data.add_((1 - polyak) * p.data) def get_action(o, noise_scale): a = ac.act(torch.as_tensor(o, dtype=torch.float32)) a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) episode_rewards.append(ep_ret) # Prepare for interaction with environment total_steps = steps_per_epoch * epochs start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 rewards_log = [] episode_rewards = deque(maxlen=10) # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). 
if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for _ in range(update_every): batch = replay_buffer.sample_batch(batch_size) update(data=batch) # End of epoch handling if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model # if (epoch % save_freq == 0) or (epoch == epochs): # logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() rewards_log.append(np.mean(episode_rewards)) # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('QVals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() rewards_log = np.array(rewards_log) save_path = '../../log/ddpg/' + env_name + '/' + str(seed) + '.npy' np.save(save_path, rewards_log)
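During training, get_action above perturbs the deterministic policy output with Gaussian noise and clips back into the action bound. The same clip-after-noise pattern in isolation (illustrative names):

# Sketch of noisy deterministic action selection used at training time:
# a = clip(pi(o) + sigma * N(0, 1), -act_limit, act_limit).
import numpy as np

def noisy_action(pi_action, act_limit, noise_scale, rng=np.random):
    a = pi_action + noise_scale * rng.randn(*np.shape(pi_action))
    return np.clip(a, -act_limit, act_limit)

# At evaluation time noise_scale is 0, recovering the deterministic policy.
a = noisy_action(np.array([0.9, -0.2]), act_limit=1.0, noise_scale=0.1)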
def __init__(self, env_name, port=2000, gpu=0, train_step=10000, evaluation_step=3000, max_ep_len=300, polyak=0.995, start_steps=200, batch_size=100, replay_size=50000, iteration=200, gamma=0.99, act_noise=0.1, target_noise=0.2, noise_clip=0.5, pi_lr=1e-4, q_lr=1e-3, policy_delay=2, logger_kwargs=dict()): self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) self.iteration = iteration self.train_step = train_step self.evaluation_step = evaluation_step self.env = CarlaEnv(early_termination_enabled=True, run_offscreen=True, port=port, gpu=gpu, discrete_control=False) self.obs_dim = self.env.observation_space.shape self.act_dim = self.env.action_space.shape[0] self.start_steps = start_steps self.cur_train_step = 0 self.cur_tensorboard_step = 0 self.batch_size = batch_size self.max_ep_len = max_ep_len self.act_limit = self.env.action_space.high[0] self.act_noise = act_noise self.target_noise = target_noise self.noise_clip = noise_clip self.policy_delay = policy_delay self.polyak = polyak self.gamma = gamma self.opti_q = tf.keras.optimizers.Adam(q_lr) self.opti_pi = tf.keras.optimizers.Adam(pi_lr) if debug_mode: self.summary = tf.summary.create_file_writer( os.path.join(self.logger.output_dir, "logs")) self.actor_critic = core.ActorCritic(self.act_dim, self.act_limit) self.target_actor_critic = core.ActorCritic(self.act_dim, self.act_limit) self.replay_buffer = ReplayBuffer(replay_size) self.loadpath = os.path.join( DEFAULT_DATA_DIR, "saver_0.45_0.45_0.05_0.1_tfaug_shuffle_first") actor = core.ActorCnn() load_check = tf.train.Checkpoint(model=actor) load_check.restore(os.path.join(self.loadpath, "model.ckpt-200")) # with tf.GradientTape() as tape: # x = tf.random.uniform(minval=0, maxval=1, shape=self.obs_dim) # x = tf.expand_dims(x, axis=0) # a = tf.random.uniform(minval=0, maxval=1, shape=[self.act_dim]) # a = tf.expand_dims(a, axis=0) # self.actor_critic([x,a]) # self.actor_critic.choose_action(x) # self.target_actor_critic([x,a]) # self.target_actor_critic.choose_action(x) with tf.GradientTape() as tape: img = tf.random.uniform(minval=0, maxval=1, shape=self.obs_dim) img = tf.expand_dims(img, axis=0) speed = tf.random.uniform(minval=0, maxval=1, shape=(1, )) speed = tf.expand_dims(speed, axis=0) self.actor_critic.actor([img, speed]) self.target_actor_critic.actor([img, speed]) actor([img, speed]) for old_var, var in zip(actor.variables, self.actor_critic.variables): var.assign(old_var) var = self.actor_critic.actor.trainable_variables old_var = actor.trainable_variables self.target_init(self.target_actor_critic, self.actor_critic) self.savepath = os.path.join(self.logger.output_dir, "saver") checkpoint = tf.train.Checkpoint(model=self.actor_critic, target_model=self.target_actor_critic) self.manager = tf.train.CheckpointManager(checkpoint, directory=self.savepath, max_to_keep=20, checkpoint_name="model.ckpt")
class Td3: def __init__(self, env_name, port=2000, gpu=0, train_step=10000, evaluation_step=3000, max_ep_len=300, polyak=0.995, start_steps=200, batch_size=100, replay_size=50000, iteration=200, gamma=0.99, act_noise=0.1, target_noise=0.2, noise_clip=0.5, pi_lr=1e-4, q_lr=1e-3, policy_delay=2, logger_kwargs=dict()): self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) self.iteration = iteration self.train_step = train_step self.evaluation_step = evaluation_step self.env = CarlaEnv(early_termination_enabled=True, run_offscreen=True, port=port, gpu=gpu, discrete_control=False) self.obs_dim = self.env.observation_space.shape self.act_dim = self.env.action_space.shape[0] self.start_steps = start_steps self.cur_train_step = 0 self.cur_tensorboard_step = 0 self.batch_size = batch_size self.max_ep_len = max_ep_len self.act_limit = self.env.action_space.high[0] self.act_noise = act_noise self.target_noise = target_noise self.noise_clip = noise_clip self.policy_delay = policy_delay self.polyak = polyak self.gamma = gamma self.opti_q = tf.keras.optimizers.Adam(q_lr) self.opti_pi = tf.keras.optimizers.Adam(pi_lr) if debug_mode: self.summary = tf.summary.create_file_writer( os.path.join(self.logger.output_dir, "logs")) self.actor_critic = core.ActorCritic(self.act_dim, self.act_limit) self.target_actor_critic = core.ActorCritic(self.act_dim, self.act_limit) self.replay_buffer = ReplayBuffer(replay_size) self.loadpath = os.path.join( DEFAULT_DATA_DIR, "saver_0.45_0.45_0.05_0.1_tfaug_shuffle_first") actor = core.ActorCnn() load_check = tf.train.Checkpoint(model=actor) load_check.restore(os.path.join(self.loadpath, "model.ckpt-200")) # with tf.GradientTape() as tape: # x = tf.random.uniform(minval=0, maxval=1, shape=self.obs_dim) # x = tf.expand_dims(x, axis=0) # a = tf.random.uniform(minval=0, maxval=1, shape=[self.act_dim]) # a = tf.expand_dims(a, axis=0) # self.actor_critic([x,a]) # self.actor_critic.choose_action(x) # self.target_actor_critic([x,a]) # self.target_actor_critic.choose_action(x) with tf.GradientTape() as tape: img = tf.random.uniform(minval=0, maxval=1, shape=self.obs_dim) img = tf.expand_dims(img, axis=0) speed = tf.random.uniform(minval=0, maxval=1, shape=(1, )) speed = tf.expand_dims(speed, axis=0) self.actor_critic.actor([img, speed]) self.target_actor_critic.actor([img, speed]) actor([img, speed]) for old_var, var in zip(actor.variables, self.actor_critic.variables): var.assign(old_var) var = self.actor_critic.actor.trainable_variables old_var = actor.trainable_variables self.target_init(self.target_actor_critic, self.actor_critic) self.savepath = os.path.join(self.logger.output_dir, "saver") checkpoint = tf.train.Checkpoint(model=self.actor_critic, target_model=self.target_actor_critic) self.manager = tf.train.CheckpointManager(checkpoint, directory=self.savepath, max_to_keep=20, checkpoint_name="model.ckpt") def get_action(self, o, noise_scale, eval_mode=False): img = o["img"].astype(np.float32) / 255.0 speed = np.array([o["speed"]]) direction = o["direction"] a_list, z = self.actor_critic.select_action( [img[np.newaxis, :], speed[np.newaxis, :]]) a = tf.squeeze(a_list[direction], axis=0) # [act_dim] # print("----ori:" + str(a)) if not eval_mode: a += noise_scale * np.random.randn(self.act_dim) return np.clip(a, -self.act_limit, self.act_limit) def target_init(self, target_net, net): for target_params, params in zip(target_net.trainable_variables, net.trainable_variables): target_params.assign(params) def target_update(self, target_net, net): for 
target_params, params in zip(target_net.trainable_variables, net.trainable_variables): target_params.assign(self.polyak * target_params + (1 - self.polyak) * params) def train_q(self, batch): with tf.GradientTape() as tape: img1 = batch['obs1']["img"] speed1 = batch['obs1']["speed"] direction1 = batch['obs1']["direction"] direction1 = tf.stack([tf.range(self.batch_size), direction1], axis=1) # [None, 2] img2 = batch["obs2"]["img"] speed2 = batch["obs2"]["speed"] direction2 = batch["obs2"]["direction"] direction2 = tf.stack([tf.range(self.batch_size), direction2], axis=1) # [None, 2] q1_list, q2_list = self.actor_critic([img1, speed1, batch["acts"]]) q1_list = tf.stack(q1_list, axis=1) # [None, 4, 1] q2_list = tf.stack(q2_list, axis=1) # [None, 4, 1] q1 = tf.gather_nd(q1_list, direction1) # [None, 1] q2 = tf.gather_nd(q2_list, direction1) pi_targ_list, z = self.target_actor_critic.select_action( [img2, speed2]) pi_targ_list = tf.stack(pi_targ_list[0:4], axis=1) # [None, 4, 3] pi_targ = tf.gather_nd(pi_targ_list, direction2) # [None, 3] epsilon = tf.random.normal(tf.shape(pi_targ), stddev=self.target_noise) epsilon = tf.clip_by_value(epsilon, -self.noise_clip, self.noise_clip) a2 = pi_targ + epsilon a2 = tf.clip_by_value(a2, -self.act_limit, self.act_limit) q1_targ_list, q2_targ_list = self.target_actor_critic( [img2, speed2, a2]) q1_targ_list = tf.stack(q1_targ_list, axis=1) # [None, 4, 1] q2_targ_list = tf.stack(q2_targ_list, axis=1) # [None, 4, 1] q1_targ = tf.gather_nd(q1_targ_list, direction2) # [None, 1] q2_targ = tf.gather_nd(q2_targ_list, direction2) min_q_targ = tf.minimum(q1_targ, q2_targ) backup = batch['rews'] + self.gamma * (1 - batch['done']) * min_q_targ q1_loss = tf.reduce_mean((q1 - backup)**2) q2_loss = tf.reduce_mean((q2 - backup)**2) q_loss = q1_loss + q2_loss if debug_mode and self.cur_tensorboard_step % 1 == 0: tensorboard_step = int(self.cur_tensorboard_step / 1) with self.summary.as_default(): tf.summary.scalar("loss_q1", q1_loss, tensorboard_step) tf.summary.scalar("loss_q2", q2_loss, tensorboard_step) tf.summary.scalar("loss_q", q_loss, tensorboard_step) tf.summary.histogram("q1", q1, tensorboard_step) tf.summary.histogram("q2", q2, tensorboard_step) tf.summary.histogram("pi_targ", pi_targ, tensorboard_step) tf.summary.histogram("pi_a2", a2, tensorboard_step) train_vars = self.actor_critic.q1.trainable_variables + \ self.actor_critic.q2.trainable_variables + \ self.actor_critic.actor.emd.trainable_variables # train_vars = self.actor_critic.actor.emd.trainable_variables q_gradient = tape.gradient(q_loss, train_vars) self.opti_q.apply_gradients(zip(q_gradient, train_vars)) def train_p(self, batch): with tf.GradientTape() as tape: img1 = batch['obs1']["img"] speed1 = batch['obs1']["speed"] direction1 = batch['obs1']["direction"] direction1 = tf.stack([tf.range(self.batch_size), direction1], axis=1) # [None, 2] pi_list, z = self.actor_critic.select_action([img1, speed1]) pi_list = tf.stack(pi_list[0:4], axis=1) # [None, 4, 3] pi = tf.gather_nd(pi_list, direction1) # [None, 3] q1_pi_list = self.actor_critic.work_q1(z, pi) # q1_pi_list, _ = self.actor_critic([img1, speed1, pi]) q1_pi_list = tf.stack(q1_pi_list, axis=1) q1_pi = tf.gather_nd(q1_pi_list, direction1) pi_loss = -tf.reduce_mean(q1_pi) train_vars_pi = self.actor_critic.actor.trainable_variables pi_gradient = tape.gradient(pi_loss, train_vars_pi) train_pi = [ var for (var, gra) in zip(train_vars_pi, pi_gradient) if gra is not None ] pi_gra = [ gra for (var, gra) in zip(train_vars_pi, pi_gradient) if gra is not 
None ] self.opti_pi.apply_gradients(zip(pi_gra, train_pi)) self.target_update(self.target_actor_critic, self.actor_critic) if debug_mode and self.cur_tensorboard_step % 1 == 0: tensorboard_step = int(self.cur_tensorboard_step / 1) with self.summary.as_default(): tf.summary.histogram("pi", pi, tensorboard_step) tf.summary.histogram("q1_pi", q1_pi, tensorboard_step) tf.summary.scalar("loss_pi", pi_loss, tensorboard_step) def run_one_phrase(self, min_step, eval_mode=False): step = 0 episode = 0 reward = 0. while step < min_step: done = False obs = self.env.reset() step_episode = 0 reward_episode = 0 while not done: if self.cur_train_step > self.start_steps or eval_mode or True: a = self.get_action(obs, self.act_noise, eval_mode) else: a = self.env.action_space.sample() # print(a) obs_, r, done, _ = self.env.step(a) step += 1 step_episode += 1 reward += r reward_episode += r if not eval_mode: self.cur_train_step += 1 self.replay_buffer.add(obs, a, obs_, [r], [done]) if step_episode >= self.max_ep_len: break obs = obs_ episode += 1 if self.cur_train_step > self.start_steps and not eval_mode: for j in range(step_episode): batch = self.replay_buffer.sample(self.batch_size) self.cur_tensorboard_step += 1 self.train_q(batch) if j % self.policy_delay == 0: self.train_p(batch) if episode % 20 == 0 and not eval_mode: self.manager.save() print("ep:", episode, "step:", step_episode, "r:", reward_episode) if not eval_mode: self.logger.store(step=step_episode, reward=reward_episode) else: self.logger.store(step_test=step_episode, reward_test=reward_episode) return reward, episode def train_test(self): for i in range(self.iteration): print("iter:", i + 1) self.logger.store(iter=i + 1) reward, episode = self.run_one_phrase(self.train_step) # print("reward:", reward/episode, "episode:", episode) # tf.logging.info("reward: %.2f, episode: %.2f", reward/episode, episode) reward, episode = self.run_one_phrase(self.evaluation_step, True) # print("reward:", reward / episode, "episode:", episode) # tf.logging.info("reward_test: %.2f, episode_test: %.2f", reward/episode, episode) self.logger.log_tabular("reward", with_min_and_max=True) self.logger.log_tabular("step", with_min_and_max=True) self.logger.log_tabular("reward_test", with_min_and_max=True) self.logger.log_tabular("step_test", with_min_and_max=True) # self.logger.log_tabular('Q1Vals', with_min_and_max=True) # self.logger.log_tabular('Q2Vals', with_min_and_max=True) # self.logger.log_tabular('LossPi', average_only=True) # self.logger.log_tabular('LossQ', average_only=True) self.logger.dump_tabular()
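train_q above applies TD3's target policy smoothing: clipped Gaussian noise is added to the target policy's action before the target critics are evaluated. The same step as a standalone numpy sketch (names are illustrative):

# Sketch of TD3 target policy smoothing: perturb the target action with clipped
# noise, then clip the result back into the valid action range.
import numpy as np

def smoothed_target_action(pi_targ, act_limit, target_noise=0.2, noise_clip=0.5,
                           rng=np.random):
    eps = np.clip(target_noise * rng.randn(*pi_targ.shape), -noise_clip, noise_clip)
    return np.clip(pi_targ + eps, -act_limit, act_limit)

a2 = smoothed_target_action(np.array([[0.8, -0.9, 0.1]]), act_limit=1.0)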
def policyg(env_fn, actor_critic=ActorCritic, ac_kwargs=dict(), seed=0, episodes_per_epoch=40, epochs=500, gamma=0.99, lam=0.97, pi_lr=3e-4, vf_lr=1e-3, train_v_iters=80, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape ac_kwargs['action_space'] = env.action_space # Models ac = actor_critic(input_dim=obs_dim[0], **ac_kwargs) # Buffers local_episodes_per_epoch = int(episodes_per_epoch / num_procs()) buff = BufferA(obs_dim[0], act_dim[0], local_episodes_per_epoch, max_ep_len) # Count variables var_counts = tuple( count_vars(module) for module in [ac.policy, ac.value_f]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Optimizers train_pi = torch.optim.Adam(ac.policy.parameters(), lr=pi_lr) train_v = torch.optim.Adam(ac.value_f.parameters(), lr=vf_lr) # Parameters Sync sync_all_params(ac.parameters()) def update(e): obs, act, adv, ret, lgp_old = [ torch.Tensor(x) for x in buff.retrieve_all() ] # Policy _, lgp, _ = ac.policy(obs, act) entropy = (-lgp).mean() # Policy loss # policy gradient term + entropy term pi_loss = -(lgp * adv).mean() # Train policy train_pi.zero_grad() pi_loss.backward() average_gradients(train_pi.param_groups) train_pi.step() # Value function v = ac.value_f(obs) v_l_old = F.mse_loss(v, ret) for _ in range(train_v_iters): v = ac.value_f(obs) v_loss = F.mse_loss(v, ret) # Value function train train_v.zero_grad() v_loss.backward() average_gradients(train_v.param_groups) train_v.step() # Log the changes _, lgp, _, v = ac(obs, act) entropy_new = (-lgp).mean() pi_loss_new = -(lgp * adv).mean() v_loss_new = F.mse_loss(v, ret) kl = (lgp_old - lgp).mean() logger.store(LossPi=pi_loss, LossV=v_l_old, DeltaLossPi=(pi_loss_new - pi_loss), DeltaLossV=(v_loss_new - v_l_old), Entropy=entropy, KL=kl) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 total_t = 0 for epoch in range(epochs): ac.eval() # Policy rollout for _ in range(local_episodes_per_epoch): for _ in range(max_ep_len): obs = torch.Tensor(o.reshape(1, -1)) a, _, lopg_t, v_t = ac(obs) buff.store(o, a.detach().numpy(), r, v_t.item(), lopg_t.detach().numpy()) logger.store(VVals=v_t) o, r, d, _ = env.step(a.detach().numpy()[0]) ep_ret += r ep_len += 1 total_t += 1 terminal = d or (ep_len == max_ep_len) if terminal: buff.end_episode() logger.store(EpRet=ep_ret, EpLen=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 if (epoch % save_freq == 0) or (epoch == epochs - 1): logger._torch_save(ac, fname="expert_torch_save.pt") # Update ac.train() update(epoch) # Log logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', total_t) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
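policyg trains with the score-function loss -(logp * adv).mean(); the returns behind adv come from BufferA, which is not shown. As a rough illustration only (the actual buffer may use GAE or a value baseline), the simplest discounted reward-to-go computation looks like this:

# Sketch of discounted reward-to-go: R_t = r_t + gamma * R_{t+1}.
import numpy as np

def discounted_returns(rewards, gamma=0.99):
    returns = np.zeros_like(rewards, dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns

# Example: three rewards of 1 give [1 + 0.99 + 0.99**2, 1 + 0.99, 1].
print(discounted_returns(np.array([1.0, 1.0, 1.0])))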
parser.add_argument('--batch', default=50)
parser.add_argument('--norm_state', default=True)
parser.add_argument('--norm_rewards', default=True)
parser.add_argument('--is_clip_v', default=True)
parser.add_argument('--max_grad_norm', default=-1, type=float)
parser.add_argument('--anneal_lr', default=False)
parser.add_argument('--debug', default=False)
parser.add_argument('--log_every', default=10)
args = parser.parse_args()

device = torch.device("cuda:" + str(args.gpu) if torch.cuda.is_available() else "cpu")

from utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)
logger = EpochLogger(**logger_kwargs)
writer = SummaryWriter(os.path.join(logger.output_dir, "logs"))

env = gym.make(args.env)
if args.env_num > 1:
    env = [Env(args.env, norm_state=args.norm_state, norm_rewards=args.norm_rewards)
           for _ in range(args.env_num)]
    env = SubVectorEnv(env)
# env = CarRacing()
state_dim = env.observation_space.shape[0]
act_dim = env.action_space.shape
action_max = env.action_space.high[0]
ppo = core.PPO(state_dim,
parser.add_argument('--is_gae', action="store_true")
parser.add_argument('--last_v', action="store_true")
parser.add_argument('--max_grad_norm', default=-1, type=float)
parser.add_argument('--anneal_lr', action="store_true")
parser.add_argument('--debug', action="store_false")
parser.add_argument('--log_every', default=10, type=int)
parser.add_argument('--target_kl', default=0.03, type=float)
parser.add_argument('--test_epoch', default=10, type=int)
args = parser.parse_args()

device = torch.device("cuda:" + str(args.gpu) if torch.cuda.is_available() else "cpu")

from utils.run_utils import setup_logger_kwargs
logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed)
logger = EpochLogger(**logger_kwargs)
writer = SummaryWriter(os.path.join(logger.output_dir, "logs"))
with open(os.path.join(logger.output_dir, 'args.json'), 'w') as f:
    json.dump(vars(args), f, sort_keys=True, indent=4)

env = make_atari(args.env)
env = gym.wrappers.RecordEpisodeStatistics(env)
env = wrap_deepmind(env, frame_stack=True)
env = ImageToPyTorch(env)
# test_env = make_atari(args.env)
# test_env = gym.wrappers.RecordEpisodeStatistics(test_env)
# test_env = wrap_deepmind(test_env, frame_stack=True)
# test_env = ImageToPyTorch(test_env)

torch.manual_seed(args.seed)
np.random.seed(args.seed)
env.seed(args.seed)
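The script above constructs core.PPO and exposes a target_kl argument; the heart of PPO is the clipped surrogate objective. A minimal PyTorch sketch of that loss and the approximate KL it is usually compared against (this is the standard formulation, not necessarily core.PPO's exact code):

# Sketch of the PPO clipped surrogate objective:
# L = -E[min(ratio * A, clip(ratio, 1 - eps, 1 + eps) * A)].
import torch

def ppo_clip_loss(logp_new, logp_old, adv, clip_ratio=0.2):
    ratio = torch.exp(logp_new - logp_old)
    clipped = torch.clamp(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio) * adv
    loss = -(torch.min(ratio * adv, clipped)).mean()
    # Approximate KL, often checked against a target_kl for early stopping.
    approx_kl = (logp_old - logp_new).mean()
    return loss, approx_kl

loss, kl = ppo_clip_loss(torch.tensor([-0.9, -1.2]), torch.tensor([-1.0, -1.0]),
                         torch.tensor([0.5, -0.3]))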
def __init__(self, env_name, port=2000, gpu=0, batch_size=32, train_step=25000, evaluation_step=3000, max_ep_len=6000, epsilon_train=0.1, epsilon_eval=0.01, replay_size=100000, epsilon_decay_period=25000, warmup_steps=2000, iteration=200, gamma=0.99, target_update_period=800, update_period=4, logger_kwargs=dict()): self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) self.env = CarlaEnv(early_termination_enabled=True, run_offscreen=True, port=port, gpu=gpu) self.train_step = train_step self.evaluation_step = evaluation_step self.max_ep_len = max_ep_len self.epsilon_train = epsilon_train self.epsilon_eval = epsilon_eval self.batch_size = batch_size self.replay_size = replay_size self.epsilon_decay_period = epsilon_decay_period self.warmup_steps = warmup_steps self.iteration = iteration self.replay_buffer = ReplayBuffer(replay_size) self.gamma = gamma self.target_update_period = target_update_period self.update_period = update_period config = tf.ConfigProto(allow_soft_placement=True) config.gpu_options.allow_growth = True self.sess = tf.Session("", config=config) set_session(self.sess) self.build_model() self.cur_train_step = 0 self.observation_shape = (84, 84) self.state_shape = (1, ) + self.observation_shape + (4, ) self.s = np.zeros(self.state_shape) self.last_s = np.zeros(self.state_shape) if debug_mode: self.summary = tf.summary.FileWriter( os.path.join(self.logger.output_dir, "logs")) self.loss = tf.placeholder(tf.float32, shape=[]) self.q = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n]) self.q_target = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n]) self.target_q = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n]) tf.summary.scalar("loss", self.loss) tf.summary.histogram("q", self.q) # tf.summary.histogram("q_target", self.q_target) # tf.summary.histogram("target_q", self.target_q) self.merge = tf.summary.merge_all()
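The (1, 84, 84, 4) state buffer above holds the last four preprocessed frames; record_obs-style code elsewhere in the repo shifts a new frame in with np.roll. A small sketch of that rolling frame stack (illustrative helper, not the class method itself):

# Sketch of the rolling frame stack used for Atari-style inputs: keep the last
# 4 grayscale frames in a [1, 84, 84, 4] buffer and shift each new frame in.
import numpy as np

def push_frame(state, frame):
    """state: [1, H, W, 4] stack; frame: [H, W] new observation."""
    state = np.roll(state, shift=-1, axis=-1)   # drop the oldest frame
    state[0, ..., -1] = frame                   # write the newest frame
    return state

state = np.zeros((1, 84, 84, 4), dtype=np.float32)
state = push_frame(state, np.ones((84, 84), dtype=np.float32))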
def td3(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=100, replay_size=int(1e6), gamma=0.99, polyak=0.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000, update_after=1000, update_every=50, act_noise=0.1, target_noise=0.2, noise_clip=0.5, policy_delay=2, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1): """ Twin Delayed Deep Deterministic Policy Gradient (TD3) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Deterministically computes actions | from policy given states. ``q1`` (batch,) | Gives one estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q2`` (batch,) | Gives another estimate of Q* for | states in ``x_ph`` and actions in | ``a_ph``. ``q1_pi`` (batch,) | Gives the composition of ``q1`` and | ``pi`` for states in ``x_ph``: | q1(x, pi(x)). =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to TD3. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) pi_lr (float): Learning rate for policy. q_lr (float): Learning rate for Q-networks. batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. act_noise (float): Stddev for Gaussian exploration noise added to policy at training time. (At test time, no noise is added.) target_noise (float): Stddev for smoothing noise added to target policy. noise_clip (float): Limit for absolute value of target policy smoothing noise. policy_delay (int): Policy will only be updated once every policy_delay times for each update of the Q-networks. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) tf.set_random_seed(seed) np.random.seed(seed) env, test_env = env_fn(), env_fn() obs_dim = env.observation_space.shape[0] act_dim = env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! act_limit = env.action_space.high[0] # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph, x2_ph, r_ph, d_ph = core.placeholders(obs_dim, act_dim, obs_dim, None, None) # Main outputs from computation graph with tf.variable_scope('main'): pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs) # Target policy network with tf.variable_scope('target'): pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs) # Target Q networks with tf.variable_scope('target', reuse=True): # Target policy smoothing, by adding clipped noise to target actions epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise) epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip) a2 = pi_targ + epsilon a2 = tf.clip_by_value(a2, -act_limit, act_limit) # Target Q-values, using action from target policy _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs) # Experience buffer replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim, size=replay_size) # Count variables var_counts = tuple( core.count_vars(scope) for scope in ['main/pi', 'main/q1', 'main/q2', 'main']) print( '\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n' % var_counts) # Bellman backup for Q functions, using Clipped Double-Q targets min_q_targ = tf.minimum(q1_targ, q2_targ) backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ) # TD3 losses pi_loss = -tf.reduce_mean(q1_pi) q1_loss = tf.reduce_mean((q1 - backup)**2) q2_loss = tf.reduce_mean((q2 - backup)**2) q_loss = q1_loss + q2_loss # Separate train ops for pi, q pi_optimizer = tf.train.AdamOptimizer(learning_rate=pi_lr) q_optimizer = tf.train.AdamOptimizer(learning_rate=q_lr) train_pi_op = pi_optimizer.minimize(pi_loss, var_list=get_vars('main/pi')) train_q_op = q_optimizer.minimize(q_loss, var_list=get_vars('main/q')) # Polyak averaging for target variables target_update = tf.group([ tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) # Initializing targets to match main variables target_init = tf.group([ tf.assign(v_targ, v_main) for v_main, v_targ in zip(get_vars('main'), get_vars('target')) ]) sess = tf.Session() sess.run(tf.global_variables_initializer()) sess.run(target_init) # Setup model saving logger.setup_tf_saver(sess, inputs={ 'x': x_ph, 'a': a_ph }, outputs={ 'pi': pi, 'q1': q1, 'q2': q2 }) def get_action(o, noise_scale): a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0] a += noise_scale * np.random.randn(act_dim) return np.clip(a, -act_limit, act_limit) def test_agent(): for j in range(num_test_episodes): o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0 while not (d or (ep_len == max_ep_len)): # Take deterministic actions at test time (noise_scale=0) o, r, d, _ = test_env.step(get_action(o, 0)) ep_ret += r ep_len += 1 logger.store(TestEpRet=ep_ret, TestEpLen=ep_len) start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 total_steps = steps_per_epoch * epochs # Main loop: collect experience in env and update/log each epoch for t in range(total_steps): # Until start_steps have elapsed, randomly sample actions # from a uniform 
distribution for better exploration. Afterwards, # use the learned policy (with some noise, via act_noise). if t > start_steps: a = get_action(o, act_noise) else: a = env.action_space.sample() # Step the env o2, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # Ignore the "done" signal if it comes from hitting the time # horizon (that is, when it's an artificial terminal signal # that isn't based on the agent's state) d = False if ep_len == max_ep_len else d # Store experience to replay buffer replay_buffer.store(o, a, r, o2, d) # Super critical, easy to overlook step: make sure to update # most recent observation! o = o2 # End of trajectory handling if d or (ep_len == max_ep_len): logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Update handling if t >= update_after and t % update_every == 0: for j in range(update_every): batch = replay_buffer.sample_batch(batch_size) feed_dict = { x_ph: batch['obs1'], x2_ph: batch['obs2'], a_ph: batch['acts'], r_ph: batch['rews'], d_ph: batch['done'] } q_step_ops = [q_loss, q1, q2, train_q_op] outs = sess.run(q_step_ops, feed_dict) logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2]) if j % policy_delay == 0: # Delayed policy update outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict) logger.store(LossPi=outs[0]) # End of epoch wrap-up if (t + 1) % steps_per_epoch == 0: epoch = (t + 1) // steps_per_epoch # Save model if (epoch % save_freq == 0) or (epoch == epochs): logger.save_state({'env': env}, None) # Test the performance of the deterministic version of the agent. test_agent() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('TestEpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('TestEpLen', average_only=True) logger.log_tabular('TotalEnvInteracts', t) logger.log_tabular('Q1Vals', with_min_and_max=True) logger.log_tabular('Q2Vals', with_min_and_max=True) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossQ', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
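# A minimal NumPy sketch (not part of the TF graph above, illustrative values only) of the two
# TD3-specific pieces built in the graph: target policy smoothing (clipped Gaussian noise on the
# target action) and the clipped double-Q Bellman backup r + gamma * (1 - d) * min(Q1', Q2').
import numpy as np

def td3_backup(r, d, q1_targ, q2_targ, gamma=0.99):
    """Clipped double-Q Bellman backup."""
    min_q_targ = np.minimum(q1_targ, q2_targ)
    return r + gamma * (1 - d) * min_q_targ

def smoothed_target_action(pi_targ, act_limit, target_noise=0.2, noise_clip=0.5):
    """Target policy smoothing: perturb the target action with clipped noise, then clip to bounds."""
    epsilon = np.clip(np.random.normal(scale=target_noise, size=pi_targ.shape),
                      -noise_clip, noise_clip)
    return np.clip(pi_targ + epsilon, -act_limit, act_limit)

# Example with a batch of 4 transitions (made-up numbers):
r = np.array([1.0, 0.5, 0.0, 1.0])
d = np.array([0.0, 0.0, 1.0, 0.0])
q1_targ = np.array([10.0, 8.0, 3.0, 5.0])
q2_targ = np.array([9.0, 8.5, 2.0, 6.0])
print(td3_backup(r, d, q1_targ, q2_targ))  # -> [9.91 8.42 0.   5.95]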
def gail(env_fn, actor_critic=ActorCritic, ac_kwargs=dict(), disc=Discriminator, dc_kwargs=dict(), seed=0, episodes_per_epoch=40, epochs=500, gamma=0.99, lam=0.97, pi_lr=3e-3, vf_lr=3e-3, dc_lr=5e-4, train_v_iters=80, train_dc_iters=80, max_ep_len=1000, logger_kwargs=dict(), save_freq=10): l_lam = 0 # balance the two loss terms logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape ac_kwargs['action_space'] = env.action_space # Models ac = actor_critic(input_dim=obs_dim[0], **ac_kwargs) disc = disc(input_dim=obs_dim[0], **dc_kwargs) # TODO: Load expert policy here expert = actor_critic(input_dim=obs_dim[0], **ac_kwargs) expert_name = "expert_torch_save.pt" expert = torch.load(osp.join(logger_kwargs['output_dir'], expert_name)) # Buffers local_episodes_per_epoch = int(episodes_per_epoch / num_procs()) buff_s = BufferS(obs_dim[0], act_dim[0], local_episodes_per_epoch, max_ep_len) buff_t = BufferT(obs_dim[0], act_dim[0], local_episodes_per_epoch, max_ep_len) # Count variables var_counts = tuple( count_vars(module) for module in [ac.policy, ac.value_f, disc.policy]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d, \t d: %d\n' % var_counts) # Optimizers train_pi = torch.optim.Adam(ac.policy.parameters(), lr=pi_lr) train_v = torch.optim.Adam(ac.value_f.parameters(), lr=vf_lr) train_dc = torch.optim.Adam(disc.policy.parameters(), lr=dc_lr) # Parameters Sync sync_all_params(ac.parameters()) sync_all_params(disc.parameters()) def update(e): obs_s, act, adv, ret, lgp_old = [ torch.Tensor(x) for x in buff_s.retrieve_all() ] obs_t, _ = [torch.Tensor(x) for x in buff_t.retrieve_all()] # Policy _, lgp, _ = ac.policy(obs_s, act) entropy = (-lgp).mean() # Policy loss # policy gradient term + entropy term pi_loss = -(lgp * adv).mean() - l_lam * entropy # Train policy if e > 10: train_pi.zero_grad() pi_loss.backward() average_gradients(train_pi.param_groups) train_pi.step() # Value function v = ac.value_f(obs_s) v_l_old = F.mse_loss(v, ret) for _ in range(train_v_iters): v = ac.value_f(obs_s) v_loss = F.mse_loss(v, ret) # Value function train train_v.zero_grad() v_loss.backward() average_gradients(train_v.param_groups) train_v.step() # Discriminator gt1 = torch.ones(obs_s.size()[0], dtype=torch.int) gt2 = torch.zeros(obs_t.size()[0], dtype=torch.int) _, lgp_s, _ = disc(obs_s, gt=gt1) _, lgp_t, _ = disc(obs_t, gt=gt2) dc_loss_old = -lgp_s.mean() - lgp_t.mean() for _ in range(train_dc_iters): _, lgp_s, _ = disc(obs_s, gt=gt1) _, lgp_t, _ = disc(obs_t, gt=gt2) dc_loss = -lgp_s.mean() - lgp_t.mean() # Discriminator train train_dc.zero_grad() dc_loss.backward() average_gradients(train_dc.param_groups) train_dc.step() _, lgp_s, _ = disc(obs_s, gt=gt1) _, lgp_t, _ = disc(obs_t, gt=gt2) dc_loss_new = -lgp_s.mean() - lgp_t.mean() # Log the changes _, lgp, _, v = ac(obs_s, act) entropy_new = (-lgp).mean() pi_loss_new = -(lgp * adv).mean() - l_lam * entropy v_loss_new = F.mse_loss(v, ret) kl = (lgp_old - lgp).mean() logger.store(LossPi=pi_loss, LossV=v_l_old, LossDC=dc_loss_old, DeltaLossPi=(pi_loss_new - pi_loss), DeltaLossV=(v_loss_new - v_l_old), DeltaLossDC=(dc_loss_new - dc_loss_old), DeltaEnt=(entropy_new - entropy), Entropy=entropy, KL=kl) start_time = time.time() o, r, sdr, d, ep_ret, ep_sdr, ep_len = env.reset(), 0, 0, False, 0, 0, 0 total_t = 0 ep_len_t = 0 for epoch in range(epochs): ac.eval() disc.eval() # We recognize the probability term 
of index [0] corresponds to the teacher's policy # Student's policy rollout for _ in range(local_episodes_per_epoch): for _ in range(max_ep_len): obs = torch.Tensor(o.reshape(1, -1)) a, _, lopg_t, v_t = ac(obs) buff_s.store(o, a.detach().numpy(), r, sdr, v_t.item(), lopg_t.detach().numpy()) logger.store(VVals=v_t) o, r, d, _ = env.step(a.detach().numpy()[0]) _, sdr, _ = disc(torch.Tensor(o.reshape(1, -1)), gt=torch.Tensor([0])) if sdr < -4: # Truncate rewards sdr = -4 ep_ret += r ep_sdr += sdr ep_len += 1 total_t += 1 terminal = d or (ep_len == max_ep_len) if terminal: buff_s.end_episode() logger.store(EpRetS=ep_ret, EpLenS=ep_len, EpSdrS=ep_sdr) o, r, sdr, d, ep_ret, ep_sdr, ep_len = env.reset(), 0, 0, False, 0, 0, 0 # Teacher's policy rollout for _ in range(local_episodes_per_epoch): for _ in range(max_ep_len): obs = torch.Tensor(o.reshape(1, -1)) a, _, _, _ = expert(obs) buff_t.store(o, a.detach().numpy(), r) o, r, d, _ = env.step(a.detach().numpy()[0]) ep_ret += r ep_len += 1 total_t += 1 terminal = d or (ep_len == max_ep_len) if terminal: buff_t.end_episode() logger.store(EpRetT=ep_ret, EpLenT=ep_len) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, [ac, disc], None) # Update ac.train() disc.train() update(epoch) # Log logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRetS', with_min_and_max=True) logger.log_tabular('EpSdrS', with_min_and_max=True) logger.log_tabular('EpLenS', average_only=True) logger.log_tabular('EpRetT', with_min_and_max=True) logger.log_tabular('EpLenT', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', total_t) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('LossDC', average_only=True) logger.log_tabular('DeltaLossDC', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('DeltaEnt', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
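# A minimal PyTorch sketch of the GAIL idea behind the rollout above (my own illustration,
# not the Discriminator class used in this file, which works on log-probabilities given a
# ground-truth label): a binary classifier scores observations, and the log-probability of
# the "expert" label is used as a surrogate reward for the student, clipped from below just
# like the `sdr < -4` truncation above.
import torch
import torch.nn as nn
import torch.nn.functional as F

class TinyDiscriminator(nn.Module):
    def __init__(self, obs_dim, hidden=64):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(obs_dim, hidden), nn.Tanh(),
                                 nn.Linear(hidden, 1))

    def forward(self, obs):
        # Higher logit means "looks like expert data".
        return self.net(obs).squeeze(-1)

def surrogate_reward(disc_net, obs, floor=-4.0):
    """log D(s), truncated from below so a confident discriminator cannot drive
    the student's reward toward -inf."""
    with torch.no_grad():
        log_d = F.logsigmoid(disc_net(obs))
    return torch.clamp(log_d, min=floor)

def discriminator_loss(disc_net, obs_student, obs_expert):
    """Standard binary cross-entropy: expert samples labeled 1, student samples labeled 0."""
    logits_e = disc_net(obs_expert)
    logits_s = disc_net(obs_student)
    return (F.binary_cross_entropy_with_logits(logits_e, torch.ones_like(logits_e)) +
            F.binary_cross_entropy_with_logits(logits_s, torch.zeros_like(logits_s)))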
class Dqn: def __init__(self, env_name, port=2000, gpu=0, batch_size=32, train_step=25000, evaluation_step=3000, max_ep_len=6000, epsilon_train=0.1, epsilon_eval=0.01, replay_size=100000, epsilon_decay_period=25000, warmup_steps=2000, iteration=200, gamma=0.99, target_update_period=800, update_period=4, logger_kwargs=dict()): self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) self.env = CarlaEnv(early_termination_enabled=True, run_offscreen=True, port=port, gpu=gpu) self.train_step = train_step self.evaluation_step = evaluation_step self.max_ep_len = max_ep_len self.epsilon_train = epsilon_train self.epsilon_eval = epsilon_eval self.batch_size = batch_size self.replay_size = replay_size self.epsilon_decay_period = epsilon_decay_period self.warmup_steps = warmup_steps self.iteration = iteration self.replay_buffer = ReplayBuffer(replay_size) self.gamma = gamma self.target_update_period = target_update_period self.update_period = update_period config = tf.ConfigProto(allow_soft_placement=True) config.gpu_options.allow_growth = True self.sess = tf.Session("", config=config) set_session(self.sess) self.build_model() self.cur_train_step = 0 self.observation_shape = (84, 84) self.state_shape = (1, ) + self.observation_shape + (4, ) self.s = np.zeros(self.state_shape) self.last_s = np.zeros(self.state_shape) if debug_mode: self.summary = tf.summary.FileWriter( os.path.join(self.logger.output_dir, "logs")) self.loss = tf.placeholder(tf.float32, shape=[]) self.q = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n]) self.q_target = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n]) self.target_q = tf.placeholder(tf.float32, shape=[None, self.env.action_space.n]) tf.summary.scalar("loss", self.loss) tf.summary.histogram("q", self.q) # tf.summary.histogram("q_target", self.q_target) # tf.summary.histogram("target_q", self.target_q) self.merge = tf.summary.merge_all() def build_model(self): self.model, self.model_target = nature_dqn(self.env.action_space.n, (80, 80, 6)) self.model_target.set_weights(self.model.get_weights()) def choose_action(self, s, eval_mode=False): epsilon = self.epsilon_eval if eval_mode \ else linearly_decaying_epsilon(self.epsilon_decay_period, self.cur_train_step, self.warmup_steps, self.epsilon_train) if random.random() <= 1 - epsilon: q = self.model.predict(s[np.newaxis, :]) a = np.argmax(q, axis=1)[0] # print() else: a = self.env.action_space.sample() return a def record_obs(self, observation): self.last_s = copy.copy(self.s) self.s = np.roll(self.s, -1, axis=-1) self.s[0, ..., -1] = np.squeeze(observation) def store(self, s, a, s_, r, done): pass def run_one_phrase(self, min_step, eval_mode=False): step = 0 episode = 0 reward = 0. 
while step < min_step: done = False obs = self.env.reset() step_episode = 0 reward_episode = 0 while not done: s = np.array(obs) a = self.choose_action(s, eval_mode) obs_, r, done, _ = self.env.step(a) step += 1 step_episode += 1 reward += r reward_episode += r if not eval_mode: self.cur_train_step += 1 self.replay_buffer.add(obs, a, obs_, r, done) if self.cur_train_step > 2000: if self.cur_train_step % self.update_period == 0: # data = self.replay_buffer.sample() (s, a, s_, r, d) = self.replay_buffer.sample(self.batch_size) q_ = np.max(self.model_target.predict(s_), axis=1) q_target = r + (1 - d) * self.gamma * q_ q = self.model.predict(s) q_recoder = np.copy(q) batch_index = np.arange(self.batch_size) q[batch_index, a] = q_target result = self.model.train_on_batch(np.array(s), q) # print("result:", result) # if self.cur_train_step%1== 0: # merge = self.sess.run(self.merge, feed_dict={self.loss: result[0], self.q: q_recoder}) # self.summary.add_summary(merge, (self.cur_train_step-20000)/self.update_period/1) if self.cur_train_step % self.target_update_period == 0: self.model_target.set_weights( self.model.get_weights()) if step_episode >= self.max_ep_len: break obs = obs_ episode += 1 savepath = os.path.join(self.logger.output_dir, "saver") if not os.path.exists(savepath): os.makedirs(savepath) self.model.save( os.path.join(savepath, "model" + str(episode % 5) + ".h5")) # info = psutil.virtual_memory() # sys.stdout.write("steps: {}".format(step) + " episode_length: {}".format(step_episode) + # " return: {}".format(reward_episode) + # " memory used : {}".format(psutil.Process(os.getpid()).memory_info().rss) + # " total memory: {}\r".format(info.total)) # # sys.stdout.flush() print("ep:", episode, "step:", step, "r:", reward) if not eval_mode: self.logger.store(step=step_episode, reward=reward_episode) else: self.logger.store(step_test=step_episode, reward_test=reward_episode) return reward, episode def train_test(self): for i in range(self.iteration): print("iter:", i + 1) self.logger.store(iter=i + 1) reward, episode = self.run_one_phrase(self.train_step) # print("reward:", reward/episode, "episode:", episode) tf.logging.info("reward: %.2f, episode: %.2f", reward / episode, episode) reward, episode = self.run_one_phrase(self.evaluation_step, True) # print("reward:", reward / episode, "episode:", episode) tf.logging.info("reward_test: %.2f, episode_test: %.2f", reward / episode, episode) self.logger.log_tabular("reward", with_min_and_max=True) self.logger.log_tabular("step", with_min_and_max=True) self.logger.log_tabular("reward_test", with_min_and_max=True) self.logger.log_tabular("step_test", with_min_and_max=True) self.logger.dump_tabular()
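# Hypothetical sketch of the `linearly_decaying_epsilon` schedule referenced in
# `choose_action` above (the real helper lives elsewhere in the repo); it mirrors the
# Dopamine-style schedule: hold epsilon at 1.0 during warmup, decay linearly to the
# training epsilon over `decay_period` steps, then hold it constant.
def linearly_decaying_epsilon(decay_period, step, warmup_steps, epsilon):
    steps_left = decay_period + warmup_steps - step
    bonus = (1.0 - epsilon) * steps_left / decay_period
    bonus = min(max(bonus, 0.0), 1.0 - epsilon)
    return epsilon + bonus

# e.g. with decay_period=25000, warmup_steps=2000, epsilon=0.1:
#   step <= 2000   -> 1.0
#   step == 14500  -> 0.55
#   step >= 27000  -> 0.1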
dynamic_model.fit(use_data_buf=True, normalize=True) cost_model.fit() env.close() if __name__ == '__main__': parser = argparse.ArgumentParser() parser.add_argument('--robot', type=str, default='point', help="robot model, selected from `point` or `car`") parser.add_argument('--level', type=int, default=1, help="environment difficulty, selected from `1` or `2`, where `2` is more difficult than `1`") parser.add_argument('--epoch', type=int, default=60, help="maximum number of epochs to train") parser.add_argument('--episode', type=int, default=10, help="how many episodes of data to collect in each epoch") parser.add_argument('--render', '-r', action='store_true', help="render the environment") parser.add_argument('--test', '-t', action='store_true', help="test the performance of pretrained models without training") parser.add_argument('--seed', '-s', type=int, default=1, help="seed for Gym, PyTorch and Numpy") parser.add_argument('--dir', '-d', type=str, default='./data/', help="directory to save the logging information") parser.add_argument('--name', '-n', type=str, default='test', help="name of the experiment, used to name the folder the data is saved in") parser.add_argument('--save', action='store_true', help="save the trained dynamic model, data buffer, and cost model") parser.add_argument('--load', type=str, default=None, help="load the trained dynamic model, data buffer, and cost model from a specified directory") parser.add_argument('--ensemble', type=int, default=0, help="number of model ensembles; if greater than 0, it replaces the default ensemble number in config.yml") parser.add_argument('--optimizer', '-o', type=str, default="rce", help="the optimizer to use, selected from `rce`, `cem`, or `random`") parser.add_argument('--config', '-c', type=str, default='./config.yml', help="path to the configuration file of the models") args = parser.parse_args() logger_kwargs = setup_logger_kwargs(args.name, args.seed, args.dir) logger = EpochLogger(**logger_kwargs) config = load_config(args.config) run(logger, config, args)
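# A hedged illustration of how the `--ensemble` flag could override the ensemble size read
# from config.yml; the real `load_config`/`run` functions may organize the configuration
# differently, so the keys below ("dynamics", "n_ensembles") are assumptions made only for
# the sake of the example.
import yaml

def load_config_with_override(path, ensemble_override=0):
    with open(path) as f:
        config = yaml.safe_load(f)
    if ensemble_override > 0:
        config["dynamics"]["n_ensembles"] = ensemble_override
    return config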
def ppo(env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with a ``step`` method, an ``act`` method, a ``pi`` module, and a ``v`` module. The ``step`` method should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Numpy array of actions for each | observation. ``v`` (batch,) | Numpy array of value estimates | for the provided observations. ``logp_a`` (batch,) | Numpy array of log probs for the | actions in ``a``. =========== ================ ====================================== The ``act`` method behaves the same as ``step`` but only returns ``a``. The ``pi`` module's forward call should accept a batch of observations and optionally a batch of actions, and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` N/A | Torch Distribution object, containing | a batch of distributions describing | the policy for the provided observations. ``logp_a`` (batch,) | Optional (only returned if batch of | actions is given). Tensor containing | the log probability, according to | the policy, of the provided actions. | If actions not given, will contain | ``None``. =========== ================ ====================================== The ``v`` module's forward call should accept a batch of observations and return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``v`` (batch,) | Tensor containing the value estimates | for the provided observations. (Critical: | make sure to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. 
(Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ # Special function to avoid certain slowdowns from PyTorch + MPI combo. setup_pytorch_for_mpi() # Set up logger and save configuration logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) # Random seed seed += 10000 * proc_id() torch.manual_seed(seed) np.random.seed(seed) # Instantiate environment env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Create actor-critic module if inspect.isclass(actor_critic): ac = actor_critic(env.observation_space, env.action_space, **ac_kwargs) else: ac = actor_critic # Sync params across processes sync_params(ac) # Count variables var_counts = tuple(core.count_vars(module) for module in [ac.pi, ac.v]) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # Set up experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Set up function for computing PPO policy loss def compute_loss_pi(data): obs, act, adv, logp_old = data['obs'], data['act'], data['adv'], data[ 'logp'] # Policy loss pi, logp = ac.pi(obs, act) ratio = torch.exp(logp - logp_old) clip_adv = torch.clamp(ratio, 1 - clip_ratio, 1 + clip_ratio) * adv loss_pi = -(torch.min(ratio * adv, clip_adv)).mean() # Useful extra info approx_kl = (logp_old - logp).mean().item() ent = pi.entropy().mean().item() clipped = ratio.gt(1 + clip_ratio) | ratio.lt(1 - clip_ratio) clipfrac = torch.as_tensor(clipped, dtype=torch.float32).mean().item() pi_info = dict(kl=approx_kl, ent=ent, cf=clipfrac) return loss_pi, pi_info # Set up function for computing value loss def compute_loss_v(data): obs, ret = data['obs'], data['ret'] return ((ac.v(obs) - ret)**2).mean() # Set up optimizers for policy and value function pi_optimizer = Adam(ac.pi.parameters(), lr=pi_lr) vf_optimizer = Adam(ac.v.parameters(), lr=vf_lr) # Set up model saving logger.setup_pytorch_saver(ac) def update(): data = buf.get() pi_l_old, pi_info_old = compute_loss_pi(data) pi_l_old = pi_l_old.item() v_l_old = compute_loss_v(data).item() # Train policy with multiple steps of gradient descent for i in range(train_pi_iters): pi_optimizer.zero_grad() loss_pi, pi_info = compute_loss_pi(data) kl = mpi_avg(pi_info['kl']) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' 
% i) break loss_pi.backward() mpi_avg_grads(ac.pi) # average grads across MPI processes pi_optimizer.step() logger.store(StopIter=i) # Value function learning for i in range(train_v_iters): vf_optimizer.zero_grad() loss_v = compute_loss_v(data) loss_v.backward() mpi_avg_grads(ac.v) # average grads across MPI processes vf_optimizer.step() # Log changes from update kl, ent, cf = pi_info['kl'], pi_info_old['ent'], pi_info['cf'] logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(loss_pi.item() - pi_l_old), DeltaLossV=(loss_v.item() - v_l_old)) # Prepare for interaction with environment start_time = time.time() o, ep_ret, ep_len = env.reset(), 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v, logp = ac.step(torch.as_tensor(o, dtype=torch.float32)) next_o, r, d, _ = env.step(a) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v, logp) logger.store(VVals=v) # Update obs (critical!) o = next_o timeout = ep_len == max_ep_len terminal = d or timeout epoch_ended = t == local_steps_per_epoch - 1 if terminal or epoch_ended: if epoch_ended and not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' % ep_len, flush=True) # if trajectory didn't reach terminal state, bootstrap value target if timeout or epoch_ended: _, v, _ = ac.step(torch.as_tensor(o, dtype=torch.float32)) else: v = 0 buf.finish_path(v) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # current state logger.save_state({'env': env}, epoch) # for rendering # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular() logger.output_file.close()
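# The advantage estimates consumed by `compute_loss_pi` come from PPOBuffer's GAE-lambda
# computation in `finish_path`, which is not shown here. A minimal NumPy sketch of that
# calculation (my own illustration of the usual formulation):
#   delta_t = r_t + gamma * V(s_{t+1}) - V(s_t)
#   A_t     = sum_{l>=0} (gamma * lam)^l * delta_{t+l}
import numpy as np

def gae_advantages(rews, vals, last_val, gamma=0.99, lam=0.97):
    """rews: rewards r_0..r_{T-1}; vals: value estimates V(s_0)..V(s_{T-1});
    last_val: bootstrap value V(s_T), 0 if the episode terminated."""
    vals = np.append(vals, last_val)
    deltas = rews + gamma * vals[1:] - vals[:-1]
    adv = np.zeros_like(deltas)
    running = 0.0
    for t in reversed(range(len(deltas))):
        running = deltas[t] + gamma * lam * running
        adv[t] = running
    return adv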
parser.add_argument('--log', type=str, default="logs") parser.add_argument('--steps', default=300) parser.add_argument('--port', default=2000) parser.add_argument('--gpu', default=0) parser.add_argument('--exp_name', default="ppo_carla") parser.add_argument('--seed', default=0) parser.add_argument('--batch', default=50) args = parser.parse_args() gpus = tf.config.experimental.list_physical_devices(device_type='GPU') for gpu in gpus: tf.config.experimental.set_memory_growth(gpu, True) from utils.run_utils import setup_logger_kwargs logger_kwargs = setup_logger_kwargs(args.exp_name, args.seed) logger = EpochLogger(**logger_kwargs) # logger.save_config(locals()) env = CarlaEnv(early_termination_enabled=True, run_offscreen=False, port=args.port, gpu=args.gpu, discrete_control=False) ppo = core.PPO(3, 0.2, lr_a=args.lr_a, lr_c=args.lr_c) if debug_mode: summary = tf.summary.create_file_writer( os.path.join(logger.output_dir, "logs")) savepath = osp.join(logger.output_dir, "saver") checkpoint = tf.train.Checkpoint(model=ppo)
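# Continuing from the `checkpoint` and `savepath` defined above, a tf.train.CheckpointManager
# could be used to save and restore the PPO weights; this is a sketch of standard TF2
# checkpointing usage, not code taken from the rest of this script.
manager = tf.train.CheckpointManager(checkpoint, directory=savepath, max_to_keep=5)
if manager.latest_checkpoint:
    checkpoint.restore(manager.latest_checkpoint)  # resume from the newest save
# ...then call manager.save() periodically after update steps.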
def logger_setup(self, logger_kwargs, **kwargs): self.logger = EpochLogger(**logger_kwargs) for key, value in kwargs.items(): if key != 'env' and key != 'output_dir': self.logger.log_tabular(key, value)
def run_policy(env, get_action, ckpt_num, max_con, con, max_ep_len=100, num_episodes=100, fpath=None, render=False, record=True, video_caption_off=False): assert env is not None, \ "Environment not found!\n\n It looks like the environment wasn't saved, " + \ "and we can't run the agent in it. :( \n\n Check out the readthedocs " + \ "page on Experiment Outputs for how to handle this situation." output_dir = osp.join(osp.abspath(osp.dirname(osp.dirname(__file__))), "log/tmp/experiments/%i" % int(time.time())) logger = EpochLogger(output_dir=output_dir) o, r, d, ep_ret, ep_len, n = env.reset(), 0, False, 0, 0, 0 visual_obs = [] c_onehot = F.one_hot(torch.tensor(con), max_con).squeeze().float() while n < num_episodes: vob = render_frame(env, ep_len, ep_ret, 'AC', render, record, caption_off=video_caption_off) visual_obs.append(vob) concat_obs = torch.cat( [torch.Tensor(o.reshape(1, -1)), c_onehot.reshape(1, -1)], 1) a = get_action(concat_obs) o, r, d, _ = env.step(a[0].detach().numpy()[0]) ep_ret += r ep_len += 1 d = False if d or (ep_len == max_ep_len): vob = render_frame(env, ep_len, ep_ret, 'AC', render, record, caption_off=video_caption_off) visual_obs.append(vob) # add last frame logger.store(EpRet=ep_ret, EpLen=ep_len) print('Episode %d \t EpRet %.3f \t EpLen %d' % (n, ep_ret, ep_len)) o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 n += 1 logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.dump_tabular() if record: # temp_info: [video_prefix, ckpt_num, ep_ret, ep_len, con] temp_info = ['', ckpt_num, ep_ret, ep_len, con] logger.save_video(visual_obs, temp_info, fpath)
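# `logger.save_video` is defined elsewhere in the repo; a hypothetical minimal equivalent
# using imageio is sketched below (the file naming and fps are assumptions chosen for
# illustration, not the logger's actual behavior).
import os.path as osp
import imageio

def save_video_frames(frames, fpath, name="rollout", fps=30):
    """Write a list of HxWx3 uint8 frames to an mp4 under fpath."""
    imageio.mimwrite(osp.join(fpath, name + ".mp4"), frames, fps=fps)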
def __init__(self, env_fn, actor_critic=core.MLPActorCritic, ac_kwargs=dict(), seed=0, steps_per_epoch=100, epochs=10000, replay_size=int(2000000), gamma=0.99, polyak=0.995, lr=3e-4, p_lr=3e-4, alpha=0.0, batch_size=1024, start_steps=10000, update_after=0, update_every=50, num_test_episodes=10, max_ep_len=1000, logger_kwargs=dict(), save_freq=1, algo='SAC'): """ Soft Actor-Critic (SAC) Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: The constructor method for a PyTorch Module with an ``act`` method, a ``pi`` module, a ``q1`` module, and a ``q2`` module. The ``act`` method and ``pi`` module should accept batches of observations as inputs, and ``q1`` and ``q2`` should accept a batch of observations and a batch of actions as inputs. When called, ``act``, ``q1``, and ``q2`` should return: =========== ================ ====================================== Call Output Shape Description =========== ================ ====================================== ``act`` (batch, act_dim) | Numpy array of actions for each | observation. ``q1`` (batch,) | Tensor containing one current estimate | of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) ``q2`` (batch,) | Tensor containing the other current | estimate of Q* for the provided observations | and actions. (Critical: make sure to | flatten this!) =========== ================ ====================================== Calling ``pi`` should return: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``a`` (batch, act_dim) | Tensor containing actions from policy | given observations. ``logp_pi`` (batch,) | Tensor containing log probabilities of | actions in ``a``. Importantly: gradients | should be able to flow back into ``a``. =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the ActorCritic object you provided to SAC. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs to run and train agent. replay_size (int): Maximum length of replay buffer. gamma (float): Discount factor. (Always between 0 and 1.) polyak (float): Interpolation factor in polyak averaging for target networks. Target networks are updated towards main networks according to: .. math:: \\theta_{\\text{targ}} \\leftarrow \\rho \\theta_{\\text{targ}} + (1-\\rho) \\theta where :math:`\\rho` is polyak. (Always between 0 and 1, usually close to 1.) lr (float): Learning rate (used for both policy and value learning). alpha (float): Entropy regularization coefficient. (Equivalent to inverse of reward scale in the original SAC paper.) batch_size (int): Minibatch size for SGD. start_steps (int): Number of steps for uniform-random action selection, before running real policy. Helps exploration. update_after (int): Number of env interactions to collect before starting to do gradient descent updates. Ensures replay buffer is full enough for useful updates. update_every (int): Number of env interactions that should elapse between gradient descent updates. Note: Regardless of how long you wait between updates, the ratio of env steps to gradient steps is locked to 1. num_test_episodes (int): Number of episodes to test the deterministic policy at the end of each epoch. 
max_ep_len (int): Maximum length of trajectory / episode / rollout. logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. """ self.logger = EpochLogger(**logger_kwargs) self.logger.save_config(locals()) torch.manual_seed(seed) np.random.seed(seed) self.env, self.test_env = env_fn(), env_fn() self.obs_dim = self.env.observation_space.shape self.act_dim = self.env.action_space.shape[0] # Action limit for clamping: critically, assumes all dimensions share the same bound! self.act_limit = self.env.action_space.high[0] # Create actor-critic module and target networks self.ac = actor_critic(self.env.observation_space, self.env.action_space, special_policy='awac', **ac_kwargs) self.ac_targ = actor_critic(self.env.observation_space, self.env.action_space, special_policy='awac', **ac_kwargs) self.ac_targ.load_state_dict(self.ac.state_dict()) self.gamma = gamma # Freeze target networks with respect to optimizers (only update via polyak averaging) for p in self.ac_targ.parameters(): p.requires_grad = False # List of parameters for both Q-networks (save this for convenience) self.q_params = itertools.chain(self.ac.q1.parameters(), self.ac.q2.parameters()) # Experience buffer self.replay_buffer = ReplayBuffer(obs_dim=self.obs_dim, act_dim=self.act_dim, size=replay_size) # Count variables (protip: try to get a feel for how different size networks behave!) var_counts = tuple( core.count_vars(module) for module in [self.ac.pi, self.ac.q1, self.ac.q2]) self.logger.log('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d\n' % var_counts) self.algo = algo self.p_lr = p_lr self.lr = lr self.alpha = 0 # # Algorithm specific hyperparams # Set up optimizers for policy and q-function self.pi_optimizer = Adam(self.ac.pi.parameters(), lr=self.p_lr, weight_decay=1e-4) self.q_optimizer = Adam(self.q_params, lr=self.lr) self.num_test_episodes = num_test_episodes self.max_ep_len = max_ep_len self.epochs = epochs self.steps_per_epoch = steps_per_epoch self.update_after = update_after self.update_every = update_every self.batch_size = batch_size self.save_freq = save_freq self.polyak = polyak # Set up model saving self.logger.setup_pytorch_saver(self.ac) print("Running Offline RL algorithm: {}".format(self.algo))
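# The frozen target network above is later moved toward the main network with polyak
# averaging, following the formula in the docstring. A minimal PyTorch sketch of that step
# (written here for illustration; the actual update lives in the class's learning step):
import torch

def polyak_update(ac, ac_targ, polyak=0.995):
    with torch.no_grad():
        for p, p_targ in zip(ac.parameters(), ac_targ.parameters()):
            p_targ.data.mul_(polyak)
            p_targ.data.add_((1 - polyak) * p.data)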
def ppo(env_fn, actor_critic=core.mlp_actor_critic, ac_kwargs=dict(), seed=0, steps_per_epoch=4000, epochs=50, gamma=0.99, clip_ratio=0.2, pi_lr=3e-4, vf_lr=1e-3, train_pi_iters=80, train_v_iters=80, lam=0.97, max_ep_len=1000, target_kl=0.01, logger_kwargs=dict(), save_freq=10): """ Proximal Policy Optimization (by clipping), with early stopping based on approximate KL Args: env_fn : A function which creates a copy of the environment. The environment must satisfy the OpenAI Gym API. actor_critic: A function which takes in placeholder symbols for state, ``x_ph``, and action, ``a_ph``, and returns the main outputs from the agent's Tensorflow computation graph: =========== ================ ====================================== Symbol Shape Description =========== ================ ====================================== ``pi`` (batch, act_dim) | Samples actions from policy given | states. ``logp`` (batch,) | Gives log probability, according to | the policy, of taking actions ``a_ph`` | in states ``x_ph``. ``logp_pi`` (batch,) | Gives log probability, according to | the policy, of the action sampled by | ``pi``. ``v`` (batch,) | Gives the value estimate for states | in ``x_ph``. (Critical: make sure | to flatten this!) =========== ================ ====================================== ac_kwargs (dict): Any kwargs appropriate for the actor_critic function you provided to PPO. seed (int): Seed for random number generators. steps_per_epoch (int): Number of steps of interaction (state-action pairs) for the agent and the environment in each epoch. epochs (int): Number of epochs of interaction (equivalent to number of policy updates) to perform. gamma (float): Discount factor. (Always between 0 and 1.) clip_ratio (float): Hyperparameter for clipping in the policy objective. Roughly: how far can the new policy go from the old policy while still profiting (improving the objective function)? The new policy can still go farther than the clip_ratio says, but it doesn't help on the objective anymore. (Usually small, 0.1 to 0.3.) Typically denoted by :math:`\epsilon`. pi_lr (float): Learning rate for policy optimizer. vf_lr (float): Learning rate for value function optimizer. train_pi_iters (int): Maximum number of gradient descent steps to take on policy loss per epoch. (Early stopping may cause optimizer to take fewer than this.) train_v_iters (int): Number of gradient descent steps to take on value function per epoch. lam (float): Lambda for GAE-Lambda. (Always between 0 and 1, close to 1.) max_ep_len (int): Maximum length of trajectory / episode / rollout. target_kl (float): Roughly what KL divergence we think is appropriate between new and old policies after an update. This will get used for early stopping. (Usually small, 0.01 or 0.05.) logger_kwargs (dict): Keyword args for EpochLogger. save_freq (int): How often (in terms of gap between epochs) to save the current policy and value function. 
""" logger = EpochLogger(**logger_kwargs) logger.save_config(locals()) seed += 10000 * proc_id() tf.set_random_seed(seed) np.random.seed(seed) env = env_fn() obs_dim = env.observation_space.shape act_dim = env.action_space.shape # Share information about action space with policy architecture ac_kwargs['action_space'] = env.action_space # Inputs to computation graph x_ph, a_ph = core.placeholders_from_spaces(env.observation_space, env.action_space) adv_ph, ret_ph, logp_old_ph = core.placeholders(None, None, None) # Main outputs from computation graph pi, logp, logp_pi, v = actor_critic(x_ph, a_ph, **ac_kwargs) # Need all placeholders in *this* order later (to zip with data from buffer) all_phs = [x_ph, a_ph, adv_ph, ret_ph, logp_old_ph] # Every step, get: action, value, and logprob get_action_ops = [pi, v, logp_pi] # Experience buffer local_steps_per_epoch = int(steps_per_epoch / num_procs()) buf = PPOBuffer(obs_dim, act_dim, local_steps_per_epoch, gamma, lam) # Count variables var_counts = tuple(core.count_vars(scope) for scope in ['pi', 'v']) logger.log('\nNumber of parameters: \t pi: %d, \t v: %d\n' % var_counts) # PPO objectives ratio = tf.exp(logp - logp_old_ph) # pi(a|s) / pi_old(a|s) min_adv = tf.where(adv_ph > 0, (1 + clip_ratio) * adv_ph, (1 - clip_ratio) * adv_ph) pi_loss = -tf.reduce_mean(tf.minimum(ratio * adv_ph, min_adv)) v_loss = tf.reduce_mean((ret_ph - v)**2) # Info (useful to watch during learning) approx_kl = tf.reduce_mean( logp_old_ph - logp) # a sample estimate for KL-divergence, easy to compute approx_ent = tf.reduce_mean( -logp) # a sample estimate for entropy, also easy to compute clipped = tf.logical_or(ratio > (1 + clip_ratio), ratio < (1 - clip_ratio)) clipfrac = tf.reduce_mean(tf.cast(clipped, tf.float32)) # Optimizers train_pi = MpiAdamOptimizer(learning_rate=pi_lr).minimize(pi_loss) train_v = MpiAdamOptimizer(learning_rate=vf_lr).minimize(v_loss) sess = tf.Session() sess.run(tf.global_variables_initializer()) # Sync params across processes sess.run(sync_all_params()) # Setup model saving logger.setup_tf_saver(sess, inputs={'x': x_ph}, outputs={'pi': pi, 'v': v}) def update(): inputs = {k: v for k, v in zip(all_phs, buf.get())} pi_l_old, v_l_old, ent = sess.run([pi_loss, v_loss, approx_ent], feed_dict=inputs) # Training for i in range(train_pi_iters): _, kl = sess.run([train_pi, approx_kl], feed_dict=inputs) kl = mpi_avg(kl) if kl > 1.5 * target_kl: logger.log( 'Early stopping at step %d due to reaching max kl.' % i) break logger.store(StopIter=i) for _ in range(train_v_iters): sess.run(train_v, feed_dict=inputs) # Log changes from update pi_l_new, v_l_new, kl, cf = sess.run( [pi_loss, v_loss, approx_kl, clipfrac], feed_dict=inputs) logger.store(LossPi=pi_l_old, LossV=v_l_old, KL=kl, Entropy=ent, ClipFrac=cf, DeltaLossPi=(pi_l_new - pi_l_old), DeltaLossV=(v_l_new - v_l_old)) start_time = time.time() o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0 # Main loop: collect experience in env and update/log each epoch for epoch in range(epochs): for t in range(local_steps_per_epoch): a, v_t, logp_t = sess.run(get_action_ops, feed_dict={x_ph: o.reshape(1, -1)}) o2, r, d, _ = env.step(a[0]) ep_ret += r ep_len += 1 # save and log buf.store(o, a, r, v_t, logp_t) logger.store(VVals=v_t) # Update obs (critical!) o = o2 terminal = d or (ep_len == max_ep_len) if terminal or (t == local_steps_per_epoch - 1): if not (terminal): print('Warning: trajectory cut off by epoch at %d steps.' 
% ep_len) # if trajectory didn't reach terminal state, bootstrap value target last_val = 0 if d else sess.run( v, feed_dict={x_ph: o.reshape(1, -1)}) buf.finish_path(last_val) if terminal: # only save EpRet / EpLen if trajectory finished logger.store(EpRet=ep_ret, EpLen=ep_len) o, ep_ret, ep_len = env.reset(), 0, 0 # Save model if (epoch % save_freq == 0) or (epoch == epochs - 1): logger.save_state({'env': env}, None) # Perform PPO update! update() # Log info about epoch logger.log_tabular('Epoch', epoch) logger.log_tabular('EpRet', with_min_and_max=True) logger.log_tabular('EpLen', average_only=True) logger.log_tabular('VVals', with_min_and_max=True) logger.log_tabular('TotalEnvInteracts', (epoch + 1) * steps_per_epoch) logger.log_tabular('LossPi', average_only=True) logger.log_tabular('LossV', average_only=True) logger.log_tabular('DeltaLossPi', average_only=True) logger.log_tabular('DeltaLossV', average_only=True) logger.log_tabular('Entropy', average_only=True) logger.log_tabular('KL', average_only=True) logger.log_tabular('ClipFrac', average_only=True) logger.log_tabular('StopIter', average_only=True) logger.log_tabular('Time', time.time() - start_time) logger.dump_tabular()
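# The `min_adv` construction in the graph above is an equivalent way of writing the usual
# PPO clipped objective min(ratio * adv, clip(ratio, 1 - eps, 1 + eps) * adv). A quick NumPy
# check of that equivalence (illustrative values only):
import numpy as np

eps = 0.2
ratio = np.array([0.5, 0.9, 1.1, 1.5])
adv = np.array([1.0, -2.0, 3.0, -1.0])

clipped_form = np.minimum(ratio * adv, np.clip(ratio, 1 - eps, 1 + eps) * adv)
min_adv = np.where(adv > 0, (1 + eps) * adv, (1 - eps) * adv)
where_form = np.minimum(ratio * adv, min_adv)
assert np.allclose(clipped_form, where_form)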