class HIRO:
    def __init__(self,
                 env,
                 gamma=0.99,
                 polyak=0.995,
                 c=10,
                 d=2,
                 high_act_noise=0.1,
                 low_act_noise=0.1,
                 high_rew_scale=0.1,
                 low_rew_scale=1.0,
                 render=False,
                 batch_size=32,
                 q_lr=1e-3,
                 p_lr=1e-4,
                 buffer_capacity=5000,
                 max_episodes=100,
                 save_path=None,
                 load_path=None,
                 print_freq=1,
                 log_dir='logs/train',
                 training=True):
        self.gamma = gamma
        self.polyak = polyak
        self.low_act_noise = low_act_noise
        self.high_act_noise = high_act_noise
        self.low_rew_scale = low_rew_scale
        self.high_rew_scale = high_rew_scale
        self.render = render
        self.batch_size = batch_size
        self.p_lr = p_lr
        self.q_lr = q_lr
        self.max_episodes = max_episodes
        self.env = env
        self.rewards = []
        self.print_freq = print_freq
        self.save_path = save_path
        self.c = c
        self.d = d

        self.higher_buffer = ReplayBuffer(buffer_capacity, tuple_length=5)
        self.lower_buffer = ReplayBuffer(buffer_capacity, tuple_length=4)

        self.low_actor, self.low_critic_1, self.low_critic_2 = create_actor_critic(
            state_dim=2 * env.observation_space.shape[0],
            action_dim=env.action_space.shape[0],
            action_range=env.action_space.high)
        self.low_target_actor, self.low_target_critic_1, self.low_target_critic_2 = create_actor_critic(
            state_dim=2 * env.observation_space.shape[0],
            action_dim=env.action_space.shape[0],
            action_range=env.action_space.high)

        self.high_actor, self.high_critic_1, self.high_critic_2 = create_actor_critic(
            state_dim=env.observation_space.shape[0],
            action_dim=env.observation_space.shape[0],
            action_range=env.observation_space.high)
        self.high_target_actor, self.high_target_critic_1, self.high_target_critic_2 = create_actor_critic(
            state_dim=env.observation_space.shape[0],
            action_dim=env.observation_space.shape[0],
            action_range=env.observation_space.high)

        self.low_target_actor.set_weights(self.low_actor.get_weights())
        self.low_target_critic_1.set_weights(self.low_critic_1.get_weights())
        self.low_target_critic_2.set_weights(self.low_critic_2.get_weights())
        self.high_target_actor.set_weights(self.high_actor.get_weights())
        self.high_target_critic_1.set_weights(self.high_critic_1.get_weights())
        self.high_target_critic_2.set_weights(self.high_critic_2.get_weights())

        if training:
            self.low_actor_optimizer = tf.keras.optimizers.Adam(learning_rate=self.p_lr)
            self.low_critic_1_optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
            self.low_critic_2_optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
            self.high_actor_optimizer = tf.keras.optimizers.Adam(learning_rate=self.p_lr)
            self.high_critic_1_optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
            self.high_critic_2_optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)

            self.mse = tf.keras.losses.MeanSquaredError()
            self.summary_writer = tf.summary.create_file_writer(log_dir)

            self.low_actor_train_fn = self.create_train_step_actor_fn(
                self.low_actor, self.low_critic_1, self.low_actor_optimizer)
            self.low_critic_train_fns = [
                self.create_train_step_critic_fn(critic=c, optimizer=o)
                for c, o in [(self.low_critic_1, self.low_critic_1_optimizer),
                             (self.low_critic_2, self.low_critic_2_optimizer)]]

            self.high_actor_train_fn = self.create_train_step_actor_fn(
                self.high_actor, self.high_critic_1, self.high_actor_optimizer)
            self.high_critic_train_fns = [
                self.create_train_step_critic_fn(critic=c, optimizer=o)
                for c, o in [(self.high_critic_1, self.high_critic_1_optimizer),
                             (self.high_critic_2, self.high_critic_2_optimizer)]]

        if load_path is not None:
            self.low_actor.load_weights(f'{load_path}/low/actor')
            self.low_critic_1.load_weights(f'{load_path}/low/critic_1')
            self.low_critic_2.load_weights(f'{load_path}/low/critic_2')
            self.high_actor.load_weights(f'{load_path}/high/actor')
            self.high_critic_1.load_weights(f'{load_path}/high/critic_1')
            self.high_critic_2.load_weights(f'{load_path}/high/critic_2')

    @staticmethod
    def goal_transition(state, goal, next_state):
        # h(s_t, g_t, s_{t+1}) = s_t + g_t - s_{t+1}
        return state + goal - next_state

    @staticmethod
    def intrinsic_reward(state, goal, next_state):
        # Negative distance between the achieved state change and the goal
        return -np.linalg.norm(state + goal - next_state)

    def act(self, obs, goal, noise=False):
        norm_dist = tf.random.normal(self.env.action_space.shape,
                                     stddev=self.low_act_noise * self.env.action_space.high)
        action = self.low_actor(np.concatenate((obs, goal), axis=1)).numpy()
        action = np.clip(action + (norm_dist.numpy() if noise else 0),
                         a_min=self.env.action_space.low,
                         a_max=self.env.action_space.high)
        return action

    def get_goal(self, obs, noise=False):
        norm_dist = tf.random.normal(self.env.observation_space.shape,
                                     stddev=self.high_act_noise * self.env.observation_space.high)
        goal = self.high_actor(obs).numpy()
        goal = np.clip(goal + (norm_dist.numpy() if noise else 0),
                       a_min=self.env.observation_space.low,
                       a_max=self.env.observation_space.high)
        return goal

    @tf.function
    def log_probability(self, states, actions, candidate_goal):
        goals = tf.reshape(candidate_goal, (1, -1))

        def body(curr_i, curr_goals, s):
            new_goals = tf.concat(
                (curr_goals,
                 tf.reshape(self.goal_transition(s[curr_i - 1], curr_goals[curr_i - 1], s[curr_i]), (1, -1))),
                axis=0)
            curr_i += 1
            return [curr_i, new_goals, s]

        def condition(curr_i, curr_goals, s):
            return curr_i < s.shape[0] and not (
                tf.equal(tf.math.count_nonzero(s[curr_i]), 0)
                and tf.equal(tf.math.count_nonzero(actions[curr_i]), 0))

        # If a state-action pair is all zero, the episode ended before an entire sequence of
        # length c was recorded. These empty states and actions must be removed from the log
        # probability calculation, as they could skew the argmax computation.
        i = tf.constant(1)
        i, goals, states = tf.while_loop(condition, body, [i, goals, states],
                                         shape_invariants=[tf.TensorShape(None),
                                                           tf.TensorShape([None, goals.shape[1]]),
                                                           states.shape])
        states = states[:i, :]
        actions = actions[:i, :]
        action_predictions = self.low_actor(tf.concat((states, goals), axis=1))
        return -(1 / 2) * tf.reduce_sum(tf.linalg.norm(actions - action_predictions, axis=1))

    @tf.function
    def off_policy_correct(self, states, goals, actions, new_states):
        first_states = tf.reshape(states, (self.batch_size, -1))[:, :new_states[0].shape[0]]
        means = new_states - first_states
        std_dev = 0.5 * (1 / 2) * tf.convert_to_tensor(self.env.observation_space.high)
        for i in range(states.shape[0]):
            # Sample eight candidate goals from a Gaussian centered at s_{t+c} - s_t, and also
            # include the original goal and a goal corresponding to the difference s_{t+c} - s_t.
            # TODO: clip the sampled candidates to lie within the high-level action range
            candidate_goals = tf.concat(
                (tf.random.normal(shape=(8, self.env.observation_space.shape[0]),
                                  mean=means[i], stddev=std_dev),
                 tf.reshape(goals[i], (1, -1)),
                 tf.reshape(means[i], (1, -1))),
                axis=0)
            # Relabel the transition with the candidate goal that maximizes the probability
            # of the observed low-level action sequence
            chosen_goal = tf.argmax(
                [self.log_probability(states[i], actions[i], candidate_goals[g])
                 for g in range(candidate_goals.shape[0])])
            goals = tf.tensor_scatter_nd_update(goals, [[i]], [candidate_goals[chosen_goal]])
        return first_states, goals

    @tf.function
    def train_step_critics(self, states, actions, rewards, next_states, actor, target_critic_1,
                           target_critic_2, critic_trains_fns, target_noise, scope='Policy'):
        # Clipped double-Q targets: take the smaller of the two target critics' estimates
        target_goal_preds = actor(next_states)
        target_goal_preds += target_noise
        target_q_values_1 = target_critic_1([next_states, target_goal_preds])
        target_q_values_2 = target_critic_2([next_states, target_goal_preds])
        target_q_values = tf.concat((target_q_values_1, target_q_values_2), axis=1)
        target_q_values = tf.reshape(tf.reduce_min(target_q_values, axis=1), (self.batch_size, -1))
        targets = rewards + self.gamma * target_q_values
        critic_trains_fns[0](states, actions, targets, scope=scope, label='Critic 1')
        critic_trains_fns[1](states, actions, targets, scope=scope, label='Critic 2')

    def create_train_step_actor_fn(self, actor, critic, optimizer):
        @tf.function
        def train_step_actor(states, scope='Policy', label='Actor'):
            with tf.GradientTape() as tape:
                action_predictions = actor(states)
                q_values = critic([states, action_predictions])
                policy_loss = -tf.reduce_mean(q_values)
            gradients = tape.gradient(policy_loss, actor.trainable_variables)
            optimizer.apply_gradients(zip(gradients, actor.trainable_variables))
            with tf.name_scope(scope):
                with self.summary_writer.as_default():
                    tf.summary.scalar(f'{label} Policy Loss', policy_loss, step=optimizer.iterations)

        return train_step_actor

    def create_train_step_critic_fn(self, critic, optimizer):
        @tf.function
        def train_step_critic(states, actions, targets, scope='Policy', label='Critic'):
            with tf.GradientTape() as tape:
                q_values = critic([states, actions])
                mse_loss = self.mse(q_values, targets)
            gradients = tape.gradient(mse_loss, critic.trainable_variables)
            optimizer.apply_gradients(zip(gradients, critic.trainable_variables))
            with tf.name_scope(scope):
                with self.summary_writer.as_default():
                    tf.summary.scalar(f'{label} MSE Loss', mse_loss, step=optimizer.iterations)
                    tf.summary.scalar(f'{label} Mean Q Values', tf.reduce_mean(q_values),
                                      step=optimizer.iterations)

        return train_step_critic

    def update_lower(self):
        if len(self.lower_buffer) >= self.batch_size:
            states, actions, rewards, next_states = self.lower_buffer.sample(self.batch_size)
            rewards = rewards.reshape(-1, 1).astype(np.float32)
            self.train_step_critics(
                states, actions, rewards, next_states, self.low_actor,
                self.low_target_critic_1, self.low_target_critic_2, self.low_critic_train_fns,
                target_noise=tf.random.normal(actions.shape,
                                              stddev=self.low_act_noise * self.env.action_space.high),
                scope='Lower_Policy')
            # Delayed (TD3-style) actor and target updates every d critic updates
            if self.low_critic_1_optimizer.iterations % self.d == 0:
                self.low_actor_train_fn(states, scope='Lower_Policy', label='Actor')

                # Update target networks
                polyak_average(self.low_actor.variables, self.low_target_actor.variables, self.polyak)
                polyak_average(self.low_critic_1.variables, self.low_target_critic_1.variables, self.polyak)
                polyak_average(self.low_critic_2.variables, self.low_target_critic_2.variables, self.polyak)

    def update_higher(self):
        if len(self.higher_buffer) >= self.batch_size:
            states, goals, actions, rewards, next_states = self.higher_buffer.sample(self.batch_size)
            rewards = rewards.reshape((-1, 1))
            states, goals, actions, rewards, next_states = (
                tf.convert_to_tensor(states, dtype=tf.float32),
                tf.convert_to_tensor(goals, dtype=tf.float32),
                tf.convert_to_tensor(actions, dtype=tf.float32),
                tf.convert_to_tensor(rewards, dtype=tf.float32),
                tf.convert_to_tensor(next_states, dtype=tf.float32))
            # Relabel goals so old transitions stay consistent with the current low-level policy
            states, goals = self.off_policy_correct(states=states, goals=goals,
                                                    actions=actions, new_states=next_states)
            self.train_step_critics(
                states, goals, rewards, next_states, self.high_actor,
                self.high_target_critic_1, self.high_target_critic_2, self.high_critic_train_fns,
                target_noise=tf.random.normal(next_states.shape,
                                              stddev=self.high_act_noise * self.env.observation_space.high),
                scope='Higher_Policy')
            if self.high_critic_1_optimizer.iterations % self.d == 0:
                self.high_actor_train_fn(states, scope='Higher_Policy', label='Actor')

                # Update target networks
                polyak_average(self.high_actor.variables, self.high_target_actor.variables, self.polyak)
                polyak_average(self.high_critic_1.variables, self.high_target_critic_1.variables, self.polyak)
                polyak_average(self.high_critic_2.variables, self.high_target_critic_2.variables, self.polyak)

    def learn(self):
        # Collect experiences s_t, g_t, a_t, R_t
        mean_reward = None
        total_steps = 0
        for ep in range(self.max_episodes):
            if ep % self.print_freq == 0 and ep > 0:
                new_mean_reward = np.mean(self.rewards[-self.print_freq - 1:])
                print(f"-------------------------------------------------------")
                print(f"Mean {self.print_freq} Episode Reward: {new_mean_reward}")
                print(f"Total Episodes: {ep}")
                print(f"Total Steps: {total_steps}")
                print(f"-------------------------------------------------------")
                total_steps = 0

                with tf.name_scope('Episodic Information'):
                    with self.summary_writer.as_default():
                        tf.summary.scalar(f'Mean {self.print_freq} Episode Reward',
                                          new_mean_reward, step=ep // self.print_freq)

                # Model saving inspired by the OpenAI Baselines implementation
                if (mean_reward is None or new_mean_reward >= mean_reward) and self.save_path is not None:
                    print(f"Saving model due to mean reward increase: {mean_reward} -> {new_mean_reward}")
                    print(f'Location: {self.save_path}')
                    mean_reward = new_mean_reward
                    self.low_actor.save_weights(f'{self.save_path}/low/actor')
                    self.low_critic_1.save_weights(f'{self.save_path}/low/critic_1')
                    self.low_critic_2.save_weights(f'{self.save_path}/low/critic_2')
                    self.high_actor.save_weights(f'{self.save_path}/high/actor')
                    self.high_critic_1.save_weights(f'{self.save_path}/high/critic_1')
                    self.high_critic_2.save_weights(f'{self.save_path}/high/critic_2')

            obs = self.env.reset()
            goal = self.get_goal(obs.reshape((1, -1)), noise=True).flatten()
            higher_goal = goal
            higher_obs = []
            higher_actions = []
            higher_reward = 0
            episode_reward = 0
            episode_intrinsic_rewards = 0
            ep_len = 0
            c = 0
            done = False
            while not done:
                if self.render:
                    self.env.render()

                action = self.act(obs.reshape((1, -1)), goal.reshape((1, -1)), noise=True).flatten()
                new_obs, rew, done, info = self.env.step(action)
                new_obs = new_obs.flatten()
                new_goal = self.goal_transition(obs, goal, new_obs)
                episode_reward += rew

                # Goals are treated as additional state information for the low-level policy.
                # Store transitions in the respective replay buffers.
                intrinsic_reward = self.intrinsic_reward(obs, goal, new_obs) * self.low_rew_scale
                self.lower_buffer.add((np.concatenate((obs, goal)),
                                       action,
                                       intrinsic_reward,
                                       np.concatenate((new_obs, new_goal))))
                episode_intrinsic_rewards += intrinsic_reward
                self.update_lower()

                # Accumulate the pieces of a single higher-level transition
                higher_obs.append(obs)
                higher_actions.append(action)
                higher_reward += self.high_rew_scale * rew

                # Only add transitions to the high-level replay buffer every c steps
                c += 1
                if c == self.c or done:
                    # All higher-level transitions must have the same length,
                    # so pad the remainder of this transition with zeros.
                    while c < self.c:
                        higher_obs.append(np.full(self.env.observation_space.shape, 0))
                        higher_actions.append(np.full(self.env.action_space.shape, 0))
                        c += 1
                    self.higher_buffer.add((higher_obs, higher_goal, higher_actions, higher_reward, new_obs))
                    self.update_higher()
                    c = 0
                    higher_obs = []
                    higher_actions = []
                    higher_reward = 0
                    # Sample a fresh goal from the high-level policy every c steps
                    goal = self.get_goal(new_obs.reshape((1, -1)), noise=True).flatten()
                    higher_goal = goal
                else:
                    # Between high-level decisions, the goal follows the fixed goal transition
                    goal = new_goal

                obs = new_obs
                ep_len += 1

            with tf.name_scope('Episodic Information'):
                with self.summary_writer.as_default():
                    tf.summary.scalar('Episode Environment Reward', episode_reward, step=ep)
                    tf.summary.scalar('Episode Intrinsic Reward', episode_intrinsic_rewards, step=ep)

            self.rewards.append(episode_reward)
            total_steps += ep_len
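
# The HIRO class above (and the DDPG class below) relies on helpers that are not shown in this
# excerpt: `create_actor_critic`, `polyak_average`, and `ReplayBuffer`. The following is only a
# minimal sketch of the two simpler ones, written to match how they are called above (a
# `tuple_length`-aware ring buffer and an in-place Polyak update of TensorFlow variables).
# Treat the details as assumptions rather than the original implementations.
import random
from collections import deque

import numpy as np


def polyak_average(source_variables, target_variables, polyak):
    # target <- polyak * target + (1 - polyak) * source, applied variable by variable
    for source, target in zip(source_variables, target_variables):
        target.assign(polyak * target + (1 - polyak) * source)


class ReplayBuffer:
    def __init__(self, capacity, tuple_length=5):
        self.buffer = deque(maxlen=capacity)
        self.tuple_length = tuple_length

    def __len__(self):
        return len(self.buffer)

    def add(self, transition):
        assert len(transition) == self.tuple_length
        self.buffer.append(transition)

    def sample(self, batch_size):
        # Returns one array per transition element, e.g. (states, actions, rewards, next_states)
        batch = random.sample(self.buffer, batch_size)
        return tuple(np.array(column) for column in zip(*batch))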
class DDPG:
    def __init__(self,
                 env,
                 gamma=0.99,
                 polyak=0.995,
                 act_noise=0.1,
                 render=False,
                 batch_size=32,
                 q_lr=1e-3,
                 p_lr=1e-4,
                 buffer_capacity=5000,
                 max_episodes=100,
                 save_path=None,
                 load_path=None,
                 print_freq=1,
                 start_steps=10000,
                 log_dir='logs/train',
                 training=True):
        self.gamma = gamma
        self.polyak = polyak
        self.act_noise = act_noise
        self.render = render
        self.batch_size = batch_size
        self.p_lr = p_lr
        self.q_lr = q_lr
        self.max_episodes = max_episodes
        self.start_steps = start_steps

        self.actor, self.critic = create_actor_critic(
            env.observation_space.shape[0], env.action_space.shape[0], env.action_space.high)
        self.target_actor, self.target_critic = create_actor_critic(
            env.observation_space.shape[0], env.action_space.shape[0], env.action_space.high)
        self.target_actor.set_weights(self.actor.get_weights())
        self.target_critic.set_weights(self.critic.get_weights())

        self.env = env
        self.rewards = []
        self.print_freq = print_freq
        self.save_path = save_path

        if training:
            self.buffer = ReplayBuffer(buffer_capacity)
            self.actor_optimizer = tf.keras.optimizers.Adam(learning_rate=self.p_lr)
            self.critic_optimizer = tf.keras.optimizers.Adam(learning_rate=self.q_lr)
            self.summary_writer = tf.summary.create_file_writer(log_dir)
            self.mse = tf.keras.losses.MeanSquaredError()

        if load_path is not None:
            self.actor.load_weights(f'{load_path}/actor')
            self.critic.load_weights(f'{load_path}/critic')

    @tf.function
    def train_step(self, states, actions, targets):
        with tf.GradientTape() as tape:
            action_predictions = self.actor(states)
            q_values = self.critic([states, action_predictions])
            policy_loss = -tf.reduce_mean(q_values)
        actor_gradients = tape.gradient(policy_loss, self.actor.trainable_variables)
        self.actor_optimizer.apply_gradients(zip(actor_gradients, self.actor.trainable_variables))

        with tf.GradientTape() as tape:
            q_values = self.critic([states, actions])
            mse_loss = self.mse(q_values, targets)
        critic_gradients = tape.gradient(mse_loss, self.critic.trainable_variables)
        self.critic_optimizer.apply_gradients(zip(critic_gradients, self.critic.trainable_variables))

        with self.summary_writer.as_default():
            tf.summary.scalar('Policy Loss', policy_loss, step=self.critic_optimizer.iterations)
            tf.summary.scalar('MSE Loss', mse_loss, step=self.critic_optimizer.iterations)
            tf.summary.scalar('Estimated Q Value', tf.reduce_mean(q_values),
                              step=self.critic_optimizer.iterations)

    def update(self):
        if len(self.buffer) >= self.batch_size:
            # Sample random minibatch of N transitions
            states, actions, rewards, next_states, dones = self.buffer.sample(self.batch_size)
            dones = dones.reshape(-1, 1)
            rewards = rewards.reshape(-1, 1)

            # Set the target for learning
            target_action_preds = self.target_actor(next_states)
            target_q_values = self.target_critic([next_states, target_action_preds])
            targets = rewards + self.gamma * target_q_values * (1 - dones)

            # Update the critic by minimizing the MSE loss and the actor
            # using the sampled policy gradient
            self.train_step(states, actions, targets)

            # Update target networks
            polyak_average(self.actor.variables, self.target_actor.variables, self.polyak)
            polyak_average(self.critic.variables, self.target_critic.variables, self.polyak)

    def act(self, obs, noise=False):
        # Initialize a random process N for action exploration
        norm_dist = tf.random.normal(self.env.action_space.shape, stddev=self.act_noise)
        action = self.actor(np.expand_dims(obs, axis=0))
        action = np.clip(action.numpy() + (norm_dist.numpy() if noise else 0),
                         a_min=self.env.action_space.low,
                         a_max=self.env.action_space.high)
        return action

    def learn(self):
        mean_reward = None
        total_steps = 0
        overall_steps = 0
        for ep in range(self.max_episodes):
            if ep % self.print_freq == 0 and ep > 0:
                new_mean_reward = np.mean(self.rewards[-self.print_freq - 1:])
                print(f"-------------------------------------------------------")
                print(f"Mean {self.print_freq} Episode Reward: {new_mean_reward}")
                print(f"Mean Steps: {total_steps / self.print_freq}")
                print(f"Total Episodes: {ep}")
                print(f"Total Steps: {overall_steps}")
                print(f"-------------------------------------------------------")
                total_steps = 0

                with self.summary_writer.as_default():
                    tf.summary.scalar(f'Mean {self.print_freq} Episode Reward', new_mean_reward, step=ep)

                # Model saving inspired by the OpenAI Baselines implementation
                if (mean_reward is None or new_mean_reward >= mean_reward) and self.save_path is not None:
                    print(f"Saving model due to mean reward increase: {mean_reward} -> {new_mean_reward}")
                    print(f'Location: {self.save_path}')
                    mean_reward = new_mean_reward
                    self.actor.save_weights(f'{self.save_path}/actor')
                    self.critic.save_weights(f'{self.save_path}/critic')

            # Receive initial observation state s_1
            obs = self.env.reset()
            done = False
            episode_reward = 0
            ep_len = 0
            while not done:
                # Display the environment
                if self.render:
                    self.env.render()

                if self.start_steps > 0:
                    # Warm up with uniformly random actions before using the policy
                    self.start_steps -= 1
                    action = self.env.action_space.sample()
                else:
                    # Select action according to the policy and exploration noise
                    action = self.act(obs, noise=True).flatten()

                # Execute action and observe reward and new state
                new_obs, rew, done, info = self.env.step(action)
                new_obs = new_obs.flatten()
                episode_reward += rew

                # Store transition in R
                self.buffer.add((obs, action, rew, new_obs, done))

                # Perform a single learning step
                self.update()

                obs = new_obs
                ep_len += 1

            with self.summary_writer.as_default():
                tf.summary.scalar('Episode Reward', episode_reward, step=ep)

            self.rewards.append(episode_reward)
            total_steps += ep_len
            overall_steps += ep_len
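
# A minimal usage sketch for the two TensorFlow agents above. It assumes a continuous-control
# Gym environment (e.g. 'Pendulum-v0') with the classic 4-tuple `env.step` API that the training
# loops above expect; the environment name, paths, and hyperparameters here are illustrative
# placeholders, not values taken from the original code.
import gym

env = gym.make('Pendulum-v0')

ddpg_agent = DDPG(env, max_episodes=200, save_path='models/ddpg', log_dir='logs/ddpg')
ddpg_agent.learn()

hiro_agent = HIRO(env, c=10, d=2, max_episodes=200, save_path='models/hiro', log_dir='logs/hiro')
hiro_agent.learn()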
for i, data in enumerate(data_loader):
    real_A, real_B = data

    ### Generators
    # Identity loss: each generator should leave images of its target domain unchanged
    same_A = netG_B2A(real_A)
    loss_identity_A = torch.nn.functional.l1_loss(same_A, real_A) * lambda_identity
    same_B = netG_A2B(real_B)
    loss_identity_B = torch.nn.functional.l1_loss(same_B, real_B) * lambda_identity

    # GAN loss: translated images should fool the corresponding discriminator
    fake_B = netG_A2B(real_A)
    pred_fake = netD_B(fake_B)
    loss_GAN_A2B = torch.nn.functional.mse_loss(pred_fake, target_real)
    buffer_B.add(fake_B)

    fake_A = netG_B2A(real_B)
    pred_fake = netD_A(fake_A)
    loss_GAN_B2A = torch.nn.functional.mse_loss(pred_fake, target_real)
    buffer_A.add(fake_A)

    # Cycle-consistency loss: translating there and back should reconstruct the input
    recovered_A = netG_B2A(fake_B)
    loss_cycle_ABA = torch.nn.functional.l1_loss(recovered_A, real_A) * lambda_cycle
    recovered_B = netG_A2B(fake_A)
    loss_cycle_BAB = torch.nn.functional.l1_loss(recovered_B, real_B) * lambda_cycle

    # Total generator loss
    loss_G = (loss_identity_A + loss_identity_B
              + loss_GAN_A2B + loss_GAN_B2A
              + loss_cycle_ABA + loss_cycle_BAB)
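
    # The excerpt above stops at the total generator loss. The steps below are NOT part of the
    # original excerpt; they only sketch how a CycleGAN iteration is typically finished:
    # back-propagate loss_G, then update each discriminator against real images and generated
    # images drawn from its history buffer. `optimizer_G`, `optimizer_D_A`, `optimizer_D_B`,
    # `target_fake`, and the `buffer_A.sample()` / `buffer_B.sample()` methods are assumptions.
    optimizer_G.zero_grad()
    loss_G.backward()
    optimizer_G.step()

    # Discriminator A: real domain-A images vs. generated images from the history buffer
    optimizer_D_A.zero_grad()
    pred_real = netD_A(real_A)
    loss_D_real = torch.nn.functional.mse_loss(pred_real, target_real)
    pred_fake = netD_A(buffer_A.sample().detach())
    loss_D_fake = torch.nn.functional.mse_loss(pred_fake, target_fake)
    loss_D_A = 0.5 * (loss_D_real + loss_D_fake)
    loss_D_A.backward()
    optimizer_D_A.step()

    # Discriminator B, symmetrically
    optimizer_D_B.zero_grad()
    pred_real = netD_B(real_B)
    loss_D_real = torch.nn.functional.mse_loss(pred_real, target_real)
    pred_fake = netD_B(buffer_B.sample().detach())
    loss_D_fake = torch.nn.functional.mse_loss(pred_fake, target_fake)
    loss_D_B = 0.5 * (loss_D_real + loss_D_fake)
    loss_D_B.backward()
    optimizer_D_B.step()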
class DDPGAgent():
    def __init__(self, state_size, action_size, num_agents):
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(RANDOM_SEED)
        self.num_agents = num_agents

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size).to(device)
        self.actor_target = Actor(state_size, action_size).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size).to(device)
        self.critic_target = Critic(state_size, action_size).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE)

        # Directory where to save the model
        self.model_dir = os.getcwd() + "/DDPG/saved_models"
        os.makedirs(self.model_dir, exist_ok=True)

    def step(self, states, actions, rewards, next_states, dones):
        for i in range(self.num_agents):
            self.memory.add(states[i], actions[i], rewards[i], next_states[i], dones[i])
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self.learn(experiences, GAMMA)

    def act(self, states, add_noise=True):
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                actions[i, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences

        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # adds gradient clipping to stabilize learning
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def save_model(self):
        torch.save(self.actor_local.state_dict(),
                   os.path.join(self.model_dir, 'actor_params.pth'))
        torch.save(self.actor_optimizer.state_dict(),
                   os.path.join(self.model_dir, 'actor_optim_params.pth'))
        torch.save(self.critic_local.state_dict(),
                   os.path.join(self.model_dir, 'critic_params.pth'))
        torch.save(self.critic_optimizer.state_dict(),
                   os.path.join(self.model_dir, 'critic_optim_params.pth'))

    def load_model(self):
        """Loads weights from saved model."""
        self.actor_local.load_state_dict(
            torch.load(os.path.join(self.model_dir, 'actor_params.pth')))
        self.actor_optimizer.load_state_dict(
            torch.load(os.path.join(self.model_dir, 'actor_optim_params.pth')))
        self.critic_local.load_state_dict(
            torch.load(os.path.join(self.model_dir, 'critic_params.pth')))
        self.critic_optimizer.load_state_dict(
            torch.load(os.path.join(self.model_dir, 'critic_optim_params.pth')))
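
# `OUNoise` (and the PyTorch `ReplayBuffer`) are used by DDPGAgent but are not included in this
# excerpt. The class below is only a minimal sketch of a standard Ornstein-Uhlenbeck noise
# process with conventional mu/theta/sigma defaults; the original helper may differ.
import copy

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process for temporally correlated exploration noise."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Reset the internal state to the mean
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1)
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state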