def train_expert(env_name):
    """Train expert policy in given environment."""
    if env_name == 'InvertedPendulum-v2':
        env = ExpertInvertedPendulumEnv()
        episode_limit = 200
        return_threshold = 200
    elif env_name == 'InvertedDoublePendulum-v2':
        env = ExpertInvertedDoublePendulumEnv()
        episode_limit = 50
        return_threshold = 460
    elif env_name == 'ThreeReacherEasy-v2':
        env = ThreeReacherEasyEnv()
        episode_limit = 50
        return_threshold = -0.8
    elif env_name == 'ReacherEasy-v2':
        env = ReacherEasyEnv()
        episode_limit = 50
        return_threshold = -0.8
    elif env_name == 'Hopper-v2':
        env = HopperEnv()
        episode_limit = 200
        return_threshold = 600
    elif env_name == 'HalfCheetah-v2':
        env = ExpertHalfCheetahEnv()
        episode_limit = 200
        return_threshold = 1000
    elif env_name == 'StrikerHumanSim-v2':
        env = StrikerHumanSimEnv()
        episode_limit = 200
        return_threshold = -190
    elif env_name == 'PusherHumanSim-v2':
        env = PusherHumanSimEnv()
        episode_limit = 200
        return_threshold = -80
    else:
        raise NotImplementedError

    buffer_size = 1000000
    init_random_samples = 1000
    exploration_noise = 0.2
    learning_rate = 3e-4
    batch_size = 256
    epochs = 200
    steps_per_epoch = 5000
    updates_per_step = 1
    update_actor_every = 1
    start_training = 512
    gamma = 0.99
    polyak = 0.995
    entropy_coefficient = 0.2
    clip_actor_gradients = False
    visual_env = True
    action_size = env.action_space.shape[0]
    tune_entropy_coefficient = True
    target_entropy = -1 * action_size

    def make_actor():
        actor = StochasticActor([
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(action_size * 2)
        ])
        return actor

    def make_critic():
        critic = Critic([
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(256, 'relu'),
            tf.keras.layers.Dense(1)
        ])
        return critic

    optimizer = tf.keras.optimizers.Adam(learning_rate)
    replay_buffer = ReplayBuffer(buffer_size)
    sampler = Sampler(env, episode_limit=episode_limit,
                      init_random_samples=init_random_samples,
                      visual_env=visual_env)
    agent = SAC(make_actor, make_critic, make_critic,
                actor_optimizer=optimizer, critic_optimizer=optimizer,
                gamma=gamma, polyak=polyak,
                entropy_coefficient=entropy_coefficient,
                tune_entropy_coefficient=tune_entropy_coefficient,
                target_entropy=target_entropy,
                clip_actor_gradients=clip_actor_gradients)

    if visual_env:
        obs = np.expand_dims(env.reset()['obs'], axis=0)
    else:
        obs = np.expand_dims(env.reset(), axis=0)
    agent(obs)
    agent.summary()

    mean_test_returns = []
    mean_test_std = []
    steps = []

    step_counter = 0
    for e in range(epochs):
        while step_counter < (e + 1) * steps_per_epoch:
            traj_data = sampler.sample_trajectory(agent, exploration_noise)
            replay_buffer.add(traj_data)
            if step_counter > start_training:
                agent.train(replay_buffer, batch_size=batch_size,
                            n_updates=updates_per_step * traj_data['n'],
                            act_delay=update_actor_every)
            step_counter += traj_data['n']
        print('Epoch {}/{} - total steps {}'.format(e + 1, epochs, step_counter))
        out = sampler.evaluate(agent, 10)
        mean_test_returns.append(out['mean'])
        mean_test_std.append(out['std'])
        steps.append(step_counter)
        if out['mean'] >= return_threshold:
            print('Early termination due to reaching return threshold')
            break

    plt.errorbar(steps, mean_test_returns, mean_test_std)
    plt.xlabel('steps')
    plt.ylabel('returns')
    plt.show()
    return agent
def main():
    with tf.Session() as sess:
        actor = ActorNetwork(sess, STATE_DIM, ACTION_DIM, ACTION_BOUND,
                             ACTOR_LEARNING_RATE, TAU, MINIBATCH_SIZE)
        critic = CriticNetwork(sess, STATE_DIM, ACTION_DIM,
                               CRITIC_LEARNING_RATE, TAU,
                               actor.get_num_trainable_vars())
        # actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(ACTION_DIM))
        # TODO: Ornstein-Uhlenbeck noise.

        sess.run(tf.global_variables_initializer())

        # Initialize target networks.
        actor.update_target_network()
        critic.update_target_network()

        # Initialize replay memory.
        replay_buffer = ReplayBuffer(BUFFER_SIZE)

        # Main loop.
        for ep in range(MAX_EPISODES):
            episode_reward = 0
            ep_batch_avg_q = 0
            s = ENV.reset()

            for step in range(MAX_EP_STEPS):
                a = actor.predict(np.reshape(s, (1, STATE_DIM)))  # + actor_noise()
                s2, r, terminal, info = ENV.step(a[0])

                replay_buffer.add(np.reshape(s, (STATE_DIM,)),
                                  np.reshape(a, (ACTION_DIM,)),
                                  r,
                                  terminal,
                                  np.reshape(s2, (STATE_DIM,)))

                # Batch sampling.
                if replay_buffer.size() > MINIBATCH_SIZE and \
                        step % TRAIN_INTERVAL == 0:
                    s_batch, a_batch, r_batch, t_batch, s2_batch = \
                        replay_buffer.sample_batch(MINIBATCH_SIZE)

                    # Compute the target Q values.
                    target_action = actor.predict_target(s2_batch)
                    target_q = critic.predict_target(s2_batch, target_action)

                    # Compute the critic's target values.
                    targets = []
                    for i in range(MINIBATCH_SIZE):
                        if t_batch[i]:  # terminal
                            targets.append(r_batch[i])
                        else:
                            targets.append(r_batch[i] + GAMMA * target_q[i])

                    # Train the critic.
                    # TODO: pred_q comes from a random batch, not an episode,
                    # so a statistic such as episode_avg_max is inappropriate.
                    pred_q, _ = critic.train(
                        s_batch, a_batch,
                        np.reshape(targets, (MINIBATCH_SIZE, 1)))

                    # Train the actor.
                    a_outs = actor.predict(s_batch)
                    grads = critic.action_gradients(s_batch, a_outs)
                    actor.train(s_batch, grads[0])

                    # Update target networks.
                    # Should this perhaps only be done once every few batches?
                    actor.update_target_network()
                    critic.update_target_network()

                    ep_batch_avg_q += np.mean(pred_q)

                s = s2
                episode_reward += r

                if terminal:
                    print('Episode:', ep, 'Reward:', episode_reward)
                    reward_log.append(episode_reward)
                    q_log.append(ep_batch_avg_q / step)
                    break
class SAC:
    def __init__(self,
                 env,
                 gamma=0.99,
                 tau=0.005,
                 learning_rate=3e-4,
                 buffer_size=50000,
                 learning_starts=100,
                 train_freq=1,
                 batch_size=64,
                 target_update_interval=1,
                 gradient_steps=1,
                 target_entropy='auto',
                 ent_coef='auto',
                 random_exploration=0.0,
                 discrete=True,
                 regularized=True,
                 feature_extraction="cnn"):
        self.env = env
        self.learning_starts = learning_starts
        self.random_exploration = random_exploration
        self.train_freq = train_freq
        self.target_update_interval = target_update_interval
        self.batch_size = batch_size
        self.gradient_steps = gradient_steps
        self.learning_rate = learning_rate

        self.graph = tf.Graph()
        with self.graph.as_default():
            self.sess = tf.Session(graph=self.graph)
            self.replay_buffer = ReplayBuffer(buffer_size)
            self.agent = SACAgent(self.sess, env,
                                  discrete=discrete,
                                  regularized=regularized,
                                  feature_extraction=feature_extraction)
            self.model = SACModel(self.sess, self.agent, target_entropy,
                                  ent_coef, gamma, tau)
            with self.sess.as_default():
                self.sess.run(tf.global_variables_initializer())
                self.sess.run(self.model.target_init_op)

        self.num_timesteps = 0

    def train(self, learning_rate):
        batch_obs, batch_actions, batch_rewards, batch_next_obs, batch_dones = \
            self.replay_buffer.sample(self.batch_size)

        feed_dict = {
            self.agent.obs_ph: batch_obs,
            self.agent.next_obs_ph: batch_next_obs,
            self.model.rewards_ph: batch_rewards.reshape(self.batch_size, -1),
            self.model.terminals_ph: batch_dones.reshape(self.batch_size, -1),
            self.model.learning_rate_ph: learning_rate
        }
        if not self.agent.discrete:
            feed_dict[self.agent.actions_ph] = batch_actions
        else:
            # Discrete actions are fed as a flat vector of indices.
            batch_actions = batch_actions.reshape(-1)
            feed_dict[self.agent.actions_ph] = batch_actions

        policy_loss, qf1_loss, qf2_loss, value_loss, *values = self.sess.run(
            self.model.step_ops, feed_dict)
        return policy_loss, qf1_loss, qf2_loss

    def learn(self, total_timesteps):
        learning_rate = get_schedule_fn(self.learning_rate)
        episode_rewards = [0]
        mb_losses = []
        obs = self.env.reset()

        for step in range(total_timesteps):
            # Sample randomly during warm-up / forced exploration, otherwise query the policy.
            if self.num_timesteps < self.learning_starts \
                    or np.random.rand() < self.random_exploration:
                unscaled_action = self.env.action_space.sample()
                action = scale_action(self.env.action_space, unscaled_action)
            else:
                action = self.agent.step(obs[None]).flatten()
                unscaled_action = unscale_action(self.env.action_space, action)

            new_obs, reward, done, _ = self.env.step(unscaled_action)
            self.num_timesteps += 1
            self.replay_buffer.add(obs, action, reward, new_obs, done)
            obs = new_obs

            if self.num_timesteps % self.train_freq == 0:
                for grad_step in range(self.gradient_steps):
                    if not self.replay_buffer.can_sample(self.batch_size) \
                            or self.num_timesteps < self.learning_starts:
                        break
                    frac = 1.0 - step / total_timesteps
                    current_lr = learning_rate(frac)
                    mb_losses.append(self.train(current_lr))
                    if (step + grad_step) % self.target_update_interval == 0:
                        self.sess.run(self.model.target_update_op)

            episode_rewards[-1] += reward
            if done:
                obs = self.env.reset()
                episode_rewards.append(0)

            mean_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)
            loss_str = "/".join([f"{x:.3f}" for x in np.mean(mb_losses, 0)]) \
                if len(mb_losses) > 0 else "NaN"
            print(f"Step {step} - reward: {mean_reward} - loss: {loss_str}",
                  end="\n" if step % 500 == 0 else "\r")
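# The learn() loop above relies on scale_action/unscale_action helpers that are
# not defined in this excerpt. The sketch below is an assumption about what they
# do for a continuous (Box) action space: map actions between the environment's
# [low, high] bounds and the policy's normalized [-1, 1] range. The project's
# actual helpers may differ.
import numpy as np


def scale_action(action_space, action):
    """Map an action from [low, high] to [-1, 1]."""
    low, high = action_space.low, action_space.high
    return 2.0 * (action - low) / (high - low) - 1.0


def unscale_action(action_space, scaled_action):
    """Map an action from [-1, 1] back to [low, high]."""
    low, high = action_space.low, action_space.high
    return low + 0.5 * (scaled_action + 1.0) * (high - low)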
def learn(self, timesteps=10000, verbose=0, seed=None):
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)

    self.eps_range = self._eps_range(timesteps)
    replay_buffer = ReplayBuffer(self.buffer_size)
    self._init_model()

    obs = self.env.reset()
    for step in range(timesteps):
        cur_eps = next(self.eps_range, None)
        if cur_eps is None:
            cur_eps = self.final_eps

        action = self._select_action(obs, cur_eps)
        new_obs, rewards, done, info = self.env.step(action)
        if done:
            # Hacky way to keep dimensions correct: mark terminal next states with NaNs.
            new_obs = [np.nan] * self.obs_shape[0]
        replay_buffer.add(obs, action, rewards, new_obs)
        obs = new_obs

        # Take a gradient step once the warm-up is over and the buffer holds
        # at least one full batch.
        if step > self.learning_starts and len(replay_buffer.buffer) >= self.batch_size:
            samples = replay_buffer.sample(self.batch_size, self.device)
            obs_batch, actions_batch, rewards_batch, new_obs_batch = samples
            predicted_q_values = self._predictQValue(self.step_model,
                                                     obs_batch, actions_batch)
            ys = self._expectedLabels(self.target_model, new_obs_batch,
                                      rewards_batch)

            loss = F.smooth_l1_loss(predicted_q_values, ys)
            self.optim.zero_grad()
            loss.backward()
            for i in self.step_model.parameters():
                i.grad.clamp_(min=-1, max=1)  # guard against exploding gradients
            self.optim.step()

            # Update the target network.
            if step % self.target_network_update_freq == 0:
                self.target_model.load_state_dict(self.step_model.state_dict())

        if done:
            obs = self.env.reset()

        if verbose == 1 and step % (timesteps * 0.1) == 0:
            perc = int(step / (timesteps * 0.1))
            print(f"At step {step}")
            print(f"{perc}% done")
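# _expectedLabels() is called above but not shown in this excerpt. Given that
# terminal transitions are stored with NaN next states, a plausible standalone
# sketch is given below: bootstrap r + gamma * max_a Q_target(s', a) only where
# s' is not NaN. The function name, the gamma parameter, and the assumption that
# rewards_batch is a 1-D tensor are illustrative, not the project's actual code.
import torch


def expected_labels(target_model, new_obs_batch, rewards_batch, gamma=0.99):
    """DQN targets with no bootstrap on terminal (NaN-marked) next states."""
    with torch.no_grad():
        terminal = torch.isnan(new_obs_batch).any(dim=1)
        safe_next = torch.nan_to_num(new_obs_batch)   # NaNs -> 0 so the forward pass is valid
        next_q = target_model(safe_next).max(dim=1)[0]
        next_q = torch.where(terminal, torch.zeros_like(next_q), next_q)
        return rewards_batch + gamma * next_q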
class MADDPG:
    def __init__(self, env, state_dim: int, action_dim: int, config: Dict,
                 device=None, writer=None):
        self.logger = logging.getLogger("MADDPG")
        self.device = device if device is not None else DEVICE
        self.writer = writer

        self.env = env
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.agents_number = config['agents_number']

        hidden_layers = config.get('hidden_layers', (400, 300))
        noise_scale = config.get('noise_scale', 0.2)
        noise_sigma = config.get('noise_sigma', 0.1)
        actor_lr = config.get('actor_lr', 1e-3)
        actor_lr_decay = config.get('actor_lr_decay', 0)
        critic_lr = config.get('critic_lr', 1e-3)
        critic_lr_decay = config.get('critic_lr_decay', 0)
        self.actor_tau = config.get('actor_tau', 0.002)
        self.critic_tau = config.get('critic_tau', 0.002)

        create_agent = lambda: DDPGAgent(state_dim, action_dim,
                                         agents=self.agents_number,
                                         hidden_layers=hidden_layers,
                                         actor_lr=actor_lr,
                                         actor_lr_decay=actor_lr_decay,
                                         critic_lr=critic_lr,
                                         critic_lr_decay=critic_lr_decay,
                                         noise_scale=noise_scale,
                                         noise_sigma=noise_sigma,
                                         device=self.device)
        self.agents = [create_agent() for _ in range(self.agents_number)]

        self.discount = config.get('discount', 0.99)
        self.gradient_clip = config.get('gradient_clip', 1.0)
        self.warm_up = config.get('warm_up', 1e3)
        self.buffer_size = config.get('buffer_size', int(1e6))
        self.batch_size = config.get('batch_size', 128)
        self.p_batch_size = config.get('p_batch_size', int(self.batch_size // 2))
        self.n_batch_size = config.get('n_batch_size', int(self.batch_size // 4))
        self.buffer = ReplayBuffer(self.batch_size, self.buffer_size)

        self.update_every_iterations = config.get('update_every_iterations', 2)
        self.number_updates = config.get('number_updates', 2)

        self.reset()

    def reset(self):
        self.iteration = 0
        self.reset_agents()

    def reset_agents(self):
        for agent in self.agents:
            agent.reset_agent()

    def step(self, state, action, reward, next_state, done) -> None:
        if np.isnan(state).any() or np.isnan(next_state).any():
            print("State contains NaN. Skipping.")
            return

        self.iteration += 1
        self.buffer.add(state, action, reward, next_state, done)

        if self.iteration < self.warm_up:
            return

        if len(self.buffer) > self.batch_size and \
                (self.iteration % self.update_every_iterations) == 0:
            self.evok_learning()

    def filter_batch(self, batch, agent_number):
        states, actions, rewards, next_states, dones = batch
        agent_states = states[:, agent_number * self.state_dim:
                              (agent_number + 1) * self.state_dim].clone()
        agent_next_states = next_states[:, agent_number * self.state_dim:
                                        (agent_number + 1) * self.state_dim].clone()
        agent_rewards = rewards.select(1, agent_number).view(-1, 1).clone()
        agent_dones = dones.select(1, agent_number).view(-1, 1).clone()
        return (agent_states, states, actions, agent_rewards,
                agent_next_states, next_states, agent_dones)

    def evok_learning(self):
        for _ in range(self.number_updates):
            for agent_number in range(self.agents_number):
                batch = self.filter_batch(self.buffer.sample(), agent_number)
                self.learn(batch, agent_number)

    def act(self, states, noise: Union[None, List] = None):
        """Get actions from all agents in the MADDPG object."""
        noise = [0] * self.agents_number if noise is None else noise
        tensor_states = torch.tensor(states).view(-1, self.agents_number,
                                                  self.state_dim)
        with torch.no_grad():
            actions = []
            for agent_number, agent in enumerate(self.agents):
                agent.actor.eval()
                actions += agent.act(tensor_states.select(1, agent_number),
                                     noise[agent_number])
                agent.actor.train()
        return torch.stack(actions)

    def learn(self, samples, agent_number: int) -> None:
        """Update the critic and actor of the agent indexed by agent_number."""
        action_offset = agent_number * self.action_dim
        flatten_actions = lambda a: a.view(-1, self.agents_number * self.action_dim)

        # No need to flip since there are no parallel agents.
        agent_states, states, actions, rewards, agent_next_states, \
            next_states, dones = samples
        agent = self.agents[agent_number]

        next_actions = actions.clone()
        next_actions[:, action_offset:action_offset + self.action_dim] = \
            agent.target_actor(agent_next_states)

        # Critic loss.
        Q_target_next = agent.target_critic(next_states,
                                            flatten_actions(next_actions))
        Q_target = rewards + (self.discount * Q_target_next * (1 - dones))
        Q_expected = agent.critic(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_target)

        # Minimize the critic loss.
        agent.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(agent.critic.parameters(),
                                       self.gradient_clip)
        agent.critic_optimizer.step()

        # Compute the actor loss.
        pred_actions = actions.clone()
        pred_actions[:, action_offset:action_offset + self.action_dim] = \
            agent.actor(agent_states)
        actor_loss = -agent.critic(states, flatten_actions(pred_actions)).mean()

        agent.actor_optimizer.zero_grad()
        actor_loss.backward()
        agent.actor_optimizer.step()

        if self.writer:
            self.writer.add_scalar(f'agent{agent_number}/critic_loss',
                                   critic_loss.item(), self.iteration)
            self.writer.add_scalar(f'agent{agent_number}/actor_loss',
                                   abs(actor_loss.item()), self.iteration)

        self._soft_update(agent.target_actor, agent.actor, self.actor_tau)
        self._soft_update(agent.target_critic, agent.critic, self.critic_tau)

    def _soft_update(self, target: nn.Module, source: nn.Module, tau) -> None:
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) +
                                    param.data * tau)
def base_train_loop(args: dict, policy, replay_buffer: ReplayBuffer, env):
    evaluations = [
        eval_policy(policy, args.domain_name, args.task_name, args.seed)
    ]

    timestep = env.reset()
    episode_reward = 0
    episode_timesteps = 0
    episode_num = 0

    for t in range(int(args.max_timesteps)):
        episode_timesteps += 1
        state = flat_obs(timestep.observation)

        # Select action randomly or according to policy
        if t < args.start_timesteps:
            action = np.random.uniform(
                env.action_spec().minimum,
                env.action_spec().maximum,
                size=env.action_spec().shape,
            )
        else:
            action = policy.select_action(state).clip(-args.max_action,
                                                      args.max_action)

        # Perform action
        timestep = env.step(action)
        done_bool = float(timestep.last())

        # Store data in replay buffer
        replay_buffer.add(state, action, flat_obs(timestep.observation),
                          timestep.reward, done_bool)

        episode_reward += timestep.reward

        # Train agent after collecting sufficient data
        if t >= args.start_timesteps:
            for _ in range(args.train_steps):
                if args.policy == "MPO":
                    policy.train(replay_buffer, args.batch_size,
                                 args.num_action_samples)
                else:
                    policy.train(replay_buffer, args.batch_size)

        if timestep.last():
            # +1 to account for 0 indexing. +0 on ep_timesteps since it will
            # increment +1 even if done=True
            print(f"Total T: {t+1} Episode Num: {episode_num+1} "
                  f"Episode T: {episode_timesteps} Reward: {episode_reward:.3f}")
            # Reset environment
            timestep = env.reset()
            episode_reward = 0
            episode_timesteps = 0
            episode_num += 1

        # Evaluate episode
        if (t + 1) % args.eval_freq == 0:
            evaluations.append(
                eval_policy(policy, args.domain_name, args.task_name, args.seed))
            np.save(f"./results/{args.file_name}_{t+1}", evaluations)

        if (t + 1) % args.save_freq == 0:
            if args.save_model:
                policy.save(f"./models/{args.file_name}_{t+1}")
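# base_train_loop() calls a flat_obs() helper that is not defined in this
# excerpt. A plausible minimal version for dm_control observations (an
# OrderedDict of arrays) is sketched below as an assumption; the project's real
# helper may differ, e.g. in ordering or dtype handling.
import numpy as np


def flat_obs(observation):
    """Concatenate a dm_control observation dict into a single flat vector."""
    return np.concatenate([np.asarray(v).ravel() for v in observation.values()])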
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Networks
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Q values predicted by the local network for the actions actually taken.
        local_output = self.qnetwork_local(states).gather(1, actions)
        # Max Q values for the next states from the target network.
        selected_target_actions = torch.max(
            self.qnetwork_target(next_states).detach(), 1)[0].unsqueeze(1)
        # Zero out the bootstrap term for terminal transitions.
        activated = torch.sub(torch.Tensor(np.ones(dones.shape)), dones)
        is_it_done = torch.mul(selected_target_actions, activated)
        target_output = torch.add(torch.mul(is_it_done, gamma), rewards)

        loss = F.mse_loss(local_output, target_output)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
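# A minimal sketch of how the Agent above is typically driven: a standard
# episodic epsilon-greedy training loop over a Gym-style environment. The
# function name, env, and the epsilon schedule values are illustrative
# assumptions, not part of the original code.
def train_agent(env, agent, n_episodes=1000, max_t=1000,
                eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    eps = eps_start
    scores = []
    for i_episode in range(1, n_episodes + 1):
        state = env.reset()
        score = 0
        for t in range(max_t):
            action = agent.act(state, eps)                        # epsilon-greedy action
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)   # store transition and learn
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)                       # decay exploration
    return scores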
# Main loop.
for ep in range(MAX_EPISODES):
    episode_reward = 0
    s = ENV.reset()

    for step in range(MAX_EP_STEPS):
        a = actor.predict(np.reshape(s, (1, STATE_DIM))) + actor_noise()
        s2, r, terminal, info = ENV.step(a[0])

        replay_buffer.add(np.reshape(s, (STATE_DIM,)),
                          np.reshape(a, (ACTION_DIM,)),
                          r,
                          terminal,
                          np.reshape(s2, (STATE_DIM,)))

        # Batch sampling.
        if replay_buffer.size() > MINIBATCH_SIZE:
            s_batch, a_batch, r_batch, t_batch, s2_batch = \
                replay_buffer.sample_batch(MINIBATCH_SIZE)

            # Compute the target Q values.
            target_action = actor.predict_target(s2_batch)
            target_q = critic.predict_target(s2_batch, target_action)

            # Compute the critic's target values.
            targets = []
            for i in range(MINIBATCH_SIZE):
                if t_batch[i]:  # terminal
                    targets.append(r_batch[i])
                else:
                    targets.append(r_batch[i] + GAMMA * target_q[i])
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, framework, buffer_type):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            framework (str): 'DQN' or 'DDQN'
            buffer_type (str): 'ReplayBuffer' or 'PER_ReplayBuffer'
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.framework = framework
        self.buffer_type = buffer_type

        # Q-Networks
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        if self.buffer_type == 'PER_ReplayBuffer':
            self.memory = PER_ReplayBuffer(device, BUFFER_SIZE, BATCH_SIZE,
                                           ALPHA, BETA)
        if self.buffer_type == 'ReplayBuffer':
            self.memory = ReplayBuffer(device, action_size, BUFFER_SIZE,
                                       BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get random subset and learn
            if len(self.memory) > BATCH_SIZE:
                if self.buffer_type == 'ReplayBuffer':
                    experiences = self.memory.sample()
                    is_weights = None
                    idxs = None
                if self.buffer_type == 'PER_ReplayBuffer':
                    experiences, is_weights, idxs = self.memory.sample()
                    self.criterion = WeightedLoss()
                self.learn(experiences, is_weights, idxs, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection (uses the local network).
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, is_weights, idxs, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            is_weights (torch.Tensor): importance-sampling weights (PER only)
            idxs (array_like): indices of the sampled transitions (PER only)
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        if self.framework == 'DQN':
            # Max predicted Q values for the next states, taken from the target
            # network, which is updated more slowly than the local network.
            Q_targets_next = self.qnetwork_target(next_states).detach(
            ).max(1)[0].unsqueeze(1)
        if self.framework == 'DDQN':
            # Double DQN: the local network selects the action, the target
            # network evaluates it.
            max_actions = self.qnetwork_local(next_states).detach().argmax(
                1).unsqueeze(1)
            Q_targets_next = self.qnetwork_target(next_states).detach().gather(
                1, max_actions)

        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).gather(1, actions)

        # Huber loss provides better results than MSE.
        if is_weights is None:
            loss = F.smooth_l1_loss(Q_expected, Q_targets)
        else:
            # Compute the Huber loss manually to apply the importance-sampling
            # weights used by prioritized replay, then refresh priorities.
            loss, td_errors = self.criterion.huber(Q_expected, Q_targets, is_weights)
            self.memory.batch_update(idxs, td_errors)

        # Perform gradient descent
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
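# The uniform ReplayBuffer used by the Agent above is not shown in this
# excerpt. Below is a minimal sketch matching the constructor, add(), sample()
# and len() interface the Agent relies on; the original implementation may
# differ in details such as dtype handling.
import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBuffer:
    """Fixed-size buffer storing experience tuples, sampled uniformly."""

    def __init__(self, device, action_size, buffer_size, batch_size, seed):
        self.device = device
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(
            np.vstack([e.state for e in experiences])).float().to(self.device)
        actions = torch.from_numpy(
            np.vstack([e.action for e in experiences])).long().to(self.device)
        rewards = torch.from_numpy(
            np.vstack([e.reward for e in experiences])).float().to(self.device)
        next_states = torch.from_numpy(
            np.vstack([e.next_state for e in experiences])).float().to(self.device)
        dones = torch.from_numpy(
            np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(self.device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)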