def __init__(self, config: Config):
    self.config = config
    self.is_training = True

    if self.config.prioritized_replay:
        self.buffer = PrioritizedReplayBuffer(
            self.config.max_buff,
            alpha=self.config.prioritized_replay_alpha)
        # Anneal beta towards 1.0; default to the full training horizon
        # when no explicit iteration count is configured.
        prioritized_replay_beta_iters = self.config.prioritized_replay_beta_iters
        if prioritized_replay_beta_iters is None:
            prioritized_replay_beta_iters = self.config.frames
        self.beta_schedule = LinearSchedule(
            prioritized_replay_beta_iters,
            initial_p=self.config.prioritized_replay_beta0,
            final_p=1.0)
    else:
        self.buffer = ReplayBuffer(self.config.max_buff)
        self.beta_schedule = None

    self.model = CnnDQN(self.config.state_shape, self.config.action_dim)
    self.target_model = CnnDQN(self.config.state_shape, self.config.action_dim)
    self.target_model.load_state_dict(self.model.state_dict())
    self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

    if self.config.use_cuda:
        self.cuda()
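# For reference: LinearSchedule as used above mirrors the OpenAI Baselines
# helper of the same name. This is a minimal sketch assuming a .value(t)
# accessor; the class this repo actually imports may differ.
class LinearSchedule:
    def __init__(self, schedule_timesteps, final_p, initial_p=1.0):
        self.schedule_timesteps = schedule_timesteps
        self.final_p = final_p
        self.initial_p = initial_p

    def value(self, t):
        """Linear interpolation from initial_p to final_p, clamped at final_p."""
        fraction = min(float(t) / self.schedule_timesteps, 1.0)
        return self.initial_p + fraction * (self.final_p - self.initial_p)

# Usage: anneal the importance-sampling exponent towards full correction.
# beta = self.beta_schedule.value(frame_idx)   # -> 1.0 as training progresses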
def _create_buffer(self, buffer_type, action_size, buffer_size, batch_size,
                   alpha, beta, seed, device):
    if buffer_type == 'prioritized':
        self._update_buffer_priorities = True
        return PrioritizedReplayBuffer(action_size, buffer_size, batch_size,
                                       seed, alpha=alpha, beta=beta,
                                       device=device)
    elif buffer_type == 'sample':
        self._update_buffer_priorities = False
        return ReplayBuffer(action_size, buffer_size, batch_size, seed,
                            device=device)
    else:
        raise ValueError(
            "Unknown buffer type - must be one of 'prioritized' or 'sample'")
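# A hypothetical call site for _create_buffer; the argument values below are
# illustrative assumptions, not defaults from this repo.
memory = agent._create_buffer(
    buffer_type='prioritized',  # or 'sample' for uniform replay
    action_size=4,
    buffer_size=100_000,
    batch_size=64,
    alpha=0.6,   # how strongly TD error shapes the sampling distribution
    beta=0.4,    # initial importance-sampling correction
    seed=0,
    device='cpu')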
class DQNAgent():
    def __init__(self, state_size, action_size):
        # if you want to see Cartpole learning, then change to True
        self.render = False
        self.load_model = False

        # get size of state and action
        self.state_size = state_size
        self.action_size = action_size

        # These are hyperparameters for the DQN
        self.discount_factor = 0.99
        self.learning_rate = 0.001
        self.memory_size = 20000
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.explore_step = 5000
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / self.explore_step
        self.batch_size = 64
        self.train_start = 1000

        # create prioritized replay memory using a SumTree
        self.memory = PrioritizedReplayBuffer(self.memory_size)

        # create main model and target model
        self.model = DQN(state_size, action_size)
        self.model.apply(self.weights_init)
        self.target_model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        # initialize target model
        self.update_target_model()

        if self.load_model:
            self.model = torch.load('save_model/cartpole_dqn')

    # Xavier weight initialization
    def weights_init(self, m):
        classname = m.__class__.__name__
        if classname.find('Linear') != -1:
            torch.nn.init.xavier_uniform_(m.weight)

    # after some time interval, update the target model to match the model
    def update_target_model(self):
        self.target_model.load_state_dict(self.model.state_dict())

    # get action from model using epsilon-greedy policy
    def get_action(self, state):
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        else:
            state = torch.from_numpy(state).float()
            with torch.no_grad():
                q_value = self.model(state)
            _, action = torch.max(q_value, 1)
            return int(action)

    # save sample (error, <s, a, r, s'>) to the replay memory
    def append_sample(self, state, action, reward, next_state, done):
        with torch.no_grad():
            target = self.model(torch.FloatTensor(state))
            old_val = target[0][action]
            target_val = self.target_model(torch.FloatTensor(next_state))

        if done:
            target[0][action] = reward
        else:
            target[0][action] = reward + self.discount_factor * torch.max(target_val)

        error = abs(old_val - target[0][action])
        self.memory.add(error, (state, action, reward, next_state, done))

    # pick samples from prioritized replay memory (with batch_size)
    def train_model(self):
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

        mini_batch, idxs, is_weights = self.memory.sample(self.batch_size)
        mini_batch = np.array(mini_batch).transpose()

        states = np.vstack(mini_batch[0])
        actions = list(mini_batch[1])
        rewards = list(mini_batch[2])
        next_states = np.vstack(mini_batch[3])
        dones = mini_batch[4].astype(int)  # bool to binary

        # Q function of current state
        states = torch.Tensor(states).float()
        pred = self.model(states)

        # one-hot encoding of the taken actions
        a = torch.LongTensor(actions).view(-1, 1)
        one_hot_action = torch.FloatTensor(self.batch_size, self.action_size).zero_()
        one_hot_action.scatter_(1, a, 1)
        pred = torch.sum(pred.mul(one_hot_action), dim=1)

        # Q function of next state
        next_states = torch.Tensor(next_states).float()
        with torch.no_grad():
            next_pred = self.target_model(next_states)

        rewards = torch.FloatTensor(rewards)
        dones = torch.FloatTensor(dones)

        # Q-learning: get maximum Q value at s' from the target model
        target = rewards + (1 - dones) * self.discount_factor * next_pred.max(1)[0]

        errors = torch.abs(pred - target).detach().numpy()

        # update priorities
        for i in range(self.batch_size):
            idx = idxs[i]
            self.memory.update(idx, errors[i])
        self.optimizer.zero_grad()

        # weighted MSE loss: apply the per-sample importance-sampling weights
        # before reducing, so reduction='none' is required here
        loss = (torch.FloatTensor(is_weights) *
                F.mse_loss(pred, target, reduction='none')).mean()
        loss.backward()

        # and train
        self.optimizer.step()
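# For context: the is_weights used above are the standard PER importance-
# sampling corrections. A sketch of how a buffer typically computes them
# (this repo's buffer internals are not shown here, so names are assumptions):
import numpy as np

def importance_sampling_weights(priorities, total_priority, n, beta):
    """w_i = (N * P(i))^(-beta), normalized by max(w) so updates only shrink."""
    probs = np.asarray(priorities, dtype=np.float64) / total_priority  # P(i)
    weights = (n * probs) ** (-beta)
    return weights / weights.max()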
class DDPG_Agent:
    def __init__(self, state_size, action_size, seed, index=0, num_agents=2):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            seed (int): Random seed
            index (int): Index assigned to the agent
            num_agents (int): Number of agents in the environment
        """
        self.state_size = state_size         # State size
        self.action_size = action_size       # Action size
        self.seed = torch.manual_seed(seed)  # Random seed
        self.index = index                   # Index of this agent, not used at the moment
        self.tau = TAU                       # Parameter for soft weight update
        self.num_updates = N_UPDATES         # Number of updates to perform when updating
        self.num_agents = num_agents         # Number of agents in the environment
        self.tstep = 0                       # Simulation step (modulo (%) UPDATE_EVERY)
        self.gamma = GAMMA                   # Gamma for the reward discount
        self.alpha = ALPHA                   # PER: toggle prioritization (0..1)

        # Set up actor and critic networks
        self.actor_local = Actor(state_size, action_size, seed).to(device)
        self.critic_local = Critic(state_size, action_size, seed).to(device)
        self.actor_target = Actor(state_size, action_size, seed).to(device)
        self.critic_target = Critic(state_size, action_size, seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Ornstein-Uhlenbeck noise
        self.noise = OUNoise((1, action_size), seed)

        # Replay buffer
        self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                              BATCH_SIZE, seed, self.alpha)

    # act and act_target similar to exercises and MADDPG Lab
    def act(self, states, noise=1.0):
        """Returns actions for given state as per current policy.

        Params
        ======
            states [n_agents, state_size]: current state
            noise (float): control whether or not noise is added
        """
        # Convert state to a tensor (expects a numpy array)
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((1, self.action_size))

        # Put model into evaluation mode
        self.actor_local.eval()

        # Get actions for current state
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()

        # Put actor back into training mode
        self.actor_local.train()

        # Ornstein-Uhlenbeck noise addition
        actions += noise * self.noise.sample()

        # Clip to the valid action range
        return np.clip(actions, -1, 1)

    def step(self, states, actions, rewards, next_states, dones, beta):
        """Save experience in replay memory, use random samples from buffer to learn.

        Params
        ======
            states: [n_agents, state_size] current state
            actions: [n_agents, action_size] taken action
            rewards: [n_agents] earned reward
            next_states: [n_agents, state_size] next state
            dones: [n_agents] whether episode has finished
            beta: [0..1] PER: toggles correction for importance weights
                  (0 - no correction, 1 - full correction)
        """
        # ------------------------------------------------------------------
        # Save experience in replay memory - slightly more effort due to
        # prioritization: we need to calculate a priority for the experience
        # tuple, which is in our case (Q_expected - Q_targets)**2.
        # ------------------------------------------------------------------
        # Set all networks to evaluation mode
        self.actor_target.eval()
        self.critic_target.eval()
        self.critic_local.eval()

        state = torch.from_numpy(states).float().to(device)
        next_state = torch.from_numpy(next_states).float().to(device)
        action = torch.from_numpy(actions).float().to(device)
        reward = torch.from_numpy(np.asarray(rewards)).float().to(device)
        done = torch.from_numpy(np.asarray(dones).astype(np.uint8)).float().to(device)

        with torch.no_grad():
            # Target action for the *next* state
            next_actions = self.actor_target(next_state)
            own_action = action[:, self.index * self.action_size:
                                (self.index + 1) * self.action_size]
            if self.index:
                # Agent 1
                next_actions_agent = torch.cat((own_action, next_actions), dim=1)
            else:
                # Agent 0: flipped order
                next_actions_agent = torch.cat((next_actions, own_action), dim=1)

            # Predicted Q value from the critic target network
            Q_targets_next = self.critic_target(next_state, next_actions_agent).float()
            Q_targets = reward + self.gamma * Q_targets_next * (1 - done)
            Q_expected = self.critic_local(state, action)

        # Use error between Q_expected and Q_targets as priority in the buffer
        error = (Q_expected - Q_targets) ** 2
        self.memory.add(state, action, rewards, next_state, dones, error)

        # Set all networks back to training mode
        self.actor_target.train()
        self.critic_target.train()
        self.critic_local.train()

        # ------------------------------------------------------------------
        # Usual learning procedure
        # ------------------------------------------------------------------
        # Learn every UPDATE_EVERY time steps
        self.tstep = (self.tstep + 1) % UPDATE_EVERY

        # If it is time to update and enough samples are available in memory,
        # get a prioritized sample and learn
        if self.tstep == 0 and len(self.memory) > BATCH_SIZE:
            for _ in range(self.num_updates):
                experiences = self.memory.sample(beta)
                self.learn(experiences)

    def reset(self):
        """Reset the noise parameter of the agent."""
        self.noise.reset()

    def learn(self, experiences):
        """Update value parameters using given batch of experience tuples.
        Update according to
            Q_targets = r + gamma * critic_target(next_state, actor_target(next_state))
        where, as in the lessons:
            actor_target(state) gives the action
            critic_target(state, action) gives the Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of
                states       states visited
                actions      actions taken by all agents
                rewards      rewards received
                next_states  all next states
                dones        whether or not a final state is reached
                weights      weights of the experiences
                indices      indices of the experiences
        """
        # Load experiences from sample
        states, actions, rewards, next_states, dones, weights_cur, indices = experiences

        # ------------------- update critic ------------------- #
        with torch.no_grad():
            # Get next actions via the target actor network
            next_actions = self.actor_target(next_states)

            # Stack actions together with the action of this agent
            own_actions = actions[:, self.index * self.action_size:
                                  (self.index + 1) * self.action_size]
            if self.index:
                # Agent 1
                next_actions_agent = torch.cat((own_actions, next_actions), dim=1)
            else:
                # Agent 0: flipped order
                next_actions_agent = torch.cat((next_actions, own_actions), dim=1)

            # Predicted Q value from the critic target network
            Q_targets_next = self.critic_target(next_states, next_actions_agent)
            Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)

        Q_expected = self.critic_local(states, actions)

        # Update priorities in the replay buffer
        loss = (Q_expected - Q_targets).pow(2).reshape(weights_cur.shape) * weights_cur
        self.memory.update(indices, loss.detach().cpu().numpy())

        # Compute critic loss
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # Clip gradients
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), GRAD_CLIPPING)
        self.critic_optimizer.step()

        # ------------------- update actor ------------------- #
        actions_expected = self.actor_local(states)

        # Stack actions together with the action of this agent
        own_actions = actions[:, self.index * self.action_size:
                              (self.index + 1) * self.action_size]
        if self.index:
            # Agent 1
            actions_expected_agent = torch.cat((own_actions, actions_expected), dim=1)
        else:
            # Agent 0: flipped order
            actions_expected_agent = torch.cat((actions_expected, own_actions), dim=1)

        # Compute actor loss based on expectation from actions_expected
        actor_loss = -self.critic_local(states, actions_expected_agent).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.target_soft_update(self.critic_local, self.critic_target)
        self.target_soft_update(self.actor_local, self.actor_target)

    def target_soft_update(self, local_model, target_model):
        """Soft update model parameters for actor and critic of all MADDPG agents.
        θ_target = τ*θ_local + (1 - τ)*θ_target
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)

    def save(self, filename):
        """Saves the agent to the local workplace.

        Params
        ======
            filename (string): where to save the weights
        """
        checkpoint = {
            'input_size': self.state_size,
            'output_size': self.action_size,
            'actor_hidden_layers': [
                each.out_features for each in self.actor_local.hidden_layers
                if each._get_name() != 'BatchNorm1d'
            ],
            'actor_state_dict': self.actor_local.state_dict(),
            'critic_hidden_layers': [
                each.out_features for each in self.critic_local.hidden_layers
                if each._get_name() != 'BatchNorm1d'
            ],
            'critic_state_dict': self.critic_local.state_dict()
        }
        torch.save(checkpoint, filename)

    def load_weights(self, filename):
        """Load weights to update the agent's actor and critic networks.
        Expects the format produced by self.save().

        Params
        ======
            filename (string): where to load data from.
        """
        checkpoint = torch.load(filename)
        if checkpoint['input_size'] != self.state_size:
            print(f"Error when loading weights from checkpoint {filename}: "
                  f"input size {checkpoint['input_size']} doesn't match "
                  f"state size of agent {self.state_size}")
            return None
        if checkpoint['output_size'] != self.action_size:
            print(f"Error when loading weights from checkpoint {filename}: "
                  f"output size {checkpoint['output_size']} doesn't match "
                  f"action space size of agent {self.action_size}")
            return None
        my_actor_hidden_layers = [
            each.out_features for each in self.actor_local.hidden_layers
            if each._get_name() != 'BatchNorm1d'
        ]
        if checkpoint['actor_hidden_layers'] != my_actor_hidden_layers:
            print(f"Error when loading weights from checkpoint {filename}: "
                  f"actor hidden layers {checkpoint['actor_hidden_layers']} "
                  f"don't match agent's actor hidden layers {my_actor_hidden_layers}")
            return None
        my_critic_hidden_layers = [
            each.out_features for each in self.critic_local.hidden_layers
            if each._get_name() != 'BatchNorm1d'
        ]
        if checkpoint['critic_hidden_layers'] != my_critic_hidden_layers:
            print(f"Error when loading weights from checkpoint {filename}: "
                  f"critic hidden layers {checkpoint['critic_hidden_layers']} "
                  f"don't match agent's critic hidden layers {my_critic_hidden_layers}")
            return None
        self.actor_local.load_state_dict(checkpoint['actor_state_dict'])
        self.critic_local.load_state_dict(checkpoint['critic_state_dict'])
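# A brief usage sketch for the checkpoint round-trip above. Sizes and the
# file name are illustrative assumptions, not values from this repo.
agent = DDPG_Agent(state_size=24, action_size=2, seed=0)
agent.save('checkpoint_agent0.pth')

restored = DDPG_Agent(state_size=24, action_size=2, seed=0)
restored.load_weights('checkpoint_agent0.pth')  # validates sizes before loading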
class Prioritized(DQN):
    def __init__(self, env, model, target_model, config, name_agent="prioritized-dqn"):
        self.name_agent = name_agent
        self.dim_space = env.observation_space.shape[0]
        self.nb_actions = env.action_space.n
        self.epsilon = config.epsilon_start
        self.epsilon_final = config.epsilon_final
        self.epsilon_start = config.epsilon_start
        self.epsilon_decay = config.epsilon_decay
        self.gamma = config.gamma
        self.update_nb_iter = config.update_nb_iter
        # changing the buffer (taking a prioritized buffer
        # instead of a uniform-probability buffer)
        self.replay_buffer = PrioritizedReplayBuffer(10000, config.batch_size,
                                                     config.w,
                                                     config.beta_final,
                                                     config.beta_start,
                                                     config.beta_decay)
        self.environment = env
        self.batch_size = config.batch_size
        self.model = model
        self.target_model = target_model
        self.optimizer = optim.Adam(self.model.parameters(), lr=config.learning_rate)
        self.loss_data = []
        self.rewards = []

    def loss(self):
        """Weighted double-DQN loss:

            L = w * (R_{t+1}
                     + gamma * q_theta_bar(S_{t+1}, argmax_{a'} q_theta(S_{t+1}, a'))
                     - q_theta(S_t, A_t))^2

        with online weights theta and target weights theta_bar.
        """
        states, actions, rewards, next_states, finish, indices, weight = \
            self.replay_buffer.sample()
        actions = actions.long()

        # q_theta(S_t, A_t)
        q0 = self.model(states).gather(1, actions.unsqueeze(1)).squeeze(1)

        # Double-DQN target: select a' with the online network,
        # evaluate it with the target network q_theta_bar
        with torch.no_grad():
            next_actions = self.model(next_states).argmax(1, keepdim=True)
            max_next_q0 = self.target_model(next_states).gather(
                1, next_actions).squeeze(1) * (1 - finish)
        Rt_gamma_max = rewards + self.gamma * max_next_q0

        loss = (q0 - Rt_gamma_max).pow(2) * weight

        # update the priorities in the buffer with the new TD errors
        self.replay_buffer.add_p(indices, loss.detach().numpy())

        loss = loss.sum()
        return loss
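# For reference: the probability/priority math behind prioritized replay.
# This is an illustrative sketch; the actual PrioritizedReplayBuffer internals
# (add_p, sample) are defined elsewhere and may differ.
import numpy as np

def priorities_from_td_errors(td_errors, alpha=0.6, eps=1e-6):
    """p_i = (|delta_i| + eps)^alpha; eps keeps every transition sampleable."""
    return (np.abs(td_errors) + eps) ** alpha

def sampling_probs(priorities):
    """P(i) = p_i / sum_k p_k, with p_i already raised to alpha."""
    p = np.asarray(priorities, dtype=np.float64)
    return p / p.sum()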
import numpy as np
import tensorflow as tf
from copy import deepcopy

import MahjongPy
from naiveAI import AgentNaive, NMnaive
from buffer import PrioritizedReplayBuffer

sess = tf.InteractiveSession()

if __name__ == '__main__':
    nn = NMnaive(sess)
    env = EnvMahjong()  # EnvMahjong is assumed to come from the MahjongPy bindings
    # before the training starts, create the agent and its prioritized memory
    memory = PrioritizedReplayBuffer(state_dim=34 * 4, action_dim=34)
    agent = AgentNaive(nn, memory)

    n_games = 2
    for n in range(n_games):
        done = 0
        this_state = env.reset()
        step = 0
        while not done and step < 10000:
            next_aval_states = env.get_aval_actions()
            action, policy = agent.select(next_aval_states)
            next_state, score, done, info = env.step(action)
            agent.remember(this_state, action, next_state, score, done,
                           next_aval_states, policy)
            agent.learn()
            this_state = deepcopy(next_state)
            step += 1  # advance the step counter so the loop can terminate
def __init__(self, state_size, action_size, seed, lr_decay=0.9999,
             double_dqn=False, dueling_network=False, prioritized_replay=False):
    """Initialize an Agent instance.

    Params
    ======
        state_size (int): Dimension of each state
        action_size (int): Dimension of each action
        seed (int): Random seed
        lr_decay (float): Multiplicative factor of learning rate decay
        double_dqn (bool): Toggle for using the Double-DQN method
        dueling_network (bool): Toggle for using the Dueling Network (DN) method
        prioritized_replay (bool): Toggle for using the Prioritized Replay method
    """
    # Set the parameters.
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(seed)
    self.double_dqn = double_dqn
    self.dueling_network = dueling_network
    self.prioritized_replay = prioritized_replay

    # Q-Network hidden layers.
    hidden_layers = [128, 32]

    if self.dueling_network:
        # The Dueling Network (DN) method requires a hidden state-value stream.
        hidden_state_value = [64, 32]
        self.qnetwork_local = DuelingQNetwork(
            state_size, action_size, seed, hidden_layers,
            hidden_state_value).to(device)
        self.qnetwork_target = DuelingQNetwork(
            state_size, action_size, seed, hidden_layers,
            hidden_state_value).to(device)
        self.qnetwork_target.eval()
    else:
        # Use the plain Deep Q-Network (DQN) method.
        self.qnetwork_local = QNetwork(state_size, action_size, seed,
                                       hidden_layers).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed,
                                        hidden_layers).to(device)
        self.qnetwork_target.eval()

    # Optimize using Adam.
    self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LEARNING_RATE)
    self.lr_scheduler = optim.lr_scheduler.ExponentialLR(self.optimizer, lr_decay)

    # Use the Prioritized Replay memory buffer if enabled.
    if self.prioritized_replay:
        self.memory = PrioritizedReplayBuffer(action_size, BUFFER_SIZE,
                                              BATCH_SIZE, seed, device,
                                              alpha=0.6, beta=0.4,
                                              beta_scheduler=1.0)
    else:
        # Use the uniform Replay memory buffer instead.
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, device)

    # Initialize the time step (until the THRESHOLD is reached).
    self.t_step = 0
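# For context: a dueling architecture splits the head into state-value and
# advantage streams. This is an illustrative sketch, not the DuelingQNetwork
# used above (whose definition lives elsewhere in the repo).
import torch
import torch.nn as nn

class DuelingHead(nn.Module):
    """Q(s, a) = V(s) + A(s, a) - mean_a A(s, a)."""

    def __init__(self, feature_dim, action_size):
        super().__init__()
        self.value = nn.Linear(feature_dim, 1)
        self.advantage = nn.Linear(feature_dim, action_size)

    def forward(self, features):
        v = self.value(features)                    # [batch, 1]
        a = self.advantage(features)                # [batch, action_size]
        # Subtracting the mean advantage keeps V and A identifiable.
        return v + a - a.mean(dim=1, keepdim=True)  # [batch, action_size]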
class RDPG:
    def __init__(self, env, initial_act=30, gamma=0.98, tau=0.01, actor_lr=1e-4,
                 critic_lr=1e-3, reward_scale=1., buffer_size=100, writer=None):
        self.env = env
        self.gamma = gamma
        self.tau = tau
        self.initial_act = initial_act
        self.reward_scale = reward_scale
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.obs_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.buffer = PrioritizedReplayBuffer(buffer_size)

        self.actor = Actor(self.obs_dim, self.action_dim)
        self.target_actor = Actor(self.obs_dim, self.action_dim)
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.critic = Crtic(self.obs_dim, self.action_dim)
        self.target_critic = Crtic(self.obs_dim, self.action_dim)
        self.target_critic.load_state_dict(self.critic.state_dict())

        self.actor = self.actor.to(self.device)
        self.critic = self.critic.to(self.device)
        self.target_actor = self.target_actor.to(self.device)
        self.target_critic = self.target_critic.to(self.device)

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)
        self.criterion = nn.MSELoss(reduction='none')
        self.writer = writer

    def store_episode(self, episode):
        self.buffer.add(episode)

    def get_action(self, obs, action, hidden_in, epoch, train=False):
        # The actor consumes one (observation, previous action) history step
        history = torch.cat([torch.FloatTensor(obs),
                             torch.FloatTensor(action)]).to(torch.float).reshape(
                                 1, 1, self.obs_dim + self.action_dim).to(self.device)
        action, hidden_out = self.actor(history, hidden_in)
        if not train:
            return action[0, 0].detach().cpu().numpy(), hidden_out
        # Gaussian exploration noise during training
        action = action[0, 0].detach().cpu().numpy() + np.random.normal(0, 0.1)
        return np.clip(action, -1, 1), hidden_out

    def soft_update(self, target_net, net):
        for target_param, param in zip(target_net.parameters(), net.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)

    def update(self, epoch, batch_size=10, beta=0.4):
        if len(self.buffer) < batch_size:
            return
        batch, indices, weights = self.buffer.replay(batch_size=batch_size, beta=beta)
        indices = indices.to(self.device)
        weights = weights.to(self.device)

        obs_batch, action_batch, reward_batch, done_batch = [], [], [], []
        for episode in batch:
            obs_batch.append(episode[0])
            action_batch.append(episode[1])
            reward_batch.append(episode[2])
            done_batch.append(episode[3])

        obs_tensor = torch.cat(obs_batch).reshape(
            batch_size, *obs_batch[0].shape[1:]).to(self.device)  # (batch_size, episode_length+1, 3)
        next_obs_tensor = obs_tensor[:, 1:, :]                    # (batch_size, episode_length, 3)
        obs_tensor = obs_tensor[:, :-1, :]                        # (batch_size, episode_length, 3)
        action_tensor = torch.FloatTensor(action_batch).to(self.device)  # (batch_size, episode_length, 1)
        next_action_tensor = action_tensor[:, 1:, :]
        action_tensor = action_tensor[:, :-1, :]
        reward_tensor = torch.FloatTensor(reward_batch).unsqueeze(dim=-1).to(self.device)  # (batch_size, episode_length, 1)
        done_tensor = torch.FloatTensor(done_batch).unsqueeze(dim=-1).to(self.device)      # (batch_size, episode_length, 1)

        hidden = (torch.randn(1, batch_size, 64).to(self.device),
                  torch.randn(1, batch_size, 64).to(self.device))  # (1, batch_size, hidden_size)

        with torch.no_grad():
            target_action, _ = self.target_actor(
                torch.cat([next_obs_tensor, next_action_tensor], dim=2), hidden)  # (batch_size, episode_length, 1)
            target_q, _ = self.target_critic(
                torch.cat([next_obs_tensor, target_action], dim=2), hidden)       # (batch_size, episode_length, 1)
            # TD target; (1 - done) masks out bootstrapping on terminal steps
            y = reward_tensor * self.reward_scale + (1 - done_tensor) * self.gamma * target_q

        q_values, _ = self.critic(torch.cat([obs_tensor, action_tensor], dim=2), hidden)
        critic_loss = (weights * self.criterion(q_values, y).mean(1).squeeze()).mean()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        action, _ = self.actor(torch.cat([obs_tensor, action_tensor], dim=2), hidden)
        actor_loss = -(weights * self.critic(
            torch.cat([obs_tensor, action], dim=2), hidden)[0].mean(1).squeeze()).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        self.soft_update(self.target_critic, self.critic)
        self.soft_update(self.target_actor, self.actor)

        # Update the priorities of the episodes that were replayed
        self.buffer.update_priority(
            indices.cpu(),
            (y.mean(1).squeeze() - q_values.mean(1).squeeze()).abs().detach().cpu().numpy())

        if self.writer:
            self.writer.add_scalar("Train/ActorLoss", actor_loss.item(), epoch)
            self.writer.add_scalar("Train/CriticLoss", critic_loss.item(), epoch)
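# A minimal driver sketch for the RDPG class above. collect_episode is a
# hypothetical helper returning (observations, actions, rewards, dones) in the
# layout that update() slices; env is assumed to be a Gym-style continuous-
# control task.
agent = RDPG(env)
for epoch in range(1000):
    episode = collect_episode(env, agent)  # hypothetical rollout helper
    agent.store_episode(episode)
    agent.update(epoch, batch_size=10, beta=0.4)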