def __init__(self, state_size, action_size, random_seed):
    """Initialize an Agent object.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.seed = random.seed(random_seed)
    self.epsilon = EPSILON_MAX

    # Actor Network (w/ Target Network)
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network (w/ Target Network)
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise process
    self.noise = OUNoise(action_size, random_seed, mu=0, theta=0.15, sigma=0.2)

    # Replay memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

    # Make sure the target networks start with the same weights as the source (local) networks
    self.hard_update(self.actor_target, self.actor_local)
    self.hard_update(self.critic_target, self.critic_local)

    self.t_step = 0
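Every snippet in this collection constructs an OUNoise helper, but none of them defines it. The sketch below is a minimal, illustrative Ornstein-Uhlenbeck process that matches the (size, seed, mu, theta, sigma) constructor and the sample()/reset() calls used in the example above; other examples use slightly different signatures (for instance OUNoise(size) with a noise() method), so treat this as an assumed interface rather than the original implementation.

# Illustrative sketch only: an OUNoise variant consistent with the calls in the
# example above; not taken from the original repositories.
import copy
import random

import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process (assumed interface)."""

    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        """Reset the internal state (= noise) to the mean (mu)."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state with OU dynamics and return it as the noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.array([random.random() for _ in range(len(x))])
        self.state = x + dx
        return self.state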
def __init__(self, env, batchSize = 10, bufferSize = 100,
             gamma = 0.98, actorLR = 1e-4, criticLR = 1e-3,
             maxSteps = 200, targetUpdate = 1e-3, epsilon = 1,
             decay = 0.99, rewardScale = 1e-3, logFile = 'run.log'):
    self.env = env
    self.gamma = gamma
    self.batchSize = batchSize
    self.bufferSize = bufferSize
    self.maxSteps = maxSteps + 1
    self.rewardScale = rewardScale
    self.epsilon = epsilon
    self.decay = decay

    # Useful helpers.
    self.actionDim = self.env.action_space.shape[0]
    self.stateDim = self.env.observation_space.shape[0]
    self.featureDim = self.actionDim + self.stateDim
    self.minAction = self.env.action_space.low
    self.maxAction = self.env.action_space.high

    # For scaling output action values.
    self.actionBiasZeroOne = self.minAction
    self.actionScaleZeroOne = self.maxAction - self.minAction
    self.actionBiasTanH = (self.maxAction + self.minAction) / 2.0
    self.actionScaleTanH = self.maxAction - self.actionBiasTanH

    # Initialize noise process.
    self.noise = OUNoise(self.actionDim)

    # Initialize replay buffer.
    self.buffer = ReplayBuffer(self.bufferSize)

    # Initialize logging.
    logging.basicConfig(filename = logFile, level = logging.INFO,
                        format = '[%(asctime)s] %(message)s',
                        datefmt = '%m/%d/%Y %I:%M:%S %p')
    logging.info('Initializing RDPG agent with passed settings.')

    # Tensorflow GPU optimization.
    config = tf.ConfigProto()  # GPU fix?
    config.gpu_options.allow_growth = True
    self.sess = tf.Session(config = config)
    from keras import backend as K
    K.set_session(self.sess)

    # Make actor network (creates target model internally).
    self.actor = Actor(self.sess, self.maxSteps, self.featureDim,
                       self.actionDim, self.batchSize, targetUpdate, actorLR,
                       self.actionScaleTanH, self.actionBiasTanH)

    # Make critic network (creates target model internally).
    self.critic = Critic(self.sess, self.maxSteps, self.featureDim,
                         self.actionDim, self.batchSize, targetUpdate, criticLR)
def __init__(self, policy_params):
    """
    policy_params['arch_params'] is a dictionary like:
    {'state_and_action_dims': (num1, num2),
     'layers': {'Linear_1': layer_size_1, ..., 'Linear_n': layer_size_n}}
    """
    super(Policy, self).__init__()
    self.policy_params = policy_params
    self.seed_as_int = policy_params['seed']
    torch.manual_seed(self.seed_as_int)
    self.arch_params = policy_params['arch_params']
    self.__state_dim = self.arch_params['state_and_action_dims'][0]
    self.__action_dim = self.arch_params['state_and_action_dims'][1]
    self.eps = policy_params['eps']
    self.min_eps = policy_params['min_eps']
    self.eps_decay = policy_params['eps_decay']
    self.__noise_type = policy_params['noise_type']

    keys = list(self.arch_params['layers'].keys())
    list_of_layers = []
    prev_layer_size = self.__state_dim
    for i in range(len(self.arch_params['layers'])):
        key = keys[i]
        layer_type = key.split('_')[0]
        if layer_type == 'Linear':
            layer_size = self.arch_params['layers'][key]
            list_of_layers.append(nn.Linear(prev_layer_size, layer_size))
            prev_layer_size = layer_size
        elif layer_type == 'LayerNorm':
            list_of_layers.append(nn.LayerNorm(prev_layer_size))
        elif layer_type == 'ReLU':
            list_of_layers.append(nn.ReLU())
        elif layer_type == 'Tanh':
            list_of_layers.append(nn.Tanh())
        else:
            print("Error: got unspecified layer type: '{}'. Check your layers!".format(layer_type))
            break
    self.layers = nn.ModuleList(list_of_layers)

    # noise
    if self.__noise_type == 'action':
        self.__rand_process = OUNoise((self.__action_dim,))
    elif self.__noise_type == 'parameter':
        # Attach one noise process per Linear layer's weight matrix.
        self.network_params_perturbations = dict()
        for i in range(len(self.layers)):
            if 'Linear' in str(type(self.layers[i])):
                self.network_params_perturbations[i] = OUNoise(tuple(self.layers[i].weight.shape))
    else:
        raise ValueError("Got an unspecified type of noise. The only available options are 'parameter' and 'action'")
def __init__(self, state_size, action_size, num_agents, random_seed):
    """ Initialize an Agent

    Params
    ======
        state_size (int): state dimension
        action_size (int): action dimension
        num_agents (int): simultaneous running agents
        random_seed (int): random seed
    """
    self.state_size = state_size
    self.action_size = action_size
    self.num_agents = num_agents
    random.seed(random_seed)

    # Actor Network and its target network
    self.actor_local = Actor(state_size, action_size, random_seed).to(device)
    self.actor_target = Actor(state_size, action_size, random_seed).to(device)
    self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

    # Critic Network and its target network
    self.critic_local = Critic(state_size, action_size, random_seed).to(device)
    self.critic_target = Critic(state_size, action_size, random_seed).to(device)
    self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

    # Noise object
    self.noise = OUNoise((num_agents, action_size), random_seed)

    # Replay Memory
    self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, EXPERIENCES_PER_SAMPLING, device, random_seed)

    # Initialize time step (for updating every UPDATE_NN_EVERY steps)
    self.t_step_nn = 0
    # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
    self.t_step_mem_par = 0
    # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
    self.t_step_mem = 0
def __init__(self, task):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high

    # Actor (Policy) Model
    self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
    self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

    # Critic (Value) Model
    self.critic_local = Critic(self.state_size, self.action_size)
    self.critic_target = Critic(self.state_size, self.action_size)

    # Initialize target model parameters with local model parameters
    self.critic_target.model.set_weights(self.critic_local.model.get_weights())
    self.actor_target.model.set_weights(self.actor_local.model.get_weights())

    # Noise process
    self.exploration_mu = 0
    self.exploration_theta = 0.15
    self.exploration_sigma = 0.2
    self.noise = OUNoise(self.action_size, self.exploration_mu,
                         self.exploration_theta, self.exploration_sigma)

    # Replay memory
    self.buffer_size = 100000
    self.batch_size = 64
    self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

    # Reward monitoring
    self.best_total_reward = -np.inf

    # Algorithm parameters
    self.gamma = 0.99  # discount factor
    self.tau = 0.01    # for soft update of target parameters
def __init__(self, n_input=70 + 12, n_hidden=256, n_output=1, activation=nn.LeakyReLU):
    # input = state + action
    super(DiscriminatorModel, self).__init__()
    self.model = nn.Sequential(nn.Linear(n_input, n_hidden),
                               activation(),
                               nn.Linear(n_hidden, n_output),
                               nn.Dropout(p=0.6),
                               nn.Sigmoid())
    self.Noise = OUNoise(n_input)
    self.model.apply(init_weight)
def __init__(self, env, seed):
    self.env = env
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    self.env.seed(seed)

    self.actor = self.createModel()
    self.target_actor = self.createModel()
    # noise is actually OpenAI baselines OU Noise wrapped in another OUNoise function
    self.noise = OUNoise(self.env.action_space.shape[0], seed, theta=0.2, sigma=0.5)

    self.critic = self.createModel((self.env.observation_space.shape[0],
                                    self.env.action_space.shape[0]))
    self.target_critic = self.createModel((self.env.observation_space.shape[0],
                                           self.env.action_space.shape[0]))

    # Ensure initial weights are equal for the local and target networks
    self.target_critic.set_weights(self.critic.get_weights())
    self.target_actor.set_weights(self.actor.get_weights())

    self.reset()
def run_n_episodes(self, num_episodes, max_ep_length, minibatch_size,
                   explore=True, num_updates=5, summary_checkpoint=1,
                   eta=0.01, num_updates_ac=1, T=1):  # num_updates from article
    for i in range(num_episodes):
        noise = OUNoise(self.a_dim)
        x = self.env.reset()
        x = x.reshape(1, -1)
        u = np.zeros(self.s_dim)
        t = False
        episodes_reward = 0

        # for REINFORCE
        self.r_rs.append([])
        self.r_xs.append([])
        self.r_us.append([])
        self.episodes_ls.append(0)

        while True:
            self.episodes_ls[-1] = self.episodes_ls[-1] + 1
            if self.det:
                u, V = self.sess.run((self.model.mu_det, self.model.V),
                                     feed_dict={self.model.inputs_x: x})
                self.episodes_Vs.append(V)
            else:
                u, P, sigma, V = self.sess.run(
                    (self.model.mu_norm, self.model.P, self.model.sigma, self.model.V),
                    feed_dict={self.model.inputs_x: x})
                self.episodes_Ps.append(P)
                self.episodes_ss.append(sigma)
                self.episodes_Vs.append(V)
            if self.separate_V:
                self.episodes_V_s.append(self.critic.predict_V_sep(x))

            if explore:
                u += noise.noise()
            u = np.clip(u, -1.0, 1.0)
            u = u.reshape(1, -1)

            x1, r, t, info = self.env.step(u.reshape(-1))

            self.r_xs[-1].append(x)
            self.r_us[-1].append(u)
            self.r_rs[-1].append(r)
            episodes_reward += r

            self.buffer.add(x.reshape(1, -1), u, r, t, x1.reshape(1, -1))
            self.episodes_xs.append(x)
            self.episodes_us.append(u)
            self.episodes_rs.append(r)  # Actor-Critic

            x = x1.reshape(1, -1)

            if self.qNAF:
                for k in range(num_updates):
                    x_batch, u_batch, r_batch, t_batch, x1_batch = \
                        self.buffer.sample_batch(minibatch_size)
                    x_batch, u_batch, r_batch, t_batch, x1_batch = \
                        x_batch.reshape(-1, self.s_dim), u_batch.reshape(-1, self.a_dim), r_batch.reshape(-1, 1), \
                        t_batch.reshape(-1), x1_batch.reshape(-1, self.s_dim)
                    if self.qNAF:
                        y_batch = self.gamma * self.target_model.predict_V(x1_batch) + r_batch
                        self.model.update_Q(x_batch, u_batch, y_batch)
                    self.target_model.soft_update_from(self.model)

            if t:
                break

        if self.ac:
            r_xs_l = np.array(self.r_xs[-1]).reshape(-1, self.s_dim)
            r_rs_l = np.array(self.r_rs[-1]).reshape(-1, 1)
            for idx in range(2, len(r_rs_l) + 1):
                r_rs_l[-idx] += self.gamma * r_rs_l[-idx + 1]
            self.r_rs[-1] = r_rs_l
            r_rs_ = np.array(self.r_rs).reshape(-1, 1)
            r_xs_ = np.array(self.r_xs).reshape(-1, self.s_dim)
            r_us_ = np.array(self.r_us).reshape(-1, self.a_dim)

            for _ in range(num_updates_ac):  # update V every episode
                if self.separate_V:
                    self.critic.update_V_sep(r_xs_l, r_rs_l)
                if i % T == 0:
                    # Q_target = r_rs_[:-1] + self.gamma * self.critic.predict_V_sep(r_xs_[1:])
                    # Q_target = np.vstack((Q_target, (r_rs_[-1])))
                    deltas = r_rs_
                    if self.separate_V:
                        deltas = deltas - self.critic.predict_V_sep(r_xs_)
                    else:
                        deltas = deltas - self.target_model.predict_V(r_xs_)
                    '''
                    loss = self.sess.run((self.model.loss_spg),
                                         feed_dict={self.model.inputs_x: r_xs_,
                                                    self.model.inputs_u: r_us_,
                                                    self.model.inputs_Q: deltas})
                    print('loss before update', loss)
                    '''
                    self.model.update_mu(r_xs_, r_us_, deltas)
                    self.target_model.soft_update_from(self.model)
                    '''
                    loss = self.sess.run((self.model.loss_spg),
                                         feed_dict={self.model.inputs_x: r_xs_,
                                                    self.model.inputs_u: r_us_,
                                                    self.model.inputs_Q: deltas})
                    print('loss after update', loss)
                    '''
                    # self.target_model.soft_update_from(self.model)

            self.r_rs = []
            self.r_xs = []
            self.r_us = []

        if summary_checkpoint > 0 and i % summary_checkpoint == 0:
            print('| Reward: %.2i' % int(episodes_reward), " | Episode", i)
            self.plot_rewards(self.summary_dir)
        self.rewards.append(episodes_reward)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, random_seed, fc1_units, fc2_units,
                 weighted=False, individual=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.epsilon = EPSILON_MAX

        # Actor Network (w/ Target Network)
        if weighted:
            self.actor_local = Weight_adapter(state_size, action_size).to(device)
            self.actor_target = Weight_adapter(state_size, action_size).to(device)
        elif individual:
            self.actor_local = IndividualModel(state_size, action_size, random_seed, fc1_units).to(device)
            self.actor_target = IndividualModel(state_size, action_size, random_seed, fc1_units).to(device)
        else:
            self.actor_local = Actor(state_size, action_size, random_seed, fc1_units, fc2_units).to(device)
            self.actor_target = Actor(state_size, action_size, random_seed, fc1_units, fc2_units).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise process
        self.noise = OUNoise(action_size, random_seed, mu=0, theta=0.15, sigma=0.2)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed)

        # Make sure the target networks start with the same weights as the source (local) networks
        self.hard_update(self.actor_target, self.actor_local)
        self.hard_update(self.critic_target, self.critic_local)

        self.t_step = 0

    def step(self, state, action, reward, next_state, done, timestep):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > LEARN_START:
            # Learn every UPDATE_EVERY time steps.
            self.t_step = (self.t_step + 1) % UPDATE_EVERY
            if self.t_step == 0:
                # Learn, if enough samples are available in memory
                if len(self.memory) > BATCH_SIZE:
                    for _ in range(UPDATES_PER_STEP):
                        experiences = self.memory.sample()
                        self.learn(experiences, GAMMA)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
            # print(action)
        self.actor_local.train()
        if add_noise:
            tem_noise = self.noise.sample()
            action += self.epsilon * tem_noise
            # print(tem_noise, np.clip(action, -1, 1))
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # ---------------------------- update noise ---------------------------- #
        if self.epsilon - EPSILON_DECAY > EPSILON_MIN:
            self.epsilon -= EPSILON_DECAY
        else:
            self.epsilon = EPSILON_MIN
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def hard_update(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)
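The agent above also assumes a ReplayBuffer with the (action_size, buffer_size, batch_size, seed) constructor and the add()/sample()/__len__() calls shown in step(). A minimal sketch consistent with those calls follows; the namedtuple fields and the tensor conversion are assumptions, not the original code.

# Illustrative sketch only: a ReplayBuffer consistent with how the agent above
# calls it; field names and device handling are assumptions.
import random
from collections import deque, namedtuple

import numpy as np
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


class ReplayBuffer:
    """Fixed-size buffer to store experience tuples (assumed interface)."""

    def __init__(self, action_size, buffer_size, batch_size, seed):
        self.action_size = action_size
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.experience = namedtuple("Experience",
                                     field_names=["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        """Add a new experience to memory."""
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        """Randomly sample a batch of experiences and convert them to tensors."""
        experiences = random.sample(self.memory, k=self.batch_size)
        states = torch.from_numpy(np.vstack([e.state for e in experiences])).float().to(device)
        actions = torch.from_numpy(np.vstack([e.action for e in experiences])).float().to(device)
        rewards = torch.from_numpy(np.vstack([e.reward for e in experiences])).float().to(device)
        next_states = torch.from_numpy(np.vstack([e.next_state for e in experiences])).float().to(device)
        dones = torch.from_numpy(np.vstack([e.done for e in experiences]).astype(np.uint8)).float().to(device)
        return (states, actions, rewards, next_states, dones)

    def __len__(self):
        return len(self.memory)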
class Policy(nn.Module):
    """Actor (Policy) Model."""

    def __init__(self, policy_params):
        """
        policy_params['arch_params'] is a dictionary like:
        {'state_and_action_dims': (num1, num2),
         'layers': {'Linear_1': layer_size_1, ..., 'Linear_n': layer_size_n}}
        """
        super(Policy, self).__init__()
        self.policy_params = policy_params
        self.seed_as_int = policy_params['seed']
        torch.manual_seed(self.seed_as_int)
        self.arch_params = policy_params['arch_params']
        self.__state_dim = self.arch_params['state_and_action_dims'][0]
        self.__action_dim = self.arch_params['state_and_action_dims'][1]
        self.eps = policy_params['eps']
        self.min_eps = policy_params['min_eps']
        self.eps_decay = policy_params['eps_decay']
        self.__noise_type = policy_params['noise_type']

        keys = list(self.arch_params['layers'].keys())
        list_of_layers = []
        prev_layer_size = self.__state_dim
        for i in range(len(self.arch_params['layers'])):
            key = keys[i]
            layer_type = key.split('_')[0]
            if layer_type == 'Linear':
                layer_size = self.arch_params['layers'][key]
                list_of_layers.append(nn.Linear(prev_layer_size, layer_size))
                prev_layer_size = layer_size
            elif layer_type == 'LayerNorm':
                list_of_layers.append(nn.LayerNorm(prev_layer_size))
            elif layer_type == 'ReLU':
                list_of_layers.append(nn.ReLU())
            elif layer_type == 'Tanh':
                list_of_layers.append(nn.Tanh())
            else:
                print("Error: got unspecified layer type: '{}'. Check your layers!".format(layer_type))
                break
        self.layers = nn.ModuleList(list_of_layers)

        # noise
        if self.__noise_type == 'action':
            self.__rand_process = OUNoise((self.__action_dim,))
        elif self.__noise_type == 'parameter':
            # Attach one noise process per Linear layer's weight matrix.
            self.network_params_perturbations = dict()
            for i in range(len(self.layers)):
                if 'Linear' in str(type(self.layers[i])):
                    self.network_params_perturbations[i] = OUNoise(tuple(self.layers[i].weight.shape))
        else:
            raise ValueError("Got an unspecified type of noise. The only available options are 'parameter' and 'action'")

    def forward(self, state):  # get action values
        """Build a network that maps state -> action."""
        if self.__noise_type == 'action':
            y = state.float()
            for i in range(len(self.layers)):
                y = self.layers[i](y).float()
            y_perturbed = y + self.eps * torch.from_numpy(self.__rand_process.noise()).float()
            return y, torch.clamp(y_perturbed, min=-1.0, max=1.0)

        elif self.__noise_type == 'parameter':
            y = state.float()
            y_perturbed = state.float()
            for i in range(len(self.layers)):
                if not ('Linear' in str(type(self.layers[i]))):
                    # layernorm / activation layers are applied unchanged
                    y = self.layers[i](y).float()
                    y_perturbed = self.layers[i](y_perturbed).float()
                else:
                    # weights
                    if (self.layers[i].weight).shape[1] == y.shape[0]:
                        # if there is a single state
                        y = (self.layers[i].weight).matmul(y)
                        n = torch.from_numpy(self.network_params_perturbations[i].noise()).float()
                        y_perturbed = ((self.layers[i].weight) + self.eps * n).matmul(y_perturbed)
                    else:
                        # if there is a batch of states
                        y = y.matmul((self.layers[i].weight).t())
                        n = torch.from_numpy(self.network_params_perturbations[i].noise()).float()
                        y_perturbed = y_perturbed.matmul(((self.layers[i].weight) + self.eps * n).t())
                    # biases
                    y = y + (self.layers[i].bias)
                    y_perturbed = y_perturbed + (self.layers[i].bias)
            return y, y_perturbed
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Reward monitoring
        self.best_total_reward = -np.inf

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters

    def reset_episode(self):
        self.total_reward = 0.0
        # self.count = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        self.total_reward += reward
        if self.total_reward > self.best_total_reward:
            self.best_total_reward = self.total_reward

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
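For context, a hypothetical driver loop for the DDPG agent above might look like the following. It assumes the task object follows the usual reset()/step() convention with step() returning (next_state, reward, done); that object is not shown in the snippet.

# Illustrative sketch only: how the DDPG class above is typically driven.
# `task` is an assumed environment object with reset() and step().
num_episodes = 500
agent = DDPG(task)

for i_episode in range(1, num_episodes + 1):
    state = agent.reset_episode()
    while True:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
        if done:
            print('Episode {:4d}: total reward = {:7.3f} (best = {:7.3f})'.format(
                i_episode, agent.total_reward, agent.best_total_reward))
            break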
class RDPGAgent:
    def __init__(self, env, batchSize = 10, bufferSize = 100,
                 gamma = 0.98, actorLR = 1e-4, criticLR = 1e-3,
                 maxSteps = 200, targetUpdate = 1e-3, epsilon = 1,
                 decay = 0.99, rewardScale = 1e-3, logFile = 'run.log'):
        self.env = env
        self.gamma = gamma
        self.batchSize = batchSize
        self.bufferSize = bufferSize
        self.maxSteps = maxSteps + 1
        self.rewardScale = rewardScale
        self.epsilon = epsilon
        self.decay = decay

        # Useful helpers.
        self.actionDim = self.env.action_space.shape[0]
        self.stateDim = self.env.observation_space.shape[0]
        self.featureDim = self.actionDim + self.stateDim
        self.minAction = self.env.action_space.low
        self.maxAction = self.env.action_space.high

        # For scaling output action values.
        self.actionBiasZeroOne = self.minAction
        self.actionScaleZeroOne = self.maxAction - self.minAction
        self.actionBiasTanH = (self.maxAction + self.minAction) / 2.0
        self.actionScaleTanH = self.maxAction - self.actionBiasTanH

        # Initialize noise process.
        self.noise = OUNoise(self.actionDim)

        # Initialize replay buffer.
        self.buffer = ReplayBuffer(self.bufferSize)

        # Initialize logging.
        logging.basicConfig(filename = logFile, level = logging.INFO,
                            format = '[%(asctime)s] %(message)s',
                            datefmt = '%m/%d/%Y %I:%M:%S %p')
        logging.info('Initializing RDPG agent with passed settings.')

        # Tensorflow GPU optimization.
        config = tf.ConfigProto()  # GPU fix?
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config = config)
        from keras import backend as K
        K.set_session(self.sess)

        # Make actor network (creates target model internally).
        self.actor = Actor(self.sess, self.maxSteps, self.featureDim,
                           self.actionDim, self.batchSize, targetUpdate, actorLR,
                           self.actionScaleTanH, self.actionBiasTanH)

        # Make critic network (creates target model internally).
        self.critic = Critic(self.sess, self.maxSteps, self.featureDim,
                             self.actionDim, self.batchSize, targetUpdate, criticLR)

    # Train or run for some number of episodes.
    def run(self, numEpisodes, training = False, warmUp = 30):
        for i in range(numEpisodes):
            sequence = []
            totalReward = 0
            totalSteps = 0
            o = self.env.reset()

            # Stores (O1, A1, O2, A2, etc) for prediction.
            history = np.zeros((self.maxSteps * self.featureDim))
            history[:self.stateDim] = o

            for j in range(self.maxSteps - 1):
                # We do this reshaping to get history into (BatchSize, TimeSteps, Dims).
                batchedHistory = np.reshape(history, (self.maxSteps, self.featureDim))
                batchedHistory = np.expand_dims(batchedHistory, axis = 0)

                # Predict action or use random with e-greedy.
                # if (np.random.random_sample() < self.epsilon and training):
                #     a = np.random.random((self.actionDim))
                #     a = a * self.actionScaleZeroOne
                #     a = a + self.actionBiasZeroOne
                # else:
                #     a = self.actor.model.predict(batchedHistory)[0]

                # Predict an action and add noise to it for exploration purposes.
                a = self.actor.model.predict(batchedHistory)[0] + self.epsilon * self.noise.noise()
                a = np.clip(a, self.minAction, self.maxAction)

                # Take a single step.
                oPrime, r, d, _ = self.env.step(a)
                r *= self.rewardScale
                newTimeStart = (j + 1) * self.featureDim

                # Update agent state and ongoing agent history data. History is
                # passed to our actor for prediction, and sequence is for later.
                history[j * self.featureDim + self.stateDim:newTimeStart] = a
                history[newTimeStart:(j + 1) * self.featureDim + self.stateDim] = oPrime
                sequence.append({'o': o, 'a': a, 'r': r, 'd': d})
                totalReward += r
                totalSteps += 1
                o = oPrime

                # Quit early.
                if d:
                    break

            # Anneal epsilon.
            if i > warmUp:
                self.epsilon *= self.decay

            # Print some episode debugging and reward information.
            print('Episode: %03d / Steps: %d / Reward: %f' % (i + 1, totalSteps, totalReward / self.rewardScale))
            logging.info('Episode: %03d / Steps: %d / Reward: %f' % (i + 1, totalSteps, totalReward / self.rewardScale))

            # Simulation only.
            if not training:
                continue

            # Add sequence to buffer.
            self.buffer.add(sequence)

            # Resample sequences from the buffer.
            samples = self.buffer.getBatch(self.batchSize)
            numSamples = len(samples)

            # Do not train until we have seen warmUp episodes.
            if self.buffer.getCount() < warmUp:
                continue

            # Some more debug info.
            print('Training on sampled sequences from all episodes.')
            logging.info('Training on sampled sequences from all episodes.')

            # All of these do not include time step t = T.
            # Used to store H[i, t] for each episode i and step t.
            sampleHistories = np.zeros((numSamples, self.maxSteps - 1, self.maxSteps * self.featureDim))
            # Used to store H[i, t + 1] for each episode i and step t.
            sampleHistoriesWithNext = np.zeros((numSamples, self.maxSteps - 1, self.maxSteps * self.featureDim))
            # Used to store R[i, t] for each episode i and step t.
            sampleRewards = np.zeros((numSamples, self.maxSteps - 1))
            # Used to store NotDone[i, t] for each episode i and step t.
            sampleNotDoneMasks = np.zeros((numSamples, self.maxSteps - 1))
            # Used to store action[i, t] taken for each episode i and step t.
            sampleActions = np.zeros((numSamples, self.maxSteps - 1, self.actionDim))

            # Compute info for each episode i.
            for i in range(numSamples):
                sample = samples[i]
                historySoFar = np.zeros((self.maxSteps * self.featureDim))

                # Iteratively build up historySoFar for each timestep t.
                for t in range(len(sample) - 1):
                    step, nextStep = sample[t], sample[t + 1]

                    # This is (oT, aT), which we are adding to running history.
                    history = np.concatenate([step['o'], step['a']], axis = 0)
                    historySoFar[t * self.featureDim:(t + 1) * self.featureDim] = history

                    # This is (o1, a1, o2, a2 ... ot).
                    sampleHistoryEnd = (t + 1) * self.featureDim - self.actionDim
                    sampleHistories[i, t, :sampleHistoryEnd] = historySoFar[:sampleHistoryEnd]

                    # This is (o1, a1, o2, a2 ... ot, at, ot+1).
                    sampleNextEnd = (t + 1) * self.featureDim
                    sampleHistoriesWithNext[i, t, :sampleNextEnd] = historySoFar[:sampleNextEnd]
                    sampleHistoriesWithNext[i, t, sampleNextEnd:sampleNextEnd + self.stateDim] = nextStep['o']

                    # Set rewards and not done masks.
                    sampleRewards[i, t] = step['r']
                    sampleActions[i, t] = step['a']
                    sampleNotDoneMasks[i, t] = 0 if step['d'] else 1

            # Separate out self.maxSteps since it is the timestep dimension for RNN.
            sampleHistories = np.reshape(sampleHistories,
                                         (numSamples, self.maxSteps - 1, self.maxSteps, self.featureDim))
            sampleHistoriesWithNext = np.reshape(sampleHistoriesWithNext,
                                                 (numSamples, self.maxSteps - 1, self.maxSteps, self.featureDim))

            # Update models using samples, rewards, and masks.
            self.update(numSamples, sampleHistories, sampleHistoriesWithNext,
                        sampleRewards, sampleActions, sampleNotDoneMasks)

    # Given a bunch of experienced histories, update our models.
    def update(self, numSamples, histories, historiesNext, rewards, chosenActions, notDoneMasks):
        # Reshape [i, t] pairs to a single dimension, which will be the RNN batch dimension.
        historiesBatch = np.reshape(histories, (-1, self.maxSteps, self.featureDim))
        historiesNextBatch = np.reshape(historiesNext, (-1, self.maxSteps, self.featureDim))

        # Compute QSample targets [y] for updating the critic Q[S][A] outputs.
        targetActions = self.actor.target.predict(historiesNextBatch)              # (B * (T - 1), F).
        targetQ = self.critic.target.predict([historiesNextBatch, targetActions])  # (B * (T - 1), 1).
        targetQ = np.reshape(targetQ, (numSamples, self.maxSteps - 1))              # (B, T - 1).
        y = rewards + notDoneMasks * (self.gamma * targetQ)                         # (B, T - 1).
        y = np.reshape(y, (numSamples * (self.maxSteps - 1), 1))                    # (B * (T - 1), 1).

        # Train the critic model, passing in both the history and chosen actions.
        chosenActionsFlat = np.reshape(chosenActions, (numSamples * (self.maxSteps - 1), self.actionDim))
        # print(chosenActionsFlat.shape, historiesBatch.shape, historiesNextBatch.shape)
        self.critic.model.train_on_batch([historiesBatch, chosenActionsFlat], y)

        # Compute the gradient of the critic output WRT to its action input.
        # We cannot use chosenActions here since those were noisy predictions.
        currentActionsForGrad = self.actor.model.predict(historiesBatch)
        currentActionsGrad = self.critic.modelActionGradients(historiesBatch, currentActionsForGrad)

        # Train the actor model using the critic gradient WRT action input.
        self.actor.trainModel(historiesBatch, currentActionsGrad)

        # Update target models.
        self.actor.trainTarget()
        self.critic.trainTarget()
def __init__(self, task, train=True):
    self.task = task
    self.state_size = task.state_size
    self.action_size = task.action_size
    self.action_low = task.action_low
    self.action_high = task.action_high
    self.actor_lr = 1e-5   # .0001
    self.critic_lr = 1e-4  # 0.0000001
    self.network = [128, 256, 128]
    self.train = train
    network = self.network
    actor_lr = self.actor_lr
    critic_lr = self.critic_lr

    if (self.train):
        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high, actor_lr, network)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high, actor_lr, network)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size, critic_lr, network)
        self.critic_target = Critic(self.state_size, self.action_size, critic_lr, network)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0        # Mean
        self.exploration_theta = 0.15  # How fast the variable reverts to the mean
        self.exploration_sigma = 0.2   # Degree of volatility
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 5000
        self.batch_size = 16
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
        self.targets = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters

        print("DDPG init", "Actor: ", actor_lr, "Critic: ", critic_lr)
        print("Tau: ", self.tau, "Sigma: ", self.exploration_sigma)
        print(self.actor_local.model.summary())
        print(self.critic_local.model.summary())

        # https://stackoverflow.com/questions/44861149/keras-use-tensorboard-with-train-on-batch?rq=1
        # Create the TensorBoard callback, which we will drive manually
        self.tensorboard = keras.callbacks.TensorBoard(
            log_dir='logdir',
            histogram_freq=0,
            batch_size=self.batch_size,
            write_graph=True,
            write_grads=True)
        self.tensorboard.set_model(self.critic_local.model)

        self.summary_writer = tf.summary.FileWriter("scores")
        self.batch_id = 0
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task, train=True):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.actor_lr = 1e-5   # .0001
        self.critic_lr = 1e-4  # 0.0000001
        self.network = [128, 256, 128]
        self.train = train
        network = self.network
        actor_lr = self.actor_lr
        critic_lr = self.critic_lr

        if (self.train):
            # Actor (Policy) Model
            self.actor_local = Actor(self.state_size, self.action_size,
                                     self.action_low, self.action_high, actor_lr, network)
            self.actor_target = Actor(self.state_size, self.action_size,
                                      self.action_low, self.action_high, actor_lr, network)

            # Critic (Value) Model
            self.critic_local = Critic(self.state_size, self.action_size, critic_lr, network)
            self.critic_target = Critic(self.state_size, self.action_size, critic_lr, network)

            # Initialize target model parameters with local model parameters
            self.critic_target.model.set_weights(self.critic_local.model.get_weights())
            self.actor_target.model.set_weights(self.actor_local.model.get_weights())

            # Noise process
            self.exploration_mu = 0        # Mean
            self.exploration_theta = 0.15  # How fast the variable reverts to the mean
            self.exploration_sigma = 0.2   # Degree of volatility
            self.noise = OUNoise(self.action_size, self.exploration_mu,
                                 self.exploration_theta, self.exploration_sigma)

            # Replay memory
            self.buffer_size = 5000
            self.batch_size = 16
            self.memory = ReplayBuffer(self.buffer_size, self.batch_size)
            self.targets = ReplayBuffer(self.buffer_size, self.batch_size)

            # Algorithm parameters
            self.gamma = 0.99  # discount factor
            self.tau = 0.01    # for soft update of target parameters

            print("DDPG init", "Actor: ", actor_lr, "Critic: ", critic_lr)
            print("Tau: ", self.tau, "Sigma: ", self.exploration_sigma)
            print(self.actor_local.model.summary())
            print(self.critic_local.model.summary())

            # https://stackoverflow.com/questions/44861149/keras-use-tensorboard-with-train-on-batch?rq=1
            # Create the TensorBoard callback, which we will drive manually
            self.tensorboard = keras.callbacks.TensorBoard(
                log_dir='logdir',
                histogram_freq=0,
                batch_size=self.batch_size,
                write_graph=True,
                write_grads=True)
            self.tensorboard.set_model(self.critic_local.model)

            self.summary_writer = tf.summary.FileWriter("scores")
            self.batch_id = 0

    def reset_episode(self):
        if (self.train):
            self.noise.reset()
            self.noise_arr = []
            self.noise_matrix = [0., 0., 0., 0.]
        state = self.task.reset()
        self.last_state = state
        return state

    def save_initial_weights(self):
        self.actor_local.model.save_weights('actor_local.h5')
        self.actor_target.model.save_weights('actor_target.h5')
        self.critic_local.model.save_weights('critic_local.h5')
        self.critic_target.model.save_weights('critic_target.h5')

    def load_initial_weights(self):
        self.actor_local.model.load_weights('actor_local.h5')
        self.actor_target.model.load_weights('actor_target.h5')
        self.critic_local.model.load_weights('critic_local.h5')
        self.critic_target.model.load_weights('critic_target.h5')

    def save_model(self):
        # Save the weights
        self.actor_local.model.save_weights('model_weights.h5')

    def load_weights(self, option=None):
        if option is None:
            self.trained = Actor(self.state_size, self.action_size, self.action_low,
                                 self.action_high, self.actor_lr, self.network)
            self.trained.model.load_weights('model_weights.h5')
        else:
            self.trained = Actor(self.state_size, self.action_size, self.action_low,
                                 self.action_high, self.actor_lr, self.network)
            self.trained.model.load_weights('weights-best.hdf5')
        print(self.trained.model.summary())

    def predict(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.trained.model.predict(state)[0]
        return action

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if (len(self.memory) > self.batch_size * 2):
            experiences = self.memory.sample()
            self.learn(experiences)

        if (len(self.memory) == self.buffer_size):
            self.memory.memory.clear()
            print("buffer cleared")

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        noise = self.noise.sample()
        action = list(self.actor_local.model.predict(state)[0] + noise)
        return action, noise  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])
        '''
        print("States", states.shape)
        print("actions", actions.shape)
        print("rewards", rewards.shape)
        print("dones", dones.shape)
        print("Next states", next_states.shape)
        '''
        # Keep training actor_local and critic_local: use values from the target models
        # to update and train the local models. The target models themselves are not
        # trained directly; they are only soft-updated.

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)                      # actions predicted by the target actor
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])   # Q values from the target critic

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        critic_loss = self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        actor_loss = self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        self.tensorboard.on_epoch_end(self.batch_id, named_logs(self.critic_local.model, [critic_loss]))
        self.batch_id += 1

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
class Agent:
    """ Interacts with and learns from the environment. """

    def __init__(self, state_size, action_size, num_agents, random_seed):
        """ Initialize an Agent

        Params
        ======
            state_size (int): state dimension
            action_size (int): action dimension
            num_agents (int): simultaneous running agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        random.seed(random_seed)

        # Actor Network and its target network
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network and its target network
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Noise object
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay Memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, EXPERIENCES_PER_SAMPLING, device, random_seed)

        # Initialize time step (for updating every UPDATE_NN_EVERY steps)
        self.t_step_nn = 0
        # Initialize time step (for updating every UPDATE_MEM_PAR_EVERY steps)
        self.t_step_mem_par = 0
        # Initialize time step (for updating every UPDATE_MEM_EVERY steps)
        self.t_step_mem = 0

    def step(self, state, action, reward, next_state, done):
        """ Save experience in replay memory, and use prioritized sample from buffer to learn. """
        # Save memory
        for i in range(self.num_agents):
            self.memory.add(state[i, :], action[i, :], reward[i], next_state[i, :], done[i])

        # Learn every UPDATE_NN_EVERY time steps.
        self.t_step_nn = (self.t_step_nn + 1) % UPDATE_NN_EVERY
        self.t_step_mem = (self.t_step_mem + 1) % UPDATE_MEM_EVERY
        self.t_step_mem_par = (self.t_step_mem_par + 1) % UPDATE_MEM_PAR_EVERY
        if self.t_step_mem_par == 0:
            self.memory.update_parameters()
        if self.t_step_nn == 0:
            # Learn from memory if enough samples exist
            if self.memory.experience_count > EXPERIENCES_PER_SAMPLING:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)
        if self.t_step_mem == 0:
            self.memory.update_memory_sampling()

    def act(self, states, add_noise=True):
        """ Returns actions for the given states as per current policy. """
        states = torch.from_numpy(states).float().to(device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[i, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """ Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, indices = experiences

        # Update critic
        # Get predicted next-state actions and Q values from the target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Update actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

        # Update priorities
        delta = abs(Q_targets - Q_expected).detach().numpy()
        self.memory.update_priorities(delta, indices)

    @staticmethod
    def soft_update(local_model, target_model, tau):
        """ Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_model_param, local_model_param in zip(target_model.parameters(), local_model.parameters()):
            target_model_param.data.copy_(tau * local_model_param.data + (1. - tau) * target_model_param.data)
class Agent:
    '''The most perfect DDPG Agent you have ever seen'''
    # Parameters taken from various sources
    epsilon = 0
    epsilon_min = 0
    decay = 0.9
    learn_start = 1000
    gamma = 0.99
    alpha = 0.002
    tau = 0.005
    mem_len = 1e5
    memory = deque(maxlen=int(mem_len))

    def __init__(self, env, seed):
        self.env = env
        random.seed(seed)
        np.random.seed(seed)
        tf.random.set_seed(seed)
        self.env.seed(seed)

        self.actor = self.createModel()
        self.target_actor = self.createModel()
        # noise is actually OpenAI baselines OU Noise wrapped in another OUNoise function
        self.noise = OUNoise(self.env.action_space.shape[0], seed, theta=0.2, sigma=0.5)

        self.critic = self.createModel((self.env.observation_space.shape[0],
                                        self.env.action_space.shape[0]))
        self.target_critic = self.createModel((self.env.observation_space.shape[0],
                                               self.env.action_space.shape[0]))

        # Ensure initial weights are equal for the local and target networks
        self.target_critic.set_weights(self.critic.get_weights())
        self.target_actor.set_weights(self.actor.get_weights())

        self.reset()
        # return self.actor

    def createModel(self, input=None):
        '''Generate neural network models based on inputs, defaults to Actor model'''
        # To prevent the actor network from causing steep gradients
        last_init = tf.random_uniform_initializer(minval=-0.003, maxval=0.003)

        if input is None:
            input = self.env.observation_space.shape[0]
            # Actor
            inputs = keras.layers.Input(shape=(input, ))
            hidden = keras.layers.Dense(256, activation="relu")(inputs)
            hidden = keras.layers.Dense(256, activation="relu")(hidden)
            outputs = keras.layers.Dense(1, activation="tanh", kernel_initializer=last_init)(hidden)
            model = Actor(inputs, outputs)
            # This could allow us to use a decaying learning rate
            lr_schedule = keras.optimizers.schedules.ExponentialDecay(
                initial_learning_rate=self.alpha / 2, decay_steps=1e9, decay_rate=1)
            # Compile model with optimizer so we can apply tape.gradient later
            model.compile(loss="huber_loss", optimizer=Adam(learning_rate=lr_schedule))
        else:
            # Critic
            input_o, input_a = input
            input1 = keras.layers.Input(shape=(input_o, ))
            input2 = keras.layers.Input(shape=(input_a, ))
            input11 = keras.layers.Dense(16, activation="relu")(input1)
            input11 = keras.layers.Dense(32, activation="relu")(input11)
            input21 = keras.layers.Dense(32, activation="relu")(input2)
            cat = keras.layers.Concatenate()([input11, input21])
            hidden = keras.layers.Dense(256, activation="relu")(cat)
            hidden = keras.layers.Dense(256, activation="relu")(hidden)
            outputs = keras.layers.Dense(1, activation="linear", kernel_initializer=last_init)(hidden)
            lr_schedule = keras.optimizers.schedules.ExponentialDecay(
                initial_learning_rate=self.alpha / 1, decay_steps=1e9, decay_rate=1)
            model = Critic([input1, input2], outputs)
            model.compile(loss="mean_squared_error", optimizer=Adam(learning_rate=lr_schedule))
        return model

    def replayBuffer(self, state, action, reward, next_state, terminal):
        ## TODO Implement prioritised buffer
        self.memory.append([state, action, reward, next_state, terminal])

    @tf.function  # EagerExecution for speeeed
    def replay(self, states, actions, rewards, next_states):  # , actor, target_actor, critic, target_critic):
        '''tf function that replays sampled experience to update actor and critic networks using gradient'''
        # Very much inspired by Keras tutorial: https://keras.io/examples/rl/ddpg_pendulum/
        with tf.GradientTape() as tape:
            target_actions = self.target_actor(next_states, training=True)
            q_target = rewards + self.gamma * self.target_critic([next_states, target_actions], training=True)
            q_current = self.critic([states, actions], training=True)
            critic_loss = tf.math.reduce_mean(tf.math.square(q_target - q_current))
        critic_grad = tape.gradient(critic_loss, self.critic.trainable_variables)
        self.critic.optimizer.apply_gradients(zip(critic_grad, self.critic.trainable_variables))

        with tf.GradientTape() as tape:
            actions_pred = self.actor(states, training=True)
            q_current = self.critic([states, actions_pred], training=True)
            actor_loss = -tf.math.reduce_mean(q_current)
        actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables)
        self.actor.optimizer.apply_gradients(zip(actor_grad, self.actor.trainable_variables))

    @tf.function
    def update_weight(self, target_weights, weights, tau):
        '''tf function for updating the weights of selected target network'''
        for (a, b) in zip(target_weights, weights):
            a.assign(b * tau + a * (1 - tau))

    def trainTarget(self):
        '''Standard function to update target networks by tau'''
        self.update_weight(self.target_actor.variables, self.actor.variables, self.tau)
        self.update_weight(self.target_critic.variables, self.critic.variables, self.tau)

    def sample2batch(self, batch_size=64):
        '''Return a set of Tensor samples from the memory buffer of batch_size, default is 64'''
        # Return nothing if not enough experiences are available
        if len(self.memory) < batch_size:
            return

        # Generate batch and empty arrays
        samples = random.sample(self.memory, batch_size)
        next_states = np.zeros((batch_size, self.env.observation_space.shape[0]))
        states = np.zeros((batch_size, self.env.observation_space.shape[0]))
        rewards = np.zeros((batch_size, 1))
        actions = np.zeros((batch_size, self.env.action_space.shape[0]))

        # Separate batch into arrays
        for idx, sample in enumerate(samples):
            state, action, reward, next_state, terminal = sample
            states[idx] = state
            actions[idx] = action
            rewards[idx] = reward
            next_states[idx] = next_state

        # Convert arrays to tensors so we can use replay as a callable TensorFlow graph
        states = tf.convert_to_tensor(states)
        rewards = tf.convert_to_tensor(rewards)
        rewards = tf.cast(rewards, dtype=tf.float32)
        actions = tf.convert_to_tensor(actions)
        next_states = tf.convert_to_tensor(next_states)

        return (states, actions, rewards, next_states)

    def train(self, state, action, reward, next_state, terminal, steps):
        '''Function call to update buffer and networks at predetermined intervals'''
        self.replayBuffer(state, action, reward, next_state, terminal)  # Add new data to buffer
        if steps % 1 == 0 and len(self.memory) > self.learn_start:  # Sample every X steps
            samples = self.sample2batch()
            states, actions, rewards, next_states = samples
            self.replay(states, actions, rewards, next_states)
        if steps % 1 == 0:  # Update targets only every X steps
            self.trainTarget()

    def reset(self):
        self.epsilon *= self.decay
        self.epsilon = max(self.epsilon_min, self.epsilon)

    def chooseAction(self, state, scale=False):
        '''Choose action based on policy and noise function. Scale option used to limit maximum actions'''
        # self.epsilon *= self.decay
        # self.epsilon = round(max(self.epsilon / 1000, self.epsilon), 5)
        # print(state[0])
        state = tf.expand_dims(tf.convert_to_tensor(state), 0)  # convert to tensor for speeeed
        if np.random.random() < self.epsilon:  # If using epsilon instead of exploration noise
            return random.uniform(-1, 1)
        if scale:
            return np.clip(0.33 * (self.actor(state)) + self.noise.sample(), -1, 1)
        return np.clip(1 * tf.squeeze(self.actor(state)).numpy() + self.noise.sample(), -1, 1)
        # np.argmax(self.model.predict(state))  # action
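A hypothetical usage loop for this TF2 agent is sketched below. The environment name and the 4-tuple env.step() return follow the classic gym API (consistent with the env.seed() call in __init__); both are assumptions, since the original training script is not shown.

# Illustrative sketch only: driving the Agent above with a classic gym loop.
# "Pendulum-v0" and the 4-tuple step() return are assumptions about the setup.
import gym
import numpy as np

env = gym.make("Pendulum-v0")
agent = Agent(env, seed=42)

steps = 0
for episode in range(200):
    state = env.reset()
    agent.reset()  # decays epsilon between episodes
    episode_reward, done = 0.0, False
    while not done:
        action = agent.chooseAction(state)
        next_state, reward, done, _ = env.step(np.reshape(action, env.action_space.shape))
        agent.train(state, action, reward, next_state, done, steps)
        state = next_state
        episode_reward += reward
        steps += 1
    print("Episode %3d reward: %.2f" % (episode, episode_reward))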