def train(self, transitions: int, sigma_max: float = 1., sigma_min: float = 0., buffer_size: int = 10000, batch_size: int = 128, progress_upd_step: int = None, start_training: int = 1000, shaping_coef: float = 300.): history = ReplayBuffer(buffer_size) progress_upd_step = progress_upd_step if progress_upd_step else transitions // 100 log = { "alpha": self.alpha, "gamma": self.gamma, "sigma_max": sigma_max, "sigma_min": sigma_min, "buffer_size": buffer_size, "batch_size": batch_size, "tau": self.tau, "shaping_coef": shaping_coef, "step": [], "reward_mean": [], "reward_std": [] } state = self.reset() t = tqdm(range(transitions)) for i in t: sigma = sigma_max - (sigma_max - sigma_min) * i / transitions action = self.act(state) noise = np.random.normal(scale=sigma, size=action.shape) action = np.clip(action + noise, -1, 1) next_state, reward, done, _ = self.env.step(action) reward += shaping_coef * (self.gamma * np.abs(next_state[1]) - np.abs(state[1])) done_ = next_state[0] >= 0.5 history.add((state, action, next_state, reward, done_)) state = self.reset() if done else next_state if i > start_training: batch = history.sample(batch_size) self.update_critic(batch) self.update_actor(batch) if (i + 1) % progress_upd_step == 0: reward_mean, reward_std = self.evaluate_policy() log["step"].append(i) log["reward_mean"].append(reward_mean) log["reward_std"].append(reward_std) t.set_description( f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}" ) return log
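# Note on the reward-shaping term used in the train() loops in this collection (an
# interpretation, not part of the original code): adding
#     shaping_coef * (gamma * |v'| - |v|)
# is potential-based shaping with potential phi(s) = shaping_coef * |velocity|, which for
# MountainCar-style tasks rewards gaining speed without changing the optimal policy.
# A standalone sketch, assuming index 1 of the observation is the velocity component:
import numpy as np

def shaped_reward(reward, state, next_state, gamma=0.99, shaping_coef=300.0):
    """F(s, s') = gamma * phi(s') - phi(s) with phi(s) = shaping_coef * |velocity|."""
    phi = lambda s: shaping_coef * np.abs(s[1])
    return reward + gamma * phi(next_state) - phi(state)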
def train(self, transitions: int, eps_max: float = 0.5, eps_min: float = 0., buffer_size: int = 10000, batch_size: int = 128, shaping_coef: float = 300., progress_upd_step: int = None, start_training: int = 10000): history = ReplayBuffer(size=buffer_size) progress_upd_step = progress_upd_step if progress_upd_step else transitions // 100 log = { "alpha": self.alpha, "gamma": self.gamma, "buffer_size": buffer_size, "batch_size": batch_size, "tau": self.tau, "shaping_coef": shaping_coef, "eps_max": eps_max, "eps_min": eps_min, "step": [], "reward_mean": [], "reward_std": [] } state = self.reset() t = tqdm(range(transitions)) for i in t: eps = eps_max - (eps_max - eps_min) * i / transitions if random() < eps: action = self.env.action_space.sample() else: action = self.act(state) next_state, reward, done, _ = self.env.step(action) reward += shaping_coef * (self.gamma * np.abs(next_state[1]) - np.abs(state[1])) done_ = next_state[0] >= 0.5 history.add((state, action, next_state, reward, done_)) state = self.reset() if done else next_state if i > start_training: self.update(history.sample(batch_size)) if (i + 1) % progress_upd_step == 0: reward_mean, reward_std = self.evaluate_policy() log["step"].append(i) log["reward_mean"].append(reward_mean) log["reward_std"].append(reward_std) t.set_description( f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}" ) return log
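# A minimal ReplayBuffer sketch compatible with the two train() loops above (they call
# add() with a single transition tuple and sample(batch_size)); this is an assumption
# about the buffer's interface, not the original implementation.
import random
from collections import deque

class ReplayBuffer:
    def __init__(self, size=10000):
        self.buffer = deque(maxlen=size)  # oldest transitions are evicted automatically

    def add(self, transition):
        # transition = (state, action, next_state, reward, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        # uniform sampling with replacement, regrouped column-wise
        batch = random.choices(self.buffer, k=batch_size)
        return tuple(map(list, zip(*batch)))

    def __len__(self):
        return len(self.buffer)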
class TD3(): """ Twin Delayed Deep Deterministic Policy Gradient Model """ def __init__(self, state_size, action_size, random_seed): """ Initialize the model with arguments as follows: ARGUMENTS ========= - state_size (int) = dimension of input space - action_size (int) = dimension of action space - random_seed (int) = random seed Returns ======= - best learned action to take after Actor-Critic Learning """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # create noise self.noise = OUNoise(action_size, random_seed) self.noise_decay = NOISE_DECAY # create memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, device) # Actor Networks (local online net + target net) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr = LR_ACTOR) # Critic Networks (local online net + target net) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # instantiate online and target networks with same weights self.soft_update(self.actor_local, self.actor_target, 1) self.soft_update(self.critic_local, self.critic_target, 1) self.learn_counter = 0 def act(self, state, add_noise=True): """ Choose an action while interacting and learning in the environment """ state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() * self.noise_decay self.noise_decay *= self.noise_decay return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma, noise_clip=0.5, policy_freq=2): """ Sample from experiences and learn """ # update the learn counter self.learn_counter += 1 # get experience tuples states, actions, rewards, next_states, dones = experiences # build noise on the action ##### CAVE: need to put actions onto cpu() to create a cpu tensor that is put onto CUDA with .to(device) #noise = torch.FloatTensor(actions.cpu()).data.normal_(0, policy_noise).to(device) #noise = noise.clamp(-noise_clip, noise_clip) ### <<--- adding this kind of noise was implemented in the paper on github, ### but i used OU-Noise in the act method, so maybe better to use the same while learning noise = torch.FloatTensor([self.noise.sample() for _ in range(len(actions))]).to(device) noise = noise.clamp(-noise_clip, noise_clip) # clip between -/+ max action dims because action+noise might run oor next_action = (self.actor_target(next_states) + noise).clamp(-1, 1) # compute the target Q value target_Q1, target_Q2 = self.critic_target(next_states, next_action) target_Q = torch.min(target_Q1, target_Q2) target_Q = rewards + (gamma * target_Q * (1-dones)).detach() # get current Q estimates current_Q1, current_Q2 = self.critic_local(states, actions) # compute critic loss critic_loss = F.mse_loss(current_Q1, target_Q) + F.mse_loss(current_Q2, target_Q) # update the critic self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # delay the policy update if self.learn_counter % policy_freq == 0: # get actor_local predicted next 
action and use critic_local to complete actions_pred = self.actor_local.forward(states) actor_loss = -self.critic_local.Q1(states, actions_pred).mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # delay update of actor and critic target models self.soft_update(self.actor_local, self.actor_target, TAU) self.soft_update(self.critic_local, self.critic_target, TAU) def soft_update(self, local_model, target_model, tau): # Perform soft update of the target networks # at every time step, keep 1-tau of target network # and add only a small fraction (tau) of the current online networks # to prevent oszillation for local_param, target_param in zip(local_model.parameters(), target_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) def step(self, state, action, reward, next_state, done): # at every iteration, add new SARS' trajectory to memory, then learn from batches # if learning_step is reached and enough samples are in the buffer self.memory.add(state, action, reward, next_state, done) if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA)
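# A hedged sketch of how the TD3 agent above might be driven by a Gym-style environment
# loop; env, n_episodes, and max_t are assumptions, and the episode bookkeeping is
# illustrative rather than the original training script.
def run_td3(env, agent, n_episodes=200, max_t=1000):
    scores = []
    for _ in range(n_episodes):
        state = env.reset()
        agent.reset()                      # reset the OU noise process each episode
        score = 0.0
        for _ in range(max_t):
            action = agent.act(state)      # noisy action for exploration
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)  # store and learn
            state, score = next_state, score + reward
            if done:
                break
        scores.append(score)
    return scores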
class DDPG_Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, hidden_layers_actor=[32, 32], hidden_layers_critic=[32, 32, 32], buffer_size=int(1e5), batch_size=128, gamma=0.99, tau=1e-3, learning_rate_actor=1e-4, learning_rate_critic=5e-4, weight_decay=0.0001, update_every=20, num_batches=10, add_noise=True, head_name_actor='Actor', head_name_critic="DuelingDQN", head_scale_actor='max', head_scale_critic="max"): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed hidden_layers (list of int ; optional): number of each layer nodes buffer_size (int ; optional): replay buffer size batch_size (int; optional): minibatch size gamma (float; optional): discount factor tau (float; optional): for soft update of target parameters learning_rate_X (float; optional): learning rate for X=actor or critic update_every (int; optional): how often to update the network """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.lr_actor = learning_rate_actor self.lr_critic = learning_rate_critic self.update_every = update_every self.num_batches = num_batches self.weight_decay_critic = weight_decay self.add_noise = add_noise # detect GPU device self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") ### SET UP THE ACTOR NETWORK ### # Assign model parameters and assign device model_params_actor = [ state_size, action_size, seed, hidden_layers_actor, head_name_actor, head_scale_actor ] # Actor Network (w/ Target Network) self.actor_local = Actor(*model_params_actor).to(self.device) self.actor_target = Actor(*model_params_actor).to(self.device) # Set up optimizer for the Actor network self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor) ### SET UP THE CRITIC NETWORK ### model_params_critic = [ state_size, action_size, seed, hidden_layers_critic, head_name_critic, head_scale_critic ] # Critic Network (w/ Target Network) self.critic_local = Critic(*model_params_critic).to(self.device) self.critic_target = Critic(*model_params_critic).to(self.device) # Set up optimizer for the Critic Network self.critic_optimizer = optim.Adam( self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic) # Noise process self.noise = OUNoise(action_size, self.seed) # Initialize Replay memory self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed, self.device) # Initialize time step (for updating every self.update_every steps) self.t_step = 0 def step(self, state, action, reward, next_state, done, timestep): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) if len(self.memory ) > self.batch_size and timestep % self.update_every == 0: for i in range(self.num_batches): experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, add_noise=True): """Returns actions for given state as per current policy. 
Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().to(self.device) # Go to evaluation mode and get Q values for current state self.actor_local.eval() with torch.no_grad(): action_values = self.actor_local(state).cpu().data.numpy() # get back to train mode self.actor_local.train() # Add noise to the action probabilities if add_noise: action_values += self.noise.sample() return np.clip(action_values, -1.0, 1.0) def reset(self): self.noise.reset() def learn(self, experiences, gamma): # From the experiences buffer, separate out S_t, A_t, R_t, S_t+1, done data states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # Update the target networks using the local and target networks self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. X_target = tau*X_local + (1 - tau)*X_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
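# A minimal Ornstein-Uhlenbeck noise sketch matching the OUNoise(size, seed) interface
# (sample()/reset()) used by the agents above; the mu, theta, and sigma defaults are the
# values commonly seen in DDPG implementations and are assumptions here.
import copy
import random
import numpy as np

class OUNoise:
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        random.seed(seed)
        self.reset()

    def reset(self):
        # return the internal state to the long-run mean
        self.state = copy.copy(self.mu)

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): temporally correlated exploration noise
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(len(x))
        self.state = x + dx
        return self.state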
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, hidden_layers=[64, 64], drop_p=0.3, with_dueling=False, isDDQN=False): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed hidden_layers (array): Hidden number of nodes in each layer drop_p (float [0-1]) : Probability of dropping nodes (implementation of dropout) with_dueling (boolean) : If true, network is dueling network, otherwise false. isDDQN (boolean) : If true, double dqn in implemented, otherwise false. """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed, hidden_layers=hidden_layers, drop_p=drop_p, dueling=with_dueling).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed, hidden_layers=hidden_layers, drop_p=drop_p, dueling=with_dueling).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 # Parameter instance of DDQN. self.isDDQN = isDDQN def step(self, state, action, reward, next_state, done): """Takes a step and with each time step sample from buffer and learn""" # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. 
Params ====== experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences if self.isDDQN: # Get optimal action from local model and feed forward next_states on target network best_local_actions = self.qnetwork_local(states).max( 1)[1].unsqueeze(1) double_dqn_targets = self.qnetwork_target(next_states) # Get value of the target dqn vialocal optimal action Q_targets_next = torch.gather(double_dqn_targets, 1, best_local_actions) else: # Get max predicted Q values (for next states) from target model (without ddqn) Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # update target network self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
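# A hedged usage sketch for the DQN Agent above: a standard epsilon-greedy training loop
# with exponential epsilon decay. The environment handle and hyperparameter values are
# illustrative assumptions, not part of the original code.
def dqn_train(env, agent, n_episodes=2000, max_t=1000,
              eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    scores, eps = [], eps_start
    for _ in range(n_episodes):
        state = env.reset()
        score = 0.0
        for _ in range(max_t):
            action = agent.act(state, eps)                       # epsilon-greedy action
            next_state, reward, done, _ = env.step(action)
            agent.step(state, action, reward, next_state, done)  # store and (maybe) learn
            state, score = next_state, score + reward
            if done:
                break
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)                      # decay exploration
    return scores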
class DDPG(Model): """ Interface """ def __init__(self, name, args, sess=None, reuse=False, log_tensorboard=True, save=True): self.learn_steps = 0 # hyperparameters self.gamma = args[name]['gamma'] self.tau = args[name]['tau'] self.init_noise_sigma = args[name]['init_noise_sigma'] self.noise_decay = args[name]['noise_decay'] # replay buffer self.buffer = ReplayBuffer(sample_size=args['batch_size'], max_len=args[name]['buffer_size']) super(DDPG, self).__init__(name, args, sess=sess, reuse=reuse, build_graph=True, log_tensorboard=log_tensorboard, save=save) self._initialize_target_net() @property def main_variables(self): return self.actor_critic.trainable_variables @property def _target_variables(self): return self._target_actor_critic.trainable_variables def act(self, state): self.sess.run(self.noise_op) state = state.reshape((-1, self.state_size)) action = self.sess.run(self.actor_critic.actor_action, feed_dict={self.actor_critic.state: state}) self.sess.run(self.denoise_op) return np.squeeze(action) def step(self, state, action, reward, next_state, done): self.buffer.add(state, action, reward, next_state, done) if len(self.buffer) > self.buffer.sample_size + 100: self._learn() """ Implementation """ def _build_graph(self): # env info self._setup_env() # main actor-critic self.actor_critic = self._create_actor_critic() # target actor-critic self._target_actor_critic = self._create_actor_critic(is_target=True) # losses self.actor_loss, self.critic_loss = self._loss() # optimizating operation self.opt_op = self._optimize([self.actor_loss, self.critic_loss]) # target net update operations self.init_target_op, self.update_target_op = self._targetnet_ops() # operations that add/remove noise from parameters self.noise_op, self.denoise_op = self._noise_params() def _setup_env(self): self.state_size = self._args[self.name]['state_size'] self.action_size = self._args[self.name]['action_size'] self.env_info = {} with tf.name_scope('placeholders'): self.env_info['state'] = tf.placeholder(tf.float32, shape=(None, self.state_size), name='state') self.env_info['action'] = tf.placeholder(tf.float32, shape=(None, self.action_size), name='action') self.env_info['next_state'] = tf.placeholder( tf.float32, shape=(None, self.state_size), name='next_state') self.env_info['reward'] = tf.placeholder(tf.float32, shape=(None, 1), name='reward') self.env_info['done'] = tf.placeholder(tf.uint8, shape=(None, 1), name='done') def _create_actor_critic(self, is_target=False): name = 'target_actor_critic' if is_target else 'actor_critic' log_tensorboard = False if is_target else True actor_critic = ActorCritic(name, self._args, self.env_info, self.action_size, reuse=self.reuse, log_tensorboard=log_tensorboard, is_target=is_target) return actor_critic def _loss(self): with tf.name_scope('loss'): with tf.name_scope('l2_loss'): encoder_l2_loss = tf.losses.get_regularization_loss( scope=self.actor_critic.variable_scope + '/state_encoder', name='encoder_l2_loss') actor_l2_loss = tf.losses.get_regularization_loss( scope=self.actor_critic.variable_scope + '/actor', name='actor_l2_loss') critic_l2_loss = tf.losses.get_regularization_loss( scope=self.actor_critic.variable_scope + '/critic', name='critic_l2_loss') with tf.name_scope('actor_loss'): actor_loss = tf.negative( tf.reduce_mean(self.actor_critic.Q_with_actor), name='actor_loss') + encoder_l2_loss + actor_l2_loss with tf.name_scope('critic_loss'): target_Q = tf.stop_gradient( self.env_info['reward'] + self.gamma * tf.cast(1 - self.env_info['done'], tf.float32) * 
self._target_actor_critic.Q_with_actor, name='target_Q') critic_loss = tf.losses.mean_squared_error( target_Q, self.actor_critic.Q) + encoder_l2_loss + critic_l2_loss if self.log_tensorboard: tf.summary.scalar('actor_l2_loss_', actor_l2_loss) tf.summary.scalar('critic_l2_loss_', critic_l2_loss) tf.summary.scalar('encoder_l2_loss_', encoder_l2_loss) tf.summary.scalar('actor_loss_', actor_loss) tf.summary.scalar('critic_loss_', critic_loss) return actor_loss, critic_loss def _optimize(self, losses): with tf.variable_scope('optimizer'): actor_loss, critic_loss = losses actor_opt_op = self._optimize_objective(actor_loss, 'actor') critic_opt_op = self._optimize_objective(critic_loss, 'critic') opt_op = tf.group(actor_opt_op, critic_opt_op) return opt_op def _optimize_objective(self, loss, name): # params for optimizer learning_rate = self._args['actor_critic'][name][ 'learning_rate'] if 'learning_rate' in self._args['actor_critic'][ name] else 1e-3 beta1 = self._args['actor_critic'][name][ 'beta1'] if 'beta1' in self._args['actor_critic'][name] else .9 beta2 = self._args['actor_critic'][name][ 'beta2'] if 'beta2' in self._args['actor_critic'][name] else .999 clip_norm = self._args[name]['actor_critic'][ 'clip_norm'] if 'clip_norm' in self._args['actor_critic'] else 5. with tf.variable_scope(name + '_opt', reuse=self.reuse): # setup optimizer self._optimizer = tf.train.AdamOptimizer( learning_rate=learning_rate, beta1=beta1, beta2=beta2) tvars = self.actor_critic.actor_trainable_variables if name == 'actor' else self.actor_critic.critic_trainable_variables grads, tvars = list( zip(*self._optimizer.compute_gradients(loss, var_list=tvars))) grads, _ = tf.clip_by_global_norm(grads, clip_norm) opt_op = self._optimizer.apply_gradients(zip(grads, tvars)) if self.log_tensorboard: with tf.name_scope(name): with tf.name_scope('gradients_'): for grad, var in zip(grads, tvars): if grad is not None: tf.summary.histogram(var.name.replace(':0', ''), grad) with tf.name_scope('params_'): for var in tvars: tf.summary.histogram(var.name.replace(':0', ''), var) return opt_op def _targetnet_ops(self): with tf.name_scope('target_net_op'): target_main_var_pairs = list( zip(self._target_variables, self.main_variables)) init_target_op = list( map(lambda v: tf.assign(v[0], v[1], name='init_target_op'), target_main_var_pairs)) update_target_op = list( map( lambda v: tf.assign(v[0], self.tau * v[1] + (1. 
- self.tau) * v[0], name='update_target_op'), target_main_var_pairs)) return init_target_op, update_target_op def _learn(self): states, actions, rewards, next_states, dones = self.buffer.sample() feed_dict = { self.env_info['state']: states, self.env_info['action']: actions, self.env_info['reward']: rewards, self.env_info['next_state']: next_states, self.env_info['done']: dones, } # update the main networks if self.log_tensorboard: _, summary = self.sess.run([self.opt_op, self.merged_op], feed_dict=feed_dict) self.writer.add_summary(summary, self.learn_steps) else: _ = self.sess.run(self.opt_op, feed_dict=feed_dict) # update the target networks self.sess.run(self.update_target_op) self.learn_steps += 1 def _noise_params(self): with tf.variable_scope('noise'): noise_sigma = tf.get_variable('noise_sigma', initializer=self.init_noise_sigma, trainable=False) noise_decay_op = tf.assign(noise_sigma, self.noise_decay * noise_sigma, name='noise_decay_op') param_noise_pairs = [] for var in self.actor_critic.actor_perturbable_variables: noise = tf.truncated_normal(tf.shape(var), stddev=noise_sigma) param_noise_pairs.append((var, noise)) with tf.control_dependencies([noise_decay_op]): noise_op = list( map( lambda v: tf.assign(v[0], v[0] + v[1], name='noise_op' ), param_noise_pairs)) denoise_op = list( map( lambda v: tf.assign( v[0], v[0] - v[1], name='denoise_op'), param_noise_pairs)) return noise_op, denoise_op def _initialize_target_net(self): self.sess.run(self.init_target_op)
class DrlAgent: def __init__(self, sess, is_train, dim_state, dim_action, num_paths, actor_learn_rate, critic_learn_rate, tau, buffer_size, mini_batch, ep_begin, epsilon_end, gamma, max_epoch, seed=66): self.__is_train = is_train self.__dim_state = dim_state self.__dim_action = dim_action self.__mini_batch = mini_batch self.__ep_begin = ep_begin self.__gamma = gamma self.__max_epoch = max_epoch self.__actor = ActorNetwork(sess, dim_state, dim_action, 1.0, actor_learn_rate, tau, num_paths) self.__critic = CriticNetwork(sess, dim_state, dim_action, critic_learn_rate, tau) self.__replay = ReplayBuffer(buffer_size, seed) self.__explorer = Explorer(ep_begin, epsilon_end, max_epoch, dim_action, num_paths, seed) self.__state_curt = np.zeros(dim_state) self.__action_curt = self.__explorer.convert_action( np.ones(dim_action)) self.__episode = 0 self.__step = 0 def target_paras_init(self): self.__actor.update_target_paras() self.__critic.update_target_paras() def predict(self, state, reward): action_original = self.__actor.predict([state])[0] if not self.__is_train: return action_original action = self.__explorer.get_act(action_original) self.__replay.add(self.__state_curt, self.__action_curt, reward, state) self.__state_curt = state self.__action_curt = action if len(self.__replay) > self.__mini_batch: self.train() self.__step += 1 if self.__step >= self.__max_epoch: self.__step = 0 self.__episode += 1 self.__explorer.reset_ep(self.__ep_begin) return action def train(self): batch_state, batch_action, batch_reward, batch_state_next = self.__replay.sample_batch( self.__mini_batch) weights = [1.0] * self.__mini_batch weights = np.expand_dims(weights, axis=1) target_q = self.__critic.predict_target( batch_state_next, self.__actor.predict_target(batch_state_next)) value_q = self.__critic.predict(batch_state, batch_action) batch_y = [] batch_error = [] for k in range(len(batch_reward)): target_y = batch_reward[k] + self.__gamma * target_q[k] batch_error.append(abs(target_y - value_q[k])) batch_y.append(target_y) predicted_q, _ = self.__critic.train(batch_state, batch_action, batch_y, weights) a_outs = self.__actor.predict(batch_state) grads = self.__critic.calculate_gradients(batch_state, a_outs) weighted_grads = weights * grads[0] self.__actor.train(batch_state, weighted_grads) self.__actor.update_target_paras() self.__critic.update_target_paras()
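# The target computation in DrlAgent.train above, written as a vectorized NumPy sketch.
# The original buffer stores no terminal flag, so the target is always bootstrapped; the
# optional dones mask below is an assumption about how episode ends would be handled.
import numpy as np

def td_targets(rewards, target_q, gamma, dones=None):
    """y_k = r_k + gamma * Q'(s'_k, mu'(s'_k)), optionally masked at terminal states."""
    rewards = np.asarray(rewards, dtype=np.float32).reshape(-1, 1)
    target_q = np.asarray(target_q, dtype=np.float32).reshape(-1, 1)
    mask = 1.0 if dones is None else 1.0 - np.asarray(dones, dtype=np.float32).reshape(-1, 1)
    return rewards + gamma * mask * target_q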
class DQN_Agent(): """ Interacts an learns from the environment. """ def __init__(self, state_size, action_size, seed, GAMMA=GAMMA, TAU=TAU, LR=LR, UPDATE_EVERY=UPDATE_EVERY, BUFFER_SIZE=BUFFER_SIZE, BATCH_SIZE=BATCH_SIZE): """ Initialize the agent. ========== PARAMETERS ========== state_size (int) = observation dimension of the environment action_size (int) = dimension of each action seed (int) = random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.gamma = GAMMA self.tau = TAU self.lr = LR self.update_every = UPDATE_EVERY self.buffer_size = BUFFER_SIZE self.batch_size = BATCH_SIZE self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") # instantiate online local and target network for weight updates self.qnetwork_local = QNetwork(state_size, action_size, seed).to(self.device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(self.device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr) # create a replay buffer self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed, self.device) # time steps for updating target network every time t_step % 4 == 0 self.t_step = 0 def step(self, state, action, reward, next_state, done): ''' Append a SARS sequence to memory, then every update_every steps learn from experiences''' self.memory.add(state, action, reward, next_state, done) self.t_step = (self.t_step + 1) % self.update_every if self.t_step == 0: # in case enough samples are available in internal memory, sample and learn if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, eps=0.): """ Choose action from an epsilon-greedy policy ========== PARAMETERS ========== state (array) = current state space eps (float) = epsilon, for epsilon-greedy action choice """ state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local.forward(state) self.qnetwork_local.train() if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """ Update the value parameters using experience tuples sampled from ReplayBuffer ========== PARAMETERS ========== experiences = Tuple of torch.Variable: SARS', done gamma (float) = discount factor to weight rewards """ states, actions, rewards, next_states, dones = experiences # calculate max predicted Q values for the next states using target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # calculate expected Q vaues from the local model Q_expected = self.qnetwork_local(states).gather(1, actions) # compute MSE Loss loss = F.mse_loss(Q_expected, Q_targets) self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau) def soft_update(self, local_model, target_model, tau): """ Soft update for model parameters, every update steps as defined above theta_target = tau * theta_local + (1-tau)*theta_target ========== PARAMETERS ========== local_model, target_model = PyTorch Models, weights will be copied from-to tau = interpolation parameter, type=float """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(self.tau * local_param.data + 
(1.0 - self.tau) * target_param.data)
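# A minimal QNetwork sketch matching the QNetwork(state_size, action_size, seed)
# constructor used by DQN_Agent above; the two 64-unit hidden layers are an assumption
# about the architecture, not the original model definition.
import torch
import torch.nn as nn
import torch.nn.functional as F

class QNetwork(nn.Module):
    def __init__(self, state_size, action_size, seed, fc1_units=64, fc2_units=64):
        super().__init__()
        torch.manual_seed(seed)
        self.fc1 = nn.Linear(state_size, fc1_units)
        self.fc2 = nn.Linear(fc1_units, fc2_units)
        self.fc3 = nn.Linear(fc2_units, action_size)  # one Q-value per discrete action

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        return self.fc3(x)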
class DDPG: def __init__(self, env=gym.make('Pendulum-v0'), s_dim=2, a_dim=1, gamma=0.99, episodes=100, tau=0.001, buffer_size=1e06, minibatch_size=64, actor_lr=0.001, critic_lr=0.001, save_name='final_weights', render=False): self.save_name = save_name self.render = render self.env = env self.upper_bound = env.action_space.high[0] self.lower_bound = env.action_space.low[0] self.EPISODES = episodes self.MAX_TIME_STEPS = 200 self.s_dim = s_dim self.a_dim = a_dim self.GAMMA = gamma self.TAU = tau self.buffer_size = buffer_size self.minibatch_size = minibatch_size self.actor_lr = actor_lr self.critic_lr = critic_lr self.ou_noise = OUNoise(mean=np.zeros(1)) self.actor = Actor(self.s_dim, self.a_dim).model() self.target_actor = Actor(self.s_dim, self.a_dim).model() self.actor_opt = tf.keras.optimizers.Adam(learning_rate=self.actor_lr) self.target_actor.set_weights(self.actor.get_weights()) self.critic = Critic(self.s_dim, self.a_dim).model() self.critic_opt = tf.keras.optimizers.Adam( learning_rate=self.critic_lr) self.target_critic = Critic(self.s_dim, self.a_dim).model() self.target_critic.set_weights(self.critic.get_weights()) self.replay_buffer = ReplayBuffer(self.buffer_size) def update_target(self): # Two methods to update the target actor # Method 1: self.target_actor.set_weights( np.array(self.actor.get_weights()) * self.TAU + np.array(self.target_actor.get_weights()) * (1 - self.TAU)) self.target_critic.set_weights( np.array(self.critic.get_weights()) * self.TAU + np.array(self.target_critic.get_weights()) * (1 - self.TAU)) """ # Method 2: new_weights = [] target_variables = self.target_critic.weights for i, variable in enumerate(self.critic.weights): new_weights.append(variable * self.TAU + target_variables[i] * (1 - self.TAU)) self.target_critic.set_weights(new_weights) new_weights = [] target_variables = self.target_actor.weights for i, variable in enumerate(self.actor.weights): new_weights.append(variable * self.TAU + target_variables[i] * (1 - self.TAU)) self.target_actor.set_weights(new_weights) """ def train_step(self): s_batch, a_batch, r_batch, d_batch, s2_batch = self.replay_buffer.sample_batch( self.minibatch_size) """ mu_prime = self.target_actor(s2_batch) # predictions by target actor Q_prime = self.target_critic([s2_batch, mu_prime]) # predictions by target critic y = np.zeros_like(Q_prime) for k in range(self.minibatch_size): if d_batch[k]: y[k] = r_batch[k] else: y[k] = r_batch[k] + self.GAMMA * Q_prime[k] # y = r_batch + gamma * Q_prime checkpoint_path = "training/cp_critic.ckpt" checkpoint_dir = os.path.dirname(checkpoint_path) # Create a callback that saves the model's weights cp_callback1 = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_dir, save_weights_only=True, verbose=1) self.critic.train_on_batch([s_batch, a_batch], y) # self.critic.fit([s_batch, a_batch], y, verbose=0, steps_per_epoch=8, callbacks=[cp_callback1]) with tf.GradientTape(persistent=True) as tape: a = self.actor(s_batch) tape.watch(a) theta = self.actor.trainable_variables q = self.critic([s_batch, a]) dq_da = tape.gradient(q, a) da_dtheta = tape.gradient(a, theta, output_gradients=-dq_da) self.actor_opt.apply_gradients(zip(da_dtheta, self.actor.trainable_variables)) """ with tf.GradientTape() as tape: target_actions = self.target_actor(s2_batch) y = r_batch + self.GAMMA * self.target_critic( [s2_batch, target_actions]) critic_value = self.critic([s_batch, a_batch]) critic_loss = tf.math.reduce_mean(tf.math.square(y - critic_value)) critic_grad = tape.gradient(critic_loss, 
self.critic.trainable_variables) self.critic_opt.apply_gradients( zip(critic_grad, self.critic.trainable_variables)) with tf.GradientTape() as tape: actions = self.actor(s_batch) q = self.critic([s_batch, actions]) # critic_value # Used `-value` as we want to maximize the value given # by the critic for our actions actor_loss = -tf.math.reduce_mean(q) actor_grad = tape.gradient(actor_loss, self.actor.trainable_variables) self.actor_opt.apply_gradients( zip(actor_grad, self.actor.trainable_variables)) self.update_target() return np.mean(q) def policy(self, s): # since batch normalization is done on self.actor, it is multiplied with upper_bound if s.ndim == 1: s = s[None, :] action = self.actor(s) * self.upper_bound + self.ou_noise() action = np.clip(action, self.lower_bound, self.upper_bound) return action def train(self): # To store reward history of each episode ep_reward_list = [] # To store average reward history of last few episodes avg_reward_list = [] monitor = Monitor([1, 1], titles=['Reward', 'Loss'], log=2) with Loop_handler( ) as interruption: # to properly save even if ctrl+C is pressed for eps in range(self.EPISODES): episode_reward = 0 s = self.env.reset() """ if an env is created using the "gym.make" method, it will terminate after 200 steps """ for t in range(self.MAX_TIME_STEPS): # done = False # while not done: if self.render: self.env.render() a = self.policy(s) s_, r, done, _ = self.env.step(a) self.replay_buffer.add(np.reshape(s, (self.s_dim, )), np.reshape(a, (self.a_dim, )), r, done, np.reshape(s_, (self.s_dim, ))) episode_reward += r if self.replay_buffer.size() > self.minibatch_size: q = self.train_step() s = s_.reshape(1, -1) if interruption(): break ep_reward_list.append(episode_reward) # Mean of last 40 episodes avg_reward = np.mean(ep_reward_list[-40:]) print("Episode * {} * Avg Reward is ==> {}".format( eps, avg_reward)) avg_reward_list.append(avg_reward) monitor.add_data(avg_reward, q) self.save_weights( save_name=self.save_name) # if you want to save weights self.plot_results(avg_reward=avg_reward_list, train=True) def save_weights(self, save_name='final_weights'): self.actor.save_weights("training/%s_actor.h5" % save_name) self.critic.save_weights("training/%s_critic.h5" % save_name) self.target_actor.save_weights("training/%s_target_actor.h5" % save_name) self.target_critic.save_weights("training/%s_target_critic.h5" % save_name) # to save in other format self.target_actor.save_weights('training/%s_actor_weights' % save_name, save_format='tf') self.target_critic.save_weights('training/%s_critic_weights' % save_name, save_format='tf') print('Training completed and network weights saved') # For evaluation of the policy learned def collect_data(self, act_net, iterations=1000): a_all, states_all = [], [] obs = self.env.reset() for t in range(iterations): obs = np.squeeze(obs) if obs.ndim == 1: a = act_net(obs[None, :]) else: a = act_net(obs) obs, _, done, _ = self.env.step(a) states_all.append(obs) a_all.append(a) # self.env.render() # Uncomment this to see the actor in action (But not in python notebook) # if done: # break states = np.squeeze( np.array(states_all)) # cos(theta), sin(theta), theta_dot a_all = np.squeeze(np.array(a_all)) return states, a_all def plot_results(self, avg_reward=None, actions=None, states=None, train=False, title=None): # An additional way to visualize the avg episode rewards if train: plt.figure() plt.plot(avg_reward) plt.xlabel("Episode") plt.ylabel("Avg. 
Episodic Reward") plt.show() else: # works only for the Pendulum-v0 environment fig, ax = plt.subplots(3, sharex=True) theta = np.arctan2(states[:, 1], states[:, 0]) ax[0].set_ylabel('u') ax[0].plot(np.squeeze(actions)) ax[1].set_ylabel(u'$\\theta$') ax[1].plot(theta) # ax[1].plot(states[:, 0]) ax[2].set_ylabel(u'$\\omega$') ax[2].plot(states[:, 2]) # angular velocity fig.canvas.set_window_title(title)
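# A hedged sketch of the callable OUNoise(mean=...) process used by DDPG.policy() above
# (distinct from the sample()/reset() interface used by the PyTorch agents); the
# std_deviation, theta, and dt defaults follow common DDPG examples and are assumptions.
import numpy as np

class OUNoise:
    def __init__(self, mean, std_deviation=0.2, theta=0.15, dt=1e-2, x_initial=None):
        self.mean = mean
        self.std_dev = std_deviation
        self.theta = theta
        self.dt = dt
        self.x_prev = x_initial if x_initial is not None else np.zeros_like(mean)

    def __call__(self):
        # Euler-Maruyama step of the Ornstein-Uhlenbeck process
        x = (self.x_prev
             + self.theta * (self.mean - self.x_prev) * self.dt
             + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape))
        self.x_prev = x
        return x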
class Agent: def __init__(self, state_size, action_size, device, buffer_size=int(1e5), batch_size=64, gamma=0.99, tau=1e-3, lr=5e-4, update_every=4): self.state_size = state_size self.action_size = action_size self.device = device self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.lr = lr self.update_every = update_every # model settings self.qnet_local = Model(state_size, action_size).to(self.device) self.qnet_target = Model(state_size, action_size).to(self.device) self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=self.lr) # replay buffer settings self.replay_buffer = ReplayBuffer(self.buffer_size, self.batch_size) self.update_step = 0 def step(self, state, action, reward, next_state, done): self.replay_buffer.add(state, action, reward, next_state, done) self.update_step = (self.update_step + 1) % self.update_every if (self.update_step == 0) and (len(self.replay_buffer) > self.batch_size): experiences = self.replay_buffer.sample() self.learn(experiences) def act(self, state, eps=0.0): state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) self.qnet_local.eval() with torch.no_grad(): action_values = self.qnet_local(state) self.qnet_local.train() if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return np.random.choice(self.action_size) def learn(self, experiences): states, actions, rewards, next_states, dones = experiences # convert to tensors and send to device states = torch.from_numpy(states).float().to(self.device) actions = torch.from_numpy(actions).long().to(self.device) rewards = torch.from_numpy(rewards).float().to(self.device) next_states = torch.from_numpy(next_states).float().to(self.device) dones = torch.from_numpy(dones).float().to(self.device) # max returns max values (0) and indices (1) # unsqueeze is needed to add batch dim B x 1 q_max = self.qnet_target(next_states).detach().max(1)[0].unsqueeze(1) y = rewards + self.gamma * q_max * (1 - dones) # select action values corresponding to actions # this is what .gather does # note for the expected we pass states, not next_states q_expected = self.qnet_local(states).gather(1, actions) loss = F.mse_loss(q_expected, y) self.optimizer.zero_grad() loss.backward() self.optimizer.step() self.soft_update() def soft_update(self): for target_param, local_param in zip(self.qnet_target.parameters(), self.qnet_local.parameters()): target_param.data.copy_(self.tau * local_param.data + (1 - self.tau) * target_param.data) def train(self, env, n_episodes=2000, max_t=1000, eps_start=1.0, eps_end=0.01, eps_decay=0.995): scores = [] scores_window = deque(maxlen=100) eps = eps_start brain_name = env.brain_names[0] for i_episode in range(1, n_episodes + 1): env_info = env.reset(train_mode=True)[brain_name] state = env_info.vector_observations[0] score = 0 for t in range(max_t): action = self.act(state, eps) env_info = env.step(action)[brain_name] next_state = env_info.vector_observations[0] reward = env_info.rewards[0] done = env_info.local_done[0] self.step(state, action, reward, next_state, done) state = next_state score += reward if done: break scores_window.append(score) scores.append(score) avg_scores = np.mean(scores_window) eps = max(eps_end, eps_decay * eps) print(f'\rEpisode {i_episode}\tAverage Score: {avg_scores:.2f}', end='') if i_episode % 100 == 0: print( f'\rEpisode {i_episode}\tAverage Score: {avg_scores:.2f}') if avg_scores >= 13.0: print(f'\nEnvironment solved in {i_episode - 100} episodes!' 
f'\tAverage Score: {np.mean(scores_window):.2f}') torch.save(self.qnet_local.state_dict(), 'checkpoint.pth') break return scores def evaluate(self, env): brain_name = env.brain_names[0] env_info = env.reset(train_mode=False)[brain_name] state = env_info.vector_observations[0] score = 0 for i in range(2000): action = self.act(state) env_info = env.step(action)[brain_name] next_state = env_info.vector_observations[0] reward = env_info.rewards[0] done = env_info.local_done[0] state = next_state score += reward if done: break print(f'Total score: {score:.2f}')
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, agent_id, args): self.state_size = state_size self.action_size = action_size self.seed = args['seed'] self.device = args['device'] self.args = args # Q-Network self.actor_network = ActorNetwork(state_size, action_size, args).to(self.device) self.actor_target = ActorNetwork(state_size, action_size, args).to(self.device) self.actor_optimizer = optim.Adam(self.actor_network.parameters(), lr=args['LR_ACTOR']) #Model takes too long to run --> load model weights from previous run (took > 24hours on my machine) if not agent_id: self.actor_network.load_state_dict(torch.load( args['agent_p0_path']), strict=False) self.actor_target.load_state_dict(torch.load( args['agent_p0_path']), strict=False) else: self.actor_network.load_state_dict(torch.load( args['agent_p1_path']), strict=False) self.actor_target.load_state_dict(torch.load( args['agent_p1_path']), strict=False) # Replay memory self.memory = ReplayBuffer(action_size, args['BUFFER_SIZE'], args['BATCH_SIZE'], self.seed) # Noise process self.noise = OUNoise(action_size, self.seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) if len(self.memory) > self.args['BATCH_SIZE']: experiences = self.memory.sample() self.train(experiences) def act(self, current_state): with torch.no_grad(): self.actor_network.eval() input_state = torch.from_numpy(current_state).float().to( self.device) with torch.no_grad(): action = self.actor_network(input_state).cpu().data.numpy() self.actor_network.train() action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def train(self, experiences): global states_ global next_states_ global actions_ global max_min_actions_vector global max_min_states_vector states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # with torch.no_grad(): # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) Q_targets_next = mCritic.target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (GAMMA * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = mCritic.network(states, actions) mCritic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss mCritic.optimizer.zero_grad() mCritic_loss.backward() mCritic.optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_network(states) actor_loss = -mCritic.network(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(mCritic.network, mCritic.target, TAU) self.soft_update(self.actor_network, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
def train(self, transitions: int, eps_max: float = 0.5, eps_min: float = 0., buffer_size: int = 10000, batch_size: int = 128, shaping_coef: float = 300., progress_upd_step: int = 0, start_training: int = 10000, to_sink: bool = False): history = ReplayBuffer(size=buffer_size) progress_upd_step = progress_upd_step if progress_upd_step else transitions // 100 log = { "alpha": self.alpha, "gamma": self.gamma, "buffer_size": buffer_size, "batch_size": batch_size, "tau": self.tau, "shaping_coef": shaping_coef, "eps_max": eps_max, "eps_min": eps_min, "bins": self.num_bins, "to_sink": to_sink, "step": [], "reward_mean": [], "reward_std": [] } state = self.reset() t = tqdm(range(transitions)) for i in t: eps = eps_max - (eps_max - eps_min) * i / transitions if random() < eps: action = self.env.action_space.sample() else: action = self.act(state) next_state, reward, done, _ = self.env.step(action) reward += shaping_coef * (self.gamma * np.abs(next_state[1]) - np.abs(state[1])) done_ = next_state[0] > 0.5 history.add((state, action, next_state, reward, done_)) state = self.reset() if done else next_state if i > start_training: self.update(history.sample(batch_size)) # soft update with torch.no_grad(): for param, param_target in zip(self.dqn.parameters(), self.dqn_target.parameters()): param_target.data.mul_(1 - self.tau) param_target.data.add_(self.tau * param.data) if (i + 1) % progress_upd_step == 0: reward_mean, reward_std = self.evaluate_policy() log["step"].append(i) log["reward_mean"].append(reward_mean) log["reward_std"].append(reward_std) t.set_description(f"step: {i + 1} | Rmean = {reward_mean:0.4f} | Rstd = {reward_std:0.4f}") if to_sink and reward_mean >= 90 and self.evaluate_policy(episodes=100)[0] >= 90: self.sink(history, start_training, eps, shaping_coef) shaping_coef = 1 to_sink = False return log
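# A hedged sketch of the evaluate_policy() helper that the train() loops above rely on:
# it runs a few greedy episodes and returns the mean and standard deviation of the
# undiscounted returns. The default episode count and the greedy use of self.act() are
# assumptions about the agent's interface; it is intended as a method of the agent.
import numpy as np

def evaluate_policy(self, episodes=10):
    returns = []
    for _ in range(episodes):
        state, done, total = self.env.reset(), False, 0.0
        while not done:
            state, reward, done, _ = self.env.step(self.act(state))  # greedy action
            total += reward
        returns.append(total)
    return np.mean(returns), np.std(returns)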
def main(): ########## # CONFIG # ########## # Target Reward tgt_score = 0.5 # Device device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # Seed seed = 7 seeding(seed) # Model Architecture # Actor hidden_in_actor = 256 hidden_out_actor = 128 lr_actor = 1e-4 # Critic hidden_in_critic = 256 hidden_out_critic = 128 lr_critic = 3e-4 weight_decay_critic = 0 # Episodes number_of_episodes = 10000 episode_length = 2000 # Buffer buffer_size = int(1e6) batchsize = 512 # Agent Update Frequency episode_per_update = 1 # Rewards Discounts Factor discount_factor = 0.95 # Soft Update Weight tau = 1e-2 # Noise Process noise_factor = 2 noise_reduction = 0.9999 noise_floor = 0.0 # Window win_len = 100 # Save Frequency save_interval = 200 # Logger log_path = os.getcwd() + "/log" logger = SummaryWriter(log_dir=log_path) # Model Directory model_dir = os.getcwd() + "/model_dir" os.makedirs(model_dir, exist_ok=True) # Load Saved Model load_model = False #################### # Load Environment # #################### env = UnityEnvironment(file_name="./Tennis_Linux_NoVis/Tennis.x86_64") # Get brain brain_name = env.brain_names[0] brain = env.brains[brain_name] print('Brain Name:', brain_name) # Reset the environment env_info = env.reset(train_mode=True)[brain_name] # Number of Agents num_agents = len(env_info.agents) print('Number of agents:', num_agents) # size of each action action_size = brain.vector_action_space_size print('Size of each action:', action_size) # examine the state space states = env_info.vector_observations state_size = states.shape[1] print('There are {} agents. Each observes a state with length: {}'.format( states.shape[0], state_size)) #################### # Show Progressbar # #################### widget = [ 'episode: ', pb.Counter(), '/', str(number_of_episodes), ' ', pb.Percentage(), ' ', pb.ETA(), ' ', pb.Bar(marker=pb.RotatingMarker()), ' ' ] timer = pb.ProgressBar(widgets=widget, maxval=number_of_episodes).start() start = time.time() ############### # Multi Agent # ############### maddpg = MADDPG(state_size, action_size, num_agents, hidden_in_actor, hidden_out_actor, lr_actor, hidden_in_critic, hidden_out_critic, lr_critic, weight_decay_critic, discount_factor, tau, seed, device) if load_model: load_dict_list = torch.load(os.path.join(model_dir, 'episode-saved.pt')) for i in range(num_agents): maddpg.maddpg_agent[i].actor.load_state_dict( load_dict_list[i]['actor_params']) maddpg.maddpg_agent[i].actor_optimizer.load_state_dict( load_dict_list[i]['actor_optim_params']) maddpg.maddpg_agent[i].critic.load_state_dict( load_dict_list[i]['critic_params']) maddpg.maddpg_agent[i].critic_optimizer.load_state_dict( load_dict_list[i]['critic_optim_params']) ################# # Replay Buffer # ################# rebuffer = ReplayBuffer(buffer_size, seed, device) ################# # TRAINING LOOP # ################# # initialize scores scores_history = [] scores_window = deque(maxlen=save_interval) # i_episode = 0 for i_episode in range(number_of_episodes): timer.update(i_episode) # Reset Environmet env_info = env.reset(train_mode=True)[brain_name] states = env_info.vector_observations scores = np.zeros(num_agents) # Reset Agent maddpg.reset() # episode_t = 0 for episode_t in range(episode_length): # Explore with decaying noise factor actions = maddpg.act(states, noise_factor=noise_factor) env_info = env.step(actions)[brain_name] # Environment reacts next_states = env_info.vector_observations # get the next states rewards = env_info.rewards # get the rewards dones = 
env_info.local_done # see if episode has finished ################### # Save Experience # ################### rebuffer.add(states, actions, rewards, next_states, dones) scores += rewards states = next_states if any(dones): break scores_history.append(np.max(scores)) # save most recent score scores_window.append(np.max(scores)) # save most recent score avg_rewards = np.mean(scores_window) noise_factor = max(noise_floor, noise_factor * noise_reduction) # Reduce Noise Factor ######### # LEARN # ######### if len(rebuffer) > batchsize and i_episode % episode_per_update == 0: for a_i in range(num_agents): samples = rebuffer.sample(batchsize) maddpg.update(samples, a_i, logger) # Soft Update maddpg.update_targets() ################## # Track Progress # ################## if i_episode % save_interval == 0 or i_episode == number_of_episodes - 1: logger.add_scalars('rewards', { 'Avg Reward': avg_rewards, 'Noise Factor': noise_factor }, i_episode) print( '\nElapsed time {:.1f} \t Update Count {} \t Last Episode t {}' .format((time.time() - start) / 60, maddpg.update_count, episode_t), '\nEpisode {} \tAverage Score: {:.2f} \tNoise Factor {:2f}'. format(i_episode, avg_rewards, noise_factor), end="\n") ############## # Save Model # ############## save_info = ((i_episode) % save_interval == 0 or i_episode == number_of_episodes) if save_info: save_dict_list = [] for i in range(num_agents): save_dict = { 'actor_params': maddpg.maddpg_agent[i].actor.state_dict(), 'actor_optim_params': maddpg.maddpg_agent[i].actor_optimizer.state_dict(), 'critic_params': maddpg.maddpg_agent[i].critic.state_dict(), 'critic_optim_params': maddpg.maddpg_agent[i].critic_optimizer.state_dict() } save_dict_list.append(save_dict) torch.save(save_dict_list, os.path.join(model_dir, 'episode-Latest.pt')) pd.Series(scores_history).to_csv( os.path.join(model_dir, "scores.csv")) # plot the scores rolling_mean = pd.Series(scores_history).rolling(win_len).mean() fig = plt.figure() ax = fig.add_subplot(111) plt.plot(np.arange(len(scores_history)), scores_history) plt.axhline(y=tgt_score, color='r', linestyle='dashed') plt.plot(rolling_mean, lw=3) plt.ylabel('Score') plt.xlabel('Episode #') # plt.show() fig.savefig(os.path.join(model_dir, 'Average_Score.pdf')) fig.savefig(os.path.join(model_dir, 'Average_Score.jpg')) plt.close() if avg_rewards > tgt_score: logger.add_scalars('rewards', { 'Avg Reward': avg_rewards, 'Noise Factor': noise_factor }, i_episode) print( '\nElapsed time {:.1f} \t Update Count {} \t Last Episode t {}' .format((time.time() - start) / 60, maddpg.update_count, episode_t), '\nEpisode {} \tAverage Score: {:.2f} \tNoise Factor {:2f}'. format(i_episode, avg_rewards, noise_factor), end="\n") break env.close() logger.close() timer.finish()
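# A hedged sketch of the seeding() helper called at the top of main(); seeding Python's
# random module, NumPy, and torch is the usual intent, but the original definition is
# not shown here.
import random
import numpy as np
import torch

def seeding(seed=1):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)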
class MADDPG: def __init__(self, num_agents, local_obs_dim, local_action_size, global_obs_dim, global_action_size, discount_factor=0.95, tau=0.02, device=device, random_seed=4, lr_critic=1.0e-4, weight_decay=0.0): super(MADDPG, self).__init__() # parameter configuration self.num_agents = num_agents self.device = device self.discount_factor = discount_factor self.tau = tau self.num_agents = num_agents self.global_action_size = global_action_size self.global_obs_dim = global_obs_dim torch.manual_seed(random_seed) random.seed(random_seed) self.random_seed = random_seed self.weight_decay = weight_decay # define actors self.actors = [ DDPGActor(num_agents, local_obs_dim, local_action_size, global_obs_dim, global_action_size, device=device) for _ in range(num_agents) ] # define centralized critic self.critic = Critic(global_obs_dim, global_action_size, self.random_seed).to(self.device) self.target_critic = Critic(global_obs_dim, global_action_size, self.random_seed).to(self.device) hard_update(self.target_critic, self.critic) self.critic_optimizer = Adam(self.critic.parameters(), lr=lr_critic, weight_decay=self.weight_decay) # noise coef self.noise_coef = 1.0 self.noise_coef_decay = 1e-6 # Replay memory self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed) def act(self, obs_all_agents): actions = [ ddpg_actor.act(local_obs, self.noise_coef) for ddpg_actor, local_obs in zip(self.actors, obs_all_agents) ] return actions def target_act(self, obs_all_agents): actions = [ ddpg_actor.target_act(local_obs, noise_coef=0, add_noise=False) for ddpg_actor, local_obs in zip(self.actors, obs_all_agents) ] return actions def step(self, obs, obs_full, actions, rewards, next_obs, next_obs_full, dones, timestep): self.memory.add(obs, obs_full, actions, rewards, next_obs, next_obs_full, dones) timestep = timestep % TRAIN_EVERY # Learn, if enough samples are available in memory if len(self.memory) > BATCH_SIZE and timestep == 0: for _ in range(N_LEARN_UPDATES): experiences = self.memory.sample() self.learn(experiences, self.discount_factor) def learn(self, experiences, gamma): obs, obs_full, action, reward, next_obs, next_obs_full, done = experiences obs = obs.permute(1, 0, -1) # agent_id * batch_size * state_size obs_full = obs_full.view(-1, self.global_obs_dim) next_obs = next_obs.permute(1, 0, -1) next_obs_full = next_obs_full.view(-1, self.global_obs_dim) action = action.reshape(-1, self.global_action_size) # ---------------- update centralized critic ----------------------- # self.critic_optimizer.zero_grad() # get target actions from all target_actors target_actions = np.array(self.target_act(next_obs)) target_actions = torch.from_numpy(target_actions).float().permute( 1, 0, -1) target_actions = target_actions.reshape(-1, self.global_action_size) # update critic with torch.no_grad(): q_next = self.target_critic.forward(next_obs_full, target_actions.to(self.device)) y = reward + gamma * q_next * (1 - done) q = self.critic.forward(obs_full, action) critic_loss = 0 for i in range(self.num_agents): critic_loss += F.mse_loss(q, y[:, i].detach().reshape( -1, 1)) / self.num_agents critic_loss.backward() self.critic_optimizer.step() # ---------------- update actor for all agents --------------------- # for ii in range(len(self.actors)): self.actors[ii].actor_optimizer.zero_grad() q_action = [ self.actors[i].actor_local(ob) if i == ii \ else self.actors[i].actor_local(ob).detach() for i, ob in enumerate(obs) ] q_action = torch.stack(q_action).permute(1, 0, -1) q_action = q_action.reshape(-1, 
self.global_action_size).to( self.device) # policy_gradient actor_loss = -self.critic.forward(obs_full, q_action).mean() actor_loss.backward() self.actors[ii].actor_optimizer.step() # --------------- soft update all target networks ------------------- # soft_update(self.target_critic, self.critic, self.tau) for actor in self.actors: actor.update_target(self.tau) # -------------- reset noise --------------------------------------- # for actor in self.actors: actor.action_noise.reset() self.noise_coef -= self.noise_coef_decay if self.noise_coef < 0.01: self.noise_coef = 0.01
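The MADDPG class above calls hard_update and soft_update helpers that are not defined in this section. A minimal sketch of the usual implementations, matching the argument order used above (target network first, source network second):

def hard_update(target, source):
    """Copy source parameters into target exactly (used once right after construction)."""
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)

def soft_update(target, source, tau):
    """Polyak average: theta_target = tau * theta_source + (1 - tau) * theta_target."""
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)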
class DDPG: def __init__(self, env, batch_size, mem_size, discount, actor_params, critic_params): self._batch_size = batch_size self._mem_size = mem_size self._discount = discount self._sess = tensorflow.Session() k_backend.set_session(self._sess) self._env = env self._state_dim = env.observation_space.shape[0] self._action_dim = env.action_space.shape[0] self._action_min = env.action_space.low self._action_max = env.action_space.high self._state_min = env.observation_space.low self._state_max = env.observation_space.high self._actor = Actor(self._sess, self._state_dim, self._action_dim, self._action_min, self._action_max, actor_params) self._critic = Critic(self._sess, 0.5, self._state_dim, self._action_dim, critic_params) self._memory = ReplayBuffer(mem_size) def get_action(self, state): return self._actor._model.predict(state) def train(self): ''' No training takes place until the replay buffer contains at least batch size number of experiences ''' if (self._memory.size() > self._batch_size): self._train() def _train(self): states, actions, rewards, done, next_states = self._memory.sample( self._batch_size) self._train_critic(states, actions, rewards, done, next_states) action_gradients = self._critic.action_gradients(states, actions) self._actor.train(states, action_gradients) def q_estimate(self, state, action): return self._critic._model.predict(state, action) def _get_q_targets(self, next_states, done, rewards): ''' q = r if done else r + gamma * qnext ''' # use actor network to determine the next action under current policy # estimate Q values from the critic network actions = self.get_action(next_states) qnext = self.q_estimate(next_states, actions) q_targets = [ reward if end else reward + self._discount * next_q for (reward, next_q, end) in zip(rewards, qnext, done) ] return q_targets def _train_critic(self, states, actions, rewards, done, next_states): q_targets = self._get_q_targets(next_states, done, rewards) self._critic.train(states, actions, q_targets) def experience(self, state, action, reward, done, next_state): # store in replay buffer self._memory.add(state, action, reward, done, next_state) self.train()
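ReplayBuffer is referenced throughout this section but never defined in it. A minimal deque-based sketch, assuming the (state, action, reward, done, next_state) ordering that the DDPG class above adds and unpacks from sample():

import random
from collections import deque

import numpy as np

class ReplayBuffer:
    """Fixed-size buffer of (state, action, reward, done, next_state) tuples."""
    def __init__(self, max_size):
        self._buffer = deque(maxlen=max_size)

    def add(self, state, action, reward, done, next_state):
        self._buffer.append((state, action, reward, done, next_state))

    def size(self):
        return len(self._buffer)

    def sample(self, batch_size):
        batch = random.sample(self._buffer, batch_size)
        states, actions, rewards, dones, next_states = map(np.array, zip(*batch))
        return states, actions, rewards, dones, next_states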
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, agent_id): self.state_size = state_size self.action_size = action_size self.seed = args['seed'] self.device = args['device'] #self.args = args # Q-Network self.actor_network = ActorNetwork(state_size, action_size).to(self.device) self.actor_target = ActorNetwork(state_size, action_size).to(self.device) self.actor_optimizer = optim.Adam(self.actor_network.parameters(), lr=args['LR_ACTOR']) #Model takes too long to run --> load model weights from previous run (took > 24hours on my machine) #if not agent_id: # self.actor_network.load_state_dict(torch.load(args['agent_p0_path']), strict=False) # self.actor_target.load_state_dict(torch.load(args['agent_p0_path']), strict=False) #else: # self.actor_network.load_state_dict(torch.load(args['agent_p1_path']), strict=False) # self.actor_target.load_state_dict(torch.load(args['agent_p1_path']), strict=False) # Replay memory self.memory = ReplayBuffer(action_size, args['BUFFER_SIZE'], args['BATCH_SIZE'], self.device, self.seed) # Noise process self.noise = OUNoise(action_size, self.seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 self.mCriticLoss = 0 self.actorLoss = 0 def step(self, state, action, reward, next_state, done, mCritic): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) if len(self.memory) > args['BATCH_SIZE']: experiences = self.memory.sample() self.train(experiences, mCritic) def act(self, current_state): with torch.no_grad(): self.actor_network.eval() # PUT CONDITIONAL CHECK -> if CNN reshape, ELSE...dont #input_state = torch.from_numpy(current_state).float().reshape(args['reshape_size']).unsqueeze(0).unsqueeze(0).to(self.device) input_state = torch.from_numpy(current_state).float().to(self.device) with torch.no_grad(): action = self.actor_network(input_state).cpu().data.numpy() self.actor_network.train() #action += self.noise.sample() return action def reset(self): self.noise.reset() def train(self, experiences, mCritic): global states_ global next_states_ global actions_ global max_min_actions_vector global max_min_states_vector states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # with torch.no_grad(): # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) #PUT CONDITIONAL CHECK: if CNN reshape ELSE..Dont... #Q_targets_next = mCritic.target(next_states, actions_next[np.newaxis, :]) Q_targets_next = mCritic.target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (args['GAMMA'] * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = mCritic.network(states, actions) mCritic_loss = F.mse_loss(Q_expected, Q_targets)
class Agent(): """ DDPG Agent, interacts with environment and learns from environment """ def __init__(self, device, state_size, n_agents, action_size, random_seed, \ buffer_size, batch_size, gamma, TAU, lr_actor, lr_critic, weight_decay, \ learn_interval, learn_num, ou_sigma, ou_theta, checkpoint_folder = './'): # Set Computational device self.DEVICE = device # Init State, action and agent dimensions self.state_size = state_size self.n_agents = n_agents self.action_size = action_size self.seed = random.seed(random_seed) self.l_step = 0 self.log_interval = 200 # Init Hyperparameters self.BUFFER_SIZE = buffer_size self.BATCH_SIZE = batch_size self.GAMMA = gamma self.TAU = TAU self.LR_ACTOR = lr_actor self.LR_CRITIC = lr_critic self.WEIGHT_DECAY = weight_decay self.LEARN_INTERVAL = learn_interval self.LEARN_NUM = learn_num # Init Actor Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor) # Init Critic Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=lr_critic, weight_decay=weight_decay) # Init Noise Process self.noise = OUNoise((n_agents, action_size), random_seed, mu=0., theta=ou_theta, sigma=ou_sigma) # Init Replay Memory self.memory = ReplayBuffer(device, action_size, buffer_size, batch_size, random_seed) # think def act(self, states, add_noise=True): """ Decide what action to take next """ # evaluate state through actor_local states = torch.from_numpy(states).float().to(self.DEVICE) actions = np.zeros((self.n_agents, self.action_size)) self.actor_local.eval() # put actor_local network in "evaluation" mode with torch.no_grad(): for n, state in enumerate(states): actions[n, :] = self.actor_local(state).cpu().data.numpy() self.actor_local.train() # put actor_local back into "training" mode # add noise for better performance if add_noise: actions += self.noise.sample() return np.clip(actions, -1, 1) # embody def step(self, t, s, a, r, s_, done): """ Commit step into the brain """ # Save SARS' to replay buffer --- state-action-reward-next_state tuple for n in range(self.n_agents): # self.memory.add(s, a, r, s_, done) # print ("going to learn 10 times") self.memory.add(s[n], a[n], r[n], s_[n], done[n]) if t % self.LEARN_INTERVAL != 0: return # Learn (if enough samples are available in memory ) if len(self.memory) > self.BATCH_SIZE: # print ("going to learn 10 times") for _ in range(self.LEARN_NUM): experiences = self.memory.sample() # get a memory sample self.learn(experiences, self.GAMMA) def reset(self): self.noise.reset() def learn(self, experiences, gamma): """ Learn from experiences, with discount factor gamma Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params: experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ------ Update Critic ------ # # get predicted next-state actions and Q values from target networks actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # compute Q targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # minimize loss self.critic_optimizer.zero_grad() critic_loss.backward() # torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1) self.critic_optimizer.step() # ------ Update Actor ------ # # compute actor loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # minimize loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ------ Update Target Networks ------ # self.soft_update(self.critic_local, self.critic_target, self.TAU) self.soft_update(self.actor_local, self.actor_target, self.TAU) # keep count of steps taken # self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
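OUNoise is used by most of the continuous-control agents in this section but is not defined here. A minimal Ornstein-Uhlenbeck sketch, assuming the (size, seed, mu, theta, sigma) constructor the agent above passes in; size may be a tuple such as (n_agents, action_size):

import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process; successive samples are correlated and drift back toward mu."""
    def __init__(self, size, seed, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.rng = np.random.RandomState(seed)
        self.reset()

    def reset(self):
        """Return the internal state to the mean."""
        self.state = self.mu.copy()

    def sample(self):
        """x <- x + theta*(mu - x) + sigma*N(0, 1)."""
        dx = self.theta * (self.mu - self.state) + self.sigma * self.rng.standard_normal(self.mu.shape)
        self.state = self.state + dx
        return self.state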
def train(sess, env, actor, critic): # Set up summary ops summary_ops, summary_vars = build_summaries() # Initialize Tensorflow variables sess.run(tf.global_variables_initializer()) writer = tf.summary.FileWriter(SUMMARY_DIR, sess.graph) # Initialize target network weights actor.update_target_network() critic.update_target_network() # Initialize replay memory replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED) for i in range(MAX_EPISODES): s = env.reset() episode_reward = 0 episode_ave_max_q = 0 noise = ExplorationNoise.ou_noise(OU_THETA, OU_MU, OU_SIGMA, MAX_STEPS_EPISODE) noise = ExplorationNoise.exp_decay(noise, EXPLORATION_TIME) for j in range(MAX_STEPS_EPISODE): if RENDER_ENV: env.render() # Add exploratory noise according to Ornstein-Uhlenbeck process to action # Decay exploration exponentially from 1 to 0 in EXPLORATION_TIME steps if i < EXPLORATION_TIME: a = actor.predict( np.reshape(s, (1, env.observation_space.shape[0]))) + noise[j] else: a = actor.predict( np.reshape(s, (1, env.observation_space.shape[0]))) s2, r, terminal, info = env.step(a[0]) replay_buffer.add(np.reshape(s, actor.state_dim), np.reshape(a, actor.action_dim), r, terminal, np.reshape(s2, actor.state_dim)) # Keep adding experience to the memory until # there are at least minibatch size samples if replay_buffer.size() > MINIBATCH_SIZE: s_batch, a_batch, r_batch, t_batch, s2_batch = \ replay_buffer.sample_batch(MINIBATCH_SIZE) # Calculate targets target_q = critic.predict_target( s2_batch, actor.predict_target(s2_batch)) y_i = [] for k in range(MINIBATCH_SIZE): # If state is terminal assign reward only if t_batch[k]: y_i.append(r_batch[k]) # Else assign reward + discounted target Q else: y_i.append(r_batch[k] + GAMMA * target_q[k]) # Update the critic given the targets predicted_q_value, _ = \ critic.train(s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1))) episode_ave_max_q += np.amax(predicted_q_value) # Update the actor policy using the sampled gradient a_outs = actor.predict(s_batch) a_grads = critic.action_gradients(s_batch, a_outs) actor.train(s_batch, a_grads[0]) # Update target networks actor.update_target_network() critic.update_target_network() s = s2 episode_reward += r if terminal or j == MAX_STEPS_EPISODE - 1: summary_str = sess.run(summary_ops, feed_dict={ summary_vars[0]: episode_reward, summary_vars[1]: episode_ave_max_q }) writer.add_summary(summary_str, i) writer.flush() print('Reward: %.2i' % int(episode_reward), ' | Episode', i, '| Qmax: %.4f' % (episode_ave_max_q / float(j))) break
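The y_i loop above builds critic targets sample by sample. The same computation in vectorized form (r + gamma * Q_target, with bootstrapping suppressed on terminal transitions), shown here with dummy arrays rather than the real batch:

import numpy as np

GAMMA = 0.99
r_batch = np.array([1.0, 0.5, -1.0])
target_q = np.array([[2.0], [3.0], [4.0]])      # critic.predict_target output, shape (batch, 1)
t_batch = np.array([False, False, True])        # terminal flags

# r + gamma * Q' for non-terminal samples, plain r for terminal ones
y_i = r_batch + GAMMA * target_q.squeeze(1) * (1.0 - t_batch.astype(np.float32))
print(y_i.reshape(-1, 1))   # [[2.98], [3.47], [-1.0]]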
class Agent(): """Interacts with and learns from the environment""" def __init__(self, state_size, action_size, fc1_units=256, fc2_units=128, device=torch.device('cpu')): """DQN agent Args: state_size (int): dimension of each state action_size (int): dimension of each action (or the number of action choices) seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.device = device # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, fc1_units=fc1_units, fc2_units=fc2_units).to(self.device) self.qnetwork_target = QNetwork(state_size, action_size, fc1_units=fc1_units, fc2_units=fc2_units).to(self.device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Initialze qnetwork_target parameters to qnetwork_local self.soft_update(self.qnetwork_local, self.qnetwork_target, 1) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, device=self.device) # Initialize the time step counter (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subnet and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Args: state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) # Set qnetwork_local to evaluation mode self.qnetwork_local.eval() # This operation should not be included in gradient calculation with torch.no_grad(): action_values = self.qnetwork_local(state) # Set back qnetwork_local to training mode self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Args: experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0].unsqueeze(1) # Compute Q tagets for current states with actual rewards Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ----- Update the target network ----- self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. theta_target = tau * theta_local + (1 - tau) * theta_target Args: local_model (torch.nn.Module): weights will be copied from target_model (torch.nn.MOdule): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1. - tau) * target_param.data)
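A quick standalone check of the two indexing steps in learn() above, using dummy tensors: max(1)[0].unsqueeze(1) extracts the best next-state value per row, while gather(1, actions) extracts the Q value of the action actually taken, so both line up as (batch, 1) for the MSE loss:

import torch

q_next = torch.tensor([[0.1, 0.9], [0.4, 0.2]])   # target net output for the next states
q_curr = torch.tensor([[0.3, 0.7], [0.5, 0.6]])   # local net output for the current states
actions = torch.tensor([[1], [0]])                # actions taken, shape (batch, 1)

q_targets_next = q_next.max(1)[0].unsqueeze(1)    # -> [[0.9], [0.4]]
q_expected = q_curr.gather(1, actions)            # -> [[0.7], [0.5]]
print(q_targets_next.shape, q_expected.shape)     # both torch.Size([2, 1])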
class DDPG(): """ Deep Deterministic Policy Gradient Model """ def __init__(self, state_size, action_size, random_seed): """ Initialize the model with arguments as follows: ARGUMENTS ========= - state_size (int) = dimension of input space - action_size (int) = dimension of action space - random_seed (int) = random seed Returns ======= - best learned action to take after Actor-Critic Learning """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) # create noise self.noise = OUNoise(action_size, random_seed) # create memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed, device) # Actor Networks (local online net + target net) self.actor_local = Actor(state_size, action_size, random_seed).to(device) self.actor_target = Actor(state_size, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr = LR_ACTOR) # Critic Networks (local online net + target net) self.critic_local = Critic(state_size, action_size, random_seed).to(device) self.critic_target = Critic(state_size, action_size, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=WEIGHT_DECAY) # instantiate online and target networks with same weights self.hard_update(self.actor_local, self.actor_target,) self.hard_update(self.critic_local, self.critic_target) def hard_update(self, local, target): for local_param, target_param in zip(local.parameters(), target.parameters()): target_param.data.copy_(local_param.data) def act(self, state, add_noise=True): """ Choose an action while interacting and learning in the environment """ state = torch.from_numpy(state).float().to(device) self.actor_local.eval() with torch.no_grad(): action = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: action += self.noise.sample() return np.clip(action, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma): states, actions, rewards, next_states, dones = experiences actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) def soft_update(self, local_model, target_model, tau): # Perform soft update of the target networks # at every time step, keep 1-tau of target network # and add only a small fraction (tau) of the current online networks # to prevent oszillation for local_param, target_param in zip(local_model.parameters(), target_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data) def step(self, state, action, reward, next_state, done): # at every iteration, add new SARS' trajectory to memory, then learn from batches # if learning_step is reached and enough samples are in the buffer self.memory.add(state, action, reward, next_state, done) if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() 
self.learn(experiences, GAMMA)
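A hedged usage sketch of the DDPG class above, not taken from the original code: it assumes an old-style Gym environment with a continuous action box ("Pendulum-v0" is only an example id) and that the class plus its Actor/Critic/OUNoise/ReplayBuffer dependencies are importable:

import gym

env = gym.make("Pendulum-v0")                        # example id only
agent = DDPG(state_size=env.observation_space.shape[0],
             action_size=env.action_space.shape[0],
             random_seed=0)

for episode in range(100):
    state = env.reset()
    agent.reset()                                     # clears the OU noise between episodes
    episode_reward = 0.0
    for t in range(200):
        action = agent.act(state, add_noise=True)
        next_state, reward, done, _ = env.step(action)
        agent.step(state, action, reward, next_state, done)
        state = next_state
        episode_reward += reward
        if done:
            break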
class DDQNAgent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, seed, hidden_layers=[64, 64], buffer_size=int(1e5), batch_size=64, gamma=0.99, tau=1e-3, learning_rate=5e-4, update_every=4, head_name="DuelingDQN", head_scale="max"): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed hidden_layers (list of int ; optional): number of each layer nodes buffer_size (int ; optional): replay buffer size batch_size (int; optional): minibatch size gamma (float; optional): discount factor tau (float; optional): for soft update of target parameters learning_rate (float; optional): learning rate update_every (int; optional): how often to update the network """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) self.buffer_size = buffer_size self.batch_size = batch_size self.gamma = gamma self.tau = tau self.lr = learning_rate self.update_every = update_every # detect GPU device self.device = torch.device( "cuda:0" if torch.cuda.is_available() else "cpu") # Assign model parameters and assign device model_params = [ state_size, action_size, seed, hidden_layers, head_name, head_scale ] self.qnetwork_local = QNetwork(*model_params).to(self.device) self.qnetwork_target = QNetwork(*model_params).to(self.device) # Set up optimizer self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=self.lr) # Initialize Replay memory self.memory = ReplayBuffer(action_size, self.buffer_size, self.batch_size, seed, self.device) # Initialize time step (for updating every self.update_every steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Update time step self.t_step = self.t_step + 1 # Learn every self.update_every time steps. if self.t_step % self.update_every == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences, self.gamma) def act(self, state, eps=0.): """Returns actions for given state as per current policy. 
Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(self.device) # Go to evaluation mode and get Q values for current state self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) # get back to train mode self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): # From the experiences buffer, separate out S_t, A_t, R_t, S_t+1, done data states, actions, rewards, next_states, dones = experiences # Go to evaluation mode self.qnetwork_target.eval() with torch.no_grad(): # get Q values for the next state Q_dash_local = self.qnetwork_local(next_states) Q_dash_target = self.qnetwork_target(next_states) # Find the predicted action based on the local Q_network argmax_action = torch.max(Q_dash_local, dim=1, keepdim=True)[1] # Get the Q-value from the target network Q_dash_max = Q_dash_target.gather(1, argmax_action) # Update the target value y = rewards + gamma * Q_dash_max * (1 - dones) # Go back to train mode self.qnetwork_target.train() # Predict Q-values based on the local network self.optimizer.zero_grad() Q = self.qnetwork_local(states) y_pred = Q.gather(1, actions) # TD-error/loss function loss = torch.sum((y - y_pred)**2) # Optimize the network loss.backward() self.optimizer.step() # Update the target network using the local and target networks self.soft_update(self.qnetwork_local, self.qnetwork_target, self.tau) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
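The defining Double-DQN step in learn() above, isolated with dummy tensors: the local network selects the greedy next action, and the target network supplies the value estimate for it:

import torch

q_local_next = torch.tensor([[0.2, 0.8], [0.9, 0.1]])    # local net on s'
q_target_next = torch.tensor([[1.0, 3.0], [2.0, 0.5]])   # target net on s'

argmax_action = torch.max(q_local_next, dim=1, keepdim=True)[1]   # [[1], [0]]
q_dash_max = q_target_next.gather(1, argmax_action)               # [[3.0], [2.0]]
print(q_dash_max)   # action choice from the local net, value from the target net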
class Agent(object): """ The Agent interacts with and learns from the environment. """ def __init__(self, state_size, action_size, num_agents, random_seed=0, params=params): """ Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(random_seed) self.params = params # Actor (Policy) Network (w/ Target Network) self.actor_local = Actor(state_size, action_size, random_seed).to(self.params['DEVICE']) self.actor_target = Actor(state_size, action_size, random_seed).to(self.params['DEVICE']) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.params['LR_ACTOR']) # Critic (Value) Network (w/ Target Network) self.critic_local = Critic(state_size, action_size, random_seed).to(self.params['DEVICE']) self.critic_target = Critic(state_size, action_size, random_seed).to(self.params['DEVICE']) self.critic_optimizer = optim.Adam( self.critic_local.parameters(), lr=self.params['LR_CRITIC'], weight_decay=self.params['WEIGHT_DECAY']) # Initialize target and local to same weights self.hard_update(self.actor_local, self.actor_target) self.hard_update(self.critic_local, self.critic_target) # Noise process self.noise = OUNoise(action_size, random_seed) # Replay memory self.memory = ReplayBuffer(action_size, self.params['BUFFER_SIZE'], self.params['BATCH_SIZE'], random_seed) def hard_update(self, local_model, target_model): """ Hard update model parameters. """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(local_param.data) def step(self, states, actions, rewards, next_states, dones): """ Save experiences in replay memory and use random sample from buffer to learn. """ # Save experience / reward, cater for when multiples for state, action, reward, next_state, done in zip( states, actions, rewards, next_states, dones): self.memory.add(state, action, reward, next_state, done) # Learn if enough samples are available in memory if len(self.memory) > self.params['BATCH_SIZE']: experiences = self.memory.sample() self.learn(experiences, self.params['GAMMA']) def act(self, states, add_noise=True): """ Returns actions for a given state as per current policy. """ states = torch.from_numpy(states).float().to(self.params['DEVICE']) actions = np.zeros((self.num_agents, self.action_size)) self.actor_local.eval() with torch.no_grad(): for i, state in enumerate(states): actions[i, :] = self.actor_local(state).cpu().data.numpy() self.actor_local.train() if add_noise: actions += self.noise.sample() return np.clip(actions, -1, 1) def reset(self): self.noise.reset() def learn(self, experiences, gamma=params['GAMMA']): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Update Critic (Value) # Get predicted next-state actions and Q-Values from target Network actions_next = self.actor_target(next_states) Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q Targets for current states (y_i) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimise the loss self.critic_optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_( self.critic_local.parameters(), 1) # Stabilize learning per benchmark guidelines self.critic_optimizer.step() # Update Actor (Policy) # Compute Actor Loss actions_pred = self.actor_local(states) actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # Update target networks self.soft_update(self.critic_local, self.critic_target, tau=self.params['TAU']) self.soft_update(self.actor_local, self.actor_target, tau=self.params['TAU']) def soft_update(self, local_model, target_model, tau=params['TAU']): """ Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """Code adapted from the Udacity course""" def __init__(self, state_size, action_size, seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(seed) # Q-Network self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device) self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed) # Initialize time step (for updating every UPDATE_EVERY steps) self.t_step = 0 def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every UPDATE_EVERY time steps. self.t_step = (self.t_step + 1) % UPDATE_EVERY if self.t_step == 0: # If enough samples are available in memory, get random subset and learn if len(self.memory) > BATCH_SIZE: experiences = self.memory.sample() self.learn(experiences, GAMMA) def act(self, state, eps=0.): """Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max action from max Q values (for next states) from target model indexes_of_Q_local_for_next_states = self.qnetwork_local( next_states).detach().max(1)[1].unsqueeze(1) Q_target_for_next_states = self.qnetwork_target(next_states).detach() Q_thetas = Q_target_for_next_states.gather( 1, indexes_of_Q_local_for_next_states) Q_targets = rewards + (gamma * Q_thetas * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ------------------- update target network ------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. Polyak averaging θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DDPG_Agent(): """Interacts with and learns from the environment.""" #self.state_size, self.action_size, self.seed, hidden_layers_actor, hidden_layers_critic, self.buffer_size, learning_rate_actor, learning_rate_critic def __init__(self, state_size, action_size, num_agents, seed, device, buffer_size=int(1e5), batch_size=128, num_batches = 5, update_every=10, gamma=0.99, tau=8e-3, learning_rate_actor=1e-3, learning_rate_critic=1e-3, weight_decay=0.0001, hidden_layers_actor=[32,32], hidden_layers_critic=[32, 32, 32], add_noise=True, start_eps=5.0, end_eps=0.0, end_eps_episode=500, agent_id=-1): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents seed (int): random seed hidden_layers (list of int ; optional): number of each layer nodes buffer_size (int ; optional): replay buffer size batch_size (int; optional): minibatch size gamma (float; optional): discount factor tau (float; optional): for soft update of target parameters learning_rate_X (float; optional): learning rate for X=actor or critic """ print('In DPPG_AGENT: seed = ', seed) self.state_size = state_size self.action_size = action_size self.num_agents = num_agents self.seed = random.seed(seed) self.device = device self.buffer_size = buffer_size self.batch_size = batch_size self.update_every = update_every self.num_batches = num_batches self.gamma = gamma self.tau = tau self.lr_actor = learning_rate_actor self.lr_critic = learning_rate_critic self.weight_decay_critic = weight_decay self.add_noise = add_noise self.start_eps = start_eps self.eps = start_eps self.end_eps = end_eps self.eps_decay = 1/(end_eps_episode*num_batches) # set decay rate based on epsilon end target self.timestep = 0 self.agent_id = agent_id ### SET UP THE ACTOR NETWORK ### # Assign model parameters and assign device model_params_actor = [state_size, action_size, seed, hidden_layers_actor] # Create the Actor Network (w/ Target Network) self.actor_local = Actor(*model_params_actor).to(self.device) self.actor_target = Actor(*model_params_actor).to(self.device) #print('actor_local network is: ', print(self.actor_local)) # Set up optimizer for the Actor network self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor) ### SET UP THE CRITIC NETWORK ### model_params_critic = [state_size, action_size, seed, hidden_layers_critic] # Create the Critic Network (w/ Target Network) self.critic_local = Critic(*model_params_critic).to(self.device) self.critic_target = Critic(*model_params_critic).to(self.device) # Set up optimizer for the Critic Network self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic) # Noise process self.noise = OUNoise(action_size, self.seed) # Replay memory self.memory = ReplayBuffer(action_size, buffer_size, batch_size, seed, device) def step(self, states, actions, rewards, next_states, dones, agent_number): # Increment timestep by 1 self.timestep += 1 # Save experience in replay memory self.memory.add(states, actions, rewards, next_states, dones) # If there are enough samples and a model update is to be made at this time step if len(self.memory) > self.batch_size and self.timestep%self.update_every == 0: # For each batch for i in range(self.num_batches): # Sample experiences from memory experiences = self.memory.sample() # Learn from the experience self.learn(experiences, self.gamma, agent_number) def act(self, state, scale_noise=True): 
"""Returns actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for epsilon-greedy action selection """ state = torch.from_numpy(state).float().to(self.device) # Go to evaluation mode and get Q values for current state self.actor_local.eval() with torch.no_grad(): # Get action for the agent and concatenate them action = [self.actor_local(state[0]).cpu().data.numpy()] # get back to train mode self.actor_local.train() # Add noise to the action probabilities # Note, we want the magnitude of noise to decrease as the agent keeps learning action += int(scale_noise)*(self.eps)*self.noise.sample() return np.clip(action, -1.0, 1.0) def reset(self): """ Reset the noise, and all neural network parameters for the current agent """ self.noise.reset() self.eps = self.start_eps self.timestep = 0 self.critic_local.reset_parameters() self.actor_local.reset_parameters() self.critic_target.reset_parameters() self.actor_target.reset_parameters() # ReSet up optimizer for the Actor network self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.lr_actor) # Set up optimizer for the Critic Network self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=self.lr_critic, weight_decay=self.weight_decay_critic) # Clear the experience buffer self.memory.clear_buffer() def reset_noise(self): """ Reset the noise only """ self.noise.reset() def learn(self, experiences, gamma, agent_number): #### DRAW FROM MEMORY AND PREPARE SARS DATA #### # From the experiences buffer, separate out S_t, A_t, R_t, S_t+1, done data states, actions, rewards, next_states, dones = experiences # NOTE: actions has dimension of batch_size x concatenated action for all agents # get the next action for the current agent for the entire batch actions_next = self.actor_target(next_states) # Construct next action vector for the agent if agent_number == 0: actions_next = torch.cat((actions_next, actions[:,2:]), dim=1) else: actions_next = torch.cat((actions[:,:2], actions_next), dim=1) #### UPDATE CRITIC #### # Get predicted next-state actions and Q values from target models # Get the next targets Q_targets_next = self.critic_target(next_states, actions_next) # Compute Q targets for current states (y_i) Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) # Define the loss critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() # Clip gradient @1 torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1) self.critic_optimizer.step() # --------------UPDATE ACTOR -----------------------# # Compute actor loss actions_pred = self.actor_local(states) # Construct action prediction vector relative to each agent if agent_number == 0: actions_pred = torch.cat((actions_pred, actions[:,2:]), dim=1) else: actions_pred = torch.cat((actions[:,:2], actions_pred), dim=1) # Calculate the loss. 
Note the negative sign since we use steepest ascent actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # Update the target networks using the local and target networks self.soft_update(self.critic_local, self.critic_target, self.tau) self.soft_update(self.actor_local, self.actor_target, self.tau) # update noise decay parameter self.eps -= self.eps_decay self.eps = max(self.eps, self.end_eps) self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. X_target = tau*X_local + (1 - tau)*X_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau*local_param.data + (1.0-tau)*target_param.data)
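The agent_number-dependent torch.cat above splices this agent's fresh action into the joint action vector the shared critic sees. The same operation on a dummy two-agent batch with two-dimensional actions:

import torch

actions = torch.tensor([[0.1, 0.2, 0.3, 0.4]])    # stored joint actions: agent 0 | agent 1
actions_next = torch.tensor([[0.9, 0.8]])          # fresh action from this agent's target actor

agent_number = 0
if agent_number == 0:
    joint = torch.cat((actions_next, actions[:, 2:]), dim=1)
else:
    joint = torch.cat((actions[:, :2], actions_next), dim=1)
print(joint)   # tensor([[0.9000, 0.8000, 0.3000, 0.4000]])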
def main(test=False): try: # Training setup if (test == False): # init wandb cloud wandb.init(project="dqn_maze") # hyperparameters wandb.config.batch_size = 32 wandb.config.gamma = 0.98 wandb.config.h1 = 128 wandb.config.h2 = 128 wandb.config.lr = 0.001 wandb.config.tau = 0.01 max_episodes = 5000 max_steps = 100 # Testing setup else: max_episodes = 20 max_steps = 100 np.random.seed(99) # init file log_file = open("log/statistics.txt", "w") log_file.write("episode;score;step;time;apples;mines;end\n") if (test == False): a1 = Agent(26, 4, [wandb.config.h1, wandb.config.h2], wandb.config.lr) a1.save_plot() else: a1 = Agent(fileName="model.h5") a1.remove_noise() # experiences replay buffer replay_buffer = ReplayBuffer() # generate env env1 = Prostredie(10, 10, [ 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 4, 0 ]) # Main game loop for episode in range(1, max_episodes + 1): start_time = time.time() state = env1.reset(testing=test) # reset score score, avg_loss = 0.0, 0.0 for step in range(1, max_steps + 1): if test == True: env1.render() time.sleep(0.2) else: # reset Q net's noise params a1.reset_noise() # human control #in_key = input() #if in_key == 'w': # action = 1 #elif in_key == 's': # action = 0 #elif in_key == 'a': # action = 2 #elif in_key == 'd': # action = 3 # random agent #action = np.random.randint(0, 4) # neural network action = np.argmax(a1.predict(state)) next_state, reward, done, info = env1.step(action) score += reward if (test == False): replay_buffer.add( (state, action, reward, next_state, float(done))) if len(replay_buffer.buffer) >= wandb.config.batch_size: loss = a1.train(replay_buffer, wandb.config.batch_size, wandb.config.gamma, wandb.config.tau) avg_loss += loss #else: # print(f"state: {state}") # print(f"action: {action}") # print(f"reward: {reward}") # print(f"done: {done}") # print(f"step: {step}") # print(f"replay_buffer_train: {len(replay_buffer.buffer)}") # print(f"epoch: {episode}/{max_episodes}") # print(f"score: {score}") # print(f"apples: {info['apples']}/{env1.count_apple}") # print(f"mines: {info['mines']}/{env1.count_mine}") # critical state = next_state if done == True: break # statistics avg_loss /= step if (test == False): log_dict = { 'epoch': episode, 'score': score, 'steps': step, 'loss': avg_loss, 'replay_buffer': len(replay_buffer.buffer), 'time': time.time() - start_time, 'apple': (info['apples'] / env1.count_apple) * 100.0, 'mine': (info['mines'] / env1.count_mine) * 100.0, 'end': info['end'] * 100.0 } wandb.log(log_dict) else: log_file.write( f"{episode};{score};{step};{time.time()-start_time};{(info['apples'] / env1.count_apple) * 100.0};{(info['mines'] / env1.count_mine) * 100.0};{info['end'] * 100.0}\n" ) except KeyboardInterrupt: print("Game terminated") sys.exit() finally: # Save model to file if (test == False): a1.model.save("model.h5") else: log_file.close() env1.f_startPosition.close() env1.f_apples.close() env1.f_mines.close()
action = env.action_space.sample() else: # select an action from the actor network with noise action = policy.select_action(state, noise=True) # the agent plays the action next_state, reward, done, info = env.step(action) # add to the total episode reward episode_reward += reward # check if the episode is done done_bool = 0 if episode_timesteps + 1 == env._max_episode_steps else float( done) # add to the memory buffer memory.add((state, next_state, action, reward, done_bool)) # update the state, episode timestep and total timestep state = next_state episode_timesteps += 1 total_timesteps += 1 eval_counter += 1 # train after the first episode if total_timesteps > start_timesteps: policy.train(memory) # save the model if total_timesteps % save_freq == 0: policy.save(int(total_timesteps / save_freq))
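The done_bool expression above implements the common time-limit trick: when an episode ends only because the step cap was reached, the transition is stored as non-terminal so the critic keeps bootstrapping from the next state. A small standalone illustration; max_episode_steps is a stand-in for env._max_episode_steps:

max_episode_steps = 200   # stand-in for env._max_episode_steps

def done_flag(done, episode_timesteps):
    """Store 0 when the episode was only cut off by the time limit, else the real done flag."""
    return 0 if episode_timesteps + 1 == max_episode_steps else float(done)

print(done_flag(True, 199))   # 0   -> truncation, keep bootstrapping from the next state
print(done_flag(True, 57))    # 1.0 -> genuine terminal transition
print(done_flag(False, 57))   # 0.0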
class DDPG(): """Reinforcement Learning agent that learns using DDPG.""" def __init__(self, task): self.task = task self.state_size = task.state_size self.action_size = task.action_size self.action_low = task.action_low self.action_high = task.action_high # Actor (Policy) Model self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high) self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high) # Critic (Value) Model self.critic_local = Critic(self.state_size, self.action_size) self.critic_target = Critic(self.state_size, self.action_size) # Initialize target model parameters with local model parameters self.critic_target.model.set_weights( self.critic_local.model.get_weights()) self.actor_target.model.set_weights( self.actor_local.model.get_weights()) # Noise process self.exploration_mu = 0.0 # 0.0 self.exploration_theta = 0.1 # 0.15 self.exploration_sigma = 0.1 # 0.2 self.noise = OUNoise(self.action_size, self.exploration_mu, self.exploration_theta, self.exploration_sigma) # Replay memory self.buffer_size = 100000 self.batch_size = 64 self.memory = ReplayBuffer(self.buffer_size, self.batch_size) # Algorithm parameters self.gamma = 0.99 # discount factor self.tau = 0.001 # for soft update of target parameters def reset_episode(self): self.noise.reset() state = self.task.reset() self.last_state = state return state def step(self, action, reward, next_state, done): # Save experience / reward self.memory.add(self.last_state, action, reward, next_state, done) # Learn, if enough samples are available in memory if len(self.memory) > self.batch_size: experiences = self.memory.sample() self.learn(experiences) # Roll over last state and action self.last_state = next_state def act(self, state): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action + self.noise.sample()) # add some noise for exploration def act_no_noise(self, state): """Returns actions for given state(s) as per current policy.""" state = np.reshape(state, [-1, self.state_size]) action = self.actor_local.model.predict(state)[0] return list(action) # add some noise for exploration def learn(self, experiences): """Update policy and value parameters using given batch of experience tuples.""" # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.) 
states = np.vstack([e.state for e in experiences if e is not None]) actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape( -1, self.action_size) rewards = np.array([e.reward for e in experiences if e is not None ]).astype(np.float32).reshape(-1, 1) dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1) next_states = np.vstack( [e.next_state for e in experiences if e is not None]) # Get predicted next-state actions and Q values from target models # Q_targets_next = critic_target(next_state, actor_target(next_state)) actions_next = self.actor_target.model.predict_on_batch(next_states) Q_targets_next = self.critic_target.model.predict_on_batch( [next_states, actions_next]) # Compute Q targets for current states and train critic model (local) Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones) self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets) # Train actor model (local) action_gradients = np.reshape( self.critic_local.get_action_gradients([states, actions, 0]), (-1, self.action_size)) self.actor_local.train_fn([states, action_gradients, 1]) # custom training function # Soft-update target models self.soft_update(self.critic_local.model, self.critic_target.model) self.soft_update(self.actor_local.model, self.actor_target.model) def soft_update(self, local_model, target_model): """Soft update model parameters.""" local_weights = np.array(local_model.get_weights()) target_weights = np.array(target_model.get_weights()) assert len(local_weights) == len( target_weights ), "Local and target model parameters must have the same size" new_weights = self.tau * local_weights + (1 - self.tau) * target_weights target_model.set_weights(new_weights)
class Agent(): """Interacts with and learns from the environment.""" def __init__(self, state_size, action_size, random_seed): """Initialize an Agent object. Params ====== state_size (int): dimension of each state action_size (int): dimension of each action num_agents (int): number of agents random_seed (int): random seed """ self.state_size = state_size self.action_size = action_size self.seed = random.seed(random_seed) self.eps = 3.0 self.eps_decay = 0.9999 # Actor Network (w/ Target Network) self.actor_local = Actor(state_size * 2, action_size, random_seed).to(device) self.actor_target = Actor(state_size * 2, action_size, random_seed).to(device) self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR) # Critic Network (w/ Target Network) self.critic_local = Critic(state_size * 2, action_size * 2, random_seed).to(device) self.critic_target = Critic(state_size * 2, action_size * 2, random_seed).to(device) self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC, weight_decay=0) # Noise process self.noise = OUNoise((1, action_size), random_seed) # Replay memory self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, random_seed) def step(self, state, action, reward, next_state, done, agent_number, learn_iterations=5): """Save experience in replay memory, and use random sample from buffer to learn.""" #self.timestep += 1 # Save experience / reward self.memory.add(state, action, reward, next_state, done) # Learn, if enough samples are available in memory and at learning interval settings if len(self.memory ) > BATCH_SIZE: #and self.timestep % LEARN_EVERY == 0: for _ in range(learn_iterations): experiences = self.memory.sample() self.learn(experiences, GAMMA, agent_number) def act(self, states, add_noise): """Returns actions for both agents as per current policy, given their respective states.""" states = torch.from_numpy(states).float().to(device) self.actor_local.eval() with torch.no_grad(): actions = self.actor_local(states).cpu().data.numpy() self.actor_local.train() # add noise to actions if add_noise: actions += self.eps * self.noise.sample() actions = np.clip(actions, -1, 1) return actions def reset(self): self.noise.reset() def learn(self, experiences, gamma, agent_number): """Update policy and value parameters using given batch of experience tuples. 
Q_targets = r + γ * critic_target(next_state, actor_target(next_state)) where: actor_target(state) -> action critic_target(state, action) -> Q-value Params ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # ---------------------------- update critic ---------------------------- # # Get predicted next-state actions and Q values from target models actions_next = self.actor_target(next_states) # Since the critic takes the actions of both agents we need to update only # one part of the given action if agent_number == 0: actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1) elif agent_number == 1: actions_next = torch.cat((actions[:, :2], actions_next), dim=1) # Compute Q targets for current states (y_i) Q_targets_next = self.critic_target(next_states, actions_next) Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Compute critic loss Q_expected = self.critic_local(states, actions) critic_loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.critic_optimizer.zero_grad() critic_loss.backward() self.critic_optimizer.step() # ---------------------------- update actor ---------------------------- # # Compute actor loss actions_pred = self.actor_local(states) # Since the critic takes the actions of both agents we need to update only # one part of the given action if agent_number == 0: actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1) elif agent_number == 1: actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1) # Compute actor loss actor_loss = -self.critic_local(states, actions_pred).mean() # Minimize the loss self.actor_optimizer.zero_grad() actor_loss.backward() self.actor_optimizer.step() # ----------------------- update target networks ----------------------- # self.soft_update(self.critic_local, self.critic_target, TAU) self.soft_update(self.actor_local, self.actor_target, TAU) # update epsilon self.eps *= self.eps_decay self.eps = max(self.eps, 1) self.noise.reset() def soft_update(self, local_model, target_model, tau): """Soft update model parameters. θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model: PyTorch model (weights will be copied from) target_model: PyTorch model (weights will be copied to) tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent(): """ Interacts with and learns from then environment.""" def __init__(self, state_size, action_size, seed, model=QNetwork): """Initialize an Agent object. Param ===== state_size (int): dimension of each state action_size (int): dimension of each action seed (int): random seed model (object): model to use Return ====== None """ self.state_size = state_size self.action_size = action_size self.seed = seed # Q-Network self.qnetwork_local = model(state_size, action_size, seed).to(device) self.qnetwork_target = model(state_size, action_size, seed).to(device) self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=hyperparameters["lr"]) # Replay memory self.memory = ReplayBuffer(action_size, hyperparameters["buffer_size"], hyperparameters["batch_size"], seed, device) # Initialize time step (for updating every hyperparameters["update_every"] steps) self.t_step = 0 # Init tracking of params wandb.login() wandb.init(project=project_name, name=name, config=hyperparameters) jovian.log_hyperparams(hyperparameters) def step(self, state, action, reward, next_state, done): # Save experience in replay memory self.memory.add(state, action, reward, next_state, done) # Learn every hyperparameters["update_every"] time steps. self.t_step = (self.t_step + 1) % hyperparameters["update_every"] if self.t_step == 0: # If enough samples are availble in memory, get random subset and learn if len(self.memory) > hyperparameters["batch_size"]: experiences = self.memory.sample() self.learn(experiences, hyperparameters["gamma"]) def act(self, state, eps=0.): """Return actions for given state as per current policy. Params ====== state (array_like): current state eps (float): epsilon, for espilon-greedy action selection """ state = torch.from_numpy(state).float().unsqueeze(0).to(device) self.qnetwork_local.eval() with torch.no_grad(): action_values = self.qnetwork_local(state) self.qnetwork_local.train() # Epsilon-greedy action selection if random.random() > eps: return np.argmax(action_values.cpu().data.numpy()) else: return random.choice(np.arange(self.action_size)) def learn(self, experiences, gamma): """Update value parameters using given batch of experience tuples. Params: ====== experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', d) tuples gamma (float): discount factor """ states, actions, rewards, next_states, dones = experiences # Get max predicted Q values (for next states) from target model Q_targets_next = self.qnetwork_target(next_states).detach().max( 1)[0].unsqueeze(1) # Compute Q targets for current states Q_targets = rewards + (gamma * Q_targets_next * (1 - dones)) # Get expected Q values from local model Q_expected = self.qnetwork_local(states).gather(1, actions) # Compute loss loss = F.mse_loss(Q_expected, Q_targets) # Minimize the loss self.optimizer.zero_grad() loss.backward() self.optimizer.step() # ---------------- update target network ----------------------- # self.soft_update(self.qnetwork_local, self.qnetwork_target, hyperparameters["tau"]) def soft_update(self, local_model, target_model, tau): """Soft update model parameters. 
θ_target = τ*θ_local + (1 - τ)*θ_target Params ====== local_model (PyTorch model): weights will be copied from target_model (PyTorch model): weights will be copied to tau (float): interpolation parameter """ for target_param, local_param in zip(target_model.parameters(), local_model.parameters()): target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data) def get_model_name(self): return name def get_project_name(self): return project_name
class DDPG: def __init__(self, env, state_dim, action_dim): self.name = 'DDPG' self.env = env self.state_dim = state_dim self.action_dim = action_dim self.AE = Actor(state_dim,action_dim).cuda() self.CE = Critic(state_dim,action_dim).cuda() self.AT = Actor(state_dim,action_dim).cuda() self.CT = Critic(state_dim,action_dim).cuda() self.replay_buffer = ReplayBuffer(REPLAY_BUFFER_SIZE) self.time_step = 0 self.AE.load_state_dict(torch.load(MODEL_DIR+'/obs/actor_340000.pkl')) # self.AT.load_state_dict(torch.load(MODEL_DIR+'/actor_280000.pkl')) # self.CE.load_state_dict(torch.load(MODEL_DIR+'/critic_280000.pkl')) # self.CT.load_state_dict(torch.load(MODEL_DIR+'/critic_280000.pkl')) self.optimizer_a = torch.optim.Adam(self.AE.parameters(), lr=1e-4) self.optimizer_c = torch.optim.Adam(self.CE.parameters(), lr=1e-4) def train(self): self.AE.train() data = self.replay_buffer.get_batch(BATCH_SIZE) bs = np.array([da[0] for da in data]) ba = np.array([da[1] for da in data]) br = np.array([da[2] for da in data]) bs_ = np.array([da[3] for da in data]) bd = np.array([da[4] for da in data]) bs = torch.FloatTensor(bs).cuda() ba = torch.FloatTensor(ba).cuda() br = torch.FloatTensor(br).cuda() bs_ = torch.FloatTensor(bs_).cuda() a_ = self.AT(bs_) ####################### NOTICE !!! ##################################### #q1 = self.CE(bs, a) ###### here use action batch !!! for policy loss!!! q2 = self.CE(bs, ba) ###### here use computed batch !!! for value loss!!! ######################################################################## q_ = self.CT(bs_, a_).detach() q_tar = torch.FloatTensor(BATCH_SIZE) for i in range(len(data)): if bd[i]: q_tar[i] = br[i] else: q_tar[i] = br[i]+GAMMA*q_[i] q_tar = q_tar.view(BATCH_SIZE,1).cuda() # minimize mse_loss of q2 and q_tar td_error = F.mse_loss(q2, q_tar.detach()) # minimize td_error self.CE.zero_grad() td_error.backward(retain_graph=True) self.optimizer_c.step() a = self.AE(bs) q1 = self.CE(bs, a) a_loss = -torch.mean(q1) # maximize q self.AE.zero_grad() a_loss.backward(retain_graph=True) self.optimizer_a.step() self.soft_replace() def soft_replace(self): for t,e in zip(self.AT.parameters(),self.AE.parameters()): t.data = (1-TAU)*t.data + TAU*e.data for t,e in zip(self.CT.parameters(),self.CE.parameters()): t.data = (1-TAU)*t.data + TAU*e.data def action(self, state): self.AE.eval() state_tensor = torch.FloatTensor(state).unsqueeze(0).cuda() # add batch_sz=1 ac_tensor = self.AE(state_tensor) ac = ac_tensor.squeeze(0).cpu().detach().numpy() return ac def perceive(self,state,action,reward,next_state,done): # Store transition (s_t,a_t,r_t,s_{t+1}) in replay buffer self.replay_buffer.add(state,action,reward,next_state,done) if self.replay_buffer.count() == REPLAY_START_SIZE: print('\n---------------Start training---------------') # Store transitions to replay start size then start training if self.replay_buffer.count() > REPLAY_START_SIZE: self.time_step += 1 self.train() if self.time_step % 10000 == 0 and self.time_step > 0: torch.save(self.AE.state_dict(), MODEL_DIR + '/obs/actor_{}.pkl'.format(self.time_step)) torch.save(self.CE.state_dict(), MODEL_DIR + '/obs/critic_{}.pkl'.format(self.time_step)) print('Save model state_dict successfully in obs dir...') return self.time_step