def __init__(self, env, env_obs, gamma=0.99, tau=0.001, lr_actor=1e-3, lr_critic=1e-3,
             weight_decay=0.1, batch_size=64, subpolicies=1, action_shape=2,
             replay_buffer_size=5000, replay_buffer_type="rb", noise=0.1, noise_decay=0.999,
             max_action=1, min_action=-1, teacher=False, alpha=0.1, bc=None):
    self.env = env
    self.subpolicies = subpolicies
    self.total_obs = np.sum(env_obs)
    self.weight_decay = weight_decay
    self.env_obs = env_obs
    self.max_action = max_action
    self.min_action = min_action
    self.action_shape = action_shape
    self.gamma = gamma
    self.tau = tau
    self.batch_size = batch_size
    self.replay_buffer_type = replay_buffer_type
    self.replay_buffer_size = replay_buffer_size
    self.init_noise = noise
    self.noise = noise
    self.noise_decay = noise_decay
    self.teacher = teacher
    self.bc = bc
    self.alpha = alpha
    # Actor input size is doubled when a teacher signal is concatenated to the observation
    self.mul = 1 if self.teacher is False else 2

    # One actor (and target actor) per agent and per sub-policy
    self.actors = [[Actor(self.mul * env_obs[agent], action_shape)
                    for i in range(self.subpolicies)] for agent in range(env.n)]
    self.actors_targets = [[Actor(self.mul * env_obs[agent], action_shape)
                            for i in range(self.subpolicies)] for agent in range(env.n)]

    # Centralized critics (and targets): one per agent, fed all observations and all actions
    self.critics = [Critic(self.mul * self.total_obs + action_shape * len(env.agents))
                    for _ in env.agents]
    self.critics_targets = [Critic(self.mul * self.total_obs + action_shape * len(env.agents))
                            for _ in env.agents]

    self.actors_optimizers = [[torch.optim.RMSprop(self.actors[agent][i].parameters(),
                                                   lr=lr_actor, weight_decay=weight_decay)
                               for i in range(self.subpolicies)]
                              for agent in range(len(env.agents))]
    self.critics_optimisers = [torch.optim.RMSprop(self.critics[agent].parameters(),
                                                   lr=lr_critic, weight_decay=weight_decay)
                               for agent in range(len(env.agents))]

    # Replay buffers: one per agent and sub-policy, or a single shared buffer
    if self.subpolicies > 1:
        if self.replay_buffer_type == "rb":
            self.replay_buffers = [[ReplayBuffer(self.replay_buffer_size)
                                    for _ in range(self.subpolicies)] for _ in range(env.n)]
        else:
            self.replay_buffers = [[PrioritizedReplayBuffer(self.replay_buffer_size)
                                    for _ in range(self.subpolicies)] for _ in range(env.n)]
    else:
        if self.replay_buffer_type == "rb":
            self.replay_buffers = ReplayBuffer(self.replay_buffer_size)
        else:
            self.replay_buffers = [[PrioritizedReplayBuffer(self.replay_buffer_size)
                                    for _ in range(self.subpolicies)] for _ in range(env.n)]
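# Hedged sketch (not part of the original snippet above): the multi-agent setup keeps
# per-agent, per-sub-policy target actors and per-agent target critics, but no target
# update step is shown in this excerpt. A Polyak soft update over those nested lists,
# assuming the networks are torch.nn.Module instances and self.tau is the mixing
# coefficient, might look like this (shown standalone; it would live as a method on the
# class that owns the __init__ above):
def soft_update_all_targets(self):
    def polyak(target, source, tau):
        # target <- (1 - tau) * target + tau * source
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

    for agent in range(len(self.actors)):
        for i in range(len(self.actors[agent])):
            polyak(self.actors_targets[agent][i], self.actors[agent][i], self.tau)
        polyak(self.critics_targets[agent], self.critics[agent], self.tau)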
class ActorCritic:
    def __init__(self, env):
        self.env = env
        self.num_robots = env.num_robots
        self.learning_rate = 0.0001
        self.epsilon = .9
        self.epsilon_decay = .99995
        self.eps_counter = 0
        self.gamma = .90
        self.tau = .01
        self.buffer_size = 1000000
        self.batch_size = 512
        self.hyper_parameters_lambda3 = 0.2
        self.hyper_parameters_eps = 0.2
        self.hyper_parameters_eps_d = 0.4
        self.demo_size = 1000
        self.time_str = time.strftime("%Y%m%d-%H%M%S")
        self.parent_dir = HOME + "/catkin_ws/src/Turtlebot3_Pheromone/src/DRLbasedController/weights"
        self.path = os.path.join(self.parent_dir, self.time_str)
        os.mkdir(self.path)

        # Replay buffer
        self.memory = deque(maxlen=1000000)
        self.replay_buffer = ExperienceReplayBuffer(total_timesteps=5000 * 256, type_buffer="HER")

        # File name
        self.file_name = "reward_{}_{}_{}".format(
            self.time_str, self.num_robots, self.replay_buffer.type_buffer)

        # Hidden layer sizes
        self.hid_list = [1024, 512, 512]

        # ===================================================================== #
        #                            Actor Model                                #
        # Chain rule: find the gradient of changing the actor network params    #
        # so as to get closest to the value network's predictions, i.e. de/dA.  #
        # Calculate de/dA as de/dC * dC/dA, where e is error, C critic, A actor #
        # ===================================================================== #
        self.actor_model = Actor(self.env.observation_space.shape,
                                 self.env.action_space.shape, self.hid_list)
        self.target_actor_model = Actor(self.env.observation_space.shape,
                                        self.env.action_space.shape, self.hid_list)
        self.actor_optim = optim.Adam(self.actor_model.parameters(), lr=self.learning_rate)

        # ===================================================================== #
        #                            Critic Model                               #
        # ===================================================================== #
        self.critic_model = Critic(self.env.observation_space.shape,
                                   self.env.action_space.shape, 1, self.hid_list)
        self.target_critic_model = Critic(self.env.observation_space.shape,
                                          self.env.action_space.shape, 1, self.hid_list)
        self.critic_optim = optim.Adam(self.critic_model.parameters(), lr=self.learning_rate)

        # Make sure the targets start with the same weights as the online networks
        hard_update(self.target_actor_model, self.actor_model)
        hard_update(self.target_critic_model, self.critic_model)

        self.cuda()

    # ========================================================================= #
    #                              Model Training                               #
    # ========================================================================= #

    def remember(self, cur_state, action, reward, new_state, done):
        for i in range(self.num_robots):
            self.memory.append([cur_state[i], action[i], reward[i], new_state[i], done[i]])

    def _train_critic_actor(self, samples):
        Loss = nn.MSELoss()

        # 1. Sample (the PER version also checks whether stack_samples is needed)
        cur_states, actions, rewards, new_states, dones, weights, batch_idxes = stack_samples(samples)
        target_actions = to_numpy(self.target_actor_model(to_tensor(new_states)))

        # Critic update
        self.critic_model.zero_grad()
        Q_now = self.critic_model([cur_states, actions])
        next_Q = self.target_critic_model([new_states, target_actions])
        dones = dones.astype(bool)
        Q_target = to_tensor(rewards) + self.gamma * next_Q.reshape(next_Q.shape[0]) * to_tensor(1 - dones)
        td_errors = Q_target - Q_now.reshape(Q_now.shape[0])
        value_loss = Loss(Q_target, Q_now.squeeze())
        value_loss.backward()
        self.critic_optim.step()

        # Actor update
        self.actor_model.zero_grad()
        policy_loss = -self.critic_model(
            [to_tensor(cur_states), self.actor_model(to_tensor(cur_states))])
        policy_loss = policy_loss.mean()
        policy_loss.backward()
        self.actor_optim.step()

        # NoisyNet noise reset
        self.actor_model.reset_noise()
        self.target_actor_model.reset_noise()

        return td_errors

    def read_Q_values(self, cur_states, actions):
        critic_values = self.critic_model.predict([cur_states, actions])
        return critic_values

    def train(self, t):
        batch_size = self.batch_size
        if len(self.replay_buffer.replay_buffer) < batch_size:  # PER
            return
        samples = self.replay_buffer.replay_buffer.sample(
            batch_size, beta=self.replay_buffer.beta_schedule.value(t))
        (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = samples
        self.samples = samples
        td_errors = self._train_critic_actor(samples)
        # Priority updates
        # new_priorities = np.abs(td_errors) + self.replay_buffer.prioritized_replay_eps
        # self.replay_buffer.replay_buffer.update_priorities(batch_idxes, new_priorities)

    # ========================================================================= #
    #                           Target Model Updating                           #
    # ========================================================================= #

    def _update_actor_target(self):
        soft_update(self.target_actor_model, self.actor_model, self.tau)

    def _update_critic_target(self):
        soft_update(self.target_critic_model, self.critic_model, self.tau)

    def update_target(self):
        self._update_actor_target()
        self._update_critic_target()

    # ========================================================================= #
    #                             Model Predictions                             #
    # ========================================================================= #

    def act(self, cur_state):
        # Returns the action predicted by the actor, plus the current epsilon
        # used to decide whether exploration noise is added.
        if self.eps_counter >= self.num_robots:
            self.epsilon *= self.epsilon_decay
            self.eps_counter = 0
        else:
            self.eps_counter += 1
        eps = self.epsilon
        cur_state = np.array(cur_state).reshape(1, 8)
        action = to_numpy(self.actor_model(to_tensor(cur_state))).squeeze(0)
        action = action.reshape(1, 2)
        if np.random.random() < self.epsilon:
            action[0][0] = action[0][0] + (np.random.random() - 0.5) * 0.4
            action[0][1] = action[0][1] + (np.random.random()) * 0.4
        return action, eps

    # ========================================================================= #
    #                               Save weights                                #
    # ========================================================================= #

    def save_weight(self, num_trials, trial_len):
        torch.save(self.actor_model.state_dict(),
                   self.path + '/actormodel' + '-' + str(num_trials) + '-' + str(trial_len) + '.pkl')
        torch.save(self.critic_model.state_dict(),
                   self.path + '/criticmodel' + '-' + str(num_trials) + '-' + str(trial_len) + '.pkl')
        # self.actor_model.save_weights(self.path + 'actormodel' + '-' + str(num_trials) + '-' + str(trial_len) + '.h5', overwrite=True)
        # self.critic_model.save_weights(self.path + 'criticmodel' + '-' + str(num_trials) + '-' + str(trial_len) + '.h5', overwrite=True)

    # ========================================================================= #
    #                               Load weights                                #
    # ========================================================================= #

    def load_weights(self, output):
        self.actor_model.load_state_dict(torch.load('{}.pkl'.format(output)))
        self.critic_model.load_state_dict(torch.load('{}.pkl'.format(output)))

    def play(self, cur_state):
        return to_numpy(self.actor_model(to_tensor(cur_state), volatile=True)).squeeze(0)

    def cuda(self):
        self.actor_model.cuda()
        self.target_actor_model.cuda()
        self.critic_model.cuda()
        self.target_critic_model.cuda()
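# Hedged sketch: hard_update and soft_update are called by the ActorCritic class above
# but are not defined in this excerpt. Minimal versions, assuming both arguments are
# torch.nn.Module networks (the bodies are inferred, not taken from the source):
def hard_update(target, source):
    # Copy the source parameters into the target network verbatim.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(param.data)


def soft_update(target, source, tau):
    # Polyak averaging: target <- (1 - tau) * target + tau * source.
    for target_param, param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)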
class AgentTrainer(pl.LightningModule):
    '''PyTorch Lightning trainer class for drone reinforcement learning.'''

    def __init__(self, hparams):
        '''Initializations'''
        super().__init__()
        self.hparams = hparams

        # Position of human (goal)
        source_position = torch.tensor([[self.hparams.environment.position.end.x],
                                        [self.hparams.environment.position.end.y],
                                        [self.hparams.environment.position.end.z]]).float()

        # Position of agent (start)
        agent_position = torch.tensor([[self.hparams.environment.position.start.x],
                                       [self.hparams.environment.position.start.y],
                                       [self.hparams.environment.position.start.z]]).float()

        # Initialize replay buffer
        self.replay_buffer = ReplayBuffer(capacity=self.hparams.model.replay_buffer_size)

        # Initialize drone
        self.agent = Drone(start_position=agent_position,
                           goal_position=source_position,
                           velocity_factor=self.hparams.environment.agent.velocity_factor,
                           hparams=self.hparams,
                           buffer=self.replay_buffer)

        # Actor networks
        self.net = Actor(**self.hparams.model.actor)
        self.target_net = Actor(**self.hparams.model.actor)

        # Critic networks
        self.critic = Critic(**self.hparams.model.critic)
        self.target_critic = Critic(**self.hparams.model.critic)

        # Hard update: targets start from the same weights
        self.target_net.load_state_dict(self.net.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        self.total_reward = -10000
        self.episode_steps = 0.0
        self.max_episode_steps = self.hparams.model.max_episode
        self.episode_reward = 0.0
        self.populate(self.hparams.model.replay_buffer_size)

    def soft_update(self, target, source, tau):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - tau) + param.data * tau)

    def configure_optimizers(self):
        optimizer2 = getattr(torch.optim, self.hparams.optimizer.type)(
            [{"params": self.net.parameters(), "lr": self.hparams.optimizer.args.lr}],
            **self.hparams.optimizer.args)
        optimizer = getattr(torch.optim, self.hparams.optimizer.type)(
            self.critic.parameters(), **self.hparams.optimizer.args, weight_decay=1e-3)
        scheduler2 = getattr(torch.optim.lr_scheduler, self.hparams.scheduler.type)(
            optimizer2, **self.hparams.scheduler.args)
        scheduler = getattr(torch.optim.lr_scheduler, self.hparams.scheduler.type)(
            optimizer, **self.hparams.scheduler.args)
        return [optimizer, optimizer2], [scheduler, scheduler2]

    def dqn_mse_loss(self, batch) -> torch.Tensor:
        """Calculates the MSE loss using a mini-batch from the replay buffer.

        Args:
            batch: current mini-batch of replay data

        Returns:
            loss
        """
        states, actions, rewards, dones, next_states = batch
        # print(states["image"].shape, rewards.shape)
        rewards_out = rewards[:, -1]
        print(actions.shape, rewards_out.shape, rewards.shape, "shapes")
        # print(rewards.shape, actions.shape, "reward, action")
        # print(states["image"].shape)
        # state_action_values = self.net(states["image"], states["signal"]).gather(1, actions.unsqueeze(-1)).squeeze(-1)
        action_value = self.net(next_states["image"])
        Q_value = self.critic(next_states["image"], action_value).squeeze(-1)
        # print(state_action_values)
        with torch.no_grad():
            # next_action_value = self.target_net(next_states["image"], next_states["signal"])
            # print(next_action_value.shape, "action")
            next_Q_value = self.target_critic(states["image"], actions.float()).squeeze(-1)
            # next_state_values[dones] = 0.0
            # print("Q value:", next_Q_value.shape)
            # next_action_value = next_action_value.detach()
            next_Q_value = next_Q_value.detach()
            # Q_value_actor = self.critic(next_states["image"], next_states["signal"], action_value).squeeze(-1)
            # print(next_Q_value.shape, rewards_out.shape)
        expected_state_action_values = Q_value * self.hparams.model.gamma + rewards_out
        # print(expected_state_action_values.shape, Q_value.shape)
        return {"loss": nn.MSELoss()(next_Q_value, expected_state_action_values),
                "policy_loss": -(Q_value).mean()}

    def populate(self, steps: int = 1000) -> None:
        '''Carries out several random steps through the environment to initially
        fill up the replay buffer with experiences.'''
        for i in range(steps):
            print(i)
            self.agent.playStep(self.net, 1.0, self.get_device())
            if i % self.max_episode_steps == 0:
                self.agent.reset()
        self.agent.reset()

    def playTrajectory(self):
        '''Play the trajectory.'''
        self.agent.reset()
        device = self.get_device()
        while True:
            self.agent.playStep(self.net, 0, device)

    def training_step(self, batch, batch_idx, optimizer_idx):
        '''Training step.'''
        self.episode_steps = self.episode_steps + 1
        device = self.get_device()
        epsilon = max(self.hparams.model.min_epsilon,
                      self.hparams.model.max_epsilon - (self.global_step + 1) / self.hparams.model.stop_decay)
        print("eps:", epsilon)

        # Step through the environment with the agent
        reward, done = self.agent.playStep(self.target_net, epsilon, device)
        self.episode_reward += reward

        # Calculate the training losses
        loss = self.dqn_mse_loss(batch)
        # print(loss)
        self.log("train_loss", loss["loss"], on_epoch=True, prog_bar=True, on_step=True, logger=True)
        self.log("policy_loss", loss["policy_loss"], on_epoch=True, prog_bar=True, on_step=True, logger=True)

        if done:
            if self.episode_reward > self.total_reward:
                self.total_reward = self.episode_reward
            self.episode_reward = 0
            self.episode_steps = 0

        if optimizer_idx:
            loss_out = loss["policy_loss"]
        else:
            loss_out = loss["loss"]

        # Soft update of target networks
        if self.global_step % self.hparams.model.sync_rate == 0:
            self.soft_update(self.target_net, self.net, self.hparams.model.tau)
            self.soft_update(self.target_critic, self.critic, self.hparams.model.tau)
            # self.target_net.load_state_dict(self.net.state_dict())
            # self.target_critic.load_state_dict(self.critic.state_dict())

        log = {
            'total_reward': torch.tensor(self.total_reward).to(device),
            'reward': torch.tensor(reward).to(device),
            'steps': torch.tensor(self.global_step).to(device)
        }
        for key in log:
            self.log(key, log[key], logger=True, prog_bar=True, on_step=True)

        if self.episode_steps > self.max_episode_steps:
            self.episode_steps = 0
            self.total_reward = self.episode_reward
            self.agent.reset()

        # print(loss_out)
        # return OrderedDict({'loss': loss, 'log': log, 'progress_bar': log})
        return loss_out

    def __dataloader(self) -> DataLoader:
        """Initialize the replay buffer dataset used for retrieving experiences."""
        dataset = RLDataset(self.replay_buffer, self.hparams.model.sample_size)
        dataloader = DataLoader(dataset=dataset, **self.hparams.dataset.loader)
        return dataloader

    def train_dataloader(self) -> DataLoader:
        """Get train loader."""
        return self.__dataloader()

    def get_device(self) -> str:
        """Retrieve device currently being used by the minibatch."""
        return self.device.index if self.on_gpu else 'cpu'

    def forward(self, x):
        return self.net(x)
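# Hedged sketch: RLDataset, used by AgentTrainer.__dataloader above, is not included in
# this excerpt. A minimal IterableDataset in the usual PyTorch Lightning RL style,
# assuming the replay buffer exposes a sample(n) method returning parallel arrays
# (the class name "RLDatasetSketch" and that buffer API are assumptions):
from torch.utils.data import IterableDataset

class RLDatasetSketch(IterableDataset):
    def __init__(self, buffer, sample_size=200):
        self.buffer = buffer
        self.sample_size = sample_size

    def __iter__(self):
        # Draw one batch of experiences per epoch and yield them one by one.
        states, actions, rewards, dones, next_states = self.buffer.sample(self.sample_size)
        for i in range(len(dones)):
            yield states[i], actions[i], rewards[i], dones[i], next_states[i]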
LR_A = 0.001    # learning rate for the actor
LR_C = 0.01     # learning rate for the critic

env = gym.make('MountainCar-v0')
env = env.unwrapped

sess = tf.Session()
actor = Actor(sess, n_features=env.observation_space.shape[0],
              n_actions=env.action_space.n, learning_rate=LR_A)
critic = Critic(sess, n_features=env.observation_space.shape[0], learning_rate=LR_C)
sess.run(tf.global_variables_initializer())

for i_episode in range(1000):
    s = env.reset()
    t = 0
    track_r = []
    while True:
        # if RENDER: env.render()
        env.render()
        a = actor.choose_action(s)
        s_, r, done, info = env.step(a)
DISPLAY_REWARD_THRESHOLD = -90
RENDER = False  # rendering wastes time

env = gym.make('MountainCar-v0')
env.seed(1)     # reproducible; general policy gradient has high variance
env = env.unwrapped

print(env.action_space)
print(env.observation_space)
print(env.observation_space.high)
print(env.observation_space.low)

actor = Actor(epsilon=0)
critic = Critic()

Tmax = 1000
for i_episode in range(3000):
    observation = env.reset()
    action = actor.choose_action(observation)
    running_reward = 0
    critic.reset()
    count = 0
    while count < Tmax:
        count += 1
        if RENDER:
            env.render()
        observation_, reward, done, info = env.step(action)  # reward = -1 in all cases
class DDPG:
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters

        # Initialize state
        self.last_state = self.task.reset()

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state, mode="train"):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        if mode.lower() == "train":
            return list(action + self.noise.sample())  # add some noise for exploration
        elif mode.lower() == "test":
            return list(action)
        else:
            raise AttributeError("Mode can be either train or test")

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]
                           ).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]
                         ).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
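# Hedged sketch: the OUNoise process used by the DDPG agent above is not part of this
# excerpt. A common Ornstein-Uhlenbeck implementation matching the constructor call
# OUNoise(size, mu, theta, sigma) would look like this (an assumption, not the
# author's code):
import numpy as np

class OUNoise:
    """Ornstein-Uhlenbeck process: temporally correlated exploration noise."""

    def __init__(self, size, mu, theta, sigma):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Reset the internal state to the mean.
        self.state = self.mu.copy()

    def sample(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): drift toward the mean plus Gaussian noise.
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(len(x))
        self.state = x + dx
        return self.state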
class ActorCriticEnv(object):
    def __init__(self, env, GAMMA=0.9):
        self.env = env
        print('obs space shape: {}'.format(self.env.observation_space.shape))
        print('action space shape: {}'.format(self.env.action_space.shape))
        self.states_dim = self.env.observation_space.shape[0]
        self.action_dim = self.env.action_space.shape[0]
        print('states dim: {}\t\t actions dim: {}'.format(self.states_dim, self.action_dim))
        self.actor = Actor(self.states_dim, self.action_dim, lr=0.0001)
        self.critic = Critic(self.states_dim, self.action_dim, lr=0.0001)
        self.GAMMA = GAMMA
        self.RANDOM_PROB = 0.025
        self.replay_buffer = ReplayBuffer(1280)

    def add_state_action_to_buffer(self, state, action, resulting_state, done):
        if done:
            predicted_q_val = np.asarray([[-25000]])
        else:
            best_new_action = self.actor.get_action(np.asarray([resulting_state]))
            predicted_next_q = self.critic.predict_q_val(np.asarray([resulting_state]), best_new_action)
        true_reward = get_reward(resulting_state)
        # The 0 is for "t", which I don't understand the point of.
        self.replay_buffer.add(state, action, true_reward, 0, resulting_state)
        return

    def train_from_state_action(self, state, action, resulting_state, done):
        if done:
            predicted_q_val = np.asarray([[-25000]])
        else:
            best_new_action = self.actor.get_action(np.asarray([resulting_state]))
            predicted_next_q = self.critic.predict_q_val(np.asarray([resulting_state]), best_new_action)
            true_reward = get_reward(resulting_state)
            predicted_q_val = true_reward + self.GAMMA * predicted_next_q

        wrapped_state = np.asarray([state])
        wrapped_action = np.asarray(action)
        # wrapped_q_goal = np.asarray([[predicted_q_val]])
        # print("STATE SHAPE: {}\t\tACTION SHAPE: {}\t\tREWARD SHAPE: {}".format(wrapped_state.shape, wrapped_action.shape, wrapped_true_reward.shape))
        inputs = [wrapped_state, wrapped_action, predicted_q_val]
        # print('created inputs. Calculating action grads.')
        action_grads = self.critic.get_action_grads(*inputs)
        # print('Optimizing critic q-val prediction.')
        self.critic.optimize_q_val(*inputs)
        # print('training actor from state and grads')
        self.actor.train_from_batch(wrapped_state, action_grads)
        # print('all done training')

    # def train_from_replay_buffer(self, batch_size=64):
    #     s_batch, a_batch, r_batch, t_batch, s2_batch = self.replay_buffer.sample_batch(batch_size)
    #     best_new_actions = self.actor.get_action(s2_batch)
    #     s2_predicted_q_vals = self.critic.predict_q_val(s2_batch, best_new_actions)

    def play_random_game(self, render=True):
        observation = self.env.reset()
        for t in range(1000):
            if render:
                self.env.render()
            action = self.env.action_space.sample()
            observation, reward, done, info = self.env.step(action)
            if done:
                print('Episode finished after {} timesteps'.format(t + 1))
                break

    def play_game_from_actor(self, render=True):
        observation = self.env.reset()
        for t in range(1000):
            if render:
                self.env.render()
            # action = self.env.action_space.sample()
            print(observation)
            action = self.actor.get_action(np.asarray([observation]))
            observation, reward, done, info = self.env.step(action)
            if done:
                print('Episode finished after {} timesteps'.format(t + 1))
                break

    def train_actor_critic_to_stay_still(self, render=True):
        # My reward after each step is the difference between where you are
        # and where you started.
        true_rewards = []
        observation = self.env.reset()
        for t in range(1000):
            if render:
                self.env.render()
            true_rewards.append(get_reward(observation))
            if random_with_prob(self.RANDOM_PROB):
                action = np.asarray([self.env.action_space.sample()])
            else:
                action = self.actor.get_action(np.asarray([observation]))
            new_observation, reward, done, info = self.env.step(action)
            self.train_from_state_action(observation, action, new_observation, done)
            observation = new_observation
            if done:
                print('Episode finished after {} timesteps. Average reward: {}'.format(
                    t + 1, np.mean(np.asarray(true_rewards))))
                break
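# Hedged sketch: random_with_prob and get_reward are referenced by ActorCriticEnv above
# but not defined in this excerpt. random_with_prob is presumably a Bernoulli draw;
# get_reward is task-specific (the comment above suggests it penalizes drifting away
# from the start state), so only a clearly labelled placeholder is given here:
import numpy as np

def random_with_prob(p):
    # Return True with probability p.
    return np.random.random() < p

def get_reward(observation):
    # Placeholder only: the actual reward depends on the environment/task used above.
    raise NotImplementedError("task-specific reward, not shown in this excerpt")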
class DDPG:
    def __init__(self, n_states, n_actions, hidden_dim=90, device="cpu",
                 critic_lr=5e-3, actor_lr=5e-4, gamma=0.99, soft_tau=1e-2,
                 memory_capacity=100000, batch_size=128):
        self.device = device
        self.critic_lr = critic_lr
        self.actor_lr = actor_lr
        self.critic = Critic(n_states, n_actions, hidden_dim).to(device)
        self.actor = Actor(n_states, n_actions, hidden_dim).to(device)
        self.target_critic = Critic(n_states, n_actions, hidden_dim).to(device)
        self.target_actor = Actor(n_states, n_actions, hidden_dim).to(device)

        # Hard-copy the online weights into the target networks
        for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
            target_param.data.copy_(param.data)

        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.memory = ReplayBuffer(memory_capacity)
        self.batch_size = batch_size
        self.soft_tau = soft_tau
        self.gamma = gamma

    def select_action(self, state):
        state = torch.FloatTensor(state).unsqueeze(0).to(self.device)
        action = self.actor(state)
        # detach() cuts the action out of the autograd graph before converting to numpy
        return action.detach().cpu().numpy()[0]

    def update(self):
        if len(self.memory) < self.batch_size:
            return
        state, action, reward, next_state, done = self.memory.sample(self.batch_size)

        # Convert all samples to tensors
        state = torch.FloatTensor(state).to(self.device)
        next_state = torch.FloatTensor(next_state).to(self.device)
        action = torch.FloatTensor(action).to(self.device)
        reward = torch.FloatTensor(reward).unsqueeze(1).to(self.device)
        done = torch.FloatTensor(np.float32(done)).unsqueeze(1).to(self.device)

        # Note: the critic takes (s_t, a) as input
        actor_loss = self.critic(state, self.actor(state))
        actor_loss = -actor_loss.mean()

        next_action = self.target_actor(next_state)
        target_value = self.target_critic(next_state, next_action.detach())
        expected_value = reward + (1.0 - done) * self.gamma * target_value
        expected_value = torch.clamp(expected_value, -np.inf, np.inf)

        value = self.critic(state, action)
        critic_loss = nn.MSELoss()(value, expected_value.detach())

        # Optimize the actor and critic networks
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Soft-update the target networks
        for target_param, param in zip(self.target_critic.parameters(), self.critic.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) + param.data * self.soft_tau)
        for target_param, param in zip(self.target_actor.parameters(), self.actor.parameters()):
            target_param.data.copy_(target_param.data * (1.0 - self.soft_tau) + param.data * self.soft_tau)

    def save_model(self, path):
        torch.save(self.target_actor.state_dict(), path)

    def load_model(self, path):
        self.actor.load_state_dict(torch.load(path))

    def buffer_model_save(self, saved_dir):
        self.memory.save(saved_dir)
        torch.save(self.critic.state_dict(), saved_dir + "/critic_checkpoint.pth")
        torch.save(self.actor.state_dict(), saved_dir + "/actor_checkpoint.pth")
        torch.save(self.target_critic.state_dict(), saved_dir + "/target_critic_checkpoint.pth")
        torch.save(self.target_actor.state_dict(), saved_dir + "/target_actor_checkpoint.pth")

    def buffer_model_load(self, saved_dir):
        if not os.path.exists(saved_dir):
            # Directory is missing: create it and bail out, there is nothing to load yet
            os.makedirs(saved_dir)
            return
        self.memory.load(saved_dir)
        self.critic.load_state_dict(torch.load(saved_dir + "/critic_checkpoint.pth"))
        self.actor.load_state_dict(torch.load(saved_dir + "/actor_checkpoint.pth"))
        self.target_critic.load_state_dict(torch.load(saved_dir + "/target_critic_checkpoint.pth"))
        self.target_actor.load_state_dict(torch.load(saved_dir + "/target_actor_checkpoint.pth"))
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.critic_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.actor_lr)
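# Hedged usage sketch for the PyTorch DDPG class above, assuming an older Gym step API
# (env.step returns 4 values) and a ReplayBuffer exposing push(state, action, reward,
# next_state, done); the environment name and the buffer method are assumptions:
import gym

env = gym.make("Pendulum-v1")
agent = DDPG(n_states=env.observation_space.shape[0],
             n_actions=env.action_space.shape[0])

for episode in range(200):
    state = env.reset()
    done = False
    while not done:
        action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.memory.push(state, action, reward, next_state, done)  # assumed buffer API
        agent.update()
        state = next_state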