autoencoder = Autoencoder(args.enc_hidden_dim, args.dec_hidden_dim,
                          args.embedding_dim, args.latent_dim, vocab.size(),
                          args.dropout, args.seq_len)
autoencoder.load_state_dict(
    torch.load('autoencoder.th', map_location=lambda x, y: x))

generator = Generator(args.n_layers, args.block_dim)
critic = Critic(args.n_layers, args.block_dim)

g_optimizer = optim.Adam(generator.parameters(), lr=args.lr)
c_optimizer = optim.Adam(critic.parameters(), lr=args.lr)

if args.cuda:
    autoencoder = autoencoder.cuda()
    generator = generator.cuda()
    critic = critic.cuda()

print('G Parameters:', sum(p.numel() for p in generator.parameters() if p.requires_grad))
print('C Parameters:', sum(p.numel() for p in critic.parameters() if p.requires_grad))

best_loss = np.inf
for epoch in range(1, args.epochs + 1):
    g_loss, c_loss = train(epoch)
    loss = g_loss + c_loss
    if loss < best_loss:
        best_loss = loss
        print('* Saved')
        torch.save(generator.state_dict(), 'generator.th')
class AgentDDPG:
    """Deep Deterministic Policy Gradient implementation for continuous
    action space reinforcement learning tasks."""

    def __init__(self, state_size, hidden_size, action_size,
                 actor_learning_rate=1e-4, critic_learning_rate=1e-3,
                 gamma=0.99, tau=1e-2, use_cuda=False,
                 actor_path=None, critic_path=None):
        # Params
        self.state_size, self.hidden_size, self.action_size = state_size, hidden_size, action_size
        self.gamma, self.tau = gamma, tau
        self.use_cuda = use_cuda

        # Networks
        self.actor = Actor(state_size, hidden_size, action_size)
        self.actor_target = Actor(state_size, hidden_size, action_size)
        self.critic = Critic(state_size + action_size, hidden_size, action_size)
        self.critic_target = Critic(state_size + action_size, hidden_size, action_size)

        # Load model state_dicts from saved files, if provided
        if actor_path and path.exists(actor_path):
            self.actor.load_state_dict(torch.load(actor_path))
        if critic_path and path.exists(critic_path):
            self.critic.load_state_dict(torch.load(critic_path))

        # Hard-copy params from the original networks to the target networks
        copy_params(self.actor, self.actor_target)
        copy_params(self.critic, self.critic_target)

        if self.use_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        # Replay buffer for storing experience
        self.replay_buffer = ReplayBuffer(cache_size=int(1e6))

        # Training
        self.critic_criterion = nn.MSELoss()
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_learning_rate)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_learning_rate)

    def save_to_file(self, actor_file, critic_file):
        # Save the state_dicts of the Actor and Critic networks
        torch.save(self.actor.state_dict(), actor_file)
        torch.save(self.critic.state_dict(), critic_file)

    def get_action(self, state):
        """Select an action for the given state according to the current policy."""
        state = Variable(torch.from_numpy(state).float())
        if self.use_cuda:
            state = state.cuda()
        a = self.actor.forward(state)
        if self.use_cuda:
            return a.detach().cpu().numpy()
        return a.detach().numpy()

    def save_experience(self, state_t, action_t, reward_t, state_t1):
        self.replay_buffer.add_sample(state_t, action_t, reward_t, state_t1)

    def update(self, batch_size):
        states, actions, rewards, next_states = self.replay_buffer.get_samples(batch_size)
        states = torch.FloatTensor(states)
        actions = torch.FloatTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.FloatTensor(next_states)
        if self.use_cuda:
            states = states.cuda()
            next_states = next_states.cuda()
            actions = actions.cuda()
            rewards = rewards.cuda()

        # Critic loss
        Qvals = self.critic.forward(states, actions)
        next_actions = self.actor_target.forward(next_states)
        next_Q = self.critic_target.forward(next_states, next_actions.detach())
        Qprime = rewards + self.gamma * next_Q
        critic_loss = self.critic_criterion(Qvals, Qprime)

        # Update critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # Actor loss
        policy_loss = -self.critic.forward(states, self.actor.forward(states)).mean()

        # Update actor
        self.actor_optimizer.zero_grad()
        policy_loss.backward()
        self.actor_optimizer.step()

        # Soft-update target networks
        soft_copy_params(self.actor, self.actor_target, self.tau)
        soft_copy_params(self.critic, self.critic_target, self.tau)

    def add_noise_to_weights(self, amount=0.1):
        self.actor.apply(lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
        self.critic.apply(lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
        self.actor_target.apply(lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
        self.critic_target.apply(lambda x: _add_noise_to_weights(x, amount, self.use_cuda))
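# The helpers copy_params, soft_copy_params and _add_noise_to_weights used by
# AgentDDPG are not shown in this snippet. The following is a minimal sketch of
# what they are assumed to do, inferred only from how the class calls them
# (hard copy, Polyak averaging with tau, Gaussian weight perturbation); it is
# not the project's actual implementation.
import torch
import torch.nn as nn


def copy_params(source, target):
    # Hard copy: target <- source
    target.load_state_dict(source.state_dict())


def soft_copy_params(source, target, tau):
    # Polyak averaging: target <- tau * source + (1 - tau) * target
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)


def _add_noise_to_weights(module, amount, use_cuda):
    # Assumed behaviour: perturb the weights of linear layers with Gaussian noise
    if isinstance(module, nn.Linear):
        with torch.no_grad():
            noise = torch.randn(module.weight.size()) * amount
            if use_cuda:
                noise = noise.cuda()
            module.weight.add_(noise)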
class DDPG(object):
    def __init__(self, state_dim, action_dim, max_action, memory, args):
        # actor
        self.actor = Actor(state_dim, action_dim, max_action, layer_norm=args.layer_norm)
        self.actor_target = Actor(state_dim, action_dim, max_action, layer_norm=args.layer_norm)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=args.actor_lr)

        # critic
        self.critic = Critic(state_dim, action_dim, layer_norm=args.layer_norm)
        self.critic_target = Critic(state_dim, action_dim, layer_norm=args.layer_norm)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=args.critic_lr)

        # cuda
        if torch.cuda.is_available():
            self.actor = self.actor.cuda()
            self.actor_target = self.actor_target.cuda()
            self.critic = self.critic.cuda()
            self.critic_target = self.critic_target.cuda()

        # misc
        self.criterion = nn.MSELoss()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.max_action = max_action
        self.memory = memory

        # hyper-parameters
        self.tau = args.tau
        self.discount = args.discount
        self.batch_size = args.batch_size

    def show_lr(self):
        print(self.actor_optimizer.state_dict())

    def select_action(self, state, noise=None):
        state = FloatTensor(state.reshape(-1, self.state_dim))
        action = self.actor(state).cpu().data.numpy().flatten()
        if noise is not None:
            action += noise.sample()
        return np.clip(action, -self.max_action, self.max_action)

    def train(self, iterations):
        for _ in tqdm(range(iterations)):
            # Sample replay buffer
            x, y, u, r, d = self.memory.sample(self.batch_size)
            state = FloatTensor(x)
            action = FloatTensor(u)
            next_state = FloatTensor(y)
            done = FloatTensor(1 - d)
            reward = FloatTensor(r)

            # Q target = reward + discount * Q(next_state, pi(next_state))
            with torch.no_grad():
                target_Q = self.critic_target(next_state, self.actor_target(next_state))
                target_Q = reward + (done * self.discount * target_Q)

            # Get current Q estimate
            current_Q = self.critic(state, action)

            # Compute critic loss
            critic_loss = self.criterion(current_Q, target_Q)

            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Compute actor loss
            actor_loss = -self.critic(state, self.actor(state)).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def train_critic(self, iterations):
        for _ in tqdm(range(iterations)):
            # Sample replay buffer
            states, n_states, actions, rewards, dones = self.memory.sample(self.batch_size)
            sys.stdout.flush()

            # Q target = reward + discount * Q(next_state, pi(next_state))
            with torch.no_grad():
                target_Q = self.critic_target(n_states, self.actor_target(n_states))
                target_Q = rewards + (1 - dones) * self.discount * target_Q

            # Get current Q estimate
            current_Q = self.critic(states, actions)

            # Compute critic loss
            critic_loss = self.criterion(current_Q, target_Q)

            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Compute actor loss
            actor_loss = -self.critic(states, self.actor(states)).mean()

            # Optimize the actor
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
            for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def load(self, filename):
        self.actor.load_model(filename, "actor")
        self.critic.load_model(filename, "critic")

    def save(self, output):
        self.actor.save_model(output, "actor")
        self.critic.save_model(output, "critic")
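# FloatTensor, used throughout the DDPG class above, is not defined in this
# snippet. A common convention in DDPG repositories (an assumption here, not
# shown in the original source) is a device-aware alias so the same code runs
# on CPU and GPU:
import torch

USE_CUDA = torch.cuda.is_available()
FloatTensor = torch.cuda.FloatTensor if USE_CUDA else torch.FloatTensor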
class D3PG(object):
    def __init__(self, state_dim, action_dim, max_action, memory, args):
        # misc
        self.criterion = nn.MSELoss()
        self.state_dim = state_dim
        self.action_dim = action_dim
        self.max_action = max_action
        self.memory = memory
        self.n = args.n_actor

        # actors
        self.actors = [
            Actor(state_dim, action_dim, max_action, layer_norm=args.layer_norm)
            for i in range(self.n)
        ]
        self.actors_target = [
            Actor(state_dim, action_dim, max_action, layer_norm=args.layer_norm)
            for i in range(self.n)
        ]
        self.actors_optimizer = [
            torch.optim.Adam(self.actors[i].parameters(), lr=args.actor_lr)
            for i in range(self.n)
        ]
        for i in range(self.n):
            self.actors_target[i].load_state_dict(self.actors[i].state_dict())

        # critic
        self.critic = Critic(state_dim, action_dim, layer_norm=args.layer_norm)
        self.critic_target = Critic(state_dim, action_dim, layer_norm=args.layer_norm)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=args.critic_lr)

        # cuda
        if torch.cuda.is_available():
            for i in range(self.n):
                self.actors[i] = self.actors[i].cuda()
                self.actors_target[i] = self.actors_target[i].cuda()
            self.critic = self.critic.cuda()
            self.critic_target = self.critic_target.cuda()

        # shared memory
        for i in range(self.n):
            self.actors[i].share_memory()
            self.actors_target[i].share_memory()
        self.critic.share_memory()
        self.critic_target.share_memory()

        # hyper-parameters
        self.tau = args.tau
        self.discount = args.discount
        self.batch_size = args.batch_size
        self.reward_scale = args.reward_scale

    def train(self, iterations, actor_index):
        for _ in tqdm(range(iterations)):
            # Sample replay buffer
            states, n_states, actions, rewards, dones = self.memory.sample(self.batch_size)

            # Q target = reward + discount * Q(next_state, pi(next_state))
            with torch.no_grad():
                target_Q = self.critic_target(
                    n_states, self.actors_target[actor_index](n_states))
                target_Q = self.reward_scale * rewards + \
                    (1 - dones) * self.discount * target_Q

            # Get current Q estimate
            current_Q = self.critic(states, actions)

            # Compute critic loss
            critic_loss = self.criterion(current_Q, target_Q)

            # Optimize the critic
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            self.critic_optimizer.step()

            # Compute actor loss
            actor_loss = -self.critic(states, self.actors[actor_index](states)).mean()

            # Optimize the actor
            self.actors_optimizer[actor_index].zero_grad()
            actor_loss.backward()
            self.actors_optimizer[actor_index].step()

            # Update the frozen target models
            for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
            for param, target_param in zip(self.actors[actor_index].parameters(),
                                           self.actors_target[actor_index].parameters()):
                target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)

    def load(self, filename):
        for i in range(self.n):
            self.actors[i].load_model(filename, "actor_" + str(i))
        self.critic.load_model(filename, "critic")

    def save(self, output):
        for i in range(self.n):
            self.actors[i].save_model(output, "actor_" + str(i))
        self.critic.save_model(output, "critic")
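# D3PG calls share_memory() on its networks and takes an actor_index in train(),
# which suggests the actors are updated from separate worker processes sharing
# one critic. The driver below is a minimal sketch of that usage under this
# assumption; it is not part of the original source, and the real project may
# coordinate workers differently.
import torch.multiprocessing as mp


def _worker(agent, iterations, actor_index):
    # Each worker updates the shared critic and its own actor
    agent.train(iterations, actor_index)


def train_parallel(agent, iterations):
    processes = []
    for i in range(agent.n):
        p = mp.Process(target=_worker, args=(agent, iterations, i))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()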
class Agent:
    def __init__(self, env, env_params, args, models=None,
                 record_episodes=[0, .1, .25, .5, .75, 1.]):
        self.env = env
        self.env_params = env_params
        self.args = args

        # networks
        if models is None:
            self.actor = Actor(self.env_params).double()
            self.critic = Critic(self.env_params).double()
        else:
            self.actor, self.critic = self.LoadModels()

        # target networks, used to compute the target Q-values for the next states
        self.actor_target = Actor(self.env_params).double()
        self.critic_target = Critic(self.env_params).double()
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.critic_target.load_state_dict(self.critic.state_dict())

        if self.args.cuda:
            self.actor.cuda()
            self.critic.cuda()
            self.actor_target.cuda()
            self.critic_target.cuda()

        self.actor_optim = torch.optim.Adam(self.actor.parameters(), lr=0.001)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(), lr=0.001)

        self.normalize = Normalizer(env_params, self.args.gamma)
        self.buffer = ReplayBuffer(1_000_000, self.env_params)
        self.tensorboard = ModifiedTensorBoard(log_dir="logs")
        self.record_episodes = [int(eps * self.args.n_epochs) for eps in record_episodes]

    def ModelsEval(self):
        self.actor.eval()
        self.actor_target.eval()
        self.critic.eval()
        self.critic_target.eval()

    def ModelsTrain(self):
        self.actor.train()
        self.actor_target.train()
        self.critic.train()
        self.critic_target.train()

    def GreedyAction(self, state):
        self.ModelsEval()
        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.double).unsqueeze(dim=0)
            if self.args.cuda:
                state = state.cuda()
            action = self.actor.forward(state).detach().cpu().numpy().squeeze()
        return action

    def NoiseAction(self, state):
        self.ModelsEval()
        with torch.no_grad():
            state = torch.tensor(state, dtype=torch.double).unsqueeze(dim=0)
            if self.args.cuda:
                state = state.cuda()
            action = self.actor.forward(state).detach().cpu().numpy()
            action += self.args.noise_eps * self.env_params['max_action'] * np.random.randn(*action.shape)
            action = np.clip(action, -self.env_params['max_action'], self.env_params['max_action'])
        return action.squeeze()

    def Update(self):
        self.ModelsTrain()
        for i in range(self.args.n_batch):
            state, a_batch, r_batch, nextstate, d_batch = self.buffer.SampleBuffer(self.args.batch_size)
            a_batch = torch.tensor(a_batch, dtype=torch.double)
            r_batch = torch.tensor(r_batch, dtype=torch.double)
            # d_batch = torch.tensor(d_batch, dtype=torch.double)
            state = torch.tensor(state, dtype=torch.double)
            nextstate = torch.tensor(nextstate, dtype=torch.double)
            # d_batch = 1 - d_batch
            if self.args.cuda:
                a_batch = a_batch.cuda()
                r_batch = r_batch.cuda()
                # d_batch = d_batch.cuda()
                state = state.cuda()
                nextstate = nextstate.cuda()

            with torch.no_grad():
                action_next = self.actor_target.forward(nextstate)
                q_next = self.critic_target.forward(nextstate, action_next)
                q_next = q_next.detach().squeeze()
                q_target = r_batch + self.args.gamma * q_next
                q_target = q_target.detach().squeeze()

            q_prime = self.critic.forward(state, a_batch).squeeze()
            critic_loss = F.mse_loss(q_target, q_prime)

            action = self.actor.forward(state)
            actor_loss = -self.critic.forward(state, action).mean()
            # params = torch.cat([x.view(-1) for x in self.actor.parameters()])
            # l2_reg = self.args.l2_norm * torch.norm(params, 2)
            # actor_loss += l2_reg

            self.actor_optim.zero_grad()
            actor_loss.backward()
            self.actor_optim.step()

            self.critic_optim.zero_grad()
            critic_loss.backward()
            self.critic_optim.step()

            self.SoftUpdateTarget(self.critic, self.critic_target)
            self.SoftUpdateTarget(self.actor, self.actor_target)

    def Explore(self):
        for epoch in range(self.args.n_epochs + 1):
            start_time = time.process_time()
            for cycle in range(self.args.n_cycles):
                for _ in range(self.args.num_rollouts_per_mpi):
                    state = self.env.reset()
                    for t in range(self.env_params['max_timesteps']):
                        action = self.NoiseAction(state)
                        nextstate, reward, done, info = self.env.step([action])
                        nextstate = nextstate.squeeze()
                        reward = self.normalize.normalize_reward(reward)
                        self.buffer.StoreTransition(state, action, reward, nextstate, done)
                        state = nextstate
                self.Update()
            avg_reward = self.Evaluate()
            self.tensorboard.step = epoch
            elapsed_time = time.process_time() - start_time
            print(f"Epoch {epoch} of {self.args.n_epochs + 1} epochs, average reward: {avg_reward}. "
                  f"Elapsed time: {int(elapsed_time / 60)} minutes {int(elapsed_time % 60)} seconds")
            if epoch % 5 == 0 or epoch + 1 == self.args.n_epochs:
                self.SaveModels(epoch)
                self.record(epoch)

    def Evaluate(self):
        self.ModelsEval()
        total_reward = []
        episode_reward = 0
        for episode in range(self.args.n_evaluate):
            state = self.env.reset()
            episode_reward = 0
            for t in range(self.env_params['max_timesteps']):
                action = self.GreedyAction(state)
                nextstate, reward, done, info = self.env.step([action])
                episode_reward += reward
                state = nextstate
                if done or t + 1 == self.env_params['max_timesteps']:
                    total_reward.append(episode_reward)
                    episode_reward = 0
        average_reward = sum(total_reward) / len(total_reward)
        min_reward = min(total_reward)
        max_reward = max(total_reward)
        self.tensorboard.update_stats(reward_avg=average_reward,
                                      reward_min=min_reward,
                                      reward_max=max_reward)
        return average_reward

    def record(self, epoch):
        self.ModelsEval()
        try:
            if not os.path.exists("videos"):
                os.mkdir('videos')
            recorder = VideoRecorder(self.env, path=f'videos/epoch-{epoch}.mp4')
            for _ in range(self.args.n_record):
                done = False
                state = self.env.reset()
                while not done:
                    recorder.capture_frame()
                    action = self.GreedyAction(state)
                    nextstate, reward, done, info = self.env.step([action])
                    state = nextstate
            recorder.close()
        except Exception as e:
            print(e)

    def SaveModels(self, ep):
        if not os.path.exists("models"):
            os.mkdir('models')
        torch.save(self.actor.state_dict(), os.path.join('models', 'Actor.pt'))
        torch.save(self.critic.state_dict(), os.path.join('models', 'Critic.pt'))

    def LoadModels(self, actorpath=os.path.join('models', 'Actor.pt'),
                   criticpath=os.path.join('models', 'Critic.pt')):
        actor = Actor(self.env_params).double()
        critic = Critic(self.env_params).double()
        actor.load_state_dict(torch.load(actorpath))
        critic.load_state_dict(torch.load(criticpath))
        return actor, critic

    def SoftUpdateTarget(self, source, target):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_((1 - self.args.polyak) * param.data +
                                    self.args.polyak * target_param.data)
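# Minimal usage sketch for the Agent class above. The Actor, Critic, Normalizer,
# ReplayBuffer and ModifiedTensorBoard definitions are not shown in this snippet,
# and the exact env_params keys and hyper-parameter values below are assumptions;
# only the attribute names match what the class actually reads from `args`.
import gym
from types import SimpleNamespace

env = gym.make("Pendulum-v1")
env_params = {
    'max_action': float(env.action_space.high[0]),
    'max_timesteps': 200,
    # further project-specific keys (e.g. observation/action dims) may be required
}
args = SimpleNamespace(cuda=False, gamma=0.98, noise_eps=0.2, n_epochs=50,
                       n_cycles=50, num_rollouts_per_mpi=2, n_batch=40,
                       batch_size=256, n_evaluate=10, n_record=1, polyak=0.95)

agent = Agent(env, env_params, args)
agent.Explore()   # collect rollouts, update networks, evaluate and record periodically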
class ddpg_agent:
    def __init__(self, args, env):
        self.args = args
        self.env = env
        # get the number of inputs and actions...
        num_inputs = self.env.observation_space.shape[0]
        num_actions = self.env.action_space.shape[0]
        self.action_scale = self.env.action_space.high[0]

        # build up the networks
        self.actor_net = Actor(num_inputs, num_actions)
        self.critic_net = Critic(num_inputs, num_actions)

        # get the target networks...
        self.actor_target_net = Actor(num_inputs, num_actions)
        self.critic_target_net = Critic(num_inputs, num_actions)

        if self.args.cuda:
            self.actor_net.cuda()
            self.critic_net.cuda()
            self.actor_target_net.cuda()
            self.critic_target_net.cuda()

        # copy the parameters..
        self.actor_target_net.load_state_dict(self.actor_net.state_dict())
        self.critic_target_net.load_state_dict(self.critic_net.state_dict())

        # setup the optimizers...
        self.optimizer_actor = torch.optim.Adam(self.actor_net.parameters(), lr=self.args.actor_lr)
        self.optimizer_critic = torch.optim.Adam(self.critic_net.parameters(),
                                                 lr=self.args.critic_lr,
                                                 weight_decay=self.args.critic_l2_reg)

        # setting up the noise
        self.ou_noise = OUNoise(num_actions)

        # check the save dirs
        if not os.path.exists(self.args.save_dir):
            os.mkdir(self.args.save_dir)
        self.model_path = self.args.save_dir + self.args.env_name + '/'
        if not os.path.exists(self.model_path):
            os.mkdir(self.model_path)

    # start to train the network..
    def learn(self):
        # init the replay buffer
        replay_buffer = []
        total_timesteps = 0
        running_reward = None
        for episode_idx in range(self.args.max_episode):
            state = self.env.reset()
            # get the scale of the ou noise...
            self.ou_noise.scale = (self.args.noise_scale - self.args.final_noise_scale) * \
                max(0, self.args.exploration_length - episode_idx) / \
                self.args.exploration_length + self.args.final_noise_scale
            self.ou_noise.reset()
            # start the training
            reward_total = 0
            while True:
                state_tensor = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
                if self.args.cuda:
                    state_tensor = state_tensor.cuda()
                with torch.no_grad():
                    policy = self.actor_net(state_tensor)
                # start to select the actions...
                actions = self._select_actions(policy)
                # step
                state_, reward, done, _ = self.env.step(actions * self.action_scale)
                total_timesteps += 1
                reward_total += reward
                # start to store the samples...
                replay_buffer.append((state, reward, actions, done, state_))
                # check if the buffer size is out of range
                if len(replay_buffer) > self.args.replay_size:
                    replay_buffer.pop(0)
                if len(replay_buffer) > self.args.batch_size:
                    mini_batch = random.sample(replay_buffer, self.args.batch_size)
                    # start to update the network
                    _, _ = self._update_network(mini_batch)
                if done:
                    break
                state = state_
            running_reward = reward_total if running_reward is None \
                else running_reward * 0.99 + reward_total * 0.01
            if episode_idx % self.args.display_interval == 0:
                torch.save(self.actor_net.state_dict(), self.model_path + 'model.pt')
                print('[{}] Episode: {}, Frames: {}, Rewards: {}'.format(
                    datetime.now(), episode_idx, total_timesteps, running_reward))
        self.env.close()

    # select actions
    def _select_actions(self, policy):
        actions = policy.detach().cpu().numpy()[0]
        actions = actions + self.ou_noise.noise()
        actions = np.clip(actions, -1, 1)
        return actions

    # update the networks
    def _update_network(self, mini_batch):
        # state batch
        state_batch = np.array([element[0] for element in mini_batch])
        state_batch = torch.tensor(state_batch, dtype=torch.float32)
        # reward batch
        reward_batch = np.array([element[1] for element in mini_batch])
        reward_batch = torch.tensor(reward_batch, dtype=torch.float32).unsqueeze(1)
        # done batch (inverted so terminal transitions mask out the bootstrap term)
        done_batch = np.array([int(element[3]) for element in mini_batch])
        done_batch = 1 - done_batch
        done_batch = torch.tensor(done_batch, dtype=torch.float32).unsqueeze(1)
        # action batch
        actions_batch = np.array([element[2] for element in mini_batch])
        actions_batch = torch.tensor(actions_batch, dtype=torch.float32)
        # next state batch
        state_next_batch = np.array([element[4] for element in mini_batch])
        state_next_batch = torch.tensor(state_next_batch, dtype=torch.float32)
        # check if we use cuda
        if self.args.cuda:
            state_batch = state_batch.cuda()
            reward_batch = reward_batch.cuda()
            done_batch = done_batch.cuda()
            actions_batch = actions_batch.cuda()
            state_next_batch = state_next_batch.cuda()
        # update the critic network...
        with torch.no_grad():
            actions_out = self.actor_target_net(state_next_batch)
            expected_q_value = self.critic_target_net(state_next_batch, actions_out)
        # get the target value
        target_value = reward_batch + self.args.gamma * expected_q_value * done_batch
        target_value = target_value.detach()
        values = self.critic_net(state_batch, actions_batch)
        critic_loss = (target_value - values).pow(2).mean()
        self.optimizer_critic.zero_grad()
        critic_loss.backward()
        self.optimizer_critic.step()
        # start to update the actor network
        actor_loss = -self.critic_net(state_batch, self.actor_net(state_batch)).mean()
        self.optimizer_actor.zero_grad()
        actor_loss.backward()
        self.optimizer_actor.step()
        # then, soft-update the target networks...
        self._soft_update_target_network(self.critic_target_net, self.critic_net)
        self._soft_update_target_network(self.actor_target_net, self.actor_net)
        return actor_loss.item(), critic_loss.item()

    # soft update the target network
    def _soft_update_target_network(self, target, source):
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(self.args.tau * param.data +
                                    (1 - self.args.tau) * target_param.data)

    # test the trained network
    def test_network(self):
        model_path = self.args.save_dir + self.args.env_name + '/model.pt'
        self.actor_net.load_state_dict(
            torch.load(model_path, map_location=lambda storage, loc: storage))
        self.actor_net.eval()
        # start to test
        for _ in range(5):
            state = self.env.reset()
            reward_sum = 0
            while True:
                self.env.render()
                state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
                with torch.no_grad():
                    actions = self.actor_net(state)
                actions = actions.detach().numpy()[0]
                state_, reward, done, _ = self.env.step(self.action_scale * actions)
                reward_sum += reward
                if done:
                    break
                state = state_
            print('The reward of this episode is {}.'.format(reward_sum))
        self.env.close()
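# OUNoise is referenced by ddpg_agent but not defined in this snippet. The sketch
# below is one common Ornstein-Uhlenbeck noise implementation matching the
# attributes the agent actually uses (scale, reset(), noise()); treat it as an
# assumption, not the project's actual class.
import numpy as np


class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2, scale=1.0):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.scale = scale
        self.reset()

    def reset(self):
        # Reset the internal state to the mean
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # One step of the Ornstein-Uhlenbeck process, scaled by self.scale
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state * self.scale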