def __init__(self, writer, device, state_dim, action_dim, args, noise):
    """Build the DDPG agent: actor/critic networks, frozen target copies,
    optimizers, replay buffer and the exploration-noise process."""
    super(DDPG, self).__init__()
    self.args = args
    self.writer = writer
    self.device = device
    self.noise = noise

    # Online and target policy networks share one architecture.
    self.actor = Actor(self.args.layer_num, state_dim, action_dim,
                       self.args.hidden_dim, self.args.activation_function,
                       self.args.last_activation, self.args.trainable_std)
    self.target_actor = Actor(self.args.layer_num, state_dim, action_dim,
                              self.args.hidden_dim, self.args.activation_function,
                              self.args.last_activation, self.args.trainable_std)

    # Q-networks take the concatenated (state, action) vector and emit a scalar.
    self.q = Critic(self.args.layer_num, state_dim + action_dim, 1,
                    self.args.hidden_dim, self.args.activation_function, None)
    self.target_q = Critic(self.args.layer_num, state_dim + action_dim, 1,
                           self.args.hidden_dim, self.args.activation_function, None)

    # rate=1.0 is a hard copy: targets start identical to the online nets.
    self.soft_update(self.q, self.target_q, 1.)
    self.soft_update(self.actor, self.target_actor, 1.)

    self.q_optimizer = optim.Adam(self.q.parameters(), lr=self.args.q_lr)
    self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                      lr=self.args.actor_lr)

    # Off-policy buffer; DDPG stores no action probabilities.
    self.data = ReplayBuffer(action_prob_exist=False,
                             max_size=int(self.args.memory_size),
                             state_dim=state_dim,
                             num_action=action_dim)
def __init__(self, writer, device, state_dim, action_dim, args):
    """Assemble the PPO learner: rollout buffer, policy/value networks and
    their optimizers."""
    super(PPO, self).__init__()
    self.writer = writer
    self.device = device
    self.args = args

    # On-policy rollout storage; old log-probs are kept for the PPO ratio.
    self.data = ReplayBuffer(action_prob_exist=True,
                             max_size=self.args.traj_length,
                             state_dim=state_dim,
                             num_action=action_dim)

    # Gaussian policy head and scalar state-value head.
    self.actor = Actor(self.args.layer_num, state_dim, action_dim,
                       self.args.hidden_dim, self.args.activation_function,
                       self.args.last_activation, self.args.trainable_std)
    self.critic = Critic(self.args.layer_num, state_dim, 1,
                         self.args.hidden_dim, self.args.activation_function,
                         self.args.last_activation)

    self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                      lr=self.args.actor_lr)
    self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                       lr=self.args.critic_lr)
def __init__(self, writer, device, state_dim, action_dim, args):
    """Set up SAC: squashed-Gaussian policy, twin critics plus target copies,
    learnable entropy temperature, replay buffer and optimizers."""
    super(SAC, self).__init__()
    self.args = args

    def make_critic():
        # Critics score the concatenated (state, action) vector.
        return Critic(self.args.layer_num, state_dim + action_dim, 1,
                      self.args.hidden_dim, self.args.activation_function,
                      self.args.last_activation)

    self.actor = Actor(self.args.layer_num, state_dim, action_dim,
                       self.args.hidden_dim, self.args.activation_function,
                       self.args.last_activation, self.args.trainable_std)
    self.q_1 = make_critic()
    self.q_2 = make_critic()
    self.target_q_1 = make_critic()
    self.target_q_2 = make_critic()
    # rate=1.0 hard-copies online critic weights into the fresh targets.
    self.soft_update(self.q_1, self.target_q_1, 1.)
    self.soft_update(self.q_2, self.target_q_2, 1.)

    # Learnable entropy temperature, trained towards -|A| target entropy.
    self.alpha = nn.Parameter(torch.tensor(self.args.alpha_init))

    self.data = ReplayBuffer(action_prob_exist=False,
                             max_size=int(self.args.memory_size),
                             state_dim=state_dim,
                             num_action=action_dim)
    self.target_entropy = -torch.tensor(action_dim)

    self.q_1_optimizer = optim.Adam(self.q_1.parameters(), lr=self.args.q_lr)
    self.q_2_optimizer = optim.Adam(self.q_2.parameters(), lr=self.args.q_lr)
    self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                      lr=self.args.actor_lr)
    self.alpha_optimizer = optim.Adam([self.alpha], lr=self.args.alpha_lr)

    self.device = device
    self.writer = writer
class SAC(nn.Module):
    """Soft Actor-Critic agent with clipped double-Q learning, target critics
    and a learned entropy temperature (alpha).

    Fix vs. original: `if self.writer != None` replaced with the idiomatic
    `is not None` singleton comparison (PEP 8).
    """

    def __init__(self, writer, device, state_dim, action_dim, args):
        """
        Args:
            writer: logging object exposing add_scalar (e.g. SummaryWriter), or None.
            device: torch device used when converting sampled batches.
            state_dim (int): dimension of the observation vector.
            action_dim (int): dimension of the action vector.
            args: hyper-parameter namespace (layer_num, hidden_dim, lrs, gamma,
                soft_update_rate, alpha_init, memory_size, ...).
        """
        super(SAC, self).__init__()
        self.args = args
        # Squashed-Gaussian policy network.
        self.actor = Actor(self.args.layer_num, state_dim, action_dim,
                           self.args.hidden_dim, self.args.activation_function,
                           self.args.last_activation, self.args.trainable_std)
        # Twin Q-networks and their target copies; critics score the
        # concatenated (state, action) vector.
        self.q_1 = Critic(self.args.layer_num, state_dim + action_dim, 1,
                          self.args.hidden_dim, self.args.activation_function,
                          self.args.last_activation)
        self.q_2 = Critic(self.args.layer_num, state_dim + action_dim, 1,
                          self.args.hidden_dim, self.args.activation_function,
                          self.args.last_activation)
        self.target_q_1 = Critic(self.args.layer_num, state_dim + action_dim, 1,
                                 self.args.hidden_dim, self.args.activation_function,
                                 self.args.last_activation)
        self.target_q_2 = Critic(self.args.layer_num, state_dim + action_dim, 1,
                                 self.args.hidden_dim, self.args.activation_function,
                                 self.args.last_activation)
        # rate=1.0 -> hard copy: targets start identical to the online critics.
        self.soft_update(self.q_1, self.target_q_1, 1.)
        self.soft_update(self.q_2, self.target_q_2, 1.)
        # Entropy temperature, optimized towards a target entropy of -|A|.
        self.alpha = nn.Parameter(torch.tensor(self.args.alpha_init))
        self.data = ReplayBuffer(action_prob_exist=False,
                                 max_size=int(self.args.memory_size),
                                 state_dim=state_dim, num_action=action_dim)
        self.target_entropy = -torch.tensor(action_dim)
        self.q_1_optimizer = optim.Adam(self.q_1.parameters(), lr=self.args.q_lr)
        self.q_2_optimizer = optim.Adam(self.q_2.parameters(), lr=self.args.q_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.args.actor_lr)
        self.alpha_optimizer = optim.Adam([self.alpha], lr=self.args.alpha_lr)
        self.device = device
        self.writer = writer

    def put_data(self, transition):
        """Store one transition tuple in the replay buffer."""
        self.data.put_data(transition)

    def soft_update(self, network, target_network, rate):
        """Polyak-average network params into target_network.

        rate=1.0 is a hard copy; small rates move the target slowly.
        """
        for network_params, target_network_params in zip(
                network.parameters(), target_network.parameters()):
            target_network_params.data.copy_(
                target_network_params.data * (1.0 - rate)
                + network_params.data * rate)

    def get_action(self, state):
        """Sample a tanh-squashed action and its summed log-probability.

        Uses rsample() so the sample stays differentiable w.r.t. the actor
        (reparameterization trick).
        """
        mu, std = self.actor(state)
        dist = Normal(mu, std)
        u = dist.rsample()
        u_log_prob = dist.log_prob(u)
        a = torch.tanh(u)
        # Change-of-variables correction for tanh squashing; 1e-3 guards the log.
        a_log_prob = u_log_prob - torch.log(1 - torch.square(a) + 1e-3)
        return a, a_log_prob.sum(-1, keepdim=True)

    def q_update(self, Q, q_optimizer, states, actions, rewards, next_states, dones):
        """One gradient step on a single Q-network towards the soft Bellman target."""
        ###target
        with torch.no_grad():
            next_actions, next_action_log_prob = self.get_action(next_states)
            q_1 = self.target_q_1(next_states, next_actions)
            q_2 = self.target_q_2(next_states, next_actions)
            # Clipped double-Q: take the pessimistic estimate.
            q = torch.min(q_1, q_2)
            v = (1 - dones) * (q - self.alpha * next_action_log_prob)
            targets = rewards + self.args.gamma * v
        q = Q(states, actions)
        loss = F.smooth_l1_loss(q, targets)
        q_optimizer.zero_grad()
        loss.backward()
        q_optimizer.step()
        return loss

    def actor_update(self, states):
        """One policy-gradient step; returns the loss and the action log-probs."""
        now_actions, now_action_log_prob = self.get_action(states)
        q_1 = self.q_1(states, now_actions)
        q_2 = self.q_2(states, now_actions)
        q = torch.min(q_1, q_2)
        # alpha is detached: the temperature is trained only in alpha_update.
        loss = (self.alpha.detach() * now_action_log_prob - q).mean()
        self.actor_optimizer.zero_grad()
        loss.backward()
        self.actor_optimizer.step()
        return loss, now_action_log_prob

    def alpha_update(self, now_action_log_prob):
        """One gradient step on the entropy temperature alpha."""
        loss = (-self.alpha
                * (now_action_log_prob + self.target_entropy).detach()).mean()
        self.alpha_optimizer.zero_grad()
        loss.backward()
        self.alpha_optimizer.step()
        return loss

    def train_net(self, batch_size, n_epi):
        """Sample a batch, update both critics, the actor, alpha, and finally
        Polyak-update the target critics; optionally log losses."""
        data = self.data.sample(shuffle=True, batch_size=batch_size)
        states, actions, rewards, next_states, dones = convert_to_tensor(
            self.device, data['state'], data['action'], data['reward'],
            data['next_state'], data['done'])
        ###q update
        q_1_loss = self.q_update(self.q_1, self.q_1_optimizer, states, actions,
                                 rewards, next_states, dones)
        q_2_loss = self.q_update(self.q_2, self.q_2_optimizer, states, actions,
                                 rewards, next_states, dones)
        ### actor update
        actor_loss, prob = self.actor_update(states)
        ###alpha update
        alpha_loss = self.alpha_update(prob)
        self.soft_update(self.q_1, self.target_q_1, self.args.soft_update_rate)
        self.soft_update(self.q_2, self.target_q_2, self.args.soft_update_rate)
        if self.writer is not None:
            self.writer.add_scalar("loss/q_1", q_1_loss, n_epi)
            self.writer.add_scalar("loss/q_2", q_2_loss, n_epi)
            self.writer.add_scalar("loss/actor", actor_loss, n_epi)
            self.writer.add_scalar("loss/alpha", alpha_loss, n_epi)
def train_wasserstein(config):
    """Adversarially train a feature extractor + classifier against a Wasserstein
    critic (WDGRL-style domain adaptation), optionally with a triplet loss.

    Fix vs. original: `iterator.next()` is the Python-2 iterator API and raises
    AttributeError on Python 3 iterators; replaced with the built-in `next()`.

    Args:
        config (dict): experiment settings — model sizes, loaders
            (source/target train/test), learning rate, loss weights, triplet
            options and output directories.
    """
    # extractor = Extractor(n_flattens=config['n_flattens'], n_hiddens=config['n_hiddens'])
    extractor = InceptionV1(num_classes=32)
    classifier = Classifier(n_flattens=config['n_flattens'],
                            n_hiddens=config['n_hiddens'],
                            n_class=config['n_class'])
    critic = Critic(n_flattens=config['n_flattens'],
                    n_hiddens=config['n_hiddens'])
    if torch.cuda.is_available():
        extractor = extractor.cuda()
        classifier = classifier.cuda()
        critic = critic.cuda()

    triplet_type = config['triplet_type']
    gamma = config['w_gamma']            # gradient-penalty weight
    weight_wd = config['w_weight']       # Wasserstein-distance loss weight
    weight_triplet = config['t_weight']  # triplet loss weight
    t_margin = config['t_margin']
    t_confidence = config['t_confidence']
    k_critic = 3   # critic steps per batch
    k_clf = 1      # feature/classifier steps per batch
    TRIPLET_START_INDEX = 95  # epoch at which triplet training kicks in

    if triplet_type == 'none':
        # Plain WDGRL run: checkpoints live in a directory keyed by hyper-params.
        res_dir = os.path.join(
            config['res_dir'],
            'bs{}-lr{}-w{}-gamma{}'.format(config['batch_size'], config['lr'],
                                           weight_wd, gamma))
        if not os.path.exists(res_dir):
            os.makedirs(res_dir)
        extractor_path = os.path.join(res_dir, "extractor.pth")
        classifier_path = os.path.join(res_dir, "classifier.pth")
        critic_path = os.path.join(res_dir, "critic.pth")
        EPOCH_START = 1
        TEST_INTERVAL = 10
    else:
        # Triplet run: warm-start from WDGRL checkpoints when they exist and
        # jump straight to the triplet phase.
        TEST_INTERVAL = 1
        w_dir = os.path.join(
            config['res_dir'],
            'bs{}-lr{}-w{}-gamma{}'.format(config['batch_size'], config['lr'],
                                           weight_wd, gamma))
        if not os.path.exists(w_dir):
            os.makedirs(w_dir)
        res_dir = os.path.join(
            w_dir, '{}_t_weight{}_margin{}_confidence{}'.format(
                triplet_type, weight_triplet, t_margin, t_confidence))
        if not os.path.exists(res_dir):
            os.makedirs(res_dir)
        extractor_path = os.path.join(w_dir, "extractor.pth")
        classifier_path = os.path.join(w_dir, "classifier.pth")
        critic_path = os.path.join(w_dir, "critic.pth")
        if os.path.exists(extractor_path):
            extractor.load_state_dict(torch.load(extractor_path))
            classifier.load_state_dict(torch.load(classifier_path))
            critic.load_state_dict(torch.load(critic_path))
            print('load models')
            EPOCH_START = TRIPLET_START_INDEX
        else:
            EPOCH_START = 1

    set_log_config(res_dir)
    print('start epoch {}'.format(EPOCH_START))
    print('triplet type {}'.format(triplet_type))
    print(config)
    logging.debug('train_wt')
    logging.debug(extractor)
    logging.debug(classifier)
    logging.debug(critic)
    logging.debug(config)

    criterion = torch.nn.CrossEntropyLoss()
    softmax_layer = nn.Softmax(dim=1)
    critic_opt = torch.optim.Adam(critic.parameters(), lr=config['lr'])
    classifier_opt = torch.optim.Adam(classifier.parameters(), lr=config['lr'])
    # Extractor learns 10x slower than the heads.
    feature_opt = torch.optim.Adam(extractor.parameters(), lr=config['lr'] / 10)

    def train(extractor, classifier, critic, config, epoch):
        """One epoch of alternating critic / feature+classifier updates."""
        extractor.train()
        classifier.train()
        critic.train()
        iter_source = iter(config['source_train_loader'])
        iter_target = iter(config['target_train_loader'])
        len_source_loader = len(config['source_train_loader'])
        len_target_loader = len(config['target_train_loader'])
        num_iter = len_source_loader
        for step in range(1, num_iter):
            # Py3 iterator protocol: built-in next(), not .next().
            data_source, label_source = next(iter_source)
            data_target, _ = next(iter_target)
            # The (usually smaller) target loader is restarted when exhausted.
            if step % len_target_loader == 0:
                iter_target = iter(config['target_train_loader'])
            if torch.cuda.is_available():
                data_source, label_source = data_source.cuda(), label_source.cuda()
                data_target = data_target.cuda()

            # 1. train critic (features frozen; no_grad makes that explicit)
            set_requires_grad(extractor, requires_grad=False)
            set_requires_grad(classifier, requires_grad=False)
            set_requires_grad(critic, requires_grad=True)
            with torch.no_grad():
                h_s = extractor(data_source)
                h_s = h_s.view(h_s.size(0), -1)
                h_t = extractor(data_target)
                h_t = h_t.view(h_t.size(0), -1)
            for j in range(k_critic):
                gp = gradient_penalty(critic, h_s, h_t)
                critic_s = critic(h_s)
                critic_t = critic(h_t)
                wasserstein_distance = critic_s.mean() - critic_t.mean()
                # Critic maximizes the Wasserstein estimate -> minimize its negation.
                critic_cost = -wasserstein_distance + gamma * gp
                critic_opt.zero_grad()
                critic_cost.backward()
                critic_opt.step()
                if step == 10 and j == 0:
                    print('EPOCH {}, DISCRIMINATOR: wd {}, gp {}, loss {}'.format(
                        epoch, wasserstein_distance.item(),
                        (gamma * gp).item(), critic_cost.item()))
                    logging.debug(
                        'EPOCH {}, DISCRIMINATOR: wd {}, gp {}, loss {}'.format(
                            epoch, wasserstein_distance.item(),
                            (gamma * gp).item(), critic_cost.item()))

            # 2. train feature and class_classifier (critic frozen)
            set_requires_grad(extractor, requires_grad=True)
            set_requires_grad(classifier, requires_grad=True)
            set_requires_grad(critic, requires_grad=False)
            for _ in range(k_clf):
                h_s = extractor(data_source)
                h_s = h_s.view(h_s.size(0), -1)
                h_t = extractor(data_target)
                h_t = h_t.view(h_t.size(0), -1)
                source_preds = classifier(h_s)
                clf_loss = criterion(source_preds, label_source)
                wasserstein_distance = critic(h_s).mean() - critic(h_t).mean()
                if triplet_type != 'none' and epoch >= TRIPLET_START_INDEX:
                    # Pseudo-label the target batch and select confident samples.
                    target_preds = classifier(h_t)
                    target_labels = target_preds.data.max(1)[1]
                    target_logits = softmax_layer(target_preds)
                    if triplet_type == 'all':
                        triplet_index = np.where(
                            target_logits.data.max(1)[0].cpu().numpy() > t_margin)[0]
                        images = torch.cat((h_s, h_t[triplet_index]), 0)
                        labels = torch.cat(
                            (label_source, target_labels[triplet_index]), 0)
                    elif triplet_type == 'src':
                        images = h_s
                        labels = label_source
                    elif triplet_type == 'tgt':
                        triplet_index = np.where(
                            target_logits.data.max(1)[0].cpu().numpy() > t_confidence)[0]
                        images = h_t[triplet_index]
                        labels = target_labels[triplet_index]
                    elif triplet_type == 'sep':
                        # Separate triplet terms for target and source batches.
                        triplet_index = np.where(
                            target_logits.data.max(1)[0].cpu().numpy() > t_confidence)[0]
                        images = h_t[triplet_index]
                        labels = target_labels[triplet_index]
                        t_loss_sep, _ = triplet_loss(extractor, {
                            "X": images,
                            "y": labels
                        }, t_confidence)
                        images = h_s
                        labels = label_source
                    t_loss, _ = triplet_loss(extractor, {
                        "X": images,
                        "y": labels
                    }, t_margin)
                    loss = clf_loss + \
                        weight_wd * wasserstein_distance + \
                        weight_triplet * t_loss
                    if triplet_type == 'sep':
                        loss += t_loss_sep
                    feature_opt.zero_grad()
                    classifier_opt.zero_grad()
                    loss.backward()
                    feature_opt.step()
                    classifier_opt.step()
                    if step == 10:
                        print(
                            'EPOCH {}, CLASSIFIER: clf_loss {}, wd {}, t_loss {}, total loss {}'
                            .format(epoch, clf_loss.item(),
                                    weight_wd * wasserstein_distance.item(),
                                    weight_triplet * t_loss.item(), loss.item()))
                        logging.debug(
                            'EPOCH {}, CLASSIFIER: clf_loss {}, wd {}, t_loss {}, total loss {}'
                            .format(epoch, clf_loss.item(),
                                    weight_wd * wasserstein_distance.item(),
                                    weight_triplet * t_loss.item(), loss.item()))
                else:
                    loss = clf_loss + weight_wd * wasserstein_distance
                    feature_opt.zero_grad()
                    classifier_opt.zero_grad()
                    loss.backward()
                    feature_opt.step()
                    classifier_opt.step()
                    if step == 10:
                        print(
                            'EPOCH {}, CLASSIFIER: clf_loss {}, wd {}, loss {}'
                            .format(epoch, clf_loss.item(),
                                    weight_wd * wasserstein_distance.item(),
                                    loss.item()))
                        logging.debug(
                            'EPOCH {}, CLASSIFIER: clf_loss {}, wd {}, loss {}'
                            .format(epoch, clf_loss.item(),
                                    weight_wd * wasserstein_distance.item(),
                                    loss.item()))

    # pretrain(model, config, pretrain_epochs=20)
    for epoch in range(EPOCH_START, config['n_epochs'] + 1):
        train(extractor, classifier, critic, config, epoch)
        if epoch % TEST_INTERVAL == 0:
            # print('test on source_test_loader')
            # test(extractor, classifier, config['source_test_loader'], epoch)
            # print('test on target_train_loader')
            # test(model, config['target_train_loader'], epoch)
            print('test on target_test_loader')
            test(extractor, classifier, config['target_test_loader'], epoch)
        if epoch % config['VIS_INTERVAL'] == 0:
            if triplet_type == 'none':
                title = '(a) WDGRL'
            else:
                title = '(b) TLADA'
            draw_confusion_matrix(extractor, classifier,
                                  config['target_test_loader'], res_dir, epoch,
                                  title)
            draw_tsne(extractor, classifier, config['source_train_loader'],
                      config['target_test_loader'], res_dir, epoch, title,
                      separate=True)
            # draw_tsne(extractor, classifier, config['source_test_loader'], config['target_test_loader'], res_dir, epoch, title, separate=False)

    # Persist the plain-WDGRL checkpoints so triplet runs can warm-start.
    # NOTE(review): placement after the epoch loop reconstructed from collapsed
    # source — confirm against the original repository.
    if triplet_type == 'none':
        torch.save(extractor.state_dict(), extractor_path)
        torch.save(classifier.state_dict(), classifier_path)
        torch.save(critic.state_dict(), critic_path)
class DDPG(nn.Module):
    """Deep Deterministic Policy Gradient agent with target networks and an
    external exploration-noise process.

    Fixes vs. original: `get_action` ran the actor forward pass twice per call
    (same deterministic result, double the compute) — now a single pass; and
    `!= None` replaced with the idiomatic `is not None` (PEP 8).
    """

    def __init__(self, writer, device, state_dim, action_dim, args, noise):
        """
        Args:
            writer: logging object exposing add_scalar (e.g. SummaryWriter), or None.
            device: torch device for exploration noise and sampled batches.
            state_dim (int): dimension of the observation vector.
            action_dim (int): dimension of the action vector.
            args: hyper-parameter namespace (layer_num, hidden_dim, lrs,
                gamma, soft_update_rate, memory_size, ...).
            noise: noise process exposing sample() (e.g. Ornstein-Uhlenbeck).
        """
        super(DDPG, self).__init__()
        self.device = device
        self.writer = writer
        self.args = args
        # Online and target policy networks.
        self.actor = Actor(self.args.layer_num, state_dim, action_dim,
                           self.args.hidden_dim, self.args.activation_function,
                           self.args.last_activation, self.args.trainable_std)
        self.target_actor = Actor(self.args.layer_num, state_dim, action_dim,
                                  self.args.hidden_dim, self.args.activation_function,
                                  self.args.last_activation, self.args.trainable_std)
        # Q-networks consume the concatenated (state, action) vector.
        self.q = Critic(self.args.layer_num, state_dim + action_dim, 1,
                        self.args.hidden_dim, self.args.activation_function, None)
        self.target_q = Critic(self.args.layer_num, state_dim + action_dim, 1,
                               self.args.hidden_dim, self.args.activation_function, None)
        # rate=1.0 -> hard copy: targets start identical to the online nets.
        self.soft_update(self.q, self.target_q, 1.)
        self.soft_update(self.actor, self.target_actor, 1.)
        self.q_optimizer = optim.Adam(self.q.parameters(), lr=self.args.q_lr)
        self.actor_optimizer = optim.Adam(self.actor.parameters(),
                                          lr=self.args.actor_lr)
        self.data = ReplayBuffer(action_prob_exist=False,
                                 max_size=int(self.args.memory_size),
                                 state_dim=state_dim, num_action=action_dim)
        self.noise = noise

    def soft_update(self, network, target_network, rate):
        """Polyak-average network params into target_network (rate=1 -> hard copy)."""
        for network_params, target_network_params in zip(
                network.parameters(), target_network.parameters()):
            target_network_params.data.copy_(
                target_network_params.data * (1.0 - rate)
                + network_params.data * rate)

    def get_action(self, x):
        """Return (noisy action, second actor output) for exploration.

        Single actor forward pass; the original called self.actor(x) twice.
        """
        mu, std = self.actor(x)
        exploration = torch.tensor(self.noise.sample()).to(self.device)
        return mu + exploration, std

    def put_data(self, transition):
        """Store one transition tuple in the replay buffer."""
        self.data.put_data(transition)

    def train_net(self, batch_size, n_epi):
        """Sample a batch, update critic then actor, Polyak-update targets."""
        data = self.data.sample(shuffle=True, batch_size=batch_size)
        states, actions, rewards, next_states, dones = convert_to_tensor(
            self.device, data['state'], data['action'], data['reward'],
            data['next_state'], data['done'])
        # Bellman target from the target networks; detached below so no
        # gradient reaches them through the critic loss.
        targets = rewards + self.args.gamma * (1 - dones) * self.target_q(
            next_states, self.target_actor(next_states)[0])
        q_loss = F.smooth_l1_loss(self.q(states, actions), targets.detach())
        self.q_optimizer.zero_grad()
        q_loss.backward()
        self.q_optimizer.step()
        # Deterministic policy gradient: maximize Q of the actor's action.
        actor_loss = -self.q(states, self.actor(states)[0]).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        self.soft_update(self.q, self.target_q, self.args.soft_update_rate)
        self.soft_update(self.actor, self.target_actor,
                         self.args.soft_update_rate)
        if self.writer is not None:
            self.writer.add_scalar("loss/q", q_loss, n_epi)
            self.writer.add_scalar("loss/actor", actor_loss, n_epi)
class PPO(nn.Module): def __init__(self, writer, device, state_dim, action_dim, args): super(PPO,self).__init__() self.args = args self.data = ReplayBuffer(action_prob_exist = True, max_size = self.args.traj_length, state_dim = state_dim, num_action = action_dim) self.actor = Actor(self.args.layer_num, state_dim, action_dim, self.args.hidden_dim, \ self.args.activation_function,self.args.last_activation,self.args.trainable_std) self.critic = Critic(self.args.layer_num, state_dim, 1, \ self.args.hidden_dim, self.args.activation_function,self.args.last_activation) self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=self.args.actor_lr) self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=self.args.critic_lr) self.writer = writer self.device = device def get_action(self,x): mu,sigma = self.actor(x) return mu,sigma def v(self,x): return self.critic(x) def put_data(self,transition): self.data.put_data(transition) def get_gae(self, states, rewards, next_states, dones): values = self.v(states).detach() td_target = rewards + self.args.gamma * self.v(next_states) * (1 - dones) delta = td_target - values delta = delta.detach().cpu().numpy() advantage_lst = [] advantage = 0.0 for idx in reversed(range(len(delta))): if dones[idx] == 1: advantage = 0.0 advantage = self.args.gamma * self.args.lambda_ * advantage + delta[idx][0] advantage_lst.append([advantage]) advantage_lst.reverse() advantages = torch.tensor(advantage_lst, dtype=torch.float).to(self.device) return values, advantages def train_net(self,n_epi): data = self.data.sample(shuffle = False) states, actions, rewards, next_states, dones, old_log_probs = convert_to_tensor(self.device, data['state'], data['action'], data['reward'], data['next_state'], data['done'], data['log_prob']) old_values, advantages = self.get_gae(states, rewards, next_states, dones) returns = advantages + old_values advantages = (advantages - advantages.mean())/(advantages.std()+1e-3) for i in range(self.args.train_epoch): for 
state,action,old_log_prob,advantage,return_,old_value \ in make_mini_batch(self.args.batch_size, states, actions, \ old_log_probs,advantages,returns,old_values): curr_mu,curr_sigma = self.get_action(state) value = self.v(state).float() curr_dist = torch.distributions.Normal(curr_mu,curr_sigma) entropy = curr_dist.entropy() * self.args.entropy_coef curr_log_prob = curr_dist.log_prob(action).sum(1,keepdim = True) #policy clipping ratio = torch.exp(curr_log_prob - old_log_prob.detach()) surr1 = ratio * advantage surr2 = torch.clamp(ratio, 1-self.args.max_clip, 1+self.args.max_clip) * advantage actor_loss = (-torch.min(surr1, surr2) - entropy).mean() #value clipping (PPO2 technic) old_value_clipped = old_value + (value - old_value).clamp(-self.args.max_clip,self.args.max_clip) value_loss = (value - return_.detach().float()).pow(2) value_loss_clipped = (old_value_clipped - return_.detach().float()).pow(2) critic_loss = 0.5 * self.args.critic_coef * torch.max(value_loss,value_loss_clipped).mean() self.actor_optimizer.zero_grad() actor_loss.backward() nn.utils.clip_grad_norm_(self.actor.parameters(), self.args.max_grad_norm) self.actor_optimizer.step() self.critic_optimizer.zero_grad() critic_loss.backward() nn.utils.clip_grad_norm_(self.critic.parameters(), self.args.max_grad_norm) self.critic_optimizer.step() if self.writer != None: self.writer.add_scalar("loss/actor_loss", actor_loss.item(), n_epi) self.writer.add_scalar("loss/critic_loss", critic_loss.item(), n_epi)
class DDPGAgent():
    """ DDPG

    This class implements the DDPG algorithm. For more information see:
    https://spinningup.openai.com/en/latest/algorithms/ddpg.html

    Fix vs. original: the Q-target in learn() was computed with gradient
    tracking enabled, so backward() propagated through (and accumulated unused
    gradients in) the target networks; the target is now built under
    torch.no_grad(). The critic's own gradients are unchanged.
    """

    def __init__(self,
                 state_size,
                 action_size,
                 fc_layer_sizes,
                 buffer_size=30000,
                 batch_size=128,
                 update_interval=16,
                 num_update_steps=1,
                 noise_std=0.2,
                 noise_reduction=0.998,
                 noise_std_min=0.05,
                 warmup=1e4,
                 tau=0.02,
                 gamma=0.99,
                 lr_actor=2e-4,
                 lr_critic=2e-4,
                 seed=0):
        """ Initialize an DDPG agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            fc_layer_sizes (list of int): Layer size of each FC layer
            buffer_size (int): the size of the replay buffer
            batch_size (int): the size of the batches for network updates
            update_interval (int): number of steps between updates
            num_update_steps (int): number of update steps in a row
            noise_std (float): std of Gaussian noise for adding to action
            noise_reduction (float): factor to reduce noise after each update
            noise_std_min (float): the minimum value of noise_std
            warmup (float): number of steps before learning starts
            tau (float): soft weight update factor
            gamma (float): discount factor
            lr_actor (float): learning rate for actor
            lr_critic (float): learning rate for critic
            seed (int): random seed
        """
        self.action_size = action_size
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.update_interval = update_interval
        self.num_update_steps = num_update_steps
        self.tau = tau
        self.gamma = gamma
        self.noise_std = noise_std
        self.noise_reduction = noise_reduction
        self.noise_std_min = noise_std_min
        self.warmup = warmup
        self.t = 0
        # seed
        np.random.seed(seed)
        # torch device
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        # add replay buffer
        self.replay_buffer = ReplayBuffer(buffer_size, self.device, seed)
        # define networks, initialize target networks with original networks
        self.actor = Actor(state_size, action_size, fc_layer_sizes,
                           seed=seed).to(self.device)
        self.target_actor = Actor(state_size, action_size, fc_layer_sizes,
                                  seed=seed).to(self.device)
        self.critic = Critic(state_size, action_size, fc_layer_sizes,
                             seed=seed).to(self.device)
        self.target_critic = Critic(state_size, action_size, fc_layer_sizes,
                                    seed=seed).to(self.device)
        self.hard_updates()
        # define optimizers
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=lr_critic,
                                           weight_decay=0)

    def act(self, state, add_noise=True):
        """ Computes and returns the action to take

        Params
        ======
            state (list of float): current state
            add_noise (bool): whether to add exploration noise to the action
        """
        # input state to actor network in eval mode, get action, add Gaussian noise
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.actor.eval()
        with torch.no_grad():
            action = self.actor(state).squeeze().cpu().detach().numpy()
        self.actor.train()
        if add_noise:
            action += self.noise_std * np.random.normal(size=self.action_size)
        return action

    def step(self, state, action, reward, next_state, done):
        """ Saves step details and potentially performs network training

        Params
        ======
            state (list of float): current state
            action (list of float): action taken
            reward (float): reward received
            next_state (list of float): next state
            done (bool): bool whether end of episode reached
        """
        self.replay_buffer.add(state, action, reward, next_state, done)
        self.t += 1
        # learn only after warmup, at the update interval, with a full batch
        if self.t >= self.warmup:
            if self.t % self.update_interval == 0:
                if (len(self.replay_buffer) > self.batch_size):
                    self.learn()

    def learn(self):
        """ Performs actor and critic network training """
        for _ in range(self.num_update_steps):
            # sample a random batch of experiences
            states, actions, rewards, next_states, dones = self.replay_buffer.sample(
                self.batch_size)
            # compute Q targets without gradient tracking: the target networks
            # are only ever updated through soft_updates(), so backprop through
            # them would be wasted work and accumulate unused gradients
            with torch.no_grad():
                actions_next = self.target_actor(next_states)
                q_targets = rewards + self.gamma * \
                    (1 - dones) * self.target_critic(next_states, actions_next)
            q_expected = self.critic(states, actions)
            # compute critic loss, update critic
            critic_loss = F.mse_loss(q_expected, q_targets)
            self.critic_optimizer.zero_grad()
            critic_loss.backward()
            torch.nn.utils.clip_grad_norm_(self.critic.parameters(),
                                           .5)  # clip gradients
            self.critic_optimizer.step()
            # update actor
            actions_pred = self.actor(states)
            actor_loss = -self.critic(states, actions_pred).mean()
            self.actor_optimizer.zero_grad()
            actor_loss.backward()
            self.actor_optimizer.step()
            # update target networks
            self.soft_updates()
            # reduce action sampling noise
            self.noise_std = max(self.noise_std * self.noise_reduction,
                                 self.noise_std_min)

    def soft_updates(self):
        """ Performs a soft parameter update for target and original networks """
        for target, source in zip([self.target_actor, self.target_critic],
                                  [self.actor, self.critic]):
            for target_param, param in zip(target.parameters(),
                                           source.parameters()):
                target_param.data.copy_(target_param.data * (1.0 - self.tau) +
                                        param.data * self.tau)

    def hard_updates(self):
        """ Performs a hard parameter update for target and original networks """
        for target, source in zip([self.target_actor, self.target_critic],
                                  [self.actor, self.critic]):
            for target_param, param in zip(target.parameters(),
                                           source.parameters()):
                target_param.data.copy_(param.data)
def __init__(self,
             state_size,
             action_size,
             fc_layer_sizes,
             buffer_size=30000,
             batch_size=128,
             update_interval=16,
             num_update_steps=1,
             noise_std=0.2,
             noise_reduction=0.998,
             noise_std_min=0.05,
             warmup=1e4,
             tau=0.02,
             gamma=0.99,
             lr_actor=2e-4,
             lr_critic=2e-4,
             seed=0):
    """Construct a DDPG agent.

    Params
    ======
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        fc_layer_sizes (list of int): Layer size of each FC layer
        buffer_size (int): the size of the replay buffer
        batch_size (int): the size of the batches for network updates
        update_interval (int): number of steps between updates
        num_update_steps (int): number of update steps in a row
        noise_std (float): std of Gaussian noise for adding to action
        noise_reduction (float): factor to reduce noise after each update
        noise_std_min (float): the minimum value of noise_std
        warmup (float): number of steps before learning starts
        tau (float): soft weight update factor
        gamma (float): discount factor
        lr_actor (float): learning rate for actor
        lr_critic (float): learning rate for critic
        seed (int): random seed
    """
    # --- bookkeeping and hyper-parameters ---
    self.t = 0
    self.action_size = action_size
    self.buffer_size = buffer_size
    self.batch_size = batch_size
    self.update_interval = update_interval
    self.num_update_steps = num_update_steps
    self.warmup = warmup
    self.tau = tau
    self.gamma = gamma
    # exploration-noise schedule
    self.noise_std = noise_std
    self.noise_reduction = noise_reduction
    self.noise_std_min = noise_std_min

    # --- seeding and device selection ---
    np.random.seed(seed)
    self.device = torch.device(
        "cuda:0" if torch.cuda.is_available() else "cpu")

    # --- experience replay ---
    self.replay_buffer = ReplayBuffer(buffer_size, self.device, seed)

    # --- networks: online actor/critic plus target copies ---
    self.actor = Actor(state_size, action_size, fc_layer_sizes,
                       seed=seed).to(self.device)
    self.target_actor = Actor(state_size, action_size, fc_layer_sizes,
                              seed=seed).to(self.device)
    self.critic = Critic(state_size, action_size, fc_layer_sizes,
                         seed=seed).to(self.device)
    self.target_critic = Critic(state_size, action_size, fc_layer_sizes,
                                seed=seed).to(self.device)
    # targets start as exact copies of the online networks
    self.hard_updates()

    # --- optimizers ---
    self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=lr_actor)
    self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                       lr=lr_critic,
                                       weight_decay=0)