def closure():
    minimizer.zero_grad()
    x = transform_to(constraint)(unconstrained_x)
    y = self.acquisition_func(x)
    autograd.backward(unconstrained_x, autograd.grad(y, unconstrained_x))
    return y

def closure():
    minimizer.zero_grad()
    x = transform_to(constraint)(unconstrained_x)
    x = from_01(x)
    y = lower_confidence_bound(x, model)
    autograd.backward(x, autograd.grad(y, x))
    return y

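# The closures above follow the pattern from Pyro's Bayesian-optimisation tutorial:
# the candidate is parameterised in an unconstrained space, mapped through
# transform_to(constraint), and the gradient of the acquisition value is written
# back onto unconstrained_x via autograd.backward. A minimal, self-contained sketch
# of the surrounding L-BFGS loop is given below; the quadratic acquisition function
# and the [0, 1] bounds are illustrative stand-ins, not taken from any snippet here.
import torch
from torch import autograd, optim
from torch.distributions import constraints, transform_to


def toy_acquisition(x):
    # stand-in acquisition function; a real one would query a GP surrogate
    return ((x - 0.3) ** 2).sum()


constraint = constraints.interval(0.0, 1.0)            # optimise inside [0, 1]
unconstrained_x = torch.randn(1, requires_grad=True)   # unconstrained parameterisation
minimizer = optim.LBFGS([unconstrained_x], line_search_fn='strong_wolfe')


def toy_closure():
    minimizer.zero_grad()
    x = transform_to(constraint)(unconstrained_x)      # map into the feasible interval
    y = toy_acquisition(x)
    # push dy/d(unconstrained_x) onto the leaf tensor, as in the closures above
    autograd.backward(unconstrained_x, autograd.grad(y, unconstrained_x))
    return y


minimizer.step(toy_closure)
x_opt = transform_to(constraint)(unconstrained_x).detach()
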
def finish_episode_ac():
    '''Actor-Critic'''
    R = 0
    rewards = []
    saved_actions = net.saved_actions
    value_loss = 0
    for r in net.rewards[::-1]:
        R = r + args.gamma * R
        rewards.insert(0, R)
    rewards = torch.Tensor(rewards).cuda()
    for (action, value), r in zip(saved_actions, rewards):
        reward = r - value.data[0, 0]
        action.reinforce(reward)
        value_loss += F.smooth_l1_loss(value, Variable(torch.Tensor([r]).cuda()))
    optimizer.zero_grad()
    final_nodes = [value_loss] + list(map(lambda p: p[0], saved_actions))
    gradients = [torch.ones(1).cuda()] + [None] * len(saved_actions)
    autograd.backward(final_nodes, gradients)
    min_grad = np.Inf
    max_grad = -np.Inf
    torch.nn.utils.clip_grad_norm(net.parameters(), 10)
    for param in net.parameters():
        if torch.min(param.grad.data) < min_grad:
            min_grad = torch.min(param.grad.data)
        if torch.max(param.grad.data) > max_grad:
            max_grad = torch.max(param.grad.data)
    optimizer.step()
    del net.rewards[:]
    del net.saved_actions[:]
    return min_grad, max_grad

def update_controller(actionSeqs, avgR):
    print('Reinforcing for epoch %d' % e)
    for actions in actionSeqs:
        actions.reinforce(avgR - b)
        opti.zero_grad()
        autograd.backward(actions, [None for _ in actions])
        opti.step()

def finish_episode_re():
    '''REINFORCE'''
    R = 0
    rewards = []
    for r in net.rewards[::-1]:
        R = r + args.gamma * R
        rewards.insert(0, R)
    rewards = torch.Tensor(rewards).cuda()
    for action, r in zip(net.saved_actions, rewards):
        action.reinforce(r)
    optimizer.zero_grad()
    autograd.backward(net.saved_actions, [None for _ in net.saved_actions])
    min_grad = np.Inf
    max_grad = -np.Inf
    torch.nn.utils.clip_grad_norm(net.parameters(), 1)
    for param in net.parameters():
        if torch.min(param.grad.data) < min_grad:
            min_grad = torch.min(param.grad.data)
        if torch.max(param.grad.data) > max_grad:
            max_grad = torch.max(param.grad.data)
    optimizer.step()
    net.zero_grad()
    optimizer.zero_grad()
    del net.rewards[:]
    del net.saved_actions[:]
    return min_grad, max_grad

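# The REINFORCE/actor-critic updates above rely on the Tensor.reinforce() API,
# which was removed in PyTorch 0.4 in favour of torch.distributions. A minimal
# sketch of an equivalent update against the current API follows; the attribute
# names (saved_log_probs, rewards) and the policy/optimizer objects are
# illustrative assumptions, not part of the snippets above.
import torch


def finish_episode_modern(policy, optimizer, gamma=0.99):
    """REINFORCE update using stored log-probabilities instead of .reinforce()."""
    R = 0
    returns = []
    for r in policy.rewards[::-1]:           # discounted returns, back to front
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)

    loss = torch.stack([-log_prob * ret
                        for log_prob, ret in zip(policy.saved_log_probs, returns)]).sum()

    optimizer.zero_grad()
    loss.backward()                          # same effect as autograd.backward(loss)
    optimizer.step()

    del policy.rewards[:]
    del policy.saved_log_probs[:]
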
def test_PruneModel_ShouldBePrunedInRightPlace(self):
    output = self.model(self.inputs)
    backward(output, self.grad)

    # rig gradients, set all to 1, except first module's first map
    pConv2ds = (module for module in self.model.modules()
                if issubclass(type(module), pnn.PConv2d))
    for idx, pConv2d in enumerate(pConv2ds):
        pConv2d.taylor_estimates = torch.ones(pConv2d.taylor_estimates.size())
        if idx == 0:
            pConv2d.taylor_estimates[0] = 0.1

    expected_conv2d_out_channels = self.model.features[0].out_channels - 1
    expected_batchnorm_num_features = self.model.features[1].num_features - 1
    next_conv2d_in_channels = self.model.features[4].in_channels - 1

    self.model.prune()

    # being a little lazy here, since prunable_nn_test covered weight checking
    # check first conv2d's input
    self.assertEqual(self.model.features[0].out_channels, expected_conv2d_out_channels)
    # check first batchnorm's input
    self.assertEqual(self.model.features[1].num_features, expected_batchnorm_num_features)
    # check 2nd conv2d's input
    self.assertEqual(self.model.features[4].in_channels, next_conv2d_in_channels)

    # run again, ensure no bugs with modules
    self.model(self.inputs)

def learn_mine(self, batch, ma_rate=0.01):
    # batch is a tuple of (joint1, joint2, marginal (from the dataset of joint 2))
    joint1 = torch.autograd.Variable(batch[0])
    joint2 = torch.autograd.Variable(batch[2])
    marginal = torch.autograd.Variable(batch[4])  # the uneven parts of the dataset are the labels
    if torch.cuda.is_available():
        joint1 = joint1.to('cuda', non_blocking=True)
        joint2 = joint2.to('cuda', non_blocking=True)
        marginal = marginal.to('cuda', non_blocking=True)
        self.net = self.net.cuda()
    # joint = torch.autograd.Variable(torch.FloatTensor(joint))
    # marginal = torch.autograd.Variable(torch.FloatTensor(marginal))
    NIM, T, eT = self.mutual_information(joint1, joint2, marginal)

    # Using an exponential moving average to correct the bias
    ma_eT = (1 - ma_rate) * eT + ma_rate * torch.mean(eT)

    # unbiasing
    loss = -(torch.mean(T) - (1 / ma_eT.mean()).detach() * torch.mean(eT))
    # use biased estimator
    # loss = - mi_lb

    self.mine_net_optim.zero_grad()
    autograd.backward(loss)
    self.mine_net_optim.step()
    # self.scheduler.step()
    # self.scheduler2.step(NIM)
    if torch.cuda.is_available():
        NIM = NIM.cpu()
        loss = loss.cpu()
    return NIM, loss

def learn_mine(batch, mine_net, mine_net_optim, ma_et, ma_rate=0.01):
    # batch is a tuple of (joint, marginal)
    joint, marginal = batch
    # print("joint:", joint.shape)
    # print("marginal:", marginal.shape)
    # print("input joint:", joint)
    joint = torch.autograd.Variable(torch.FloatTensor(joint)).cuda()
    # print("output joint:", joint)
    marginal = torch.autograd.Variable(torch.FloatTensor(marginal)).cuda()
    # mi_lb is the mutual-information lower bound that gets plotted
    mi_lb, t, et = mutual_information(joint, marginal, mine_net)
    # et is the network output for the marginal samples
    # ma_et is the moving-average state carried across iterations
    ma_et = (1 - ma_rate) * ma_et + ma_rate * torch.mean(et)
    # print("ma_et:", ma_et)

    # unbiasing use moving average
    loss = -(torch.mean(t) - (1 / ma_et.mean()).detach() * torch.mean(et))
    # print(loss)
    # use biased estimator
    # loss = - mi_lb

    mine_net_optim.zero_grad()
    autograd.backward(loss)
    mine_net_optim.step()
    return mi_lb, ma_et

def update_mine_net(self, batch, mine_net_optim, ma_rate=0.01):
    """Performs one MINE update step on a batch.

    Arguments:
        batch {tuple} -- ([batch_size x 2], [batch_size x 2]) joint and marginal samples
        mine_net_optim -- optimizer for the MINE statistics network
        ma_rate {float} -- moving average rate (default: {0.01})

    Returns:
        mi_lb -- mutual-information lower bound
        lossTrain -- training loss for this step
    """
    # batch is a tuple of (joint, marginal)
    joint, marginal = batch
    joint = torch.autograd.Variable(torch.FloatTensor(joint))
    marginal = torch.autograd.Variable(torch.FloatTensor(marginal))
    mi_lb, t, et = self.mutual_information(joint, marginal)
    self.ma_et = (1 - ma_rate) * self.ma_et + ma_rate * torch.mean(et)

    # unbiasing use moving average
    loss = -(torch.mean(t) - (1 / self.ma_et.mean()).detach() * torch.mean(et))
    # use biased estimator
    # loss = - mi_lb
    lossTrain = loss
    mine_net_optim.zero_grad()
    autograd.backward(loss)
    mine_net_optim.step()
    return mi_lb, lossTrain

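# All three learn_mine/update_mine_net variants above call a mutual_information
# helper that evaluates the Donsker-Varadhan lower bound E_J[T] - log E_M[exp T].
# A minimal sketch of what such a helper typically computes is given below; the
# statistics network mine_net and the (joint, marginal) batch layout are
# assumptions inferred from the call sites, not code taken from these repositories.
import torch


def mutual_information(joint, marginal, mine_net):
    t = mine_net(joint)                  # T(x, y) on samples from the joint
    et = torch.exp(mine_net(marginal))   # exp(T(x, y)) on shuffled (marginal) pairs
    mi_lb = torch.mean(t) - torch.log(torch.mean(et))
    return mi_lb, t, et
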
def test_PruneModel_PruneFirstFeatureMapOfLastModule(self):
    output = self.model(self.inputs)
    backward(output, self.grad)

    # rig gradients, set all to 1, except last module's first map
    pConv2ds = [
        module for module in self.model.modules()
        if issubclass(type(module), pnn.PConv2d)
    ]
    last_idx = len(pConv2ds) - 1
    for idx, pConv2d in enumerate(pConv2ds):
        pConv2d.taylor_estimates = torch.ones(pConv2d.taylor_estimates.size())
        if idx == last_idx:
            pConv2d.taylor_estimates[0] = 0.1

    old_linear_in_features = self.model.classifier[0].in_features

    self.model.prune()

    # only check linear's input size
    self.assertTrue(self.model.classifier[0].in_features < old_linear_in_features)

    # run again, ensure no bugs with modules
    self.model(self.inputs)

def finish_episode(e, actions, values, rewards):
    # Calculate discounted rewards, going backwards from end
    discounted_rewards = []
    R = 0
    for r in rewards[::-1]:
        R = r + gamma * R
        discounted_rewards.insert(0, R)
    discounted_rewards = torch.Tensor(discounted_rewards)

    # Use REINFORCE on chosen actions and associated discounted rewards
    value_loss = 0
    for action, value, reward in zip(actions, values, discounted_rewards):
        reward_diff = reward - value.data[0]  # Treat critic value as baseline
        action.reinforce(reward_diff)  # Try to perform better than baseline
        value_loss += mse(value, Variable(torch.Tensor([reward])))  # Compare with actual reward

    # Backpropagate
    optimizer.zero_grad()
    nodes = [value_loss] + actions
    gradients = [torch.ones(1)] + [None for _ in actions]  # No gradients for reinforced values
    autograd.backward(nodes, gradients)
    optimizer.step()

    # Save Model
    if e % 10000 == 0:
        ckpt = 'out_checkpoint/RG10_' + str(e) + '.pkl'
        torch.save(policy.state_dict(), ckpt)

    return discounted_rewards, value_loss

def finish_episode(actions, values, rewards):
    global optimizer

    # Calculate discounted rewards, going backwards from end
    discounted_rewards = []
    R = 0
    for r in rewards[::-1]:
        R = r + gamma * R
        discounted_rewards.insert(0, R)
    discounted_rewards = torch.Tensor(discounted_rewards)

    # Use REINFORCE on chosen actions and associated discounted rewards
    value_loss = 0
    count = 0
    for action, value, reward in zip(actions, values, discounted_rewards):
        count += 1
        reward_diff = reward - value.data[0]  # Treat critic value as baseline
        action.reinforce(reward_diff)  # Try to perform better than baseline
        value_loss += mse(value, Variable(torch.Tensor([reward])))  # Compare with actual reward

    # Backpropagate
    optimizer.zero_grad()
    nodes = [value_loss] + actions
    gradients = [torch.ones(1)] + [None for _ in actions]  # No gradients for reinforced values
    autograd.backward(nodes, gradients)
    optimizer.step()

    return discounted_rewards, value_loss

def finish_episode(self):
    """update policy based on the results in one episode"""
    R = 0
    rewards = []
    for r in self.reward_seq[::-1]:
        R = r + self.policynet.gamma * R
        rewards.insert(0, R)
    rewards = torch.Tensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)

    gradients = [torch.zeros(1, len(self.action_spec))] * len(self.action_seq)
    for t in range(len(self.reward_seq)):
        for a in np.array([0, 3]):
            # if self.action_seq[t][0][a] > 0.5:
            #     gradients[t][0][a] = -rewards[t]
            # elif self.action_seq[t][0][a] < 0.5:
            #     gradients[t][0][a] = rewards[t]
            if self.action_seq[t][a] > 0:
                gradients[t][0][a] = -rewards[t]
            elif self.action_seq[t][a] < 0:
                gradients[t][0][a] = rewards[t]

    self.optimizer.zero_grad()
    autograd.backward(self.action_prob_seq, gradients)
    self.optimizer.step()

    del self.reward_seq[:]
    del self.action_prob_seq[:]
    del self.action_seq[:]

def finish_episode(episodes):
    R = 0
    rewards = []
    # print(len(model.rewards))
    # print(len(model.saved_actions))
    # get the accumulated reward
    for r in model.rewards[::-1]:
        R = r + args.gamma * R
        rewards.insert(0, R)
    rewards = torch.Tensor(rewards)
    # rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)

    log_reward = open('ind_snapshots/rewad.txt', 'a')
    for action, r in zip(model.saved_actions, rewards):
        # log_reward.write(str(action)+' '+str(r))
        # print(action.data.cpu().numpy()[0,0])
        action.reinforce(r)
        log_reward.write(str(action.data.cpu().numpy()[0, 0]) + ' ' + str(r) + '\n')
    log_reward.close()

    optimizer.zero_grad()
    autograd.backward(model.saved_actions, [None for _ in model.saved_actions])
    optimizer.step()
    # if episodes % 4 == 0:
    #     optimizer.step()
    #     optimizer.zero_grad()

    del model.rewards[:]
    del model.saved_actions[:]
    del model.saved_probs[:]

def closure():
    minimizer.zero_grad()
    x = transform_to(constraint)(unconstrained_x)
    # Object of x: [[]] -> []
    x = x[0]
    y = self.lower_confidence_bound(x, gpmodel)
    autograd.backward(unconstrained_x, autograd.grad(y, unconstrained_x))
    return y

def closure():
    minimizer.zero_grad()
    x = transform_to(self.x_constraint)(unconstrained_x)
    x = x.reshape((1, self.dim))
    y = self.lower_confidence_bound(x)
    autograd.backward(unconstrained_x, autograd.grad(y, unconstrained_x))
    return y

def closure():
    minimizer.zero_grad()
    x = transf_values(x_uncon, constr, x_dims)
    y = model_predict(x)[return_site].mean(0)
    autograd.backward(x_uncon, autograd.grad(y, x_uncon))
    return y

def closure():
    minimizer.zero_grad()
    x = transform_to(constraint)(unconstrained_x)
    y = lower_confidence_bound(model, likelihood, x)
    # y = lower_confidence_bound(unconstrained_x)
    # print(autograd.grad(y, unconstrained_x))
    # print(y)
    autograd.backward(unconstrained_x, autograd.grad(y, unconstrained_x))
    return y

def natural_hparams(self, value):
    if value.grad is not None:
        value.grad.zero_()
    copied_value = torch.tensor(value.detach(), requires_grad=True)
    log_norm_value = self.log_norm(copied_value)
    ta.backward(log_norm_value)
    self._expected_sufficient_statistics = torch.tensor(copied_value.grad)
    self._natural_hparams = copied_value
    self._log_norm_value = torch.tensor(log_norm_value)

def adversarial_imitation_update(algorithm, agent, discriminator, expert_trajectories,
                                 policy_trajectories, discriminator_optimiser, batch_size,
                                 r1_reg_coeff=1):
    expert_dataloader = DataLoader(expert_trajectories, batch_size=batch_size, shuffle=True, drop_last=True)
    policy_dataloader = DataLoader(policy_trajectories, batch_size=batch_size, shuffle=True, drop_last=True)

    # Iterate over minimum of expert and policy data
    for expert_transition, policy_transition in zip(expert_dataloader, policy_dataloader):
        expert_state, expert_action, expert_next_state, expert_terminal = expert_transition['states'], expert_transition['actions'], expert_transition['next_states'], expert_transition['terminals']
        policy_state, policy_action, policy_next_state, policy_terminal = policy_transition['states'], policy_transition['actions'], policy_transition['next_states'], policy_transition['terminals']

        if algorithm == 'GAIL':
            D_expert = discriminator(expert_state, expert_action)
            D_policy = discriminator(policy_state, policy_action)
        elif algorithm == 'AIRL':
            with torch.no_grad():
                expert_data_policy = agent.log_prob(expert_state, expert_action).exp()
                policy_data_policy = agent.log_prob(policy_state, policy_action).exp()
            D_expert = discriminator(expert_state, expert_action, expert_next_state, expert_data_policy, expert_terminal)
            D_policy = discriminator(policy_state, policy_action, policy_next_state, policy_data_policy, policy_terminal)

        # Binary logistic regression
        discriminator_optimiser.zero_grad()
        expert_loss = F.binary_cross_entropy(D_expert, torch.ones_like(D_expert))  # Loss on "real" (expert) data
        autograd.backward(expert_loss, create_graph=True)
        r1_reg = 0
        for param in discriminator.parameters():
            r1_reg += param.grad.norm().mean()  # R1 gradient penalty
        policy_loss = F.binary_cross_entropy(D_policy, torch.zeros_like(D_policy))  # Loss on "fake" (policy) data
        (policy_loss + r1_reg_coeff * r1_reg).backward()
        discriminator_optimiser.step()

def train_scvi(model, train_set, val_set, n_batches=32, n_epochs=300, lr=0.001, save_path="./models"):
    """
    Trains the model
    :param model: The model to train
    :param train_set: The training set (split into mini-batches)
    :param val_set: The validation set
    :return: The training and validation losses per epoch
    """
    model.to(device)
    val_set = torch.tensor(val_set).to(device)
    adam = optim.Adam(model.parameters(), lr=lr)
    losses_train = []
    losses_val = []
    train_set_shuff = torch.tensor(train_set).to(device)
    log_library_size = torch.log(torch.sum(train_set_shuff, dim=1))
    prior_l_m, prior_l_v = torch.mean(log_library_size), torch.var(log_library_size)

    # training
    for epoch in range(n_epochs):
        train_set_shuff = train_set_shuff[torch.randperm(train_set_shuff.size()[0])]  # Shuffle data at each epoch
        model.train()
        for i in range(int(len(train_set) / n_batches) + 1):
            minibatch = train_set_shuff[i * n_batches:(i + 1) * n_batches, :]
            qz, mu_z, sigma_z, ql, mu_l, sigma_l, mu, h = model(minibatch)  # forward pass
            loss_train = model.loss(minibatch, qz, mu_z, sigma_z, ql, mu_l, sigma_l,
                                    mu, h, prior_l_m, prior_l_v)  # compute ELBO
            autograd.backward(loss_train, retain_graph=True)  # backward pass
            adam.step()  # parameters update
            adam.zero_grad()  # put the gradients back to zero for the next mini-batch

        model.eval()
        with torch.set_grad_enabled(False):
            for i in range(int(len(val_set) / n_batches)):
                minibatch = val_set[i * n_batches:(i + 1) * n_batches, :]
                qz, mu_z, sigma_z, ql, mu_l, sigma_l, mu, h = model(minibatch)
                loss_val = model.loss(minibatch, qz, mu_z, sigma_z, ql, mu_l, sigma_l,
                                      mu, h, prior_l_m, prior_l_v)

        losses_train.append(loss_train)
        losses_val.append(loss_val)

    return losses_train, losses_val

def closure():
    minimizer.zero_grad()
    x = transform_to(constraint)(unconstrained_x)
    y = log_expected_improvement(model, likelihood, x, previous_best, device)
    # y = lower_confidence_bound(unconstrained_x)
    # print(autograd.grad(y, unconstrained_x))
    # print(y)
    autograd.backward(unconstrained_x, autograd.grad(y, unconstrained_x))
    return y

def closure():
    minimizer.zero_grad()
    if (torch.log(torch.abs(unconstrained_x)) > 25.).any():
        return torch.tensor(float('inf'))
    x = transform_to(self.constraints)(unconstrained_x)
    y = differentiable(x)
    autograd.backward(unconstrained_x,
                      autograd.grad(y, unconstrained_x, retain_graph=True))
    return y

def accumulate_gradients(self, grad_infos):
    bwd_out = list()
    bwd_in = list()
    for datas, grad_datas, etas, grad_etas in grad_infos:
        bwd_out += list(etas)
        bwd_in += list(grad_etas)
        for data, grad_data in zip(datas, grad_datas):
            data.grad.add_(grad_data)
    if len(bwd_out) > 0:
        autograd.backward(bwd_out, bwd_in)

def closure():
    # ipdb.set_trace()
    minimizer.zero_grad()
    x = transform_to(constraint)(unconstrained_x)
    y = q_expected_improvement(x, gpmodel,
                               sampling_type=sampling_type,
                               sample_size=sample_size)
    autograd.backward(unconstrained_x, autograd.grad(y, unconstrained_x))
    return y

def update_controller(self, avgR, b):
    for actions in self.actionSeqs:
        if isinstance(actions, list):
            for action in actions:
                action.reinforce(avgR - b)
        else:
            actions.reinforce(avgR - b)
        self.optimizer.zero_grad()
        autograd.backward(actions, [None for _ in actions])
        self.optimizer.step()
    self.actionSeqs = []

def expected_value(self):
    '''Mean value of the random variable w.r.t. the distribution.

    Returns:
        ``torch.Tensor``

    '''
    copied_tensor = torch.tensor(self.natural_parameters, requires_grad=True)
    log_norm = self.log_norm(copied_tensor)
    ta.backward(log_norm)
    return copied_tensor.grad.detach()

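# Both the expected_value property above and the natural_hparams setter earlier use
# the same exponential-family identity: the gradient of the log-normaliser with
# respect to the natural parameters equals the expected sufficient statistics.
# A tiny worked check of that identity, using a categorical parameterisation purely
# for illustration (logsumexp as log-normaliser, softmax as its gradient):
import torch
from torch import autograd as ta

eta = torch.tensor([0.2, -1.0, 0.5], requires_grad=True)
log_norm = torch.logsumexp(eta, dim=0)
ta.backward(log_norm)
print(torch.allclose(eta.grad, torch.softmax(eta, dim=0)))  # True
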
def update_controller(actionSeqs, valueSeqs, avgR):
    print('Reinforcing for epoch %d' % e)
    LossFn = nn.SmoothL1Loss()
    value_loss = 0
    for (actions, values) in zip(actionSeqs, valueSeqs):
        actions.reinforce(-(values.data - avgR))
        rew = Variable(torch.Tensor([avgR] * values.size(0))).detach()
        value_loss += LossFn(values, rew)
    opti.zero_grad()
    autograd.backward([value_loss] + actionSeqs,
                      [torch.ones(1)] + [None for _ in actionSeqs])
    opti.step()

def finish_game(policy, optimizer):
    rewards = policy.rewards + [0]
    rewards = torch.Tensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    for action, r in zip(policy.saved_actions, rewards):
        action.reinforce(r)
    optimizer.zero_grad()
    autograd.backward(policy.saved_actions, [None for _ in policy.saved_actions])
    optimizer.step()
    del policy.rewards[:]
    del policy.saved_actions[:]

def train_model(model, train, valid, save_path):
    """
    Function that trains the model
    :param model: The model to train
    :param train: The training set
    :param valid: The validation set
    :return:
    """
    # optimizer for the network
    adam = optim.Adam(model.parameters(), lr=3e-4)

    for epoch in range(args.nb_epochs):
        for i, (batch, label) in enumerate(train):
            # put batch on device
            batch = batch.to(args.device)

            # obtain the parameters from the encoder and compute KL divergence
            mu, log_sigma, g_z = model(batch)
            kl = kl_div(mu, log_sigma)

            # compute the reconstruction loss
            logpx_z = ll(batch.view(-1, 3 * 32 * 32), g_z.view(-1, 3 * 32 * 32))

            # combine the two loss terms and compute gradients
            elbo = (logpx_z - kl).mean()

            # maximize the elbo i.e. minimize - elbo
            autograd.backward([-elbo])

            # Update the parameters and zero the gradients for the next mini-batch
            adam.step()
            adam.zero_grad()

        # compute the loss for the validation set
        with torch.no_grad():
            valid_elbo = torch.zeros(1)
            nb_batches = 0
            for i, (batch, label) in enumerate(valid):
                nb_batches += 1
                batch = batch.to(args.device)
                mu, log_sigma, g_z = model(batch)
                kl = kl_div(mu, log_sigma)
                logpx_z = ll(batch.view(-1, 3 * 32 * 32), g_z.view(-1, 3 * 32 * 32))
                valid_elbo += (logpx_z - kl).mean()
            valid_elbo /= nb_batches

        print("After epoch {} the validation loss is: ".format(epoch + 1), valid_elbo.item())

    # save the model to be used later
    torch.save(model.state_dict(), save_path)

def finish_episode():
    R = 0
    rewards = []
    for r in policy.rewards[::-1]:
        R = r + args.gamma * R
        rewards.insert(0, R)
    rewards = torch.Tensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    for action, r in zip(policy.saved_actions, rewards):
        action.reinforce(r)
    optimizer.zero_grad()
    autograd.backward(policy.saved_actions, [None for _ in policy.saved_actions])
    optimizer.step()
    del policy.rewards[:]
    del policy.saved_actions[:]

def finish_episode():
    # training at the end of an episode
    global num_episodes
    print('finish_episode({:d})'.format(num_episodes))
    R = 0
    rewards = []
    for r in policy.rewards[::-1]:
        R = r + args.gamma * R
        rewards.insert(0, R)
    rewards = torch.Tensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    for action, r in zip(policy.saved_actions, rewards):
        action.reinforce(r)
    optimizer.zero_grad()
    autograd.backward(policy.saved_actions, [None for _ in policy.saved_actions])
    optimizer.step()
    del policy.rewards[:]
    del policy.saved_actions[:]
    num_episodes += 1

def finish_episode():
    R = 0
    rewards = []
    # Discounted sum of rewards
    for r in model.rewards[::-1]:
        R = r + args.gamma * R
        rewards.insert(0, R)
    rewards = torch.Tensor(rewards)
    # Normalize rewards
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    # Scale each action by its reward
    for action, r in zip(model.saved_actions, rewards):
        action.reinforce(r)
    optimizer.zero_grad()
    autograd.backward(model.saved_actions, [None for _ in model.saved_actions])
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]

def finish_episode():
    R = 0
    saved_actions = model.saved_actions
    value_loss = 0
    rewards = []
    for r in model.rewards[::-1]:
        R = r + args.gamma * R
        rewards.insert(0, R)
    rewards = torch.Tensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    for (action, value), r in zip(saved_actions, rewards):
        reward = r - value.data[0, 0]
        action.reinforce(reward)
        value_loss += F.smooth_l1_loss(value, Variable(torch.Tensor([r])))
    optimizer.zero_grad()
    final_nodes = [value_loss] + list(map(lambda p: p.action, saved_actions))
    gradients = [torch.ones(1)] + [None] * len(saved_actions)
    autograd.backward(final_nodes, gradients)
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]

def a2c_train_step(agent, abstractor, loader, opt, grad_fn,
                   gamma=0.99, reward_fn=compute_rouge_l,
                   stop_reward_fn=compute_rouge_n(n=1), stop_coeff=1.0):
    opt.zero_grad()
    indices = []
    probs = []
    baselines = []
    ext_sents = []
    art_batch, abs_batch = next(loader)
    for raw_arts in art_batch:
        (inds, ms), bs = agent(raw_arts)
        baselines.append(bs)
        indices.append(inds)
        probs.append(ms)
        ext_sents += [raw_arts[idx.item()]
                      for idx in inds if idx.item() < len(raw_arts)]
    with torch.no_grad():
        summaries = abstractor(ext_sents)
    i = 0
    rewards = []
    avg_reward = 0
    for inds, abss in zip(indices, abs_batch):
        rs = ([reward_fn(summaries[i+j], abss[j])
               for j in range(min(len(inds)-1, len(abss)))]
              + [0 for _ in range(max(0, len(inds)-1-len(abss)))]
              + [stop_coeff*stop_reward_fn(
                  list(concat(summaries[i:i+len(inds)-1])),
                  list(concat(abss)))])
        assert len(rs) == len(inds)
        avg_reward += rs[-1]/stop_coeff
        i += len(inds)-1
        # compute discounted rewards
        R = 0
        disc_rs = []
        for r in rs[::-1]:
            R = r + gamma * R
            disc_rs.insert(0, R)
        rewards += disc_rs
    indices = list(concat(indices))
    probs = list(concat(probs))
    baselines = list(concat(baselines))
    # standardize rewards
    reward = torch.Tensor(rewards).to(baselines[0].get_device())
    reward = (reward - reward.mean()) / (
        reward.std() + float(np.finfo(np.float32).eps))
    baseline = torch.cat(baselines).squeeze()
    avg_advantage = 0
    losses = []
    for action, p, r, b in zip(indices, probs, reward, baseline):
        advantage = r - b
        avg_advantage += advantage
        losses.append(-p.log_prob(action)
                      * (advantage/len(indices)))  # divide by T*B
    critic_loss = F.mse_loss(baseline, reward)
    # backprop and update
    autograd.backward(
        [critic_loss] + losses,
        [torch.ones(1).to(critic_loss.get_device())]*(1+len(losses))
    )
    grad_log = grad_fn()
    opt.step()
    log_dict = {}
    log_dict.update(grad_log)
    log_dict['reward'] = avg_reward/len(art_batch)
    log_dict['advantage'] = avg_advantage.item()/len(indices)
    log_dict['mse'] = critic_loss.item()
    assert not math.isnan(log_dict['grad_norm'])
    return log_dict