Esempio n. 1
0
    def aggregate_updates(self, global_model, agent_updates_dict, cur_round):
        """Combine per-agent updates and apply them to the global model in place.

        Args:
            global_model: model whose parameters are overwritten with the
                aggregated result.
            agent_updates_dict: mapping agent id -> flat parameter-update vector.
            cur_round: current round index (only used by the commented-out
                plotting helpers below).
        """
        # adjust LR if robust LR is selected
        # Per-coordinate server learning rate; the robust-LR defense replaces
        # it with rates computed from the agents' update signs when enabled.
        lr_vector = torch.Tensor([self.server_lr] * self.n_params).to(
            self.args.device)
        if self.args.robustLR_threshold > 0:
            lr_vector = self.compute_robustLR(agent_updates_dict)

        # NOTE(review): if self.args.aggr matches none of the branches below,
        # aggregated_updates stays the int 0 and the tensor ops that follow
        # would misbehave — confirm aggr is validated upstream.
        aggregated_updates = 0
        if self.args.aggr == 'avg':
            aggregated_updates = self.agg_avg(agent_updates_dict)
        elif self.args.aggr == 'comed':
            aggregated_updates = self.agg_comed(agent_updates_dict)
        elif self.args.aggr == 'sign':
            aggregated_updates = self.agg_sign(agent_updates_dict)

        # Optional Gaussian noise scaled by the clipping bound.
        if self.args.noise > 0:
            aggregated_updates.add_(
                torch.normal(mean=0,
                             std=self.args.noise * self.args.clip,
                             size=(self.n_params, )).to(self.args.device))

        # Apply the per-coordinate learning-rated update to the flat global
        # parameter vector, then write it back into the model.
        cur_global_params = parameters_to_vector(global_model.parameters())
        new_global_params = (cur_global_params +
                             lr_vector * aggregated_updates).float()
        vector_to_parameters(new_global_params, global_model.parameters())

        # some plotting stuff if desired
        # self.plot_sign_agreement(lr_vector, cur_global_params, new_global_params, cur_round)
        # self.plot_norms(agent_updates_dict, cur_round)
        return
Esempio n. 2
0
 def set_and_eval(step):
     """Apply a scaled trust-region step to the policy and evaluate it.

     Uses variables from the enclosing scope (old_params, alpha, x, obs, act,
     policy_args, logp_old, adv). Returns the MPI-averaged KL divergence and
     surrogate policy loss at the proposed parameters.
     """
     # Overwrite the policy parameters with old_params - alpha * x * step.
     vector_to_parameters(old_params - alpha * x * step,
                          actor_critic.policy.parameters())
     _, logp, _, _, d_kl = actor_critic.policy(obs, act, **policy_args)
     # Importance ratio pi_new(a|s) / pi_old(a|s).
     ratio = (logp - logp_old).exp()
     pi_loss = -(ratio * adv).mean()
     return mpi_avg(d_kl.item()), mpi_avg(pi_loss.item())
Esempio n. 3
0
 def f_barrier(params):
     """Barrier-augmented surrogate loss at the given flat parameter vector.

     Loads `params` into the actor, then returns the surrogate loss as a float
     if the average KL estimate stays below self.config["delta"], and +inf
     otherwise (hard-barrier rejection, line-search friendly).
     """
     vector_to_parameters(params, self.module.actor.parameters())
     new_logp = self.module.actor.log_prob(cur_obs, actions)
     surr_loss = self._compute_surr_loss(old_logp, new_logp, advantages)
     # NOTE(review): mean(old_logp - new_logp) is a sampled KL estimate and
     # can be negative — confirm this matches the caller's intent.
     avg_kl = torch.mean(old_logp - new_logp)
     return surr_loss.item(
     ) if avg_kl < self.config["delta"] else np.inf
Esempio n. 4
0
    def func(params, params_i, params_j):
        """KL objective with parameter slice [params_i:params_j] replaced by `params`.

        Temporarily writes `params` into the model's parameter list, evaluates
        kl_fn on the model's outputs, restores the original weights, and
        returns (kl_value, replaced_param_slice).
        """
        # print ("Params inpute: ", type(params))
        import time

        old_params = list(model.parameters()) #parameters_to_vector(model.parameters())

        t1s = time.time()
        # Clone so the slice assignment below does not touch the live model yet.
        cur_params = [v.clone() for v in old_params]
        cur_params[params_i:params_j] = params
        t1e = time.time()
        # print ("klidx 0: ", (t1e-t1s))

        t1s = time.time()
        # Flatten-then-unflatten pushes the modified list into the model.
        vector_to_parameters(parameters_to_vector(cur_params), model.parameters())
        t1e = time.time()
        # print ("klidx 1: ", (t1e-t1s))

        # NOTE(review): old_log_probs is a detached clone of new_log_probs, so
        # f = kl_fn(new, new.detach()) — presumably used for its gradient
        # structure only; confirm against kl_fn's definition.
        new_log_probs = model(inputs)
        old_log_probs = torch.clone(new_log_probs).detach()
        f = kl_fn(new_log_probs, old_log_probs)

        t1s = time.time()
        tmp_params = list(model.parameters())[params_i:params_j]
        # Restore the original weights before returning.
        vector_to_parameters(parameters_to_vector(old_params), model.parameters())
        t1e = time.time()
        # print ("klidx 2: ", (t1e-t1s))

        return f, tmp_params
Esempio n. 5
0
    def step(self, closure=None, thr=1e-2, eps=1e-9):
        """Run one optimization step, with optional distributed gradient
        averaging and an optional LARS-style layer-wise rate adaptation.

        Args:
            closure: if given, re-evaluates the loss; the resulting gradients
                are all-reduced and averaged across workers before the step.
            thr: minimum pre-step parameter norm for LARS scaling to apply.
            eps: numerical-stability constant in the LARS rate denominator.

        Returns:
            The loss returned by `closure`, or None when no closure is given.
        """
        loss = None
        if closure is not None:
            loss = closure()
            world_size = self.dist.get_world_size()
            grads = [p.grad for p in self.model.parameters()]
            # pack -> all-reduce -> average -> unpack back into the grads
            packed_tensor = parameters_to_vector(grads)
            self.dist.all_reduce(packed_tensor)
            vector_to_parameters(packed_tensor.div_(world_size), grads)

        if self.lars:
            # Snapshot parameters so the raw update can be rescaled afterwards.
            for group in self.param_groups:
                for p in group['params']:
                    setattr(p, 'data_pre', p.data.detach().clone())

        self.actual_optimizer.step(closure=None)

        if self.lars:
            for group in self.param_groups:
                for p in group['params']:
                    d_norm_pre = p.data_pre.norm()
                    if d_norm_pre > thr:
                        upd = p.data - p.data_pre
                        upd_norm = upd.norm()
                        # Trust ratio: lr * ||w_pre|| / ||delta_w||.
                        rate = group['lr'] * d_norm_pre / (upd_norm + eps)
                        # Fixed: the deprecated/removed positional form
                        # data_pre.add(rate, upd) is replaced by the
                        # equivalent explicit expression (rate is a 0-dim
                        # tensor, so the `alpha=` keyword is not usable).
                        p.data = p.data_pre + rate * upd

        return loss
Esempio n. 6
0
    def comp_diag_fisher(self, model_params, data_loader, adv=True):
        """Diagonal Fisher-information estimate for the given parameter vector.

        Args:
            model_params: flat parameter vector loaded into a freshly built model.
            data_loader: batches of (inputs, labels) to accumulate over.
            adv: if False, every label is replaced with args.base_class before
                computing the objective.

        Returns:
            Detached flat vector of per-parameter squared-gradient averages.
        """

        model = models.get_model(self.args.data)
        vector_to_parameters(model_params, model.parameters())
        params = {n: p for n, p in model.named_parameters() if p.requires_grad}
        # Zero-valued accumulators shaped like each trainable parameter.
        precision_matrices = {}
        for n, p in deepcopy(params).items():
            p.data.zero_()
            precision_matrices[n] = p.data

        model.eval()
        for _, (inputs, labels) in enumerate(data_loader):
            model.zero_grad()
            inputs, labels = inputs.to(device=self.args.device, non_blocking=True),\
                                    labels.to(device=self.args.device, non_blocking=True).view(-1, 1)
            if not adv:
                labels.fill_(self.args.base_class)

            # NOTE(review): log_all_probs is computed but never used; the
            # gathered "target log-probs" come from the raw logits, not from
            # log_softmax — confirm whether that is intentional.
            outputs = model(inputs)
            log_all_probs = F.log_softmax(outputs, dim=1)
            target_log_probs = outputs.gather(1, labels)
            batch_target_log_probs = target_log_probs.sum()
            batch_target_log_probs.backward()

            # Accumulate squared gradients, averaged over the dataset size.
            for n, p in model.named_parameters():
                precision_matrices[n].data += (p.grad.data**2) / len(
                    data_loader.dataset)

        return parameters_to_vector(precision_matrices.values()).detach()
Esempio n. 7
0
    def get_mc_predictions(self, forward_function, inputs, mc_samples=1, ret_numpy=False, *args, **kwargs):
        """Returns Monte Carlo predictions.

        Arguments:
            forward_function (callable): The forward function of the model
                that takes inputs and returns the outputs.
            inputs (FloatTensor): The inputs to the model.
            mc_samples (int): The number of Monte Carlo samples.
            ret_numpy (bool): If true, the returned list contains numpy arrays,
                otherwise it contains torch tensors.

        Returns:
            list: one model output per Monte Carlo parameter sample.
        """

        # We only support a single parameter group.
        parameters = self.param_groups[0]['params']
        predictions = []

        Precision = self.state['Precision']
        mu = self.state['mu']
        for _ in range(mc_samples):
            # Sample a parameter vector p = mu + noise / sqrt(Precision):
            raw_noise = torch.normal(mean=torch.zeros_like(mu), std=1.0)
            # Fixed: the deprecated positional-value form
            # torch.addcdiv(mu, 1., raw_noise, sqrt(Precision)) was removed;
            # use the keyword form instead.
            p = torch.addcdiv(mu, raw_noise, torch.sqrt(Precision), value=1.)
            vector_to_parameters(p, parameters)

            # Call the forward computation function
            outputs = forward_function(inputs, *args, **kwargs)
            if ret_numpy:
                outputs = outputs.data.cpu().numpy()
            predictions.append(outputs)

        return predictions
Esempio n. 8
0
    def evaluate_step(self, inputs, labels, device="cpu", M=0):
        """Evaluate the model, averaging outputs over M sampled binary weight
        draws (or over the deterministic sign of mu when M == 0).

        Returns:
            (loss, accuracy_percent) for the averaged predictions.
        """
        # Draw M Bernoulli samples from sigmoid(2 * lambda).
        draws = [
            torch.bernoulli(torch.sigmoid(2 * self.optim.state["lambda"]))
            for _ in range(M)
        ]

        weight_slots = self.optim.param_groups[0]["params"]
        if not draws:
            # No sampling requested: use the sign of mu as a single draw.
            mu = self.optim.state["mu"]
            draws = [
                torch.where(mu <= 0, torch.zeros_like(mu), torch.ones_like(mu))
            ]

        inputs_dev = inputs.to(device)
        sample_outputs = []
        for draw in draws:
            # Map {0, 1} draws to signed weights {-1, +1} and load them.
            vector_to_parameters(2 * draw - 1, weight_slots)
            sample_outputs.append(self.model(inputs_dev))

        stacked = torch.stack(sample_outputs, dim=2)
        probs = torch.mean(stacked, dim=2)
        labels_dev = labels.to(device)
        loss = self.criterion(probs, labels_dev)
        _, pred = torch.max(probs, 1)
        correct = (
            pred.eq(labels_dev.view_as(pred)).sum().item() / labels.shape[0]
        ) * 100
        return loss, correct
Esempio n. 9
0
 def from_vec(self, x):
     r"""Load a single flattened vector `x` into this network's parameters.

     Args:
         x (Tensor): flat vector whose length equals the total number of
             network parameters.
     """
     vector_to_parameters(x, self.parameters())
Esempio n. 10
0
    def func(params):
        """Matrix-completion loss with factor `mat` ('A' or 'B') replaced by `params`.

        Writes params[0] into model.A or model.B (handling both a single
        Parameter and a sequence of tensors), evaluates the completion loss,
        restores the model's original weights, and returns
        (loss, predictions, [replaced factor]).
        """
        old_params = parameters_to_vector(model.parameters())

        # print ("old: ", len(old_params), len(list(model.parameters())))
        # print (type(params), type(params[0]))
        if isinstance(params[0], torch.nn.Parameter):
            # Single Parameter: reshape to a column vector before unflattening.
            # print ("1")
            if mat == 'A':
                vector_to_parameters(params[0].view(-1, 1), model.A) #parameters())
            elif mat == 'B':
                vector_to_parameters(params[0].view(-1, 1), model.B) #parameters())
        else:
            # Sequence of tensors: flatten first.
            # print ("2")
            if mat == 'A':
                vector_to_parameters(parameters_to_vector(params[0]), model.A) #parameters())
            elif mat == 'B':
                vector_to_parameters(parameters_to_vector(params[0]), model.B) #parameters())

        z = model(ids)
        f = mat_completion_loss(W, M, z, model.A, model.B, ids)
        if mat == 'A':
            tmp_params = [model.A]
        elif mat == 'B':
            tmp_params = [model.B]

        # Restore the model's original weights before returning.
        vector_to_parameters(old_params, model.parameters())
        return f, z, tmp_params
Esempio n. 11
0
    def SNN_error(self, loader, delta_prime, n_mtcarlo_approx, sample_freq):
        """
        Compute upper bound on the error of the Stochastic neural network by
        application of the sample convergence bound theorem.

        Accumulates test errors over n_mtcarlo_approx sampled weight vectors;
        every `sample_freq` iterations an intermediate solve_kl_sup bound is
        logged and recorded. Returns the list of bounds, final bound last.
        """
        samples_errors = 0.
        snn_error = []

        with torch.no_grad():
            t = time.time()
            iter_counter = sample_freq#00

            for i in range(n_mtcarlo_approx):
                # Draw fresh weights into the model for this Monte Carlo sample.
                vector_to_parameters(self.sample_weights().detach(), self.model.parameters())
                samples_errors += test_error(self.model, loader, self.accuracy_loss, self.device)
                if i == iter_counter:
                    # NOTE(review): divides by i although i+1 errors have been
                    # accumulated (0-based loop) — confirm the off-by-one is
                    # intended. i is nonzero here since iter_counter >= sample_freq.
                    snn_error_intermed = solve_kl_sup(samples_errors/i, (log(2/delta_prime)/i))
                    plog("Iter {}; SNN error {:.4g}; Took {:.4g}s".format(i, snn_error_intermed, time.time()-t))
                    snn_error.append(snn_error_intermed)
                    # print("Computational time for {} is {}".format(i, time.time() - t))
                    iter_counter += sample_freq#00
                    t = time.time()

        snn_final_error = solve_kl_sup(samples_errors/n_mtcarlo_approx, (log(2/delta_prime)/n_mtcarlo_approx))
        snn_error.append(snn_final_error)

        return snn_error
Esempio n. 12
0
    def get_mc_predictions(self, forward_function, inputs, ret_numpy=False, raw_noises=None, *args, **kwargs):
        """Returns Monte Carlo predictions.
        Arguments:
            forward_function (callable): The forward function of the model
                that takes inputs and returns the outputs.
            inputs (FloatTensor): The inputs to the model.
            ret_numpy (bool): If true, the returned list contains numpy arrays,
                otherwise it contains torch tensors.
            raw_noises (list of Tensor, optional): binary {0, 1} samples; each
                is mapped to signed weights 2 * noise - 1 before the forward
                pass. When None, a single deterministic prediction is made
                from the sign of self.state['mu'].
        """

        # We only support a single parameter group.
        parameters = self.param_groups[0]['params']
        predictions = []

        if raw_noises is None: # use the mean value (sign) to make predictions
            raw_noises = []
            mean_vector = torch.where(self.state['mu']<=0,torch.zeros_like(self.state['mu']),torch.ones_like(self.state['mu']))
            raw_noises.append(mean_vector)  # perform inference using the sign of the mean value when there is no sampling

        for raw_noise in raw_noises:
            # Map the {0, 1} sample to signed weights {-1, +1} and load them.
            vector_to_parameters(2*raw_noise-1, parameters)
            # Call the forward computation function
            outputs = forward_function(inputs, *args, **kwargs)
            if ret_numpy:
                outputs = outputs.data.cpu().numpy()
            predictions.append(outputs)

        return predictions
def act_nn(obs, weights, actions):
    """Pick the greedy action for `obs` under a network built from flat `weights`."""
    net = ModelNes(obs.size, len(actions))
    vector_to_parameters(torch.from_numpy(weights).float(), net.parameters())
    with torch.no_grad():
        q_values = net(torch.from_numpy(obs).float())
    best = np.argmax(q_values.data.numpy())
    return actions[best]
Esempio n. 14
0
    def get_dual_predictions(self, jac_closure, mc_samples=10, ret_jac=False):
        """Linearized (dual) predictive mean/std via Monte Carlo sampling.

        Args:
            jac_closure: callable returning (predictions, Jacobian) for the
                model at its current parameters; each Jacobian is n x p.
            mc_samples: number of Gaussian parameter samples.
            ret_jac: if True, return the raw numpy arrays
                (fx_hat, J_hat @ mu, Jv_hat, std_pred) instead.

        Returns:
            (mu_pred, std_pred) as numpy arrays, or the 4-tuple above.
        """
        mu = self.state['mu']
        precision = self.state['precision']
        parameters = self.param_groups[0]['params']
        J_list = []
        fxs = []
        Jv_list = []
        for _ in range(mc_samples):
            # Sample a parameter vector p = mu + noise / sqrt(precision):
            raw_noise = torch.normal(mean=torch.zeros_like(mu), std=1.0)
            # Fixed: the deprecated positional-value form
            # torch.addcdiv(mu, 1., t1, t2) was removed; use the keyword form.
            p = torch.addcdiv(mu, raw_noise, torch.sqrt(precision), value=1.)
            vector_to_parameters(p, parameters)

            # Get loss and predictions
            preds, J = jac_closure()
            fxs.append(preds)
            J_list.append(J)  # each J in n x p
            Jv_list.append(J @ p)
        # Restore the mean parameters after sampling.
        vector_to_parameters(mu, parameters)
        fx_hat = torch.mean(torch.stack(fxs), 0).flatten()
        J_hat = torch.mean(torch.stack(J_list), 0)
        Jv_hat = torch.mean(torch.stack(Jv_list), 0)
        # Linearization around mu: f(mu) ~ fx_hat + J_hat @ mu - Jv_hat.
        mu_pred = fx_hat + J_hat @ mu - Jv_hat
        std_pred = torch.sqrt(
            torch.diag(J_hat @ torch.diag(1. / precision) @ J_hat.t()))
        if ret_jac:
            return (fx_hat.detach().numpy(), (J_hat @ mu).detach().numpy(),
                    Jv_hat.detach().numpy(), std_pred.detach().numpy())
        return mu_pred.detach().numpy(), std_pred.detach().numpy()
    def SNN_error(self, loader, delta_prime, n_mtcarlo_approx):
        """
        Compute upper bound on the error of the Stochastic neural network by
        application of the sample convergence bound theorem.

        Accumulates test errors over n_mtcarlo_approx sampled weight vectors,
        printing and recording an intermediate solve_kl_sup bound every 10
        iterations; returns the list of bounds, final bound last.
        """
        samples_errors = 0.
        snn_error = []

        with torch.no_grad():
            t = time.time()
            iter_counter = 10  #00

            for i in range(n_mtcarlo_approx):
                # Draw fresh weights into the model for this Monte Carlo sample.
                vector_to_parameters(self.sample_weights().detach(),
                                     self.model.parameters())
                samples_errors += test_error(loader, self.model, self.device)
                if i == iter_counter:
                    print("It's {}th Monte-Carlo iteration".format(i))
                    # NOTE(review): divides by i although i+1 errors have been
                    # accumulated (0-based loop) — confirm the off-by-one is
                    # intended. i is nonzero here since iter_counter starts at 10.
                    snn_error_intermed = solve_kl_sup(
                        samples_errors / i, (log(2 / delta_prime) / i))
                    print("SNN-error is {}".format(snn_error_intermed))
                    snn_error.append(snn_error_intermed)
                    print("Computational time for {} is {}".format(
                        i,
                        time.time() - t))
                    iter_counter += 10  #00

        snn_final_error = solve_kl_sup(
            samples_errors / n_mtcarlo_approx,
            (log(2 / delta_prime) / n_mtcarlo_approx))
        snn_error.append(snn_final_error)

        return snn_error
Esempio n. 16
0
def trpo_update(replay, policy, baseline):
    """Run several TRPO policy-improvement steps on the replayed experience.

    Fits the baseline, computes GAE advantages, solves the trust-region
    subproblem with conjugate gradient, and applies a backtracking line
    search. `policy` is updated in place; nothing is returned.
    """
    gamma = 0.99
    tau = 0.95
    max_kl = 0.01
    ls_max_steps = 15
    backtrack_factor = 0.5
    # Frozen copy of the pre-update policy for KL / importance ratios.
    old_policy = deepcopy(policy)
    for step in range(10):
        states = replay.state()
        actions = replay.action()
        rewards = replay.reward()
        dones = replay.done()
        next_states = replay.next_state()
        returns = ch.td.discount(gamma, rewards, dones)
        baseline.fit(states, returns)
        values = baseline(states)
        next_values = baseline(next_states)

        # Compute KL between the old and the current policy
        with th.no_grad():
            old_density = old_policy.density(states)
        new_density = policy.density(states)
        kl = kl_divergence(old_density, new_density).mean()

        # Compute surrogate loss
        old_log_probs = old_density.log_prob(actions).mean(dim=1, keepdim=True)
        new_log_probs = new_density.log_prob(actions).mean(dim=1, keepdim=True)
        bootstraps = values * (1.0 - dones) + next_values * dones
        advantages = ch.pg.generalized_advantage(gamma, tau, rewards, dones,
                                                 bootstraps, th.zeros(1))
        advantages = ch.normalize(advantages).detach()
        surr_loss = trpo.policy_loss(new_log_probs, old_log_probs, advantages)

        # Compute the natural-gradient update via conjugate gradient
        grad = autograd.grad(surr_loss, policy.parameters(), retain_graph=True)
        Fvp = trpo.hessian_vector_product(kl, policy.parameters())
        grad = parameters_to_vector(grad).detach()
        step = trpo.conjugate_gradient(Fvp, grad)
        # Scale the step so the quadratic KL approximation satisfies max_kl.
        lagrange_mult = 0.5 * th.dot(step, Fvp(step)) / max_kl
        step = step / lagrange_mult
        step_ = [th.zeros_like(p.data) for p in policy.parameters()]
        vector_to_parameters(step, step_)
        step = step_

        #  Line-search: back off until the loss improves and KL stays feasible
        for ls_step in range(ls_max_steps):
            stepsize = backtrack_factor**ls_step
            clone = deepcopy(policy)
            for c, u in zip(clone.parameters(), step):
                # Fixed: the deprecated add_(alpha, tensor) positional form
                # was removed; use add_(tensor, alpha=...) instead.
                c.data.add_(u.data, alpha=-stepsize)
            new_density = clone.density(states)
            new_kl = kl_divergence(old_density, new_density).mean()
            new_log_probs = new_density.log_prob(actions).mean(dim=1,
                                                               keepdim=True)
            new_loss = trpo.policy_loss(new_log_probs, old_log_probs,
                                        advantages)
            if new_loss < surr_loss and new_kl < max_kl:
                # Accept: copy the clone's parameters back into the policy.
                for p, c in zip(policy.parameters(), clone.parameters()):
                    p.data[:] = c.data[:]
                break
Esempio n. 17
0
    def update():
        """One soft actor-critic (SAC) training step from a replay batch.

        Samples a batch, builds the actor / twin-critic / value losses, steps
        the actor and critic optimizers, Polyak-averages the target value
        network, and logs diagnostics. All required objects (net,
        replay_buffer, optimizers, gamma, alpha, polyak, logger, ...) come
        from the enclosing scope.
        """
        net.train()
        net.vf_targ.eval()

        # datas
        batch = replay_buffer.sample_batch(batch_size)
        x_ph = torch.from_numpy(batch['obs1'])
        x2_ph = torch.from_numpy(batch['obs2'])
        a_ph = torch.from_numpy(batch['acts'])
        r_ph = torch.from_numpy(batch['rews'][:, np.newaxis])
        d_ph = torch.from_numpy(batch['done'][:, np.newaxis])

        # computation graph
        mu, pi, logp_pi = net.apply_policy(x_ph)
        q1, q2 = net.apply_qf(x_ph, a_ph)
        q1_pi, q2_pi = net.apply_qf(x_ph, pi)
        v = net.apply_vf(x_ph)

        with torch.no_grad():
            v_targ = net.apply_vf_targ(x2_ph)

        # Min Double-Q:
        min_q_pi = torch.min(q1_pi, q2_pi)

        # Targets for Q and V regression
        q_backup = r_ph + gamma * (1 - d_ph) * v_targ.detach()
        v_backup = (min_q_pi - alpha * logp_pi).detach()

        # Soft actor-critic losses
        pi_loss = torch.mean(alpha * logp_pi - q1_pi)
        q1_loss = 0.5 * criterion_mse(q1, q_backup)
        q2_loss = 0.5 * criterion_mse(q2, q_backup)
        v_loss = 0.5 * criterion_mse(v, v_backup)
        value_loss = q1_loss + q2_loss + v_loss

        # Policy train
        optimizer_actor.zero_grad()
        pi_loss.backward()
        optimizer_actor.step()

        # Value train
        optimizer_critic.zero_grad()
        value_loss.backward()
        optimizer_critic.step()

        # update target network (Polyak averaging on the flat vectors)
        param = parameters_to_vector(net.vf.parameters())
        param_targ = parameters_to_vector(net.vf_targ.parameters())
        param_targ = polyak * param_targ + (1 - polyak) * param
        vector_to_parameters(param_targ, net.vf_targ.parameters())

        # NOTE(review): VVals is logged as value_loss.item() (the combined
        # critic loss), not the V estimates — confirm this is intentional.
        logger.store(LossPi=pi_loss.item(),
                     LossQ1=q1_loss.item(),
                     LossQ2=q2_loss.item(),
                     LossV=v_loss.item(),
                     Q1Vals=q1.detach().numpy(),
                     Q2Vals=q2.detach().numpy(),
                     VVals=value_loss.item(),
                     LogPi=logp_pi.detach().numpy())
Esempio n. 18
0
 def perturb_params(self, src_net, dst_net):
     """Copy src_net's weights into dst_net, then add Gaussian parameter noise.

     Noise (scaled by self.param_noise_scale) is only applied to modules for
     which self.param_noise_filter_func returns True.
     """
     flat = parameters_to_vector(src_net.parameters())
     vector_to_parameters(flat, dst_net.parameters())
     noisy_modules = (m for m in dst_net.modules()
                      if self.param_noise_filter_func(m))
     for module in noisy_modules:
         for weight in module.parameters():
             weight.data += torch.randn_like(weight.data) * self.param_noise_scale
Esempio n. 19
0
 def from_vec(self, x):
     """
     Unflatten the given vector into this network's parameters.

     Args:
         x (Tensor): flat vector whose size matches the total number of
             network parameters.
     """
     vector_to_parameters(x, self.parameters())
 def forward(self, images, labels):
     """Cross-entropy loss of the model under a sampled weight perturbation.

     Samples elementwise noise ~ N(0, exp(sigma_posterior_)^2), loads
     flat_params + noise into the wrapped model, and returns the
     cross-entropy of the model's outputs against `labels`.
     """
     # Reparameterized posterior sample around the flat mean parameters;
     # kept on self so the caller can inspect the draw.
     self.noise = torch.randn(self.d_size).to(self.device) * torch.exp(
         self.sigma_posterior_)
     vector_to_parameters(self.flat_params + self.noise,
                          self.model.parameters())
     outputs = self.model(images)
     # loss = self.criterion(outputs.float(), labels.long())
     loss = F.cross_entropy(outputs.float(), labels.long())
     return loss
Esempio n. 21
0
 def Fvp_fn(theta):
     """Fisher/Hessian-vector-style evaluation at parameter vector `theta`.

     Deep-copies the model so the original weights stay untouched, loads
     `theta` into the copy, and delegates to eval_F with the enclosing
     scope's inputs/outputs/kl_fn/regu_coef.
     """
     # import time
     # s = time.time()
     # theta should be a parameter vector.
     temp_model = copy.deepcopy(model)
     vector_to_parameters(theta, temp_model.parameters())
     full_inp = [temp_model, inputs, outputs, kl_fn, regu_coef]
     H = eval_F(*full_inp)
     return H
Esempio n. 22
0
 def set_and_eval(step):
     """Apply a scaled trust-region step to the actor and evaluate it.

     Uses variables from the enclosing scope (old_params, alpha, x, x_ph,
     a_ph, inputs, logp_old_ph, adv_ph). Returns the MPI-averaged KL
     divergence and surrogate policy loss at the proposed parameters.
     """
     # Overwrite the actor parameters with old_params - alpha * x * step.
     vector_to_parameters(old_params - alpha * x * step,
                          net.actor.parameters())
     _, logp, _, _, d_kl = net.apply_actor(x_ph,
                                           a_ph,
                                           old_logp_or_mu=inputs[-1])
     ratio = torch.exp(logp - logp_old_ph)  # pi(a|s) / pi_old(a|s)
     pi_loss = -torch.mean(ratio * adv_ph)
     return mpi_avg(d_kl.item()), mpi_avg(pi_loss.item())
Esempio n. 23
0
def conjugate_gradient(Ax, b, num_iterations=10, tol=1e-10, eps=1e-8):
    """
    [[Source]](https://github.com/seba-1511/cherry/blob/master/cherry/algorithms/trpo.py)

    **Description**

    Computes \\(x = A^{-1}b\\) using the conjugate gradient algorithm.

    **Credit**

    Adapted from Kai Arulkumaran's implementation, with additions inspired from John Schulman's implementation.

    **References**

    1. Nocedal and Wright. 2006. "Numerical Optimization, 2nd edition". Springer.
    2. Shewchuk et al. 1994. “An Introduction to the Conjugate Gradient Method without the Agonizing Pain.” CMU.

    **Arguments**

    * **Ax** (callable) - Given a vector x, computes A@x.
    * **b** (tensor or list) - The reference vector.
    * **num_iterations** (int, *optional*, default=10) - Number of conjugate gradient iterations.
    * **tol** (float, *optional*, default=1e-10) - Tolerance for proposed solution.
    * **eps** (float, *optional*, default=1e-8) - Numerical stability constant.

    **Returns**

    * **x** (tensor or list) - The solution to Ax = b, as a list if b is a list else a tensor.

    **Example**

    ~~~python
    pass
    ~~~
    """
    shape = None
    if not isinstance(b, th.Tensor):
        # List input: remember the shapes so the result can be unflattened.
        shape = [th.zeros_like(b_i) for b_i in b]
        b = parameters_to_vector(b)
    x = th.zeros_like(b)
    # Fixed: work on a copy — `r = b` aliased the caller's tensor, so the
    # in-place `r -= alpha * Ap` below silently clobbered the input vector.
    r = b.clone()
    p = r.clone()
    r_dot_old = th.dot(r, r)
    for _ in range(num_iterations):
        Ap = Ax(p)
        alpha = r_dot_old / (th.dot(p, Ap) + eps)
        x += alpha * p
        r -= alpha * Ap
        r_dot_new = th.dot(r, r)
        # Polak-style update of the search direction.
        p = r + (r_dot_new / r_dot_old) * p
        r_dot_old = r_dot_new
        if r_dot_new.item() < tol:
            break
    if shape is not None:
        # Return the solution in the same list-of-tensors layout as the input.
        vector_to_parameters(x, shape)
        x = shape
    return x
Esempio n. 24
0
def fitness(batch, model, params, val=False):
    """Load flat `params` into `model` and return the mean greedy tour length
    on `batch` (no gradients)."""
    vector_to_parameters(params, model.parameters())
    model.set_decode_type('greedy')
    model.eval()
    with torch.no_grad():
        cost, _ = model(batch)
    return cost.mean()
Esempio n. 25
0
File: learner.py Project: sisl/CEEM
    def eval_f(vparams):
        """Evaluate the criterion at a (numpy) parameter vector.

        Loads `vparams` into `params`, computes the criterion without
        gradients, restores the original vector `vparams0` from the enclosing
        scope, and returns the loss as a numpy array (scipy-friendly).
        """
        vparams = torch.tensor(vparams).to(torch.get_default_dtype())
        vector_to_parameters(vparams, params)

        with torch.no_grad():
            loss = criterion(model, criterion_x, **crit_kwargs)

        # Restore the parameters captured before the optimization started.
        vector_to_parameters(vparams0, params)

        return loss.detach().numpy()
Esempio n. 26
0
def save(model, params, history, savedir, start, epoch, check=True):
    """Load flat `params` into `model` and snapshot it to disk.

    When `check` is True, saves an epoch-tagged checkpoint; otherwise saves an
    hour-tagged checkpoint together with the pickled fitness history.
    """
    vector_to_parameters(params, model.parameters())

    if check:
        torch.save(model,'{}/epoch{}-evo-model.pt'.format(savedir,epoch))
    else:
        # Hours elapsed since `start`, rounded to the nearest hour.
        hr_time = int(round((time()-start)/3600))
        torch.save(model,'{}/{}hr-evo-model.pt'.format(savedir,hr_time))
        with open(f'{savedir}/fitness_history_{hr_time}.pickle', 'wb') as f:
            pickle.dump(history, f, protocol=pickle.HIGHEST_PROTOCOL)
Esempio n. 27
0
 def surrogate_loss(self, theta):
     """
     Returns the surrogate loss w.r.t. the given parameter vector theta (-> float)
     """
     # Snapshot the current policy weights so they can be restored afterwards.
     old_theta = parameters_to_vector(self.policy_net.parameters())
     prob_old = self.policy_net(self.observations_tensor).gather(1, self.actions).data
     vector_to_parameters(theta, self.policy_net.parameters())
     prob_new = self.policy_net(self.observations_tensor).gather(1, self.actions).data
     vector_to_parameters(old_theta, self.policy_net.parameters())
     # Importance-weighted advantage surrogate, negated for minimization.
     return -torch.mean((prob_new / (prob_old + eps)) * self.advantages)
Esempio n. 28
0
 def f_barrier(params,
               all_obs=all_obs,
               all_acts=all_acts,
               all_advs=all_advs):
     """Surrogate loss at `params` with a hard KL barrier.

     Loads `params` into the policy and returns the surrogate loss as a float
     when the mean KL to `old_dists` stays below `delta`, else +inf.
     """
     vector_to_parameters(params, policy.parameters())
     new_dists = policy(all_obs)
     new_logp = new_dists.log_prob(all_acts)
     # Negated importance-weighted advantage (surrogate objective).
     surr_loss = -((new_logp - old_logp).exp() * all_advs).mean()
     avg_kl = kl(old_dists, new_dists).mean().item()
     return surr_loss.item() if avg_kl < delta else float("inf")
Esempio n. 29
0
    def eval_f(vparams):
        """Objective value at a (numpy) parameter vector.

        Temporarily loads `vparams` into `params`, evaluates `objfun` without
        gradients, restores the previous parameters, and returns the
        objective as a numpy array.
        """
        vparams = torch.tensor(vparams)

        # Snapshot the current parameters so they can be restored below.
        vparams_ = parameters_to_vector(params)
        vector_to_parameters(vparams, params)

        with torch.no_grad():
            obj = objfun()

        vector_to_parameters(vparams_, params)

        return obj.detach().numpy()
Esempio n. 30
0
    def update(policy_update):
        """One TD3 training step from a replay batch.

        Always updates the twin critics; updates the actor and Polyak-averages
        the target network only when `policy_update` is truthy (delayed policy
        updates). All required objects (net, net_targ, replay_buffer,
        optimizers, gamma, polyak, noise constants, logger) come from the
        enclosing scope.
        """
        net.train()
        net_targ.eval()

        # datas
        batch = replay_buffer.sample_batch(batch_size)
        x_ph = torch.from_numpy(batch['obs1'])
        x2_ph = torch.from_numpy(batch['obs2'])
        a_ph = torch.from_numpy(batch['acts'])
        r_ph = torch.from_numpy(batch['rews'][:, np.newaxis])
        d_ph = torch.from_numpy(batch['done'][:, np.newaxis])

        # Q-learning update
        q1, q2 = net.apply_critic(x_ph, a_ph)

        # compute q target with clipped target-policy smoothing noise
        with torch.no_grad():
            pi_targ = net_targ.act_limit * net_targ.actor(x2_ph)
        epsilon = torch.randn_like(pi_targ) * target_noise
        epsilon = torch.clamp(epsilon, -noise_clip, noise_clip)
        a2 = torch.clamp(pi_targ + epsilon, -net.act_limit, net.act_limit)
        with torch.no_grad():
            q1_targ, q2_targ = net_targ.apply_critic(x2_ph, a2)

        # Clipped double-Q target.
        min_q_targ = torch.min(q1_targ, q2_targ)
        backup = r_ph + gamma * (1 - d_ph) * min_q_targ.detach()
        q_loss = criterion_mse(q1, backup) + criterion_mse(q2, backup)

        # update
        optimizer_critic.zero_grad()
        q_loss.backward()
        optimizer_critic.step()

        logger.store(LossQ=q_loss.item(),
                     Q1Vals=q1.detach().numpy(),
                     Q2Vals=q2.detach().numpy())

        if policy_update:
            # Policy update (deterministic policy gradient through Q).
            q_pi = net(x_ph)
            pi_loss = -torch.mean(q_pi)

            optimizer_actor.zero_grad()
            pi_loss.backward()
            optimizer_actor.step()

            logger.store(LossPi=pi_loss.item())

            # update target network (Polyak averaging on the flat vectors)
            param = parameters_to_vector(net.parameters())
            param_targ = parameters_to_vector(net_targ.parameters())
            param_targ = polyak * param_targ + (1 - polyak) * param
            vector_to_parameters(param_targ, net_targ.parameters())