Code Example #1
 def closure():
     minimizer.zero_grad()
     x = transform_to(constraint)(unconstrained_x)
     y = self.acquisition_func(x)
     autograd.backward(unconstrained_x,
                       autograd.grad(y, unconstrained_x))
     return y
Code Example #2
 def closure():
     minimizer.zero_grad()
     x = transform_to(constraint)(unconstrained_x)
     x = from_01(x)
     y = lower_confidence_bound(x, model)
     autograd.backward(x, autograd.grad(y, x))
     return y
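The closure idiom in Code Examples #1 and #2 (and in the similar closures further down) optimizes an acquisition function with a gradient-based minimizer: autograd.grad(y, unconstrained_x) computes dy/dx, and autograd.backward(unconstrained_x, ...) accumulates that gradient into unconstrained_x.grad so that minimizer.step(closure) can use it. Below is a minimal, self-contained sketch of the same idiom; the quadratic objective is a hypothetical stand-in for the acquisition functions used above.

import torch
from torch import autograd, optim

unconstrained_x = torch.randn(1, requires_grad=True)
minimizer = optim.LBFGS([unconstrained_x])

def closure():
    minimizer.zero_grad()
    y = (unconstrained_x - 2.0).pow(2).sum()  # stand-in for the acquisition function
    # Compute dy/dx explicitly and accumulate it into unconstrained_x.grad;
    # for this simple case the call is equivalent to y.backward().
    autograd.backward(unconstrained_x, autograd.grad(y, unconstrained_x))
    return y

for _ in range(10):
    minimizer.step(closure)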
Code Example #3
    def finish_episode_ac():
        '''Actor-Critic'''
        R = 0
        rewards = []
        saved_actions = net.saved_actions
        value_loss = 0
        for r in net.rewards[::-1]:
            R = r + args.gamma * R
            rewards.insert(0, R)
        rewards = torch.Tensor(rewards).cuda()

        for (action, value), r in zip(saved_actions, rewards):
            reward = r - value.data[0, 0]
            action.reinforce(reward)
            value_loss += F.smooth_l1_loss(value,
                                           Variable(torch.Tensor([r]).cuda()))

        optimizer.zero_grad()
        final_nodes = [value_loss] + list(map(lambda p: p[0], saved_actions))
        gradients = [torch.ones(1).cuda()] + [None] * len(saved_actions)

        autograd.backward(final_nodes, gradients)
        min_grad = np.Inf
        max_grad = -np.Inf
        torch.nn.utils.clip_grad_norm(net.parameters(), 10)
        for param in net.parameters():
            if torch.min(param.grad.data) < min_grad:
                min_grad = torch.min(param.grad.data)
            if torch.max(param.grad.data) > max_grad:
                max_grad = torch.max(param.grad.data)

        optimizer.step()
        del net.rewards[:]
        del net.saved_actions[:]
        return min_grad, max_grad
Code Example #4
def update_controller(actionSeqs, avgR):
    print('Reinforcing for epoch %d' % e)
    for actions in actionSeqs:
        actions.reinforce(avgR - b)
        opti.zero_grad()
        autograd.backward(actions, [None for _ in actions])
        opti.step()
Code Example #5
 def finish_episode_re():
     '''REINFORCE'''
     R = 0
     rewards = []
     for r in net.rewards[::-1]:
         R = r + args.gamma * R
         rewards.insert(0, R)
     rewards = torch.Tensor(rewards).cuda()
     for action, r in zip(net.saved_actions, rewards):
         action.reinforce(r)
     optimizer.zero_grad()
     autograd.backward(net.saved_actions, [None for _ in net.saved_actions])
     min_grad = np.Inf
     max_grad = -np.Inf
     torch.nn.utils.clip_grad_norm(net.parameters(), 1)
     for param in net.parameters():
         if torch.min(param.grad.data) < min_grad:
             min_grad = torch.min(param.grad.data)
         if torch.max(param.grad.data) > max_grad:
             max_grad = torch.max(param.grad.data)
     optimizer.step()
     net.zero_grad()
     optimizer.zero_grad()
     del net.rewards[:]
     del net.saved_actions[:]
     return min_grad, max_grad
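The action.reinforce(r) calls in Examples #3-#5 (and in several examples below) rely on the stochastic-Variable API of early PyTorch releases, which has since been removed. A rough modern equivalent is sketched below under assumed names: a hypothetical finish_episode_modern that receives saved_log_probs collected at sampling time and builds an explicit policy-gradient loss for autograd.backward.

import torch
from torch import autograd

def finish_episode_modern(optimizer, rewards, saved_log_probs, gamma=0.99):
    # Discounted returns, accumulated backwards as in the examples above.
    R, returns = 0.0, []
    for r in rewards[::-1]:
        R = r + gamma * R
        returns.insert(0, R)
    returns = torch.tensor(returns)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    # REINFORCE loss: -log pi(a_t | s_t) * return_t, summed over the episode.
    loss = torch.stack([-lp * R_t for lp, R_t in zip(saved_log_probs, returns)]).sum()
    optimizer.zero_grad()
    autograd.backward(loss)  # same effect as loss.backward()
    optimizer.step()

Here saved_log_probs would hold dist.log_prob(action) values saved when sampling actions from, for example, a torch.distributions.Categorical over the policy output.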
Code Example #6
    def test_PruneModel_ShouldBePrunedInRightPlace(self):
        output = self.model(self.inputs)
        backward(output, self.grad)

        # rig gradients, set all to 1, except first module's first map
        pConv2ds = (module for module in self.model.modules()
                    if issubclass(type(module), pnn.PConv2d))
        for idx, pConv2d in enumerate(pConv2ds):
            pConv2d.taylor_estimates = torch.ones(
                pConv2d.taylor_estimates.size())
            if idx == 0:
                pConv2d.taylor_estimates[0] = 0.1

        expected_conv2d_out_channels = self.model.features[0].out_channels - 1
        expected_batchnorm_num_features = self.model.features[
            1].num_features - 1
        next_conv2d_in_channels = self.model.features[4].in_channels - 1
        self.model.prune()

        # being a little lazy here, since prunable_nn_test covered weight checking
        # check first conv2d's input
        # check first batchnorm's input
        # check 2nd conv2d's input
        self.assertEqual(self.model.features[0].out_channels,
                         expected_conv2d_out_channels)
        self.assertEqual(self.model.features[1].num_features,
                         expected_batchnorm_num_features)
        self.assertEqual(self.model.features[4].in_channels,
                         next_conv2d_in_channels)

        # run again, ensure no bugs with modules
        self.model(self.inputs)
Code Example #7
 def learn_mine(self,batch, ma_rate=0.01):
     # batch is a tuple of (joint1, joint2, marginal (from the dataset of joint 2))
     joint1 = torch.autograd.Variable(batch[0])
     joint2 = torch.autograd.Variable(batch[2])
     marginal = torch.autograd.Variable(batch[4]) #the uneven parts of the dataset are the labels 
     if torch.cuda.is_available():
         joint1 = joint1.to('cuda', non_blocking=True)
         joint2 = joint2.to('cuda', non_blocking=True)
         marginal = marginal.to('cuda', non_blocking=True)
         self.net = self.net.cuda()
     #joint = torch.autograd.Variable(torch.FloatTensor(joint))
     #marginal = torch.autograd.Variable(torch.FloatTensor(marginal))
     
     NIM , T, eT = self.mutual_information(joint1, joint2, marginal)
     
     # Using exponential moving average to correct bias
     ma_eT = (1-ma_rate)*eT + (ma_rate)*torch.mean(eT) 
     # unbiasing 
     loss = -(torch.mean(T) - (1/ma_eT.mean()).detach()*torch.mean(eT))
     # use biased estimator
     # loss = - mi_lb
     
     self.mine_net_optim.zero_grad()
     autograd.backward(loss)
     self.mine_net_optim.step()
     #self.scheduler.step()
     #self.scheduler2.step(NIM)
     if torch.cuda.is_available():
         NIM = NIM.cpu()
         loss = loss.cpu()
     return NIM, loss
Code Example #8
File: exp9.py Project: zhaochenqiu/courses
def learn_mine(batch, mine_net, mine_net_optim, ma_et, ma_rate=0.01):
    # batch is a tuple of (joint, marginal)
    joint, marginal = batch

    #     print("joint:",    joint.shape)
    #     print("marginal:", marginal.shape)

    #    print("input joint:", joint)
    joint = torch.autograd.Variable(torch.FloatTensor(joint)).cuda()
    #    print("output joint:", joint)

    marginal = torch.autograd.Variable(torch.FloatTensor(marginal)).cuda()

    # mi_lb is the curve that gets plotted
    mi_lb, t, et = mutual_information(joint, marginal, mine_net)

    # et is the network output for the marginal samples
    # ma_et is the moving-average variable carried across iterations
    ma_et = (1 - ma_rate) * ma_et + ma_rate * torch.mean(et)
    #    print("ma_et:", ma_et)

    # unbiasing use moving average
    loss = -(torch.mean(t) - (1 / ma_et.mean()).detach() * torch.mean(et))
    #    print(loss)
    # use biased estimator
    #     loss = - mi_lb

    mine_net_optim.zero_grad()
    autograd.backward(loss)
    mine_net_optim.step()
    return mi_lb, ma_et
Code Example #9
    def update_mine_net(self, batch, mine_net_optim, ma_rate=0.01):
        """[summary]
        
        Arguments:
            batch {[type]} -- ([batch_size X 2], [batch_size X 2])
            mine_net_optim {[type]} -- [description]
            ma_rate {float} -- [moving average rate] (default: {0.01})
        
        Keyword Arguments:
            mi_lb {} -- []
        """

        # batch is a tuple of (joint, marginal)
        joint, marginal = batch
        joint = torch.autograd.Variable(torch.FloatTensor(joint))
        marginal = torch.autograd.Variable(torch.FloatTensor(marginal))
        mi_lb, t, et = self.mutual_information(joint, marginal)
        self.ma_et = (1 - ma_rate) * self.ma_et + ma_rate * torch.mean(et)

        # unbiasing use moving average
        loss = -(torch.mean(t) -
                 (1 / self.ma_et.mean()).detach() * torch.mean(et))
        # use biased estimator
        #     loss = - mi_lb
        lossTrain = loss
        mine_net_optim.zero_grad()
        autograd.backward(loss)
        mine_net_optim.step()
        return mi_lb, lossTrain
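The mutual_information helper used in Examples #7-#9 is not shown in these snippets. Under the usual MINE (Donsker-Varadhan) formulation implied by the surrounding loss, a plausible sketch, with an assumed statistics network mine_net, looks like this:

import torch

def mutual_information(joint, marginal, mine_net):
    # Statistics network evaluated on samples from the joint distribution
    # and on samples drawn from the product of the marginals.
    t = mine_net(joint)
    et = torch.exp(mine_net(marginal))
    # Donsker-Varadhan lower bound on the mutual information.
    mi_lb = torch.mean(t) - torch.log(torch.mean(et))
    return mi_lb, t, et

The moving average ma_et kept in the training loops above then stands in for torch.mean(et) in the gradient, which reduces the bias of this estimator.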
Code Example #10
    def test_PruneModel_PruneFirstFeatureMapOfLastModule(self):
        output = self.model(self.inputs)
        backward(output, self.grad)

        # rig gradients, set all to 1, except last module's first map
        pConv2ds = [
            module for module in self.model.modules()
            if issubclass(type(module), pnn.PConv2d)
        ]
        last_idx = len(pConv2ds) - 1
        for idx, pConv2d in enumerate(pConv2ds):
            pConv2d.taylor_estimates = torch.ones(
                pConv2d.taylor_estimates.size())
            if idx == last_idx:
                pConv2d.taylor_estimates[0] = 0.1

        old_linear_in_features = self.model.classifier[0].in_features
        self.model.prune()

        # only check linear's input size
        self.assertTrue(
            self.model.classifier[0].in_features < old_linear_in_features)

        # run again, ensure no bugs with modules
        self.model(self.inputs)
Code Example #11
File: GR10.py Project: HoracceFeng/Grid_World_demo
def finish_episode(e, actions, values, rewards):

    # Calculate discounted rewards, going backwards from end
    discounted_rewards = []
    R = 0
    for r in rewards[::-1]:
        R = r + gamma * R
        discounted_rewards.insert(0, R)
    discounted_rewards = torch.Tensor(discounted_rewards)

    # Use REINFORCE on chosen actions and associated discounted rewards
    value_loss = 0
    for action, value, reward in zip(actions, values, discounted_rewards):
        reward_diff = reward - value.data[0]  # Treat critic value as baseline
        action.reinforce(reward_diff)  # Try to perform better than baseline
        value_loss += mse(value, Variable(torch.Tensor(
            [reward])))  # Compare with actual reward

    # Backpropagate
    optimizer.zero_grad()
    nodes = [value_loss] + actions
    gradients = [torch.ones(1)] + [None for _ in actions
                                   ]  # No gradients for reinforced values
    autograd.backward(nodes, gradients)
    optimizer.step()

    # Save Model
    if e % 10000 == 0:
        ckpt = 'out_checkpoint/RG10_' + str(e) + '.pkl'
        torch.save(policy.state_dict(), ckpt)

    return discounted_rewards, value_loss
Code Example #12
def finish_episode(actions, values, rewards):
    global optimizer
    # Calculate discounted rewards, going backwards from end
    discounted_rewards = []
    R = 0
    for r in rewards[::-1]:
        R = r + gamma * R
        discounted_rewards.insert(0, R)
    discounted_rewards = torch.Tensor(discounted_rewards)

    # Use REINFORCE on chosen actions and associated discounted rewards
    value_loss = 0
    count = 0
    for action, value, reward in zip(actions, values, discounted_rewards):
        count += 1
        reward_diff = reward - value.data[0]  # Treat critic value as baseline
        action.reinforce(reward_diff)  # Try to perform better than baseline
        value_loss += mse(value, Variable(torch.Tensor(
            [reward])))  # Compare with actual reward
    # Backpropagate
    optimizer.zero_grad()
    nodes = [value_loss] + actions
    gradients = [torch.ones(1)] + [None for _ in actions
                                   ]  # No gradients for reinforced values
    autograd.backward(nodes, gradients)
    optimizer.step()

    return discounted_rewards, value_loss
Code Example #13
File: toy_agent.py Project: jwyang/RLCourseProject
    def finish_episode(self):
        """update policy based on the results in one episode"""
        R = 0
        rewards = []
        for r in self.reward_seq[::-1]:
            R = r + self.policynet.gamma * R
            rewards.insert(0, R)
        rewards = torch.Tensor(rewards)
        rewards = (rewards - rewards.mean()) / (rewards.std() +
                                                np.finfo(np.float32).eps)
        # One independent gradient tensor per step; list multiplication would
        # alias a single tensor across all steps.
        gradients = [torch.zeros(1, len(self.action_spec))
                     for _ in self.action_seq]
        for t in range(len(self.reward_seq)):
            for a in np.array([0, 3]):
                # if self.action_seq[t][0][a] > 0.5:
                #   gradients[t][0][a] = -rewards[t]
                # elif self.action_seq[t][0][a] < 0.5:
                #   gradients[t][0][a] = rewards[t]
                if self.action_seq[t][a] > 0:
                    gradients[t][0][a] = -rewards[t]
                elif self.action_seq[t][a] < 0:
                    gradients[t][0][a] = rewards[t]

        self.optimizer.zero_grad()
        autograd.backward(self.action_prob_seq, gradients)
        self.optimizer.step()
        del self.reward_seq[:]
        del self.action_prob_seq[:]
        del self.action_seq[:]
Code Example #14
def finish_episode(episodes):
    R = 0
    rewards = []
    #print(len(model.rewards))
    #print(len(model.saved_actions))
    # get the accumulated reward
    for r in model.rewards[::-1]:
        R = r + args.gamma * R
        rewards.insert(0, R)
    rewards = torch.Tensor(rewards)
    #rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    log_reward = open('ind_snapshots/rewad.txt', 'a')
    for action, r in zip(model.saved_actions, rewards):
        #log_reward.write(str(action)+' '+str(r))
        #print(action.data.cpu().numpy()[0,0])
        action.reinforce(r)
        log_reward.write(
            str(action.data.cpu().numpy()[0, 0]) + ' ' + str(r) + '\n')
    log_reward.close()
    optimizer.zero_grad()
    autograd.backward(model.saved_actions, [None for _ in model.saved_actions])
    optimizer.step()
    #if episodes % 4 == 0:
    #	optimizer.step()
    #	optimizer.zero_grad()
    del model.rewards[:]
    del model.saved_actions[:]
    del model.saved_probs[:]
Code Example #15
File: BO.py Project: jonahhdeykin/UML_2020
 def closure():
     minimizer.zero_grad()
     x = transform_to(constraint)(unconstrained_x)
     # x comes out as [[...]]; take the first row to get [...]
     x = x[0]
     y = self.lower_confidence_bound(x, gpmodel)
     autograd.backward(unconstrained_x, autograd.grad(y, unconstrained_x))
     return y
Code Example #16
 def closure():
     minimizer.zero_grad()
     x = transform_to(self.x_constraint)(unconstrained_x)
     x = x.reshape((1, self.dim))
     y = self.lower_confidence_bound(x)
     autograd.backward(unconstrained_x,
                       autograd.grad(y, unconstrained_x))
     return y
Code Example #17
    def closure():
        minimizer.zero_grad()
        x = transf_values(x_uncon, constr, x_dims)

        y = model_predict(x)[return_site].mean(0)

        autograd.backward(x_uncon, autograd.grad(y, x_uncon))
        return y
Code Example #18
 def closure():
     minimizer.zero_grad()
     x = transform_to(constraint)(unconstrained_x)
     y = lower_confidence_bound(model, likelihood, x)
     #y = lower_confidence_bound(unconstrained_x)
     #print(autograd.grad(y, unconstrained_x))
     #print(y)
     autograd.backward(unconstrained_x, autograd.grad(y, unconstrained_x))
     return y
Code Example #19
File: expfamilyprior.py Project: creatorscan/beer
 def natural_hparams(self, value):
     if value.grad is not None:
         value.grad.zero_()
     copied_value = torch.tensor(value.detach(), requires_grad=True)
     log_norm_value = self.log_norm(copied_value)
     ta.backward(log_norm_value)
     self._expected_sufficient_statistics = torch.tensor(copied_value.grad)
     self._natural_hparams = copied_value
     self._log_norm_value = torch.tensor(log_norm_value)
Code Example #20
def adversarial_imitation_update(algorithm,
                                 agent,
                                 discriminator,
                                 expert_trajectories,
                                 policy_trajectories,
                                 discriminator_optimiser,
                                 batch_size,
                                 r1_reg_coeff=1):
    expert_dataloader = DataLoader(expert_trajectories,
                                   batch_size=batch_size,
                                   shuffle=True,
                                   drop_last=True)
    policy_dataloader = DataLoader(policy_trajectories,
                                   batch_size=batch_size,
                                   shuffle=True,
                                   drop_last=True)

    # Iterate over the minimum of the expert and policy data
    for expert_transition, policy_transition in zip(expert_dataloader,
                                                    policy_dataloader):
        expert_state, expert_action, expert_next_state, expert_terminal = expert_transition[
            'states'], expert_transition['actions'], expert_transition[
                'next_states'], expert_transition['terminals']
        policy_state, policy_action, policy_next_state, policy_terminal = policy_transition[
            'states'], policy_transition['actions'], policy_transition[
                'next_states'], policy_transition['terminals']

        if algorithm == 'GAIL':
            D_expert = discriminator(expert_state, expert_action)
            D_policy = discriminator(policy_state, policy_action)
        elif algorithm == 'AIRL':
            with torch.no_grad():
                expert_data_policy = agent.log_prob(expert_state,
                                                    expert_action).exp()
                policy_data_policy = agent.log_prob(policy_state,
                                                    policy_action).exp()
            D_expert = discriminator(expert_state, expert_action,
                                     expert_next_state, expert_data_policy,
                                     expert_terminal)
            D_policy = discriminator(policy_state, policy_action,
                                     policy_next_state, policy_data_policy,
                                     policy_terminal)

        # Binary logistic regression
        discriminator_optimiser.zero_grad()
        expert_loss = F.binary_cross_entropy(
            D_expert,
            torch.ones_like(D_expert))  # Loss on "real" (expert) data
        autograd.backward(expert_loss, create_graph=True)
        r1_reg = 0
        for param in discriminator.parameters():
            r1_reg += param.grad.norm().mean()  # R1 gradient penalty
        policy_loss = F.binary_cross_entropy(
            D_policy,
            torch.zeros_like(D_policy))  # Loss on "fake" (policy) data
        (policy_loss + r1_reg_coeff * r1_reg).backward()
        discriminator_optimiser.step()
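In Example #20 the first backward pass is made with create_graph=True so that the discriminator gradients remain differentiable; that is what allows the R1-style penalty built from param.grad to be backpropagated afterwards. A minimal sketch of the mechanism with a hypothetical two-parameter function:

import torch
from torch import autograd

x = torch.randn(4, requires_grad=True)
w = torch.randn(4, requires_grad=True)
loss = torch.sigmoid(x @ w).sum()

# The first backward keeps the graph of the gradients (create_graph=True),
# so w.grad is itself a differentiable function of w and x.
autograd.backward(loss, create_graph=True)
penalty = w.grad.norm() ** 2
# The second backward differentiates through the first one and adds the
# penalty's gradient on top of the already accumulated .grad values.
penalty.backward()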
Code Example #21
def train_scvi(model,
               train_set,
               val_set,
               n_batches=32,
               n_epochs=300,
               lr=0.001,
               save_path="./models"):
    """
    Trains the model
    :param model: The model to train
    :param dataset: The raw dataset (to split in train and test sets and mini-batches)
    :return:
    """
    model.to(device)

    val_set = torch.tensor(val_set).to(device)

    adam = optim.Adam(model.parameters(), lr=lr)

    losses_train = []
    losses_val = []

    train_set_shuff = torch.tensor(train_set).to(device)
    log_library_size = torch.log(torch.sum(train_set_shuff, dim=1))
    prior_l_m, prior_l_v = torch.mean(log_library_size), torch.var(
        log_library_size)

    # training
    for epoch in range(n_epochs):
        train_set_shuff = train_set_shuff[torch.randperm(
            train_set_shuff.size()[0])]  # Shuffle data at each epoch
        model.train()
        for i in range(int(len(train_set) / n_batches) + 1):
            minibatch = train_set_shuff[i * n_batches:(i + 1) * n_batches, :]
            qz, mu_z, sigma_z, ql, mu_l, sigma_l, mu, h = model(
                minibatch)  # forward pass
            loss_train = model.loss(minibatch, qz, mu_z, sigma_z, ql, mu_l,
                                    sigma_l, mu, h, prior_l_m,
                                    prior_l_v)  # compute ELBO
            autograd.backward(loss_train, retain_graph=True)  # backward pass
            adam.step()  # parameter update
            adam.zero_grad()  # put the gradients back to zero for the next mini-batch

        model.eval()
        with torch.set_grad_enabled(False):
            for i in range(int(len(val_set) / n_batches)):
                minibatch = val_set[i * n_batches:(i + 1) * n_batches, :]
                qz, mu_z, sigma_z, ql, mu_l, sigma_l, mu, h = model(minibatch)
                loss_val = model.loss(minibatch, qz, mu_z, sigma_z, ql, mu_l,
                                      sigma_l, mu, h, prior_l_m, prior_l_v)

        losses_train.append(loss_train)
        losses_val.append(loss_val)

    return losses_train, losses_val
Code Example #22
 def closure():
     minimizer.zero_grad()
     x = transform_to(constraint)(unconstrained_x)
     y = log_expected_improvement(model, likelihood, x, previous_best,
                                  device)
     #y = lower_confidence_bound(unconstrained_x)
     #print(autograd.grad(y, unconstrained_x))
     #print(y)
     autograd.backward(unconstrained_x, autograd.grad(y, unconstrained_x))
     return y
Code Example #23
 def closure():
     minimizer.zero_grad()
     if (torch.log(torch.abs(unconstrained_x)) > 25.).any():
         return torch.tensor(float('inf'))
     x = transform_to(self.constraints)(unconstrained_x)
     y = differentiable(x)
     autograd.backward(
         unconstrained_x,
         autograd.grad(y, unconstrained_x, retain_graph=True))
     return y
Code Example #24
 def accumulate_gradients(self, grad_infos):
     bwd_out = list()
     bwd_in = list()
     for datas, grad_datas, etas, grad_etas in grad_infos:
         bwd_out += list(etas)
         bwd_in += list(grad_etas)
         for data, grad_data in zip(datas, grad_datas):
             data.grad.add_(grad_data)
     if len(bwd_out) > 0:
         autograd.backward(bwd_out, bwd_in)
Code Example #25
 def closure():
     #ipdb.set_trace()
     minimizer.zero_grad()
     x = transform_to(constraint)(unconstrained_x)
     y = q_expected_improvement(x,
                                gpmodel,
                                sampling_type=sampling_type,
                                sample_size=sample_size)
     autograd.backward(unconstrained_x, autograd.grad(y, unconstrained_x))
     return y
Code Example #26
File: rl.py Project: tsekitsi/N2N
 def update_controller(self, avgR, b):
     for actions in self.actionSeqs:
         if isinstance(actions, list):
             for action in actions:
                 action.reinforce(avgR - b)
         else:
             actions.reinforce(avgR - b)
         self.optimizer.zero_grad()
         autograd.backward(actions, [None for _ in actions])
         self.optimizer.step()
     self.actionSeqs = []
Code Example #27
File: baseprior.py Project: ruizhilijhu/beer
    def expected_value(self):
        '''Mean value of the random variable w.r.t. to the distribution.

        Returns:
            ``torch.Tensor``
        '''
        copied_tensor = torch.tensor(self.natural_parameters,
                                     requires_grad=True)
        log_norm = self.log_norm(copied_tensor)
        ta.backward(log_norm)
        return copied_tensor.grad.detach()
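Examples #19 and #27 both exploit a standard exponential-family identity: the gradient of the log-normalizer with respect to the natural parameters equals the expected sufficient statistics. A small numerical check of that identity, using a univariate Gaussian as an assumed example (the beer priors apply the same trick with their own log_norm):

import torch
from torch import autograd

# Natural parameters of N(mean=1.0, var=2.0): eta = (mean/var, -1/(2*var))
mean, var = 1.0, 2.0
eta = torch.tensor([mean / var, -1.0 / (2.0 * var)], requires_grad=True)

def log_norm(eta):
    # Log-normalizer A(eta) of the univariate Gaussian.
    return -eta[0] ** 2 / (4 * eta[1]) - 0.5 * torch.log(-2 * eta[1])

autograd.backward(log_norm(eta))
print(eta.grad)  # approximately (E[x], E[x^2]) = (1.0, 3.0)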
Code Example #28
def update_controller(actionSeqs, valueSeqs, avgR):
    print('Reinforcing for epoch %d' % e)
    LossFn = nn.SmoothL1Loss()
    value_loss = 0
    for (actions, values) in zip(actionSeqs, valueSeqs):
        actions.reinforce(-(values.data - avgR))
        rew = Variable(torch.Tensor([avgR] * values.size(0))).detach()
        value_loss += LossFn(values, rew)
    opti.zero_grad()
    autograd.backward([value_loss] + actionSeqs,
                      [torch.ones(1)] + [None for _ in actionSeqs])
    opti.step()
Code Example #29
File: first_policy.py Project: jklaise/sushigo
def finish_game(policy, optimizer):
    rewards = policy.rewards + [0]
    rewards = torch.Tensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)

    for action, r in zip(policy.saved_actions, rewards):
        action.reinforce(r)
    optimizer.zero_grad()
    autograd.backward(policy.saved_actions, [None for _ in policy.saved_actions])
    optimizer.step()
    del policy.rewards[:]
    del policy.saved_actions[:]
Code Example #30
def train_model(model, train, valid, save_path):
    """
    Function that trains the model
    :param model: The model to train
    :param train: The training set
    :param valid: The validation set
    :return:
    """
    # optimizer for the network
    adam = optim.Adam(model.parameters(), lr=3e-4)

    for epoch in range(args.nb_epochs):
        for i, (batch, label) in enumerate(train):
            # put batch on device
            batch = batch.to(args.device)

            # obtain the parameters from the encoder and compute KL divergence
            mu, log_sigma, g_z = model(batch)
            kl = kl_div(mu, log_sigma)

            # compute the reconstruction loss
            logpx_z = ll(batch.view(-1, 3 * 32 * 32),
                         g_z.view(-1, 3 * 32 * 32))

            # combine the two loss terms and compute gradients
            elbo = (logpx_z - kl).mean()

            # maximize the elbo i.e. minimize - elbo
            autograd.backward([-elbo])

            # Update the parameters and zero the gradients for the next mini-batch
            adam.step()
            adam.zero_grad()

        # compute the loss for the validation set
        with torch.no_grad():
            valid_elbo = torch.zeros(1)
            nb_batches = 0
            for i, (batch, label) in enumerate(valid):
                nb_batches += 1
                batch = batch.to(args.device)
                mu, log_sigma, g_z = model(batch)
                kl = kl_div(mu, log_sigma)
                logpx_z = ll(batch.view(-1, 3 * 32 * 32),
                             g_z.view(-1, 3 * 32 * 32))
                valid_elbo += (logpx_z - kl).mean()
            valid_elbo /= nb_batches
            print("After epoch {} the validation loss is: ".format(epoch + 1),
                  valid_elbo.item())

    # save the model to be used later
    torch.save(model.state_dict(), save_path)
Code Example #31
File: reinforce.py Project: JerryLauzzz/examples
def finish_episode():
    R = 0
    rewards = []
    for r in policy.rewards[::-1]:
        R = r + args.gamma * R
        rewards.insert(0, R)
    rewards = torch.Tensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    for action, r in zip(policy.saved_actions, rewards):
        action.reinforce(r)
    optimizer.zero_grad()
    autograd.backward(policy.saved_actions, [None for _ in policy.saved_actions])
    optimizer.step()
    del policy.rewards[:]
    del policy.saved_actions[:]
Code Example #32
File: RL.py Project: Alpslee/jetson-reinforcement
def finish_episode():					# training at the end of an episode
	global num_episodes
	print('finish_episode({:d})'.format(num_episodes))
	R = 0
	rewards = []
	for r in policy.rewards[::-1]:
		R = r + args.gamma * R
		rewards.insert(0, R)
	rewards = torch.Tensor(rewards)
	rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
	for action, r in zip(policy.saved_actions, rewards):
		action.reinforce(r)
	optimizer.zero_grad()
	autograd.backward(policy.saved_actions, [None for _ in policy.saved_actions])
	optimizer.step()
	del policy.rewards[:]
	del policy.saved_actions[:]
	num_episodes += 1
Code Example #33
File: breakout-ram.py Project: qinjian623/dlnotes
def finish_episode():
    R = 0
    rewards = []
    # Weighted sum of rewards
    for r in model.rewards[::-1]:
        R = r + args.gamma * R
        rewards.insert(0, R)
    rewards = torch.Tensor(rewards)

    # Norm
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)

    # What is the action?
    for action, r in zip(model.saved_actions, rewards):
        action.reinforce(r)
    optimizer.zero_grad()
    autograd.backward(model.saved_actions, [None for _ in model.saved_actions])
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]
Code Example #34
File: actor_critic.py Project: deo1/deo1
def finish_episode():
    R = 0
    saved_actions = model.saved_actions
    value_loss = 0
    rewards = []
    for r in model.rewards[::-1]:
        R = r + args.gamma * R
        rewards.insert(0, R)
    rewards = torch.Tensor(rewards)
    rewards = (rewards - rewards.mean()) / (rewards.std() + np.finfo(np.float32).eps)
    for (action, value), r in zip(saved_actions, rewards):
        reward = r - value.data[0,0]
        action.reinforce(reward)
        value_loss += F.smooth_l1_loss(value, Variable(torch.Tensor([r])))
    optimizer.zero_grad()
    final_nodes = [value_loss] + list(map(lambda p: p.action, saved_actions))
    gradients = [torch.ones(1)] + [None] * len(saved_actions)
    autograd.backward(final_nodes, gradients)
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]
Code Example #35
File: rl.py Project: ShawnXiha/fast_abs_rl
def a2c_train_step(agent, abstractor, loader, opt, grad_fn,
                   gamma=0.99, reward_fn=compute_rouge_l,
                   stop_reward_fn=compute_rouge_n(n=1), stop_coeff=1.0):
    opt.zero_grad()
    indices = []
    probs = []
    baselines = []
    ext_sents = []
    art_batch, abs_batch = next(loader)
    for raw_arts in art_batch:
        (inds, ms), bs = agent(raw_arts)
        baselines.append(bs)
        indices.append(inds)
        probs.append(ms)
        ext_sents += [raw_arts[idx.item()]
                      for idx in inds if idx.item() < len(raw_arts)]
    with torch.no_grad():
        summaries = abstractor(ext_sents)
    i = 0
    rewards = []
    avg_reward = 0
    for inds, abss in zip(indices, abs_batch):
        rs = ([reward_fn(summaries[i+j], abss[j])
              for j in range(min(len(inds)-1, len(abss)))]
              + [0 for _ in range(max(0, len(inds)-1-len(abss)))]
              + [stop_coeff*stop_reward_fn(
                  list(concat(summaries[i:i+len(inds)-1])),
                  list(concat(abss)))])
        assert len(rs) == len(inds)
        avg_reward += rs[-1]/stop_coeff
        i += len(inds)-1
        # compute discounted rewards
        R = 0
        disc_rs = []
        for r in rs[::-1]:
            R = r + gamma * R
            disc_rs.insert(0, R)
        rewards += disc_rs
    indices = list(concat(indices))
    probs = list(concat(probs))
    baselines = list(concat(baselines))
    # standardize rewards
    reward = torch.Tensor(rewards).to(baselines[0].get_device())
    reward = (reward - reward.mean()) / (
        reward.std() + float(np.finfo(np.float32).eps))
    baseline = torch.cat(baselines).squeeze()
    avg_advantage = 0
    losses = []
    for action, p, r, b in zip(indices, probs, reward, baseline):
        advantage = r - b
        avg_advantage += advantage
        losses.append(-p.log_prob(action)
                      * (advantage/len(indices))) # divide by T*B
    critic_loss = F.mse_loss(baseline, reward)
    # backprop and update
    autograd.backward(
        [critic_loss] + losses,
        [torch.ones(1).to(critic_loss.get_device())]*(1+len(losses))
    )
    grad_log = grad_fn()
    opt.step()
    log_dict = {}
    log_dict.update(grad_log)
    log_dict['reward'] = avg_reward/len(art_batch)
    log_dict['advantage'] = avg_advantage.item()/len(indices)
    log_dict['mse'] = critic_loss.item()
    assert not math.isnan(log_dict['grad_norm'])
    return log_dict
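Several of the examples above (#3, #11-#13, #24, #28, #34 and #35) pass a list of output tensors together with a matching list of seed gradients. A minimal sketch of that multi-tensor form, using two hypothetical scalar outputs of the same leaf:

import torch
from torch import autograd

w = torch.randn(3, requires_grad=True)
a = (2 * w).sum()   # first scalar output
b = (w ** 2).sum()  # second scalar output

# One backward pass through both outputs; each entry gets its own seed
# gradient (None would also work here, since both outputs are scalars).
autograd.backward([a, b], [torch.tensor(1.0), torch.tensor(1.0)])
print(w.grad)  # d(a)/dw + d(b)/dw = 2 + 2*w, accumulated into a single .grad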