Example #1
    def forward(ctx, true_binary, rule_masks, input_logits):
        ctx.save_for_backward(true_binary, rule_masks, input_logits)

        b = torch.max(input_logits, 2, keepdim=True)[0]
        raw_logits = input_logits - b
        exp_pred = torch.exp(raw_logits) * rule_masks + cmd_args.prob_fix

        norm = torch.sum(exp_pred, 2, keepdim=True)
        prob = torch.div(exp_pred, norm)

        ll = torch.abs(torch.sum(true_binary * prob, 2))

        mask = 1 - rule_masks[:, :, -1]

        logll = mask * torch.log(ll)

        if cmd_args.old_loss:
            nnz = torch.sum(mask)
            loss = -torch.sum(logll) / nnz
        else:
            loss = -torch.sum(logll) / true_binary.size()[1]
        
        if input_logits.is_cuda:
            return torch.Tensor([loss]).cuda()
        else:
            return torch.Tensor([loss])
Example #2
    def forward(self, feat, right, wrong, batch_wrong, fake=None, fake_diff_mask=None):

        num_wrong = wrong.size(1)
        batch_size = feat.size(0)

        feat = feat.view(-1, self.ninp, 1)
        right_dis = torch.bmm(right.view(-1, 1, self.ninp), feat)
        wrong_dis = torch.bmm(wrong, feat)
        batch_wrong_dis = torch.bmm(batch_wrong, feat)

        wrong_score = torch.sum(torch.exp(wrong_dis - right_dis.expand_as(wrong_dis)),1) \
                + torch.sum(torch.exp(batch_wrong_dis - right_dis.expand_as(batch_wrong_dis)),1)

        loss_dis = torch.sum(torch.log(wrong_score + 1))
        loss_norm = right.norm() + feat.norm() + wrong.norm() + batch_wrong.norm()

        if fake is not None:
            fake_dis = torch.bmm(fake.view(-1, 1, self.ninp), feat)
            fake_score = torch.masked_select(torch.exp(fake_dis - right_dis), fake_diff_mask)

            margin_score = F.relu(torch.log(fake_score + 1) - self.margin)
            loss_fake = torch.sum(margin_score)
            loss_dis += loss_fake
            loss_norm += fake.norm()

        loss = (loss_dis + 0.1 * loss_norm) / batch_size
        if fake is not None:
            return loss, loss_fake.item() / batch_size
        else:
            return loss
Example #3
    def norm_flow(self, params, z, v):

        h = torch.tanh(params[0][0](z))
        mew_ = params[0][1](h)
        sig_ = torch.sigmoid(params[0][2](h) + 5.)  # [PB,Z]


        v = v*sig_ + mew_

        logdet = torch.sum(torch.log(sig_), 1)


        h = torch.tanh(params[1][0](v))
        mew_ = params[1][1](h)
        sig_ = torch.sigmoid(params[1][2](h) + 5.)  # [PB,Z]

        z = z*sig_ + mew_

        logdet2 = torch.sum(torch.log(sig_), 1)

        #[PB]
        logdet = logdet + logdet2
        
        #[PB,Z], [PB]
        return z, v, logdet
Example #4
    def _test_jacobian(self, input_dim, hidden_dim):
        jacobian = torch.zeros(input_dim, input_dim)
        iaf = InverseAutoregressiveFlow(input_dim, hidden_dim, sigmoid_bias=0.5)

        def nonzero(x):
            return torch.sign(torch.abs(x))

        x = torch.randn(1, input_dim)
        iaf_x = iaf(x)
        analytic_ldt = iaf.log_abs_det_jacobian(x, iaf_x).data.sum()

        for j in range(input_dim):
            for k in range(input_dim):
                epsilon_vector = torch.zeros(1, input_dim)
                epsilon_vector[0, j] = self.epsilon
                iaf_x_eps = iaf(x + epsilon_vector)
                delta = (iaf_x_eps - iaf_x) / self.epsilon
                jacobian[j, k] = float(delta[0, k].data.sum())

        permutation = iaf.arn.get_permutation()
        permuted_jacobian = jacobian.clone()
        for j in range(input_dim):
            for k in range(input_dim):
                permuted_jacobian[j, k] = jacobian[permutation[j], permutation[k]]
        numeric_ldt = torch.sum(torch.log(torch.diag(permuted_jacobian)))
        ldt_discrepancy = np.fabs(analytic_ldt - numeric_ldt)

        diag_sum = torch.sum(torch.diag(nonzero(permuted_jacobian)))
        lower_sum = torch.sum(torch.tril(nonzero(permuted_jacobian), diagonal=-1))

        assert ldt_discrepancy < self.epsilon
        assert diag_sum == float(input_dim)
        assert lower_sum == float(0.0)
Example #5
    def f1_score_macro(y_true, y_pred, per_class=False, threshold=0.5):
        '''
        Macro f1

        y_true: [bs, classes, x, y]
        y_pred: [bs, classes, x, y]

        Tested: same results as sklearn f1 macro
        '''
        y_true = y_true.byte()
        y_pred = y_pred > threshold

        y_true = y_true.permute(0, 2, 3, 1)
        y_pred = y_pred.permute(0, 2, 3, 1)

        y_true = y_true.contiguous().view(-1, y_true.size()[3])  # [bs*x*y, classes]
        y_pred = y_pred.contiguous().view(-1, y_pred.size()[3])

        f1s = []
        for i in range(y_true.size()[1]):
            intersect = torch.sum(y_true[:, i] * y_pred[:, i])  # multiplying by 0 drops non-overlapping entries
            denominator = torch.sum(y_true[:, i]) + torch.sum(y_pred[:, i])
            # cast to float, otherwise the integer division always yields 0
            f1 = (2 * intersect.float()) / (denominator.float() + 1e-6)
            f1s.append(f1)
        if per_class:
            return np.array(f1s)
        else:
            return np.mean(np.array(f1s))
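A quick sanity check against scikit-learn on fabricated tensors (a minimal sketch; it assumes the function above is callable as a plain function):

    import torch
    from sklearn.metrics import f1_score

    # fabricated multilabel maps: batch of 2, 3 classes, 4x4 grid
    y_true = (torch.rand(2, 3, 4, 4) > 0.5).float()
    y_pred = torch.rand(2, 3, 4, 4)

    ours = f1_score_macro(y_true, y_pred, threshold=0.5)
    flat_true = y_true.permute(0, 2, 3, 1).reshape(-1, 3).numpy()
    flat_pred = (y_pred > 0.5).permute(0, 2, 3, 1).reshape(-1, 3).numpy()
    theirs = f1_score(flat_true, flat_pred, average='macro')
    print(ours, theirs)  # should agree up to the 1e-6 smoothing term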
Example #6
    def forward(self, input, target):
        y_true = target.int().unsqueeze(-1)
        same_id = torch.eq(y_true, y_true.t()).type_as(input)

        pos_mask = same_id
        neg_mask = 1 - same_id

        def _mask_max(input_tensor, mask, axis=None, keepdims=False):
            input_tensor = input_tensor - 1e6 * (1 - mask)
            _max, _idx = torch.max(input_tensor, dim=axis, keepdim=keepdims)
            return _max, _idx

        def _mask_min(input_tensor, mask, axis=None, keepdims=False):
            input_tensor = input_tensor + 1e6 * (1 - mask)
            _min, _idx = torch.min(input_tensor, dim=axis, keepdim=keepdims)
            return _min, _idx

        # output[i, j] = || feature[i, :] - feature[j, :] ||_2
        dist_squared = torch.sum(input ** 2, dim=1, keepdim=True) + \
                       torch.sum(input.t() ** 2, dim=0, keepdim=True) - \
                       2.0 * torch.matmul(input, input.t())
        dist = dist_squared.clamp(min=1e-16).sqrt()

        pos_max, pos_idx = _mask_max(dist, pos_mask, axis=-1)
        neg_min, neg_idx = _mask_min(dist, neg_mask, axis=-1)

        # loss(x, y) = max(0, -y * (x1 - x2) + margin)
        y = torch.ones(same_id.size()[0]).to(DEVICE)
        return F.margin_ranking_loss(neg_min.float(),
                                     pos_max.float(),
                                     y,
                                     margin=self.margin,
                                     size_average=self.size_average)
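The same batch-hard mining can be reproduced on fabricated embeddings with torch.cdist (a minimal sketch; the margin value here is made up):

    import torch
    import torch.nn.functional as F

    feat = torch.randn(8, 16)            # 8 samples, 16-dim embeddings
    ids = torch.randint(0, 3, (8,))      # 3 identities

    same_id = ids.unsqueeze(1).eq(ids.unsqueeze(0)).float()
    dist = torch.cdist(feat, feat)                              # pairwise L2 distances
    pos_max = (dist - 1e6 * (1 - same_id)).max(dim=1).values    # hardest positive per row
    neg_min = (dist + 1e6 * same_id).min(dim=1).values          # hardest negative per row
    y = torch.ones(8)
    loss = F.margin_ranking_loss(neg_min, pos_max, y, margin=0.3)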
Example #7
    def norm_flow(self, params, z, v, logposterior):

        h = torch.tanh(params[0][0](z))
        mew_ = params[0][1](h)
        sig_ = torch.sigmoid(params[0][2](h) + 5.)  # [PB,Z]


        z_reshaped = z.view(self.P, self.B, self.z_size)

        gradients = torch.autograd.grad(outputs=logposterior(z_reshaped), inputs=z_reshaped,
                          grad_outputs=self.grad_outputs,
                          create_graph=True, retain_graph=True, only_inputs=True)[0]
        gradients = gradients.detach()

        gradients = gradients.view(-1,self.z_size)


        v = v*sig_ + mew_*gradients

        logdet = torch.sum(torch.log(sig_), 1)


        h = torch.tanh(params[1][0](v))
        mew_ = params[1][1](h)
        sig_ = torch.sigmoid(params[1][2](h) + 5.)  # [PB,Z]

        z = z*sig_ + mew_*v

        logdet2 = torch.sum(torch.log(sig_), 1)

        #[PB]
        logdet = logdet + logdet2
        
        #[PB,Z], [PB]
        return z, v, logdet
Example #8
    def test_regularization(self):
        penalty = self.model.get_regularization_penalty().data
        assert (penalty > 0).all()

        penalty2 = 0

        # Config specifies penalty as
        #   "regularizer": [
        #     ["weight$", {"type": "l2", "alpha": 10}],
        #     ["bias$", {"type": "l1", "alpha": 5}]
        #   ]
        for name, parameter in self.model.named_parameters():
            if name.endswith("weight"):
                weight_penalty = 10 * torch.sum(torch.pow(parameter, 2))
                penalty2 += weight_penalty
            elif name.endswith("bias"):
                bias_penalty = 5 * torch.sum(torch.abs(parameter))
                penalty2 += bias_penalty

        assert (penalty == penalty2.data).all()

        # You get a RuntimeError if you call `model.forward` twice on the same inputs.
        # The data and config are such that the whole dataset is one batch.
        training_batch = next(self.iterator(self.instances, num_epochs=1))
        validation_batch = next(self.iterator(self.instances, num_epochs=1))

        training_loss = self.trainer._batch_loss(training_batch, for_training=True).data
        validation_loss = self.trainer._batch_loss(validation_batch, for_training=False).data

        # Training loss should have the regularization penalty, but validation loss should not.
        assert (training_loss != validation_loss).all()

        # Training loss should equal the validation loss plus the penalty.
        penalized = validation_loss + penalty
        assert (training_loss == penalized).all()
Example #9
def iou(pr, gt, eps=1e-7, threshold=None, activation='sigmoid'):
    """
    Source:
        https://github.com/catalyst-team/catalyst/
    Args:
        pr (torch.Tensor): A list of predicted elements
        gt (torch.Tensor):  A list of elements that are to be predicted
        eps (float): epsilon to avoid zero division
        threshold: threshold for outputs binarization
        activation (str): 'sigmoid', 'softmax2d', or None/'none', applied to ``pr``
    Returns:
        float: IoU (Jaccard) score
    """

    if activation is None or activation == "none":
        activation_fn = lambda x: x
    elif activation == "sigmoid":
        activation_fn = torch.nn.Sigmoid()
    elif activation == "softmax2d":
        activation_fn = torch.nn.Softmax2d()
    else:
        raise NotImplementedError(
            "Activation implemented for sigmoid and softmax2d"
        )

    pr = activation_fn(pr)

    if threshold is not None:
        pr = (pr > threshold).float()

    intersection = torch.sum(gt * pr)
    union = torch.sum(gt) + torch.sum(pr) - intersection + eps
    return (intersection + eps) / union
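Minimal usage sketch on fabricated masks:

    import torch

    gt = (torch.rand(2, 1, 8, 8) > 0.5).float()
    logits = torch.randn(2, 1, 8, 8)
    score = iou(logits, gt, threshold=0.5, activation='sigmoid')
    print(float(score))  # in (0, 1]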
Example #10
    def forward(self, log_prob, y_true, mask):
        mask = mask.float()
        log_P = torch.gather(log_prob.view(-1, log_prob.size(2)), 1, y_true.contiguous().view(-1, 1))  # batch*time x 1
        log_P = log_P.view(y_true.size(0), y_true.size(1))  # batch x time
        log_P = log_P * mask  # batch x time
        sum_log_P = torch.sum(log_P, dim=1) / torch.sum(mask, dim=1)  # batch
        return -sum_log_P
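The method averages the gathered log-probabilities over the unmasked steps of each sequence; the same computation inlined on fabricated shapes (batch of 2, 5 steps, vocabulary of 7):

    import torch
    import torch.nn.functional as F

    log_prob = F.log_softmax(torch.randn(2, 5, 7), dim=-1)
    y_true = torch.randint(0, 7, (2, 5))
    mask = torch.tensor([[1., 1., 1., 0., 0.],
                         [1., 1., 1., 1., 1.]])

    log_P = torch.gather(log_prob.view(-1, 7), 1, y_true.view(-1, 1)).view(2, 5)
    loss = -torch.sum(log_P * mask, dim=1) / torch.sum(mask, dim=1)
    print(loss.shape)  # torch.Size([2])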
Example #11
def get_reinforce_ps_loss(phi, p0, reinforce=False):
    # returns pseudoloss: loss whose gradient is unbiased for the
    # true gradient

    d = len(p0)
    e_b = sigmoid(phi)

    bn_rv = Bernoulli(probs = torch.ones(d) * e_b)
    binary_samples = bn_rv.sample().detach()

    if reinforce:
        binary_samples_ = bn_rv.sample().detach()
        baseline = torch.sum((binary_samples_ - p0)**2)

    else:
        baseline = 0.0

    sampled_loss = torch.sum((binary_samples - p0)**2)

    sampled_log_q = get_bernoulli_log_prob(e_b, binary_samples)

    ps_loss = (sampled_loss - baseline).detach() * sampled_log_q

    return ps_loss
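`get_bernoulli_log_prob` is not shown here; a sketch consistent with the call site (a scalar probability e_b broadcast against the 0/1 sample vector) could be:

    import torch

    def get_bernoulli_log_prob(probs, samples):
        # hypothetical helper: sum of independent Bernoulli log-probabilities;
        # equivalent to Bernoulli(probs).log_prob(samples).sum()
        return torch.sum(samples * torch.log(probs)
                         + (1 - samples) * torch.log(1 - probs))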
Example #12
    def _get_state_cost(self, state: NlvrDecoderState) -> torch.Tensor:
        """
        Return the cost of a finished state. Since it is a finished state, the group size will be 1,
        and hence we'll return just one cost.
        """
        if not state.is_finished():
            raise RuntimeError("_get_state_cost() is not defined for unfinished states!")
        # Our checklist cost is a sum of squared error from where we want to be, making sure we
        # take into account the mask.
        checklist_balance = state.checklist_state[0].get_balance()
        checklist_cost = torch.sum((checklist_balance) ** 2)

        # This is the number of items on the agenda that we want to see in the decoded sequence.
        # We use this as the denotation cost if the path is incorrect.
        # Note: If we are penalizing the model for producing non-agenda actions, this is not the
        # upper limit on the checklist cost. That would be the number of terminal actions.
        denotation_cost = torch.sum(state.checklist_state[0].checklist_target.float())
        checklist_cost = self._checklist_cost_weight * checklist_cost
        # TODO (pradeep): The denotation based cost below is strict. Maybe define a cost based on
        # how many worlds the logical form is correct in?
        # label_strings being None happens when we are testing. We do not care about the cost then.
        # TODO (pradeep): Make this cleaner.
        if state.label_strings is None or all(self._check_state_denotations(state)):
            cost = checklist_cost
        else:
            cost = checklist_cost + (1 - self._checklist_cost_weight) * denotation_cost
        return cost
Example #13
    def forward(self, true_binary, rule_masks, raw_logits):
        if cmd_args.loss_type == 'binary':
            exp_pred = torch.exp(raw_logits) * rule_masks

            norm = torch.sum(exp_pred, 2, keepdim=True)
            prob = torch.div(exp_pred, norm)

            return F.binary_cross_entropy(prob, true_binary) * cmd_args.max_decode_steps

        if cmd_args.loss_type == 'perplexity':
            return my_perp_loss(true_binary, rule_masks, raw_logits)

        if cmd_args.loss_type == 'vanilla':
            exp_pred = torch.exp(raw_logits) * rule_masks + 1e-30
            norm = torch.sum(exp_pred, 2, keepdim=True)
            prob = torch.div(exp_pred, norm)

            ll = torch.abs(torch.sum(true_binary * prob, 2))
            mask = 1 - rule_masks[:, :, -1]
            logll = mask * torch.log(ll)

            loss = -torch.sum(logll) / true_binary.size()[1]
            
            return loss
        print('unknown loss type %s' % cmd_args.loss_type)
        raise NotImplementedError
Example #14
def average_without_padding(x, ids, padding_id, cuda=False, eps=1e-8):
    # mask out padding positions, then average over the time dimension
    mask = Variable(torch.from_numpy(np.not_equal(ids, padding_id).astype(int)[:, :, np.newaxis])).float()
    if cuda:
        mask = mask.cuda()
    mask = mask.permute(1, 2, 0).expand_as(x)
    s = torch.sum(x*mask, dim=2) / (torch.sum(mask, dim=2)+eps)
    return s
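Fabricated shapes for a quick check (ids is a (seq_len, batch) numpy array, x is (batch, hidden, seq_len)):

    import numpy as np
    import torch
    from torch.autograd import Variable

    ids = np.array([[1, 2], [3, 0], [0, 0]])   # 0 is the padding id
    x = Variable(torch.randn(2, 5, 3))         # batch=2, hidden=5, seq_len=3
    print(average_without_padding(x, ids, padding_id=0).shape)  # (2, 5)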
Example #15
def calculate_variance_term(pred, gt, means, n_objects, delta_v, norm=2):
    """pred: bs, height * width, n_filters
       gt: bs, height * width, n_instances
       means: bs, n_instances, n_filters"""

    bs, n_loc, n_filters = pred.size()
    n_instances = gt.size(2)

    # bs, n_loc, n_instances, n_filters
    means = means.unsqueeze(1).expand(bs, n_loc, n_instances, n_filters)
    # bs, n_loc, n_instances, n_filters
    pred = pred.unsqueeze(2).expand(bs, n_loc, n_instances, n_filters)
    # bs, n_loc, n_instances, n_filters
    gt = gt.unsqueeze(3).expand(bs, n_loc, n_instances, n_filters)

    _var = (torch.clamp(torch.norm((pred - means), norm, 3) -
                        delta_v, min=0.0) ** 2) * gt[:, :, :, 0]

    var_term = 0.0
    for i in range(bs):
        _var_sample = _var[i, :, :n_objects[i]]  # n_loc, n_objects
        _gt_sample = gt[i, :, :n_objects[i], 0]  # n_loc, n_objects

        var_term += torch.sum(_var_sample) / torch.sum(_gt_sample)
    var_term = var_term / bs

    return var_term
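A smoke test with fabricated shapes (every pixel assigned to a single instance):

    import torch

    bs, n_loc, n_filters, n_instances = 2, 16, 8, 3
    pred = torch.randn(bs, n_loc, n_filters)
    gt = torch.zeros(bs, n_loc, n_instances)
    gt[:, :, 0] = 1.0                          # one active instance per pixel
    means = torch.randn(bs, n_instances, n_filters)
    print(calculate_variance_term(pred, gt, means, n_objects=[1, 1], delta_v=0.5))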
Example #16
    def compute_loss(self, outputs, masks, labels):
        """
        Our implementation of weighted BCE loss.
        """
        labels = labels.view(-1)
        masks = masks.view(-1)
        outputs = outputs.view(-1)

        # Generate the weights
        ones = torch.sum(labels)
        total = labels.nelement()
        weights = torch.FloatTensor(outputs.size()).type_as(outputs.data)
        weights[labels.long() == 1] = 1.0 - ones / total
        weights[labels.long() == 0] = ones / total
        weights = weights.view(weights.size(0), 1).expand(weights.size(0), 2)

        # Generate the log outputs
        outputs = outputs.clamp(min=1e-8)
        log_outputs = torch.log(outputs)
        neg_outputs = 1.0 - outputs
        neg_outputs = neg_outputs.clamp(min=1e-8)
        neg_log_outputs = torch.log(neg_outputs)
        all_outputs = torch.cat((log_outputs.view(-1, 1), neg_log_outputs.view(-1, 1)), 1)

        all_values = all_outputs.mul(torch.autograd.Variable(weights))
        all_labels = torch.autograd.Variable(torch.cat((labels.view(-1, 1), (1.0 - labels).view(-1, 1)), 1))
        all_masks = torch.autograd.Variable(torch.cat((masks.view(-1, 1), masks.view(-1, 1)), 1))
        loss = -torch.sum(all_values.mul(all_labels).mul(all_masks)) / outputs.size(0)
        return loss
Example #17
def sample_from_discretized_mix_logistic_1d(l, nr_mix):
    # Pytorch ordering
    l = l.permute(0, 2, 3, 1)
    ls = [int(y) for y in l.size()]
    xs = ls[:-1] + [1] #[3]

    # unpack parameters
    logit_probs = l[:, :, :, :nr_mix]
    l = l[:, :, :, nr_mix:].contiguous().view(xs + [nr_mix * 2]) # for mean, scale

    # sample mixture indicator from softmax
    temp = torch.FloatTensor(logit_probs.size())
    if l.is_cuda: temp = temp.cuda()
    temp.uniform_(1e-5, 1. - 1e-5)
    temp = logit_probs.data - torch.log(- torch.log(temp))
    _, argmax = temp.max(dim=3)
   
    one_hot = to_one_hot(argmax, nr_mix)
    sel = one_hot.view(xs[:-1] + [1, nr_mix])
    # select logistic parameters
    means = torch.sum(l[:, :, :, :, :nr_mix] * sel, dim=4) 
    log_scales = torch.clamp(torch.sum(
        l[:, :, :, :, nr_mix:2 * nr_mix] * sel, dim=4), min=-7.)
    u = torch.FloatTensor(means.size())
    if l.is_cuda: u = u.cuda()
    u.uniform_(1e-5, 1. - 1e-5)
    u = Variable(u)
    x = means + torch.exp(log_scales) * (torch.log(u) - torch.log(1. - u))
    x0 = torch.clamp(torch.clamp(x[:, :, :, 0], min=-1.), max=1.)
    out = x0.unsqueeze(1)
    return out
Example #18
    def forward(self, input_features, adj):
        # the GCN encoder/pooling path is disabled; use the flattened adjacency directly
        graph_h = input_features.view(-1, self.max_num_nodes * self.max_num_nodes)
        # vae
        h_decode, z_mu, z_lsgms = self.vae(graph_h)
        out = F.sigmoid(h_decode)
        out_tensor = out.cpu().data
        recon_adj_lower = self.recover_adj_lower(out_tensor)
        recon_adj_tensor = self.recover_full_adj_from_lower(recon_adj_lower)

        # set matching features be degree
        out_features = torch.sum(recon_adj_tensor, 1)

        adj_data = adj.cpu().data[0]
        adj_features = torch.sum(adj_data, 1)

        S = self.edge_similarity_matrix(adj_data, recon_adj_tensor, adj_features, out_features,
                self.deg_feature_similarity)

        # initialization strategies
        init_corr = 1 / self.max_num_nodes
        init_assignment = torch.ones(self.max_num_nodes, self.max_num_nodes) * init_corr
        assignment = self.mpm(init_assignment, S)

        # matching
        # use negative of the assignment score since the alg finds min cost flow
        row_ind, col_ind = scipy.optimize.linear_sum_assignment(-assignment.numpy())
        print('row: ', row_ind)
        print('col: ', col_ind)
        # reordering rows according to col index (permute_adj) is currently disabled
        adj_permuted = adj_data
        adj_vectorized = adj_permuted[torch.triu(torch.ones(self.max_num_nodes,self.max_num_nodes) )== 1].squeeze_()
        adj_vectorized_var = Variable(adj_vectorized).cuda()

        adj_recon_loss = self.adj_recon_loss(adj_vectorized_var, out[0])
        print('recon: ', adj_recon_loss)
        print(adj_vectorized_var)
        print(out[0])

        loss_kl = -0.5 * torch.sum(1 + z_lsgms - z_mu.pow(2) - z_lsgms.exp())
        loss_kl /= self.max_num_nodes * self.max_num_nodes # normalize
        print('kl: ', loss_kl)

        loss = adj_recon_loss + loss_kl

        return loss
Example #19
    def forward(self, input1):
        self.batchgrid3d = torch.zeros(torch.Size([input1.size(0)]) + self.grid3d.size())

        for i in range(input1.size(0)):
            self.batchgrid3d[i] = self.grid3d

        self.batchgrid3d = Variable(self.batchgrid3d)

        x = torch.sum(torch.mul(self.batchgrid3d, input1[:,:,:,0:4]), 3)
        y = torch.sum(torch.mul(self.batchgrid3d, input1[:,:,:,4:8]), 3)
        z = torch.sum(torch.mul(self.batchgrid3d, input1[:,:,:,8:]), 3)
        r = torch.sqrt(x**2 + y**2 + z**2) + 1e-5

        theta = torch.acos(z/r)/(np.pi/2) - 1
        # atan with a quadrant correction for x < 0
        phi = torch.atan(y/(x + 1e-5)) + np.pi * x.lt(0).type(torch.FloatTensor) * (y.ge(0).type(torch.FloatTensor) - y.lt(0).type(torch.FloatTensor))
        phi = phi/np.pi


        output = torch.cat([theta,phi], 3)

        return output
Example #20
def pixel_acc(pred, label, ignore_index=-1):
    _, preds = torch.max(pred, dim=1)
    valid = (label != ignore_index).long()
    acc_sum = torch.sum(valid * (preds == label).long())
    pixel_sum = torch.sum(valid)
    acc = acc_sum.float() / (pixel_sum.float() + 1e-10)
    return acc
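Usage sketch on fabricated logits, with -1 marking ignored pixels:

    import torch

    pred = torch.randn(2, 4, 8, 8)             # batch, classes, H, W
    label = torch.randint(-1, 4, (2, 8, 8))    # -1 = ignore
    print(pixel_acc(pred, label).item())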
Example #21
    def eval_one_batch(self, batch):
        enc_batch, enc_padding_mask, enc_lens, enc_batch_extend_vocab, extra_zeros, c_t_1, coverage = \
            get_input_from_batch(batch, use_cuda)
        dec_batch, dec_padding_mask, max_dec_len, dec_lens_var, target_batch = \
            get_output_from_batch(batch, use_cuda)

        encoder_outputs, encoder_hidden, max_encoder_output = self.model.encoder(enc_batch, enc_lens)
        s_t_1 = self.model.reduce_state(encoder_hidden)

        if config.use_maxpool_init_ctx:
            c_t_1 = max_encoder_output

        step_losses = []
        for di in range(min(max_dec_len, config.max_dec_steps)):
            y_t_1 = dec_batch[:, di]  # Teacher forcing
            final_dist, s_t_1, c_t_1, attn_dist, p_gen, coverage = self.model.decoder(y_t_1, s_t_1,
                                                                encoder_outputs, enc_padding_mask, c_t_1,
                                                                extra_zeros, enc_batch_extend_vocab, coverage)
            target = target_batch[:, di]
            gold_probs = torch.gather(final_dist, 1, target.unsqueeze(1)).squeeze()
            step_loss = -torch.log(gold_probs + config.eps)
            if config.is_coverage:
                step_coverage_loss = torch.sum(torch.min(attn_dist, coverage), 1)
                step_loss = step_loss + config.cov_loss_wt * step_coverage_loss

            step_mask = dec_padding_mask[:, di]
            step_loss = step_loss * step_mask
            step_losses.append(step_loss)

        sum_step_losses = torch.sum(torch.stack(step_losses, 1), 1)
        batch_avg_loss = sum_step_losses / dec_lens_var
        loss = torch.mean(batch_avg_loss)

        return loss.item()
Example #22
    def get_receptive_field(self, patch, layer_idx=None):
        is_originally_frozen = self._is_frozen
        self.zero_grad()
        self.freeze(False)

        image_size = self.input_size
        batch_shape = (1, 3, image_size, image_size)

        x = self._make_cuda(torch.autograd.Variable(
            torch.rand(*batch_shape), requires_grad=True))
        z = self.forward(x, layer_idx=layer_idx)
        z_patch = patch.forward(z)

        torch.sum(z_patch).backward()

        rf = x.grad.data.cpu().numpy()
        rf = rf[0, 0]
        rf = list(zip(*np.where(np.abs(rf) > 1e-6)))

        (i_nw, j_nw), (i_se, j_se) = rf[0], rf[-1]

        rf_w, rf_h = (j_se - j_nw + 1,
                      i_se - i_nw + 1)

        self.zero_grad()
        self.freeze(is_originally_frozen)

        return (i_nw, j_nw), (rf_w, rf_h)
Example #23
    def reverse_flow(self, z):

        B = z.shape[0]
        C = z.shape[1]
        f = self.flows

        logdet = 0.
        reverse_ = list(range(self.n_flows))[::-1]
        for i in reverse_:
            z1 = z[:,:C//2]
            z2 = z[:,C//2:]
            sig1 = torch.sigmoid(f[str(i)]['f2_sig'](z1))
            mu1 = f[str(i)]['f2_mu'](z1)

            z2 = (z2 - mu1) / sig1

            sig2 = torch.sigmoid(f[str(i)]['f1_sig'](z2))
            mu2 = f[str(i)]['f1_mu'](z2)

            z1 = (z1 - mu2) / sig2
            
            z = torch.cat([z1,z2],1)
            z = z[:,f[str(i)]['inv_perm']]

            sig1 = sig1.view(B, -1)
            sig2 = sig2.view(B, -1)
            logdet += torch.sum(torch.log(sig1), 1)
            logdet += torch.sum(torch.log(sig2), 1)

        return z, logdet
Example #24
def test_fun_weak(model,loss_fn,dataloader,dataloader_neg,batch_tnf,use_cuda=True,triplet=False,tps_grid_regularity_loss=0):
    model.eval()
    test_loss = 0
    if dataloader_neg is not None: 
        dataloader_neg_iter=iter(dataloader_neg)
    for batch_idx, batch in enumerate(dataloader):
        batch = batch_tnf(batch)
        if dataloader_neg is not None: 
            batch_neg = next(dataloader_neg_iter)
            batch_neg = batch_tnf(batch_neg)
            theta_pos,corr_pos,theta_neg,corr_neg = model(batch, batch_neg)
            inliers_pos = loss_fn(theta_pos,corr_pos)            
            inliers_neg = loss_fn(theta_neg,corr_neg)
            loss = torch.sum(inliers_neg - inliers_pos)
        elif dataloader_neg is None and not triplet:
            theta,corr = model(batch)
            loss = loss_fn(theta,corr)
        elif dataloader_neg is None and triplet:
            theta_pos,corr_pos,theta_neg,corr_neg = model(batch, triplet=True)
            inliers_pos = loss_fn(theta_pos,corr_pos)
            inliers_neg = loss_fn(theta_neg,corr_neg)
            loss = torch.sum(inliers_neg - inliers_pos)
        test_loss += loss.item()

    test_loss /= len(dataloader)
    print('Test set: Average loss: {:.4f}'.format(test_loss))
    return test_loss
Example #25
def neg_hartmann6(X: Tensor) -> Tensor:
    r"""Negative Hartmann6 test function.

    Six-dimensional function (typically evaluated on `[0, 1]^6`)

        `H(x) = - sum_{i=1}^4 ALPHA_i exp( - sum_{j=1}^6 A_ij (x_j - P_ij)**2 )`

    H has 6 local minima and a global minimum at

        `z = (0.20169, 0.150011, 0.476874, 0.275332, 0.311652, 0.6573)`

    with `H(z) = -3.32237`

    Args:
        X: A Tensor of size `6` or `k x 6` (k batch evaluations).

    Returns:
        `-H(X)`, the negative value of the standard Hartmann6 function.
    """
    batch = X.ndimension() > 1
    X = X if batch else X.unsqueeze(0)
    inner_sum = torch.sum(X.new(A) * (X.unsqueeze(1) - 0.0001 * X.new(P)) ** 2, dim=2)
    H = -torch.sum(X.new(ALPHA) * torch.exp(-inner_sum), dim=1)
    result = -H
    return result if batch else result.squeeze(0)
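The module-level constants A, P and ALPHA are not shown; given the 1e-4 scaling of P in the code, the standard Hartmann6 values would be:

    import torch

    A = [[10, 3, 17, 3.5, 1.7, 8],
         [0.05, 10, 17, 0.1, 8, 14],
         [3, 3.5, 1.7, 10, 17, 8],
         [17, 8, 0.05, 10, 0.1, 14]]
    P = [[1312, 1696, 5569, 124, 8283, 5886],
         [2329, 4135, 8307, 3736, 1004, 9991],
         [2348, 1451, 3522, 2883, 3047, 6650],
         [4047, 8828, 8732, 5743, 1091, 381]]
    ALPHA = [1.0, 1.2, 3.0, 3.2]

    # sanity check against the documented optimum:
    z = torch.tensor([[0.20169, 0.150011, 0.476874, 0.275332, 0.311652, 0.6573]])
    print(neg_hartmann6(z))  # ~3.32237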
Example #26
def _mmd2(K_XX, K_XY, K_YY, const_diagonal=False, biased=False):
    m = K_XX.size(0)    # assume X, Y are same shape

    # Get the various sums of kernels that we'll use
    # Kts drop the diagonal, but we don't need to compute them explicitly
    if const_diagonal is not False:
        diag_X = diag_Y = const_diagonal
        sum_diag_X = sum_diag_Y = m * const_diagonal
    else:
        diag_X = torch.diag(K_XX)                       # (m,)
        diag_Y = torch.diag(K_YY)                       # (m,)
        sum_diag_X = torch.sum(diag_X)
        sum_diag_Y = torch.sum(diag_Y)

    Kt_XX_sums = K_XX.sum(dim=1) - diag_X             # \tilde{K}_XX * e = K_XX * e - diag_X
    Kt_YY_sums = K_YY.sum(dim=1) - diag_Y             # \tilde{K}_YY * e = K_YY * e - diag_Y
    K_XY_sums_0 = K_XY.sum(dim=0)                     # K_{XY}^T * e

    Kt_XX_sum = Kt_XX_sums.sum()                       # e^T * \tilde{K}_XX * e
    Kt_YY_sum = Kt_YY_sums.sum()                       # e^T * \tilde{K}_YY * e
    K_XY_sum = K_XY_sums_0.sum()                       # e^T * K_{XY} * e

    if biased:
        mmd2 = ((Kt_XX_sum + sum_diag_X) / (m * m)
            + (Kt_YY_sum + sum_diag_Y) / (m * m)
            - 2.0 * K_XY_sum / (m * m))
    else:
        mmd2 = (Kt_XX_sum / (m * (m - 1))
            + Kt_YY_sum / (m * (m - 1))
            - 2.0 * K_XY_sum / (m * m))

    return mmd2
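Minimal usage sketch with a Gaussian RBF kernel (bandwidth chosen arbitrarily):

    import torch

    def rbf_kernel(a, b, sigma=1.0):
        # k(x, y) = exp(-||x - y||^2 / (2 * sigma^2))
        return torch.exp(-torch.cdist(a, b) ** 2 / (2 * sigma ** 2))

    X = torch.randn(64, 10)
    Y = torch.randn(64, 10) + 0.5
    mmd2 = _mmd2(rbf_kernel(X, X), rbf_kernel(X, Y), rbf_kernel(Y, Y))
    print(mmd2.item())  # note: the unbiased estimate can be slightly negative for close samples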
Example #27
    def accGradParameters(self, input, gradOutput, scale=1):
        assert input.dim() == 2
        inputSize = self.weight.size(1)
        outputSize = self.weight.size(0)

        """
        dy_j            x_i                     w_ji
        ----- = -------------------  -  y_j -----------
        dw_ji   || w_j || * || x ||         || w_j ||^2
        """

        if self._weight is None:
            self._weight = self.weight.new()
        if self._sum is None:
            self._sum = input.new()

        self._weight.resize_as_(self.weight).copy_(self.weight)
        if self._gradOutput is None:
            self._gradOutput = gradOutput.new()
        self._gradOutput.resize_as_(gradOutput).copy_(gradOutput)
        self._gradOutput.mul_(self.output)
        torch.sum(self._gradOutput, 0, out=self._sum, keepdim=True)
        grad = self._sum[0]
        grad.div_(self._weightNorm.select(1, 0))
        self._weight.mul_(grad.view(outputSize, 1).expand_as(self._weight))

        input_ = self._gradOutput
        input_.resize_as_(input).copy_(input)
        input_.div_(self._inputNorm.expand_as(input))
        self._weight.addmm_(-1, 1, gradOutput.t(), input_)

        self._weight.div_(self._weightNorm.expand_as(self._weight))
        self.gradWeight.add_(self._weight)
Example #28
    def forward_flow(self, z, xenc):

        B = z.shape[0]
        C = z.shape[1]
        f = self.flows
        logdet = 0.
        for i in range(self.n_flows):
            z = z[:,f[str(i)]['perm']]
            z1 = z[:,:C//2]
            z2 = z[:,C//2:]

            sig2 = torch.sigmoid(f[str(i)]['f1_sig'](torch.cat([z2,xenc],1)))
            mu2 = f[str(i)]['f1_mu'](torch.cat([z2,xenc],1))

            z1 = z1*sig2 + mu2

            mu1 = f[str(i)]['f2_mu'](torch.cat([z1,xenc],1))
            sig1 = torch.sigmoid(f[str(i)]['f2_sig'](torch.cat([z1,xenc],1)))

            z2 = z2*sig1 + mu1
            z = torch.cat([z1,z2],1)

            sig1 = sig1.view(B, -1)
            sig2 = sig2.view(B, -1)
            logdet += torch.sum(torch.log(sig1), 1)
            logdet += torch.sum(torch.log(sig2), 1)

        return z, logdet
Example #29
    def _get_state_cost(self, worlds: List[WikiTablesWorld], state: CoverageState) -> torch.Tensor:
        if not state.is_finished():
            raise RuntimeError("_get_state_cost() is not defined for unfinished states!")
        world = worlds[state.batch_indices[0]]

        # Our checklist cost is a sum of squared error from where we want to be, making sure we
        # take into account the mask. We clamp the lower limit of the balance at 0 to avoid
        # penalizing agenda actions produced multiple times.
        checklist_balance = torch.clamp(state.checklist_state[0].get_balance(), min=0.0)
        checklist_cost = torch.sum((checklist_balance) ** 2)

        # This is the number of items on the agenda that we want to see in the decoded sequence.
        # We use this as the denotation cost if the path is incorrect.
        denotation_cost = torch.sum(state.checklist_state[0].checklist_target.float())
        checklist_cost = self._checklist_cost_weight * checklist_cost
        action_history = state.action_history[0]
        batch_index = state.batch_indices[0]
        action_strings = [state.possible_actions[batch_index][i][0] for i in action_history]
        logical_form = world.get_logical_form(action_strings)
        lisp_string = state.extras[batch_index]
        if self._executor.evaluate_logical_form(logical_form, lisp_string):
            cost = checklist_cost
        else:
            cost = checklist_cost + (1 - self._checklist_cost_weight) * denotation_cost
        return cost
Example #30
def project_to_2d(X, camera_params):
    """
    Project 3D points to 2D using the Human3.6M camera projection function.
    This is a differentiable and batched reimplementation of the original MATLAB script.
    
    Arguments:
    X -- 3D points in *camera space* to transform (N, *, 3)
    camera_params -- intrinsic parameters (N, 2+2+3+2=9)
    """
    assert X.shape[-1] == 3
    assert len(camera_params.shape) == 2
    assert camera_params.shape[-1] == 9
    assert X.shape[0] == camera_params.shape[0]
    
    while len(camera_params.shape) < len(X.shape):
        camera_params = camera_params.unsqueeze(1)
        
    f = camera_params[..., :2]
    c = camera_params[..., 2:4]
    k = camera_params[..., 4:7]
    p = camera_params[..., 7:]
    
    XX = torch.clamp(X[..., :2] / X[..., 2:], min=-1, max=1)
    r2 = torch.sum(XX[..., :2]**2, dim=len(XX.shape)-1, keepdim=True)

    radial = 1 + torch.sum(k * torch.cat((r2, r2**2, r2**3), dim=len(r2.shape)-1), dim=len(r2.shape)-1, keepdim=True)
    tan = torch.sum(p*XX, dim=len(XX.shape)-1, keepdim=True)

    XXX = XX*(radial + tan) + p*r2
    
    return f*XXX + c
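A fabricated call (2 poses of 17 joints, positive depth, random intrinsics):

    import torch

    X = torch.randn(2, 17, 3)
    X[..., 2] = X[..., 2].abs() + 1.0      # keep points in front of the camera
    camera_params = torch.randn(2, 9)
    print(project_to_2d(X, camera_params).shape)  # (2, 17, 2)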
Example #31
def train_model(model,
                criterion,
                optimizer,
                lr_scheduler,
                lr,
                dset_loaders,
                dset_sizes,
                use_gpu,
                num_epochs,
                exp_dir='./',
                resume=''):
    print('dictionary length ' + str(len(dset_loaders)))
    since = time.time()

    best_model = model
    best_acc = 0.0
    if os.path.isfile(resume):
        print("=> loading checkpoint '{}'".format(resume))
        checkpoint = torch.load(resume)
        start_epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['state_dict'])
        print('load')
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' (epoch {})".format(
            resume, checkpoint['epoch']))
    else:
        start_epoch = 0
        print("=> no checkpoint found at '{}'".format(resume))

    print(str(start_epoch))
    for epoch in range(start_epoch, num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                optimizer = lr_scheduler(optimizer, epoch, lr)
                model.train(True)  # Set model to training mode
            else:
                model.train(False)  # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for data in dset_loaders[phase]:
                # get the inputs
                inputs, labels = data
                inputs = inputs.squeeze()
                # wrap them in Variable
                if use_gpu:
                    inputs, labels = Variable(inputs.cuda()), \
                        Variable(labels.cuda())
                else:
                    inputs, labels = Variable(inputs), Variable(labels)

                # zero the parameter gradients
                optimizer.zero_grad()
                model.zero_grad()
                # forward
                outputs = model(inputs)
                _, preds = torch.max(outputs.data, 1)
                loss = criterion(outputs, labels)

                # backward + optimize only if in training phase
                if phase == 'train':
                    loss.backward()
                    optimizer.step()

                # statistics
                running_loss += loss.item()
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dset_sizes[phase]
            epoch_acc = running_corrects / dset_sizes[phase]

            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss,
                                                       epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                del outputs
                del labels
                del inputs
                del loss
                del preds
                best_acc = epoch_acc
                torch.save(model, os.path.join(exp_dir, 'best_model.pth.tar'))

        epoch_file_name = exp_dir + '/' + 'epoch' + '.pth.tar'
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'epoch_acc': epoch_acc,
                'arch': 'alexnet',
                'model': model,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }, epoch_file_name)
        print()

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    return model
Example #32
    def _optimize(self, model, optimizer, inputs_tanh_var, pert_tanh_var,
                  targets_oh_var, c_var):
        """
        Optimize for one step.

        :param model: the model to attack
        :type model: nn.Module
        :param optimizer: the Adam optimizer to optimize ``modifier_var``
        :type optimizer: optim.Adam
        :param inputs_tanh_var: the input images in tanh-space
        :type inputs_tanh_var: Variable
        :param pert_tanh_var: the perturbation to optimize in tanh-space,
               ``pert_tanh_var.requires_grad`` flag must be set to True
        :type pert_tanh_var: Variable
        :param targets_oh_var: the one-hot encoded target tensor (the attack
               targets if self.targeted else image labels)
        :type targets_oh_var: Variable
        :param c_var: the constant :math:`c` for each perturbation of a batch,
               a Variable of FloatTensor of dimension [B]
        :type c_var: Variable
        :return: the batch loss, squared L2-norm of adversarial perturbations
                 (of dimension [B]), the perturbed activations (of dimension
                 [B]), the adversarial examples (of dimension [B x C x H x W])
        """
        # the adversarial examples in the image space
        # of dimension [B x C x H x W]
        advxs_var = self._from_tanh_space(inputs_tanh_var +
                                          pert_tanh_var)  # type: Variable
        # the perturbed activation before softmax
        pert_outputs_var = model(advxs_var)  # type: Variable
        # the original inputs
        inputs_var = self._from_tanh_space(inputs_tanh_var)  # type: Variable

        perts_norm_var = torch.pow(advxs_var - inputs_var, 2)
        perts_norm_var = torch.sum(
            perts_norm_var.view(perts_norm_var.size(0), -1), 1)

        # In Carlini's code, `target_activ_var` is called `real`.
        # It should be a Variable of tensor of dimension [B], such that the
        # `target_activ_var[i]` is the final activation (right before softmax)
        # of the $t$th class, where $t$ is the attack target or the image label
        #
        # noinspection PyArgumentList
        target_activ_var = torch.sum(targets_oh_var * pert_outputs_var, 1)
        inf = 1e4  # sadly pytorch does not work with np.inf;
        # 1e4 is also used in Carlini's code
        # In Carlini's code, `maxother_activ_var` is called `other`.
        # It should be a Variable of tensor of dimension [B], such that the
        # `maxother_activ_var[i]` is the maximum final activation of all classes
        # other than class $t$, where $t$ is the attack target or the image
        # label.
        #
        # The assertion here ensures (sufficiently yet not necessarily) the
        # assumption behind the trick to get `maxother_activ_var` holds, that
        # $\max_{i \ne t}{o_i} \ge -\text{_inf}$, where $t$ is the target and
        # $o_i$ the $i$th element along axis=1 of `pert_outputs_var`.
        #
        # noinspection PyArgumentList
        assert (pert_outputs_var.max(1)[0] >= -inf).all(), 'assumption failed'
        # noinspection PyArgumentList
        maxother_activ_var = torch.max(
            ((1 - targets_oh_var) * pert_outputs_var - targets_oh_var * inf),
            1)[0]

        # Compute $f(x')$, where $x'$ is the adversarial example in image space.
        # The result `f_var` should be of dimension [B]
        if self.targeted:
            # if targeted, optimize to make `target_activ_var` larger than
            # `maxother_activ_var` by `self.confidence`
            #
            # noinspection PyArgumentList
            f_var = torch.clamp(maxother_activ_var - target_activ_var +
                                self.confidence,
                                min=0.0)
        else:
            # if not targeted, optimize to make `maxother_activ_var` larger than
            # `target_activ_var` (the ground truth image labels) by
            # `self.confidence`
            #
            # noinspection PyArgumentList
            f_var = torch.clamp(target_activ_var - maxother_activ_var +
                                self.confidence,
                                min=0.0)
        # the total loss of current batch, should be of dimension [1]
        batch_loss_var = torch.sum(perts_norm_var +
                                   c_var * f_var)  # type: Variable

        # Do optimization for one step
        optimizer.zero_grad()
        batch_loss_var.backward()
        optimizer.step()

        # Make some records in python/numpy on CPU
        batch_loss = batch_loss_var.item()  # type: float
        pert_norms_np = _var2numpy(perts_norm_var)
        pert_outputs_np = _var2numpy(pert_outputs_var)
        advxs_np = _var2numpy(advxs_var)
        return batch_loss, pert_norms_np, pert_outputs_np, advxs_np
Example #33
    def concat_score(self, hidden, encoder_output):
        energy = self.attn(torch.cat((hidden.expand(encoder_output.size(0), -1, -1), encoder_output), 2)).tanh()
        return torch.sum(self.v * energy, dim=2)
Example #34
    def general_score(self, hidden, encoder_output):
        energy = self.attn(encoder_output)
        return torch.sum(hidden * energy, dim=2)
Example #35
    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)
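These three scoring functions follow Luong-style attention; a sketch tying one of them into attention weights (shapes assumed from that setting: hidden is (1, batch, hidden_size), encoder_output is (max_len, batch, hidden_size)):

    import torch
    import torch.nn.functional as F

    hidden = torch.randn(1, 4, 32)
    encoder_output = torch.randn(10, 4, 32)

    energies = torch.sum(hidden * encoder_output, dim=2)        # dot score: (max_len, batch)
    attn_weights = F.softmax(energies.t(), dim=1).unsqueeze(1)  # (batch, 1, max_len)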
Example #36
def train_model(device, model, dataloaders, dataset_sizes, 
                criterion=None, optimizer=None, scheduler=None, 
                num_epochs=100, checkpoints=10, output_dir='output', 
                status=1, train_acc=0, track_steps=False,
                seed=414921):
    ''' Helper function to train PyTorch model based on parameters '''
    # pylint: disable=no-member
    # (VS Code's pylint complains about torch.sum() and torch.max())

    # create the model directory if it doesn't exist
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)

    # configure the training if it was not specified by user
    if not criterion:
        criterion = nn.CrossEntropyLoss()
    if not optimizer:
        optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
    if not scheduler:
        scheduler = lr_scheduler.StepLR(optimizer, step_size=100, gamma=0.1)

    # send the model to the device
    model = model.to(device)
    
    since = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    metrics = []
    step_metrics = [] # if track_steps=True
    training_step = 0
    acc_reached = False
    for epoch in range(num_epochs):
        epoch_start_time = time.time()
        if (epoch) % status == 0 or epoch == num_epochs-1:
            print()
            print(f'Epoch {epoch}/{num_epochs - 1}')
            print('-' * 10)
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                model.eval()
            epoch_phase_start_time = time.time()
            running_loss = 0.0
            running_corrects = 0
            for inputs, labels in dataloaders[phase]:
                step_start_time = time.time()
                inputs = inputs.to(device)
                labels = labels.to(device)
                optimizer.zero_grad()
                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                        if track_steps:
                            # store per step metrics (WARNING! lots of data)
                            step_metrics.append({
                                'device': str(device),
                                'epoch': epoch,
                                'training_step': training_step,
                                'training_step_loss': loss.item(),
                                'training_step_time': time.time() - step_start_time
                            })
                        training_step += 1
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            if phase == 'train':
                scheduler.step()
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            epoch_phase_end_time = time.time()            
            
            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc.item()
                best_model_wts = copy.deepcopy(model.state_dict())
            
            # check if training accuracy has met target, if so signal exit
            if (train_acc > 0) and (epoch_acc.item() >= train_acc) and phase == 'train':
                acc_reached = True
                print()
                print(f'Epoch {epoch}/{num_epochs - 1}')
                print('-' * 10)
                
            if (epoch) % status == 0 or epoch == num_epochs-1 or acc_reached:
                print(f'{phase} Loss: {round(epoch_loss, 4)} Acc: {round(epoch_acc.item(), 4)}')
            else:
                prog = '-' * int(((epoch) % status))
                print('\r{}|{}'.format(prog,epoch),end='')
                
            # store per epoch metrics
            if phase == 'val':
                validation_time = time.time() - epoch_start_time
                avg_val_loss = loss.item()
                avg_val_acc = epoch_acc.item()
                
            else:
                training_time = time.time() - epoch_start_time
                avg_train_loss = loss.item()
                avg_train_acc = epoch_acc.item()
                
        metrics.append({
                        'device': str(device),
                        'epoch': epoch,
                        'average_training_loss': avg_train_loss, 
                        'average_validation_loss': avg_val_loss,
                        'training_acc': avg_train_acc,
                        'validation_acc': avg_val_acc,
                        'training_time': training_time,
                        'validation_time': validation_time
                    })

        ####### save checkpoint after epoch
        if (epoch > 0 and epoch != num_epochs-1) and \
            ((epoch+1) % checkpoints == 0 and os.path.isdir(output_dir)):
            checkpoint=os.path.join(output_dir,
                                f'epoch{epoch+1}_checkpoint_model.th')
            torch.save({
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'best_acc': best_acc,
            }, checkpoint)
            # dump the data for later
            json_file = os.path.join(output_dir,
                                    f'epoch{epoch+1}_checkpoint_metrics.json')
            with open(json_file, 'w') as fp:
                json.dump(metrics, fp)
        #######
        
        # if the target accuracy was reached during this epoch, it is time to exit
        if acc_reached: 
            break
    
    ####### save final checkpoint
    if os.path.isdir(output_dir):
        timestamp = time.strftime("%Y-%m-%dT%H%M%S")
        checkpoint= os.path.join(output_dir, f'final_model_{timestamp}.th')
        # save the model
        torch.save({
            'state_dict': model.state_dict(),
            'best_acc': best_acc,
        }, checkpoint)
        # dump the data for later
        metric_path = os.path.join(output_dir,f'final_metrics_{timestamp}.json')
        with open(metric_path, 'w') as fp:
            json.dump(metrics, fp)
    #######
    
    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60}m {time_elapsed % 60}s')
    print(f'Best val Acc: {round(best_acc, 4)}')
    # load best model weights
    model.load_state_dict(best_model_wts)
    # set up return structures
    metrics_df = pd.DataFrame(data=metrics)
    step_metrics_df = pd.DataFrame(data=step_metrics) if step_metrics else None
        
    return model, metrics_df, step_metrics_df
Example #37
def active_learning_taylor(func_name,start_rand_idxs=None, bud=None, valid=True,fac_loc_idx=None):
    
    torch.manual_seed(42)
    torch.cuda.manual_seed(42)
    np.random.seed(42)
    random.seed(42)
    torch.backends.cudnn.deterministic = True
    model = TwoLayerNet(M, num_cls, 100)
    if torch.cuda.device_count() > 1:
        print("Using:", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)
        cudnn.benchmark = True

    model = model.to(device)

    idxs = start_rand_idxs

    if func_name == 'Facloc Regularized':
        x_val1 = torch.cat([x_val, x_trn[fac_loc_idx]], dim=0)
        y_val1 = torch.cat([y_val, y_trn[fac_loc_idx]], dim=0)

    criterion = nn.CrossEntropyLoss()
    criterion_nored = nn.CrossEntropyLoss(reduction='none')
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    if func_name == 'Full OneStep':
        setf_model = SetFunctionBatch(x_val, y_val, model, criterion, criterion_nored, learning_rate, device)

    elif func_name == 'Facility Location':
        if data_name != 'covertype':
            setf_model = SetFunctionFacLoc(device, train_batch_size_for_greedy)
            idxs = setf_model.lazy_greedy_max(bud, x_trn,model)
        else:
            idxs = run_stochastic_Facloc(x_trn, y_trn, bud)

        facility_loaction_warm_start = copy.deepcopy(idxs)


    elif func_name == 'Facloc Regularized':
        setf_model = SetFunctionTaylor(x_val1, y_val1, model, criterion, criterion_nored, learning_rate, device,num_cls)

    else:
        setf_model = SetFunctionTaylor(x_val, y_val, model, criterion, criterion_nored, learning_rate, device, num_cls)

    remainList = set(list(range(N)))
    idxs = list(idxs)
    remainList = remainList.difference(idxs)

    if func_name == 'Taylor Online':
        print("Starting Online OneStep Run with taylor on loss!")
    elif func_name == 'Full OneStep':
        print("Starting Online OneStep Run without taylor!")
    elif func_name == 'Facloc Regularized':
        print("Starting Facility Location Regularized Online OneStep Run with taylor!")
    elif func_name == 'Random Greedy':
        print("Starting Randomized Greedy Online OneStep Run with taylor!")
    elif func_name == 'Facility Location':
         print("Starting Facility Location!")
    elif func_name == 'Random':
        print("Starting Random Run!")
    elif func_name == 'Random Perturbation':
        print("Starting Online OneStep Run with taylor with random perturbation!")
    elif func_name == "FASS":
        print("Filtered Active Submodular Selection(FASS)!")
    
    

    val_accies = np.zeros(no_select)
    test_accies = np.zeros(no_select)
    unlab_accies = np.zeros(no_select)

    def weight_reset(m):
        torch.manual_seed(42)
        torch.cuda.manual_seed(42)
        np.random.seed(42)
        random.seed(42)
        torch.backends.cudnn.deterministic = True
        if isinstance(m, nn.Linear):
            #m.reset_parameters()
            m.weight.data.normal_(0.0, 0.02)
            m.bias.data.fill_(0)

    model = model.apply(weight_reset).cuda()
    for n in range(no_select):
        loader_tr = DataLoader(CustomDataset_act(x_trn[idxs], y_trn[idxs], transform=None),batch_size=no_points)
        model.train()
        for i in range(num_epochs):

            accFinal = 0. 
            for batch_idx in list(loader_tr.batch_sampler):
                x, y, idxs = loader_tr.dataset[batch_idx]

                x, y = Variable(x.cuda()), Variable(y.cuda())
                optimizer.zero_grad()
                out = model(x)
                loss = F.cross_entropy(out, y)
                accFinal += torch.sum((torch.max(out,1)[1] == y).float()).data.item()
                loss.backward()

                if (i % 50 == 0) and (accFinal < 0.2): # reset if not converging
                    model = model.apply(weight_reset).cuda()
                    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

                # clamp gradients, just in case
                for p in filter(lambda p: p.grad is not None, model.parameters()):
                    p.grad.data.clamp_(min=-.1, max=.1)

                optimizer.step()

        print(n + 1, 'SubsetTrn', loss.item())

        curr_X_trn = x_trn[list(remainList)]
        curr_Y_trn = y_trn[list(remainList)]

        model.eval()
        with torch.no_grad():
            '''full_trn_out = model(x_trn)
            full_trn_loss = criterion(full_trn_out, y_trn).mean()
            sub_trn_out = model(x_trn[idxs])
            sub_trn_loss = criterion(sub_trn_out, y_trn[idxs]).mean()'''
            val_out = model(x_val)
            val_loss = criterion(val_out, y_val)
            _, val_predict = val_out.max(1)
            val_correct = val_predict.eq(y_val).sum().item()
            val_total = y_val.size(0)
            val_acc = 100 * val_correct / val_total

            correct = 0
            total = 0
            
            inputs, targets = x_tst.to(device), y_tst.to(device)
            outputs = model(inputs)
            test_loss = criterion(outputs, targets)
            _, predicted = outputs.max(1)
            total += targets.size(0)
            correct += predicted.eq(targets).sum().item()
            tst_acc = 100.0 * correct / total

            rem_out = model(curr_X_trn)
            rem_loss = criterion(rem_out, curr_Y_trn)
            _, rem_predict = rem_out.max(1)
            rem_correct = rem_predict.eq(curr_Y_trn).sum().item()
            rem_total = curr_Y_trn.size(0)
            rem_acc = 100 * rem_correct / rem_total

        val_accies[n] = val_acc
        test_accies[n] = tst_acc
        unlab_accies[n] = rem_acc

        #if ((i + 1) % select_every == 0) and func_name not in ['Facility Location','Random']:
            # val_in, val_t = x_val.to(device), y_val.to(device)  # Transfer them to device
        cached_state_dict = copy.deepcopy(model.state_dict())
        clone_dict = copy.deepcopy(model.state_dict())
        # Dont put the logs for Selection on logfile!!
        # print("With Taylor approximation",file=logfile)
        # print("selEpoch: %d, Starting Selection:" % i, str(datetime.datetime.now()),file=logfile)
        #t_ng_start = time.time()

        if func_name == 'Random Greedy':
            new_idxs = setf_model.naive_greedy_max(curr_X_trn,rem_predict,int(0.9 * no_points), clone_dict)
            new_idxs = list(np.array(list(remainList))[new_idxs])
            
            remainList = remainList.difference(new_idxs)
            new_idxs.extend(list(np.random.choice(list(remainList), size=int(0.1 * no_points), replace=False)))
            remainList = remainList.difference(new_idxs)
            idxs.extend(new_idxs)

        elif func_name == "FASS":

            fn = nn.Softmax(dim=1)
            soft = fn(rem_out)

            entropy2 = Categorical(probs = soft).entropy()

            #print(entropy2.shape)
            if 5 * no_points < entropy2.shape[0]:
                values, indices = entropy2.topk(5 * no_points)
            else:
                # keep every remaining point; use a tensor so .cpu() below works in both branches
                indices = torch.arange(entropy2.shape[0], device=entropy2.device)

            knn_idxs_flag_val = perform_knnsb_selection(datadir, data_name, curr_X_trn[indices],rem_predict[indices], 
                fraction, selUsing='val') 
            #print(knn_idxs_flag_val)
            #print(len(knn_idxs_flag_val))

            ##print(len(knn_idxs_flag_val),len(indices))
            knn_idxs_flag_val = list(np.array(list(remainList))[indices.cpu()][knn_idxs_flag_val])

            remainList = remainList.difference(knn_idxs_flag_val)
            idxs.extend(knn_idxs_flag_val)

        elif func_name == 'Random':
            state = np.random.get_state()
            np.random.seed(n*n)
            #new_idxs = gen_rand_prior_indices(list(remainList), size=no_points)
            new_idxs = np.random.choice(list(remainList), size=no_points, replace=False)
            np.random.set_state(state)
            remainList = remainList.difference(new_idxs)
            idxs.extend(new_idxs)


        elif func_name == 'Random Perturbation':
            new_idxs = setf_model.naive_greedy_max(curr_X_trn,rem_predict,no_points, clone_dict,None,True)  # , grads_idxs
            new_idxs = np.array(list(remainList))[new_idxs]

            remainList = remainList.difference(new_idxs)
            idxs.extend(new_idxs) 

        elif func_name == 'Facility Location':

            if data_name == 'covertype':
                new_idxs = run_stochastic_Facloc(curr_X_trn, rem_predict, bud)
            else:
                new_idxs = setf_model.lazy_greedy_max(bud, curr_X_trn ,model)
            new_idxs = np.array(list(remainList))[new_idxs]

            remainList = remainList.difference(new_idxs)
            idxs.extend(new_idxs)

        else: 
            new_idxs = setf_model.naive_greedy_max(curr_X_trn,rem_predict,no_points, clone_dict)  # , grads_idxs
            new_idxs = np.array(list(remainList))[new_idxs]

            remainList = remainList.difference(new_idxs)
            idxs.extend(new_idxs) 

        '''elif func_name == 'Proximal':
            previous = torch.zeros(N,device=device)
            previous[idxs] = 1.0 
            new_idxs = setf_model.naive_greedy_max(bud, clone_dict,None,previous)
            idxs = new_idxs'''

        # print("selEpoch: %d, Selection Ended at:" % (i), str(datetime.datetime.now()),file=logfile)
        # print("Naive greedy total time with taylor:", time.time()-t_ng_start,file=logfile)
        model.load_state_dict(cached_state_dict)

    # Calculate Final SubsetTrn, FullTrn, Val and Test Loss
    # Calculate Val and Test Accuracy
    
    if func_name == 'Facility Location':
        return val_accies, test_accies, unlab_accies, idxs,facility_loaction_warm_start
    else:
        return val_accies, test_accies, unlab_accies, idxs
    def forward(self, input):
        # anisotropic total-variation penalty: L1 norm of horizontal and vertical pixel differences
        self.x_diff = input[:, :, 1:, :] - input[:, :, :-1, :]
        self.y_diff = input[:, :, :, 1:] - input[:, :, :, :-1]
        self.loss = self.strength * (torch.sum(torch.abs(self.x_diff)) + torch.sum(torch.abs(self.y_diff)))
        return input
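A minimal self-contained module around this forward, assuming the surrounding class only stores a strength coefficient (the class name TVLoss and its constructor are assumptions, not from the source):

import torch
import torch.nn as nn

class TVLoss(nn.Module):
    # records the total-variation penalty as a side effect and passes the input
    # through unchanged, so it can be spliced into a feature pipeline
    def __init__(self, strength=1e-4):
        super(TVLoss, self).__init__()
        self.strength = strength
        self.loss = None

    def forward(self, input):
        x_diff = input[:, :, 1:, :] - input[:, :, :-1, :]
        y_diff = input[:, :, :, 1:] - input[:, :, :, :-1]
        self.loss = self.strength * (torch.sum(torch.abs(x_diff)) + torch.sum(torch.abs(y_diff)))
        return input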
def main(args):
    # load graph data
    if args.dataset == 'aifb':
        dataset = AIFBDataset()
    elif args.dataset == 'mutag':
        dataset = MUTAGDataset()
    elif args.dataset == 'bgs':
        dataset = BGSDataset()
    elif args.dataset == 'am':
        dataset = AMDataset()
    else:
        raise ValueError('Unknown dataset: {}'.format(args.dataset))

    g = dataset[0]
    category = dataset.predict_category
    num_classes = dataset.num_classes
    train_mask = g.nodes[category].data.pop('train_mask')
    test_mask = g.nodes[category].data.pop('test_mask')
    train_idx = th.nonzero(train_mask).squeeze()
    test_idx = th.nonzero(test_mask).squeeze()
    labels = g.nodes[category].data.pop('labels')

    # split dataset into train, validate, test
    if args.validation:
        val_idx = train_idx[:len(train_idx) // 5]
        train_idx = train_idx[len(train_idx) // 5:]
    else:
        val_idx = train_idx

    # check cuda
    device = 'cpu'
    use_cuda = args.gpu >= 0 and th.cuda.is_available()
    if use_cuda:
        th.cuda.set_device(args.gpu)
        device = 'cuda:%d' % args.gpu

    train_label = labels[train_idx]
    val_label = labels[val_idx]
    test_label = labels[test_idx]

    # create embeddings
    embed_layer = RelGraphEmbed(g, args.n_hidden)
    node_embed = embed_layer()
    # create model
    model = EntityClassify(g,
                           args.n_hidden,
                           num_classes,
                           num_bases=args.n_bases,
                           num_hidden_layers=args.n_layers - 2,
                           dropout=args.dropout,
                           use_self_loop=args.use_self_loop)

    if use_cuda:
        model.cuda()

    # train sampler
    sampler = dgl.dataloading.MultiLayerNeighborSampler([args.fanout] * args.n_layers)
    loader = dgl.dataloading.NodeDataLoader(
        g, {category: train_idx}, sampler,
        batch_size=args.batch_size, shuffle=True, num_workers=0)

    # validation sampler
    # we do not use full neighbor to save computation resources
    val_sampler = dgl.dataloading.MultiLayerNeighborSampler([args.fanout] * args.n_layers)
    val_loader = dgl.dataloading.NodeDataLoader(
        g, {category: val_idx}, val_sampler,
        batch_size=args.batch_size, shuffle=True, num_workers=0)

    # optimizer
    all_params = itertools.chain(model.parameters(), embed_layer.parameters())
    optimizer = th.optim.Adam(all_params, lr=args.lr, weight_decay=args.l2norm)

    # training loop
    print("start training...")
    dur = []
    for epoch in range(args.n_epochs):
        model.train()
        optimizer.zero_grad()
        if epoch > 3:
            t0 = time.time()

        for i, (input_nodes, seeds, blocks) in enumerate(loader):
            blocks = [blk.to(device) for blk in blocks]
            seeds = seeds[category]     # we only predict the nodes with type "category"
            batch_tic = time.time()
            emb = extract_embed(node_embed, input_nodes)
            lbl = labels[seeds]
            if use_cuda:
                emb = {k : e.cuda() for k, e in emb.items()}
                lbl = lbl.cuda()
            logits = model(emb, blocks)[category]
            loss = F.cross_entropy(logits, lbl)
            loss.backward()
            optimizer.step()

            train_acc = th.sum(logits.argmax(dim=1) == lbl).item() / len(seeds)
            print("Epoch {:05d} | Batch {:03d} | Train Acc: {:.4f} | Train Loss: {:.4f} | Time: {:.4f}".
                  format(epoch, i, train_acc, loss.item(), time.time() - batch_tic))

        if epoch > 3:
            dur.append(time.time() - t0)

        val_loss, val_acc = evaluate(model, val_loader, node_embed, labels, category, device)
        print("Epoch {:05d} | Valid Acc: {:.4f} | Valid loss: {:.4f} | Time: {:.4f}".
              format(epoch, val_acc, val_loss, np.average(dur)))
    print()
    if args.model_path is not None:
        th.save(model.state_dict(), args.model_path)

    output = model.inference(
        g, args.batch_size, 'cuda' if use_cuda else 'cpu', 0, node_embed)
    test_pred = output[category][test_idx]
    test_labels = labels[test_idx]
    test_acc = (test_pred.argmax(1) == test_labels).float().mean()
    print("Test Acc: {:.4f}".format(test_acc))
    print()
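main calls an evaluate helper that is not shown in this excerpt. A hedged sketch consistent with the call site and the mini-batch training path above (the body is an assumption; it reuses th, F and extract_embed from this example):

def evaluate(model, val_loader, node_embed, labels, category, device):
    # average cross-entropy loss and accuracy over the validation loader
    model.eval()
    total_loss, total_correct, total_seeds = 0.0, 0, 0
    with th.no_grad():
        for input_nodes, seeds, blocks in val_loader:
            blocks = [blk.to(device) for blk in blocks]
            seeds = seeds[category]
            emb = extract_embed(node_embed, input_nodes)
            lbl = labels[seeds]
            emb = {k: e.to(device) for k, e in emb.items()}
            lbl = lbl.to(device)
            logits = model(emb, blocks)[category]
            total_loss += F.cross_entropy(logits, lbl, reduction='sum').item()
            total_correct += (logits.argmax(dim=1) == lbl).sum().item()
            total_seeds += len(seeds)
    return total_loss / total_seeds, total_correct / total_seeds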
Example #40
0
    def forward(self, query, key, value, attention_mask):

        attention_mask = (attention_mask == 0).float().to(key.device).squeeze()

        length = torch.sum(attention_mask, dim=1)

        # attention_mask = attention_mask[:,:,None,None].repeat((1,1,key.size()[-2], key.size()[-1]))

        key = key * attention_mask[:, :, None, None].repeat(
            (1, 1, key.size()[-2], key.size()[-1]))
        key_sent = torch.sum(key, dim=1) / length[:, None, None].repeat(
            1,
            key.size()[-2],
            key.size()[-1])

        if (self.config.adapter_fusion["query"]
                and not self.config.adapter_fusion["key"]
                and not self.config.adapter_fusion["value"]):
            query = query * attention_mask[:, :, None].repeat(
                (1, 1, query.size()[-1]))
            query_sent = torch.sum(query, dim=1) / length[:, None].repeat(
                1,
                query.size()[-1])
            query_enc = self.query(query_sent)
            scores_t = torch.matmul(key_sent, query_enc[:, :,
                                                        None]).squeeze(-1)
            probs = nn.Softmax(dim=-1)(scores_t / self.T)

            # result = torch.squeeze(torch.matmul(probs, value), dim=2)
            result = torch.squeeze(torch.matmul(probs[:, None, None, :],
                                                value))
        #     {'MR': {'devacc': 77.53, 'acc': 76.7, 'ndev': 9596, 'ntest': 9596}}
        if (self.config.adapter_fusion["query"]
                and self.config.adapter_fusion["key"]
                and not self.config.adapter_fusion["value"]):
            query = query * attention_mask[:, :, None].repeat(
                (1, 1, query.size()[-1]))
            query_sent = torch.sum(query, dim=1) / length[:, None].repeat(
                1,
                query.size()[-1])
            query_enc = self.query(query_sent)
            key_enc = self.key(key_sent)
            scores_t = torch.matmul(key_enc, query_enc[:, :, None]).squeeze(-1)
            probs = nn.Softmax(dim=-1)(scores_t / self.T)

            # result = torch.squeeze(torch.matmul(probs, value), dim=2)
            result = torch.squeeze(torch.matmul(probs[:, None, None, :],
                                                value))

        if (self.config.adapter_fusion["query"]
                and self.config.adapter_fusion["key"]
                and self.config.adapter_fusion["value"]):
            query = query * attention_mask[:, :, None].repeat(
                (1, 1, query.size()[-1]))
            query_sent = torch.sum(query, dim=1) / length[:, None].repeat(
                1,
                query.size()[-1])
            query_enc = self.query(query_sent)
            key_enc = self.key(key_sent)
            value_enc = self.value(value)
            scores_t = torch.matmul(key_enc, query_enc[:, :, None]).squeeze(-1)
            probs = nn.Softmax(dim=-1)(scores_t / self.T)

            # result = torch.squeeze(torch.matmul(probs, value), dim=2)
            result = torch.squeeze(
                torch.matmul(probs[:, None, None, :], value_enc))

        if (not self.config.adapter_fusion["query"]
                and not self.config.adapter_fusion["key"]
                and not self.config.adapter_fusion["value"]):
            # key_sent = torch.mean(key, dim=1)
            scores = self.dense(key_sent)
            scores_t = scores.transpose(-2, -1)

            probs = nn.Softmax(dim=-1)(scores_t / self.T)
            result = torch.squeeze(torch.matmul(probs.unsqueeze(2), value),
                                   dim=2)
        # attention_scores = attention_scores + attention_mask
        # weighted_value = probs.unsqueeze(1).unsqueeze(-1) * value
        # result = torch.sum(weighted_value, dim=2)

        self.T = max(self.T - self.reduction, 1.0)

        return result
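The forward above anneals a softmax temperature: probs = softmax(scores / T), with self.T shrinking by self.reduction per call and floored at 1.0, so the fusion weights sharpen over training. A sketch of the constructor state this relies on (attribute names come from the usage above; the class name and initial values are assumptions):

import torch.nn as nn

class FusionAttention(nn.Module):
    def __init__(self, config, T=50.0, reduction=0.05):
        super(FusionAttention, self).__init__()
        self.config = config
        self.T = T                  # initial softmax temperature
        self.reduction = reduction  # subtracted from T after every forward pass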
Example #41
0
def training_loop(args, model, criterion, optimizer, dataset, f, device, experiment):

    start = time.time()
    best_weights = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    for epoch in range(args.num_epochs):
        print(f'Epoch {epoch} began')
        running_loss = 0.0
        running_corrects = 0
        # training phase
        for idx, data in enumerate(Bar(dataset['train_dataloader'])):
            inputs = Variable(data.get('image')).to(device)
            target = Variable(data.get('target')).to(device)
            # forward pass
            output = model(inputs)
            _, preds = torch.max(output, 1)
            loss = criterion(output, target)
            loss = loss / args.accumulation_steps           # Normalize accumulated loss (averaged)
            loss = loss.mean()
            # backward pass
            loss.backward()                                 # Backward pass (mean of parallel loss)
            if (idx+1) % args.accumulation_steps == 0:      # Wait for several backward steps
                optimizer.step()                            # Now we can do an optimizer step
                model.zero_grad()                           # Reset gradient tensors

            running_loss += loss.item() * inputs.size(0)
            running_corrects += torch.sum(preds == target.data)
        # log training stats
        train_epoch_loss = running_loss / len(dataset['train_data'])
        train_epoch_acc = running_corrects.double() / len(dataset['train_data'])
        print('Epoch [{}/{}], training loss:{:.4f}'.format(epoch+1, args.num_epochs, train_epoch_loss))
        print('Epoch [{}/{}], training accuracy:{:.4f}'.format(epoch+1, args.num_epochs, train_epoch_acc))
        # validation phase
        running_loss = 0.0
        running_corrects = 0
        with torch.no_grad():
            for idx, data in enumerate(Bar(dataset['val_dataloader'])):
                inputs = Variable(data.get('image')).to(device)
                target = Variable(data.get('target')).to(device)
                output = model(inputs)
                _, preds = torch.max(output, 1)
                loss = criterion(output, target).mean()
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == target.data)
        # log validation stats
        valid_epoch_loss = running_loss / len(dataset['val_data'])
        valid_epoch_acc = running_corrects.double() / len(dataset['val_data'])
        print('Epoch [{}/{}], validation loss:{:.4f}'.format(epoch+1, args.num_epochs, valid_epoch_loss))
        print('Epoch [{}/{}], validation accuracy:{:.4f}'.format(epoch+1, args.num_epochs, valid_epoch_acc))
        # append to experiment report
        print(f'{epoch+1}\t{train_epoch_loss}\t{train_epoch_acc}\t{valid_epoch_loss}\t{valid_epoch_acc}',
              file=open(f, "a"))
        # save best weights
        if valid_epoch_acc > best_acc:
            best_acc = valid_epoch_acc
            best_weights = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), f'models/{args.dataset}/{experiment}.pth')

    time_elapsed = time.time() - start
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60), file=open(f, "a"))
    print('Best val Acc: {:4f}'.format(best_acc), file=open(f, "a"))

    # load best weights
    model.load_state_dict(torch.load(f'models/{args.dataset}/{experiment}.pth'))

    return model
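The accumulation logic in training_loop divides each batch loss by accumulation_steps and steps the optimizer only every accumulation_steps batches, which approximates a single update over an effective batch of batch_size * accumulation_steps. The pattern in isolation (a hedged sketch; all names are placeholders):

def accumulated_step(model, optimizer, criterion, batches, accumulation_steps):
    # summing k losses each scaled by 1/k gives the mean loss over the large batch,
    # so the accumulated gradient matches a single large-batch backward pass
    model.zero_grad()
    for i, (inputs, target) in enumerate(batches):
        loss = criterion(model(inputs), target) / accumulation_steps
        loss.backward()
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            model.zero_grad()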
def train_model(args):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    device_ids=[0,1,2,3]
    batch_size=args.batch_size
    input_channels = 1
    out_channels = [args.out_channels1, args.out_channels2]
    kernel_size_cnn = [[args.kernel_size_cnn1, args.kernel_size_cnn2],[args.kernel_size_cnn2, args.kernel_size_cnn1]]
    stride_size_cnn = [[args.stride_size_cnn1, args.stride_size_cnn2],[args.stride_size_cnn2, args.stride_size_cnn1]]
    kernel_size_pool = [[args.kernel_size_pool1, args.kernel_size_pool2],[args.kernel_size_pool2, args.kernel_size_pool1]]
    stride_size_pool = [[args.stride_size_pool1, args.stride_size_pool2],[args.stride_size_pool2, args.stride_size_pool1]]
    hidden_dim=200
    num_layers=2
    dropout=0
    num_labels=4
    hidden_dim_lstm=200
    epoch_num=50
    num_layers_lstm=2
    nfft=[512,1024]
    weight = args.weight
    model = MultiSpectrogramModel(input_channels,out_channels, kernel_size_cnn, stride_size_cnn, kernel_size_pool,
                                stride_size_pool, hidden_dim,num_layers,dropout,num_labels, batch_size,
                                hidden_dim_lstm,num_layers_lstm,device, nfft, weight, False)

    print("============================ Number of parameters ====================================")
    print(str(sum(p.numel() for p in model.parameters() if p.requires_grad)))

    path="batch_size:{};out_channels:{};kernel_size_cnn:{};stride_size_cnn:{};kernel_size_pool:{};stride_size_pool:{}; weight:{}".format(args.batch_size,out_channels,kernel_size_cnn,stride_size_cnn,kernel_size_pool,stride_size_pool, weight)
    with open("/scratch/speech/models/classification/spec_multi_joint_stats_weight.txt","a+") as f:
        f.write("\n"+"============ model starts ===========")
        f.write("\n"+"model_parameters: "+str(sum(p.numel() for p in model.parameters() if p.requires_grad))+"\n"+path+"\n")
    model.cuda()
    model=DataParallel(model,device_ids=device_ids)
    model.train()

    # Use Adam as the optimizer with learning rate 0.001 to make it fast for testing purposes
    optimizer = optim.Adam(model.parameters(),lr=0.001)
    optimizer2=optim.SGD(model.parameters(), lr=0.1)
    scheduler = ReduceLROnPlateau(optimizer=optimizer,factor=0.5, patience=2, threshold=1e-3)
    #scheduler2=ReduceLROnPlateau(optimizer=optimizer2, factor=0.5, patience=2, threshold=1e-3)
    #scheduler2 =CosineAnnealingLR(optimizer2, T_max=300, eta_min=0.0001)
    scheduler3 =MultiStepLR(optimizer, [5,10,15],gamma=0.1)

    # Load the training data
    training_data = IEMOCAP(name='mel', nfft=nfft, train=True)
    train_loader = DataLoader(dataset=training_data, batch_size=batch_size, shuffle=True, collate_fn=my_collate, num_workers=0, drop_last=True)
    testing_data = IEMOCAP(name='mel', nfft=nfft, train=False)
    test_loader = DataLoader(dataset=testing_data, batch_size=batch_size, shuffle=True, collate_fn=my_collate, num_workers=0,drop_last=True)

    #print("=================")
    #print(len(training_data))
    #print("===================")

    test_acc=[]
    train_acc=[]
    test_loss=[]
    train_loss=[]
    for epoch in range(epoch_num):
        #print("===================================" + str(epoch+1) + "==============================================")
        losses = 0
        correct=0
        model.train()
        for j, (input_lstm, input1, input2, target, seq_length) in enumerate(train_loader):
            #if (j+1)%20==0:
                #print("=================================Train Batch"+ str(j+1)+str(weight)+"===================================================")
            model.zero_grad()
            losses_batch,correct_batch= model(input_lstm, input1, input2, target, seq_length)
            loss = torch.mean(losses_batch,dim=0)
            correct_batch=torch.sum(correct_batch,dim=0)
            losses += loss.item() * batch_size
            loss.backward()
            weight=model.module.state_dict()["weight"]
            weight = (torch.exp(10 * weight) / (1 + torch.exp(10 * weight))).item()  # i.e. sigmoid(10 * weight); parenthesized so .item() applies to the whole ratio
            optimizer.step()
            correct += correct_batch.item()
        accuracy=correct*1.0/((j+1)*batch_size)
        losses=losses / ((j+1)*batch_size)
        #scheduler3.step()
        losses_test = 0
        correct_test = 0
        #torch.save(model.module.state_dict(), "/scratch/speech/models/classification/spec_full_joint_checkpoint_epoch_{}.pt".format(epoch+1))
        model.eval()
        with torch.no_grad():
            for j,(input_lstm, input1, input2, target, seq_length) in enumerate(test_loader):
                #if (j+1)%10==0: print("=================================Test Batch"+ str(j+1)+ "===================================================")
                #input_lstm = pad_sequence(sequences=input_lstm,batch_first=True)
                losses_batch,correct_batch= model(input_lstm,input1, input2, target, seq_length)
                loss = torch.mean(losses_batch,dim=0)
                correct_batch=torch.sum(correct_batch,dim=0)
                losses_test += loss.item() * batch_size
                correct_test += correct_batch.item()

        #print("how many correct:", correct_test)
        accuracy_test = correct_test * 1.0 / ((j+1)*batch_size)
        losses_test = losses_test / ((j+1)*batch_size)

        # data gathering
        test_acc.append(accuracy_test)
        train_acc.append(accuracy)
        test_loss.append(losses_test)
        train_loss.append(losses)
        print("Epoch: {}-----------Training Loss: {} -------- Testing Loss: {} -------- Training Acc: {} -------- Testing Acc: {}".format(epoch+1,losses,losses_test, accuracy, accuracy_test)+"\n")
        with open("/scratch/speech/models/classification/spec_multi_joint_stats_weight.txt","a+") as f:
            #f.write("Epoch: {}-----------Training Loss: {} -------- Testing Loss: {} -------- Training Acc: {} -------- Testing Acc: {}".format(epoch+1,losses,losses_test, accuracy, accuracy_test)+"\n")
            if epoch==epoch_num-1:
                f.write("Best Accuracy:{:06.5f}".format(max(test_acc))+"\n")
                f.write("Average Top 10 Accuracy:{:06.5f}".format(np.mean(np.sort(np.array(test_acc))[-10:]))+"\n")
                f.write("=============== model ends ==================="+"\n")
    print("success:{}, Best Accuracy:{}".format(path,max(test_acc)))
Example #43
0
    def regulization(self, model, Lambda):
        # L1 (lasso) penalty over all model parameters
        w = torch.cat([x.view(-1) for x in model.parameters()])
        err = Lambda * torch.sum(torch.abs(w))
        return err
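Typical usage adds this penalty to the task loss before backprop (a hedged sketch; trainer stands for whatever object defines regulization, and the other names are placeholders):

loss = criterion(model(inputs), targets) + trainer.regulization(model, Lambda=1e-4)
loss.backward()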
def k2(kesi,f_x,f_y,mean_logk,lamda):
    logk=mean_logk+torch.sum(torch.sqrt(lamda)*f_x*f_y*kesi,1)
    kk=torch.exp(logk)
    return kk
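k2 evaluates a log-normal permeability field from a truncated Karhunen-Loeve expansion: log k = mean_logk + sum_i sqrt(lambda_i) * f_i(x) * f_i(y) * xi_i, where kesi holds the random coefficients xi_i and the sum over dim 1 runs over the truncation index; exponentiating the sum yields the permeability k.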
Example #45
0
def train(x, y):
    frame_predictor.zero_grad()
    posterior_mu.zero_grad()
    prior_mu.zero_grad()
    encoder.zero_grad()
    decoder.zero_grad()
    var_encoder.zero_grad()  # var_encoder's optimizer is stepped below, so its grads must be cleared too

    # initialize the hidden state.
    frame_predictor.hidden = frame_predictor.init_hidden()
    posterior_mu.hidden = posterior_mu.init_hidden()
    prior_mu.hidden = prior_mu.init_hidden()

    mse = 0
    var = 0

    h_match_prev = [encoder(y[m][0])[0].detach() for m in range(5)]
    for i in range(1, opt.n_past+opt.n_future):
        h = encoder(x[i-1])
        h_target = encoder(x[i])[0]
        h_match = [encoder(y[t][i])[0].detach() for t in range(5)]
        if opt.last_frame_skip or i < opt.n_past:	
            h, skip = h
        else:
            h = h[0]
        
        ## Our work: at each time stamp we predict 5 random results ##
        select_tensor = torch.zeros((opt.batch_size, 5, opt.g_dim)).cuda()
        err_list = []
        pred_list = []
        pre_hidden = [(frame_predictor.hidden[0][0].clone(), frame_predictor.hidden[0][1].clone()),
                      (frame_predictor.hidden[1][0].clone(), frame_predictor.hidden[1][1].clone())]
        
        ref_var = torch.mean(torch.cat([var_encoder(h_match[m] - h_match_prev[m] + h).unsqueeze(1) for m in range(5)], 1), 1)
        mu = posterior_mu(h_target)
        mu_p = prior_mu(torch.cat([h_match[m] - h_match_prev[m] for m in range(5)] + [h], -1))
        
        for j in range(5):
            frame_predictor.hidden = [(pre_hidden[0][0].clone(), pre_hidden[0][1].clone()),
                                      (pre_hidden[1][0].clone(), pre_hidden[1][1].clone())]
            z_t = reparameterize(mu, ref_var)
            h_pred = frame_predictor(torch.cat([h_match[m] - h_match_prev[m] for m in range(5)] + [h, z_t], 1))
            pred_list.append(h_pred.unsqueeze(1))
            err_list.append(torch.mean(torch.abs(h_pred - h_target), -1))
        
        ## Our work: select the best match one as prediction ##
        err_tensor = torch.cat([err.unsqueeze(-1) for err in err_list], -1)
        min_idx = torch.argmin(err_tensor, -1)
        for bs in range(opt.batch_size):
            select_tensor[bs, min_idx[bs], :] = 1.0
        h_pred = torch.sum(torch.cat(pred_list, 1) * select_tensor.detach(), 1)

        x_pred = decoder([h_pred, skip])

        ## Our work: match with the expection ##
        mse += (mse_criterion(x_pred, x[i]) + opt.alpha * mse_criterion(mu, mu_p))
    
        ## Our work: match with the variation ##
        ref_var = torch.std(torch.cat([(h_match[m] - h_match_prev[m]).unsqueeze(1) for m in range(5)], 1), 1)
        pre_var = torch.std(torch.cat([pred_list[m] - h.unsqueeze(1) for m in range(5)], 1), 1)
        var += mse_criterion(ref_var, pre_var)
        h_match_prev = h_match

    loss = mse + var * opt.beta
    loss.backward()

    frame_predictor_optimizer.step()
    posterior_mu_optimizer.step()
    prior_mu_optimizer.step()
    var_encoder_optimizer.step()
    encoder_optimizer.step()
    decoder_optimizer.step()


    return mse.data.cpu().numpy()/(opt.n_past+opt.n_future), var.data.cpu().numpy()/(opt.n_future+opt.n_past)
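train calls reparameterize(mu, ref_var), which is not defined in this excerpt. A standard reparameterization-trick implementation (an assumption; this sketch treats the second argument as a standard deviation, while the original may parameterize the scale differently):

import torch

def reparameterize(mu, sigma):
    # z = mu + sigma * eps with eps ~ N(0, I); sampling stays differentiable w.r.t. mu and sigma
    eps = torch.randn_like(sigma)
    return mu + sigma * eps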
Example #46
0
        def test_headmasking(self):
            if not self.test_head_masking:
                return

            global_rng.seed(42)
            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
            global_rng.seed()

            config.output_attentions = True
            config.output_hidden_states = True
            configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
            for model_class in self.all_model_classes:
                model = model_class(config=configs_no_init)
                model.to(torch_device)
                model.eval()

                # Prepare head_mask
                # Set require_grad after having prepared the tensor to avoid error (leaf variable has been moved into the graph interior)
                head_mask = torch.ones(self.model_tester.num_hidden_layers,
                                       self.model_tester.num_attention_heads,
                                       device=torch_device)
                head_mask[0, 0] = 0
                head_mask[-1, :-1] = 0
                head_mask.requires_grad_(requires_grad=True)
                inputs = inputs_dict.copy()
                inputs['head_mask'] = head_mask

                outputs = model(**inputs)

                # Test that we can get a gradient back for importance score computation
                output = sum(t.sum() for t in outputs[0])
                output = output.sum()
                output.backward()
                multihead_outputs = head_mask.grad

                attentions = outputs[-1]
                hidden_states = outputs[-2]

                # Remove Nan
                for t in attentions:
                    self.assertLess(
                        torch.sum(torch.isnan(t)),
                        t.numel() / 4
                    )  # Check we don't have more than 25% nans (arbitrary)
                attentions = [
                    t.masked_fill(torch.isnan(t), 0.0) for t in attentions
                ]  # remove them (the test is less complete)

                self.assertIsNotNone(multihead_outputs)
                self.assertEqual(len(multihead_outputs),
                                 self.model_tester.num_hidden_layers)
                self.assertAlmostEqual(
                    attentions[0][..., 0, :, :].flatten().sum().item(), 0.0)
                self.assertNotEqual(
                    attentions[0][..., -1, :, :].flatten().sum().item(), 0.0)
                self.assertNotEqual(
                    attentions[1][..., 0, :, :].flatten().sum().item(), 0.0)
                self.assertAlmostEqual(
                    attentions[-1][..., -2, :, :].flatten().sum().item(), 0.0)
                self.assertNotEqual(
                    attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
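The head_mask prepared above has shape (num_hidden_layers, num_attention_heads); transformer implementations typically broadcast-multiply one row of it into the per-head attention probabilities, which is why masked heads produce zero attention rows and why head_mask.grad can be read as a head-importance score. A minimal sketch of that gating (hedged; the exact broadcasting varies by model):

import torch

attention_probs = torch.rand(2, 12, 5, 5)   # (batch, heads, query, key)
layer_mask = torch.ones(12)
layer_mask[0] = 0.0                          # disable head 0
attention_probs = attention_probs * layer_mask.view(1, -1, 1, 1)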
Example #47
0
def discriminator_criterion(
    G,
    D,
    reals,
    alpha=1.,
    device='cuda',
    wgan_lambda=10.0,  # Weight for the gradient penalty term.
    wgan_epsilon=0.001,  # Weight for the epsilon term, \epsilon_{drift}.
    wgan_target=1.0):  # Target value for gradient magnitudes.
    """
    Wasserstein distance criterion.

    Parameters
    ----------
    discriminator_fake_output
    discriminator_real_output
    generated_output
    real_output
    discriminator
    lambda_

    Returns
    -------

    """
    latents = torch.randn([reals.shape[0], 512]).to(device)
    fake_images_out = G(latents, alpha=alpha)
    real_scores_out = D(reals, alpha=alpha)
    fake_scores_out = D(fake_images_out, alpha=alpha)
    loss = fake_scores_out - real_scores_out

    mixing_factors = torch.rand([reals.shape[0], 1, 1, 1]).to(device)
    mixed_images_out = torch.lerp(reals, fake_images_out,
                                  mixing_factors).to(device)
    mixed_scores_out = D(mixed_images_out, alpha=alpha)

    #     # Apply dynamic loss scaling for the given expression.
    #     def apply_loss_scaling(self, value):
    #         assert is_tf_expression(value)
    #         if not self.use_loss_scaling:
    #             return value
    #         return value * exp2(self.get_loss_scaling_var(value.device))
    #
    #     # Undo the effect of dynamic loss scaling for the given expression.
    #     def undo_loss_scaling(self, value):
    #         assert is_tf_expression(value)
    #         if not self.use_loss_scaling:
    #             return value
    #         return value * exp2(-self.get_loss_scaling_var(value.device))

    mixed_loss = torch.sum(mixed_scores_out)  # originally wrapped in loss_scaling, but appears to not use it
    grad_outputs = torch.ones(mixed_loss.size()).to(device)
    mixed_grads = torch.autograd.grad(mixed_loss,
                                      mixed_images_out,
                                      grad_outputs=grad_outputs,
                                      create_graph=True)[0]
    mixed_norms = torch.sqrt(torch.sum(mixed_grads**2, dim=[1, 2, 3]))
    gradient_penalty = ((mixed_norms - wgan_target)**2).reshape(-1, 1)

    loss += gradient_penalty * (wgan_lambda / (wgan_target**2))
    epsilon_penalty = real_scores_out**2
    loss += epsilon_penalty * wgan_epsilon

    # def D_wgangp_acgan(G, D, opt, training_set, minibatch_size, reals, labels,
    #     wgan_lambda     = 10.0,     # Weight for the gradient penalty term.
    #     wgan_epsilon    = 0.001,    # Weight for the epsilon term, \epsilon_{drift}.
    #     wgan_target     = 1.0,      # Target value for gradient magnitudes.
    #     cond_weight     = 1.0):     # Weight of the conditioning terms.
    #
    #     latents = tf.random_normal([minibatch_size] + G.input_shapes[0][1:])
    #     fake_images_out = G.get_output_for(latents, labels, is_training=True)
    #     real_scores_out, real_labels_out = fp32(D.get_output_for(reals, is_training=True))
    #     fake_scores_out, fake_labels_out = fp32(D.get_output_for(fake_images_out, is_training=True))
    #     real_scores_out = tfutil.autosummary('Loss/real_scores', real_scores_out)
    #     fake_scores_out = tfutil.autosummary('Loss/fake_scores', fake_scores_out)
    #     loss = fake_scores_out - real_scores_out
    #
    #     with tf.name_scope('GradientPenalty'):
    #         mixing_factors = tf.random_uniform([minibatch_size, 1, 1, 1], 0.0, 1.0, dtype=fake_images_out.dtype)
    #         mixed_images_out = tfutil.lerp(tf.cast(reals, fake_images_out.dtype), fake_images_out, mixing_factors)
    #         mixed_scores_out, mixed_labels_out = fp32(D.get_output_for(mixed_images_out, is_training=True))
    #         mixed_scores_out = tfutil.autosummary('Loss/mixed_scores', mixed_scores_out)
    #         mixed_loss = opt.apply_loss_scaling(tf.reduce_sum(mixed_scores_out))
    #         mixed_grads = opt.undo_loss_scaling(fp32(tf.gradients(mixed_loss, [mixed_images_out])[0]))
    #         mixed_norms = tf.sqrt(tf.reduce_sum(tf.square(mixed_grads), axis=[1,2,3]))
    #         mixed_norms = tfutil.autosummary('Loss/mixed_norms', mixed_norms)
    #         gradient_penalty = tf.square(mixed_norms - wgan_target)
    #     loss += gradient_penalty * (wgan_lambda / (wgan_target**2))
    #
    #     with tf.name_scope('EpsilonPenalty'):
    #         epsilon_penalty = tfutil.autosummary('Loss/epsilon_penalty', tf.square(real_scores_out))
    #     loss += epsilon_penalty * wgan_epsilon
    #
    #     if D.output_shapes[1][1] > 0:
    #         with tf.name_scope('LabelPenalty'):
    #             label_penalty_reals = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=real_labels_out)
    #             label_penalty_fakes = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels, logits=fake_labels_out)
    #             label_penalty_reals = tfutil.autosummary('Loss/label_penalty_reals', label_penalty_reals)
    #             label_penalty_fakes = tfutil.autosummary('Loss/label_penalty_fakes', label_penalty_fakes)
    #         loss += (label_penalty_reals + label_penalty_fakes) * cond_weight
    #     return loss

    return loss.mean()  # added mean in pytorch implementation; tf did not have!
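discriminator_criterion above is the critic side of WGAN-GP; the matching generator criterion is just the negated critic score on fakes (a hedged sketch under the same G/D calling convention, not taken from the source):

import torch

def generator_criterion(G, D, batch_size, alpha=1., device='cuda'):
    # the generator maximizes the critic's score on generated images
    latents = torch.randn([batch_size, 512]).to(device)
    fake_images_out = G(latents, alpha=alpha)
    fake_scores_out = D(fake_images_out, alpha=alpha)
    return -fake_scores_out.mean()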
Example #48
0
    def forward(self, visible, training=False):
        """
        forward pass for each sample (multiple steps)
        return list caches, which each element is
        the result from one timestep.
        """

        def get_rnn_hidden(v_t, u_tm1):
            activation = torch.matmul(self.wvu, v_t) + torch.matmul(self.wuu, u_tm1) + self.bu
            return torch.tanh(activation)

        time_steps = visible.shape[0]
        u_tm1 = self.u0
        sum1 = 0.0
        sum2 = 0.0

        total_cost = 0
        cost = 0
        mse = []
        for t in range(1, time_steps):
            v_t = visible[t]
            v_tm1 = visible[t - 1]
            #bh_t, bv_t = self.get_bias(u_tm1)
            bh_t = F.linear(u_tm1, self.wuh, self.bh)
            bv_t = F.linear(u_tm1, self.wuv, self.bv)

            ## gibbs sampling start from v_tm1
            # _, negative_sample = self.gibbs_sample(v_tm1, self.w, bh_t, bv_t, num_steps=20)

            v_ = v_t

            for _ in range(20):
                pre_h_, h_ = self.v_to_h(v_, bh_t)

                pre_v_, v_ = self.h_to_v(h_, bv_t)

            negative_sample = v_

            #mean_v = self.gibbs_step(negative_sample)[0]

            if training:
                self.optimizer.zero_grad()
                cost = self.free_energy(v_t) - self.free_energy(negative_sample)

                cost.backward(retain_graph=(t != time_steps-1))
                #print(self.u0.grad)
                self.optimizer.step()


            # RBM Loss
            # cost += self.free_energy(v_t) - self.free_energy(negative_sample)


            # RNN Loss
            y_t = torch.sigmoid(bv_t)
            total_cost += torch.sum(-v_t * torch.log(1e-6 + y_t) - (1 - v_t) * torch.log(1e-6 + 1 - y_t))

            '''

            cost = self.free_energy(v_t) - self.free_energy(negative_sample)

            if training:
                self.optimizer.zero_grad()
                cost.backward(retain_graph=(t != time_steps-1))


                self.optimizer.step()
            '''
            mse.append(torch.abs(v_t - negative_sample).mean().item())



            sum1 += v_t
            sum2 += negative_sample
            ut = get_rnn_hidden(v_t, u_tm1)
            u_tm1 = ut



        # regularization term
        total_cost /= time_steps
        reg_term = (torch.norm(self.wuv)+torch.norm(self.wuh) ) * self.reg_factor
        total_cost += reg_term

        return total_cost, mse, reg_term
Example #49
0
    def train(self, v0, vk, ph0, phk):
        # contrastive-divergence update: positive (data) statistics minus negative (reconstruction) statistics
        self.W += torch.mm(v0.t(), ph0) - torch.mm(vk.t(), phk)
        self.b += torch.sum((v0 - vk), 0)
        self.a += torch.sum((ph0 - phk), 0)
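This is the classic contrastive-divergence weight update. A hedged usage sketch with one Gibbs step producing its inputs (batch is a placeholder, and sample_h/sample_v are assumed helpers returning probabilities and samples):

v0 = batch                       # observed visible units
ph0, h0 = rbm.sample_h(v0)       # hidden probabilities and samples given the data
pvk, vk = rbm.sample_v(h0)       # reconstructed visibles
phk, hk = rbm.sample_h(vk)       # hidden probabilities given the reconstruction
rbm.train(v0, vk, ph0, phk)      # CD-1 parameter update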
Example #50
0
def plot(x, y, epoch):
    nsample = 20 
    gen_seq = [[] for i in range(nsample)]
    gt_seq = [x[i] for i in range(len(x))]

    for s in range(nsample):
        frame_predictor.hidden = frame_predictor.init_hidden()
        posterior_mu.hidden = posterior_mu.init_hidden()
        prior_mu.hidden = prior_mu.init_hidden()
        gen_seq[s].append(x[0])
        x_in = x[0]
        h_match_prev = [encoder(y[m][0])[0].detach() for m in range(5)]
        for i in range(1, opt.n_eval):
            h_match = [encoder(y[m][i])[0].detach() for m in range(5)]
            h = encoder(x_in)
            if opt.last_frame_skip or i < opt.n_past:	
                h, skip = h
            else:
                h, _ = h
            h = h.detach()
            if i < opt.n_past:
                h_target = encoder(x[i])
                h_target = h_target[0].detach()
                mu = posterior_mu(h_target)
                mu_p = prior_mu(torch.cat([h_match[m] - h_match_prev[m] for m in range(5)] + [h], -1)) 
                ref_var = torch.mean(torch.cat([var_encoder(h_match[m] - h_match_prev[m] + h).unsqueeze(1) for m in range(5)], 1), 1)
                z_t = reparameterize(mu, ref_var)
                frame_predictor(torch.cat([h_match[m] - h_match_prev[m] for m in range(5)] + [h, z_t], 1))
                x_in = x[i]
                gen_seq[s].append(x_in)
            else:
                mu_p = prior_mu(torch.cat([h_match[m] - h_match_prev[m] for m in range(5)] + [h], -1))
                ref_var = torch.mean(torch.cat([var_encoder(h_match[m] - h_match_prev[m] + h).unsqueeze(1) for m in range(5)], 1), 1)
                z_t = reparameterize(mu_p, ref_var)
                h = frame_predictor(torch.cat([h_match[m] - h_match_prev[m] for m in range(5)] + [h, z_t], 1)).detach()
                x_in = decoder([h, skip]).detach()
                gen_seq[s].append(x_in)
            
            h_match_prev = h_match

    to_plot = []
    gifs = [ [] for t in range(opt.n_eval) ]
    nrow = min(opt.batch_size, 10)
    for i in range(nrow):
        # ground truth sequence
        row = [] 
        for t in range(opt.n_eval):
            row.append(gt_seq[t][i])
        to_plot.append(row)

        # best sequence
        min_mse = 1e7
        for s in range(nsample):
            mse = 0
            for t in range(opt.n_eval):
                mse +=  torch.sum( (gt_seq[t][i].data.cpu() - gen_seq[s][t][i].data.cpu())**2 )
            if mse < min_mse:
                min_mse = mse
                min_idx = s

        s_list = [min_idx, 
                  np.random.randint(nsample), 
                  np.random.randint(nsample), 
                  np.random.randint(nsample), 
                  np.random.randint(nsample)]
        for ss in range(len(s_list)):
            s = s_list[ss]
            row = []
            for t in range(opt.n_eval):
                row.append(gen_seq[s][t][i]) 
            to_plot.append(row)
        for t in range(opt.n_eval):
            row = []
            row.append(gt_seq[t][i])
            for ss in range(len(s_list)):
                s = s_list[ss]
                row.append(gen_seq[s][t][i])
            gifs[t].append(row)

    fname = '%s/gen/sample_%d.png' % (opt.log_dir, epoch) 
    utils.save_tensors_image(fname, to_plot)

    fname = '%s/gen/sample_%d.gif' % (opt.log_dir, epoch) 
    utils.save_gif(fname, gifs)
    def compute_adjacency_info(vertices: torch.Tensor, faces: torch.Tensor):
        """Build data structures to help speed up connectivity queries. Assumes
        a homogeneous mesh, i.e., each face has the same number of vertices.

        The outputs have the following format: AA, AA_count
        AA_count: [count_0, ..., count_n]
        with AA:
        [[aa_{0,0}, ..., aa_{0,count_0} (, -1, ..., -1)],
         [aa_{1,0}, ..., aa_{1,count_1} (, -1, ..., -1)],
                    ...
         [aa_{n,0}, ..., aa_{n,count_n} (, -1, ..., -1)]]
        """

        device = vertices.device
        facesize = faces.shape[1]
        nb_vertices = vertices.shape[0]
        nb_faces = faces.shape[0]
        edges = torch.cat([faces[:, i:i + 2]
                           for i in range(facesize - 1)] + [faces[:, [-1, 0]]],
                          dim=0)
        # Sort the vertex of edges in increasing order
        edges = torch.sort(edges, dim=1)[0]
        # id of corresponding face in edges
        face_ids = torch.arange(nb_faces, device=device,
                                dtype=torch.long).repeat(facesize)
        # remove multiple occurences and sort by the first vertex
        # the edge key / id is fixed from now as the first axis position
        # edges_ids will give the key of the edges on the original vector
        edges, edges_ids = torch.unique(edges,
                                        sorted=True,
                                        return_inverse=True,
                                        dim=0)
        nb_edges = edges.shape[0]

        # EDGE2FACE
        sorted_edges_ids, order_edges_ids = torch.sort(edges_ids)
        sorted_faces_ids = face_ids[order_edges_ids]
        # indices of first occurences of each key
        idx_first = torch.where(
            torch.nn.functional.pad(
                sorted_edges_ids[1:] != sorted_edges_ids[:-1], (1, 0),
                value=1))[0]
        nb_faces_per_edge = idx_first[1:] - idx_first[:-1]
        # compute sub_idx (2nd axis indices to store the faces)
        offsets = torch.zeros(sorted_edges_ids.shape[0],
                              device=device,
                              dtype=torch.long)
        offsets[idx_first[1:]] = nb_faces_per_edge
        sub_idx = torch.arange(sorted_edges_ids.shape[0],
                               device=device,
                               dtype=torch.long) - torch.cumsum(offsets, dim=0)
        # TODO(cfujitsang): potential way to compute sub_idx differently
        #                   to test with bigger model
        # sub_idx = torch.ones(sorted_edges_ids.shape[0], device=device, dtype=torch.long)
        # sub_idx[0] = 0
        # sub_idx[idx_first[1:]] = 1 - nb_faces_per_edge
        # sub_idx = torch.cumsum(sub_idx, dim=0)
        nb_faces_per_edge = torch.cat(
            [nb_faces_per_edge, sorted_edges_ids.shape[0] - idx_first[-1:]],
            dim=0)
        max_sub_idx = torch.max(nb_faces_per_edge)
        ef = torch.zeros(
            (nb_edges, max_sub_idx), device=device, dtype=torch.long) - 1
        ef[sorted_edges_ids, sub_idx] = sorted_faces_ids
        # FACE2FACES
        nb_faces_per_face = (torch.stack([
            nb_faces_per_edge[edges_ids[i * nb_faces:(i + 1) * nb_faces]]
            for i in range(facesize)
        ],
                                         dim=1).sum(dim=1) - facesize)
        ff = torch.cat([
            ef[edges_ids[i * nb_faces:(i + 1) * nb_faces]]
            for i in range(facesize)
        ],
                       dim=1)
        # remove self occurences
        ff[ff == torch.arange(nb_faces, device=device, dtype=torch.long).view(
            -1, 1)] = -1
        ff = torch.sort(ff, dim=-1, descending=True)[0]
        to_del = (ff[:, 1:] == ff[:, :-1]) & (ff[:, 1:] != -1)
        ff[:, 1:][to_del] = -1
        nb_faces_per_face = nb_faces_per_face - torch.sum(to_del, dim=1)
        max_sub_idx = torch.max(nb_faces_per_face)
        ff = torch.sort(ff, dim=-1, descending=True)[0][:, :max_sub_idx]

        # VERTEX2VERTICES and VERTEX2EDGES
        npy_edges = edges.cpu().numpy()
        edge2key = {tuple(npy_edges[i]): i for i in range(nb_edges)}
        # _edges and double_edges 2nd axis correspond to the triplet:
        # [left vertex, right vertex, edge key]
        _edges = torch.cat(
            [edges, torch.arange(nb_edges, device=device).view(-1, 1)], dim=1)
        double_edges = torch.cat([_edges, _edges[:, [1, 0, 2]]], dim=0)
        double_edges = torch.unique(double_edges, sorted=True, dim=0)
        # TODO(cfujitsang): potential improvment, to test with bigger model:
        # double_edges0, order_double_edges = torch.sort(double_edges[0])
        nb_double_edges = double_edges.shape[0]
        # indices of first occurences of each key
        idx_first = torch.where(
            torch.nn.functional.pad(
                double_edges[1:, 0] != double_edges[:-1, 0], (1, 0),
                value=1))[0]
        nb_edges_per_vertex = idx_first[1:] - idx_first[:-1]
        # compute sub_idx (2nd axis indices to store the edges)
        offsets = torch.zeros(nb_double_edges, device=device, dtype=torch.long)
        offsets[idx_first[1:]] = nb_edges_per_vertex
        sub_idx = torch.arange(nb_double_edges,
                               device=device,
                               dtype=torch.long) - torch.cumsum(offsets, dim=0)
        nb_edges_per_vertex = torch.cat(
            [nb_edges_per_vertex, nb_double_edges - idx_first[-1:]], dim=0)
        max_sub_idx = torch.max(nb_edges_per_vertex)
        vv = torch.zeros(
            (nb_vertices, max_sub_idx), device=device, dtype=torch.long) - 1
        vv[double_edges[:, 0], sub_idx] = double_edges[:, 1]
        ve = torch.zeros(
            (nb_vertices, max_sub_idx), device=device, dtype=torch.long) - 1
        ve[double_edges[:, 0], sub_idx] = double_edges[:, 2]
        # EDGE2EDGES
        ee = torch.cat([ve[edges[:, 0], :], ve[edges[:, 1], :]], dim=1)
        nb_edges_per_edge = nb_edges_per_vertex[
            edges[:, 0]] + nb_edges_per_vertex[edges[:, 1]] - 2
        max_sub_idx = torch.max(nb_edges_per_edge)
        # remove self occurences
        ee[ee == torch.arange(nb_edges, device=device, dtype=torch.long).view(
            -1, 1)] = -1
        ee = torch.sort(ee, dim=-1, descending=True)[0][:, :max_sub_idx]
        # VERTEX2FACES
        vertex_ordered, order_vertex = torch.sort(faces.view(-1))
        face_ids_in_vertex_order = order_vertex // facesize  # integer division: recover face ids from flattened positions
        # indices of first occurences of each id
        idx_first = torch.where(
            torch.nn.functional.pad(vertex_ordered[1:] != vertex_ordered[:-1],
                                    (1, 0),
                                    value=1))[0]
        nb_faces_per_vertex = idx_first[1:] - idx_first[:-1]
        # compute sub_idx (2nd axis indices to store the faces)
        offsets = torch.zeros(vertex_ordered.shape[0],
                              device=device,
                              dtype=torch.long)
        offsets[idx_first[1:]] = nb_faces_per_vertex
        sub_idx = torch.arange(vertex_ordered.shape[0],
                               device=device,
                               dtype=torch.long) - torch.cumsum(offsets, dim=0)
        # TODO(cfujitsang): it seems that nb_faces_per_vertex == nb_edges_per_vertex ?
        nb_faces_per_vertex = torch.cat(
            [nb_faces_per_vertex, vertex_ordered.shape[0] - idx_first[-1:]],
            dim=0)
        max_sub_idx = torch.max(nb_faces_per_vertex)
        vf = torch.zeros(
            (nb_vertices, max_sub_idx), device=device, dtype=torch.long) - 1
        vf[vertex_ordered, sub_idx] = face_ids_in_vertex_order

        return (
            edge2key,
            edges,
            vv,
            nb_edges_per_vertex,
            ve,
            nb_edges_per_vertex,
            vf,
            nb_faces_per_vertex,
            ff,
            nb_faces_per_face,
            ee,
            nb_edges_per_edge,
            ef,
            nb_faces_per_edge,
        )
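A tiny shape-level usage sketch for compute_adjacency_info on two triangles sharing an edge (hedged; it assumes the function is reachable as a plain function or staticmethod):

import torch

vertices = torch.rand(4, 3)
faces = torch.tensor([[0, 1, 2], [1, 2, 3]], dtype=torch.long)
(edge2key, edges, vv, nb_edges_per_vertex, ve, _, vf, nb_faces_per_vertex,
 ff, nb_faces_per_face, ee, nb_edges_per_edge, ef, nb_faces_per_edge) = \
    compute_adjacency_info(vertices, faces)
# edges holds 5 unique undirected edges; ef is (5, 2) with -1 padding on border
# edges, and ff marks the two faces as each other's single neighbor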
Example #52
0
def get_gradient_penalty(discriminator,
                         generated_output,
                         real_output,
                         alpha=1.,
                         lambda_=10.):
    """
    Get gradient penalty.

    Parameters
    ----------
    discriminator
    generated_output
    real_output
    lambda_

    Returns
    -------

    """

    if real_output.shape != generated_output.shape:
        generated_output = generated_output[:real_output.shape[0]]

    batch_size = real_output.shape[0]

    # get epsilon
    # each image receives its own epsilon
    # (e.g., image 1 eps == .8047, image 2 eps == .1988, etc.)
    epsilon = torch.rand(batch_size, 1, 1, 1)
    # stretch the eps value to dim of each image
    epsilon = epsilon.expand(real_output.shape)
    epsilon = epsilon.cuda()

    # get interpolation
    interpolation = epsilon * real_output.data + (
        1 - epsilon) * generated_output.data
    interpolation.requires_grad = True
    interpolation = interpolation.cuda()

    # get interpolation logits
    interpolation_logits = discriminator(interpolation, alpha=alpha)

    # get gradients
    grad_outputs = torch.ones(interpolation_logits.size())
    grad_outputs = grad_outputs.cuda()
    gradients = torch.autograd.grad(outputs=interpolation_logits,
                                    inputs=interpolation,
                                    grad_outputs=grad_outputs,
                                    create_graph=True,
                                    retain_graph=True)[0]
    # do not detach here: the penalty must stay differentiable w.r.t. the discriminator's parameters
    # gradients = gradients.view(batch_size, -1)

    # with tf.name_scope('GradientPenalty'):
    #     #         mixing_factors = tf.random_uniform([minibatch_size, 1, 1, 1], 0.0, 1.0, dtype=fake_images_out.dtype)
    #     #         mixed_images_out = tfutil.lerp(tf.cast(reals, fake_images_out.dtype), fake_images_out, mixing_factors)
    #     #         mixed_scores_out, mixed_labels_out = fp32(D.get_output_for(mixed_images_out, is_training=True))
    #     #         mixed_scores_out = tfutil.autosummary('Loss/mixed_scores', mixed_scores_out)
    #     #         mixed_loss = opt.apply_loss_scaling(tf.reduce_sum(mixed_scores_out))
    #     #         mixed_grads = opt.undo_loss_scaling(fp32(tf.gradients(mixed_loss, [mixed_images_out])[0]))
    #     #         mixed_norms = tf.sqrt(tf.reduce_sum(tf.square(mixed_grads), axis=[1,2,3]))
    #     #         mixed_norms = tfutil.autosummary('Loss/mixed_norms', mixed_norms)
    #     #         gradient_penalty = tf.square(mixed_norms - wgan_target)
    #     #     loss += gradient_penalty * (wgan_lambda / (wgan_target**2))

    # get gradient penalty
    mixed_norms = torch.sqrt(torch.sum(gradients**2, dim=[1, 2, 3]))
    gradient_penalty = (mixed_norms - 1)**2
    # gradient_penalty = gradient_penalty.item()

    # remove gradient tracking
    del interpolation
    torch.cuda.empty_cache()

    return (gradient_penalty * (lambda_ / 1.**2)).mean()
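A hedged sketch of where get_gradient_penalty slots into a WGAN-GP discriminator update (the optimizer and network names are placeholders):

d_real = discriminator(real_output, alpha=alpha)
d_fake = discriminator(generated_output.detach(), alpha=alpha)
d_loss = (d_fake.mean() - d_real.mean()
          + get_gradient_penalty(discriminator, generated_output, real_output, alpha=alpha))
d_optimizer.zero_grad()
d_loss.backward()
d_optimizer.step()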
Example #53
0
        x = self.fc4(x)
        return x


sae = SAE()
criterion = nn.MSELoss()
optimizer = optim.RMSprop(sae.parameters(), lr=0.01, weight_decay=0.5)

nb_epoch = 200
for epoch in range(1, nb_epoch + 1):
    train_loss = 0
    s = 0.
    for id_user in range(nb_users):
        input = Variable(training_set[id_user]).unsqueeze(0)
        target = input.clone()
        if torch.sum(target.data > 0) > 0:
            output = sae(input)
            target.requires_grad = False
            output[target == 0] = 0
            loss = criterion(output, target)
            mean_corrector = nb_movies / \
                float(torch.sum(target.data > 0) + 1e-10)
            loss.backward()
            train_loss += np.sqrt(loss.item() * mean_corrector)
            s += 1.
            optimizer.step()
    print('epoch: ' + str(epoch) + ' loss: ' + str(train_loss / s))

test_loss = 0
s = 0.
for id_user in range(nb_users):
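    # hedged completion: the loop body is truncated in the source; this mirrors the
    # training loop above on a held-out test_set (assumed to be defined like training_set)
    input = Variable(training_set[id_user]).unsqueeze(0)
    target = Variable(test_set[id_user]).unsqueeze(0)
    if torch.sum(target.data > 0) > 0:
        output = sae(input)
        target.requires_grad = False
        output[target == 0] = 0
        loss = criterion(output, target)
        mean_corrector = nb_movies / float(torch.sum(target.data > 0) + 1e-10)
        test_loss += np.sqrt(loss.item() * mean_corrector)
        s += 1.
print('test loss: ' + str(test_loss / s))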
    def forward(self,  # type: ignore
                tokens: Dict[str, torch.LongTensor],
                label: torch.LongTensor = None) -> Dict[str, torch.Tensor]:
        # pylint: disable=arguments-differ
        """
        Parameters
        ----------
        tokens : Dict[str, torch.LongTensor], required
            The output of ``TextField.as_array()``.
        label : torch.LongTensor, optional (default = None)
            A variable representing the label for each instance in the batch.
        Returns
        -------
        An output dictionary consisting of:
        class_probabilities : torch.FloatTensor
            A tensor of shape ``(batch_size, num_classes)`` representing a
            distribution over the label classes for each instance.
        loss : torch.FloatTensor, optional
            A scalar loss to be optimised.
        """
        text_mask = util.get_text_field_mask(tokens).float()
        # Pop elmo tokens, since elmo embedder should not be present.
        elmo_tokens = tokens.pop("elmo", None)
        if tokens:
            embedded_text = self._text_field_embedder(tokens)
        else:
            # only using "elmo" for input
            embedded_text = None

        # Add the "elmo" key back to "tokens" if not None, since the tests and the
        # subsequent training epochs rely not being modified during forward()
        if elmo_tokens is not None:
            tokens["elmo"] = elmo_tokens

        # Create ELMo embeddings if applicable
        if self._elmo:
            if elmo_tokens is not None:
                elmo_representations = self._elmo(elmo_tokens)["elmo_representations"]
                # Pop from the end is more performant with list
                if self._use_integrator_output_elmo:
                    integrator_output_elmo = elmo_representations.pop()
                if self._use_input_elmo:
                    input_elmo = elmo_representations.pop()
                assert not elmo_representations
            else:
                raise ConfigurationError(
                        "Model was built to use Elmo, but input text is not tokenized for Elmo.")

        if self._use_input_elmo:
            if embedded_text is not None:
                embedded_text = torch.cat([embedded_text, input_elmo], dim=-1)
            else:
                embedded_text = input_elmo

        # When using embeddings from the mt-cnn encoder, the hardcoded vocab_size values below should be initialised appropriately
        if cnn:
            embedded_text_cnn = embedded_text
            enc = Encoder(7855, 300, 600, 5, 3, 0.25, 'cuda')
            dec = Decoder(5893, 300, 600, 5, 3, 0.25, 1, 'cuda')

            cnn_model = Seq2Seq(enc, dec).cuda()
            cnn_model.load_state_dict(torch.load('../cnn_lstm_model.pt'))
            cnn_model.eval()
            v1, v2 = cnn_model.encoder(embedded_text[:,:,:256])

            v3 = torch.cat((v1, v2), 2)

            embedded_text = torch.cat((embedded_text_cnn, v3), 2)

        # While using embeddings from the mt-lstm encoder (either load from the saved model from the paper or the reproduced model)
        elif lstm:
            outputs_both_layer_cove_with_glove = MTLSTM(n_vocab=None, vectors=None, layer0=True, residual_embeddings=True)
            outputs_both_layer_cove_with_glove.cuda()
            embedded_text = outputs_both_layer_cove_with_glove(embedded_text,[embedded_text.shape[1]]*embedded_text.shape[0])

        dropped_embedded_text = self._embedding_dropout(embedded_text)
        pre_encoded_text = self._pre_encode_feedforward(dropped_embedded_text)

        encoded_tokens = self._encoder(pre_encoded_text, text_mask)
        # Compute biattention. This is a special case since the inputs are the same.
        attention_logits = encoded_tokens.bmm(encoded_tokens.permute(0, 2, 1).contiguous())
        attention_weights = util.masked_softmax(attention_logits, text_mask)
        encoded_text = util.weighted_sum(encoded_tokens, attention_weights)

        # Build the input to the integrator
        integrator_input = torch.cat([encoded_tokens,
                                      encoded_tokens - encoded_text,
                                      encoded_tokens * encoded_text], 2)
        integrated_encodings = self._integrator(integrator_input, text_mask)

        # Concatenate ELMo representations to integrated_encodings if specified
        if self._use_integrator_output_elmo:
            integrated_encodings = torch.cat([integrated_encodings,
                                              integrator_output_elmo], dim=-1)

        # Simple Pooling layers
        max_masked_integrated_encodings = util.replace_masked_values(
                integrated_encodings, text_mask.unsqueeze(2), -1e7)
        max_pool = torch.max(max_masked_integrated_encodings, 1)[0]
        min_masked_integrated_encodings = util.replace_masked_values(
                integrated_encodings, text_mask.unsqueeze(2), +1e7)
        min_pool = torch.min(min_masked_integrated_encodings, 1)[0]
        mean_pool = torch.sum(integrated_encodings, 1) / torch.sum(text_mask, 1, keepdim=True)

        # Self-attentive pooling layer
        # Run through linear projection. Shape: (batch_size, sequence length, 1)
        # Then remove the last dimension to get the proper attention shape (batch_size, sequence length).
        self_attentive_logits = self._self_attentive_pooling_projection(
                integrated_encodings).squeeze(2)
        self_weights = util.masked_softmax(self_attentive_logits, text_mask)
        self_attentive_pool = util.weighted_sum(integrated_encodings, self_weights)

        pooled_representations = torch.cat([max_pool, min_pool, mean_pool, self_attentive_pool], 1)
        pooled_representations_dropped = self._integrator_dropout(pooled_representations)

        logits = self._output_layer(pooled_representations_dropped)
        class_probabilities = F.softmax(logits, dim=-1)

        output_dict = {'logits': logits, 'class_probabilities': class_probabilities}
        if label is not None:
            loss = self.loss(logits, label)
            for metric in self.metrics.values():
                metric(logits, label)
            output_dict["loss"] = loss

        return output_dict
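The biattention step is the crux of this forward pass. A standalone shape sketch with dummy tensors, using an unmasked softmax as a stand-in for `util.masked_softmax` and `util.weighted_sum`:

import torch
import torch.nn.functional as F

batch, seq, dim = 2, 5, 8
encoded_tokens = torch.randn(batch, seq, dim)

# Self-attention logits: every token attends over every other token.
attention_logits = encoded_tokens.bmm(encoded_tokens.permute(0, 2, 1))  # (2, 5, 5)
attention_weights = F.softmax(attention_logits, dim=-1)  # stand-in for util.masked_softmax
encoded_text = attention_weights.bmm(encoded_tokens)     # stand-in for util.weighted_sum

integrator_input = torch.cat([encoded_tokens,
                              encoded_tokens - encoded_text,
                              encoded_tokens * encoded_text], 2)
print(integrator_input.shape)  # torch.Size([2, 5, 24])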
def test(args, model, classifier, test_loader):

    # switch to evaluate mode
    model.eval()
    classifier.eval()

    batch_time = AverageMeter()
    losses = AverageMeter()
    acc = AverageMeter()

    total_pred = []
    total_target = []
    total_pred_score = []

    with torch.no_grad():

        end = time.time()

        for batch_idx, (input, target) in enumerate(tqdm(test_loader, disable=False)):

            # Get inputs and target
            input, target = input.float(), target.long()

            # Move the variables to Cuda
            input, target = input.cuda(), target.cuda()

            # compute output ###############################
            feats = model(input)
            output = classifier(feats)
            pred_score = torch.softmax(output.detach_(), dim=-1)

            #######
            loss = F.cross_entropy(output, target, reduction='mean')

            # compute loss and accuracy
            batch_size = target.size(0)
            losses.update(loss.item(), batch_size)

            pred = torch.argmax(output, dim=1)
            acc.update(torch.sum(target == pred).item() / batch_size, batch_size)

            # Save pred, target to calculate metrics
            total_pred.append(pred)
            total_target.append(target)
            total_pred_score.append(pred_score)

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            # print statistics and write summary every N batch
            if (batch_idx + 1) % args.print_freq == 0:
                print('Test: [{0}/{1}]\t'
                      'BT {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                      'loss {loss.val:.3f} ({loss.avg:.3f})\t'
                      'acc {acc.val:.3f} ({acc.avg:.3f})'.format(
                    batch_idx + 1, len(test_loader), batch_time=batch_time, loss=losses, acc=acc))

        # Pred and target for performance metrics
        final_predictions = torch.cat(total_pred).to('cpu')
        final_targets = torch.cat(total_target).to('cpu')
        final_pred_score = torch.cat(total_pred_score).to('cpu')

    return final_predictions, final_targets, final_pred_score
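A possible downstream use of the tensors returned by `test` (assuming scikit-learn is available; `args`, the models, and the loader are whatever was passed in above):

from sklearn.metrics import classification_report, roc_auc_score

preds, targets, scores = test(args, model, classifier, test_loader)
print(classification_report(targets.numpy(), preds.numpy()))
# For a binary problem, score the positive-class probability column:
# print(roc_auc_score(targets.numpy(), scores.numpy()[:, 1]))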
Example #56
0
    def _evaluate(self, verbose=False):

        # Predict on full dataset
        embeds_0 = self.inputs[0]  # model.input_layer()
        embeds_1 = self.model.gc1([embeds_0] + self.inputs[1:])
        embeds_2 = self.model.gc2([embeds_1] + self.inputs[1:])
        scores = self.model.clf_bias(embeds_2)
        preds = torch.argmax(scores, dim=1)
        loss_train = self.cross_entropy_loss(scores[self.idx_train],
                                             self.labels_train)
        loss_valid = self.cross_entropy_loss(scores[self.idx_valid],
                                             self.labels_valid)
        loss_test = self.cross_entropy_loss(scores[self.idx_test],
                                            self.labels_test)
        correct_train = torch.sum(preds[self.idx_train] == self.labels_train)
        correct_valid = torch.sum(preds[self.idx_valid] == self.labels_valid)
        correct_test = torch.sum(preds[self.idx_test] == self.labels_test)
        train_acc_net = correct_train.item() / self.labels_train.size(0)
        valid_acc_net = correct_valid.item() / self.labels_valid.size(0)
        test_acc_net = correct_test.item() / self.labels_test.size(0)
        if verbose:
            print('Graph:', train_acc_net, valid_acc_net, test_acc_net)

        scores_shareu = []
        scores_value = scores.data.cpu().numpy()
        for node in range(12127):  # hard-coded total node count
            scores_t = np.zeros(3)
            if (node not in self.node2adj):
                scores_shareu.append(scores_t)
                continue

            adj = list(self.node2adj[node])

            adj_coef = [1 / len(adj)] * len(adj)

            for user, coef in zip(adj, adj_coef):
                scores_t += coef * scores_value[user]
            scores_shareu.append(scores_t)
        scores_shareu = torch.FloatTensor(np.array(scores_shareu)).cuda()
        preds_shareu = torch.argmax(scores_shareu, dim=1)
        correct_train_shareu = torch.sum(
            preds_shareu[self.idx_train] == self.labels_train)
        correct_valid_shareu = torch.sum(
            preds_shareu[self.idx_valid] == self.labels_valid)
        correct_test_shareu = torch.sum(
            preds_shareu[self.idx_test] == self.labels_test)
        train_acc_shareu = (correct_train_shareu.item() /
                            self.labels_train.size(0))
        valid_acc_shareu = (correct_valid_shareu.item() /
                            self.labels_valid.size(0))
        test_acc_shareu = correct_test_shareu.item() / self.labels_test.size(0)
        if verbose:
            print('User:', train_acc_shareu, valid_acc_shareu,
                  test_acc_shareu)

        # (Reconstructed: the original lines here were garbled. This follows
        # the parallel score-combination blocks below.) Combine graph and
        # shared-user scores.
        scores_netshareu = scores + scores_shareu
        preds_netshareu = torch.argmax(scores_netshareu, dim=1)
        correct_train = torch.sum(
            preds_netshareu[self.idx_train] == self.labels_train)
        correct_valid = torch.sum(
            preds_netshareu[self.idx_valid] == self.labels_valid)
        correct_test = torch.sum(
            preds_netshareu[self.idx_test] == self.labels_test)
        train_acc_netshareu = correct_train.item() / self.labels_train.size(0)
        valid_acc_netshareu = correct_valid.item() / self.labels_valid.size(0)
        test_acc_netshareu = correct_test.item() / self.labels_test.size(0)
        if verbose:
            print('G+U:', train_acc_netshareu, valid_acc_netshareu,
                  test_acc_netshareu)

        text_idx_perm = [i for i in range(self.num_docs)]
        scores_text = []
        for start in range(0, self.num_docs, self.text_batch_size):
            self.model.zero_grad()
            end = start + self.text_batch_size
            if (end > self.num_docs):
                end = self.num_docs
            doc_idx_list_raw = text_idx_perm[start:end]
            doctext_idx_list = torch.LongTensor(doc_idx_list_raw).cuda()
            batch_input = self.model.input_layer.get_doc_embed(
                doctext_idx_list)
            # torch.mm(model.gc2.W[0])
            scores_text.extend(
                list(self.model.clf_bias(batch_input).data.cpu().numpy()))
        scores_text = torch.FloatTensor(scores_text).cuda()
        preds_text = torch.argmax(scores_text, dim=1)
        # print(idx_train-num_non_docs)
        # print(preds_shareu[idx_train-num_non_docs])
        # exit()
        correct_train = torch.sum(
            preds_text[self.idx_train -
                       self.num_non_docs] == self.labels_train)
        correct_valid = torch.sum(
            preds_text[self.idx_valid -
                       self.num_non_docs] == self.labels_valid)
        correct_test = torch.sum(
            preds_text[self.idx_test - self.num_non_docs] == self.labels_test)
        train_acc_text = correct_train.item() / self.labels_train.size(0)
        valid_acc_text = correct_valid.item() / self.labels_valid.size(0)
        test_acc_text = correct_test.item() / self.labels_test.size(0)
        if verbose:
            print('Text:', train_acc_text, valid_acc_text, test_acc_text)

        scores[self.num_non_docs:] += scores_text
        preds_nettext = torch.argmax(scores, dim=1)
        correct_train = torch.sum(
            preds_nettext[self.idx_train] == self.labels_train)
        correct_valid = torch.sum(
            preds_nettext[self.idx_valid] == self.labels_valid)
        correct_test = torch.sum(
            preds_nettext[self.idx_test] == self.labels_test)
        train_acc_nettext = correct_train.item() / self.labels_train.size(0)
        valid_acc_nettext = correct_valid.item() / self.labels_valid.size(0)
        test_acc_nettext = correct_test.item() / self.labels_test.size(0)
        if verbose:
            print('G+T:', train_acc_nettext, valid_acc_nettext,
                  test_acc_nettext)

        scores_shareu[self.num_non_docs:] += scores_text
        preds_all = torch.argmax(scores_shareu, dim=1)
        correct_train = torch.sum(
            preds_all[self.idx_train] == self.labels_train)
        correct_valid = torch.sum(
            preds_all[self.idx_valid] == self.labels_valid)
        correct_test = torch.sum(preds_all[self.idx_test] == self.labels_test)
        train_acc_all = correct_train.item() / self.labels_train.size(0)
        valid_acc_all = correct_valid.item() / self.labels_valid.size(0)
        test_acc_all = correct_test.item() / self.labels_test.size(0)
        if verbose:
            print('G+U+T:', train_acc_all, valid_acc_all, test_acc_all)

        if (self.PRED_TYPE == 'net'):
            train_acc_sel, valid_acc_sel, test_acc_sel, preds_sel = (
                train_acc_net, valid_acc_net, test_acc_net,
                preds.data.cpu().numpy())
        elif (self.PRED_TYPE == 'netshareu'):
            train_acc_sel, valid_acc_sel, test_acc_sel, preds_sel = (
                train_acc_netshareu, valid_acc_netshareu, test_acc_netshareu,
                preds_netshareu.data.cpu().numpy())
        elif (self.PRED_TYPE == 'text'):
            train_acc_sel, valid_acc_sel, test_acc_sel, preds_sel = (
                train_acc_text, valid_acc_text, test_acc_text,
                preds_text.data.cpu().numpy())
        elif (self.PRED_TYPE == 'all'):
            train_acc_sel, valid_acc_sel, test_acc_sel, preds_sel = (
                train_acc_all, valid_acc_all, test_acc_all,
                preds_all.data.cpu().numpy())
        else:
            print('wrong PRED_TYPE')
            exit()

        result_table = [
            [train_acc_net, valid_acc_net, test_acc_net],
            [train_acc_shareu, valid_acc_shareu, test_acc_shareu],
            [train_acc_netshareu, valid_acc_netshareu, test_acc_netshareu],
            [train_acc_text, valid_acc_text, test_acc_text],
            [train_acc_nettext, valid_acc_nettext, test_acc_nettext],
            [train_acc_all, valid_acc_all, test_acc_all]
        ]
        return (loss_train.item(), loss_valid.item(), loss_test.item(),
                train_acc_sel, valid_acc_sel, test_acc_sel, preds_sel,
                result_table)
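The shared-user scores above are a uniform average of each node's neighbours' class scores. A toy illustration with hypothetical data:

import numpy as np

node2adj = {0: {1, 2}}  # node 0 is adjacent to users 1 and 2
scores_value = np.array([[0., 0., 0.],
                         [1., 0., 0.],
                         [0., 1., 0.]])

adj = list(node2adj[0])
adj_coef = [1 / len(adj)] * len(adj)
scores_t = np.zeros(3)
for user, coef in zip(adj, adj_coef):
    scores_t += coef * scores_value[user]
print(scores_t)  # [0.5 0.5 0. ] -- uniform average over the neighbours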
    def draw_one_density_plot(self,
                              ax,
                              model,
                              data_dict,
                              traj_id,
                              multiply_by_poisson=False):

        scale = 5
        cmap = add_white(plt.cm.get_cmap('Blues', 9))  # plt.cm.BuGn_r
        cmap2 = add_white(plt.cm.get_cmap('Reds', 9))  # plt.cm.BuGn_r
        #cmap = plt.cm.get_cmap('viridis')

        data = data_dict["data_to_predict"]
        time_steps = data_dict["tp_to_predict"]
        mask = data_dict["mask_predicted_data"]

        observed_data = data_dict["observed_data"]
        observed_time_steps = data_dict["observed_tp"]
        observed_mask = data_dict["observed_mask"]

        npts = 50
        xx, yy, z0_grid = get_meshgrid(npts=npts,
                                       int_y1=(-scale, scale),
                                       int_y2=(-scale, scale))
        z0_grid = z0_grid.to(get_device(data))

        if model.latent_dim > 2:
            z0_grid = torch.cat(
                (z0_grid, torch.zeros(z0_grid.size(0), model.latent_dim - 2)),
                1)

        if model.use_poisson_proc:
            n_traj, n_dims = z0_grid.size()
            # append a vector of zeros to compute the integral of lambda and also zeros for the first point of lambda
            zeros = torch.zeros([n_traj, model.input_dim + model.latent_dim
                                 ]).to(get_device(data))
            z0_grid_aug = torch.cat((z0_grid, zeros), -1)
        else:
            z0_grid_aug = z0_grid

        # Shape of sol_y [n_traj_samples, n_samples, n_timepoints, n_latents]
        sol_y = model.diffeq_solver(z0_grid_aug.unsqueeze(0), time_steps)

        if model.use_poisson_proc:
            sol_y, log_lambda_y, int_lambda, _ = model.diffeq_solver.ode_func.extract_poisson_rate(
                sol_y)

            assert (torch.sum(int_lambda[:, :, 0, :]) == 0.)
            assert (torch.sum(int_lambda[0, 0, -1, :] <= 0) == 0.)

        pred_x = model.decoder(sol_y)

        # Plot density for one trajectory
        one_traj = data[traj_id]
        mask_one_traj = None
        if mask is not None:
            mask_one_traj = mask[traj_id].unsqueeze(0)
            mask_one_traj = mask_one_traj.repeat(npts**2, 1, 1).unsqueeze(0)

        ax.cla()

        # Plot: prior
        prior_density_grid = model.z0_prior.log_prob(
            z0_grid.unsqueeze(0)).squeeze(0)
        # Sum the density over two dimensions
        prior_density_grid = torch.sum(prior_density_grid, -1)

        # =================================================
        # Plot: p(x | y(t0))

        masked_gaussian_log_density_grid = masked_gaussian_log_density(
            pred_x,
            one_traj.repeat(npts**2, 1, 1).unsqueeze(0),
            mask=mask_one_traj,
            obsrv_std=model.obsrv_std).squeeze(-1)

        # Plot p(t | y(t0))
        if model.use_poisson_proc:
            poisson_info = {}
            poisson_info["int_lambda"] = int_lambda[:, :, -1, :]
            poisson_info["log_lambda_y"] = log_lambda_y

            poisson_log_density_grid = compute_poisson_proc_likelihood(
                one_traj.repeat(npts**2, 1, 1).unsqueeze(0),
                pred_x,
                poisson_info,
                mask=mask_one_traj)
            poisson_log_density_grid = poisson_log_density_grid.squeeze(0)

        # =================================================
        # Plot: p(x , y(t0))

        log_joint_density = prior_density_grid + masked_gaussian_log_density_grid
        if multiply_by_poisson:
            log_joint_density = log_joint_density + poisson_log_density_grid

        density_grid = torch.exp(log_joint_density)

        density_grid = torch.reshape(density_grid, (xx.shape[0], xx.shape[1]))
        density_grid = density_grid.cpu().numpy()

        ax.contourf(xx, yy, density_grid, cmap=cmap, alpha=1)

        # =================================================
        # Plot: q(y(t0)| x)
        #self.ax_density.set_title("Red: q(y(t0) | x)    Blue: p(x, y(t0))")
        ax.set_xlabel('z1(t0)')
        ax.set_ylabel('z2(t0)')

        data_w_mask = observed_data[traj_id].unsqueeze(0)
        if observed_mask is not None:
            data_w_mask = torch.cat(
                (data_w_mask, observed_mask[traj_id].unsqueeze(0)), -1)
        z0_mu, z0_std = model.encoder_z0(data_w_mask, observed_time_steps)

        if model.use_poisson_proc:
            z0_mu = z0_mu[:, :, :model.latent_dim]
            z0_std = z0_std[:, :, :model.latent_dim]

        q_z0 = Normal(z0_mu, z0_std)

        q_density_grid = q_z0.log_prob(z0_grid)
        # Sum the density over two dimensions
        q_density_grid = torch.sum(q_density_grid, -1)
        density_grid = torch.exp(q_density_grid)

        density_grid = torch.reshape(density_grid, (xx.shape[0], xx.shape[1]))
        density_grid = density_grid.cpu().numpy()

        ax.contourf(xx, yy, density_grid, cmap=cmap2, alpha=0.3)
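`add_white` and `get_meshgrid` are helpers assumed by the plot above. Plausible sketches consistent with how they are called (reconstructions, not the original implementations):

import numpy as np
import torch
from matplotlib.colors import ListedColormap

def add_white(cmap):
    # Force the lowest bin of a discrete colormap to white so that
    # near-zero density regions stay blank in the contour plot.
    colors = cmap(np.linspace(0., 1., cmap.N))
    colors[0] = (1., 1., 1., 1.)
    return ListedColormap(colors)

def get_meshgrid(npts, int_y1, int_y2):
    # Grid of candidate z(t0) values over the two latent dimensions.
    y1 = np.linspace(int_y1[0], int_y1[1], npts)
    y2 = np.linspace(int_y2[0], int_y2[1], npts)
    xx, yy = np.meshgrid(y1, y2)
    z0_grid = torch.from_numpy(
        np.stack([xx.ravel(), yy.ravel()], axis=1)).float()  # (npts**2, 2)
    return xx, yy, z0_grid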
def train(args, model_teacher, model_student, classifier_teacher, classifier_student, train_labeled_loader, train_unlabeled_loader, optimizer, epoch):

    model_teacher.eval()
    classifier_teacher.eval()

    model_student.train()
    classifier_student.train()

    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()
    losses_x = AverageMeter()
    losses_u = AverageMeter()
    acc = AverageMeter()

    end = time.time()

    train_loader = zip(train_labeled_loader, train_unlabeled_loader)

    for batch_idx, (data_x, data_u) in enumerate(tqdm(train_loader, disable=False)):

        # Get inputs and target
        inputs_x, targets_x = data_x
        inputs_u_w, inputs_u_s = data_u

        inputs_x, inputs_u_w, inputs_u_s, targets_x = inputs_x.float(), inputs_u_w.float(), inputs_u_s.float(), targets_x.long()

        # Move the variables to Cuda
        inputs_x, inputs_u_w, inputs_u_s, targets_x = inputs_x.cuda(), inputs_u_w.cuda(), inputs_u_s.cuda(), targets_x.cuda()

        # Compute output
        inputs_x = inputs_x.reshape(-1, 3, 256, 256)  #Reshape
        targets_x = targets_x.reshape(-1, )

        # Compute pseudolabels for weak_unlabeled images using the teacher model
        with torch.no_grad():
            feat_u_w = model_teacher(inputs_u_w)  # weak unlabeled data
            logits_u_w = classifier_teacher(feat_u_w)

        # Compute output for labeled and strong_unlabeled images using the student model
        inputs = torch.cat((inputs_x, inputs_u_s))
        feats = model_student(inputs)
        logits = classifier_student(feats)

        batch_size = inputs_x.shape[0]
        logits_x = logits[:batch_size]  # labeled data
        logits_u_s = logits[batch_size:]  # unlabeled data
        del logits

        # Compute loss
        Supervised_loss = F.cross_entropy(logits_x, targets_x, reduction='mean')

        pseudo_label = torch.softmax(logits_u_w.detach_(), dim=-1)
        max_probs, targets_u = torch.max(pseudo_label, dim=-1)
        Consistency_loss = F.cross_entropy(logits_u_s, targets_u, reduction='mean')

        final_loss = Supervised_loss + args.lambda_u * Consistency_loss

        # compute gradient and do SGD step #############
        optimizer.zero_grad()
        final_loss.backward()
        optimizer.step()

        # compute loss and accuracy ####################
        losses_x.update(Supervised_loss.item(), batch_size)
        losses_u.update(Consistency_loss.item(), batch_size)
        losses.update(final_loss.item(), batch_size)
        pred = torch.argmax(logits_x, dim=1)
        acc.update(torch.sum(targets_x == pred).item() / batch_size, batch_size)

        # measure elapsed time
        batch_time.update(time.time() - end)
        end = time.time()

        # print statistics and write summary every N batch
        if (batch_idx + 1) % args.print_freq == 0:
            print('Train: [{0}][{1}/{2}]\t'
                  'BT {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'DT {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'acc {acc.val:.3f} ({acc.avg:.3f})\t'
                  'final_loss {final_loss.val:.3f} ({final_loss.avg:.3f})\t'
                  'Supervised_loss {Supervised_loss.val:.3f} ({Supervised_loss.avg:.3f})\t'
                  'Consistency_loss {Consistency_loss.val:.3f} ({Consistency_loss.avg:.3f})'.format(epoch, batch_idx + 1, len(train_labeled_loader),
                                                                                                    batch_time=batch_time,
                                                                                                    data_time=data_time,
                                                                                                    acc=acc,
                                                                                                    final_loss=losses,
                                                                                                    Supervised_loss=losses_x,
                                                                                                    Consistency_loss=losses_u))

    return losses.avg, losses_x.avg, losses_u.avg, acc.avg
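The consistency loss above trusts every teacher pseudo-label. A common FixMatch-style variant masks out low-confidence ones; a minimal sketch (the 0.95 threshold and dummy shapes are illustrative assumptions):

import torch
import torch.nn.functional as F

logits_u_w = torch.randn(4, 3)  # weak-view logits from the teacher
logits_u_s = torch.randn(4, 3)  # strong-view logits from the student
threshold = 0.95                # hypothetical confidence cut-off

pseudo_label = torch.softmax(logits_u_w.detach(), dim=-1)
max_probs, targets_u = torch.max(pseudo_label, dim=-1)
mask = max_probs.ge(threshold).float()  # 1 where the teacher is confident
consistency_loss = (F.cross_entropy(logits_u_s, targets_u,
                                    reduction='none') * mask).mean()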
                # Fragment of a point-cloud mixing step: align cloud b to
                # cloud a with the Earth Mover's Distance, then splice the
                # neighbourhood of a random query point from the aligned cloud.
                remd = emd.emdModule()
                remd = remd.cuda()
                dis, ind = remd(point_a, point_b, 0.005, 300)
                for ass in range(B):
                    point_c[ass, :, :] = point_c[ass, ind[ass].long(), :]

                int_lam = int(args.num_points * lam)
                int_lam = max(1, int_lam)

                random_point = torch.from_numpy(
                    np.random.choice(1024, B, replace=False, p=None))
                # kNN
                ind1 = torch.tensor(range(B))
                query = point_a[ind1, random_point].view(B, 1, 3)
                dist = torch.sqrt(
                    torch.sum(
                        (point_a - query.repeat(1, args.num_points, 1))**2, 2))
                idxs = dist.topk(int_lam, dim=1, largest=False,
                                 sorted=True).indices
                for i2 in range(B):
                    points[i2, idxs[i2], :] = point_c[i2, idxs[i2], :]
                # adjust lambda to exactly match point ratio
                lam = int_lam * 1.0 / args.num_points
                points = points.transpose(2, 1)
                pred, trans_feat = model(points)
                loss = criterion(pred, target_a.long()) * (
                    1. - lam) + criterion(pred, target_b.long()) * lam
            else:
                points = points.transpose(2, 1)
                pred, trans_feat = model(points)
                loss = criterion(pred, target.long())
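A self-contained toy version of the kNN-region replacement above (random clouds, no EMD alignment; `k` plays the role of `int_lam`):

import torch

B, N = 2, 16
point_a = torch.randn(B, N, 3)  # first cloud
point_c = torch.randn(B, N, 3)  # second cloud, assumed already aligned
k = max(1, int(N * 0.25))       # number of points to replace (lam = 0.25)

centre_idx = torch.randint(N, (B,))
query = point_a[torch.arange(B), centre_idx].view(B, 1, 3)
dist = torch.sqrt(((point_a - query) ** 2).sum(2))  # (B, N) distances
idxs = dist.topk(k, dim=1, largest=False).indices   # k nearest points

points = point_a.clone()
for b in range(B):  # splice the neighbourhood from the second cloud
    points[b, idxs[b]] = point_c[b, idxs[b]]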
def loss_KLDivergence(mu, sigma):
    # Closed-form KL(N(mu, exp(sigma)) || N(0, I)); `sigma` is the log-variance.
    return -0.5 * torch.sum(1 + sigma - torch.pow(mu, 2) - torch.exp(sigma))
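Quick sanity check of the closed form: the KL of a standard normal against itself is zero.

import torch

mu = torch.zeros(4, 8)      # encoder means
logvar = torch.zeros(4, 8)  # encoder log-variances => unit variance
assert loss_KLDivergence(mu, logvar).item() == 0.0  # KL(N(0, I) || N(0, I)) = 0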