def forward(self, x, y, z):
     x = F.relu(F.max_pool2d(self.conv1(x), 2))
     x = F.relu(F.max_pool2d(self.conv2(x), 2))
     x = x.view(-1, 1600)
     x = F.relu(self.fc1(x))
     x = F.dropout(x, training=self.training)
     x = self.fc2(x)
     return F.log_softmax(x, dim=1), F.log_softmax(x, dim=1), F.log_softmax(x, dim=1)
Example no. 2
    def _decode_step(self, input_list, state_list, k=1,
                     feed_all_timesteps=False,
                     remove_unknown=False,
                     get_attention=False):

        view_shape = (-1, 1) if self.decoder.batch_first else (1, -1)
        time_dim = 1 if self.decoder.batch_first else 0
        device = next(self.decoder.parameters()).device

        # For recurrent models, the last input frame is all we care about,
        # use feed_all_timesteps whenever the whole input needs to be fed
        if feed_all_timesteps:
            inputs = [torch.tensor(inp, device=device, dtype=torch.long)
                      for inp in input_list]
            inputs = batch_sequences(
                inputs, device=device, batch_first=self.decoder.batch_first)[0]

        else:
            last_tokens = [inputs[-1] for inputs in input_list]
            inputs = torch.stack(last_tokens).view(*view_shape)

        states = State().from_list(state_list)
        logits, new_states = self.decode(
            inputs, states, get_attention=get_attention)
        # use only last prediction
        logits = logits.select(time_dim, -1).contiguous()
        if remove_unknown:
            # Remove possibility of unknown
            logits[:, UNK].fill_(-float('inf'))
        logprobs = log_softmax(logits, dim=1)
        logprobs, words = logprobs.topk(k, 1)
        new_states_list = [new_states[i] for i in range(len(input_list))]
        return words, logprobs, new_states_list
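A quick self-contained illustration of the last-step selection performed above: mask out the unknown token, take log-probabilities, and keep the k best continuations (the UNK index, shapes, and values here are illustrative, not taken from the original code).

import torch
from torch.nn.functional import log_softmax

UNK = 3                                  # illustrative unknown-token index
logits = torch.randn(2, 10)              # (batch, vocab) logits for the last time step
logits[:, UNK] = -float('inf')           # remove possibility of unknown
logprobs = log_softmax(logits, dim=1)
logprobs, words = logprobs.topk(5, 1)    # k best continuations per sequence
print(words, logprobs)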
Example no. 3
    def inference(self, unary, num_iter=5):

        if not self.conf['logsoftmax']:
            lg_unary = torch.log(unary)
            prediction = exp_and_normalize(lg_unary, dim=1)
        else:
            lg_unary = nnfun.log_softmax(unary, dim=1, _stacklevel=5)
            if self.conf['softmax'] and False:
                prediction = exp_and_normalize(lg_unary, dim=1)
            else:
                prediction = lg_unary

        for i in range(num_iter):
            message = self.kernel.compute(prediction)

            if self.comp is not None:
                # message_r = message.view(tuple([1]) + message.shape)
                comp = self.comp(message)
                message = message + comp

            if self.weight is None:
                prediction = lg_unary + message
            else:
                prediction = (self.unary_weight - self.weight) * lg_unary + \
                    self.weight * message

            if not i == num_iter - 1 or self.final_softmax:
                if self.conf['softmax']:
                    prediction = exp_and_normalize(prediction, dim=1)

        return prediction
 def forward(self, x):
     x = F.max_pool2d(F.relu(self.conv1(x)), 2)
     x = F.max_pool2d(F.relu(self.conv2(x)), 2)
     x = x.view(-1, 64 * 7 * 7)  # reshape Variable
     x = F.relu(self.fc1(x))
     x = self.fc2(x)
     return F.log_softmax(x, dim=-1)
 def forward(self, x):
     x = F.relu(F.max_pool2d(self.conv1(x), 2))
     x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
     x = x.view(-1, 320)
     x = F.relu(self.fc1(x))
     x = F.dropout(x, training=self.training)
     return F.log_softmax(self.fc2(x), dim=1)
Example no. 6
    def f_next(self, ctx_dict, y, h):
        # Get hidden states from the first decoder (purely cond. on LM)
        h1 = self.dec0(y, h)

        # Apply attention over multiple modalities
        txt_alpha_t, txt_z_t = self.txt_att(h1.unsqueeze(0), *ctx_dict['txt'])
        img_alpha_t, img_z_t = self.img_att(h1.unsqueeze(0), *ctx_dict['image'])

        # Context will double dimensionality if fusion_type is concat
        # final_z_t should be compatible with hidden_size
        final_z_t = self.fusion(txt_z_t, img_z_t)

        h2 = self.dec1(final_z_t, h1)

        # This is a bottleneck to avoid going from H to V directly
        logit = self.hid2out(h2)

        # Apply dropout if any
        if self.dropout_out > 0:
            logit = self.do_out(logit)

        # Transform logit to T*B*V (V: vocab_size)
        # Compute log_softmax over token dim
        log_p = F.log_softmax(self.out2prob(logit), dim=-1)

        # Return log probs and new hidden states
        return log_p, h2
def masked_cross_entropy(logits, target, length):
    """
    Args:
        logits: A Variable containing a FloatTensor of size
            (batch, max_len, num_classes) which contains the
            unnormalized probability for each class.
        target: A Variable containing a LongTensor of size
            (batch, max_len) which contains the index of the true
            class for each corresponding step.
        length: A Variable containing a LongTensor of size (batch,)
            which contains the length of each data in a batch.

    Returns:
        loss: An average loss value masked by the length.
    """
    length = Variable(torch.LongTensor(length)).cuda()

    # logits_flat: (batch * max_len, num_classes)
    logits_flat = logits.view(-1, logits.size(-1))
    # log_probs_flat: (batch * max_len, num_classes)
    log_probs_flat = functional.log_softmax(logits_flat, dim=1)
    # target_flat: (batch * max_len, 1)
    target_flat = target.view(-1, 1)
    # losses_flat: (batch * max_len, 1)
    losses_flat = -torch.gather(log_probs_flat, dim=1, index=target_flat)
    # losses: (batch, max_len)
    losses = losses_flat.view(*target.size())
    # mask: (batch, max_len)
    mask = sequence_mask(sequence_length=length, max_len=target.size(1))
    losses = losses * mask.float()
    loss = losses.sum() / length.float().sum()
    return loss
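The same masked-NLL idea in a minimal, self-contained sketch (assumes a recent PyTorch; the external sequence_mask helper is replaced by an explicit arange-based mask, and demo_masked_nll is an illustrative name, not the original function):

import torch
import torch.nn.functional as F

def demo_masked_nll(logits, target, length):
    # logits: (batch, max_len, num_classes), target: (batch, max_len), length: (batch,)
    log_probs = F.log_softmax(logits, dim=-1)                       # (batch, max_len, C)
    nll = -log_probs.gather(2, target.unsqueeze(-1)).squeeze(-1)    # (batch, max_len)
    mask = torch.arange(target.size(1))[None, :] < length[:, None]  # (batch, max_len)
    return (nll * mask.float()).sum() / length.float().sum()

logits = torch.randn(2, 5, 7)
target = torch.randint(0, 7, (2, 5))
length = torch.tensor([5, 3])
print(demo_masked_nll(logits, target, length))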
 def forward(self, **sentence):
     input_words = sentence['input_words']
     embeds = self.word_embeddings(input_words)
     lstm_out, self.hidden = self.lstm(embeds.view(len(input_words), 1, -1))
     tag_space = self.hidden2tag(lstm_out.view(len(input_words), -1))
     tag_scores = F.log_softmax(tag_space, dim=1)
     return tag_scores
Example no. 9
 def forward(self, x):
     y = F.dropout(F.relu(self.linears[0](x)), training=self.training)
     for layer in self.linears[1:-1]:
         y = F.relu(layer(y))
         y = F.dropout(y, training=self.training)
     y = F.log_softmax(self.linears[-1](y), dim=1)
     return y
    def forward(self, input, hidden, encoder_outputs):
        '''
        input: batch, 1
        hidden: 1, batch, hidden
        encoder_outputs: length, hidden
        '''
        embedded = self.embedding(input)  # batch, 1, hidden
        embedded = self.dropout(embedded)
        embedded = embedded.squeeze(1)  # batch, hidden

        attn_weights = F.softmax(
            self.attn(torch.cat((embedded, hidden[0]), 1)), dim=1)
        # batch, max_length
        encoder_outputs = encoder_outputs.unsqueeze(0)
        # batch, max_length, hidden
        attn_applied = torch.bmm(attn_weights.unsqueeze(1), encoder_outputs)
        # batch, 1, hidden
        output = torch.cat((embedded, attn_applied.squeeze(1)), 1)
        # batch, 2xhidden
        output = self.attn_combine(output).unsqueeze(0)
        #1, batch, hidden

        for i in range(self.n_layers):
            output = F.relu(output)
            output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output.squeeze(0)), dim=1)
        return output, hidden, attn_weights
def train_a2c(net, mb_obs, mb_rewards, mb_actions, mb_values, optimizer, tb_tracker, step_idx, device="cpu"):
    optimizer.zero_grad()
    mb_adv = mb_rewards - mb_values
    adv_v = torch.FloatTensor(mb_adv).to(device)
    obs_v = torch.FloatTensor(mb_obs).to(device)
    rewards_v = torch.FloatTensor(mb_rewards).to(device)
    actions_t = torch.LongTensor(mb_actions).to(device)
    logits_v, values_v = net(obs_v)
    log_prob_v = F.log_softmax(logits_v, dim=1)
    log_prob_actions_v = adv_v * log_prob_v[range(len(mb_actions)), actions_t]

    loss_policy_v = -log_prob_actions_v.mean()
    loss_value_v = F.mse_loss(values_v.squeeze(-1), rewards_v)

    prob_v = F.softmax(logits_v, dim=1)
    entropy_loss_v = (prob_v * log_prob_v).sum(dim=1).mean()
    loss_v = ENTROPY_BETA * entropy_loss_v + VALUE_LOSS_COEF * loss_value_v + loss_policy_v
    loss_v.backward()
    nn_utils.clip_grad_norm_(net.parameters(), CLIP_GRAD)
    optimizer.step()

    tb_tracker.track("advantage", mb_adv, step_idx)
    tb_tracker.track("values", values_v, step_idx)
    tb_tracker.track("batch_rewards", rewards_v, step_idx)
    tb_tracker.track("loss_entropy", entropy_loss_v, step_idx)
    tb_tracker.track("loss_policy", loss_policy_v, step_idx)
    tb_tracker.track("loss_value", loss_value_v, step_idx)
    tb_tracker.track("loss_total", loss_v, step_idx)
    return obs_v
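A stripped-down sketch of the two log_softmax-based terms computed above, the policy-gradient term and the negative-entropy bonus, using random stand-in data (sizes are illustrative):

import torch
import torch.nn.functional as F

logits = torch.randn(5, 3, requires_grad=True)   # (batch, n_actions)
actions = torch.randint(0, 3, (5,))
adv = torch.randn(5)                             # advantages

log_prob = F.log_softmax(logits, dim=1)
policy_loss = -(adv * log_prob[range(5), actions]).mean()
entropy_loss = (F.softmax(logits, dim=1) * log_prob).sum(dim=1).mean()  # negative entropy
print(policy_loss, entropy_loss)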
    def sample_beam(self, fc_feats, att_feats, opt={}):
        beam_size = opt.get('beam_size', 10)
        batch_size = fc_feats.size(0)

        assert beam_size <= self.vocab_size + 1, 'lets assume this for now, otherwise this corner case causes a few headaches down the road. can be dealt with in future if needed'
        seq = torch.LongTensor(self.seq_length, batch_size).zero_()
        seqLogprobs = torch.FloatTensor(self.seq_length, batch_size)
        # lets process every image independently for now, for simplicity

        self.done_beams = [[] for _ in range(batch_size)]
        for k in range(batch_size):
            state = self.init_hidden(beam_size)
            for t in range(2):
                if t == 0:
                    xt = self.img_embed(fc_feats[k:k+1]).expand(beam_size, self.input_encoding_size)
                elif t == 1: # input <bos>
                    it = fc_feats.data.new(beam_size).long().zero_()
                    xt = self.embed(Variable(it, requires_grad=False))

                output, state = self.core(xt, state)
                logprobs = F.log_softmax(self.logit(output), dim=1)

            self.done_beams[k] = self.beam_search(state, logprobs, opt=opt)
            seq[:, k] = self.done_beams[k][0]['seq'] # the first beam has highest cumulative score
            seqLogprobs[:, k] = self.done_beams[k][0]['logps']
        # return the samples and their log likelihoods
        return seq.transpose(0, 1), seqLogprobs.transpose(0, 1)
 def forward(self, sentence):
     embeds = self.word_embeddings(sentence)  # sentence must be a list of word_ixs
     lstm_out, self.hidden = self.lstm(embeds.view(len(sentence), 1, -1), self.hidden)
     # print(lstm_out.view(len(sentence), -1).shape)  # torch.Size([5, 6]) or torch.Size([4, 6])
     tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))  # Batch, embeding_dim
     tag_scores = F.log_softmax(tag_space, dim=1)
     return tag_scores
 def forward(self, input):
     x, slope = input
     x = x.view(-1, 784)
     x_fc1 = self.act((self.fc1(x), slope))
     x_fc2 = self.fc2(x_fc1)
     x_out = F.log_softmax(x_fc2, dim=1)
     return x_out
Example no. 15
def cross_entropy2d(input, target, weight=None, size_average=True):
    """
    Function to compute pixelwise cross-entropy for 2D image. This is the segmentation loss.
    Args:
        input: input tensor of shape (minibatch x num_channels x h x w)
        target: 2D label map of shape (minibatch x h x w)
        weight (optional): tensor of size 'C' specifying the weights to be given to each class
        size_average (optional): boolean value indicating whether the NLL loss has to be normalized
            by the number of pixels in the image 
    """
    
    # input: (n, c, h, w), target: (n, h, w)
    n, c, h, w = input.size()
    
    # log_p: (n, c, h, w)
    log_p = F.log_softmax(input, dim=1)
    
    # log_p: (n*h*w, c)
    log_p = log_p.transpose(1, 2).transpose(2, 3).contiguous().view(-1, c)
    try:
        log_p = log_p[target.view(n, h, w, 1).repeat(1, 1, 1, c) >= 0]
    except Exception:
        print("Exception: ", target.size())
    log_p = log_p.view(-1, c)
    
    # target: (n*h*w,)
    mask = target >= 0
    target = target[mask]
    target = torch.squeeze(target)
    loss = F.nll_loss(log_p, target, weight=weight, size_average=False)
    if size_average:
        loss /= mask.data.sum()

    return loss
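For reference, a hedged, self-contained sketch of the same pixelwise loss in current PyTorch, where pixels labelled -1 are skipped via ignore_index (this assumes the negative labels simply mark ignored pixels; with class weights the averaging differs slightly from the function above):

import torch
import torch.nn.functional as F

n, c, h, w = 2, 4, 8, 8
inp = torch.randn(n, c, h, w)              # (minibatch, num_channels, h, w)
tgt = torch.randint(0, c, (n, h, w))       # (minibatch, h, w)
tgt[0, 0, 0] = -1                          # an ignored pixel, as in the mask above

loss = F.cross_entropy(inp, tgt, ignore_index=-1, reduction='mean')
print(loss)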
Example no. 16
    def action_logprobs(self, x):
        x = self(x)

        log_probs = F.log_softmax(x, dim=1)
        # probs = F.softmax(x)

        return log_probs
Example no. 17
    def predict(self, inputs):
        classifier = self.nets.classifier

        outputs = classifier(inputs)
        predicted = torch.max(F.log_softmax(outputs, dim=1).data, 1)[1]

        return predicted
Example no. 18
    def forward(self, sentences, sentences_len, hidden):
        sentences_len = sentences_len.cpu().data.numpy()

        idx = np.argsort(sentences_len).tolist()[::-1]
        ridx = np.argsort(idx).tolist()

        sentences = sentences[idx, :]
        sentences_len = sentences_len[idx, ]
        embedding = self.embedding(sentences)
        embedding = nn.Dropout(0.1)(embedding)

        packed_embedding = pack_padded_sequence(embedding, sentences_len, batch_first=True)
        packed_rnn_feature, hidden = self.rnn_feature(packed_embedding, hidden)
        sentence_feature, _ = pad_packed_sequence(packed_rnn_feature, batch_first=True)

        idx = Variable(LongTensor(sentences_len - 1))
        idx = idx.view(-1, 1).expand(sentence_feature.size(0), sentence_feature.size(2)).unsqueeze(1)
        if sentence_feature.is_cuda:
            idx = idx.cuda()
        sentence_feature = sentence_feature.gather(1, idx).squeeze()

        sentence_feature = sentence_feature[ridx, :]
        sentences_len = sentences_len[ridx, ]

        logits = self.classifier(sentence_feature)
        pred = F.log_softmax(logits, dim=1)
        return pred
def calc_loss(batch, net, tgt_net, gamma, device="cpu", save_prefix=None):
    states, actions, rewards, dones, next_states = common.unpack_batch(batch)
    batch_size = len(batch)

    states_v = torch.tensor(states).to(device)
    actions_v = torch.tensor(actions).to(device)
    next_states_v = torch.tensor(next_states).to(device)

    # next state distribution
    next_distr_v, next_qvals_v = tgt_net.both(next_states_v)
    next_actions = next_qvals_v.max(1)[1].data.cpu().numpy()
    next_distr = tgt_net.apply_softmax(next_distr_v).data.cpu().numpy()

    next_best_distr = next_distr[range(batch_size), next_actions]
    dones = dones.astype(bool)

    # project our distribution using Bellman update
    proj_distr = common.distr_projection(next_best_distr, rewards, dones, Vmin, Vmax, N_ATOMS, gamma)

    # calculate net output
    distr_v = net(states_v)
    state_action_values = distr_v[range(batch_size), actions_v.data]
    state_log_sm_v = F.log_softmax(state_action_values, dim=1)
    proj_distr_v = torch.tensor(proj_distr).to(device)

    if save_prefix is not None:
        pred = F.softmax(state_action_values, dim=1).data.cpu().numpy()
        save_transition_images(batch_size, pred, proj_distr, next_best_distr, dones, rewards, save_prefix)

    loss_v = -state_log_sm_v * proj_distr_v
    return loss_v.sum(dim=1).mean()
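The returned value is a per-sample cross-entropy between the Bellman-projected target distribution and the predicted log-softmax distribution; a tiny self-contained sketch with random stand-ins for the network output and for common.distr_projection:

import torch
import torch.nn.functional as F

pred_logits = torch.randn(4, 51)                    # (batch, N_ATOMS) raw atom logits
proj_distr = F.softmax(torch.randn(4, 51), dim=1)   # stands in for the projected distribution

loss = -(F.log_softmax(pred_logits, dim=1) * proj_distr).sum(dim=1).mean()
print(loss)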
Example no. 20
def train(epoch, model):
    LEARNING_RATE = lr / math.pow((1 + 10 * (epoch - 1) / epochs), 0.75)
    print('learning rate: {:.4f}'.format(LEARNING_RATE))
    optimizer = torch.optim.SGD([
        {'params': model.sharedNet.parameters()},
        {'params': model.cls_fc.parameters(), 'lr': LEARNING_RATE},
        ], lr=LEARNING_RATE / 10, momentum=momentum, weight_decay=l2_decay)

    model.train()

    iter_source = iter(source_loader)
    iter_target = iter(target_train_loader)
    num_iter = len_source_loader
    for i in range(1, num_iter):
        data_source, label_source = next(iter_source)
        data_target, _ = next(iter_target)
        if i % len_target_loader == 0:
            iter_target = iter(target_train_loader)
        if cuda:
            data_source, label_source = data_source.cuda(), label_source.cuda()
            data_target = data_target.cuda()
        data_source, label_source = Variable(data_source), Variable(label_source)
        data_target = Variable(data_target)

        optimizer.zero_grad()
        label_source_pred, loss_mmd = model(data_source, data_target)
        loss_cls = F.nll_loss(F.log_softmax(label_source_pred, dim=1), label_source)
        gamma = 2 / (1 + math.exp(-10 * (epoch) / epochs)) - 1
        loss = loss_cls + gamma * loss_mmd
        loss.backward()
        optimizer.step()
        if i % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tsoft_Loss: {:.6f}\tmmd_Loss: {:.6f}'.format(
                epoch, i * len(data_source), len_source_dataset,
                100. * i / len_source_loader, loss.item(), loss_cls.item(), loss_mmd.item()))
 def forward(self, sentence):
     embeds = self.word_embeddings(sentence)
     lstm_out, self.hidden = self.lstm(
         embeds.view(len(sentence), 1, -1), self.hidden)
     tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
     tag_scores = F.log_softmax(tag_space, dim=1)
     return tag_scores
Example no. 22
 def forward(self, inputs, hidden):
     sum_emb = torch.zeros(self.d)
     for input in inputs:
         emb = self.encoder(input)
         sum_emb.add_(emb)
     decoded = self.decoder(sum_emb)
     return F.log_softmax(decoded, dim=-1)
Example no. 23
 def forward(self, x):
     x = F.relu(self.linear1(x))
     x = F.dropout(x, 0.8)
     x = F.relu(self.linear2(x))
     x = F.dropout(x, 0.8)
     x = F.log_softmax(self.linear3(x), dim=1)
     return x
    def forward(self, screen, variables):
        action_prob, input = super(AdvantageActorCriticNoisy, self).forward(screen, variables)

        if not self.training:
            _, action = action_prob.max(1, keepdim=True)
            return action, None

        # epsilon-greedy exploration: take a random action with probability 0.1
        if random.random() < 0.1:
            action = torch.LongTensor(action_prob.size(0), 1).random_(0, action_prob.size(1))
            action = Variable(action)
            if USE_CUDA:
                action = action.cuda()
        else:
            _, action = action_prob.max(1, keepdim=True)

        # value prediction - critic
        value = F.relu(self.value1(input))
        value = torch.cat([value, variables], 1)
        value = self.value2(value)

        # save output for backprop
        action_prob = F.log_softmax(action_prob, dim=1)
        self.outputs.append(ModelOutput(action_prob.gather(-1, action), value))
        return action, value
Example no. 25
 def forward(self, x):
     out = F.relu(F.max_pool2d(self.conv1(x), 2))
     out = F.relu(F.max_pool2d(self.conv2(out), 2))
     out = out.view(-1, 320)
     out = F.relu(self.fc1(out))
     out = self.fc2(out)
     return F.log_softmax(out, dim=1)
Example no. 26
 def forward(self, x, word):
     char = torch.FloatTensor()
     for each in word:
         char_list = []
         for letter in each:
             char_list.append(character_to_idx[letter.lower()])
         char_list = torch.LongTensor(char_list)
         char_list = char_list.unsqueeze(0)
         if torch.cuda.is_available():
             tempchar = self.char_lstm(Variable(char_list).cuda())
         else:
             tempchar = self.char_lstm(Variable(char_list))
         tempchar = tempchar.squeeze(0)
         char = torch.cat((char, tempchar.cpu().data), 0)
     if torch.cuda.is_available():
         char = char.cuda()
     char = Variable(char)
     x = self.word_embedding(x)
     x = torch.cat((x, char), 1)
     x = x.unsqueeze(0)
     x, _ = self.lstm(x)
     x = x.squeeze(0)
     x = self.linear1(x)
     y = F.log_softmax(x, dim=1)
     return y
    def forward(self, fc_feats, att_feats, seq):
        batch_size = fc_feats.size(0)
        state = self.init_hidden(batch_size)
        outputs = []

        for i in range(seq.size(1)):
            if i == 0:
                xt = self.img_embed(fc_feats)
            else:
                if self.training and i >= 2 and self.ss_prob > 0.0: # otherwise no need to sample
                    sample_prob = fc_feats.data.new(batch_size).uniform_(0, 1)
                    sample_mask = sample_prob < self.ss_prob
                    if sample_mask.sum() == 0:
                        it = seq[:, i-1].clone()
                    else:
                        sample_ind = sample_mask.nonzero().view(-1)
                        it = seq[:, i-1].data.clone()
                        #prob_prev = torch.exp(outputs[-1].data.index_select(0, sample_ind)) # fetch prev distribution: shape Nx(M+1)
                        #it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1))
                        prob_prev = torch.exp(outputs[-1].data) # fetch prev distribution: shape Nx(M+1)
                        it.index_copy_(0, sample_ind, torch.multinomial(prob_prev, 1).view(-1).index_select(0, sample_ind))
                        it = Variable(it, requires_grad=False)
                else:
                    it = seq[:, i-1].clone()
                # break if all the sequences end
                if i >= 2 and seq[:, i-1].data.sum() == 0:
                    break
                xt = self.embed(it)

            output, state = self.core(xt, state)
            output = F.log_softmax(self.logit(output), dim=1)
            outputs.append(output)

        return torch.cat([_.unsqueeze(1) for _ in outputs[1:]], 1).contiguous()
Example no. 28
    def routine(self, inputs, targets,
                criterion=nn.CrossEntropyLoss(reduction='none')):
        '''

        Args:
            criterion: Classifier criterion.

        '''
        classifier = self.nets.classifier

        outputs = classifier(inputs)
        predicted = torch.max(F.log_softmax(outputs, dim=1).data, 1)[1]

        unlabeled = targets.eq(-1).long()
        losses = criterion(outputs, (1 - unlabeled) * targets)
        labeled = 1. - unlabeled.float()
        loss = (losses * labeled).sum() / labeled.sum()

        if labeled.sum() > 0:
            correct = 100. * (labeled * predicted.eq(
                targets.data).float()).cpu().sum() / labeled.cpu().sum()
            self.results.accuracy = correct
            self.losses.classifier = loss

        self.results.perc_labeled = labeled.mean()
Example no. 29
    def action_probs(self, x):
        x = self(x)

        log_probs = F.log_softmax(x, dim=1)
        probs = F.softmax(x, dim=1)

        return probs
 def forward(self, x):
     in_size = x.size(0)
     x = F.relu(self.mp(self.conv1(x)))
     x = F.relu(self.mp(self.conv2(x)))
     x = x.view(in_size, -1)  # flatten the tensor
     x = self.fc(x)
     return F.log_softmax(x, dim=1)
Example no. 31
 def forward(self, x, adj):
     x = F.dropout(x, self.dropout, training=self.training)
     x = torch.cat([att(x, adj) for att in self.attentions], dim=1)
     x = F.dropout(x, self.dropout, training=self.training)
     x = F.elu(self.out_att(x, adj))
     return F.log_softmax(x, dim=1)
Example no. 32
    def beam_search(self,
                    src_sent: List[str],
                    beam_size: int = 5,
                    max_decoding_time_step: int = 70) -> List[Hypothesis]:
        """ Given a single source sentence, perform beam search, yielding translations in the target language.
        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a list of hypotheses, where each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var,
                                                  [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        eos_id = self.vocab.tgt['</s>']

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses),
                                 dtype=torch.float,
                                 device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses
                  ) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))

            exp_src_encodings_att_linear = src_encodings_att_linear.expand(
                hyp_num, src_encodings_att_linear.size(1),
                src_encodings_att_linear.size(2))

            y_tm1 = torch.tensor(
                [self.vocab.tgt[hyp[-1]] for hyp in hypotheses],
                dtype=torch.long,
                device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(x,
                                                h_tm1,
                                                exp_src_encodings,
                                                exp_src_encodings_att_linear,
                                                enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t),
                                    dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            continuing_hyp_scores = (
                hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(
                continuing_hyp_scores, k=live_hyp_num)

            prev_hyp_ids = top_cand_hyp_pos // len(self.vocab.tgt)
            hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt)

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(
                    prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    completed_hypotheses.append(
                        Hypothesis(value=new_hyp_sent[1:-1],
                                   score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            live_hyp_ids = torch.tensor(live_hyp_ids,
                                        dtype=torch.long,
                                        device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores,
                                      dtype=torch.float,
                                      device=self.device)

        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(
                Hypothesis(value=hypotheses[0][1:],
                           score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses
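The beam-expansion arithmetic used above (add the running hypothesis scores to the per-word log-probs, flatten, take the top-k, then recover which hypothesis and which word each flat index corresponds to) in a self-contained sketch with illustrative sizes:

import torch
import torch.nn.functional as F

vocab_size, hyp_num, beam_size = 7, 3, 3
hyp_scores = torch.zeros(hyp_num)
log_p_t = F.log_softmax(torch.randn(hyp_num, vocab_size), dim=-1)

flat = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
top_scores, top_pos = torch.topk(flat, k=beam_size)
prev_hyp_ids = top_pos // vocab_size   # which hypothesis each candidate extends
hyp_word_ids = top_pos % vocab_size    # which word extends it
print(prev_hyp_ids, hyp_word_ids, top_scores)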
Example no. 33
    def train(self, epoch):

        T = 2

        self.model.train()
        print("Epochs %d" % epoch)

        tasknum = self.train_data_iterator.dataset.t
        start = 0
        end = self.train_data_iterator.dataset.end
        mid = end - self.args.step_size
        kwargs = {'num_workers': 32, 'pin_memory': True}
        exemplar_dataset_loaders = ExemplarLoader(
            self.train_data_iterator.dataset)
        exemplar_iterator = torch.utils.data.DataLoader(
            exemplar_dataset_loaders,
            batch_size=self.args.replay_batch_size,
            shuffle=True,
            drop_last=True,
            **kwargs)

        selfsupervised_dataset_loaders = SelfSupervisedLoader(
            self.train_data_iterator.dataset)
        selfsupervised_iterator = torch.utils.data.DataLoader(
            selfsupervised_dataset_loaders,
            batch_size=self.args.batch_size,
            shuffle=True,
            drop_last=True,
            **kwargs)

        if tasknum > 0:
            iterator = zip(selfsupervised_iterator, exemplar_iterator)
        else:
            iterator = selfsupervised_iterator

        for samples in tqdm(iterator):
            if tasknum > 0:
                curr, prev = samples

                data, target, target_rot = curr
                data, target, target_rot = data.cuda(), target.cuda(), target_rot.cuda()

                batch_size = data.shape[0]

                data_r, target_r, target_rot_r = prev
                data_r, target_r, target_rot_r = data_r.cuda(), target_r.cuda(), target_rot_r.cuda()

                replay_size = data_r.shape[0]

                data = torch.cat((data, data_r))
                target = torch.cat((target, target_r))
                target_rot = torch.cat((target_rot, target_rot_r))

            else:
                data, target, target_rot = samples
                data, target, target_rot = data.cuda(), target.cuda(
                ), target_rot.cuda()

                batch_size = data.shape[0]

            data = data.view(-1, 3, 224, 224)
            target = target.view(data.size(0), -1)
            target_rot = target_rot.view(data.size(0), -1)

            y_onehot = torch.FloatTensor(len(target),
                                         self.dataset.classes).cuda()

            y_onehot.zero_()
            #             y_onehot.scatter_(1, torch.unsqueeze(target, 1), 1)
            y_onehot.scatter_(1, target, 1)

            y_onehot_rot = torch.FloatTensor(len(target_rot), 4).cuda()

            y_onehot_rot.zero_()
            #             y_onehot_rot.scatter_(1, torch.unsqueeze(target_rot, 1), 1)
            y_onehot_rot.scatter_(1, target_rot, 1)

            uniform = torch.ones_like(y_onehot)

            output = self.model(data)
            output_log = F.log_softmax(output[:batch_size, start:end], dim=1)
            output_rot_log = F.log_softmax(output[:batch_size, 1000:1004],
                                           dim=1)

            if self.args.loss == 'GCE':
                loss_CE = self.gce(output[:batch_size, start:end],
                                   target % (end - start))
            else:

                if self.args.prev_new:
                    loss_CE_curr = 0
                    loss_CE_prev = 0

                    curr = output[:batch_size, mid:end]
                    curr_log = F.log_softmax(curr, dim=1)
                    loss_CE_curr = F.kl_div(curr_log,
                                            y_onehot[:batch_size, mid:end],
                                            reduction='sum')

                    curr_rot = output[:batch_size, 1000:1004]
                    curr_rot_log = F.log_softmax(curr_rot, dim=1)
                    loss_rot_CE_curr = F.kl_div(curr_rot_log,
                                                y_onehot_rot[:batch_size],
                                                reduction='sum')

                    loss_CE_curr += loss_rot_CE_curr

                    if tasknum > 0:
                        prev = output[batch_size:batch_size + replay_size,
                                      start:mid]
                        prev_log = F.log_softmax(prev, dim=1)
                        loss_CE_prev = F.kl_div(
                            prev_log,
                            y_onehot[batch_size:batch_size + replay_size,
                                     start:mid],
                            reduction='sum')

                        #                         prev_rot = output[batch_size:batch_size+replay_size,1000:1004]
                        #                         prev_rot_log = F.log_softmax(prev_rot, dim=1)
                        #                         loss_rot_CE_prev = F.kl_div(prev_rot_log,
                        #                                                     y_onehot_rot[batch_size:batch_size+replay_size], reduction='sum')

                        #                         loss_CE_prev  += loss_rot_CE_prev

                        loss_CE = (loss_CE_curr + loss_CE_prev) / (batch_size +
                                                                   replay_size)
                    else:
                        loss_CE = loss_CE_curr / batch_size

                else:
                    loss_CE = F.kl_div(output_log,
                                       y_onehot[:batch_size, start:end],
                                       reduction='batchmean')
                    loss_rot_CE = F.kl_div(output_rot_log,
                                           y_onehot_rot[:batch_size],
                                           reduction='batchmean')
                    loss_CE += loss_rot_CE

            if self.args.CI:
                loss_CE += F.kl_div(output_log,
                                    uniform[:batch_size, start:end] /
                                    (end - start),
                                    reduction='batchmean') * self.args.beta

            if tasknum > 0 and self.args.uniform_penalty:
                prev_uni = output[batch_size:batch_size + replay_size,
                                  start:end]
                prev_uni_log = F.log_softmax(prev_uni, dim=1)
                loss_uni_prev = F.kl_div(
                    prev_uni_log,
                    uniform[:replay_size, start:end] / (end - start),
                    reduction='batchmean') * self.args.beta
                loss_CE += loss_uni_prev

            self.optimizer.zero_grad()
            (loss_CE).backward()
            self.optimizer.step()
Example no. 34
    def forward(self, x):
        x = self.fc1(x)
        x = torch.sigmoid(x)
        x = self.fc2(x)

        return f.log_softmax(x, dim=1)
Example no. 35
 def forward(self, x):
     x = self.model(x)
     return F.log_softmax(x, dim=1)
Example no. 36
 def forward(self, x):
     x, trans, trans_feat = self.feat(x)
     x = F.relu(self.bn1(self.fc1(x)))
     x = F.relu(self.bn2(self.dropout(self.fc2(x))))
     x = self.fc3(x)
     return F.log_softmax(x, dim=1), trans, trans_feat
Example no. 37
def train(rank, args, shared_model):
    torch.cuda.set_device(args.gpus.index(args.gpus[rank % len(args.gpus)]))

    if args.model_type == 'pacman':

        model_kwargs = {'question_vocab': load_vocab(args.vocab_json)}
        model = NavPlannerControllerModel(**model_kwargs)

    else:

        exit()

    lossFn = torch.nn.CrossEntropyLoss().cuda()

    optim = torch.optim.Adam(
        filter(lambda p: p.requires_grad, shared_model.parameters()),
        lr=args.learning_rate)

    train_loader_kwargs = {
        'questions_h5': args.train_h5,
        'data_json': args.data_json,
        'vocab': args.vocab_json,
        'batch_size': args.batch_size,
        'input_type': args.model_type,
        'num_frames': 5,
        'split': 'train',
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank % len(args.gpus)],
        'to_cache': args.to_cache
    }

    eval_loader_kwargs = {
        'questions_h5': getattr(args, args.eval_split + '_h5'),
        'data_json': args.data_json,
        'vocab': args.vocab_json,
        'target_obj_conn_map_dir': args.target_obj_conn_map_dir,
        'map_resolution': args.map_resolution,
        'batch_size': 1,
        'input_type': args.model_type,
        'num_frames': 5,
        'split': args.eval_split,
        'max_threads_per_gpu': args.max_threads_per_gpu,
        'gpu_id': args.gpus[rank % len(args.gpus)],
        'to_cache': False
    }

    args.output_log_path = os.path.join(args.log_dir,
                                        'train_' + str(rank) + '.json')

    if 'pacman' in args.model_type:
        metrics = NavMetric(
            info={'split': args.eval_split,
                  'thread': rank},
            metric_names=[
                'd_0_10', 'd_0_30', 'd_0_50', 'd_T_10', 'd_T_30', 'd_T_50',
                'd_D_10', 'd_D_30', 'd_D_50', 'd_min_10', 'd_min_30',
                'd_min_50', 'r_T_10', 'r_T_30', 'r_T_50', 'r_e_10', 'r_e_30',
                'r_e_50', 'stop_10', 'stop_30', 'stop_50', 'ep_len_10',
                'ep_len_30', 'ep_len_50'
            ],
            log_json=args.output_log_path)
    else:

        metrics = NavMetric(
            info={'split': 'train',
                  'thread': rank},
            metric_names=['loss'],
            log_json=args.output_log_path)

    train_loader = EqaDataLoader(**train_loader_kwargs)
    eval_loader = EqaDataLoader(**eval_loader_kwargs)

    print('train_loader has %d samples' % len(train_loader.dataset))

    t, epoch, best_eval_acc = 0, 0, 0

    while epoch < int(args.max_epochs):

        if 'pacman' in args.model_type:

            planner_lossFn = MaskedNLLCriterion().cuda()
            controller_lossFn = MaskedNLLCriterion().cuda()

            done = False
            model.train()

            all_envs_loaded = train_loader.dataset._check_if_all_envs_loaded()

            while done == False:

                for batch in train_loader:

                    t += 1

                    model.load_state_dict(shared_model.state_dict())
                    model.train()
                    model.cuda()

                    idx, questions, _, planner_img_feats, planner_actions_in, \
                        planner_actions_out, planner_action_lengths, planner_masks, \
                        controller_img_feats, controller_actions_in, planner_hidden_idx, \
                        controller_outs, controller_action_lengths, controller_masks = batch

                    questions_var = Variable(questions.cuda())

                    planner_img_feats_var = Variable(planner_img_feats.cuda())
                    planner_actions_in_var = Variable(
                        planner_actions_in.cuda())
                    planner_actions_out_var = Variable(
                        planner_actions_out.cuda())
                    planner_action_lengths = planner_action_lengths.cuda()
                    planner_masks_var = Variable(planner_masks.cuda())

                    controller_img_feats_var = Variable(
                        controller_img_feats.cuda())
                    controller_actions_in_var = Variable(
                        controller_actions_in.cuda())
                    planner_hidden_idx_var = Variable(
                        planner_hidden_idx.cuda())
                    controller_outs_var = Variable(controller_outs.cuda())
                    controller_action_lengths = controller_action_lengths.cuda(
                    )
                    controller_masks_var = Variable(controller_masks.cuda())

                    planner_action_lengths, perm_idx = planner_action_lengths.sort(
                        0, descending=True)

                    questions_var = questions_var[perm_idx]

                    planner_img_feats_var = planner_img_feats_var[perm_idx]
                    planner_actions_in_var = planner_actions_in_var[perm_idx]
                    planner_actions_out_var = planner_actions_out_var[perm_idx]
                    planner_masks_var = planner_masks_var[perm_idx]

                    controller_img_feats_var = controller_img_feats_var[
                        perm_idx]
                    controller_actions_in_var = controller_actions_in_var[
                        perm_idx]
                    controller_outs_var = controller_outs_var[perm_idx]
                    planner_hidden_idx_var = planner_hidden_idx_var[perm_idx]
                    controller_action_lengths = controller_action_lengths[
                        perm_idx]
                    controller_masks_var = controller_masks_var[perm_idx]

                    planner_scores, controller_scores, planner_hidden = model(
                        questions_var, planner_img_feats_var,
                        planner_actions_in_var,
                        planner_action_lengths.cpu().numpy(),
                        planner_hidden_idx_var, controller_img_feats_var,
                        controller_actions_in_var, controller_action_lengths)

                    planner_logprob = F.log_softmax(planner_scores, dim=1)
                    controller_logprob = F.log_softmax(
                        controller_scores, dim=1)

                    planner_loss = planner_lossFn(
                        planner_logprob,
                        planner_actions_out_var[:, :planner_action_lengths.max(
                        )].contiguous().view(-1, 1),
                        planner_masks_var[:, :planner_action_lengths.max()]
                        .contiguous().view(-1, 1))

                    controller_loss = controller_lossFn(
                        controller_logprob,
                        controller_outs_var[:, :controller_action_lengths.max(
                        )].contiguous().view(-1, 1),
                        controller_masks_var[:, :controller_action_lengths.max(
                        )].contiguous().view(-1, 1))

                    # zero grad
                    optim.zero_grad()

                    # update metrics
                    # metrics.update(
                    #     [planner_loss.data[0], controller_loss.data[0]])

                    # backprop and update
                    (planner_loss + controller_loss).backward()

                    ensure_shared_grads(model.cpu(), shared_model)
                    optim.step()

                    # if t % args.print_every == 0:
                    #     print(metrics.get_stat_string())
                    #     if args.to_log == 1:
                    #        metrics.dump_log()

                    print('[CHECK][Cache:%d][Total:%d]' %
                          (len(train_loader.dataset.img_data_cache),
                           len(train_loader.dataset.env_list)))

                if all_envs_loaded == False:
                    train_loader.dataset._load_envs(in_order=True)
                    if len(train_loader.dataset.pruned_env_set) == 0:
                        done = True
                        if args.to_cache == False:
                            train_loader.dataset._load_envs(
                                start_idx=0, in_order=True)
                else:
                    done = True

        invalids = []
        done = False
        model.eval()

        while done == False:

            for batch in tqdm(eval_loader):
                if batch is None:
                    continue

                model.load_state_dict(shared_model.state_dict())
                model.cuda()

                idx, question, answer, actions, action_length = batch
                metrics_slug = {}

                h3d = eval_loader.dataset.episode_house

                # evaluate at multiple initializations
                for i in [10, 30, 50]:

                    t += 1

                    if i > action_length[0]:
                        invalids.append([idx[0], i])
                        continue

                    question_var = Variable(question.cuda())

                    controller_step = False
                    planner_hidden = model.planner_nav_rnn.init_hidden(1)

                    # forward through planner till spawn
                    planner_actions_in, planner_img_feats, controller_step, controller_action_in, controller_img_feat, init_pos = eval_loader.dataset.get_hierarchical_features_till_spawn(
                        actions[0, :action_length[0] + 1].numpy(), i)

                    planner_actions_in_var = Variable(
                        planner_actions_in.cuda())
                    planner_img_feats_var = Variable(
                        planner_img_feats.cuda())

                    for step in range(planner_actions_in.size(0)):

                        planner_scores, planner_hidden = model.planner_step(
                            question_var, planner_img_feats_var[step].view(
                                1, 1,
                                3200), planner_actions_in_var[step].view(
                                    1, 1), planner_hidden)

                    if controller_step == True:

                        controller_img_feat_var = Variable(
                            controller_img_feat.cuda())
                        controller_action_in_var = Variable(
                            torch.LongTensor(1, 1).fill_(
                                int(controller_action_in)).cuda())

                        controller_scores = model.controller_step(
                            controller_img_feat_var.view(1, 1, 3200),
                            controller_action_in_var.view(1, 1),
                            planner_hidden[0])

                        prob = F.softmax(controller_scores, dim=1)
                        controller_action = int(
                            prob.max(1)[1].data.cpu().numpy()[0])

                        if controller_action == 1:
                            controller_step = True
                        else:
                            controller_step = False

                        action = int(controller_action_in)
                        action_in = torch.LongTensor(
                            1, 1).fill_(action + 1).cuda()

                    else:

                        prob = F.softmax(planner_scores, dim=1)
                        action = int(prob.max(1)[1].data.cpu().numpy()[0])

                        action_in = torch.LongTensor(
                            1, 1).fill_(action + 1).cuda()

                    h3d.env.reset(
                        x=init_pos[0], y=init_pos[2], yaw=init_pos[3])

                    init_dist_to_target = h3d.get_dist_to_target(
                        h3d.env.cam.pos)
                    if init_dist_to_target < 0:  # unreachable
                        invalids.append([idx[0], i])
                        continue

                    episode_length = 0
                    episode_done = True
                    controller_action_counter = 0

                    dists_to_target, pos_queue, pred_actions = [
                        init_dist_to_target
                    ], [init_pos], []
                    planner_actions, controller_actions = [], []

                    if action != 3:

                        # take the first step
                        img, _, _ = h3d.step(action)
                        img = torch.from_numpy(img.transpose(
                            2, 0, 1)).float() / 255.0
                        img_feat_var = eval_loader.dataset.cnn(
                            Variable(img.view(1, 3, 224,
                                              224).cuda())).view(
                                                  1, 1, 3200)

                        for step in range(args.max_episode_length):

                            episode_length += 1

                            if controller_step == False:
                                planner_scores, planner_hidden = model.planner_step(
                                    question_var, img_feat_var,
                                    Variable(action_in), planner_hidden)

                                prob = F.softmax(planner_scores, dim=1)
                                action = int(
                                    prob.max(1)[1].data.cpu().numpy()[0])
                                planner_actions.append(action)

                            pred_actions.append(action)
                            img, _, episode_done = h3d.step(action)

                            episode_done = episode_done or episode_length >= args.max_episode_length

                            img = torch.from_numpy(img.transpose(
                                2, 0, 1)).float() / 255.0
                            img_feat_var = eval_loader.dataset.cnn(
                                Variable(img.view(1, 3, 224, 224)
                                         .cuda())).view(1, 1, 3200)

                            dists_to_target.append(
                                h3d.get_dist_to_target(h3d.env.cam.pos))
                            pos_queue.append([
                                h3d.env.cam.pos.x, h3d.env.cam.pos.y,
                                h3d.env.cam.pos.z, h3d.env.cam.yaw
                            ])

                            if episode_done == True:
                                break

                            # query controller to continue or not
                            controller_action_in = Variable(
                                torch.LongTensor(1,
                                                 1).fill_(action).cuda())
                            controller_scores = model.controller_step(
                                img_feat_var, controller_action_in,
                                planner_hidden[0])

                            prob = F.softmax(controller_scores, dim=1)
                            controller_action = int(
                                prob.max(1)[1].data.cpu().numpy()[0])

                            if controller_action == 1 and controller_action_counter < 4:
                                controller_action_counter += 1
                                controller_step = True
                            else:
                                controller_action_counter = 0
                                controller_step = False
                                controller_action = 0

                            controller_actions.append(controller_action)

                            action_in = torch.LongTensor(
                                1, 1).fill_(action + 1).cuda()

                    # compute stats
                    metrics_slug['d_0_' + str(i)] = dists_to_target[0]
                    metrics_slug['d_T_' + str(i)] = dists_to_target[-1]
                    metrics_slug['d_D_' + str(
                        i)] = dists_to_target[0] - dists_to_target[-1]
                    metrics_slug['d_min_' + str(i)] = np.array(
                        dists_to_target).min()
                    metrics_slug['ep_len_' + str(i)] = episode_length
                    if action == 3:
                        metrics_slug['stop_' + str(i)] = 1
                    else:
                        metrics_slug['stop_' + str(i)] = 0
                    inside_room = []
                    for p in pos_queue:
                        inside_room.append(
                            h3d.is_inside_room(
                                p, eval_loader.dataset.target_room))
                    if inside_room[-1] == True:
                        metrics_slug['r_T_' + str(i)] = 1
                    else:
                        metrics_slug['r_T_' + str(i)] = 0
                    if any([x == True for x in inside_room]) == True:
                        metrics_slug['r_e_' + str(i)] = 1
                    else:
                        metrics_slug['r_e_' + str(i)] = 0

                # collate and update metrics
                metrics_list = []
                for i in metrics.metric_names:
                    if i not in metrics_slug:
                        metrics_list.append(metrics.metrics[
                            metrics.metric_names.index(i)][0])
                    else:
                        metrics_list.append(metrics_slug[i])

                # update metrics
                metrics.update(metrics_list)

            try:
                print(metrics.get_stat_string(mode=0))
            except:
                pass

            print('epoch', epoch)
            print('invalids', len(invalids))

            eval_loader.dataset._load_envs()
            if len(eval_loader.dataset.pruned_env_set) == 0:
                done = True


        # checkpoint if best val loss
        print("epoch {}: if {} > best_eval_acc {}".format(epoch, metrics.metrics[8][0], best_eval_acc))
        if metrics.metrics[8][0] > best_eval_acc:  # d_D_50
            best_eval_acc = metrics.metrics[8][0]
            if epoch % args.eval_every == 0 and args.to_log == 1:
                metrics.dump_log()

                model_state = get_state(model)

                aad = dict(args.__dict__)
                ad = {}
                for i in aad:
                    if i[0] != '_':
                        ad[i] = aad[i]

                checkpoint = {'args': ad, 'state': model_state, 'epoch': epoch}

                checkpoint_path = '%s/epoch_%d_d_D_50_%.04f.pt' % (
                    args.checkpoint_dir, epoch, best_eval_acc)
                print('Saving checkpoint to %s' % checkpoint_path)
                torch.save(checkpoint, checkpoint_path)

        print('[best_eval_d_D_50:%.04f]' % best_eval_acc)

        eval_loader.dataset._load_envs(start_idx=0, in_order=True)
        epoch += 1
Example no. 38
    def forward(self,
                input,
                hidden,
                encoder_output,
                encoder_outputs,
                input_variable,
                attn=False):
        output = self.embedding(input).unsqueeze(0)  #.view(1, 1, -1)
        output = self.dropout(output)

        attn_weights = None
        if attn == 1:
            #print(output)
            #print("is the output")
            #print(" ")
            #print(hidden)
            #print("is the hidden")
            #print(" ")
            if self.recurrent_unit == "LSTM" or self.recurrent_unit == "MyLSTM" or self.recurrent_unit == "LSTMSqueeze" or self.recurrent_unit == "ONLSTM":
                attn_weights = F.softmax(
                    self.attn(torch.cat((output[0], hidden[0][0]), 1)), dim=1)
            else:
                attn_weights = F.softmax(
                    self.attn(torch.cat((output[0], hidden[0]), 1)), dim=1)

            # print(attn_weights.unsqueeze(1))
            # print(encoder_outputs.transpose(0,1))
            # attn_applied = torch.bmm(attn_weights.unsqueeze(0), encoder_outputs.unsqueeze(0))
            attn_applied = torch.bmm(attn_weights.unsqueeze(1),
                                     encoder_outputs.transpose(0, 1))
            #print(attn_applied)
            attn_applied = attn_applied.transpose(0, 1)

            #print(output)
            #print(attn_applied)

            output = torch.cat((output[0], attn_applied[0]), 1)
            #print(output)
            output = self.attn_combine(output).unsqueeze(0)
            # print(output)

        if attn == 2:  # For the other type of attention
            #print("encoder_outputs", encoder_outputs)
            #print("input_variable", input_variable)
            input_length = input_variable.size()[
                0]  # Check if this is the right index
            u_i = Variable(torch.zeros(len(encoder_outputs), batch_size))
            #print("u_i", u_i)

            if use_cuda:
                u_i = u_i.cuda()
            for i in range(
                    input_length
            ):  # can this be done with just matrix operations (i.e. without a for loop)? (probably)
                #print("enc out input", encoder_outputs[i].unsqueeze(0))
                #print("hidden_reshaped", hidden[0].unsqueeze(0))
                #print("output", output)
                #print("output_reshaped", output.unsqueeze(0))

                if self.recurrent_unit == "LSTM" or self.recurrent_unit == "MyLSTM" or self.recurrent_unit == "LSTMSqueeze" or self.recurrent_unit == "ONLSTM":
                    attn_hidden = torch.tanh(
                        self.attn_layer(
                            torch.cat((encoder_outputs[i].unsqueeze(0),
                                       hidden[0][0].unsqueeze(0), output), 2)))
                else:
                    attn_hidden = torch.tanh(
                        self.attn_layer(
                            torch.cat((encoder_outputs[i].unsqueeze(0),
                                       hidden[0].unsqueeze(0), output),
                                      2)))  # the view(-1) is probably bad
#print("attn_hidden", attn_hidden)
#print("v", self.v.unsqueeze(1).unsqueeze(0))
                u_i_j = torch.bmm(attn_hidden,
                                  self.v.unsqueeze(1).unsqueeze(0))
                #print("u_i_j", u_i_j)
                #print("u_i_j[0][0][0]", u_i_j[0][0][0])
                u_i[i] = u_i_j[0].view(-1)

            a_i = F.softmax(u_i.transpose(
                0, 1))  # is it correct to be log softmax?
            #print("a_i", a_i)
            #print("a_i_reshaped", a_i.unsqueeze(1))
            #print("enc outputs transpose", encoder_outputs.transpose(0,1))
            attn_applied = torch.bmm(a_i.unsqueeze(1),
                                     encoder_outputs.transpose(0, 1))

            #print("attn_applied", attn_applied)
            attn_applied = attn_applied.transpose(0, 1)

            #print("output[0]", output)
            output = torch.cat((output[0], attn_applied[0]), 1)
            output = self.attn_combine(output).unsqueeze(0)
#print("output_end", output)

        for i in range(self.n_layers):
            output = F.relu(output)
            output, hidden = self.rnn(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights
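The comment in the attn == 2 branch asks whether the per-position scoring loop can be replaced by matrix operations; it can. Below is a minimal vectorized sketch, assuming encoder_outputs is (src_len, batch, hidden), hidden_state is (batch, hidden), output is (1, batch, hidden), and attn_layer / v play the same roles as self.attn_layer / self.v above; all names here are illustrative, not part of the original module.

import torch
import torch.nn.functional as F

def additive_attention(encoder_outputs, hidden_state, output, attn_layer, v):
    src_len = encoder_outputs.size(0)
    # Broadcast the decoder state and the embedded input over all source positions.
    hidden_exp = hidden_state.unsqueeze(0).expand(src_len, -1, -1)
    output_exp = output.expand(src_len, -1, -1)
    # (src_len, batch, 3 * hidden) -> (src_len, batch, attn_dim)
    attn_hidden = torch.tanh(
        attn_layer(torch.cat((encoder_outputs, hidden_exp, output_exp), dim=2)))
    # Project onto the scoring vector v: (src_len, batch)
    u = torch.einsum('sba,a->sb', attn_hidden, v)
    # Normalize over source positions for every batch element.
    a = F.softmax(u.transpose(0, 1), dim=1)                # (batch, src_len)
    context = torch.bmm(a.unsqueeze(1),                    # (batch, 1, src_len)
                        encoder_outputs.transpose(0, 1))   # (batch, src_len, hidden)
    return a, context.transpose(0, 1)                      # (1, batch, hidden)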
Esempio n. 39
0
    def forward(self, source: List[List[str]],
                target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): a scalar tensor holding the sum, over the batch, of the
                                    log-likelihoods of generating the gold-standard target
                                    sentences. Here b = batch size.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors

        target_padded = self.vocab.tgt.to_input_tensor(
            target, device=self.device)  # (tgt_len, b)

        source_padded_chars = self.vocab.src.to_input_tensor_char(
            source, device=self.device)  # (src_len, b, w_len)
        target_padded_chars = self.vocab.tgt.to_input_tensor_char(
            target, device=self.device)  # (tgt_len, b, w_len)

        enc_hiddens, dec_init_state = self.encode(source_padded_chars,
                                                  source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state,
                                       target_padded_chars)

        ### Character-level tensors (source_padded_chars / target_padded_chars) are
        ### fed to encode() and decode() above; target_padded is kept for scoring.

        P = F.log_softmax(self.target_vocab_projection(combined_outputs),
                          dim=-1)

        # Zero out probabilities at positions where the target text is padding
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Compute log probability of generating true target words
        target_gold_words_log_prob = torch.gather(
            P, index=target_padded[1:].unsqueeze(-1),
            dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(
        )  # mhahn2 Small modification from A4 code.

        if self.charDecoder is not None:
            max_word_len = target_padded_chars.shape[-1]

            target_words = target_padded[1:].contiguous().view(-1)
            target_chars = target_padded_chars[1:].reshape(-1, max_word_len)
            target_outputs = combined_outputs.view(-1, 256)

            target_chars_oov = target_chars  # torch.index_select(target_chars, dim=0, index=oovIndices)
            rnn_states_oov = target_outputs  # torch.index_select(target_outputs, dim=0, index=oovIndices)
            oovs_losses = self.charDecoder.train_forward(
                target_chars_oov.t().contiguous(),
                (rnn_states_oov.unsqueeze(0), rnn_states_oov.unsqueeze(0)))
            scores = scores - oovs_losses

        return scores
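The gather step above is the heart of the log-likelihood computation described in the docstring: P holds per-position log-probabilities over the vocabulary, and indexing it with the gold token ids picks out exactly the terms that get summed. A standalone sketch with made-up shapes (two time steps, batch of 3, vocab of 5):

import torch
import torch.nn.functional as F

logits = torch.randn(2, 3, 5)                 # (tgt_len, batch, vocab)
P = F.log_softmax(logits, dim=-1)             # per-token log-probabilities
gold = torch.tensor([[1, 4, 0], [2, 2, 3]])   # (tgt_len, batch) gold token ids
# Pick the log-probability of each gold token, then sum over time and batch.
gold_log_prob = torch.gather(P, index=gold.unsqueeze(-1), dim=-1).squeeze(-1)
score = gold_log_prob.sum()
print(gold_log_prob.shape, score)             # torch.Size([2, 3]) and a scalar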
Esempio n. 40
0
    def decode(self, hbatch, lengths, model_lm=None):
        device = hbatch.device
        batch_size = hbatch.size(0)
        num_frames = hbatch.size(1)
        beam_width = self.hp.beam_width
        e_mask = torch.ones((batch_size, num_frames, 1),
                            device=device,
                            requires_grad=False)

        token_beam_sel = [
            ([], 0.0, (torch.zeros((batch_size, self.num_decoder_hidden_nodes),
                                   device=device,
                                   requires_grad=False),
                       torch.zeros((batch_size, self.num_decoder_hidden_nodes),
                                   device=device,
                                   requires_grad=False),
                       torch.zeros((batch_size, 1, num_frames),
                                   device=device,
                                   requires_grad=False)))
        ]

        for i, tmp in enumerate(lengths):
            if tmp < num_frames:
                e_mask[i, tmp:] = 0.0

        alpha_accum = []
        for seq_step in range(self.hp.max_decoder_seq_len):
            token_beam_all = []

            for current_token in token_beam_sel:
                cand_seq, cand_seq_score, (c, s, alpha) = current_token

                g, alpha = self.att(s, hbatch, alpha, e_mask)
                alpha_accum.append(alpha.cpu().numpy())

                # generate
                y = self.L_yy(torch.tanh(self.L_gy(g) + self.L_sy(s)))

                if self.hp.score_func == 'log_softmax':
                    y = F.log_softmax(y, dim=1)
                    if model_lm is not None and len(cand_seq) > 0:
                        lm_input = torch.from_numpy(np.array(
                            [cand_seq])).to(device).long()
                        lm_score = model_lm(lm_input)[:, -1, :]
                        tmpy = y + self.hp.lm_weight * F.log_softmax(lm_score,
                                                                     dim=1)
                    else:
                        tmpy = y.clone()
                elif self.hp.score_func == 'softmax':
                    y = F.softmax(y, dim=1)
                    if model_lm is not None and len(cand_seq) > 0:
                        lm_input = torch.from_numpy(np.array(
                            [cand_seq])).to(device).long()
                        lm_score = model_lm(lm_input)[:, -1, :]
                        tmpy = y + self.hp.lm_weight * F.softmax(lm_score, dim=1)
                    else:
                        tmpy = y.clone()

                for _ in range(beam_width):
                    bestidx = tmpy.data.argmax(1)

                    tmpseq = cand_seq.copy()
                    tmpseq.append(bestidx.item())

                    tmpscore = cand_seq_score + tmpy.data[0][bestidx]
                    tmpy.data[0][bestidx] = -10000000000.0

                    rec_input = self.L_ys(bestidx) + self.L_ss(s) + self.L_gs(
                        g)
                    tmps, tmpc = self._func_lstm(rec_input, c)

                    token_beam_all.append(
                        (tmpseq, tmpscore, (tmpc, tmps, alpha)))
            sorted_token_beam_all = sorted(token_beam_all,
                                           key=itemgetter(1),
                                           reverse=True)
            token_beam_sel = sorted_token_beam_all[:beam_width]
            results = []
            if token_beam_sel[0][0][-1] == self.hp.eos_id:
                for character in token_beam_sel[0][0]:
                    results.append(character)
                break
        alpha_accum = np.array(alpha_accum)
        return results
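The inner loop above extracts the beam_width best tokens by repeatedly taking argmax and masking the winner with a large negative value; topk performs the same selection in one call, which is what decode_v2 below does. A tiny sketch of the equivalence (the tensor values are made up):

import torch

tmpy = torch.randn(1, 10)        # scores over the vocabulary for one hypothesis
beam_width = 3
# topk returns the same tokens the argmax-and-mask loop would find, already sorted.
best_scores, best_indices = tmpy.topk(beam_width, dim=1)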
Esempio n. 41
0
    def decode_v2(self, hbatch, lengths, model_lm=None):
        """
        decode function with a few modification.
        1. Add the candidate when the prediction is </s>
        """
        device = hbatch.device
        #import sentencepiece as spm
        #sp = spm.SentencePieceProcessor()
        #sp.Load(self.hp.spm_model)
        batch_size = hbatch.shape[0]
        num_frames = hbatch.shape[1]
        e_mask = torch.ones((batch_size, num_frames, 1),
                            device=device,
                            requires_grad=False)

        beam_width = self.hp.beam_width
        beam_search = {
            'result':
            torch.zeros((beam_width, self.hp.max_decoder_seq_len),
                        device=device,
                        dtype=torch.long),
            'length':
            torch.zeros(beam_width).long(),
            'score':
            torch.zeros((beam_width), device=device,
                        dtype=torch.float).fill_(0),
            'c':
            torch.zeros((beam_width, self.num_decoder_hidden_nodes),
                        device=device),
            's':
            torch.zeros((beam_width, self.num_decoder_hidden_nodes),
                        device=device),
            'alpha':
            torch.zeros((beam_width, self.hp.max_decoder_seq_len, num_frames),
                        device=device)
        }

        beam_results = {
            'score':
            torch.zeros((beam_width), device=device,
                        dtype=torch.float).fill_(0),
            'result':
            torch.zeros((beam_width, self.hp.max_decoder_seq_len),
                        device=device,
                        dtype=torch.long),
            'length':
            torch.zeros(beam_width).long(),
            'alpha':
            torch.zeros((beam_width, self.hp.max_decoder_seq_len, num_frames),
                        device=device,
                        requires_grad=False)
        }

        beam_step = 0

        for i, tmp in enumerate(lengths):
            if tmp < num_frames:
                e_mask[i, tmp:] = 0.0

        for seq_step in range(self.hp.max_decoder_seq_len):
            # length_penalty = ((5 + seq_step + 1)**0.9 / (5 + 1)**0.9)
            cand_seq = copy.deepcopy(beam_search['result'])
            cand_score = copy.deepcopy(beam_search['score'].unsqueeze(1))
            c = copy.deepcopy(beam_search['c'])
            s = copy.deepcopy(beam_search['s'])
            cand_alpha = copy.deepcopy(beam_search['alpha'])
            if seq_step == 0:
                g, alpha = self.att(s, hbatch,
                                    cand_alpha[:, seq_step, :].unsqueeze(1),
                                    e_mask)
            else:
                g, alpha = self.att(
                    s, hbatch, cand_alpha[:, seq_step - 1, :].unsqueeze(1),
                    e_mask)
            # generate
            y = self.L_yy(torch.tanh(self.L_gy(g) + self.L_sy(s)))

            if self.hp.score_func == 'log_softmax':
                y = F.log_softmax(y, dim=1)
                if model_lm is not None and seq_step > 0:
                    lm_input = cand_seq[:, :seq_step]
                    lm_score = model_lm(lm_input)[:, -1, :]
                    tmpy = y + self.hp.lm_weight * F.log_softmax(lm_score,
                                                                 dim=1)
                else:
                    tmpy = y.clone()
            elif self.hp.score_func == 'softmax':
                y = F.softmax(y, dim=1)
                if model_lm is not None and seq_step > 0:
                    lm_input = cand_seq[:, :seq_step]
                    lm_score = model_lm(lm_input)[:, -1, :]
                    tmpy = y + self.hp.lm_weight * F.softmax(lm_score, dim=1)
                else:
                    tmpy = y.clone()

            best_scores, best_indices = tmpy.data.topk(beam_width, dim=1)
            scores = cand_score + best_scores + 1  #0.5
            tmp_s = torch.zeros((beam_width, self.num_decoder_hidden_nodes),
                                device=device)
            tmp_c = torch.zeros((beam_width, self.num_decoder_hidden_nodes),
                                device=device)

            if seq_step == 0:
                beam_search['score'] = scores[0]
                beam_search['result'][:, 0] = best_indices[0]
                beam_search['length'] += 1
                beam_search['alpha'][:, 0, :] = alpha.squeeze(1)
                tmp_s = s
                tmp_c = c
                rec_input = self.L_ys(
                    best_indices[0]) + self.L_ss(tmp_s) + self.L_gs(g)
                tmps, tmpc = self._func_lstm(rec_input, tmp_c)
                beam_search['s'] = tmps
                beam_search['c'] = tmpc
            else:
                k_scores, k_ix = scores.reshape(-1).topk(beam_width * 2)
                cand_idx = k_ix // beam_width
                cand_ids = k_ix % beam_width

                num_cand = 0
                i_cand = 0
                tmp_bestidx = torch.zeros((beam_width),
                                          dtype=torch.long,
                                          device=device)
                tmp_g = torch.zeros(
                    (beam_width, self.num_decoder_hidden_nodes * 2),
                    dtype=torch.float,
                    device=device)

                while num_cand < beam_width:
                    if best_indices[cand_idx[i_cand],
                                    cand_ids[i_cand]] == self.hp.eos_id:
                        beam_results['score'][beam_step] = k_scores[i_cand]
                        beam_results['result'][beam_step] = cand_seq[
                            cand_idx[i_cand]]
                        beam_results['result'][beam_step][
                            seq_step] = best_indices[cand_idx[i_cand],
                                                     cand_ids[i_cand]]
                        beam_results['length'][beam_step] = seq_step + 1
                        beam_results['alpha'][beam_step] = cand_alpha[
                            cand_idx[i_cand], :, :]
                        beam_results['alpha'][beam_step][seq_step] = alpha[
                            cand_idx[i_cand]].squeeze(0)
                        beam_step += 1
                        i_cand += 1
                    else:
                        beam_search['score'][num_cand] = k_scores[i_cand]
                        beam_search['result'][num_cand] = cand_seq[
                            cand_idx[i_cand]]
                        beam_search['result'][num_cand][
                            seq_step] = best_indices[cand_idx[i_cand],
                                                     cand_ids[i_cand]]
                        beam_search['length'][num_cand] += 1
                        tmp_bestidx[num_cand] = best_indices[cand_idx[i_cand],
                                                             cand_ids[i_cand]]
                        beam_search['alpha'][num_cand] = cand_alpha[
                            cand_idx[i_cand], :, :]
                        beam_search['alpha'][num_cand][seq_step] = alpha[
                            cand_idx[i_cand]].squeeze(0)
                        tmp_s[num_cand] = s[cand_idx[i_cand]]
                        tmp_c[num_cand] = c[cand_idx[i_cand]]
                        tmp_g[num_cand] = g[cand_idx[i_cand]]

                        i_cand += 1
                        num_cand += 1

                    if beam_step >= beam_width:
                        break

                rec_input = self.L_ys(tmp_bestidx) + self.L_ss(
                    tmp_s) + self.L_gs(tmp_g)
                tmps, tmpc = self._func_lstm(rec_input, tmp_c)
                beam_search['s'] = tmps
                beam_search['c'] = tmpc

                if beam_step >= beam_width:
                    break
        best_idx = beam_results['score'].argmax()
        length = beam_results['length'][best_idx]
        results = beam_results['result'][best_idx][:length].cpu().tolist()

        return results
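Both decode routines above blend the decoder's scores with an external language model by simply adding the two log_softmax outputs, weighted by hp.lm_weight (shallow fusion), and then expanding the beam from the fused scores. A minimal sketch of that combination step, with made-up tensors standing in for the decoder and LM logits:

import torch
import torch.nn.functional as F

lm_weight = 0.3                               # plays the role of hp.lm_weight
decoder_logits = torch.randn(1, 100)          # (beam, vocab)
lm_logits = torch.randn(1, 100)

fused = F.log_softmax(decoder_logits, dim=1) \
        + lm_weight * F.log_softmax(lm_logits, dim=1)
scores, tokens = fused.topk(5, dim=1)         # candidates for beam expansion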
Esempio n. 42
0
 def forward(self, x):
     return F.log_softmax(self.proj(x), dim=-1)
Esempio n. 43
0
 def forward(self, data):
     x, edge_index = data.x, data.edge_index
     x = self.conv1(x, edge_index)
     return F.log_softmax(x, dim=1)
Esempio n. 44
0
def regr_fcn(logits, multi_label=False):
    if multi_label:
        return torch.sigmoid(logits)
    else:
        return F.log_softmax(logits, 1)
Esempio n. 45
0
 def forward(self, x):
     x = x.view(-1, self.iSize * self.iSize)
     x = self.fc1(x)
     x = torch.relu(x)
     x = self.fc2(x)
     return F.log_softmax(x, dim=1)
Esempio n. 46
0
 def forward(self, x, edge_index):
     x = F.relu(self.conv1(x, edge_index))
     x = F.dropout(x, training=self.training)
     x = self.conv2(x, edge_index)
     return F.log_softmax(x, dim=1)
Esempio n. 47
0
 def decoder(self, z):
     z = torch.tanh(self.fc5(torch.tanh(self.fc4(z))))
     x = self.fc6(z)
     x = F.log_softmax(x, dim=1)
     return x
Esempio n. 48
0
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")

    parser.add_argument('--kshot',
                        type=int,
                        default=5,
                        help="number of labeled training examples per class (k-shot)")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=16,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=64,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=1e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")

    args = parser.parse_args()

    processors = {"rte": RteProcessor}

    output_modes = {"rte": "classification"}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    output_mode = output_modes[task_name]

    # train_examples = processor.get_RTE_as_train_k_shot('/export/home/Dataset/glue_data/RTE/train.tsv', args.kshot) #train_pu_half_v1.txt
    # dev_examples = processor.get_RTE_as_dev('/export/home/Dataset/glue_data/RTE/dev.tsv')
    # test_examples = processor.get_RTE_as_test('/export/home/Dataset/RTE/test_RTE_1235.txt')

    scitail_path = '/export/home/Dataset/SciTailV1/tsv_format/'
    train_examples = processor.get_SciTail_as_train_k_shot(
        scitail_path + 'scitail_1.0_train.tsv',
        args.kshot)  #train_pu_half_v1.txt
    dev_examples, test_examples = processor.get_SciTail_dev_and_test(
        scitail_path + 'scitail_1.0_dev.tsv',
        scitail_path + 'scitail_1.0_test.tsv')

    label_list = ["entails", "neutral"]
    num_labels = len(label_list)
    print('num_labels:', num_labels, 'training size:', len(train_examples),
          'dev size:', len(dev_examples), 'test size:', len(test_examples))

    num_train_optimization_steps = None
    num_train_optimization_steps = int(
        len(train_examples) / args.train_batch_size /
        args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
        num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
        )

    model = RobertaForSequenceClassification(3)
    tokenizer = RobertaTokenizer.from_pretrained(
        pretrain_model_dir, do_lower_case=args.do_lower_case)
    model.load_state_dict(
        torch.load(
            '/export/home/Dataset/BERT_pretrained_mine/MNLI_pretrained/_acc_0.9040886899918633.pt'
        ))
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    max_test_acc = 0.0
    max_dev_acc = 0.0
    if args.do_train:
        train_features = convert_examples_to_features(
            train_examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            output_mode,
            cls_token_at_end=
            False,  #bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,  #2 if args.model_type in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=
            True,  #bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=
            False,  #bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token
                                                       ])[0],
            pad_token_segment_id=0
        )  #4 if args.model_type in ['xlnet'] else 0,)
        '''load dev set'''
        dev_features = convert_examples_to_features(
            dev_examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            output_mode,
            cls_token_at_end=
            False,  #bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,  #2 if args.model_type in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=
            True,  #bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=
            False,  #bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token
                                                       ])[0],
            pad_token_segment_id=0
        )  #4 if args.model_type in ['xlnet'] else 0,)

        dev_all_input_ids = torch.tensor([f.input_ids for f in dev_features],
                                         dtype=torch.long)
        dev_all_input_mask = torch.tensor([f.input_mask for f in dev_features],
                                          dtype=torch.long)
        dev_all_segment_ids = torch.tensor(
            [f.segment_ids for f in dev_features], dtype=torch.long)
        dev_all_label_ids = torch.tensor([f.label_id for f in dev_features],
                                         dtype=torch.long)

        dev_data = TensorDataset(dev_all_input_ids, dev_all_input_mask,
                                 dev_all_segment_ids, dev_all_label_ids)
        dev_sampler = SequentialSampler(dev_data)
        dev_dataloader = DataLoader(dev_data,
                                    sampler=dev_sampler,
                                    batch_size=args.eval_batch_size)
        '''load test set'''
        test_features = convert_examples_to_features(
            test_examples,
            label_list,
            args.max_seq_length,
            tokenizer,
            output_mode,
            cls_token_at_end=
            False,  #bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
            cls_token=tokenizer.cls_token,
            cls_token_segment_id=0,  #2 if args.model_type in ['xlnet'] else 0,
            sep_token=tokenizer.sep_token,
            sep_token_extra=
            True,  #bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
            pad_on_left=
            False,  #bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token
                                                       ])[0],
            pad_token_segment_id=0
        )  #4 if args.model_type in ['xlnet'] else 0,)

        eval_all_input_ids = torch.tensor([f.input_ids for f in test_features],
                                          dtype=torch.long)
        eval_all_input_mask = torch.tensor(
            [f.input_mask for f in test_features], dtype=torch.long)
        eval_all_segment_ids = torch.tensor(
            [f.segment_ids for f in test_features], dtype=torch.long)
        eval_all_label_ids = torch.tensor([f.label_id for f in test_features],
                                          dtype=torch.long)

        eval_data = TensorDataset(eval_all_input_ids, eval_all_input_mask,
                                  eval_all_segment_ids, eval_all_label_ids)
        eval_sampler = SequentialSampler(eval_data)
        test_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
        train_sampler = RandomSampler(train_data)

        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        iter_co = 0
        final_test_performance = 0.0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                model.train()
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids = batch

                logits = model(input_ids, input_mask)
                # loss_fct = CrossEntropyLoss()

                prob_matrix = F.log_softmax(logits.view(-1, 3), dim=1)
                '''multiplying by 1.0 copies the tensor, so the in-place edit below does not touch prob_matrix'''
                new_prob_matrix = prob_matrix * 1.0
                '''flip the "entails" probability to p or 1-p, depending on the gold label'''
                changed_places = torch.nonzero(label_ids.view(-1),
                                               as_tuple=False)
                new_prob_matrix[changed_places,
                                0] = 1.0 - prob_matrix[changed_places, 0]

                loss = F.nll_loss(
                    new_prob_matrix,
                    torch.zeros_like(label_ids).to(device).view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                iter_co += 1
                # if iter_co %20==0:
                if iter_co % len(train_dataloader) == 0:
                    '''
                    start evaluating on the dev set after this epoch
                    '''
                    model.eval()

                    for idd, dev_or_test_dataloader in enumerate(
                        [dev_dataloader, test_dataloader]):

                        if idd == 0:
                            logger.info("***** Running dev *****")
                            logger.info("  Num examples = %d",
                                        len(dev_examples))
                        else:
                            logger.info("***** Running test *****")
                            logger.info("  Num examples = %d",
                                        len(test_examples))
                        # logger.info("  Batch size = %d", args.eval_batch_size)

                        eval_loss = 0
                        nb_eval_steps = 0
                        preds = []
                        gold_label_ids = []
                        # print('Evaluating...')
                        for input_ids, input_mask, segment_ids, label_ids in dev_or_test_dataloader:
                            input_ids = input_ids.to(device)
                            input_mask = input_mask.to(device)
                            segment_ids = segment_ids.to(device)
                            label_ids = label_ids.to(device)
                            gold_label_ids += list(
                                label_ids.detach().cpu().numpy())

                            with torch.no_grad():
                                logits = model(input_ids, input_mask)
                            if len(preds) == 0:
                                preds.append(logits.detach().cpu().numpy())
                            else:
                                preds[0] = np.append(
                                    preds[0],
                                    logits.detach().cpu().numpy(),
                                    axis=0)

                        preds = preds[0]

                        pred_probs = softmax(preds, axis=1)
                        pred_label_ids_3way = list(
                            np.argmax(pred_probs, axis=1))
                        '''change from 3-way to 2-way'''
                        pred_label_ids = []
                        for pred_id in pred_label_ids_3way:
                            if pred_id != 0:
                                pred_label_ids.append(1)
                            else:
                                pred_label_ids.append(0)

                        assert len(pred_label_ids) == len(gold_label_ids)
                        hit_co = 0
                        for k in range(len(pred_label_ids)):
                            if pred_label_ids[k] == gold_label_ids[k]:
                                hit_co += 1
                        test_acc = hit_co / len(gold_label_ids)

                        if idd == 0:  # this is dev
                            if test_acc > max_dev_acc:
                                max_dev_acc = test_acc
                                print('\ndev acc:', test_acc, ' max_dev_acc:',
                                      max_dev_acc, '\n')

                            else:
                                print('\ndev acc:', test_acc, ' max_dev_acc:',
                                      max_dev_acc, '\n')
                                break
                        else:  # this is test
                            if test_acc > max_test_acc:
                                max_test_acc = test_acc

                            final_test_performance = test_acc
                            print('\ntest acc:', test_acc, ' max_test_acc:',
                                  max_test_acc, '\n')
        print('final_test_performance:', final_test_performance)
Esempio n. 49
0
 def forward(self, x, target):
     target = torch.zeros_like(x).scatter_(1, target.unsqueeze(1), 1)
     smoothed_target = (1 - self.e) * target + self.e / x.size(1)
     loss = (- F.log_softmax(x, dim=1) * smoothed_target).sum(dim=1)
     return loss.mean()
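A quick way to sanity-check the smoothing loss above: with e = 0, the smoothed target collapses to the one-hot vector and the loss reduces to ordinary cross entropy. A minimal standalone sketch (the smoothing factor e is taken from the snippet; everything else is made up):

import torch
import torch.nn.functional as F

x = torch.randn(4, 10)                  # logits
target = torch.randint(0, 10, (4,))
e = 0.1

one_hot = torch.zeros_like(x).scatter_(1, target.unsqueeze(1), 1)
smoothed = (1 - e) * one_hot + e / x.size(1)
loss = (-F.log_softmax(x, dim=1) * smoothed).sum(dim=1).mean()
# With e = 0, `loss` matches F.cross_entropy(x, target) exactly.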
Esempio n. 50
0
 def log_prob(self, x):
     """Calculate the log prob of all vocab."""
     x = self.linear(x, linear=True)
     log_prob = F.log_softmax(x, dim=-1, dtype=torch.float32)
     return log_prob
Esempio n. 51
0
        zn = Variable(Tensor(np.random.normal(0, 1, (32, 2048))))
        
        opt_g.zero_grad()
        opt_f.zero_grad()

        s_bottleneck = netG(s_imgs)
        t_bottleneck = netG(t_imgs)      
        
        s_fc2_emb, s_logit = netF(s_bottleneck)
        t_fc2_emb, t_logit = netF(t_bottleneck)
        
        s_cls_loss = get_cls_loss(s_logit, s_labels)

        #kl-divergence
        feat_s_kl = s_bottleneck.view(-1,2048)
        loss_kld_s = F.kl_div(F.log_softmax(feat_s_kl, dim=1), F.softmax(zn, dim=1))

        #distribution alignment loss (DAL)
        loss_dal= CriterionDAL(feat_t_recon, feat_zn_recon)
        
        t_prob = F.softmax(t_logit, dim=1)
        t_entropy_loss = get_entropy_loss(t_prob)
        
        #updated loss function
        loss = s_cls_loss + t_entropy_loss + args.alpha * loss_kld_s  + args.beta * loss_dal
        loss.backward()
        
        if (i+1) % 5 == 0:
            print ("cls_loss: %.4f, entropy_loss: %.4f" % (s_cls_loss.item(), t_entropy_loss.item()))
        
        opt_g.step()
Esempio n. 52
0
def get_cls_loss(pred, gt):
    cls_loss = F.nll_loss(F.log_softmax(pred, dim=1), gt)
    return cls_loss
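For reference, F.nll_loss applied to F.log_softmax outputs is exactly what F.cross_entropy computes internally, so the helper above is equivalent to calling F.cross_entropy directly. A quick check with made-up tensors:

import torch
import torch.nn.functional as F

pred = torch.randn(8, 5)
gt = torch.randint(0, 5, (8,))
assert torch.allclose(F.nll_loss(F.log_softmax(pred, dim=1), gt),
                      F.cross_entropy(pred, gt))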
Esempio n. 53
0
 def forward(self, s_logits, t_logits):
     s_prob = F.log_softmax(s_logits / self.temperature, 1)
     t_prob = F.softmax(t_logits / self.temperature, 1)
     loss = self.klloss(s_prob,
                        t_prob) * self.temperature * self.temperature
     return loss
Esempio n. 54
0
def distillation(y, teacher_scores, labels, T, alpha):
    return F.kl_div(F.log_softmax(y/T, dim=1), F.softmax(teacher_scores/T, dim=1)) * (T*T * 2. * alpha)\
           + F.cross_entropy(y, labels) * (1. - alpha)
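Note that F.kl_div defaults to reduction='mean', which averages over every element rather than over the batch, and the T*T * 2. * alpha factor above partly compensates for that scaling. A variant using reduction='batchmean' (the reduction the PyTorch docs recommend for a true KL divergence) is sketched below; the scaling constant then changes, so treat it as an alternative formulation rather than a drop-in replacement.

import torch.nn.functional as F

def distillation_batchmean(y, teacher_scores, labels, T, alpha):
    # KL term averaged per example instead of per element.
    kd = F.kl_div(F.log_softmax(y / T, dim=1),
                  F.softmax(teacher_scores / T, dim=1),
                  reduction='batchmean') * (T * T * alpha)
    return kd + F.cross_entropy(y, labels) * (1. - alpha)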
Esempio n. 55
0
 def forward(self, x, adj):
     x = F.relu(self.gc1(x, adj))
     x = F.dropout(x, self.dropout,
                    training=self.training)  # the model's training flag must be passed to dropout here
     x = self.gc2(x, adj)
     return F.log_softmax(x, dim=1)
Esempio n. 56
0
    def recognize_beam(self, encoder_outputs, char_list, args):
        """Beam search, decode one utterence now.
        Args:
            encoder_outputs: T x H
            char_list: list of character
            args: args.beam

        Returns:
            nbest_hyps:
        """
        # search params
        beam = args.beam_size
        nbest = args.nbest
        if args.decode_max_len == 0:
            maxlen = encoder_outputs.size(0)
        else:
            maxlen = args.decode_max_len

        # *********Init decoder rnn
        h_list = [self.zero_state(encoder_outputs.unsqueeze(0))]
        c_list = [self.zero_state(encoder_outputs.unsqueeze(0))]
        for l in range(1, self.num_layers):
            h_list.append(self.zero_state(encoder_outputs.unsqueeze(0)))
            c_list.append(self.zero_state(encoder_outputs.unsqueeze(0)))
        att_c = self.zero_state(encoder_outputs.unsqueeze(0),
                                H=encoder_outputs.unsqueeze(0).size(2))
        # prepare sos
        y = self.sos_id
        vy = encoder_outputs.new_zeros(1).long()

        hyp = {'score': 0.0, 'yseq': [y], 'c_prev': c_list, 'h_prev': h_list,
               'a_prev': att_c}
        hyps = [hyp]
        ended_hyps = []

        for i in range(maxlen):
            hyps_best_kept = []
            for hyp in hyps:
                # vy.unsqueeze(1)
                vy[0] = hyp['yseq'][i]
                embedded = self.embedding(vy)
                # embedded.unsqueeze(0)
                # step 1. decoder RNN: s_i = RNN(s_i−1,y_i−1,c_i−1)
                rnn_input = torch.cat((embedded, hyp['a_prev']), dim=1)
                h_list[0], c_list[0] = self.rnn[0](
                    rnn_input, (hyp['h_prev'][0], hyp['c_prev'][0]))
                for l in range(1, self.num_layers):
                    h_list[l], c_list[l] = self.rnn[l](
                        h_list[l-1], (hyp['h_prev'][l], hyp['c_prev'][l]))
                rnn_output = h_list[-1]
                # step 2. attention: c_i = AttentionContext(s_i,h)
                # below unsqueeze: (N x H) -> (N x 1 x H)
                att_c, att_w = self.attention(rnn_output.unsqueeze(dim=1),
                                              encoder_outputs.unsqueeze(0))
                att_c = att_c.squeeze(dim=1)
                # step 3. concatenate s_i and c_i, and feed them to the MLP
                mlp_input = torch.cat((rnn_output, att_c), dim=1)
                predicted_y_t = self.mlp(mlp_input)
                local_scores = F.log_softmax(predicted_y_t, dim=1)
                # topk scores
                local_best_scores, local_best_ids = torch.topk(
                    local_scores, beam, dim=1)

                for j in range(beam):
                    new_hyp = {}
                    new_hyp['h_prev'] = h_list[:]
                    new_hyp['c_prev'] = c_list[:]
                    new_hyp['a_prev'] = att_c[:]
                    new_hyp['score'] = hyp['score'] + local_best_scores[0, j]
                    new_hyp['yseq'] = [0] * (1 + len(hyp['yseq']))
                    new_hyp['yseq'][:len(hyp['yseq'])] = hyp['yseq']
                    new_hyp['yseq'][len(hyp['yseq'])] = int(
                        local_best_ids[0, j])
                    # will be (2 x beam) hyps at most
                    hyps_best_kept.append(new_hyp)

                hyps_best_kept = sorted(hyps_best_kept,
                                        key=lambda x: x['score'],
                                        reverse=True)[:beam]
            # end for hyp in hyps
            hyps = hyps_best_kept

            # add eos in the final loop to avoid that there are no ended hyps
            if i == maxlen - 1:
                for hyp in hyps:
                    hyp['yseq'].append(self.eos_id)

            # add ended hypotheses to the final list and remove them from the current ones
            # (this can be a problem: the number of hyps may drop below beam)
            remained_hyps = []
            for hyp in hyps:
                if hyp['yseq'][-1] == self.eos_id:
                    # hyp['score'] += (i + 1) * penalty
                    ended_hyps.append(hyp)
                else:
                    remained_hyps.append(hyp)

            hyps = remained_hyps
            if len(hyps) > 0:
                print('remaining hypotheses: ' + str(len(hyps)))
            else:
                print('no hypothesis. Finish decoding.')
                break

            for hyp in hyps:
                print('hypo: ' + ' '.join([char_list[int(x)]
                                          for x in hyp['yseq'][1:]]))
        # end for i in range(maxlen)
        nbest_hyps = sorted(ended_hyps, key=lambda x: x['score'], reverse=True)[
            :min(len(ended_hyps), nbest)]
        #print(nbest_hyps)
        return nbest_hyps
Esempio n. 57
0
train_loader = DataLoader(dataset=tiny_sst,
                          batch_size=5,
                          collate_fn=batcher(device),
                          shuffle=False,
                          num_workers=0)

# training loop
for epoch in range(epochs):
    for step, batch in enumerate(train_loader):
        g = batch.graph
        n = g.number_of_nodes()
        h = th.zeros((n, h_size))
        c = th.zeros((n, h_size))
        logits = model(batch, h, c)
        logp = F.log_softmax(logits, 1)
        loss = F.nll_loss(logp, batch.label, reduction='sum')
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        pred = th.argmax(logits, 1)
        acc = float(th.sum(th.eq(batch.label, pred))) / len(batch.label)
        print("Epoch {:05d} | Step {:05d} | Loss {:.4f} | Acc {:.4f} |".format(
            epoch, step, loss.item(), acc))

##############################################################################
# To train the model on the full dataset with different settings (CPU/GPU,
# etc.), please refer to our repo's
# `example <https://github.com/jermainewang/dgl/tree/master/examples/pytorch/tree_lstm>`__.
# Besides, we also provide an implementation of the Child-Sum Tree LSTM.
Esempio n. 58
0
    def generate_beam(self, src_enc, src_len, tgt_lang_id, beam_size, length_penalty, early_stopping, max_len=200):
        """
        Decode a sentence given initial start.
        `x`:
            - LongTensor(bs, slen)
                <EOS> W1 W2 W3 <EOS> <PAD>
                <EOS> W1 W2 W3   W4  <EOS>
        `lengths`:
            - LongTensor(bs) [5, 6]
        `positions`:
            - False, for regular "arange" positions (LM)
            - True, to reset positions from the new generation (MT)
        `langs`:
            - must be None if the model only supports one language
            - lang_id if only one language is involved (LM)
            - (lang_id1, lang_id2) if two languages are involved (MT)
        """

        # check inputs
        assert src_enc.size(0) == src_len.size(0)
        assert beam_size >= 1

        # batch size / number of words
        bs = len(src_len)
        n_words = self.n_words

        # expand to beam size the source latent representations / source lengths
        src_enc = src_enc.unsqueeze(1).expand((bs, beam_size) + src_enc.shape[1:]).contiguous().view((bs * beam_size,) + src_enc.shape[1:])
        src_len = src_len.unsqueeze(1).expand(bs, beam_size).contiguous().view(-1)

        # generated sentences (batch with beam current hypotheses)
        generated = src_len.new(max_len, bs * beam_size)  # upcoming output
        generated.fill_(self.pad_index)                   # fill upcoming output with <PAD>
        generated[0].fill_(self.eos_index)                # we use <EOS> for <BOS> everywhere

        # generated hypotheses
        generated_hyps = [BeamHypotheses(beam_size, max_len, length_penalty, early_stopping) for _ in range(bs)]

        # positions
        positions = src_len.new(max_len).long()
        positions = torch.arange(max_len, out=positions).unsqueeze(1).expand_as(generated)

        # language IDs
        langs = positions.clone().fill_(tgt_lang_id)

        # scores for each sentence in the beam
        beam_scores = src_enc.new(bs, beam_size).fill_(0)
        beam_scores[:, 1:] = -1e9
        beam_scores = beam_scores.view(-1)

        # current position
        cur_len = 1

        # cache compute states
        cache = {'slen': 0}

        # store cross attention weights
        self.cross_att = defaultdict(list)

        # done sentences
        done = [False for _ in range(bs)]

        while cur_len < max_len:
            # compute word scores
            tensor = self.forward(
                'fwd',
                x=generated[:cur_len],
                lengths=src_len.new(bs * beam_size).fill_(cur_len),
                positions=positions[:cur_len],
                langs=langs[:cur_len],
                causal=True,
                src_enc=src_enc,
                src_len=src_len,
                cache=cache
            )
            assert tensor.size() == (1, bs * beam_size, self.dim)
            tensor = tensor.data[-1, :, :]               # (bs * beam_size, dim)
            scores = self.pred_layer.get_scores(tensor)  # (bs * beam_size, n_words)
            scores = F.log_softmax(scores, dim=-1)       # (bs * beam_size, n_words)
            assert scores.size() == (bs * beam_size, n_words)

            # select next words with scores
            _scores = scores + beam_scores[:, None].expand_as(scores)  # (bs * beam_size, n_words)
            _scores = _scores.view(bs, beam_size * n_words)            # (bs, beam_size * n_words)

            next_scores, next_words = torch.topk(_scores, 2 * beam_size, dim=1, largest=True, sorted=True)
            assert next_scores.size() == next_words.size() == (bs, 2 * beam_size)

            # next batch beam content
            # list of (bs * beam_size) tuple(next hypothesis score, next word, current position in the batch)
            next_batch_beam = []

            # for each sentence
            for sent_id in range(bs):

                # if we are done with this sentence
                done[sent_id] = done[sent_id] or generated_hyps[sent_id].is_done(next_scores[sent_id].max().item())
                if done[sent_id]:
                    next_batch_beam.extend([(0, self.pad_index, 0)] * beam_size)  # pad the batch
                    continue

                # next sentence beam content
                next_sent_beam = []

                # next words for this sentence
                for idx, value in zip(next_words[sent_id], next_scores[sent_id]):

                    # get beam and word IDs
                    beam_id = idx // n_words
                    word_id = idx % n_words

                    # end of sentence, or next word
                    if word_id == self.eos_index or cur_len + 1 == max_len:
                        generated_hyps[sent_id].add(generated[:cur_len, sent_id * beam_size + beam_id].clone(), value.item())
                    else:
                        next_sent_beam.append((value, word_id, sent_id * beam_size + beam_id))

                    # the beam for next step is full
                    if len(next_sent_beam) == beam_size:
                        break

                # update next beam content
                assert len(next_sent_beam) == (0 if cur_len + 1 == max_len else beam_size)
                if len(next_sent_beam) == 0:
                    next_sent_beam = [(0, self.pad_index, 0)] * beam_size  # pad the batch
                next_batch_beam.extend(next_sent_beam)
                assert len(next_batch_beam) == beam_size * (sent_id + 1)

            # sanity check / prepare next batch
            assert len(next_batch_beam) == bs * beam_size
            beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
            beam_words = generated.new([x[1] for x in next_batch_beam])
            beam_idx = src_len.new([x[2] for x in next_batch_beam])

            # re-order batch and internal states
            generated = generated[:, beam_idx]
            generated[cur_len] = beam_words
            for k in cache.keys():
                if k != 'slen':
                    cache[k] = (cache[k][0][beam_idx], cache[k][1][beam_idx])

            # update current length
            cur_len = cur_len + 1

            # stop when we are done with each sentence
            if all(done):
                break

        # select the best hypotheses
        tgt_len = src_len.new(bs)
        best = []

        for i, hypotheses in enumerate(generated_hyps):
            best_hyp = max(hypotheses.hyp, key=lambda x: x[0])[1]
            tgt_len[i] = len(best_hyp) + 1  # +1 for the <EOS> symbol
            best.append(best_hyp)

        # generate target batch
        decoded = src_len.new(tgt_len.max().item(), bs).fill_(self.pad_index)
        for i, hypo in enumerate(best):
            decoded[:tgt_len[i] - 1, i] = hypo
            decoded[tgt_len[i] - 1, i] = self.eos_index

        # sanity check
        assert (decoded == self.eos_index).sum() == 2 * bs

        return decoded, tgt_len
Esempio n. 59
0
 def forward(self, x):
     h = self.fc_layer(x)
     if len(self.output_shape) > 1:
         h = h.view(h.shape[0], *self.output_shape)
     h = F.log_softmax(h, dim=-1)
     return h
Esempio n. 60
0
import numpy as np
import torch
import torch.nn.functional as F

x_ = np.random.randn(8).astype(np.float32)
x = torch.tensor(x_, requires_grad=True)
p = F.log_softmax(x, -1)
p_ = p.detach()
q = F.log_softmax(torch.randn(8), -1)

print(p)
print(q)

kl_div = F.kl_div(q, torch.exp(p), reduction='sum')
print(kl_div)

kl_div.backward()
print(x.grad)

x = torch.tensor(x_, requires_grad=True)
p = F.log_softmax(x, -1)

divided_kl_loss = 0
for index in range(len(x)):
    divided_kl_loss += (q[index] - p_[index]) * torch.exp(
        p_[index]) * -p[index]

divided_kl_loss.backward()
print(x.grad)

x = torch.tensor(x_, requires_grad=True)