Example 1
    def getloss_batch(self, have_action_batch, batch_buffer, batch_stack, batch_action, batch_output,
                      batch_valid_actions, batch_real_actions=None):
        predict_actions = []
        losses = []
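        # Concatenate the buffer, stack, output and action summaries into one
        # representation per sentence; in train mode the action summary is a
        # precomputed action-LSTM output, in predict mode it is the state of the
        # incrementally built action stack, hence the two indexing patterns below.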
        if self.mode == 'train':
            lstms_output = [torch.cat(
                [batch_buffer[batch_idx][0], batch_stack[batch_idx][0][0], batch_output[batch_idx][0][0],
                 batch_action[batch_idx]], 1)
                            for batch_idx in have_action_batch]
        elif self.mode == 'predict':
            lstms_output = [torch.cat(
                [batch_buffer[batch_idx][0], batch_stack[batch_idx][0][0], batch_output[batch_idx][0][0],
                 batch_action[batch_idx][0][0]], 1)
                            for batch_idx in have_action_batch]
        lstms_output = torch.cat(lstms_output, 0)
        hidden_output = torch.tanh(self.lstms_output_2_softmax(self.dropout(lstms_output)))
        logits = self.output_2_act(hidden_output)
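        # For each sentence, keep only the logits of its currently valid
        # transitions and renormalise them with a log-softmax.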
        for idx in range(len(have_action_batch)):
            logit = logits[idx][
                utils.variable(torch.LongTensor(batch_valid_actions[have_action_batch[idx]]), self.gpu_triger)]
            valid_action_tbl = {a: i for i, a in enumerate(batch_valid_actions[have_action_batch[idx]])}
            log_probs = torch.nn.functional.log_softmax(logit, dim=0)
            action_idx = torch.max(log_probs.cpu(), 0)[1].item()
            action_predict = batch_valid_actions[have_action_batch[idx]][action_idx]
            predict_actions.append(action_predict)
            if self.mode == 'train':
                if log_probs is not None:
                    losses.append(log_probs[valid_action_tbl[batch_real_actions[have_action_batch[idx]].item()]])

        if self.mode == 'predict':
            losses = None

        return predict_actions, losses
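The core move in getloss_batch is scoring the full action inventory once, then
restricting the logits to each sentence's valid transitions before the
log-softmax. A minimal self-contained sketch of that pattern (plain PyTorch,
with made-up logits and action ids) looks like this:

    import torch
    import torch.nn.functional as F

    # Hypothetical scores over the full action inventory for one sentence.
    logits = torch.tensor([0.2, -1.3, 0.9, 0.05, -0.4])
    valid_actions = [0, 2, 4]                  # ids of the legal transitions

    valid_logits = logits[torch.LongTensor(valid_actions)]
    valid_action_tbl = {a: i for i, a in enumerate(valid_actions)}
    log_probs = F.log_softmax(valid_logits, dim=0)

    predicted = valid_actions[torch.max(log_probs, 0)[1].item()]
    gold = 2                                   # hypothetical gold action id
    loss = -log_probs[valid_action_tbl[gold]]  # one term of the summed loss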
Example 2
def generate_ner(ner_model, fileout, dataset_loader, action2idx, word2idx,
                 if_cuda):

    idx2action = {v: k for k, v in action2idx.items()}
    idx2word = {v: k for k, v in word2idx.items()}
    ner_model.eval()

    for feature in itertools.chain.from_iterable(
            dataset_loader):  # feature : torch.Size([4, 17])
        fe_v = utils.variable(feature, if_cuda)
        _, pre_action = ner_model(fe_v)
        feature_seq = []
        for sent in fe_v.squeeze(0).data.tolist():
            feature_seq.append([idx2word[w_idx] for w_idx in sent])

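        # Decode the predicted transitions: S(HIFT) opens or extends an entity,
        # O(UT) passes over a non-entity word, and R(EDUCE)-<TYPE> closes the
        # current entity, labelling it with the type after the hyphen.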
        for sent_idx in range(len(pre_action)):
            entities = []
            ner_start_pos = -1
            word_start = -1
            word_idx = 0
            for ac_idx in range(len(pre_action[sent_idx])):
                if idx2action[pre_action[sent_idx][ac_idx]].startswith(
                        'S') and ner_start_pos < 0:
                    ner_start_pos = ac_idx
                    word_start = word_idx
                    word_idx += 1
                elif idx2action[pre_action[sent_idx][ac_idx]].startswith(
                        'O') and ner_start_pos >= 0:
                    ner_start_pos = -1
                    word_idx += 1
                elif idx2action[pre_action[sent_idx][ac_idx]].startswith(
                        'R') and ner_start_pos >= 0:
                    ent = []
                    ent.append(" ".join(
                        feature_seq[sent_idx][word_start:word_idx]))
                    ent.append([ner_start_pos, ac_idx - 1])
                    ent.append(
                        idx2action[pre_action[sent_idx][ac_idx]].split('-')[1])
                    entities.append(ent)
                    ner_start_pos = -1
                else:
                    word_idx += 1

            fileout.write("%s\nEntities: " % (" ".join(feature_seq[sent_idx])))
            for entity in entities:
                fileout.write("%s-%s " % (entity[0], entity[2]))
            fileout.write("\n\n")
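The same S/O/R decoding can be exercised without a model. A small pure-Python
sketch (the action names SHIFT/OUT/REDUCE-<TYPE> are hypothetical; only the
S/O/R prefixes are fixed by the code above) shows how entities fall out of a
transition sequence:

    def decode_entities(words, actions):
        """Mirror generate_ner's decoding: S extends, O skips, R-<TYPE> closes."""
        entities, start, word_idx = [], -1, 0
        for act in actions:
            if act.startswith('S') and start < 0:
                start = word_idx
                word_idx += 1
            elif act.startswith('O') and start >= 0:
                start = -1
                word_idx += 1
            elif act.startswith('R') and start >= 0:
                entities.append((" ".join(words[start:word_idx]), act.split('-')[1]))
                start = -1
            else:
                word_idx += 1
        return entities

    # Hypothetical sentence and transitions: "John Smith" is a two-word PER span.
    print(decode_entities(["John", "Smith", "visited", "Paris"],
                          ["SHIFT", "SHIFT", "REDUCE-PER", "OUT", "SHIFT", "REDUCE-LOC"]))
    # [('John Smith', 'PER'), ('Paris', 'LOC')]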
Example 3
    def forward(self, sentence, actions=None, hidden=None):

        if actions is None:
            mode = 'predict'
        else:
            mode = 'train'

        self.set_seq_size(sentence)
        word_embeds = self.dropout_e(self.word_embeds(sentence))
        word_embeds = word_embeds.squeeze(0)
        if mode == 'train':
            action_embeds = self.dropout_e(self.action_embeds(actions))
            action_embeds = action_embeds.squeeze(0)
            relation_embeds = self.dropout_e(self.relation_embeds(actions))
            relation_embeds = relation_embeds.squeeze(0)
            actions = actions.squeeze(0)

        sentence = sentence.squeeze(0)
        action_count = 0

        buffer = StackRNN(self.buffer_lstm, self.lstm_initial, self.dropout, self._rnn_get_output, self.empty_emb)
        stack = StackRNN(self.stack_lstm, self.lstm_initial, self.dropout, self._rnn_get_output, self.empty_emb)
        action = StackRNN(self.action_lstm, self.lstm_initial, self.dropout, self._rnn_get_output, self.empty_emb)
        output = StackRNN(self.output_lstm, self.lstm_initial, self.dropout, self._rnn_get_output, self.empty_emb)
        ent_f = StackRNN(self.entity_forward_lstm, self.lstm_initial, self.dropout, self._rnn_get_output, self.empty_emb)
        ent_b = StackRNN(self.entity_backward_lstm, self.lstm_initial, self.dropout, self._rnn_get_output, self.empty_emb)

        predict_actions = []
        pre_actions = []
        losses = []

        sentence_array = sentence.data.tolist()
        token_embedding = list()

        for word_idx in range(len(sentence_array)):
            if self.use_spelling:
                if sentence_array[word_idx] == 0:
                    tok_rep = torch.cat([word_embeds[word_idx].unsqueeze(0), self.unk_char_embeds], 1)
                elif sentence_array[word_idx] != 1:
                    word = sentence_array[word_idx]
                    chars_in_word = [self.char2idx[char] for char in self.idx2word[word]]
                    chars_Tensor = utils.variable(torch.from_numpy(np.array(chars_in_word)), self.gpu_triger)
                    chars_embeds = self.dropout_e(self.char_embeds(chars_Tensor.unsqueeze(0)))
                    if self.char_structure == 'lstm':
                        char_o, hidden = self.char_bi_lstm(chars_embeds.transpose(0, 1), hidden)
                        char_out = torch.chunk(hidden[0].squeeze(1), 2, 0)
                        tok_rep = torch.cat([word_embeds[word_idx].unsqueeze(0), char_out[0], char_out[1]], 1)
                    elif self.char_structure == 'cnn':
                        char = chars_embeds.unsqueeze(0)
                        char = char.transpose(1, 2)
                        char, _ = self.conv1d(char).max(dim=2)
                        char = torch.tanh(char)
                        tok_rep = torch.cat([word_embeds[word_idx].unsqueeze(0), char], 1)
            else:
                tok_rep = word_embeds[word_idx].unsqueeze(0)
            if word_idx == 0:
                token_embedding = tok_rep
            elif sentence_array[word_idx] != 1:
                token_embedding = torch.cat([token_embedding, tok_rep], 0)

        for i in range(token_embedding.size(0)):
            tok_embed = token_embedding[token_embedding.size(0) - 1 - i].unsqueeze(0)
            tok = sentence.data[token_embedding.size(0) - 1 - i].item()
            buffer.push(tok_embed, (tok_embed, self.idx2word[tok]))

        while len(buffer) > 0 or len(stack) > 0:
            valid_actions = self.get_possible_actions(stack, buffer)
            log_probs = None
            if len(valid_actions) > 1:

                lstms_output = torch.cat([buffer.embedding(), stack.embedding(), output.embedding(), action.embedding()], 1)
                hidden_output = torch.tanh(self.lstms_output_2_softmax(self.dropout(lstms_output)))
                logits = self.output_2_act(hidden_output)[0][
                    utils.variable(torch.LongTensor(valid_actions), self.gpu_triger)]
                valid_action_tbl = {a: i for i, a in enumerate(valid_actions)}
                log_probs = torch.nn.functional.log_softmax(logits, dim=0)
                action_idx = torch.max(log_probs.cpu(), 0)[1].item()
                action_predict = valid_actions[action_idx]
                pre_actions.append(action_predict)
                if mode == 'train':
                    losses.append(log_probs[valid_action_tbl[actions.data[action_count].item()]])
            elif len(valid_actions) == 1:
                # Only one transition is legal; take it without scoring so the
                # predicted action sequence stays aligned with the transitions.
                action_predict = valid_actions[0]
                pre_actions.append(action_predict)

            if mode == 'train':
                real_action = self.idx2action[actions.data[action_count].item()]
                act_embedding = action_embeds[action_count].unsqueeze(0)
                rel_embedding = relation_embeds[action_count].unsqueeze(0)
            elif mode == 'predict':
                real_action = self.idx2action[action_predict]
                action_predict_tensor = utils.variable(torch.from_numpy(np.array([action_predict])), self.gpu_triger)
                action_embeds = self.dropout_e(self.action_embeds(action_predict_tensor))
                relation_embeds = self.dropout_e(self.relation_embeds(action_predict_tensor))
                act_embedding = action_embeds[0].unsqueeze(0)
                rel_embedding = relation_embeds[0].unsqueeze(0)

            action.push(act_embedding, (act_embedding, real_action))
            if real_action.startswith('S'):
                assert len(buffer) > 0
                tok_buffer_embedding, buffer_token = buffer.pop()
                stack.push(tok_buffer_embedding, (tok_buffer_embedding, buffer_token))
            elif real_action.startswith('O'):
                assert len(buffer) > 0
                tok_buffer_embedding, buffer_token = buffer.pop()
                output.push(tok_buffer_embedding, (tok_buffer_embedding, buffer_token))
            elif real_action.startswith('R'):
                ent = ''
                entity = []
                assert len(stack) > 0
                while len(stack) > 0:
                    tok_stack_embedding, stack_token = stack.pop()
                    entity.append([tok_stack_embedding, stack_token])
                if len(entity) > 1:
                    for i in range(len(entity)):
                        ent_f.push(entity[i][0], (entity[i][0], entity[i][1]))
                        ent_b.push(entity[len(entity) - i - 1][0],
                                   (entity[len(entity) - i - 1][0], entity[len(entity) - i - 1][1]))
                        ent += entity[i][1] + ' '
                else:
                    ent_f.push(entity[0][0], (entity[0][0], entity[0][1]))
                    ent_b.push(entity[0][0], (entity[0][0], entity[0][1]))
                    ent = entity[0][1]
                entity_input = self.dropout(torch.cat([ent_f.embedding(), ent_b.embedding()], 1))
                ent_f.clear()
                ent_b.clear()
                output_input = self.entity_2_output(torch.cat([entity_input, rel_embedding], 1))
                output.push(output_input, (entity_input, ent))
            action_count += 1

        if len(losses) > 0:
            loss = -torch.sum(torch.cat(losses))
        else:
            loss = -1
        predict_actions.append(pre_actions)

        return loss, predict_actions
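StackRNN is used throughout these excerpts but never defined in them. A minimal
sketch of the interface the code relies on (an LSTM-cell-backed stack with
push/pop/embedding/clear; the original implementation may differ in details
such as where dropout is applied) could look like:

    class StackRNN(object):
        def __init__(self, cell, initial_state, dropout, get_output, empty_embedding=None):
            self.cell = cell                      # e.g. an nn.LSTMCell
            self.dropout = dropout
            self.s = [(initial_state, None)]      # list of (lstm_state, payload)
            self.empty = empty_embedding
            self.get_output = get_output          # maps an lstm_state to a vector

        def push(self, expr, extra=None):
            # Feed one embedding through the cell, stacking the new state.
            self.s.append((self.cell(self.dropout(expr), self.s[-1][0]), extra))

        def pop(self):
            # Return the payload, e.g. the (embedding, token) pair pushed above.
            return self.s.pop()[1]

        def embedding(self):
            # Summary of the stack top, or the learned empty embedding.
            return self.get_output(self.s[-1][0]) if len(self.s) > 1 else self.empty

        def clear(self):
            self.s = [self.s[0]]

        def __len__(self):
            return len(self.s) - 1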
Example 4
    def forward(self, sentences, actions=None, hidden=None):

        if actions is not None:
            self.mode = "train"
        else:
            self.mode = "predict"

        self.set_batch_seq_size(sentences)  # sentences [batch_size, max_len]
        word_embeds = self.dropout_e(self.word_embeds(sentences))  # [batch_size, max_len, embeddind_size]
        if self.mode == 'train':
            action_embeds = self.dropout_e(self.action_embeds(actions))
            relation_embeds = self.dropout_e(self.relation_embeds(actions))
            action_output, _ = self.ac_lstm(action_embeds.transpose(0, 1))
            action_output = action_output.transpose(0, 1)

        lstm_initial = (utils.xavier_init(self.gpu_triger, 1, self.hidden_dim),
                        utils.xavier_init(self.gpu_triger, 1, self.hidden_dim))

        sentence_array = sentences.data.cpu().numpy()
        sents_len = []
        token_embedds = None
        for sent_idx in range(len(sentence_array)):
            count_words = 0
            token_embedding = None
            for word_idx in reversed(range(len(sentence_array[sent_idx]))):
                if self.use_spelling:
                    if sentence_array[sent_idx][word_idx] == 1:
                        tok_rep = torch.cat([word_embeds[sent_idx][word_idx].unsqueeze(0), self.pad_char_embeds], 1)
                    elif sentence_array[sent_idx][word_idx] == 0:
                        count_words += 1
                        tok_rep = torch.cat([word_embeds[sent_idx][word_idx].unsqueeze(0), self.unk_char_embeds], 1)
                    else:
                        count_words += 1
                        word = sentence_array[sent_idx][word_idx]
                        chars_in_word = [self.char2idx[char] for char in self.idx2word[word]]
                        chars_Tensor = utils.variable(torch.from_numpy(np.array(chars_in_word)), self.gpu_triger)
                        chars_embeds = self.dropout_e(self.char_embeds(chars_Tensor))
                        if self.char_structure == 'lstm':
                            char_o, hidden = self.char_bi_lstm(chars_embeds.unsqueeze(1), hidden)
                            char_out = torch.chunk(hidden[0].squeeze(1), 2, 0)
                            tok_rep = torch.cat(
                                [word_embeds[sent_idx][word_idx].unsqueeze(0), char_out[0], char_out[1]], 1)
                        elif self.char_structure == 'cnn':
                            char, _ = self.conv1d(chars_embeds.unsqueeze(0).transpose(1, 2)).max(dim=2)
                            # conv1d maps [1, char_embedding_size, word_len] to
                            # [1, output_dim, word_len + 2*padding - kernel_size + 1];
                            # max-pooling over character positions yields [1, output_dim].
                            char = torch.tanh(char)
                            tok_rep = torch.cat([word_embeds[sent_idx][word_idx].unsqueeze(0), char], 1)
                else:
                    if sentence_array[sent_idx][word_idx] != 1:
                        count_words += 1
                    tok_rep = word_embeds[sent_idx][word_idx].unsqueeze(0)
                if token_embedding is None:
                    token_embedding = tok_rep
                else:
                    token_embedding = torch.cat([token_embedding, tok_rep], 0)

            sents_len.append(count_words)
            if token_embedds is None:
                token_embedds = token_embedding.unsqueeze(0)
            else:
                token_embedds = torch.cat([token_embedds, token_embedding.unsqueeze(0)], 0)

        tokens = token_embedds.transpose(0, 1)
        tok_output, hidden = self.lstm(tokens)  # [max_len, batch_size, hidden_dim]
        tok_output = tok_output.transpose(0, 1)

        buffer = [[] for i in range(self.batch_size)]
        losses = [[] for i in range(self.batch_size)]
        right = [0 for i in range(self.batch_size)]
        predict_actions = [[] for i in range(self.batch_size)]
        output = [[[lstm_initial, "<pad>"]] for i in range(self.batch_size)]
        if self.mode == 'predict':
            action = [[[lstm_initial, "<pad>"]] for i in range(self.batch_size)]

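        # Each per-sentence buffer is filled in reversed word order, so the end
        # of the list (b[-1]) is always the next unread word of the sentence.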
        for idx in range(tok_output.size(0)):
            for word_idx in range(tok_output.size(1)):
                buffer[idx].append([tok_output[idx][word_idx].unsqueeze(0), token_embedds[idx][word_idx].unsqueeze(0),
                                    self.idx2word[sentence_array[idx][tok_output.size(1) - 1 - word_idx]]])

        stack = [[[lstm_initial, "<pad>"]] for i in range(self.batch_size)]
        for act_idx in range(self.seq_length):
            batch_buffer = [b[-1] for b in buffer]
            if self.mode == 'train':
                if act_idx == 0:
                    batch_action = [lstm_initial[0] for a in range(self.batch_size)]
                else:
                    batch_action = [a[act_idx - 1].unsqueeze(0) for a in action_output]
                batch_relation = [r[act_idx].unsqueeze(0) for r in relation_embeds]
            elif self.mode == 'predict':
                batch_action = [a[-1] for a in action]
            batch_output = [o[-1] for o in output]
            batch_stack = [s[-1] for s in stack]

            have_action_batch_1 = [i for i in range(len(sents_len)) if sents_len[i] > 0]
            have_action_batch_2 = [i for i in range(len(batch_stack)) if batch_stack[i][1] != '<pad>']
            # Sort so that downstream loops over range(self.batch_size) consume
            # getloss_batch's per-sentence results in the same order.
            have_action_batch = sorted(set(have_action_batch_1) | set(have_action_batch_2))

            if len(have_action_batch) > 0:
                batch_valid_actions = self.get_possible_actions_batch(batch_stack, sents_len, have_action_batch)
                if self.mode == 'train':
                    batch_real_action = [ac[act_idx] for ac in actions.data]
                    batch_pred, batch_loss = self.getloss_batch(have_action_batch, batch_buffer, batch_stack,
                                                                batch_action, batch_output, batch_valid_actions,
                                                                batch_real_action)
                    batch_real_action = [self.idx2action[ac.item()] for ac in batch_real_action]
                elif self.mode == 'predict':
                    batch_pred, batch_loss = self.getloss_batch(have_action_batch, batch_buffer, batch_stack,
                                                                batch_action, batch_output, batch_valid_actions)
                    pred_action_tensor = utils.variable(torch.from_numpy(np.array(batch_pred)), self.gpu_triger)
                    predict_actions_embed = self.dropout_e(self.action_embeds(pred_action_tensor))
                    ac_lstm_h, ac_lstm_c = self.action_lstm(
                        predict_actions_embed,
                        (torch.cat([action[ac_idx][-1][0][0] for ac_idx in have_action_batch]),
                         torch.cat([action[ac_idx][-1][0][1] for ac_idx in have_action_batch])))

                i = 0
                for batch_idx in range(self.batch_size):
                    if batch_idx in have_action_batch:
                        predict_actions[batch_idx].append(batch_pred[i])
                        if self.mode == 'train':
                            losses[batch_idx].append(batch_loss[i])
                        elif self.mode == 'predict':
                            action[batch_idx].append([(ac_lstm_h[i].unsqueeze(0), ac_lstm_c[i].unsqueeze(0)),
                                                      self.idx2action[batch_pred[i]]])
                        i += 1
                    else:
                        if self.mode == 'predict':
                            action[batch_idx].append([lstm_initial, "<pad>"])

                if self.mode == 'predict':
                    batch_real_action = [ac[-1][1] for ac in action]
                    relation_embeds = self.dropout_e(self.relation_embeds(utils.variable(
                        torch.from_numpy(np.array([self.action2idx[a] for a in batch_real_action])),
                        self.gpu_triger)))
                    batch_relation = [relation_embed.unsqueeze(0) for relation_embed in relation_embeds]

                batch_shift_idx = [idx for idx in range(len(batch_real_action)) if batch_real_action[idx].startswith('S')]
                batch_out_idx = [idx for idx in range(len(batch_real_action)) if batch_real_action[idx].startswith('O')]
                batch_reduce_idx = [idx for idx in range(len(batch_real_action)) if batch_real_action[idx].startswith('R')]

                if len(batch_shift_idx) > 0:
                    buffer, stack = self.batch_shift_out('S', buffer, stack, batch_shift_idx)
                    for i in range(len(sents_len)):
                        if i in batch_shift_idx:
                            sents_len[i] -= 1
                if len(batch_out_idx) > 0:
                    buffer, output = self.batch_shift_out('O', buffer, output, batch_out_idx)
                    for i in range(len(sents_len)):
                        if i in batch_out_idx:
                            sents_len[i] -= 1
                if len(batch_reduce_idx) > 0:
                    stack, output = self.batch_reduce(stack, output, batch_relation, batch_reduce_idx)
        loss = 0
        if self.mode == 'train':
            for idx in range(self.batch_size):
                loss += -torch.sum(torch.cat(losses[idx]))

        return loss, predict_actions
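Both the batched and single-sentence variants lean on two helpers from a utils
module that is not shown here. Assuming utils.variable moves a tensor to the
GPU when the flag is set and utils.xavier_init builds a Xavier-initialised
tensor of a given shape (behaviour inferred from the call sites, not from the
original source), minimal versions would be:

    import torch
    import torch.nn as nn

    def variable(tensor, gpu_trigger):
        # Assumed helper: move the tensor to the GPU when the flag is set.
        return tensor.cuda() if gpu_trigger else tensor

    def xavier_init(gpu_trigger, *size):
        # Assumed helper: a Xavier-uniform-initialised tensor of the given shape.
        out = nn.init.xavier_uniform_(torch.empty(*size))
        return out.cuda() if gpu_trigger else out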