Beispiel #1
0
    def forward(self, batch, return_attns=False):
        """Encode each sentence, run a Transformer over the per-document
        sentence encodings, and decode one score per sentence.

        Args:
            batch: list of documents; each document is a list of 2-D
                sentence tensors — presumably (sent_len, 1, 300); TODO confirm.
            return_attns: unused in this implementation; kept for
                interface compatibility with callers.

        Returns:
            Tensor with one row per sentence, documents concatenated in
            their original order; the last sentence of every document is
            dropped (it carries no boundary decision).
        """
        batch_size = len(batch)

        # Flatten the batch into one sentence list, remembering the
        # per-document sentence counts so encodings can be regrouped.
        sentences_per_doc = []
        all_batch_sentences = []
        for document in batch:
            all_batch_sentences.extend(document)
            sentences_per_doc.append(len(document))

        # pack_padded_sequence requires longest-first ordering.
        lengths = [s.size()[0] for s in all_batch_sentences]
        sort_order = np.argsort(lengths)[::-1]
        sorted_sentences = [all_batch_sentences[i] for i in sort_order]
        sorted_lengths = [s.size()[0] for s in sorted_sentences]

        max_length = max(lengths)
        logger.debug('Num sentences: %s, max sentence length: %s',
                     sum(sentences_per_doc), max_length)

        padded_sentences = [self.pad(s, max_length) for s in sorted_sentences]
        big_tensor = torch.cat(padded_sentences, 1)  # (max_length, batch size, 300)
        processed_tensor = pack_padded_sequence(big_tensor, sorted_lengths)
        encoded_sentences = self.sentence_encoder(processed_tensor)
        # torch.autograd.Variable is deprecated (a no-op wrapper since
        # PyTorch 0.4); the LongTensor index is usable directly.
        unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order)))
        unsorted_encodings = encoded_sentences.index_select(0, unsort_order)

        # Regroup sentence encodings into their documents.
        index = 0
        encoded_documents = []
        for sentences_count in sentences_per_doc:
            end_index = index + sentences_count
            encoded_documents.append(unsorted_encodings[index : end_index, :])
            index = end_index

        # Order documents longest-first and pad them to a common length.
        doc_sizes = [doc.size()[0] for doc in encoded_documents]
        max_doc_size = np.max(doc_sizes)
        ordered_document_idx = np.argsort(doc_sizes)[::-1]
        ordered_doc_sizes = sorted(doc_sizes)[::-1]
        ordered_documents = [encoded_documents[idx] for idx in ordered_document_idx]
        padded_docs = [self.pad_document(d, max_doc_size) for d in ordered_documents]
        padded_docs = torch.stack(padded_docs).squeeze(2)  # turn the tensor list into tensor

        ##############################################################
        # Add positional encodings, then run the Transformer with
        # padding masks derived from the true document lengths.
        pos_emb = self.PositionEncoder.pe[:, :padded_docs.size()[1]].expand(padded_docs.size())
        inputs = padded_docs + pos_emb

        sent_mask = generate_mask(ordered_doc_sizes, max_doc_size)
        non_pad_mask = sent_mask.unsqueeze(-1)
        # Attention mask is True where attention must be blocked (pad positions).
        slf_attn_mask = (1 - sent_mask).unsqueeze(1).expand(-1, sent_mask.size()[1], -1).type(torch.bool)

        outputs = self.Transformer(inputs, non_pad_mask, slf_attn_mask)
        outputs = self.Dropoutlayer(outputs)
        outputs = self.Decoderlayer(outputs)  # batch * length * 1

        # Trim each document back to its real length.
        doc_outputs = []
        for i, doc_len in enumerate(ordered_doc_sizes):
            doc_outputs.append(outputs[i, 0:doc_len - 1, :])  # -1 to remove last prediction

        # Restore the original document order and concatenate.
        unsorted_doc_outputs = [doc_outputs[i] for i in unsort(ordered_document_idx)]
        x = torch.cat(unsorted_doc_outputs, 0)

        return x
Beispiel #2
0
    def forward(self, batch):
        """Encode sentences with an RNN, run a document-level LSTM over
        the sentence encodings, and project each sentence state to a score.

        Args:
            batch: list of documents; each document is a list of 2-D
                sentence tensors — presumably (sent_len, 1, 300); TODO confirm.

        Returns:
            Tensor with one row per sentence (last sentence of each
            document dropped), projected through ``self.h2s``.
        """
        batch_size = len(batch)
        # Flatten the documents into one sentence list and remember the
        # per-document counts for regrouping later.
        sentences_per_doc = []
        all_batch_sentences = []
        for document in batch:
            all_batch_sentences.extend(document)
            sentences_per_doc.append(len(document))
        # pack_padded_sequence requires longest-first ordering.
        lengths = [s.size()[0] for s in all_batch_sentences]
        sort_order = np.argsort(lengths)[::-1]
        sorted_sentences = [all_batch_sentences[i] for i in sort_order]
        sorted_lengths = [s.size()[0] for s in sorted_sentences]
        max_length = max(lengths)
        logger.debug('Num sentences: %s, max sentence length: %s',
                     sum(sentences_per_doc), max_length)
        padded_sentences = [self.pad(s, max_length) for s in sorted_sentences]
        big_tensor = torch.cat(padded_sentences,
                               1)  # (max_length, batch size, 300)
        packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths)
        encoded_sentences = self.sentence_encoder(packed_tensor)
        # NOTE: wrapping an existing tensor in torch.tensor(...) triggers
        # the "copy construct" UserWarning and makes a needless detached
        # copy; the cuda-or-not LongTensor index is usable as-is.
        unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order)))
        unsorted_encodings = encoded_sentences.index_select(0, unsort_order)

        # Regroup sentence encodings into their documents.
        index = 0
        encoded_documents = []
        for sentences_count in sentences_per_doc:
            end_index = index + sentences_count
            encoded_documents.append(unsorted_encodings[index:end_index, :])
            index = end_index

        # Sort documents longest-first, pad, and pack for the doc LSTM.
        doc_sizes = [doc.size()[0] for doc in encoded_documents]
        max_doc_size = np.max(doc_sizes)
        ordered_document_idx = np.argsort(doc_sizes)[::-1]
        ordered_doc_sizes = sorted(doc_sizes)[::-1]
        ordered_documents = [
            encoded_documents[idx] for idx in ordered_document_idx
        ]
        padded_docs = [
            self.pad_document(d, max_doc_size) for d in ordered_documents
        ]
        docs_tensor = torch.cat(padded_docs, 1)
        packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes)
        sentence_lstm_output, _ = self.sentence_lstm(
            packed_docs, zero_state(self, batch_size=batch_size))
        padded_x, _ = pad_packed_sequence(
            sentence_lstm_output)  # (max sentence len, batch, 256)
        # Keep only the real sentences of each document; the last one has
        # no boundary decision, hence doc_len - 1.
        doc_outputs = []
        for i, doc_len in enumerate(ordered_doc_sizes):
            doc_outputs.append(padded_x[0:doc_len - 1,
                                        i, :])  # -1 to remove last prediction
        # Restore original document order and project to output scores.
        unsorted_doc_outputs = [
            doc_outputs[i] for i in unsort(ordered_document_idx)
        ]
        sentence_outputs = torch.cat(unsorted_doc_outputs, 0)
        x = self.h2s(sentence_outputs)
        return x
Beispiel #3
0
    def forward(self, batch):
        """Score every sentence of every document.

        Sentences are padded (capped at the encoder's maximum sequence
        length), encoded, regrouped into documents, run through a
        document-level LSTM, and projected via ``self.h2s``. The last
        sentence of each document is dropped from the output.
        """
        num_docs = len(batch)

        # Flatten all documents into one sentence list; remember counts
        # so the encodings can be regrouped afterwards.
        counts = [len(document) for document in batch]
        flat_sentences = [sent for document in batch for sent in document]
        sent_lengths = [sent.size()[0] for sent in flat_sentences]

        # Cap the padded length by the encoder's maximum sequence size.
        longest = max(sent_lengths)
        if longest >= self.len_max_seq:
            longest = self.len_max_seq - 1

        padded = [self.pad(sent, longest) for sent in flat_sentences]
        sentences_tensor = torch.cat(padded, 1).permute(1, 0, 2)

        encodings = self.sentence_encoder(sentences_tensor, longest,
                                          sent_lengths)

        # Regroup the flat sentence encodings per document.
        encoded_documents = []
        start = 0
        for count in counts:
            encoded_documents.append(encodings[start:start + count, :])
            start += count

        # Longest-first ordering, padding and packing for the doc LSTM.
        doc_sizes = [doc.size()[0] for doc in encoded_documents]
        biggest = np.max(doc_sizes)
        order = np.argsort(doc_sizes)[::-1]
        sizes_desc = sorted(doc_sizes, reverse=True)
        docs_tensor = torch.cat(
            [self.pad_document(encoded_documents[i], biggest) for i in order],
            1)
        packed_docs = pack_padded_sequence(docs_tensor, sizes_desc)
        lstm_out, _ = self.sentence_lstm(
            packed_docs, zero_state(self, batch_size=num_docs))
        padded_x, _ = pad_packed_sequence(lstm_out)  # (max doc len, batch, 256)

        # Trim padding and the final sentence (no prediction for it),
        # then restore the original document order.
        per_doc = [
            padded_x[0:size - 1, i, :]
            for i, size in enumerate(sizes_desc)
        ]
        restored = [per_doc[i] for i in unsort(order)]
        sentence_outputs = torch.cat(restored, 0)

        return self.h2s(sentence_outputs)
    def forward(self, batch, sent_bert_vec, target_idx):
        """Score sentences for segmentation and compute pairwise similarities.

        Pipeline: RNN sentence encoding -> concatenation with precomputed
        per-sentence BERT vectors -> document-level LSTM -> sigmoid
        similarities for consecutive sentence pairs -> windowed
        self-attention context -> second document-level LSTM -> linear
        projection to scores.

        Args:
            batch: list of documents, each a list of sentence tensors.
            sent_bert_vec: per-document tensors of precomputed sentence
                vectors; assumed to be in the original (unsorted) sentence
                order so they align with ``unsorted_encodings`` — TODO confirm.
            target_idx: unused in this implementation.

        Returns:
            Tuple ``(x, sims_outputs)``: per-sentence scores with the last
            sentence of each document dropped, and a (N, 1) column of
            consecutive-pair similarities.
        """
        batch_size = len(batch)

        # Flatten documents into one sentence list, remembering the
        # per-document counts for regrouping later.
        sentences_per_doc = []
        all_batch_sentences = []
        for document in batch:
            all_batch_sentences.extend(document)
            sentences_per_doc.append(len(document))

        # pack_padded_sequence requires longest-first ordering.
        lengths = [s.size()[0] for s in all_batch_sentences]
        sort_order = np.argsort(lengths)[::-1]
        sorted_sentences = [all_batch_sentences[i] for i in sort_order]
        sorted_lengths = [s.size()[0] for s in sorted_sentences]

        max_length = max(lengths)

        padded_sentences = [self.pad(s, max_length) for s in sorted_sentences]
        big_tensor = torch.cat(padded_sentences,
                               1)  # (max_length, batch size, 300)
        packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths)
        encoded_sentences = self.sentence_encoder(packed_tensor)
        # Restore original sentence order after the length-sorted encoding.
        unsort_order = Variable(
            maybe_cuda(torch.LongTensor(unsort(sort_order))))
        unsorted_encodings = encoded_sentences.index_select(0, unsort_order)

        # Concatenate all per-document BERT vectors into one matrix and
        # append them feature-wise to the RNN sentence encodings.
        sent_bert_vec_conc = sent_bert_vec[0]
        for i in range(1, len(sent_bert_vec)):
            sent_bert_vec_conc = torch.cat(
                (sent_bert_vec_conc, sent_bert_vec[i]), 0)
        #unsorted_encodings = torch.autograd.Variable(maybe_cuda(sent_bert_vec_conc), requires_grad=True)
        sent_bert_vec_conc = torch.autograd.Variable(
            maybe_cuda(sent_bert_vec_conc), requires_grad=True)
        unsorted_encodings = torch.cat(
            (unsorted_encodings, sent_bert_vec_conc), 1)

        # Regroup sentence encodings into their documents.
        index = 0
        encoded_documents = []
        for sentences_count in sentences_per_doc:
            end_index = index + sentences_count
            encoded_documents.append(unsorted_encodings[index:end_index, :])
            index = end_index

        # Sort documents longest-first, pad, and pack for the first LSTM.
        doc_sizes = [doc.size()[0] for doc in encoded_documents]
        max_doc_size = np.max(doc_sizes)
        ordered_document_idx = np.argsort(doc_sizes)[::-1]
        ordered_doc_sizes = sorted(doc_sizes)[::-1]
        ordered_documents = [
            encoded_documents[idx] for idx in ordered_document_idx
        ]
        padded_docs = [
            self.pad_document(d, max_doc_size) for d in ordered_documents
        ]
        docs_tensor = torch.cat(padded_docs, 1)
        packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes)
        sentence_lstm_output, _ = self.sentence_lstm(
            packed_docs, zero_state(self, batch_size=batch_size))
        padded_x, _ = pad_packed_sequence(
            sentence_lstm_output)  # (max sentence len, batch, 256)

        # computing the consecutive sentence pairs' similarities...
        # For each adjacent pair, combine the forward half of sentence i
        # with the backward half of sentence i+1 and squash through a
        # sigmoid. ``pd_f``/``pd_b`` shift by one row via F.pad.
        window = 1
        doc_outputs = []
        sims_outputs = []
        pd_f = (0, 0, window, 0)
        pd_b = (0, 0, 0, window)
        for i, doc_len in enumerate(ordered_doc_sizes):
            #doc_outputs.append(padded_x[0:doc_len - 1, i, :])  # *** -1 to remove last prediction ***
            batch_x = padded_x[0:doc_len, i, :]

            forward_padded_x = self.fwd(batch_x[:-1, :self.hidden] - F.pad(
                batch_x[:-1, :self.hidden], pd_f, 'constant', 0)[:-window, :])
            backward_padded_x = self.fwd(
                batch_x[1:, self.hidden:] -
                F.pad(batch_x[1:, self.hidden:], pd_b, 'constant', 0)[
                    window:, :]).permute(1, 0)
            sims_outputs.append(
                F.sigmoid(
                    torch.diag(torch.mm(forward_padded_x, backward_padded_x))))

        # Per-document LSTM states, with and without the last sentence.
        doc_outputs = []
        doc_outputs_complete = []
        for i, doc_len in enumerate(ordered_doc_sizes):
            doc_outputs.append(padded_x[0:doc_len - 1,
                                        i, :])  # -1 to remove last prediction
            doc_outputs_complete.append(padded_x[0:doc_len, i, :])

        # Restore original document order for all three collections.
        unsorted_doc_outputs = [
            doc_outputs[i] for i in unsort(ordered_document_idx)
        ]
        unsorted_sims_outputs = [
            sims_outputs[i] for i in unsort(ordered_document_idx)
        ]
        unsorted_doc_outputs_complete = [
            doc_outputs_complete[i] for i in unsort(ordered_document_idx)
        ]

        sentence_outputs = torch.cat(unsorted_doc_outputs, 0)
        sims_outputs = torch.cat(unsorted_sims_outputs, 0).unsqueeze(1)

        # Rebuild per-document states (still in longest-first order) for
        # the windowed attention pass below.
        doc_outputs = []
        for i, doc_len in enumerate(ordered_doc_sizes):
            doc_outputs.append(padded_x[0:doc_len, i, :])

        # For every sentence, attend over a +/- win_size neighbourhood via
        # the opaque helper ``no_name`` — semantics assumed, TODO confirm.
        win_size = 3
        encoded_documents_2 = []
        for doc_output in doc_outputs:
            doc_l = doc_output.size()[0]
            new_one = []
            for i in range(doc_l):
                if i - win_size < 0:
                    # Window clipped at the document start.
                    new_one.append(
                        no_name(doc_output[0:i + win_size + 1], i, win_size,
                                self.hidden, self.self_attn))
                else:
                    new_one.append(
                        no_name(doc_output[i - win_size:i + win_size + 1], i,
                                win_size, self.hidden, self.self_attn))

            encoded_documents_2.append(torch.stack(new_one, dim=0).squeeze(1))

        # Concatenate the attention context onto the LSTM states.
        encoded_documents_2 = [
            torch.cat([doc_outputs[i], encoded_documents_2[i]], -1)
            for i in range(len(doc_outputs))
        ]
        doc_outputs = encoded_documents_2

        # Second document-level LSTM over the augmented representations.
        padded_docs = [self.pad_document(d, max_doc_size) for d in doc_outputs]
        docs_tensor = torch.cat(padded_docs, 1)
        packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes)

        sentence_lstm_output, _ = self.sentence_lstm_2(
            packed_docs, zero_state(
                self,
                batch_size=batch_size))  #till now, no sentence is removed
        #sentence_lstm_output = packed_docs
        padded_x, _ = pad_packed_sequence(
            sentence_lstm_output)  # (max sentence len, batch, 256)

        # Drop the last sentence of each document and restore order.
        doc_outputs = []
        for i, doc_len in enumerate(ordered_doc_sizes):
            doc_outputs.append(padded_x[0:doc_len - 1, i, :])

        unsorted_doc_outputs = [
            doc_outputs[i] for i in unsort(ordered_document_idx)
        ]
        sentence_outputs = torch.cat(unsorted_doc_outputs, 0)

        x = self.h2s(sentence_outputs)

        #return x, all_similarities.mean()
        return x, sims_outputs
Beispiel #5
0
    def forward(self, batch):
        """Score every sentence of every document.

        Sentences are encoded, regrouped into documents, fed through a
        document-level LSTM, re-weighted by a sentence-attention
        mechanism, and finally projected with ``self.h2s``. The last
        sentence of each document is dropped from the output.
        """
        num_docs = len(batch)

        # Flatten all sentences, remembering how many belong to each doc.
        counts = [len(document) for document in batch]
        flat_sentences = [sent for document in batch for sent in document]
        sent_lengths = [sent.size()[0] for sent in flat_sentences]
        longest = max(sent_lengths)

        padded = [self.pad(sent, longest) for sent in flat_sentences]
        sentences_tensor = torch.cat(padded, 1).permute(1, 0, 2)

        encodings = self.sentence_encoder(sentences_tensor)

        # Regroup the flat sentence encodings per document.
        encoded_documents = []
        start = 0
        for count in counts:
            encoded_documents.append(encodings[start:start + count, :])
            start += count

        # Longest-first ordering, padding and packing for the doc LSTM.
        doc_sizes = [doc.size()[0] for doc in encoded_documents]
        biggest = np.max(doc_sizes)
        order = np.argsort(doc_sizes)[::-1]
        sizes_desc = sorted(doc_sizes, reverse=True)
        docs_tensor = torch.cat(
            [self.pad_document(encoded_documents[i], biggest) for i in order],
            1)
        packed_docs = pack_padded_sequence(docs_tensor, sizes_desc)
        lstm_out, _ = self.sentence_lstm(
            packed_docs, zero_state(self, batch_size=num_docs))

        padded_x, _ = pad_packed_sequence(lstm_out)  # (max doc len, batch, 256*2)

        # Attention re-weighting of the LSTM states.
        squished = batch_matmul_bias(padded_x,
                                     self.weight_W_sent,
                                     self.bias_sent,
                                     nonlinearity='tanh')
        attn_scores = batch_matmul(squished, self.weight_proj_sent)
        attn_weights = self.softmax_sent(attn_scores)
        padded_x = attention_mul(padded_x, attn_weights)

        # Trim padding and the final sentence (no prediction for it),
        # then restore the original document order.
        per_doc = [
            padded_x[0:size - 1, i, :]
            for i, size in enumerate(sizes_desc)
        ]
        restored = [per_doc[i] for i in unsort(order)]
        sentence_outputs = torch.cat(restored, 0)

        return self.h2s(sentence_outputs)
Beispiel #6
0
 def test_unsort(self):
     """unsort() must produce the inverse of an argsort permutation."""
     values = np.random.randint(0, 100, 10)
     order = np.argsort(values)
     inverse = unsort(order)
     # Applying the sort and then its inverse must round-trip the data.
     assert np.array_equal(values[order][inverse], values)