import logging

import numpy as np
import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

logger = logging.getLogger(__name__)


def forward(self, batch, return_attns=False):
    batch_size = len(batch)
    sentences_per_doc = []
    all_batch_sentences = []
    for document in batch:
        all_batch_sentences.extend(document)
        sentences_per_doc.append(len(document))

    # Sort sentences by length (descending) so they can be packed for the RNN encoder.
    lengths = [s.size()[0] for s in all_batch_sentences]
    sort_order = np.argsort(lengths)[::-1]
    sorted_sentences = [all_batch_sentences[i] for i in sort_order]
    sorted_lengths = [s.size()[0] for s in sorted_sentences]

    max_length = max(lengths)
    logger.debug('Num sentences: %s, max sentence length: %s',
                 sum(sentences_per_doc), max_length)

    padded_sentences = [self.pad(s, max_length) for s in sorted_sentences]
    big_tensor = torch.cat(padded_sentences, 1)  # (max_length, batch size, 300)
    processed_tensor = pack_padded_sequence(big_tensor, sorted_lengths)
    encoded_sentences = self.sentence_encoder(processed_tensor)

    # Restore the original sentence order, then regroup sentences by document.
    unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order)))
    unsorted_encodings = encoded_sentences.index_select(0, unsort_order)

    index = 0
    encoded_documents = []
    for sentences_count in sentences_per_doc:
        end_index = index + sentences_count
        encoded_documents.append(unsorted_encodings[index:end_index, :])
        index = end_index

    # Sort documents by length (descending) and pad them to a common size.
    doc_sizes = [doc.size()[0] for doc in encoded_documents]
    max_doc_size = np.max(doc_sizes)
    ordered_document_idx = np.argsort(doc_sizes)[::-1]
    ordered_doc_sizes = sorted(doc_sizes)[::-1]
    ordered_documents = [encoded_documents[idx] for idx in ordered_document_idx]
    padded_docs = [self.pad_document(d, max_doc_size) for d in ordered_documents]
    padded_docs = torch.stack(padded_docs).squeeze(2)  # turn the tensor list into a (batch, max_doc_size, dim) tensor

    # Add position embeddings and run the sentence-level Transformer with
    # padding masks built from the true document lengths.
    pos_emb = self.PositionEncoder.pe[:, :padded_docs.size()[1]].expand(padded_docs.size())
    inputs = padded_docs + pos_emb
    sent_mask = generate_mask(ordered_doc_sizes, max_doc_size)
    non_pad_mask = sent_mask.unsqueeze(-1)
    slf_attn_mask = (1 - sent_mask).unsqueeze(1).expand(-1, sent_mask.size()[1], -1).type(torch.bool)

    outputs = self.Transformer(inputs, non_pad_mask, slf_attn_mask)
    outputs = self.Dropoutlayer(outputs)
    outputs = self.Decoderlayer(outputs)  # batch * length * 1

    doc_outputs = []
    for i, doc_len in enumerate(ordered_doc_sizes):
        doc_outputs.append(outputs[i, 0:doc_len - 1, :])  # -1 to remove last prediction

    unsorted_doc_outputs = [doc_outputs[i] for i in unsort(ordered_document_idx)]
    x = torch.cat(unsorted_doc_outputs, 0)
    return x
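# `generate_mask` is referenced above but not defined in this section. A
# minimal sketch, assuming it returns a (batch, max_doc_size) float tensor
# with 1.0 at real sentence positions and 0.0 at padding, which matches how
# `non_pad_mask` and `slf_attn_mask` are derived from it:
def generate_mask(doc_sizes, max_doc_size):
    mask = torch.zeros(len(doc_sizes), max_doc_size)
    for i, size in enumerate(doc_sizes):
        mask[i, :size] = 1.0  # mark the document's real sentences as valid
    return maybe_cuda(mask)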
def forward(self, batch):
    batch_size = len(batch)
    sentences_per_doc = []
    all_batch_sentences = []
    for document in batch:
        all_batch_sentences.extend(document)
        sentences_per_doc.append(len(document))

    # Sort sentences by length (descending) so they can be packed for the RNN encoder.
    lengths = [s.size()[0] for s in all_batch_sentences]
    sort_order = np.argsort(lengths)[::-1]
    sorted_sentences = [all_batch_sentences[i] for i in sort_order]
    sorted_lengths = [s.size()[0] for s in sorted_sentences]

    max_length = max(lengths)
    logger.debug('Num sentences: %s, max sentence length: %s',
                 sum(sentences_per_doc), max_length)

    padded_sentences = [self.pad(s, max_length) for s in sorted_sentences]
    big_tensor = torch.cat(padded_sentences, 1)  # (max_length, batch size, 300)
    packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths)
    encoded_sentences = self.sentence_encoder(packed_tensor)

    # Restore the original sentence order, then regroup sentences by document.
    unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order)))
    unsorted_encodings = encoded_sentences.index_select(0, unsort_order)

    index = 0
    encoded_documents = []
    for sentences_count in sentences_per_doc:
        end_index = index + sentences_count
        encoded_documents.append(unsorted_encodings[index:end_index, :])
        index = end_index

    # Sort documents by length (descending), pad, and pack them for the
    # sentence-level LSTM.
    doc_sizes = [doc.size()[0] for doc in encoded_documents]
    max_doc_size = np.max(doc_sizes)
    ordered_document_idx = np.argsort(doc_sizes)[::-1]
    ordered_doc_sizes = sorted(doc_sizes)[::-1]
    ordered_documents = [encoded_documents[idx] for idx in ordered_document_idx]
    padded_docs = [self.pad_document(d, max_doc_size) for d in ordered_documents]
    docs_tensor = torch.cat(padded_docs, 1)
    packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes)
    sentence_lstm_output, _ = self.sentence_lstm(
        packed_docs, zero_state(self, batch_size=batch_size))
    padded_x, _ = pad_packed_sequence(sentence_lstm_output)  # (max sentence len, batch, 256)

    doc_outputs = []
    for i, doc_len in enumerate(ordered_doc_sizes):
        doc_outputs.append(padded_x[0:doc_len - 1, i, :])  # -1 to remove last prediction
    unsorted_doc_outputs = [doc_outputs[i] for i in unsort(ordered_document_idx)]
    sentence_outputs = torch.cat(unsorted_doc_outputs, 0)

    x = self.h2s(sentence_outputs)
    return x
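# `zero_state`, used to initialize every sentence-level LSTM in this section,
# is defined elsewhere in the repo. A plausible sketch, assuming the module
# exposes `num_layers` and `hidden` attributes and the LSTM is bidirectional
# (hence the factor of 2 in the first dimension):
def zero_state(module, batch_size):
    h0 = maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden))
    c0 = maybe_cuda(torch.zeros(module.num_layers * 2, batch_size, module.hidden))
    return h0, c0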
def forward(self, batch):
    batch_size = len(batch)
    sentences_per_doc = []
    all_batch_sentences = []
    for document in batch:
        all_batch_sentences.extend(document)
        sentences_per_doc.append(len(document))

    lengths = [s.size()[0] for s in all_batch_sentences]
    # Cap sentence length at the encoder's maximum sequence length.
    max_length = min(max(lengths), self.len_max_seq - 1)
    padded_sentences = [self.pad(s, max_length) for s in all_batch_sentences]
    padded_sentences_tensor = torch.cat(padded_sentences, 1).permute(1, 0, 2)
    unsorted_encodings = self.sentence_encoder(padded_sentences_tensor,
                                               max_length, lengths)

    # Regroup the sentence encodings by document.
    index = 0
    encoded_documents = []
    for sentences_count in sentences_per_doc:
        end_index = index + sentences_count
        encoded_documents.append(unsorted_encodings[index:end_index, :])
        index = end_index

    # Sort documents by length (descending), pad, and pack them for the
    # sentence-level LSTM.
    doc_sizes = [doc.size()[0] for doc in encoded_documents]
    max_doc_size = np.max(doc_sizes)
    ordered_document_idx = np.argsort(doc_sizes)[::-1]
    ordered_doc_sizes = sorted(doc_sizes)[::-1]
    ordered_documents = [encoded_documents[idx] for idx in ordered_document_idx]
    padded_docs = [self.pad_document(d, max_doc_size) for d in ordered_documents]
    docs_tensor = torch.cat(padded_docs, 1)
    packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes)
    sentence_lstm_output, _ = self.sentence_lstm(
        packed_docs, zero_state(self, batch_size=batch_size))
    padded_x, _ = pad_packed_sequence(sentence_lstm_output)  # (max sentence len, batch, 256)

    doc_outputs = []
    for i, doc_len in enumerate(ordered_doc_sizes):
        doc_outputs.append(padded_x[0:doc_len - 1, i, :])  # -1 to remove last prediction
    unsorted_doc_outputs = [doc_outputs[i] for i in unsort(ordered_document_idx)]
    sentence_outputs = torch.cat(unsorted_doc_outputs, 0)

    x = self.h2s(sentence_outputs)
    return x
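# `self.pad` (and the analogous `self.pad_document`) is not shown here. Since
# the padded sentences are concatenated along dim 1 into a
# (max_length, batch, emb) tensor, each call plausibly zero-pads one
# (length, emb) tensor and adds a batch axis of size 1. A sketch under that
# assumption, truncating over-long sentences to match the cap applied above:
def pad(self, s, max_length):
    # (length, emb) -> (max_length, 1, emb): truncate if over-long,
    # then zero-pad at the end of the time dimension.
    s = s[:max_length]
    padded = F.pad(s, (0, 0, 0, max_length - s.size(0)))
    return padded.unsqueeze(1)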
def forward(self, batch, sent_bert_vec, target_idx):
    batch_size = len(batch)
    sentences_per_doc = []
    all_batch_sentences = []
    for document in batch:
        all_batch_sentences.extend(document)
        sentences_per_doc.append(len(document))

    # Sort sentences by length (descending) so they can be packed for the RNN encoder.
    lengths = [s.size()[0] for s in all_batch_sentences]
    sort_order = np.argsort(lengths)[::-1]
    sorted_sentences = [all_batch_sentences[i] for i in sort_order]
    sorted_lengths = [s.size()[0] for s in sorted_sentences]

    max_length = max(lengths)
    padded_sentences = [self.pad(s, max_length) for s in sorted_sentences]
    big_tensor = torch.cat(padded_sentences, 1)  # (max_length, batch size, 300)
    packed_tensor = pack_padded_sequence(big_tensor, sorted_lengths)
    encoded_sentences = self.sentence_encoder(packed_tensor)
    unsort_order = maybe_cuda(torch.LongTensor(unsort(sort_order)))
    unsorted_encodings = encoded_sentences.index_select(0, unsort_order)

    # Concatenate the precomputed BERT sentence vectors and append them to
    # the RNN sentence encodings.
    sent_bert_vec_conc = torch.cat(sent_bert_vec, 0)
    sent_bert_vec_conc = maybe_cuda(sent_bert_vec_conc).requires_grad_()
    unsorted_encodings = torch.cat((unsorted_encodings, sent_bert_vec_conc), 1)

    # Regroup the sentence encodings by document.
    index = 0
    encoded_documents = []
    for sentences_count in sentences_per_doc:
        end_index = index + sentences_count
        encoded_documents.append(unsorted_encodings[index:end_index, :])
        index = end_index

    # Sort documents by length (descending), pad, and pack them for the
    # first sentence-level LSTM.
    doc_sizes = [doc.size()[0] for doc in encoded_documents]
    max_doc_size = np.max(doc_sizes)
    ordered_document_idx = np.argsort(doc_sizes)[::-1]
    ordered_doc_sizes = sorted(doc_sizes)[::-1]
    ordered_documents = [encoded_documents[idx] for idx in ordered_document_idx]
    padded_docs = [self.pad_document(d, max_doc_size) for d in ordered_documents]
    docs_tensor = torch.cat(padded_docs, 1)
    packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes)
    sentence_lstm_output, _ = self.sentence_lstm(
        packed_docs, zero_state(self, batch_size=batch_size))
    padded_x, _ = pad_packed_sequence(sentence_lstm_output)  # (max sentence len, batch, 256)

    # Compute consecutive sentence pairs' similarities: difference each
    # direction's LSTM states against their one-step-shifted copies, project
    # them, and score adjacent pairs via the diagonal of their product.
    window = 1
    sims_outputs = []
    pd_f = (0, 0, window, 0)
    pd_b = (0, 0, 0, window)
    for i, doc_len in enumerate(ordered_doc_sizes):
        batch_x = padded_x[0:doc_len, i, :]
        forward_padded_x = self.fwd(
            batch_x[:-1, :self.hidden] -
            F.pad(batch_x[:-1, :self.hidden], pd_f, 'constant', 0)[:-window, :])
        backward_padded_x = self.fwd(
            batch_x[1:, self.hidden:] -
            F.pad(batch_x[1:, self.hidden:], pd_b, 'constant', 0)[window:, :]).permute(1, 0)
        sims_outputs.append(
            torch.sigmoid(torch.diag(torch.mm(forward_padded_x, backward_padded_x))))

    unsorted_sims_outputs = [sims_outputs[i] for i in unsort(ordered_document_idx)]
    sims_outputs = torch.cat(unsorted_sims_outputs, 0).unsqueeze(1)

    # Re-encode each sentence with a windowed self-attention over its
    # neighbouring sentences (win_size on each side) and concatenate the
    # attended vector onto the original encoding.
    doc_outputs = []
    for i, doc_len in enumerate(ordered_doc_sizes):
        doc_outputs.append(padded_x[0:doc_len, i, :])

    win_size = 3
    encoded_documents_2 = []
    for doc_output in doc_outputs:
        doc_l = doc_output.size()[0]
        new_one = []
        for i in range(doc_l):
            if i - win_size < 0:
                new_one.append(no_name(doc_output[0:i + win_size + 1], i,
                                       win_size, self.hidden, self.self_attn))
            else:
                new_one.append(no_name(doc_output[i - win_size:i + win_size + 1], i,
                                       win_size, self.hidden, self.self_attn))
        encoded_documents_2.append(torch.stack(new_one, dim=0).squeeze(1))
    encoded_documents_2 = [
        torch.cat([doc_outputs[i], encoded_documents_2[i]], -1)
        for i in range(len(doc_outputs))
    ]
    doc_outputs = encoded_documents_2

    # Second sentence-level LSTM over the attention-augmented encodings;
    # up to this point no sentence has been removed.
    padded_docs = [self.pad_document(d, max_doc_size) for d in doc_outputs]
    docs_tensor = torch.cat(padded_docs, 1)
    packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes)
    sentence_lstm_output, _ = self.sentence_lstm_2(
        packed_docs, zero_state(self, batch_size=batch_size))
    padded_x, _ = pad_packed_sequence(sentence_lstm_output)  # (max sentence len, batch, 256)

    doc_outputs = []
    for i, doc_len in enumerate(ordered_doc_sizes):
        doc_outputs.append(padded_x[0:doc_len - 1, i, :])  # -1 to remove last prediction
    unsorted_doc_outputs = [doc_outputs[i] for i in unsort(ordered_document_idx)]
    sentence_outputs = torch.cat(unsorted_doc_outputs, 0)

    x = self.h2s(sentence_outputs)
    return x, sims_outputs
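# `maybe_cuda`, used throughout these forward passes, comes from the repo's
# utilities. A minimal sketch, assuming it simply moves a tensor to the GPU
# when CUDA is enabled and returns it unchanged otherwise; the optional
# `is_cuda` override mirrors how such helpers are typically configured:
def maybe_cuda(x, is_cuda=None):
    if is_cuda is None:
        is_cuda = torch.cuda.is_available()
    return x.cuda() if is_cuda else x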
def forward(self, batch):
    batch_size = len(batch)
    sentences_per_doc = []
    all_batch_sentences = []
    for document in batch:
        all_batch_sentences.extend(document)
        sentences_per_doc.append(len(document))

    lengths = [s.size()[0] for s in all_batch_sentences]
    max_length = max(lengths)
    padded_sentences = [self.pad(s, max_length) for s in all_batch_sentences]
    padded_sentences_tensor = torch.cat(padded_sentences, 1).permute(1, 0, 2)
    unsorted_encodings = self.sentence_encoder(padded_sentences_tensor)

    # Regroup the sentence encodings by document.
    index = 0
    encoded_documents = []
    for sentences_count in sentences_per_doc:
        end_index = index + sentences_count
        encoded_documents.append(unsorted_encodings[index:end_index, :])
        index = end_index

    # Sort documents by length (descending), pad, and pack them for the
    # sentence-level LSTM.
    doc_sizes = [doc.size()[0] for doc in encoded_documents]
    max_doc_size = np.max(doc_sizes)
    ordered_document_idx = np.argsort(doc_sizes)[::-1]
    ordered_doc_sizes = sorted(doc_sizes)[::-1]
    ordered_documents = [encoded_documents[idx] for idx in ordered_document_idx]
    padded_docs = [self.pad_document(d, max_doc_size) for d in ordered_documents]
    docs_tensor = torch.cat(padded_docs, 1)
    packed_docs = pack_padded_sequence(docs_tensor, ordered_doc_sizes)
    sentence_lstm_output, _ = self.sentence_lstm(
        packed_docs, zero_state(self, batch_size=batch_size))
    padded_x, _ = pad_packed_sequence(sentence_lstm_output)  # (max sentence len, batch, 256*2)

    # Sentence-level attention: project each LSTM state, score it against a
    # learned context vector, and reweight the states by the normalized scores.
    sent_squish = batch_matmul_bias(padded_x, self.weight_W_sent,
                                    self.bias_sent, nonlinearity='tanh')
    sent_attn = batch_matmul(sent_squish, self.weight_proj_sent)
    sent_attn_norm = self.softmax_sent(sent_attn)
    sent_attn_vectors = attention_mul(padded_x, sent_attn_norm)
    padded_x = sent_attn_vectors

    doc_outputs = []
    for i, doc_len in enumerate(ordered_doc_sizes):
        doc_outputs.append(padded_x[0:doc_len - 1, i, :])  # -1 to remove last prediction
    unsorted_doc_outputs = [doc_outputs[i] for i in unsort(ordered_document_idx)]
    sentence_outputs = torch.cat(unsorted_doc_outputs, 0)

    x = self.h2s(sentence_outputs)
    return x
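# The attention helpers above (`batch_matmul_bias`, `batch_matmul`,
# `attention_mul`) are defined elsewhere. Sketches consistent with how they
# are used here: `padded_x` is (seq_len, batch, dim) and the result is still
# indexed per timestep afterwards, so the attention scores must rescale the
# timesteps rather than pool them. The shapes and the weight layout below
# are assumptions:
def batch_matmul_bias(seq, weight, bias, nonlinearity=''):
    # Per-timestep affine projection: (t, b, d) x (d, o) + bias.
    s = torch.einsum('tbd,do->tbo', seq, weight) + bias
    return torch.tanh(s) if nonlinearity == 'tanh' else s


def batch_matmul(seq, weight):
    # Per-timestep projection without bias, e.g. onto a (d, 1) context vector.
    return torch.einsum('tbd,do->tbo', seq, weight)


def attention_mul(rnn_outputs, att_weights):
    # Broadcast the (t, b, 1) scores over the (t, b, d) states.
    return rnn_outputs * att_weights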
def test_unsort(self):
    x = np.random.randint(0, 100, 10)
    sort_order = np.argsort(x)
    unsort_order = unsort(sort_order)
    assert np.all(x[sort_order][unsort_order] == x)
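# For reference, an implementation of `unsort` that satisfies the property
# asserted above: it builds the inverse permutation of `sort_order`, so that
# x[sort_order][unsort(sort_order)] == x. The repo's own version may differ
# in details:
def unsort(sort_order):
    result = [-1] * len(sort_order)
    for i, index in enumerate(sort_order):
        result[index] = i  # element moved to position i came from `index`
    return result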